From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/librbd/AsioEngine.cc | 56 + src/librbd/AsioEngine.h | 80 + src/librbd/AsyncObjectThrottle.cc | 108 + src/librbd/AsyncObjectThrottle.h | 79 + src/librbd/AsyncRequest.cc | 71 + src/librbd/AsyncRequest.h | 76 + src/librbd/BlockGuard.h | 177 + src/librbd/CMakeLists.txt | 358 + src/librbd/ConfigWatcher.cc | 116 + src/librbd/ConfigWatcher.h | 47 + src/librbd/DeepCopyRequest.cc | 361 + src/librbd/DeepCopyRequest.h | 138 + src/librbd/ExclusiveLock.cc | 388 + src/librbd/ExclusiveLock.h | 117 + src/librbd/Features.cc | 111 + src/librbd/Features.h | 16 + src/librbd/ImageCtx.cc | 1029 +++ src/librbd/ImageCtx.h | 368 + src/librbd/ImageState.cc | 1040 +++ src/librbd/ImageState.h | 155 + src/librbd/ImageWatcher.cc | 1556 ++++ src/librbd/ImageWatcher.h | 313 + src/librbd/Journal.cc | 1862 +++++ src/librbd/Journal.h | 380 + src/librbd/LibrbdAdminSocketHook.cc | 92 + src/librbd/LibrbdAdminSocketHook.h | 35 + src/librbd/ManagedLock.cc | 859 +++ src/librbd/ManagedLock.h | 270 + src/librbd/MirroringWatcher.cc | 142 + src/librbd/MirroringWatcher.h | 67 + src/librbd/ObjectMap.cc | 380 + src/librbd/ObjectMap.h | 175 + src/librbd/Operations.cc | 1944 +++++ src/librbd/Operations.h | 158 + src/librbd/PluginRegistry.cc | 101 + src/librbd/PluginRegistry.h | 51 + src/librbd/TaskFinisher.h | 179 + src/librbd/TrashWatcher.cc | 116 + src/librbd/TrashWatcher.h | 58 + src/librbd/Types.h | 142 + src/librbd/Utils.cc | 246 + src/librbd/Utils.h | 286 + src/librbd/WatchNotifyTypes.cc | 557 ++ src/librbd/WatchNotifyTypes.h | 532 ++ src/librbd/Watcher.cc | 370 + src/librbd/Watcher.h | 183 + src/librbd/api/Config.cc | 233 + src/librbd/api/Config.h | 37 + src/librbd/api/DiffIterate.cc | 378 + src/librbd/api/DiffIterate.h | 66 + src/librbd/api/Group.cc | 1287 ++++ src/librbd/api/Group.h | 60 + src/librbd/api/Image.cc | 1015 +++ src/librbd/api/Image.h | 85 + src/librbd/api/Io.cc | 555 ++ src/librbd/api/Io.h | 65 + src/librbd/api/Migration.cc | 2126 ++++++ src/librbd/api/Migration.h | 113 + src/librbd/api/Mirror.cc | 2104 ++++++ src/librbd/api/Mirror.h | 126 + src/librbd/api/Namespace.cc | 235 + src/librbd/api/Namespace.h | 33 + src/librbd/api/Pool.cc | 375 + src/librbd/api/Pool.h | 38 + src/librbd/api/PoolMetadata.cc | 156 + src/librbd/api/PoolMetadata.h | 37 + src/librbd/api/Snapshot.cc | 444 ++ src/librbd/api/Snapshot.h | 67 + src/librbd/api/Trash.cc | 759 ++ src/librbd/api/Trash.h | 53 + src/librbd/api/Utils.cc | 102 + src/librbd/api/Utils.h | 28 + src/librbd/asio/ContextWQ.cc | 49 + src/librbd/asio/ContextWQ.h | 52 + src/librbd/asio/Utils.h | 33 + src/librbd/cache/ImageWriteback.cc | 146 + src/librbd/cache/ImageWriteback.h | 77 + src/librbd/cache/ObjectCacherObjectDispatch.cc | 486 ++ src/librbd/cache/ObjectCacherObjectDispatch.h | 132 + src/librbd/cache/ObjectCacherWriteback.cc | 287 + src/librbd/cache/ObjectCacherWriteback.h | 78 + src/librbd/cache/ParentCacheObjectDispatch.cc | 261 + src/librbd/cache/ParentCacheObjectDispatch.h | 161 + src/librbd/cache/TypeTraits.h | 26 + src/librbd/cache/Types.h | 28 + src/librbd/cache/Utils.h | 33 + src/librbd/cache/WriteAroundObjectDispatch.cc | 525 ++ src/librbd/cache/WriteAroundObjectDispatch.h | 212 + src/librbd/cache/WriteLogImageDispatch.cc | 235 + src/librbd/cache/WriteLogImageDispatch.h | 105 + src/librbd/cache/pwl/AbstractWriteLog.cc | 2187 ++++++ 
src/librbd/cache/pwl/AbstractWriteLog.h | 410 ++ src/librbd/cache/pwl/Builder.h | 61 + src/librbd/cache/pwl/DiscardRequest.cc | 160 + src/librbd/cache/pwl/DiscardRequest.h | 90 + src/librbd/cache/pwl/ImageCacheState.cc | 196 + src/librbd/cache/pwl/ImageCacheState.h | 86 + src/librbd/cache/pwl/InitRequest.cc | 226 + src/librbd/cache/pwl/InitRequest.h | 105 + src/librbd/cache/pwl/LogEntry.cc | 135 + src/librbd/cache/pwl/LogEntry.h | 280 + src/librbd/cache/pwl/LogMap.cc | 278 + src/librbd/cache/pwl/LogMap.h | 81 + src/librbd/cache/pwl/LogOperation.cc | 312 + src/librbd/cache/pwl/LogOperation.h | 224 + src/librbd/cache/pwl/ReadRequest.h | 45 + src/librbd/cache/pwl/Request.cc | 562 ++ src/librbd/cache/pwl/Request.h | 361 + src/librbd/cache/pwl/ShutdownRequest.cc | 161 + src/librbd/cache/pwl/ShutdownRequest.h | 95 + src/librbd/cache/pwl/SyncPoint.cc | 109 + src/librbd/cache/pwl/SyncPoint.h | 69 + src/librbd/cache/pwl/Types.cc | 185 + src/librbd/cache/pwl/Types.h | 445 ++ src/librbd/cache/pwl/rwl/Builder.h | 107 + src/librbd/cache/pwl/rwl/LogEntry.cc | 106 + src/librbd/cache/pwl/rwl/LogEntry.h | 68 + src/librbd/cache/pwl/rwl/LogOperation.cc | 39 + src/librbd/cache/pwl/rwl/LogOperation.h | 55 + src/librbd/cache/pwl/rwl/ReadRequest.cc | 70 + src/librbd/cache/pwl/rwl/ReadRequest.h | 34 + src/librbd/cache/pwl/rwl/Request.cc | 86 + src/librbd/cache/pwl/rwl/Request.h | 90 + src/librbd/cache/pwl/rwl/WriteLog.cc | 1011 +++ src/librbd/cache/pwl/rwl/WriteLog.h | 124 + src/librbd/cache/pwl/ssd/Builder.h | 108 + src/librbd/cache/pwl/ssd/LogEntry.cc | 63 + src/librbd/cache/pwl/ssd/LogEntry.h | 75 + src/librbd/cache/pwl/ssd/LogOperation.cc | 36 + src/librbd/cache/pwl/ssd/LogOperation.h | 35 + src/librbd/cache/pwl/ssd/ReadRequest.cc | 92 + src/librbd/cache/pwl/ssd/ReadRequest.h | 34 + src/librbd/cache/pwl/ssd/Request.cc | 63 + src/librbd/cache/pwl/ssd/Request.h | 92 + src/librbd/cache/pwl/ssd/Types.h | 51 + src/librbd/cache/pwl/ssd/WriteLog.cc | 1160 +++ src/librbd/cache/pwl/ssd/WriteLog.h | 156 + src/librbd/crypto/BlockCrypto.cc | 132 + src/librbd/crypto/BlockCrypto.h | 60 + src/librbd/crypto/CryptoContextPool.cc | 44 + src/librbd/crypto/CryptoContextPool.h | 68 + src/librbd/crypto/CryptoImageDispatch.cc | 49 + src/librbd/crypto/CryptoImageDispatch.h | 111 + src/librbd/crypto/CryptoInterface.h | 125 + src/librbd/crypto/CryptoObjectDispatch.cc | 691 ++ src/librbd/crypto/CryptoObjectDispatch.h | 115 + src/librbd/crypto/DataCryptor.h | 37 + src/librbd/crypto/EncryptionFormat.h | 33 + src/librbd/crypto/FormatRequest.cc | 136 + src/librbd/crypto/FormatRequest.h | 49 + src/librbd/crypto/LoadRequest.cc | 195 + src/librbd/crypto/LoadRequest.h | 58 + src/librbd/crypto/ShutDownCryptoRequest.cc | 116 + src/librbd/crypto/ShutDownCryptoRequest.h | 43 + src/librbd/crypto/Types.h | 18 + src/librbd/crypto/Utils.cc | 79 + src/librbd/crypto/Utils.h | 33 + src/librbd/crypto/luks/FlattenRequest.cc | 154 + src/librbd/crypto/luks/FlattenRequest.h | 65 + src/librbd/crypto/luks/FormatRequest.cc | 200 + src/librbd/crypto/luks/FormatRequest.h | 59 + src/librbd/crypto/luks/Header.cc | 261 + src/librbd/crypto/luks/Header.h | 52 + src/librbd/crypto/luks/LUKSEncryptionFormat.cc | 85 + src/librbd/crypto/luks/LUKSEncryptionFormat.h | 100 + src/librbd/crypto/luks/LoadRequest.cc | 272 + src/librbd/crypto/luks/LoadRequest.h | 71 + src/librbd/crypto/luks/Magic.cc | 139 + src/librbd/crypto/luks/Magic.h | 32 + src/librbd/crypto/openssl/DataCryptor.cc | 153 + src/librbd/crypto/openssl/DataCryptor.h | 49 + src/librbd/deep_copy/Handler.h | 50 + 
src/librbd/deep_copy/ImageCopyRequest.cc | 278 + src/librbd/deep_copy/ImageCopyRequest.h | 123 + src/librbd/deep_copy/MetadataCopyRequest.cc | 117 + src/librbd/deep_copy/MetadataCopyRequest.h | 78 + src/librbd/deep_copy/ObjectCopyRequest.cc | 839 +++ src/librbd/deep_copy/ObjectCopyRequest.h | 163 + src/librbd/deep_copy/SetHeadRequest.cc | 223 + src/librbd/deep_copy/SetHeadRequest.h | 87 + src/librbd/deep_copy/SnapshotCopyRequest.cc | 729 ++ src/librbd/deep_copy/SnapshotCopyRequest.h | 151 + src/librbd/deep_copy/SnapshotCreateRequest.cc | 187 + src/librbd/deep_copy/SnapshotCreateRequest.h | 98 + src/librbd/deep_copy/Types.h | 28 + src/librbd/deep_copy/Utils.cc | 61 + src/librbd/deep_copy/Utils.h | 29 + src/librbd/exclusive_lock/AutomaticPolicy.cc | 29 + src/librbd/exclusive_lock/AutomaticPolicy.h | 34 + src/librbd/exclusive_lock/ImageDispatch.cc | 320 + src/librbd/exclusive_lock/ImageDispatch.h | 133 + src/librbd/exclusive_lock/Policy.h | 31 + src/librbd/exclusive_lock/PostAcquireRequest.cc | 368 + src/librbd/exclusive_lock/PostAcquireRequest.h | 124 + src/librbd/exclusive_lock/PreAcquireRequest.cc | 95 + src/librbd/exclusive_lock/PreAcquireRequest.h | 75 + src/librbd/exclusive_lock/PreReleaseRequest.cc | 363 + src/librbd/exclusive_lock/PreReleaseRequest.h | 139 + src/librbd/exclusive_lock/StandardPolicy.cc | 29 + src/librbd/exclusive_lock/StandardPolicy.h | 37 + src/librbd/image/AttachChildRequest.cc | 261 + src/librbd/image/AttachChildRequest.h | 105 + src/librbd/image/AttachParentRequest.cc | 90 + src/librbd/image/AttachParentRequest.h | 79 + src/librbd/image/CloneRequest.cc | 607 ++ src/librbd/image/CloneRequest.h | 181 + src/librbd/image/CloseRequest.cc | 350 + src/librbd/image/CloseRequest.h | 127 + src/librbd/image/CreateRequest.cc | 835 +++ src/librbd/image/CreateRequest.h | 191 + src/librbd/image/DetachChildRequest.cc | 392 + src/librbd/image/DetachChildRequest.h | 119 + src/librbd/image/DetachParentRequest.cc | 81 + src/librbd/image/DetachParentRequest.h | 66 + src/librbd/image/GetMetadataRequest.cc | 121 + src/librbd/image/GetMetadataRequest.h | 83 + src/librbd/image/ListWatchersRequest.cc | 174 + src/librbd/image/ListWatchersRequest.h | 82 + src/librbd/image/OpenRequest.cc | 727 ++ src/librbd/image/OpenRequest.h | 149 + src/librbd/image/PreRemoveRequest.cc | 348 + src/librbd/image/PreRemoveRequest.h | 100 + src/librbd/image/RefreshParentRequest.cc | 244 + src/librbd/image/RefreshParentRequest.h | 109 + src/librbd/image/RefreshRequest.cc | 1575 +++++ src/librbd/image/RefreshRequest.h | 275 + src/librbd/image/RemoveRequest.cc | 617 ++ src/librbd/image/RemoveRequest.h | 197 + src/librbd/image/SetFlagsRequest.cc | 78 + src/librbd/image/SetFlagsRequest.h | 61 + src/librbd/image/SetSnapRequest.cc | 368 + src/librbd/image/SetSnapRequest.h | 118 + src/librbd/image/TypeTraits.h | 21 + src/librbd/image/Types.h | 20 + src/librbd/image/ValidatePoolRequest.cc | 234 + src/librbd/image/ValidatePoolRequest.h | 93 + src/librbd/image_watcher/NotifyLockOwner.cc | 96 + src/librbd/image_watcher/NotifyLockOwner.h | 50 + src/librbd/internal.cc | 1740 +++++ src/librbd/internal.h | 145 + src/librbd/io/AioCompletion.cc | 294 + src/librbd/io/AioCompletion.h | 203 + src/librbd/io/AsyncOperation.cc | 94 + src/librbd/io/AsyncOperation.h | 52 + src/librbd/io/CopyupRequest.cc | 773 ++ src/librbd/io/CopyupRequest.h | 145 + src/librbd/io/Dispatcher.h | 252 + src/librbd/io/DispatcherInterface.h | 37 + src/librbd/io/FlushTracker.cc | 126 + src/librbd/io/FlushTracker.h | 61 + src/librbd/io/ImageDispatch.cc | 200 + 
src/librbd/io/ImageDispatch.h | 95 + src/librbd/io/ImageDispatchInterface.h | 87 + src/librbd/io/ImageDispatchSpec.cc | 54 + src/librbd/io/ImageDispatchSpec.h | 254 + src/librbd/io/ImageDispatcher.cc | 324 + src/librbd/io/ImageDispatcher.h | 77 + src/librbd/io/ImageDispatcherInterface.h | 41 + src/librbd/io/ImageRequest.cc | 909 +++ src/librbd/io/ImageRequest.h | 377 + src/librbd/io/IoOperations.cc | 101 + src/librbd/io/IoOperations.h | 18 + src/librbd/io/ObjectDispatch.cc | 161 + src/librbd/io/ObjectDispatch.h | 115 + src/librbd/io/ObjectDispatchInterface.h | 102 + src/librbd/io/ObjectDispatchSpec.cc | 47 + src/librbd/io/ObjectDispatchSpec.h | 295 + src/librbd/io/ObjectDispatcher.cc | 208 + src/librbd/io/ObjectDispatcher.h | 60 + src/librbd/io/ObjectDispatcherInterface.h | 35 + src/librbd/io/ObjectRequest.cc | 1073 +++ src/librbd/io/ObjectRequest.h | 505 ++ src/librbd/io/QosImageDispatch.cc | 328 + src/librbd/io/QosImageDispatch.h | 135 + src/librbd/io/QueueImageDispatch.cc | 154 + src/librbd/io/QueueImageDispatch.h | 110 + src/librbd/io/ReadResult.cc | 262 + src/librbd/io/ReadResult.h | 129 + src/librbd/io/RefreshImageDispatch.cc | 166 + src/librbd/io/RefreshImageDispatch.h | 101 + src/librbd/io/SimpleSchedulerObjectDispatch.cc | 565 ++ src/librbd/io/SimpleSchedulerObjectDispatch.h | 227 + src/librbd/io/TypeTraits.h | 20 + src/librbd/io/Types.cc | 49 + src/librbd/io/Types.h | 328 + src/librbd/io/Utils.cc | 249 + src/librbd/io/Utils.h | 83 + src/librbd/io/WriteBlockImageDispatch.cc | 270 + src/librbd/io/WriteBlockImageDispatch.h | 134 + src/librbd/journal/CreateRequest.cc | 234 + src/librbd/journal/CreateRequest.h | 106 + src/librbd/journal/DemoteRequest.cc | 255 + src/librbd/journal/DemoteRequest.h | 107 + src/librbd/journal/DisabledPolicy.h | 31 + src/librbd/journal/ObjectDispatch.cc | 257 + src/librbd/journal/ObjectDispatch.h | 124 + src/librbd/journal/OpenRequest.cc | 144 + src/librbd/journal/OpenRequest.h | 85 + src/librbd/journal/Policy.h | 25 + src/librbd/journal/PromoteRequest.cc | 237 + src/librbd/journal/PromoteRequest.h | 109 + src/librbd/journal/RemoveRequest.cc | 153 + src/librbd/journal/RemoveRequest.h | 81 + src/librbd/journal/Replay.cc | 1175 +++ src/librbd/journal/Replay.h | 205 + src/librbd/journal/ResetRequest.cc | 162 + src/librbd/journal/ResetRequest.h | 110 + src/librbd/journal/StandardPolicy.cc | 32 + src/librbd/journal/StandardPolicy.h | 38 + src/librbd/journal/TypeTraits.h | 29 + src/librbd/journal/Types.cc | 956 +++ src/librbd/journal/Types.h | 685 ++ src/librbd/journal/Utils.cc | 86 + src/librbd/journal/Utils.h | 80 + src/librbd/librbd.cc | 7459 ++++++++++++++++++++ src/librbd/managed_lock/AcquireRequest.cc | 184 + src/librbd/managed_lock/AcquireRequest.h | 102 + src/librbd/managed_lock/BreakRequest.cc | 249 + src/librbd/managed_lock/BreakRequest.h | 120 + src/librbd/managed_lock/GetLockerRequest.cc | 131 + src/librbd/managed_lock/GetLockerRequest.h | 58 + src/librbd/managed_lock/ReacquireRequest.cc | 79 + src/librbd/managed_lock/ReacquireRequest.h | 69 + src/librbd/managed_lock/ReleaseRequest.cc | 97 + src/librbd/managed_lock/ReleaseRequest.h | 72 + src/librbd/managed_lock/Types.h | 46 + src/librbd/managed_lock/Utils.cc | 43 + src/librbd/managed_lock/Utils.h | 23 + src/librbd/migration/FileStream.cc | 232 + src/librbd/migration/FileStream.h | 68 + src/librbd/migration/FormatInterface.h | 53 + src/librbd/migration/HttpClient.cc | 947 +++ src/librbd/migration/HttpClient.h | 205 + src/librbd/migration/HttpProcessorInterface.h | 27 + 
src/librbd/migration/HttpStream.cc | 83 + src/librbd/migration/HttpStream.h | 68 + src/librbd/migration/ImageDispatch.cc | 156 + src/librbd/migration/ImageDispatch.h | 101 + src/librbd/migration/NativeFormat.cc | 309 + src/librbd/migration/NativeFormat.h | 82 + src/librbd/migration/OpenSourceImageRequest.cc | 249 + src/librbd/migration/OpenSourceImageRequest.h | 103 + src/librbd/migration/QCOW.h | 466 ++ src/librbd/migration/QCOWFormat.cc | 1545 ++++ src/librbd/migration/QCOWFormat.h | 211 + src/librbd/migration/RawFormat.cc | 235 + src/librbd/migration/RawFormat.h | 78 + src/librbd/migration/RawSnapshot.cc | 220 + src/librbd/migration/RawSnapshot.h | 75 + src/librbd/migration/S3Stream.cc | 200 + src/librbd/migration/S3Stream.h | 78 + src/librbd/migration/SnapshotInterface.h | 48 + src/librbd/migration/SourceSpecBuilder.cc | 147 + src/librbd/migration/SourceSpecBuilder.h | 54 + src/librbd/migration/StreamInterface.h | 32 + src/librbd/migration/Types.h | 42 + src/librbd/migration/Utils.cc | 133 + src/librbd/migration/Utils.h | 30 + src/librbd/mirror/DemoteRequest.cc | 216 + src/librbd/mirror/DemoteRequest.h | 86 + src/librbd/mirror/DisableRequest.cc | 479 ++ src/librbd/mirror/DisableRequest.h | 143 + src/librbd/mirror/EnableRequest.cc | 329 + src/librbd/mirror/EnableRequest.h | 135 + src/librbd/mirror/GetInfoRequest.cc | 290 + src/librbd/mirror/GetInfoRequest.h | 123 + src/librbd/mirror/GetStatusRequest.cc | 116 + src/librbd/mirror/GetStatusRequest.h | 86 + src/librbd/mirror/GetUuidRequest.cc | 86 + src/librbd/mirror/GetUuidRequest.h | 69 + src/librbd/mirror/ImageRemoveRequest.cc | 98 + src/librbd/mirror/ImageRemoveRequest.h | 77 + src/librbd/mirror/ImageStateUpdateRequest.cc | 151 + src/librbd/mirror/ImageStateUpdateRequest.h | 92 + src/librbd/mirror/PromoteRequest.cc | 115 + src/librbd/mirror/PromoteRequest.h | 76 + src/librbd/mirror/Types.h | 21 + .../mirror/snapshot/CreateNonPrimaryRequest.cc | 273 + .../mirror/snapshot/CreateNonPrimaryRequest.h | 123 + src/librbd/mirror/snapshot/CreatePrimaryRequest.cc | 281 + src/librbd/mirror/snapshot/CreatePrimaryRequest.h | 106 + src/librbd/mirror/snapshot/DemoteRequest.cc | 110 + src/librbd/mirror/snapshot/DemoteRequest.h | 76 + src/librbd/mirror/snapshot/GetImageStateRequest.cc | 114 + src/librbd/mirror/snapshot/GetImageStateRequest.h | 76 + src/librbd/mirror/snapshot/ImageMeta.cc | 175 + src/librbd/mirror/snapshot/ImageMeta.h | 78 + src/librbd/mirror/snapshot/PromoteRequest.cc | 405 ++ src/librbd/mirror/snapshot/PromoteRequest.h | 151 + .../mirror/snapshot/RemoveImageStateRequest.cc | 131 + .../mirror/snapshot/RemoveImageStateRequest.h | 75 + src/librbd/mirror/snapshot/SetImageStateRequest.cc | 235 + src/librbd/mirror/snapshot/SetImageStateRequest.h | 96 + src/librbd/mirror/snapshot/Types.cc | 109 + src/librbd/mirror/snapshot/Types.h | 122 + src/librbd/mirror/snapshot/UnlinkPeerRequest.cc | 230 + src/librbd/mirror/snapshot/UnlinkPeerRequest.h | 98 + src/librbd/mirror/snapshot/Utils.cc | 186 + src/librbd/mirror/snapshot/Utils.h | 38 + .../mirror/snapshot/WriteImageStateRequest.cc | 120 + .../mirror/snapshot/WriteImageStateRequest.h | 73 + src/librbd/mirroring_watcher/Types.cc | 136 + src/librbd/mirroring_watcher/Types.h | 102 + src/librbd/object_map/CreateRequest.cc | 94 + src/librbd/object_map/CreateRequest.h | 59 + src/librbd/object_map/DiffRequest.cc | 258 + src/librbd/object_map/DiffRequest.h | 87 + src/librbd/object_map/InvalidateRequest.cc | 83 + src/librbd/object_map/InvalidateRequest.h | 45 + src/librbd/object_map/LockRequest.cc | 157 + 
src/librbd/object_map/LockRequest.h | 75 + src/librbd/object_map/RefreshRequest.cc | 311 + src/librbd/object_map/RefreshRequest.h | 102 + src/librbd/object_map/RemoveRequest.cc | 88 + src/librbd/object_map/RemoveRequest.h | 63 + src/librbd/object_map/Request.cc | 74 + src/librbd/object_map/Request.h | 66 + src/librbd/object_map/ResizeRequest.cc | 65 + src/librbd/object_map/ResizeRequest.h | 51 + src/librbd/object_map/SnapshotCreateRequest.cc | 147 + src/librbd/object_map/SnapshotCreateRequest.h | 80 + src/librbd/object_map/SnapshotRemoveRequest.cc | 227 + src/librbd/object_map/SnapshotRemoveRequest.h | 88 + src/librbd/object_map/SnapshotRollbackRequest.cc | 131 + src/librbd/object_map/SnapshotRollbackRequest.h | 74 + src/librbd/object_map/Types.h | 20 + src/librbd/object_map/UnlockRequest.cc | 66 + src/librbd/object_map/UnlockRequest.h | 47 + src/librbd/object_map/UpdateRequest.cc | 129 + src/librbd/object_map/UpdateRequest.h | 106 + src/librbd/operation/DisableFeaturesRequest.cc | 655 ++ src/librbd/operation/DisableFeaturesRequest.h | 171 + src/librbd/operation/EnableFeaturesRequest.cc | 494 ++ src/librbd/operation/EnableFeaturesRequest.h | 135 + src/librbd/operation/FlattenRequest.cc | 265 + src/librbd/operation/FlattenRequest.h | 83 + src/librbd/operation/MetadataRemoveRequest.cc | 60 + src/librbd/operation/MetadataRemoveRequest.h | 44 + src/librbd/operation/MetadataSetRequest.cc | 62 + src/librbd/operation/MetadataSetRequest.h | 47 + src/librbd/operation/MigrateRequest.cc | 238 + src/librbd/operation/MigrateRequest.h | 68 + src/librbd/operation/ObjectMapIterate.cc | 308 + src/librbd/operation/ObjectMapIterate.h | 65 + src/librbd/operation/RebuildObjectMapRequest.cc | 250 + src/librbd/operation/RebuildObjectMapRequest.h | 84 + src/librbd/operation/RenameRequest.cc | 257 + src/librbd/operation/RenameRequest.h | 95 + src/librbd/operation/Request.cc | 183 + src/librbd/operation/Request.h | 106 + src/librbd/operation/ResizeRequest.cc | 466 ++ src/librbd/operation/ResizeRequest.h | 156 + src/librbd/operation/SnapshotCreateRequest.cc | 449 ++ src/librbd/operation/SnapshotCreateRequest.h | 148 + src/librbd/operation/SnapshotLimitRequest.cc | 66 + src/librbd/operation/SnapshotLimitRequest.h | 44 + src/librbd/operation/SnapshotProtectRequest.cc | 118 + src/librbd/operation/SnapshotProtectRequest.h | 68 + src/librbd/operation/SnapshotRemoveRequest.cc | 506 ++ src/librbd/operation/SnapshotRemoveRequest.h | 128 + src/librbd/operation/SnapshotRenameRequest.cc | 102 + src/librbd/operation/SnapshotRenameRequest.h | 63 + src/librbd/operation/SnapshotRollbackRequest.cc | 424 ++ src/librbd/operation/SnapshotRollbackRequest.h | 122 + src/librbd/operation/SnapshotUnprotectRequest.cc | 353 + src/librbd/operation/SnapshotUnprotectRequest.h | 94 + src/librbd/operation/SparsifyRequest.cc | 519 ++ src/librbd/operation/SparsifyRequest.h | 64 + src/librbd/operation/TrimRequest.cc | 382 + src/librbd/operation/TrimRequest.h | 107 + src/librbd/plugin/Api.cc | 92 + src/librbd/plugin/Api.h | 84 + src/librbd/plugin/ParentCache.cc | 81 + src/librbd/plugin/ParentCache.h | 38 + src/librbd/plugin/Types.h | 45 + src/librbd/plugin/WriteLogImageCache.cc | 104 + src/librbd/plugin/WriteLogImageCache.h | 53 + src/librbd/trash/MoveRequest.cc | 126 + src/librbd/trash/MoveRequest.h | 87 + src/librbd/trash/RemoveRequest.cc | 170 + src/librbd/trash/RemoveRequest.h | 118 + src/librbd/trash_watcher/Types.cc | 130 + src/librbd/trash_watcher/Types.h | 97 + src/librbd/watcher/Notifier.cc | 99 + src/librbd/watcher/Notifier.h | 64 + 
src/librbd/watcher/RewatchRequest.cc | 108 + src/librbd/watcher/RewatchRequest.h | 75 + src/librbd/watcher/Types.cc | 45 + src/librbd/watcher/Types.h | 71 + src/librbd/watcher/Utils.h | 74 + 490 files changed, 110387 insertions(+) create mode 100644 src/librbd/AsioEngine.cc create mode 100644 src/librbd/AsioEngine.h create mode 100644 src/librbd/AsyncObjectThrottle.cc create mode 100644 src/librbd/AsyncObjectThrottle.h create mode 100644 src/librbd/AsyncRequest.cc create mode 100644 src/librbd/AsyncRequest.h create mode 100644 src/librbd/BlockGuard.h create mode 100644 src/librbd/CMakeLists.txt create mode 100644 src/librbd/ConfigWatcher.cc create mode 100644 src/librbd/ConfigWatcher.h create mode 100644 src/librbd/DeepCopyRequest.cc create mode 100644 src/librbd/DeepCopyRequest.h create mode 100644 src/librbd/ExclusiveLock.cc create mode 100644 src/librbd/ExclusiveLock.h create mode 100644 src/librbd/Features.cc create mode 100644 src/librbd/Features.h create mode 100644 src/librbd/ImageCtx.cc create mode 100644 src/librbd/ImageCtx.h create mode 100644 src/librbd/ImageState.cc create mode 100644 src/librbd/ImageState.h create mode 100644 src/librbd/ImageWatcher.cc create mode 100644 src/librbd/ImageWatcher.h create mode 100644 src/librbd/Journal.cc create mode 100644 src/librbd/Journal.h create mode 100644 src/librbd/LibrbdAdminSocketHook.cc create mode 100644 src/librbd/LibrbdAdminSocketHook.h create mode 100644 src/librbd/ManagedLock.cc create mode 100644 src/librbd/ManagedLock.h create mode 100644 src/librbd/MirroringWatcher.cc create mode 100644 src/librbd/MirroringWatcher.h create mode 100644 src/librbd/ObjectMap.cc create mode 100644 src/librbd/ObjectMap.h create mode 100644 src/librbd/Operations.cc create mode 100644 src/librbd/Operations.h create mode 100644 src/librbd/PluginRegistry.cc create mode 100644 src/librbd/PluginRegistry.h create mode 100644 src/librbd/TaskFinisher.h create mode 100644 src/librbd/TrashWatcher.cc create mode 100644 src/librbd/TrashWatcher.h create mode 100644 src/librbd/Types.h create mode 100644 src/librbd/Utils.cc create mode 100644 src/librbd/Utils.h create mode 100644 src/librbd/WatchNotifyTypes.cc create mode 100644 src/librbd/WatchNotifyTypes.h create mode 100644 src/librbd/Watcher.cc create mode 100644 src/librbd/Watcher.h create mode 100644 src/librbd/api/Config.cc create mode 100644 src/librbd/api/Config.h create mode 100644 src/librbd/api/DiffIterate.cc create mode 100644 src/librbd/api/DiffIterate.h create mode 100644 src/librbd/api/Group.cc create mode 100644 src/librbd/api/Group.h create mode 100644 src/librbd/api/Image.cc create mode 100644 src/librbd/api/Image.h create mode 100644 src/librbd/api/Io.cc create mode 100644 src/librbd/api/Io.h create mode 100644 src/librbd/api/Migration.cc create mode 100644 src/librbd/api/Migration.h create mode 100644 src/librbd/api/Mirror.cc create mode 100644 src/librbd/api/Mirror.h create mode 100644 src/librbd/api/Namespace.cc create mode 100644 src/librbd/api/Namespace.h create mode 100644 src/librbd/api/Pool.cc create mode 100644 src/librbd/api/Pool.h create mode 100644 src/librbd/api/PoolMetadata.cc create mode 100644 src/librbd/api/PoolMetadata.h create mode 100644 src/librbd/api/Snapshot.cc create mode 100644 src/librbd/api/Snapshot.h create mode 100644 src/librbd/api/Trash.cc create mode 100644 src/librbd/api/Trash.h create mode 100644 src/librbd/api/Utils.cc create mode 100644 src/librbd/api/Utils.h create mode 100644 src/librbd/asio/ContextWQ.cc create mode 100644 src/librbd/asio/ContextWQ.h 
create mode 100644 src/librbd/asio/Utils.h create mode 100644 src/librbd/cache/ImageWriteback.cc create mode 100644 src/librbd/cache/ImageWriteback.h create mode 100644 src/librbd/cache/ObjectCacherObjectDispatch.cc create mode 100644 src/librbd/cache/ObjectCacherObjectDispatch.h create mode 100644 src/librbd/cache/ObjectCacherWriteback.cc create mode 100644 src/librbd/cache/ObjectCacherWriteback.h create mode 100644 src/librbd/cache/ParentCacheObjectDispatch.cc create mode 100644 src/librbd/cache/ParentCacheObjectDispatch.h create mode 100644 src/librbd/cache/TypeTraits.h create mode 100644 src/librbd/cache/Types.h create mode 100644 src/librbd/cache/Utils.h create mode 100644 src/librbd/cache/WriteAroundObjectDispatch.cc create mode 100644 src/librbd/cache/WriteAroundObjectDispatch.h create mode 100644 src/librbd/cache/WriteLogImageDispatch.cc create mode 100644 src/librbd/cache/WriteLogImageDispatch.h create mode 100644 src/librbd/cache/pwl/AbstractWriteLog.cc create mode 100644 src/librbd/cache/pwl/AbstractWriteLog.h create mode 100644 src/librbd/cache/pwl/Builder.h create mode 100644 src/librbd/cache/pwl/DiscardRequest.cc create mode 100644 src/librbd/cache/pwl/DiscardRequest.h create mode 100644 src/librbd/cache/pwl/ImageCacheState.cc create mode 100644 src/librbd/cache/pwl/ImageCacheState.h create mode 100644 src/librbd/cache/pwl/InitRequest.cc create mode 100644 src/librbd/cache/pwl/InitRequest.h create mode 100644 src/librbd/cache/pwl/LogEntry.cc create mode 100644 src/librbd/cache/pwl/LogEntry.h create mode 100644 src/librbd/cache/pwl/LogMap.cc create mode 100644 src/librbd/cache/pwl/LogMap.h create mode 100644 src/librbd/cache/pwl/LogOperation.cc create mode 100644 src/librbd/cache/pwl/LogOperation.h create mode 100644 src/librbd/cache/pwl/ReadRequest.h create mode 100644 src/librbd/cache/pwl/Request.cc create mode 100644 src/librbd/cache/pwl/Request.h create mode 100644 src/librbd/cache/pwl/ShutdownRequest.cc create mode 100644 src/librbd/cache/pwl/ShutdownRequest.h create mode 100644 src/librbd/cache/pwl/SyncPoint.cc create mode 100644 src/librbd/cache/pwl/SyncPoint.h create mode 100644 src/librbd/cache/pwl/Types.cc create mode 100644 src/librbd/cache/pwl/Types.h create mode 100644 src/librbd/cache/pwl/rwl/Builder.h create mode 100644 src/librbd/cache/pwl/rwl/LogEntry.cc create mode 100644 src/librbd/cache/pwl/rwl/LogEntry.h create mode 100644 src/librbd/cache/pwl/rwl/LogOperation.cc create mode 100644 src/librbd/cache/pwl/rwl/LogOperation.h create mode 100644 src/librbd/cache/pwl/rwl/ReadRequest.cc create mode 100644 src/librbd/cache/pwl/rwl/ReadRequest.h create mode 100644 src/librbd/cache/pwl/rwl/Request.cc create mode 100644 src/librbd/cache/pwl/rwl/Request.h create mode 100644 src/librbd/cache/pwl/rwl/WriteLog.cc create mode 100644 src/librbd/cache/pwl/rwl/WriteLog.h create mode 100644 src/librbd/cache/pwl/ssd/Builder.h create mode 100644 src/librbd/cache/pwl/ssd/LogEntry.cc create mode 100644 src/librbd/cache/pwl/ssd/LogEntry.h create mode 100644 src/librbd/cache/pwl/ssd/LogOperation.cc create mode 100644 src/librbd/cache/pwl/ssd/LogOperation.h create mode 100644 src/librbd/cache/pwl/ssd/ReadRequest.cc create mode 100644 src/librbd/cache/pwl/ssd/ReadRequest.h create mode 100644 src/librbd/cache/pwl/ssd/Request.cc create mode 100644 src/librbd/cache/pwl/ssd/Request.h create mode 100644 src/librbd/cache/pwl/ssd/Types.h create mode 100644 src/librbd/cache/pwl/ssd/WriteLog.cc create mode 100644 src/librbd/cache/pwl/ssd/WriteLog.h create mode 100644 
src/librbd/crypto/BlockCrypto.cc create mode 100644 src/librbd/crypto/BlockCrypto.h create mode 100644 src/librbd/crypto/CryptoContextPool.cc create mode 100644 src/librbd/crypto/CryptoContextPool.h create mode 100644 src/librbd/crypto/CryptoImageDispatch.cc create mode 100644 src/librbd/crypto/CryptoImageDispatch.h create mode 100644 src/librbd/crypto/CryptoInterface.h create mode 100644 src/librbd/crypto/CryptoObjectDispatch.cc create mode 100644 src/librbd/crypto/CryptoObjectDispatch.h create mode 100644 src/librbd/crypto/DataCryptor.h create mode 100644 src/librbd/crypto/EncryptionFormat.h create mode 100644 src/librbd/crypto/FormatRequest.cc create mode 100644 src/librbd/crypto/FormatRequest.h create mode 100644 src/librbd/crypto/LoadRequest.cc create mode 100644 src/librbd/crypto/LoadRequest.h create mode 100644 src/librbd/crypto/ShutDownCryptoRequest.cc create mode 100644 src/librbd/crypto/ShutDownCryptoRequest.h create mode 100644 src/librbd/crypto/Types.h create mode 100644 src/librbd/crypto/Utils.cc create mode 100644 src/librbd/crypto/Utils.h create mode 100644 src/librbd/crypto/luks/FlattenRequest.cc create mode 100644 src/librbd/crypto/luks/FlattenRequest.h create mode 100644 src/librbd/crypto/luks/FormatRequest.cc create mode 100644 src/librbd/crypto/luks/FormatRequest.h create mode 100644 src/librbd/crypto/luks/Header.cc create mode 100644 src/librbd/crypto/luks/Header.h create mode 100644 src/librbd/crypto/luks/LUKSEncryptionFormat.cc create mode 100644 src/librbd/crypto/luks/LUKSEncryptionFormat.h create mode 100644 src/librbd/crypto/luks/LoadRequest.cc create mode 100644 src/librbd/crypto/luks/LoadRequest.h create mode 100644 src/librbd/crypto/luks/Magic.cc create mode 100644 src/librbd/crypto/luks/Magic.h create mode 100644 src/librbd/crypto/openssl/DataCryptor.cc create mode 100644 src/librbd/crypto/openssl/DataCryptor.h create mode 100644 src/librbd/deep_copy/Handler.h create mode 100644 src/librbd/deep_copy/ImageCopyRequest.cc create mode 100644 src/librbd/deep_copy/ImageCopyRequest.h create mode 100644 src/librbd/deep_copy/MetadataCopyRequest.cc create mode 100644 src/librbd/deep_copy/MetadataCopyRequest.h create mode 100644 src/librbd/deep_copy/ObjectCopyRequest.cc create mode 100644 src/librbd/deep_copy/ObjectCopyRequest.h create mode 100644 src/librbd/deep_copy/SetHeadRequest.cc create mode 100644 src/librbd/deep_copy/SetHeadRequest.h create mode 100644 src/librbd/deep_copy/SnapshotCopyRequest.cc create mode 100644 src/librbd/deep_copy/SnapshotCopyRequest.h create mode 100644 src/librbd/deep_copy/SnapshotCreateRequest.cc create mode 100644 src/librbd/deep_copy/SnapshotCreateRequest.h create mode 100644 src/librbd/deep_copy/Types.h create mode 100644 src/librbd/deep_copy/Utils.cc create mode 100644 src/librbd/deep_copy/Utils.h create mode 100644 src/librbd/exclusive_lock/AutomaticPolicy.cc create mode 100644 src/librbd/exclusive_lock/AutomaticPolicy.h create mode 100644 src/librbd/exclusive_lock/ImageDispatch.cc create mode 100644 src/librbd/exclusive_lock/ImageDispatch.h create mode 100644 src/librbd/exclusive_lock/Policy.h create mode 100644 src/librbd/exclusive_lock/PostAcquireRequest.cc create mode 100644 src/librbd/exclusive_lock/PostAcquireRequest.h create mode 100644 src/librbd/exclusive_lock/PreAcquireRequest.cc create mode 100644 src/librbd/exclusive_lock/PreAcquireRequest.h create mode 100644 src/librbd/exclusive_lock/PreReleaseRequest.cc create mode 100644 src/librbd/exclusive_lock/PreReleaseRequest.h create mode 100644 
src/librbd/exclusive_lock/StandardPolicy.cc create mode 100644 src/librbd/exclusive_lock/StandardPolicy.h create mode 100644 src/librbd/image/AttachChildRequest.cc create mode 100644 src/librbd/image/AttachChildRequest.h create mode 100644 src/librbd/image/AttachParentRequest.cc create mode 100644 src/librbd/image/AttachParentRequest.h create mode 100644 src/librbd/image/CloneRequest.cc create mode 100644 src/librbd/image/CloneRequest.h create mode 100644 src/librbd/image/CloseRequest.cc create mode 100644 src/librbd/image/CloseRequest.h create mode 100644 src/librbd/image/CreateRequest.cc create mode 100644 src/librbd/image/CreateRequest.h create mode 100644 src/librbd/image/DetachChildRequest.cc create mode 100644 src/librbd/image/DetachChildRequest.h create mode 100644 src/librbd/image/DetachParentRequest.cc create mode 100644 src/librbd/image/DetachParentRequest.h create mode 100644 src/librbd/image/GetMetadataRequest.cc create mode 100644 src/librbd/image/GetMetadataRequest.h create mode 100644 src/librbd/image/ListWatchersRequest.cc create mode 100644 src/librbd/image/ListWatchersRequest.h create mode 100644 src/librbd/image/OpenRequest.cc create mode 100644 src/librbd/image/OpenRequest.h create mode 100644 src/librbd/image/PreRemoveRequest.cc create mode 100644 src/librbd/image/PreRemoveRequest.h create mode 100644 src/librbd/image/RefreshParentRequest.cc create mode 100644 src/librbd/image/RefreshParentRequest.h create mode 100644 src/librbd/image/RefreshRequest.cc create mode 100644 src/librbd/image/RefreshRequest.h create mode 100644 src/librbd/image/RemoveRequest.cc create mode 100644 src/librbd/image/RemoveRequest.h create mode 100644 src/librbd/image/SetFlagsRequest.cc create mode 100644 src/librbd/image/SetFlagsRequest.h create mode 100644 src/librbd/image/SetSnapRequest.cc create mode 100644 src/librbd/image/SetSnapRequest.h create mode 100644 src/librbd/image/TypeTraits.h create mode 100644 src/librbd/image/Types.h create mode 100644 src/librbd/image/ValidatePoolRequest.cc create mode 100644 src/librbd/image/ValidatePoolRequest.h create mode 100644 src/librbd/image_watcher/NotifyLockOwner.cc create mode 100644 src/librbd/image_watcher/NotifyLockOwner.h create mode 100644 src/librbd/internal.cc create mode 100644 src/librbd/internal.h create mode 100644 src/librbd/io/AioCompletion.cc create mode 100644 src/librbd/io/AioCompletion.h create mode 100644 src/librbd/io/AsyncOperation.cc create mode 100644 src/librbd/io/AsyncOperation.h create mode 100644 src/librbd/io/CopyupRequest.cc create mode 100644 src/librbd/io/CopyupRequest.h create mode 100644 src/librbd/io/Dispatcher.h create mode 100644 src/librbd/io/DispatcherInterface.h create mode 100644 src/librbd/io/FlushTracker.cc create mode 100644 src/librbd/io/FlushTracker.h create mode 100644 src/librbd/io/ImageDispatch.cc create mode 100644 src/librbd/io/ImageDispatch.h create mode 100644 src/librbd/io/ImageDispatchInterface.h create mode 100644 src/librbd/io/ImageDispatchSpec.cc create mode 100644 src/librbd/io/ImageDispatchSpec.h create mode 100644 src/librbd/io/ImageDispatcher.cc create mode 100644 src/librbd/io/ImageDispatcher.h create mode 100644 src/librbd/io/ImageDispatcherInterface.h create mode 100644 src/librbd/io/ImageRequest.cc create mode 100644 src/librbd/io/ImageRequest.h create mode 100644 src/librbd/io/IoOperations.cc create mode 100644 src/librbd/io/IoOperations.h create mode 100644 src/librbd/io/ObjectDispatch.cc create mode 100644 src/librbd/io/ObjectDispatch.h create mode 100644 
src/librbd/io/ObjectDispatchInterface.h create mode 100644 src/librbd/io/ObjectDispatchSpec.cc create mode 100644 src/librbd/io/ObjectDispatchSpec.h create mode 100644 src/librbd/io/ObjectDispatcher.cc create mode 100644 src/librbd/io/ObjectDispatcher.h create mode 100644 src/librbd/io/ObjectDispatcherInterface.h create mode 100644 src/librbd/io/ObjectRequest.cc create mode 100644 src/librbd/io/ObjectRequest.h create mode 100644 src/librbd/io/QosImageDispatch.cc create mode 100644 src/librbd/io/QosImageDispatch.h create mode 100644 src/librbd/io/QueueImageDispatch.cc create mode 100644 src/librbd/io/QueueImageDispatch.h create mode 100644 src/librbd/io/ReadResult.cc create mode 100644 src/librbd/io/ReadResult.h create mode 100644 src/librbd/io/RefreshImageDispatch.cc create mode 100644 src/librbd/io/RefreshImageDispatch.h create mode 100644 src/librbd/io/SimpleSchedulerObjectDispatch.cc create mode 100644 src/librbd/io/SimpleSchedulerObjectDispatch.h create mode 100644 src/librbd/io/TypeTraits.h create mode 100644 src/librbd/io/Types.cc create mode 100644 src/librbd/io/Types.h create mode 100644 src/librbd/io/Utils.cc create mode 100644 src/librbd/io/Utils.h create mode 100644 src/librbd/io/WriteBlockImageDispatch.cc create mode 100644 src/librbd/io/WriteBlockImageDispatch.h create mode 100644 src/librbd/journal/CreateRequest.cc create mode 100644 src/librbd/journal/CreateRequest.h create mode 100644 src/librbd/journal/DemoteRequest.cc create mode 100644 src/librbd/journal/DemoteRequest.h create mode 100644 src/librbd/journal/DisabledPolicy.h create mode 100644 src/librbd/journal/ObjectDispatch.cc create mode 100644 src/librbd/journal/ObjectDispatch.h create mode 100644 src/librbd/journal/OpenRequest.cc create mode 100644 src/librbd/journal/OpenRequest.h create mode 100644 src/librbd/journal/Policy.h create mode 100644 src/librbd/journal/PromoteRequest.cc create mode 100644 src/librbd/journal/PromoteRequest.h create mode 100644 src/librbd/journal/RemoveRequest.cc create mode 100644 src/librbd/journal/RemoveRequest.h create mode 100644 src/librbd/journal/Replay.cc create mode 100644 src/librbd/journal/Replay.h create mode 100644 src/librbd/journal/ResetRequest.cc create mode 100644 src/librbd/journal/ResetRequest.h create mode 100644 src/librbd/journal/StandardPolicy.cc create mode 100644 src/librbd/journal/StandardPolicy.h create mode 100644 src/librbd/journal/TypeTraits.h create mode 100644 src/librbd/journal/Types.cc create mode 100644 src/librbd/journal/Types.h create mode 100644 src/librbd/journal/Utils.cc create mode 100644 src/librbd/journal/Utils.h create mode 100644 src/librbd/librbd.cc create mode 100644 src/librbd/managed_lock/AcquireRequest.cc create mode 100644 src/librbd/managed_lock/AcquireRequest.h create mode 100644 src/librbd/managed_lock/BreakRequest.cc create mode 100644 src/librbd/managed_lock/BreakRequest.h create mode 100644 src/librbd/managed_lock/GetLockerRequest.cc create mode 100644 src/librbd/managed_lock/GetLockerRequest.h create mode 100644 src/librbd/managed_lock/ReacquireRequest.cc create mode 100644 src/librbd/managed_lock/ReacquireRequest.h create mode 100644 src/librbd/managed_lock/ReleaseRequest.cc create mode 100644 src/librbd/managed_lock/ReleaseRequest.h create mode 100644 src/librbd/managed_lock/Types.h create mode 100644 src/librbd/managed_lock/Utils.cc create mode 100644 src/librbd/managed_lock/Utils.h create mode 100644 src/librbd/migration/FileStream.cc create mode 100644 src/librbd/migration/FileStream.h create mode 100644 
src/librbd/migration/FormatInterface.h create mode 100644 src/librbd/migration/HttpClient.cc create mode 100644 src/librbd/migration/HttpClient.h create mode 100644 src/librbd/migration/HttpProcessorInterface.h create mode 100644 src/librbd/migration/HttpStream.cc create mode 100644 src/librbd/migration/HttpStream.h create mode 100644 src/librbd/migration/ImageDispatch.cc create mode 100644 src/librbd/migration/ImageDispatch.h create mode 100644 src/librbd/migration/NativeFormat.cc create mode 100644 src/librbd/migration/NativeFormat.h create mode 100644 src/librbd/migration/OpenSourceImageRequest.cc create mode 100644 src/librbd/migration/OpenSourceImageRequest.h create mode 100644 src/librbd/migration/QCOW.h create mode 100644 src/librbd/migration/QCOWFormat.cc create mode 100644 src/librbd/migration/QCOWFormat.h create mode 100644 src/librbd/migration/RawFormat.cc create mode 100644 src/librbd/migration/RawFormat.h create mode 100644 src/librbd/migration/RawSnapshot.cc create mode 100644 src/librbd/migration/RawSnapshot.h create mode 100644 src/librbd/migration/S3Stream.cc create mode 100644 src/librbd/migration/S3Stream.h create mode 100644 src/librbd/migration/SnapshotInterface.h create mode 100644 src/librbd/migration/SourceSpecBuilder.cc create mode 100644 src/librbd/migration/SourceSpecBuilder.h create mode 100644 src/librbd/migration/StreamInterface.h create mode 100644 src/librbd/migration/Types.h create mode 100644 src/librbd/migration/Utils.cc create mode 100644 src/librbd/migration/Utils.h create mode 100644 src/librbd/mirror/DemoteRequest.cc create mode 100644 src/librbd/mirror/DemoteRequest.h create mode 100644 src/librbd/mirror/DisableRequest.cc create mode 100644 src/librbd/mirror/DisableRequest.h create mode 100644 src/librbd/mirror/EnableRequest.cc create mode 100644 src/librbd/mirror/EnableRequest.h create mode 100644 src/librbd/mirror/GetInfoRequest.cc create mode 100644 src/librbd/mirror/GetInfoRequest.h create mode 100644 src/librbd/mirror/GetStatusRequest.cc create mode 100644 src/librbd/mirror/GetStatusRequest.h create mode 100644 src/librbd/mirror/GetUuidRequest.cc create mode 100644 src/librbd/mirror/GetUuidRequest.h create mode 100644 src/librbd/mirror/ImageRemoveRequest.cc create mode 100644 src/librbd/mirror/ImageRemoveRequest.h create mode 100644 src/librbd/mirror/ImageStateUpdateRequest.cc create mode 100644 src/librbd/mirror/ImageStateUpdateRequest.h create mode 100644 src/librbd/mirror/PromoteRequest.cc create mode 100644 src/librbd/mirror/PromoteRequest.h create mode 100644 src/librbd/mirror/Types.h create mode 100644 src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc create mode 100644 src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h create mode 100644 src/librbd/mirror/snapshot/CreatePrimaryRequest.cc create mode 100644 src/librbd/mirror/snapshot/CreatePrimaryRequest.h create mode 100644 src/librbd/mirror/snapshot/DemoteRequest.cc create mode 100644 src/librbd/mirror/snapshot/DemoteRequest.h create mode 100644 src/librbd/mirror/snapshot/GetImageStateRequest.cc create mode 100644 src/librbd/mirror/snapshot/GetImageStateRequest.h create mode 100644 src/librbd/mirror/snapshot/ImageMeta.cc create mode 100644 src/librbd/mirror/snapshot/ImageMeta.h create mode 100644 src/librbd/mirror/snapshot/PromoteRequest.cc create mode 100644 src/librbd/mirror/snapshot/PromoteRequest.h create mode 100644 src/librbd/mirror/snapshot/RemoveImageStateRequest.cc create mode 100644 src/librbd/mirror/snapshot/RemoveImageStateRequest.h create mode 100644 
src/librbd/mirror/snapshot/SetImageStateRequest.cc create mode 100644 src/librbd/mirror/snapshot/SetImageStateRequest.h create mode 100644 src/librbd/mirror/snapshot/Types.cc create mode 100644 src/librbd/mirror/snapshot/Types.h create mode 100644 src/librbd/mirror/snapshot/UnlinkPeerRequest.cc create mode 100644 src/librbd/mirror/snapshot/UnlinkPeerRequest.h create mode 100644 src/librbd/mirror/snapshot/Utils.cc create mode 100644 src/librbd/mirror/snapshot/Utils.h create mode 100644 src/librbd/mirror/snapshot/WriteImageStateRequest.cc create mode 100644 src/librbd/mirror/snapshot/WriteImageStateRequest.h create mode 100644 src/librbd/mirroring_watcher/Types.cc create mode 100644 src/librbd/mirroring_watcher/Types.h create mode 100644 src/librbd/object_map/CreateRequest.cc create mode 100644 src/librbd/object_map/CreateRequest.h create mode 100644 src/librbd/object_map/DiffRequest.cc create mode 100644 src/librbd/object_map/DiffRequest.h create mode 100644 src/librbd/object_map/InvalidateRequest.cc create mode 100644 src/librbd/object_map/InvalidateRequest.h create mode 100644 src/librbd/object_map/LockRequest.cc create mode 100644 src/librbd/object_map/LockRequest.h create mode 100644 src/librbd/object_map/RefreshRequest.cc create mode 100644 src/librbd/object_map/RefreshRequest.h create mode 100644 src/librbd/object_map/RemoveRequest.cc create mode 100644 src/librbd/object_map/RemoveRequest.h create mode 100644 src/librbd/object_map/Request.cc create mode 100644 src/librbd/object_map/Request.h create mode 100644 src/librbd/object_map/ResizeRequest.cc create mode 100644 src/librbd/object_map/ResizeRequest.h create mode 100644 src/librbd/object_map/SnapshotCreateRequest.cc create mode 100644 src/librbd/object_map/SnapshotCreateRequest.h create mode 100644 src/librbd/object_map/SnapshotRemoveRequest.cc create mode 100644 src/librbd/object_map/SnapshotRemoveRequest.h create mode 100644 src/librbd/object_map/SnapshotRollbackRequest.cc create mode 100644 src/librbd/object_map/SnapshotRollbackRequest.h create mode 100644 src/librbd/object_map/Types.h create mode 100644 src/librbd/object_map/UnlockRequest.cc create mode 100644 src/librbd/object_map/UnlockRequest.h create mode 100644 src/librbd/object_map/UpdateRequest.cc create mode 100644 src/librbd/object_map/UpdateRequest.h create mode 100644 src/librbd/operation/DisableFeaturesRequest.cc create mode 100644 src/librbd/operation/DisableFeaturesRequest.h create mode 100644 src/librbd/operation/EnableFeaturesRequest.cc create mode 100644 src/librbd/operation/EnableFeaturesRequest.h create mode 100644 src/librbd/operation/FlattenRequest.cc create mode 100644 src/librbd/operation/FlattenRequest.h create mode 100644 src/librbd/operation/MetadataRemoveRequest.cc create mode 100644 src/librbd/operation/MetadataRemoveRequest.h create mode 100644 src/librbd/operation/MetadataSetRequest.cc create mode 100644 src/librbd/operation/MetadataSetRequest.h create mode 100644 src/librbd/operation/MigrateRequest.cc create mode 100644 src/librbd/operation/MigrateRequest.h create mode 100644 src/librbd/operation/ObjectMapIterate.cc create mode 100644 src/librbd/operation/ObjectMapIterate.h create mode 100644 src/librbd/operation/RebuildObjectMapRequest.cc create mode 100644 src/librbd/operation/RebuildObjectMapRequest.h create mode 100644 src/librbd/operation/RenameRequest.cc create mode 100644 src/librbd/operation/RenameRequest.h create mode 100644 src/librbd/operation/Request.cc create mode 100644 src/librbd/operation/Request.h create mode 100644 
src/librbd/operation/ResizeRequest.cc create mode 100644 src/librbd/operation/ResizeRequest.h create mode 100644 src/librbd/operation/SnapshotCreateRequest.cc create mode 100644 src/librbd/operation/SnapshotCreateRequest.h create mode 100644 src/librbd/operation/SnapshotLimitRequest.cc create mode 100644 src/librbd/operation/SnapshotLimitRequest.h create mode 100644 src/librbd/operation/SnapshotProtectRequest.cc create mode 100644 src/librbd/operation/SnapshotProtectRequest.h create mode 100644 src/librbd/operation/SnapshotRemoveRequest.cc create mode 100644 src/librbd/operation/SnapshotRemoveRequest.h create mode 100644 src/librbd/operation/SnapshotRenameRequest.cc create mode 100644 src/librbd/operation/SnapshotRenameRequest.h create mode 100644 src/librbd/operation/SnapshotRollbackRequest.cc create mode 100644 src/librbd/operation/SnapshotRollbackRequest.h create mode 100644 src/librbd/operation/SnapshotUnprotectRequest.cc create mode 100644 src/librbd/operation/SnapshotUnprotectRequest.h create mode 100644 src/librbd/operation/SparsifyRequest.cc create mode 100644 src/librbd/operation/SparsifyRequest.h create mode 100644 src/librbd/operation/TrimRequest.cc create mode 100644 src/librbd/operation/TrimRequest.h create mode 100644 src/librbd/plugin/Api.cc create mode 100644 src/librbd/plugin/Api.h create mode 100644 src/librbd/plugin/ParentCache.cc create mode 100644 src/librbd/plugin/ParentCache.h create mode 100644 src/librbd/plugin/Types.h create mode 100644 src/librbd/plugin/WriteLogImageCache.cc create mode 100644 src/librbd/plugin/WriteLogImageCache.h create mode 100644 src/librbd/trash/MoveRequest.cc create mode 100644 src/librbd/trash/MoveRequest.h create mode 100644 src/librbd/trash/RemoveRequest.cc create mode 100644 src/librbd/trash/RemoveRequest.h create mode 100644 src/librbd/trash_watcher/Types.cc create mode 100644 src/librbd/trash_watcher/Types.h create mode 100644 src/librbd/watcher/Notifier.cc create mode 100644 src/librbd/watcher/Notifier.h create mode 100644 src/librbd/watcher/RewatchRequest.cc create mode 100644 src/librbd/watcher/RewatchRequest.h create mode 100644 src/librbd/watcher/Types.cc create mode 100644 src/librbd/watcher/Types.h create mode 100644 src/librbd/watcher/Utils.h

diff --git a/src/librbd/AsioEngine.cc b/src/librbd/AsioEngine.cc
new file mode 100644
index 000000000..8e2beb49c
--- /dev/null
+++ b/src/librbd/AsioEngine.cc
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/AsioEngine.h"
+#include "include/Context.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "librbd/asio/ContextWQ.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::AsioEngine: " \
+                           << this << " " << __func__ << ": "
+
+namespace librbd {
+
+AsioEngine::AsioEngine(std::shared_ptr<librados::Rados> rados)
+  : m_rados_api(std::make_shared<neorados::RADOS>(
+      neorados::RADOS::make_with_librados(*rados))),
+    m_cct(m_rados_api->cct()),
+    m_io_context(m_rados_api->get_io_context()),
+    m_api_strand(std::make_unique<boost::asio::io_context::strand>(
+      m_io_context)),
+    m_context_wq(std::make_unique<asio::ContextWQ>(m_cct, m_io_context)) {
+  ldout(m_cct, 20) << dendl;
+
+  auto rados_threads = m_cct->_conf.get_val<uint64_t>("librados_thread_count");
+  auto rbd_threads = m_cct->_conf.get_val<uint64_t>("rbd_op_threads");
+  if (rbd_threads > rados_threads) {
+    // inherit the librados thread count -- but increase it if librbd wants to
+    // utilize more threads
+    m_cct->_conf.set_val_or_die("librados_thread_count",
+                                std::to_string(rbd_threads));
+    m_cct->_conf.apply_changes(nullptr);
+  }
+}
+
+AsioEngine::AsioEngine(librados::IoCtx& io_ctx)
+  : AsioEngine(std::make_shared<librados::Rados>(io_ctx)) {
+}
+
+AsioEngine::~AsioEngine() {
+  ldout(m_cct, 20) << dendl;
+  m_api_strand.reset();
+}
+
+void AsioEngine::dispatch(Context* ctx, int r) {
+  dispatch([ctx, r]() { ctx->complete(r); });
+}
+
+void AsioEngine::post(Context* ctx, int r) {
+  post([ctx, r]() { ctx->complete(r); });
+}
+
+} // namespace librbd
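A note on the constructor above: AsioEngine deliberately reuses the librados
ASIO thread pool instead of creating its own, and only raises
"librados_thread_count" when "rbd_op_threads" asks for more parallelism. As an
illustration (the values are hypothetical, not defaults), a client configured
as

    [client]
        librados_thread_count = 2
        rbd_op_threads = 4

would have librados_thread_count bumped to 4 at runtime by the code above; the
reverse configuration is left untouched.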
diff --git a/src/librbd/AsioEngine.h b/src/librbd/AsioEngine.h
new file mode 100644
index 000000000..0f476d80b
--- /dev/null
+++ b/src/librbd/AsioEngine.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_ASIO_ENGINE_H
+#define CEPH_LIBRBD_ASIO_ENGINE_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include <memory>
+#include <boost/asio/dispatch.hpp>
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/io_context_strand.hpp>
+#include <boost/asio/post.hpp>
+
+struct Context;
+namespace neorados { struct RADOS; }
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+class AsioEngine {
+public:
+  explicit AsioEngine(std::shared_ptr<librados::Rados> rados);
+  explicit AsioEngine(librados::IoCtx& io_ctx);
+  ~AsioEngine();
+
+  AsioEngine(AsioEngine&&) = delete;
+  AsioEngine(const AsioEngine&) = delete;
+  AsioEngine& operator=(const AsioEngine&) = delete;
+
+  inline neorados::RADOS& get_rados_api() {
+    return *m_rados_api;
+  }
+
+  inline boost::asio::io_context& get_io_context() {
+    return m_io_context;
+  }
+  inline operator boost::asio::io_context&() {
+    return m_io_context;
+  }
+
+  using executor_type = boost::asio::io_context::executor_type;
+  inline executor_type get_executor() {
+    return m_io_context.get_executor();
+  }
+
+  inline boost::asio::io_context::strand& get_api_strand() {
+    // API client callbacks should never fire concurrently
+    return *m_api_strand;
+  }
+
+  inline asio::ContextWQ* get_work_queue() {
+    return m_context_wq.get();
+  }
+
+  template <typename T>
+  void dispatch(T&& t) {
+    boost::asio::dispatch(m_io_context, std::forward<T>(t));
+  }
+  void dispatch(Context* ctx, int r);
+
+  template <typename T>
+  void post(T&& t) {
+    boost::asio::post(m_io_context, std::forward<T>(t));
+  }
+  void post(Context* ctx, int r);
+
+private:
+  std::shared_ptr<neorados::RADOS> m_rados_api;
+  CephContext* m_cct;
+
+  boost::asio::io_context& m_io_context;
+  std::unique_ptr<boost::asio::io_context::strand> m_api_strand;
+  std::unique_ptr<asio::ContextWQ> m_context_wq;
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_ASIO_ENGINE_H
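For orientation, a minimal sketch of how the engine's two submission paths
might be driven from client code; it assumes an already-connected
librados::Rados handle and uses Ceph's LambdaContext helper, and is not part
of the patch:

    #include "librbd/AsioEngine.h"
    #include "include/Context.h"

    void example(std::shared_ptr<librados::Rados> rados) {
      librbd::AsioEngine engine(rados);

      // post() always queues the work onto the shared io_context pool
      engine.post([]() { /* runs later on an ASIO worker thread */ });

      // dispatch() may run inline if the caller is already on a worker;
      // the Context overload completes the context with the supplied code
      engine.dispatch(new LambdaContext([](int r) { /* r == 0 here */ }), 0);
    }

Both overloads funnel into boost::asio::post()/dispatch() on the io_context,
so the ordering guarantees are ASIO's, not those of a dedicated librbd queue;
callbacks that must not run concurrently go through get_api_strand() instead.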
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
new file mode 100644
index 000000000..e0fcefff1
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+
+namespace librbd
+{
+
+template <typename T>
+AsyncObjectThrottle<T>::AsyncObjectThrottle(
+    const AsyncRequest<T>* async_request, T &image_ctx,
+    const ContextFactory& context_factory, Context *ctx,
+    ProgressContext *prog_ctx, uint64_t object_no, uint64_t end_object_no)
+  : m_lock(ceph::make_mutex(
+      util::unique_lock_name("librbd::AsyncThrottle::m_lock", this))),
+    m_async_request(async_request), m_image_ctx(image_ctx),
+    m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
+    m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0),
+    m_ret(0)
+{
+}
+
+template <typename T>
+void AsyncObjectThrottle<T>::start_ops(uint64_t max_concurrent) {
+  ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock));
+  bool complete;
+  {
+    std::lock_guard l{m_lock};
+    for (uint64_t i = 0; i < max_concurrent; ++i) {
+      start_next_op();
+      if (m_ret < 0 && m_current_ops == 0) {
+        break;
+      }
+    }
+    complete = (m_current_ops == 0);
+  }
+  if (complete) {
+    // avoid re-entrant callback
+    m_image_ctx.op_work_queue->queue(m_ctx, m_ret);
+    delete this;
+  }
+}
+
+template <typename T>
+void AsyncObjectThrottle<T>::finish_op(int r) {
+  bool complete;
+  {
+    std::shared_lock owner_locker{m_image_ctx.owner_lock};
+    std::lock_guard locker{m_lock};
+    --m_current_ops;
+    if (r < 0 && r != -ENOENT && m_ret == 0) {
+      m_ret = r;
+    }
+
+    start_next_op();
+    complete = (m_current_ops == 0);
+  }
+  if (complete) {
+    m_ctx->complete(m_ret);
+    delete this;
+  }
+}
+
+template <typename T>
+void AsyncObjectThrottle<T>::start_next_op() {
+  bool done = false;
+  while (!done) {
+    if (m_async_request != NULL && m_async_request->is_canceled() &&
+        m_ret == 0) {
+      // allow in-flight ops to complete, but don't start new ops
+      m_ret = -ERESTART;
+      return;
+    } else if (m_ret != 0 || m_object_no >= m_end_object_no) {
+      return;
+    }
+
+    uint64_t ono = m_object_no++;
+    C_AsyncObjectThrottle<T> *ctx = m_context_factory(*this, ono);
+
+    int r = ctx->send();
+    if (r < 0) {
+      m_ret = r;
+      delete ctx;
+      return;
+    } else if (r > 0) {
+      // op completed immediately
+      delete ctx;
+    } else {
+      ++m_current_ops;
+      done = true;
+    }
+    if (m_prog_ctx != NULL) {
+      r = m_prog_ctx->update_progress(ono, m_end_object_no);
+      if (r < 0) {
+        m_ret = r;
+      }
+    }
+  }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
new file mode 100644
index 000000000..64397f9e4
--- /dev/null
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+#define CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+
+#include <boost/function.hpp>
+
+namespace librbd
+{
+template <typename ImageCtxT> class AsyncRequest;
+class ProgressContext;
+struct ImageCtx;
+
+class AsyncObjectThrottleFinisher {
+public:
+  virtual ~AsyncObjectThrottleFinisher() {};
+  virtual void finish_op(int r) = 0;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class C_AsyncObjectThrottle : public Context {
+public:
+  C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
+                        ImageCtxT &image_ctx)
+    : m_image_ctx(image_ctx), m_finisher(finisher) {
+  }
+
+  virtual int send() = 0;
+
+protected:
+  ImageCtxT &m_image_ctx;
+
+  void finish(int r) override {
+    m_finisher.finish_op(r);
+  }
+
+private:
+  AsyncObjectThrottleFinisher &m_finisher;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class AsyncObjectThrottle : public AsyncObjectThrottleFinisher {
+public:
+  typedef boost::function<
+    C_AsyncObjectThrottle<ImageCtxT>* (AsyncObjectThrottle&,
+                                       uint64_t)> ContextFactory;
+
+  AsyncObjectThrottle(const AsyncRequest<ImageCtxT> *async_request,
+                      ImageCtxT &image_ctx,
+                      const ContextFactory& context_factory, Context *ctx,
+                      ProgressContext *prog_ctx, uint64_t object_no,
+                      uint64_t end_object_no);
+
+  void start_ops(uint64_t max_concurrent);
+  void finish_op(int r) override;
+
+private:
+  ceph::mutex m_lock;
+  const AsyncRequest<ImageCtxT> *m_async_request;
+  ImageCtxT &m_image_ctx;
+  ContextFactory m_context_factory;
+  Context *m_ctx;
+  ProgressContext *m_prog_ctx;
+  uint64_t m_object_no;
+  uint64_t m_end_object_no;
+  uint64_t m_current_ops;
+  int m_ret;
+
+  void start_next_op();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_ASYNC_OBJECT_THROTTLE_H
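How the pieces fit together: the per-object operation is a subclass of
C_AsyncObjectThrottle, and the throttle pulls fresh ops from a factory until
the object range is exhausted. A compressed sketch follows; C_TouchObject,
num_objects and max_concurrent are illustrative names rather than librbd
symbols, and error handling is elided. start_ops() asserts that owner_lock is
held, and the throttle deletes itself once the last op drains:

    #include "librbd/AsyncObjectThrottle.h"
    #include "librbd/ImageCtx.h"
    #include <shared_mutex>

    // hypothetical per-object operation
    struct C_TouchObject : public librbd::C_AsyncObjectThrottle<> {
      uint64_t object_no;

      C_TouchObject(librbd::AsyncObjectThrottle<> &throttle,
                    librbd::ImageCtx &image_ctx, uint64_t object_no)
        : librbd::C_AsyncObjectThrottle<>(throttle, image_ctx),
          object_no(object_no) {}

      int send() override {
        // issue the per-object aio call here, completing into complete(r),
        // and return 0 once it is in flight
        return 1;  // >0 == finished synchronously, nothing to do
      }
    };

    void touch_all(librbd::ImageCtx &image_ctx, Context *on_finish,
                   uint64_t num_objects, uint64_t max_concurrent) {
      librbd::AsyncObjectThrottle<>::ContextFactory factory =
        [&image_ctx](librbd::AsyncObjectThrottle<> &throttle,
                     uint64_t object_no) {
          return new C_TouchObject(throttle, image_ctx, object_no);
        };
      auto *throttle = new librbd::AsyncObjectThrottle<>(
        nullptr /* no parent request */, image_ctx, factory, on_finish,
        nullptr /* no progress */, 0, num_objects);
      std::shared_lock owner_locker{image_ctx.owner_lock};
      throttle->start_ops(max_concurrent);  // frees itself when done
    }

Note in finish_op() above that -ENOENT is tolerated: per-object operations
routinely target objects that have never been written.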
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
new file mode 100644
index 000000000..c189613d0
--- /dev/null
+++ b/src/librbd/AsyncRequest.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+
+namespace librbd
+{
+
+template <typename T>
+AsyncRequest<T>::AsyncRequest(T &image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx), m_on_finish(on_finish), m_canceled(false),
+    m_xlist_item(this) {
+  ceph_assert(m_on_finish != NULL);
+  start_request();
+}
+
+template <typename T>
+AsyncRequest<T>::~AsyncRequest() {
+}
+
+template <typename T>
+void AsyncRequest<T>::async_complete(int r) {
+  m_image_ctx.op_work_queue->queue(create_callback_context(), r);
+}
+
+template <typename T>
+librados::AioCompletion *AsyncRequest<T>::create_callback_completion() {
+  return util::create_rados_callback(this);
+}
+
+template <typename T>
+Context *AsyncRequest<T>::create_callback_context() {
+  return util::create_context_callback(this);
+}
+
+template <typename T>
+Context *AsyncRequest<T>::create_async_callback_context() {
+  return util::create_context_callback<AsyncRequest<T>,
+                                       &AsyncRequest<T>::async_complete>(this);
+}
+
+template <typename T>
+void AsyncRequest<T>::start_request() {
+  std::lock_guard async_ops_locker{m_image_ctx.async_ops_lock};
+  m_image_ctx.async_requests.push_back(&m_xlist_item);
+}
+
+template <typename T>
+void AsyncRequest<T>::finish_request() {
+  decltype(m_image_ctx.async_requests_waiters) waiters;
+  {
+    std::lock_guard async_ops_locker{m_image_ctx.async_ops_lock};
+    ceph_assert(m_xlist_item.remove_myself());
+
+    if (m_image_ctx.async_requests.empty()) {
+      waiters = std::move(m_image_ctx.async_requests_waiters);
+    }
+  }
+
+  for (auto ctx : waiters) {
+    ctx->complete(0);
+  }
+}
+
+} // namespace librbd
+
+#ifndef TEST_F
+template class librbd::AsyncRequest<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
new file mode 100644
index 000000000..f74368dc6
--- /dev/null
+++ b/src/librbd/AsyncRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_ASYNC_REQUEST_H
+#define CEPH_LIBRBD_ASYNC_REQUEST_H
+
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "include/xlist.h"
+#include "include/compat.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class AsyncRequest
+{
+public:
+  AsyncRequest(ImageCtxT &image_ctx, Context *on_finish);
+  virtual ~AsyncRequest();
+
+  void complete(int r) {
+    if (should_complete(r)) {
+      r = filter_return_code(r);
+      finish_and_destroy(r);
+    }
+  }
+
+  virtual void send() = 0;
+
+  inline bool is_canceled() const {
+    return m_canceled;
+  }
+  inline void cancel() {
+    m_canceled = true;
+  }
+
+protected:
+  ImageCtxT &m_image_ctx;
+
+  librados::AioCompletion *create_callback_completion();
+  Context *create_callback_context();
+  Context *create_async_callback_context();
+
+  void async_complete(int r);
+
+  virtual bool should_complete(int r) = 0;
+  virtual int filter_return_code(int r) const {
+    return r;
+  }
+
+  // NOTE: temporary until converted to new state machine format
+  virtual void finish_and_destroy(int r) {
+    finish(r);
+    delete this;
+  }
+
+  virtual void finish(int r) {
+    finish_request();
+    m_on_finish->complete(r);
+  }
+
+private:
+  Context *m_on_finish;
+  bool m_canceled;
+  typename xlist<AsyncRequest<ImageCtxT> *>::item m_xlist_item;
+
+  void start_request();
+  void finish_request();
+};
+
+} // namespace librbd
+
+extern template class librbd::AsyncRequest<librbd::ImageCtx>;
+
+#endif //CEPH_LIBRBD_ASYNC_REQUEST_H
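The contract in practice: send() issues the first async step, each completion
funnels into complete(r), and should_complete() decides whether the chain is
finished. A skeletal subclass, with C_ExampleRequest an illustrative name
only:

    #include "librbd/AsyncRequest.h"
    #include "librbd/ImageCtx.h"

    class C_ExampleRequest : public librbd::AsyncRequest<> {
    public:
      C_ExampleRequest(librbd::ImageCtx &image_ctx, Context *on_finish)
        : librbd::AsyncRequest<>(image_ctx, on_finish) {}

      void send() override {
        // kick off the first async step; its completion should target
        // create_callback_context(), which routes back into complete(r)
      }

    protected:
      bool should_complete(int r) override {
        // return false to issue another step, true to finish the request
        return true;
      }
    };

Construction registers the request on the image's async_requests list (see
start_request() above), and finish() both unregisters it and fires on_finish,
so the base class carries the bookkeeping needed for cancellation and orderly
image shutdown.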
m_canceled; + typename xlist *>::item m_xlist_item; + + void start_request(); + void finish_request(); +}; + +} // namespace librbd + +extern template class librbd::AsyncRequest; + +#endif //CEPH_LIBRBD_ASYNC_REQUEST_H diff --git a/src/librbd/BlockGuard.h b/src/librbd/BlockGuard.h new file mode 100644 index 000000000..1e56a6eed --- /dev/null +++ b/src/librbd/BlockGuard.h @@ -0,0 +1,177 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_BLOCK_GUARD_H +#define CEPH_LIBRBD_IO_BLOCK_GUARD_H + +#include "include/int_types.h" +#include "common/dout.h" +#include "common/ceph_mutex.h" +#include +#include +#include +#include +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::BlockGuard: " << this << " " \ + << __func__ << ": " + +namespace librbd { + +struct BlockExtent { + // [block_start, block_end) + uint64_t block_start = 0; + uint64_t block_end = 0; + + BlockExtent() { + } + BlockExtent(uint64_t block_start, uint64_t block_end) + : block_start(block_start), block_end(block_end) { + } + + friend std::ostream& operator<< (std::ostream& os, const BlockExtent& block_extent) { + os << "[block_start=" << block_extent.block_start + << ", block_end=" << block_extent.block_end << "]"; + return os; + } +}; + +struct BlockGuardCell { +}; + +/** + * Helper class to restrict and order concurrent IO to the same block. The + * definition of a block is dependent upon the user of this class. It might + * represent a backing object, 512 byte sectors, etc. + */ +template +class BlockGuard { +private: + struct DetainedBlockExtent; + +public: + typedef std::list BlockOperations; + + BlockGuard(CephContext *cct) + : m_cct(cct) { + } + + BlockGuard(const BlockGuard&) = delete; + BlockGuard &operator=(const BlockGuard&) = delete; + + /** + * Detain future IO for a range of blocks. the guard will keep + * ownership of the provided operation if the operation is blocked. + * @return 0 upon success and IO can be issued + * >0 if the IO is blocked, + * <0 upon error + */ + int detain(const BlockExtent &block_extent, BlockOperation *block_operation, + BlockGuardCell **cell) { + std::lock_guard locker{m_lock}; + ldout(m_cct, 20) << block_extent + << ", free_slots=" + << m_free_detained_block_extents.size() + << dendl; + + DetainedBlockExtent *detained_block_extent; + auto it = m_detained_block_extents.find(block_extent); + if (it != m_detained_block_extents.end()) { + // request against an already detained block + detained_block_extent = &(*it); + if (block_operation != nullptr) { + detained_block_extent->block_operations.emplace_back( + std::move(*block_operation)); + } + + // alert the caller that the IO was detained + *cell = nullptr; + return detained_block_extent->block_operations.size(); + } else { + if (!m_free_detained_block_extents.empty()) { + detained_block_extent = &m_free_detained_block_extents.front(); + detained_block_extent->block_operations.clear(); + m_free_detained_block_extents.pop_front(); + } else { + ldout(m_cct, 20) << "no free detained block cells" << dendl; + m_detained_block_extent_pool.emplace_back(); + detained_block_extent = &m_detained_block_extent_pool.back(); + } + + detained_block_extent->block_extent = block_extent; + m_detained_block_extents.insert(*detained_block_extent); + *cell = reinterpret_cast(detained_block_extent); + return 0; + } + } + + /** + * Release any detained IO operations from the provided cell. 
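+   *
+   * Illustrative detain/release cycle (editorial sketch; guard, op and
+   * issue_io are hypothetical):
+   *   BlockGuardCell *cell = nullptr;
+   *   if (guard.detain({blk, blk + 1}, &op, &cell) == 0) {
+   *     issue_io();                          // not blocked; caller owns cell
+   *     BlockOperations blocked_ops;
+   *     guard.release(cell, &blocked_ops);   // then requeue blocked_ops
+   *   }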
+ */ + void release(BlockGuardCell *cell, BlockOperations *block_operations) { + std::lock_guard locker{m_lock}; + + ceph_assert(cell != nullptr); + auto &detained_block_extent = reinterpret_cast( + *cell); + ldout(m_cct, 20) << detained_block_extent.block_extent + << ", pending_ops=" + << detained_block_extent.block_operations.size() + << dendl; + + *block_operations = std::move(detained_block_extent.block_operations); + m_detained_block_extents.erase(detained_block_extent.block_extent); + m_free_detained_block_extents.push_back(detained_block_extent); + } + +private: + struct DetainedBlockExtent : public boost::intrusive::list_base_hook<>, + public boost::intrusive::set_base_hook<> { + BlockExtent block_extent; + BlockOperations block_operations; + }; + + struct DetainedBlockExtentKey { + typedef BlockExtent type; + const BlockExtent &operator()(const DetainedBlockExtent &value) { + return value.block_extent; + } + }; + + struct DetainedBlockExtentCompare { + bool operator()(const BlockExtent &lhs, + const BlockExtent &rhs) const { + // check for range overlap (lhs < rhs) + if (lhs.block_end <= rhs.block_start) { + return true; + } + return false; + } + }; + + typedef std::deque DetainedBlockExtentsPool; + typedef boost::intrusive::list DetainedBlockExtents; + typedef boost::intrusive::set< + DetainedBlockExtent, + boost::intrusive::compare, + boost::intrusive::key_of_value > + BlockExtentToDetainedBlockExtents; + + CephContext *m_cct; + + ceph::mutex m_lock = ceph::make_mutex("librbd::BlockGuard::m_lock"); + DetainedBlockExtentsPool m_detained_block_extent_pool; + DetainedBlockExtents m_free_detained_block_extents; + BlockExtentToDetainedBlockExtents m_detained_block_extents; + +}; + +} // namespace librbd + +#undef dout_subsys +#undef dout_prefix +#define dout_prefix *_dout + +#endif // CEPH_LIBRBD_IO_BLOCK_GUARD_H diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt new file mode 100644 index 000000000..1d699b6f4 --- /dev/null +++ b/src/librbd/CMakeLists.txt @@ -0,0 +1,358 @@ +set(librbd_types_srcs + journal/Types.cc + mirroring_watcher/Types.cc + trash_watcher/Types.cc + watcher/Types.cc + WatchNotifyTypes.cc) + +if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE) + list(APPEND librbd_types_srcs cache/pwl/Types.cc) +endif() + +add_library(rbd_types STATIC + ${librbd_types_srcs}) + +if (WITH_RBD_RWL) + target_link_libraries(rbd_types + PUBLIC pmdk::pmemobj) +endif() + +set(librbd_internal_srcs + AsioEngine.cc + AsyncObjectThrottle.cc + AsyncRequest.cc + ConfigWatcher.cc + DeepCopyRequest.cc + ExclusiveLock.cc + ImageCtx.cc + ImageState.cc + ImageWatcher.cc + internal.cc + Journal.cc + LibrbdAdminSocketHook.cc + ManagedLock.cc + MirroringWatcher.cc + ObjectMap.cc + Operations.cc + PluginRegistry.cc + TrashWatcher.cc + Utils.cc + Watcher.cc + api/Config.cc + api/DiffIterate.cc + api/Group.cc + api/Image.cc + api/Io.cc + api/Migration.cc + api/Mirror.cc + api/Namespace.cc + api/Pool.cc + api/PoolMetadata.cc + api/Snapshot.cc + api/Trash.cc + api/Utils.cc + asio/ContextWQ.cc + cache/ImageWriteback.cc + cache/ObjectCacherObjectDispatch.cc + cache/ObjectCacherWriteback.cc + cache/WriteAroundObjectDispatch.cc + crypto/BlockCrypto.cc + crypto/CryptoContextPool.cc + crypto/CryptoImageDispatch.cc + crypto/CryptoObjectDispatch.cc + crypto/FormatRequest.cc + crypto/LoadRequest.cc + crypto/ShutDownCryptoRequest.cc + crypto/Utils.cc + crypto/openssl/DataCryptor.cc + deep_copy/ImageCopyRequest.cc + deep_copy/MetadataCopyRequest.cc + deep_copy/ObjectCopyRequest.cc + deep_copy/SetHeadRequest.cc + 
deep_copy/SnapshotCopyRequest.cc + deep_copy/SnapshotCreateRequest.cc + deep_copy/Utils.cc + exclusive_lock/AutomaticPolicy.cc + exclusive_lock/ImageDispatch.cc + exclusive_lock/PreAcquireRequest.cc + exclusive_lock/PostAcquireRequest.cc + exclusive_lock/PreReleaseRequest.cc + exclusive_lock/StandardPolicy.cc + image/AttachChildRequest.cc + image/AttachParentRequest.cc + image/CloneRequest.cc + image/CloseRequest.cc + image/CreateRequest.cc + image/DetachChildRequest.cc + image/DetachParentRequest.cc + image/GetMetadataRequest.cc + image/ListWatchersRequest.cc + image/OpenRequest.cc + image/PreRemoveRequest.cc + image/RefreshParentRequest.cc + image/RefreshRequest.cc + image/RemoveRequest.cc + image/SetFlagsRequest.cc + image/SetSnapRequest.cc + image/ValidatePoolRequest.cc + image_watcher/NotifyLockOwner.cc + io/AioCompletion.cc + io/AsyncOperation.cc + io/CopyupRequest.cc + io/FlushTracker.cc + io/ImageDispatch.cc + io/ImageDispatchSpec.cc + io/ImageDispatcher.cc + io/ImageRequest.cc + io/ObjectDispatch.cc + io/ObjectDispatchSpec.cc + io/ObjectDispatcher.cc + io/ObjectRequest.cc + io/QosImageDispatch.cc + io/QueueImageDispatch.cc + io/ReadResult.cc + io/RefreshImageDispatch.cc + io/SimpleSchedulerObjectDispatch.cc + io/Types.cc + io/Utils.cc + io/WriteBlockImageDispatch.cc + io/IoOperations.cc + journal/CreateRequest.cc + journal/DemoteRequest.cc + journal/ObjectDispatch.cc + journal/OpenRequest.cc + journal/PromoteRequest.cc + journal/RemoveRequest.cc + journal/Replay.cc + journal/ResetRequest.cc + journal/StandardPolicy.cc + journal/Utils.cc + managed_lock/AcquireRequest.cc + managed_lock/BreakRequest.cc + managed_lock/GetLockerRequest.cc + managed_lock/ReacquireRequest.cc + managed_lock/ReleaseRequest.cc + managed_lock/Utils.cc + migration/FileStream.cc + migration/HttpClient.cc + migration/HttpStream.cc + migration/ImageDispatch.cc + migration/NativeFormat.cc + migration/OpenSourceImageRequest.cc + migration/QCOWFormat.cc + migration/RawFormat.cc + migration/RawSnapshot.cc + migration/S3Stream.cc + migration/SourceSpecBuilder.cc + migration/Utils.cc + mirror/DemoteRequest.cc + mirror/DisableRequest.cc + mirror/EnableRequest.cc + mirror/GetInfoRequest.cc + mirror/GetStatusRequest.cc + mirror/GetUuidRequest.cc + mirror/ImageRemoveRequest.cc + mirror/ImageStateUpdateRequest.cc + mirror/PromoteRequest.cc + mirror/snapshot/CreateNonPrimaryRequest.cc + mirror/snapshot/CreatePrimaryRequest.cc + mirror/snapshot/DemoteRequest.cc + mirror/snapshot/GetImageStateRequest.cc + mirror/snapshot/ImageMeta.cc + mirror/snapshot/PromoteRequest.cc + mirror/snapshot/RemoveImageStateRequest.cc + mirror/snapshot/SetImageStateRequest.cc + mirror/snapshot/Types.cc + mirror/snapshot/UnlinkPeerRequest.cc + mirror/snapshot/Utils.cc + mirror/snapshot/WriteImageStateRequest.cc + object_map/CreateRequest.cc + object_map/DiffRequest.cc + object_map/InvalidateRequest.cc + object_map/LockRequest.cc + object_map/RefreshRequest.cc + object_map/RemoveRequest.cc + object_map/Request.cc + object_map/ResizeRequest.cc + object_map/SnapshotCreateRequest.cc + object_map/SnapshotRemoveRequest.cc + object_map/SnapshotRollbackRequest.cc + object_map/UnlockRequest.cc + object_map/UpdateRequest.cc + operation/DisableFeaturesRequest.cc + operation/EnableFeaturesRequest.cc + operation/FlattenRequest.cc + operation/MetadataRemoveRequest.cc + operation/MetadataSetRequest.cc + operation/MigrateRequest.cc + operation/ObjectMapIterate.cc + operation/RebuildObjectMapRequest.cc + operation/RenameRequest.cc + operation/Request.cc + 
operation/ResizeRequest.cc + operation/SnapshotCreateRequest.cc + operation/SnapshotProtectRequest.cc + operation/SnapshotRemoveRequest.cc + operation/SnapshotRenameRequest.cc + operation/SnapshotRollbackRequest.cc + operation/SnapshotUnprotectRequest.cc + operation/SnapshotLimitRequest.cc + operation/SparsifyRequest.cc + operation/TrimRequest.cc + plugin/Api.cc + trash/MoveRequest.cc + trash/RemoveRequest.cc + watcher/Notifier.cc + watcher/RewatchRequest.cc + ${CMAKE_SOURCE_DIR}/src/common/ContextCompletion.cc) + +if(WITH_EVENTTRACE) + list(APPEND librbd_internal_srcs ../common/EventTrace.cc) +endif() + +if(LINUX AND HAVE_LIBCRYPTSETUP) + list(APPEND librbd_internal_srcs + crypto/luks/LUKSEncryptionFormat.cc + crypto/luks/FormatRequest.cc + crypto/luks/FlattenRequest.cc + crypto/luks/Header.cc + crypto/luks/LoadRequest.cc + crypto/luks/Magic.cc) +endif() + +add_library(rbd_api STATIC librbd.cc) +add_library(rbd_internal STATIC + ${librbd_internal_srcs} + $) +if(WITH_LTTNG) + # librbd.cc includes tracing/librbd.h + add_dependencies(rbd_api librbd-tp) + # io/AioCompletion.cc includes tracing/librbd.h + add_dependencies(rbd_internal librbd-tp) +endif() +if(WITH_EVENTTRACE) + add_dependencies(rbd_internal eventtrace_tp) +endif() +target_link_libraries(rbd_internal PRIVATE + osdc rbd_types + OpenSSL::SSL) +target_include_directories(rbd_internal PRIVATE ${OPENSSL_INCLUDE_DIR}) +if(LINUX AND HAVE_LIBCRYPTSETUP) + target_include_directories(rbd_internal PRIVATE ${LIBCRYPTSETUP_INCLUDE_DIR}) + target_link_libraries(rbd_internal PRIVATE ${LIBCRYPTSETUP_LIBRARIES}) +endif() + +add_custom_target(librbd_plugins) +set(librbd_plugins_dir ${CEPH_INSTALL_PKGLIBDIR}/librbd) + +set(rbd_plugin_parent_cache_srcs + cache/ParentCacheObjectDispatch.cc + plugin/ParentCache.cc) +add_library(librbd_plugin_parent_cache SHARED + ${rbd_plugin_parent_cache_srcs}) +target_link_libraries(librbd_plugin_parent_cache PRIVATE + ceph_immutable_object_cache_lib ceph-common librbd + libneorados + librados) +set_target_properties(librbd_plugin_parent_cache PROPERTIES + OUTPUT_NAME ceph_librbd_parent_cache + VERSION 1.0.0 + SOVERSION 1) +install(TARGETS librbd_plugin_parent_cache DESTINATION ${librbd_plugins_dir}) +add_dependencies(librbd_plugins librbd_plugin_parent_cache) + +if(WITH_RBD_RWL OR WITH_RBD_SSD_CACHE) + set(rbd_plugin_pwl_srcs + cache/WriteLogImageDispatch.cc + cache/pwl/AbstractWriteLog.cc + cache/pwl/DiscardRequest.cc + cache/pwl/ImageCacheState.cc + cache/pwl/InitRequest.cc + cache/pwl/LogEntry.cc + cache/pwl/LogMap.cc + cache/pwl/LogOperation.cc + cache/pwl/Request.cc + cache/pwl/ShutdownRequest.cc + cache/pwl/SyncPoint.cc + cache/pwl/Types.cc + plugin/WriteLogImageCache.cc) + + if(WITH_RBD_SSD_CACHE) + set(rbd_plugin_pwl_srcs + ${rbd_plugin_pwl_srcs} + cache/pwl/ssd/LogEntry.cc + cache/pwl/ssd/LogOperation.cc + cache/pwl/ssd/ReadRequest.cc + cache/pwl/ssd/Request.cc + cache/pwl/ssd/WriteLog.cc) + endif() + if(WITH_RBD_RWL) + set(rbd_plugin_pwl_srcs + ${rbd_plugin_pwl_srcs} + cache/pwl/rwl/WriteLog.cc + cache/pwl/rwl/LogEntry.cc + cache/pwl/rwl/LogOperation.cc + cache/pwl/rwl/ReadRequest.cc + cache/pwl/rwl/Request.cc) + endif() + + add_library(librbd_plugin_pwl_cache SHARED + ${rbd_plugin_pwl_srcs}) + target_link_libraries(librbd_plugin_pwl_cache PRIVATE + blk + ceph-common + cls_rbd_client + libneorados + librados + StdFilesystem::filesystem) + + if(WITH_RBD_RWL) + target_link_libraries(librbd_plugin_pwl_cache + PUBLIC pmdk::pmemobj + PRIVATE pmdk::pmem) + endif() + + 
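+
+  # note: the pwl plugin links the generic cache/pwl sources together with
+  # whichever ssd / rwl (pmem) backend sources were selected above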
set_target_properties(librbd_plugin_pwl_cache PROPERTIES + OUTPUT_NAME ceph_librbd_pwl_cache + VERSION 1.0.0 + SOVERSION 1) + install(TARGETS librbd_plugin_pwl_cache DESTINATION ${librbd_plugins_dir}) + add_dependencies(librbd_plugins librbd_plugin_pwl_cache) +endif() + +add_library(librbd ${CEPH_SHARED} + librbd.cc) +if(WITH_LTTNG) + add_dependencies(librbd librbd-tp) +endif() + +target_link_libraries(librbd PRIVATE + rbd_internal + rbd_types + journal + cls_rbd_client + cls_lock_client + cls_journal_client + libneorados + librados + ceph-common + pthread + ${CMAKE_DL_LIBS} + ${EXTRALIBS} ${GSSAPI_LIBRARIES}) +if(HAVE_UDEV) + target_link_libraries(librbd PRIVATE + udev) +endif() +if(ENABLE_SHARED) + set_target_properties(librbd PROPERTIES + OUTPUT_NAME rbd + VERSION 1.18.0 + SOVERSION 1 + CXX_VISIBILITY_PRESET hidden + VISIBILITY_INLINES_HIDDEN ON) + if(NOT APPLE AND NOT WIN32) + set_property(TARGET librbd APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--exclude-libs,ALL") + endif() +endif(ENABLE_SHARED) +install(TARGETS librbd DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/src/librbd/ConfigWatcher.cc b/src/librbd/ConfigWatcher.cc new file mode 100644 index 000000000..0e4127804 --- /dev/null +++ b/src/librbd/ConfigWatcher.cc @@ -0,0 +1,116 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ConfigWatcher.h" +#include "common/config_obs.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/api/Config.h" +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ConfigWatcher: " \ + << __func__ << ": " + +namespace librbd { + +template +struct ConfigWatcher::Observer : public md_config_obs_t { + ConfigWatcher* m_config_watcher; + + std::deque m_config_key_strs; + mutable std::vector m_config_keys; + + Observer(CephContext* cct, ConfigWatcher* config_watcher) + : m_config_watcher(config_watcher) { + const std::string rbd_key_prefix("rbd_"); + auto& schema = cct->_conf.get_schema(); + for (auto& pair : schema) { + // watch all "rbd_" keys for simplicity + if (!boost::starts_with(pair.first, rbd_key_prefix)) { + continue; + } + + m_config_key_strs.emplace_back(pair.first); + } + + m_config_keys.reserve(m_config_key_strs.size()); + for (auto& key : m_config_key_strs) { + m_config_keys.emplace_back(key.c_str()); + } + m_config_keys.emplace_back(nullptr); + } + + const char** get_tracked_conf_keys() const override { + ceph_assert(!m_config_keys.empty()); + return &m_config_keys[0]; + } + + void handle_conf_change(const ConfigProxy& conf, + const std::set &changed) override { + m_config_watcher->handle_global_config_change(changed); + } +}; + +template +ConfigWatcher::ConfigWatcher(I& image_ctx) + : m_image_ctx(image_ctx) { +} + +template +ConfigWatcher::~ConfigWatcher() { + ceph_assert(m_observer == nullptr); +} + +template +void ConfigWatcher::init() { + auto cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + m_observer = new Observer(cct, this); + cct->_conf.add_observer(m_observer); +} + +template +void ConfigWatcher::shut_down() { + auto cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + ceph_assert(m_observer != nullptr); + cct->_conf.remove_observer(m_observer); + + delete m_observer; + m_observer = nullptr; +} + +template +void ConfigWatcher::handle_global_config_change( + std::set changed_keys) { + + { + // ignore any global changes that are being 
overridden + std::shared_lock image_locker{m_image_ctx.image_lock}; + for (auto& key : m_image_ctx.config_overrides) { + changed_keys.erase(key); + } + } + if (changed_keys.empty()) { + return; + } + + auto cct = m_image_ctx.cct; + ldout(cct, 10) << "changed_keys=" << changed_keys << dendl; + + // refresh the image to pick up any global config overrides + m_image_ctx.state->handle_update_notification(); +} + +} // namespace librbd + +template class librbd::ConfigWatcher; diff --git a/src/librbd/ConfigWatcher.h b/src/librbd/ConfigWatcher.h new file mode 100644 index 000000000..1f10c8cb8 --- /dev/null +++ b/src/librbd/ConfigWatcher.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CONFIG_WATCHER_H +#define CEPH_LIBRBD_CONFIG_WATCHER_H + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +template +class ConfigWatcher { +public: + static ConfigWatcher* create(ImageCtxT& image_ctx) { + return new ConfigWatcher(image_ctx); + } + + ConfigWatcher(ImageCtxT& image_ctx); + ~ConfigWatcher(); + + ConfigWatcher(const ConfigWatcher&) = delete; + ConfigWatcher& operator=(const ConfigWatcher&) = delete; + + void init(); + void shut_down(); + +private: + struct Observer; + + ImageCtxT& m_image_ctx; + + Observer* m_observer = nullptr; + + void handle_global_config_change(std::set changed); + +}; + +} // namespace librbd + +extern template class librbd::ConfigWatcher; + +#endif // CEPH_LIBRBD_CONFIG_WATCHER_H diff --git a/src/librbd/DeepCopyRequest.cc b/src/librbd/DeepCopyRequest.cc new file mode 100644 index 000000000..af26ef0c9 --- /dev/null +++ b/src/librbd/DeepCopyRequest.cc @@ -0,0 +1,361 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "DeepCopyRequest.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/deep_copy/ImageCopyRequest.h" +#include "librbd/deep_copy/MetadataCopyRequest.h" +#include "librbd/deep_copy/SnapshotCopyRequest.h" +#include "librbd/internal.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::DeepCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { + +using namespace librbd::deep_copy; + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; +using librbd::util::unique_lock_name; + +template +DeepCopyRequest::DeepCopyRequest(I *src_image_ctx, I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + const ObjectNumber &object_number, + asio::ContextWQ *work_queue, + SnapSeqs *snap_seqs, + deep_copy::Handler *handler, + Context *on_finish) + : RefCountedObject(dst_image_ctx->cct), m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start), + m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start), + m_flatten(flatten), m_object_number(object_number), + m_work_queue(work_queue), m_snap_seqs(snap_seqs), m_handler(handler), + m_on_finish(on_finish), m_cct(dst_image_ctx->cct), + m_lock(ceph::make_mutex(unique_lock_name("DeepCopyRequest::m_lock", this))) { +} + +template +DeepCopyRequest::~DeepCopyRequest() { + ceph_assert(m_snapshot_copy_request == nullptr); + ceph_assert(m_image_copy_request == nullptr); +} + +template +void 
DeepCopyRequest::send() { + if (!m_src_image_ctx->data_ctx.is_valid()) { + lderr(m_cct) << "missing data pool for source image" << dendl; + finish(-ENODEV); + return; + } + + if (!m_dst_image_ctx->data_ctx.is_valid()) { + lderr(m_cct) << "missing data pool for destination image" << dendl; + finish(-ENODEV); + return; + } + + int r = validate_copy_points(); + if (r < 0) { + finish(r); + return; + } + + send_copy_snapshots(); +} + +template +void DeepCopyRequest::cancel() { + std::lock_guard locker{m_lock}; + + ldout(m_cct, 20) << dendl; + + m_canceled = true; + + if (m_snapshot_copy_request != nullptr) { + m_snapshot_copy_request->cancel(); + } + + if (m_image_copy_request != nullptr) { + m_image_copy_request->cancel(); + } +} + +template +void DeepCopyRequest::send_copy_snapshots() { + m_lock.lock(); + if (m_canceled) { + m_lock.unlock(); + finish(-ECANCELED); + return; + } + + ldout(m_cct, 20) << dendl; + + Context *ctx = create_context_callback< + DeepCopyRequest, &DeepCopyRequest::handle_copy_snapshots>(this); + m_snapshot_copy_request = SnapshotCopyRequest::create( + m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_src_snap_id_end, + m_dst_snap_id_start, m_flatten, m_work_queue, m_snap_seqs, ctx); + m_snapshot_copy_request->get(); + m_lock.unlock(); + + m_snapshot_copy_request->send(); +} + +template +void DeepCopyRequest::handle_copy_snapshots(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + { + std::lock_guard locker{m_lock}; + m_snapshot_copy_request->put(); + m_snapshot_copy_request = nullptr; + if (r == 0 && m_canceled) { + r = -ECANCELED; + } + } + + if (r == -ECANCELED) { + ldout(m_cct, 10) << "snapshot copy canceled" << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to copy snapshot metadata: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_src_snap_id_end == CEPH_NOSNAP) { + (*m_snap_seqs)[CEPH_NOSNAP] = CEPH_NOSNAP; + } + + send_copy_image(); +} + +template +void DeepCopyRequest::send_copy_image() { + m_lock.lock(); + if (m_canceled) { + m_lock.unlock(); + finish(-ECANCELED); + return; + } + + ldout(m_cct, 20) << dendl; + + Context *ctx = create_context_callback< + DeepCopyRequest, &DeepCopyRequest::handle_copy_image>(this); + m_image_copy_request = ImageCopyRequest::create( + m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_src_snap_id_end, + m_dst_snap_id_start, m_flatten, m_object_number, *m_snap_seqs, m_handler, + ctx); + m_image_copy_request->get(); + m_lock.unlock(); + + m_image_copy_request->send(); +} + +template +void DeepCopyRequest::handle_copy_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + { + std::lock_guard locker{m_lock}; + m_image_copy_request->put(); + m_image_copy_request = nullptr; + if (r == 0 && m_canceled) { + r = -ECANCELED; + } + } + + if (r == -ECANCELED) { + ldout(m_cct, 10) << "image copy canceled" << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to copy image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_copy_object_map(); +} + +template +void DeepCopyRequest::send_copy_object_map() { + m_dst_image_ctx->owner_lock.lock_shared(); + m_dst_image_ctx->image_lock.lock_shared(); + + if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP, + m_dst_image_ctx->image_lock)) { + m_dst_image_ctx->image_lock.unlock_shared(); + m_dst_image_ctx->owner_lock.unlock_shared(); + send_copy_metadata(); + return; + } + if (m_src_snap_id_end == CEPH_NOSNAP) { + m_dst_image_ctx->image_lock.unlock_shared(); + 
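+    // copying through HEAD: there is no snapshot object map to roll back,
+    // so skip straight to (re)opening the HEAD object map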
m_dst_image_ctx->owner_lock.unlock_shared(); + send_refresh_object_map(); + return; + } + + ceph_assert(m_dst_image_ctx->object_map != nullptr); + + ldout(m_cct, 20) << dendl; + + Context *finish_op_ctx = nullptr; + int r; + if (m_dst_image_ctx->exclusive_lock != nullptr) { + finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r); + } + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + m_dst_image_ctx->image_lock.unlock_shared(); + m_dst_image_ctx->owner_lock.unlock_shared(); + finish(r); + return; + } + + // rollback the object map (copy snapshot object map to HEAD) + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_copy_object_map(r); + finish_op_ctx->complete(0); + }); + ceph_assert(m_snap_seqs->count(m_src_snap_id_end) > 0); + librados::snap_t copy_snap_id = (*m_snap_seqs)[m_src_snap_id_end]; + m_dst_image_ctx->object_map->rollback(copy_snap_id, ctx); + m_dst_image_ctx->image_lock.unlock_shared(); + m_dst_image_ctx->owner_lock.unlock_shared(); +} + +template +void DeepCopyRequest::handle_copy_object_map(int r) { + ldout(m_cct, 20) << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to roll back object map: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + send_refresh_object_map(); +} + +template +void DeepCopyRequest::send_refresh_object_map() { + int r; + Context *finish_op_ctx = nullptr; + { + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + if (m_dst_image_ctx->exclusive_lock != nullptr) { + finish_op_ctx = m_dst_image_ctx->exclusive_lock->start_op(&r); + } + } + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + ldout(m_cct, 20) << dendl; + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_refresh_object_map(r); + finish_op_ctx->complete(0); + }); + m_object_map = m_dst_image_ctx->create_object_map(CEPH_NOSNAP); + m_object_map->open(ctx); +} + +template +void DeepCopyRequest::handle_refresh_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to open object map: " << cpp_strerror(r) + << dendl; + delete m_object_map; + + finish(r); + return; + } + + { + std::unique_lock image_locker{m_dst_image_ctx->image_lock}; + std::swap(m_dst_image_ctx->object_map, m_object_map); + } + m_object_map->put(); + + send_copy_metadata(); +} + +template +void DeepCopyRequest::send_copy_metadata() { + ldout(m_cct, 20) << dendl; + + Context *ctx = create_context_callback< + DeepCopyRequest, &DeepCopyRequest::handle_copy_metadata>(this); + auto request = MetadataCopyRequest::create(m_src_image_ctx, + m_dst_image_ctx, ctx); + request->send(); +} + +template +void DeepCopyRequest::handle_copy_metadata(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template +int DeepCopyRequest::validate_copy_points() { + std::shared_lock image_locker{m_src_image_ctx->image_lock}; + + if (m_src_snap_id_start != 0 && + m_src_image_ctx->snap_info.find(m_src_snap_id_start) == + m_src_image_ctx->snap_info.end()) { + lderr(m_cct) << "invalid start snap_id " << m_src_snap_id_start << dendl; + return -EINVAL; + } + + if (m_src_snap_id_end != CEPH_NOSNAP && + m_src_image_ctx->snap_info.find(m_src_snap_id_end) == + m_src_image_ctx->snap_info.end()) { + lderr(m_cct) << "invalid end snap_id " << m_src_snap_id_end << dendl; + return -EINVAL; + } + + return 0; +} + +template 
+void DeepCopyRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + put(); +} + +} // namespace librbd + +template class librbd::DeepCopyRequest; diff --git a/src/librbd/DeepCopyRequest.h b/src/librbd/DeepCopyRequest.h new file mode 100644 index 000000000..c8bd02299 --- /dev/null +++ b/src/librbd/DeepCopyRequest.h @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_REQUEST_H + +#include "common/ceph_mutex.h" +#include "common/RefCountedObj.h" +#include "include/int_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include "librbd/deep_copy/Types.h" + +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; +namespace asio { struct ContextWQ; } + +namespace deep_copy { + +template class ImageCopyRequest; +template class SnapshotCopyRequest; +struct Handler; + +} + +template +class DeepCopyRequest : public RefCountedObject { +public: + static DeepCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + const deep_copy::ObjectNumber &object_number, + asio::ContextWQ *work_queue, + SnapSeqs *snap_seqs, + deep_copy::Handler *handler, + Context *on_finish) { + return new DeepCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start, + src_snap_id_end, dst_snap_id_start, flatten, + object_number, work_queue, snap_seqs, handler, + on_finish); + } + + DeepCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, const deep_copy::ObjectNumber &object_number, + asio::ContextWQ *work_queue, SnapSeqs *snap_seqs, + deep_copy::Handler *handler, Context *on_finish); + ~DeepCopyRequest(); + + void send(); + void cancel(); + +private: + /** + * @verbatim + * + * + * | + * v + * COPY_SNAPSHOTS + * | + * v + * COPY_IMAGE . . . . . . . . . . . . . . + * | . + * v . + * COPY_OBJECT_MAP (skip if object . + * | map disabled) . + * v . + * REFRESH_OBJECT_MAP (skip if object . (image copy canceled) + * | map disabled) . + * v . + * COPY_METADATA . + * | . + * v . + * < . . . . . . . . . . . . . . 
+ * + * @endverbatim + */ + + typedef std::vector SnapIds; + typedef std::map SnapMap; + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + librados::snap_t m_src_snap_id_start; + librados::snap_t m_src_snap_id_end; + librados::snap_t m_dst_snap_id_start; + bool m_flatten; + deep_copy::ObjectNumber m_object_number; + asio::ContextWQ *m_work_queue; + SnapSeqs *m_snap_seqs; + deep_copy::Handler *m_handler; + Context *m_on_finish; + + CephContext *m_cct; + ceph::mutex m_lock; + bool m_canceled = false; + + deep_copy::SnapshotCopyRequest *m_snapshot_copy_request = nullptr; + deep_copy::ImageCopyRequest *m_image_copy_request = nullptr; + decltype(ImageCtxT::object_map) m_object_map = nullptr; + + void send_copy_snapshots(); + void handle_copy_snapshots(int r); + + void send_copy_image(); + void handle_copy_image(int r); + + void send_copy_object_map(); + void handle_copy_object_map(int r); + + void send_refresh_object_map(); + void handle_refresh_object_map(int r); + + void send_copy_metadata(); + void handle_copy_metadata(int r); + + int validate_copy_points(); + + void finish(int r); +}; + +} // namespace librbd + +extern template class librbd::DeepCopyRequest; + +#endif // CEPH_LIBRBD_DEEP_COPY_REQUEST_H diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc new file mode 100644 index 000000000..bc148b1f5 --- /dev/null +++ b/src/librbd/ExclusiveLock.cc @@ -0,0 +1,388 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/Utils.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ImageState.h" +#include "librbd/exclusive_lock/ImageDispatch.h" +#include "librbd/exclusive_lock/PreAcquireRequest.h" +#include "librbd/exclusive_lock/PostAcquireRequest.h" +#include "librbd/exclusive_lock/PreReleaseRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "common/ceph_mutex.h" +#include "common/dout.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ExclusiveLock: " << this << " " \ + << __func__ + +namespace librbd { + +using namespace exclusive_lock; +using librbd::util::create_context_callback; + +template +using ML = ManagedLock; + +template +ExclusiveLock::ExclusiveLock(I &image_ctx) + : RefCountedObject(image_ctx.cct), + ML(image_ctx.md_ctx, *image_ctx.asio_engine, image_ctx.header_oid, + image_ctx.image_watcher, managed_lock::EXCLUSIVE, + image_ctx.config.template get_val("rbd_blocklist_on_break_lock"), + image_ctx.config.template get_val("rbd_blocklist_expire_seconds")), + m_image_ctx(image_ctx) { + std::lock_guard locker{ML::m_lock}; + ML::set_state_uninitialized(); +} + +template +bool ExclusiveLock::accept_request(OperationRequestType request_type, + int *ret_val) const { + std::lock_guard locker{ML::m_lock}; + + bool accept_request = + (!ML::is_state_shutdown() && ML::is_state_locked() && + (m_request_blocked_count == 0 || + m_image_ctx.get_exclusive_lock_policy()->accept_blocked_request( + request_type))); + if (ret_val != nullptr) { + *ret_val = accept_request ? 
0 : m_request_blocked_ret_val; + } + + ldout(m_image_ctx.cct, 20) << "=" << accept_request << " (request_type=" + << request_type << ")" << dendl; + return accept_request; +} + +template +bool ExclusiveLock::accept_ops() const { + std::lock_guard locker{ML::m_lock}; + bool accept = accept_ops(ML::m_lock); + ldout(m_image_ctx.cct, 20) << "=" << accept << dendl; + return accept; +} + +template +bool ExclusiveLock::accept_ops(const ceph::mutex &lock) const { + return (!ML::is_state_shutdown() && + (ML::is_state_locked() || ML::is_state_post_acquiring())); +} + +template +void ExclusiveLock::set_require_lock(bool init_shutdown, + io::Direction direction, + Context* on_finish) { + m_image_dispatch->set_require_lock(init_shutdown, direction, on_finish); +} + +template +void ExclusiveLock::unset_require_lock(io::Direction direction) { + m_image_dispatch->unset_require_lock(direction); +} + +template +void ExclusiveLock::block_requests(int r) { + std::lock_guard locker{ML::m_lock}; + + m_request_blocked_count++; + if (m_request_blocked_ret_val == 0) { + m_request_blocked_ret_val = r; + } + + ldout(m_image_ctx.cct, 20) << ": r=" << r << dendl; +} + +template +void ExclusiveLock::unblock_requests() { + std::lock_guard locker{ML::m_lock}; + + ceph_assert(m_request_blocked_count > 0); + m_request_blocked_count--; + if (m_request_blocked_count == 0) { + m_request_blocked_ret_val = 0; + } + + ldout(m_image_ctx.cct, 20) << dendl; +} + +template +int ExclusiveLock::get_unlocked_op_error() const { + if (m_image_ctx.image_watcher->is_blocklisted()) { + return -EBLOCKLISTED; + } + return -EROFS; +} + +template +void ExclusiveLock::init(uint64_t features, Context *on_init) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + on_init = create_context_callback(on_init, this); + + ldout(m_image_ctx.cct, 10) << ": features=" << features << dendl; + + { + std::lock_guard locker{ML::m_lock}; + ML::set_state_initializing(); + } + + m_image_dispatch = exclusive_lock::ImageDispatch::create(&m_image_ctx); + m_image_ctx.io_image_dispatcher->register_dispatch(m_image_dispatch); + + on_init = new LambdaContext([this, on_init](int r) { + { + std::lock_guard locker{ML::m_lock}; + ML::set_state_unlocked(); + } + + on_init->complete(r); + }); + + bool pwl_enabled = cache::util::is_pwl_enabled(m_image_ctx); + if (m_image_ctx.clone_copy_on_read || + (features & RBD_FEATURE_JOURNALING) != 0 || + pwl_enabled) { + m_image_dispatch->set_require_lock(true, io::DIRECTION_BOTH, on_init); + } else { + m_image_dispatch->set_require_lock(true, io::DIRECTION_WRITE, on_init); + } +} + +template +void ExclusiveLock::shut_down(Context *on_shut_down) { + ldout(m_image_ctx.cct, 10) << dendl; + + auto ref = ceph::ref_t>(this); + on_shut_down = create_context_callback(on_shut_down, this); + + ML::shut_down(on_shut_down); + + // if stalled in request state machine -- abort + handle_peer_notification(0); +} + +template +void ExclusiveLock::handle_peer_notification(int r) { + std::lock_guard locker{ML::m_lock}; + if (!ML::is_state_waiting_for_lock()) { + return; + } + + ldout(m_image_ctx.cct, 10) << dendl; + ceph_assert(ML::is_action_acquire_lock()); + + m_acquire_lock_peer_ret_val = r; + ML::execute_next_action(); +} + +template +Context *ExclusiveLock::start_op(int* ret_val) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + std::lock_guard locker{ML::m_lock}; + + if (!accept_ops(ML::m_lock)) { + *ret_val = get_unlocked_op_error(); + return nullptr; + } + + m_async_op_tracker.start_op(); + return new 
LambdaContext([this](int r) { + m_async_op_tracker.finish_op(); + }); +} + +template +void ExclusiveLock::shutdown_handler(int r, Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + + { + std::unique_lock owner_locker{m_image_ctx.owner_lock}; + m_image_ctx.exclusive_lock = nullptr; + } + + on_finish = new LambdaContext([this, on_finish](int r) { + m_image_dispatch = nullptr; + m_image_ctx.image_watcher->flush(on_finish); + }); + m_image_ctx.io_image_dispatcher->shut_down_dispatch( + m_image_dispatch->get_dispatch_layer(), on_finish); +} + +template +void ExclusiveLock::pre_acquire_lock_handler(Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + + int acquire_lock_peer_ret_val = 0; + { + std::lock_guard locker{ML::m_lock}; + std::swap(acquire_lock_peer_ret_val, m_acquire_lock_peer_ret_val); + } + + if (acquire_lock_peer_ret_val == -EROFS) { + ldout(m_image_ctx.cct, 10) << ": peer nacked lock request" << dendl; + on_finish->complete(acquire_lock_peer_ret_val); + return; + } + + PreAcquireRequest *req = PreAcquireRequest::create(m_image_ctx, + on_finish); + m_image_ctx.op_work_queue->queue(new LambdaContext([req](int r) { + req->send(); + })); +} + +template +void ExclusiveLock::post_acquire_lock_handler(int r, Context *on_finish) { + ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl; + + if (r == -EROFS) { + // peer refused to release the exclusive lock + on_finish->complete(r); + return; + } else if (r < 0) { + ML::m_lock.lock(); + ceph_assert(ML::is_state_acquiring()); + + // PostAcquire state machine will not run, so we need complete prepare + m_image_ctx.state->handle_prepare_lock_complete(); + + // if lock is in-use by another client, request the lock + if (ML::is_action_acquire_lock() && (r == -EBUSY || r == -EAGAIN)) { + ML::set_state_waiting_for_lock(); + ML::m_lock.unlock(); + + // request the lock from a peer + m_image_ctx.image_watcher->notify_request_lock(); + + // inform manage lock that we have interrupted the state machine + r = -ECANCELED; + } else { + ML::m_lock.unlock(); + + // clear error if peer owns lock + if (r == -EAGAIN) { + r = 0; + } + } + + on_finish->complete(r); + return; + } + + std::lock_guard locker{ML::m_lock}; + m_pre_post_callback = on_finish; + using EL = ExclusiveLock; + PostAcquireRequest *req = PostAcquireRequest::create(m_image_ctx, + util::create_context_callback(this), + util::create_context_callback(this)); + + m_image_ctx.op_work_queue->queue(new LambdaContext([req](int r) { + req->send(); + })); +} + +template +void ExclusiveLock::handle_post_acquiring_lock(int r) { + ldout(m_image_ctx.cct, 10) << dendl; + + std::lock_guard locker{ML::m_lock}; + + ceph_assert(r == 0); + + // lock is owned at this point + ML::set_state_post_acquiring(); +} + +template +void ExclusiveLock::handle_post_acquired_lock(int r) { + ldout(m_image_ctx.cct, 10) << ": r=" << r << dendl; + + Context *on_finish = nullptr; + { + std::lock_guard locker{ML::m_lock}; + ceph_assert(ML::is_state_acquiring() || + ML::is_state_post_acquiring()); + + assert (m_pre_post_callback != nullptr); + std::swap(m_pre_post_callback, on_finish); + } + + if (r < 0) { + on_finish->complete(r); + return; + } + + m_image_ctx.perfcounter->tset(l_librbd_lock_acquired_time, + ceph_clock_now()); + m_image_ctx.image_watcher->notify_acquired_lock(); + m_image_dispatch->unset_require_lock(io::DIRECTION_BOTH); + + on_finish->complete(0); +} + +template +void ExclusiveLock::pre_release_lock_handler(bool shutting_down, + Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + 
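+  // serialize with the ManagedLock state machine while the pre-release
+  // request is created and queued on the work queue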
std::lock_guard locker{ML::m_lock}; + + auto req = PreReleaseRequest::create( + m_image_ctx, m_image_dispatch, shutting_down, m_async_op_tracker, + on_finish); + m_image_ctx.op_work_queue->queue(new LambdaContext([req](int r) { + req->send(); + })); +} + +template +void ExclusiveLock::post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) { + ldout(m_image_ctx.cct, 10) << ": r=" << r << " shutting_down=" + << shutting_down << dendl; + if (!shutting_down) { + { + std::lock_guard locker{ML::m_lock}; + ceph_assert(ML::is_state_pre_releasing() || + ML::is_state_releasing()); + } + + if (r >= 0) { + m_image_ctx.image_watcher->notify_released_lock(); + } + + on_finish->complete(r); + } else { + { + std::unique_lock owner_locker{m_image_ctx.owner_lock}; + m_image_ctx.exclusive_lock = nullptr; + } + + on_finish = new LambdaContext([this, r, on_finish](int) { + m_image_dispatch = nullptr; + m_image_ctx.image_watcher->notify_released_lock(); + on_finish->complete(r); + }); + m_image_ctx.io_image_dispatcher->shut_down_dispatch( + m_image_dispatch->get_dispatch_layer(), on_finish); + } +} + +template +void ExclusiveLock::post_reacquire_lock_handler(int r, Context *on_finish) { + ldout(m_image_ctx.cct, 10) << dendl; + if (r >= 0) { + m_image_ctx.image_watcher->notify_acquired_lock(); + } + + on_finish->complete(r); +} + +} // namespace librbd + +template class librbd::ExclusiveLock; diff --git a/src/librbd/ExclusiveLock.h b/src/librbd/ExclusiveLock.h new file mode 100644 index 000000000..9915262f9 --- /dev/null +++ b/src/librbd/ExclusiveLock.h @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_H + +#include "common/AsyncOpTracker.h" +#include "librbd/ManagedLock.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/io/Types.h" +#include "common/RefCountedObj.h" + +struct Context; + +namespace librbd { + +namespace exclusive_lock { template struct ImageDispatch; } + +template +class ExclusiveLock : public RefCountedObject, + public ManagedLock { +public: + static ExclusiveLock *create(ImageCtxT &image_ctx) { + return new ExclusiveLock(image_ctx); + } + + ExclusiveLock(ImageCtxT &image_ctx); + + bool accept_request(exclusive_lock::OperationRequestType request_type, + int *ret_val) const; + bool accept_ops() const; + + void set_require_lock(bool init_shutdown, io::Direction direction, + Context* on_finish); + void unset_require_lock(io::Direction direction); + + void block_requests(int r); + void unblock_requests(); + + void init(uint64_t features, Context *on_init); + void shut_down(Context *on_shutdown); + + void handle_peer_notification(int r); + + int get_unlocked_op_error() const; + Context *start_op(int* ret_val); + +protected: + void shutdown_handler(int r, Context *on_finish) override; + void pre_acquire_lock_handler(Context *on_finish) override; + void post_acquire_lock_handler(int r, Context *on_finish) override; + void pre_release_lock_handler(bool shutting_down, + Context *on_finish) override; + void post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) override; + void post_reacquire_lock_handler(int r, Context *on_finish) override; + +private: + + /** + * @verbatim + * + * * * > WAITING_FOR_REGISTER --------\ + * | * (watch not registered) | + * | * | + * | * * > WAITING_FOR_PEER ------------\ + * | * (request_lock busy) | + * | * | + * | * * * * * * * * * * * * * * | + * | * | + * v (init) 
(try_lock/request_lock) * | + * UNINITIALIZED -------> UNLOCKED ------------------------> ACQUIRING <--/ + * ^ | + * | v + * RELEASING POST_ACQUIRING + * | | + * | | + * | (release_lock) v + * PRE_RELEASING <------------------------ LOCKED + * + * + * | + * v + * REACQUIRING -------------------------------------> + * . ^ + * . | + * . . . > ---> ---/ + * + * + * | + * | + * v + * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + exclusive_lock::ImageDispatch* m_image_dispatch = nullptr; + Context *m_pre_post_callback = nullptr; + + AsyncOpTracker m_async_op_tracker; + + uint32_t m_request_blocked_count = 0; + int m_request_blocked_ret_val = 0; + + int m_acquire_lock_peer_ret_val = 0; + + bool accept_ops(const ceph::mutex &lock) const; + + void handle_post_acquiring_lock(int r); + void handle_post_acquired_lock(int r); +}; + +} // namespace librbd + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_H diff --git a/src/librbd/Features.cc b/src/librbd/Features.cc new file mode 100644 index 000000000..9da5b1dc4 --- /dev/null +++ b/src/librbd/Features.cc @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "librbd/Features.h" +#include "include/rbd/features.h" + +#include +#include + +static const std::map RBD_FEATURE_MAP = { + {RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING}, + {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2}, + {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK}, + {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP}, + {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF}, + {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN}, + {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING}, + {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL}, + {RBD_FEATURE_NAME_OPERATIONS, RBD_FEATURE_OPERATIONS}, + {RBD_FEATURE_NAME_MIGRATING, RBD_FEATURE_MIGRATING}, + {RBD_FEATURE_NAME_NON_PRIMARY, RBD_FEATURE_NON_PRIMARY}, + {RBD_FEATURE_NAME_DIRTY_CACHE, RBD_FEATURE_DIRTY_CACHE}, +}; +static_assert((RBD_FEATURE_DIRTY_CACHE << 1) > RBD_FEATURES_ALL, + "new RBD feature added"); + + +namespace librbd { + +std::string rbd_features_to_string(uint64_t features, + std::ostream *err) +{ + std::string r; + for (auto& i : RBD_FEATURE_MAP) { + if (features & i.second) { + if (!r.empty()) { + r += ","; + } + r += i.first; + features &= ~i.second; + } + } + if (err && features) { + *err << "ignoring unknown feature mask 0x" + << std::hex << features << std::dec; + } + return r; +} + +uint64_t rbd_features_from_string(const std::string& orig_value, + std::ostream *err) +{ + uint64_t features = 0; + std::string value = orig_value; + boost::trim(value); + + // empty string means default features + if (!value.size()) { + return RBD_FEATURES_DEFAULT; + } + + try { + // numeric? 
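+    // e.g. "61" (the default feature mask) is parsed here; a name list
+    // such as "layering,exclusive-lock" falls through to the catch below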
+ features = boost::lexical_cast(value); + + // drop unrecognized bits + uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL); + if (unsupported_features != 0ull) { + features &= RBD_FEATURES_ALL; + if (err) { + *err << "ignoring unknown feature mask 0x" + << std::hex << unsupported_features << std::dec; + } + } + + uint64_t ignore_features_mask = ( + RBD_FEATURES_INTERNAL | RBD_FEATURES_MUTABLE_INTERNAL); + uint64_t ignored_features = (features & ignore_features_mask); + if (ignored_features != 0ULL) { + features &= ~ignore_features_mask; + if (err) { + *err << "ignoring feature mask 0x" << std::hex << ignored_features; + } + } + } catch (boost::bad_lexical_cast&) { + // feature name list? + bool errors = false; + std::vector feature_names; + boost::split(feature_names, value, boost::is_any_of(",")); + for (auto feature_name: feature_names) { + boost::trim(feature_name); + auto feature_it = RBD_FEATURE_MAP.find(feature_name); + if (feature_it != RBD_FEATURE_MAP.end()) { + features += feature_it->second; + } else if (err) { + if (errors) { + *err << ", "; + } else { + errors = true; + } + *err << "ignoring unknown feature " << feature_name; + } + } + } + return features; +} + +} // namespace librbd diff --git a/src/librbd/Features.h b/src/librbd/Features.h new file mode 100644 index 000000000..6a88827cf --- /dev/null +++ b/src/librbd/Features.h @@ -0,0 +1,16 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +namespace librbd { + + std::string rbd_features_to_string(uint64_t features, + std::ostream *err); + uint64_t rbd_features_from_string(const std::string& value, + std::ostream *err); + +} // librbd diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc new file mode 100644 index 000000000..01b7847c0 --- /dev/null +++ b/src/librbd/ImageCtx.cc @@ -0,0 +1,1029 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include +#include +#include + +#include "include/neorados/RADOS.hpp" + +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/Timer.h" + +#include "librbd/AsioEngine.h" +#include "librbd/AsyncRequest.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/internal.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Journal.h" +#include "librbd/LibrbdAdminSocketHook.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/PluginRegistry.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/exclusive_lock/AutomaticPolicy.h" +#include "librbd/exclusive_lock/StandardPolicy.h" +#include "librbd/crypto/EncryptionFormat.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ImageDispatcher.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/QosImageDispatch.h" +#include "librbd/io/IoOperations.h" +#include "librbd/io/Utils.h" +#include "librbd/journal/StandardPolicy.h" +#include "librbd/operation/ResizeRequest.h" + +#include "osdc/Striper.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ImageCtx: " + +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; + +namespace 
librbd { + +namespace { + +class SafeTimerSingleton : public CommonSafeTimer { +public: + ceph::mutex lock = ceph::make_mutex("librbd::SafeTimerSingleton::lock"); + + explicit SafeTimerSingleton(CephContext *cct) + : SafeTimer(cct, lock, true) { + init(); + } + ~SafeTimerSingleton() { + std::lock_guard locker{lock}; + shutdown(); + } +}; + +librados::IoCtx duplicate_io_ctx(librados::IoCtx& io_ctx) { + librados::IoCtx dup_io_ctx; + dup_io_ctx.dup(io_ctx); + return dup_io_ctx; +} + +} // anonymous namespace + + const string ImageCtx::METADATA_CONF_PREFIX = "conf_"; + + ImageCtx::ImageCtx(const string &image_name, const string &image_id, + const char *snap, IoCtx& p, bool ro) + : cct((CephContext*)p.cct()), + config(cct->_conf), + perfcounter(NULL), + snap_id(CEPH_NOSNAP), + snap_exists(true), + read_only(ro), + read_only_flags(ro ? IMAGE_READ_ONLY_FLAG_USER : 0U), + exclusive_locked(false), + name(image_name), + asio_engine(std::make_shared(p)), + rados_api(asio_engine->get_rados_api()), + data_ctx(duplicate_io_ctx(p)), + md_ctx(duplicate_io_ctx(p)), + image_watcher(NULL), + journal(NULL), + owner_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ImageCtx::owner_lock", this))), + image_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ImageCtx::image_lock", this))), + timestamp_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ImageCtx::timestamp_lock", this))), + async_ops_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageCtx::async_ops_lock", this))), + copyup_list_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageCtx::copyup_list_lock", this))), + extra_read_flags(0), + old_format(false), + order(0), size(0), features(0), + format_string(NULL), + id(image_id), parent(NULL), + stripe_unit(0), stripe_count(0), flags(0), + readahead(), + total_bytes_read(0), + state(new ImageState<>(this)), + operations(new Operations<>(*this)), + exclusive_lock(nullptr), object_map(nullptr), + op_work_queue(asio_engine->get_work_queue()), + plugin_registry(new PluginRegistry(this)), + event_socket_completions(32), + asok_hook(nullptr), + trace_endpoint("librbd") + { + ldout(cct, 10) << this << " " << __func__ << ": " + << "image_name=" << image_name << ", " + << "image_id=" << image_id << dendl; + + if (snap) + snap_name = snap; + + rebuild_data_io_context(); + + // FIPS zeroization audit 20191117: this memset is not security related. 
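+    // (for v1 images the header itself is populated later, when the
+    // image is opened/refreshed)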
+ memset(&header, 0, sizeof(header)); + + io_image_dispatcher = new io::ImageDispatcher(this); + io_object_dispatcher = new io::ObjectDispatcher(this); + + if (cct->_conf.get_val("rbd_auto_exclusive_lock_until_manual_request")) { + exclusive_lock_policy = new exclusive_lock::AutomaticPolicy(this); + } else { + exclusive_lock_policy = new exclusive_lock::StandardPolicy(this); + } + journal_policy = new journal::StandardPolicy(this); + } + + ImageCtx::ImageCtx(const string &image_name, const string &image_id, + uint64_t snap_id, IoCtx& p, bool ro) + : ImageCtx(image_name, image_id, "", p, ro) { + open_snap_id = snap_id; + } + + ImageCtx::~ImageCtx() { + ldout(cct, 10) << this << " " << __func__ << dendl; + + ceph_assert(config_watcher == nullptr); + ceph_assert(image_watcher == NULL); + ceph_assert(exclusive_lock == NULL); + ceph_assert(object_map == NULL); + ceph_assert(journal == NULL); + ceph_assert(asok_hook == NULL); + + if (perfcounter) { + perf_stop(); + } + delete[] format_string; + + md_ctx.aio_flush(); + if (data_ctx.is_valid()) { + data_ctx.aio_flush(); + } + + delete io_object_dispatcher; + delete io_image_dispatcher; + + delete journal_policy; + delete exclusive_lock_policy; + delete operations; + delete state; + + delete plugin_registry; + } + + void ImageCtx::init() { + ceph_assert(!header_oid.empty()); + ceph_assert(old_format || !id.empty()); + + asok_hook = new LibrbdAdminSocketHook(this); + + string pname = string("librbd-") + id + string("-") + + md_ctx.get_pool_name() + string("-") + name; + if (!snap_name.empty()) { + pname += "-"; + pname += snap_name; + } + + trace_endpoint.copy_name(pname); + perf_start(pname); + + ceph_assert(image_watcher == NULL); + image_watcher = new ImageWatcher<>(*this); + } + + void ImageCtx::shutdown() { + delete image_watcher; + image_watcher = nullptr; + + delete asok_hook; + asok_hook = nullptr; + } + + void ImageCtx::init_layout(int64_t pool_id) + { + if (stripe_unit == 0 || stripe_count == 0) { + stripe_unit = 1ull << order; + stripe_count = 1; + } + + vector alignments; + alignments.push_back(stripe_count << order); // object set (in file striping terminology) + alignments.push_back(stripe_unit * stripe_count); // stripe + alignments.push_back(stripe_unit); // stripe unit + readahead.set_alignments(alignments); + + layout = file_layout_t(); + layout.stripe_unit = stripe_unit; + layout.stripe_count = stripe_count; + layout.object_size = 1ull << order; + layout.pool_id = pool_id; // FIXME: pool id overflow? 
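+
+    // rebuild the object name format string: old (v1) images use a
+    // 12-hex-digit object suffix, v2 images use 16 (see snprintf below)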
+ + delete[] format_string; + size_t len = object_prefix.length() + 16; + format_string = new char[len]; + if (old_format) { + snprintf(format_string, len, "%s.%%012llx", object_prefix.c_str()); + } else { + snprintf(format_string, len, "%s.%%016llx", object_prefix.c_str()); + } + + ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit + << " stripe_count " << stripe_count + << " object_size " << layout.object_size + << " prefix " << object_prefix + << " format " << format_string + << dendl; + } + + void ImageCtx::perf_start(string name) { + auto perf_prio = PerfCountersBuilder::PRIO_DEBUGONLY; + if (child == nullptr) { + // ensure top-level IO stats are exported for librbd daemons + perf_prio = PerfCountersBuilder::PRIO_USEFUL; + } + + PerfCountersBuilder plb(cct, name, l_librbd_first, l_librbd_last); + + plb.add_u64_counter(l_librbd_rd, "rd", "Reads", "r", perf_prio); + plb.add_u64_counter(l_librbd_rd_bytes, "rd_bytes", "Data size in reads", + "rb", perf_prio, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_rd_latency, "rd_latency", "Latency of reads", + "rl", perf_prio); + plb.add_u64_counter(l_librbd_wr, "wr", "Writes", "w", perf_prio); + plb.add_u64_counter(l_librbd_wr_bytes, "wr_bytes", "Written data", + "wb", perf_prio, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_wr_latency, "wr_latency", "Write latency", + "wl", perf_prio); + plb.add_u64_counter(l_librbd_discard, "discard", "Discards"); + plb.add_u64_counter(l_librbd_discard_bytes, "discard_bytes", "Discarded data", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_discard_latency, "discard_latency", "Discard latency"); + plb.add_u64_counter(l_librbd_flush, "flush", "Flushes"); + plb.add_time_avg(l_librbd_flush_latency, "flush_latency", "Latency of flushes"); + plb.add_u64_counter(l_librbd_ws, "ws", "WriteSames"); + plb.add_u64_counter(l_librbd_ws_bytes, "ws_bytes", "WriteSame data", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_ws_latency, "ws_latency", "WriteSame latency"); + plb.add_u64_counter(l_librbd_cmp, "cmp", "CompareAndWrites"); + plb.add_u64_counter(l_librbd_cmp_bytes, "cmp_bytes", "Data size in cmps", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_time_avg(l_librbd_cmp_latency, "cmp_latency", "Latency of cmps"); + plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations"); + plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals"); + plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks"); + plb.add_u64_counter(l_librbd_snap_rename, "snap_rename", "Snap rename"); + plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications"); + plb.add_u64_counter(l_librbd_resize, "resize", "Resizes"); + plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead"); + plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_librbd_invalidate_cache, "invalidate_cache", "Cache invalidates"); + + plb.add_time(l_librbd_opened_time, "opened_time", "Opened time", + "ots", perf_prio); + plb.add_time(l_librbd_lock_acquired_time, "lock_acquired_time", + "Lock acquired time", "lats", perf_prio); + + perfcounter = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perfcounter); + + perfcounter->tset(l_librbd_opened_time, ceph_clock_now()); + } + + void ImageCtx::perf_stop() { + ceph_assert(perfcounter); + cct->get_perfcounters_collection()->remove(perfcounter); + delete perfcounter; + } + + void ImageCtx::set_read_flag(unsigned flag) { + 
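+    // accumulate extra librados OPERATION_* read flags that get OR'd into
+    // subsequent object reads (see get_read_flags())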
+    extra_read_flags |= flag;
+  }
+
+  int ImageCtx::get_read_flags(snap_t snap_id) {
+    int flags = librados::OPERATION_NOFLAG | read_flags;
+    if (flags != 0)
+      return flags;
+
+    flags = librados::OPERATION_NOFLAG | extra_read_flags;
+    if (snap_id == LIBRADOS_SNAP_HEAD)
+      return flags;
+
+    if (config.get_val<bool>("rbd_balance_snap_reads"))
+      flags |= librados::OPERATION_BALANCE_READS;
+    else if (config.get_val<bool>("rbd_localize_snap_reads"))
+      flags |= librados::OPERATION_LOCALIZE_READS;
+    return flags;
+  }
+
+  int ImageCtx::snap_set(uint64_t in_snap_id) {
+    ceph_assert(ceph_mutex_is_wlocked(image_lock));
+    auto it = snap_info.find(in_snap_id);
+    if (in_snap_id != CEPH_NOSNAP && it != snap_info.end()) {
+      snap_id = in_snap_id;
+      snap_namespace = it->second.snap_namespace;
+      snap_name = it->second.name;
+      snap_exists = true;
+      if (data_ctx.is_valid()) {
+        data_ctx.snap_set_read(snap_id);
+        rebuild_data_io_context();
+      }
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  void ImageCtx::snap_unset()
+  {
+    ceph_assert(ceph_mutex_is_wlocked(image_lock));
+    snap_id = CEPH_NOSNAP;
+    snap_namespace = {};
+    snap_name = "";
+    snap_exists = true;
+    if (data_ctx.is_valid()) {
+      data_ctx.snap_set_read(snap_id);
+      rebuild_data_io_context();
+    }
+  }
+
+  snap_t ImageCtx::get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace,
+                               const string& in_snap_name) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    auto it = snap_ids.find({in_snap_namespace, in_snap_name});
+    if (it != snap_ids.end()) {
+      return it->second;
+    }
+    return CEPH_NOSNAP;
+  }
+
+  const SnapInfo* ImageCtx::get_snap_info(snap_t in_snap_id) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    std::map<snap_t, SnapInfo>::const_iterator it =
+      snap_info.find(in_snap_id);
+    if (it != snap_info.end())
+      return &it->second;
+    return nullptr;
+  }
+
+  int ImageCtx::get_snap_name(snap_t in_snap_id,
+                              string *out_snap_name) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info) {
+      *out_snap_name = info->name;
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  int ImageCtx::get_snap_namespace(snap_t in_snap_id,
+                                   cls::rbd::SnapshotNamespace *out_snap_namespace) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info) {
+      *out_snap_namespace = info->snap_namespace;
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  int ImageCtx::get_parent_spec(snap_t in_snap_id,
+                                cls::rbd::ParentImageSpec *out_pspec) const
+  {
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info) {
+      *out_pspec = info->parent.spec;
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  uint64_t ImageCtx::get_current_size() const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    return size;
+  }
+
+  uint64_t ImageCtx::get_object_size() const
+  {
+    return 1ull << order;
+  }
+
+  string ImageCtx::get_object_name(uint64_t num) const {
+    return util::data_object_name(this, num);
+  }
+
+  uint64_t ImageCtx::get_stripe_unit() const
+  {
+    return stripe_unit;
+  }
+
+  uint64_t ImageCtx::get_stripe_count() const
+  {
+    return stripe_count;
+  }
+
+  uint64_t ImageCtx::get_stripe_period() const
+  {
+    return stripe_count * (1ull << order);
+  }
+
+  utime_t ImageCtx::get_create_timestamp() const
+  {
+    return create_timestamp;
+  }
+
+  utime_t ImageCtx::get_access_timestamp() const
+  {
+    return access_timestamp;
+  }
+
+  utime_t ImageCtx::get_modify_timestamp() const
+  {
+    return modify_timestamp;
+  }
+
+  void ImageCtx::set_access_timestamp(utime_t at)
+  {
+    ceph_assert(ceph_mutex_is_wlocked(timestamp_lock));
+    access_timestamp = at;
+  }
+
+  void ImageCtx::set_modify_timestamp(utime_t mt)
+  {
+    ceph_assert(ceph_mutex_is_locked(timestamp_lock));
+    modify_timestamp = mt;
+  }
+
+  int ImageCtx::is_snap_protected(snap_t in_snap_id,
+                                  bool *is_protected) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info) {
+      *is_protected =
+        (info->protection_status == RBD_PROTECTION_STATUS_PROTECTED);
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  int ImageCtx::is_snap_unprotected(snap_t in_snap_id,
+                                    bool *is_unprotected) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info) {
+      *is_unprotected =
+        (info->protection_status == RBD_PROTECTION_STATUS_UNPROTECTED);
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  void ImageCtx::add_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+                          string in_snap_name,
+                          snap_t id, uint64_t in_size,
+                          const ParentImageInfo &parent,
+                          uint8_t protection_status, uint64_t flags,
+                          utime_t timestamp)
+  {
+    ceph_assert(ceph_mutex_is_wlocked(image_lock));
+    snaps.push_back(id);
+    SnapInfo info(in_snap_name, in_snap_namespace,
+                  in_size, parent, protection_status, flags, timestamp);
+    snap_info.insert({id, info});
+    snap_ids.insert({{in_snap_namespace, in_snap_name}, id});
+  }
+
+  void ImageCtx::rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+                         string in_snap_name,
+                         snap_t id)
+  {
+    ceph_assert(ceph_mutex_is_wlocked(image_lock));
+    snaps.erase(std::remove(snaps.begin(), snaps.end(), id), snaps.end());
+    snap_info.erase(id);
+    snap_ids.erase({in_snap_namespace, in_snap_name});
+  }
+
+  uint64_t ImageCtx::get_image_size(snap_t in_snap_id) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    if (in_snap_id == CEPH_NOSNAP) {
+      if (!resize_reqs.empty() &&
+          resize_reqs.front()->shrinking()) {
+        return resize_reqs.front()->get_image_size();
+      }
+      return size;
+    }
+
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info) {
+      return info->size;
+    }
+    return 0;
+  }
+
+  uint64_t ImageCtx::get_area_size(io::ImageArea area) const {
+    // image areas are defined only for the "opened at" snap_id
+    // (i.e. where encryption may be loaded)
+    uint64_t raw_size = get_image_size(snap_id);
+    if (raw_size == 0) {
+      return 0;
+    }
+
+    auto size = io::util::raw_to_area_offset(*this, raw_size);
+    ceph_assert(size.first <= raw_size && size.second == io::ImageArea::DATA);
+
+    switch (area) {
+    case io::ImageArea::DATA:
+      return size.first;
+    case io::ImageArea::CRYPTO_HEADER:
+      // CRYPTO_HEADER area ends where DATA area begins
+      return raw_size - size.first;
+    default:
+      ceph_abort();
+    }
+  }
+
+  uint64_t ImageCtx::get_object_count(snap_t in_snap_id) const {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    uint64_t image_size = get_image_size(in_snap_id);
+    return Striper::get_num_objects(layout, image_size);
+  }
+
+  bool ImageCtx::test_features(uint64_t features) const
+  {
+    std::shared_lock l{image_lock};
+    return test_features(features, image_lock);
+  }
+
+  bool ImageCtx::test_features(uint64_t in_features,
+                               const ceph::shared_mutex &in_image_lock) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    return ((features & in_features) == in_features);
+  }
+
+  bool ImageCtx::test_op_features(uint64_t in_op_features) const
+  {
+    std::shared_lock l{image_lock};
+    return test_op_features(in_op_features, image_lock);
+  }
+
+  bool ImageCtx::test_op_features(uint64_t in_op_features,
+                                  const ceph::shared_mutex &in_image_lock) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    return ((op_features & in_op_features) == in_op_features);
+  }
+
+  int ImageCtx::get_flags(librados::snap_t _snap_id, uint64_t *_flags) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    if (_snap_id == CEPH_NOSNAP) {
+      *_flags = flags;
+      return 0;
+    }
+    const SnapInfo *info = get_snap_info(_snap_id);
+    if (info) {
+      *_flags = info->flags;
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  int ImageCtx::test_flags(librados::snap_t in_snap_id,
+                           uint64_t flags, bool *flags_set) const
+  {
+    std::shared_lock l{image_lock};
+    return test_flags(in_snap_id, flags, image_lock, flags_set);
+  }
+
+  int ImageCtx::test_flags(librados::snap_t in_snap_id,
+                           uint64_t flags,
+                           const ceph::shared_mutex &in_image_lock,
+                           bool *flags_set) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    uint64_t snap_flags;
+    int r = get_flags(in_snap_id, &snap_flags);
+    if (r < 0) {
+      return r;
+    }
+    *flags_set = ((snap_flags & flags) == flags);
+    return 0;
+  }
+
+  int ImageCtx::update_flags(snap_t in_snap_id, uint64_t flag, bool enabled)
+  {
+    ceph_assert(ceph_mutex_is_wlocked(image_lock));
+    uint64_t *_flags;
+    if (in_snap_id == CEPH_NOSNAP) {
+      _flags = &flags;
+    } else {
+      std::map<snap_t, SnapInfo>::iterator it = snap_info.find(in_snap_id);
+      if (it == snap_info.end()) {
+        return -ENOENT;
+      }
+      _flags = &it->second.flags;
+    }
+
+    if (enabled) {
+      (*_flags) |= flag;
+    } else {
+      (*_flags) &= ~flag;
+    }
+    return 0;
+  }
+
+  const ParentImageInfo* ImageCtx::get_parent_info(snap_t in_snap_id) const
+  {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    if (in_snap_id == CEPH_NOSNAP)
+      return &parent_md;
+    const SnapInfo *info = get_snap_info(in_snap_id);
+    if (info)
+      return &info->parent;
+    return NULL;
+  }
+
+  int64_t ImageCtx::get_parent_pool_id(snap_t in_snap_id) const
+  {
+    const auto info = get_parent_info(in_snap_id);
+    if (info)
+      return info->spec.pool_id;
+    return -1;
+  }
+
+  string ImageCtx::get_parent_image_id(snap_t in_snap_id) const
+  {
+    const auto info = get_parent_info(in_snap_id);
+    if (info)
+      return info->spec.image_id;
+    return "";
+  }
+
+  uint64_t ImageCtx::get_parent_snap_id(snap_t in_snap_id) const
+  {
+    const auto info = get_parent_info(in_snap_id);
+    if (info)
+      return info->spec.snap_id;
+    return CEPH_NOSNAP;
+  }
+
+  int ImageCtx::get_parent_overlap(snap_t in_snap_id,
+                                   uint64_t* raw_overlap) const {
+    const auto info = get_parent_info(in_snap_id);
+    if (info) {
+      *raw_overlap = info->overlap;
+      return 0;
+    }
+    return -ENOENT;
+  }
+
+  std::pair<uint64_t, io::ImageArea> ImageCtx::reduce_parent_overlap(
+      uint64_t raw_overlap, bool migration_write) const {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    if (migration_write) {
+      // don't reduce migration write overlap -- it may be larger as
+      // it's the largest overlap across snapshots by construction
+      return io::util::raw_to_area_offset(*this, raw_overlap);
+    }
+    if (raw_overlap == 0 || parent == nullptr) {
+      // image opened with OPEN_FLAG_SKIP_OPEN_PARENT -> no overlap
+      return io::util::raw_to_area_offset(*this, 0);
+    }
+    // DATA area in the parent may be smaller than the part of DATA
+    // area in the clone that is still within the overlap (e.g. for
+    // LUKS2-encrypted parent + LUKS1-encrypted clone, due to LUKS2
+    // header usually being bigger than LUKS1 header)
+    auto overlap = io::util::raw_to_area_offset(*this, raw_overlap);
+    std::shared_lock parent_image_locker(parent->image_lock);
+    overlap.first = std::min(overlap.first,
+                             parent->get_area_size(overlap.second));
+    return overlap;
+  }
+
+  uint64_t ImageCtx::prune_parent_extents(io::Extents& image_extents,
+                                          io::ImageArea area,
+                                          uint64_t raw_overlap,
+                                          bool migration_write) const {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    ldout(cct, 10) << __func__ << ": image_extents=" << image_extents
+                   << " area=" << area << " raw_overlap=" << raw_overlap
+                   << " migration_write=" << migration_write << dendl;
+    if (raw_overlap == 0) {
+      image_extents.clear();
+      return 0;
+    }
+
+    auto overlap = reduce_parent_overlap(raw_overlap, migration_write);
+    if (area == overlap.second) {
+      // drop extents completely beyond the overlap
+      while (!image_extents.empty() &&
+             image_extents.back().first >= overlap.first) {
+        image_extents.pop_back();
+      }
+      if (!image_extents.empty()) {
+        // trim final overlapping extent
+        auto& last_extent = image_extents.back();
+        if (last_extent.first + last_extent.second > overlap.first) {
+          last_extent.second = overlap.first - last_extent.first;
+        }
+      }
+    } else if (area == io::ImageArea::DATA &&
+               overlap.second == io::ImageArea::CRYPTO_HEADER) {
+      // all extents completely beyond the overlap
+      image_extents.clear();
+    } else {
+      // all extents completely within the overlap
+      ceph_assert(area == io::ImageArea::CRYPTO_HEADER &&
+                  overlap.second == io::ImageArea::DATA);
+    }
+
+    uint64_t overlap_bytes = 0;
+    for (auto [_, len] : image_extents) {
+      overlap_bytes += len;
+    }
+    ldout(cct, 10) << __func__ << ": overlap=" << overlap.first
+                   << "/" << overlap.second
+                   << " got overlap_bytes=" << overlap_bytes
+                   << " at " << image_extents << dendl;
+    return overlap_bytes;
+  }
+
+  void ImageCtx::register_watch(Context *on_finish) {
+    ceph_assert(image_watcher != NULL);
+    image_watcher->register_watch(on_finish);
+  }
+
+  void ImageCtx::cancel_async_requests() {
+    C_SaferCond ctx;
+    cancel_async_requests(&ctx);
+    ctx.wait();
+  }
+
+  void ImageCtx::cancel_async_requests(Context *on_finish) {
+    {
+      std::lock_guard async_ops_locker{async_ops_lock};
+      if (!async_requests.empty()) {
+        ldout(cct, 10) << "canceling async requests: count="
+                       << async_requests.size() << dendl;
+        for (auto req : async_requests) {
+          ldout(cct, 10) << "canceling async request: " << req << dendl;
+          req->cancel();
+        }
+
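+        // Illustrative note (editorial, not upstream code): cancellation is
+        // asynchronous -- on_finish is parked on async_requests_waiters here
+        // and completed elsewhere once the request list drains, not inline.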
+        async_requests_waiters.push_back(on_finish);
+        return;
+      }
+    }
+
+    on_finish->complete(0);
+  }
+
+  void ImageCtx::apply_metadata(const std::map<std::string, bufferlist> &meta,
+                                bool thread_safe) {
+    ldout(cct, 20) << __func__ << dendl;
+
+    std::unique_lock image_locker(image_lock);
+
+    // reset settings back to global defaults
+    config_overrides.clear();
+    config.set_config_values(cct->_conf.get_config_values());
+
+    // extract config overrides
+    for (auto meta_pair : meta) {
+      if (!boost::starts_with(meta_pair.first, METADATA_CONF_PREFIX)) {
+        continue;
+      }
+
+      std::string key = meta_pair.first.substr(METADATA_CONF_PREFIX.size());
+      if (!boost::starts_with(key, "rbd_")) {
+        // ignore non-RBD configuration keys
+        // TODO use option schema to determine applicable subsystem
+        ldout(cct, 0) << __func__ << ": ignoring config " << key << dendl;
+        continue;
+      }
+
+      if (config.find_option(key) != nullptr) {
+        std::string val(meta_pair.second.c_str(), meta_pair.second.length());
+        int r = config.set_val(key, val);
+        if (r >= 0) {
+          ldout(cct, 20) << __func__ << ": " << key << "=" << val << dendl;
+          config_overrides.insert(key);
+        } else {
+          lderr(cct) << __func__ << ": failed to set config " << key << " "
+                     << "with value " << val << ": " << cpp_strerror(r)
+                     << dendl;
+        }
+      }
+    }
+
+    image_locker.unlock();
+
+#define ASSIGN_OPTION(param, type) \
+    param = config.get_val<type>("rbd_"#param)
+
+    bool skip_partial_discard = true;
+    ASSIGN_OPTION(non_blocking_aio, bool);
+    ASSIGN_OPTION(cache, bool);
+    ASSIGN_OPTION(sparse_read_threshold_bytes, Option::size_t);
+    ASSIGN_OPTION(clone_copy_on_read, bool);
+    ASSIGN_OPTION(enable_alloc_hint, bool);
+    ASSIGN_OPTION(mirroring_replay_delay, uint64_t);
+    ASSIGN_OPTION(mtime_update_interval, uint64_t);
+    ASSIGN_OPTION(atime_update_interval, uint64_t);
+    ASSIGN_OPTION(skip_partial_discard, bool);
+    ASSIGN_OPTION(discard_granularity_bytes, uint64_t);
+    ASSIGN_OPTION(blkin_trace_all, bool);
+
+    auto cache_policy = config.get_val<std::string>("rbd_cache_policy");
+    if (cache_policy == "writethrough" || cache_policy == "writeback") {
+      ASSIGN_OPTION(readahead_max_bytes, Option::size_t);
+      ASSIGN_OPTION(readahead_disable_after_bytes, Option::size_t);
+    }
+
+#undef ASSIGN_OPTION
+
+    if (sparse_read_threshold_bytes == 0) {
+      sparse_read_threshold_bytes = get_object_size();
+    }
+
+    bool dirty_cache = test_features(RBD_FEATURE_DIRTY_CACHE);
+    if (!skip_partial_discard || dirty_cache) {
+      discard_granularity_bytes = 0;
+    }
+
+    alloc_hint_flags = 0;
+    auto compression_hint = config.get_val<std::string>("rbd_compression_hint");
+    if (compression_hint == "compressible") {
+      alloc_hint_flags |= librados::ALLOC_HINT_FLAG_COMPRESSIBLE;
+    } else if (compression_hint == "incompressible") {
+      alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+    }
+
+    librados::Rados rados(md_ctx);
+    int8_t require_osd_release;
+    int r = rados.get_min_compatible_osd(&require_osd_release);
+    if (r == 0 && require_osd_release >= CEPH_RELEASE_OCTOPUS) {
+      read_flags = 0;
+      auto read_policy = config.get_val<std::string>("rbd_read_from_replica_policy");
+      if (read_policy == "balance") {
+        read_flags |= librados::OPERATION_BALANCE_READS;
+      } else if (read_policy == "localize") {
+        read_flags |= librados::OPERATION_LOCALIZE_READS;
+      }
+    }
+
+    io_image_dispatcher->apply_qos_schedule_tick_min(
+      config.get_val<uint64_t>("rbd_qos_schedule_tick_min"));
+
+    io_image_dispatcher->apply_qos_limit(
+      io::IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE,
+      config.get_val<uint64_t>("rbd_qos_iops_limit"),
+      config.get_val<uint64_t>("rbd_qos_iops_burst"),
+      config.get_val<uint64_t>("rbd_qos_iops_burst_seconds"));
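+    // Illustrative note (editorial; the values below are hypothetical, not
+    // defaults): each apply_qos_limit() call takes (flag, limit, burst,
+    // burst_seconds), e.g.
+    //   rbd_qos_iops_limit = 1000, rbd_qos_iops_burst = 2000,
+    //   rbd_qos_iops_burst_seconds = 1
+    // caps steady-state I/O at 1000 IOPS while allowing a burst of up to
+    // 2000 IOPS sustained for roughly one second.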
+    io_image_dispatcher->apply_qos_limit(
+      io::IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE,
+      config.get_val<uint64_t>("rbd_qos_bps_limit"),
+      config.get_val<uint64_t>("rbd_qos_bps_burst"),
+      config.get_val<uint64_t>("rbd_qos_bps_burst_seconds"));
+    io_image_dispatcher->apply_qos_limit(
+      io::IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE,
+      config.get_val<uint64_t>("rbd_qos_read_iops_limit"),
+      config.get_val<uint64_t>("rbd_qos_read_iops_burst"),
+      config.get_val<uint64_t>("rbd_qos_read_iops_burst_seconds"));
+    io_image_dispatcher->apply_qos_limit(
+      io::IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE,
+      config.get_val<uint64_t>("rbd_qos_write_iops_limit"),
+      config.get_val<uint64_t>("rbd_qos_write_iops_burst"),
+      config.get_val<uint64_t>("rbd_qos_write_iops_burst_seconds"));
+    io_image_dispatcher->apply_qos_limit(
+      io::IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE,
+      config.get_val<uint64_t>("rbd_qos_read_bps_limit"),
+      config.get_val<uint64_t>("rbd_qos_read_bps_burst"),
+      config.get_val<uint64_t>("rbd_qos_read_bps_burst_seconds"));
+    io_image_dispatcher->apply_qos_limit(
+      io::IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE,
+      config.get_val<uint64_t>("rbd_qos_write_bps_limit"),
+      config.get_val<uint64_t>("rbd_qos_write_bps_burst"),
+      config.get_val<uint64_t>("rbd_qos_write_bps_burst_seconds"));
+    io_image_dispatcher->apply_qos_exclude_ops(
+      librbd::io::rbd_io_operations_from_string(
+        config.get_val<std::string>("rbd_qos_exclude_ops"), nullptr));
+
+    if (!disable_zero_copy &&
+        config.get_val<bool>("rbd_disable_zero_copy_writes")) {
+      ldout(cct, 5) << this << ": disabling zero-copy writes" << dendl;
+      disable_zero_copy = true;
+    }
+  }
+
+  ExclusiveLock<ImageCtx> *ImageCtx::create_exclusive_lock() {
+    return new ExclusiveLock<ImageCtx>(*this);
+  }
+
+  ObjectMap<ImageCtx> *ImageCtx::create_object_map(uint64_t snap_id) {
+    return new ObjectMap<ImageCtx>(*this, snap_id);
+  }
+
+  Journal<ImageCtx> *ImageCtx::create_journal() {
+    return new Journal<ImageCtx>(*this);
+  }
+
+  void ImageCtx::set_image_name(const std::string &image_name) {
+    // update the name so rename can be invoked repeatedly
+    std::shared_lock owner_locker{owner_lock};
+    std::unique_lock image_locker{image_lock};
+    name = image_name;
+    if (old_format) {
+      header_oid = util::old_header_name(image_name);
+    }
+  }
+
+  void ImageCtx::notify_update() {
+    state->handle_update_notification();
+    ImageWatcher<>::notify_header_update(md_ctx, header_oid);
+  }
+
+  void ImageCtx::notify_update(Context *on_finish) {
+    state->handle_update_notification();
+    image_watcher->notify_header_update(on_finish);
+  }
+
+  exclusive_lock::Policy *ImageCtx::get_exclusive_lock_policy() const {
+    ceph_assert(ceph_mutex_is_locked(owner_lock));
+    ceph_assert(exclusive_lock_policy != nullptr);
+    return exclusive_lock_policy;
+  }
+
+  void ImageCtx::set_exclusive_lock_policy(exclusive_lock::Policy *policy) {
+    ceph_assert(ceph_mutex_is_wlocked(owner_lock));
+    ceph_assert(policy != nullptr);
+    delete exclusive_lock_policy;
+    exclusive_lock_policy = policy;
+  }
+
+  journal::Policy *ImageCtx::get_journal_policy() const {
+    ceph_assert(ceph_mutex_is_locked(image_lock));
+    ceph_assert(journal_policy != nullptr);
+    return journal_policy;
+  }
+
+  void ImageCtx::set_journal_policy(journal::Policy *policy) {
+    ceph_assert(ceph_mutex_is_wlocked(image_lock));
+    ceph_assert(policy != nullptr);
+    delete journal_policy;
+    journal_policy = policy;
+  }
+
+  void ImageCtx::rebuild_data_io_context() {
+    auto ctx = std::make_shared<neorados::IOContext>(
+      data_ctx.get_id(), data_ctx.get_namespace());
+    if (snap_id != CEPH_NOSNAP) {
+      ctx->read_snap(snap_id);
+    }
+    if (!snapc.snaps.empty()) {
+      ctx->write_snap_context(
+        {{snapc.seq, {snapc.snaps.begin(), snapc.snaps.end()}}});
+    }
+    if (data_ctx.get_pool_full_try()) {
+      ctx->full_try(true);
+    }
+
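+    // Illustrative note (editorial, not upstream code): this is a
+    // copy-and-swap pattern -- a fresh neorados::IOContext is fully populated
+    // above and atomically swapped in below, so in-flight I/O keeps using the
+    // old shared_ptr it already holds.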
+    // atomically reset the data IOContext to new version
+    atomic_store(&data_io_context, ctx);
+  }
+
+  IOContext ImageCtx::get_data_io_context() const {
+    return atomic_load(&data_io_context);
+  }
+
+  IOContext ImageCtx::duplicate_data_io_context() const {
+    auto ctx = get_data_io_context();
+    return std::make_shared<neorados::IOContext>(*ctx);
+  }
+
+  void ImageCtx::get_timer_instance(CephContext *cct, SafeTimer **timer,
+                                    ceph::mutex **timer_lock) {
+    auto safe_timer_singleton =
+      &cct->lookup_or_create_singleton_object<SafeTimerSingleton>(
+        "librbd::journal::safe_timer", false, cct);
+    *timer = safe_timer_singleton;
+    *timer_lock = &safe_timer_singleton->lock;
+  }
+}
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
new file mode 100644
index 000000000..9a432c764
--- /dev/null
+++ b/src/librbd/ImageCtx.h
@@ -0,0 +1,368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_IMAGECTX_H
+#define CEPH_LIBRBD_IMAGECTX_H
+
+#include "include/int_types.h"
+
+#include <atomic>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/Timer.h"
+#include "common/ceph_mutex.h"
+#include "common/config_proxy.h"
+#include "common/event_socket.h"
+#include "common/Readahead.h"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
+
+#include "include/common_fwd.h"
+#include "include/buffer_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rbd_types.h"
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsyncRequest.h"
+#include "librbd/Types.h"
+
+#include <boost/lockfree/policies.hpp>
+#include <boost/lockfree/queue.hpp>
+
+namespace neorados {
+class IOContext;
+class RADOS;
+} // namespace neorados
+
+namespace librbd {
+
+  struct AsioEngine;
+  template <typename> class ConfigWatcher;
+  template <typename> class ExclusiveLock;
+  template <typename> class ImageState;
+  template <typename> class ImageWatcher;
+  template <typename> class Journal;
+  class LibrbdAdminSocketHook;
+  template <typename> class ObjectMap;
+  template <typename> class Operations;
+  template <typename> class PluginRegistry;
+
+  namespace asio { struct ContextWQ; }
+  namespace crypto { template <typename> class EncryptionFormat; }
+  namespace exclusive_lock { struct Policy; }
+  namespace io {
+  class AioCompletion;
+  class AsyncOperation;
+  template <typename> class CopyupRequest;
+  enum class ImageArea;
+  struct ImageDispatcherInterface;
+  struct ObjectDispatcherInterface;
+  }
+  namespace journal { struct Policy; }
+
+  namespace operation {
+  template <typename> class ResizeRequest;
+  }
+
+  struct ImageCtx {
+    typedef std::pair<cls::rbd::SnapshotNamespace, std::string> SnapKey;
+    struct SnapKeyComparator {
+      inline bool operator()(const SnapKey& lhs, const SnapKey& rhs) const {
+        // only compare by namespace type and name
+        if (lhs.first.index() != rhs.first.index()) {
+          return lhs.first.index() < rhs.first.index();
+        }
+        return lhs.second < rhs.second;
+      }
+    };
+
+    static const std::string METADATA_CONF_PREFIX;
+
+    CephContext *cct;
+    ConfigProxy config;
+    std::set<std::string> config_overrides;
+
+    PerfCounters *perfcounter;
+    struct rbd_obj_header_ondisk header;
+    ::SnapContext snapc;
+    std::vector<librados::snap_t> snaps; // this mirrors snapc.snaps, but is in
+                                         // a format librados can understand
+    std::map<librados::snap_t, SnapInfo> snap_info;
+    std::map<SnapKey, librados::snap_t, SnapKeyComparator> snap_ids;
+    uint64_t open_snap_id = CEPH_NOSNAP;
+    uint64_t snap_id;
+    bool snap_exists; // false if our snap_id was deleted
+    // whether the image was opened read-only. cannot be changed after opening
+    bool read_only;
+    uint32_t read_only_flags = 0U;
+    uint32_t read_only_mask = ~0U;
+
+    std::map<rados::cls::lock::locker_id_t,
+             rados::cls::lock::locker_info_t> lockers;
+    bool exclusive_locked;
+    std::string lock_tag;
+
+    std::string name;
+    cls::rbd::SnapshotNamespace snap_namespace;
+    std::string snap_name;
+
+    std::shared_ptr<AsioEngine> asio_engine;
+
+    // New ASIO-style RADOS API
+    neorados::RADOS& rados_api;
+
+    // Legacy RADOS API
+    librados::IoCtx data_ctx;
+    librados::IoCtx md_ctx;
+
+    ConfigWatcher<ImageCtx> *config_watcher = nullptr;
+    ImageWatcher<ImageCtx> *image_watcher;
+    Journal<ImageCtx> *journal;
+
+    /**
+     * Lock ordering:
+     *
+     * owner_lock, image_lock
+     * async_op_lock, timestamp_lock
+     */
+    ceph::shared_mutex owner_lock; // protects exclusive lock leadership updates
+    mutable ceph::shared_mutex image_lock; // protects snapshot-related member variables,
+                                  // features (and associated helper classes), and flags
+                                  // protects access to the mutable image metadata that
+                                  // isn't guarded by other locks below, and blocks writes
+                                  // when held exclusively, so snapshots can be consistent.
+                                  // Fields guarded include:
+                                  // total_bytes_read
+                                  // exclusive_locked
+                                  // lock_tag
+                                  // lockers
+                                  // object_map
+                                  // parent_md and parent
+                                  // encryption_format
+
+    ceph::shared_mutex timestamp_lock; // protects (create/access/modify)_timestamp
+    ceph::mutex async_ops_lock; // protects async_ops and async_requests
+    ceph::mutex copyup_list_lock; // protects copyup_waiting_list
+
+    unsigned extra_read_flags; // librados::OPERATION_*
+
+    bool old_format;
+    uint8_t order;
+    uint64_t size;
+    uint64_t features;
+    std::string object_prefix;
+    char *format_string;
+    std::string header_oid;
+    std::string id; // only used for new-format images
+    ParentImageInfo parent_md;
+    ImageCtx *parent;
+    ImageCtx *child = nullptr;
+    MigrationInfo migration_info;
+    cls::rbd::GroupSpec group_spec;
+    uint64_t stripe_unit, stripe_count;
+    uint64_t flags;
+    uint64_t op_features = 0;
+    bool operations_disabled = false;
+    utime_t create_timestamp;
+    utime_t access_timestamp;
+    utime_t modify_timestamp;
+
+    file_layout_t layout;
+
+    Readahead readahead;
+    std::atomic<uint64_t> total_bytes_read = {0};
+
+    std::map<uint64_t, io::CopyupRequest<ImageCtx>*> copyup_list;
+
+    xlist<io::AsyncOperation*> async_ops;
+    xlist<AsyncRequest<>*> async_requests;
+    std::list<Context*> async_requests_waiters;
+
+    ImageState<ImageCtx> *state;
+    Operations<ImageCtx> *operations;
+
+    ExclusiveLock<ImageCtx> *exclusive_lock;
+    ObjectMap<ImageCtx> *object_map;
+
+    xlist<operation::ResizeRequest<ImageCtx>*> resize_reqs;
+
+    io::ImageDispatcherInterface *io_image_dispatcher = nullptr;
+    io::ObjectDispatcherInterface *io_object_dispatcher = nullptr;
+
+    asio::ContextWQ *op_work_queue;
+
+    PluginRegistry<ImageCtx>* plugin_registry;
+
+    using Completions = boost::lockfree::queue<io::AioCompletion*>;
+
+    Completions event_socket_completions;
+    EventSocket event_socket;
+
+    bool ignore_migrating = false;
+    bool disable_zero_copy = false;
+    bool enable_sparse_copyup = false;
+
+    /// Cached latency-sensitive configuration settings
+    bool non_blocking_aio;
+    bool cache;
+    uint64_t sparse_read_threshold_bytes;
+    uint64_t readahead_max_bytes = 0;
+    uint64_t readahead_disable_after_bytes = 0;
+    bool clone_copy_on_read;
+    bool enable_alloc_hint;
+    uint32_t alloc_hint_flags = 0U;
+    uint32_t read_flags = 0U; // librados::OPERATION_*
+    uint32_t discard_granularity_bytes = 0;
+    bool blkin_trace_all;
+    uint64_t mirroring_replay_delay;
+    uint64_t mtime_update_interval;
+    uint64_t atime_update_interval;
+
+    LibrbdAdminSocketHook *asok_hook;
+
+    exclusive_lock::Policy *exclusive_lock_policy = nullptr;
+    journal::Policy *journal_policy = nullptr;
+
+    ZTracer::Endpoint trace_endpoint;
+
+    std::unique_ptr<crypto::EncryptionFormat<ImageCtx>> encryption_format;
+
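+    // Illustrative note (editorial; `ictx` is a hypothetical caller's
+    // pointer, not upstream code): per the lock ordering documented above,
+    // nest image_lock inside owner_lock when both are needed:
+    //   std::shared_lock owner_locker{ictx->owner_lock};
+    //   std::shared_lock image_locker{ictx->image_lock};
+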
+    // unit test mock helpers
+    static ImageCtx* create(const std::string &image_name,
+                            const std::string &image_id,
+                            const char *snap, IoCtx& p, bool read_only) {
+      return new ImageCtx(image_name, image_id, snap, p, read_only);
+    }
+    static ImageCtx* create(const std::string &image_name,
+                            const std::string &image_id,
+                            librados::snap_t snap_id, IoCtx& p,
+                            bool read_only) {
+      return new ImageCtx(image_name, image_id, snap_id, p, read_only);
+    }
+
+    /**
+     * Either image_name or image_id must be set.
+     * If id is not known, pass the empty std::string,
+     * and init() will look it up.
+     */
+    ImageCtx(const std::string &image_name, const std::string &image_id,
+             const char *snap, IoCtx& p, bool read_only);
+    ImageCtx(const std::string &image_name, const std::string &image_id,
+             librados::snap_t snap_id, IoCtx& p, bool read_only);
+    ~ImageCtx();
+    void init();
+    void shutdown();
+    void init_layout(int64_t pool_id);
+    void perf_start(std::string name);
+    void perf_stop();
+    void set_read_flag(unsigned flag);
+    int get_read_flags(librados::snap_t snap_id);
+    int snap_set(uint64_t snap_id);
+    void snap_unset();
+    librados::snap_t get_snap_id(const cls::rbd::SnapshotNamespace& in_snap_namespace,
+                                 const std::string& in_snap_name) const;
+    const SnapInfo* get_snap_info(librados::snap_t in_snap_id) const;
+    int get_snap_name(librados::snap_t in_snap_id,
+                      std::string *out_snap_name) const;
+    int get_snap_namespace(librados::snap_t in_snap_id,
+                           cls::rbd::SnapshotNamespace *out_snap_namespace) const;
+    int get_parent_spec(librados::snap_t in_snap_id,
+                        cls::rbd::ParentImageSpec *pspec) const;
+    int is_snap_protected(librados::snap_t in_snap_id,
+                          bool *is_protected) const;
+    int is_snap_unprotected(librados::snap_t in_snap_id,
+                            bool *is_unprotected) const;
+
+    uint64_t get_current_size() const;
+    uint64_t get_object_size() const;
+    std::string get_object_name(uint64_t num) const;
+    uint64_t get_stripe_unit() const;
+    uint64_t get_stripe_count() const;
+    uint64_t get_stripe_period() const;
+    utime_t get_create_timestamp() const;
+    utime_t get_access_timestamp() const;
+    utime_t get_modify_timestamp() const;
+
+    void set_access_timestamp(utime_t at);
+    void set_modify_timestamp(utime_t at);
+
+    void add_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+                  std::string in_snap_name,
+                  librados::snap_t id,
+                  uint64_t in_size, const ParentImageInfo &parent,
+                  uint8_t protection_status, uint64_t flags, utime_t timestamp);
+    void rm_snap(cls::rbd::SnapshotNamespace in_snap_namespace,
+                 std::string in_snap_name,
+                 librados::snap_t id);
+    uint64_t get_image_size(librados::snap_t in_snap_id) const;
+    uint64_t get_area_size(io::ImageArea area) const;
+    uint64_t get_object_count(librados::snap_t in_snap_id) const;
+    bool test_features(uint64_t test_features) const;
+    bool test_features(uint64_t test_features,
+                       const ceph::shared_mutex &in_image_lock) const;
+    bool test_op_features(uint64_t op_features) const;
+    bool test_op_features(uint64_t op_features,
+                          const ceph::shared_mutex &in_image_lock) const;
+    int get_flags(librados::snap_t in_snap_id, uint64_t *flags) const;
+    int test_flags(librados::snap_t in_snap_id,
+                   uint64_t test_flags, bool *flags_set) const;
+    int test_flags(librados::snap_t in_snap_id,
+                   uint64_t test_flags, const ceph::shared_mutex &in_image_lock,
+                   bool *flags_set) const;
+    int update_flags(librados::snap_t in_snap_id, uint64_t flag, bool enabled);
+
+    const ParentImageInfo* get_parent_info(librados::snap_t in_snap_id) const;
+    int64_t get_parent_pool_id(librados::snap_t in_snap_id) const;
+    std::string get_parent_image_id(librados::snap_t in_snap_id) const;
+    uint64_t get_parent_snap_id(librados::snap_t in_snap_id) const;
+    int get_parent_overlap(librados::snap_t in_snap_id,
+                           uint64_t* raw_overlap) const;
+    std::pair<uint64_t, io::ImageArea> reduce_parent_overlap(
+        uint64_t raw_overlap, bool migration_write) const;
+    uint64_t prune_parent_extents(
+        std::vector<std::pair<uint64_t, uint64_t>>& image_extents,
+        io::ImageArea area, uint64_t raw_overlap, bool migration_write) const;
+
+    void register_watch(Context *on_finish);
+
+    void cancel_async_requests();
+    void cancel_async_requests(Context *on_finish);
+
+    void apply_metadata(const std::map<std::string, bufferlist> &meta,
+                        bool thread_safe);
+
+    ExclusiveLock<ImageCtx> *create_exclusive_lock();
+    ObjectMap<ImageCtx> *create_object_map(uint64_t snap_id);
+    Journal<ImageCtx> *create_journal();
+
+    void set_image_name(const std::string &name);
+
+    void notify_update();
+    void notify_update(Context *on_finish);
+
+    exclusive_lock::Policy *get_exclusive_lock_policy() const;
+    void set_exclusive_lock_policy(exclusive_lock::Policy *policy);
+
+    journal::Policy *get_journal_policy() const;
+    void set_journal_policy(journal::Policy *policy);
+
+    void rebuild_data_io_context();
+    IOContext get_data_io_context() const;
+    IOContext duplicate_data_io_context() const;
+
+    static void get_timer_instance(CephContext *cct, SafeTimer **timer,
+                                   ceph::mutex **timer_lock);
+
+  private:
+    std::shared_ptr<neorados::IOContext> data_io_context;
+  };
+}
+
+#endif
diff --git a/src/librbd/ImageState.cc b/src/librbd/ImageState.cc
new file mode 100644
index 000000000..a81a8373d
--- /dev/null
+++ b/src/librbd/ImageState.cc
@@ -0,0 +1,1040 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageState.h"
+#include "include/rbd/librbd.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/WorkQueue.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/TaskFinisher.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/CloseRequest.h"
+#include "librbd/image/OpenRequest.h"
+#include "librbd/image/RefreshRequest.h"
+#include "librbd/image/SetSnapRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageState: " << this << " "
+
+namespace librbd {
+
+using util::create_async_context_callback;
+using util::create_context_callback;
+
+class ImageUpdateWatchers {
+public:
+
+  explicit ImageUpdateWatchers(CephContext *cct) : m_cct(cct),
+    m_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageUpdateWatchers::m_lock", this))) {
+  }
+
+  ~ImageUpdateWatchers() {
+    ceph_assert(m_watchers.empty());
+    ceph_assert(m_in_flight.empty());
+    ceph_assert(m_pending_unregister.empty());
+    ceph_assert(m_on_shut_down_finish == nullptr);
+
+    destroy_work_queue();
+  }
+
+  void flush(Context *on_finish) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+    {
+      std::lock_guard locker{m_lock};
+      if (!m_in_flight.empty()) {
+        Context *ctx = new LambdaContext(
+          [this, on_finish](int r) {
+            ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                             << ": completing flush" << dendl;
+            on_finish->complete(r);
+          });
+        m_work_queue->queue(ctx, 0);
+        return;
+      }
+    }
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                     << ": completing flush" << dendl;
+    on_finish->complete(0);
+  }
+
+  void shut_down(Context *on_finish) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+    {
+      std::lock_guard locker{m_lock};
+      ceph_assert(m_on_shut_down_finish == nullptr);
+      m_watchers.clear();
+      if (!m_in_flight.empty()) {
+        m_on_shut_down_finish = on_finish;
+        return;
+      }
+    }
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                     << ": completing shut down" << dendl;
+    on_finish->complete(0);
+  }
+
+  void register_watcher(UpdateWatchCtx *watcher, uint64_t *handle) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": watcher="
+                     << watcher << dendl;
+
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_on_shut_down_finish == nullptr);
+
+    create_work_queue();
+
+    *handle = m_next_handle++;
+    m_watchers.insert(std::make_pair(*handle, watcher));
+  }
+
+  void unregister_watcher(uint64_t handle, Context *on_finish) {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+                     << handle << dendl;
+    int r = 0;
+    {
+      std::lock_guard locker{m_lock};
+      auto it = m_watchers.find(handle);
+      if (it == m_watchers.end()) {
+        r = -ENOENT;
+      } else {
+        if (m_in_flight.find(handle) != m_in_flight.end()) {
+          ceph_assert(m_pending_unregister.find(handle) == m_pending_unregister.end());
+          m_pending_unregister[handle] = on_finish;
+          on_finish = nullptr;
+        }
+        m_watchers.erase(it);
+      }
+    }
+
+    if (on_finish) {
+      ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                       << ": completing unregister" << dendl;
+      on_finish->complete(r);
+    }
+  }
+
+  void notify() {
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << dendl;
+
+    std::lock_guard locker{m_lock};
+    for (auto it : m_watchers) {
+      send_notify(it.first, it.second);
+    }
+  }
+
+  void send_notify(uint64_t handle, UpdateWatchCtx *watcher) {
+    ceph_assert(ceph_mutex_is_locked(m_lock));
+
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+                     << handle << ", watcher=" << watcher << dendl;
+
+    m_in_flight.insert(handle);
+
+    Context *ctx = new LambdaContext(
+      [this, handle, watcher](int r) {
+        handle_notify(handle, watcher);
+      });
+
+    m_work_queue->queue(ctx, 0);
+  }
+
+  void handle_notify(uint64_t handle, UpdateWatchCtx *watcher) {
+
+    ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__ << ": handle="
+                     << handle << ", watcher=" << watcher << dendl;
+
+    watcher->handle_notify();
+
+    Context *on_unregister_finish = nullptr;
+    Context *on_shut_down_finish = nullptr;
+
+    {
+      std::lock_guard locker{m_lock};
+
+      auto in_flight_it = m_in_flight.find(handle);
+      ceph_assert(in_flight_it != m_in_flight.end());
+      m_in_flight.erase(in_flight_it);
+
+      // If there is no more in flight notifications for this watcher
+      // and it is pending unregister, complete it now.
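+      // Illustrative note (editorial, not upstream code): m_in_flight is a
+      // multiset because notify() can queue several notifications for the
+      // same handle; erasing one instance above and re-checking with find()
+      // below accounts for the remaining duplicates.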
+      if (m_in_flight.find(handle) == m_in_flight.end()) {
+        auto it = m_pending_unregister.find(handle);
+        if (it != m_pending_unregister.end()) {
+          on_unregister_finish = it->second;
+          m_pending_unregister.erase(it);
+        }
+      }
+
+      if (m_in_flight.empty()) {
+        ceph_assert(m_pending_unregister.empty());
+        if (m_on_shut_down_finish != nullptr) {
+          std::swap(m_on_shut_down_finish, on_shut_down_finish);
+        }
+      }
+    }
+
+    if (on_unregister_finish != nullptr) {
+      ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                       << ": completing unregister" << dendl;
+      on_unregister_finish->complete(0);
+    }
+
+    if (on_shut_down_finish != nullptr) {
+      ldout(m_cct, 20) << "ImageUpdateWatchers::" << __func__
+                       << ": completing shut down" << dendl;
+      on_shut_down_finish->complete(0);
+    }
+  }
+
+private:
+  class ThreadPoolSingleton : public ThreadPool {
+  public:
+    explicit ThreadPoolSingleton(CephContext *cct)
+      : ThreadPool(cct, "librbd::ImageUpdateWatchers::thread_pool", "tp_librbd",
+                   1) {
+      start();
+    }
+    ~ThreadPoolSingleton() override {
+      stop();
+    }
+  };
+
+  CephContext *m_cct;
+  ceph::mutex m_lock;
+  ContextWQ *m_work_queue = nullptr;
+  std::map<uint64_t, UpdateWatchCtx*> m_watchers;
+  uint64_t m_next_handle = 0;
+  std::multiset<uint64_t> m_in_flight;
+  std::map<uint64_t, Context*> m_pending_unregister;
+  Context *m_on_shut_down_finish = nullptr;
+
+  void create_work_queue() {
+    if (m_work_queue != nullptr) {
+      return;
+    }
+    auto& thread_pool = m_cct->lookup_or_create_singleton_object<
+      ThreadPoolSingleton>("librbd::ImageUpdateWatchers::thread_pool",
+                           false, m_cct);
+    m_work_queue = new ContextWQ("librbd::ImageUpdateWatchers::work_queue",
+                                 ceph::make_timespan(
+                                   m_cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout")),
+                                 &thread_pool);
+  }
+
+  void destroy_work_queue() {
+    if (m_work_queue == nullptr) {
+      return;
+    }
+    m_work_queue->drain();
+    delete m_work_queue;
+  }
+};
+
+class QuiesceWatchers {
+public:
+  explicit QuiesceWatchers(CephContext *cct, asio::ContextWQ* work_queue)
+    : m_cct(cct),
+      m_work_queue(work_queue),
+      m_lock(ceph::make_mutex(util::unique_lock_name(
+        "librbd::QuiesceWatchers::m_lock", this))) {
+  }
+
+  ~QuiesceWatchers() {
+    ceph_assert(m_pending_unregister.empty());
+    ceph_assert(m_on_notify == nullptr);
+  }
+
+  void register_watcher(QuiesceWatchCtx *watcher, uint64_t *handle) {
+    ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": watcher="
+                     << watcher << dendl;
+
+    std::lock_guard locker{m_lock};
+
+    *handle = m_next_handle++;
+    m_watchers[*handle] = watcher;
+  }
+
+  void unregister_watcher(uint64_t handle, Context *on_finish) {
+    int r = 0;
+    {
+      std::lock_guard locker{m_lock};
+      auto it = m_watchers.find(handle);
+      if (it == m_watchers.end()) {
+        r = -ENOENT;
+      } else {
+        if (m_on_notify != nullptr) {
+          ceph_assert(!m_pending_unregister.count(handle));
+          m_pending_unregister[handle] = on_finish;
+          on_finish = nullptr;
+        }
+        m_watchers.erase(it);
+      }
+    }
+
+    if (on_finish) {
+      ldout(m_cct, 20) << "QuiesceWatchers::" << __func__
+                       << ": completing unregister " << handle << dendl;
+      on_finish->complete(r);
+    }
+  }
+
+  void notify_quiesce(Context *on_finish) {
+    std::lock_guard locker{m_lock};
+    if (m_blocked) {
+      ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": queue" << dendl;
+      m_pending_notify.push_back(on_finish);
+      return;
+    }
+
+    notify(QUIESCE, on_finish);
+  }
+
+  void notify_unquiesce(Context *on_finish) {
+    std::lock_guard locker{m_lock};
+
+    notify(UNQUIESCE, on_finish);
+  }
+
+  void quiesce_complete(uint64_t handle, int r) {
+    Context *on_notify = nullptr;
+    {
+      std::lock_guard locker{m_lock};
+      ceph_assert(m_on_notify != nullptr);
+      ceph_assert(m_handle_quiesce_cnt > 0);
+
+      m_handle_quiesce_cnt--;
+
+      if (r < 0) {
+        ldout(m_cct, 10) << "QuiesceWatchers::" << __func__ << ": watcher "
+                         << handle << " failed" << dendl;
+        m_failed_watchers.insert(handle);
+        m_ret_val = r;
+      }
+
+      if (m_handle_quiesce_cnt > 0) {
+        return;
+      }
+
+      std::swap(on_notify, m_on_notify);
+      r = m_ret_val;
+    }
+
+    on_notify->complete(r);
+  }
+
+private:
+  enum EventType {QUIESCE, UNQUIESCE};
+
+  CephContext *m_cct;
+  asio::ContextWQ *m_work_queue;
+
+  ceph::mutex m_lock;
+  std::map<uint64_t, QuiesceWatchCtx*> m_watchers;
+  uint64_t m_next_handle = 0;
+  Context *m_on_notify = nullptr;
+  std::list<Context *> m_pending_notify;
+  std::map<uint64_t, Context*> m_pending_unregister;
+  uint64_t m_handle_quiesce_cnt = 0;
+  std::set<uint64_t> m_failed_watchers;
+  bool m_blocked = false;
+  int m_ret_val = 0;
+
+  void notify(EventType event_type, Context *on_finish) {
+    ceph_assert(ceph_mutex_is_locked(m_lock));
+
+    if (m_watchers.empty()) {
+      m_work_queue->queue(on_finish);
+      return;
+    }
+
+    ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << " event: "
+                     << event_type << dendl;
+
+    Context *ctx = nullptr;
+    if (event_type == QUIESCE) {
+      ceph_assert(!m_blocked);
+      ceph_assert(m_handle_quiesce_cnt == 0);
+
+      m_blocked = true;
+      m_handle_quiesce_cnt = m_watchers.size();
+      m_failed_watchers.clear();
+      m_ret_val = 0;
+    } else {
+      ceph_assert(event_type == UNQUIESCE);
+      ceph_assert(m_blocked);
+
+      ctx = create_async_context_callback(
+        m_work_queue, create_context_callback<
+          QuiesceWatchers, &QuiesceWatchers::handle_notify_unquiesce>(this));
+    }
+    auto gather_ctx = new C_Gather(m_cct, ctx);
+
+    ceph_assert(m_on_notify == nullptr);
+
+    m_on_notify = on_finish;
+
+    for (auto &[handle, watcher] : m_watchers) {
+      send_notify(handle, watcher, event_type, gather_ctx->new_sub());
+    }
+
+    gather_ctx->activate();
+  }
+
+  void send_notify(uint64_t handle, QuiesceWatchCtx *watcher,
+                   EventType event_type, Context *on_finish) {
+    auto ctx = new LambdaContext(
+      [this, handle, watcher, event_type, on_finish](int) {
+        ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": handle="
+                         << handle << ", event_type=" << event_type << dendl;
+        switch (event_type) {
+        case QUIESCE:
+          watcher->handle_quiesce();
+          break;
+        case UNQUIESCE:
+          {
+            std::lock_guard locker{m_lock};
+
+            if (m_failed_watchers.count(handle)) {
+              ldout(m_cct, 20) << "QuiesceWatchers::" << __func__
+                               << ": skip for failed watcher" << dendl;
+              break;
+            }
+          }
+          watcher->handle_unquiesce();
+          break;
+        default:
+          ceph_abort_msgf("invalid event_type %d", event_type);
+        }
+
+        on_finish->complete(0);
+      });
+
+    m_work_queue->queue(ctx);
+  }
+
+  void handle_notify_unquiesce(int r) {
+    ldout(m_cct, 20) << "QuiesceWatchers::" << __func__ << ": r=" << r
+                     << dendl;
+
+    ceph_assert(r == 0);
+
+    std::unique_lock locker{m_lock};
+
+    if (!m_pending_unregister.empty()) {
+      std::map<uint64_t, Context*> pending_unregister;
+      std::swap(pending_unregister, m_pending_unregister);
+      locker.unlock();
+      for (auto &it : pending_unregister) {
+        ldout(m_cct, 20) << "QuiesceWatchers::" << __func__
+                         << ": completing unregister " << it.first << dendl;
+        it.second->complete(0);
+      }
+      locker.lock();
+    }
+
+    Context *on_notify = nullptr;
+    std::swap(on_notify, m_on_notify);
+
+    ceph_assert(m_blocked);
+    m_blocked = false;
+
+    if (!m_pending_notify.empty()) {
+      auto on_finish = m_pending_notify.front();
+      m_pending_notify.pop_front();
+      notify(QUIESCE, on_finish);
+    }
+
+    locker.unlock();
+    on_notify->complete(0);
+  }
+};
+
+template <typename I>
+ImageState<I>::ImageState(I *image_ctx)
+  : m_image_ctx(image_ctx), m_state(STATE_UNINITIALIZED),
+    m_lock(ceph::make_mutex(util::unique_lock_name("librbd::ImageState::m_lock", this))),
+    m_last_refresh(0), m_refresh_seq(0),
+    m_update_watchers(new ImageUpdateWatchers(image_ctx->cct)),
+    m_quiesce_watchers(new QuiesceWatchers(
+      image_ctx->cct, image_ctx->asio_engine->get_work_queue())) {
+}
+
+template <typename I>
+ImageState<I>::~ImageState() {
+  ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED);
+  delete m_update_watchers;
+  delete m_quiesce_watchers;
+}
+
+template <typename I>
+int ImageState<I>::open(uint64_t flags) {
+  C_SaferCond ctx;
+  open(flags, &ctx);
+
+  int r = ctx.wait();
+  return r;
+}
+
+template <typename I>
+void ImageState<I>::open(uint64_t flags, Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.lock();
+  ceph_assert(m_state == STATE_UNINITIALIZED);
+  m_open_flags = flags;
+
+  Action action(ACTION_TYPE_OPEN);
+  action.refresh_seq = m_refresh_seq;
+
+  execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::close() {
+  C_SaferCond ctx;
+  close(&ctx);
+
+  int r = ctx.wait();
+  return r;
+}
+
+template <typename I>
+void ImageState<I>::close(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.lock();
+  ceph_assert(!is_closed());
+
+  Action action(ACTION_TYPE_CLOSE);
+  action.refresh_seq = m_refresh_seq;
+  execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+void ImageState<I>::handle_update_notification() {
+  std::lock_guard locker{m_lock};
+  ++m_refresh_seq;
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": refresh_seq = " << m_refresh_seq << ", "
+                 << "last_refresh = " << m_last_refresh << dendl;
+
+  switch (m_state) {
+  case STATE_UNINITIALIZED:
+  case STATE_CLOSED:
+  case STATE_OPENING:
+  case STATE_CLOSING:
+    ldout(cct, 5) << "dropping update notification to watchers" << dendl;
+    return;
+  default:
+    break;
+  }
+
+  m_update_watchers->notify();
+}
+
+template <typename I>
+bool ImageState<I>::is_refresh_required() const {
+  std::lock_guard locker{m_lock};
+  return (m_last_refresh != m_refresh_seq || find_pending_refresh() != nullptr);
+}
+
+template <typename I>
+int ImageState<I>::refresh() {
+  C_SaferCond refresh_ctx;
+  refresh(&refresh_ctx);
+  return refresh_ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::refresh(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_lock.lock();
+  if (is_closed()) {
+    m_lock.unlock();
+    on_finish->complete(-ESHUTDOWN);
+    return;
+  }
+
+  Action action(ACTION_TYPE_REFRESH);
+  action.refresh_seq = m_refresh_seq;
+  execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::refresh_if_required() {
+  C_SaferCond ctx;
+  {
+    m_lock.lock();
+    Action action(ACTION_TYPE_REFRESH);
+    action.refresh_seq = m_refresh_seq;
+
+    auto refresh_action = find_pending_refresh();
+    if (refresh_action != nullptr) {
+      // if a refresh is in-flight, delay until it is finished
+      action = *refresh_action;
+    } else if (m_last_refresh == m_refresh_seq) {
+      m_lock.unlock();
+      return 0;
+    } else if (is_closed()) {
+      m_lock.unlock();
+      return -ESHUTDOWN;
+    }
+
+    execute_action_unlock(action, &ctx);
+  }
+
+  return ctx.wait();
+}
+
+template <typename I>
+const typename ImageState<I>::Action *
+ImageState<I>::find_pending_refresh() const {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  auto it = std::find_if(m_actions_contexts.rbegin(),
+                         m_actions_contexts.rend(),
+                         [](const ActionContexts& action_contexts) {
+      return (action_contexts.first == ACTION_TYPE_REFRESH);
+    });
+  if (it != m_actions_contexts.rend()) {
+    return &it->first;
+  }
+  return nullptr;
+}
+
+template <typename I>
+void ImageState<I>::snap_set(uint64_t snap_id, Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": snap_id=" << snap_id << dendl;
+
+  Action action(ACTION_TYPE_SET_SNAP);
+  action.snap_id = snap_id;
+
+  m_lock.lock();
+  execute_action_unlock(action, on_finish);
+}
+
+template <typename I>
+void ImageState<I>::prepare_lock(Context *on_ready) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_lock.lock();
+  if (is_closed()) {
+    m_lock.unlock();
+    on_ready->complete(-ESHUTDOWN);
+    return;
+  }
+
+  Action action(ACTION_TYPE_LOCK);
+  action.on_ready = on_ready;
+  execute_action_unlock(action, nullptr);
+}
+
+template <typename I>
+void ImageState<I>::handle_prepare_lock_complete() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << dendl;
+
+  m_lock.lock();
+  if (m_state != STATE_PREPARING_LOCK) {
+    m_lock.unlock();
+    return;
+  }
+
+  complete_action_unlock(STATE_OPEN, 0);
+}
+
+template <typename I>
+int ImageState<I>::register_update_watcher(UpdateWatchCtx *watcher,
+                                           uint64_t *handle) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_update_watchers->register_watcher(watcher, handle);
+
+  ldout(cct, 20) << __func__ << ": handle=" << *handle << dendl;
+  return 0;
+}
+
+template <typename I>
+void ImageState<I>::unregister_update_watcher(uint64_t handle,
+                                              Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": handle=" << handle << dendl;
+
+  m_update_watchers->unregister_watcher(handle, on_finish);
+}
+
+template <typename I>
+int ImageState<I>::unregister_update_watcher(uint64_t handle) {
+  C_SaferCond ctx;
+  unregister_update_watcher(handle, &ctx);
+  return ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::flush_update_watchers(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_update_watchers->flush(on_finish);
+}
+
+template <typename I>
+void ImageState<I>::shut_down_update_watchers(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_update_watchers->shut_down(on_finish);
+}
+
+template <typename I>
+bool ImageState<I>::is_transition_state() const {
+  switch (m_state) {
+  case STATE_UNINITIALIZED:
+  case STATE_OPEN:
+  case STATE_CLOSED:
+    return false;
+  case STATE_OPENING:
+  case STATE_CLOSING:
+  case STATE_REFRESHING:
+  case STATE_SETTING_SNAP:
+  case STATE_PREPARING_LOCK:
+    break;
+  }
+  return true;
+}
+
+template <typename I>
+bool ImageState<I>::is_closed() const {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  return ((m_state == STATE_CLOSED) ||
+          (!m_actions_contexts.empty() &&
+           m_actions_contexts.back().first.action_type == ACTION_TYPE_CLOSE));
+}
+
+template <typename I>
+void ImageState<I>::append_context(const Action &action, Context *context) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  ActionContexts *action_contexts = nullptr;
+  for (auto &action_ctxs : m_actions_contexts) {
+    if (action == action_ctxs.first) {
+      action_contexts = &action_ctxs;
+      break;
+    }
+  }
+
+  if (action_contexts == nullptr) {
+    m_actions_contexts.push_back({action, {}});
+    action_contexts = &m_actions_contexts.back();
+  }
+
+  if (context != nullptr) {
+    action_contexts->second.push_back(context);
+  }
+}
+
+template <typename I>
+void ImageState<I>::execute_next_action_unlock() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(!m_actions_contexts.empty());
+  switch (m_actions_contexts.front().first.action_type) {
+  case ACTION_TYPE_OPEN:
+    send_open_unlock();
+    return;
case ACTION_TYPE_CLOSE: + send_close_unlock(); + return; + case ACTION_TYPE_REFRESH: + send_refresh_unlock(); + return; + case ACTION_TYPE_SET_SNAP: + send_set_snap_unlock(); + return; + case ACTION_TYPE_LOCK: + send_prepare_lock_unlock(); + return; + } + ceph_abort(); +} + +template +void ImageState::execute_action_unlock(const Action &action, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + append_context(action, on_finish); + if (!is_transition_state()) { + execute_next_action_unlock(); + } else { + m_lock.unlock(); + } +} + +template +void ImageState::complete_action_unlock(State next_state, int r) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(!m_actions_contexts.empty()); + + ActionContexts action_contexts(std::move(m_actions_contexts.front())); + m_actions_contexts.pop_front(); + + m_state = next_state; + m_lock.unlock(); + + if (next_state == STATE_CLOSED || + (next_state == STATE_UNINITIALIZED && r < 0)) { + // the ImageCtx must be deleted outside the scope of its callback threads + auto ctx = new LambdaContext( + [image_ctx=m_image_ctx, contexts=std::move(action_contexts.second)] + (int r) { + delete image_ctx; + for (auto ctx : contexts) { + ctx->complete(r); + } + }); + TaskFinisherSingleton::get_singleton(m_image_ctx->cct).queue(ctx, r); + } else { + for (auto ctx : action_contexts.second) { + if (next_state == STATE_OPEN) { + // we couldn't originally wrap the open callback w/ an async wrapper in + // case the image failed to open + ctx = create_async_context_callback(*m_image_ctx, ctx); + } + ctx->complete(r); + } + + m_lock.lock(); + if (!is_transition_state() && !m_actions_contexts.empty()) { + execute_next_action_unlock(); + } else { + m_lock.unlock(); + } + } +} + +template +void ImageState::send_open_unlock() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_state = STATE_OPENING; + + Context *ctx = create_context_callback< + ImageState, &ImageState::handle_open>(this); + image::OpenRequest *req = image::OpenRequest::create( + m_image_ctx, m_open_flags, ctx); + + m_lock.unlock(); + req->send(); +} + +template +void ImageState::handle_open(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + } + + m_lock.lock(); + complete_action_unlock(r < 0 ? 
STATE_UNINITIALIZED : STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_close_unlock() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_CLOSING;
+
+  Context *ctx = create_context_callback<
+    ImageState<I>, &ImageState<I>::handle_close>(this);
+  image::CloseRequest<I> *req = image::CloseRequest<I>::create(
+    m_image_ctx, ctx);
+
+  m_lock.unlock();
+  req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_close(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "error occurred while closing image: " << cpp_strerror(r)
+               << dendl;
+  }
+
+  m_lock.lock();
+  complete_action_unlock(STATE_CLOSED, r);
+}
+
+template <typename I>
+void ImageState<I>::send_refresh_unlock() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  m_state = STATE_REFRESHING;
+  ceph_assert(!m_actions_contexts.empty());
+  auto &action_context = m_actions_contexts.front().first;
+  ceph_assert(action_context.action_type == ACTION_TYPE_REFRESH);
+
+  Context *ctx = create_async_context_callback(
+    *m_image_ctx, create_context_callback<
+      ImageState<I>, &ImageState<I>::handle_refresh>(this));
+  image::RefreshRequest<I> *req = image::RefreshRequest<I>::create(
+    *m_image_ctx, false, false, ctx);
+
+  m_lock.unlock();
+  req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_refresh(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl;
+
+  m_lock.lock();
+  ceph_assert(!m_actions_contexts.empty());
+
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  ceph_assert(action_contexts.first.action_type == ACTION_TYPE_REFRESH);
+  ceph_assert(m_last_refresh <= action_contexts.first.refresh_seq);
+
+  if (r == -ERESTART) {
+    ldout(cct, 5) << "incomplete refresh: not updating sequence" << dendl;
+    r = 0;
+  } else {
+    m_last_refresh = action_contexts.first.refresh_seq;
+  }
+
+  complete_action_unlock(STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_set_snap_unlock() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  m_state = STATE_SETTING_SNAP;
+
+  ceph_assert(!m_actions_contexts.empty());
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  ceph_assert(action_contexts.first.action_type == ACTION_TYPE_SET_SNAP);
+
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << ": "
+                 << "snap_id=" << action_contexts.first.snap_id << dendl;
+
+  Context *ctx = create_async_context_callback(
+    *m_image_ctx, create_context_callback<
+      ImageState<I>, &ImageState<I>::handle_set_snap>(this));
+  image::SetSnapRequest<I> *req = image::SetSnapRequest<I>::create(
+    *m_image_ctx, action_contexts.first.snap_id, ctx);
+
+  m_lock.unlock();
+  req->send();
+}
+
+template <typename I>
+void ImageState<I>::handle_set_snap(int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << " r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to set snapshot: " << cpp_strerror(r) << dendl;
+  }
+
+  m_lock.lock();
+  complete_action_unlock(STATE_OPEN, r);
+}
+
+template <typename I>
+void ImageState<I>::send_prepare_lock_unlock() {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << this << " " << __func__ << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  m_state = STATE_PREPARING_LOCK;
+
+  ceph_assert(!m_actions_contexts.empty());
+  ActionContexts &action_contexts(m_actions_contexts.front());
+  ceph_assert(action_contexts.first.action_type == ACTION_TYPE_LOCK);
+
+  Context *on_ready = action_contexts.first.on_ready;
+  m_lock.unlock();
+
+  if (on_ready == nullptr) {
+    complete_action_unlock(STATE_OPEN, 0);
+    return;
+  }
+
+  // wake up the lock handler now that it's safe to proceed
+  on_ready->complete(0);
+}
+
+template <typename I>
+int ImageState<I>::register_quiesce_watcher(QuiesceWatchCtx *watcher,
+                                            uint64_t *handle) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_quiesce_watchers->register_watcher(watcher, handle);
+
+  ldout(cct, 20) << __func__ << ": handle=" << *handle << dendl;
+  return 0;
+}
+
+template <typename I>
+int ImageState<I>::unregister_quiesce_watcher(uint64_t handle) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": handle=" << handle << dendl;
+
+  C_SaferCond ctx;
+  m_quiesce_watchers->unregister_watcher(handle, &ctx);
+  return ctx.wait();
+}
+
+template <typename I>
+void ImageState<I>::notify_quiesce(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_quiesce_watchers->notify_quiesce(on_finish);
+}
+
+template <typename I>
+void ImageState<I>::notify_unquiesce(Context *on_finish) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << dendl;
+
+  m_quiesce_watchers->notify_unquiesce(on_finish);
+}
+
+template <typename I>
+void ImageState<I>::quiesce_complete(uint64_t handle, int r) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 20) << __func__ << ": handle=" << handle << " r=" << r << dendl;
+  m_quiesce_watchers->quiesce_complete(handle, r);
+}
+
+} // namespace librbd
+
+template class librbd::ImageState<librbd::ImageCtx>;
diff --git a/src/librbd/ImageState.h b/src/librbd/ImageState.h
new file mode 100644
index 000000000..5107c1a17
--- /dev/null
+++ b/src/librbd/ImageState.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_STATE_H
+#define CEPH_LIBRBD_IMAGE_STATE_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include <list>
+#include <string>
+#include <utility>
+#include "cls/rbd/cls_rbd_types.h"
+
+class Context;
+class RWLock;
+
+namespace librbd {
+
+class QuiesceWatchCtx;
+class QuiesceWatchers;
+class ImageCtx;
+class ImageUpdateWatchers;
+class UpdateWatchCtx;
+
+template <typename ImageCtxT = ImageCtx>
+class ImageState {
+public:
+  ImageState(ImageCtxT *image_ctx);
+  ~ImageState();
+
+  int open(uint64_t flags);
+  void open(uint64_t flags, Context *on_finish);
+
+  int close();
+  void close(Context *on_finish);
+
+  void handle_update_notification();
+
+  bool is_refresh_required() const;
+
+  int refresh();
+  int refresh_if_required();
+  void refresh(Context *on_finish);
+
+  void snap_set(uint64_t snap_id, Context *on_finish);
+
+  void prepare_lock(Context *on_ready);
+  void handle_prepare_lock_complete();
+
+  int register_update_watcher(UpdateWatchCtx *watcher, uint64_t *handle);
+  void unregister_update_watcher(uint64_t handle, Context *on_finish);
+  int unregister_update_watcher(uint64_t handle);
+  void flush_update_watchers(Context *on_finish);
+  void shut_down_update_watchers(Context *on_finish);
+
+  int register_quiesce_watcher(QuiesceWatchCtx *watcher, uint64_t *handle);
+  int unregister_quiesce_watcher(uint64_t handle);
+  void notify_quiesce(Context *on_finish);
+  void notify_unquiesce(Context *on_finish);
+  void quiesce_complete(uint64_t handle, int r);
+
+private:
+  enum State {
+    STATE_UNINITIALIZED,
+    STATE_OPEN,
+    STATE_CLOSED,
+    STATE_OPENING,
+    STATE_CLOSING,
+    STATE_REFRESHING,
+    STATE_SETTING_SNAP,
+    STATE_PREPARING_LOCK
+  };
+
+  enum ActionType {
+    ACTION_TYPE_OPEN,
+    ACTION_TYPE_CLOSE,
+    ACTION_TYPE_REFRESH,
+    ACTION_TYPE_SET_SNAP,
+    ACTION_TYPE_LOCK
+  };
+
+  struct Action {
+    ActionType action_type;
+    uint64_t refresh_seq = 0;
+    uint64_t snap_id = CEPH_NOSNAP;
+    Context *on_ready = nullptr;
+
+    Action(ActionType action_type) : action_type(action_type) {
+    }
+    inline bool operator==(const Action &action) const {
+      if (action_type != action.action_type) {
+        return false;
+      }
+      switch (action_type) {
+      case ACTION_TYPE_REFRESH:
+        return (refresh_seq == action.refresh_seq);
+      case ACTION_TYPE_SET_SNAP:
+        return (snap_id == action.snap_id);
+      case ACTION_TYPE_LOCK:
+        return false;
+      default:
+        return true;
+      }
+    }
+  };
+
+  typedef std::list<Context *> Contexts;
+  typedef std::pair<Action, Contexts> ActionContexts;
+  typedef std::list<ActionContexts> ActionsContexts;
+
+  ImageCtxT *m_image_ctx;
+  State m_state;
+
+  mutable ceph::mutex m_lock;
+  ActionsContexts m_actions_contexts;
+
+  uint64_t m_last_refresh;
+  uint64_t m_refresh_seq;
+
+  ImageUpdateWatchers *m_update_watchers;
+  QuiesceWatchers *m_quiesce_watchers;
+
+  uint64_t m_open_flags;
+
+  bool is_transition_state() const;
+  bool is_closed() const;
+
+  const Action *find_pending_refresh() const;
+
+  void append_context(const Action &action, Context *context);
+  void execute_next_action_unlock();
+  void execute_action_unlock(const Action &action, Context *context);
+  void complete_action_unlock(State next_state, int r);
+
+  void send_open_unlock();
+  void handle_open(int r);
+
+  void send_close_unlock();
+  void handle_close(int r);
+
+  void send_refresh_unlock();
+  void handle_refresh(int r);
+
+  void send_set_snap_unlock();
+  void handle_set_snap(int r);
+
+  void send_prepare_lock_unlock();
+
+};
+
+} // namespace librbd
+
+extern template class librbd::ImageState<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_STATE_H
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
new file mode 100644
index 000000000..fbb4c8339
--- /dev/null
+++ b/src/librbd/ImageWatcher.cc
@@ -0,0 +1,1556 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/ImageWatcher.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/TaskFinisher.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/image_watcher/NotifyLockOwner.h"
+#include "librbd/io/AioCompletion.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include <boost/bind/bind.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::ImageWatcher: "
+
+namespace librbd {
+
+using namespace image_watcher;
+using namespace watch_notify;
+using util::create_async_context_callback;
+using util::create_context_callback;
+using util::create_rados_callback;
+
+using ceph::encode;
+using ceph::decode;
+
+using namespace boost::placeholders;
+
+static const double RETRY_DELAY_SECONDS = 1.0;
+
+template <typename I>
+struct ImageWatcher<I>::C_ProcessPayload : public Context {
+  ImageWatcher *image_watcher;
+  uint64_t notify_id;
+  uint64_t handle;
+  std::unique_ptr<Payload> payload;
+
+  C_ProcessPayload(ImageWatcher *image_watcher, uint64_t notify_id,
+                   uint64_t handle,
+                   std::unique_ptr<Payload> &&payload)
+    : image_watcher(image_watcher), notify_id(notify_id), handle(handle),
+      payload(std::move(payload)) {
+  }
+
+  void finish(int r) override {
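+    // Annotation (inferred from the calls below, not in the original patch):
+    // the async-op tracker pins the watcher for the duration of the dispatch;
+    // while notifications are blocked the payload is acked without processing.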
image_watcher->m_async_op_tracker.start_op(); + if (image_watcher->notifications_blocked()) { + // requests are blocked -- just ack the notification + bufferlist bl; + image_watcher->acknowledge_notify(notify_id, handle, bl); + } else { + image_watcher->process_payload(notify_id, handle, payload.get()); + } + image_watcher->m_async_op_tracker.finish_op(); + } +}; + +template +ImageWatcher::ImageWatcher(I &image_ctx) + : Watcher(image_ctx.md_ctx, image_ctx.op_work_queue, image_ctx.header_oid), + m_image_ctx(image_ctx), + m_task_finisher(new TaskFinisher(*m_image_ctx.cct)), + m_async_request_lock(ceph::make_shared_mutex( + util::unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this))), + m_owner_client_id_lock(ceph::make_mutex( + util::unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))) +{ +} + +template +ImageWatcher::~ImageWatcher() +{ + delete m_task_finisher; +} + +template +void ImageWatcher::unregister_watch(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " unregistering image watcher" << dendl; + + cancel_async_requests(); + + // flush the task finisher queue before completing + on_finish = create_async_context_callback(m_task_finisher, on_finish); + + on_finish = new LambdaContext([this, on_finish](int r) { + cancel_quiesce_requests(); + m_task_finisher->cancel_all(); + m_async_op_tracker.wait_for_ops(on_finish); + }); + Watcher::unregister_watch(on_finish); +} + +template +void ImageWatcher::block_notifies(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + on_finish = new LambdaContext([this, on_finish](int r) { + cancel_async_requests(); + on_finish->complete(r); + }); + Watcher::block_notifies(on_finish); +} + +template +void ImageWatcher::schedule_async_progress(const AsyncRequestId &request, + uint64_t offset, uint64_t total) { + auto ctx = new LambdaContext([this, request, offset, total](int r) { + if (r != -ECANCELED) { + notify_async_progress(request, offset, total); + } + }); + m_task_finisher->queue(Task(TASK_CODE_ASYNC_PROGRESS, request), ctx); +} + +template +int ImageWatcher::notify_async_progress(const AsyncRequestId &request, + uint64_t offset, uint64_t total) { + ldout(m_image_ctx.cct, 20) << this << " remote async request progress: " + << request << " @ " << offset + << "/" << total << dendl; + + send_notify(new AsyncProgressPayload(request, offset, total)); + return 0; +} + +template +void ImageWatcher::schedule_async_complete(const AsyncRequestId &request, + int r) { + m_async_op_tracker.start_op(); + auto ctx = new LambdaContext([this, request, ret_val=r](int r) { + if (r != -ECANCELED) { + notify_async_complete(request, ret_val); + } + }); + m_task_finisher->queue(ctx); +} + +template +void ImageWatcher::notify_async_complete(const AsyncRequestId &request, + int r) { + ldout(m_image_ctx.cct, 20) << this << " remote async request finished: " + << request << "=" << r << dendl; + + send_notify(new AsyncCompletePayload(request, r), + new LambdaContext(boost::bind(&ImageWatcher::handle_async_complete, + this, request, r, _1))); +} + +template +void ImageWatcher::handle_async_complete(const AsyncRequestId &request, + int r, int ret_val) { + ldout(m_image_ctx.cct, 20) << this << " " << __func__ << ": " + << "request=" << request << ", r=" << ret_val + << dendl; + if (ret_val < 0) { + lderr(m_image_ctx.cct) << this << " failed to notify async complete: " + << cpp_strerror(ret_val) << dendl; + if (ret_val == -ETIMEDOUT && 
!is_unregistered()) { + schedule_async_complete(request, r); + m_async_op_tracker.finish_op(); + return; + } + } + + std::unique_lock async_request_locker{m_async_request_lock}; + mark_async_request_complete(request, r); + m_async_op_tracker.finish_op(); +} + +template +void ImageWatcher::notify_flatten(uint64_t request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, new FlattenPayload(async_request_id), + prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_resize(uint64_t request_id, uint64_t size, + bool allow_shrink, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + new ResizePayload(async_request_id, size, allow_shrink), + prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_snap_create(uint64_t request_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t flags, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + new SnapCreatePayload(async_request_id, snap_namespace, + snap_name, flags), + prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_snap_rename(uint64_t request_id, + const snapid_t &src_snap_id, + const std::string &dst_snap_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request( + async_request_id, + new SnapRenamePayload(async_request_id, src_snap_id, dst_snap_name), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_snap_remove( + uint64_t request_id, const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request( + async_request_id, + new SnapRemovePayload(async_request_id, snap_namespace, snap_name), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_snap_protect( + uint64_t request_id, const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request( + async_request_id, + new SnapProtectPayload(async_request_id, snap_namespace, snap_name), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_snap_unprotect( + uint64_t request_id, const 
cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request( + async_request_id, + new SnapUnprotectPayload(async_request_id, snap_namespace, snap_name), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_rebuild_object_map(uint64_t request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + new RebuildObjectMapPayload(async_request_id), + prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_rename(uint64_t request_id, + const std::string &image_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + new RenamePayload(async_request_id, image_name), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_update_features(uint64_t request_id, + uint64_t features, bool enabled, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + new UpdateFeaturesPayload(async_request_id, features, enabled), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_migrate(uint64_t request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, new MigratePayload(async_request_id), + prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_sparsify(uint64_t request_id, size_t sparse_size, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request(async_request_id, + new SparsifyPayload(async_request_id, sparse_size), + prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_header_update(Context *on_finish) { + ldout(m_image_ctx.cct, 10) << this << ": " << __func__ << dendl; + + // supports legacy (empty buffer) clients + send_notify(new HeaderUpdatePayload(), on_finish); +} + +template +void ImageWatcher::notify_header_update(librados::IoCtx &io_ctx, + const std::string &oid) { + // supports legacy (empty buffer) clients + bufferlist bl; + encode(NotifyMessage(new HeaderUpdatePayload()), bl); + io_ctx.notify2(oid, bl, watcher::Notifier::NOTIFY_TIMEOUT, nullptr); +} + +template +void ImageWatcher::notify_quiesce(uint64_t *request_id, + ProgressContext &prog_ctx, + Context *on_finish) { + *request_id = 
util::reserve_async_request_id(); + + ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": request_id=" + << request_id << dendl; + + AsyncRequestId async_request_id(get_client_id(), *request_id); + + auto attempts = m_image_ctx.config.template get_val( + "rbd_quiesce_notification_attempts"); + + notify_quiesce(async_request_id, attempts, prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_quiesce(const AsyncRequestId &async_request_id, + size_t attempts, ProgressContext &prog_ctx, + Context *on_finish) { + ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": async_request_id=" + << async_request_id << " attempts=" << attempts + << dendl; + + ceph_assert(attempts > 0); + auto notify_response = new watcher::NotifyResponse(); + auto on_notify = new LambdaContext( + [notify_response=std::unique_ptr(notify_response), + this, async_request_id, &prog_ctx, on_finish, attempts=attempts-1](int r) { + auto total_attempts = m_image_ctx.config.template get_val( + "rbd_quiesce_notification_attempts"); + if (total_attempts < attempts) { + total_attempts = attempts; + } + prog_ctx.update_progress(total_attempts - attempts, total_attempts); + + if (r == -ETIMEDOUT) { + ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": async_request_id=" + << async_request_id << " timed out" << dendl; + if (attempts > 0) { + notify_quiesce(async_request_id, attempts, prog_ctx, on_finish); + return; + } + } else if (r == 0) { + for (auto &[client_id, bl] : notify_response->acks) { + if (bl.length() == 0) { + continue; + } + try { + auto iter = bl.cbegin(); + + ResponseMessage response_message; + using ceph::decode; + decode(response_message, iter); + + if (response_message.result != -EOPNOTSUPP) { + r = response_message.result; + } + } catch (const buffer::error &err) { + r = -EINVAL; + } + if (r < 0) { + break; + } + } + } + if (r < 0) { + lderr(m_image_ctx.cct) << this << " failed to notify quiesce: " + << cpp_strerror(r) << dendl; + } + on_finish->complete(r); + }); + + bufferlist bl; + encode(NotifyMessage(new QuiescePayload(async_request_id)), bl); + Watcher::send_notify(bl, notify_response, on_notify); +} + +template +void ImageWatcher::notify_unquiesce(uint64_t request_id, Context *on_finish) { + ldout(m_image_ctx.cct, 10) << this << " " << __func__ << ": request_id=" + << request_id << dendl; + + AsyncRequestId async_request_id(get_client_id(), request_id); + + send_notify(new UnquiescePayload(async_request_id), on_finish); +} + +template +void ImageWatcher::notify_metadata_set(uint64_t request_id, + const std::string &key, + const std::string &value, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request( + async_request_id, + new MetadataUpdatePayload(async_request_id, key, + std::optional{value}), + m_no_op_prog_ctx, on_finish); +} + +template +void ImageWatcher::notify_metadata_remove(uint64_t request_id, + const std::string &key, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock && + !m_image_ctx.exclusive_lock->is_lock_owner()); + + AsyncRequestId async_request_id(get_client_id(), request_id); + + notify_async_request( + async_request_id, + new MetadataUpdatePayload(async_request_id, key, std::nullopt), + m_no_op_prog_ctx, on_finish); +} + +template +void 
ImageWatcher::schedule_cancel_async_requests() { + auto ctx = new LambdaContext([this](int r) { + if (r != -ECANCELED) { + cancel_async_requests(); + } + }); + m_task_finisher->queue(TASK_CODE_CANCEL_ASYNC_REQUESTS, ctx); +} + +template +void ImageWatcher::cancel_async_requests() { + std::unique_lock l{m_async_request_lock}; + for (auto iter = m_async_requests.begin(); iter != m_async_requests.end(); ) { + if (iter->second.second == nullptr) { + // Quiesce notify request. Skip. + iter++; + } else { + iter->second.first->complete(-ERESTART); + iter = m_async_requests.erase(iter); + } + } +} + +template +void ImageWatcher::set_owner_client_id(const ClientId& client_id) { + ceph_assert(ceph_mutex_is_locked(m_owner_client_id_lock)); + m_owner_client_id = client_id; + ldout(m_image_ctx.cct, 10) << this << " current lock owner: " + << m_owner_client_id << dendl; +} + +template +ClientId ImageWatcher::get_client_id() { + std::shared_lock l{this->m_watch_lock}; + return ClientId(m_image_ctx.md_ctx.get_instance_id(), this->m_watch_handle); +} + +template +void ImageWatcher::notify_acquired_lock() { + ldout(m_image_ctx.cct, 10) << this << " notify acquired lock" << dendl; + + ClientId client_id = get_client_id(); + { + std::lock_guard owner_client_id_locker{m_owner_client_id_lock}; + set_owner_client_id(client_id); + } + + send_notify(new AcquiredLockPayload(client_id)); +} + +template +void ImageWatcher::notify_released_lock() { + ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl; + + { + std::lock_guard owner_client_id_locker{m_owner_client_id_lock}; + set_owner_client_id(ClientId()); + } + + send_notify(new ReleasedLockPayload(get_client_id())); +} + +template +void ImageWatcher::schedule_request_lock(bool use_timer, int timer_delay) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + // see notify_request_lock() + if (m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()) { + return; + } + + if (is_registered()) { + ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl; + + auto ctx = new LambdaContext([this](int r) { + if (r != -ECANCELED) { + notify_request_lock(); + } + }); + + if (use_timer) { + if (timer_delay < 0) { + timer_delay = RETRY_DELAY_SECONDS; + } + m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK, + timer_delay, ctx); + } else { + m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx); + } + } +} + +template +void ImageWatcher::notify_request_lock() { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::shared_lock image_locker{m_image_ctx.image_lock}; + + // ExclusiveLock state machine can be dynamically disabled or + // race with task cancel + if (m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()) { + return; + } + + ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl; + + notify_lock_owner(new RequestLockPayload(get_client_id(), false), + create_context_callback< + ImageWatcher, &ImageWatcher::handle_request_lock>(this)); +} + +template +void ImageWatcher::handle_request_lock(int r) { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::shared_lock image_locker{m_image_ctx.image_lock}; + + // ExclusiveLock state machine cannot transition -- but can be + // dynamically disabled + if (m_image_ctx.exclusive_lock == nullptr) { + return; + } + + if (r == -ETIMEDOUT) { + ldout(m_image_ctx.cct, 5) << this << " timed out requesting lock: retrying" + << dendl; + + // treat this is a dead client -- so retest acquiring 
the lock + m_image_ctx.exclusive_lock->handle_peer_notification(0); + } else if (r == -EROFS) { + ldout(m_image_ctx.cct, 5) << this << " peer will not release lock" << dendl; + m_image_ctx.exclusive_lock->handle_peer_notification(r); + } else if (r < 0) { + lderr(m_image_ctx.cct) << this << " error requesting lock: " + << cpp_strerror(r) << dendl; + schedule_request_lock(true); + } else { + // lock owner acked -- but resend if we don't see them release the lock + int retry_timeout = m_image_ctx.cct->_conf.template get_val( + "client_notify_timeout"); + ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout + << " seconds" << dendl; + schedule_request_lock(true, retry_timeout); + } +} + +template +void ImageWatcher::notify_lock_owner(Payload *payload, Context *on_finish) { + ceph_assert(on_finish != nullptr); + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + bufferlist bl; + encode(NotifyMessage(payload), bl); + + NotifyLockOwner *notify_lock_owner = NotifyLockOwner::create( + m_image_ctx, this->m_notifier, std::move(bl), on_finish); + notify_lock_owner->send(); +} + +template +bool ImageWatcher::is_new_request(const AsyncRequestId &id) const { + ceph_assert(ceph_mutex_is_locked(m_async_request_lock)); + + return m_async_pending.count(id) == 0 && m_async_complete.count(id) == 0; +} + +template +bool ImageWatcher::mark_async_request_complete(const AsyncRequestId &id, + int r) { + ceph_assert(ceph_mutex_is_locked(m_async_request_lock)); + + bool found = m_async_pending.erase(id); + + auto now = ceph_clock_now(); + + auto it = m_async_complete_expiration.begin(); + while (it != m_async_complete_expiration.end() && it->first < now) { + m_async_complete.erase(it->second); + it = m_async_complete_expiration.erase(it); + } + + if (!m_async_complete.insert({id, r}).second) { + for (it = m_async_complete_expiration.begin(); + it != m_async_complete_expiration.end(); it++) { + if (it->second == id) { + m_async_complete_expiration.erase(it); + break; + } + } + } + auto expiration_time = now; + expiration_time += 600; + m_async_complete_expiration.insert({expiration_time, id}); + + return found; +} + +template +Context *ImageWatcher::remove_async_request(const AsyncRequestId &id) { + std::unique_lock async_request_locker{m_async_request_lock}; + + return remove_async_request(id, m_async_request_lock); +} + +template +Context *ImageWatcher::remove_async_request(const AsyncRequestId &id, + ceph::shared_mutex &lock) { + ceph_assert(ceph_mutex_is_locked(lock)); + + ldout(m_image_ctx.cct, 20) << __func__ << ": " << id << dendl; + + auto it = m_async_requests.find(id); + if (it != m_async_requests.end()) { + Context *on_complete = it->second.first; + m_async_requests.erase(it); + return on_complete; + } + return nullptr; +} + +template +void ImageWatcher::schedule_async_request_timed_out(const AsyncRequestId &id) { + ldout(m_image_ctx.cct, 20) << "scheduling async request time out: " << id + << dendl; + + auto ctx = new LambdaContext([this, id](int r) { + if (r != -ECANCELED) { + async_request_timed_out(id); + } + }); + + Task task(TASK_CODE_ASYNC_REQUEST, id); + m_task_finisher->cancel(task); + + m_task_finisher->add_event_after( + task, m_image_ctx.config.template get_val("rbd_request_timed_out_seconds"), + ctx); +} + +template +void ImageWatcher::async_request_timed_out(const AsyncRequestId &id) { + Context *on_complete = remove_async_request(id); + if (on_complete != nullptr) { + ldout(m_image_ctx.cct, 5) << "async request timed out: " << id << dendl; + 
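+    // Annotation (an inference, not in the original patch): completing via
+    // the work queue with -ETIMEDOUT presumably keeps the caller's callback
+    // off the timer/finisher thread that fired this timeout.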
m_image_ctx.op_work_queue->queue(on_complete, -ETIMEDOUT); + } +} + +template +void ImageWatcher::notify_async_request( + const AsyncRequestId &async_request_id, Payload *payload, + ProgressContext& prog_ctx, Context *on_finish) { + ceph_assert(on_finish != nullptr); + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + ldout(m_image_ctx.cct, 10) << this << " async request: " << async_request_id + << dendl; + + Context *on_notify = new LambdaContext([this, async_request_id](int r) { + if (r < 0) { + // notification failed -- don't expect updates + Context *on_complete = remove_async_request(async_request_id); + if (on_complete != nullptr) { + on_complete->complete(r); + } + } + }); + + Context *on_complete = new LambdaContext( + [this, async_request_id, on_finish](int r) { + m_task_finisher->cancel(Task(TASK_CODE_ASYNC_REQUEST, async_request_id)); + on_finish->complete(r); + }); + + { + std::unique_lock async_request_locker{m_async_request_lock}; + m_async_requests[async_request_id] = AsyncRequest(on_complete, &prog_ctx); + } + + schedule_async_request_timed_out(async_request_id); + notify_lock_owner(payload, on_notify); +} + +template +int ImageWatcher::prepare_async_request(const AsyncRequestId& async_request_id, + bool* new_request, Context** ctx, + ProgressContext** prog_ctx) { + if (async_request_id.client_id == get_client_id()) { + return -ERESTART; + } else { + std::unique_lock l{m_async_request_lock}; + if (is_new_request(async_request_id)) { + m_async_pending.insert(async_request_id); + *new_request = true; + *prog_ctx = new RemoteProgressContext(*this, async_request_id); + *ctx = new RemoteContext(*this, async_request_id, *prog_ctx); + } else { + *new_request = false; + auto it = m_async_complete.find(async_request_id); + if (it != m_async_complete.end()) { + int r = it->second; + // reset complete request expiration time + mark_async_request_complete(async_request_id, r); + return r; + } + } + } + return 0; +} + +template +Context *ImageWatcher::prepare_quiesce_request( + const AsyncRequestId &request, C_NotifyAck *ack_ctx) { + std::unique_lock locker{m_async_request_lock}; + + auto timeout = 2 * watcher::Notifier::NOTIFY_TIMEOUT / 1000; + + if (!is_new_request(request)) { + auto it = m_async_requests.find(request); + if (it != m_async_requests.end()) { + delete it->second.first; + it->second.first = ack_ctx; + } else { + auto it = m_async_complete.find(request); + ceph_assert(it != m_async_complete.end()); + m_task_finisher->queue(new C_ResponseMessage(ack_ctx), it->second); + // reset complete request expiration time + mark_async_request_complete(request, it->second); + } + locker.unlock(); + + m_task_finisher->reschedule_event_after(Task(TASK_CODE_QUIESCE, request), + timeout); + return nullptr; + } + + m_async_pending.insert(request); + m_async_requests[request] = AsyncRequest(ack_ctx, nullptr); + m_async_op_tracker.start_op(); + + return new LambdaContext( + [this, request, timeout](int r) { + auto unquiesce_ctx = new LambdaContext( + [this, request](int r) { + if (r == 0) { + ldout(m_image_ctx.cct, 10) << this << " quiesce request " + << request << " timed out" << dendl; + } + + auto on_finish = new LambdaContext( + [this](int r) { + m_async_op_tracker.finish_op(); + }); + + m_image_ctx.state->notify_unquiesce(on_finish); + }); + + m_task_finisher->add_event_after(Task(TASK_CODE_QUIESCE, request), + timeout, unquiesce_ctx); + + std::unique_lock async_request_locker{m_async_request_lock}; + mark_async_request_complete(request, r); + auto ctx = 
remove_async_request(request, m_async_request_lock); + async_request_locker.unlock(); + if (ctx != nullptr) { + ctx = new C_ResponseMessage(static_cast(ctx)); + ctx->complete(r); + } else { + m_task_finisher->cancel(Task(TASK_CODE_QUIESCE, request)); + } + }); +} + +template +void ImageWatcher::prepare_unquiesce_request(const AsyncRequestId &request) { + { + std::unique_lock async_request_locker{m_async_request_lock}; + auto it = m_async_complete.find(request); + if (it == m_async_complete.end()) { + ldout(m_image_ctx.cct, 20) << this << " " << request + << ": not found in complete" << dendl; + return; + } + // reset complete request expiration time + mark_async_request_complete(request, it->second); + } + + bool canceled = m_task_finisher->cancel(Task(TASK_CODE_QUIESCE, request)); + if (!canceled) { + ldout(m_image_ctx.cct, 20) << this << " " << request + << ": timer task not found" << dendl; + } +} + +template +void ImageWatcher::cancel_quiesce_requests() { + std::unique_lock l{m_async_request_lock}; + for (auto it = m_async_requests.begin(); it != m_async_requests.end(); ) { + if (it->second.second == nullptr) { + // Quiesce notify request. + mark_async_request_complete(it->first, 0); + delete it->second.first; + it = m_async_requests.erase(it); + } else { + it++; + } + } +} + +template +bool ImageWatcher::handle_operation_request( + const AsyncRequestId& async_request_id, + exclusive_lock::OperationRequestType request_type, Operation operation, + std::function execute, + C_NotifyAck *ack_ctx) { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + + if (m_image_ctx.exclusive_lock != nullptr) { + int r = 0; + if (m_image_ctx.exclusive_lock->accept_request(request_type, &r)) { + bool new_request; + Context *ctx; + ProgressContext *prog_ctx; + bool complete; + if (async_request_id) { + r = prepare_async_request(async_request_id, &new_request, &ctx, + &prog_ctx); + encode(ResponseMessage(r), ack_ctx->out); + complete = true; + } else { + new_request = true; + ctx = new C_ResponseMessage(ack_ctx); + prog_ctx = &m_no_op_prog_ctx; + complete = false; + } + if (r == 0 && new_request) { + ctx = new LambdaContext( + [this, operation, ctx](int r) { + m_image_ctx.operations->finish_op(operation, r); + ctx->complete(r); + }); + ctx = new LambdaContext( + [this, execute, prog_ctx, ctx](int r) { + if (r < 0) { + ctx->complete(r); + return; + } + std::shared_lock l{m_image_ctx.owner_lock}; + execute(*prog_ctx, ctx); + }); + m_image_ctx.operations->start_op(operation, ctx); + } + return complete; + } else if (r < 0) { + encode(ResponseMessage(r), ack_ctx->out); + } + } + return true; +} + +template +bool ImageWatcher::handle_payload(const HeaderUpdatePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl; + + m_image_ctx.state->handle_update_notification(); + m_image_ctx.perfcounter->inc(l_librbd_notify); + if (ack_ctx != nullptr) { + m_image_ctx.state->flush_update_watchers(new C_ResponseMessage(ack_ctx)); + return false; + } + return true; +} + +template +bool ImageWatcher::handle_payload(const AcquiredLockPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement" + << dendl; + + bool cancel_async_requests = true; + if (payload.client_id.is_valid()) { + std::lock_guard owner_client_id_locker{m_owner_client_id_lock}; + if (payload.client_id == m_owner_client_id) { + cancel_async_requests = false; + } + set_owner_client_id(payload.client_id); + } + + std::shared_lock 
owner_locker{m_image_ctx.owner_lock}; + if (m_image_ctx.exclusive_lock != nullptr) { + // potentially wake up the exclusive lock state machine now that + // a lock owner has advertised itself + m_image_ctx.exclusive_lock->handle_peer_notification(0); + } + if (cancel_async_requests && + (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner())) { + schedule_cancel_async_requests(); + } + return true; +} + +template +bool ImageWatcher::handle_payload(const ReleasedLockPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl; + + bool cancel_async_requests = true; + if (payload.client_id.is_valid()) { + std::lock_guard l{m_owner_client_id_lock}; + if (payload.client_id != m_owner_client_id) { + ldout(m_image_ctx.cct, 10) << this << " unexpected owner: " + << payload.client_id << " != " + << m_owner_client_id << dendl; + cancel_async_requests = false; + } else { + set_owner_client_id(ClientId()); + } + } + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + if (cancel_async_requests && + (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner())) { + schedule_cancel_async_requests(); + } + + // alert the exclusive lock state machine that the lock is available + if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.exclusive_lock->is_lock_owner()) { + m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK); + m_image_ctx.exclusive_lock->handle_peer_notification(0); + } + return true; +} + +template +bool ImageWatcher::handle_payload(const RequestLockPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl; + if (payload.client_id == get_client_id()) { + return true; + } + + std::shared_lock l{m_image_ctx.owner_lock}; + if (m_image_ctx.exclusive_lock != nullptr && + m_image_ctx.exclusive_lock->is_lock_owner()) { + int r = 0; + bool accept_request = m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r); + + if (accept_request) { + ceph_assert(r == 0); + std::lock_guard owner_client_id_locker{m_owner_client_id_lock}; + if (!m_owner_client_id.is_valid()) { + return true; + } + + ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock" + << dendl; + r = m_image_ctx.get_exclusive_lock_policy()->lock_requested( + payload.force); + } + encode(ResponseMessage(r), ack_ctx->out); + } + return true; +} + +template +bool ImageWatcher::handle_payload(const AsyncProgressPayload &payload, + C_NotifyAck *ack_ctx) { + std::shared_lock l{m_async_request_lock}; + std::map::iterator req_it = + m_async_requests.find(payload.async_request_id); + if (req_it != m_async_requests.end()) { + ldout(m_image_ctx.cct, 20) << this << " request progress: " + << payload.async_request_id << " @ " + << payload.offset << "/" << payload.total + << dendl; + schedule_async_request_timed_out(payload.async_request_id); + req_it->second.second->update_progress(payload.offset, payload.total); + } + return true; +} + +template +bool ImageWatcher::handle_payload(const AsyncCompletePayload &payload, + C_NotifyAck *ack_ctx) { + Context *on_complete = remove_async_request(payload.async_request_id); + if (on_complete != nullptr) { + ldout(m_image_ctx.cct, 10) << this << " request finished: " + << payload.async_request_id << "=" + << payload.result << dendl; + on_complete->complete(payload.result); + } + return true; +} + +template +bool ImageWatcher::handle_payload(const FlattenPayload 
&payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote flatten request: " + << payload.async_request_id << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_FLATTEN, std::bind(&Operations::execute_flatten, + m_image_ctx.operations, + std::placeholders::_1, + std::placeholders::_2), + ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const ResizePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote resize request: " + << payload.async_request_id << " " + << payload.size << " " + << payload.allow_shrink << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_RESIZE, std::bind(&Operations::execute_resize, + m_image_ctx.operations, payload.size, + payload.allow_shrink, std::placeholders::_1, + std::placeholders::_2, 0), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const SnapCreatePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: " + << payload.async_request_id << " " + << payload.snap_namespace << " " + << payload.snap_name << " " + << payload.flags << dendl; + + auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL; + + // rbd-mirror needs to accept forced promotion orphan snap create requests + auto mirror_ns = std::get_if( + &payload.snap_namespace); + if (mirror_ns != nullptr && mirror_ns->is_orphan()) { + request_type = exclusive_lock::OPERATION_REQUEST_TYPE_FORCE_PROMOTION; + } + + return handle_operation_request( + payload.async_request_id, request_type, + OPERATION_SNAP_CREATE, std::bind(&Operations::execute_snap_create, + m_image_ctx.operations, + payload.snap_namespace, + payload.snap_name, std::placeholders::_2, + 0, payload.flags, std::placeholders::_1), + ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const SnapRenamePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_rename request: " + << payload.async_request_id << " " + << payload.snap_id << " to " + << payload.snap_name << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_SNAP_RENAME, std::bind(&Operations::execute_snap_rename, + m_image_ctx.operations, payload.snap_id, + payload.snap_name, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const SnapRemovePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_remove request: " + << payload.snap_name << dendl; + + auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL; + if (cls::rbd::get_snap_namespace_type(payload.snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) { + request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE; + } + + return handle_operation_request( + payload.async_request_id, request_type, OPERATION_SNAP_REMOVE, + std::bind(&Operations::execute_snap_remove, m_image_ctx.operations, + payload.snap_namespace, payload.snap_name, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const SnapProtectPayload& payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_protect request: " + << payload.async_request_id << " " + << payload.snap_name << dendl; + + return handle_operation_request( + payload.async_request_id, 
exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_SNAP_PROTECT, std::bind(&Operations::execute_snap_protect, + m_image_ctx.operations, + payload.snap_namespace, + payload.snap_name, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const SnapUnprotectPayload& payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote snap_unprotect request: " + << payload.async_request_id << " " + << payload.snap_name << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_SNAP_UNPROTECT, std::bind(&Operations::execute_snap_unprotect, + m_image_ctx.operations, + payload.snap_namespace, + payload.snap_name, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const RebuildObjectMapPayload& payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote rebuild object map request: " + << payload.async_request_id << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_REBUILD_OBJECT_MAP, + std::bind(&Operations::execute_rebuild_object_map, + m_image_ctx.operations, std::placeholders::_1, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const RenamePayload& payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote rename request: " + << payload.async_request_id << " " + << payload.image_name << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_RENAME, std::bind(&Operations::execute_rename, + m_image_ctx.operations, payload.image_name, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const UpdateFeaturesPayload& payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote update_features request: " + << payload.async_request_id << " " + << payload.features << " " + << (payload.enabled ? 
"enabled" : "disabled") + << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_UPDATE_FEATURES, + std::bind(&Operations::execute_update_features, m_image_ctx.operations, + payload.features, payload.enabled, std::placeholders::_2, 0), + ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const MigratePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote migrate request: " + << payload.async_request_id << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_MIGRATE, std::bind(&Operations::execute_migrate, + m_image_ctx.operations, + std::placeholders::_1, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const SparsifyPayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " remote sparsify request: " + << payload.async_request_id << dendl; + + return handle_operation_request( + payload.async_request_id, exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_SPARSIFY, std::bind(&Operations::execute_sparsify, + m_image_ctx.operations, + payload.sparse_size, std::placeholders::_1, + std::placeholders::_2), ack_ctx); +} + +template +bool ImageWatcher::handle_payload(const MetadataUpdatePayload &payload, + C_NotifyAck *ack_ctx) { + if (payload.value) { + ldout(m_image_ctx.cct, 10) << this << " remote metadata_set request: " + << payload.async_request_id << " " + << "key=" << payload.key << ", value=" + << *payload.value << dendl; + + return handle_operation_request( + payload.async_request_id, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_METADATA_UPDATE, + std::bind(&Operations::execute_metadata_set, + m_image_ctx.operations, payload.key, *payload.value, + std::placeholders::_2), + ack_ctx); + } else { + ldout(m_image_ctx.cct, 10) << this << " remote metadata_remove request: " + << payload.async_request_id << " " + << "key=" << payload.key << dendl; + + return handle_operation_request( + payload.async_request_id, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + OPERATION_METADATA_UPDATE, + std::bind(&Operations::execute_metadata_remove, + m_image_ctx.operations, payload.key, std::placeholders::_2), + ack_ctx); + } +} + +template +bool ImageWatcher::handle_payload(const QuiescePayload &payload, + C_NotifyAck *ack_ctx) { + auto on_finish = prepare_quiesce_request(payload.async_request_id, ack_ctx); + if (on_finish == nullptr) { + ldout(m_image_ctx.cct, 10) << this << " duplicate quiesce request: " + << payload.async_request_id << dendl; + return false; + } + + ldout(m_image_ctx.cct, 10) << this << " quiesce request: " + << payload.async_request_id << dendl; + m_image_ctx.state->notify_quiesce(on_finish); + return false; +} + +template +bool ImageWatcher::handle_payload(const UnquiescePayload &payload, + C_NotifyAck *ack_ctx) { + ldout(m_image_ctx.cct, 10) << this << " unquiesce request: " + << payload.async_request_id << dendl; + + prepare_unquiesce_request(payload.async_request_id); + return true; +} + +template +bool ImageWatcher::handle_payload(const UnknownPayload &payload, + C_NotifyAck *ack_ctx) { + std::shared_lock l{m_image_ctx.owner_lock}; + if (m_image_ctx.exclusive_lock != nullptr) { + int r; + if (m_image_ctx.exclusive_lock->accept_request( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, &r) || r < 0) { + encode(ResponseMessage(-EOPNOTSUPP), ack_ctx->out); + } + } + return true; +} + 
+template +void ImageWatcher::process_payload(uint64_t notify_id, uint64_t handle, + Payload *payload) { + auto ctx = new Watcher::C_NotifyAck(this, notify_id, handle); + bool complete; + + switch (payload->get_notify_op()) { + case NOTIFY_OP_ACQUIRED_LOCK: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_RELEASED_LOCK: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_REQUEST_LOCK: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_HEADER_UPDATE: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_ASYNC_PROGRESS: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_ASYNC_COMPLETE: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_FLATTEN: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_RESIZE: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_SNAP_CREATE: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_SNAP_REMOVE: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_SNAP_RENAME: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_SNAP_PROTECT: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_SNAP_UNPROTECT: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_REBUILD_OBJECT_MAP: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_RENAME: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_UPDATE_FEATURES: + complete = handle_payload(*(static_cast(payload)), + ctx); + break; + case NOTIFY_OP_MIGRATE: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_SPARSIFY: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_QUIESCE: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_UNQUIESCE: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + case NOTIFY_OP_METADATA_UPDATE: + complete = handle_payload(*(static_cast(payload)), ctx); + break; + default: + ceph_assert(payload->get_notify_op() == static_cast(-1)); + complete = handle_payload(*(static_cast(payload)), ctx); + } + + if (complete) { + ctx->complete(0); + } +} + +template +void ImageWatcher::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + NotifyMessage notify_message; + if (bl.length() == 0) { + // legacy notification for header updates + notify_message = NotifyMessage(new HeaderUpdatePayload()); + } else { + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + lderr(m_image_ctx.cct) << this << " error decoding image notification: " + << err.what() << dendl; + return; + } + } + + // if an image refresh is required, refresh before processing the request + if (notify_message.check_for_refresh() && + m_image_ctx.state->is_refresh_required()) { + + m_image_ctx.state->refresh( + new C_ProcessPayload(this, notify_id, handle, + std::move(notify_message.payload))); + } else { + process_payload(notify_id, handle, notify_message.payload.get()); + } +} + +template +void ImageWatcher::handle_error(uint64_t handle, int err) { + lderr(m_image_ctx.cct) << this << " image watch failed: " << handle << ", " + << 
cpp_strerror(err) << dendl; + + { + std::lock_guard l{m_owner_client_id_lock}; + set_owner_client_id(ClientId()); + } + + Watcher::handle_error(handle, err); +} + +template +void ImageWatcher::handle_rewatch_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + if (m_image_ctx.exclusive_lock != nullptr) { + // update the lock cookie with the new watch handle + m_image_ctx.exclusive_lock->reacquire_lock(nullptr); + } + } + + // image might have been updated while we didn't have active watch + handle_payload(HeaderUpdatePayload(), nullptr); +} + +template +void ImageWatcher::send_notify(Payload *payload, Context *ctx) { + bufferlist bl; + + encode(NotifyMessage(payload), bl); + Watcher::send_notify(bl, nullptr, ctx); +} + +template +void ImageWatcher::RemoteContext::finish(int r) { + m_image_watcher.schedule_async_complete(m_async_request_id, r); +} + +template +void ImageWatcher::C_ResponseMessage::finish(int r) { + CephContext *cct = notify_ack->cct; + ldout(cct, 10) << this << " C_ResponseMessage: r=" << r << dendl; + + encode(ResponseMessage(r), notify_ack->out); + notify_ack->complete(0); +} + +} // namespace librbd + +template class librbd::ImageWatcher; diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h new file mode 100644 index 000000000..cda9a246e --- /dev/null +++ b/src/librbd/ImageWatcher.h @@ -0,0 +1,313 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_WATCHER_H +#define CEPH_LIBRBD_IMAGE_WATCHER_H + +#include "cls/rbd/cls_rbd_types.h" +#include "common/AsyncOpTracker.h" +#include "common/ceph_mutex.h" +#include "include/Context.h" +#include "include/rbd/librbd.hpp" +#include "librbd/Operations.h" +#include "librbd/Watcher.h" +#include "librbd/WatchNotifyTypes.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/internal.h" +#include +#include +#include +#include + +class entity_name_t; + +namespace librbd { + +class ImageCtx; +template class TaskFinisher; + +template +class ImageWatcher : public Watcher { +public: + ImageWatcher(ImageCtxT& image_ctx); + ~ImageWatcher() override; + + void unregister_watch(Context *on_finish) override; + void block_notifies(Context *on_finish) override; + + void notify_flatten(uint64_t request_id, ProgressContext &prog_ctx, + Context *on_finish); + void notify_resize(uint64_t request_id, uint64_t size, bool allow_shrink, + ProgressContext &prog_ctx, Context *on_finish); + void notify_snap_create(uint64_t request_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t flags, + ProgressContext &prog_ctx, + Context *on_finish); + void notify_snap_rename(uint64_t request_id, + const snapid_t &src_snap_id, + const std::string &dst_snap_name, + Context *on_finish); + void notify_snap_remove(uint64_t request_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish); + void notify_snap_protect(uint64_t request_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish); + void notify_snap_unprotect(uint64_t request_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish); + void notify_rebuild_object_map(uint64_t request_id, + ProgressContext &prog_ctx, Context *on_finish); + void notify_rename(uint64_t request_id, 
+ const std::string &image_name, Context *on_finish); + + void notify_update_features(uint64_t request_id, + uint64_t features, bool enabled, + Context *on_finish); + + void notify_migrate(uint64_t request_id, ProgressContext &prog_ctx, + Context *on_finish); + + void notify_sparsify(uint64_t request_id, size_t sparse_size, + ProgressContext &prog_ctx, Context *on_finish); + + void notify_acquired_lock(); + void notify_released_lock(); + void notify_request_lock(); + + void notify_header_update(Context *on_finish); + static void notify_header_update(librados::IoCtx &io_ctx, + const std::string &oid); + + void notify_quiesce(uint64_t *request_id, ProgressContext &prog_ctx, + Context *on_finish); + void notify_unquiesce(uint64_t request_id, Context *on_finish); + + void notify_metadata_set(uint64_t request_id, + const std::string &key, const std::string &value, + Context *on_finish); + void notify_metadata_remove(uint64_t request_id, + const std::string &key, Context *on_finish); + +private: + enum TaskCode { + TASK_CODE_REQUEST_LOCK, + TASK_CODE_CANCEL_ASYNC_REQUESTS, + TASK_CODE_REREGISTER_WATCH, + TASK_CODE_ASYNC_REQUEST, + TASK_CODE_ASYNC_PROGRESS, + TASK_CODE_QUIESCE, + }; + + typedef std::pair AsyncRequest; + + class Task { + public: + Task(TaskCode task_code) : m_task_code(task_code) {} + Task(TaskCode task_code, const watch_notify::AsyncRequestId &id) + : m_task_code(task_code), m_async_request_id(id) {} + + inline bool operator<(const Task& rhs) const { + if (m_task_code != rhs.m_task_code) { + return m_task_code < rhs.m_task_code; + } else if ((m_task_code == TASK_CODE_ASYNC_REQUEST || + m_task_code == TASK_CODE_ASYNC_PROGRESS || + m_task_code == TASK_CODE_QUIESCE) && + m_async_request_id != rhs.m_async_request_id) { + return m_async_request_id < rhs.m_async_request_id; + } + return false; + } + private: + TaskCode m_task_code; + watch_notify::AsyncRequestId m_async_request_id; + }; + + class RemoteProgressContext : public ProgressContext { + public: + RemoteProgressContext(ImageWatcher &image_watcher, + const watch_notify::AsyncRequestId &id) + : m_image_watcher(image_watcher), m_async_request_id(id) + { + } + + int update_progress(uint64_t offset, uint64_t total) override { + m_image_watcher.schedule_async_progress(m_async_request_id, offset, + total); + return 0; + } + + private: + ImageWatcher &m_image_watcher; + watch_notify::AsyncRequestId m_async_request_id; + }; + + class RemoteContext : public Context { + public: + RemoteContext(ImageWatcher &image_watcher, + const watch_notify::AsyncRequestId &id, + ProgressContext *prog_ctx) + : m_image_watcher(image_watcher), m_async_request_id(id), + m_prog_ctx(prog_ctx) + { + } + + ~RemoteContext() override { + delete m_prog_ctx; + } + + void finish(int r) override; + + private: + ImageWatcher &m_image_watcher; + watch_notify::AsyncRequestId m_async_request_id; + ProgressContext *m_prog_ctx; + }; + + struct C_ProcessPayload; + struct C_ResponseMessage : public Context { + C_NotifyAck *notify_ack; + + C_ResponseMessage(C_NotifyAck *notify_ack) : notify_ack(notify_ack) { + } + void finish(int r) override; + }; + + ImageCtxT &m_image_ctx; + + TaskFinisher *m_task_finisher; + + ceph::shared_mutex m_async_request_lock; + std::map m_async_requests; + std::set m_async_pending; + std::map m_async_complete; + std::set> m_async_complete_expiration; + + ceph::mutex m_owner_client_id_lock; + watch_notify::ClientId m_owner_client_id; + + AsyncOpTracker m_async_op_tracker; + + NoOpProgressContext m_no_op_prog_ctx; + + void handle_register_watch(int 
r); + + void schedule_cancel_async_requests(); + void cancel_async_requests(); + + void set_owner_client_id(const watch_notify::ClientId &client_id); + watch_notify::ClientId get_client_id(); + + void handle_request_lock(int r); + void schedule_request_lock(bool use_timer, int timer_delay = -1); + + void notify_lock_owner(watch_notify::Payload *payload, Context *on_finish); + + bool is_new_request(const watch_notify::AsyncRequestId &id) const; + bool mark_async_request_complete(const watch_notify::AsyncRequestId &id, + int r); + Context *remove_async_request(const watch_notify::AsyncRequestId &id); + Context *remove_async_request(const watch_notify::AsyncRequestId &id, + ceph::shared_mutex &lock); + void schedule_async_request_timed_out(const watch_notify::AsyncRequestId &id); + void async_request_timed_out(const watch_notify::AsyncRequestId &id); + void notify_async_request(const watch_notify::AsyncRequestId &id, + watch_notify::Payload *payload, + ProgressContext& prog_ctx, + Context *on_finish); + + void schedule_async_progress(const watch_notify::AsyncRequestId &id, + uint64_t offset, uint64_t total); + int notify_async_progress(const watch_notify::AsyncRequestId &id, + uint64_t offset, uint64_t total); + void schedule_async_complete(const watch_notify::AsyncRequestId &id, int r); + void notify_async_complete(const watch_notify::AsyncRequestId &id, int r); + void handle_async_complete(const watch_notify::AsyncRequestId &request, int r, + int ret_val); + + int prepare_async_request(const watch_notify::AsyncRequestId& id, + bool* new_request, Context** ctx, + ProgressContext** prog_ctx); + + Context *prepare_quiesce_request(const watch_notify::AsyncRequestId &request, + C_NotifyAck *ack_ctx); + void prepare_unquiesce_request(const watch_notify::AsyncRequestId &request); + void cancel_quiesce_requests(); + + void notify_quiesce(const watch_notify::AsyncRequestId &async_request_id, + size_t attempts, ProgressContext &prog_ctx, + Context *on_finish); + + bool handle_operation_request( + const watch_notify::AsyncRequestId& async_request_id, + exclusive_lock::OperationRequestType request_type, Operation operation, + std::function execute, + C_NotifyAck *ack_ctx); + + bool handle_payload(const watch_notify::HeaderUpdatePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::AcquiredLockPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::ReleasedLockPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::RequestLockPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::AsyncProgressPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::AsyncCompletePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::FlattenPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::ResizePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::SnapCreatePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::SnapRenamePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::SnapRemovePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::SnapProtectPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::SnapUnprotectPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::RebuildObjectMapPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::RenamePayload& 
payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::UpdateFeaturesPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::MigratePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::SparsifyPayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::QuiescePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::UnquiescePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::MetadataUpdatePayload& payload, + C_NotifyAck *ctx); + bool handle_payload(const watch_notify::UnknownPayload& payload, + C_NotifyAck *ctx); + void process_payload(uint64_t notify_id, uint64_t handle, + watch_notify::Payload *payload); + + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; + void handle_error(uint64_t cookie, int err) override; + void handle_rewatch_complete(int r) override; + + void send_notify(watch_notify::Payload *payload, Context *ctx = nullptr); + +}; + +} // namespace librbd + +extern template class librbd::ImageWatcher<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_IMAGE_WATCHER_H diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc new file mode 100644 index 000000000..8ddce2e8f --- /dev/null +++ b/src/librbd/Journal.cc @@ -0,0 +1,1862 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/Journal.h" +#include "include/rados/librados.hpp" +#include "common/AsyncOpTracker.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "cls/journal/cls_journal_types.h" +#include "journal/Journaler.h" +#include "journal/Policy.h" +#include "journal/ReplayEntry.h" +#include "journal/Settings.h" +#include "journal/Utils.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/DemoteRequest.h" +#include "librbd/journal/ObjectDispatch.h" +#include "librbd/journal/OpenRequest.h" +#include "librbd/journal/RemoveRequest.h" +#include "librbd/journal/ResetRequest.h" +#include "librbd/journal/Replay.h" +#include "librbd/journal/PromoteRequest.h" + +#include <boost/scope_exit.hpp> +#include <utility> + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Journal: " + +namespace librbd { + +using util::create_async_context_callback; +using util::create_context_callback; +using journal::util::C_DecodeTag; +using journal::util::C_DecodeTags; + +namespace { + +// TODO: once journaler is 100% async and converted to ASIO, remove separate +// threads and reuse librbd's AsioEngine +class ThreadPoolSingleton : public ThreadPool { +public: + ContextWQ *work_queue; + + explicit ThreadPoolSingleton(CephContext *cct) + : ThreadPool(cct, "librbd::Journal", "tp_librbd_journ", 1), + work_queue(new ContextWQ("librbd::journal::work_queue", + ceph::make_timespan( + cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout")), + this)) { + start(); + } + ~ThreadPoolSingleton() override { + work_queue->drain(); + delete work_queue; + + stop(); + } +}; + +template <typename Journaler> +struct C_IsTagOwner : public Context { + librados::IoCtx &io_ctx; + std::string image_id; + bool *is_tag_owner; + asio::ContextWQ *op_work_queue; + Context *on_finish; + + CephContext *cct = nullptr; + Journaler *journaler; + cls::journal::Client client; + journal::ImageClientMeta client_meta; + uint64_t tag_tid = 0; +
journal::TagData tag_data; + + C_IsTagOwner(librados::IoCtx &io_ctx, const std::string &image_id, + bool *is_tag_owner, asio::ContextWQ *op_work_queue, + Context *on_finish) + : io_ctx(io_ctx), image_id(image_id), is_tag_owner(is_tag_owner), + op_work_queue(op_work_queue), on_finish(on_finish), + cct(reinterpret_cast<CephContext *>(io_ctx.cct())), + journaler(new Journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID, + {}, nullptr)) { + } + + void finish(int r) override { + ldout(cct, 20) << this << " C_IsTagOwner::" << __func__ << ": r=" << r + << dendl; + if (r < 0) { + lderr(cct) << this << " C_IsTagOwner::" << __func__ << ": " + << "failed to get tag owner: " << cpp_strerror(r) << dendl; + } else { + *is_tag_owner = (tag_data.mirror_uuid == Journal<>::LOCAL_MIRROR_UUID); + } + + Journaler *journaler = this->journaler; + Context *on_finish = this->on_finish; + auto ctx = new LambdaContext( + [journaler, on_finish](int r) { + on_finish->complete(r); + delete journaler; + }); + op_work_queue->queue(ctx, r); + } +}; + +struct C_GetTagOwner : public Context { + std::string *mirror_uuid; + Context *on_finish; + + Journaler journaler; + cls::journal::Client client; + journal::ImageClientMeta client_meta; + uint64_t tag_tid = 0; + journal::TagData tag_data; + + C_GetTagOwner(librados::IoCtx &io_ctx, const std::string &image_id, + std::string *mirror_uuid, Context *on_finish) + : mirror_uuid(mirror_uuid), on_finish(on_finish), + journaler(io_ctx, image_id, Journal<>::IMAGE_CLIENT_ID, {}, nullptr) { + } + + virtual void finish(int r) { + if (r >= 0) { + *mirror_uuid = tag_data.mirror_uuid; + } + on_finish->complete(r); + } +}; + +template <typename J> +struct GetTagsRequest { + CephContext *cct; + J *journaler; + cls::journal::Client *client; + journal::ImageClientMeta *client_meta; + uint64_t *tag_tid; + journal::TagData *tag_data; + Context *on_finish; + + ceph::mutex lock = ceph::make_mutex("lock"); + + GetTagsRequest(CephContext *cct, J *journaler, cls::journal::Client *client, + journal::ImageClientMeta *client_meta, uint64_t *tag_tid, + journal::TagData *tag_data, Context *on_finish) + : cct(cct), journaler(journaler), client(client), client_meta(client_meta), + tag_tid(tag_tid), tag_data(tag_data), on_finish(on_finish) { + } + + /** + * @verbatim + * + * <start> + * | + * v + * GET_CLIENT * * * * * * * * * * * * + * | * + * v * + * GET_TAGS * * * * * * * * * * * * * (error) + * | * + * v * + * <finish> * * * * * * * * * * * * * + * + * @endverbatim + */ + + void send() { + send_get_client(); + } + + void send_get_client() { + ldout(cct, 20) << __func__ << dendl; + + auto ctx = new LambdaContext( + [this](int r) { + handle_get_client(r); + }); + journaler->get_client(Journal<>::IMAGE_CLIENT_ID, client, ctx); + } + + void handle_get_client(int r) { + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r < 0) { + complete(r); + return; + } + + librbd::journal::ClientData client_data; + auto bl_it = client->data.cbegin(); + try { + decode(client_data, bl_it); + } catch (const buffer::error &err) { + lderr(cct) << this << " OpenJournalerRequest::" << __func__ << ": " + << "failed to decode client data" << dendl; + complete(-EBADMSG); + return; + } + + journal::ImageClientMeta *image_client_meta = + boost::get<journal::ImageClientMeta>(&client_data.client_meta); + if (image_client_meta == nullptr) { + lderr(cct) << this << " OpenJournalerRequest::" << __func__ << ": " + << "failed to get client meta" << dendl; + complete(-EINVAL); + return; + } + *client_meta = *image_client_meta; + + send_get_tags(); + } + + void send_get_tags() { + ldout(cct, 20) <<
__func__ << dendl; + + auto ctx = new LambdaContext( + [this](int r) { + handle_get_tags(r); + }); + C_DecodeTags *tags_ctx = new C_DecodeTags(cct, &lock, tag_tid, tag_data, + ctx); + journaler->get_tags(client_meta->tag_class, &tags_ctx->tags, tags_ctx); + } + + void handle_get_tags(int r) { + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + complete(r); + } + + void complete(int r) { + on_finish->complete(r); + delete this; + } +}; + +template <typename J> +void get_tags(CephContext *cct, J *journaler, + cls::journal::Client *client, + journal::ImageClientMeta *client_meta, + uint64_t *tag_tid, journal::TagData *tag_data, + Context *on_finish) { + ldout(cct, 20) << __func__ << dendl; + + GetTagsRequest<J> *req = + new GetTagsRequest<J>(cct, journaler, client, client_meta, tag_tid, + tag_data, on_finish); + req->send(); +} + +template <typename J> +int allocate_journaler_tag(CephContext *cct, J *journaler, + uint64_t tag_class, + const journal::TagPredecessor &predecessor, + const std::string &mirror_uuid, + cls::journal::Tag *new_tag) { + journal::TagData tag_data; + tag_data.mirror_uuid = mirror_uuid; + tag_data.predecessor = predecessor; + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + C_SaferCond allocate_tag_ctx; + journaler->allocate_tag(tag_class, tag_bl, new_tag, &allocate_tag_ctx); + + int r = allocate_tag_ctx.wait(); + if (r < 0) { + lderr(cct) << __func__ << ": " + << "failed to allocate tag: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +} // anonymous namespace + +// client id for local image +template <typename I> +const std::string Journal<I>::IMAGE_CLIENT_ID(""); + +// mirror uuid to use for local images +template <typename I> +const std::string Journal<I>::LOCAL_MIRROR_UUID(""); + +// mirror uuid to use for orphaned (demoted) images +template <typename I> +const std::string Journal<I>::ORPHAN_MIRROR_UUID("<orphan>"); + +template <typename I> +std::ostream &operator<<(std::ostream &os, + const typename Journal<I>::State &state) { + switch (state) { + case Journal<I>::STATE_UNINITIALIZED: + os << "Uninitialized"; + break; + case Journal<I>::STATE_INITIALIZING: + os << "Initializing"; + break; + case Journal<I>::STATE_REPLAYING: + os << "Replaying"; + break; + case Journal<I>::STATE_FLUSHING_RESTART: + os << "FlushingRestart"; + break; + case Journal<I>::STATE_RESTARTING_REPLAY: + os << "RestartingReplay"; + break; + case Journal<I>::STATE_FLUSHING_REPLAY: + os << "FlushingReplay"; + break; + case Journal<I>::STATE_READY: + os << "Ready"; + break; + case Journal<I>::STATE_STOPPING: + os << "Stopping"; + break; + case Journal<I>::STATE_CLOSING: + os << "Closing"; + break; + case Journal<I>::STATE_CLOSED: + os << "Closed"; + break; + default: + os << "Unknown (" << static_cast<uint32_t>(state) << ")"; + break; + } + return os; +} + + +template <typename I> +void Journal<I>::MetadataListener::handle_update(::journal::JournalMetadata *) { + auto ctx = new LambdaContext([this](int r) { + journal->handle_metadata_updated(); + }); + journal->m_work_queue->queue(ctx, 0); +} + + +template <typename I> +void Journal<I>::get_work_queue(CephContext *cct, ContextWQ **work_queue) { + auto thread_pool_singleton = + &cct->lookup_or_create_singleton_object<ThreadPoolSingleton>( + "librbd::journal::thread_pool", false, cct); + *work_queue = thread_pool_singleton->work_queue; +} + +template <typename I> +Journal<I>::Journal(I &image_ctx) + : RefCountedObject(image_ctx.cct), + m_image_ctx(image_ctx), m_journaler(NULL), + m_state(STATE_UNINITIALIZED), + m_error_result(0), m_replay_handler(this), m_close_pending(false), + m_event_tid(0), + m_blocking_writes(false), m_journal_replay(NULL), + m_metadata_listener(this) { + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5)
<< this << ": ictx=" << &m_image_ctx << dendl; + + get_work_queue(cct, &m_work_queue); + ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock); +} + +template +Journal::~Journal() { + if (m_work_queue != nullptr) { + m_work_queue->drain(); + } + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_CLOSED); + ceph_assert(m_journaler == NULL); + ceph_assert(m_journal_replay == NULL); + ceph_assert(m_wait_for_state_contexts.empty()); +} + +template +bool Journal::is_journal_supported(I &image_ctx) { + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + return ((image_ctx.features & RBD_FEATURE_JOURNALING) && + !image_ctx.read_only && image_ctx.snap_id == CEPH_NOSNAP); +} + +template +int Journal::create(librados::IoCtx &io_ctx, const std::string &image_id, + uint8_t order, uint8_t splay_width, + const std::string &object_pool) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 5) << __func__ << ": image=" << image_id << dendl; + + ContextWQ *work_queue; + get_work_queue(cct, &work_queue); + + C_SaferCond cond; + journal::TagData tag_data(LOCAL_MIRROR_UUID); + journal::CreateRequest *req = journal::CreateRequest::create( + io_ctx, image_id, order, splay_width, object_pool, cls::journal::Tag::TAG_CLASS_NEW, + tag_data, IMAGE_CLIENT_ID, work_queue, &cond); + req->send(); + + return cond.wait(); +} + +template +int Journal::remove(librados::IoCtx &io_ctx, const std::string &image_id) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 5) << __func__ << ": image=" << image_id << dendl; + + ContextWQ *work_queue; + get_work_queue(cct, &work_queue); + + C_SaferCond cond; + journal::RemoveRequest *req = journal::RemoveRequest::create( + io_ctx, image_id, IMAGE_CLIENT_ID, work_queue, &cond); + req->send(); + + return cond.wait(); +} + +template +int Journal::reset(librados::IoCtx &io_ctx, const std::string &image_id) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 5) << __func__ << ": image=" << image_id << dendl; + + ContextWQ *work_queue; + get_work_queue(cct, &work_queue); + + C_SaferCond cond; + auto req = journal::ResetRequest::create(io_ctx, image_id, IMAGE_CLIENT_ID, + Journal<>::LOCAL_MIRROR_UUID, + work_queue, &cond); + req->send(); + + return cond.wait(); +} + +template +void Journal::is_tag_owner(I *image_ctx, bool *owner, + Context *on_finish) { + Journal::is_tag_owner(image_ctx->md_ctx, image_ctx->id, owner, + image_ctx->op_work_queue, on_finish); +} + +template +void Journal::is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id, + bool *is_tag_owner, + asio::ContextWQ *op_work_queue, + Context *on_finish) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << __func__ << dendl; + + C_IsTagOwner *is_tag_owner_ctx = new C_IsTagOwner( + io_ctx, image_id, is_tag_owner, op_work_queue, on_finish); + get_tags(cct, is_tag_owner_ctx->journaler, &is_tag_owner_ctx->client, + &is_tag_owner_ctx->client_meta, &is_tag_owner_ctx->tag_tid, + &is_tag_owner_ctx->tag_data, is_tag_owner_ctx); +} + +template +void Journal::get_tag_owner(IoCtx& io_ctx, std::string& image_id, + std::string *mirror_uuid, + asio::ContextWQ *op_work_queue, + Context *on_finish) { + CephContext *cct = static_cast(io_ctx.cct()); + ldout(cct, 20) << __func__ << dendl; + + auto ctx = new C_GetTagOwner(io_ctx, image_id, mirror_uuid, on_finish); + get_tags(cct, &ctx->journaler, &ctx->client, &ctx->client_meta, &ctx->tag_tid, + &ctx->tag_data, create_async_context_callback(op_work_queue, ctx)); +} + 
+template +int Journal::request_resync(I *image_ctx) { + CephContext *cct = image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + Journaler journaler(image_ctx->md_ctx, image_ctx->id, IMAGE_CLIENT_ID, {}, + nullptr); + + ceph::mutex lock = ceph::make_mutex("lock"); + journal::ImageClientMeta client_meta; + uint64_t tag_tid; + journal::TagData tag_data; + + C_SaferCond open_ctx; + auto open_req = journal::OpenRequest::create(image_ctx, &journaler, &lock, + &client_meta, &tag_tid, + &tag_data, &open_ctx); + open_req->send(); + + BOOST_SCOPE_EXIT_ALL(&journaler) { + journaler.shut_down(); + }; + + int r = open_ctx.wait(); + if (r < 0) { + return r; + } + + client_meta.resync_requested = true; + + journal::ClientData client_data(client_meta); + bufferlist client_data_bl; + encode(client_data, client_data_bl); + + C_SaferCond update_client_ctx; + journaler.update_client(client_data_bl, &update_client_ctx); + + r = update_client_ctx.wait(); + if (r < 0) { + lderr(cct) << __func__ << ": " + << "failed to update client: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template +void Journal::promote(I *image_ctx, Context *on_finish) { + CephContext *cct = image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + auto promote_req = journal::PromoteRequest::create(image_ctx, false, + on_finish); + promote_req->send(); +} + +template +void Journal::demote(I *image_ctx, Context *on_finish) { + CephContext *cct = image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + auto req = journal::DemoteRequest::create(*image_ctx, on_finish); + req->send(); +} + +template +bool Journal::is_journal_ready() const { + std::lock_guard locker{m_lock}; + return (m_state == STATE_READY); +} + +template +bool Journal::is_journal_replaying() const { + std::lock_guard locker{m_lock}; + return is_journal_replaying(m_lock); +} + +template +bool Journal::is_journal_replaying(const ceph::mutex &) const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return (m_state == STATE_REPLAYING || + m_state == STATE_FLUSHING_REPLAY || + m_state == STATE_FLUSHING_RESTART || + m_state == STATE_RESTARTING_REPLAY); +} + +template +bool Journal::is_journal_appending() const { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + std::lock_guard locker{m_lock}; + return (m_state == STATE_READY && + !m_image_ctx.get_journal_policy()->append_disabled()); +} + +template +void Journal::wait_for_journal_ready(Context *on_ready) { + on_ready = create_async_context_callback(m_image_ctx, on_ready); + + std::lock_guard locker{m_lock}; + if (m_state == STATE_READY) { + on_ready->complete(m_error_result); + } else { + wait_for_steady_state(on_ready); + } +} + +template +void Journal::open(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + on_finish = create_context_callback(on_finish, this); + + on_finish = create_async_context_callback(m_image_ctx, on_finish); + + // inject our handler into the object dispatcher chain + m_image_ctx.io_object_dispatcher->register_dispatch( + journal::ObjectDispatch::create(&m_image_ctx, this)); + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_UNINITIALIZED); + wait_for_steady_state(on_finish); + create_journaler(); +} + +template +void Journal::close(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + on_finish = create_context_callback(on_finish, this); + + on_finish = new LambdaContext([this, on_finish](int r) { + // remove our handler 
from object dispatcher chain - preserve error + auto ctx = new LambdaContext([on_finish, r](int _) { + on_finish->complete(r); + }); + m_image_ctx.io_object_dispatcher->shut_down_dispatch( + io::OBJECT_DISPATCH_LAYER_JOURNAL, ctx); + }); + on_finish = create_async_context_callback(m_image_ctx, on_finish); + + std::unique_lock locker{m_lock}; + m_listener_cond.wait(locker, [this] { return !m_listener_notify; }); + + Listeners listeners(m_listeners); + m_listener_notify = true; + locker.unlock(); + for (auto listener : listeners) { + listener->handle_close(); + } + + locker.lock(); + m_listener_notify = false; + m_listener_cond.notify_all(); + + ceph_assert(m_state != STATE_UNINITIALIZED); + if (m_state == STATE_CLOSED) { + on_finish->complete(m_error_result); + return; + } + + if (m_state == STATE_READY) { + stop_recording(); + } + + m_close_pending = true; + wait_for_steady_state(on_finish); +} + +template +bool Journal::is_tag_owner() const { + std::lock_guard locker{m_lock}; + return is_tag_owner(m_lock); +} + +template +bool Journal::is_tag_owner(const ceph::mutex &) const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return (m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID); +} + +template +uint64_t Journal::get_tag_tid() const { + std::lock_guard locker{m_lock}; + return m_tag_tid; +} + +template +journal::TagData Journal::get_tag_data() const { + std::lock_guard locker{m_lock}; + return m_tag_data; +} + +template +void Journal::allocate_local_tag(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + journal::TagPredecessor predecessor; + predecessor.mirror_uuid = LOCAL_MIRROR_UUID; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_journaler != nullptr && is_tag_owner(m_lock)); + + cls::journal::Client client; + int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to retrieve client: " << cpp_strerror(r) << dendl; + m_image_ctx.op_work_queue->queue(on_finish, r); + return; + } + + // since we are primary, populate the predecessor with our known commit + // position + ceph_assert(m_tag_data.mirror_uuid == LOCAL_MIRROR_UUID); + if (!client.commit_position.object_positions.empty()) { + auto position = client.commit_position.object_positions.front(); + predecessor.commit_valid = true; + predecessor.tag_tid = position.tag_tid; + predecessor.entry_tid = position.entry_tid; + } + } + + allocate_tag(LOCAL_MIRROR_UUID, predecessor, on_finish); +} + +template +void Journal::allocate_tag(const std::string &mirror_uuid, + const journal::TagPredecessor &predecessor, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": mirror_uuid=" << mirror_uuid + << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_journaler != nullptr); + + journal::TagData tag_data; + tag_data.mirror_uuid = mirror_uuid; + tag_data.predecessor = predecessor; + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + C_DecodeTag *decode_tag_ctx = new C_DecodeTag(cct, &m_lock, &m_tag_tid, + &m_tag_data, on_finish); + m_journaler->allocate_tag(m_tag_class, tag_bl, &decode_tag_ctx->tag, + decode_tag_ctx); +} + +template +void Journal::flush_commit_position(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_journaler != nullptr); + m_journaler->flush_commit_position(on_finish); +} + +template +void 
Journal::user_flushed() { + if (m_state == STATE_READY && !m_user_flushed.exchange(true) && + m_image_ctx.config.template get_val("rbd_journal_object_writethrough_until_flush")) { + std::lock_guard locker{m_lock}; + if (m_state == STATE_READY) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + ceph_assert(m_journaler != nullptr); + m_journaler->set_append_batch_options( + m_image_ctx.config.template get_val("rbd_journal_object_flush_interval"), + m_image_ctx.config.template get_val("rbd_journal_object_flush_bytes"), + m_image_ctx.config.template get_val("rbd_journal_object_flush_age")); + } else { + m_user_flushed = false; + } + } +} + +template +uint64_t Journal::append_write_event(uint64_t offset, size_t length, + const bufferlist &bl, + bool flush_entry) { + ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size()); + uint64_t max_write_data_size = + m_max_append_size - journal::AioWriteEvent::get_fixed_size(); + + // ensure that the write event fits within the journal entry + Bufferlists bufferlists; + uint64_t bytes_remaining = length; + uint64_t event_offset = 0; + do { + uint64_t event_length = std::min(bytes_remaining, max_write_data_size); + + bufferlist event_bl; + event_bl.substr_of(bl, event_offset, event_length); + journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset, + event_length, + event_bl), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + + event_offset += event_length; + bytes_remaining -= event_length; + } while (bytes_remaining > 0); + + return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists, offset, + length, flush_entry, 0); +} + +template +uint64_t Journal::append_compare_and_write_event(uint64_t offset, + size_t length, + const bufferlist &cmp_bl, + const bufferlist &write_bl, + bool flush_entry) { + ceph_assert( + m_max_append_size > journal::AioCompareAndWriteEvent::get_fixed_size()); + uint64_t max_compare_and_write_data_size = + m_max_append_size - journal::AioCompareAndWriteEvent::get_fixed_size(); + // we need double the size because we store cmp and write buffers + max_compare_and_write_data_size /= 2; + + // ensure that the compare and write event fits within the journal entry + Bufferlists bufferlists; + uint64_t bytes_remaining = length; + uint64_t event_offset = 0; + do { + uint64_t event_length = std::min(bytes_remaining, + max_compare_and_write_data_size); + + bufferlist event_cmp_bl; + event_cmp_bl.substr_of(cmp_bl, event_offset, event_length); + bufferlist event_write_bl; + event_write_bl.substr_of(write_bl, event_offset, event_length); + journal::EventEntry event_entry( + journal::AioCompareAndWriteEvent(offset + event_offset, + event_length, + event_cmp_bl, + event_write_bl), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + + event_offset += event_length; + bytes_remaining -= event_length; + } while (bytes_remaining > 0); + + return append_io_events(journal::EVENT_TYPE_AIO_COMPARE_AND_WRITE, + bufferlists, offset, length, flush_entry, -EILSEQ); +} + +template +uint64_t Journal::append_io_event(journal::EventEntry &&event_entry, + uint64_t offset, size_t length, + bool flush_entry, int filter_ret_val) { + bufferlist bl; + event_entry.timestamp = ceph_clock_now(); + encode(event_entry, bl); + return append_io_events(event_entry.get_event_type(), {bl}, offset, length, + flush_entry, filter_ret_val); +} + +template +uint64_t 
Journal::append_io_events(journal::EventType event_type, + const Bufferlists &bufferlists, + uint64_t offset, size_t length, + bool flush_entry, int filter_ret_val) { + ceph_assert(!bufferlists.empty()); + + uint64_t tid; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_READY); + + tid = ++m_event_tid; + ceph_assert(tid != 0); + } + + Futures futures; + for (auto &bl : bufferlists) { + ceph_assert(bl.length() <= m_max_append_size); + futures.push_back(m_journaler->append(m_tag_tid, bl)); + } + + { + std::lock_guard event_locker{m_event_lock}; + m_events[tid] = Event(futures, offset, length, filter_ret_val); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": " + << "event=" << event_type << ", " + << "offset=" << offset << ", " + << "length=" << length << ", " + << "flush=" << flush_entry << ", tid=" << tid << dendl; + + Context *on_safe = create_async_context_callback( + m_image_ctx, new C_IOEventSafe(this, tid)); + if (flush_entry) { + futures.back().flush(on_safe); + } else { + futures.back().wait(on_safe); + } + + return tid; +} + +template +void Journal::commit_io_event(uint64_t tid, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + "r=" << r << dendl; + + std::lock_guard event_locker{m_event_lock}; + typename Events::iterator it = m_events.find(tid); + if (it == m_events.end()) { + return; + } + complete_event(it, r); +} + +template +void Journal::commit_io_event_extent(uint64_t tid, uint64_t offset, + uint64_t length, int r) { + ceph_assert(length > 0); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + << "offset=" << offset << ", " + << "length=" << length << ", " + << "r=" << r << dendl; + + std::lock_guard event_locker{m_event_lock}; + typename Events::iterator it = m_events.find(tid); + if (it == m_events.end()) { + return; + } + + Event &event = it->second; + if (event.ret_val == 0 && r < 0) { + event.ret_val = r; + } + + ExtentInterval extent; + extent.insert(offset, length); + + ExtentInterval intersect; + intersect.intersection_of(extent, event.pending_extents); + + event.pending_extents.subtract(intersect); + if (!event.pending_extents.empty()) { + ldout(cct, 20) << this << " " << __func__ << ": " + << "pending extents: " << event.pending_extents << dendl; + return; + } + complete_event(it, event.ret_val); +} + +template +void Journal::append_op_event(uint64_t op_tid, + journal::EventEntry &&event_entry, + Context *on_safe) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + bufferlist bl; + event_entry.timestamp = ceph_clock_now(); + encode(event_entry, bl); + + Future future; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_READY); + + future = m_journaler->append(m_tag_tid, bl); + + // delay committing op event to ensure consistent replay + ceph_assert(m_op_futures.count(op_tid) == 0); + m_op_futures[op_tid] = future; + } + + on_safe = create_async_context_callback(m_image_ctx, on_safe); + on_safe = new LambdaContext([this, on_safe](int r) { + // ensure all committed IO before this op is committed + m_journaler->flush_commit_position(on_safe); + }); + future.flush(on_safe); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "op_tid=" << op_tid << ", " + << "event=" << event_entry.get_event_type() << dendl; +} + +template +void Journal::commit_op_event(uint64_t op_tid, int r, Context *on_safe) { + 
CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << ", " + << "r=" << r << dendl; + + journal::EventEntry event_entry((journal::OpFinishEvent(op_tid, r)), + ceph_clock_now()); + + bufferlist bl; + encode(event_entry, bl); + + Future op_start_future; + Future op_finish_future; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_READY); + + // ready to commit op event + auto it = m_op_futures.find(op_tid); + ceph_assert(it != m_op_futures.end()); + op_start_future = it->second; + m_op_futures.erase(it); + + op_finish_future = m_journaler->append(m_tag_tid, bl); + } + + op_finish_future.flush(create_async_context_callback( + m_image_ctx, new C_OpEventSafe(this, op_tid, op_start_future, + op_finish_future, on_safe))); +} + +template +void Journal::replay_op_ready(uint64_t op_tid, Context *on_resume) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": op_tid=" << op_tid << dendl; + + { + std::lock_guard locker{m_lock}; + ceph_assert(m_journal_replay != nullptr); + m_journal_replay->replay_op_ready(op_tid, on_resume); + } +} + +template +void Journal::flush_event(uint64_t tid, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + << "on_safe=" << on_safe << dendl; + + on_safe = create_context_callback(on_safe, this); + + Future future; + { + std::lock_guard event_locker{m_event_lock}; + future = wait_event(m_lock, tid, on_safe); + } + + if (future.is_valid()) { + future.flush(nullptr); + } +} + +template +void Journal::wait_event(uint64_t tid, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", " + << "on_safe=" << on_safe << dendl; + + on_safe = create_context_callback(on_safe, this); + + std::lock_guard event_locker{m_event_lock}; + wait_event(m_lock, tid, on_safe); +} + +template +typename Journal::Future Journal::wait_event(ceph::mutex &lock, uint64_t tid, + Context *on_safe) { + ceph_assert(ceph_mutex_is_locked(m_event_lock)); + CephContext *cct = m_image_ctx.cct; + + typename Events::iterator it = m_events.find(tid); + ceph_assert(it != m_events.end()); + + Event &event = it->second; + if (event.safe) { + // journal entry already safe + ldout(cct, 20) << this << " " << __func__ << ": " + << "journal entry already safe" << dendl; + m_image_ctx.op_work_queue->queue(on_safe, event.ret_val); + return Future(); + } + + event.on_safe_contexts.push_back(create_async_context_callback(m_image_ctx, + on_safe)); + return event.futures.back(); +} + +template +void Journal::start_external_replay(journal::Replay **journal_replay, + Context *on_start) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_READY); + ceph_assert(m_journal_replay == nullptr); + + on_start = util::create_async_context_callback(m_image_ctx, on_start); + on_start = new LambdaContext( + [this, journal_replay, on_start](int r) { + handle_start_external_replay(r, journal_replay, on_start); + }); + + // safely flush all in-flight events before starting external replay + m_journaler->stop_append(util::create_async_context_callback(m_image_ctx, + on_start)); +} + +template +void Journal::handle_start_external_replay(int r, + journal::Replay **journal_replay, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << 
dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_READY); + ceph_assert(m_journal_replay == nullptr); + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to stop recording: " << cpp_strerror(r) << dendl; + *journal_replay = nullptr; + + // get back to a sane-state + start_append(); + on_finish->complete(r); + return; + } + + transition_state(STATE_REPLAYING, 0); + m_journal_replay = journal::Replay::create(m_image_ctx); + *journal_replay = m_journal_replay; + on_finish->complete(0); +} + +template +void Journal::stop_external_replay() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_journal_replay != nullptr); + ceph_assert(m_state == STATE_REPLAYING); + + delete m_journal_replay; + m_journal_replay = nullptr; + + if (m_close_pending) { + destroy_journaler(0); + return; + } + + start_append(); +} + +template +void Journal::create_journaler() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_UNINITIALIZED || m_state == STATE_RESTARTING_REPLAY); + ceph_assert(m_journaler == NULL); + + transition_state(STATE_INITIALIZING, 0); + ::journal::Settings settings; + settings.commit_interval = + m_image_ctx.config.template get_val("rbd_journal_commit_age"); + settings.max_payload_bytes = + m_image_ctx.config.template get_val("rbd_journal_max_payload_bytes"); + settings.max_concurrent_object_sets = + m_image_ctx.config.template get_val("rbd_journal_max_concurrent_object_sets"); + // TODO: a configurable filter to exclude certain peers from being + // disconnected. + settings.ignored_laggy_clients = {IMAGE_CLIENT_ID}; + + m_journaler = new Journaler(m_work_queue, m_timer, m_timer_lock, + m_image_ctx.md_ctx, m_image_ctx.id, + IMAGE_CLIENT_ID, settings, nullptr); + m_journaler->add_listener(&m_metadata_listener); + + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + Journal, &Journal::handle_open>(this)); + auto open_req = journal::OpenRequest::create(&m_image_ctx, m_journaler, + &m_lock, &m_client_meta, + &m_tag_tid, &m_tag_data, ctx); + open_req->send(); +} + +template +void Journal::destroy_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + + delete m_journal_replay; + m_journal_replay = NULL; + + m_journaler->remove_listener(&m_metadata_listener); + + transition_state(STATE_CLOSING, r); + + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + Journal, &Journal::handle_journal_destroyed>(this)); + ctx = new LambdaContext( + [this, ctx](int r) { + std::lock_guard locker{m_lock}; + m_journaler->shut_down(ctx); + }); + ctx = create_async_context_callback(m_image_ctx, ctx); + m_async_journal_op_tracker.wait_for_ops(ctx); +} + +template +void Journal::recreate_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + + delete m_journal_replay; + m_journal_replay = NULL; + + m_journaler->remove_listener(&m_metadata_listener); + + transition_state(STATE_RESTARTING_REPLAY, r); + m_journaler->shut_down(create_async_context_callback( + m_image_ctx, 
create_context_callback< + Journal, &Journal::handle_journal_destroyed>(this))); +} + +template +void Journal::complete_event(typename Events::iterator it, int r) { + ceph_assert(ceph_mutex_is_locked(m_event_lock)); + ceph_assert(m_state == STATE_READY); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": tid=" << it->first << " " + << "r=" << r << dendl; + + Event &event = it->second; + if (r < 0 && r == event.filter_ret_val) { + // ignore allowed error codes + r = 0; + } + if (r < 0) { + // event recorded to journal but failed to update disk, we cannot + // commit this IO event. this event must be replayed. + ceph_assert(event.safe); + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit IO to disk, replay required: " + << cpp_strerror(r) << dendl; + } + + event.committed_io = true; + if (event.safe) { + if (r >= 0) { + for (auto &future : event.futures) { + m_journaler->committed(future); + } + } + m_events.erase(it); + } +} + +template +void Journal::start_append() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + m_journaler->start_append( + m_image_ctx.config.template get_val("rbd_journal_object_max_in_flight_appends")); + if (!m_image_ctx.config.template get_val("rbd_journal_object_writethrough_until_flush")) { + m_journaler->set_append_batch_options( + m_image_ctx.config.template get_val("rbd_journal_object_flush_interval"), + m_image_ctx.config.template get_val("rbd_journal_object_flush_bytes"), + m_image_ctx.config.template get_val("rbd_journal_object_flush_age")); + } + + transition_state(STATE_READY, 0); +} + +template +void Journal::handle_open(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_INITIALIZING); + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to initialize journal: " << cpp_strerror(r) + << dendl; + destroy_journaler(r); + return; + } + + m_tag_class = m_client_meta.tag_class; + m_max_append_size = m_journaler->get_max_append_size(); + ldout(cct, 20) << this << " " << __func__ << ": " + << "tag_class=" << m_tag_class << ", " + << "max_append_size=" << m_max_append_size << dendl; + + transition_state(STATE_REPLAYING, 0); + m_journal_replay = journal::Replay::create(m_image_ctx); + m_journaler->start_replay(&m_replay_handler); +} + +template +void Journal::handle_replay_ready() { + CephContext *cct = m_image_ctx.cct; + ReplayEntry replay_entry; + { + std::lock_guard locker{m_lock}; + if (m_state != STATE_REPLAYING) { + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + if (!m_journaler->try_pop_front(&replay_entry)) { + return; + } + + // only one entry should be in-flight at a time + ceph_assert(!m_processing_entry); + m_processing_entry = true; + } + + m_async_journal_op_tracker.start_op(); + + bufferlist data = replay_entry.get_data(); + auto it = data.cbegin(); + + journal::EventEntry event_entry; + int r = m_journal_replay->decode(&it, &event_entry); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to decode journal event entry" << dendl; + handle_replay_process_safe(replay_entry, r); + return; + } + + Context *on_ready = create_context_callback< + Journal, &Journal::handle_replay_process_ready>(this); + Context *on_commit = new C_ReplayProcessSafe(this, std::move(replay_entry)); + m_journal_replay->process(event_entry, on_ready, on_commit); +} + +template +void Journal::handle_replay_complete(int r) { 
+ CephContext *cct = m_image_ctx.cct; + + bool cancel_ops = false; + { + std::lock_guard locker{m_lock}; + if (m_state != STATE_REPLAYING) { + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + if (r < 0) { + cancel_ops = true; + transition_state(STATE_FLUSHING_RESTART, r); + } else { + // state might change back to FLUSHING_RESTART on flush error + transition_state(STATE_FLUSHING_REPLAY, 0); + } + } + + Context *ctx = new LambdaContext([this, cct](int r) { + ldout(cct, 20) << this << " handle_replay_complete: " + << "handle shut down replay" << dendl; + + State state; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + state = m_state; + } + + if (state == STATE_FLUSHING_RESTART) { + handle_flushing_restart(0); + } else { + handle_flushing_replay(); + } + }); + ctx = new LambdaContext([this, ctx](int r) { + // ensure the commit position is flushed to disk + m_journaler->flush_commit_position(ctx); + }); + ctx = create_async_context_callback(m_image_ctx, ctx); + ctx = new LambdaContext([this, ctx](int r) { + m_async_journal_op_tracker.wait_for_ops(ctx); + }); + ctx = new LambdaContext([this, cct, cancel_ops, ctx](int r) { + ldout(cct, 20) << this << " handle_replay_complete: " + << "shut down replay" << dendl; + m_journal_replay->shut_down(cancel_ops, ctx); + }); + + m_journaler->stop_replay(ctx); +} + +template +void Journal::handle_replay_process_ready(int r) { + // journal::Replay is ready for more events -- attempt to pop another + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(r == 0); + { + std::lock_guard locker{m_lock}; + ceph_assert(m_processing_entry); + m_processing_entry = false; + } + handle_replay_ready(); +} + +template +void Journal::handle_replay_process_safe(ReplayEntry replay_entry, int r) { + CephContext *cct = m_image_ctx.cct; + + std::unique_lock locker{m_lock}; + ceph_assert(m_state == STATE_REPLAYING || + m_state == STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + if (r < 0) { + if (r != -ECANCELED) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit journal event to disk: " + << cpp_strerror(r) << dendl; + } + + if (m_state == STATE_REPLAYING) { + // abort the replay if we have an error + transition_state(STATE_FLUSHING_RESTART, r); + locker.unlock(); + + // stop replay, shut down, and restart + Context* ctx = create_context_callback< + Journal, &Journal::handle_flushing_restart>(this); + ctx = new LambdaContext([this, ctx](int r) { + // ensure the commit position is flushed to disk + m_journaler->flush_commit_position(ctx); + }); + ctx = new LambdaContext([this, cct, ctx](int r) { + ldout(cct, 20) << this << " handle_replay_process_safe: " + << "shut down replay" << dendl; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_FLUSHING_RESTART); + } + + m_journal_replay->shut_down(true, ctx); + }); + m_journaler->stop_replay(ctx); + m_async_journal_op_tracker.finish_op(); + return; + } else if (m_state == STATE_FLUSHING_REPLAY) { + // end-of-replay flush in-progress -- we need to restart replay + transition_state(STATE_FLUSHING_RESTART, r); + locker.unlock(); + m_async_journal_op_tracker.finish_op(); + return; + } + } else { + // only commit the entry if written successfully + m_journaler->committed(replay_entry); + } + locker.unlock(); + m_async_journal_op_tracker.finish_op(); 
+} + +template +void Journal::handle_flushing_restart(int r) { + std::lock_guard locker{m_lock}; + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(r == 0); + ceph_assert(m_state == STATE_FLUSHING_RESTART); + if (m_close_pending) { + destroy_journaler(r); + return; + } + + recreate_journaler(r); +} + +template +void Journal::handle_flushing_replay() { + std::lock_guard locker{m_lock}; + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(m_state == STATE_FLUSHING_REPLAY || + m_state == STATE_FLUSHING_RESTART); + if (m_close_pending) { + destroy_journaler(0); + return; + } else if (m_state == STATE_FLUSHING_RESTART) { + // failed to replay one-or-more events -- restart + recreate_journaler(0); + return; + } + + delete m_journal_replay; + m_journal_replay = NULL; + + m_error_result = 0; + start_append(); +} + +template +void Journal::handle_recording_stopped(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_STOPPING); + + destroy_journaler(r); +} + +template +void Journal::handle_journal_destroyed(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << this << " " << __func__ + << "error detected while closing journal: " << cpp_strerror(r) + << dendl; + } + + std::lock_guard locker{m_lock}; + delete m_journaler; + m_journaler = nullptr; + + ceph_assert(m_state == STATE_CLOSING || m_state == STATE_RESTARTING_REPLAY); + if (m_state == STATE_RESTARTING_REPLAY) { + create_journaler(); + return; + } + + transition_state(STATE_CLOSED, r); +} + +template +void Journal::handle_io_event_safe(int r, uint64_t tid) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", " + << "tid=" << tid << dendl; + + // journal will be flushed before closing + ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit IO event: " << cpp_strerror(r) << dendl; + } + + Contexts on_safe_contexts; + { + std::lock_guard event_locker{m_event_lock}; + typename Events::iterator it = m_events.find(tid); + ceph_assert(it != m_events.end()); + + Event &event = it->second; + on_safe_contexts.swap(event.on_safe_contexts); + + if (r < 0 || event.committed_io) { + // failed journal write so IO won't be sent -- or IO extent was + // overwritten by future IO operations so this was a no-op IO event + event.ret_val = r; + for (auto &future : event.futures) { + m_journaler->committed(future); + } + } + + if (event.committed_io) { + m_events.erase(it); + } else { + event.safe = true; + } + } + + ldout(cct, 20) << this << " " << __func__ << ": " + << "completing tid=" << tid << dendl; + + // alert the cache about the journal event status + for (Contexts::iterator it = on_safe_contexts.begin(); + it != on_safe_contexts.end(); ++it) { + (*it)->complete(r); + } +} + +template +void Journal::handle_op_event_safe(int r, uint64_t tid, + const Future &op_start_future, + const Future &op_finish_future, + Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", " + << "tid=" << tid << dendl; + + // journal will be flushed before closing + ceph_assert(m_state == STATE_READY || m_state == STATE_STOPPING); + if 
(r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to commit op event: " << cpp_strerror(r) << dendl; + } + + m_journaler->committed(op_start_future); + m_journaler->committed(op_finish_future); + + // reduce the replay window after committing an op event + m_journaler->flush_commit_position(on_safe); +} + +template +void Journal::stop_recording() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_journaler != NULL); + + ceph_assert(m_state == STATE_READY); + transition_state(STATE_STOPPING, 0); + + m_journaler->stop_append(util::create_async_context_callback( + m_image_ctx, create_context_callback< + Journal, &Journal::handle_recording_stopped>(this))); +} + +template +void Journal::transition_state(State state, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl; + ceph_assert(ceph_mutex_is_locked(m_lock)); + m_state = state; + + if (m_error_result == 0 && r < 0) { + m_error_result = r; + } + + if (is_steady_state()) { + auto wait_for_state_contexts(std::move(m_wait_for_state_contexts)); + m_wait_for_state_contexts.clear(); + + for (auto ctx : wait_for_state_contexts) { + ctx->complete(m_error_result); + } + } +} + +template +bool Journal::is_steady_state() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + switch (m_state) { + case STATE_READY: + case STATE_CLOSED: + return true; + case STATE_UNINITIALIZED: + case STATE_INITIALIZING: + case STATE_REPLAYING: + case STATE_FLUSHING_RESTART: + case STATE_RESTARTING_REPLAY: + case STATE_FLUSHING_REPLAY: + case STATE_STOPPING: + case STATE_CLOSING: + break; + } + return false; +} + +template +void Journal::wait_for_steady_state(Context *on_state) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(!is_steady_state()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": on_state=" << on_state + << dendl; + m_wait_for_state_contexts.push_back(on_state); +} + +template +int Journal::is_resync_requested(bool *do_resync) { + std::lock_guard l{m_lock}; + return check_resync_requested(do_resync); +} + +template +int Journal::check_resync_requested(bool *do_resync) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(do_resync != nullptr); + + cls::journal::Client client; + int r = m_journaler->get_cached_client(IMAGE_CLIENT_ID, &client); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to retrieve client: " << cpp_strerror(r) << dendl; + return r; + } + + librbd::journal::ClientData client_data; + auto bl_it = client.data.cbegin(); + try { + decode(client_data, bl_it); + } catch (const buffer::error &err) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to decode client data: " << err.what() << dendl; + return -EINVAL; + } + + journal::ImageClientMeta *image_client_meta = + boost::get(&client_data.client_meta); + if (image_client_meta == nullptr) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to access image client meta struct" << dendl; + return -EINVAL; + } + + *do_resync = image_client_meta->resync_requested; + + return 0; +} + +struct C_RefreshTags : public Context { + AsyncOpTracker &async_op_tracker; + Context *on_finish = nullptr; + + ceph::mutex lock = + ceph::make_mutex("librbd::Journal::C_RefreshTags::lock"); + uint64_t tag_tid = 0; + journal::TagData tag_data; + + explicit C_RefreshTags(AsyncOpTracker &async_op_tracker) + : 
async_op_tracker(async_op_tracker) { + async_op_tracker.start_op(); + } + ~C_RefreshTags() override { + async_op_tracker.finish_op(); + } + + void finish(int r) override { + on_finish->complete(r); + } +}; + +template +void Journal::handle_metadata_updated() { + CephContext *cct = m_image_ctx.cct; + std::lock_guard locker{m_lock}; + + if (m_state != STATE_READY && !is_journal_replaying(m_lock)) { + return; + } else if (is_tag_owner(m_lock)) { + ldout(cct, 20) << this << " " << __func__ << ": primary image" << dendl; + return; + } else if (m_listeners.empty()) { + ldout(cct, 20) << this << " " << __func__ << ": no listeners" << dendl; + return; + } + + uint64_t refresh_sequence = ++m_refresh_sequence; + ldout(cct, 20) << this << " " << __func__ << ": " + << "refresh_sequence=" << refresh_sequence << dendl; + + // pull the most recent tags from the journal, decode, and + // update the internal tag state + C_RefreshTags *refresh_ctx = new C_RefreshTags(m_async_journal_op_tracker); + refresh_ctx->on_finish = new LambdaContext( + [this, refresh_sequence, refresh_ctx](int r) { + handle_refresh_metadata(refresh_sequence, refresh_ctx->tag_tid, + refresh_ctx->tag_data, r); + }); + C_DecodeTags *decode_tags_ctx = new C_DecodeTags( + cct, &refresh_ctx->lock, &refresh_ctx->tag_tid, + &refresh_ctx->tag_data, refresh_ctx); + m_journaler->get_tags(m_tag_tid == 0 ? 0 : m_tag_tid - 1, m_tag_class, + &decode_tags_ctx->tags, decode_tags_ctx); +} + +template +void Journal::handle_refresh_metadata(uint64_t refresh_sequence, + uint64_t tag_tid, + journal::TagData tag_data, int r) { + CephContext *cct = m_image_ctx.cct; + std::unique_lock locker{m_lock}; + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": failed to refresh metadata: " + << cpp_strerror(r) << dendl; + return; + } else if (m_state != STATE_READY && !is_journal_replaying(m_lock)) { + return; + } else if (refresh_sequence != m_refresh_sequence) { + // another, more up-to-date refresh is in-flight + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": " + << "refresh_sequence=" << refresh_sequence << ", " + << "tag_tid=" << tag_tid << ", " + << "tag_data=" << tag_data << dendl; + m_listener_cond.wait(locker, [this] { return !m_listener_notify; }); + + bool was_tag_owner = is_tag_owner(m_lock); + if (m_tag_tid < tag_tid) { + m_tag_tid = tag_tid; + m_tag_data = tag_data; + } + bool promoted_to_primary = (!was_tag_owner && is_tag_owner(m_lock)); + + bool resync_requested = false; + r = check_resync_requested(&resync_requested); + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to check if a resync was requested" << dendl; + return; + } + + Listeners listeners(m_listeners); + m_listener_notify = true; + locker.unlock(); + + if (promoted_to_primary) { + for (auto listener : listeners) { + listener->handle_promoted(); + } + } else if (resync_requested) { + for (auto listener : listeners) { + listener->handle_resync(); + } + } + + locker.lock(); + m_listener_notify = false; + m_listener_cond.notify_all(); +} + +template +void Journal::add_listener(journal::Listener *listener) { + std::lock_guard locker{m_lock}; + m_listeners.insert(listener); +} + +template +void Journal::remove_listener(journal::Listener *listener) { + std::unique_lock locker{m_lock}; + m_listener_cond.wait(locker, [this] { return !m_listener_notify; }); + m_listeners.erase(listener); +} + +} // namespace librbd + +#ifndef TEST_F +template class librbd::Journal; +#endif diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h new file mode 
100644 index 000000000..1ef9ffa88 --- /dev/null +++ b/src/librbd/Journal.h @@ -0,0 +1,380 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_H +#define CEPH_LIBRBD_JOURNAL_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "include/interval_set.h" +#include "include/rados/librados_fwd.hpp" +#include "common/AsyncOpTracker.h" +#include "common/Cond.h" +#include "common/Timer.h" +#include "common/RefCountedObj.h" +#include "journal/Future.h" +#include "journal/JournalMetadataListener.h" +#include "journal/ReplayEntry.h" +#include "journal/ReplayHandler.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" + +#include <algorithm> +#include <list> +#include <string> +#include <atomic> +#include <unordered_map> + +class ContextWQ; +namespace journal { class Journaler; } + +namespace librbd { + +class ImageCtx; + +namespace journal { template <typename> class Replay; } + +template <typename ImageCtxT = ImageCtx> +class Journal : public RefCountedObject { +public: + /** + * @verbatim + * + * <start> + * | + * v + * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> FLUSHING ---> READY + * | * . ^ * . * | + * | * . | * . * | + * | * . | (error) * . . . . . . . * | + * | * . | * . * | + * | * . | v . * | + * | * . | FLUSHING_RESTART . * | + * | * . | | . * | + * | * . | | . * | + * | * . | v . * v + * | * . | RESTARTING < * * * * * STOPPING + * | * . | | . | + * | * . | | . | + * | * * * * * * . \-------------/ . | + * | * (error) . . | + * | * . . . . . . . . . . . . . . . . | + * | * . . | + * | v v v | + * | CLOSED <----- CLOSING <---------------------------------------/ + * | | + * | v + * \---> <finish> + * + * @endverbatim + */ + enum State { + STATE_UNINITIALIZED, + STATE_INITIALIZING, + STATE_REPLAYING, + STATE_FLUSHING_RESTART, + STATE_RESTARTING_REPLAY, + STATE_FLUSHING_REPLAY, + STATE_READY, + STATE_STOPPING, + STATE_CLOSING, + STATE_CLOSED + }; + + static const std::string IMAGE_CLIENT_ID; + static const std::string LOCAL_MIRROR_UUID; + static const std::string ORPHAN_MIRROR_UUID; + + Journal(ImageCtxT &image_ctx); + ~Journal(); + + static void get_work_queue(CephContext *cct, ContextWQ **work_queue); + + static bool is_journal_supported(ImageCtxT &image_ctx); + static int create(librados::IoCtx &io_ctx, const std::string &image_id, + uint8_t order, uint8_t splay_width, + const std::string &object_pool); + static int remove(librados::IoCtx &io_ctx, const std::string &image_id); + static int reset(librados::IoCtx &io_ctx, const std::string &image_id); + + static void is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner, + Context *on_finish); + static void is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id, + bool *is_tag_owner, asio::ContextWQ *op_work_queue, + Context *on_finish); + static void get_tag_owner(librados::IoCtx& io_ctx, std::string& image_id, + std::string *mirror_uuid, + asio::ContextWQ *op_work_queue, Context *on_finish); + static int request_resync(ImageCtxT *image_ctx); + static void promote(ImageCtxT *image_ctx, Context *on_finish); + static void demote(ImageCtxT *image_ctx, Context *on_finish); + + bool is_journal_ready() const; + bool is_journal_replaying() const; + bool is_journal_appending() const; + + void wait_for_journal_ready(Context *on_ready); + + void open(Context *on_finish); + void close(Context *on_finish); + + bool is_tag_owner() const; + uint64_t get_tag_tid() const; + journal::TagData get_tag_data() const; + + void allocate_local_tag(Context *on_finish);
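+  // Usage sketch (illustrative only): the typical IO write path, assuming an
+  // opened journal in the READY state and caller-provided offset/length/bl:
+  //
+  //   uint64_t tid = journal->append_write_event(offset, length, bl, false);
+  //   // ... issue the image write; once the image IO is safe on disk:
+  //   journal->commit_io_event(tid, 0);
+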
void allocate_tag(const std::string &mirror_uuid, + const journal::TagPredecessor &predecessor, + Context *on_finish); + + void flush_commit_position(Context *on_finish); + + void user_flushed(); + + uint64_t append_write_event(uint64_t offset, size_t length, + const bufferlist &bl, + bool flush_entry); + uint64_t append_compare_and_write_event(uint64_t offset, + size_t length, + const bufferlist &cmp_bl, + const bufferlist &write_bl, + bool flush_entry); + uint64_t append_io_event(journal::EventEntry &&event_entry, + uint64_t offset, size_t length, + bool flush_entry, int filter_ret_val); + void commit_io_event(uint64_t tid, int r); + void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length, + int r); + + void append_op_event(uint64_t op_tid, journal::EventEntry &&event_entry, + Context *on_safe); + void commit_op_event(uint64_t tid, int r, Context *on_safe); + void replay_op_ready(uint64_t op_tid, Context *on_resume); + + void flush_event(uint64_t tid, Context *on_safe); + void wait_event(uint64_t tid, Context *on_safe); + + uint64_t allocate_op_tid() { + uint64_t op_tid = ++m_op_tid; + ceph_assert(op_tid != 0); + return op_tid; + } + + void start_external_replay(journal::Replay **journal_replay, + Context *on_start); + void stop_external_replay(); + + void add_listener(journal::Listener *listener); + void remove_listener(journal::Listener *listener); + + int is_resync_requested(bool *do_resync); + + inline ContextWQ *get_work_queue() { + return m_work_queue; + } + +private: + ImageCtxT &m_image_ctx; + + // mock unit testing support + typedef journal::TypeTraits TypeTraits; + typedef typename TypeTraits::Journaler Journaler; + typedef typename TypeTraits::Future Future; + typedef typename TypeTraits::ReplayEntry ReplayEntry; + + typedef std::list Bufferlists; + typedef std::list Contexts; + typedef std::list Futures; + typedef interval_set ExtentInterval; + + struct Event { + Futures futures; + Contexts on_safe_contexts; + ExtentInterval pending_extents; + int filter_ret_val = 0; + bool committed_io = false; + bool safe = false; + int ret_val = 0; + + Event() { + } + Event(const Futures &_futures, uint64_t offset, size_t length, + int filter_ret_val) + : futures(_futures), filter_ret_val(filter_ret_val) { + if (length > 0) { + pending_extents.insert(offset, length); + } + } + }; + + typedef std::unordered_map Events; + typedef std::unordered_map TidToFutures; + + struct C_IOEventSafe : public Context { + Journal *journal; + uint64_t tid; + + C_IOEventSafe(Journal *_journal, uint64_t _tid) + : journal(_journal), tid(_tid) { + } + + void finish(int r) override { + journal->handle_io_event_safe(r, tid); + } + }; + + struct C_OpEventSafe : public Context { + Journal *journal; + uint64_t tid; + Future op_start_future; + Future op_finish_future; + Context *on_safe; + + C_OpEventSafe(Journal *journal, uint64_t tid, const Future &op_start_future, + const Future &op_finish_future, Context *on_safe) + : journal(journal), tid(tid), op_start_future(op_start_future), + op_finish_future(op_finish_future), on_safe(on_safe) { + } + + void finish(int r) override { + journal->handle_op_event_safe(r, tid, op_start_future, op_finish_future, + on_safe); + } + }; + + struct C_ReplayProcessSafe : public Context { + Journal *journal; + ReplayEntry replay_entry; + + C_ReplayProcessSafe(Journal *journal, ReplayEntry &&replay_entry) : + journal(journal), replay_entry(std::move(replay_entry)) { + } + void finish(int r) override { + journal->handle_replay_process_safe(replay_entry, r); + } + }; + + 
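+  // Event lifecycle note (illustrative only): an entry in m_events is
+  // retired only after the journal append is safe on disk *and* the image
+  // IO covering every pending extent has been committed. A write split
+  // across two extents, for example, survives the first commit:
+  //
+  //   journal->commit_io_event_extent(tid, off1, len1, 0); // still pending
+  //   journal->commit_io_event_extent(tid, off2, len2, 0); // event retired
+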
struct ReplayHandler : public ::journal::ReplayHandler { + Journal *journal; + ReplayHandler(Journal *_journal) : journal(_journal) { + } + + void handle_entries_available() override { + journal->handle_replay_ready(); + } + void handle_complete(int r) override { + journal->handle_replay_complete(r); + } + }; + + ContextWQ *m_work_queue = nullptr; + SafeTimer *m_timer = nullptr; + ceph::mutex *m_timer_lock = nullptr; + + Journaler *m_journaler; + mutable ceph::mutex m_lock = ceph::make_mutex("Journal::m_lock"); + State m_state; + uint64_t m_max_append_size = 0; + uint64_t m_tag_class = 0; + uint64_t m_tag_tid = 0; + journal::ImageClientMeta m_client_meta; + journal::TagData m_tag_data; + + int m_error_result; + Contexts m_wait_for_state_contexts; + + ReplayHandler m_replay_handler; + bool m_close_pending; + + ceph::mutex m_event_lock = ceph::make_mutex("Journal::m_event_lock"); + uint64_t m_event_tid; + Events m_events; + + std::atomic m_user_flushed = false; + + std::atomic m_op_tid = { 0 }; + TidToFutures m_op_futures; + + bool m_processing_entry = false; + bool m_blocking_writes; + + journal::Replay *m_journal_replay; + + AsyncOpTracker m_async_journal_op_tracker; + + struct MetadataListener : public ::journal::JournalMetadataListener { + Journal *journal; + + MetadataListener(Journal *journal) : journal(journal) { } + + void handle_update(::journal::JournalMetadata *) override; + } m_metadata_listener; + + typedef std::set Listeners; + Listeners m_listeners; + ceph::condition_variable m_listener_cond; + bool m_listener_notify = false; + + uint64_t m_refresh_sequence = 0; + + bool is_journal_replaying(const ceph::mutex &) const; + bool is_tag_owner(const ceph::mutex &) const; + + uint64_t append_io_events(journal::EventType event_type, + const Bufferlists &bufferlists, + uint64_t offset, size_t length, bool flush_entry, + int filter_ret_val); + Future wait_event(ceph::mutex &lock, uint64_t tid, Context *on_safe); + + void create_journaler(); + void destroy_journaler(int r); + void recreate_journaler(int r); + + void complete_event(typename Events::iterator it, int r); + + void start_append(); + + void handle_open(int r); + + void handle_replay_ready(); + void handle_replay_complete(int r); + void handle_replay_process_ready(int r); + void handle_replay_process_safe(ReplayEntry replay_entry, int r); + + void handle_start_external_replay(int r, + journal::Replay **journal_replay, + Context *on_finish); + + void handle_flushing_restart(int r); + void handle_flushing_replay(); + + void handle_recording_stopped(int r); + + void handle_journal_destroyed(int r); + + void handle_io_event_safe(int r, uint64_t tid); + void handle_op_event_safe(int r, uint64_t tid, const Future &op_start_future, + const Future &op_finish_future, Context *on_safe); + + void stop_recording(); + + void transition_state(State state, int r); + + bool is_steady_state() const; + void wait_for_steady_state(Context *on_state); + + int check_resync_requested(bool *do_resync); + + void handle_metadata_updated(); + void handle_refresh_metadata(uint64_t refresh_sequence, uint64_t tag_tid, + journal::TagData tag_data, int r); + +}; + +} // namespace librbd + +extern template class librbd::Journal; + +#endif // CEPH_LIBRBD_JOURNAL_H diff --git a/src/librbd/LibrbdAdminSocketHook.cc b/src/librbd/LibrbdAdminSocketHook.cc new file mode 100644 index 000000000..159c9eda5 --- /dev/null +++ b/src/librbd/LibrbdAdminSocketHook.cc @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab + +#include "common/errno.h" + +#include "librbd/ImageCtx.h" +#include "librbd/LibrbdAdminSocketHook.h" +#include "librbd/internal.h" +#include "librbd/api/Io.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbdadminsocket: " + +namespace librbd { + +class LibrbdAdminSocketCommand { +public: + virtual ~LibrbdAdminSocketCommand() {} + virtual int call(Formatter *f) = 0; +}; + +class FlushCacheCommand : public LibrbdAdminSocketCommand { +public: + explicit FlushCacheCommand(ImageCtx *ictx) : ictx(ictx) {} + + int call(Formatter *f) override { + return api::Io<>::flush(*ictx); + } + +private: + ImageCtx *ictx; +}; + +struct InvalidateCacheCommand : public LibrbdAdminSocketCommand { +public: + explicit InvalidateCacheCommand(ImageCtx *ictx) : ictx(ictx) {} + + int call(Formatter *f) override { + return invalidate_cache(ictx); + } + +private: + ImageCtx *ictx; +}; + +LibrbdAdminSocketHook::LibrbdAdminSocketHook(ImageCtx *ictx) : + admin_socket(ictx->cct->get_admin_socket()) { + + std::string command; + std::string imagename; + int r; + + imagename = ictx->md_ctx.get_pool_name() + "/" + ictx->name; + command = "rbd cache flush " + imagename; + + r = admin_socket->register_command(command, this, + "flush rbd image " + imagename + + " cache"); + if (r == 0) { + commands[command] = new FlushCacheCommand(ictx); + } + + command = "rbd cache invalidate " + imagename; + r = admin_socket->register_command(command, this, + "invalidate rbd image " + imagename + + " cache"); + if (r == 0) { + commands[command] = new InvalidateCacheCommand(ictx); + } +} + +LibrbdAdminSocketHook::~LibrbdAdminSocketHook() { + (void)admin_socket->unregister_commands(this); + for (Commands::const_iterator i = commands.begin(); i != commands.end(); + ++i) { + delete i->second; + } +} + +int LibrbdAdminSocketHook::call(std::string_view command, + const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& errss, + bufferlist& out) { + Commands::const_iterator i = commands.find(command); + ceph_assert(i != commands.end()); + return i->second->call(f); +} + +} // namespace librbd diff --git a/src/librbd/LibrbdAdminSocketHook.h b/src/librbd/LibrbdAdminSocketHook.h new file mode 100644 index 000000000..98ff06abb --- /dev/null +++ b/src/librbd/LibrbdAdminSocketHook.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H +#define CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H + +#include + +#include "common/admin_socket.h" + +namespace librbd { + + struct ImageCtx; + class LibrbdAdminSocketCommand; + + class LibrbdAdminSocketHook : public AdminSocketHook { + public: + LibrbdAdminSocketHook(ImageCtx *ictx); + ~LibrbdAdminSocketHook() override; + + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, + std::ostream& errss, + bufferlist& out) override; + + private: + typedef std::map> Commands; + + AdminSocket *admin_socket; + Commands commands; + }; +} + +#endif diff --git a/src/librbd/ManagedLock.cc b/src/librbd/ManagedLock.cc new file mode 100644 index 000000000..166a31c61 --- /dev/null +++ b/src/librbd/ManagedLock.cc @@ -0,0 +1,859 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ManagedLock.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Watcher.h" +#include "librbd/asio/ContextWQ.h" +#include 
"librbd/managed_lock/AcquireRequest.h" +#include "librbd/managed_lock/BreakRequest.h" +#include "librbd/managed_lock/GetLockerRequest.h" +#include "librbd/managed_lock/ReleaseRequest.h" +#include "librbd/managed_lock/ReacquireRequest.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/managed_lock/Utils.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ManagedLock: " << this << " " \ + << __func__ << ": " + +namespace librbd { + +using std::string; +using namespace managed_lock; + +namespace { + +template +struct C_SendLockRequest : public Context { + R* request; + explicit C_SendLockRequest(R* request) : request(request) { + } + void finish(int r) override { + request->send(); + } +}; + +struct C_Tracked : public Context { + AsyncOpTracker &tracker; + Context *ctx; + C_Tracked(AsyncOpTracker &tracker, Context *ctx) + : tracker(tracker), ctx(ctx) { + tracker.start_op(); + } + ~C_Tracked() override { + tracker.finish_op(); + } + void finish(int r) override { + ctx->complete(r); + } +}; + +} // anonymous namespace + +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; +using managed_lock::util::decode_lock_cookie; +using managed_lock::util::encode_lock_cookie; + +template +ManagedLock::ManagedLock(librados::IoCtx &ioctx, AsioEngine& asio_engine, + const string& oid, Watcher *watcher, Mode mode, + bool blocklist_on_break_lock, + uint32_t blocklist_expire_seconds) + : m_lock(ceph::make_mutex(unique_lock_name("librbd::ManagedLock::m_lock", this))), + m_ioctx(ioctx), m_cct(reinterpret_cast(ioctx.cct())), + m_asio_engine(asio_engine), + m_work_queue(asio_engine.get_work_queue()), + m_oid(oid), + m_watcher(watcher), + m_mode(mode), + m_blocklist_on_break_lock(blocklist_on_break_lock), + m_blocklist_expire_seconds(blocklist_expire_seconds), + m_state(STATE_UNLOCKED) { +} + +template +ManagedLock::~ManagedLock() { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_SHUTDOWN || m_state == STATE_UNLOCKED || + m_state == STATE_UNINITIALIZED); + if (m_state == STATE_UNINITIALIZED) { + // never initialized -- ensure any in-flight ops are complete + // since we wouldn't expect shut_down to be invoked + C_SaferCond ctx; + m_async_op_tracker.wait_for_ops(&ctx); + ctx.wait(); + } + ceph_assert(m_async_op_tracker.empty()); +} + +template +bool ManagedLock::is_lock_owner() const { + std::lock_guard locker{m_lock}; + + return is_lock_owner(m_lock); +} + +template +bool ManagedLock::is_lock_owner(ceph::mutex &lock) const { + + ceph_assert(ceph_mutex_is_locked(m_lock)); + + bool lock_owner; + + switch (m_state) { + case STATE_LOCKED: + case STATE_REACQUIRING: + case STATE_PRE_SHUTTING_DOWN: + case STATE_POST_ACQUIRING: + case STATE_PRE_RELEASING: + lock_owner = true; + break; + default: + lock_owner = false; + break; + } + + ldout(m_cct, 20) << lock_owner << dendl; + return lock_owner; +} + +template +void ManagedLock::shut_down(Context *on_shut_down) { + ldout(m_cct, 10) << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(!is_state_shutdown()); + + if (m_state == STATE_WAITING_FOR_REGISTER) { + // abort stalled acquire lock state + ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl; + Action active_action = get_active_action(); + ceph_assert(active_action == ACTION_TRY_LOCK || + active_action == ACTION_ACQUIRE_LOCK); + 
complete_active_action(STATE_UNLOCKED, -ERESTART); + } + + execute_action(ACTION_SHUT_DOWN, on_shut_down); +} + +template +void ManagedLock::acquire_lock(Context *on_acquired) { + int r = 0; + { + std::lock_guard locker{m_lock}; + if (is_state_shutdown()) { + r = -ERESTART; + } else if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) { + ldout(m_cct, 10) << dendl; + execute_action(ACTION_ACQUIRE_LOCK, on_acquired); + return; + } + } + + if (on_acquired != nullptr) { + on_acquired->complete(r); + } +} + +template +void ManagedLock::try_acquire_lock(Context *on_acquired) { + int r = 0; + { + std::lock_guard locker{m_lock}; + if (is_state_shutdown()) { + r = -ERESTART; + } else if (m_state != STATE_LOCKED || !m_actions_contexts.empty()) { + ldout(m_cct, 10) << dendl; + execute_action(ACTION_TRY_LOCK, on_acquired); + return; + } + } + + if (on_acquired != nullptr) { + on_acquired->complete(r); + } +} + +template +void ManagedLock::release_lock(Context *on_released) { + int r = 0; + { + std::lock_guard locker{m_lock}; + if (is_state_shutdown()) { + r = -ERESTART; + } else if (m_state != STATE_UNLOCKED || !m_actions_contexts.empty()) { + ldout(m_cct, 10) << dendl; + execute_action(ACTION_RELEASE_LOCK, on_released); + return; + } + } + + if (on_released != nullptr) { + on_released->complete(r); + } +} + +template +void ManagedLock::reacquire_lock(Context *on_reacquired) { + { + std::lock_guard locker{m_lock}; + + if (m_state == STATE_WAITING_FOR_REGISTER || + m_state == STATE_WAITING_FOR_LOCK) { + // restart the acquire lock process now that watch is valid + ldout(m_cct, 10) << "woke up waiting (re)acquire" << dendl; + Action active_action = get_active_action(); + ceph_assert(active_action == ACTION_TRY_LOCK || + active_action == ACTION_ACQUIRE_LOCK); + execute_next_action(); + } else if (!is_state_shutdown() && + (m_state == STATE_LOCKED || + m_state == STATE_ACQUIRING || + m_state == STATE_POST_ACQUIRING)) { + // interlock the lock operation with other state ops + ldout(m_cct, 10) << dendl; + execute_action(ACTION_REACQUIRE_LOCK, on_reacquired); + return; + } + } + + // ignore request if shutdown or not in a locked-related state + if (on_reacquired != nullptr) { + on_reacquired->complete(0); + } +} + +template +void ManagedLock::get_locker(managed_lock::Locker *locker, + Context *on_finish) { + ldout(m_cct, 10) << dendl; + + int r; + { + std::lock_guard l{m_lock}; + if (is_state_shutdown()) { + r = -ERESTART; + } else { + on_finish = new C_Tracked(m_async_op_tracker, on_finish); + auto req = managed_lock::GetLockerRequest::create( + m_ioctx, m_oid, m_mode == EXCLUSIVE, locker, on_finish); + req->send(); + return; + } + } + + on_finish->complete(r); +} + +template +void ManagedLock::break_lock(const managed_lock::Locker &locker, + bool force_break_lock, Context *on_finish) { + ldout(m_cct, 10) << dendl; + + int r; + { + std::lock_guard l{m_lock}; + if (is_state_shutdown()) { + r = -ERESTART; + } else if (is_lock_owner(m_lock)) { + r = -EBUSY; + } else { + on_finish = new C_Tracked(m_async_op_tracker, on_finish); + auto req = managed_lock::BreakRequest::create( + m_ioctx, m_asio_engine, m_oid, locker, m_mode == EXCLUSIVE, + m_blocklist_on_break_lock, m_blocklist_expire_seconds, force_break_lock, + on_finish); + req->send(); + return; + } + } + + on_finish->complete(r); +} + +template +int ManagedLock::assert_header_locked() { + ldout(m_cct, 10) << dendl; + + librados::ObjectReadOperation op; + { + std::lock_guard locker{m_lock}; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, + 
(m_mode == EXCLUSIVE ? ClsLockType::EXCLUSIVE : + ClsLockType::SHARED), + m_cookie, + managed_lock::util::get_watcher_lock_tag()); + } + + int r = m_ioctx.operate(m_oid, &op, nullptr); + if (r < 0) { + if (r == -EBLOCKLISTED) { + ldout(m_cct, 5) << "client is not lock owner -- client blocklisted" + << dendl; + } else if (r == -ENOENT) { + ldout(m_cct, 5) << "client is not lock owner -- no lock detected" + << dendl; + } else if (r == -EBUSY) { + ldout(m_cct, 5) << "client is not lock owner -- owned by different client" + << dendl; + } else { + lderr(m_cct) << "failed to verify lock ownership: " << cpp_strerror(r) + << dendl; + } + + return r; + } + + return 0; +} + +template +void ManagedLock::shutdown_handler(int r, Context *on_finish) { + on_finish->complete(r); +} + +template +void ManagedLock::pre_acquire_lock_handler(Context *on_finish) { + on_finish->complete(0); +} + +template +void ManagedLock::post_acquire_lock_handler(int r, Context *on_finish) { + on_finish->complete(r); +} + +template +void ManagedLock::pre_release_lock_handler(bool shutting_down, + Context *on_finish) { + on_finish->complete(0); +} + +template +void ManagedLock::post_release_lock_handler(bool shutting_down, int r, + Context *on_finish) { + on_finish->complete(r); +} + +template +void ManagedLock::post_reacquire_lock_handler(int r, Context *on_finish) { + on_finish->complete(r); +} + +template +bool ManagedLock::is_transition_state() const { + switch (m_state) { + case STATE_ACQUIRING: + case STATE_WAITING_FOR_REGISTER: + case STATE_REACQUIRING: + case STATE_RELEASING: + case STATE_PRE_SHUTTING_DOWN: + case STATE_SHUTTING_DOWN: + case STATE_INITIALIZING: + case STATE_WAITING_FOR_LOCK: + case STATE_POST_ACQUIRING: + case STATE_PRE_RELEASING: + return true; + case STATE_UNLOCKED: + case STATE_LOCKED: + case STATE_SHUTDOWN: + case STATE_UNINITIALIZED: + break; + } + return false; +} + +template +void ManagedLock::append_context(Action action, Context *ctx) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + for (auto &action_ctxs : m_actions_contexts) { + if (action == action_ctxs.first) { + if (ctx != nullptr) { + action_ctxs.second.push_back(ctx); + } + return; + } + } + + Contexts contexts; + if (ctx != nullptr) { + contexts.push_back(ctx); + } + m_actions_contexts.push_back({action, std::move(contexts)}); +} + +template +void ManagedLock::execute_action(Action action, Context *ctx) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + append_context(action, ctx); + if (!is_transition_state()) { + execute_next_action(); + } +} + +template +void ManagedLock::execute_next_action() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(!m_actions_contexts.empty()); + switch (get_active_action()) { + case ACTION_ACQUIRE_LOCK: + case ACTION_TRY_LOCK: + send_acquire_lock(); + break; + case ACTION_REACQUIRE_LOCK: + send_reacquire_lock(); + break; + case ACTION_RELEASE_LOCK: + send_release_lock(); + break; + case ACTION_SHUT_DOWN: + send_shutdown(); + break; + default: + ceph_abort(); + break; + } +} + +template +typename ManagedLock::Action ManagedLock::get_active_action() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(!m_actions_contexts.empty()); + return m_actions_contexts.front().first; +} + +template +void ManagedLock::complete_active_action(State next_state, int r) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(!m_actions_contexts.empty()); + + ActionContexts action_contexts(std::move(m_actions_contexts.front())); + m_actions_contexts.pop_front(); + m_state = next_state; + + 
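// drop m_lock before firing the queued completions: a callback may re-enter
// this lock (e.g. to queue a follow-up action), which would self-deadlock on
// a non-recursive mutex; the lock is re-taken before dispatching any next
// action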
m_lock.unlock(); + for (auto ctx : action_contexts.second) { + ctx->complete(r); + } + m_lock.lock(); + + if (!is_transition_state() && !m_actions_contexts.empty()) { + execute_next_action(); + } +} + +template +bool ManagedLock::is_state_shutdown() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + switch (m_state) { + case STATE_PRE_SHUTTING_DOWN: + case STATE_SHUTTING_DOWN: + case STATE_SHUTDOWN: + return true; + default: + break; + } + + return (!m_actions_contexts.empty() && + m_actions_contexts.back().first == ACTION_SHUT_DOWN); +} + +template +void ManagedLock::send_acquire_lock() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + if (m_state == STATE_LOCKED) { + complete_active_action(STATE_LOCKED, 0); + return; + } + + ldout(m_cct, 10) << dendl; + + uint64_t watch_handle = m_watcher->get_watch_handle(); + if (watch_handle == 0) { + if (m_watcher->is_blocklisted()) { + lderr(m_cct) << "watcher not registered - client blocklisted" << dendl; + complete_active_action(STATE_UNLOCKED, -EBLOCKLISTED); + } else { + lderr(m_cct) << "watcher not registered - delaying request" << dendl; + m_state = STATE_WAITING_FOR_REGISTER; + + // shut down might race w/ release/re-acquire of the lock + if (is_state_shutdown()) { + complete_active_action(STATE_UNLOCKED, -ERESTART); + } + } + return; + } + + m_state = STATE_ACQUIRING; + m_cookie = encode_lock_cookie(watch_handle); + + m_work_queue->queue(new LambdaContext([this](int r) { + pre_acquire_lock_handler(create_context_callback< + ManagedLock, &ManagedLock::handle_pre_acquire_lock>(this)); + })); +} + +template +void ManagedLock::handle_pre_acquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + handle_acquire_lock(r); + return; + } + + using managed_lock::AcquireRequest; + AcquireRequest* req = AcquireRequest::create( + m_ioctx, m_watcher, m_asio_engine, m_oid, m_cookie, m_mode == EXCLUSIVE, + m_blocklist_on_break_lock, m_blocklist_expire_seconds, + create_context_callback< + ManagedLock, &ManagedLock::handle_acquire_lock>(this)); + m_work_queue->queue(new C_SendLockRequest>(req), 0); +} + +template +void ManagedLock::handle_acquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -EBUSY || r == -EAGAIN || r == -EROFS) { + ldout(m_cct, 5) << "unable to acquire exclusive lock" << dendl; + } else if (r < 0) { + lderr(m_cct) << "failed to acquire exclusive lock: " << cpp_strerror(r) + << dendl; + } else { + ldout(m_cct, 5) << "successfully acquired exclusive lock" << dendl; + } + + m_post_next_state = (r < 0 ? 
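/* acquire failed: fall back to UNLOCKED; acquire succeeded: we own the lock */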
STATE_UNLOCKED : STATE_LOCKED); + + m_work_queue->queue(new LambdaContext([this, r](int ret) { + post_acquire_lock_handler(r, create_context_callback< + ManagedLock, &ManagedLock::handle_post_acquire_lock>(this)); + })); +} + +template +void ManagedLock::handle_post_acquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::lock_guard locker{m_lock}; + + if (r < 0 && m_post_next_state == STATE_LOCKED) { + // release_lock without calling pre and post handlers + revert_to_unlock_state(r); + } else if (r != -ECANCELED) { + // fail the lock request + complete_active_action(m_post_next_state, r); + } +} + +template +void ManagedLock::revert_to_unlock_state(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + using managed_lock::ReleaseRequest; + ReleaseRequest* req = ReleaseRequest::create(m_ioctx, m_watcher, + m_work_queue, m_oid, m_cookie, + new LambdaContext([this, r](int ret) { + std::lock_guard locker{m_lock}; + ceph_assert(ret == 0); + complete_active_action(STATE_UNLOCKED, r); + })); + m_work_queue->queue(new C_SendLockRequest>(req)); +} + +template +void ManagedLock::send_reacquire_lock() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + if (m_state != STATE_LOCKED) { + complete_active_action(m_state, 0); + return; + } + + ldout(m_cct, 10) << dendl; + m_state = STATE_REACQUIRING; + + uint64_t watch_handle = m_watcher->get_watch_handle(); + if (watch_handle == 0) { + // watch (re)failed while recovering + lderr(m_cct) << "aborting reacquire due to invalid watch handle" + << dendl; + + // treat double-watch failure as a lost lock and invoke the + // release/acquire handlers + release_acquire_lock(); + complete_active_action(STATE_LOCKED, 0); + return; + } + + m_new_cookie = encode_lock_cookie(watch_handle); + if (m_cookie == m_new_cookie && m_blocklist_on_break_lock) { + ldout(m_cct, 10) << "skipping reacquire since cookie still valid" + << dendl; + auto ctx = create_context_callback< + ManagedLock, &ManagedLock::handle_no_op_reacquire_lock>(this); + post_reacquire_lock_handler(0, ctx); + return; + } + + auto ctx = create_context_callback< + ManagedLock, &ManagedLock::handle_reacquire_lock>(this); + ctx = new LambdaContext([this, ctx](int r) { + post_reacquire_lock_handler(r, ctx); + }); + + using managed_lock::ReacquireRequest; + ReacquireRequest* req = ReacquireRequest::create(m_ioctx, m_oid, + m_cookie, m_new_cookie, m_mode == EXCLUSIVE, ctx); + m_work_queue->queue(new C_SendLockRequest>(req)); +} + +template +void ManagedLock::handle_reacquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_REACQUIRING); + + if (r < 0) { + if (r == -EOPNOTSUPP) { + ldout(m_cct, 10) << "updating lock is not supported" << dendl; + } else { + lderr(m_cct) << "failed to update lock cookie: " << cpp_strerror(r) + << dendl; + } + + release_acquire_lock(); + } else { + m_cookie = m_new_cookie; + } + + complete_active_action(STATE_LOCKED, 0); +} + +template +void ManagedLock::handle_no_op_reacquire_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + ceph_assert(m_state == STATE_REACQUIRING); + ceph_assert(r >= 0); + complete_active_action(STATE_LOCKED, 0); +} + +template +void ManagedLock::release_acquire_lock() { + assert(ceph_mutex_is_locked(m_lock)); + + if (!is_state_shutdown()) { + // queue a release and re-acquire of the lock since cookie cannot + // be updated on older OSDs + execute_action(ACTION_RELEASE_LOCK, nullptr); + + ceph_assert(!m_actions_contexts.empty()); + ActionContexts 
&action_contexts(m_actions_contexts.front()); + + // reacquire completes when the request lock completes + Contexts contexts; + std::swap(contexts, action_contexts.second); + if (contexts.empty()) { + execute_action(ACTION_ACQUIRE_LOCK, nullptr); + } else { + for (auto ctx : contexts) { + execute_action(ACTION_ACQUIRE_LOCK, ctx); + } + } + } +} + +template +void ManagedLock::send_release_lock() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + if (m_state == STATE_UNLOCKED) { + complete_active_action(STATE_UNLOCKED, 0); + return; + } + + ldout(m_cct, 10) << dendl; + m_state = STATE_PRE_RELEASING; + + m_work_queue->queue(new LambdaContext([this](int r) { + pre_release_lock_handler(false, create_context_callback< + ManagedLock, &ManagedLock::handle_pre_release_lock>(this)); + })); +} + +template +void ManagedLock::handle_pre_release_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + { + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_PRE_RELEASING); + m_state = STATE_RELEASING; + } + + if (r < 0) { + handle_release_lock(r); + return; + } + + using managed_lock::ReleaseRequest; + ReleaseRequest* req = ReleaseRequest::create(m_ioctx, m_watcher, + m_work_queue, m_oid, m_cookie, + create_context_callback< + ManagedLock, &ManagedLock::handle_release_lock>(this)); + m_work_queue->queue(new C_SendLockRequest>(req), 0); +} + +template +void ManagedLock::handle_release_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::lock_guard locker{m_lock}; + ceph_assert(m_state == STATE_RELEASING); + + if (r >= 0 || r == -EBLOCKLISTED || r == -ENOENT) { + m_cookie = ""; + m_post_next_state = STATE_UNLOCKED; + } else { + m_post_next_state = STATE_LOCKED; + } + + m_work_queue->queue(new LambdaContext([this, r](int ret) { + post_release_lock_handler(false, r, create_context_callback< + ManagedLock, &ManagedLock::handle_post_release_lock>(this)); + })); +} + +template +void ManagedLock::handle_post_release_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::lock_guard locker{m_lock}; + complete_active_action(m_post_next_state, r); +} + +template +void ManagedLock::send_shutdown() { + ldout(m_cct, 10) << dendl; + ceph_assert(ceph_mutex_is_locked(m_lock)); + if (m_state == STATE_UNLOCKED) { + m_state = STATE_SHUTTING_DOWN; + m_work_queue->queue(new LambdaContext([this](int r) { + shutdown_handler(r, create_context_callback< + ManagedLock, &ManagedLock::handle_shutdown>(this)); + })); + return; + } + + ceph_assert(m_state == STATE_LOCKED); + m_state = STATE_PRE_SHUTTING_DOWN; + + m_lock.unlock(); + m_work_queue->queue(new C_ShutDownRelease(this), 0); + m_lock.lock(); +} + +template +void ManagedLock::handle_shutdown(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + wait_for_tracked_ops(r); +} + +template +void ManagedLock::send_shutdown_release() { + ldout(m_cct, 10) << dendl; + + std::lock_guard locker{m_lock}; + + m_work_queue->queue(new LambdaContext([this](int r) { + pre_release_lock_handler(true, create_context_callback< + ManagedLock, &ManagedLock::handle_shutdown_pre_release>(this)); + })); +} + +template +void ManagedLock::handle_shutdown_pre_release(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::string cookie; + { + std::lock_guard locker{m_lock}; + cookie = m_cookie; + + ceph_assert(m_state == STATE_PRE_SHUTTING_DOWN); + m_state = STATE_SHUTTING_DOWN; + } + + using managed_lock::ReleaseRequest; + ReleaseRequest* req = ReleaseRequest::create(m_ioctx, m_watcher, + m_work_queue, m_oid, cookie, + new LambdaContext([this, r](int l) { + int rst = 
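/* prefer the pre-release handler's error (r) over the release result (l) */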
r < 0 ? r : l; + post_release_lock_handler(true, rst, create_context_callback< + ManagedLock, &ManagedLock::handle_shutdown_post_release>(this)); + })); + req->send(); + +} + +template +void ManagedLock::handle_shutdown_post_release(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + wait_for_tracked_ops(r); +} + +template +void ManagedLock::wait_for_tracked_ops(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + Context *ctx = new LambdaContext([this, r](int ret) { + complete_shutdown(r); + }); + + m_async_op_tracker.wait_for_ops(ctx); +} + +template +void ManagedLock::complete_shutdown(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to shut down lock: " << cpp_strerror(r) + << dendl; + } + + ActionContexts action_contexts; + { + std::lock_guard locker{m_lock}; + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_actions_contexts.size() == 1); + + action_contexts = std::move(m_actions_contexts.front()); + m_actions_contexts.pop_front(); + m_state = STATE_SHUTDOWN; + } + + // expect to be destroyed after firing callback + for (auto ctx : action_contexts.second) { + ctx->complete(r); + } +} + +} // namespace librbd + +template class librbd::ManagedLock; diff --git a/src/librbd/ManagedLock.h b/src/librbd/ManagedLock.h new file mode 100644 index 000000000..09fc413c0 --- /dev/null +++ b/src/librbd/ManagedLock.h @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_H +#define CEPH_LIBRBD_MANAGED_LOCK_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "common/AsyncOpTracker.h" +#include "cls/lock/cls_lock_types.h" +#include "librbd/watcher/Types.h" +#include "librbd/managed_lock/Types.h" +#include +#include +#include + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; +namespace asio { struct ContextWQ; } +namespace managed_lock { struct Locker; } + +template +class ManagedLock { +private: + typedef watcher::Traits TypeTraits; + typedef typename TypeTraits::Watcher Watcher; + +public: + static ManagedLock *create(librados::IoCtx& ioctx, + AsioEngine& asio_engine, + const std::string& oid, Watcher *watcher, + managed_lock::Mode mode, + bool blocklist_on_break_lock, + uint32_t blocklist_expire_seconds) { + return new ManagedLock(ioctx, asio_engine, oid, watcher, mode, + blocklist_on_break_lock, blocklist_expire_seconds); + } + void destroy() { + delete this; + } + + ManagedLock(librados::IoCtx& ioctx, AsioEngine& asio_engine, + const std::string& oid, Watcher *watcher, + managed_lock::Mode mode, bool blocklist_on_break_lock, + uint32_t blocklist_expire_seconds); + virtual ~ManagedLock(); + + bool is_lock_owner() const; + + void shut_down(Context *on_shutdown); + void acquire_lock(Context *on_acquired); + void try_acquire_lock(Context *on_acquired); + void release_lock(Context *on_released); + void reacquire_lock(Context *on_reacquired); + void get_locker(managed_lock::Locker *locker, Context *on_finish); + void break_lock(const managed_lock::Locker &locker, bool force_break_lock, + Context *on_finish); + + int assert_header_locked(); + + bool is_shutdown() const { + std::lock_guard l{m_lock}; + return is_state_shutdown(); + } + +protected: + mutable ceph::mutex m_lock; + + inline void set_state_uninitialized() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_UNLOCKED); + m_state = STATE_UNINITIALIZED; + } + inline void 
set_state_initializing() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_UNINITIALIZED); + m_state = STATE_INITIALIZING; + } + inline void set_state_unlocked() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_INITIALIZING || m_state == STATE_RELEASING); + m_state = STATE_UNLOCKED; + } + inline void set_state_waiting_for_lock() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_ACQUIRING); + m_state = STATE_WAITING_FOR_LOCK; + } + inline void set_state_post_acquiring() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_state == STATE_ACQUIRING); + m_state = STATE_POST_ACQUIRING; + } + + bool is_state_shutdown() const; + inline bool is_state_acquiring() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return m_state == STATE_ACQUIRING; + } + inline bool is_state_post_acquiring() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return m_state == STATE_POST_ACQUIRING; + } + inline bool is_state_releasing() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return m_state == STATE_RELEASING; + } + inline bool is_state_pre_releasing() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return m_state == STATE_PRE_RELEASING; + } + inline bool is_state_locked() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return m_state == STATE_LOCKED; + } + inline bool is_state_waiting_for_lock() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return m_state == STATE_WAITING_FOR_LOCK; + } + + inline bool is_action_acquire_lock() const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return get_active_action() == ACTION_ACQUIRE_LOCK; + } + + virtual void shutdown_handler(int r, Context *on_finish); + virtual void pre_acquire_lock_handler(Context *on_finish); + virtual void post_acquire_lock_handler(int r, Context *on_finish); + virtual void pre_release_lock_handler(bool shutting_down, + Context *on_finish); + virtual void post_release_lock_handler(bool shutting_down, int r, + Context *on_finish); + virtual void post_reacquire_lock_handler(int r, Context *on_finish); + + void execute_next_action(); + +private: + /** + * @verbatim + * + * <start> + * | + * | + * v (acquire_lock) + * UNLOCKED -----------------------------------------> ACQUIRING + * ^ | + * | | + * RELEASING | + * | | + * | | + * | (release_lock) v + * PRE_RELEASING <----------------------------------------- LOCKED + * + * <LOCKED state> + * | + * v + * REACQUIRING -------------------------------------> <finish> + * . ^ + * . | + * . . . > <RELEASE action> ---> <ACQUIRE action> ---/ + * + * <UNLOCKED/LOCKED states> + * | + * | + * v + * PRE_SHUTTING_DOWN ---> SHUTTING_DOWN ---> SHUTDOWN ---> <finish> + * + * @endverbatim + */ + enum State { + STATE_UNINITIALIZED, + STATE_INITIALIZING, + STATE_UNLOCKED, + STATE_LOCKED, + STATE_ACQUIRING, + STATE_POST_ACQUIRING, + STATE_WAITING_FOR_REGISTER, + STATE_WAITING_FOR_LOCK, + STATE_REACQUIRING, + STATE_PRE_RELEASING, + STATE_RELEASING, + STATE_PRE_SHUTTING_DOWN, + STATE_SHUTTING_DOWN, + STATE_SHUTDOWN, + }; + + enum Action { + ACTION_TRY_LOCK, + ACTION_ACQUIRE_LOCK, + ACTION_REACQUIRE_LOCK, + ACTION_RELEASE_LOCK, + ACTION_SHUT_DOWN + }; + + typedef std::list<Context *> Contexts; + typedef std::pair<Action, Contexts> ActionContexts; + typedef std::list<ActionContexts> ActionsContexts; + + struct C_ShutDownRelease : public Context { + ManagedLock *lock; + C_ShutDownRelease(ManagedLock *lock) + : lock(lock) { + } + void finish(int r) override { + lock->send_shutdown_release(); + } + }; + + librados::IoCtx& m_ioctx; + CephContext *m_cct; + AsioEngine& m_asio_engine; + asio::ContextWQ* m_work_queue; + std::string m_oid; + Watcher *m_watcher; + managed_lock::Mode m_mode; + bool m_blocklist_on_break_lock; + uint32_t m_blocklist_expire_seconds; + + std::string m_cookie; + std::string m_new_cookie; + + State m_state; + State m_post_next_state; + + ActionsContexts m_actions_contexts; + AsyncOpTracker m_async_op_tracker; + + bool is_lock_owner(ceph::mutex &lock) const; + bool is_transition_state() const; + + void append_context(Action action, Context *ctx); + void execute_action(Action action, Context *ctx); + + Action get_active_action() const; + void complete_active_action(State next_state, int r); + + void send_acquire_lock(); + void handle_pre_acquire_lock(int r); + void handle_acquire_lock(int r); + void handle_no_op_reacquire_lock(int r); + + void handle_post_acquire_lock(int r); + void revert_to_unlock_state(int r); + + void send_reacquire_lock(); + void handle_reacquire_lock(int r); + void release_acquire_lock(); + + void send_release_lock(); + void handle_pre_release_lock(int r); + void handle_release_lock(int r); + void handle_post_release_lock(int r); + + void send_shutdown(); + void handle_shutdown(int r); + void send_shutdown_release(); + void handle_shutdown_pre_release(int r); + void handle_shutdown_post_release(int r); + void wait_for_tracked_ops(int r); + void complete_shutdown(int r); +}; + +} // namespace librbd + +extern template class librbd::ManagedLock<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_MANAGED_LOCK_H diff --git a/src/librbd/MirroringWatcher.cc b/src/librbd/MirroringWatcher.cc new file mode 100644 index 000000000..c0cda5fa1 --- /dev/null +++ b/src/librbd/MirroringWatcher.cc @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/MirroringWatcher.h" +#include "include/rbd_types.h" +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "common/Cond.h" +#include "librbd/Utils.h" +#include "librbd/watcher/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MirroringWatcher: " + +namespace librbd { + +using namespace mirroring_watcher; +using namespace watcher; + +using librbd::util::create_rados_callback; + +namespace { + +static const uint64_t NOTIFY_TIMEOUT_MS = 5000; + +} // anonymous namespace + +template <typename I> +MirroringWatcher<I>::MirroringWatcher(librados::IoCtx &io_ctx, + asio::ContextWQ *work_queue) + : Watcher(io_ctx, work_queue, RBD_MIRRORING) { +} + +template <typename I> +int MirroringWatcher<I>::notify_mode_updated(librados::IoCtx &io_ctx, + 
cls::rbd::MirrorMode mirror_mode) { + C_SaferCond ctx; + notify_mode_updated(io_ctx, mirror_mode, &ctx); + return ctx.wait(); +} + +template +void MirroringWatcher::notify_mode_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorMode mirror_mode, + Context *on_finish) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ModeUpdatedPayload{mirror_mode}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl, NOTIFY_TIMEOUT_MS, + nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template +int MirroringWatcher::notify_image_updated( + librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, const std::string &global_image_id) { + C_SaferCond ctx; + notify_image_updated(io_ctx, mirror_image_state, image_id, global_image_id, + &ctx); + return ctx.wait(); +} + +template +void MirroringWatcher::notify_image_updated( + librados::IoCtx &io_ctx, cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, const std::string &global_image_id, + Context *on_finish) { + + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ImageUpdatedPayload{ + mirror_image_state, image_id, global_image_id}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl, NOTIFY_TIMEOUT_MS, + nullptr); + ceph_assert(r == 0); + comp->release(); + +} + +template +void MirroringWatcher::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + CephContext *cct = this->m_cct; + ldout(cct, 15) << ": notify_id=" << notify_id << ", " + << "handle=" << handle << dendl; + + + NotifyMessage notify_message; + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + lderr(cct) << ": error decoding image notification: " << err.what() + << dendl; + Context *ctx = new C_NotifyAck(this, notify_id, handle); + ctx->complete(0); + return; + } + + apply_visitor(watcher::util::HandlePayloadVisitor>( + this, notify_id, handle), notify_message.payload); +} + +template +bool MirroringWatcher::handle_payload(const ModeUpdatedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << ": mode updated: " << payload.mirror_mode << dendl; + handle_mode_updated(payload.mirror_mode); + return true; +} + +template +bool MirroringWatcher::handle_payload(const ImageUpdatedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << ": image state updated" << dendl; + handle_image_updated(payload.mirror_image_state, payload.image_id, + payload.global_image_id); + return true; +} + +template +bool MirroringWatcher::handle_payload(const UnknownPayload &payload, + Context *on_notify_ack) { + return true; +} + +} // namespace librbd + +template class librbd::MirroringWatcher; diff --git a/src/librbd/MirroringWatcher.h b/src/librbd/MirroringWatcher.h new file mode 100644 index 000000000..e13762e9b --- /dev/null +++ b/src/librbd/MirroringWatcher.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_H +#define CEPH_LIBRBD_MIRRORING_WATCHER_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "cls/rbd/cls_rbd_types.h" 
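The two notify_*() helpers in MirroringWatcher.cc above share one shape: encode a payload into a bufferlist, then post it with aio_notify() through an AioCompletion that trampolines back into a Context. A condensed sketch of that shape follows; the free function notify_sketch() and its placement are illustrative only (not part of the patch), the librados/librbd calls themselves are the real ones, and error handling beyond the assert is omitted as in the original.

    #include "include/rados/librados.hpp"
    #include "include/rbd_types.h"       // RBD_MIRRORING object name
    #include "include/Context.h"
    #include "include/ceph_assert.h"
    #include "librbd/Utils.h"

    // post an already-encoded notification against the rbd_mirroring object;
    // on_finish is completed with the aio_notify() result
    void notify_sketch(librados::IoCtx& io_ctx, const bufferlist& payload,
                       Context* on_finish) {
      bufferlist bl = payload;  // bufferlists share buffers; the copy is cheap

      // adapt the Context into an AioCompletion (librbd/Utils.h)
      librados::AioCompletion* comp =
        librbd::util::create_rados_callback(on_finish);
      int r = io_ctx.aio_notify(RBD_MIRRORING, comp, bl,
                                5000 /* ms, as NOTIFY_TIMEOUT_MS above */,
                                nullptr /* discard per-watcher responses */);
      ceph_assert(r == 0);      // queueing the notify is not expected to fail
      comp->release();          // the completion stays alive until it fires
    }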
+#include "librbd/ImageCtx.h" +#include "librbd/Watcher.h" +#include "librbd/mirroring_watcher/Types.h" + +namespace librbd { + +namespace asio { struct ContextWQ; } +namespace watcher { +namespace util { +template struct HandlePayloadVisitor; +} +} + +template +class MirroringWatcher : public Watcher { + friend struct watcher::util::HandlePayloadVisitor>; + +public: + MirroringWatcher(librados::IoCtx &io_ctx, asio::ContextWQ *work_queue); + + static int notify_mode_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorMode mirror_mode); + static void notify_mode_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorMode mirror_mode, + Context *on_finish); + + static int notify_image_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, + const std::string &global_image_id); + static void notify_image_updated(librados::IoCtx &io_ctx, + cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, + const std::string &global_image_id, + Context *on_finish); + + virtual void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) = 0; + virtual void handle_image_updated(cls::rbd::MirrorImageState state, + const std::string &image_id, + const std::string &global_image_id) = 0; + +private: + bool handle_payload(const mirroring_watcher::ModeUpdatedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const mirroring_watcher::ImageUpdatedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const mirroring_watcher::UnknownPayload &payload, + Context *on_notify_ack); + + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; +}; + +} // namespace librbd + +extern template class librbd::MirroringWatcher; + +#endif // CEPH_LIBRBD_MIRRORING_WATCHER_H diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc new file mode 100644 index 000000000..65e3fc4a4 --- /dev/null +++ b/src/librbd/ObjectMap.cc @@ -0,0 +1,380 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/ObjectMap.h" +#include "librbd/BlockGuard.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/object_map/RefreshRequest.h" +#include "librbd/object_map/ResizeRequest.h" +#include "librbd/object_map/SnapshotCreateRequest.h" +#include "librbd/object_map/SnapshotRemoveRequest.h" +#include "librbd/object_map/SnapshotRollbackRequest.h" +#include "librbd/object_map/UnlockRequest.h" +#include "librbd/object_map/UpdateRequest.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" + +#include "include/rados/librados.hpp" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "include/stringify.h" +#include "osdc/Striper.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ObjectMap: " << this << " " << __func__ \ + << ": " + +namespace librbd { + +using librbd::util::create_context_callback; + +template +ObjectMap::ObjectMap(I &image_ctx, uint64_t snap_id) + : RefCountedObject(image_ctx.cct), + m_image_ctx(image_ctx), m_snap_id(snap_id), + m_lock(ceph::make_shared_mutex(util::unique_lock_name("librbd::ObjectMap::lock", this))), + m_update_guard(new UpdateGuard(m_image_ctx.cct)) { +} + +template +ObjectMap::~ObjectMap() { + delete m_update_guard; +} + +template +int ObjectMap::aio_remove(librados::IoCtx &io_ctx, const std::string &image_id, + 
librados::AioCompletion *c) { + return io_ctx.aio_remove(object_map_name(image_id, CEPH_NOSNAP), c); +} + +template +std::string ObjectMap::object_map_name(const std::string &image_id, + uint64_t snap_id) { + std::string oid(RBD_OBJECT_MAP_PREFIX + image_id); + if (snap_id != CEPH_NOSNAP) { + std::stringstream snap_suffix; + snap_suffix << "." << std::setfill('0') << std::setw(16) << std::hex + << snap_id; + oid += snap_suffix.str(); + } + return oid; +} + +template +bool ObjectMap::is_compatible(const file_layout_t& layout, uint64_t size) { + uint64_t object_count = Striper::get_num_objects(layout, size); + return (object_count <= cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT); +} + +template +uint8_t ObjectMap::operator[](uint64_t object_no) const +{ + std::shared_lock locker{m_lock}; + ceph_assert(object_no < m_object_map.size()); + return m_object_map[object_no]; +} + +template +bool ObjectMap::object_may_exist(uint64_t object_no) const +{ + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + + // Fall back to default logic if object map is disabled or invalid + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.image_lock)) { + return true; + } + + bool flags_set; + int r = m_image_ctx.test_flags(m_image_ctx.snap_id, + RBD_FLAG_OBJECT_MAP_INVALID, + m_image_ctx.image_lock, &flags_set); + if (r < 0 || flags_set) { + return true; + } + + uint8_t state = (*this)[object_no]; + bool exists = (state != OBJECT_NONEXISTENT); + ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" << exists + << dendl; + return exists; +} + +template +bool ObjectMap::object_may_not_exist(uint64_t object_no) const +{ + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + + // Fall back to default logic if object map is disabled or invalid + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.image_lock)) { + return true; + } + + bool flags_set; + int r = m_image_ctx.test_flags(m_image_ctx.snap_id, + RBD_FLAG_OBJECT_MAP_INVALID, + m_image_ctx.image_lock, &flags_set); + if (r < 0 || flags_set) { + return true; + } + + uint8_t state = (*this)[object_no]; + bool nonexistent = (state != OBJECT_EXISTS && state != OBJECT_EXISTS_CLEAN); + ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" + << nonexistent << dendl; + return nonexistent; +} + +template +bool ObjectMap::update_required(const ceph::BitVector<2>::Iterator& it, + uint8_t new_state) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + uint8_t state = *it; + if ((state == new_state) || + (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) || + (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) { + return false; + } + return true; +} + +template +void ObjectMap::open(Context *on_finish) { + Context *ctx = create_context_callback(on_finish, this); + + auto req = object_map::RefreshRequest::create( + m_image_ctx, &m_lock, &m_object_map, m_snap_id, ctx); + req->send(); +} + +template +void ObjectMap::close(Context *on_finish) { + Context *ctx = create_context_callback(on_finish, this); + + if (m_snap_id != CEPH_NOSNAP) { + m_image_ctx.op_work_queue->queue(ctx, 0); + return; + } + + ctx = new LambdaContext([this, ctx](int r) { + auto req = object_map::UnlockRequest::create(m_image_ctx, ctx); + req->send(); + }); + + // ensure the block guard for aio updates is empty before unlocking + // the object map + m_async_op_tracker.wait_for_ops(ctx); +} + +template +bool ObjectMap::set_object_map(ceph::BitVector<2> &target_object_map) { + 
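The object map holds one of four 2-bit states per object, and updates are filtered through update_required() above so that writes which cannot change on-disk state are skipped. The self-contained sketch below replays that filtering rule outside of Ceph; std::vector<uint8_t> stands in for the packed ceph::BitVector<2>, while the state values and the predicate body match this file and include/rbd/object_map_types.h.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    enum : uint8_t {              // values from include/rbd/object_map_types.h
      OBJECT_NONEXISTENT  = 0,
      OBJECT_EXISTS       = 1,
      OBJECT_PENDING      = 2,
      OBJECT_EXISTS_CLEAN = 3,
    };

    // same predicate as ObjectMap::update_required(): skip no-op transitions
    // and transitions that cannot change on-disk reality
    bool update_required(uint8_t state, uint8_t new_state) {
      if ((state == new_state) ||
          (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
          (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) {
        return false;
      }
      return true;
    }

    int main() {
      std::vector<uint8_t> object_map(8, OBJECT_NONEXISTENT);
      object_map[3] = OBJECT_EXISTS;

      std::cout << std::boolalpha;
      // re-marking an untouched object nonexistent is a no-op ...
      std::cout << update_required(object_map[0], OBJECT_NONEXISTENT) << "\n";
      // ... but moving an existing object to PENDING (discard in flight)
      // does require an on-disk update
      std::cout << update_required(object_map[3], OBJECT_PENDING) << "\n";
      return 0;
    }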
ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.image_lock)); + std::unique_lock locker{m_lock}; + m_object_map = target_object_map; + return true; +} + +template +void ObjectMap::rollback(uint64_t snap_id, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + + std::unique_lock locker{m_lock}; + Context *ctx = create_context_callback(on_finish, this); + + object_map::SnapshotRollbackRequest *req = + new object_map::SnapshotRollbackRequest(m_image_ctx, snap_id, ctx); + req->send(); +} + +template +void ObjectMap::snapshot_add(uint64_t snap_id, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + ceph_assert(snap_id != CEPH_NOSNAP); + + Context *ctx = create_context_callback(on_finish, this); + + object_map::SnapshotCreateRequest *req = + new object_map::SnapshotCreateRequest(m_image_ctx, &m_lock, &m_object_map, + snap_id, ctx); + req->send(); +} + +template +void ObjectMap::snapshot_remove(uint64_t snap_id, Context *on_finish) { + ceph_assert(ceph_mutex_is_wlocked(m_image_ctx.image_lock)); + ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + ceph_assert(snap_id != CEPH_NOSNAP); + + Context *ctx = create_context_callback(on_finish, this); + + object_map::SnapshotRemoveRequest *req = + new object_map::SnapshotRemoveRequest(m_image_ctx, &m_lock, &m_object_map, + snap_id, ctx); + req->send(); +} + +template +void ObjectMap::aio_save(Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.image_lock)); + std::shared_lock locker{m_lock}; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + } + cls_client::object_map_save(&op, m_object_map); + + Context *ctx = create_context_callback(on_finish, this); + + std::string oid(object_map_name(m_image_ctx.id, m_snap_id)); + librados::AioCompletion *comp = util::create_rados_callback(ctx); + + int r = m_image_ctx.md_ctx.aio_operate(oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void ObjectMap::aio_resize(uint64_t new_size, uint8_t default_object_state, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.image_lock)); + ceph_assert(m_image_ctx.image_watcher != NULL); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + Context *ctx = create_context_callback(on_finish, this); + + object_map::ResizeRequest *req = new object_map::ResizeRequest( + m_image_ctx, &m_lock, &m_object_map, m_snap_id, new_size, + default_object_state, ctx); + req->send(); +} + +template +void ObjectMap::detained_aio_update(UpdateOperation &&op) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert(ceph_mutex_is_wlocked(m_lock)); + + BlockGuardCell *cell; + int r = m_update_guard->detain({op.start_object_no, op.end_object_no}, + &op, &cell); + if (r < 0) { + lderr(cct) << "failed to 
detain object map update: " << cpp_strerror(r) + << dendl; + m_image_ctx.op_work_queue->queue(op.on_finish, r); + m_async_op_tracker.finish_op(); + return; + } else if (r > 0) { + ldout(cct, 20) << "detaining object map update due to in-flight update: " + << "start=" << op.start_object_no << ", " + << "end=" << op.end_object_no << ", " + << (op.current_state ? + stringify(static_cast(*op.current_state)) : + "") + << "->" << static_cast(op.new_state) << dendl; + return; + } + + ldout(cct, 20) << "in-flight update cell: " << cell << dendl; + Context *on_finish = op.on_finish; + Context *ctx = new LambdaContext([this, cell, on_finish](int r) { + handle_detained_aio_update(cell, r, on_finish); + }); + aio_update(CEPH_NOSNAP, op.start_object_no, op.end_object_no, op.new_state, + op.current_state, op.parent_trace, op.ignore_enoent, ctx); +} + +template +void ObjectMap::handle_detained_aio_update(BlockGuardCell *cell, int r, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "cell=" << cell << ", r=" << r << dendl; + + typename UpdateGuard::BlockOperations block_ops; + m_update_guard->release(cell, &block_ops); + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + std::unique_lock locker{m_lock}; + for (auto &op : block_ops) { + detained_aio_update(std::move(op)); + } + } + + on_finish->complete(r); + m_async_op_tracker.finish_op(); +} + +template +void ObjectMap::aio_update(uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, + bool ignore_enoent, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0); + ceph_assert(m_image_ctx.image_watcher != nullptr); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + ceph_assert(start_object_no < end_object_no); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "start=" << start_object_no << ", " + << "end=" << end_object_no << ", " + << (current_state ? 
+ stringify(static_cast(*current_state)) : "") + << "->" << static_cast(new_state) << dendl; + if (snap_id == CEPH_NOSNAP) { + ceph_assert(ceph_mutex_is_wlocked(m_lock)); + end_object_no = std::min(end_object_no, m_object_map.size()); + if (start_object_no >= end_object_no) { + ldout(cct, 20) << "skipping update of invalid object map" << dendl; + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + + auto it = m_object_map.begin() + start_object_no; + auto end_it = m_object_map.begin() + end_object_no; + for (; it != end_it; ++it) { + if (update_required(it, new_state)) { + break; + } + } + if (it == end_it) { + ldout(cct, 20) << "object map update not required" << dendl; + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + } + + auto req = object_map::UpdateRequest::create( + m_image_ctx, &m_lock, &m_object_map, snap_id, start_object_no, + end_object_no, new_state, current_state, parent_trace, ignore_enoent, + on_finish); + req->send(); +} + +} // namespace librbd + +template class librbd::ObjectMap; + diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h new file mode 100644 index 000000000..8b5b352ef --- /dev/null +++ b/src/librbd/ObjectMap.h @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_H +#define CEPH_LIBRBD_OBJECT_MAP_H + +#include "include/int_types.h" +#include "include/fs_types.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/object_map_types.h" +#include "common/AsyncOpTracker.h" +#include "common/bit_vector.hpp" +#include "common/RefCountedObj.h" +#include "librbd/Utils.h" +#include + +class Context; +namespace ZTracer { struct Trace; } + +namespace librbd { + +template class BlockGuard; +struct BlockGuardCell; +class ImageCtx; + +template +class ObjectMap : public RefCountedObject { +public: + static ObjectMap *create(ImageCtxT &image_ctx, uint64_t snap_id) { + return new ObjectMap(image_ctx, snap_id); + } + + ObjectMap(ImageCtxT &image_ctx, uint64_t snap_id); + ~ObjectMap(); + + static int aio_remove(librados::IoCtx &io_ctx, const std::string &image_id, librados::AioCompletion *c); + static std::string object_map_name(const std::string &image_id, + uint64_t snap_id); + + static bool is_compatible(const file_layout_t& layout, uint64_t size); + + uint8_t operator[](uint64_t object_no) const; + inline uint64_t size() const { + std::shared_lock locker{m_lock}; + return m_object_map.size(); + } + + inline void set_state(uint64_t object_no, uint8_t new_state, + const boost::optional ¤t_state) { + std::unique_lock locker{m_lock}; + ceph_assert(object_no < m_object_map.size()); + if (current_state && m_object_map[object_no] != *current_state) { + return; + } + m_object_map[object_no] = new_state; + } + + void open(Context *on_finish); + void close(Context *on_finish); + bool set_object_map(ceph::BitVector<2> &target_object_map); + bool object_may_exist(uint64_t object_no) const; + bool object_may_not_exist(uint64_t object_no) const; + + void aio_save(Context *on_finish); + void aio_resize(uint64_t new_size, uint8_t default_object_state, + Context *on_finish); + + template + bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + T *callback_object) { + return aio_update(snap_id, start_object_no, start_object_no + 1, + new_state, current_state, parent_trace, + ignore_enoent, callback_object); + } + + template + bool 
aio_update(uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + T *callback_object) { + ceph_assert(start_object_no < end_object_no); + std::unique_lock locker{m_lock}; + + if (snap_id == CEPH_NOSNAP) { + end_object_no = std::min(end_object_no, m_object_map.size()); + if (start_object_no >= end_object_no) { + return false; + } + + auto it = m_object_map.begin() + start_object_no; + auto end_it = m_object_map.begin() + end_object_no; + for (; it != end_it; ++it) { + if (update_required(it, new_state)) { + break; + } + } + + if (it == end_it) { + return false; + } + + m_async_op_tracker.start_op(); + UpdateOperation update_operation(start_object_no, end_object_no, + new_state, current_state, parent_trace, + ignore_enoent, + util::create_context_callback( + callback_object)); + detained_aio_update(std::move(update_operation)); + } else { + aio_update(snap_id, start_object_no, end_object_no, new_state, + current_state, parent_trace, ignore_enoent, + util::create_context_callback(callback_object)); + } + return true; + } + + void rollback(uint64_t snap_id, Context *on_finish); + void snapshot_add(uint64_t snap_id, Context *on_finish); + void snapshot_remove(uint64_t snap_id, Context *on_finish); + +private: + struct UpdateOperation { + uint64_t start_object_no; + uint64_t end_object_no; + uint8_t new_state; + boost::optional current_state; + ZTracer::Trace parent_trace; + bool ignore_enoent; + Context *on_finish; + + UpdateOperation(uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, + bool ignore_enoent, Context *on_finish) + : start_object_no(start_object_no), end_object_no(end_object_no), + new_state(new_state), current_state(current_state), + parent_trace(parent_trace), ignore_enoent(ignore_enoent), + on_finish(on_finish) { + } + }; + + typedef BlockGuard UpdateGuard; + + ImageCtxT &m_image_ctx; + uint64_t m_snap_id; + + mutable ceph::shared_mutex m_lock; + ceph::BitVector<2> m_object_map; + + AsyncOpTracker m_async_op_tracker; + UpdateGuard *m_update_guard = nullptr; + + void detained_aio_update(UpdateOperation &&update_operation); + void handle_detained_aio_update(BlockGuardCell *cell, int r, + Context *on_finish); + + void aio_update(uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + Context *on_finish); + bool update_required(const ceph::BitVector<2>::Iterator &it, + uint8_t new_state); + +}; + +} // namespace librbd + +extern template class librbd::ObjectMap; + +#endif // CEPH_LIBRBD_OBJECT_MAP_H diff --git a/src/librbd/Operations.cc b/src/librbd/Operations.cc new file mode 100644 index 000000000..ad6e5bcf6 --- /dev/null +++ b/src/librbd/Operations.cc @@ -0,0 +1,1944 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/Operations.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "osdc/Striper.h" + +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ObjectMap.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/asio/ContextWQ.h" +#include 
"librbd/io/Utils.h" +#include "librbd/journal/DisabledPolicy.h" +#include "librbd/journal/StandardPolicy.h" +#include "librbd/operation/DisableFeaturesRequest.h" +#include "librbd/operation/EnableFeaturesRequest.h" +#include "librbd/operation/FlattenRequest.h" +#include "librbd/operation/MetadataRemoveRequest.h" +#include "librbd/operation/MetadataSetRequest.h" +#include "librbd/operation/MigrateRequest.h" +#include "librbd/operation/ObjectMapIterate.h" +#include "librbd/operation/RebuildObjectMapRequest.h" +#include "librbd/operation/RenameRequest.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/operation/SnapshotCreateRequest.h" +#include "librbd/operation/SnapshotProtectRequest.h" +#include "librbd/operation/SnapshotRemoveRequest.h" +#include "librbd/operation/SnapshotRenameRequest.h" +#include "librbd/operation/SnapshotRollbackRequest.h" +#include "librbd/operation/SnapshotUnprotectRequest.h" +#include "librbd/operation/SnapshotLimitRequest.h" +#include "librbd/operation/SparsifyRequest.h" +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Operations: " + +namespace librbd { + +using namespace boost::placeholders; + +namespace { + +std::ostream &operator<<(std::ostream &out, const Operation &op) { + switch (op) { + case OPERATION_CHECK_OBJECT_MAP: + out << "check object map"; + break; + case OPERATION_FLATTEN: + out << "flatten"; + break; + case OPERATION_METADATA_UPDATE: + out << "metadata update"; + break; + case OPERATION_MIGRATE: + out << "migrate"; + break; + case OPERATION_REBUILD_OBJECT_MAP: + out << "rebuild object map"; + break; + case OPERATION_RENAME: + out << "rename"; + break; + case OPERATION_RESIZE: + out << "resize"; + break; + case OPERATION_SNAP_CREATE: + out << "snap create"; + break; + case OPERATION_SNAP_PROTECT: + out << "snap protect"; + break; + case OPERATION_SNAP_REMOVE: + out << "snap remove"; + break; + case OPERATION_SNAP_RENAME: + out << "snap rename"; + break; + case OPERATION_SNAP_ROLLBACK: + out << "snap rollback"; + break; + case OPERATION_SNAP_UNPROTECT: + out << "snap unprotect"; + break; + case OPERATION_SPARSIFY: + out << "sparsify"; + break; + case OPERATION_UPDATE_FEATURES: + out << "update features"; + break; + default: + ceph_abort(); + break; + } + return out; +} + +template +struct C_NotifyUpdate : public Context { + I &image_ctx; + Context *on_finish; + bool notified = false; + + C_NotifyUpdate(I &image_ctx, Context *on_finish) + : image_ctx(image_ctx), on_finish(on_finish) { + } + + void complete(int r) override { + CephContext *cct = image_ctx.cct; + if (notified) { + if (r == -ETIMEDOUT) { + // don't fail the op if a peer fails to get the update notification + lderr(cct) << "update notification timed-out" << dendl; + r = 0; + } else if (r == -ENOENT) { + // don't fail if header is missing (e.g. v1 image rename) + ldout(cct, 5) << "update notification on missing header" << dendl; + r = 0; + } else if (r < 0) { + lderr(cct) << "update notification failed: " << cpp_strerror(r) + << dendl; + } + Context::complete(r); + return; + } + + if (r < 0) { + // op failed -- no need to send update notification + Context::complete(r); + return; + } + + notified = true; + image_ctx.notify_update(this); + } + void finish(int r) override { + on_finish->complete(r); + } +}; + +template +struct C_InvokeAsyncRequest : public Context { + /** + * @verbatim + * + * + * | + * . . . . . . | . . . . . . . . . . . . . . . . . . + * . . | . . + * . v v v . + * . 
REFRESH_IMAGE (skip if not needed) . + * . | . + * . v . + * . ACQUIRE_LOCK (skip if exclusive lock . + * . | disabled or has lock) . + * . | . + * . /--------/ \--------\ . . . . . . . . . . . . . + * . | | . + * . v v . + * LOCAL_REQUEST REMOTE_REQUEST + * | | + * | | + * \--------\ /--------/ + * | + * v + * + * + * @endverbatim + */ + + I &image_ctx; + Operation operation; + exclusive_lock::OperationRequestType request_type; + bool permit_snapshot; + boost::function local; + boost::function remote; + std::set filter_error_codes; + Context *on_finish; + bool request_lock = false; + + C_InvokeAsyncRequest(I &image_ctx, Operation operation, + exclusive_lock::OperationRequestType request_type, + bool permit_snapshot, + const boost::function& local, + const boost::function& remote, + const std::set &filter_error_codes, + Context *on_finish) + : image_ctx(image_ctx), operation(operation), request_type(request_type), + permit_snapshot(permit_snapshot), local(local), remote(remote), + filter_error_codes(filter_error_codes), on_finish(on_finish) { + } + + void send() { + send_refresh_image(); + } + + void send_refresh_image() { + if (!image_ctx.state->is_refresh_required()) { + send_acquire_exclusive_lock(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_context_callback< + C_InvokeAsyncRequest, + &C_InvokeAsyncRequest::handle_refresh_image>(this); + image_ctx.state->refresh(ctx); + } + + void handle_refresh_image(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl; + complete(r); + return; + } + + send_acquire_exclusive_lock(); + } + + void send_acquire_exclusive_lock() { + // context can complete before owner_lock is unlocked + ceph::shared_mutex &owner_lock(image_ctx.owner_lock); + owner_lock.lock_shared(); + image_ctx.image_lock.lock_shared(); + if (image_ctx.read_only || + (!permit_snapshot && image_ctx.snap_id != CEPH_NOSNAP)) { + image_ctx.image_lock.unlock_shared(); + owner_lock.unlock_shared(); + complete(-EROFS); + return; + } + image_ctx.image_lock.unlock_shared(); + + if (image_ctx.exclusive_lock == nullptr) { + send_local_request(); + owner_lock.unlock_shared(); + return; + } else if (image_ctx.image_watcher == nullptr) { + owner_lock.unlock_shared(); + complete(-EROFS); + return; + } + + if (image_ctx.exclusive_lock->is_lock_owner() && + image_ctx.exclusive_lock->accept_request(request_type, nullptr)) { + send_local_request(); + owner_lock.unlock_shared(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_async_context_callback( + image_ctx, util::create_context_callback< + C_InvokeAsyncRequest, + &C_InvokeAsyncRequest::handle_acquire_exclusive_lock>( + this, image_ctx.exclusive_lock)); + + if (request_lock) { + // current lock owner doesn't support op -- try to perform + // the action locally + request_lock = false; + image_ctx.exclusive_lock->acquire_lock(ctx); + } else { + image_ctx.exclusive_lock->try_acquire_lock(ctx); + } + owner_lock.unlock_shared(); + } + + void handle_acquire_exclusive_lock(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r < 0) { + complete(r == -EBLOCKLISTED ? 
-EBLOCKLISTED : -EROFS); + return; + } + + // context can complete before owner_lock is unlocked + ceph::shared_mutex &owner_lock(image_ctx.owner_lock); + owner_lock.lock_shared(); + if (image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()) { + send_local_request(); + owner_lock.unlock_shared(); + return; + } + + send_remote_request(); + owner_lock.unlock_shared(); + } + + void send_remote_request() { + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_async_context_callback( + image_ctx, util::create_context_callback< + C_InvokeAsyncRequest, + &C_InvokeAsyncRequest::handle_remote_request>(this)); + remote(ctx); + } + + void handle_remote_request(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + ldout(cct, 5) << operation << " not supported by current lock owner" + << dendl; + request_lock = true; + send_refresh_image(); + return; + } else if (r != -ETIMEDOUT && r != -ERESTART) { + image_ctx.state->handle_update_notification(); + + complete(r); + return; + } + + ldout(cct, 5) << operation << " timed out notifying lock owner" << dendl; + send_refresh_image(); + } + + void send_local_request() { + auto ctx = new LambdaContext( + [this](int r) { + if (r == -ERESTART) { + image_ctx.operations->finish_op(operation, r); + send_refresh_image(); + return; + } + execute_local_request(); + }); + + image_ctx.operations->start_op(operation, ctx); + } + + void execute_local_request() { + std::shared_lock owner_locker{image_ctx.owner_lock}; + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = util::create_async_context_callback( + image_ctx, util::create_context_callback< + C_InvokeAsyncRequest, + &C_InvokeAsyncRequest::handle_local_request>(this)); + local(ctx); + } + + void handle_local_request(int r) { + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << __func__ << ": r=" << r << dendl; + + image_ctx.operations->finish_op(operation, r); + + if (r == -ERESTART) { + send_refresh_image(); + return; + } + complete(r); + } + + void finish(int r) override { + if (filter_error_codes.count(r) != 0) { + r = 0; + } + on_finish->complete(r); + } +}; + +template +bool needs_invalidate(I& image_ctx, uint64_t object_no, + uint8_t current_state, uint8_t new_state) { + if ( (current_state == OBJECT_EXISTS || + current_state == OBJECT_EXISTS_CLEAN) && + (new_state == OBJECT_NONEXISTENT || + new_state == OBJECT_PENDING)) { + return false; + } + return true; +} + +} // anonymous namespace + +template +Operations::Operations(I &image_ctx) + : m_image_ctx(image_ctx), + m_queue_lock(ceph::make_mutex( + util::unique_lock_name("librbd::Operations::m_queue_lock", + this))) { +} + +template +void Operations::start_op(Operation op, Context *ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << __func__ << ": " << op << " " << ctx << dendl; + + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + bool requires_lock = m_image_ctx.exclusive_lock != nullptr; + + ctx = util::create_async_context_callback( + m_image_ctx, new LambdaContext( + [this, op, requires_lock, ctx](int r) { + Context *finish_op_ctx = nullptr; + if (requires_lock && r == 0) { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::shared_lock image_locker{m_image_ctx.image_lock}; + auto exclusive_lock = m_image_ctx.exclusive_lock; + + if (exclusive_lock == nullptr || 
+ (finish_op_ctx = exclusive_lock->start_op(&r)) == nullptr) { + ldout(m_image_ctx.cct, 20) << "lock owner lost, restarting" + << dendl; + r = -ERESTART; + } + } + + ldout(m_image_ctx.cct, 20) << "start " << op << " " << ctx << dendl; + ctx->complete(r); + if (finish_op_ctx != nullptr) { + finish_op_ctx->complete(0); + } + })); + + std::unique_lock locker{m_queue_lock}; + if (!m_in_flight_ops.insert(op).second) { + ldout(cct, 20) << __func__ << ": " << op << " in flight" << dendl; + m_queued_ops[op].push_back(ctx); + return; + } + + ctx->complete(0); +} + +template +void Operations::finish_op(Operation op, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << __func__ << ": " << op << " r=" << r << dendl; + + std::unique_lock locker{m_queue_lock}; + auto &queue = m_queued_ops[op]; + if (queue.empty()) { + m_in_flight_ops.erase(op); + return; + } + + auto ctx = queue.front(); + queue.pop_front(); + // propagate -ERESTART through all the queue + ctx->complete(r == -ERESTART ? r : 0); +} + +template +int Operations::flatten(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "flatten" << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + if (m_image_ctx.parent_md.spec.pool_id == -1) { + lderr(cct) << "image has no parent" << dendl; + return -EINVAL; + } + } + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_FLATTEN, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_flatten, this, + boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher::notify_flatten, + m_image_ctx.image_watcher, request_id, + boost::ref(prog_ctx), _1)); + + if (r < 0 && r != -EINVAL) { + return r; + } + ldout(cct, 20) << "flatten finished" << dendl; + return 0; +} + +template +void Operations::execute_flatten(ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "flatten" << dendl; + + if (m_image_ctx.read_only || m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + + // can't flatten a non-clone + if (m_image_ctx.parent_md.spec.pool_id == -1) { + lderr(cct) << "image has no parent" << dendl; + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EINVAL); + return; + } + if (m_image_ctx.snap_id != CEPH_NOSNAP) { + lderr(cct) << "snapshots cannot be flattened" << dendl; + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EROFS); + return; + } + + uint64_t crypto_header_objects = Striper::get_num_objects( + m_image_ctx.layout, + m_image_ctx.get_area_size(io::ImageArea::CRYPTO_HEADER)); + + uint64_t raw_overlap; + int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &raw_overlap); + ceph_assert(r == 0); + auto overlap = m_image_ctx.reduce_parent_overlap(raw_overlap, false); + uint64_t data_overlap_objects = Striper::get_num_objects( + m_image_ctx.layout, + (overlap.second == io::ImageArea::DATA ? 
overlap.first : 0)); + + m_image_ctx.image_lock.unlock_shared(); + + // leave encryption header flattening to format-specific handler + operation::FlattenRequest *req = new operation::FlattenRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), + crypto_header_objects, data_overlap_objects, prog_ctx); + req->send(); +} + +template +int Operations::rebuild_object_map(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "rebuild_object_map" << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_REBUILD_OBJECT_MAP, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true, + boost::bind(&Operations::execute_rebuild_object_map, + this, boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher::notify_rebuild_object_map, + m_image_ctx.image_watcher, request_id, + boost::ref(prog_ctx), _1)); + + ldout(cct, 10) << "rebuild object map finished" << dendl; + if (r < 0) { + return r; + } + return 0; +} + +template +void Operations::execute_rebuild_object_map(ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + if (m_image_ctx.read_only || m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + lderr(cct) << "image must support object-map feature" << dendl; + on_finish->complete(-EINVAL); + return; + } + + operation::RebuildObjectMapRequest *req = + new operation::RebuildObjectMapRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), prog_ctx); + req->send(); +} + +template +int Operations::check_object_map(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + r = invoke_async_request(OPERATION_CHECK_OBJECT_MAP, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true, + boost::bind(&Operations::check_object_map, this, + boost::ref(prog_ctx), _1), + [this](Context *c) { + m_image_ctx.op_work_queue->queue(c, -EOPNOTSUPP); + }); + + return r; +} + +template +void Operations::object_map_iterate(ProgressContext &prog_ctx, + operation::ObjectIterateWork handle_mismatch, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + on_finish->complete(-EINVAL); + return; + } + + operation::ObjectMapIterateRequest *req = + new operation::ObjectMapIterateRequest(m_image_ctx, on_finish, + prog_ctx, handle_mismatch); + req->send(); +} + +template +void Operations::check_object_map(ProgressContext &prog_ctx, + Context *on_finish) { + object_map_iterate(prog_ctx, needs_invalidate, on_finish); +} + +template +int Operations::rename(const char *dstname) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dstname + << dendl; + + int r = librbd::detect_format(m_image_ctx.md_ctx, dstname, NULL, NULL); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error checking for existing image called " + << dstname 
<< ":" << cpp_strerror(r) << dendl; + return r; + } + if (r == 0) { + lderr(cct) << "rbd image " << dstname << " already exists" << dendl; + return -EEXIST; + } + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_RENAME, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations::execute_rename, this, + dstname, _1), + boost::bind(&ImageWatcher::notify_rename, + m_image_ctx.image_watcher, request_id, + dstname, _1)); + if (r < 0 && r != -EEXIST) { + return r; + } + + m_image_ctx.set_image_name(dstname); + return 0; +} + +template +void Operations::execute_rename(const std::string &dest_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": dest_name=" << dest_name + << dendl; + + if (m_image_ctx.old_format) { + m_image_ctx.image_lock.lock_shared(); + if (m_image_ctx.name == dest_name) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + // unregister watch before and register back after rename + on_finish = new C_NotifyUpdate(m_image_ctx, on_finish); + on_finish = new LambdaContext([this, on_finish](int r) { + if (m_image_ctx.old_format) { + m_image_ctx.image_watcher->set_oid(m_image_ctx.header_oid); + } + m_image_ctx.image_watcher->register_watch(on_finish); + }); + on_finish = new LambdaContext([this, dest_name, on_finish](int r) { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + operation::RenameRequest *req = new operation::RenameRequest( + m_image_ctx, on_finish, dest_name); + req->send(); + }); + m_image_ctx.image_watcher->unregister_watch(on_finish); + return; + } + operation::RenameRequest *req = new operation::RenameRequest( + m_image_ctx, on_finish, dest_name); + req->send(); +} + +template +int Operations::resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx) { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.image_lock.lock_shared(); + uint64_t raw_size = io::util::area_to_raw_offset(m_image_ctx, size, + io::ImageArea::DATA); + ldout(cct, 5) << this << " " << __func__ + << ": size=" << size + << " raw_size=" << m_image_ctx.size + << " new_raw_size=" << raw_size << dendl; + m_image_ctx.image_lock.unlock_shared(); + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP) && + !ObjectMap<>::is_compatible(m_image_ctx.layout, raw_size)) { + lderr(cct) << "New size not compatible with object map" << dendl; + return -EINVAL; + } + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_RESIZE, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_resize, this, + size, allow_shrink, boost::ref(prog_ctx), _1, 0), + boost::bind(&ImageWatcher::notify_resize, + m_image_ctx.image_watcher, request_id, + size, allow_shrink, boost::ref(prog_ctx), _1)); + + m_image_ctx.perfcounter->inc(l_librbd_resize); + ldout(cct, 2) << "resize finished" << dendl; + return r; +} + +template +void Operations::execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx, + Context 
*on_finish, + uint64_t journal_op_tid) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + m_image_ctx.image_lock.lock_shared(); + uint64_t raw_size = io::util::area_to_raw_offset(m_image_ctx, size, + io::ImageArea::DATA); + ldout(cct, 5) << this << " " << __func__ + << ": size=" << size + << " raw_size=" << m_image_ctx.size + << " new_raw_size=" << raw_size << dendl; + + if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only || + m_image_ctx.operations_disabled) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EROFS); + return; + } else if (m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, + m_image_ctx.image_lock) && + !ObjectMap<>::is_compatible(m_image_ctx.layout, raw_size)) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EINVAL); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + operation::ResizeRequest *req = new operation::ResizeRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), raw_size, + allow_shrink, prog_ctx, journal_op_tid, false); + req->send(); +} + +template +int Operations::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name, uint64_t flags, + ProgressContext &prog_ctx) { + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + snap_create(snap_namespace, snap_name, flags, prog_ctx, &ctx); + r = ctx.wait(); + + if (r < 0) { + return r; + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_create); + return r; +} + +template +void Operations::snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name, uint64_t flags, + ProgressContext &prog_ctx, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + if (m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + uint64_t request_id = util::reserve_async_request_id(); + C_InvokeAsyncRequest *req = new C_InvokeAsyncRequest( + m_image_ctx, OPERATION_SNAP_CREATE, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true, + boost::bind(&Operations::execute_snap_create, this, snap_namespace, snap_name, + _1, 0, flags, boost::ref(prog_ctx)), + boost::bind(&ImageWatcher::notify_snap_create, m_image_ctx.image_watcher, + request_id, snap_namespace, snap_name, flags, + boost::ref(prog_ctx), _1), + {-EEXIST}, on_finish); + req->send(); +} + +template +void Operations::execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + Context *on_finish, + uint64_t journal_op_tid, + uint64_t flags, + ProgressContext &prog_ctx) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + if 
(m_image_ctx.get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + operation::SnapshotCreateRequest *req = + new operation::SnapshotCreateRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), + snap_namespace, snap_name, journal_op_tid, flags, prog_ctx); + req->send(); +} + +template +int Operations::snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + ProgressContext& prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) + return r; + + C_SaferCond cond_ctx; + { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + { + // need to drop image_lock before invalidating cache + std::shared_lock image_locker{m_image_ctx.image_lock}; + if (!m_image_ctx.snap_exists) { + return -ENOENT; + } + + if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) { + return -EROFS; + } + + uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "No such snapshot found." << dendl; + return -ENOENT; + } + } + + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false); + if (r < 0) { + return r; + } + + Context *ctx = new LambdaContext( + [this, ctx=&cond_ctx](int r) { + m_image_ctx.operations->finish_op(OPERATION_SNAP_ROLLBACK, r); + ctx->complete(r); + }); + ctx = new LambdaContext( + [this, snap_namespace, snap_name, &prog_ctx, ctx](int r) { + if (r < 0) { + ctx->complete(r); + return; + } + std::shared_lock l{m_image_ctx.owner_lock}; + execute_snap_rollback(snap_namespace, snap_name, prog_ctx, ctx); + }); + + m_image_ctx.operations->start_op(OPERATION_SNAP_ROLLBACK, ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_rollback); + return r; +} + +template +void Operations::execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + ProgressContext& prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "No such snapshot found." 
<< dendl; + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-ENOENT); + return; + } + + uint64_t new_size = m_image_ctx.get_image_size(snap_id); + m_image_ctx.image_lock.unlock_shared(); + + // async mode used for journal replay + operation::SnapshotRollbackRequest *request = + new operation::SnapshotRollbackRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), snap_namespace, snap_name, + snap_id, new_size, prog_ctx); + request->send(); +} + +template +int Operations::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + snap_remove(snap_namespace, snap_name, &ctx); + r = ctx.wait(); + + if (r < 0) { + return r; + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_remove); + return 0; +} + +template +void Operations::snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + on_finish->complete(-EROFS); + return; + } + + // quickly filter out duplicate ops + m_image_ctx.image_lock.lock_shared(); + if (m_image_ctx.get_snap_id(snap_namespace, snap_name) == CEPH_NOSNAP) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-ENOENT); + return; + } + + bool proxy_op = ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 || + (m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0); + m_image_ctx.image_lock.unlock_shared(); + + if (proxy_op) { + uint64_t request_id = util::reserve_async_request_id(); + auto request_type = exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL; + if (cls::rbd::get_snap_namespace_type(snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) { + request_type = exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE; + } + C_InvokeAsyncRequest *req = new C_InvokeAsyncRequest( + m_image_ctx, OPERATION_SNAP_REMOVE, request_type, true, + boost::bind(&Operations::execute_snap_remove, this, snap_namespace, + snap_name, _1), + boost::bind(&ImageWatcher::notify_snap_remove, + m_image_ctx.image_watcher, request_id, snap_namespace, + snap_name, _1), + {-ENOENT}, on_finish); + req->send(); + } else { + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + execute_snap_remove(snap_namespace, snap_name, on_finish); + } +} + +template +void Operations::execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + { + if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + uint64_t snap_id = m_image_ctx.get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + lderr(m_image_ctx.cct) << "No such snapshot found." 
<< dendl; + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-ENOENT); + return; + } + + bool is_protected; + int r = m_image_ctx.is_snap_protected(snap_id, &is_protected); + if (r < 0) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(r); + return; + } else if (is_protected) { + lderr(m_image_ctx.cct) << "snapshot is protected" << dendl; + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EBUSY); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + operation::SnapshotRemoveRequest *req = + new operation::SnapshotRemoveRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), + snap_namespace, snap_name, snap_id); + req->send(); +} + +template +int Operations::snap_rename(const char *srcname, const char *dstname) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": " + << "snap_name=" << srcname << ", " + << "new_snap_name=" << dstname << dendl; + + snapid_t snap_id; + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) + return r; + + { + std::shared_lock l{m_image_ctx.image_lock}; + snap_id = m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), srcname); + if (snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + if (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), dstname) != CEPH_NOSNAP) { + return -EEXIST; + } + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_SNAP_RENAME, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations::execute_snap_rename, + this, snap_id, dstname, _1), + boost::bind(&ImageWatcher::notify_snap_rename, + m_image_ctx.image_watcher, request_id, + snap_id, dstname, _1)); + if (r < 0 && r != -EEXIST) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + execute_snap_rename(snap_id, dstname, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + } + + m_image_ctx.perfcounter->inc(l_librbd_snap_rename); + return 0; +} + +template +void Operations::execute_snap_rename(const uint64_t src_snap_id, + const std::string &dest_snap_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + if ((m_image_ctx.features & RBD_FEATURE_JOURNALING) != 0) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + if (m_image_ctx.get_snap_id(cls::rbd::UserSnapshotNamespace(), + dest_snap_name) != CEPH_NOSNAP) { + // Renaming is supported for snapshots from user namespace only. 
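[Editorial aside, not part of the patch: for context on the user-namespace restriction checked above, a minimal caller-side sketch of the rename path through the public librbd C++ API. The image and snapshot names are hypothetical; the error mapping follows the checks in snap_rename()/execute_snap_rename().]

#include <rbd/librbd.hpp>

// Illustrative only -- "test-image", "old" and "new" are hypothetical names.
int rename_snap_example(librados::IoCtx& io_ctx) {
  librbd::RBD rbd;
  librbd::Image image;
  int r = rbd.open(io_ctx, image, "test-image");
  if (r < 0) {
    return r;  // e.g. -ENOENT if the image does not exist
  }
  // Dispatches to Operations<I>::snap_rename(): -ENOENT if "old" is missing,
  // -EEXIST if "new" already exists in the user snapshot namespace.
  r = image.snap_rename("old", "new");
  image.close();
  return r;
}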
+ m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EEXIST); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": " + << "snap_id=" << src_snap_id << ", " + << "new_snap_name=" << dest_snap_name << dendl; + + operation::SnapshotRenameRequest *req = + new operation::SnapshotRenameRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), src_snap_id, + dest_snap_name); + req->send(); +} + +template +int Operations::snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + return -EROFS; + } + + if (!m_image_ctx.test_features(RBD_FEATURE_LAYERING)) { + lderr(cct) << "image must support layering" << dendl; + return -ENOSYS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + bool is_protected; + r = m_image_ctx.is_snap_protected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_protected); + if (r < 0) { + return r; + } + + if (is_protected) { + return -EBUSY; + } + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_SNAP_PROTECT, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations::execute_snap_protect, + this, snap_namespace, snap_name, _1), + boost::bind(&ImageWatcher::notify_snap_protect, + m_image_ctx.image_watcher, request_id, + snap_namespace, snap_name, _1)); + if (r < 0 && r != -EBUSY) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + execute_snap_protect(snap_namespace, snap_name, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + } + return 0; +} + +template +void Operations::execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + bool is_protected; + int r = m_image_ctx.is_snap_protected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_protected); + if (r < 0) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(r); + return; + } else if (is_protected) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EBUSY); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + operation::SnapshotProtectRequest *request = + new operation::SnapshotProtectRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), snap_namespace, snap_name); + request->send(); +} + +template +int Operations::snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + if (m_image_ctx.read_only) { + return -EROFS; + } 
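[Editorial aside, not part of the patch: a hypothetical helper showing how the protect/unprotect error contract enforced by this pair of functions looks from the caller side, assuming an image opened read/write with the layering feature enabled.]

#include <rbd/librbd.hpp>

// Hypothetical helper: "snap1" is a placeholder snapshot name. The error
// codes mirror the checks above: snap_protect() returns -EBUSY when the
// snapshot is already protected, snap_unprotect() returns -EINVAL when it
// is already unprotected.
int protect_cycle_example(librbd::Image& image) {
  int r = image.snap_protect("snap1");
  if (r < 0 && r != -EBUSY) {
    return r;
  }
  r = image.snap_unprotect("snap1");
  return (r == -EINVAL) ? 0 : r;
}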
+ + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + bool is_unprotected; + r = m_image_ctx.is_snap_unprotected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_unprotected); + if (r < 0) { + return r; + } + + if (is_unprotected) { + return -EINVAL; + } + } + + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_SNAP_UNPROTECT, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true, + boost::bind(&Operations::execute_snap_unprotect, + this, snap_namespace, snap_name, _1), + boost::bind(&ImageWatcher::notify_snap_unprotect, + m_image_ctx.image_watcher, request_id, + snap_namespace, snap_name, _1)); + if (r < 0 && r != -EINVAL) { + return r; + } + } else { + C_SaferCond cond_ctx; + { + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + execute_snap_unprotect(snap_namespace, snap_name, &cond_ctx); + } + + r = cond_ctx.wait(); + if (r < 0) { + return r; + } + } + return 0; +} + +template +void Operations::execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + if (m_image_ctx.test_features(RBD_FEATURE_JOURNALING)) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + } + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + bool is_unprotected; + int r = m_image_ctx.is_snap_unprotected(m_image_ctx.get_snap_id(snap_namespace, snap_name), + &is_unprotected); + if (r < 0) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(r); + return; + } else if (is_unprotected) { + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EINVAL); + return; + } + m_image_ctx.image_lock.unlock_shared(); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_name=" << snap_name + << dendl; + + operation::SnapshotUnprotectRequest *request = + new operation::SnapshotUnprotectRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), snap_namespace, snap_name); + request->send(); +} + +template +int Operations::snap_set_limit(uint64_t limit) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit << dendl; + + if (m_image_ctx.read_only) { + return -EROFS; + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond limit_ctx; + { + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true); + if (r < 0) { + return r; + } + + execute_snap_set_limit(limit, &limit_ctx); + } + + r = limit_ctx.wait(); + return r; +} + +template +void Operations::execute_snap_set_limit(const uint64_t limit, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": limit=" << limit + << dendl; + + operation::SnapshotLimitRequest *request = + new operation::SnapshotLimitRequest(m_image_ctx, on_finish, limit); + request->send(); +} + +template +int Operations::update_features(uint64_t features, bool enabled) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": features=" << features + << 
", enabled=" << enabled << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } else if (m_image_ctx.old_format) { + lderr(cct) << "old-format images do not support features" << dendl; + return -EINVAL; + } + + uint64_t disable_mask = (RBD_FEATURES_MUTABLE | + RBD_FEATURES_DISABLE_ONLY); + if ((enabled && (features & RBD_FEATURES_MUTABLE) != features) || + (!enabled && (features & disable_mask) != features) || + ((features & ~RBD_FEATURES_MUTABLE_INTERNAL) != features)) { + lderr(cct) << "cannot update immutable features" << dendl; + return -EINVAL; + } + + bool set_object_map = (features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP; + bool set_fast_diff = (features & RBD_FEATURE_FAST_DIFF) == RBD_FEATURE_FAST_DIFF; + bool exist_fast_diff = (m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0; + bool exist_object_map = (m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0; + + if ((enabled && ((set_object_map && !exist_fast_diff) || (set_fast_diff && !exist_object_map))) + || (!enabled && (set_object_map && exist_fast_diff))) { + features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF); + } + + if (features == 0) { + lderr(cct) << "update requires at least one feature" << dendl; + return -EINVAL; + } + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + if (enabled && (features & m_image_ctx.features) != 0) { + lderr(cct) << "one or more requested features are already enabled" + << dendl; + return -EINVAL; + } + if (!enabled && (features & ~m_image_ctx.features) != 0) { + lderr(cct) << "one or more requested features are already disabled" + << dendl; + return -EINVAL; + } + } + + // if disabling journaling, avoid attempting to open the journal + // when acquiring the exclusive lock in case the journal is corrupt + bool disabling_journal = false; + if (!enabled && ((features & RBD_FEATURE_JOURNALING) != 0)) { + std::unique_lock image_locker{m_image_ctx.image_lock}; + m_image_ctx.set_journal_policy(new journal::DisabledPolicy()); + disabling_journal = true; + } + BOOST_SCOPE_EXIT_ALL( (this)(disabling_journal) ) { + if (disabling_journal) { + std::unique_lock image_locker{m_image_ctx.image_lock}; + m_image_ctx.set_journal_policy( + new journal::StandardPolicy(&m_image_ctx)); + } + }; + + // The journal options are not passed to the lock owner in the + // update features request. Therefore, if journaling is being + // enabled, the lock should be locally acquired instead of + // attempting to send the request to the peer. 
+ if (enabled && (features & RBD_FEATURE_JOURNALING) != 0) { + C_SaferCond cond_ctx; + { + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + r = prepare_image_update(exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + true); + if (r < 0) { + return r; + } + + execute_update_features(features, enabled, &cond_ctx, 0); + } + + r = cond_ctx.wait(); + } else { + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_UPDATE_FEATURES, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_update_features, + this, features, enabled, _1, 0), + boost::bind(&ImageWatcher::notify_update_features, + m_image_ctx.image_watcher, request_id, + features, enabled, _1)); + } + ldout(cct, 2) << "update_features finished" << dendl; + return r; +} + +template +void Operations::execute_update_features(uint64_t features, bool enabled, + Context *on_finish, + uint64_t journal_op_tid) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": features=" << features + << ", enabled=" << enabled << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + if (enabled) { + operation::EnableFeaturesRequest *req = + new operation::EnableFeaturesRequest( + m_image_ctx, on_finish, journal_op_tid, features); + req->send(); + } else { + operation::DisableFeaturesRequest *req = + new operation::DisableFeaturesRequest( + m_image_ctx, on_finish, journal_op_tid, features, false); + req->send(); + } +} + +template +int Operations::metadata_set(const std::string &key, + const std::string &value) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value=" + << value << dendl; + + std::string config_key; + bool config_override = util::is_metadata_config_override(key, &config_key); + if (config_override) { + // validate config setting + if (!librbd::api::Config::is_option_name(&m_image_ctx, config_key)) { + lderr(cct) << "validation for " << key + << " failed: not allowed image level override" << dendl; + return -EINVAL; + } + int r = ConfigProxy{false}.set_val(config_key.c_str(), value); + if (r < 0) { + return r; + } + } + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_METADATA_UPDATE, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_metadata_set, + this, key, value, _1), + boost::bind(&ImageWatcher::notify_metadata_set, + m_image_ctx.image_watcher, request_id, + key, value, _1)); + + if (config_override && r >= 0) { + // apply new config key immediately + r = m_image_ctx.state->refresh_if_required(); + } + + ldout(cct, 20) << "metadata_set finished" << dendl; + return r; +} + +template +void Operations::execute_metadata_set(const std::string &key, + const std::string &value, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << ", value=" + << value << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + operation::MetadataSetRequest *request = + new 
operation::MetadataSetRequest(m_image_ctx, + new C_NotifyUpdate(m_image_ctx, on_finish), + key, value); + request->send(); +} + +template +int Operations::metadata_remove(const std::string &key) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + std::string value; + r = cls_client::metadata_get(&m_image_ctx.md_ctx, m_image_ctx.header_oid, key, &value); + if(r < 0) + return r; + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_METADATA_UPDATE, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_metadata_remove, + this, key, _1), + boost::bind(&ImageWatcher::notify_metadata_remove, + m_image_ctx.image_watcher, request_id, + key, _1)); + if (r == -ENOENT) { + r = 0; + } + + std::string config_key; + if (util::is_metadata_config_override(key, &config_key) && r >= 0) { + // apply new config key immediately + r = m_image_ctx.state->refresh_if_required(); + } + + ldout(cct, 20) << "metadata_remove finished" << dendl; + return r; +} + +template +void Operations::execute_metadata_remove(const std::string &key, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": key=" << key << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + operation::MetadataRemoveRequest *request = + new operation::MetadataRemoveRequest( + m_image_ctx, + new C_NotifyUpdate(m_image_ctx, on_finish), key); + request->send(); +} + +template +int Operations::migrate(ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "migrate" << dendl; + + int r = m_image_ctx.state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (m_image_ctx.read_only) { + return -EROFS; + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + if (m_image_ctx.migration_info.empty()) { + lderr(cct) << "image has no migrating parent" << dendl; + return -EINVAL; + } + } + + uint64_t request_id = util::reserve_async_request_id(); + r = invoke_async_request(OPERATION_MIGRATE, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_migrate, this, + boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher::notify_migrate, + m_image_ctx.image_watcher, request_id, + boost::ref(prog_ctx), _1)); + + if (r < 0 && r != -EINVAL) { + return r; + } + ldout(cct, 20) << "migrate finished" << dendl; + return 0; +} + +template +void Operations::execute_migrate(ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "migrate" << dendl; + + if (m_image_ctx.read_only || m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.lock_shared(); + + if (m_image_ctx.migration_info.empty()) { + lderr(cct) << "image has no migrating parent" << dendl; + m_image_ctx.image_lock.unlock_shared(); + on_finish->complete(-EINVAL); + return; + } + if (m_image_ctx.snap_id != CEPH_NOSNAP) { + lderr(cct) << "snapshots cannot be migrated" << dendl; + m_image_ctx.image_lock.unlock_shared(); + 
on_finish->complete(-EROFS); + return; + } + + m_image_ctx.image_lock.unlock_shared(); + + operation::MigrateRequest *req = new operation::MigrateRequest( + m_image_ctx, new C_NotifyUpdate(m_image_ctx, on_finish), prog_ctx); + req->send(); +} + +template +int Operations::sparsify(size_t sparse_size, ProgressContext &prog_ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "sparsify" << dendl; + + if (sparse_size < 4096 || sparse_size > m_image_ctx.get_object_size() || + (sparse_size & (sparse_size - 1)) != 0) { + lderr(cct) << "sparse size should be power of two not less than 4096" + << " and not larger image object size" << dendl; + return -EINVAL; + } + + uint64_t request_id = util::reserve_async_request_id(); + int r = invoke_async_request(OPERATION_SPARSIFY, + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, + false, + boost::bind(&Operations::execute_sparsify, + this, sparse_size, + boost::ref(prog_ctx), _1), + boost::bind(&ImageWatcher::notify_sparsify, + m_image_ctx.image_watcher, + request_id, sparse_size, + boost::ref(prog_ctx), _1)); + if (r < 0 && r != -EINVAL) { + return r; + } + ldout(cct, 20) << "resparsify finished" << dendl; + return 0; +} + +template +void Operations::execute_sparsify(size_t sparse_size, + ProgressContext &prog_ctx, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "sparsify" << dendl; + + if (m_image_ctx.operations_disabled) { + on_finish->complete(-EROFS); + return; + } + + auto req = new operation::SparsifyRequest( + m_image_ctx, sparse_size, new C_NotifyUpdate(m_image_ctx, on_finish), + prog_ctx); + req->send(); +} + +template +int Operations::prepare_image_update( + exclusive_lock::OperationRequestType request_type, bool request_lock) { + ceph_assert(ceph_mutex_is_rlocked(m_image_ctx.owner_lock)); + if (m_image_ctx.image_watcher == nullptr) { + return -EROFS; + } + + // need to upgrade to a write lock + C_SaferCond ctx; + m_image_ctx.owner_lock.unlock_shared(); + bool attempting_lock = false; + { + std::unique_lock owner_locker{m_image_ctx.owner_lock}; + if (m_image_ctx.exclusive_lock != nullptr && + (!m_image_ctx.exclusive_lock->is_lock_owner() || + !m_image_ctx.exclusive_lock->accept_request(request_type, nullptr))) { + + attempting_lock = true; + m_image_ctx.exclusive_lock->block_requests(0); + + if (request_lock) { + m_image_ctx.exclusive_lock->acquire_lock(&ctx); + } else { + m_image_ctx.exclusive_lock->try_acquire_lock(&ctx); + } + } + } + + int r = 0; + if (attempting_lock) { + r = ctx.wait(); + } + + m_image_ctx.owner_lock.lock_shared(); + if (attempting_lock && m_image_ctx.exclusive_lock != nullptr) { + m_image_ctx.exclusive_lock->unblock_requests(); + } + + if (r == -EAGAIN || r == -EBUSY) { + r = 0; + } + if (r < 0) { + return r; + } else if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.exclusive_lock->is_lock_owner()) { + return m_image_ctx.exclusive_lock->get_unlocked_op_error(); + } + + return 0; +} + +template +int Operations::invoke_async_request( + Operation op, exclusive_lock::OperationRequestType request_type, + bool permit_snapshot, const boost::function& local_request, + const boost::function& remote_request) { + C_SaferCond ctx; + C_InvokeAsyncRequest *req = new C_InvokeAsyncRequest(m_image_ctx, op, + request_type, + permit_snapshot, + local_request, + remote_request, + {}, &ctx); + req->send(); + return 
ctx.wait(); +} + +} // namespace librbd + +template class librbd::Operations; diff --git a/src/librbd/Operations.h b/src/librbd/Operations.h new file mode 100644 index 000000000..52d1484e7 --- /dev/null +++ b/src/librbd/Operations.h @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATIONS_H +#define CEPH_LIBRBD_OPERATIONS_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/int_types.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/operation/ObjectMapIterate.h" +#include +#include +#include +#include +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +enum Operation { + OPERATION_CHECK_OBJECT_MAP, + OPERATION_FLATTEN, + OPERATION_METADATA_UPDATE, + OPERATION_MIGRATE, + OPERATION_REBUILD_OBJECT_MAP, + OPERATION_RENAME, + OPERATION_RESIZE, + OPERATION_SNAP_CREATE, + OPERATION_SNAP_PROTECT, + OPERATION_SNAP_REMOVE, + OPERATION_SNAP_RENAME, + OPERATION_SNAP_ROLLBACK, + OPERATION_SNAP_UNPROTECT, + OPERATION_SPARSIFY, + OPERATION_UPDATE_FEATURES, +}; + +template +class Operations { +public: + Operations(ImageCtxT &image_ctx); + + void start_op(enum Operation op, Context *ctx); + void finish_op(enum Operation op, int r); + + int flatten(ProgressContext &prog_ctx); + void execute_flatten(ProgressContext &prog_ctx, Context *on_finish); + + int rebuild_object_map(ProgressContext &prog_ctx); + void execute_rebuild_object_map(ProgressContext &prog_ctx, + Context *on_finish); + + int check_object_map(ProgressContext &prog_ctx); + void check_object_map(ProgressContext &prog_ctx, Context *on_finish); + + void object_map_iterate(ProgressContext &prog_ctx, + operation::ObjectIterateWork handle_mismatch, + Context* on_finish); + + int rename(const char *dstname); + void execute_rename(const std::string &dest_name, Context *on_finish); + + int resize(uint64_t size, bool allow_shrink, ProgressContext& prog_ctx); + void execute_resize(uint64_t size, bool allow_shrink, ProgressContext &prog_ctx, + Context *on_finish, uint64_t journal_op_tid); + + int snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name, uint64_t flags, + ProgressContext& prog_ctx); + void snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string& snap_name, uint64_t flags, + ProgressContext& prog_ctx, Context *on_finish); + void execute_snap_create(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, Context *on_finish, + uint64_t journal_op_tid, uint64_t flags, + ProgressContext &prog_ctx); + + int snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + ProgressContext& prog_ctx); + void execute_snap_rollback(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + ProgressContext& prog_ctx, Context *on_finish); + + int snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name); + void snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name, + Context *on_finish); + void execute_snap_remove(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish); + + int snap_rename(const char *srcname, const char *dstname); + void execute_snap_rename(const uint64_t src_snap_id, + const std::string &dest_snap_name, + Context *on_finish); + + int snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, 
+ const std::string& snap_name); + void execute_snap_protect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish); + + int snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name); + void execute_snap_unprotect(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + Context *on_finish); + + int snap_set_limit(uint64_t limit); + void execute_snap_set_limit(uint64_t limit, Context *on_finish); + + int update_features(uint64_t features, bool enabled); + void execute_update_features(uint64_t features, bool enabled, + Context *on_finish, uint64_t journal_op_tid); + + int metadata_set(const std::string &key, const std::string &value); + void execute_metadata_set(const std::string &key, const std::string &value, + Context *on_finish); + + int metadata_remove(const std::string &key); + void execute_metadata_remove(const std::string &key, Context *on_finish); + + int migrate(ProgressContext &prog_ctx); + void execute_migrate(ProgressContext &prog_ctx, Context *on_finish); + + int sparsify(size_t sparse_size, ProgressContext &prog_ctx); + void execute_sparsify(size_t sparse_size, ProgressContext &prog_ctx, + Context *on_finish); + + int prepare_image_update(exclusive_lock::OperationRequestType request_type, + bool request_lock); + +private: + ImageCtxT &m_image_ctx; + + mutable ceph::mutex m_queue_lock; + std::set m_in_flight_ops; + std::map> m_queued_ops; + + int invoke_async_request(Operation op, + exclusive_lock::OperationRequestType request_type, + bool permit_snapshot, + const boost::function& local, + const boost::function& remote); +}; + +} // namespace librbd + +extern template class librbd::Operations; + +#endif // CEPH_LIBRBD_OPERATIONS_H diff --git a/src/librbd/PluginRegistry.cc b/src/librbd/PluginRegistry.cc new file mode 100644 index 000000000..6ddf0a414 --- /dev/null +++ b/src/librbd/PluginRegistry.cc @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/PluginRegistry.h" +#include "include/Context.h" +#include "common/dout.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/ImageCtx.h" +#include "librbd/plugin/Api.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::PluginRegistry: " \ + << this << " " << __func__ << ": " + +namespace librbd { + +template +PluginRegistry::PluginRegistry(I* image_ctx) + : m_image_ctx(image_ctx), m_plugin_api(std::make_unique>()), + m_image_writeback(std::make_unique>(*image_ctx)) { +} + +template +PluginRegistry::~PluginRegistry() { +} + +template +void PluginRegistry::init(const std::string& plugins, Context* on_finish) { + auto cct = m_image_ctx->cct; + auto plugin_registry = cct->get_plugin_registry(); + + auto gather_ctx = new C_Gather(cct, on_finish); + + boost::tokenizer> tokenizer(plugins); + for (auto token : tokenizer) { + ldout(cct, 5) << "attempting to load plugin: " << token << dendl; + + auto ctx = gather_ctx->new_sub(); + + auto plugin = dynamic_cast*>( + plugin_registry->get_with_load("librbd", "librbd_" + token)); + if (plugin == nullptr) { + lderr(cct) << "failed to load plugin: " << token << dendl; + ctx->complete(-ENOSYS); + break; + } + + plugin->init( + m_image_ctx, *m_plugin_api, *m_image_writeback, m_plugin_hook_points, ctx); + } + + gather_ctx->activate(); +} + +template +void PluginRegistry::acquired_exclusive_lock(Context* on_finish) { + auto cct = 
m_image_ctx->cct; + ldout(cct, 20) << dendl; + + auto gather_ctx = new C_Gather(cct, on_finish); + + for (auto &hook : m_plugin_hook_points) { + auto ctx = gather_ctx->new_sub(); + hook->acquired_exclusive_lock(ctx); + } + gather_ctx->activate(); +} + +template +void PluginRegistry::prerelease_exclusive_lock(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + auto gather_ctx = new C_Gather(cct, on_finish); + + for (auto &hook : m_plugin_hook_points) { + auto ctx = gather_ctx->new_sub(); + hook->prerelease_exclusive_lock(ctx); + } + gather_ctx->activate(); +} + +template +void PluginRegistry::discard(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + auto gather_ctx = new C_Gather(cct, on_finish); + + for (auto &hook : m_plugin_hook_points) { + auto ctx = gather_ctx->new_sub(); + hook->discard(ctx); + } + gather_ctx->activate(); +} + +} // namespace librbd + +template class librbd::PluginRegistry; diff --git a/src/librbd/PluginRegistry.h b/src/librbd/PluginRegistry.h new file mode 100644 index 000000000..92e183ce1 --- /dev/null +++ b/src/librbd/PluginRegistry.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_PLUGIN_REGISTRY_H +#define CEPH_LIBRBD_PLUGIN_REGISTRY_H + +#include "librbd/plugin/Types.h" +#include +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { +class ImageWritebackInterface; +} + +namespace plugin { template struct Api; } + +template +class PluginRegistry { +public: + PluginRegistry(ImageCtxT* image_ctx); + ~PluginRegistry(); + + void init(const std::string& plugins, Context* on_finish); + + void acquired_exclusive_lock(Context* on_finish); + void prerelease_exclusive_lock(Context* on_finish); + void discard(Context* on_finish); + +private: + ImageCtxT* m_image_ctx; + std::unique_ptr> m_plugin_api; + std::unique_ptr m_image_writeback; + + std::string m_plugins; + + plugin::PluginHookPoints m_plugin_hook_points; + +}; + +} // namespace librbd + +extern template class librbd::PluginRegistry; + +#endif // CEPH_LIBRBD_PLUGIN_REGISTRY_H diff --git a/src/librbd/TaskFinisher.h b/src/librbd/TaskFinisher.h new file mode 100644 index 000000000..65e7da4a6 --- /dev/null +++ b/src/librbd/TaskFinisher.h @@ -0,0 +1,179 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef LIBRBD_TASK_FINISHER_H +#define LIBRBD_TASK_FINISHER_H + +#include "include/common_fwd.h" +#include "include/Context.h" +#include "common/ceph_context.h" +#include "common/Finisher.h" +#include "common/ceph_mutex.h" +#include "common/Timer.h" +#include +#include + + +namespace librbd { + +struct TaskFinisherSingleton { + ceph::mutex m_lock = ceph::make_mutex("librbd::TaskFinisher::m_lock"); + SafeTimer *m_safe_timer; + Finisher *m_finisher; + + static TaskFinisherSingleton& get_singleton(CephContext* cct) { + return cct->lookup_or_create_singleton_object< + TaskFinisherSingleton>("librbd::TaskFinisherSingleton", false, cct); + } + + explicit TaskFinisherSingleton(CephContext *cct) { + m_safe_timer = new SafeTimer(cct, m_lock, false); + m_safe_timer->init(); + m_finisher = new Finisher(cct, "librbd::TaskFinisher::m_finisher", "taskfin_librbd"); + m_finisher->start(); + } + virtual ~TaskFinisherSingleton() { + { + std::lock_guard l{m_lock}; + m_safe_timer->shutdown(); + delete m_safe_timer; + } + m_finisher->wait_for_empty(); + m_finisher->stop(); + delete 
m_finisher;
+  }
+
+  void queue(Context* ctx, int r) {
+    m_finisher->queue(ctx, r);
+  }
+};
+
+
+template <typename Task>
+class TaskFinisher {
+public:
+  TaskFinisher(CephContext &cct) : m_cct(cct) {
+    auto& singleton = TaskFinisherSingleton::get_singleton(&cct);
+    m_lock = &singleton.m_lock;
+    m_safe_timer = singleton.m_safe_timer;
+    m_finisher = singleton.m_finisher;
+  }
+
+  bool cancel(const Task& task) {
+    std::lock_guard l{*m_lock};
+    typename TaskContexts::iterator it = m_task_contexts.find(task);
+    if (it == m_task_contexts.end()) {
+      return false;
+    }
+    it->second.first->complete(-ECANCELED);
+    m_safe_timer->cancel_event(it->second.second);
+    m_task_contexts.erase(it);
+    return true;
+  }
+
+  void cancel_all() {
+    std::lock_guard l{*m_lock};
+    for (auto &[task, pair] : m_task_contexts) {
+      pair.first->complete(-ECANCELED);
+      m_safe_timer->cancel_event(pair.second);
+    }
+    m_task_contexts.clear();
+  }
+
+  bool add_event_after(const Task& task, double seconds, Context *ctx) {
+    std::lock_guard l{*m_lock};
+    if (m_task_contexts.count(task) != 0) {
+      // task already scheduled on finisher or timer
+      delete ctx;
+      return false;
+    }
+    C_Task *timer_ctx = new C_Task(this, task);
+    m_task_contexts[task] = std::make_pair(ctx, timer_ctx);
+
+    m_safe_timer->add_event_after(seconds, timer_ctx);
+    return true;
+  }
+
+  bool reschedule_event_after(const Task& task, double seconds) {
+    std::lock_guard l{*m_lock};
+    auto it = m_task_contexts.find(task);
+    if (it == m_task_contexts.end()) {
+      return false;
+    }
+    bool canceled = m_safe_timer->cancel_event(it->second.second);
+    if (!canceled) {
+      return false;
+    }
+    auto timer_ctx = new C_Task(this, task);
+    it->second.second = timer_ctx;
+    m_safe_timer->add_event_after(seconds, timer_ctx);
+    return true;
+  }
+
+  void queue(Context *ctx, int r = 0) {
+    m_finisher->queue(ctx, r);
+  }
+
+  bool queue(const Task& task, Context *ctx) {
+    std::lock_guard l{*m_lock};
+    typename TaskContexts::iterator it = m_task_contexts.find(task);
+    if (it != m_task_contexts.end()) {
+      if (it->second.second != NULL &&
+          m_safe_timer->cancel_event(it->second.second)) {
+        it->second.first->complete(-ECANCELED);
+      } else {
+        // task already scheduled on the finisher
+        ctx->complete(-ECANCELED);
+        return false;
+      }
+    }
+    m_task_contexts[task] = std::make_pair(ctx, reinterpret_cast<Context *>(0));
+
+    m_finisher->queue(new C_Task(this, task));
+    return true;
+  }
+
+private:
+  class C_Task : public Context {
+  public:
+    C_Task(TaskFinisher *task_finisher, const Task& task)
+      : m_task_finisher(task_finisher), m_task(task)
+    {
+    }
+  protected:
+    void finish(int r) override {
+      m_task_finisher->complete(m_task);
+    }
+  private:
+    TaskFinisher *m_task_finisher;
+    Task m_task;
+  };
+
+  CephContext &m_cct;
+
+  ceph::mutex *m_lock;
+  Finisher *m_finisher;
+  SafeTimer *m_safe_timer;
+
+  typedef std::map<Task, std::pair<Context *, Context *> > TaskContexts;
+  TaskContexts m_task_contexts;
+
+  void complete(const Task& task) {
+    Context *ctx = NULL;
+    {
+      std::lock_guard l{*m_lock};
+      typename TaskContexts::iterator it = m_task_contexts.find(task);
+      if (it != m_task_contexts.end()) {
+        ctx = it->second.first;
+        m_task_contexts.erase(it);
+      }
+    }
+
+    if (ctx != NULL) {
+      ctx->complete(0);
+    }
+  }
+};
+
+} // namespace librbd
+
+#endif // LIBRBD_TASK_FINISHER
diff --git a/src/librbd/TrashWatcher.cc b/src/librbd/TrashWatcher.cc
new file mode 100644
index 000000000..75d588205
--- /dev/null
+++ b/src/librbd/TrashWatcher.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include
"librbd/TrashWatcher.h" +#include "include/rbd_types.h" +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/watcher/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::TrashWatcher: " << __func__ << ": " + +namespace librbd { + +using namespace trash_watcher; +using namespace watcher; + +using librbd::util::create_rados_callback; + +namespace { + +static const uint64_t NOTIFY_TIMEOUT_MS = 5000; + +} // anonymous namespace + +template +TrashWatcher::TrashWatcher(librados::IoCtx &io_ctx, + asio::ContextWQ *work_queue) + : Watcher(io_ctx, work_queue, RBD_TRASH) { +} + +template +void TrashWatcher::notify_image_added( + librados::IoCtx &io_ctx, const std::string& image_id, + const cls::rbd::TrashImageSpec& trash_image_spec, Context *on_finish) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ImageAddedPayload{image_id, trash_image_spec}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_TRASH, comp, bl, NOTIFY_TIMEOUT_MS, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template +void TrashWatcher::notify_image_removed(librados::IoCtx &io_ctx, + const std::string& image_id, + Context *on_finish) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + bufferlist bl; + encode(NotifyMessage{ImageRemovedPayload{image_id}}, bl); + + librados::AioCompletion *comp = create_rados_callback(on_finish); + int r = io_ctx.aio_notify(RBD_TRASH, comp, bl, NOTIFY_TIMEOUT_MS, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template +void TrashWatcher::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) { + CephContext *cct = this->m_cct; + ldout(cct, 15) << "notify_id=" << notify_id << ", " + << "handle=" << handle << dendl; + + + NotifyMessage notify_message; + try { + auto iter = bl.cbegin(); + decode(notify_message, iter); + } catch (const buffer::error &err) { + lderr(cct) << "error decoding image notification: " << err.what() + << dendl; + Context *ctx = new C_NotifyAck(this, notify_id, handle); + ctx->complete(0); + return; + } + + apply_visitor(watcher::util::HandlePayloadVisitor>( + this, notify_id, handle), notify_message.payload); +} + +template +bool TrashWatcher::handle_payload(const ImageAddedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << dendl; + handle_image_added(payload.image_id, payload.trash_image_spec); + return true; +} + +template +bool TrashWatcher::handle_payload(const ImageRemovedPayload &payload, + Context *on_notify_ack) { + CephContext *cct = this->m_cct; + ldout(cct, 20) << dendl; + handle_image_removed(payload.image_id); + return true; +} + +template +bool TrashWatcher::handle_payload(const UnknownPayload &payload, + Context *on_notify_ack) { + return true; +} + +} // namespace librbd + +template class librbd::TrashWatcher; diff --git a/src/librbd/TrashWatcher.h b/src/librbd/TrashWatcher.h new file mode 100644 index 000000000..684eaf4f5 --- /dev/null +++ b/src/librbd/TrashWatcher.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_TRASH_WATCHER_H +#define CEPH_LIBRBD_TRASH_WATCHER_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include 
"librbd/ImageCtx.h" +#include "librbd/Watcher.h" +#include "librbd/trash_watcher/Types.h" + +namespace librbd { + +namespace asio { struct ContextWQ; } +namespace watcher { +namespace util { +template struct HandlePayloadVisitor; +} // namespace util +} // namespace watcher + +template +class TrashWatcher : public Watcher { + friend struct watcher::util::HandlePayloadVisitor>; +public: + TrashWatcher(librados::IoCtx &io_ctx, asio::ContextWQ *work_queue); + + static void notify_image_added(librados::IoCtx &io_ctx, + const std::string& image_id, + const cls::rbd::TrashImageSpec& spec, + Context *on_finish); + static void notify_image_removed(librados::IoCtx &io_ctx, + const std::string& image_id, + Context *on_finish); + +protected: + virtual void handle_image_added(const std::string &image_id, + const cls::rbd::TrashImageSpec& spec) = 0; + virtual void handle_image_removed(const std::string &image_id) = 0; + +private: + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist &bl) override; + + bool handle_payload(const trash_watcher::ImageAddedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const trash_watcher::ImageRemovedPayload &payload, + Context *on_notify_ack); + bool handle_payload(const trash_watcher::UnknownPayload &payload, + Context *on_notify_ack); +}; + +} // namespace librbd + +extern template class librbd::TrashWatcher; + +#endif // CEPH_LIBRBD_TRASH_WATCHER_H diff --git a/src/librbd/Types.h b/src/librbd/Types.h new file mode 100644 index 000000000..f1c7d6c5d --- /dev/null +++ b/src/librbd/Types.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_TYPES_H +#define LIBRBD_TYPES_H + +#include "include/types.h" +#include "cls/rbd/cls_rbd_types.h" +#include "deep_copy/Types.h" +#include +#include +#include + +namespace neorados { class IOContext; } + +namespace librbd { + +// Performance counters +enum { + l_librbd_first = 26000, + + l_librbd_rd, // read ops + l_librbd_rd_bytes, // bytes read + l_librbd_rd_latency, // average latency + l_librbd_wr, + l_librbd_wr_bytes, + l_librbd_wr_latency, + l_librbd_discard, + l_librbd_discard_bytes, + l_librbd_discard_latency, + l_librbd_flush, + l_librbd_flush_latency, + + l_librbd_ws, + l_librbd_ws_bytes, + l_librbd_ws_latency, + + l_librbd_cmp, + l_librbd_cmp_bytes, + l_librbd_cmp_latency, + + l_librbd_snap_create, + l_librbd_snap_remove, + l_librbd_snap_rollback, + l_librbd_snap_rename, + + l_librbd_notify, + l_librbd_resize, + + l_librbd_readahead, + l_librbd_readahead_bytes, + + l_librbd_invalidate_cache, + + l_librbd_opened_time, + l_librbd_lock_acquired_time, + + l_librbd_last, +}; + +typedef std::shared_ptr IOContext; + +typedef std::map SnapSeqs; + +/// Full information about an image's parent. +struct ParentImageInfo { + /// Identification of the parent. + cls::rbd::ParentImageSpec spec; + + /** @brief Where the portion of data shared with the child image ends. + * Since images can be resized multiple times, the portion of data shared + * with the child image is not necessarily min(parent size, child size). + * If the child image is first shrunk and then enlarged, the common portion + * will be shorter. 
*/ + uint64_t overlap = 0; +}; + +struct SnapInfo { + std::string name; + cls::rbd::SnapshotNamespace snap_namespace; + uint64_t size; + ParentImageInfo parent; + uint8_t protection_status; + uint64_t flags; + utime_t timestamp; + SnapInfo(std::string _name, + const cls::rbd::SnapshotNamespace &_snap_namespace, + uint64_t _size, const ParentImageInfo &_parent, + uint8_t _protection_status, uint64_t _flags, utime_t _timestamp) + : name(_name), snap_namespace(_snap_namespace), size(_size), + parent(_parent), protection_status(_protection_status), flags(_flags), + timestamp(_timestamp) { + } +}; + +enum { + OPEN_FLAG_SKIP_OPEN_PARENT = 1 << 0, + OPEN_FLAG_OLD_FORMAT = 1 << 1, + OPEN_FLAG_IGNORE_MIGRATING = 1 << 2 +}; + +enum ImageReadOnlyFlag { + IMAGE_READ_ONLY_FLAG_USER = 1 << 0, + IMAGE_READ_ONLY_FLAG_NON_PRIMARY = 1 << 1, +}; + +enum SnapCreateFlag { + SNAP_CREATE_FLAG_SKIP_OBJECT_MAP = 1 << 0, + SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE = 1 << 1, + SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR = 1 << 2, +}; + +struct MigrationInfo { + int64_t pool_id = -1; + std::string pool_namespace; + std::string image_name; + std::string image_id; + std::string source_spec; + deep_copy::SnapMap snap_map; + uint64_t overlap = 0; + bool flatten = false; + + MigrationInfo() { + } + MigrationInfo(int64_t pool_id, const std::string& pool_namespace, + const std::string& image_name, const std::string& image_id, + const std::string& source_spec, + const deep_copy::SnapMap &snap_map, uint64_t overlap, + bool flatten) + : pool_id(pool_id), pool_namespace(pool_namespace), image_name(image_name), + image_id(image_id), source_spec(source_spec), snap_map(snap_map), + overlap(overlap), flatten(flatten) { + } + + bool empty() const { + return (pool_id == -1 && source_spec.empty()); + } +}; + +} // namespace librbd + +#endif // LIBRBD_TYPES_H diff --git a/src/librbd/Utils.cc b/src/librbd/Utils.cc new file mode 100644 index 000000000..0ea31fc1c --- /dev/null +++ b/src/librbd/Utils.cc @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "librbd/Utils.h" +#include "include/random.h" +#include "include/rbd_types.h" +#include "include/stringify.h" +#include "include/neorados/RADOS.hpp" +#include "include/rbd/features.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Features.h" + +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::util::" << __func__ << ": " + +namespace librbd { +namespace util { +namespace { + +const std::string CONFIG_KEY_URI_PREFIX{"config://"}; + +} // anonymous namespace + +const std::string group_header_name(const std::string &group_id) +{ + return RBD_GROUP_HEADER_PREFIX + group_id; +} + +const std::string id_obj_name(const std::string &name) +{ + return RBD_ID_PREFIX + name; +} + +const std::string header_name(const std::string &image_id) +{ + return RBD_HEADER_PREFIX + image_id; +} + +const std::string old_header_name(const std::string &image_name) +{ + return image_name + RBD_SUFFIX; +} + +std::string unique_lock_name(const std::string &name, void *address) { + return name + " (" + stringify(address) + ")"; +} + +librados::AioCompletion *create_rados_callback(Context *on_finish) { + return create_rados_callback(on_finish); +} + +std::string generate_image_id(librados::IoCtx &ioctx) { + librados::Rados rados(ioctx); + + uint64_t bid = rados.get_instance_id(); + std::mt19937 
generator{random_device_t{}()}; + std::uniform_int_distribution distribution{0, 0xFFFFFFFF}; + uint32_t extra = distribution(generator); + + std::ostringstream bid_ss; + bid_ss << std::hex << bid << std::hex << extra; + std::string id = bid_ss.str(); + + // ensure the image id won't overflow the fixed block name size + if (id.length() > RBD_MAX_IMAGE_ID_LENGTH) { + id = id.substr(id.length() - RBD_MAX_IMAGE_ID_LENGTH); + } + + return id; +} + +uint64_t get_rbd_default_features(CephContext* cct) +{ + auto value = cct->_conf.get_val("rbd_default_features"); + return librbd::rbd_features_from_string(value, nullptr); +} + + +bool calc_sparse_extent(const bufferptr &bp, + size_t sparse_size, + uint64_t length, + size_t *write_offset, + size_t *write_length, + size_t *offset) { + size_t extent_size; + if (*offset + sparse_size > length) { + extent_size = length - *offset; + } else { + extent_size = sparse_size; + } + + bufferptr extent(bp, *offset, extent_size); + *offset += extent_size; + + bool extent_is_zero = extent.is_zero(); + if (!extent_is_zero) { + *write_length += extent_size; + } + if (extent_is_zero && *write_length == 0) { + *write_offset += extent_size; + } + + if ((extent_is_zero || *offset == length) && *write_length != 0) { + return true; + } + return false; +} + +bool is_metadata_config_override(const std::string& metadata_key, + std::string* config_key) { + size_t prefix_len = librbd::ImageCtx::METADATA_CONF_PREFIX.size(); + if (metadata_key.size() > prefix_len && + metadata_key.compare(0, prefix_len, + librbd::ImageCtx::METADATA_CONF_PREFIX) == 0) { + *config_key = metadata_key.substr(prefix_len, + metadata_key.size() - prefix_len); + return true; + } + return false; +} + +int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc, + int64_t pool_id, + const std::optional& pool_namespace, + librados::IoCtx* dst_io_ctx) { + auto cct = (CephContext *)src_io_ctx.cct(); + + librados::Rados rados(src_io_ctx); + int r = rados.ioctx_create2(pool_id, *dst_io_ctx); + if (r == -ENOENT) { + ldout(cct, 1) << pool_desc << " pool " << pool_id << " no longer exists" + << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "error accessing " << pool_desc << " pool " << pool_id + << dendl; + return r; + } + + dst_io_ctx->set_namespace( + pool_namespace ? 
*pool_namespace : src_io_ctx.get_namespace()); + if (src_io_ctx.get_pool_full_try()) { + dst_io_ctx->set_pool_full_try(); + } + return 0; +} + +int snap_create_flags_api_to_internal(CephContext *cct, uint32_t api_flags, + uint64_t *internal_flags) { + *internal_flags = 0; + + if (api_flags & RBD_SNAP_CREATE_SKIP_QUIESCE) { + *internal_flags |= SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE; + api_flags &= ~RBD_SNAP_CREATE_SKIP_QUIESCE; + } else if (api_flags & RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR) { + *internal_flags |= SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR; + api_flags &= ~RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR; + } + + if (api_flags != 0) { + lderr(cct) << "invalid snap create flags: " + << std::bitset<32>(api_flags) << dendl; + return -EINVAL; + } + + return 0; +} + +uint32_t get_default_snap_create_flags(ImageCtx *ictx) { + auto mode = ictx->config.get_val( + "rbd_default_snapshot_quiesce_mode"); + + if (mode == "required") { + return 0; + } else if (mode == "ignore-error") { + return RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR; + } else if (mode == "skip") { + return RBD_SNAP_CREATE_SKIP_QUIESCE; + } else { + ceph_abort_msg("invalid rbd_default_snapshot_quiesce_mode"); + } +} + +SnapContext get_snap_context( + const std::optional< + std::pair>>& write_snap_context) { + SnapContext snapc; + if (write_snap_context) { + snapc = SnapContext{write_snap_context->first, + {write_snap_context->second.begin(), + write_snap_context->second.end()}}; + } + return snapc; +} + +uint64_t reserve_async_request_id() { + static std::atomic async_request_seq = 0; + + return ++async_request_seq; +} + +bool is_config_key_uri(const std::string& uri) { + return boost::starts_with(uri, CONFIG_KEY_URI_PREFIX); +} + +int get_config_key(librados::Rados& rados, const std::string& uri, + std::string* value) { + auto cct = reinterpret_cast(rados.cct()); + + if (!is_config_key_uri(uri)) { + return -EINVAL; + } + + std::string key = uri.substr(CONFIG_KEY_URI_PREFIX.size()); + std::string cmd = + "{" + "\"prefix\": \"config-key get\", " + "\"key\": \"" + key + "\"" + "}"; + + bufferlist in_bl; + bufferlist out_bl; + int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r < 0) { + lderr(cct) << "failed to retrieve MON config key " << key << ": " + << cpp_strerror(r) << dendl; + return r; + } + + *value = std::string(out_bl.c_str(), out_bl.length()); + return 0; +} + +} // namespace util +} // namespace librbd diff --git a/src/librbd/Utils.h b/src/librbd/Utils.h new file mode 100644 index 000000000..dee91feee --- /dev/null +++ b/src/librbd/Utils.h @@ -0,0 +1,286 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_UTILS_H +#define CEPH_LIBRBD_UTILS_H + +#include "include/rados/librados.hpp" +#include "include/rbd_types.h" +#include "include/ceph_assert.h" +#include "include/Context.h" +#include "common/snap_types.h" +#include "common/zipkin_trace.h" +#include "common/RefCountedObj.h" + +#include +#include +#include +#include +#include +#include + +namespace librbd { + +class ImageCtx; + +namespace util { +namespace detail { + +template +void rados_callback(rados_completion_t c, void *arg) { + reinterpret_cast(arg)->complete(rados_aio_get_return_value(c)); +} + +template +void rados_callback(rados_completion_t c, void *arg) { + T *obj = reinterpret_cast(arg); + int r = rados_aio_get_return_value(c); + (obj->*MF)(r); +} + +template +void rados_state_callback(rados_completion_t c, void *arg) { + T *obj = reinterpret_cast(arg); + int r = 
rados_aio_get_return_value(c); + Context *on_finish = (obj->*MF)(&r); + if (on_finish != nullptr) { + on_finish->complete(r); + if (destroy) { + delete obj; + } + } +} + +template +class C_CallbackAdapter : public Context { + T *obj; +public: + C_CallbackAdapter(T *obj) : obj(obj) { + } + +protected: + void finish(int r) override { + (obj->*MF)(r); + } +}; + +template +class C_RefCallbackAdapter : public Context { + RefCountedPtr refptr; + Context *on_finish; + +public: + C_RefCallbackAdapter(T *obj, RefCountedPtr refptr) + : refptr(std::move(refptr)), + on_finish(new C_CallbackAdapter(obj)) { + } + +protected: + void finish(int r) override { + on_finish->complete(r); + } +}; + +template +class C_StateCallbackAdapter : public Context { + T *obj; +public: + C_StateCallbackAdapter(T *obj) : obj(obj){ + } + +protected: + void complete(int r) override { + Context *on_finish = (obj->*MF)(&r); + if (on_finish != nullptr) { + on_finish->complete(r); + if (destroy) { + delete obj; + } + } + Context::complete(r); + } + void finish(int r) override { + } +}; + +template +class C_RefStateCallbackAdapter : public Context { + RefCountedPtr refptr; + Context *on_finish; + +public: + C_RefStateCallbackAdapter(T *obj, RefCountedPtr refptr) + : refptr(std::move(refptr)), + on_finish(new C_StateCallbackAdapter(obj)) { + } + +protected: + void finish(int r) override { + on_finish->complete(r); + } +}; + +template +struct C_AsyncCallback : public Context { + WQ *op_work_queue; + Context *on_finish; + + C_AsyncCallback(WQ *op_work_queue, Context *on_finish) + : op_work_queue(op_work_queue), on_finish(on_finish) { + } + ~C_AsyncCallback() override { + delete on_finish; + } + void finish(int r) override { + op_work_queue->queue(on_finish, r); + on_finish = nullptr; + } +}; + +} // namespace detail + +std::string generate_image_id(librados::IoCtx &ioctx); + +template +inline std::string generate_image_id(librados::IoCtx &ioctx) { + return generate_image_id(ioctx); +} + +const std::string group_header_name(const std::string &group_id); +const std::string id_obj_name(const std::string &name); +const std::string header_name(const std::string &image_id); +const std::string old_header_name(const std::string &image_name); +std::string unique_lock_name(const std::string &name, void *address); + +template +std::string data_object_name(I* image_ctx, uint64_t object_no) { + char buf[RBD_MAX_OBJ_NAME_SIZE]; + size_t length = snprintf(buf, RBD_MAX_OBJ_NAME_SIZE, + image_ctx->format_string, object_no); + ceph_assert(length < RBD_MAX_OBJ_NAME_SIZE); + + std::string oid; + oid.reserve(RBD_MAX_OBJ_NAME_SIZE); + oid.append(buf, length); + return oid; +} + +librados::AioCompletion *create_rados_callback(Context *on_finish); + +template +librados::AioCompletion *create_rados_callback(T *obj) { + return librados::Rados::aio_create_completion( + obj, &detail::rados_callback); +} + +template +librados::AioCompletion *create_rados_callback(T *obj) { + return librados::Rados::aio_create_completion( + obj, &detail::rados_callback); +} + +template +librados::AioCompletion *create_rados_callback(T *obj) { + return librados::Rados::aio_create_completion( + obj, &detail::rados_state_callback); +} + +template +Context *create_context_callback(T *obj) { + return new detail::C_CallbackAdapter(obj); +} + +template +Context *create_context_callback(T *obj) { + return new detail::C_StateCallbackAdapter(obj); +} + +//for reference counting objects +template +Context *create_context_callback(T *obj, RefCountedPtr refptr) { + return new 
detail::C_RefCallbackAdapter(obj, refptr); +} + +template +Context *create_context_callback(T *obj, RefCountedPtr refptr) { + return new detail::C_RefStateCallbackAdapter(obj, refptr); +} + +//for objects that don't inherit from RefCountedObj, to handle unit tests +template +typename std::enable_if::value, Context*>::type +create_context_callback(T *obj, R *refptr) { + return new detail::C_CallbackAdapter(obj); +} + +template +typename std::enable_if::value, Context*>::type +create_context_callback(T *obj, R *refptr) { + return new detail::C_StateCallbackAdapter(obj); +} + +template +Context *create_async_context_callback(I &image_ctx, Context *on_finish) { + // use async callback to acquire a clean lock context + return new detail::C_AsyncCallback< + typename std::decay::type>( + image_ctx.op_work_queue, on_finish); +} + +template +Context *create_async_context_callback(WQ *work_queue, Context *on_finish) { + // use async callback to acquire a clean lock context + return new detail::C_AsyncCallback(work_queue, on_finish); +} + +// TODO: temporary until AioCompletion supports templated ImageCtx +inline ImageCtx *get_image_ctx(ImageCtx *image_ctx) { + return image_ctx; +} + +uint64_t get_rbd_default_features(CephContext* cct); + +bool calc_sparse_extent(const bufferptr &bp, + size_t sparse_size, + uint64_t length, + size_t *write_offset, + size_t *write_length, + size_t *offset); + +template +inline ZTracer::Trace create_trace(const I &image_ctx, const char *trace_name, + const ZTracer::Trace &parent_trace) { + if (parent_trace.valid()) { + return ZTracer::Trace(trace_name, &image_ctx.trace_endpoint, &parent_trace); + } + return ZTracer::Trace(); +} + +bool is_metadata_config_override(const std::string& metadata_key, + std::string* config_key); + +int create_ioctx(librados::IoCtx& src_io_ctx, const std::string& pool_desc, + int64_t pool_id, + const std::optional& pool_namespace, + librados::IoCtx* dst_io_ctx); + +int snap_create_flags_api_to_internal(CephContext *cct, uint32_t api_flags, + uint64_t *internal_flags); + +uint32_t get_default_snap_create_flags(ImageCtx *ictx); + +SnapContext get_snap_context( + const std::optional< + std::pair>>& write_snap_context); + +uint64_t reserve_async_request_id(); + +bool is_config_key_uri(const std::string& uri); +int get_config_key(librados::Rados& rados, const std::string& uri, + std::string* value); + +} // namespace util +} // namespace librbd + +#endif // CEPH_LIBRBD_UTILS_H diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc new file mode 100644 index 000000000..413983f3e --- /dev/null +++ b/src/librbd/WatchNotifyTypes.cc @@ -0,0 +1,557 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cls/rbd/cls_rbd_types.h" +#include "common/Formatter.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "librbd/WatchNotifyTypes.h" + +namespace librbd { +namespace watch_notify { + +void AsyncRequestId::encode(bufferlist &bl) const { + using ceph::encode; + encode(client_id, bl); + encode(request_id, bl); +} + +void AsyncRequestId::decode(bufferlist::const_iterator &iter) { + using ceph::decode; + decode(client_id, iter); + decode(request_id, iter); +} + +void AsyncRequestId::dump(Formatter *f) const { + f->open_object_section("client_id"); + client_id.dump(f); + f->close_section(); + f->dump_unsigned("request_id", request_id); +} + +void AcquiredLockPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(client_id, bl); +} + 
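+// The payload types in this file all follow the same versioned wire
+// pattern: encode() always writes every field, while decode() consults
+// the struct version captured by NotifyMessage::decode() (via
+// DECODE_START) and only reads fields that existed in that version.
+// A minimal round-trip sketch of how these types are used -- purely
+// illustrative, not part of the original file; "bl" and the sample
+// payload are hypothetical:
+//
+//   using namespace librbd::watch_notify;
+//   bufferlist bl;
+//   NotifyMessage out(new AcquiredLockPayload(ClientId(1, 2)));
+//   encode(out, bl);    // writes the op code plus the versioned payload
+//   NotifyMessage in;
+//   auto it = bl.cbegin();
+//   decode(in, it);     // re-selects the payload type from the op code
+//   ceph_assert(in.get_notify_op() == NOTIFY_OP_ACQUIRED_LOCK);
+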
+void AcquiredLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + if (version >= 2) { + decode(client_id, iter); + } +} + +void AcquiredLockPayload::dump(Formatter *f) const { + f->open_object_section("client_id"); + client_id.dump(f); + f->close_section(); +} + +void ReleasedLockPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(client_id, bl); +} + +void ReleasedLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + if (version >= 2) { + decode(client_id, iter); + } +} + +void ReleasedLockPayload::dump(Formatter *f) const { + f->open_object_section("client_id"); + client_id.dump(f); + f->close_section(); +} + +void RequestLockPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(client_id, bl); + encode(force, bl); +} + +void RequestLockPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + if (version >= 2) { + decode(client_id, iter); + } + if (version >= 3) { + decode(force, iter); + } +} + +void RequestLockPayload::dump(Formatter *f) const { + f->open_object_section("client_id"); + client_id.dump(f); + f->close_section(); + f->dump_bool("force", force); +} + +void HeaderUpdatePayload::encode(bufferlist &bl) const { +} + +void HeaderUpdatePayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void HeaderUpdatePayload::dump(Formatter *f) const { +} + +void AsyncRequestPayloadBase::encode(bufferlist &bl) const { + using ceph::encode; + encode(async_request_id, bl); +} + +void AsyncRequestPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(async_request_id, iter); +} + +void AsyncRequestPayloadBase::dump(Formatter *f) const { + f->open_object_section("async_request_id"); + async_request_id.dump(f); + f->close_section(); +} + +void AsyncProgressPayload::encode(bufferlist &bl) const { + using ceph::encode; + AsyncRequestPayloadBase::encode(bl); + encode(offset, bl); + encode(total, bl); +} + +void AsyncProgressPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + AsyncRequestPayloadBase::decode(version, iter); + decode(offset, iter); + decode(total, iter); +} + +void AsyncProgressPayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_unsigned("offset", offset); + f->dump_unsigned("total", total); +} + +void AsyncCompletePayload::encode(bufferlist &bl) const { + using ceph::encode; + AsyncRequestPayloadBase::encode(bl); + encode(result, bl); +} + +void AsyncCompletePayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + AsyncRequestPayloadBase::decode(version, iter); + decode(result, iter); +} + +void AsyncCompletePayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_int("result", result); +} + +void ResizePayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(size, bl); + AsyncRequestPayloadBase::encode(bl); + encode(allow_shrink, bl); +} + +void ResizePayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(size, iter); + AsyncRequestPayloadBase::decode(version, iter); + + if (version >= 4) { + decode(allow_shrink, iter); + } +} + +void ResizePayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_unsigned("size", size); + f->dump_bool("allow_shrink", allow_shrink); +} + +void SnapPayloadBase::encode(bufferlist &bl) const { + using ceph::encode; + encode(snap_name, bl); 
+ encode(snap_namespace, bl); + encode(async_request_id, bl); +} + +void SnapPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(snap_name, iter); + if (version >= 6) { + decode(snap_namespace, iter); + } + if (version >= 7) { + decode(async_request_id, iter); + } +} + +void SnapPayloadBase::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_string("snap_name", snap_name); + snap_namespace.dump(f); +} + +void SnapCreatePayload::encode(bufferlist &bl) const { + using ceph::encode; + SnapPayloadBase::encode(bl); + encode(flags, bl); +} + +void SnapCreatePayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + SnapPayloadBase::decode(version, iter); + if (version == 5) { + decode(snap_namespace, iter); + } + if (version >= 7) { + decode(flags, iter); + } +} + +void SnapCreatePayload::dump(Formatter *f) const { + SnapPayloadBase::dump(f); + f->dump_unsigned("flags", flags); +} + +void SnapRenamePayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(snap_id, bl); + SnapPayloadBase::encode(bl); +} + +void SnapRenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(snap_id, iter); + SnapPayloadBase::decode(version, iter); +} + +void SnapRenamePayload::dump(Formatter *f) const { + SnapPayloadBase::dump(f); + f->dump_unsigned("src_snap_id", snap_id); +} + +void RenamePayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(image_name, bl); + encode(async_request_id, bl); +} + +void RenamePayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(image_name, iter); + if (version >= 7) { + decode(async_request_id, iter); + } +} + +void RenamePayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_string("image_name", image_name); +} + +void UpdateFeaturesPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(features, bl); + encode(enabled, bl); + encode(async_request_id, bl); +} + +void UpdateFeaturesPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(features, iter); + decode(enabled, iter); + if (version >= 7) { + decode(async_request_id, iter); + } +} + +void UpdateFeaturesPayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_unsigned("features", features); + f->dump_bool("enabled", enabled); +} + +void SparsifyPayload::encode(bufferlist &bl) const { + using ceph::encode; + AsyncRequestPayloadBase::encode(bl); + encode(sparse_size, bl); +} + +void SparsifyPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + AsyncRequestPayloadBase::decode(version, iter); + decode(sparse_size, iter); +} + +void SparsifyPayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_unsigned("sparse_size", sparse_size); +} + +void MetadataUpdatePayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(key, bl); + encode(value, bl); + encode(async_request_id, bl); +} + +void MetadataUpdatePayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + decode(key, iter); + decode(value, iter); + if (version >= 7) { + decode(async_request_id, iter); + } +} + +void MetadataUpdatePayload::dump(Formatter *f) const { + AsyncRequestPayloadBase::dump(f); + f->dump_string("key", key); + f->dump_string("value", *value); +} + +void UnknownPayload::encode(bufferlist &bl) const { + ceph_abort(); +} + +void 
UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+bool NotifyMessage::check_for_refresh() const {
+  return payload->check_for_refresh();
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+  ENCODE_START(7, 1, bl);
+  encode(static_cast<uint32_t>(payload->get_notify_op()), bl);
+  payload->encode(bl);
+  ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+  DECODE_START(1, iter);
+
+  uint32_t notify_op;
+  decode(notify_op, iter);
+
+  // select the correct payload variant based upon the encoded op
+  switch (notify_op) {
+  case NOTIFY_OP_ACQUIRED_LOCK:
+    payload.reset(new AcquiredLockPayload());
+    break;
+  case NOTIFY_OP_RELEASED_LOCK:
+    payload.reset(new ReleasedLockPayload());
+    break;
+  case NOTIFY_OP_REQUEST_LOCK:
+    payload.reset(new RequestLockPayload());
+    break;
+  case NOTIFY_OP_HEADER_UPDATE:
+    payload.reset(new HeaderUpdatePayload());
+    break;
+  case NOTIFY_OP_ASYNC_PROGRESS:
+    payload.reset(new AsyncProgressPayload());
+    break;
+  case NOTIFY_OP_ASYNC_COMPLETE:
+    payload.reset(new AsyncCompletePayload());
+    break;
+  case NOTIFY_OP_FLATTEN:
+    payload.reset(new FlattenPayload());
+    break;
+  case NOTIFY_OP_RESIZE:
+    payload.reset(new ResizePayload());
+    break;
+  case NOTIFY_OP_SNAP_CREATE:
+    payload.reset(new SnapCreatePayload());
+    break;
+  case NOTIFY_OP_SNAP_REMOVE:
+    payload.reset(new SnapRemovePayload());
+    break;
+  case NOTIFY_OP_SNAP_RENAME:
+    payload.reset(new SnapRenamePayload());
+    break;
+  case NOTIFY_OP_SNAP_PROTECT:
+    payload.reset(new SnapProtectPayload());
+    break;
+  case NOTIFY_OP_SNAP_UNPROTECT:
+    payload.reset(new SnapUnprotectPayload());
+    break;
+  case NOTIFY_OP_REBUILD_OBJECT_MAP:
+    payload.reset(new RebuildObjectMapPayload());
+    break;
+  case NOTIFY_OP_RENAME:
+    payload.reset(new RenamePayload());
+    break;
+  case NOTIFY_OP_UPDATE_FEATURES:
+    payload.reset(new UpdateFeaturesPayload());
+    break;
+  case NOTIFY_OP_MIGRATE:
+    payload.reset(new MigratePayload());
+    break;
+  case NOTIFY_OP_SPARSIFY:
+    payload.reset(new SparsifyPayload());
+    break;
+  case NOTIFY_OP_QUIESCE:
+    payload.reset(new QuiescePayload());
+    break;
+  case NOTIFY_OP_UNQUIESCE:
+    payload.reset(new UnquiescePayload());
+    break;
+  case NOTIFY_OP_METADATA_UPDATE:
+    payload.reset(new MetadataUpdatePayload());
+    break;
+  }
+
+  payload->decode(struct_v, iter);
+  DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+  payload->dump(f);
+}
+
+NotifyOp NotifyMessage::get_notify_op() const {
+  return payload->get_notify_op();
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+  o.push_back(new NotifyMessage(new AcquiredLockPayload(ClientId(1, 2))));
+  o.push_back(new NotifyMessage(new ReleasedLockPayload(ClientId(1, 2))));
+  o.push_back(new NotifyMessage(new RequestLockPayload(ClientId(1, 2), true)));
+  o.push_back(new NotifyMessage(new HeaderUpdatePayload()));
+  o.push_back(new NotifyMessage(new AsyncProgressPayload(AsyncRequestId(ClientId(0, 1), 2), 3, 4)));
+  o.push_back(new NotifyMessage(new AsyncCompletePayload(AsyncRequestId(ClientId(0, 1), 2), 3)));
+  o.push_back(new NotifyMessage(new FlattenPayload(AsyncRequestId(ClientId(0, 1), 2))));
+  o.push_back(new NotifyMessage(new ResizePayload(AsyncRequestId(ClientId(0, 1), 2), 123, true)));
+  o.push_back(new NotifyMessage(new SnapCreatePayload(AsyncRequestId(ClientId(0, 1), 2),
+                                                      cls::rbd::UserSnapshotNamespace(),
+                                                      "foo", 1)));
+  o.push_back(new NotifyMessage(new SnapRemovePayload(AsyncRequestId(ClientId(0, 1), 2),
cls::rbd::UserSnapshotNamespace(), "foo"))); + o.push_back(new NotifyMessage(new SnapProtectPayload(AsyncRequestId(ClientId(0, 1), 2), + cls::rbd::UserSnapshotNamespace(), "foo"))); + o.push_back(new NotifyMessage(new SnapUnprotectPayload(AsyncRequestId(ClientId(0, 1), 2), + cls::rbd::UserSnapshotNamespace(), "foo"))); + o.push_back(new NotifyMessage(new RebuildObjectMapPayload(AsyncRequestId(ClientId(0, 1), 2)))); + o.push_back(new NotifyMessage(new RenamePayload(AsyncRequestId(ClientId(0, 1), 2), "foo"))); + o.push_back(new NotifyMessage(new UpdateFeaturesPayload(AsyncRequestId(ClientId(0, 1), 2), + 1, true))); + o.push_back(new NotifyMessage(new MigratePayload(AsyncRequestId(ClientId(0, 1), 2)))); + o.push_back(new NotifyMessage(new SparsifyPayload(AsyncRequestId(ClientId(0, 1), 2), 1))); + o.push_back(new NotifyMessage(new QuiescePayload(AsyncRequestId(ClientId(0, 1), 2)))); + o.push_back(new NotifyMessage(new UnquiescePayload(AsyncRequestId(ClientId(0, 1), 2)))); + o.push_back(new NotifyMessage(new MetadataUpdatePayload(AsyncRequestId(ClientId(0, 1), 2), + "foo", std::optional{"xyz"}))); +} + +void ResponseMessage::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(result, bl); + ENCODE_FINISH(bl); +} + +void ResponseMessage::decode(bufferlist::const_iterator& iter) { + DECODE_START(1, iter); + decode(result, iter); + DECODE_FINISH(iter); +} + +void ResponseMessage::dump(Formatter *f) const { + f->dump_int("result", result); +} + +void ResponseMessage::generate_test_instances(std::list &o) { + o.push_back(new ResponseMessage(1)); +} + +std::ostream &operator<<(std::ostream &out, + const librbd::watch_notify::NotifyOp &op) { + using namespace librbd::watch_notify; + + switch (op) { + case NOTIFY_OP_ACQUIRED_LOCK: + out << "AcquiredLock"; + break; + case NOTIFY_OP_RELEASED_LOCK: + out << "ReleasedLock"; + break; + case NOTIFY_OP_REQUEST_LOCK: + out << "RequestLock"; + break; + case NOTIFY_OP_HEADER_UPDATE: + out << "HeaderUpdate"; + break; + case NOTIFY_OP_ASYNC_PROGRESS: + out << "AsyncProgress"; + break; + case NOTIFY_OP_ASYNC_COMPLETE: + out << "AsyncComplete"; + break; + case NOTIFY_OP_FLATTEN: + out << "Flatten"; + break; + case NOTIFY_OP_RESIZE: + out << "Resize"; + break; + case NOTIFY_OP_SNAP_CREATE: + out << "SnapCreate"; + break; + case NOTIFY_OP_SNAP_REMOVE: + out << "SnapRemove"; + break; + case NOTIFY_OP_SNAP_RENAME: + out << "SnapRename"; + break; + case NOTIFY_OP_SNAP_PROTECT: + out << "SnapProtect"; + break; + case NOTIFY_OP_SNAP_UNPROTECT: + out << "SnapUnprotect"; + break; + case NOTIFY_OP_REBUILD_OBJECT_MAP: + out << "RebuildObjectMap"; + break; + case NOTIFY_OP_RENAME: + out << "Rename"; + break; + case NOTIFY_OP_UPDATE_FEATURES: + out << "UpdateFeatures"; + break; + case NOTIFY_OP_MIGRATE: + out << "Migrate"; + break; + case NOTIFY_OP_SPARSIFY: + out << "Sparsify"; + break; + case NOTIFY_OP_QUIESCE: + out << "Quiesce"; + break; + case NOTIFY_OP_UNQUIESCE: + out << "Unquiesce"; + break; + case NOTIFY_OP_METADATA_UPDATE: + out << "MetadataUpdate"; + break; + default: + out << "Unknown (" << static_cast(op) << ")"; + break; + } + return out; +} + +std::ostream &operator<<(std::ostream &out, + const librbd::watch_notify::AsyncRequestId &request) { + out << "[" << request.client_id.gid << "," << request.client_id.handle << "," + << request.request_id << "]"; + return out; +} +} // namespace watch_notify +} // namespace librbd diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h new file mode 100644 index 000000000..4fad31ffa --- 
/dev/null +++ b/src/librbd/WatchNotifyTypes.h @@ -0,0 +1,532 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef LIBRBD_WATCH_NOTIFY_TYPES_H +#define LIBRBD_WATCH_NOTIFY_TYPES_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "librbd/watcher/Types.h" +#include +#include +#include +#include +#include + +namespace ceph { +class Formatter; +} + +namespace librbd { +namespace watch_notify { + +using librbd::watcher::ClientId; + +WRITE_CLASS_ENCODER(ClientId); + +struct AsyncRequestId { + ClientId client_id; + uint64_t request_id; + + AsyncRequestId() : request_id() {} + AsyncRequestId(const ClientId &client_id_, uint64_t request_id_) + : client_id(client_id_), request_id(request_id_) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + inline bool operator<(const AsyncRequestId &rhs) const { + if (client_id != rhs.client_id) { + return client_id < rhs.client_id; + } else { + return request_id < rhs.request_id; + } + } + inline bool operator!=(const AsyncRequestId &rhs) const { + return (client_id != rhs.client_id || request_id != rhs.request_id); + } + inline operator bool() const { + return (*this != AsyncRequestId()); + } +}; + +enum NotifyOp { + NOTIFY_OP_ACQUIRED_LOCK = 0, + NOTIFY_OP_RELEASED_LOCK = 1, + NOTIFY_OP_REQUEST_LOCK = 2, + NOTIFY_OP_HEADER_UPDATE = 3, + NOTIFY_OP_ASYNC_PROGRESS = 4, + NOTIFY_OP_ASYNC_COMPLETE = 5, + NOTIFY_OP_FLATTEN = 6, + NOTIFY_OP_RESIZE = 7, + NOTIFY_OP_SNAP_CREATE = 8, + NOTIFY_OP_SNAP_REMOVE = 9, + NOTIFY_OP_REBUILD_OBJECT_MAP = 10, + NOTIFY_OP_SNAP_RENAME = 11, + NOTIFY_OP_SNAP_PROTECT = 12, + NOTIFY_OP_SNAP_UNPROTECT = 13, + NOTIFY_OP_RENAME = 14, + NOTIFY_OP_UPDATE_FEATURES = 15, + NOTIFY_OP_MIGRATE = 16, + NOTIFY_OP_SPARSIFY = 17, + NOTIFY_OP_QUIESCE = 18, + NOTIFY_OP_UNQUIESCE = 19, + NOTIFY_OP_METADATA_UPDATE = 20, +}; + +struct Payload { + virtual ~Payload() {} + + virtual NotifyOp get_notify_op() const = 0; + virtual bool check_for_refresh() const = 0; + + virtual void encode(bufferlist &bl) const = 0; + virtual void decode(__u8 version, bufferlist::const_iterator &iter) = 0; + virtual void dump(Formatter *f) const = 0; +}; + +struct AcquiredLockPayload : public Payload { + ClientId client_id; + + AcquiredLockPayload() {} + AcquiredLockPayload(const ClientId &client_id) : client_id(client_id) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_ACQUIRED_LOCK; + } + bool check_for_refresh() const override { + return false; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct ReleasedLockPayload : public Payload { + ClientId client_id; + + ReleasedLockPayload() {} + ReleasedLockPayload(const ClientId &client_id) : client_id(client_id) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_RELEASED_LOCK; + } + bool check_for_refresh() const override { + return false; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct RequestLockPayload : public Payload { + ClientId client_id; + bool force = false; + + RequestLockPayload() {} + RequestLockPayload(const ClientId &client_id, bool force) + : client_id(client_id), force(force) { + } + + NotifyOp 
get_notify_op() const override { + return NOTIFY_OP_REQUEST_LOCK; + } + bool check_for_refresh() const override { + return false; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct HeaderUpdatePayload : public Payload { + NotifyOp get_notify_op() const override { + return NOTIFY_OP_HEADER_UPDATE; + } + bool check_for_refresh() const override { + return false; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct AsyncRequestPayloadBase : public Payload { +public: + AsyncRequestId async_request_id; + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; + +protected: + AsyncRequestPayloadBase() {} + AsyncRequestPayloadBase(const AsyncRequestId &id) : async_request_id(id) {} +}; + +struct AsyncProgressPayload : public AsyncRequestPayloadBase { + uint64_t offset = 0; + uint64_t total = 0; + + AsyncProgressPayload() {} + AsyncProgressPayload(const AsyncRequestId &id, uint64_t offset, uint64_t total) + : AsyncRequestPayloadBase(id), offset(offset), total(total) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_ASYNC_PROGRESS; + } + bool check_for_refresh() const override { + return false; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct AsyncCompletePayload : public AsyncRequestPayloadBase { + int result = 0; + + AsyncCompletePayload() {} + AsyncCompletePayload(const AsyncRequestId &id, int r) + : AsyncRequestPayloadBase(id), result(r) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_ASYNC_COMPLETE; + } + bool check_for_refresh() const override { + return false; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct FlattenPayload : public AsyncRequestPayloadBase { + FlattenPayload() {} + FlattenPayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_FLATTEN; + } + bool check_for_refresh() const override { + return true; + } +}; + +struct ResizePayload : public AsyncRequestPayloadBase { + uint64_t size = 0; + bool allow_shrink = true; + + ResizePayload() {} + ResizePayload(const AsyncRequestId &id, uint64_t size, bool allow_shrink) + : AsyncRequestPayloadBase(id), size(size), allow_shrink(allow_shrink) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_RESIZE; + } + bool check_for_refresh() const override { + return true; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct SnapPayloadBase : public AsyncRequestPayloadBase { +public: + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + + bool check_for_refresh() const override { + return true; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; + +protected: + SnapPayloadBase() {} + SnapPayloadBase(const AsyncRequestId &id, + const 
cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &name) + : AsyncRequestPayloadBase(id), snap_namespace(snap_namespace), + snap_name(name) { + } +}; + +struct SnapCreatePayload : public SnapPayloadBase { + uint64_t flags = 0; + + SnapCreatePayload() {} + SnapCreatePayload(const AsyncRequestId &id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &name, uint64_t flags) + : SnapPayloadBase(id, snap_namespace, name), flags(flags) { + } + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_SNAP_CREATE; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct SnapRenamePayload : public SnapPayloadBase { + uint64_t snap_id = 0; + + SnapRenamePayload() {} + SnapRenamePayload(const AsyncRequestId &id, + const uint64_t &src_snap_id, + const std::string &dst_name) + : SnapPayloadBase(id, cls::rbd::UserSnapshotNamespace(), dst_name), + snap_id(src_snap_id) { + } + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_SNAP_RENAME; + } + + void encode(bufferlist &bl) const override; + void decode(__u8 version, bufferlist::const_iterator &iter) override; + void dump(Formatter *f) const override; +}; + +struct SnapRemovePayload : public SnapPayloadBase { + SnapRemovePayload() {} + SnapRemovePayload(const AsyncRequestId &id, + const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &name) + : SnapPayloadBase(id, snap_namespace, name) { + } + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_SNAP_REMOVE; + } +}; + +struct SnapProtectPayload : public SnapPayloadBase { + SnapProtectPayload() {} + SnapProtectPayload(const AsyncRequestId &id, + const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &name) + : SnapPayloadBase(id, snap_namespace, name) { + } + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_SNAP_PROTECT; + } +}; + +struct SnapUnprotectPayload : public SnapPayloadBase { + SnapUnprotectPayload() {} + SnapUnprotectPayload(const AsyncRequestId &id, + const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &name) + : SnapPayloadBase(id, snap_namespace, name) { + } + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_SNAP_UNPROTECT; + } +}; + +struct RebuildObjectMapPayload : public AsyncRequestPayloadBase { + RebuildObjectMapPayload() {} + RebuildObjectMapPayload(const AsyncRequestId &id) + : AsyncRequestPayloadBase(id) {} + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_REBUILD_OBJECT_MAP; + } + bool check_for_refresh() const override { + return true; + } +}; + +struct RenamePayload : public AsyncRequestPayloadBase { + std::string image_name; + + RenamePayload() {} + RenamePayload(const AsyncRequestId &id, const std::string _image_name) + : AsyncRequestPayloadBase(id), image_name(_image_name) { + } + + NotifyOp get_notify_op() const override { + return NOTIFY_OP_RENAME; + } + bool check_for_refresh() const override { + return true; + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct UpdateFeaturesPayload : public AsyncRequestPayloadBase { + uint64_t features = 0; + bool enabled = false; + + UpdateFeaturesPayload() {} + UpdateFeaturesPayload(const AsyncRequestId &id, uint64_t features, + bool enabled) + : AsyncRequestPayloadBase(id), features(features), enabled(enabled) { + } + + NotifyOp get_notify_op() const override { 
+    return NOTIFY_OP_UPDATE_FEATURES;
+  }
+  bool check_for_refresh() const override {
+    return true;
+  }
+
+  void encode(bufferlist &bl) const override;
+  void decode(__u8 version, bufferlist::const_iterator &iter) override;
+  void dump(Formatter *f) const override;
+};
+
+struct MigratePayload : public AsyncRequestPayloadBase {
+  MigratePayload() {}
+  MigratePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+  NotifyOp get_notify_op() const override {
+    return NOTIFY_OP_MIGRATE;
+  }
+  bool check_for_refresh() const override {
+    return true;
+  }
+};
+
+struct SparsifyPayload : public AsyncRequestPayloadBase {
+  uint64_t sparse_size = 0;
+
+  SparsifyPayload() {}
+  SparsifyPayload(const AsyncRequestId &id, uint64_t sparse_size)
+    : AsyncRequestPayloadBase(id), sparse_size(sparse_size) {
+  }
+
+  NotifyOp get_notify_op() const override {
+    return NOTIFY_OP_SPARSIFY;
+  }
+  bool check_for_refresh() const override {
+    return true;
+  }
+
+  void encode(bufferlist &bl) const override;
+  void decode(__u8 version, bufferlist::const_iterator &iter) override;
+  void dump(Formatter *f) const override;
+};
+
+struct QuiescePayload : public AsyncRequestPayloadBase {
+  QuiescePayload() {}
+  QuiescePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+  NotifyOp get_notify_op() const override {
+    return NOTIFY_OP_QUIESCE;
+  }
+  bool check_for_refresh() const override {
+    return false;
+  }
+};
+
+struct UnquiescePayload : public AsyncRequestPayloadBase {
+  UnquiescePayload() {}
+  UnquiescePayload(const AsyncRequestId &id) : AsyncRequestPayloadBase(id) {}
+
+  NotifyOp get_notify_op() const override {
+    return NOTIFY_OP_UNQUIESCE;
+  }
+  bool check_for_refresh() const override {
+    return false;
+  }
+};
+
+struct MetadataUpdatePayload : public AsyncRequestPayloadBase {
+  std::string key;
+  std::optional<std::string> value;
+  MetadataUpdatePayload() {}
+  MetadataUpdatePayload(const AsyncRequestId &id, std::string key,
+                        std::optional<std::string> value)
+    : AsyncRequestPayloadBase(id), key(key), value(value) {
+  }
+
+  NotifyOp get_notify_op() const override {
+    return NOTIFY_OP_METADATA_UPDATE;
+  }
+  bool check_for_refresh() const override {
+    return false;
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+struct UnknownPayload : public Payload {
+  NotifyOp get_notify_op() const override {
+    return static_cast<NotifyOp>(-1);
+  }
+  bool check_for_refresh() const override {
+    return false;
+  }
+
+  void encode(bufferlist &bl) const override;
+  void decode(__u8 version, bufferlist::const_iterator &iter) override;
+  void dump(Formatter *f) const override;
+};
+
+struct NotifyMessage {
+  NotifyMessage() : payload(new UnknownPayload()) {}
+  NotifyMessage(Payload *payload) : payload(payload) {}
+
+  std::unique_ptr<Payload> payload;
+
+  bool check_for_refresh() const;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+  NotifyOp get_notify_op() const;
+
+  static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+struct ResponseMessage {
+  ResponseMessage() : result(0) {}
+  ResponseMessage(int result_) : result(result_) {}
+
+  int result;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<ResponseMessage *> &o);
+};
+
+std::ostream &operator<<(std::ostream &out,
+                         const NotifyOp &op);
+std::ostream &operator<<(std::ostream &out,
+                         const AsyncRequestId &request);
+
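+// Illustrative round trip through the types above (an editorial sketch, not
+// part of the upstream header; "req_id" is a placeholder AsyncRequestId).
+// The WRITE_CLASS_ENCODER declarations below provide the free
+// encode()/decode() overloads used here:
+//
+//   bufferlist bl;
+//   NotifyMessage out(new RebuildObjectMapPayload(req_id));
+//   encode(out, bl);             // writes the NotifyOp plus payload fields
+//
+//   NotifyMessage in;            // starts out holding an UnknownPayload
+//   auto it = bl.cbegin();
+//   decode(in, it);              // selects the payload type from the NotifyOp
+//   ceph_assert(in.get_notify_op() == NOTIFY_OP_REBUILD_OBJECT_MAP);
+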
+WRITE_CLASS_ENCODER(AsyncRequestId); +WRITE_CLASS_ENCODER(NotifyMessage); +WRITE_CLASS_ENCODER(ResponseMessage); + +} // namespace watch_notify +} // namespace librbd + + +#endif // LIBRBD_WATCH_NOTIFY_TYPES_H diff --git a/src/librbd/Watcher.cc b/src/librbd/Watcher.cc new file mode 100644 index 000000000..c215d6df7 --- /dev/null +++ b/src/librbd/Watcher.cc @@ -0,0 +1,370 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/Watcher.h" +#include "librbd/watcher/RewatchRequest.h" +#include "librbd/Utils.h" +#include "librbd/TaskFinisher.h" +#include "librbd/asio/ContextWQ.h" +#include "include/encoding.h" +#include "common/errno.h" +#include + +// re-include our assert to clobber the system one; fix dout: +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd + +namespace librbd { + +using namespace boost::placeholders; + +using namespace watcher; + +using util::create_context_callback; +using util::create_rados_callback; +using std::string; + +namespace { + +struct C_UnwatchAndFlush : public Context { + librados::Rados rados; + Context *on_finish; + bool flushing = false; + int ret_val = 0; + + C_UnwatchAndFlush(librados::IoCtx &io_ctx, Context *on_finish) + : rados(io_ctx), on_finish(on_finish) { + } + + void complete(int r) override { + if (ret_val == 0 && r < 0) { + ret_val = r; + } + + if (!flushing) { + flushing = true; + + librados::AioCompletion *aio_comp = create_rados_callback(this); + r = rados.aio_watch_flush(aio_comp); + ceph_assert(r == 0); + aio_comp->release(); + return; + } + + // ensure our reference to the RadosClient is released prior + // to completing the callback to avoid racing an explicit + // librados shutdown + Context *ctx = on_finish; + r = ret_val; + delete this; + + ctx->complete(r); + } + + void finish(int r) override { + } +}; + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Watcher::C_NotifyAck " << this << " " \ + << __func__ << ": " + +Watcher::C_NotifyAck::C_NotifyAck(Watcher *watcher, uint64_t notify_id, + uint64_t handle) + : watcher(watcher), cct(watcher->m_cct), notify_id(notify_id), + handle(handle) { + ldout(cct, 10) << "id=" << notify_id << ", " << "handle=" << handle << dendl; +} + +void Watcher::C_NotifyAck::finish(int r) { + ldout(cct, 10) << "r=" << r << dendl; + ceph_assert(r == 0); + watcher->acknowledge_notify(notify_id, handle, out); +} + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Watcher: " << this << " " << __func__ \ + << ": " + +Watcher::Watcher(librados::IoCtx& ioctx, asio::ContextWQ *work_queue, + const string& oid) + : m_ioctx(ioctx), m_work_queue(work_queue), m_oid(oid), + m_cct(reinterpret_cast(ioctx.cct())), + m_watch_lock(ceph::make_shared_mutex( + util::unique_lock_name("librbd::Watcher::m_watch_lock", this))), + m_watch_handle(0), m_notifier(work_queue, ioctx, oid), + m_watch_state(WATCH_STATE_IDLE), m_watch_ctx(*this) { +} + +Watcher::~Watcher() { + std::shared_lock l{m_watch_lock}; + ceph_assert(is_unregistered(m_watch_lock)); +} + +void Watcher::register_watch(Context *on_finish) { + ldout(m_cct, 10) << dendl; + + std::unique_lock watch_locker{m_watch_lock}; + ceph_assert(is_unregistered(m_watch_lock)); + m_watch_state = WATCH_STATE_REGISTERING; + m_watch_blocklisted = false; + + librados::AioCompletion *aio_comp = create_rados_callback( + new C_RegisterWatch(this, on_finish)); + int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx); + ceph_assert(r == 0); + 
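+  // aio_watch() only queues the registration; the result is delivered to
+  // C_RegisterWatch, which forwards it to handle_register_watch().  Releasing
+  // our completion reference here is safe because librados holds its own
+  // reference until the callback has fired.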
aio_comp->release(); +} + +void Watcher::handle_register_watch(int r, Context *on_finish) { + ldout(m_cct, 10) << "r=" << r << dendl; + + bool watch_error = false; + Context *unregister_watch_ctx = nullptr; + { + std::unique_lock watch_locker{m_watch_lock}; + ceph_assert(m_watch_state == WATCH_STATE_REGISTERING); + + m_watch_state = WATCH_STATE_IDLE; + if (r < 0) { + lderr(m_cct) << "failed to register watch: " << cpp_strerror(r) + << dendl; + m_watch_handle = 0; + } + + if (m_unregister_watch_ctx != nullptr) { + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else if (r == 0 && m_watch_error) { + lderr(m_cct) << "re-registering watch after error" << dendl; + m_watch_state = WATCH_STATE_REWATCHING; + watch_error = true; + } else { + m_watch_blocklisted = (r == -EBLOCKLISTED); + } + } + + on_finish->complete(r); + + if (unregister_watch_ctx != nullptr) { + unregister_watch_ctx->complete(0); + } else if (watch_error) { + rewatch(); + } +} + +void Watcher::unregister_watch(Context *on_finish) { + ldout(m_cct, 10) << dendl; + + { + std::unique_lock watch_locker{m_watch_lock}; + if (m_watch_state != WATCH_STATE_IDLE) { + ldout(m_cct, 10) << "delaying unregister until register completed" + << dendl; + + ceph_assert(m_unregister_watch_ctx == nullptr); + m_unregister_watch_ctx = new LambdaContext([this, on_finish](int r) { + unregister_watch(on_finish); + }); + return; + } else if (is_registered(m_watch_lock)) { + librados::AioCompletion *aio_comp = create_rados_callback( + new C_UnwatchAndFlush(m_ioctx, on_finish)); + int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp); + ceph_assert(r == 0); + aio_comp->release(); + + m_watch_handle = 0; + m_watch_blocklisted = false; + return; + } + } + + on_finish->complete(0); +} + +bool Watcher::notifications_blocked() const { + std::shared_lock locker{m_watch_lock}; + + bool blocked = (m_blocked_count > 0); + ldout(m_cct, 5) << "blocked=" << blocked << dendl; + return blocked; +} + +void Watcher::block_notifies(Context *on_finish) { + { + std::unique_lock locker{m_watch_lock}; + ++m_blocked_count; + ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl; + } + m_async_op_tracker.wait_for_ops(on_finish); +} + +void Watcher::unblock_notifies() { + std::unique_lock locker{m_watch_lock}; + ceph_assert(m_blocked_count > 0); + --m_blocked_count; + ldout(m_cct, 5) << "blocked_count=" << m_blocked_count << dendl; +} + +void Watcher::flush(Context *on_finish) { + m_notifier.flush(on_finish); +} + +std::string Watcher::get_oid() const { + std::shared_lock locker{m_watch_lock}; + return m_oid; +} + +void Watcher::set_oid(const string& oid) { + std::unique_lock watch_locker{m_watch_lock}; + ceph_assert(is_unregistered(m_watch_lock)); + + m_oid = oid; +} + +void Watcher::handle_error(uint64_t handle, int err) { + lderr(m_cct) << "handle=" << handle << ": " << cpp_strerror(err) << dendl; + + std::unique_lock watch_locker{m_watch_lock}; + m_watch_error = true; + + if (is_registered(m_watch_lock)) { + m_watch_state = WATCH_STATE_REWATCHING; + if (err == -EBLOCKLISTED) { + m_watch_blocklisted = true; + } + + auto ctx = new LambdaContext( + boost::bind(&Watcher::rewatch, this)); + m_work_queue->queue(ctx); + } +} + +void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle, + bufferlist &out) { + m_ioctx.notify_ack(m_oid, notify_id, handle, out); +} + +void Watcher::rewatch() { + ldout(m_cct, 10) << dendl; + + Context *unregister_watch_ctx = nullptr; + { + std::unique_lock watch_locker{m_watch_lock}; + ceph_assert(m_watch_state == 
WATCH_STATE_REWATCHING); + + if (m_unregister_watch_ctx != nullptr) { + m_watch_state = WATCH_STATE_IDLE; + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else { + m_watch_error = false; + auto ctx = create_context_callback< + Watcher, &Watcher::handle_rewatch>(this); + auto req = RewatchRequest::create(m_ioctx, m_oid, m_watch_lock, + &m_watch_ctx, &m_watch_handle, ctx); + req->send(); + return; + } + } + + unregister_watch_ctx->complete(0); +} + +void Watcher::handle_rewatch(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + bool watch_error = false; + Context *unregister_watch_ctx = nullptr; + { + std::unique_lock watch_locker{m_watch_lock}; + ceph_assert(m_watch_state == WATCH_STATE_REWATCHING); + + m_watch_blocklisted = false; + if (m_unregister_watch_ctx != nullptr) { + ldout(m_cct, 10) << "image is closing, skip rewatch" << dendl; + m_watch_state = WATCH_STATE_IDLE; + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else if (r == -EBLOCKLISTED) { + lderr(m_cct) << "client blocklisted" << dendl; + m_watch_blocklisted = true; + } else if (r == -ENOENT) { + ldout(m_cct, 5) << "object does not exist" << dendl; + } else if (r < 0) { + lderr(m_cct) << "failed to rewatch: " << cpp_strerror(r) << dendl; + watch_error = true; + } else if (m_watch_error) { + lderr(m_cct) << "re-registering watch after error" << dendl; + watch_error = true; + } + } + + if (unregister_watch_ctx != nullptr) { + unregister_watch_ctx->complete(0); + return; + } else if (watch_error) { + rewatch(); + return; + } + + auto ctx = create_context_callback< + Watcher, &Watcher::handle_rewatch_callback>(this); + m_work_queue->queue(ctx, r); +} + +void Watcher::handle_rewatch_callback(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + handle_rewatch_complete(r); + + bool watch_error = false; + Context *unregister_watch_ctx = nullptr; + { + std::unique_lock watch_locker{m_watch_lock}; + ceph_assert(m_watch_state == WATCH_STATE_REWATCHING); + + if (m_unregister_watch_ctx != nullptr) { + m_watch_state = WATCH_STATE_IDLE; + std::swap(unregister_watch_ctx, m_unregister_watch_ctx); + } else if (r == -EBLOCKLISTED || r == -ENOENT) { + m_watch_state = WATCH_STATE_IDLE; + } else if (r < 0 || m_watch_error) { + watch_error = true; + } else { + m_watch_state = WATCH_STATE_IDLE; + } + } + + if (unregister_watch_ctx != nullptr) { + unregister_watch_ctx->complete(0); + } else if (watch_error) { + rewatch(); + } +} + +void Watcher::send_notify(bufferlist& payload, + watcher::NotifyResponse *response, + Context *on_finish) { + m_notifier.notify(payload, response, on_finish); +} + +void Watcher::WatchCtx::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist& bl) { + // if notifications are blocked, finish the notification w/o + // bubbling the notification up to the derived class + watcher.m_async_op_tracker.start_op(); + if (watcher.notifications_blocked()) { + bufferlist bl; + watcher.acknowledge_notify(notify_id, handle, bl); + } else { + watcher.handle_notify(notify_id, handle, notifier_id, bl); + } + watcher.m_async_op_tracker.finish_op(); +} + +void Watcher::WatchCtx::handle_error(uint64_t handle, int err) { + watcher.handle_error(handle, err); +} + +} // namespace librbd diff --git a/src/librbd/Watcher.h b/src/librbd/Watcher.h new file mode 100644 index 000000000..e029430c5 --- /dev/null +++ b/src/librbd/Watcher.h @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_WATCHER_H 
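+// librbd::Watcher (declared below) wraps a librados watch on a single RADOS
+// object: it registers the watch, transparently re-registers after watch
+// errors or blocklisting, and routes notify events to derived classes.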
+#define CEPH_LIBRBD_WATCHER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "common/RWLock.h"
+#include "include/rados/librados.hpp"
+#include "librbd/watcher/Notifier.h"
+#include "librbd/watcher/Types.h"
+#include <string>
+#include <utility>
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+namespace watcher { struct NotifyResponse; }
+
+class Watcher {
+public:
+  struct C_NotifyAck : public Context {
+    Watcher *watcher;
+    CephContext *cct;
+    uint64_t notify_id;
+    uint64_t handle;
+    bufferlist out;
+
+    C_NotifyAck(Watcher *watcher, uint64_t notify_id, uint64_t handle);
+    void finish(int r) override;
+  };
+
+  Watcher(librados::IoCtx& ioctx, asio::ContextWQ *work_queue,
+          const std::string& oid);
+  virtual ~Watcher();
+
+  void register_watch(Context *on_finish);
+  virtual void unregister_watch(Context *on_finish);
+  void flush(Context *on_finish);
+
+  bool notifications_blocked() const;
+  virtual void block_notifies(Context *on_finish);
+  void unblock_notifies();
+
+  std::string get_oid() const;
+  void set_oid(const std::string& oid);
+
+  uint64_t get_watch_handle() const {
+    std::shared_lock watch_locker{m_watch_lock};
+    return m_watch_handle;
+  }
+
+  bool is_registered() const {
+    std::shared_lock locker{m_watch_lock};
+    return is_registered(m_watch_lock);
+  }
+  bool is_unregistered() const {
+    std::shared_lock locker{m_watch_lock};
+    return is_unregistered(m_watch_lock);
+  }
+  bool is_blocklisted() const {
+    std::shared_lock locker{m_watch_lock};
+    return m_watch_blocklisted;
+  }
+
+protected:
+  enum WatchState {
+    WATCH_STATE_IDLE,
+    WATCH_STATE_REGISTERING,
+    WATCH_STATE_REWATCHING
+  };
+
+  librados::IoCtx& m_ioctx;
+  asio::ContextWQ *m_work_queue;
+  std::string m_oid;
+  CephContext *m_cct;
+  mutable ceph::shared_mutex m_watch_lock;
+  uint64_t m_watch_handle;
+  watcher::Notifier m_notifier;
+
+  WatchState m_watch_state;
+  bool m_watch_blocklisted = false;
+
+  AsyncOpTracker m_async_op_tracker;
+
+  bool is_registered(const ceph::shared_mutex&) const {
+    return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle != 0);
+  }
+  bool is_unregistered(const ceph::shared_mutex&) const {
+    return (m_watch_state == WATCH_STATE_IDLE && m_watch_handle == 0);
+  }
+
+  void send_notify(bufferlist &payload,
+                   watcher::NotifyResponse *response = nullptr,
+                   Context *on_finish = nullptr);
+
+  virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+                             uint64_t notifier_id, bufferlist &bl) = 0;
+
+  virtual void handle_error(uint64_t cookie, int err);
+
+  void acknowledge_notify(uint64_t notify_id, uint64_t handle,
+                          bufferlist &out);
+
+  virtual void handle_rewatch_complete(int r) { }
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * UNREGISTERED
+   *    |
+   *    | (register_watch)
+   *    |
+   * REGISTERING
+   *    |
+   *    v (watch error)
+   * REGISTERED * * * * * * * > ERROR
+   *    |   ^                    |
+   *    |   |                    | (rewatch)
+   *    |   |                    v
+   *    |   |                REWATCHING
+   *    |   |                    |
+   *    |   |                    |
+   *    |   \--------------------/
+   *    |
+   *    | (unregister_watch)
+   *    |
+   * UNREGISTERED
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  struct WatchCtx : public librados::WatchCtx2 {
+    Watcher &watcher;
+
+    WatchCtx(Watcher &parent) : watcher(parent) {}
+
+    void handle_notify(uint64_t notify_id,
+                       uint64_t handle,
+                       uint64_t notifier_id,
+                       bufferlist& bl) override;
+    void handle_error(uint64_t handle, int err) override;
+  };
+
+  struct C_RegisterWatch : public Context {
+    Watcher *watcher;
+    Context *on_finish;
+
+    C_RegisterWatch(Watcher *watcher, Context *on_finish)
+      : watcher(watcher), on_finish(on_finish) {
+    }
+    void finish(int r) override {
+      watcher->handle_register_watch(r, on_finish);
+    }
+  };
+
+  WatchCtx m_watch_ctx;
+  Context *m_unregister_watch_ctx = nullptr;
+
+  bool m_watch_error = false;
+
+  uint32_t m_blocked_count = 0;
+
+  void handle_register_watch(int r, Context *on_finish);
+
+  void rewatch();
+  void handle_rewatch(int r);
+  void handle_rewatch_callback(int r);
+
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_H
diff --git a/src/librbd/api/Config.cc b/src/librbd/api/Config.cc
new file mode 100644
index 000000000..8148607e3
--- /dev/null
+++ b/src/librbd/api/Config.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Config.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/api/PoolMetadata.h"
+#include "librbd/image/GetMetadataRequest.h"
+#include <boost/algorithm/string/predicate.hpp>
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Config: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+const uint32_t MAX_KEYS = 64;
+
+typedef std::map<std::string, std::pair<std::string, config_source_t>> Parent;
+
+static std::set<std::string> EXCLUDE_OPTIONS {
+    "rbd_auto_exclusive_lock_until_manual_request",
+    "rbd_default_format",
+    "rbd_default_pool",
+    "rbd_discard_on_zeroed_write_same",
+    "rbd_op_thread_timeout",
+    "rbd_op_threads",
+    "rbd_tracing",
+    "rbd_validate_names",
+    "rbd_validate_pool",
+    "rbd_mirror_pool_replayers_refresh_interval",
+    "rbd_config_pool_override_update_timestamp"
+  };
+static std::set<std::string> EXCLUDE_IMAGE_OPTIONS {
+    "rbd_default_clone_format",
+    "rbd_default_data_pool",
+    "rbd_default_features",
+    "rbd_default_format",
+    "rbd_default_order",
+    "rbd_default_stripe_count",
+    "rbd_default_stripe_unit",
+    "rbd_journal_order",
+    "rbd_journal_pool",
+    "rbd_journal_splay_width"
+  };
+
+struct Options : Parent {
+  librados::IoCtx m_io_ctx;
+
+  Options(librados::IoCtx& io_ctx, bool image_apply_only_options) {
+    m_io_ctx.dup(io_ctx);
+    m_io_ctx.set_namespace("");
+
+    CephContext *cct = reinterpret_cast<CephContext *>(m_io_ctx.cct());
+
+    const std::string rbd_key_prefix("rbd_");
+    const std::string rbd_mirror_key_prefix("rbd_mirror_");
+    auto& schema = cct->_conf.get_schema();
+    for (auto& pair : schema) {
+      if (!boost::starts_with(pair.first, rbd_key_prefix)) {
+        continue;
+      } else if (EXCLUDE_OPTIONS.count(pair.first) != 0) {
+        continue;
+      } else if (image_apply_only_options &&
+                 EXCLUDE_IMAGE_OPTIONS.count(pair.first) != 0) {
+        continue;
+      } else if (image_apply_only_options &&
+                 boost::starts_with(pair.first, rbd_mirror_key_prefix)) {
+        continue;
+      }
+
+      insert({pair.first, {}});
+    }
+  }
+
+  int init() {
+    CephContext *cct = (CephContext *)m_io_ctx.cct();
+
+    for (auto& [k,v] : *this) {
+      int r = cct->_conf.get_val(k, &v.first);
+      ceph_assert(r == 0);
+      v.second = RBD_CONFIG_SOURCE_CONFIG;
+    }
+
+    std::string last_key = ImageCtx::METADATA_CONF_PREFIX;
+    bool more_results = true;
+
+    while (more_results) {
+      std::map<std::string, bufferlist> pairs;
+
+      int r = librbd::api::PoolMetadata<>::list(m_io_ctx, last_key, MAX_KEYS,
+                                                &pairs);
+      if (r < 0) {
+        return r;
+      }
+
+      if (pairs.empty()) {
+        break;
+      }
+
+      more_results = (pairs.size() == MAX_KEYS);
+      last_key = pairs.rbegin()->first;
+
+      for (auto kv : pairs) {
+        std::string key;
+        if (!util::is_metadata_config_override(kv.first, &key)) {
+          more_results = false;
+          break;
+        }
+        auto it = find(key);
+        if (it != end()) {
+          it->second = {{kv.second.c_str(), kv.second.length()},
+                        RBD_CONFIG_SOURCE_POOL};
+        }
+      }
+    }
+    return 0;
+  }
+};
+
+} // anonymous namespace
+
+template <typename I>
+bool Config<I>::is_option_name(librados::IoCtx& io_ctx,
+                               const std::string &name) {
+  Options opts(io_ctx, false);
+
+  return (opts.find(name) != opts.end());
+}
+
+template <typename I>
+int Config<I>::list(librados::IoCtx& io_ctx,
+                    std::vector<config_option_t> *options) {
+  Options opts(io_ctx, false);
+
+  int r = opts.init();
+  if (r < 0) {
+    return r;
+  }
+
+  for (auto& [k,v] : opts) {
+    options->push_back({std::string{k}, v.first, v.second});
+  }
+
+  return 0;
+}
+
+template <typename I>
+bool Config<I>::is_option_name(I *image_ctx, const std::string &name) {
+  Options opts(image_ctx->md_ctx, true);
+
+  return (opts.find(name) != opts.end());
+}
+
+template <typename I>
+int Config<I>::list(I *image_ctx, std::vector<config_option_t> *options) {
+  CephContext *cct = image_ctx->cct;
+  Options opts(image_ctx->md_ctx, true);
+
+  int r = opts.init();
+  if (r < 0) {
+    return r;
+  }
+
+  std::map<std::string, bufferlist> pairs;
+  C_SaferCond ctx;
+  auto req = image::GetMetadataRequest<I>::create(
+    image_ctx->md_ctx, image_ctx->header_oid, true,
+    ImageCtx::METADATA_CONF_PREFIX, ImageCtx::METADATA_CONF_PREFIX, 0U, &pairs,
+    &ctx);
+  req->send();
+
+  r = ctx.wait();
+  if (r < 0) {
+    lderr(cct) << "failed reading image metadata: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  for (auto kv : pairs) {
+    std::string key;
+    if (!util::is_metadata_config_override(kv.first, &key)) {
+      break;
+    }
+    auto it = opts.find(key);
+    if (it != opts.end()) {
+      it->second = {{kv.second.c_str(), kv.second.length()},
+                    RBD_CONFIG_SOURCE_IMAGE};
+    }
+  }
+
+  for (auto& [k,v] : opts) {
+    options->push_back({std::string{k}, v.first, v.second});
+  }
+
+  return 0;
+}
+
+template <typename I>
+void Config<I>::apply_pool_overrides(librados::IoCtx& io_ctx,
+                                     ConfigProxy* config) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+
+  Options opts(io_ctx, false);
+  int r = opts.init();
+  if (r < 0) {
+    lderr(cct) << "failed to read pool config overrides: " << cpp_strerror(r)
+               << dendl;
+    return;
+  }
+
+  for (auto& [k,v] : opts) {
+    if (v.second == RBD_CONFIG_SOURCE_POOL) {
+      r = config->set_val(k, v.first);
+      if (r < 0) {
+        lderr(cct) << "failed to override pool config " << k << "="
+                   << v.first << ": " << cpp_strerror(r) << dendl;
+      }
+    }
+  }
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Config<librbd::ImageCtx>;
diff --git a/src/librbd/api/Config.h b/src/librbd/api/Config.h
new file mode 100644
index 000000000..83225d287
--- /dev/null
+++ b/src/librbd/api/Config.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_CONFIG_H
+#define CEPH_LIBRBD_API_CONFIG_H
+
+#include "common/config_fwd.h"
+#include "include/common_fwd.h"
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Config {
+public:
+  static bool is_option_name(librados::IoCtx& io_ctx, const std::string &name);
+  static int list(librados::IoCtx& io_ctx,
+                  std::vector<config_option_t> *options);
+
+  static bool is_option_name(ImageCtxT *image_ctx, const std::string &name);
+  static int list(ImageCtxT *image_ctx, std::vector<config_option_t> *options);
+
+  static void apply_pool_overrides(librados::IoCtx& io_ctx,
+                                   ConfigProxy* config);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Config<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_CONFIG_H
diff --git a/src/librbd/api/DiffIterate.cc b/src/librbd/api/DiffIterate.cc
new file mode 100644
index 000000000..b400b5d5a
--- /dev/null
+++ b/src/librbd/api/DiffIterate.cc
@@ -0,0 +1,378 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/DiffIterate.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ImageDispatchSpec.h"
+#include "librbd/object_map/DiffRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/interval_set.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "common/Throttle.h"
+#include "osdc/Striper.h"
+#include <boost/tuple/tuple.hpp>
+#include <list>
+#include <map>
+#include <vector>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::DiffIterate: "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+struct DiffContext {
+  DiffIterate<>::Callback callback;
+  void *callback_arg;
+  bool whole_object;
+  bool include_parent;
+  uint64_t from_snap_id;
+  uint64_t end_snap_id;
+  OrderedThrottle throttle;
+
+  template <typename I>
+  DiffContext(I &image_ctx, DiffIterate<>::Callback callback,
+              void *callback_arg, bool _whole_object, bool _include_parent,
+              uint64_t _from_snap_id, uint64_t _end_snap_id)
+    : callback(callback), callback_arg(callback_arg),
+      whole_object(_whole_object), include_parent(_include_parent),
+      from_snap_id(_from_snap_id), end_snap_id(_end_snap_id),
+      throttle(image_ctx.config.template get_val<uint64_t>("rbd_concurrent_management_ops"), true) {
+  }
+};
+
+template <typename I>
+class C_DiffObject : public Context {
+public:
+  C_DiffObject(I &image_ctx, DiffContext &diff_context, uint64_t image_offset,
+               uint64_t image_length)
+    : m_image_ctx(image_ctx), m_cct(image_ctx.cct),
+      m_diff_context(diff_context), m_image_offset(image_offset),
+      m_image_length(image_length) {
+  }
+
+  void send() {
+    Context* ctx = m_diff_context.throttle.start_op(this);
+    auto aio_comp = io::AioCompletion::create_and_start(
+      ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_GENERIC);
+    int list_snaps_flags = 0;
+    if (!m_diff_context.include_parent || m_diff_context.from_snap_id != 0) {
+      list_snaps_flags |= io::LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT;
+    }
+    if (m_diff_context.whole_object) {
+      list_snaps_flags |= io::LIST_SNAPS_FLAG_WHOLE_OBJECT;
+    }
+    auto req = io::ImageDispatchSpec::create_list_snaps(
+      m_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
+      aio_comp, {{m_image_offset, m_image_length}}, io::ImageArea::DATA,
+      {m_diff_context.from_snap_id, m_diff_context.end_snap_id},
+      list_snaps_flags, &m_snapshot_delta, {});
+    req->send();
+  }
+
+protected:
+  typedef boost::tuple<uint64_t, size_t, bool> Diff;
+  typedef std::list<Diff> Diffs;
+
+  void finish(int r) override {
+    CephContext *cct = m_cct;
+
+    if (r < 0) {
+      ldout(cct, 20) << "list_snaps failed: " << m_image_offset << "~"
+                     << m_image_length << ": " << cpp_strerror(r) << dendl;
+    }
+
+    Diffs diffs;
+    ldout(cct, 20) << "image extent " << m_image_offset << "~"
+                   << m_image_length << ": list_snaps complete" << dendl;
+
+    compute_diffs(&diffs);
+    for (Diffs::const_iterator d = diffs.begin(); d != diffs.end(); ++d) {
+      r = m_diff_context.callback(d->get<0>(), d->get<1>(), d->get<2>(),
+                                  m_diff_context.callback_arg);
+      if (r < 0) {
+        break;
+      }
+    }
+    m_diff_context.throttle.end_op(r);
+  }
+
+private:
+  I& m_image_ctx;
+  CephContext *m_cct;
+  DiffContext &m_diff_context;
+  uint64_t m_image_offset;
+  uint64_t m_image_length;
+
+  io::SnapshotDelta m_snapshot_delta;
+
+  void compute_diffs(Diffs *diffs) {
+    CephContext *cct = m_cct;
+
+    // merge per-snapshot deltas into an aggregate
+    io::SparseExtents aggregate_snapshot_extents;
+    for (auto& [key, snapshot_extents] : m_snapshot_delta) {
+      for (auto& snapshot_extent : snapshot_extents) {
+        auto state = snapshot_extent.get_val().state;
+
+        // ignore DNE object (and parent)
+        if ((state == io::SPARSE_EXTENT_STATE_DNE) ||
+            (key == io::INITIAL_WRITE_READ_SNAP_IDS &&
+             state == io::SPARSE_EXTENT_STATE_ZEROED)) {
+          continue;
+        }
+
+        aggregate_snapshot_extents.insert(
+          snapshot_extent.get_off(), snapshot_extent.get_len(),
+          {state, snapshot_extent.get_len()});
+      }
+    }
+
+    // build delta callback set
+    for (auto& snapshot_extent : aggregate_snapshot_extents) {
+      ldout(cct, 20) << "off=" << snapshot_extent.get_off() << ", "
+                     << "len=" << snapshot_extent.get_len() << ", "
+                     << "state=" << snapshot_extent.get_val().state << dendl;
+      diffs->emplace_back(
+        snapshot_extent.get_off(), snapshot_extent.get_len(),
+        snapshot_extent.get_val().state == io::SPARSE_EXTENT_STATE_DATA);
+    }
+  }
+};
+
+int simple_diff_cb(uint64_t off, size_t len, int exists, void *arg) {
+  // it's possible for a discard to create a hole in the parent image -- ignore
+  if (exists) {
+    interval_set<uint64_t> *diff = static_cast<interval_set<uint64_t> *>(arg);
+    diff->insert(off, len);
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+template <typename I>
+int DiffIterate<I>::diff_iterate(I *ictx,
+                                 const cls::rbd::SnapshotNamespace& from_snap_namespace,
+                                 const char *fromsnapname,
+                                 uint64_t off, uint64_t len,
+                                 bool include_parent, bool whole_object,
+                                 int (*cb)(uint64_t, size_t, int, void *),
+                                 void *arg)
+{
+  ldout(ictx->cct, 20) << "diff_iterate " << ictx << " off = " << off
+                       << " len = " << len << dendl;
+
+  if (!ictx->data_ctx.is_valid()) {
+    return -ENODEV;
+  }
+
+  // ensure previous writes are visible to listsnaps
+  C_SaferCond flush_ctx;
+  {
+    std::shared_lock owner_locker{ictx->owner_lock};
+    auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, ictx,
+                                                        io::AIO_TYPE_FLUSH);
+    auto req = io::ImageDispatchSpec::create_flush(
+      *ictx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START,
+      aio_comp, io::FLUSH_SOURCE_INTERNAL, {});
+    req->send();
+  }
+  int r = flush_ctx.wait();
+  if (r < 0) {
+    return r;
+  }
+
+  r = ictx->state->refresh_if_required();
+  if (r < 0) {
+    return r;
+  }
+
+  ictx->image_lock.lock_shared();
+  r = clip_io(ictx, off, &len, io::ImageArea::DATA);
+  ictx->image_lock.unlock_shared();
+  if (r < 0) {
+    return r;
+  }
+
+  DiffIterate command(*ictx, from_snap_namespace, fromsnapname, off, len,
+                      include_parent, whole_object, cb, arg);
+  r = command.execute();
+  return r;
+}
+
+template <typename I>
+int DiffIterate<I>::execute() {
+  CephContext* cct = m_image_ctx.cct;
+
+  ceph_assert(m_image_ctx.data_ctx.is_valid());
+
+  librados::snap_t from_snap_id = 0;
+  librados::snap_t end_snap_id;
+  uint64_t from_size = 0;
+  uint64_t end_size;
+  {
+    std::shared_lock image_locker{m_image_ctx.image_lock};
+    if (m_from_snap_name) {
+      from_snap_id = m_image_ctx.get_snap_id(m_from_snap_namespace,
+                                             m_from_snap_name);
+      from_size = m_image_ctx.get_image_size(from_snap_id);
+    }
+    end_snap_id = m_image_ctx.snap_id;
+    end_size = m_image_ctx.get_image_size(end_snap_id);
+  }
+
+  if (from_snap_id == CEPH_NOSNAP) {
+    return -ENOENT;
+  }
+  if (from_snap_id == end_snap_id) {
+    // no diff.
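+    // identical from/end snapshots trivially have no delta; the check that
+    // follows rejects a "from" snapshot newer than the end snapshot, since
+    // RADOS snapshot ids increase monotonically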
+ return 0; + } + if (from_snap_id >= end_snap_id) { + return -EINVAL; + } + + int r; + bool fast_diff_enabled = false; + BitVector<2> object_diff_state; + interval_set parent_diff; + if (m_whole_object) { + C_SaferCond ctx; + auto req = object_map::DiffRequest::create(&m_image_ctx, from_snap_id, + end_snap_id, + &object_diff_state, &ctx); + req->send(); + + r = ctx.wait(); + if (r < 0) { + ldout(cct, 5) << "fast diff disabled" << dendl; + } else { + ldout(cct, 5) << "fast diff enabled" << dendl; + fast_diff_enabled = true; + + // check parent overlap only if we are comparing to the beginning of time + if (m_include_parent && from_snap_id == 0) { + std::shared_lock image_locker{m_image_ctx.image_lock}; + uint64_t raw_overlap = 0; + m_image_ctx.get_parent_overlap(m_image_ctx.snap_id, &raw_overlap); + auto overlap = m_image_ctx.reduce_parent_overlap(raw_overlap, false); + if (overlap.first > 0 && overlap.second == io::ImageArea::DATA) { + ldout(cct, 10) << " first getting parent diff" << dendl; + DiffIterate diff_parent(*m_image_ctx.parent, {}, nullptr, 0, + overlap.first, true, true, &simple_diff_cb, + &parent_diff); + r = diff_parent.execute(); + if (r < 0) { + return r; + } + } + } + } + } + + ldout(cct, 5) << "diff_iterate from " << from_snap_id << " to " + << end_snap_id << " size from " << from_size + << " to " << end_size << dendl; + DiffContext diff_context(m_image_ctx, m_callback, m_callback_arg, + m_whole_object, m_include_parent, from_snap_id, + end_snap_id); + + uint64_t period = m_image_ctx.get_stripe_period(); + uint64_t off = m_offset; + uint64_t left = m_length; + + while (left > 0) { + uint64_t period_off = off - (off % period); + uint64_t read_len = std::min(period_off + period - off, left); + + if (fast_diff_enabled) { + // map to extents + std::map > object_extents; + Striper::file_to_extents(cct, m_image_ctx.format_string, + &m_image_ctx.layout, off, read_len, 0, + object_extents, 0); + + // get diff info for each object and merge adjacent stripe units + // into an aggregate (this also sorts them) + io::SparseExtents aggregate_sparse_extents; + for (auto& [object, extents] : object_extents) { + const uint64_t object_no = extents.front().objectno; + uint8_t diff_state = object_diff_state[object_no]; + ldout(cct, 20) << "object " << object << ": diff_state=" + << (int)diff_state << dendl; + + if (diff_state == object_map::DIFF_STATE_HOLE && + from_snap_id == 0 && !parent_diff.empty()) { + // no data in child object -- report parent diff instead + for (auto& oe : extents) { + for (auto& be : oe.buffer_extents) { + interval_set o; + o.insert(off + be.first, be.second); + o.intersection_of(parent_diff); + ldout(cct, 20) << " reporting parent overlap " << o << dendl; + for (auto e = o.begin(); e != o.end(); ++e) { + aggregate_sparse_extents.insert(e.get_start(), e.get_len(), + {io::SPARSE_EXTENT_STATE_DATA, + e.get_len()}); + } + } + } + } else if (diff_state == object_map::DIFF_STATE_HOLE_UPDATED || + diff_state == object_map::DIFF_STATE_DATA_UPDATED) { + auto state = (diff_state == object_map::DIFF_STATE_HOLE_UPDATED ? 
+            io::SPARSE_EXTENT_STATE_ZEROED : io::SPARSE_EXTENT_STATE_DATA);
+          for (auto& oe : extents) {
+            for (auto& be : oe.buffer_extents) {
+              aggregate_sparse_extents.insert(off + be.first, be.second,
+                                              {state, be.second});
+            }
+          }
+        }
+      }
+
+      for (const auto& se : aggregate_sparse_extents) {
+        ldout(cct, 20) << "off=" << se.get_off() << ", len=" << se.get_len()
+                       << ", state=" << se.get_val().state << dendl;
+        r = m_callback(se.get_off(), se.get_len(),
+                       se.get_val().state == io::SPARSE_EXTENT_STATE_DATA,
+                       m_callback_arg);
+        if (r < 0) {
+          return r;
+        }
+      }
+    } else {
+      auto diff_object = new C_DiffObject<I>(m_image_ctx, diff_context, off,
+                                             read_len);
+      diff_object->send();
+
+      if (diff_context.throttle.pending_error()) {
+        r = diff_context.throttle.wait_for_ret();
+        return r;
+      }
+    }
+
+    left -= read_len;
+    off += read_len;
+  }
+
+  r = diff_context.throttle.wait_for_ret();
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::DiffIterate<librbd::ImageCtx>;
diff --git a/src/librbd/api/DiffIterate.h b/src/librbd/api/DiffIterate.h
new file mode 100644
index 000000000..e6074d9cb
--- /dev/null
+++ b/src/librbd/api/DiffIterate.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_DIFF_ITERATE_H
+#define CEPH_LIBRBD_API_DIFF_ITERATE_H
+
+#include "include/int_types.h"
+#include "common/bit_vector.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class DiffIterate {
+public:
+  typedef int (*Callback)(uint64_t, size_t, int, void *);
+
+  static int diff_iterate(ImageCtxT *ictx,
+                          const cls::rbd::SnapshotNamespace& from_snap_namespace,
+                          const char *fromsnapname,
+                          uint64_t off, uint64_t len, bool include_parent,
+                          bool whole_object,
+                          int (*cb)(uint64_t, size_t, int, void *),
+                          void *arg);
+
+private:
+  ImageCtxT &m_image_ctx;
+  cls::rbd::SnapshotNamespace m_from_snap_namespace;
+  const char* m_from_snap_name;
+  uint64_t m_offset;
+  uint64_t m_length;
+  bool m_include_parent;
+  bool m_whole_object;
+  Callback m_callback;
+  void *m_callback_arg;
+
+  DiffIterate(ImageCtxT &image_ctx,
+              const cls::rbd::SnapshotNamespace& from_snap_namespace,
+              const char *from_snap_name, uint64_t off, uint64_t len,
+              bool include_parent, bool whole_object, Callback callback,
+              void *callback_arg)
+    : m_image_ctx(image_ctx), m_from_snap_namespace(from_snap_namespace),
+      m_from_snap_name(from_snap_name), m_offset(off),
+      m_length(len), m_include_parent(include_parent),
+      m_whole_object(whole_object), m_callback(callback),
+      m_callback_arg(callback_arg)
+  {
+  }
+
+  int execute();
+
+  int diff_object_map(uint64_t from_snap_id, uint64_t to_snap_id,
+                      BitVector<2>* object_diff_state);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::DiffIterate<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_DIFF_ITERATE_H
diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc
new file mode 100644
index 000000000..e5f3da69c
--- /dev/null
+++ b/src/librbd/api/Group.cc
@@ -0,0 +1,1287 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Cond.h"
+#include "common/errno.h"
+
+#include "librbd/ExclusiveLock.h"
+#include "librbd/api/Group.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include
"librbd/io/AioCompletion.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Group: " << __func__ << ": " + +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; +// list binds to list() here, so std::list is explicitly used below + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; +using librados::Rados; + + +namespace librbd { +namespace api { + +namespace { + +template +snap_t get_group_snap_id(I* ictx, + const cls::rbd::SnapshotNamespace& in_snap_namespace) { + ceph_assert(ceph_mutex_is_locked(ictx->image_lock)); + auto it = ictx->snap_ids.lower_bound({cls::rbd::GroupSnapshotNamespace{}, + ""}); + for (; it != ictx->snap_ids.end(); ++it) { + if (it->first.first == in_snap_namespace) { + return it->second; + } else if (!std::holds_alternative( + it->first.first)) { + break; + } + } + return CEPH_NOSNAP; +} + +string generate_uuid(librados::IoCtx& io_ctx) +{ + Rados rados(io_ctx); + uint64_t bid = rados.get_instance_id(); + + uint32_t extra = rand() % 0xFFFFFFFF; + std::ostringstream bid_ss; + bid_ss << std::hex << bid << std::hex << extra; + return bid_ss.str(); +} + +int group_snap_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector *cls_snaps) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + vector ind_snap_names; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + string group_header_oid = util::group_header_name(group_id); + + const int max_read = 1024; + cls::rbd::GroupSnapshot snap_last; + + for (;;) { + vector snaps_page; + + r = cls_client::group_snap_list(&group_ioctx, group_header_oid, + snap_last, max_read, &snaps_page); + + if (r < 0) { + lderr(cct) << "error reading snap list from group: " + << cpp_strerror(-r) << dendl; + return r; + } + cls_snaps->insert(cls_snaps->end(), snaps_page.begin(), snaps_page.end()); + if (snaps_page.size() < max_read) { + break; + } + snap_last = *snaps_page.rbegin(); + } + + return 0; +} + +std::string calc_ind_image_snap_name(uint64_t pool_id, + const std::string &group_id, + const std::string &snap_id) +{ + std::stringstream ind_snap_name_stream; + ind_snap_name_stream << ".group." 
<< std::hex << pool_id << "_" + << group_id << "_" << snap_id; + return ind_snap_name_stream.str(); +} + +int group_image_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector *image_ids) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + string group_header_oid = util::group_header_name(group_id); + + ldout(cct, 20) << "listing images in group name " + << group_name << " group id " << group_header_oid << dendl; + image_ids->clear(); + + const int max_read = 1024; + cls::rbd::GroupImageSpec start_last; + do { + std::vector image_ids_page; + + r = cls_client::group_image_list(&group_ioctx, group_header_oid, + start_last, max_read, &image_ids_page); + + if (r < 0) { + lderr(cct) << "error reading image list from group: " + << cpp_strerror(-r) << dendl; + return r; + } + image_ids->insert(image_ids->end(), + image_ids_page.begin(), image_ids_page.end()); + + if (image_ids_page.size() > 0) + start_last = image_ids_page.rbegin()->spec; + + r = image_ids_page.size(); + } while (r == max_read); + + return 0; +} + +int group_image_remove(librados::IoCtx& group_ioctx, string group_id, + librados::IoCtx& image_ioctx, string image_id) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_header_oid = util::group_header_name(group_id); + + string image_header_oid = util::header_name(image_id); + + ldout(cct, 20) << "removing image " << image_id + << " image id " << image_header_oid << dendl; + + cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id()); + + cls::rbd::GroupImageStatus incomplete_st(image_id, image_ioctx.get_id(), + cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE); + + cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id()); + + int r = cls_client::group_image_set(&group_ioctx, group_header_oid, + incomplete_st); + + if (r < 0) { + lderr(cct) << "couldn't put image into removing state: " + << cpp_strerror(-r) << dendl; + return r; + } + + r = cls_client::image_group_remove(&image_ioctx, image_header_oid, + group_spec); + if ((r < 0) && (r != -ENOENT)) { + lderr(cct) << "couldn't remove group reference from image" + << cpp_strerror(-r) << dendl; + return r; + } else if (r >= 0) { + ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid); + } + + r = cls_client::group_image_remove(&group_ioctx, group_header_oid, spec); + if (r < 0) { + lderr(cct) << "couldn't remove image from group" + << cpp_strerror(-r) << dendl; + return r; + } + + return 0; +} + +int group_snap_remove_by_record(librados::IoCtx& group_ioctx, + const cls::rbd::GroupSnapshot& group_snap, + const std::string& group_id, + const std::string& group_header_oid) { + + CephContext *cct = (CephContext *)group_ioctx.cct(); + std::vector on_finishes; + int r, ret_code; + + std::vector ictxs; + + cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id, + group_snap.id}; + + ldout(cct, 20) << "Removing snapshots" << dendl; + int snap_count = group_snap.snaps.size(); + + for (int i = 0; i < snap_count; ++i) { + librbd::IoCtx image_io_ctx; + r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {}, + &image_io_ctx); + if (r < 0) { + return r; + } + + librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id, + nullptr, image_io_ctx, false); + + C_SaferCond* on_finish = new C_SaferCond; + + 
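+    // opens are issued in parallel; each C_SaferCond is collected in the
+    // wait loop below so that one slow image does not serialize the rest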
image_ctx->state->open(0, on_finish); + + ictxs.push_back(image_ctx); + on_finishes.push_back(on_finish); + } + + ret_code = 0; + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + ictxs[i] = nullptr; + ret_code = r; + } + } + if (ret_code != 0) { + goto finish; + } + + ldout(cct, 20) << "Opened participating images. " << + "Deleting snapshots themselves." << dendl; + + for (int i = 0; i < snap_count; ++i) { + ImageCtx *ictx = ictxs[i]; + on_finishes[i] = new C_SaferCond; + + std::string snap_name; + ictx->image_lock.lock_shared(); + snap_t snap_id = get_group_snap_id(ictx, ne); + r = ictx->get_snap_name(snap_id, &snap_name); + ictx->image_lock.unlock_shared(); + + if (r >= 0) { + ldout(cct, 20) << "removing individual snapshot from image " << ictx->name + << dendl; + ictx->operations->snap_remove(ne, snap_name, on_finishes[i]); + } else { + // We are ok to ignore missing image snapshots. The snapshot could have + // been inconsistent in the first place. + on_finishes[i]->complete(0); + } + } + + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0 && r != -ENOENT) { + // if previous attempts to remove this snapshot failed then the image's + // snapshot may not exist + lderr(cct) << "Failed deleting image snapshot. Ret code: " << r << dendl; + ret_code = r; + } + } + + if (ret_code != 0) { + goto finish; + } + + ldout(cct, 20) << "Removed images snapshots removing snapshot record." + << dendl; + + r = cls_client::group_snap_remove(&group_ioctx, group_header_oid, + group_snap.id); + if (r < 0) { + ret_code = r; + goto finish; + } + +finish: + for (int i = 0; i < snap_count; ++i) { + if (ictxs[i] != nullptr) { + ictxs[i]->state->close(); + } + } + return ret_code; +} + +int group_snap_rollback_by_record(librados::IoCtx& group_ioctx, + const cls::rbd::GroupSnapshot& group_snap, + const std::string& group_id, + const std::string& group_header_oid, + ProgressContext& pctx) { + CephContext *cct = (CephContext *)group_ioctx.cct(); + std::vector on_finishes; + int r, ret_code; + + std::vector ictxs; + + cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id, + group_snap.id}; + + ldout(cct, 20) << "Rolling back snapshots" << dendl; + int snap_count = group_snap.snaps.size(); + + for (int i = 0; i < snap_count; ++i) { + librados::IoCtx image_io_ctx; + r = util::create_ioctx(group_ioctx, "image", group_snap.snaps[i].pool, {}, + &image_io_ctx); + if (r < 0) { + return r; + } + + librbd::ImageCtx* image_ctx = new ImageCtx("", group_snap.snaps[i].image_id, + nullptr, image_io_ctx, false); + + C_SaferCond* on_finish = new C_SaferCond; + + image_ctx->state->open(0, on_finish); + + ictxs.push_back(image_ctx); + on_finishes.push_back(on_finish); + } + + ret_code = 0; + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + ictxs[i] = nullptr; + ret_code = r; + } + } + if (ret_code != 0) { + goto finish; + } + + ldout(cct, 20) << "Requesting exclusive locks for images" << dendl; + for (auto ictx: ictxs) { + std::shared_lock owner_lock{ictx->owner_lock}; + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->block_requests(-EBUSY); + } + } + for (int i = 0; i < snap_count; ++i) { + ImageCtx *ictx = ictxs[i]; + std::shared_lock owner_lock{ictx->owner_lock}; + + on_finishes[i] = new C_SaferCond; + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->acquire_lock(on_finishes[i]); + } + } + + ret_code = 0; + for 
(int i = 0; i < snap_count; ++i) { + r = 0; + ImageCtx *ictx = ictxs[i]; + if (ictx->exclusive_lock != nullptr) { + r = on_finishes[i]->wait(); + } + delete on_finishes[i]; + if (r < 0) { + ret_code = r; + } + } + if (ret_code != 0) { + goto finish; + } + + for (int i = 0; i < snap_count; ++i) { + ImageCtx *ictx = ictxs[i]; + on_finishes[i] = new C_SaferCond; + + std::shared_lock owner_locker{ictx->owner_lock}; + std::string snap_name; + ictx->image_lock.lock_shared(); + snap_t snap_id = get_group_snap_id(ictx, ne); + r = ictx->get_snap_name(snap_id, &snap_name); + ictx->image_lock.unlock_shared(); + + if (r >= 0) { + ldout(cct, 20) << "rolling back to individual snapshot for image " << ictx->name + << dendl; + ictx->operations->execute_snap_rollback(ne, snap_name, pctx, on_finishes[i]); + } else { + on_finishes[i]->complete(r); + } + } + + for (int i = 0; i < snap_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0 && r != -ENOENT) { + lderr(cct) << "Failed rolling back group to snapshot. Ret code: " << r << dendl; + ret_code = r; + } + } + +finish: + for (int i = 0; i < snap_count; ++i) { + if (ictxs[i] != nullptr) { + ictxs[i]->state->close(); + } + } + return ret_code; +} + +template +void notify_unquiesce(std::vector &ictxs, + const std::vector &requests) { + if (requests.empty()) { + return; + } + + ceph_assert(requests.size() == ictxs.size()); + int image_count = ictxs.size(); + std::vector on_finishes(image_count); + + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + + ictx->image_watcher->notify_unquiesce(requests[i], &on_finishes[i]); + } + + for (int i = 0; i < image_count; ++i) { + on_finishes[i].wait(); + } +} + +template +int notify_quiesce(std::vector &ictxs, ProgressContext &prog_ctx, + std::vector *requests) { + int image_count = ictxs.size(); + std::vector on_finishes(image_count); + + requests->resize(image_count); + for (int i = 0; i < image_count; ++i) { + auto ictx = ictxs[i]; + + ictx->image_watcher->notify_quiesce(&(*requests)[i], prog_ctx, + &on_finishes[i]); + } + + int ret_code = 0; + for (int i = 0; i < image_count; ++i) { + int r = on_finishes[i].wait(); + if (r < 0) { + ret_code = r; + } + } + + if (ret_code != 0) { + notify_unquiesce(ictxs, *requests); + } + + return ret_code; +} + +} // anonymous namespace + +template +int Group::image_remove_by_id(librados::IoCtx& group_ioctx, + const char *group_name, + librados::IoCtx& image_ioctx, + const char *image_id) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << " image " + << &image_ioctx << " id " << image_id << dendl; + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 20) << "removing image from group name " << group_name + << " group id " << group_id << dendl; + + return group_image_remove(group_ioctx, group_id, image_ioctx, string(image_id)); +} + +template +int Group::create(librados::IoCtx& io_ctx, const char *group_name) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + + string id = generate_uuid(io_ctx); + + ldout(cct, 2) << "adding group to directory..." 
<< dendl; + + int r = cls_client::group_dir_add(&io_ctx, RBD_GROUP_DIRECTORY, group_name, + id); + if (r < 0) { + lderr(cct) << "error adding group to directory: " + << cpp_strerror(r) + << dendl; + return r; + } + string header_oid = util::group_header_name(id); + + r = io_ctx.create(header_oid, true); + if (r < 0) { + lderr(cct) << "error creating group header: " << cpp_strerror(r) << dendl; + goto err_remove_from_dir; + } + + return 0; + +err_remove_from_dir: + int remove_r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY, + group_name, id); + if (remove_r < 0) { + lderr(cct) << "error cleaning up group from rbd_directory " + << "object after creation failed: " << cpp_strerror(remove_r) + << dendl; + } + + return r; +} + +template +int Group::remove(librados::IoCtx& io_ctx, const char *group_name) +{ + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "group_remove " << &io_ctx << " " << group_name << dendl; + + std::string group_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY, + std::string(group_name), &group_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error getting id of group" << dendl; + return r; + } + string group_header_oid = util::group_header_name(group_id); + + std::vector snaps; + r = group_snap_list(io_ctx, group_name, &snaps); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing group snapshots" << dendl; + return r; + } + + for (auto &snap : snaps) { + r = group_snap_remove_by_record(io_ctx, snap, group_id, group_header_oid); + if (r < 0) { + return r; + } + } + + std::vector images; + r = group_image_list(io_ctx, group_name, &images); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing group images" << dendl; + return r; + } + + for (auto image : images) { + IoCtx image_ioctx; + r = util::create_ioctx(io_ctx, "image", image.spec.pool_id, {}, + &image_ioctx); + if (r < 0) { + return r; + } + + r = group_image_remove(io_ctx, group_id, image_ioctx, image.spec.image_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing image from a group" << dendl; + return r; + } + } + + string header_oid = util::group_header_name(group_id); + + r = io_ctx.remove(header_oid); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing header: " << cpp_strerror(-r) << dendl; + return r; + } + + r = cls_client::group_dir_remove(&io_ctx, RBD_GROUP_DIRECTORY, + group_name, group_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing group from directory" << dendl; + return r; + } + + return 0; +} + +template +int Group::list(IoCtx& io_ctx, vector *names) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl; + + int max_read = 1024; + string last_read = ""; + int r; + do { + map groups; + r = cls_client::group_dir_list(&io_ctx, RBD_GROUP_DIRECTORY, last_read, + max_read, &groups); + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "error listing group in directory: " + << cpp_strerror(r) << dendl; + } else { + r = 0; + } + return r; + } + for (pair group : groups) { + names->push_back(group.first); + } + if (!groups.empty()) { + last_read = groups.rbegin()->first; + } + r = groups.size(); + } while (r == max_read); + + return 0; +} + +template +int Group::image_add(librados::IoCtx& group_ioctx, const char *group_name, + librados::IoCtx& image_ioctx, const char *image_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << " image " + << &image_ioctx << 
" name " << image_name << dendl; + + if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) { + lderr(cct) << "group and image cannot be in different namespaces" << dendl; + return -EINVAL; + } + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + string group_header_oid = util::group_header_name(group_id); + + + ldout(cct, 20) << "adding image to group name " << group_name + << " group id " << group_header_oid << dendl; + + string image_id; + + r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name, + &image_id); + if (r < 0) { + lderr(cct) << "error reading image id object: " + << cpp_strerror(-r) << dendl; + return r; + } + + string image_header_oid = util::header_name(image_id); + + ldout(cct, 20) << "adding image " << image_name + << " image id " << image_header_oid << dendl; + + cls::rbd::GroupImageStatus incomplete_st( + image_id, image_ioctx.get_id(), + cls::rbd::GROUP_IMAGE_LINK_STATE_INCOMPLETE); + cls::rbd::GroupImageStatus attached_st( + image_id, image_ioctx.get_id(), cls::rbd::GROUP_IMAGE_LINK_STATE_ATTACHED); + + r = cls_client::group_image_set(&group_ioctx, group_header_oid, + incomplete_st); + + cls::rbd::GroupSpec group_spec(group_id, group_ioctx.get_id()); + + if (r < 0) { + lderr(cct) << "error adding image reference to group: " + << cpp_strerror(-r) << dendl; + return r; + } + + r = cls_client::image_group_add(&image_ioctx, image_header_oid, group_spec); + if (r < 0) { + lderr(cct) << "error adding group reference to image: " + << cpp_strerror(-r) << dendl; + cls::rbd::GroupImageSpec spec(image_id, image_ioctx.get_id()); + cls_client::group_image_remove(&group_ioctx, group_header_oid, spec); + // Ignore errors in the clean up procedure. 
+ return r; + } + ImageWatcher<>::notify_header_update(image_ioctx, image_header_oid); + + r = cls_client::group_image_set(&group_ioctx, group_header_oid, + attached_st); + + return r; +} + +template +int Group::image_remove(librados::IoCtx& group_ioctx, const char *group_name, + librados::IoCtx& image_ioctx, const char *image_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << " image " + << &image_ioctx << " name " << image_name << dendl; + + if (group_ioctx.get_namespace() != image_ioctx.get_namespace()) { + lderr(cct) << "group and image cannot be in different namespaces" << dendl; + return -EINVAL; + } + + string group_id; + + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 20) << "removing image from group name " << group_name + << " group id " << group_id << dendl; + + string image_id; + r = cls_client::dir_get_id(&image_ioctx, RBD_DIRECTORY, image_name, + &image_id); + if (r < 0) { + lderr(cct) << "error reading image id object: " + << cpp_strerror(-r) << dendl; + return r; + } + + r = group_image_remove(group_ioctx, group_id, image_ioctx, image_id); + + return r; +} + +template +int Group::image_list(librados::IoCtx& group_ioctx, + const char *group_name, + std::vector* images) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + ldout(cct, 20) << "io_ctx=" << &group_ioctx + << " group name " << group_name << dendl; + + std::vector image_ids; + + group_image_list(group_ioctx, group_name, &image_ids); + + for (auto image_id : image_ids) { + IoCtx ioctx; + int r = util::create_ioctx(group_ioctx, "image", image_id.spec.pool_id, {}, + &ioctx); + if (r < 0) { + return r; + } + + std::string image_name; + r = cls_client::dir_get_name(&ioctx, RBD_DIRECTORY, + image_id.spec.image_id, &image_name); + if (r < 0) { + return r; + } + + images->push_back( + group_image_info_t { + image_name, + ioctx.get_id(), + static_cast(image_id.state)}); + } + + return 0; +} + +template +int Group::rename(librados::IoCtx& io_ctx, const char *src_name, + const char *dest_name) +{ + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "group_rename " << &io_ctx << " " << src_name + << " -> " << dest_name << dendl; + + std::string group_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_GROUP_DIRECTORY, + std::string(src_name), &group_id); + if (r < 0) { + if (r != -ENOENT) + lderr(cct) << "error getting id of group" << dendl; + return r; + } + + r = cls_client::group_dir_rename(&io_ctx, RBD_GROUP_DIRECTORY, + src_name, dest_name, group_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error renaming group from directory" << dendl; + return r; + } + + return 0; +} + + +template +int Group::image_get_group(I *ictx, group_info_t *group_info) +{ + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + if (RBD_GROUP_INVALID_POOL != ictx->group_spec.pool_id) { + IoCtx ioctx; + r = util::create_ioctx(ictx->md_ctx, "group", ictx->group_spec.pool_id, {}, + &ioctx); + if (r < 0) { + return r; + } + + std::string group_name; + r = cls_client::dir_get_name(&ioctx, RBD_GROUP_DIRECTORY, + ictx->group_spec.group_id, &group_name); + if (r < 0) + return r; + group_info->pool = ioctx.get_id(); + group_info->name = group_name; + } else { + group_info->pool = RBD_GROUP_INVALID_POOL; + group_info->name = ""; + } + + return 0; 
+} + +template +int Group::snap_create(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name, + uint32_t flags) { + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + cls::rbd::GroupSnapshot group_snap; + vector image_snaps; + std::string ind_snap_name; + + std::vector ictxs; + std::vector on_finishes; + std::vector quiesce_requests; + NoOpProgressContext prog_ctx; + uint64_t internal_flags = 0; + + int r = util::snap_create_flags_api_to_internal(cct, flags, &internal_flags); + if (r < 0) { + return r; + } + internal_flags &= ~(SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE | + SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR); + + r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, group_name, + &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + std::vector images; + r = group_image_list(group_ioctx, group_name, &images); + if (r < 0) { + return r; + } + int image_count = images.size(); + + ldout(cct, 20) << "Found " << image_count << " images in group" << dendl; + + image_snaps = vector(image_count, + cls::rbd::ImageSnapshotSpec()); + + for (int i = 0; i < image_count; ++i) { + image_snaps[i].pool = images[i].spec.pool_id; + image_snaps[i].image_id = images[i].spec.image_id; + } + + string group_header_oid = util::group_header_name(group_id); + + group_snap.id = generate_uuid(group_ioctx); + group_snap.name = string(snap_name); + group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_INCOMPLETE; + group_snap.snaps = image_snaps; + + cls::rbd::GroupSnapshotNamespace ne{group_ioctx.get_id(), group_id, + group_snap.id}; + + r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap); + if (r == -EEXIST) { + lderr(cct) << "snapshot with this name already exists: " + << cpp_strerror(r) + << dendl; + } + int ret_code = 0; + if (r < 0) { + ret_code = r; + goto finish; + } + + for (auto image: images) { + librbd::IoCtx image_io_ctx; + r = util::create_ioctx(group_ioctx, "image", image.spec.pool_id, {}, + &image_io_ctx); + if (r < 0) { + ret_code = r; + goto finish; + } + + ldout(cct, 20) << "Opening image with id " << image.spec.image_id << dendl; + + librbd::ImageCtx* image_ctx = new ImageCtx("", image.spec.image_id.c_str(), + nullptr, image_io_ctx, false); + + C_SaferCond* on_finish = new C_SaferCond; + + image_ctx->state->open(0, on_finish); + + ictxs.push_back(image_ctx); + on_finishes.push_back(on_finish); + } + ldout(cct, 20) << "Issued open request waiting for the completion" << dendl; + ret_code = 0; + for (int i = 0; i < image_count; ++i) { + + ldout(cct, 20) << "Waiting for completion on on_finish: " << + on_finishes[i] << dendl; + + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + ictxs[i] = nullptr; + ret_code = r; + } + } + if (ret_code != 0) { + goto remove_record; + } + + if ((flags & RBD_SNAP_CREATE_SKIP_QUIESCE) == 0) { + ldout(cct, 20) << "Sending quiesce notification" << dendl; + ret_code = notify_quiesce(ictxs, prog_ctx, &quiesce_requests); + if (ret_code != 0 && (flags & RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR) == 0) { + goto remove_record; + } + } + + ldout(cct, 20) << "Requesting exclusive locks for images" << dendl; + + for (auto ictx: ictxs) { + std::shared_lock owner_lock{ictx->owner_lock}; + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->block_requests(-EBUSY); + } + } + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + std::shared_lock owner_lock{ictx->owner_lock}; + + 
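+    // Issue the exclusive-lock acquisition for every image up front and
+    // collect the completions; the wait loop that follows blocks on all of
+    // them at once, so the locks are acquired in parallel rather than one
+    // image at a time.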
on_finishes[i] = new C_SaferCond; + if (ictx->exclusive_lock != nullptr) { + ictx->exclusive_lock->acquire_lock(on_finishes[i]); + } + } + + ret_code = 0; + for (int i = 0; i < image_count; ++i) { + r = 0; + ImageCtx *ictx = ictxs[i]; + if (ictx->exclusive_lock != nullptr) { + r = on_finishes[i]->wait(); + } + delete on_finishes[i]; + if (r < 0) { + ret_code = r; + } + } + if (ret_code != 0) { + notify_unquiesce(ictxs, quiesce_requests); + goto remove_record; + } + + ind_snap_name = calc_ind_image_snap_name(group_ioctx.get_id(), group_id, + group_snap.id); + + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + + C_SaferCond* on_finish = new C_SaferCond; + + std::shared_lock owner_locker{ictx->owner_lock}; + ictx->operations->execute_snap_create( + ne, ind_snap_name.c_str(), on_finish, 0, + SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE, prog_ctx); + + on_finishes[i] = on_finish; + } + + ret_code = 0; + for (int i = 0; i < image_count; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0) { + ret_code = r; + } else { + ImageCtx *ictx = ictxs[i]; + ictx->image_lock.lock_shared(); + snap_t snap_id = get_group_snap_id(ictx, ne); + ictx->image_lock.unlock_shared(); + if (snap_id == CEPH_NOSNAP) { + ldout(cct, 20) << "Couldn't find created snapshot with namespace: " + << ne << dendl; + ret_code = -ENOENT; + } else { + image_snaps[i].snap_id = snapid_t(snap_id); + image_snaps[i].pool = ictx->md_ctx.get_id(); + image_snaps[i].image_id = ictx->id; + } + } + } + if (ret_code != 0) { + goto remove_image_snaps; + } + + group_snap.snaps = image_snaps; + group_snap.state = cls::rbd::GROUP_SNAPSHOT_STATE_COMPLETE; + + r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap); + if (r < 0) { + ret_code = r; + goto remove_image_snaps; + } + + ldout(cct, 20) << "Sending unquiesce notification" << dendl; + notify_unquiesce(ictxs, quiesce_requests); + + goto finish; + +remove_image_snaps: + notify_unquiesce(ictxs, quiesce_requests); + + for (int i = 0; i < image_count; ++i) { + ImageCtx *ictx = ictxs[i]; + ldout(cct, 20) << "Removing individual snapshot with name: " << + ind_snap_name << dendl; + + on_finishes[i] = new C_SaferCond; + std::string snap_name; + ictx->image_lock.lock_shared(); + snap_t snap_id = get_group_snap_id(ictx, ne); + r = ictx->get_snap_name(snap_id, &snap_name); + ictx->image_lock.unlock_shared(); + if (r >= 0) { + ictx->operations->snap_remove(ne, snap_name.c_str(), on_finishes[i]); + } else { + // Ignore missing image snapshots. The whole snapshot could have been + // inconsistent. + on_finishes[i]->complete(0); + } + } + + for (int i = 0, n = on_finishes.size(); i < n; ++i) { + r = on_finishes[i]->wait(); + delete on_finishes[i]; + if (r < 0 && r != -ENOENT) { // if previous attempts to remove this snapshot failed then the image's snapshot may not exist + lderr(cct) << "Failed cleaning up image snapshot. 
Ret code: " << r << dendl; + // just report error, but don't abort the process + } + } + +remove_record: + r = cls_client::group_snap_remove(&group_ioctx, group_header_oid, + group_snap.id); + if (r < 0) { + lderr(cct) << "error while cleaning up group snapshot" << dendl; + // we ignore return value in clean up + } + +finish: + for (int i = 0, n = ictxs.size(); i < n; ++i) { + if (ictxs[i] != nullptr) { + ictxs[i]->state->close(); + } + } + return ret_code; +} + +template +int Group::snap_remove(librados::IoCtx& group_ioctx, const char *group_name, + const char *snap_name) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) + << dendl; + return r; + } + + std::vector snaps; + r = group_snap_list(group_ioctx, group_name, &snaps); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot *group_snap = nullptr; + for (auto &snap : snaps) { + if (snap.name == string(snap_name)) { + group_snap = &snap; + break; + } + } + if (group_snap == nullptr) { + return -ENOENT; + } + + string group_header_oid = util::group_header_name(group_id); + r = group_snap_remove_by_record(group_ioctx, *group_snap, group_id, + group_header_oid); + return r; +} + +template +int Group::snap_rename(librados::IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, + const char *new_snap_name) { + CephContext *cct = (CephContext *)group_ioctx.cct(); + if (0 == strcmp(old_snap_name, new_snap_name)) + return -EEXIST; + + std::string group_id; + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "error reading group id object: " << cpp_strerror(r) << dendl; + return r; + } + + std::vector group_snaps; + r = group_snap_list(group_ioctx, group_name, &group_snaps); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot group_snap; + for (auto &snap : group_snaps) { + if (snap.name == old_snap_name) { + group_snap = snap; + break; + } + } + + if (group_snap.id.empty()) { + return -ENOENT; + } + + std::string group_header_oid = util::group_header_name(group_id); + group_snap.name = new_snap_name; + r = cls_client::group_snap_set(&group_ioctx, group_header_oid, group_snap); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Group::snap_list(librados::IoCtx& group_ioctx, const char *group_name, + std::vector *snaps) +{ + std::vector cls_snaps; + + int r = group_snap_list(group_ioctx, group_name, &cls_snaps); + if (r < 0) { + return r; + } + + for (auto snap : cls_snaps) { + snaps->push_back( + group_snap_info_t { + snap.name, + static_cast(snap.state)}); + + } + return 0; +} + +template +int Group::snap_rollback(librados::IoCtx& group_ioctx, + const char *group_name, const char *snap_name, + ProgressContext& pctx) +{ + CephContext *cct = (CephContext *)group_ioctx.cct(); + + string group_id; + int r = cls_client::dir_get_id(&group_ioctx, RBD_GROUP_DIRECTORY, + group_name, &group_id); + if (r < 0) { + lderr(cct) << "error reading group id object: " + << cpp_strerror(r) << dendl; + return r; + } + + std::vector snaps; + r = group_snap_list(group_ioctx, group_name, &snaps); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot *group_snap = nullptr; + for (auto &snap : snaps) { + if (snap.name == string(snap_name)) { + group_snap = &snap; + break; + } + } + if (group_snap == 
nullptr) {
+    return -ENOENT;
+  }
+
+  string group_header_oid = util::group_header_name(group_id);
+  r = group_snap_rollback_by_record(group_ioctx, *group_snap, group_id,
+                                    group_header_oid, pctx);
+  return r;
+}
+
+} // namespace api
+} // namespace librbd
+
+template class librbd::api::Group<librbd::ImageCtx>;
diff --git a/src/librbd/api/Group.h b/src/librbd/api/Group.h
new file mode 100644
index 000000000..9d3abcc59
--- /dev/null
+++ b/src/librbd/api/Group.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_GROUP_H
+#define CEPH_LIBRBD_API_GROUP_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+#include <vector>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Group {
+
+  static int create(librados::IoCtx& io_ctx, const char *group_name);
+  static int remove(librados::IoCtx& io_ctx, const char *group_name);
+  static int list(librados::IoCtx& io_ctx, std::vector<std::string> *names);
+  static int rename(librados::IoCtx& io_ctx, const char *src_group_name,
+                    const char *dest_group_name);
+
+  static int image_add(librados::IoCtx& group_ioctx, const char *group_name,
+                       librados::IoCtx& image_ioctx, const char *image_name);
+  static int image_remove(librados::IoCtx& group_ioctx, const char *group_name,
+                          librados::IoCtx& image_ioctx, const char *image_name);
+  static int image_remove_by_id(librados::IoCtx& group_ioctx,
+                                const char *group_name,
+                                librados::IoCtx& image_ioctx,
+                                const char *image_id);
+  static int image_list(librados::IoCtx& group_ioctx, const char *group_name,
+                        std::vector<group_image_info_t> *images);
+
+  static int image_get_group(ImageCtxT *ictx, group_info_t *group_info);
+
+  static int snap_create(librados::IoCtx& group_ioctx,
+                         const char *group_name, const char *snap_name,
+                         uint32_t flags);
+  static int snap_remove(librados::IoCtx& group_ioctx,
+                         const char *group_name, const char *snap_name);
+  static int snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
+                         const char *old_snap_name, const char *new_snap_name);
+  static int snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+                       std::vector<group_snap_info_t> *snaps);
+  static int snap_rollback(librados::IoCtx& group_ioctx,
+                           const char *group_name, const char *snap_name,
+                           ProgressContext& pctx);
+
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::Group<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_GROUP_H
diff --git a/src/librbd/api/Image.cc b/src/librbd/api/Image.cc
new file mode 100644
index 000000000..19dc5aa68
--- /dev/null
+++ b/src/librbd/api/Image.cc
@@ -0,0 +1,1015 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Image.h"
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Cond.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/DeepCopyRequest.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Trash.h"
+#include "librbd/api/Utils.h"
+#include "librbd/crypto/FormatRequest.h"
+#include "librbd/crypto/LoadRequest.h"
+#include "librbd/deep_copy/Handler.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/image/RemoveRequest.h"
+#include "librbd/image/PreRemoveRequest.h"
+#include
"librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Image: " << __func__ << ": " + +using std::map; +using std::string; +using librados::snap_t; + +namespace librbd { +namespace api { + +namespace { + +bool compare_by_pool(const librbd::linked_image_spec_t& lhs, + const librbd::linked_image_spec_t& rhs) +{ + if (lhs.pool_id != rhs.pool_id) { + return lhs.pool_id < rhs.pool_id; + } else if (lhs.pool_namespace != rhs.pool_namespace) { + return lhs.pool_namespace < rhs.pool_namespace; + } + return false; +} + +bool compare(const librbd::linked_image_spec_t& lhs, + const librbd::linked_image_spec_t& rhs) +{ + if (lhs.pool_name != rhs.pool_name) { + return lhs.pool_name < rhs.pool_name; + } else if (lhs.pool_id != rhs.pool_id) { + return lhs.pool_id < rhs.pool_id; + } else if (lhs.pool_namespace != rhs.pool_namespace) { + return lhs.pool_namespace < rhs.pool_namespace; + } else if (lhs.image_name != rhs.image_name) { + return lhs.image_name < rhs.image_name; + } else if (lhs.image_id != rhs.image_id) { + return lhs.image_id < rhs.image_id; + } + return false; +} + +template +int pre_remove_image(librados::IoCtx& io_ctx, const std::string& image_id) { + I *image_ctx = I::create("", image_id, nullptr, io_ctx, false); + int r = image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + auto req = image::PreRemoveRequest::create(image_ctx, false, &ctx); + req->send(); + + r = ctx.wait(); + image_ctx->state->close(); + return r; +} + +} // anonymous namespace + +template +int64_t Image::get_data_pool_id(I *ictx) { + if (ictx->data_ctx.is_valid()) { + return ictx->data_ctx.get_id(); + } + + int64_t pool_id; + int r = cls_client::get_data_pool(&ictx->md_ctx, ictx->header_oid, &pool_id); + if (r < 0) { + CephContext *cct = ictx->cct; + lderr(cct) << "error getting data pool ID: " << cpp_strerror(r) << dendl; + return r; + } + + return pool_id; +} + +template +int Image::get_op_features(I *ictx, uint64_t *op_features) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "image_ctx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock image_locker{ictx->image_lock}; + *op_features = ictx->op_features; + return 0; +} + +template +int Image::list_images(librados::IoCtx& io_ctx, + std::vector *images) { + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "list " << &io_ctx << dendl; + + int r; + images->clear(); + + if (io_ctx.get_namespace().empty()) { + bufferlist bl; + r = io_ctx.read(RBD_DIRECTORY, bl, 0, 0); + if (r == -ENOENT) { + return 0; + } else if (r < 0) { + lderr(cct) << "error listing v1 images: " << cpp_strerror(r) << dendl; + return r; + } + + // V1 format images are in a tmap + if (bl.length()) { + auto p = bl.cbegin(); + bufferlist header; + std::map m; + decode(header, p); + decode(m, p); + for (auto& it : m) { + images->push_back({.id ="", .name = it.first}); + } + } + } + + // V2 format images + std::map image_names_to_ids; + r = list_images_v2(io_ctx, &image_names_to_ids); + if (r < 0) { + lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& img_pair : image_names_to_ids) { + images->push_back({.id = img_pair.second, + .name = img_pair.first}); + } + + // include V2 images in 
a partially removed state + std::vector trash_images; + r = Trash::list(io_ctx, trash_images, false); + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error listing trash images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& trash_image : trash_images) { + if (trash_image.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + images->push_back({.id = trash_image.id, + .name = trash_image.name}); + + } + } + + return 0; +} + +template +int Image::list_images_v2(librados::IoCtx& io_ctx, ImageNameToIds *images) { + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "io_ctx=" << &io_ctx << dendl; + + // new format images are accessed by class methods + int r; + int max_read = 1024; + string last_read = ""; + do { + map images_page; + r = cls_client::dir_list(&io_ctx, RBD_DIRECTORY, last_read, max_read, + &images_page); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing image in directory: " + << cpp_strerror(r) << dendl; + return r; + } else if (r == -ENOENT) { + break; + } + for (map::const_iterator it = images_page.begin(); + it != images_page.end(); ++it) { + images->insert(*it); + } + if (!images_page.empty()) { + last_read = images_page.rbegin()->first; + } + r = images_page.size(); + } while (r == max_read); + + return 0; +} + +template +int Image::get_parent(I *ictx, + librbd::linked_image_spec_t *parent_image, + librbd::snap_spec_t *parent_snap) { + auto cct = ictx->cct; + ldout(cct, 20) << "image_ctx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock image_locker{ictx->image_lock}; + + bool release_image_lock = false; + BOOST_SCOPE_EXIT_ALL(ictx, &release_image_lock) { + if (release_image_lock) { + ictx->parent->image_lock.unlock_shared(); + } + }; + + // if a migration is in-progress, the true parent is the parent + // of the migration source image + auto parent = ictx->parent; + if (!ictx->migration_info.empty() && ictx->parent != nullptr) { + release_image_lock = true; + ictx->parent->image_lock.lock_shared(); + + parent = ictx->parent->parent; + } + + if (parent == nullptr) { + return -ENOENT; + } + + parent_image->pool_id = parent->md_ctx.get_id(); + parent_image->pool_name = parent->md_ctx.get_pool_name(); + parent_image->pool_namespace = parent->md_ctx.get_namespace(); + + std::shared_lock parent_image_locker{parent->image_lock}; + parent_snap->id = parent->snap_id; + parent_snap->namespace_type = RBD_SNAP_NAMESPACE_TYPE_USER; + if (parent->snap_id != CEPH_NOSNAP) { + auto snap_info = parent->get_snap_info(parent->snap_id); + if (snap_info == nullptr) { + lderr(cct) << "error finding parent snap name: " << cpp_strerror(r) + << dendl; + return -ENOENT; + } + + parent_snap->namespace_type = static_cast( + cls::rbd::get_snap_namespace_type(snap_info->snap_namespace)); + parent_snap->name = snap_info->name; + } + + parent_image->image_id = parent->id; + parent_image->image_name = parent->name; + parent_image->trash = true; + + librbd::trash_image_info_t trash_info; + r = Trash::get(parent->md_ctx, parent->id, &trash_info); + if (r == -ENOENT || r == -EOPNOTSUPP) { + parent_image->trash = false; + } else if (r < 0) { + lderr(cct) << "error looking up trash status: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Image::list_children(I *ictx, + std::vector *images) { + images->clear(); + return list_descendants(ictx, 1, images); +} + +template +int Image::list_children(I *ictx, + const cls::rbd::ParentImageSpec &parent_spec, + std::vector 
*images) { + images->clear(); + return list_descendants(ictx, parent_spec, 1, images); +} + +template +int Image::list_descendants( + librados::IoCtx& io_ctx, const std::string &image_id, + const std::optional &max_level, + std::vector *images) { + ImageCtx *ictx = new librbd::ImageCtx("", image_id, nullptr, + io_ctx, true); + CephContext *cct = ictx->cct; + int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r < 0) { + if (r == -ENOENT) { + return 0; + } + lderr(cct) << "failed to open descendant " << image_id + << " from pool " << io_ctx.get_pool_name() << ":" + << cpp_strerror(r) << dendl; + return r; + } + + r = list_descendants(ictx, max_level, images); + + int r1 = ictx->state->close(); + if (r1 < 0) { + lderr(cct) << "error when closing descendant " << image_id + << " from pool " << io_ctx.get_pool_name() << ":" + << cpp_strerror(r1) << dendl; + } + + return r; +} + +template +int Image::list_descendants( + I *ictx, const std::optional &max_level, + std::vector *images) { + std::shared_lock l{ictx->image_lock}; + std::vector snap_ids; + if (ictx->snap_id != CEPH_NOSNAP) { + snap_ids.push_back(ictx->snap_id); + } else { + snap_ids = ictx->snaps; + } + for (auto snap_id : snap_ids) { + cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(), + ictx->md_ctx.get_namespace(), + ictx->id, snap_id}; + int r = list_descendants(ictx, parent_spec, max_level, images); + if (r < 0) { + return r; + } + } + return 0; +} + +template +int Image::list_descendants( + I *ictx, const cls::rbd::ParentImageSpec &parent_spec, + const std::optional &max_level, + std::vector *images) { + auto child_max_level = max_level; + if (child_max_level) { + if (child_max_level == 0) { + return 0; + } + (*child_max_level)--; + } + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + // no children for non-layered or old format image + if (!ictx->test_features(RBD_FEATURE_LAYERING, ictx->image_lock)) { + return 0; + } + + librados::Rados rados(ictx->md_ctx); + + // search all pools for clone v1 children dependent on this snapshot + std::list > pools; + int r = rados.pool_list2(pools); + if (r < 0) { + lderr(cct) << "error listing pools: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& it : pools) { + int64_t base_tier; + r = rados.pool_get_base_tier(it.first, &base_tier); + if (r == -ENOENT) { + ldout(cct, 1) << "pool " << it.second << " no longer exists" << dendl; + continue; + } else if (r < 0) { + lderr(cct) << "error retrieving base tier for pool " << it.second + << dendl; + return r; + } + if (it.first != base_tier) { + // pool is a cache; skip it + continue; + } + + IoCtx ioctx; + r = librbd::util::create_ioctx( + ictx->md_ctx, "child image", it.first, {}, &ioctx); + if (r == -ENOENT) { + continue; + } else if (r < 0) { + return r; + } + + std::set image_ids; + r = cls_client::get_children(&ioctx, RBD_CHILDREN, parent_spec, + image_ids); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error reading list of children from pool " << it.second + << dendl; + return r; + } + + for (auto& image_id : image_ids) { + images->push_back({ + it.first, "", ictx->md_ctx.get_namespace(), image_id, "", false}); + r = list_descendants(ioctx, image_id, child_max_level, images); + if (r < 0) { + return r; + } + } + } + + // retrieve clone v2 children attached to this snapshot + IoCtx parent_io_ctx; + r = librbd::util::create_ioctx( + ictx->md_ctx, "parent image",parent_spec.pool_id, + parent_spec.pool_namespace, &parent_io_ctx); + if (r < 0) { + return r; + } + + 
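+  // Clone v2 children are registered directly in the parent image's header
+  // object, so a single children_list() call per parent snapshot is enough
+  // here, whereas the clone v1 path above has to scan every pool for
+  // potential children.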
cls::rbd::ChildImageSpecs child_images; + r = cls_client::children_list( + &parent_io_ctx, librbd::util::header_name(parent_spec.image_id), + parent_spec.snap_id, &child_images); + if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) { + lderr(cct) << "error retrieving children: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& child_image : child_images) { + images->push_back({ + child_image.pool_id, "", child_image.pool_namespace, + child_image.image_id, "", false}); + if (!child_max_level || *child_max_level > 0) { + IoCtx ioctx; + r = librbd::util::create_ioctx( + ictx->md_ctx, "child image", child_image.pool_id, + child_image.pool_namespace, &ioctx); + if (r == -ENOENT) { + continue; + } else if (r < 0) { + return r; + } + r = list_descendants(ioctx, child_image.image_id, child_max_level, + images); + if (r < 0) { + return r; + } + } + } + + // batch lookups by pool + namespace + std::sort(images->begin(), images->end(), compare_by_pool); + + int64_t child_pool_id = -1; + librados::IoCtx child_io_ctx; + std::map> child_image_id_to_info; + for (auto& image : *images) { + if (child_pool_id == -1 || child_pool_id != image.pool_id || + child_io_ctx.get_namespace() != image.pool_namespace) { + r = librbd::util::create_ioctx( + ictx->md_ctx, "child image", image.pool_id, image.pool_namespace, + &child_io_ctx); + if (r == -ENOENT) { + image.pool_name = ""; + image.image_name = ""; + continue; + } else if (r < 0) { + return r; + } + child_pool_id = image.pool_id; + + child_image_id_to_info.clear(); + + std::map image_names_to_ids; + r = list_images_v2(child_io_ctx, &image_names_to_ids); + if (r < 0) { + lderr(cct) << "error listing v2 images: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& [name, id] : image_names_to_ids) { + child_image_id_to_info.insert({id, {name, false}}); + } + + std::vector trash_images; + r = Trash::list(child_io_ctx, trash_images, false); + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error listing trash images: " << cpp_strerror(r) + << dendl; + return r; + } + + for (auto& it : trash_images) { + child_image_id_to_info.insert({ + it.id, + {it.name, + it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING ? false : true}}); + } + } + + auto it = child_image_id_to_info.find(image.image_id); + if (it == child_image_id_to_info.end()) { + lderr(cct) << "error looking up name for image id " + << image.image_id << " in pool " + << child_io_ctx.get_pool_name() + << (image.pool_namespace.empty() ? + "" : "/" + image.pool_namespace) << dendl; + return -ENOENT; + } + + image.pool_name = child_io_ctx.get_pool_name(); + image.image_name = it->second.first; + image.trash = it->second.second; + } + + // final sort by pool + image names + std::sort(images->begin(), images->end(), compare); + return 0; +} + +template +int Image::deep_copy(I *src, librados::IoCtx& dest_md_ctx, + const char *destname, ImageOptions& opts, + ProgressContext &prog_ctx) { + CephContext *cct = (CephContext *)dest_md_ctx.cct(); + ldout(cct, 20) << src->name + << (src->snap_name.length() ? 
"@" + src->snap_name : "") + << " -> " << destname << " opts = " << opts << dendl; + + uint64_t features; + uint64_t src_size; + { + std::shared_lock image_locker{src->image_lock}; + + if (!src->migration_info.empty()) { + lderr(cct) << "cannot deep copy migrating image" << dendl; + return -EBUSY; + } + + features = src->features; + src_size = src->get_image_size(src->snap_id); + } + uint64_t format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, format); + } + if (format == 1) { + lderr(cct) << "old format not supported for destination image" << dendl; + return -EINVAL; + } + uint64_t stripe_unit = src->stripe_unit; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + uint64_t stripe_count = src->stripe_count; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + uint64_t order = src->order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) { + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + } + if (features & ~RBD_FEATURES_ALL) { + lderr(cct) << "librbd does not support requested features" << dendl; + return -ENOSYS; + } + + uint64_t flatten = 0; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) { + opts.unset(RBD_IMAGE_OPTION_FLATTEN); + } + + cls::rbd::ParentImageSpec parent_spec; + if (flatten > 0) { + parent_spec.pool_id = -1; + } else { + std::shared_lock image_locker{src->image_lock}; + + // use oldest snapshot or HEAD for parent spec + if (!src->snap_info.empty()) { + parent_spec = src->snap_info.begin()->second.parent.spec; + } else { + parent_spec = src->parent_md.spec; + } + } + + int r; + if (parent_spec.pool_id == -1) { + r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false); + } else { + librados::IoCtx parent_io_ctx; + r = librbd::util::create_ioctx( + src->md_ctx, "parent image", parent_spec.pool_id, + parent_spec.pool_namespace, &parent_io_ctx); + if (r < 0) { + return r; + } + + ConfigProxy config{cct->_conf}; + api::Config::apply_pool_overrides(dest_md_ctx, &config); + + C_SaferCond ctx; + std::string dest_id = librbd::util::generate_image_id(dest_md_ctx); + auto *req = image::CloneRequest::create( + config, parent_io_ctx, parent_spec.image_id, "", {}, parent_spec.snap_id, + dest_md_ctx, destname, dest_id, opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, + "", "", src->op_work_queue, &ctx); + req->send(); + r = ctx.wait(); + } + if (r < 0) { + lderr(cct) << "header creation failed" << dendl; + return r; + } + opts.set(RBD_IMAGE_OPTION_ORDER, static_cast(order)); + + auto dest = new I(destname, "", nullptr, dest_md_ctx, false); + r = dest->state->open(0); + if (r < 0) { + lderr(cct) << "failed to read newly created header" << dendl; + return r; + } + + C_SaferCond lock_ctx; + { + std::unique_lock locker{dest->owner_lock}; + + if (dest->exclusive_lock == nullptr || + dest->exclusive_lock->is_lock_owner()) { + lock_ctx.complete(0); + } else { + dest->exclusive_lock->acquire_lock(&lock_ctx); + } + } + + r = lock_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r) + << dendl; + dest->state->close(); + return r; + } + + r = deep_copy(src, dest, flatten > 0, prog_ctx); + + int close_r = dest->state->close(); + if (r == 0 && close_r < 0) { + r = close_r; + } + return r; +} + +template +int 
Image::deep_copy(I *src, I *dest, bool flatten, + ProgressContext &prog_ctx) { + // ensure previous writes are visible to dest + C_SaferCond flush_ctx; + { + std::shared_lock owner_locker{src->owner_lock}; + auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, src, + io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *src, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, + aio_comp, io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + } + int r = flush_ctx.wait(); + if (r < 0) { + return r; + } + + librados::snap_t snap_id_start = 0; + librados::snap_t snap_id_end; + { + std::shared_lock image_locker{src->image_lock}; + snap_id_end = src->snap_id; + } + + AsioEngine asio_engine(src->md_ctx); + + C_SaferCond cond; + SnapSeqs snap_seqs; + deep_copy::ProgressHandler progress_handler{&prog_ctx}; + auto req = DeepCopyRequest::create( + src, dest, snap_id_start, snap_id_end, 0U, flatten, boost::none, + asio_engine.get_work_queue(), &snap_seqs, &progress_handler, &cond); + req->send(); + r = cond.wait(); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Image::snap_set(I *ictx, + const cls::rbd::SnapshotNamespace &snap_namespace, + const char *snap_name) { + ldout(ictx->cct, 20) << "snap_set " << ictx << " snap = " + << (snap_name ? snap_name : "NULL") << dendl; + + // ignore return value, since we may be set to a non-existent + // snapshot and the user is trying to fix that + ictx->state->refresh_if_required(); + + uint64_t snap_id = CEPH_NOSNAP; + std::string name(snap_name == nullptr ? "" : snap_name); + if (!name.empty()) { + std::shared_lock image_locker{ictx->image_lock}; + snap_id = ictx->get_snap_id(snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + } + + return snap_set(ictx, snap_id); +} + +template +int Image::snap_set(I *ictx, uint64_t snap_id) { + ldout(ictx->cct, 20) << "snap_set " << ictx << " " + << "snap_id=" << snap_id << dendl; + + // ignore return value, since we may be set to a non-existent + // snapshot and the user is trying to fix that + ictx->state->refresh_if_required(); + + C_SaferCond ctx; + ictx->state->snap_set(snap_id, &ctx); + int r = ctx.wait(); + if (r < 0) { + if (r != -ENOENT) { + lderr(ictx->cct) << "failed to " << (snap_id == CEPH_NOSNAP ? 
"un" : "") + << "set snapshot: " << cpp_strerror(r) << dendl; + } + return r; + } + + return 0; +} + +template +int Image::remove(IoCtx& io_ctx, const std::string &image_name, + ProgressContext& prog_ctx) +{ + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "name=" << image_name << dendl; + + // look up the V2 image id based on the image name + std::string image_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name, + &image_id); + if (r == -ENOENT) { + // check if it already exists in trash from an aborted trash remove attempt + std::vector trash_entries; + r = Trash::list(io_ctx, trash_entries, false); + if (r < 0) { + return r; + } + for (auto& entry : trash_entries) { + if (entry.name == image_name && + entry.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + cls::rbd::TrashImageSpec spec; + r = cls_client::trash_get(&io_ctx, entry.id, &spec); + if (r < 0) { + lderr(cct) << "error getting image id " << entry.id + << " info from trash: " << cpp_strerror(r) << dendl; + return r; + } + if (spec.state == cls::rbd::TRASH_IMAGE_STATE_MOVING) { + r = Trash::move(io_ctx, entry.source, entry.name, entry.id, 0); + if (r < 0) { + return r; + } + } + return Trash::remove(io_ctx, entry.id, true, prog_ctx); + } + } + + // fall-through if we failed to locate the image in the V2 directory and + // trash + } else if (r < 0) { + lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl; + return r; + } else { + // attempt to move the image to the trash (and optionally immediately + // delete the image) + ConfigProxy config(cct->_conf); + Config::apply_pool_overrides(io_ctx, &config); + + rbd_trash_image_source_t trash_image_source = + RBD_TRASH_IMAGE_SOURCE_REMOVING; + uint64_t expire_seconds = 0; + if (config.get_val("rbd_move_to_trash_on_remove")) { + // keep the image in the trash upon remove requests + trash_image_source = RBD_TRASH_IMAGE_SOURCE_USER; + expire_seconds = config.get_val( + "rbd_move_to_trash_on_remove_expire_seconds"); + } else { + // attempt to pre-validate the removal before moving to trash and + // removing + r = pre_remove_image(io_ctx, image_id); + if (r == -ECHILD) { + if (config.get_val("rbd_move_parent_to_trash_on_remove")) { + // keep the image in the trash until the last child is removed + trash_image_source = RBD_TRASH_IMAGE_SOURCE_USER_PARENT; + } else { + lderr(cct) << "image has snapshots - not removing" << dendl; + return -ENOTEMPTY; + } + } else if (r < 0 && r != -ENOENT) { + return r; + } + } + + r = Trash::move(io_ctx, trash_image_source, image_name, image_id, + expire_seconds); + if (r >= 0) { + if (trash_image_source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + // proceed with attempting to immediately remove the image + r = Trash::remove(io_ctx, image_id, true, prog_ctx); + + if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK) { + // best-effort try to restore the image if the removal + // failed for possible expected reasons + Trash::restore(io_ctx, {cls::rbd::TRASH_IMAGE_SOURCE_REMOVING}, + image_id, image_name); + } + } + return r; + } else if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + // fall-through if trash isn't supported + } + + AsioEngine asio_engine(io_ctx); + + // might be a V1 image format that cannot be moved to the trash + // and would not have been listed in the V2 directory -- or the OSDs + // are too old and don't support the trash feature + C_SaferCond cond; + auto req = librbd::image::RemoveRequest::create( + io_ctx, image_name, "", false, false, prog_ctx, + asio_engine.get_work_queue(), &cond); 
+ req->send(); + + return cond.wait(); +} + +template +int Image::flatten_children(I *ictx, const char* snap_name, + ProgressContext& pctx) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "children flatten " << ictx->name << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock l{ictx->image_lock}; + snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), + snap_name); + + cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(), + ictx->md_ctx.get_namespace(), + ictx->id, snap_id}; + std::vector child_images; + r = list_children(ictx, parent_spec, &child_images); + if (r < 0) { + return r; + } + + size_t size = child_images.size(); + if (size == 0) { + return 0; + } + + librados::IoCtx child_io_ctx; + int64_t child_pool_id = -1; + size_t i = 0; + for (auto &child_image : child_images){ + std::string pool = child_image.pool_name; + if (child_pool_id == -1 || + child_pool_id != child_image.pool_id || + child_io_ctx.get_namespace() != child_image.pool_namespace) { + r = librbd::util::create_ioctx( + ictx->md_ctx, "child image", child_image.pool_id, + child_image.pool_namespace, &child_io_ctx); + if (r < 0) { + return r; + } + + child_pool_id = child_image.pool_id; + } + + ImageCtx *imctx = new ImageCtx("", child_image.image_id, nullptr, + child_io_ctx, false); + r = imctx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening image: " << cpp_strerror(r) << dendl; + return r; + } + + if ((imctx->features & RBD_FEATURE_DEEP_FLATTEN) == 0 && + !imctx->snaps.empty()) { + lderr(cct) << "snapshot in-use by " << pool << "/" << imctx->name + << dendl; + imctx->state->close(); + return -EBUSY; + } + + librbd::NoOpProgressContext prog_ctx; + r = imctx->operations->flatten(prog_ctx); + if (r < 0) { + lderr(cct) << "error flattening image: " << pool << "/" + << (child_image.pool_namespace.empty() ? 
+ "" : "/" + child_image.pool_namespace) + << child_image.image_name << cpp_strerror(r) << dendl; + imctx->state->close(); + return r; + } + + r = imctx->state->close(); + if (r < 0) { + lderr(cct) << "failed to close image: " << cpp_strerror(r) << dendl; + return r; + } + + pctx.update_progress(++i, size); + ceph_assert(i <= size); + } + + return 0; +} + +template +int Image::encryption_format(I* ictx, encryption_format_t format, + encryption_options_t opts, size_t opts_size, + bool c_api) { + crypto::EncryptionFormat* result_format; + auto r = util::create_encryption_format( + ictx->cct, format, opts, opts_size, c_api, &result_format); + if (r != 0) { + return r; + } + + C_SaferCond cond; + auto req = librbd::crypto::FormatRequest::create( + ictx, std::unique_ptr>(result_format), + &cond); + req->send(); + return cond.wait(); +} + +template +int Image::encryption_load(I* ictx, const encryption_spec_t *specs, + size_t spec_count, bool c_api) { + std::vector>> formats; + + for (size_t i = 0; i < spec_count; ++i) { + crypto::EncryptionFormat* result_format; + auto r = util::create_encryption_format( + ictx->cct, specs[i].format, specs[i].opts, specs[i].opts_size, + c_api, &result_format); + if (r != 0) { + return r; + } + + formats.emplace_back(result_format); + } + + C_SaferCond cond; + auto req = librbd::crypto::LoadRequest::create( + ictx, std::move(formats), &cond); + req->send(); + return cond.wait(); +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Image; diff --git a/src/librbd/api/Image.h b/src/librbd/api/Image.h new file mode 100644 index 000000000..29398d6cd --- /dev/null +++ b/src/librbd/api/Image.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_IMAGE_H +#define LIBRBD_API_IMAGE_H + +#include "include/rbd/librbd.hpp" +#include "include/rados/librados_fwd.hpp" +#include "librbd/Types.h" +#include +#include +#include + +namespace librbd { + +class ImageOptions; +class ProgressContext; + +struct ImageCtx; + +namespace api { + +template +struct Image { + typedef std::map ImageNameToIds; + + static int64_t get_data_pool_id(ImageCtxT *ictx); + + static int get_op_features(ImageCtxT *ictx, uint64_t *op_features); + + static int list_images(librados::IoCtx& io_ctx, + std::vector *images); + static int list_images_v2(librados::IoCtx& io_ctx, + ImageNameToIds *images); + + static int get_parent(ImageCtxT *ictx, + librbd::linked_image_spec_t *parent_image, + librbd::snap_spec_t *parent_snap); + + static int list_children(ImageCtxT *ictx, + std::vector *images); + static int list_children(ImageCtxT *ictx, + const cls::rbd::ParentImageSpec &parent_spec, + std::vector *images); + + static int list_descendants(IoCtx& io_ctx, const std::string &image_id, + const std::optional &max_level, + std::vector *images); + static int list_descendants(ImageCtxT *ictx, + const std::optional &max_level, + std::vector *images); + static int list_descendants(ImageCtxT *ictx, + const cls::rbd::ParentImageSpec &parent_spec, + const std::optional &max_level, + std::vector *images); + + static int deep_copy(ImageCtxT *ictx, librados::IoCtx& dest_md_ctx, + const char *destname, ImageOptions& opts, + ProgressContext &prog_ctx); + static int deep_copy(ImageCtxT *src, ImageCtxT *dest, bool flatten, + ProgressContext &prog_ctx); + + static int snap_set(ImageCtxT *ictx, + const cls::rbd::SnapshotNamespace &snap_namespace, + const char *snap_name); + static int snap_set(ImageCtxT *ictx, uint64_t 
snap_id); + + static int remove(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext& prog_ctx); + + static int flatten_children(ImageCtxT *ictx, const char* snap_name, ProgressContext& pctx); + + static int encryption_format(ImageCtxT *ictx, encryption_format_t format, + encryption_options_t opts, size_t opts_size, + bool c_api); + static int encryption_load(ImageCtxT *ictx, const encryption_spec_t *specs, + size_t spec_count, bool c_api); +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Image; + +#endif // LIBRBD_API_IMAGE_H diff --git a/src/librbd/api/Io.cc b/src/librbd/api/Io.cc new file mode 100644 index 000000000..c1bd38fc0 --- /dev/null +++ b/src/librbd/api/Io.cc @@ -0,0 +1,555 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Io.h" +#include "include/intarith.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "common/EventTrace.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/Types.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Io " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +template +bool is_valid_io(I& image_ctx, io::AioCompletion* aio_comp) { + auto cct = image_ctx.cct; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(cct) << "missing data pool" << dendl; + + aio_comp->fail(-ENODEV); + return false; + } + + return true; +} + +} // anonymous namespace + +template +ssize_t Io::read( + I &image_ctx, uint64_t off, uint64_t len, io::ReadResult &&read_result, + int op_flags) { + auto cct = image_ctx.cct; + + ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_read(image_ctx, aio_comp, off, len, std::move(read_result), op_flags, + false); + return ctx.wait(); +} + +template +ssize_t Io::write( + I &image_ctx, uint64_t off, uint64_t len, bufferlist &&bl, int op_flags) { + auto cct = image_ctx.cct; + ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + image_ctx.image_lock.lock_shared(); + int r = clip_io(util::get_image_ctx(&image_ctx), off, &len, + io::ImageArea::DATA); + image_ctx.image_lock.unlock_shared(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_write(image_ctx, aio_comp, off, len, std::move(bl), op_flags, false); + + r = ctx.wait(); + if (r < 0) { + return r; + } + return len; +} + +template +ssize_t Io::discard( + I &image_ctx, uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes) { + auto cct = image_ctx.cct; + ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + image_ctx.image_lock.lock_shared(); + int r = clip_io(util::get_image_ctx(&image_ctx), off, &len, + io::ImageArea::DATA); + image_ctx.image_lock.unlock_shared(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_discard(image_ctx, aio_comp, off, len, discard_granularity_bytes, false); + + r = ctx.wait(); + if (r < 0) { + 
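+    // Surface the error from the dispatched discard as-is; the clipped
+    // length is only returned on success.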
return r; + } + return len; +} + +template +ssize_t Io::write_same( + I &image_ctx, uint64_t off, uint64_t len, bufferlist &&bl, int op_flags) { + auto cct = image_ctx.cct; + ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", " + << "len = " << len << ", data_len " << bl.length() << dendl; + + image_ctx.image_lock.lock_shared(); + int r = clip_io(util::get_image_ctx(&image_ctx), off, &len, + io::ImageArea::DATA); + image_ctx.image_lock.unlock_shared(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_write_same(image_ctx, aio_comp, off, len, std::move(bl), op_flags, false); + + r = ctx.wait(); + if (r < 0) { + return r; + } + return len; +} + +template +ssize_t Io::write_zeroes(I& image_ctx, uint64_t off, uint64_t len, + int zero_flags, int op_flags) { + auto cct = image_ctx.cct; + ldout(cct, 20) << "ictx=" << &image_ctx << ", off=" << off << ", " + << "len = " << len << dendl; + + image_ctx.image_lock.lock_shared(); + int r = clip_io(util::get_image_ctx(&image_ctx), off, &len, + io::ImageArea::DATA); + image_ctx.image_lock.unlock_shared(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_write_zeroes(image_ctx, aio_comp, off, len, zero_flags, op_flags, false); + + r = ctx.wait(); + if (r < 0) { + return r; + } + return len; +} + +template +ssize_t Io::compare_and_write( + I &image_ctx, uint64_t off, uint64_t len, bufferlist &&cmp_bl, + bufferlist &&bl, uint64_t *mismatch_off, int op_flags) { + auto cct = image_ctx.cct; + ldout(cct, 20) << "compare_and_write ictx=" << &image_ctx << ", off=" + << off << ", " << "len = " << len << dendl; + + image_ctx.image_lock.lock_shared(); + int r = clip_io(util::get_image_ctx(&image_ctx), off, &len, + io::ImageArea::DATA); + image_ctx.image_lock.unlock_shared(); + if (r < 0) { + lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_compare_and_write(image_ctx, aio_comp, off, len, std::move(cmp_bl), + std::move(bl), mismatch_off, op_flags, false); + + r = ctx.wait(); + if (r < 0) { + return r; + } + return len; +} + +template +int Io::flush(I &image_ctx) { + auto cct = image_ctx.cct; + ldout(cct, 20) << "ictx=" << &image_ctx << dendl; + + C_SaferCond ctx; + auto aio_comp = io::AioCompletion::create(&ctx); + aio_flush(image_ctx, aio_comp, false); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + + return 0; +} + +template +void Io::aio_read(I &image_ctx, io::AioCompletion *aio_comp, uint64_t off, + uint64_t len, io::ReadResult &&read_result, int op_flags, + bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: read", &image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_READ); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << ", " << "flags=" << op_flags << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + auto req = io::ImageDispatchSpec::create_read( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, 
len}}, io::ImageArea::DATA, std::move(read_result), + image_ctx.get_data_io_context(), op_flags, 0, trace); + req->send(); +} + +template +void Io::aio_write(I &image_ctx, io::AioCompletion *aio_comp, uint64_t off, + uint64_t len, bufferlist &&bl, int op_flags, + bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: write", &image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_WRITE); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << ", flags=" << op_flags << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + auto req = io::ImageDispatchSpec::create_write( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, std::move(bl), op_flags, trace); + req->send(); +} + +template +void Io::aio_discard(I &image_ctx, io::AioCompletion *aio_comp, uint64_t off, + uint64_t len, uint32_t discard_granularity_bytes, + bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: discard", &image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_DISCARD); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + auto req = io::ImageDispatchSpec::create_discard( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, discard_granularity_bytes, trace); + req->send(); +} + +template +void Io::aio_write_same(I &image_ctx, io::AioCompletion *aio_comp, + uint64_t off, uint64_t len, bufferlist &&bl, + int op_flags, bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: writesame", &image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_WRITESAME); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << ", data_len = " << bl.length() << ", " + << "flags=" << op_flags << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + auto req = io::ImageDispatchSpec::create_write_same( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, std::move(bl), op_flags, trace); + req->send(); +} + +template +void Io::aio_write_zeroes(I& image_ctx, io::AioCompletion *aio_comp, + uint64_t off, uint64_t len, int zero_flags, + int op_flags, bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: write_zeroes", &image_ctx.trace_endpoint); + trace.event("init"); + } + + auto io_type = io::AIO_TYPE_DISCARD; + if ((zero_flags & RBD_WRITE_ZEROES_FLAG_THICK_PROVISION) != 0) { + zero_flags &= ~RBD_WRITE_ZEROES_FLAG_THICK_PROVISION; + io_type = 
io::AIO_TYPE_WRITESAME; + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), io_type); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + // validate the supported flags + if (zero_flags != 0U) { + aio_comp->fail(-EINVAL); + return; + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + if (io_type == io::AIO_TYPE_WRITESAME) { + // write-same needs to be aligned to its buffer but librbd has never forced + // block alignment. Hide that requirement from the user by adding optional + // writes. + const uint64_t data_length = 512; + uint64_t write_same_offset = p2roundup(off, data_length); + uint64_t write_same_offset_end = p2align(off + len, data_length); + uint64_t write_same_length = 0; + if (write_same_offset_end > write_same_offset) { + write_same_length = write_same_offset_end - write_same_offset; + } + + uint64_t prepend_offset = off; + uint64_t prepend_length = write_same_offset - off; + uint64_t append_offset = write_same_offset + write_same_length; + uint64_t append_length = len - prepend_length - write_same_length; + ldout(cct, 20) << "prepend_offset=" << prepend_offset << ", " + << "prepend_length=" << prepend_length << ", " + << "write_same_offset=" << write_same_offset << ", " + << "write_same_length=" << write_same_length << ", " + << "append_offset=" << append_offset << ", " + << "append_length=" << append_length << dendl; + ceph_assert(prepend_length + write_same_length + append_length == len); + + if (write_same_length <= data_length) { + // unaligned or small write-zeroes request -- use single write + bufferlist bl; + bl.append_zero(len); + + aio_comp->aio_type = io::AIO_TYPE_WRITE; + auto req = io::ImageDispatchSpec::create_write( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, std::move(bl), op_flags, trace); + req->send(); + return; + } else if (prepend_length == 0 && append_length == 0) { + // fully aligned -- use a single write-same image request + bufferlist bl; + bl.append_zero(data_length); + + auto req = io::ImageDispatchSpec::create_write_same( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, std::move(bl), op_flags, trace); + req->send(); + return; + } + + // to reach this point, we need at least one prepend/append write along with + // a write-same -- therefore we will need to wrap the provided AioCompletion + auto request_count = 1; + if (prepend_length > 0) { + ++request_count; + } + if (append_length > 0) { + ++request_count; + } + + ceph_assert(request_count > 1); + aio_comp->start_op(); + aio_comp->set_request_count(request_count); + + if (prepend_length > 0) { + bufferlist bl; + bl.append_zero(prepend_length); + + Context* prepend_ctx = new io::C_AioRequest(aio_comp); + auto prepend_aio_comp = io::AioCompletion::create_and_start( + prepend_ctx, &image_ctx, io::AIO_TYPE_WRITE); + auto prepend_req = io::ImageDispatchSpec::create_write( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, prepend_aio_comp, + {{prepend_offset, prepend_length}}, io::ImageArea::DATA, + std::move(bl), op_flags, trace); + prepend_req->send(); + } + + if (append_length > 0) { + bufferlist bl; + bl.append_zero(append_length); + + Context* append_ctx = new io::C_AioRequest(aio_comp); + auto append_aio_comp = io::AioCompletion::create_and_start( + append_ctx, &image_ctx, 
io::AIO_TYPE_WRITE); + auto append_req = io::ImageDispatchSpec::create_write( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, append_aio_comp, + {{append_offset, append_length}}, io::ImageArea::DATA, + std::move(bl), op_flags, trace); + append_req->send(); + } + + bufferlist bl; + bl.append_zero(data_length); + + Context* write_same_ctx = new io::C_AioRequest(aio_comp); + auto write_same_aio_comp = io::AioCompletion::create_and_start( + write_same_ctx, &image_ctx, io::AIO_TYPE_WRITESAME); + auto req = io::ImageDispatchSpec::create_write_same( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, write_same_aio_comp, + {{write_same_offset, write_same_length}}, io::ImageArea::DATA, + std::move(bl), op_flags, trace); + req->send(); + return; + } + + // enable partial discard (zeroing) of objects + uint32_t discard_granularity_bytes = 0; + + auto req = io::ImageDispatchSpec::create_discard( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, discard_granularity_bytes, trace); + req->send(); +} + +template +void Io::aio_compare_and_write(I &image_ctx, io::AioCompletion *aio_comp, + uint64_t off, uint64_t len, + bufferlist &&cmp_bl, + bufferlist &&bl, uint64_t *mismatch_off, + int op_flags, bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: compare_and_write", &image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), + io::AIO_TYPE_COMPARE_AND_WRITE); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << ", off=" << off << ", " + << "len=" << len << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + auto req = io::ImageDispatchSpec::create_compare_and_write( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{off, len}}, io::ImageArea::DATA, std::move(cmp_bl), std::move(bl), + mismatch_off, op_flags, trace); + req->send(); +} + +template +void Io::aio_flush(I &image_ctx, io::AioCompletion *aio_comp, + bool native_async) { + auto cct = image_ctx.cct; + FUNCTRACE(cct); + ZTracer::Trace trace; + if (image_ctx.blkin_trace_all) { + trace.init("io: flush", &image_ctx.trace_endpoint); + trace.event("init"); + } + + aio_comp->init_time(util::get_image_ctx(&image_ctx), io::AIO_TYPE_FLUSH); + ldout(cct, 20) << "ictx=" << &image_ctx << ", " + << "completion=" << aio_comp << dendl; + + if (native_async && image_ctx.event_socket.is_valid()) { + aio_comp->set_event_notify(true); + } + + if (!is_valid_io(image_ctx, aio_comp)) { + return; + } + + auto req = io::ImageDispatchSpec::create_flush( + image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + io::FLUSH_SOURCE_USER, trace); + req->send(); +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Io; diff --git a/src/librbd/api/Io.h b/src/librbd/api/Io.h new file mode 100644 index 000000000..4e2ec5028 --- /dev/null +++ b/src/librbd/api/Io.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_IO_H +#define LIBRBD_API_IO_H + +#include "include/int_types.h" +#include "librbd/io/ReadResult.h" + +namespace librbd { + +struct ImageCtx; +namespace io { struct AioCompletion; } + +namespace api { + +template +struct Io { + static ssize_t read(ImageCtxT &image_ctx, uint64_t off, uint64_t len, + io::ReadResult 
&&read_result, int op_flags); + static ssize_t write(ImageCtxT &image_ctx, uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags); + static ssize_t discard(ImageCtxT &image_ctx, uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes); + static ssize_t write_same(ImageCtxT &image_ctx, uint64_t off, uint64_t len, + bufferlist &&bl, int op_flags); + static ssize_t write_zeroes(ImageCtxT &image_ctx, uint64_t off, uint64_t len, + int zero_flags, int op_flags); + static ssize_t compare_and_write(ImageCtxT &image_ctx, uint64_t off, + uint64_t len, bufferlist &&cmp_bl, + bufferlist &&bl, uint64_t *mismatch_off, + int op_flags); + static int flush(ImageCtxT &image_ctx); + + static void aio_read(ImageCtxT &image_ctx, io::AioCompletion *c, uint64_t off, + uint64_t len, io::ReadResult &&read_result, int op_flags, + bool native_async); + static void aio_write(ImageCtxT &image_ctx, io::AioCompletion *c, + uint64_t off, uint64_t len, bufferlist &&bl, + int op_flags, bool native_async); + static void aio_discard(ImageCtxT &image_ctx, io::AioCompletion *c, + uint64_t off, uint64_t len, + uint32_t discard_granularity_bytes, + bool native_async); + static void aio_write_same(ImageCtxT &image_ctx, io::AioCompletion *c, + uint64_t off, uint64_t len, bufferlist &&bl, + int op_flags, bool native_async); + static void aio_write_zeroes(ImageCtxT &image_ctx, io::AioCompletion *c, + uint64_t off, uint64_t len, int zero_flags, + int op_flags, bool native_async); + static void aio_compare_and_write(ImageCtxT &image_ctx, io::AioCompletion *c, + uint64_t off, uint64_t len, + bufferlist &&cmp_bl, bufferlist &&bl, + uint64_t *mismatch_off, int op_flags, + bool native_async); + static void aio_flush(ImageCtxT &image_ctx, io::AioCompletion *c, + bool native_async); +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Io; + +#endif // LIBRBD_API_IO_H diff --git a/src/librbd/api/Migration.cc b/src/librbd/api/Migration.cc new file mode 100644 index 000000000..957c872ac --- /dev/null +++ b/src/librbd/api/Migration.cc @@ -0,0 +1,2126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Migration.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/AsioEngine.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Group.h" +#include "librbd/api/Image.h" +#include "librbd/api/Snapshot.h" +#include "librbd/api/Trash.h" +#include "librbd/deep_copy/Handler.h" +#include "librbd/deep_copy/ImageCopyRequest.h" +#include "librbd/deep_copy/MetadataCopyRequest.h" +#include "librbd/deep_copy/SnapshotCopyRequest.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/image/AttachChildRequest.h" +#include "librbd/image/AttachParentRequest.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/image/DetachParentRequest.h" +#include "librbd/image/ListWatchersRequest.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/image/Types.h" +#include "librbd/internal.h" +#include "librbd/migration/FormatInterface.h" +#include "librbd/migration/OpenSourceImageRequest.h" +#include "librbd/migration/NativeFormat.h" 
+#include "librbd/mirror/DisableRequest.h" +#include "librbd/mirror/EnableRequest.h" + +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Migration: " << __func__ << ": " + +namespace librbd { + +inline bool operator==(const linked_image_spec_t& rhs, + const linked_image_spec_t& lhs) { + bool result = (rhs.pool_id == lhs.pool_id && + rhs.pool_namespace == lhs.pool_namespace && + rhs.image_id == lhs.image_id); + return result; +} + +namespace api { + +using util::create_rados_callback; + +namespace { + +class MigrationProgressContext : public ProgressContext { +public: + MigrationProgressContext(librados::IoCtx& io_ctx, + const std::string &header_oid, + cls::rbd::MigrationState state, + ProgressContext *prog_ctx) + : m_io_ctx(io_ctx), m_header_oid(header_oid), m_state(state), + m_prog_ctx(prog_ctx), m_cct(reinterpret_cast(io_ctx.cct())), + m_lock(ceph::make_mutex( + util::unique_lock_name("librbd::api::MigrationProgressContext", + this))) { + ceph_assert(m_prog_ctx != nullptr); + } + + ~MigrationProgressContext() { + wait_for_in_flight_updates(); + } + + int update_progress(uint64_t offset, uint64_t total) override { + ldout(m_cct, 20) << "offset=" << offset << ", total=" << total << dendl; + + m_prog_ctx->update_progress(offset, total); + + std::string description = stringify(offset * 100 / total) + "% complete"; + + send_state_description_update(description); + + return 0; + } + +private: + librados::IoCtx& m_io_ctx; + std::string m_header_oid; + cls::rbd::MigrationState m_state; + ProgressContext *m_prog_ctx; + + CephContext* m_cct; + mutable ceph::mutex m_lock; + ceph::condition_variable m_cond; + std::string m_state_description; + bool m_pending_update = false; + int m_in_flight_state_updates = 0; + + void send_state_description_update(const std::string &description) { + std::lock_guard locker{m_lock}; + + if (description == m_state_description) { + return; + } + + m_state_description = description; + + if (m_in_flight_state_updates > 0) { + m_pending_update = true; + return; + } + + set_state_description(); + } + + void set_state_description() { + ldout(m_cct, 20) << "state_description=" << m_state_description << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + + librados::ObjectWriteOperation op; + cls_client::migration_set_state(&op, m_state, m_state_description); + + using klass = MigrationProgressContext; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + + m_in_flight_state_updates++; + } + + void handle_set_state_description(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + std::lock_guard locker{m_lock}; + + m_in_flight_state_updates--; + + if (r < 0) { + lderr(m_cct) << "failed to update migration state: " << cpp_strerror(r) + << dendl; + } else if (m_pending_update) { + set_state_description(); + m_pending_update = false; + } else { + m_cond.notify_all(); + } + } + + void wait_for_in_flight_updates() { + std::unique_lock locker{m_lock}; + + ldout(m_cct, 20) << "m_in_flight_state_updates=" + << m_in_flight_state_updates << dendl; + m_pending_update = false; + m_cond.wait(locker, [this] { return m_in_flight_state_updates <= 0; }); + } +}; + +int trash_search(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, std::string *image_id) { + std::vector entries; + + int r = Trash<>::list(io_ctx, entries, false); + if (r < 0) { + return r; + } + + for (auto &entry 
: entries) { + if (entry.source == source && entry.name == image_name) { + *image_id = entry.id; + return 0; + } + } + + return -ENOENT; +} + +template +int open_images(librados::IoCtx& io_ctx, const std::string &image_name, + I **src_image_ctx, I **dst_image_ctx, + cls::rbd::MigrationSpec* src_migration_spec, + cls::rbd::MigrationSpec* dst_migration_spec, + bool skip_open_dst_image) { + CephContext* cct = reinterpret_cast(io_ctx.cct()); + + *src_image_ctx = nullptr; + *dst_image_ctx = nullptr; + + ldout(cct, 10) << "trying to open image by name " << io_ctx.get_pool_name() + << "/" << image_name << dendl; + auto image_ctx = I::create(image_name, "", nullptr, io_ctx, false); + int r = image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + if (r == -ENOENT) { + // presume user passed the source image so we need to search the trash + ldout(cct, 10) << "Source image is not found. Trying trash" << dendl; + + std::string src_image_id; + r = trash_search(io_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION, image_name, + &src_image_id); + if (r < 0) { + lderr(cct) << "failed to determine image id: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 10) << "source image id from trash: " << src_image_id << dendl; + image_ctx = I::create(image_name, src_image_id, nullptr, io_ctx, false); + r = image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + } + + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + return r; + } + image_ctx = nullptr; + } + + BOOST_SCOPE_EXIT_TPL(&r, &image_ctx, src_image_ctx, dst_image_ctx) { + if (r != 0) { + if (*src_image_ctx != nullptr) { + (*src_image_ctx)->state->close(); + } + if (*dst_image_ctx != nullptr) { + (*dst_image_ctx)->state->close(); + } + if (image_ctx != nullptr) { + image_ctx->state->close(); + } + } + } BOOST_SCOPE_EXIT_END; + + // The opened image is either a source or destination + cls::rbd::MigrationSpec migration_spec; + r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid, + &migration_spec); + if (r < 0) { + lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 10) << "migration spec: " << migration_spec << dendl; + if (migration_spec.header_type == cls::rbd::MIGRATION_HEADER_TYPE_SRC) { + ldout(cct, 10) << "the source image is opened" << dendl; + *src_image_ctx = image_ctx; + *src_migration_spec = migration_spec; + image_ctx = nullptr; + } else if (migration_spec.header_type == + cls::rbd::MIGRATION_HEADER_TYPE_DST) { + ldout(cct, 10) << "the destination image is opened" << dendl; + std::string image_id = image_ctx->id; + image_ctx->state->close(); + image_ctx = I::create(image_name, image_id, nullptr, io_ctx, false); + + if (!skip_open_dst_image) { + ldout(cct, 10) << "re-opening the destination image" << dendl; + r = image_ctx->state->open(0); + if (r < 0) { + image_ctx = nullptr; + lderr(cct) << "failed to re-open destination image: " << cpp_strerror(r) + << dendl; + return r; + } + } + + *dst_image_ctx = image_ctx; + *dst_migration_spec = migration_spec; + image_ctx = nullptr; + } else { + lderr(cct) << "unexpected migration header type: " + << migration_spec.header_type << dendl; + r = -EINVAL; + return r; + } + + // attempt to open the other (paired) image + I** other_image_ctx = nullptr; + std::string other_image_type; + std::string other_image_name; + std::string other_image_id; + cls::rbd::MigrationSpec* other_migration_spec = nullptr; + librados::IoCtx other_io_ctx; + + int flags = 
OPEN_FLAG_IGNORE_MIGRATING; + if (*src_image_ctx == nullptr && + dst_migration_spec->source_spec.empty()) { + r = util::create_ioctx(io_ctx, "source image", migration_spec.pool_id, + migration_spec.pool_namespace, &other_io_ctx); + if (r < 0) { + return r; + } + + other_image_type = "source"; + other_image_ctx = src_image_ctx; + other_migration_spec = src_migration_spec; + other_image_name = migration_spec.image_name; + other_image_id = migration_spec.image_id; + + if (other_image_id.empty()) { + ldout(cct, 20) << "trying to open v1 image by name " + << other_io_ctx.get_pool_name() << "/" + << other_image_name << dendl; + flags |= OPEN_FLAG_OLD_FORMAT; + } else { + ldout(cct, 20) << "trying to open v2 image by id " + << other_io_ctx.get_pool_name() << "/" + << other_image_id << dendl; + } + + *src_image_ctx = I::create(other_image_name, other_image_id, nullptr, + other_io_ctx, false); + } else if (*dst_image_ctx == nullptr) { + r = util::create_ioctx(io_ctx, "destination image", migration_spec.pool_id, + migration_spec.pool_namespace, &other_io_ctx); + if (r < 0) { + return r; + } + + other_image_name = migration_spec.image_name; + if (skip_open_dst_image) { + other_image_id = migration_spec.image_id; + } else { + other_image_type = "destination"; + other_image_ctx = dst_image_ctx; + other_migration_spec = dst_migration_spec; + other_image_id = migration_spec.image_id; + } + + *dst_image_ctx = I::create(other_image_name, other_image_id, nullptr, + other_io_ctx, false); + } + + if (other_image_ctx != nullptr) { + r = (*other_image_ctx)->state->open(flags); + if (r < 0) { + lderr(cct) << "failed to open " << other_image_type << " image " + << other_io_ctx.get_pool_name() + << "/" << (other_image_id.empty() ? + other_image_name : other_image_id) + << ": " << cpp_strerror(r) << dendl; + *other_image_ctx = nullptr; + return r; + } + + r = cls_client::migration_get(&(*other_image_ctx)->md_ctx, + (*other_image_ctx)->header_oid, + other_migration_spec); + if (r < 0) { + lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 20) << other_image_type << " migration spec: " + << *other_migration_spec << dendl; + } + + if (!skip_open_dst_image) { + // legacy clients will only store status in the source images + if (dst_migration_spec->source_spec.empty()) { + dst_migration_spec->state = migration_spec.state; + dst_migration_spec->state_description = + migration_spec.state_description; + } + } + + return 0; +} + +class SteppedProgressContext : public ProgressContext { +public: + SteppedProgressContext(ProgressContext* progress_ctx, size_t total_steps) + : m_progress_ctx(progress_ctx), m_total_steps(total_steps) { + } + + void next_step() { + ceph_assert(m_current_step < m_total_steps); + ++m_current_step; + } + + int update_progress(uint64_t object_number, + uint64_t object_count) override { + return m_progress_ctx->update_progress( + object_number + (object_count * (m_current_step - 1)), + object_count * m_total_steps); + } + +private: + ProgressContext* m_progress_ctx; + size_t m_total_steps; + size_t m_current_step = 1; +}; + +} // anonymous namespace + +template +int Migration::prepare(librados::IoCtx& io_ctx, + const std::string &image_name, + librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name_, + ImageOptions& opts) { + CephContext* cct = reinterpret_cast(io_ctx.cct()); + + std::string dest_image_name = dest_image_name_.empty() ? 
image_name : + dest_image_name_; + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << " -> " + << dest_io_ctx.get_pool_name() << "/" << dest_image_name + << ", opts=" << opts << dendl; + + auto src_image_ctx = I::create(image_name, "", nullptr, io_ctx, false); + int r = src_image_ctx->state->open(0); + if (r < 0) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + return r; + } + BOOST_SCOPE_EXIT_TPL(src_image_ctx) { + src_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + std::list watchers; + int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES; + C_SaferCond on_list_watchers; + auto list_watchers_request = librbd::image::ListWatchersRequest::create( + *src_image_ctx, flags, &watchers, &on_list_watchers); + list_watchers_request->send(); + r = on_list_watchers.wait(); + if (r < 0) { + lderr(cct) << "failed listing watchers:" << cpp_strerror(r) << dendl; + return r; + } + if (!watchers.empty()) { + lderr(cct) << "image has watchers - not migrating" << dendl; + return -EBUSY; + } + + uint64_t format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, format); + } + if (format != 2) { + lderr(cct) << "unsupported destination image format: " << format << dendl; + return -EINVAL; + } + + uint64_t features; + { + std::shared_lock image_locker{src_image_ctx->image_lock}; + features = src_image_ctx->features; + } + opts.get(RBD_IMAGE_OPTION_FEATURES, &features); + if ((features & ~RBD_FEATURES_ALL) != 0) { + lderr(cct) << "librbd does not support requested features" << dendl; + return -ENOSYS; + } + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + + uint64_t order = src_image_ctx->order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + r = image::CreateRequest::validate_order(cct, order); + if (r < 0) { + return r; + } + + uint64_t stripe_unit = src_image_ctx->stripe_unit; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + uint64_t stripe_count = src_image_ctx->stripe_count; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + + uint64_t flatten = 0; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) { + opts.unset(RBD_IMAGE_OPTION_FLATTEN); + } + + ldout(cct, 20) << "updated opts=" << opts << dendl; + + auto dst_image_ctx = I::create( + dest_image_name, util::generate_image_id(dest_io_ctx), nullptr, + dest_io_ctx, false); + src_image_ctx->image_lock.lock_shared(); + cls::rbd::MigrationSpec dst_migration_spec{ + cls::rbd::MIGRATION_HEADER_TYPE_DST, + src_image_ctx->md_ctx.get_id(), src_image_ctx->md_ctx.get_namespace(), + src_image_ctx->name, src_image_ctx->id, "", {}, 0, false, + cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, flatten > 0, + cls::rbd::MIGRATION_STATE_PREPARING, ""}; + src_image_ctx->image_lock.unlock_shared(); + + Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec, + opts, nullptr); + r = migration.prepare(); + + return r; +} + +template +int Migration::prepare_import( + const std::string& source_spec, librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name, ImageOptions& opts) { + if (source_spec.empty() || !dest_io_ctx.is_valid() || + dest_image_name.empty()) { + return -EINVAL; + } + + auto cct = reinterpret_cast(dest_io_ctx.cct()); + ldout(cct, 10) << source_spec << " -> " + << 
dest_io_ctx.get_pool_name() << "/" + << dest_image_name << ", opts=" << opts << dendl; + + I* src_image_ctx = nullptr; + C_SaferCond open_ctx; + auto req = migration::OpenSourceImageRequest<I>::create( + dest_io_ctx, nullptr, CEPH_NOSNAP, + {-1, "", "", "", source_spec, {}, 0, false}, &src_image_ctx, &open_ctx); + req->send(); + + int r = open_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to open source image: " << cpp_strerror(r) << dendl; + return r; + } + + auto asio_engine = src_image_ctx->asio_engine; + BOOST_SCOPE_EXIT_TPL(src_image_ctx) { + src_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + uint64_t image_format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &image_format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, image_format); + } + if (image_format != 2) { + lderr(cct) << "unsupported destination image format: " << image_format + << dendl; + return -EINVAL; + } + + ldout(cct, 20) << "updated opts=" << opts << dendl; + + // use json-spirit to clean-up json formatting + json_spirit::mObject source_spec_object; + json_spirit::mValue json_root; + if (json_spirit::read(source_spec, json_root)) { + try { + source_spec_object = json_root.get_obj(); + } catch (std::runtime_error&) { + lderr(cct) << "failed to clean source spec" << dendl; + return -EINVAL; + } + } + + auto dst_image_ctx = I::create( + dest_image_name, util::generate_image_id(dest_io_ctx), nullptr, + dest_io_ctx, false); + cls::rbd::MigrationSpec dst_migration_spec{ + cls::rbd::MIGRATION_HEADER_TYPE_DST, -1, "", "", "", + json_spirit::write(source_spec_object), {}, + 0, false, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, true, + cls::rbd::MIGRATION_STATE_PREPARING, ""}; + + Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec, + opts, nullptr); + r = migration.prepare_import(); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::execute(librados::IoCtx& io_ctx, + const std::string &image_name, + ProgressContext &prog_ctx) { + CephContext* cct = reinterpret_cast<CephContext*>(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *src_image_ctx; + I *dst_image_ctx; + cls::rbd::MigrationSpec src_migration_spec; + cls::rbd::MigrationSpec dst_migration_spec; + int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx, + &src_migration_spec, &dst_migration_spec, false); + if (r < 0) { + return r; + } + + // ensure the destination loads the migration info + dst_image_ctx->ignore_migrating = false; + r = dst_image_ctx->state->refresh(); + if (r < 0) { + lderr(cct) << "failed to refresh destination image: " << cpp_strerror(r) + << dendl; + return r; + } + + BOOST_SCOPE_EXIT_TPL(src_image_ctx, dst_image_ctx) { + dst_image_ctx->state->close(); + if (src_image_ctx != nullptr) { + src_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + if (dst_migration_spec.state != cls::rbd::MIGRATION_STATE_PREPARED && + dst_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTING) { + lderr(cct) << "current migration state is '" << dst_migration_spec.state + << "' (should be 'prepared')" << dendl; + return -EINVAL; + } + + ldout(cct, 5) << "migrating "; + if (!dst_migration_spec.source_spec.empty()) { + *_dout << dst_migration_spec.source_spec; + } else { + *_dout << src_image_ctx->md_ctx.get_pool_name() << "/" + << src_image_ctx->name; + } + *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/" + << dst_image_ctx->name << dendl; + + ImageOptions opts; + Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec, + opts,
&prog_ctx); + r = migration.execute(); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Migration::abort(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx) { + CephContext* cct = reinterpret_cast(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *src_image_ctx; + I *dst_image_ctx; + cls::rbd::MigrationSpec src_migration_spec; + cls::rbd::MigrationSpec dst_migration_spec; + int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx, + &src_migration_spec, &dst_migration_spec, true); + if (r < 0) { + return r; + } + + ldout(cct, 5) << "canceling incomplete migration "; + if (!dst_migration_spec.source_spec.empty()) { + *_dout << dst_migration_spec.source_spec; + } else { + *_dout << src_image_ctx->md_ctx.get_pool_name() << "/" + << src_image_ctx->name; + } + *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/" + << dst_image_ctx->name << dendl; + + ImageOptions opts; + Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec, + opts, &prog_ctx); + r = migration.abort(); + + if (src_image_ctx != nullptr) { + src_image_ctx->state->close(); + } + + if (r < 0) { + return r; + } + + return 0; +} + +template +int Migration::commit(librados::IoCtx& io_ctx, + const std::string &image_name, + ProgressContext &prog_ctx) { + CephContext* cct = reinterpret_cast(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *src_image_ctx; + I *dst_image_ctx; + cls::rbd::MigrationSpec src_migration_spec; + cls::rbd::MigrationSpec dst_migration_spec; + int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx, + &src_migration_spec, &dst_migration_spec, false); + if (r < 0) { + return r; + } + + if (dst_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTED) { + lderr(cct) << "current migration state is '" << dst_migration_spec.state + << "' (should be 'executed')" << dendl; + dst_image_ctx->state->close(); + if (src_image_ctx != nullptr) { + src_image_ctx->state->close(); + } + return -EINVAL; + } + + // ensure the destination loads the migration info + dst_image_ctx->ignore_migrating = false; + r = dst_image_ctx->state->refresh(); + if (r < 0) { + lderr(cct) << "failed to refresh destination image: " << cpp_strerror(r) + << dendl; + return r; + } + + ldout(cct, 5) << "migrating "; + if (!dst_migration_spec.source_spec.empty()) { + *_dout << dst_migration_spec.source_spec; + } else { + *_dout << src_image_ctx->md_ctx.get_pool_name() << "/" + << src_image_ctx->name; + } + *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/" + << dst_image_ctx->name << dendl; + + ImageOptions opts; + Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec, + opts, &prog_ctx); + r = migration.commit(); + + // image_ctx is closed in commit when removing src image + if (r < 0) { + return r; + } + + return 0; +} + +template +int Migration::status(librados::IoCtx& io_ctx, + const std::string &image_name, + image_migration_status_t *status) { + CephContext* cct = reinterpret_cast(io_ctx.cct()); + + ldout(cct, 10) << io_ctx.get_pool_name() << "/" << image_name << dendl; + + I *src_image_ctx; + I *dst_image_ctx; + cls::rbd::MigrationSpec src_migration_spec; + cls::rbd::MigrationSpec dst_migration_spec; + int r = open_images(io_ctx, image_name, &src_image_ctx, &dst_image_ctx, + &src_migration_spec, &dst_migration_spec, false); + if (r < 0) { + return r; + } + + ldout(cct, 5) << "migrating "; + if 
(!dst_migration_spec.source_spec.empty()) { + *_dout << dst_migration_spec.source_spec; + } else { + *_dout << src_image_ctx->md_ctx.get_pool_name() << "/" + << src_image_ctx->name; + } + *_dout << " -> " << dst_image_ctx->md_ctx.get_pool_name() << "/" + << dst_image_ctx->name << dendl; + + ImageOptions opts; + Migration migration(src_image_ctx, dst_image_ctx, dst_migration_spec, + opts, nullptr); + r = migration.status(status); + + dst_image_ctx->state->close(); + if (src_image_ctx != nullptr) { + src_image_ctx->state->close(); + } + + if (r < 0) { + return r; + } + + return 0; +} + +template +int Migration::get_source_spec(I* image_ctx, std::string* source_spec) { + auto cct = image_ctx->cct; + ldout(cct, 10) << dendl; + + image_ctx->image_lock.lock_shared(); + auto migration_info = image_ctx->migration_info; + image_ctx->image_lock.unlock_shared(); + + if (migration_info.empty()) { + // attempt to directly read the spec in case the state is EXECUTED + cls::rbd::MigrationSpec migration_spec; + int r = cls_client::migration_get(&image_ctx->md_ctx, image_ctx->header_oid, + &migration_spec); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + migration_info = { + migration_spec.pool_id, migration_spec.pool_namespace, + migration_spec.image_name, migration_spec.image_id, + migration_spec.source_spec, {}, 0, false}; + } + + if (!migration_info.source_spec.empty()) { + *source_spec = migration_info.source_spec; + } else { + // legacy migration source + *source_spec = migration::NativeFormat::build_source_spec( + migration_info.pool_id, + migration_info.pool_namespace, + migration_info.image_name, + migration_info.image_id); + } + + return 0; +} + +template +Migration::Migration(ImageCtx* src_image_ctx, + ImageCtx* dst_image_ctx, + const cls::rbd::MigrationSpec& dst_migration_spec, + ImageOptions& opts, ProgressContext *prog_ctx) + : m_cct(dst_image_ctx->cct), + m_src_image_ctx(src_image_ctx), m_dst_image_ctx(dst_image_ctx), + m_dst_io_ctx(dst_image_ctx->md_ctx), m_dst_image_name(dst_image_ctx->name), + m_dst_image_id(dst_image_ctx->id), + m_dst_header_oid(util::header_name(m_dst_image_id)), + m_image_options(opts), m_flatten(dst_migration_spec.flatten), + m_mirroring(dst_migration_spec.mirroring), + m_mirror_image_mode(dst_migration_spec.mirror_image_mode), + m_prog_ctx(prog_ctx), + m_src_migration_spec(cls::rbd::MIGRATION_HEADER_TYPE_SRC, + m_dst_io_ctx.get_id(), m_dst_io_ctx.get_namespace(), + m_dst_image_name, m_dst_image_id, "", {}, 0, + m_mirroring, m_mirror_image_mode, m_flatten, + dst_migration_spec.state, + dst_migration_spec.state_description), + m_dst_migration_spec(dst_migration_spec) { + m_dst_io_ctx.dup(dst_image_ctx->md_ctx); +} + +template +int Migration::prepare() { + ldout(m_cct, 10) << dendl; + + BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx) { + if (m_dst_image_ctx != nullptr) { + m_dst_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + int r = validate_src_snaps(m_src_image_ctx); + if (r < 0) { + return r; + } + + r = disable_mirroring(m_src_image_ctx, &m_mirroring, &m_mirror_image_mode); + if (r < 0) { + return r; + } + + r = unlink_src_image(m_src_image_ctx); + if (r < 0) { + enable_mirroring(m_src_image_ctx, m_mirroring, m_mirror_image_mode); + return r; + } + + r = set_src_migration(m_src_image_ctx); + if (r < 0) { + relink_src_image(m_src_image_ctx); + enable_mirroring(m_src_image_ctx, m_mirroring, m_mirror_image_mode); + return r; + } + + r = 
create_dst_image(&m_dst_image_ctx); + if (r < 0) { + abort(); + return r; + } + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template +int Migration::prepare_import() { + ldout(m_cct, 10) << dendl; + + BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx) { + if (m_dst_image_ctx != nullptr) { + m_dst_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + int r = create_dst_image(&m_dst_image_ctx); + if (r < 0) { + abort(); + return r; + } + + return 0; +} + +template +int Migration::execute() { + ldout(m_cct, 10) << dendl; + + int r = set_state(cls::rbd::MIGRATION_STATE_EXECUTING, ""); + if (r < 0) { + return r; + } + + { + MigrationProgressContext dst_prog_ctx( + m_dst_image_ctx->md_ctx, m_dst_image_ctx->header_oid, + cls::rbd::MIGRATION_STATE_EXECUTING, m_prog_ctx); + std::optional src_prog_ctx; + if (m_src_image_ctx != nullptr) { + src_prog_ctx.emplace(m_src_image_ctx->md_ctx, m_src_image_ctx->header_oid, + cls::rbd::MIGRATION_STATE_EXECUTING, &dst_prog_ctx); + } + + while (true) { + r = m_dst_image_ctx->operations->migrate( + *(src_prog_ctx ? &src_prog_ctx.value() : &dst_prog_ctx)); + if (r == -EROFS) { + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + if (m_dst_image_ctx->exclusive_lock != nullptr && + !m_dst_image_ctx->exclusive_lock->accept_ops()) { + ldout(m_cct, 5) << "lost exclusive lock, retrying remote" << dendl; + continue; + } + } + break; + } + } + + if (r < 0) { + lderr(m_cct) << "migration failed: " << cpp_strerror(r) << dendl; + return r; + } + + r = set_state(cls::rbd::MIGRATION_STATE_EXECUTED, ""); + if (r < 0) { + return r; + } + + m_dst_image_ctx->notify_update(); + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template +int Migration::abort() { + ldout(m_cct, 10) << dendl; + + int r; + if (m_src_image_ctx != nullptr) { + m_src_image_ctx->owner_lock.lock_shared(); + if (m_src_image_ctx->exclusive_lock != nullptr && + !m_src_image_ctx->exclusive_lock->is_lock_owner()) { + C_SaferCond ctx; + m_src_image_ctx->exclusive_lock->acquire_lock(&ctx); + m_src_image_ctx->owner_lock.unlock_shared(); + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "error acquiring exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + } else { + m_src_image_ctx->owner_lock.unlock_shared(); + } + } + + group_info_t group_info; + group_info.pool = -1; + + r = m_dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + if (r < 0) { + ldout(m_cct, 1) << "failed to open destination image: " << cpp_strerror(r) + << dendl; + m_dst_image_ctx = nullptr; + } else { + BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx) { + if (m_dst_image_ctx != nullptr) { + m_dst_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + std::list watchers; + int flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + librbd::image::LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES; + C_SaferCond on_list_watchers; + auto list_watchers_request = librbd::image::ListWatchersRequest::create( + *m_dst_image_ctx, flags, &watchers, &on_list_watchers); + list_watchers_request->send(); + r = on_list_watchers.wait(); + if (r < 0) { + lderr(m_cct) << "failed listing watchers:" << cpp_strerror(r) << dendl; + return r; + } + if (!watchers.empty()) { + lderr(m_cct) << "image has watchers - cannot abort migration" << dendl; + return -EBUSY; + } + + // ensure destination image is now read-only + r = set_state(cls::rbd::MIGRATION_STATE_ABORTING, ""); + if (r < 0) { + return r; + } + + SteppedProgressContext progress_ctx( + m_prog_ctx, (m_src_image_ctx != nullptr ? 
2 : 1)); + if (m_src_image_ctx != nullptr) { + // copy dst HEAD -> src HEAD + revert_data(m_dst_image_ctx, m_src_image_ctx, &progress_ctx); + progress_ctx.next_step(); + + ldout(m_cct, 10) << "relinking children" << dendl; + r = relink_children(m_dst_image_ctx, m_src_image_ctx); + if (r < 0) { + return r; + } + } + + ldout(m_cct, 10) << "removing dst image snapshots" << dendl; + std::vector snaps; + r = Snapshot::list(m_dst_image_ctx, snaps); + if (r < 0) { + lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r) + << dendl; + return r; + } + + for (auto &snap : snaps) { + librbd::NoOpProgressContext prog_ctx; + int r = Snapshot::remove(m_dst_image_ctx, snap.name.c_str(), + RBD_SNAP_REMOVE_UNPROTECT, prog_ctx); + if (r < 0) { + lderr(m_cct) << "failed removing snapshot: " << cpp_strerror(r) + << dendl; + return r; + } + } + + ldout(m_cct, 10) << "removing group" << dendl; + + r = remove_group(m_dst_image_ctx, &group_info); + if (r < 0 && r != -ENOENT) { + return r; + } + + ldout(m_cct, 10) << "removing dst image" << dendl; + + ceph_assert(m_dst_image_ctx->ignore_migrating); + + auto asio_engine = m_dst_image_ctx->asio_engine; + librados::IoCtx dst_io_ctx(m_dst_image_ctx->md_ctx); + + C_SaferCond on_remove; + auto req = librbd::image::RemoveRequest<>::create( + dst_io_ctx, m_dst_image_ctx, false, false, progress_ctx, + asio_engine->get_work_queue(), &on_remove); + req->send(); + r = on_remove.wait(); + + m_dst_image_ctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "failed removing destination image '" + << dst_io_ctx.get_pool_name() << "/" << m_dst_image_name + << " (" << m_dst_image_id << ")': " << cpp_strerror(r) + << dendl; + return r; + } + } + + if (m_src_image_ctx != nullptr) { + r = relink_src_image(m_src_image_ctx); + if (r < 0) { + return r; + } + + r = add_group(m_src_image_ctx, group_info); + if (r < 0) { + return r; + } + + r = remove_migration(m_src_image_ctx); + if (r < 0) { + return r; + } + + r = enable_mirroring(m_src_image_ctx, m_mirroring, m_mirror_image_mode); + if (r < 0) { + return r; + } + } + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template +int Migration::commit() { + ldout(m_cct, 10) << dendl; + + BOOST_SCOPE_EXIT_TPL(&m_dst_image_ctx, &m_src_image_ctx) { + m_dst_image_ctx->state->close(); + if (m_src_image_ctx != nullptr) { + m_src_image_ctx->state->close(); + } + } BOOST_SCOPE_EXIT_END; + + int r = remove_migration(m_dst_image_ctx); + if (r < 0) { + return r; + } + + if (m_src_image_ctx != nullptr) { + r = remove_src_image(&m_src_image_ctx); + if (r < 0) { + return r; + } + } + + r = enable_mirroring(m_dst_image_ctx, m_mirroring, m_mirror_image_mode); + if (r < 0) { + return r; + } + + ldout(m_cct, 10) << "succeeded" << dendl; + + return 0; +} + +template +int Migration::status(image_migration_status_t *status) { + ldout(m_cct, 10) << dendl; + + status->source_pool_id = m_dst_migration_spec.pool_id; + status->source_pool_namespace = m_dst_migration_spec.pool_namespace; + status->source_image_name = m_dst_migration_spec.image_name; + status->source_image_id = m_dst_migration_spec.image_id; + status->dest_pool_id = m_src_migration_spec.pool_id; + status->dest_pool_namespace = m_src_migration_spec.pool_namespace; + status->dest_image_name = m_src_migration_spec.image_name; + status->dest_image_id = m_src_migration_spec.image_id; + + switch (m_src_migration_spec.state) { + case cls::rbd::MIGRATION_STATE_ERROR: + status->state = RBD_IMAGE_MIGRATION_STATE_ERROR; + break; + case cls::rbd::MIGRATION_STATE_PREPARING: + status->state = 
RBD_IMAGE_MIGRATION_STATE_PREPARING; + break; + case cls::rbd::MIGRATION_STATE_PREPARED: + status->state = RBD_IMAGE_MIGRATION_STATE_PREPARED; + break; + case cls::rbd::MIGRATION_STATE_EXECUTING: + status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTING; + break; + case cls::rbd::MIGRATION_STATE_EXECUTED: + status->state = RBD_IMAGE_MIGRATION_STATE_EXECUTED; + break; + default: + status->state = RBD_IMAGE_MIGRATION_STATE_UNKNOWN; + break; + } + + status->state_description = m_src_migration_spec.state_description; + + return 0; +} + +template +int Migration::set_state(I* image_ctx, const std::string& image_description, + cls::rbd::MigrationState state, + const std::string &description) { + int r = cls_client::migration_set_state(&image_ctx->md_ctx, + image_ctx->header_oid, + state, description); + if (r < 0) { + lderr(m_cct) << "failed to set " << image_description << " " + << "migration header: " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template +int Migration::set_state(cls::rbd::MigrationState state, + const std::string &description) { + int r; + if (m_src_image_ctx != nullptr) { + r = set_state(m_src_image_ctx, "source", state, description); + if (r < 0) { + return r; + } + } + + r = set_state(m_dst_image_ctx, "destination", state, description); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Migration::list_src_snaps(I* image_ctx, + std::vector *snaps) { + ldout(m_cct, 10) << dendl; + + int r = Snapshot::list(image_ctx, *snaps); + if (r < 0) { + lderr(m_cct) << "failed listing snapshots: " << cpp_strerror(r) << dendl; + return r; + } + + for (auto &snap : *snaps) { + librbd::snap_namespace_type_t namespace_type; + r = Snapshot::get_namespace_type(image_ctx, snap.id, + &namespace_type); + if (r < 0) { + lderr(m_cct) << "error getting snap namespace type: " << cpp_strerror(r) + << dendl; + return r; + } + + if (namespace_type != RBD_SNAP_NAMESPACE_TYPE_USER) { + if (namespace_type == RBD_SNAP_NAMESPACE_TYPE_TRASH) { + lderr(m_cct) << "image has snapshots with linked clones that must be " + << "deleted or flattened before the image can be migrated" + << dendl; + } else { + lderr(m_cct) << "image has non-user type snapshots " + << "that are not supported by migration" << dendl; + } + return -EBUSY; + } + } + + return 0; +} + +template +int Migration::validate_src_snaps(I* image_ctx) { + ldout(m_cct, 10) << dendl; + + std::vector snaps; + int r = list_src_snaps(image_ctx, &snaps); + if (r < 0) { + return r; + } + + uint64_t dst_features = 0; + r = m_image_options.get(RBD_IMAGE_OPTION_FEATURES, &dst_features); + ceph_assert(r == 0); + + if (!image_ctx->test_features(RBD_FEATURE_LAYERING)) { + return 0; + } + + for (auto &snap : snaps) { + std::shared_lock image_locker{image_ctx->image_lock}; + cls::rbd::ParentImageSpec parent_spec{image_ctx->md_ctx.get_id(), + image_ctx->md_ctx.get_namespace(), + image_ctx->id, snap.id}; + std::vector child_images; + r = api::Image::list_children(image_ctx, parent_spec, + &child_images); + if (r < 0) { + lderr(m_cct) << "failed listing children: " << cpp_strerror(r) + << dendl; + return r; + } + if (!child_images.empty()) { + ldout(m_cct, 1) << image_ctx->name << "@" << snap.name + << " has children" << dendl; + + if ((dst_features & RBD_FEATURE_LAYERING) == 0) { + lderr(m_cct) << "can't migrate to destination without layering feature: " + << "image has children" << dendl; + return -EINVAL; + } + } + } + + return 0; +} + + +template +int Migration::set_src_migration(I* image_ctx) { + ldout(m_cct, 10) << dendl; + + 
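// the migration header we persist below marks the source image as + // migrating; instruct this ImageCtx to ignore that flag first so it + // can continue refreshing while the migration is prepared +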
image_ctx->ignore_migrating = true; + + int r = cls_client::migration_set(&image_ctx->md_ctx, image_ctx->header_oid, + m_src_migration_spec); + if (r < 0) { + lderr(m_cct) << "failed to set source migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + image_ctx->notify_update(); + + return 0; +} + +template +int Migration::remove_migration(I *image_ctx) { + ldout(m_cct, 10) << dendl; + + int r; + + r = cls_client::migration_remove(&image_ctx->md_ctx, image_ctx->header_oid); + if (r == -ENOENT) { + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed removing migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + image_ctx->notify_update(); + + return 0; +} + +template +int Migration::unlink_src_image(I* image_ctx) { + if (image_ctx->old_format) { + return v1_unlink_src_image(image_ctx); + } else { + return v2_unlink_src_image(image_ctx); + } +} + +template +int Migration::v1_unlink_src_image(I* image_ctx) { + ldout(m_cct, 10) << dendl; + + std::shared_lock image_locker{image_ctx->image_lock}; + int r = tmap_rm(image_ctx->md_ctx, image_ctx->name); + if (r < 0) { + lderr(m_cct) << "failed removing " << image_ctx->name << " from tmap: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template +int Migration::v2_unlink_src_image(I* image_ctx) { + ldout(m_cct, 10) << dendl; + + image_ctx->owner_lock.lock_shared(); + if (image_ctx->exclusive_lock != nullptr && + image_ctx->exclusive_lock->is_lock_owner()) { + C_SaferCond ctx; + image_ctx->exclusive_lock->release_lock(&ctx); + image_ctx->owner_lock.unlock_shared(); + int r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "error releasing exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + } else { + image_ctx->owner_lock.unlock_shared(); + } + + int r = Trash::move(image_ctx->md_ctx, RBD_TRASH_IMAGE_SOURCE_MIGRATION, + image_ctx->name, 0); + if (r < 0) { + lderr(m_cct) << "failed moving image to trash: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Migration::relink_src_image(I* image_ctx) { + if (image_ctx->old_format) { + return v1_relink_src_image(image_ctx); + } else { + return v2_relink_src_image(image_ctx); + } +} + +template +int Migration::v1_relink_src_image(I* image_ctx) { + ldout(m_cct, 10) << dendl; + + std::shared_lock image_locker{image_ctx->image_lock}; + int r = tmap_set(image_ctx->md_ctx, image_ctx->name); + if (r < 0) { + lderr(m_cct) << "failed adding " << image_ctx->name << " to tmap: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template +int Migration::v2_relink_src_image(I* image_ctx) { + ldout(m_cct, 10) << dendl; + + std::shared_lock image_locker{image_ctx->image_lock}; + int r = Trash::restore(image_ctx->md_ctx, + {cls::rbd::TRASH_IMAGE_SOURCE_MIGRATION}, + image_ctx->id, image_ctx->name); + if (r < 0) { + lderr(m_cct) << "failed restoring image from trash: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Migration::create_dst_image(I** image_ctx) { + ldout(m_cct, 10) << dendl; + + uint64_t size; + cls::rbd::ParentImageSpec parent_spec; + { + std::shared_lock image_locker{m_src_image_ctx->image_lock}; + size = m_src_image_ctx->size; + + // use oldest snapshot or HEAD for parent spec + if (!m_src_image_ctx->snap_info.empty()) { + parent_spec = m_src_image_ctx->snap_info.begin()->second.parent.spec; + } else { + parent_spec = m_src_image_ctx->parent_md.spec; + } + } + + ConfigProxy config{m_cct->_conf}; + api::Config::apply_pool_overrides(m_dst_io_ctx, &config); + + 
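// an explicit mirror image mode option forces mirroring to be enabled + // on the destination with that mode +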
uint64_t mirror_image_mode; + if (m_image_options.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, + &mirror_image_mode) == 0) { + m_mirroring = true; + m_mirror_image_mode = static_cast( + mirror_image_mode); + m_image_options.unset(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE); + } + + int r; + C_SaferCond on_create; + librados::IoCtx parent_io_ctx; + if (parent_spec.pool_id == -1) { + auto *req = image::CreateRequest::create( + config, m_dst_io_ctx, m_dst_image_name, m_dst_image_id, size, + m_image_options, image::CREATE_FLAG_SKIP_MIRROR_ENABLE, + cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", "", + m_src_image_ctx->op_work_queue, &on_create); + req->send(); + } else { + r = util::create_ioctx(m_src_image_ctx->md_ctx, "parent image", + parent_spec.pool_id, parent_spec.pool_namespace, + &parent_io_ctx); + if (r < 0) { + return r; + } + + auto *req = image::CloneRequest::create( + config, parent_io_ctx, parent_spec.image_id, "", {}, parent_spec.snap_id, + m_dst_io_ctx, m_dst_image_name, m_dst_image_id, m_image_options, + cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", "", + m_src_image_ctx->op_work_queue, &on_create); + req->send(); + } + + r = on_create.wait(); + if (r < 0) { + lderr(m_cct) << "header creation failed: " << cpp_strerror(r) << dendl; + return r; + } + + auto dst_image_ctx = *image_ctx; + dst_image_ctx->id = m_dst_image_id; + *image_ctx = nullptr; // prevent prepare from cleaning up the ImageCtx + + r = dst_image_ctx->state->open(OPEN_FLAG_IGNORE_MIGRATING); + if (r < 0) { + lderr(m_cct) << "failed to open newly created header: " << cpp_strerror(r) + << dendl; + return r; + } + + BOOST_SCOPE_EXIT_TPL(dst_image_ctx) { + dst_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + { + std::shared_lock owner_locker{dst_image_ctx->owner_lock}; + r = dst_image_ctx->operations->prepare_image_update( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true); + if (r < 0) { + lderr(m_cct) << "cannot obtain exclusive lock" << dendl; + return r; + } + if (dst_image_ctx->exclusive_lock != nullptr) { + dst_image_ctx->exclusive_lock->block_requests(0); + } + } + + SnapSeqs snap_seqs; + + C_SaferCond on_snapshot_copy; + auto snapshot_copy_req = librbd::deep_copy::SnapshotCopyRequest::create( + m_src_image_ctx, dst_image_ctx, 0, CEPH_NOSNAP, 0, m_flatten, + m_src_image_ctx->op_work_queue, &snap_seqs, &on_snapshot_copy); + snapshot_copy_req->send(); + r = on_snapshot_copy.wait(); + if (r < 0) { + lderr(m_cct) << "failed to copy snapshots: " << cpp_strerror(r) << dendl; + return r; + } + + if (!m_src_image_ctx->header_oid.empty()) { + C_SaferCond on_metadata_copy; + auto metadata_copy_req = librbd::deep_copy::MetadataCopyRequest::create( + m_src_image_ctx, dst_image_ctx, &on_metadata_copy); + metadata_copy_req->send(); + r = on_metadata_copy.wait(); + if (r < 0) { + lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl; + return r; + } + } + + m_dst_migration_spec.snap_seqs = snap_seqs; + m_dst_migration_spec.overlap = size; + m_dst_migration_spec.mirroring = m_mirroring; + m_dst_migration_spec.mirror_image_mode = m_mirror_image_mode; + m_dst_migration_spec.flatten = m_flatten; + r = cls_client::migration_set(&m_dst_io_ctx, m_dst_header_oid, + m_dst_migration_spec); + if (r < 0) { + lderr(m_cct) << "failed to set migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + if (m_dst_migration_spec.source_spec.empty()) { + r = update_group(m_src_image_ctx, dst_image_ctx); + if (r < 0) { + return r; + } + + r = set_state(m_src_image_ctx, "source", + cls::rbd::MIGRATION_STATE_PREPARED, ""); + if (r 
< 0) { + return r; + } + } + + r = set_state(dst_image_ctx, "destination", + cls::rbd::MIGRATION_STATE_PREPARED, ""); + if (r < 0) { + return r; + } + + if (m_dst_migration_spec.source_spec.empty()) { + r = dst_image_ctx->state->refresh(); + if (r < 0) { + lderr(m_cct) << "failed to refresh destination image: " << cpp_strerror(r) + << dendl; + return r; + } + + r = relink_children(m_src_image_ctx, dst_image_ctx); + if (r < 0) { + return r; + } + } + + return 0; +} + +template +int Migration::remove_group(I *image_ctx, group_info_t *group_info) { + int r = librbd::api::Group::image_get_group(image_ctx, group_info); + if (r < 0) { + lderr(m_cct) << "failed to get image group: " << cpp_strerror(r) << dendl; + return r; + } + + if (group_info->pool == -1) { + return -ENOENT; + } + + ceph_assert(!image_ctx->id.empty()); + + ldout(m_cct, 10) << dendl; + + IoCtx group_ioctx; + r = util::create_ioctx(image_ctx->md_ctx, "group", group_info->pool, {}, + &group_ioctx); + if (r < 0) { + return r; + } + + r = librbd::api::Group::image_remove_by_id(group_ioctx, + group_info->name.c_str(), + image_ctx->md_ctx, + image_ctx->id.c_str()); + if (r < 0) { + lderr(m_cct) << "failed to remove image from group: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Migration::add_group(I *image_ctx, group_info_t &group_info) { + if (group_info.pool == -1) { + return 0; + } + + ldout(m_cct, 10) << dendl; + + IoCtx group_ioctx; + int r = util::create_ioctx(image_ctx->md_ctx, "group", group_info.pool, {}, + &group_ioctx); + if (r < 0) { + return r; + } + + r = librbd::api::Group::image_add(group_ioctx, group_info.name.c_str(), + image_ctx->md_ctx, + image_ctx->name.c_str()); + if (r < 0) { + lderr(m_cct) << "failed to add image to group: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Migration::update_group(I *from_image_ctx, I *to_image_ctx) { + ldout(m_cct, 10) << dendl; + + group_info_t group_info; + + int r = remove_group(from_image_ctx, &group_info); + if (r < 0) { + return r == -ENOENT ? 
0 : r; + } + + r = add_group(to_image_ctx, group_info); + if (r < 0) { + return r; + } + + return 0; +} + +template <typename I> +int Migration<I>::disable_mirroring( + I *image_ctx, bool *was_enabled, + cls::rbd::MirrorImageMode *mirror_image_mode) { + *was_enabled = false; + + cls::rbd::MirrorImage mirror_image; + int r = cls_client::mirror_image_get(&image_ctx->md_ctx, image_ctx->id, + &mirror_image); + if (r == -ENOENT) { + ldout(m_cct, 10) << "mirroring is not enabled for this image" << dendl; + return 0; + } + + if (r < 0) { + lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(r) + << dendl; + return r; + } + + if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + *was_enabled = true; + *mirror_image_mode = mirror_image.mode; + } + + ldout(m_cct, 10) << dendl; + + C_SaferCond ctx; + auto req = mirror::DisableRequest<I>::create(image_ctx, false, true, &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "failed to disable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + m_src_migration_spec.mirroring = true; + + return 0; +} + +template <typename I> +int Migration<I>::enable_mirroring( + I *image_ctx, bool was_enabled, + cls::rbd::MirrorImageMode mirror_image_mode) { + cls::rbd::MirrorMode mirror_mode; + int r = cls_client::mirror_mode_get(&image_ctx->md_ctx, &mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + ldout(m_cct, 10) << "mirroring is not enabled for destination pool" + << dendl; + return 0; + } + if (mirror_mode == cls::rbd::MIRROR_MODE_IMAGE && !was_enabled) { + ldout(m_cct, 10) << "mirroring is not enabled for image" << dendl; + return 0; + } + + ldout(m_cct, 10) << dendl; + + C_SaferCond ctx; + auto req = mirror::EnableRequest<I>::create( + image_ctx, mirror_image_mode, "", false, &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +// When relinking children we should be careful as it may be interrupted +// at any moment for some reason and we may end up in an inconsistent +// state, which we have to be able to fix with "migration abort". Below +// are all possible states during migration (P1 - source parent, P2 - +// destination parent, C - child): +// +// P1 P2 P1 P2 P1 P2 P1 P2 +// ^\ \ ^ \ /^ /^ +// \v v/ v/ v/ +// C C C C +// +// 1 2 3 4 +// +// (1) and (4) are the initial and the final consistent states. (2) +// and (3) are intermediate inconsistent states that have to be fixed +// by relink_children running in "migration abort" mode. For this, it +// scans P2 for all children attached and relinks (fixes) states (3) +// and (4) to state (1). Then it scans P1 for remaining children and +// fixes the states (2). + +template <typename I> +int Migration<I>::relink_children(I *from_image_ctx, I *to_image_ctx) { + ldout(m_cct, 10) << dendl; + + bool migration_abort = (to_image_ctx == m_src_image_ctx); + + std::vector<librbd::snap_info_t> snaps; + int r = list_src_snaps( + migration_abort ? to_image_ctx : from_image_ctx, &snaps); + if (r < 0) { + return r; + } + + for (auto it = snaps.begin(); it != snaps.end(); it++) { + auto &snap = *it; + std::vector<librbd::linked_image_spec_t> src_child_images; + + if (from_image_ctx != m_src_image_ctx) { + ceph_assert(migration_abort); + + // We run list snaps against the src image to get only those snapshots + // that are migrated.
If the "from" image is not the src image + // (abort migration case), we need to remap snap ids. + // Also collect the list of the children currently attached to the + // source, so we can make a proper decision later about relinking. + + std::shared_lock src_image_locker{to_image_ctx->image_lock}; + cls::rbd::ParentImageSpec src_parent_spec{to_image_ctx->md_ctx.get_id(), + to_image_ctx->md_ctx.get_namespace(), + to_image_ctx->id, snap.id}; + r = api::Image<I>::list_children(to_image_ctx, src_parent_spec, + &src_child_images); + if (r < 0) { + lderr(m_cct) << "failed listing children: " << cpp_strerror(r) + << dendl; + return r; + } + + std::shared_lock image_locker{from_image_ctx->image_lock}; + snap.id = from_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(), + snap.name); + if (snap.id == CEPH_NOSNAP) { + ldout(m_cct, 5) << "skipping snapshot " << snap.name << dendl; + continue; + } + } + + std::vector<librbd::linked_image_spec_t> child_images; + { + std::shared_lock image_locker{from_image_ctx->image_lock}; + cls::rbd::ParentImageSpec parent_spec{from_image_ctx->md_ctx.get_id(), + from_image_ctx->md_ctx.get_namespace(), + from_image_ctx->id, snap.id}; + r = api::Image<I>::list_children(from_image_ctx, parent_spec, + &child_images); + if (r < 0) { + lderr(m_cct) << "failed listing children: " << cpp_strerror(r) + << dendl; + return r; + } + } + + for (auto &child_image : child_images) { + r = relink_child(from_image_ctx, to_image_ctx, snap, child_image, + migration_abort, true); + if (r < 0) { + return r; + } + + src_child_images.erase(std::remove(src_child_images.begin(), + src_child_images.end(), child_image), + src_child_images.end()); + } + + for (auto &child_image : src_child_images) { + r = relink_child(from_image_ctx, to_image_ctx, snap, child_image, + migration_abort, false); + if (r < 0) { + return r; + } + } + } + + return 0; +} + +template <typename I> +int Migration<I>::relink_child(I *from_image_ctx, I *to_image_ctx, + const librbd::snap_info_t &from_snap, + const librbd::linked_image_spec_t &child_image, + bool migration_abort, bool reattach_child) { + ldout(m_cct, 10) << from_snap.name << " " << child_image.pool_name << "/" + << child_image.pool_namespace << "/" + << child_image.image_name << " (migration_abort=" + << migration_abort << ", reattach_child=" << reattach_child + << ")" << dendl; + + librados::snap_t to_snap_id; + { + std::shared_lock image_locker{to_image_ctx->image_lock}; + to_snap_id = to_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace(), + from_snap.name); + if (to_snap_id == CEPH_NOSNAP) { + lderr(m_cct) << "no snapshot " << from_snap.name << " on destination image" + << dendl; + return -ENOENT; + } + } + + librados::IoCtx child_io_ctx; + int r = util::create_ioctx(to_image_ctx->md_ctx, + "child image " + child_image.image_name, + child_image.pool_id, child_image.pool_namespace, + &child_io_ctx); + if (r < 0) { + return r; + } + + I *child_image_ctx = I::create("", child_image.image_id, nullptr, + child_io_ctx, false); + r = child_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + if (r < 0) { + lderr(m_cct) << "failed to open child image: " << cpp_strerror(r) << dendl; + return r; + } + BOOST_SCOPE_EXIT_TPL(child_image_ctx) { + child_image_ctx->state->close(); + } BOOST_SCOPE_EXIT_END; + + uint32_t clone_format = 1; + if (child_image_ctx->test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) { + clone_format = 2; + } + + cls::rbd::ParentImageSpec parent_spec; + uint64_t parent_overlap; + { + std::shared_lock image_locker{child_image_ctx->image_lock}; + + // use oldest snapshot or HEAD
for parent spec + if (!child_image_ctx->snap_info.empty()) { + parent_spec = child_image_ctx->snap_info.begin()->second.parent.spec; + parent_overlap = child_image_ctx->snap_info.begin()->second.parent.overlap; + } else { + parent_spec = child_image_ctx->parent_md.spec; + parent_overlap = child_image_ctx->parent_md.overlap; + } + } + + if (migration_abort && + parent_spec.pool_id == to_image_ctx->md_ctx.get_id() && + parent_spec.pool_namespace == to_image_ctx->md_ctx.get_namespace() && + parent_spec.image_id == to_image_ctx->id && + parent_spec.snap_id == to_snap_id) { + ldout(m_cct, 10) << "no need for parent re-attach" << dendl; + } else { + if (parent_spec.pool_id != from_image_ctx->md_ctx.get_id() || + parent_spec.pool_namespace != from_image_ctx->md_ctx.get_namespace() || + parent_spec.image_id != from_image_ctx->id || + parent_spec.snap_id != from_snap.id) { + lderr(m_cct) << "parent is not source image: " << parent_spec.pool_id + << "/" << parent_spec.pool_namespace << "/" + << parent_spec.image_id << "@" << parent_spec.snap_id + << dendl; + return -ESTALE; + } + + parent_spec.pool_id = to_image_ctx->md_ctx.get_id(); + parent_spec.pool_namespace = to_image_ctx->md_ctx.get_namespace(); + parent_spec.image_id = to_image_ctx->id; + parent_spec.snap_id = to_snap_id; + + C_SaferCond on_reattach_parent; + auto reattach_parent_req = image::AttachParentRequest::create( + *child_image_ctx, parent_spec, parent_overlap, true, &on_reattach_parent); + reattach_parent_req->send(); + r = on_reattach_parent.wait(); + if (r < 0) { + lderr(m_cct) << "failed to re-attach parent: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (reattach_child) { + C_SaferCond on_reattach_child; + auto reattach_child_req = image::AttachChildRequest::create( + child_image_ctx, to_image_ctx, to_snap_id, from_image_ctx, from_snap.id, + clone_format, &on_reattach_child); + reattach_child_req->send(); + r = on_reattach_child.wait(); + if (r < 0) { + lderr(m_cct) << "failed to re-attach child: " << cpp_strerror(r) << dendl; + return r; + } + } + + child_image_ctx->notify_update(); + + return 0; +} + +template +int Migration::remove_src_image(I** image_ctx) { + ldout(m_cct, 10) << dendl; + + auto src_image_ctx = *image_ctx; + + std::vector snaps; + int r = list_src_snaps(src_image_ctx, &snaps); + if (r < 0) { + return r; + } + + for (auto it = snaps.rbegin(); it != snaps.rend(); it++) { + auto &snap = *it; + + librbd::NoOpProgressContext prog_ctx; + int r = Snapshot::remove(src_image_ctx, snap.name.c_str(), + RBD_SNAP_REMOVE_UNPROTECT, prog_ctx); + if (r < 0) { + lderr(m_cct) << "failed removing source image snapshot '" << snap.name + << "': " << cpp_strerror(r) << dendl; + return r; + } + } + + ceph_assert(src_image_ctx->ignore_migrating); + + auto asio_engine = src_image_ctx->asio_engine; + auto src_image_id = src_image_ctx->id; + librados::IoCtx src_io_ctx(src_image_ctx->md_ctx); + + C_SaferCond on_remove; + auto req = librbd::image::RemoveRequest::create( + src_io_ctx, src_image_ctx, false, true, *m_prog_ctx, + asio_engine->get_work_queue(), &on_remove); + req->send(); + r = on_remove.wait(); + + *image_ctx = nullptr; + + // For old format image it will return -ENOENT due to expected + // tmap_rm failure at the end. 
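+ // (v1_unlink_src_image() already removed the source from the tmap + // when the image was unlinked at prepare time)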
+ if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed removing source image: " << cpp_strerror(r) + << dendl; + return r; + } + + if (!src_image_id.empty()) { + r = cls_client::trash_remove(&src_io_ctx, src_image_id); + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing image " << src_image_id + << " from rbd_trash object" << dendl; + } + } + + return 0; +} + +template +int Migration::revert_data(I* src_image_ctx, I* dst_image_ctx, + ProgressContext* prog_ctx) { + ldout(m_cct, 10) << dendl; + + cls::rbd::MigrationSpec migration_spec; + int r = cls_client::migration_get(&src_image_ctx->md_ctx, + src_image_ctx->header_oid, + &migration_spec); + + if (r < 0) { + lderr(m_cct) << "failed retrieving migration header: " << cpp_strerror(r) + << dendl; + return r; + } + + if (migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST) { + lderr(m_cct) << "unexpected migration header type: " + << migration_spec.header_type << dendl; + return -EINVAL; + } + + uint64_t src_snap_id_start = 0; + uint64_t src_snap_id_end = CEPH_NOSNAP; + uint64_t dst_snap_id_start = 0; + if (!migration_spec.snap_seqs.empty()) { + src_snap_id_start = migration_spec.snap_seqs.rbegin()->second; + } + + // we only care about the HEAD revision so only add a single mapping to + // represent the most recent state + SnapSeqs snap_seqs; + snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP; + + ldout(m_cct, 20) << "src_snap_id_start=" << src_snap_id_start << ", " + << "src_snap_id_end=" << src_snap_id_end << ", " + << "dst_snap_id_start=" << dst_snap_id_start << ", " + << "snap_seqs=" << snap_seqs << dendl; + + C_SaferCond ctx; + deep_copy::ProgressHandler progress_handler(prog_ctx); + auto request = deep_copy::ImageCopyRequest::create( + src_image_ctx, dst_image_ctx, src_snap_id_start, src_snap_id_end, + dst_snap_id_start, false, {}, snap_seqs, &progress_handler, &ctx); + request->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(m_cct) << "error reverting destination image data blocks back to " + << "source image: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Migration; diff --git a/src/librbd/api/Migration.h b/src/librbd/api/Migration.h new file mode 100644 index 000000000..dd70dcc23 --- /dev/null +++ b/src/librbd/api/Migration.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_MIGRATION_H +#define CEPH_LIBRBD_API_MIGRATION_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" + +#include + +namespace librbd { + +class ImageCtx; + +namespace api { + +template +class Migration { +public: + static int prepare(librados::IoCtx& io_ctx, const std::string &image_name, + librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name, ImageOptions& opts); + static int prepare_import(const std::string& source_spec, + librados::IoCtx& dest_io_ctx, + const std::string &dest_image_name, + ImageOptions& opts); + static int execute(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx); + static int abort(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx); + static int commit(librados::IoCtx& io_ctx, const std::string &image_name, + ProgressContext &prog_ctx); + static int status(librados::IoCtx& io_ctx, const std::string &image_name, + image_migration_status_t *status); + + static int 
get_source_spec(ImageCtxT* image_ctx, std::string* source_spec); + +private: + CephContext* m_cct; + ImageCtx* m_src_image_ctx; + ImageCtx* m_dst_image_ctx; + librados::IoCtx m_dst_io_ctx; + std::string m_dst_image_name; + std::string m_dst_image_id; + std::string m_dst_header_oid; + ImageOptions &m_image_options; + bool m_flatten; + bool m_mirroring; + cls::rbd::MirrorImageMode m_mirror_image_mode; + ProgressContext *m_prog_ctx; + + cls::rbd::MigrationSpec m_src_migration_spec; + cls::rbd::MigrationSpec m_dst_migration_spec; + + Migration(ImageCtx* src_image_ctx, ImageCtx* dst_image_ctx, + const cls::rbd::MigrationSpec& dst_migration_spec, + ImageOptions& opts, ProgressContext *prog_ctx); + + int prepare(); + int prepare_import(); + int execute(); + int abort(); + int commit(); + int status(image_migration_status_t *status); + + int set_state(ImageCtxT* image_ctx, const std::string& image_description, + cls::rbd::MigrationState state, + const std::string &description); + int set_state(cls::rbd::MigrationState state, const std::string &description); + + int list_src_snaps(ImageCtxT* image_ctx, + std::vector *snaps); + int validate_src_snaps(ImageCtxT* image_ctx); + int disable_mirroring(ImageCtxT* image_ctx, bool *was_enabled, + cls::rbd::MirrorImageMode *mirror_image_mode); + int enable_mirroring(ImageCtxT* image_ctx, bool was_enabled, + cls::rbd::MirrorImageMode mirror_image_mode); + int set_src_migration(ImageCtxT* image_ctx); + int unlink_src_image(ImageCtxT* image_ctx); + int relink_src_image(ImageCtxT* image_ctx); + int create_dst_image(ImageCtxT** image_ctx); + int remove_group(ImageCtxT* image_ctx, group_info_t *group_info); + int add_group(ImageCtxT* image_ctx, group_info_t &group_info); + int update_group(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx); + int remove_migration(ImageCtxT* image_ctx); + int relink_children(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx); + int remove_src_image(ImageCtxT** image_ctx); + + int v1_set_src_migration(ImageCtxT* image_ctx); + int v2_set_src_migration(ImageCtxT* image_ctx); + int v1_unlink_src_image(ImageCtxT* image_ctx); + int v2_unlink_src_image(ImageCtxT* image_ctx); + int v1_relink_src_image(ImageCtxT* image_ctx); + int v2_relink_src_image(ImageCtxT* image_ctx); + + int relink_child(ImageCtxT *from_image_ctx, ImageCtxT *to_image_ctx, + const librbd::snap_info_t &src_snap, + const librbd::linked_image_spec_t &child_image, + bool migration_abort, bool reattach_child); + + int revert_data(ImageCtxT* src_image_ctx, ImageCtxT* dst_image_ctx, + ProgressContext *prog_ctx); +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Migration; + +#endif // CEPH_LIBRBD_API_MIGRATION_H diff --git a/src/librbd/api/Mirror.cc b/src/librbd/api/Mirror.cc new file mode 100644 index 000000000..2cfad0d32 --- /dev/null +++ b/src/librbd/api/Mirror.cc @@ -0,0 +1,2104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Mirror.h" +#include "include/rados/librados.hpp" +#include "include/stringify.h" +#include "common/ceph_json.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/api/Image.h" +#include "librbd/api/Namespace.h" +#include "librbd/mirror/DemoteRequest.h" +#include 
"librbd/mirror/DisableRequest.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/mirror/GetInfoRequest.h" +#include "librbd/mirror/GetStatusRequest.h" +#include "librbd/mirror/GetUuidRequest.h" +#include "librbd/mirror/PromoteRequest.h" +#include "librbd/mirror/Types.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/mirror/snapshot/CreatePrimaryRequest.h" +#include "librbd/mirror/snapshot/ImageMeta.h" +#include "librbd/mirror/snapshot/UnlinkPeerRequest.h" +#include "librbd/mirror/snapshot/Utils.h" +#include +#include +#include +#include "json_spirit/json_spirit.h" + +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Mirror: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +int get_config_key(librados::Rados& rados, const std::string& key, + std::string* value) { + std::string cmd = + "{" + "\"prefix\": \"config-key get\", " + "\"key\": \"" + key + "\"" + "}"; + + bufferlist in_bl; + bufferlist out_bl; + + int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -EINVAL) { + return -EOPNOTSUPP; + } else if (r < 0 && r != -ENOENT) { + return r; + } + + *value = out_bl.to_str(); + return 0; +} + +int set_config_key(librados::Rados& rados, const std::string& key, + const std::string& value) { + std::string cmd; + if (value.empty()) { + cmd = "{" + "\"prefix\": \"config-key rm\", " + "\"key\": \"" + key + "\"" + "}"; + } else { + cmd = "{" + "\"prefix\": \"config-key set\", " + "\"key\": \"" + key + "\", " + "\"val\": \"" + value + "\"" + "}"; + } + bufferlist in_bl; + bufferlist out_bl; + + int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -EINVAL) { + return -EOPNOTSUPP; + } else if (r < 0) { + return r; + } + + return 0; +} + +std::string get_peer_config_key_name(int64_t pool_id, + const std::string& peer_uuid) { + return RBD_MIRROR_PEER_CONFIG_KEY_PREFIX + stringify(pool_id) + "/" + + peer_uuid; +} + +int remove_peer_config_key(librados::IoCtx& io_ctx, + const std::string& peer_uuid) { + int64_t pool_id = io_ctx.get_id(); + auto key = get_peer_config_key_name(pool_id, peer_uuid); + + librados::Rados rados(io_ctx); + int r = set_config_key(rados, key, ""); + if (r < 0 && r != -ENOENT && r != -EPERM) { + return r; + } + return 0; +} + +std::string get_mon_host(CephContext* cct) { + std::string mon_host; + if (auto mon_addrs = cct->get_mon_addrs(); + mon_addrs != nullptr && !mon_addrs->empty()) { + CachedStackStringStream css; + for (auto it = mon_addrs->begin(); it != mon_addrs->end(); ++it) { + if (it != mon_addrs->begin()) { + *css << ","; + } + *css << *it; + } + mon_host = css->str(); + } else { + ldout(cct, 20) << "falling back to mon_host in conf" << dendl; + mon_host = cct->_conf.get_val("mon_host"); + } + ldout(cct, 20) << "mon_host=" << mon_host << dendl; + return mon_host; +} + +int create_bootstrap_user(CephContext* cct, librados::Rados& rados, + std::string* peer_client_id, std::string* cephx_key) { + ldout(cct, 20) << dendl; + + // retrieve peer CephX user from config-key + int r = get_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY, + peer_client_id); + if (r == -EACCES) { + ldout(cct, 5) << "insufficient permissions to get peer-client-id " + << "config-key" << dendl; + return r; + } else if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve peer client id key: " + << cpp_strerror(r) << dendl; + return r; + } else if (r == -ENOENT || peer_client_id->empty()) { + ldout(cct, 20) << "creating new peer-client-id config-key" 
<< dendl; + + *peer_client_id = "rbd-mirror-peer"; + r = set_config_key(rados, RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY, + *peer_client_id); + if (r == -EACCES) { + ldout(cct, 5) << "insufficient permissions to update peer-client-id " + << "config-key" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to update peer client id key: " + << cpp_strerror(r) << dendl; + return r; + } + } + ldout(cct, 20) << "peer_client_id=" << *peer_client_id << dendl; + + // create peer client user + std::string cmd = + R"({)" \ + R"( "prefix": "auth get-or-create",)" \ + R"( "entity": "client.)" + *peer_client_id + R"(",)" \ + R"( "caps": [)" \ + R"( "mon", "profile rbd-mirror-peer",)" \ + R"( "osd", "profile rbd"],)" \ + R"( "format": "json")" \ + R"(})"; + + bufferlist in_bl; + bufferlist out_bl; + + r = rados.mon_command(cmd, in_bl, &out_bl, nullptr); + if (r == -EINVAL) { + ldout(cct, 5) << "caps mismatch for existing user" << dendl; + return -EEXIST; + } else if (r == -EACCES) { + ldout(cct, 5) << "insufficient permissions to create user" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to create or update RBD mirroring bootstrap user: " + << cpp_strerror(r) << dendl; + return r; + } + + // extract key from response + bool json_valid = false; + json_spirit::mValue json_root; + if(json_spirit::read(out_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_array()[0].get_obj(); + *cephx_key = json_obj["key"].get_str(); + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(cct) << "invalid auth keyring JSON received" << dendl; + return -EBADMSG; + } + + return 0; +} + +int create_bootstrap_peer(CephContext* cct, librados::IoCtx& io_ctx, + mirror_peer_direction_t direction, + const std::string& site_name, const std::string& fsid, + const std::string& client_id, const std::string& key, + const std::string& mon_host, + const std::string& cluster1, + const std::string& cluster2) { + ldout(cct, 20) << dendl; + + std::string peer_uuid; + std::vector peers; + int r = Mirror<>::peer_site_list(io_ctx, &peers); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror peers: " << cpp_strerror(r) << dendl; + return r; + } + + if (peers.empty()) { + r = Mirror<>::peer_site_add(io_ctx, &peer_uuid, direction, site_name, + "client." 
+ client_id); + if (r < 0) { + lderr(cct) << "failed to add " << cluster1 << " peer to " + << cluster2 << " " << "cluster: " << cpp_strerror(r) << dendl; + return r; + } + } else if (peers[0].site_name != site_name && + peers[0].site_name != fsid) { + // only support a single peer + lderr(cct) << "multiple peers are not currently supported" << dendl; + return -EINVAL; + } else { + peer_uuid = peers[0].uuid; + + if (peers[0].site_name != site_name) { + r = Mirror<>::peer_site_set_name(io_ctx, peer_uuid, site_name); + if (r < 0) { + // non-fatal attempt to update site name + lderr(cct) << "failed to update peer site name" << dendl; + } + } + } + + Mirror<>::Attributes attributes { + {"mon_host", mon_host}, + {"key", key}}; + r = Mirror<>::peer_site_set_attributes(io_ctx, peer_uuid, attributes); + if (r < 0) { + lderr(cct) << "failed to update " << cluster1 << " cluster connection " + << "attributes in " << cluster2 << " cluster: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +int list_mirror_images(librados::IoCtx& io_ctx, + std::set& mirror_image_ids) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + + std::string last_read = ""; + int max_read = 1024; + int r; + do { + std::map mirror_images; + r = cls_client::mirror_image_list(&io_ctx, last_read, max_read, + &mirror_images); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing mirrored image directory: " + << cpp_strerror(r) << dendl; + return r; + } + for (auto it = mirror_images.begin(); it != mirror_images.end(); ++it) { + mirror_image_ids.insert(it->first); + } + if (!mirror_images.empty()) { + last_read = mirror_images.rbegin()->first; + } + r = mirror_images.size(); + } while (r == max_read); + + return 0; +} + +template +const char *pool_or_namespace(I *ictx) { + if (!ictx->md_ctx.get_namespace().empty()) { + return "namespace"; + } else { + return "pool"; + } +} + +struct C_ImageGetInfo : public Context { + mirror_image_info_t *mirror_image_info; + mirror_image_mode_t *mirror_image_mode; + Context *on_finish; + + cls::rbd::MirrorImage mirror_image; + mirror::PromotionState promotion_state = mirror::PROMOTION_STATE_PRIMARY; + std::string primary_mirror_uuid; + + C_ImageGetInfo(mirror_image_info_t *mirror_image_info, + mirror_image_mode_t *mirror_image_mode, Context *on_finish) + : mirror_image_info(mirror_image_info), + mirror_image_mode(mirror_image_mode), on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0 && r != -ENOENT) { + on_finish->complete(r); + return; + } + + if (mirror_image_info != nullptr) { + mirror_image_info->global_id = mirror_image.global_image_id; + mirror_image_info->state = static_cast( + mirror_image.state); + mirror_image_info->primary = ( + promotion_state == mirror::PROMOTION_STATE_PRIMARY); + } + + if (mirror_image_mode != nullptr) { + *mirror_image_mode = + static_cast(mirror_image.mode); + } + + on_finish->complete(0); + } +}; + +struct C_ImageGetGlobalStatus : public C_ImageGetInfo { + std::string image_name; + mirror_image_global_status_t *mirror_image_global_status; + + cls::rbd::MirrorImageStatus mirror_image_status_internal; + + C_ImageGetGlobalStatus( + const std::string &image_name, + mirror_image_global_status_t *mirror_image_global_status, + Context *on_finish) + : C_ImageGetInfo(&mirror_image_global_status->info, nullptr, on_finish), + image_name(image_name), + mirror_image_global_status(mirror_image_global_status) { + } + + void finish(int r) override { + if (r < 0 && r != -ENOENT) { + on_finish->complete(r); + return; + } + + 
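+    // Note: this copies the internal cls::rbd::MirrorImageStatus into the
+    // public librbd per-site status vector field by field;
+    // C_ImageGetInfo::finish(0) below then fills in the embedded
+    // mirror_image_info_t.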
mirror_image_global_status->name = image_name; + mirror_image_global_status->site_statuses.clear(); + mirror_image_global_status->site_statuses.reserve( + mirror_image_status_internal.mirror_image_site_statuses.size()); + for (auto& site_status : + mirror_image_status_internal.mirror_image_site_statuses) { + mirror_image_global_status->site_statuses.push_back({ + site_status.mirror_uuid, + static_cast(site_status.state), + site_status.description, site_status.last_update.sec(), + site_status.up}); + } + C_ImageGetInfo::finish(0); + } +}; + +template +struct C_ImageSnapshotCreate : public Context { + I *ictx; + uint64_t snap_create_flags; + uint64_t *snap_id; + Context *on_finish; + + cls::rbd::MirrorImage mirror_image; + mirror::PromotionState promotion_state; + std::string primary_mirror_uuid; + + C_ImageSnapshotCreate(I *ictx, uint64_t snap_create_flags, uint64_t *snap_id, + Context *on_finish) + : ictx(ictx), snap_create_flags(snap_create_flags), snap_id(snap_id), + on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0 && r != -ENOENT) { + on_finish->complete(r); + return; + } + + if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT || + mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(ictx->cct) << "snapshot based mirroring is not enabled" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto req = mirror::snapshot::CreatePrimaryRequest::create( + ictx, mirror_image.global_image_id, CEPH_NOSNAP, snap_create_flags, 0U, + snap_id, on_finish); + req->send(); + } +}; + +} // anonymous namespace + +template +int Mirror::image_enable(I *ictx, mirror_image_mode_t mode, + bool relax_same_pool_parent_check) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << " mode=" << mode + << " relax_same_pool_parent_check=" + << relax_same_pool_parent_check << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::MirrorMode mirror_mode; + r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode); + if (r < 0) { + lderr(cct) << "cannot enable mirroring: failed to retrieve mirror mode: " + << cpp_strerror(r) << dendl; + return r; + } + + if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + lderr(cct) << "cannot enable mirroring: mirroring is not enabled on a " + << pool_or_namespace(ictx) << dendl; + return -EINVAL; + } + + if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + lderr(cct) << "cannot enable mirroring: " << pool_or_namespace(ictx) + << " is not in image mirror mode" << dendl; + return -EINVAL; + } + + // is mirroring not enabled for the parent? 
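+  // Note: a clone may only be mirrored when its parent is mirrored as well;
+  // otherwise a remote site could not resolve the clone's parent chain.
+  // The check may be relaxed for same-pool parents (see
+  // relax_same_pool_parent_check above, used when pool-mode setup enables
+  // mirroring on every journaled image).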
+ { + std::shared_lock image_locker{ictx->image_lock}; + ImageCtx *parent = ictx->parent; + if (parent) { + if (parent->md_ctx.get_id() != ictx->md_ctx.get_id() || + !relax_same_pool_parent_check) { + cls::rbd::MirrorImage mirror_image_internal; + r = cls_client::mirror_image_get(&(parent->md_ctx), parent->id, + &mirror_image_internal); + if (r == -ENOENT) { + lderr(cct) << "mirroring is not enabled for the parent" << dendl; + return -EINVAL; + } + } + } + } + + if (mode == RBD_MIRROR_IMAGE_MODE_JOURNAL && + !ictx->test_features(RBD_FEATURE_JOURNALING)) { + uint64_t features = RBD_FEATURE_JOURNALING; + if (!ictx->test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) { + features |= RBD_FEATURE_EXCLUSIVE_LOCK; + } + r = ictx->operations->update_features(features, true); + if (r < 0) { + lderr(cct) << "cannot enable journaling: " << cpp_strerror(r) << dendl; + return r; + } + } + + C_SaferCond ctx; + auto req = mirror::EnableRequest::create( + ictx, static_cast(mode), "", false, &ctx); + req->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "cannot enable mirroring: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +template +int Mirror::image_disable(I *ictx, bool force) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::MirrorMode mirror_mode; + r = cls_client::mirror_mode_get(&ictx->md_ctx, &mirror_mode); + if (r < 0) { + lderr(cct) << "cannot disable mirroring: failed to retrieve pool " + "mirroring mode: " << cpp_strerror(r) << dendl; + return r; + } + + if (mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + lderr(cct) << "cannot disable mirroring in the current pool mirroring " + "mode" << dendl; + return -EINVAL; + } + + // is mirroring enabled for the image? 
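+  // Note: the image state is first flipped to DISABLING so rbd-mirror
+  // daemons stop replaying; the BOOST_SCOPE_EXIT_ALL block declared below
+  // restores MIRROR_IMAGE_STATE_ENABLED if a later step fails and sets
+  // rollback to true.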
+ cls::rbd::MirrorImage mirror_image_internal; + r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, + &mirror_image_internal); + if (r == -ENOENT) { + // mirroring is not enabled for this image + ldout(cct, 20) << "ignoring disable command: mirroring is not enabled for " + << "this image" << dendl; + return 0; + } else if (r == -EOPNOTSUPP) { + ldout(cct, 5) << "mirroring not supported by OSD" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to retrieve mirror image metadata: " + << cpp_strerror(r) << dendl; + return r; + } + + mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING; + r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id, + mirror_image_internal); + if (r < 0) { + lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl; + return r; + } + + bool rollback = false; + BOOST_SCOPE_EXIT_ALL(ictx, &mirror_image_internal, &rollback) { + if (rollback) { + // restore the mask bit for treating the non-primary feature as read-only + ictx->image_lock.lock(); + ictx->read_only_mask |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + ictx->image_lock.unlock(); + + ictx->state->handle_update_notification(); + + // attempt to restore the image state + CephContext *cct = ictx->cct; + mirror_image_internal.state = cls::rbd::MIRROR_IMAGE_STATE_ENABLED; + int r = cls_client::mirror_image_set(&ictx->md_ctx, ictx->id, + mirror_image_internal); + if (r < 0) { + lderr(cct) << "failed to re-enable image mirroring: " + << cpp_strerror(r) << dendl; + } + } + }; + + std::unique_lock image_locker{ictx->image_lock}; + std::map snap_info = ictx->snap_info; + for (auto &info : snap_info) { + cls::rbd::ParentImageSpec parent_spec{ictx->md_ctx.get_id(), + ictx->md_ctx.get_namespace(), + ictx->id, info.first}; + std::vector child_images; + r = Image::list_children(ictx, parent_spec, &child_images); + if (r < 0) { + rollback = true; + return r; + } + + if (child_images.empty()) { + continue; + } + + librados::IoCtx child_io_ctx; + int64_t child_pool_id = -1; + for (auto &child_image : child_images){ + std::string pool = child_image.pool_name; + if (child_pool_id == -1 || + child_pool_id != child_image.pool_id || + child_io_ctx.get_namespace() != child_image.pool_namespace) { + r = util::create_ioctx(ictx->md_ctx, "child image", + child_image.pool_id, + child_image.pool_namespace, + &child_io_ctx); + if (r < 0) { + rollback = true; + return r; + } + + child_pool_id = child_image.pool_id; + } + + cls::rbd::MirrorImage child_mirror_image_internal; + r = cls_client::mirror_image_get(&child_io_ctx, child_image.image_id, + &child_mirror_image_internal); + if (r != -ENOENT) { + rollback = true; + lderr(cct) << "mirroring is enabled on one or more children " + << dendl; + return -EBUSY; + } + } + } + image_locker.unlock(); + + if (mirror_image_internal.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + // don't let the non-primary feature bit prevent image updates + ictx->image_lock.lock(); + ictx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + ictx->image_lock.unlock(); + + r = ictx->state->refresh(); + if (r < 0) { + rollback = true; + return r; + } + + // remove any snapshot-based mirroring image-meta from image + std::string mirror_uuid; + r = uuid_get(ictx->md_ctx, &mirror_uuid); + if (r < 0) { + rollback = true; + return r; + } + + r = ictx->operations->metadata_remove( + mirror::snapshot::util::get_image_meta_key(mirror_uuid)); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "cannot remove snapshot image-meta key: " << cpp_strerror(r) + << dendl; + rollback = 
true; + return r; + } + } + + C_SaferCond ctx; + auto req = mirror::DisableRequest::create(ictx, force, true, + &ctx); + req->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "cannot disable mirroring: " << cpp_strerror(r) << dendl; + rollback = true; + return r; + } + + if (mirror_image_internal.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + r = ictx->operations->update_features(RBD_FEATURE_JOURNALING, false); + if (r < 0) { + lderr(cct) << "cannot disable journaling: " << cpp_strerror(r) << dendl; + // not fatal + } + } + + return 0; +} + +template +int Mirror::image_promote(I *ictx, bool force) { + CephContext *cct = ictx->cct; + + C_SaferCond ctx; + Mirror::image_promote(ictx, force, &ctx); + int r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to promote image" << dendl; + return r; + } + + return 0; +} + +template +void Mirror::image_promote(I *ictx, bool force, Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << ", " + << "force=" << force << dendl; + + // don't let the non-primary feature bit prevent image updates + ictx->image_lock.lock(); + ictx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + ictx->image_lock.unlock(); + + auto on_promote = new LambdaContext([ictx, on_finish](int r) { + ictx->image_lock.lock(); + ictx->read_only_mask |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + ictx->image_lock.unlock(); + + ictx->state->handle_update_notification(); + on_finish->complete(r); + }); + + auto on_refresh = new LambdaContext([ictx, force, on_promote](int r) { + if (r < 0) { + lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl; + on_promote->complete(r); + return; + } + + auto req = mirror::PromoteRequest<>::create(*ictx, force, on_promote); + req->send(); + }); + ictx->state->refresh(on_refresh); +} + +template +int Mirror::image_demote(I *ictx) { + CephContext *cct = ictx->cct; + + C_SaferCond ctx; + Mirror::image_demote(ictx, &ctx); + int r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to demote image" << dendl; + return r; + } + + return 0; +} + +template +void Mirror::image_demote(I *ictx, Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + auto on_cleanup = new LambdaContext([ictx, on_finish](int r) { + ictx->image_lock.lock(); + ictx->read_only_mask |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + ictx->image_lock.unlock(); + + ictx->state->handle_update_notification(); + + on_finish->complete(r); + }); + auto on_refresh = new LambdaContext([ictx, on_cleanup](int r) { + if (r < 0) { + lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl; + on_cleanup->complete(r); + return; + } + + auto req = mirror::DemoteRequest<>::create(*ictx, on_cleanup); + req->send(); + }); + + // ensure we can create a snapshot after setting the non-primary + // feature bit + ictx->image_lock.lock(); + ictx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + ictx->image_lock.unlock(); + + ictx->state->refresh(on_refresh); +} + +template +int Mirror::image_resync(I *ictx) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::MirrorImage mirror_image; + mirror::PromotionState promotion_state; + std::string primary_mirror_uuid; + C_SaferCond get_info_ctx; + auto req = mirror::GetInfoRequest::create(*ictx, &mirror_image, + &promotion_state, + &primary_mirror_uuid, + &get_info_ctx); + req->send(); + + r = get_info_ctx.wait(); + if (r < 0) { + return 
r; + } + + if (promotion_state == mirror::PROMOTION_STATE_PRIMARY) { + lderr(cct) << "image is primary, cannot resync to itself" << dendl; + return -EINVAL; + } + + if (mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + // flag the journal indicating that we want to rebuild the local image + r = Journal::request_resync(ictx); + if (r < 0) { + lderr(cct) << "failed to request resync: " << cpp_strerror(r) << dendl; + return r; + } + } else if (mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + std::string mirror_uuid; + r = uuid_get(ictx->md_ctx, &mirror_uuid); + if (r < 0) { + return r; + } + + mirror::snapshot::ImageMeta image_meta(ictx, mirror_uuid); + + C_SaferCond load_meta_ctx; + image_meta.load(&load_meta_ctx); + r = load_meta_ctx.wait(); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to load mirror image-meta: " << cpp_strerror(r) + << dendl; + return r; + } + + image_meta.resync_requested = true; + + C_SaferCond save_meta_ctx; + image_meta.save(&save_meta_ctx); + r = save_meta_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to request resync: " << cpp_strerror(r) << dendl; + return r; + } + } else { + lderr(cct) << "unknown mirror mode" << dendl; + return -EINVAL; + } + + return 0; +} + +template +void Mirror::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info, + Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + auto on_refresh = new LambdaContext( + [ictx, mirror_image_info, on_finish](int r) { + if (r < 0) { + lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + auto ctx = new C_ImageGetInfo(mirror_image_info, nullptr, on_finish); + auto req = mirror::GetInfoRequest::create(*ictx, &ctx->mirror_image, + &ctx->promotion_state, + &ctx->primary_mirror_uuid, + ctx); + req->send(); + }); + + if (ictx->state->is_refresh_required()) { + ictx->state->refresh(on_refresh); + } else { + on_refresh->complete(0); + } +} + +template +int Mirror::image_get_info(I *ictx, mirror_image_info_t *mirror_image_info) { + C_SaferCond ctx; + image_get_info(ictx, mirror_image_info, &ctx); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + return 0; +} + +template +void Mirror::image_get_info(librados::IoCtx& io_ctx, + asio::ContextWQ *op_work_queue, + const std::string &image_id, + mirror_image_info_t *mirror_image_info, + Context *on_finish) { + auto cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "pool_id=" << io_ctx.get_id() << ", image_id=" << image_id + << dendl; + + auto ctx = new C_ImageGetInfo(mirror_image_info, nullptr, on_finish); + auto req = mirror::GetInfoRequest::create(io_ctx, op_work_queue, image_id, + &ctx->mirror_image, + &ctx->promotion_state, + &ctx->primary_mirror_uuid, ctx); + req->send(); +} + +template +int Mirror::image_get_info(librados::IoCtx& io_ctx, + asio::ContextWQ *op_work_queue, + const std::string &image_id, + mirror_image_info_t *mirror_image_info) { + C_SaferCond ctx; + image_get_info(io_ctx, op_work_queue, image_id, mirror_image_info, &ctx); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + return 0; +} + +template +void Mirror::image_get_mode(I *ictx, mirror_image_mode_t *mode, + Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + auto ctx = new C_ImageGetInfo(nullptr, mode, on_finish); + auto req = mirror::GetInfoRequest::create(*ictx, &ctx->mirror_image, + &ctx->promotion_state, + &ctx->primary_mirror_uuid, ctx); + req->send(); +} + 
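+// Illustrative usage sketch (hypothetical caller, not part of this file):
+// the synchronous wrappers below all follow the same pattern of blocking on
+// a C_SaferCond handed to the asynchronous variant, e.g.:
+//
+//   librbd::mirror_image_mode_t mode;
+//   C_SaferCond cond;
+//   librbd::api::Mirror<>::image_get_mode(ictx, &mode, &cond);
+//   int r = cond.wait();  // blocks until the GetInfoRequest completes
+//   if (r == 0 && mode == RBD_MIRROR_IMAGE_MODE_SNAPSHOT) {
+//     // snapshot-based mirroring is enabled for this image
+//   }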
+template +int Mirror::image_get_mode(I *ictx, mirror_image_mode_t *mode) { + C_SaferCond ctx; + image_get_mode(ictx, mode, &ctx); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + return 0; +} + +template +void Mirror::image_get_global_status(I *ictx, + mirror_image_global_status_t *status, + Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + auto ctx = new C_ImageGetGlobalStatus(ictx->name, status, on_finish); + auto req = mirror::GetStatusRequest::create( + *ictx, &ctx->mirror_image_status_internal, &ctx->mirror_image, + &ctx->promotion_state, ctx); + req->send(); +} + +template +int Mirror::image_get_global_status(I *ictx, + mirror_image_global_status_t *status) { + C_SaferCond ctx; + image_get_global_status(ictx, status, &ctx); + + int r = ctx.wait(); + if (r < 0) { + return r; + } + return 0; +} + +template +int Mirror::image_get_instance_id(I *ictx, std::string *instance_id) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + cls::rbd::MirrorImage mirror_image; + int r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, &mirror_image); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + return r; + } else if (mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "mirroring is not currently enabled" << dendl; + return -EINVAL; + } + + entity_inst_t instance; + r = cls_client::mirror_image_instance_get(&ictx->md_ctx, + mirror_image.global_image_id, + &instance); + if (r < 0) { + if (r != -ENOENT && r != -ESTALE) { + lderr(cct) << "failed to get mirror image instance: " << cpp_strerror(r) + << dendl; + } + return r; + } + + *instance_id = stringify(instance.name.num()); + return 0; +} + +template +int Mirror::site_name_get(librados::Rados& rados, std::string* name) { + CephContext *cct = reinterpret_cast(rados.cct()); + ldout(cct, 20) << dendl; + + int r = get_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name); + if (r == -EOPNOTSUPP) { + return r; + } else if (r == -ENOENT || name->empty()) { + // default to the cluster fsid + r = rados.cluster_fsid(name); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r) + << dendl; + } + return r; + } else if (r < 0) { + lderr(cct) << "failed to retrieve site name: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Mirror::site_name_set(librados::Rados& rados, const std::string& name) { + CephContext *cct = reinterpret_cast(rados.cct()); + + std::string site_name{name}; + boost::algorithm::trim(site_name); + ldout(cct, 20) << "site_name=" << site_name << dendl; + + int r = set_config_key(rados, RBD_MIRROR_SITE_NAME_CONFIG_KEY, name); + if (r == -EOPNOTSUPP) { + return r; + } else if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to update site name: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Mirror::mode_get(librados::IoCtx& io_ctx, + rbd_mirror_mode_t *mirror_mode) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + cls::rbd::MirrorMode mirror_mode_internal; + int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode_internal); + if (r < 0) { + lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + switch (mirror_mode_internal) { + case cls::rbd::MIRROR_MODE_DISABLED: + case cls::rbd::MIRROR_MODE_IMAGE: + case cls::rbd::MIRROR_MODE_POOL: + *mirror_mode = 
static_cast(mirror_mode_internal); + break; + default: + lderr(cct) << "unknown mirror mode (" + << static_cast(mirror_mode_internal) << ")" + << dendl; + return -EINVAL; + } + return 0; +} + +template +int Mirror::mode_set(librados::IoCtx& io_ctx, + rbd_mirror_mode_t mirror_mode) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + cls::rbd::MirrorMode next_mirror_mode; + switch (mirror_mode) { + case RBD_MIRROR_MODE_DISABLED: + case RBD_MIRROR_MODE_IMAGE: + case RBD_MIRROR_MODE_POOL: + next_mirror_mode = static_cast(mirror_mode); + break; + default: + lderr(cct) << "unknown mirror mode (" + << static_cast(mirror_mode) << ")" << dendl; + return -EINVAL; + } + + int r; + if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + // fail early if pool still has peers registered and attempting to disable + std::vector mirror_peers; + r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl; + return r; + } else if (!mirror_peers.empty()) { + lderr(cct) << "mirror peers still registered" << dendl; + return -EBUSY; + } + } + + cls::rbd::MirrorMode current_mirror_mode; + r = cls_client::mirror_mode_get(&io_ctx, ¤t_mirror_mode); + if (r < 0) { + lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + if (current_mirror_mode == next_mirror_mode) { + return 0; + } else if (current_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + r = cls_client::mirror_uuid_set(&io_ctx, uuid_gen.to_string()); + if (r < 0) { + lderr(cct) << "failed to allocate mirroring uuid: " << cpp_strerror(r) + << dendl; + return r; + } + } + + if (current_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + r = cls_client::mirror_mode_set(&io_ctx, cls::rbd::MIRROR_MODE_IMAGE); + if (r < 0) { + lderr(cct) << "failed to set mirror mode to image: " + << cpp_strerror(r) << dendl; + return r; + } + + r = MirroringWatcher<>::notify_mode_updated(io_ctx, + cls::rbd::MIRROR_MODE_IMAGE); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + } + + if (next_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) { + return 0; + } + + if (next_mirror_mode == cls::rbd::MIRROR_MODE_POOL) { + std::map images; + r = Image::list_images_v2(io_ctx, &images); + if (r < 0) { + lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& img_pair : images) { + uint64_t features; + uint64_t incompatible_features; + r = cls_client::get_features(&io_ctx, util::header_name(img_pair.second), + true, &features, &incompatible_features); + if (r < 0) { + lderr(cct) << "error getting features for image " << img_pair.first + << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Enable only journal based mirroring + + if ((features & RBD_FEATURE_JOURNALING) != 0) { + I *img_ctx = I::create("", img_pair.second, nullptr, io_ctx, false); + r = img_ctx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening image "<< img_pair.first << ": " + << cpp_strerror(r) << dendl; + return r; + } + + r = image_enable(img_ctx, RBD_MIRROR_IMAGE_MODE_JOURNAL, true); + int close_r = img_ctx->state->close(); + if (r < 0) { + lderr(cct) << "error enabling mirroring for image " + << img_pair.first << ": " << cpp_strerror(r) << dendl; + return r; + } else if (close_r < 0) { + lderr(cct) << "failed to close image " << img_pair.first << ": " + << cpp_strerror(close_r) 
<< dendl; + return close_r; + } + } + } + } else if (next_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + while (true) { + bool retry_busy = false; + bool pending_busy = false; + + std::set image_ids; + r = list_mirror_images(io_ctx, image_ids); + if (r < 0) { + lderr(cct) << "failed listing images: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& img_id : image_ids) { + if (current_mirror_mode == cls::rbd::MIRROR_MODE_IMAGE) { + cls::rbd::MirrorImage mirror_image; + r = cls_client::mirror_image_get(&io_ctx, img_id, &mirror_image); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state for image id " + << img_id << ": " << cpp_strerror(r) << dendl; + return r; + } + if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "failed to disable mirror mode: there are still " + << "images with mirroring enabled" << dendl; + return -EINVAL; + } + } else { + I *img_ctx = I::create("", img_id, nullptr, io_ctx, false); + r = img_ctx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening image id "<< img_id << ": " + << cpp_strerror(r) << dendl; + return r; + } + + r = image_disable(img_ctx, false); + int close_r = img_ctx->state->close(); + if (r == -EBUSY) { + pending_busy = true; + } else if (r < 0) { + lderr(cct) << "error disabling mirroring for image id " << img_id + << cpp_strerror(r) << dendl; + return r; + } else if (close_r < 0) { + lderr(cct) << "failed to close image id " << img_id << ": " + << cpp_strerror(close_r) << dendl; + return close_r; + } else if (pending_busy) { + // at least one mirrored image was successfully disabled, so we can + // retry any failures caused by busy parent/child relationships + retry_busy = true; + } + } + } + + if (!retry_busy && pending_busy) { + lderr(cct) << "error disabling mirroring for one or more images" + << dendl; + return -EBUSY; + } else if (!retry_busy) { + break; + } + } + } + + r = cls_client::mirror_mode_set(&io_ctx, next_mirror_mode); + if (r < 0) { + lderr(cct) << "failed to set mirror mode: " << cpp_strerror(r) << dendl; + return r; + } + + r = MirroringWatcher<>::notify_mode_updated(io_ctx, next_mirror_mode); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + return 0; +} + +template +int Mirror::uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + C_SaferCond ctx; + uuid_get(io_ctx, mirror_uuid, &ctx); + int r = ctx.wait(); + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring uuid: " << cpp_strerror(r) + << dendl; + } + return r; + } + + return 0; +} + +template +void Mirror::uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid, + Context* on_finish) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + auto req = mirror::GetUuidRequest::create(io_ctx, mirror_uuid, on_finish); + req->send(); +} + +template +int Mirror::peer_bootstrap_create(librados::IoCtx& io_ctx, + std::string* token) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + auto mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + int r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring mode: " << cpp_strerror(r) + << dendl; + return r; + } else if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + return -EINVAL; + } + + // retrieve the cluster fsid + std::string fsid; + 
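+  // Note: the fsid retrieved here, together with the bootstrap user's
+  // credentials, is assembled below into the bootstrap token: base64-encoded
+  // JSON of the form
+  //   {"fsid": "...", "client_id": "...", "key": "...", "mon_host": "..."}
+  // peer_bootstrap_import() on the peer cluster decodes exactly these
+  // four fields.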
librados::Rados rados(io_ctx); + r = rados.cluster_fsid(&fsid); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string peer_client_id; + std::string cephx_key; + r = create_bootstrap_user(cct, rados, &peer_client_id, &cephx_key); + if (r < 0) { + return r; + } + + std::string mon_host = get_mon_host(cct); + + // format the token response + bufferlist token_bl; + token_bl.append( + R"({)" \ + R"("fsid":")" + fsid + R"(",)" + \ + R"("client_id":")" + peer_client_id + R"(",)" + \ + R"("key":")" + cephx_key + R"(",)" + \ + R"("mon_host":")" + \ + boost::replace_all_copy(mon_host, "\"", "\\\"") + R"(")" + \ + R"(})"); + ldout(cct, 20) << "token=" << token_bl.to_str() << dendl; + + bufferlist base64_bl; + token_bl.encode_base64(base64_bl); + *token = base64_bl.to_str(); + + return 0; +} + +template +int Mirror::peer_bootstrap_import(librados::IoCtx& io_ctx, + rbd_mirror_peer_direction_t direction, + const std::string& token) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + if (direction != RBD_MIRROR_PEER_DIRECTION_RX && + direction != RBD_MIRROR_PEER_DIRECTION_RX_TX) { + lderr(cct) << "invalid mirror peer direction" << dendl; + return -EINVAL; + } + + bufferlist token_bl; + try { + bufferlist base64_bl; + base64_bl.append(token); + token_bl.decode_base64(base64_bl); + } catch (buffer::error& err) { + lderr(cct) << "failed to decode base64" << dendl; + return -EINVAL; + } + + ldout(cct, 20) << "token=" << token_bl.to_str() << dendl; + + bool json_valid = false; + std::string expected_remote_fsid; + std::string remote_client_id; + std::string remote_key; + std::string remote_mon_host; + + json_spirit::mValue json_root; + if(json_spirit::read(token_bl.to_str(), json_root)) { + try { + auto& json_obj = json_root.get_obj(); + expected_remote_fsid = json_obj["fsid"].get_str(); + remote_client_id = json_obj["client_id"].get_str(); + remote_key = json_obj["key"].get_str(); + remote_mon_host = json_obj["mon_host"].get_str(); + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(cct) << "invalid bootstrap token JSON received" << dendl; + return -EINVAL; + } + + // sanity check import process + std::string local_fsid; + librados::Rados rados(io_ctx); + int r = rados.cluster_fsid(&local_fsid); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster fsid: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string local_site_name; + r = site_name_get(rados, &local_site_name); + if (r < 0) { + lderr(cct) << "failed to retrieve cluster site name: " << cpp_strerror(r) + << dendl; + return r; + } + + // attempt to connect to remote cluster + librados::Rados remote_rados; + remote_rados.init(remote_client_id.c_str()); + + auto remote_cct = reinterpret_cast(remote_rados.cct()); + remote_cct->_conf.set_val("mon_host", remote_mon_host); + remote_cct->_conf.set_val("key", remote_key); + + r = remote_rados.connect(); + if (r < 0) { + lderr(cct) << "failed to connect to peer cluster: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string remote_fsid; + r = remote_rados.cluster_fsid(&remote_fsid); + if (r < 0) { + lderr(cct) << "failed to retrieve remote cluster fsid: " + << cpp_strerror(r) << dendl; + return r; + } else if (local_fsid == remote_fsid) { + lderr(cct) << "cannot import token for local cluster" << dendl; + return -EINVAL; + } else if (expected_remote_fsid != remote_fsid) { + lderr(cct) << "unexpected remote cluster fsid" << dendl; + 
return -EINVAL; + } + + std::string remote_site_name; + r = site_name_get(remote_rados, &remote_site_name); + if (r < 0) { + lderr(cct) << "failed to retrieve remote cluster site name: " + << cpp_strerror(r) << dendl; + return r; + } else if (local_site_name == remote_site_name) { + lderr(cct) << "cannot import token for duplicate site name" << dendl; + return -EINVAL; + } + + librados::IoCtx remote_io_ctx; + r = remote_rados.ioctx_create(io_ctx.get_pool_name().c_str(), remote_io_ctx); + if (r == -ENOENT) { + ldout(cct, 10) << "remote pool does not exist" << dendl; + return r; + } else if (r < 0) { + lderr(cct) << "failed to open remote pool '" << io_ctx.get_pool_name() + << "': " << cpp_strerror(r) << dendl; + return r; + } + + auto remote_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + r = cls_client::mirror_mode_get(&remote_io_ctx, &remote_mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve remote mirroring mode: " + << cpp_strerror(r) << dendl; + return r; + } else if (remote_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + return -ENOSYS; + } + + auto local_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + r = cls_client::mirror_mode_get(&io_ctx, &local_mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve local mirroring mode: " << cpp_strerror(r) + << dendl; + return r; + } else if (local_mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) { + // copy mirror mode from remote peer + r = mode_set(io_ctx, static_cast(remote_mirror_mode)); + if (r < 0) { + return r; + } + } + + if (direction == RBD_MIRROR_PEER_DIRECTION_RX_TX) { + // create a local mirror peer user and export it to the remote cluster + std::string local_client_id; + std::string local_key; + r = create_bootstrap_user(cct, rados, &local_client_id, &local_key); + if (r < 0) { + return r; + } + + std::string local_mon_host = get_mon_host(cct); + + // create local cluster peer in remote cluster + r = create_bootstrap_peer(cct, remote_io_ctx, + RBD_MIRROR_PEER_DIRECTION_RX_TX, local_site_name, + local_fsid, local_client_id, local_key, + local_mon_host, "local", "remote"); + if (r < 0) { + return r; + } + } + + // create remote cluster peer in local cluster + r = create_bootstrap_peer(cct, io_ctx, direction, remote_site_name, + remote_fsid, remote_client_id, remote_key, + remote_mon_host, "remote", "local"); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Mirror::peer_site_add(librados::IoCtx& io_ctx, std::string *uuid, + mirror_peer_direction_t direction, + const std::string &site_name, + const std::string &client_name) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "name=" << site_name << ", " + << "client=" << client_name << dendl; + + if (cct->_conf->cluster == site_name) { + lderr(cct) << "cannot add self as remote peer" << dendl; + return -EINVAL; + } + + if (direction == RBD_MIRROR_PEER_DIRECTION_TX) { + return -EINVAL; + } + + int r; + do { + uuid_d uuid_gen; + uuid_gen.generate_random(); + + *uuid = uuid_gen.to_string(); + r = cls_client::mirror_peer_add( + &io_ctx, {*uuid, static_cast(direction), + site_name, client_name, ""}); + if (r == -ESTALE) { + ldout(cct, 5) << "duplicate UUID detected, retrying" << dendl; + } else if (r < 0) { + lderr(cct) << "failed to add mirror peer '" << site_name << "': " + << cpp_strerror(r) << dendl; + return r; + } + } while (r == -ESTALE); + return 0; +} + +template +int Mirror::peer_site_remove(librados::IoCtx& io_ctx, + const std::string &uuid) { + CephContext *cct = 
reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << dendl; + + int r = remove_peer_config_key(io_ctx, uuid); + if (r < 0) { + lderr(cct) << "failed to remove peer attributes '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + + r = cls_client::mirror_peer_remove(&io_ctx, uuid); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to remove peer '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + + std::vector names; + r = Namespace::list(io_ctx, &names); + if (r < 0) { + return r; + } + + names.push_back(""); + + librados::IoCtx ns_io_ctx; + ns_io_ctx.dup(io_ctx); + + for (auto &name : names) { + ns_io_ctx.set_namespace(name); + + std::set image_ids; + r = list_mirror_images(ns_io_ctx, image_ids); + if (r < 0) { + lderr(cct) << "failed listing images in " + << (name.empty() ? "default" : name) << " namespace : " + << cpp_strerror(r) << dendl; + return r; + } + + for (const auto& image_id : image_ids) { + cls::rbd::MirrorImage mirror_image; + r = cls_client::mirror_image_get(&ns_io_ctx, image_id, &mirror_image); + if (r == -ENOENT) { + continue; + } + if (r < 0) { + lderr(cct) << "error getting mirror info for image " << image_id + << ": " << cpp_strerror(r) << dendl; + return r; + } + if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + continue; + } + + // Snapshot based mirroring. Unlink the peer from mirroring snapshots. + // TODO: optimize. + + I *img_ctx = I::create("", image_id, nullptr, ns_io_ctx, false); + img_ctx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + + r = img_ctx->state->open(0); + if (r == -ENOENT) { + continue; + } + if (r < 0) { + lderr(cct) << "error opening image " << image_id << ": " + << cpp_strerror(r) << dendl; + return r; + } + + std::list snap_ids; + { + std::shared_lock image_locker{img_ctx->image_lock}; + for (auto &it : img_ctx->snap_info) { + auto info = std::get_if( + &it.second.snap_namespace); + if (info && info->mirror_peer_uuids.count(uuid)) { + snap_ids.push_back(it.first); + } + } + } + for (auto snap_id : snap_ids) { + C_SaferCond cond; + auto req = mirror::snapshot::UnlinkPeerRequest::create( + img_ctx, snap_id, uuid, true, &cond); + req->send(); + r = cond.wait(); + if (r == -ENOENT) { + r = 0; + } + if (r < 0) { + break; + } + } + + int close_r = img_ctx->state->close(); + if (r < 0) { + lderr(cct) << "error unlinking peer for image " << image_id << ": " + << cpp_strerror(r) << dendl; + return r; + } else if (close_r < 0) { + lderr(cct) << "failed to close image " << image_id << ": " + << cpp_strerror(close_r) << dendl; + return close_r; + } + } + } + + return 0; +} + +template +int Mirror::peer_site_list(librados::IoCtx& io_ctx, + std::vector *peers) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << dendl; + + std::vector mirror_peers; + int r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list peers: " << cpp_strerror(r) << dendl; + return r; + } + + peers->clear(); + peers->reserve(mirror_peers.size()); + for (auto &mirror_peer : mirror_peers) { + mirror_peer_site_t peer; + peer.uuid = mirror_peer.uuid; + peer.direction = static_cast( + mirror_peer.mirror_peer_direction); + peer.site_name = mirror_peer.site_name; + peer.mirror_uuid = mirror_peer.mirror_uuid; + peer.client_name = mirror_peer.client_name; + peer.last_seen = mirror_peer.last_seen.sec(); + peers->push_back(peer); + } + return 0; +} + +template +int Mirror::peer_site_set_client(librados::IoCtx& io_ctx, + const 
std::string &uuid, + const std::string &client_name) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "client=" << client_name << dendl; + + int r = cls_client::mirror_peer_set_client(&io_ctx, uuid, client_name); + if (r < 0) { + lderr(cct) << "failed to update client '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template +int Mirror::peer_site_set_name(librados::IoCtx& io_ctx, + const std::string &uuid, + const std::string &site_name) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "name=" << site_name << dendl; + + if (cct->_conf->cluster == site_name) { + lderr(cct) << "cannot set self as remote peer" << dendl; + return -EINVAL; + } + + int r = cls_client::mirror_peer_set_cluster(&io_ctx, uuid, site_name); + if (r < 0) { + lderr(cct) << "failed to update site '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template +int Mirror::peer_site_set_direction(librados::IoCtx& io_ctx, + const std::string &uuid, + mirror_peer_direction_t direction) { + cls::rbd::MirrorPeerDirection mirror_peer_direction = static_cast< + cls::rbd::MirrorPeerDirection>(direction); + + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "direction=" << mirror_peer_direction << dendl; + + int r = cls_client::mirror_peer_set_direction(&io_ctx, uuid, + mirror_peer_direction); + if (r < 0) { + lderr(cct) << "failed to update direction '" << uuid << "': " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +template +int Mirror::peer_site_get_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + Attributes* attributes) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << dendl; + + attributes->clear(); + + librados::Rados rados(io_ctx); + std::string value; + int r = get_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid), + &value); + if (r == -ENOENT || value.empty()) { + return -ENOENT; + } else if (r < 0) { + lderr(cct) << "failed to retrieve peer attributes: " << cpp_strerror(r) + << dendl; + return r; + } + + bool json_valid = false; + json_spirit::mValue json_root; + if(json_spirit::read(value, json_root)) { + try { + auto& json_obj = json_root.get_obj(); + for (auto& pairs : json_obj) { + (*attributes)[pairs.first] = pairs.second.get_str(); + } + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(cct) << "invalid peer attributes JSON received" << dendl; + return -EINVAL; + } + return 0; +} + +template +int Mirror::peer_site_set_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + const Attributes& attributes) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "uuid=" << uuid << ", " + << "attributes=" << attributes << dendl; + + std::vector mirror_peers; + int r = peer_site_list(io_ctx, &mirror_peers); + if (r < 0) { + return r; + } + + if (std::find_if(mirror_peers.begin(), mirror_peers.end(), + [&uuid](const librbd::mirror_peer_site_t& peer) { + return uuid == peer.uuid; + }) == mirror_peers.end()) { + ldout(cct, 5) << "mirror peer uuid " << uuid << " does not exist" << dendl; + return -ENOENT; + } + + std::stringstream ss; + ss << "{"; + for (auto& pair : attributes) { + ss << "\\\"" << pair.first << "\\\": " + << "\\\"" << pair.second << "\\\""; + if (&pair != &(*attributes.rbegin())) { + ss << ", "; + } + } + ss << "}"; + + 
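+  // Note: the quotes are pre-escaped because this JSON snippet is embedded
+  // verbatim as the "val" argument of the "config-key set" mon command built
+  // by set_config_key(); peer_site_get_attributes() later parses the stored
+  // value back with json_spirit.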
librados::Rados rados(io_ctx); + r = set_config_key(rados, get_peer_config_key_name(io_ctx.get_id(), uuid), + ss.str()); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to update peer attributes: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int Mirror::image_global_status_list( + librados::IoCtx& io_ctx, const std::string &start_id, size_t max, + IdToMirrorImageGlobalStatus *images) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + int r; + + std::map id_to_name; + { + std::map name_to_id; + r = Image::list_images_v2(io_ctx, &name_to_id); + if (r < 0) { + return r; + } + for (auto it : name_to_id) { + id_to_name[it.second] = it.first; + } + } + + std::map images_; + std::map statuses_; + + r = librbd::cls_client::mirror_image_status_list(&io_ctx, start_id, max, + &images_, &statuses_); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror image statuses: " + << cpp_strerror(r) << dendl; + return r; + } + + const std::string STATUS_NOT_FOUND("status not found"); + for (auto it = images_.begin(); it != images_.end(); ++it) { + auto &image_id = it->first; + auto &info = it->second; + if (info.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLED) { + continue; + } + + auto &image_name = id_to_name[image_id]; + if (image_name.empty()) { + lderr(cct) << "failed to find image name for image " << image_id << ", " + << "using image id as name" << dendl; + image_name = image_id; + } + + mirror_image_global_status_t& global_status = (*images)[image_id]; + global_status.name = image_name; + global_status.info = mirror_image_info_t{ + info.global_image_id, + static_cast(info.state), + false}; // XXX: To set "primary" right would require an additional call. + + bool found_local_site_status = false; + auto s_it = statuses_.find(image_id); + if (s_it != statuses_.end()) { + auto& status = s_it->second; + + global_status.site_statuses.reserve( + status.mirror_image_site_statuses.size()); + for (auto& site_status : status.mirror_image_site_statuses) { + if (site_status.mirror_uuid == + cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID) { + found_local_site_status = true; + } + + global_status.site_statuses.push_back(mirror_image_site_status_t{ + site_status.mirror_uuid, + static_cast(site_status.state), + site_status.state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN ? 
+ STATUS_NOT_FOUND : site_status.description, + site_status.last_update.sec(), site_status.up}); + } + } + + if (!found_local_site_status) { + global_status.site_statuses.push_back(mirror_image_site_status_t{ + cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID, + MIRROR_IMAGE_STATUS_STATE_UNKNOWN, STATUS_NOT_FOUND, 0, false}); + } + } + + return 0; +} + +template +int Mirror::image_status_summary(librados::IoCtx& io_ctx, + MirrorImageStatusStates *states) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + + std::vector mirror_peers; + int r = cls_client::mirror_peer_list(&io_ctx, &mirror_peers); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror peers: " << cpp_strerror(r) << dendl; + return r; + } + + std::map states_; + r = cls_client::mirror_image_status_get_summary(&io_ctx, mirror_peers, + &states_); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to get mirror status summary: " + << cpp_strerror(r) << dendl; + return r; + } + for (auto &s : states_) { + (*states)[static_cast(s.first)] = s.second; + } + return 0; +} + +template +int Mirror::image_instance_id_list( + librados::IoCtx& io_ctx, const std::string &start_image_id, size_t max, + std::map *instance_ids) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + std::map instances; + + int r = librbd::cls_client::mirror_image_instance_list( + &io_ctx, start_image_id, max, &instances); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror image instances: " << cpp_strerror(r) + << dendl; + return r; + } + + for (auto it : instances) { + (*instance_ids)[it.first] = stringify(it.second.name.num()); + } + + return 0; +} + +template +int Mirror::image_info_list( + librados::IoCtx& io_ctx, mirror_image_mode_t *mode_filter, + const std::string &start_id, size_t max, + std::map> *entries) { + CephContext *cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 20) << "pool=" << io_ctx.get_pool_name() << ", mode_filter=" + << (mode_filter ? 
stringify(*mode_filter) : "null") + << ", start_id=" << start_id << ", max=" << max << dendl; + + std::string last_read = start_id; + entries->clear(); + + while (entries->size() < max) { + std::map images; + std::map statuses; + + int r = librbd::cls_client::mirror_image_status_list(&io_ctx, last_read, + max, &images, + &statuses); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to list mirror image statuses: " + << cpp_strerror(r) << dendl; + return r; + } + + if (images.empty()) { + break; + } + + AsioEngine asio_engine(io_ctx); + + for (auto &it : images) { + auto &image_id = it.first; + auto &image = it.second; + auto mode = static_cast(image.mode); + + if ((mode_filter && mode != *mode_filter) || + image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + continue; + } + + // need to call get_info for every image to retrieve promotion state + + mirror_image_info_t info; + r = image_get_info(io_ctx, asio_engine.get_work_queue(), image_id, &info); + if (r < 0) { + continue; + } + + (*entries)[image_id] = std::make_pair(mode, info); + if (entries->size() == max) { + break; + } + } + + last_read = images.rbegin()->first; + } + + return 0; +} + +template +int Mirror::image_snapshot_create(I *ictx, uint32_t flags, + uint64_t *snap_id) { + C_SaferCond ctx; + Mirror::image_snapshot_create(ictx, flags, snap_id, &ctx); + + return ctx.wait(); +} + +template +void Mirror::image_snapshot_create(I *ictx, uint32_t flags, + uint64_t *snap_id, Context *on_finish) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "ictx=" << ictx << dendl; + + uint64_t snap_create_flags = 0; + int r = util::snap_create_flags_api_to_internal(cct, flags, + &snap_create_flags); + if (r < 0) { + on_finish->complete(r); + return; + } + + auto on_refresh = new LambdaContext( + [ictx, snap_create_flags, snap_id, on_finish](int r) { + if (r < 0) { + lderr(ictx->cct) << "refresh failed: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + auto ctx = new C_ImageSnapshotCreate(ictx, snap_create_flags, snap_id, + on_finish); + auto req = mirror::GetInfoRequest::create(*ictx, &ctx->mirror_image, + &ctx->promotion_state, + &ctx->primary_mirror_uuid, + ctx); + req->send(); + }); + + if (ictx->state->is_refresh_required()) { + ictx->state->refresh(on_refresh); + } else { + on_refresh->complete(0); + } +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Mirror; diff --git a/src/librbd/api/Mirror.h b/src/librbd/api/Mirror.h new file mode 100644 index 000000000..b3a552b13 --- /dev/null +++ b/src/librbd/api/Mirror.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_MIRROR_H +#define LIBRBD_API_MIRROR_H + +#include "include/rbd/librbd.hpp" +#include +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; +namespace asio { struct ContextWQ; } + +namespace api { + +template +struct Mirror { + typedef std::map Attributes; + typedef std::map + IdToMirrorImageGlobalStatus; + typedef std::map MirrorImageStatusStates; + + static int site_name_get(librados::Rados& rados, std::string* name); + static int site_name_set(librados::Rados& rados, const std::string& name); + + static int mode_get(librados::IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode); + static int mode_set(librados::IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode); + + static int uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid); + static void uuid_get(librados::IoCtx& io_ctx, std::string* mirror_uuid, + 
Context* on_finish); + + static int peer_bootstrap_create(librados::IoCtx& io_ctx, std::string* token); + static int peer_bootstrap_import(librados::IoCtx& io_ctx, + rbd_mirror_peer_direction_t direction, + const std::string& token); + + static int peer_site_add(librados::IoCtx& io_ctx, std::string *uuid, + mirror_peer_direction_t direction, + const std::string &site_name, + const std::string &client_name); + static int peer_site_remove(librados::IoCtx& io_ctx, const std::string &uuid); + static int peer_site_list(librados::IoCtx& io_ctx, + std::vector *peers); + static int peer_site_set_client(librados::IoCtx& io_ctx, + const std::string &uuid, + const std::string &client_name); + static int peer_site_set_name(librados::IoCtx& io_ctx, + const std::string &uuid, + const std::string &site_name); + static int peer_site_set_direction(librados::IoCtx& io_ctx, + const std::string &uuid, + mirror_peer_direction_t direction); + static int peer_site_get_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + Attributes* attributes); + static int peer_site_set_attributes(librados::IoCtx& io_ctx, + const std::string &uuid, + const Attributes& attributes); + + static int image_global_status_list(librados::IoCtx& io_ctx, + const std::string &start_id, size_t max, + IdToMirrorImageGlobalStatus *images); + + static int image_status_summary(librados::IoCtx& io_ctx, + MirrorImageStatusStates *states); + static int image_instance_id_list(librados::IoCtx& io_ctx, + const std::string &start_image_id, + size_t max, + std::map *ids); + + static int image_info_list( + librados::IoCtx& io_ctx, mirror_image_mode_t *mode_filter, + const std::string &start_id, size_t max, + std::map> *entries); + + static int image_enable(ImageCtxT *ictx, mirror_image_mode_t mode, + bool relax_same_pool_parent_check); + static int image_disable(ImageCtxT *ictx, bool force); + static int image_promote(ImageCtxT *ictx, bool force); + static void image_promote(ImageCtxT *ictx, bool force, Context *on_finish); + static int image_demote(ImageCtxT *ictx); + static void image_demote(ImageCtxT *ictx, Context *on_finish); + static int image_resync(ImageCtxT *ictx); + static int image_get_info(ImageCtxT *ictx, + mirror_image_info_t *mirror_image_info); + static void image_get_info(ImageCtxT *ictx, + mirror_image_info_t *mirror_image_info, + Context *on_finish); + static int image_get_info(librados::IoCtx& io_ctx, + asio::ContextWQ *op_work_queue, + const std::string &image_id, + mirror_image_info_t *mirror_image_info); + static void image_get_info(librados::IoCtx& io_ctx, + asio::ContextWQ *op_work_queue, + const std::string &image_id, + mirror_image_info_t *mirror_image_info, + Context *on_finish); + static int image_get_mode(ImageCtxT *ictx, mirror_image_mode_t *mode); + static void image_get_mode(ImageCtxT *ictx, mirror_image_mode_t *mode, + Context *on_finish); + static int image_get_global_status(ImageCtxT *ictx, + mirror_image_global_status_t *status); + static void image_get_global_status(ImageCtxT *ictx, + mirror_image_global_status_t *status, + Context *on_finish); + static int image_get_instance_id(ImageCtxT *ictx, std::string *instance_id); + + static int image_snapshot_create(ImageCtxT *ictx, uint32_t flags, + uint64_t *snap_id); + static void image_snapshot_create(ImageCtxT *ictx, uint32_t flags, + uint64_t *snap_id, Context *on_finish); +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Mirror; + +#endif // LIBRBD_API_MIRROR_H diff --git a/src/librbd/api/Namespace.cc 
b/src/librbd/api/Namespace.cc
new file mode 100644
index 000000000..86ed70c06
--- /dev/null
+++ b/src/librbd/api/Namespace.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/api/Mirror.h"
+#include "librbd/api/Namespace.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Namespace: " << __func__ << ": "
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+const std::list<std::string> POOL_OBJECTS {
+  RBD_CHILDREN,
+  RBD_GROUP_DIRECTORY,
+  RBD_INFO,
+  RBD_MIRRORING,
+  RBD_TASK,
+  RBD_TRASH,
+  RBD_DIRECTORY
+};
+
+} // anonymous namespace
+
+template <typename I>
+int Namespace<I>::create(librados::IoCtx& io_ctx, const std::string& name)
+{
+  CephContext *cct = (CephContext *)io_ctx.cct();
+  ldout(cct, 5) << "name=" << name << dendl;
+
+  if (name.empty()) {
+    return -EINVAL;
+  }
+
+  librados::Rados rados(io_ctx);
+  int8_t require_osd_release;
+  int r = rados.get_min_compatible_osd(&require_osd_release);
+  if (r < 0) {
+    lderr(cct) << "failed to retrieve min OSD release: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  if (require_osd_release < CEPH_RELEASE_NAUTILUS) {
+    ldout(cct, 1) << "namespace support requires nautilus or later OSD"
+                  << dendl;
+    return -ENOSYS;
+  }
+
+
+  librados::IoCtx default_ns_ctx;
+  default_ns_ctx.dup(io_ctx);
+  default_ns_ctx.set_namespace("");
+
+  r = cls_client::namespace_add(&default_ns_ctx, name);
+  if (r < 0) {
+    lderr(cct) << "failed to add namespace: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  librados::IoCtx ns_ctx;
+  ns_ctx.dup(io_ctx);
+  ns_ctx.set_namespace(name);
+
+  r = cls_client::dir_state_set(&ns_ctx, RBD_DIRECTORY,
+                                cls::rbd::DIRECTORY_STATE_READY);
+  if (r < 0) {
+    lderr(cct) << "failed to initialize image directory: " << cpp_strerror(r)
+               << dendl;
+    goto rollback;
+  }
+
+  return 0;
+
+rollback:
+  int ret_val = cls_client::namespace_remove(&default_ns_ctx, name);
+  if (ret_val < 0) {
+    lderr(cct) << "failed to remove namespace: " << cpp_strerror(ret_val) << dendl;
+  }
+
+  return r;
+}
+
+template <typename I>
+int Namespace<I>::remove(librados::IoCtx& io_ctx, const std::string& name)
+{
+  CephContext *cct = (CephContext *)io_ctx.cct();
+  ldout(cct, 5) << "name=" << name << dendl;
+
+  if (name.empty()) {
+    return -EINVAL;
+  }
+
+  librados::IoCtx default_ns_ctx;
+  default_ns_ctx.dup(io_ctx);
+  default_ns_ctx.set_namespace("");
+
+  librados::IoCtx ns_ctx;
+  ns_ctx.dup(io_ctx);
+  ns_ctx.set_namespace(name);
+
+  std::map<std::string, cls::rbd::TrashImageSpec> trash_entries;
+
+  librados::ObjectWriteOperation dir_op;
+  librbd::cls_client::dir_state_set(
+    &dir_op, cls::rbd::DIRECTORY_STATE_ADD_DISABLED);
+  dir_op.remove();
+
+  int r = ns_ctx.operate(RBD_DIRECTORY, &dir_op);
+  if (r == -EBUSY) {
+    ldout(cct, 5) << "image directory not empty" << dendl;
+    goto rollback;
+  } else if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to disable the namespace: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  r = cls_client::trash_list(&ns_ctx, "", 1, &trash_entries);
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to list trash directory: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  } else if (!trash_entries.empty()) {
+    ldout(cct, 5) << "image trash not empty" << dendl;
+    goto rollback;
+  }
+
+  r = Mirror<I>::mode_set(ns_ctx, RBD_MIRROR_MODE_DISABLED);
+  if (r < 0) {
+    lderr(cct) << "failed to disable mirroring: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  for (auto& oid
: POOL_OBJECTS) { + r = ns_ctx.remove(oid); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to remove object '" << oid << "': " + << cpp_strerror(r) << dendl; + return r; + } + } + + r = cls_client::namespace_remove(&default_ns_ctx, name); + if (r < 0) { + lderr(cct) << "failed to remove namespace: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; + +rollback: + + r = librbd::cls_client::dir_state_set( + &ns_ctx, RBD_DIRECTORY, cls::rbd::DIRECTORY_STATE_READY); + if (r < 0) { + lderr(cct) << "failed to restore directory state: " << cpp_strerror(r) + << dendl; + } + + return -EBUSY; +} + +template +int Namespace::list(IoCtx& io_ctx, std::vector *names) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 5) << dendl; + + librados::IoCtx default_ns_ctx; + default_ns_ctx.dup(io_ctx); + default_ns_ctx.set_namespace(""); + + int r; + int max_read = 1024; + std::string last_read = ""; + do { + std::list name_list; + r = cls_client::namespace_list(&default_ns_ctx, last_read, max_read, + &name_list); + if (r == -ENOENT) { + return 0; + } else if (r < 0) { + lderr(cct) << "error listing namespaces: " << cpp_strerror(r) << dendl; + return r; + } + + names->insert(names->end(), name_list.begin(), name_list.end()); + if (!name_list.empty()) { + last_read = name_list.back(); + } + r = name_list.size(); + } while (r == max_read); + + return 0; +} + +template +int Namespace::exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists) +{ + CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 5) << "name=" << name << dendl; + + *exists = false; + if (name.empty()) { + return -EINVAL; + } + + librados::IoCtx ns_ctx; + ns_ctx.dup(io_ctx); + ns_ctx.set_namespace(name); + + int r = librbd::cls_client::dir_state_assert(&ns_ctx, RBD_DIRECTORY, + cls::rbd::DIRECTORY_STATE_READY); + if (r == 0) { + *exists = true; + } else if (r != -ENOENT) { + lderr(cct) << "error asserting namespace: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Namespace; diff --git a/src/librbd/api/Namespace.h b/src/librbd/api/Namespace.h new file mode 100644 index 000000000..220eb28f3 --- /dev/null +++ b/src/librbd/api/Namespace.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_NAMESPACE_H +#define CEPH_LIBRBD_API_NAMESPACE_H + +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include +#include + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template +struct Namespace { + + static int create(librados::IoCtx& io_ctx, const std::string& name); + static int remove(librados::IoCtx& io_ctx, const std::string& name); + static int list(librados::IoCtx& io_ctx, std::vector* names); + static int exists(librados::IoCtx& io_ctx, const std::string& name, bool *exists); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Namespace; + +#endif // CEPH_LIBRBD_API_NAMESPACE_H diff --git a/src/librbd/api/Pool.cc b/src/librbd/api/Pool.cc new file mode 100644 index 000000000..65d55328f --- /dev/null +++ b/src/librbd/api/Pool.cc @@ -0,0 +1,375 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Pool.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "common/Throttle.h" +#include 
"cls/rbd/cls_rbd_client.h" +#include "osd/osd_types.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Image.h" +#include "librbd/api/Trash.h" +#include "librbd/image/ValidatePoolRequest.h" + +#define dout_subsys ceph_subsys_rbd + +namespace librbd { +namespace api { + +namespace { + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Pool::ImageStatRequest: " \ + << __func__ << " " << this << ": " \ + << "(id=" << m_image_id << "): " + +template +class ImageStatRequest { +public: + ImageStatRequest(librados::IoCtx& io_ctx, SimpleThrottle& throttle, + const std::string& image_id, bool scan_snaps, + std::atomic* bytes, + std::atomic* max_bytes, + std::atomic* snaps) + : m_cct(reinterpret_cast(io_ctx.cct())), + m_io_ctx(io_ctx), m_throttle(throttle), m_image_id(image_id), + m_scan_snaps(scan_snaps), m_bytes(bytes), m_max_bytes(max_bytes), + m_snaps(snaps) { + m_throttle.start_op(); + } + + void send() { + get_head(); + } + +protected: + void finish(int r) { + (*m_max_bytes) += m_max_size; + m_throttle.end_op(r); + + delete this; + } + +private: + CephContext* m_cct; + librados::IoCtx& m_io_ctx; + SimpleThrottle& m_throttle; + const std::string& m_image_id; + bool m_scan_snaps; + std::atomic* m_bytes; + std::atomic* m_max_bytes; + std::atomic* m_snaps; + bufferlist m_out_bl; + + uint64_t m_max_size = 0; + ::SnapContext m_snapc; + + void get_head() { + ldout(m_cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::get_size_start(&op, CEPH_NOSNAP); + if (m_scan_snaps) { + cls_client::get_snapcontext_start(&op); + } + + m_out_bl.clear(); + auto aio_comp = util::create_rados_callback< + ImageStatRequest, &ImageStatRequest::handle_get_head>(this); + int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); + } + + void handle_get_head(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + auto it = m_out_bl.cbegin(); + if (r == 0) { + uint8_t order; + r = cls_client::get_size_finish(&it, &m_max_size, &order); + if (r == 0) { + (*m_bytes) += m_max_size; + } + } + if (m_scan_snaps && r == 0) { + r = cls_client::get_snapcontext_finish(&it, &m_snapc); + if (r == 0) { + (*m_snaps) += m_snapc.snaps.size(); + } + } + + if (r == -ENOENT) { + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to stat image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_snapc.is_valid()) { + lderr(m_cct) << "snap context is invalid" << dendl; + finish(-EIO); + return; + } + + get_snaps(); + } + + void get_snaps() { + if (!m_scan_snaps || m_snapc.snaps.empty()) { + finish(0); + return; + } + + ldout(m_cct, 15) << dendl; + librados::ObjectReadOperation op; + for (auto snap_seq : m_snapc.snaps) { + cls_client::get_size_start(&op, snap_seq); + } + + m_out_bl.clear(); + auto aio_comp = util::create_rados_callback< + ImageStatRequest, &ImageStatRequest::handle_get_snaps>(this); + int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); + } + + void handle_get_snaps(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + auto it = m_out_bl.cbegin(); + for ([[maybe_unused]] auto snap_seq : m_snapc.snaps) { + uint64_t size; + if (r == 0) { + uint8_t order; + r = cls_client::get_size_finish(&it, &size, &order); + } + if (r == 0 && m_max_size < size) { + m_max_size = size; + } + } + + if (r == -ENOENT) { + ldout(m_cct, 
15) << "out-of-sync metadata" << dendl; + get_head(); + } else if (r < 0) { + lderr(m_cct) << "failed to retrieve snap size: " << cpp_strerror(r) + << dendl; + finish(r); + } else { + finish(0); + } + } + +}; + +template +void get_pool_stat_option_value(typename Pool::StatOptions* stat_options, + rbd_pool_stat_option_t option, + uint64_t** value) { + auto it = stat_options->find(option); + if (it == stat_options->end()) { + *value = nullptr; + } else { + *value = it->second; + } +} + +template +int get_pool_stats(librados::IoCtx& io_ctx, const ConfigProxy& config, + const std::vector& image_ids, uint64_t* image_count, + uint64_t* provisioned_bytes, uint64_t* max_provisioned_bytes, + uint64_t* snapshot_count) { + + bool scan_snaps = ((max_provisioned_bytes != nullptr) || + (snapshot_count != nullptr)); + + SimpleThrottle throttle( + config.template get_val("rbd_concurrent_management_ops"), true); + std::atomic bytes{0}; + std::atomic max_bytes{0}; + std::atomic snaps{0}; + for (auto& image_id : image_ids) { + if (throttle.pending_error()) { + break; + } + + auto req = new ImageStatRequest(io_ctx, throttle, image_id, + scan_snaps, &bytes, &max_bytes, &snaps); + req->send(); + } + + int r = throttle.wait_for_ret(); + if (r < 0) { + return r; + } + + if (image_count != nullptr) { + *image_count = image_ids.size(); + } + if (provisioned_bytes != nullptr) { + *provisioned_bytes = bytes.load(); + } + if (max_provisioned_bytes != nullptr) { + *max_provisioned_bytes = max_bytes.load(); + } + if (snapshot_count != nullptr) { + *snapshot_count = snaps.load(); + } + + return 0; +} + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Pool: " << __func__ << ": " + +template +int Pool::init(librados::IoCtx& io_ctx, bool force) { + auto cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 10) << dendl; + + int r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RBD, force); + if (r < 0) { + return r; + } + + ConfigProxy config{cct->_conf}; + api::Config::apply_pool_overrides(io_ctx, &config); + if (!config.get_val("rbd_validate_pool")) { + return 0; + } + + C_SaferCond ctx; + auto req = image::ValidatePoolRequest::create(io_ctx, &ctx); + req->send(); + + return ctx.wait(); +} + +template +int Pool::add_stat_option(StatOptions* stat_options, + rbd_pool_stat_option_t option, + uint64_t* value) { + switch (option) { + case RBD_POOL_STAT_OPTION_IMAGES: + case RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS: + case RBD_POOL_STAT_OPTION_TRASH_IMAGES: + case RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES: + case RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS: + stat_options->emplace(option, value); + return 0; + default: + break; + } + return -ENOENT; +} + +template +int Pool::get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options) { + auto cct = reinterpret_cast(io_ctx.cct()); + ldout(cct, 10) << dendl; + + ConfigProxy config{cct->_conf}; + api::Config::apply_pool_overrides(io_ctx, &config); + + uint64_t* image_count; + uint64_t* provisioned_bytes; + uint64_t* max_provisioned_bytes; + uint64_t* snapshot_count; + + std::vector trash_entries; + int r = Trash::list(io_ctx, trash_entries, false); + if (r < 0 && r != -EOPNOTSUPP) { + return r; + } + + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_IMAGES, &image_count); + get_pool_stat_option_value( + stat_options, 
RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES, + &provisioned_bytes); + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES, + &max_provisioned_bytes); + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snapshot_count); + if (image_count != nullptr || provisioned_bytes != nullptr || + max_provisioned_bytes != nullptr || snapshot_count != nullptr) { + typename Image::ImageNameToIds images; + int r = Image::list_images_v2(io_ctx, &images); + if (r < 0) { + return r; + } + + std::vector image_ids; + image_ids.reserve(images.size() + trash_entries.size()); + for (auto& it : images) { + image_ids.push_back(std::move(it.second)); + } + for (auto& it : trash_entries) { + if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + image_ids.push_back(std::move(it.id)); + } + } + + r = get_pool_stats(io_ctx, config, image_ids, image_count, + provisioned_bytes, max_provisioned_bytes, + snapshot_count); + if (r < 0) { + return r; + } + } + + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_TRASH_IMAGES, &image_count); + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES, + &provisioned_bytes); + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES, + &max_provisioned_bytes); + get_pool_stat_option_value( + stat_options, RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &snapshot_count); + if (image_count != nullptr || provisioned_bytes != nullptr || + max_provisioned_bytes != nullptr || snapshot_count != nullptr) { + + std::vector image_ids; + image_ids.reserve(trash_entries.size()); + for (auto& it : trash_entries) { + if (it.source == RBD_TRASH_IMAGE_SOURCE_REMOVING) { + continue; + } + image_ids.push_back(std::move(it.id)); + } + + r = get_pool_stats(io_ctx, config, image_ids, image_count, + provisioned_bytes, max_provisioned_bytes, + snapshot_count); + if (r < 0) { + return r; + } + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Pool; diff --git a/src/librbd/api/Pool.h b/src/librbd/api/Pool.h new file mode 100644 index 000000000..7b607ab6e --- /dev/null +++ b/src/librbd/api/Pool.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_POOL_H +#define CEPH_LIBRBD_API_POOL_H + +#include "include/int_types.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.h" +#include + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template +class Pool { +public: + typedef std::map StatOptions; + + static int init(librados::IoCtx& io_ctx, bool force); + + static int add_stat_option(StatOptions* stat_options, + rbd_pool_stat_option_t option, + uint64_t* value); + + static int get_stats(librados::IoCtx& io_ctx, StatOptions* stat_options); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Pool; + +#endif // CEPH_LIBRBD_API_POOL_H diff --git a/src/librbd/api/PoolMetadata.cc b/src/librbd/api/PoolMetadata.cc new file mode 100644 index 000000000..33e3fb648 --- /dev/null +++ b/src/librbd/api/PoolMetadata.cc @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/PoolMetadata.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include 
"librbd/image/GetMetadataRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::PoolMetadata: " << __func__ << ": " + +namespace librbd { +namespace api { + +namespace { + +void update_pool_timestamp(librados::IoCtx& io_ctx) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + auto now = ceph_clock_now(); + std::string cmd = + R"({)" + R"("prefix": "config set", )" + R"("who": "global", )" + R"("name": "rbd_config_pool_override_update_timestamp", )" + R"("value": ")" + stringify(now.sec()) + R"(")" + R"(})"; + + librados::Rados rados(io_ctx); + bufferlist in_bl; + std::string ss; + int r = rados.mon_command(cmd, in_bl, nullptr, &ss); + if (r < 0) { + lderr(cct) << "failed to notify clients of pool config update: " + << cpp_strerror(r) << dendl; + } +} + +} // anonymous namespace + +template +int PoolMetadata::get(librados::IoCtx& io_ctx, + const std::string &key, std::string *value) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, value); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r) + << dendl; + } + + return r; +} + +template +int PoolMetadata::set(librados::IoCtx& io_ctx, const std::string &key, + const std::string &value) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + bool need_update_pool_timestamp = false; + + std::string config_key; + if (util::is_metadata_config_override(key, &config_key)) { + if (!librbd::api::Config::is_option_name(io_ctx, config_key)) { + lderr(cct) << "validation for " << key + << " failed: not allowed pool level override" << dendl; + return -EINVAL; + } + int r = ConfigProxy{false}.set_val(config_key.c_str(), value); + if (r < 0) { + lderr(cct) << "validation for " << key << " failed: " << cpp_strerror(r) + << dendl; + return -EINVAL; + } + + need_update_pool_timestamp = true; + } + + ceph::bufferlist bl; + bl.append(value); + + int r = cls_client::metadata_set(&io_ctx, RBD_INFO, {{key, bl}}); + if (r < 0) { + lderr(cct) << "failed setting metadata " << key << ": " << cpp_strerror(r) + << dendl; + return r; + } + + if (need_update_pool_timestamp) { + update_pool_timestamp(io_ctx); + } + + return 0; +} + +template +int PoolMetadata::remove(librados::IoCtx& io_ctx, const std::string &key) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + std::string value; + int r = cls_client::metadata_get(&io_ctx, RBD_INFO, key, &value); + if (r < 0) { + if (r == -ENOENT) { + ldout(cct, 1) << "metadata " << key << " does not exist" << dendl; + } else { + lderr(cct) << "failed reading metadata " << key << ": " << cpp_strerror(r) + << dendl; + } + return r; + } + + r = cls_client::metadata_remove(&io_ctx, RBD_INFO, key); + if (r < 0) { + lderr(cct) << "failed removing metadata " << key << ": " << cpp_strerror(r) + << dendl; + return r; + } + + std::string config_key; + if (util::is_metadata_config_override(key, &config_key)) { + update_pool_timestamp(io_ctx); + } + + return 0; +} + +template +int PoolMetadata::list(librados::IoCtx& io_ctx, const std::string &start, + uint64_t max, + std::map *pairs) { + CephContext *cct = (CephContext *)io_ctx.cct(); + + pairs->clear(); + C_SaferCond ctx; + auto req = image::GetMetadataRequest::create( + io_ctx, RBD_INFO, false, "", start, max, pairs, &ctx); + req->send(); + + int r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed listing metadata: " << cpp_strerror(r) + << dendl; + return r; + } + return 0; +} + +} // namespace api +} // 
namespace librbd
+
+template class librbd::api::PoolMetadata<librbd::ImageCtx>;
diff --git a/src/librbd/api/PoolMetadata.h b/src/librbd/api/PoolMetadata.h
new file mode 100644
index 000000000..69ab574ac
--- /dev/null
+++ b/src/librbd/api/PoolMetadata.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_API_POOL_METADATA_H
+#define CEPH_LIBRBD_API_POOL_METADATA_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace api {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolMetadata {
+public:
+  static int get(librados::IoCtx& io_ctx, const std::string &key,
+                 std::string *value);
+  static int set(librados::IoCtx& io_ctx, const std::string &key,
+                 const std::string &value);
+  static int remove(librados::IoCtx& io_ctx, const std::string &key);
+  static int list(librados::IoCtx& io_ctx, const std::string &start,
+                  uint64_t max,
+                  std::map<std::string, ceph::bufferlist> *pairs);
+};
+
+} // namespace api
+} // namespace librbd
+
+extern template class librbd::api::PoolMetadata<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_API_POOL_METADATA_H
diff --git a/src/librbd/api/Snapshot.cc b/src/librbd/api/Snapshot.cc
new file mode 100644
index 000000000..03cefbd1c
--- /dev/null
+++ b/src/librbd/api/Snapshot.cc
@@ -0,0 +1,444 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/api/Snapshot.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/errno.h"
+#include "librbd/internal.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Image.h"
+#include "include/Context.h"
+#include "common/Cond.h"
+
+#include <boost/variant.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::api::Snapshot: " << __func__ << ": "
+
+using librados::snap_t;
+
+namespace librbd {
+namespace api {
+
+namespace {
+
+class GetGroupVisitor {
+public:
+  CephContext* cct;
+  librados::IoCtx *image_ioctx;
+  snap_group_namespace_t *group_snap;
+
+  explicit GetGroupVisitor(CephContext* cct, librados::IoCtx *_image_ioctx,
+                           snap_group_namespace_t *group_snap)
+    : cct(cct), image_ioctx(_image_ioctx), group_snap(group_snap) {};
+
+  template <typename T>
+  inline int operator()(const T&) const {
+    // ignore other than GroupSnapshotNamespace types.
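+    // (the snapshot namespace is a variant-like type, so visit() requires
+    // an overload for every alternative; this catch-all rejects any
+    // namespace other than GroupSnapshotNamespace with -EINVAL)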
+ return -EINVAL; + } + + inline int operator()( + const cls::rbd::GroupSnapshotNamespace& snap_namespace) { + IoCtx group_ioctx; + int r = util::create_ioctx(*image_ioctx, "group", snap_namespace.group_pool, + {}, &group_ioctx); + if (r < 0) { + return r; + } + + cls::rbd::GroupSnapshot group_snapshot; + + std::string group_name; + r = cls_client::dir_get_name(&group_ioctx, RBD_GROUP_DIRECTORY, + snap_namespace.group_id, &group_name); + if (r < 0) { + lderr(cct) << "failed to retrieve group name: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string group_header_oid = util::group_header_name(snap_namespace.group_id); + r = cls_client::group_snap_get_by_id(&group_ioctx, + group_header_oid, + snap_namespace.group_snapshot_id, + &group_snapshot); + if (r < 0) { + lderr(cct) << "failed to retrieve group snapshot: " << cpp_strerror(r) + << dendl; + return r; + } + + group_snap->group_pool = group_ioctx.get_id(); + group_snap->group_name = group_name; + group_snap->group_snap_name = group_snapshot.name; + return 0; + } +}; + +class GetTrashVisitor { +public: + std::string* original_name; + + explicit GetTrashVisitor(std::string* original_name) + : original_name(original_name) { + } + + template + inline int operator()(const T&) const { + return -EINVAL; + } + + inline int operator()( + const cls::rbd::TrashSnapshotNamespace& snap_namespace) { + *original_name = snap_namespace.original_name; + return 0; + } +}; + +class GetMirrorVisitor { +public: + snap_mirror_namespace_t *mirror_snap; + + explicit GetMirrorVisitor(snap_mirror_namespace_t *mirror_snap) + : mirror_snap(mirror_snap) { + } + + template + inline int operator()(const T&) const { + return -EINVAL; + } + + inline int operator()( + const cls::rbd::MirrorSnapshotNamespace& snap_namespace) { + mirror_snap->state = static_cast(snap_namespace.state); + mirror_snap->complete = snap_namespace.complete; + mirror_snap->mirror_peer_uuids = snap_namespace.mirror_peer_uuids; + mirror_snap->primary_mirror_uuid = snap_namespace.primary_mirror_uuid; + mirror_snap->primary_snap_id = snap_namespace.primary_snap_id; + mirror_snap->last_copied_object_number = + snap_namespace.last_copied_object_number; + return 0; + } +}; + +} // anonymous namespace + +template +int Snapshot::get_group_namespace(I *ictx, uint64_t snap_id, + snap_group_namespace_t *group_snap) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock image_locker{ictx->image_lock}; + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == nullptr) { + return -ENOENT; + } + + GetGroupVisitor ggv = GetGroupVisitor(ictx->cct, &ictx->md_ctx, group_snap); + r = snap_info->snap_namespace.visit(ggv); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Snapshot::get_trash_namespace(I *ictx, uint64_t snap_id, + std::string* original_name) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock image_locker{ictx->image_lock}; + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == nullptr) { + return -ENOENT; + } + + auto visitor = GetTrashVisitor(original_name); + r = snap_info->snap_namespace.visit(visitor); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Snapshot::get_mirror_namespace( + I *ictx, uint64_t snap_id, snap_mirror_namespace_t *mirror_snap) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock image_locker{ictx->image_lock}; + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == 
nullptr) { + return -ENOENT; + } + + auto gmv = GetMirrorVisitor(mirror_snap); + r = snap_info->snap_namespace.visit(gmv); + if (r < 0) { + return r; + } + + return 0; +} + +template +int Snapshot::get_namespace_type(I *ictx, uint64_t snap_id, + snap_namespace_type_t *namespace_type) { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock l{ictx->image_lock}; + auto snap_info = ictx->get_snap_info(snap_id); + if (snap_info == nullptr) { + return -ENOENT; + } + + *namespace_type = static_cast( + cls::rbd::get_snap_namespace_type(snap_info->snap_namespace)); + return 0; +} + +template +int Snapshot::remove(I *ictx, uint64_t snap_id) { + ldout(ictx->cct, 20) << "snap_remove " << ictx << " " << snap_id << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + cls::rbd::SnapshotNamespace snapshot_namespace; + std::string snapshot_name; + { + std::shared_lock image_locker{ictx->image_lock}; + auto it = ictx->snap_info.find(snap_id); + if (it == ictx->snap_info.end()) { + return -ENOENT; + } + + snapshot_namespace = it->second.snap_namespace; + snapshot_name = it->second.name; + } + + C_SaferCond ctx; + ictx->operations->snap_remove(snapshot_namespace, snapshot_name, &ctx); + r = ctx.wait(); + return r; +} + +template +int Snapshot::get_name(I *ictx, uint64_t snap_id, std::string *snap_name) + { + ldout(ictx->cct, 20) << "snap_get_name " << ictx << " " << snap_id << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock image_locker{ictx->image_lock}; + r = ictx->get_snap_name(snap_id, snap_name); + + return r; + } + +template +int Snapshot::get_id(I *ictx, const std::string& snap_name, uint64_t *snap_id) + { + ldout(ictx->cct, 20) << "snap_get_id " << ictx << " " << snap_name << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock image_locker{ictx->image_lock}; + *snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), snap_name); + if (*snap_id == CEPH_NOSNAP) + return -ENOENT; + + return 0; + } + +template +int Snapshot::list(I *ictx, std::vector& snaps) { + ldout(ictx->cct, 20) << "snap_list " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock l{ictx->image_lock}; + for (auto &it : ictx->snap_info) { + snap_info_t info; + info.name = it.second.name; + info.id = it.first; + info.size = it.second.size; + snaps.push_back(info); + } + + return 0; +} + +template +int Snapshot::exists(I *ictx, const cls::rbd::SnapshotNamespace& snap_namespace, + const char *snap_name, bool *exists) { + ldout(ictx->cct, 20) << "snap_exists " << ictx << " " << snap_name << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock l{ictx->image_lock}; + *exists = ictx->get_snap_id(snap_namespace, snap_name) != CEPH_NOSNAP; + return 0; +} + +template +int Snapshot::create(I *ictx, const char *snap_name, uint32_t flags, + ProgressContext& pctx) { + ldout(ictx->cct, 20) << "snap_create " << ictx << " " << snap_name + << " flags: " << flags << dendl; + + uint64_t internal_flags = 0; + int r = util::snap_create_flags_api_to_internal(ictx->cct, flags, + &internal_flags); + if (r < 0) { + return r; + } + + return ictx->operations->snap_create(cls::rbd::UserSnapshotNamespace(), + snap_name, internal_flags, pctx); +} + +template +int Snapshot::remove(I *ictx, const char *snap_name, uint32_t flags, + ProgressContext& pctx) { + ldout(ictx->cct, 20) 
<< "snap_remove " << ictx << " " << snap_name << " flags: " << flags << dendl; + + int r = 0; + + r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + if (flags & RBD_SNAP_REMOVE_FLATTEN) { + r = Image::flatten_children(ictx, snap_name, pctx); + if (r < 0) { + return r; + } + } + + bool protect; + r = is_protected(ictx, snap_name, &protect); + if (r < 0) { + return r; + } + + if (protect && flags & RBD_SNAP_REMOVE_UNPROTECT) { + r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name); + if (r < 0) { + lderr(ictx->cct) << "failed to unprotect snapshot: " << snap_name << dendl; + return r; + } + + r = is_protected(ictx, snap_name, &protect); + if (r < 0) { + return r; + } + if (protect) { + lderr(ictx->cct) << "snapshot is still protected after unprotection" << dendl; + ceph_abort(); + } + } + + C_SaferCond ctx; + ictx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(), snap_name, &ctx); + + r = ctx.wait(); + return r; +} + +template +int Snapshot::get_timestamp(I *ictx, uint64_t snap_id, struct timespec *timestamp) { + auto snap_it = ictx->snap_info.find(snap_id); + ceph_assert(snap_it != ictx->snap_info.end()); + utime_t time = snap_it->second.timestamp; + time.to_timespec(timestamp); + return 0; +} + +template +int Snapshot::get_limit(I *ictx, uint64_t *limit) { + int r = cls_client::snapshot_get_limit(&ictx->md_ctx, ictx->header_oid, + limit); + if (r == -EOPNOTSUPP) { + *limit = UINT64_MAX; + r = 0; + } + return r; +} + +template +int Snapshot::set_limit(I *ictx, uint64_t limit) { + return ictx->operations->snap_set_limit(limit); +} + +template +int Snapshot::is_protected(I *ictx, const char *snap_name, bool *protect) { + ldout(ictx->cct, 20) << "snap_is_protected " << ictx << " " << snap_name + << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock l{ictx->image_lock}; + snap_t snap_id = ictx->get_snap_id(cls::rbd::UserSnapshotNamespace(), snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; + bool is_unprotected; + r = ictx->is_snap_unprotected(snap_id, &is_unprotected); + // consider both PROTECTED or UNPROTECTING to be 'protected', + // since in either state they can't be deleted + *protect = !is_unprotected; + return r; +} + +template +int Snapshot::get_namespace(I *ictx, const char *snap_name, + cls::rbd::SnapshotNamespace *snap_namespace) { + ldout(ictx->cct, 20) << "get_snap_namespace " << ictx << " " << snap_name + << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + std::shared_lock l{ictx->image_lock}; + snap_t snap_id = ictx->get_snap_id(*snap_namespace, snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; + r = ictx->get_snap_namespace(snap_id, snap_namespace); + return r; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Snapshot; diff --git a/src/librbd/api/Snapshot.h b/src/librbd/api/Snapshot.h new file mode 100644 index 000000000..7e06a5a8d --- /dev/null +++ b/src/librbd/api/Snapshot.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_API_SNAPSHOT_H +#define CEPH_LIBRBD_API_SNAPSHOT_H + +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include + +namespace librbd { + +struct ImageCtx; + +namespace api { + +template +struct Snapshot { + + static int get_group_namespace(ImageCtxT *ictx, uint64_t snap_id, + snap_group_namespace_t *group_snap); + + static int get_trash_namespace(ImageCtxT 
*ictx, uint64_t snap_id, + std::string *original_name); + + static int get_mirror_namespace( + ImageCtxT *ictx, uint64_t snap_id, + snap_mirror_namespace_t *mirror_snap); + + static int get_namespace_type(ImageCtxT *ictx, uint64_t snap_id, + snap_namespace_type_t *namespace_type); + + static int remove(ImageCtxT *ictx, uint64_t snap_id); + + static int get_name(ImageCtxT *ictx, uint64_t snap_id, std::string *snap_name); + + static int get_id(ImageCtxT *ictx, const std::string& snap_name, uint64_t *snap_id); + + static int list(ImageCtxT *ictx, std::vector& snaps); + + static int exists(ImageCtxT *ictx, const cls::rbd::SnapshotNamespace& snap_namespace, + const char *snap_name, bool *exists); + + static int create(ImageCtxT *ictx, const char *snap_name, uint32_t flags, + ProgressContext& pctx); + + static int remove(ImageCtxT *ictx, const char *snap_name, uint32_t flags, ProgressContext& pctx); + + static int get_limit(ImageCtxT *ictx, uint64_t *limit); + + static int set_limit(ImageCtxT *ictx, uint64_t limit); + + static int get_timestamp(ImageCtxT *ictx, uint64_t snap_id, struct timespec *timestamp); + + static int is_protected(ImageCtxT *ictx, const char *snap_name, bool *protect); + + static int get_namespace(ImageCtxT *ictx, const char *snap_name, + cls::rbd::SnapshotNamespace *snap_namespace); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Snapshot; + +#endif // CEPH_LIBRBD_API_SNAPSHOT_H diff --git a/src/librbd/api/Trash.cc b/src/librbd/api/Trash.cc new file mode 100644 index 000000000..d8189e8a7 --- /dev/null +++ b/src/librbd/api/Trash.cc @@ -0,0 +1,759 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Trash.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Cond.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/AsioEngine.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/TrashWatcher.h" +#include "librbd/Utils.h" +#include "librbd/api/DiffIterate.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/trash/MoveRequest.h" +#include "librbd/trash/RemoveRequest.h" +#include +#include "librbd/journal/DisabledPolicy.h" +#include "librbd/image/ListWatchersRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::Trash: " << __func__ << ": " + +namespace librbd { +namespace api { + +template +const typename Trash::TrashImageSources Trash::ALLOWED_RESTORE_SOURCES { + cls::rbd::TRASH_IMAGE_SOURCE_USER, + cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING, + cls::rbd::TRASH_IMAGE_SOURCE_USER_PARENT + }; + +namespace { + +template +int disable_mirroring(I *ictx) { + ldout(ictx->cct, 10) << dendl; + + C_SaferCond ctx; + auto req = mirror::DisableRequest::create(ictx, false, true, &ctx); + req->send(); + int r = ctx.wait(); + if (r < 0) { + lderr(ictx->cct) << "failed to disable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +template +int enable_mirroring(IoCtx &io_ctx, const std::string &image_id) { + auto cct = reinterpret_cast(io_ctx.cct()); + + uint64_t features; + uint64_t incompatible_features; + int r = cls_client::get_features(&io_ctx, util::header_name(image_id), 
true, + &features, &incompatible_features); + if (r < 0) { + lderr(cct) << "failed to retrieve features: " << cpp_strerror(r) << dendl; + return r; + } + + if ((features & RBD_FEATURE_JOURNALING) == 0) { + return 0; + } + + cls::rbd::MirrorMode mirror_mode; + r = cls_client::mirror_mode_get(&io_ctx, &mirror_mode); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + return r; + } + + if (mirror_mode != cls::rbd::MIRROR_MODE_POOL) { + ldout(cct, 10) << "not pool mirroring mode" << dendl; + return 0; + } + + ldout(cct, 10) << dendl; + + AsioEngine asio_engine(io_ctx); + + C_SaferCond ctx; + auto req = mirror::EnableRequest::create( + io_ctx, image_id, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", false, + asio_engine.get_work_queue(), &ctx); + req->send(); + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +int list_trash_image_specs( + librados::IoCtx &io_ctx, + std::map* trash_image_specs, + bool exclude_user_remove_source) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "list_trash_image_specs " << &io_ctx << dendl; + + bool more_entries; + uint32_t max_read = 1024; + std::string last_read; + do { + std::map trash_entries; + int r = cls_client::trash_list(&io_ctx, last_read, max_read, + &trash_entries); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error listing rbd trash entries: " << cpp_strerror(r) + << dendl; + return r; + } else if (r == -ENOENT) { + break; + } + + if (trash_entries.empty()) { + break; + } + + for (const auto &entry : trash_entries) { + if (exclude_user_remove_source && + entry.second.source == cls::rbd::TRASH_IMAGE_SOURCE_REMOVING) { + continue; + } + + trash_image_specs->insert({entry.first, entry.second}); + } + + last_read = trash_entries.rbegin()->first; + more_entries = (trash_entries.size() >= max_read); + } while (more_entries); + + return 0; +} + +} // anonymous namespace + +template +int Trash::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, const std::string &image_id, + uint64_t delay) { + ceph_assert(!image_name.empty() && !image_id.empty()); + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << &io_ctx << " name=" << image_name << ", id=" << image_id + << dendl; + + auto ictx = new I("", image_id, nullptr, io_ctx, false); + int r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT); + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + return r; + } + + if (r == 0) { + cls::rbd::MirrorImage mirror_image; + int mirror_r = cls_client::mirror_image_get(&ictx->md_ctx, ictx->id, + &mirror_image); + if (mirror_r == -ENOENT) { + ldout(ictx->cct, 10) << "mirroring is not enabled for this image" + << dendl; + } else if (mirror_r < 0) { + lderr(ictx->cct) << "failed to retrieve mirror image: " + << cpp_strerror(mirror_r) << dendl; + return mirror_r; + } else if (mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + // a remote rbd-mirror might own the exclusive-lock on this image + // and therefore we need to disable mirroring so that it closes the image + r = disable_mirroring(ictx); + if (r < 0) { + ictx->state->close(); + return r; + } + } + + if (ictx->test_features(RBD_FEATURE_JOURNALING)) { + std::unique_lock image_locker{ictx->image_lock}; + ictx->set_journal_policy(new journal::DisabledPolicy()); + } + + ictx->owner_lock.lock_shared(); + if (ictx->exclusive_lock != 
nullptr) { + ictx->exclusive_lock->block_requests(0); + + r = ictx->operations->prepare_image_update( + exclusive_lock::OPERATION_REQUEST_TYPE_GENERAL, true); + if (r < 0) { + lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl; + ictx->owner_lock.unlock_shared(); + ictx->state->close(); + return -EBUSY; + } + } + ictx->owner_lock.unlock_shared(); + + ictx->image_lock.lock_shared(); + if (!ictx->migration_info.empty()) { + lderr(cct) << "cannot move migrating image to trash" << dendl; + ictx->image_lock.unlock_shared(); + ictx->state->close(); + return -EBUSY; + } + ictx->image_lock.unlock_shared(); + + if (mirror_r >= 0 && + mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + r = disable_mirroring(ictx); + if (r < 0) { + ictx->state->close(); + return r; + } + } + + ictx->state->close(); + } + + utime_t delete_time{ceph_clock_now()}; + utime_t deferment_end_time{delete_time}; + deferment_end_time += delay; + cls::rbd::TrashImageSpec trash_image_spec{ + static_cast(source), image_name, + delete_time, deferment_end_time}; + + trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_MOVING; + C_SaferCond ctx; + auto req = trash::MoveRequest::create(io_ctx, image_id, trash_image_spec, + &ctx); + req->send(); + + r = ctx.wait(); + trash_image_spec.state = cls::rbd::TRASH_IMAGE_STATE_NORMAL; + int ret = cls_client::trash_state_set(&io_ctx, image_id, + trash_image_spec.state, + cls::rbd::TRASH_IMAGE_STATE_MOVING); + if (ret < 0 && ret != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(ret) << dendl; + return ret; + } + if (r < 0) { + return r; + } + + C_SaferCond notify_ctx; + TrashWatcher::notify_image_added(io_ctx, image_id, trash_image_spec, + ¬ify_ctx); + r = notify_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + + return 0; +} + +template +int Trash::move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, uint64_t delay) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << &io_ctx << " name=" << image_name << dendl; + + // try to get image id from the directory + std::string image_id; + int r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name, + &image_id); + if (r == -ENOENT) { + r = io_ctx.stat(util::old_header_name(image_name), nullptr, nullptr); + if (r == 0) { + // cannot move V1 image to trash + ldout(cct, 10) << "cannot move v1 image to trash" << dendl; + return -EOPNOTSUPP; + } + + // search for an interrupted trash move request + std::map trash_image_specs; + int r = list_trash_image_specs(io_ctx, &trash_image_specs, true); + if (r < 0) { + return r; + } + if (auto found_image = + std::find_if( + trash_image_specs.begin(), trash_image_specs.end(), + [&](const auto& pair) { + const auto& spec = pair.second; + return (spec.source == cls::rbd::TRASH_IMAGE_SOURCE_USER && + spec.state == cls::rbd::TRASH_IMAGE_STATE_MOVING && + spec.name == image_name); + }); + found_image != trash_image_specs.end()) { + image_id = found_image->first; + } else { + return -ENOENT; + } + ldout(cct, 15) << "derived image id " << image_id << " from existing " + << "trash entry" << dendl; + } else if (r < 0) { + lderr(cct) << "failed to retrieve image id: " << cpp_strerror(r) << dendl; + return r; + } + + if (image_name.empty() || image_id.empty()) { + lderr(cct) << "invalid image name/id" << dendl; + return -EINVAL; + } + + return Trash::move(io_ctx, source, image_name, image_id, delay); +} + +template +int 
Trash<I>::get(IoCtx &io_ctx, const std::string &id,
+                  trash_image_info_t *info) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << __func__ << " " << &io_ctx << dendl;
+
+  cls::rbd::TrashImageSpec spec;
+  int r = cls_client::trash_get(&io_ctx, id, &spec);
+  if (r == -ENOENT) {
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "error retrieving trash entry: " << cpp_strerror(r)
+               << dendl;
+    return r;
+  }
+
+  rbd_trash_image_source_t source = static_cast<rbd_trash_image_source_t>(
+    spec.source);
+  *info = trash_image_info_t{id, spec.name, source, spec.deletion_time.sec(),
+                             spec.deferment_end_time.sec()};
+  return 0;
+}
+
+template <typename I>
+int Trash<I>::list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries,
+                   bool exclude_user_remove_source) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << __func__ << " " << &io_ctx << dendl;
+
+  std::map<std::string, cls::rbd::TrashImageSpec> trash_image_specs;
+  int r = list_trash_image_specs(io_ctx, &trash_image_specs,
+                                 exclude_user_remove_source);
+  if (r < 0) {
+    return r;
+  }
+
+  entries.reserve(trash_image_specs.size());
+  for (const auto& [image_id, spec] : trash_image_specs) {
+    rbd_trash_image_source_t source =
+      static_cast<rbd_trash_image_source_t>(spec.source);
+    entries.push_back({image_id, spec.name, source,
+                       spec.deletion_time.sec(),
+                       spec.deferment_end_time.sec()});
+  }
+
+  return 0;
+}
+
+template <typename I>
+int Trash<I>::purge(IoCtx& io_ctx, time_t expire_ts,
+                    float threshold, ProgressContext& pctx) {
+  auto *cct((CephContext *) io_ctx.cct());
+  ldout(cct, 20) << &io_ctx << dendl;
+
+  std::vector<librbd::trash_image_info_t> trash_entries;
+  int r = librbd::api::Trash<I>::list(io_ctx, trash_entries, true);
+  if (r < 0) {
+    return r;
+  }
+
+  trash_entries.erase(
+    std::remove_if(trash_entries.begin(), trash_entries.end(),
+                   [](librbd::trash_image_info_t info) {
+                     return info.source != RBD_TRASH_IMAGE_SOURCE_USER &&
+                            info.source != RBD_TRASH_IMAGE_SOURCE_USER_PARENT;
+                   }),
+    trash_entries.end());
+
+  std::set<std::string> to_be_removed;
+  if (threshold != -1) {
+    if (threshold < 0 || threshold > 1) {
+      lderr(cct) << "argument 'threshold' is out of valid range"
+                 << dendl;
+      return -EINVAL;
+    }
+
+    librados::bufferlist inbl;
+    librados::bufferlist outbl;
+    std::string pool_name = io_ctx.get_pool_name();
+
+    librados::Rados rados(io_ctx);
+    rados.mon_command(R"({"prefix": "df", "format": "json"})", inbl,
+                      &outbl, nullptr);
+
+    json_spirit::mValue json;
+    if (!json_spirit::read(outbl.to_str(), json)) {
+      lderr(cct) << "ceph df json output could not be parsed"
+                 << dendl;
+      return -EBADMSG;
+    }
+
+    json_spirit::mArray arr = json.get_obj()["pools"].get_array();
+
+    double pool_percent_used = 0;
+    uint64_t pool_total_bytes = 0;
+
+    std::map<std::string, std::vector<std::string>> datapools;
+
+    std::sort(trash_entries.begin(), trash_entries.end(),
+              [](librbd::trash_image_info_t a, librbd::trash_image_info_t b) {
+                return a.deferment_end_time < b.deferment_end_time;
+              }
+    );
+
+    for (const auto &entry : trash_entries) {
+      int64_t data_pool_id = -1;
+      r = cls_client::get_data_pool(&io_ctx, util::header_name(entry.id),
+                                    &data_pool_id);
+      if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+        lderr(cct) << "failed to query data pool: " << cpp_strerror(r) << dendl;
+        return r;
+      } else if (data_pool_id == -1) {
+        data_pool_id = io_ctx.get_id();
+      }
+
+      if (data_pool_id != io_ctx.get_id()) {
+        librados::IoCtx data_io_ctx;
+        r = util::create_ioctx(io_ctx, "image", data_pool_id,
+                               {}, &data_io_ctx);
+        if (r < 0) {
+          lderr(cct) << "error accessing data pool" << dendl;
+          continue;
+        }
+        auto data_pool = data_io_ctx.get_pool_name();
+        datapools[data_pool].push_back(entry.id);
+      } else {
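+        // no separate data pool: the image's data objects live alongside its
+        // header, so account the image under the current pool's name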
datapools[pool_name].push_back(entry.id);
+      }
+    }
+
+    uint64_t bytes_to_free = 0;
+
+    for (uint8_t i = 0; i < arr.size(); ++i) {
+      json_spirit::mObject obj = arr[i].get_obj();
+      std::string name = obj.find("name")->second.get_str();
+      auto img = datapools.find(name);
+      if (img != datapools.end()) {
+        json_spirit::mObject stats = arr[i].get_obj()["stats"].get_obj();
+        pool_percent_used = stats["percent_used"].get_real();
+        if (pool_percent_used <= threshold) continue;
+
+        bytes_to_free = 0;
+
+        pool_total_bytes = stats["max_avail"].get_uint64() +
+                           stats["bytes_used"].get_uint64();
+
+        auto bytes_threshold = (uint64_t) (pool_total_bytes *
+                                           (pool_percent_used - threshold));
+
+        for (const auto &it : img->second) {
+          auto ictx = new I("", it, nullptr, io_ctx, false);
+          r = ictx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT);
+          if (r == -ENOENT) {
+            continue;
+          } else if (r < 0) {
+            lderr(cct) << "failed to open image " << it << ": "
+                       << cpp_strerror(r) << dendl;
+          }
+
+          r = librbd::api::DiffIterate<I>::diff_iterate(
+            ictx, cls::rbd::UserSnapshotNamespace(), nullptr, 0, ictx->size,
+            false, true,
+            [](uint64_t offset, size_t len, int exists, void *arg) {
+              auto *to_free = reinterpret_cast<uint64_t*>(arg);
+              if (exists)
+                (*to_free) += len;
+              return 0;
+            }, &bytes_to_free);
+
+          ictx->state->close();
+          if (r < 0) {
+            lderr(cct) << "failed to calculate disk usage for image " << it
+                       << ": " << cpp_strerror(r) << dendl;
+            continue;
+          }
+
+          to_be_removed.insert(it);
+          if (bytes_to_free >= bytes_threshold) {
+            break;
+          }
+        }
+      }
+    }
+
+    if (bytes_to_free == 0) {
+      ldout(cct, 10) << "pool usage is lower than or equal to "
+                     << (threshold * 100)
+                     << "%" << dendl;
+      return 0;
+    }
+  }
+
+  if (expire_ts == 0) {
+    struct timespec now;
+    clock_gettime(CLOCK_REALTIME, &now);
+    expire_ts = now.tv_sec;
+  }
+
+  for (const auto &entry : trash_entries) {
+    if (expire_ts >= entry.deferment_end_time) {
+      to_be_removed.insert(entry.id);
+    }
+  }
+
+  NoOpProgressContext remove_pctx;
+  uint64_t list_size = to_be_removed.size(), i = 0;
+  int remove_err = 1;
+  while (!to_be_removed.empty() && remove_err == 1) {
+    remove_err = 0;
+    for (auto it = to_be_removed.begin(); it != to_be_removed.end(); ) {
+      trash_image_info_t trash_info;
+      r = Trash<I>::get(io_ctx, *it, &trash_info);
+      if (r == -ENOENT) {
+        // likely RBD_TRASH_IMAGE_SOURCE_USER_PARENT image removed as a side
+        // effect of a preceding remove (last child detach)
+        pctx.update_progress(++i, list_size);
+        it = to_be_removed.erase(it);
+        continue;
+      } else if (r < 0) {
+        lderr(cct) << "error getting image id " << *it
+                   << " info: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      r = Trash<I>::remove(io_ctx, *it, true, remove_pctx);
+      if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK || r == -EUCLEAN) {
+        if (!remove_err) {
+          remove_err = r;
+        }
+        ++it;
+        continue;
+      } else if (r < 0) {
+        lderr(cct) << "error removing image id " << *it
+                   << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+      pctx.update_progress(++i, list_size);
+      it = to_be_removed.erase(it);
+      remove_err = 1;
+    }
+    ldout(cct, 20) << "remove_err=" << remove_err << dendl;
+  }
+
+  if (!to_be_removed.empty()) {
+    ceph_assert(remove_err < 0);
+    ldout(cct, 10) << "couldn't remove " << to_be_removed.size()
+                   << " expired images" << dendl;
+    return remove_err;
+  }
+
+  return 0;
+}
+
+template <typename I>
+int Trash<I>::remove(IoCtx &io_ctx, const std::string &image_id, bool force,
+                     ProgressContext& prog_ctx) {
+  CephContext *cct((CephContext *)io_ctx.cct());
+  ldout(cct, 20) << "trash_remove " << &io_ctx << " " << image_id
+                 << " " <<
force << dendl; + + cls::rbd::TrashImageSpec trash_spec; + int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec); + if (r < 0) { + lderr(cct) << "error getting image id " << image_id + << " info from trash: " << cpp_strerror(r) << dendl; + return r; + } + + utime_t now = ceph_clock_now(); + if (now < trash_spec.deferment_end_time && !force) { + lderr(cct) << "error: deferment time has not expired." << dendl; + return -EPERM; + } + if (trash_spec.state == cls::rbd::TRASH_IMAGE_STATE_MOVING) { + lderr(cct) << "error: image is pending moving to the trash." + << dendl; + return -EUCLEAN; + } else if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL && + trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_REMOVING) { + lderr(cct) << "error: image is pending restoration." << dendl; + return -EBUSY; + } + + AsioEngine asio_engine(io_ctx); + + C_SaferCond cond; + auto req = librbd::trash::RemoveRequest::create( + io_ctx, image_id, asio_engine.get_work_queue(), force, prog_ctx, &cond); + req->send(); + + r = cond.wait(); + if (r < 0) { + return r; + } + + C_SaferCond notify_ctx; + TrashWatcher::notify_image_removed(io_ctx, image_id, ¬ify_ctx); + r = notify_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + + return 0; +} + +template +int Trash::restore(librados::IoCtx &io_ctx, + const TrashImageSources& trash_image_sources, + const std::string &image_id, + const std::string &image_new_name) { + CephContext *cct((CephContext *)io_ctx.cct()); + ldout(cct, 20) << "trash_restore " << &io_ctx << " " << image_id << " " + << image_new_name << dendl; + + cls::rbd::TrashImageSpec trash_spec; + int r = cls_client::trash_get(&io_ctx, image_id, &trash_spec); + if (r < 0) { + lderr(cct) << "error getting image id " << image_id + << " info from trash: " << cpp_strerror(r) << dendl; + return r; + } + + if (trash_image_sources.count(trash_spec.source) == 0) { + lderr(cct) << "Current trash source '" << trash_spec.source << "' " + << "does not match expected: " + << trash_image_sources << dendl; + return -EINVAL; + } + + std::string image_name = image_new_name; + if (trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL && + trash_spec.state != cls::rbd::TRASH_IMAGE_STATE_RESTORING) { + lderr(cct) << "error restoring image id " << image_id + << ", which is pending deletion" << dendl; + return -EBUSY; + } + r = cls_client::trash_state_set(&io_ctx, image_id, + cls::rbd::TRASH_IMAGE_STATE_RESTORING, + cls::rbd::TRASH_IMAGE_STATE_NORMAL); + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(r) << dendl; + return r; + } + + if (image_name.empty()) { + // if user didn't specify a new name, let's try using the old name + image_name = trash_spec.name; + ldout(cct, 20) << "restoring image id " << image_id << " with name " + << image_name << dendl; + } + + // check if no image exists with the same name + bool create_id_obj = true; + std::string existing_id; + r = cls_client::get_id(&io_ctx, util::id_obj_name(image_name), &existing_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error checking if image " << image_name << " exists: " + << cpp_strerror(r) << dendl; + int ret = cls_client::trash_state_set(&io_ctx, image_id, + cls::rbd::TRASH_IMAGE_STATE_NORMAL, + cls::rbd::TRASH_IMAGE_STATE_RESTORING); + if (ret < 0 && ret != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(ret) << dendl; + } + return r; + } else if (r != -ENOENT){ + // checking if we are 
recovering from an incomplete restore + if (existing_id != image_id) { + ldout(cct, 2) << "an image with the same name already exists" << dendl; + int r2 = cls_client::trash_state_set(&io_ctx, image_id, + cls::rbd::TRASH_IMAGE_STATE_NORMAL, + cls::rbd::TRASH_IMAGE_STATE_RESTORING); + if (r2 < 0 && r2 != -EOPNOTSUPP) { + lderr(cct) << "error setting trash image state: " + << cpp_strerror(r2) << dendl; + } + return -EEXIST; + } + create_id_obj = false; + } + + if (create_id_obj) { + ldout(cct, 2) << "adding id object" << dendl; + librados::ObjectWriteOperation op; + op.create(true); + cls_client::set_id(&op, image_id); + r = io_ctx.operate(util::id_obj_name(image_name), &op); + if (r < 0) { + lderr(cct) << "error adding id object for image " << image_name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + ldout(cct, 2) << "adding rbd image to v2 directory..." << dendl; + r = cls_client::dir_add_image(&io_ctx, RBD_DIRECTORY, image_name, + image_id); + if (r < 0 && r != -EEXIST) { + lderr(cct) << "error adding image to v2 directory: " + << cpp_strerror(r) << dendl; + return r; + } + + r = enable_mirroring(io_ctx, image_id); + if (r < 0) { + // not fatal -- ignore + } + + ldout(cct, 2) << "removing image from trash..." << dendl; + r = cls_client::trash_remove(&io_ctx, image_id); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error removing image id " << image_id << " from trash: " + << cpp_strerror(r) << dendl; + return r; + } + + C_SaferCond notify_ctx; + TrashWatcher::notify_image_removed(io_ctx, image_id, ¬ify_ctx); + r = notify_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to send update notification: " << cpp_strerror(r) + << dendl; + } + + return 0; +} + +} // namespace api +} // namespace librbd + +template class librbd::api::Trash; diff --git a/src/librbd/api/Trash.h b/src/librbd/api/Trash.h new file mode 100644 index 000000000..66f819dfa --- /dev/null +++ b/src/librbd/api/Trash.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_API_TRASH_H +#define LIBRBD_API_TRASH_H + +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include +#include +#include + +namespace librbd { + +class ProgressContext; + +struct ImageCtx; + +namespace api { + +template +struct Trash { + typedef std::set TrashImageSources; + static const TrashImageSources ALLOWED_RESTORE_SOURCES; + + static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, uint64_t delay); + static int move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source, + const std::string &image_name, const std::string &image_id, + uint64_t delay); + static int get(librados::IoCtx &io_ctx, const std::string &id, + trash_image_info_t *info); + static int list(librados::IoCtx &io_ctx, + std::vector &entries, + bool exclude_user_remove_source); + static int purge(IoCtx& io_ctx, time_t expire_ts, + float threshold, ProgressContext& pctx); + static int remove(librados::IoCtx &io_ctx, const std::string &image_id, + bool force, ProgressContext& prog_ctx); + static int restore(librados::IoCtx &io_ctx, + const TrashImageSources& trash_image_sources, + const std::string &image_id, + const std::string &image_new_name); + +}; + +} // namespace api +} // namespace librbd + +extern template class librbd::api::Trash; + +#endif // LIBRBD_API_TRASH_H diff --git a/src/librbd/api/Utils.cc b/src/librbd/api/Utils.cc new file mode 100644 index 
000000000..056b6b435 --- /dev/null +++ b/src/librbd/api/Utils.cc @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/api/Utils.h" +#include "common/dout.h" + +#if defined(HAVE_LIBCRYPTSETUP) +#include "librbd/crypto/luks/LUKSEncryptionFormat.h" +#endif + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::api::util: " << __func__ << ": " + +namespace librbd { +namespace api { +namespace util { + +template +int create_encryption_format( + CephContext* cct, encryption_format_t format, + encryption_options_t opts, size_t opts_size, bool c_api, + crypto::EncryptionFormat** result_format) { + size_t expected_opts_size; + switch (format) { +#if defined(HAVE_LIBCRYPTSETUP) + case RBD_ENCRYPTION_FORMAT_LUKS1: { + if (c_api) { + expected_opts_size = sizeof(rbd_encryption_luks1_format_options_t); + if (expected_opts_size == opts_size) { + auto c_opts = (rbd_encryption_luks1_format_options_t*)opts; + *result_format = new crypto::luks::LUKS1EncryptionFormat( + c_opts->alg, {c_opts->passphrase, c_opts->passphrase_size}); + } + } else { + expected_opts_size = sizeof(encryption_luks1_format_options_t); + if (expected_opts_size == opts_size) { + auto cpp_opts = (encryption_luks1_format_options_t*)opts; + *result_format = new crypto::luks::LUKS1EncryptionFormat( + cpp_opts->alg, cpp_opts->passphrase); + } + } + break; + } + case RBD_ENCRYPTION_FORMAT_LUKS2: { + if (c_api) { + expected_opts_size = sizeof(rbd_encryption_luks2_format_options_t); + if (expected_opts_size == opts_size) { + auto c_opts = (rbd_encryption_luks2_format_options_t*)opts; + *result_format = new crypto::luks::LUKS2EncryptionFormat( + c_opts->alg, {c_opts->passphrase, c_opts->passphrase_size}); + } + } else { + expected_opts_size = sizeof(encryption_luks2_format_options_t); + if (expected_opts_size == opts_size) { + auto cpp_opts = (encryption_luks2_format_options_t*)opts; + *result_format = new crypto::luks::LUKS2EncryptionFormat( + cpp_opts->alg, cpp_opts->passphrase); + } + } + break; + } + case RBD_ENCRYPTION_FORMAT_LUKS: { + if (c_api) { + expected_opts_size = sizeof(rbd_encryption_luks_format_options_t); + if (expected_opts_size == opts_size) { + auto c_opts = (rbd_encryption_luks_format_options_t*)opts; + *result_format = new crypto::luks::LUKSEncryptionFormat( + {c_opts->passphrase, c_opts->passphrase_size}); + } + } else { + expected_opts_size = sizeof(encryption_luks_format_options_t); + if (expected_opts_size == opts_size) { + auto cpp_opts = (encryption_luks_format_options_t*)opts; + *result_format = new crypto::luks::LUKSEncryptionFormat( + cpp_opts->passphrase); + } + } + break; + } +#endif + default: + lderr(cct) << "unsupported encryption format: " << format << dendl; + return -ENOTSUP; + } + + if (expected_opts_size != opts_size) { + lderr(cct) << "expected opts_size: " << expected_opts_size << dendl; + return -EINVAL; + } + + return 0; +} + +} // namespace util +} // namespace api +} // namespace librbd + +template int librbd::api::util::create_encryption_format( + CephContext* cct, encryption_format_t format, encryption_options_t opts, + size_t opts_size, bool c_api, + crypto::EncryptionFormat** result_format); diff --git a/src/librbd/api/Utils.h b/src/librbd/api/Utils.h new file mode 100644 index 000000000..8f8c22290 --- /dev/null +++ b/src/librbd/api/Utils.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_LIBRBD_API_UTILS_H +#define CEPH_LIBRBD_API_UTILS_H + +#include "include/rbd/librbd.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/crypto/EncryptionFormat.h" + +namespace librbd { + +struct ImageCtx; + +namespace api { +namespace util { + +template +int create_encryption_format( + CephContext* cct, encryption_format_t format, + encryption_options_t opts, size_t opts_size, bool c_api, + crypto::EncryptionFormat** result_format); + +} // namespace util +} // namespace api +} // namespace librbd + +#endif // CEPH_LIBRBD_API_UTILS_H diff --git a/src/librbd/asio/ContextWQ.cc b/src/librbd/asio/ContextWQ.cc new file mode 100644 index 000000000..4f6c72770 --- /dev/null +++ b/src/librbd/asio/ContextWQ.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/asio/ContextWQ.h" +#include "include/Context.h" +#include "common/Cond.h" +#include "common/dout.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::asio::ContextWQ: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace asio { + +ContextWQ::ContextWQ(CephContext* cct, boost::asio::io_context& io_context) + : m_cct(cct), m_io_context(io_context), + m_strand(std::make_unique(io_context)), + m_queued_ops(0) { + ldout(m_cct, 20) << dendl; +} + +ContextWQ::~ContextWQ() { + ldout(m_cct, 20) << dendl; + drain(); + m_strand.reset(); +} + +void ContextWQ::drain() { + ldout(m_cct, 20) << dendl; + C_SaferCond ctx; + drain_handler(&ctx); + ctx.wait(); +} + +void ContextWQ::drain_handler(Context* ctx) { + if (m_queued_ops == 0) { + ctx->complete(0); + return; + } + + // new items might be queued while we are trying to drain, so we + // might need to post the handler multiple times + boost::asio::post(*m_strand, [this, ctx]() { drain_handler(ctx); }); +} + +} // namespace asio +} // namespace librbd diff --git a/src/librbd/asio/ContextWQ.h b/src/librbd/asio/ContextWQ.h new file mode 100644 index 000000000..85c254161 --- /dev/null +++ b/src/librbd/asio/ContextWQ.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_ASIO_CONTEXT_WQ_H +#define CEPH_LIBRBD_ASIO_CONTEXT_WQ_H + +#include "include/common_fwd.h" +#include "include/Context.h" +#include +#include +#include +#include +#include + +namespace librbd { +namespace asio { + +class ContextWQ { +public: + explicit ContextWQ(CephContext* cct, boost::asio::io_context& io_context); + ~ContextWQ(); + + void drain(); + + void queue(Context *ctx, int r = 0) { + ++m_queued_ops; + + // ensure all legacy ContextWQ users are dispatched sequentially for + // backwards compatibility (i.e. 
might not be concurrent thread-safe) + boost::asio::post(*m_strand, [this, ctx, r]() { + ctx->complete(r); + + ceph_assert(m_queued_ops > 0); + --m_queued_ops; + }); + } + +private: + CephContext* m_cct; + boost::asio::io_context& m_io_context; + std::unique_ptr m_strand; + + std::atomic m_queued_ops; + + void drain_handler(Context* ctx); + +}; + +} // namespace asio +} // namespace librbd + +#endif // CEPH_LIBRBD_ASIO_CONTEXT_WQ_H diff --git a/src/librbd/asio/Utils.h b/src/librbd/asio/Utils.h new file mode 100644 index 000000000..2fbbb5846 --- /dev/null +++ b/src/librbd/asio/Utils.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_ASIO_UTILS_H +#define CEPH_LIBRBD_ASIO_UTILS_H + +#include "include/Context.h" +#include "include/rados/librados_fwd.hpp" +#include + +namespace librbd { +namespace asio { +namespace util { + +template +auto get_context_adapter(T&& t) { + return [t = std::move(t)](boost::system::error_code ec) { + t->complete(-ec.value()); + }; +} + +template +auto get_callback_adapter(T&& t) { + return [t = std::move(t)](boost::system::error_code ec, auto&& ... args) { + t(-ec.value(), std::forward(args)...); + }; +} + +} // namespace util +} // namespace asio +} // namespace librbd + +#endif // CEPH_LIBRBD_ASIO_UTILS_H diff --git a/src/librbd/cache/ImageWriteback.cc b/src/librbd/cache/ImageWriteback.cc new file mode 100644 index 000000000..40422534b --- /dev/null +++ b/src/librbd/cache/ImageWriteback.cc @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ImageWriteback.h" +#include "include/buffer.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ReadResult.h" + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ImageWriteback: " << __func__ << ": " + +namespace librbd { +namespace cache { + +template +ImageWriteback::ImageWriteback(I &image_ctx) : m_image_ctx(image_ctx) { +} + +template +void ImageWriteback::aio_read(Extents &&image_extents, bufferlist *bl, + int fadvise_flags, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx); + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, image_ctx, io::AIO_TYPE_READ); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_read( + *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, + std::move(image_extents), io::ImageArea::DATA, io::ReadResult{bl}, + image_ctx->get_data_io_context(), fadvise_flags, 0, trace); + req->send(); +} + +template +void ImageWriteback::aio_write(Extents &&image_extents, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx); + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, image_ctx, io::AIO_TYPE_WRITE); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_write( + *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, + std::move(image_extents), 
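+      // issued at the WRITEBACK_CACHE dispatch layer so the request flows
+      // only through the layers beneath the cache and cannot re-enter it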
io::ImageArea::DATA, std::move(bl), + fadvise_flags, trace); + req->send(); +} + +template +void ImageWriteback::aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "offset=" << offset << ", " + << "length=" << length << ", " + << "on_finish=" << on_finish << dendl; + + ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx); + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, image_ctx, io::AIO_TYPE_DISCARD); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_discard( + *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, + {{offset, length}}, io::ImageArea::DATA, discard_granularity_bytes, trace); + req->send(); +} + +template +void ImageWriteback::aio_flush(io::FlushSource flush_source, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "on_finish=" << on_finish << dendl; + + ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx); + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, image_ctx, io::AIO_TYPE_FLUSH); + + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_flush( + *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, + flush_source, trace); + req->send(); +} + +template +void ImageWriteback::aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "offset=" << offset << ", " + << "length=" << length << ", " + << "data_len=" << bl.length() << ", " + << "on_finish=" << on_finish << dendl; + + ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx); + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, image_ctx, io::AIO_TYPE_WRITESAME); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_write_same( + *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, + {{offset, length}}, io::ImageArea::DATA, std::move(bl), + fadvise_flags, trace); + req->send(); +} + +template +void ImageWriteback::aio_compare_and_write(Extents &&image_extents, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "image_extents=" << image_extents << ", " + << "on_finish=" << on_finish << dendl; + + ImageCtx *image_ctx = util::get_image_ctx(&m_image_ctx); + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, image_ctx, io::AIO_TYPE_COMPARE_AND_WRITE); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_compare_and_write( + *image_ctx, io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, aio_comp, + std::move(image_extents), io::ImageArea::DATA, std::move(cmp_bl), + std::move(bl), mismatch_offset, fadvise_flags, trace); + req->send(); +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::ImageWriteback; + diff --git a/src/librbd/cache/ImageWriteback.h b/src/librbd/cache/ImageWriteback.h new file mode 100644 index 000000000..3f62391e4 --- /dev/null +++ b/src/librbd/cache/ImageWriteback.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK +#define CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK + +#include "include/buffer_fwd.h" +#include "include/int_types.h" +#include "librbd/io/Types.h" +#include + +class Context; + +namespace librbd { + +struct ImageCtx; + 
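+// ImageWriteback re-issues cache writeback requests through the standard
+// image dispatch chain at the WRITEBACK_CACHE layer. A minimal usage sketch
+// (illustrative only; assumes an already-open librbd::ImageCtx* ictx):
+//
+//   librbd::cache::ImageWriteback<librbd::ImageCtx> writeback(*ictx);
+//   ceph::bufferlist bl;
+//   bl.append(std::string(512, 'x'));
+//   C_SaferCond on_finish;
+//   writeback.aio_write({{0, 512}}, std::move(bl), 0, &on_finish);
+//   int r = on_finish.wait();  // 0 on success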
+namespace cache { + +class ImageWritebackInterface { +public: + typedef std::vector > Extents; + virtual ~ImageWritebackInterface() { + } + virtual void aio_read(Extents &&image_extents, ceph::bufferlist *bl, + int fadvise_flags, Context *on_finish) = 0; + virtual void aio_write(Extents &&image_extents, ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) = 0; + virtual void aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, Context *on_finish) = 0; + virtual void aio_flush(io::FlushSource flush_source, Context *on_finish) = 0 ; + virtual void aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish) = 0; + virtual void aio_compare_and_write(Extents &&image_extents, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, Context *on_finish) = 0; +}; + +/** + * client-side, image extent cache writeback handler + */ +template +class ImageWriteback : public ImageWritebackInterface { +public: + using ImageWritebackInterface::Extents; + + explicit ImageWriteback(ImageCtxT &image_ctx); + + void aio_read(Extents &&image_extents, ceph::bufferlist *bl, + int fadvise_flags, Context *on_finish); + void aio_write(Extents &&image_extents, ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish); + void aio_discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, Context *on_finish); + void aio_flush(io::FlushSource flush_source, Context *on_finish); + void aio_writesame(uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish); + void aio_compare_and_write(Extents &&image_extents, + ceph::bufferlist&& cmp_bl, + ceph::bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, Context *on_finish); +private: + ImageCtxT &m_image_ctx; + +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::ImageWriteback; + +#endif // CEPH_LIBRBD_CACHE_IMAGE_WRITEBACK diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.cc b/src/librbd/cache/ObjectCacherObjectDispatch.cc new file mode 100644 index 000000000..822a053e1 --- /dev/null +++ b/src/librbd/cache/ObjectCacherObjectDispatch.cc @@ -0,0 +1,486 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/ObjectCacherObjectDispatch.h" +#include "include/neorados/RADOS.hpp" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/ObjectCacherWriteback.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include "librbd/io/Utils.h" +#include "osd/osd_types.h" +#include "osdc/WritebackHandler.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::ObjectCacherObjectDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { + +using librbd::util::data_object_name; + +namespace { + +typedef std::vector ObjectExtents; + +} // anonymous namespace + +template +struct ObjectCacherObjectDispatch::C_InvalidateCache : public Context { + ObjectCacherObjectDispatch* dispatcher; + bool purge_on_error; + Context *on_finish; + + C_InvalidateCache(ObjectCacherObjectDispatch* dispatcher, + bool purge_on_error, Context *on_finish) + : 
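+      // finish() below runs with the cache lock held: it purges on
+      // blocklisting (or on any error when purge_on_error is set), then
+      // releases clean objects and reports -EBUSY if any bytes remain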
dispatcher(dispatcher), purge_on_error(purge_on_error), + on_finish(on_finish) { + } + + void finish(int r) override { + ceph_assert(ceph_mutex_is_locked(dispatcher->m_cache_lock)); + auto cct = dispatcher->m_image_ctx->cct; + + if (r == -EBLOCKLISTED) { + lderr(cct) << "blocklisted during flush (purging)" << dendl; + dispatcher->m_object_cacher->purge_set(dispatcher->m_object_set); + } else if (r < 0 && purge_on_error) { + lderr(cct) << "failed to invalidate cache (purging): " + << cpp_strerror(r) << dendl; + dispatcher->m_object_cacher->purge_set(dispatcher->m_object_set); + } else if (r != 0) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) << dendl; + } + + auto unclean = dispatcher->m_object_cacher->release_set( + dispatcher->m_object_set); + if (unclean == 0) { + r = 0; + } else { + lderr(cct) << "could not release all objects from cache: " + << unclean << " bytes remain" << dendl; + if (r == 0) { + r = -EBUSY; + } + } + + on_finish->complete(r); + } +}; + +template +ObjectCacherObjectDispatch::ObjectCacherObjectDispatch( + I* image_ctx, size_t max_dirty, bool writethrough_until_flush) + : m_image_ctx(image_ctx), m_max_dirty(max_dirty), + m_writethrough_until_flush(writethrough_until_flush), + m_cache_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::ObjectCacherObjectDispatch::cache_lock", this))) { + ceph_assert(m_image_ctx->data_ctx.is_valid()); +} + +template +ObjectCacherObjectDispatch::~ObjectCacherObjectDispatch() { + delete m_object_cacher; + delete m_object_set; + + delete m_writeback_handler; +} + +template +void ObjectCacherObjectDispatch::init() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_cache_lock.lock(); + ldout(cct, 5) << "enabling caching..." << dendl; + m_writeback_handler = new ObjectCacherWriteback(m_image_ctx, m_cache_lock); + + auto init_max_dirty = m_max_dirty; + if (m_writethrough_until_flush) { + init_max_dirty = 0; + } + + auto cache_size = + m_image_ctx->config.template get_val("rbd_cache_size"); + auto target_dirty = + m_image_ctx->config.template get_val("rbd_cache_target_dirty"); + auto max_dirty_age = + m_image_ctx->config.template get_val("rbd_cache_max_dirty_age"); + auto block_writes_upfront = + m_image_ctx->config.template get_val("rbd_cache_block_writes_upfront"); + auto max_dirty_object = + m_image_ctx->config.template get_val("rbd_cache_max_dirty_object"); + + ldout(cct, 5) << "Initial cache settings:" + << " size=" << cache_size + << " num_objects=" << 10 + << " max_dirty=" << init_max_dirty + << " target_dirty=" << target_dirty + << " max_dirty_age=" << max_dirty_age << dendl; + + m_object_cacher = new ObjectCacher(cct, m_image_ctx->perfcounter->get_name(), + *m_writeback_handler, m_cache_lock, + nullptr, nullptr, cache_size, + 10, /* reset this in init */ + init_max_dirty, target_dirty, + max_dirty_age, block_writes_upfront); + + // size object cache appropriately + if (max_dirty_object == 0) { + max_dirty_object = std::min( + 2000, std::max(10, cache_size / 100 / + sizeof(ObjectCacher::Object))); + } + ldout(cct, 5) << " cache bytes " << cache_size + << " -> about " << max_dirty_object << " objects" << dendl; + m_object_cacher->set_max_objects(max_dirty_object); + + m_object_set = new ObjectCacher::ObjectSet(nullptr, + m_image_ctx->data_ctx.get_id(), 0); + m_object_cacher->start(); + m_cache_lock.unlock(); + + // add ourself to the IO object dispatcher chain + if (m_max_dirty > 0) { + m_image_ctx->disable_zero_copy = true; + } + m_image_ctx->io_object_dispatcher->register_dispatch(this); 
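+  // with writeback enabled the cache may still reference caller buffers
+  // after a write completes, hence zero-copy was disabled above whenever
+  // dirty data is allowed (m_max_dirty > 0)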
+}
+
+template <typename I>
+void ObjectCacherObjectDispatch<I>::shut_down(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  // chain shut down in reverse order
+
+  // shut down the cache
+  on_finish = new LambdaContext([this, on_finish](int r) {
+    m_object_cacher->stop();
+    on_finish->complete(r);
+  });
+
+  // ensure we aren't holding the cache lock post-flush
+  on_finish = util::create_async_context_callback(*m_image_ctx, on_finish);
+
+  // invalidate any remaining cache entries
+  on_finish = new C_InvalidateCache(this, true, on_finish);
+
+  // flush all pending writeback state
+  std::lock_guard locker{m_cache_lock};
+  m_object_cacher->release_set(m_object_set);
+  m_object_cacher->flush_set(m_object_set, on_finish);
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::read(
+    uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+    int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+    uint64_t* version, int* object_dispatch_flags,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  // IO chained in reverse order
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;
+
+  if (extents->size() == 0) {
+    ldout(cct, 20) << "no extents to read" << dendl;
+    return false;
+  }
+
+  if (version != nullptr) {
+    // we currently don't cache read versions
+    // and don't support reading more than one extent
+    return false;
+  }
+
+  // ensure we aren't holding the cache lock post-read
+  on_dispatched = util::create_async_context_callback(*m_image_ctx,
+                                                      on_dispatched);
+
+  // embed the RBD-internal read flags in the generic RADOS op_flags
+  op_flags = ((op_flags & ~ObjectCacherWriteback::READ_FLAGS_MASK) |
+              ((read_flags << ObjectCacherWriteback::READ_FLAGS_SHIFT) &
+               ObjectCacherWriteback::READ_FLAGS_MASK));
+
+  ceph::bufferlist* bl;
+  if (extents->size() > 1) {
+    auto req = new io::ReadResult::C_ObjectReadMergedExtents(
+      cct, extents, on_dispatched);
+    on_dispatched = req;
+    bl = &req->bl;
+  } else {
+    bl = &extents->front().bl;
+  }
+
+  m_image_ctx->image_lock.lock_shared();
+  auto rd = m_object_cacher->prepare_read(
+    io_context->read_snap().value_or(CEPH_NOSNAP), bl, op_flags);
+  m_image_ctx->image_lock.unlock_shared();
+
+  uint64_t off = 0;
+  for (auto& read_extent: *extents) {
+    ObjectExtent extent(data_object_name(m_image_ctx, object_no), object_no,
+                        read_extent.offset, read_extent.length, 0);
+    extent.oloc.pool = m_image_ctx->data_ctx.get_id();
+    extent.buffer_extents.push_back({off, read_extent.length});
+    rd->extents.push_back(extent);
+    off += read_extent.length;
+  }
+
+  ZTracer::Trace trace(parent_trace);
+  *dispatch_result = io::DISPATCH_RESULT_COMPLETE;
+
+  m_cache_lock.lock();
+  int r = m_object_cacher->readx(rd, m_object_set, on_dispatched, &trace);
+  m_cache_lock.unlock();
+  if (r != 0) {
+    on_dispatched->complete(r);
+  }
+  return true;
+}
+
+template <typename I>
+bool ObjectCacherObjectDispatch<I>::discard(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~"
+                 << object_len << dendl;
+
+  ObjectExtents object_extents;
+  object_extents.emplace_back(data_object_name(m_image_ctx, object_no),
+                              object_no, object_off, object_len, 0);
+
+  //
discard the cache state after changes are committed to disk (and to + // prevent races w/ readahead) + auto ctx = *on_finish; + *on_finish = new LambdaContext( + [this, object_extents, ctx](int r) { + m_cache_lock.lock(); + m_object_cacher->discard_set(m_object_set, object_extents); + m_cache_lock.unlock(); + + ctx->complete(r); + }); + + // ensure we aren't holding the cache lock post-write + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + + // ensure any in-flight writeback is complete before advancing + // the discard request + std::lock_guard locker{m_cache_lock}; + m_object_cacher->discard_writeback(m_object_set, object_extents, + on_dispatched); + return true; +} + +template +bool ObjectCacherObjectDispatch::write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << data.length() << dendl; + + // ensure we aren't holding the cache lock post-write + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + // cache layer does not handle version checking + if (assert_version.has_value() || + (write_flags & io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) { + ObjectExtents object_extents; + object_extents.emplace_back(data_object_name(m_image_ctx, object_no), + object_no, object_off, data.length(), 0); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + + // ensure any in-flight writeback is complete before advancing + // the write request + std::lock_guard locker{m_cache_lock}; + m_object_cacher->discard_writeback(m_object_set, object_extents, + on_dispatched); + return true; + } + + SnapContext snapc; + if (io_context->write_snap_context()) { + auto write_snap_context = *io_context->write_snap_context(); + snapc = SnapContext(write_snap_context.first, + {write_snap_context.second.begin(), + write_snap_context.second.end()}); + } + + m_image_ctx->image_lock.lock_shared(); + ObjectCacher::OSDWrite *wr = m_object_cacher->prepare_write( + snapc, data, ceph::real_time::min(), op_flags, *journal_tid); + m_image_ctx->image_lock.unlock_shared(); + + ObjectExtent extent(data_object_name(m_image_ctx, object_no), + object_no, object_off, data.length(), 0); + extent.oloc.pool = m_image_ctx->data_ctx.get_id(); + extent.buffer_extents.push_back({0, data.length()}); + wr->extents.push_back(extent); + + ZTracer::Trace trace(parent_trace); + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + + std::lock_guard locker{m_cache_lock}; + m_object_cacher->writex(wr, m_object_set, on_dispatched, &trace); + return true; +} + +template +bool ObjectCacherObjectDispatch::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << object_len << dendl; + + // ObjectCacher doesn't support write-same so 
convert to regular write + io::LightweightObjectExtent extent(object_no, object_off, object_len, 0); + extent.buffer_extents = std::move(buffer_extents); + + bufferlist ws_data; + io::util::assemble_write_same_extent(extent, data, &ws_data, true); + + return write(object_no, object_off, std::move(ws_data), io_context, op_flags, + 0, std::nullopt, parent_trace, object_dispatch_flags, + journal_tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool ObjectCacherObjectDispatch::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << object_off << "~" + << cmp_data.length() << dendl; + + // pass-through the compare-and-write request since it's not a supported + // operation of the ObjectCacher + + ObjectExtents object_extents; + object_extents.emplace_back(data_object_name(m_image_ctx, object_no), + object_no, object_off, cmp_data.length(), 0); + + // if compare succeeds, discard the cache state after changes are + // committed to disk + auto ctx = *on_finish; + *on_finish = new LambdaContext( + [this, object_extents, ctx](int r) { + // ObjectCacher doesn't provide a way to reliably invalidate + // extents: in case of a racing read (if the bh is in RX state), + // release_set() just returns while discard_set() populates the + // extent with zeroes. Neither is OK but the latter is better + // because it is at least deterministic... + if (r == 0) { + m_cache_lock.lock(); + m_object_cacher->discard_set(m_object_set, object_extents); + m_cache_lock.unlock(); + } + + ctx->complete(r); + }); + + // ensure we aren't holding the cache lock post-flush + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + // flush any pending writes from the cache before compare + ZTracer::Trace trace(parent_trace); + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + + std::lock_guard cache_locker{m_cache_lock}; + m_object_cacher->flush_set(m_object_set, object_extents, &trace, + on_dispatched); + return true; +} + +template +bool ObjectCacherObjectDispatch::flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + // ensure we aren't holding the cache lock post-flush + on_dispatched = util::create_async_context_callback(*m_image_ctx, + on_dispatched); + + std::lock_guard locker{m_cache_lock}; + if (flush_source == io::FLUSH_SOURCE_USER && !m_user_flushed) { + m_user_flushed = true; + if (m_writethrough_until_flush && m_max_dirty > 0) { + m_object_cacher->set_max_dirty(m_max_dirty); + ldout(cct, 5) << "saw first user flush, enabling writeback" << dendl; + } + } + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + m_object_cacher->flush_set(m_object_set, on_dispatched); + return true; +} + +template +bool ObjectCacherObjectDispatch::invalidate_cache(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // ensure we aren't holding the cache lock post-flush + on_finish = util::create_async_context_callback(*m_image_ctx, on_finish); + + // invalidate any remaining 
cache entries + on_finish = new C_InvalidateCache(this, false, on_finish); + + std::lock_guard locker{m_cache_lock}; + m_object_cacher->release_set(m_object_set); + m_object_cacher->flush_set(m_object_set, on_finish); + return true; +} + +template +bool ObjectCacherObjectDispatch::reset_existence_cache( + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + std::lock_guard locker{m_cache_lock}; + m_object_cacher->clear_nonexistence(m_object_set); + return false; +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::ObjectCacherObjectDispatch; diff --git a/src/librbd/cache/ObjectCacherObjectDispatch.h b/src/librbd/cache/ObjectCacherObjectDispatch.h new file mode 100644 index 000000000..0cc87bd87 --- /dev/null +++ b/src/librbd/cache/ObjectCacherObjectDispatch.h @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H + +#include "librbd/io/ObjectDispatchInterface.h" +#include "common/ceph_mutex.h" +#include "osdc/ObjectCacher.h" + +struct WritebackHandler; + +namespace librbd { + +class ImageCtx; + +namespace cache { + +/** + * Facade around the OSDC object cacher to make it align with + * the object dispatcher interface + */ +template +class ObjectCacherObjectDispatch : public io::ObjectDispatchInterface { +public: + static ObjectCacherObjectDispatch* create(ImageCtxT* image_ctx, + size_t max_dirty, + bool writethrough_until_flush) { + return new ObjectCacherObjectDispatch(image_ctx, max_dirty, + writethrough_until_flush); + } + + ObjectCacherObjectDispatch(ImageCtxT* image_ctx, size_t max_dirty, + bool writethrough_until_flush); + ~ObjectCacherObjectDispatch() override; + + io::ObjectDispatchLayer get_dispatch_layer() const override { + return io::OBJECT_DISPATCH_LAYER_CACHE; + } + + void init(); + void shut_down(Context* on_finish) override; + + bool read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace 
&parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override; + bool reset_existence_cache(Context* on_finish) override; + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + } + + int prepare_copyup( + uint64_t object_no, + io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + +private: + struct C_InvalidateCache; + + ImageCtxT* m_image_ctx; + size_t m_max_dirty; + bool m_writethrough_until_flush; + + ceph::mutex m_cache_lock; + ObjectCacher *m_object_cacher = nullptr; + ObjectCacher::ObjectSet *m_object_set = nullptr; + + WritebackHandler *m_writeback_handler = nullptr; + + bool m_user_flushed = false; + +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::ObjectCacherObjectDispatch; + +#endif // CEPH_LIBRBD_CACHE_OBJECT_CACHER_OBJECT_DISPATCH_H diff --git a/src/librbd/cache/ObjectCacherWriteback.cc b/src/librbd/cache/ObjectCacherWriteback.cc new file mode 100644 index 000000000..97f2d46ba --- /dev/null +++ b/src/librbd/cache/ObjectCacherWriteback.cc @@ -0,0 +1,287 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "librbd/cache/ObjectCacherWriteback.h" +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/ceph_mutex.h" +#include "osdc/Striper.h" +#include "include/Context.h" +#include "include/neorados/RADOS.hpp" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" + +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Utils.h" + +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::ObjectCacherWriteback: " + +using namespace std; + +namespace librbd { +namespace cache { + +/** + * context to wrap another context in a Mutex + * + * @param cct cct + * @param c context to finish + * @param l mutex to lock + */ +class C_ReadRequest : public Context { +public: + C_ReadRequest(CephContext *cct, Context *c, ceph::mutex *cache_lock) + : m_cct(cct), m_ctx(c), m_cache_lock(cache_lock) { + } + void finish(int r) override { + ldout(m_cct, 20) << "aio_cb completing " << dendl; + { + std::lock_guard cache_locker{*m_cache_lock}; + m_ctx->complete(r); + } + ldout(m_cct, 20) << "aio_cb finished" << dendl; + } +private: + CephContext *m_cct; + Context *m_ctx; + ceph::mutex *m_cache_lock; 
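+  // m_cache_lock is the ObjectCacher's lock, shared with the owning
+  // dispatcher; finish() above acquires it before completing m_ctx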
+}; + +class C_OrderedWrite : public Context { +public: + C_OrderedWrite(CephContext *cct, + ObjectCacherWriteback::write_result_d *result, + const ZTracer::Trace &trace, ObjectCacherWriteback *wb) + : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {} + ~C_OrderedWrite() override {} + void finish(int r) override { + ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl; + { + std::lock_guard l{m_wb_handler->m_lock}; + ceph_assert(!m_result->done); + m_result->done = true; + m_result->ret = r; + m_wb_handler->complete_writes(m_result->oid); + } + ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl; + m_trace.event("finish"); + } +private: + CephContext *m_cct; + ObjectCacherWriteback::write_result_d *m_result; + ZTracer::Trace m_trace; + ObjectCacherWriteback *m_wb_handler; +}; + +struct C_CommitIOEventExtent : public Context { + ImageCtx *image_ctx; + uint64_t journal_tid; + uint64_t offset; + uint64_t length; + + C_CommitIOEventExtent(ImageCtx *image_ctx, uint64_t journal_tid, + uint64_t offset, uint64_t length) + : image_ctx(image_ctx), journal_tid(journal_tid), offset(offset), + length(length) { + } + + void finish(int r) override { + // all IO operations are flushed prior to closing the journal + ceph_assert(image_ctx->journal != nullptr); + + image_ctx->journal->commit_io_event_extent(journal_tid, offset, length, r); + } +}; + +ObjectCacherWriteback::ObjectCacherWriteback(ImageCtx *ictx, ceph::mutex& lock) + : m_tid(0), m_lock(lock), m_ictx(ictx) { +} + +void ObjectCacherWriteback::read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snapid, + bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) +{ + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &m_ictx->trace_endpoint, &parent_trace); + trace.copy_name("cache read " + oid.name); + trace.event("start"); + } + + // on completion, take the mutex and then call onfinish. 
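+  // (the RADOS read completes on an I/O thread without the cache lock;
+  // C_ReadRequest re-acquires it so the object cacher sees a consistent
+  // state when the callback re-enters it)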
+ onfinish = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock); + + // re-use standard object read state machine + auto aio_comp = io::AioCompletion::create_and_start(onfinish, m_ictx, + io::AIO_TYPE_READ); + aio_comp->read_result = io::ReadResult{pbl}; + aio_comp->set_request_count(1); + + auto req_comp = new io::ReadResult::C_ObjectReadRequest( + aio_comp, {{off, len, {{0, len}}}}); + + auto io_context = m_ictx->duplicate_data_io_context(); + if (snapid != CEPH_NOSNAP) { + io_context->read_snap(snapid); + } + + // extract the embedded RBD read flags from the op_flags + int read_flags = (op_flags & READ_FLAGS_MASK) >> READ_FLAGS_SHIFT; + op_flags &= ~READ_FLAGS_MASK; + + auto req = io::ObjectDispatchSpec::create_read( + m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, object_no, &req_comp->extents, + io_context, op_flags, read_flags, trace, nullptr, req_comp); + req->send(); +} + +bool ObjectCacherWriteback::may_copy_on_write(const object_t& oid, + uint64_t read_off, + uint64_t read_len, + snapid_t snapid) +{ + std::shared_lock image_locker(m_ictx->image_lock); + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + m_ictx->get_parent_overlap(m_ictx->snap_id, &raw_overlap); + if (raw_overlap > 0) { + uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix); + auto [parent_extents, area] = io::util::object_to_area_extents( + m_ictx, object_no, {{0, m_ictx->layout.object_size}}); + object_overlap = m_ictx->prune_parent_extents(parent_extents, area, + raw_overlap, false); + } + bool may = object_overlap > 0; + ldout(m_ictx->cct, 10) << "may_copy_on_write " << oid << " " << read_off + << "~" << read_len << " = " << may << dendl; + return may; +} + +ceph_tid_t ObjectCacherWriteback::write(const object_t& oid, + const object_locator_t& oloc, + uint64_t off, uint64_t len, + const SnapContext& snapc, + const bufferlist &bl, + ceph::real_time mtime, + uint64_t trunc_size, + __u32 trunc_seq, ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) +{ + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &m_ictx->trace_endpoint, &parent_trace); + trace.copy_name("writeback " + oid.name); + trace.event("start"); + } + + uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix); + + write_result_d *result = new write_result_d(oid.name, oncommit); + m_writes[oid.name].push(result); + ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl; + + bufferlist bl_copy(bl); + + Context *ctx = new C_OrderedWrite(m_ictx->cct, result, trace, this); + ctx = util::create_async_context_callback(*m_ictx, ctx); + + auto io_context = m_ictx->duplicate_data_io_context(); + if (!snapc.empty()) { + io_context->write_snap_context( + {{snapc.seq, {snapc.snaps.begin(), snapc.snaps.end()}}}); + } + + auto req = io::ObjectDispatchSpec::create_write( + m_ictx, io::OBJECT_DISPATCH_LAYER_CACHE, object_no, off, std::move(bl_copy), + io_context, 0, 0, std::nullopt, journal_tid, trace, ctx); + req->object_dispatch_flags = ( + io::OBJECT_DISPATCH_FLAG_FLUSH | + io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR); + req->send(); + + return ++m_tid; +} + + +void ObjectCacherWriteback::overwrite_extent(const object_t& oid, uint64_t off, + uint64_t len, + ceph_tid_t original_journal_tid, + ceph_tid_t new_journal_tid) { + ldout(m_ictx->cct, 20) << __func__ << ": " << oid << " " + << off << "~" << len << " " + << "journal_tid=" << original_journal_tid << ", " + << "new_journal_tid=" << new_journal_tid << dendl; + + uint64_t object_no = 
oid_to_object_no(oid.name, m_ictx->object_prefix); + + // all IO operations are flushed prior to closing the journal + ceph_assert(original_journal_tid != 0 && m_ictx->journal != NULL); + + auto [image_extents, _] = io::util::object_to_area_extents(m_ictx, object_no, + {{off, len}}); + for (auto it = image_extents.begin(); it != image_extents.end(); ++it) { + if (new_journal_tid != 0) { + // ensure new journal event is safely committed to disk before + // committing old event + m_ictx->journal->flush_event( + new_journal_tid, new C_CommitIOEventExtent(m_ictx, + original_journal_tid, + it->first, it->second)); + } else { + m_ictx->journal->commit_io_event_extent(original_journal_tid, it->first, + it->second, 0); + } + } +} + +void ObjectCacherWriteback::complete_writes(const std::string& oid) +{ + ceph_assert(ceph_mutex_is_locked(m_lock)); + std::queue<write_result_d*>& results = m_writes[oid]; + ldout(m_ictx->cct, 20) << "complete_writes() oid " << oid << dendl; + std::list<write_result_d*> finished; + + while (!results.empty()) { + write_result_d *result = results.front(); + if (!result->done) + break; + finished.push_back(result); + results.pop(); + } + + if (results.empty()) + m_writes.erase(oid); + + for (std::list<write_result_d*>::iterator it = finished.begin(); + it != finished.end(); ++it) { + write_result_d *result = *it; + ldout(m_ictx->cct, 20) << "complete_writes() completing " << result + << dendl; + result->oncommit->complete(result->ret); + delete result; + } +} + +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/ObjectCacherWriteback.h b/src/librbd/cache/ObjectCacherWriteback.h new file mode 100644 index 000000000..d8c2ebbd9 --- /dev/null +++ b/src/librbd/cache/ObjectCacherWriteback.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_OBJECT_CACHER_WRITEBACK_H +#define CEPH_LIBRBD_CACHE_OBJECT_CACHER_WRITEBACK_H + +#include "common/snap_types.h" +#include "osd/osd_types.h" +#include "osdc/WritebackHandler.h" +#include <queue> + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +class ObjectCacherWriteback : public WritebackHandler { +public: + static const int READ_FLAGS_MASK = 0xF000000; + static const int READ_FLAGS_SHIFT = 24; + + ObjectCacherWriteback(ImageCtx *ictx, ceph::mutex& lock); + + // Note that oloc, trunc_size, and trunc_seq are ignored + void read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, uint64_t off, uint64_t len, + snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, Context *onfinish) override; + + // Determine whether a read to this extent could be affected by a + // write-triggered copy-on-write + bool may_copy_on_write(const object_t& oid, uint64_t read_off, + uint64_t read_len, snapid_t snapid) override; + + // Note that oloc, trunc_size, and trunc_seq are ignored + ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, uint64_t trunc_size, + __u32 trunc_seq, ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) override; + using WritebackHandler::write; + + void overwrite_extent(const object_t& oid, uint64_t off, + uint64_t len, ceph_tid_t original_journal_tid, + ceph_tid_t new_journal_tid) override; + + struct write_result_d { + bool done; + int ret; + std::string oid; + Context *oncommit; + write_result_d(const std::string& oid, Context *oncommit) : + done(false), ret(0), oid(oid), oncommit(oncommit) {} + private: + write_result_d(const write_result_d& rhs); + const write_result_d& operator=(const write_result_d& rhs); + }; + +private: + void complete_writes(const std::string& oid); + + ceph_tid_t m_tid; + ceph::mutex& m_lock; + librbd::ImageCtx *m_ictx; + ceph::unordered_map<std::string, std::queue<write_result_d*> > m_writes; + friend class C_OrderedWrite; +}; + +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_OBJECT_CACHER_WRITEBACK_H diff --git a/src/librbd/cache/ParentCacheObjectDispatch.cc b/src/librbd/cache/ParentCacheObjectDispatch.cc new file mode 100644 index 000000000..d5ef373ab --- /dev/null +++ b/src/librbd/cache/ParentCacheObjectDispatch.cc @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/errno.h" +#include "include/neorados/RADOS.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/ParentCacheObjectDispatch.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/plugin/Api.h" +#include "osd/osd_types.h" +#include "osdc/WritebackHandler.h" + +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::ParentCacheObjectDispatch: " \ + << this << " " << __func__ << ": " + +using namespace std; +using namespace ceph::immutable_obj_cache; +using librbd::util::data_object_name; + +namespace librbd { +namespace cache { + +template <typename I> +ParentCacheObjectDispatch<I>::ParentCacheObjectDispatch( + I* image_ctx, plugin::Api<I>& plugin_api) + : m_image_ctx(image_ctx), m_plugin_api(plugin_api), + m_lock(ceph::make_mutex( + "librbd::cache::ParentCacheObjectDispatch::lock", true, false)) { + ceph_assert(m_image_ctx->data_ctx.is_valid()); + auto controller_path = image_ctx->cct->_conf.template get_val<std::string>( + "immutable_object_cache_sock"); + m_cache_client = new CacheClient(controller_path.c_str(), m_image_ctx->cct); +} + +template <typename I> +ParentCacheObjectDispatch<I>::~ParentCacheObjectDispatch() { + delete m_cache_client; + m_cache_client = nullptr; +} + +template <typename I> +void ParentCacheObjectDispatch<I>::init(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + if (m_image_ctx->child == nullptr) { + ldout(cct, 5) << "non-parent image: skipping" << dendl; + if (on_finish != nullptr) { + on_finish->complete(-EINVAL); + } + return; + } + + m_image_ctx->io_object_dispatcher->register_dispatch(this); + + std::unique_lock locker{m_lock}; + create_cache_session(on_finish, false); +} + +template <typename I> +bool ParentCacheObjectDispatch<I>::read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl; + + if (version != nullptr) { + // we currently don't cache read versions + return false; + } + + string oid = data_object_name(m_image_ctx, object_no); + + /* if the RO daemon has not started yet, has crashed, or the session + * hit an error, try to re-connect to the daemon. */ + std::unique_lock locker{m_lock}; + if (!m_cache_client->is_session_work()) { + create_cache_session(nullptr, true); + ldout(cct, 5) << "Parent cache re-connecting to RO daemon; " + << "dispatching current request to lower object layer" << dendl; + return false; + } + + CacheGenContextURef ctx = make_gen_lambda_context<ObjectCacheRequest*, + std::function<void(ObjectCacheRequest*)>> + ([this, extents, dispatch_result, on_dispatched, object_no, io_context, + read_flags, &parent_trace] + (ObjectCacheRequest* ack) { + handle_read_cache(ack, object_no, extents, io_context, read_flags, + parent_trace, dispatch_result, on_dispatched); + }); + + m_cache_client->lookup_object(m_image_ctx->data_ctx.get_namespace(), + m_image_ctx->data_ctx.get_id(), + io_context->read_snap().value_or(CEPH_NOSNAP), + m_image_ctx->layout.object_size, + oid, std::move(ctx)); + return true; +} + +template <typename I> +void ParentCacheObjectDispatch<I>::handle_read_cache( + ObjectCacheRequest* ack, uint64_t object_no, io::ReadExtents* extents, + IOContext io_context, int read_flags, const ZTracer::Trace &parent_trace, + io::DispatchResult* dispatch_result, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + if (ack->type != RBDSC_READ_REPLY) { + // fall back to reading from RADOS + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + on_dispatched->complete(0); + return; + } + + ceph_assert(ack->type == RBDSC_READ_REPLY); + std::string file_path = ((ObjectCacheReadReplyData*)ack)->cache_path; + if (file_path.empty()) { + if ((read_flags & io::READ_FLAG_DISABLE_READ_FROM_PARENT) != 0) { + on_dispatched->complete(-ENOENT); + return; + } + + auto ctx = new LambdaContext( + [this, dispatch_result, on_dispatched](int r) { + if (r < 0 && r != -ENOENT) { + lderr(m_image_ctx->cct) << "failed to read parent: " + << cpp_strerror(r) << dendl; + } + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + on_dispatched->complete(r); + }); + m_plugin_api.read_parent(m_image_ctx, object_no, extents, + io_context->read_snap().value_or(CEPH_NOSNAP), + parent_trace, ctx); + return; + } + + int read_len = 0; + for (auto& extent: *extents) { + // try to read from parent image cache + int r = read_object(file_path, &extent.bl, extent.offset, extent.length, + on_dispatched); + if (r < 0) { + // cache read error, fall back to reading from RADOS + for (auto& read_extent: *extents) { + // clear read bufferlists + if (&read_extent == &extent) { + break; + } + read_extent.bl.clear(); + } + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + on_dispatched->complete(0); + return; + } + + read_len += r; + } + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + on_dispatched->complete(read_len); +} + +template <typename I> +int ParentCacheObjectDispatch<I>::handle_register_client(bool reg) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + if (!reg) { + lderr(cct) << "Parent cache registration failed." << dendl; + } + return 0; +} + +template <typename I> +void ParentCacheObjectDispatch<I>::create_cache_session(Context* on_finish, + bool is_reconnect) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + if (m_connecting) { + return; + } + m_connecting = true; + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + Context* register_ctx = new LambdaContext([this, cct, on_finish](int ret) { + if (ret < 0) { + lderr(cct) << "Parent cache failed to register client." << dendl; + } + handle_register_client(ret >= 0); + + ceph_assert(m_connecting); + m_connecting = false; + + if (on_finish != nullptr) { + on_finish->complete(0); + } + }); + + Context* connect_ctx = new LambdaContext( + [this, cct, register_ctx](int ret) { + if (ret < 0) { + lderr(cct) << "Parent cache failed to connect to RO daemon." << dendl; + register_ctx->complete(ret); + return; + } + + ldout(cct, 20) << "Parent cache connected to RO daemon." << dendl; + + m_cache_client->register_client(register_ctx); + }); + + if (m_cache_client != nullptr && is_reconnect) { + // the CacheClient's destruction will clean up all state of the old session + delete m_cache_client; + + // create a new CacheClient to connect to the RO daemon + auto controller_path = cct->_conf.template get_val<std::string>( + "immutable_object_cache_sock"); + m_cache_client = new CacheClient(controller_path.c_str(), m_image_ctx->cct); + } + + m_cache_client->run(); + m_cache_client->connect(connect_ctx); +} + +template <typename I> +int ParentCacheObjectDispatch<I>::read_object( + std::string file_path, ceph::bufferlist* read_data, uint64_t offset, + uint64_t length, Context *on_finish) { + + auto *cct = m_image_ctx->cct; + ldout(cct, 20) << "file path: " << file_path << dendl; + + std::string error; + int ret = read_data->pread_file(file_path.c_str(), offset, length, &error); + if (ret < 0) { + ldout(cct, 5) << "read from file returned error: " << error + << ", file path=" << file_path + << dendl; + return ret; + } + return read_data->length(); +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::ParentCacheObjectDispatch<librbd::ImageCtx>; diff --git a/src/librbd/cache/ParentCacheObjectDispatch.h b/src/librbd/cache/ParentCacheObjectDispatch.h new file mode 100644 index 000000000..caf782244 --- /dev/null +++ b/src/librbd/cache/ParentCacheObjectDispatch.h @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PARENT_CACHER_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_CACHE_PARENT_CACHER_OBJECT_DISPATCH_H + +#include "librbd/io/ObjectDispatchInterface.h" +#include "common/ceph_mutex.h" +#include "librbd/cache/TypeTraits.h" +#include "tools/immutable_object_cache/CacheClient.h" +#include "tools/immutable_object_cache/Types.h" + +namespace librbd { + +class ImageCtx; + +namespace plugin { template <typename> struct Api; } + +namespace cache { + +template <typename ImageCtxT = ImageCtx> +class ParentCacheObjectDispatch : public io::ObjectDispatchInterface { + // mock unit testing support + typedef cache::TypeTraits<ImageCtxT> TypeTraits; + typedef typename TypeTraits::CacheClient CacheClient; + +public: + static ParentCacheObjectDispatch* create(ImageCtxT* image_ctx, + plugin::Api<ImageCtxT>& plugin_api) { + return new ParentCacheObjectDispatch(image_ctx, plugin_api); + } + + ParentCacheObjectDispatch(ImageCtxT* image_ctx, + plugin::Api<ImageCtxT>& plugin_api); + ~ParentCacheObjectDispatch() override; + + io::ObjectDispatchLayer get_dispatch_layer() const override { + return io::OBJECT_DISPATCH_LAYER_PARENT_CACHE; + } + + void init(Context* on_finish = nullptr); + void shut_down(Context* on_finish) { + m_image_ctx->op_work_queue->queue(on_finish, 0); + } + + bool read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + return false; + } + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + return false; + } + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + return false; + } + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + return false; + } + + bool flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_id, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + return false; + } + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) { + return false; + } + + bool reset_existence_cache(Context* on_finish) { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + } + + int prepare_copyup( + uint64_t object_no, + io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + + ImageCtxT* get_image_ctx() { + return m_image_ctx; + } + + CacheClient* get_cache_client() { + return m_cache_client; + } + +private: + + int read_object(std::string file_path, ceph::bufferlist* read_data, + uint64_t offset, uint64_t length, Context *on_finish); + void handle_read_cache(ceph::immutable_obj_cache::ObjectCacheRequest* ack, + uint64_t object_no, io::ReadExtents* extents, + IOContext io_context, int read_flags, + const ZTracer::Trace &parent_trace, + io::DispatchResult* dispatch_result, + Context* on_dispatched); + int handle_register_client(bool reg); + void create_cache_session(Context* on_finish, bool is_reconnect); + + ImageCtxT* m_image_ctx; + plugin::Api<ImageCtxT>& m_plugin_api; + + ceph::mutex m_lock; + CacheClient *m_cache_client = nullptr; + bool m_connecting = false; +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::ParentCacheObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CACHE_PARENT_CACHER_OBJECT_DISPATCH_H diff --git a/src/librbd/cache/TypeTraits.h b/src/librbd/cache/TypeTraits.h new file mode 100644 index 000000000..dd7075e8d --- /dev/null +++ b/src/librbd/cache/TypeTraits.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_TYPE_TRAITS_H +#define CEPH_LIBRBD_CACHE_TYPE_TRAITS_H + +namespace ceph { +namespace immutable_obj_cache { + +class CacheClient; + +} // namespace immutable_obj_cache +} // namespace ceph + +namespace librbd { +namespace cache { + +template <typename ImageCtxT> +struct TypeTraits { + typedef ceph::immutable_obj_cache::CacheClient CacheClient; +}; + +} // namespace cache +} // namespace librbd + +#endif diff --git a/src/librbd/cache/Types.h b/src/librbd/cache/Types.h new file mode 100644 index 000000000..43dcd758f --- /dev/null +++ b/src/librbd/cache/Types.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_TYPES_H +#define CEPH_LIBRBD_CACHE_TYPES_H + +#include <list> +#include <string> + +class Context; + +namespace librbd { +namespace cache { + +enum ImageCacheType { + IMAGE_CACHE_TYPE_RWL = 1, + IMAGE_CACHE_TYPE_SSD, + IMAGE_CACHE_TYPE_UNKNOWN +}; + +typedef std::list<Context *> Contexts; + +const std::string PERSISTENT_CACHE_STATE = ".rbd_persistent_cache_state"; + +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_TYPES_H diff --git a/src/librbd/cache/Utils.h b/src/librbd/cache/Utils.h new file mode 100644 index 000000000..cd2eb7c3b --- /dev/null +++ b/src/librbd/cache/Utils.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_UTILS_H +#define CEPH_LIBRBD_CACHE_UTILS_H + +#include "acconfig.h" +#include <string> + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { +namespace util { + +template <typename T> +bool is_pwl_enabled(T& image_ctx) { +#if defined(WITH_RBD_RWL) || defined(WITH_RBD_SSD_CACHE) + auto value = image_ctx.config.template get_val<std::string>("rbd_persistent_cache_mode"); + return value != "disabled"; +#else + return false; +#endif // WITH_RBD_RWL || WITH_RBD_SSD_CACHE +} + +} // namespace util +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_UTILS_H diff --git a/src/librbd/cache/WriteAroundObjectDispatch.cc b/src/librbd/cache/WriteAroundObjectDispatch.cc new file mode 100644 index 000000000..fafb73f40 --- /dev/null +++ b/src/librbd/cache/WriteAroundObjectDispatch.cc @@ -0,0 +1,525 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/WriteAroundObjectDispatch.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::WriteAroundObjectDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { + +using librbd::util::data_object_name; + +template <typename I> +WriteAroundObjectDispatch<I>::WriteAroundObjectDispatch( + I* image_ctx, size_t max_dirty, bool writethrough_until_flush) + : m_image_ctx(image_ctx), m_init_max_dirty(max_dirty), m_max_dirty(max_dirty), + m_lock(ceph::make_mutex(util::unique_lock_name( + "librbd::cache::WriteAroundObjectDispatch::lock", this))) { + if (writethrough_until_flush) { + m_max_dirty = 0; + } +} + +template <typename I> +WriteAroundObjectDispatch<I>::~WriteAroundObjectDispatch() { +} + +template <typename I> +void WriteAroundObjectDispatch<I>::init() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // add ourselves to the IO object dispatcher chain + if (m_init_max_dirty > 0) { + m_image_ctx->disable_zero_copy = true; + } + m_image_ctx->io_object_dispatcher->register_dispatch(this); +} + +template <typename I> +void WriteAroundObjectDispatch<I>::shut_down(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + 
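// the write-around layer keeps no cached data of its own -- writes are + // dispatched to the cluster immediately -- so there is nothing to flush here + 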
on_finish->complete(0); +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + bool handled = false; + for (auto& extent: *extents) { + handled |= dispatch_unoptimized_io(object_no, extent.offset, extent.length, + dispatch_result, on_dispatched); + } + return handled; +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + return dispatch_io(object_no, object_off, object_len, 0, dispatch_result, + on_finish, on_dispatched); +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << data.length() << dendl; + + return dispatch_io(object_no, object_off, data.length(), op_flags, + dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + return dispatch_io(object_no, object_off, object_len, op_flags, + dispatch_result, on_finish, on_dispatched); +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + return dispatch_unoptimized_io(object_no, object_off, cmp_data.length(), + dispatch_result, on_dispatched); +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + std::lock_guard locker{m_lock}; + if (flush_source == io::FLUSH_SOURCE_USER && !m_user_flushed) { + m_user_flushed = true; + if (m_max_dirty == 0 && m_init_max_dirty > 0) { + ldout(cct, 5) << "first user flush: enabling write-around" << dendl; + m_max_dirty = m_init_max_dirty; + } + } + + if (m_in_flight_io_tids.empty()) { + // no in-flight IO (also implies no queued/blocked IO) + return false; + } + + auto tid = ++m_last_tid; + auto ctx = util::create_async_context_callback(*m_image_ctx, *on_finish); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + *on_finish = new LambdaContext([this, tid](int r) { + handle_in_flight_flush_complete(r, tid); + }); + + if (m_queued_ios.empty() && m_blocked_ios.empty()) { + // immediately allow the flush to be dispatched + ldout(cct, 20) << "dispatching: tid=" << tid << dendl; + m_in_flight_flushes.emplace(tid, ctx); + return false; + } + + // cannot dispatch the flush until after preceding IO is dispatched + ldout(cct, 20) << "queueing: tid=" << tid << dendl; + m_queued_flushes.emplace(tid, QueuedFlush{ctx, on_dispatched}); + return true; +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::dispatch_unoptimized_io( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::DispatchResult* dispatch_result, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + + m_lock.lock(); + auto in_flight_extents_it = m_in_flight_extents.find(object_no); + if (in_flight_extents_it == m_in_flight_extents.end() || + !in_flight_extents_it->second.intersects(object_off, object_len)) { + // no IO in-flight to the specified extent + m_lock.unlock(); + return false; + } + + // write IO is in-flight -- it needs to complete before the unoptimized + // IO can be dispatched + auto tid = ++m_last_tid; + ldout(cct, 20) << "blocked by in-flight IO: tid=" << tid << dendl; + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + m_blocked_unoptimized_ios[object_no].emplace( + tid, BlockedIO{object_off, object_len, nullptr, on_dispatched}); + m_lock.unlock(); + + return true; +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::dispatch_io( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + int op_flags, io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + + m_lock.lock(); + if (m_max_dirty == 0) { + // write-through mode is active -- no-op the cache + m_lock.unlock(); + return false; + } + + if ((op_flags & LIBRADOS_OP_FLAG_FADVISE_FUA) != 0) { + // force unit access flag is set -- disable write-around + m_lock.unlock(); + return dispatch_unoptimized_io(object_no, object_off, object_len, + dispatch_result, on_dispatched); + } + + auto tid = ++m_last_tid; + auto ctx = util::create_async_context_callback(*m_image_ctx, *on_finish); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + *on_finish = new LambdaContext( + [this, tid, object_no, object_off, object_len](int r) { + handle_in_flight_io_complete(r, tid, object_no, object_off, object_len); + }); + + bool blocked = block_overlapping_io(&m_in_flight_extents[object_no], + object_off, object_len); + if (blocked) { + ldout(cct, 20) << "blocked on overlap: tid=" << tid << dendl; + m_queued_or_blocked_io_tids.insert(tid); + m_blocked_ios[object_no].emplace(tid, BlockedIO{object_off, object_len, ctx, + on_dispatched}); + m_lock.unlock(); + } else if (can_dispatch_io(tid, object_len)) { + m_lock.unlock(); + + ldout(cct, 20) << "dispatching: tid=" << tid << dendl; + on_dispatched->complete(0); + ctx->complete(0); + } else { + ldout(cct, 20) << "queueing: tid=" << tid << dendl; + m_queued_or_blocked_io_tids.insert(tid); + m_queued_ios.emplace(tid, QueuedIO{object_len, ctx, on_dispatched}); + m_lock.unlock(); + } + return true; +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::block_overlapping_io( + InFlightObjectExtents* in_flight_object_extents, uint64_t object_off, + uint64_t object_len) { + if (in_flight_object_extents->intersects(object_off, object_len)) { + return true; + } + + in_flight_object_extents->insert(object_off, object_len); + return false; +} + +template <typename I> +void WriteAroundObjectDispatch<I>::unblock_overlapping_ios( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + Contexts* unoptimized_io_dispatches) { + auto cct = m_image_ctx->cct; + ceph_assert(ceph_mutex_is_locked(m_lock)); + + auto in_flight_extents_it = m_in_flight_extents.find(object_no); + ceph_assert(in_flight_extents_it != m_in_flight_extents.end()); + + auto& in_flight_object_extents = in_flight_extents_it->second; + in_flight_object_extents.erase(object_off, object_len); + + // handle unoptimized IOs that were blocked by in-flight IO + InFlightObjectExtents blocked_unoptimized_ios; + auto blocked_unoptimized_ios_it = m_blocked_unoptimized_ios.find(object_no); + if (blocked_unoptimized_ios_it != m_blocked_unoptimized_ios.end()) { + auto& blocked_unoptimized_object_ios = blocked_unoptimized_ios_it->second; + for (auto it = blocked_unoptimized_object_ios.begin(); + it != blocked_unoptimized_object_ios.end();) { + auto& blocked_io = it->second; + if (!in_flight_object_extents.intersects(blocked_io.offset, + blocked_io.length)) { + unoptimized_io_dispatches->emplace(it->first, blocked_io.on_dispatched); + it = blocked_unoptimized_object_ios.erase(it); + } else { + blocked_unoptimized_ios.union_insert(blocked_io.offset, + blocked_io.length); + ++it; + } + } + + if (blocked_unoptimized_object_ios.empty()) { + m_blocked_unoptimized_ios.erase(blocked_unoptimized_ios_it); + } + } + + // handle optimized IOs that were blocked + auto blocked_io_it = m_blocked_ios.find(object_no); + if (blocked_io_it != m_blocked_ios.end()) { + auto& blocked_object_ios = blocked_io_it->second; + + auto blocked_object_ios_it = blocked_object_ios.begin(); + while (blocked_object_ios_it != blocked_object_ios.end()) { + auto next_blocked_object_ios_it = blocked_object_ios_it; + ++next_blocked_object_ios_it; + + auto& blocked_io = blocked_object_ios_it->second; + if (blocked_unoptimized_ios.intersects(blocked_io.offset, + blocked_io.length) || + block_overlapping_io(&in_flight_object_extents, blocked_io.offset, + blocked_io.length)) { + break; + } + + // move unblocked IO to the queued list, which will get processed when + // there is capacity + auto tid = blocked_object_ios_it->first; + ldout(cct, 20) << "queueing unblocked: tid=" << tid << dendl; + m_queued_ios.emplace(tid, blocked_io); + + blocked_object_ios.erase(blocked_object_ios_it); + blocked_object_ios_it = next_blocked_object_ios_it; + } + + if (blocked_object_ios.empty()) { + m_blocked_ios.erase(blocked_io_it); + } + } + + if (in_flight_object_extents.empty()) { + m_in_flight_extents.erase(in_flight_extents_it); + } +} + +template <typename I> +bool WriteAroundObjectDispatch<I>::can_dispatch_io( + uint64_t tid, uint64_t length) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + if (m_in_flight_bytes == 0 || m_in_flight_bytes + length <= m_max_dirty) { + // no in-flight IO or still under max write-around in-flight limit. + // allow the dispatcher to proceed to send the IO but complete it back + // to the invoker. 
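// note: an oversized IO (length > m_max_dirty) is still admitted when + // nothing else is in flight, which guarantees forward progress. 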
+ m_in_flight_bytes += length; + m_in_flight_io_tids.insert(tid); + return true; + } + + return false; +} + +template <typename I> +void WriteAroundObjectDispatch<I>::handle_in_flight_io_complete( + int r, uint64_t tid, uint64_t object_no, uint64_t object_off, + uint64_t object_len) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl; + + m_lock.lock(); + m_in_flight_io_tids.erase(tid); + ceph_assert(m_in_flight_bytes >= object_len); + m_in_flight_bytes -= object_len; + + if (r < 0) { + lderr(cct) << "IO error encountered: tid=" << tid << ": " + << cpp_strerror(r) << dendl; + if (m_pending_flush_error == 0) { + m_pending_flush_error = r; + } + } + + // any overlapping blocked IOs can be queued now + Contexts unoptimized_io_dispatches; + unblock_overlapping_ios(object_no, object_off, object_len, + &unoptimized_io_dispatches); + + // collect any flushes that are ready for completion + int pending_flush_error = 0; + auto finished_flushes = collect_finished_flushes(); + if (!finished_flushes.empty()) { + std::swap(pending_flush_error, m_pending_flush_error); + } + + // collect any queued IOs that are ready for dispatch + auto ready_ios = collect_ready_ios(); + + // collect any queued flushes that were tied to queued IOs + auto ready_flushes = collect_ready_flushes(); + m_lock.unlock(); + + // dispatch any ready unoptimized IOs + for (auto& it : unoptimized_io_dispatches) { + ldout(cct, 20) << "dispatching unoptimized IO: tid=" << it.first << dendl; + it.second->complete(0); + } + + // complete flushes that were waiting on in-flight IO + // (and propagate any IO error to first flush) + for (auto& it : finished_flushes) { + ldout(cct, 20) << "completing flush: tid=" << it.first << ", " + << "r=" << pending_flush_error << dendl; + it.second->complete(pending_flush_error); + } + + // dispatch any ready queued IOs + for (auto& it : ready_ios) { + ldout(cct, 20) << "dispatching IO: tid=" << it.first << dendl; + it.second.on_dispatched->complete(0); + it.second.on_finish->complete(0); + } + + // dispatch any ready flushes + for (auto& it : ready_flushes) { + ldout(cct, 20) << "dispatching flush: tid=" << it.first << dendl; + it.second->complete(0); + } +} + +template <typename I> +void WriteAroundObjectDispatch<I>::handle_in_flight_flush_complete( + int r, uint64_t tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl; + + m_lock.lock(); + + // move the in-flight flush to the pending completion list + auto it = m_in_flight_flushes.find(tid); + ceph_assert(it != m_in_flight_flushes.end()); + + m_pending_flushes.emplace(it->first, it->second); + m_in_flight_flushes.erase(it); + + // collect any flushes that are ready for completion + int pending_flush_error = 0; + auto finished_flushes = collect_finished_flushes(); + if (!finished_flushes.empty()) { + std::swap(pending_flush_error, m_pending_flush_error); + } + m_lock.unlock(); + + // complete flushes that were waiting on in-flight IO + // (and propagate any IO errors) + for (auto& it : finished_flushes) { + ldout(cct, 20) << "completing flush: tid=" << it.first << dendl; + it.second->complete(pending_flush_error); + pending_flush_error = 0; + } +} + +template <typename I> +typename WriteAroundObjectDispatch<I>::QueuedIOs +WriteAroundObjectDispatch<I>::collect_ready_ios() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + QueuedIOs queued_ios; + + while (true) { + auto it = m_queued_ios.begin(); + if (it == m_queued_ios.end() || + !can_dispatch_io(it->first, it->second.length)) { + break; + } + + 
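// can_dispatch_io() has already reserved this IO's bytes and recorded its + // tid, so it is safe to move it off the queue for dispatch. + 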
queued_ios.emplace(it->first, it->second); + m_queued_or_blocked_io_tids.erase(it->first); + m_queued_ios.erase(it); + } + return queued_ios; +} + +template <typename I> +typename WriteAroundObjectDispatch<I>::Contexts +WriteAroundObjectDispatch<I>::collect_ready_flushes() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + Contexts ready_flushes; + auto io_tid_it = m_queued_or_blocked_io_tids.begin(); + while (true) { + auto it = m_queued_flushes.begin(); + if (it == m_queued_flushes.end() || + (io_tid_it != m_queued_or_blocked_io_tids.end() && + *io_tid_it < it->first)) { + break; + } + + m_in_flight_flushes.emplace(it->first, it->second.on_finish); + ready_flushes.emplace(it->first, it->second.on_dispatched); + m_queued_flushes.erase(it); + } + + return ready_flushes; +} + +template <typename I> +typename WriteAroundObjectDispatch<I>::Contexts +WriteAroundObjectDispatch<I>::collect_finished_flushes() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + Contexts finished_flushes; + auto io_tid_it = m_in_flight_io_tids.begin(); + while (true) { + auto it = m_pending_flushes.begin(); + if (it == m_pending_flushes.end() || + (io_tid_it != m_in_flight_io_tids.end() && *io_tid_it < it->first)) { + break; + } + + finished_flushes.emplace(it->first, it->second); + m_pending_flushes.erase(it); + } + return finished_flushes; +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::WriteAroundObjectDispatch<librbd::ImageCtx>; diff --git a/src/librbd/cache/WriteAroundObjectDispatch.h b/src/librbd/cache/WriteAroundObjectDispatch.h new file mode 100644 index 000000000..bc289f91c --- /dev/null +++ b/src/librbd/cache/WriteAroundObjectDispatch.h @@ -0,0 +1,212 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_WRITE_AROUND_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_CACHE_WRITE_AROUND_OBJECT_DISPATCH_H + +#include "librbd/io/ObjectDispatchInterface.h" +#include "include/interval_set.h" +#include "common/ceph_mutex.h" +#include "librbd/io/Types.h" +#include <map> +#include <set> +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +template <typename ImageCtxT = ImageCtx> +class WriteAroundObjectDispatch : public io::ObjectDispatchInterface { +public: + static WriteAroundObjectDispatch* create(ImageCtxT* image_ctx, + size_t max_dirty, + bool writethrough_until_flush) { + return new WriteAroundObjectDispatch(image_ctx, max_dirty, + writethrough_until_flush); + } + + WriteAroundObjectDispatch(ImageCtxT* image_ctx, size_t max_dirty, + bool writethrough_until_flush); + ~WriteAroundObjectDispatch() override; + + io::ObjectDispatchLayer get_dispatch_layer() const override { + return io::OBJECT_DISPATCH_LAYER_CACHE; + } + + void init(); + void shut_down(Context* on_finish) override; + + bool read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional<uint64_t> assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + + int prepare_copyup( + uint64_t object_no, + io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + +private: + struct QueuedIO { + QueuedIO(uint64_t length, Context* on_finish, Context* on_dispatched) + : length(length), on_finish(on_finish), on_dispatched(on_dispatched) { + } + + uint64_t length; + Context* on_finish; + Context* on_dispatched; + }; + + struct QueuedFlush { + QueuedFlush(Context* on_finish, Context* on_dispatched) + : on_finish(on_finish), on_dispatched(on_dispatched) { + } + + Context* on_finish; + Context* on_dispatched; + }; + + + struct BlockedIO : public QueuedIO { + BlockedIO(uint64_t offset, uint64_t length, Context* on_finish, + Context* on_dispatched) + : QueuedIO(length, on_finish, on_dispatched), offset(offset) { + } + + uint64_t offset; + }; + + typedef std::map<uint64_t, QueuedIO> QueuedIOs; + typedef std::map<uint64_t, QueuedFlush> QueuedFlushes; + + typedef std::map<uint64_t, BlockedIO> BlockedObjectIOs; + typedef std::map<uint64_t, BlockedObjectIOs> BlockedIOs; + + typedef std::map<uint64_t, Context*> Contexts; + typedef std::set<uint64_t> Tids; + typedef interval_set<uint64_t> InFlightObjectExtents; + typedef std::map<uint64_t, InFlightObjectExtents> InFlightExtents; + + ImageCtxT* m_image_ctx; + size_t m_init_max_dirty; + size_t m_max_dirty; + + ceph::mutex m_lock; + bool m_user_flushed = false; + + uint64_t m_last_tid = 0; + uint64_t m_in_flight_bytes = 0; + + Tids m_in_flight_io_tids; + InFlightExtents m_in_flight_extents; + + BlockedIOs m_blocked_ios; + QueuedIOs m_queued_ios; + Tids m_queued_or_blocked_io_tids; + + BlockedIOs m_blocked_unoptimized_ios; + + QueuedFlushes m_queued_flushes; + Contexts m_in_flight_flushes; + Contexts m_pending_flushes; + int m_pending_flush_error = 0; + + bool dispatch_unoptimized_io(uint64_t object_no, uint64_t object_off, + uint64_t object_len, + io::DispatchResult* dispatch_result, + Context* on_dispatched); + bool dispatch_io(uint64_t object_no, uint64_t object_off, + uint64_t object_len, int op_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatch); + + bool block_overlapping_io(InFlightObjectExtents* in_flight_object_extents, + uint64_t object_off, uint64_t object_len); + void unblock_overlapping_ios(uint64_t object_no, uint64_t object_off, + uint64_t object_len, + Contexts* unoptimized_io_dispatches); + + bool can_dispatch_io(uint64_t tid, uint64_t length); + + void handle_in_flight_io_complete(int r, uint64_t tid, uint64_t object_no, + uint64_t object_off, uint64_t object_len); + void handle_in_flight_flush_complete(int r, uint64_t tid); + + QueuedIOs collect_ready_ios(); + Contexts collect_ready_flushes(); + Contexts collect_finished_flushes(); + +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::WriteAroundObjectDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CACHE_WRITE_AROUND_OBJECT_DISPATCH_H diff --git a/src/librbd/cache/WriteLogImageDispatch.cc b/src/librbd/cache/WriteLogImageDispatch.cc new file mode 100644 index 000000000..7e0040fe7 --- /dev/null +++ b/src/librbd/cache/WriteLogImageDispatch.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/dout.h" +#include "include/neorados/RADOS.hpp" +#include "librbd/cache/pwl/AbstractWriteLog.h" +#include "librbd/cache/pwl/ShutdownRequest.h" +#include "librbd/cache/WriteLogImageDispatch.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/Utils.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::WriteLogImageDispatch: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { + +template <typename I> +void WriteLogImageDispatch<I>::shut_down(Context* on_finish) { + ceph_assert(m_image_cache != nullptr); + + Context* ctx = new LambdaContext( + [this, on_finish](int r) { + m_image_cache = nullptr; + on_finish->complete(r); + }); + + cache::pwl::ShutdownRequest<I> *req = cache::pwl::ShutdownRequest<I>::create( + *m_image_ctx, m_image_cache, m_plugin_api, ctx); + req->send(); +} + +template <typename I> +bool WriteLogImageDispatch<I>::read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*image_dispatch_flags & io::IMAGE_DISPATCH_FLAG_CRYPTO_HEADER) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + if (io_context->read_snap().value_or(CEPH_NOSNAP) != CEPH_NOSNAP) { + return false; + } + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + if (preprocess_length(aio_comp, image_extents)) { + return true; + } + + m_plugin_api.update_aio_comp(aio_comp, 1, read_result, image_extents); + + auto *req_comp = m_plugin_api.create_image_read_request(aio_comp, 0, image_extents); + + m_image_cache->read(std::move(image_extents), + &req_comp->bl, op_flags, + req_comp); + return true; +} + +template <typename I> +bool WriteLogImageDispatch<I>::write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** 
on_finish, Context* on_dispatched) { + if (*image_dispatch_flags & io::IMAGE_DISPATCH_FLAG_CRYPTO_HEADER) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + if (preprocess_length(aio_comp, image_extents)) { + return true; + } + + m_plugin_api.update_aio_comp(aio_comp, 1); + io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp); + m_image_cache->write(std::move(image_extents), + std::move(bl), op_flags, req_comp); + return true; +} + +template <typename I> +bool WriteLogImageDispatch<I>::discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*image_dispatch_flags & io::IMAGE_DISPATCH_FLAG_CRYPTO_HEADER) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + if (preprocess_length(aio_comp, image_extents)) { + return true; + } + + m_plugin_api.update_aio_comp(aio_comp, image_extents.size()); + for (auto &extent : image_extents) { + io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp); + m_image_cache->discard(extent.first, extent.second, + discard_granularity_bytes, + req_comp); + } + return true; +} + +template <typename I> +bool WriteLogImageDispatch<I>::write_same( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*image_dispatch_flags & io::IMAGE_DISPATCH_FLAG_CRYPTO_HEADER) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + if (preprocess_length(aio_comp, image_extents)) { + return true; + } + + m_plugin_api.update_aio_comp(aio_comp, image_extents.size()); + for (auto &extent : image_extents) { + io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp); + m_image_cache->writesame(extent.first, extent.second, + std::move(bl), op_flags, + req_comp); + } + return true; +} + +template <typename I> +bool WriteLogImageDispatch<I>::compare_and_write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*image_dispatch_flags & io::IMAGE_DISPATCH_FLAG_CRYPTO_HEADER) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + if (preprocess_length(aio_comp, image_extents)) { + return true; + } + + m_plugin_api.update_aio_comp(aio_comp, 1); + io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp); + m_image_cache->compare_and_write( + std::move(image_extents), std::move(cmp_bl), std::move(bl), + mismatch_offset, op_flags, req_comp); + return true; +} + +template <typename I> +bool WriteLogImageDispatch<I>::flush( + io::AioCompletion* aio_comp, io::FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + + m_plugin_api.update_aio_comp(aio_comp, 1); + + io::C_AioRequest *req_comp = m_plugin_api.create_aio_request(aio_comp); + m_image_cache->flush(flush_source, req_comp); + + return true; +} + +template <typename I> +bool WriteLogImageDispatch<I>::list_snaps( + io::AioCompletion* aio_comp, io::Extents&& image_extents, + io::SnapIds&& snap_ids, + int list_snaps_flags, io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + return false; +} + +template <typename I> +bool WriteLogImageDispatch<I>::preprocess_length( + io::AioCompletion* aio_comp, io::Extents &image_extents) const { + auto total_bytes = io::util::get_extents_length(image_extents); + if (total_bytes == 0) { + m_plugin_api.update_aio_comp(aio_comp, 0); + return true; + } + return false; +} + +template <typename I> +bool WriteLogImageDispatch<I>::invalidate_cache(Context* on_finish) { + m_image_cache->invalidate(on_finish); + return true; +} + +} // namespace cache +} // namespace librbd + +template class librbd::cache::WriteLogImageDispatch<librbd::ImageCtx>; diff --git a/src/librbd/cache/WriteLogImageDispatch.h b/src/librbd/cache/WriteLogImageDispatch.h new file mode 100644 index 000000000..f68f37dc7 --- /dev/null +++ b/src/librbd/cache/WriteLogImageDispatch.h @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_WRITELOG_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_WRITELOG_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include "librbd/plugin/Api.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { + +namespace pwl { template <typename> class AbstractWriteLog; } + +template <typename ImageCtxT = ImageCtx> +class WriteLogImageDispatch : public io::ImageDispatchInterface { +public: + WriteLogImageDispatch(ImageCtxT* image_ctx, + pwl::AbstractWriteLog<ImageCtxT> *image_cache, + plugin::Api<ImageCtxT>& plugin_api) : + m_image_ctx(image_ctx), m_image_cache(image_cache), + m_plugin_api(plugin_api) { + } + + io::ImageDispatchLayer get_dispatch_layer() const override { + return io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE; + } + + void shut_down(Context* on_finish) override; + + bool read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + bool discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + bool write_same( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + bool compare_and_write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + bool flush( + io::AioCompletion* aio_comp, io::FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + bool list_snaps( + io::AioCompletion* aio_comp, io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic<uint32_t>* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool invalidate_cache(Context* on_finish) override; + +private: + ImageCtxT* m_image_ctx; + pwl::AbstractWriteLog<ImageCtxT> *m_image_cache; + plugin::Api<ImageCtxT>& m_plugin_api; + + bool preprocess_length( + io::AioCompletion* aio_comp, io::Extents &image_extents) const; +}; + +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::WriteLogImageDispatch<librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_WRITELOG_IMAGE_DISPATCH_H diff --git a/src/librbd/cache/pwl/AbstractWriteLog.cc b/src/librbd/cache/pwl/AbstractWriteLog.cc new file mode 100644 index 000000000..1e784f6b5 --- /dev/null +++ b/src/librbd/cache/pwl/AbstractWriteLog.cc @@ -0,0 +1,2187 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "AbstractWriteLog.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "common/deleter.h" +#include "common/dout.h" +#include "common/environment.h" +#include "common/errno.h" +#include "common/hostname.h" +#include "common/WorkQueue.h" +#include "common/Timer.h" +#include "common/perf_counters.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/plugin/Api.h" +#include +#include + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::AbstractWriteLog: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using namespace std; +using namespace librbd::cache::pwl; + +typedef AbstractWriteLog<ImageCtx>::Extent Extent; +typedef AbstractWriteLog<ImageCtx>::Extents Extents; + +template <typename I> +AbstractWriteLog<I>::AbstractWriteLog( + I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state, + Builder<This> *builder, cache::ImageWritebackInterface& image_writeback, + plugin::Api<I>& plugin_api) + : m_builder(builder), + m_write_log_guard(image_ctx.cct), + m_flush_guard(image_ctx.cct), + m_flush_guard_lock(ceph::make_mutex(pwl::unique_lock_name( 
"librbd::cache::pwl::AbstractWriteLog::m_deferred_dispatch_lock", this))), + m_blockguard_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_blockguard_lock", this))), + m_thread_pool( + image_ctx.cct, "librbd::cache::pwl::AbstractWriteLog::thread_pool", + "tp_pwl", 4, ""), + m_cache_state(cache_state), + m_image_ctx(image_ctx), + m_log_pool_size(DEFAULT_POOL_SIZE), + m_image_writeback(image_writeback), + m_plugin_api(plugin_api), + m_log_retire_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_log_retire_lock", this))), + m_entry_reader_lock("librbd::cache::pwl::AbstractWriteLog::m_entry_reader_lock"), + m_log_append_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_log_append_lock", this))), + m_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::AbstractWriteLog::m_lock", this))), + m_blocks_to_log_entries(image_ctx.cct), + m_work_queue("librbd::cache::pwl::ReplicatedWriteLog::work_queue", + ceph::make_timespan( + image_ctx.config.template get_val( + "rbd_op_thread_timeout")), + &m_thread_pool) +{ + CephContext *cct = m_image_ctx.cct; + m_plugin_api.get_image_timer_instance(cct, &m_timer, &m_timer_lock); +} + +template +AbstractWriteLog::~AbstractWriteLog() { + ldout(m_image_ctx.cct, 15) << "enter" << dendl; + { + std::lock_guard timer_locker(*m_timer_lock); + std::lock_guard locker(m_lock); + m_timer->cancel_event(m_timer_ctx); + m_thread_pool.stop(); + ceph_assert(m_deferred_ios.size() == 0); + ceph_assert(m_ops_to_flush.size() == 0); + ceph_assert(m_ops_to_append.size() == 0); + ceph_assert(m_flush_ops_in_flight == 0); + + delete m_cache_state; + m_cache_state = nullptr; + } + ldout(m_image_ctx.cct, 15) << "exit" << dendl; +} + +template +void AbstractWriteLog::perf_start(std::string name) { + PerfCountersBuilder plb(m_image_ctx.cct, name, l_librbd_pwl_first, + l_librbd_pwl_last); + + // Latency axis configuration for op histograms, values are in nanoseconds + PerfHistogramCommon::axis_config_d op_hist_x_axis_config{ + "Latency (nsec)", + PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale + 0, ///< Start at 0 + 5000, ///< Quantization unit is 5usec + 16, ///< Ranges into the mS + }; + + // Syncpoint logentry number x-axis configuration for op histograms + PerfHistogramCommon::axis_config_d sp_logentry_number_config{ + "logentry number", + PerfHistogramCommon::SCALE_LINEAR, // log entry number in linear scale + 0, // Start at 0 + 1, // Quantization unit is 1 + 260, // Up to 260 > (MAX_WRITES_PER_SYNC_POINT) + }; + + // Syncpoint bytes number y-axis configuration for op histogram + PerfHistogramCommon::axis_config_d sp_bytes_number_config{ + "Number of SyncPoint", + PerfHistogramCommon::SCALE_LOG2, // Request size in logarithmic scale + 0, // Start at 0 + 512, // Quantization unit is 512 + 17, // Writes up to 8M >= MAX_BYTES_PER_SYNC_POINT + }; + + // Op size axis configuration for op histogram y axis, values are in bytes + PerfHistogramCommon::axis_config_d op_hist_y_axis_config{ + "Request size (bytes)", + PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale + 0, ///< Start at 0 + 512, ///< Quantization unit is 512 bytes + 16, ///< Writes up to >32k + }; + + // Num items configuration for op histogram y axis, values are in items + PerfHistogramCommon::axis_config_d op_hist_y_axis_count_config{ + "Number of items", + PerfHistogramCommon::SCALE_LINEAR, ///< Request size in linear scale + 0, ///< Start at 0 + 1, ///< 
Quantization unit is 1 + 32, ///< Writes up to >32k + }; + + plb.add_u64_counter(l_librbd_pwl_rd_req, "rd", "Reads"); + plb.add_u64_counter(l_librbd_pwl_rd_bytes, "rd_bytes", "Data size in reads"); + plb.add_time_avg(l_librbd_pwl_rd_latency, "rd_latency", "Latency of reads"); + + plb.add_u64_counter(l_librbd_pwl_rd_hit_req, "hit_rd", "Reads completely hitting RWL"); + plb.add_u64_counter(l_librbd_pwl_rd_hit_bytes, "rd_hit_bytes", "Bytes read from RWL"); + plb.add_time_avg(l_librbd_pwl_rd_hit_latency, "hit_rd_latency", "Latency of read hits"); + + plb.add_u64_counter(l_librbd_pwl_rd_part_hit_req, "part_hit_rd", "reads partially hitting RWL"); + + plb.add_u64_counter_histogram( + l_librbd_pwl_syncpoint_hist, "syncpoint_logentry_bytes_histogram", + sp_logentry_number_config, sp_bytes_number_config, + "Histogram of syncpoint's logentry numbers vs bytes number"); + + plb.add_u64_counter(l_librbd_pwl_wr_req, "wr", "Writes"); + plb.add_u64_counter(l_librbd_pwl_wr_bytes, "wr_bytes", "Data size in writes"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def, "wr_def", "Writes deferred for resources"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def_lanes, "wr_def_lanes", "Writes deferred for lanes"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def_log, "wr_def_log", "Writes deferred for log entries"); + plb.add_u64_counter(l_librbd_pwl_wr_req_def_buf, "wr_def_buf", "Writes deferred for buffers"); + plb.add_u64_counter(l_librbd_pwl_wr_req_overlap, "wr_overlap", "Writes overlapping with prior in-progress writes"); + plb.add_u64_counter(l_librbd_pwl_wr_req_queued, "wr_q_barrier", "Writes queued for prior barriers (aio_flush)"); + + plb.add_u64_counter(l_librbd_pwl_log_ops, "log_ops", "Log appends"); + plb.add_u64_avg(l_librbd_pwl_log_op_bytes, "log_op_bytes", "Average log append bytes"); + + plb.add_time_avg( + l_librbd_pwl_req_arr_to_all_t, "req_arr_to_all_t", + "Average arrival to allocation time (time deferred for overlap)"); + plb.add_time_avg( + l_librbd_pwl_req_arr_to_dis_t, "req_arr_to_dis_t", + "Average arrival to dispatch time (includes time deferred for overlaps and allocation)"); + plb.add_time_avg( + l_librbd_pwl_req_all_to_dis_t, "req_all_to_dis_t", + "Average allocation to dispatch time (time deferred for log resources)"); + plb.add_time_avg( + l_librbd_pwl_wr_latency, "wr_latency", + "Latency of writes (persistent completion)"); + plb.add_u64_counter_histogram( + l_librbd_pwl_wr_latency_hist, "wr_latency_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of write request latency (nanoseconds) vs. bytes written"); + plb.add_time_avg( + l_librbd_pwl_wr_caller_latency, "caller_wr_latency", + "Latency of write completion to caller"); + plb.add_time_avg( + l_librbd_pwl_nowait_req_arr_to_all_t, "req_arr_to_all_nw_t", + "Average arrival to allocation time (time deferred for overlap)"); + plb.add_time_avg( + l_librbd_pwl_nowait_req_arr_to_dis_t, "req_arr_to_dis_nw_t", + "Average arrival to dispatch time (includes time deferred for overlaps and allocation)"); + plb.add_time_avg( + l_librbd_pwl_nowait_req_all_to_dis_t, "req_all_to_dis_nw_t", + "Average allocation to dispatch time (time deferred for log resources)"); + plb.add_time_avg( + l_librbd_pwl_nowait_wr_latency, "wr_latency_nw", + "Latency of writes (persistent completion) not deferred for free space"); + plb.add_u64_counter_histogram( + l_librbd_pwl_nowait_wr_latency_hist, "wr_latency_nw_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of write request latency (nanoseconds) vs. 
bytes written for writes not deferred for free space"); + plb.add_time_avg( + l_librbd_pwl_nowait_wr_caller_latency, "caller_wr_latency_nw", + "Latency of write completion to caller for writes not deferred for free space"); + plb.add_time_avg(l_librbd_pwl_log_op_alloc_t, "op_alloc_t", "Average buffer pmemobj_reserve() time"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_alloc_t_hist, "op_alloc_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of buffer pmemobj_reserve() time (nanoseconds) vs. bytes written"); + plb.add_time_avg(l_librbd_pwl_log_op_dis_to_buf_t, "op_dis_to_buf_t", "Average dispatch to buffer persist time"); + plb.add_time_avg(l_librbd_pwl_log_op_dis_to_app_t, "op_dis_to_app_t", "Average dispatch to log append time"); + plb.add_time_avg(l_librbd_pwl_log_op_dis_to_cmp_t, "op_dis_to_cmp_t", "Average dispatch to persist completion time"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_dis_to_cmp_t_hist, "op_dis_to_cmp_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of op dispatch to persist complete time (nanoseconds) vs. bytes written"); + + plb.add_time_avg( + l_librbd_pwl_log_op_buf_to_app_t, "op_buf_to_app_t", + "Average buffer persist to log append time (write data persist/replicate + wait for append time)"); + plb.add_time_avg( + l_librbd_pwl_log_op_buf_to_bufc_t, "op_buf_to_bufc_t", + "Average buffer persist time (write data persist/replicate time)"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_buf_to_bufc_t_hist, "op_buf_to_bufc_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of write buffer persist time (nanoseconds) vs. bytes written"); + plb.add_time_avg( + l_librbd_pwl_log_op_app_to_cmp_t, "op_app_to_cmp_t", + "Average log append to persist complete time (log entry append/replicate + wait for complete time)"); + plb.add_time_avg( + l_librbd_pwl_log_op_app_to_appc_t, "op_app_to_appc_t", + "Average log append to persist complete time (log entry append/replicate time)"); + plb.add_u64_counter_histogram( + l_librbd_pwl_log_op_app_to_appc_t_hist, "op_app_to_appc_t_bytes_histogram", + op_hist_x_axis_config, op_hist_y_axis_config, + "Histogram of log append persist time (nanoseconds) (vs. 
op bytes)"); + + plb.add_u64_counter(l_librbd_pwl_discard, "discard", "Discards"); + plb.add_u64_counter(l_librbd_pwl_discard_bytes, "discard_bytes", "Bytes discarded"); + plb.add_time_avg(l_librbd_pwl_discard_latency, "discard_lat", "Discard latency"); + + plb.add_u64_counter(l_librbd_pwl_aio_flush, "aio_flush", "AIO flush (flush to RWL)"); + plb.add_u64_counter(l_librbd_pwl_aio_flush_def, "aio_flush_def", "AIO flushes deferred for resources"); + plb.add_time_avg(l_librbd_pwl_aio_flush_latency, "aio_flush_lat", "AIO flush latency"); + + plb.add_u64_counter(l_librbd_pwl_ws,"ws", "Write Sames"); + plb.add_u64_counter(l_librbd_pwl_ws_bytes, "ws_bytes", "Write Same bytes to image"); + plb.add_time_avg(l_librbd_pwl_ws_latency, "ws_lat", "Write Same latency"); + + plb.add_u64_counter(l_librbd_pwl_cmp, "cmp", "Compare and Write requests"); + plb.add_u64_counter(l_librbd_pwl_cmp_bytes, "cmp_bytes", "Compare and Write bytes compared/written"); + plb.add_time_avg(l_librbd_pwl_cmp_latency, "cmp_lat", "Compare and Write latecy"); + plb.add_u64_counter(l_librbd_pwl_cmp_fails, "cmp_fails", "Compare and Write compare fails"); + + plb.add_u64_counter(l_librbd_pwl_internal_flush, "internal_flush", "Flush RWL (write back to OSD)"); + plb.add_time_avg(l_librbd_pwl_writeback_latency, "writeback_lat", "write back to OSD latency"); + plb.add_u64_counter(l_librbd_pwl_invalidate_cache, "invalidate", "Invalidate RWL"); + plb.add_u64_counter(l_librbd_pwl_invalidate_discard_cache, "discard", "Discard and invalidate RWL"); + + plb.add_time_avg(l_librbd_pwl_append_tx_t, "append_tx_lat", "Log append transaction latency"); + plb.add_u64_counter_histogram( + l_librbd_pwl_append_tx_t_hist, "append_tx_lat_histogram", + op_hist_x_axis_config, op_hist_y_axis_count_config, + "Histogram of log append transaction time (nanoseconds) vs. entries appended"); + plb.add_time_avg(l_librbd_pwl_retire_tx_t, "retire_tx_lat", "Log retire transaction latency"); + plb.add_u64_counter_histogram( + l_librbd_pwl_retire_tx_t_hist, "retire_tx_lat_histogram", + op_hist_x_axis_config, op_hist_y_axis_count_config, + "Histogram of log retire transaction time (nanoseconds) vs. 
entries retired"); + + m_perfcounter = plb.create_perf_counters(); + m_image_ctx.cct->get_perfcounters_collection()->add(m_perfcounter); +} + +template +void AbstractWriteLog::perf_stop() { + ceph_assert(m_perfcounter); + m_image_ctx.cct->get_perfcounters_collection()->remove(m_perfcounter); + delete m_perfcounter; +} + +template +void AbstractWriteLog::log_perf() { + bufferlist bl; + Formatter *f = Formatter::create("json-pretty"); + bl.append("Perf dump follows\n--- Begin perf dump ---\n"); + bl.append("{\n"); + stringstream ss; + utime_t now = ceph_clock_now(); + ss << "\"test_time\": \"" << now << "\","; + ss << "\"image\": \"" << m_image_ctx.name << "\","; + bl.append(ss); + bl.append("\"stats\": "); + m_image_ctx.cct->get_perfcounters_collection()->dump_formatted(f, false, false); + f->flush(bl); + bl.append(",\n\"histograms\": "); + m_image_ctx.cct->get_perfcounters_collection()->dump_formatted_histograms(f, 0); + f->flush(bl); + delete f; + bl.append("}\n--- End perf dump ---\n"); + bl.append('\0'); + ldout(m_image_ctx.cct, 1) << bl.c_str() << dendl; +} + +template +void AbstractWriteLog::periodic_stats() { + std::unique_lock locker(m_lock); + ldout(m_image_ctx.cct, 5) << "STATS: m_log_entries=" << m_log_entries.size() + << ", m_dirty_log_entries=" << m_dirty_log_entries.size() + << ", m_free_log_entries=" << m_free_log_entries + << ", m_bytes_allocated=" << m_bytes_allocated + << ", m_bytes_cached=" << m_bytes_cached + << ", m_bytes_dirty=" << m_bytes_dirty + << ", bytes available=" << m_bytes_allocated_cap - m_bytes_allocated + << ", m_first_valid_entry=" << m_first_valid_entry + << ", m_first_free_entry=" << m_first_free_entry + << ", m_current_sync_gen=" << m_current_sync_gen + << ", m_flushed_sync_gen=" << m_flushed_sync_gen + << dendl; + + update_image_cache_state(); + write_image_cache_state(locker); +} + +template +void AbstractWriteLog::arm_periodic_stats() { + ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); + m_timer_ctx = new LambdaContext([this](int r) { + /* m_timer_lock is held */ + periodic_stats(); + arm_periodic_stats(); + }); + m_timer->add_event_after(LOG_STATS_INTERVAL_SECONDS, m_timer_ctx); +} + +template +void AbstractWriteLog::update_entries(std::shared_ptr *log_entry, + WriteLogCacheEntry *cache_entry, std::map &missing_sync_points, + std::map> &sync_point_entries, + uint64_t entry_index) { + bool writer = cache_entry->is_writer(); + if (cache_entry->is_sync_point()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a sync point. cache_entry=[" << *cache_entry << "]" << dendl; + auto sync_point_entry = std::make_shared(cache_entry->sync_gen_number); + *log_entry = sync_point_entry; + sync_point_entries[cache_entry->sync_gen_number] = sync_point_entry; + missing_sync_points.erase(cache_entry->sync_gen_number); + m_current_sync_gen = cache_entry->sync_gen_number; + } else if (cache_entry->is_write()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a write. cache_entry=[" << *cache_entry << "]" << dendl; + auto write_entry = + m_builder->create_write_log_entry(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes); + write_data_to_buffer(write_entry, cache_entry); + *log_entry = write_entry; + } else if (cache_entry->is_writesame()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a write same. 
cache_entry=[" << *cache_entry << "]" << dendl; + auto ws_entry = + m_builder->create_writesame_log_entry(nullptr, cache_entry->image_offset_bytes, + cache_entry->write_bytes, cache_entry->ws_datalen); + write_data_to_buffer(ws_entry, cache_entry); + *log_entry = ws_entry; + } else if (cache_entry->is_discard()) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " is a discard. cache_entry=[" << *cache_entry << "]" << dendl; + auto discard_entry = + std::make_shared(nullptr, cache_entry->image_offset_bytes, cache_entry->write_bytes, + m_discard_granularity_bytes); + *log_entry = discard_entry; + } else { + lderr(m_image_ctx.cct) << "Unexpected entry type in entry " << entry_index + << ", cache_entry=[" << *cache_entry << "]" << dendl; + } + + if (writer) { + ldout(m_image_ctx.cct, 20) << "Entry " << entry_index + << " writes. cache_entry=[" << *cache_entry << "]" << dendl; + if (!sync_point_entries[cache_entry->sync_gen_number]) { + missing_sync_points[cache_entry->sync_gen_number] = true; + } + } +} + +template +void AbstractWriteLog::update_sync_points(std::map &missing_sync_points, + std::map> &sync_point_entries, + DeferredContexts &later) { + /* Create missing sync points. These must not be appended until the + * entry reload is complete and the write map is up to + * date. Currently this is handled by the deferred contexts object + * passed to new_sync_point(). These contexts won't be completed + * until this function returns. */ + for (auto &kv : missing_sync_points) { + ldout(m_image_ctx.cct, 5) << "Adding sync point " << kv.first << dendl; + if (0 == m_current_sync_gen) { + /* The unlikely case where the log contains writing entries, but no sync + * points (e.g. because they were all retired) */ + m_current_sync_gen = kv.first-1; + } + ceph_assert(kv.first == m_current_sync_gen+1); + init_flush_new_sync_point(later); + ceph_assert(kv.first == m_current_sync_gen); + sync_point_entries[kv.first] = m_current_sync_point->log_entry; + } + + /* + * Iterate over the log entries again (this time via the global + * entries list), connecting write entries to their sync points and + * updating the sync point stats. + * + * Add writes to the write log map. + */ + std::shared_ptr previous_sync_point_entry = nullptr; + for (auto &log_entry : m_log_entries) { + if ((log_entry->write_bytes() > 0) || (log_entry->bytes_dirty() > 0)) { + /* This entry is one of the types that write */ + auto gen_write_entry = static_pointer_cast(log_entry); + if (gen_write_entry) { + auto sync_point_entry = sync_point_entries[gen_write_entry->ram_entry.sync_gen_number]; + if (!sync_point_entry) { + lderr(m_image_ctx.cct) << "Sync point missing for entry=[" << *gen_write_entry << "]" << dendl; + ceph_assert(false); + } else { + gen_write_entry->sync_point_entry = sync_point_entry; + sync_point_entry->writes++; + sync_point_entry->bytes += gen_write_entry->ram_entry.write_bytes; + sync_point_entry->writes_completed++; + m_blocks_to_log_entries.add_log_entry(gen_write_entry); + /* This entry is only dirty if its sync gen number is > the flushed + * sync gen number from the root object. 
*/ + if (gen_write_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) { + m_dirty_log_entries.push_back(log_entry); + m_bytes_dirty += gen_write_entry->bytes_dirty(); + } else { + gen_write_entry->set_flushed(true); + sync_point_entry->writes_flushed++; + } + + /* calc m_bytes_allocated & m_bytes_cached */ + inc_allocated_cached_bytes(log_entry); + } + } + } else { + /* This entry is a sync point entry */ + auto sync_point_entry = static_pointer_cast<SyncPointLogEntry>(log_entry); + if (sync_point_entry) { + if (previous_sync_point_entry) { + previous_sync_point_entry->next_sync_point_entry = sync_point_entry; + if (previous_sync_point_entry->ram_entry.sync_gen_number > m_flushed_sync_gen) { + sync_point_entry->prior_sync_point_flushed = false; + ceph_assert(!previous_sync_point_entry->prior_sync_point_flushed || + (0 == previous_sync_point_entry->writes) || + (previous_sync_point_entry->writes >= previous_sync_point_entry->writes_flushed)); + } else { + sync_point_entry->prior_sync_point_flushed = true; + ceph_assert(previous_sync_point_entry->prior_sync_point_flushed); + ceph_assert(previous_sync_point_entry->writes == previous_sync_point_entry->writes_flushed); + } + } else { + /* There are no previous sync points, so we'll consider them flushed */ + sync_point_entry->prior_sync_point_flushed = true; + } + previous_sync_point_entry = sync_point_entry; + ldout(m_image_ctx.cct, 10) << "Loaded to sync point=[" << *sync_point_entry << "]" << dendl; + } + } + } + if (0 == m_current_sync_gen) { + /* If a re-opened log was completely flushed, we'll have found no sync point entries here, + * and not advanced m_current_sync_gen. Here we ensure it starts past the last flushed sync + * point recorded in the log. */ + m_current_sync_gen = m_flushed_sync_gen; + } +} + +template <typename I> +void AbstractWriteLog<I>::pwl_init(Context *on_finish, DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + ceph_assert(m_cache_state); + std::lock_guard locker(m_lock); + ceph_assert(!m_initialized); + ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl; + + if (!m_cache_state->present) { + m_cache_state->host = ceph_get_short_hostname(); + m_cache_state->size = m_image_ctx.config.template get_val<uint64_t>( + "rbd_persistent_cache_size"); + + string path = m_image_ctx.config.template get_val<std::string>( + "rbd_persistent_cache_path"); + std::string pool_name = m_image_ctx.md_ctx.get_pool_name(); + m_cache_state->path = path + "/rbd-pwl." + pool_name + "." + m_image_ctx.id + ".pool"; + } + + ldout(cct,5) << "pwl_size: " << m_cache_state->size << dendl; + ldout(cct,5) << "pwl_path: " << m_cache_state->path << dendl; + + m_log_pool_name = m_cache_state->path; + m_log_pool_size = max(m_cache_state->size, MIN_POOL_SIZE); + m_log_pool_size = p2align(m_log_pool_size, POOL_SIZE_ALIGN); + ldout(cct, 5) << "pool " << m_log_pool_name << " size " << m_log_pool_size + << " (adjusted from " << m_cache_state->size << ")" << dendl; + + if ((!m_cache_state->present) && + (access(m_log_pool_name.c_str(), F_OK) == 0)) { + ldout(cct, 5) << "There's an existing pool file " << m_log_pool_name + << ", while there's no cache in the image metadata." << dendl; + if (remove(m_log_pool_name.c_str()) != 0) { + lderr(cct) << "failed to remove the pool file " << m_log_pool_name + << dendl; + on_finish->complete(-errno); + return; + } else { + ldout(cct, 5) << "Removed the existing pool file." 
<< dendl; + } + } else if ((m_cache_state->present) && + (access(m_log_pool_name.c_str(), F_OK) != 0)) { + lderr(cct) << "can't find the existing pool file: " << m_log_pool_name + << ". error: " << cpp_strerror(-errno) << dendl; + on_finish->complete(-errno); + return; + } + + bool succeeded = initialize_pool(on_finish, later); + if (!succeeded) { + return; + } + + ldout(cct,1) << "pool " << m_log_pool_name << " has " << m_total_log_entries + << " log entries, " << m_free_log_entries << " of which are free." + << " first_valid=" << m_first_valid_entry + << ", first_free=" << m_first_free_entry + << ", flushed_sync_gen=" << m_flushed_sync_gen + << ", m_current_sync_gen=" << m_current_sync_gen << dendl; + if (m_first_free_entry == m_first_valid_entry) { + ldout(cct,1) << "write log is empty" << dendl; + m_cache_state->empty = true; + } + + /* Start the sync point following the last one seen in the + * log. Flush the last sync point created during the loading of the + * existing log entries. */ + init_flush_new_sync_point(later); + ldout(cct,20) << "new sync point = [" << m_current_sync_point << "]" << dendl; + + m_initialized = true; + // Start the thread + m_thread_pool.start(); + + /* Do these after we drop lock */ + later.add(new LambdaContext([this](int r) { + /* Log stats for the first time */ + periodic_stats(); + /* Arm periodic stats logging for the first time */ + std::lock_guard timer_locker(*m_timer_lock); + arm_periodic_stats(); + })); + m_image_ctx.op_work_queue->queue(on_finish, 0); +} + +template <typename I> +void AbstractWriteLog<I>::write_image_cache_state(std::unique_lock<ceph::mutex>& locker) { + using klass = AbstractWriteLog<I>; + Context *ctx = util::create_context_callback< + klass, &klass::handle_write_image_cache_state>(this); + m_cache_state->write_image_cache_state(locker, ctx); +} + +template <typename I> +void AbstractWriteLog<I>::update_image_cache_state() { + ldout(m_image_ctx.cct, 10) << dendl; + + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + m_cache_state->allocated_bytes = m_bytes_allocated; + m_cache_state->cached_bytes = m_bytes_cached; + m_cache_state->dirty_bytes = m_bytes_dirty; + m_cache_state->free_bytes = m_bytes_allocated_cap - m_bytes_allocated; + m_cache_state->hits_full = m_perfcounter->get(l_librbd_pwl_rd_hit_req); + m_cache_state->hits_partial = m_perfcounter->get(l_librbd_pwl_rd_part_hit_req); + m_cache_state->misses = m_perfcounter->get(l_librbd_pwl_rd_req) - + m_cache_state->hits_full - m_cache_state->hits_partial; + m_cache_state->hit_bytes = m_perfcounter->get(l_librbd_pwl_rd_hit_bytes); + m_cache_state->miss_bytes = m_perfcounter->get(l_librbd_pwl_rd_bytes) - + m_cache_state->hit_bytes; +} + +template <typename I> +void AbstractWriteLog<I>::handle_write_image_cache_state(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to update image cache state: " << cpp_strerror(r) + << dendl; + return; + } +} + +template <typename I> +void AbstractWriteLog<I>::init(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + auto pname = std::string("librbd-pwl-") + m_image_ctx.id + + std::string("-") + m_image_ctx.md_ctx.get_pool_name() + + std::string("-") + m_image_ctx.name; + perf_start(pname); + + ceph_assert(!m_initialized); + + Context *ctx = new LambdaContext( + [this, on_finish](int r) { + if (r >= 0) { + std::unique_lock locker(m_lock); + update_image_cache_state(); + m_cache_state->write_image_cache_state(locker, on_finish); + } else { + on_finish->complete(r); + } + }); + + DeferredContexts later; + 
pwl_init(ctx, later); +} + +template +void AbstractWriteLog::shut_down(Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + ldout(cct,5) << "image name: " << m_image_ctx.name << " id: " << m_image_ctx.id << dendl; + + Context *ctx = new LambdaContext( + [this, on_finish](int r) { + if (m_perfcounter) { + perf_stop(); + } + ldout(m_image_ctx.cct, 6) << "shutdown complete" << dendl; + m_image_ctx.op_work_queue->queue(on_finish, r); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + ldout(m_image_ctx.cct, 6) << "image cache cleaned" << dendl; + Context *next_ctx = override_ctx(r, ctx); + periodic_stats(); + + std::unique_lock locker(m_lock); + check_image_cache_state_clean(); + m_wake_up_enabled = false; + m_log_entries.clear(); + m_cache_state->clean = true; + m_cache_state->empty = true; + remove_pool_file(); + update_image_cache_state(); + m_cache_state->write_image_cache_state(locker, next_ctx); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + Context *next_ctx = override_ctx(r, ctx); + ldout(m_image_ctx.cct, 6) << "waiting for in flight operations" << dendl; + // Wait for in progress IOs to complete + next_ctx = util::create_async_context_callback(&m_work_queue, next_ctx); + m_async_op_tracker.wait_for_ops(next_ctx); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + Context *next_ctx = override_ctx(r, ctx); + { + /* Sync with process_writeback_dirty_entries() */ + RWLock::WLocker entry_reader_wlocker(m_entry_reader_lock); + m_shutting_down = true; + /* Flush all writes to OSDs (unless disabled) and wait for all + in-progress flush writes to complete */ + ldout(m_image_ctx.cct, 6) << "flushing" << dendl; + periodic_stats(); + } + flush_dirty_entries(next_ctx); + }); + ctx = new LambdaContext( + [this, ctx](int r) { + ldout(m_image_ctx.cct, 6) << "Done internal_flush in shutdown" << dendl; + m_work_queue.queue(ctx, r); + }); + /* Complete all in-flight writes before shutting down */ + ldout(m_image_ctx.cct, 6) << "internal_flush in shutdown" << dendl; + internal_flush(false, ctx); +} + +template +void AbstractWriteLog::read(Extents&& image_extents, + ceph::bufferlist* bl, + int fadvise_flags, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + utime_t now = ceph_clock_now(); + + on_finish = new LambdaContext( + [this, on_finish](int r) { + m_async_op_tracker.finish_op(); + on_finish->complete(r); + }); + C_ReadRequest *read_ctx = m_builder->create_read_request( + cct, now, m_perfcounter, bl, on_finish); + ldout(cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id + << "image_extents=" << image_extents + << ", bl=" << bl + << ", on_finish=" << on_finish << dendl; + + ceph_assert(m_initialized); + bl->clear(); + m_perfcounter->inc(l_librbd_pwl_rd_req, 1); + + std::vector> log_entries_to_read; + std::vector bls_to_read; + + m_async_op_tracker.start_op(); + Context *ctx = new LambdaContext( + [this, read_ctx, fadvise_flags](int r) { + if (read_ctx->miss_extents.empty()) { + /* All of this read comes from RWL */ + read_ctx->complete(0); + } else { + /* Pass the read misses on to the layer below RWL */ + m_image_writeback.aio_read( + std::move(read_ctx->miss_extents), &read_ctx->miss_bl, + fadvise_flags, read_ctx); + } + }); + + /* + * The strategy here is to look up all the WriteLogMapEntries that overlap + * this read, and iterate through those to separate this read into hits and + * misses. A new Extents object is produced here with Extents for each miss + * region. 
The miss Extents is then passed on to the read cache below RWL. We + * also produce an ImageExtentBufs for all the extents (hit or miss) in this + * read. When the read from the lower cache layer completes, we iterate + * through the ImageExtentBufs and insert buffers for each cache hit at the + * appropriate spot in the bufferlist returned from below for the miss + * read. The buffers we insert here refer directly to regions of various + * write log entry data buffers. + * + * Locking: These buffer objects hold a reference on the write log entries + * they refer to. Log entries can't be retired until there are no references. + * The GenericWriteLogEntry references are released by the buffer destructor. + */ + for (auto &extent : image_extents) { + uint64_t extent_offset = 0; + RWLock::RLocker entry_reader_locker(m_entry_reader_lock); + WriteLogMapEntries map_entries = m_blocks_to_log_entries.find_map_entries( + block_extent(extent)); + for (auto &map_entry : map_entries) { + Extent entry_image_extent(pwl::image_extent(map_entry.block_extent)); + /* If this map entry starts after the current image extent offset ... */ + if (entry_image_extent.first > extent.first + extent_offset) { + /* ... add range before map_entry to miss extents */ + uint64_t miss_extent_start = extent.first + extent_offset; + uint64_t miss_extent_length = entry_image_extent.first - + miss_extent_start; + Extent miss_extent(miss_extent_start, miss_extent_length); + read_ctx->miss_extents.push_back(miss_extent); + /* Add miss range to read extents */ + auto miss_extent_buf = std::make_shared(miss_extent); + read_ctx->read_extents.push_back(miss_extent_buf); + extent_offset += miss_extent_length; + } + ceph_assert(entry_image_extent.first <= extent.first + extent_offset); + uint64_t entry_offset = 0; + /* If this map entry starts before the current image extent offset ... */ + if (entry_image_extent.first < extent.first + extent_offset) { + /* ... compute offset into log entry for this read extent */ + entry_offset = (extent.first + extent_offset) - entry_image_extent.first; + } + /* This read hit ends at the end of the extent or the end of the log + entry, whichever is less. 
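A minimal standalone sketch of this hit/miss split may help picture it (a hypothetical helper, not the RWL code): given one read extent and the cached sub-extents that overlap it, already sorted by offset, clipped to the read, and non-overlapping, emit a miss for each gap and a hit for each overlap.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Extent = std::pair<uint64_t, uint64_t>;  // {offset, length}

// Assumes "cached" is sorted, non-overlapping, and clipped to "read".
void split_hits_and_misses(const Extent &read,
                           const std::vector<Extent> &cached,
                           std::vector<Extent> &hits,
                           std::vector<Extent> &misses) {
  uint64_t pos = read.first;
  const uint64_t end = read.first + read.second;
  for (const auto &c : cached) {
    if (c.first > pos) {
      misses.push_back({pos, c.first - pos});  // gap before this cached run
    }
    uint64_t hit_end = std::min(end, c.first + c.second);
    hits.push_back({c.first, hit_end - c.first});
    pos = hit_end;
  }
  if (pos < end) {
    misses.push_back({pos, end - pos});        // tail beyond the last hit
  }
}
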
*/ + uint64_t entry_hit_length = min(entry_image_extent.second - entry_offset, + extent.second - extent_offset); + Extent hit_extent(entry_image_extent.first, entry_hit_length); + if (0 == map_entry.log_entry->write_bytes() && + 0 < map_entry.log_entry->bytes_dirty()) { + /* discard log entry */ + ldout(cct, 20) << "discard log entry" << dendl; + auto discard_entry = map_entry.log_entry; + ldout(cct, 20) << "read hit on discard entry: log_entry=" + << *discard_entry + << dendl; + /* Discards read as zero, so we'll construct a bufferlist of zeros */ + bufferlist zero_bl; + zero_bl.append_zero(entry_hit_length); + /* Add hit extent to read extents */ + auto hit_extent_buf = std::make_shared( + hit_extent, zero_bl); + read_ctx->read_extents.push_back(hit_extent_buf); + } else { + ldout(cct, 20) << "write or writesame log entry" << dendl; + /* write and writesame log entry */ + /* Offset of the map entry into the log entry's buffer */ + uint64_t map_entry_buffer_offset = entry_image_extent.first - + map_entry.log_entry->ram_entry.image_offset_bytes; + /* Offset into the log entry buffer of this read hit */ + uint64_t read_buffer_offset = map_entry_buffer_offset + entry_offset; + /* Create buffer object referring to pmem pool for this read hit */ + collect_read_extents( + read_buffer_offset, map_entry, log_entries_to_read, bls_to_read, + entry_hit_length, hit_extent, read_ctx); + } + /* Exclude RWL hit range from buffer and extent */ + extent_offset += entry_hit_length; + ldout(cct, 20) << map_entry << dendl; + } + /* If the last map entry didn't consume the entire image extent ... */ + if (extent.second > extent_offset) { + /* ... add the rest of this extent to miss extents */ + uint64_t miss_extent_start = extent.first + extent_offset; + uint64_t miss_extent_length = extent.second - extent_offset; + Extent miss_extent(miss_extent_start, miss_extent_length); + read_ctx->miss_extents.push_back(miss_extent); + /* Add miss range to read extents */ + auto miss_extent_buf = std::make_shared(miss_extent); + read_ctx->read_extents.push_back(miss_extent_buf); + extent_offset += miss_extent_length; + } + } + + ldout(cct, 20) << "miss_extents=" << read_ctx->miss_extents + << ", miss_bl=" << read_ctx->miss_bl << dendl; + + complete_read(log_entries_to_read, bls_to_read, ctx); +} + +template +void AbstractWriteLog::write(Extents &&image_extents, + bufferlist&& bl, + int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << "aio_write" << dendl; + + utime_t now = ceph_clock_now(); + m_perfcounter->inc(l_librbd_pwl_wr_req, 1); + + ceph_assert(m_initialized); + + /* Split image extents larger than 1M. This isn't strictly necessary but + * makes libpmemobj allocator's job easier and reduces pmemobj_defrag() cost. + * We plan to manage pmem space and allocation by ourselves in the future. 
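The cap-and-carve step described above can be sketched independently (a hypothetical helper under the stated assumption; the real code keys the cap off get_max_extent()):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Extent = std::pair<uint64_t, uint64_t>;  // {offset, length}

// Carve an extent into pieces no larger than max_extent_size.
std::vector<Extent> split_extent(Extent e, uint64_t max_extent_size) {
  std::vector<Extent> out;
  while (e.second > 0) {
    uint64_t len = std::min(e.second, max_extent_size);
    out.push_back({e.first, len});
    e.first += len;
    e.second -= len;
  }
  return out;
}
// split_extent({0, 2621440}, 1 << 20) -> {0, 1M}, {1M, 1M}, {2M, 0.5M}
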
+ */ + Extents split_image_extents; + uint64_t max_extent_size = get_max_extent(); + if (max_extent_size != 0) { + for (auto extent : image_extents) { + if (extent.second > max_extent_size) { + uint64_t off = extent.first; + uint64_t extent_bytes = extent.second; + for (int i = 0; extent_bytes != 0; ++i) { + Extent _ext; + _ext.first = off + i * max_extent_size; + _ext.second = std::min(max_extent_size, extent_bytes); + extent_bytes = extent_bytes - _ext.second ; + split_image_extents.emplace_back(_ext); + } + } else { + split_image_extents.emplace_back(extent); + } + } + } else { + split_image_extents = image_extents; + } + + C_WriteRequestT *write_req = + m_builder->create_write_request(*this, now, std::move(split_image_extents), + std::move(bl), fadvise_flags, m_lock, + m_perfcounter, on_finish); + m_perfcounter->inc(l_librbd_pwl_wr_bytes, + write_req->image_extents_summary.total_bytes); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, + write_req](GuardedRequestFunctionContext &guard_ctx) { + write_req->blockguard_acquired(guard_ctx); + alloc_and_dispatch_io_req(write_req); + }); + + detain_guarded_request(write_req, guarded_ctx, false); +} + +template +void AbstractWriteLog::discard(uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << dendl; + + utime_t now = ceph_clock_now(); + m_perfcounter->inc(l_librbd_pwl_discard, 1); + Extents discard_extents = {{offset, length}}; + m_discard_granularity_bytes = discard_granularity_bytes; + + ceph_assert(m_initialized); + + auto *discard_req = + new C_DiscardRequestT(*this, now, std::move(discard_extents), discard_granularity_bytes, + m_lock, m_perfcounter, on_finish); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, discard_req](GuardedRequestFunctionContext &guard_ctx) { + discard_req->blockguard_acquired(guard_ctx); + alloc_and_dispatch_io_req(discard_req); + }); + + detain_guarded_request(discard_req, guarded_ctx, false); +} + +/** + * Aio_flush completes when all previously completed writes are + * flushed to persistent cache. We make a best-effort attempt to also + * defer until all in-progress writes complete, but we may not know + * about all of the writes the application considers in-progress yet, + * due to uncertainty in the IO submission workq (multiple WQ threads + * may allow out-of-order submission). + * + * This flush operation will not wait for writes deferred for overlap + * in the block guard. 
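In terms of the public librbd C++ API, the guarantee just described reads roughly as follows (a hedged usage sketch, with the AioCompletion plumbing simplified and the signatures assumed from librbd.hpp):

#include <rbd/librbd.hpp>

void flush_ordering_example(librbd::Image &image, ceph::bufferlist &bl) {
  auto *a = new librbd::RBD::AioCompletion(nullptr, nullptr);
  image.aio_write(0, bl.length(), bl, a);
  a->wait_for_complete();   // write A completed to the application...
  a->release();

  auto *b = new librbd::RBD::AioCompletion(nullptr, nullptr);
  image.aio_write(1 << 20, bl.length(), bl, b);  // write B still in flight

  auto *f = new librbd::RBD::AioCompletion(nullptr, nullptr);
  image.aio_flush(f);       // ...so this flush must cover A; B is covered
  f->wait_for_complete();   // only if it happened to complete before the
  f->release();             // flush was submitted.
  b->wait_for_complete();
  b->release();
}
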
+ */ +template +void AbstractWriteLog::flush(io::FlushSource flush_source, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "on_finish=" << on_finish << " flush_source=" << flush_source << dendl; + + if (io::FLUSH_SOURCE_SHUTDOWN == flush_source || io::FLUSH_SOURCE_INTERNAL == flush_source || + io::FLUSH_SOURCE_WRITE_BLOCK == flush_source) { + internal_flush(false, on_finish); + return; + } + m_perfcounter->inc(l_librbd_pwl_aio_flush, 1); + + /* May be called even if initialization fails */ + if (!m_initialized) { + ldout(cct, 05) << "never initialized" << dendl; + /* Deadlock if completed here */ + m_image_ctx.op_work_queue->queue(on_finish, 0); + return; + } + + { + std::shared_lock image_locker(m_image_ctx.image_lock); + if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) { + on_finish->complete(-EROFS); + return; + } + } + + auto flush_req = make_flush_req(on_finish); + + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, flush_req](GuardedRequestFunctionContext &guard_ctx) { + ldout(m_image_ctx.cct, 20) << "flush_req=" << flush_req << " cell=" << guard_ctx.cell << dendl; + ceph_assert(guard_ctx.cell); + flush_req->detained = guard_ctx.state.detained; + /* We don't call flush_req->set_cell(), because the block guard will be released here */ + { + DeferredContexts post_unlock; /* Do these when the lock below is released */ + std::lock_guard locker(m_lock); + + if (!m_persist_on_flush && m_persist_on_write_until_flush) { + m_persist_on_flush = true; + ldout(m_image_ctx.cct, 5) << "now persisting on flush" << dendl; + } + + /* + * Create a new sync point if there have been writes since the last + * one. + * + * We do not flush the caches below the RWL here. + */ + flush_new_sync_point_if_needed(flush_req, post_unlock); + } + + release_guarded_request(guard_ctx.cell); + }); + + detain_guarded_request(flush_req, guarded_ctx, true); +} + +template +void AbstractWriteLog::writesame(uint64_t offset, uint64_t length, + bufferlist&& bl, int fadvise_flags, + Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << "aio_writesame" << dendl; + + utime_t now = ceph_clock_now(); + Extents ws_extents = {{offset, length}}; + m_perfcounter->inc(l_librbd_pwl_ws, 1); + ceph_assert(m_initialized); + + /* A write same request is also a write request. The key difference is the + * write same data buffer is shorter than the extent of the request. The full + * extent will be used in the block guard, and appear in + * m_blocks_to_log_entries_map. The data buffer allocated for the WS is only + * as long as the length of the bl here, which is the pattern that's repeated + * in the image for the entire length of this WS. Read hits and flushing of + * write sames are different than normal writes. 
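The short-pattern-buffer behavior can be illustrated with a tiny standalone sketch (a hypothetical helper): byte i of the WS extent is pattern[i % pattern.size()], which is why a read hit can be served from a buffer much shorter than the extent.

#include <cstdint>
#include <vector>

// Serve "len" bytes starting at "offset_in_extent" of a write-same extent
// from its repeating pattern buffer. "pattern" must be non-empty.
std::vector<uint8_t> read_from_writesame(const std::vector<uint8_t> &pattern,
                                         uint64_t offset_in_extent,
                                         uint64_t len) {
  std::vector<uint8_t> out(len);
  for (uint64_t i = 0; i < len; ++i) {
    out[i] = pattern[(offset_in_extent + i) % pattern.size()];
  }
  return out;
}
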
*/ + C_WriteSameRequestT *ws_req = + m_builder->create_writesame_request(*this, now, std::move(ws_extents), std::move(bl), + fadvise_flags, m_lock, m_perfcounter, on_finish); + m_perfcounter->inc(l_librbd_pwl_ws_bytes, ws_req->image_extents_summary.total_bytes); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, ws_req](GuardedRequestFunctionContext &guard_ctx) { + ws_req->blockguard_acquired(guard_ctx); + alloc_and_dispatch_io_req(ws_req); + }); + + detain_guarded_request(ws_req, guarded_ctx, false); +} + +template +void AbstractWriteLog::compare_and_write(Extents &&image_extents, + bufferlist&& cmp_bl, + bufferlist&& bl, + uint64_t *mismatch_offset, + int fadvise_flags, + Context *on_finish) { + ldout(m_image_ctx.cct, 20) << dendl; + + utime_t now = ceph_clock_now(); + m_perfcounter->inc(l_librbd_pwl_cmp, 1); + ceph_assert(m_initialized); + + /* A compare and write request is also a write request. We only allocate + * resources and dispatch this write request if the compare phase + * succeeds. */ + C_WriteRequestT *cw_req = + m_builder->create_comp_and_write_request( + *this, now, std::move(image_extents), std::move(cmp_bl), std::move(bl), + mismatch_offset, fadvise_flags, m_lock, m_perfcounter, on_finish); + m_perfcounter->inc(l_librbd_pwl_cmp_bytes, cw_req->image_extents_summary.total_bytes); + + /* The lambda below will be called when the block guard for all + * blocks affected by this write is obtained */ + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, cw_req](GuardedRequestFunctionContext &guard_ctx) { + cw_req->blockguard_acquired(guard_ctx); + + auto read_complete_ctx = new LambdaContext( + [this, cw_req](int r) { + ldout(m_image_ctx.cct, 20) << "name: " << m_image_ctx.name << " id: " << m_image_ctx.id + << "cw_req=" << cw_req << dendl; + + /* Compare read_bl to cmp_bl to determine if this will produce a write */ + ceph_assert(cw_req->read_bl.length() <= cw_req->cmp_bl.length()); + ceph_assert(cw_req->read_bl.length() == cw_req->image_extents_summary.total_bytes); + bufferlist sub_cmp_bl; + sub_cmp_bl.substr_of(cw_req->cmp_bl, 0, cw_req->read_bl.length()); + if (sub_cmp_bl.contents_equal(cw_req->read_bl)) { + /* Compare phase succeeds. Begin write */ + ldout(m_image_ctx.cct, 5) << " cw_req=" << cw_req << " compare matched" << dendl; + cw_req->compare_succeeded = true; + *cw_req->mismatch_offset = 0; + /* Continue with this request as a write. Blockguard release and + * user request completion handled as if this were a plain + * write. */ + alloc_and_dispatch_io_req(cw_req); + } else { + /* Compare phase fails. Comp-and write ends now. 
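The mismatch-offset scan performed on compare failure can be pictured in isolation (a hypothetical helper mirroring the loop in the code below):

#include <algorithm>
#include <cstddef>
#include <string_view>

// Index of the first differing byte between the compare buffer and the
// data read back; returns the compared length if no difference is found.
size_t find_mismatch_offset(std::string_view cmp, std::string_view read) {
  size_t n = std::min(cmp.size(), read.size());
  for (size_t i = 0; i < n; ++i) {
    if (cmp[i] != read[i]) {
      return i;  // reported to the caller via *mismatch_offset
    }
  }
  return n;
}
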
*/ + ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " compare failed" << dendl; + /* Bufferlist doesn't tell us where they differed, so we'll have to determine that here */ + uint64_t bl_index = 0; + for (bl_index = 0; bl_index < sub_cmp_bl.length(); bl_index++) { + if (sub_cmp_bl[bl_index] != cw_req->read_bl[bl_index]) { + ldout(m_image_ctx.cct, 15) << " cw_req=" << cw_req << " mismatch at " << bl_index << dendl; + break; + } + } + cw_req->compare_succeeded = false; + *cw_req->mismatch_offset = bl_index; + cw_req->complete_user_request(-EILSEQ); + cw_req->release_cell(); + cw_req->complete(0); + } + }); + + /* Read phase of comp-and-write must read through RWL */ + Extents image_extents_copy = cw_req->image_extents; + read(std::move(image_extents_copy), &cw_req->read_bl, cw_req->fadvise_flags, read_complete_ctx); + }); + + detain_guarded_request(cw_req, guarded_ctx, false); +} + +template +void AbstractWriteLog::flush(Context *on_finish) { + internal_flush(false, on_finish); +} + +template +void AbstractWriteLog::invalidate(Context *on_finish) { + internal_flush(true, on_finish); +} + +template +CephContext *AbstractWriteLog::get_context() { + return m_image_ctx.cct; +} + +template +BlockGuardCell* AbstractWriteLog::detain_guarded_request_helper(GuardedRequest &req) +{ + CephContext *cct = m_image_ctx.cct; + BlockGuardCell *cell; + + ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock)); + ldout(cct, 20) << dendl; + + int r = m_write_log_guard.detain(req.block_extent, &req, &cell); + ceph_assert(r>=0); + if (r > 0) { + ldout(cct, 20) << "detaining guarded request due to in-flight requests: " + << "req=" << req << dendl; + return nullptr; + } + + ldout(cct, 20) << "in-flight request cell: " << cell << dendl; + return cell; +} + +template +BlockGuardCell* AbstractWriteLog::detain_guarded_request_barrier_helper( + GuardedRequest &req) +{ + BlockGuardCell *cell = nullptr; + + ceph_assert(ceph_mutex_is_locked_by_me(m_blockguard_lock)); + ldout(m_image_ctx.cct, 20) << dendl; + + if (m_barrier_in_progress) { + req.guard_ctx->state.queued = true; + m_awaiting_barrier.push_back(req); + } else { + bool barrier = req.guard_ctx->state.barrier; + if (barrier) { + m_barrier_in_progress = true; + req.guard_ctx->state.current_barrier = true; + } + cell = detain_guarded_request_helper(req); + if (barrier) { + /* Only non-null if the barrier acquires the guard now */ + m_barrier_cell = cell; + } + } + + return cell; +} + +template +void AbstractWriteLog::detain_guarded_request( + C_BlockIORequestT *request, + GuardedRequestFunctionContext *guarded_ctx, + bool is_barrier) +{ + BlockExtent extent; + if (request) { + extent = request->image_extents_summary.block_extent(); + } else { + extent = block_extent(whole_volume_extent()); + } + auto req = GuardedRequest(extent, guarded_ctx, is_barrier); + BlockGuardCell *cell = nullptr; + + ldout(m_image_ctx.cct, 20) << dendl; + { + std::lock_guard locker(m_blockguard_lock); + cell = detain_guarded_request_barrier_helper(req); + } + if (cell) { + req.guard_ctx->cell = cell; + req.guard_ctx->complete(0); + } +} + +template +void AbstractWriteLog::release_guarded_request(BlockGuardCell *released_cell) +{ + CephContext *cct = m_image_ctx.cct; + WriteLogGuard::BlockOperations block_reqs; + ldout(cct, 20) << "released_cell=" << released_cell << dendl; + + { + std::lock_guard locker(m_blockguard_lock); + m_write_log_guard.release(released_cell, &block_reqs); + + for (auto &req : block_reqs) { + req.guard_ctx->state.detained = true; + BlockGuardCell 
*detained_cell = detain_guarded_request_helper(req); + if (detained_cell) { + if (req.guard_ctx->state.current_barrier) { + /* The current barrier is acquiring the block guard, so now we know its cell */ + m_barrier_cell = detained_cell; + /* detained_cell could be == released_cell here */ + ldout(cct, 20) << "current barrier cell=" << detained_cell << " req=" << req << dendl; + } + req.guard_ctx->cell = detained_cell; + m_work_queue.queue(req.guard_ctx); + } + } + + if (m_barrier_in_progress && (released_cell == m_barrier_cell)) { + ldout(cct, 20) << "current barrier released cell=" << released_cell << dendl; + /* The released cell is the current barrier request */ + m_barrier_in_progress = false; + m_barrier_cell = nullptr; + /* Move waiting requests into the blockguard. Stop if there's another barrier */ + while (!m_barrier_in_progress && !m_awaiting_barrier.empty()) { + auto &req = m_awaiting_barrier.front(); + ldout(cct, 20) << "submitting queued request to blockguard: " << req << dendl; + BlockGuardCell *detained_cell = detain_guarded_request_barrier_helper(req); + if (detained_cell) { + req.guard_ctx->cell = detained_cell; + m_work_queue.queue(req.guard_ctx); + } + m_awaiting_barrier.pop_front(); + } + } + } + + ldout(cct, 20) << "exit" << dendl; +} + +template +void AbstractWriteLog::append_scheduled(GenericLogOperations &ops, bool &ops_remain, + bool &appending, bool isRWL) +{ + const unsigned long int OPS_APPENDED = isRWL ? MAX_ALLOC_PER_TRANSACTION + : MAX_WRITES_PER_SYNC_POINT; + { + std::lock_guard locker(m_lock); + if (!appending && m_appending) { + /* Another thread is appending */ + ldout(m_image_ctx.cct, 15) << "Another thread is appending" << dendl; + return; + } + if (m_ops_to_append.size()) { + appending = true; + m_appending = true; + auto last_in_batch = m_ops_to_append.begin(); + unsigned int ops_to_append = m_ops_to_append.size(); + if (ops_to_append > OPS_APPENDED) { + ops_to_append = OPS_APPENDED; + } + std::advance(last_in_batch, ops_to_append); + ops.splice(ops.end(), m_ops_to_append, m_ops_to_append.begin(), last_in_batch); + ops_remain = true; /* Always check again before leaving */ + ldout(m_image_ctx.cct, 20) << "appending " << ops.size() << ", remain " + << m_ops_to_append.size() << dendl; + } else if (isRWL) { + ops_remain = false; + if (appending) { + appending = false; + m_appending = false; + } + } + } +} + +template +void AbstractWriteLog::schedule_append(GenericLogOperationsVector &ops, C_BlockIORequestT *req) +{ + GenericLogOperations to_append(ops.begin(), ops.end()); + + schedule_append_ops(to_append, req); +} + +template +void AbstractWriteLog::schedule_append(GenericLogOperationSharedPtr op, C_BlockIORequestT *req) +{ + GenericLogOperations to_append { op }; + + schedule_append_ops(to_append, req); +} + +/* + * Complete a set of write ops with the result of append_op_entries. 
+ */ +template +void AbstractWriteLog::complete_op_log_entries(GenericLogOperations &&ops, + const int result) +{ + GenericLogEntries dirty_entries; + int published_reserves = 0; + ldout(m_image_ctx.cct, 20) << __func__ << ": completing" << dendl; + for (auto &op : ops) { + utime_t now = ceph_clock_now(); + auto log_entry = op->get_log_entry(); + log_entry->completed = true; + if (op->is_writing_op()) { + op->mark_log_entry_completed(); + dirty_entries.push_back(log_entry); + } + if (log_entry->is_write_entry()) { + release_ram(log_entry); + } + if (op->reserved_allocated()) { + published_reserves++; + } + { + std::lock_guard locker(m_lock); + m_unpublished_reserves -= published_reserves; + m_dirty_log_entries.splice(m_dirty_log_entries.end(), dirty_entries); + } + op->complete(result); + m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_app_t, + op->log_append_start_time - op->dispatch_time); + m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_cmp_t, now - op->dispatch_time); + m_perfcounter->hinc(l_librbd_pwl_log_op_dis_to_cmp_t_hist, + utime_t(now - op->dispatch_time).to_nsec(), + log_entry->ram_entry.write_bytes); + utime_t app_lat = op->log_append_comp_time - op->log_append_start_time; + m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_appc_t, app_lat); + m_perfcounter->hinc(l_librbd_pwl_log_op_app_to_appc_t_hist, app_lat.to_nsec(), + log_entry->ram_entry.write_bytes); + m_perfcounter->tinc(l_librbd_pwl_log_op_app_to_cmp_t, now - op->log_append_start_time); + } + // New entries may be flushable + { + std::lock_guard locker(m_lock); + wake_up(); + } +} + +/** + * Dispatch as many deferred writes as possible + */ +template +void AbstractWriteLog::dispatch_deferred_writes(void) +{ + C_BlockIORequestT *front_req = nullptr; /* req still on front of deferred list */ + C_BlockIORequestT *allocated_req = nullptr; /* req that was allocated, and is now off the list */ + bool allocated = false; /* front_req allocate succeeded */ + bool cleared_dispatching_flag = false; + + /* If we can't become the dispatcher, we'll exit */ + { + std::lock_guard locker(m_lock); + if (m_dispatching_deferred_ops || + !m_deferred_ios.size()) { + return; + } + m_dispatching_deferred_ops = true; + } + + /* There are ops to dispatch, and this should be the only thread dispatching them */ + { + std::lock_guard deferred_dispatch(m_deferred_dispatch_lock); + do { + { + std::lock_guard locker(m_lock); + ceph_assert(m_dispatching_deferred_ops); + if (allocated) { + /* On the 2..n-1 th time we get lock, front_req->alloc_resources() will + * have succeeded, and we'll need to pop it off the deferred ops list + * here. */ + ceph_assert(front_req); + ceph_assert(!allocated_req); + m_deferred_ios.pop_front(); + allocated_req = front_req; + front_req = nullptr; + allocated = false; + } + ceph_assert(!allocated); + if (!allocated && front_req) { + /* front_req->alloc_resources() failed on the last iteration. + * We'll stop dispatching. 
*/ + wake_up(); + front_req = nullptr; + ceph_assert(!cleared_dispatching_flag); + m_dispatching_deferred_ops = false; + cleared_dispatching_flag = true; + } else { + ceph_assert(!front_req); + if (m_deferred_ios.size()) { + /* New allocation candidate */ + front_req = m_deferred_ios.front(); + } else { + ceph_assert(!cleared_dispatching_flag); + m_dispatching_deferred_ops = false; + cleared_dispatching_flag = true; + } + } + } + /* Try allocating for front_req before we decide what to do with allocated_req + * (if any) */ + if (front_req) { + ceph_assert(!cleared_dispatching_flag); + allocated = front_req->alloc_resources(); + } + if (allocated_req && front_req && allocated) { + /* Push dispatch of the first allocated req to a wq */ + m_work_queue.queue(new LambdaContext( + [allocated_req](int r) { + allocated_req->dispatch(); + }), 0); + allocated_req = nullptr; + } + ceph_assert(!(allocated_req && front_req && allocated)); + + /* Continue while we're still considering the front of the deferred ops list */ + } while (front_req); + ceph_assert(!allocated); + } + ceph_assert(cleared_dispatching_flag); + + /* If any deferred requests were allocated, the last one will still be in allocated_req */ + if (allocated_req) { + allocated_req->dispatch(); + } +} + +/** + * Returns the lanes used by this write, and attempts to dispatch the next + * deferred write + */ +template +void AbstractWriteLog::release_write_lanes(C_BlockIORequestT *req) +{ + { + std::lock_guard locker(m_lock); + m_free_lanes += req->image_extents.size(); + } + dispatch_deferred_writes(); +} + +/** + * Attempts to allocate log resources for a write. Write is dispatched if + * resources are available, or queued if they aren't. + */ +template +void AbstractWriteLog::alloc_and_dispatch_io_req(C_BlockIORequestT *req) +{ + bool dispatch_here = false; + + { + /* If there are already deferred writes, queue behind them for resources */ + { + std::lock_guard locker(m_lock); + dispatch_here = m_deferred_ios.empty(); + // Only flush req's total_bytes is the max uint64 + if (req->image_extents_summary.total_bytes == + std::numeric_limits::max() && + static_cast(req)->internal == true) { + dispatch_here = true; + } + } + if (dispatch_here) { + dispatch_here = req->alloc_resources(); + } + if (dispatch_here) { + ldout(m_image_ctx.cct, 20) << "dispatching" << dendl; + req->dispatch(); + } else { + req->deferred(); + { + std::lock_guard locker(m_lock); + m_deferred_ios.push_back(req); + } + ldout(m_image_ctx.cct, 20) << "deferred IOs: " << m_deferred_ios.size() << dendl; + dispatch_deferred_writes(); + } + } +} + +template +bool AbstractWriteLog::check_allocation( + C_BlockIORequestT *req, uint64_t bytes_cached, uint64_t bytes_dirtied, + uint64_t bytes_allocated, uint32_t num_lanes, uint32_t num_log_entries, + uint32_t num_unpublished_reserves) { + bool alloc_succeeds = true; + bool no_space = false; + { + std::lock_guard locker(m_lock); + if (m_free_lanes < num_lanes) { + ldout(m_image_ctx.cct, 20) << "not enough free lanes (need " + << num_lanes + << ", have " << m_free_lanes << ") " + << *req << dendl; + alloc_succeeds = false; + /* This isn't considered a "no space" alloc fail. Lanes are a throttling mechanism. 
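Lane accounting can be sketched as a plain counting throttle (a hypothetical class; the RWL folds this bookkeeping into check_allocation() under m_lock rather than using a separate object):

#include <cstdint>
#include <mutex>

class LaneThrottle {
 public:
  explicit LaneThrottle(uint32_t lanes) : m_free_lanes(lanes) {}
  bool try_acquire(uint32_t n) {
    std::lock_guard<std::mutex> l(m_lock);
    if (m_free_lanes < n) {
      return false;       // defer the request; retry after a release
    }
    m_free_lanes -= n;
    return true;
  }
  void release(uint32_t n) {
    std::lock_guard<std::mutex> l(m_lock);
    m_free_lanes += n;    // a deferred request can be re-dispatched now
  }
 private:
  std::mutex m_lock;
  uint32_t m_free_lanes;
};
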
*/ + } + if (m_free_log_entries < num_log_entries) { + ldout(m_image_ctx.cct, 20) << "not enough free entries (need " + << num_log_entries + << ", have " << m_free_log_entries << ") " + << *req << dendl; + alloc_succeeds = false; + no_space = true; /* Entries must be retired */ + } + /* Don't attempt buffer allocate if we've exceeded the "full" threshold */ + if (m_bytes_allocated + bytes_allocated > m_bytes_allocated_cap) { + ldout(m_image_ctx.cct, 20) << "Waiting for allocation cap (cap=" + << m_bytes_allocated_cap + << ", allocated=" << m_bytes_allocated + << ") in write [" << *req << "]" << dendl; + alloc_succeeds = false; + no_space = true; /* Entries must be retired */ + } + } + + if (alloc_succeeds) { + reserve_cache(req, alloc_succeeds, no_space); + } + + if (alloc_succeeds) { + std::unique_lock locker(m_lock); + /* We need one free log entry per extent (each is a separate entry), and + * one free "lane" for remote replication. */ + if ((m_free_lanes >= num_lanes) && + (m_free_log_entries >= num_log_entries) && + (m_bytes_allocated_cap >= m_bytes_allocated + bytes_allocated)) { + m_free_lanes -= num_lanes; + m_free_log_entries -= num_log_entries; + m_unpublished_reserves += num_unpublished_reserves; + m_bytes_allocated += bytes_allocated; + m_bytes_cached += bytes_cached; + m_bytes_dirty += bytes_dirtied; + if (m_cache_state->clean && bytes_dirtied > 0) { + m_cache_state->clean = false; + update_image_cache_state(); + write_image_cache_state(locker); + } + } else { + alloc_succeeds = false; + } + } + + if (!alloc_succeeds && no_space) { + /* Expedite flushing and/or retiring */ + std::lock_guard locker(m_lock); + m_alloc_failed_since_retire = true; + m_last_alloc_fail = ceph_clock_now(); + } + + return alloc_succeeds; +} + +template +C_FlushRequest>* AbstractWriteLog::make_flush_req(Context *on_finish) { + utime_t flush_begins = ceph_clock_now(); + bufferlist bl; + auto *flush_req = + new C_FlushRequestT(*this, flush_begins, Extents({whole_volume_extent()}), + std::move(bl), 0, m_lock, m_perfcounter, on_finish); + + return flush_req; +} + +template +void AbstractWriteLog::wake_up() { + CephContext *cct = m_image_ctx.cct; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + if (!m_wake_up_enabled) { + // wake_up is disabled during shutdown after flushing completes + ldout(m_image_ctx.cct, 6) << "deferred processing disabled" << dendl; + return; + } + + if (m_wake_up_requested && m_wake_up_scheduled) { + return; + } + + ldout(cct, 20) << dendl; + + /* Wake-up can be requested while it's already scheduled */ + m_wake_up_requested = true; + + /* Wake-up cannot be scheduled if it's already scheduled */ + if (m_wake_up_scheduled) { + return; + } + m_wake_up_scheduled = true; + m_async_process_work++; + m_async_op_tracker.start_op(); + m_work_queue.queue(new LambdaContext( + [this](int r) { + process_work(); + m_async_op_tracker.finish_op(); + m_async_process_work--; + }), 0); +} + +template +bool AbstractWriteLog::can_flush_entry(std::shared_ptr log_entry) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << "" << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + if (m_invalidating) { + return true; + } + + /* For OWB we can flush entries with the same sync gen number (write between + * aio_flush() calls) concurrently. Here we'll consider an entry flushable if + * its sync gen number is <= the lowest sync gen number carried by all the + * entries currently flushing. 
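Condensed into a predicate, the rule just stated looks like this (a hypothetical helper; the real check additionally enforces the in-flight op and byte limits):

#include <cstdint>

// With flushes in flight, only entries whose sync gen number does not
// exceed the lowest sync gen currently flushing may start writeback.
bool flushable_now(uint64_t entry_sync_gen,
                   uint32_t flush_ops_in_flight,
                   uint64_t lowest_flushing_sync_gen) {
  if (flush_ops_in_flight == 0) {
    return true;  // nothing flushing; any dirty entry may start
  }
  return entry_sync_gen <= lowest_flushing_sync_gen;
}
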
+ * + * If the entry considered here bears a sync gen number lower than a + * previously flushed entry, the application had to have submitted the write + * bearing the higher gen number before the write with the lower gen number + * completed. So, flushing these concurrently is OK. + * + * If the entry considered here bears a sync gen number higher than a + * currently flushing entry, the write with the lower gen number may have + * completed to the application before the write with the higher sync gen + * number was submitted, and the application may rely on that completion + * order for volume consistency. In this case the entry will not be + * considered flushable until all the entries bearing lower sync gen numbers + * finish flushing. + */ + + if (m_flush_ops_in_flight && + (log_entry->ram_entry.sync_gen_number > m_lowest_flushing_sync_gen)) { + return false; + } + + return (log_entry->can_writeback() && + (m_flush_ops_in_flight <= IN_FLIGHT_FLUSH_WRITE_LIMIT) && + (m_flush_bytes_in_flight <= IN_FLIGHT_FLUSH_BYTES_LIMIT)); +} + +template +void AbstractWriteLog::detain_flush_guard_request(std::shared_ptr log_entry, + GuardedRequestFunctionContext *guarded_ctx) { + ldout(m_image_ctx.cct, 20) << dendl; + + BlockExtent extent; + if (log_entry->is_sync_point()) { + extent = block_extent(whole_volume_extent()); + } else { + extent = log_entry->ram_entry.block_extent(); + } + + auto req = GuardedRequest(extent, guarded_ctx, false); + BlockGuardCell *cell = nullptr; + + { + std::lock_guard locker(m_flush_guard_lock); + m_flush_guard.detain(req.block_extent, &req, &cell); + } + if (cell) { + req.guard_ctx->cell = cell; + m_image_ctx.op_work_queue->queue(req.guard_ctx, 0); + } +} + +template +Context* AbstractWriteLog::construct_flush_entry(std::shared_ptr log_entry, + bool invalidating) { + ldout(m_image_ctx.cct, 20) << "" << dendl; + + /* Flush write completion action */ + utime_t writeback_start_time = ceph_clock_now(); + Context *ctx = new LambdaContext( + [this, log_entry, writeback_start_time, invalidating](int r) { + utime_t writeback_comp_time = ceph_clock_now(); + m_perfcounter->tinc(l_librbd_pwl_writeback_latency, + writeback_comp_time - writeback_start_time); + { + std::lock_guard locker(m_lock); + if (r < 0) { + lderr(m_image_ctx.cct) << "failed to flush log entry" + << cpp_strerror(r) << dendl; + m_dirty_log_entries.push_front(log_entry); + } else { + ceph_assert(m_bytes_dirty >= log_entry->bytes_dirty()); + log_entry->set_flushed(true); + m_bytes_dirty -= log_entry->bytes_dirty(); + sync_point_writer_flushed(log_entry->get_sync_point_entry()); + ldout(m_image_ctx.cct, 20) << "flushed: " << log_entry + << " invalidating=" << invalidating + << dendl; + } + m_flush_ops_in_flight -= 1; + m_flush_bytes_in_flight -= log_entry->ram_entry.write_bytes; + wake_up(); + } + }); + /* Flush through lower cache before completing */ + ctx = new LambdaContext( + [this, ctx, log_entry](int r) { + { + + WriteLogGuard::BlockOperations block_reqs; + BlockGuardCell *detained_cell = nullptr; + + std::lock_guard locker{m_flush_guard_lock}; + m_flush_guard.release(log_entry->m_cell, &block_reqs); + + for (auto &req : block_reqs) { + m_flush_guard.detain(req.block_extent, &req, &detained_cell); + if (detained_cell) { + req.guard_ctx->cell = detained_cell; + m_image_ctx.op_work_queue->queue(req.guard_ctx, 0); + } + } + } + + if (r < 0) { + lderr(m_image_ctx.cct) << "failed to flush log entry" + << cpp_strerror(r) << dendl; + ctx->complete(r); + } else { + 
m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, ctx); + } + }); + return ctx; +} + +template <typename I> +void AbstractWriteLog<I>::process_writeback_dirty_entries() { + CephContext *cct = m_image_ctx.cct; + bool all_clean = false; + int flushed = 0; + bool has_write_entry = false; + bool need_update_state = false; + + ldout(cct, 20) << "Look for dirty entries" << dendl; + { + DeferredContexts post_unlock; + GenericLogEntries entries_to_flush; + + std::shared_lock entry_reader_locker(m_entry_reader_lock); + std::lock_guard locker(m_lock); + while (flushed < IN_FLIGHT_FLUSH_WRITE_LIMIT) { + if (m_shutting_down) { + ldout(cct, 5) << "Flush during shutdown suppressed" << dendl; + /* Do flush complete only when all flush ops are finished */ + all_clean = !m_flush_ops_in_flight; + break; + } + if (m_dirty_log_entries.empty()) { + ldout(cct, 20) << "Nothing new to flush" << dendl; + /* Do flush complete only when all flush ops are finished */ + all_clean = !m_flush_ops_in_flight; + if (!m_cache_state->clean && all_clean) { + m_cache_state->clean = true; + update_image_cache_state(); + need_update_state = true; + } + break; + } + + auto candidate = m_dirty_log_entries.front(); + bool flushable = can_flush_entry(candidate); + if (flushable) { + entries_to_flush.push_back(candidate); + flushed++; + if (!has_write_entry) + has_write_entry = candidate->is_write_entry(); + m_dirty_log_entries.pop_front(); + + // Account for this candidate in the in-flight flush stats here + { + if (!m_flush_ops_in_flight || + (candidate->ram_entry.sync_gen_number < m_lowest_flushing_sync_gen)) { + m_lowest_flushing_sync_gen = candidate->ram_entry.sync_gen_number; + } + m_flush_ops_in_flight += 1; + /* For write same this is the bytes affected by the flush op, not the bytes transferred */ + m_flush_bytes_in_flight += candidate->ram_entry.write_bytes; + } + } else { + ldout(cct, 20) << "Next dirty entry isn't flushable yet" << dendl; + break; + } + } + + construct_flush_entries(entries_to_flush, post_unlock, has_write_entry); + } + if (need_update_state) { + std::unique_lock locker(m_lock); + write_image_cache_state(locker); + } + + if (all_clean) { + /* All flushing complete, drain outside lock */ + Contexts flush_contexts; + { + std::lock_guard locker(m_lock); + flush_contexts.swap(m_flush_complete_contexts); + } + finish_contexts(m_image_ctx.cct, flush_contexts, 0); + } +} + +/* Returns true if the specified SyncPointLogEntry is considered flushed, and + * the log will be updated to reflect this. */ +template <typename I> +bool AbstractWriteLog<I>::handle_flushed_sync_point(std::shared_ptr<SyncPointLogEntry> log_entry) +{ + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(log_entry); + + if ((log_entry->writes_flushed == log_entry->writes) && + log_entry->completed && log_entry->prior_sync_point_flushed && + log_entry->next_sync_point_entry) { + ldout(m_image_ctx.cct, 20) << "All writes flushed up to sync point=" + << *log_entry << dendl; + log_entry->next_sync_point_entry->prior_sync_point_flushed = true; + /* Don't move the flushed sync gen num backwards. 
*/ + if (m_flushed_sync_gen < log_entry->ram_entry.sync_gen_number) { + m_flushed_sync_gen = log_entry->ram_entry.sync_gen_number; + } + m_async_op_tracker.start_op(); + m_work_queue.queue(new LambdaContext( + [this, next = std::move(log_entry->next_sync_point_entry)](int r) { + bool handled_by_next; + { + std::lock_guard locker(m_lock); + handled_by_next = handle_flushed_sync_point(std::move(next)); + } + if (!handled_by_next) { + persist_last_flushed_sync_gen(); + } + m_async_op_tracker.finish_op(); + })); + return true; + } + return false; +} + +template +void AbstractWriteLog::sync_point_writer_flushed(std::shared_ptr log_entry) +{ + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(log_entry); + log_entry->writes_flushed++; + + /* If this entry might be completely flushed, look closer */ + if ((log_entry->writes_flushed == log_entry->writes) && log_entry->completed) { + ldout(m_image_ctx.cct, 15) << "All writes flushed for sync point=" + << *log_entry << dendl; + handle_flushed_sync_point(log_entry); + } +} + +/* Make a new sync point and flush the previous during initialization, when there may or may + * not be a previous sync point */ +template +void AbstractWriteLog::init_flush_new_sync_point(DeferredContexts &later) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(!m_initialized); /* Don't use this after init */ + + if (!m_current_sync_point) { + /* First sync point since start */ + new_sync_point(later); + } else { + flush_new_sync_point(nullptr, later); + } +} + +/** + * Begin a new sync point + */ +template +void AbstractWriteLog::new_sync_point(DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + std::shared_ptr old_sync_point = m_current_sync_point; + std::shared_ptr new_sync_point; + ldout(cct, 20) << dendl; + + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + /* The first time this is called, if this is a newly created log, + * this makes the first sync gen number we'll use 1. On the first + * call for a re-opened log m_current_sync_gen will be the highest + * gen number from all the sync point entries found in the re-opened + * log, and this advances to the next sync gen number. */ + ++m_current_sync_gen; + + new_sync_point = std::make_shared(m_current_sync_gen, cct); + m_current_sync_point = new_sync_point; + + /* If this log has been re-opened, old_sync_point will initially be + * nullptr, but m_current_sync_gen may not be zero. */ + if (old_sync_point) { + new_sync_point->setup_earlier_sync_point(old_sync_point, m_last_op_sequence_num); + m_perfcounter->hinc(l_librbd_pwl_syncpoint_hist, + old_sync_point->log_entry->writes, + old_sync_point->log_entry->bytes); + /* This sync point will acquire no more sub-ops. 
Activation needs + * to acquire m_lock, so defer to later*/ + later.add(new LambdaContext( + [old_sync_point](int r) { + old_sync_point->prior_persisted_gather_activate(); + })); + } + + new_sync_point->prior_persisted_gather_set_finisher(); + + if (old_sync_point) { + ldout(cct,6) << "new sync point = [" << *m_current_sync_point + << "], prior = [" << *old_sync_point << "]" << dendl; + } else { + ldout(cct,6) << "first sync point = [" << *m_current_sync_point + << "]" << dendl; + } +} + +template +void AbstractWriteLog::flush_new_sync_point(C_FlushRequestT *flush_req, + DeferredContexts &later) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + if (!flush_req) { + m_async_null_flush_finish++; + m_async_op_tracker.start_op(); + Context *flush_ctx = new LambdaContext([this](int r) { + m_async_null_flush_finish--; + m_async_op_tracker.finish_op(); + }); + flush_req = make_flush_req(flush_ctx); + flush_req->internal = true; + } + + /* Add a new sync point. */ + new_sync_point(later); + std::shared_ptr to_append = m_current_sync_point->earlier_sync_point; + ceph_assert(to_append); + + /* This flush request will append/persist the (now) previous sync point */ + flush_req->to_append = to_append; + + /* When the m_sync_point_persist Gather completes this sync point can be + * appended. The only sub for this Gather is the finisher Context for + * m_prior_log_entries_persisted, which records the result of the Gather in + * the sync point, and completes. TODO: Do we still need both of these + * Gathers?*/ + Context * ctx = new LambdaContext([this, flush_req](int r) { + ldout(m_image_ctx.cct, 20) << "Flush req=" << flush_req + << " sync point =" << flush_req->to_append + << ". Ready to persist." << dendl; + alloc_and_dispatch_io_req(flush_req); + }); + to_append->persist_gather_set_finisher(ctx); + + /* The m_sync_point_persist Gather has all the subs it will ever have, and + * now has its finisher. If the sub is already complete, activation will + * complete the Gather. The finisher will acquire m_lock, so we'll activate + * this when we release m_lock.*/ + later.add(new LambdaContext([to_append](int r) { + to_append->persist_gather_activate(); + })); + + /* The flush request completes when the sync point persists */ + to_append->add_in_on_persisted_ctxs(flush_req); +} + +template +void AbstractWriteLog::flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, + DeferredContexts &later) { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + /* If there have been writes since the last sync point ... */ + if (m_current_sync_point->log_entry->writes) { + flush_new_sync_point(flush_req, later); + } else { + /* There have been no writes to the current sync point. */ + if (m_current_sync_point->earlier_sync_point) { + /* If previous sync point hasn't completed, complete this flush + * with the earlier sync point. No alloc or dispatch needed. */ + m_current_sync_point->earlier_sync_point->on_sync_point_persisted.push_back(flush_req); + } else { + /* The previous sync point has already completed and been + * appended. The current sync point has no writes, so this flush + * has nothing to wait for. This flush completes now. */ + later.add(flush_req); + } + } +} + +/* + * RWL internal flush - will actually flush the RWL. + * + * User flushes should arrive at aio_flush(), and only flush prior + * writes to all log replicas. + * + * Librbd internal flushes will arrive at flush(invalidate=false, + * discard=false), and traverse the block guard to ensure in-flight writes are + * flushed. 
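+ *
+ * Illustrative sequence for a non-invalidating internal flush (derived from
+ * the code below; not exhaustive):
+ *   internal_flush(false) -> detain_guarded_request() [barrier]
+ *     -> flush_new_sync_point_if_needed() -> flush_dirty_entries()
+ *     -> m_image_writeback.aio_flush(FLUSH_SOURCE_WRITEBACK)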
+ */
+template <typename I>
+void AbstractWriteLog<I>::flush_dirty_entries(Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+  bool all_clean;
+  bool flushing;
+  bool stop_flushing;
+
+  {
+    std::unique_lock locker(m_lock);
+    flushing = (0 != m_flush_ops_in_flight);
+    all_clean = m_dirty_log_entries.empty();
+    stop_flushing = (m_shutting_down);
+    if (!m_cache_state->clean && all_clean && !flushing) {
+      m_cache_state->clean = true;
+      update_image_cache_state();
+      write_image_cache_state(locker);
+    }
+  }
+
+  if (!flushing && (all_clean || stop_flushing)) {
+    /* Complete without holding m_lock */
+    if (all_clean) {
+      ldout(cct, 20) << "no dirty entries" << dendl;
+    } else {
+      ldout(cct, 5) << "flush during shutdown suppressed" << dendl;
+    }
+    on_finish->complete(0);
+  } else {
+    if (all_clean) {
+      ldout(cct, 5) << "flush ops still in progress" << dendl;
+    } else {
+      ldout(cct, 20) << "dirty entries remain" << dendl;
+    }
+    std::lock_guard locker(m_lock);
+    /* on_finish can't be completed yet */
+    m_flush_complete_contexts.push_back(new LambdaContext(
+      [this, on_finish](int r) {
+        flush_dirty_entries(on_finish);
+      }));
+    wake_up();
+  }
+}
+
+template <typename I>
+void AbstractWriteLog<I>::internal_flush(bool invalidate, Context *on_finish) {
+  ldout(m_image_ctx.cct, 20) << "invalidate=" << invalidate << dendl;
+
+  if (m_perfcounter) {
+    if (invalidate) {
+      m_perfcounter->inc(l_librbd_pwl_invalidate_cache, 1);
+    } else {
+      m_perfcounter->inc(l_librbd_pwl_internal_flush, 1);
+    }
+  }
+
+  /* May be called even if initialization fails */
+  if (!m_initialized) {
+    ldout(m_image_ctx.cct, 05) << "never initialized" << dendl;
+    /* Deadlock if completed here */
+    m_image_ctx.op_work_queue->queue(on_finish, 0);
+    return;
+  }
+
+  /* Flush/invalidate must pass through the block guard to ensure all layers
+   * of the cache are consistently flushed/invalidated. This ensures no
+   * in-flight write leaves some layers with valid regions, which may later
+   * produce inconsistent read results.
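+   * (E.g. a write admitted while an invalidate was in progress could
+   * otherwise survive in one layer only.)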
+   */
+  GuardedRequestFunctionContext *guarded_ctx =
+    new GuardedRequestFunctionContext(
+      [this, on_finish, invalidate](GuardedRequestFunctionContext &guard_ctx) {
+        DeferredContexts on_exit;
+        ldout(m_image_ctx.cct, 20) << "cell=" << guard_ctx.cell << dendl;
+        ceph_assert(guard_ctx.cell);
+
+        Context *ctx = new LambdaContext(
+          [this, cell=guard_ctx.cell, invalidate, on_finish](int r) {
+            std::lock_guard locker(m_lock);
+            m_invalidating = false;
+            ldout(m_image_ctx.cct, 6) << "Done flush/invalidating (invalidate="
+                                      << invalidate << ")" << dendl;
+            if (m_log_entries.size()) {
+              ldout(m_image_ctx.cct, 1) << "m_log_entries.size()="
+                                        << m_log_entries.size()
+                                        << ", front()=" << *m_log_entries.front()
+                                        << dendl;
+            }
+            if (invalidate) {
+              ceph_assert(m_log_entries.size() == 0);
+            }
+            ceph_assert(m_dirty_log_entries.size() == 0);
+            m_image_ctx.op_work_queue->queue(on_finish, r);
+            release_guarded_request(cell);
+          });
+        ctx = new LambdaContext(
+          [this, ctx, invalidate](int r) {
+            Context *next_ctx = ctx;
+            ldout(m_image_ctx.cct, 6) << "flush_dirty_entries finished" << dendl;
+            if (r < 0) {
+              /* Override on_finish status with this error */
+              next_ctx = new LambdaContext([r, ctx](int _r) {
+                ctx->complete(r);
+              });
+            }
+            if (invalidate) {
+              {
+                std::lock_guard locker(m_lock);
+                ceph_assert(m_dirty_log_entries.size() == 0);
+                ceph_assert(!m_invalidating);
+                ldout(m_image_ctx.cct, 6) << "Invalidating" << dendl;
+                m_invalidating = true;
+              }
+              /* Discards all RWL entries */
+              while (retire_entries(MAX_ALLOC_PER_TRANSACTION)) { }
+              next_ctx->complete(0);
+            } else {
+              {
+                std::lock_guard locker(m_lock);
+                ceph_assert(m_dirty_log_entries.size() == 0);
+                ceph_assert(!m_invalidating);
+              }
+              m_image_writeback.aio_flush(io::FLUSH_SOURCE_WRITEBACK, next_ctx);
+            }
+          });
+        ctx = new LambdaContext(
+          [this, ctx](int r) {
+            flush_dirty_entries(ctx);
+          });
+        std::lock_guard locker(m_lock);
+        /* Even if we're throwing everything away, we still want the last
+         * entry to be a sync point so we can cleanly resume.
+         *
+         * Also, the block guard only guarantees the replication of this op
+         * can't overlap with prior ops. It doesn't guarantee those are all
+         * completed and eligible for flush & retire, which we require here.
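+   * Appending a final sync point below gives the log a well-defined tail to
+   * resume from even when every preceding entry is being discarded.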
+ */ + auto flush_req = make_flush_req(ctx); + flush_new_sync_point_if_needed(flush_req, on_exit); + }); + detain_guarded_request(nullptr, guarded_ctx, true); +} + +template +void AbstractWriteLog::add_into_log_map(GenericWriteLogEntries &log_entries, + C_BlockIORequestT *req) { + req->copy_cache(); + m_blocks_to_log_entries.add_log_entries(log_entries); +} + +template +bool AbstractWriteLog::can_retire_entry(std::shared_ptr log_entry) { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 20) << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + ceph_assert(log_entry); + return log_entry->can_retire(); +} + +template +void AbstractWriteLog::check_image_cache_state_clean() { + ceph_assert(m_deferred_ios.empty()); + ceph_assert(m_ops_to_append.empty()); + ceph_assert(m_async_flush_ops == 0); + ceph_assert(m_async_append_ops == 0); + ceph_assert(m_dirty_log_entries.empty()); + ceph_assert(m_ops_to_flush.empty()); + ceph_assert(m_flush_ops_in_flight == 0); + ceph_assert(m_flush_bytes_in_flight == 0); + ceph_assert(m_bytes_dirty == 0); + ceph_assert(m_work_queue.empty()); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::AbstractWriteLog; diff --git a/src/librbd/cache/pwl/AbstractWriteLog.h b/src/librbd/cache/pwl/AbstractWriteLog.h new file mode 100644 index 000000000..ffe299c37 --- /dev/null +++ b/src/librbd/cache/pwl/AbstractWriteLog.h @@ -0,0 +1,410 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG +#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG + +#include "common/Timer.h" +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "common/AsyncOpTracker.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/Utils.h" +#include "librbd/BlockGuard.h" +#include "librbd/cache/Types.h" +#include "librbd/cache/pwl/LogOperation.h" +#include "librbd/cache/pwl/ReadRequest.h" +#include "librbd/cache/pwl/Request.h" +#include "librbd/cache/pwl/LogMap.h" +#include "librbd/cache/pwl/Builder.h" +#include +#include + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace plugin { template struct Api; } + +namespace cache { +namespace pwl { + +class GenericLogEntry; +class GenericWriteLogEntry; +class SyncPointLogEntry; +class WriteLogEntry; +struct WriteLogCacheEntry; + +typedef std::list> WriteLogEntries; +typedef std::list> GenericLogEntries; +typedef std::list> GenericWriteLogEntries; +typedef std::vector> GenericLogEntriesVector; + +typedef LogMapEntries WriteLogMapEntries; +typedef LogMap WriteLogMap; + +/**** Write log entries end ****/ + +typedef librbd::BlockGuard WriteLogGuard; + +class DeferredContexts; +template +class ImageCacheState; + +template +class Builder; + +template +struct C_BlockIORequest; + +template +struct C_WriteRequest; + +using GenericLogOperations = std::list; + + +template +class AbstractWriteLog { +public: + typedef io::Extent Extent; + typedef io::Extents Extents; + using This = AbstractWriteLog; + Builder *m_builder; + + AbstractWriteLog(ImageCtxT &image_ctx, + librbd::cache::pwl::ImageCacheState* cache_state, + Builder *builder, + cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api); + virtual ~AbstractWriteLog(); + AbstractWriteLog(const AbstractWriteLog&) = delete; + AbstractWriteLog &operator=(const AbstractWriteLog&) = delete; + + /// IO methods + void read( + Extents&& image_extents, ceph::bufferlist *bl, + int fadvise_flags, Context *on_finish); + void 
write( + Extents&& image_extents, ceph::bufferlist&& bl, + int fadvise_flags, + Context *on_finish); + void discard( + uint64_t offset, uint64_t length, + uint32_t discard_granularity_bytes, + Context *on_finish); + void flush( + io::FlushSource flush_source, Context *on_finish); + void writesame( + uint64_t offset, uint64_t length, + ceph::bufferlist&& bl, + int fadvise_flags, Context *on_finish); + void compare_and_write( + Extents&& image_extents, + ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl, + uint64_t *mismatch_offset,int fadvise_flags, + Context *on_finish); + + /// internal state methods + void init(Context *on_finish); + void shut_down(Context *on_finish); + void invalidate(Context *on_finish); + void flush(Context *on_finish); + + using C_WriteRequestT = pwl::C_WriteRequest; + using C_BlockIORequestT = pwl::C_BlockIORequest; + using C_FlushRequestT = pwl::C_FlushRequest; + using C_DiscardRequestT = pwl::C_DiscardRequest; + using C_WriteSameRequestT = pwl::C_WriteSameRequest; + + CephContext * get_context(); + void release_guarded_request(BlockGuardCell *cell); + void release_write_lanes(C_BlockIORequestT *req); + virtual bool alloc_resources(C_BlockIORequestT *req) = 0; + virtual void setup_schedule_append( + pwl::GenericLogOperationsVector &ops, bool do_early_flush, + C_BlockIORequestT *req) = 0; + void schedule_append(pwl::GenericLogOperationsVector &ops, C_BlockIORequestT *req = nullptr); + void schedule_append(pwl::GenericLogOperationSharedPtr op, C_BlockIORequestT *req = nullptr); + void flush_new_sync_point(C_FlushRequestT *flush_req, + pwl::DeferredContexts &later); + + std::shared_ptr get_current_sync_point() { + return m_current_sync_point; + } + bool get_persist_on_flush() { + return m_persist_on_flush; + } + void inc_last_op_sequence_num() { + m_perfcounter->inc(l_librbd_pwl_log_ops, 1); + ++m_last_op_sequence_num; + } + uint64_t get_last_op_sequence_num() { + return m_last_op_sequence_num; + } + uint64_t get_current_sync_gen() { + return m_current_sync_gen; + } + unsigned int get_free_lanes() { + return m_free_lanes; + } + uint32_t get_free_log_entries() { + return m_free_log_entries; + } + void add_into_log_map(pwl::GenericWriteLogEntries &log_entries, + C_BlockIORequestT *req); + virtual void complete_user_request(Context *&user_req, int r) = 0; + virtual void copy_bl_to_buffer( + WriteRequestResources *resources, + std::unique_ptr &op_set) {} + +private: + typedef std::list *> C_WriteRequests; + typedef std::list *> C_BlockIORequests; + + std::atomic m_initialized = {false}; + + uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */ + utime_t m_last_alloc_fail; /* Entry or buffer allocation fail seen */ + + pwl::WriteLogGuard m_write_log_guard; + + /* Starts at 0 for a new write log. Incremented on every flush. */ + uint64_t m_current_sync_gen = 0; + /* Starts at 0 on each sync gen increase. Incremented before applied + to an operation */ + uint64_t m_last_op_sequence_num = 0; + + bool m_persist_on_write_until_flush = true; + + pwl::WriteLogGuard m_flush_guard; + mutable ceph::mutex m_flush_guard_lock; + + /* Debug counters for the places m_async_op_tracker is used */ + std::atomic m_async_complete_ops = {0}; + std::atomic m_async_null_flush_finish = {0}; + std::atomic m_async_process_work = {0}; + + /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. 
*/ + mutable ceph::mutex m_deferred_dispatch_lock; + + /* Used in release/detain to make BlockGuard preserve submission order */ + mutable ceph::mutex m_blockguard_lock; + + /* Use m_blockguard_lock for the following 3 things */ + bool m_barrier_in_progress = false; + BlockGuardCell *m_barrier_cell = nullptr; + + bool m_wake_up_enabled = true; + + Contexts m_flush_complete_contexts; + + std::shared_ptr m_current_sync_point = nullptr; + bool m_persist_on_flush = false; //If false, persist each write before completion + + int m_flush_ops_in_flight = 0; + int m_flush_bytes_in_flight = 0; + uint64_t m_lowest_flushing_sync_gen = 0; + + /* Writes that have left the block guard, but are waiting for resources */ + C_BlockIORequests m_deferred_ios; + /* Throttle writes concurrently allocating & replicating */ + unsigned int m_free_lanes = pwl::MAX_CONCURRENT_WRITES; + + SafeTimer *m_timer = nullptr; /* Used with m_timer_lock */ + mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */ + Context *m_timer_ctx = nullptr; + + ThreadPool m_thread_pool; + + uint32_t m_discard_granularity_bytes; + + BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req); + BlockGuardCell* detain_guarded_request_barrier_helper( + pwl::GuardedRequest &req); + void detain_guarded_request(C_BlockIORequestT *request, + pwl::GuardedRequestFunctionContext *guarded_ctx, + bool is_barrier); + void perf_start(const std::string name); + void perf_stop(); + void log_perf(); + void periodic_stats(); + void arm_periodic_stats(); + + void pwl_init(Context *on_finish, pwl::DeferredContexts &later); + void check_image_cache_state_clean(); + + void flush_dirty_entries(Context *on_finish); + bool can_flush_entry(const std::shared_ptr log_entry); + bool handle_flushed_sync_point( + std::shared_ptr log_entry); + void sync_point_writer_flushed( + std::shared_ptr log_entry); + + void init_flush_new_sync_point(pwl::DeferredContexts &later); + void new_sync_point(pwl::DeferredContexts &later); + pwl::C_FlushRequest>* make_flush_req( + Context *on_finish); + void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req, + pwl::DeferredContexts &later); + + void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req); + void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops, + const int r); + void internal_flush(bool invalidate, Context *on_finish); + +protected: + librbd::cache::pwl::ImageCacheState* m_cache_state = nullptr; + + std::atomic m_shutting_down = {false}; + std::atomic m_invalidating = {false}; + + ImageCtxT &m_image_ctx; + + std::string m_log_pool_name; + uint64_t m_log_pool_size; + + uint32_t m_total_log_entries = 0; + uint32_t m_free_log_entries = 0; + + std::atomic m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */ + uint64_t m_bytes_cached = 0; /* Total bytes used in write buffers */ + uint64_t m_bytes_allocated_cap = 0; + + std::atomic m_alloc_failed_since_retire = {false}; + + cache::ImageWritebackInterface& m_image_writeback; + plugin::Api& m_plugin_api; + + /* + * When m_first_free_entry == m_first_valid_entry, the log is + * empty. There is always at least one free entry, which can't be + * used. 
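+   *
+   * Illustrative example with an 8-entry log: m_first_valid_entry=2 and
+   * m_first_free_entry=6 means entries 2..5 are valid and 6,7,0,1 are free.
+   * Reserving one permanently unusable slot lets a full log (first_free just
+   * behind first_valid) be distinguished from an empty one
+   * (first_free == first_valid).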
+   */
+  uint64_t m_first_free_entry = 0;  /* Entries from here to m_first_valid_entry-1 are free */
+  uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
+
+  /* All writes bearing this and all prior sync gen numbers are flushed */
+  uint64_t m_flushed_sync_gen = 0;
+
+  AsyncOpTracker m_async_op_tracker;
+  /* Debug counters for the places m_async_op_tracker is used */
+  std::atomic<int> m_async_flush_ops = {0};
+  std::atomic<int> m_async_append_ops = {0};
+
+  /* Acquire locks in order declared here */
+
+  mutable ceph::mutex m_log_retire_lock;
+  /* Hold a read lock on m_entry_reader_lock to add readers to log entry
+   * bufs. Hold a write lock to prevent readers from being added (e.g. when
+   * removing log entries from the map). No lock required to remove readers. */
+  mutable RWLock m_entry_reader_lock;
+  /* Hold m_log_append_lock while appending or retiring log entries. */
+  mutable ceph::mutex m_log_append_lock;
+  /* Used for most synchronization */
+  mutable ceph::mutex m_lock;
+
+  /* Use m_blockguard_lock for the following 3 things */
+  pwl::WriteLogGuard::BlockOperations m_awaiting_barrier;
+
+  bool m_wake_up_requested = false;
+  bool m_wake_up_scheduled = false;
+  bool m_appending = false;
+  bool m_dispatching_deferred_ops = false;
+
+  pwl::GenericLogOperations m_ops_to_flush; /* Write ops needing flush in local log */
+  pwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */
+
+  pwl::WriteLogMap m_blocks_to_log_entries;
+
+  /* New entries are at the back. Oldest at the front */
+  pwl::GenericLogEntries m_log_entries;
+  pwl::GenericLogEntries m_dirty_log_entries;
+
+  PerfCounters *m_perfcounter = nullptr;
+
+  unsigned int m_unpublished_reserves = 0;
+
+  ContextWQ m_work_queue;
+
+  void wake_up();
+
+  void update_entries(
+    std::shared_ptr<pwl::GenericLogEntry> *log_entry,
+    pwl::WriteLogCacheEntry *cache_entry,
+    std::map<uint64_t, bool> &missing_sync_points,
+    std::map<uint64_t, std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+    uint64_t entry_index);
+  void update_sync_points(
+    std::map<uint64_t, bool> &missing_sync_points,
+    std::map<uint64_t, std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
+    pwl::DeferredContexts &later);
+  virtual void inc_allocated_cached_bytes(
+    std::shared_ptr<pwl::GenericLogEntry> log_entry) = 0;
+  Context *construct_flush_entry(
+    const std::shared_ptr<pwl::GenericLogEntry> log_entry, bool invalidating);
+  void detain_flush_guard_request(std::shared_ptr<pwl::GenericLogEntry> log_entry,
+                                  GuardedRequestFunctionContext *guarded_ctx);
+  void process_writeback_dirty_entries();
+  bool can_retire_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
+
+  void dispatch_deferred_writes(void);
+  void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);
+
+  bool check_allocation(
+    C_BlockIORequestT *req, uint64_t bytes_cached, uint64_t bytes_dirtied,
+    uint64_t bytes_allocated, uint32_t num_lanes, uint32_t num_log_entries,
+    uint32_t num_unpublished_reserves);
+  void append_scheduled(
+    pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending,
+    bool isRWL=false);
+
+  virtual void process_work() = 0;
+  virtual void append_scheduled_ops(void) = 0;
+  virtual void schedule_append_ops(pwl::GenericLogOperations &ops, C_BlockIORequestT *req) = 0;
+  virtual void remove_pool_file() = 0;
+  virtual bool initialize_pool(Context *on_finish,
+                               pwl::DeferredContexts &later) = 0;
+  virtual void collect_read_extents(
+    uint64_t read_buffer_offset, LogMapEntry<pwl::GenericWriteLogEntry> map_entry,
+    std::vector<std::shared_ptr<pwl::GenericWriteLogEntry>> &log_entries_to_read,
+    std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+    Extent hit_extent, pwl::C_ReadRequest *read_ctx) = 0;
+  virtual void complete_read(
+    std::vector<std::shared_ptr<pwl::GenericWriteLogEntry>> &log_entries_to_read,
+    std::vector<bufferlist*>
&bls_to_read, Context *ctx) = 0; + virtual void write_data_to_buffer( + std::shared_ptr ws_entry, + pwl::WriteLogCacheEntry *cache_entry) {} + virtual void release_ram( + const std::shared_ptr log_entry) {} + virtual void alloc_op_log_entries(pwl::GenericLogOperations &ops) {} + virtual bool retire_entries(const unsigned long int frees_per_tx) { + return false; + } + virtual void schedule_flush_and_append( + pwl::GenericLogOperationsVector &ops) {} + virtual void persist_last_flushed_sync_gen() {} + virtual void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds, + bool &no_space) {} + virtual void construct_flush_entries(pwl::GenericLogEntries entries_to_flush, + DeferredContexts &post_unlock, + bool has_write_entry) = 0; + virtual uint64_t get_max_extent() { + return 0; + } + void update_image_cache_state(void); + void write_image_cache_state(std::unique_lock& locker); + void handle_write_image_cache_state(int r); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::AbstractWriteLog; + +#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG diff --git a/src/librbd/cache/pwl/Builder.h b/src/librbd/cache/pwl/Builder.h new file mode 100644 index 000000000..9db28ea68 --- /dev/null +++ b/src/librbd/cache/pwl/Builder.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_BUILDER_H +#define CEPH_LIBRBD_CACHE_PWL_BUILDER_H + +namespace librbd { +namespace cache { +namespace pwl { + +template +class Builder { +public: + virtual ~Builder() {} + virtual std::shared_ptr create_write_log_entry( + uint64_t image_offset_bytes, uint64_t write_bytes) = 0; + virtual std::shared_ptr create_write_log_entry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) = 0; + virtual std::shared_ptr create_writesame_log_entry( + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) = 0; + virtual std::shared_ptr create_writesame_log_entry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) = 0; + virtual C_WriteRequest *create_write_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) = 0; + virtual C_WriteSameRequest *create_writesame_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) = 0; + virtual C_WriteRequest *create_comp_and_write_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) = 0; + virtual std::shared_ptr create_write_log_operation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, CephContext *cct, + std::shared_ptr write_log_entry) = 0; + virtual std::shared_ptr create_write_log_operation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t data_len, CephContext *cct, + std::shared_ptr writesame_log_entry) = 0; + virtual std::shared_ptr create_discard_log_operation( + std::shared_ptr sync_point, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t discard_granularity_bytes, + utime_t dispatch_time, PerfCounters *perfcounter, CephContext 
*cct) = 0; + virtual C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived, + PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) = 0; + +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_BUILDER_H diff --git a/src/librbd/cache/pwl/DiscardRequest.cc b/src/librbd/cache/pwl/DiscardRequest.cc new file mode 100644 index 000000000..1b537f32d --- /dev/null +++ b/src/librbd/cache/pwl/DiscardRequest.cc @@ -0,0 +1,160 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include + +#include "common/dout.h" +#include "common/errno.h" +#include "common/hostname.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/pwl/DiscardRequest.h" + +#include "librbd/cache/pwl/ImageCacheState.h" + +#include "librbd/cache/Types.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl:DiscardRequest: " \ + << this << " " << __func__ << ": " + +namespace fs = std::filesystem; + +namespace librbd { +namespace cache { +namespace pwl { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +DiscardRequest* DiscardRequest::create( + I &image_ctx, + plugin::Api& plugin_api, + Context *on_finish) { + return new DiscardRequest(image_ctx, plugin_api, on_finish); +} + +template +DiscardRequest::DiscardRequest( + I &image_ctx, + plugin::Api& plugin_api, + Context *on_finish) + : m_image_ctx(image_ctx), + m_plugin_api(plugin_api), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template +void DiscardRequest::send() { + delete_image_cache_file(); +} + +template +void DiscardRequest::delete_image_cache_file() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + m_cache_state = ImageCacheState::get_image_cache_state(&m_image_ctx, m_plugin_api); + if (!m_cache_state) { + remove_feature_bit(); + return; + } + if (m_cache_state->present && + !m_cache_state->host.compare(ceph_get_short_hostname()) && + fs::exists(m_cache_state->path)) { + std::error_code ec; + fs::remove(m_cache_state->path, ec); + if (ec) { + lderr(cct) << "failed to remove persistent cache file: " << ec.message() + << dendl; + // not fatal + } + } + + remove_image_cache_state(); +} + +template +void DiscardRequest::remove_image_cache_state() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = DiscardRequest; + Context *ctx = create_context_callback( + this); + + m_cache_state->clear_image_cache_state(ctx); +} + +template +void DiscardRequest::handle_remove_image_cache_state(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r) + << dendl; + save_result(r); + finish(); + return; + } + + remove_feature_bit(); +} + +template +void DiscardRequest::remove_feature_bit() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE; + uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE; + ldout(cct, 10) << "old_features=" << m_image_ctx.features + << ", new_features=" << new_features + << ", features_mask=" << features_mask + << dendl; + + int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, 
m_image_ctx.header_oid, + new_features, features_mask); + m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE; + using klass = DiscardRequest; + Context *ctx = create_context_callback( + this); + ctx->complete(r); +} + +template +void DiscardRequest::handle_remove_feature_bit(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r) + << dendl; + save_result(r); + } + finish(); +} + +template +void DiscardRequest::finish() { + if (m_cache_state) { + delete m_cache_state; + m_cache_state = nullptr; + } + + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::DiscardRequest; diff --git a/src/librbd/cache/pwl/DiscardRequest.h b/src/librbd/cache/pwl/DiscardRequest.h new file mode 100644 index 000000000..c896369fe --- /dev/null +++ b/src/librbd/cache/pwl/DiscardRequest.h @@ -0,0 +1,90 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_SHUTDOWN_REQUEST_H +#define CEPH_LIBRBD_CACHE_PWL_SHUTDOWN_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; +namespace plugin { template struct Api; } + +namespace cache { + +namespace pwl { + +template +class ImageCacheState; + +template +class DiscardRequest { +public: + static DiscardRequest* create( + ImageCtxT &image_ctx, + plugin::Api& plugin_api, + Context *on_finish); + + void send(); + +private: + + /** + * @verbatim + * + * Shutdown request goes through the following state machine: + * + * + * | + * v + * REMOVE_IMAGE_CACHE_FILE + * | + * v + * REMOVE_IMAGE_CACHE_STATE + * | + * v + * REMOVE_IMAGE_FEATURE_BIT + * | + * v + * + * + * @endverbatim + */ + + DiscardRequest(ImageCtxT &image_ctx, + plugin::Api& plugin_api, + Context *on_finish); + + ImageCtxT &m_image_ctx; + ImageCacheState* m_cache_state; + plugin::Api& m_plugin_api; + Context *m_on_finish; + + int m_error_result; + + void delete_image_cache_file(); + + void remove_image_cache_state(); + void handle_remove_image_cache_state(int r); + + void remove_feature_bit(); + void handle_remove_feature_bit(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } + +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::DiscardRequest; + +#endif // CEPH_LIBRBD_CACHE_PWL_SHUTDOWN_REQUEST_H diff --git a/src/librbd/cache/pwl/ImageCacheState.cc b/src/librbd/cache/pwl/ImageCacheState.cc new file mode 100644 index 000000000..ab941df0f --- /dev/null +++ b/src/librbd/cache/pwl/ImageCacheState.cc @@ -0,0 +1,196 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/Types.h" +#include "librbd/cache/Utils.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/ImageCtx.h" +#include "librbd/Operations.h" +#include "common/config_proxy.h" +#include "common/environment.h" +#include "common/hostname.h" +#include "librbd/plugin/Api.h" + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ImageCacheState: " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using namespace std; + +template +void ImageCacheState::init_from_config() { + ldout(m_image_ctx->cct, 20) << dendl; + 
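+  /* Defaults describe a cache that does not yet exist; only the cache mode
+   * is taken from the image configuration below. */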
+ present = false; + empty = true; + clean = true; + host = ""; + path = ""; + ConfigProxy &config = m_image_ctx->config; + mode = config.get_val("rbd_persistent_cache_mode"); + size = 0; +} + +template +bool ImageCacheState::init_from_metadata(json_spirit::mValue& json_root) { + ldout(m_image_ctx->cct, 20) << dendl; + + try { + auto& o = json_root.get_obj(); + present = o["present"].get_bool(); + empty = o["empty"].get_bool(); + clean = o["clean"].get_bool(); + host = o["host"].get_str(); + path = o["path"].get_str(); + mode = o["mode"].get_str(); + size = o["size"].get_uint64(); + } catch (std::runtime_error& e) { + lderr(m_image_ctx->cct) << "failed to parse cache state: " << e.what() + << dendl; + return false; + } + + return true; +} + +template +void ImageCacheState::write_image_cache_state(std::unique_lock& locker, + Context *on_finish) { + ceph_assert(ceph_mutex_is_locked_by_me(*locker.mutex())); + stats_timestamp = ceph_clock_now(); + json_spirit::mObject o; + o["present"] = present; + o["empty"] = empty; + o["clean"] = clean; + o["host"] = host; + o["path"] = path; + o["mode"] = mode; + o["size"] = size; + o["stats_timestamp"] = stats_timestamp.sec(); + o["allocated_bytes"] = allocated_bytes; + o["cached_bytes"] = cached_bytes; + o["dirty_bytes"] = dirty_bytes; + o["free_bytes"] = free_bytes; + o["hits_full"] = hits_full; + o["hits_partial"] = hits_partial; + o["misses"] = misses; + o["hit_bytes"] = hit_bytes; + o["miss_bytes"] = miss_bytes; + std::string image_state_json = json_spirit::write(o); + locker.unlock(); + + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + ldout(m_image_ctx->cct, 20) << __func__ << " Store state: " + << image_state_json << dendl; + m_plugin_api.execute_image_metadata_set(m_image_ctx, PERSISTENT_CACHE_STATE, + image_state_json, on_finish); +} + +template +void ImageCacheState::clear_image_cache_state(Context *on_finish) { + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + ldout(m_image_ctx->cct, 20) << __func__ << " Remove state: " << dendl; + m_plugin_api.execute_image_metadata_remove( + m_image_ctx, PERSISTENT_CACHE_STATE, on_finish); +} + +template +ImageCacheState* ImageCacheState::create_image_cache_state( + I* image_ctx, plugin::Api& plugin_api, int &r) { + std::string cache_state_str; + ImageCacheState* cache_state = nullptr; + + r = 0; + bool dirty_cache = plugin_api.test_image_features(image_ctx, RBD_FEATURE_DIRTY_CACHE); + if (dirty_cache) { + cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid, + PERSISTENT_CACHE_STATE, &cache_state_str); + } + + ldout(image_ctx->cct, 20) << "image_cache_state: " << cache_state_str << dendl; + + bool pwl_enabled = cache::util::is_pwl_enabled(*image_ctx); + bool cache_desired = pwl_enabled; + cache_desired &= !image_ctx->read_only; + cache_desired &= !plugin_api.test_image_features(image_ctx, RBD_FEATURE_MIGRATING); + cache_desired &= !plugin_api.test_image_features(image_ctx, RBD_FEATURE_JOURNALING); + cache_desired &= !image_ctx->old_format; + + if (!dirty_cache && !cache_desired) { + ldout(image_ctx->cct, 20) << "Do not desire to use image cache." << dendl; + } else if (dirty_cache && !cache_desired) { + lderr(image_ctx->cct) << "There's a dirty cache, but RWL cache is disabled." 
+               << dendl;
+    r = -EINVAL;
+  } else if ((!dirty_cache || cache_state_str.empty()) && cache_desired) {
+    cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
+    cache_state->init_from_config();
+  } else {
+    ceph_assert(!cache_state_str.empty());
+    json_spirit::mValue json_root;
+    if (!json_spirit::read(cache_state_str.c_str(), json_root)) {
+      lderr(image_ctx->cct) << "failed to parse cache state" << dendl;
+      r = -EINVAL;
+      return nullptr;
+    }
+    cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
+    if (!cache_state->init_from_metadata(json_root)) {
+      delete cache_state;
+      r = -EINVAL;
+      return nullptr;
+    }
+    if (!cache_state->present) {
+      cache_state->init_from_config();
+    }
+  }
+  return cache_state;
+}
+
+template <typename I>
+ImageCacheState<I>* ImageCacheState<I>::get_image_cache_state(
+    I* image_ctx, plugin::Api<I>& plugin_api) {
+  ImageCacheState<I>* cache_state = nullptr;
+  string cache_state_str;
+  cls_client::metadata_get(&image_ctx->md_ctx, image_ctx->header_oid,
+                           PERSISTENT_CACHE_STATE, &cache_state_str);
+  if (!cache_state_str.empty()) {
+    // ignore errors, best effort
+    cache_state = new ImageCacheState<I>(image_ctx, plugin_api);
+    json_spirit::mValue json_root;
+    if (!json_spirit::read(cache_state_str.c_str(), json_root)) {
+      lderr(image_ctx->cct) << "failed to parse cache state" << dendl;
+    } else {
+      cache_state->init_from_metadata(json_root);
+    }
+  }
+  return cache_state;
+}
+
+template <typename I>
+bool ImageCacheState<I>::is_valid() {
+  if (this->present &&
+      (host.compare(ceph_get_short_hostname()) != 0)) {
+    auto cleanstring = "dirty";
+    if (this->clean) {
+      cleanstring = "clean";
+    }
+    lderr(m_image_ctx->cct) << "An image cache (RWL) remains on another host "
+                            << host << " which is " << cleanstring
+                            << ". Flush/close the image there to remove the "
+                            << "image cache" << dendl;
+    return false;
+  }
+  return true;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::ImageCacheState<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/ImageCacheState.h b/src/librbd/cache/pwl/ImageCacheState.h
new file mode 100644
index 000000000..5be5f73ac
--- /dev/null
+++ b/src/librbd/cache/pwl/ImageCacheState.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+#define CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H
+
+#include "json_spirit/json_spirit.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/Types.h"
+#include <string>
+
+namespace ceph {
+  class Formatter;
+}
+
+namespace librbd {
+
+namespace plugin { template <typename> struct Api; }
+
+namespace cache {
+namespace pwl {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageCacheState {
+private:
+  ImageCtxT* m_image_ctx;
+  plugin::Api<ImageCtxT>& m_plugin_api;
+public:
+  bool present = false;
+  bool empty = true;
+  bool clean = true;
+  std::string host;
+  std::string path;
+  std::string mode;
+  uint64_t size = 0;
+  /* After reloading, the following data does not need to be read,
+   * but recalculated.
*/ + utime_t stats_timestamp; + uint64_t allocated_bytes = 0; + uint64_t cached_bytes = 0; + uint64_t dirty_bytes = 0; + uint64_t free_bytes = 0; + uint64_t hits_full = 0; + uint64_t hits_partial = 0; + uint64_t misses = 0; + uint64_t hit_bytes = 0; + uint64_t miss_bytes = 0; + + ImageCacheState(ImageCtxT* image_ctx, plugin::Api& plugin_api) + : m_image_ctx(image_ctx), m_plugin_api(plugin_api) {} + + ~ImageCacheState() {} + + ImageCacheType get_image_cache_mode() const { + if (mode == "rwl") { + return IMAGE_CACHE_TYPE_RWL; + } else if (mode == "ssd") { + return IMAGE_CACHE_TYPE_SSD; + } + return IMAGE_CACHE_TYPE_UNKNOWN; + } + + void init_from_config(); + bool init_from_metadata(json_spirit::mValue& json_root); + + void write_image_cache_state(std::unique_lock& locker, + Context *on_finish); + + void clear_image_cache_state(Context *on_finish); + + static ImageCacheState* create_image_cache_state( + ImageCtxT* image_ctx, plugin::Api& plugin_api, int &r); + + static ImageCacheState* get_image_cache_state( + ImageCtxT* image_ctx, plugin::Api& plugin_api); + + bool is_valid(); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::ImageCacheState; + +#endif // CEPH_LIBRBD_CACHE_RWL_IMAGE_CACHE_STATE_H diff --git a/src/librbd/cache/pwl/InitRequest.cc b/src/librbd/cache/pwl/InitRequest.cc new file mode 100644 index 000000000..65dac8b46 --- /dev/null +++ b/src/librbd/cache/pwl/InitRequest.cc @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/pwl/InitRequest.h" +#include "librbd/io/ImageDispatcher.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/asio/ContextWQ.h" + +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/WriteLogImageDispatch.h" +#include "librbd/cache/ImageWriteback.h" +#ifdef WITH_RBD_RWL +#include "librbd/cache/pwl/rwl/WriteLog.h" +#endif + +#ifdef WITH_RBD_SSD_CACHE +#include "librbd/cache/pwl/ssd/WriteLog.h" +#endif + +#include "librbd/cache/Utils.h" +#include "librbd/ImageCtx.h" +#include "librbd/plugin/Api.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl:InitRequest " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +InitRequest* InitRequest::create( + I &image_ctx, + cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api, + Context *on_finish) { + return new InitRequest(image_ctx, image_writeback, plugin_api, on_finish); +} + +template +InitRequest::InitRequest( + I &image_ctx, + cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api, + Context *on_finish) + : m_image_ctx(image_ctx), + m_image_writeback(image_writeback), + m_plugin_api(plugin_api), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template +void InitRequest::send() { + get_image_cache_state(); +} + +template +void InitRequest::get_image_cache_state() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + int r; + auto cache_state = ImageCacheState::create_image_cache_state( + &m_image_ctx, m_plugin_api, r); + + if (r < 0 || !cache_state) { + save_result(r); + finish(); + return; + } else if (!cache_state->is_valid()) { + delete cache_state; + cache_state = nullptr; 
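+    /* is_valid() fails when a cache for this image remains on another host;
+     * refuse to open rather than ignore possibly dirty data there. */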
+    lderr(cct) << "failed to get image cache state: " << cpp_strerror(r)
+               << dendl;
+    save_result(-ENOENT);
+    finish();
+    return;
+  }
+
+  auto mode = cache_state->get_image_cache_mode();
+  switch (mode) {
+    #ifdef WITH_RBD_RWL
+    case cache::IMAGE_CACHE_TYPE_RWL:
+      m_image_cache =
+        new librbd::cache::pwl::rwl::WriteLog<I>(m_image_ctx,
+                                                 cache_state,
+                                                 m_image_writeback,
+                                                 m_plugin_api);
+      break;
+    #endif
+    #ifdef WITH_RBD_SSD_CACHE
+    case cache::IMAGE_CACHE_TYPE_SSD:
+      m_image_cache =
+        new librbd::cache::pwl::ssd::WriteLog<I>(m_image_ctx,
+                                                 cache_state,
+                                                 m_image_writeback,
+                                                 m_plugin_api);
+      break;
+    #endif
+    default:
+      delete cache_state;
+      cache_state = nullptr;
+      save_result(-ENOENT);
+      finish();
+      return;
+  }
+
+  init_image_cache();
+}
+
+template <typename I>
+void InitRequest<I>::init_image_cache() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  using klass = InitRequest<I>;
+  Context *ctx = create_async_context_callback(m_image_ctx,
+    create_context_callback<klass, &klass::handle_init_image_cache>(this));
+  m_image_cache->init(ctx);
+}
+
+template <typename I>
+void InitRequest<I>::handle_init_image_cache(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to init image cache: " << cpp_strerror(r)
+               << dendl;
+    delete m_image_cache;
+    m_image_cache = nullptr;
+    save_result(r);
+    finish();
+    return;
+  }
+  set_feature_bit();
+}
+
+template <typename I>
+void InitRequest<I>::set_feature_bit() {
+  CephContext *cct = m_image_ctx.cct;
+
+  uint64_t new_features = m_image_ctx.features | RBD_FEATURE_DIRTY_CACHE;
+  uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE;
+  ldout(cct, 10) << "old_features=" << m_image_ctx.features
+                 << ", new_features=" << new_features
+                 << ", features_mask=" << features_mask
+                 << dendl;
+
+  int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx,
+                                           m_image_ctx.header_oid,
+                                           new_features, features_mask);
+  m_image_ctx.features |= RBD_FEATURE_DIRTY_CACHE;
+  using klass = InitRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_set_feature_bit>(
+    this);
+  ctx->complete(r);
+}
+
+template <typename I>
+void InitRequest<I>::handle_set_feature_bit(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to set feature bit: " << cpp_strerror(r)
+               << dendl;
+    save_result(r);
+
+    shutdown_image_cache();
+    return;
+  }
+
+  // Register RWL dispatch
+  auto image_dispatch = new cache::WriteLogImageDispatch<I>(
+    &m_image_ctx, m_image_cache, m_plugin_api);
+
+  m_image_ctx.io_image_dispatcher->register_dispatch(image_dispatch);
+
+  finish();
+}
+
+template <typename I>
+void InitRequest<I>::shutdown_image_cache() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  using klass = InitRequest<I>;
+  Context *ctx = create_context_callback<
+    klass, &klass::handle_shutdown_image_cache>(this);
+  m_image_cache->shut_down(ctx);
+}
+
+template <typename I>
+void InitRequest<I>::handle_shutdown_image_cache(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to close image cache: " << cpp_strerror(r)
+               << dendl;
+  }
+  delete m_image_cache;
+  m_image_cache = nullptr;
+
+  finish();
+}
+
+template <typename I>
+void InitRequest<I>::finish() {
+  m_on_finish->complete(m_error_result);
+  delete this;
+}
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+template class librbd::cache::pwl::InitRequest<librbd::ImageCtx>;
diff --git a/src/librbd/cache/pwl/InitRequest.h b/src/librbd/cache/pwl/InitRequest.h
new file mode 100644
index 000000000..56e63425e
--- /dev/null
+++ b/src/librbd/cache/pwl/InitRequest.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8;
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace io { class ImageDispatchInterface; } + +namespace plugin { template struct Api; } + +namespace cache { + +class ImageWritebackInterface; + +namespace pwl { + +template +class AbstractWriteLog; + +template +class ImageCacheState; + +template +class InitRequest { +public: + static InitRequest* create( + ImageCtxT &image_ctx, + librbd::cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api, + Context *on_finish); + + void send(); + +private: + + /** + * @verbatim + * + * Init request goes through the following state machine: + * + * + * | + * v + * GET_IMAGE_CACHE_STATE + * | + * v + * INIT_IMAGE_CACHE + * | + * v + * SET_FEATURE_BIT * * * > CLOSE_IMAGE_CACHE + * | | + * v | + * <-------------------/ + * + * @endverbatim + */ + + InitRequest(ImageCtxT &image_ctx, + librbd::cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api, + Context *on_finish); + + ImageCtxT &m_image_ctx; + librbd::cache::ImageWritebackInterface& m_image_writeback; + plugin::Api& m_plugin_api; + AbstractWriteLog *m_image_cache; + Context *m_on_finish; + + int m_error_result; + + bool is_pwl_enabled(); + + void get_image_cache_state(); + + void init_image_cache(); + void handle_init_image_cache(int r); + + void set_feature_bit(); + void handle_set_feature_bit(int r); + + void shutdown_image_cache(); + void handle_shutdown_image_cache(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::InitRequest; + +#endif // CEPH_LIBRBD_CACHE_RWL_INIT_REQUEST_H diff --git a/src/librbd/cache/pwl/LogEntry.cc b/src/librbd/cache/pwl/LogEntry.cc new file mode 100644 index 000000000..8a050eb79 --- /dev/null +++ b/src/librbd/cache/pwl/LogEntry.cc @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "LogEntry.h" +#include "librbd/cache/ImageWriteback.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::LogEntry: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +std::ostream& GenericLogEntry::format(std::ostream &os) const { + os << "ram_entry=[" << ram_entry + << "], cache_entry=" << (void*)cache_entry + << ", log_entry_index=" << log_entry_index + << ", completed=" << completed; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericLogEntry &entry) { + return entry.format(os); +} + +std::ostream& SyncPointLogEntry::format(std::ostream &os) const { + os << "(Sync Point) "; + GenericLogEntry::format(os); + os << ", writes=" << writes + << ", bytes=" << bytes + << ", writes_completed=" << writes_completed + << ", writes_flushed=" << writes_flushed + << ", prior_sync_point_flushed=" << prior_sync_point_flushed + << ", next_sync_point_entry=" << next_sync_point_entry; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const SyncPointLogEntry &entry) { + return entry.format(os); +} + +bool GenericWriteLogEntry::can_writeback() const { + return (this->completed && + (ram_entry.is_sequenced() || + (sync_point_entry && + 
sync_point_entry->completed))); +} + +std::ostream& GenericWriteLogEntry::format(std::ostream &os) const { + GenericLogEntry::format(os); + os << ", sync_point_entry=["; + if (sync_point_entry) { + os << *sync_point_entry; + } else { + os << "nullptr"; + } + os << "], referring_map_entries=" << referring_map_entries; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericWriteLogEntry &entry) { + return entry.format(os); +} + +void WriteLogEntry::init(bool has_data, + uint64_t current_sync_gen, + uint64_t last_op_sequence_num, bool persist_on_flush) { + ram_entry.set_has_data(has_data); + ram_entry.sync_gen_number = current_sync_gen; + if (persist_on_flush) { + /* Persist on flush. Sequence #0 is never used. */ + ram_entry.write_sequence_number = 0; + } else { + /* Persist on write */ + ram_entry.write_sequence_number = last_op_sequence_num; + ram_entry.set_sequenced(true); + } + ram_entry.set_sync_point(false); + ram_entry.set_discard(false); +} + +std::ostream& WriteLogEntry::format(std::ostream &os) const { + os << "(Write) "; + GenericWriteLogEntry::format(os); + os << ", cache_buffer=" << (void*)cache_buffer; + os << ", cache_bp=" << cache_bp; + os << ", bl_refs=" << bl_refs; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const WriteLogEntry &entry) { + return entry.format(os); +} + +void DiscardLogEntry::writeback( + librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) { + image_writeback.aio_discard(ram_entry.image_offset_bytes, + ram_entry.write_bytes, + m_discard_granularity_bytes, ctx); +} + +void DiscardLogEntry::init(uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num) { + ram_entry.sync_gen_number = current_sync_gen; + if (persist_on_flush) { + /* Persist on flush. Sequence #0 is never used. 
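+     * In this mode write ordering is captured by sync points rather than by
+     * per-write sequence numbers.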
*/ + ram_entry.write_sequence_number = 0; + } else { + /* Persist on write */ + ram_entry.write_sequence_number = last_op_sequence_num; + ram_entry.set_sequenced(true); + } +} + +std::ostream &DiscardLogEntry::format(std::ostream &os) const { + os << "(Discard) "; + GenericWriteLogEntry::format(os); + return os; +} + +std::ostream &operator<<(std::ostream &os, + const DiscardLogEntry &entry) { + return entry.format(os); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/LogEntry.h b/src/librbd/cache/pwl/LogEntry.h new file mode 100644 index 000000000..ecaca0b7b --- /dev/null +++ b/src/librbd/cache/pwl/LogEntry.h @@ -0,0 +1,280 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H +#define CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H + +#include "common/ceph_mutex.h" +#include "librbd/Utils.h" +#include "librbd/cache/pwl/Types.h" +#include +#include + +namespace librbd { +namespace cache { +class ImageWritebackInterface; +namespace pwl { + +class SyncPointLogEntry; +class GenericWriteLogEntry; +class WriteLogEntry; + +typedef std::list> GenericWriteLogEntries; + +class GenericLogEntry { +public: + WriteLogCacheEntry ram_entry; + WriteLogCacheEntry *cache_entry = nullptr; + uint64_t log_entry_index = 0; + bool completed = false; + BlockGuardCell* m_cell = nullptr; + GenericLogEntry(uint64_t image_offset_bytes = 0, uint64_t write_bytes = 0) + : ram_entry(image_offset_bytes, write_bytes) { + }; + virtual ~GenericLogEntry() { }; + GenericLogEntry(const GenericLogEntry&) = delete; + GenericLogEntry &operator=(const GenericLogEntry&) = delete; + virtual bool can_writeback() const { + return false; + } + virtual bool can_retire() const { + return false; + } + virtual void set_flushed(bool flushed) { + ceph_assert(false); + } + virtual unsigned int write_bytes() const { + return 0; + }; + virtual unsigned int bytes_dirty() const { + return 0; + }; + virtual std::shared_ptr get_sync_point_entry() { + return nullptr; + } + virtual void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) { + ceph_assert(false); + }; + virtual void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx, ceph::bufferlist &&bl) { + ceph_assert(false); + } + virtual bool is_write_entry() const { + return false; + } + virtual bool is_writesame_entry() const { + return false; + } + virtual bool is_sync_point() const { + return false; + } + virtual unsigned int get_aligned_data_size() const { + return 0; + } + virtual void remove_cache_bl() {} + virtual std::ostream& format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericLogEntry &entry); +}; + +class SyncPointLogEntry : public GenericLogEntry { +public: + /* Writing entries using this sync gen number */ + std::atomic writes = {0}; + /* Total bytes for all writing entries using this sync gen number */ + std::atomic bytes = {0}; + /* Writing entries using this sync gen number that have completed */ + std::atomic writes_completed = {0}; + /* Writing entries using this sync gen number that have completed flushing to the writeback interface */ + std::atomic writes_flushed = {0}; + /* All writing entries using all prior sync gen numbers have been flushed */ + std::atomic prior_sync_point_flushed = {true}; + std::shared_ptr next_sync_point_entry = nullptr; + SyncPointLogEntry(uint64_t sync_gen_number) { + ram_entry.sync_gen_number = 
sync_gen_number; + ram_entry.set_sync_point(true); + }; + ~SyncPointLogEntry() override {}; + SyncPointLogEntry(const SyncPointLogEntry&) = delete; + SyncPointLogEntry &operator=(const SyncPointLogEntry&) = delete; + bool can_retire() const override { + return this->completed; + } + bool is_sync_point() const override { + return true; + } + std::ostream& format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const SyncPointLogEntry &entry); +}; + +class GenericWriteLogEntry : public GenericLogEntry { +public: + uint32_t referring_map_entries = 0; + std::shared_ptr<SyncPointLogEntry> sync_point_entry; + GenericWriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) + : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(sync_point_entry) { } + GenericWriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes) + : GenericLogEntry(image_offset_bytes, write_bytes), sync_point_entry(nullptr) { } + ~GenericWriteLogEntry() override {}; + GenericWriteLogEntry(const GenericWriteLogEntry&) = delete; + GenericWriteLogEntry &operator=(const GenericWriteLogEntry&) = delete; + unsigned int write_bytes() const override { + /* The valid bytes in this op's data buffer. Discard and WS override. */ + return ram_entry.write_bytes; + }; + unsigned int bytes_dirty() const override { + /* The bytes in the image this op makes dirty. Discard and WS override. */ + return write_bytes(); + }; + BlockExtent block_extent() { + return ram_entry.block_extent(); + } + uint32_t get_map_ref() { + return(referring_map_entries); + } + void inc_map_ref() { referring_map_entries++; } + void dec_map_ref() { referring_map_entries--; } + bool can_writeback() const override; + std::shared_ptr<SyncPointLogEntry> get_sync_point_entry() override { + return sync_point_entry; + } + virtual void copy_cache_bl(bufferlist *out_bl) = 0; + void set_flushed(bool flushed) override { + m_flushed = flushed; + } + bool get_flushed() const { + return m_flushed; + } + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericWriteLogEntry &entry); + +private: + bool m_flushed = false; /* or invalidated */ +}; + +class WriteLogEntry : public GenericWriteLogEntry { +protected: + bool is_writesame = false; + buffer::ptr cache_bp; + buffer::list cache_bl; + std::atomic<int> bl_refs = {0}; /* The refs held on cache_bp by cache_bl */ + /* Used in WriteLogEntry::get_cache_bl() to synchronize between threads making entries readable */ + mutable ceph::mutex m_entry_bl_lock; + + virtual void init_cache_bp() {} + + virtual void init_bl(buffer::ptr &bp, buffer::list &bl) {} +public: + uint8_t *cache_buffer = nullptr; + WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) + : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes), + m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this))) + { } + WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes) + : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes), + m_entry_bl_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::WriteLogEntry::m_entry_bl_lock", this))) + { } + WriteLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) { + ram_entry.set_writesame(true); + ram_entry.ws_datalen = data_length; + is_writesame
= true; + }; + WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : WriteLogEntry(nullptr, image_offset_bytes, write_bytes) { + ram_entry.set_writesame(true); + ram_entry.ws_datalen = data_length; + is_writesame = true; + }; + ~WriteLogEntry() override {}; + WriteLogEntry(const WriteLogEntry&) = delete; + WriteLogEntry &operator=(const WriteLogEntry&) = delete; + unsigned int write_bytes() const override { + // The valid bytes in this op's data buffer. + if(is_writesame) { + return ram_entry.ws_datalen; + } + return ram_entry.write_bytes; + }; + unsigned int bytes_dirty() const override { + // The bytes in the image this op makes dirty. + return ram_entry.write_bytes; + }; + void init(bool has_data, + uint64_t current_sync_gen, uint64_t last_op_sequence_num, bool persist_on_flush); + virtual void init_cache_buffer(std::vector<WriteBufferAllocation>::iterator allocation) {} + virtual void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) {} + /* Returns a ref to a bl containing bufferptrs to the entry cache buffer */ + virtual buffer::list &get_cache_bl() = 0; + + BlockExtent block_extent(); + virtual unsigned int reader_count() const = 0; + /* Constructs a new bl containing copies of cache_bp */ + bool can_retire() const override { + return (this->completed && this->get_flushed() && (0 == reader_count())); + } + bool is_write_entry() const override { + return true; + } + bool is_writesame_entry() const override { + return is_writesame; + } + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const WriteLogEntry &entry); +}; + +class DiscardLogEntry : public GenericWriteLogEntry { +public: + DiscardLogEntry(std::shared_ptr<SyncPointLogEntry> sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t discard_granularity_bytes) + : GenericWriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes), + m_discard_granularity_bytes(discard_granularity_bytes) { + ram_entry.set_discard(true); + }; + DiscardLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes) + : GenericWriteLogEntry(nullptr, image_offset_bytes, write_bytes) { + ram_entry.set_discard(true); + }; + DiscardLogEntry(const DiscardLogEntry&) = delete; + DiscardLogEntry &operator=(const DiscardLogEntry&) = delete; + unsigned int write_bytes() const override { + /* The valid bytes in this op's data buffer. */ + return 0; + }; + unsigned int bytes_dirty() const override { + /* The bytes in the image this op makes dirty.
*/ + return ram_entry.write_bytes; + }; + bool can_retire() const override { + return this->completed; + } + void copy_cache_bl(bufferlist *out_bl) override { + ceph_assert(false); + } + void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) override; + void init(uint64_t current_sync_gen, bool persist_on_flush, uint64_t last_op_sequence_num); + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const DiscardLogEntry &entry); +private: + uint32_t m_discard_granularity_bytes; +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_LOG_ENTRY_H diff --git a/src/librbd/cache/pwl/LogMap.cc b/src/librbd/cache/pwl/LogMap.cc new file mode 100644 index 000000000..b3e7022b0 --- /dev/null +++ b/src/librbd/cache/pwl/LogMap.cc @@ -0,0 +1,278 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LogMap.h" +#include "include/ceph_assert.h" +#include "librbd/Utils.h" +#include "librbd/cache/pwl/LogEntry.h" + +namespace librbd { +namespace cache { +namespace pwl { + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::LogMap: " << this << " " \ + << __func__ << ": " +template +std::ostream &operator<<(std::ostream &os, + LogMapEntry &e) { + os << "block_extent=" << e.block_extent + << ", log_entry=[" << e.log_entry << "]"; + return os; +} + +template +LogMapEntry::LogMapEntry(const BlockExtent block_extent, + std::shared_ptr log_entry) + : block_extent(block_extent) , log_entry(log_entry) { +} + +template +LogMapEntry::LogMapEntry(std::shared_ptr log_entry) + : block_extent(log_entry->block_extent()) , log_entry(log_entry) { +} + +template +LogMap::LogMap(CephContext *cct) + : m_cct(cct), + m_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::LogMap::m_lock", this))) { +} + +/** + * Add a write log entry to the map. Subsequent queries for blocks + * within this log entry's extent will find this log entry. Portions + * of prior write log entries overlapping with this log entry will + * be replaced in the map by this log entry. + * + * The map_entries field of the log entry object will be updated to + * contain this map entry. + * + * The map_entries fields of all log entries overlapping with this + * entry will be updated to remove the regions that overlap with + * this. + */ +template +void LogMap::add_log_entry(std::shared_ptr log_entry) { + std::lock_guard locker(m_lock); + add_log_entry_locked(log_entry); +} + +template +void LogMap::add_log_entries(std::list> &log_entries) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + for (auto &log_entry : log_entries) { + add_log_entry_locked(log_entry); + } +} + +/** + * Remove any map entries that refer to the supplied write log + * entry. + */ +template +void LogMap::remove_log_entry(std::shared_ptr log_entry) { + std::lock_guard locker(m_lock); + remove_log_entry_locked(log_entry); +} + +template +void LogMap::remove_log_entries(std::list> &log_entries) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + for (auto &log_entry : log_entries) { + remove_log_entry_locked(log_entry); + } +} + +/** + * Returns the list of all write log entries that overlap the specified block + * extent. This doesn't tell you which portions of these entries overlap the + * extent, or each other. For that, use find_map_entries(). 
A log entry may + * appear in the list more than once, if multiple map entries refer to it + * (e.g. the middle of that write log entry has been overwritten). + */ +template +std::list> LogMap::find_log_entries(BlockExtent block_extent) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + return find_log_entries_locked(block_extent); +} + +/** + * Returns the list of all write log map entries that overlap the + * specified block extent. + */ +template +LogMapEntries LogMap::find_map_entries(BlockExtent block_extent) { + std::lock_guard locker(m_lock); + ldout(m_cct, 20) << dendl; + return find_map_entries_locked(block_extent); +} + +template +void LogMap::add_log_entry_locked(std::shared_ptr log_entry) { + LogMapEntry map_entry(log_entry); + ldout(m_cct, 20) << "block_extent=" << map_entry.block_extent + << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + LogMapEntries overlap_entries = find_map_entries_locked(map_entry.block_extent); + for (auto &entry : overlap_entries) { + ldout(m_cct, 20) << entry << dendl; + if (map_entry.block_extent.block_start <= entry.block_extent.block_start) { + if (map_entry.block_extent.block_end >= entry.block_extent.block_end) { + ldout(m_cct, 20) << "map entry completely occluded by new log entry" << dendl; + remove_map_entry_locked(entry); + } else { + ceph_assert(map_entry.block_extent.block_end < entry.block_extent.block_end); + /* The new entry occludes the beginning of the old entry */ + BlockExtent adjusted_extent(map_entry.block_extent.block_end, + entry.block_extent.block_end); + adjust_map_entry_locked(entry, adjusted_extent); + } + } else { + if (map_entry.block_extent.block_end >= entry.block_extent.block_end) { + /* The new entry occludes the end of the old entry */ + BlockExtent adjusted_extent(entry.block_extent.block_start, + map_entry.block_extent.block_start); + adjust_map_entry_locked(entry, adjusted_extent); + } else { + /* The new entry splits the old entry */ + split_map_entry_locked(entry, map_entry.block_extent); + } + } + } + add_map_entry_locked(map_entry); +} + +template +void LogMap::remove_log_entry_locked(std::shared_ptr log_entry) { + ldout(m_cct, 20) << "*log_entry=" << *log_entry << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + + LogMapEntries possible_hits = find_map_entries_locked(log_entry->block_extent()); + for (auto &possible_hit : possible_hits) { + if (possible_hit.log_entry == log_entry) { + /* This map entry refers to the specified log entry */ + remove_map_entry_locked(possible_hit); + } + } +} + +template +void LogMap::add_map_entry_locked(LogMapEntry &map_entry) { + ceph_assert(map_entry.log_entry); + m_block_to_log_entry_map.insert(map_entry); + map_entry.log_entry->inc_map_ref(); +} + +template +void LogMap::remove_map_entry_locked(LogMapEntry &map_entry) { + auto it = m_block_to_log_entry_map.find(map_entry); + ceph_assert(it != m_block_to_log_entry_map.end()); + + LogMapEntry erased = *it; + m_block_to_log_entry_map.erase(it); + erased.log_entry->dec_map_ref(); + if (0 == erased.log_entry->get_map_ref()) { + ldout(m_cct, 20) << "log entry has zero map entries: " << erased.log_entry << dendl; + } +} + +template +void LogMap::adjust_map_entry_locked(LogMapEntry &map_entry, BlockExtent &new_extent) { + auto it = m_block_to_log_entry_map.find(map_entry); + ceph_assert(it != m_block_to_log_entry_map.end()); + + LogMapEntry adjusted = *it; + m_block_to_log_entry_map.erase(it); + + m_block_to_log_entry_map.insert(LogMapEntry(new_extent, adjusted.log_entry)); +} + +template 
+void LogMap::split_map_entry_locked(LogMapEntry &map_entry, BlockExtent &removed_extent) { + auto it = m_block_to_log_entry_map.find(map_entry); + ceph_assert(it != m_block_to_log_entry_map.end()); + + LogMapEntry split = *it; + m_block_to_log_entry_map.erase(it); + + BlockExtent left_extent(split.block_extent.block_start, + removed_extent.block_start); + m_block_to_log_entry_map.insert(LogMapEntry(left_extent, split.log_entry)); + + BlockExtent right_extent(removed_extent.block_end, + split.block_extent.block_end); + m_block_to_log_entry_map.insert(LogMapEntry(right_extent, split.log_entry)); + + split.log_entry->inc_map_ref(); +} + +template +std::list> LogMap::find_log_entries_locked(const BlockExtent &block_extent) { + std::list> overlaps; + ldout(m_cct, 20) << "block_extent=" << block_extent << dendl; + + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + LogMapEntries map_entries = find_map_entries_locked(block_extent); + for (auto &map_entry : map_entries) { + overlaps.emplace_back(map_entry.log_entry); + } + return overlaps; +} + +/** + * TODO: Generalize this to do some arbitrary thing to each map + * extent, instead of returning a list. + */ +template +LogMapEntries LogMap::find_map_entries_locked(const BlockExtent &block_extent) { + LogMapEntries overlaps; + + ldout(m_cct, 20) << "block_extent=" << block_extent << dendl; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + auto p = m_block_to_log_entry_map.equal_range(LogMapEntry(block_extent)); + ldout(m_cct, 20) << "count=" << std::distance(p.first, p.second) << dendl; + for ( auto i = p.first; i != p.second; ++i ) { + LogMapEntry entry = *i; + overlaps.emplace_back(entry); + ldout(m_cct, 20) << entry << dendl; + } + return overlaps; +} + +/* We map block extents to write log entries, or portions of write log + * entries. These are both represented by a WriteLogMapEntry. When a + * GenericWriteLogEntry is added to this map, a WriteLogMapEntry is created to + * represent the entire block extent of the GenericWriteLogEntry, and the + * WriteLogMapEntry is added to the set. + * + * The set must not contain overlapping WriteLogMapEntrys. WriteLogMapEntrys + * in the set that overlap with one being added are adjusted (shrunk, split, + * or removed) before the new entry is added. + * + * This comparison works despite the ambiguity because we ensure the set + * contains no overlapping entries. This comparison works to find entries + * that overlap with a given block extent because equal_range() returns the + * first entry in which the extent doesn't end before the given extent + * starts, and the last entry for which the extent starts before the given + * extent ends (the first entry that the key is less than, and the last entry + * that is less than the key). 
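[Editor's note] The ordering trick this comment describes is easy to verify in isolation. Because the set never stores overlapping extents, "lhs ends at or before rhs starts" is a valid strict weak ordering, and equal_range() hands back exactly the stored extents that overlap the query. A sketch with a plain Extent standing in for LogMapEntry (assumed names, not the patch's API):

#include <cstdint>
#include <iostream>
#include <set>

struct Extent { uint64_t start, end; };  // half-open [start, end)

// Mirrors LogMapEntryCompare's comparison.
struct ExtentCompare {
  bool operator()(const Extent &lhs, const Extent &rhs) const {
    return lhs.end <= rhs.start;
  }
};

int main() {
  std::set<Extent, ExtentCompare> map{{0, 4}, {4, 8}, {12, 16}};
  // Lower bound: first extent that does not end before [3,13) starts.
  // Upper bound: first extent that starts at or after [3,13) ends.
  auto p = map.equal_range(Extent{3, 13});
  for (auto i = p.first; i != p.second; ++i)
    std::cout << '[' << i->start << ',' << i->end << ") ";
  std::cout << '\n';  // prints: [0,4) [4,8) [12,16)
}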
+ */ +template +bool LogMap::LogMapEntryCompare::operator()(const LogMapEntry &lhs, + const LogMapEntry &rhs) const { + if (lhs.block_extent.block_end <= rhs.block_extent.block_start) { + return true; + } + return false; +} + +} //namespace pwl +} //namespace cache +} //namespace librbd + +template class librbd::cache::pwl::LogMap; diff --git a/src/librbd/cache/pwl/LogMap.h b/src/librbd/cache/pwl/LogMap.h new file mode 100644 index 000000000..a05307896 --- /dev/null +++ b/src/librbd/cache/pwl/LogMap.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H +#define CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H + +#include "librbd/BlockGuard.h" +#include + +namespace librbd { +namespace cache { +namespace pwl { + +/** + * WriteLogMap: maps block extents to GenericWriteLogEntries + * + * A WriteLogMapEntry (based on LogMapEntry) refers to a portion of a GenericWriteLogEntry + */ +template +class LogMapEntry { +public: + BlockExtent block_extent; + std::shared_ptr log_entry; + + LogMapEntry(BlockExtent block_extent, + std::shared_ptr log_entry = nullptr); + LogMapEntry(std::shared_ptr log_entry); + + template + friend std::ostream &operator<<(std::ostream &os, + LogMapEntry &e); +}; + +template +using LogMapEntries = std::list>; + +template +class LogMap { +public: + LogMap(CephContext *cct); + LogMap(const LogMap&) = delete; + LogMap &operator=(const LogMap&) = delete; + + void add_log_entry(std::shared_ptr log_entry); + void add_log_entries(std::list> &log_entries); + void remove_log_entry(std::shared_ptr log_entry); + void remove_log_entries(std::list> &log_entries); + std::list> find_log_entries(BlockExtent block_extent); + LogMapEntries find_map_entries(BlockExtent block_extent); + +private: + void add_log_entry_locked(std::shared_ptr log_entry); + void remove_log_entry_locked(std::shared_ptr log_entry); + void add_map_entry_locked(LogMapEntry &map_entry); + void remove_map_entry_locked(LogMapEntry &map_entry); + void adjust_map_entry_locked(LogMapEntry &map_entry, BlockExtent &new_extent); + void split_map_entry_locked(LogMapEntry &map_entry, BlockExtent &removed_extent); + std::list> find_log_entries_locked(const BlockExtent &block_extent); + LogMapEntries find_map_entries_locked(const BlockExtent &block_extent); + + using LogMapEntryT = LogMapEntry; + + class LogMapEntryCompare { + public: + bool operator()(const LogMapEntryT &lhs, + const LogMapEntryT &rhs) const; + }; + + using BlockExtentToLogMapEntries = std::set; + + CephContext *m_cct; + ceph::mutex m_lock; + BlockExtentToLogMapEntries m_block_to_log_entry_map; +}; + +} //namespace pwl +} //namespace cache +} //namespace librbd + +#endif //CEPH_LIBRBD_CACHE_RWL_LOG_MAP_H diff --git a/src/librbd/cache/pwl/LogOperation.cc b/src/librbd/cache/pwl/LogOperation.cc new file mode 100644 index 000000000..e779802f0 --- /dev/null +++ b/src/librbd/cache/pwl/LogOperation.cc @@ -0,0 +1,312 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include "LogOperation.h" +#include "librbd/cache/pwl/Types.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::LogOperation: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +GenericLogOperation::GenericLogOperation(utime_t dispatch_time, + PerfCounters *perfcounter) + : m_perfcounter(perfcounter), dispatch_time(dispatch_time) { +} + 
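[Editor's note] GenericLogOperation carries five timestamps, and WriteLogOperation::complete() further down turns their differences into perf-counter latencies. A standalone sketch of that decomposition, with std::chrono standing in for utime_t; only the member names are taken from the patch:

#include <chrono>
#include <iostream>

struct OpTimes {  // mirrors GenericLogOperation's timestamps
  std::chrono::steady_clock::time_point dispatch_time, buf_persist_start_time,
      buf_persist_comp_time, log_append_start_time, log_append_comp_time;
};

// The intervals the perf counters track: dispatch -> buffer persist -> append.
void report(const OpTimes &t) {
  using ms = std::chrono::duration<double, std::milli>;
  std::cout << "dispatch to persist start: "
            << ms(t.buf_persist_start_time - t.dispatch_time).count() << " ms\n"
            << "buffer persist:            "
            << ms(t.buf_persist_comp_time - t.buf_persist_start_time).count() << " ms\n"
            << "persist start to append:   "
            << ms(t.log_append_start_time - t.buf_persist_start_time).count() << " ms\n"
            << "log append:                "
            << ms(t.log_append_comp_time - t.log_append_start_time).count() << " ms\n";
}

int main() {
  using namespace std::chrono;
  auto t0 = steady_clock::now();
  OpTimes t{t0, t0 + milliseconds(1), t0 + milliseconds(3),
            t0 + milliseconds(4), t0 + milliseconds(6)};
  report(t);
}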
+std::ostream& GenericLogOperation::format(std::ostream &os) const { + os << "dispatch_time=[" << dispatch_time + << "], buf_persist_start_time=[" << buf_persist_start_time + << "], buf_persist_comp_time=[" << buf_persist_comp_time + << "], log_append_start_time=[" << log_append_start_time + << "], log_append_comp_time=[" << log_append_comp_time << "]"; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericLogOperation &op) { + return op.format(os); +} + +SyncPointLogOperation::SyncPointLogOperation(ceph::mutex &lock, + std::shared_ptr sync_point, + utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct) + : GenericLogOperation(dispatch_time, perfcounter), m_cct(cct), m_lock(lock), + sync_point(sync_point) { +} + +SyncPointLogOperation::~SyncPointLogOperation() { } + +std::ostream &SyncPointLogOperation::format(std::ostream &os) const { + os << "(Sync Point) "; + GenericLogOperation::format(os); + os << ", sync_point=[" << *sync_point << "]"; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const SyncPointLogOperation &op) { + return op.format(os); +} + +std::vector SyncPointLogOperation::append_sync_point() { + std::vector appending_contexts; + std::lock_guard locker(m_lock); + if (!sync_point->appending) { + sync_point->appending = true; + } + appending_contexts.swap(sync_point->on_sync_point_appending); + return appending_contexts; +} + +void SyncPointLogOperation::clear_earlier_sync_point() { + std::lock_guard locker(m_lock); + ceph_assert(sync_point->later_sync_point); + ceph_assert(sync_point->later_sync_point->earlier_sync_point == sync_point); + sync_point->later_sync_point->earlier_sync_point = nullptr; + sync_point->later_sync_point = nullptr; +} + +std::vector SyncPointLogOperation::swap_on_sync_point_persisted() { + std::lock_guard locker(m_lock); + std::vector persisted_contexts; + persisted_contexts.swap(sync_point->on_sync_point_persisted); + return persisted_contexts; +} + +void SyncPointLogOperation::appending() { + ceph_assert(sync_point); + ldout(m_cct, 20) << "Sync point op=[" << *this + << "] appending" << dendl; + auto appending_contexts = append_sync_point(); + for (auto &ctx : appending_contexts) { + ctx->complete(0); + } +} + +void SyncPointLogOperation::complete(int result) { + ceph_assert(sync_point); + ldout(m_cct, 20) << "Sync point op =[" << *this + << "] completed" << dendl; + clear_earlier_sync_point(); + + /* Do append now in case completion occurred before the + * normal append callback executed, and to handle + * on_append work that was queued after the sync point + * entered the appending state. 
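[Editor's note] append_sync_point() and swap_on_sync_point_persisted() below share one idiom: swap the waiter list out while m_lock is held, then complete the contexts after the lock is dropped, so a callback may safely take the lock again. Reduced to its essentials, with std::function standing in for Context (an illustrative substitution):

#include <functional>
#include <iostream>
#include <mutex>
#include <vector>

std::mutex lock;
std::vector<std::function<void(int)>> waiters;

void complete_waiters(int result) {
  std::vector<std::function<void(int)>> to_run;
  {
    std::lock_guard<std::mutex> locker(lock);
    to_run.swap(waiters);  // take the list while locked
  }
  for (auto &cb : to_run)
    cb(result);            // invoked without the lock held
}

int main() {
  waiters.push_back([](int r) { std::cout << "waiter done, r=" << r << '\n'; });
  complete_waiters(0);
}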
*/ + appending(); + auto persisted_contexts = swap_on_sync_point_persisted(); + for (auto &ctx : persisted_contexts) { + ctx->complete(result); + } +} + +GenericWriteLogOperation::GenericWriteLogOperation(std::shared_ptr sync_point, + utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct) + : GenericLogOperation(dispatch_time, perfcounter), + m_lock(ceph::make_mutex(pwl::unique_lock_name( + "librbd::cache::pwl::GenericWriteLogOperation::m_lock", this))), + m_cct(cct), + sync_point(sync_point) { +} + +GenericWriteLogOperation::~GenericWriteLogOperation() { } + +std::ostream &GenericWriteLogOperation::format(std::ostream &os) const { + GenericLogOperation::format(os); + return os; +} + +std::ostream &operator<<(std::ostream &os, + const GenericWriteLogOperation &op) { + return op.format(os); +} + +/* Called when the write log operation is appending and its log position is guaranteed */ +void GenericWriteLogOperation::appending() { + Context *on_append = nullptr; + ldout(m_cct, 20) << __func__ << " " << this << dendl; + { + std::lock_guard locker(m_lock); + on_append = on_write_append; + on_write_append = nullptr; + } + if (on_append) { + ldout(m_cct, 20) << __func__ << " " << this << " on_append=" << on_append << dendl; + on_append->complete(0); + } +} + +/* Called when the write log operation is completed in all log replicas */ +void GenericWriteLogOperation::complete(int result) { + appending(); + Context *on_persist = nullptr; + ldout(m_cct, 20) << __func__ << " " << this << dendl; + { + std::lock_guard locker(m_lock); + on_persist = on_write_persist; + on_write_persist = nullptr; + } + if (on_persist) { + ldout(m_cct, 20) << __func__ << " " << this << " on_persist=" << on_persist + << dendl; + on_persist->complete(result); + } +} + +WriteLogOperation::WriteLogOperation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, CephContext *cct, + std::shared_ptr write_log_entry) + : GenericWriteLogOperation(set.sync_point, set.dispatch_time, + set.perfcounter, cct), + log_entry(write_log_entry) { + on_write_append = set.extent_ops_appending->new_sub(); + on_write_persist = set.extent_ops_persist->new_sub(); + log_entry->sync_point_entry->writes++; + log_entry->sync_point_entry->bytes += write_bytes; +} + +WriteLogOperation::WriteLogOperation(WriteLogOperationSet &set, + uint64_t image_offset_bytes, + uint64_t write_bytes, + uint32_t data_len, + CephContext *cct, + std::shared_ptr writesame_log_entry) + : WriteLogOperation(set, image_offset_bytes, write_bytes, cct, + writesame_log_entry) { + is_writesame = true; +} + +WriteLogOperation::~WriteLogOperation() { } + +void WriteLogOperation::init(bool has_data, std::vector::iterator allocation, + uint64_t current_sync_gen, + uint64_t last_op_sequence_num, + bufferlist &write_req_bl, uint64_t buffer_offset, + bool persist_on_flush) { + log_entry->init(has_data, current_sync_gen, last_op_sequence_num, + persist_on_flush); + buffer_alloc = &(*allocation); + bl.substr_of(write_req_bl, buffer_offset, log_entry->write_bytes()); + log_entry->init_cache_bl(write_req_bl, buffer_offset, + log_entry->write_bytes()); +} + +std::ostream &WriteLogOperation::format(std::ostream &os) const { + std::string op_name = is_writesame ? 
"(Write Same) " : "(Write) "; + os << op_name; + GenericWriteLogOperation::format(os); + if (log_entry) { + os << ", log_entry=[" << *log_entry << "]"; + } else { + os << ", log_entry=nullptr"; + } + os << ", bl=[" << bl << "], buffer_alloc=" << buffer_alloc; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const WriteLogOperation &op) { + return op.format(os); +} + + +void WriteLogOperation::complete(int result) { + GenericWriteLogOperation::complete(result); + m_perfcounter->tinc(l_librbd_pwl_log_op_dis_to_buf_t, + buf_persist_start_time - dispatch_time); + utime_t buf_persist_lat = buf_persist_comp_time - buf_persist_start_time; + m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_bufc_t, buf_persist_lat); + m_perfcounter->hinc(l_librbd_pwl_log_op_buf_to_bufc_t_hist, + buf_persist_lat.to_nsec(), + log_entry->ram_entry.write_bytes); + m_perfcounter->tinc(l_librbd_pwl_log_op_buf_to_app_t, + log_append_start_time - buf_persist_start_time); +} + +WriteLogOperationSet::WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, std::shared_ptr sync_point, + bool persist_on_flush, CephContext *cct, Context *on_finish) + : m_cct(cct), m_on_finish(on_finish), + persist_on_flush(persist_on_flush), + dispatch_time(dispatched), + perfcounter(perfcounter), + sync_point(sync_point) { + on_ops_appending = sync_point->prior_persisted_gather_new_sub(); + on_ops_persist = nullptr; + extent_ops_persist = + new C_Gather(m_cct, + new LambdaContext( [this](int r) { + ldout(this->m_cct,20) << __func__ << " " << this << " m_extent_ops_persist completed" << dendl; + if (on_ops_persist) { + on_ops_persist->complete(r); + } + m_on_finish->complete(r); + })); + auto appending_persist_sub = extent_ops_persist->new_sub(); + extent_ops_appending = + new C_Gather(m_cct, + new LambdaContext( [this, appending_persist_sub](int r) { + ldout(this->m_cct, 20) << __func__ << " " << this << " m_extent_ops_appending completed" << dendl; + on_ops_appending->complete(r); + appending_persist_sub->complete(r); + })); +} + +WriteLogOperationSet::~WriteLogOperationSet() { } + +std::ostream &operator<<(std::ostream &os, + const WriteLogOperationSet &s) { + os << "cell=" << (void*)s.cell + << ", extent_ops_appending=" << s.extent_ops_appending + << ", extent_ops_persist=" << s.extent_ops_persist; + return os; +} + +DiscardLogOperation::DiscardLogOperation(std::shared_ptr sync_point, + uint64_t image_offset_bytes, + uint64_t write_bytes, + uint32_t discard_granularity_bytes, + utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct) + : GenericWriteLogOperation(sync_point, dispatch_time, perfcounter, cct), + log_entry(std::make_shared(sync_point->log_entry, + image_offset_bytes, + write_bytes, + discard_granularity_bytes)) { + on_write_persist = nullptr; + log_entry->sync_point_entry->writes++; + log_entry->sync_point_entry->bytes += write_bytes; +} + +DiscardLogOperation::~DiscardLogOperation() { } + +std::ostream &DiscardLogOperation::format(std::ostream &os) const { + os << "(Discard) "; + GenericWriteLogOperation::format(os); + if (log_entry) { + os << ", log_entry=[" << *log_entry << "]"; + } else { + os << ", log_entry=nullptr"; + } + return os; +} + +std::ostream &operator<<(std::ostream &os, + const DiscardLogOperation &op) { + return op.format(os); +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/LogOperation.h b/src/librbd/cache/pwl/LogOperation.h new file mode 100644 index 000000000..15befe05f --- /dev/null +++ 
b/src/librbd/cache/pwl/LogOperation.h @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H +#define CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H + +#include "include/utime.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/cache/pwl/SyncPoint.h" + +namespace librbd { +namespace cache { +namespace pwl { + +struct WriteBufferAllocation; + +class WriteLogOperationSet; + +class WriteLogOperation; + +class GenericWriteLogOperation; + +class SyncPointLogOperation; + +class GenericLogOperation; + +template +class AbstractWriteLog; + +using GenericLogOperationSharedPtr = std::shared_ptr; + +using GenericLogOperationsVector = std::vector; + +class GenericLogOperation { +protected: + PerfCounters *m_perfcounter = nullptr; +public: + utime_t dispatch_time; // When op created + utime_t buf_persist_start_time; // When buffer persist begins + utime_t buf_persist_comp_time; // When buffer persist completes + utime_t log_append_start_time; // When log append begins + utime_t log_append_comp_time; // When log append completes + GenericLogOperation(utime_t dispatch_time, PerfCounters *perfcounter); + virtual ~GenericLogOperation() { }; + GenericLogOperation(const GenericLogOperation&) = delete; + GenericLogOperation &operator=(const GenericLogOperation&) = delete; + virtual std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericLogOperation &op); + virtual const std::shared_ptr get_log_entry() = 0; + virtual void appending() = 0; + virtual void complete(int r) = 0; + virtual void mark_log_entry_completed() {}; + virtual bool reserved_allocated() const { + return false; + } + virtual bool is_writing_op() const { + return false; + } + virtual void init_op(uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist, + Context *write_append) {}; + virtual void copy_bl_to_cache_buffer( + std::vector::iterator allocation) {}; +}; + +class SyncPointLogOperation : public GenericLogOperation { +private: + CephContext *m_cct; + ceph::mutex &m_lock; + std::vector append_sync_point(); + void clear_earlier_sync_point(); + std::vector swap_on_sync_point_persisted(); +public: + std::shared_ptr sync_point; + SyncPointLogOperation(ceph::mutex &lock, + std::shared_ptr sync_point, + utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct); + ~SyncPointLogOperation() override; + SyncPointLogOperation(const SyncPointLogOperation&) = delete; + SyncPointLogOperation &operator=(const SyncPointLogOperation&) = delete; + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const SyncPointLogOperation &op); + const std::shared_ptr get_log_entry() override { + return sync_point->log_entry; + } + void appending() override; + void complete(int r) override; +}; + +class GenericWriteLogOperation : public GenericLogOperation { +protected: + ceph::mutex m_lock; + CephContext *m_cct; +public: + std::shared_ptr sync_point; + Context *on_write_append = nullptr; /* Completion for things waiting on this + * write's position in the log to be + * guaranteed */ + Context *on_write_persist = nullptr; /* Completion for things waiting on this + * write to persist */ + GenericWriteLogOperation(std::shared_ptr sync_point, + utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct); + ~GenericWriteLogOperation() override; + GenericWriteLogOperation(const 
GenericWriteLogOperation&) = delete; + GenericWriteLogOperation &operator=(const GenericWriteLogOperation&) = delete; + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const GenericWriteLogOperation &op); + void mark_log_entry_completed() override{ + sync_point->log_entry->writes_completed++; + } + bool reserved_allocated() const override { + return true; + } + bool is_writing_op() const override { + return true; + } + void appending() override; + void complete(int r) override; +}; + +class WriteLogOperation : public GenericWriteLogOperation { +public: + using GenericWriteLogOperation::m_lock; + using GenericWriteLogOperation::sync_point; + using GenericWriteLogOperation::on_write_append; + using GenericWriteLogOperation::on_write_persist; + std::shared_ptr log_entry; + bufferlist bl; + bool is_writesame = false; + WriteBufferAllocation *buffer_alloc = nullptr; + WriteLogOperation(WriteLogOperationSet &set, + uint64_t image_offset_bytes, + uint64_t write_bytes, CephContext *cct, + std::shared_ptr write_log_entry); + WriteLogOperation(WriteLogOperationSet &set, + uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t data_len, + CephContext *cct, + std::shared_ptr writesame_log_entry); + ~WriteLogOperation() override; + WriteLogOperation(const WriteLogOperation&) = delete; + WriteLogOperation &operator=(const WriteLogOperation&) = delete; + void init(bool has_data, + std::vector::iterator allocation, + uint64_t current_sync_gen, uint64_t last_op_sequence_num, + bufferlist &write_req_bl, uint64_t buffer_offset, + bool persist_on_flush); + std::ostream &format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const WriteLogOperation &op); + const std::shared_ptr get_log_entry() override { + return log_entry; + } + + void complete(int r) override; +}; + + +class WriteLogOperationSet { +private: + CephContext *m_cct; + Context *m_on_finish; +public: + bool persist_on_flush; + BlockGuardCell *cell; + C_Gather *extent_ops_appending; + Context *on_ops_appending; + C_Gather *extent_ops_persist; + Context *on_ops_persist; + GenericLogOperationsVector operations; + utime_t dispatch_time; /* When set created */ + PerfCounters *perfcounter = nullptr; + std::shared_ptr sync_point; + WriteLogOperationSet(utime_t dispatched, PerfCounters *perfcounter, + std::shared_ptr sync_point, + const bool persist_on_flush, CephContext *cct, + Context *on_finish); + ~WriteLogOperationSet(); + WriteLogOperationSet(const WriteLogOperationSet&) = delete; + WriteLogOperationSet &operator=(const WriteLogOperationSet&) = delete; + friend std::ostream &operator<<(std::ostream &os, + const WriteLogOperationSet &s); +}; + +class DiscardLogOperation : public GenericWriteLogOperation { +public: + using GenericWriteLogOperation::m_lock; + using GenericWriteLogOperation::sync_point; + using GenericWriteLogOperation::on_write_append; + using GenericWriteLogOperation::on_write_persist; + std::shared_ptr log_entry; + DiscardLogOperation(std::shared_ptr sync_point, + uint64_t image_offset_bytes, + uint64_t write_bytes, + uint32_t discard_granularity_bytes, + utime_t dispatch_time, + PerfCounters *perfcounter, + CephContext *cct); + ~DiscardLogOperation() override; + DiscardLogOperation(const DiscardLogOperation&) = delete; + DiscardLogOperation &operator=(const DiscardLogOperation&) = delete; + const std::shared_ptr get_log_entry() override { + return log_entry; + } + bool reserved_allocated() const override { + return false; + } + std::ostream 
&format(std::ostream &os) const; + friend std::ostream &operator<<(std::ostream &os, + const DiscardLogOperation &op); +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_LOG_OPERATION_H diff --git a/src/librbd/cache/pwl/ReadRequest.h b/src/librbd/cache/pwl/ReadRequest.h new file mode 100644 index 000000000..d4b2aee5b --- /dev/null +++ b/src/librbd/cache/pwl/ReadRequest.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H + +#include "include/Context.h" +#include "librbd/cache/pwl/Types.h" + +namespace librbd { +namespace cache { +namespace pwl { + +typedef std::vector> ImageExtentBufs; + +class C_ReadRequest : public Context { +public: + io::Extents miss_extents; // move back to caller + ImageExtentBufs read_extents; + bufferlist miss_bl; + + C_ReadRequest( + CephContext *cct, utime_t arrived, PerfCounters *perfcounter, + bufferlist *out_bl, Context *on_finish) + : m_cct(cct), m_on_finish(on_finish), m_out_bl(out_bl), + m_arrived_time(arrived), m_perfcounter(perfcounter) {} + ~C_ReadRequest() {} + + const char *get_name() const { + return "C_ReadRequest"; + } + +protected: + CephContext *m_cct; + Context *m_on_finish; + bufferlist *m_out_bl; + utime_t m_arrived_time; + PerfCounters *m_perfcounter; +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_READ_REQUEST_H diff --git a/src/librbd/cache/pwl/Request.cc b/src/librbd/cache/pwl/Request.cc new file mode 100644 index 000000000..963331925 --- /dev/null +++ b/src/librbd/cache/pwl/Request.cc @@ -0,0 +1,562 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Request.h" +#include "librbd/BlockGuard.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/cache/pwl/AbstractWriteLog.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::Request: " << this << " " \ + << __func__ << ": " + +using namespace std; + +namespace librbd { +namespace cache { +namespace pwl { + +template +C_BlockIORequest::C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents, + bufferlist&& bl, const int fadvise_flags, Context *user_req) + : pwl(pwl), image_extents(std::move(extents)), + bl(std::move(bl)), fadvise_flags(fadvise_flags), + user_req(user_req), image_extents_summary(image_extents), m_arrived_time(arrived) { + ldout(pwl.get_context(), 99) << this << dendl; +} + +template +C_BlockIORequest::~C_BlockIORequest() { + ldout(pwl.get_context(), 99) << this << dendl; + ceph_assert(m_cell_released || !m_cell); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_BlockIORequest &req) { + os << "image_extents=" << req.image_extents + << ", image_extents_summary=[" << req.image_extents_summary + << "], bl=" << req.bl + << ", user_req=" << req.user_req + << ", m_user_req_completed=" << req.m_user_req_completed + << ", m_deferred=" << req.m_deferred + << ", detained=" << req.detained; + return os; +} + +template +void C_BlockIORequest::set_cell(BlockGuardCell *cell) { + ldout(pwl.get_context(), 20) << this << " cell=" << cell << dendl; + ceph_assert(cell); + ceph_assert(!m_cell); + m_cell = cell; +} + +template +BlockGuardCell *C_BlockIORequest::get_cell(void) { + ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl; + return m_cell; 
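[Editor's note] release_cell(), complete_user_request(), and finish() in the lines that follow all gate a once-only transition on compare_exchange_strong over an atomic flag: whichever caller flips the flag performs the action, and every later caller skips it. The bare idiom:

#include <atomic>
#include <iostream>

std::atomic<bool> released{false};

void release_once() {
  bool expected = false;
  // Only the first caller observes expected == false and wins the exchange.
  if (released.compare_exchange_strong(expected, true)) {
    std::cout << "releasing (first caller)\n";
  } else {
    std::cout << "already released\n";
  }
}

int main() {
  release_once();  // releasing (first caller)
  release_once();  // already released
}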
+} + +template +void C_BlockIORequest::release_cell() { + ldout(pwl.get_context(), 20) << this << " cell=" << m_cell << dendl; + ceph_assert(m_cell); + bool initial = false; + if (m_cell_released.compare_exchange_strong(initial, true)) { + pwl.release_guarded_request(m_cell); + } else { + ldout(pwl.get_context(), 5) << "cell " << m_cell << " already released for " << this << dendl; + } +} + +template +void C_BlockIORequest::complete_user_request(int r) { + bool initial = false; + if (m_user_req_completed.compare_exchange_strong(initial, true)) { + ldout(pwl.get_context(), 15) << this << " completing user req" << dendl; + m_user_req_completed_time = ceph_clock_now(); + pwl.complete_user_request(user_req, r); + } else { + ldout(pwl.get_context(), 20) << this << " user req already completed" << dendl; + } +} + +template +void C_BlockIORequest::finish(int r) { + ldout(pwl.get_context(), 20) << this << dendl; + + complete_user_request(r); + bool initial = false; + if (m_finish_called.compare_exchange_strong(initial, true)) { + ldout(pwl.get_context(), 15) << this << " finishing" << dendl; + finish_req(0); + } else { + ldout(pwl.get_context(), 20) << this << " already finished" << dendl; + ceph_assert(0); + } +} + +template +void C_BlockIORequest::deferred() { + bool initial = false; + if (m_deferred.compare_exchange_strong(initial, true)) { + deferred_handler(); + } +} + +template +C_WriteRequest::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req), + m_perfcounter(perfcounter), m_lock(lock) { + ldout(pwl.get_context(), 99) << this << dendl; +} + +template +C_WriteRequest::C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, user_req), + mismatch_offset(mismatch_offset), cmp_bl(std::move(cmp_bl)), + m_perfcounter(perfcounter), m_lock(lock) { + is_comp_and_write = true; + ldout(pwl.get_context(), 20) << dendl; +} + +template +C_WriteRequest::~C_WriteRequest() { + ldout(pwl.get_context(), 99) << this << dendl; +} + +template +std::ostream &operator<<(std::ostream &os, + const C_WriteRequest &req) { + os << (C_BlockIORequest&)req + << " m_resources.allocated=" << req.m_resources.allocated; + if (req.op_set) { + os << " op_set=[" << *req.op_set << "]"; + } + return os; +} + +template +void C_WriteRequest::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) { + ldout(pwl.get_context(), 20) << __func__ << " write_req=" << this << " cell=" << guard_ctx.cell << dendl; + + ceph_assert(guard_ctx.cell); + this->detained = guard_ctx.state.detained; /* overlapped */ + this->m_queued = guard_ctx.state.queued; /* queued behind at least one barrier */ + this->set_cell(guard_ctx.cell); +} + +template +void C_WriteRequest::finish_req(int r) { + ldout(pwl.get_context(), 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl; + + /* Completed to caller by here (in finish(), which calls this) */ + utime_t now = ceph_clock_now(); + if(is_comp_and_write && !compare_succeeded) { + update_req_stats(now); + return; + } + pwl.release_write_lanes(this); + ceph_assert(m_resources.allocated); + 
m_resources.allocated = false; + this->release_cell(); /* TODO: Consider doing this in appending state */ + update_req_stats(now); +} + +template +std::shared_ptr C_WriteRequest::create_operation( + uint64_t offset, uint64_t len) { + return pwl.m_builder->create_write_log_operation( + *op_set, offset, len, pwl.get_context(), + pwl.m_builder->create_write_log_entry(op_set->sync_point->log_entry, offset, len)); +} + +template +void C_WriteRequest::setup_log_operations(DeferredContexts &on_exit) { + GenericWriteLogEntries log_entries; + { + std::lock_guard locker(m_lock); + std::shared_ptr current_sync_point = pwl.get_current_sync_point(); + if ((!pwl.get_persist_on_flush() && current_sync_point->log_entry->writes_completed) || + (current_sync_point->log_entry->writes > MAX_WRITES_PER_SYNC_POINT) || + (current_sync_point->log_entry->bytes > MAX_BYTES_PER_SYNC_POINT)) { + /* Create new sync point and persist the previous one. This sequenced + * write will bear a sync gen number shared with no already completed + * writes. A group of sequenced writes may be safely flushed concurrently + * if they all arrived before any of them completed. We'll insert one on + * an aio_flush() from the application. Here we're inserting one to cap + * the number of bytes and writes per sync point. When the application is + * not issuing flushes, we insert sync points to record some observed + * write concurrency information that enables us to safely issue >1 flush + * write (for writes observed here to have been in flight simultaneously) + * at a time in persist-on-write mode. + */ + pwl.flush_new_sync_point(nullptr, on_exit); + current_sync_point = pwl.get_current_sync_point(); + } + uint64_t current_sync_gen = pwl.get_current_sync_gen(); + op_set = + make_unique(this->m_dispatched_time, + m_perfcounter, + current_sync_point, + pwl.get_persist_on_flush(), + pwl.get_context(), this); + ldout(pwl.get_context(), 20) << "write_req=[" << *this + << "], op_set=" << op_set.get() << dendl; + ceph_assert(m_resources.allocated); + /* op_set->operations initialized differently for plain write or write same */ + auto allocation = m_resources.buffers.begin(); + uint64_t buffer_offset = 0; + for (auto &extent : this->image_extents) { + /* operation->on_write_persist connected to m_prior_log_entries_persisted Gather */ + auto operation = this->create_operation(extent.first, extent.second); + this->op_set->operations.emplace_back(operation); + + /* A WS is also a write */ + ldout(pwl.get_context(), 20) << "write_req=[" << *this + << "], op_set=" << op_set.get() + << ", operation=" << operation << dendl; + log_entries.emplace_back(operation->log_entry); + if (!op_set->persist_on_flush) { + pwl.inc_last_op_sequence_num(); + } + operation->init(true, allocation, current_sync_gen, + pwl.get_last_op_sequence_num(), this->bl, buffer_offset, op_set->persist_on_flush); + buffer_offset += operation->log_entry->write_bytes(); + ldout(pwl.get_context(), 20) << "operation=[" << *operation << "]" << dendl; + allocation++; + } + } + /* All extent ops subs created */ + op_set->extent_ops_appending->activate(); + op_set->extent_ops_persist->activate(); + + pwl.add_into_log_map(log_entries, this); +} + +template +void C_WriteRequest::copy_cache() { + pwl.copy_bl_to_buffer(&m_resources, op_set); +} + +template +bool C_WriteRequest::append_write_request(std::shared_ptr sync_point) { + std::lock_guard locker(m_lock); + auto write_req_sp = this; + if (sync_point->earlier_sync_point) { + Context *schedule_append_ctx = new 
LambdaContext([write_req_sp](int r) { + write_req_sp->schedule_append(); + }); + sync_point->earlier_sync_point->on_sync_point_appending.push_back(schedule_append_ctx); + return true; + } + return false; +} + +template +void C_WriteRequest::schedule_append() { + ceph_assert(++m_appended == 1); + pwl.setup_schedule_append(this->op_set->operations, m_do_early_flush, this); +} + +/** + * Attempts to allocate log resources for a write. Returns true if successful. + * + * Resources include 1 lane per extent, 1 log entry per extent, and the payload + * data space for each extent. + * + * Lanes are released after the write persists via release_write_lanes() + */ +template +bool C_WriteRequest::alloc_resources() { + this->allocated_time = ceph_clock_now(); + return pwl.alloc_resources(this); +} + +/** + * Takes custody of write_req. Resources must already be allocated. + * + * Locking: + * Acquires lock + */ +template +void C_WriteRequest::dispatch() +{ + CephContext *cct = pwl.get_context(); + DeferredContexts on_exit; + utime_t now = ceph_clock_now(); + this->m_dispatched_time = now; + + ldout(cct, 15) << "write_req=" << this << " cell=" << this->get_cell() << dendl; + this->setup_log_operations(on_exit); + + bool append_deferred = false; + if (!op_set->persist_on_flush && + append_write_request(op_set->sync_point)) { + /* In persist-on-write mode, we defer the append of this write until the + * previous sync point is appending (meaning all the writes before it are + * persisted and that previous sync point can now appear in the + * log). Since we insert sync points in persist-on-write mode when writes + * have already completed to the current sync point, this limits us to + * one inserted sync point in flight at a time, and gives the next + * inserted sync point some time to accumulate a few writes if they + * arrive soon. Without this we can insert an absurd number of sync + * points, each with one or two writes. That uses a lot of log entries, + * and limits flushing to very few writes at a time. */ + m_do_early_flush = false; + append_deferred = true; + } else { + /* The prior sync point is done, so we'll schedule append here. If this is + * persist-on-write, and probably still the caller's thread, we'll use this + * caller's thread to perform the persist & replication of the payload + * buffer. 
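[Editor's note] The two dispatch() branches (defer the append behind the earlier sync point in persist-on-write mode, otherwise possibly flush early on the caller's thread, per the condition that follows) can be condensed into two predicates. A sketch with illustrative names, not the patch's API:

#include <iostream>

struct ReqState {
  bool persist_on_flush, has_earlier_sync_point;
  bool detained, queued, deferred;
};

// Persist-on-write mode waits for the earlier sync point to be appending.
bool defer_append(const ReqState &s) {
  return !s.persist_on_flush && s.has_earlier_sync_point;
}

// Early flush only when nothing forced this request off the caller's thread.
bool do_early_flush(const ReqState &s) {
  return !(s.detained || s.queued || s.deferred || s.persist_on_flush);
}

int main() {
  ReqState s{false, true, false, false, false};
  std::cout << defer_append(s) << '\n';   // 1: wait for the prior sync point
  s.has_earlier_sync_point = false;
  std::cout << do_early_flush(s) << '\n'; // 1: append on the caller's thread
}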
*/ + m_do_early_flush = + !(this->detained || this->m_queued || this->m_deferred || op_set->persist_on_flush); + } + if (!append_deferred) { + this->schedule_append(); + } +} + +template +C_FlushRequest::C_FlushRequest(T &pwl, const utime_t arrived, + io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, + ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, user_req), + m_lock(lock), m_perfcounter(perfcounter) { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +void C_FlushRequest::finish_req(int r) { + ldout(pwl.get_context(), 20) << "flush_req=" << this + << " cell=" << this->get_cell() << dendl; + /* Block guard already released */ + ceph_assert(!this->get_cell()); + + /* Completed to caller by here */ + utime_t now = ceph_clock_now(); + m_perfcounter->tinc(l_librbd_pwl_aio_flush_latency, now - this->m_arrived_time); +} + +template +bool C_FlushRequest::alloc_resources() { + ldout(pwl.get_context(), 20) << "req type=" << get_name() + << " req=[" << *this << "]" << dendl; + return pwl.alloc_resources(this); +} + +template +void C_FlushRequest::dispatch() { + utime_t now = ceph_clock_now(); + ldout(pwl.get_context(), 20) << "req type=" << get_name() + << " req=[" << *this << "]" << dendl; + ceph_assert(this->m_resources.allocated); + this->m_dispatched_time = now; + + op = std::make_shared(m_lock, + to_append, + now, + m_perfcounter, + pwl.get_context()); + + m_perfcounter->inc(l_librbd_pwl_log_ops, 1); + pwl.schedule_append(op); +} + +template +void C_FlushRequest::setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) { + *number_log_entries = 1; +} + +template +std::ostream &operator<<(std::ostream &os, + const C_FlushRequest &req) { + os << (C_BlockIORequest&)req + << " m_resources.allocated=" << req.m_resources.allocated; + return os; +} + +template +C_DiscardRequest::C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_BlockIORequest(pwl, arrived, std::move(image_extents), bufferlist(), 0, user_req), + m_discard_granularity_bytes(discard_granularity_bytes), + m_lock(lock), + m_perfcounter(perfcounter) { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +C_DiscardRequest::~C_DiscardRequest() { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +bool C_DiscardRequest::alloc_resources() { + ldout(pwl.get_context(), 20) << "req type=" << get_name() + << " req=[" << *this << "]" << dendl; + return pwl.alloc_resources(this); +} + +template +void C_DiscardRequest::setup_log_operations() { + std::lock_guard locker(m_lock); + GenericWriteLogEntries log_entries; + for (auto &extent : this->image_extents) { + op = pwl.m_builder->create_discard_log_operation( + pwl.get_current_sync_point(), extent.first, extent.second, + m_discard_granularity_bytes, this->m_dispatched_time, m_perfcounter, + pwl.get_context()); + log_entries.emplace_back(op->log_entry); + break; + } + uint64_t current_sync_gen = pwl.get_current_sync_gen(); + bool persist_on_flush = pwl.get_persist_on_flush(); + if (!persist_on_flush) { + pwl.inc_last_op_sequence_num(); + } + auto discard_req = this; + Context *on_write_append = 
pwl.get_current_sync_point()->prior_persisted_gather_new_sub(); + + Context *on_write_persist = new LambdaContext( + [this, discard_req](int r) { + ldout(pwl.get_context(), 20) << "discard_req=" << discard_req + << " cell=" << discard_req->get_cell() << dendl; + ceph_assert(discard_req->get_cell()); + discard_req->complete_user_request(r); + discard_req->release_cell(); + }); + op->init_op(current_sync_gen, persist_on_flush, pwl.get_last_op_sequence_num(), + on_write_persist, on_write_append); + pwl.add_into_log_map(log_entries, this); +} + +template +void C_DiscardRequest::dispatch() { + utime_t now = ceph_clock_now(); + ldout(pwl.get_context(), 20) << "req type=" << get_name() + << " req=[" << *this << "]" << dendl; + ceph_assert(this->m_resources.allocated); + this->m_dispatched_time = now; + setup_log_operations(); + m_perfcounter->inc(l_librbd_pwl_log_ops, 1); + pwl.schedule_append(op); +} + +template +void C_DiscardRequest::setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) { + *number_log_entries = 1; + /* No bytes are allocated for a discard, but we count the discarded bytes + * as dirty. This means it's possible to have more bytes dirty than + * there are bytes cached or allocated. */ + for (auto &extent : this->image_extents) { + *bytes_dirtied = extent.second; + break; + } +} + +template +void C_DiscardRequest::blockguard_acquired(GuardedRequestFunctionContext &guard_ctx) { + ldout(pwl.get_context(), 20) << " cell=" << guard_ctx.cell << dendl; + + ceph_assert(guard_ctx.cell); + this->detained = guard_ctx.state.detained; /* overlapped */ + this->set_cell(guard_ctx.cell); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_DiscardRequest &req) { + os << (C_BlockIORequest&)req; + if (req.op) { + os << " op=[" << *req.op << "]"; + } else { + os << " op=nullptr"; + } + return os; +} + +template +C_WriteSameRequest::C_WriteSameRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_WriteRequest(pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req) { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +C_WriteSameRequest::~C_WriteSameRequest() { + ldout(pwl.get_context(), 20) << this << dendl; +} + +template +void C_WriteSameRequest::update_req_stats(utime_t &now) { + /* Write same stats excluded from most write stats + * because the read phase will make them look like slow writes in + * those histograms. */ + ldout(pwl.get_context(), 20) << this << dendl; + utime_t comp_latency = now - this->m_arrived_time; + this->m_perfcounter->tinc(l_librbd_pwl_ws_latency, comp_latency); +} + +template +std::shared_ptr C_WriteSameRequest::create_operation( + uint64_t offset, uint64_t len) { + ceph_assert(this->image_extents.size() == 1); + WriteLogOperationSet &set = *this->op_set.get(); + return pwl.m_builder->create_write_log_operation( + *this->op_set.get(), offset, len, this->bl.length(), pwl.get_context(), + pwl.m_builder->create_writesame_log_entry(set.sync_point->log_entry, offset, + len, this->bl.length())); +} + +template +std::ostream &operator<<(std::ostream &os, + const C_WriteSameRequest &req) { + os << (C_WriteRequest&)req; + return os; +} + +template +void C_WriteRequest::update_req_stats(utime_t &now) { + /* Compare-and-write stats. 
Compare-and-write excluded from most write + * stats because the read phase will make them look like slow writes in + * those histograms. */ + if(is_comp_and_write) { + if (!compare_succeeded) { + this->m_perfcounter->inc(l_librbd_pwl_cmp_fails, 1); + } + utime_t comp_latency = now - this->m_arrived_time; + this->m_perfcounter->tinc(l_librbd_pwl_cmp_latency, comp_latency); + } +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::C_BlockIORequest >; +template class librbd::cache::pwl::C_WriteRequest >; +template class librbd::cache::pwl::C_FlushRequest >; +template class librbd::cache::pwl::C_DiscardRequest >; +template class librbd::cache::pwl::C_WriteSameRequest >; diff --git a/src/librbd/cache/pwl/Request.h b/src/librbd/cache/pwl/Request.h new file mode 100644 index 000000000..4840b049e --- /dev/null +++ b/src/librbd/cache/pwl/Request.h @@ -0,0 +1,361 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_REQUEST_H +#define CEPH_LIBRBD_CACHE_PWL_REQUEST_H + +#include "include/Context.h" +#include "librbd/cache/pwl/Types.h" +#include "librbd/cache/pwl/LogOperation.h" + +namespace librbd { +class BlockGuardCell; + +namespace cache { +namespace pwl { + +class GuardedRequestFunctionContext; + +struct WriteRequestResources { + bool allocated = false; + std::vector buffers; +}; + +/** + * A request that can be deferred in a BlockGuard to sequence + * overlapping operations. + * This is the custodian of the BlockGuard cell for this IO, and the + * state information about the progress of this IO. This object lives + * until the IO is persisted in all (live) log replicas. User request + * may be completed from here before the IO persists. 
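[Editor's note] A toy model of the block-guard custody this comment describes: a request whose extent overlaps an in-flight request is detained until the holder releases its cell. This is a simplified single-threaded stand-in with an invented Guard type, not librbd's BlockGuard API:

#include <cstdint>
#include <iostream>
#include <list>

struct Extent { uint64_t start, end; };  // half-open [start, end)

struct Guard {
  std::list<Extent> in_flight;

  // Returns true if the request may proceed now; false means detained
  // (the real guard queues it behind the overlapping holder).
  bool acquire(const Extent &e) {
    for (const auto &f : in_flight)
      if (e.start < f.end && f.start < e.end)
        return false;
    in_flight.push_back(e);
    return true;
  }
};

int main() {
  Guard g;
  std::cout << g.acquire({0, 512}) << '\n';     // 1: proceeds
  std::cout << g.acquire({256, 768}) << '\n';   // 0: detained (overlaps)
  std::cout << g.acquire({1024, 2048}) << '\n'; // 1: proceeds
}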
+ */ +template +class C_BlockIORequest : public Context { +public: + T &pwl; + io::Extents image_extents; + bufferlist bl; + int fadvise_flags; + Context *user_req; /* User write request */ + ExtentsSummary image_extents_summary; + bool detained = false; /* Detained in blockguard (overlapped with a prior IO) */ + utime_t allocated_time; /* When allocation began */ + + C_BlockIORequest(T &pwl, const utime_t arrived, io::Extents &&extents, + bufferlist&& bl, const int fadvise_flags, Context *user_req); + ~C_BlockIORequest() override; + C_BlockIORequest(const C_BlockIORequest&) = delete; + C_BlockIORequest &operator=(const C_BlockIORequest&) = delete; + + void set_cell(BlockGuardCell *cell); + BlockGuardCell *get_cell(void); + void release_cell(); + + void complete_user_request(int r); + void finish(int r); + virtual void finish_req(int r) = 0; + + virtual bool alloc_resources() = 0; + + void deferred(); + + virtual void deferred_handler() = 0; + + virtual void dispatch() = 0; + + virtual void copy_cache() {}; + + virtual const char *get_name() const { + return "C_BlockIORequest"; + } + + uint64_t get_image_extents_size() { + return image_extents.size(); + } + + std::vector& get_resources_buffers() { + return m_resources.buffers; + } + + void set_allocated(bool allocated) { + if (allocated) { + m_resources.allocated = true; + } else { + m_resources.buffers.clear(); + } + } + + virtual void setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) = 0; + +protected: + utime_t m_arrived_time; + utime_t m_dispatched_time; /* When dispatch began */ + utime_t m_user_req_completed_time; + std::atomic m_deferred = {false}; /* Deferred because this or a prior IO had to wait for write resources */ + WriteRequestResources m_resources; + +private: + std::atomic m_user_req_completed = {false}; + std::atomic m_finish_called = {false}; + std::atomic m_cell_released = {false}; + BlockGuardCell* m_cell = nullptr; + + template + friend std::ostream &operator<<(std::ostream &os, + const C_BlockIORequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this write. Block + * guard is not released until the write persists everywhere (this is + * how we guarantee to each log replica that they will never see + * overlapping writes). 
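+ *
+ * Sketch with hypothetical offsets: if write A covers [0, 8192) and an
+ * overlapping write B covers [4096, 12288), B is detained in the guard
+ * (detained = true) and its blockguard_acquired() runs only after A has
+ * persisted everywhere and released its cell.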
+ */ +template +class C_WriteRequest : public C_BlockIORequest { +public: + using C_BlockIORequest::pwl; + bool compare_succeeded = false; + uint64_t *mismatch_offset; + bufferlist cmp_bl; + bufferlist read_bl; + bool is_comp_and_write = false; + std::unique_ptr op_set = nullptr; + + C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req); + + C_WriteRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + int fadvise_flags, ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req); + + ~C_WriteRequest() override; + + void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx); + + /* Common finish to plain write and compare-and-write (if it writes) */ + void finish_req(int r) override; + + /* Compare and write will override this */ + virtual void update_req_stats(utime_t &now); + + bool alloc_resources() override; + + void deferred_handler() override { } + + void dispatch() override; + + void copy_cache() override; + + virtual std::shared_ptr create_operation(uint64_t offset, + uint64_t len); + + virtual void setup_log_operations(DeferredContexts &on_exit); + + bool append_write_request(std::shared_ptr sync_point); + + virtual void schedule_append(); + + const char *get_name() const override { + return "C_WriteRequest"; + } + +protected: + using C_BlockIORequest::m_resources; + PerfCounters *m_perfcounter = nullptr; + +private: + bool m_do_early_flush = false; + std::atomic m_appended = {0}; + bool m_queued = false; + ceph::mutex &m_lock; + template + friend std::ostream &operator<<(std::ostream &os, + const C_WriteRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this + * aio_flush. Block guard is released as soon as the new + * sync point (if required) is created. Subsequent IOs can + * proceed while this flush waits for prior IOs to complete + * and any required sync points to be persisted. + */ +template +class C_FlushRequest : public C_BlockIORequest { +public: + using C_BlockIORequest::pwl; + bool internal = false; + std::shared_ptr to_append; + + C_FlushRequest(T &pwl, const utime_t arrived, + io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, + ceph::mutex &lock, PerfCounters *perfcounter, + Context *user_req); + + ~C_FlushRequest() override {} + + bool alloc_resources() override; + + void dispatch() override; + + const char *get_name() const override { + return "C_FlushRequest"; + } + + void setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, + uint64_t *bytes_allocated, uint64_t *number_lanes, + uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) override; +private: + std::shared_ptr op; + ceph::mutex &m_lock; + PerfCounters *m_perfcounter = nullptr; + + void finish_req(int r) override; + void deferred_handler() override { + m_perfcounter->inc(l_librbd_pwl_aio_flush_def, 1); + } + + template + friend std::ostream &operator<<(std::ostream &os, + const C_FlushRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this discard. As in the + * case of write, the block guard is not released until the discard persists + * everywhere. 
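+ *
+ * Unlike a write, a discard carries no payload: it consumes one log
+ * entry and no cache buffers, and only the discarded bytes are counted
+ * as dirty; hence bytes_dirtied may exceed bytes_cached or
+ * bytes_allocated (see setup_buffer_resources()).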
+ */ +template +class C_DiscardRequest : public C_BlockIORequest { +public: + using C_BlockIORequest::pwl; + std::shared_ptr op; + + C_DiscardRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req); + + ~C_DiscardRequest() override; + void finish_req(int r) override {} + + bool alloc_resources() override; + + void deferred_handler() override { } + + void setup_log_operations(); + + void dispatch() override; + + void blockguard_acquired(GuardedRequestFunctionContext &guard_ctx); + + const char *get_name() const override { + return "C_DiscardRequest"; + } + void setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) override; +private: + uint32_t m_discard_granularity_bytes; + ceph::mutex &m_lock; + PerfCounters *m_perfcounter = nullptr; + template + friend std::ostream &operator<<(std::ostream &os, + const C_DiscardRequest &req); +}; + +/** + * This is the custodian of the BlockGuard cell for this write same. + * + * A writesame allocates and persists a data buffer like a write, but the + * data buffer is usually much shorter than the write same. + */ +template +class C_WriteSameRequest : public C_WriteRequest { +public: + using C_BlockIORequest::pwl; + C_WriteSameRequest(T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req); + + ~C_WriteSameRequest() override; + + void update_req_stats(utime_t &now) override; + + std::shared_ptr create_operation(uint64_t offset, uint64_t len) override; + + const char *get_name() const override { + return "C_WriteSameRequest"; + } + + template + friend std::ostream &operator<<(std::ostream &os, + const C_WriteSameRequest &req); +}; + +struct BlockGuardReqState { + bool barrier = false; /* This is a barrier request */ + bool current_barrier = false; /* This is the currently active barrier */ + bool detained = false; + bool queued = false; /* Queued for barrier */ + friend std::ostream &operator<<(std::ostream &os, + const BlockGuardReqState &r) { + os << "barrier=" << r.barrier + << ", current_barrier=" << r.current_barrier + << ", detained=" << r.detained + << ", queued=" << r.queued; + return os; + } +}; + +class GuardedRequestFunctionContext : public Context { +public: + BlockGuardCell *cell = nullptr; + BlockGuardReqState state; + GuardedRequestFunctionContext(boost::function &&callback) + : m_callback(std::move(callback)){ } + ~GuardedRequestFunctionContext(void) override { }; + GuardedRequestFunctionContext(const GuardedRequestFunctionContext&) = delete; + GuardedRequestFunctionContext &operator=(const GuardedRequestFunctionContext&) = delete; + +private: + boost::function m_callback; + void finish(int r) override { + ceph_assert(cell); + m_callback(*this); + } +}; + +class GuardedRequest { +public: + const BlockExtent block_extent; + GuardedRequestFunctionContext *guard_ctx; /* Work to do when guard on range obtained */ + + GuardedRequest(const BlockExtent block_extent, + GuardedRequestFunctionContext *on_guard_acquire, bool barrier = false) + : block_extent(block_extent), guard_ctx(on_guard_acquire) { + guard_ctx->state.barrier = barrier; + } + friend std::ostream &operator<<(std::ostream &os, + const GuardedRequest &r) { + os << "guard_ctx->state=[" << r.guard_ctx->state + << 
"], block_extent.block_start=" << r.block_extent.block_start + << ", block_extent.block_end=" << r.block_extent.block_end; + return os; + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_REQUEST_H diff --git a/src/librbd/cache/pwl/ShutdownRequest.cc b/src/librbd/cache/pwl/ShutdownRequest.cc new file mode 100644 index 000000000..e022328ba --- /dev/null +++ b/src/librbd/cache/pwl/ShutdownRequest.cc @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/pwl/ShutdownRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Operations.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/Types.h" + +#include "librbd/cache/pwl/AbstractWriteLog.h" +#include "librbd/plugin/Api.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl:ShutdownRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +ShutdownRequest* ShutdownRequest::create( + I &image_ctx, + AbstractWriteLog *image_cache, + plugin::Api& plugin_api, + Context *on_finish) { + return new ShutdownRequest(image_ctx, image_cache, plugin_api, on_finish); +} + +template +ShutdownRequest::ShutdownRequest( + I &image_ctx, + AbstractWriteLog *image_cache, + plugin::Api& plugin_api, + Context *on_finish) + : m_image_ctx(image_ctx), + m_image_cache(image_cache), + m_plugin_api(plugin_api), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template +void ShutdownRequest::send() { + send_shutdown_image_cache(); +} + +template +void ShutdownRequest::send_shutdown_image_cache() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (m_image_cache == nullptr) { + finish(); + return; + } + + using klass = ShutdownRequest; + Context *ctx = create_context_callback( + this); + + m_image_cache->shut_down(ctx); +} + +template +void ShutdownRequest::handle_shutdown_image_cache(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to shut down the image cache: " << cpp_strerror(r) + << dendl; + save_result(r); + finish(); + return; + } else { + delete m_image_cache; + m_image_cache = nullptr; + } + send_remove_feature_bit(); +} + +template +void ShutdownRequest::send_remove_feature_bit() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + uint64_t new_features = m_image_ctx.features & ~RBD_FEATURE_DIRTY_CACHE; + uint64_t features_mask = RBD_FEATURE_DIRTY_CACHE; + ldout(cct, 10) << "old_features=" << m_image_ctx.features + << ", new_features=" << new_features + << ", features_mask=" << features_mask + << dendl; + + int r = librbd::cls_client::set_features(&m_image_ctx.md_ctx, m_image_ctx.header_oid, + new_features, features_mask); + m_image_ctx.features &= ~RBD_FEATURE_DIRTY_CACHE; + using klass = ShutdownRequest; + Context *ctx = create_context_callback( + this); + ctx->complete(r); +} + +template +void ShutdownRequest::handle_remove_feature_bit(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove the feature bit: " << cpp_strerror(r) + << dendl; + save_result(r); + finish(); + return; + } + 
send_remove_image_cache_state(); +} + +template +void ShutdownRequest::send_remove_image_cache_state() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = ShutdownRequest; + Context *ctx = create_context_callback( + this); + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + m_plugin_api.execute_image_metadata_remove(&m_image_ctx, PERSISTENT_CACHE_STATE, ctx); +} + +template +void ShutdownRequest::handle_remove_image_cache_state(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove the image cache state: " << cpp_strerror(r) + << dendl; + save_result(r); + } + finish(); +} + +template +void ShutdownRequest::finish() { + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ShutdownRequest; diff --git a/src/librbd/cache/pwl/ShutdownRequest.h b/src/librbd/cache/pwl/ShutdownRequest.h new file mode 100644 index 000000000..dafac9e9c --- /dev/null +++ b/src/librbd/cache/pwl/ShutdownRequest.h @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace plugin { template struct Api; } + +namespace cache { +namespace pwl { + +template +class AbstractWriteLog; + +template +class ImageCacheState; + +template +class ShutdownRequest { +public: + static ShutdownRequest* create( + ImageCtxT &image_ctx, + AbstractWriteLog *image_cache, + plugin::Api& plugin_api, + Context *on_finish); + + void send(); + +private: + + /** + * @verbatim + * + * Shutdown request goes through the following state machine: + * + * + * | + * v + * SHUTDOWN_IMAGE_CACHE + * | + * v + * REMOVE_IMAGE_FEATURE_BIT + * | + * v + * REMOVE_IMAGE_CACHE_STATE + * | + * v + * + * + * @endverbatim + */ + + ShutdownRequest(ImageCtxT &image_ctx, + AbstractWriteLog *image_cache, + plugin::Api& plugin_api, + Context *on_finish); + + ImageCtxT &m_image_ctx; + AbstractWriteLog *m_image_cache; + plugin::Api& m_plugin_api; + Context *m_on_finish; + + int m_error_result; + + void send_shutdown_image_cache(); + void handle_shutdown_image_cache(int r); + + void send_remove_feature_bit(); + void handle_remove_feature_bit(int r); + + void send_remove_image_cache_state(); + void handle_remove_image_cache_state(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::ShutdownRequest; + +#endif // CEPH_LIBRBD_CACHE_RWL_SHUTDOWN_REQUEST_H diff --git a/src/librbd/cache/pwl/SyncPoint.cc b/src/librbd/cache/pwl/SyncPoint.cc new file mode 100644 index 000000000..6d45e7a30 --- /dev/null +++ b/src/librbd/cache/pwl/SyncPoint.cc @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SyncPoint.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::SyncPoint: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { + +SyncPoint::SyncPoint(uint64_t sync_gen_num, CephContext *cct) + : log_entry(std::make_shared(sync_gen_num)), m_cct(cct) { + 
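+ /* Two chained gathers: m_prior_log_entries_persisted completes once all
+ * writes bearing this sync gen number (and any earlier sync point) have
+ * persisted; its finisher is a sub-op of m_sync_point_persist, whose own
+ * finisher appends and persists this sync point itself. */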
m_prior_log_entries_persisted = new C_Gather(cct, nullptr); + m_sync_point_persist = new C_Gather(cct, nullptr); + on_sync_point_appending.reserve(MAX_WRITES_PER_SYNC_POINT + 2); + on_sync_point_persisted.reserve(MAX_WRITES_PER_SYNC_POINT + 2); + ldout(m_cct, 20) << "sync point " << sync_gen_num << dendl; +} + +SyncPoint::~SyncPoint() { + ceph_assert(on_sync_point_appending.empty()); + ceph_assert(on_sync_point_persisted.empty()); + ceph_assert(!earlier_sync_point); +} + +std::ostream &operator<<(std::ostream &os, + const SyncPoint &p) { + os << "log_entry=[" << *p.log_entry + << "], earlier_sync_point=" << p.earlier_sync_point + << ", later_sync_point=" << p.later_sync_point + << ", m_final_op_sequence_num=" << p.m_final_op_sequence_num + << ", m_prior_log_entries_persisted=" << p.m_prior_log_entries_persisted + << ", m_prior_log_entries_persisted_complete=" << p.m_prior_log_entries_persisted_complete + << ", m_append_scheduled=" << p.m_append_scheduled + << ", appending=" << p.appending + << ", on_sync_point_appending=" << p.on_sync_point_appending.size() + << ", on_sync_point_persisted=" << p.on_sync_point_persisted.size(); + return os; +} + +void SyncPoint::persist_gather_set_finisher(Context *ctx) { + m_append_scheduled = true; + /* All prior sync points that are still in this list must already be scheduled for append */ + std::shared_ptr previous = earlier_sync_point; + while (previous) { + ceph_assert(previous->m_append_scheduled); + previous = previous->earlier_sync_point; + } + + m_sync_point_persist->set_finisher(ctx); +} + +void SyncPoint::persist_gather_activate() { + m_sync_point_persist->activate(); +} + +Context* SyncPoint::persist_gather_new_sub() { + return m_sync_point_persist->new_sub(); +} + +void SyncPoint::prior_persisted_gather_activate() { + m_prior_log_entries_persisted->activate(); +} + +Context* SyncPoint::prior_persisted_gather_new_sub() { + return m_prior_log_entries_persisted->new_sub(); +} + +void SyncPoint::prior_persisted_gather_set_finisher() { + Context *sync_point_persist_ready = persist_gather_new_sub(); + std::shared_ptr sp = shared_from_this(); + m_prior_log_entries_persisted-> + set_finisher(new LambdaContext([this, sp, sync_point_persist_ready](int r) { + ldout(m_cct, 20) << "Prior log entries persisted for sync point =[" + << sp << "]" << dendl; + sp->m_prior_log_entries_persisted_result = r; + sp->m_prior_log_entries_persisted_complete = true; + sync_point_persist_ready->complete(r); + })); +} + +void SyncPoint::add_in_on_persisted_ctxs(Context* ctx) { + on_sync_point_persisted.push_back(ctx); +} + +void SyncPoint::add_in_on_appending_ctxs(Context* ctx) { + on_sync_point_appending.push_back(ctx); +} + +void SyncPoint::setup_earlier_sync_point(std::shared_ptr sync_point, + uint64_t last_op_sequence_num) { + earlier_sync_point = sync_point; + log_entry->prior_sync_point_flushed = false; + earlier_sync_point->log_entry->next_sync_point_entry = log_entry; + earlier_sync_point->later_sync_point = shared_from_this(); + earlier_sync_point->m_final_op_sequence_num = last_op_sequence_num; + if (!earlier_sync_point->appending) { + /* Append of new sync point deferred until old sync point is appending */ + earlier_sync_point->add_in_on_appending_ctxs(prior_persisted_gather_new_sub()); + } +} + +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/SyncPoint.h b/src/librbd/cache/pwl/SyncPoint.h new file mode 100644 index 000000000..424e3730e --- /dev/null +++ b/src/librbd/cache/pwl/SyncPoint.h @@ -0,0 +1,69 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+#define CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+class SyncPoint: public std::enable_shared_from_this<SyncPoint> {
+public:
+  std::shared_ptr<SyncPointLogEntry> log_entry;
+  /* Use lock for earlier/later links */
+  std::shared_ptr<SyncPoint> earlier_sync_point; /* NULL if earlier has completed */
+  std::shared_ptr<SyncPoint> later_sync_point;
+  bool appending = false;
+  /* Signal these when this sync point is appending to the log, and its order
+   * of appearance is guaranteed. One of these is a sub-operation of the
+   * next sync point's m_prior_log_entries_persisted Gather. */
+  std::vector<Context*> on_sync_point_appending;
+  /* Signal these when this sync point is appended and persisted. User
+   * aio_flush() calls are added to this. */
+  std::vector<Context*> on_sync_point_persisted;
+
+  SyncPoint(uint64_t sync_gen_num, CephContext *cct);
+  ~SyncPoint();
+  SyncPoint(const SyncPoint&) = delete;
+  SyncPoint &operator=(const SyncPoint&) = delete;
+  void persist_gather_activate();
+  Context* persist_gather_new_sub();
+  void persist_gather_set_finisher(Context *ctx);
+  void prior_persisted_gather_activate();
+  Context* prior_persisted_gather_new_sub();
+  void prior_persisted_gather_set_finisher();
+  void add_in_on_persisted_ctxs(Context* cxt);
+  void add_in_on_appending_ctxs(Context* cxt);
+  void setup_earlier_sync_point(std::shared_ptr<SyncPoint> sync_point,
+                                uint64_t last_op_sequence_num);
+private:
+  CephContext *m_cct;
+  bool m_append_scheduled = false;
+  uint64_t m_final_op_sequence_num = 0;
+  /* A sync point can't appear in the log until all the writes bearing
+   * it and all the prior sync points have been appended and
+   * persisted.
+   *
+   * Writes bearing this sync gen number and the prior sync point will be
+   * sub-ops of this Gather. This sync point will not be appended until all
+   * these complete to the point where their persist order is guaranteed. */
+  C_Gather *m_prior_log_entries_persisted;
+  /* The finisher for this will append the sync point to the log. The finisher
+   * for m_prior_log_entries_persisted will be a sub-op of this. */
+  C_Gather *m_sync_point_persist;
+  int m_prior_log_entries_persisted_result = 0;
+  int m_prior_log_entries_persisted_complete = false;
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const SyncPoint &p);
+};
+
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_RWL_SYNC_POINT_H
diff --git a/src/librbd/cache/pwl/Types.cc b/src/librbd/cache/pwl/Types.cc
new file mode 100644
index 000000000..c29305eec
--- /dev/null
+++ b/src/librbd/cache/pwl/Types.cc
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+#include "Types.h"
+#include "common/ceph_context.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::Types: " << this << " " \
+                           << __func__ << ": "
+using ceph::Formatter;
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+
+DeferredContexts::~DeferredContexts() {
+  finish_contexts(nullptr, contexts, 0);
+}
+
+void DeferredContexts::add(Context* ctx) {
+  contexts.push_back(ctx);
+}
+
+/*
+ * A BlockExtent identifies a range by first and last.
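+ * (That is, by its endpoints rather than by offset and length; with the
+ * byte-granularity "block size" of 1 used here, a 4096-byte image range
+ * at offset 4096 becomes the BlockExtent [4096, 8192), as computed by
+ * convert_to_block_extent() below.)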
+ * + * An Extent ("image extent") identifies a range by start and length. + * + * The ImageDispatch interface is defined in terms of image extents, and + * requires no alignment of the beginning or end of the extent. We + * convert between image and block extents here using a "block size" + * of 1. + */ +BlockExtent convert_to_block_extent(uint64_t offset_bytes, uint64_t length_bytes) +{ + return BlockExtent(offset_bytes, + offset_bytes + length_bytes); +} + +BlockExtent WriteLogCacheEntry::block_extent() { + return convert_to_block_extent(image_offset_bytes, write_bytes); +} + +uint64_t WriteLogCacheEntry::get_offset_bytes() { + return image_offset_bytes; +} + +uint64_t WriteLogCacheEntry::get_write_bytes() { + return write_bytes; +} + +#ifdef WITH_RBD_SSD_CACHE +void WriteLogCacheEntry::dump(Formatter *f) const { + f->dump_unsigned("sync_gen_number", sync_gen_number); + f->dump_unsigned("write_sequence_number", write_sequence_number); + f->dump_unsigned("image_offset_bytes", image_offset_bytes); + f->dump_unsigned("write_bytes", write_bytes); + f->dump_unsigned("write_data_pos", write_data_pos); + f->dump_bool("entry_valid", is_entry_valid()); + f->dump_bool("sync_point", is_sync_point()); + f->dump_bool("sequenced", is_sequenced()); + f->dump_bool("has_data", has_data()); + f->dump_bool("discard", is_discard()); + f->dump_bool("writesame", is_writesame()); + f->dump_unsigned("ws_datalen", ws_datalen); + f->dump_unsigned("entry_index", entry_index); +} + +void WriteLogCacheEntry::generate_test_instances(std::list& ls) { + ls.push_back(new WriteLogCacheEntry()); + ls.push_back(new WriteLogCacheEntry); + ls.back()->sync_gen_number = 1; + ls.back()->write_sequence_number = 1; + ls.back()->image_offset_bytes = 1; + ls.back()->write_bytes = 1; + ls.back()->write_data_pos = 1; + ls.back()->set_entry_valid(true); + ls.back()->set_sync_point(true); + ls.back()->set_sequenced(true); + ls.back()->set_has_data(true); + ls.back()->set_discard(true); + ls.back()->set_writesame(true); + ls.back()->ws_datalen = 1; + ls.back()->entry_index = 1; +} + +void WriteLogPoolRoot::dump(Formatter *f) const { + f->dump_unsigned("layout_version", layout_version); + f->dump_unsigned("cur_sync_gen", cur_sync_gen); + f->dump_unsigned("pool_size", pool_size); + f->dump_unsigned("flushed_sync_gen", flushed_sync_gen); + f->dump_unsigned("block_size", block_size); + f->dump_unsigned("num_log_entries", num_log_entries); + f->dump_unsigned("first_free_entry", first_free_entry); + f->dump_unsigned("first_valid_entry", first_valid_entry); +} + +void WriteLogPoolRoot::generate_test_instances(std::list& ls) { + ls.push_back(new WriteLogPoolRoot()); + ls.push_back(new WriteLogPoolRoot); + ls.back()->layout_version = 2; + ls.back()->cur_sync_gen = 1; + ls.back()->pool_size = 1024; + ls.back()->flushed_sync_gen = 1; + ls.back()->block_size = 4096; + ls.back()->num_log_entries = 10000000; + ls.back()->first_free_entry = 1; + ls.back()->first_valid_entry = 0; +} +#endif + +std::ostream& operator<<(std::ostream& os, + const WriteLogCacheEntry &entry) { + os << "entry_valid=" << entry.is_entry_valid() + << ", sync_point=" << entry.is_sync_point() + << ", sequenced=" << entry.is_sequenced() + << ", has_data=" << entry.has_data() + << ", discard=" << entry.is_discard() + << ", writesame=" << entry.is_writesame() + << ", sync_gen_number=" << entry.sync_gen_number + << ", write_sequence_number=" << entry.write_sequence_number + << ", image_offset_bytes=" << entry.image_offset_bytes + << ", write_bytes=" << entry.write_bytes + << ", 
ws_datalen=" << entry.ws_datalen + << ", entry_index=" << entry.entry_index; + return os; +} + +template +ExtentsSummary::ExtentsSummary(const ExtentsType &extents) + : total_bytes(0), first_image_byte(0), last_image_byte(0) +{ + if (extents.empty()) return; + /* These extents refer to image offsets between first_image_byte + * and last_image_byte, inclusive, but we don't guarantee here + * that they address all of those bytes. There may be gaps. */ + first_image_byte = extents.front().first; + last_image_byte = first_image_byte + extents.front().second; + for (auto &extent : extents) { + /* Ignore zero length extents */ + if (extent.second) { + total_bytes += extent.second; + if (extent.first < first_image_byte) { + first_image_byte = extent.first; + } + if ((extent.first + extent.second) > last_image_byte) { + last_image_byte = extent.first + extent.second; + } + } + } +} + +io::Extent whole_volume_extent() { + return io::Extent({0, std::numeric_limits::max()}); +} + +BlockExtent block_extent(const io::Extent& image_extent) { + return convert_to_block_extent(image_extent.first, image_extent.second); +} + +Context * override_ctx(int r, Context *ctx) { + if (r < 0) { + /* Override next_ctx status with this error */ + return new LambdaContext( + [r, ctx](int _r) { + ctx->complete(r); + }); + } else { + return ctx; + } +} + +std::string unique_lock_name(const std::string &name, void *address) { + return name + " (" + stringify(address) + ")"; +} + +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ExtentsSummary; diff --git a/src/librbd/cache/pwl/Types.h b/src/librbd/cache/pwl/Types.h new file mode 100644 index 000000000..0d8c93a24 --- /dev/null +++ b/src/librbd/cache/pwl/Types.h @@ -0,0 +1,445 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_TYPES_H +#define CEPH_LIBRBD_CACHE_PWL_TYPES_H + +#include "acconfig.h" + +#ifdef WITH_RBD_RWL +#include "libpmemobj.h" +#endif + +#include +#include "librbd/BlockGuard.h" +#include "librbd/io/Types.h" + +namespace ceph { +class Formatter; +} + +class Context; + +enum { + l_librbd_pwl_first = 26500, + + // All read requests + l_librbd_pwl_rd_req, // read requests + l_librbd_pwl_rd_bytes, // bytes read + l_librbd_pwl_rd_latency, // average req completion latency + + // Read requests completed from RWL (no misses) + l_librbd_pwl_rd_hit_req, // read requests + l_librbd_pwl_rd_hit_bytes, // bytes read + l_librbd_pwl_rd_hit_latency, // average req completion latency + + // Reed requests with hit and miss extents + l_librbd_pwl_rd_part_hit_req, // read ops + + // Per SyncPoint's LogEntry number and write bytes distribution + l_librbd_pwl_syncpoint_hist, + + // All write requests + l_librbd_pwl_wr_req, // write requests + l_librbd_pwl_wr_bytes, // bytes written + l_librbd_pwl_wr_req_def, // write requests deferred for resources + l_librbd_pwl_wr_req_def_lanes, // write requests deferred for lanes + l_librbd_pwl_wr_req_def_log, // write requests deferred for log entries + l_librbd_pwl_wr_req_def_buf, // write requests deferred for buffer space + l_librbd_pwl_wr_req_overlap, // write requests detained for overlap + l_librbd_pwl_wr_req_queued, // write requests queued for prior barrier + + // Write log operations (1 .. 
n per request that appends to the log) + l_librbd_pwl_log_ops, // log append ops + l_librbd_pwl_log_op_bytes, // average bytes written per log op + + /* + + Req and op average latencies to the beginning of and over various phases: + + +------------------------------+------+-------------------------------+ + | Phase | Name | Description | + +------------------------------+------+-------------------------------+ + | Arrive at RWL | arr |Arrives as a request | + +------------------------------+------+-------------------------------+ + | Allocate resources | all |time spent in block guard for | + | | |overlap sequencing occurs | + | | |before this point | + +------------------------------+------+-------------------------------+ + | Dispatch | dis |Op lifetime begins here. time | + | | |spent in allocation waiting for| + | | |resources occurs before this | + | | |point | + +------------------------------+------+-------------------------------+ + | Payload buffer persist and | buf |time spent queued for | + |replicate | |replication occurs before here | + +------------------------------+------+-------------------------------+ + | Payload buffer persist | bufc |bufc - buf is just the persist | + |complete | |time | + +------------------------------+------+-------------------------------+ + | Log append | app |time spent queued for append | + | | |occurs before here | + +------------------------------+------+-------------------------------+ + | Append complete | appc |appc - app is just the time | + | | |spent in the append operation | + +------------------------------+------+-------------------------------+ + | Complete | cmp |write persisted, replicated, | + | | |and globally visible | + +------------------------------+------+-------------------------------+ + + */ + + /* Request times */ + l_librbd_pwl_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard + l_librbd_pwl_req_arr_to_dis_t, // arrival to dispatch elapsed time + l_librbd_pwl_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources + l_librbd_pwl_wr_latency, // average req (persist) completion latency + l_librbd_pwl_wr_latency_hist, // Histogram of write req (persist) completion latency vs. bytes written + l_librbd_pwl_wr_caller_latency, // average req completion (to caller) latency + + /* Request times for requests that never waited for space*/ + l_librbd_pwl_nowait_req_arr_to_all_t, // arrival to allocation elapsed time - same as time deferred in block guard + l_librbd_pwl_nowait_req_arr_to_dis_t, // arrival to dispatch elapsed time + l_librbd_pwl_nowait_req_all_to_dis_t, // Time spent allocating or waiting to allocate resources + l_librbd_pwl_nowait_wr_latency, // average req (persist) completion latency + l_librbd_pwl_nowait_wr_latency_hist, // Histogram of write req (persist) completion latency vs. 
bytes written + l_librbd_pwl_nowait_wr_caller_latency, // average req completion (to caller) latency + + /* Log operation times */ + l_librbd_pwl_log_op_alloc_t, // elapsed time of pmemobj_reserve() + l_librbd_pwl_log_op_alloc_t_hist, // Histogram of elapsed time of pmemobj_reserve() + + l_librbd_pwl_log_op_dis_to_buf_t, // dispatch to buffer persist elapsed time + l_librbd_pwl_log_op_dis_to_app_t, // dispatch to log append elapsed time + l_librbd_pwl_log_op_dis_to_cmp_t, // dispatch to persist completion elapsed time + l_librbd_pwl_log_op_dis_to_cmp_t_hist, // Histogram of dispatch to persist completion elapsed time + + l_librbd_pwl_log_op_buf_to_app_t, // data buf persist + append wait time + l_librbd_pwl_log_op_buf_to_bufc_t,// data buf persist / replicate elapsed time + l_librbd_pwl_log_op_buf_to_bufc_t_hist,// data buf persist time vs bytes histogram + l_librbd_pwl_log_op_app_to_cmp_t, // log entry append + completion wait time + l_librbd_pwl_log_op_app_to_appc_t, // log entry append / replicate elapsed time + l_librbd_pwl_log_op_app_to_appc_t_hist, // log entry append time (vs. op bytes) histogram + + l_librbd_pwl_discard, + l_librbd_pwl_discard_bytes, + l_librbd_pwl_discard_latency, + + l_librbd_pwl_aio_flush, + l_librbd_pwl_aio_flush_def, + l_librbd_pwl_aio_flush_latency, + l_librbd_pwl_ws, + l_librbd_pwl_ws_bytes, // Bytes modified by write same, probably much larger than WS payload bytes + l_librbd_pwl_ws_latency, + + l_librbd_pwl_cmp, + l_librbd_pwl_cmp_bytes, + l_librbd_pwl_cmp_latency, + l_librbd_pwl_cmp_fails, + + l_librbd_pwl_internal_flush, + l_librbd_pwl_writeback_latency, + l_librbd_pwl_invalidate_cache, + l_librbd_pwl_invalidate_discard_cache, + + l_librbd_pwl_append_tx_t, + l_librbd_pwl_retire_tx_t, + l_librbd_pwl_append_tx_t_hist, + l_librbd_pwl_retire_tx_t_hist, + + l_librbd_pwl_last, +}; + +enum { + WRITE_LOG_CACHE_ENTRY_VALID = 1U << 0, /* if 0, this entry is free */ + WRITE_LOG_CACHE_ENTRY_SYNC_POINT = 1U << 1, /* No data. No write sequence number. 
+ Marks sync point for this sync gen number */ + WRITE_LOG_CACHE_ENTRY_SEQUENCED = 1U << 2, /* write sequence number is valid */ + WRITE_LOG_CACHE_ENTRY_HAS_DATA = 1U << 3, /* write_data field is valid (else ignore) */ + WRITE_LOG_CACHE_ENTRY_DISCARD = 1U << 4, /* has_data will be 0 if this is a discard */ + WRITE_LOG_CACHE_ENTRY_WRITESAME = 1U << 5, /* ws_datalen indicates length of data at write_bytes */ +}; + +namespace librbd { +namespace cache { +namespace pwl { + +class ImageExtentBuf; + +const int IN_FLIGHT_FLUSH_WRITE_LIMIT = 64; +const int IN_FLIGHT_FLUSH_BYTES_LIMIT = (1 * 1024 * 1024); + +/* Limit work between sync points */ +const uint64_t MAX_WRITES_PER_SYNC_POINT = 256; +const uint64_t MAX_BYTES_PER_SYNC_POINT = (1024 * 1024 * 8); + +const uint32_t MIN_WRITE_ALLOC_SIZE = 512; +const uint32_t MIN_WRITE_ALLOC_SSD_SIZE = 4096; +const uint32_t LOG_STATS_INTERVAL_SECONDS = 5; + +/**** Write log entries ****/ +const unsigned long int MAX_ALLOC_PER_TRANSACTION = 8; +const unsigned long int MAX_FREE_PER_TRANSACTION = 1; +const unsigned int MAX_CONCURRENT_WRITES = (1024 * 1024); + +const uint64_t DEFAULT_POOL_SIZE = 1u<<30; +const uint64_t MIN_POOL_SIZE = DEFAULT_POOL_SIZE; +const uint64_t POOL_SIZE_ALIGN = 1 << 20; +constexpr double USABLE_SIZE = (7.0 / 10); +const uint64_t BLOCK_ALLOC_OVERHEAD_BYTES = 16; +const uint8_t RWL_LAYOUT_VERSION = 1; +const uint8_t SSD_LAYOUT_VERSION = 1; +const uint64_t MAX_LOG_ENTRIES = (1024 * 1024); +const double AGGRESSIVE_RETIRE_HIGH_WATER = 0.75; +const double RETIRE_HIGH_WATER = 0.50; +const double RETIRE_LOW_WATER = 0.40; +const int RETIRE_BATCH_TIME_LIMIT_MS = 250; +const uint64_t CONTROL_BLOCK_MAX_LOG_ENTRIES = 32; +const uint64_t SPAN_MAX_DATA_LEN = (16 * 1024 * 1024); + +/* offset of ring on SSD */ +const uint64_t DATA_RING_BUFFER_OFFSET = 8192; + +/* Defer a set of Contexts until destruct/exit. Used for deferring + * work on a given thread until a required lock is dropped. */ +class DeferredContexts { +private: + std::vector contexts; +public: + ~DeferredContexts(); + void add(Context* ctx); +}; + +/* Pmem structures */ +#ifdef WITH_RBD_RWL +POBJ_LAYOUT_BEGIN(rbd_pwl); +POBJ_LAYOUT_ROOT(rbd_pwl, struct WriteLogPoolRoot); +POBJ_LAYOUT_TOID(rbd_pwl, uint8_t); +POBJ_LAYOUT_TOID(rbd_pwl, struct WriteLogCacheEntry); +POBJ_LAYOUT_END(rbd_pwl); +#endif + +struct WriteLogCacheEntry { + uint64_t sync_gen_number = 0; + uint64_t write_sequence_number = 0; + uint64_t image_offset_bytes; + uint64_t write_bytes; + #ifdef WITH_RBD_RWL + TOID(uint8_t) write_data; + #endif + #ifdef WITH_RBD_SSD_CACHE + uint64_t write_data_pos = 0; /* SSD data offset */ + #endif + uint8_t flags = 0; + uint32_t ws_datalen = 0; /* Length of data buffer (writesame only) */ + uint32_t entry_index = 0; /* For debug consistency check. 
Can be removed if + * we need the space */ + WriteLogCacheEntry(uint64_t image_offset_bytes=0, uint64_t write_bytes=0) + : image_offset_bytes(image_offset_bytes), write_bytes(write_bytes) {} + BlockExtent block_extent(); + uint64_t get_offset_bytes(); + uint64_t get_write_bytes(); + bool is_entry_valid() const { + return flags & WRITE_LOG_CACHE_ENTRY_VALID; + } + bool is_sync_point() const { + return flags & WRITE_LOG_CACHE_ENTRY_SYNC_POINT; + } + bool is_sequenced() const { + return flags & WRITE_LOG_CACHE_ENTRY_SEQUENCED; + } + bool has_data() const { + return flags & WRITE_LOG_CACHE_ENTRY_HAS_DATA; + } + bool is_discard() const { + return flags & WRITE_LOG_CACHE_ENTRY_DISCARD; + } + bool is_writesame() const { + return flags & WRITE_LOG_CACHE_ENTRY_WRITESAME; + } + bool is_write() const { + /* Log entry is a basic write */ + return !is_sync_point() && !is_discard() && !is_writesame(); + } + bool is_writer() const { + /* Log entry is any type that writes data */ + return is_write() || is_discard() || is_writesame(); + } + void set_entry_valid(bool flag) { + if (flag) { + flags |= WRITE_LOG_CACHE_ENTRY_VALID; + } else { + flags &= ~WRITE_LOG_CACHE_ENTRY_VALID; + } + } + void set_sync_point(bool flag) { + if (flag) { + flags |= WRITE_LOG_CACHE_ENTRY_SYNC_POINT; + } else { + flags &= ~WRITE_LOG_CACHE_ENTRY_SYNC_POINT; + } + } + void set_sequenced(bool flag) { + if (flag) { + flags |= WRITE_LOG_CACHE_ENTRY_SEQUENCED; + } else { + flags &= ~WRITE_LOG_CACHE_ENTRY_SEQUENCED; + } + } + void set_has_data(bool flag) { + if (flag) { + flags |= WRITE_LOG_CACHE_ENTRY_HAS_DATA; + } else { + flags &= ~WRITE_LOG_CACHE_ENTRY_HAS_DATA; + } + } + void set_discard(bool flag) { + if (flag) { + flags |= WRITE_LOG_CACHE_ENTRY_DISCARD; + } else { + flags &= ~WRITE_LOG_CACHE_ENTRY_DISCARD; + } + } + void set_writesame(bool flag) { + if (flag) { + flags |= WRITE_LOG_CACHE_ENTRY_WRITESAME; + } else { + flags &= ~WRITE_LOG_CACHE_ENTRY_WRITESAME; + } + } + friend std::ostream& operator<<(std::ostream& os, + const WriteLogCacheEntry &entry); + #ifdef WITH_RBD_SSD_CACHE + DENC(WriteLogCacheEntry, v, p) { + DENC_START(1, 1, p); + denc(v.sync_gen_number, p); + denc(v.write_sequence_number, p); + denc(v.image_offset_bytes, p); + denc(v.write_bytes, p); + denc(v.write_data_pos, p); + denc(v.flags, p); + denc(v.ws_datalen, p); + denc(v.entry_index, p); + DENC_FINISH(p); + } + #endif + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); +}; + +struct WriteLogPoolRoot { + #ifdef WITH_RBD_RWL + union { + struct { + uint8_t layout_version; + }; + uint64_t _u64; + } header; + TOID(struct WriteLogCacheEntry) log_entries; /* contiguous array of log entries */ + #endif + #ifdef WITH_RBD_SSD_CACHE + uint64_t layout_version = 0; + uint64_t cur_sync_gen = 0; /* TODO: remove it when changing disk format */ + #endif + uint64_t pool_size; + uint64_t flushed_sync_gen; /* All writing entries with this or a lower + * sync gen number are flushed. 
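+ * ("Flushed" means written back to the RBD image, so these entries are
+ * safe to retire and their log space can be reclaimed.)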
*/ + uint32_t block_size; + uint32_t num_log_entries; + uint64_t first_free_entry; /* The free entry following the latest valid + * entry, which is going to be written */ + uint64_t first_valid_entry; /* The oldest valid entry to be retired */ + + #ifdef WITH_RBD_SSD_CACHE + DENC(WriteLogPoolRoot, v, p) { + DENC_START(1, 1, p); + denc(v.layout_version, p); + denc(v.cur_sync_gen, p); + denc(v.pool_size, p); + denc(v.flushed_sync_gen, p); + denc(v.block_size, p); + denc(v.num_log_entries, p); + denc(v.first_free_entry, p); + denc(v.first_valid_entry, p); + DENC_FINISH(p); + } + #endif + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list& ls); +}; + +struct WriteBufferAllocation { + unsigned int allocation_size = 0; + #ifdef WITH_RBD_RWL + pobj_action buffer_alloc_action; + TOID(uint8_t) buffer_oid = OID_NULL; + #endif + bool allocated = false; + utime_t allocation_lat; +}; + +static inline io::Extent image_extent(const BlockExtent& block_extent) { + return io::Extent(block_extent.block_start, + block_extent.block_end - block_extent.block_start); +} + +template +class ExtentsSummary { +public: + uint64_t total_bytes; + uint64_t first_image_byte; + uint64_t last_image_byte; + explicit ExtentsSummary(const ExtentsType &extents); + friend std::ostream &operator<<(std::ostream &os, + const ExtentsSummary &s) { + os << "total_bytes=" << s.total_bytes + << ", first_image_byte=" << s.first_image_byte + << ", last_image_byte=" << s.last_image_byte; + return os; + } + BlockExtent block_extent() { + return BlockExtent(first_image_byte, last_image_byte); + } + io::Extent image_extent() { + return librbd::cache::pwl::image_extent(block_extent()); + } +}; + +io::Extent whole_volume_extent(); + +BlockExtent block_extent(const io::Extent& image_extent); + +Context * override_ctx(int r, Context *ctx); + +class ImageExtentBuf : public io::Extent { +public: + bufferlist m_bl; + bool need_to_truncate; + int truncate_offset; + bool writesame; + ImageExtentBuf() {} + ImageExtentBuf(io::Extent extent, + bool need_to_truncate = false, uint64_t truncate_offset = 0, + bool writesame = false) + : io::Extent(extent), need_to_truncate(need_to_truncate), + truncate_offset(truncate_offset), writesame(writesame) {} + ImageExtentBuf(io::Extent extent, bufferlist bl, + bool need_to_truncate = false, uint64_t truncate_offset = 0, + bool writesame = false) + : io::Extent(extent), m_bl(bl), need_to_truncate(need_to_truncate), + truncate_offset(truncate_offset), writesame(writesame) {} +}; + +std::string unique_lock_name(const std::string &name, void *address); + +} // namespace pwl +} // namespace cache +} // namespace librbd + +#ifdef WITH_RBD_SSD_CACHE +WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogCacheEntry) +WRITE_CLASS_DENC(librbd::cache::pwl::WriteLogPoolRoot) +#endif + +#endif // CEPH_LIBRBD_CACHE_PWL_TYPES_H diff --git a/src/librbd/cache/pwl/rwl/Builder.h b/src/librbd/cache/pwl/rwl/Builder.h new file mode 100644 index 000000000..c13c7b5ae --- /dev/null +++ b/src/librbd/cache/pwl/rwl/Builder.h @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H +#define CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H + +#include +#include "LogEntry.h" +#include "ReadRequest.h" +#include "Request.h" +#include "LogOperation.h" + +#include "librbd/cache/ImageWriteback.h" +#include "librbd/cache/pwl/Builder.h" + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +template +class 
Builder : public pwl::Builder { +public: + std::shared_ptr create_write_log_entry( + uint64_t image_offset_bytes, uint64_t write_bytes) override { + return std::make_shared(image_offset_bytes, write_bytes); + } + std::shared_ptr create_write_log_entry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) override { + return std::make_shared( + sync_point_entry, image_offset_bytes, write_bytes); + } + std::shared_ptr create_writesame_log_entry( + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) override { + return std::make_shared( + image_offset_bytes, write_bytes, data_length); + } + std::shared_ptr create_writesame_log_entry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) override { + return std::make_shared( + sync_point_entry, image_offset_bytes, write_bytes, data_length); + } + pwl::C_WriteRequest *create_write_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) override { + return new C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req); + } + pwl::C_WriteSameRequest *create_writesame_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) override { + return new C_WriteSameRequest( + pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req); + } + pwl::C_WriteRequest *create_comp_and_write_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) override { + return new rwl::C_CompAndWriteRequest( + pwl, arrived, std::move(image_extents), std::move(cmp_bl), + std::move(bl), mismatch_offset, fadvise_flags, + lock, perfcounter, user_req); + } + std::shared_ptr create_write_log_operation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, CephContext *cct, + std::shared_ptr write_log_entry) { + return std::make_shared( + set, image_offset_bytes, write_bytes, cct, write_log_entry); + } + std::shared_ptr create_write_log_operation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t data_len, CephContext *cct, + std::shared_ptr writesame_log_entry) { + return std::make_shared( + set, image_offset_bytes, write_bytes, data_len, cct, + writesame_log_entry); + } + std::shared_ptr create_discard_log_operation( + std::shared_ptr sync_point, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t discard_granularity_bytes, + utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) { + return std::make_shared( + sync_point, image_offset_bytes, write_bytes, discard_granularity_bytes, + dispatch_time, perfcounter, cct); + } + C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived, + PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) { + return new C_ReadRequest(cct, arrived, perfcounter, bl, on_finish); + } +}; + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_RWL_BUILDER_H diff --git a/src/librbd/cache/pwl/rwl/LogEntry.cc b/src/librbd/cache/pwl/rwl/LogEntry.cc new file 
mode 100644 index 000000000..38e09c22a --- /dev/null +++ b/src/librbd/cache/pwl/rwl/LogEntry.cc @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/ImageWriteback.h" +#include "LogEntry.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLogEntry: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +void WriteLogEntry::writeback( + librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) { + /* Pass a copy of the pmem buffer to ImageWriteback (which may hang on to the + * bl even after flush()). */ + bufferlist entry_bl; + buffer::list entry_bl_copy; + copy_cache_bl(&entry_bl_copy); + entry_bl_copy.begin(0).copy(write_bytes(), entry_bl); + image_writeback.aio_write({{ram_entry.image_offset_bytes, + ram_entry.write_bytes}}, + std::move(entry_bl), 0, ctx); +} + +void WriteLogEntry::init_cache_bp() { + ceph_assert(!this->cache_bp.have_raw()); + cache_bp = buffer::ptr(buffer::create_static(this->write_bytes(), + (char*)this->cache_buffer)); +} + +void WriteLogEntry::init_bl(buffer::ptr &bp, buffer::list &bl) { + if(!is_writesame) { + bl.append(bp); + return; + } + for (uint64_t i = 0; i < ram_entry.write_bytes / ram_entry.ws_datalen; i++) { + bl.append(bp); + } + int trailing_partial = ram_entry.write_bytes % ram_entry.ws_datalen; + if (trailing_partial) { + bl.append(bp, 0, trailing_partial); + } +} + +void WriteLogEntry::init_cache_buffer( + std::vector::iterator allocation) { + this->ram_entry.write_data = allocation->buffer_oid; + ceph_assert(!TOID_IS_NULL(this->ram_entry.write_data)); + cache_buffer = D_RW(this->ram_entry.write_data); +} + +buffer::list& WriteLogEntry::get_cache_bl() { + if (0 == bl_refs) { + std::lock_guard locker(m_entry_bl_lock); + if (0 == bl_refs) { + //init pmem bufferlist + cache_bl.clear(); + init_cache_bp(); + ceph_assert(cache_bp.have_raw()); + int before_bl = cache_bp.raw_nref(); + this->init_bl(cache_bp, cache_bl); + int after_bl = cache_bp.raw_nref(); + bl_refs = after_bl - before_bl; + } + ceph_assert(0 != bl_refs); + } + return cache_bl; +} + +void WriteLogEntry::copy_cache_bl(bufferlist *out_bl) { + this->get_cache_bl(); + // cache_bp is now initialized + ceph_assert(cache_bp.length() == cache_bp.raw_length()); + buffer::ptr cloned_bp = cache_bp.begin_deep().get_ptr(cache_bp.length()); + out_bl->clear(); + this->init_bl(cloned_bp, *out_bl); +} + +unsigned int WriteLogEntry::reader_count() const { + if (cache_bp.have_raw()) { + return (cache_bp.raw_nref() - bl_refs - 1); + } else { + return 0; + } +} + +void WriteSameLogEntry::writeback( + librbd::cache::ImageWritebackInterface &image_writeback, Context *ctx) { + bufferlist entry_bl; + buffer::list entry_bl_copy; + copy_cache_bl(&entry_bl_copy); + entry_bl_copy.begin(0).copy(write_bytes(), entry_bl); + image_writeback.aio_writesame(ram_entry.image_offset_bytes, + ram_entry.write_bytes, + std::move(entry_bl), 0, ctx); +} + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/rwl/LogEntry.h b/src/librbd/cache/pwl/rwl/LogEntry.h new file mode 100644 index 000000000..a4675c5fb --- /dev/null +++ b/src/librbd/cache/pwl/rwl/LogEntry.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H +#define 
CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H + +#include "librbd/cache/pwl/LogEntry.h" + +namespace librbd { +namespace cache { +class ImageWritebackInterface; +namespace pwl { +namespace rwl { + +class WriteLogEntry : public pwl::WriteLogEntry { +public: + WriteLogEntry(std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) + : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {} + WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes) + : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {} + WriteLogEntry(std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes, + data_length) {} + WriteLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {} + ~WriteLogEntry() {} + WriteLogEntry(const WriteLogEntry&) = delete; + WriteLogEntry &operator=(const WriteLogEntry&) = delete; + + void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) override; + void init_cache_bp() override; + void init_bl(buffer::ptr &bp, buffer::list &bl) override; + void init_cache_buffer( + std::vector::iterator allocation) override; + buffer::list &get_cache_bl() override; + void copy_cache_bl(bufferlist *out_bl) override; + unsigned int reader_count() const override; +}; + +class WriteSameLogEntry : public WriteLogEntry { +public: + WriteSameLogEntry(std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes, + data_length) {} + WriteSameLogEntry(uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {} + ~WriteSameLogEntry() {} + WriteSameLogEntry(const WriteSameLogEntry&) = delete; + WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete; + + void writeback(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx) override; +}; + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_ENTRY_H diff --git a/src/librbd/cache/pwl/rwl/LogOperation.cc b/src/librbd/cache/pwl/rwl/LogOperation.cc new file mode 100644 index 000000000..53fb917b2 --- /dev/null +++ b/src/librbd/cache/pwl/rwl/LogOperation.cc @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LogOperation.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::rwl::LogOperation: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +void WriteLogOperation::copy_bl_to_cache_buffer( + std::vector::iterator allocation) { + /* operation is a shared_ptr, so write_op is only good as long as operation is + * in scope */ + bufferlist::iterator i(&bl); + m_perfcounter->inc(l_librbd_pwl_log_op_bytes, log_entry->write_bytes()); + ldout(m_cct, 20) << bl << dendl; + log_entry->init_cache_buffer(allocation); + i.copy((unsigned)log_entry->write_bytes(), (char*)log_entry->cache_buffer); +} + +void DiscardLogOperation::init_op( + uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist, + Context *write_append) { + 
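+ /* A discard persists no data buffer, so init only stamps the log entry
+ * with its sequencing state and records the persist/append contexts. */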
log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num); + this->on_write_append = write_append; + this->on_write_persist = write_persist; +} + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/rwl/LogOperation.h b/src/librbd/cache/pwl/rwl/LogOperation.h new file mode 100644 index 000000000..874ac77fb --- /dev/null +++ b/src/librbd/cache/pwl/rwl/LogOperation.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H +#define CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H + +#include "librbd/cache/pwl/LogOperation.h" + + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +class WriteLogOperation : public pwl::WriteLogOperation { +public: + WriteLogOperation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, CephContext *cct, + std::shared_ptr write_log_entry) + : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct, + write_log_entry) {} + + WriteLogOperation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t data_len, CephContext *cct, + std::shared_ptr writesame_log_entry) + : pwl::WriteLogOperation(set, image_offset_bytes, write_bytes, cct, + writesame_log_entry) {} + + void copy_bl_to_cache_buffer( + std::vector::iterator allocation) override; +}; + +class DiscardLogOperation : public pwl::DiscardLogOperation { +public: + DiscardLogOperation( + std::shared_ptr sync_point, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t discard_granularity_bytes, + utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) + : pwl::DiscardLogOperation(sync_point, image_offset_bytes, write_bytes, + discard_granularity_bytes, dispatch_time, + perfcounter, cct) {} + void init_op( + uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist, + Context *write_append) override; +}; + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_RWL_LOG_OPERATION_H diff --git a/src/librbd/cache/pwl/rwl/ReadRequest.cc b/src/librbd/cache/pwl/rwl/ReadRequest.cc new file mode 100644 index 000000000..f91f8e5a7 --- /dev/null +++ b/src/librbd/cache/pwl/rwl/ReadRequest.cc @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ReadRequest.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::rwl::ReadRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +void C_ReadRequest::finish(int r) { + ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl; + int hits = 0; + int misses = 0; + int hit_bytes = 0; + int miss_bytes = 0; + if (r >= 0) { + /* + * At this point the miss read has completed. We'll iterate through + * read_extents and produce *m_out_bl by assembling pieces of miss_bl + * and the individual hit extent bufs in the read extents that represent + * hits. + */ + uint64_t miss_bl_offset = 0; + for (auto extent : read_extents) { + if (extent->m_bl.length()) { + /* This was a hit */ + ceph_assert(extent->second == extent->m_bl.length()); + ++hits; + hit_bytes += extent->second; + m_out_bl->claim_append(extent->m_bl); + } else { + /* This was a miss. 
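+ * Its bytes come from miss_bl, the single bufferlist returned by the
+ * backing read, consumed in extent order via miss_bl_offset.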
*/ + ++misses; + miss_bytes += extent->second; + bufferlist miss_extent_bl; + miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent->second); + /* Add this read miss bufferlist to the output bufferlist */ + m_out_bl->claim_append(miss_extent_bl); + /* Consume these bytes in the read miss bufferlist */ + miss_bl_offset += extent->second; + } + } + } + ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << " bl=" << *m_out_bl << dendl; + utime_t now = ceph_clock_now(); + ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes); + m_on_finish->complete(r); + m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes); + m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes); + m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time); + if (!misses) { + m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1); + m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time); + } else { + if (hits) { + m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1); + } + } +} + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/rwl/ReadRequest.h b/src/librbd/cache/pwl/rwl/ReadRequest.h new file mode 100644 index 000000000..25168e83b --- /dev/null +++ b/src/librbd/cache/pwl/rwl/ReadRequest.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H +#define CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H + +#include "librbd/cache/pwl/ReadRequest.h" + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +typedef std::vector ImageExtentBufs; + +class C_ReadRequest : public pwl::C_ReadRequest { +protected: + using pwl::C_ReadRequest::m_cct; + using pwl::C_ReadRequest::m_on_finish; + using pwl::C_ReadRequest::m_out_bl; + using pwl::C_ReadRequest::m_arrived_time; + using pwl::C_ReadRequest::m_perfcounter; +public: + C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter, bufferlist *out_bl, Context *on_finish) + : pwl::C_ReadRequest(cct, arrived, perfcounter, out_bl, on_finish) {} + void finish(int r) override; +}; + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_RWL_READ_REQUEST_H diff --git a/src/librbd/cache/pwl/rwl/Request.cc b/src/librbd/cache/pwl/rwl/Request.cc new file mode 100644 index 000000000..a6b81d55b --- /dev/null +++ b/src/librbd/cache/pwl/rwl/Request.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Request.h" +#include "librbd/cache/pwl/AbstractWriteLog.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::rwl::Request: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace rwl { + +template +void C_WriteRequest::setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) { + + ceph_assert(!this->m_resources.allocated); + + auto image_extents_size = this->image_extents.size(); + this->m_resources.buffers.reserve(image_extents_size); + + *bytes_cached = 0; + *bytes_allocated = 0; + *number_lanes = image_extents_size; + *number_log_entries = image_extents_size; + *number_unpublished_reserves = image_extents_size; + + for (auto &extent : 
this->image_extents) { + this->m_resources.buffers.emplace_back(); + struct WriteBufferAllocation &buffer = this->m_resources.buffers.back(); + buffer.allocation_size = MIN_WRITE_ALLOC_SIZE; + buffer.allocated = false; + *bytes_cached += extent.second; + if (extent.second > buffer.allocation_size) { + buffer.allocation_size = extent.second; + } + *bytes_allocated += buffer.allocation_size; + } + *bytes_dirtied = *bytes_cached; +} + +template +std::ostream &operator<<(std::ostream &os, + const C_CompAndWriteRequest &req) { + os << (C_WriteRequest&)req + << " cmp_bl=" << req.cmp_bl + << ", read_bl=" << req.read_bl + << ", compare_succeeded=" << req.compare_succeeded + << ", mismatch_offset=" << req.mismatch_offset; + return os; +} + +template +void C_WriteSameRequest::setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) { + ceph_assert(this->image_extents.size() == 1); + *number_log_entries = 1; + *bytes_dirtied += this->image_extents[0].second; + auto pattern_length = this->bl.length(); + this->m_resources.buffers.emplace_back(); + struct WriteBufferAllocation &buffer = this->m_resources.buffers.back(); + buffer.allocation_size = MIN_WRITE_ALLOC_SIZE; + buffer.allocated = false; + *bytes_cached += pattern_length; + if (pattern_length > buffer.allocation_size) { + buffer.allocation_size = pattern_length; + } + *bytes_allocated += buffer.allocation_size; +} + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::rwl::C_WriteRequest >; +template class librbd::cache::pwl::rwl::C_WriteSameRequest >; +template class librbd::cache::pwl::rwl::C_CompAndWriteRequest >; diff --git a/src/librbd/cache/pwl/rwl/Request.h b/src/librbd/cache/pwl/rwl/Request.h new file mode 100644 index 000000000..0a5c610d6 --- /dev/null +++ b/src/librbd/cache/pwl/rwl/Request.h @@ -0,0 +1,90 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_RWL_REQUEST_H +#define CEPH_LIBRBD_CACHE_RWL_REQUEST_H + +#include "librbd/cache/pwl/Request.h" + +namespace librbd { +class BlockGuardCell; + +namespace cache { +namespace pwl { +namespace rwl { + +template +class C_WriteRequest : public pwl::C_WriteRequest { +public: + C_WriteRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : pwl::C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(cmp_bl), + std::move(bl), mismatch_offset, fadvise_flags, + lock, perfcounter, user_req) {} + + C_WriteRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : pwl::C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req) {} +protected: + //Plain writes will allocate one buffer per request extent + void setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, + uint64_t *bytes_allocated, uint64_t *number_lanes, + uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) override; +}; + +template +class C_CompAndWriteRequest : public C_WriteRequest { +public: + C_CompAndWriteRequest( + T &pwl, const 
utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(cmp_bl), + std::move(bl), mismatch_offset, fadvise_flags, + lock, perfcounter, user_req) {} + + const char *get_name() const override { + return "C_CompAndWriteRequest"; + } + template + friend std::ostream &operator<<(std::ostream &os, + const C_CompAndWriteRequest &req); +}; + +template +class C_WriteSameRequest : public pwl::C_WriteSameRequest { +public: + C_WriteSameRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : pwl::C_WriteSameRequest( + pwl, arrived, std::move(image_extents), std::move(bl), fadvise_flags, + lock, perfcounter, user_req) {} + + void setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, + uint64_t *bytes_allocated, uint64_t *number_lanes, + uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) override; + +}; + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_RWL_REQUEST_H diff --git a/src/librbd/cache/pwl/rwl/WriteLog.cc b/src/librbd/cache/pwl/rwl/WriteLog.cc new file mode 100644 index 000000000..e922ba543 --- /dev/null +++ b/src/librbd/cache/pwl/rwl/WriteLog.cc @@ -0,0 +1,1011 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "WriteLog.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/ceph_assert.h" +#include "common/deleter.h" +#include "common/dout.h" +#include "common/environment.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "common/Timer.h" +#include "common/perf_counters.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/cache/pwl/ImageCacheState.h" +#include "librbd/cache/pwl/LogEntry.h" +#include "librbd/plugin/Api.h" +#include +#include + +#undef dout_subsys +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::rwl::WriteLog: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +using namespace std; +using namespace librbd::cache::pwl; +namespace rwl { + +const unsigned long int OPS_APPENDED_TOGETHER = MAX_ALLOC_PER_TRANSACTION; + +template +Builder>* WriteLog::create_builder() { + m_builderobj = new Builder(); + return m_builderobj; +} + +template +WriteLog::WriteLog( + I &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state, + ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api) +: AbstractWriteLog(image_ctx, cache_state, create_builder(), image_writeback, + plugin_api), + m_pwl_pool_layout_name(POBJ_LAYOUT_NAME(rbd_pwl)) +{ +} + +template +WriteLog::~WriteLog() { + m_log_pool = nullptr; + delete m_builderobj; +} + +template +void WriteLog::collect_read_extents( + uint64_t read_buffer_offset, LogMapEntry map_entry, + std::vector> &log_entries_to_read, + std::vector &bls_to_read, uint64_t entry_hit_length, + Extent hit_extent, pwl::C_ReadRequest *read_ctx) { + /* Make a bl for this hit extent. 
This will add references to the + * write_entry->pmem_bp */ + buffer::list hit_bl; + + /* Create buffer object referring to pmem pool for this read hit */ + auto write_entry = map_entry.log_entry; + + buffer::list entry_bl_copy; + write_entry->copy_cache_bl(&entry_bl_copy); + entry_bl_copy.begin(read_buffer_offset).copy(entry_hit_length, hit_bl); + ceph_assert(hit_bl.length() == entry_hit_length); + + /* Add hit extent to read extents */ + auto hit_extent_buf = std::make_shared(hit_extent, hit_bl); + read_ctx->read_extents.push_back(hit_extent_buf); +} + +template +void WriteLog::complete_read( + std::vector> &log_entries_to_read, + std::vector &bls_to_read, Context *ctx) { + ctx->complete(0); +} + +/* + * Allocate the (already reserved) write log entries for a set of operations. + * + * Locking: + * Acquires lock + */ +template +void WriteLog::alloc_op_log_entries(GenericLogOperations &ops) +{ + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + struct WriteLogCacheEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries); + + ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock)); + + /* Allocate the (already reserved) log entries */ + std::unique_lock locker(m_lock); + + for (auto &operation : ops) { + uint32_t entry_index = this->m_first_free_entry; + this->m_first_free_entry = (this->m_first_free_entry + 1) % this->m_total_log_entries; + auto &log_entry = operation->get_log_entry(); + log_entry->log_entry_index = entry_index; + log_entry->ram_entry.entry_index = entry_index; + log_entry->cache_entry = &pmem_log_entries[entry_index]; + log_entry->ram_entry.set_entry_valid(true); + m_log_entries.push_back(log_entry); + ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl; + } + if (m_cache_state->empty && !m_log_entries.empty()) { + m_cache_state->empty = false; + this->update_image_cache_state(); + this->write_image_cache_state(locker); + } +} + +/* + * Write and persist the (already allocated) write log entries and + * data buffer allocations for a set of ops. The data buffer for each + * of these must already have been persisted to its reserved area. 
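+ *
+ * In outline (a sketch of the commit step performed below, using the
+ * names from this function, not verbatim code):
+ *
+ *   TX_BEGIN(m_log_pool) {
+ *     D_RW(pool_root)->first_free_entry = m_first_free_entry;
+ *     for each write op: pmemobj_tx_publish(&buffer_alloc_action, 1);
+ *   } TX_ONABORT {
+ *     ceph_assert(false);  // the cache cannot continue past a failed commit
+ *   } TX_END
+ *
+ * Entry structs are first copied into the ring and flushed in contiguous
+ * batches, with a single pmemobj_drain() covering all batches.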
+ */ +template +int WriteLog::append_op_log_entries(GenericLogOperations &ops) +{ + CephContext *cct = m_image_ctx.cct; + GenericLogOperationsVector entries_to_flush; + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + int ret = 0; + + ceph_assert(ceph_mutex_is_locked_by_me(this->m_log_append_lock)); + + if (ops.empty()) { + return 0; + } + entries_to_flush.reserve(OPS_APPENDED_TOGETHER); + + /* Write log entries to ring and persist */ + utime_t now = ceph_clock_now(); + for (auto &operation : ops) { + if (!entries_to_flush.empty()) { + /* Flush these and reset the list if the current entry wraps to the + * tail of the ring */ + if (entries_to_flush.back()->get_log_entry()->log_entry_index > + operation->get_log_entry()->log_entry_index) { + ldout(m_image_ctx.cct, 20) << "entries to flush wrap around the end of the ring at " + << "operation=[" << *operation << "]" << dendl; + flush_op_log_entries(entries_to_flush); + entries_to_flush.clear(); + now = ceph_clock_now(); + } + } + ldout(m_image_ctx.cct, 20) << "Copying entry for operation at index=" + << operation->get_log_entry()->log_entry_index + << " from " << &operation->get_log_entry()->ram_entry + << " to " << operation->get_log_entry()->cache_entry + << " operation=[" << *operation << "]" << dendl; + operation->log_append_start_time = now; + *operation->get_log_entry()->cache_entry = operation->get_log_entry()->ram_entry; + ldout(m_image_ctx.cct, 20) << "APPENDING: index=" + << operation->get_log_entry()->log_entry_index + << " pmem_entry=[" << *operation->get_log_entry()->cache_entry + << "]" << dendl; + entries_to_flush.push_back(operation); + } + flush_op_log_entries(entries_to_flush); + + /* Drain once for all */ + pmemobj_drain(m_log_pool); + + /* + * Atomically advance the log head pointer and publish the + * allocations for all the data buffers they refer to. + */ + utime_t tx_start = ceph_clock_now(); + TX_BEGIN(m_log_pool) { + D_RW(pool_root)->first_free_entry = this->m_first_free_entry; + for (auto &operation : ops) { + if (operation->reserved_allocated()) { + auto write_op = (std::shared_ptr&) operation; + pmemobj_tx_publish(&write_op->buffer_alloc->buffer_alloc_action, 1); + } else { + ldout(m_image_ctx.cct, 20) << "skipping non-write op: " << *operation << dendl; + } + } + } TX_ONCOMMIT { + } TX_ONABORT { + lderr(cct) << "failed to commit " << ops.size() + << " log entries (" << this->m_log_pool_name << ")" << dendl; + ceph_assert(false); + ret = -EIO; + } TX_FINALLY { + } TX_END; + + utime_t tx_end = ceph_clock_now(); + m_perfcounter->tinc(l_librbd_pwl_append_tx_t, tx_end - tx_start); + m_perfcounter->hinc( + l_librbd_pwl_append_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), ops.size()); + for (auto &operation : ops) { + operation->log_append_comp_time = tx_end; + } + + return ret; +} + +/* + * Flush the persistent write log entries set of ops. The entries must + * be contiguous in persistent memory. 
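+ * A single pmemobj_flush() covers the whole contiguous range; draining is
+ * left to the caller so that several flushed batches can share one
+ * pmemobj_drain().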
+ */ +template +void WriteLog::flush_op_log_entries(GenericLogOperationsVector &ops) +{ + if (ops.empty()) { + return; + } + + if (ops.size() > 1) { + ceph_assert(ops.front()->get_log_entry()->cache_entry < ops.back()->get_log_entry()->cache_entry); + } + + ldout(m_image_ctx.cct, 20) << "entry count=" << ops.size() + << " start address=" + << ops.front()->get_log_entry()->cache_entry + << " bytes=" + << ops.size() * sizeof(*(ops.front()->get_log_entry()->cache_entry)) + << dendl; + pmemobj_flush(m_log_pool, + ops.front()->get_log_entry()->cache_entry, + ops.size() * sizeof(*(ops.front()->get_log_entry()->cache_entry))); +} + +template +void WriteLog::remove_pool_file() { + if (m_log_pool) { + ldout(m_image_ctx.cct, 6) << "closing pmem pool" << dendl; + pmemobj_close(m_log_pool); + } + if (m_cache_state->clean) { + ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " << this->m_log_pool_name << dendl; + if (remove(this->m_log_pool_name.c_str()) != 0) { + lderr(m_image_ctx.cct) << "failed to remove empty pool \"" << this->m_log_pool_name << "\": " + << pmemobj_errormsg() << dendl; + } else { + m_cache_state->present = false; + } + } else { + ldout(m_image_ctx.cct, 5) << "Not removing pool file: " << this->m_log_pool_name << dendl; + } +} + +template +bool WriteLog::initialize_pool(Context *on_finish, pwl::DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + int r = -EINVAL; + TOID(struct WriteLogPoolRoot) pool_root; + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + if (access(this->m_log_pool_name.c_str(), F_OK) != 0) { + if ((m_log_pool = + pmemobj_create(this->m_log_pool_name.c_str(), + this->m_pwl_pool_layout_name, + this->m_log_pool_size, + (S_IWUSR | S_IRUSR))) == NULL) { + lderr(cct) << "failed to create pool: " << this->m_log_pool_name + << ". 
error: " << pmemobj_errormsg() << dendl; + m_cache_state->present = false; + m_cache_state->clean = true; + m_cache_state->empty = true; + /* TODO: filter/replace errnos that are meaningless to the caller */ + on_finish->complete(-errno); + return false; + } + m_cache_state->present = true; + m_cache_state->clean = true; + m_cache_state->empty = true; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + + /* new pool, calculate and store metadata */ + size_t effective_pool_size = (size_t)(this->m_log_pool_size * USABLE_SIZE); + size_t small_write_size = MIN_WRITE_ALLOC_SIZE + BLOCK_ALLOC_OVERHEAD_BYTES + sizeof(struct WriteLogCacheEntry); + uint64_t num_small_writes = (uint64_t)(effective_pool_size / small_write_size); + if (num_small_writes > MAX_LOG_ENTRIES) { + num_small_writes = MAX_LOG_ENTRIES; + } + if (num_small_writes <= 2) { + lderr(cct) << "num_small_writes needs to > 2" << dendl; + goto err_close_pool; + } + this->m_bytes_allocated_cap = effective_pool_size; + /* Log ring empty */ + m_first_free_entry = 0; + m_first_valid_entry = 0; + TX_BEGIN(m_log_pool) { + TX_ADD(pool_root); + D_RW(pool_root)->header.layout_version = RWL_LAYOUT_VERSION; + D_RW(pool_root)->log_entries = + TX_ZALLOC(struct WriteLogCacheEntry, + sizeof(struct WriteLogCacheEntry) * num_small_writes); + D_RW(pool_root)->pool_size = this->m_log_pool_size; + D_RW(pool_root)->flushed_sync_gen = this->m_flushed_sync_gen; + D_RW(pool_root)->block_size = MIN_WRITE_ALLOC_SIZE; + D_RW(pool_root)->num_log_entries = num_small_writes; + D_RW(pool_root)->first_free_entry = m_first_free_entry; + D_RW(pool_root)->first_valid_entry = m_first_valid_entry; + } TX_ONCOMMIT { + this->m_total_log_entries = D_RO(pool_root)->num_log_entries; + this->m_free_log_entries = D_RO(pool_root)->num_log_entries - 1; // leave one free + } TX_ONABORT { + this->m_total_log_entries = 0; + this->m_free_log_entries = 0; + lderr(cct) << "failed to initialize pool: " << this->m_log_pool_name + << ". pmemobj TX errno: " << pmemobj_tx_errno() << dendl; + r = -pmemobj_tx_errno(); + goto err_close_pool; + } TX_FINALLY { + } TX_END; + } else { + ceph_assert(m_cache_state->present); + /* Open existing pool */ + if ((m_log_pool = + pmemobj_open(this->m_log_pool_name.c_str(), + this->m_pwl_pool_layout_name)) == NULL) { + lderr(cct) << "failed to open pool (" << this->m_log_pool_name << "): " + << pmemobj_errormsg() << dendl; + on_finish->complete(-errno); + return false; + } + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + if (D_RO(pool_root)->header.layout_version != RWL_LAYOUT_VERSION) { + // TODO: will handle upgrading version in the future + lderr(cct) << "pool layout version is " + << D_RO(pool_root)->header.layout_version + << " expected " << RWL_LAYOUT_VERSION << dendl; + goto err_close_pool; + } + if (D_RO(pool_root)->block_size != MIN_WRITE_ALLOC_SIZE) { + lderr(cct) << "pool block size is " << D_RO(pool_root)->block_size + << " expected " << MIN_WRITE_ALLOC_SIZE << dendl; + goto err_close_pool; + } + this->m_log_pool_size = D_RO(pool_root)->pool_size; + this->m_flushed_sync_gen = D_RO(pool_root)->flushed_sync_gen; + this->m_total_log_entries = D_RO(pool_root)->num_log_entries; + m_first_free_entry = D_RO(pool_root)->first_free_entry; + m_first_valid_entry = D_RO(pool_root)->first_valid_entry; + if (m_first_free_entry < m_first_valid_entry) { + /* Valid entries wrap around the end of the ring, so first_free is lower + * than first_valid. If first_valid was == first_free+1, the entry at + * first_free would be empty. 
The last entry is never used, so in + * that case there would be zero free log entries. */ + this->m_free_log_entries = this->m_total_log_entries - (m_first_valid_entry - m_first_free_entry) -1; + } else { + /* first_valid is <= first_free. If they are == we have zero valid log + * entries, and n-1 free log entries */ + this->m_free_log_entries = this->m_total_log_entries - (m_first_free_entry - m_first_valid_entry) -1; + } + size_t effective_pool_size = (size_t)(this->m_log_pool_size * USABLE_SIZE); + this->m_bytes_allocated_cap = effective_pool_size; + load_existing_entries(later); + m_cache_state->clean = this->m_dirty_log_entries.empty(); + m_cache_state->empty = m_log_entries.empty(); + } + return true; + +err_close_pool: + pmemobj_close(m_log_pool); + on_finish->complete(r); + return false; +} + +/* + * Loads the log entries from an existing log. + * + * Creates the in-memory structures to represent the state of the + * re-opened log. + * + * Finds the last appended sync point, and any sync points referred to + * in log entries, but missing from the log. These missing sync points + * are created and scheduled for append. Some rudimentary consistency + * checking is done. + * + * Rebuilds the m_blocks_to_log_entries map, to make log entries + * readable. + * + * Places all writes on the dirty entries list, which causes them all + * to be flushed. + * + */ + +template +void WriteLog::load_existing_entries(DeferredContexts &later) { + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + struct WriteLogCacheEntry *pmem_log_entries = D_RW(D_RW(pool_root)->log_entries); + uint64_t entry_index = m_first_valid_entry; + /* The map below allows us to find sync point log entries by sync + * gen number, which is necessary so write entries can be linked to + * their sync points. */ + std::map> sync_point_entries; + /* The map below tracks sync points referred to in writes but not + * appearing in the sync_point_entries map. We'll use this to + * determine which sync points are missing and need to be + * created. */ + std::map missing_sync_points; + + /* + * Read the existing log entries. Construct an in-memory log entry + * object of the appropriate type for each. Add these to the global + * log entries list. + * + * Write entries will not link to their sync points yet. We'll do + * that in the next pass. Here we'll accumulate a map of sync point + * gen numbers that are referred to in writes but do not appearing in + * the log. 
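+   *
+   * update_sync_points() then makes the second pass, creating (and
+   * scheduling appends for) the missing sync points and linking the write
+   * entries to their sync points.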
+ */ + while (entry_index != m_first_free_entry) { + WriteLogCacheEntry *pmem_entry = &pmem_log_entries[entry_index]; + std::shared_ptr log_entry = nullptr; + ceph_assert(pmem_entry->entry_index == entry_index); + + this->update_entries(&log_entry, pmem_entry, missing_sync_points, + sync_point_entries, entry_index); + + log_entry->ram_entry = *pmem_entry; + log_entry->cache_entry = pmem_entry; + log_entry->log_entry_index = entry_index; + log_entry->completed = true; + + m_log_entries.push_back(log_entry); + + entry_index = (entry_index + 1) % this->m_total_log_entries; + } + + this->update_sync_points(missing_sync_points, sync_point_entries, later); +} + +template +void WriteLog::inc_allocated_cached_bytes( + std::shared_ptr log_entry) { + if (log_entry->is_write_entry()) { + this->m_bytes_allocated += std::max(log_entry->write_bytes(), MIN_WRITE_ALLOC_SIZE); + this->m_bytes_cached += log_entry->write_bytes(); + } +} + +template +void WriteLog::write_data_to_buffer( + std::shared_ptr ws_entry, + WriteLogCacheEntry *pmem_entry) { + ws_entry->cache_buffer = D_RW(pmem_entry->write_data); +} + +/** + * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries + * that are eligible to be retired. Returns true if anything was + * retired. + */ +template +bool WriteLog::retire_entries(const unsigned long int frees_per_tx) { + CephContext *cct = m_image_ctx.cct; + GenericLogEntriesVector retiring_entries; + uint32_t initial_first_valid_entry; + uint32_t first_valid_entry; + + std::lock_guard retire_locker(this->m_log_retire_lock); + ldout(cct, 20) << "Look for entries to retire" << dendl; + { + /* Entry readers can't be added while we hold m_entry_reader_lock */ + RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock); + std::lock_guard locker(m_lock); + initial_first_valid_entry = this->m_first_valid_entry; + first_valid_entry = this->m_first_valid_entry; + while (!m_log_entries.empty() && retiring_entries.size() < frees_per_tx && + this->can_retire_entry(m_log_entries.front())) { + auto entry = m_log_entries.front(); + if (entry->log_entry_index != first_valid_entry) { + lderr(cct) << "retiring entry index (" << entry->log_entry_index + << ") and first valid log entry index (" << first_valid_entry + << ") must be ==." 
<< dendl; + } + ceph_assert(entry->log_entry_index == first_valid_entry); + first_valid_entry = (first_valid_entry + 1) % this->m_total_log_entries; + m_log_entries.pop_front(); + retiring_entries.push_back(entry); + /* Remove entry from map so there will be no more readers */ + if ((entry->write_bytes() > 0) || (entry->bytes_dirty() > 0)) { + auto gen_write_entry = static_pointer_cast(entry); + if (gen_write_entry) { + this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry); + } + } + } + } + + if (retiring_entries.size()) { + ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" << dendl; + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + + utime_t tx_start; + utime_t tx_end; + /* Advance first valid entry and release buffers */ + { + uint64_t flushed_sync_gen; + std::lock_guard append_locker(this->m_log_append_lock); + { + std::lock_guard locker(m_lock); + flushed_sync_gen = this->m_flushed_sync_gen; + } + + tx_start = ceph_clock_now(); + TX_BEGIN(m_log_pool) { + if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) { + ldout(m_image_ctx.cct, 20) << "flushed_sync_gen in log updated from " + << D_RO(pool_root)->flushed_sync_gen << " to " + << flushed_sync_gen << dendl; + D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen; + } + D_RW(pool_root)->first_valid_entry = first_valid_entry; + for (auto &entry: retiring_entries) { + if (entry->write_bytes()) { + ldout(cct, 20) << "Freeing " << entry->ram_entry.write_data.oid.pool_uuid_lo + << "." << entry->ram_entry.write_data.oid.off << dendl; + TX_FREE(entry->ram_entry.write_data); + } else { + ldout(cct, 20) << "Retiring non-write: " << *entry << dendl; + } + } + } TX_ONCOMMIT { + } TX_ONABORT { + lderr(cct) << "failed to commit free of" << retiring_entries.size() + << " log entries (" << this->m_log_pool_name << ")" << dendl; + ceph_assert(false); + } TX_FINALLY { + } TX_END; + tx_end = ceph_clock_now(); + } + m_perfcounter->tinc(l_librbd_pwl_retire_tx_t, tx_end - tx_start); + m_perfcounter->hinc(l_librbd_pwl_retire_tx_t_hist, utime_t(tx_end - tx_start).to_nsec(), + retiring_entries.size()); + + bool need_update_state = false; + /* Update runtime copy of first_valid, and free entries counts */ + { + std::lock_guard locker(m_lock); + + ceph_assert(this->m_first_valid_entry == initial_first_valid_entry); + this->m_first_valid_entry = first_valid_entry; + this->m_free_log_entries += retiring_entries.size(); + if (!m_cache_state->empty && m_log_entries.empty()) { + m_cache_state->empty = true; + this->update_image_cache_state(); + need_update_state = true; + } + for (auto &entry: retiring_entries) { + if (entry->write_bytes()) { + ceph_assert(this->m_bytes_cached >= entry->write_bytes()); + this->m_bytes_cached -= entry->write_bytes(); + uint64_t entry_allocation_size = entry->write_bytes(); + if (entry_allocation_size < MIN_WRITE_ALLOC_SIZE) { + entry_allocation_size = MIN_WRITE_ALLOC_SIZE; + } + ceph_assert(this->m_bytes_allocated >= entry_allocation_size); + this->m_bytes_allocated -= entry_allocation_size; + } + } + this->m_alloc_failed_since_retire = false; + this->wake_up(); + } + if (need_update_state) { + std::unique_lock locker(m_lock); + this->write_image_cache_state(locker); + } + } else { + ldout(cct, 20) << "Nothing to retire" << dendl; + return false; + } + return true; +} + +template +void WriteLog::construct_flush_entries(pwl::GenericLogEntries entries_to_flush, + DeferredContexts &post_unlock, + bool has_write_entry) { + bool invalidating = 
this->m_invalidating; // snapshot so we behave consistently + + for (auto &log_entry : entries_to_flush) { + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, log_entry, invalidating] + (GuardedRequestFunctionContext &guard_ctx) { + log_entry->m_cell = guard_ctx.cell; + Context *ctx = this->construct_flush_entry(log_entry, invalidating); + + if (!invalidating) { + ctx = new LambdaContext( + [this, log_entry, ctx](int r) { + m_image_ctx.op_work_queue->queue(new LambdaContext( + [this, log_entry, ctx](int r) { + ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry + << " " << *log_entry << dendl; + log_entry->writeback(this->m_image_writeback, ctx); + }), 0); + }); + } + + ctx->complete(0); + }); + this->detain_flush_guard_request(log_entry, guarded_ctx); + } +} + +const unsigned long int ops_flushed_together = 4; +/* + * Performs the pmem buffer flush on all scheduled ops, then schedules + * the log event append operation for all of them. + */ +template +void WriteLog::flush_then_append_scheduled_ops(void) +{ + GenericLogOperations ops; + bool ops_remain = false; + ldout(m_image_ctx.cct, 20) << dendl; + do { + { + ops.clear(); + std::lock_guard locker(m_lock); + if (m_ops_to_flush.size()) { + auto last_in_batch = m_ops_to_flush.begin(); + unsigned int ops_to_flush = m_ops_to_flush.size(); + if (ops_to_flush > ops_flushed_together) { + ops_to_flush = ops_flushed_together; + } + ldout(m_image_ctx.cct, 20) << "should flush " << ops_to_flush << dendl; + std::advance(last_in_batch, ops_to_flush); + ops.splice(ops.end(), m_ops_to_flush, m_ops_to_flush.begin(), last_in_batch); + ops_remain = !m_ops_to_flush.empty(); + ldout(m_image_ctx.cct, 20) << "flushing " << ops.size() << ", remain " + << m_ops_to_flush.size() << dendl; + } else { + ops_remain = false; + } + } + if (ops_remain) { + enlist_op_flusher(); + } + + /* Ops subsequently scheduled for flush may finish before these, + * which is fine. We're unconcerned with completion order until we + * get to the log message append step. */ + if (ops.size()) { + flush_pmem_buffer(ops); + schedule_append_ops(ops, nullptr); + } + } while (ops_remain); + append_scheduled_ops(); +} + +/* + * Performs the log event append operation for all of the scheduled + * events. + */ +template +void WriteLog::append_scheduled_ops(void) { + GenericLogOperations ops; + int append_result = 0; + bool ops_remain = false; + bool appending = false; /* true if we set m_appending */ + ldout(m_image_ctx.cct, 20) << dendl; + do { + ops.clear(); + this->append_scheduled(ops, ops_remain, appending, true); + + if (ops.size()) { + std::lock_guard locker(this->m_log_append_lock); + alloc_op_log_entries(ops); + append_result = append_op_log_entries(ops); + } + + int num_ops = ops.size(); + if (num_ops) { + /* New entries may be flushable. Completion will wake up flusher. 
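+       * m_log_append_lock was already dropped above, so completing the ops
+       * here does not serialize against the next batch's append.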
*/ + this->complete_op_log_entries(std::move(ops), append_result); + } + } while (ops_remain); +} + +template +void WriteLog::enlist_op_flusher() +{ + this->m_async_flush_ops++; + this->m_async_op_tracker.start_op(); + Context *flush_ctx = new LambdaContext([this](int r) { + flush_then_append_scheduled_ops(); + this->m_async_flush_ops--; + this->m_async_op_tracker.finish_op(); + }); + this->m_work_queue.queue(flush_ctx); +} + +template +void WriteLog::setup_schedule_append( + pwl::GenericLogOperationsVector &ops, bool do_early_flush, + C_BlockIORequestT *req) { + if (do_early_flush) { + /* This caller is waiting for persist, so we'll use their thread to + * expedite it */ + flush_pmem_buffer(ops); + this->schedule_append(ops); + } else { + /* This is probably not still the caller's thread, so do the payload + * flushing/replicating later. */ + schedule_flush_and_append(ops); + } +} + +/* + * Takes custody of ops. They'll all get their log entries appended, + * and have their on_write_persist contexts completed once they and + * all prior log entries are persisted everywhere. + */ +template +void WriteLog::schedule_append_ops(GenericLogOperations &ops, C_BlockIORequestT *req) +{ + bool need_finisher; + GenericLogOperationsVector appending; + + std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending)); + { + std::lock_guard locker(m_lock); + + need_finisher = this->m_ops_to_append.empty() && !this->m_appending; + this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops); + } + + if (need_finisher) { + //enlist op appender + this->m_async_append_ops++; + this->m_async_op_tracker.start_op(); + Context *append_ctx = new LambdaContext([this](int r) { + append_scheduled_ops(); + this->m_async_append_ops--; + this->m_async_op_tracker.finish_op(); + }); + this->m_work_queue.queue(append_ctx); + } + + for (auto &op : appending) { + op->appending(); + } +} + +/* + * Takes custody of ops. They'll all get their pmem blocks flushed, + * then get their log entries appended. 
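+ * Ops are spliced onto m_ops_to_flush under the lock; a flusher is
+ * enlisted on the work queue only when that list transitions from empty,
+ * since an already-enlisted flusher drains whatever it finds there.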
+ */ +template +void WriteLog::schedule_flush_and_append(GenericLogOperationsVector &ops) +{ + GenericLogOperations to_flush(ops.begin(), ops.end()); + bool need_finisher; + ldout(m_image_ctx.cct, 20) << dendl; + { + std::lock_guard locker(m_lock); + + need_finisher = m_ops_to_flush.empty(); + m_ops_to_flush.splice(m_ops_to_flush.end(), to_flush); + } + + if (need_finisher) { + enlist_op_flusher(); + } +} + +template +void WriteLog::process_work() { + CephContext *cct = m_image_ctx.cct; + int max_iterations = 4; + bool wake_up_requested = false; + uint64_t aggressive_high_water_bytes = this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER; + uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER; + uint64_t low_water_bytes = this->m_bytes_allocated_cap * RETIRE_LOW_WATER; + uint64_t aggressive_high_water_entries = this->m_total_log_entries * AGGRESSIVE_RETIRE_HIGH_WATER; + uint64_t high_water_entries = this->m_total_log_entries * RETIRE_HIGH_WATER; + uint64_t low_water_entries = this->m_total_log_entries * RETIRE_LOW_WATER; + + ldout(cct, 20) << dendl; + + do { + { + std::lock_guard locker(m_lock); + this->m_wake_up_requested = false; + } + if (this->m_alloc_failed_since_retire || this->m_invalidating || + this->m_bytes_allocated > high_water_bytes || + (m_log_entries.size() > high_water_entries)) { + int retired = 0; + utime_t started = ceph_clock_now(); + ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire + << ", allocated > high_water=" + << (this->m_bytes_allocated > high_water_bytes) + << ", allocated_entries > high_water=" + << (m_log_entries.size() > high_water_entries) + << dendl; + while (this->m_alloc_failed_since_retire || this->m_invalidating || + (this->m_bytes_allocated > high_water_bytes) || + (m_log_entries.size() > high_water_entries) || + (((this->m_bytes_allocated > low_water_bytes) || + (m_log_entries.size() > low_water_entries)) && + (utime_t(ceph_clock_now() - started).to_msec() < RETIRE_BATCH_TIME_LIMIT_MS))) { + if (!retire_entries((this->m_shutting_down || this->m_invalidating || + (this->m_bytes_allocated > aggressive_high_water_bytes) || + (m_log_entries.size() > aggressive_high_water_entries) || + this->m_alloc_failed_since_retire) + ? 
MAX_ALLOC_PER_TRANSACTION + : MAX_FREE_PER_TRANSACTION)) { + break; + } + retired++; + this->dispatch_deferred_writes(); + this->process_writeback_dirty_entries(); + } + ldout(m_image_ctx.cct, 10) << "Retired " << retired << " times" << dendl; + } + this->dispatch_deferred_writes(); + this->process_writeback_dirty_entries(); + + { + std::lock_guard locker(m_lock); + wake_up_requested = this->m_wake_up_requested; + } + } while (wake_up_requested && --max_iterations > 0); + + { + std::lock_guard locker(m_lock); + this->m_wake_up_scheduled = false; + /* Reschedule if it's still requested */ + if (this->m_wake_up_requested) { + this->wake_up(); + } + } +} + +/* + * Flush the pmem regions for the data blocks of a set of operations + * + * V is expected to be GenericLogOperations, or GenericLogOperationsVector + */ +template +template +void WriteLog::flush_pmem_buffer(V& ops) +{ + utime_t now = ceph_clock_now(); + for (auto &operation : ops) { + if (operation->reserved_allocated()) { + operation->buf_persist_start_time = now; + } else { + ldout(m_image_ctx.cct, 20) << "skipping non-write op: " + << *operation << dendl; + } + } + + for (auto &operation : ops) { + if(operation->is_writing_op()) { + auto log_entry = static_pointer_cast(operation->get_log_entry()); + pmemobj_flush(m_log_pool, log_entry->cache_buffer, log_entry->write_bytes()); + } + } + + /* Drain once for all */ + pmemobj_drain(m_log_pool); + + now = ceph_clock_now(); + for (auto &operation : ops) { + if (operation->reserved_allocated()) { + operation->buf_persist_comp_time = now; + } else { + ldout(m_image_ctx.cct, 20) << "skipping non-write op: " + << *operation << dendl; + } + } +} + +/** + * Update/persist the last flushed sync point in the log + */ +template +void WriteLog::persist_last_flushed_sync_gen() +{ + TOID(struct WriteLogPoolRoot) pool_root; + pool_root = POBJ_ROOT(m_log_pool, struct WriteLogPoolRoot); + uint64_t flushed_sync_gen; + + std::lock_guard append_locker(this->m_log_append_lock); + { + std::lock_guard locker(m_lock); + flushed_sync_gen = this->m_flushed_sync_gen; + } + + if (D_RO(pool_root)->flushed_sync_gen < flushed_sync_gen) { + ldout(m_image_ctx.cct, 15) << "flushed_sync_gen in log updated from " + << D_RO(pool_root)->flushed_sync_gen << " to " + << flushed_sync_gen << dendl; + TX_BEGIN(m_log_pool) { + D_RW(pool_root)->flushed_sync_gen = flushed_sync_gen; + } TX_ONCOMMIT { + } TX_ONABORT { + lderr(m_image_ctx.cct) << "failed to commit update of flushed sync point" << dendl; + ceph_assert(false); + } TX_FINALLY { + } TX_END; + } +} + +template +void WriteLog::reserve_cache(C_BlockIORequestT *req, + bool &alloc_succeeds, bool &no_space) { + std::vector& buffers = req->get_resources_buffers(); + for (auto &buffer : buffers) { + utime_t before_reserve = ceph_clock_now(); + buffer.buffer_oid = pmemobj_reserve(m_log_pool, + &buffer.buffer_alloc_action, + buffer.allocation_size, + 0 /* Object type */); + buffer.allocation_lat = ceph_clock_now() - before_reserve; + if (TOID_IS_NULL(buffer.buffer_oid)) { + ldout(m_image_ctx.cct, 5) << "can't allocate all data buffers: " + << pmemobj_errormsg() << ". " + << *req << dendl; + alloc_succeeds = false; + no_space = true; /* Entries need to be retired */ + + if (this->m_free_log_entries == this->m_total_log_entries - 1) { + /* When the cache is empty, there is still no space to allocate. + * Defragment. 
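+         * Data buffers are reserved in chunks of at least
+         * MIN_WRITE_ALLOC_SIZE, so earlier frees can leave the pool too
+         * fragmented to satisfy a new reservation even when it is
+         * otherwise empty.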
*/ + pmemobj_defrag(m_log_pool, NULL, 0, NULL); + } + break; + } else { + buffer.allocated = true; + } + ldout(m_image_ctx.cct, 20) << "Allocated " << buffer.buffer_oid.oid.pool_uuid_lo + << "." << buffer.buffer_oid.oid.off + << ", size=" << buffer.allocation_size << dendl; + } +} + +template +void WriteLog::copy_bl_to_buffer( + WriteRequestResources *resources, std::unique_ptr &op_set) { + auto allocation = resources->buffers.begin(); + for (auto &operation : op_set->operations) { + operation->copy_bl_to_cache_buffer(allocation); + allocation++; + } +} + +template +bool WriteLog::alloc_resources(C_BlockIORequestT *req) { + bool alloc_succeeds = true; + uint64_t bytes_allocated = 0; + uint64_t bytes_cached = 0; + uint64_t bytes_dirtied = 0; + uint64_t num_lanes = 0; + uint64_t num_unpublished_reserves = 0; + uint64_t num_log_entries = 0; + + ldout(m_image_ctx.cct, 20) << dendl; + // Setup buffer, and get all the number of required resources + req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated, + &num_lanes, &num_log_entries, &num_unpublished_reserves); + + alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied, + bytes_allocated, num_lanes, num_log_entries, + num_unpublished_reserves); + + std::vector& buffers = req->get_resources_buffers(); + if (!alloc_succeeds) { + /* On alloc failure, free any buffers we did allocate */ + for (auto &buffer : buffers) { + if (buffer.allocated) { + pmemobj_cancel(m_log_pool, &buffer.buffer_alloc_action, 1); + } + } + } + + req->set_allocated(alloc_succeeds); + return alloc_succeeds; +} + +template +void WriteLog::complete_user_request(Context *&user_req, int r) { + user_req->complete(r); + // Set user_req as null as it is deleted + user_req = nullptr; +} + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::rwl::WriteLog; diff --git a/src/librbd/cache/pwl/rwl/WriteLog.h b/src/librbd/cache/pwl/rwl/WriteLog.h new file mode 100644 index 000000000..5083a2568 --- /dev/null +++ b/src/librbd/cache/pwl/rwl/WriteLog.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG +#define CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG + +#include +#include +#include +#include "common/Timer.h" +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "common/AsyncOpTracker.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/Utils.h" +#include "librbd/BlockGuard.h" +#include "librbd/cache/Types.h" +#include "librbd/cache/pwl/AbstractWriteLog.h" +#include "librbd/cache/pwl/LogMap.h" +#include "librbd/cache/pwl/LogOperation.h" +#include "librbd/cache/pwl/Request.h" +#include "librbd/cache/pwl/rwl/Builder.h" + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace cache { +namespace pwl { +namespace rwl { + +template +class WriteLog : public AbstractWriteLog { +public: + WriteLog( + ImageCtxT &image_ctx, librbd::cache::pwl::ImageCacheState* cache_state, + ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api); + ~WriteLog(); + WriteLog(const WriteLog&) = delete; + WriteLog &operator=(const WriteLog&) = delete; + + typedef io::Extent Extent; + using This = AbstractWriteLog; + using C_WriteRequestT = pwl::C_WriteRequest; + using C_WriteSameRequestT = pwl::C_WriteSameRequest; + + void copy_bl_to_buffer( + WriteRequestResources *resources, std::unique_ptr &op_set) override; + void complete_user_request(Context 
*&user_req, int r) override; +private: + using C_BlockIORequestT = pwl::C_BlockIORequest; + using C_FlushRequestT = pwl::C_FlushRequest; + using C_DiscardRequestT = pwl::C_DiscardRequest; + + PMEMobjpool *m_log_pool = nullptr; + Builder *m_builderobj; + const char* m_pwl_pool_layout_name; + const uint64_t MAX_EXTENT_SIZE = 1048576; + + Builder* create_builder(); + void remove_pool_file(); + void load_existing_entries(pwl::DeferredContexts &later); + void alloc_op_log_entries(pwl::GenericLogOperations &ops); + int append_op_log_entries(pwl::GenericLogOperations &ops); + void flush_then_append_scheduled_ops(void); + void enlist_op_flusher(); + void flush_op_log_entries(pwl::GenericLogOperationsVector &ops); + template + void flush_pmem_buffer(V& ops); + void inc_allocated_cached_bytes( + std::shared_ptr log_entry) override; +protected: + using AbstractWriteLog::m_lock; + using AbstractWriteLog::m_log_entries; + using AbstractWriteLog::m_image_ctx; + using AbstractWriteLog::m_perfcounter; + using AbstractWriteLog::m_ops_to_flush; + using AbstractWriteLog::m_cache_state; + using AbstractWriteLog::m_first_free_entry; + using AbstractWriteLog::m_first_valid_entry; + + void process_work() override; + void schedule_append_ops(pwl::GenericLogOperations &ops, C_BlockIORequestT *req) override; + void append_scheduled_ops(void) override; + void reserve_cache(C_BlockIORequestT *req, + bool &alloc_succeeds, bool &no_space) override; + void collect_read_extents( + uint64_t read_buffer_offset, LogMapEntry map_entry, + std::vector> &log_entries_to_read, + std::vector &bls_to_read, uint64_t entry_hit_length, + Extent hit_extent, pwl::C_ReadRequest *read_ctx) override; + void complete_read( + std::vector> &log_entries_to_read, + std::vector &bls_to_read, Context *ctx) override; + bool retire_entries(const unsigned long int frees_per_tx) override; + void persist_last_flushed_sync_gen() override; + bool alloc_resources(C_BlockIORequestT *req) override; + void schedule_flush_and_append(pwl::GenericLogOperationsVector &ops) override; + void setup_schedule_append( + pwl::GenericLogOperationsVector &ops, bool do_early_flush, + C_BlockIORequestT *req) override; + void construct_flush_entries(pwl::GenericLogEntries entries_to_flush, + DeferredContexts &post_unlock, + bool has_write_entry) override; + bool initialize_pool(Context *on_finish, pwl::DeferredContexts &later) override; + void write_data_to_buffer( + std::shared_ptr ws_entry, + pwl::WriteLogCacheEntry *pmem_entry) override; + uint64_t get_max_extent() override { + return MAX_EXTENT_SIZE; + } +}; + +} // namespace rwl +} // namespace pwl +} // namespace cache +} // namespace librbd + +extern template class librbd::cache::pwl::rwl::WriteLog; + +#endif // CEPH_LIBRBD_CACHE_REPLICATED_WRITE_LOG diff --git a/src/librbd/cache/pwl/ssd/Builder.h b/src/librbd/cache/pwl/ssd/Builder.h new file mode 100644 index 000000000..07b3fb869 --- /dev/null +++ b/src/librbd/cache/pwl/ssd/Builder.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H +#define CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H + +#include +#include "LogEntry.h" +#include "ReadRequest.h" +#include "Request.h" +#include "LogOperation.h" + +#include "librbd/cache/ImageWriteback.h" +#include "librbd/cache/pwl/Builder.h" + +namespace librbd { +namespace cache { +namespace pwl { +namespace ssd { + +template +class Builder : public pwl::Builder { +public: + std::shared_ptr create_write_log_entry( + uint64_t 
image_offset_bytes, uint64_t write_bytes) override { + return std::make_shared(image_offset_bytes, write_bytes); + } + std::shared_ptr create_write_log_entry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) override { + return std::make_shared( + sync_point_entry, image_offset_bytes, write_bytes); + } + std::shared_ptr create_writesame_log_entry( + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) override { + return std::make_shared( + image_offset_bytes, write_bytes, data_length); + } + std::shared_ptr create_writesame_log_entry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) override { + return std::make_shared( + sync_point_entry, image_offset_bytes, write_bytes, data_length); + } + pwl::C_WriteRequest *create_write_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) override { + return new C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req); + } + pwl::C_WriteSameRequest *create_writesame_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) override { + return new C_WriteSameRequest( + pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req); + } + pwl::C_WriteRequest *create_comp_and_write_request( + T &pwl, utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) override { + return new C_CompAndWriteRequest( + pwl, arrived, std::move(image_extents), std::move(cmp_bl), + std::move(bl), mismatch_offset, fadvise_flags, + lock, perfcounter, user_req); + } + std::shared_ptr create_write_log_operation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, CephContext *cct, + std::shared_ptr write_log_entry) { + return std::make_shared( + set, image_offset_bytes, write_bytes, cct, write_log_entry); + } + std::shared_ptr create_write_log_operation( + WriteLogOperationSet &set, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t data_len, CephContext *cct, + std::shared_ptr writesame_log_entry) { + return std::make_shared( + set, image_offset_bytes, write_bytes, data_len, cct, + writesame_log_entry); + } + std::shared_ptr create_discard_log_operation( + std::shared_ptr sync_point, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t discard_granularity_bytes, + utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) { + return std::make_shared( + sync_point, image_offset_bytes, write_bytes, discard_granularity_bytes, + dispatch_time, perfcounter, cct); + } + C_ReadRequest *create_read_request(CephContext *cct, utime_t arrived, + PerfCounters *perfcounter, ceph::bufferlist *bl, Context *on_finish) { + return new C_ReadRequest(cct, arrived, perfcounter, bl, on_finish); + } +}; + + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_SSD_BUILDER_H diff --git a/src/librbd/cache/pwl/ssd/LogEntry.cc b/src/librbd/cache/pwl/ssd/LogEntry.cc new file mode 100644 index 000000000..0e6edd87b --- /dev/null +++ b/src/librbd/cache/pwl/ssd/LogEntry.cc @@ 
-0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/cache/ImageWriteback.h" +#include "librbd/cache/pwl/ssd/LogEntry.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLogEntry: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace ssd { + +void WriteLogEntry::init_cache_bl( + bufferlist &src_bl, uint64_t off, uint64_t len) { + cache_bl.clear(); + cache_bl.substr_of(src_bl, off, len); +} + +buffer::list& WriteLogEntry::get_cache_bl() { + return cache_bl; +} + +void WriteLogEntry::copy_cache_bl(bufferlist *out) { + std::lock_guard locker(m_entry_bl_lock); + *out = cache_bl; +} + +void WriteLogEntry::remove_cache_bl() { + std::lock_guard locker(m_entry_bl_lock); + cache_bl.clear(); +} + +unsigned int WriteLogEntry::get_aligned_data_size() const { + if (cache_bl.length()) { + return round_up_to(cache_bl.length(), MIN_WRITE_ALLOC_SSD_SIZE); + } + return round_up_to(write_bytes(), MIN_WRITE_ALLOC_SSD_SIZE); +} + +void WriteLogEntry::writeback_bl( + librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx, ceph::bufferlist&& bl) { + image_writeback.aio_write({{ram_entry.image_offset_bytes, + ram_entry.write_bytes}}, + std::move(bl), 0, ctx); +} + +void WriteSameLogEntry::writeback_bl( + librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx, ceph::bufferlist &&bl) { + image_writeback.aio_writesame(ram_entry.image_offset_bytes, + ram_entry.write_bytes, + std::move(bl), 0, ctx); +} + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/ssd/LogEntry.h b/src/librbd/cache/pwl/ssd/LogEntry.h new file mode 100644 index 000000000..8e26f661f --- /dev/null +++ b/src/librbd/cache/pwl/ssd/LogEntry.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// // vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H +#define CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H + +#include "librbd/cache/pwl/LogEntry.h" + +namespace librbd { +namespace cache { +class ImageWritebackInterface; +namespace pwl { +namespace ssd { + +class WriteLogEntry : public pwl::WriteLogEntry { +public: + WriteLogEntry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes) + : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, write_bytes) {} + WriteLogEntry( + uint64_t image_offset_bytes, uint64_t write_bytes) + : pwl::WriteLogEntry(image_offset_bytes, write_bytes) {} + WriteLogEntry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : pwl::WriteLogEntry(sync_point_entry, image_offset_bytes, + write_bytes, data_length) {} + WriteLogEntry( + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : pwl::WriteLogEntry(image_offset_bytes, write_bytes, data_length) {} + ~WriteLogEntry() {} + WriteLogEntry(const WriteLogEntry&) = delete; + WriteLogEntry &operator=(const WriteLogEntry&) = delete; + void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx, ceph::bufferlist &&bl) override; + void init_cache_bl(bufferlist &src_bl, uint64_t off, uint64_t len) override; + buffer::list &get_cache_bl() override; + void copy_cache_bl(bufferlist *out) override; + void remove_cache_bl() override; + unsigned int get_aligned_data_size() const 
override; + void inc_bl_refs() { bl_refs++; }; + void dec_bl_refs() { bl_refs--; }; + unsigned int reader_count() const override { + return bl_refs; + } +}; + +class WriteSameLogEntry : public WriteLogEntry { +public: + WriteSameLogEntry( + std::shared_ptr sync_point_entry, + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : WriteLogEntry(sync_point_entry, image_offset_bytes, + write_bytes, data_length) {} + WriteSameLogEntry( + uint64_t image_offset_bytes, uint64_t write_bytes, + uint32_t data_length) + : WriteLogEntry(image_offset_bytes, write_bytes, data_length) {} + ~WriteSameLogEntry() {} + WriteSameLogEntry(const WriteSameLogEntry&) = delete; + WriteSameLogEntry &operator=(const WriteSameLogEntry&) = delete; + void writeback_bl(librbd::cache::ImageWritebackInterface &image_writeback, + Context *ctx, ceph::bufferlist &&bl) override; +}; + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_SSD_LOG_ENTRY_H diff --git a/src/librbd/cache/pwl/ssd/LogOperation.cc b/src/librbd/cache/pwl/ssd/LogOperation.cc new file mode 100644 index 000000000..c8080e37d --- /dev/null +++ b/src/librbd/cache/pwl/ssd/LogOperation.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LogOperation.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ssd::LogOperation: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace ssd { + +void DiscardLogOperation::init_op( + uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist, + Context *write_append) { + log_entry->init(current_sync_gen, persist_on_flush, last_op_sequence_num); + if (persist_on_flush) { + this->on_write_append = new LambdaContext( + [write_persist, write_append] (int r) { + write_append->complete(r); + write_persist->complete(r); + }); + } else { + this->on_write_append = write_append; + this->on_write_persist = write_persist; + } +} + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd diff --git a/src/librbd/cache/pwl/ssd/LogOperation.h b/src/librbd/cache/pwl/ssd/LogOperation.h new file mode 100644 index 000000000..dbc89aa73 --- /dev/null +++ b/src/librbd/cache/pwl/ssd/LogOperation.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_LOG_OPERATION_H +#define CEPH_LIBRBD_CACHE_PWL_SSD_LOG_OPERATION_H + +#include "librbd/cache/pwl/LogOperation.h" + + +namespace librbd { +namespace cache { +namespace pwl { +namespace ssd { + +class DiscardLogOperation : public pwl::DiscardLogOperation { +public: + DiscardLogOperation( + std::shared_ptr sync_point, uint64_t image_offset_bytes, + uint64_t write_bytes, uint32_t discard_granularity_bytes, + utime_t dispatch_time, PerfCounters *perfcounter, CephContext *cct) + : pwl::DiscardLogOperation(sync_point, image_offset_bytes, write_bytes, + discard_granularity_bytes, dispatch_time, + perfcounter, cct) {} + void init_op( + uint64_t current_sync_gen, bool persist_on_flush, + uint64_t last_op_sequence_num, Context *write_persist, + Context *write_append) override; +}; + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd + +#endif // CEPH_LIBRBD_CACHE_PWL_SSD_LOG_OPERATION_H diff --git 
a/src/librbd/cache/pwl/ssd/ReadRequest.cc b/src/librbd/cache/pwl/ssd/ReadRequest.cc new file mode 100644 index 000000000..1a80a8d8c --- /dev/null +++ b/src/librbd/cache/pwl/ssd/ReadRequest.cc @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ReadRequest.h" + +#define dout_subsys ceph_subsys_rbd_pwl +#undef dout_prefix +#define dout_prefix *_dout << "librbd::cache::pwl::ssd::ReadRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace cache { +namespace pwl { +namespace ssd { + +void C_ReadRequest::finish(int r) { + ldout(m_cct, 20) << "(" << get_name() << "): r=" << r << dendl; + int hits = 0; + int misses = 0; + int hit_bytes = 0; + int miss_bytes = 0; + if (r >= 0) { + /* + * At this point the miss read has completed. We'll iterate through + * m_read_extents and produce *m_out_bl by assembling pieces of m_miss_bl + * and the individual hit extent bufs in the read extents that represent + * hits. + */ + uint64_t miss_bl_offset = 0; + for (auto extent : read_extents) { + if (extent->m_bl.length()) { + /* This was a hit */ + bufferlist data_bl; + if (extent->writesame) { + int data_len = extent->m_bl.length(); + int read_buffer_offset = extent->truncate_offset; + if (extent->need_to_truncate && extent->truncate_offset >= data_len) { + read_buffer_offset = (extent->truncate_offset) % data_len; + } + // build data and truncate + bufferlist temp_bl; + uint64_t total_left_bytes = read_buffer_offset + extent->second; + while (total_left_bytes > 0) { + temp_bl.append(extent->m_bl); + total_left_bytes = total_left_bytes - data_len; + } + data_bl.substr_of(temp_bl, read_buffer_offset, extent->second); + m_out_bl->claim_append(data_bl); + } else if (extent->need_to_truncate) { + assert(extent->m_bl.length() >= extent->truncate_offset + extent->second); + data_bl.substr_of(extent->m_bl, extent->truncate_offset, extent->second); + m_out_bl->claim_append(data_bl); + } else { + assert(extent->second == extent->m_bl.length()); + m_out_bl->claim_append(extent->m_bl); + } + ++hits; + hit_bytes += extent->second; + } else { + /* This was a miss. 
*/
+      ++misses;
+      miss_bytes += extent->second;
+      bufferlist miss_extent_bl;
+      miss_extent_bl.substr_of(miss_bl, miss_bl_offset, extent->second);
+      /* Add this read miss bufferlist to the output bufferlist */
+      m_out_bl->claim_append(miss_extent_bl);
+      /* Consume these bytes in the read miss bufferlist */
+      miss_bl_offset += extent->second;
+    }
+  }
+  }
+  ldout(m_cct, 20) << "(" << get_name() << "): r=" << r
+                   << " bl=" << *m_out_bl << dendl;
+  utime_t now = ceph_clock_now();
+  ceph_assert((int)m_out_bl->length() == hit_bytes + miss_bytes);
+  m_on_finish->complete(r);
+  m_perfcounter->inc(l_librbd_pwl_rd_bytes, hit_bytes + miss_bytes);
+  m_perfcounter->inc(l_librbd_pwl_rd_hit_bytes, hit_bytes);
+  m_perfcounter->tinc(l_librbd_pwl_rd_latency, now - m_arrived_time);
+  if (!misses) {
+    m_perfcounter->inc(l_librbd_pwl_rd_hit_req, 1);
+    m_perfcounter->tinc(l_librbd_pwl_rd_hit_latency, now - m_arrived_time);
+  } else {
+    if (hits) {
+      m_perfcounter->inc(l_librbd_pwl_rd_part_hit_req, 1);
+    }
+  }
+}
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
diff --git a/src/librbd/cache/pwl/ssd/ReadRequest.h b/src/librbd/cache/pwl/ssd/ReadRequest.h
new file mode 100644
index 000000000..345c4aa65
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/ReadRequest.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
+#define CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
+
+#include "librbd/cache/pwl/ReadRequest.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+typedef std::vector<pwl::ImageExtentBuf> ImageExtentBufs;
+
+class C_ReadRequest : public pwl::C_ReadRequest {
+protected:
+  using pwl::C_ReadRequest::m_cct;
+  using pwl::C_ReadRequest::m_on_finish;
+  using pwl::C_ReadRequest::m_out_bl;
+  using pwl::C_ReadRequest::m_arrived_time;
+  using pwl::C_ReadRequest::m_perfcounter;
+public:
+  C_ReadRequest(CephContext *cct, utime_t arrived, PerfCounters *perfcounter,
+                bufferlist *out_bl, Context *on_finish)
+    : pwl::C_ReadRequest(cct, arrived, perfcounter, out_bl, on_finish) {}
+  void finish(int r) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_READ_REQUEST_H
diff --git a/src/librbd/cache/pwl/ssd/Request.cc b/src/librbd/cache/pwl/ssd/Request.cc
new file mode 100644
index 000000000..61e39b7c1
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Request.cc
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Request.h"
+
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::Request: " << this << " " \
+                           << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+template <typename T>
+void C_WriteRequest<T>::setup_buffer_resources(
+    uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated,
+    uint64_t *number_lanes, uint64_t *number_log_entries,
+    uint64_t *number_unpublished_reserves) {
+
+  *bytes_cached = 0;
+  *bytes_allocated = 0;
+  *number_log_entries = this->image_extents.size();
+
+  for (auto &extent : this->image_extents) {
+    *bytes_cached += extent.second;
+    *bytes_allocated += round_up_to(extent.second, MIN_WRITE_ALLOC_SSD_SIZE);
+  }
+  *bytes_dirtied = *bytes_cached;
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os,
+                         const C_CompAndWriteRequest<T> &req) {
+  os << (C_WriteRequest<T>&)req
+     << "
cmp_bl=" << req.cmp_bl + << ", read_bl=" << req.read_bl + << ", compare_succeeded=" << req.compare_succeeded + << ", mismatch_offset=" << req.mismatch_offset; + return os; +} + +template +void C_WriteSameRequest::setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, uint64_t *bytes_allocated, + uint64_t *number_lanes, uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) { + ceph_assert(this->image_extents.size() == 1); + *number_log_entries = 1; + *bytes_dirtied = this->image_extents[0].second; + *bytes_cached = this->bl.length(); + *bytes_allocated = round_up_to(*bytes_cached, MIN_WRITE_ALLOC_SSD_SIZE); +} + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ssd::C_WriteRequest >; +template class librbd::cache::pwl::ssd::C_WriteSameRequest >; +template class librbd::cache::pwl::ssd::C_CompAndWriteRequest >; diff --git a/src/librbd/cache/pwl/ssd/Request.h b/src/librbd/cache/pwl/ssd/Request.h new file mode 100644 index 000000000..9bb3e85b9 --- /dev/null +++ b/src/librbd/cache/pwl/ssd/Request.h @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_SSD_REQUEST_H +#define CEPH_LIBRBD_CACHE_SSD_REQUEST_H + +#include "librbd/cache/pwl/Request.h" + +namespace librbd { +class BlockGuardCell; + +namespace cache { +namespace pwl { + +template +class AbstractWriteLog; + +namespace ssd { + +template +class C_WriteRequest : public pwl::C_WriteRequest { +public: + C_WriteRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : pwl::C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(cmp_bl), + std::move(bl), mismatch_offset, fadvise_flags, + lock, perfcounter, user_req) {} + + C_WriteRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : pwl::C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(bl), + fadvise_flags, lock, perfcounter, user_req) {} +protected: + void setup_buffer_resources( + uint64_t *bytes_cached, uint64_t *bytes_dirtied, + uint64_t *bytes_allocated, uint64_t *number_lanes, + uint64_t *number_log_entries, + uint64_t *number_unpublished_reserves) override; +}; + +template +class C_CompAndWriteRequest : public C_WriteRequest { +public: + C_CompAndWriteRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& cmp_bl, bufferlist&& bl, uint64_t *mismatch_offset, + const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : C_WriteRequest( + pwl, arrived, std::move(image_extents), std::move(cmp_bl), + std::move(bl), mismatch_offset,fadvise_flags, + lock, perfcounter, user_req) {} + + const char *get_name() const override { + return "C_CompAndWriteRequest"; + } + template + friend std::ostream &operator<<(std::ostream &os, + const C_CompAndWriteRequest &req); +}; + +template +class C_WriteSameRequest : public pwl::C_WriteSameRequest { +public: + C_WriteSameRequest( + T &pwl, const utime_t arrived, io::Extents &&image_extents, + bufferlist&& bl, const int fadvise_flags, ceph::mutex &lock, + PerfCounters *perfcounter, Context *user_req) + : pwl::C_WriteSameRequest( + pwl, arrived, 
std::move(image_extents), std::move(bl), fadvise_flags,
+        lock, perfcounter, user_req) {}
+
+  void setup_buffer_resources(
+      uint64_t *bytes_cached, uint64_t *bytes_dirtied,
+      uint64_t *bytes_allocated, uint64_t *number_lanes,
+      uint64_t *number_log_entries,
+      uint64_t *number_unpublished_reserves) override;
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CACHE_SSD_REQUEST_H
diff --git a/src/librbd/cache/pwl/ssd/Types.h b/src/librbd/cache/pwl/ssd/Types.h
new file mode 100644
index 000000000..52f67ae20
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/Types.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CACHE_SSD_TYPES_H
+#define CEPH_LIBRBD_CACHE_SSD_TYPES_H
+
+#include "acconfig.h"
+
+#include "librbd/io/Types.h"
+#include "librbd/cache/pwl/Types.h"
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+struct SuperBlock {
+  WriteLogPoolRoot root;
+
+  DENC(SuperBlock, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.root, p);
+    DENC_FINISH(p);
+  }
+
+  void dump(Formatter *f) const {
+    f->dump_object("super", root);
+  }
+
+  static void generate_test_instances(std::list<SuperBlock*>& ls) {
+    ls.push_back(new SuperBlock());
+    ls.push_back(new SuperBlock());
+    ls.back()->root.layout_version = 3;
+    ls.back()->root.cur_sync_gen = 1;
+    ls.back()->root.pool_size = 10737418240;
+    ls.back()->root.flushed_sync_gen = 1;
+    ls.back()->root.block_size = 4096;
+    ls.back()->root.num_log_entries = 0;
+    ls.back()->root.first_free_entry = 30601;
+    ls.back()->root.first_valid_entry = 2;
+  }
+};
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+WRITE_CLASS_DENC(librbd::cache::pwl::ssd::SuperBlock)
+
+#endif // CEPH_LIBRBD_CACHE_SSD_TYPES_H
diff --git a/src/librbd/cache/pwl/ssd/WriteLog.cc b/src/librbd/cache/pwl/ssd/WriteLog.cc
new file mode 100644
index 000000000..753b15b69
--- /dev/null
+++ b/src/librbd/cache/pwl/ssd/WriteLog.cc
@@ -0,0 +1,1160 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "WriteLog.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/ceph_assert.h"
+#include "common/deleter.h"
+#include "common/dout.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/cache/pwl/ImageCacheState.h"
+#include "librbd/cache/pwl/LogEntry.h"
+#include <map>
+#include <vector>
+
+#undef dout_subsys
+#define dout_subsys ceph_subsys_rbd_pwl
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::cache::pwl::ssd::WriteLog: " \
+                           << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace cache {
+namespace pwl {
+namespace ssd {
+
+using namespace std;
+using namespace librbd::cache::pwl;
+
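+/*
+ * Layout sketch assumed by the validity checks below: the superblock
+ * (WriteLogPoolRoot) sits at offset 0, and the log ring buffer occupies
+ * [DATA_RING_BUFFER_OFFSET, pool_size). first_valid_entry and
+ * first_free_entry stay MIN_WRITE_ALLOC_SSD_SIZE-aligned and wrap back
+ * to DATA_RING_BUFFER_OFFSET rather than to 0.
+ */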
+static bool is_valid_pool_root(const WriteLogPoolRoot& root) {
+  return root.pool_size % MIN_WRITE_ALLOC_SSD_SIZE == 0 &&
+         root.first_valid_entry >= DATA_RING_BUFFER_OFFSET &&
+         root.first_valid_entry < root.pool_size &&
+         root.first_valid_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0 &&
+         root.first_free_entry >= DATA_RING_BUFFER_OFFSET &&
+         root.first_free_entry < root.pool_size &&
+         root.first_free_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0;
+}
+
+template <typename I>
+Builder<AbstractWriteLog<I>>* WriteLog<I>::create_builder() {
+  m_builderobj = new Builder<This>();
+  return m_builderobj;
+}
+
+template <typename I>
+WriteLog<I>::WriteLog(
+    I &image_ctx, librbd::cache::pwl::ImageCacheState<I>* cache_state,
+    cache::ImageWritebackInterface& image_writeback,
+    plugin::Api<I>& plugin_api)
+  : AbstractWriteLog<I>(image_ctx, cache_state, create_builder(),
+                        image_writeback, plugin_api)
+{
+}
+
+template <typename I>
+WriteLog<I>::~WriteLog() {
+  delete m_builderobj;
+}
+
+template <typename I>
+void WriteLog<I>::collect_read_extents(
+    uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+    std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+    std::vector<bufferlist*> &bls_to_read,
+    uint64_t entry_hit_length, Extent hit_extent,
+    pwl::C_ReadRequest *read_ctx) {
+  // Make a bl for this hit extent. This will add references to the
+  // write_entry->cache_bl.
+  ldout(m_image_ctx.cct, 5) << dendl;
+  auto write_entry = std::static_pointer_cast<WriteLogEntry>(map_entry.log_entry);
+  buffer::list hit_bl;
+  write_entry->copy_cache_bl(&hit_bl);
+  bool writesame = write_entry->is_writesame_entry();
+  auto hit_extent_buf = std::make_shared<ImageExtentBuf>(
+      hit_extent, hit_bl, true, read_buffer_offset, writesame);
+  read_ctx->read_extents.push_back(hit_extent_buf);
+
+  if (!hit_bl.length()) {
+    ldout(m_image_ctx.cct, 5) << "didn't hit RAM" << dendl;
+    auto read_extent = read_ctx->read_extents.back();
+    write_entry->inc_bl_refs();
+    log_entries_to_read.push_back(std::move(write_entry));
+    bls_to_read.push_back(&read_extent->m_bl);
+  }
+}
+
+template <typename I>
+void WriteLog<I>::complete_read(
+    std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+    std::vector<bufferlist*> &bls_to_read,
+    Context *ctx) {
+  if (!log_entries_to_read.empty()) {
+    aio_read_data_blocks(log_entries_to_read, bls_to_read, ctx);
+  } else {
+    ctx->complete(0);
+  }
+}
+
+template <typename I>
+int WriteLog<I>::create_and_open_bdev() {
+  CephContext *cct = m_image_ctx.cct;
+
+  bdev = BlockDevice::create(cct, this->m_log_pool_name, aio_cache_cb,
+                             nullptr, nullptr, nullptr);
+  int r = bdev->open(this->m_log_pool_name);
+  if (r < 0) {
+    lderr(cct) << "failed to open bdev" << dendl;
+    delete bdev;
+    return r;
+  }
+
+  ceph_assert(this->m_log_pool_size % MIN_WRITE_ALLOC_SSD_SIZE == 0);
+  if (bdev->get_size() != this->m_log_pool_size) {
+    lderr(cct) << "size mismatch: bdev size " << bdev->get_size()
+               << " (block size " << bdev->get_block_size()
+               << ") != pool size " << this->m_log_pool_size << dendl;
+    bdev->close();
+    delete bdev;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+template <typename I>
+bool WriteLog<I>::initialize_pool(Context *on_finish,
+                                  pwl::DeferredContexts &later) {
+  int r;
+  CephContext *cct = m_image_ctx.cct;
+
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+  if (access(this->m_log_pool_name.c_str(), F_OK) != 0) {
+    int fd = ::open(this->m_log_pool_name.c_str(), O_RDWR|O_CREAT, 0644);
+    bool succeed = true;
+    if (fd >= 0) {
+      if (truncate(this->m_log_pool_name.c_str(),
+                   this->m_log_pool_size) != 0) {
+        succeed = false;
+      }
+      ::close(fd);
+    } else {
+      succeed = false;
+    }
+    if (!succeed) {
+      m_cache_state->present = false;
+      m_cache_state->clean = true;
+      m_cache_state->empty = true;
+      /* TODO: filter/replace errnos that are meaningless to the caller */
+      on_finish->complete(-errno);
+      return false;
+    }
+
+    r = create_and_open_bdev();
+    if (r < 0) {
+      on_finish->complete(r);
+      return false;
+    }
+    m_cache_state->present = true;
+    m_cache_state->clean = true;
+    m_cache_state->empty = true;
+    /* new pool, calculate and store metadata */
+
+    /* Keep ring buffer at least MIN_WRITE_ALLOC_SSD_SIZE bytes free.
+     * In this way, when all ring buffer spaces are allocated,
+     * m_first_free_entry and m_first_valid_entry will not be equal.
+     * Equal only means the cache is empty.
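+     * For example, with the 4 KiB allocation unit a completely full ring
+     * still keeps one 4 KiB slot unallocated, so a full cache never looks
+     * empty.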
*/ + this->m_bytes_allocated_cap = this->m_log_pool_size - + DATA_RING_BUFFER_OFFSET - MIN_WRITE_ALLOC_SSD_SIZE; + /* Log ring empty */ + m_first_free_entry = DATA_RING_BUFFER_OFFSET; + m_first_valid_entry = DATA_RING_BUFFER_OFFSET; + + auto new_root = std::make_shared(pool_root); + new_root->layout_version = SSD_LAYOUT_VERSION; + new_root->pool_size = this->m_log_pool_size; + new_root->flushed_sync_gen = this->m_flushed_sync_gen; + new_root->block_size = MIN_WRITE_ALLOC_SSD_SIZE; + new_root->first_free_entry = m_first_free_entry; + new_root->first_valid_entry = m_first_valid_entry; + new_root->num_log_entries = 0; + pool_root = *new_root; + + r = update_pool_root_sync(new_root); + if (r != 0) { + lderr(cct) << "failed to initialize pool (" + << this->m_log_pool_name << ")" << dendl; + bdev->close(); + delete bdev; + on_finish->complete(r); + return false; + } + } else { + ceph_assert(m_cache_state->present); + r = create_and_open_bdev(); + if (r < 0) { + on_finish->complete(r); + return false; + } + + bufferlist bl; + SuperBlock superblock; + ::IOContext ioctx(cct, nullptr); + r = bdev->read(0, MIN_WRITE_ALLOC_SSD_SIZE, &bl, &ioctx, false); + if (r < 0) { + lderr(cct) << "read ssd cache superblock failed " << dendl; + goto err_close_bdev; + } + auto p = bl.cbegin(); + decode(superblock, p); + pool_root = superblock.root; + ldout(cct, 1) << "Decoded root: pool_size=" << pool_root.pool_size + << " first_valid_entry=" << pool_root.first_valid_entry + << " first_free_entry=" << pool_root.first_free_entry + << " flushed_sync_gen=" << pool_root.flushed_sync_gen + << dendl; + ceph_assert(is_valid_pool_root(pool_root)); + if (pool_root.layout_version != SSD_LAYOUT_VERSION) { + lderr(cct) << "pool layout version is " + << pool_root.layout_version + << " expected " << SSD_LAYOUT_VERSION + << dendl; + goto err_close_bdev; + } + if (pool_root.block_size != MIN_WRITE_ALLOC_SSD_SIZE) { + lderr(cct) << "pool block size is " << pool_root.block_size + << " expected " << MIN_WRITE_ALLOC_SSD_SIZE + << dendl; + goto err_close_bdev; + } + + this->m_log_pool_size = pool_root.pool_size; + this->m_flushed_sync_gen = pool_root.flushed_sync_gen; + this->m_first_valid_entry = pool_root.first_valid_entry; + this->m_first_free_entry = pool_root.first_free_entry; + this->m_bytes_allocated_cap = this->m_log_pool_size - + DATA_RING_BUFFER_OFFSET - + MIN_WRITE_ALLOC_SSD_SIZE; + + load_existing_entries(later); + m_cache_state->clean = this->m_dirty_log_entries.empty(); + m_cache_state->empty = m_log_entries.empty(); + } + return true; + +err_close_bdev: + bdev->close(); + delete bdev; + on_finish->complete(-EINVAL); + return false; +} + +template +void WriteLog::remove_pool_file() { + ceph_assert(bdev); + bdev->close(); + delete bdev; + bdev = nullptr; + ldout(m_image_ctx.cct, 5) << "block device is closed" << dendl; + + if (m_cache_state->clean) { + ldout(m_image_ctx.cct, 5) << "Removing empty pool file: " + << this->m_log_pool_name << dendl; + if (remove(this->m_log_pool_name.c_str()) != 0) { + lderr(m_image_ctx.cct) << "failed to remove empty pool \"" + << this->m_log_pool_name << "\": " << dendl; + } else { + m_cache_state->present = false; + } + } else { + ldout(m_image_ctx.cct, 5) << "Not removing pool file: " + << this->m_log_pool_name << dendl; + } +} + +template +void WriteLog::load_existing_entries(pwl::DeferredContexts &later) { + CephContext *cct = m_image_ctx.cct; + std::map> sync_point_entries; + std::map missing_sync_points; + + // Iterate through the log_entries and append all the write_bytes + // of each 
entry to find the position of the next 4 KiB control block, and
+  // append the decoded entries to the in-memory log entry list.
+  for (uint64_t next_log_pos = this->m_first_valid_entry;
+       next_log_pos != this->m_first_free_entry; ) {
+    // read the entries from SSD cache and decode
+    bufferlist bl_entries;
+    ::IOContext ioctx_entry(cct, nullptr);
+    bdev->read(next_log_pos, MIN_WRITE_ALLOC_SSD_SIZE, &bl_entries,
+               &ioctx_entry, false);
+    std::vector<WriteLogCacheEntry> ssd_log_entries;
+    auto pl = bl_entries.cbegin();
+    decode(ssd_log_entries, pl);
+    ldout(cct, 5) << "decoded ssd log entries" << dendl;
+    uint64_t curr_log_pos = next_log_pos;
+    std::shared_ptr<GenericLogEntry> log_entry = nullptr;
+
+    for (auto it = ssd_log_entries.begin(); it != ssd_log_entries.end(); ++it) {
+      this->update_entries(&log_entry, &*it, missing_sync_points,
+                           sync_point_entries, curr_log_pos);
+      log_entry->ram_entry = *it;
+      log_entry->log_entry_index = curr_log_pos;
+      log_entry->completed = true;
+      m_log_entries.push_back(log_entry);
+      next_log_pos += round_up_to(it->write_bytes, MIN_WRITE_ALLOC_SSD_SIZE);
+    }
+    // along with the write_bytes, add control block size too
+    next_log_pos += MIN_WRITE_ALLOC_SSD_SIZE;
+    if (next_log_pos >= this->m_log_pool_size) {
+      next_log_pos = next_log_pos % this->m_log_pool_size + DATA_RING_BUFFER_OFFSET;
+    }
+  }
+  this->update_sync_points(missing_sync_points, sync_point_entries, later);
+  if (m_first_valid_entry > m_first_free_entry) {
+    m_bytes_allocated = this->m_log_pool_size - m_first_valid_entry +
+      m_first_free_entry - DATA_RING_BUFFER_OFFSET;
+  } else {
+    m_bytes_allocated = m_first_free_entry - m_first_valid_entry;
+  }
+}
+
+// For SSD, this only accounts cached bytes; m_bytes_allocated is
+// computed elsewhere (load_existing_entries() and alloc_resources()).
+template <typename I>
+void WriteLog<I>::inc_allocated_cached_bytes(
+    std::shared_ptr<pwl::GenericLogEntry> log_entry) {
+  if (log_entry->is_write_entry()) {
+    this->m_bytes_cached += log_entry->write_bytes();
+  }
+}
+
+template <typename I>
+bool WriteLog<I>::alloc_resources(C_BlockIORequestT *req) {
+  bool alloc_succeeds = true;
+  uint64_t bytes_allocated = 0;
+  uint64_t bytes_cached = 0;
+  uint64_t bytes_dirtied = 0;
+  uint64_t num_lanes = 0;
+  uint64_t num_unpublished_reserves = 0;
+  uint64_t num_log_entries = 0;
+
+  // Set up the buffer and count the resources this request needs
+  req->setup_buffer_resources(&bytes_cached, &bytes_dirtied, &bytes_allocated,
+                              &num_lanes, &num_log_entries,
+                              &num_unpublished_reserves);
+
+  ceph_assert(!num_lanes);
+  if (num_log_entries) {
+    bytes_allocated += num_log_entries * MIN_WRITE_ALLOC_SSD_SIZE;
+    num_log_entries = 0;
+  }
+  ceph_assert(!num_unpublished_reserves);
+
+  alloc_succeeds = this->check_allocation(req, bytes_cached, bytes_dirtied,
+                                          bytes_allocated, num_lanes,
+                                          num_log_entries,
+                                          num_unpublished_reserves);
+  req->set_allocated(alloc_succeeds);
+  return alloc_succeeds;
+}
+
+template <typename I>
+bool WriteLog<I>::has_sync_point_logs(GenericLogOperations &ops) {
+  for (auto &op : ops) {
+    if (op->get_log_entry()->is_sync_point()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename I>
+void WriteLog<I>::enlist_op_appender() {
+  this->m_async_append_ops++;
+  this->m_async_op_tracker.start_op();
+  Context *append_ctx = new LambdaContext([this](int r) {
+    append_scheduled_ops();
+  });
+  this->m_work_queue.queue(append_ctx);
+}
+
+/*
+ * Takes custody of ops. They'll all get their log entries appended,
+ * and have their on_write_persist contexts completed once they and
+ * all prior log entries are persisted everywhere.
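+ * Ops are batched into a control block of up to
+ * CONTROL_BLOCK_MAX_LOG_ENTRIES entries; the batch is kicked immediately
+ * when persist-on-write is active or when it contains a sync point,
+ * otherwise it is appended once enough ops accumulate.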
+ */ +template +void WriteLog::schedule_append_ops(GenericLogOperations &ops, C_BlockIORequestT *req) { + bool need_finisher = false; + GenericLogOperationsVector appending; + + std::copy(std::begin(ops), std::end(ops), std::back_inserter(appending)); + { + std::lock_guard locker(m_lock); + + bool persist_on_flush = this->get_persist_on_flush(); + need_finisher = !this->m_appending && + ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) || + !persist_on_flush); + + // Only flush logs into SSD when there is internal/external flush request + if (!need_finisher) { + need_finisher = has_sync_point_logs(ops); + } + this->m_ops_to_append.splice(this->m_ops_to_append.end(), ops); + + // To preserve the order of overlapping IOs, release_cell() may be + // called only after the ops are added to m_ops_to_append. + // As soon as m_lock is released, the appended ops can be picked up + // by append_scheduled_ops() in another thread and req can be freed. + if (req != nullptr) { + if (persist_on_flush) { + req->complete_user_request(0); + } + req->release_cell(); + } + } + + if (need_finisher) { + this->enlist_op_appender(); + } + + for (auto &op : appending) { + op->appending(); + } +} + +template +void WriteLog::setup_schedule_append(pwl::GenericLogOperationsVector &ops, + bool do_early_flush, + C_BlockIORequestT *req) { + this->schedule_append(ops, req); +} + +template +void WriteLog::append_scheduled_ops(void) { + GenericLogOperations ops; + ldout(m_image_ctx.cct, 20) << dendl; + + bool ops_remain = false; // unused, no-op variable for SSD + bool appending = false; // unused, no-op variable for SSD + this->append_scheduled(ops, ops_remain, appending); + + if (ops.size()) { + alloc_op_log_entries(ops); + append_op_log_entries(ops); + } else { + this->m_async_append_ops--; + this->m_async_op_tracker.finish_op(); + } +} + +/* + * Write and persist the (already allocated) write log entries and + * data buffer allocations for a set of ops. The data buffer for each + * of these must already have been persisted to its reserved area. 
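+ * The superblock's first_free_entry is advanced via schedule_update_root()
+ * only after the appended control blocks and data have landed, and the
+ * ops are completed only after that root update finishes.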
+ */ +template +void WriteLog::append_op_log_entries(GenericLogOperations &ops) { + ceph_assert(!ops.empty()); + ldout(m_image_ctx.cct, 20) << dendl; + Context *ctx = new LambdaContext([this, ops](int r) { + assert(r == 0); + ldout(m_image_ctx.cct, 20) << "Finished root update " << dendl; + + auto captured_ops = std::move(ops); + this->complete_op_log_entries(std::move(captured_ops), r); + + bool need_finisher = false; + { + std::lock_guard locker1(m_lock); + bool persist_on_flush = this->get_persist_on_flush(); + need_finisher = ((this->m_ops_to_append.size() >= CONTROL_BLOCK_MAX_LOG_ENTRIES) || + !persist_on_flush); + + if (!need_finisher) { + need_finisher = has_sync_point_logs(this->m_ops_to_append); + } + } + + if (need_finisher) { + this->enlist_op_appender(); + } + this->m_async_update_superblock--; + this->m_async_op_tracker.finish_op(); + }); + uint64_t *new_first_free_entry = new(uint64_t); + Context *append_ctx = new LambdaContext( + [this, new_first_free_entry, ops, ctx](int r) { + std::shared_ptr new_root; + { + ldout(m_image_ctx.cct, 20) << "Finished appending at " + << *new_first_free_entry << dendl; + utime_t now = ceph_clock_now(); + for (auto &operation : ops) { + operation->log_append_comp_time = now; + } + + std::lock_guard locker(this->m_log_append_lock); + std::lock_guard locker1(m_lock); + assert(this->m_appending); + this->m_appending = false; + new_root = std::make_shared(pool_root); + pool_root.first_free_entry = *new_first_free_entry; + new_root->first_free_entry = *new_first_free_entry; + delete new_first_free_entry; + schedule_update_root(new_root, ctx); + } + this->m_async_append_ops--; + this->m_async_op_tracker.finish_op(); + }); + // Append logs and update first_free_update + append_ops(ops, append_ctx, new_first_free_entry); + + if (ops.size()) { + this->dispatch_deferred_writes(); + } +} + +template +void WriteLog::release_ram(std::shared_ptr log_entry) { + log_entry->remove_cache_bl(); +} + +template +void WriteLog::alloc_op_log_entries(GenericLogOperations &ops) { + std::unique_lock locker(m_lock); + + for (auto &operation : ops) { + auto &log_entry = operation->get_log_entry(); + log_entry->ram_entry.set_entry_valid(true); + m_log_entries.push_back(log_entry); + ldout(m_image_ctx.cct, 20) << "operation=[" << *operation << "]" << dendl; + } + if (m_cache_state->empty && !m_log_entries.empty()) { + m_cache_state->empty = false; + this->update_image_cache_state(); + this->write_image_cache_state(locker); + } +} + +template +void WriteLog::construct_flush_entries(pwl::GenericLogEntries entries_to_flush, + DeferredContexts &post_unlock, + bool has_write_entry) { + // snapshot so we behave consistently + bool invalidating = this->m_invalidating; + + if (invalidating || !has_write_entry) { + for (auto &log_entry : entries_to_flush) { + GuardedRequestFunctionContext *guarded_ctx = + new GuardedRequestFunctionContext([this, log_entry, invalidating] + (GuardedRequestFunctionContext &guard_ctx) { + log_entry->m_cell = guard_ctx.cell; + Context *ctx = this->construct_flush_entry(log_entry, invalidating); + + if (!invalidating) { + ctx = new LambdaContext([this, log_entry, ctx](int r) { + m_image_ctx.op_work_queue->queue(new LambdaContext( + [this, log_entry, ctx](int r) { + ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry + << " " << *log_entry << dendl; + log_entry->writeback(this->m_image_writeback, ctx); + }), 0); + }); + } + ctx->complete(0); + }); + this->detain_flush_guard_request(log_entry, guarded_ctx); + } + } else { + int count = 
entries_to_flush.size(); + std::vector> write_entries; + std::vector read_bls; + + write_entries.reserve(count); + read_bls.reserve(count); + + for (auto &log_entry : entries_to_flush) { + if (log_entry->is_write_entry()) { + bufferlist *bl = new bufferlist; + auto write_entry = static_pointer_cast(log_entry); + write_entry->inc_bl_refs(); + write_entries.push_back(write_entry); + read_bls.push_back(bl); + } + } + + Context *ctx = new LambdaContext( + [this, entries_to_flush, read_bls](int r) { + int i = 0; + GuardedRequestFunctionContext *guarded_ctx = nullptr; + + for (auto &log_entry : entries_to_flush) { + if (log_entry->is_write_entry()) { + bufferlist captured_entry_bl; + captured_entry_bl.claim_append(*read_bls[i]); + delete read_bls[i++]; + + guarded_ctx = new GuardedRequestFunctionContext([this, log_entry, captured_entry_bl] + (GuardedRequestFunctionContext &guard_ctx) { + log_entry->m_cell = guard_ctx.cell; + Context *ctx = this->construct_flush_entry(log_entry, false); + + m_image_ctx.op_work_queue->queue(new LambdaContext( + [this, log_entry, entry_bl=std::move(captured_entry_bl), ctx](int r) { + auto captured_entry_bl = std::move(entry_bl); + ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry + << " " << *log_entry << dendl; + log_entry->writeback_bl(this->m_image_writeback, ctx, + std::move(captured_entry_bl)); + }), 0); + }); + } else { + guarded_ctx = new GuardedRequestFunctionContext([this, log_entry] + (GuardedRequestFunctionContext &guard_ctx) { + log_entry->m_cell = guard_ctx.cell; + Context *ctx = this->construct_flush_entry(log_entry, false); + m_image_ctx.op_work_queue->queue(new LambdaContext( + [this, log_entry, ctx](int r) { + ldout(m_image_ctx.cct, 15) << "flushing:" << log_entry + << " " << *log_entry << dendl; + log_entry->writeback(this->m_image_writeback, ctx); + }), 0); + }); + } + this->detain_flush_guard_request(log_entry, guarded_ctx); + } + }); + + aio_read_data_blocks(write_entries, read_bls, ctx); + } +} + +template +void WriteLog::process_work() { + CephContext *cct = m_image_ctx.cct; + int max_iterations = 4; + bool wake_up_requested = false; + uint64_t aggressive_high_water_bytes = + this->m_bytes_allocated_cap * AGGRESSIVE_RETIRE_HIGH_WATER; + uint64_t high_water_bytes = this->m_bytes_allocated_cap * RETIRE_HIGH_WATER; + + ldout(cct, 20) << dendl; + + do { + { + std::lock_guard locker(m_lock); + this->m_wake_up_requested = false; + } + if (this->m_alloc_failed_since_retire || (this->m_shutting_down) || + this->m_invalidating || m_bytes_allocated > high_water_bytes) { + ldout(m_image_ctx.cct, 10) << "alloc_fail=" << this->m_alloc_failed_since_retire + << ", allocated > high_water=" + << (m_bytes_allocated > high_water_bytes) + << dendl; + retire_entries((this->m_shutting_down || this->m_invalidating || + m_bytes_allocated > aggressive_high_water_bytes) + ? MAX_ALLOC_PER_TRANSACTION : MAX_FREE_PER_TRANSACTION); + } + this->dispatch_deferred_writes(); + this->process_writeback_dirty_entries(); + { + std::lock_guard locker(m_lock); + wake_up_requested = this->m_wake_up_requested; + } + } while (wake_up_requested && --max_iterations > 0); + + { + std::lock_guard locker(m_lock); + this->m_wake_up_scheduled = false; + // Reschedule if it's still requested + if (this->m_wake_up_requested) { + this->wake_up(); + } + } +} + +/** + * Retire up to MAX_ALLOC_PER_TRANSACTION of the oldest log entries + * that are eligible to be retired. Returns true if anything was + * retired. 
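+ * Entries are only retired in whole control-block spans: a control block
+ * and the data buffers recorded in it share ring space, so they must be
+ * freed together.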
+ * +*/ +template +bool WriteLog::retire_entries(const unsigned long int frees_per_tx) { + CephContext *cct = m_image_ctx.cct; + GenericLogEntriesVector retiring_entries; + uint64_t initial_first_valid_entry; + uint64_t first_valid_entry; + + std::lock_guard retire_locker(this->m_log_retire_lock); + ldout(cct, 20) << "Look for entries to retire" << dendl; + { + // Entry readers can't be added while we hold m_entry_reader_lock + RWLock::WLocker entry_reader_locker(this->m_entry_reader_lock); + std::lock_guard locker(m_lock); + initial_first_valid_entry = m_first_valid_entry; + first_valid_entry = m_first_valid_entry; + while (retiring_entries.size() < frees_per_tx && !m_log_entries.empty()) { + GenericLogEntriesVector retiring_subentries; + uint64_t control_block_pos = m_log_entries.front()->log_entry_index; + uint64_t data_length = 0; + for (auto it = m_log_entries.begin(); it != m_log_entries.end(); ++it) { + if (this->can_retire_entry(*it)) { + // log_entry_index is valid after appending to SSD + if ((*it)->log_entry_index != control_block_pos) { + ldout(cct, 20) << "Old log_entry_index is " << control_block_pos + << ",New log_entry_index is " + << (*it)->log_entry_index + << ",data length is " << data_length << dendl; + ldout(cct, 20) << "The log entry is " << *(*it) << dendl; + if ((*it)->log_entry_index < control_block_pos) { + ceph_assert((*it)->log_entry_index == + (control_block_pos + data_length + MIN_WRITE_ALLOC_SSD_SIZE) % + this->m_log_pool_size + DATA_RING_BUFFER_OFFSET); + } else { + ceph_assert((*it)->log_entry_index == control_block_pos + + data_length + MIN_WRITE_ALLOC_SSD_SIZE); + } + break; + } else { + retiring_subentries.push_back(*it); + if ((*it)->is_write_entry()) { + data_length += (*it)->get_aligned_data_size(); + } + } + } else { + retiring_subentries.clear(); + break; + } + } + // SSD: retiring_subentries in a span + if (!retiring_subentries.empty()) { + for (auto it = retiring_subentries.begin(); + it != retiring_subentries.end(); it++) { + ceph_assert(m_log_entries.front() == *it); + m_log_entries.pop_front(); + if ((*it)->write_bytes() > 0 || (*it)->bytes_dirty() > 0) { + auto gen_write_entry = static_pointer_cast(*it); + if (gen_write_entry) { + this->m_blocks_to_log_entries.remove_log_entry(gen_write_entry); + } + } + } + + ldout(cct, 20) << "span with " << retiring_subentries.size() + << " entries: control_block_pos=" << control_block_pos + << " data_length=" << data_length + << dendl; + retiring_entries.insert( + retiring_entries.end(), retiring_subentries.begin(), + retiring_subentries.end()); + + first_valid_entry = control_block_pos + data_length + + MIN_WRITE_ALLOC_SSD_SIZE; + if (first_valid_entry >= this->m_log_pool_size) { + first_valid_entry = first_valid_entry % this->m_log_pool_size + + DATA_RING_BUFFER_OFFSET; + } + } else { + break; + } + } + } + if (retiring_entries.size()) { + ldout(cct, 20) << "Retiring " << retiring_entries.size() << " entries" + << dendl; + + // Advance first valid entry and release buffers + uint64_t flushed_sync_gen; + std::lock_guard append_locker(this->m_log_append_lock); + { + std::lock_guard locker(m_lock); + flushed_sync_gen = this->m_flushed_sync_gen; + } + + ceph_assert(first_valid_entry != initial_first_valid_entry); + auto new_root = std::make_shared(pool_root); + new_root->flushed_sync_gen = flushed_sync_gen; + new_root->first_valid_entry = first_valid_entry; + pool_root.flushed_sync_gen = flushed_sync_gen; + pool_root.first_valid_entry = first_valid_entry; + + Context *ctx = new LambdaContext( + [this, 
first_valid_entry, initial_first_valid_entry, + retiring_entries](int r) { + uint64_t allocated_bytes = 0; + uint64_t cached_bytes = 0; + uint64_t former_log_pos = 0; + for (auto &entry : retiring_entries) { + ceph_assert(entry->log_entry_index != 0); + if (entry->log_entry_index != former_log_pos ) { + // Space for control blocks + allocated_bytes += MIN_WRITE_ALLOC_SSD_SIZE; + former_log_pos = entry->log_entry_index; + } + if (entry->is_write_entry()) { + cached_bytes += entry->write_bytes(); + // space for userdata + allocated_bytes += entry->get_aligned_data_size(); + } + } + bool need_update_state = false; + { + std::lock_guard locker(m_lock); + m_first_valid_entry = first_valid_entry; + ceph_assert(m_first_valid_entry % MIN_WRITE_ALLOC_SSD_SIZE == 0); + ceph_assert(this->m_bytes_allocated >= allocated_bytes); + this->m_bytes_allocated -= allocated_bytes; + ceph_assert(this->m_bytes_cached >= cached_bytes); + this->m_bytes_cached -= cached_bytes; + if (!m_cache_state->empty && m_log_entries.empty()) { + m_cache_state->empty = true; + this->update_image_cache_state(); + need_update_state = true; + } + + ldout(m_image_ctx.cct, 20) + << "Finished root update: initial_first_valid_entry=" + << initial_first_valid_entry << ", m_first_valid_entry=" + << m_first_valid_entry << ", release space = " + << allocated_bytes << ", m_bytes_allocated=" + << m_bytes_allocated << ", release cached space=" + << cached_bytes << ", m_bytes_cached=" + << this->m_bytes_cached << dendl; + + this->m_alloc_failed_since_retire = false; + this->wake_up(); + } + if (need_update_state) { + std::unique_lock locker(m_lock); + this->write_image_cache_state(locker); + } + + this->dispatch_deferred_writes(); + this->process_writeback_dirty_entries(); + m_async_update_superblock--; + this->m_async_op_tracker.finish_op(); + }); + + std::lock_guard locker(m_lock); + schedule_update_root(new_root, ctx); + } else { + ldout(cct, 20) << "Nothing to retire" << dendl; + return false; + } + return true; +} + +template +void WriteLog::append_ops(GenericLogOperations &ops, Context *ctx, + uint64_t* new_first_free_entry) { + GenericLogEntriesVector log_entries; + CephContext *cct = m_image_ctx.cct; + uint64_t span_payload_len = 0; + uint64_t bytes_to_free = 0; + ldout(cct, 20) << "Appending " << ops.size() << " log entries." 
<< dendl; + + *new_first_free_entry = pool_root.first_free_entry; + AioTransContext* aio = new AioTransContext(cct, ctx); + + utime_t now = ceph_clock_now(); + for (auto &operation : ops) { + operation->log_append_start_time = now; + auto log_entry = operation->get_log_entry(); + + if (log_entries.size() == CONTROL_BLOCK_MAX_LOG_ENTRIES || + span_payload_len >= SPAN_MAX_DATA_LEN) { + if (log_entries.size() > 1) { + bytes_to_free += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE; + } + write_log_entries(log_entries, aio, new_first_free_entry); + log_entries.clear(); + span_payload_len = 0; + } + log_entries.push_back(log_entry); + span_payload_len += log_entry->write_bytes(); + } + if (!span_payload_len || !log_entries.empty()) { + if (log_entries.size() > 1) { + bytes_to_free += (log_entries.size() - 1) * MIN_WRITE_ALLOC_SSD_SIZE; + } + write_log_entries(log_entries, aio, new_first_free_entry); + } + + { + std::lock_guard locker1(m_lock); + m_first_free_entry = *new_first_free_entry; + m_bytes_allocated -= bytes_to_free; + } + + bdev->aio_submit(&aio->ioc); +} + +template +void WriteLog::write_log_entries(GenericLogEntriesVector log_entries, + AioTransContext *aio, uint64_t *pos) { + CephContext *cct = m_image_ctx.cct; + ldout(m_image_ctx.cct, 20) << "pos=" << *pos << dendl; + ceph_assert(*pos >= DATA_RING_BUFFER_OFFSET && + *pos < this->m_log_pool_size && + *pos % MIN_WRITE_ALLOC_SSD_SIZE == 0); + + // The first block is for log entries + uint64_t control_block_pos = *pos; + *pos += MIN_WRITE_ALLOC_SSD_SIZE; + if (*pos == this->m_log_pool_size) { + *pos = DATA_RING_BUFFER_OFFSET; + } + + std::vector persist_log_entries; + bufferlist data_bl; + for (auto &log_entry : log_entries) { + log_entry->log_entry_index = control_block_pos; + // Append data buffer for write operations + if (log_entry->is_write_entry()) { + auto write_entry = static_pointer_cast(log_entry); + auto cache_bl = write_entry->get_cache_bl(); + auto align_size = write_entry->get_aligned_data_size(); + data_bl.append(cache_bl); + data_bl.append_zero(align_size - cache_bl.length()); + + write_entry->ram_entry.write_data_pos = *pos; + *pos += align_size; + if (*pos >= this->m_log_pool_size) { + *pos = *pos % this->m_log_pool_size + DATA_RING_BUFFER_OFFSET; + } + } + // push_back _after_ setting write_data_pos + persist_log_entries.push_back(log_entry->ram_entry); + } + + //aio write + bufferlist bl; + encode(persist_log_entries, bl); + ceph_assert(bl.length() <= MIN_WRITE_ALLOC_SSD_SIZE); + bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length()); + bl.append(data_bl); + ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0); + if (control_block_pos + bl.length() > this->m_log_pool_size) { + //exceeds border, need to split + uint64_t size = bl.length(); + bufferlist bl1; + bl.splice(0, this->m_log_pool_size - control_block_pos, &bl1); + ceph_assert(bl.length() == (size - bl1.length())); + + ldout(cct, 20) << "write " << control_block_pos << "~" + << size << " spans boundary, split into " + << control_block_pos << "~" << bl1.length() + << " and " << DATA_RING_BUFFER_OFFSET << "~" + << bl.length() << dendl; + bdev->aio_write(control_block_pos, bl1, &aio->ioc, false, + WRITE_LIFE_NOT_SET); + bdev->aio_write(DATA_RING_BUFFER_OFFSET, bl, &aio->ioc, false, + WRITE_LIFE_NOT_SET); + } else { + ldout(cct, 20) << "write " << control_block_pos << "~" + << bl.length() << dendl; + bdev->aio_write(control_block_pos, bl, &aio->ioc, false, + WRITE_LIFE_NOT_SET); + } +} + +template +void WriteLog::schedule_update_root( + std::shared_ptr 
root, Context *ctx) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 15) << "New root: pool_size=" << root->pool_size + << " first_valid_entry=" << root->first_valid_entry + << " first_free_entry=" << root->first_free_entry + << " flushed_sync_gen=" << root->flushed_sync_gen + << dendl; + ceph_assert(is_valid_pool_root(*root)); + + bool need_finisher; + { + ceph_assert(ceph_mutex_is_locked_by_me(m_lock)); + need_finisher = m_poolroot_to_update.empty() && !m_updating_pool_root; + std::shared_ptr entry = + std::make_shared(root, ctx); + this->m_async_update_superblock++; + this->m_async_op_tracker.start_op(); + m_poolroot_to_update.emplace_back(entry); + } + if (need_finisher) { + enlist_op_update_root(); + } +} + +template +void WriteLog::enlist_op_update_root() { + Context *append_ctx = new LambdaContext([this](int r) { + update_root_scheduled_ops(); + }); + this->m_work_queue.queue(append_ctx); +} + +template +void WriteLog::update_root_scheduled_ops() { + ldout(m_image_ctx.cct, 20) << dendl; + + std::shared_ptr root; + WriteLogPoolRootUpdateList root_updates; + Context *ctx = nullptr; + { + std::lock_guard locker(m_lock); + if (m_updating_pool_root) { + /* Another thread is appending */ + ldout(m_image_ctx.cct, 15) << "Another thread is updating pool root" + << dendl; + return; + } + if (m_poolroot_to_update.size()) { + m_updating_pool_root = true; + root_updates.swap(m_poolroot_to_update); + } + } + ceph_assert(!root_updates.empty()); + ldout(m_image_ctx.cct, 15) << "Update root number: " << root_updates.size() + << dendl; + // We just update the last one, and call all the completions. + auto entry = root_updates.back(); + root = entry->root; + + ctx = new LambdaContext([this, updates = std::move(root_updates)](int r) { + ldout(m_image_ctx.cct, 15) << "Start to callback." << dendl; + for (auto it = updates.begin(); it != updates.end(); it++) { + Context *it_ctx = (*it)->ctx; + it_ctx->complete(r); + } + }); + Context *append_ctx = new LambdaContext([this, ctx](int r) { + ldout(m_image_ctx.cct, 15) << "Finish the update of pool root." 
<< dendl; + bool need_finisher = false; + assert(r == 0); + { + std::lock_guard locker(m_lock); + m_updating_pool_root = false; + need_finisher = !m_poolroot_to_update.empty(); + } + if (need_finisher) { + enlist_op_update_root(); + } + ctx->complete(r); + }); + AioTransContext* aio = new AioTransContext(m_image_ctx.cct, append_ctx); + update_pool_root(root, aio); +} + +template +void WriteLog::update_pool_root(std::shared_ptr root, + AioTransContext *aio) { + bufferlist bl; + SuperBlock superblock; + superblock.root = *root; + encode(superblock, bl); + bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length()); + ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0); + bdev->aio_write(0, bl, &aio->ioc, false, WRITE_LIFE_NOT_SET); + bdev->aio_submit(&aio->ioc); +} + +template +int WriteLog::update_pool_root_sync( + std::shared_ptr root) { + bufferlist bl; + SuperBlock superblock; + superblock.root = *root; + encode(superblock, bl); + bl.append_zero(MIN_WRITE_ALLOC_SSD_SIZE - bl.length()); + ceph_assert(bl.length() % MIN_WRITE_ALLOC_SSD_SIZE == 0); + return bdev->write(0, bl, false); +} + +template +void WriteLog::aio_read_data_block(std::shared_ptr log_entry, + bufferlist *bl, Context *ctx) { + std::vector> log_entries = {std::move(log_entry)}; + std::vector bls {bl}; + aio_read_data_blocks(log_entries, bls, ctx); +} + +template +void WriteLog::aio_read_data_blocks( + std::vector> &log_entries, + std::vector &bls, Context *ctx) { + ceph_assert(log_entries.size() == bls.size()); + + //get the valid part + Context *read_ctx = new LambdaContext( + [log_entries, bls, ctx](int r) { + for (unsigned int i = 0; i < log_entries.size(); i++) { + bufferlist valid_data_bl; + auto write_entry = static_pointer_cast(log_entries[i]); + auto length = write_entry->ram_entry.is_write() ? write_entry->ram_entry.write_bytes + : write_entry->ram_entry.ws_datalen; + + valid_data_bl.substr_of(*bls[i], 0, length); + bls[i]->clear(); + bls[i]->append(valid_data_bl); + write_entry->dec_bl_refs(); + } + ctx->complete(r); + }); + + CephContext *cct = m_image_ctx.cct; + AioTransContext *aio = new AioTransContext(cct, read_ctx); + for (unsigned int i = 0; i < log_entries.size(); i++) { + WriteLogCacheEntry *log_entry = &log_entries[i]->ram_entry; + + ceph_assert(log_entry->is_write() || log_entry->is_writesame()); + uint64_t len = log_entry->is_write() ? 
log_entry->write_bytes : + log_entry->ws_datalen; + uint64_t align_len = round_up_to(len, MIN_WRITE_ALLOC_SSD_SIZE); + + ldout(cct, 20) << "entry i=" << i << " " << log_entry->write_data_pos + << "~" << len << dendl; + ceph_assert(log_entry->write_data_pos >= DATA_RING_BUFFER_OFFSET && + log_entry->write_data_pos < pool_root.pool_size); + ceph_assert(align_len); + if (log_entry->write_data_pos + align_len > pool_root.pool_size) { + // spans boundary, need to split + uint64_t len1 = pool_root.pool_size - log_entry->write_data_pos; + uint64_t len2 = align_len - len1; + + ldout(cct, 20) << "read " << log_entry->write_data_pos << "~" + << align_len << " spans boundary, split into " + << log_entry->write_data_pos << "~" << len1 + << " and " << DATA_RING_BUFFER_OFFSET << "~" + << len2 << dendl; + bdev->aio_read(log_entry->write_data_pos, len1, bls[i], &aio->ioc); + bdev->aio_read(DATA_RING_BUFFER_OFFSET, len2, bls[i], &aio->ioc); + } else { + ldout(cct, 20) << "read " << log_entry->write_data_pos << "~" + << align_len << dendl; + bdev->aio_read(log_entry->write_data_pos, align_len, bls[i], &aio->ioc); + } + } + bdev->aio_submit(&aio->ioc); +} + +template +void WriteLog::complete_user_request(Context *&user_req, int r) { + m_image_ctx.op_work_queue->queue(user_req, r); +} + +} // namespace ssd +} // namespace pwl +} // namespace cache +} // namespace librbd + +template class librbd::cache::pwl::ssd::WriteLog; diff --git a/src/librbd/cache/pwl/ssd/WriteLog.h b/src/librbd/cache/pwl/ssd/WriteLog.h new file mode 100644 index 000000000..69cc36662 --- /dev/null +++ b/src/librbd/cache/pwl/ssd/WriteLog.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG +#define CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG + +#include "blk/BlockDevice.h" +#include "common/AsyncOpTracker.h" +#include "common/Checksummer.h" +#include "common/environment.h" +#include "common/RWLock.h" +#include "common/WorkQueue.h" +#include "librbd/BlockGuard.h" +#include "librbd/Utils.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/cache/Types.h" +#include "librbd/cache/pwl/AbstractWriteLog.h" +#include "librbd/cache/pwl/LogMap.h" +#include "librbd/cache/pwl/LogOperation.h" +#include "librbd/cache/pwl/Request.h" +#include "librbd/cache/pwl/ssd/Builder.h" +#include "librbd/cache/pwl/ssd/Types.h" +#include +#include + +namespace librbd { + +struct ImageCtx; + +namespace cache { +namespace pwl { +namespace ssd { + +template +class WriteLog : public AbstractWriteLog { +public: + WriteLog(ImageCtxT &image_ctx, + librbd::cache::pwl::ImageCacheState* cache_state, + cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api); + ~WriteLog(); + WriteLog(const WriteLog&) = delete; + WriteLog &operator=(const WriteLog&) = delete; + + typedef io::Extent Extent; + using This = AbstractWriteLog; + using C_BlockIORequestT = pwl::C_BlockIORequest; + using C_WriteRequestT = pwl::C_WriteRequest; + using C_WriteSameRequestT = pwl::C_WriteSameRequest; + + bool alloc_resources(C_BlockIORequestT *req) override; + void setup_schedule_append( + pwl::GenericLogOperationsVector &ops, bool do_early_flush, + C_BlockIORequestT *req) override; + void complete_user_request(Context *&user_req, int r) override; + +protected: + using AbstractWriteLog::m_lock; + using AbstractWriteLog::m_log_entries; + using AbstractWriteLog::m_image_ctx; + using AbstractWriteLog::m_cache_state; + using AbstractWriteLog::m_first_free_entry; + using 
AbstractWriteLog<ImageCtxT>::m_first_valid_entry;
+  using AbstractWriteLog<ImageCtxT>::m_bytes_allocated;
+
+  bool initialize_pool(Context *on_finish,
+                       pwl::DeferredContexts &later) override;
+  void process_work() override;
+  void append_scheduled_ops(void) override;
+  void schedule_append_ops(pwl::GenericLogOperations &ops,
+                           C_BlockIORequestT *req) override;
+  void remove_pool_file() override;
+  void release_ram(std::shared_ptr<GenericLogEntry> log_entry) override;
+
+private:
+  class AioTransContext {
+  public:
+    Context *on_finish;
+    ::IOContext ioc;
+    explicit AioTransContext(CephContext* cct, Context *cb)
+      : on_finish(cb), ioc(cct, this) {}
+
+    ~AioTransContext() {}
+
+    void aio_finish() {
+      on_finish->complete(ioc.get_return_value());
+      delete this;
+    }
+  }; // class AioTransContext
+
+  struct WriteLogPoolRootUpdate {
+    std::shared_ptr<pwl::WriteLogPoolRoot> root;
+    Context *ctx;
+    WriteLogPoolRootUpdate(std::shared_ptr<pwl::WriteLogPoolRoot> r,
+                           Context* c)
+      : root(r), ctx(c) {}
+  };
+
+  using WriteLogPoolRootUpdateList =
+    std::list<std::shared_ptr<WriteLogPoolRootUpdate>>;
+  WriteLogPoolRootUpdateList m_poolroot_to_update; /* pool root list to update to SSD */
+  bool m_updating_pool_root = false;
+
+  std::atomic<int> m_async_update_superblock = {0};
+  BlockDevice *bdev = nullptr;
+  pwl::WriteLogPoolRoot pool_root;
+  Builder<This> *m_builderobj;
+
+  Builder<This>* create_builder();
+  int create_and_open_bdev();
+  void load_existing_entries(pwl::DeferredContexts &later);
+  void inc_allocated_cached_bytes(
+      std::shared_ptr<pwl::GenericLogEntry> log_entry) override;
+  void collect_read_extents(
+      uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
+      std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
+      Extent hit_extent, pwl::C_ReadRequest *read_ctx) override;
+  void complete_read(
+      std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
+      std::vector<bufferlist*> &bls_to_read, Context *ctx) override;
+  void enlist_op_appender();
+  bool retire_entries(const unsigned long int frees_per_tx);
+  bool has_sync_point_logs(GenericLogOperations &ops);
+  void append_op_log_entries(GenericLogOperations &ops);
+  void alloc_op_log_entries(GenericLogOperations &ops);
+  void construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
+                               DeferredContexts &post_unlock,
+                               bool has_write_entry) override;
+  void append_ops(GenericLogOperations &ops, Context *ctx,
+                  uint64_t* new_first_free_entry);
+  void write_log_entries(GenericLogEntriesVector log_entries,
+                         AioTransContext *aio, uint64_t *pos);
+  void schedule_update_root(std::shared_ptr<pwl::WriteLogPoolRoot> root,
+                            Context *ctx);
+  void enlist_op_update_root();
+  void update_root_scheduled_ops();
+  int update_pool_root_sync(std::shared_ptr<pwl::WriteLogPoolRoot> root);
+  void update_pool_root(std::shared_ptr<pwl::WriteLogPoolRoot> root,
+                        AioTransContext *aio);
+  void aio_read_data_block(std::shared_ptr<GenericWriteLogEntry> log_entry,
+                           bufferlist *bl, Context *ctx);
+  void aio_read_data_blocks(
+      std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries,
+      std::vector<bufferlist*> &bls, Context *ctx);
+  static void aio_cache_cb(void *priv, void *priv2) {
+    AioTransContext *c = static_cast<AioTransContext*>(priv2);
+    c->aio_finish();
+  }
+}; // class WriteLog
+
+} // namespace ssd
+} // namespace pwl
+} // namespace cache
+} // namespace librbd
+
+extern template class librbd::cache::pwl::ssd::WriteLog<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CACHE_PWL_SSD_WRITE_LOG
diff --git a/src/librbd/crypto/BlockCrypto.cc b/src/librbd/crypto/BlockCrypto.cc
new file mode 100644
index 000000000..cce90d2e3
--- /dev/null
+++ b/src/librbd/crypto/BlockCrypto.cc
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/crypto/BlockCrypto.h"
+#include "include/byteorder.h"
+#include "include/ceph_assert.h"
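+// scope_guard (included below) is what returns the cipher context to the
+// DataCryptor on every exit path of crypt(), including early error returns.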
+#include "include/scope_guard.h" + +#include +#include + +namespace librbd { +namespace crypto { + +template +BlockCrypto::BlockCrypto(CephContext* cct, DataCryptor* data_cryptor, + uint64_t block_size, uint64_t data_offset) + : m_cct(cct), m_data_cryptor(data_cryptor), m_block_size(block_size), + m_data_offset(data_offset), m_iv_size(data_cryptor->get_iv_size()) { + ceph_assert(std::has_single_bit(block_size)); + ceph_assert((block_size % data_cryptor->get_block_size()) == 0); + ceph_assert((block_size % 512) == 0); +} + +template +BlockCrypto::~BlockCrypto() { + if (m_data_cryptor != nullptr) { + delete m_data_cryptor; + m_data_cryptor = nullptr; + } +} + +template +int BlockCrypto::crypt(ceph::bufferlist* data, uint64_t image_offset, + CipherMode mode) { + if (image_offset % m_block_size != 0) { + lderr(m_cct) << "image offset: " << image_offset + << " not aligned to block size: " << m_block_size << dendl; + return -EINVAL; + } + if (data->length() % m_block_size != 0) { + lderr(m_cct) << "data length: " << data->length() + << " not aligned to block size: " << m_block_size << dendl; + return -EINVAL; + } + + unsigned char* iv = (unsigned char*)alloca(m_iv_size); + memset(iv, 0, m_iv_size); + + bufferlist src = *data; + data->clear(); + + auto ctx = m_data_cryptor->get_context(mode); + if (ctx == nullptr) { + lderr(m_cct) << "unable to get crypt context" << dendl; + return -EIO; + } + + auto sg = make_scope_guard([&] { + m_data_cryptor->return_context(ctx, mode); }); + + auto sector_number = image_offset / 512; + auto appender = data->get_contiguous_appender(src.length()); + unsigned char* out_buf_ptr = nullptr; + unsigned char* leftover_block = (unsigned char*)alloca(m_block_size); + uint32_t leftover_size = 0; + for (auto buf = src.buffers().begin(); buf != src.buffers().end(); ++buf) { + auto in_buf_ptr = reinterpret_cast(buf->c_str()); + auto remaining_buf_bytes = buf->length(); + while (remaining_buf_bytes > 0) { + if (leftover_size == 0) { + auto block_offset_le = ceph_le64(sector_number); + memcpy(iv, &block_offset_le, sizeof(block_offset_le)); + auto r = m_data_cryptor->init_context(ctx, iv, m_iv_size); + if (r != 0) { + lderr(m_cct) << "unable to init cipher's IV" << dendl; + return r; + } + + out_buf_ptr = reinterpret_cast( + appender.get_pos_add(m_block_size)); + sector_number += m_block_size / 512; + } + + if (leftover_size > 0 || remaining_buf_bytes < m_block_size) { + auto copy_size = std::min( + (uint32_t)m_block_size - leftover_size, remaining_buf_bytes); + memcpy(leftover_block + leftover_size, in_buf_ptr, copy_size); + in_buf_ptr += copy_size; + leftover_size += copy_size; + remaining_buf_bytes -= copy_size; + } + + int crypto_output_length = 0; + if (leftover_size == 0) { + crypto_output_length = m_data_cryptor->update_context( + ctx, in_buf_ptr, out_buf_ptr, m_block_size); + + in_buf_ptr += m_block_size; + remaining_buf_bytes -= m_block_size; + } else if (leftover_size == m_block_size) { + crypto_output_length = m_data_cryptor->update_context( + ctx, leftover_block, out_buf_ptr, m_block_size); + leftover_size = 0; + } + + if (crypto_output_length < 0) { + lderr(m_cct) << "crypt update failed" << dendl; + return crypto_output_length; + } + + out_buf_ptr += crypto_output_length; + } + } + + return 0; +} + +template +int BlockCrypto::encrypt(ceph::bufferlist* data, uint64_t image_offset) { + return crypt(data, image_offset, CipherMode::CIPHER_MODE_ENC); +} + +template +int BlockCrypto::decrypt(ceph::bufferlist* data, uint64_t image_offset) { + return crypt(data, 
image_offset, CipherMode::CIPHER_MODE_DEC); +} + +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::BlockCrypto; diff --git a/src/librbd/crypto/BlockCrypto.h b/src/librbd/crypto/BlockCrypto.h new file mode 100644 index 000000000..0bbdd2524 --- /dev/null +++ b/src/librbd/crypto/BlockCrypto.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_BLOCK_CRYPTO_H +#define CEPH_LIBRBD_CRYPTO_BLOCK_CRYPTO_H + +#include "include/Context.h" +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/crypto/openssl/DataCryptor.h" + +namespace librbd { +namespace crypto { + +template +class BlockCrypto : public CryptoInterface { + +public: + static BlockCrypto* create(CephContext* cct, DataCryptor* data_cryptor, + uint32_t block_size, uint64_t data_offset) { + return new BlockCrypto(cct, data_cryptor, block_size, data_offset); + } + BlockCrypto(CephContext* cct, DataCryptor* data_cryptor, + uint64_t block_size, uint64_t data_offset); + ~BlockCrypto(); + + int encrypt(ceph::bufferlist* data, uint64_t image_offset) override; + int decrypt(ceph::bufferlist* data, uint64_t image_offset) override; + + uint64_t get_block_size() const override { + return m_block_size; + } + + uint64_t get_data_offset() const override { + return m_data_offset; + } + + const unsigned char* get_key() const override { + return m_data_cryptor->get_key(); + } + + int get_key_length() const override { + return m_data_cryptor->get_key_length(); + } + +private: + CephContext* m_cct; + DataCryptor* m_data_cryptor; + uint64_t m_block_size; + uint64_t m_data_offset; + uint32_t m_iv_size; + + int crypt(ceph::bufferlist* data, uint64_t image_offset, CipherMode mode); +}; + +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::BlockCrypto; + +#endif //CEPH_LIBRBD_CRYPTO_BLOCK_CRYPTO_H diff --git a/src/librbd/crypto/CryptoContextPool.cc b/src/librbd/crypto/CryptoContextPool.cc new file mode 100644 index 000000000..b303a54ec --- /dev/null +++ b/src/librbd/crypto/CryptoContextPool.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/crypto/CryptoContextPool.h" + +namespace librbd { +namespace crypto { + +template +CryptoContextPool::CryptoContextPool(DataCryptor* data_cryptor, + uint32_t pool_size) + : m_data_cryptor(data_cryptor), m_encrypt_contexts(pool_size), + m_decrypt_contexts(pool_size) { +} + +template +CryptoContextPool::~CryptoContextPool() { + T* ctx; + while (m_encrypt_contexts.pop(ctx)) { + m_data_cryptor->return_context(ctx, CipherMode::CIPHER_MODE_ENC); + } + while (m_decrypt_contexts.pop(ctx)) { + m_data_cryptor->return_context(ctx, CipherMode::CIPHER_MODE_DEC); + } +} + +template +T* CryptoContextPool::get_context(CipherMode mode) { + T* ctx; + if (!get_contexts(mode).pop(ctx)) { + ctx = m_data_cryptor->get_context(mode); + } + return ctx; +} + +template +void CryptoContextPool::return_context(T* ctx, CipherMode mode) { + if (!get_contexts(mode).push(ctx)) { + m_data_cryptor->return_context(ctx, mode); + } +} + +} // namespace crypto +} // namespace librbd diff --git a/src/librbd/crypto/CryptoContextPool.h b/src/librbd/crypto/CryptoContextPool.h new file mode 100644 index 000000000..00486dacd --- /dev/null +++ b/src/librbd/crypto/CryptoContextPool.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_LIBRBD_CRYPTO_CRYPTO_CONTEXT_POOL_H +#define CEPH_LIBRBD_CRYPTO_CRYPTO_CONTEXT_POOL_H + +#include "librbd/crypto/DataCryptor.h" +#include "include/ceph_assert.h" +#include + +namespace librbd { +namespace crypto { + +template +class CryptoContextPool : public DataCryptor { + +public: + CryptoContextPool(DataCryptor* data_cryptor, uint32_t pool_size); + ~CryptoContextPool(); + + T* get_context(CipherMode mode) override; + void return_context(T* ctx, CipherMode mode) override; + + inline uint32_t get_block_size() const override { + return m_data_cryptor->get_block_size(); + } + inline uint32_t get_iv_size() const override { + return m_data_cryptor->get_iv_size(); + } + inline int get_key_length() const override { + return m_data_cryptor->get_key_length(); + } + inline const unsigned char* get_key() const override { + return m_data_cryptor->get_key(); + } + inline int init_context(T* ctx, const unsigned char* iv, + uint32_t iv_length) const override { + return m_data_cryptor->init_context(ctx, iv, iv_length); + } + inline int update_context(T* ctx, const unsigned char* in, + unsigned char* out, + uint32_t len) const override { + return m_data_cryptor->update_context(ctx, in, out, len); + } + + using ContextQueue = boost::lockfree::queue; + +private: + DataCryptor* m_data_cryptor; + ContextQueue m_encrypt_contexts; + ContextQueue m_decrypt_contexts; + + inline ContextQueue& get_contexts(CipherMode mode) { + switch(mode) { + case CIPHER_MODE_ENC: + return m_encrypt_contexts; + case CIPHER_MODE_DEC: + return m_decrypt_contexts; + default: + ceph_assert(false); + } + } +}; + +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_CONTEXT_POOL_H diff --git a/src/librbd/crypto/CryptoImageDispatch.cc b/src/librbd/crypto/CryptoImageDispatch.cc new file mode 100644 index 000000000..4d4c360dc --- /dev/null +++ b/src/librbd/crypto/CryptoImageDispatch.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/crypto/CryptoImageDispatch.h" + +namespace librbd { +namespace crypto { + +CryptoImageDispatch::CryptoImageDispatch( + uint64_t data_offset) : m_data_offset(data_offset) { +} + +void CryptoImageDispatch::remap_to_physical(io::Extents& image_extents, + io::ImageArea area) { + switch (area) { + case io::ImageArea::DATA: + for (auto& [off, _] : image_extents) { + off += m_data_offset; + } + break; + case io::ImageArea::CRYPTO_HEADER: + // direct mapping + break; + default: + ceph_abort(); + } +} + +io::ImageArea CryptoImageDispatch::remap_to_logical( + io::Extents& image_extents) { + bool saw_data = false; + bool saw_crypto_header = false; + for (auto& [off, _] : image_extents) { + if (off >= m_data_offset) { + off -= m_data_offset; + saw_data = true; + } else { + saw_crypto_header = true; + } + } + if (saw_crypto_header) { + ceph_assert(!saw_data); + return io::ImageArea::CRYPTO_HEADER; + } + return io::ImageArea::DATA; +} + +} // namespace crypto +} // namespace librbd diff --git a/src/librbd/crypto/CryptoImageDispatch.h b/src/librbd/crypto/CryptoImageDispatch.h new file mode 100644 index 000000000..3ce658981 --- /dev/null +++ b/src/librbd/crypto/CryptoImageDispatch.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_CRYPTO_CRYPTO_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" + +namespace librbd { +namespace crypto { + +class 
CryptoImageDispatch : public io::ImageDispatchInterface { +public: + static CryptoImageDispatch* create(uint64_t data_offset) { + return new CryptoImageDispatch(data_offset); + } + CryptoImageDispatch(uint64_t data_offset); + + io::ImageDispatchLayer get_dispatch_layer() const override { + return io::IMAGE_DISPATCH_LAYER_CRYPTO; + } + + void shut_down(Context* on_finish) override { + on_finish->complete(0); + } + + bool read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool write_same( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool compare_and_write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool flush( + io::AioCompletion* aio_comp, io::FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool list_snaps( + io::AioCompletion* aio_comp, io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + + // called directly by ImageDispatcher + // TODO: hoist these out and remove CryptoImageDispatch since it's + // just a placeholder + void remap_to_physical(io::Extents& image_extents, io::ImageArea area); + io::ImageArea remap_to_logical(io::Extents& image_extents); + +private: + uint64_t m_data_offset; + +}; + +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_IMAGE_DISPATCH_H diff --git a/src/librbd/crypto/CryptoInterface.h b/src/librbd/crypto/CryptoInterface.h new file mode 100644 index 000000000..1145494a9 --- /dev/null +++ b/src/librbd/crypto/CryptoInterface.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_INTERFACE_H +#define CEPH_LIBRBD_CRYPTO_CRYPTO_INTERFACE_H + +#include "include/buffer.h" +#include "include/intarith.h" +#include "librbd/io/Types.h" + +namespace librbd { +namespace crypto { + +class CryptoInterface { + +public: + virtual ~CryptoInterface() = default; + + virtual int encrypt(ceph::bufferlist* data, uint64_t image_offset) = 0; + virtual int decrypt(ceph::bufferlist* data, uint64_t image_offset) = 0; + virtual uint64_t get_block_size() const = 0; + virtual uint64_t get_data_offset() const = 0; + virtual const unsigned char* get_key() const = 0; + virtual int get_key_length() const = 0; + + inline std::pair get_pre_and_post_align( + uint64_t off, uint64_t len) { + if (len == 0) { + return std::make_pair(0, 0); + } + auto block_size = get_block_size(); + return std::make_pair(p2phase(off, block_size), + p2nphase(off + len, block_size)); + } + + inline std::pair align(uint64_t off, uint64_t len) { + auto aligns = get_pre_and_post_align(off, len); + return std::make_pair(off - aligns.first, + len + aligns.first + aligns.second); + } + + inline bool is_aligned(uint64_t off, uint64_t len) { + auto aligns = get_pre_and_post_align(off, len); + return aligns.first == 0 && aligns.second == 0; + } + + inline bool is_aligned(const io::ReadExtents& extents) { + for (const auto& extent: extents) { + if (!is_aligned(extent.offset, extent.length)) { + return false; + } + } + return true; + } + + inline void align_extents(const io::ReadExtents& extents, + io::ReadExtents* aligned_extents) { + for (const auto& extent: extents) { + auto aligned = align(extent.offset, extent.length); + aligned_extents->emplace_back(aligned.first, aligned.second); + } + } + + inline int decrypt_aligned_extent(io::ReadExtent& extent, + uint64_t image_offset) { + if (extent.length == 0 || extent.bl.length() == 0) { + return 0; + } + + if (extent.extent_map.empty()) { + extent.extent_map.emplace_back(extent.offset, extent.bl.length()); + } + + ceph::bufferlist result_bl; + io::Extents result_extent_map; + + ceph::bufferlist curr_block_bl; + auto curr_offset = extent.offset; + auto curr_block_start_offset = curr_offset; + auto curr_block_end_offset = curr_offset; + + // this will add a final loop iteration for decrypting the last extent + extent.extent_map.emplace_back( + extent.offset + extent.length + get_block_size(), 0); + + for (auto [off, len]: extent.extent_map) { + auto [aligned_off, aligned_len] = align(off, len); + if (aligned_off > curr_block_end_offset) { + curr_block_bl.append_zero(curr_block_end_offset - curr_offset); + auto curr_block_length = curr_block_bl.length(); + if (curr_block_length > 0) { + auto r = decrypt( + &curr_block_bl, + image_offset + curr_block_start_offset - extent.offset); + if (r != 0) { + return r; + } + + curr_block_bl.splice(0, curr_block_length, &result_bl); + result_extent_map.emplace_back( + curr_block_start_offset, curr_block_length); + } + + curr_block_start_offset = aligned_off; + curr_block_end_offset = aligned_off + aligned_len; + curr_offset = aligned_off; + } + + curr_block_bl.append_zero(off - curr_offset); + extent.bl.splice(0, len, &curr_block_bl); + curr_offset = off + len; + curr_block_end_offset = aligned_off + aligned_len; + } + + extent.bl = std::move(result_bl); + extent.extent_map = std::move(result_extent_map); + + return 0; + } +}; + +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_INTERFACE_H diff --git a/src/librbd/crypto/CryptoObjectDispatch.cc 
b/src/librbd/crypto/CryptoObjectDispatch.cc new file mode 100644 index 000000000..6ba449099 --- /dev/null +++ b/src/librbd/crypto/CryptoObjectDispatch.cc @@ -0,0 +1,691 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/crypto/CryptoObjectDispatch.h" +#include "include/ceph_assert.h" +#include "include/neorados/RADOS.hpp" +#include "common/dout.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::CryptoObjectDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace crypto { + +using librbd::util::create_context_callback; +using librbd::util::data_object_name; + +template +uint64_t get_file_offset(I* image_ctx, uint64_t object_no, + uint64_t object_off) { + auto off = io::util::raw_to_area_offset( + *image_ctx, Striper::get_file_offset(image_ctx->cct, &image_ctx->layout, + object_no, object_off)); + ceph_assert(off.second == io::ImageArea::DATA); + return off.first; +} + +template +struct C_AlignedObjectReadRequest : public Context { + I* image_ctx; + CryptoInterface* crypto; + uint64_t object_no; + io::ReadExtents* extents; + IOContext io_context; + const ZTracer::Trace parent_trace; + uint64_t* version; + Context* on_finish; + io::ObjectDispatchSpec* req; + bool disable_read_from_parent; + + C_AlignedObjectReadRequest( + I* image_ctx, CryptoInterface* crypto, + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + Context* on_dispatched + ) : image_ctx(image_ctx), crypto(crypto), object_no(object_no), + extents(extents), io_context(io_context), + parent_trace(parent_trace), version(version), + on_finish(on_dispatched) { + disable_read_from_parent = + ((read_flags & io::READ_FLAG_DISABLE_READ_FROM_PARENT) != 0); + read_flags |= io::READ_FLAG_DISABLE_READ_FROM_PARENT; + + auto ctx = create_context_callback< + C_AlignedObjectReadRequest, + &C_AlignedObjectReadRequest::handle_read>(this); + + req = io::ObjectDispatchSpec::create_read( + image_ctx, io::OBJECT_DISPATCH_LAYER_CRYPTO, object_no, + extents, io_context, op_flags, read_flags, parent_trace, + version, ctx); + } + + void send() { + req->send(); + } + + void finish(int r) override { + ldout(image_ctx->cct, 20) << "aligned read r=" << r << dendl; + on_finish->complete(r); + } + + void handle_read(int r) { + auto cct = image_ctx->cct; + ldout(cct, 20) << "aligned read r=" << r << dendl; + if (r >= 0) { + r = 0; + for (auto& extent: *extents) { + auto crypto_ret = crypto->decrypt_aligned_extent( + extent, get_file_offset(image_ctx, object_no, extent.offset)); + if (crypto_ret != 0) { + ceph_assert(crypto_ret < 0); + r = crypto_ret; + break; + } + r += extent.length; + } + } + + if (r == -ENOENT && !disable_read_from_parent) { + io::util::read_parent( + image_ctx, object_no, extents, + io_context->read_snap().value_or(CEPH_NOSNAP), + parent_trace, this); + } else { + complete(r); + } + } +}; + +template +struct C_UnalignedObjectReadRequest : public Context { + CephContext* cct; + io::ReadExtents* extents; + Context* on_finish; + 
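+  // the requested extents expanded to crypto block alignment; the aligned
+  // read re-enters the CRYPTO layer so that it is decrypted, then is
+  // trimmed back to the caller's ranges in remove_alignment_data()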
io::ReadExtents aligned_extents; + io::ObjectDispatchSpec* req; + + C_UnalignedObjectReadRequest( + I* image_ctx, CryptoInterface* crypto, + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + Context* on_dispatched) : cct(image_ctx->cct), extents(extents), + on_finish(on_dispatched) { + crypto->align_extents(*extents, &aligned_extents); + + // send the aligned read back to get decrypted + req = io::ObjectDispatchSpec::create_read( + image_ctx, + io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO), + object_no, &aligned_extents, io_context, op_flags, read_flags, + parent_trace, version, this); + } + + void send() { + req->send(); + } + + void remove_alignment_data() { + for (uint64_t i = 0; i < extents->size(); ++i) { + auto& extent = (*extents)[i]; + auto& aligned_extent = aligned_extents[i]; + if (aligned_extent.extent_map.empty()) { + uint64_t cut_offset = extent.offset - aligned_extent.offset; + int64_t padding_count = + cut_offset + extent.length - aligned_extent.bl.length(); + if (padding_count > 0) { + aligned_extent.bl.append_zero(padding_count); + } + aligned_extent.bl.splice(cut_offset, extent.length, &extent.bl); + } else { + for (auto [off, len]: aligned_extent.extent_map) { + ceph::bufferlist tmp; + aligned_extent.bl.splice(0, len, &tmp); + + uint64_t bytes_to_skip = 0; + if (off < extent.offset) { + bytes_to_skip = extent.offset - off; + if (len <= bytes_to_skip) { + continue; + } + off += bytes_to_skip; + len -= bytes_to_skip; + } + + len = std::min(len, extent.offset + extent.length - off); + if (len == 0) { + continue; + } + + if (len > 0) { + tmp.splice(bytes_to_skip, len, &extent.bl); + extent.extent_map.emplace_back(off, len); + } + } + } + } + } + + void finish(int r) override { + ldout(cct, 20) << "unaligned read r=" << r << dendl; + if (r >= 0) { + remove_alignment_data(); + + r = 0; + for (auto& extent: *extents) { + r += extent.length; + } + } + on_finish->complete(r); + } +}; + +template +struct C_UnalignedObjectWriteRequest : public Context { + I* image_ctx; + CryptoInterface* crypto; + uint64_t object_no; + uint64_t object_off; + ceph::bufferlist data; + ceph::bufferlist cmp_data; + uint64_t* mismatch_offset; + IOContext io_context; + int op_flags; + int write_flags; + std::optional assert_version; + const ZTracer::Trace parent_trace; + int* object_dispatch_flags; + uint64_t* journal_tid; + Context* on_finish; + bool may_copyup; + ceph::bufferlist aligned_data; + io::ReadExtents extents; + uint64_t version; + C_UnalignedObjectReadRequest* read_req; + bool object_exists; + + C_UnalignedObjectWriteRequest( + I* image_ctx, CryptoInterface* crypto, + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + ceph::bufferlist&& cmp_data, uint64_t* mismatch_offset, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, Context* on_dispatched, bool may_copyup + ) : image_ctx(image_ctx), crypto(crypto), object_no(object_no), + object_off(object_off), data(data), cmp_data(cmp_data), + mismatch_offset(mismatch_offset), io_context(io_context), + op_flags(op_flags), write_flags(write_flags), + assert_version(assert_version), parent_trace(parent_trace), + object_dispatch_flags(object_dispatch_flags), + journal_tid(journal_tid), on_finish(on_dispatched), + may_copyup(may_copyup) { + // build read 
extents + auto [pre_align, post_align] = crypto->get_pre_and_post_align( + object_off, data.length()); + if (pre_align != 0) { + extents.emplace_back(object_off - pre_align, pre_align); + } + if (post_align != 0) { + extents.emplace_back(object_off + data.length(), post_align); + } + if (cmp_data.length() != 0) { + extents.emplace_back(object_off, cmp_data.length()); + } + + auto ctx = create_context_callback< + C_UnalignedObjectWriteRequest, + &C_UnalignedObjectWriteRequest::handle_read>(this); + + read_req = new C_UnalignedObjectReadRequest( + image_ctx, crypto, object_no, &extents, io_context, + 0, io::READ_FLAG_DISABLE_READ_FROM_PARENT, parent_trace, + &version, 0, ctx); + } + + void send() { + read_req->send(); + } + + bool check_cmp_data() { + if (cmp_data.length() == 0) { + return true; + } + + auto& cmp_extent = extents.back(); + io::util::unsparsify(image_ctx->cct, &cmp_extent.bl, + cmp_extent.extent_map, cmp_extent.offset, + cmp_extent.length); + + std::optional found_mismatch = std::nullopt; + + auto it1 = cmp_data.cbegin(); + auto it2 = cmp_extent.bl.cbegin(); + for (uint64_t idx = 0; idx < cmp_data.length(); ++idx) { + if (*it1 != *it2) { + found_mismatch = std::make_optional(idx); + break; + } + ++it1; + ++it2; + } + + extents.pop_back(); + + if (found_mismatch.has_value()) { + if (mismatch_offset != nullptr) { + *mismatch_offset = found_mismatch.value(); + } + complete(-EILSEQ); + return false; + } + + return true; + } + + bool check_create_exclusive() { + bool exclusive = + ((write_flags & io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0); + if (exclusive && object_exists) { + complete(-EEXIST); + return false; + } + return true; + } + + bool check_version() { + int r = 0; + if (assert_version.has_value()) { + if (!object_exists) { + r = -ENOENT; + } else if (assert_version.value() < version) { + r = -ERANGE; + } else if (assert_version.value() > version) { + r = -EOVERFLOW; + } + } + + if (r != 0) { + complete(r); + return false; + } + return true; + } + + void build_aligned_data() { + auto [pre_align, post_align] = crypto->get_pre_and_post_align( + object_off, data.length()); + if (pre_align != 0) { + auto &extent = extents.front(); + io::util::unsparsify(image_ctx->cct, &extent.bl, extent.extent_map, + extent.offset, extent.length); + extent.bl.splice(0, pre_align, &aligned_data); + } + aligned_data.append(data); + if (post_align != 0) { + auto &extent = extents.back(); + io::util::unsparsify(image_ctx->cct, &extent.bl, extent.extent_map, + extent.offset, extent.length); + extent.bl.splice(0, post_align, &aligned_data); + } + } + + void handle_copyup(int r) { + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + if (r < 0) { + complete(r); + } else { + restart_request(false); + } + } + + void handle_read(int r) { + ldout(image_ctx->cct, 20) << "unaligned write r=" << r << dendl; + + if (r == -ENOENT) { + if (may_copyup) { + auto ctx = create_context_callback< + C_UnalignedObjectWriteRequest, + &C_UnalignedObjectWriteRequest::handle_copyup>(this); + if (io::util::trigger_copyup( + image_ctx, object_no, io_context, ctx)) { + return; + } + delete ctx; + } + object_exists = false; + } else if (r < 0) { + complete(r); + return; + } else { + object_exists = true; + } + + if (!check_create_exclusive() || !check_version() || !check_cmp_data()) { + return; + } + + build_aligned_data(); + + auto aligned_off = crypto->align(object_off, data.length()).first; + auto new_write_flags = write_flags; + auto new_assert_version = std::make_optional(version); + if (!object_exists) { + 
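+      // the object was absent when the read-modify-write cycle began:
+      // create it exclusively instead of asserting a version, so a racing
+      // create is detected and the request is restarted in handle_write()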
new_write_flags |= io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE; + new_assert_version = std::nullopt; + } + + auto ctx = create_context_callback< + C_UnalignedObjectWriteRequest, + &C_UnalignedObjectWriteRequest::handle_write>(this); + + // send back aligned write back to get encrypted and committed + auto write_req = io::ObjectDispatchSpec::create_write( + image_ctx, + io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO), + object_no, aligned_off, std::move(aligned_data), io_context, + op_flags, new_write_flags, new_assert_version, + journal_tid == nullptr ? 0 : *journal_tid, parent_trace, ctx); + write_req->send(); + } + + void restart_request(bool may_copyup) { + auto req = new C_UnalignedObjectWriteRequest( + image_ctx, crypto, object_no, object_off, + std::move(data), std::move(cmp_data), + mismatch_offset, io_context, op_flags, write_flags, + assert_version, parent_trace, + object_dispatch_flags, journal_tid, this, may_copyup); + req->send(); + } + + void handle_write(int r) { + ldout(image_ctx->cct, 20) << "r=" << r << dendl; + bool exclusive = write_flags & io::OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE; + bool restart = false; + if (r == -ERANGE && !assert_version.has_value()) { + restart = true; + } else if (r == -EEXIST && !exclusive) { + restart = true; + } + + if (restart) { + restart_request(may_copyup); + } else { + complete(r); + } + } + + void finish(int r) override { + ldout(image_ctx->cct, 20) << "unaligned write r=" << r << dendl; + on_finish->complete(r); + } +}; + +template +CryptoObjectDispatch::CryptoObjectDispatch( + I* image_ctx, CryptoInterface* crypto) + : m_image_ctx(image_ctx), m_crypto(crypto) { + m_data_offset_object_no = Striper::get_num_objects(image_ctx->layout, + crypto->get_data_offset()); +} + +template +void CryptoObjectDispatch::shut_down(Context* on_finish) { + on_finish->complete(0); +} + +template +bool CryptoObjectDispatch::read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + if (object_no < m_data_offset_object_no) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << *extents << dendl; + ceph_assert(m_crypto != nullptr); + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + if (m_crypto->is_aligned(*extents)) { + auto req = new C_AlignedObjectReadRequest( + m_image_ctx, m_crypto, object_no, extents, io_context, + op_flags, read_flags, parent_trace, version, object_dispatch_flags, + on_dispatched); + req->send(); + } else { + auto req = new C_UnalignedObjectReadRequest( + m_image_ctx, m_crypto, object_no, extents, io_context, + op_flags, read_flags, parent_trace, version, object_dispatch_flags, + on_dispatched); + req->send(); + } + + return true; +} + +template +bool CryptoObjectDispatch::write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (object_no < m_data_offset_object_no) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << data.length() << dendl; + 
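+  // block-aligned writes are encrypted in place and continue down the
+  // dispatch chain; unaligned writes fall back to a read-modify-write
+  // cycle through C_UnalignedObjectWriteRequest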
ceph_assert(m_crypto != nullptr); + + if (m_crypto->is_aligned(object_off, data.length())) { + auto r = m_crypto->encrypt( + &data, get_file_offset(m_image_ctx, object_no, object_off)); + *dispatch_result = r == 0 ? io::DISPATCH_RESULT_CONTINUE + : io::DISPATCH_RESULT_COMPLETE; + on_dispatched->complete(r); + } else { + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + auto req = new C_UnalignedObjectWriteRequest( + m_image_ctx, m_crypto, object_no, object_off, std::move(data), {}, + nullptr, io_context, op_flags, write_flags, assert_version, + parent_trace, object_dispatch_flags, journal_tid, on_dispatched, + true); + req->send(); + } + + return true; +} + +template +bool CryptoObjectDispatch::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (object_no < m_data_offset_object_no) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + ceph_assert(m_crypto != nullptr); + + // convert to regular write + io::LightweightObjectExtent extent(object_no, object_off, object_len, 0); + extent.buffer_extents = std::move(buffer_extents); + + bufferlist ws_data; + io::util::assemble_write_same_extent(extent, data, &ws_data, true); + + auto ctx = new LambdaContext( + [on_finish_ctx=on_dispatched](int r) { + on_finish_ctx->complete(r); + }); + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + auto req = io::ObjectDispatchSpec::create_write( + m_image_ctx, + io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO), + object_no, object_off, std::move(ws_data), io_context, op_flags, 0, + std::nullopt, 0, parent_trace, ctx); + req->send(); + return true; +} + +template +bool CryptoObjectDispatch::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + if (object_no < m_data_offset_object_no) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << write_data.length() + << dendl; + ceph_assert(m_crypto != nullptr); + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + auto req = new C_UnalignedObjectWriteRequest( + m_image_ctx, m_crypto, object_no, object_off, std::move(write_data), + std::move(cmp_data), mismatch_offset, io_context, op_flags, 0, + std::nullopt, parent_trace, object_dispatch_flags, journal_tid, + on_dispatched, true); + req->send(); + + return true; +} + +template +bool CryptoObjectDispatch::discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (object_no < m_data_offset_object_no) { + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off 
<< "~" << object_len << dendl; + ceph_assert(m_crypto != nullptr); + + // convert to write-same + auto ctx = new LambdaContext( + [on_finish_ctx=on_dispatched](int r) { + on_finish_ctx->complete(r); + }); + + bufferlist bl; + const int buffer_size = 4096; + bl.append_zero(buffer_size); + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + auto req = io::ObjectDispatchSpec::create_write_same( + m_image_ctx, + io::util::get_previous_layer(io::OBJECT_DISPATCH_LAYER_CRYPTO), + object_no, object_off, object_len, {{0, object_len}}, std::move(bl), + io_context, *object_dispatch_flags, 0, parent_trace, ctx); + req->send(); + return true; +} + +template +int CryptoObjectDispatch::prepare_copyup( + uint64_t object_no, + io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) { + if (object_no < m_data_offset_object_no) { + return 0; + } + + ceph::bufferlist current_bl; + current_bl.append_zero(m_image_ctx->get_object_size()); + + for (auto& [key, extent_map]: *snapshot_sparse_bufferlist) { + // update current_bl with data from extent_map + for (auto& extent : extent_map) { + auto &sbe = extent.get_val(); + if (sbe.state == io::SPARSE_EXTENT_STATE_DATA) { + current_bl.begin(extent.get_off()).copy_in(extent.get_len(), sbe.bl); + } else if (sbe.state == io::SPARSE_EXTENT_STATE_ZEROED) { + ceph::bufferlist zeros; + zeros.append_zero(extent.get_len()); + current_bl.begin(extent.get_off()).copy_in(extent.get_len(), zeros); + } + } + + // encrypt + io::SparseBufferlist encrypted_sparse_bufferlist; + for (auto& extent : extent_map) { + auto [aligned_off, aligned_len] = m_crypto->align( + extent.get_off(), extent.get_len()); + + auto [image_extents, _] = io::util::object_to_area_extents( + m_image_ctx, object_no, {{aligned_off, aligned_len}}); + + ceph::bufferlist encrypted_bl; + uint64_t position = 0; + for (auto [image_offset, image_length]: image_extents) { + ceph::bufferlist aligned_bl; + aligned_bl.substr_of(current_bl, aligned_off + position, image_length); + aligned_bl.rebuild(); // to deep copy aligned_bl from current_bl + position += image_length; + + auto r = m_crypto->encrypt(&aligned_bl, image_offset); + if (r != 0) { + return r; + } + + encrypted_bl.append(aligned_bl); + } + + encrypted_sparse_bufferlist.insert( + aligned_off, aligned_len, {io::SPARSE_EXTENT_STATE_DATA, aligned_len, + std::move(encrypted_bl)}); + } + + // replace original plaintext sparse bufferlist with encrypted one + extent_map.clear(); + extent_map.insert(std::move(encrypted_sparse_bufferlist)); + } + + return 0; +} + +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::CryptoObjectDispatch; diff --git a/src/librbd/crypto/CryptoObjectDispatch.h b/src/librbd/crypto/CryptoObjectDispatch.h new file mode 100644 index 000000000..b72fe1948 --- /dev/null +++ b/src/librbd/crypto/CryptoObjectDispatch.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_CRYPTO_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_CRYPTO_CRYPTO_OBJECT_DISPATCH_H + +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/io/Types.h" +#include "librbd/io/ObjectDispatchInterface.h" + +namespace librbd { + +struct ImageCtx; + +namespace crypto { + +template +class CryptoObjectDispatch : public io::ObjectDispatchInterface { +public: + static CryptoObjectDispatch* create( + ImageCtxT* image_ctx, CryptoInterface* crypto) { + return new CryptoObjectDispatch(image_ctx, crypto); + } + + CryptoObjectDispatch(ImageCtxT* image_ctx, + CryptoInterface* 
crypto); + + io::ObjectDispatchLayer get_dispatch_layer() const override { + return io::OBJECT_DISPATCH_LAYER_CRYPTO; + } + + void shut_down(Context* on_finish) override; + + bool read( + uint64_t object_no, io::ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override { + return false; + } + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + + int prepare_copyup( + uint64_t object_no, + io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override; + +private: + ImageCtxT* m_image_ctx; + CryptoInterface* m_crypto; + uint64_t m_data_offset_object_no; +}; + +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::CryptoObjectDispatch; + +#endif // CEPH_LIBRBD_CRYPTO_CRYPTO_OBJECT_DISPATCH_H diff --git a/src/librbd/crypto/DataCryptor.h b/src/librbd/crypto/DataCryptor.h new file mode 100644 index 000000000..ffcc57ce4 --- /dev/null +++ b/src/librbd/crypto/DataCryptor.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_DATA_CRYPTOR_H +#define CEPH_LIBRBD_CRYPTO_DATA_CRYPTOR_H + 
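+// DataCryptor abstracts a symmetric cipher from the underlying crypto
+// library; T is the library's cipher context type (EVP_CIPHER_CTX for the
+// OpenSSL-backed implementation in librbd/crypto/openssl). Contexts are
+// acquired with get_context() and recycled with return_context().
+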
+#include "include/int_types.h" +#include "librbd/crypto/Types.h" + +namespace librbd { +namespace crypto { + +template +class DataCryptor { + +public: + + virtual ~DataCryptor() = default; + + virtual uint32_t get_block_size() const = 0; + virtual uint32_t get_iv_size() const = 0; + virtual const unsigned char* get_key() const = 0; + virtual int get_key_length() const = 0; + + virtual T* get_context(CipherMode mode) = 0; + virtual void return_context(T* ctx, CipherMode mode) = 0; + + virtual int init_context(T* ctx, const unsigned char* iv, + uint32_t iv_length) const = 0; + virtual int update_context(T* ctx, const unsigned char* in, + unsigned char* out, uint32_t len) const = 0; +}; + +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_DATA_CRYPTOR_H diff --git a/src/librbd/crypto/EncryptionFormat.h b/src/librbd/crypto/EncryptionFormat.h new file mode 100644 index 000000000..252592891 --- /dev/null +++ b/src/librbd/crypto/EncryptionFormat.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_ENCRYPTION_FORMAT_H +#define CEPH_LIBRBD_CRYPTO_ENCRYPTION_FORMAT_H + +#include + +struct Context; + +namespace librbd { +namespace crypto { + +struct CryptoInterface; + +template +struct EncryptionFormat { + virtual ~EncryptionFormat() { + } + + virtual std::unique_ptr> clone() const = 0; + virtual void format(ImageCtxT* ictx, Context* on_finish) = 0; + virtual void load(ImageCtxT* ictx, std::string* detected_format_name, + Context* on_finish) = 0; + virtual void flatten(ImageCtxT* ictx, Context* on_finish) = 0; + + virtual CryptoInterface* get_crypto() = 0; +}; + +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_ENCRYPTION_FORMAT_H diff --git a/src/librbd/crypto/FormatRequest.cc b/src/librbd/crypto/FormatRequest.cc new file mode 100644 index 000000000..5e90bbb76 --- /dev/null +++ b/src/librbd/crypto/FormatRequest.cc @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "FormatRequest.h" + +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/crypto/EncryptionFormat.h" +#include "librbd/crypto/ShutDownCryptoRequest.h" +#include "librbd/crypto/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/Types.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::FormatRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace crypto { + +using librbd::util::create_context_callback; + +template +FormatRequest::FormatRequest( + I* image_ctx, EncryptionFormat format, + Context* on_finish) : m_image_ctx(image_ctx), + m_format(std::move(format)), + m_on_finish(on_finish) { +} + +template +void FormatRequest::send() { + if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + lderr(m_image_ctx->cct) << "cannot use encryption with journal" << dendl; + finish(-ENOTSUP); + return; + } + + if (m_image_ctx->encryption_format.get() == nullptr) { + format(); + return; + } else if (m_image_ctx->parent != nullptr) { + lderr(m_image_ctx->cct) << "cannot format a cloned image " + "while encryption is loaded" + << dendl; + finish(-EINVAL); + return; + } + + auto ctx = create_context_callback< + FormatRequest, 
&FormatRequest::handle_shutdown_crypto>(this); + auto *req = ShutDownCryptoRequest::create(m_image_ctx, ctx); + req->send(); +} + +template +void FormatRequest::handle_shutdown_crypto(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r != 0) { + lderr(m_image_ctx->cct) << "unable to unload existing crypto: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + format(); +} + +template +void FormatRequest::format() { + auto ctx = create_context_callback< + FormatRequest, &FormatRequest::handle_format>(this); + m_format->format(m_image_ctx, ctx); +} + +template +void FormatRequest::handle_format(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r != 0) { + lderr(m_image_ctx->cct) << "unable to format image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + flush(); +} + +template +void FormatRequest::flush() { + auto ctx = create_context_callback< + FormatRequest, &FormatRequest::handle_flush>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); +} + +template +void FormatRequest::handle_flush(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r != 0) { + lderr(m_image_ctx->cct) << "unable to flush image: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template +void FormatRequest::finish(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r == 0 && m_image_ctx->parent == nullptr) { + // only load on flat images, to avoid a case where encryption + // is wrongfully loaded only on the child image + util::set_crypto(m_image_ctx, std::move(m_format)); + } + m_on_finish->complete(r); + delete this; +} + +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::FormatRequest; diff --git a/src/librbd/crypto/FormatRequest.h b/src/librbd/crypto/FormatRequest.h new file mode 100644 index 000000000..c2270a817 --- /dev/null +++ b/src/librbd/crypto/FormatRequest.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_FORMAT_REQUEST_H +#define CEPH_LIBRBD_CRYPTO_FORMAT_REQUEST_H + +#include "include/rbd/librbd.hpp" +#include "librbd/ImageCtx.h" + +struct Context; + +namespace librbd { + +class ImageCtx; + +namespace crypto { + +template +class FormatRequest { +public: + using EncryptionFormat = decltype(I::encryption_format); + + static FormatRequest* create( + I* image_ctx, EncryptionFormat format, Context* on_finish) { + return new FormatRequest(image_ctx, std::move(format), on_finish); + } + + FormatRequest(I* image_ctx, EncryptionFormat format, Context* on_finish); + void send(); + void handle_shutdown_crypto(int r); + void format(); + void handle_format(int r); + void flush(); + void handle_flush(int r); + void finish(int r); + +private: + I* m_image_ctx; + + EncryptionFormat m_format; + Context* m_on_finish; +}; + +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::FormatRequest; + +#endif // CEPH_LIBRBD_CRYPTO_FORMAT_REQUEST_H diff --git a/src/librbd/crypto/LoadRequest.cc b/src/librbd/crypto/LoadRequest.cc new file mode 100644 index 000000000..5bc57d693 --- /dev/null +++ b/src/librbd/crypto/LoadRequest.cc @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: 
ts=8 sw=2 smarttab + +#include "LoadRequest.h" + +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/ImageCtx.h" +#include "librbd/crypto/EncryptionFormat.h" +#include "librbd/crypto/Types.h" +#include "librbd/crypto/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/Types.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::LoadRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace crypto { + +using librbd::util::create_context_callback; + +template +LoadRequest::LoadRequest( + I* image_ctx, std::vector&& formats, + Context* on_finish) : m_image_ctx(image_ctx), + m_on_finish(on_finish), + m_format_idx(0), + m_is_current_format_cloned(false), + m_formats(std::move(formats)) { +} + +template +void LoadRequest::send() { + if (m_formats.empty()) { + lderr(m_image_ctx->cct) << "no encryption formats were specified" << dendl; + finish(-EINVAL); + return; + } + + ldout(m_image_ctx->cct, 20) << "got " << m_formats.size() << " formats" + << dendl; + + if (m_image_ctx->encryption_format.get() != nullptr) { + lderr(m_image_ctx->cct) << "encryption already loaded" << dendl; + finish(-EEXIST); + return; + } + + auto ictx = m_image_ctx; + while (ictx != nullptr) { + if (ictx->test_features(RBD_FEATURE_JOURNALING)) { + lderr(m_image_ctx->cct) << "cannot use encryption with journal." + << " image name: " << ictx->name << dendl; + finish(-ENOTSUP); + return; + } + ictx = ictx->parent; + } + + m_current_image_ctx = m_image_ctx; + flush(); +} + +template +void LoadRequest::flush() { + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_flush>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); +} + +template +void LoadRequest::handle_flush(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to flush image" << dendl; + finish(r); + return; + } + + load(); +} + +template +void LoadRequest::load() { + ldout(m_image_ctx->cct, 20) << "format_idx=" << m_format_idx << dendl; + + m_detected_format_name = ""; + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_load>(this); + m_formats[m_format_idx]->load(m_current_image_ctx, &m_detected_format_name, + ctx); +} + +template +void LoadRequest::handle_load(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (m_is_current_format_cloned && + m_detected_format_name == UNKNOWN_FORMAT) { + // encryption format was not detected, assume plaintext + ldout(m_image_ctx->cct, 5) << "assuming plaintext for image " + << m_current_image_ctx->name << dendl; + m_formats.pop_back(); + invalidate_cache(); + return; + } + + lderr(m_image_ctx->cct) << "failed to load encryption. image name: " + << m_current_image_ctx->name << dendl; + finish(r); + return; + } + + ldout(m_image_ctx->cct, 5) << "loaded format " << m_detected_format_name + << (m_is_current_format_cloned ? 
" (cloned)" : "") + << " for image " << m_current_image_ctx->name + << dendl; + + m_format_idx++; + m_current_image_ctx = m_current_image_ctx->parent; + if (m_current_image_ctx != nullptr) { + // move on to loading parent + if (m_format_idx >= m_formats.size()) { + // try to load next ancestor using the same format + ldout(m_image_ctx->cct, 20) << "cloning format" << dendl; + m_is_current_format_cloned = true; + m_formats.push_back(m_formats[m_formats.size() - 1]->clone()); + } + + load(); + } else { + if (m_formats.size() != m_format_idx) { + lderr(m_image_ctx->cct) << "got " << m_formats.size() + << " encryption specs to load, " + << "but image has " << m_format_idx - 1 + << " ancestors" << dendl; + finish(-EINVAL); + return; + } + + invalidate_cache(); + } +} + +template +void LoadRequest::invalidate_cache() { + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_invalidate_cache>(this); + m_image_ctx->io_image_dispatcher->invalidate_cache(ctx); +} + +template +void LoadRequest::handle_invalidate_cache(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to invalidate image cache" << dendl; + } + + finish(r); +} + +template +void LoadRequest::finish(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r == 0) { + auto ictx = m_image_ctx; + for (auto& format : m_formats) { + util::set_crypto(ictx, std::move(format)); + ictx = ictx->parent; + } + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::LoadRequest; diff --git a/src/librbd/crypto/LoadRequest.h b/src/librbd/crypto/LoadRequest.h new file mode 100644 index 000000000..84f595bb6 --- /dev/null +++ b/src/librbd/crypto/LoadRequest.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LOAD_REQUEST_H +#define CEPH_LIBRBD_CRYPTO_LOAD_REQUEST_H + +#include "include/rbd/librbd.hpp" +#include "librbd/ImageCtx.h" + +struct Context; + +namespace librbd { + +class ImageCtx; + +namespace crypto { + +template +class LoadRequest { +public: + using EncryptionFormat = decltype(I::encryption_format); + + static constexpr char UNKNOWN_FORMAT[] = ""; + + static LoadRequest* create( + I* image_ctx, std::vector&& formats, + Context* on_finish) { + return new LoadRequest(image_ctx, std::move(formats), on_finish); + } + + LoadRequest(I* image_ctx, std::vector&& formats, + Context* on_finish); + void send(); + void flush(); + void handle_flush(int r); + void load(); + void handle_load(int r); + void invalidate_cache(); + void handle_invalidate_cache(int r); + void finish(int r); + +private: + I* m_image_ctx; + Context* m_on_finish; + + size_t m_format_idx; + bool m_is_current_format_cloned; + std::vector m_formats; + I* m_current_image_ctx; + std::string m_detected_format_name; +}; + +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::LoadRequest; + +#endif // CEPH_LIBRBD_CRYPTO_LOAD_REQUEST_H diff --git a/src/librbd/crypto/ShutDownCryptoRequest.cc b/src/librbd/crypto/ShutDownCryptoRequest.cc new file mode 100644 index 000000000..fb1e77479 --- /dev/null +++ b/src/librbd/crypto/ShutDownCryptoRequest.cc @@ -0,0 +1,116 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ShutDownCryptoRequest.h" + +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include 
"librbd/crypto/CryptoImageDispatch.h" +#include "librbd/crypto/CryptoObjectDispatch.h" +#include "librbd/crypto/EncryptionFormat.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::ShutDownCryptoRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace crypto { + +using librbd::util::create_context_callback; + +template +ShutDownCryptoRequest::ShutDownCryptoRequest(I* image_ctx, + Context* on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) {} + +template +void ShutDownCryptoRequest::send() { + shut_down_object_dispatch(); +} + +template +void ShutDownCryptoRequest::shut_down_object_dispatch() { + if (!m_image_ctx->io_object_dispatcher->exists( + io::OBJECT_DISPATCH_LAYER_CRYPTO)) { + finish(0); + return; + } + + auto ctx = create_context_callback< + ShutDownCryptoRequest, + &ShutDownCryptoRequest::handle_shut_down_object_dispatch>(this); + + m_image_ctx->io_object_dispatcher->shut_down_dispatch( + io::OBJECT_DISPATCH_LAYER_CRYPTO, ctx); +} + +template +void ShutDownCryptoRequest::handle_shut_down_object_dispatch(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to shut down object dispatch: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + shut_down_image_dispatch(); +} + +template +void ShutDownCryptoRequest::shut_down_image_dispatch() { + if (!m_image_ctx->io_image_dispatcher->exists( + io::IMAGE_DISPATCH_LAYER_CRYPTO)) { + finish(0); + return; + } + + auto ctx = create_context_callback< + ShutDownCryptoRequest, + &ShutDownCryptoRequest::handle_shut_down_image_dispatch>(this); + m_image_ctx->io_image_dispatcher->shut_down_dispatch( + io::IMAGE_DISPATCH_LAYER_CRYPTO, ctx); +} + +template +void ShutDownCryptoRequest::handle_shut_down_image_dispatch(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to shut down image dispatch: " + << cpp_strerror(r) << dendl; + } + finish(r); +} + +template +void ShutDownCryptoRequest::finish(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r == 0) { + { + std::unique_lock image_locker{m_image_ctx->image_lock}; + m_image_ctx->encryption_format.reset(); + } + + if (m_image_ctx->parent != nullptr) { + // move to shutting down parent crypto + m_image_ctx = m_image_ctx->parent; + shut_down_object_dispatch(); + return; + } + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::ShutDownCryptoRequest; diff --git a/src/librbd/crypto/ShutDownCryptoRequest.h b/src/librbd/crypto/ShutDownCryptoRequest.h new file mode 100644 index 000000000..274228574 --- /dev/null +++ b/src/librbd/crypto/ShutDownCryptoRequest.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_SHUT_DOWN_CRYPTO_REQUEST_H +#define CEPH_LIBRBD_CRYPTO_SHUT_DOWN_CRYPTO_REQUEST_H + +#include "librbd/ImageCtx.h" + +struct Context; + +namespace librbd { + +class ImageCtx; + +namespace crypto { + +template +class ShutDownCryptoRequest { +public: + static ShutDownCryptoRequest* create(I* image_ctx, Context* on_finish) { + return new ShutDownCryptoRequest(image_ctx, on_finish); + } + + ShutDownCryptoRequest(I* image_ctx, Context* on_finish); + + void send(); + void 
shut_down_object_dispatch();
+  void handle_shut_down_object_dispatch(int r);
+  void shut_down_image_dispatch();
+  void handle_shut_down_image_dispatch(int r);
+  void finish(int r);
+
+private:
+  I* m_image_ctx;
+  Context* m_on_finish;
+};
+
+} // namespace crypto
+} // namespace librbd
+
+extern template class librbd::crypto::ShutDownCryptoRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_CRYPTO_SHUT_DOWN_CRYPTO_REQUEST_H
diff --git a/src/librbd/crypto/Types.h b/src/librbd/crypto/Types.h
new file mode 100644
index 000000000..93d9c172c
--- /dev/null
+++ b/src/librbd/crypto/Types.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_CRYPTO_TYPES_H
+#define CEPH_LIBRBD_CRYPTO_TYPES_H
+
+namespace librbd {
+namespace crypto {
+
+enum CipherMode {
+  CIPHER_MODE_ENC,
+  CIPHER_MODE_DEC,
+};
+
+} // namespace crypto
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_CRYPTO_TYPES_H
diff --git a/src/librbd/crypto/Utils.cc b/src/librbd/crypto/Utils.cc
new file mode 100644
index 000000000..981ad47b0
--- /dev/null
+++ b/src/librbd/crypto/Utils.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Utils.h"
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/crypto/BlockCrypto.h"
+#include "librbd/crypto/CryptoImageDispatch.h"
+#include "librbd/crypto/CryptoInterface.h"
+#include "librbd/crypto/CryptoObjectDispatch.h"
+#include "librbd/crypto/EncryptionFormat.h"
+#include "librbd/crypto/openssl/DataCryptor.h"
+#include "librbd/io/ImageDispatcherInterface.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::crypto::util: " << __func__ << ": "
+
+namespace librbd {
+namespace crypto {
+namespace util {
+
+template <typename I>
+void set_crypto(I *image_ctx,
+                decltype(I::encryption_format) encryption_format) {
+  std::unique_lock image_locker{image_ctx->image_lock};
+  ceph_assert(!image_ctx->encryption_format);
+
+  auto crypto = encryption_format->get_crypto();
+
+  auto object_dispatch = CryptoObjectDispatch<I>::create(image_ctx, crypto);
+  auto image_dispatch = CryptoImageDispatch::create(crypto->get_data_offset());
+  image_ctx->io_object_dispatcher->register_dispatch(object_dispatch);
+  image_ctx->io_image_dispatcher->register_dispatch(image_dispatch);
+
+  image_ctx->encryption_format = std::move(encryption_format);
+}
+
+int build_crypto(
+    CephContext* cct, const unsigned char* key, uint32_t key_length,
+    uint64_t block_size, uint64_t data_offset,
+    std::unique_ptr<CryptoInterface>* result_crypto) {
+  const char* cipher_suite;
+  switch (key_length) {
+  case 32:
+    cipher_suite = "aes-128-xts";
+    break;
+  case 64:
+    cipher_suite = "aes-256-xts";
+    break;
+  default:
+    lderr(cct) << "unsupported key length: " << key_length << dendl;
+    return -ENOTSUP;
+  }
+
+  auto data_cryptor = new openssl::DataCryptor(cct);
+  int r = data_cryptor->init(cipher_suite, key, key_length);
+  if (r != 0) {
+    lderr(cct) << "error initializing data cryptor: " << cpp_strerror(r)
+               << dendl;
+    delete data_cryptor;
+    return r;
+  }
+
+  result_crypto->reset(BlockCrypto<EVP_CIPHER_CTX>::create(
+      cct, data_cryptor, block_size, data_offset));
+  return 0;
+}
+
+} // namespace util
+} // namespace crypto
+} // namespace librbd
+
+template void librbd::crypto::util::set_crypto(
+    librbd::ImageCtx *image_ctx,
+    std::unique_ptr<crypto::EncryptionFormat<librbd::ImageCtx>> encryption_format);
diff --git a/src/librbd/crypto/Utils.h
b/src/librbd/crypto/Utils.h new file mode 100644 index 000000000..ffac08c83 --- /dev/null +++ b/src/librbd/crypto/Utils.h @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_UTILS_H +#define CEPH_LIBRBD_CRYPTO_UTILS_H + +#include "include/Context.h" + +namespace librbd { + +struct ImageCtx; + +namespace crypto { + +class CryptoInterface; +template class EncryptionFormat; + +namespace util { + +template +void set_crypto(ImageCtxT *image_ctx, + decltype(ImageCtxT::encryption_format) encryption_format); + +int build_crypto( + CephContext* cct, const unsigned char* key, uint32_t key_length, + uint64_t block_size, uint64_t data_offset, + std::unique_ptr* result_crypto); + +} // namespace util +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_UTILS_H diff --git a/src/librbd/crypto/luks/FlattenRequest.cc b/src/librbd/crypto/luks/FlattenRequest.cc new file mode 100644 index 000000000..fdf6b5dae --- /dev/null +++ b/src/librbd/crypto/luks/FlattenRequest.cc @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "FlattenRequest.h" + +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/crypto/EncryptionFormat.h" +#include "librbd/crypto/Utils.h" +#include "librbd/crypto/luks/LoadRequest.h" +#include "librbd/crypto/luks/Magic.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ReadResult.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::luks::FlattenRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace crypto { +namespace luks { + +using librbd::util::create_context_callback; + +template +FlattenRequest::FlattenRequest( + I* image_ctx, Context* on_finish) : m_image_ctx(image_ctx), + m_on_finish(on_finish) { + ceph_assert(m_image_ctx->encryption_format.get() != nullptr); +} + +template +void FlattenRequest::send() { + read_header(); +} + +template +void FlattenRequest::read_header() { + auto ctx = create_context_callback< + FlattenRequest, &FlattenRequest::handle_read_header>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_READ); + + auto crypto = m_image_ctx->encryption_format->get_crypto(); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_read( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{0, crypto->get_data_offset()}}, io::ImageArea::CRYPTO_HEADER, + io::ReadResult{&m_bl}, m_image_ctx->get_data_io_context(), 0, 0, + trace); + req->send(); +} + +template +void FlattenRequest::handle_read_header(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "error reading from image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + r = Magic::is_rbd_clone(m_bl); + if (r < 0) { + lderr(m_image_ctx->cct) << "unable to determine encryption header magic: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } else if (r > 0) { + // switch magic + r = Magic::replace_magic(m_image_ctx->cct, m_bl); + if (r < 0) { + lderr(m_image_ctx->cct) << "unable to restore header magic: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + } + + write_header(); +} + +template +void FlattenRequest::write_header() { + // write header to offset 0 of the image + auto 
ctx = create_context_callback< + FlattenRequest, &FlattenRequest::handle_write_header>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_WRITE); + + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_write( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{0, m_bl.length()}}, io::ImageArea::CRYPTO_HEADER, + std::move(m_bl), 0, trace); + req->send(); +} + +template +void FlattenRequest::handle_write_header(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "error writing header to image: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + flush(); +} + +template +void FlattenRequest::flush() { + auto ctx = create_context_callback< + FlattenRequest, &FlattenRequest::handle_flush>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); +} + +template +void FlattenRequest::handle_flush(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "unable to flush image: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template +void FlattenRequest::finish(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace luks +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::luks::FlattenRequest; diff --git a/src/librbd/crypto/luks/FlattenRequest.h b/src/librbd/crypto/luks/FlattenRequest.h new file mode 100644 index 000000000..a1432f505 --- /dev/null +++ b/src/librbd/crypto/luks/FlattenRequest.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LUKS_FLATTEN_REQUEST_H +#define CEPH_LIBRBD_CRYPTO_LUKS_FLATTEN_REQUEST_H + +#include "librbd/ImageCtx.h" + +namespace librbd { + +namespace crypto { +namespace luks { + +template +class FlattenRequest { +public: + using EncryptionFormat = decltype(I::encryption_format); + + static FlattenRequest* create(I* image_ctx, Context* on_finish) { + return new FlattenRequest(image_ctx, on_finish); + } + + FlattenRequest(I* image_ctx, Context* on_finish); + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * READ_HEADER + * | + * v + * WRITE_HEADER (replacing magic back from RBDL to LUKS if needed) + * | + * v + * FLUSH + * | + * v + * + * + * @endverbatim + */ + I* m_image_ctx; + Context* m_on_finish; + ceph::bufferlist m_bl; + + void read_header(); + void handle_read_header(int r); + void write_header(); + void handle_write_header(int r); + void flush(); + void handle_flush(int r); + void finish(int r); +}; + +} // namespace luks +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::luks::FlattenRequest; + +#endif // CEPH_LIBRBD_CRYPTO_LUKS_FLATTEN_REQUEST_H diff --git a/src/librbd/crypto/luks/FormatRequest.cc b/src/librbd/crypto/luks/FormatRequest.cc new file mode 100644 index 000000000..32673b9cf --- /dev/null +++ b/src/librbd/crypto/luks/FormatRequest.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "FormatRequest.h" + +#include +#include +#include "common/dout.h" +#include 
"common/errno.h" +#include "include/compat.h" +#include "librbd/Utils.h" +#include "librbd/crypto/Utils.h" +#include "librbd/crypto/luks/Header.h" +#include "librbd/crypto/luks/Magic.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::luks::FormatRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace crypto { +namespace luks { + +using librbd::util::create_context_callback; + +template +FormatRequest::FormatRequest( + I* image_ctx, encryption_format_t format, encryption_algorithm_t alg, + std::string_view passphrase, + std::unique_ptr* result_crypto, Context* on_finish, + bool insecure_fast_mode) : m_image_ctx(image_ctx), m_format(format), + m_alg(alg), + m_passphrase(passphrase), + m_result_crypto(result_crypto), + m_on_finish(on_finish), + m_insecure_fast_mode(insecure_fast_mode), + m_header(image_ctx->cct) { +} + +template +void FormatRequest::send() { + const char* type; + size_t sector_size; + switch (m_format) { + case RBD_ENCRYPTION_FORMAT_LUKS1: + type = CRYPT_LUKS1; + sector_size = 512; + break; + case RBD_ENCRYPTION_FORMAT_LUKS2: + type = CRYPT_LUKS2; + sector_size = 4096; + break; + default: + lderr(m_image_ctx->cct) << "unsupported format type: " << m_format + << dendl; + finish(-EINVAL); + return; + } + + const char* cipher; + size_t key_size; + switch (m_alg) { + case RBD_ENCRYPTION_ALGORITHM_AES128: + cipher = "aes"; + key_size = 32; + break; + case RBD_ENCRYPTION_ALGORITHM_AES256: + cipher = "aes"; + key_size = 64; + break; + default: + lderr(m_image_ctx->cct) << "unsupported cipher algorithm: " << m_alg + << dendl; + finish(-EINVAL); + return; + } + + // generate encryption key + unsigned char* key = (unsigned char*)alloca(key_size); + if (RAND_bytes((unsigned char *)key, key_size) != 1) { + lderr(m_image_ctx->cct) << "cannot generate random encryption key" + << dendl; + finish(-EAGAIN); + return; + } + + // setup interface with libcryptsetup + auto r = m_header.init(); + if (r < 0) { + finish(r); + return; + } + + // format (create LUKS header) + auto stripe_period = m_image_ctx->get_stripe_period(); + r = m_header.format(type, cipher, reinterpret_cast(key), key_size, + "xts-plain64", sector_size, stripe_period, + m_insecure_fast_mode); + if (r != 0) { + finish(r); + return; + } + + m_image_ctx->image_lock.lock_shared(); + uint64_t image_size = m_image_ctx->get_image_size(CEPH_NOSNAP); + m_image_ctx->image_lock.unlock_shared(); + + if (m_header.get_data_offset() > image_size) { + lderr(m_image_ctx->cct) << "image is too small, format requires " + << m_header.get_data_offset() << " bytes" << dendl; + finish(-ENOSPC); + return; + } + + // add keyslot (volume key encrypted with passphrase) + r = m_header.add_keyslot(m_passphrase.data(), m_passphrase.size()); + if (r != 0) { + finish(r); + return; + } + + r = util::build_crypto(m_image_ctx->cct, key, key_size, + m_header.get_sector_size(), + m_header.get_data_offset(), m_result_crypto); + ceph_memzero_s(key, key_size, key_size); + if (r != 0) { + finish(r); + return; + } + + // read header from libcryptsetup interface + ceph::bufferlist bl; + r = m_header.read(&bl); + if (r < 0) { + finish(r); + return; + } + + if (m_image_ctx->parent != nullptr) { + // parent is not encrypted with same key + // change LUKS magic to prevent decryption by other LUKS implementations + r = Magic::replace_magic(m_image_ctx->cct, bl); + if (r < 0) { + lderr(m_image_ctx->cct) << "error 
replacing LUKS magic: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + } + + // pad header to stripe period alignment to prevent copyup of parent data + // when writing encryption header to the child image + auto alignment = bl.length() % stripe_period; + if (alignment > 0) { + bl.append_zero(stripe_period - alignment); + } + + // write header to offset 0 of the image + auto ctx = create_context_callback< + FormatRequest, &FormatRequest::handle_write_header>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_WRITE); + + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_write( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{0, bl.length()}}, io::ImageArea::DATA, std::move(bl), 0, trace); + req->send(); +} + +template +void FormatRequest::handle_write_header(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "error writing header to image: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void FormatRequest::finish(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace luks +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::luks::FormatRequest; diff --git a/src/librbd/crypto/luks/FormatRequest.h b/src/librbd/crypto/luks/FormatRequest.h new file mode 100644 index 000000000..17d0b3af9 --- /dev/null +++ b/src/librbd/crypto/luks/FormatRequest.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LUKS_FORMAT_REQUEST_H +#define CEPH_LIBRBD_CRYPTO_LUKS_FORMAT_REQUEST_H + +#include +#include "include/rbd/librbd.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/crypto/luks/Header.h" + +namespace librbd { + +class ImageCtx; + +namespace crypto { +namespace luks { + +template +class FormatRequest { +public: + static FormatRequest* create( + I* image_ctx, encryption_format_t format, + encryption_algorithm_t alg, std::string_view passphrase, + std::unique_ptr* result_crypto, Context* on_finish, + bool insecure_fast_mode) { + return new FormatRequest(image_ctx, format, alg, passphrase, + result_crypto, on_finish, insecure_fast_mode); + } + + FormatRequest(I* image_ctx, encryption_format_t format, + encryption_algorithm_t alg, std::string_view passphrase, + std::unique_ptr* result_crypto, + Context* on_finish, bool insecure_fast_mode); + void send(); + void finish(int r); + +private: + I* m_image_ctx; + + encryption_format_t m_format; + encryption_algorithm_t m_alg; + std::string_view m_passphrase; + std::unique_ptr* m_result_crypto; + Context* m_on_finish; + bool m_insecure_fast_mode; + Header m_header; + + void handle_write_header(int r); +}; + +} // namespace luks +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::luks::FormatRequest; + +#endif // CEPH_LIBRBD_CRYPTO_LUKS_FORMAT_REQUEST_H diff --git a/src/librbd/crypto/luks/Header.cc b/src/librbd/crypto/luks/Header.cc new file mode 100644 index 000000000..0866f285f --- /dev/null +++ b/src/librbd/crypto/luks/Header.cc @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Header.h" + +#include +#include +#include +#include +#include "common/dout.h" +#include "common/errno.h" + +#define 
dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::luks::Header: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace crypto { +namespace luks { + +Header::Header(CephContext* cct) : m_cct(cct), m_fd(-1), m_cd(nullptr) { +} + +Header::~Header() { + if (m_fd != -1) { + close(m_fd); + m_fd = -1; + } + if (m_cd != nullptr) { + crypt_free(m_cd); + m_cd = nullptr; + } +} + +void Header::libcryptsetup_log_wrapper(int level, const char* msg, void* header) { + ((Header*)header)->libcryptsetup_log(level, msg); +} + +void Header::libcryptsetup_log(int level, const char* msg) { + switch (level) { + case CRYPT_LOG_NORMAL: + ldout(m_cct, 5) << "[libcryptsetup] " << msg << dendl; + break; + case CRYPT_LOG_ERROR: + lderr(m_cct) << "[libcryptsetup] " << msg << dendl; + break; + case CRYPT_LOG_VERBOSE: + ldout(m_cct, 10) << "[libcryptsetup] " << msg << dendl; + break; + case CRYPT_LOG_DEBUG: + ldout(m_cct, 20) << "[libcryptsetup] " << msg << dendl; + break; + } +} + +int Header::init() { + if (m_fd != -1) { + return 0; + } + + // create anonymous file + m_fd = syscall(SYS_memfd_create, "LibcryptsetupInterface", 0); + if (m_fd == -1) { + lderr(m_cct) << "error creating anonymous file: " << cpp_strerror(-errno) + << dendl; + return -errno; + } + std::string path = + "/proc/" + std::to_string(getpid()) + "/fd/" + std::to_string(m_fd); + + if (m_cct->_conf->subsys.should_gather()) { + crypt_set_debug_level(CRYPT_DEBUG_ALL); + } + + // init libcryptsetup handle + auto r = crypt_init(&m_cd, path.c_str()); + if (r != 0) { + lderr(m_cct) << "crypt_init failed: " << cpp_strerror(r) << dendl; + return r; + } + + // redirect logging + crypt_set_log_callback(m_cd, &libcryptsetup_log_wrapper, this); + + return 0; +} + +int Header::write(const ceph::bufferlist& bl) { + ceph_assert(m_fd != -1); + + auto r = bl.write_fd(m_fd); + if (r != 0) { + lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl; + } + return r; +} + +ssize_t Header::read(ceph::bufferlist* bl) { + ceph_assert(m_fd != -1); + + // get current header size + struct stat st; + ssize_t r = fstat(m_fd, &st); + if (r < 0) { + r = -errno; + lderr(m_cct) << "failed to stat anonymous file: " << cpp_strerror(r) + << dendl; + return r; + } + + r = bl->read_fd(m_fd, st.st_size); + if (r < 0) { + lderr(m_cct) << "error reading header: " << cpp_strerror(r) << dendl; + } + + ldout(m_cct, 20) << "read size = " << r << dendl; + return r; +} + +int Header::format(const char* type, const char* alg, const char* key, + size_t key_size, const char* cipher_mode, + uint32_t sector_size, uint32_t data_alignment, + bool insecure_fast_mode) { + ceph_assert(m_cd != nullptr); + + ldout(m_cct, 20) << "sector size: " << sector_size << ", data alignment: " + << data_alignment << dendl; + + // required for passing libcryptsetup device size check + if (ftruncate(m_fd, 4096) != 0) { + lderr(m_cct) << "failed to truncate anonymous file: " + << cpp_strerror(-errno) << dendl; + return -errno; + } + + struct crypt_params_luks1 luks1params; + struct crypt_params_luks2 luks2params; + + const size_t converted_data_alignment = data_alignment / 512; + + void* params = nullptr; + if (strcmp(type, CRYPT_LUKS1) == 0) { + memset(&luks1params, 0, sizeof(luks1params)); + luks1params.data_alignment = converted_data_alignment; + params = &luks1params; + } else if (strcmp(type, CRYPT_LUKS2) == 0) { + memset(&luks2params, 0, sizeof(luks2params)); + luks2params.data_alignment = converted_data_alignment; + luks2params.sector_size 
= sector_size; + params = &luks2params; + } + + // this mode should be used for testing only + if (insecure_fast_mode) { + struct crypt_pbkdf_type pbkdf; + memset(&pbkdf, 0, sizeof(pbkdf)); + pbkdf.type = CRYPT_KDF_PBKDF2; + pbkdf.flags = CRYPT_PBKDF_NO_BENCHMARK; + pbkdf.hash = "sha256"; + pbkdf.iterations = 1000; + pbkdf.time_ms = 1; + auto r = crypt_set_pbkdf_type(m_cd, &pbkdf); + if (r != 0) { + lderr(m_cct) << "crypt_set_pbkdf_type failed: " << cpp_strerror(r) + << dendl; + return r; + } + } + + auto r = crypt_format( + m_cd, type, alg, cipher_mode, NULL, key, key_size, params); + if (r != 0) { + lderr(m_cct) << "crypt_format failed: " << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +int Header::add_keyslot(const char* passphrase, size_t passphrase_size) { + ceph_assert(m_cd != nullptr); + + auto r = crypt_keyslot_add_by_volume_key( + m_cd, CRYPT_ANY_SLOT, NULL, 0, passphrase, passphrase_size); + if (r < 0) { + lderr(m_cct) << "crypt_keyslot_add_by_volume_key failed: " + << cpp_strerror(r) << dendl; + return r; + } + + return 0; +} + +int Header::load(const char* type) { + ceph_assert(m_cd != nullptr); + + // libcryptsetup checks if device size matches the header and keyslots size + // in LUKS2, 2 X 4MB header + 128MB keyslots + if (ftruncate(m_fd, 136 * 1024 * 1024) != 0) { + lderr(m_cct) << "failed to truncate anonymous file: " + << cpp_strerror(-errno) << dendl; + return -errno; + } + + auto r = crypt_load(m_cd, type, NULL); + if (r != 0) { + ldout(m_cct, 20) << "crypt_load failed: " << cpp_strerror(r) << dendl; + return r; + } + + ldout(m_cct, 20) << "sector size: " << get_sector_size() << ", data offset: " + << get_data_offset() << dendl; + + return 0; +} + +int Header::read_volume_key(const char* passphrase, size_t passphrase_size, + char* volume_key, size_t* volume_key_size) { + ceph_assert(m_cd != nullptr); + + auto r = crypt_volume_key_get( + m_cd, CRYPT_ANY_SLOT, volume_key, volume_key_size, passphrase, + passphrase_size); + if (r < 0) { + ldout(m_cct, 20) << "crypt_volume_key_get failed: " << cpp_strerror(r) + << dendl; + return r; + } + + return 0; +} + +int Header::get_sector_size() { + ceph_assert(m_cd != nullptr); + return crypt_get_sector_size(m_cd); +} + +uint64_t Header::get_data_offset() { + ceph_assert(m_cd != nullptr); + return crypt_get_data_offset(m_cd) << 9; +} + +const char* Header::get_cipher() { + ceph_assert(m_cd != nullptr); + return crypt_get_cipher(m_cd); +} + +const char* Header::get_cipher_mode() { + ceph_assert(m_cd != nullptr); + return crypt_get_cipher_mode(m_cd); +} + +const char* Header::get_format_name() { + ceph_assert(m_cd != nullptr); + return crypt_get_type(m_cd); +} + +} // namespace luks +} // namespace crypto +} // namespace librbd diff --git a/src/librbd/crypto/luks/Header.h b/src/librbd/crypto/luks/Header.h new file mode 100644 index 000000000..067d96b4a --- /dev/null +++ b/src/librbd/crypto/luks/Header.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LUKS_HEADER_H +#define CEPH_LIBRBD_CRYPTO_LUKS_HEADER_H + +#include +#include "common/ceph_context.h" +#include "include/buffer.h" + +namespace librbd { +namespace crypto { +namespace luks { + +class Header { +public: + Header(CephContext* cct); + ~Header(); + int init(); + + int write(const ceph::bufferlist& bl); + ssize_t read(ceph::bufferlist* bl); + + int format(const char* type, const char* alg, const char* key, + size_t key_size, const char* cipher_mode, uint32_t 
sector_size, + uint32_t data_alignment, bool insecure_fast_mode); + int add_keyslot(const char* passphrase, size_t passphrase_size); + int load(const char* type); + int read_volume_key(const char* passphrase, size_t passphrase_size, + char* volume_key, size_t* volume_key_size); + + int get_sector_size(); + uint64_t get_data_offset(); + const char* get_cipher(); + const char* get_cipher_mode(); + const char* get_format_name(); + +private: + void libcryptsetup_log(int level, const char* msg); + static void libcryptsetup_log_wrapper(int level, const char* msg, + void* header); + + CephContext* m_cct; + int m_fd; + struct crypt_device *m_cd; +}; + +} // namespace luks +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_LUKS_HEADER_H diff --git a/src/librbd/crypto/luks/LUKSEncryptionFormat.cc b/src/librbd/crypto/luks/LUKSEncryptionFormat.cc new file mode 100644 index 000000000..1f92cf0f7 --- /dev/null +++ b/src/librbd/crypto/luks/LUKSEncryptionFormat.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LUKSEncryptionFormat.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/compat.h" +#include "librbd/crypto/luks/FlattenRequest.h" +#include "librbd/crypto/luks/FormatRequest.h" +#include "librbd/crypto/luks/LoadRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::luks::LUKSEncryptionFormat:: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace crypto { +namespace luks { + +template +void EncryptionFormat::flatten(I* image_ctx, Context* on_finish) { + auto req = luks::FlattenRequest::create(image_ctx, on_finish); + req->send(); +} + +template +void LUKSEncryptionFormat::format(I* image_ctx, Context* on_finish) { + lderr(image_ctx->cct) << "explicit LUKS version required for format" << dendl; + on_finish->complete(-EINVAL); +} + +template +void LUKSEncryptionFormat::load(I* image_ctx, + std::string* detected_format_name, + Context* on_finish) { + auto req = luks::LoadRequest::create(image_ctx, RBD_ENCRYPTION_FORMAT_LUKS, + m_passphrase, &this->m_crypto, + detected_format_name, on_finish); + req->send(); +} + +template +void LUKS1EncryptionFormat::format(I* image_ctx, Context* on_finish) { + auto req = luks::FormatRequest::create( + image_ctx, RBD_ENCRYPTION_FORMAT_LUKS1, m_alg, m_passphrase, + &this->m_crypto, on_finish, false); + req->send(); +} + +template +void LUKS1EncryptionFormat::load(I* image_ctx, + std::string* detected_format_name, + Context* on_finish) { + auto req = luks::LoadRequest::create( + image_ctx, RBD_ENCRYPTION_FORMAT_LUKS1, m_passphrase, &this->m_crypto, + detected_format_name, on_finish); + req->send(); +} + +template +void LUKS2EncryptionFormat::format(I* image_ctx, Context* on_finish) { + auto req = luks::FormatRequest::create( + image_ctx, RBD_ENCRYPTION_FORMAT_LUKS2, m_alg, m_passphrase, + &this->m_crypto, on_finish, false); + req->send(); +} + +template +void LUKS2EncryptionFormat::load(I* image_ctx, + std::string* detected_format_name, + Context* on_finish) { + auto req = luks::LoadRequest::create( + image_ctx, RBD_ENCRYPTION_FORMAT_LUKS2, m_passphrase, &this->m_crypto, + detected_format_name, on_finish); + req->send(); +} + +} // namespace luks +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::luks::LUKSEncryptionFormat; +template class librbd::crypto::luks::LUKS1EncryptionFormat; +template class 
librbd::crypto::luks::LUKS2EncryptionFormat; diff --git a/src/librbd/crypto/luks/LUKSEncryptionFormat.h b/src/librbd/crypto/luks/LUKSEncryptionFormat.h new file mode 100644 index 000000000..353bd8933 --- /dev/null +++ b/src/librbd/crypto/luks/LUKSEncryptionFormat.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LUKS_ENCRYPTION_FORMAT_H +#define CEPH_LIBRBD_CRYPTO_LUKS_ENCRYPTION_FORMAT_H + +#include +#include "include/rbd/librbd.hpp" +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/crypto/EncryptionFormat.h" + +namespace librbd { + +struct ImageCtx; + +namespace crypto { +namespace luks { + +template +class EncryptionFormat : public crypto::EncryptionFormat { +public: + void flatten(ImageCtxT* ictx, Context* on_finish) override; + + CryptoInterface* get_crypto() override { + ceph_assert(m_crypto); + return m_crypto.get(); + } + +protected: + std::unique_ptr m_crypto; +}; + +template +class LUKSEncryptionFormat : public EncryptionFormat { +public: + LUKSEncryptionFormat(std::string_view passphrase) + : m_passphrase(passphrase) {} + + std::unique_ptr> clone() const override { + return std::make_unique(m_passphrase); + } + + void format(ImageCtxT* ictx, Context* on_finish) override; + void load(ImageCtxT* ictx, std::string* detected_format_name, + Context* on_finish) override; + +private: + std::string_view m_passphrase; +}; + +template +class LUKS1EncryptionFormat : public EncryptionFormat { +public: + LUKS1EncryptionFormat(encryption_algorithm_t alg, std::string_view passphrase) + : m_alg(alg), m_passphrase(passphrase) {} + + std::unique_ptr> clone() const override { + return std::make_unique(m_alg, m_passphrase); + } + + void format(ImageCtxT* ictx, Context* on_finish) override; + void load(ImageCtxT* ictx, std::string* detected_format_name, + Context* on_finish) override; + +private: + encryption_algorithm_t m_alg; + std::string_view m_passphrase; +}; + +template +class LUKS2EncryptionFormat : public EncryptionFormat { +public: + LUKS2EncryptionFormat(encryption_algorithm_t alg, std::string_view passphrase) + : m_alg(alg), m_passphrase(passphrase) {} + + std::unique_ptr> clone() const override { + return std::make_unique(m_alg, m_passphrase); + } + + void format(ImageCtxT* ictx, Context* on_finish) override; + void load(ImageCtxT* ictx, std::string* detected_format_name, + Context* on_finish) override; + +private: + encryption_algorithm_t m_alg; + std::string_view m_passphrase; +}; + +} // namespace luks +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::luks::LUKSEncryptionFormat< + librbd::ImageCtx>; +extern template class librbd::crypto::luks::LUKS1EncryptionFormat< + librbd::ImageCtx>; +extern template class librbd::crypto::luks::LUKS2EncryptionFormat< + librbd::ImageCtx>; + +#endif // CEPH_LIBRBD_CRYPTO_LUKS_ENCRYPTION_FORMAT_H diff --git a/src/librbd/crypto/luks/LoadRequest.cc b/src/librbd/crypto/luks/LoadRequest.cc new file mode 100644 index 000000000..b5e16f100 --- /dev/null +++ b/src/librbd/crypto/luks/LoadRequest.cc @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "LoadRequest.h" + +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/crypto/Utils.h" +#include "librbd/crypto/LoadRequest.h" +#include "librbd/crypto/luks/Magic.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" 
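
The format classes above keep one object per on-disk flavor: the LUKS1/LUKS2 variants can both format() and load(), while the version-agnostic LUKSEncryptionFormat can only load() (its format() fails with -EINVAL). A hedged factory sketch using only types defined in this patch; the helper itself is hypothetical, and note that these classes store the passphrase as a std::string_view, so the caller must keep it alive:

    #include <memory>
    #include <string_view>
    #include "include/rbd/librbd.hpp"
    #include "librbd/ImageCtx.h"
    #include "librbd/crypto/luks/LUKSEncryptionFormat.h"

    // Hypothetical helper: pick the format object matching the requested
    // encryption_format_t; RBD_ENCRYPTION_FORMAT_LUKS defers version
    // detection to load().
    static std::unique_ptr<librbd::crypto::EncryptionFormat<librbd::ImageCtx>>
    make_luks_format(encryption_format_t format, encryption_algorithm_t alg,
                     std::string_view passphrase) {
      using namespace librbd::crypto::luks;
      switch (format) {
      case RBD_ENCRYPTION_FORMAT_LUKS1:
        return std::make_unique<LUKS1EncryptionFormat<librbd::ImageCtx>>(
            alg, passphrase);
      case RBD_ENCRYPTION_FORMAT_LUKS2:
        return std::make_unique<LUKS2EncryptionFormat<librbd::ImageCtx>>(
            alg, passphrase);
      case RBD_ENCRYPTION_FORMAT_LUKS:  // version unknown: load() only
        return std::make_unique<LUKSEncryptionFormat<librbd::ImageCtx>>(
            passphrase);
      default:
        return nullptr;
      }
    }
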
+#include "librbd/io/ReadResult.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::luks::LoadRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace crypto { +namespace luks { + +using librbd::util::create_context_callback; + +template +LoadRequest::LoadRequest( + I* image_ctx, encryption_format_t format, std::string_view passphrase, + std::unique_ptr* result_crypto, + std::string* detected_format_name, + Context* on_finish) : m_image_ctx(image_ctx), + m_format(format), + m_passphrase(passphrase), + m_on_finish(on_finish), + m_result_crypto(result_crypto), + m_detected_format_name(detected_format_name), + m_initial_read_size(DEFAULT_INITIAL_READ_SIZE), + m_header(image_ctx->cct), m_offset(0) { +} + +template +void LoadRequest::set_initial_read_size(uint64_t read_size) { + m_initial_read_size = read_size; +} + +template +void LoadRequest::send() { + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_read_header>(this); + read(m_initial_read_size, ctx); +} + +template +void LoadRequest::read(uint64_t end_offset, Context* on_finish) { + auto length = end_offset - m_offset; + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, librbd::util::get_image_ctx(m_image_ctx), + io::AIO_TYPE_READ); + ZTracer::Trace trace; + auto req = io::ImageDispatchSpec::create_read( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + {{m_offset, length}}, io::ImageArea::DATA, io::ReadResult{&m_bl}, + m_image_ctx->get_data_io_context(), 0, 0, trace); + req->send(); +} + +template +bool LoadRequest::handle_read(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "error reading from image: " << cpp_strerror(r) + << dendl; + finish(r); + return false; + } + + // first, check LUKS magic at the beginning of the image + // If no magic is detected, caller may assume image is actually plaintext + if (m_offset == 0) { + if (Magic::is_luks(m_bl) > 0 || Magic::is_rbd_clone(m_bl) > 0) { + *m_detected_format_name = "LUKS"; + } else { + *m_detected_format_name = crypto::LoadRequest::UNKNOWN_FORMAT; + finish(-EINVAL); + return false; + } + + if (m_image_ctx->parent != nullptr && Magic::is_rbd_clone(m_bl) > 0) { + r = Magic::replace_magic(m_image_ctx->cct, m_bl); + if (r < 0) { + m_image_ctx->image_lock.lock_shared(); + auto image_size = m_image_ctx->get_image_size(m_image_ctx->snap_id); + m_image_ctx->image_lock.unlock_shared(); + + auto max_header_size = std::min(MAXIMUM_HEADER_SIZE, image_size); + + if (r == -EINVAL && m_bl.length() < max_header_size) { + m_bl.clear(); + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_read_header>(this); + read(max_header_size, ctx); + return false; + } + + lderr(m_image_ctx->cct) << "error replacing rbd clone magic: " + << cpp_strerror(r) << dendl; + finish(r); + return false; + } + } + } + + // setup interface with libcryptsetup + r = m_header.init(); + if (r < 0) { + finish(r); + return false; + } + + m_offset += m_bl.length(); + + // write header to libcryptsetup interface + r = m_header.write(m_bl); + if (r < 0) { + finish(r); + return false; + } + + m_bl.clear(); + + return true; +} + +template +void LoadRequest::handle_read_header(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (!handle_read(r)) { + return; + } + + const char* type; + switch (m_format) { + case RBD_ENCRYPTION_FORMAT_LUKS: + type = CRYPT_LUKS; + break; + case RBD_ENCRYPTION_FORMAT_LUKS1: + 
type = CRYPT_LUKS1; + break; + case RBD_ENCRYPTION_FORMAT_LUKS2: + type = CRYPT_LUKS2; + break; + default: + lderr(m_image_ctx->cct) << "unsupported format type: " << m_format + << dendl; + finish(-EINVAL); + return; + } + + // parse header via libcryptsetup + r = m_header.load(type); + if (r != 0) { + if (m_offset < MAXIMUM_HEADER_SIZE) { + // perhaps we did not feed the entire header to libcryptsetup, retry + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_read_header>(this); + read(MAXIMUM_HEADER_SIZE, ctx); + return; + } + + finish(r); + return; + } + + // gets actual LUKS version (only used for logging) + ceph_assert(*m_detected_format_name == "LUKS"); + *m_detected_format_name = m_header.get_format_name(); + + auto cipher = m_header.get_cipher(); + if (strcmp(cipher, "aes") != 0) { + lderr(m_image_ctx->cct) << "unsupported cipher: " << cipher << dendl; + finish(-ENOTSUP); + return; + } + + auto cipher_mode = m_header.get_cipher_mode(); + if (strcmp(cipher_mode, "xts-plain64") != 0) { + lderr(m_image_ctx->cct) << "unsupported cipher mode: " << cipher_mode + << dendl; + finish(-ENOTSUP); + return; + } + + m_image_ctx->image_lock.lock_shared(); + uint64_t image_size = m_image_ctx->get_image_size(CEPH_NOSNAP); + m_image_ctx->image_lock.unlock_shared(); + + if (m_header.get_data_offset() > image_size) { + lderr(m_image_ctx->cct) << "image is too small, data offset " + << m_header.get_data_offset() << dendl; + finish(-EINVAL); + return; + } + + uint64_t stripe_period = m_image_ctx->get_stripe_period(); + if (m_header.get_data_offset() % stripe_period != 0) { + lderr(m_image_ctx->cct) << "incompatible stripe pattern, data offset " + << m_header.get_data_offset() << dendl; + finish(-EINVAL); + return; + } + + read_volume_key(); + return; +} + +template +void LoadRequest::handle_read_keyslots(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + if (!handle_read(r)) { + return; + } + + read_volume_key(); +} + +template +void LoadRequest::read_volume_key() { + char volume_key[64]; + size_t volume_key_size = sizeof(volume_key); + + auto r = m_header.read_volume_key( + m_passphrase.data(), m_passphrase.size(), + reinterpret_cast(volume_key), &volume_key_size); + if (r != 0) { + auto keyslots_end_offset = m_header.get_data_offset(); + if (m_offset < keyslots_end_offset) { + // perhaps we did not feed the necessary keyslot, retry + auto ctx = create_context_callback< + LoadRequest, &LoadRequest::handle_read_keyslots>(this); + read(keyslots_end_offset, ctx); + return; + } + + finish(r); + return; + } + + r = util::build_crypto( + m_image_ctx->cct, reinterpret_cast(volume_key), + volume_key_size, m_header.get_sector_size(), + m_header.get_data_offset(), m_result_crypto); + ceph_memzero_s(volume_key, 64, 64); + finish(r); +} + +template +void LoadRequest::finish(int r) { + ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace luks +} // namespace crypto +} // namespace librbd + +template class librbd::crypto::luks::LoadRequest; diff --git a/src/librbd/crypto/luks/LoadRequest.h b/src/librbd/crypto/luks/LoadRequest.h new file mode 100644 index 000000000..60ed9a4a4 --- /dev/null +++ b/src/librbd/crypto/luks/LoadRequest.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LUKS_LOAD_REQUEST_H +#define CEPH_LIBRBD_CRYPTO_LUKS_LOAD_REQUEST_H + +#include +#include "include/rbd/librbd.hpp" +#include 
"librbd/ImageCtx.h" +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/crypto/luks/Header.h" + +namespace librbd { + +class ImageCtx; + +namespace crypto { +namespace luks { + +// max header size in LUKS1/2 (excl. keyslots) is 4MB +const uint64_t MAXIMUM_HEADER_SIZE = 4 * 1024 * 1024; +// default header size in LUKS2 2 X 16KB + 1 X 256KB keyslot +const uint64_t DEFAULT_INITIAL_READ_SIZE = 288 * 1024; + +template +class LoadRequest { +public: + static LoadRequest* create( + I* image_ctx, encryption_format_t format, + std::string_view passphrase, + std::unique_ptr* result_crypto, + std::string* detected_format_name, + Context* on_finish) { + return new LoadRequest(image_ctx, format, passphrase, result_crypto, + detected_format_name, on_finish); + } + + LoadRequest(I* image_ctx, encryption_format_t format, + std::string_view passphrase, + std::unique_ptr* result_crypto, + std::string* detected_format_name, Context* on_finish); + void send(); + void finish(int r); + void set_initial_read_size(uint64_t read_size); + +private: + I* m_image_ctx; + encryption_format_t m_format; + std::string_view m_passphrase; + Context* m_on_finish; + ceph::bufferlist m_bl; + std::unique_ptr* m_result_crypto; + std::string* m_detected_format_name; + uint64_t m_initial_read_size; + Header m_header; + uint64_t m_offset; + + void read(uint64_t end_offset, Context* on_finish); + bool handle_read(int r); + void handle_read_header(int r); + void handle_read_keyslots(int r); + void read_volume_key(); +}; + +} // namespace luks +} // namespace crypto +} // namespace librbd + +extern template class librbd::crypto::luks::LoadRequest; + +#endif // CEPH_LIBRBD_CRYPTO_LUKS_LOAD_REQUEST_H diff --git a/src/librbd/crypto/luks/Magic.cc b/src/librbd/crypto/luks/Magic.cc new file mode 100644 index 000000000..bc5e19704 --- /dev/null +++ b/src/librbd/crypto/luks/Magic.cc @@ -0,0 +1,139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Magic.h" + +#include "common/dout.h" +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::crypto::luks::Magic: " << __func__ \ + << ": " + +namespace librbd { +namespace crypto { +namespace luks { + +namespace { + +constexpr uint64_t MAGIC_LENGTH = 6; +const std::string LUKS_MAGIC = "LUKS\xba\xbe"; +const std::string RBD_CLONE_MAGIC = "RBDL\xba\xbe"; + +} // anonymous namespace + +int Magic::read(ceph::bufferlist &bl, uint32_t bl_off, + uint32_t read_size, char* result) { + if (bl_off + read_size > bl.length()) { + return -EINVAL; + } + + memcpy(result, bl.c_str() + bl_off, read_size); + return 0; +} + +int Magic::cmp(ceph::bufferlist &bl, uint32_t bl_off, + const std::string &cmp_str) { + auto cmp_length = cmp_str.length(); + + if (bl_off + cmp_length > bl.length()) { + return -EINVAL; + } + + if (memcmp(bl.c_str() + bl_off, cmp_str.c_str(), cmp_length)) { + return 0; + } + + return 1; +} + +int Magic::is_luks(ceph::bufferlist& bl) { + return cmp(bl, 0, LUKS_MAGIC); +} + +int Magic::is_rbd_clone(ceph::bufferlist& bl) { + return cmp(bl, 0, RBD_CLONE_MAGIC); +} + +void Magic::transform_secondary_header_magic(char* magic) { + std::swap(magic[0], magic[3]); + std::swap(magic[1], magic[2]); +} + +int Magic::replace_magic(CephContext* cct, ceph::bufferlist& bl) { + const std::string *old_magic, *new_magic; + if (is_luks(bl) > 0) { + old_magic = &LUKS_MAGIC; + new_magic = &RBD_CLONE_MAGIC; + } else if (is_rbd_clone(bl) > 0) { + old_magic = &RBD_CLONE_MAGIC; + 
new_magic = &LUKS_MAGIC; + } else { + lderr(cct) << "invalid magic: " << dendl; + return -EILSEQ; + } + + // read luks version + uint16_t version; + auto r = read(bl, MAGIC_LENGTH, sizeof(version), (char*)&version); + if (r < 0) { + lderr(cct) << "cannot read header version: " << cpp_strerror(r) << dendl; + return r; + } + boost::endian::big_to_native_inplace(version); + + switch (version) { + case 1: { + // LUKS1, no secondary header + break; + } + case 2: { + // LUKS2, secondary header follows primary header + // read header size + uint64_t hdr_size; + r = read(bl, MAGIC_LENGTH + sizeof(version), sizeof(hdr_size), + (char*)&hdr_size); + if (r < 0) { + lderr(cct) << "cannot read header size: " << cpp_strerror(r) << dendl; + return r; + } + boost::endian::big_to_native_inplace(hdr_size); + + if ((uint32_t)hdr_size + MAGIC_LENGTH > bl.length()) { + ldout(cct, 20) << "cannot replace secondary header magic" << dendl; + return -EINVAL; + } + + // check secondary header magic + auto secondary_header_magic = bl.c_str() + hdr_size; + transform_secondary_header_magic(secondary_header_magic); + auto is_secondary_header_magic_valid = + !memcmp(secondary_header_magic, old_magic->c_str(), MAGIC_LENGTH); + if (!is_secondary_header_magic_valid) { + transform_secondary_header_magic(secondary_header_magic); + lderr(cct) << "invalid secondary header magic" << dendl; + return -EILSEQ; + } + + // replace secondary header magic + memcpy(secondary_header_magic, new_magic->c_str(), MAGIC_LENGTH); + transform_secondary_header_magic(secondary_header_magic); + + break; + } + default: { + lderr(cct) << "bad header version: " << version << dendl; + return -EINVAL; + } + } + + // switch primary header magic + memcpy(bl.c_str(), new_magic->c_str(), MAGIC_LENGTH); + + return 0; +} + +} // namespace luks +} // namespace crypto +} // namespace librbd diff --git a/src/librbd/crypto/luks/Magic.h b/src/librbd/crypto/luks/Magic.h new file mode 100644 index 000000000..ad06e67f5 --- /dev/null +++ b/src/librbd/crypto/luks/Magic.h @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_LUKS_MAGIC_H +#define CEPH_LIBRBD_CRYPTO_LUKS_MAGIC_H + +#include "common/ceph_context.h" +#include "include/buffer.h" + +namespace librbd { +namespace crypto { +namespace luks { + +class Magic { +public: + static int is_luks(ceph::bufferlist& bl); + static int is_rbd_clone(ceph::bufferlist& bl); + + static int replace_magic(CephContext* cct, ceph::bufferlist& bl); +private: + static int read(ceph::bufferlist& bl, uint32_t bl_off, + uint32_t read_size, char* result); + static int cmp(ceph::bufferlist& bl, uint32_t bl_off, + const std::string& cmp_str); + static void transform_secondary_header_magic(char* magic); +}; + +} // namespace luks +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_LUKS_MAGIC_H diff --git a/src/librbd/crypto/openssl/DataCryptor.cc b/src/librbd/crypto/openssl/DataCryptor.cc new file mode 100644 index 000000000..aa9427a79 --- /dev/null +++ b/src/librbd/crypto/openssl/DataCryptor.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/crypto/openssl/DataCryptor.h" +#include +#include +#include "include/ceph_assert.h" +#include "include/compat.h" + +namespace librbd { +namespace crypto { +namespace openssl { + +int DataCryptor::init(const char* cipher_name, const unsigned char* key, + uint16_t key_length) { + if (m_key != 
nullptr) { + ceph_memzero_s(m_key, m_key_size, m_key_size); + delete [] m_key; + m_key = nullptr; + m_key_size = 0; + } + if (cipher_name == nullptr) { + lderr(m_cct) << "missing cipher name" << dendl; + return -EINVAL; + } + if (key == nullptr) { + lderr(m_cct) << "missing key" << dendl; + return -EINVAL; + } + + m_cipher = EVP_get_cipherbyname(cipher_name); + if (m_cipher == nullptr) { + lderr(m_cct) << "EVP_get_cipherbyname failed. Cipher name: " << cipher_name + << dendl; + log_errors(); + return -EINVAL; + } + + auto expected_key_length = EVP_CIPHER_key_length(m_cipher); + if (expected_key_length != key_length) { + lderr(m_cct) << "cipher " << cipher_name << " expects key of " + << expected_key_length << " bytes. got: " << key_length + << dendl; + return -EINVAL; + } + + m_key_size = key_length; + m_key = new unsigned char[key_length]; + memcpy(m_key, key, key_length); + m_iv_size = static_cast(EVP_CIPHER_iv_length(m_cipher)); + return 0; +} + +DataCryptor::~DataCryptor() { + if (m_key != nullptr) { + ceph_memzero_s(m_key, m_key_size, m_key_size); + delete [] m_key; + m_key = nullptr; + } +} + +uint32_t DataCryptor::get_block_size() const { + return EVP_CIPHER_block_size(m_cipher); +} + +uint32_t DataCryptor::get_iv_size() const { + return m_iv_size; +} + +const unsigned char* DataCryptor::get_key() const { + return m_key; +} + +int DataCryptor::get_key_length() const { + return EVP_CIPHER_key_length(m_cipher); +} + +EVP_CIPHER_CTX* DataCryptor::get_context(CipherMode mode) { + int enc; + switch(mode) { + case CIPHER_MODE_ENC: + enc = 1; + break; + case CIPHER_MODE_DEC: + enc = 0; + break; + default: + lderr(m_cct) << "Invalid CipherMode:" << mode << dendl; + return nullptr; + } + + auto ctx = EVP_CIPHER_CTX_new(); + if (ctx == nullptr) { + lderr(m_cct) << "EVP_CIPHER_CTX_new failed" << dendl; + log_errors(); + return nullptr; + } + + if (1 != EVP_CipherInit_ex(ctx, m_cipher, nullptr, m_key, nullptr, enc)) { + lderr(m_cct) << "EVP_CipherInit_ex failed" << dendl; + log_errors(); + return nullptr; + } + + return ctx; +} + +void DataCryptor::return_context(EVP_CIPHER_CTX* ctx, CipherMode mode) { + if (ctx != nullptr) { + EVP_CIPHER_CTX_free(ctx); + } +} + +int DataCryptor::init_context(EVP_CIPHER_CTX* ctx, const unsigned char* iv, + uint32_t iv_length) const { + if (iv_length != m_iv_size) { + lderr(m_cct) << "cipher expects IV of " << m_iv_size << " bytes. got: " + << iv_length << dendl; + return -EINVAL; + } + if (1 != EVP_CipherInit_ex(ctx, nullptr, nullptr, nullptr, iv, -1)) { + lderr(m_cct) << "EVP_CipherInit_ex failed" << dendl; + log_errors(); + return -EIO; + } + return 0; +} + +int DataCryptor::update_context(EVP_CIPHER_CTX* ctx, const unsigned char* in, + unsigned char* out, uint32_t len) const { + int out_length; + if (1 != EVP_CipherUpdate(ctx, out, &out_length, in, len)) { + lderr(m_cct) << "EVP_CipherUpdate failed. 
len=" << len << dendl; + log_errors(); + return -EIO; + } + return out_length; +} + +void DataCryptor::log_errors() const { + while (true) { + auto error = ERR_get_error(); + if (error == 0) { + break; + } + lderr(m_cct) << "OpenSSL error: " << ERR_error_string(error, nullptr) + << dendl; + } +} + +} // namespace openssl +} // namespace crypto +} // namespace librbd diff --git a/src/librbd/crypto/openssl/DataCryptor.h b/src/librbd/crypto/openssl/DataCryptor.h new file mode 100644 index 000000000..af6956883 --- /dev/null +++ b/src/librbd/crypto/openssl/DataCryptor.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_CRYPTO_OPENSSL_DATA_CRYPTOR_H +#define CEPH_LIBRBD_CRYPTO_OPENSSL_DATA_CRYPTOR_H + +#include "librbd/crypto/DataCryptor.h" +#include "include/Context.h" +#include + +namespace librbd { +namespace crypto { +namespace openssl { + +class DataCryptor : public crypto::DataCryptor { + +public: + DataCryptor(CephContext* cct) : m_cct(cct) {}; + ~DataCryptor(); + + int init(const char* cipher_name, const unsigned char* key, + uint16_t key_length); + uint32_t get_block_size() const override; + uint32_t get_iv_size() const override; + const unsigned char* get_key() const override; + int get_key_length() const override; + + EVP_CIPHER_CTX* get_context(CipherMode mode) override; + void return_context(EVP_CIPHER_CTX* ctx, CipherMode mode) override; + int init_context(EVP_CIPHER_CTX* ctx, const unsigned char* iv, + uint32_t iv_length) const override; + int update_context(EVP_CIPHER_CTX* ctx, const unsigned char* in, + unsigned char* out, uint32_t len) const override; + +private: + CephContext* m_cct; + unsigned char* m_key = nullptr; + uint16_t m_key_size = 0; + const EVP_CIPHER* m_cipher; + uint32_t m_iv_size; + + void log_errors() const; +}; + +} // namespace openssl +} // namespace crypto +} // namespace librbd + +#endif // CEPH_LIBRBD_CRYPTO_OPENSSL_DATA_CRYPTOR_H diff --git a/src/librbd/deep_copy/Handler.h b/src/librbd/deep_copy/Handler.h new file mode 100644 index 000000000..fea553ee2 --- /dev/null +++ b/src/librbd/deep_copy/Handler.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_HANDLER_H +#define CEPH_LIBRBD_DEEP_COPY_HANDLER_H + +#include "include/int_types.h" +#include "include/rbd/librbd.hpp" + +namespace librbd { +namespace deep_copy { + +struct Handler { + virtual ~Handler() {} + + virtual void handle_read(uint64_t bytes_read) = 0; + + virtual int update_progress(uint64_t object_number, + uint64_t object_count) = 0; +}; + +struct NoOpHandler : public Handler { + void handle_read(uint64_t bytes_read) override { + } + + int update_progress(uint64_t object_number, + uint64_t object_count) override { + return 0; + } +}; + +class ProgressHandler : public NoOpHandler { +public: + ProgressHandler(ProgressContext* progress_ctx) + : m_progress_ctx(progress_ctx) { + } + + int update_progress(uint64_t object_number, + uint64_t object_count) override { + return m_progress_ctx->update_progress(object_number, object_count); + } + +private: + librbd::ProgressContext* m_progress_ctx; +}; + +} // namespace deep_copy +} // namespace librbd + +#endif // CEPH_LIBRBD_DEEP_COPY_HANDLER_H diff --git a/src/librbd/deep_copy/ImageCopyRequest.cc b/src/librbd/deep_copy/ImageCopyRequest.cc new file mode 100644 index 000000000..08e959dd5 --- /dev/null +++ b/src/librbd/deep_copy/ImageCopyRequest.cc @@ -0,0 +1,278 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ImageCopyRequest.h" +#include "ObjectCopyRequest.h" +#include "common/errno.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/deep_copy/Handler.h" +#include "librbd/deep_copy/Utils.h" +#include "librbd/object_map/DiffRequest.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::ImageCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; + +template +ImageCopyRequest::ImageCopyRequest(I *src_image_ctx, I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + const ObjectNumber &object_number, + const SnapSeqs &snap_seqs, + Handler *handler, + Context *on_finish) + : RefCountedObject(dst_image_ctx->cct), m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start), + m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start), + m_flatten(flatten), m_object_number(object_number), m_snap_seqs(snap_seqs), + m_handler(handler), m_on_finish(on_finish), m_cct(dst_image_ctx->cct), + m_lock(ceph::make_mutex(unique_lock_name("ImageCopyRequest::m_lock", this))) { +} + +template +void ImageCopyRequest::send() { + m_dst_image_ctx->image_lock.lock_shared(); + util::compute_snap_map(m_dst_image_ctx->cct, m_src_snap_id_start, + m_src_snap_id_end, m_dst_image_ctx->snaps, m_snap_seqs, + &m_snap_map); + m_dst_image_ctx->image_lock.unlock_shared(); + + if (m_snap_map.empty()) { + lderr(m_cct) << "failed to map snapshots within boundary" << dendl; + finish(-EINVAL); + return; + } + + compute_diff(); +} + +template +void ImageCopyRequest::cancel() { + std::lock_guard locker{m_lock}; + + ldout(m_cct, 20) << dendl; + m_canceled = true; +} + +template +void ImageCopyRequest::map_src_objects(uint64_t dst_object, + std::set *src_objects) { + std::vector> image_extents; + Striper::extent_to_file(m_cct, &m_dst_image_ctx->layout, dst_object, 0, + m_dst_image_ctx->layout.object_size, image_extents); + + for (auto &e : image_extents) { + std::map> src_object_extents; + Striper::file_to_extents(m_cct, m_src_image_ctx->format_string, + &m_src_image_ctx->layout, e.first, e.second, 0, + src_object_extents); + for (auto &p : src_object_extents) { + for (auto &s : p.second) { + src_objects->insert(s.objectno); + } + } + } + + ceph_assert(!src_objects->empty()); + + ldout(m_cct, 20) << dst_object << " -> " << *src_objects << dendl; +} + +template +void ImageCopyRequest::compute_diff() { + if (m_flatten) { + send_object_copies(); + return; + } + + ldout(m_cct, 10) << dendl; + + auto ctx = create_context_callback< + ImageCopyRequest, &ImageCopyRequest::handle_compute_diff>(this); + auto req = object_map::DiffRequest::create(m_src_image_ctx, m_src_snap_id_start, + m_src_snap_id_end, &m_object_diff_state, + ctx); + req->send(); +} + +template +void ImageCopyRequest::handle_compute_diff(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + ldout(m_cct, 10) << "fast-diff optimization disabled" << dendl; + m_object_diff_state.resize(0); + } + + send_object_copies(); +} + +template +void ImageCopyRequest::send_object_copies() { + m_object_no = 0; + if (m_object_number) 
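+ // (m_object_number is the resume cursor: when a previous copy attempt
+ // recorded progress, restart at the object following it.)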
{ + m_object_no = *m_object_number + 1; + } + + uint64_t size; + { + std::shared_lock image_locker{m_src_image_ctx->image_lock}; + size = m_src_image_ctx->get_image_size(CEPH_NOSNAP); + for (auto snap_id : m_src_image_ctx->snaps) { + size = std::max(size, m_src_image_ctx->get_image_size(snap_id)); + } + } + m_end_object_no = Striper::get_num_objects(m_dst_image_ctx->layout, size); + + ldout(m_cct, 20) << "start_object=" << m_object_no << ", " + << "end_object=" << m_end_object_no << dendl; + + bool complete; + { + std::lock_guard locker{m_lock}; + auto max_ops = m_src_image_ctx->config.template get_val( + "rbd_concurrent_management_ops"); + + // attempt to schedule at least 'max_ops' initial requests where + // some objects might be skipped if fast-diff notes no change + for (uint64_t i = 0; i < max_ops; i++) { + send_next_object_copy(); + } + + complete = (m_current_ops == 0) && !m_updating_progress; + } + + if (complete) { + finish(m_ret_val); + } +} + +template +void ImageCopyRequest::send_next_object_copy() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + if (m_canceled && m_ret_val == 0) { + ldout(m_cct, 10) << "image copy canceled" << dendl; + m_ret_val = -ECANCELED; + } + + if (m_ret_val < 0 || m_object_no >= m_end_object_no) { + return; + } + + uint64_t ono = m_object_no++; + Context *ctx = new LambdaContext( + [this, ono](int r) { + handle_object_copy(ono, r); + }); + + ldout(m_cct, 20) << "object_num=" << ono << dendl; + ++m_current_ops; + + uint8_t object_diff_state = object_map::DIFF_STATE_HOLE; + if (m_object_diff_state.size() > 0) { + std::set src_objects; + map_src_objects(ono, &src_objects); + + for (auto src_ono : src_objects) { + if (src_ono >= m_object_diff_state.size()) { + object_diff_state = object_map::DIFF_STATE_DATA_UPDATED; + } else { + auto state = m_object_diff_state[src_ono]; + if ((state == object_map::DIFF_STATE_HOLE_UPDATED && + object_diff_state != object_map::DIFF_STATE_DATA_UPDATED) || + (state == object_map::DIFF_STATE_DATA && + object_diff_state == object_map::DIFF_STATE_HOLE) || + (state == object_map::DIFF_STATE_DATA_UPDATED)) { + object_diff_state = state; + } + } + } + + if (object_diff_state == object_map::DIFF_STATE_HOLE) { + ldout(m_cct, 20) << "skipping non-existent object " << ono << dendl; + create_async_context_callback(*m_src_image_ctx, ctx)->complete(0); + return; + } + } + + uint32_t flags = 0; + if (m_flatten) { + flags |= OBJECT_COPY_REQUEST_FLAG_FLATTEN; + } + if (object_diff_state == object_map::DIFF_STATE_DATA) { + // no source objects have been updated and at least one has clean data + flags |= OBJECT_COPY_REQUEST_FLAG_EXISTS_CLEAN; + } + + auto req = ObjectCopyRequest::create( + m_src_image_ctx, m_dst_image_ctx, m_src_snap_id_start, m_dst_snap_id_start, + m_snap_map, ono, flags, m_handler, ctx); + req->send(); +} + +template +void ImageCopyRequest::handle_object_copy(uint64_t object_no, int r) { + ldout(m_cct, 20) << "object_no=" << object_no << ", r=" << r << dendl; + + bool complete; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_current_ops > 0); + --m_current_ops; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "object copy failed: " << cpp_strerror(r) << dendl; + if (m_ret_val == 0) { + m_ret_val = r; + } + } else { + m_copied_objects.push(object_no); + while (!m_updating_progress && !m_copied_objects.empty() && + m_copied_objects.top() == + (m_object_number ? 
*m_object_number + 1 : 0)) { + m_object_number = m_copied_objects.top(); + m_copied_objects.pop(); + uint64_t progress_object_no = *m_object_number + 1; + m_updating_progress = true; + m_lock.unlock(); + m_handler->update_progress(progress_object_no, m_end_object_no); + m_lock.lock(); + ceph_assert(m_updating_progress); + m_updating_progress = false; + } + } + + send_next_object_copy(); + complete = (m_current_ops == 0) && !m_updating_progress; + } + + if (complete) { + finish(m_ret_val); + } +} + +template +void ImageCopyRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + put(); +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::ImageCopyRequest; diff --git a/src/librbd/deep_copy/ImageCopyRequest.h b/src/librbd/deep_copy/ImageCopyRequest.h new file mode 100644 index 000000000..cb8b83781 --- /dev/null +++ b/src/librbd/deep_copy/ImageCopyRequest.h @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/bit_vector.hpp" +#include "common/ceph_mutex.h" +#include "common/RefCountedObj.h" +#include "librbd/Types.h" +#include "librbd/deep_copy/Types.h" +#include +#include +#include +#include +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace deep_copy { + +class Handler; + +template +class ImageCopyRequest : public RefCountedObject { +public: + static ImageCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + const ObjectNumber &object_number, + const SnapSeqs &snap_seqs, + Handler *handler, + Context *on_finish) { + return new ImageCopyRequest(src_image_ctx, dst_image_ctx, src_snap_id_start, + src_snap_id_end, dst_snap_id_start, flatten, + object_number, snap_seqs, handler, on_finish); + } + + ImageCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, const ObjectNumber &object_number, + const SnapSeqs &snap_seqs, Handler *handler, + Context *on_finish); + + void send(); + void cancel(); + +private: + /** + * @verbatim + * + * + * | + * v + * COMPUTE_DIFF + * | + * | . . . . . + * | . . (parallel execution of + * v v . multiple objects at once) + * COPY_OBJECT . . . . 
+ * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + librados::snap_t m_src_snap_id_start; + librados::snap_t m_src_snap_id_end; + librados::snap_t m_dst_snap_id_start; + bool m_flatten; + ObjectNumber m_object_number; + SnapSeqs m_snap_seqs; + Handler *m_handler; + Context *m_on_finish; + + CephContext *m_cct; + ceph::mutex m_lock; + bool m_canceled = false; + + uint64_t m_object_no = 0; + uint64_t m_end_object_no = 0; + uint64_t m_current_ops = 0; + std::priority_queue< + uint64_t, std::vector, std::greater> m_copied_objects; + bool m_updating_progress = false; + SnapMap m_snap_map; + int m_ret_val = 0; + + BitVector<2> m_object_diff_state; + + void map_src_objects(uint64_t dst_object, std::set *src_objects); + + void compute_diff(); + void handle_compute_diff(int r); + + void send_object_copies(); + void send_next_object_copy(); + void handle_object_copy(uint64_t object_no, int r); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::ImageCopyRequest; + +#endif // CEPH_LIBRBD_DEEP_COPY_IMAGE_DEEP_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/MetadataCopyRequest.cc b/src/librbd/deep_copy/MetadataCopyRequest.cc new file mode 100644 index 000000000..c584bea54 --- /dev/null +++ b/src/librbd/deep_copy/MetadataCopyRequest.cc @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MetadataCopyRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/Utils.h" +#include "librbd/image/GetMetadataRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::MetadataCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +namespace { + +const uint64_t MAX_METADATA_ITEMS = 128; + +} // anonymous namespace + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +MetadataCopyRequest::MetadataCopyRequest(I *src_image_ctx, I *dst_image_ctx, + Context *on_finish) + : m_src_image_ctx(src_image_ctx), m_dst_image_ctx(dst_image_ctx), + m_on_finish(on_finish), m_cct(dst_image_ctx->cct) { +} + +template +void MetadataCopyRequest::send() { + list_src_metadata(); +} + +template +void MetadataCopyRequest::list_src_metadata() { + ldout(m_cct, 20) << "start_key=" << m_last_metadata_key << dendl; + + m_metadata.clear(); + auto ctx = create_context_callback< + MetadataCopyRequest, + &MetadataCopyRequest::handle_list_src_metadata>(this); + auto req = image::GetMetadataRequest::create( + m_src_image_ctx->md_ctx, m_src_image_ctx->header_oid, true, "", + m_last_metadata_key, MAX_METADATA_ITEMS, &m_metadata, ctx); + req->send(); +} + +template +void MetadataCopyRequest::handle_list_src_metadata(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to retrieve metadata: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_metadata.empty()) { + finish(0); + return; + } + + m_last_metadata_key = m_metadata.rbegin()->first; + m_more_metadata = (m_metadata.size() >= MAX_METADATA_ITEMS); + set_dst_metadata(); +} + +template +void MetadataCopyRequest::set_dst_metadata() { + ldout(m_cct, 20) << "count=" << m_metadata.size() << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::metadata_set(&op, m_metadata); + + librados::AioCompletion *aio_comp = 
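// A minimal standalone sketch (not part of this patch) of the ordered
// progress tracking that m_copied_objects and handle_object_copy() above
// implement: completions arrive out of order, go into a min-heap, and are
// drained only while they form a contiguous run after the last reported
// object. Names here are hypothetical stand-ins.
#include <cstdint>
#include <functional>
#include <optional>
#include <queue>
#include <vector>

// Returns the new high-water mark (highest contiguously completed object),
// or nullopt when nothing new can be reported yet; the caller would report
// *result + 1 as the number of fully copied objects.
std::optional<uint64_t> advance_progress(
    std::priority_queue<uint64_t, std::vector<uint64_t>,
                        std::greater<uint64_t>>& completed,
    std::optional<uint64_t>& last_reported) {
  std::optional<uint64_t> updated;
  while (!completed.empty() &&
         completed.top() == (last_reported ? *last_reported + 1 : 0)) {
    last_reported = completed.top();  // extend the contiguous run
    completed.pop();
    updated = last_reported;
  }
  return updated;
}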
create_rados_callback< + MetadataCopyRequest, + &MetadataCopyRequest::handle_set_dst_metadata>(this); + m_dst_image_ctx->md_ctx.aio_operate(m_dst_image_ctx->header_oid, aio_comp, + &op); + aio_comp->release(); +} + +template +void MetadataCopyRequest::handle_set_dst_metadata(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to set metadata: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_more_metadata) { + list_src_metadata(); + return; + } + + finish(0); +} + +template +void MetadataCopyRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + m_on_finish->complete(r); + delete this; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::MetadataCopyRequest; diff --git a/src/librbd/deep_copy/MetadataCopyRequest.h b/src/librbd/deep_copy/MetadataCopyRequest.h new file mode 100644 index 000000000..8db55db96 --- /dev/null +++ b/src/librbd/deep_copy/MetadataCopyRequest.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/ImageCtx.h" +#include +#include + +class Context; + +namespace librbd { +namespace deep_copy { + +template +class MetadataCopyRequest { +public: + static MetadataCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + Context *on_finish) { + return new MetadataCopyRequest(src_image_ctx, dst_image_ctx, on_finish); + } + + MetadataCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * LIST_SRC_METADATA <------\ + * | | (repeat if additional + * v | metadata) + * SET_DST_METADATA --------/ + * | + * v + * + * + * @endverbatim + */ + typedef std::map Metadata; + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + Context *m_on_finish; + + CephContext *m_cct; + bufferlist m_out_bl; + + std::map m_metadata; + std::string m_last_metadata_key; + bool m_more_metadata = false; + + void list_src_metadata(); + void handle_list_src_metadata(int r); + + void set_dst_metadata(); + void handle_set_dst_metadata(int r); + + void finish(int r); + +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::MetadataCopyRequest; + +#endif // CEPH_LIBRBD_DEEP_COPY_METADATA_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/ObjectCopyRequest.cc b/src/librbd/deep_copy/ObjectCopyRequest.cc new file mode 100644 index 000000000..e8b42b68f --- /dev/null +++ b/src/librbd/deep_copy/ObjectCopyRequest.cc @@ -0,0 +1,839 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ObjectCopyRequest.h" +#include "include/neorados/RADOS.hpp" +#include "common/errno.h" +#include "librados/snap_set_diff.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/deep_copy/Handler.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Utils.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix 
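// A minimal sketch (illustration only, not part of this patch) of the
// pagination pattern MetadataCopyRequest follows above: fetch at most a
// fixed batch of keys starting after the last one seen, apply the batch,
// and keep going while full batches come back. The fetch/apply callables
// are hypothetical stand-ins for GetMetadataRequest and metadata_set.
#include <functional>
#include <map>
#include <string>

using Batch = std::map<std::string, std::string>;

void copy_all_metadata(
    const std::function<Batch(const std::string& start_after,
                              std::size_t max)>& fetch,
    const std::function<void(const Batch&)>& apply,
    std::size_t max_items = 128) {
  std::string last_key;
  while (true) {
    Batch batch = fetch(last_key, max_items);
    if (batch.empty()) {
      break;                           // nothing left to copy
    }
    apply(batch);                      // mirrors set_dst_metadata()
    if (batch.size() < max_items) {
      break;                           // short batch: no more metadata
    }
    last_key = batch.rbegin()->first;  // resume after the highest key
  }
}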
+#define dout_prefix *_dout << "librbd::deep_copy::ObjectCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; +using librbd::util::get_image_ctx; + +template +ObjectCopyRequest::ObjectCopyRequest(I *src_image_ctx, + I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t dst_snap_id_start, + const SnapMap &snap_map, + uint64_t dst_object_number, + uint32_t flags, Handler* handler, + Context *on_finish) + : m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_cct(dst_image_ctx->cct), + m_src_snap_id_start(src_snap_id_start), + m_dst_snap_id_start(dst_snap_id_start), m_snap_map(snap_map), + m_dst_object_number(dst_object_number), m_flags(flags), + m_handler(handler), m_on_finish(on_finish) { + ceph_assert(src_image_ctx->data_ctx.is_valid()); + ceph_assert(dst_image_ctx->data_ctx.is_valid()); + ceph_assert(!m_snap_map.empty()); + + m_src_async_op = new io::AsyncOperation(); + m_src_async_op->start_op(*get_image_ctx(m_src_image_ctx)); + + m_src_io_ctx.dup(m_src_image_ctx->data_ctx); + m_dst_io_ctx.dup(m_dst_image_ctx->data_ctx); + + m_dst_oid = m_dst_image_ctx->get_object_name(dst_object_number); + + ldout(m_cct, 20) << "dst_oid=" << m_dst_oid << ", " + << "src_snap_id_start=" << m_src_snap_id_start << ", " + << "dst_snap_id_start=" << m_dst_snap_id_start << ", " + << "snap_map=" << m_snap_map << dendl; +} + +template +void ObjectCopyRequest::send() { + send_list_snaps(); +} + +template +void ObjectCopyRequest::send_list_snaps() { + // image extents are consistent across src and dst so compute once + std::tie(m_image_extents, m_image_area) = io::util::object_to_area_extents( + m_dst_image_ctx, m_dst_object_number, + {{0, m_dst_image_ctx->layout.object_size}}); + ldout(m_cct, 20) << "image_extents=" << m_image_extents + << " area=" << m_image_area << dendl; + + auto ctx = create_async_context_callback( + *m_src_image_ctx, create_context_callback< + ObjectCopyRequest, &ObjectCopyRequest::handle_list_snaps>(this)); + if ((m_flags & OBJECT_COPY_REQUEST_FLAG_EXISTS_CLEAN) != 0) { + // skip listing the snaps if we know the destination exists and is clean, + // but we do need to update the object-map + ctx->complete(0); + return; + } + + io::SnapIds snap_ids; + snap_ids.reserve(1 + m_snap_map.size()); + snap_ids.push_back(m_src_snap_id_start); + for (auto& [src_snap_id, _] : m_snap_map) { + if (m_src_snap_id_start < src_snap_id) { + snap_ids.push_back(src_snap_id); + } + } + + auto list_snaps_flags = io::LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT; + + m_snapshot_delta.clear(); + + auto aio_comp = io::AioCompletion::create_and_start( + ctx, get_image_ctx(m_src_image_ctx), io::AIO_TYPE_GENERIC); + auto req = io::ImageDispatchSpec::create_list_snaps( + *m_src_image_ctx, io::IMAGE_DISPATCH_LAYER_NONE, aio_comp, + io::Extents{m_image_extents}, m_image_area, std::move(snap_ids), + list_snaps_flags, &m_snapshot_delta, {}); + req->send(); +} + +template +void ObjectCopyRequest::handle_list_snaps(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to list snaps: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + ldout(m_cct, 20) << "snapshot_delta=" << m_snapshot_delta << dendl; + + compute_dst_object_may_exist(); + compute_read_ops(); + + send_read(); +} + +template +void ObjectCopyRequest::send_read() { + if (m_read_snaps.empty()) { + // 
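// A plain-STL restatement (not part of this patch) of how send_list_snaps()
// below assembles the snapshot id list: the start snap id first, followed by
// every mapped source snap id newer than it. std::map stands in for the
// librbd SnapMap type.
#include <cstdint>
#include <map>
#include <vector>

std::vector<uint64_t> build_snap_ids(
    uint64_t src_snap_id_start,
    const std::map<uint64_t, std::vector<uint64_t>>& snap_map) {
  std::vector<uint64_t> snap_ids;
  snap_ids.reserve(1 + snap_map.size());
  snap_ids.push_back(src_snap_id_start);
  for (const auto& [src_snap_id, dst_snap_ids] : snap_map) {
    if (src_snap_id_start < src_snap_id) {
      snap_ids.push_back(src_snap_id);  // only snapshots newer than the start
    }
  }
  return snap_ids;
}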
all snapshots have been read + merge_write_ops(); + compute_zero_ops(); + + send_update_object_map(); + return; + } + + auto index = *m_read_snaps.begin(); + auto& read_op = m_read_ops[index]; + if (read_op.image_interval.empty()) { + // nothing written to this object for this snapshot (must be trunc/remove) + handle_read(0); + return; + } + + auto io_context = m_src_image_ctx->duplicate_data_io_context(); + io_context->read_snap(index.second); + + io::Extents image_extents{read_op.image_interval.begin(), + read_op.image_interval.end()}; + io::ReadResult read_result{&read_op.image_extent_map, + &read_op.out_bl}; + + ldout(m_cct, 20) << "read: src_snap_seq=" << index.second << ", " + << "image_extents=" << image_extents << dendl; + + int op_flags = (LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + + int read_flags = 0; + if (index.second != m_src_image_ctx->snap_id) { + read_flags |= io::READ_FLAG_DISABLE_CLIPPING; + } + + auto ctx = create_context_callback< + ObjectCopyRequest, &ObjectCopyRequest::handle_read>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, get_image_ctx(m_src_image_ctx), io::AIO_TYPE_READ); + + auto req = io::ImageDispatchSpec::create_read( + *m_src_image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp, + std::move(image_extents), m_image_area, std::move(read_result), + io_context, op_flags, read_flags, {}); + req->send(); +} + +template +void ObjectCopyRequest::handle_read(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to read from source object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_handler != nullptr) { + auto index = *m_read_snaps.begin(); + auto& read_op = m_read_ops[index]; + m_handler->handle_read(read_op.out_bl.length()); + } + + ceph_assert(!m_read_snaps.empty()); + m_read_snaps.erase(m_read_snaps.begin()); + + send_read(); +} + +template +void ObjectCopyRequest::send_update_object_map() { + if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP) || + m_dst_object_state.empty()) { + process_copyup(); + return; + } + + m_dst_image_ctx->owner_lock.lock_shared(); + m_dst_image_ctx->image_lock.lock_shared(); + if (m_dst_image_ctx->object_map == nullptr) { + // possible that exclusive lock was lost in background + lderr(m_cct) << "object map is not initialized" << dendl; + + m_dst_image_ctx->image_lock.unlock_shared(); + m_dst_image_ctx->owner_lock.unlock_shared(); + finish(-EINVAL); + return; + } + + auto &dst_object_state = *m_dst_object_state.begin(); + auto it = m_snap_map.find(dst_object_state.first); + ceph_assert(it != m_snap_map.end()); + auto dst_snap_id = it->second.front(); + auto object_state = dst_object_state.second; + m_dst_object_state.erase(m_dst_object_state.begin()); + + ldout(m_cct, 20) << "dst_snap_id=" << dst_snap_id << ", object_state=" + << static_cast(object_state) << dendl; + + int r; + auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + m_dst_image_ctx->image_lock.unlock_shared(); + m_dst_image_ctx->owner_lock.unlock_shared(); + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_update_object_map(r); + finish_op_ctx->complete(0); + }); + + auto dst_image_ctx = m_dst_image_ctx; + bool sent = dst_image_ctx->object_map->template aio_update< + Context, &Context::complete>(dst_snap_id, m_dst_object_number, object_state, + {}, {}, false, ctx); + + // NOTE: state 
machine might complete before we reach here + dst_image_ctx->image_lock.unlock_shared(); + dst_image_ctx->owner_lock.unlock_shared(); + if (!sent) { + ceph_assert(dst_snap_id == CEPH_NOSNAP); + ctx->complete(0); + } +} + +template +void ObjectCopyRequest::handle_update_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (!m_dst_object_state.empty()) { + send_update_object_map(); + return; + } + + process_copyup(); +} + +template +void ObjectCopyRequest::process_copyup() { + if (m_snapshot_sparse_bufferlist.empty()) { + // no data to copy or truncate/zero. only the copyup state machine cares + // about whether the object exists or not, and it always copies from + // snap id 0. + finish(m_src_snap_id_start > 0 ? 0 : -ENOENT); + return; + } + + ldout(m_cct, 20) << dendl; + + // let dispatch layers have a chance to process the data but + // assume that the dispatch layer will only touch the sparse bufferlist + auto r = m_dst_image_ctx->io_object_dispatcher->prepare_copyup( + m_dst_object_number, &m_snapshot_sparse_bufferlist); + if (r < 0) { + lderr(m_cct) << "failed to prepare copyup data: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + send_write_object(); +} + +template +void ObjectCopyRequest::send_write_object() { + ceph_assert(!m_snapshot_sparse_bufferlist.empty()); + auto& sparse_bufferlist = m_snapshot_sparse_bufferlist.begin()->second; + + m_src_image_ctx->image_lock.lock_shared(); + bool hide_parent = (m_src_snap_id_start == 0 && + m_src_image_ctx->parent != nullptr); + m_src_image_ctx->image_lock.unlock_shared(); + + // retrieve the destination snap context for the op + SnapIds dst_snap_ids; + librados::snap_t dst_snap_seq = 0; + librados::snap_t src_snap_seq = m_snapshot_sparse_bufferlist.begin()->first; + if (src_snap_seq != 0) { + auto snap_map_it = m_snap_map.find(src_snap_seq); + ceph_assert(snap_map_it != m_snap_map.end()); + + auto dst_snap_id = snap_map_it->second.front(); + auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_id); + ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end()); + if (!dst_may_exist_it->second && !sparse_bufferlist.empty()) { + // if the object cannot exist, the only valid op is to remove it + ldout(m_cct, 20) << "object DNE: src_snap_seq=" << src_snap_seq << dendl; + ceph_assert(sparse_bufferlist.ext_count() == 1U); + ceph_assert(sparse_bufferlist.begin().get_val().state == + io::SPARSE_EXTENT_STATE_ZEROED && + sparse_bufferlist.begin().get_off() == 0 && + sparse_bufferlist.begin().get_len() == + m_dst_image_ctx->layout.object_size); + } + + // write snapshot context should be before actual snapshot + ceph_assert(!snap_map_it->second.empty()); + auto dst_snap_ids_it = snap_map_it->second.begin(); + ++dst_snap_ids_it; + + dst_snap_ids = SnapIds{dst_snap_ids_it, snap_map_it->second.end()}; + if (!dst_snap_ids.empty()) { + dst_snap_seq = dst_snap_ids.front(); + } + ceph_assert(dst_snap_seq != CEPH_NOSNAP); + } + + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", " + << "dst_snap_seq=" << dst_snap_seq << ", " + << "dst_snaps=" << dst_snap_ids << dendl; + + librados::ObjectWriteOperation op; + + bool migration = ((m_flags & OBJECT_COPY_REQUEST_FLAG_MIGRATION) != 0); + if (migration) { + ldout(m_cct, 20) << "assert_snapc_seq=" << dst_snap_seq << dendl; + cls_client::assert_snapc_seq(&op, dst_snap_seq, + cls::rbd::ASSERT_SNAPC_SEQ_GT_SNAPSET_SEQ); + } + + for (auto& sbe : 
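// A plain-STL sketch (hypothetical helper, not part of this patch) of the
// snap-context selection performed in send_write_object() below: the mapped
// destination ids are ordered newest-first and entry 0 is the snapshot being
// written, so the write's snap context is everything after it.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

std::pair<uint64_t, std::vector<uint64_t>> write_snap_context(
    const std::vector<uint64_t>& mapped_dst_snap_ids) {
  assert(!mapped_dst_snap_ids.empty());
  std::vector<uint64_t> snapc{mapped_dst_snap_ids.begin() + 1,
                              mapped_dst_snap_ids.end()};
  // the snap seq is the newest snapshot that existed before the write
  uint64_t snap_seq = snapc.empty() ? 0 : snapc.front();
  return {snap_seq, snapc};
}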
sparse_bufferlist) { + switch (sbe.get_val().state) { + case io::SPARSE_EXTENT_STATE_DATA: + ldout(m_cct, 20) << "write op: " << sbe.get_off() << "~" + << sbe.get_len() << dendl; + op.write(sbe.get_off(), std::move(sbe.get_val().bl)); + op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE); + break; + case io::SPARSE_EXTENT_STATE_ZEROED: + if (sbe.get_off() + sbe.get_len() == + m_dst_image_ctx->layout.object_size) { + if (sbe.get_off() == 0) { + if (hide_parent) { + ldout(m_cct, 20) << "create+truncate op" << dendl; + op.create(false); + op.truncate(0); + } else { + ldout(m_cct, 20) << "remove op" << dendl; + op.remove(); + } + } else { + ldout(m_cct, 20) << "trunc op: " << sbe.get_off() << dendl; + op.truncate(sbe.get_off()); + } + } else { + ldout(m_cct, 20) << "zero op: " << sbe.get_off() << "~" + << sbe.get_len() << dendl; + op.zero(sbe.get_off(), sbe.get_len()); + } + break; + default: + ceph_abort(); + } + } + + if (op.size() == (migration ? 1 : 0)) { + handle_write_object(0); + return; + } + + int r; + Context *finish_op_ctx; + { + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r); + } + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_write_object(r); + finish_op_ctx->complete(0); + }); + librados::AioCompletion *comp = create_rados_callback(ctx); + r = m_dst_io_ctx.aio_operate(m_dst_oid, comp, &op, dst_snap_seq, dst_snap_ids, + nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template +void ObjectCopyRequest::handle_write_object(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } else if (r == -ERANGE) { + ldout(m_cct, 10) << "concurrent deep copy" << dendl; + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed to write to destination object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + m_snapshot_sparse_bufferlist.erase(m_snapshot_sparse_bufferlist.begin()); + if (!m_snapshot_sparse_bufferlist.empty()) { + send_write_object(); + return; + } + + finish(0); +} + +template +Context *ObjectCopyRequest::start_lock_op(ceph::shared_mutex &owner_lock, + int* r) { + ceph_assert(ceph_mutex_is_locked(m_dst_image_ctx->owner_lock)); + if (m_dst_image_ctx->exclusive_lock == nullptr) { + return new LambdaContext([](int r) {}); + } + return m_dst_image_ctx->exclusive_lock->start_op(r); +} + +template +void ObjectCopyRequest::compute_read_ops() { + ldout(m_cct, 20) << dendl; + + m_src_image_ctx->image_lock.lock_shared(); + bool read_from_parent = (m_src_snap_id_start == 0 && + m_src_image_ctx->parent != nullptr); + m_src_image_ctx->image_lock.unlock_shared(); + + bool only_dne_extents = true; + interval_set dne_image_interval; + + // compute read ops for any data sections or for any extents that we need to + // read from our parent + for (auto& [key, image_intervals] : m_snapshot_delta) { + io::WriteReadSnapIds write_read_snap_ids{key}; + + // advance the src write snap id to the first valid snap id + if (write_read_snap_ids.first > m_src_snap_id_start) { + // don't attempt to read from snapshots that shouldn't exist in + // case the OSD fails to give a correct snap list + auto snap_map_it = m_snap_map.find(write_read_snap_ids.first); + ceph_assert(snap_map_it != m_snap_map.end()); + auto dst_snap_seq = snap_map_it->second.front(); + + auto dst_may_exist_it = 
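// A condensed sketch (hypothetical names, not part of this patch) of the
// extent-state dispatch in the send_write_object() switch above; the
// hide_parent create+truncate whiteout variant is folded into REMOVE here
// for brevity.
#include <cstdint>

enum class OpKind { WRITE, REMOVE, TRUNCATE, ZERO };

OpKind classify_extent(uint64_t off, uint64_t len, uint64_t object_size,
                       bool is_data) {
  if (is_data) {
    return OpKind::WRITE;             // op.write(off, bl)
  }
  if (off + len == object_size) {     // zero run reaches the object's end
    return off == 0 ? OpKind::REMOVE  // whole object gone (or whiteout)
                    : OpKind::TRUNCATE;
  }
  return OpKind::ZERO;                // interior hole: op.zero(off, len)
}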
m_dst_object_may_exist.find(dst_snap_seq); + ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end()); + if (!dst_may_exist_it->second) { + ldout(m_cct, 20) << "DNE snapshot: " << write_read_snap_ids.first + << dendl; + continue; + } + } + + for (auto& image_interval : image_intervals) { + auto state = image_interval.get_val().state; + switch (state) { + case io::SPARSE_EXTENT_STATE_DNE: + if (write_read_snap_ids == io::INITIAL_WRITE_READ_SNAP_IDS && + read_from_parent) { + // special-case for DNE initial object-extents since when flattening + // we need to read data from the parent images extents + ldout(m_cct, 20) << "DNE extent: " + << image_interval.get_off() << "~" + << image_interval.get_len() << dendl; + dne_image_interval.insert( + image_interval.get_off(), image_interval.get_len()); + } + break; + case io::SPARSE_EXTENT_STATE_ZEROED: + only_dne_extents = false; + break; + case io::SPARSE_EXTENT_STATE_DATA: + ldout(m_cct, 20) << "read op: " + << "snap_ids=" << write_read_snap_ids << " " + << image_interval.get_off() << "~" + << image_interval.get_len() << dendl; + m_read_ops[write_read_snap_ids].image_interval.union_insert( + image_interval.get_off(), image_interval.get_len()); + only_dne_extents = false; + break; + default: + ceph_abort(); + break; + } + } + } + + bool flatten = ((m_flags & OBJECT_COPY_REQUEST_FLAG_FLATTEN) != 0); + if (!dne_image_interval.empty() && (!only_dne_extents || flatten)) { + auto snap_map_it = m_snap_map.begin(); + ceph_assert(snap_map_it != m_snap_map.end()); + + auto src_snap_seq = snap_map_it->first; + WriteReadSnapIds write_read_snap_ids{src_snap_seq, src_snap_seq}; + + // prepare to prune the extents to the maximum parent overlap + std::shared_lock image_locker(m_src_image_ctx->image_lock); + uint64_t raw_overlap = 0; + int r = m_src_image_ctx->get_parent_overlap(src_snap_seq, &raw_overlap); + if (r < 0) { + ldout(m_cct, 5) << "failed getting parent overlap for snap_id: " + << src_snap_seq << ": " << cpp_strerror(r) << dendl; + } else if (raw_overlap > 0) { + ldout(m_cct, 20) << "raw_overlap=" << raw_overlap << dendl; + io::Extents parent_extents; + for (auto [image_offset, image_length] : dne_image_interval) { + parent_extents.emplace_back(image_offset, image_length); + } + m_src_image_ctx->prune_parent_extents(parent_extents, m_image_area, + raw_overlap, false); + for (auto [image_offset, image_length] : parent_extents) { + ldout(m_cct, 20) << "parent read op: " + << "snap_ids=" << write_read_snap_ids << " " + << image_offset << "~" << image_length << dendl; + m_read_ops[write_read_snap_ids].image_interval.union_insert( + image_offset, image_length); + } + } + } + + for (auto& [write_read_snap_ids, _] : m_read_ops) { + m_read_snaps.push_back(write_read_snap_ids); + } +} + +template +void ObjectCopyRequest::merge_write_ops() { + ldout(m_cct, 20) << dendl; + + for (auto& [write_read_snap_ids, read_op] : m_read_ops) { + auto src_snap_seq = write_read_snap_ids.first; + + // convert the the resulting sparse image extent map to an interval ... + auto& image_data_interval = m_dst_data_interval[src_snap_seq]; + for (auto [image_offset, image_length] : read_op.image_extent_map) { + image_data_interval.union_insert(image_offset, image_length); + } + + // ... 
and compute the difference between it and the image extents since + // that indicates zeroed extents + interval_set intersection; + intersection.intersection_of(read_op.image_interval, image_data_interval); + read_op.image_interval.subtract(intersection); + + for (auto& [image_offset, image_length] : read_op.image_interval) { + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", " + << "inserting sparse-read zero " << image_offset << "~" + << image_length << dendl; + m_dst_zero_interval[src_snap_seq].union_insert( + image_offset, image_length); + } + + uint64_t buffer_offset = 0; + for (auto [image_offset, image_length] : read_op.image_extent_map) { + // convert image extents back to object extents for the write op + striper::LightweightObjectExtents object_extents; + io::util::area_to_object_extents(m_dst_image_ctx, image_offset, + image_length, m_image_area, + buffer_offset, &object_extents); + for (auto& object_extent : object_extents) { + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", " + << "object_offset=" << object_extent.offset << ", " + << "object_length=" << object_extent.length << dendl; + + bufferlist sub_bl; + sub_bl.substr_of(read_op.out_bl, buffer_offset, object_extent.length); + + m_snapshot_sparse_bufferlist[src_snap_seq].insert( + object_extent.offset, object_extent.length, + {io::SPARSE_EXTENT_STATE_DATA, object_extent.length,\ + std::move(sub_bl)}); + + buffer_offset += object_extent.length; + } + } + } +} + +template +void ObjectCopyRequest::compute_zero_ops() { + ldout(m_cct, 20) << dendl; + + m_src_image_ctx->image_lock.lock_shared(); + bool hide_parent = (m_src_snap_id_start == 0 && + m_src_image_ctx->parent != nullptr); + m_src_image_ctx->image_lock.unlock_shared(); + + // ensure we have a zeroed interval for each snapshot + for (auto& [src_snap_seq, _] : m_snap_map) { + if (m_src_snap_id_start < src_snap_seq) { + m_dst_zero_interval[src_snap_seq]; + } + } + + // exists if copying from an arbitrary snapshot w/o any deltas in the + // start snapshot slot (i.e. 
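// Restating the hole computation in merge_write_ops() above as a small
// helper (illustration only, using the same interval_set operations the
// code itself uses): whatever part of the requested read interval came back
// without data must have been a hole at that snapshot, and is queued as a
// zero extent.
#include "include/interval_set.h"

interval_set<uint64_t> implicit_zeroes(
    interval_set<uint64_t> requested,           // extents submitted for read
    const interval_set<uint64_t>& with_data) {  // extents the read returned
  interval_set<uint64_t> overlap;
  overlap.intersection_of(requested, with_data);
  requested.subtract(overlap);                  // drop the data regions...
  return requested;                             // ...leaving only the holes
}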
DNE) + bool object_exists = ( + m_src_snap_id_start > 0 && + m_snapshot_delta.count({m_src_snap_id_start, m_src_snap_id_start}) == 0); + bool fast_diff = m_dst_image_ctx->test_features(RBD_FEATURE_FAST_DIFF); + uint64_t prev_end_size = 0; + + // compute zero ops from the zeroed intervals + for (auto &it : m_dst_zero_interval) { + auto src_snap_seq = it.first; + auto &zero_interval = it.second; + + auto snap_map_it = m_snap_map.find(src_snap_seq); + ceph_assert(snap_map_it != m_snap_map.end()); + auto dst_snap_seq = snap_map_it->second.front(); + + auto dst_may_exist_it = m_dst_object_may_exist.find(dst_snap_seq); + ceph_assert(dst_may_exist_it != m_dst_object_may_exist.end()); + if (!dst_may_exist_it->second && object_exists) { + ldout(m_cct, 5) << "object DNE for snap_id: " << dst_snap_seq << dendl; + m_snapshot_sparse_bufferlist[src_snap_seq].insert( + 0, m_dst_image_ctx->layout.object_size, + {io::SPARSE_EXTENT_STATE_ZEROED, m_dst_image_ctx->layout.object_size}); + object_exists = false; + prev_end_size = 0; + continue; + } + + if (hide_parent) { + std::shared_lock image_locker{m_dst_image_ctx->image_lock}; + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + int r = m_dst_image_ctx->get_parent_overlap(dst_snap_seq, &raw_overlap); + if (r < 0) { + ldout(m_cct, 5) << "failed getting parent overlap for snap_id: " + << dst_snap_seq << ": " << cpp_strerror(r) << dendl; + } else if (raw_overlap > 0) { + auto parent_extents = m_image_extents; + object_overlap = m_dst_image_ctx->prune_parent_extents( + parent_extents, m_image_area, raw_overlap, false); + } + if (object_overlap == 0) { + ldout(m_cct, 20) << "no parent overlap" << dendl; + hide_parent = false; + } + } + + // collect known zeroed extents from the snapshot delta for the current + // src snapshot. If this is the first snapshot, we might need to handle + // the whiteout case if it overlaps with the parent + auto first_src_snap_id = m_snap_map.begin()->first; + auto snapshot_delta_it = m_snapshot_delta.lower_bound( + {(hide_parent && src_snap_seq == first_src_snap_id ? 
+ 0 : src_snap_seq), 0}); + for (; snapshot_delta_it != m_snapshot_delta.end() && + snapshot_delta_it->first.first <= src_snap_seq; + ++snapshot_delta_it) { + auto& write_read_snap_ids = snapshot_delta_it->first; + auto& image_intervals = snapshot_delta_it->second; + for (auto& image_interval : image_intervals) { + auto state = image_interval.get_val().state; + switch (state) { + case io::SPARSE_EXTENT_STATE_ZEROED: + if (write_read_snap_ids != io::INITIAL_WRITE_READ_SNAP_IDS) { + ldout(m_cct, 20) << "zeroed extent: " + << "src_snap_seq=" << src_snap_seq << " " + << image_interval.get_off() << "~" + << image_interval.get_len() << dendl; + zero_interval.union_insert( + image_interval.get_off(), image_interval.get_len()); + } else if (hide_parent && + write_read_snap_ids == io::INITIAL_WRITE_READ_SNAP_IDS) { + ldout(m_cct, 20) << "zeroed (hide parent) extent: " + << "src_snap_seq=" << src_snap_seq << " " + << image_interval.get_off() << "~" + << image_interval.get_len() << dendl; + zero_interval.union_insert( + image_interval.get_off(), image_interval.get_len()); + } + break; + case io::SPARSE_EXTENT_STATE_DNE: + case io::SPARSE_EXTENT_STATE_DATA: + break; + default: + ceph_abort(); + break; + } + } + } + + // subtract any data intervals from our zero intervals + auto& data_interval = m_dst_data_interval[src_snap_seq]; + interval_set intersection; + intersection.intersection_of(zero_interval, data_interval); + zero_interval.subtract(intersection); + + // update end_size if there are writes into higher offsets + uint64_t end_size = prev_end_size; + auto iter = m_snapshot_sparse_bufferlist.find(src_snap_seq); + if (iter != m_snapshot_sparse_bufferlist.end()) { + for (auto &sparse_bufferlist : iter->second) { + object_exists = true; + end_size = std::max( + end_size, sparse_bufferlist.get_off() + sparse_bufferlist.get_len()); + } + } + + ldout(m_cct, 20) << "src_snap_seq=" << src_snap_seq << ", " + << "dst_snap_seq=" << dst_snap_seq << ", " + << "zero_interval=" << zero_interval << ", " + << "end_size=" << end_size << dendl; + for (auto z = zero_interval.begin(); z != zero_interval.end(); ++z) { + // convert image extents back to object extents for the write op + striper::LightweightObjectExtents object_extents; + io::util::area_to_object_extents(m_dst_image_ctx, z.get_start(), + z.get_len(), m_image_area, 0, + &object_extents); + for (auto& object_extent : object_extents) { + ceph_assert(object_extent.offset + object_extent.length <= + m_dst_image_ctx->layout.object_size); + + if (object_extent.offset + object_extent.length >= end_size) { + // zero interval at the object end + if ((object_extent.offset == 0 && hide_parent) || + (object_extent.offset < prev_end_size)) { + ldout(m_cct, 20) << "truncate " << object_extent.offset + << dendl; + auto length = + m_dst_image_ctx->layout.object_size - object_extent.offset; + m_snapshot_sparse_bufferlist[src_snap_seq].insert( + object_extent.offset, length, + {io::SPARSE_EXTENT_STATE_ZEROED, length}); + } + + object_exists = (object_extent.offset > 0 || hide_parent); + end_size = std::min(end_size, object_extent.offset); + } else { + // zero interval inside the object + ldout(m_cct, 20) << "zero " + << object_extent.offset << "~" + << object_extent.length << dendl; + m_snapshot_sparse_bufferlist[src_snap_seq].insert( + object_extent.offset, object_extent.length, + {io::SPARSE_EXTENT_STATE_ZEROED, object_extent.length}); + object_exists = true; + } + } + } + + uint8_t dst_object_map_state = OBJECT_NONEXISTENT; + if (object_exists) { + 
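// A simplified sketch (hypothetical standalone helper, not part of this
// patch) of the tail-zero decision in compute_zero_ops() above: a zero
// interval that reaches the current end of the object becomes a truncate
// only when it overlaps bytes that previously existed; prev_end_size is the
// object's size as of the prior snapshot.
#include <cstdint>
#include <optional>

// Returns the truncate offset to issue, or nullopt when the zero interval
// lies entirely beyond the existing data and no op is needed.
std::optional<uint64_t> tail_zero_to_truncate(uint64_t zero_off,
                                              uint64_t prev_end_size,
                                              bool hide_parent) {
  if (zero_off == 0 && hide_parent) {
    return 0;           // whiteout: create + truncate(0) hides the parent
  }
  if (zero_off < prev_end_size) {
    return zero_off;    // trim existing data down to the zero offset
  }
  return std::nullopt;  // nothing on disk past zero_off
}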
dst_object_map_state = OBJECT_EXISTS; + if (fast_diff && m_snapshot_sparse_bufferlist.count(src_snap_seq) == 0) { + dst_object_map_state = OBJECT_EXISTS_CLEAN; + } + m_dst_object_state[src_snap_seq] = dst_object_map_state; + } + + ldout(m_cct, 20) << "dst_snap_seq=" << dst_snap_seq << ", " + << "end_size=" << end_size << ", " + << "dst_object_map_state=" + << static_cast(dst_object_map_state) << dendl; + prev_end_size = end_size; + } +} + +template +void ObjectCopyRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + // ensure IoCtxs are closed prior to proceeding + auto on_finish = m_on_finish; + + m_src_async_op->finish_op(); + delete m_src_async_op; + delete this; + + on_finish->complete(r); +} + +template +void ObjectCopyRequest::compute_dst_object_may_exist() { + std::shared_lock image_locker{m_dst_image_ctx->image_lock}; + + auto snap_ids = m_dst_image_ctx->snaps; + snap_ids.push_back(CEPH_NOSNAP); + + for (auto snap_id : snap_ids) { + m_dst_object_may_exist[snap_id] = + (m_dst_object_number < m_dst_image_ctx->get_object_count(snap_id)); + } + + ldout(m_cct, 20) << "dst_object_may_exist=" << m_dst_object_may_exist + << dendl; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::ObjectCopyRequest; diff --git a/src/librbd/deep_copy/ObjectCopyRequest.h b/src/librbd/deep_copy/ObjectCopyRequest.h new file mode 100644 index 000000000..fc2f58cd3 --- /dev/null +++ b/src/librbd/deep_copy/ObjectCopyRequest.h @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H + +#include "include/int_types.h" +#include "include/interval_set.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/deep_copy/Types.h" +#include "librbd/io/Types.h" +#include +#include +#include + +class Context; +class RWLock; + +namespace librbd { + +namespace io { class AsyncOperation; } + +namespace deep_copy { + +struct Handler; + +template +class ObjectCopyRequest { +public: + static ObjectCopyRequest* create(ImageCtxT *src_image_ctx, + ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t dst_snap_id_start, + const SnapMap &snap_map, + uint64_t object_number, uint32_t flags, + Handler* handler, Context *on_finish) { + return new ObjectCopyRequest(src_image_ctx, dst_image_ctx, + src_snap_id_start, dst_snap_id_start, snap_map, + object_number, flags, handler, on_finish); + } + + ObjectCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t dst_snap_id_start, const SnapMap &snap_map, + uint64_t object_number, uint32_t flags, Handler* handler, + Context *on_finish); + + void send(); + + // testing support + inline librados::IoCtx &get_src_io_ctx() { + return m_src_io_ctx; + } + inline librados::IoCtx &get_dst_io_ctx() { + return m_dst_io_ctx; + } + +private: + /** + * @verbatim + * + * + * | + * v + * LIST_SNAPS + * | + * |/---------\ + * | | (repeat for each snapshot) + * v | + * READ ---------/ + * | + * | /-----------\ + * | | | (repeat for each snapshot) + * v v | + * UPDATE_OBJECT_MAP ---/ (skip if object + * | map disabled) + * | /-----------\ + * | | | (repeat for each snapshot) + * v v | + * WRITE_OBJECT --------/ + * | + * v + * + * + * @endverbatim + */ + + enum WriteOpType { + WRITE_OP_TYPE_WRITE, + WRITE_OP_TYPE_ZERO + }; + + struct 
ReadOp { + interval_set image_interval; + io::Extents image_extent_map; + bufferlist out_bl; + }; + + typedef std::pair WriteReadSnapIds; + + ImageCtxT *m_src_image_ctx; + ImageCtxT *m_dst_image_ctx; + CephContext *m_cct; + librados::snap_t m_src_snap_id_start; + librados::snap_t m_dst_snap_id_start; + SnapMap m_snap_map; + uint64_t m_dst_object_number; + uint32_t m_flags; + Handler* m_handler; + Context *m_on_finish; + + decltype(m_src_image_ctx->data_ctx) m_src_io_ctx; + decltype(m_dst_image_ctx->data_ctx) m_dst_io_ctx; + std::string m_dst_oid; + + io::Extents m_image_extents; + io::ImageArea m_image_area = io::ImageArea::DATA; + + io::SnapshotDelta m_snapshot_delta; + + std::map m_read_ops; + std::list m_read_snaps; + io::SnapshotSparseBufferlist m_snapshot_sparse_bufferlist; + + std::map> m_dst_data_interval; + std::map> m_dst_zero_interval; + std::map m_dst_object_state; + std::map m_dst_object_may_exist; + + io::AsyncOperation* m_src_async_op = nullptr; + + void send_list_snaps(); + void handle_list_snaps(int r); + + void send_read(); + void handle_read(int r); + + void send_update_object_map(); + void handle_update_object_map(int r); + + void process_copyup(); + void send_write_object(); + void handle_write_object(int r); + + Context *start_lock_op(ceph::shared_mutex &owner_lock, int* r); + + void compute_read_ops(); + void merge_write_ops(); + void compute_zero_ops(); + + void compute_dst_object_may_exist(); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::ObjectCopyRequest; + +#endif // CEPH_LIBRBD_DEEP_COPY_OBJECT_COPY_REQUEST_H diff --git a/src/librbd/deep_copy/SetHeadRequest.cc b/src/librbd/deep_copy/SetHeadRequest.cc new file mode 100644 index 000000000..1e056b958 --- /dev/null +++ b/src/librbd/deep_copy/SetHeadRequest.cc @@ -0,0 +1,223 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SetHeadRequest.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/Utils.h" +#include "librbd/image/AttachParentRequest.h" +#include "librbd/image/DetachParentRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::SetHeadRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +SetHeadRequest::SetHeadRequest(I *image_ctx, uint64_t size, + const cls::rbd::ParentImageSpec &spec, + uint64_t parent_overlap, + Context *on_finish) + : m_image_ctx(image_ctx), m_size(size), m_parent_spec(spec), + m_parent_overlap(parent_overlap), m_on_finish(on_finish), + m_cct(image_ctx->cct) { + ceph_assert(m_parent_overlap <= m_size); +} + +template +void SetHeadRequest::send() { + send_set_size(); +} + +template +void SetHeadRequest::send_set_size() { + m_image_ctx->image_lock.lock_shared(); + if (m_image_ctx->size == m_size) { + m_image_ctx->image_lock.unlock_shared(); + send_detach_parent(); + return; + } + m_image_ctx->image_lock.unlock_shared(); + + ldout(m_cct, 20) << dendl; + + // Change the image size on disk so that the snapshot picks up + // the expected size. We can do this because the last snapshot + // we process is the sync snapshot which was created to match the + // image size. 
We also don't need to worry about trimming because + // we track the highest possible object number within the sync record + librados::ObjectWriteOperation op; + librbd::cls_client::set_size(&op, m_size); + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_set_size(r); + finish_op_ctx->complete(0); + }); + librados::AioCompletion *comp = create_rados_callback(ctx); + r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void SetHeadRequest::handle_set_size(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update image size: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // adjust in-memory image size now that it's updated on disk + std::unique_lock image_locker{m_image_ctx->image_lock}; + if (m_image_ctx->size > m_size) { + if (m_image_ctx->parent_md.spec.pool_id != -1 && + m_image_ctx->parent_md.overlap > m_size) { + m_image_ctx->parent_md.overlap = m_size; + } + } + m_image_ctx->size = m_size; + } + + send_detach_parent(); +} + +template +void SetHeadRequest::send_detach_parent() { + m_image_ctx->image_lock.lock_shared(); + if (m_image_ctx->parent_md.spec.pool_id == -1 || + (m_image_ctx->parent_md.spec == m_parent_spec && + m_image_ctx->parent_md.overlap == m_parent_overlap)) { + m_image_ctx->image_lock.unlock_shared(); + send_attach_parent(); + return; + } + m_image_ctx->image_lock.unlock_shared(); + + ldout(m_cct, 20) << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_detach_parent(r); + finish_op_ctx->complete(0); + }); + auto req = image::DetachParentRequest::create(*m_image_ctx, ctx); + req->send(); +} + +template +void SetHeadRequest::handle_detach_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // adjust in-memory parent now that it's updated on disk + std::unique_lock image_locker{m_image_ctx->image_lock}; + m_image_ctx->parent_md.spec = {}; + m_image_ctx->parent_md.overlap = 0; + } + + send_attach_parent(); +} + +template +void SetHeadRequest::send_attach_parent() { + m_image_ctx->image_lock.lock_shared(); + if (m_image_ctx->parent_md.spec == m_parent_spec && + m_image_ctx->parent_md.overlap == m_parent_overlap) { + m_image_ctx->image_lock.unlock_shared(); + finish(0); + return; + } + m_image_ctx->image_lock.unlock_shared(); + + ldout(m_cct, 20) << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_attach_parent(r); + finish_op_ctx->complete(0); + }); + auto req = image::AttachParentRequest::create( + *m_image_ctx, m_parent_spec, m_parent_overlap, false, ctx); + req->send(); +} + +template +void SetHeadRequest::handle_attach_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // adjust in-memory parent now that 
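// A value-type sketch (hypothetical struct, not part of this patch) of the
// in-memory bookkeeping handle_set_size() performs below: when the image
// shrinks, any parent overlap must be clipped to the new size as well.
#include <cstdint>

struct ParentMD {
  int64_t pool_id = -1;  // -1 means "no parent"
  uint64_t overlap = 0;
};

void apply_new_size(uint64_t& image_size, ParentMD& parent_md,
                    uint64_t new_size) {
  if (image_size > new_size && parent_md.pool_id != -1 &&
      parent_md.overlap > new_size) {
    parent_md.overlap = new_size;  // overlap can never exceed the image size
  }
  image_size = new_size;
}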
it's updated on disk + std::unique_lock image_locker{m_image_ctx->image_lock}; + m_image_ctx->parent_md.spec = m_parent_spec; + m_image_ctx->parent_md.overlap = m_parent_overlap; + } + + finish(0); +} + +template +Context *SetHeadRequest::start_lock_op(int* r) { + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + if (m_image_ctx->exclusive_lock == nullptr) { + return new LambdaContext([](int r) {}); + } + return m_image_ctx->exclusive_lock->start_op(r); +} + +template +void SetHeadRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::SetHeadRequest; diff --git a/src/librbd/deep_copy/SetHeadRequest.h b/src/librbd/deep_copy/SetHeadRequest.h new file mode 100644 index 000000000..9a17c9fd0 --- /dev/null +++ b/src/librbd/deep_copy/SetHeadRequest.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include +#include +#include +#include + +class Context; + +namespace librbd { +namespace deep_copy { + +template +class SetHeadRequest { +public: + static SetHeadRequest* create(ImageCtxT *image_ctx, uint64_t size, + const cls::rbd::ParentImageSpec &parent_spec, + uint64_t parent_overlap, + Context *on_finish) { + return new SetHeadRequest(image_ctx, size, parent_spec, parent_overlap, + on_finish); + } + + SetHeadRequest(ImageCtxT *image_ctx, uint64_t size, + const cls::rbd::ParentImageSpec &parent_spec, + uint64_t parent_overlap, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v (skip if not needed) + * SET_SIZE + * | + * v (skip if not needed) + * DETACH_PARENT + * | + * v (skip if not needed) + * ATTACH_PARENT + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + uint64_t m_size; + cls::rbd::ParentImageSpec m_parent_spec; + uint64_t m_parent_overlap; + Context *m_on_finish; + + CephContext *m_cct; + + void send_set_size(); + void handle_set_size(int r); + + void send_detach_parent(); + void handle_detach_parent(int r); + + void send_attach_parent(); + void handle_attach_parent(int r); + + Context *start_lock_op(int* r); + + void finish(int r); +}; + +} // namespace deep_copy +} // namespace librbd + +extern template class librbd::deep_copy::SetHeadRequest; + +#endif // CEPH_LIBRBD_DEEP_COPY_SET_HEAD_REQUEST_H diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.cc b/src/librbd/deep_copy/SnapshotCopyRequest.cc new file mode 100644 index 000000000..1aadd34db --- /dev/null +++ b/src/librbd/deep_copy/SnapshotCopyRequest.cc @@ -0,0 +1,729 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "SnapshotCopyRequest.h" +#include "SetHeadRequest.h" +#include "SnapshotCreateRequest.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCopyRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + 
+namespace { + +template +const std::string &get_snapshot_name(I *image_ctx, librados::snap_t snap_id) { + auto snap_it = std::find_if(image_ctx->snap_ids.begin(), + image_ctx->snap_ids.end(), + [snap_id]( + const std::pair< + std::pair, + librados::snap_t> &pair) { + return pair.second == snap_id; + }); + ceph_assert(snap_it != image_ctx->snap_ids.end()); + return snap_it->first.second; +} + +} // anonymous namespace + +using librbd::util::create_context_callback; +using librbd::util::unique_lock_name; + +template +SnapshotCopyRequest::SnapshotCopyRequest(I *src_image_ctx, + I *dst_image_ctx, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + librados::snap_t dst_snap_id_start, + bool flatten, + asio::ContextWQ *work_queue, + SnapSeqs *snap_seqs, + Context *on_finish) + : RefCountedObject(dst_image_ctx->cct), m_src_image_ctx(src_image_ctx), + m_dst_image_ctx(dst_image_ctx), m_src_snap_id_start(src_snap_id_start), + m_src_snap_id_end(src_snap_id_end), m_dst_snap_id_start(dst_snap_id_start), + m_flatten(flatten), m_work_queue(work_queue), m_snap_seqs_result(snap_seqs), + m_snap_seqs(*snap_seqs), m_on_finish(on_finish), m_cct(dst_image_ctx->cct), + m_lock(ceph::make_mutex(unique_lock_name("SnapshotCopyRequest::m_lock", this))) { + ceph_assert((m_src_snap_id_start == 0 && m_dst_snap_id_start == 0) || + (m_src_snap_id_start > 0 && m_dst_snap_id_start > 0)); + + // snap ids ordered from oldest to newest + m_src_image_ctx->image_lock.lock_shared(); + m_src_snap_ids.insert(src_image_ctx->snaps.begin(), + src_image_ctx->snaps.end()); + m_src_image_ctx->image_lock.unlock_shared(); + + m_dst_image_ctx->image_lock.lock_shared(); + m_dst_snap_ids.insert(dst_image_ctx->snaps.begin(), + dst_image_ctx->snaps.end()); + m_dst_image_ctx->image_lock.unlock_shared(); + + if (m_src_snap_id_end != CEPH_NOSNAP) { + m_src_snap_ids.erase(m_src_snap_ids.upper_bound(m_src_snap_id_end), + m_src_snap_ids.end()); + } +} + +template +void SnapshotCopyRequest::send() { + cls::rbd::ParentImageSpec src_parent_spec; + int r = validate_parent(m_src_image_ctx, &src_parent_spec); + if (r < 0) { + lderr(m_cct) << "source image parent spec mismatch" << dendl; + error(r); + return; + } + + r = validate_parent(m_dst_image_ctx, &m_dst_parent_spec); + if (r < 0) { + lderr(m_cct) << "destination image parent spec mismatch" << dendl; + error(r); + return; + } + + send_snap_unprotect(); +} + +template +void SnapshotCopyRequest::cancel() { + std::lock_guard locker{m_lock}; + + ldout(m_cct, 20) << dendl; + m_canceled = true; +} + +template +void SnapshotCopyRequest::send_snap_unprotect() { + + SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_dst_snap_id_start > 0) { + snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start); + } + + for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) { + librados::snap_t dst_snap_id = *snap_id_it; + + m_dst_image_ctx->image_lock.lock_shared(); + + bool dst_unprotected; + int r = m_dst_image_ctx->is_snap_unprotected(dst_snap_id, &dst_unprotected); + if (r < 0) { + lderr(m_cct) << "failed to retrieve destination snap unprotect status: " + << cpp_strerror(r) << dendl; + m_dst_image_ctx->image_lock.unlock_shared(); + finish(r); + return; + } + m_dst_image_ctx->image_lock.unlock_shared(); + + if (dst_unprotected) { + // snap is already unprotected -- check next snap + continue; + } + + // if destination snapshot is protected and (1) it isn't in our mapping 
+ // table, or (2) the source snapshot isn't protected, unprotect it + auto snap_seq_it = std::find_if( + m_snap_seqs.begin(), m_snap_seqs.end(), + [dst_snap_id](const SnapSeqs::value_type& pair) { + return pair.second == dst_snap_id; + }); + + if (snap_seq_it != m_snap_seqs.end()) { + m_src_image_ctx->image_lock.lock_shared(); + bool src_unprotected; + r = m_src_image_ctx->is_snap_unprotected(snap_seq_it->first, + &src_unprotected); + ldout(m_cct, 20) << "m_src_image_ctx->is_snap_unprotected(" + << snap_seq_it->first << "): r=" << r + << ", src_unprotected=" << src_unprotected << dendl; + if (r == -ENOENT) { + src_unprotected = true; + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed to retrieve source snap unprotect status: " + << cpp_strerror(r) << dendl; + m_src_image_ctx->image_lock.unlock_shared(); + finish(r); + return; + } + m_src_image_ctx->image_lock.unlock_shared(); + + if (src_unprotected) { + // source is unprotected -- unprotect destination snap + break; + } + } else { + // source snapshot doesn't exist -- unprotect destination snap + break; + } + } + + if (snap_id_it == m_dst_snap_ids.end()) { + // no destination snapshots to unprotect + m_prev_snap_id = CEPH_NOSNAP; + send_snap_remove(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id); + + ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_snap_unprotect(r); + finish_op_ctx->complete(0); + }); + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + m_dst_image_ctx->operations->execute_snap_unprotect( + cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx); +} + +template +void SnapshotCopyRequest::handle_snap_unprotect(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to unprotect snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + { + // avoid the need to refresh to delete the newly unprotected snapshot + std::shared_lock image_locker{m_dst_image_ctx->image_lock}; + auto snap_info_it = m_dst_image_ctx->snap_info.find(m_prev_snap_id); + if (snap_info_it != m_dst_image_ctx->snap_info.end()) { + snap_info_it->second.protection_status = + RBD_PROTECTION_STATUS_UNPROTECTED; + } + } + + if (handle_cancellation()) { + return; + } + + send_snap_unprotect(); +} + +template +void SnapshotCopyRequest::send_snap_remove() { + SnapIdSet::iterator snap_id_it = m_dst_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_dst_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_dst_snap_id_start > 0) { + snap_id_it = m_dst_snap_ids.upper_bound(m_dst_snap_id_start); + } + + for (; snap_id_it != m_dst_snap_ids.end(); ++snap_id_it) { + librados::snap_t dst_snap_id = *snap_id_it; + + cls::rbd::SnapshotNamespace snap_namespace; + m_dst_image_ctx->image_lock.lock_shared(); + int r = m_dst_image_ctx->get_snap_namespace(dst_snap_id, &snap_namespace); + m_dst_image_ctx->image_lock.unlock_shared(); + if (r < 0) { + lderr(m_cct) << "failed to retrieve destination snap namespace: " + << m_snap_name << dendl; + finish(r); + return; + } + + if (!std::holds_alternative(snap_namespace)) { + continue; + } + + // if the destination snapshot isn't in our mapping table, remove it + auto 
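// The loops above and in send_snap_remove() both do a value-side lookup in
// the SnapSeqs map; a plain-STL restatement (hypothetical helper, not part
// of this patch):
#include <algorithm>
#include <cstdint>
#include <map>
#include <optional>

std::optional<uint64_t> src_snap_for_dst(
    const std::map<uint64_t, uint64_t>& snap_seqs, uint64_t dst_snap_id) {
  auto it = std::find_if(snap_seqs.begin(), snap_seqs.end(),
                         [dst_snap_id](const auto& pair) {
                           return pair.second == dst_snap_id;
                         });
  if (it == snap_seqs.end()) {
    return std::nullopt;  // destination snapshot has no source mapping
  }
  return it->first;
}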
snap_seq_it = std::find_if( + m_snap_seqs.begin(), m_snap_seqs.end(), + [dst_snap_id](const SnapSeqs::value_type& pair) { + return pair.second == dst_snap_id; + }); + + if (snap_seq_it == m_snap_seqs.end()) { + break; + } + } + + if (snap_id_it == m_dst_snap_ids.end()) { + // no destination snapshots to delete + m_prev_snap_id = CEPH_NOSNAP; + send_snap_create(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_dst_image_ctx, m_prev_snap_id); + + ldout(m_cct, 20) << "" + << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_snap_remove(r); + finish_op_ctx->complete(0); + }); + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + m_dst_image_ctx->operations->execute_snap_remove( + cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx); +} + +template +void SnapshotCopyRequest::handle_snap_remove(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + if (handle_cancellation()) { + return; + } + + send_snap_remove(); +} + +template +void SnapshotCopyRequest::send_snap_create() { + SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_src_snap_id_start > 0) { + snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start); + } + + for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) { + librados::snap_t src_snap_id = *snap_id_it; + + cls::rbd::SnapshotNamespace snap_namespace; + m_src_image_ctx->image_lock.lock_shared(); + int r = m_src_image_ctx->get_snap_namespace(src_snap_id, &snap_namespace); + m_src_image_ctx->image_lock.unlock_shared(); + if (r < 0) { + lderr(m_cct) << "failed to retrieve source snap namespace: " + << m_snap_name << dendl; + finish(r); + return; + } + + if (m_snap_seqs.find(src_snap_id) == m_snap_seqs.end()) { + // the source snapshot is not in our mapping table, ... + if (std::holds_alternative(snap_namespace)) { + // ... create it since it's a user snapshot + break; + } else if (src_snap_id == m_src_snap_id_end) { + // ... map it to destination HEAD since it's not a user snapshot that we + // will create (e.g. 
MirrorSnapshotNamespace) + m_snap_seqs[src_snap_id] = CEPH_NOSNAP; + } + } + } + + if (snap_id_it == m_src_snap_ids.end()) { + // no source snapshots to create + m_prev_snap_id = CEPH_NOSNAP; + send_snap_protect(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id); + + m_src_image_ctx->image_lock.lock_shared(); + auto snap_info_it = m_src_image_ctx->snap_info.find(m_prev_snap_id); + if (snap_info_it == m_src_image_ctx->snap_info.end()) { + m_src_image_ctx->image_lock.unlock_shared(); + lderr(m_cct) << "failed to retrieve source snap info: " << m_snap_name + << dendl; + finish(-ENOENT); + return; + } + + uint64_t size = snap_info_it->second.size; + m_snap_namespace = snap_info_it->second.snap_namespace; + cls::rbd::ParentImageSpec parent_spec; + uint64_t parent_overlap = 0; + if (!m_flatten && snap_info_it->second.parent.spec.pool_id != -1) { + parent_spec = m_dst_parent_spec; + parent_overlap = snap_info_it->second.parent.overlap; + } + m_src_image_ctx->image_lock.unlock_shared(); + + ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << ", " + << "size=" << size << ", " + << "parent_info=[" + << "pool_id=" << parent_spec.pool_id << ", " + << "image_id=" << parent_spec.image_id << ", " + << "snap_id=" << parent_spec.snap_id << ", " + << "overlap=" << parent_overlap << "]" << dendl; + + int r; + Context *finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_snap_create(r); + finish_op_ctx->complete(0); + }); + SnapshotCreateRequest *req = SnapshotCreateRequest::create( + m_dst_image_ctx, m_snap_name, m_snap_namespace, size, parent_spec, + parent_overlap, ctx); + req->send(); +} + +template +void SnapshotCopyRequest::handle_snap_create(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + if (handle_cancellation()) { + return; + } + + ceph_assert(m_prev_snap_id != CEPH_NOSNAP); + + auto snap_it = m_dst_image_ctx->snap_ids.find( + {cls::rbd::UserSnapshotNamespace(), m_snap_name}); + ceph_assert(snap_it != m_dst_image_ctx->snap_ids.end()); + librados::snap_t dst_snap_id = snap_it->second; + + ldout(m_cct, 20) << "mapping source snap id " << m_prev_snap_id << " to " + << dst_snap_id << dendl; + m_snap_seqs[m_prev_snap_id] = dst_snap_id; + + send_snap_create(); +} + +template +void SnapshotCopyRequest::send_snap_protect() { + SnapIdSet::iterator snap_id_it = m_src_snap_ids.begin(); + if (m_prev_snap_id != CEPH_NOSNAP) { + snap_id_it = m_src_snap_ids.upper_bound(m_prev_snap_id); + } else if (m_src_snap_id_start > 0) { + snap_id_it = m_src_snap_ids.upper_bound(m_src_snap_id_start); + } + + for (; snap_id_it != m_src_snap_ids.end(); ++snap_id_it) { + librados::snap_t src_snap_id = *snap_id_it; + + m_src_image_ctx->image_lock.lock_shared(); + + bool src_protected; + int r = m_src_image_ctx->is_snap_protected(src_snap_id, &src_protected); + if (r < 0) { + lderr(m_cct) << "failed to retrieve source snap protect status: " + << cpp_strerror(r) << dendl; + m_src_image_ctx->image_lock.unlock_shared(); + finish(r); + return; + } + m_src_image_ctx->image_lock.unlock_shared(); + + if (!src_protected) { + // snap is not protected -- check next snap + continue; + } + + // if destination snapshot is not 
protected, protect it + auto snap_seq_it = m_snap_seqs.find(src_snap_id); + ceph_assert(snap_seq_it != m_snap_seqs.end()); + if (snap_seq_it->second == CEPH_NOSNAP) { + // implies src end snapshot is mapped to a non-copyable snapshot + ceph_assert(src_snap_id == m_src_snap_id_end); + break; + } + + m_dst_image_ctx->image_lock.lock_shared(); + bool dst_protected; + r = m_dst_image_ctx->is_snap_protected(snap_seq_it->second, &dst_protected); + if (r < 0) { + lderr(m_cct) << "failed to retrieve destination snap protect status: " + << cpp_strerror(r) << dendl; + m_dst_image_ctx->image_lock.unlock_shared(); + finish(r); + return; + } + m_dst_image_ctx->image_lock.unlock_shared(); + + if (!dst_protected) { + break; + } + } + + if (snap_id_it == m_src_snap_ids.end()) { + // no destination snapshots to protect + m_prev_snap_id = CEPH_NOSNAP; + send_set_head(); + return; + } + + m_prev_snap_id = *snap_id_it; + m_snap_name = get_snapshot_name(m_src_image_ctx, m_prev_snap_id); + + ldout(m_cct, 20) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_prev_snap_id << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_snap_protect(r); + finish_op_ctx->complete(0); + }); + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + m_dst_image_ctx->operations->execute_snap_protect( + cls::rbd::UserSnapshotNamespace(), m_snap_name.c_str(), ctx); +} + +template +void SnapshotCopyRequest::handle_snap_protect(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to protect snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + if (handle_cancellation()) { + return; + } + + send_snap_protect(); +} + +template +void SnapshotCopyRequest::send_set_head() { + auto snap_seq_it = m_snap_seqs.find(m_src_snap_id_end); + if (m_src_snap_id_end != CEPH_NOSNAP && + (snap_seq_it == m_snap_seqs.end() || + snap_seq_it->second != CEPH_NOSNAP)) { + // not copying to src nor dst HEAD revision + finish(0); + return; + } + + ldout(m_cct, 20) << dendl; + + uint64_t size; + cls::rbd::ParentImageSpec parent_spec; + uint64_t parent_overlap = 0; + { + std::shared_lock src_locker{m_src_image_ctx->image_lock}; + auto snap_info_it = m_src_image_ctx->snap_info.find(m_src_snap_id_end); + if (snap_info_it != m_src_image_ctx->snap_info.end()) { + auto& snap_info = snap_info_it->second; + size = snap_info.size; + if (!m_flatten && snap_info.parent.spec.pool_id != -1) { + parent_spec = m_dst_parent_spec; + parent_overlap = snap_info.parent.overlap; + } + } else { + size = m_src_image_ctx->size; + if (!m_flatten) { + parent_spec = m_dst_image_ctx->parent_md.spec; + parent_overlap = m_src_image_ctx->parent_md.overlap; + } + } + } + + auto ctx = create_context_callback< + SnapshotCopyRequest, &SnapshotCopyRequest::handle_set_head>(this); + auto req = SetHeadRequest::create(m_dst_image_ctx, size, parent_spec, + parent_overlap, ctx); + req->send(); +} + +template +void SnapshotCopyRequest::handle_set_head(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (handle_cancellation()) { + return; + } + + send_resize_object_map(); +} + +template +void SnapshotCopyRequest::send_resize_object_map() { + int r = 0; + + if 
(m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) {
+    std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+    std::shared_lock image_locker{m_dst_image_ctx->image_lock};
+
+    if (m_dst_image_ctx->object_map != nullptr &&
+        Striper::get_num_objects(m_dst_image_ctx->layout,
+                                 m_dst_image_ctx->size) !=
+          m_dst_image_ctx->object_map->size()) {
+
+      ldout(m_cct, 20) << dendl;
+
+      auto finish_op_ctx = start_lock_op(m_dst_image_ctx->owner_lock, &r);
+      if (finish_op_ctx != nullptr) {
+        auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+            handle_resize_object_map(r);
+            finish_op_ctx->complete(0);
+          });
+
+        m_dst_image_ctx->object_map->aio_resize(m_dst_image_ctx->size,
+                                                OBJECT_NONEXISTENT, ctx);
+        return;
+      }
+
+      lderr(m_cct) << "lost exclusive lock" << dendl;
+    }
+  }
+
+  finish(r);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::handle_resize_object_map(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(m_cct) << "failed to resize object map: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+template <typename I>
+bool SnapshotCopyRequest<I>::handle_cancellation() {
+  {
+    std::lock_guard locker{m_lock};
+    if (!m_canceled) {
+      return false;
+    }
+  }
+  ldout(m_cct, 10) << "snapshot copy canceled" << dendl;
+  finish(-ECANCELED);
+  return true;
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::error(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  m_work_queue->queue(new LambdaContext([this, r](int r1) { finish(r); }));
+}
+
+template <typename I>
+int SnapshotCopyRequest<I>::validate_parent(I *image_ctx,
+                                            cls::rbd::ParentImageSpec *spec) {
+  std::shared_lock owner_locker{image_ctx->owner_lock};
+  std::shared_lock image_locker{image_ctx->image_lock};
+
+  // ensure source image's parent specs are still consistent
+  *spec = image_ctx->parent_md.spec;
+  for (auto &snap_info_pair : image_ctx->snap_info) {
+    auto &parent_spec = snap_info_pair.second.parent.spec;
+    if (parent_spec.pool_id == -1) {
+      continue;
+    } else if (spec->pool_id == -1) {
+      *spec = parent_spec;
+      continue;
+    }
+
+    if (*spec != parent_spec) {
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+template <typename I>
+Context *SnapshotCopyRequest<I>::start_lock_op(int* r) {
+  std::shared_lock owner_locker{m_dst_image_ctx->owner_lock};
+  return start_lock_op(m_dst_image_ctx->owner_lock, r);
+}
+
+template <typename I>
+Context *SnapshotCopyRequest<I>::start_lock_op(ceph::shared_mutex &owner_lock,
+                                               int* r) {
+  ceph_assert(ceph_mutex_is_locked(m_dst_image_ctx->owner_lock));
+  if (m_dst_image_ctx->exclusive_lock == nullptr) {
+    return new LambdaContext([](int r) {});
+  }
+  return m_dst_image_ctx->exclusive_lock->start_op(r);
+}
+
+template <typename I>
+void SnapshotCopyRequest<I>::finish(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r == 0) {
+    *m_snap_seqs_result = m_snap_seqs;
+  }
+
+  m_on_finish->complete(r);
+  put();
+}
+
+} // namespace deep_copy
+} // namespace librbd
+
+template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>;
diff --git a/src/librbd/deep_copy/SnapshotCopyRequest.h b/src/librbd/deep_copy/SnapshotCopyRequest.h
new file mode 100644
index 000000000..9c6abdf73
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCopyRequest.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
+#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/RefCountedObj.h"
+#include "common/snap_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+
+class Context;
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace deep_copy {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotCopyRequest : public RefCountedObject {
+public:
+  static SnapshotCopyRequest* create(ImageCtxT *src_image_ctx,
+                                     ImageCtxT *dst_image_ctx,
+                                     librados::snap_t src_snap_id_start,
+                                     librados::snap_t src_snap_id_end,
+                                     librados::snap_t dst_snap_id_start,
+                                     bool flatten, asio::ContextWQ *work_queue,
+                                     SnapSeqs *snap_seqs, Context *on_finish) {
+    return new SnapshotCopyRequest(src_image_ctx, dst_image_ctx,
+                                   src_snap_id_start, src_snap_id_end,
+                                   dst_snap_id_start, flatten, work_queue,
+                                   snap_seqs, on_finish);
+  }
+
+  SnapshotCopyRequest(ImageCtxT *src_image_ctx, ImageCtxT *dst_image_ctx,
+                      librados::snap_t src_snap_id_start,
+                      librados::snap_t src_snap_id_end,
+                      librados::snap_t dst_snap_id_start,
+                      bool flatten, asio::ContextWQ *work_queue,
+                      SnapSeqs *snap_seqs, Context *on_finish);
+
+  void send();
+  void cancel();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    |    /-----------\
+   *    |    |           |
+   *    v    v           | (repeat as needed)
+   * UNPROTECT_SNAP ----/
+   *    |
+   *    |    /-----------\
+   *    |    |           |
+   *    v    v           | (repeat as needed)
+   * REMOVE_SNAP -------/
+   *    |
+   *    |    /-----------\
+   *    |    |           |
+   *    v    v           | (repeat as needed)
+   * CREATE_SNAP -------/
+   *    |
+   *    |    /-----------\
+   *    |    |           |
+   *    v    v           | (repeat as needed)
+   * PROTECT_SNAP ------/
+   *    |
+   *    v
+   * SET_HEAD (skip if not needed)
+   *    |
+   *    v
+   * RESIZE_OBJECT_MAP (skip if not needed)
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  typedef std::set<librados::snap_t> SnapIdSet;
+
+  ImageCtxT *m_src_image_ctx;
+  ImageCtxT *m_dst_image_ctx;
+  librados::snap_t m_src_snap_id_start;
+  librados::snap_t m_src_snap_id_end;
+  librados::snap_t m_dst_snap_id_start;
+  bool m_flatten;
+  asio::ContextWQ *m_work_queue;
+  SnapSeqs *m_snap_seqs_result;
+  SnapSeqs m_snap_seqs;
+  Context *m_on_finish;
+
+  CephContext *m_cct;
+  SnapIdSet m_src_snap_ids;
+  SnapIdSet m_dst_snap_ids;
+  librados::snap_t m_prev_snap_id = CEPH_NOSNAP;
+
+  std::string m_snap_name;
+  cls::rbd::SnapshotNamespace m_snap_namespace;
+
+  cls::rbd::ParentImageSpec m_dst_parent_spec;
+
+  ceph::mutex m_lock;
+  bool m_canceled = false;
+
+  void send_snap_unprotect();
+  void handle_snap_unprotect(int r);
+
+  void send_snap_remove();
+  void handle_snap_remove(int r);
+
+  void send_snap_create();
+  void handle_snap_create(int r);
+
+  void send_snap_protect();
+  void handle_snap_protect(int r);
+
+  void send_set_head();
+  void handle_set_head(int r);
+
+  void send_resize_object_map();
+  void handle_resize_object_map(int r);
+
+  bool handle_cancellation();
+
+  void error(int r);
+
+  int validate_parent(ImageCtxT *image_ctx, cls::rbd::ParentImageSpec *spec);
+
+  Context *start_lock_op(int* r);
+  Context *start_lock_op(ceph::shared_mutex &owner_lock, int* r);
+
+  void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SnapshotCopyRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_COPY_REQUEST_H
diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.cc b/src/librbd/deep_copy/SnapshotCreateRequest.cc
new file mode 100644
index 000000000..d437bd355
--- /dev/null
+++ b/src/librbd/deep_copy/SnapshotCreateRequest.cc
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SetHeadRequest.h"
+#include "SnapshotCreateRequest.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include
"librbd/ExclusiveLock.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::deep_copy::SnapshotCreateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace deep_copy { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +SnapshotCreateRequest::SnapshotCreateRequest( + I *dst_image_ctx, const std::string &snap_name, + const cls::rbd::SnapshotNamespace &snap_namespace, + uint64_t size, const cls::rbd::ParentImageSpec &spec, + uint64_t parent_overlap, Context *on_finish) + : m_dst_image_ctx(dst_image_ctx), m_snap_name(snap_name), + m_snap_namespace(snap_namespace), m_size(size), + m_parent_spec(spec), m_parent_overlap(parent_overlap), + m_on_finish(on_finish), m_cct(dst_image_ctx->cct) { +} + +template +void SnapshotCreateRequest::send() { + send_set_head(); +} + +template +void SnapshotCreateRequest::send_set_head() { + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + SnapshotCreateRequest, &SnapshotCreateRequest::handle_set_head>(this); + auto req = SetHeadRequest::create(m_dst_image_ctx, m_size, m_parent_spec, + m_parent_overlap, ctx); + req->send(); +} + +template +void SnapshotCreateRequest::handle_set_head(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to set head: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_create_snap(); +} + +template +void SnapshotCreateRequest::send_create_snap() { + ldout(m_cct, 20) << "snap_name=" << m_snap_name << dendl; + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_create_snap(r); + finish_op_ctx->complete(0); + }); + uint64_t flags = SNAP_CREATE_FLAG_SKIP_OBJECT_MAP | + SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE; + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + m_dst_image_ctx->operations->execute_snap_create( + m_snap_namespace, m_snap_name.c_str(), ctx, 0U, flags, m_prog_ctx); +} + +template +void SnapshotCreateRequest::handle_create_snap(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create snapshot '" << m_snap_name << "': " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_create_object_map(); +} +template +void SnapshotCreateRequest::send_create_object_map() { + + if (!m_dst_image_ctx->test_features(RBD_FEATURE_OBJECT_MAP)) { + finish(0); + return; + } + + m_dst_image_ctx->image_lock.lock_shared(); + auto snap_it = m_dst_image_ctx->snap_ids.find( + {cls::rbd::UserSnapshotNamespace(), m_snap_name}); + if (snap_it == m_dst_image_ctx->snap_ids.end()) { + lderr(m_cct) << "failed to locate snap: " << m_snap_name << dendl; + m_dst_image_ctx->image_lock.unlock_shared(); + finish(-ENOENT); + return; + } + librados::snap_t local_snap_id = snap_it->second; + m_dst_image_ctx->image_lock.unlock_shared(); + + std::string object_map_oid(librbd::ObjectMap<>::object_map_name( + m_dst_image_ctx->id, local_snap_id)); + uint64_t object_count = Striper::get_num_objects(m_dst_image_ctx->layout, + m_size); + ldout(m_cct, 20) << "object_map_oid=" << object_map_oid << ", " + << "object_count=" << object_count << dendl; + + // initialize an empty object map of the correct size (object 
sync + // will populate the object map) + librados::ObjectWriteOperation op; + librbd::cls_client::object_map_resize(&op, object_count, OBJECT_NONEXISTENT); + + int r; + auto finish_op_ctx = start_lock_op(&r); + if (finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + finish(r); + return; + } + + auto ctx = new LambdaContext([this, finish_op_ctx](int r) { + handle_create_object_map(r); + finish_op_ctx->complete(0); + }); + librados::AioCompletion *comp = create_rados_callback(ctx); + r = m_dst_image_ctx->md_ctx.aio_operate(object_map_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void SnapshotCreateRequest::handle_create_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create object map: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +Context *SnapshotCreateRequest::start_lock_op(int* r) { + std::shared_lock owner_locker{m_dst_image_ctx->owner_lock}; + if (m_dst_image_ctx->exclusive_lock == nullptr) { + return new LambdaContext([](int r) {}); + } + return m_dst_image_ctx->exclusive_lock->start_op(r); +} + +template +void SnapshotCreateRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace deep_copy +} // namespace librbd + +template class librbd::deep_copy::SnapshotCreateRequest; diff --git a/src/librbd/deep_copy/SnapshotCreateRequest.h b/src/librbd/deep_copy/SnapshotCreateRequest.h new file mode 100644 index 000000000..41f7f54e4 --- /dev/null +++ b/src/librbd/deep_copy/SnapshotCreateRequest.h @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H +#define CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H + +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "common/snap_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include "librbd/internal.h" + +#include +#include +#include +#include + +class Context; + +namespace librbd { +namespace deep_copy { + +template +class SnapshotCreateRequest { +public: + static SnapshotCreateRequest* create(ImageCtxT *dst_image_ctx, + const std::string &snap_name, + const cls::rbd::SnapshotNamespace &snap_namespace, + uint64_t size, + const cls::rbd::ParentImageSpec &parent_spec, + uint64_t parent_overlap, + Context *on_finish) { + return new SnapshotCreateRequest(dst_image_ctx, snap_name, snap_namespace, size, + parent_spec, parent_overlap, on_finish); + } + + SnapshotCreateRequest(ImageCtxT *dst_image_ctx, + const std::string &snap_name, + const cls::rbd::SnapshotNamespace &snap_namespace, + uint64_t size, + const cls::rbd::ParentImageSpec &parent_spec, + uint64_t parent_overlap, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * SET_HEAD + * | + * v + * CREATE_SNAP + * | + * v (skip if not needed) + * CREATE_OBJECT_MAP + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_dst_image_ctx; + std::string m_snap_name; + cls::rbd::SnapshotNamespace m_snap_namespace; + uint64_t m_size; + cls::rbd::ParentImageSpec m_parent_spec; + uint64_t m_parent_overlap; + Context *m_on_finish; + + CephContext *m_cct; + NoOpProgressContext m_prog_ctx; + + void send_set_head(); + void handle_set_head(int r); + + void send_create_snap(); + void handle_create_snap(int r); + + void send_create_object_map(); + void 
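+  // The send_create_object_map() step defined in the .cc above issues its
+  // cls call directly rather than going through a full ObjectMap instance.
+  // A minimal sketch of that idiom, assuming an IoCtx, an oid string and a
+  // Context* ctx are in scope (those names are illustrative, not part of
+  // this header):
+  //
+  //   librados::ObjectWriteOperation op;
+  //   librbd::cls_client::object_map_resize(&op, object_count,
+  //                                         OBJECT_NONEXISTENT);
+  //   librados::AioCompletion *comp =
+  //     librbd::util::create_rados_callback(ctx);
+  //   int r = io_ctx.aio_operate(oid, comp, &op);
+  //   ceph_assert(r == 0);  // aio_operate only queues the op
+  //   comp->release();      // completion stays alive until the callback
+  //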
handle_create_object_map(int r);
+
+  Context *start_lock_op(int* r);
+
+  void finish(int r);
+};
+
+} // namespace deep_copy
+} // namespace librbd
+
+extern template class librbd::deep_copy::SnapshotCreateRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_DEEP_COPY_SNAPSHOT_CREATE_REQUEST_H
diff --git a/src/librbd/deep_copy/Types.h b/src/librbd/deep_copy/Types.h
new file mode 100644
index 000000000..9cd4835b3
--- /dev/null
+++ b/src/librbd/deep_copy/Types.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_TYPES_H
+#define CEPH_LIBRBD_DEEP_COPY_TYPES_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include <boost/optional.hpp>
+
+namespace librbd {
+namespace deep_copy {
+
+enum {
+  OBJECT_COPY_REQUEST_FLAG_FLATTEN      = 1U << 0,
+  OBJECT_COPY_REQUEST_FLAG_MIGRATION    = 1U << 1,
+  OBJECT_COPY_REQUEST_FLAG_EXISTS_CLEAN = 1U << 2,
+};
+
+typedef std::vector<librados::snap_t> SnapIds;
+typedef std::map<librados::snap_t, SnapIds> SnapMap;
+
+typedef boost::optional<uint64_t> ObjectNumber;
+
+} // namespace deep_copy
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_DEEP_COPY_TYPES_H
diff --git a/src/librbd/deep_copy/Utils.cc b/src/librbd/deep_copy/Utils.cc
new file mode 100644
index 000000000..c2dd25020
--- /dev/null
+++ b/src/librbd/deep_copy/Utils.cc
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "Utils.h"
+#include <set>
+
+namespace librbd {
+namespace deep_copy {
+namespace util {
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::deep_copy::util::" << __func__ << ": "
+
+void compute_snap_map(CephContext* cct,
+                      librados::snap_t src_snap_id_start,
+                      librados::snap_t src_snap_id_end,
+                      const SnapIds& dst_snap_ids,
+                      const SnapSeqs &snap_seqs,
+                      SnapMap *snap_map) {
+  std::set<librados::snap_t> ordered_dst_snap_ids{
+    dst_snap_ids.begin(), dst_snap_ids.end()};
+  auto dst_snap_id_it = ordered_dst_snap_ids.begin();
+
+  SnapIds snap_ids;
+  for (auto &it : snap_seqs) {
+    // ensure all dst snap ids are included in the mapping table since
+    // deep copy will skip non-user snapshots
+    while (dst_snap_id_it != ordered_dst_snap_ids.end()) {
+      if (*dst_snap_id_it < it.second) {
+        snap_ids.insert(snap_ids.begin(), *dst_snap_id_it);
+      } else if (*dst_snap_id_it > it.second) {
+        break;
+      }
+      ++dst_snap_id_it;
+    }
+
+    // we should only have the HEAD revision in the last snap seq
+    ceph_assert(snap_ids.empty() || snap_ids[0] != CEPH_NOSNAP);
+    snap_ids.insert(snap_ids.begin(), it.second);
+
+    if (it.first < src_snap_id_start) {
+      continue;
+    } else if (it.first > src_snap_id_end) {
+      break;
+    }
+
+    (*snap_map)[it.first] = snap_ids;
+  }
+
+  ldout(cct, 10) << "src_snap_id_start=" << src_snap_id_start << ", "
+                 << "src_snap_id_end=" << src_snap_id_end << ", "
+                 << "dst_snap_ids=" << dst_snap_ids << ", "
+                 << "snap_seqs=" << snap_seqs << ", "
+                 << "snap_map=" << *snap_map << dendl;
+}
+
+} // namespace util
+} // namespace deep_copy
+} // namespace librbd
diff --git a/src/librbd/deep_copy/Utils.h b/src/librbd/deep_copy/Utils.h
new file mode 100644
index 000000000..268a39daf
--- /dev/null
+++ b/src/librbd/deep_copy/Utils.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_DEEP_COPY_UTILS_H
+#define CEPH_LIBRBD_DEEP_COPY_UTILS_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Types.h"
+#include
"librbd/deep_copy/Types.h" + +#include + +namespace librbd { +namespace deep_copy { +namespace util { + +void compute_snap_map(CephContext* cct, + librados::snap_t src_snap_id_start, + librados::snap_t src_snap_id_end, + const SnapIds& dst_snap_ids, + const SnapSeqs &snap_seqs, + SnapMap *snap_map); + +} // namespace util +} // namespace deep_copy +} // namespace librbd + +#endif // CEPH_LIBRBD_DEEP_COPY_UTILS_H diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.cc b/src/librbd/exclusive_lock/AutomaticPolicy.cc new file mode 100644 index 000000000..bfaddc1b2 --- /dev/null +++ b/src/librbd/exclusive_lock/AutomaticPolicy.cc @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/AutomaticPolicy.h" +#include "librbd/ImageCtx.h" +#include "librbd/ExclusiveLock.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ExclusiveLock::AutomaticPolicy " + +namespace librbd { +namespace exclusive_lock { + +int AutomaticPolicy::lock_requested(bool force) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock)); + ceph_assert(m_image_ctx->exclusive_lock != nullptr); + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force + << dendl; + + // release the lock upon request (ignore forced requests) + m_image_ctx->exclusive_lock->release_lock(nullptr); + return 0; +} + +} // namespace exclusive_lock +} // namespace librbd + diff --git a/src/librbd/exclusive_lock/AutomaticPolicy.h b/src/librbd/exclusive_lock/AutomaticPolicy.h new file mode 100644 index 000000000..12ba9b6c4 --- /dev/null +++ b/src/librbd/exclusive_lock/AutomaticPolicy.h @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H + +#include "librbd/exclusive_lock/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace exclusive_lock { + +class AutomaticPolicy : public Policy { +public: + AutomaticPolicy(ImageCtx *image_ctx) : m_image_ctx(image_ctx) { + } + + bool may_auto_request_lock() override { + return true; + } + + int lock_requested(bool force) override; + +private: + ImageCtx *m_image_ctx; + +}; + +} // namespace exclusive_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_AUTOMATIC_POLICY_H diff --git a/src/librbd/exclusive_lock/ImageDispatch.cc b/src/librbd/exclusive_lock/ImageDispatch.cc new file mode 100644 index 000000000..5939c7a81 --- /dev/null +++ b/src/librbd/exclusive_lock/ImageDispatch.cc @@ -0,0 +1,320 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/ImageDispatch.h" +#include "include/Context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/exclusive_lock/Policy.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::exclusive_lock::ImageDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace exclusive_lock { + +using util::create_context_callback; +using util::create_async_context_callback; + +template 
+ImageDispatch::ImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx), + m_lock(ceph::make_shared_mutex( + util::unique_lock_name("librbd::exclusve_lock::ImageDispatch::m_lock", + this))) { +} + +template +void ImageDispatch::shut_down(Context* on_finish) { + // release any IO waiting on exclusive lock + Contexts on_dispatches; + { + std::unique_lock locker{m_lock}; + std::swap(on_dispatches, m_on_dispatches); + } + + for (auto ctx : on_dispatches) { + ctx->complete(0); + } + + on_finish->complete(0); +} + +template +void ImageDispatch::set_require_lock(bool init_shutdown, + io::Direction direction, + Context* on_finish) { + // pause any matching IO from proceeding past this layer + set_require_lock(direction, true); + + if (direction == io::DIRECTION_READ) { + on_finish->complete(0); + return; + } + + // push through a flush for any in-flight writes at lower levels + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, util::get_image_ctx(m_image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK, aio_comp, + (init_shutdown ? + io::FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH : + io::FLUSH_SOURCE_EXCLUSIVE_LOCK), {}); + req->send(); +} + +template +void ImageDispatch::unset_require_lock(io::Direction direction) { + set_require_lock(direction, false); +} + +template +bool ImageDispatch::set_require_lock(io::Direction direction, bool enabled) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "direction=" << direction << ", enabled=" << enabled + << dendl; + + std::unique_lock locker{m_lock}; + auto prev_require_lock = (m_require_lock_on_read || m_require_lock_on_write); + + switch (direction) { + case io::DIRECTION_READ: + m_require_lock_on_read = enabled; + break; + case io::DIRECTION_WRITE: + m_require_lock_on_write = enabled; + break; + case io::DIRECTION_BOTH: + m_require_lock_on_read = enabled; + m_require_lock_on_write = enabled; + break; + } + + bool require_lock = (m_require_lock_on_read || m_require_lock_on_write); + return ((enabled && !prev_require_lock && require_lock) || + (!enabled && prev_require_lock && !require_lock)); +} + +template +bool ImageDispatch::read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + if (needs_exclusive_lock(true, tid, dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool ImageDispatch::write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool ImageDispatch::discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** 
on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+    return true;
+  }
+
+  return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::write_same(
+    io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+    return true;
+  }
+
+  return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::compare_and_write(
+    io::AioCompletion* aio_comp, io::Extents &&image_extents,
+    bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+    int op_flags, const ZTracer::Trace &parent_trace,
+    uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents
+                 << dendl;
+
+  if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+    return true;
+  }
+
+  return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::flush(
+    io::AioCompletion* aio_comp, io::FlushSource flush_source,
+    const ZTracer::Trace &parent_trace, uint64_t tid,
+    std::atomic<uint32_t>* image_dispatch_flags,
+    io::DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "tid=" << tid << dendl;
+
+  // don't attempt to grab the exclusive lock if we are just internally
+  // clearing out our in-flight IO queue
+  if (flush_source != io::FLUSH_SOURCE_USER) {
+    return false;
+  }
+
+  if (needs_exclusive_lock(false, tid, dispatch_result, on_dispatched)) {
+    return true;
+  }
+
+  return false;
+}
+
+template <typename I>
+bool ImageDispatch<I>::is_lock_required(bool read_op) const {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  return ((read_op && m_require_lock_on_read) ||
+          (!read_op && m_require_lock_on_write));
+}
+
+template <typename I>
+bool ImageDispatch<I>::needs_exclusive_lock(bool read_op, uint64_t tid,
+                                            io::DispatchResult* dispatch_result,
+                                            Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  bool lock_required = false;
+  {
+    std::shared_lock locker{m_lock};
+    lock_required = is_lock_required(read_op);
+  }
+
+  if (lock_required) {
+    std::shared_lock owner_locker{m_image_ctx->owner_lock};
+    if (m_image_ctx->exclusive_lock == nullptr) {
+      // raced with the exclusive lock being disabled
+      return false;
+    }
+
+    ldout(cct, 5) << "exclusive lock required: delaying IO" << dendl;
+    if (!m_image_ctx->get_exclusive_lock_policy()->may_auto_request_lock()) {
+      lderr(cct) << "op requires exclusive lock" << dendl;
+
+      *dispatch_result = io::DISPATCH_RESULT_CONTINUE;
+      on_dispatched->complete(
+        m_image_ctx->exclusive_lock->get_unlocked_op_error());
+      return true;
+    }
+
+    // block potential races with other incoming IOs
+    std::unique_lock locker{m_lock};
+    bool retesting_lock = (
+      !m_on_dispatches.empty() && m_on_dispatches.front() == on_dispatched);
+    if (!m_on_dispatches.empty() && !retesting_lock) {
+      *dispatch_result = io::DISPATCH_RESULT_RESTART;
+      m_on_dispatches.push_back(on_dispatched);
+      return true;
+    }
+
+    if (!is_lock_required(read_op)) {
+      return false;
+    }
+
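+    // At this point the lock is still required and nothing is queued ahead
+    // of us: fall through to register this context as the first waiter and
+    // drive an async lock acquisition. A condensed sketch of the
+    // waiter-queue idiom used here (simplified; locking and restart
+    // plumbing elided, names generic):
+    //
+    //   if (!waiters.empty() && waiters.front() != ctx) {
+    //     waiters.push_back(ctx);    // someone is already acquiring
+    //     return RESTART;            // re-dispatch once the lock settles
+    //   }
+    //   waiters.push_back(ctx);
+    //   exclusive_lock->acquire_lock(on_acquired);  // first waiter drives
+    //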
ceph_assert(m_on_dispatches.empty() || retesting_lock); + m_on_dispatches.push_back(on_dispatched); + locker.unlock(); + + *dispatch_result = io::DISPATCH_RESULT_RESTART; + auto ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + ImageDispatch, &ImageDispatch::handle_acquire_lock>(this)); + m_image_ctx->exclusive_lock->acquire_lock(ctx); + return true; + } + + return false; +} + +template +void ImageDispatch::handle_acquire_lock(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + std::unique_lock locker{m_lock}; + ceph_assert(!m_on_dispatches.empty()); + + Context* failed_dispatch = nullptr; + Contexts on_dispatches; + if (r == -ERESTART) { + ldout(cct, 5) << "IO raced with exclusive lock shutdown" << dendl; + } else if (r < 0) { + lderr(cct) << "failed to acquire exclusive lock: " << cpp_strerror(r) + << dendl; + failed_dispatch = m_on_dispatches.front(); + m_on_dispatches.pop_front(); + } + + // re-test if lock is still required (i.e. it wasn't acquired/lost) via a + // restart dispatch + std::swap(on_dispatches, m_on_dispatches); + locker.unlock(); + + if (failed_dispatch != nullptr) { + failed_dispatch->complete(r); + } + for (auto ctx : on_dispatches) { + ctx->complete(0); + } +} + +} // namespace exclusive_lock +} // namespace librbd + +template class librbd::exclusive_lock::ImageDispatch; diff --git a/src/librbd/exclusive_lock/ImageDispatch.h b/src/librbd/exclusive_lock/ImageDispatch.h new file mode 100644 index 000000000..c0d9d49f5 --- /dev/null +++ b/src/librbd/exclusive_lock/ImageDispatch.h @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/ceph_mutex.h" +#include "common/zipkin_trace.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { +struct AioCompletion; +} + +namespace exclusive_lock { + +template +class ImageDispatch : public io::ImageDispatchInterface { +public: + static ImageDispatch* create(ImageCtxT* image_ctx) { + return new ImageDispatch(image_ctx); + } + void destroy() { + delete this; + } + + ImageDispatch(ImageCtxT* image_ctx); + + io::ImageDispatchLayer get_dispatch_layer() const override { + return io::IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK; + } + + void set_require_lock(bool init_shutdown, + io::Direction direction, Context* on_finish); + void unset_require_lock(io::Direction direction); + + void shut_down(Context* on_finish) override; + + bool read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const 
ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool write_same(
+      io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool compare_and_write(
+      io::AioCompletion* aio_comp, io::Extents &&image_extents,
+      bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset,
+      int op_flags, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+  bool flush(
+      io::AioCompletion* aio_comp, io::FlushSource flush_source,
+      const ZTracer::Trace &parent_trace, uint64_t tid,
+      std::atomic<uint32_t>* image_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  bool list_snaps(
+      io::AioCompletion* aio_comp, io::Extents&& image_extents,
+      io::SnapIds&& snap_ids, int list_snaps_flags,
+      io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace,
+      uint64_t tid, std::atomic<uint32_t>* image_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+  bool invalidate_cache(Context* on_finish) override {
+    return false;
+  }
+
+private:
+  typedef std::list<Context*> Contexts;
+  typedef std::unordered_set<uint64_t> Tids;
+
+  ImageCtxT* m_image_ctx;
+  mutable ceph::shared_mutex m_lock;
+
+  bool m_require_lock_on_read = false;
+  bool m_require_lock_on_write = false;
+
+  Contexts m_on_dispatches;
+
+  bool set_require_lock(io::Direction direction, bool enabled);
+
+  bool is_lock_required(bool read_op) const;
+
+  bool needs_exclusive_lock(bool read_op, uint64_t tid,
+                            io::DispatchResult* dispatch_result,
+                            Context* on_dispatched);
+
+  void handle_acquire_lock(int r);
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+extern template class librbd::exclusive_lock::ImageDispatch<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_IMAGE_DISPATCH_H
diff --git a/src/librbd/exclusive_lock/Policy.h b/src/librbd/exclusive_lock/Policy.h
new file mode 100644
index 000000000..7064a6515
--- /dev/null
+++ b/src/librbd/exclusive_lock/Policy.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
+
+namespace librbd {
+namespace exclusive_lock {
+
+enum OperationRequestType {
+  OPERATION_REQUEST_TYPE_GENERAL = 0,
+  OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE = 1,
+  OPERATION_REQUEST_TYPE_FORCE_PROMOTION = 2,
+};
+
+struct Policy {
+  virtual ~Policy() {
+  }
+
+  virtual bool may_auto_request_lock() = 0;
+  virtual int lock_requested(bool force) = 0;
+
+  virtual bool accept_blocked_request(OperationRequestType) {
+    return false;
+  }
+};
+
+} // namespace exclusive_lock
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POLICY_H
diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.cc b/src/librbd/exclusive_lock/PostAcquireRequest.cc
new file mode 100644
index 000000000..4553b2158
--- /dev/null
+++ b/src/librbd/exclusive_lock/PostAcquireRequest.cc
@@ -0,0 +1,368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include
"librbd/exclusive_lock/PostAcquireRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/image/RefreshRequest.h" +#include "librbd/journal/Policy.h" +#include "librbd/PluginRegistry.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::exclusive_lock::PostAcquireRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace exclusive_lock { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template +PostAcquireRequest* PostAcquireRequest::create(I &image_ctx, + Context *on_acquire, + Context *on_finish) { + return new PostAcquireRequest(image_ctx, on_acquire, on_finish); +} + +template +PostAcquireRequest::PostAcquireRequest(I &image_ctx, Context *on_acquire, + Context *on_finish) + : m_image_ctx(image_ctx), + m_on_acquire(on_acquire), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_object_map(nullptr), m_journal(nullptr), m_error_result(0) { +} + +template +PostAcquireRequest::~PostAcquireRequest() { + if (!m_prepare_lock_completed) { + m_image_ctx.state->handle_prepare_lock_complete(); + } + delete m_on_acquire; +} + +template +void PostAcquireRequest::send() { + send_refresh(); +} + +template +void PostAcquireRequest::send_refresh() { + if (!m_image_ctx.state->is_refresh_required()) { + send_open_object_map(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback(this)); + + // ImageState is blocked waiting for lock to complete -- safe to directly + // refresh + image::RefreshRequest *req = image::RefreshRequest::create( + m_image_ctx, true, false, ctx); + req->send(); +} + +template +void PostAcquireRequest::handle_refresh(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r == -ERESTART) { + // next issued IO or op will (re)-refresh the image and shut down lock + ldout(cct, 5) << "exclusive lock dynamically disabled" << dendl; + r = 0; + } else if (r < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl; + save_result(r); + revert(); + finish(); + return; + } + + send_open_object_map(); +} + +template +void PostAcquireRequest::send_open_journal() { + // alert caller that we now own the exclusive lock + m_on_acquire->complete(0); + m_on_acquire = nullptr; + + bool journal_enabled; + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + journal_enabled = (m_image_ctx.test_features(RBD_FEATURE_JOURNALING, + m_image_ctx.image_lock) && + !m_image_ctx.get_journal_policy()->journal_disabled()); + } + if (!journal_enabled) { + apply(); + send_process_plugin_acquire_lock(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_context_callback( + this); + m_journal = m_image_ctx.create_journal(); + + // journal playback requires object map (if enabled) and itself + apply(); + + m_journal->open(ctx); +} + +template +void PostAcquireRequest::handle_open_journal(int r) { + 
CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl; + send_close_journal(); + return; + } + + send_allocate_journal_tag(); +} + +template +void PostAcquireRequest::send_allocate_journal_tag() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + std::shared_lock image_locker{m_image_ctx.image_lock}; + using klass = PostAcquireRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_allocate_journal_tag>(this, m_journal); + m_image_ctx.get_journal_policy()->allocate_tag_on_lock(ctx); +} + +template +void PostAcquireRequest::handle_allocate_journal_tag(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to allocate journal tag: " << cpp_strerror(r) + << dendl; + send_close_journal(); + return; + } + + send_process_plugin_acquire_lock(); +} + +template +void PostAcquireRequest::send_process_plugin_acquire_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_process_plugin_acquire_lock>(this); + m_image_ctx.plugin_registry->acquired_exclusive_lock(ctx); +} + +template +void PostAcquireRequest::handle_process_plugin_acquire_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to process plugins: " << cpp_strerror(r) + << dendl; + send_process_plugin_release_lock(); + return; + } + + finish(); +} + +template +void PostAcquireRequest::send_process_plugin_release_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_process_plugin_release_lock>(this); + m_image_ctx.plugin_registry->prerelease_exclusive_lock(ctx); +} + +template +void PostAcquireRequest::handle_process_plugin_release_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to release plugins: " << cpp_strerror(r) + << dendl; + } + send_close_journal(); +} + +template +void PostAcquireRequest::send_close_journal() { + if (m_journal == nullptr) { + send_close_object_map(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_context_callback( + this); + m_journal->close(ctx); +} + +template +void PostAcquireRequest::handle_close_journal(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl; + } + + send_close_object_map(); +} + +template +void PostAcquireRequest::send_open_object_map() { + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + send_open_journal(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_context_callback( + this); + + m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP); + m_object_map->open(ctx); +} + +template +void PostAcquireRequest::handle_open_object_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to open 
object map: " << cpp_strerror(r) << dendl; + m_object_map->put(); + m_object_map = nullptr; + + if (r != -EFBIG) { + save_result(r); + revert(); + finish(); + return; + } + } + + send_open_journal(); +} + +template +void PostAcquireRequest::send_close_object_map() { + if (m_object_map == nullptr) { + revert(); + finish(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PostAcquireRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_close_object_map>(this); + m_object_map->close(ctx); +} + +template +void PostAcquireRequest::handle_close_object_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl; + } + + revert(); + finish(); +} + +template +void PostAcquireRequest::apply() { + { + std::unique_lock image_locker{m_image_ctx.image_lock}; + ceph_assert(m_image_ctx.object_map == nullptr); + m_image_ctx.object_map = m_object_map; + + ceph_assert(m_image_ctx.journal == nullptr); + m_image_ctx.journal = m_journal; + } + + m_prepare_lock_completed = true; + m_image_ctx.state->handle_prepare_lock_complete(); +} + +template +void PostAcquireRequest::revert() { + std::unique_lock image_locker{m_image_ctx.image_lock}; + m_image_ctx.object_map = nullptr; + m_image_ctx.journal = nullptr; + + if (m_object_map) { + m_object_map->put(); + } + if (m_journal) { + m_journal->put(); + } + + ceph_assert(m_error_result < 0); +} + +template +void PostAcquireRequest::finish() { + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace exclusive_lock +} // namespace librbd + +template class librbd::exclusive_lock::PostAcquireRequest; diff --git a/src/librbd/exclusive_lock/PostAcquireRequest.h b/src/librbd/exclusive_lock/PostAcquireRequest.h new file mode 100644 index 000000000..2f7efdf07 --- /dev/null +++ b/src/librbd/exclusive_lock/PostAcquireRequest.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "librbd/ImageCtx.h" +#include "msg/msg_types.h" +#include + +class Context; + +namespace librbd { + +namespace exclusive_lock { + +template +class PostAcquireRequest { +public: + static PostAcquireRequest* create(ImageCtxT &image_ctx, Context *on_acquire, + Context *on_finish); + + ~PostAcquireRequest(); + void send(); + +private: + + /** + * @verbatim + * + * + * | + * | + * v + * REFRESH (skip if not + * | needed) + * v + * OPEN_OBJECT_MAP (skip if + * | disabled) + * v + * OPEN_JOURNAL (skip if + * | * disabled) + * | * + * | * * * * * * * * + * v * + * ALLOCATE_JOURNAL_TAG * + * | * * + * | * * + * v * * + * PROCESS_PLUGIN_ACQUIRE* + * | * * + * | * * + * | v v v + * | PROCESS_PLUGIN_RELEASE + * | | + * | v + * | CLOSE_JOURNAL + * | | + * | v + * | CLOSE_OBJECT_MAP + * | | + * v | + * <----------/ + * + * @endverbatim + */ + + PostAcquireRequest(ImageCtxT &image_ctx, Context *on_acquire, + Context *on_finish); + + ImageCtxT &m_image_ctx; + Context *m_on_acquire; + Context *m_on_finish; + + decltype(m_image_ctx.object_map) m_object_map; + decltype(m_image_ctx.journal) m_journal; + + bool m_prepare_lock_completed = false; + int m_error_result; + + void send_refresh(); + void handle_refresh(int r); + + void send_open_journal(); + void 
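+  // The send_*/handle_* pairs declared here follow one convention: each
+  // handler records the first failure via save_result() and then either
+  // advances or unwinds through the close_journal/close_object_map
+  // rollback path. A minimal sketch of the convention (names generic):
+  //
+  //   void handle_step(int r) {
+  //     save_result(r);          // keeps only the first error
+  //     if (r < 0) {
+  //       send_rollback_step();  // unwind already-applied state
+  //       return;
+  //     }
+  //     send_next_step();
+  //   }
+  //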
handle_open_journal(int r); + + void send_allocate_journal_tag(); + void handle_allocate_journal_tag(int r); + + void send_open_object_map(); + void handle_open_object_map(int r); + + void send_close_journal(); + void handle_close_journal(int r); + + void send_close_object_map(); + void handle_close_object_map(int r); + + void send_process_plugin_acquire_lock(); + void handle_process_plugin_acquire_lock(int r); + + void send_process_plugin_release_lock(); + void handle_process_plugin_release_lock(int r); + + void apply(); + void revert(); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace exclusive_lock +} // namespace librbd + +extern template class librbd::exclusive_lock::PostAcquireRequest; + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_POST_ACQUIRE_REQUEST_H diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.cc b/src/librbd/exclusive_lock/PreAcquireRequest.cc new file mode 100644 index 000000000..feb0913d7 --- /dev/null +++ b/src/librbd/exclusive_lock/PreAcquireRequest.cc @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/PreAcquireRequest.h" +#include "librbd/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ImageState.h" +#include "librbd/asio/ContextWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::exclusive_lock::PreAcquireRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace exclusive_lock { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template +PreAcquireRequest* PreAcquireRequest::create(I &image_ctx, + Context *on_finish) { + return new PreAcquireRequest(image_ctx, on_finish); +} + +template +PreAcquireRequest::PreAcquireRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), + m_on_finish(create_async_context_callback(image_ctx, on_finish)), + m_error_result(0) { +} + +template +PreAcquireRequest::~PreAcquireRequest() { +} + +template +void PreAcquireRequest::send() { + send_prepare_lock(); +} + +template +void PreAcquireRequest::send_prepare_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + // acquire the lock if the image is not busy performing other actions + Context *ctx = create_context_callback< + PreAcquireRequest, &PreAcquireRequest::handle_prepare_lock>(this); + m_image_ctx.state->prepare_lock(ctx); +} + +template +void PreAcquireRequest::handle_prepare_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + send_flush_notifies(); +} + +template +void PreAcquireRequest::send_flush_notifies() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreAcquireRequest; + Context *ctx = create_context_callback( + this); + m_image_ctx.image_watcher->flush(ctx); +} + +template +void PreAcquireRequest::handle_flush_notifies(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + ceph_assert(r == 0); + finish(); +} + +template +void PreAcquireRequest::finish() { + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace exclusive_lock +} // namespace librbd + +template class librbd::exclusive_lock::PreAcquireRequest; diff --git a/src/librbd/exclusive_lock/PreAcquireRequest.h 
b/src/librbd/exclusive_lock/PreAcquireRequest.h new file mode 100644 index 000000000..15d4b2c12 --- /dev/null +++ b/src/librbd/exclusive_lock/PreAcquireRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_ACQUIRE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "librbd/ImageCtx.h" +#include "msg/msg_types.h" +#include + +class Context; + +namespace librbd { + +namespace exclusive_lock { + +template +class PreAcquireRequest { +public: + static PreAcquireRequest* create(ImageCtxT &image_ctx, Context *on_finish); + + ~PreAcquireRequest(); + void send(); + +private: + + /** + * @verbatim + * + * + * | + * v + * PREPARE_LOCK + * | + * v + * FLUSH_NOTIFIES + * | + * | + * | + v + * + * + * @endverbatim + */ + + PreAcquireRequest(ImageCtxT &image_ctx, Context *on_finish); + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + int m_error_result; + + void send_prepare_lock(); + void handle_prepare_lock(int r); + + void send_flush_notifies(); + void handle_flush_notifies(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace exclusive_lock +} // namespace librbd + +extern template class librbd::exclusive_lock::PreAcquireRequest; + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_ACQUIRE_REQUEST_H diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.cc b/src/librbd/exclusive_lock/PreReleaseRequest.cc new file mode 100644 index 000000000..a9cd1248a --- /dev/null +++ b/src/librbd/exclusive_lock/PreReleaseRequest.cc @@ -0,0 +1,363 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/PreReleaseRequest.h" +#include "common/AsyncOpTracker.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/exclusive_lock/ImageDispatch.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/Types.h" +#include "librbd/PluginRegistry.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::exclusive_lock::PreReleaseRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace exclusive_lock { + +using util::create_async_context_callback; +using util::create_context_callback; + +template +PreReleaseRequest* PreReleaseRequest::create( + I &image_ctx, ImageDispatch* image_dispatch, bool shutting_down, + AsyncOpTracker &async_op_tracker, Context *on_finish) { + return new PreReleaseRequest(image_ctx, image_dispatch, shutting_down, + async_op_tracker, on_finish); +} + +template +PreReleaseRequest::PreReleaseRequest(I &image_ctx, + ImageDispatch* image_dispatch, + bool shutting_down, + AsyncOpTracker &async_op_tracker, + Context *on_finish) + : m_image_ctx(image_ctx), m_image_dispatch(image_dispatch), + m_shutting_down(shutting_down), m_async_op_tracker(async_op_tracker), + m_on_finish(create_async_context_callback(image_ctx, on_finish)) { +} + +template +PreReleaseRequest::~PreReleaseRequest() { + if (!m_shutting_down) { + 
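+    // prepare_lock() and handle_prepare_lock_complete() must always pair
+    // up: the destructor completes the ImageState handshake when the
+    // request is torn down outside of shutdown. Sketch of the pairing this
+    // relies on (simplified):
+    //
+    //   state->prepare_lock(ctx);               // blocks other state ops
+    //   ...acquire/release state machine runs...
+    //   state->handle_prepare_lock_complete();  // re-enables them
+    //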
m_image_ctx.state->handle_prepare_lock_complete(); + } +} + +template +void PreReleaseRequest::send() { + send_cancel_op_requests(); +} + +template +void PreReleaseRequest::send_cancel_op_requests() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreReleaseRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_cancel_op_requests>(this); + m_image_ctx.cancel_async_requests(ctx); +} + +template +void PreReleaseRequest::handle_cancel_op_requests(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + ceph_assert(r == 0); + + send_set_require_lock(); +} + +template +void PreReleaseRequest::send_set_require_lock() { + if (!m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) { + // exclusive-lock was disabled, no need to block IOs + send_wait_for_ops(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreReleaseRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_set_require_lock>(this); + + // setting the lock as required will automatically cause the IO + // queue to re-request the lock if any IO is queued + if (m_image_ctx.clone_copy_on_read || + m_image_ctx.test_features(RBD_FEATURE_JOURNALING) || + m_image_ctx.test_features(RBD_FEATURE_DIRTY_CACHE)) { + m_image_dispatch->set_require_lock(m_shutting_down, + io::DIRECTION_BOTH, ctx); + } else { + m_image_dispatch->set_require_lock(m_shutting_down, + io::DIRECTION_WRITE, ctx); + } +} + +template +void PreReleaseRequest::handle_set_require_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + // IOs are still flushed regardless of the error + lderr(cct) << "failed to set lock: " << cpp_strerror(r) << dendl; + } + + send_wait_for_ops(); +} + +template +void PreReleaseRequest::send_wait_for_ops() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + Context *ctx = create_context_callback< + PreReleaseRequest, &PreReleaseRequest::handle_wait_for_ops>(this); + m_async_op_tracker.wait_for_ops(ctx); +} + +template +void PreReleaseRequest::handle_wait_for_ops(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + send_prepare_lock(); +} + +template +void PreReleaseRequest::send_prepare_lock() { + if (m_shutting_down) { + send_process_plugin_release_lock(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + // release the lock if the image is not busy performing other actions + Context *ctx = create_context_callback< + PreReleaseRequest, &PreReleaseRequest::handle_prepare_lock>(this); + m_image_ctx.state->prepare_lock(ctx); +} + +template +void PreReleaseRequest::handle_prepare_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + send_process_plugin_release_lock(); +} + +template +void PreReleaseRequest::send_process_plugin_release_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + Context *ctx = create_async_context_callback(m_image_ctx, create_context_callback< + PreReleaseRequest, + &PreReleaseRequest::handle_process_plugin_release_lock>(this)); + m_image_ctx.plugin_registry->prerelease_exclusive_lock(ctx); +} + +template +void PreReleaseRequest::handle_process_plugin_release_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to handle plugins before 
releasing lock: " + << cpp_strerror(r) << dendl; + m_image_dispatch->unset_require_lock(io::DIRECTION_BOTH); + save_result(r); + finish(); + return; + } + + send_invalidate_cache(); +} + +template +void PreReleaseRequest::send_invalidate_cache() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + Context *ctx = create_context_callback< + PreReleaseRequest, + &PreReleaseRequest::handle_invalidate_cache>(this); + m_image_ctx.io_image_dispatcher->invalidate_cache(ctx); +} + +template +void PreReleaseRequest::handle_invalidate_cache(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -EBLOCKLISTED && r != -EBUSY) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(r) + << dendl; + m_image_dispatch->unset_require_lock(io::DIRECTION_BOTH); + save_result(r); + finish(); + return; + } + + send_flush_io(); +} + +template +void PreReleaseRequest::send_flush_io() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + // ensure that all in-flight IO is flushed -- skipping the refresh layer + // since it should have been flushed when the lock was required and now + // refreshes are disabled / interlocked w/ this state machine. + auto ctx = create_context_callback< + PreReleaseRequest, &PreReleaseRequest::handle_flush_io>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, util::get_image_ctx(&m_image_ctx), librbd::io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + m_image_ctx, io::IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK, aio_comp, + io::FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH, {}); + req->send(); +} + +template +void PreReleaseRequest::handle_flush_io(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl; + } + + send_flush_notifies(); +} + +template +void PreReleaseRequest::send_flush_notifies() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreReleaseRequest; + Context *ctx = + create_context_callback(this); + m_image_ctx.image_watcher->flush(ctx); +} + +template +void PreReleaseRequest::handle_flush_notifies(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + ceph_assert(r == 0); + send_close_journal(); +} + +template +void PreReleaseRequest::send_close_journal() { + { + std::unique_lock image_locker{m_image_ctx.image_lock}; + std::swap(m_journal, m_image_ctx.journal); + } + + if (m_journal == nullptr) { + send_close_object_map(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreReleaseRequest; + Context *ctx = create_context_callback( + this); + m_journal->close(ctx); +} + +template +void PreReleaseRequest::handle_close_journal(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + // error implies some journal events were not flushed -- continue + lderr(cct) << "failed to close journal: " << cpp_strerror(r) << dendl; + } + + m_journal->put(); + m_journal = nullptr; + + send_close_object_map(); +} + +template +void PreReleaseRequest::send_close_object_map() { + { + std::unique_lock image_locker{m_image_ctx.image_lock}; + std::swap(m_object_map, m_image_ctx.object_map); + } + + if (m_object_map == nullptr) { + send_unlock(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + using klass = PreReleaseRequest; + Context *ctx = create_context_callback< 
+ klass, &klass::handle_close_object_map>(this, m_object_map); + m_object_map->close(ctx); +} + +template +void PreReleaseRequest::handle_close_object_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to close object map: " << cpp_strerror(r) << dendl; + } + m_object_map->put(); + + send_unlock(); +} + +template +void PreReleaseRequest::send_unlock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << dendl; + + finish(); +} + +template +void PreReleaseRequest::finish() { + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace exclusive_lock +} // namespace librbd + +template class librbd::exclusive_lock::PreReleaseRequest; diff --git a/src/librbd/exclusive_lock/PreReleaseRequest.h b/src/librbd/exclusive_lock/PreReleaseRequest.h new file mode 100644 index 000000000..426337943 --- /dev/null +++ b/src/librbd/exclusive_lock/PreReleaseRequest.h @@ -0,0 +1,139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H + +#include "librbd/ImageCtx.h" +#include + +class AsyncOpTracker; +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace exclusive_lock { + +template struct ImageDispatch; + +template +class PreReleaseRequest { +public: + static PreReleaseRequest* create(ImageCtxT &image_ctx, + ImageDispatch* image_dispatch, + bool shutting_down, + AsyncOpTracker &async_op_tracker, + Context *on_finish); + + ~PreReleaseRequest(); + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * CANCEL_OP_REQUESTS + * | + * v + * SET_REQUIRE_LOCK + * | + * v + * WAIT_FOR_OPS + * | + * v + * PREPARE_LOCK + * | + * v + * PROCESS_PLUGIN_RELEASE + * | + * v + * SHUT_DOWN_IMAGE_CACHE + * | + * v + * INVALIDATE_CACHE + * | + * v + * FLUSH_IO + * | + * v + * FLUSH_NOTIFIES . . . . . . . . . . . . . . + * | . + * v . + * CLOSE_JOURNAL . + * | (journal disabled, . + * v object map enabled) . + * CLOSE_OBJECT_MAP < . . . . . . . . . . . . + * | . + * v (object map disabled) . + * < . . . . . . . . . . . . . . . . . 
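+   *  (error paths: a failure in PROCESS_PLUGIN_RELEASE or INVALIDATE_CACHE
+   *   unblocks IO via unset_require_lock() and jumps straight to <finish>
+   *   with the first error recorded by save_result(); later close errors
+   *   are logged but do not abort the release)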
+ * + * @endverbatim + */ + + PreReleaseRequest(ImageCtxT &image_ctx, + ImageDispatch* image_dispatch, + bool shutting_down, AsyncOpTracker &async_op_tracker, + Context *on_finish); + + ImageCtxT &m_image_ctx; + ImageDispatch* m_image_dispatch; + bool m_shutting_down; + AsyncOpTracker &m_async_op_tracker; + Context *m_on_finish; + + int m_error_result = 0; + + decltype(m_image_ctx.object_map) m_object_map = nullptr; + decltype(m_image_ctx.journal) m_journal = nullptr; + + void send_cancel_op_requests(); + void handle_cancel_op_requests(int r); + + void send_set_require_lock(); + void handle_set_require_lock(int r); + + void send_wait_for_ops(); + void handle_wait_for_ops(int r); + + void send_prepare_lock(); + void handle_prepare_lock(int r); + + void send_process_plugin_release_lock(); + void handle_process_plugin_release_lock(int r); + + void send_invalidate_cache(); + void handle_invalidate_cache(int r); + + void send_flush_io(); + void handle_flush_io(int r); + + void send_flush_notifies(); + void handle_flush_notifies(int r); + + void send_close_journal(); + void handle_close_journal(int r); + + void send_close_object_map(); + void handle_close_object_map(int r); + + void send_unlock(); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } + +}; + +} // namespace exclusive_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_PRE_RELEASE_REQUEST_H diff --git a/src/librbd/exclusive_lock/StandardPolicy.cc b/src/librbd/exclusive_lock/StandardPolicy.cc new file mode 100644 index 000000000..519e9618e --- /dev/null +++ b/src/librbd/exclusive_lock/StandardPolicy.cc @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/exclusive_lock/StandardPolicy.h" +#include "librbd/ImageCtx.h" +#include "librbd/ExclusiveLock.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ExclusiveLock::StandardPolicy " + +namespace librbd { +namespace exclusive_lock { + +template +int StandardPolicy::lock_requested(bool force) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock)); + ceph_assert(m_image_ctx->exclusive_lock != nullptr); + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": force=" << force + << dendl; + + return -EROFS; +} + +} // namespace exclusive_lock +} // namespace librbd + +template class librbd::exclusive_lock::StandardPolicy; diff --git a/src/librbd/exclusive_lock/StandardPolicy.h b/src/librbd/exclusive_lock/StandardPolicy.h new file mode 100644 index 000000000..dd4e19050 --- /dev/null +++ b/src/librbd/exclusive_lock/StandardPolicy.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H +#define CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H + +#include "librbd/exclusive_lock/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace exclusive_lock { + +template +class StandardPolicy : public Policy { +public: + StandardPolicy(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) { + } + + bool may_auto_request_lock() override { + return false; + } + + int lock_requested(bool force) override; + +private: + ImageCtxT* m_image_ctx; + +}; + +} // namespace exclusive_lock +} // namespace librbd + +extern template class librbd::exclusive_lock::StandardPolicy; + +#endif // CEPH_LIBRBD_EXCLUSIVE_LOCK_STANDARD_POLICY_H diff --git 
a/src/librbd/image/AttachChildRequest.cc b/src/librbd/image/AttachChildRequest.cc new file mode 100644 index 000000000..2f74191ed --- /dev/null +++ b/src/librbd/image/AttachChildRequest.cc @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/AttachChildRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/image/RefreshRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::AttachChildRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template +AttachChildRequest::AttachChildRequest( + I *image_ctx, I *parent_image_ctx, const librados::snap_t &parent_snap_id, + I *old_parent_image_ctx, const librados::snap_t &old_parent_snap_id, + uint32_t clone_format, Context* on_finish) + : m_image_ctx(image_ctx), m_parent_image_ctx(parent_image_ctx), + m_parent_snap_id(parent_snap_id), + m_old_parent_image_ctx(old_parent_image_ctx), + m_old_parent_snap_id(old_parent_snap_id), m_clone_format(clone_format), + m_on_finish(on_finish), m_cct(m_image_ctx->cct) { +} + +template +void AttachChildRequest::send() { + if (m_clone_format == 1) { + v1_add_child(); + } else { + v2_set_op_feature(); + } +} + +template +void AttachChildRequest::v1_add_child() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::add_child(&op, {m_parent_image_ctx->md_ctx.get_id(), "", + m_parent_image_ctx->id, + m_parent_snap_id}, m_image_ctx->id); + + using klass = AttachChildRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void AttachChildRequest::handle_v1_add_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + if (r == -EEXIST && m_old_parent_image_ctx != nullptr) { + ldout(m_cct, 5) << "child already exists" << dendl; + } else { + lderr(m_cct) << "couldn't add child: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + } + + v1_refresh(); +} + +template +void AttachChildRequest::v1_refresh() { + ldout(m_cct, 15) << dendl; + + using klass = AttachChildRequest; + RefreshRequest *req = RefreshRequest::create( + *m_parent_image_ctx, false, false, + create_context_callback(this)); + req->send(); +} + +template +void AttachChildRequest::handle_v1_refresh(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + bool snap_protected = false; + if (r == 0) { + std::shared_lock image_locker{m_parent_image_ctx->image_lock}; + r = m_parent_image_ctx->is_snap_protected(m_parent_snap_id, + &snap_protected); + } + + if (r < 0 || !snap_protected) { + lderr(m_cct) << "validate protected failed" << dendl; + finish(-EINVAL); + return; + } + + v1_remove_child_from_old_parent(); +} + +template +void AttachChildRequest::v1_remove_child_from_old_parent() { + if (m_old_parent_image_ctx == nullptr) { + finish(0); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::remove_child(&op, {m_old_parent_image_ctx->md_ctx.get_id(), + m_old_parent_image_ctx->md_ctx.get_namespace(), + m_old_parent_image_ctx->id, + m_old_parent_snap_id}, m_image_ctx->id); + + using klass = AttachChildRequest; + librados::AioCompletion *comp = 
create_rados_callback< + klass, &klass::handle_v1_remove_child_from_old_parent>(this); + int r = m_image_ctx->md_ctx.aio_operate(RBD_CHILDREN, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void AttachChildRequest::handle_v1_remove_child_from_old_parent(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "couldn't remove child: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void AttachChildRequest::v2_set_op_feature() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::op_features_set(&op, RBD_OPERATION_FEATURE_CLONE_CHILD, + RBD_OPERATION_FEATURE_CLONE_CHILD); + + using klass = AttachChildRequest; + auto aio_comp = create_rados_callback< + klass, &klass::handle_v2_set_op_feature>(this); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void AttachChildRequest::handle_v2_set_op_feature(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to enable clone v2: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + v2_child_attach(); +} + +template +void AttachChildRequest::v2_child_attach() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::child_attach(&op, m_parent_snap_id, + {m_image_ctx->md_ctx.get_id(), + m_image_ctx->md_ctx.get_namespace(), + m_image_ctx->id}); + + using klass = AttachChildRequest; + auto aio_comp = create_rados_callback< + klass, &klass::handle_v2_child_attach>(this); + int r = m_parent_image_ctx->md_ctx.aio_operate(m_parent_image_ctx->header_oid, + aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void AttachChildRequest::handle_v2_child_attach(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + if (r == -EEXIST && m_old_parent_image_ctx != nullptr) { + ldout(m_cct, 5) << "child already exists" << dendl; + } else { + lderr(m_cct) << "failed to attach child image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + } + + v2_child_detach_from_old_parent(); +} + +template +void AttachChildRequest::v2_child_detach_from_old_parent() { + if (m_old_parent_image_ctx == nullptr) { + finish(0); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::child_detach(&op, m_old_parent_snap_id, + {m_image_ctx->md_ctx.get_id(), + m_image_ctx->md_ctx.get_namespace(), + m_image_ctx->id}); + + using klass = AttachChildRequest; + auto aio_comp = create_rados_callback< + klass, &klass::handle_v2_child_detach_from_old_parent>(this); + int r = m_old_parent_image_ctx->md_ctx.aio_operate( + m_old_parent_image_ctx->header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void AttachChildRequest::handle_v2_child_detach_from_old_parent(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to detach child image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void AttachChildRequest::finish(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::AttachChildRequest; diff --git a/src/librbd/image/AttachChildRequest.h b/src/librbd/image/AttachChildRequest.h new file mode 100644 index 000000000..a40afaf54 --- /dev/null +++ 
b/src/librbd/image/AttachChildRequest.h @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H +#define CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H + +#include "include/common_fwd.h" +#include "include/int_types.h" +#include "include/rados/librados.hpp" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class AttachChildRequest { +public: + static AttachChildRequest* create(ImageCtxT *image_ctx, + ImageCtxT *parent_image_ctx, + const librados::snap_t &parent_snap_id, + ImageCtxT *old_parent_image_ctx, + const librados::snap_t &old_parent_snap_id, + uint32_t clone_format, + Context* on_finish) { + return new AttachChildRequest(image_ctx, parent_image_ctx, parent_snap_id, + old_parent_image_ctx, old_parent_snap_id, + clone_format, on_finish); + } + + AttachChildRequest(ImageCtxT *image_ctx, + ImageCtxT *parent_image_ctx, + const librados::snap_t &parent_snap_id, + ImageCtxT *old_parent_image_ctx, + const librados::snap_t &old_parent_snap_id, + uint32_t clone_format, + Context* on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * (clone v1) | (clone v2) + * /----------------/ \---------------\ + * | | + * v v + * V1 ADD CHILD V2 SET CLONE + * | | + * v v + * V1 VALIDATE PROTECTED V2 ATTACH CHILD + * | | + * | v + * V1 REMOVE CHILD FROM OLD PARENT V2 DETACH CHILD FROM OLD PARENT + * | | + * \----------------\ /---------------/ + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + ImageCtxT *m_parent_image_ctx; + librados::snap_t m_parent_snap_id; + ImageCtxT *m_old_parent_image_ctx; + librados::snap_t m_old_parent_snap_id; + uint32_t m_clone_format; + Context* m_on_finish; + + CephContext *m_cct; + + void v1_add_child(); + void handle_v1_add_child(int r); + + void v1_refresh(); + void handle_v1_refresh(int r); + + void v1_remove_child_from_old_parent(); + void handle_v1_remove_child_from_old_parent(int r); + + void v2_set_op_feature(); + void handle_v2_set_op_feature(int r); + + void v2_child_attach(); + void handle_v2_child_attach(int r); + + void v2_child_detach_from_old_parent(); + void handle_v2_child_detach_from_old_parent(int r); + + void finish(int r); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::AttachChildRequest; + +#endif // CEPH_LIBRBD_IMAGE_ATTACH_CHILD_REQUEST_H diff --git a/src/librbd/image/AttachParentRequest.cc b/src/librbd/image/AttachParentRequest.cc new file mode 100644 index 000000000..d0c35b6a9 --- /dev/null +++ b/src/librbd/image/AttachParentRequest.cc @@ -0,0 +1,90 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/AttachParentRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::AttachParentRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +using util::create_rados_callback; + +template +void AttachParentRequest::send() { + attach_parent(); +} + +template +void AttachParentRequest::attach_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "parent_image_spec=" << m_parent_image_spec << dendl; + + librados::ObjectWriteOperation op; + if (!m_legacy_parent) { + librbd::cls_client::parent_attach(&op, 
m_parent_image_spec, + m_parent_overlap, m_reattach); + } else { + librbd::cls_client::set_parent(&op, m_parent_image_spec, m_parent_overlap); + } + + auto aio_comp = create_rados_callback< + AttachParentRequest, + &AttachParentRequest::handle_attach_parent>(this); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void AttachParentRequest::handle_attach_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + if (!m_legacy_parent && r == -EOPNOTSUPP && !m_reattach) { + if (m_parent_image_spec.pool_namespace == + m_image_ctx.md_ctx.get_namespace()) { + m_parent_image_spec.pool_namespace = ""; + } + if (m_parent_image_spec.pool_namespace.empty()) { + ldout(cct, 10) << "retrying using legacy parent method" << dendl; + m_legacy_parent = true; + attach_parent(); + return; + } + + // namespaces require newer OSDs + r = -EXDEV; + } + + if (r < 0) { + lderr(cct) << "attach parent encountered an error: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void AttachParentRequest::finish(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::AttachParentRequest; diff --git a/src/librbd/image/AttachParentRequest.h b/src/librbd/image/AttachParentRequest.h new file mode 100644 index 000000000..482e03273 --- /dev/null +++ b/src/librbd/image/AttachParentRequest.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H +#define CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class AttachParentRequest { +public: + static AttachParentRequest* create(ImageCtxT& image_ctx, + const cls::rbd::ParentImageSpec& pspec, + uint64_t parent_overlap, + bool reattach, + Context* on_finish) { + return new AttachParentRequest(image_ctx, pspec, parent_overlap, reattach, + on_finish); + } + + AttachParentRequest(ImageCtxT& image_ctx, + const cls::rbd::ParentImageSpec& pspec, + uint64_t parent_overlap, bool reattach, + Context* on_finish) + : m_image_ctx(image_ctx), m_parent_image_spec(pspec), + m_parent_overlap(parent_overlap), m_reattach(reattach), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | * * * * * * + * | * * -EOPNOTSUPP + * v v * + * ATTACH_PARENT * * * + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + cls::rbd::ParentImageSpec m_parent_image_spec; + uint64_t m_parent_overlap; + bool m_reattach; + Context* m_on_finish; + + bool m_legacy_parent = false; + + void attach_parent(); + void handle_attach_parent(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::AttachParentRequest; + +#endif // CEPH_LIBRBD_IMAGE_ATTACH_PARENT_REQUEST_H diff --git a/src/librbd/image/CloneRequest.cc b/src/librbd/image/CloneRequest.cc new file mode 100644 index 000000000..7a955f064 --- /dev/null +++ b/src/librbd/image/CloneRequest.cc @@ -0,0 +1,607 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/deep_copy/MetadataCopyRequest.h" +#include "librbd/image/AttachChildRequest.h" +#include "librbd/image/AttachParentRequest.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/image/RemoveRequest.h" +#include "librbd/image/Types.h" +#include "librbd/mirror/EnableRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CloneRequest: " << this << " " \ + << __func__ << ": " + +#define MAX_KEYS 64 + +namespace librbd { +namespace image { + +using util::create_rados_callback; +using util::create_context_callback; +using util::create_async_context_callback; + +template +CloneRequest::CloneRequest( + ConfigProxy& config, + IoCtx& parent_io_ctx, + const std::string& parent_image_id, + const std::string& parent_snap_name, + const cls::rbd::SnapshotNamespace& parent_snap_namespace, + uint64_t parent_snap_id, + IoCtx &c_ioctx, + const std::string &c_name, + const std::string &c_id, + ImageOptions c_options, + cls::rbd::MirrorImageMode mirror_image_mode, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + asio::ContextWQ *op_work_queue, Context *on_finish) + : m_config(config), m_parent_io_ctx(parent_io_ctx), + m_parent_image_id(parent_image_id), m_parent_snap_name(parent_snap_name), + m_parent_snap_namespace(parent_snap_namespace), + m_parent_snap_id(parent_snap_id), m_ioctx(c_ioctx), m_name(c_name), + m_id(c_id), m_opts(c_options), m_mirror_image_mode(mirror_image_mode), + m_non_primary_global_image_id(non_primary_global_image_id), + m_primary_mirror_uuid(primary_mirror_uuid), + m_op_work_queue(op_work_queue), m_on_finish(on_finish), + m_use_p_features(true) { + + m_cct = reinterpret_cast(m_ioctx.cct()); + + bool default_format_set; + m_opts.is_set(RBD_IMAGE_OPTION_FORMAT, &default_format_set); + if (!default_format_set) { + m_opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast(2)); + } + + ldout(m_cct, 20) << "parent_pool_id=" << parent_io_ctx.get_id() << ", " + << "parent_image_id=" << parent_image_id << ", " + << "parent_snap=" << parent_snap_name << "/" + << parent_snap_id << " clone to " + << "pool_id=" << m_ioctx.get_id() << ", " + << "name=" << m_name << ", " + << "opts=" << m_opts << dendl; +} + +template +void CloneRequest::send() { + ldout(m_cct, 20) << dendl; + validate_options(); +} + +template +void CloneRequest::validate_options() { + ldout(m_cct, 20) << dendl; + + uint64_t format = 0; + m_opts.get(RBD_IMAGE_OPTION_FORMAT, &format); + if (format < 2) { + lderr(m_cct) << "format 2 or later required for clone" << dendl; + complete(-EINVAL); + return; + } + + if (m_opts.get(RBD_IMAGE_OPTION_FEATURES, &m_features) == 0) { + if (m_features & ~RBD_FEATURES_ALL) { + lderr(m_cct) << "librbd does not support requested features" << dendl; + complete(-ENOSYS); + return; + } + m_use_p_features = false; + } + + if (m_opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &m_clone_format) < 0) { + std::string default_clone_format = m_config.get_val( + "rbd_default_clone_format"); + if (default_clone_format == "1") { + m_clone_format = 1; + } else if (default_clone_format == "auto") { + librados::Rados rados(m_ioctx); + int8_t min_compat_client; + int8_t require_min_compat_client; + int r = 
rados.get_min_compatible_client(&min_compat_client, + &require_min_compat_client); + if (r < 0) { + complete(r); + return; + } + if (std::max(min_compat_client, require_min_compat_client) < + CEPH_RELEASE_MIMIC) { + m_clone_format = 1; + } + } + } + + if (m_clone_format == 1 && + m_parent_io_ctx.get_namespace() != m_ioctx.get_namespace()) { + ldout(m_cct, 1) << "clone v2 required for cross-namespace clones" << dendl; + complete(-EXDEV); + return; + } + + open_parent(); +} + +template +void CloneRequest::open_parent() { + ldout(m_cct, 20) << dendl; + ceph_assert(m_parent_snap_name.empty() ^ (m_parent_snap_id == CEPH_NOSNAP)); + + if (m_parent_snap_id != CEPH_NOSNAP) { + m_parent_image_ctx = I::create("", m_parent_image_id, m_parent_snap_id, + m_parent_io_ctx, true); + } else { + m_parent_image_ctx = I::create("", m_parent_image_id, + m_parent_snap_name.c_str(), + m_parent_io_ctx, + true); + m_parent_image_ctx->snap_namespace = m_parent_snap_namespace; + } + + Context *ctx = create_context_callback< + CloneRequest, &CloneRequest::handle_open_parent>(this); + m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template +void CloneRequest::handle_open_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_parent_image_ctx = nullptr; + + lderr(m_cct) << "failed to open parent image: " << cpp_strerror(r) << dendl; + complete(r); + return; + } + + m_parent_snap_id = m_parent_image_ctx->snap_id; + m_pspec = {m_parent_io_ctx.get_id(), m_parent_io_ctx.get_namespace(), + m_parent_image_id, m_parent_snap_id}; + validate_parent(); +} + +template +void CloneRequest::validate_parent() { + ldout(m_cct, 20) << dendl; + + if (m_parent_image_ctx->operations_disabled) { + lderr(m_cct) << "image operations disabled due to unsupported op features" + << dendl; + m_r_saved = -EROFS; + close_parent(); + return; + } + + if (m_parent_image_ctx->snap_id == CEPH_NOSNAP) { + lderr(m_cct) << "image to be cloned must be a snapshot" << dendl; + m_r_saved = -EINVAL; + close_parent(); + return; + } + + if (m_parent_image_ctx->old_format) { + lderr(m_cct) << "parent image must be in new format" << dendl; + m_r_saved = -EINVAL; + close_parent(); + return; + } + + m_parent_image_ctx->image_lock.lock_shared(); + uint64_t p_features = m_parent_image_ctx->features; + m_size = m_parent_image_ctx->get_image_size(m_parent_image_ctx->snap_id); + + bool snap_protected; + int r = m_parent_image_ctx->is_snap_protected(m_parent_image_ctx->snap_id, &snap_protected); + m_parent_image_ctx->image_lock.unlock_shared(); + + if ((p_features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) { + lderr(m_cct) << "parent image must support layering" << dendl; + m_r_saved = -ENOSYS; + close_parent(); + return; + } + if (m_use_p_features) { + m_features = p_features; + } + + if (r < 0) { + lderr(m_cct) << "unable to locate parent's snapshot" << dendl; + m_r_saved = r; + close_parent(); + return; + } + + if (m_clone_format == 1 && !snap_protected) { + lderr(m_cct) << "parent snapshot must be protected" << dendl; + m_r_saved = -EINVAL; + close_parent(); + return; + } + + validate_child(); +} + +template +void CloneRequest::validate_child() { + ldout(m_cct, 15) << dendl; + + if ((m_features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) { + lderr(m_cct) << "cloning image must support layering" << dendl; + m_r_saved = -ENOSYS; + close_parent(); + return; + } + + using klass = CloneRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_validate_child>(this); + + 
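+  // the stat() below probes the legacy format 1 header object
+  // (util::old_header_name(m_name)); anything other than -ENOENT in
+  // handle_validate_child() means an image already owns this name, so
+  // the clone is rejected before the child is created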
librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + + int r = m_ioctx.aio_operate(util::old_header_name(m_name), comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void CloneRequest::handle_validate_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r != -ENOENT) { + lderr(m_cct) << "rbd image " << m_name << " already exists" << dendl; + m_r_saved = r; + close_parent(); + return; + } + + create_child(); +} + +template +void CloneRequest::create_child() { + ldout(m_cct, 15) << dendl; + + uint64_t order = m_parent_image_ctx->order; + if (m_opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + m_opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + m_opts.set(RBD_IMAGE_OPTION_FEATURES, m_features); + + uint64_t stripe_unit = m_parent_image_ctx->stripe_unit; + if (m_opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + m_opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + + uint64_t stripe_count = m_parent_image_ctx->stripe_count; + if (m_opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + m_opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + + using klass = CloneRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_create_child>(this); + + auto req = CreateRequest::create( + m_config, m_ioctx, m_name, m_id, m_size, m_opts, + image::CREATE_FLAG_SKIP_MIRROR_ENABLE, + cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, m_non_primary_global_image_id, + m_primary_mirror_uuid, m_op_work_queue, ctx); + req->send(); +} + +template +void CloneRequest::handle_create_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EBADF) { + ldout(m_cct, 5) << "image id already in-use" << dendl; + complete(r); + return; + } else if (r < 0) { + lderr(m_cct) << "error creating child: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_parent(); + return; + } + open_child(); +} + +template +void CloneRequest::open_child() { + ldout(m_cct, 15) << dendl; + + m_imctx = I::create(m_name, "", nullptr, m_ioctx, false); + + using klass = CloneRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_open_child>(this); + + uint64_t flags = OPEN_FLAG_SKIP_OPEN_PARENT; + if ((m_features & RBD_FEATURE_MIGRATING) != 0) { + flags |= OPEN_FLAG_IGNORE_MIGRATING; + } + + m_imctx->state->open(flags, ctx); +} + +template +void CloneRequest::handle_open_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + m_imctx = nullptr; + + lderr(m_cct) << "Error opening new image: " << cpp_strerror(r) << dendl; + m_r_saved = r; + remove_child(); + return; + } + + attach_parent(); +} + +template +void CloneRequest::attach_parent() { + ldout(m_cct, 15) << dendl; + + auto ctx = create_context_callback< + CloneRequest, &CloneRequest::handle_attach_parent>(this); + auto req = AttachParentRequest::create( + *m_imctx, m_pspec, m_size, false, ctx); + req->send(); +} + +template +void CloneRequest::handle_attach_parent(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + return; + } + + attach_child(); +} + +template +void CloneRequest::attach_child() { + ldout(m_cct, 15) << dendl; + + auto ctx = create_context_callback< + CloneRequest, &CloneRequest::handle_attach_child>(this); + auto req = AttachChildRequest::create( + m_imctx, m_parent_image_ctx, m_parent_image_ctx->snap_id, nullptr, 0, + m_clone_format, ctx); + req->send(); +} + +template +void 
CloneRequest::handle_attach_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to attach parent: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + return; + } + + copy_metadata(); +} + +template +void CloneRequest::copy_metadata() { + ldout(m_cct, 15) << dendl; + + auto ctx = create_context_callback< + CloneRequest, &CloneRequest::handle_copy_metadata>(this); + auto req = deep_copy::MetadataCopyRequest::create( + m_parent_image_ctx, m_imctx, ctx); + req->send(); +} + +template +void CloneRequest::handle_copy_metadata(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl; + m_r_saved = r; + close_child(); + return; + } + + get_mirror_mode(); +} + +template +void CloneRequest::get_mirror_mode() { + ldout(m_cct, 15) << dendl; + + uint64_t mirror_image_mode; + if (!m_non_primary_global_image_id.empty()) { + enable_mirror(); + return; + } else if (m_opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, + &mirror_image_mode) == 0) { + m_mirror_image_mode = static_cast( + mirror_image_mode); + enable_mirror(); + return; + } else if (!m_imctx->test_features(RBD_FEATURE_JOURNALING)) { + close_child(); + return; + } + + librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = CloneRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + m_imctx->md_ctx.aio_operate(RBD_MIRRORING, + comp, &op, &m_out_bl); + comp->release(); +} + +template +void CloneRequest::handle_get_mirror_mode(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode); + } + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + } else if (m_mirror_mode == cls::rbd::MIRROR_MODE_POOL) { + m_mirror_image_mode = cls::rbd::MIRROR_IMAGE_MODE_JOURNAL; + enable_mirror(); + return; + } + + close_child(); +} + +template +void CloneRequest::enable_mirror() { + ldout(m_cct, 15) << dendl; + + using klass = CloneRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_enable_mirror>(this); + auto req = mirror::EnableRequest::create( + m_imctx, m_mirror_image_mode, m_non_primary_global_image_id, true, ctx); + req->send(); +} + +template +void CloneRequest::handle_enable_mirror(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + m_r_saved = r; + } + close_child(); +} + +template +void CloneRequest::close_child() { + ldout(m_cct, 15) << dendl; + + ceph_assert(m_imctx != nullptr); + + auto ctx = create_context_callback< + CloneRequest, &CloneRequest::handle_close_child>(this); + m_imctx->state->close(ctx); +} + +template +void CloneRequest::handle_close_child(int r) { + ldout(m_cct, 15) << dendl; + + m_imctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "couldn't close image: " << cpp_strerror(r) << dendl; + if (m_r_saved == 0) { + m_r_saved = r; + } + } + + if (m_r_saved < 0) { + remove_child(); + return; + } + + close_parent(); +} + +template +void CloneRequest::remove_child() { + ldout(m_cct, 15) << dendl; + + using klass = CloneRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_remove_child>(this); + + auto req = librbd::image::RemoveRequest::create( + m_ioctx, m_name, m_id, false, false, m_no_op, 
m_op_work_queue, ctx); + req->send(); +} + +template +void CloneRequest::handle_remove_child(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "Error removing failed clone: " + << cpp_strerror(r) << dendl; + } + + close_parent(); +} + +template +void CloneRequest::close_parent() { + ldout(m_cct, 20) << dendl; + ceph_assert(m_parent_image_ctx != nullptr); + + auto ctx = create_context_callback< + CloneRequest, &CloneRequest::handle_close_parent>(this); + m_parent_image_ctx->state->close(ctx); +} + +template +void CloneRequest::handle_close_parent(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_parent_image_ctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "failed to close parent image: " + << cpp_strerror(r) << dendl; + if (m_r_saved == 0) { + m_r_saved = r; + } + } + + complete(m_r_saved); +} + +template +void CloneRequest::complete(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} //namespace image +} //namespace librbd + +template class librbd::image::CloneRequest; diff --git a/src/librbd/image/CloneRequest.h b/src/librbd/image/CloneRequest.h new file mode 100644 index 000000000..35d9cab17 --- /dev/null +++ b/src/librbd/image/CloneRequest.h @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "common/config_fwd.h" +#include "librbd/internal.h" +#include "include/rbd/librbd.hpp" + +class Context; + +using librados::IoCtx; + +namespace librbd { + +namespace asio { struct ContextWQ; } + +namespace image { + +template +class CloneRequest { +public: + static CloneRequest *create( + ConfigProxy& config, IoCtx& parent_io_ctx, + const std::string& parent_image_id, + const std::string& parent_snap_name, + const cls::rbd::SnapshotNamespace& parent_snap_namespace, + uint64_t parent_snap_id, + IoCtx &c_ioctx, const std::string &c_name, + const std::string &c_id, ImageOptions c_options, + cls::rbd::MirrorImageMode mirror_image_mode, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + asio::ContextWQ *op_work_queue, Context *on_finish) { + return new CloneRequest(config, parent_io_ctx, parent_image_id, + parent_snap_name, parent_snap_namespace, + parent_snap_id, c_ioctx, c_name, c_id, c_options, + mirror_image_mode, non_primary_global_image_id, + primary_mirror_uuid, op_work_queue, on_finish); + } + + CloneRequest(ConfigProxy& config, IoCtx& parent_io_ctx, + const std::string& parent_image_id, + const std::string& parent_snap_name, + const cls::rbd::SnapshotNamespace& parent_snap_namespace, + uint64_t parent_snap_id, + IoCtx &c_ioctx, const std::string &c_name, + const std::string &c_id, ImageOptions c_options, + cls::rbd::MirrorImageMode mirror_image_mode, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + asio::ContextWQ *op_work_queue, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * OPEN PARENT + * | + * v + * VALIDATE CHILD + * | ^ + * v | + * CREATE CHILD * * * * * * * * * > CLOSE PARENT + * | ^ + * v | + * OPEN CHILD * * * * * * * * * * > REMOVE CHILD + * | ^ + * v | + * ATTACH PARENT * * * * * * * * > CLOSE CHILD + * | ^ + * v * + * ATTACH CHILD * * * * * * * * * * * * + * | * + * v * + * COPY META DATA * * * * * * * * * * ^ + * | * + * v (skip if not needed) * + * GET 
MIRROR MODE * * * * * * * * * ^ + * | * + * v (skip if not needed) * + * SET MIRROR ENABLED * * * * * * * * * + * | + * v + * CLOSE CHILD + * | + * v + * CLOSE PARENT + * | + * v + * + * + * @endverbatim + */ + + ConfigProxy& m_config; + IoCtx &m_parent_io_ctx; + std::string m_parent_image_id; + std::string m_parent_snap_name; + cls::rbd::SnapshotNamespace m_parent_snap_namespace; + uint64_t m_parent_snap_id; + ImageCtxT *m_parent_image_ctx; + + IoCtx &m_ioctx; + std::string m_name; + std::string m_id; + ImageOptions m_opts; + cls::rbd::ParentImageSpec m_pspec; + ImageCtxT *m_imctx; + cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + cls::rbd::MirrorImageMode m_mirror_image_mode; + const std::string m_non_primary_global_image_id; + const std::string m_primary_mirror_uuid; + NoOpProgressContext m_no_op; + asio::ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + uint64_t m_clone_format = 2; + bool m_use_p_features; + uint64_t m_features; + bufferlist m_out_bl; + uint64_t m_size; + int m_r_saved = 0; + + void validate_options(); + + void open_parent(); + void handle_open_parent(int r); + + void validate_parent(); + + void validate_child(); + void handle_validate_child(int r); + + void create_child(); + void handle_create_child(int r); + + void open_child(); + void handle_open_child(int r); + + void attach_parent(); + void handle_attach_parent(int r); + + void attach_child(); + void handle_attach_child(int r); + + void copy_metadata(); + void handle_copy_metadata(int r); + + void get_mirror_mode(); + void handle_get_mirror_mode(int r); + + void enable_mirror(); + void handle_enable_mirror(int r); + + void close_child(); + void handle_close_child(int r); + + void remove_child(); + void handle_remove_child(int r); + + void close_parent(); + void handle_close_parent(int r); + + void complete(int r); +}; + +} //namespace image +} //namespace librbd + +extern template class librbd::image::CloneRequest; + +#endif // CEPH_LIBRBD_IMAGE_CLONE_REQUEST_H diff --git a/src/librbd/image/CloseRequest.cc b/src/librbd/image/CloseRequest.cc new file mode 100644 index 000000000..7293687f5 --- /dev/null +++ b/src/librbd/image/CloseRequest.cc @@ -0,0 +1,350 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/CloseRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ConfigWatcher.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatcher.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CloseRequest: " + +namespace librbd { +namespace image { + +using util::create_async_context_callback; +using util::create_context_callback; + +template +CloseRequest::CloseRequest(I *image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), m_error_result(0), + m_exclusive_lock(nullptr) { + ceph_assert(image_ctx != nullptr); +} + +template +void CloseRequest::send() { + if (m_image_ctx->config_watcher != nullptr) { + m_image_ctx->config_watcher->shut_down(); + + delete m_image_ctx->config_watcher; + m_image_ctx->config_watcher = nullptr; + } + + 
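+  // teardown is deliberately sequenced: remote notifications are blocked
+  // first so no peer request races with the update-watcher shutdown, lock
+  // release, dispatcher shutdown and parent close that follow (see the
+  // state diagram in CloseRequest.h)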
send_block_image_watcher(); +} + +template +void CloseRequest::send_block_image_watcher() { + if (m_image_ctx->image_watcher == nullptr) { + send_shut_down_update_watchers(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // prevent incoming requests from our peers + m_image_ctx->image_watcher->block_notifies(create_context_callback< + CloseRequest, &CloseRequest::handle_block_image_watcher>(this)); +} + +template +void CloseRequest::handle_block_image_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + send_shut_down_update_watchers(); +} + +template +void CloseRequest::send_shut_down_update_watchers() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->state->shut_down_update_watchers(create_async_context_callback( + *m_image_ctx, create_context_callback< + CloseRequest, &CloseRequest::handle_shut_down_update_watchers>(this))); +} + +template +void CloseRequest::handle_shut_down_update_watchers(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to shut down update watchers: " << cpp_strerror(r) + << dendl; + } + + send_flush(); +} + +template +void CloseRequest::send_flush() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + auto ctx = create_context_callback< + CloseRequest, &CloseRequest::handle_flush>(this); + auto aio_comp = io::AioCompletion::create_and_start(ctx, m_image_ctx, + io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_API_START, aio_comp, + io::FLUSH_SOURCE_SHUTDOWN, {}); + req->send(); +} + +template +void CloseRequest::handle_flush(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to flush IO: " << cpp_strerror(r) << dendl; + } + + send_shut_down_exclusive_lock(); +} + +template +void CloseRequest::send_shut_down_exclusive_lock() { + { + std::unique_lock owner_locker{m_image_ctx->owner_lock}; + m_exclusive_lock = m_image_ctx->exclusive_lock; + + // if reading a snapshot -- possible object map is open + std::unique_lock image_locker{m_image_ctx->image_lock}; + if (m_exclusive_lock == nullptr && m_image_ctx->object_map) { + m_image_ctx->object_map->put(); + m_image_ctx->object_map = nullptr; + } + } + + if (m_exclusive_lock == nullptr) { + send_unregister_image_watcher(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // in-flight IO will be flushed and in-flight requests will be canceled + // before releasing lock + m_exclusive_lock->shut_down(create_context_callback< + CloseRequest, &CloseRequest::handle_shut_down_exclusive_lock>(this)); +} + +template +void CloseRequest::handle_shut_down_exclusive_lock(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + { + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + ceph_assert(m_image_ctx->exclusive_lock == nullptr); + + // object map and journal closed during exclusive lock shutdown + std::shared_lock image_locker{m_image_ctx->image_lock}; + ceph_assert(m_image_ctx->journal == nullptr); + 
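+    // shut_down() cleared the ImageCtx pointers; this request still holds
+    // its own m_exclusive_lock reference, dropped via put() just below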
ceph_assert(m_image_ctx->object_map == nullptr); + } + + m_exclusive_lock->put(); + m_exclusive_lock = nullptr; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to shut down exclusive lock: " << cpp_strerror(r) + << dendl; + } + + send_unregister_image_watcher(); +} + +template +void CloseRequest::send_unregister_image_watcher() { + if (m_image_ctx->image_watcher == nullptr) { + send_flush_readahead(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->image_watcher->unregister_watch(create_context_callback< + CloseRequest, &CloseRequest::handle_unregister_image_watcher>(this)); +} + +template +void CloseRequest::handle_unregister_image_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to unregister image watcher: " << cpp_strerror(r) + << dendl; + } + + send_flush_readahead(); +} + +template +void CloseRequest::send_flush_readahead() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->readahead.wait_for_pending(create_async_context_callback( + *m_image_ctx, create_context_callback< + CloseRequest, &CloseRequest::handle_flush_readahead>(this))); +} + +template +void CloseRequest::handle_flush_readahead(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + send_shut_down_image_dispatcher(); +} + +template +void CloseRequest::send_shut_down_image_dispatcher() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->io_image_dispatcher->shut_down(create_context_callback< + CloseRequest, + &CloseRequest::handle_shut_down_image_dispatcher>(this)); +} + +template +void CloseRequest::handle_shut_down_image_dispatcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to shut down image dispatcher: " + << cpp_strerror(r) << dendl; + } + + send_shut_down_object_dispatcher(); +} + +template +void CloseRequest::send_shut_down_object_dispatcher() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->io_object_dispatcher->shut_down(create_context_callback< + CloseRequest, + &CloseRequest::handle_shut_down_object_dispatcher>(this)); +} + +template +void CloseRequest::handle_shut_down_object_dispatcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + save_result(r); + if (r < 0) { + lderr(cct) << "failed to shut down object dispatcher: " + << cpp_strerror(r) << dendl; + } + + send_flush_op_work_queue(); +} + +template +void CloseRequest::send_flush_op_work_queue() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->op_work_queue->queue(create_context_callback< + CloseRequest, &CloseRequest::handle_flush_op_work_queue>(this), 0); +} + +template +void CloseRequest::handle_flush_op_work_queue(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + send_close_parent(); +} + +template +void CloseRequest::send_close_parent() { + if (m_image_ctx->parent == nullptr) { + send_flush_image_watcher(); + return; + } + + CephContext *cct = 
m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->parent->state->close(create_async_context_callback( + *m_image_ctx, create_context_callback< + CloseRequest, &CloseRequest::handle_close_parent>(this))); +} + +template +void CloseRequest::handle_close_parent(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + m_image_ctx->parent = nullptr; + save_result(r); + if (r < 0) { + lderr(cct) << "error closing parent image: " << cpp_strerror(r) << dendl; + } + send_flush_image_watcher(); +} + +template +void CloseRequest::send_flush_image_watcher() { + if (m_image_ctx->image_watcher == nullptr) { + finish(); + return; + } + + m_image_ctx->image_watcher->flush(create_context_callback< + CloseRequest, &CloseRequest::handle_flush_image_watcher>(this)); +} + +template +void CloseRequest::handle_flush_image_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "error flushing image watcher: " << cpp_strerror(r) << dendl; + } + save_result(r); + finish(); +} + +template +void CloseRequest::finish() { + m_image_ctx->shutdown(); + m_on_finish->complete(m_error_result); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::CloseRequest; diff --git a/src/librbd/image/CloseRequest.h b/src/librbd/image/CloseRequest.h new file mode 100644 index 000000000..ee298aa9d --- /dev/null +++ b/src/librbd/image/CloseRequest.h @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H + +#include "librbd/ImageCtx.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class CloseRequest { +public: + static CloseRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return new CloseRequest(image_ctx, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * BLOCK_IMAGE_WATCHER (skip if R/O) + * | + * v + * SHUT_DOWN_UPDATE_WATCHERS + * | + * v + * FLUSH + * | + * v (skip if disabled) + * SHUT_DOWN_EXCLUSIVE_LOCK + * | + * v + * UNREGISTER_IMAGE_WATCHER (skip if R/O) + * | + * v + * FLUSH_READAHEAD + * | + * v + * SHUT_DOWN_IMAGE_DISPATCHER + * | + * v + * SHUT_DOWN_OBJECT_DISPATCHER + * | + * v + * FLUSH_OP_WORK_QUEUE + * | + * v (skip if no parent) + * CLOSE_PARENT + * | + * v + * FLUSH_IMAGE_WATCHER + * | + * v + * + * + * @endverbatim + */ + + CloseRequest(ImageCtxT *image_ctx, Context *on_finish); + + ImageCtxT *m_image_ctx; + Context *m_on_finish; + + int m_error_result; + + decltype(m_image_ctx->exclusive_lock) m_exclusive_lock; + + void send_block_image_watcher(); + void handle_block_image_watcher(int r); + + void send_shut_down_update_watchers(); + void handle_shut_down_update_watchers(int r); + + void send_flush(); + void handle_flush(int r); + + void send_shut_down_exclusive_lock(); + void handle_shut_down_exclusive_lock(int r); + + void send_unregister_image_watcher(); + void handle_unregister_image_watcher(int r); + + void send_flush_readahead(); + void handle_flush_readahead(int r); + + void send_shut_down_image_dispatcher(); + void handle_shut_down_image_dispatcher(int r); + + void send_shut_down_object_dispatcher(); + void handle_shut_down_object_dispatcher(int r); + + void send_flush_op_work_queue(); + void 
handle_flush_op_work_queue(int r); + + void send_close_parent(); + void handle_close_parent(int r); + + void send_flush_image_watcher(); + void handle_flush_image_watcher(int r); + + void finish(); + + void save_result(int result) { + if (m_error_result == 0 && result < 0) { + m_error_result = result; + } + } +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::CloseRequest; + +#endif // CEPH_LIBRBD_IMAGE_CLOSE_REQUEST_H diff --git a/src/librbd/image/CreateRequest.cc b/src/librbd/image/CreateRequest.cc new file mode 100644 index 000000000..71fb9cf26 --- /dev/null +++ b/src/librbd/image/CreateRequest.cc @@ -0,0 +1,835 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/CreateRequest.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/ceph_context.h" +#include "cls/rbd/cls_rbd_client.h" +#include "osdc/Striper.h" +#include "librbd/Features.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/image/Types.h" +#include "librbd/image/ValidatePoolRequest.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/RemoveRequest.h" +#include "librbd/journal/TypeTraits.h" +#include "librbd/mirror/EnableRequest.h" +#include "journal/Journaler.h" + + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CreateRequest: " << __func__ \ + << ": " + +namespace librbd { +namespace image { + +using util::create_rados_callback; +using util::create_context_callback; + +namespace { + +int validate_features(CephContext *cct, uint64_t features) { + if (features & ~RBD_FEATURES_ALL) { + lderr(cct) << "librbd does not support requested features." 
<< dendl; + return -ENOSYS; + } + if ((features & RBD_FEATURES_INTERNAL) != 0) { + lderr(cct) << "cannot use internally controlled features" << dendl; + return -EINVAL; + } + if ((features & RBD_FEATURE_FAST_DIFF) != 0 && + (features & RBD_FEATURE_OBJECT_MAP) == 0) { + lderr(cct) << "cannot use fast diff without object map" << dendl; + return -EINVAL; + } + if ((features & RBD_FEATURE_OBJECT_MAP) != 0 && + (features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + lderr(cct) << "cannot use object map without exclusive lock" << dendl; + return -EINVAL; + } + if ((features & RBD_FEATURE_JOURNALING) != 0 && + (features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + lderr(cct) << "cannot use journaling without exclusive lock" << dendl; + return -EINVAL; + } + + return 0; +} + +int validate_striping(CephContext *cct, uint8_t order, uint64_t stripe_unit, + uint64_t stripe_count) { + if ((stripe_unit && !stripe_count) || + (!stripe_unit && stripe_count)) { + lderr(cct) << "must specify both (or neither) of stripe-unit and " + << "stripe-count" << dendl; + return -EINVAL; + } else if (stripe_unit && ((1ull << order) % stripe_unit || stripe_unit > (1ull << order))) { + lderr(cct) << "stripe unit is not a factor of the object size" << dendl; + return -EINVAL; + } else if (stripe_unit != 0 && stripe_unit < 512) { + lderr(cct) << "stripe unit must be at least 512 bytes" << dendl; + return -EINVAL; + } + return 0; +} + +bool validate_layout(CephContext *cct, uint64_t size, file_layout_t &layout) { + if (!librbd::ObjectMap<>::is_compatible(layout, size)) { + lderr(cct) << "image size not compatible with object map" << dendl; + return false; + } + + return true; +} + +int get_image_option(const ImageOptions &image_options, int option, + uint8_t *value) { + uint64_t large_value; + int r = image_options.get(option, &large_value); + if (r < 0) { + return r; + } + *value = static_cast(large_value); + return 0; +} + +} // anonymous namespace + +template +int CreateRequest::validate_order(CephContext *cct, uint8_t order) { + if (order > 25 || order < 12) { + lderr(cct) << "order must be in the range [12, 25]" << dendl; + return -EDOM; + } + return 0; +} + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::CreateRequest: " << this << " " \ + << __func__ << ": " + +template +CreateRequest::CreateRequest(const ConfigProxy& config, IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, uint64_t size, + const ImageOptions &image_options, + uint32_t create_flags, + cls::rbd::MirrorImageMode mirror_image_mode, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + asio::ContextWQ *op_work_queue, + Context *on_finish) + : m_config(config), m_image_name(image_name), m_image_id(image_id), + m_size(size), m_create_flags(create_flags), + m_mirror_image_mode(mirror_image_mode), + m_non_primary_global_image_id(non_primary_global_image_id), + m_primary_mirror_uuid(primary_mirror_uuid), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + + m_io_ctx.dup(ioctx); + m_cct = reinterpret_cast(m_io_ctx.cct()); + + m_id_obj = util::id_obj_name(m_image_name); + m_header_obj = util::header_name(m_image_id); + m_objmap_name = ObjectMap<>::object_map_name(m_image_id, CEPH_NOSNAP); + if (!non_primary_global_image_id.empty() && + (m_create_flags & CREATE_FLAG_MIRROR_ENABLE_MASK) == 0) { + m_create_flags |= CREATE_FLAG_FORCE_MIRROR_ENABLE; + } + + if (image_options.get(RBD_IMAGE_OPTION_FEATURES, &m_features) != 0) { + m_features = librbd::rbd_features_from_string( + 
m_config.get_val("rbd_default_features"), nullptr); + m_negotiate_features = true; + } + + uint64_t features_clear = 0; + uint64_t features_set = 0; + image_options.get(RBD_IMAGE_OPTION_FEATURES_CLEAR, &features_clear); + image_options.get(RBD_IMAGE_OPTION_FEATURES_SET, &features_set); + + uint64_t features_conflict = features_clear & features_set; + features_clear &= ~features_conflict; + features_set &= ~features_conflict; + m_features |= features_set; + m_features &= ~features_clear; + + m_features &= ~RBD_FEATURES_IMPLICIT_ENABLE; + if ((m_features & RBD_FEATURE_OBJECT_MAP) == RBD_FEATURE_OBJECT_MAP) { + m_features |= RBD_FEATURE_FAST_DIFF; + } + + if (image_options.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &m_stripe_unit) != 0 || + m_stripe_unit == 0) { + m_stripe_unit = m_config.get_val("rbd_default_stripe_unit"); + } + if (image_options.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &m_stripe_count) != 0 || + m_stripe_count == 0) { + m_stripe_count = m_config.get_val("rbd_default_stripe_count"); + } + if (get_image_option(image_options, RBD_IMAGE_OPTION_ORDER, &m_order) != 0 || + m_order == 0) { + m_order = config.get_val("rbd_default_order"); + } + if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_ORDER, + &m_journal_order) != 0) { + m_journal_order = m_config.get_val("rbd_journal_order"); + } + if (get_image_option(image_options, RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, + &m_journal_splay_width) != 0) { + m_journal_splay_width = m_config.get_val( + "rbd_journal_splay_width"); + } + if (image_options.get(RBD_IMAGE_OPTION_JOURNAL_POOL, &m_journal_pool) != 0) { + m_journal_pool = m_config.get_val("rbd_journal_pool"); + } + if (image_options.get(RBD_IMAGE_OPTION_DATA_POOL, &m_data_pool) != 0) { + m_data_pool = m_config.get_val("rbd_default_data_pool"); + } + + m_layout.object_size = 1ull << m_order; + if (m_stripe_unit == 0 || m_stripe_count == 0) { + m_layout.stripe_unit = m_layout.object_size; + m_layout.stripe_count = 1; + } else { + m_layout.stripe_unit = m_stripe_unit; + m_layout.stripe_count = m_stripe_count; + } + + if (!m_data_pool.empty() && m_data_pool != ioctx.get_pool_name()) { + m_features |= RBD_FEATURE_DATA_POOL; + } else { + m_data_pool.clear(); + } + + if ((m_stripe_unit != 0 && m_stripe_unit != (1ULL << m_order)) || + (m_stripe_count != 0 && m_stripe_count != 1)) { + m_features |= RBD_FEATURE_STRIPINGV2; + } + + ldout(m_cct, 10) << "name=" << m_image_name << ", " + << "id=" << m_image_id << ", " + << "size=" << m_size << ", " + << "features=" << m_features << ", " + << "order=" << (uint64_t)m_order << ", " + << "stripe_unit=" << m_stripe_unit << ", " + << "stripe_count=" << m_stripe_count << ", " + << "journal_order=" << (uint64_t)m_journal_order << ", " + << "journal_splay_width=" + << (uint64_t)m_journal_splay_width << ", " + << "journal_pool=" << m_journal_pool << ", " + << "data_pool=" << m_data_pool << dendl; +} + +template +void CreateRequest::send() { + ldout(m_cct, 20) << dendl; + + int r = validate_features(m_cct, m_features); + if (r < 0) { + complete(r); + return; + } + + r = validate_order(m_cct, m_order); + if (r < 0) { + complete(r); + return; + } + + r = validate_striping(m_cct, m_order, m_stripe_unit, m_stripe_count); + if (r < 0) { + complete(r); + return; + } + + if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) && + (!validate_layout(m_cct, m_size, m_layout))) { + complete(-EINVAL); + return; + } + + validate_data_pool(); +} + +template +void CreateRequest::validate_data_pool() { + m_data_io_ctx = m_io_ctx; + if ((m_features & RBD_FEATURE_DATA_POOL) != 0) { + 
librados::Rados rados(m_io_ctx); + int r = rados.ioctx_create(m_data_pool.c_str(), m_data_io_ctx); + if (r < 0) { + lderr(m_cct) << "data pool " << m_data_pool << " does not exist" << dendl; + complete(r); + return; + } + m_data_pool_id = m_data_io_ctx.get_id(); + m_data_io_ctx.set_namespace(m_io_ctx.get_namespace()); + } + + if (!m_config.get_val("rbd_validate_pool")) { + add_image_to_directory(); + return; + } + + ldout(m_cct, 15) << dendl; + + auto ctx = create_context_callback< + CreateRequest, &CreateRequest::handle_validate_data_pool>(this); + auto req = ValidatePoolRequest::create(m_data_io_ctx, ctx); + req->send(); +} + +template +void CreateRequest::handle_validate_data_pool(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EINVAL) { + lderr(m_cct) << "pool does not support RBD images" << dendl; + complete(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to validate pool: " << cpp_strerror(r) << dendl; + complete(r); + return; + } + + add_image_to_directory(); +} + +template +void CreateRequest::add_image_to_directory() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + if (!m_io_ctx.get_namespace().empty()) { + cls_client::dir_state_assert(&op, cls::rbd::DIRECTORY_STATE_READY); + } + cls_client::dir_add_image(&op, m_image_name, m_image_id); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_add_image_to_directory(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 5) << "directory entry for image " << m_image_name + << " already exists" << dendl; + complete(r); + return; + } else if (!m_io_ctx.get_namespace().empty() && r == -ENOENT) { + ldout(m_cct, 5) << "namespace " << m_io_ctx.get_namespace() + << " does not exist" << dendl; + complete(r); + return; + } else if (r < 0) { + lderr(m_cct) << "error adding image to directory: " << cpp_strerror(r) + << dendl; + complete(r); + return; + } + + create_id_object(); +} + +template +void CreateRequest::create_id_object() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + op.create(true); + cls_client::set_id(&op, m_image_id); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_id_obj, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_create_id_object(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 5) << "id object for " << m_image_name << " already exists" + << dendl; + m_r_saved = r; + remove_from_dir(); + return; + } else if (r < 0) { + lderr(m_cct) << "error creating RBD id object: " << cpp_strerror(r) + << dendl; + m_r_saved = r; + remove_from_dir(); + return; + } + + negotiate_features(); +} + +template +void CreateRequest::negotiate_features() { + if (!m_negotiate_features) { + create_image(); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::get_all_features_start(&op); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + + m_outbl.clear(); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_outbl); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_negotiate_features(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + uint64_t 
all_features; + if (r >= 0) { + auto it = m_outbl.cbegin(); + r = cls_client::get_all_features_finish(&it, &all_features); + } + if (r < 0) { + ldout(m_cct, 10) << "error retrieving server supported features set: " + << cpp_strerror(r) << dendl; + } else if ((m_features & all_features) != m_features) { + m_features &= all_features; + ldout(m_cct, 10) << "limiting default features set to server supported: " + << m_features << dendl; + } + + create_image(); +} + +template +void CreateRequest::create_image() { + ldout(m_cct, 15) << dendl; + ceph_assert(m_data_pool.empty() || m_data_pool_id != -1); + + std::ostringstream oss; + oss << RBD_DATA_PREFIX; + if (m_data_pool_id != -1) { + oss << stringify(m_io_ctx.get_id()) << "."; + } + oss << m_image_id; + if (oss.str().length() > RBD_MAX_BLOCK_NAME_PREFIX_LENGTH) { + lderr(m_cct) << "object prefix '" << oss.str() << "' too large" << dendl; + m_r_saved = -EINVAL; + remove_id_object(); + return; + } + + librados::ObjectWriteOperation op; + op.create(true); + cls_client::create_image(&op, m_size, m_order, m_features, oss.str(), + m_data_pool_id); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_header_obj, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_create_image(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 5) << "image id already in-use" << dendl; + complete(-EBADF); + return; + } else if (r < 0) { + lderr(m_cct) << "error writing header: " << cpp_strerror(r) << dendl; + m_r_saved = r; + remove_id_object(); + return; + } + + set_stripe_unit_count(); +} + +template +void CreateRequest::set_stripe_unit_count() { + if ((!m_stripe_unit && !m_stripe_count) || + ((m_stripe_count == 1) && (m_stripe_unit == (1ull << m_order)))) { + object_map_resize(); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::set_stripe_unit_count(&op, m_stripe_unit, m_stripe_count); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_header_obj, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_set_stripe_unit_count(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error setting stripe unit/count: " + << cpp_strerror(r) << dendl; + m_r_saved = r; + remove_header_object(); + return; + } + + object_map_resize(); +} + +template +void CreateRequest::object_map_resize() { + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) { + fetch_mirror_mode(); + return; + } + + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::object_map_resize(&op, Striper::get_num_objects(m_layout, m_size), + OBJECT_NONEXISTENT); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_objmap_name, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_object_map_resize(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error creating initial object map: " + << cpp_strerror(r) << dendl; + + m_r_saved = r; + remove_header_object(); + return; + } + + fetch_mirror_mode(); +} + +template +void CreateRequest::fetch_mirror_mode() { + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + mirror_image_enable(); + return; + } + + ldout(m_cct, 15) << dendl; + + 
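+  // cls_client requests come in start/finish pairs: *_start() queues
+  // the class method on the read op here and the completion handler
+  // decodes the reply from m_outbl via mirror_mode_get_finish(). Per
+  // the guard above, the pool mirror mode only matters when journaling
+  // was requested, since pool-level mirroring is journal-based.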
librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_outbl.clear(); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_outbl); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_fetch_mirror_mode(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if ((r < 0) && (r != -ENOENT)) { + lderr(m_cct) << "failed to retrieve mirror mode: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + remove_object_map(); + return; + } + + m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + if (r == 0) { + auto it = m_outbl.cbegin(); + r = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode); + if (r < 0) { + lderr(m_cct) << "Failed to retrieve mirror mode" << dendl; + + m_r_saved = r; + remove_object_map(); + return; + } + } + + journal_create(); +} + +template +void CreateRequest::journal_create() { + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest; + Context *ctx = create_context_callback( + this); + + // only link to remote primary mirror uuid if in journal-based + // mirroring mode + bool use_primary_mirror_uuid = ( + !m_non_primary_global_image_id.empty() && + m_mirror_image_mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL); + + librbd::journal::TagData tag_data; + tag_data.mirror_uuid = (use_primary_mirror_uuid ? m_primary_mirror_uuid : + librbd::Journal::LOCAL_MIRROR_UUID); + + typename journal::TypeTraits::ContextWQ* context_wq; + Journal<>::get_work_queue(m_cct, &context_wq); + + auto req = librbd::journal::CreateRequest::create( + m_io_ctx, m_image_id, m_journal_order, m_journal_splay_width, + m_journal_pool, cls::journal::Tag::TAG_CLASS_NEW, tag_data, + librbd::Journal::IMAGE_CLIENT_ID, context_wq, ctx); + req->send(); +} + +template +void CreateRequest::handle_journal_create(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error creating journal: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + remove_object_map(); + return; + } + + mirror_image_enable(); +} + +template +void CreateRequest::mirror_image_enable() { + auto mirror_enable_flag = (m_create_flags & CREATE_FLAG_MIRROR_ENABLE_MASK); + + if ((m_mirror_mode != cls::rbd::MIRROR_MODE_POOL && + mirror_enable_flag != CREATE_FLAG_FORCE_MIRROR_ENABLE) || + (mirror_enable_flag == CREATE_FLAG_SKIP_MIRROR_ENABLE)) { + complete(0); + return; + } + + ldout(m_cct, 15) << dendl; + auto ctx = create_context_callback< + CreateRequest, &CreateRequest::handle_mirror_image_enable>(this); + + auto req = mirror::EnableRequest::create( + m_io_ctx, m_image_id, m_mirror_image_mode, + m_non_primary_global_image_id, true, m_op_work_queue, ctx); + req->send(); +} + +template +void CreateRequest::handle_mirror_image_enable(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "cannot enable mirroring: " << cpp_strerror(r) + << dendl; + + m_r_saved = r; + journal_remove(); + return; + } + + complete(0); +} + +template +void CreateRequest::complete(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_data_io_ctx.close(); + auto on_finish = m_on_finish; + delete this; + on_finish->complete(r); +} + +// cleanup +template +void CreateRequest::journal_remove() { + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + remove_object_map(); + return; + } + + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest; + Context *ctx = create_context_callback( + this); + + typename journal::TypeTraits::ContextWQ* context_wq; 
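+  // Failure cleanup unwinds the creation steps in reverse order
+  // (journal, object map, header, id object, directory entry), while
+  // m_r_saved preserves the original error for complete() to report.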
+ Journal<>::get_work_queue(m_cct, &context_wq); + + librbd::journal::RemoveRequest *req = + librbd::journal::RemoveRequest::create( + m_io_ctx, m_image_id, librbd::Journal::IMAGE_CLIENT_ID, context_wq, + ctx); + req->send(); +} + +template +void CreateRequest::handle_journal_remove(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up journal after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_object_map(); +} + +template +void CreateRequest::remove_object_map() { + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) { + remove_header_object(); + return; + } + + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_remove(m_objmap_name, comp); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_remove_object_map(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up object map after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_header_object(); +} + +template +void CreateRequest::remove_header_object() { + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_remove(m_header_obj, comp); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_remove_header_object(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up image header after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_id_object(); +} + +template +void CreateRequest::remove_id_object() { + ldout(m_cct, 15) << dendl; + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_remove(m_id_obj, comp); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_remove_id_object(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up id object after creation failed: " + << cpp_strerror(r) << dendl; + } + + remove_from_dir(); +} + +template +void CreateRequest::remove_from_dir() { + ldout(m_cct, 15) << dendl; + + librados::ObjectWriteOperation op; + cls_client::dir_remove_image(&op, m_image_name, m_image_id); + + using klass = CreateRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateRequest::handle_remove_from_dir(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error cleaning up image from rbd_directory object " + << "after creation failed: " << cpp_strerror(r) << dendl; + } + + complete(m_r_saved); +} + +} //namespace image +} //namespace librbd + +template class librbd::image::CreateRequest; diff --git a/src/librbd/image/CreateRequest.h b/src/librbd/image/CreateRequest.h new file mode 100644 index 000000000..9cb0eec7c --- /dev/null +++ b/src/librbd/image/CreateRequest.h @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H + +#include "common/config_fwd.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include 
"cls/rbd/cls_rbd_types.h" +#include "librbd/ImageCtx.h" + +class Context; + +using librados::IoCtx; + +namespace journal { class Journaler; } + +namespace librbd { + +namespace asio { struct ContextWQ; } + +namespace image { + +template +class CreateRequest { +public: + static CreateRequest *create(const ConfigProxy& config, IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, uint64_t size, + const ImageOptions &image_options, + uint32_t create_flags, + cls::rbd::MirrorImageMode mirror_image_mode, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + asio::ContextWQ *op_work_queue, + Context *on_finish) { + return new CreateRequest(config, ioctx, image_name, image_id, size, + image_options, create_flags, + mirror_image_mode, non_primary_global_image_id, + primary_mirror_uuid, op_work_queue, on_finish); + } + + static int validate_order(CephContext *cct, uint8_t order); + + void send(); + +private: + /** + * @verbatim + * + * . . . . > . . . . . + * | . + * v . + * VALIDATE DATA POOL v (pool validation + * | . disabled) + * v . + * (error: bottom up) ADD IMAGE TO DIRECTORY < . . . . + * _______<_______ | + * | | v + * | | CREATE ID OBJECT + * | | / | + * | REMOVE FROM DIR <-------/ v + * | | NEGOTIATE FEATURES (when using default features) + * | | | + * | | v (stripingv2 disabled) + * | | CREATE IMAGE. . . . > . . . . + * v | / | . + * | REMOVE ID OBJ <---------/ v . + * | | SET STRIPE UNIT COUNT . + * | | / | \ . . . . . > . . . . + * | REMOVE HEADER OBJ<------/ v /. (object-map + * | |\ OBJECT MAP RESIZE . . < . . * v disabled) + * | | \ / | \ . . . . . > . . . . + * | | *<-----------/ v /. (journaling + * | | FETCH MIRROR MODE. . < . . * v disabled) + * | | / | . + * | REMOVE OBJECT MAP<--------/ v . + * | |\ JOURNAL CREATE . + * | | \ / | . + * v | *<------------/ v . + * | | MIRROR IMAGE ENABLE . + * | | / | . + * | JOURNAL REMOVE*<-------/ | . + * | v . + * |_____________>___________________ . . . . < . . . . 
+ * + * @endverbatim + */ + + CreateRequest(const ConfigProxy& config, IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, uint64_t size, + const ImageOptions &image_options, + uint32_t create_flags, + cls::rbd::MirrorImageMode mirror_image_mode, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + asio::ContextWQ *op_work_queue, Context *on_finish); + + const ConfigProxy& m_config; + IoCtx m_io_ctx; + IoCtx m_data_io_ctx; + std::string m_image_name; + std::string m_image_id; + uint64_t m_size; + uint8_t m_order = 0; + uint64_t m_features = 0; + uint64_t m_stripe_unit = 0; + uint64_t m_stripe_count = 0; + uint8_t m_journal_order = 0; + uint8_t m_journal_splay_width = 0; + std::string m_journal_pool; + std::string m_data_pool; + int64_t m_data_pool_id = -1; + uint32_t m_create_flags; + cls::rbd::MirrorImageMode m_mirror_image_mode; + const std::string m_non_primary_global_image_id; + const std::string m_primary_mirror_uuid; + bool m_negotiate_features = false; + + asio::ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + int m_r_saved = 0; // used to return actual error after cleanup + file_layout_t m_layout; + std::string m_id_obj, m_header_obj, m_objmap_name; + + bufferlist m_outbl; + cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + cls::rbd::MirrorImage m_mirror_image_internal; + + void validate_data_pool(); + void handle_validate_data_pool(int r); + + void add_image_to_directory(); + void handle_add_image_to_directory(int r); + + void create_id_object(); + void handle_create_id_object(int r); + + void negotiate_features(); + void handle_negotiate_features(int r); + + void create_image(); + void handle_create_image(int r); + + void set_stripe_unit_count(); + void handle_set_stripe_unit_count(int r); + + void object_map_resize(); + void handle_object_map_resize(int r); + + void fetch_mirror_mode(); + void handle_fetch_mirror_mode(int r); + + void journal_create(); + void handle_journal_create(int r); + + void mirror_image_enable(); + void handle_mirror_image_enable(int r); + + void complete(int r); + + // cleanup + void journal_remove(); + void handle_journal_remove(int r); + + void remove_object_map(); + void handle_remove_object_map(int r); + + void remove_header_object(); + void handle_remove_header_object(int r); + + void remove_id_object(); + void handle_remove_id_object(int r); + + void remove_from_dir(); + void handle_remove_from_dir(int r); + +}; + +} //namespace image +} //namespace librbd + +extern template class librbd::image::CreateRequest; + +#endif // CEPH_LIBRBD_IMAGE_CREATE_REQUEST_H diff --git a/src/librbd/image/DetachChildRequest.cc b/src/librbd/image/DetachChildRequest.cc new file mode 100644 index 000000000..ab39dbcd7 --- /dev/null +++ b/src/librbd/image/DetachChildRequest.cc @@ -0,0 +1,392 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/DetachChildRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/journal/DisabledPolicy.h" +#include "librbd/trash/RemoveRequest.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::DetachChildRequest: " << this \ + << " " << 
__func__ << ": " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template +DetachChildRequest::~DetachChildRequest() { + ceph_assert(m_parent_image_ctx == nullptr); +} + +template +void DetachChildRequest::send() { + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + + // use oldest snapshot or HEAD for parent spec + if (!m_image_ctx.snap_info.empty()) { + m_parent_spec = m_image_ctx.snap_info.begin()->second.parent.spec; + } else { + m_parent_spec = m_image_ctx.parent_md.spec; + } + } + + if (m_parent_spec.pool_id == -1) { + // ignore potential race with parent disappearing + m_image_ctx.op_work_queue->queue(create_context_callback< + DetachChildRequest, + &DetachChildRequest::finish>(this), 0); + return; + } else if (!m_image_ctx.test_op_features(RBD_OPERATION_FEATURE_CLONE_CHILD)) { + clone_v1_remove_child(); + return; + } + + clone_v2_child_detach(); +} + +template +void DetachChildRequest::clone_v2_child_detach() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + cls_client::child_detach(&op, m_parent_spec.snap_id, + {m_image_ctx.md_ctx.get_id(), + m_image_ctx.md_ctx.get_namespace(), + m_image_ctx.id}); + + int r = util::create_ioctx(m_image_ctx.md_ctx, "parent image", + m_parent_spec.pool_id, + m_parent_spec.pool_namespace, &m_parent_io_ctx); + if (r < 0) { + if (r == -ENOENT) { + r = 0; + } + finish(r); + return; + } + + m_parent_header_name = util::header_name(m_parent_spec.image_id); + + auto aio_comp = create_rados_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_child_detach>(this); + r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DetachChildRequest::handle_clone_v2_child_detach(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "error detaching child from parent: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + clone_v2_get_snapshot(); +} + +template +void DetachChildRequest::clone_v2_get_snapshot() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::snapshot_get_start(&op, m_parent_spec.snap_id); + + m_out_bl.clear(); + auto aio_comp = create_rados_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_get_snapshot>(this); + int r = m_parent_io_ctx.aio_operate(m_parent_header_name, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DetachChildRequest::handle_clone_v2_get_snapshot(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + bool remove_snapshot = false; + if (r == 0) { + cls::rbd::SnapshotInfo snap_info; + auto it = m_out_bl.cbegin(); + r = cls_client::snapshot_get_finish(&it, &snap_info); + if (r == 0) { + m_parent_snap_namespace = snap_info.snapshot_namespace; + m_parent_snap_name = snap_info.name; + + if (cls::rbd::get_snap_namespace_type(m_parent_snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH && + snap_info.child_count == 0) { + // snapshot is in trash w/ zero children, so remove it + remove_snapshot = true; + } + } + } + + if (r < 0 && r != -ENOENT) { + ldout(cct, 5) << "failed to retrieve snapshot: " << cpp_strerror(r) + << dendl; + } + + if (!remove_snapshot) { + finish(0); + return; + } + + clone_v2_open_parent(); +} + +template +void 
DetachChildRequest::clone_v2_open_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + m_parent_image_ctx = I::create("", m_parent_spec.image_id, nullptr, + m_parent_io_ctx, false); + + // ensure non-primary images can be modified + m_parent_image_ctx->read_only_mask &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + + auto ctx = create_context_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_open_parent>(this); + m_parent_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template +void DetachChildRequest::handle_clone_v2_open_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + ldout(cct, 5) << "failed to open parent for read/write: " + << cpp_strerror(r) << dendl; + m_parent_image_ctx = nullptr; + finish(0); + return; + } + + // do not attempt to open the parent journal when removing the trash + // snapshot, because the parent may be not promoted + if (m_parent_image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + std::unique_lock image_locker{m_parent_image_ctx->image_lock}; + m_parent_image_ctx->set_journal_policy(new journal::DisabledPolicy()); + } + + // disallow any proxied maintenance operations + { + std::shared_lock owner_locker{m_parent_image_ctx->owner_lock}; + if (m_parent_image_ctx->exclusive_lock != nullptr) { + m_parent_image_ctx->exclusive_lock->block_requests(0); + } + } + + clone_v2_remove_snapshot(); +} + +template +void DetachChildRequest::clone_v2_remove_snapshot() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_remove_snapshot>(this); + m_parent_image_ctx->operations->snap_remove(m_parent_snap_namespace, + m_parent_snap_name, ctx); +} + +template +void DetachChildRequest::handle_clone_v2_remove_snapshot(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + ldout(cct, 5) << "failed to remove trashed clone snapshot: " + << cpp_strerror(r) << dendl; + clone_v2_close_parent(); + return; + } + + if (m_parent_image_ctx->snaps.empty()) { + clone_v2_get_parent_trash_entry(); + } else { + clone_v2_close_parent(); + } +} + +template +void DetachChildRequest::clone_v2_get_parent_trash_entry() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::trash_get_start(&op, m_parent_image_ctx->id); + + m_out_bl.clear(); + auto aio_comp = create_rados_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_get_parent_trash_entry>(this); + int r = m_parent_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DetachChildRequest::handle_clone_v2_get_parent_trash_entry(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + ldout(cct, 5) << "failed to get parent trash entry: " << cpp_strerror(r) + << dendl; + clone_v2_close_parent(); + return; + } + + bool in_trash = false; + + if (r == 0) { + cls::rbd::TrashImageSpec trash_spec; + auto it = m_out_bl.cbegin(); + r = cls_client::trash_get_finish(&it, &trash_spec); + + if (r == 0 && + trash_spec.source == cls::rbd::TRASH_IMAGE_SOURCE_USER_PARENT && + trash_spec.state == cls::rbd::TRASH_IMAGE_STATE_NORMAL && + trash_spec.deferment_end_time <= ceph_clock_now()) { + in_trash = true; + } + } + + if (in_trash) { + clone_v2_remove_parent_from_trash(); + } else { + clone_v2_close_parent(); + } +} 
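Every request class in this patch (CreateRequest, CloseRequest, DetachChildRequest, GetMetadataRequest, and the rest) follows the same asynchronous state-machine idiom: a static create() plus send() entry point, one send_*() method per state that issues a single async operation, and a matching handle_*(int r) that either advances to the next state or short-circuits to a self-deleting finish(). The following is a minimal, self-contained sketch of that idiom; FakeStore, DemoRequest, and the synchronous callback are stand-ins rather than librbd types, and the real code dispatches through create_context_callback()/create_rados_callback() instead of std::function.

#include <functional>
#include <iostream>

// Stand-in for an async backend; a real librados call would complete
// later on a finisher thread instead of synchronously like this.
struct FakeStore {
  void async_op(const char* what, std::function<void(int)> cb) {
    std::cout << "issuing " << what << "\n";
    cb(0);
  }
};

class DemoRequest {
public:
  static DemoRequest* create(FakeStore& store,
                             std::function<void(int)> on_finish) {
    return new DemoRequest(store, std::move(on_finish));
  }

  void send() { send_step_one(); }

private:
  DemoRequest(FakeStore& store, std::function<void(int)> on_finish)
    : m_store(store), m_on_finish(std::move(on_finish)) {}

  FakeStore& m_store;
  std::function<void(int)> m_on_finish;

  void send_step_one() {
    m_store.async_op("step one", [this](int r) { handle_step_one(r); });
  }

  void handle_step_one(int r) {
    if (r < 0) {  // short-circuit straight to completion on error
      finish(r);
      return;
    }
    send_step_two();
  }

  void send_step_two() {
    m_store.async_op("step two", [this](int r) { handle_step_two(r); });
  }

  void handle_step_two(int r) {
    finish(r);
  }

  void finish(int r) {
    m_on_finish(r);
    delete this;  // requests own themselves, like the classes above
  }
};

int main() {
  FakeStore store;
  DemoRequest::create(store, [](int r) {
    std::cout << "done: r=" << r << "\n";
  })->send();
  return 0;
}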
+ +template +void DetachChildRequest::clone_v2_remove_parent_from_trash() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_remove_parent_from_trash>(this); + auto req = librbd::trash::RemoveRequest::create( + m_parent_io_ctx, m_parent_image_ctx, m_image_ctx.op_work_queue, false, + m_no_op, ctx); + req->send(); +} + +template +void DetachChildRequest::handle_clone_v2_remove_parent_from_trash(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + ldout(cct, 5) << "failed to remove parent image:" << cpp_strerror(r) + << dendl; + } + + m_parent_image_ctx = nullptr; + finish(0); +} + +template +void DetachChildRequest::clone_v2_close_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v2_close_parent>(this); + m_parent_image_ctx->state->close(ctx); +} + +template +void DetachChildRequest::handle_clone_v2_close_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + ldout(cct, 5) << "failed to close parent image:" << cpp_strerror(r) + << dendl; + } + + m_parent_image_ctx = nullptr; + finish(0); +} + +template +void DetachChildRequest::clone_v1_remove_child() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + m_parent_spec.pool_namespace = ""; + + librados::ObjectWriteOperation op; + librbd::cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id); + + auto aio_comp = create_rados_callback< + DetachChildRequest, + &DetachChildRequest::handle_clone_v1_remove_child>(this); + int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DetachChildRequest::handle_clone_v1_remove_child(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } else if (r < 0) { + lderr(cct) << "failed to remove child from children list: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void DetachChildRequest::finish(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::DetachChildRequest; diff --git a/src/librbd/image/DetachChildRequest.h b/src/librbd/image/DetachChildRequest.h new file mode 100644 index 000000000..646b7ec62 --- /dev/null +++ b/src/librbd/image/DetachChildRequest.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H +#define CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" +#include "librbd/internal.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class DetachChildRequest { +public: + static DetachChildRequest* create(ImageCtxT& image_ctx, Context* on_finish) { + return new DetachChildRequest(image_ctx, on_finish); + } + + DetachChildRequest(ImageCtxT& image_ctx, Context* on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { + } + ~DetachChildRequest(); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * (v1) | (v2) + * /--------------/ \--------------\ + 
* | | + * v v + * REMOVE_CHILD CHILD_DETACH + * | | + * | v + * | GET_SNAPSHOT + * | (snapshot in-use) . | + * |/. . . . . . . . . . . . . . . | + * | v + * | OPEN_PARENT + * | | + * | v (has more children) + * | REMOVE_SNAPSHOT ---------------\ + * | | | + * | v (noent) | + * | (auto-delete when GET_PARENT_TRASH_ENTRY . . . .\| + * | last child detached) | | + * | v v + * | REMOVE_PARENT_FROM_TRASH CLOSE_PARENT + * | | | + * |/------------------------------/--------------------------/ + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + Context* m_on_finish; + + librados::IoCtx m_parent_io_ctx; + cls::rbd::ParentImageSpec m_parent_spec; + std::string m_parent_header_name; + + cls::rbd::SnapshotNamespace m_parent_snap_namespace; + std::string m_parent_snap_name; + + ImageCtxT* m_parent_image_ctx = nullptr; + + ceph::bufferlist m_out_bl; + NoOpProgressContext m_no_op; + + void clone_v2_child_detach(); + void handle_clone_v2_child_detach(int r); + + void clone_v2_get_snapshot(); + void handle_clone_v2_get_snapshot(int r); + + void clone_v2_open_parent(); + void handle_clone_v2_open_parent(int r); + + void clone_v2_remove_snapshot(); + void handle_clone_v2_remove_snapshot(int r); + + void clone_v2_get_parent_trash_entry(); + void handle_clone_v2_get_parent_trash_entry(int r); + + void clone_v2_remove_parent_from_trash(); + void handle_clone_v2_remove_parent_from_trash(int r); + + void clone_v2_close_parent(); + void handle_clone_v2_close_parent(int r); + + void clone_v1_remove_child(); + void handle_clone_v1_remove_child(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::DetachChildRequest; + +#endif // CEPH_LIBRBD_IMAGE_DETACH_CHILD_REQUEST_H diff --git a/src/librbd/image/DetachParentRequest.cc b/src/librbd/image/DetachParentRequest.cc new file mode 100644 index 000000000..74b1b0f67 --- /dev/null +++ b/src/librbd/image/DetachParentRequest.cc @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/DetachParentRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::DetachParentRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template +void DetachParentRequest::send() { + detach_parent(); +} + +template +void DetachParentRequest::detach_parent() { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + if (!m_legacy_parent) { + librbd::cls_client::parent_detach(&op); + } else { + librbd::cls_client::remove_parent(&op); + } + + auto aio_comp = create_rados_callback< + DetachParentRequest, + &DetachParentRequest::handle_detach_parent>(this); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DetachParentRequest::handle_detach_parent(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + if (!m_legacy_parent && r == -EOPNOTSUPP) { + ldout(cct, 10) << "retrying using legacy parent method" << dendl; + m_legacy_parent = true; + detach_parent(); + return; + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "detach parent encountered an error: " << 
cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void DetachParentRequest::finish(int r) { + auto cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::DetachParentRequest; diff --git a/src/librbd/image/DetachParentRequest.h b/src/librbd/image/DetachParentRequest.h new file mode 100644 index 000000000..17c86aaac --- /dev/null +++ b/src/librbd/image/DetachParentRequest.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H +#define CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class DetachParentRequest { +public: + static DetachParentRequest* create(ImageCtxT& image_ctx, Context* on_finish) { + return new DetachParentRequest(image_ctx, on_finish); + } + + DetachParentRequest(ImageCtxT& image_ctx, Context* on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | * * * * * * + * | * * -EOPNOTSUPP + * v v * + * DETACH_PARENT * * * + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT& m_image_ctx; + Context* m_on_finish; + + bool m_legacy_parent = false; + + void detach_parent(); + void handle_detach_parent(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::DetachParentRequest; + +#endif // CEPH_LIBRBD_IMAGE_DETACH_PARENT_REQUEST_H diff --git a/src/librbd/image/GetMetadataRequest.cc b/src/librbd/image/GetMetadataRequest.cc new file mode 100644 index 000000000..1410c9005 --- /dev/null +++ b/src/librbd/image/GetMetadataRequest.cc @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/GetMetadataRequest.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::GetMetadataRequest: " \ + << this << " " << __func__ << ": " + +#define MAX_KEYS 64U + +namespace librbd { +namespace image { +namespace { + +static const std::string INTERNAL_KEY_PREFIX{".rbd"}; + +} // anonymous namespace + +using util::create_rados_callback; + +template +GetMetadataRequest::GetMetadataRequest( + IoCtx &io_ctx, const std::string &oid, bool filter_internal, + const std::string& filter_key_prefix, const std::string& last_key, + uint32_t max_results, KeyValues* key_values, Context *on_finish) + : m_io_ctx(io_ctx), m_oid(oid), m_filter_internal(filter_internal), + m_filter_key_prefix(filter_key_prefix), m_last_key(last_key), + m_max_results(max_results), m_key_values(key_values), + m_on_finish(on_finish), + m_cct(reinterpret_cast(m_io_ctx.cct())) { +} + +template +void GetMetadataRequest::send() { + metadata_list(); +} + +template +void GetMetadataRequest::metadata_list() { + ldout(m_cct, 15) << "start_key=" << m_last_key << dendl; + + m_expected_results = MAX_KEYS; + if (m_max_results > 0) { + m_expected_results = std::min( + m_expected_results, 
m_max_results - m_key_values->size()); + } + + librados::ObjectReadOperation op; + cls_client::metadata_list_start(&op, m_last_key, m_expected_results); + + auto aio_comp = create_rados_callback< + GetMetadataRequest, &GetMetadataRequest::handle_metadata_list>(this); + m_out_bl.clear(); + m_io_ctx.aio_operate(m_oid, aio_comp, &op, &m_out_bl); + aio_comp->release(); +} + +template +void GetMetadataRequest::handle_metadata_list(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + KeyValues metadata; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::metadata_list_finish(&it, &metadata); + } + + if (r == -ENOENT || r == -EOPNOTSUPP) { + finish(0); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to retrieve image metadata: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + for (auto it = metadata.begin(); it != metadata.end(); ++it) { + if (m_filter_internal && + boost::starts_with(it->first, INTERNAL_KEY_PREFIX)) { + continue; + } else if (!m_filter_key_prefix.empty() && + !boost::starts_with(it->first, m_filter_key_prefix)) { + continue; + } + m_key_values->insert({it->first, std::move(it->second)}); + } + if (!metadata.empty()) { + m_last_key = metadata.rbegin()->first; + } + + if (metadata.size() == m_expected_results && + (m_max_results == 0 || m_key_values->size() < m_max_results)) { + metadata_list(); + return; + } + + finish(0); +} + +template +void GetMetadataRequest::finish(int r) { + ldout(m_cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::GetMetadataRequest; diff --git a/src/librbd/image/GetMetadataRequest.h b/src/librbd/image/GetMetadataRequest.h new file mode 100644 index 000000000..08fc2de71 --- /dev/null +++ b/src/librbd/image/GetMetadataRequest.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_GET_METADATA_REQUEST_H +#define CEPH_LIBRBD_IMAGE_GET_METADATA_REQUEST_H + +#include "include/common_fwd.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include +#include + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace image { + +template +class GetMetadataRequest { +public: + typedef std::map KeyValues; + + static GetMetadataRequest* create( + IoCtx &io_ctx, const std::string &oid, bool filter_internal, + const std::string& filter_key_prefix, const std::string& last_key, + uint32_t max_results, KeyValues* key_values, Context *on_finish) { + return new GetMetadataRequest(io_ctx, oid, filter_internal, + filter_key_prefix, last_key, max_results, + key_values, on_finish); + } + + GetMetadataRequest( + IoCtx &io_ctx, const std::string &oid, bool filter_internal, + const std::string& filter_key_prefix, const std::string& last_key, + uint32_t max_results, KeyValues* key_values, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * | /-------\ + * | | | + * v v | + * METADATA_LIST ---/ + * | + * v + * + * + * @endverbatim + */ + librados::IoCtx m_io_ctx; + std::string m_oid; + bool m_filter_internal; + std::string m_filter_key_prefix; + std::string m_last_key; + uint32_t m_max_results; + KeyValues* m_key_values; + Context* m_on_finish; + + CephContext* m_cct; + bufferlist m_out_bl; + uint32_t m_expected_results = 0; + + void metadata_list(); + void handle_metadata_list(int r); + + void finish(int r); + +}; + +} //namespace image +} //namespace librbd + +extern 
template class librbd::image::GetMetadataRequest; + +#endif // CEPH_LIBRBD_IMAGE_GET_METADATA_REQUEST_H diff --git a/src/librbd/image/ListWatchersRequest.cc b/src/librbd/image/ListWatchersRequest.cc new file mode 100644 index 000000000..7ccbd136f --- /dev/null +++ b/src/librbd/image/ListWatchersRequest.cc @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ListWatchersRequest.h" +#include "common/RWLock.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Utils.h" + +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::ListWatchersRequest: " << this \ + << " " << __func__ << ": " + +static std::ostream& operator<<(std::ostream& os, const obj_watch_t& watch) { + os << "{addr=" << watch.addr << ", " + << "watcher_id=" << watch.watcher_id << ", " + << "cookie=" << watch.cookie << "}"; + return os; +} + +namespace librbd { +namespace image { + +using librados::IoCtx; +using util::create_rados_callback; + +template +ListWatchersRequest::ListWatchersRequest(I &image_ctx, int flags, + std::list *watchers, + Context *on_finish) + : m_image_ctx(image_ctx), m_flags(flags), m_watchers(watchers), + m_on_finish(on_finish), m_cct(m_image_ctx.cct) { + ceph_assert((m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) == 0 || + (m_flags & LIST_WATCHERS_MIRROR_INSTANCES_ONLY) == 0); +} + +template +void ListWatchersRequest::send() { + ldout(m_cct, 20) << dendl; + + list_image_watchers(); +} + +template +void ListWatchersRequest::list_image_watchers() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + op.list_watchers(&m_object_watchers, &m_ret_val); + + using klass = ListWatchersRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, + rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void ListWatchersRequest::handle_list_image_watchers(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == 0 && m_ret_val < 0) { + r = m_ret_val; + } + if (r < 0) { + lderr(m_cct) << "error listing image watchers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ldout(m_cct, 20) << "object_watchers=" << m_object_watchers << dendl; + list_mirror_watchers(); +} + +template +void ListWatchersRequest::list_mirror_watchers() { + if ((m_object_watchers.empty()) || + (m_flags & (LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES | + LIST_WATCHERS_MIRROR_INSTANCES_ONLY)) == 0) { + finish(0); + return; + } + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + op.list_watchers(&m_mirror_watchers, &m_ret_val); + + using klass = ListWatchersRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, rados_completion, + &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void ListWatchersRequest::handle_list_mirror_watchers(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == 0 && m_ret_val < 0) { + r = m_ret_val; + } + if (r < 0 && r != -ENOENT) { + ldout(m_cct, 1) << "error listing mirror watchers: " << cpp_strerror(r) + << dendl; + } + + ldout(m_cct, 20) << "mirror_watchers=" << m_mirror_watchers << dendl; + finish(0); +} + 
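handle_list_mirror_watchers() above only records the raw lists; the actual cross-referencing happens in finish() below, where each image watcher is matched against the mirroring object's watchers by client address. A minimal sketch of that matching follows, with a hypothetical Watch type standing in for librados' obj_watch_t (only the address field participates in the comparison).

#include <algorithm>
#include <cstring>
#include <list>

// Hypothetical stand-in for obj_watch_t.
struct Watch {
  char addr[64];
};

// Keep only the watchers whose address also appears on the mirroring
// object, mimicking the LIST_WATCHERS_MIRROR_INSTANCES_ONLY branch in
// finish() below (the filter-out flag is the same test inverted).
std::list<Watch> mirror_instances_only(const std::list<Watch>& all,
                                       const std::list<Watch>& mirrors) {
  std::list<Watch> out;
  for (const auto& w : all) {
    auto it = std::find_if(mirrors.begin(), mirrors.end(),
                           [&w](const Watch& m) {
                             return std::strncmp(w.addr, m.addr,
                                                 sizeof(w.addr)) == 0;
                           });
    if (it != mirrors.end()) {
      out.push_back(w);
    }
  }
  return out;
}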
+template +void ListWatchersRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == 0) { + m_watchers->clear(); + + if (m_object_watchers.size() > 0) { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + uint64_t watch_handle = m_image_ctx.image_watcher != nullptr ? + m_image_ctx.image_watcher->get_watch_handle() : 0; + + for (auto &w : m_object_watchers) { + if ((m_flags & LIST_WATCHERS_FILTER_OUT_MY_INSTANCE) != 0) { + if (w.cookie == watch_handle) { + ldout(m_cct, 20) << "filtering out my instance: " << w << dendl; + continue; + } + } + auto it = std::find_if(m_mirror_watchers.begin(), + m_mirror_watchers.end(), + [w] (obj_watch_t &watcher) { + return (strncmp(w.addr, watcher.addr, + sizeof(w.addr)) == 0); + }); + if ((m_flags & LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES) != 0) { + if (it != m_mirror_watchers.end()) { + ldout(m_cct, 20) << "filtering out mirror instance: " << w << dendl; + continue; + } + } else if ((m_flags & LIST_WATCHERS_MIRROR_INSTANCES_ONLY) != 0) { + if (it == m_mirror_watchers.end()) { + ldout(m_cct, 20) << "filtering out non-mirror instance: " << w + << dendl; + continue; + } + } + m_watchers->push_back(w); + } + } + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::ListWatchersRequest; diff --git a/src/librbd/image/ListWatchersRequest.h b/src/librbd/image/ListWatchersRequest.h new file mode 100644 index 000000000..2c77254a7 --- /dev/null +++ b/src/librbd/image/ListWatchersRequest.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H +#define CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H + +#include "include/rados/rados_types.hpp" + +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +enum { + LIST_WATCHERS_FILTER_OUT_MY_INSTANCE = 1 << 0, + LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES = 1 << 1, + LIST_WATCHERS_MIRROR_INSTANCES_ONLY = 1 << 3, +}; + +template +class ListWatchersRequest { +public: + static ListWatchersRequest *create(ImageCtxT &image_ctx, int flags, + std::list *watchers, + Context *on_finish) { + return new ListWatchersRequest(image_ctx, flags, watchers, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * LIST_IMAGE_WATCHERS + * | + * v + * LIST_MIRROR_WATCHERS (skip if not needed) + * | + * v + * + * + * @endverbatim + */ + + ListWatchersRequest(ImageCtxT &image_ctx, int flags, std::list *watchers, + Context *on_finish); + + ImageCtxT& m_image_ctx; + int m_flags; + std::list *m_watchers; + Context *m_on_finish; + + CephContext *m_cct; + int m_ret_val; + bufferlist m_out_bl; + std::list m_object_watchers; + std::list m_mirror_watchers; + + void list_image_watchers(); + void handle_list_image_watchers(int r); + + void list_mirror_watchers(); + void handle_list_mirror_watchers(int r); + + void finish(int r); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::ListWatchersRequest; + +#endif // CEPH_LIBRBD_IMAGE_LIST_WATCHERS_REQUEST_H diff --git a/src/librbd/image/OpenRequest.cc b/src/librbd/image/OpenRequest.cc new file mode 100644 index 000000000..70008d712 --- /dev/null +++ b/src/librbd/image/OpenRequest.cc @@ -0,0 +1,727 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/OpenRequest.h" +#include "common/dout.h" +#include "common/errno.h" 
+#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ConfigWatcher.h" +#include "librbd/ImageCtx.h" +#include "librbd/PluginRegistry.h" +#include "librbd/Utils.h" +#include "librbd/cache/ObjectCacherObjectDispatch.h" +#include "librbd/cache/WriteAroundObjectDispatch.h" +#include "librbd/image/CloseRequest.h" +#include "librbd/image/RefreshRequest.h" +#include "librbd/image/SetSnapRequest.h" +#include "librbd/io/SimpleSchedulerObjectDispatch.h" +#include +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::OpenRequest: " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template +OpenRequest::OpenRequest(I *image_ctx, uint64_t flags, + Context *on_finish) + : m_image_ctx(image_ctx), + m_skip_open_parent_image(flags & OPEN_FLAG_SKIP_OPEN_PARENT), + m_on_finish(on_finish), m_error_result(0) { + if ((flags & OPEN_FLAG_OLD_FORMAT) != 0) { + m_image_ctx->old_format = true; + } + if ((flags & OPEN_FLAG_IGNORE_MIGRATING) != 0) { + m_image_ctx->ignore_migrating = true; + } +} + +template +void OpenRequest::send() { + if (m_image_ctx->old_format) { + send_v1_detect_header(); + } else { + send_v2_detect_header(); + } +} + +template +void OpenRequest::send_v1_detect_header() { + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + + using klass = OpenRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(util::old_header_name(m_image_ctx->name), + comp, &op, &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v1_detect_header(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + if (*result != -ENOENT) { + lderr(cct) << "failed to stat image header: " << cpp_strerror(*result) + << dendl; + } + send_close_image(*result); + } else { + ldout(cct, 1) << "RBD image format 1 is deprecated. " + << "Please copy this image to image format 2." 
<< dendl; + + m_image_ctx->old_format = true; + m_image_ctx->header_oid = util::old_header_name(m_image_ctx->name); + m_image_ctx->apply_metadata({}, true); + + send_refresh(); + } + return nullptr; +} + +template +void OpenRequest::send_v2_detect_header() { + if (m_image_ctx->id.empty()) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + + using klass = OpenRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name), + comp, &op, &m_out_bl); + comp->release(); + } else { + send_v2_get_name(); + } +} + +template +Context *OpenRequest::handle_v2_detect_header(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == -ENOENT) { + send_v1_detect_header(); + } else if (*result < 0) { + lderr(cct) << "failed to stat v2 image header: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + } else { + m_image_ctx->old_format = false; + send_v2_get_id(); + } + return nullptr; +} + +template +void OpenRequest::send_v2_get_id() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_id_start(&op); + + using klass = OpenRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name), + comp, &op, &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_id(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_id_finish(&it, &m_image_ctx->id); + } + if (*result < 0) { + lderr(cct) << "failed to retrieve image id: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + } else { + send_v2_get_initial_metadata(); + } + return nullptr; +} + +template +void OpenRequest::send_v2_get_name() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::dir_get_name_start(&op, m_image_ctx->id); + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_name>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_name(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::dir_get_name_finish(&it, &m_image_ctx->name); + } + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to retrieve name: " + << cpp_strerror(*result) << dendl; + send_close_image(*result); + } else if (*result == -ENOENT) { + // image does not exist in directory, look in the trash bin + ldout(cct, 10) << "image id " << m_image_ctx->id << " does not exist in " + << "rbd directory, searching in rbd trash..." 
<< dendl; + send_v2_get_name_from_trash(); + } else { + send_v2_get_initial_metadata(); + } + return nullptr; +} + +template +void OpenRequest::send_v2_get_name_from_trash() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::trash_get_start(&op, m_image_ctx->id); + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_name_from_trash>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(RBD_TRASH, comp, &op, &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_name_from_trash(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + cls::rbd::TrashImageSpec trash_spec; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::trash_get_finish(&it, &trash_spec); + m_image_ctx->name = trash_spec.name; + } + if (*result < 0) { + if (*result == -EOPNOTSUPP) { + *result = -ENOENT; + } + if (*result == -ENOENT) { + ldout(cct, 5) << "failed to retrieve name for image id " + << m_image_ctx->id << dendl; + } else { + lderr(cct) << "failed to retrieve name from trash: " + << cpp_strerror(*result) << dendl; + } + send_close_image(*result); + } else { + send_v2_get_initial_metadata(); + } + + return nullptr; +} + +template +void OpenRequest::send_v2_get_initial_metadata() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->old_format = false; + m_image_ctx->header_oid = util::header_name(m_image_ctx->id); + + librados::ObjectReadOperation op; + cls_client::get_size_start(&op, CEPH_NOSNAP); + cls_client::get_object_prefix_start(&op); + cls_client::get_features_start(&op, true); + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_initial_metadata>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_initial_metadata(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + if (*result >= 0) { + uint64_t size; + *result = cls_client::get_size_finish(&it, &size, &m_image_ctx->order); + } + + if (*result >= 0) { + *result = cls_client::get_object_prefix_finish(&it, + &m_image_ctx->object_prefix); + } + + if (*result >= 0) { + uint64_t incompatible_features; + *result = cls_client::get_features_finish(&it, &m_image_ctx->features, + &incompatible_features); + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve initial metadata: " + << cpp_strerror(*result) << dendl; + send_close_image(*result); + return nullptr; + } + + if (m_image_ctx->test_features(RBD_FEATURE_STRIPINGV2)) { + send_v2_get_stripe_unit_count(); + } else { + send_v2_get_create_timestamp(); + } + + return nullptr; +} + +template +void OpenRequest::send_v2_get_stripe_unit_count() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_stripe_unit_count_start(&op); + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_stripe_unit_count>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + 
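Every step of this open state machine repeats the same asynchronous continuation idiom seen above: a send_* method builds a librados read operation, registers the matching handle_* member function as its completion, and returns immediately; the handler then either chains to the next send_* step or short-circuits to the close path on error. The self-contained C++ sketch below illustrates that send_*/handle_* chaining in miniature; OpenFlow and complete_later() are hypothetical stand-ins for the request class and the AioCompletion machinery, so treat it as an illustration of the pattern, not part of the upstream patch.

#include <functional>
#include <iostream>

class OpenFlow {
public:
  explicit OpenFlow(std::function<void(int)> on_finish)
    : m_on_finish(std::move(on_finish)) {}

  void send() { send_get_metadata(); }

private:
  std::function<void(int)> m_on_finish;

  // each send_* issues an "async" op whose completion is the matching handle_*
  void send_get_metadata() {
    complete_later([this](int r) { handle_get_metadata(r); });
  }
  void handle_get_metadata(int r) {
    if (r < 0) { finish(r); return; }  // errors short-circuit to completion
    send_register_watch();             // success chains to the next step
  }

  void send_register_watch() {
    complete_later([this](int r) { handle_register_watch(r); });
  }
  void handle_register_watch(int r) { finish(r); }

  void finish(int r) { m_on_finish(r); }

  // stand-in for aio_operate() + AioCompletion: here it simply invokes the
  // callback inline with a success code
  static void complete_later(const std::function<void(int)>& cb) { cb(0); }
};

int main() {
  OpenFlow flow([](int r) { std::cout << "open finished: r=" << r << "\n"; });
  flow.send();
}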
+template +Context *OpenRequest::handle_v2_get_stripe_unit_count(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_stripe_unit_count_finish( + &it, &m_image_ctx->stripe_unit, &m_image_ctx->stripe_count); + } + + if (*result == -ENOEXEC || *result == -EINVAL) { + *result = 0; + } + + if (*result < 0) { + lderr(cct) << "failed to read striping metadata: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_v2_get_create_timestamp(); + return nullptr; +} + +template +void OpenRequest::send_v2_get_create_timestamp() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_create_timestamp_start(&op); + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_create_timestamp>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_create_timestamp(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_create_timestamp_finish(&it, + &m_image_ctx->create_timestamp); + } + if (*result < 0 && *result != -EOPNOTSUPP) { + lderr(cct) << "failed to retrieve create_timestamp: " + << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_v2_get_access_modify_timestamp(); + return nullptr; +} + +template +void OpenRequest::send_v2_get_access_modify_timestamp() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_access_timestamp_start(&op); + cls_client::get_modify_timestamp_start(&op); + //TODO: merge w/ create timestamp query after luminous EOLed + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_access_modify_timestamp>(this); + m_out_bl.clear(); + m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_access_modify_timestamp(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_access_timestamp_finish(&it, + &m_image_ctx->access_timestamp); + if (*result == 0) + *result = cls_client::get_modify_timestamp_finish(&it, + &m_image_ctx->modify_timestamp); + } + if (*result < 0 && *result != -EOPNOTSUPP) { + lderr(cct) << "failed to retrieve access/modify_timestamp: " + << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_v2_get_data_pool(); + return nullptr; +} + +template +void OpenRequest::send_v2_get_data_pool() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::get_data_pool_start(&op); + + using klass = OpenRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_data_pool>(this); + m_out_bl.clear(); + 
m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template +Context *OpenRequest::handle_v2_get_data_pool(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + int64_t data_pool_id = -1; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::get_data_pool_finish(&it, &data_pool_id); + } else if (*result == -EOPNOTSUPP) { + *result = 0; + } + + if (*result < 0) { + lderr(cct) << "failed to read data pool: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + if (data_pool_id != -1) { + *result = util::create_ioctx(m_image_ctx->md_ctx, "data pool", data_pool_id, + {}, &m_image_ctx->data_ctx); + if (*result < 0) { + if (*result != -ENOENT) { + send_close_image(*result); + return nullptr; + } + m_image_ctx->data_ctx.close(); + } else { + m_image_ctx->rebuild_data_io_context(); + } + } else { + data_pool_id = m_image_ctx->md_ctx.get_id(); + } + + m_image_ctx->init_layout(data_pool_id); + send_refresh(); + return nullptr; +} + +template +void OpenRequest::send_refresh() { + m_image_ctx->init(); + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_image_ctx->config_watcher = ConfigWatcher::create(*m_image_ctx); + m_image_ctx->config_watcher->init(); + + using klass = OpenRequest; + RefreshRequest *req = RefreshRequest::create( + *m_image_ctx, false, m_skip_open_parent_image, + create_context_callback(this)); + req->send(); +} + +template +Context *OpenRequest::handle_refresh(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + send_init_plugin_registry(); + return nullptr; +} + +template +void OpenRequest::send_init_plugin_registry() { + CephContext *cct = m_image_ctx->cct; + + auto plugins = m_image_ctx->config.template get_val( + "rbd_plugins"); + ldout(cct, 10) << __func__ << ": plugins=" << plugins << dendl; + + auto ctx = create_context_callback< + OpenRequest, &OpenRequest::handle_init_plugin_registry>(this); + m_image_ctx->plugin_registry->init(plugins, ctx); +} + +template +Context* OpenRequest::handle_init_plugin_registry(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize plugin registry: " + << cpp_strerror(*result) << dendl; + send_close_image(*result); + return nullptr; + } + + return send_init_cache(result); +} + +template +Context *OpenRequest::send_init_cache(int *result) { + if (!m_image_ctx->cache || m_image_ctx->child != nullptr || + !m_image_ctx->data_ctx.is_valid()) { + return send_register_watch(result); + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + size_t max_dirty = m_image_ctx->config.template get_val( + "rbd_cache_max_dirty"); + auto writethrough_until_flush = m_image_ctx->config.template get_val( + "rbd_cache_writethrough_until_flush"); + auto cache_policy = m_image_ctx->config.template get_val( + "rbd_cache_policy"); + if (cache_policy == "writearound") { + auto cache = cache::WriteAroundObjectDispatch::create( + m_image_ctx, max_dirty, writethrough_until_flush); + cache->init(); + + m_image_ctx->readahead.set_max_readahead_size(0); + 
} else if (cache_policy == "writethrough" || cache_policy == "writeback") { + if (cache_policy == "writethrough") { + max_dirty = 0; + } + + auto cache = cache::ObjectCacherObjectDispatch::create( + m_image_ctx, max_dirty, writethrough_until_flush); + cache->init(); + + // readahead requires the object cacher cache + m_image_ctx->readahead.set_trigger_requests( + m_image_ctx->config.template get_val("rbd_readahead_trigger_requests")); + m_image_ctx->readahead.set_max_readahead_size( + m_image_ctx->config.template get_val("rbd_readahead_max_bytes")); + } + return send_register_watch(result); +} + +template +Context *OpenRequest::send_register_watch(int *result) { + if ((m_image_ctx->read_only_flags & IMAGE_READ_ONLY_FLAG_USER) != 0U) { + return send_set_snap(result); + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = OpenRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_register_watch>(this); + m_image_ctx->register_watch(ctx); + return nullptr; +} + +template +Context *OpenRequest::handle_register_watch(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -EPERM) { + ldout(cct, 5) << "user does not have write permission" << dendl; + send_close_image(*result); + return nullptr; + } else if (*result < 0) { + lderr(cct) << "failed to register watch: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + return send_set_snap(result); +} + +template +Context *OpenRequest::send_set_snap(int *result) { + if (m_image_ctx->snap_name.empty() && + m_image_ctx->open_snap_id == CEPH_NOSNAP) { + *result = 0; + return finalize(*result); + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + uint64_t snap_id = CEPH_NOSNAP; + std::swap(m_image_ctx->open_snap_id, snap_id); + if (snap_id == CEPH_NOSNAP) { + std::shared_lock image_locker{m_image_ctx->image_lock}; + snap_id = m_image_ctx->get_snap_id(m_image_ctx->snap_namespace, + m_image_ctx->snap_name); + } + if (snap_id == CEPH_NOSNAP) { + lderr(cct) << "failed to find snapshot " << m_image_ctx->snap_name << dendl; + send_close_image(-ENOENT); + return nullptr; + } + + using klass = OpenRequest; + SetSnapRequest *req = SetSnapRequest::create( + *m_image_ctx, snap_id, + create_context_callback(this)); + req->send(); + return nullptr; +} + +template +Context *OpenRequest::handle_set_snap(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to set image snapshot: " << cpp_strerror(*result) + << dendl; + send_close_image(*result); + return nullptr; + } + + return finalize(*result); +} + +template +Context *OpenRequest::finalize(int r) { + if (r == 0) { + auto io_scheduler_cfg = + m_image_ctx->config.template get_val("rbd_io_scheduler"); + + if (io_scheduler_cfg == "simple" && !m_image_ctx->read_only) { + auto io_scheduler = + io::SimpleSchedulerObjectDispatch::create(m_image_ctx); + io_scheduler->init(); + } + } + + return m_on_finish; +} + +template +void OpenRequest::send_close_image(int error_result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_error_result = error_result; + + using klass = OpenRequest; + Context *ctx = create_context_callback( + this); + CloseRequest *req = CloseRequest::create(m_image_ctx, ctx); + req->send(); 
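+  // note: handle_close_image() below substitutes m_error_result for the
+  // close result, so the original open failure (if any) is what the caller
+  // ultimately sees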
+}
+
+template <typename I>
+Context *OpenRequest<I>::handle_close_image(int *result) {
+  CephContext *cct = m_image_ctx->cct;
+  ldout(cct, 10) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to close image: " << cpp_strerror(*result) << dendl;
+  }
+  if (m_error_result < 0) {
+    *result = m_error_result;
+  }
+  return m_on_finish;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::OpenRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/OpenRequest.h b/src/librbd/image/OpenRequest.h
new file mode 100644
index 000000000..0fe218a39
--- /dev/null
+++ b/src/librbd/image/OpenRequest.h
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H
+
+#include "include/buffer.h"
+#include
+#include
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename ImageCtxT = ImageCtx>
+class OpenRequest {
+public:
+  static OpenRequest *create(ImageCtxT *image_ctx, uint64_t flags,
+                             Context *on_finish) {
+    return new OpenRequest(image_ctx, flags, on_finish);
+  }
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    | (v1)
+   *    |-----> V1_DETECT_HEADER
+   *    |           |
+   *    |           \-------------------------------\
+   *    | (v2)                                      |
+   *    \-----> V2_DETECT_HEADER                    |
+   *                |                               |
+   *                v                               |
+   *            V2_GET_ID|NAME                      |
+   *                |                               |
+   *                v (skip if have name)           |
+   *            V2_GET_NAME_FROM_TRASH              |
+   *                |                               |
+   *                v                               |
+   *            V2_GET_INITIAL_METADATA             |
+   *                |                               |
+   *                v                               |
+   *            V2_GET_STRIPE_UNIT_COUNT (skip if   |
+   *                |                     disabled) |
+   *                v                               |
+   *            V2_GET_CREATE_TIMESTAMP             |
+   *                |                               |
+   *                v                               |
+   *            V2_GET_ACCESS_MODIFY_TIMESTAMP      |
+   *                |                               |
+   *                v                               |
+   *            V2_GET_DATA_POOL --------------> REFRESH
+   *                                                |
+   *                                                v
+   *                                             INIT_PLUGIN_REGISTRY
+   *                                                |
+   *                                                v
+   *                                             INIT_CACHE
+   *                                                |
+   *                                                v
+   *                                             REGISTER_WATCH (skip if
+   *                                                |            read-only)
+   *                                                v
+   *                                             SET_SNAP (skip if no snap)
+   *                                                |
+   *                                                v
+   *                                             <finish>
+   *                                                ^
+   *                                     (on error) |
+   *    * * * * * * > CLOSE ------------------------/
+   *
+   * @endverbatim
+   */
+
+  OpenRequest(ImageCtxT *image_ctx, uint64_t flags, Context *on_finish);
+
+  ImageCtxT *m_image_ctx;
+  bool m_skip_open_parent_image;
+  Context *m_on_finish;
+
+  bufferlist m_out_bl;
+  int m_error_result;
+
+  void send_v1_detect_header();
+  Context *handle_v1_detect_header(int *result);
+
+  void send_v2_detect_header();
+  Context *handle_v2_detect_header(int *result);
+
+  void send_v2_get_id();
+  Context *handle_v2_get_id(int *result);
+
+  void send_v2_get_name();
+  Context *handle_v2_get_name(int *result);
+
+  void send_v2_get_name_from_trash();
+  Context *handle_v2_get_name_from_trash(int *result);
+
+  void send_v2_get_initial_metadata();
+  Context *handle_v2_get_initial_metadata(int *result);
+
+  void send_v2_get_stripe_unit_count();
+  Context *handle_v2_get_stripe_unit_count(int *result);
+
+  void send_v2_get_create_timestamp();
+  Context *handle_v2_get_create_timestamp(int *result);
+
+  void send_v2_get_access_modify_timestamp();
+  Context *handle_v2_get_access_modify_timestamp(int *result);
+
+  void send_v2_get_data_pool();
+  Context *handle_v2_get_data_pool(int *result);
+
+  void send_refresh();
+  Context *handle_refresh(int *result);
+
+  void send_init_plugin_registry();
+  Context* handle_init_plugin_registry(int *result);
+
+  Context *send_init_cache(int *result);
+
+  Context *send_register_watch(int *result);
+  Context *handle_register_watch(int *result);
+
+  Context *send_set_snap(int *result);
+  Context *handle_set_snap(int *result);
+
+  Context *finalize(int r);
+
+  void send_close_image(int error_result);
Context *handle_close_image(int *result); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::OpenRequest; + +#endif // CEPH_LIBRBD_IMAGE_OPEN_REQUEST_H diff --git a/src/librbd/image/PreRemoveRequest.cc b/src/librbd/image/PreRemoveRequest.cc new file mode 100644 index 000000000..fa4141834 --- /dev/null +++ b/src/librbd/image/PreRemoveRequest.cc @@ -0,0 +1,348 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/PreRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/Utils.h" +#include "librbd/exclusive_lock/StandardPolicy.h" +#include "librbd/image/ListWatchersRequest.h" +#include "librbd/journal/DisabledPolicy.h" +#include "librbd/operation/SnapshotRemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::PreRemoveRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace image { + +namespace { + +bool auto_delete_snapshot(const SnapInfo& snap_info) { + auto snap_namespace_type = cls::rbd::get_snap_namespace_type( + snap_info.snap_namespace); + switch (snap_namespace_type) { + case cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH: + return true; + default: + return false; + } +} + +bool ignore_snapshot(const SnapInfo& snap_info) { + auto snap_namespace_type = cls::rbd::get_snap_namespace_type( + snap_info.snap_namespace); + switch (snap_namespace_type) { + case cls::rbd::SNAPSHOT_NAMESPACE_TYPE_MIRROR: + return true; + default: + return false; + } +} + +} // anonymous namespace + +using util::create_context_callback; +using util::create_rados_callback; + +template +void PreRemoveRequest::send() { + auto cct = m_image_ctx->cct; + if (m_image_ctx->operations_disabled) { + lderr(cct) << "image operations disabled due to unsupported op features" + << dendl; + finish(-EROFS); + return; + } + + acquire_exclusive_lock(); +} + +template +void PreRemoveRequest::acquire_exclusive_lock() { + // lock for write for set_exclusive_lock_policy() + std::unique_lock owner_locker{m_image_ctx->owner_lock}; + if (m_image_ctx->exclusive_lock == nullptr) { + owner_locker.unlock(); + validate_image_removal(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // refuse to release exclusive lock when (in the midst of) removing + // the image + m_image_ctx->set_exclusive_lock_policy( + new exclusive_lock::StandardPolicy(m_image_ctx)); + + // do not attempt to open the journal when removing the image in case + // it's corrupt + if (m_image_ctx->test_features(RBD_FEATURE_JOURNALING)) { + std::unique_lock image_locker{m_image_ctx->image_lock}; + m_image_ctx->set_journal_policy(new journal::DisabledPolicy()); + } + + m_exclusive_lock = m_image_ctx->exclusive_lock; + + auto ctx = create_context_callback< + PreRemoveRequest, + &PreRemoveRequest::handle_exclusive_lock>(this, m_exclusive_lock); + m_exclusive_lock->acquire_lock(ctx); +} + +template +void PreRemoveRequest::handle_exclusive_lock(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 || !m_image_ctx->exclusive_lock->is_lock_owner()) { + if (!m_force) { + lderr(cct) << "cannot obtain exclusive lock - not removing" << dendl; + finish(-EBUSY); + } else { + ldout(cct, 5) << "cannot obtain exclusive lock - " + << "proceeding due to force flag set" << dendl; + shut_down_exclusive_lock(); + } + return; + } 
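+  // reaching this point means the lock was acquired and this client owns it;
+  // the force path above detours through shut_down_exclusive_lock() instead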
+ + validate_image_removal(); +} + +template +void PreRemoveRequest::shut_down_exclusive_lock() { + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + if (m_image_ctx->exclusive_lock == nullptr) { + owner_locker.unlock(); + validate_image_removal(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + PreRemoveRequest, + &PreRemoveRequest::handle_shut_down_exclusive_lock>(this); + + m_exclusive_lock = m_image_ctx->exclusive_lock; + m_exclusive_lock->shut_down(ctx); +} + +template +void PreRemoveRequest::handle_shut_down_exclusive_lock(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + m_exclusive_lock->put(); + m_exclusive_lock = nullptr; + + if (r < 0) { + lderr(cct) << "error shutting down exclusive lock: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ceph_assert(m_image_ctx->exclusive_lock == nullptr); + validate_image_removal(); +} + +template +void PreRemoveRequest::validate_image_removal() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + if (!m_image_ctx->ignore_migrating && + m_image_ctx->test_features(RBD_FEATURE_MIGRATING)) { + lderr(cct) << "image in migration state - not removing" << dendl; + finish(-EBUSY); + return; + } + + check_image_snaps(); +} + +template +void PreRemoveRequest::check_image_snaps() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_image_ctx->image_lock.lock_shared(); + for (auto& snap_info : m_image_ctx->snap_info) { + if (auto_delete_snapshot(snap_info.second)) { + m_snap_infos.insert(snap_info); + } else if (!ignore_snapshot(snap_info.second)) { + m_image_ctx->image_lock.unlock_shared(); + + ldout(cct, 5) << "image has snapshots - not removing" << dendl; + finish(-ENOTEMPTY); + return; + } + } + m_image_ctx->image_lock.unlock_shared(); + + list_image_watchers(); +} + +template +void PreRemoveRequest::list_image_watchers() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + int flags = LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + LIST_WATCHERS_FILTER_OUT_MIRROR_INSTANCES; + auto ctx = create_context_callback< + PreRemoveRequest, + &PreRemoveRequest::handle_list_image_watchers>(this); + auto req = ListWatchersRequest::create(*m_image_ctx, flags, &m_watchers, + ctx); + req->send(); +} + +template +void PreRemoveRequest::handle_list_image_watchers(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "error listing image watchers: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + check_image_watchers(); +} + +template +void PreRemoveRequest::check_image_watchers() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + if (!m_watchers.empty()) { + lderr(cct) << "image has watchers - not removing" << dendl; + finish(-EBUSY); + return; + } + + check_group(); +} + +template +void PreRemoveRequest::check_group() { + if (m_image_ctx->old_format) { + finish(0); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::image_group_get_start(&op); + + auto rados_completion = create_rados_callback< + PreRemoveRequest, &PreRemoveRequest::handle_check_group>(this); + m_out_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, + rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void PreRemoveRequest::handle_check_group(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << 
dendl; + + cls::rbd::GroupSpec s; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::image_group_get_finish(&it, &s); + } + if (r < 0 && r != -EOPNOTSUPP) { + lderr(cct) << "error fetching group for image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (s.is_valid()) { + lderr(cct) << "image is in a group - not removing" << dendl; + finish(-EMLINK); + return; + } + + remove_snapshot(); +} + +template +void PreRemoveRequest::remove_snapshot() { + if (m_snap_infos.empty()) { + finish(0); + return; + } + + auto cct = m_image_ctx->cct; + auto snap_id = m_snap_infos.begin()->first; + auto& snap_info = m_snap_infos.begin()->second; + ldout(cct, 20) << "snap_id=" << snap_id << ", " + << "snap_name=" << snap_info.name << dendl; + + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + auto ctx = create_context_callback< + PreRemoveRequest, &PreRemoveRequest::handle_remove_snapshot>(this); + auto req = librbd::operation::SnapshotRemoveRequest::create( + *m_image_ctx, snap_info.snap_namespace, snap_info.name, + snap_id, ctx); + req->send(); + +} + +template +void PreRemoveRequest::handle_remove_snapshot(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -EBUSY) { + ldout(cct, 5) << "skipping attached child" << dendl; + if (m_ret_val == 0) { + m_ret_val = -ECHILD; + } + } else if (r < 0 && r != -ENOENT) { + auto snap_id = m_snap_infos.begin()->first; + lderr(cct) << "failed to auto-prune snapshot " << snap_id << ": " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + ceph_assert(!m_snap_infos.empty()); + m_snap_infos.erase(m_snap_infos.begin()); + + remove_snapshot(); +} + +template +void PreRemoveRequest::finish(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (m_ret_val == 0) { + m_ret_val = r; + } + + m_on_finish->complete(m_ret_val); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::PreRemoveRequest; diff --git a/src/librbd/image/PreRemoveRequest.h b/src/librbd/image/PreRemoveRequest.h new file mode 100644 index 000000000..06b3bf2f8 --- /dev/null +++ b/src/librbd/image/PreRemoveRequest.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/buffer.h" +#include "librbd/ImageCtx.h" +#include +#include + +class Context; + +namespace librbd { +namespace image { + +template +class PreRemoveRequest { +public: + + static PreRemoveRequest *create(ImageCtxT *image_ctx, bool force, + Context *on_finish) { + return new PreRemoveRequest(image_ctx, force, on_finish); + } + + PreRemoveRequest(ImageCtxT *image_ctx, bool force, Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | (skip if + * v not needed) (error) + * ACQUIRE EXCLUSIVE LOCK * * * * * * > SHUT DOWN EXCLUSIVE LOCK + * | | + * v | + * CHECK IMAGE WATCHERS <------------------/ + * | + * v + * CHECK GROUP + * | + * | /------\ + * | | | + * v v | + * REMOVE SNAPS ----/ + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT* m_image_ctx; + bool m_force; + Context* m_on_finish; + + decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr; + + bufferlist m_out_bl; + std::list m_watchers; + + std::map m_snap_infos; + int m_ret_val = 0; + + void 
acquire_exclusive_lock(); + void handle_exclusive_lock(int r); + + void shut_down_exclusive_lock(); + void handle_shut_down_exclusive_lock(int r); + + void validate_image_removal(); + void check_image_snaps(); + + void list_image_watchers(); + void handle_list_image_watchers(int r); + + void check_image_watchers(); + + void check_group(); + void handle_check_group(int r); + + void remove_snapshot(); + void handle_remove_snapshot(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::PreRemoveRequest; + +#endif // CEPH_LIBRBD_IMAGE_PRE_REMOVE_REQUEST_H diff --git a/src/librbd/image/RefreshParentRequest.cc b/src/librbd/image/RefreshParentRequest.cc new file mode 100644 index 000000000..348226c39 --- /dev/null +++ b/src/librbd/image/RefreshParentRequest.cc @@ -0,0 +1,244 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/RefreshParentRequest.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/migration/OpenSourceImageRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::RefreshParentRequest: " + +namespace librbd { +namespace image { + +using util::create_async_context_callback; +using util::create_context_callback; + +template +RefreshParentRequest::RefreshParentRequest( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info, Context *on_finish) + : m_child_image_ctx(child_image_ctx), m_parent_md(parent_md), + m_migration_info(migration_info), m_on_finish(on_finish), + m_parent_image_ctx(nullptr), m_parent_snap_id(CEPH_NOSNAP), + m_error_result(0) { +} + +template +bool RefreshParentRequest::is_refresh_required( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + ceph_assert(ceph_mutex_is_locked(child_image_ctx.image_lock)); + return (is_open_required(child_image_ctx, parent_md, migration_info) || + is_close_required(child_image_ctx, parent_md, migration_info)); +} + +template +bool RefreshParentRequest::is_close_required( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + return (child_image_ctx.parent != nullptr && + !does_parent_exist(child_image_ctx, parent_md, migration_info)); +} + +template +bool RefreshParentRequest::is_open_required( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + return (does_parent_exist(child_image_ctx, parent_md, migration_info) && + (child_image_ctx.parent == nullptr || + child_image_ctx.parent->md_ctx.get_id() != parent_md.spec.pool_id || + child_image_ctx.parent->md_ctx.get_namespace() != + parent_md.spec.pool_namespace || + child_image_ctx.parent->id != parent_md.spec.image_id || + child_image_ctx.parent->snap_id != parent_md.spec.snap_id)); +} + +template +bool RefreshParentRequest::does_parent_exist( + I &child_image_ctx, const ParentImageInfo &parent_md, + const MigrationInfo &migration_info) { + if (child_image_ctx.child != nullptr && + child_image_ctx.child->migration_info.empty() && parent_md.overlap == 0) { + // intermediate, non-migrating images should only open their parent if they + // overlap + return false; + } + + return 
(parent_md.spec.pool_id > -1 && parent_md.overlap > 0) || + !migration_info.empty(); +} + +template +void RefreshParentRequest::send() { + if (is_open_required(m_child_image_ctx, m_parent_md, m_migration_info)) { + send_open_parent(); + } else { + // parent will be closed (if necessary) during finalize + send_complete(0); + } +} + +template +void RefreshParentRequest::apply() { + ceph_assert(ceph_mutex_is_wlocked(m_child_image_ctx.image_lock)); + std::swap(m_child_image_ctx.parent, m_parent_image_ctx); +} + +template +void RefreshParentRequest::finalize(Context *on_finish) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_finish = on_finish; + if (m_parent_image_ctx != nullptr) { + send_close_parent(); + } else { + send_complete(0); + } +} + +template +void RefreshParentRequest::send_open_parent() { + ceph_assert(m_parent_md.spec.pool_id >= 0); + + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + if (!m_migration_info.empty()) { + auto ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + RefreshParentRequest, + &RefreshParentRequest::handle_open_parent, false>(this)); + auto req = migration::OpenSourceImageRequest::create( + m_child_image_ctx.md_ctx, &m_child_image_ctx, m_parent_md.spec.snap_id, + m_migration_info, &m_parent_image_ctx, ctx); + req->send(); + return; + } + + librados::IoCtx parent_io_ctx; + int r = util::create_ioctx(m_child_image_ctx.md_ctx, "parent image", + m_parent_md.spec.pool_id, + m_parent_md.spec.pool_namespace, &parent_io_ctx); + if (r < 0) { + send_complete(r); + return; + } + + m_parent_image_ctx = new I("", m_parent_md.spec.image_id, + m_parent_md.spec.snap_id, parent_io_ctx, true); + m_parent_image_ctx->child = &m_child_image_ctx; + + // set rados flags for reading the parent image + if (m_child_image_ctx.config.template get_val("rbd_balance_parent_reads")) { + m_parent_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS); + } else if (m_child_image_ctx.config.template get_val("rbd_localize_parent_reads")) { + m_parent_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS); + } + + auto ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + RefreshParentRequest, + &RefreshParentRequest::handle_open_parent, false>(this)); + m_parent_image_ctx->state->open(0U, ctx); +} + +template +Context *RefreshParentRequest::handle_open_parent(int *result) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl; + + save_result(result); + if (*result < 0) { + lderr(cct) << "failed to open parent image: " << cpp_strerror(*result) + << dendl; + + // image already closed by open state machine + m_parent_image_ctx = nullptr; + } + + return m_on_finish; +} + +template +void RefreshParentRequest::send_close_parent() { + ceph_assert(m_parent_image_ctx != nullptr); + + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + auto ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + RefreshParentRequest, + &RefreshParentRequest::handle_close_parent, false>(this)); + m_parent_image_ctx->state->close(ctx); +} + +template +Context *RefreshParentRequest::handle_close_parent(int *result) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl; + + m_parent_image_ctx = nullptr; + + if (*result < 0) { + 
lderr(cct) << "failed to close parent image: " << cpp_strerror(*result) + << dendl; + } + + send_reset_existence_cache(); + return nullptr; +} + +template +void RefreshParentRequest::send_reset_existence_cache() { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + Context *ctx = create_async_context_callback( + m_child_image_ctx, create_context_callback< + RefreshParentRequest, + &RefreshParentRequest::handle_reset_existence_cache, false>(this)); + m_child_image_ctx.io_object_dispatcher->reset_existence_cache(ctx); +} + +template +Context *RefreshParentRequest::handle_reset_existence_cache(int *result) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << " r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to reset object existence cache: " + << cpp_strerror(*result) << dendl; + } + + if (m_error_result < 0) { + // propagate errors from opening the image + *result = m_error_result; + } else { + *result = 0; + } + return m_on_finish; +} + +template +void RefreshParentRequest::send_complete(int r) { + CephContext *cct = m_child_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_finish->complete(r); +} + +} // namespace image +} // namespace librbd + +template class librbd::image::RefreshParentRequest; diff --git a/src/librbd/image/RefreshParentRequest.h b/src/librbd/image/RefreshParentRequest.h new file mode 100644 index 000000000..086d8ec1b --- /dev/null +++ b/src/librbd/image/RefreshParentRequest.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H +#define CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H + +#include "include/int_types.h" +#include "librbd/Types.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class RefreshParentRequest { +public: + static RefreshParentRequest *create(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info, + Context *on_finish) { + return new RefreshParentRequest(child_image_ctx, parent_md, migration_info, + on_finish); + } + + static bool is_refresh_required(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + + void send(); + void apply(); + void finalize(Context *on_finish); + +private: + /** + * @verbatim + * + * + * | + * | (open required) + * |----------------> OPEN_PARENT * * * * * * * * * * * * * * * + * | | * + * | v (on error) * + * \----------------> * + * | * + * | (close required) * + * |-----------------> CLOSE_PARENT * + * | | * + * | v * + * | RESET_EXISTENCE * + * | | * + * | v * + * \-----------------> < * * * * + * + * @endverbatim + */ + + RefreshParentRequest(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info, Context *on_finish); + + ImageCtxT &m_child_image_ctx; + ParentImageInfo m_parent_md; + MigrationInfo m_migration_info; + Context *m_on_finish; + + ImageCtxT *m_parent_image_ctx; + uint64_t m_parent_snap_id; + + int m_error_result; + + static bool is_close_required(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + static bool is_open_required(ImageCtxT &child_image_ctx, + const ParentImageInfo &parent_md, + const MigrationInfo &migration_info); + static bool does_parent_exist(ImageCtxT &child_image_ctx, + const ParentImageInfo 
&parent_md, + const MigrationInfo &migration_info); + + void send_open_parent(); + Context *handle_open_parent(int *result); + + void send_close_parent(); + Context *handle_close_parent(int *result); + + void send_reset_existence_cache(); + Context *handle_reset_existence_cache(int *result); + + void send_complete(int r); + + void save_result(int *result) { + if (m_error_result == 0 && *result < 0) { + m_error_result = *result; + } + } + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::RefreshParentRequest; + +#endif // CEPH_LIBRBD_IMAGE_REFRESH_PARENT_REQUEST_H diff --git a/src/librbd/image/RefreshRequest.cc b/src/librbd/image/RefreshRequest.cc new file mode 100644 index 000000000..24159c55b --- /dev/null +++ b/src/librbd/image/RefreshRequest.cc @@ -0,0 +1,1575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/ceph_assert.h" + +#include "librbd/image/RefreshRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/deep_copy/Utils.h" +#include "librbd/image/GetMetadataRequest.h" +#include "librbd/image/RefreshParentRequest.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/journal/Policy.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::RefreshRequest: " + +namespace librbd { +namespace image { + +using util::create_rados_callback; +using util::create_async_context_callback; +using util::create_context_callback; + +template +RefreshRequest::RefreshRequest(I &image_ctx, bool acquiring_lock, + bool skip_open_parent, Context *on_finish) + : m_image_ctx(image_ctx), m_acquiring_lock(acquiring_lock), + m_skip_open_parent_image(skip_open_parent), + m_on_finish(create_async_context_callback(m_image_ctx, on_finish)), + m_error_result(0), m_flush_aio(false), m_exclusive_lock(nullptr), + m_object_map(nullptr), m_journal(nullptr), m_refresh_parent(nullptr) { + m_pool_metadata_io_ctx.dup(image_ctx.md_ctx); + m_pool_metadata_io_ctx.set_namespace(""); +} + +template +RefreshRequest::~RefreshRequest() { + // these require state machine to close + ceph_assert(m_exclusive_lock == nullptr); + ceph_assert(m_object_map == nullptr); + ceph_assert(m_journal == nullptr); + ceph_assert(m_refresh_parent == nullptr); + ceph_assert(!m_blocked_writes); +} + +template +void RefreshRequest::send() { + if (m_image_ctx.old_format) { + send_v1_read_header(); + } else { + send_v2_get_mutable_metadata(); + } +} + +template +void RefreshRequest::send_get_migration_header() { + if (m_image_ctx.ignore_migrating) { + m_migration_spec = {}; + if (m_image_ctx.old_format) { + send_v1_get_snapshots(); + } else { + send_v2_get_metadata(); + } + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::migration_get_start(&op); + + using klass = RefreshRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + comp->release(); +} + +template +Context 
*RefreshRequest::handle_get_migration_header(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result >= 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::migration_get_finish(&it, &m_migration_spec); + } else if (*result == -ENOENT) { + ldout(cct, 5) << this << " " << __func__ << ": no migration header found" + << ", retrying" << dendl; + send(); + return nullptr; + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve migration header: " + << cpp_strerror(*result) << dendl; + return m_on_finish; + } + + switch(m_migration_spec.header_type) { + case cls::rbd::MIGRATION_HEADER_TYPE_SRC: + if (!m_read_only) { + lderr(cct) << "image being migrated" << dendl; + *result = -EROFS; + return m_on_finish; + } + ldout(cct, 1) << this << " " << __func__ << ": migrating to: " + << m_migration_spec << dendl; + break; + case cls::rbd::MIGRATION_HEADER_TYPE_DST: + ldout(cct, 1) << this << " " << __func__ << ": migrating from: " + << m_migration_spec << dendl; + switch (m_migration_spec.state) { + case cls::rbd::MIGRATION_STATE_PREPARING: + ldout(cct, 5) << this << " " << __func__ << ": current migration state: " + << m_migration_spec.state << ", retrying" << dendl; + send(); + return nullptr; + case cls::rbd::MIGRATION_STATE_PREPARED: + case cls::rbd::MIGRATION_STATE_EXECUTING: + case cls::rbd::MIGRATION_STATE_EXECUTED: + break; + case cls::rbd::MIGRATION_STATE_ABORTING: + if (!m_read_only) { + lderr(cct) << this << " " << __func__ << ": migration is being aborted" + << dendl; + *result = -EROFS; + return m_on_finish; + } + break; + default: + lderr(cct) << this << " " << __func__ << ": migration is in an " + << "unexpected state" << dendl; + *result = -EINVAL; + return m_on_finish; + } + break; + default: + ldout(cct, 1) << this << " " << __func__ << ": migration type " + << m_migration_spec.header_type << dendl; + *result = -EBADMSG; + return m_on_finish; + } + + if (m_image_ctx.old_format) { + send_v1_get_snapshots(); + } else { + send_v2_get_metadata(); + } + return nullptr; +} + +template +void RefreshRequest::send_v1_read_header() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + op.read(0, 0, nullptr, nullptr); + + using klass = RefreshRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_read_header>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v1_read_header(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl; + + rbd_obj_header_ondisk v1_header; + bool migrating = false; + if (*result < 0) { + return m_on_finish; + } else if (m_out_bl.length() < sizeof(v1_header)) { + lderr(cct) << "v1 header too small" << dendl; + *result = -EIO; + return m_on_finish; + } else if (memcmp(RBD_HEADER_TEXT, m_out_bl.c_str(), + sizeof(RBD_HEADER_TEXT)) != 0) { + if (memcmp(RBD_MIGRATE_HEADER_TEXT, m_out_bl.c_str(), + sizeof(RBD_MIGRATE_HEADER_TEXT)) == 0) { + ldout(cct, 1) << this << " " << __func__ << ": migration v1 header detected" + << dendl; + migrating = true; + } else { + lderr(cct) << "unrecognized v1 header" << dendl; + *result = -ENXIO; + return m_on_finish; + } + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + m_read_only 
= m_image_ctx.read_only; + m_read_only_flags = m_image_ctx.read_only_flags; + } + + memcpy(&v1_header, m_out_bl.c_str(), sizeof(v1_header)); + m_order = v1_header.options.order; + m_size = v1_header.image_size; + m_object_prefix = v1_header.block_name; + if (migrating) { + send_get_migration_header(); + } else { + m_migration_spec = {}; + send_v1_get_snapshots(); + } + return nullptr; +} + +template +void RefreshRequest::send_v1_get_snapshots() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::old_snapshot_list_start(&op); + + using klass = RefreshRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_get_snapshots>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v1_get_snapshots(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl; + + std::vector snap_names; + std::vector snap_sizes; + if (*result >= 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::old_snapshot_list_finish(&it, &snap_names, + &snap_sizes, &m_snapc); + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve v1 snapshots: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if (!m_snapc.is_valid()) { + lderr(cct) << "v1 image snap context is invalid" << dendl; + *result = -EIO; + return m_on_finish; + } + + m_snap_infos.clear(); + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + m_snap_infos.push_back({m_snapc.snaps[i], + {cls::rbd::UserSnapshotNamespace{}}, + snap_names[i], snap_sizes[i], {}, 0}); + } + + send_v1_get_locks(); + return nullptr; +} + +template +void RefreshRequest::send_v1_get_locks() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = RefreshRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v1_get_locks>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v1_get_locks(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + if (*result >= 0) { + auto it = m_out_bl.cbegin(); + ClsLockType lock_type; + *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers, + &lock_type, &m_lock_tag); + if (*result >= 0) { + m_exclusive_locked = (lock_type == ClsLockType::EXCLUSIVE); + } + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve locks: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v1_apply(); + return nullptr; +} + +template +void RefreshRequest::send_v1_apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // ensure we are not in a rados callback when applying updates + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v1_apply>(this); + m_image_ctx.op_work_queue->queue(ctx, 0); +} + +template +Context *RefreshRequest::handle_v1_apply(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; 
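+  // queued via op_work_queue in send_v1_apply(), so this runs outside any
+  // rados callback and can safely apply the collected state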
+ + apply(); + return send_flush_aio(); +} + +template +void RefreshRequest::send_v2_get_mutable_metadata() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + uint64_t snap_id; + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + snap_id = m_image_ctx.snap_id; + m_read_only = m_image_ctx.read_only; + m_read_only_flags = m_image_ctx.read_only_flags; + } + + // mask out the non-primary read-only flag since its state can change + bool read_only = ( + ((m_read_only_flags & ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY) != 0) || + (snap_id != CEPH_NOSNAP)); + librados::ObjectReadOperation op; + cls_client::get_size_start(&op, CEPH_NOSNAP); + cls_client::get_features_start(&op, read_only); + cls_client::get_flags_start(&op, CEPH_NOSNAP); + cls_client::get_snapcontext_start(&op); + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = RefreshRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_mutable_metadata>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v2_get_mutable_metadata(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + if (*result >= 0) { + uint8_t order; + *result = cls_client::get_size_finish(&it, &m_size, &order); + } + + if (*result >= 0) { + *result = cls_client::get_features_finish(&it, &m_features, + &m_incompatible_features); + } + + if (*result >= 0) { + *result = cls_client::get_flags_finish(&it, &m_flags); + } + + if (*result >= 0) { + *result = cls_client::get_snapcontext_finish(&it, &m_snapc); + } + + if (*result >= 0) { + ClsLockType lock_type; + *result = rados::cls::lock::get_lock_info_finish(&it, &m_lockers, + &lock_type, &m_lock_tag); + if (*result >= 0) { + m_exclusive_locked = (lock_type == ClsLockType::EXCLUSIVE); + } + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve mutable metadata: " + << cpp_strerror(*result) << dendl; + return m_on_finish; + } + + uint64_t unsupported = m_incompatible_features & ~RBD_FEATURES_ALL; + if (unsupported != 0ULL) { + lderr(cct) << "Image uses unsupported features: " << unsupported << dendl; + *result = -ENOSYS; + return m_on_finish; + } + + if (!m_snapc.is_valid()) { + lderr(cct) << "image snap context is invalid!" 
<< dendl; + *result = -EIO; + return m_on_finish; + } + + if (m_acquiring_lock && (m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + ldout(cct, 5) << "ignoring dynamically disabled exclusive lock" << dendl; + m_features |= RBD_FEATURE_EXCLUSIVE_LOCK; + m_incomplete_update = true; + } else { + m_incomplete_update = false; + } + + if (((m_incompatible_features & RBD_FEATURE_NON_PRIMARY) != 0U) && + ((m_read_only_flags & IMAGE_READ_ONLY_FLAG_NON_PRIMARY) == 0U) && + ((m_image_ctx.read_only_mask & IMAGE_READ_ONLY_FLAG_NON_PRIMARY) != 0U)) { + // implies we opened a non-primary image in R/W mode + ldout(cct, 5) << "adding non-primary read-only image flag" << dendl; + m_read_only_flags |= IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + } else if ((((m_incompatible_features & RBD_FEATURE_NON_PRIMARY) == 0U) || + ((m_image_ctx.read_only_mask & + IMAGE_READ_ONLY_FLAG_NON_PRIMARY) == 0U)) && + ((m_read_only_flags & IMAGE_READ_ONLY_FLAG_NON_PRIMARY) != 0U)) { + ldout(cct, 5) << "removing non-primary read-only image flag" << dendl; + m_read_only_flags &= ~IMAGE_READ_ONLY_FLAG_NON_PRIMARY; + } + m_read_only = (m_read_only_flags != 0U); + + m_legacy_parent = false; + send_v2_get_parent(); + return nullptr; +} + +template +void RefreshRequest::send_v2_get_parent() { + // NOTE: remove support when Mimic is EOLed + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": legacy=" << m_legacy_parent + << dendl; + + librados::ObjectReadOperation op; + if (!m_legacy_parent) { + cls_client::parent_get_start(&op); + cls_client::parent_overlap_get_start(&op, CEPH_NOSNAP); + } else { + cls_client::get_parent_start(&op, CEPH_NOSNAP); + } + + auto aio_comp = create_rados_callback< + RefreshRequest, &RefreshRequest::handle_v2_get_parent>(this); + m_out_bl.clear(); + m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, aio_comp, &op, + &m_out_bl); + aio_comp->release(); +} + +template +Context *RefreshRequest::handle_v2_get_parent(int *result) { + // NOTE: remove support when Mimic is EOLed + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + if (!m_legacy_parent) { + if (*result >= 0) { + *result = cls_client::parent_get_finish(&it, &m_parent_md.spec); + } + + std::optional parent_overlap; + if (*result >= 0) { + *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap); + } + + if (*result >= 0) { + if (parent_overlap) { + m_parent_md.overlap = *parent_overlap; + m_head_parent_overlap = true; + } else { + m_parent_md.overlap = 0; + m_head_parent_overlap = false; + } + } + } else if (*result >= 0) { + *result = cls_client::get_parent_finish(&it, &m_parent_md.spec, + &m_parent_md.overlap); + m_head_parent_overlap = true; + } + + if (*result == -EOPNOTSUPP && !m_legacy_parent) { + ldout(cct, 10) << "retrying using legacy parent method" << dendl; + m_legacy_parent = true; + send_v2_get_parent(); + return nullptr; + } else if (*result < 0) { + lderr(cct) << "failed to retrieve parent: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + if ((m_features & RBD_FEATURE_MIGRATING) != 0) { + ldout(cct, 1) << "migrating feature set" << dendl; + send_get_migration_header(); + } else { + m_migration_spec = {}; + send_v2_get_metadata(); + } + return nullptr; +} + +template +void RefreshRequest::send_v2_get_metadata() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + auto ctx = create_context_callback< + RefreshRequest, 
&RefreshRequest::handle_v2_get_metadata>(this); + m_metadata.clear(); + auto req = GetMetadataRequest::create( + m_image_ctx.md_ctx, m_image_ctx.header_oid, true, + ImageCtx::METADATA_CONF_PREFIX, ImageCtx::METADATA_CONF_PREFIX, 0U, + &m_metadata, ctx); + req->send(); +} + +template +Context *RefreshRequest::handle_v2_get_metadata(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to retrieve metadata: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v2_get_pool_metadata(); + return nullptr; +} + +template +void RefreshRequest::send_v2_get_pool_metadata() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + auto ctx = create_context_callback< + RefreshRequest, &RefreshRequest::handle_v2_get_pool_metadata>(this); + auto req = GetMetadataRequest::create( + m_pool_metadata_io_ctx, RBD_INFO, true, ImageCtx::METADATA_CONF_PREFIX, + ImageCtx::METADATA_CONF_PREFIX, 0U, &m_metadata, ctx); + req->send(); +} + +template +Context *RefreshRequest::handle_v2_get_pool_metadata(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to retrieve pool metadata: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + bool thread_safe = m_image_ctx.image_watcher->is_unregistered(); + m_image_ctx.apply_metadata(m_metadata, thread_safe); + + send_v2_get_op_features(); + return nullptr; +} + +template +void RefreshRequest::send_v2_get_op_features() { + if ((m_features & RBD_FEATURE_OPERATIONS) == 0LL) { + m_op_features = 0; + send_v2_get_group(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::op_features_get_start(&op); + + librados::AioCompletion *comp = create_rados_callback< + RefreshRequest, &RefreshRequest::handle_v2_get_op_features>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v2_get_op_features(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result << dendl; + + // -EOPNOTSUPP handler not required since feature bit implies OSD + // supports the method + if (*result >= 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::op_features_get_finish(&it, &m_op_features); + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve op features: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v2_get_group(); + return nullptr; +} + +template +void RefreshRequest::send_v2_get_group() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::image_group_get_start(&op); + + using klass = RefreshRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_group>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v2_get_group(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " + << "r=" << *result 
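+  // image_group_get is not gated by a feature bit, so an OSD that predates
+  // group support may return -EOPNOTSUPP; that case is treated below as
+  // "not a member of any group" rather than as a refresh failure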
<< dendl; + + if (*result >= 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::image_group_get_finish(&it, &m_group_spec); + } + + if (*result == -EOPNOTSUPP) { + m_group_spec = {}; + } else if (*result < 0) { + lderr(cct) << "failed to retrieve group: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + m_legacy_snapshot = LEGACY_SNAPSHOT_DISABLED; + send_v2_get_snapshots(); + return nullptr; +} + +template +void RefreshRequest::send_v2_get_snapshots() { + m_snap_infos.resize(m_snapc.snaps.size()); + m_snap_flags.resize(m_snapc.snaps.size()); + m_snap_parents.resize(m_snapc.snaps.size()); + m_snap_protection.resize(m_snapc.snaps.size()); + + if (m_snapc.snaps.empty()) { + send_v2_refresh_parent(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + for (auto snap_id : m_snapc.snaps) { + if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) { + /// NOTE: remove after Luminous is retired + cls_client::get_snapshot_name_start(&op, snap_id); + cls_client::get_size_start(&op, snap_id); + if (m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) { + cls_client::get_snapshot_timestamp_start(&op, snap_id); + } + } else { + cls_client::snapshot_get_start(&op, snap_id); + } + + if (m_legacy_parent) { + cls_client::get_parent_start(&op, snap_id); + } else { + cls_client::parent_overlap_get_start(&op, snap_id); + } + + cls_client::get_flags_start(&op, snap_id); + cls_client::get_protection_status_start(&op, snap_id); + } + + using klass = RefreshRequest; + librados::AioCompletion *comp = create_rados_callback< + klass, &klass::handle_v2_get_snapshots>(this); + m_out_bl.clear(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *RefreshRequest::handle_v2_get_snapshots(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": " << "r=" << *result << dendl; + + auto it = m_out_bl.cbegin(); + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + if (m_legacy_snapshot != LEGACY_SNAPSHOT_DISABLED) { + /// NOTE: remove after Luminous is retired + std::string snap_name; + if (*result >= 0) { + *result = cls_client::get_snapshot_name_finish(&it, &snap_name); + } + + uint64_t snap_size; + if (*result >= 0) { + uint8_t order; + *result = cls_client::get_size_finish(&it, &snap_size, &order); + } + + utime_t snap_timestamp; + if (*result >= 0 && + m_legacy_snapshot != LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP) { + /// NOTE: remove after Jewel is retired + *result = cls_client::get_snapshot_timestamp_finish(&it, + &snap_timestamp); + } + + if (*result >= 0) { + m_snap_infos[i] = {m_snapc.snaps[i], + {cls::rbd::UserSnapshotNamespace{}}, + snap_name, snap_size, snap_timestamp, 0}; + } + } else if (*result >= 0) { + *result = cls_client::snapshot_get_finish(&it, &m_snap_infos[i]); + } + + if (*result >= 0) { + if (m_legacy_parent) { + *result = cls_client::get_parent_finish(&it, &m_snap_parents[i].spec, + &m_snap_parents[i].overlap); + } else { + std::optional parent_overlap; + *result = cls_client::parent_overlap_get_finish(&it, &parent_overlap); + if (*result >= 0) { + if (parent_overlap && m_parent_md.spec.pool_id > -1) { + m_snap_parents[i].spec = m_parent_md.spec; + m_snap_parents[i].overlap = *parent_overlap; + } else { + m_snap_parents[i] = {}; + } + } + } + } + + if (*result >= 0) { + *result = cls_client::get_flags_finish(&it, 
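+    // the *_finish decoders must consume the combined reply in exactly the
+    // order the matching *_start calls were queued in
+    // send_v2_get_snapshots(); a single bufferlist cursor (it) walks every
+    // per-snapshot reply back to back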
&m_snap_flags[i]); + } + + if (*result >= 0) { + *result = cls_client::get_protection_status_finish( + &it, &m_snap_protection[i]); + } + + if (*result < 0) { + break; + } + } + + if (*result == -ENOENT && m_enoent_retries++ < MAX_ENOENT_RETRIES) { + ldout(cct, 10) << "out-of-sync snapshot state detected, retrying" << dendl; + send_v2_get_mutable_metadata(); + return nullptr; + } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_DISABLED && + *result == -EOPNOTSUPP) { + ldout(cct, 10) << "retrying using legacy snapshot methods" << dendl; + m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED; + send_v2_get_snapshots(); + return nullptr; + } else if (m_legacy_snapshot == LEGACY_SNAPSHOT_ENABLED && + *result == -EOPNOTSUPP) { + ldout(cct, 10) << "retrying using legacy snapshot methods (jewel)" << dendl; + m_legacy_snapshot = LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP; + send_v2_get_snapshots(); + return nullptr; + } else if (*result < 0) { + lderr(cct) << "failed to retrieve snapshots: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_v2_refresh_parent(); + return nullptr; +} + +template +void RefreshRequest::send_v2_refresh_parent() { + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + + ParentImageInfo parent_md; + MigrationInfo migration_info; + int r = get_parent_info(m_image_ctx.snap_id, &parent_md, &migration_info); + if (!m_skip_open_parent_image && (r < 0 || + RefreshParentRequest::is_refresh_required(m_image_ctx, parent_md, + migration_info))) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_refresh_parent>(this); + m_refresh_parent = RefreshParentRequest::create( + m_image_ctx, parent_md, migration_info, ctx); + } + } + + if (m_refresh_parent != nullptr) { + m_refresh_parent->send(); + } else { + send_v2_init_exclusive_lock(); + } +} + +template +Context *RefreshRequest::handle_v2_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -ENOENT && m_enoent_retries++ < MAX_ENOENT_RETRIES) { + ldout(cct, 10) << "out-of-sync parent info detected, retrying" << dendl; + ceph_assert(m_refresh_parent != nullptr); + delete m_refresh_parent; + m_refresh_parent = nullptr; + send_v2_get_mutable_metadata(); + return nullptr; + } else if (*result < 0) { + lderr(cct) << "failed to refresh parent image: " << cpp_strerror(*result) + << dendl; + save_result(result); + send_v2_apply(); + return nullptr; + } + + send_v2_init_exclusive_lock(); + return nullptr; +} + +template +void RefreshRequest::send_v2_init_exclusive_lock() { + if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0 || + m_read_only || !m_image_ctx.snap_name.empty() || + m_image_ctx.exclusive_lock != nullptr) { + send_v2_open_object_map(); + return; + } + + // implies exclusive lock dynamically enabled or image open in-progress + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // TODO need safe shut down + m_exclusive_lock = m_image_ctx.create_exclusive_lock(); + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_init_exclusive_lock>(this); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + m_exclusive_lock->init(m_features, ctx); +} + +template +Context *RefreshRequest::handle_v2_init_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) 
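+  // note: the -ENOENT retry paths above restart the whole state machine
+  // from send_v2_get_mutable_metadata(); MAX_ENOENT_RETRIES bounds them so
+  // a refresh racing with snapshot removal cannot loop forever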
<< this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize exclusive lock: " + << cpp_strerror(*result) << dendl; + save_result(result); + } + + // object map and journal will be opened when exclusive lock is + // acquired (if features are enabled) + send_v2_apply(); + return nullptr; +} + +template +void RefreshRequest::send_v2_open_journal() { + bool journal_disabled = ( + (m_features & RBD_FEATURE_JOURNALING) == 0 || + m_read_only || + !m_image_ctx.snap_name.empty() || + m_image_ctx.journal != nullptr || + m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner()); + bool journal_disabled_by_policy; + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + journal_disabled_by_policy = ( + !journal_disabled && + m_image_ctx.get_journal_policy()->journal_disabled()); + } + + if (journal_disabled || journal_disabled_by_policy) { + // journal dynamically enabled -- doesn't own exclusive lock + if ((m_features & RBD_FEATURE_JOURNALING) != 0 && + !journal_disabled_by_policy && + m_image_ctx.exclusive_lock != nullptr && + m_image_ctx.journal == nullptr) { + auto ctx = new LambdaContext([this](int) { + send_v2_block_writes(); + }); + m_image_ctx.exclusive_lock->set_require_lock( + true, librbd::io::DIRECTION_BOTH, ctx); + return; + } + + send_v2_block_writes(); + return; + } + + // implies journal dynamically enabled since ExclusiveLock will init + // the journal upon acquiring the lock + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_open_journal>(this); + + // TODO need safe close + m_journal = m_image_ctx.create_journal(); + m_journal->open(ctx); +} + +template +Context *RefreshRequest::handle_v2_open_journal(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize journal: " << cpp_strerror(*result) + << dendl; + save_result(result); + } + + send_v2_block_writes(); + return nullptr; +} + +template +void RefreshRequest::send_v2_block_writes() { + bool disabled_journaling = false; + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + disabled_journaling = ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 && + (m_features & RBD_FEATURE_JOURNALING) == 0 && + m_image_ctx.journal != nullptr); + } + + if (!disabled_journaling) { + send_v2_apply(); + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // we need to block writes temporarily to avoid in-flight journal + // writes + m_blocked_writes = true; + Context *ctx = create_context_callback< + RefreshRequest, &RefreshRequest::handle_v2_block_writes>(this); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + m_image_ctx.io_image_dispatcher->block_writes(ctx); +} + +template +Context *RefreshRequest::handle_v2_block_writes(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) + << dendl; + save_result(result); + } + send_v2_apply(); + return nullptr; +} + +template +void RefreshRequest::send_v2_open_object_map() { + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0 || + m_image_ctx.object_map != nullptr || + (m_image_ctx.snap_name.empty() && + 
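+      // (the open is skipped when the feature is off, a map is already
+      //  loaded, or this is a HEAD image that is read-only or not the
+      //  exclusive lock owner -- ExclusiveLock loads the HEAD map itself)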
(m_read_only || + m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->is_lock_owner()))) { + send_v2_open_journal(); + return; + } + + // implies object map dynamically enabled or image open in-progress + // since SetSnapRequest loads the object map for a snapshot and + // ExclusiveLock loads the object map for HEAD + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + if (m_image_ctx.snap_name.empty()) { + m_object_map = m_image_ctx.create_object_map(CEPH_NOSNAP); + } else { + for (size_t snap_idx = 0; snap_idx < m_snap_infos.size(); ++snap_idx) { + if (m_snap_infos[snap_idx].name == m_image_ctx.snap_name) { + m_object_map = m_image_ctx.create_object_map( + m_snapc.snaps[snap_idx].val); + break; + } + } + + if (m_object_map == nullptr) { + lderr(cct) << "failed to locate snapshot: " << m_image_ctx.snap_name + << dendl; + send_v2_open_journal(); + return; + } + } + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_open_object_map>(this); + m_object_map->open(ctx); +} + +template +Context *RefreshRequest::handle_v2_open_object_map(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to open object map: " << cpp_strerror(*result) + << dendl; + m_object_map->put(); + m_object_map = nullptr; + + if (*result != -EFBIG) { + save_result(result); + } + } + + send_v2_open_journal(); + return nullptr; +} + +template +void RefreshRequest::send_v2_apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // ensure we are not in a rados callback when applying updates + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_apply>(this); + m_image_ctx.op_work_queue->queue(ctx, 0); +} + +template +Context *RefreshRequest::handle_v2_apply(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + apply(); + + return send_v2_finalize_refresh_parent(); +} + +template +Context *RefreshRequest::send_v2_finalize_refresh_parent() { + if (m_refresh_parent == nullptr) { + return send_v2_shut_down_exclusive_lock(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_finalize_refresh_parent>(this); + m_refresh_parent->finalize(ctx); + return nullptr; +} + +template +Context *RefreshRequest::handle_v2_finalize_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + ceph_assert(m_refresh_parent != nullptr); + delete m_refresh_parent; + m_refresh_parent = nullptr; + + return send_v2_shut_down_exclusive_lock(); +} + +template +Context *RefreshRequest::send_v2_shut_down_exclusive_lock() { + if (m_exclusive_lock == nullptr) { + return send_v2_close_journal(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // exclusive lock feature was dynamically disabled. 
in-flight IO will be + // flushed and in-flight requests will be canceled before releasing lock + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_shut_down_exclusive_lock>(this); + m_exclusive_lock->shut_down(ctx); + return nullptr; +} + +template +Context *RefreshRequest::handle_v2_shut_down_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to shut down exclusive lock: " + << cpp_strerror(*result) << dendl; + save_result(result); + } + + { + std::unique_lock owner_locker{m_image_ctx.owner_lock}; + ceph_assert(m_image_ctx.exclusive_lock == nullptr); + } + + ceph_assert(m_exclusive_lock != nullptr); + m_exclusive_lock->put(); + m_exclusive_lock = nullptr; + + return send_v2_close_journal(); +} + +template +Context *RefreshRequest::send_v2_close_journal() { + if (m_journal == nullptr) { + return send_v2_close_object_map(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // journal feature was dynamically disabled + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_close_journal>(this); + m_journal->close(ctx); + return nullptr; +} + +template +Context *RefreshRequest::handle_v2_close_journal(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + save_result(result); + lderr(cct) << "failed to close journal: " << cpp_strerror(*result) + << dendl; + } + + ceph_assert(m_journal != nullptr); + m_journal->put(); + m_journal = nullptr; + + ceph_assert(m_blocked_writes); + m_blocked_writes = false; + + m_image_ctx.io_image_dispatcher->unblock_writes(); + return send_v2_close_object_map(); +} + +template +Context *RefreshRequest::send_v2_close_object_map() { + if (m_object_map == nullptr) { + return send_flush_aio(); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + // object map was dynamically disabled + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_v2_close_object_map>(this); + m_object_map->close(ctx); + return nullptr; +} + +template +Context *RefreshRequest::handle_v2_close_object_map(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close object map: " << cpp_strerror(*result) + << dendl; + } + + ceph_assert(m_object_map != nullptr); + + m_object_map->put(); + m_object_map = nullptr; + + return send_flush_aio(); +} + +template +Context *RefreshRequest::send_flush_aio() { + if (m_incomplete_update && m_error_result == 0) { + // if this was a partial refresh, notify ImageState + m_error_result = -ERESTART; + } + + if (m_flush_aio) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + auto ctx = create_context_callback< + RefreshRequest, &RefreshRequest::handle_flush_aio>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + m_image_ctx, io::IMAGE_DISPATCH_LAYER_REFRESH, aio_comp, + io::FLUSH_SOURCE_REFRESH, {}); + req->send(); + return nullptr; + } else if 
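+  // even when no flush is needed, a saved error is still delivered through
+  // the work queue so the caller's completion never runs from inside the
+  // refresh call chain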
(m_error_result < 0) { + // propagate saved error back to caller + Context *ctx = create_context_callback< + RefreshRequest, &RefreshRequest::handle_error>(this); + m_image_ctx.op_work_queue->queue(ctx, 0); + return nullptr; + } + + return m_on_finish; +} + +template +Context *RefreshRequest::handle_flush_aio(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to flush pending AIO: " << cpp_strerror(*result) + << dendl; + } + + return handle_error(result); +} + +template +Context *RefreshRequest::handle_error(int *result) { + if (m_error_result < 0) { + *result = m_error_result; + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + } + return m_on_finish; +} + +template +void RefreshRequest::apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::scoped_lock locker{m_image_ctx.owner_lock, m_image_ctx.image_lock}; + + m_image_ctx.read_only_flags = m_read_only_flags; + m_image_ctx.read_only = m_read_only; + m_image_ctx.size = m_size; + m_image_ctx.lockers = m_lockers; + m_image_ctx.lock_tag = m_lock_tag; + m_image_ctx.exclusive_locked = m_exclusive_locked; + + std::map migration_reverse_snap_seq; + + if (m_image_ctx.old_format) { + m_image_ctx.order = m_order; + m_image_ctx.features = 0; + m_image_ctx.flags = 0; + m_image_ctx.op_features = 0; + m_image_ctx.operations_disabled = false; + m_image_ctx.object_prefix = std::move(m_object_prefix); + m_image_ctx.init_layout(m_image_ctx.md_ctx.get_id()); + } else { + // HEAD revision doesn't have a defined overlap so it's only + // applicable to snapshots + if (!m_head_parent_overlap) { + m_parent_md = {}; + } + + m_image_ctx.features = m_features; + m_image_ctx.flags = m_flags; + m_image_ctx.op_features = m_op_features; + m_image_ctx.operations_disabled = ( + (m_op_features & ~RBD_OPERATION_FEATURES_ALL) != 0ULL); + m_image_ctx.group_spec = m_group_spec; + + bool migration_info_valid; + int r = get_migration_info(&m_image_ctx.parent_md, + &m_image_ctx.migration_info, + &migration_info_valid); + ceph_assert(r == 0); // validated in refresh parent step + + if (migration_info_valid) { + for (auto it : m_image_ctx.migration_info.snap_map) { + migration_reverse_snap_seq[it.second.front()] = it.first; + } + } else { + m_image_ctx.parent_md = m_parent_md; + m_image_ctx.migration_info = {}; + } + + librados::Rados rados(m_image_ctx.md_ctx); + int8_t require_osd_release; + r = rados.get_min_compatible_osd(&require_osd_release); + if (r == 0 && require_osd_release >= CEPH_RELEASE_OCTOPUS) { + m_image_ctx.enable_sparse_copyup = true; + } + } + + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + std::vector::const_iterator it = std::find( + m_image_ctx.snaps.begin(), m_image_ctx.snaps.end(), + m_snapc.snaps[i].val); + if (it == m_image_ctx.snaps.end()) { + m_flush_aio = true; + ldout(cct, 20) << "new snapshot id=" << m_snapc.snaps[i].val + << " name=" << m_snap_infos[i].name + << " size=" << m_snap_infos[i].image_size + << dendl; + } + } + + m_image_ctx.snaps.clear(); + m_image_ctx.snap_info.clear(); + m_image_ctx.snap_ids.clear(); + auto overlap = m_image_ctx.parent_md.overlap; + for (size_t i = 0; i < m_snapc.snaps.size(); ++i) { + uint64_t flags = m_image_ctx.old_format ? 0 : m_snap_flags[i]; + uint8_t protection_status = m_image_ctx.old_format ? 
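+    // (v1 images predate snapshot protection, so their snapshots are always
+    //  reported as unprotected)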
+      static_cast<uint8_t>(RBD_PROTECTION_STATUS_UNPROTECTED) :
+      m_snap_protection[i];
+    ParentImageInfo parent;
+    if (!m_image_ctx.old_format) {
+      if (!m_image_ctx.migration_info.empty()) {
+        parent = m_image_ctx.parent_md;
+        auto it = migration_reverse_snap_seq.find(m_snapc.snaps[i].val);
+        if (it != migration_reverse_snap_seq.end()) {
+          parent.spec.snap_id = it->second;
+          parent.overlap = m_snap_infos[i].image_size;
+        } else {
+          overlap = std::min(overlap, m_snap_infos[i].image_size);
+          parent.overlap = overlap;
+        }
+      } else {
+        parent = m_snap_parents[i];
+      }
+    }
+    m_image_ctx.add_snap(m_snap_infos[i].snapshot_namespace,
+                         m_snap_infos[i].name, m_snapc.snaps[i].val,
+                         m_snap_infos[i].image_size, parent,
+                         protection_status, flags,
+                         m_snap_infos[i].timestamp);
+  }
+  m_image_ctx.parent_md.overlap = std::min(overlap, m_image_ctx.size);
+  m_image_ctx.snapc = m_snapc;
+
+  if (m_image_ctx.snap_id != CEPH_NOSNAP &&
+      m_image_ctx.get_snap_id(m_image_ctx.snap_namespace,
+                              m_image_ctx.snap_name) != m_image_ctx.snap_id) {
+    lderr(cct) << "tried to read from a snapshot that no longer exists: "
+               << m_image_ctx.snap_name << dendl;
+    m_image_ctx.snap_exists = false;
+  }
+
+  if (m_refresh_parent != nullptr) {
+    m_refresh_parent->apply();
+  }
+  if (m_image_ctx.data_ctx.is_valid()) {
+    m_image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(m_image_ctx.snapc.seq,
+                                                        m_image_ctx.snaps);
+    m_image_ctx.rebuild_data_io_context();
+  }
+
+  // handle dynamically enabled / disabled features
+  if (m_image_ctx.exclusive_lock != nullptr &&
+      !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK,
+                                 m_image_ctx.image_lock)) {
+    // disabling exclusive lock will automatically handle closing
+    // object map and journaling
+    ceph_assert(m_exclusive_lock == nullptr);
+    m_exclusive_lock = m_image_ctx.exclusive_lock;
+  } else {
+    if (m_exclusive_lock != nullptr) {
+      ceph_assert(m_image_ctx.exclusive_lock == nullptr);
+      std::swap(m_exclusive_lock, m_image_ctx.exclusive_lock);
+    }
+    if (!m_image_ctx.test_features(RBD_FEATURE_JOURNALING,
+                                   m_image_ctx.image_lock)) {
+      if (!m_image_ctx.clone_copy_on_read && m_image_ctx.journal != nullptr) {
+        m_image_ctx.exclusive_lock->unset_require_lock(io::DIRECTION_READ);
+      }
+      std::swap(m_journal, m_image_ctx.journal);
+    } else if (m_journal != nullptr) {
+      std::swap(m_journal, m_image_ctx.journal);
+    }
+    if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP,
+                                   m_image_ctx.image_lock) ||
+        m_object_map != nullptr) {
+      std::swap(m_object_map, m_image_ctx.object_map);
+    }
+  }
+}
+
+template <typename I>
+int RefreshRequest<I>::get_parent_info(uint64_t snap_id,
+                                       ParentImageInfo *parent_md,
+                                       MigrationInfo *migration_info) {
+  bool migration_info_valid;
+  int r = get_migration_info(parent_md, migration_info, &migration_info_valid);
+  if (r < 0) {
+    return r;
+  }
+
+  if (migration_info_valid) {
+    return 0;
+  } else if (snap_id == CEPH_NOSNAP) {
+    *parent_md = m_parent_md;
+    *migration_info = {};
+    return 0;
+  } else {
+    for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+      if (m_snapc.snaps[i].val == snap_id) {
+        *parent_md = m_snap_parents[i];
+        *migration_info = {};
+        return 0;
+      }
+    }
+  }
+  return -ENOENT;
+}
+
+template <typename I>
+int RefreshRequest<I>::get_migration_info(ParentImageInfo *parent_md,
+                                          MigrationInfo *migration_info,
+                                          bool* migration_info_valid) {
+  CephContext *cct = m_image_ctx.cct;
+  if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_DST ||
+      (m_migration_spec.state != cls::rbd::MIGRATION_STATE_PREPARED &&
+       m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTING &&
+       m_migration_spec.state != cls::rbd::MIGRATION_STATE_ABORTING)) {
+    if (m_migration_spec.header_type != cls::rbd::MIGRATION_HEADER_TYPE_SRC &&
+        m_migration_spec.pool_id != -1 &&
+        m_migration_spec.state != cls::rbd::MIGRATION_STATE_EXECUTED) {
+      lderr(cct) << this << " " << __func__ << ": invalid migration spec"
+                 << dendl;
+      return -EINVAL;
+    }
+
+    *migration_info_valid = false;
+    return 0;
+  }
+
+  if (!m_migration_spec.source_spec.empty()) {
+    // use special pool id just to indicate a parent (migration source image)
+    // exists
+    parent_md->spec.pool_id = std::numeric_limits<int64_t>::max();
+    parent_md->spec.pool_namespace = "";
+    parent_md->spec.image_id = "";
+  } else {
+    parent_md->spec.pool_id = m_migration_spec.pool_id;
+    parent_md->spec.pool_namespace = m_migration_spec.pool_namespace;
+    parent_md->spec.image_id = m_migration_spec.image_id;
+  }
+  parent_md->spec.snap_id = CEPH_NOSNAP;
+  parent_md->overlap = std::min(m_size, m_migration_spec.overlap);
+
+  auto snap_seqs = m_migration_spec.snap_seqs;
+  // If new snapshots have been created on the destination image after
+  // migration started, map the source CEPH_NOSNAP to the earliest of
+  // these snapshots.
+  snapid_t snap_id = snap_seqs.empty() ? 0 : snap_seqs.rbegin()->second;
+  auto it = std::upper_bound(m_snapc.snaps.rbegin(), m_snapc.snaps.rend(),
+                             snap_id);
+  if (it != m_snapc.snaps.rend()) {
+    snap_seqs[CEPH_NOSNAP] = *it;
+  } else {
+    snap_seqs[CEPH_NOSNAP] = CEPH_NOSNAP;
+  }
+
+  std::set<uint64_t> snap_ids;
+  for (auto& it : snap_seqs) {
+    snap_ids.insert(it.second);
+  }
+  uint64_t overlap = snap_ids.find(CEPH_NOSNAP) != snap_ids.end() ?
+    parent_md->overlap : 0;
+  for (size_t i = 0; i < m_snapc.snaps.size(); ++i) {
+    if (snap_ids.find(m_snapc.snaps[i].val) != snap_ids.end()) {
+      overlap = std::max(overlap, m_snap_infos[i].image_size);
+    }
+  }
+
+  *migration_info = {m_migration_spec.pool_id, m_migration_spec.pool_namespace,
+                     m_migration_spec.image_name, m_migration_spec.image_id,
+                     m_migration_spec.source_spec, {}, overlap,
+                     m_migration_spec.flatten};
+  *migration_info_valid = true;
+
+  deep_copy::util::compute_snap_map(m_image_ctx.cct, 0, CEPH_NOSNAP, {},
+                                    snap_seqs, &migration_info->snap_map);
+  return 0;
+}
+
+} // namespace image
+} // namespace librbd
+
+template class librbd::image::RefreshRequest<librbd::ImageCtx>;
diff --git a/src/librbd/image/RefreshRequest.h b/src/librbd/image/RefreshRequest.h
new file mode 100644
index 000000000..42f4b4669
--- /dev/null
+++ b/src/librbd/image/RefreshRequest.h
@@ -0,0 +1,275 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+#define CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/utime.h"
+#include "common/snap_types.h"
+#include "cls/lock/cls_lock_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include <string>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace image {
+
+template <typename> class RefreshParentRequest;
+
+template <typename ImageCtxT = ImageCtx>
+class RefreshRequest {
+public:
+  static constexpr int MAX_ENOENT_RETRIES = 10;
+
+  static RefreshRequest *create(ImageCtxT &image_ctx, bool acquiring_lock,
+                                bool skip_open_parent, Context *on_finish) {
+    return new RefreshRequest(image_ctx, acquiring_lock, skip_open_parent,
+                              on_finish);
+  }
+
+  RefreshRequest(ImageCtxT &image_ctx, bool acquiring_lock,
+                 bool skip_open_parent, Context *on_finish);
+  ~RefreshRequest();
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start> < * * * * * * * * * * * * * * * * *
* * * * * * * * * (ENOENT) + * ^ | * + * * | (v1) * + * * |-----> V1_READ_HEADER -------------> GET_MIGRATION_HEADER (skip if not + * * | | migrating) + * * | (v2) v + * * \-----> V2_GET_MUTABLE_METADATA V1_GET_SNAPSHOTS + * * * | | + * * * | -EOPNOTSUPP v + * * * | * * * V1_GET_LOCKS + * * * | * * | + * * * v v * v + * * * V2_GET_PARENT + * * * | | + * * v | + * * * * * * GET_MIGRATION_HEADER (skip if not | + * (ENOENT) | migrating) | + * v | + * * V2_GET_METADATA | + * * | | + * * v | + * * V2_GET_POOL_METADATA | + * * | | + * * v (skip if not enabled) | + * * V2_GET_OP_FEATURES | + * * | | + * * v | + * * V2_GET_GROUP | + * * | | + * * | -EOPNOTSUPP | + * * | * * * | + * * | * * | + * * v v * | + * * * V2_GET_SNAPSHOTS (skip if no snaps) | + * (ENOENT) | | + * * v | + * * * V2_REFRESH_PARENT (skip if no parent or | + * (ENOENT) | refresh not needed) | + * v | + * V2_INIT_EXCLUSIVE_LOCK (skip if lock | + * | active or disabled) | + * v | + * V2_OPEN_OBJECT_MAP (skip if map | + * | active or disabled) | + * v | + * V2_OPEN_JOURNAL (skip if journal | + * | active or disabled) | + * v | + * V2_BLOCK_WRITES (skip if journal not | + * | disabled) | + * v | + * | + * | | + * v | + * V2_FINALIZE_REFRESH_PARENT (skip if refresh | + * | not needed) | + * (error) v | + * * * * * > V2_SHUT_DOWN_EXCLUSIVE_LOCK (skip if lock | + * | active or enabled) | + * v | + * V2_CLOSE_JOURNAL (skip if journal inactive | + * | or enabled) | + * v | + * V2_CLOSE_OBJECT_MAP (skip if map inactive | + * | or enabled) | + * | | + * \-------------------\/--------------------/ + * | + * v + * FLUSH (skip if no new + * | snapshots) + * v + * + * + * @endverbatim + */ + + enum LegacySnapshot { + LEGACY_SNAPSHOT_DISABLED, + LEGACY_SNAPSHOT_ENABLED, + LEGACY_SNAPSHOT_ENABLED_NO_TIMESTAMP + }; + + ImageCtxT &m_image_ctx; + bool m_acquiring_lock; + bool m_skip_open_parent_image; + Context *m_on_finish; + + cls::rbd::MigrationSpec m_migration_spec; + int m_error_result; + bool m_flush_aio; + decltype(m_image_ctx.exclusive_lock) m_exclusive_lock; + decltype(m_image_ctx.object_map) m_object_map; + decltype(m_image_ctx.journal) m_journal; + RefreshParentRequest *m_refresh_parent; + + bufferlist m_out_bl; + + bool m_legacy_parent = false; + LegacySnapshot m_legacy_snapshot = LEGACY_SNAPSHOT_DISABLED; + + int m_enoent_retries = 0; + + uint8_t m_order = 0; + uint64_t m_size = 0; + uint64_t m_features = 0; + uint64_t m_incompatible_features = 0; + uint64_t m_flags = 0; + uint64_t m_op_features = 0; + uint32_t m_read_only_flags = 0U; + bool m_read_only = false; + + librados::IoCtx m_pool_metadata_io_ctx; + std::map m_metadata; + + std::string m_object_prefix; + ParentImageInfo m_parent_md; + bool m_head_parent_overlap = false; + cls::rbd::GroupSpec m_group_spec; + + ::SnapContext m_snapc; + std::vector m_snap_infos; + std::vector m_snap_parents; + std::vector m_snap_protection; + std::vector m_snap_flags; + + std::map m_lockers; + std::string m_lock_tag; + bool m_exclusive_locked = false; + + bool m_blocked_writes = false; + bool m_incomplete_update = false; + + void send_get_migration_header(); + Context *handle_get_migration_header(int *result); + + void send_v1_read_header(); + Context *handle_v1_read_header(int *result); + + void send_v1_get_snapshots(); + Context *handle_v1_get_snapshots(int *result); + + void send_v1_get_locks(); + Context *handle_v1_get_locks(int *result); + + void send_v1_apply(); + Context *handle_v1_apply(int *result); + + void send_v2_get_mutable_metadata(); + Context *handle_v2_get_mutable_metadata(int 
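+  // convention used throughout: each send_* issues one async operation and
+  // each handle_* returns nullptr to keep the state machine running, or a
+  // Context* (usually m_on_finish) for the caller to complete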
*result); + + void send_v2_get_parent(); + Context *handle_v2_get_parent(int *result); + + void send_v2_get_metadata(); + Context *handle_v2_get_metadata(int *result); + + void send_v2_get_pool_metadata(); + Context *handle_v2_get_pool_metadata(int *result); + + void send_v2_get_op_features(); + Context *handle_v2_get_op_features(int *result); + + void send_v2_get_group(); + Context *handle_v2_get_group(int *result); + + void send_v2_get_snapshots(); + Context *handle_v2_get_snapshots(int *result); + + void send_v2_get_snapshots_legacy(); + Context *handle_v2_get_snapshots_legacy(int *result); + + void send_v2_refresh_parent(); + Context *handle_v2_refresh_parent(int *result); + + void send_v2_init_exclusive_lock(); + Context *handle_v2_init_exclusive_lock(int *result); + + void send_v2_open_journal(); + Context *handle_v2_open_journal(int *result); + + void send_v2_block_writes(); + Context *handle_v2_block_writes(int *result); + + void send_v2_open_object_map(); + Context *handle_v2_open_object_map(int *result); + + void send_v2_apply(); + Context *handle_v2_apply(int *result); + + Context *send_v2_finalize_refresh_parent(); + Context *handle_v2_finalize_refresh_parent(int *result); + + Context *send_v2_shut_down_exclusive_lock(); + Context *handle_v2_shut_down_exclusive_lock(int *result); + + Context *send_v2_close_journal(); + Context *handle_v2_close_journal(int *result); + + Context *send_v2_close_object_map(); + Context *handle_v2_close_object_map(int *result); + + Context *send_flush_aio(); + Context *handle_flush_aio(int *result); + + Context *handle_error(int *result); + + void save_result(int *result) { + if (m_error_result == 0 && *result < 0) { + m_error_result = *result; + } + } + + void apply(); + int get_parent_info(uint64_t snap_id, ParentImageInfo *parent_md, + MigrationInfo *migration_info); + int get_migration_info(ParentImageInfo *parent_md, + MigrationInfo *migration_info, + bool* migration_info_valid); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::RefreshRequest; + +#endif // CEPH_LIBRBD_IMAGE_REFRESH_REQUEST_H diff --git a/src/librbd/image/RemoveRequest.cc b/src/librbd/image/RemoveRequest.cc new file mode 100644 index 000000000..42af593b1 --- /dev/null +++ b/src/librbd/image/RemoveRequest.cc @@ -0,0 +1,617 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/RemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/internal.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/image/PreRemoveRequest.h" +#include "librbd/journal/RemoveRequest.h" +#include "librbd/journal/TypeTraits.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/operation/TrimRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::RemoveRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace image { + +using librados::IoCtx; +using util::create_context_callback; +using util::create_async_context_callback; +using util::create_rados_callback; + +template +RemoveRequest::RemoveRequest(IoCtx &ioctx, const std::string &image_name, + const std::string &image_id, bool force, + bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, Context *on_finish) + : m_ioctx(ioctx), m_image_name(image_name), m_image_id(image_id), + 
m_force(force), m_from_trash_remove(from_trash_remove), + m_prog_ctx(prog_ctx), m_op_work_queue(op_work_queue), + m_on_finish(on_finish) { + m_cct = reinterpret_cast(m_ioctx.cct()); +} + +template +RemoveRequest::RemoveRequest(IoCtx &ioctx, I *image_ctx, bool force, + bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, Context *on_finish) + : m_ioctx(ioctx), m_image_name(image_ctx->name), m_image_id(image_ctx->id), + m_image_ctx(image_ctx), m_force(force), + m_from_trash_remove(from_trash_remove), m_prog_ctx(prog_ctx), + m_op_work_queue(op_work_queue), m_on_finish(on_finish), + m_cct(image_ctx->cct), m_header_oid(image_ctx->header_oid), + m_old_format(image_ctx->old_format), m_unknown_format(false) { +} + +template +void RemoveRequest::send() { + ldout(m_cct, 20) << dendl; + + open_image(); +} + +template +void RemoveRequest::open_image() { + if (m_image_ctx != nullptr) { + pre_remove_image(); + return; + } + + m_image_ctx = I::create(m_image_id.empty() ? m_image_name : "", m_image_id, + nullptr, m_ioctx, false); + + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest; + Context *ctx = create_context_callback( + this); + + m_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT, ctx); +} + +template +void RemoveRequest::handle_open_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_image_ctx = nullptr; + + if (r != -ENOENT) { + lderr(m_cct) << "error opening image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + remove_image(); + return; + } + + m_image_id = m_image_ctx->id; + m_image_name = m_image_ctx->name; + m_header_oid = m_image_ctx->header_oid; + m_old_format = m_image_ctx->old_format; + m_unknown_format = false; + + pre_remove_image(); +} + +template +void RemoveRequest::pre_remove_image() { + ldout(m_cct, 5) << dendl; + + auto ctx = create_context_callback< + RemoveRequest, &RemoveRequest::handle_pre_remove_image>(this); + auto req = PreRemoveRequest::create(m_image_ctx, m_force, ctx); + req->send(); +} + +template +void RemoveRequest::handle_pre_remove_image(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r < 0) { + if (r == -ECHILD) { + r = -ENOTEMPTY; + } + send_close_image(r); + return; + } + + if (!m_image_ctx->data_ctx.is_valid()) { + detach_child(); + return; + } + + trim_image(); +} + +template +void RemoveRequest::trim_image() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest; + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + klass, &klass::handle_trim_image>(this)); + + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + auto req = librbd::operation::TrimRequest::create( + *m_image_ctx, ctx, m_image_ctx->size, 0, m_prog_ctx); + req->send(); +} + +template +void RemoveRequest::handle_trim_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove some object(s): " + << cpp_strerror(r) << dendl; + send_close_image(r); + return; + } + + if (m_old_format) { + send_close_image(r); + return; + } + + detach_child(); +} + +template +void RemoveRequest::detach_child() { + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + RemoveRequest, &RemoveRequest::handle_detach_child>(this); + auto req = DetachChildRequest::create(*m_image_ctx, ctx); + req->send(); +} + +template +void RemoveRequest::handle_detach_child(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to detach child from parent: " + << cpp_strerror(r) << dendl; + 
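+    // failure paths funnel through send_close_image(r): the image is closed
+    // first and r is stashed in m_ret_val so the original error, not the
+    // close result, is what reaches the caller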
send_close_image(r); + return; + } + + send_disable_mirror(); +} + +template +void RemoveRequest::send_disable_mirror() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_disable_mirror>(this); + + mirror::DisableRequest *req = + mirror::DisableRequest::create(m_image_ctx, m_force, !m_force, ctx); + req->send(); +} + +template +void RemoveRequest::handle_disable_mirror(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + r = 0; + } else if (r < 0) { + lderr(m_cct) << "error disabling image mirroring: " + << cpp_strerror(r) << dendl; + } + + // one last chance to ensure all snapshots have been deleted + m_image_ctx->image_lock.lock_shared(); + if (!m_image_ctx->snap_info.empty()) { + ldout(m_cct, 5) << "image has snapshots - not removing" << dendl; + m_ret_val = -ENOTEMPTY; + } + m_image_ctx->image_lock.unlock_shared(); + + send_close_image(r); +} + +template +void RemoveRequest::send_close_image(int r) { + ldout(m_cct, 20) << dendl; + + m_ret_val = r; + using klass = RemoveRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_send_close_image>(this); + + m_image_ctx->state->close(ctx); +} + +template +void RemoveRequest::handle_send_close_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "error encountered while closing image: " + << cpp_strerror(r) << dendl; + } + + m_image_ctx = nullptr; + if (m_ret_val < 0) { + r = m_ret_val; + finish(r); + return; + } + + remove_header(); +} + +template +void RemoveRequest::remove_header() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_ioctx.aio_remove(m_header_oid, rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RemoveRequest::handle_remove_header(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl; + m_ret_val = r; + } + + remove_image(); +} + +template +void RemoveRequest::remove_header_v2() { + ldout(m_cct, 20) << dendl; + + if (m_header_oid.empty()) { + m_header_oid = util::header_name(m_image_id); + } + + using klass = RemoveRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_ioctx.aio_remove(m_header_oid, rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RemoveRequest::handle_remove_header_v2(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing header: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + send_journal_remove(); +} + +template +void RemoveRequest::send_journal_remove() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_journal_remove>(this); + + typename journal::TypeTraits::ContextWQ* context_wq; + Journal::get_work_queue(m_cct, &context_wq); + + journal::RemoveRequest *req = journal::RemoveRequest::create( + m_ioctx, m_image_id, Journal<>::IMAGE_CLIENT_ID, context_wq, ctx); + req->send(); +} + +template +void RemoveRequest::handle_journal_remove(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to remove image journal: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else { + r = 0; + } + + 
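+  // a missing journal (-ENOENT) was deliberately swallowed above: the image
+  // may never have had journaling enabled, so that is not an error during
+  // removal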
+  send_object_map_remove();
+}
+
+template <typename I>
+void RemoveRequest<I>::send_object_map_remove() {
+  ldout(m_cct, 20) << dendl;
+
+  using klass = RemoveRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_callback<klass, &klass::handle_object_map_remove>(this);
+
+  int r = ObjectMap<>::aio_remove(m_ioctx,
+                                  m_image_id,
+                                  rados_completion);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_object_map_remove(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(m_cct) << "failed to remove image object map: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  } else {
+    r = 0;
+  }
+
+  mirror_image_remove();
+}
+
+template <typename I>
+void RemoveRequest<I>::mirror_image_remove() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectWriteOperation op;
+  cls_client::mirror_image_remove(&op, m_image_id);
+
+  using klass = RemoveRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_callback<klass, &klass::handle_mirror_image_remove>(this);
+  int r = m_ioctx.aio_operate(RBD_MIRRORING, rados_completion, &op);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_mirror_image_remove(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT && r != -EOPNOTSUPP) {
+    lderr(m_cct) << "failed to remove mirror image state: "
+                 << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_from_trash_remove) {
+    // both the id object and the directory entry have been removed in
+    // a previous call to trash_move.
+    finish(0);
+    return;
+  }
+
+  remove_id_object();
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_image() {
+  ldout(m_cct, 20) << dendl;
+
+  if (m_old_format || m_unknown_format) {
+    remove_v1_image();
+  } else {
+    remove_v2_image();
+  }
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_v1_image() {
+  ldout(m_cct, 20) << dendl;
+
+  Context *ctx = new LambdaContext([this] (int r) {
+      r = tmap_rm(m_ioctx, m_image_name);
+      handle_remove_v1_image(r);
+    });
+
+  m_op_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_remove_v1_image(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  m_old_format = (r == 0);
+  if (r == 0 || (r < 0 && !m_unknown_format)) {
+    if (r < 0 && r != -ENOENT) {
+      lderr(m_cct) << "error removing image from v1 directory: "
+                   << cpp_strerror(r) << dendl;
+    }
+
+    m_on_finish->complete(r);
+    delete this;
+    return;
+  }
+
+  if (!m_old_format) {
+    remove_v2_image();
+  }
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_v2_image() {
+  ldout(m_cct, 20) << dendl;
+
+  if (m_image_id.empty()) {
+    dir_get_image_id();
+    return;
+  } else if (m_image_name.empty()) {
+    dir_get_image_name();
+    return;
+  }
+
+  remove_header_v2();
+  return;
+}
+
+template <typename I>
+void RemoveRequest<I>::dir_get_image_id() {
+  ldout(m_cct, 20) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::dir_get_id_start(&op, m_image_name);
+
+  using klass = RemoveRequest<I>;
+  librados::AioCompletion *rados_completion =
+    create_rados_callback<klass, &klass::handle_dir_get_image_id>(this);
+  m_out_bl.clear();
+  int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_dir_get_image_id(int r) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(m_cct) << "error fetching image id: " << cpp_strerror(r)
+                 << dendl;
+    finish(r);
+    return;
+  }
+
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = librbd::cls_client::dir_get_id_finish(&iter, &m_image_id);
+    if (r < 0) {
+      finish(r);
+      return;
+    }
+  }
+
+  remove_header_v2();
+}
+
+template <typename I>
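+// v2 removal needs both the image name and its id: whichever one the caller
+// did not supply is resolved from the RBD_DIRECTORY object (see
+// remove_v2_image() above) before the remaining objects can be cleaned up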
+void RemoveRequest::dir_get_image_name() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::dir_get_name_start(&op, m_image_id); + + using klass = RemoveRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RemoveRequest::handle_dir_get_image_name(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error fetching image name: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = librbd::cls_client::dir_get_name_finish(&iter, &m_image_name); + if (r < 0) { + finish(r); + return; + } + } + + remove_header_v2(); +} + +template +void RemoveRequest::remove_id_object() { + ldout(m_cct, 20) << dendl; + + using klass = RemoveRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_ioctx.aio_remove(util::id_obj_name(m_image_name), rados_completion); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RemoveRequest::handle_remove_id_object(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing id object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + dir_remove_image(); +} + +template +void RemoveRequest::dir_remove_image() { + ldout(m_cct, 20) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::dir_remove_image(&op, m_image_name, m_image_id); + + using klass = RemoveRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_ioctx.aio_operate(RBD_DIRECTORY, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RemoveRequest::handle_dir_remove_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "error removing image from v2 directory: " + << cpp_strerror(r) << dendl; + } + + finish(r); +} + +template +void RemoveRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::RemoveRequest; diff --git a/src/librbd/image/RemoveRequest.h b/src/librbd/image/RemoveRequest.h new file mode 100644 index 000000000..b03f8fc7c --- /dev/null +++ b/src/librbd/image/RemoveRequest.h @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H +#define CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/image/TypeTraits.h" +#include "common/Timer.h" + +#include + +class Context; + +namespace librbd { + +class ProgressContext; + +namespace image { + +template +class RemoveRequest { +private: + // mock unit testing support + typedef ::librbd::image::TypeTraits TypeTraits; + typedef typename TypeTraits::ContextWQ ContextWQ; +public: + static RemoveRequest *create(librados::IoCtx &ioctx, + const std::string &image_name, + const std::string &image_id, + bool force, bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, + Context *on_finish) { + return new RemoveRequest(ioctx, image_name, image_id, force, + from_trash_remove, prog_ctx, 
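+    // (this overload of create() opens the image by name/id itself; the
+    //  overload below reuses an already-open ImageCtx)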
op_work_queue, + on_finish); + } + + static RemoveRequest *create(librados::IoCtx &ioctx, ImageCtxT *image_ctx, + bool force, bool from_trash_remove, + ProgressContext &prog_ctx, + ContextWQ *op_work_queue, + Context *on_finish) { + return new RemoveRequest(ioctx, image_ctx, force, from_trash_remove, + prog_ctx, op_work_queue, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * (skip if already opened) OPEN IMAGE------------------\ + * | | + * v | + * PRE REMOVE IMAGE * * * | + * | * | + * v * | + * (skip if invalid data pool) TRIM IMAGE * * * * * | + * | * | + * v * | + * DETACH CHILD * | + * | * | + * v * v + * CLOSE IMAGE < * * * * | + * | | + * error v | + * /------<--------\ REMOVE HEADER<--------------/ + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE JOURNAL + * | | / | + * | |-------<-------/ | + * | | v + * v ^ REMOVE OBJECTMAP + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE MIRROR IMAGE + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE ID OBJECT + * | | / | + * | |-------<-------/ | + * | | v + * | | REMOVE IMAGE + * | | / | + * | \-------<-------/ | + * | v + * \------------------>------------ + * + * @endverbatim + */ + + RemoveRequest(librados::IoCtx &ioctx, const std::string &image_name, + const std::string &image_id, bool force, bool from_trash_remove, + ProgressContext &prog_ctx, ContextWQ *op_work_queue, + Context *on_finish); + + RemoveRequest(librados::IoCtx &ioctx, ImageCtxT *image_ctx, bool force, + bool from_trash_remove, ProgressContext &prog_ctx, + ContextWQ *op_work_queue, Context *on_finish); + + librados::IoCtx &m_ioctx; + std::string m_image_name; + std::string m_image_id; + ImageCtxT *m_image_ctx = nullptr; + bool m_force; + bool m_from_trash_remove; + ProgressContext &m_prog_ctx; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + std::string m_header_oid; + bool m_old_format = false; + bool m_unknown_format = true; + + librados::IoCtx m_parent_io_ctx; + + decltype(m_image_ctx->exclusive_lock) m_exclusive_lock = nullptr; + + int m_ret_val = 0; + bufferlist m_out_bl; + std::list m_watchers; + + std::map m_snap_infos; + + void open_image(); + void handle_open_image(int r); + + void send_journal_remove(); + void handle_journal_remove(int r); + + void send_object_map_remove(); + void handle_object_map_remove(int r); + + void mirror_image_remove(); + void handle_mirror_image_remove(int r); + + void pre_remove_image(); + void handle_pre_remove_image(int r); + + void trim_image(); + void handle_trim_image(int r); + + void detach_child(); + void handle_detach_child(int r); + + void send_disable_mirror(); + void handle_disable_mirror(int r); + + void send_close_image(int r); + void handle_send_close_image(int r); + + void remove_header(); + void handle_remove_header(int r); + + void remove_header_v2(); + void handle_remove_header_v2(int r); + + void remove_image(); + + void remove_v1_image(); + void handle_remove_v1_image(int r); + + void remove_v2_image(); + + void dir_get_image_id(); + void handle_dir_get_image_id(int r); + + void dir_get_image_name(); + void handle_dir_get_image_name(int r); + + void remove_id_object(); + void handle_remove_id_object(int r); + + void dir_remove_image(); + void handle_dir_remove_image(int r); + + void finish(int r); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::RemoveRequest; + +#endif // CEPH_LIBRBD_IMAGE_REMOVE_REQUEST_H diff --git a/src/librbd/image/SetFlagsRequest.cc 
b/src/librbd/image/SetFlagsRequest.cc new file mode 100644 index 000000000..fa00ed981 --- /dev/null +++ b/src/librbd/image/SetFlagsRequest.cc @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/SetFlagsRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::SetFlagsRequest: " + +namespace librbd { +namespace image { + +using util::create_context_callback; +using util::create_rados_callback; + +template +SetFlagsRequest::SetFlagsRequest(I *image_ctx, uint64_t flags, + uint64_t mask, Context *on_finish) + : m_image_ctx(image_ctx), m_flags(flags), m_mask(mask), + m_on_finish(on_finish) { +} + +template +void SetFlagsRequest::send() { + send_set_flags(); +} + +template +void SetFlagsRequest::send_set_flags() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + std::unique_lock image_locker{m_image_ctx->image_lock}; + std::vector snap_ids; + snap_ids.push_back(CEPH_NOSNAP); + for (auto it : m_image_ctx->snap_info) { + snap_ids.push_back(it.first); + } + + Context *ctx = create_context_callback< + SetFlagsRequest, &SetFlagsRequest::handle_set_flags>(this); + C_Gather *gather_ctx = new C_Gather(cct, ctx); + + for (auto snap_id : snap_ids) { + librados::ObjectWriteOperation op; + cls_client::set_flags(&op, snap_id, m_flags, m_mask); + + librados::AioCompletion *comp = + create_rados_callback(gather_ctx->new_sub()); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + gather_ctx->activate(); +} + +template +Context *SetFlagsRequest::handle_set_flags(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "set_flags failed: " << cpp_strerror(*result) + << dendl; + } + return m_on_finish; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::SetFlagsRequest; diff --git a/src/librbd/image/SetFlagsRequest.h b/src/librbd/image/SetFlagsRequest.h new file mode 100644 index 000000000..be67e176a --- /dev/null +++ b/src/librbd/image/SetFlagsRequest.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H +#define CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H + +#include "include/buffer.h" +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace image { + +template +class SetFlagsRequest { +public: + static SetFlagsRequest *create(ImageCtxT *image_ctx, uint64_t flags, + uint64_t mask, Context *on_finish) { + return new SetFlagsRequest(image_ctx, flags, mask, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * + * | . . . + * v v . + * SET_FLAGS . (for every snapshot) + * | . . + * v . . . 
+ * + * + * @endverbatim + */ + + SetFlagsRequest(ImageCtxT *image_ctx, uint64_t flags, uint64_t mask, + Context *on_finish); + + ImageCtxT *m_image_ctx; + uint64_t m_flags; + uint64_t m_mask; + Context *m_on_finish; + + void send_set_flags(); + Context *handle_set_flags(int *result); +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::SetFlagsRequest; + +#endif // CEPH_LIBRBD_IMAGE_SET_FLAGS_REQUEST_H diff --git a/src/librbd/image/SetSnapRequest.cc b/src/librbd/image/SetSnapRequest.cc new file mode 100644 index 000000000..fbc234aef --- /dev/null +++ b/src/librbd/image/SetSnapRequest.cc @@ -0,0 +1,368 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image/SetSnapRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/image/RefreshParentRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::SetSnapRequest: " + +namespace librbd { +namespace image { + +using util::create_context_callback; + +template +SetSnapRequest::SetSnapRequest(I &image_ctx, uint64_t snap_id, + Context *on_finish) + : m_image_ctx(image_ctx), m_snap_id(snap_id), m_on_finish(on_finish), + m_exclusive_lock(nullptr), m_object_map(nullptr), m_refresh_parent(nullptr), + m_writes_blocked(false) { +} + +template +SetSnapRequest::~SetSnapRequest() { + ceph_assert(!m_writes_blocked); + delete m_refresh_parent; + if (m_object_map) { + m_object_map->put(); + } + if (m_exclusive_lock) { + m_exclusive_lock->put(); + } +} + +template +void SetSnapRequest::send() { + if (m_snap_id == CEPH_NOSNAP) { + send_init_exclusive_lock(); + } else { + send_block_writes(); + } +} + +template +void SetSnapRequest::send_init_exclusive_lock() { + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + if (m_image_ctx.exclusive_lock != nullptr) { + ceph_assert(m_image_ctx.snap_id == CEPH_NOSNAP); + send_complete(); + return; + } + } + + if (m_image_ctx.read_only || + !m_image_ctx.test_features(RBD_FEATURE_EXCLUSIVE_LOCK)) { + int r = 0; + if (send_refresh_parent(&r) != nullptr) { + send_complete(); + } + return; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + m_exclusive_lock = ExclusiveLock::create(m_image_ctx); + + using klass = SetSnapRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_init_exclusive_lock>(this); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + m_exclusive_lock->init(m_image_ctx.features, ctx); +} + +template +Context *SetSnapRequest::handle_init_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to initialize exclusive lock: " + << cpp_strerror(*result) << dendl; + finalize(); + return m_on_finish; + } + return send_refresh_parent(result); +} + +template +void SetSnapRequest::send_block_writes() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + m_writes_blocked = true; + + using klass = SetSnapRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_block_writes>(this); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + m_image_ctx.io_image_dispatcher->block_writes(ctx); +} + +template +Context 
*SetSnapRequest::handle_block_writes(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) + << dendl; + finalize(); + return m_on_finish; + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + auto it = m_image_ctx.snap_info.find(m_snap_id); + if (it == m_image_ctx.snap_info.end()) { + ldout(cct, 5) << "failed to locate snapshot '" << m_snap_id << "'" + << dendl; + + *result = -ENOENT; + finalize(); + return m_on_finish; + } + } + + return send_shut_down_exclusive_lock(result); +} + +template +Context *SetSnapRequest::send_shut_down_exclusive_lock(int *result) { + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + m_exclusive_lock = m_image_ctx.exclusive_lock; + } + + if (m_exclusive_lock == nullptr) { + return send_refresh_parent(result); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + using klass = SetSnapRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_shut_down_exclusive_lock>(this); + m_exclusive_lock->shut_down(ctx); + return nullptr; +} + +template +Context *SetSnapRequest::handle_shut_down_exclusive_lock(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to shut down exclusive lock: " + << cpp_strerror(*result) << dendl; + finalize(); + return m_on_finish; + } + + return send_refresh_parent(result); +} + +template +Context *SetSnapRequest::send_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + + ParentImageInfo parent_md; + bool refresh_parent; + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + + const auto parent_info = m_image_ctx.get_parent_info(m_snap_id); + if (parent_info == nullptr) { + *result = -ENOENT; + lderr(cct) << "failed to retrieve snapshot parent info" << dendl; + finalize(); + return m_on_finish; + } + + parent_md = *parent_info; + refresh_parent = RefreshParentRequest::is_refresh_required( + m_image_ctx, parent_md, m_image_ctx.migration_info); + } + + if (!refresh_parent) { + if (m_snap_id == CEPH_NOSNAP) { + // object map is loaded when exclusive lock is acquired + *result = apply(); + finalize(); + return m_on_finish; + } else { + // load snapshot object map + return send_open_object_map(result); + } + } + + ldout(cct, 10) << __func__ << dendl; + + using klass = SetSnapRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_refresh_parent>(this); + m_refresh_parent = RefreshParentRequest::create(m_image_ctx, parent_md, + m_image_ctx.migration_info, + ctx); + m_refresh_parent->send(); + return nullptr; +} + +template +Context *SetSnapRequest::handle_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to refresh snapshot parent: " << cpp_strerror(*result) + << dendl; + finalize(); + return m_on_finish; + } + + if (m_snap_id == CEPH_NOSNAP) { + // object map is loaded when exclusive lock is acquired + *result = apply(); + if (*result < 0) { + finalize(); + return m_on_finish; + } + + return send_finalize_refresh_parent(result); + } else { + // load snapshot object map + return send_open_object_map(result); + } +} + +template +Context *SetSnapRequest::send_open_object_map(int *result) { + if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + 
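   // object map feature disabled: skip OPEN_OBJECT_MAP and apply the
+    // snapshot state directly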
*result = apply(); + if (*result < 0) { + finalize(); + return m_on_finish; + } + + return send_finalize_refresh_parent(result); + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + using klass = SetSnapRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_open_object_map>(this); + m_object_map = ObjectMap::create(m_image_ctx, m_snap_id); + m_object_map->open(ctx); + return nullptr; +} + +template +Context *SetSnapRequest::handle_open_object_map(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to open object map: " << cpp_strerror(*result) + << dendl; + m_object_map->put(); + m_object_map = nullptr; + } + + *result = apply(); + if (*result < 0) { + finalize(); + return m_on_finish; + } + + return send_finalize_refresh_parent(result); +} + +template +Context *SetSnapRequest::send_finalize_refresh_parent(int *result) { + if (m_refresh_parent == nullptr) { + finalize(); + return m_on_finish; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = SetSnapRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_finalize_refresh_parent>(this); + m_refresh_parent->finalize(ctx); + return nullptr; +} + +template +Context *SetSnapRequest::handle_finalize_refresh_parent(int *result) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close parent image: " << cpp_strerror(*result) + << dendl; + } + finalize(); + return m_on_finish; +} + +template +int SetSnapRequest::apply() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << __func__ << dendl; + + std::scoped_lock locker{m_image_ctx.owner_lock, m_image_ctx.image_lock}; + if (m_snap_id != CEPH_NOSNAP) { + ceph_assert(m_image_ctx.exclusive_lock == nullptr); + int r = m_image_ctx.snap_set(m_snap_id); + if (r < 0) { + return r; + } + } else { + std::swap(m_image_ctx.exclusive_lock, m_exclusive_lock); + m_image_ctx.snap_unset(); + } + + if (m_refresh_parent != nullptr) { + m_refresh_parent->apply(); + } + + std::swap(m_object_map, m_image_ctx.object_map); + return 0; +} + +template +void SetSnapRequest::finalize() { + if (m_writes_blocked) { + m_image_ctx.io_image_dispatcher->unblock_writes(); + m_writes_blocked = false; + } +} + +template +void SetSnapRequest::send_complete() { + finalize(); + m_on_finish->complete(0); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::SetSnapRequest; diff --git a/src/librbd/image/SetSnapRequest.h b/src/librbd/image/SetSnapRequest.h new file mode 100644 index 000000000..c12ea9f27 --- /dev/null +++ b/src/librbd/image/SetSnapRequest.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H +#define CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H + +#include "cls/rbd/cls_rbd_client.h" +#include + +class Context; + +namespace librbd { + +template class ExclusiveLock; +class ImageCtx; +template class ObjectMap; + +namespace image { + +template class RefreshParentRequest; + +template +class SetSnapRequest { +public: + static SetSnapRequest *create(ImageCtxT &image_ctx, uint64_t snap_id, + Context *on_finish) { + return new SetSnapRequest(image_ctx, snap_id, on_finish); + } + + ~SetSnapRequest(); + + void send(); + +private: 
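+  // Usage sketch (editorial annotation, not part of the upstream header):
+  // callers allocate the request and drive the state machine below with a
+  // Context, e.g.
+  //
+  //   C_SaferCond ctx;
+  //   auto *req = SetSnapRequest<>::create(*image_ctx, snap_id, &ctx);
+  //   req->send();
+  //   int r = ctx.wait();  // 0 on success, e.g. -ENOENT if snap_id is gone
+  //
+  // The request deletes itself once the callback completes.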
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    | (set snap)
+   *    |-----------> BLOCK_WRITES
+   *    |                 |
+   *    |                 v
+   *    |             SHUTDOWN_EXCLUSIVE_LOCK (skip if lock inactive
+   *    |                 |                    or disabled)
+   *    |                 v
+   *    |             REFRESH_PARENT (skip if no parent
+   *    |                 |           or refresh not needed)
+   *    |                 v
+   *    |             OPEN_OBJECT_MAP (skip if map disabled)
+   *    |                 |
+   *    |                 v
+   *    |             <apply>
+   *    |                 |
+   *    |                 v
+   *    |             FINALIZE_REFRESH_PARENT (skip if no parent
+   *    |                 |                    or refresh not needed)
+   *    |                 v
+   *    |             <finish>
+   *    |
+   *    \-----------> INIT_EXCLUSIVE_LOCK (skip if active or
+   *                      |                disabled)
+   *                      v
+   *                  REFRESH_PARENT (skip if no parent
+   *                      |           or refresh not needed)
+   *                      v
+   *                  <apply>
+   *                      |
+   *                      v
+   *                  FINALIZE_REFRESH_PARENT (skip if no parent
+   *                      |                    or refresh not needed)
+   *                      v
+   *                  <finish>
+   *
+   * @endverbatim
+   */
+
+  SetSnapRequest(ImageCtxT &image_ctx, uint64_t snap_id, Context *on_finish);
+
+  ImageCtxT &m_image_ctx;
+  uint64_t m_snap_id;
+  Context *m_on_finish;
+
+  ExclusiveLock<ImageCtxT> *m_exclusive_lock;
+  ObjectMap<ImageCtxT> *m_object_map;
+  RefreshParentRequest<ImageCtxT> *m_refresh_parent;
+
+  bool m_writes_blocked;
+
+  void send_block_writes();
+  Context *handle_block_writes(int *result);
+
+  void send_init_exclusive_lock();
+  Context *handle_init_exclusive_lock(int *result);
+
+  Context *send_shut_down_exclusive_lock(int *result);
+  Context *handle_shut_down_exclusive_lock(int *result);
+
+  Context *send_refresh_parent(int *result);
+  Context *handle_refresh_parent(int *result);
+
+  Context *send_open_object_map(int *result);
+  Context *handle_open_object_map(int *result);
+
+  Context *send_finalize_refresh_parent(int *result);
+  Context *handle_finalize_refresh_parent(int *result);
+
+  int apply();
+  void finalize();
+  void send_complete();
+};
+
+} // namespace image
+} // namespace librbd
+
+extern template class librbd::image::SetSnapRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IMAGE_SNAP_SET_REQUEST_H
diff --git a/src/librbd/image/TypeTraits.h b/src/librbd/image/TypeTraits.h
new file mode 100644
index 000000000..2989e30b5
--- /dev/null
+++ b/src/librbd/image/TypeTraits.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
+#define CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace image {
+
+template <typename ImageCtxT>
+struct TypeTraits {
+  typedef asio::ContextWQ ContextWQ;
+};
+
+} // namespace image
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IMAGE_TYPE_TRAITS_H
diff --git a/src/librbd/image/Types.h b/src/librbd/image/Types.h
new file mode 100644
index 000000000..44c66e227
--- /dev/null
+++ b/src/librbd/image/Types.h
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRBD_IMAGE_TYPES_H
+#define LIBRBD_IMAGE_TYPES_H
+
+namespace librbd {
+namespace image {
+
+enum {
+  CREATE_FLAG_SKIP_MIRROR_ENABLE  = 1 << 0,
+  CREATE_FLAG_FORCE_MIRROR_ENABLE = 1 << 1,
+  CREATE_FLAG_MIRROR_ENABLE_MASK  = (CREATE_FLAG_SKIP_MIRROR_ENABLE |
+                                     CREATE_FLAG_FORCE_MIRROR_ENABLE),
+};
+
+} // namespace image
+} // librbd
+
+#endif // LIBRBD_IMAGE_TYPES_H
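The mirror-enable bits above are consumed by the image creation path as a plain bitmask (see the create() implementation in internal.cc below). A hedged illustration of the intended combination; the enum names are real, the call site is invented:

    // Sketch: callers set at most one of the two mirror-enable overrides.
    uint32_t create_flags = librbd::image::CREATE_FLAG_SKIP_MIRROR_ENABLE;
    if ((create_flags & librbd::image::CREATE_FLAG_MIRROR_ENABLE_MASK) != 0) {
      // the pool-level mirroring policy is being overridden for this image
    }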
+#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image::ValidatePoolRequest: " \ + << __func__ << ": " + +namespace librbd { +namespace image { + +namespace { + +const std::string OVERWRITE_VALIDATED("overwrite validated"); +const std::string VALIDATE("validate"); + +} // anonymous namespace + +using util::create_rados_callback; +using util::create_context_callback; +using util::create_async_context_callback; + +template +ValidatePoolRequest::ValidatePoolRequest(librados::IoCtx& io_ctx, + Context *on_finish) + : m_cct(reinterpret_cast(io_ctx.cct())), + m_on_finish(on_finish) { + // validatation should occur in default namespace + m_io_ctx.dup(io_ctx); + m_io_ctx.set_namespace(""); + } + +template +void ValidatePoolRequest::send() { + read_rbd_info(); +} + +template +void ValidatePoolRequest::read_rbd_info() { + ldout(m_cct, 5) << dendl; + + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_read_rbd_info>(this); + + librados::ObjectReadOperation op; + op.read(0, 0, nullptr, nullptr); + + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void ValidatePoolRequest::handle_read_rbd_info(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r >= 0) { + bufferlist validated_bl; + validated_bl.append(OVERWRITE_VALIDATED); + + bufferlist validate_bl; + validate_bl.append(VALIDATE); + + if (m_out_bl.contents_equal(validated_bl)) { + // already validated pool + finish(0); + return; + } else if (m_out_bl.contents_equal(validate_bl)) { + // implies snapshot was already successfully created + overwrite_rbd_info(); + return; + } + } else if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to read RBD info: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + create_snapshot(); +} + +template +void ValidatePoolRequest::create_snapshot() { + ldout(m_cct, 5) << dendl; + + // allocate a self-managed snapshot id if this a new pool to force + // self-managed snapshot mode + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_create_snapshot>(this); + m_io_ctx.aio_selfmanaged_snap_create(&m_snap_id, comp); + comp->release(); +} + +template +void ValidatePoolRequest::handle_create_snapshot(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r == -EINVAL) { + lderr(m_cct) << "pool not configured for self-managed RBD snapshot support" + << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to allocate self-managed snapshot: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + write_rbd_info(); +} + +template +void ValidatePoolRequest::write_rbd_info() { + ldout(m_cct, 5) << dendl; + + bufferlist bl; + bl.append(VALIDATE); + + librados::ObjectWriteOperation op; + op.create(true); + op.write(0, bl); + + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_write_rbd_info>(this); + int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void ValidatePoolRequest::handle_write_rbd_info(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + lderr(m_cct) << "pool missing required overwrite support" << dendl; + m_ret_val = -EINVAL; + } else if (r < 0 && r != -EEXIST) { + lderr(m_cct) << "failed to write RBD info: " << 
cpp_strerror(r) << dendl; + m_ret_val = r; + } + + remove_snapshot(); +} + +template +void ValidatePoolRequest::remove_snapshot() { + ldout(m_cct, 5) << dendl; + + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_remove_snapshot>(this); + m_io_ctx.aio_selfmanaged_snap_remove(m_snap_id, comp); + comp->release(); +} + +template +void ValidatePoolRequest::handle_remove_snapshot(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r < 0) { + // not a fatal error + lderr(m_cct) << "failed to remove validation snapshot: " << cpp_strerror(r) + << dendl; + } + + if (m_ret_val < 0) { + finish(m_ret_val); + return; + } + + overwrite_rbd_info(); +} + +template +void ValidatePoolRequest::overwrite_rbd_info() { + ldout(m_cct, 5) << dendl; + + bufferlist bl; + bl.append(OVERWRITE_VALIDATED); + + librados::ObjectWriteOperation op; + op.write(0, bl); + + auto comp = create_rados_callback< + ValidatePoolRequest, + &ValidatePoolRequest::handle_overwrite_rbd_info>(this); + int r = m_io_ctx.aio_operate(RBD_INFO, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void ValidatePoolRequest::handle_overwrite_rbd_info(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + lderr(m_cct) << "pool missing required overwrite support" << dendl; + finish(-EINVAL); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to validate overwrite support: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void ValidatePoolRequest::finish(int r) { + ldout(m_cct, 5) << "r=" << r << dendl; + m_on_finish->complete(r); + delete this; +} + +} // namespace image +} // namespace librbd + +template class librbd::image::ValidatePoolRequest; diff --git a/src/librbd/image/ValidatePoolRequest.h b/src/librbd/image/ValidatePoolRequest.h new file mode 100644 index 000000000..74f384417 --- /dev/null +++ b/src/librbd/image/ValidatePoolRequest.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H +#define CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H + +#include "include/common_fwd.h" +#include "include/rados/librados.hpp" +#include "include/buffer.h" + +class Context; + +namespace librbd { + +struct ImageCtx; +namespace asio { struct ContextWQ; } + +namespace image { + +template +class ValidatePoolRequest { +public: + static ValidatePoolRequest* create(librados::IoCtx& io_ctx, + Context *on_finish) { + return new ValidatePoolRequest(io_ctx, on_finish); + } + + ValidatePoolRequest(librados::IoCtx& io_ctx, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v (overwrites validated) + * READ RBD INFO . . . . . . . . . + * | . . + * | . (snapshots validated) . + * | . . . . . . . . . . . + * v . . + * CREATE SNAPSHOT . . + * | . . + * v . . + * WRITE RBD INFO . . + * | . . + * v . . + * REMOVE SNAPSHOT . . + * | . . + * v . . + * OVERWRITE RBD INFO < . . . . + * | . + * v . + * < . . . . . . . . . 
.` + * + * @endverbatim + */ + + librados::IoCtx m_io_ctx; + CephContext* m_cct; + Context* m_on_finish; + + int m_ret_val = 0; + bufferlist m_out_bl; + uint64_t m_snap_id = 0; + + void read_rbd_info(); + void handle_read_rbd_info(int r); + + void create_snapshot(); + void handle_create_snapshot(int r); + + void write_rbd_info(); + void handle_write_rbd_info(int r); + + void remove_snapshot(); + void handle_remove_snapshot(int r); + + void overwrite_rbd_info(); + void handle_overwrite_rbd_info(int r); + + void finish(int r); + +}; + +} // namespace image +} // namespace librbd + +extern template class librbd::image::ValidatePoolRequest; + +#endif // CEPH_LIBRBD_IMAGE_VALIDATE_POOL_REQUEST_H diff --git a/src/librbd/image_watcher/NotifyLockOwner.cc b/src/librbd/image_watcher/NotifyLockOwner.cc new file mode 100644 index 000000000..fe441d7f2 --- /dev/null +++ b/src/librbd/image_watcher/NotifyLockOwner.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/image_watcher/NotifyLockOwner.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/WatchNotifyTypes.h" +#include "librbd/watcher/Notifier.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::image_watcher::NotifyLockOwner: " \ + << this << " " << __func__ + +namespace librbd { + +namespace image_watcher { + +using namespace watch_notify; +using util::create_context_callback; + +NotifyLockOwner::NotifyLockOwner(ImageCtx &image_ctx, + watcher::Notifier ¬ifier, + bufferlist &&bl, Context *on_finish) + : m_image_ctx(image_ctx), m_notifier(notifier), m_bl(std::move(bl)), + m_on_finish(on_finish) { +} + +void NotifyLockOwner::send() { + send_notify(); +} + +void NotifyLockOwner::send_notify() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + m_notifier.notify(m_bl, &m_notify_response, create_context_callback< + NotifyLockOwner, &NotifyLockOwner::handle_notify>(this)); +} + +void NotifyLockOwner::handle_notify(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": r=" << r << dendl; + + if (r < 0 && r != -ETIMEDOUT) { + lderr(cct) << ": lock owner notification failed: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + bufferlist response; + bool lock_owner_responded = false; + for (auto &it : m_notify_response.acks) { + if (it.second.length() > 0) { + if (lock_owner_responded) { + lderr(cct) << ": duplicate lock owners detected" << dendl; + finish(-EINVAL); + return; + } + lock_owner_responded = true; + response = std::move(it.second); + } + } + + if (!lock_owner_responded) { + ldout(cct, 1) << ": no lock owners detected" << dendl; + finish(-ETIMEDOUT); + return; + } + + try { + auto iter = response.cbegin(); + + ResponseMessage response_message; + using ceph::decode; + decode(response_message, iter); + + r = response_message.result; + ldout(cct, 20) << " client responded with r=" << r << dendl; + } catch (const buffer::error &err) { + r = -EINVAL; + } + finish(r); +} + +void NotifyLockOwner::finish(int r) { + m_on_finish->complete(r); + delete this; +} + +} // namespace image_watcher +} // namespace librbd diff --git a/src/librbd/image_watcher/NotifyLockOwner.h b/src/librbd/image_watcher/NotifyLockOwner.h new file mode 100644 index 000000000..6249bc128 --- /dev/null +++ b/src/librbd/image_watcher/NotifyLockOwner.h @@ -0,0 +1,50 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H +#define CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H + +#include "include/buffer.h" +#include "librbd/watcher/Types.h" + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace watcher { class Notifier; } + +namespace image_watcher { + +class NotifyLockOwner { +public: + static NotifyLockOwner *create(ImageCtx &image_ctx, + watcher::Notifier ¬ifier, + bufferlist &&bl, Context *on_finish) { + return new NotifyLockOwner(image_ctx, notifier, std::move(bl), on_finish); + } + + NotifyLockOwner(ImageCtx &image_ctx, watcher::Notifier ¬ifier, + bufferlist &&bl, Context *on_finish); + + void send(); + +private: + ImageCtx &m_image_ctx; + watcher::Notifier &m_notifier; + + bufferlist m_bl; + watcher::NotifyResponse m_notify_response; + Context *m_on_finish; + + void send_notify(); + void handle_notify(int r); + + void finish(int r); +}; + +} // namespace image_watcher +} // namespace librbd + +#endif // CEPH_LIBRBD_IMAGE_WATCHER_NOTIFY_LOCK_OWNER_H diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc new file mode 100644 index 000000000..3cd699b2c --- /dev/null +++ b/src/librbd/internal.cc @@ -0,0 +1,1740 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "include/int_types.h" + +#include +#include + +#include "include/types.h" +#include "include/uuid.h" +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Throttle.h" +#include "common/event_socket.h" +#include "common/perf_counters.h" +#include "osdc/Striper.h" +#include "include/stringify.h" + +#include "cls/lock/cls_lock_client.h" +#include "cls/rbd/cls_rbd.h" +#include "cls/rbd/cls_rbd_types.h" +#include "cls/rbd/cls_rbd_client.h" +#include "cls/journal/cls_journal_types.h" +#include "cls/journal/cls_journal_client.h" + +#include "librbd/AsioEngine.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Operations.h" +#include "librbd/PluginRegistry.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/Image.h" +#include "librbd/api/Io.h" +#include "librbd/cache/Utils.h" +#include "librbd/exclusive_lock/AutomaticPolicy.h" +#include "librbd/exclusive_lock/StandardPolicy.h" +#include "librbd/deep_copy/MetadataCopyRequest.h" +#include "librbd/image/CloneRequest.h" +#include "librbd/image/CreateRequest.h" +#include "librbd/image/GetMetadataRequest.h" +#include "librbd/image/Types.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ReadResult.h" +#include "librbd/journal/Types.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/operation/TrimRequest.h" + +#include "journal/Journaler.h" + +#include +#include +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd: " + +#define rbd_howmany(x, y) (((x) + (y) - 1) / (y)) + +using std::istringstream; +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; +// list binds to 
list() here, so std::list is explicitly used below
+
+using ceph::bufferlist;
+using librados::snap_t;
+using librados::IoCtx;
+using librados::Rados;
+
+namespace librbd {
+
+namespace {
+
+int validate_pool(IoCtx &io_ctx, CephContext *cct) {
+  if (!cct->_conf.get_val<bool>("rbd_validate_pool")) {
+    return 0;
+  }
+
+  int r = io_ctx.stat(RBD_DIRECTORY, NULL, NULL);
+  if (r == 0) {
+    return 0;
+  } else if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to stat RBD directory: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // allocate a self-managed snapshot id if this is a new pool to force
+  // self-managed snapshot mode
+  uint64_t snap_id;
+  r = io_ctx.selfmanaged_snap_create(&snap_id);
+  if (r == -EINVAL) {
+    lderr(cct) << "pool not configured for self-managed RBD snapshot support"
+               << dendl;
+    return r;
+  } else if (r < 0) {
+    lderr(cct) << "failed to allocate self-managed snapshot: "
+               << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = io_ctx.selfmanaged_snap_remove(snap_id);
+  if (r < 0) {
+    lderr(cct) << "failed to release self-managed snapshot " << snap_id
+               << ": " << cpp_strerror(r) << dendl;
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+  int detect_format(IoCtx &io_ctx, const string &name,
+                    bool *old_format, uint64_t *size)
+  {
+    CephContext *cct = (CephContext *)io_ctx.cct();
+    if (old_format)
+      *old_format = true;
+    int r = io_ctx.stat(util::old_header_name(name), size, NULL);
+    if (r == -ENOENT) {
+      if (old_format)
+        *old_format = false;
+      r = io_ctx.stat(util::id_obj_name(name), size, NULL);
+      if (r < 0)
+        return r;
+    } else if (r < 0) {
+      return r;
+    }
+
+    ldout(cct, 20) << "detect format of " << name << " : "
+                   << (old_format ? (*old_format ? "old" : "new") :
+                       "don't care") << dendl;
+    return 0;
+  }
+
+  bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap)
+  {
+    return (parent_pool_id != -1 && off <= overlap);
+  }
+
+  void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
+                       uint64_t size, int order, uint64_t bid)
+  {
+    uint32_t hi = bid >> 32;
+    uint32_t lo = bid & 0xFFFFFFFF;
+    uint32_t extra = rand() % 0xFFFFFFFF;
+    // FIPS zeroization audit 20191117: this memset is not security related.
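+    // zero-fill the v1 on-disk header before populating its fixed fields below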
+ memset(&ondisk, 0, sizeof(ondisk)); + + memcpy(&ondisk.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)); + memcpy(&ondisk.signature, RBD_HEADER_SIGNATURE, + sizeof(RBD_HEADER_SIGNATURE)); + memcpy(&ondisk.version, RBD_HEADER_VERSION, sizeof(RBD_HEADER_VERSION)); + + snprintf(ondisk.block_name, sizeof(ondisk.block_name), "rb.%x.%x.%x", + hi, lo, extra); + + ondisk.image_size = size; + ondisk.options.order = order; + ondisk.options.crypt_type = RBD_CRYPT_NONE; + ondisk.options.comp_type = RBD_COMP_NONE; + ondisk.snap_seq = 0; + ondisk.snap_count = 0; + ondisk.reserved = 0; + ondisk.snap_names_len = 0; + } + + void image_info(ImageCtx *ictx, image_info_t& info, size_t infosize) + { + int obj_order = ictx->order; + { + std::shared_lock locker{ictx->image_lock}; + info.size = ictx->get_area_size(io::ImageArea::DATA); + } + info.obj_size = 1ULL << obj_order; + info.num_objs = Striper::get_num_objects(ictx->layout, info.size); + info.order = obj_order; + strncpy(info.block_name_prefix, ictx->object_prefix.c_str(), + RBD_MAX_BLOCK_NAME_SIZE); + info.block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE - 1] = '\0'; + + // clear deprecated fields + info.parent_pool = -1L; + info.parent_name[0] = '\0'; + } + + uint64_t oid_to_object_no(const string& oid, const string& object_prefix) + { + istringstream iss(oid); + // skip object prefix and separator + iss.ignore(object_prefix.length() + 1); + uint64_t num; + iss >> std::hex >> num; + return num; + } + + int read_header_bl(IoCtx& io_ctx, const string& header_oid, + bufferlist& header, uint64_t *ver) + { + int r; + uint64_t off = 0; +#define READ_SIZE 4096 + do { + bufferlist bl; + r = io_ctx.read(header_oid, bl, READ_SIZE, off); + if (r < 0) + return r; + header.claim_append(bl); + off += r; + } while (r == READ_SIZE); + + static_assert(sizeof(RBD_HEADER_TEXT) == sizeof(RBD_MIGRATE_HEADER_TEXT), + "length of rbd headers must be the same"); + + if (header.length() < sizeof(RBD_HEADER_TEXT) || + (memcmp(RBD_HEADER_TEXT, header.c_str(), + sizeof(RBD_HEADER_TEXT)) != 0 && + memcmp(RBD_MIGRATE_HEADER_TEXT, header.c_str(), + sizeof(RBD_MIGRATE_HEADER_TEXT)) != 0)) { + CephContext *cct = (CephContext *)io_ctx.cct(); + lderr(cct) << "unrecognized header format" << dendl; + return -ENXIO; + } + + if (ver) + *ver = io_ctx.get_last_version(); + + return 0; + } + + int read_header(IoCtx& io_ctx, const string& header_oid, + struct rbd_obj_header_ondisk *header, uint64_t *ver) + { + bufferlist header_bl; + int r = read_header_bl(io_ctx, header_oid, header_bl, ver); + if (r < 0) + return r; + if (header_bl.length() < (int)sizeof(*header)) + return -EIO; + memcpy(header, header_bl.c_str(), sizeof(*header)); + + return 0; + } + + int tmap_set(IoCtx& io_ctx, const string& imgname) + { + bufferlist cmdbl, emptybl; + __u8 c = CEPH_OSD_TMAP_SET; + encode(c, cmdbl); + encode(imgname, cmdbl); + encode(emptybl, cmdbl); + return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl); + } + + int tmap_rm(IoCtx& io_ctx, const string& imgname) + { + bufferlist cmdbl; + __u8 c = CEPH_OSD_TMAP_RM; + encode(c, cmdbl); + encode(imgname, cmdbl); + return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl); + } + + typedef boost::variant image_option_value_t; + typedef std::map image_options_t; + typedef std::shared_ptr image_options_ref; + + enum image_option_type_t { + STR, + UINT64, + }; + + const std::map IMAGE_OPTIONS_TYPE_MAPPING = { + {RBD_IMAGE_OPTION_FORMAT, UINT64}, + {RBD_IMAGE_OPTION_FEATURES, UINT64}, + {RBD_IMAGE_OPTION_ORDER, UINT64}, + {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64}, + 
{RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64}, + {RBD_IMAGE_OPTION_JOURNAL_ORDER, UINT64}, + {RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH, UINT64}, + {RBD_IMAGE_OPTION_JOURNAL_POOL, STR}, + {RBD_IMAGE_OPTION_FEATURES_SET, UINT64}, + {RBD_IMAGE_OPTION_FEATURES_CLEAR, UINT64}, + {RBD_IMAGE_OPTION_DATA_POOL, STR}, + {RBD_IMAGE_OPTION_FLATTEN, UINT64}, + {RBD_IMAGE_OPTION_CLONE_FORMAT, UINT64}, + {RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, UINT64}, + }; + + std::string image_option_name(int optname) { + switch (optname) { + case RBD_IMAGE_OPTION_FORMAT: + return "format"; + case RBD_IMAGE_OPTION_FEATURES: + return "features"; + case RBD_IMAGE_OPTION_ORDER: + return "order"; + case RBD_IMAGE_OPTION_STRIPE_UNIT: + return "stripe_unit"; + case RBD_IMAGE_OPTION_STRIPE_COUNT: + return "stripe_count"; + case RBD_IMAGE_OPTION_JOURNAL_ORDER: + return "journal_order"; + case RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH: + return "journal_splay_width"; + case RBD_IMAGE_OPTION_JOURNAL_POOL: + return "journal_pool"; + case RBD_IMAGE_OPTION_FEATURES_SET: + return "features_set"; + case RBD_IMAGE_OPTION_FEATURES_CLEAR: + return "features_clear"; + case RBD_IMAGE_OPTION_DATA_POOL: + return "data_pool"; + case RBD_IMAGE_OPTION_FLATTEN: + return "flatten"; + case RBD_IMAGE_OPTION_CLONE_FORMAT: + return "clone_format"; + case RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE: + return "mirror_image_mode"; + default: + return "unknown (" + stringify(optname) + ")"; + } + } + + void image_options_create(rbd_image_options_t* opts) + { + image_options_ref* opts_ = new image_options_ref(new image_options_t()); + + *opts = static_cast(opts_); + } + + void image_options_create_ref(rbd_image_options_t* opts, + rbd_image_options_t orig) + { + image_options_ref* orig_ = static_cast(orig); + image_options_ref* opts_ = new image_options_ref(*orig_); + + *opts = static_cast(opts_); + } + + void image_options_copy(rbd_image_options_t* opts, + const ImageOptions &orig) + { + image_options_ref* opts_ = new image_options_ref(new image_options_t()); + + *opts = static_cast(opts_); + + std::string str_val; + uint64_t uint64_val; + for (auto &i : IMAGE_OPTIONS_TYPE_MAPPING) { + switch (i.second) { + case STR: + if (orig.get(i.first, &str_val) == 0) { + image_options_set(*opts, i.first, str_val); + } + continue; + case UINT64: + if (orig.get(i.first, &uint64_val) == 0) { + image_options_set(*opts, i.first, uint64_val); + } + continue; + } + } + } + + void image_options_destroy(rbd_image_options_t opts) + { + image_options_ref* opts_ = static_cast(opts); + + delete opts_; + } + + int image_options_set(rbd_image_options_t opts, int optname, + const std::string& optval) + { + image_options_ref* opts_ = static_cast(opts); + + std::map::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) { + return -EINVAL; + } + + (*opts_->get())[optname] = optval; + return 0; + } + + int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval) + { + image_options_ref* opts_ = static_cast(opts); + + std::map::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) { + return -EINVAL; + } + + (*opts_->get())[optname] = optval; + return 0; + } + + int image_options_get(rbd_image_options_t opts, int optname, + std::string* optval) + { + image_options_ref* opts_ = static_cast(opts); + + std::map::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) { + 
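      // unknown option, or an option that is not string-typed: reject it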
return -EINVAL; + } + + image_options_t::const_iterator j = (*opts_)->find(optname); + + if (j == (*opts_)->end()) { + return -ENOENT; + } + + *optval = boost::get(j->second); + return 0; + } + + int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval) + { + image_options_ref* opts_ = static_cast(opts); + + std::map::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) { + return -EINVAL; + } + + image_options_t::const_iterator j = (*opts_)->find(optname); + + if (j == (*opts_)->end()) { + return -ENOENT; + } + + *optval = boost::get(j->second); + return 0; + } + + int image_options_is_set(rbd_image_options_t opts, int optname, + bool* is_set) + { + if (IMAGE_OPTIONS_TYPE_MAPPING.find(optname) == + IMAGE_OPTIONS_TYPE_MAPPING.end()) { + return -EINVAL; + } + + image_options_ref* opts_ = static_cast(opts); + *is_set = ((*opts_)->find(optname) != (*opts_)->end()); + return 0; + } + + int image_options_unset(rbd_image_options_t opts, int optname) + { + image_options_ref* opts_ = static_cast(opts); + + std::map::const_iterator i = + IMAGE_OPTIONS_TYPE_MAPPING.find(optname); + + if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) { + ceph_assert((*opts_)->find(optname) == (*opts_)->end()); + return -EINVAL; + } + + image_options_t::const_iterator j = (*opts_)->find(optname); + + if (j == (*opts_)->end()) { + return -ENOENT; + } + + (*opts_)->erase(j); + return 0; + } + + void image_options_clear(rbd_image_options_t opts) + { + image_options_ref* opts_ = static_cast(opts); + + (*opts_)->clear(); + } + + bool image_options_is_empty(rbd_image_options_t opts) + { + image_options_ref* opts_ = static_cast(opts); + + return (*opts_)->empty(); + } + + int create_v1(IoCtx& io_ctx, const char *imgname, uint64_t size, int order) + { + CephContext *cct = (CephContext *)io_ctx.cct(); + + ldout(cct, 20) << __func__ << " " << &io_ctx << " name = " << imgname + << " size = " << size << " order = " << order << dendl; + int r = validate_pool(io_ctx, cct); + if (r < 0) { + return r; + } + + if (!io_ctx.get_namespace().empty()) { + lderr(cct) << "attempting to add v1 image to namespace" << dendl; + return -EINVAL; + } + + ldout(cct, 2) << "adding rbd image to directory..." << dendl; + r = tmap_set(io_ctx, imgname); + if (r < 0) { + lderr(cct) << "error adding image to directory: " << cpp_strerror(r) + << dendl; + return r; + } + + Rados rados(io_ctx); + uint64_t bid = rados.get_instance_id(); + + ldout(cct, 2) << "creating rbd image..." << dendl; + struct rbd_obj_header_ondisk header; + init_rbd_header(header, size, order, bid); + + bufferlist bl; + bl.append((const char *)&header, sizeof(header)); + + string header_oid = util::old_header_name(imgname); + r = io_ctx.write(header_oid, bl, bl.length(), 0); + if (r < 0) { + lderr(cct) << "Error writing image header: " << cpp_strerror(r) + << dendl; + int remove_r = tmap_rm(io_ctx, imgname); + if (remove_r < 0) { + lderr(cct) << "Could not remove image from directory after " + << "header creation failed: " + << cpp_strerror(remove_r) << dendl; + } + return r; + } + + ldout(cct, 2) << "done." 
<< dendl; + return 0; + } + + int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size, + int *order) + { + uint64_t order_ = *order; + ImageOptions opts; + + int r = opts.set(RBD_IMAGE_OPTION_ORDER, order_); + ceph_assert(r == 0); + + r = create(io_ctx, imgname, "", size, opts, "", "", false); + + int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_); + ceph_assert(r1 == 0); + *order = order_; + + return r; + } + + int create(IoCtx& io_ctx, const char *imgname, uint64_t size, + bool old_format, uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count) + { + if (!order) + return -EINVAL; + + uint64_t order_ = *order; + uint64_t format = old_format ? 1 : 2; + ImageOptions opts; + int r; + + r = opts.set(RBD_IMAGE_OPTION_FORMAT, format); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_FEATURES, features); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_ORDER, order_); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + ceph_assert(r == 0); + r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + ceph_assert(r == 0); + + r = create(io_ctx, imgname, "", size, opts, "", "", false); + + int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_); + ceph_assert(r1 == 0); + *order = order_; + + return r; + } + + int create(IoCtx& io_ctx, const std::string &image_name, + const std::string &image_id, uint64_t size, + ImageOptions& opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable) + { + std::string id(image_id); + if (id.empty()) { + id = util::generate_image_id(io_ctx); + } + + CephContext *cct = (CephContext *)io_ctx.cct(); + uint64_t option; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) { + lderr(cct) << "create does not support 'flatten' image option" << dendl; + return -EINVAL; + } + if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) { + lderr(cct) << "create does not support 'clone_format' image option" + << dendl; + return -EINVAL; + } + + ldout(cct, 10) << __func__ << " name=" << image_name << ", " + << "id= " << id << ", " + << "size=" << size << ", opts=" << opts << dendl; + + uint64_t format; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) + format = cct->_conf.get_val("rbd_default_format"); + bool old_format = format == 1; + + // make sure it doesn't already exist, in either format + int r = detect_format(io_ctx, image_name, NULL, NULL); + if (r != -ENOENT) { + if (r) { + lderr(cct) << "Could not tell if " << image_name << " already exists" + << dendl; + return r; + } + lderr(cct) << "rbd image " << image_name << " already exists" << dendl; + return -EEXIST; + } + + uint64_t order = 0; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0 || order == 0) { + order = cct->_conf.get_val("rbd_default_order"); + } + r = image::CreateRequest<>::validate_order(cct, order); + if (r < 0) { + return r; + } + + if (old_format) { + if ( !getenv("RBD_FORCE_ALLOW_V1") ) { + lderr(cct) << "Format 1 image creation unsupported. " << dendl; + return -EINVAL; + } + lderr(cct) << "Forced V1 image creation. 
" << dendl; + r = create_v1(io_ctx, image_name.c_str(), size, order); + } else { + AsioEngine asio_engine(io_ctx); + + ConfigProxy config{cct->_conf}; + api::Config<>::apply_pool_overrides(io_ctx, &config); + + uint32_t create_flags = 0U; + uint64_t mirror_image_mode = RBD_MIRROR_IMAGE_MODE_JOURNAL; + if (skip_mirror_enable) { + create_flags = image::CREATE_FLAG_SKIP_MIRROR_ENABLE; + } else if (opts.get(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE, + &mirror_image_mode) == 0) { + create_flags = image::CREATE_FLAG_FORCE_MIRROR_ENABLE; + } + + C_SaferCond cond; + image::CreateRequest<> *req = image::CreateRequest<>::create( + config, io_ctx, image_name, id, size, opts, create_flags, + static_cast(mirror_image_mode), + non_primary_global_image_id, primary_mirror_uuid, + asio_engine.get_work_queue(), &cond); + req->send(); + + r = cond.wait(); + } + + int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order); + ceph_assert(r1 == 0); + + return r; + } + + /* + * Parent may be in different pool, hence different IoCtx + */ + int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, + uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count) + { + uint64_t order = *c_order; + + ImageOptions opts; + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + opts.set(RBD_IMAGE_OPTION_ORDER, order); + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + + int r = clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, nullptr, + c_name, opts, "", ""); + opts.get(RBD_IMAGE_OPTION_ORDER, &order); + *c_order = order; + return r; + } + + int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name, + const char *p_snap_name, IoCtx& c_ioctx, const char *c_id, + const char *c_name, ImageOptions& c_opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid) + { + ceph_assert((p_id == nullptr) ^ (p_name == nullptr)); + + CephContext *cct = (CephContext *)p_ioctx.cct(); + if (p_snap_name == nullptr) { + lderr(cct) << "image to be cloned must be a snapshot" << dendl; + return -EINVAL; + } + + uint64_t flatten; + if (c_opts.get(RBD_IMAGE_OPTION_FLATTEN, &flatten) == 0) { + lderr(cct) << "clone does not support 'flatten' image option" << dendl; + return -EINVAL; + } + + int r; + std::string parent_id; + if (p_id == nullptr) { + r = cls_client::dir_get_id(&p_ioctx, RBD_DIRECTORY, p_name, + &parent_id); + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "failed to retrieve parent image id: " + << cpp_strerror(r) << dendl; + } + return r; + } + } else { + parent_id = p_id; + } + + std::string clone_id; + if (c_id == nullptr) { + clone_id = util::generate_image_id(c_ioctx); + } else { + clone_id = c_id; + } + + ldout(cct, 10) << __func__ << " " + << "c_name=" << c_name << ", " + << "c_id= " << clone_id << ", " + << "c_opts=" << c_opts << dendl; + + ConfigProxy config{reinterpret_cast(c_ioctx.cct())->_conf}; + api::Config<>::apply_pool_overrides(c_ioctx, &config); + + AsioEngine asio_engine(p_ioctx); + + C_SaferCond cond; + auto *req = image::CloneRequest<>::create( + config, p_ioctx, parent_id, p_snap_name, + {cls::rbd::UserSnapshotNamespace{}}, CEPH_NOSNAP, c_ioctx, c_name, + clone_id, c_opts, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, + non_primary_global_image_id, primary_mirror_uuid, + asio_engine.get_work_queue(), &cond); + req->send(); + + r = cond.wait(); + if (r < 0) { + return r; + } + + return 0; + } + + int rename(IoCtx& io_ctx, const char *srcname, const char *dstname) + { + 
CephContext *cct = (CephContext *)io_ctx.cct(); + ldout(cct, 20) << "rename " << &io_ctx << " " << srcname << " -> " + << dstname << dendl; + + ImageCtx *ictx = new ImageCtx(srcname, "", "", io_ctx, false); + int r = ictx->state->open(0); + if (r < 0) { + lderr(cct) << "error opening source image: " << cpp_strerror(r) << dendl; + return r; + } + BOOST_SCOPE_EXIT((ictx)) { + ictx->state->close(); + } BOOST_SCOPE_EXIT_END + + return ictx->operations->rename(dstname); + } + + int info(ImageCtx *ictx, image_info_t& info, size_t infosize) + { + ldout(ictx->cct, 20) << "info " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + image_info(ictx, info, infosize); + return 0; + } + + int get_old_format(ImageCtx *ictx, uint8_t *old) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + *old = ictx->old_format; + return 0; + } + + int get_size(ImageCtx *ictx, uint64_t *size) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + std::shared_lock l2{ictx->image_lock}; + *size = ictx->get_area_size(io::ImageArea::DATA); + return 0; + } + + int get_features(ImageCtx *ictx, uint64_t *features) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + std::shared_lock l{ictx->image_lock}; + *features = ictx->features; + return 0; + } + + int get_overlap(ImageCtx *ictx, uint64_t *overlap) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock image_locker{ictx->image_lock}; + uint64_t raw_overlap; + r = ictx->get_parent_overlap(ictx->snap_id, &raw_overlap); + if (r < 0) { + return r; + } + auto _overlap = ictx->reduce_parent_overlap(raw_overlap, false); + *overlap = (_overlap.second == io::ImageArea::DATA ? _overlap.first : 0); + return 0; + } + + int get_flags(ImageCtx *ictx, uint64_t *flags) + { + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + std::shared_lock l2{ictx->image_lock}; + return ictx->get_flags(ictx->snap_id, flags); + } + + int set_image_notification(ImageCtx *ictx, int fd, int type) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << " " << ictx << " fd " << fd << " type" << type << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + if (ictx->event_socket.is_valid()) + return -EINVAL; + return ictx->event_socket.init(fd, type); + } + + int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl; + *is_owner = false; + + std::shared_lock owner_locker{ictx->owner_lock}; + if (ictx->exclusive_lock == nullptr) { + return 0; + } + + // might have been blocklisted by peer -- ensure we still own + // the lock by pinging the OSD + int r = ictx->exclusive_lock->assert_header_locked(); + if (r == -EBUSY || r == -ENOENT) { + return 0; + } else if (r < 0) { + return r; + } + + *is_owner = true; + return 0; + } + + int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", " + << "lock_mode=" << lock_mode << dendl; + + if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) { + return -EOPNOTSUPP; + } + + C_SaferCond lock_ctx; + { + std::unique_lock l{ictx->owner_lock}; + + if (ictx->exclusive_lock == nullptr) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + if (ictx->get_exclusive_lock_policy()->may_auto_request_lock()) { + ictx->set_exclusive_lock_policy( 
+ new exclusive_lock::StandardPolicy(ictx)); + } + + if (ictx->exclusive_lock->is_lock_owner()) { + return 0; + } + + ictx->exclusive_lock->acquire_lock(&lock_ctx); + } + + int r = lock_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to request exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + + std::shared_lock l{ictx->owner_lock}; + if (ictx->exclusive_lock == nullptr) { + return -EINVAL; + } else if (!ictx->exclusive_lock->is_lock_owner()) { + lderr(cct) << "failed to acquire exclusive lock" << dendl; + return ictx->exclusive_lock->get_unlocked_op_error(); + } + + return 0; + } + + int lock_release(ImageCtx *ictx) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl; + + C_SaferCond lock_ctx; + { + std::unique_lock l{ictx->owner_lock}; + + if (ictx->exclusive_lock == nullptr || + !ictx->exclusive_lock->is_lock_owner()) { + lderr(cct) << "not exclusive lock owner" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->release_lock(&lock_ctx); + } + + int r = lock_ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r) + << dendl; + return r; + } + return 0; + } + + int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode, + std::list *lock_owners) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << dendl; + + managed_lock::Locker locker; + C_SaferCond get_owner_ctx; + { + std::shared_lock owner_locker{ictx->owner_lock}; + + if (ictx->exclusive_lock == nullptr) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx); + } + + int r = get_owner_ctx.wait(); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed to determine current lock owner: " + << cpp_strerror(r) << dendl; + return r; + } + + *lock_mode = RBD_LOCK_MODE_EXCLUSIVE; + lock_owners->clear(); + lock_owners->emplace_back(locker.address); + return 0; + } + + int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode, + const std::string &lock_owner) { + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << ": ictx=" << ictx << ", " + << "lock_mode=" << lock_mode << ", " + << "lock_owner=" << lock_owner << dendl; + + if (lock_mode != RBD_LOCK_MODE_EXCLUSIVE) { + return -EOPNOTSUPP; + } + + if (ictx->read_only) { + return -EROFS; + } + + managed_lock::Locker locker; + C_SaferCond get_owner_ctx; + { + std::shared_lock l{ictx->owner_lock}; + + if (ictx->exclusive_lock == nullptr) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->get_locker(&locker, &get_owner_ctx); + } + int r = get_owner_ctx.wait(); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed to determine current lock owner: " + << cpp_strerror(r) << dendl; + return r; + } + + if (locker.address != lock_owner) { + return -EBUSY; + } + + C_SaferCond break_ctx; + { + std::shared_lock l{ictx->owner_lock}; + + if (ictx->exclusive_lock == nullptr) { + lderr(cct) << "exclusive-lock feature is not enabled" << dendl; + return -EINVAL; + } + + ictx->exclusive_lock->break_lock(locker, true, &break_ctx); + } + r = break_ctx.wait(); + if (r == -ENOENT) { + return r; + } else if (r < 0) { + lderr(cct) << "failed to break lock: " << cpp_strerror(r) << dendl; + return r; + } + return 0; + } + + int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, size_t 
sparse_size) + { + CephContext *cct = (CephContext *)dest_md_ctx.cct(); + uint64_t option; + if (opts.get(RBD_IMAGE_OPTION_FLATTEN, &option) == 0) { + lderr(cct) << "copy does not support 'flatten' image option" << dendl; + return -EINVAL; + } + if (opts.get(RBD_IMAGE_OPTION_CLONE_FORMAT, &option) == 0) { + lderr(cct) << "copy does not support 'clone_format' image option" + << dendl; + return -EINVAL; + } + + ldout(cct, 20) << "copy " << src->name + << (src->snap_name.length() ? "@" + src->snap_name : "") + << " -> " << destname << " opts = " << opts << dendl; + + src->image_lock.lock_shared(); + uint64_t features = src->features; + uint64_t src_size = src->get_image_size(src->snap_id); + src->image_lock.unlock_shared(); + uint64_t format = 2; + if (opts.get(RBD_IMAGE_OPTION_FORMAT, &format) != 0) { + opts.set(RBD_IMAGE_OPTION_FORMAT, format); + } + uint64_t stripe_unit = src->stripe_unit; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit); + } + uint64_t stripe_count = src->stripe_count; + if (opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count) != 0) { + opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count); + } + uint64_t order = src->order; + if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) { + opts.set(RBD_IMAGE_OPTION_ORDER, order); + } + if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) { + opts.set(RBD_IMAGE_OPTION_FEATURES, features); + } + if (features & ~RBD_FEATURES_ALL) { + lderr(cct) << "librbd does not support requested features" << dendl; + return -ENOSYS; + } + + int r = create(dest_md_ctx, destname, "", src_size, opts, "", "", false); + if (r < 0) { + lderr(cct) << "header creation failed" << dendl; + return r; + } + opts.set(RBD_IMAGE_OPTION_ORDER, static_cast(order)); + + ImageCtx *dest = new librbd::ImageCtx(destname, "", nullptr, dest_md_ctx, + false); + r = dest->state->open(0); + if (r < 0) { + lderr(cct) << "failed to read newly created header" << dendl; + return r; + } + + r = copy(src, dest, prog_ctx, sparse_size); + + int close_r = dest->state->close(); + if (r == 0 && close_r < 0) { + r = close_r; + } + return r; + } + + class C_CopyWrite : public Context { + public: + C_CopyWrite(bufferlist *bl, Context* ctx) + : m_bl(bl), m_ctx(ctx) {} + void finish(int r) override { + delete m_bl; + m_ctx->complete(r); + } + private: + bufferlist *m_bl; + Context *m_ctx; + }; + + class C_CopyRead : public Context { + public: + C_CopyRead(SimpleThrottle *throttle, ImageCtx *dest, uint64_t offset, + bufferlist *bl, size_t sparse_size) + : m_throttle(throttle), m_dest(dest), m_offset(offset), m_bl(bl), + m_sparse_size(sparse_size) { + m_throttle->start_op(); + } + void finish(int r) override { + if (r < 0) { + lderr(m_dest->cct) << "error reading from source image at offset " + << m_offset << ": " << cpp_strerror(r) << dendl; + delete m_bl; + m_throttle->end_op(r); + return; + } + ceph_assert(m_bl->length() == (size_t)r); + + if (m_bl->is_zero()) { + delete m_bl; + m_throttle->end_op(r); + return; + } + + if (!m_sparse_size) { + m_sparse_size = (1 << m_dest->order); + } + + auto *throttle = m_throttle; + auto *end_op_ctx = new LambdaContext([throttle](int r) { + throttle->end_op(r); + }); + auto gather_ctx = new C_Gather(m_dest->cct, end_op_ctx); + + m_bl->rebuild(buffer::ptr_node::create(m_bl->length())); + size_t write_offset = 0; + size_t write_length = 0; + size_t offset = 0; + size_t length = m_bl->length(); + const auto& m_ptr = m_bl->front(); + while (offset < length) { + if 
(util::calc_sparse_extent(m_ptr, + m_sparse_size, + length, + &write_offset, + &write_length, + &offset)) { + bufferlist *write_bl = new bufferlist(); + write_bl->push_back( + buffer::ptr_node::create(m_ptr, write_offset, write_length)); + Context *ctx = new C_CopyWrite(write_bl, gather_ctx->new_sub()); + auto comp = io::AioCompletion::create(ctx); + + // coordinate through AIO WQ to ensure lock is acquired if needed + api::Io<>::aio_write(*m_dest, comp, m_offset + write_offset, + write_length, std::move(*write_bl), + LIBRADOS_OP_FLAG_FADVISE_DONTNEED, + std::move(read_trace)); + write_offset = offset; + write_length = 0; + } + } + delete m_bl; + ceph_assert(gather_ctx->get_sub_created_count() > 0); + gather_ctx->activate(); + } + + ZTracer::Trace read_trace; + + private: + SimpleThrottle *m_throttle; + ImageCtx *m_dest; + uint64_t m_offset; + bufferlist *m_bl; + size_t m_sparse_size; + }; + + int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size) + { + src->image_lock.lock_shared(); + uint64_t src_size = src->get_image_size(src->snap_id); + src->image_lock.unlock_shared(); + + dest->image_lock.lock_shared(); + uint64_t dest_size = dest->get_image_size(dest->snap_id); + dest->image_lock.unlock_shared(); + + CephContext *cct = src->cct; + if (dest_size < src_size) { + lderr(cct) << " src size " << src_size << " > dest size " + << dest_size << dendl; + return -EINVAL; + } + + // ensure previous writes are visible to dest + C_SaferCond flush_ctx; + { + auto aio_comp = io::AioCompletion::create_and_start(&flush_ctx, src, + io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + *src, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, + aio_comp, io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); + } + int r = flush_ctx.wait(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + auto req = deep_copy::MetadataCopyRequest<>::create( + src, dest, &ctx); + req->send(); + + r = ctx.wait(); + if (r < 0) { + lderr(cct) << "failed to copy metadata: " << cpp_strerror(r) << dendl; + return r; + } + + ZTracer::Trace trace; + if (src->blkin_trace_all) { + trace.init("copy", &src->trace_endpoint); + } + + SimpleThrottle throttle(src->config.get_val("rbd_concurrent_management_ops"), false); + uint64_t period = src->get_stripe_period(); + unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | + LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + uint64_t object_id = 0; + for (uint64_t offset = 0; offset < src_size; offset += period) { + if (throttle.pending_error()) { + return throttle.wait_for_ret(); + } + + { + std::shared_lock image_locker{src->image_lock}; + if (src->object_map != nullptr) { + bool skip = true; + // each period is related to src->stripe_count objects, check them all + for (uint64_t i=0; i < src->stripe_count; i++) { + if (object_id < src->object_map->size() && + src->object_map->object_may_exist(object_id)) { + skip = false; + } + ++object_id; + } + + if (skip) continue; + } else { + object_id += src->stripe_count; + } + } + + uint64_t len = std::min(period, src_size - offset); + bufferlist *bl = new bufferlist(); + auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size); + auto comp = io::AioCompletion::create_and_start( + ctx, src, io::AIO_TYPE_READ); + auto req = io::ImageDispatchSpec::create_read( + *src, io::IMAGE_DISPATCH_LAYER_NONE, comp, + {{offset, len}}, io::ImageArea::DATA, io::ReadResult{bl}, + src->get_data_io_context(), fadvise_flags, 0, trace); + + ctx->read_trace = trace; + req->send(); + + prog_ctx.update_progress(offset, 
src_size); + } + + r = throttle.wait_for_ret(); + if (r >= 0) + prog_ctx.update_progress(src_size, src_size); + return r; + } + + int list_lockers(ImageCtx *ictx, + std::list *lockers, + bool *exclusive, + string *tag) + { + ldout(ictx->cct, 20) << "list_locks on image " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + std::shared_lock locker{ictx->image_lock}; + if (exclusive) + *exclusive = ictx->exclusive_locked; + if (tag) + *tag = ictx->lock_tag; + if (lockers) { + lockers->clear(); + map::const_iterator it; + for (it = ictx->lockers.begin(); it != ictx->lockers.end(); ++it) { + locker_t locker; + locker.client = stringify(it->first.locker); + locker.cookie = it->first.cookie; + locker.address = it->second.addr.get_legacy_str(); + lockers->push_back(locker); + } + } + + return 0; + } + + int lock(ImageCtx *ictx, bool exclusive, const string& cookie, + const string& tag) + { + ldout(ictx->cct, 20) << "lock image " << ictx << " exclusive=" << exclusive + << " cookie='" << cookie << "' tag='" << tag << "'" + << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + /** + * If we wanted we could do something more intelligent, like local + * checks that we think we will succeed. But for now, let's not + * duplicate that code. + */ + { + std::shared_lock locker{ictx->image_lock}; + r = rados::cls::lock::lock(&ictx->md_ctx, ictx->header_oid, RBD_LOCK_NAME, + exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED, + cookie, tag, "", utime_t(), 0); + if (r < 0) { + return r; + } + } + + ictx->notify_update(); + return 0; + } + + int unlock(ImageCtx *ictx, const string& cookie) + { + ldout(ictx->cct, 20) << "unlock image " << ictx + << " cookie='" << cookie << "'" << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + { + std::shared_lock locker{ictx->image_lock}; + r = rados::cls::lock::unlock(&ictx->md_ctx, ictx->header_oid, + RBD_LOCK_NAME, cookie); + if (r < 0) { + return r; + } + } + + ictx->notify_update(); + return 0; + } + + int break_lock(ImageCtx *ictx, const string& client, + const string& cookie) + { + ldout(ictx->cct, 20) << "break_lock image " << ictx << " client='" << client + << "' cookie='" << cookie << "'" << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + entity_name_t lock_client; + if (!lock_client.parse(client)) { + lderr(ictx->cct) << "Unable to parse client '" << client + << "'" << dendl; + return -EINVAL; + } + + if (ictx->config.get_val("rbd_blocklist_on_break_lock")) { + typedef std::map Lockers; + Lockers lockers; + ClsLockType lock_type; + std::string lock_tag; + r = rados::cls::lock::get_lock_info(&ictx->md_ctx, ictx->header_oid, + RBD_LOCK_NAME, &lockers, &lock_type, + &lock_tag); + if (r < 0) { + lderr(ictx->cct) << "unable to retrieve lock info: " << cpp_strerror(r) + << dendl; + return r; + } + + std::string client_address; + for (Lockers::iterator it = lockers.begin(); + it != lockers.end(); ++it) { + if (it->first.locker == lock_client) { + client_address = it->second.addr.get_legacy_str(); + break; + } + } + if (client_address.empty()) { + return -ENOENT; + } + + librados::Rados rados(ictx->md_ctx); + r = rados.blocklist_add( + client_address, + ictx->config.get_val("rbd_blocklist_expire_seconds")); + if (r < 0) { + lderr(ictx->cct) << "unable to blocklist client: " << cpp_strerror(r) + << dendl; + return r; + } + } + + r = rados::cls::lock::break_lock(&ictx->md_ctx, ictx->header_oid, + RBD_LOCK_NAME, cookie, 
lock_client); + if (r < 0) + return r; + ictx->notify_update(); + return 0; + } + + void rbd_ctx_cb(completion_t cb, void *arg) + { + Context *ctx = reinterpret_cast(arg); + auto comp = reinterpret_cast(cb); + ctx->complete(comp->get_return_value()); + comp->release(); + } + + int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg) + { + coarse_mono_time start_time; + ceph::timespan elapsed; + + ldout(ictx->cct, 20) << "read_iterate " << ictx << " off = " << off + << " len = " << len << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) + return r; + + uint64_t mylen = len; + ictx->image_lock.lock_shared(); + r = clip_io(ictx, off, &mylen, io::ImageArea::DATA); + ictx->image_lock.unlock_shared(); + if (r < 0) + return r; + + int64_t total_read = 0; + uint64_t period = ictx->get_stripe_period(); + uint64_t left = mylen; + + ZTracer::Trace trace; + if (ictx->blkin_trace_all) { + trace.init("read_iterate", &ictx->trace_endpoint); + } + + std::shared_lock owner_locker{ictx->owner_lock}; + start_time = coarse_mono_clock::now(); + while (left > 0) { + uint64_t period_off = off - (off % period); + uint64_t read_len = std::min(period_off + period - off, left); + + bufferlist bl; + + C_SaferCond ctx; + auto c = io::AioCompletion::create_and_start(&ctx, ictx, + io::AIO_TYPE_READ); + auto req = io::ImageDispatchSpec::create_read( + *ictx, io::IMAGE_DISPATCH_LAYER_NONE, c, + {{off, read_len}}, io::ImageArea::DATA, io::ReadResult{&bl}, + ictx->get_data_io_context(), 0, 0, trace); + req->send(); + + int ret = ctx.wait(); + if (ret < 0) { + return ret; + } + + r = cb(total_read, ret, bl.c_str(), arg); + if (r < 0) { + return r; + } + + total_read += ret; + left -= ret; + off += ret; + } + + elapsed = coarse_mono_clock::now() - start_time; + ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); + ictx->perfcounter->inc(l_librbd_rd); + ictx->perfcounter->inc(l_librbd_rd_bytes, mylen); + return total_read; + } + + // validate extent against area size; clip to area size if necessary + int clip_io(ImageCtx* ictx, uint64_t off, uint64_t* len, io::ImageArea area) { + ceph_assert(ceph_mutex_is_locked(ictx->image_lock)); + + if (ictx->snap_id != CEPH_NOSNAP && + ictx->get_snap_info(ictx->snap_id) == nullptr) { + return -ENOENT; + } + + // special-case "len == 0" requests: always valid + if (*len == 0) + return 0; + + uint64_t area_size = ictx->get_area_size(area); + + // can't start past end + if (off >= area_size) + return -EINVAL; + + // clip requests that extend past end to just end + if ((off + *len) > area_size) + *len = (size_t)(area_size - off); + + return 0; + } + + int invalidate_cache(ImageCtx *ictx) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "invalidate_cache " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + { + ictx->io_image_dispatcher->invalidate_cache(&ctx); + } + r = ctx.wait(); + + if (r < 0) { + ldout(cct, 20) << "failed to invalidate image cache" << dendl; + return r; + } + + ictx->perfcounter->inc(l_librbd_invalidate_cache); + + // Delete writeback cache if it is not initialized + if ((!ictx->exclusive_lock || + !ictx->exclusive_lock->is_lock_owner()) && + ictx->test_features(RBD_FEATURE_DIRTY_CACHE)) { + C_SaferCond ctx3; + ictx->plugin_registry->discard(&ctx3); + r = ctx3.wait(); + } + return r; + } + + int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp) + { + if (numcomp <= 0) + return 
-EINVAL; + CephContext *cct = ictx->cct; + ldout(cct, 20) << __func__ << " " << ictx << " numcomp = " << numcomp + << dendl; + int i = 0; + while (i < numcomp && ictx->event_socket_completions.pop(comps[i])) { + ++i; + } + + return i; + } + + int metadata_get(ImageCtx *ictx, const string &key, string *value) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "metadata_get " << ictx << " key=" << key << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + return cls_client::metadata_get(&ictx->md_ctx, ictx->header_oid, key, value); + } + + int metadata_list(ImageCtx *ictx, const string &start, uint64_t max, map *pairs) + { + CephContext *cct = ictx->cct; + ldout(cct, 20) << "metadata_list " << ictx << dendl; + + int r = ictx->state->refresh_if_required(); + if (r < 0) { + return r; + } + + C_SaferCond ctx; + auto req = image::GetMetadataRequest<>::create( + ictx->md_ctx, ictx->header_oid, false, "", start, max, pairs, &ctx); + req->send(); + + return ctx.wait(); + } + + int list_watchers(ImageCtx *ictx, + std::list &watchers) + { + int r; + std::string header_oid; + std::list obj_watchers; + + if (ictx->old_format) { + header_oid = util::old_header_name(ictx->name); + } else { + header_oid = util::header_name(ictx->id); + } + + r = ictx->md_ctx.list_watchers(header_oid, &obj_watchers); + if (r < 0) { + return r; + } + + watchers.clear(); + for (auto i = obj_watchers.begin(); i != obj_watchers.end(); ++i) { + librbd::image_watcher_t watcher; + watcher.addr = i->addr; + watcher.id = i->watcher_id; + watcher.cookie = i->cookie; + + watchers.push_back(watcher); + } + + return 0; + } + +} + +std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts) { + os << "["; + + const char *delimiter = ""; + for (auto &i : librbd::IMAGE_OPTIONS_TYPE_MAPPING) { + if (i.second == librbd::STR) { + std::string val; + if (opts.get(i.first, &val) == 0) { + os << delimiter << librbd::image_option_name(i.first) << "=" << val; + delimiter = ", "; + } + } else if (i.second == librbd::UINT64) { + uint64_t val; + if (opts.get(i.first, &val) == 0) { + os << delimiter << librbd::image_option_name(i.first) << "=" << val; + delimiter = ", "; + } + } + } + + os << "]"; + + return os; +} diff --git a/src/librbd/internal.h b/src/librbd/internal.h new file mode 100644 index 000000000..65e9a9d18 --- /dev/null +++ b/src/librbd/internal.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_INTERNAL_H +#define CEPH_LIBRBD_INTERNAL_H + +#include "include/int_types.h" + +#include +#include +#include +#include + +#include "include/buffer_fwd.h" +#include "include/rbd/librbd.hpp" +#include "include/rbd_types.h" +#include "cls/rbd/cls_rbd_types.h" +#include "common/ceph_time.h" +#include "librbd/Types.h" + +namespace librbd { + + struct ImageCtx; + namespace io { + struct AioCompletion; + enum class ImageArea; + } + + class NoOpProgressContext : public ProgressContext + { + public: + NoOpProgressContext() + { + } + int update_progress(uint64_t offset, uint64_t src_size) override + { + return 0; + } + }; + + int detect_format(librados::IoCtx &io_ctx, const std::string &name, + bool *old_format, uint64_t *size); + + bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap); + + std::string image_option_name(int optname); + void image_options_create(rbd_image_options_t* opts); + void image_options_create_ref(rbd_image_options_t* opts, + rbd_image_options_t orig); + void 
image_options_copy(rbd_image_options_t *opts, + const ImageOptions &orig); + void image_options_destroy(rbd_image_options_t opts); + int image_options_set(rbd_image_options_t opts, int optname, + const std::string& optval); + int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval); + int image_options_get(rbd_image_options_t opts, int optname, + std::string* optval); + int image_options_get(rbd_image_options_t opts, int optname, + uint64_t* optval); + int image_options_is_set(rbd_image_options_t opts, int optname, + bool* is_set); + int image_options_unset(rbd_image_options_t opts, int optname); + void image_options_clear(rbd_image_options_t opts); + bool image_options_is_empty(rbd_image_options_t opts); + + int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size, + int *order); + int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size, + bool old_format, uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); + int create(IoCtx& io_ctx, const std::string &image_name, + const std::string &image_id, uint64_t size, ImageOptions& opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid, + bool skip_mirror_enable); + int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, + uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count); + int clone(IoCtx& p_ioctx, const char *p_id, const char *p_name, + const char *p_snap_name, IoCtx& c_ioctx, const char *c_id, + const char *c_name, ImageOptions& c_opts, + const std::string &non_primary_global_image_id, + const std::string &primary_mirror_uuid); + int rename(librados::IoCtx& io_ctx, const char *srcname, const char *dstname); + int info(ImageCtx *ictx, image_info_t& info, size_t image_size); + int get_old_format(ImageCtx *ictx, uint8_t *old); + int get_size(ImageCtx *ictx, uint64_t *size); + int get_features(ImageCtx *ictx, uint64_t *features); + int get_overlap(ImageCtx *ictx, uint64_t *overlap); + int get_flags(ImageCtx *ictx, uint64_t *flags); + int set_image_notification(ImageCtx *ictx, int fd, int type); + int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner); + int lock_acquire(ImageCtx *ictx, rbd_lock_mode_t lock_mode); + int lock_release(ImageCtx *ictx); + int lock_get_owners(ImageCtx *ictx, rbd_lock_mode_t *lock_mode, + std::list *lock_owners); + int lock_break(ImageCtx *ictx, rbd_lock_mode_t lock_mode, + const std::string &lock_owner); + + int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, size_t sparse_size); + int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx, size_t sparse_size); + + /* cooperative locking */ + int list_lockers(ImageCtx *ictx, + std::list *locks, + bool *exclusive, + std::string *tag); + int lock(ImageCtx *ictx, bool exclusive, const std::string& cookie, + const std::string& tag); + int lock_shared(ImageCtx *ictx, const std::string& cookie, + const std::string& tag); + int unlock(ImageCtx *ictx, const std::string& cookie); + int break_lock(ImageCtx *ictx, const std::string& client, + const std::string& cookie); + + int read_header_bl(librados::IoCtx& io_ctx, const std::string& md_oid, + ceph::bufferlist& header, uint64_t *ver); + int read_header(librados::IoCtx& io_ctx, const std::string& md_oid, + struct rbd_obj_header_ondisk *header, uint64_t *ver); + int tmap_set(librados::IoCtx& io_ctx, const std::string& imgname); + int tmap_rm(librados::IoCtx& 
io_ctx, const std::string& imgname); + void image_info(const ImageCtx *ictx, image_info_t& info, size_t info_size); + uint64_t oid_to_object_no(const std::string& oid, + const std::string& object_prefix); + int clip_io(ImageCtx* ictx, uint64_t off, uint64_t* len, io::ImageArea area); + void init_rbd_header(struct rbd_obj_header_ondisk& ondisk, + uint64_t size, int order, uint64_t bid); + + int64_t read_iterate(ImageCtx *ictx, uint64_t off, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); + + int invalidate_cache(ImageCtx *ictx); + int poll_io_events(ImageCtx *ictx, io::AioCompletion **comps, int numcomp); + int metadata_list(ImageCtx *ictx, const std::string &last, uint64_t max, + std::map *pairs); + int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value); + + int list_watchers(ImageCtx *ictx, std::list &watchers); +} + +std::ostream &operator<<(std::ostream &os, const librbd::ImageOptions &opts); + +#endif diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc new file mode 100644 index 000000000..c04b80770 --- /dev/null +++ b/src/librbd/io/AioCompletion.cc @@ -0,0 +1,294 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/AioCompletion.h" +#include + +#include "common/ceph_context.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/perf_counters.h" + +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/Types.h" +#include +#include + +#ifdef WITH_LTTNG +#include "tracing/librbd.h" +#else +#define tracepoint(...) +#endif + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +int AioCompletion::wait_for_complete() { + tracepoint(librbd, aio_wait_for_complete_enter, this); + { + std::unique_lock locker(lock); + while (state != AIO_STATE_COMPLETE) { + cond.wait(locker); + } + } + tracepoint(librbd, aio_wait_for_complete_exit, 0); + return 0; +} + +void AioCompletion::finalize() { + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + // finalize any pending error results since we won't be + // atomically incrementing rval anymore + int err_r = error_rval; + if (err_r < 0) { + rval = err_r; + } + + ssize_t r = rval; + ldout(cct, 20) << "r=" << r << dendl; + if (r >= 0 && aio_type == AIO_TYPE_READ) { + read_result.assemble_result(cct); + } +} + +void AioCompletion::complete() { + ceph_assert(ictx != nullptr); + + ssize_t r = rval; + if ((aio_type == AIO_TYPE_CLOSE) || (aio_type == AIO_TYPE_OPEN && r < 0)) { + ictx = nullptr; + external_callback = false; + } else { + CephContext *cct = ictx->cct; + + tracepoint(librbd, aio_complete_enter, this, r); + if (ictx->perfcounter != nullptr) { + ceph::timespan elapsed = coarse_mono_clock::now() - start_time; + switch (aio_type) { + case AIO_TYPE_GENERIC: + case AIO_TYPE_OPEN: + break; + case AIO_TYPE_READ: + ictx->perfcounter->tinc(l_librbd_rd_latency, elapsed); break; + case AIO_TYPE_WRITE: + ictx->perfcounter->tinc(l_librbd_wr_latency, elapsed); break; + case AIO_TYPE_DISCARD: + ictx->perfcounter->tinc(l_librbd_discard_latency, elapsed); break; + case AIO_TYPE_FLUSH: + ictx->perfcounter->tinc(l_librbd_flush_latency, elapsed); break; + case AIO_TYPE_WRITESAME: + ictx->perfcounter->tinc(l_librbd_ws_latency, elapsed); break; + case 
AIO_TYPE_COMPARE_AND_WRITE: + ictx->perfcounter->tinc(l_librbd_cmp_latency, elapsed); break; + default: + lderr(cct) << "completed invalid aio_type: " << aio_type << dendl; + break; + } + } + } + + state = AIO_STATE_CALLBACK; + if (complete_cb) { + if (external_callback) { + complete_external_callback(); + } else { + complete_cb(rbd_comp, complete_arg); + complete_event_socket(); + notify_callbacks_complete(); + } + } else { + complete_event_socket(); + notify_callbacks_complete(); + } + + tracepoint(librbd, aio_complete_exit); +} + +void AioCompletion::init_time(ImageCtx *i, aio_type_t t) { + if (ictx == nullptr) { + ictx = i; + aio_type = t; + start_time = coarse_mono_clock::now(); + } +} + +void AioCompletion::start_op() { + ceph_assert(ictx != nullptr); + + if (aio_type == AIO_TYPE_OPEN || aio_type == AIO_TYPE_CLOSE) { + // no need to track async open/close operations + return; + } + + ceph_assert(!async_op.started()); + async_op.start_op(*ictx); +} + +void AioCompletion::queue_complete() { + uint32_t zero = 0; + pending_count.compare_exchange_strong(zero, 1); + ceph_assert(zero == 0); + + add_request(); + + // ensure completion fires in clean lock context + boost::asio::post(ictx->asio_engine->get_api_strand(), [this]() { + complete_request(0); + }); +} + +void AioCompletion::block(CephContext* cct) { + ldout(cct, 20) << dendl; + ceph_assert(!was_armed); + + get(); + ++pending_count; +} + +void AioCompletion::unblock(CephContext* cct) { + ldout(cct, 20) << dendl; + ceph_assert(was_armed); + + uint32_t previous_pending_count = pending_count--; + ceph_assert(previous_pending_count > 0); + + if (previous_pending_count == 1) { + queue_complete(); + } + put(); +} + +void AioCompletion::fail(int r) +{ + ceph_assert(ictx != nullptr); + ceph_assert(r < 0); + + bool queue_required = true; + if (aio_type == AIO_TYPE_CLOSE || aio_type == AIO_TYPE_OPEN) { + // executing from a safe context and the ImageCtx has been destructed + queue_required = false; + } else { + CephContext *cct = ictx->cct; + lderr(cct) << cpp_strerror(r) << dendl; + } + + ceph_assert(!was_armed); + was_armed = true; + + rval = r; + + uint32_t previous_pending_count = pending_count.load(); + if (previous_pending_count == 0) { + if (queue_required) { + queue_complete(); + } else { + complete(); + } + } +} + +void AioCompletion::set_request_count(uint32_t count) { + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + ceph_assert(!was_armed); + was_armed = true; + + ldout(cct, 20) << "pending=" << count << dendl; + uint32_t previous_pending_count = pending_count.fetch_add(count); + if (previous_pending_count == 0 && count == 0) { + queue_complete(); + } +} + +void AioCompletion::complete_request(ssize_t r) +{ + ceph_assert(ictx != nullptr); + CephContext *cct = ictx->cct; + + if (r > 0) { + rval += r; + } else if (r < 0 && r != -EEXIST) { + // might race w/ another thread setting an error code but + // first one wins + int zero = 0; + error_rval.compare_exchange_strong(zero, r); + } + + uint32_t previous_pending_count = pending_count--; + ceph_assert(previous_pending_count > 0); + auto pending_count = previous_pending_count - 1; + + ldout(cct, 20) << "cb=" << complete_cb << ", " + << "pending=" << pending_count << dendl; + if (pending_count == 0) { + finalize(); + complete(); + } + put(); +} + +bool AioCompletion::is_complete() { + tracepoint(librbd, aio_is_complete_enter, this); + bool done = (this->state != AIO_STATE_PENDING); + tracepoint(librbd, aio_is_complete_exit, done); + return done; +} + +ssize_t 
AioCompletion::get_return_value() { + tracepoint(librbd, aio_get_return_value_enter, this); + ssize_t r = rval; + tracepoint(librbd, aio_get_return_value_exit, r); + return r; +} + +void AioCompletion::complete_external_callback() { + get(); + + // ensure librbd external users never experience concurrent callbacks + // from multiple librbd-internal threads. + boost::asio::dispatch(ictx->asio_engine->get_api_strand(), [this]() { + complete_cb(rbd_comp, complete_arg); + complete_event_socket(); + notify_callbacks_complete(); + put(); + }); +} + +void AioCompletion::complete_event_socket() { + if (ictx != nullptr && event_notify && ictx->event_socket.is_valid()) { + ictx->event_socket_completions.push(this); + ictx->event_socket.notify(); + } +} + +void AioCompletion::notify_callbacks_complete() { + state = AIO_STATE_COMPLETE; + + { + std::unique_lock locker(lock); + cond.notify_all(); + } + + if (image_dispatcher_ctx != nullptr) { + image_dispatcher_ctx->complete(rval); + } + + // note: possible for image to be closed after op marked finished + if (async_op.started()) { + async_op.finish_op(); + } +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/AioCompletion.h b/src/librbd/io/AioCompletion.h new file mode 100644 index 000000000..4ae93fe36 --- /dev/null +++ b/src/librbd/io/AioCompletion.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_AIO_COMPLETION_H +#define CEPH_LIBRBD_IO_AIO_COMPLETION_H + +#include "common/ceph_time.h" +#include "include/common_fwd.h" +#include "include/Context.h" +#include "include/utime.h" +#include "include/rbd/librbd.hpp" + +#include "librbd/ImageCtx.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +#include +#include +#include + +struct Context; + +namespace librbd { +namespace io { + +/** + * AioCompletion is the overall completion for a single + * rbd I/O request. It may be composed of many AioObjectRequests, + * which each go to a single object. + * + * The retrying of individual requests is handled at a lower level, + * so all AioCompletion cares about is the count of outstanding + * requests. The number of expected individual requests should be + * set initially using set_request_count() prior to issuing the + * requests. This ensures that the completion will not be completed + * within the caller's thread of execution (instead via a librados + * context or via a thread pool context for cache read hits). 
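+ *
+ * A rough usage sketch (the local names here are illustrative only):
+ *
+ *   C_SaferCond on_finish;
+ *   auto comp = AioCompletion::create_and_start(&on_finish, ictx,
+ *                                               AIO_TYPE_WRITE);
+ *   comp->set_request_count(2);   // two object requests will report in
+ *   // ... issue the object requests; each one ends by calling
+ *   // comp->complete_request(r) ...
+ *   // after both report, finalize()/complete() fire the user callback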
+ */ +struct AioCompletion { + typedef enum { + AIO_STATE_PENDING = 0, + AIO_STATE_CALLBACK, + AIO_STATE_COMPLETE, + } aio_state_t; + + mutable std::mutex lock; + std::condition_variable cond; + + callback_t complete_cb = nullptr; + void *complete_arg = nullptr; + rbd_completion_t rbd_comp = nullptr; + + /// note: only using atomic for built-in memory barrier + std::atomic<aio_state_t> state{AIO_STATE_PENDING}; + + std::atomic<ssize_t> rval{0}; + std::atomic<int> error_rval{0}; + std::atomic<uint32_t> ref{1}; + std::atomic<uint32_t> pending_count{0}; ///< number of requests/blocks + std::atomic<bool> released{false}; + + ImageCtx *ictx = nullptr; + coarse_mono_time start_time; + aio_type_t aio_type = AIO_TYPE_NONE; + + ReadResult read_result; + + AsyncOperation async_op; + + bool event_notify = false; + bool was_armed = false; + bool external_callback = false; + + Context* image_dispatcher_ctx = nullptr; + + template <typename T, void (T::*MF)(int)> + static void callback_adapter(completion_t cb, void *arg) { + AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb); + T *t = reinterpret_cast<T *>(arg); + (t->*MF)(comp->get_return_value()); + comp->release(); + } + + static AioCompletion *create(void *cb_arg, callback_t cb_complete, + rbd_completion_t rbd_comp) { + AioCompletion *comp = new AioCompletion(); + comp->set_complete_cb(cb_arg, cb_complete); + comp->rbd_comp = (rbd_comp != nullptr ? rbd_comp : comp); + return comp; + } + + template <typename T, void (T::*MF)(int)=&T::complete> + static AioCompletion *create(T *obj) { + AioCompletion *comp = new AioCompletion(); + comp->set_complete_cb(obj, &callback_adapter<T, MF>); + comp->rbd_comp = comp; + return comp; + } + + template <typename T, void (T::*MF)(int)=&T::complete> + static AioCompletion *create_and_start(T *obj, ImageCtx *image_ctx, + aio_type_t type) { + AioCompletion *comp = create<T, MF>(obj); + comp->init_time(image_ctx, type); + comp->start_op(); + return comp; + } + + AioCompletion() { + } + + ~AioCompletion() { + } + + int wait_for_complete(); + + void finalize(); + + inline bool is_initialized(aio_type_t type) const { + std::unique_lock locker(lock); + return ((ictx != nullptr) && (aio_type == type)); + } + inline bool is_started() const { + std::unique_lock locker(lock); + return async_op.started(); + } + + void block(CephContext* cct); + void unblock(CephContext* cct); + + void init_time(ImageCtx *i, aio_type_t t); + void start_op(); + void fail(int r); + + void complete(); + + void set_complete_cb(void *cb_arg, callback_t cb) { + complete_cb = cb; + complete_arg = cb_arg; + } + + void set_request_count(uint32_t num); + void add_request() { + ceph_assert(pending_count > 0); + get(); + } + void complete_request(ssize_t r); + + bool is_complete(); + + ssize_t get_return_value(); + + void get() { + ceph_assert(ref > 0); + ++ref; + } + void release() { + bool previous_released = released.exchange(true); + ceph_assert(!previous_released); + put(); + } + void put() { + uint32_t previous_ref = ref--; + ceph_assert(previous_ref > 0); + + if (previous_ref == 1) { + delete this; + } + } + + void set_event_notify(bool s) { + event_notify = s; + } + + void *get_arg() { + return complete_arg; + } + +private: + void queue_complete(); + void complete_external_callback(); + void complete_event_socket(); + void notify_callbacks_complete(); +}; + +class C_AioRequest : public Context { +public: + C_AioRequest(AioCompletion *completion) : m_completion(completion) { + m_completion->add_request(); + } + ~C_AioRequest() override {} + void finish(int r) override { + m_completion->complete_request(r); + } +protected: + AioCompletion *m_completion; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_AIO_COMPLETION_H diff
--git a/src/librbd/io/AsyncOperation.cc b/src/librbd/io/AsyncOperation.cc new file mode 100644 index 000000000..18db2410e --- /dev/null +++ b/src/librbd/io/AsyncOperation.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/AsyncOperation.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::AsyncOperation: " + +namespace librbd { +namespace io { + +namespace { + +struct C_CompleteFlushes : public Context { + ImageCtx *image_ctx; + std::list flush_contexts; + + explicit C_CompleteFlushes(ImageCtx *image_ctx, + std::list &&flush_contexts) + : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) { + } + void finish(int r) override { + std::shared_lock owner_locker{image_ctx->owner_lock}; + while (!flush_contexts.empty()) { + Context *flush_ctx = flush_contexts.front(); + flush_contexts.pop_front(); + + ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl; + flush_ctx->complete(0); + } + } +}; + +} // anonymous namespace + +void AsyncOperation::start_op(ImageCtx &image_ctx) { + ceph_assert(m_image_ctx == NULL); + m_image_ctx = &image_ctx; + + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; + std::lock_guard l{m_image_ctx->async_ops_lock}; + m_image_ctx->async_ops.push_front(&m_xlist_item); +} + +void AsyncOperation::finish_op() { + ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; + + { + std::lock_guard l{m_image_ctx->async_ops_lock}; + xlist::iterator iter(&m_xlist_item); + ++iter; + ceph_assert(m_xlist_item.remove_myself()); + + // linked list stored newest -> oldest ops + if (!iter.end() && !m_flush_contexts.empty()) { + ldout(m_image_ctx->cct, 20) << "moving flush contexts to previous op: " + << *iter << dendl; + (*iter)->m_flush_contexts.insert((*iter)->m_flush_contexts.end(), + m_flush_contexts.begin(), + m_flush_contexts.end()); + return; + } + } + + if (!m_flush_contexts.empty()) { + C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx, + std::move(m_flush_contexts)); + m_image_ctx->asio_engine->post(ctx, 0); + } +} + +void AsyncOperation::flush(Context* on_finish) { + { + std::lock_guard locker{m_image_ctx->async_ops_lock}; + xlist::iterator iter(&m_xlist_item); + ++iter; + + // linked list stored newest -> oldest ops + if (!iter.end()) { + (*iter)->m_flush_contexts.push_back(on_finish); + return; + } + } + + m_image_ctx->asio_engine->post(on_finish, 0); +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/AsyncOperation.h b/src/librbd/io/AsyncOperation.h new file mode 100644 index 000000000..b0a37c4b8 --- /dev/null +++ b/src/librbd/io/AsyncOperation.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef LIBRBD_IO_ASYNC_OPERATION_H +#define LIBRBD_IO_ASYNC_OPERATION_H + +#include "include/ceph_assert.h" +#include "include/xlist.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace io { + +class AsyncOperation { +public: + + AsyncOperation() + : m_image_ctx(NULL), m_xlist_item(this) + { + } + + ~AsyncOperation() + { + ceph_assert(!m_xlist_item.is_on_list()); + } + + inline bool started() const { + return m_xlist_item.is_on_list(); + } + + void start_op(ImageCtx &image_ctx); + void finish_op(); + + void flush(Context *on_finish); + 
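+  // Rough usage sketch (illustrative; LambdaContext is the callback
+  // wrapper already used elsewhere in librbd):
+  //
+  //   io::AsyncOperation op;
+  //   op.start_op(*image_ctx);    // links op into ImageCtx::async_ops
+  //   op.flush(new LambdaContext([](int r) {
+  //     // runs once every op older than 'op' has finished
+  //   }));
+  //   op.finish_op();             // unlinks op; migrates pending flushes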
+private: + + ImageCtx *m_image_ctx; + xlist::item m_xlist_item; + std::list m_flush_contexts; + +}; + +} // namespace io +} // namespace librbd + +#endif // LIBRBD_IO_ASYNC_OPERATION_H diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc new file mode 100644 index 000000000..228f95980 --- /dev/null +++ b/src/librbd/io/CopyupRequest.cc @@ -0,0 +1,773 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/CopyupRequest.h" +#include "include/neorados/RADOS.hpp" +#include "common/ceph_context.h" +#include "common/ceph_mutex.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/asio/Utils.h" +#include "librbd/deep_copy/ObjectCopyRequest.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Utils.h" + +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \ + << " " << __func__ << ": " \ + << data_object_name(m_image_ctx, m_object_no) << " " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; + +namespace { + +template +class C_UpdateObjectMap : public C_AsyncObjectThrottle { +public: + C_UpdateObjectMap(AsyncObjectThrottle &throttle, I *image_ctx, + uint64_t object_no, uint8_t head_object_map_state, + const std::vector *snap_ids, + bool first_snap_is_clean, const ZTracer::Trace &trace, + size_t snap_id_idx) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no), + m_head_object_map_state(head_object_map_state), m_snap_ids(*snap_ids), + m_first_snap_is_clean(first_snap_is_clean), m_trace(trace), + m_snap_id_idx(snap_id_idx) + { + } + + int send() override { + auto& image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + if (image_ctx.exclusive_lock == nullptr) { + return 1; + } + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map == nullptr) { + return 1; + } + + uint64_t snap_id = m_snap_ids[m_snap_id_idx]; + if (snap_id == CEPH_NOSNAP) { + return update_head(); + } else { + return update_snapshot(snap_id); + } + } + + int update_head() { + auto& image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + bool sent = image_ctx.object_map->template aio_update( + CEPH_NOSNAP, m_object_no, m_head_object_map_state, {}, m_trace, false, + this); + return (sent ? 0 : 1); + } + + int update_snapshot(uint64_t snap_id) { + auto& image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + uint8_t state = OBJECT_EXISTS; + if (image_ctx.test_features(RBD_FEATURE_FAST_DIFF, image_ctx.image_lock) && + (m_snap_id_idx > 0 || m_first_snap_is_clean)) { + // first snapshot should be exists+dirty since it contains + // the copyup data -- later snapshots inherit the data. 
+ state = OBJECT_EXISTS_CLEAN; + } + + bool sent = image_ctx.object_map->template aio_update( + snap_id, m_object_no, state, {}, m_trace, true, this); + ceph_assert(sent); + return 0; + } + +private: + uint64_t m_object_no; + uint8_t m_head_object_map_state; + const std::vector &m_snap_ids; + bool m_first_snap_is_clean; + const ZTracer::Trace &m_trace; + size_t m_snap_id_idx; +}; + +} // anonymous namespace + +template +CopyupRequest::CopyupRequest(I *ictx, uint64_t objectno, + Extents &&image_extents, ImageArea area, + const ZTracer::Trace &parent_trace) + : m_image_ctx(ictx), m_object_no(objectno), + m_image_extents(std::move(image_extents)), m_image_area(area), + m_trace(librbd::util::create_trace(*m_image_ctx, "copy-up", parent_trace)) +{ + ceph_assert(m_image_ctx->data_ctx.is_valid()); + m_async_op.start_op(*librbd::util::get_image_ctx(m_image_ctx)); +} + +template +CopyupRequest::~CopyupRequest() { + ceph_assert(m_pending_requests.empty()); + m_async_op.finish_op(); +} + +template +void CopyupRequest::append_request(AbstractObjectWriteRequest *req, + const Extents& object_extents) { + std::lock_guard locker{m_lock}; + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "object_request=" << req << ", " + << "append=" << m_append_request_permitted << dendl; + if (m_append_request_permitted) { + m_pending_requests.push_back(req); + + for (auto [offset, length] : object_extents) { + if (length > 0) { + m_write_object_extents.union_insert(offset, length); + } + } + } else { + m_restart_requests.push_back(req); + } +} + +template +void CopyupRequest::send() { + read_from_parent(); +} + +template +void CopyupRequest::read_from_parent() { + auto cct = m_image_ctx->cct; + std::shared_lock image_locker{m_image_ctx->image_lock}; + + if (m_image_ctx->parent == nullptr) { + ldout(cct, 5) << "parent detached" << dendl; + + m_image_ctx->asio_engine->post( + [this]() { handle_read_from_parent(-ENOENT); }); + return; + } else if (is_deep_copy()) { + deep_copy(); + return; + } + + auto comp = AioCompletion::create_and_start< + CopyupRequest, + &CopyupRequest::handle_read_from_parent>( + this, librbd::util::get_image_ctx(m_image_ctx->parent), AIO_TYPE_READ); + + ldout(cct, 20) << "completion=" << comp + << " image_extents=" << m_image_extents + << " area=" << m_image_area << dendl; + auto req = io::ImageDispatchSpec::create_read( + *m_image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp, + std::move(m_image_extents), m_image_area, + ReadResult{&m_copyup_extent_map, &m_copyup_data}, + m_image_ctx->parent->get_data_io_context(), 0, 0, m_trace); + req->send(); +} + +template +void CopyupRequest::handle_read_from_parent(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + m_lock.lock(); + disable_append_requests(); + m_lock.unlock(); + + lderr(cct) << "error reading from parent: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + convert_copyup_extent_map(); + + m_image_ctx->image_lock.lock_shared(); + m_lock.lock(); + disable_append_requests(); + + r = prepare_copyup_data(); + if (r < 0) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + lderr(m_image_ctx->cct) << "failed to prepare copyup data: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_copyup_is_zero = m_copyup_data.is_zero(); + m_copyup_required = is_copyup_required(); + if (!m_copyup_required) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + ldout(cct, 20) << "no-op, skipping" << dendl; + finish(0); + return; + 
} + + // copyup() will affect snapshots only if parent data is not all + // zeros. + if (!m_copyup_is_zero) { + m_snap_ids.insert(m_snap_ids.end(), m_image_ctx->snaps.rbegin(), + m_image_ctx->snaps.rend()); + } + + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + update_object_maps(); +} + +template +void CopyupRequest::deep_copy() { + auto cct = m_image_ctx->cct; + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + ceph_assert(m_image_ctx->parent != nullptr); + + m_lock.lock(); + m_deep_copied = true; + m_flatten = is_copyup_required() ? true : m_image_ctx->migration_info.flatten; + m_lock.unlock(); + + ldout(cct, 20) << "flatten=" << m_flatten << dendl; + + uint32_t flags = deep_copy::OBJECT_COPY_REQUEST_FLAG_MIGRATION; + if (m_flatten) { + flags |= deep_copy::OBJECT_COPY_REQUEST_FLAG_FLATTEN; + } + + auto ctx = librbd::util::create_context_callback< + CopyupRequest, &CopyupRequest::handle_deep_copy>(this); + auto req = deep_copy::ObjectCopyRequest::create( + m_image_ctx->parent, m_image_ctx, 0, 0, + m_image_ctx->migration_info.snap_map, m_object_no, flags, nullptr, ctx); + + req->send(); +} + +template +void CopyupRequest::handle_deep_copy(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_image_ctx->image_lock.lock_shared(); + m_lock.lock(); + m_copyup_required = is_copyup_required(); + if (r == -ENOENT && !m_flatten && m_copyup_required) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + ldout(cct, 10) << "restart deep-copy with flatten" << dendl; + send(); + return; + } + + disable_append_requests(); + + if (r < 0 && r != -ENOENT) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + lderr(cct) << "error encountered during deep-copy: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (!m_copyup_required && !is_update_object_map_required(r)) { + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + if (r == -ENOENT) { + r = 0; + } + + ldout(cct, 20) << "skipping" << dendl; + finish(r); + return; + } + + // For deep-copy, copyup() will never affect snapshots. However, + // this state machine is responsible for updating object maps for + // snapshots that have been created on destination image after + // migration started. 
+ if (r != -ENOENT) { + compute_deep_copy_snap_ids(); + } + + m_lock.unlock(); + m_image_ctx->image_lock.unlock_shared(); + + update_object_maps(); +} + +template +void CopyupRequest::update_object_maps() { + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + std::shared_lock image_locker{m_image_ctx->image_lock}; + if (m_image_ctx->object_map == nullptr) { + image_locker.unlock(); + owner_locker.unlock(); + + copyup(); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + bool copy_on_read = m_pending_requests.empty(); + uint8_t head_object_map_state = OBJECT_EXISTS; + if (copy_on_read && !m_snap_ids.empty() && + m_image_ctx->test_features(RBD_FEATURE_FAST_DIFF, + m_image_ctx->image_lock)) { + // HEAD is non-dirty since data is tied to first snapshot + head_object_map_state = OBJECT_EXISTS_CLEAN; + } + + auto r_it = m_pending_requests.rbegin(); + if (r_it != m_pending_requests.rend()) { + // last write-op determines the final object map state + head_object_map_state = (*r_it)->get_pre_write_object_map_state(); + } + + if ((*m_image_ctx->object_map)[m_object_no] != head_object_map_state) { + // (maybe) need to update the HEAD object map state + m_snap_ids.push_back(CEPH_NOSNAP); + } + image_locker.unlock(); + + ceph_assert(m_image_ctx->exclusive_lock->is_lock_owner()); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr>(), + boost::lambda::_1, m_image_ctx, m_object_no, head_object_map_state, + &m_snap_ids, m_first_snap_is_clean, m_trace, boost::lambda::_2)); + auto ctx = librbd::util::create_context_callback< + CopyupRequest, &CopyupRequest::handle_update_object_maps>(this); + auto throttle = new AsyncObjectThrottle( + nullptr, *m_image_ctx, context_factory, ctx, nullptr, 0, m_snap_ids.size()); + throttle->start_ops( + m_image_ctx->config.template get_val("rbd_concurrent_management_ops")); +} + +template +void CopyupRequest::handle_update_object_maps(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to update object map: " + << cpp_strerror(r) << dendl; + + finish(r); + return; + } + + copyup(); +} + +template +void CopyupRequest::copyup() { + auto cct = m_image_ctx->cct; + m_image_ctx->image_lock.lock_shared(); + auto snapc = m_image_ctx->snapc; + auto io_context = m_image_ctx->get_data_io_context(); + m_image_ctx->image_lock.unlock_shared(); + + m_lock.lock(); + if (!m_copyup_required) { + m_lock.unlock(); + + ldout(cct, 20) << "skipping copyup" << dendl; + finish(0); + return; + } + + ldout(cct, 20) << dendl; + + bool copy_on_read = m_pending_requests.empty() && !m_deep_copied; + bool deep_copyup = !snapc.snaps.empty() && !m_copyup_is_zero; + if (m_copyup_is_zero) { + m_copyup_data.clear(); + m_copyup_extent_map.clear(); + } + + neorados::WriteOp copyup_op; + neorados::WriteOp write_op; + neorados::WriteOp* op; + if (copy_on_read || deep_copyup) { + // copyup-op will use its own request issued to the initial object revision + op = ©up_op; + ++m_pending_copyups; + } else { + // copyup-op can be combined with the write-ops (if any) + op = &write_op; + } + + if (m_image_ctx->enable_sparse_copyup) { + cls_client::sparse_copyup(op, m_copyup_extent_map, m_copyup_data); + } else { + // convert the sparse read back into a standard (thick) read + Striper::StripedReadResult destriper; + destriper.add_partial_sparse_result( + cct, std::move(m_copyup_data), m_copyup_extent_map, 0, + {{0, m_image_ctx->layout.object_size}}); + + 
bufferlist thick_bl; + destriper.assemble_result(cct, thick_bl, false); + cls_client::copyup(op, thick_bl); + } + ObjectRequest::add_write_hint(*m_image_ctx, op); + + if (!copy_on_read) { + // merge all pending write ops into this single RADOS op + for (auto req : m_pending_requests) { + ldout(cct, 20) << "add_copyup_ops " << req << dendl; + req->add_copyup_ops(&write_op); + } + + if (write_op.size() > 0) { + ++m_pending_copyups; + } + } + m_lock.unlock(); + + // issue librados ops at the end to simplify test cases + auto object = neorados::Object{data_object_name(m_image_ctx, m_object_no)}; + if (copyup_op.size() > 0) { + // send only the copyup request with a blank snapshot context so that + // all snapshots are detected from the parent for this object. If + // this is a CoW request, a second request will be created for the + // actual modification. + ldout(cct, 20) << "copyup with empty snapshot context" << dendl; + + auto copyup_io_context = *io_context; + copyup_io_context.write_snap_context({}); + + m_image_ctx->rados_api.execute( + object, copyup_io_context, std::move(copyup_op), + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_copyup(r); }), nullptr, + (this->m_trace.valid() ? this->m_trace.get_info() : nullptr)); + } + + if (write_op.size() > 0) { + // compare-and-write doesn't add any write ops (copyup+cmpext+write + // can't be executed in the same RADOS op because, unless the object + // was already present in the clone, cmpext wouldn't see it) + ldout(cct, 20) << (!deep_copyup && write_op.size() > 2 ? + "copyup + ops" : !deep_copyup ? "copyup" : "ops") + << " with current snapshot context" << dendl; + + m_image_ctx->rados_api.execute( + object, *io_context, std::move(write_op), + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_copyup(r); }), nullptr, + (this->m_trace.valid() ? 
this->m_trace.get_info() : nullptr)); + } +} + +template +void CopyupRequest::handle_copyup(int r) { + auto cct = m_image_ctx->cct; + unsigned pending_copyups; + int copyup_ret_val = r; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_pending_copyups > 0); + pending_copyups = --m_pending_copyups; + if (m_copyup_ret_val < 0) { + copyup_ret_val = m_copyup_ret_val; + } else if (r < 0) { + m_copyup_ret_val = r; + } + } + + ldout(cct, 20) << "r=" << r << ", " + << "pending=" << pending_copyups << dendl; + + if (pending_copyups == 0) { + if (copyup_ret_val < 0 && copyup_ret_val != -ENOENT) { + lderr(cct) << "failed to copyup object: " << cpp_strerror(copyup_ret_val) + << dendl; + complete_requests(false, copyup_ret_val); + } + + finish(0); + } +} + +template +void CopyupRequest::finish(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + complete_requests(true, r); + delete this; +} + +template +void CopyupRequest::complete_requests(bool override_restart_retval, int r) { + auto cct = m_image_ctx->cct; + remove_from_list(); + + while (!m_pending_requests.empty()) { + auto it = m_pending_requests.begin(); + auto req = *it; + ldout(cct, 20) << "completing request " << req << dendl; + req->handle_copyup(r); + m_pending_requests.erase(it); + } + + if (override_restart_retval) { + r = -ERESTART; + } + + while (!m_restart_requests.empty()) { + auto it = m_restart_requests.begin(); + auto req = *it; + ldout(cct, 20) << "restarting request " << req << dendl; + req->handle_copyup(r); + m_restart_requests.erase(it); + } +} + +template +void CopyupRequest::disable_append_requests() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + m_append_request_permitted = false; +} + +template +void CopyupRequest::remove_from_list() { + std::lock_guard copyup_list_locker{m_image_ctx->copyup_list_lock}; + + auto it = m_image_ctx->copyup_list.find(m_object_no); + if (it != m_image_ctx->copyup_list.end()) { + m_image_ctx->copyup_list.erase(it); + } +} + +template +bool CopyupRequest::is_copyup_required() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + bool copy_on_read = m_pending_requests.empty(); + if (copy_on_read) { + // always force a copyup if CoR enabled + return true; + } + + if (!m_copyup_is_zero) { + return true; + } + + for (auto req : m_pending_requests) { + if (!req->is_empty_write_op()) { + return true; + } + } + return false; +} + +template +bool CopyupRequest::is_deep_copy() const { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + return !m_image_ctx->migration_info.empty(); +} + +template +bool CopyupRequest::is_update_object_map_required(int r) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + + if (r < 0) { + return false; + } + + if (m_image_ctx->object_map == nullptr) { + return false; + } + + if (m_image_ctx->migration_info.empty()) { + // migration might have completed while IO was in-flight, + // assume worst-case and perform an object map update + return true; + } + + auto it = m_image_ctx->migration_info.snap_map.find(CEPH_NOSNAP); + ceph_assert(it != m_image_ctx->migration_info.snap_map.end()); + return it->second[0] != CEPH_NOSNAP; +} + +template +void CopyupRequest::compute_deep_copy_snap_ids() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + + // don't copy ids for the snaps updated by object deep copy or + // that don't overlap + std::set deep_copied; + for (auto &it : m_image_ctx->migration_info.snap_map) { + if (it.first != CEPH_NOSNAP) { + deep_copied.insert(it.second.front()); + } + } + 
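+  // e.g. a snap_map of { src 1 -> [dst 11], src 2 -> [dst 12] } yields
+  // deep_copied = {11, 12}; any destination snapshot outside that set was
+  // created after migration started, so the copy_if below keeps it for an
+  // object map update (provided the object still overlaps the parent)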
ldout(m_image_ctx->cct, 15) << "deep_copied=" << deep_copied << dendl; + + std::copy_if(m_image_ctx->snaps.rbegin(), m_image_ctx->snaps.rend(), + std::back_inserter(m_snap_ids), + [this, cct=m_image_ctx->cct, &deep_copied](uint64_t snap_id) { + if (deep_copied.count(snap_id)) { + m_first_snap_is_clean = true; + return false; + } + + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + int r = m_image_ctx->get_parent_overlap(snap_id, &raw_overlap); + if (r < 0) { + ldout(cct, 5) << "failed getting parent overlap for snap_id: " + << snap_id << ": " << cpp_strerror(r) << dendl; + } else if (raw_overlap > 0) { + auto [parent_extents, area] = util::object_to_area_extents( + m_image_ctx, m_object_no, {{0, m_image_ctx->layout.object_size}}); + object_overlap = m_image_ctx->prune_parent_extents(parent_extents, area, + raw_overlap, false); + } + return object_overlap > 0; + }); +} + +template +void CopyupRequest::convert_copyup_extent_map() { + auto cct = m_image_ctx->cct; + + Extents image_extent_map; + image_extent_map.swap(m_copyup_extent_map); + m_copyup_extent_map.reserve(image_extent_map.size()); + + // convert the image-extent extent map to object-extents + for (auto [image_offset, image_length] : image_extent_map) { + striper::LightweightObjectExtents object_extents; + util::area_to_object_extents(m_image_ctx, image_offset, image_length, + m_image_area, 0, &object_extents); + for (auto& object_extent : object_extents) { + m_copyup_extent_map.emplace_back( + object_extent.offset, object_extent.length); + } + } + + ldout(cct, 20) << "image_extents=" << image_extent_map << ", " + << "object_extents=" << m_copyup_extent_map << dendl; +} + +template +int CopyupRequest::prepare_copyup_data() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + auto cct = m_image_ctx->cct; + + SnapshotSparseBufferlist snapshot_sparse_bufferlist; + auto& sparse_bufferlist = snapshot_sparse_bufferlist[0]; + + bool copy_on_read = m_pending_requests.empty(); + bool maybe_deep_copyup = !m_image_ctx->snapc.snaps.empty(); + if (copy_on_read || maybe_deep_copyup) { + // stand-alone copyup that will not be overwritten until HEAD revision + ldout(cct, 20) << "processing full copy-up" << dendl; + + uint64_t buffer_offset = 0; + for (auto [object_offset, object_length] : m_copyup_extent_map) { + bufferlist sub_bl; + sub_bl.substr_of(m_copyup_data, buffer_offset, object_length); + buffer_offset += object_length; + + sparse_bufferlist.insert( + object_offset, object_length, + {SPARSE_EXTENT_STATE_DATA, object_length, std::move(sub_bl)}); + } + } else { + // copyup that will concurrently written to the HEAD revision with the + // associated write-ops so only process partial extents + uint64_t buffer_offset = 0; + for (auto [object_offset, object_length] : m_copyup_extent_map) { + interval_set copyup_object_extents; + copyup_object_extents.insert(object_offset, object_length); + + interval_set intersection; + intersection.intersection_of(copyup_object_extents, + m_write_object_extents); + + // extract only portions of the parent copyup data that have not + // been overwritten by write-ops + copyup_object_extents.subtract(intersection); + for (auto [copyup_offset, copyup_length] : copyup_object_extents) { + bufferlist sub_bl; + sub_bl.substr_of( + m_copyup_data, buffer_offset + (copyup_offset - object_offset), + copyup_length); + ceph_assert(sub_bl.length() == copyup_length); + + sparse_bufferlist.insert( + copyup_offset, copyup_length, + {SPARSE_EXTENT_STATE_DATA, copyup_length, std::move(sub_bl)}); + } + 
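+      // e.g. a copyup extent (0, 8192) with a pending write covering
+      // (4096, 4096) is reduced to (0, 4096): only the ranges the
+      // write-ops will not overwrite are kept from the parent data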
buffer_offset += object_length; + } + + ldout(cct, 20) << "processing partial copy-up: " << sparse_bufferlist + << dendl; + } + + // Let dispatch layers have a chance to process the data + auto r = m_image_ctx->io_object_dispatcher->prepare_copyup( + m_object_no, &snapshot_sparse_bufferlist); + if (r < 0) { + return r; + } + + // Convert sparse extents back to extent map + m_copyup_data.clear(); + m_copyup_extent_map.clear(); + m_copyup_extent_map.reserve(sparse_bufferlist.ext_count()); + for (auto& extent : sparse_bufferlist) { + auto& sbe = extent.get_val(); + if (sbe.state == SPARSE_EXTENT_STATE_DATA) { + m_copyup_extent_map.emplace_back(extent.get_off(), extent.get_len()); + m_copyup_data.append(sbe.bl); + } + } + + return 0; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::CopyupRequest; diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h new file mode 100644 index 000000000..a94139421 --- /dev/null +++ b/src/librbd/io/CopyupRequest.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_COPYUP_REQUEST_H +#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/interval_set.h" +#include "common/ceph_mutex.h" +#include "common/zipkin_trace.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/Types.h" + +#include +#include +#include + +namespace ZTracer { struct Trace; } + +namespace librbd { + +struct ImageCtx; + +namespace io { + +template class AbstractObjectWriteRequest; + +template +class CopyupRequest { +public: + static CopyupRequest* create(ImageCtxT *ictx, uint64_t objectno, + Extents &&image_extents, ImageArea area, + const ZTracer::Trace &parent_trace) { + return new CopyupRequest(ictx, objectno, std::move(image_extents), area, + parent_trace); + } + + CopyupRequest(ImageCtxT *ictx, uint64_t objectno, + Extents &&image_extents, ImageArea area, + const ZTracer::Trace &parent_trace); + ~CopyupRequest(); + + void append_request(AbstractObjectWriteRequest *req, + const Extents& object_extents); + + void send(); + +private: + /** + * Copyup requests go through the following state machine to read from the + * parent image, update the object map, and copyup the object: + * + * + * @verbatim + * + * + * | + * /---------/ \---------\ + * | | + * v v + * READ_FROM_PARENT DEEP_COPY + * | | + * \---------\ /---------/ + * | + * v (skip if not needed) + * UPDATE_OBJECT_MAPS + * | + * v (skip if not needed) + * COPYUP + * | + * v + * + * + * @endverbatim + * + * The OBJECT_MAP state is skipped if the object map isn't enabled or if + * an object map update isn't required. The COPYUP state is skipped if + * no data was read from the parent *and* there are no additional ops. 
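+ *
+ * Write-ops that arrive while a copyup is already in flight are either
+ * appended to it (append_request) while appends are still permitted, or
+ * parked as restart requests and normally completed with -ERESTART so the
+ * object-request layer retries them once the copyup has settled.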
+ */ + + typedef std::vector *> WriteRequests; + + ImageCtxT *m_image_ctx; + uint64_t m_object_no; + Extents m_image_extents; + ImageArea m_image_area; + ZTracer::Trace m_trace; + + bool m_flatten = false; + bool m_copyup_required = true; + bool m_copyup_is_zero = true; + bool m_deep_copied = false; + + Extents m_copyup_extent_map; + ceph::bufferlist m_copyup_data; + + AsyncOperation m_async_op; + + std::vector m_snap_ids; + bool m_first_snap_is_clean = false; + + ceph::mutex m_lock = ceph::make_mutex("CopyupRequest", false); + WriteRequests m_pending_requests; + unsigned m_pending_copyups = 0; + int m_copyup_ret_val = 0; + + WriteRequests m_restart_requests; + bool m_append_request_permitted = true; + + interval_set m_write_object_extents; + + void read_from_parent(); + void handle_read_from_parent(int r); + + void deep_copy(); + void handle_deep_copy(int r); + + void update_object_maps(); + void handle_update_object_maps(int r); + + void copyup(); + void handle_copyup(int r); + + void finish(int r); + void complete_requests(bool override_restart_retval, int r); + + void disable_append_requests(); + void remove_from_list(); + + bool is_copyup_required(); + bool is_update_object_map_required(int r); + bool is_deep_copy() const; + + void compute_deep_copy_snap_ids(); + void convert_copyup_extent_map(); + int prepare_copyup_data(); +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::CopyupRequest; + +#endif // CEPH_LIBRBD_IO_COPYUP_REQUEST_H diff --git a/src/librbd/io/Dispatcher.h b/src/librbd/io/Dispatcher.h new file mode 100644 index 000000000..cb64e11b2 --- /dev/null +++ b/src/librbd/io/Dispatcher.h @@ -0,0 +1,252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_DISPATCHER_H +#define CEPH_LIBRBD_IO_DISPATCHER_H + +#include "include/int_types.h" +#include "include/Context.h" +#include "common/ceph_mutex.h" +#include "common/dout.h" +#include "common/AsyncOpTracker.h" +#include "librbd/Utils.h" +#include "librbd/io/DispatcherInterface.h" +#include "librbd/io/Types.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::Dispatcher: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template +class Dispatcher : public DispatchInterfaceT { +public: + typedef typename DispatchInterfaceT::Dispatch Dispatch; + typedef typename DispatchInterfaceT::DispatchLayer DispatchLayer; + typedef typename DispatchInterfaceT::DispatchSpec DispatchSpec; + + Dispatcher(ImageCtxT* image_ctx) + : m_image_ctx(image_ctx), + m_lock(ceph::make_shared_mutex( + librbd::util::unique_lock_name("librbd::io::Dispatcher::lock", + this))) { + } + + virtual ~Dispatcher() { + ceph_assert(m_dispatches.empty()); + } + + void shut_down(Context* on_finish) override { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + std::map dispatches; + { + std::unique_lock locker{m_lock}; + std::swap(dispatches, m_dispatches); + } + + for (auto it : dispatches) { + shut_down_dispatch(it.second, &on_finish); + } + on_finish->complete(0); + } + + void register_dispatch(Dispatch* dispatch) override { + auto cct = m_image_ctx->cct; + auto type = dispatch->get_dispatch_layer(); + ldout(cct, 5) << "dispatch_layer=" << type << dendl; + + std::unique_lock locker{m_lock}; + + auto result = m_dispatches.insert( + {type, {dispatch, new AsyncOpTracker()}}); + ceph_assert(result.second); + } + + bool exists(DispatchLayer dispatch_layer) override 
{ + std::unique_lock locker{m_lock}; + return m_dispatches.find(dispatch_layer) != m_dispatches.end(); + } + + void shut_down_dispatch(DispatchLayer dispatch_layer, + Context* on_finish) override { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "dispatch_layer=" << dispatch_layer << dendl; + + DispatchMeta dispatch_meta; + { + std::unique_lock locker{m_lock}; + auto it = m_dispatches.find(dispatch_layer); + if (it == m_dispatches.end()) { + on_finish->complete(0); + return; + } + + dispatch_meta = it->second; + m_dispatches.erase(it); + } + + shut_down_dispatch(dispatch_meta, &on_finish); + on_finish->complete(0); + } + + void send(DispatchSpec* dispatch_spec) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "dispatch_spec=" << dispatch_spec << dendl; + + auto dispatch_layer = dispatch_spec->dispatch_layer; + + // apply the IO request to all layers -- this method will be re-invoked + // by the dispatch layer if continuing / restarting the IO + while (true) { + m_lock.lock_shared(); + dispatch_layer = dispatch_spec->dispatch_layer; + auto it = m_dispatches.upper_bound(dispatch_layer); + if (it == m_dispatches.end()) { + // the request is complete if handled by all layers + dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE; + m_lock.unlock_shared(); + break; + } + + auto& dispatch_meta = it->second; + auto dispatch = dispatch_meta.dispatch; + auto async_op_tracker = dispatch_meta.async_op_tracker; + dispatch_spec->dispatch_result = DISPATCH_RESULT_INVALID; + + // prevent recursive locking back into the dispatcher while handling IO + async_op_tracker->start_op(); + m_lock.unlock_shared(); + + // advance to next layer in case we skip or continue + dispatch_spec->dispatch_layer = dispatch->get_dispatch_layer(); + + bool handled = send_dispatch(dispatch, dispatch_spec); + async_op_tracker->finish_op(); + + // handled ops will resume when the dispatch ctx is invoked + if (handled) { + return; + } + } + + // skipped through to the last layer + dispatch_spec->dispatcher_ctx.complete(0); + } + +protected: + struct DispatchMeta { + Dispatch* dispatch = nullptr; + AsyncOpTracker* async_op_tracker = nullptr; + + DispatchMeta() { + } + DispatchMeta(Dispatch* dispatch, AsyncOpTracker* async_op_tracker) + : dispatch(dispatch), async_op_tracker(async_op_tracker) { + } + }; + + ImageCtxT* m_image_ctx; + + ceph::shared_mutex m_lock; + std::map m_dispatches; + + virtual bool send_dispatch(Dispatch* dispatch, + DispatchSpec* dispatch_spec) = 0; + +protected: + struct C_LayerIterator : public Context { + Dispatcher* dispatcher; + Context* on_finish; + DispatchLayer dispatch_layer; + + C_LayerIterator(Dispatcher* dispatcher, + DispatchLayer start_layer, + Context* on_finish) + : dispatcher(dispatcher), on_finish(on_finish), dispatch_layer(start_layer) { + } + + void complete(int r) override { + while (true) { + dispatcher->m_lock.lock_shared(); + auto it = dispatcher->m_dispatches.upper_bound(dispatch_layer); + if (it == dispatcher->m_dispatches.end()) { + dispatcher->m_lock.unlock_shared(); + Context::complete(r); + return; + } + + auto& dispatch_meta = it->second; + auto dispatch = dispatch_meta.dispatch; + + // prevent recursive locking back into the dispatcher while handling IO + dispatch_meta.async_op_tracker->start_op(); + dispatcher->m_lock.unlock_shared(); + + // next loop should start after current layer + dispatch_layer = dispatch->get_dispatch_layer(); + + auto handled = execute(dispatch, this); + dispatch_meta.async_op_tracker->finish_op(); + + if (handled) { + break; + } + } + } + + 
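+    // complete() above re-enters the layer loop; finish() runs once every
+    // layer has been visited. Note r is dropped and on_finish always gets 0.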
void finish(int r) override { + on_finish->complete(0); + } + virtual bool execute(Dispatch* dispatch, + Context* on_finish) = 0; + }; + + struct C_InvalidateCache : public C_LayerIterator { + C_InvalidateCache(Dispatcher* dispatcher, DispatchLayer start_layer, Context* on_finish) + : C_LayerIterator(dispatcher, start_layer, on_finish) { + } + + bool execute(Dispatch* dispatch, + Context* on_finish) override { + return dispatch->invalidate_cache(on_finish); + } + }; + +private: + void shut_down_dispatch(DispatchMeta& dispatch_meta, + Context** on_finish) { + auto dispatch = dispatch_meta.dispatch; + auto async_op_tracker = dispatch_meta.async_op_tracker; + + auto ctx = *on_finish; + ctx = new LambdaContext( + [dispatch, async_op_tracker, ctx](int r) { + delete dispatch; + delete async_op_tracker; + + ctx->complete(r); + }); + ctx = new LambdaContext([dispatch, ctx](int r) { + dispatch->shut_down(ctx); + }); + *on_finish = new LambdaContext([async_op_tracker, ctx](int r) { + async_op_tracker->wait_for_ops(ctx); + }); + } + +}; + +} // namespace io +} // namespace librbd + +#undef dout_subsys +#undef dout_prefix +#define dout_prefix *_dout + +#endif // CEPH_LIBRBD_IO_DISPATCHER_H diff --git a/src/librbd/io/DispatcherInterface.h b/src/librbd/io/DispatcherInterface.h new file mode 100644 index 000000000..2bac9ee75 --- /dev/null +++ b/src/librbd/io/DispatcherInterface.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H +#define CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H + +#include "include/int_types.h" + +struct Context; + +namespace librbd { +namespace io { + +template +struct DispatcherInterface { +public: + typedef DispatchT Dispatch; + typedef typename DispatchT::DispatchLayer DispatchLayer; + typedef typename DispatchT::DispatchSpec DispatchSpec; + + virtual ~DispatcherInterface() { + } + + virtual void shut_down(Context* on_finish) = 0; + + virtual void register_dispatch(Dispatch* dispatch) = 0; + virtual bool exists(DispatchLayer dispatch_layer) = 0; + virtual void shut_down_dispatch(DispatchLayer dispatch_layer, + Context* on_finish) = 0; + + virtual void send(DispatchSpec* dispatch_spec) = 0; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_DISPATCHER_INTERFACE_H diff --git a/src/librbd/io/FlushTracker.cc b/src/librbd/io/FlushTracker.cc new file mode 100644 index 000000000..b6e2ed658 --- /dev/null +++ b/src/librbd/io/FlushTracker.cc @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/FlushTracker.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::FlushTracker: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template +FlushTracker::FlushTracker(I* image_ctx) + : m_image_ctx(image_ctx), + m_lock(ceph::make_shared_mutex( + util::unique_lock_name("librbd::io::FlushTracker::m_lock", this))) { +} + +template +FlushTracker::~FlushTracker() { + std::unique_lock locker{m_lock}; + ceph_assert(m_flush_contexts.empty()); +} + +template +void FlushTracker::shut_down() { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + std::unique_lock locker{m_lock}; + Contexts flush_ctxs; + for (auto& [flush_tid, ctxs] : m_flush_contexts) { + flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end()); + } 
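+  // hand the pending flush contexts over so they complete outside the lock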
+ m_flush_contexts.clear(); + locker.unlock(); + + for (auto ctx : flush_ctxs) { + ctx->complete(0); + } +} + +template +uint64_t FlushTracker::start_io(uint64_t tid) { + auto cct = m_image_ctx->cct; + + std::unique_lock locker{m_lock}; + auto [it, inserted] = m_tid_to_flush_tid.insert({tid, ++m_next_flush_tid}); + auto flush_tid = it->second; + m_in_flight_flush_tids.insert(flush_tid); + locker.unlock(); + + ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl; + return flush_tid; +} + +template +void FlushTracker::finish_io(uint64_t tid) { + auto cct = m_image_ctx->cct; + + std::unique_lock locker{m_lock}; + auto tid_to_flush_tid_it = m_tid_to_flush_tid.find(tid); + if (tid_to_flush_tid_it == m_tid_to_flush_tid.end()) { + return; + } + + auto flush_tid = tid_to_flush_tid_it->second; + m_tid_to_flush_tid.erase(tid_to_flush_tid_it); + m_in_flight_flush_tids.erase(flush_tid); + + ldout(cct, 20) << "tid=" << tid << ", flush_tid=" << flush_tid << dendl; + auto oldest_flush_tid = std::numeric_limits::max(); + if (!m_in_flight_flush_tids.empty()) { + oldest_flush_tid = *m_in_flight_flush_tids.begin(); + } + + // all flushes tagged before the oldest tid should be completed + Contexts flush_ctxs; + auto flush_contexts_it = m_flush_contexts.begin(); + while (flush_contexts_it != m_flush_contexts.end()) { + if (flush_contexts_it->first >= oldest_flush_tid) { + ldout(cct, 20) << "pending IOs: [" << m_in_flight_flush_tids << "], " + << "pending flushes=" << m_flush_contexts << dendl; + break; + } + + auto& ctxs = flush_contexts_it->second; + flush_ctxs.insert(flush_ctxs.end(), ctxs.begin(), ctxs.end()); + flush_contexts_it = m_flush_contexts.erase(flush_contexts_it); + } + locker.unlock(); + + if (!flush_ctxs.empty()) { + ldout(cct, 20) << "completing flushes: " << flush_ctxs << dendl; + for (auto ctx : flush_ctxs) { + ctx->complete(0); + } + } +} + +template +void FlushTracker::flush(Context* on_finish) { + auto cct = m_image_ctx->cct; + + std::unique_lock locker{m_lock}; + if (m_in_flight_flush_tids.empty()) { + locker.unlock(); + on_finish->complete(0); + return; + } + + auto flush_tid = *m_in_flight_flush_tids.rbegin(); + m_flush_contexts[flush_tid].push_back(on_finish); + ldout(cct, 20) << "flush_tid=" << flush_tid << ", ctx=" << on_finish << ", " + << "flush_contexts=" << m_flush_contexts << dendl; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::FlushTracker; diff --git a/src/librbd/io/FlushTracker.h b/src/librbd/io/FlushTracker.h new file mode 100644 index 000000000..cc7fcd9ae --- /dev/null +++ b/src/librbd/io/FlushTracker.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_FLUSH_TRACKER_H +#define CEPH_LIBRBD_IO_FLUSH_TRACKER_H + +#include "include/int_types.h" +#include "common/ceph_mutex.h" +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template +class FlushTracker { +public: + FlushTracker(ImageCtxT* image_ctx); + ~FlushTracker(); + + void shut_down(); + + uint64_t start_io(uint64_t tid); + void finish_io(uint64_t tid); + + void flush(Context* on_finish); + +private: + typedef std::list Contexts; + typedef std::map FlushContexts; + typedef std::set Tids; + typedef std::unordered_map TidToFlushTid; + + ImageCtxT* m_image_ctx; + + std::atomic m_next_flush_tid{0}; + + mutable ceph::shared_mutex m_lock; + TidToFlushTid m_tid_to_flush_tid; + + Tids 
m_in_flight_flush_tids; + FlushContexts m_flush_contexts; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::FlushTracker; + +#endif // CEPH_LIBRBD_IO_FLUSH_TRACKER_H diff --git a/src/librbd/io/ImageDispatch.cc b/src/librbd/io/ImageDispatch.cc new file mode 100644 index 000000000..12c55cb0c --- /dev/null +++ b/src/librbd/io/ImageDispatch.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageDispatch.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ObjectDispatcherInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageDispatch: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace io { + +namespace { + +void start_in_flight_io(AioCompletion* aio_comp) { + // TODO remove AsyncOperation from AioCompletion + if (!aio_comp->async_op.started()) { + aio_comp->start_op(); + } +} + +ImageArea get_area(const std::atomic* image_dispatch_flags) { + return (*image_dispatch_flags & IMAGE_DISPATCH_FLAG_CRYPTO_HEADER ? + ImageArea::CRYPTO_HEADER : ImageArea::DATA); +} + +} // anonymous namespace + +template +void ImageDispatch::shut_down(Context* on_finish) { + on_finish->complete(0); +} + +template +bool ImageDispatch::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto area = get_area(image_dispatch_flags); + ldout(cct, 20) << "image_extents=" << image_extents + << " area=" << area << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageRequest::aio_read(m_image_ctx, aio_comp, std::move(image_extents), + area, std::move(read_result), io_context, op_flags, + read_flags, parent_trace); + return true; +} + +template +bool ImageDispatch::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto area = get_area(image_dispatch_flags); + ldout(cct, 20) << "image_extents=" << image_extents + << " area=" << area << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageRequest::aio_write(m_image_ctx, aio_comp, std::move(image_extents), + area, std::move(bl), op_flags, parent_trace); + return true; +} + +template +bool ImageDispatch::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto area = get_area(image_dispatch_flags); + ldout(cct, 20) << "image_extents=" << image_extents + << " area=" << area << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageRequest::aio_discard(m_image_ctx, aio_comp, std::move(image_extents), + area, discard_granularity_bytes, parent_trace); + return true; +} 
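An aside on the FlushTracker completed a few hunks above: its invariant is that a flush fires only once every write issued before it has finished, tracked with monotonically increasing flush tids. A stand-alone sketch of that bookkeeping with plain std containers (ToyFlushTracker is hypothetical, not librbd API):

  #include <cstdint>
  #include <functional>
  #include <map>
  #include <set>
  #include <vector>

  // Toy model of FlushTracker's ordering rule: a flush completes once
  // every write issued before it has finished.
  class ToyFlushTracker {
    uint64_t next_tid = 0;
    std::set<uint64_t> in_flight;                 // tids still executing
    std::map<uint64_t, std::vector<std::function<void()>>> waiters;

  public:
    uint64_t start_io() {
      uint64_t tid = ++next_tid;
      in_flight.insert(tid);
      return tid;
    }

    void flush(std::function<void()> on_finish) {
      if (in_flight.empty()) {
        on_finish();                              // nothing outstanding
      } else {
        // tag the flush with the newest in-flight tid
        waiters[*in_flight.rbegin()].push_back(std::move(on_finish));
      }
    }

    void finish_io(uint64_t tid) {
      in_flight.erase(tid);
      uint64_t oldest = in_flight.empty() ? UINT64_MAX : *in_flight.begin();
      // everything tagged before the oldest in-flight tid can now fire
      for (auto it = waiters.begin();
           it != waiters.end() && it->first < oldest;
           it = waiters.erase(it)) {
        for (auto& cb : it->second) {
          cb();
        }
      }
    }
  };

  int main() {
    ToyFlushTracker t;
    uint64_t a = t.start_io();
    uint64_t b = t.start_io();
    t.flush([] { /* runs only after both writes below finish */ });
    t.finish_io(a);
    t.finish_io(b);  // flush callback fires here
  }
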
+ +template +bool ImageDispatch::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto area = get_area(image_dispatch_flags); + ldout(cct, 20) << "image_extents=" << image_extents + << " area=" << area << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageRequest::aio_writesame(m_image_ctx, aio_comp, + std::move(image_extents), area, std::move(bl), + op_flags, parent_trace); + return true; +} + +template +bool ImageDispatch::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto area = get_area(image_dispatch_flags); + ldout(cct, 20) << "image_extents=" << image_extents + << " area=" << area << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageRequest::aio_compare_and_write(m_image_ctx, aio_comp, + std::move(image_extents), area, + std::move(cmp_bl), std::move(bl), + mismatch_offset, op_flags, + parent_trace); + return true; +} + +template +bool ImageDispatch::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageRequest::aio_flush(m_image_ctx, aio_comp, flush_source, parent_trace); + return true; +} + +template +bool ImageDispatch::list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto area = get_area(image_dispatch_flags); + ldout(cct, 20) << "image_extents=" << image_extents + << " area=" << area << dendl; + + start_in_flight_io(aio_comp); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + ImageListSnapsRequest req(*m_image_ctx, aio_comp, std::move(image_extents), + area, std::move(snap_ids), list_snaps_flags, + snapshot_delta, parent_trace); + req.send(); + return true; +} + +template +bool ImageDispatch::invalidate_cache(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + std::shared_lock owner_lock{m_image_ctx->owner_lock}; + m_image_ctx->io_object_dispatcher->invalidate_cache(on_finish); + return true; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageDispatch; diff --git a/src/librbd/io/ImageDispatch.h b/src/librbd/io/ImageDispatch.h new file mode 100644 index 000000000..4a89c6054 --- /dev/null +++ b/src/librbd/io/ImageDispatch.h @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_H + +#include 
"librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template +class ImageDispatch : public ImageDispatchInterface { +public: + ImageDispatch(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) { + } + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_CORE; + } + + void shut_down(Context* on_finish) override; + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool invalidate_cache(Context* on_finish) override; + +private: + ImageCtxT* m_image_ctx; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageDispatch; + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_H diff --git a/src/librbd/io/ImageDispatchInterface.h b/src/librbd/io/ImageDispatchInterface.h new file mode 100644 index 000000000..e479f7eef --- /dev/null +++ b/src/librbd/io/ImageDispatchInterface.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include + 
+struct Context; + +namespace librbd { +namespace io { + +struct AioCompletion; +struct ImageDispatchSpec; + +struct ImageDispatchInterface { + typedef ImageDispatchLayer DispatchLayer; + typedef ImageDispatchSpec DispatchSpec; + + virtual ~ImageDispatchInterface() { + } + + virtual ImageDispatchLayer get_dispatch_layer() const = 0; + + virtual void shut_down(Context* on_finish) = 0; + + virtual bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + virtual bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + virtual bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + virtual bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + virtual bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + virtual bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool invalidate_cache(Context* on_finish) = 0; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_INTERFACE_H diff --git a/src/librbd/io/ImageDispatchSpec.cc b/src/librbd/io/ImageDispatchSpec.cc new file mode 100644 index 000000000..95d8224ae --- /dev/null +++ b/src/librbd/io/ImageDispatchSpec.cc @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include + +namespace librbd { +namespace io { + +void ImageDispatchSpec::C_Dispatcher::complete(int r) { + switch (image_dispatch_spec->dispatch_result) { + case DISPATCH_RESULT_RESTART: + ceph_assert(image_dispatch_spec->dispatch_layer != 0); + image_dispatch_spec->dispatch_layer = static_cast( + image_dispatch_spec->dispatch_layer - 1); + [[fallthrough]]; + case 
DISPATCH_RESULT_CONTINUE: + if (r < 0) { + // bubble dispatch failure through AioCompletion + image_dispatch_spec->dispatch_result = DISPATCH_RESULT_COMPLETE; + image_dispatch_spec->fail(r); + return; + } + + image_dispatch_spec->send(); + break; + case DISPATCH_RESULT_COMPLETE: + finish(r); + break; + case DISPATCH_RESULT_INVALID: + ceph_abort(); + break; + } +} + +void ImageDispatchSpec::C_Dispatcher::finish(int r) { + delete image_dispatch_spec; +} + +void ImageDispatchSpec::send() { + image_dispatcher->send(this); +} + +void ImageDispatchSpec::fail(int r) { + dispatch_result = DISPATCH_RESULT_COMPLETE; + aio_comp->fail(r); +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/ImageDispatchSpec.h b/src/librbd/io/ImageDispatchSpec.h new file mode 100644 index 000000000..9323f9879 --- /dev/null +++ b/src/librbd/io/ImageDispatchSpec.h @@ -0,0 +1,254 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "common/zipkin_trace.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/Types.h" +#include "librbd/io/ReadResult.h" +#include +#include + +namespace librbd { + +class ImageCtx; + +namespace io { + +struct ImageDispatcherInterface; + +class ImageDispatchSpec { +private: + // helper to avoid extra heap allocation per object IO + struct C_Dispatcher : public Context { + ImageDispatchSpec* image_dispatch_spec; + + C_Dispatcher(ImageDispatchSpec* image_dispatch_spec) + : image_dispatch_spec(image_dispatch_spec) { + } + + void complete(int r) override; + void finish(int r) override; + }; + +public: + struct Read { + ReadResult read_result; + int read_flags; + + Read(ReadResult &&read_result, int read_flags) + : read_result(std::move(read_result)), read_flags(read_flags) { + } + }; + + struct Discard { + uint32_t discard_granularity_bytes; + + Discard(uint32_t discard_granularity_bytes) + : discard_granularity_bytes(discard_granularity_bytes) { + } + }; + + struct Write { + bufferlist bl; + + Write(bufferlist&& bl) : bl(std::move(bl)) { + } + }; + + struct WriteSame { + bufferlist bl; + + WriteSame(bufferlist&& bl) : bl(std::move(bl)) { + } + }; + + struct CompareAndWrite { + bufferlist cmp_bl; + bufferlist bl; + uint64_t *mismatch_offset; + + CompareAndWrite(bufferlist&& cmp_bl, bufferlist&& bl, + uint64_t *mismatch_offset) + : cmp_bl(std::move(cmp_bl)), bl(std::move(bl)), + mismatch_offset(mismatch_offset) { + } + }; + + struct Flush { + FlushSource flush_source; + + Flush(FlushSource flush_source) : flush_source(flush_source) { + } + }; + + struct ListSnaps { + SnapIds snap_ids; + int list_snaps_flags; + SnapshotDelta* snapshot_delta; + + ListSnaps(SnapIds&& snap_ids, int list_snaps_flags, + SnapshotDelta* snapshot_delta) + : snap_ids(std::move(snap_ids)), list_snaps_flags(list_snaps_flags), + snapshot_delta(snapshot_delta) { + } + }; + + typedef boost::variant Request; + + C_Dispatcher dispatcher_ctx; + + ImageDispatcherInterface* image_dispatcher; + ImageDispatchLayer dispatch_layer; + std::atomic image_dispatch_flags = 0; + DispatchResult dispatch_result = DISPATCH_RESULT_INVALID; + + AioCompletion* aio_comp; + Extents image_extents; + Request request; + IOContext io_context; + int op_flags; + ZTracer::Trace parent_trace; + uint64_t tid = 0; + + template + static ImageDispatchSpec* create_read( + ImageCtxT &image_ctx, 
ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + Read{std::move(read_result), read_flags}, + io_context, op_flags, parent_trace); + } + + template + static ImageDispatchSpec* create_discard( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + Discard{discard_granularity_bytes}, + {}, 0, parent_trace); + } + + template + static ImageDispatchSpec* create_write( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + Write{std::move(bl)}, + {}, op_flags, parent_trace); + } + + template + static ImageDispatchSpec* create_write_same( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + WriteSame{std::move(bl)}, + {}, op_flags, parent_trace); + } + + template + static ImageDispatchSpec* create_compare_and_write( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + CompareAndWrite{std::move(cmp_bl), + std::move(bl), + mismatch_offset}, + {}, op_flags, parent_trace); + } + + template + static ImageDispatchSpec* create_flush( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, {}, + ImageArea::DATA /* dummy for {} */, + Flush{flush_source}, {}, 0, parent_trace); + } + + template + static ImageDispatchSpec* create_list_snaps( + ImageCtxT &image_ctx, ImageDispatchLayer image_dispatch_layer, + AioCompletion *aio_comp, Extents &&image_extents, ImageArea area, + SnapIds&& snap_ids, int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace) { + return new ImageDispatchSpec(image_ctx.io_image_dispatcher, + image_dispatch_layer, aio_comp, + std::move(image_extents), area, + ListSnaps{std::move(snap_ids), + list_snaps_flags, snapshot_delta}, + {}, 0, parent_trace); + } + + ~ImageDispatchSpec() { + aio_comp->put(); + } + + void send(); + void fail(int r); + +private: + struct SendVisitor; + struct IsWriteOpVisitor; + struct TokenRequestedVisitor; + + ImageDispatchSpec(ImageDispatcherInterface* image_dispatcher, 
+ ImageDispatchLayer image_dispatch_layer, + AioCompletion* aio_comp, Extents&& image_extents, + ImageArea area, Request&& request, IOContext io_context, + int op_flags, const ZTracer::Trace& parent_trace) + : dispatcher_ctx(this), image_dispatcher(image_dispatcher), + dispatch_layer(image_dispatch_layer), aio_comp(aio_comp), + image_extents(std::move(image_extents)), request(std::move(request)), + io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) { + ceph_assert(aio_comp->image_dispatcher_ctx == nullptr); + aio_comp->image_dispatcher_ctx = &dispatcher_ctx; + aio_comp->get(); + + switch (area) { + case ImageArea::DATA: + break; + case ImageArea::CRYPTO_HEADER: + image_dispatch_flags |= IMAGE_DISPATCH_FLAG_CRYPTO_HEADER; + break; + default: + ceph_abort(); + } + } +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCH_SPEC_H diff --git a/src/librbd/io/ImageDispatcher.cc b/src/librbd/io/ImageDispatcher.cc new file mode 100644 index 000000000..4aa7929b2 --- /dev/null +++ b/src/librbd/io/ImageDispatcher.cc @@ -0,0 +1,324 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageDispatcher.h" +#include "include/Context.h" +#include "common/AsyncOpTracker.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/crypto/CryptoImageDispatch.h" +#include "librbd/io/ImageDispatch.h" +#include "librbd/io/ImageDispatchInterface.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/QueueImageDispatch.h" +#include "librbd/io/QosImageDispatch.h" +#include "librbd/io/RefreshImageDispatch.h" +#include "librbd/io/Utils.h" +#include "librbd/io/WriteBlockImageDispatch.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageDispatcher: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template +struct ImageDispatcher::SendVisitor : public boost::static_visitor { + ImageDispatchInterface* image_dispatch; + ImageDispatchSpec* image_dispatch_spec; + + SendVisitor(ImageDispatchInterface* image_dispatch, + ImageDispatchSpec* image_dispatch_spec) + : image_dispatch(image_dispatch), + image_dispatch_spec(image_dispatch_spec) { + } + + bool operator()(ImageDispatchSpec::Read& read) const { + return image_dispatch->read( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + std::move(read.read_result), image_dispatch_spec->io_context, + image_dispatch_spec->op_flags, read.read_flags, + image_dispatch_spec->parent_trace, image_dispatch_spec->tid, + &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::Discard& discard) const { + return image_dispatch->discard( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + discard.discard_granularity_bytes, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::Write& write) const { + return image_dispatch->write( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), std::move(write.bl), + image_dispatch_spec->op_flags, 
image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::WriteSame& write_same) const { + return image_dispatch->write_same( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), std::move(write_same.bl), + image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()( + ImageDispatchSpec::CompareAndWrite& compare_and_write) const { + return image_dispatch->compare_and_write( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + std::move(compare_and_write.cmp_bl), std::move(compare_and_write.bl), + compare_and_write.mismatch_offset, + image_dispatch_spec->op_flags, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::Flush& flush) const { + return image_dispatch->flush( + image_dispatch_spec->aio_comp, flush.flush_source, + image_dispatch_spec->parent_trace, image_dispatch_spec->tid, + &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } + + bool operator()(ImageDispatchSpec::ListSnaps& list_snaps) const { + return image_dispatch->list_snaps( + image_dispatch_spec->aio_comp, + std::move(image_dispatch_spec->image_extents), + std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags, + list_snaps.snapshot_delta, image_dispatch_spec->parent_trace, + image_dispatch_spec->tid, &image_dispatch_spec->image_dispatch_flags, + &image_dispatch_spec->dispatch_result, + &image_dispatch_spec->aio_comp->image_dispatcher_ctx, + &image_dispatch_spec->dispatcher_ctx); + } +}; + +template +struct ImageDispatcher::PreprocessVisitor + : public boost::static_visitor { + ImageDispatcher* image_dispatcher; + ImageDispatchSpec* image_dispatch_spec; + + PreprocessVisitor(ImageDispatcher* image_dispatcher, + ImageDispatchSpec* image_dispatch_spec) + : image_dispatcher(image_dispatcher), + image_dispatch_spec(image_dispatch_spec) { + } + + bool clip_request() const { + auto area = (image_dispatch_spec->image_dispatch_flags & + IMAGE_DISPATCH_FLAG_CRYPTO_HEADER ? 
ImageArea::CRYPTO_HEADER : + ImageArea::DATA); + int r = util::clip_request(image_dispatcher->m_image_ctx, + &image_dispatch_spec->image_extents, area); + if (r < 0) { + image_dispatch_spec->fail(r); + return true; + } + return false; + } + + bool operator()(ImageDispatchSpec::Read& read) const { + if ((read.read_flags & READ_FLAG_DISABLE_CLIPPING) != 0) { + return false; + } + return clip_request(); + } + + bool operator()(ImageDispatchSpec::Flush&) const { + return clip_request(); + } + + bool operator()(ImageDispatchSpec::ListSnaps&) const { + return false; + } + + template + bool operator()(T&) const { + if (clip_request()) { + return true; + } + + std::shared_lock image_locker{image_dispatcher->m_image_ctx->image_lock}; + if (image_dispatcher->m_image_ctx->snap_id != CEPH_NOSNAP || + image_dispatcher->m_image_ctx->read_only) { + image_dispatch_spec->fail(-EROFS); + return true; + } + return false; + } +}; + +template +ImageDispatcher::ImageDispatcher(I* image_ctx) + : Dispatcher(image_ctx) { + // configure the core image dispatch handler on startup + auto image_dispatch = new ImageDispatch(image_ctx); + this->register_dispatch(image_dispatch); + + auto queue_image_dispatch = new QueueImageDispatch(image_ctx); + this->register_dispatch(queue_image_dispatch); + + m_qos_image_dispatch = new QosImageDispatch(image_ctx); + this->register_dispatch(m_qos_image_dispatch); + + auto refresh_image_dispatch = new RefreshImageDispatch(image_ctx); + this->register_dispatch(refresh_image_dispatch); + + m_write_block_dispatch = new WriteBlockImageDispatch(image_ctx); + this->register_dispatch(m_write_block_dispatch); +} + +template +void ImageDispatcher::invalidate_cache(Context* on_finish) { + auto image_ctx = this->m_image_ctx; + auto cct = image_ctx->cct; + ldout(cct, 5) << dendl; + + auto ctx = new C_InvalidateCache( + this, IMAGE_DISPATCH_LAYER_NONE, on_finish); + ctx->complete(0); +} + +template +void ImageDispatcher::shut_down(Context* on_finish) { + // TODO ensure all IOs are executed via a dispatcher + // ensure read-ahead / copy-on-read ops are finished since they are + // currently outside dispatcher tracking + auto async_op = new AsyncOperation(); + + on_finish = new LambdaContext([async_op, on_finish](int r) { + async_op->finish_op(); + delete async_op; + on_finish->complete(0); + }); + on_finish = new LambdaContext([this, on_finish](int r) { + Dispatcher::shut_down(on_finish); + }); + async_op->start_op(*this->m_image_ctx); + async_op->flush(on_finish); +} + +template +void ImageDispatcher::apply_qos_schedule_tick_min(uint64_t tick) { + m_qos_image_dispatch->apply_qos_schedule_tick_min(tick); +} + +template +void ImageDispatcher::apply_qos_limit(uint64_t flag, uint64_t limit, + uint64_t burst, uint64_t burst_seconds) { + m_qos_image_dispatch->apply_qos_limit(flag, limit, burst, burst_seconds); +} + +template +void ImageDispatcher::apply_qos_exclude_ops(uint64_t exclude_ops) { + m_qos_image_dispatch->apply_qos_exclude_ops(exclude_ops); +} + +template +bool ImageDispatcher::writes_blocked() const { + return m_write_block_dispatch->writes_blocked(); +} + +template +int ImageDispatcher::block_writes() { + return m_write_block_dispatch->block_writes(); +} + +template +void ImageDispatcher::block_writes(Context *on_blocked) { + m_write_block_dispatch->block_writes(on_blocked); +} + +template +void ImageDispatcher::unblock_writes() { + m_write_block_dispatch->unblock_writes(); +} + +template +void ImageDispatcher::wait_on_writes_unblocked(Context *on_unblocked) { + 
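+  // delegate to the write-block layer registered in the constructor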
m_write_block_dispatch->wait_on_writes_unblocked(on_unblocked); +} + +template +void ImageDispatcher::remap_to_physical(Extents& image_extents, + ImageArea area) { + std::shared_lock locker{this->m_lock}; + auto it = this->m_dispatches.find(IMAGE_DISPATCH_LAYER_CRYPTO); + if (it == this->m_dispatches.end()) { + ceph_assert(area == ImageArea::DATA); + return; + } + auto crypto_image_dispatch = static_cast( + it->second.dispatch); + crypto_image_dispatch->remap_to_physical(image_extents, area); +} + +template +ImageArea ImageDispatcher::remap_to_logical(Extents& image_extents) { + std::shared_lock locker{this->m_lock}; + auto it = this->m_dispatches.find(IMAGE_DISPATCH_LAYER_CRYPTO); + if (it == this->m_dispatches.end()) { + return ImageArea::DATA; + } + auto crypto_image_dispatch = static_cast( + it->second.dispatch); + return crypto_image_dispatch->remap_to_logical(image_extents); +} + +template +bool ImageDispatcher::send_dispatch( + ImageDispatchInterface* image_dispatch, + ImageDispatchSpec* image_dispatch_spec) { + if (image_dispatch_spec->tid == 0) { + image_dispatch_spec->tid = ++m_next_tid; + + bool finished = preprocess(image_dispatch_spec); + if (finished) { + return true; + } + } + + return boost::apply_visitor( + SendVisitor{image_dispatch, image_dispatch_spec}, + image_dispatch_spec->request); +} + +template +bool ImageDispatcher::preprocess( + ImageDispatchSpec* image_dispatch_spec) { + return boost::apply_visitor( + PreprocessVisitor{this, image_dispatch_spec}, + image_dispatch_spec->request); +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageDispatcher; diff --git a/src/librbd/io/ImageDispatcher.h b/src/librbd/io/ImageDispatcher.h new file mode 100644 index 000000000..5d5fb0535 --- /dev/null +++ b/src/librbd/io/ImageDispatcher.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H + +#include "include/int_types.h" +#include "common/ceph_mutex.h" +#include "librbd/io/Dispatcher.h" +#include "librbd/io/ImageDispatchInterface.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/Types.h" +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +template struct QosImageDispatch; +template struct WriteBlockImageDispatch; + +template +class ImageDispatcher : public Dispatcher { +public: + ImageDispatcher(ImageCtxT* image_ctx); + + void invalidate_cache(Context* on_finish) override; + + void shut_down(Context* on_finish) override; + + void apply_qos_schedule_tick_min(uint64_t tick) override; + void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst, + uint64_t burst_seconds) override; + void apply_qos_exclude_ops(uint64_t exclude_ops) override; + + bool writes_blocked() const override; + int block_writes() override; + void block_writes(Context *on_blocked) override; + + void unblock_writes() override; + void wait_on_writes_unblocked(Context *on_unblocked) override; + + void remap_to_physical(Extents& image_extents, ImageArea area) override; + ImageArea remap_to_logical(Extents& image_extents) override; + +protected: + bool send_dispatch( + ImageDispatchInterface* image_dispatch, + ImageDispatchSpec* image_dispatch_spec) override; + +private: + struct SendVisitor; + struct PreprocessVisitor; + + using typename Dispatcher::C_InvalidateCache; + + std::atomic m_next_tid{0}; + + 
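+  // layers registered in the constructor that expose extra controls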
QosImageDispatch* m_qos_image_dispatch = nullptr; + WriteBlockImageDispatch* m_write_block_dispatch = nullptr; + + bool preprocess(ImageDispatchSpec* image_dispatch_spec); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageDispatcher; + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_H diff --git a/src/librbd/io/ImageDispatcherInterface.h b/src/librbd/io/ImageDispatcherInterface.h new file mode 100644 index 000000000..dcff3d96a --- /dev/null +++ b/src/librbd/io/ImageDispatcherInterface.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H +#define CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H + +#include "include/int_types.h" +#include "librbd/io/DispatcherInterface.h" +#include "librbd/io/ImageDispatchInterface.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { +namespace io { + +struct ImageDispatcherInterface + : public DispatcherInterface { +public: + virtual void apply_qos_schedule_tick_min(uint64_t tick) = 0; + virtual void apply_qos_limit(uint64_t flag, uint64_t limit, + uint64_t burst, uint64_t burst_seconds) = 0; + virtual void apply_qos_exclude_ops(uint64_t exclude_ops) = 0; + + virtual bool writes_blocked() const = 0; + virtual int block_writes() = 0; + virtual void block_writes(Context *on_blocked) = 0; + + virtual void unblock_writes() = 0; + virtual void wait_on_writes_unblocked(Context *on_unblocked) = 0; + + virtual void invalidate_cache(Context* on_finish) = 0; + + virtual void remap_to_physical(Extents& image_extents, ImageArea area) = 0; + virtual ImageArea remap_to_logical(Extents& image_extents) = 0; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_IMAGE_DISPATCHER_INTERFACE_H diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc new file mode 100644 index 000000000..e4c41c229 --- /dev/null +++ b/src/librbd/io/ImageRequest.cc @@ -0,0 +1,909 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ImageRequest.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Journal.h" +#include "librbd/Types.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ObjectDispatchInterface.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/Utils.h" +#include "librbd/journal/Types.h" +#include "include/rados/librados.hpp" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "osdc/Striper.h" +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageRequest: " << __func__ << ": " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; +using librbd::util::get_image_ctx; + +namespace { + +template +struct C_AssembleSnapshotDeltas : public C_AioRequest { + I* image_ctx; + SnapshotDelta* snapshot_delta; + + ceph::mutex lock = ceph::make_mutex( + "librbd::io::C_AssembleSnapshotDeltas::lock", false); + std::map object_snapshot_delta; + + C_AssembleSnapshotDeltas(I* image_ctx, AioCompletion* aio_comp, + SnapshotDelta* snapshot_delta) + : C_AioRequest(aio_comp), + image_ctx(image_ctx), snapshot_delta(snapshot_delta) { + } + + SnapshotDelta* get_snapshot_delta(uint64_t object_no) { + 
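+    // per-object results arrive on arbitrary librados callback threads,
+    // so the per-object delta map must be guarded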
std::unique_lock locker{lock}; + return &object_snapshot_delta[object_no]; + } + + void finish(int r) override { + auto cct = image_ctx->cct; + + if (r < 0) { + lderr(cct) << "C_AssembleSnapshotDeltas: list snaps failed: " + << cpp_strerror(r) << dendl; + C_AioRequest::finish(r); + return; + } + + std::unique_lock locker{lock}; + *snapshot_delta = {}; + for (auto& [object_no, object_snapshot_delta] : object_snapshot_delta) { + SnapshotDelta image_snapshot_delta; + object_to_image_intervals(object_no, object_snapshot_delta, + &image_snapshot_delta, snapshot_delta); + + ldout(cct, 20) << "object_no=" << object_no << ", " + << "object_snapshot_delta=" + << object_snapshot_delta << ", " + << "image_snapshot_delta=" << image_snapshot_delta + << dendl; + } + + ldout(cct, 20) << "snapshot_delta=" << *snapshot_delta << dendl; + C_AioRequest::finish(0); + } + + void object_to_image_intervals( + uint64_t object_no, const SnapshotDelta& object_snapshot_delta, + SnapshotDelta* image_snapshot_delta, + SnapshotDelta* assembled_image_snapshot_delta) { + for (auto& [key, object_extents] : object_snapshot_delta) { + for (auto& object_extent : object_extents) { + auto [image_extents, _] = io::util::object_to_area_extents( + image_ctx, object_no, + {{object_extent.get_off(), object_extent.get_len()}}); + + auto& intervals = (*image_snapshot_delta)[key]; + auto& assembled_intervals = (*assembled_image_snapshot_delta)[key]; + for (auto [image_offset, image_length] : image_extents) { + SparseExtent sparse_extent{object_extent.get_val().state, + image_length}; + intervals.insert(image_offset, image_length, sparse_extent); + assembled_intervals.insert(image_offset, image_length, + sparse_extent); + } + } + } + } +}; + +template +struct C_RBD_Readahead : public Context { + I *ictx; + uint64_t object_no; + io::ReadExtents extents; + + C_RBD_Readahead(I *ictx, uint64_t object_no, uint64_t offset, uint64_t length) + : ictx(ictx), object_no(object_no), extents({{offset, length}}) { + ictx->readahead.inc_pending(); + } + + void finish(int r) override { + ceph_assert(extents.size() == 1); + auto& extent = extents.front(); + ldout(ictx->cct, 20) << "C_RBD_Readahead on " + << data_object_name(ictx, object_no) << ": " + << extent.offset << "~" << extent.length << dendl; + ictx->readahead.dec_pending(); + } +}; + +template +void readahead(I *ictx, const Extents& image_extents, IOContext io_context) { + uint64_t total_bytes = 0; + for (auto& image_extent : image_extents) { + total_bytes += image_extent.second; + } + + ictx->image_lock.lock_shared(); + auto total_bytes_read = ictx->total_bytes_read.fetch_add(total_bytes); + bool abort = ( + ictx->readahead_disable_after_bytes != 0 && + total_bytes_read > ictx->readahead_disable_after_bytes); + if (abort) { + ictx->image_lock.unlock_shared(); + return; + } + + uint64_t data_size = ictx->get_area_size(ImageArea::DATA); + ictx->image_lock.unlock_shared(); + + auto readahead_extent = ictx->readahead.update(image_extents, data_size); + uint64_t readahead_offset = readahead_extent.first; + uint64_t readahead_length = readahead_extent.second; + + if (readahead_length > 0) { + ldout(ictx->cct, 20) << "(readahead logical) " << readahead_offset << "~" + << readahead_length << dendl; + LightweightObjectExtents readahead_object_extents; + io::util::area_to_object_extents(ictx, readahead_offset, readahead_length, + ImageArea::DATA, 0, + &readahead_object_extents); + for (auto& object_extent : readahead_object_extents) { + ldout(ictx->cct, 20) << "(readahead) " + << data_object_name(ictx, 
+ object_extent.object_no) << " " + << object_extent.offset << "~" + << object_extent.length << dendl; + + auto req_comp = new C_RBD_Readahead(ictx, object_extent.object_no, + object_extent.offset, + object_extent.length); + auto req = io::ObjectDispatchSpec::create_read( + ictx, io::OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + &req_comp->extents, io_context, 0, 0, {}, nullptr, req_comp); + req->send(); + } + + ictx->perfcounter->inc(l_librbd_readahead); + ictx->perfcounter->inc(l_librbd_readahead_bytes, readahead_length); + } +} + +template +struct C_UpdateTimestamp : public Context { +public: + I& m_image_ctx; + bool m_modify; // if modify set to 'true', modify timestamp is updated, + // access timestamp otherwise + AsyncOperation m_async_op; + + C_UpdateTimestamp(I& ictx, bool m) : m_image_ctx(ictx), m_modify(m) { + m_async_op.start_op(*get_image_ctx(&m_image_ctx)); + } + ~C_UpdateTimestamp() override { + m_async_op.finish_op(); + } + + void send() { + librados::ObjectWriteOperation op; + if (m_modify) { + cls_client::set_modify_timestamp(&op); + } else { + cls_client::set_access_timestamp(&op); + } + + auto comp = librbd::util::create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void finish(int r) override { + // ignore errors updating timestamp + } +}; + +bool should_update_timestamp(const utime_t& now, const utime_t& timestamp, + uint64_t interval) { + return (interval && + (static_cast(now.sec()) >= interval + timestamp)); +} + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \ + << " " << __func__ << ": " + +template +void ImageRequest::aio_read(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) { + ImageReadRequest req(*ictx, c, std::move(image_extents), area, + std::move(read_result), io_context, op_flags, + read_flags, parent_trace); + req.send(); +} + +template +void ImageRequest::aio_write(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) { + ImageWriteRequest req(*ictx, c, std::move(image_extents), area, + std::move(bl), op_flags, parent_trace); + req.send(); +} + +template +void ImageRequest::aio_discard(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace) { + ImageDiscardRequest req(*ictx, c, std::move(image_extents), area, + discard_granularity_bytes, parent_trace); + req.send(); +} + +template +void ImageRequest::aio_flush(I *ictx, AioCompletion *c, + FlushSource flush_source, + const ZTracer::Trace &parent_trace) { + ImageFlushRequest req(*ictx, c, flush_source, parent_trace); + req.send(); +} + +template +void ImageRequest::aio_writesame(I *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) { + ImageWriteSameRequest req(*ictx, c, std::move(image_extents), area, + std::move(bl), op_flags, parent_trace); + req.send(); +} + +template +void ImageRequest::aio_compare_and_write(I *ictx, AioCompletion *c, + Extents &&image_extents, + ImageArea area, + bufferlist &&cmp_bl, + bufferlist &&bl, + uint64_t *mismatch_offset, + int op_flags, + const ZTracer::Trace &parent_trace) { + 
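+  // the request object lives on the stack; completion is reported through c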
+  ImageCompareAndWriteRequest<I> req(*ictx, c, std::move(image_extents), area,
+                                     std::move(cmp_bl), std::move(bl),
+                                     mismatch_offset, op_flags, parent_trace);
+  req.send();
+}
+
+template <typename I>
+void ImageRequest<I>::send() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(m_aio_comp->is_initialized(get_aio_type()));
+  ceph_assert(m_aio_comp->is_started());
+
+  CephContext *cct = image_ctx.cct;
+  AioCompletion *aio_comp = this->m_aio_comp;
+  ldout(cct, 20) << get_request_type() << ": ictx=" << &image_ctx << ", "
+                 << "completion=" << aio_comp << dendl;
+
+  update_timestamp();
+  send_request();
+}
+
+template <typename I>
+void ImageRequest<I>::update_timestamp() {
+  bool modify = (get_aio_type() != AIO_TYPE_READ);
+  uint64_t update_interval;
+  if (modify) {
+    update_interval = m_image_ctx.mtime_update_interval;
+  } else {
+    update_interval = m_image_ctx.atime_update_interval;
+  }
+
+  if (update_interval == 0) {
+    return;
+  }
+
+  utime_t (I::*get_timestamp_fn)() const;
+  void (I::*set_timestamp_fn)(utime_t);
+  if (modify) {
+    get_timestamp_fn = &I::get_modify_timestamp;
+    set_timestamp_fn = &I::set_modify_timestamp;
+  } else {
+    get_timestamp_fn = &I::get_access_timestamp;
+    set_timestamp_fn = &I::set_access_timestamp;
+  }
+
+  utime_t ts = ceph_clock_now();
+  {
+    std::shared_lock timestamp_locker{m_image_ctx.timestamp_lock};
+    if (!should_update_timestamp(ts, std::invoke(get_timestamp_fn, m_image_ctx),
+                                 update_interval)) {
+      return;
+    }
+  }
+
+  {
+    std::unique_lock timestamp_locker{m_image_ctx.timestamp_lock};
+    bool update = should_update_timestamp(
+      ts, std::invoke(get_timestamp_fn, m_image_ctx), update_interval);
+    if (!update) {
+      return;
+    }
+
+    std::invoke(set_timestamp_fn, m_image_ctx, ts);
+  }
+
+  // TODO: we fire and forget this outside the IO path to prevent
+  // potential race conditions with librbd client IO callbacks
+  // between different threads (e.g.
librados and object cacher) + ldout(m_image_ctx.cct, 10) << get_request_type() << dendl; + auto req = new C_UpdateTimestamp(m_image_ctx, modify); + req->send(); +} + +template +ImageReadRequest::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, + IOContext io_context, int op_flags, + int read_flags, + const ZTracer::Trace &parent_trace) + : ImageRequest(image_ctx, aio_comp, std::move(image_extents), area, + "read", parent_trace), + m_io_context(io_context), m_op_flags(op_flags), m_read_flags(read_flags) { + aio_comp->read_result = std::move(read_result); +} + +template +void ImageReadRequest::send_request() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + auto &image_extents = this->m_image_extents; + if (this->m_image_area == ImageArea::DATA && + image_ctx.cache && image_ctx.readahead_max_bytes > 0 && + !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) { + readahead(get_image_ctx(&image_ctx), image_extents, m_io_context); + } + + // map image extents to object extents + LightweightObjectExtents object_extents; + uint64_t buffer_ofs = 0; + for (auto &extent : image_extents) { + if (extent.second == 0) { + continue; + } + + util::area_to_object_extents(&image_ctx, extent.first, extent.second, + this->m_image_area, buffer_ofs, + &object_extents); + buffer_ofs += extent.second; + } + + AioCompletion *aio_comp = this->m_aio_comp; + aio_comp->read_result.set_image_extents(image_extents); + + // issue the requests + aio_comp->set_request_count(object_extents.size()); + for (auto &oe : object_extents) { + ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " " + << oe.offset << "~" << oe.length << " from " + << oe.buffer_extents << dendl; + + auto req_comp = new io::ReadResult::C_ObjectReadRequest( + aio_comp, {{oe.offset, oe.length, std::move(oe.buffer_extents)}}); + auto req = ObjectDispatchSpec::create_read( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.object_no, + &req_comp->extents, m_io_context, m_op_flags, m_read_flags, + this->m_trace, nullptr, req_comp); + req->send(); + } + + image_ctx.perfcounter->inc(l_librbd_rd); + image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs); +} + +template +void AbstractImageWriteRequest::send_request() { + I &image_ctx = this->m_image_ctx; + + bool journaling = false; + + AioCompletion *aio_comp = this->m_aio_comp; + { + // prevent image size from changing between computing clip and recording + // pending async operation + std::shared_lock image_locker{image_ctx.image_lock}; + journaling = (image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()); + } + + uint64_t clip_len = 0; + LightweightObjectExtents object_extents; + for (auto &extent : this->m_image_extents) { + if (extent.second == 0) { + continue; + } + + // map to object extents + io::util::area_to_object_extents(&image_ctx, extent.first, extent.second, + this->m_image_area, clip_len, + &object_extents); + clip_len += extent.second; + } + + int ret = prune_object_extents(&object_extents); + if (ret < 0) { + aio_comp->fail(ret); + return; + } + + // reflect changes in object_extents back to m_image_extents + if (ret == 1) { + this->m_image_extents.clear(); + for (auto& object_extent : object_extents) { + auto [image_extents, _] = io::util::object_to_area_extents( + &image_ctx, object_extent.object_no, + {{object_extent.offset, object_extent.length}}); + this->m_image_extents.insert(this->m_image_extents.end(), + image_extents.begin(), image_extents.end()); + 
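+      // [editor's note -- not part of the upstream diff] this round-trip
+      // through object_to_area_extents() matters because the journal events
+      // appended below must describe what will actually be written.  A
+      // worked example, assuming 4 MiB objects and a discard granularity of
+      // 64 KiB: an image discard of [100 KiB, 200 KiB) maps to one object
+      // extent that prune_object_extents() shrinks to the aligned range
+      // [128 KiB, 192 KiB), since p2roundup(100K, 64K) = 128K and
+      // p2align(200K, 64K) = 192K; the recomputed image extent is then the
+      // 64 KiB that will really be discarded.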
} + } + + aio_comp->set_request_count(object_extents.size()); + if (!object_extents.empty()) { + uint64_t journal_tid = 0; + if (journaling) { + // in-flight ops are flushed prior to closing the journal + ceph_assert(image_ctx.journal != NULL); + journal_tid = append_journal_event(m_synchronous); + } + + // it's very important that IOContext is captured here instead of + // e.g. at the API layer so that an up-to-date snap context is used + // when owning the exclusive lock + send_object_requests(object_extents, image_ctx.get_data_io_context(), + journal_tid); + } + + update_stats(clip_len); +} + +template +void AbstractImageWriteRequest::send_object_requests( + const LightweightObjectExtents &object_extents, IOContext io_context, + uint64_t journal_tid) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + AioCompletion *aio_comp = this->m_aio_comp; + bool single_extent = (object_extents.size() == 1); + for (auto& oe : object_extents) { + ldout(cct, 20) << data_object_name(&image_ctx, oe.object_no) << " " + << oe.offset << "~" << oe.length << " from " + << oe.buffer_extents << dendl; + C_AioRequest *req_comp = new C_AioRequest(aio_comp); + auto request = create_object_request(oe, io_context, journal_tid, + single_extent, req_comp); + request->send(); + } +} + +template +void ImageWriteRequest::assemble_extent( + const LightweightObjectExtent &object_extent, bufferlist *bl) { + for (auto q = object_extent.buffer_extents.begin(); + q != object_extent.buffer_extents.end(); ++q) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, q->first, q->second); + bl->claim_append(sub_bl); + } +} + +template +uint64_t ImageWriteRequest::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + uint64_t buffer_offset = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, buffer_offset, extent.second); + buffer_offset += extent.second; + + tid = image_ctx.journal->append_write_event(extent.first, extent.second, + sub_bl, synchronous); + } + + return tid; +} + +template +ObjectDispatchSpec *ImageWriteRequest::create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + bufferlist bl; + if (single_extent && object_extent.buffer_extents.size() == 1 && + m_bl.length() == object_extent.length) { + // optimization for single object/buffer extent writes + bl = std::move(m_bl); + } else { + assemble_extent(object_extent, &bl); + } + + auto req = ObjectDispatchSpec::create_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, std::move(bl), io_context, m_op_flags, 0, + std::nullopt, journal_tid, this->m_trace, on_finish); + return req; +} + +template +void ImageWriteRequest::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_wr); + image_ctx.perfcounter->inc(l_librbd_wr_bytes, length); +} + +template +uint64_t ImageDiscardRequest::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + journal::EventEntry event_entry( + journal::AioDiscardEvent(extent.first, + extent.second, + this->m_discard_granularity_bytes)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, 
extent.second,
+                                             synchronous, 0);
+  }
+
+  return tid;
+}
+
+template <typename I>
+ObjectDispatchSpec *ImageDiscardRequest<I>::create_object_request(
+    const LightweightObjectExtent &object_extent, IOContext io_context,
+    uint64_t journal_tid, bool single_extent, Context *on_finish) {
+  I &image_ctx = this->m_image_ctx;
+  auto req = ObjectDispatchSpec::create_discard(
+    &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no,
+    object_extent.offset, object_extent.length, io_context,
+    OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE, journal_tid, this->m_trace,
+    on_finish);
+  return req;
+}
+
+template <typename I>
+void ImageDiscardRequest<I>::update_stats(size_t length) {
+  I &image_ctx = this->m_image_ctx;
+  image_ctx.perfcounter->inc(l_librbd_discard);
+  image_ctx.perfcounter->inc(l_librbd_discard_bytes, length);
+}
+
+template <typename I>
+int ImageDiscardRequest<I>::prune_object_extents(
+    LightweightObjectExtents* object_extents) const {
+  if (m_discard_granularity_bytes == 0) {
+    return 0;
+  }
+
+  // Align the range to the discard_granularity_bytes boundary and skip
+  // any discards that are too small to free up any space.
+  //
+  // discard_granularity_bytes >= object_size && tail truncation
+  // is a special case for filestore
+  bool prune_required = false;
+  bool length_modified = false;
+  auto object_size = this->m_image_ctx.layout.object_size;
+  auto discard_granularity_bytes = std::min(m_discard_granularity_bytes,
+                                            object_size);
+  auto xform_lambda =
+    [discard_granularity_bytes, object_size, &prune_required, &length_modified]
+    (LightweightObjectExtent& object_extent) {
+      auto& offset = object_extent.offset;
+      auto& length = object_extent.length;
+      auto next_offset = offset + length;
+
+      if ((discard_granularity_bytes < object_size) ||
+          (next_offset < object_size)) {
+        offset = p2roundup(offset, discard_granularity_bytes);
+        next_offset = p2align(next_offset, discard_granularity_bytes);
+        if (offset >= next_offset) {
+          prune_required = true;
+          length = 0;
+        } else {
+          auto new_length = next_offset - offset;
+          if (length != new_length) {
+            length_modified = true;
+            length = new_length;
+          }
+        }
+      }
+    };
+  std::for_each(object_extents->begin(), object_extents->end(),
+                xform_lambda);
+
+  if (prune_required) {
+    // one or more object extents were skipped
+    auto remove_lambda =
+      [](const LightweightObjectExtent& object_extent) {
+        return (object_extent.length == 0);
+      };
+    object_extents->erase(
+      std::remove_if(object_extents->begin(), object_extents->end(),
+                     remove_lambda),
+      object_extents->end());
+  }
+
+  // object extents were modified, so the image extents need updating
+  if (length_modified || prune_required) {
+    return 1;
+  }
+
+  return 0;
+}
+
+template <typename I>
+void ImageFlushRequest<I>::send_request() {
+  I &image_ctx = this->m_image_ctx;
+
+  bool journaling = false;
+  {
+    std::shared_lock image_locker{image_ctx.image_lock};
+    journaling = (m_flush_source == FLUSH_SOURCE_USER &&
+                  image_ctx.journal != nullptr &&
+                  image_ctx.journal->is_journal_appending());
+  }
+
+  AioCompletion *aio_comp = this->m_aio_comp;
+  aio_comp->set_request_count(1);
+
+  Context *ctx = new C_AioRequest(aio_comp);
+
+  // ensure no locks are held when flush is complete
+  ctx = librbd::util::create_async_context_callback(image_ctx, ctx);
+
+  uint64_t journal_tid = 0;
+  if (journaling) {
+    // in-flight ops are flushed prior to closing the journal
+    ceph_assert(image_ctx.journal != NULL);
+    journal_tid = image_ctx.journal->append_io_event(
+      journal::EventEntry(journal::AioFlushEvent()), 0, 0, false, 0);
+    image_ctx.journal->user_flushed();
+  }
+
+  auto
object_dispatch_spec = ObjectDispatchSpec::create_flush( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, m_flush_source, journal_tid, + this->m_trace, ctx); + ctx = new LambdaContext([object_dispatch_spec](int r) { + object_dispatch_spec->send(); + }); + + // ensure all in-flight IOs are settled if non-user flush request + if (m_flush_source == FLUSH_SOURCE_WRITEBACK) { + ctx->complete(0); + } else { + aio_comp->async_op.flush(ctx); + } + + // might be flushing during image shutdown + if (image_ctx.perfcounter != nullptr) { + image_ctx.perfcounter->inc(l_librbd_flush); + } +} + +template +uint64_t ImageWriteSameRequest::append_journal_event(bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(!this->m_image_extents.empty()); + for (auto &extent : this->m_image_extents) { + journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first, + extent.second, + m_data_bl)); + tid = image_ctx.journal->append_io_event(std::move(event_entry), + extent.first, extent.second, + synchronous, 0); + } + + return tid; +} + +template +ObjectDispatchSpec *ImageWriteSameRequest::create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + bufferlist bl; + ObjectDispatchSpec *req; + + if (util::assemble_write_same_extent(object_extent, m_data_bl, &bl, false)) { + auto buffer_extents{object_extent.buffer_extents}; + + req = ObjectDispatchSpec::create_write_same( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, object_extent.length, std::move(buffer_extents), + std::move(bl), io_context, m_op_flags, journal_tid, + this->m_trace, on_finish); + return req; + } + req = ObjectDispatchSpec::create_write( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, std::move(bl), io_context, m_op_flags, 0, + std::nullopt, journal_tid, this->m_trace, on_finish); + return req; +} + +template +void ImageWriteSameRequest::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_ws); + image_ctx.perfcounter->inc(l_librbd_ws_bytes, length); +} + +template +uint64_t ImageCompareAndWriteRequest::append_journal_event( + bool synchronous) { + I &image_ctx = this->m_image_ctx; + + uint64_t tid = 0; + ceph_assert(this->m_image_extents.size() == 1); + auto &extent = this->m_image_extents.front(); + tid = image_ctx.journal->append_compare_and_write_event(extent.first, + extent.second, + m_cmp_bl, + m_bl, + synchronous); + + return tid; +} + +template +void ImageCompareAndWriteRequest::assemble_extent( + const LightweightObjectExtent &object_extent, bufferlist *bl, + bufferlist *cmp_bl) { + for (auto q = object_extent.buffer_extents.begin(); + q != object_extent.buffer_extents.end(); ++q) { + bufferlist sub_bl; + sub_bl.substr_of(m_bl, q->first, q->second); + bl->claim_append(sub_bl); + + bufferlist sub_cmp_bl; + sub_cmp_bl.substr_of(m_cmp_bl, q->first, q->second); + cmp_bl->claim_append(sub_cmp_bl); + } +} + +template +ObjectDispatchSpec *ImageCompareAndWriteRequest::create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) { + I &image_ctx = this->m_image_ctx; + + bufferlist bl; + bufferlist cmp_bl; + assemble_extent(object_extent, &bl, &cmp_bl); + auto req = ObjectDispatchSpec::create_compare_and_write( + &image_ctx, 
OBJECT_DISPATCH_LAYER_NONE, object_extent.object_no, + object_extent.offset, std::move(cmp_bl), std::move(bl), io_context, + m_mismatch_offset, m_op_flags, journal_tid, this->m_trace, on_finish); + return req; +} + +template +void ImageCompareAndWriteRequest::update_stats(size_t length) { + I &image_ctx = this->m_image_ctx; + image_ctx.perfcounter->inc(l_librbd_cmp); + image_ctx.perfcounter->inc(l_librbd_cmp_bytes, length); +} + +template +int ImageCompareAndWriteRequest::prune_object_extents( + LightweightObjectExtents* object_extents) const { + if (object_extents->size() > 1) + return -EINVAL; + + I &image_ctx = this->m_image_ctx; + uint64_t su = image_ctx.layout.stripe_unit; + auto& object_extent = object_extents->front(); + if (su == 0 || (object_extent.offset % su + object_extent.length > su)) + return -EINVAL; + + return 0; +} + +template +ImageListSnapsRequest::ImageListSnapsRequest( + I& image_ctx, AioCompletion* aio_comp, Extents&& image_extents, + ImageArea area, SnapIds&& snap_ids, int list_snaps_flags, + SnapshotDelta* snapshot_delta, const ZTracer::Trace& parent_trace) + : ImageRequest(image_ctx, aio_comp, std::move(image_extents), area, + "list-snaps", parent_trace), + m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags), + m_snapshot_delta(snapshot_delta) { +} + +template +void ImageListSnapsRequest::send_request() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + // map image extents to object extents + auto &image_extents = this->m_image_extents; + std::map object_number_extents; + for (auto& image_extent : image_extents) { + if (image_extent.second == 0) { + continue; + } + + striper::LightweightObjectExtents object_extents; + io::util::area_to_object_extents(&image_ctx, image_extent.first, + image_extent.second, this->m_image_area, 0, + &object_extents); + for (auto& object_extent : object_extents) { + object_number_extents[object_extent.object_no].emplace_back( + object_extent.offset, object_extent.length); + } + } + + // reassemble the deltas back into image-extents when complete + auto aio_comp = this->m_aio_comp; + aio_comp->set_request_count(1); + auto assemble_ctx = new C_AssembleSnapshotDeltas( + &image_ctx, aio_comp, m_snapshot_delta); + auto sub_aio_comp = AioCompletion::create_and_start< + Context, &Context::complete>(assemble_ctx, get_image_ctx(&image_ctx), + AIO_TYPE_GENERIC); + + // issue the requests + sub_aio_comp->set_request_count(object_number_extents.size()); + for (auto& oe : object_number_extents) { + ldout(cct, 20) << data_object_name(&image_ctx, oe.first) << " " + << oe.second << dendl; + auto ctx = new C_AioRequest(sub_aio_comp); + auto req = ObjectDispatchSpec::create_list_snaps( + &image_ctx, OBJECT_DISPATCH_LAYER_NONE, oe.first, std::move(oe.second), + SnapIds{m_snap_ids}, m_list_snaps_flags, this->m_trace, + assemble_ctx->get_snapshot_delta(oe.first), ctx); + req->send(); + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ImageRequest; +template class librbd::io::ImageReadRequest; +template class librbd::io::AbstractImageWriteRequest; +template class librbd::io::ImageWriteRequest; +template class librbd::io::ImageDiscardRequest; +template class librbd::io::ImageFlushRequest; +template class librbd::io::ImageWriteSameRequest; +template class librbd::io::ImageCompareAndWriteRequest; +template class librbd::io::ImageListSnapsRequest; diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h new file mode 100644 index 000000000..2668c1acb --- /dev/null +++ 
b/src/librbd/io/ImageRequest.h @@ -0,0 +1,377 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_IMAGE_REQUEST_H +#define CEPH_LIBRBD_IO_IMAGE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "common/zipkin_trace.h" +#include "osd/osd_types.h" +#include "librbd/Utils.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include +#include +#include + +namespace librbd { +class ImageCtx; + +namespace io { + +class AioCompletion; +class ObjectDispatchSpec; +class ReadResult; + +template +class ImageRequest { +public: + virtual ~ImageRequest() { + m_trace.event("finish"); + } + + static void aio_read(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace); + static void aio_write(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace); + static void aio_discard(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace); + static void aio_flush(ImageCtxT *ictx, AioCompletion *c, + FlushSource flush_source, + const ZTracer::Trace &parent_trace); + static void aio_writesame(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace); + static void aio_compare_and_write(ImageCtxT *ictx, AioCompletion *c, + Extents &&image_extents, ImageArea area, + bufferlist &&cmp_bl, bufferlist &&bl, + uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace); + + void send(); + + inline const ZTracer::Trace &get_trace() const { + return m_trace; + } + +protected: + typedef std::list ObjectRequests; + + ImageCtxT &m_image_ctx; + AioCompletion *m_aio_comp; + Extents m_image_extents; + ImageArea m_image_area; + ZTracer::Trace m_trace; + + ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, const char *trace_name, + const ZTracer::Trace &parent_trace) + : m_image_ctx(image_ctx), m_aio_comp(aio_comp), + m_image_extents(std::move(image_extents)), m_image_area(area), + m_trace(librbd::util::create_trace(image_ctx, trace_name, parent_trace)) { + m_trace.event("start"); + } + + virtual void update_timestamp(); + virtual void send_request() = 0; + + virtual aio_type_t get_aio_type() const = 0; + virtual const char *get_request_type() const = 0; +}; + +template +class ImageReadRequest : public ImageRequest { +public: + ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace); + +protected: + void send_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_READ; + } + const char *get_request_type() const override { + return "aio_read"; + } + +private: + IOContext m_io_context; + int m_op_flags; + int m_read_flags; +}; + +template +class AbstractImageWriteRequest : public ImageRequest { +public: + inline void flag_synchronous() { + m_synchronous = true; + } + +protected: + using typename ImageRequest::ObjectRequests; + + AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, 
ImageArea area, + const char *trace_name, + const ZTracer::Trace &parent_trace) + : ImageRequest(image_ctx, aio_comp, std::move(image_extents), + area, trace_name, parent_trace), + m_synchronous(false) { + } + + void send_request() override; + + virtual int prune_object_extents( + LightweightObjectExtents* object_extents) const { + return 0; + } + + void send_object_requests(const LightweightObjectExtents &object_extents, + IOContext io_context, uint64_t journal_tid); + virtual ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) = 0; + + virtual uint64_t append_journal_event(bool synchronous) = 0; + virtual void update_stats(size_t length) = 0; + +private: + bool m_synchronous; +}; + +template +class ImageWriteRequest : public AbstractImageWriteRequest { +public: + ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest( + image_ctx, aio_comp, std::move(image_extents), area, + "write", parent_trace), + m_bl(std::move(bl)), m_op_flags(op_flags) { + } + +protected: + using typename ImageRequest::ObjectRequests; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_WRITE; + } + const char *get_request_type() const override { + return "aio_write"; + } + + void assemble_extent(const LightweightObjectExtent &object_extent, + bufferlist *bl); + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + +private: + bufferlist m_bl; + int m_op_flags; +}; + +template +class ImageDiscardRequest : public AbstractImageWriteRequest { +public: + ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents&& image_extents, ImageArea area, + uint32_t discard_granularity_bytes, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest( + image_ctx, aio_comp, std::move(image_extents), area, + "discard", parent_trace), + m_discard_granularity_bytes(discard_granularity_bytes) { + } + +protected: + using typename ImageRequest::ObjectRequests; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_DISCARD; + } + const char *get_request_type() const override { + return "aio_discard"; + } + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + + int prune_object_extents( + LightweightObjectExtents* object_extents) const override; + +private: + uint32_t m_discard_granularity_bytes; +}; + +template +class ImageFlushRequest : public ImageRequest { +public: + ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + FlushSource flush_source, + const ZTracer::Trace &parent_trace) + : ImageRequest(image_ctx, aio_comp, {}, + ImageArea::DATA /* dummy for {} */, + "flush", parent_trace), + m_flush_source(flush_source) { + } + +protected: + using typename ImageRequest::ObjectRequests; + + void update_timestamp() override { + } + void send_request() override; + + aio_type_t get_aio_type() const override { + return 
AIO_TYPE_FLUSH; + } + const char *get_request_type() const override { + return "aio_flush"; + } + +private: + FlushSource m_flush_source; + +}; + +template +class ImageWriteSameRequest : public AbstractImageWriteRequest { +public: + ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents&& image_extents, ImageArea area, + bufferlist &&bl, int op_flags, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest( + image_ctx, aio_comp, std::move(image_extents), area, + "writesame", parent_trace), + m_data_bl(std::move(bl)), m_op_flags(op_flags) { + } + +protected: + using typename ImageRequest::ObjectRequests; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_WRITESAME; + } + const char *get_request_type() const override { + return "aio_writesame"; + } + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; +private: + bufferlist m_data_bl; + int m_op_flags; +}; + +template +class ImageCompareAndWriteRequest : public AbstractImageWriteRequest { +public: + using typename ImageRequest::ObjectRequests; + + ImageCompareAndWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp, + Extents &&image_extents, ImageArea area, + bufferlist &&cmp_bl, bufferlist &&bl, + uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace) + : AbstractImageWriteRequest( + image_ctx, aio_comp, std::move(image_extents), area, + "compare_and_write", parent_trace), + m_cmp_bl(std::move(cmp_bl)), m_bl(std::move(bl)), + m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) { + } + +protected: + void assemble_extent(const LightweightObjectExtent &object_extent, + bufferlist *bl, bufferlist *cmp_bl); + + ObjectDispatchSpec *create_object_request( + const LightweightObjectExtent &object_extent, IOContext io_context, + uint64_t journal_tid, bool single_extent, Context *on_finish) override; + + uint64_t append_journal_event(bool synchronous) override; + void update_stats(size_t length) override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_COMPARE_AND_WRITE; + } + const char *get_request_type() const override { + return "aio_compare_and_write"; + } + + int prune_object_extents( + LightweightObjectExtents* object_extents) const override; + +private: + bufferlist m_cmp_bl; + bufferlist m_bl; + uint64_t *m_mismatch_offset; + int m_op_flags; +}; + +template +class ImageListSnapsRequest : public ImageRequest { +public: + ImageListSnapsRequest( + ImageCtxT& image_ctx, AioCompletion* aio_comp, + Extents&& image_extents, ImageArea area, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace& parent_trace); + +protected: + void update_timestamp() override {} + void send_request() override; + + aio_type_t get_aio_type() const override { + return AIO_TYPE_GENERIC; + } + const char *get_request_type() const override { + return "list-snaps"; + } + +private: + SnapIds m_snap_ids; + int m_list_snaps_flags; + SnapshotDelta* m_snapshot_delta; +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ImageRequest; +extern template class librbd::io::ImageReadRequest; +extern template class librbd::io::AbstractImageWriteRequest; +extern template class librbd::io::ImageWriteRequest; +extern template class librbd::io::ImageDiscardRequest; +extern 
template class librbd::io::ImageFlushRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageCompareAndWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ImageListSnapsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_IMAGE_REQUEST_H
diff --git a/src/librbd/io/IoOperations.cc b/src/librbd/io/IoOperations.cc
new file mode 100644
index 000000000..7db7e7a80
--- /dev/null
+++ b/src/librbd/io/IoOperations.cc
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "librbd/io/Types.h"
+#include "librbd/io/IoOperations.h"
+
+#include <map>
+#include <vector>
+
+namespace librbd {
+namespace io {
+
+#define RBD_IO_OPERATION_NAME_READ              "read"
+#define RBD_IO_OPERATION_NAME_WRITE             "write"
+#define RBD_IO_OPERATION_NAME_DISCARD           "discard"
+#define RBD_IO_OPERATION_NAME_WRITE_SAME        "write_same"
+#define RBD_IO_OPERATION_NAME_COMPARE_AND_WRITE "compare_and_write"
+
+static const std::map<std::string, uint64_t> RBD_IO_OPERATION_MAP = {
+  {RBD_IO_OPERATION_NAME_READ, RBD_IO_OPERATION_READ},
+  {RBD_IO_OPERATION_NAME_WRITE, RBD_IO_OPERATION_WRITE},
+  {RBD_IO_OPERATION_NAME_DISCARD, RBD_IO_OPERATION_DISCARD},
+  {RBD_IO_OPERATION_NAME_WRITE_SAME, RBD_IO_OPERATION_WRITE_SAME},
+  {RBD_IO_OPERATION_NAME_COMPARE_AND_WRITE, RBD_IO_OPERATION_COMPARE_AND_WRITE},
+};
+static_assert((RBD_IO_OPERATION_COMPARE_AND_WRITE << 1) > RBD_IO_OPERATIONS_ALL,
+              "new RBD io operation added");
+
+std::string rbd_io_operations_to_string(uint64_t operations,
+                                        std::ostream *err)
+{
+  std::string r;
+  for (auto& i : RBD_IO_OPERATION_MAP) {
+    if (operations & i.second) {
+      if (!r.empty()) {
+        r += ",";
+      }
+      r += i.first;
+      operations &= ~i.second;
+    }
+  }
+  if (err && operations) {
+    *err << "ignoring unknown io operation mask 0x"
+         << std::hex << operations << std::dec;
+  }
+  return r;
+}
+
+uint64_t rbd_io_operations_from_string(const std::string& orig_value,
+                                       std::ostream *err)
+{
+  uint64_t operations = 0;
+  std::string value = orig_value;
+  boost::trim(value);
+
+  // empty string means default operations
+  if (!value.size()) {
+    return RBD_IO_OPERATIONS_DEFAULT;
+  }
+
+  try {
+    // numeric?
+    operations = boost::lexical_cast<uint64_t>(value);
+
+    // drop unrecognized bits
+    uint64_t unsupported_operations = (operations & ~RBD_IO_OPERATIONS_ALL);
+    if (unsupported_operations != 0ull) {
+      operations &= RBD_IO_OPERATIONS_ALL;
+      if (err) {
+        *err << "ignoring unknown operation mask 0x"
+             << std::hex << unsupported_operations << std::dec;
+      }
+    }
+  } catch (boost::bad_lexical_cast&) {
+    // operation name list?
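+    // [editor's note -- not part of the upstream diff] with the mapping
+    // above, "read,write" yields RBD_IO_OPERATION_READ |
+    // RBD_IO_OPERATION_WRITE, while "read,bogus" yields only the read bit
+    // and reports `ignoring unknown operation bogus' via *err.  Since the
+    // loop below accumulates with `operations += ...' rather than `|=', a
+    // name repeated in the list (e.g. "read,read") would add its bit twice
+    // and corrupt the mask, so callers should pass each name at most once.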
+    bool errors = false;
+    std::vector<std::string> operation_names;
+    boost::split(operation_names, value, boost::is_any_of(","));
+    for (auto operation_name: operation_names) {
+      boost::trim(operation_name);
+      auto operation_it = RBD_IO_OPERATION_MAP.find(operation_name);
+      if (operation_it != RBD_IO_OPERATION_MAP.end()) {
+        operations += operation_it->second;
+      } else if (err) {
+        if (errors) {
+          *err << ", ";
+        } else {
+          errors = true;
+        }
+        *err << "ignoring unknown operation " << operation_name;
+      }
+    }
+  }
+  return operations;
+}
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/IoOperations.h b/src/librbd/io/IoOperations.h
new file mode 100644
index 000000000..93d3ef4fe
--- /dev/null
+++ b/src/librbd/io/IoOperations.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iosfwd>
+#include <string>
+
+namespace librbd {
+namespace io {
+
+  std::string rbd_io_operations_to_string(uint64_t ops,
+                                          std::ostream *err);
+  uint64_t rbd_io_operations_from_string(const std::string& value,
+                                         std::ostream *err);
+
+} // namespace io
+} // namespace librbd
diff --git a/src/librbd/io/ObjectDispatch.cc b/src/librbd/io/ObjectDispatch.cc
new file mode 100644
index 000000000..a31cc74ea
--- /dev/null
+++ b/src/librbd/io/ObjectDispatch.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatch.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/io/ObjectRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatch: " << this \
+                           << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+using librbd::util::data_object_name;
+
+template <typename I>
+ObjectDispatch<I>::ObjectDispatch(I* image_ctx)
+  : m_image_ctx(image_ctx) {
+}
+
+template <typename I>
+void ObjectDispatch<I>::shut_down(Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  m_image_ctx->asio_engine->post(on_finish, 0);
+}
+
+template <typename I>
+bool ObjectDispatch<I>::read(
+    uint64_t object_no, ReadExtents* extents, IOContext io_context,
+    int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+    uint64_t* version, int* object_dispatch_flags,
+    DispatchResult* dispatch_result, Context** on_finish,
+    Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << " " << *extents << dendl;
+
+  *dispatch_result = DISPATCH_RESULT_COMPLETE;
+  auto req = new ObjectReadRequest<I>(m_image_ctx, object_no, extents,
+                                      io_context, op_flags, read_flags,
+                                      parent_trace, version, on_dispatched);
+  req->send();
+  return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::discard(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    IOContext io_context, int discard_flags,
+    const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+    uint64_t* journal_tid, DispatchResult* dispatch_result,
+    Context** on_finish, Context* on_dispatched) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " "
+                 << object_off << "~" << object_len << dendl;
+
+  *dispatch_result = DISPATCH_RESULT_COMPLETE;
+  auto req = new ObjectDiscardRequest<I>(m_image_ctx, object_no, object_off,
+                                         object_len, io_context, discard_flags,
+                                         parent_trace, on_dispatched);
+  req->send();
+  return true;
+}
+
+template <typename I>
+bool ObjectDispatch<I>::write(
+    uint64_t
object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << data.length() << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectWriteRequest(m_image_ctx, object_no, object_off, + std::move(data), io_context, op_flags, + write_flags, assert_version, + parent_trace, on_dispatched); + req->send(); + return true; +} + +template +bool ObjectDispatch::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectWriteSameRequest(m_image_ctx, object_no, + object_off, object_len, + std::move(data), io_context, + op_flags, parent_trace, + on_dispatched); + req->send(); + return true; +} + +template +bool ObjectDispatch::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << write_data.length() << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = new ObjectCompareAndWriteRequest(m_image_ctx, object_no, + object_off, + std::move(cmp_data), + std::move(write_data), + io_context, mismatch_offset, + op_flags, parent_trace, + on_dispatched); + req->send(); + return true; +} + +template +bool ObjectDispatch::list_snaps( + uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << "extents=" << extents << ", " + << "snap_ids=" << snap_ids << dendl; + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + auto req = ObjectListSnapsRequest::create( + m_image_ctx, object_no, std::move(extents), std::move(snap_ids), + list_snap_flags, parent_trace, snapshot_delta, on_dispatched); + req->send(); + return true; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::ObjectDispatch; diff --git a/src/librbd/io/ObjectDispatch.h b/src/librbd/io/ObjectDispatch.h new file mode 100644 index 000000000..dd1f7261d --- /dev/null +++ b/src/librbd/io/ObjectDispatch.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_H + 
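+// [editor's note -- not part of the upstream diff] ObjectDispatch is the
+// core (bottom) layer of librbd's object dispatch stack; higher layers
+// (e.g. object caches or crypto) see each request first.  A layer handles
+// a request by returning true after arranging for *dispatch_result to
+// steer the dispatcher (DISPATCH_RESULT_COMPLETE vs
+// DISPATCH_RESULT_CONTINUE), or returns false to pass the request down
+// unmodified.  A hypothetical pass-through layer illustrating the contract
+// (layer slot and names are illustrative only):
+//
+//   template <typename I>
+//   struct PassThrough : public librbd::io::ObjectDispatchInterface {
+//     ObjectDispatchLayer get_dispatch_layer() const override {
+//       return OBJECT_DISPATCH_LAYER_CACHE;   // assumed free slot
+//     }
+//     void shut_down(Context* on_finish) override {
+//       on_finish->complete(0);               // nothing to tear down
+//     }
+//     bool read(uint64_t, ReadExtents*, IOContext, int, int,
+//               const ZTracer::Trace&, uint64_t*, int*, DispatchResult*,
+//               Context**, Context*) override {
+//       return false;                         // defer to lower layers
+//     }
+//     // ...all other hooks likewise return false / no-op...
+//   };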
+#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "common/zipkin_trace.h" +#include "librbd/io/Types.h" +#include "librbd/io/ObjectDispatchInterface.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template +class ObjectDispatch : public ObjectDispatchInterface { +public: + ObjectDispatch(ImageCtxT* image_ctx); + + ObjectDispatchLayer get_dispatch_layer() const override { + return OBJECT_DISPATCH_LAYER_CORE; + } + + void shut_down(Context* on_finish) override; + + bool read( + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override { + return false; + } + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + + int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + +private: + ImageCtxT* m_image_ctx; + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::ObjectDispatch; + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_H diff --git a/src/librbd/io/ObjectDispatchInterface.h b/src/librbd/io/ObjectDispatchInterface.h new file mode 100644 index 
000000000..2e9dd1300 --- /dev/null +++ b/src/librbd/io/ObjectDispatchInterface.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" + +struct Context; +struct RWLock; + +namespace librbd { +namespace io { + +struct AioCompletion; +struct ObjectDispatchInterface; +struct ObjectDispatchSpec; + +struct ObjectDispatchInterface { + typedef ObjectDispatchInterface Dispatch; + typedef ObjectDispatchLayer DispatchLayer; + typedef ObjectDispatchSpec DispatchSpec; + + virtual ~ObjectDispatchInterface() { + } + + virtual ObjectDispatchLayer get_dispatch_layer() const = 0; + + virtual void shut_down(Context* on_finish) = 0; + + virtual bool read( + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context**on_finish, Context* on_dispatched) = 0; + + virtual bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context**on_finish, Context* on_dispatched) = 0; + + virtual bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, const ZTracer::Trace &parent_trace, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context**on_finish, + Context* on_dispatched) = 0; + + virtual bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) = 0; + + virtual bool list_snaps( + uint64_t object_no, Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) = 0; + + virtual bool invalidate_cache(Context* on_finish) = 0; + virtual bool reset_existence_cache(Context* on_finish) = 0; + + virtual void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) = 0; + + virtual int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* 
snapshot_sparse_bufferlist) = 0; + +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_INTERFACE_H diff --git a/src/librbd/io/ObjectDispatchSpec.cc b/src/librbd/io/ObjectDispatchSpec.cc new file mode 100644 index 000000000..3efff9774 --- /dev/null +++ b/src/librbd/io/ObjectDispatchSpec.cc @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ObjectDispatchSpec.h" +#include "include/Context.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include + +namespace librbd { +namespace io { + +void ObjectDispatchSpec::C_Dispatcher::complete(int r) { + if (r < 0) { + finish(r); + return; + } + + switch (object_dispatch_spec->dispatch_result) { + case DISPATCH_RESULT_CONTINUE: + object_dispatch_spec->send(); + break; + case DISPATCH_RESULT_COMPLETE: + finish(r); + break; + case DISPATCH_RESULT_INVALID: + case DISPATCH_RESULT_RESTART: + ceph_abort(); + break; + } +} + +void ObjectDispatchSpec::C_Dispatcher::finish(int r) { + on_finish->complete(r); + delete object_dispatch_spec; +} + +void ObjectDispatchSpec::send() { + object_dispatcher->send(this); +} + +void ObjectDispatchSpec::fail(int r) { + ceph_assert(r < 0); + dispatcher_ctx.complete(r); +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/ObjectDispatchSpec.h b/src/librbd/io/ObjectDispatchSpec.h new file mode 100644 index 000000000..a0d4b49a4 --- /dev/null +++ b/src/librbd/io/ObjectDispatchSpec.h @@ -0,0 +1,295 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H +#define CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include + +namespace librbd { +namespace io { + +struct ObjectDispatcherInterface; + +struct ObjectDispatchSpec { +private: + // helper to avoid extra heap allocation per object IO + struct C_Dispatcher : public Context { + ObjectDispatchSpec* object_dispatch_spec; + Context* on_finish; + + C_Dispatcher(ObjectDispatchSpec* object_dispatch_spec, Context* on_finish) + : object_dispatch_spec(object_dispatch_spec), on_finish(on_finish) { + } + + void complete(int r) override; + void finish(int r) override; + }; + +public: + struct RequestBase { + uint64_t object_no; + + RequestBase(uint64_t object_no) + : object_no(object_no) { + } + }; + + struct ReadRequest : public RequestBase { + ReadExtents* extents; + int read_flags; + uint64_t* version; + + ReadRequest(uint64_t object_no, ReadExtents* extents, int read_flags, + uint64_t* version) + : RequestBase(object_no), extents(extents), read_flags(read_flags), + version(version) { + } + }; + + struct WriteRequestBase : public RequestBase { + uint64_t object_off; + uint64_t journal_tid; + + WriteRequestBase(uint64_t object_no, uint64_t object_off, + uint64_t journal_tid) + : RequestBase(object_no), object_off(object_off), + journal_tid(journal_tid) { + } + }; + + struct DiscardRequest : public WriteRequestBase { + uint64_t object_len; + int discard_flags; + + DiscardRequest(uint64_t object_no, uint64_t object_off, uint64_t object_len, + int discard_flags, uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + object_len(object_len), discard_flags(discard_flags) { + } + }; + + struct WriteRequest : 
public WriteRequestBase { + ceph::bufferlist data; + int write_flags; + std::optional assert_version; + + WriteRequest(uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, int write_flags, + std::optional assert_version, uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + data(std::move(data)), write_flags(write_flags), + assert_version(assert_version) { + } + }; + + struct WriteSameRequest : public WriteRequestBase { + uint64_t object_len; + LightweightBufferExtents buffer_extents; + ceph::bufferlist data; + + WriteSameRequest(uint64_t object_no, uint64_t object_off, + uint64_t object_len, + LightweightBufferExtents&& buffer_extents, + ceph::bufferlist&& data, uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + object_len(object_len), buffer_extents(std::move(buffer_extents)), + data(std::move(data)) { + } + }; + + struct CompareAndWriteRequest : public WriteRequestBase { + ceph::bufferlist cmp_data; + ceph::bufferlist data; + uint64_t* mismatch_offset; + + CompareAndWriteRequest(uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& data, + uint64_t* mismatch_offset, + uint64_t journal_tid) + : WriteRequestBase(object_no, object_off, journal_tid), + cmp_data(std::move(cmp_data)), data(std::move(data)), + mismatch_offset(mismatch_offset) { + } + }; + + struct FlushRequest { + FlushSource flush_source; + uint64_t journal_tid; + + FlushRequest(FlushSource flush_source, uint64_t journal_tid) + : flush_source(flush_source), journal_tid(journal_tid) { + } + }; + + struct ListSnapsRequest : public RequestBase { + Extents extents; + SnapIds snap_ids; + int list_snaps_flags; + SnapshotDelta* snapshot_delta; + + ListSnapsRequest(uint64_t object_no, Extents&& extents, + SnapIds&& snap_ids, int list_snaps_flags, + SnapshotDelta* snapshot_delta) + : RequestBase(object_no), extents(std::move(extents)), + snap_ids(std::move(snap_ids)),list_snaps_flags(list_snaps_flags), + snapshot_delta(snapshot_delta) { + } + }; + + typedef boost::variant Request; + + C_Dispatcher dispatcher_ctx; + + ObjectDispatcherInterface* object_dispatcher; + ObjectDispatchLayer dispatch_layer; + int object_dispatch_flags = 0; + DispatchResult dispatch_result = DISPATCH_RESULT_INVALID; + + Request request; + IOContext io_context; + int op_flags; + ZTracer::Trace parent_trace; + + template + static ObjectDispatchSpec* create_read( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, Context* on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + ReadRequest{object_no, extents, + read_flags, version}, + io_context, op_flags, parent_trace, + on_finish); + } + + template + static ObjectDispatchSpec* create_discard( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + DiscardRequest{object_no, object_off, + object_len, discard_flags, + journal_tid}, + io_context, 0, parent_trace, on_finish); + } + + template + static ObjectDispatchSpec* create_write( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + 
uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + WriteRequest{object_no, object_off, + std::move(data), write_flags, + assert_version, journal_tid}, + io_context, op_flags, parent_trace, + on_finish); + } + + template + static ObjectDispatchSpec* create_write_same( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + WriteSameRequest{object_no, object_off, + object_len, + std::move(buffer_extents), + std::move(data), + journal_tid}, + io_context, op_flags, parent_trace, + on_finish); + } + + template + static ObjectDispatchSpec* create_compare_and_write( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, + uint64_t *mismatch_offset, int op_flags, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + CompareAndWriteRequest{object_no, + object_off, + std::move(cmp_data), + std::move(write_data), + mismatch_offset, + journal_tid}, + io_context, op_flags, parent_trace, + on_finish); + } + + template + static ObjectDispatchSpec* create_flush( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + FlushSource flush_source, uint64_t journal_tid, + const ZTracer::Trace &parent_trace, Context *on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + FlushRequest{flush_source, journal_tid}, + {}, 0, parent_trace, on_finish); + } + + template + static ObjectDispatchSpec* create_list_snaps( + ImageCtxT* image_ctx, ObjectDispatchLayer object_dispatch_layer, + uint64_t object_no, Extents&& extents, SnapIds&& snap_ids, + int list_snaps_flags, const ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, Context* on_finish) { + return new ObjectDispatchSpec(image_ctx->io_object_dispatcher, + object_dispatch_layer, + ListSnapsRequest{object_no, + std::move(extents), + std::move(snap_ids), + list_snaps_flags, + snapshot_delta}, + {}, 0, parent_trace, on_finish); + } + + void send(); + void fail(int r); + +private: + template friend class ObjectDispatcher; + + ObjectDispatchSpec(ObjectDispatcherInterface* object_dispatcher, + ObjectDispatchLayer object_dispatch_layer, + Request&& request, IOContext io_context, int op_flags, + const ZTracer::Trace& parent_trace, Context* on_finish) + : dispatcher_ctx(this, on_finish), object_dispatcher(object_dispatcher), + dispatch_layer(object_dispatch_layer), request(std::move(request)), + io_context(io_context), op_flags(op_flags), parent_trace(parent_trace) { + } + +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCH_SPEC_H diff --git a/src/librbd/io/ObjectDispatcher.cc b/src/librbd/io/ObjectDispatcher.cc new file mode 100644 index 000000000..b66c6bb18 --- /dev/null +++ 
+++ b/src/librbd/io/ObjectDispatcher.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectDispatcher.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "common/dout.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/io/ObjectDispatch.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::ObjectDispatcher: " << this \
+                           << " " << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+template <typename I>
+struct ObjectDispatcher<I>::C_ResetExistenceCache : public C_LayerIterator {
+  C_ResetExistenceCache(ObjectDispatcher* object_dispatcher, Context* on_finish)
+    : C_LayerIterator(object_dispatcher, OBJECT_DISPATCH_LAYER_NONE, on_finish) {
+  }
+
+  bool execute(ObjectDispatchInterface* object_dispatch,
+               Context* on_finish) override {
+    return object_dispatch->reset_existence_cache(on_finish);
+  }
+};
+
+template <typename I>
+struct ObjectDispatcher<I>::SendVisitor : public boost::static_visitor<bool> {
+  ObjectDispatchInterface* object_dispatch;
+  ObjectDispatchSpec* object_dispatch_spec;
+
+  SendVisitor(ObjectDispatchInterface* object_dispatch,
+              ObjectDispatchSpec* object_dispatch_spec)
+    : object_dispatch(object_dispatch),
+      object_dispatch_spec(object_dispatch_spec) {
+  }
+
+  bool operator()(ObjectDispatchSpec::ReadRequest& read) const {
+    return object_dispatch->read(
+      read.object_no, read.extents, object_dispatch_spec->io_context,
+      object_dispatch_spec->op_flags, read.read_flags,
+      object_dispatch_spec->parent_trace, read.version,
+      &object_dispatch_spec->object_dispatch_flags,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::DiscardRequest& discard) const {
+    return object_dispatch->discard(
+      discard.object_no, discard.object_off, discard.object_len,
+      object_dispatch_spec->io_context, discard.discard_flags,
+      object_dispatch_spec->parent_trace,
+      &object_dispatch_spec->object_dispatch_flags, &discard.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::WriteRequest& write) const {
+    return object_dispatch->write(
+      write.object_no, write.object_off, std::move(write.data),
+      object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+      write.write_flags, write.assert_version,
+      object_dispatch_spec->parent_trace,
+      &object_dispatch_spec->object_dispatch_flags, &write.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::WriteSameRequest& write_same) const {
+    return object_dispatch->write_same(
+      write_same.object_no, write_same.object_off, write_same.object_len,
+      std::move(write_same.buffer_extents), std::move(write_same.data),
+      object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+      object_dispatch_spec->parent_trace,
+      &object_dispatch_spec->object_dispatch_flags, &write_same.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(
+      ObjectDispatchSpec::CompareAndWriteRequest& compare_and_write) const {
+    return object_dispatch->compare_and_write(
+      compare_and_write.object_no, compare_and_write.object_off,
+      std::move(compare_and_write.cmp_data), std::move(compare_and_write.data),
+      object_dispatch_spec->io_context, object_dispatch_spec->op_flags,
+      object_dispatch_spec->parent_trace, compare_and_write.mismatch_offset,
+      &object_dispatch_spec->object_dispatch_flags,
+      &compare_and_write.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::FlushRequest& flush) const {
+    return object_dispatch->flush(
+      flush.flush_source, object_dispatch_spec->parent_trace,
+      &flush.journal_tid,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+
+  bool operator()(ObjectDispatchSpec::ListSnapsRequest& list_snaps) const {
+    return object_dispatch->list_snaps(
+      list_snaps.object_no, std::move(list_snaps.extents),
+      std::move(list_snaps.snap_ids), list_snaps.list_snaps_flags,
+      object_dispatch_spec->parent_trace, list_snaps.snapshot_delta,
+      &object_dispatch_spec->object_dispatch_flags,
+      &object_dispatch_spec->dispatch_result,
+      &object_dispatch_spec->dispatcher_ctx.on_finish,
+      &object_dispatch_spec->dispatcher_ctx);
+  }
+};
+
+template <typename I>
+ObjectDispatcher<I>::ObjectDispatcher(I* image_ctx)
+  : Dispatcher<I, ObjectDispatcherInterface>(image_ctx) {
+  // configure the core object dispatch handler on startup
+  auto object_dispatch = new ObjectDispatch(image_ctx);
+  this->register_dispatch(object_dispatch);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::invalidate_cache(Context* on_finish) {
+  auto image_ctx = this->m_image_ctx;
+  auto cct = image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  on_finish = util::create_async_context_callback(*image_ctx, on_finish);
+  auto ctx = new C_InvalidateCache(
+    this, OBJECT_DISPATCH_LAYER_NONE, on_finish);
+  ctx->complete(0);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::reset_existence_cache(Context* on_finish) {
+  auto image_ctx = this->m_image_ctx;
+  auto cct = image_ctx->cct;
+  ldout(cct, 5) << dendl;
+
+  on_finish = util::create_async_context_callback(*image_ctx, on_finish);
+  auto ctx = new C_ResetExistenceCache(this, on_finish);
+  ctx->complete(0);
+}
+
+template <typename I>
+void ObjectDispatcher<I>::extent_overwritten(
+    uint64_t object_no, uint64_t object_off, uint64_t object_len,
+    uint64_t journal_tid, uint64_t new_journal_tid) {
+  auto cct = this->m_image_ctx->cct;
+  ldout(cct, 20) << object_no << " " << object_off << "~" << object_len
+                 << dendl;
+
+  std::shared_lock locker{this->m_lock};
+  for (auto it : this->m_dispatches) {
+    auto& object_dispatch_meta = it.second;
+    auto object_dispatch = object_dispatch_meta.dispatch;
+    object_dispatch->extent_overwritten(object_no, object_off, object_len,
+                                        journal_tid, new_journal_tid);
+  }
+}
+
+template <typename I>
+int ObjectDispatcher<I>::prepare_copyup(
+    uint64_t object_no,
+    SnapshotSparseBufferlist* snapshot_sparse_bufferlist) {
+  auto cct = this->m_image_ctx->cct;
+  ldout(cct, 20) << "object_no=" << object_no << dendl;
+
+  std::shared_lock locker{this->m_lock};
+  for (auto it : this->m_dispatches) {
+    auto& object_dispatch_meta = it.second;
+    auto object_dispatch = object_dispatch_meta.dispatch;
+    auto r = object_dispatch->prepare_copyup(
+      object_no, snapshot_sparse_bufferlist);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return 0;
+}
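+
+// send_dispatch() below resolves the boost::variant payload of a spec: each
+// SendVisitor::operator() overload above forwards exactly one request type
+// to the matching method of the target dispatch layer, so adding a request
+// type only requires a new variant alternative plus one visitor overload.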
+template <typename I>
+bool ObjectDispatcher<I>::send_dispatch(
+    ObjectDispatchInterface* object_dispatch,
+    ObjectDispatchSpec* object_dispatch_spec) {
+  return boost::apply_visitor(
+    SendVisitor{object_dispatch, object_dispatch_spec},
+    object_dispatch_spec->request);
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectDispatcher.h b/src/librbd/io/ObjectDispatcher.h
new file mode 100644
index 000000000..1e5e78d8b
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcher.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "librbd/io/Dispatcher.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+#include "librbd/io/ObjectDispatchSpec.h"
+#include "librbd/io/ObjectDispatcherInterface.h"
+#include "librbd/io/Types.h"
+#include
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectDispatcher
+  : public Dispatcher<ImageCtxT, ObjectDispatcherInterface> {
+public:
+  ObjectDispatcher(ImageCtxT* image_ctx);
+
+  void invalidate_cache(Context* on_finish) override;
+  void reset_existence_cache(Context* on_finish) override;
+
+  void extent_overwritten(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      uint64_t journal_tid, uint64_t new_journal_tid) override;
+
+  int prepare_copyup(
+      uint64_t object_no,
+      SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override;
+
+  using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_LayerIterator;
+
+  using typename Dispatcher<ImageCtxT, ObjectDispatcherInterface>::C_InvalidateCache;
+
+protected:
+  bool send_dispatch(ObjectDispatchInterface* object_dispatch,
+                     ObjectDispatchSpec* object_dispatch_spec) override;
+
+private:
+  struct C_ResetExistenceCache;
+  struct SendVisitor;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectDispatcher<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_H
diff --git a/src/librbd/io/ObjectDispatcherInterface.h b/src/librbd/io/ObjectDispatcherInterface.h
new file mode 100644
index 000000000..0f3d33330
--- /dev/null
+++ b/src/librbd/io/ObjectDispatcherInterface.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
+#define CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
+
+#include "include/int_types.h"
+#include "librbd/io/DispatcherInterface.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+namespace io {
+
+struct ObjectDispatcherInterface
+  : public DispatcherInterface<ObjectDispatchInterface> {
+public:
+  virtual void invalidate_cache(Context* on_finish) = 0;
+  virtual void reset_existence_cache(Context* on_finish) = 0;
+
+  virtual void extent_overwritten(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      uint64_t journal_tid, uint64_t new_journal_tid) = 0;
+
+  virtual int prepare_copyup(
+      uint64_t object_no,
+      SnapshotSparseBufferlist* snapshot_sparse_bufferlist) = 0;
+
+};
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_OBJECT_DISPATCHER_INTERFACE_H
diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc
new file mode 100644
index 000000000..6d246cdf3
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.cc
@@ -0,0 +1,1073 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/ObjectRequest.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
"common/ceph_mutex.h" +#include "include/Context.h" +#include "include/err.h" +#include "include/neorados/RADOS.hpp" +#include "osd/osd_types.h" +#include "librados/snap_set_diff.h" +#include "librbd/AsioEngine.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/asio/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/CopyupRequest.h" +#include "librbd/io/ImageRequest.h" +#include "librbd/io/Utils.h" + +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \ + << " " << __func__ << ": " \ + << data_object_name(this->m_ictx, \ + this->m_object_no) << " " + +namespace librbd { +namespace io { + +using librbd::util::data_object_name; +using librbd::util::create_context_callback; +using librbd::util::create_trace; + +namespace { + +template +inline bool is_copy_on_read(I *ictx, const IOContext& io_context) { + std::shared_lock image_locker{ictx->image_lock}; + return (ictx->clone_copy_on_read && !ictx->read_only && + io_context->read_snap().value_or(CEPH_NOSNAP) == CEPH_NOSNAP && + (ictx->exclusive_lock == nullptr || + ictx->exclusive_lock->is_lock_owner())); +} + +template +void convert_snap_set(const S& src_snap_set, + D* dst_snap_set) { + dst_snap_set->seq = src_snap_set.seq; + dst_snap_set->clones.reserve(src_snap_set.clones.size()); + for (auto& src_clone : src_snap_set.clones) { + dst_snap_set->clones.emplace_back(); + auto& dst_clone = dst_snap_set->clones.back(); + dst_clone.cloneid = src_clone.cloneid; + dst_clone.snaps = src_clone.snaps; + dst_clone.overlap = src_clone.overlap; + dst_clone.size = src_clone.size; + } +} + +} // anonymous namespace + +template +ObjectRequest* +ObjectRequest::create_write( + I *ictx, uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, Context *completion) { + return new ObjectWriteRequest(ictx, object_no, object_off, + std::move(data), io_context, op_flags, + write_flags, assert_version, + parent_trace, completion); +} + +template +ObjectRequest* +ObjectRequest::create_discard( + I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, Context *completion) { + return new ObjectDiscardRequest(ictx, object_no, object_off, + object_len, io_context, discard_flags, + parent_trace, completion); +} + +template +ObjectRequest* +ObjectRequest::create_write_same( + I *ictx, uint64_t object_no, uint64_t object_off, uint64_t object_len, + ceph::bufferlist&& data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, Context *completion) { + return new ObjectWriteSameRequest(ictx, object_no, object_off, + object_len, std::move(data), io_context, + op_flags, parent_trace, completion); +} + +template +ObjectRequest* +ObjectRequest::create_compare_and_write( + I *ictx, uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data, + IOContext io_context, uint64_t *mismatch_offset, int op_flags, + const ZTracer::Trace &parent_trace, Context *completion) { + return new ObjectCompareAndWriteRequest(ictx, object_no, object_off, + std::move(cmp_data), + std::move(write_data), io_context, + mismatch_offset, op_flags, + parent_trace, completion); +} + +template +ObjectRequest::ObjectRequest( + I 
+
+template <typename I>
+ObjectRequest<I>::ObjectRequest(
+    I *ictx, uint64_t objectno, IOContext io_context,
+    const char *trace_name, const ZTracer::Trace &trace, Context *completion)
+  : m_ictx(ictx), m_object_no(objectno), m_io_context(io_context),
+    m_completion(completion),
+    m_trace(create_trace(*ictx, "", trace)) {
+  ceph_assert(m_ictx->data_ctx.is_valid());
+  if (m_trace.valid()) {
+    m_trace.copy_name(trace_name + std::string(" ") +
+                      data_object_name(ictx, objectno));
+    m_trace.event("start");
+  }
+}
+
+template <typename I>
+void ObjectRequest<I>::add_write_hint(I& image_ctx, neorados::WriteOp* wr) {
+  auto alloc_hint_flags = static_cast<neorados::alloc_hint::alloc_hint_t>(
+    image_ctx.alloc_hint_flags);
+  if (image_ctx.enable_alloc_hint) {
+    wr->set_alloc_hint(image_ctx.get_object_size(),
+                       image_ctx.get_object_size(),
+                       alloc_hint_flags);
+  } else if (image_ctx.alloc_hint_flags != 0U) {
+    wr->set_alloc_hint(0, 0, alloc_hint_flags);
+  }
+}
+
+template <typename I>
+bool ObjectRequest<I>::compute_parent_extents(Extents *parent_extents,
+                                              ImageArea *area,
+                                              bool read_request) {
+  ceph_assert(ceph_mutex_is_locked(m_ictx->image_lock));
+
+  m_has_parent = false;
+  parent_extents->clear();
+  *area = ImageArea::DATA;
+
+  uint64_t raw_overlap;
+  int r = m_ictx->get_parent_overlap(
+    m_io_context->read_snap().value_or(CEPH_NOSNAP), &raw_overlap);
+  if (r < 0) {
+    // NOTE: it's possible for a snapshot to be deleted while we are
+    // still reading from it
+    lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
+                       << cpp_strerror(r) << dendl;
+    return false;
+  }
+  bool migration_write = !read_request && !m_ictx->migration_info.empty();
+  if (migration_write) {
+    raw_overlap = m_ictx->migration_info.overlap;
+  }
+  if (raw_overlap == 0) {
+    return false;
+  }
+
+  std::tie(*parent_extents, *area) = io::util::object_to_area_extents(
+    m_ictx, m_object_no, {{0, m_ictx->layout.object_size}});
+  uint64_t object_overlap = m_ictx->prune_parent_extents(
+    *parent_extents, *area, raw_overlap, migration_write);
+  if (object_overlap > 0) {
+    m_has_parent = true;
+    return true;
+  }
+  return false;
+}
+
+template <typename I>
+void ObjectRequest<I>::async_finish(int r) {
+  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+  m_ictx->asio_engine->post([this, r]() { finish(r); });
+}
+
+template <typename I>
+void ObjectRequest<I>::finish(int r) {
+  ldout(m_ictx->cct, 20) << "r=" << r << dendl;
+  m_completion->complete(r);
+  delete this;
+}
+
+/** read **/
+
+template <typename I>
+ObjectReadRequest<I>::ObjectReadRequest(
+    I *ictx, uint64_t objectno, ReadExtents* extents,
+    IOContext io_context, int op_flags, int read_flags,
+    const ZTracer::Trace &parent_trace, uint64_t* version,
+    Context *completion)
+  : ObjectRequest<I>(ictx, objectno, io_context, "read", parent_trace,
+                     completion),
+    m_extents(extents), m_op_flags(op_flags), m_read_flags(read_flags),
+    m_version(version) {
+}
+
+template <typename I>
+void ObjectReadRequest<I>::send() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  read_object();
+}
+
+template <typename I>
+void ObjectReadRequest<I>::read_object() {
+  I *image_ctx = this->m_ictx;
+
+  std::shared_lock image_locker{image_ctx->image_lock};
+  auto read_snap_id = this->m_io_context->read_snap().value_or(CEPH_NOSNAP);
+  if (read_snap_id == image_ctx->snap_id &&
+      image_ctx->object_map != nullptr &&
+      !image_ctx->object_map->object_may_exist(this->m_object_no)) {
+    image_ctx->asio_engine->post([this]() { read_parent(); });
+    return;
+  }
+  image_locker.unlock();
+
+  ldout(image_ctx->cct, 20) << "snap_id=" << read_snap_id << dendl;
+
+  neorados::ReadOp read_op;
+  for (auto& extent: *this->m_extents) {
+    if (extent.length >= image_ctx->sparse_read_threshold_bytes) {
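+      // extents at or above sparse_read_threshold_bytes are fetched as a
+      // sparse read: holes come back via extent_map instead of zero-filled
+      // buffer data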
+      read_op.sparse_read(extent.offset, extent.length, &extent.bl,
+                          &extent.extent_map);
+    } else {
+      read_op.read(extent.offset, extent.length, &extent.bl);
+    }
+  }
+  util::apply_op_flags(
+    m_op_flags, image_ctx->get_read_flags(read_snap_id), &read_op);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(read_op), nullptr,
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_read_object(r); }), m_version,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_object(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+  if (m_version != nullptr) {
+    ldout(image_ctx->cct, 20) << "version=" << *m_version << dendl;
+  }
+
+  if (r == -ENOENT) {
+    read_parent();
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to read from object: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  this->finish(0);
+}
+
+template <typename I>
+void ObjectReadRequest<I>::read_parent() {
+  if ((m_read_flags & READ_FLAG_DISABLE_READ_FROM_PARENT) != 0) {
+    this->finish(-ENOENT);
+    return;
+  }
+
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  auto ctx = create_context_callback<
+    ObjectReadRequest<I>, &ObjectReadRequest<I>::handle_read_parent>(this);
+
+  io::util::read_parent(
+    image_ctx, this->m_object_no, this->m_extents,
+    this->m_io_context->read_snap().value_or(CEPH_NOSNAP), this->m_trace,
+    ctx);
+}
+
+template <typename I>
+void ObjectReadRequest<I>::handle_read_parent(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    this->finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to read parent extents: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  copyup();
+}
+
+template <typename I>
+void ObjectReadRequest<I>::copyup() {
+  I *image_ctx = this->m_ictx;
+  if (!is_copy_on_read(image_ctx, this->m_io_context)) {
+    this->finish(0);
+    return;
+  }
+
+  image_ctx->owner_lock.lock_shared();
+  image_ctx->image_lock.lock_shared();
+  Extents parent_extents;
+  ImageArea area;
+  if (!this->compute_parent_extents(&parent_extents, &area, true) ||
+      (image_ctx->exclusive_lock != nullptr &&
+       !image_ctx->exclusive_lock->is_lock_owner())) {
+    image_ctx->image_lock.unlock_shared();
+    image_ctx->owner_lock.unlock_shared();
+    this->finish(0);
+    return;
+  }
+
+  ldout(image_ctx->cct, 20) << dendl;
+
+  image_ctx->copyup_list_lock.lock();
+  auto it = image_ctx->copyup_list.find(this->m_object_no);
+  if (it == image_ctx->copyup_list.end()) {
+    // create and kick off a CopyupRequest
+    auto new_req = CopyupRequest<I>::create(
+      image_ctx, this->m_object_no, std::move(parent_extents), area,
+      this->m_trace);
+
+    image_ctx->copyup_list[this->m_object_no] = new_req;
+    image_ctx->copyup_list_lock.unlock();
+    image_ctx->image_lock.unlock_shared();
+    new_req->send();
+  } else {
+    image_ctx->copyup_list_lock.unlock();
+    image_ctx->image_lock.unlock_shared();
+  }
+
+  image_ctx->owner_lock.unlock_shared();
+  this->finish(0);
+}
+
+/** write **/
+
+template <typename I>
+AbstractObjectWriteRequest<I>::AbstractObjectWriteRequest(
+    I *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+    IOContext io_context, const char *trace_name,
+    const ZTracer::Trace &parent_trace, Context *completion)
+  : ObjectRequest<I>(ictx, object_no, io_context, trace_name, parent_trace,
+                     completion),
+    m_object_off(object_off), m_object_len(len)
+{
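+  // a write covering the entire object never needs parent data, which
+  // lets compute_parent_info() disable copyup in the common case (the
+  // snapshot-context and post-copyup-write exceptions are handled there)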
+  if (this->m_object_off == 0 &&
+      this->m_object_len == ictx->get_object_size()) {
+    m_full_object = true;
+  }
+
+  compute_parent_info();
+
+  ictx->image_lock.lock_shared();
+  if (!ictx->migration_info.empty()) {
+    m_guarding_migration_write = true;
+  }
+  ictx->image_lock.unlock_shared();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::compute_parent_info() {
+  I *image_ctx = this->m_ictx;
+  std::shared_lock image_locker{image_ctx->image_lock};
+
+  this->compute_parent_extents(&m_parent_extents, &m_image_area, false);
+
+  if (!this->has_parent() ||
+      (m_full_object &&
+       !this->m_io_context->write_snap_context() &&
+       !is_post_copyup_write_required())) {
+    m_copyup_enabled = false;
+  }
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::add_write_hint(
+    neorados::WriteOp *wr) {
+  I *image_ctx = this->m_ictx;
+  std::shared_lock image_locker{image_ctx->image_lock};
+  if (image_ctx->object_map == nullptr || !this->m_object_may_exist ||
+      image_ctx->alloc_hint_flags != 0U) {
+    ObjectRequest<I>::add_write_hint(*image_ctx, wr);
+  }
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::send() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << this->get_op_type() << " "
+                            << this->m_object_off << "~" << this->m_object_len
+                            << dendl;
+  {
+    std::shared_lock image_lock{image_ctx->image_lock};
+    if (image_ctx->object_map == nullptr) {
+      m_object_may_exist = true;
+    } else {
+      // should have been flushed prior to releasing lock
+      ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+      m_object_may_exist = image_ctx->object_map->object_may_exist(
+        this->m_object_no);
+    }
+  }
+
+  if (!m_object_may_exist && is_no_op_for_nonexistent_object()) {
+    ldout(image_ctx->cct, 20) << "skipping no-op on nonexistent object"
+                              << dendl;
+    this->async_finish(0);
+    return;
+  }
+
+  pre_write_object_map_update();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::pre_write_object_map_update() {
+  I *image_ctx = this->m_ictx;
+
+  image_ctx->image_lock.lock_shared();
+  if (image_ctx->object_map == nullptr || !is_object_map_update_enabled()) {
+    image_ctx->image_lock.unlock_shared();
+    write_object();
+    return;
+  }
+
+  if (!m_object_may_exist && m_copyup_enabled) {
+    // optimization: copyup required
+    image_ctx->image_lock.unlock_shared();
+    copyup();
+    return;
+  }
+
+  uint8_t new_state = this->get_pre_write_object_map_state();
+  ldout(image_ctx->cct, 20) << this->m_object_off << "~" << this->m_object_len
+                            << dendl;
+
+  if (image_ctx->object_map->template aio_update<
+        AbstractObjectWriteRequest<I>,
+        &AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update>(
+          CEPH_NOSNAP, this->m_object_no, new_state, {}, this->m_trace, false,
+          this)) {
+    image_ctx->image_lock.unlock_shared();
+    return;
+  }
+
+  image_ctx->image_lock.unlock_shared();
+  write_object();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_pre_write_object_map_update(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(image_ctx->cct) << "failed to update object map: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  write_object();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::write_object() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  neorados::WriteOp write_op;
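+  // when copyup may still be required, guard the write: assert_exists()
+  // fails with -ENOENT if the child object is missing, and the migration
+  // snap-seq assertion fails with -ERANGE; both cases are handled in
+  // handle_write_object() by re-routing through the copyup path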
+  if (m_copyup_enabled) {
+    if (m_guarding_migration_write) {
+      auto snap_seq = (this->m_io_context->write_snap_context() ?
+          this->m_io_context->write_snap_context()->first : 0);
+      ldout(image_ctx->cct, 20) << "guarding write: snap_seq=" << snap_seq
+                                << dendl;
+
+      cls_client::assert_snapc_seq(
+        &write_op, snap_seq, cls::rbd::ASSERT_SNAPC_SEQ_LE_SNAPSET_SEQ);
+    } else {
+      ldout(image_ctx->cct, 20) << "guarding write" << dendl;
+      write_op.assert_exists();
+    }
+  }
+
+  add_write_hint(&write_op);
+  add_write_ops(&write_op);
+  ceph_assert(write_op.size() != 0);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(write_op),
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_write_object(r); }), nullptr,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_write_object(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  r = filter_write_result(r);
+  if (r == -ENOENT) {
+    if (m_copyup_enabled) {
+      copyup();
+      return;
+    }
+  } else if (r == -ERANGE && m_guarding_migration_write) {
+    image_ctx->image_lock.lock_shared();
+    m_guarding_migration_write = !image_ctx->migration_info.empty();
+    image_ctx->image_lock.unlock_shared();
+
+    if (m_guarding_migration_write) {
+      copyup();
+    } else {
+      ldout(image_ctx->cct, 10) << "migration parent gone, restart io" << dendl;
+      compute_parent_info();
+      write_object();
+    }
+    return;
+  } else if (r == -EILSEQ) {
+    ldout(image_ctx->cct, 10) << "failed to write object" << dendl;
+    this->finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(image_ctx->cct) << "failed to write object: " << cpp_strerror(r)
+                          << dendl;
+    this->finish(r);
+    return;
+  }
+
+  post_write_object_map_update();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::copyup() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  ceph_assert(!m_copyup_in_progress);
+  m_copyup_in_progress = true;
+
+  image_ctx->copyup_list_lock.lock();
+  auto it = image_ctx->copyup_list.find(this->m_object_no);
+  if (it == image_ctx->copyup_list.end()) {
+    auto new_req = CopyupRequest<I>::create(
+      image_ctx, this->m_object_no, std::move(this->m_parent_extents),
+      m_image_area, this->m_trace);
+    this->m_parent_extents.clear();
+
+    // make sure to wait on this CopyupRequest
+    new_req->append_request(this, std::move(get_copyup_overwrite_extents()));
+    image_ctx->copyup_list[this->m_object_no] = new_req;
+
+    image_ctx->copyup_list_lock.unlock();
+    new_req->send();
+  } else {
+    it->second->append_request(this, std::move(get_copyup_overwrite_extents()));
+    image_ctx->copyup_list_lock.unlock();
+  }
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_copyup(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+
+  ceph_assert(m_copyup_in_progress);
+  m_copyup_in_progress = false;
+
+  if (r < 0 && r != -ERESTART) {
+    lderr(image_ctx->cct) << "failed to copyup object: " << cpp_strerror(r)
+                          << dendl;
+    this->finish(r);
+    return;
+  }
+
+  if (r == -ERESTART || is_post_copyup_write_required()) {
+    write_object();
+    return;
+  }
+
+  post_write_object_map_update();
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::post_write_object_map_update() {
+  I *image_ctx = this->m_ictx;
+
+  image_ctx->image_lock.lock_shared();
+  if (image_ctx->object_map == nullptr || !is_object_map_update_enabled() ||
+      !is_non_existent_post_write_object_map_state()) {
+    image_ctx->image_lock.unlock_shared();
+    this->finish(0);
+    return;
+  }
+
+  ldout(image_ctx->cct, 20) << dendl;
+
+  // should have been flushed prior to releasing lock
+  ceph_assert(image_ctx->exclusive_lock->is_lock_owner());
+  if (image_ctx->object_map->template aio_update<
+        AbstractObjectWriteRequest<I>,
+        &AbstractObjectWriteRequest<I>::handle_post_write_object_map_update>(
+          CEPH_NOSNAP, this->m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
+          this->m_trace, false, this)) {
+    image_ctx->image_lock.unlock_shared();
+    return;
+  }
+
+  image_ctx->image_lock.unlock_shared();
+  this->finish(0);
+}
+
+template <typename I>
+void AbstractObjectWriteRequest<I>::handle_post_write_object_map_update(int r) {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(image_ctx->cct) << "failed to update object map: "
+                          << cpp_strerror(r) << dendl;
+    this->finish(r);
+    return;
+  }
+
+  this->finish(0);
+}
+
+template <typename I>
+void ObjectWriteRequest<I>::add_write_hint(neorados::WriteOp* wr) {
+  if ((m_write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) {
+    wr->create(true);
+  } else if (m_assert_version.has_value()) {
+    wr->assert_version(m_assert_version.value());
+  }
+  AbstractObjectWriteRequest<I>::add_write_hint(wr);
+}
+
+template <typename I>
+void ObjectWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  if (this->m_full_object) {
+    wr->write_full(bufferlist{m_write_data});
+  } else {
+    wr->write(this->m_object_off, bufferlist{m_write_data});
+  }
+  util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+template <typename I>
+void ObjectDiscardRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  switch (m_discard_action) {
+  case DISCARD_ACTION_REMOVE:
+    wr->remove();
+    break;
+  case DISCARD_ACTION_REMOVE_TRUNCATE:
+    wr->create(false);
+    // fall through
+  case DISCARD_ACTION_TRUNCATE:
+    wr->truncate(this->m_object_off);
+    break;
+  case DISCARD_ACTION_ZERO:
+    wr->zero(this->m_object_off, this->m_object_len);
+    break;
+  default:
+    ceph_abort();
+    break;
+  }
+}
+
+template <typename I>
+void ObjectWriteSameRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  wr->writesame(this->m_object_off, this->m_object_len,
+                bufferlist{m_write_data});
+  util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+template <typename I>
+void ObjectCompareAndWriteRequest<I>::add_write_ops(neorados::WriteOp* wr) {
+  wr->cmpext(this->m_object_off, bufferlist{m_cmp_bl}, nullptr);
+
+  if (this->m_full_object) {
+    wr->write_full(bufferlist{m_write_bl});
+  } else {
+    wr->write(this->m_object_off, bufferlist{m_write_bl});
+  }
+  util::apply_op_flags(m_op_flags, 0U, wr);
+}
+
+template <typename I>
+int ObjectCompareAndWriteRequest<I>::filter_write_result(int r) const {
+  if (r <= -MAX_ERRNO) {
+    I *image_ctx = this->m_ictx;
+
+    // object extent compare mismatch
+    uint64_t offset = -MAX_ERRNO - r;
+    auto [image_extents, _] = io::util::object_to_area_extents(
+      image_ctx, this->m_object_no, {{offset, this->m_object_len}});
+    ceph_assert(image_extents.size() == 1);
+
+    if (m_mismatch_offset) {
+      *m_mismatch_offset = image_extents[0].first;
+    }
+    r = -EILSEQ;
+  }
+  return r;
+}
+
+template <typename I>
+ObjectListSnapsRequest<I>::ObjectListSnapsRequest(
+    I *ictx, uint64_t objectno, Extents&& object_extents, SnapIds&& snap_ids,
+    int list_snaps_flags, const ZTracer::Trace &parent_trace,
+    SnapshotDelta* snapshot_delta, Context *completion)
+  : ObjectRequest<I>(
+      ictx, objectno, ictx->duplicate_data_io_context(), "snap_list",
+      parent_trace, completion),
+    m_object_extents(std::move(object_extents)),
+    m_snap_ids(std::move(snap_ids)), m_list_snaps_flags(list_snaps_flags),
+    m_snapshot_delta(snapshot_delta) {
+  this->m_io_context->read_snap(CEPH_SNAPDIR);
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::send() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
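+
+  // at least two snapshot ids (a base and an end point) are required to
+  // compute a delta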
+  if (m_snap_ids.size() < 2) {
+    lderr(image_ctx->cct) << "invalid snap ids: " << m_snap_ids << dendl;
+    this->async_finish(-EINVAL);
+    return;
+  }
+
+  list_snaps();
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::list_snaps() {
+  I *image_ctx = this->m_ictx;
+  ldout(image_ctx->cct, 20) << dendl;
+
+  neorados::ReadOp read_op;
+  read_op.list_snaps(&m_snap_set, &m_ec);
+
+  image_ctx->rados_api.execute(
+    {data_object_name(this->m_ictx, this->m_object_no)},
+    *this->m_io_context, std::move(read_op), nullptr,
+    librbd::asio::util::get_callback_adapter(
+      [this](int r) { handle_list_snaps(r); }), nullptr,
+    (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::handle_list_snaps(int r) {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  if (r >= 0) {
+    r = -m_ec.value();
+  }
+
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  m_snapshot_delta->clear();
+  auto& snapshot_delta = *m_snapshot_delta;
+
+  ceph_assert(!m_snap_ids.empty());
+  librados::snap_t start_snap_id = 0;
+  librados::snap_t first_snap_id = *m_snap_ids.begin();
+  librados::snap_t last_snap_id = *m_snap_ids.rbegin();
+
+  if (r == -ENOENT) {
+    // the object does not exist -- mark the missing extents
+    zero_extent(first_snap_id, true);
+    list_from_parent();
+    return;
+  } else if (r < 0) {
+    lderr(cct) << "failed to retrieve object snapshot list: " << cpp_strerror(r)
+               << dendl;
+    this->finish(r);
+    return;
+  }
+
+  // helper function requires the librados legacy data structure
+  librados::snap_set_t snap_set;
+  convert_snap_set(m_snap_set, &snap_set);
+
+  bool initial_extents_written = false;
+
+  interval_set<uint64_t> object_interval;
+  for (auto& object_extent : m_object_extents) {
+    object_interval.insert(object_extent.first, object_extent.second);
+  }
+  ldout(cct, 20) << "object_interval=" << object_interval << dendl;
+
+  // loop through all expected snapshots and build interval sets for
+  // data and zeroed ranges for each snapshot
+  uint64_t prev_end_size = 0;
+  interval_set<uint64_t> initial_written_extents;
+  for (auto end_snap_id : m_snap_ids) {
+    if (start_snap_id == end_snap_id) {
+      continue;
+    } else if (end_snap_id > last_snap_id) {
+      break;
+    }
+
+    interval_set<uint64_t> diff;
+    uint64_t end_size;
+    bool exists;
+    librados::snap_t clone_end_snap_id;
+    bool read_whole_object;
+    calc_snap_set_diff(cct, snap_set, start_snap_id,
+                       end_snap_id, &diff, &end_size, &exists,
+                       &clone_end_snap_id, &read_whole_object);
+
+    if (read_whole_object ||
+        (!diff.empty() &&
+         ((m_list_snaps_flags & LIST_SNAPS_FLAG_WHOLE_OBJECT) != 0))) {
+      ldout(cct, 1) << "need to read full object" << dendl;
+      diff.clear();
+      diff.insert(0, image_ctx->layout.object_size);
+      end_size = image_ctx->layout.object_size;
+      clone_end_snap_id = end_snap_id;
+    } else if (!exists) {
+      end_size = 0;
+    }
+
+    if (exists) {
+      // reads should be issued against the newest (existing) snapshot within
+      // the associated snapshot object clone. writes should be issued
+      // against the oldest snapshot in the snap_map.
+      ceph_assert(clone_end_snap_id >= end_snap_id);
+      if (clone_end_snap_id > last_snap_id) {
+        // do not read past the copy point snapshot
+        clone_end_snap_id = last_snap_id;
+      }
+    }
+
+    // clip diff to current object extent
+    interval_set<uint64_t> diff_interval;
+    diff_interval.intersection_of(object_interval, diff);
+
+    // clip diff to size of object (in case it was truncated)
+    interval_set<uint64_t> zero_interval;
+    if (end_size < prev_end_size) {
+      zero_interval.insert(end_size, prev_end_size - end_size);
+      zero_interval.intersection_of(object_interval);
+
+      interval_set<uint64_t> trunc_interval;
+      trunc_interval.intersection_of(zero_interval, diff_interval);
+      if (!trunc_interval.empty()) {
+        diff_interval.subtract(trunc_interval);
+        ldout(cct, 20) << "clearing truncate diff: " << trunc_interval << dendl;
+      }
+    }
+
+    ldout(cct, 20) << "start_snap_id=" << start_snap_id << ", "
+                   << "end_snap_id=" << end_snap_id << ", "
+                   << "clone_end_snap_id=" << clone_end_snap_id << ", "
+                   << "diff=" << diff << ", "
+                   << "diff_interval=" << diff_interval << ", "
+                   << "zero_interval=" << zero_interval << ", "
+                   << "end_size=" << end_size << ", "
+                   << "prev_end_size=" << prev_end_size << ", "
+                   << "exists=" << exists << ", "
+                   << "whole_object=" << read_whole_object << dendl;
+
+    // check if object exists prior to start of incremental snap delta so that
+    // we don't DNE the object if no additional deltas exist
+    if (exists && start_snap_id == 0 &&
+        (!diff_interval.empty() || !zero_interval.empty())) {
+      ldout(cct, 20) << "object exists at snap id " << end_snap_id << dendl;
+      initial_extents_written = true;
+    }
+
+    prev_end_size = end_size;
+    start_snap_id = end_snap_id;
+
+    if (end_snap_id <= first_snap_id) {
+      // don't include deltas from the starting snapshots, but we iterate over
+      // it to track its existence and size
+      ldout(cct, 20) << "skipping prior snapshot " << dendl;
+      continue;
+    }
+
+    if (exists) {
+      for (auto& interval : diff_interval) {
+        snapshot_delta[{end_snap_id, clone_end_snap_id}].insert(
+          interval.first, interval.second,
+          SparseExtent(SPARSE_EXTENT_STATE_DATA, interval.second));
+      }
+    } else {
+      zero_interval.union_of(diff_interval);
+    }
+
+    if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
+      for (auto& interval : zero_interval) {
+        snapshot_delta[{end_snap_id, end_snap_id}].insert(
+          interval.first, interval.second,
+          SparseExtent(SPARSE_EXTENT_STATE_ZEROED, interval.second));
+      }
+    }
+  }
+
+  bool snapshot_delta_empty = snapshot_delta.empty();
+  if (!initial_extents_written) {
+    zero_extent(first_snap_id, first_snap_id > 0);
+  }
+  ldout(cct, 20) << "snapshot_delta=" << snapshot_delta << dendl;
+
+  if (snapshot_delta_empty) {
+    list_from_parent();
+    return;
+  }
+
+  this->finish(0);
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::list_from_parent() {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  ceph_assert(!m_snap_ids.empty());
+  librados::snap_t snap_id_start = *m_snap_ids.begin();
+  librados::snap_t snap_id_end = *m_snap_ids.rbegin();
+
+  std::unique_lock image_locker{image_ctx->image_lock};
+  if ((snap_id_start > 0) || (image_ctx->parent == nullptr) ||
+      ((m_list_snaps_flags & LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT) != 0)) {
+    image_locker.unlock();
+
+    this->finish(0);
+    return;
+  }
+
+  Extents parent_extents;
+  uint64_t raw_overlap = 0;
+  uint64_t object_overlap = 0;
+  image_ctx->get_parent_overlap(snap_id_end, &raw_overlap);
+  if (raw_overlap > 0) {
+    // calculate reverse mapping onto the parent image
+    std::tie(parent_extents, m_image_area) = io::util::object_to_area_extents(
+      image_ctx, this->m_object_no, m_object_extents);
+    object_overlap = image_ctx->prune_parent_extents(
+      parent_extents, m_image_area, raw_overlap, false);
+  }
+  if (object_overlap == 0) {
+    image_locker.unlock();
+
+    this->finish(0);
+    return;
+  }
+
+  auto ctx = create_context_callback<
+    ObjectListSnapsRequest<I>,
+    &ObjectListSnapsRequest<I>::handle_list_from_parent>(this);
+  auto aio_comp = AioCompletion::create_and_start(
+    ctx, librbd::util::get_image_ctx(image_ctx->parent), AIO_TYPE_GENERIC);
+  ldout(cct, 20) << "completion=" << aio_comp
+                 << " parent_extents=" << parent_extents
+                 << " area=" << m_image_area << dendl;
+
+  auto list_snaps_flags = (
+    m_list_snaps_flags | LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS);
+
+  ImageListSnapsRequest<I> req(
+    *image_ctx->parent, aio_comp, std::move(parent_extents), m_image_area,
+    {0, image_ctx->parent->snap_id}, list_snaps_flags, &m_parent_snapshot_delta,
+    this->m_trace);
+  req.send();
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::handle_list_from_parent(int r) {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  ldout(cct, 20) << "r=" << r << ", "
+                 << "parent_snapshot_delta=" << m_parent_snapshot_delta
+                 << dendl;
+
+  // ignore special-case of fully empty dataset (we ignore zeroes)
+  if (m_parent_snapshot_delta.empty()) {
+    this->finish(0);
+    return;
+  }
+
+  // the write/read snapshot id key is not useful for parent images so
+  // map to the special-case INITIAL_WRITE_READ_SNAP_IDS key
+  *m_snapshot_delta = {};
+  auto& intervals = (*m_snapshot_delta)[INITIAL_WRITE_READ_SNAP_IDS];
+  for (auto& [key, image_extents] : m_parent_snapshot_delta) {
+    for (auto image_extent : image_extents) {
+      auto state = image_extent.get_val().state;
+
+      // map image-extents back to this object
+      striper::LightweightObjectExtents object_extents;
+      io::util::area_to_object_extents(image_ctx, image_extent.get_off(),
+                                       image_extent.get_len(), m_image_area, 0,
+                                       &object_extents);
+      for (auto& object_extent : object_extents) {
+        ceph_assert(object_extent.object_no == this->m_object_no);
+        intervals.insert(
+          object_extent.offset, object_extent.length,
+          {state, object_extent.length});
+      }
+    }
+  }
+
+  ldout(cct, 20) << "snapshot_delta=" << *m_snapshot_delta << dendl;
+  this->finish(0);
+}
+
+template <typename I>
+void ObjectListSnapsRequest<I>::zero_extent(uint64_t snap_id, bool dne) {
+  I *image_ctx = this->m_ictx;
+  auto cct = image_ctx->cct;
+
+  // the object does not exist or is (partially) under whiteout -- mark the
+  // missing extents which would be any portion of the object that does not
+  // have data in the initial snapshot set
+  if ((m_list_snaps_flags & LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS) == 0) {
+    interval_set<uint64_t> interval;
+    for (auto [object_offset, object_length] : m_object_extents) {
+      interval.insert(object_offset, object_length);
+    }
+
+    for (auto [offset, length] : interval) {
+      ldout(cct, 20) << "snapshot " << snap_id << ": "
+                     << (dne ? "DNE" : "zeroed") << " extent "
+                     << offset << "~" << length << dendl;
+      (*m_snapshot_delta)[{snap_id, snap_id}].insert(
+        offset, length,
+        SparseExtent(
+          (dne ? SPARSE_EXTENT_STATE_DNE : SPARSE_EXTENT_STATE_ZEROED),
+          length));
+    }
+  }
+}
+
+} // namespace io
+} // namespace librbd
+
+template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h
new file mode 100644
index 000000000..caf644023
--- /dev/null
+++ b/src/librbd/io/ObjectRequest.h
@@ -0,0 +1,505 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+#define CEPH_LIBRBD_IO_OBJECT_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/neorados/RADOS.hpp"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/ObjectMap.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include
+
+class Context;
+class ObjectExtent;
+
+namespace neorados { struct WriteOp; }
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace io {
+
+struct AioCompletion;
+template <typename> class CopyupRequest;
+
+/**
+ * This class represents an I/O operation to a single RBD data object.
+ * Its subclasses encapsulate logic for dealing with special cases
+ * for I/O due to layering.
+ */
+template <typename ImageCtxT = ImageCtx>
+class ObjectRequest {
+public:
+  static ObjectRequest* create_write(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      ceph::bufferlist&& data, IOContext io_context, int op_flags,
+      int write_flags, std::optional<uint64_t> assert_version,
+      const ZTracer::Trace &parent_trace, Context *completion);
+  static ObjectRequest* create_discard(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, IOContext io_context, int discard_flags,
+      const ZTracer::Trace &parent_trace, Context *completion);
+  static ObjectRequest* create_write_same(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, ceph::bufferlist&& data, IOContext io_context,
+      int op_flags, const ZTracer::Trace &parent_trace, Context *completion);
+  static ObjectRequest* create_compare_and_write(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      ceph::bufferlist&& cmp_data, ceph::bufferlist&& write_data,
+      IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+      const ZTracer::Trace &parent_trace, Context *completion);
+
+  ObjectRequest(ImageCtxT *ictx, uint64_t objectno, IOContext io_context,
+                const char *trace_name, const ZTracer::Trace &parent_trace,
+                Context *completion);
+  virtual ~ObjectRequest() {
+    m_trace.event("finish");
+  }
+
+  static void add_write_hint(ImageCtxT& image_ctx,
+                             neorados::WriteOp *wr);
+
+  virtual void send() = 0;
+
+  bool has_parent() const {
+    return m_has_parent;
+  }
+
+  virtual const char *get_op_type() const = 0;
+
+protected:
+  bool compute_parent_extents(Extents *parent_extents, ImageArea *area,
+                              bool read_request);
+
+  ImageCtxT *m_ictx;
+  uint64_t m_object_no;
+  IOContext m_io_context;
+  Context *m_completion;
+  ZTracer::Trace m_trace;
+
+  void async_finish(int r);
+  void finish(int r);
+
+private:
+  bool m_has_parent = false;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectReadRequest : public ObjectRequest<ImageCtxT> {
+public:
+  static ObjectReadRequest* create(
+      ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents,
+      IOContext io_context, int op_flags, int read_flags,
+      const ZTracer::Trace &parent_trace, uint64_t* version,
+      Context *completion) {
+    return new ObjectReadRequest(ictx, objectno, extents, io_context, op_flags,
+                                 read_flags, parent_trace, version, completion);
+  }
+
+  ObjectReadRequest(
+      ImageCtxT *ictx, uint64_t objectno, ReadExtents* extents,
+      IOContext io_context, int op_flags, int read_flags,
+      const ZTracer::Trace &parent_trace, uint64_t* version,
+      Context *completion);
+
+  void send() override;
+
+  const char *get_op_type() const override {
+    return "read";
+  }
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    |
+   *    v
+   * READ_OBJECT
+   *    |
+   *    v (skip if not needed)
+   * READ_PARENT
+   *    |
+   *    v (skip if not needed)
+   * COPYUP
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  ReadExtents* m_extents;
+  int m_op_flags;
+  int m_read_flags;
+  uint64_t* m_version;
+
+  void read_object();
+  void handle_read_object(int r);
+
+  void read_parent();
+  void handle_read_parent(int r);
+
+  void copyup();
+};
+
+template <typename ImageCtxT = ImageCtx>
+class AbstractObjectWriteRequest : public ObjectRequest<ImageCtxT> {
+public:
+  AbstractObjectWriteRequest(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off, uint64_t len,
+      IOContext io_context, const char *trace_name,
+      const ZTracer::Trace &parent_trace, Context *completion);
+
+  virtual bool is_empty_write_op() const {
+    return false;
+  }
+
+  virtual uint8_t get_pre_write_object_map_state() const {
+    return OBJECT_EXISTS;
+  }
+
+  virtual void add_copyup_ops(neorados::WriteOp *wr) {
+    add_write_ops(wr);
+  }
+
+  void handle_copyup(int r);
+
+  void send() override;
+
+protected:
+  uint64_t m_object_off;
+  uint64_t m_object_len;
+  bool m_full_object = false;
+  bool m_copyup_enabled = true;
+
+  virtual bool is_no_op_for_nonexistent_object() const {
+    return false;
+  }
+  virtual bool is_object_map_update_enabled() const {
+    return true;
+  }
+  virtual bool is_post_copyup_write_required() const {
+    return false;
+  }
+  virtual bool is_non_existent_post_write_object_map_state() const {
+    return false;
+  }
+
+  virtual void add_write_hint(neorados::WriteOp *wr);
+  virtual void add_write_ops(neorados::WriteOp *wr) = 0;
+
+  virtual int filter_write_result(int r) const {
+    return r;
+  }
+
+  virtual Extents get_copyup_overwrite_extents() const {
+    return {{m_object_off, m_object_len}};
+  }
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v           (no-op write request)
+   * DETECT_NO_OP . . . . . . . . . . . . . . . . . . .
+   *    |                                             .
+   *    v (skip if not required/disabled)             .
+   * PRE_UPDATE_OBJECT_MAP                            .
+   *    |          .                                  .
+   *    |          . (child dne)                      .
+   *    |          . . . . . . . . .                  .
+   *    |                          .                  .
+   *    |  (post-copyup write)     .                  .
+   *    | . . . . . . . . . . . .  .                  .
+   *    | .                     .  .                  .
+   *    v v                     .  v                  .
+   *  WRITE . . . . . . . . > COPYUP (if required)    .
+   *    |                       |                     .
+   *    |/----------------------/                     .
+   *    |                                             .
+   *    v (skip if not required/disabled)             .
+   * POST_UPDATE_OBJECT_MAP                           .
+   *    |                                             .
+   *    v                                             .
+   * <finish> . . . . . . . . . . . . . . . . . . . . .
+   *
+   * @endverbatim
+   */
+
+  Extents m_parent_extents;
+  ImageArea m_image_area = ImageArea::DATA;
+  bool m_object_may_exist = false;
+  bool m_copyup_in_progress = false;
+  bool m_guarding_migration_write = false;
+
+  void compute_parent_info();
+
+  void pre_write_object_map_update();
+  void handle_pre_write_object_map_update(int r);
+
+  void write_object();
+  void handle_write_object(int r);
+
+  void copyup();
+
+  void post_write_object_map_update();
+  void handle_post_write_object_map_update(int r);
+
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectWriteRequest(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      ceph::bufferlist&& data, IOContext io_context, int op_flags,
+      int write_flags, std::optional<uint64_t> assert_version,
+      const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            data.length(), io_context, "write",
+                                            parent_trace, completion),
+      m_write_data(std::move(data)), m_op_flags(op_flags),
+      m_write_flags(write_flags), m_assert_version(assert_version) {
+  }
+
+  bool is_empty_write_op() const override {
+    return (m_write_data.length() == 0);
+  }
+
+  const char *get_op_type() const override {
+    return "write";
+  }
+
+protected:
+  void add_write_ops(neorados::WriteOp *wr) override;
+  void add_write_hint(neorados::WriteOp *wr) override;
+
+private:
+  ceph::bufferlist m_write_data;
+  int m_op_flags;
+  int m_write_flags;
+  std::optional<uint64_t> m_assert_version;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectDiscardRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectDiscardRequest(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, IOContext io_context, int discard_flags,
+      const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            object_len, io_context, "discard",
+                                            parent_trace, completion),
+      m_discard_flags(discard_flags) {
+    if (this->m_full_object) {
+      if ((m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE) != 0 &&
+          this->has_parent()) {
+        if (!this->m_copyup_enabled) {
+          // need to hide the parent object instead of child object
+          m_discard_action = DISCARD_ACTION_REMOVE_TRUNCATE;
+        } else {
+          m_discard_action = DISCARD_ACTION_TRUNCATE;
+        }
+      } else {
+        m_discard_action = DISCARD_ACTION_REMOVE;
+      }
+    } else if (object_off + object_len == ictx->layout.object_size) {
+      m_discard_action = DISCARD_ACTION_TRUNCATE;
+    } else {
+      m_discard_action = DISCARD_ACTION_ZERO;
+    }
+  }
+
+  const char* get_op_type() const override {
+    switch (m_discard_action) {
+    case DISCARD_ACTION_REMOVE:
+      return "remove";
+    case DISCARD_ACTION_REMOVE_TRUNCATE:
+      return "remove (create+truncate)";
+    case DISCARD_ACTION_TRUNCATE:
+      return "truncate";
+    case DISCARD_ACTION_ZERO:
+      return "zero";
+    }
+    ceph_abort();
+    return nullptr;
+  }
+
+  uint8_t get_pre_write_object_map_state() const override {
+    if (m_discard_action == DISCARD_ACTION_REMOVE) {
+      return OBJECT_PENDING;
+    }
+    return OBJECT_EXISTS;
+  }
+
+protected:
+  bool is_no_op_for_nonexistent_object() const override {
+    return (!this->has_parent());
+  }
+  bool is_object_map_update_enabled() const override {
+    return (
+      (m_discard_flags & OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE) == 0);
+  }
+  bool is_non_existent_post_write_object_map_state() const override {
+    return (m_discard_action == DISCARD_ACTION_REMOVE);
+  }
+
+  void add_write_hint(neorados::WriteOp *wr) override {
+    // no hint for discard
+  }
+
+  void add_write_ops(neorados::WriteOp *wr) override;
+
+private:
+  enum DiscardAction {
+    DISCARD_ACTION_REMOVE,
+    DISCARD_ACTION_REMOVE_TRUNCATE,
+    DISCARD_ACTION_TRUNCATE,
+    DISCARD_ACTION_ZERO
+  };
+
+  DiscardAction m_discard_action;
+  int m_discard_flags;
+
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectWriteSameRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectWriteSameRequest(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      uint64_t object_len, ceph::bufferlist&& data, IOContext io_context,
+      int op_flags, const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            object_len, io_context, "writesame",
+                                            parent_trace, completion),
+      m_write_data(std::move(data)), m_op_flags(op_flags) {
+  }
+
+  const char *get_op_type() const override {
+    return "writesame";
+  }
+
+protected:
+  void add_write_ops(neorados::WriteOp *wr) override;
+
+private:
+  ceph::bufferlist m_write_data;
+  int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectCompareAndWriteRequest : public AbstractObjectWriteRequest<ImageCtxT> {
+public:
+  ObjectCompareAndWriteRequest(
+      ImageCtxT *ictx, uint64_t object_no, uint64_t object_off,
+      ceph::bufferlist&& cmp_bl, ceph::bufferlist&& write_bl,
+      IOContext io_context, uint64_t *mismatch_offset, int op_flags,
+      const ZTracer::Trace &parent_trace, Context *completion)
+    : AbstractObjectWriteRequest<ImageCtxT>(ictx, object_no, object_off,
+                                            cmp_bl.length(), io_context,
+                                            "compare_and_write", parent_trace,
+                                            completion),
+      m_cmp_bl(std::move(cmp_bl)), m_write_bl(std::move(write_bl)),
+      m_mismatch_offset(mismatch_offset), m_op_flags(op_flags) {
+  }
+
+  const char *get_op_type() const override {
+    return "compare_and_write";
+  }
+
+  void add_copyup_ops(neorados::WriteOp *wr) override {
+    // no-op on copyup
+  }
+
+protected:
+  bool is_post_copyup_write_required() const override {
+    return true;
+  }
+
+  void add_write_ops(neorados::WriteOp *wr) override;
+
+  int filter_write_result(int r) const override;
+
+  Extents get_copyup_overwrite_extents() const override {
+    return {};
+  }
+
+private:
+  ceph::bufferlist m_cmp_bl;
+  ceph::bufferlist m_write_bl;
+  uint64_t *m_mismatch_offset;
+  int m_op_flags;
+};
+
+template <typename ImageCtxT = ImageCtx>
+class ObjectListSnapsRequest : public ObjectRequest<ImageCtxT> {
+public:
+  static ObjectListSnapsRequest* create(
+      ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents,
+      SnapIds&& snap_ids, int list_snaps_flags,
+      const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta,
+      Context *completion) {
+    return new ObjectListSnapsRequest(ictx, objectno,
+                                      std::move(object_extents),
+                                      std::move(snap_ids), list_snaps_flags,
+                                      parent_trace, snapshot_delta, completion);
+  }
+
+  ObjectListSnapsRequest(
+      ImageCtxT *ictx, uint64_t objectno, Extents&& object_extents,
+      SnapIds&& snap_ids, int list_snaps_flags,
+      const ZTracer::Trace &parent_trace, SnapshotDelta* snapshot_delta,
+      Context *completion);
+
+  void send() override;
+
+  const char *get_op_type() const override {
+    return "snap_list";
+  }
+
+private:
+  Extents m_object_extents;
+  SnapIds m_snap_ids;
+  int m_list_snaps_flags;
+  SnapshotDelta* m_snapshot_delta;
+
+  neorados::SnapSet m_snap_set;
+  boost::system::error_code m_ec;
+
+  ImageArea m_image_area = ImageArea::DATA;
+  SnapshotDelta m_parent_snapshot_delta;
+
+  void list_snaps();
+  void handle_list_snaps(int r);
+
+  void list_from_parent();
+  void handle_list_from_parent(int r);
+
+  void zero_extent(uint64_t snap_id, bool dne);
+};
+
+} // namespace io
+} // namespace librbd
+
+extern template class librbd::io::ObjectRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectReadRequest<librbd::ImageCtx>;
+extern template class librbd::io::AbstractObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectDiscardRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectWriteSameRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectCompareAndWriteRequest<librbd::ImageCtx>;
+extern template class librbd::io::ObjectListSnapsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_IO_OBJECT_REQUEST_H
diff --git a/src/librbd/io/QosImageDispatch.cc b/src/librbd/io/QosImageDispatch.cc
new file mode 100644
index 000000000..ea1d5dbb5
--- /dev/null
+++ b/src/librbd/io/QosImageDispatch.cc
@@ -0,0 +1,328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/io/QosImageDispatch.h"
+#include "common/dout.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/io/FlushTracker.h"
+#include
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::io::QosImageDispatch: " << this << " " \
+                           << __func__ << ": "
+
+namespace librbd {
+namespace io {
+
+namespace {
+
+uint64_t get_extent_length(const Extents& extents) {
+  uint64_t length = 0;
+  for (auto& extent : extents) {
+    length += extent.second;
+  }
+  return length;
+}
+
+uint64_t calculate_tokens(bool read_op, uint64_t extent_length, uint64_t flag) {
+  if (read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK) != 0)) {
+    return 0;
+  } else if (!read_op && ((flag & IMAGE_DISPATCH_FLAG_QOS_READ_MASK) != 0)) {
+    return 0;
+  }
+
+  return (((flag & IMAGE_DISPATCH_FLAG_QOS_BPS_MASK) != 0) ? extent_length : 1);
+}
+
+static const std::pair<uint64_t, const char*> throttle_flags[] = {
+  {IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE,       "rbd_qos_iops_throttle"       },
+  {IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE,        "rbd_qos_bps_throttle"        },
+  {IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE,  "rbd_qos_read_iops_throttle"  },
+  {IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE, "rbd_qos_write_iops_throttle" },
+  {IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE,   "rbd_qos_read_bps_throttle"   },
+  {IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE,  "rbd_qos_write_bps_throttle"  }
+};
+
+} // anonymous namespace
+
+template <typename I>
+QosImageDispatch<I>::QosImageDispatch(I* image_ctx)
+  : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker<I>(image_ctx)) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 5) << "ictx=" << image_ctx << dendl;
+
+  SafeTimer *timer;
+  ceph::mutex *timer_lock;
+  ImageCtx::get_timer_instance(cct, &timer, &timer_lock);
+  for (auto [flag, name] : throttle_flags) {
+    m_throttles.emplace_back(
+      flag,
+      new TokenBucketThrottle(cct, name, 0, 0, timer, timer_lock));
+  }
+}
+
+template <typename I>
+QosImageDispatch<I>::~QosImageDispatch() {
+  for (auto t : m_throttles) {
+    delete t.second;
+  }
+}
+
+template <typename I>
+void QosImageDispatch<I>::shut_down(Context* on_finish) {
+  m_flush_tracker->shut_down();
+  on_finish->complete(0);
+}
+
+template <typename I>
+void QosImageDispatch<I>::apply_qos_schedule_tick_min(uint64_t tick) {
+  for (auto pair : m_throttles) {
+    pair.second->set_schedule_tick_min(tick);
+  }
+}
+
+template <typename I>
+void QosImageDispatch<I>::apply_qos_limit(uint64_t flag, uint64_t limit,
+                                          uint64_t burst,
+                                          uint64_t burst_seconds) {
+  auto cct = m_image_ctx->cct;
+  TokenBucketThrottle *throttle = nullptr;
+  for (auto pair : m_throttles) {
+    if (flag == pair.first) {
+      throttle = pair.second;
+      break;
+    }
+  }
+  ceph_assert(throttle != nullptr);
+
+  int r = throttle->set_limit(limit, burst, burst_seconds);
+  if (r < 0) {
<< burst << ") is less than " + << "limit(" << limit << ")" << dendl; + // if apply failed, we should at least make sure the limit works. + throttle->set_limit(limit, 0, 1); + } + + if (limit) { + m_qos_enabled_flag |= flag; + } else { + m_qos_enabled_flag &= ~flag; + } +} + +template +void QosImageDispatch::apply_qos_exclude_ops(uint64_t exclude_ops) { + m_qos_exclude_ops = exclude_ops; +} + +template +bool QosImageDispatch::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_READ) { + return false; + } + + if (needs_throttle(true, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template +bool QosImageDispatch::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_WRITE) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template +bool QosImageDispatch::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_DISCARD) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template +bool QosImageDispatch::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (m_qos_exclude_ops & RBD_IO_OPERATION_WRITE_SAME) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template +bool QosImageDispatch::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if 
(m_qos_exclude_ops & RBD_IO_OPERATION_COMPARE_AND_WRITE) { + return false; + } + + if (needs_throttle(false, image_extents, tid, image_dispatch_flags, + dispatch_result, on_finish, on_dispatched)) { + return true; + } + + return false; +} + +template +bool QosImageDispatch::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_flush_tracker->flush(on_dispatched); + return true; +} + +template +void QosImageDispatch::handle_finished(int r, uint64_t tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + m_flush_tracker->finish_io(tid); +} + +template +bool QosImageDispatch::set_throttle_flag( + std::atomic* image_dispatch_flags, uint32_t flag) { + uint32_t expected = image_dispatch_flags->load(); + uint32_t desired; + do { + desired = expected | flag; + } while (!image_dispatch_flags->compare_exchange_weak(expected, desired)); + + return ((desired & IMAGE_DISPATCH_FLAG_QOS_MASK) == + IMAGE_DISPATCH_FLAG_QOS_MASK); +} + +template +bool QosImageDispatch::needs_throttle( + bool read_op, const Extents& image_extents, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + auto extent_length = get_extent_length(image_extents); + bool all_qos_flags_set = false; + + if (!read_op) { + m_flush_tracker->start_io(tid); + *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) { + handle_finished(r, tid); + on_finish->complete(r); + }); + } + *dispatch_result = DISPATCH_RESULT_CONTINUE; + + auto qos_enabled_flag = m_qos_enabled_flag; + for (auto [flag, throttle] : m_throttles) { + if ((qos_enabled_flag & flag) == 0) { + all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag); + continue; + } + + auto tokens = calculate_tokens(read_op, extent_length, flag); + if (tokens > 0 && + throttle->get(tokens, this, &QosImageDispatch::handle_throttle_ready, + Tag{image_dispatch_flags, on_dispatched}, flag)) { + ldout(cct, 15) << "on_dispatched=" << on_dispatched << ", " + << "flag=" << flag << dendl; + all_qos_flags_set = false; + } else { + all_qos_flags_set = set_throttle_flag(image_dispatch_flags, flag); + } + } + return !all_qos_flags_set; +} + +template +void QosImageDispatch::handle_throttle_ready(Tag&& tag, uint64_t flag) { + auto cct = m_image_ctx->cct; + ldout(cct, 15) << "on_dispatched=" << tag.on_dispatched << ", " + << "flag=" << flag << dendl; + + if (set_throttle_flag(tag.image_dispatch_flags, flag)) { + // timer_lock is held -- so dispatch from outside the timer thread + m_image_ctx->asio_engine->post(tag.on_dispatched, 0); + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::QosImageDispatch; diff --git a/src/librbd/io/QosImageDispatch.h b/src/librbd/io/QosImageDispatch.h new file mode 100644 index 000000000..f5e08940a --- /dev/null +++ b/src/librbd/io/QosImageDispatch.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H + +#include +#include + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include 
"include/buffer.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template class FlushTracker; + +template +class QosImageDispatch : public ImageDispatchInterface { +public: + struct Tag { + std::atomic* image_dispatch_flags; + Context* on_dispatched; + + Tag(std::atomic* image_dispatch_flags, Context* on_dispatched) + : image_dispatch_flags(image_dispatch_flags), + on_dispatched(on_dispatched) { + } + }; + + QosImageDispatch(ImageCtxT* image_ctx); + ~QosImageDispatch() override; + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_QOS; + } + + void shut_down(Context* on_finish) override; + + void apply_qos_schedule_tick_min(uint64_t tick); + void apply_qos_limit(uint64_t flag, uint64_t limit, uint64_t burst, + uint64_t burst_seconds); + void apply_qos_exclude_ops(uint64_t exclude_ops); + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + + std::list > m_throttles; + uint64_t m_qos_enabled_flag = 0; + uint64_t m_qos_exclude_ops = 0; + + std::unique_ptr> m_flush_tracker; + + void handle_finished(int r, uint64_t tid); + + bool set_throttle_flag(std::atomic* image_dispatch_flags, + uint32_t flag); + bool needs_throttle(bool read_op, const 
Extents& image_extents, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched); + void handle_throttle_ready(Tag&& tag, uint64_t flag); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::QosImageDispatch; + +#endif // CEPH_LIBRBD_IO_QOS_IMAGE_DISPATCH_H diff --git a/src/librbd/io/QueueImageDispatch.cc b/src/librbd/io/QueueImageDispatch.cc new file mode 100644 index 000000000..ea5ed63b4 --- /dev/null +++ b/src/librbd/io/QueueImageDispatch.cc @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/QueueImageDispatch.h" +#include "common/dout.h" +#include "common/Cond.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/FlushTracker.h" +#include "librbd/io/ImageDispatchSpec.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::QueueImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template +QueueImageDispatch::QueueImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx), m_flush_tracker(new FlushTracker(image_ctx)) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; +} + +template +QueueImageDispatch::~QueueImageDispatch() { + delete m_flush_tracker; +} + +template +void QueueImageDispatch::shut_down(Context* on_finish) { + m_flush_tracker->shut_down(); + on_finish->complete(0); +} + +template +bool QueueImageDispatch::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(true, tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool QueueImageDispatch::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool QueueImageDispatch::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool QueueImageDispatch::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool 
QueueImageDispatch::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return enqueue(false, tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool QueueImageDispatch::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_flush_tracker->flush(on_dispatched); + return true; +} + +template +void QueueImageDispatch::handle_finished(int r, uint64_t tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + m_flush_tracker->finish_io(tid); +} + +template +bool QueueImageDispatch::enqueue( + bool read_op, uint64_t tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (!m_image_ctx->non_blocking_aio) { + return false; + } + + if (!read_op) { + m_flush_tracker->start_io(tid); + *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) { + handle_finished(r, tid); + on_finish->complete(r); + }); + } + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_image_ctx->asio_engine->post(on_dispatched, 0); + return true; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::QueueImageDispatch; diff --git a/src/librbd/io/QueueImageDispatch.h b/src/librbd/io/QueueImageDispatch.h new file mode 100644 index 000000000..9a41927ba --- /dev/null +++ b/src/librbd/io/QueueImageDispatch.h @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template class FlushTracker; + +template +class QueueImageDispatch : public ImageDispatchInterface { +public: + QueueImageDispatch(ImageCtxT* image_ctx); + ~QueueImageDispatch(); + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_QUEUE; + } + + void shut_down(Context* on_finish) override; + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t 
discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + + FlushTracker* m_flush_tracker; + + void handle_finished(int r, uint64_t tid); + + bool enqueue(bool read_op, uint64_t tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::QueueImageDispatch; + +#endif // CEPH_LIBRBD_IO_QUEUE_IMAGE_DISPATCH_H diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc new file mode 100644 index 000000000..c4053fee6 --- /dev/null +++ b/src/librbd/io/ReadResult.cc @@ -0,0 +1,262 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/ReadResult.h" +#include "include/buffer.h" +#include "common/dout.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/Utils.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +struct ReadResult::SetImageExtentsVisitor : public boost::static_visitor { + Extents image_extents; + + explicit SetImageExtentsVisitor(const Extents& image_extents) + : image_extents(image_extents) { + } + + void operator()(Linear &linear) const { + uint64_t length = util::get_extents_length(image_extents); + + ceph_assert(length <= linear.buf_len); + linear.buf_len = length; + } + + void operator()(SparseBufferlist &sbl) const { + sbl.image_extents = image_extents; + } + + template + void operator()(T &t) const { + } +}; + +struct ReadResult::AssembleResultVisitor : public boost::static_visitor { + CephContext *cct; + Striper::StripedReadResult &destriper; + + AssembleResultVisitor(CephContext *cct, Striper::StripedReadResult &destriper) + : cct(cct), destriper(destriper) { + } + + void operator()(Empty &empty) const { + ldout(cct, 20) << "dropping read result" << dendl; + } + + void operator()(Linear &linear) const { + ldout(cct, 20) << "copying resulting bytes to " 
+ << reinterpret_cast(linear.buf) << dendl; + destriper.assemble_result(cct, linear.buf, linear.buf_len); + } + + void operator()(Vector &vector) const { + bufferlist bl; + destriper.assemble_result(cct, bl, true); + + ldout(cct, 20) << "copying resulting " << bl.length() << " bytes to iovec " + << reinterpret_cast(vector.iov) << dendl; + + bufferlist::iterator it = bl.begin(); + size_t length = bl.length(); + size_t offset = 0; + int idx = 0; + for (; offset < length && idx < vector.iov_count; idx++) { + size_t len = std::min(vector.iov[idx].iov_len, length - offset); + it.copy(len, static_cast(vector.iov[idx].iov_base)); + offset += len; + } + ceph_assert(offset == bl.length()); + } + + void operator()(Bufferlist &bufferlist) const { + bufferlist.bl->clear(); + destriper.assemble_result(cct, *bufferlist.bl, true); + + ldout(cct, 20) << "moved resulting " << bufferlist.bl->length() << " " + << "bytes to bl " << reinterpret_cast(bufferlist.bl) + << dendl; + } + + void operator()(SparseBufferlist &sparse_bufferlist) const { + sparse_bufferlist.bl->clear(); + + ExtentMap buffer_extent_map; + auto buffer_extents_length = destriper.assemble_result( + cct, &buffer_extent_map, sparse_bufferlist.bl); + + ldout(cct, 20) << "image_extents=" + << sparse_bufferlist.image_extents << ", " + << "buffer_extent_map=" << buffer_extent_map << dendl; + + sparse_bufferlist.extent_map->clear(); + sparse_bufferlist.extent_map->reserve(buffer_extent_map.size()); + + // The extent-map is logically addressed by buffer-extents not image- or + // object-extents. Translate this address mapping to image-extent + // logical addressing since it's tied to an image-extent read + uint64_t buffer_offset = 0; + auto bem_it = buffer_extent_map.begin(); + for (auto [image_offset, image_length] : sparse_bufferlist.image_extents) { + while (bem_it != buffer_extent_map.end()) { + auto [buffer_extent_offset, buffer_extent_length] = *bem_it; + + if (buffer_offset + image_length <= buffer_extent_offset) { + // skip any image extent that is not included in the results + break; + } + + // current buffer-extent should be within the current image-extent + ceph_assert(buffer_offset <= buffer_extent_offset && + buffer_offset + image_length >= + buffer_extent_offset + buffer_extent_length); + auto image_extent_offset = + image_offset + (buffer_extent_offset - buffer_offset); + ldout(cct, 20) << "mapping buffer extent " << buffer_extent_offset + << "~" << buffer_extent_length << " to image extent " + << image_extent_offset << "~" << buffer_extent_length + << dendl; + sparse_bufferlist.extent_map->emplace_back( + image_extent_offset, buffer_extent_length); + ++bem_it; + } + + buffer_offset += image_length; + } + ceph_assert(buffer_offset == buffer_extents_length); + ceph_assert(bem_it == buffer_extent_map.end()); + + ldout(cct, 20) << "moved resulting " << *sparse_bufferlist.extent_map + << " extents of total " << sparse_bufferlist.bl->length() + << " bytes to bl " + << reinterpret_cast(sparse_bufferlist.bl) << dendl; + } +}; + +ReadResult::C_ImageReadRequest::C_ImageReadRequest( + AioCompletion *aio_completion, uint64_t buffer_offset, + const Extents image_extents) + : aio_completion(aio_completion), buffer_offset(buffer_offset), + image_extents(image_extents) { + aio_completion->add_request(); +} + +void ReadResult::C_ImageReadRequest::finish(int r) { + CephContext *cct = aio_completion->ictx->cct; + ldout(cct, 10) << "C_ImageReadRequest: r=" << r + << dendl; + if (r >= 0 || (ignore_enoent && r == -ENOENT)) { + 
striper::LightweightBufferExtents buffer_extents; + size_t length = 0; + for (auto &image_extent : image_extents) { + buffer_extents.emplace_back(buffer_offset + length, image_extent.second); + length += image_extent.second; + } + ceph_assert(r == -ENOENT || length == bl.length()); + + aio_completion->lock.lock(); + aio_completion->read_result.m_destriper.add_partial_result( + cct, std::move(bl), buffer_extents); + aio_completion->lock.unlock(); + r = length; + } + + aio_completion->complete_request(r); +} + +ReadResult::C_ObjectReadRequest::C_ObjectReadRequest( + AioCompletion *aio_completion, ReadExtents&& extents) + : aio_completion(aio_completion), extents(std::move(extents)) { + aio_completion->add_request(); +} + +void ReadResult::C_ObjectReadRequest::finish(int r) { + CephContext *cct = aio_completion->ictx->cct; + ldout(cct, 10) << "C_ObjectReadRequest: r=" << r + << dendl; + + if (r == -ENOENT) { + r = 0; + } + if (r >= 0) { + uint64_t object_len = 0; + aio_completion->lock.lock(); + for (auto& extent: extents) { + ldout(cct, 10) << " got " << extent.extent_map + << " for " << extent.buffer_extents + << " bl " << extent.bl.length() << dendl; + + aio_completion->read_result.m_destriper.add_partial_sparse_result( + cct, std::move(extent.bl), extent.extent_map, extent.offset, + extent.buffer_extents); + + object_len += extent.length; + } + aio_completion->lock.unlock(); + r = object_len; + } + + aio_completion->complete_request(r); +} + +ReadResult::C_ObjectReadMergedExtents::C_ObjectReadMergedExtents( + CephContext* cct, ReadExtents* extents, Context* on_finish) + : cct(cct), extents(extents), on_finish(on_finish) { +} + +void ReadResult::C_ObjectReadMergedExtents::finish(int r) { + if (r >= 0) { + for (auto& extent: *extents) { + if (bl.length() < extent.length) { + lderr(cct) << "Merged extents length is less than expected" << dendl; + r = -EIO; + break; + } + bl.splice(0, extent.length, &extent.bl); + } + if (bl.length() != 0) { + lderr(cct) << "Merged extents length is greater than expected" << dendl; + r = -EIO; + } + } + on_finish->complete(r); +} + +ReadResult::ReadResult() : m_buffer(Empty()) { +} + +ReadResult::ReadResult(char *buf, size_t buf_len) + : m_buffer(Linear(buf, buf_len)) { +} + +ReadResult::ReadResult(const struct iovec *iov, int iov_count) + : m_buffer(Vector(iov, iov_count)) { +} + +ReadResult::ReadResult(ceph::bufferlist *bl) + : m_buffer(Bufferlist(bl)) { +} + +ReadResult::ReadResult(Extents* extent_map, ceph::bufferlist* bl) + : m_buffer(SparseBufferlist(extent_map, bl)) { +} + +void ReadResult::set_image_extents(const Extents& image_extents) { + boost::apply_visitor(SetImageExtentsVisitor(image_extents), m_buffer); +} + +void ReadResult::assemble_result(CephContext *cct) { + boost::apply_visitor(AssembleResultVisitor(cct, m_destriper), m_buffer); +} + +} // namespace io +} // namespace librbd + diff --git a/src/librbd/io/ReadResult.h b/src/librbd/io/ReadResult.h new file mode 100644 index 000000000..12a1e78cc --- /dev/null +++ b/src/librbd/io/ReadResult.h @@ -0,0 +1,129 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_READ_RESULT_H +#define CEPH_LIBRBD_IO_READ_RESULT_H + +#include "include/common_fwd.h" +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "librbd/io/Types.h" +#include "osdc/Striper.h" +#include +#include + + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; +template struct 
ObjectReadRequest; + +class ReadResult { +public: + struct C_ImageReadRequest : public Context { + AioCompletion *aio_completion; + uint64_t buffer_offset = 0; + Extents image_extents; + bufferlist bl; + bool ignore_enoent = false; + + C_ImageReadRequest(AioCompletion *aio_completion, + uint64_t buffer_offset, + const Extents image_extents); + + void finish(int r) override; + }; + + struct C_ObjectReadRequest : public Context { + AioCompletion *aio_completion; + ReadExtents extents; + + C_ObjectReadRequest(AioCompletion *aio_completion, ReadExtents&& extents); + + void finish(int r) override; + }; + + struct C_ObjectReadMergedExtents : public Context { + CephContext* cct; + ReadExtents* extents; + Context *on_finish; + bufferlist bl; + + C_ObjectReadMergedExtents(CephContext* cct, ReadExtents* extents, + Context* on_finish); + + void finish(int r) override; + }; + + ReadResult(); + ReadResult(char *buf, size_t buf_len); + ReadResult(const struct iovec *iov, int iov_count); + ReadResult(ceph::bufferlist *bl); + ReadResult(Extents* extent_map, ceph::bufferlist* bl); + + void set_image_extents(const Extents& image_extents); + + void assemble_result(CephContext *cct); + +private: + struct Empty { + }; + + struct Linear { + char *buf; + size_t buf_len; + + Linear(char *buf, size_t buf_len) : buf(buf), buf_len(buf_len) { + } + }; + + struct Vector { + const struct iovec *iov; + int iov_count; + + Vector(const struct iovec *iov, int iov_count) + : iov(iov), iov_count(iov_count) { + } + }; + + struct Bufferlist { + ceph::bufferlist *bl; + + Bufferlist(ceph::bufferlist *bl) : bl(bl) { + } + }; + + struct SparseBufferlist { + Extents *extent_map; + ceph::bufferlist *bl; + + Extents image_extents; + + SparseBufferlist(Extents* extent_map, ceph::bufferlist* bl) + : extent_map(extent_map), bl(bl) { + } + }; + + typedef boost::variant Buffer; + struct SetImageExtentsVisitor; + struct AssembleResultVisitor; + + Buffer m_buffer; + Striper::StripedReadResult m_destriper; + +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_READ_RESULT_H + diff --git a/src/librbd/io/RefreshImageDispatch.cc b/src/librbd/io/RefreshImageDispatch.cc new file mode 100644 index 000000000..3141faf25 --- /dev/null +++ b/src/librbd/io/RefreshImageDispatch.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/RefreshImageDispatch.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::RefreshImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template +RefreshImageDispatch::RefreshImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; +} + +template +void RefreshImageDispatch::shut_down(Context* on_finish) { + on_finish->complete(0); +} + +template +bool RefreshImageDispatch::read( + AioCompletion* aio_comp, Extents &&image_extents, ReadResult &&read_result, + IOContext io_context, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { 
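+ // needs_refresh() has already parked this IO: it set *dispatch_result to + // DISPATCH_RESULT_CONTINUE and handed on_dispatched to ImageState::refresh(), + // so the request resumes through the remaining dispatch layers only after + // the refresh completes.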
+ return true; + + return false; +} + +template +bool RefreshImageDispatch::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool RefreshImageDispatch::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool RefreshImageDispatch::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool RefreshImageDispatch::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << ", image_extents=" << image_extents + << dendl; + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool RefreshImageDispatch::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + // The refresh state machine can initiate a flush and it can + // enable the exclusive-lock which will also attempt to flush.
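+ // Passing those internally generated flushes back through needs_refresh() + // could re-enter the refresh state machine, so the sources below bypass the + // refresh check; all other flush sources stay gated on a pending refresh.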
+ if (flush_source == FLUSH_SOURCE_REFRESH || + flush_source == FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH || + flush_source == FLUSH_SOURCE_SHUTDOWN) { + return false; + } + + if (needs_refresh(dispatch_result, on_dispatched)) { + return true; + } + + return false; +} + +template +bool RefreshImageDispatch::needs_refresh( + DispatchResult* dispatch_result, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + + if (m_image_ctx->state->is_refresh_required()) { + ldout(cct, 15) << "on_dispatched=" << on_dispatched << dendl; + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_image_ctx->state->refresh(on_dispatched); + return true; + } + + return false; +} + +} // namespace io +} // namespace librbd + +template class librbd::io::RefreshImageDispatch; diff --git a/src/librbd/io/RefreshImageDispatch.h b/src/librbd/io/RefreshImageDispatch.h new file mode 100644 index 000000000..668dec419 --- /dev/null +++ b/src/librbd/io/RefreshImageDispatch.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template +class RefreshImageDispatch : public ImageDispatchInterface { +public: + RefreshImageDispatch(ImageCtxT* image_ctx); + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_REFRESH; + } + + void shut_down(Context* on_finish) override; + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) 
override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + + bool needs_refresh(DispatchResult* dispatch_result, Context* on_dispatched); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::RefreshImageDispatch; + +#endif // CEPH_LIBRBD_IO_REFRESH_IMAGE_DISPATCH_H diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.cc b/src/librbd/io/SimpleSchedulerObjectDispatch.cc new file mode 100644 index 000000000..cd2ffb197 --- /dev/null +++ b/src/librbd/io/SimpleSchedulerObjectDispatch.cc @@ -0,0 +1,565 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/SimpleSchedulerObjectDispatch.h" +#include "include/neorados/RADOS.hpp" +#include "common/ceph_time.h" +#include "common/Timer.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/FlushTracker.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcher.h" +#include "librbd/io/Utils.h" + +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::SimpleSchedulerObjectDispatch: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace io { + +using namespace boost::accumulators; +using ceph::operator<<; +using librbd::util::data_object_name; + +static const int LATENCY_STATS_WINDOW_SIZE = 10; + +class LatencyStats { +private: + accumulator_set> m_acc; + +public: + LatencyStats() + : m_acc(tag::rolling_window::window_size = LATENCY_STATS_WINDOW_SIZE) { + } + + bool is_ready() const { + return rolling_count(m_acc) == LATENCY_STATS_WINDOW_SIZE; + } + + void add(uint64_t latency) { + m_acc(latency); + } + + uint64_t avg() const { + auto count = rolling_count(m_acc); + + if (count > 0) { + return rolling_sum(m_acc); + } + return 0; + } +}; + +template +bool SimpleSchedulerObjectDispatch::ObjectRequests::try_delay_request( + uint64_t object_off, ceph::bufferlist&& data, IOContext io_context, + int op_flags, int object_dispatch_flags, Context* on_dispatched) { + if (!m_delayed_requests.empty()) { + if (!m_io_context || *m_io_context != *io_context || + op_flags != m_op_flags || data.length() == 0 || + intersects(object_off, data.length())) { + return false; + } + } else { + m_io_context = io_context; + m_op_flags = op_flags; + } + + if (data.length() == 0) { + // a zero length write is usually a special case, + // and we don't want it to be merged with others + ceph_assert(m_delayed_requests.empty()); + m_delayed_request_extents.insert(0, UINT64_MAX); + } else { + m_delayed_request_extents.insert(object_off, data.length()); + } + m_object_dispatch_flags |= object_dispatch_flags; + + if (!m_delayed_requests.empty()) { + // try to merge front to an existing request + auto iter = m_delayed_requests.find(object_off + data.length()); + if (iter != m_delayed_requests.end()) { + auto new_iter = m_delayed_requests.insert({object_off, {}}).first; + new_iter->second.data = std::move(data); + 
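// Front-merge sketch (illustrative offsets): with a delayed request already + // queued at [4096, 8192), a new write at [0, 4096) ends exactly at the queued + // request's offset, so the append below folds both into a single [0, 8192) + // request keyed by the new, lower offset. +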
new_iter->second.data.append(std::move(iter->second.data)); + new_iter->second.requests = std::move(iter->second.requests); + new_iter->second.requests.push_back(on_dispatched); + m_delayed_requests.erase(iter); + + if (new_iter != m_delayed_requests.begin()) { + auto prev = new_iter; + try_merge_delayed_requests(--prev, new_iter); + } + return true; + } + + // try to merge back to an existing request + iter = m_delayed_requests.lower_bound(object_off); + if (iter != m_delayed_requests.begin() && + (iter == m_delayed_requests.end() || iter->first > object_off)) { + iter--; + } + if (iter != m_delayed_requests.end() && + iter->first + iter->second.data.length() == object_off) { + iter->second.data.append(std::move(data)); + iter->second.requests.push_back(on_dispatched); + + auto next = iter; + if (++next != m_delayed_requests.end()) { + try_merge_delayed_requests(iter, next); + } + return true; + } + } + + // create a new request + auto iter = m_delayed_requests.insert({object_off, {}}).first; + iter->second.data = std::move(data); + iter->second.requests.push_back(on_dispatched); + return true; +} + +template +void SimpleSchedulerObjectDispatch::ObjectRequests::try_merge_delayed_requests( + typename std::map::iterator &iter1, + typename std::map::iterator &iter2) { + if (iter1->first + iter1->second.data.length() != iter2->first) { + return; + } + + iter1->second.data.append(std::move(iter2->second.data)); + iter1->second.requests.insert(iter1->second.requests.end(), + iter2->second.requests.begin(), + iter2->second.requests.end()); + m_delayed_requests.erase(iter2); +} + +template +void SimpleSchedulerObjectDispatch::ObjectRequests::dispatch_delayed_requests( + I *image_ctx, LatencyStats *latency_stats, ceph::mutex *latency_stats_lock) { + for (auto &it : m_delayed_requests) { + auto offset = it.first; + auto &merged_requests = it.second; + + auto ctx = new LambdaContext( + [requests=std::move(merged_requests.requests), latency_stats, + latency_stats_lock, start_time=ceph_clock_now()](int r) { + if (latency_stats) { + std::lock_guard locker{*latency_stats_lock}; + auto latency = ceph_clock_now() - start_time; + latency_stats->add(latency.to_nsec()); + } + for (auto on_dispatched : requests) { + on_dispatched->complete(r); + } + }); + + auto req = ObjectDispatchSpec::create_write( + image_ctx, OBJECT_DISPATCH_LAYER_SCHEDULER, + m_object_no, offset, std::move(merged_requests.data), m_io_context, + m_op_flags, 0, std::nullopt, 0, {}, ctx); + + req->object_dispatch_flags = m_object_dispatch_flags; + req->send(); + } + + m_dispatch_time = {}; +} + +template +SimpleSchedulerObjectDispatch::SimpleSchedulerObjectDispatch( + I* image_ctx) + : m_image_ctx(image_ctx), + m_flush_tracker(new FlushTracker(image_ctx)), + m_lock(ceph::make_mutex(librbd::util::unique_lock_name( + "librbd::io::SimpleSchedulerObjectDispatch::lock", this))), + m_max_delay(image_ctx->config.template get_val( + "rbd_io_scheduler_simple_max_delay")) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; + + I::get_timer_instance(cct, &m_timer, &m_timer_lock); + + if (m_max_delay == 0) { + m_latency_stats = std::make_unique(); + } +} + +template +SimpleSchedulerObjectDispatch::~SimpleSchedulerObjectDispatch() { + delete m_flush_tracker; +} + +template +void SimpleSchedulerObjectDispatch::init() { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + // add ourself to the IO object dispatcher chain + m_image_ctx->io_object_dispatcher->register_dispatch(this); +} + +template +void 
SimpleSchedulerObjectDispatch::shut_down(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << dendl; + + m_flush_tracker->shut_down(); + on_finish->complete(0); +} + +template +bool SimpleSchedulerObjectDispatch::read( + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " << extents + << dendl; + + std::lock_guard locker{m_lock}; + for (auto& extent : *extents) { + if (intersects(object_no, extent.offset, extent.length)) { + dispatch_delayed_requests(object_no); + break; + } + } + + return false; +} + +template +bool SimpleSchedulerObjectDispatch::discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, {}, on_finish); + + return false; +} + +template +bool SimpleSchedulerObjectDispatch::write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << data.length() << dendl; + + std::lock_guard locker{m_lock}; + + // don't try to batch assert version writes + if (assert_version.has_value() || + (write_flags & OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE) != 0) { + dispatch_delayed_requests(object_no); + return false; + } + + if (try_delay_write(object_no, object_off, std::move(data), io_context, + op_flags, *object_dispatch_flags, on_dispatched)) { + + auto dispatch_seq = ++m_dispatch_seq; + m_flush_tracker->start_io(dispatch_seq); + *on_finish = new LambdaContext( + [this, dispatch_seq, ctx=*on_finish](int r) { + ctx->complete(r); + m_flush_tracker->finish_io(dispatch_seq); + }); + + *dispatch_result = DISPATCH_RESULT_COMPLETE; + return true; + } + + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, ceph_clock_now(), on_finish); + + return false; +} + +template +bool SimpleSchedulerObjectDispatch::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, {}, on_finish); + + return false; +} + +template +bool 
SimpleSchedulerObjectDispatch::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << cmp_data.length() << dendl; + + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + register_in_flight_request(object_no, {}, on_finish); + + return false; +} + +template +bool SimpleSchedulerObjectDispatch::flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + { + std::lock_guard locker{m_lock}; + dispatch_all_delayed_requests(); + } + + *dispatch_result = DISPATCH_RESULT_CONTINUE; + m_flush_tracker->flush(on_dispatched); + + return true; +} + +template +bool SimpleSchedulerObjectDispatch::intersects( + uint64_t object_no, uint64_t object_off, uint64_t len) const { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + auto it = m_requests.find(object_no); + bool intersects = (it != m_requests.end()) && + it->second->intersects(object_off, len); + + ldout(cct, 20) << intersects << dendl; + + return intersects; +} + +template +bool SimpleSchedulerObjectDispatch::try_delay_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int object_dispatch_flags, + Context* on_dispatched) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + if (m_latency_stats && !m_latency_stats->is_ready()) { + ldout(cct, 20) << "latency stats not collected yet" << dendl; + return false; + } + + auto it = m_requests.find(object_no); + if (it == m_requests.end()) { + ldout(cct, 20) << "no pending requests" << dendl; + return false; + } + + auto &object_requests = it->second; + bool delayed = object_requests->try_delay_request( + object_off, std::move(data), io_context, op_flags, object_dispatch_flags, + on_dispatched); + + ldout(cct, 20) << "delayed: " << delayed << dendl; + + // schedule dispatch on the first request added + if (delayed && !object_requests->is_scheduled_dispatch()) { + auto dispatch_time = ceph::real_clock::now(); + if (m_latency_stats) { + dispatch_time += std::chrono::nanoseconds(m_latency_stats->avg() / 2); + } else { + dispatch_time += std::chrono::milliseconds(m_max_delay); + } + object_requests->set_scheduled_dispatch(dispatch_time); + m_dispatch_queue.push_back(object_requests); + if (m_dispatch_queue.front() == object_requests) { + schedule_dispatch_delayed_requests(); + } + } + + return delayed; +} + +template +void SimpleSchedulerObjectDispatch::dispatch_all_delayed_requests() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + while (!m_requests.empty()) { + auto it = m_requests.begin(); + dispatch_delayed_requests(it->second); + m_requests.erase(it); + } +} + +template +void SimpleSchedulerObjectDispatch::register_in_flight_request( + uint64_t object_no, const utime_t &start_time, Context **on_finish) { + auto res = m_requests.insert( + {object_no, std::make_shared(object_no)}); + ceph_assert(res.second); + 
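// Each in-flight request is tagged with a monotonically increasing + // dispatch_seq and tracked by the flush tracker; the wrapped on_finish + // below compares the recorded sequence to tell whether the tracked entry + // still belongs to this request before dispatching the object's remaining + // delayed writes. +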
auto it = res.first; + + auto dispatch_seq = ++m_dispatch_seq; + m_flush_tracker->start_io(dispatch_seq); + + it->second->set_dispatch_seq(dispatch_seq); + *on_finish = new LambdaContext( + [this, object_no, dispatch_seq, start_time, ctx=*on_finish](int r) { + ctx->complete(r); + + std::unique_lock locker{m_lock}; + if (m_latency_stats && start_time != utime_t()) { + auto latency = ceph_clock_now() - start_time; + m_latency_stats->add(latency.to_nsec()); + } + + auto it = m_requests.find(object_no); + if (it == m_requests.end() || + it->second->get_dispatch_seq() != dispatch_seq) { + ldout(m_image_ctx->cct, 20) << "already dispatched" << dendl; + } else { + dispatch_delayed_requests(it->second); + m_requests.erase(it); + } + locker.unlock(); + + m_flush_tracker->finish_io(dispatch_seq); + }); +} + +template +void SimpleSchedulerObjectDispatch::dispatch_delayed_requests( + uint64_t object_no) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + auto it = m_requests.find(object_no); + if (it == m_requests.end()) { + ldout(cct, 20) << "object_no=" << object_no << ": not found" << dendl; + return; + } + + dispatch_delayed_requests(it->second); + m_requests.erase(it); +} + +template +void SimpleSchedulerObjectDispatch::dispatch_delayed_requests( + ObjectRequestsRef object_requests) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + ldout(cct, 20) << "object_no=" << object_requests->get_object_no() << ", " + << object_requests->delayed_requests_size() << " requests, " + << "dispatch_time=" << object_requests->get_dispatch_time() + << dendl; + + if (!object_requests->is_scheduled_dispatch()) { + return; + } + + object_requests->dispatch_delayed_requests(m_image_ctx, m_latency_stats.get(), + &m_lock); + + ceph_assert(!m_dispatch_queue.empty()); + if (m_dispatch_queue.front() == object_requests) { + m_dispatch_queue.pop_front(); + schedule_dispatch_delayed_requests(); + } +} + +template +void SimpleSchedulerObjectDispatch::schedule_dispatch_delayed_requests() { + ceph_assert(ceph_mutex_is_locked(m_lock)); + auto cct = m_image_ctx->cct; + + std::lock_guard timer_locker{*m_timer_lock}; + + if (m_timer_task != nullptr) { + ldout(cct, 20) << "canceling task " << m_timer_task << dendl; + + bool canceled = m_timer->cancel_event(m_timer_task); + ceph_assert(canceled); + m_timer_task = nullptr; + } + + if (m_dispatch_queue.empty()) { + ldout(cct, 20) << "nothing to schedule" << dendl; + return; + } + + auto object_requests = m_dispatch_queue.front().get(); + + while (!object_requests->is_scheduled_dispatch()) { + ldout(cct, 20) << "garbage collecting " << object_requests << dendl; + m_dispatch_queue.pop_front(); + + if (m_dispatch_queue.empty()) { + ldout(cct, 20) << "nothing to schedule" << dendl; + return; + } + object_requests = m_dispatch_queue.front().get(); + } + + m_timer_task = new LambdaContext( + [this, object_no=object_requests->get_object_no()](int r) { + ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "running timer task " << m_timer_task << dendl; + + m_timer_task = nullptr; + m_image_ctx->asio_engine->post( + [this, object_no]() { + std::lock_guard locker{m_lock}; + dispatch_delayed_requests(object_no); + }); + }); + + ldout(cct, 20) << "scheduling task " << m_timer_task << " at " + << object_requests->get_dispatch_time() << dendl; + + m_timer->add_event_at(object_requests->get_dispatch_time(), m_timer_task); +} + +} // namespace io +} // namespace librbd + +template class 
librbd::io::SimpleSchedulerObjectDispatch; diff --git a/src/librbd/io/SimpleSchedulerObjectDispatch.h b/src/librbd/io/SimpleSchedulerObjectDispatch.h new file mode 100644 index 000000000..ca8a57f3a --- /dev/null +++ b/src/librbd/io/SimpleSchedulerObjectDispatch.h @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H +#define CEPH_LIBRBD_IO_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H + +#include "common/ceph_mutex.h" +#include "include/interval_set.h" +#include "include/utime.h" + +#include "librbd/io/ObjectDispatchInterface.h" +#include "librbd/io/TypeTraits.h" + +#include +#include +#include + +namespace librbd { + +class ImageCtx; + +namespace io { + +template class FlushTracker; +class LatencyStats; + +/** + * Simple scheduler plugin for object dispatcher layer. + */ +template +class SimpleSchedulerObjectDispatch : public ObjectDispatchInterface { +private: + // mock unit testing support + typedef ::librbd::io::TypeTraits TypeTraits; + typedef typename TypeTraits::SafeTimer SafeTimer; +public: + static SimpleSchedulerObjectDispatch* create(ImageCtxT* image_ctx) { + return new SimpleSchedulerObjectDispatch(image_ctx); + } + + SimpleSchedulerObjectDispatch(ImageCtxT* image_ctx); + ~SimpleSchedulerObjectDispatch() override; + + ObjectDispatchLayer get_dispatch_layer() const override { + return OBJECT_DISPATCH_LAYER_SCHEDULER; + } + + void init(); + void shut_down(Context* on_finish) override; + + bool read( + uint64_t object_no, ReadExtents* extents, IOContext io_context, + int op_flags, int read_flags, const ZTracer::Trace &parent_trace, + uint64_t* version, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool flush( + FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) override; + + bool list_snaps( + uint64_t object_no, io::Extents&& extents, SnapIds&& snap_ids, + int list_snap_flags, const 
ZTracer::Trace &parent_trace, + SnapshotDelta* snapshot_delta, int* object_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + + bool invalidate_cache(Context* on_finish) override { + return false; + } + bool reset_existence_cache(Context* on_finish) override { + return false; + } + + void extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) override { + } + + int prepare_copyup( + uint64_t object_no, + SnapshotSparseBufferlist* snapshot_sparse_bufferlist) override { + return 0; + } + +private: + struct MergedRequests { + ceph::bufferlist data; + std::list requests; + }; + + class ObjectRequests { + public: + using clock_t = ceph::real_clock; + + ObjectRequests(uint64_t object_no) : m_object_no(object_no) { + } + + uint64_t get_object_no() const { + return m_object_no; + } + + void set_dispatch_seq(uint64_t dispatch_seq) { + m_dispatch_seq = dispatch_seq; + } + + uint64_t get_dispatch_seq() const { + return m_dispatch_seq; + } + + clock_t::time_point get_dispatch_time() const { + return m_dispatch_time; + } + + void set_scheduled_dispatch(const clock_t::time_point &dispatch_time) { + m_dispatch_time = dispatch_time; + } + + bool is_scheduled_dispatch() const { + return !clock_t::is_zero(m_dispatch_time); + } + + size_t delayed_requests_size() const { + return m_delayed_requests.size(); + } + + bool intersects(uint64_t object_off, uint64_t len) const { + return m_delayed_request_extents.intersects(object_off, len); + } + + bool try_delay_request(uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + int object_dispatch_flags, Context* on_dispatched); + + void dispatch_delayed_requests(ImageCtxT *image_ctx, + LatencyStats *latency_stats, + ceph::mutex *latency_stats_lock); + + private: + uint64_t m_object_no; + uint64_t m_dispatch_seq = 0; + clock_t::time_point m_dispatch_time; + IOContext m_io_context; + int m_op_flags = 0; + int m_object_dispatch_flags = 0; + std::map m_delayed_requests; + interval_set m_delayed_request_extents; + + void try_merge_delayed_requests( + typename std::map::iterator &iter, + typename std::map::iterator &iter2); + }; + + typedef std::shared_ptr ObjectRequestsRef; + typedef std::map Requests; + + ImageCtxT *m_image_ctx; + + FlushTracker* m_flush_tracker; + + ceph::mutex m_lock; + SafeTimer *m_timer; + ceph::mutex *m_timer_lock; + uint64_t m_max_delay; + uint64_t m_dispatch_seq = 0; + + Requests m_requests; + std::list m_dispatch_queue; + Context *m_timer_task = nullptr; + std::unique_ptr m_latency_stats; + + bool try_delay_write(uint64_t object_no, uint64_t object_off, + ceph::bufferlist&& data, IOContext io_context, + int op_flags, int object_dispatch_flags, + Context* on_dispatched); + bool intersects(uint64_t object_no, uint64_t object_off, uint64_t len) const; + + void dispatch_all_delayed_requests(); + void dispatch_delayed_requests(uint64_t object_no); + void dispatch_delayed_requests(ObjectRequestsRef object_requests); + void register_in_flight_request(uint64_t object_no, const utime_t &start_time, + Context** on_finish); + + void schedule_dispatch_delayed_requests(); +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::SimpleSchedulerObjectDispatch; + +#endif // CEPH_LIBRBD_CACHE_SIMPLE_SCHEDULER_OBJECT_DISPATCH_H diff --git a/src/librbd/io/TypeTraits.h b/src/librbd/io/TypeTraits.h new file mode 100644 index 000000000..2f3a6b7ef --- 
/dev/null +++ b/src/librbd/io/TypeTraits.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_TYPE_TRAITS_H +#define CEPH_LIBRBD_IO_TYPE_TRAITS_H + +#include "common/Timer.h" + +namespace librbd { +namespace io { + +template +struct TypeTraits { + typedef ::SafeTimer SafeTimer; +}; + +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_TYPE_TRAITS_H diff --git a/src/librbd/io/Types.cc b/src/librbd/io/Types.cc new file mode 100644 index 000000000..19fcc6b89 --- /dev/null +++ b/src/librbd/io/Types.cc @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/Types.h" +#include + +namespace librbd { +namespace io { + +const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS{0, 0}; + +std::ostream& operator<<(std::ostream& os, SparseExtentState state) { + switch (state) { + case SPARSE_EXTENT_STATE_DNE: + os << "dne"; + break; + case SPARSE_EXTENT_STATE_ZEROED: + os << "zeroed"; + break; + case SPARSE_EXTENT_STATE_DATA: + os << "data"; + break; + default: + ceph_abort(); + break; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const SparseExtent& se) { + os << "[" + << "state=" << se.state << ", " + << "length=" << se.length << "]"; + return os; +} + +std::ostream& operator<<(std::ostream& os, ImageArea area) { + switch (area) { + case ImageArea::DATA: + return os << "data"; + case ImageArea::CRYPTO_HEADER: + return os << "crypto_header"; + default: + ceph_abort(); + } +} + +} // namespace io +} // namespace librbd diff --git a/src/librbd/io/Types.h b/src/librbd/io/Types.h new file mode 100644 index 000000000..7c70986c5 --- /dev/null +++ b/src/librbd/io/Types.h @@ -0,0 +1,328 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_TYPES_H +#define CEPH_LIBRBD_IO_TYPES_H + +#include "include/int_types.h" +#include "include/rados/rados_types.hpp" +#include "common/interval_map.h" +#include "osdc/StriperTypes.h" +#include +#include +#include + +struct Context; + +namespace librbd { +namespace io { + +typedef enum { + AIO_TYPE_NONE = 0, + AIO_TYPE_GENERIC, + AIO_TYPE_OPEN, + AIO_TYPE_CLOSE, + AIO_TYPE_READ, + AIO_TYPE_WRITE, + AIO_TYPE_DISCARD, + AIO_TYPE_FLUSH, + AIO_TYPE_WRITESAME, + AIO_TYPE_COMPARE_AND_WRITE, +} aio_type_t; + +enum FlushSource { + FLUSH_SOURCE_USER, + FLUSH_SOURCE_INTERNAL, + FLUSH_SOURCE_SHUTDOWN, + FLUSH_SOURCE_EXCLUSIVE_LOCK, + FLUSH_SOURCE_EXCLUSIVE_LOCK_SKIP_REFRESH, + FLUSH_SOURCE_REFRESH, + FLUSH_SOURCE_WRITEBACK, + FLUSH_SOURCE_WRITE_BLOCK, +}; + +enum Direction { + DIRECTION_READ, + DIRECTION_WRITE, + DIRECTION_BOTH +}; + +enum DispatchResult { + DISPATCH_RESULT_INVALID, + DISPATCH_RESULT_RESTART, + DISPATCH_RESULT_CONTINUE, + DISPATCH_RESULT_COMPLETE +}; + +enum ImageDispatchLayer { + IMAGE_DISPATCH_LAYER_NONE = 0, + IMAGE_DISPATCH_LAYER_API_START = IMAGE_DISPATCH_LAYER_NONE, + IMAGE_DISPATCH_LAYER_QUEUE, + IMAGE_DISPATCH_LAYER_QOS, + IMAGE_DISPATCH_LAYER_EXCLUSIVE_LOCK, + IMAGE_DISPATCH_LAYER_REFRESH, + IMAGE_DISPATCH_LAYER_INTERNAL_START = IMAGE_DISPATCH_LAYER_REFRESH, + IMAGE_DISPATCH_LAYER_MIGRATION, + IMAGE_DISPATCH_LAYER_JOURNAL, + IMAGE_DISPATCH_LAYER_WRITE_BLOCK, + IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, + IMAGE_DISPATCH_LAYER_CRYPTO, + IMAGE_DISPATCH_LAYER_CORE, + IMAGE_DISPATCH_LAYER_LAST +}; + +enum { + IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE = 1 << 0, + IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE 
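+// TypeTraits above exists purely as a seam for unit testing: production
+// code only ever names TypeTraits<ImageCtxT>::SafeTimer, so a test harness
+// can swap in a mock timer by specializing the trait for its mock image
+// context. A sketch of such a specialization (the mock type names here are
+// hypothetical; the actual test fixtures may differ):
+//
+//   namespace librbd {
+//   namespace io {
+//
+//   template <>
+//   struct TypeTraits<MockTestImageCtx> {
+//     typedef ::MockSafeTimer SafeTimer;  // hypothetical mock timer
+//   };
+//
+//   } // namespace io
+//   } // namespace librbd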
= 1 << 1, + IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE = 1 << 2, + IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE = 1 << 3, + IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE = 1 << 4, + IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE = 1 << 5, + IMAGE_DISPATCH_FLAG_QOS_BPS_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_BPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_READ_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_READ_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_READ_BPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_WRITE_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_WRITE_IOPS_THROTTLE | + IMAGE_DISPATCH_FLAG_QOS_WRITE_BPS_THROTTLE), + IMAGE_DISPATCH_FLAG_QOS_MASK = ( + IMAGE_DISPATCH_FLAG_QOS_BPS_MASK | + IMAGE_DISPATCH_FLAG_QOS_IOPS_MASK), + + // TODO: pass area through ImageDispatchInterface and remove + // this flag + IMAGE_DISPATCH_FLAG_CRYPTO_HEADER = 1 << 6 +}; + +enum { + RBD_IO_OPERATIONS_DEFAULT = 0, + RBD_IO_OPERATION_READ = 1 << 0, + RBD_IO_OPERATION_WRITE = 1 << 1, + RBD_IO_OPERATION_DISCARD = 1 << 2, + RBD_IO_OPERATION_WRITE_SAME = 1 << 3, + RBD_IO_OPERATION_COMPARE_AND_WRITE = 1 << 4, + RBD_IO_OPERATIONS_ALL = ( + RBD_IO_OPERATION_READ | + RBD_IO_OPERATION_WRITE | + RBD_IO_OPERATION_DISCARD | + RBD_IO_OPERATION_WRITE_SAME | + RBD_IO_OPERATION_COMPARE_AND_WRITE) +}; + +enum ObjectDispatchLayer { + OBJECT_DISPATCH_LAYER_NONE = 0, + OBJECT_DISPATCH_LAYER_CACHE, + OBJECT_DISPATCH_LAYER_CRYPTO, + OBJECT_DISPATCH_LAYER_JOURNAL, + OBJECT_DISPATCH_LAYER_PARENT_CACHE, + OBJECT_DISPATCH_LAYER_SCHEDULER, + OBJECT_DISPATCH_LAYER_CORE, + OBJECT_DISPATCH_LAYER_LAST +}; + +enum { + READ_FLAG_DISABLE_READ_FROM_PARENT = 1UL << 0, + READ_FLAG_DISABLE_CLIPPING = 1UL << 1, +}; + +enum { + OBJECT_WRITE_FLAG_CREATE_EXCLUSIVE = 1UL << 0 +}; + +enum { + OBJECT_DISCARD_FLAG_DISABLE_CLONE_REMOVE = 1UL << 0, + OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE = 1UL << 1 +}; + +enum { + OBJECT_DISPATCH_FLAG_FLUSH = 1UL << 0, + OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR = 1UL << 1 +}; + +enum { + LIST_SNAPS_FLAG_DISABLE_LIST_FROM_PARENT = 1UL << 0, + LIST_SNAPS_FLAG_WHOLE_OBJECT = 1UL << 1, + LIST_SNAPS_FLAG_IGNORE_ZEROED_EXTENTS = 1UL << 2, +}; + +enum SparseExtentState { + SPARSE_EXTENT_STATE_DNE, /* does not exist */ + SPARSE_EXTENT_STATE_ZEROED, + SPARSE_EXTENT_STATE_DATA +}; + +std::ostream& operator<<(std::ostream& os, SparseExtentState state); + +struct SparseExtent { + SparseExtentState state; + uint64_t length; + + SparseExtent(SparseExtentState state, uint64_t length) + : state(state), length(length) { + } + + operator SparseExtentState() const { + return state; + } + + bool operator==(const SparseExtent& rhs) const { + return state == rhs.state && length == rhs.length; + } +}; + +std::ostream& operator<<(std::ostream& os, const SparseExtent& state); + +struct SparseExtentSplitMerge { + SparseExtent split(uint64_t offset, uint64_t length, SparseExtent &se) const { + return SparseExtent(se.state, se.length); + } + + bool can_merge(const SparseExtent& left, const SparseExtent& right) const { + return left.state == right.state; + } + + SparseExtent merge(SparseExtent&& left, SparseExtent&& right) const { + SparseExtent se(left); + se.length += right.length; + return se; + } + + uint64_t length(const SparseExtent& se) const { + return se.length; + } +}; + +typedef interval_map SparseExtents; + 
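+// SparseExtentSplitMerge is the policy that interval_map consults whenever
+// an insertion partially overlaps an existing extent (split) or lands next
+// to a compatible neighbour (can_merge/merge), so adjacent same-state
+// extents always coalesce into a single map entry. A usage sketch, assuming
+// the interval_map<K, V, S>::insert(offset, length, value) API from
+// common/interval_map.h:
+//
+//   SparseExtents extents;
+//   extents.insert(0, 4096, SparseExtent(SPARSE_EXTENT_STATE_ZEROED, 4096));
+//   extents.insert(4096, 4096,
+//                  SparseExtent(SPARSE_EXTENT_STATE_ZEROED, 4096));
+//   // can_merge() fuses the neighbours: one zeroed extent covering [0, 8192)
+//   extents.insert(8192, 512, SparseExtent(SPARSE_EXTENT_STATE_DATA, 512));
+//   // different state, so [8192, 8704) stays a separate data extent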
+typedef std::vector<uint64_t> SnapIds;
+
+typedef std::pair<uint64_t, uint64_t> WriteReadSnapIds;
+extern const WriteReadSnapIds INITIAL_WRITE_READ_SNAP_IDS;
+
+typedef std::map<WriteReadSnapIds, SparseExtents> SnapshotDelta;
+
+struct SparseBufferlistExtent : public SparseExtent {
+  ceph::bufferlist bl;
+
+  SparseBufferlistExtent(SparseExtentState state, uint64_t length)
+    : SparseExtent(state, length) {
+    ceph_assert(state != SPARSE_EXTENT_STATE_DATA);
+  }
+  SparseBufferlistExtent(SparseExtentState state, uint64_t length,
+                         ceph::bufferlist&& bl_)
+    : SparseExtent(state, length), bl(std::move(bl_)) {
+    ceph_assert(state != SPARSE_EXTENT_STATE_DATA || length == bl.length());
+  }
+
+  bool operator==(const SparseBufferlistExtent& rhs) const {
+    return (state == rhs.state &&
+            length == rhs.length &&
+            bl.contents_equal(rhs.bl));
+  }
+};
+
+struct SparseBufferlistExtentSplitMerge {
+  SparseBufferlistExtent split(uint64_t offset, uint64_t length,
+                               SparseBufferlistExtent& sbe) const {
+    ceph::bufferlist bl;
+    if (sbe.state == SPARSE_EXTENT_STATE_DATA) {
+      // copy the requested sub-range out of the source extent's data
+      bl.substr_of(sbe.bl, offset, length);
+    }
+    return SparseBufferlistExtent(sbe.state, length, std::move(bl));
+  }
+
+  bool can_merge(const SparseBufferlistExtent& left,
+                 const SparseBufferlistExtent& right) const {
+    return left.state == right.state;
+  }
+
+  SparseBufferlistExtent merge(SparseBufferlistExtent&& left,
+                               SparseBufferlistExtent&& right) const {
+    if (left.state == SPARSE_EXTENT_STATE_DATA) {
+      ceph::bufferlist bl{std::move(left.bl)};
+      bl.claim_append(std::move(right.bl));
+      return SparseBufferlistExtent(SPARSE_EXTENT_STATE_DATA,
+                                    bl.length(), std::move(bl));
+    } else {
+      return SparseBufferlistExtent(left.state, left.length + right.length, {});
+    }
+  }
+
+  uint64_t length(const SparseBufferlistExtent& sbe) const {
+    return sbe.length;
+  }
+};
+
+typedef interval_map<uint64_t,
+                     SparseBufferlistExtent,
+                     SparseBufferlistExtentSplitMerge> SparseBufferlist;
+typedef std::map<uint64_t, SparseBufferlist> SnapshotSparseBufferlist;
+
+using striper::LightweightBufferExtents;
+using striper::LightweightObjectExtent;
+using striper::LightweightObjectExtents;
+
+typedef std::pair<uint64_t, uint64_t> Extent;
+typedef std::vector<Extent> Extents;
+
+enum class ImageArea {
+  DATA,
+  CRYPTO_HEADER
+};
+
+std::ostream& operator<<(std::ostream& os, ImageArea area);
+
+struct ReadExtent {
+  const uint64_t offset;
+  const uint64_t length;
+  const LightweightBufferExtents buffer_extents;
+  ceph::bufferlist bl;
+  Extents extent_map;
+
+  ReadExtent(uint64_t offset,
+             uint64_t length) : offset(offset), length(length) {};
+  ReadExtent(uint64_t offset,
+             uint64_t length,
+             const LightweightBufferExtents&& buffer_extents)
+    : offset(offset),
+      length(length),
+      buffer_extents(buffer_extents) {}
+  ReadExtent(uint64_t offset,
+             uint64_t length,
+             const LightweightBufferExtents&& buffer_extents,
+             ceph::bufferlist&& bl,
+             Extents&& extent_map) : offset(offset),
+                                     length(length),
+                                     buffer_extents(buffer_extents),
+                                     bl(bl),
+                                     extent_map(extent_map) {};
+
+  friend inline std::ostream& operator<<(
+      std::ostream& os,
+      const ReadExtent &extent) {
+    os << "offset=" << extent.offset << ", "
+       << "length=" << extent.length << ", "
+       << "buffer_extents=" << extent.buffer_extents << ", "
+       << "bl.length=" << extent.bl.length() << ", "
+       << "extent_map=" << extent.extent_map;
+    return os;
+  }
+};
+
+typedef std::vector<ReadExtent> ReadExtents;
+
+typedef std::map<uint64_t, uint64_t> ExtentMap;
+
+} // namespace io
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_IO_TYPES_H
diff --git a/src/librbd/io/Utils.cc b/src/librbd/io/Utils.cc
new file mode 100644
index 000000000..63d587206
--- /dev/null
+++ b/src/librbd/io/Utils.cc
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8;
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/Utils.h" +#include "common/dout.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/neorados/RADOS.hpp" +#include "librbd/internal.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "osd/osd_types.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::io::util: " << __func__ << ": " + +namespace librbd { +namespace io { +namespace util { + +void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op) { + if (op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM) + op->set_fadvise_random(); + if (op_flags & LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL) + op->set_fadvise_sequential(); + if (op_flags & LIBRADOS_OP_FLAG_FADVISE_WILLNEED) + op->set_fadvise_willneed(); + if (op_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED) + op->set_fadvise_dontneed(); + if (op_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE) + op->set_fadvise_nocache(); + + if (flags & librados::OPERATION_BALANCE_READS) + op->balance_reads(); + if (flags & librados::OPERATION_LOCALIZE_READS) + op->localize_reads(); +} + +bool assemble_write_same_extent( + const LightweightObjectExtent &object_extent, const ceph::bufferlist& data, + ceph::bufferlist *ws_data, bool force_write) { + size_t data_len = data.length(); + + if (!force_write) { + bool may_writesame = true; + for (auto& q : object_extent.buffer_extents) { + if (!(q.first % data_len == 0 && q.second % data_len == 0)) { + may_writesame = false; + break; + } + } + + if (may_writesame) { + ws_data->append(data); + return true; + } + } + + for (auto& q : object_extent.buffer_extents) { + bufferlist sub_bl; + uint64_t sub_off = q.first % data_len; + uint64_t sub_len = data_len - sub_off; + uint64_t extent_left = q.second; + while (extent_left >= sub_len) { + sub_bl.substr_of(data, sub_off, sub_len); + ws_data->claim_append(sub_bl); + extent_left -= sub_len; + if (sub_off) { + sub_off = 0; + sub_len = data_len; + } + } + if (extent_left) { + sub_bl.substr_of(data, sub_off, extent_left); + ws_data->claim_append(sub_bl); + } + } + return false; +} + +template +void read_parent(I *image_ctx, uint64_t object_no, ReadExtents* read_extents, + librados::snap_t snap_id, const ZTracer::Trace &trace, + Context* on_finish) { + + auto cct = image_ctx->cct; + + std::shared_lock image_locker{image_ctx->image_lock}; + + Extents parent_extents; + ImageArea area; + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + image_ctx->get_parent_overlap(snap_id, &raw_overlap); + if (raw_overlap > 0) { + // calculate reverse mapping onto the parent image + Extents extents; + for (const auto& extent : *read_extents) { + extents.emplace_back(extent.offset, extent.length); + } + std::tie(parent_extents, area) = object_to_area_extents(image_ctx, + object_no, extents); + object_overlap = image_ctx->prune_parent_extents(parent_extents, area, + raw_overlap, false); + } + if (object_overlap == 0) { + image_locker.unlock(); + + on_finish->complete(-ENOENT); + return; + } + + ldout(cct, 20) << dendl; + + ceph::bufferlist* parent_read_bl; + if (read_extents->size() > 1) { + auto parent_comp = new ReadResult::C_ObjectReadMergedExtents( + cct, read_extents, on_finish); + parent_read_bl = &parent_comp->bl; + on_finish = parent_comp; + } else { + parent_read_bl = 
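+// For reference, the alignment rule in assemble_write_same_extent() above:
+// the repeating pattern can be pushed down as a single write-same only when
+// every buffer extent starts and ends on a multiple of the pattern length;
+// otherwise the pattern is unrolled client-side, beginning mid-pattern.
+// Worked example with data_len = 512:
+//
+//   buffer extent {off=1024, len=2048}: 1024 % 512 == 0 && 2048 % 512 == 0
+//     -> fast path, pattern appended once and repeated by the OSD
+//   buffer extent {off=700, len=900}:   700 % 512 == 188, so unroll:
+//     -> the last 324 bytes of the pattern (512 - 188), one full 512-byte
+//        repetition, then the first 64 bytes (324 + 512 + 64 == 900)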
&read_extents->front().bl; + } + + auto comp = AioCompletion::create_and_start(on_finish, image_ctx->parent, + AIO_TYPE_READ); + ldout(cct, 20) << "completion=" << comp + << " parent_extents=" << parent_extents + << " area=" << area << dendl; + auto req = io::ImageDispatchSpec::create_read( + *image_ctx->parent, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, comp, + std::move(parent_extents), area, ReadResult{parent_read_bl}, + image_ctx->parent->get_data_io_context(), 0, 0, trace); + req->send(); +} + +template +int clip_request(I* image_ctx, Extents* image_extents, ImageArea area) { + std::shared_lock image_locker{image_ctx->image_lock}; + for (auto &image_extent : *image_extents) { + auto clip_len = image_extent.second; + int r = clip_io(librbd::util::get_image_ctx(image_ctx), + image_extent.first, &clip_len, area); + if (r < 0) { + return r; + } + + image_extent.second = clip_len; + } + return 0; +} + +void unsparsify(CephContext* cct, ceph::bufferlist* bl, + const Extents& extent_map, uint64_t bl_off, + uint64_t out_bl_len) { + Striper::StripedReadResult destriper; + bufferlist out_bl; + + destriper.add_partial_sparse_result(cct, std::move(*bl), extent_map, bl_off, + {{0, out_bl_len}}); + destriper.assemble_result(cct, out_bl, true); + *bl = out_bl; +} + +template +bool trigger_copyup(I* image_ctx, uint64_t object_no, IOContext io_context, + Context* on_finish) { + bufferlist bl; + auto req = new ObjectWriteRequest( + image_ctx, object_no, 0, std::move(bl), io_context, 0, 0, + std::nullopt, {}, on_finish); + if (!req->has_parent()) { + delete req; + return false; + } + + req->send(); + return true; +} + +template +void area_to_object_extents(I* image_ctx, uint64_t offset, uint64_t length, + ImageArea area, uint64_t buffer_offset, + striper::LightweightObjectExtents* object_extents) { + Extents extents = {{offset, length}}; + image_ctx->io_image_dispatcher->remap_to_physical(extents, area); + for (auto [off, len] : extents) { + Striper::file_to_extents(image_ctx->cct, &image_ctx->layout, off, len, 0, + buffer_offset, object_extents); + } +} + +template +std::pair object_to_area_extents( + I* image_ctx, uint64_t object_no, const Extents& object_extents) { + Extents extents; + for (auto [off, len] : object_extents) { + Striper::extent_to_file(image_ctx->cct, &image_ctx->layout, object_no, off, + len, extents); + } + auto area = image_ctx->io_image_dispatcher->remap_to_logical(extents); + return {std::move(extents), area}; +} + +template +uint64_t area_to_raw_offset(const I& image_ctx, uint64_t offset, + ImageArea area) { + Extents extents = {{offset, 0}}; + image_ctx.io_image_dispatcher->remap_to_physical(extents, area); + return extents[0].first; +} + +template +std::pair raw_to_area_offset(const I& image_ctx, + uint64_t offset) { + Extents extents = {{offset, 0}}; + auto area = image_ctx.io_image_dispatcher->remap_to_logical(extents); + return {extents[0].first, area}; +} + +} // namespace util +} // namespace io +} // namespace librbd + +template void librbd::io::util::read_parent( + librbd::ImageCtx *image_ctx, uint64_t object_no, ReadExtents* extents, + librados::snap_t snap_id, const ZTracer::Trace &trace, Context* on_finish); +template int librbd::io::util::clip_request( + librbd::ImageCtx* image_ctx, Extents* image_extents, ImageArea area); +template bool librbd::io::util::trigger_copyup( + librbd::ImageCtx *image_ctx, uint64_t object_no, IOContext io_context, + Context* on_finish); +template void librbd::io::util::area_to_object_extents( + librbd::ImageCtx* image_ctx, uint64_t offset, 
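+// area_to_object_extents() and object_to_area_extents() above are inverses:
+// both lean on Striper::file_to_extents()/extent_to_file(), with the image
+// dispatcher remapping between a logical area (DATA vs CRYPTO_HEADER) and
+// raw image offsets on either side. With trivial striping (stripe_count=1,
+// stripe_unit == object_size == 4 MiB) and no crypto header, the round trip
+// degenerates to plain arithmetic (illustrative values):
+//
+//   object_to_area_extents(ictx, 2, {{0, 4096}})
+//     -> ({{2 * 4MiB, 4096}}, ImageArea::DATA)
+//   area_to_object_extents(ictx, 2 * 4MiB, 4096, ImageArea::DATA, 0, &oes)
+//     -> one LightweightObjectExtent {object_no=2, offset=0, length=4096}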
uint64_t length, + ImageArea area, uint64_t buffer_offset, + striper::LightweightObjectExtents* object_extents); +template auto librbd::io::util::object_to_area_extents( + librbd::ImageCtx* image_ctx, uint64_t object_no, const Extents& extents) + -> std::pair; +template uint64_t librbd::io::util::area_to_raw_offset( + const librbd::ImageCtx& image_ctx, uint64_t offset, ImageArea area); +template auto librbd::io::util::raw_to_area_offset( + const librbd::ImageCtx& image_ctx, uint64_t offset) + -> std::pair; diff --git a/src/librbd/io/Utils.h b/src/librbd/io/Utils.h new file mode 100644 index 000000000..efb79b6a6 --- /dev/null +++ b/src/librbd/io/Utils.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_UTILS_H +#define CEPH_LIBRBD_IO_UTILS_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/rados/rados_types.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include + +class ObjectExtent; + +namespace neorados { struct Op; } + +namespace librbd { + +struct ImageCtx; + +namespace io { +namespace util { + +void apply_op_flags(uint32_t op_flags, uint32_t flags, neorados::Op* op); + +bool assemble_write_same_extent(const LightweightObjectExtent &object_extent, + const ceph::bufferlist& data, + ceph::bufferlist *ws_data, + bool force_write); + +template +void read_parent(ImageCtxT *image_ctx, uint64_t object_no, + ReadExtents* read_extents, librados::snap_t snap_id, + const ZTracer::Trace &trace, Context* on_finish); + +template +int clip_request(ImageCtxT* image_ctx, Extents* image_extents, ImageArea area); + +inline uint64_t get_extents_length(const Extents &extents) { + uint64_t total_bytes = 0; + for (auto [_, extent_length] : extents) { + total_bytes += extent_length; + } + return total_bytes; +} + +void unsparsify(CephContext* cct, ceph::bufferlist* bl, + const Extents& extent_map, uint64_t bl_off, + uint64_t out_bl_len); + +template +bool trigger_copyup(ImageCtxT *image_ctx, uint64_t object_no, + IOContext io_context, Context* on_finish); + +template +void area_to_object_extents(ImageCtxT* image_ctx, uint64_t offset, + uint64_t length, ImageArea area, + uint64_t buffer_offset, + striper::LightweightObjectExtents* object_extents); + +template +std::pair object_to_area_extents( + ImageCtxT* image_ctx, uint64_t object_no, const Extents& object_extents); + +template +uint64_t area_to_raw_offset(const ImageCtxT& image_ctx, uint64_t offset, + ImageArea area); + +template +std::pair raw_to_area_offset(const ImageCtxT& image_ctx, + uint64_t offset); + +inline ObjectDispatchLayer get_previous_layer(ObjectDispatchLayer layer) { + return (ObjectDispatchLayer)(((int)layer) - 1); +} + +} // namespace util +} // namespace io +} // namespace librbd + +#endif // CEPH_LIBRBD_IO_UTILS_H diff --git a/src/librbd/io/WriteBlockImageDispatch.cc b/src/librbd/io/WriteBlockImageDispatch.cc new file mode 100644 index 000000000..57d181d20 --- /dev/null +++ b/src/librbd/io/WriteBlockImageDispatch.cc @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/io/WriteBlockImageDispatch.h" +#include "common/dout.h" +#include "common/Cond.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix 
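+// A quick note on two of the Utils.h helpers above: get_extents_length()
+// simply sums the length component of each {offset, length} pair, and
+// get_previous_layer() steps one slot down the ObjectDispatchLayer enum,
+// which lets callers name the layer that precedes a given one in the object
+// dispatch chain. For example:
+//
+//   Extents extents = {{0, 4096}, {1 << 20, 512}};
+//   uint64_t total = get_extents_length(extents);  // 4608
+//
+//   auto prev = get_previous_layer(OBJECT_DISPATCH_LAYER_SCHEDULER);
+//   // prev == OBJECT_DISPATCH_LAYER_PARENT_CACHE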
+#define dout_prefix *_dout << "librbd::io::WriteBlockImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace io { + +template +struct WriteBlockImageDispatch::C_BlockedWrites : public Context { + WriteBlockImageDispatch *dispatch; + explicit C_BlockedWrites(WriteBlockImageDispatch *dispatch) + : dispatch(dispatch) { + } + + void finish(int r) override { + dispatch->handle_blocked_writes(r); + } +}; + +template +WriteBlockImageDispatch::WriteBlockImageDispatch(I* image_ctx) + : m_image_ctx(image_ctx), + m_lock(ceph::make_shared_mutex( + util::unique_lock_name("librbd::io::WriteBlockImageDispatch::m_lock", + this))) { + auto cct = m_image_ctx->cct; + ldout(cct, 5) << "ictx=" << image_ctx << dendl; +} + +template +void WriteBlockImageDispatch::shut_down(Context* on_finish) { + on_finish->complete(0); +} + +template +int WriteBlockImageDispatch::block_writes() { + C_SaferCond cond_ctx; + block_writes(&cond_ctx); + return cond_ctx.wait(); +} + +template +void WriteBlockImageDispatch::block_writes(Context *on_blocked) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock)); + auto cct = m_image_ctx->cct; + + // ensure owner lock is not held after block_writes completes + on_blocked = util::create_async_context_callback( + *m_image_ctx, on_blocked); + + { + std::unique_lock locker{m_lock}; + ++m_write_blockers; + ldout(cct, 5) << m_image_ctx << ", " + << "num=" << m_write_blockers << dendl; + if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) { + ldout(cct, 5) << "waiting for in-flight writes to complete: " + << "in_flight_writes=" << m_in_flight_writes << dendl; + m_write_blocker_contexts.push_back(on_blocked); + return; + } + } + + flush_io(on_blocked); +}; + +template +void WriteBlockImageDispatch::unblock_writes() { + auto cct = m_image_ctx->cct; + + Contexts waiter_contexts; + Contexts dispatch_contexts; + { + std::unique_lock locker{m_lock}; + ceph_assert(m_write_blockers > 0); + --m_write_blockers; + + ldout(cct, 5) << m_image_ctx << ", " + << "num=" << m_write_blockers << dendl; + if (m_write_blockers == 0) { + std::swap(waiter_contexts, m_unblocked_write_waiter_contexts); + std::swap(dispatch_contexts, m_on_dispatches); + } + } + + for (auto ctx : waiter_contexts) { + ctx->complete(0); + } + + for (auto ctx : dispatch_contexts) { + ctx->complete(0); + } +} + +template +void WriteBlockImageDispatch::wait_on_writes_unblocked( + Context *on_unblocked) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->owner_lock)); + auto cct = m_image_ctx->cct; + + { + std::unique_lock locker{m_lock}; + ldout(cct, 20) << m_image_ctx << ", " + << "write_blockers=" << m_write_blockers << dendl; + if (!m_unblocked_write_waiter_contexts.empty() || m_write_blockers > 0) { + m_unblocked_write_waiter_contexts.push_back(on_unblocked); + return; + } + } + + on_unblocked->complete(0); +} + +template +bool WriteBlockImageDispatch::write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool WriteBlockImageDispatch::discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + 
DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool WriteBlockImageDispatch::write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool WriteBlockImageDispatch::compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template +bool WriteBlockImageDispatch::flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "tid=" << tid << dendl; + + if (flush_source != FLUSH_SOURCE_USER) { + return false; + } + + return process_io(tid, dispatch_result, on_finish, on_dispatched); +} + +template +void WriteBlockImageDispatch::handle_finished(int r, uint64_t tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", tid=" << tid << dendl; + + std::unique_lock locker{m_lock}; + ceph_assert(m_in_flight_writes > 0); + --m_in_flight_writes; + + bool writes_blocked = false; + if (m_write_blockers > 0 && m_in_flight_writes == 0) { + ldout(cct, 10) << "flushing all in-flight IO for blocked writes" << dendl; + writes_blocked = true; + } + locker.unlock(); + + if (writes_blocked) { + flush_io(new C_BlockedWrites(this)); + } +} + +template +bool WriteBlockImageDispatch::process_io( + uint64_t tid, DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + std::unique_lock locker{m_lock}; + if (m_write_blockers > 0 || !m_on_dispatches.empty()) { + *dispatch_result = DISPATCH_RESULT_RESTART; + m_on_dispatches.push_back(on_dispatched); + return true; + } + + ++m_in_flight_writes; + *on_finish = new LambdaContext([this, tid, on_finish=*on_finish](int r) { + handle_finished(r, tid); + on_finish->complete(r); + }); + return false; +} + +template +void WriteBlockImageDispatch::flush_io(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // ensure that all in-flight IO is flushed + auto aio_comp = AioCompletion::create_and_start( + on_finish, util::get_image_ctx(m_image_ctx), librbd::io::AIO_TYPE_FLUSH); + auto req = ImageDispatchSpec::create_flush( + *m_image_ctx, IMAGE_DISPATCH_LAYER_WRITE_BLOCK, aio_comp, + FLUSH_SOURCE_WRITE_BLOCK, {}); + req->send(); +} + +template +void WriteBlockImageDispatch::handle_blocked_writes(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + Contexts write_blocker_contexts; + { + std::unique_lock locker{m_lock}; + std::swap(write_blocker_contexts, m_write_blocker_contexts); + } + + for (auto ctx : 
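+// Typical caller-side sequence for the block/unblock API above (a sketch;
+// exclusive-lock release and image refresh are the kinds of callers that
+// use it). block_writes() completes only after every in-flight write has
+// drained and a flush has round-tripped through the lower layers:
+//
+//   C_SaferCond cond;
+//   {
+//     std::shared_lock owner_locker{image_ctx->owner_lock};
+//     dispatch->block_writes(&cond);    // ++m_write_blockers
+//   }
+//   int r = cond.wait();                // in-flight writes drained, flushed
+//   // ... writes submitted now park via DISPATCH_RESULT_RESTART ...
+//   dispatch->unblock_writes();         // re-dispatches the parked writes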
write_blocker_contexts) { + ctx->complete(0); + } +} + +} // namespace io +} // namespace librbd + +template class librbd::io::WriteBlockImageDispatch; diff --git a/src/librbd/io/WriteBlockImageDispatch.h b/src/librbd/io/WriteBlockImageDispatch.h new file mode 100644 index 000000000..b1d0ddb0e --- /dev/null +++ b/src/librbd/io/WriteBlockImageDispatch.h @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/ceph_mutex.h" +#include "common/zipkin_trace.h" +#include "common/Throttle.h" +#include "librbd/io/ReadResult.h" +#include "librbd/io/Types.h" +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace io { + +struct AioCompletion; + +template +class WriteBlockImageDispatch : public ImageDispatchInterface { +public: + WriteBlockImageDispatch(ImageCtxT* image_ctx); + + ImageDispatchLayer get_dispatch_layer() const override { + return IMAGE_DISPATCH_LAYER_WRITE_BLOCK; + } + + void shut_down(Context* on_finish) override; + + int block_writes(); + void block_writes(Context *on_blocked); + void unblock_writes(); + + inline bool writes_blocked() const { + std::shared_lock locker{m_lock}; + return (m_write_blockers > 0); + } + + void wait_on_writes_unblocked(Context *on_unblocked); + + bool read( + AioCompletion* aio_comp, Extents &&image_extents, + ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + bool write( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + AioCompletion* aio_comp, Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + AioCompletion* aio_comp, Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + AioCompletion* aio_comp, Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + AioCompletion* aio_comp, FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + AioCompletion* aio_comp, Extents&& image_extents, SnapIds&& snap_ids, + int list_snaps_flags, SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + 
DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override { + return false; + } + +private: + struct C_BlockedWrites; + + typedef std::list Contexts; + + ImageCtxT* m_image_ctx; + + mutable ceph::shared_mutex m_lock; + Contexts m_on_dispatches; + + uint32_t m_write_blockers = 0; + Contexts m_write_blocker_contexts; + Contexts m_unblocked_write_waiter_contexts; + uint64_t m_in_flight_writes = 0; + + void handle_finished(int r, uint64_t tid); + + bool process_io(uint64_t tid, DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched); + void flush_io(Context* on_finish); + + bool invalidate_cache(Context* on_finish) override { + return false; + } + + void handle_blocked_writes(int r); + +}; + +} // namespace io +} // namespace librbd + +extern template class librbd::io::WriteBlockImageDispatch; + +#endif // CEPH_LIBRBD_IO_WRITE_BLOCK_IMAGE_DISPATCH_H diff --git a/src/librbd/journal/CreateRequest.cc b/src/librbd/journal/CreateRequest.cc new file mode 100644 index 000000000..4f7a0f5be --- /dev/null +++ b/src/librbd/journal/CreateRequest.cc @@ -0,0 +1,234 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "librbd/Utils.h" +#include "common/Timer.h" +#include "journal/Settings.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Journal::CreateRequest: " + +namespace librbd { + +using util::create_context_callback; + +namespace journal { + +template +CreateRequest::CreateRequest(IoCtx &ioctx, const std::string &imageid, + uint8_t order, uint8_t splay_width, + const std::string &object_pool, + uint64_t tag_class, TagData &tag_data, + const std::string &client_id, + ContextWQ *op_work_queue, + Context *on_finish) + : m_ioctx(ioctx), m_image_id(imageid), m_order(order), + m_splay_width(splay_width), m_object_pool(object_pool), + m_tag_class(tag_class), m_tag_data(tag_data), m_image_client_id(client_id), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + m_cct = reinterpret_cast(m_ioctx.cct()); +} + +template +void CreateRequest::send() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + if (m_order > 64 || m_order < 12) { + lderr(m_cct) << "order must be in the range [12, 64]" << dendl; + complete(-EDOM); + return; + } + if (m_splay_width == 0) { + complete(-EINVAL); + return; + } + + get_pool_id(); +} + +template +void CreateRequest::get_pool_id() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + if (m_object_pool.empty()) { + create_journal(); + return; + } + + librados::Rados rados(m_ioctx); + IoCtx data_ioctx; + int r = rados.ioctx_create(m_object_pool.c_str(), data_ioctx); + if (r != 0) { + lderr(m_cct) << "failed to create journal: " + << "error opening journal object pool '" << m_object_pool + << "': " << cpp_strerror(r) << dendl; + complete(r); + return; + } + data_ioctx.set_namespace(m_ioctx.get_namespace()); + + m_pool_id = data_ioctx.get_id(); + create_journal(); +} + +template +void CreateRequest::create_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock); + m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock, m_ioctx, + m_image_id, m_image_client_id, {}, nullptr); + + using klass = CreateRequest; + Context *ctx = 
create_context_callback<klass, &klass::handle_create_journal>(this);
+
+  m_journaler->create(m_order, m_splay_width, m_pool_id, ctx);
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_create_journal(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(m_cct) << "failed to create journal: " << cpp_strerror(*result) << dendl;
+    shut_down_journaler(*result);
+    return nullptr;
+  }
+
+  allocate_journal_tag();
+  return nullptr;
+}
+
+template <typename I>
+void CreateRequest<I>::allocate_journal_tag() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = CreateRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_journal_tag>(this);
+
+  encode(m_tag_data, m_bl);
+  m_journaler->allocate_tag(m_tag_class, m_bl, &m_tag, ctx);
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_journal_tag(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(m_cct) << "failed to allocate tag: " << cpp_strerror(*result) << dendl;
+    shut_down_journaler(*result);
+    return nullptr;
+  }
+
+  register_client();
+  return nullptr;
+}
+
+template <typename I>
+void CreateRequest<I>::register_client() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  m_bl.clear();
+  encode(ClientData{ImageClientMeta{m_tag.tag_class}}, m_bl);
+
+  using klass = CreateRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_register_client>(this);
+
+  m_journaler->register_client(m_bl, ctx);
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_register_client(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(m_cct) << "failed to register client: " << cpp_strerror(*result) << dendl;
+  }
+
+  shut_down_journaler(*result);
+  return nullptr;
+}
+
+template <typename I>
+void CreateRequest<I>::shut_down_journaler(int r) {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  m_r_saved = r;
+
+  using klass = CreateRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_journaler_shutdown>(this);
+
+  m_journaler->shut_down(ctx);
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_journaler_shutdown(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl;
+  }
+
+  delete m_journaler;
+
+  if (!m_r_saved) {
+    complete(0);
+    return nullptr;
+  }
+
+  // there was an error during journal creation, so roll back whatever was
+  // done. the easiest way to do this is to invoke the journal remove state
+  // machine; it is not the cleanest approach in terms of redundancy, but it
+  // is acceptable on the failure path.
+  remove_journal();
+  return nullptr;
+}
+
+template <typename I>
+void CreateRequest<I>::remove_journal() {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  using klass = CreateRequest<I>;
+  Context *ctx = create_context_callback<klass, &klass::handle_remove_journal>(this);
+
+  RemoveRequest<I> *req = RemoveRequest<I>::create(
+    m_ioctx, m_image_id, m_image_client_id, m_op_work_queue, ctx);
+  req->send();
+}
+
+template <typename I>
+Context *CreateRequest<I>::handle_remove_journal(int *result) {
+  ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(m_cct) << "error cleaning up journal after creation failed: "
+                 << cpp_strerror(*result) << dendl;
+  }
+
+  complete(m_r_saved);
+  return nullptr;
+}
+
+template <typename I>
+void CreateRequest<I>::complete(int r) {
+  ldout(m_cct, 20) << this << " " << __func__ << dendl;
+
+  if (r == 0) {
+    ldout(m_cct, 20) << "done."
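+// CreateRequest advances one asynchronous step per callback, each wired
+// through create_context_callback<klass, &klass::handler>(this):
+//
+//   <start> -> GET_POOL_ID -> CREATE_JOURNAL -> ALLOCATE_TAG
+//           -> REGISTER_CLIENT -> SHUT_DOWN_JOURNALER -> <finish>
+//
+// Every path, success or failure, funnels through SHUT_DOWN_JOURNALER; if
+// any step failed (m_r_saved != 0), the shutdown handler detours through
+// REMOVE_JOURNAL to undo the partial creation before completing with the
+// first saved error.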
<< dendl; + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::CreateRequest; diff --git a/src/librbd/journal/CreateRequest.h b/src/librbd/journal/CreateRequest.h new file mode 100644 index 000000000..6fab409c4 --- /dev/null +++ b/src/librbd/journal/CreateRequest.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "common/ceph_mutex.h" +#include "common/Timer.h" +#include "librbd/ImageCtx.h" +#include "journal/Journaler.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" +#include "cls/journal/cls_journal_types.h" + +using librados::IoCtx; +using journal::Journaler; + +class Context; +class ContextWQ; + +namespace journal { + class Journaler; +} + +namespace librbd { + +class ImageCtx; + +namespace journal { + +template +class CreateRequest { +public: + static CreateRequest *create(IoCtx &ioctx, const std::string &imageid, + uint8_t order, uint8_t splay_width, + const std::string &object_pool, + uint64_t tag_class, TagData &tag_data, + const std::string &client_id, + ContextWQ *op_work_queue, Context *on_finish) { + return new CreateRequest(ioctx, imageid, order, splay_width, object_pool, + tag_class, tag_data, client_id, op_work_queue, + on_finish); + } + + void send(); + +private: + typedef typename TypeTraits::Journaler Journaler; + + CreateRequest(IoCtx &ioctx, const std::string &imageid, uint8_t order, + uint8_t splay_width, const std::string &object_pool, + uint64_t tag_class, TagData &tag_data, + const std::string &client_id, ContextWQ *op_work_queue, + Context *on_finish); + + IoCtx &m_ioctx; + std::string m_image_id; + uint8_t m_order; + uint8_t m_splay_width; + std::string m_object_pool; + uint64_t m_tag_class; + TagData m_tag_data; + std::string m_image_client_id; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + cls::journal::Tag m_tag; + bufferlist m_bl; + Journaler *m_journaler; + SafeTimer *m_timer; + ceph::mutex *m_timer_lock; + int m_r_saved; + + int64_t m_pool_id = -1; + + void get_pool_id(); + + void create_journal(); + Context *handle_create_journal(int *result); + + void allocate_journal_tag(); + Context *handle_journal_tag(int *result); + + void register_client(); + Context *handle_register_client(int *result); + + void shut_down_journaler(int r); + Context *handle_journaler_shutdown(int *result); + + void remove_journal(); + Context *handle_remove_journal(int *result); + + void complete(int r); +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::CreateRequest; + +#endif /* CEPH_LIBRBD_JOURNAL_CREATE_REQUEST_H */ diff --git a/src/librbd/journal/DemoteRequest.cc b/src/librbd/journal/DemoteRequest.cc new file mode 100644 index 000000000..564391978 --- /dev/null +++ b/src/librbd/journal/DemoteRequest.cc @@ -0,0 +1,255 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/DemoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include 
"librbd/asio/ContextWQ.h" +#include "librbd/journal/OpenRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::DemoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +DemoteRequest::DemoteRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), + m_lock(ceph::make_mutex("DemoteRequest::m_lock")) { +} + +template +DemoteRequest::~DemoteRequest() { + ceph_assert(m_journaler == nullptr); +} + +template +void DemoteRequest::send() { + open_journaler(); +} + +template +void DemoteRequest::open_journaler() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + m_journaler = new Journaler(m_image_ctx.md_ctx, m_image_ctx.id, + Journal<>::IMAGE_CLIENT_ID, {}, nullptr); + auto ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + DemoteRequest, &DemoteRequest::handle_open_journaler>(this)); + auto req = OpenRequest::create(&m_image_ctx, m_journaler, &m_lock, + &m_client_meta, &m_tag_tid, &m_tag_data, + ctx); + req->send(); +} + +template +void DemoteRequest::handle_open_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl; + shut_down_journaler(); + return; + } else if (m_tag_data.mirror_uuid != Journal<>::LOCAL_MIRROR_UUID) { + m_ret_val = -EINVAL; + lderr(cct) << "image is not currently the primary" << dendl; + shut_down_journaler(); + return; + } + + allocate_tag(); +} + +template +void DemoteRequest::allocate_tag() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + cls::journal::Client client; + int r = m_journaler->get_cached_client(Journal<>::IMAGE_CLIENT_ID, &client); + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to retrieve client: " << cpp_strerror(r) << dendl; + shut_down_journaler(); + return; + } + + TagPredecessor predecessor; + predecessor.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID; + if (!client.commit_position.object_positions.empty()) { + auto position = client.commit_position.object_positions.front(); + predecessor.commit_valid = true; + predecessor.tag_tid = position.tag_tid; + predecessor.entry_tid = position.entry_tid; + } + + TagData tag_data; + tag_data.mirror_uuid = Journal<>::ORPHAN_MIRROR_UUID; + tag_data.predecessor = std::move(predecessor); + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + auto ctx = create_context_callback< + DemoteRequest, &DemoteRequest::handle_allocate_tag>(this); + m_journaler->allocate_tag(m_client_meta.tag_class, tag_bl, &m_tag, ctx); +} + +template +void DemoteRequest::handle_allocate_tag(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl; + shut_down_journaler(); + return; + } + + m_tag_tid = m_tag.tid; + append_event(); +} + +template +void DemoteRequest::append_event() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + EventEntry event_entry{DemotePromoteEvent{}, {}}; + bufferlist event_entry_bl; + encode(event_entry, event_entry_bl); + + m_journaler->start_append(0); + m_future = m_journaler->append(m_tag_tid, event_entry_bl); + + auto ctx = create_context_callback< + DemoteRequest, 
&DemoteRequest::handle_append_event>(this); + m_future.flush(ctx); + +} + +template +void DemoteRequest::handle_append_event(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to append demotion journal event: " << cpp_strerror(r) + << dendl; + stop_append(); + return; + } + + commit_event(); +} + +template +void DemoteRequest::commit_event() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + m_journaler->committed(m_future); + + auto ctx = create_context_callback< + DemoteRequest, &DemoteRequest::handle_commit_event>(this); + m_journaler->flush_commit_position(ctx); +} + +template +void DemoteRequest::handle_commit_event(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to flush demotion commit position: " + << cpp_strerror(r) << dendl; + } + + stop_append(); +} + +template +void DemoteRequest::stop_append() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest, &DemoteRequest::handle_stop_append>(this); + m_journaler->stop_append(ctx); +} + +template +void DemoteRequest::handle_stop_append(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (m_ret_val == 0) { + m_ret_val = r; + } + lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl; + } + + shut_down_journaler(); +} + +template +void DemoteRequest::shut_down_journaler() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + Context *ctx = create_async_context_callback( + m_image_ctx, create_context_callback< + DemoteRequest, &DemoteRequest::handle_shut_down_journaler>(this)); + m_journaler->shut_down(ctx); +} + +template +void DemoteRequest::handle_shut_down_journaler(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl; + } + + delete m_journaler; + m_journaler = nullptr; + finish(r); +} + +template +void DemoteRequest::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + } + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::DemoteRequest; diff --git a/src/librbd/journal/DemoteRequest.h b/src/librbd/journal/DemoteRequest.h new file mode 100644 index 000000000..6aba6cc8f --- /dev/null +++ b/src/librbd/journal/DemoteRequest.h @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H + +#include "common/ceph_mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "journal/Future.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +template +class DemoteRequest { +public: + static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) { + return new DemoteRequest(image_ctx, on_finish); + } + + DemoteRequest(ImageCtxT &image_ctx, Context *on_finish); + ~DemoteRequest(); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * OPEN_JOURNALER * * * * * + * | * + * v * + * ALLOCATE_TAG * * * * * * 
+ * | * + * v * + * APPEND_EVENT * * * * + * | * * + * v * * + * COMMIT_EVENT * * + * | * * + * v * * + * STOP_APPEND <* * * * + * | * + * v * + * SHUT_DOWN_JOURNALER <* * + * | + * v + * + * + * @endverbatim + */ + + typedef typename TypeTraits::Journaler Journaler; + typedef typename TypeTraits::Future Future; + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + Journaler *m_journaler = nullptr; + int m_ret_val = 0; + + ceph::mutex m_lock; + ImageClientMeta m_client_meta; + uint64_t m_tag_tid = 0; + TagData m_tag_data; + + cls::journal::Tag m_tag; + Future m_future; + + void open_journaler(); + void handle_open_journaler(int r); + + void allocate_tag(); + void handle_allocate_tag(int r); + + void append_event(); + void handle_append_event(int r); + + void commit_event(); + void handle_commit_event(int r); + + void stop_append(); + void handle_stop_append(int r); + + void shut_down_journaler(); + void handle_shut_down_journaler(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::DemoteRequest; + +#endif // CEPH_LIBRBD_JOURNAL_DEMOTE_REQUEST_H diff --git a/src/librbd/journal/DisabledPolicy.h b/src/librbd/journal/DisabledPolicy.h new file mode 100644 index 000000000..27d69a50d --- /dev/null +++ b/src/librbd/journal/DisabledPolicy.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H +#define CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H + +#include "librbd/journal/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +class DisabledPolicy : public Policy { +public: + bool append_disabled() const override { + return true; + } + bool journal_disabled() const override { + return true; + } + void allocate_tag_on_lock(Context *on_finish) override { + ceph_abort(); + } +}; + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_DISABLED_POLICY_H diff --git a/src/librbd/journal/ObjectDispatch.cc b/src/librbd/journal/ObjectDispatch.cc new file mode 100644 index 000000000..e3659c221 --- /dev/null +++ b/src/librbd/journal/ObjectDispatch.cc @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/ObjectDispatch.h" +#include "common/dout.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/io/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::ObjectDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::data_object_name; +using util::create_context_callback; + +namespace { + +template +struct C_CommitIOEvent : public Context { + I* image_ctx; + Journal* journal; + uint64_t object_no; + uint64_t object_off; + uint64_t object_len; + uint64_t journal_tid; + int object_dispatch_flags; + Context* on_finish; + + C_CommitIOEvent(I* image_ctx, Journal* journal, uint64_t object_no, + uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, int object_dispatch_flags, + Context* on_finish) + : image_ctx(image_ctx), journal(journal), object_no(object_no), + object_off(object_off), object_len(object_len), journal_tid(journal_tid), + 
object_dispatch_flags(object_dispatch_flags), on_finish(on_finish) { + } + + void finish(int r) override { + // don't commit the IO extent if a previous dispatch handler will just + // retry the failed IO + if (r >= 0 || + (object_dispatch_flags & + io::OBJECT_DISPATCH_FLAG_WILL_RETRY_ON_ERROR) == 0) { + auto [image_extents, _] = io::util::object_to_area_extents( + image_ctx, object_no, {{object_off, object_len}}); + for (const auto& extent : image_extents) { + journal->commit_io_event_extent(journal_tid, extent.first, + extent.second, r); + } + } + + if (on_finish != nullptr) { + on_finish->complete(r); + } + } +}; + +} // anonymous namespace + +template +ObjectDispatch::ObjectDispatch(I* image_ctx, Journal* journal) + : m_image_ctx(image_ctx), m_journal(journal) { +} + +template +void ObjectDispatch::shut_down(Context* on_finish) { + m_image_ctx->op_work_queue->queue(on_finish, 0); +} + +template +bool ObjectDispatch::discard( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + IOContext io_context, int discard_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + *on_finish = new C_CommitIOEvent(m_image_ctx, m_journal, object_no, + object_off, object_len, *journal_tid, + *object_dispatch_flags, *on_finish); + *on_finish = create_context_callback< + Context, &Context::complete>(*on_finish, m_journal); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template +bool ObjectDispatch::write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data, + IOContext io_context, int op_flags, int write_flags, + std::optional assert_version, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << data.length() << dendl; + + *on_finish = new C_CommitIOEvent(m_image_ctx, m_journal, object_no, + object_off, data.length(), *journal_tid, + *object_dispatch_flags, *on_finish); + *on_finish = create_context_callback< + Context, &Context::complete>(*on_finish, m_journal); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template +bool ObjectDispatch::write_same( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data, + IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, int* object_dispatch_flags, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << object_len << dendl; + + *on_finish = new C_CommitIOEvent(m_image_ctx, m_journal, object_no, + object_off, object_len, *journal_tid, 
+ *object_dispatch_flags, *on_finish); + *on_finish = create_context_callback< + Context, &Context::complete>(*on_finish, m_journal); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template +bool ObjectDispatch::compare_and_write( + uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data, + ceph::bufferlist&& write_data, IOContext io_context, int op_flags, + const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset, + int* object_dispatch_flags, uint64_t* journal_tid, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << data_object_name(m_image_ctx, object_no) << " " + << object_off << "~" << write_data.length() + << dendl; + + *on_finish = new C_CommitIOEvent(m_image_ctx, m_journal, object_no, + object_off, write_data.length(), + *journal_tid, *object_dispatch_flags, + *on_finish); + *on_finish = create_context_callback< + Context, &Context::complete>(*on_finish, m_journal); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, *object_dispatch_flags, on_dispatched); + return true; +} + +template +bool ObjectDispatch::flush( + io::FlushSource flush_source, const ZTracer::Trace &parent_trace, + uint64_t* journal_tid, io::DispatchResult* dispatch_result, + Context** on_finish, Context* on_dispatched) { + if (*journal_tid == 0) { + // non-journaled IO + return false; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + auto ctx = *on_finish; + *on_finish = new LambdaContext( + [image_ctx=m_image_ctx, ctx, journal_tid=*journal_tid](int r) { + image_ctx->journal->commit_io_event(journal_tid, r); + ctx->complete(r); + }); + + *dispatch_result = io::DISPATCH_RESULT_CONTINUE; + wait_or_flush_event(*journal_tid, io::OBJECT_DISPATCH_FLAG_FLUSH, + on_dispatched); + return true; +} + +template +void ObjectDispatch::extent_overwritten( + uint64_t object_no, uint64_t object_off, uint64_t object_len, + uint64_t journal_tid, uint64_t new_journal_tid) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << object_no << " " << object_off << "~" << object_len + << dendl; + + Context *ctx = new C_CommitIOEvent(m_image_ctx, m_journal, object_no, + object_off, object_len, journal_tid, false, + nullptr); + if (new_journal_tid != 0) { + // ensure new journal event is safely committed to disk before + // committing old event + m_journal->flush_event(new_journal_tid, ctx); + } else { + ctx = create_context_callback< + Context, &Context::complete>(ctx, m_journal); + ctx->complete(0); + } +} + +template +void ObjectDispatch::wait_or_flush_event( + uint64_t journal_tid, int object_dispatch_flags, Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "journal_tid=" << journal_tid << dendl; + + if ((object_dispatch_flags & io::OBJECT_DISPATCH_FLAG_FLUSH) != 0) { + m_journal->flush_event(journal_tid, on_dispatched); + } else { + m_journal->wait_event(journal_tid, on_dispatched); + } +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::ObjectDispatch; diff --git a/src/librbd/journal/ObjectDispatch.h b/src/librbd/journal/ObjectDispatch.h new file mode 100644 index 000000000..45e4773cc --- /dev/null +++ b/src/librbd/journal/ObjectDispatch.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
+#define CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/io/Types.h"
+#include "librbd/io/ObjectDispatchInterface.h"
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+template <typename> class Journal;
+
+namespace journal {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ObjectDispatch : public io::ObjectDispatchInterface {
+public:
+  static ObjectDispatch* create(ImageCtxT* image_ctx,
+                                Journal<ImageCtxT>* journal) {
+    return new ObjectDispatch(image_ctx, journal);
+  }
+
+  ObjectDispatch(ImageCtxT* image_ctx, Journal<ImageCtxT>* journal);
+
+  io::ObjectDispatchLayer get_dispatch_layer() const override {
+    return io::OBJECT_DISPATCH_LAYER_JOURNAL;
+  }
+
+  void shut_down(Context* on_finish) override;
+
+  bool read(
+      uint64_t object_no, io::ReadExtents* extents, IOContext io_context,
+      int op_flags, int read_flags, const ZTracer::Trace &parent_trace,
+      uint64_t* version, int* object_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) {
+    return false;
+  }
+
+  bool discard(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      IOContext io_context, int discard_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool write(
+      uint64_t object_no, uint64_t object_off, ceph::bufferlist&& data,
+      IOContext io_context, int op_flags, int write_flags,
+      std::optional<uint64_t> assert_version,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool write_same(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      io::LightweightBufferExtents&& buffer_extents, ceph::bufferlist&& data,
+      IOContext io_context, int op_flags,
+      const ZTracer::Trace &parent_trace, int* object_dispatch_flags,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool compare_and_write(
+      uint64_t object_no, uint64_t object_off, ceph::bufferlist&& cmp_data,
+      ceph::bufferlist&& write_data, IOContext io_context, int op_flags,
+      const ZTracer::Trace &parent_trace, uint64_t* mismatch_offset,
+      int* object_dispatch_flags, uint64_t* journal_tid,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override;
+
+  bool flush(
+      io::FlushSource flush_source, const ZTracer::Trace &parent_trace,
+      uint64_t* journal_tid, io::DispatchResult* dispatch_result,
+      Context** on_finish, Context* on_dispatched) override;
+
+  bool list_snaps(
+      uint64_t object_no, io::Extents&& extents, io::SnapIds&& snap_ids,
+      int list_snap_flags, const ZTracer::Trace &parent_trace,
+      io::SnapshotDelta* snapshot_delta, int* object_dispatch_flags,
+      io::DispatchResult* dispatch_result, Context** on_finish,
+      Context* on_dispatched) override {
+    return false;
+  }
+
+  bool invalidate_cache(Context* on_finish) override {
+    return false;
+  }
+  bool reset_existence_cache(Context* on_finish) override {
+    return false;
+  }
+
+  void extent_overwritten(
+      uint64_t object_no, uint64_t object_off, uint64_t object_len,
+      uint64_t journal_tid, uint64_t new_journal_tid) override;
+
+  int prepare_copyup(
+      uint64_t object_no,
+      io::SnapshotSparseBufferlist* snapshot_sparse_bufferlist)
override { + return 0; + } + +private: + ImageCtxT* m_image_ctx; + Journal* m_journal; + + void wait_or_flush_event(uint64_t journal_tid, int object_dispatch_flags, + Context* on_dispatched); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::ObjectDispatch; + +#endif // CEPH_LIBRBD_JOURNAL_OBJECT_DISPATCH_H diff --git a/src/librbd/journal/OpenRequest.cc b/src/librbd/journal/OpenRequest.cc new file mode 100644 index 000000000..eb01aa35a --- /dev/null +++ b/src/librbd/journal/OpenRequest.cc @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/OpenRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::OpenRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using util::C_DecodeTags; + +template +OpenRequest::OpenRequest(I *image_ctx, Journaler *journaler, ceph::mutex *lock, + journal::ImageClientMeta *client_meta, + uint64_t *tag_tid, journal::TagData *tag_data, + Context *on_finish) + : m_image_ctx(image_ctx), m_journaler(journaler), m_lock(lock), + m_client_meta(client_meta), m_tag_tid(tag_tid), m_tag_data(tag_data), + m_on_finish(on_finish) { +} + +template +void OpenRequest::send() { + send_init(); +} + +template +void OpenRequest::send_init() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + m_journaler->init(create_async_context_callback( + *m_image_ctx, create_context_callback< + OpenRequest, &OpenRequest::handle_init>(this))); +} + +template +void OpenRequest::handle_init(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to initialize journal: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + // locate the master image client record + cls::journal::Client client; + r = m_journaler->get_cached_client(Journal::IMAGE_CLIENT_ID, + &client); + if (r < 0) { + lderr(cct) << "failed to locate master image client" << dendl; + finish(r); + return; + } + + librbd::journal::ClientData client_data; + auto bl = client.data.cbegin(); + try { + decode(client_data, bl); + } catch (const buffer::error &err) { + lderr(cct) << "failed to decode client meta data: " << err.what() + << dendl; + finish(-EINVAL); + return; + } + + journal::ImageClientMeta *image_client_meta = + boost::get(&client_data.client_meta); + if (image_client_meta == nullptr) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to extract client meta data" << dendl; + finish(-EINVAL); + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": " + << "client: " << client << ", " + << "image meta: " << *image_client_meta << dendl; + + m_tag_class = image_client_meta->tag_class; + { + std::lock_guard locker{*m_lock}; + *m_client_meta = *image_client_meta; + } + + send_get_tags(); +} + +template +void OpenRequest::send_get_tags() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + C_DecodeTags *tags_ctx = new C_DecodeTags( + cct, m_lock, m_tag_tid, m_tag_data, 
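+      // the wrapped callback below fires after C_DecodeTags has decoded the
+      // fetched tag list and, under *m_lock, stored the newest tag's
+      // tid/data into *m_tag_tid/*m_tag_data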
create_async_context_callback( + *m_image_ctx, create_context_callback< + OpenRequest, &OpenRequest::handle_get_tags>(this))); + m_journaler->get_tags(m_tag_class, &tags_ctx->tags, tags_ctx); +} + +template +void OpenRequest::handle_get_tags(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << this << " " << __func__ << ": " + << "failed to decode journal tags: " << cpp_strerror(r) << dendl; + } + + finish(r); +} + +template +void OpenRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::OpenRequest; diff --git a/src/librbd/journal/OpenRequest.h b/src/librbd/journal/OpenRequest.h new file mode 100644 index 000000000..0f10bccf1 --- /dev/null +++ b/src/librbd/journal/OpenRequest.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H + +#include "common/ceph_mutex.h" +#include "include/int_types.h" +#include "librbd/journal/TypeTraits.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +struct ImageClientMeta; +struct TagData; + +template +class OpenRequest { +public: + typedef typename TypeTraits::Journaler Journaler; + + static OpenRequest* create(ImageCtxT *image_ctx, Journaler *journaler, + ceph::mutex *lock, journal::ImageClientMeta *client_meta, + uint64_t *tag_tid, journal::TagData *tag_data, + Context *on_finish) { + return new OpenRequest(image_ctx, journaler, lock, client_meta, tag_tid, + tag_data, on_finish); + } + + OpenRequest(ImageCtxT *image_ctx, Journaler *journaler, ceph::mutex *lock, + journal::ImageClientMeta *client_meta, uint64_t *tag_tid, + journal::TagData *tag_data, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * INIT + * | + * v + * GET_TAGS + * | + * v + * + * + * @endverbatim + */ + + + ImageCtxT *m_image_ctx; + Journaler *m_journaler; + ceph::mutex *m_lock; + journal::ImageClientMeta *m_client_meta; + uint64_t *m_tag_tid; + journal::TagData *m_tag_data; + Context *m_on_finish; + + uint64_t m_tag_class = 0; + + void send_init(); + void handle_init(int r); + + void send_get_tags(); + void handle_get_tags(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::OpenRequest; + +#endif // CEPH_LIBRBD_JOURNAL_OPEN_REQUEST_H diff --git a/src/librbd/journal/Policy.h b/src/librbd/journal/Policy.h new file mode 100644 index 000000000..1ced3c53e --- /dev/null +++ b/src/librbd/journal/Policy.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_POLICY_H +#define CEPH_LIBRBD_JOURNAL_POLICY_H + +class Context; + +namespace librbd { + +namespace journal { + +struct Policy { + virtual ~Policy() { + } + + virtual bool append_disabled() const = 0; + virtual bool journal_disabled() const = 0; + virtual void allocate_tag_on_lock(Context *on_finish) = 0; +}; + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_POLICY_H diff --git a/src/librbd/journal/PromoteRequest.cc b/src/librbd/journal/PromoteRequest.cc new file mode 100644 index 000000000..f7ae45a92 --- /dev/null +++ b/src/librbd/journal/PromoteRequest.cc @@ 
-0,0 +1,237 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/PromoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/journal/OpenRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::PromoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace journal { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; + +template +PromoteRequest::PromoteRequest(I *image_ctx, bool force, Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish), + m_lock(ceph::make_mutex("PromoteRequest::m_lock")) { +} + +template +void PromoteRequest::send() { + send_open(); +} + +template +void PromoteRequest::send_open() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + m_journaler = new Journaler(m_image_ctx->md_ctx, m_image_ctx->id, + Journal<>::IMAGE_CLIENT_ID, {}, nullptr); + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + PromoteRequest, &PromoteRequest::handle_open>(this)); + auto open_req = OpenRequest::create(m_image_ctx, m_journaler, + &m_lock, &m_client_meta, + &m_tag_tid, &m_tag_data, ctx); + open_req->send(); +} + +template +void PromoteRequest::handle_open(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to open journal: " << cpp_strerror(r) << dendl; + shut_down(); + return; + } + + allocate_tag(); +} + +template +void PromoteRequest::allocate_tag() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + journal::TagPredecessor predecessor; + if (!m_force && m_tag_data.mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) { + // orderly promotion -- demotion epoch will have a single entry + // so link to our predecessor (demotion) epoch + predecessor = TagPredecessor{Journal<>::ORPHAN_MIRROR_UUID, true, m_tag_tid, + 1}; + } else { + // forced promotion -- create an epoch no peers can link against + predecessor = TagPredecessor{Journal<>::LOCAL_MIRROR_UUID, true, m_tag_tid, + 0}; + } + + TagData tag_data; + tag_data.mirror_uuid = Journal<>::LOCAL_MIRROR_UUID; + tag_data.predecessor = predecessor; + + bufferlist tag_bl; + encode(tag_data, tag_bl); + + Context *ctx = create_context_callback< + PromoteRequest, &PromoteRequest::handle_allocate_tag>(this); + m_journaler->allocate_tag(m_client_meta.tag_class, tag_bl, &m_tag, ctx); +} + +template +void PromoteRequest::handle_allocate_tag(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl; + shut_down(); + return; + } + + m_tag_tid = m_tag.tid; + append_event(); +} + +template +void PromoteRequest::append_event() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + EventEntry event_entry{DemotePromoteEvent{}, {}}; + bufferlist event_entry_bl; + encode(event_entry, event_entry_bl); + + m_journaler->start_append(0); + m_future = m_journaler->append(m_tag_tid, event_entry_bl); + + auto ctx = create_context_callback< + PromoteRequest, &PromoteRequest::handle_append_event>(this); + 
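+  // a sketch of the append/flush contract relied on here (assumed
+  // semantics, using the names above): the Future returned by append()
+  // is flushed, and the callback fires only once the appended event
+  // entry is safe on disk, e.g.
+  //
+  //   Future future = m_journaler->append(m_tag_tid, event_entry_bl);
+  //   future.flush(on_safe);  // on_safe->complete(r) after durability
+  //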
m_future.flush(ctx); +} + +template +void PromoteRequest::handle_append_event(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to append promotion journal event: " + << cpp_strerror(r) << dendl; + stop_append(); + return; + } + + commit_event(); +} + +template +void PromoteRequest::commit_event() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + m_journaler->committed(m_future); + + auto ctx = create_context_callback< + PromoteRequest, &PromoteRequest::handle_commit_event>(this); + m_journaler->flush_commit_position(ctx); +} + +template +void PromoteRequest::handle_commit_event(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to flush promote commit position: " + << cpp_strerror(r) << dendl; + } + + stop_append(); +} + +template +void PromoteRequest::stop_append() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, &PromoteRequest::handle_stop_append>(this); + m_journaler->stop_append(ctx); +} + +template +void PromoteRequest::handle_stop_append(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (m_ret_val == 0) { + m_ret_val = r; + } + lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl; + } + + shut_down(); +} + +template +void PromoteRequest::shut_down() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + Context *ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + PromoteRequest, &PromoteRequest::handle_shut_down>(this)); + m_journaler->shut_down(ctx); +} + +template +void PromoteRequest::handle_shut_down(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to shut down journal: " << cpp_strerror(r) << dendl; + } + + delete m_journaler; + finish(r); +} + +template +void PromoteRequest::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::PromoteRequest; diff --git a/src/librbd/journal/PromoteRequest.h b/src/librbd/journal/PromoteRequest.h new file mode 100644 index 000000000..f6258066e --- /dev/null +++ b/src/librbd/journal/PromoteRequest.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H + +#include "include/int_types.h" +#include "common/ceph_mutex.h" +#include "cls/journal/cls_journal_types.h" +#include "journal/Future.h" +#include "librbd/journal/Types.h" +#include "librbd/journal/TypeTraits.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +template +class PromoteRequest { +public: + static PromoteRequest* create(ImageCtxT *image_ctx, bool force, + Context *on_finish) { + return new PromoteRequest(image_ctx, force, on_finish); + } + + PromoteRequest(ImageCtxT *image_ctx, bool force, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * OPEN * * * * * * * * * * + * | * + * v * + * ALLOCATE_TAG * * * * * * + * | * + * v * + * APPEND_EVENT * 
* * * + * | * * + * v * * + * COMMIT_EVENT * * + * | * * + * v * * + * STOP_APPEND <* * * * + * | * + * v * + * SHUT_DOWN <* * * * * * * + * | + * v + * + * + * @endverbatim + */ + + typedef typename TypeTraits::Journaler Journaler; + typedef typename TypeTraits::Future Future; + + ImageCtxT *m_image_ctx; + bool m_force; + Context *m_on_finish; + + Journaler *m_journaler = nullptr; + int m_ret_val = 0; + + ceph::mutex m_lock; + ImageClientMeta m_client_meta; + uint64_t m_tag_tid = 0; + TagData m_tag_data; + + cls::journal::Tag m_tag; + Future m_future; + + void send_open(); + void handle_open(int r); + + void allocate_tag(); + void handle_allocate_tag(int r); + + void append_event(); + void handle_append_event(int r); + + void commit_event(); + void handle_commit_event(int r); + + void stop_append(); + void handle_stop_append(int r); + + void shut_down(); + void handle_shut_down(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::PromoteRequest; + +#endif // CEPH_LIBRBD_JOURNAL_PROMOTE_REQUEST_H diff --git a/src/librbd/journal/RemoveRequest.cc b/src/librbd/journal/RemoveRequest.cc new file mode 100644 index 000000000..0f73a31ba --- /dev/null +++ b/src/librbd/journal/RemoveRequest.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/dout.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "journal/Settings.h" +#include "include/ceph_assert.h" +#include "librbd/Utils.h" +#include "librbd/journal/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Journal::RemoveRequest: " + +namespace librbd { + +using util::create_context_callback; + +namespace journal { + +template +RemoveRequest::RemoveRequest(IoCtx &ioctx, const std::string &image_id, + const std::string &client_id, + ContextWQ *op_work_queue, + Context *on_finish) + : m_ioctx(ioctx), m_image_id(image_id), m_image_client_id(client_id), + m_op_work_queue(op_work_queue), m_on_finish(on_finish) { + m_cct = reinterpret_cast(m_ioctx.cct()); +} + +template +void RemoveRequest::send() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + stat_journal(); +} + +template +void RemoveRequest::stat_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + ImageCtx::get_timer_instance(m_cct, &m_timer, &m_timer_lock); + m_journaler = new Journaler(m_op_work_queue, m_timer, m_timer_lock, m_ioctx, + m_image_id, m_image_client_id, {}, nullptr); + + using klass = RemoveRequest; + Context *ctx = create_context_callback(this); + + m_journaler->exists(ctx); +} + +template +Context *RemoveRequest::handle_stat_journal(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if ((*result < 0) && (*result != -ENOENT)) { + lderr(m_cct) << "failed to stat journal header: " << cpp_strerror(*result) << dendl; + shut_down_journaler(*result); + return nullptr; + } + + if (*result == -ENOENT) { + shut_down_journaler(0); + return nullptr; + } + + init_journaler(); + return nullptr; +} + +template +void RemoveRequest::init_journaler() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + using klass = RemoveRequest; + Context *ctx = create_context_callback(this); + + m_journaler->init(ctx); +} + +template +Context *RemoveRequest::handle_init_journaler(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if ((*result < 0) && (*result != -ENOENT)) { + 
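+    // a real initialization error -- abort via shut_down_journaler();
+    // note that -ENOENT deliberately falls through to remove_journal()
+    // below, since a partially initialized journal should still be removed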
lderr(m_cct) << "failed to init journaler: " << cpp_strerror(*result) << dendl; + shut_down_journaler(*result); + return nullptr; + } + + remove_journal(); + return nullptr; +} + +template +void RemoveRequest::remove_journal() { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + using klass = RemoveRequest; + Context *ctx = create_context_callback(this); + + m_journaler->remove(true, ctx); +} + +template +Context *RemoveRequest::handle_remove_journal(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to remove journal: " << cpp_strerror(*result) << dendl; + } + + shut_down_journaler(*result); + return nullptr; +} + +template +void RemoveRequest::shut_down_journaler(int r) { + ldout(m_cct, 20) << this << " " << __func__ << dendl; + + m_r_saved = r; + + using klass = RemoveRequest; + Context *ctx = create_context_callback(this); + + m_journaler->shut_down(ctx); +} + +template +Context *RemoveRequest::handle_journaler_shutdown(int *result) { + ldout(m_cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(*result) << dendl; + } + + delete m_journaler; + + if (m_r_saved == 0) { + ldout(m_cct, 20) << "done." << dendl; + } + + m_on_finish->complete(m_r_saved); + delete this; + + return nullptr; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::RemoveRequest; diff --git a/src/librbd/journal/RemoveRequest.h b/src/librbd/journal/RemoveRequest.h new file mode 100644 index 000000000..14b1c4dc5 --- /dev/null +++ b/src/librbd/journal/RemoveRequest.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "librbd/ImageCtx.h" +#include "journal/Journaler.h" +#include "librbd/journal/TypeTraits.h" +#include "common/Timer.h" + +using librados::IoCtx; +using journal::Journaler; + +class Context; +class ContextWQ; + +namespace journal { + class Journaler; +} + +namespace librbd { + +class ImageCtx; + +namespace journal { + +template +class RemoveRequest { +public: + static RemoveRequest *create(IoCtx &ioctx, const std::string &image_id, + const std::string &client_id, + ContextWQ *op_work_queue, Context *on_finish) { + return new RemoveRequest(ioctx, image_id, client_id, + op_work_queue, on_finish); + } + + void send(); + +private: + typedef typename TypeTraits::Journaler Journaler; + + RemoveRequest(IoCtx &ioctx, const std::string &image_id, + const std::string &client_id, + ContextWQ *op_work_queue, Context *on_finish); + + IoCtx &m_ioctx; + std::string m_image_id; + std::string m_image_client_id; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + Journaler *m_journaler; + SafeTimer *m_timer; + ceph::mutex *m_timer_lock; + int m_r_saved; + + void stat_journal(); + Context *handle_stat_journal(int *result); + + void init_journaler(); + Context *handle_init_journaler(int *result); + + void remove_journal(); + Context *handle_remove_journal(int *result); + + void shut_down_journaler(int r); + Context *handle_journaler_shutdown(int *result); +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::RemoveRequest; + +#endif // 
CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc new file mode 100644 index 000000000..42acf5eb2 --- /dev/null +++ b/src/librbd/journal/Replay.cc @@ -0,0 +1,1175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Replay.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " + +namespace librbd { +namespace journal { + +namespace { + +static const uint64_t IN_FLIGHT_IO_LOW_WATER_MARK(32); +static const uint64_t IN_FLIGHT_IO_HIGH_WATER_MARK(64); + +static NoOpProgressContext no_op_progress_callback; + +template +struct ExecuteOp : public Context { + I &image_ctx; + E event; + Context *on_op_complete; + + ExecuteOp(I &image_ctx, const E &event, Context *on_op_complete) + : image_ctx(image_ctx), event(event), on_op_complete(on_op_complete) { + } + + void execute(const journal::SnapCreateEvent &_) { + image_ctx.operations->execute_snap_create(event.snap_namespace, + event.snap_name, + on_op_complete, + event.op_tid, + SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE, + no_op_progress_callback); + } + + void execute(const journal::SnapRemoveEvent &_) { + image_ctx.operations->execute_snap_remove(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapRenameEvent &_) { + image_ctx.operations->execute_snap_rename(event.snap_id, + event.dst_snap_name, + on_op_complete); + } + + void execute(const journal::SnapProtectEvent &_) { + image_ctx.operations->execute_snap_protect(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapUnprotectEvent &_) { + image_ctx.operations->execute_snap_unprotect(event.snap_namespace, + event.snap_name, + on_op_complete); + } + + void execute(const journal::SnapRollbackEvent &_) { + image_ctx.operations->execute_snap_rollback(event.snap_namespace, + event.snap_name, + no_op_progress_callback, + on_op_complete); + } + + void execute(const journal::RenameEvent &_) { + image_ctx.operations->execute_rename(event.image_name, + on_op_complete); + } + + void execute(const journal::ResizeEvent &_) { + image_ctx.operations->execute_resize(event.size, true, no_op_progress_callback, + on_op_complete, event.op_tid); + } + + void execute(const journal::FlattenEvent &_) { + image_ctx.operations->execute_flatten(no_op_progress_callback, + on_op_complete); + } + + void execute(const journal::SnapLimitEvent &_) { + image_ctx.operations->execute_snap_set_limit(event.limit, on_op_complete); + } + + void execute(const journal::UpdateFeaturesEvent &_) { + image_ctx.operations->execute_update_features(event.features, event.enabled, + on_op_complete, event.op_tid); + } + + void execute(const journal::MetadataSetEvent &_) { + image_ctx.operations->execute_metadata_set(event.key, event.value, + on_op_complete); + } + + void execute(const journal::MetadataRemoveEvent &_) { + image_ctx.operations->execute_metadata_remove(event.key, on_op_complete); + } + + void finish(int r) override { + CephContext *cct = image_ctx.cct; + if (r < 0) { + lderr(cct) << ": 
ExecuteOp::" << __func__ << ": r=" << r << dendl; + on_op_complete->complete(r); + return; + } + + ldout(cct, 20) << ": ExecuteOp::" << __func__ << dendl; + std::shared_lock owner_locker{image_ctx.owner_lock}; + + if (image_ctx.exclusive_lock == nullptr || + !image_ctx.exclusive_lock->accept_ops()) { + ldout(cct, 5) << ": lost exclusive lock -- skipping op" << dendl; + on_op_complete->complete(-ECANCELED); + return; + } + + execute(event); + } +}; + +template +struct C_RefreshIfRequired : public Context { + I &image_ctx; + Context *on_finish; + + C_RefreshIfRequired(I &image_ctx, Context *on_finish) + : image_ctx(image_ctx), on_finish(on_finish) { + } + ~C_RefreshIfRequired() override { + delete on_finish; + } + + void finish(int r) override { + CephContext *cct = image_ctx.cct; + Context *ctx = on_finish; + on_finish = nullptr; + + if (r < 0) { + lderr(cct) << ": C_RefreshIfRequired::" << __func__ << ": r=" << r << dendl; + image_ctx.op_work_queue->queue(ctx, r); + return; + } + + if (image_ctx.state->is_refresh_required()) { + ldout(cct, 20) << ": C_RefreshIfRequired::" << __func__ << ": " + << "refresh required" << dendl; + image_ctx.state->refresh(ctx); + return; + } + + image_ctx.op_work_queue->queue(ctx, 0); + } +}; + +} // anonymous namespace + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::Replay: " << this << " " \ + << __func__ + +template +Replay::Replay(I &image_ctx) + : m_image_ctx(image_ctx) { +} + +template +Replay::~Replay() { + std::lock_guard locker{m_lock}; + ceph_assert(m_in_flight_aio_flush == 0); + ceph_assert(m_in_flight_aio_modify == 0); + ceph_assert(m_aio_modify_unsafe_contexts.empty()); + ceph_assert(m_aio_modify_safe_contexts.empty()); + ceph_assert(m_op_events.empty()); + ceph_assert(m_in_flight_op_events == 0); +} + +template +int Replay::decode(bufferlist::const_iterator *it, EventEntry *event_entry) { + try { + using ceph::decode; + decode(*event_entry, *it); + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; +} + +template +void Replay::process(const EventEntry &event_entry, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": on_ready=" << on_ready << ", on_safe=" << on_safe + << dendl; + + on_ready = util::create_async_context_callback(m_image_ctx, on_ready); + + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + if (m_image_ctx.exclusive_lock == nullptr || + !m_image_ctx.exclusive_lock->accept_ops()) { + ldout(cct, 5) << ": lost exclusive lock -- skipping event" << dendl; + m_image_ctx.op_work_queue->queue(on_safe, -ECANCELED); + on_ready->complete(0); + return; + } + + boost::apply_visitor(EventVisitor(this, on_ready, on_safe), + event_entry.event); +} + +template +void Replay::shut_down(bool cancel_ops, Context *on_finish) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + io::AioCompletion *flush_comp = nullptr; + on_finish = util::create_async_context_callback( + m_image_ctx, on_finish); + + { + std::lock_guard locker{m_lock}; + + // safely commit any remaining AIO modify operations + if ((m_in_flight_aio_flush + m_in_flight_aio_modify) != 0) { + flush_comp = create_aio_flush_completion(nullptr); + ceph_assert(flush_comp != nullptr); + } + + for (auto &op_event_pair : m_op_events) { + OpEvent &op_event = op_event_pair.second; + if (cancel_ops) { + // cancel ops that are waiting to start (waiting for + // OpFinishEvent or waiting for ready) + if (op_event.on_start_ready == nullptr && + op_event.on_op_finish_event != nullptr) { + 
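+          // cancel the blocked op by completing its pending OpFinishEvent
+          // context with -ERESTART (replay is shutting down before the op
+          // could start)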
Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART); + } + } else if (op_event.on_op_finish_event != nullptr) { + // start ops waiting for OpFinishEvent + Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, 0); + } else if (op_event.on_start_ready != nullptr) { + // waiting for op ready + op_event_pair.second.finish_on_ready = true; + } + } + + ceph_assert(!m_shut_down); + m_shut_down = true; + + ceph_assert(m_flush_ctx == nullptr); + if (m_in_flight_op_events > 0 || flush_comp != nullptr) { + std::swap(m_flush_ctx, on_finish); + } + } + + // execute the following outside of lock scope + if (flush_comp != nullptr) { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + io::ImageRequest::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + if (on_finish != nullptr) { + on_finish->complete(0); + } +} + +template +void Replay::flush(Context *on_finish) { + io::AioCompletion *aio_comp; + { + std::lock_guard locker{m_lock}; + aio_comp = create_aio_flush_completion( + util::create_async_context_callback(m_image_ctx, on_finish)); + if (aio_comp == nullptr) { + return; + } + } + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + io::ImageRequest::aio_flush(&m_image_ctx, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); +} + +template +void Replay::replay_op_ready(uint64_t op_tid, Context *on_resume) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": op_tid=" << op_tid << dendl; + + std::lock_guard locker{m_lock}; + auto op_it = m_op_events.find(op_tid); + ceph_assert(op_it != m_op_events.end()); + + OpEvent &op_event = op_it->second; + ceph_assert(op_event.op_in_progress && + op_event.on_op_finish_event == nullptr && + op_event.on_finish_ready == nullptr && + op_event.on_finish_safe == nullptr); + + // resume processing replay events + Context *on_start_ready = nullptr; + std::swap(on_start_ready, op_event.on_start_ready); + on_start_ready->complete(0); + + // cancel has been requested -- send error to paused state machine + if (!op_event.finish_on_ready && m_flush_ctx != nullptr) { + m_image_ctx.op_work_queue->queue(on_resume, -ERESTART); + return; + } + + // resume the op state machine once the associated OpFinishEvent + // is processed + op_event.on_op_finish_event = new LambdaContext( + [on_resume](int r) { + on_resume->complete(r); + }); + + // shut down request -- don't expect OpFinishEvent + if (op_event.finish_on_ready) { + m_image_ctx.op_work_queue->queue(on_resume, 0); + } +} + +template +void Replay::handle_event(const journal::AioDiscardEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO discard event" << dendl; + + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_DISCARD, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest::aio_discard(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + io::ImageArea::DATA, + event.discard_granularity_bytes, {}); + } + + if (flush_required) { + m_lock.lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + +template +void 
Replay::handle_event(const journal::AioWriteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO write event" << dendl; + + bufferlist data = event.data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_WRITE, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest::aio_write(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + io::ImageArea::DATA, std::move(data), + 0, {}); + } + + if (flush_required) { + m_lock.lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + +template +void Replay::handle_event(const journal::AioFlushEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO flush event" << dendl; + + io::AioCompletion *aio_comp; + { + std::lock_guard locker{m_lock}; + aio_comp = create_aio_flush_completion(on_safe); + } + + if (aio_comp != nullptr) { + io::ImageRequest::aio_flush(&m_image_ctx, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::AioWriteSameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO writesame event" << dendl; + + bufferlist data = event.data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_WRITESAME, + &flush_required, + {}); + if (aio_comp == nullptr) { + return; + } + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest::aio_writesame(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + io::ImageArea::DATA, std::move(data), + 0, {}); + } + + if (flush_required) { + m_lock.lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.unlock(); + + if (flush_comp != nullptr) { + io::ImageRequest::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } + } +} + + template + void Replay::handle_event(const journal::AioCompareAndWriteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": AIO CompareAndWrite event" << dendl; + + bufferlist cmp_data = event.cmp_data; + bufferlist write_data = event.write_data; + bool flush_required; + auto aio_comp = create_aio_modify_completion(on_ready, on_safe, + io::AIO_TYPE_COMPARE_AND_WRITE, + &flush_required, + {-EILSEQ}); + + if (!clipped_io(event.offset, aio_comp)) { + io::ImageRequest::aio_compare_and_write(&m_image_ctx, aio_comp, + {{event.offset, event.length}}, + io::ImageArea::DATA, + std::move(cmp_data), + std::move(write_data), + nullptr, 0, {}); + } + + if (flush_required) { + m_lock.lock(); + auto flush_comp = create_aio_flush_completion(nullptr); + m_lock.unlock(); + + io::ImageRequest::aio_flush(&m_image_ctx, flush_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + } +} + +template +void Replay::handle_event(const journal::OpFinishEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Op finish event: " + << "op_tid=" << event.op_tid << dendl; + + bool op_in_progress; + bool filter_ret_val; + Context *on_op_complete = nullptr; + Context *on_op_finish_event = nullptr; + { + std::lock_guard locker{m_lock}; + auto op_it = 
m_op_events.find(event.op_tid); + if (op_it == m_op_events.end()) { + ldout(cct, 10) << ": unable to locate associated op: assuming previously " + << "committed." << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, 0); + return; + } + + OpEvent &op_event = op_it->second; + ceph_assert(op_event.on_finish_safe == nullptr); + op_event.on_finish_ready = on_ready; + op_event.on_finish_safe = on_safe; + op_in_progress = op_event.op_in_progress; + std::swap(on_op_complete, op_event.on_op_complete); + std::swap(on_op_finish_event, op_event.on_op_finish_event); + + // special errors which indicate op never started but was recorded + // as failed in the journal + filter_ret_val = (op_event.op_finish_error_codes.count(event.r) != 0); + } + + if (event.r < 0) { + if (op_in_progress) { + // bubble the error up to the in-progress op to cancel it + on_op_finish_event->complete(event.r); + } else { + // op hasn't been started -- bubble the error up since + // our image is now potentially in an inconsistent state + // since simple errors should have been caught before + // creating the op event + delete on_op_complete; + delete on_op_finish_event; + handle_op_complete(event.op_tid, filter_ret_val ? 0 : event.r); + } + return; + } + + // journal recorded success -- apply the op now + on_op_finish_event->complete(0); +} + +template +void Replay::handle_event(const journal::SnapCreateEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap create event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)), + 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template +void Replay::handle_event(const journal::SnapRemoveEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap remove event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-ENOENT}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::SnapRenameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap rename event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const 
journal::SnapProtectEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap protect event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EBUSY}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::SnapUnprotectEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap unprotect event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, + event, + on_op_complete)); + + // ignore errors recorded in the journal + op_event->op_finish_error_codes = {-EBUSY}; + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EINVAL}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::SnapRollbackEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap rollback start event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, + event, + on_op_complete)); + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::RenameEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Rename event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EEXIST}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::ResizeEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Resize start event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)), 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template +void Replay::handle_event(const journal::FlattenEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Flatten 
start event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, event, + on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-EINVAL}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::DemotePromoteEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Demote/Promote event" << dendl; + on_ready->complete(0); + on_safe->complete(0); +} + +template +void Replay::handle_event(const journal::SnapLimitEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Snap limit event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + op_event->on_op_finish_event = new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp(m_image_ctx, + event, + on_op_complete)); + + op_event->ignore_error_codes = {-ERANGE}; + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::UpdateFeaturesEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Update features event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + // avoid lock cycles + m_image_ctx.op_work_queue->queue(new C_RefreshIfRequired( + m_image_ctx, new ExecuteOp( + m_image_ctx, event, on_op_complete)), 0); + + // do not process more events until the state machine is ready + // since it will affect IO + op_event->op_in_progress = true; + op_event->on_start_ready = on_ready; +} + +template +void Replay::handle_event(const journal::MetadataSetEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Metadata set event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + on_op_complete = new C_RefreshIfRequired(m_image_ctx, on_op_complete); + op_event->on_op_finish_event = util::create_async_context_callback( + m_image_ctx, new ExecuteOp( + m_image_ctx, event, on_op_complete)); + + on_ready->complete(0); +} + +template +void Replay::handle_event(const journal::MetadataRemoveEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": Metadata remove event" << dendl; + + std::lock_guard locker{m_lock}; + OpEvent *op_event; + Context *on_op_complete = create_op_context_callback(event.op_tid, on_ready, + on_safe, &op_event); + if (on_op_complete == nullptr) { + return; + } + + on_op_complete = new C_RefreshIfRequired(m_image_ctx, on_op_complete); + op_event->on_op_finish_event = util::create_async_context_callback( + m_image_ctx, new ExecuteOp( + m_image_ctx, event, on_op_complete)); + + // ignore errors caused due to replay + op_event->ignore_error_codes = {-ENOENT}; + + on_ready->complete(0); +} + +template 
+void Replay::handle_event(const journal::UnknownEvent &event, + Context *on_ready, Context *on_safe) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": unknown event" << dendl; + on_ready->complete(0); + on_safe->complete(0); +} + +template +void Replay::handle_aio_modify_complete(Context *on_ready, Context *on_safe, + int r, std::set &filters) { + std::lock_guard locker{m_lock}; + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": on_ready=" << on_ready << ", " + << "on_safe=" << on_safe << ", r=" << r << dendl; + + if (on_ready != nullptr) { + on_ready->complete(0); + } + + if (filters.find(r) != filters.end()) + r = 0; + + if (r < 0) { + lderr(cct) << ": AIO modify op failed: " << cpp_strerror(r) << dendl; + m_image_ctx.op_work_queue->queue(on_safe, r); + return; + } + + // will be completed after next flush operation completes + m_aio_modify_safe_contexts.insert(on_safe); +} + +template +void Replay::handle_aio_flush_complete(Context *on_flush_safe, + Contexts &on_safe_ctxs, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << ": AIO flush failed: " << cpp_strerror(r) << dendl; + } + + Context *on_aio_ready = nullptr; + Context *on_flush = nullptr; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_in_flight_aio_flush > 0); + ceph_assert(m_in_flight_aio_modify >= on_safe_ctxs.size()); + --m_in_flight_aio_flush; + m_in_flight_aio_modify -= on_safe_ctxs.size(); + + std::swap(on_aio_ready, m_on_aio_ready); + if (m_in_flight_op_events == 0 && + (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) { + on_flush = m_flush_ctx; + } + + // strip out previously failed on_safe contexts + for (auto it = on_safe_ctxs.begin(); it != on_safe_ctxs.end(); ) { + if (m_aio_modify_safe_contexts.erase(*it)) { + ++it; + } else { + it = on_safe_ctxs.erase(it); + } + } + } + + if (on_aio_ready != nullptr) { + ldout(cct, 10) << ": resuming paused AIO" << dendl; + on_aio_ready->complete(0); + } + + if (on_flush_safe != nullptr) { + on_safe_ctxs.push_back(on_flush_safe); + } + for (auto ctx : on_safe_ctxs) { + ldout(cct, 20) << ": completing safe context: " << ctx << dendl; + ctx->complete(r); + } + + if (on_flush != nullptr) { + ldout(cct, 20) << ": completing flush context: " << on_flush << dendl; + on_flush->complete(r); + } +} + +template +Context *Replay::create_op_context_callback(uint64_t op_tid, + Context *on_ready, + Context *on_safe, + OpEvent **op_event) { + CephContext *cct = m_image_ctx.cct; + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + return nullptr; + } + + ceph_assert(ceph_mutex_is_locked(m_lock)); + if (m_op_events.count(op_tid) != 0) { + lderr(cct) << ": duplicate op tid detected: " << op_tid << dendl; + + // on_ready is already async but on failure invoke on_safe async + // as well + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -EINVAL); + return nullptr; + } + + ++m_in_flight_op_events; + *op_event = &m_op_events[op_tid]; + (*op_event)->on_start_safe = on_safe; + + Context *on_op_complete = new C_OpOnComplete(this, op_tid); + (*op_event)->on_op_complete = on_op_complete; + return on_op_complete; +} + +template +void Replay::handle_op_complete(uint64_t op_tid, int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << ": op_tid=" << op_tid << ", " + << "r=" << r << dendl; + + OpEvent op_event; + bool shutting_down = false; + { + std::lock_guard 
locker{m_lock}; + auto op_it = m_op_events.find(op_tid); + ceph_assert(op_it != m_op_events.end()); + + op_event = std::move(op_it->second); + m_op_events.erase(op_it); + + if (m_shut_down) { + ceph_assert(m_flush_ctx != nullptr); + shutting_down = true; + } + } + + ceph_assert(op_event.on_start_ready == nullptr || (r < 0 && r != -ERESTART)); + if (op_event.on_start_ready != nullptr) { + // blocking op event failed before it became ready + ceph_assert(op_event.on_finish_ready == nullptr && + op_event.on_finish_safe == nullptr); + + op_event.on_start_ready->complete(0); + } else { + // event kicked off by OpFinishEvent + ceph_assert((op_event.on_finish_ready != nullptr && + op_event.on_finish_safe != nullptr) || shutting_down); + } + + if (op_event.on_op_finish_event != nullptr) { + op_event.on_op_finish_event->complete(r); + } + + if (op_event.on_finish_ready != nullptr) { + op_event.on_finish_ready->complete(0); + } + + // filter out errors caused by replay of the same op + if (r < 0 && op_event.ignore_error_codes.count(r) != 0) { + r = 0; + } + + op_event.on_start_safe->complete(r); + if (op_event.on_finish_safe != nullptr) { + op_event.on_finish_safe->complete(r); + } + + // shut down request might have occurred while lock was + // dropped -- handle if pending + Context *on_flush = nullptr; + { + std::lock_guard locker{m_lock}; + ceph_assert(m_in_flight_op_events > 0); + --m_in_flight_op_events; + if (m_in_flight_op_events == 0 && + (m_in_flight_aio_flush + m_in_flight_aio_modify) == 0) { + on_flush = m_flush_ctx; + } + } + if (on_flush != nullptr) { + m_image_ctx.op_work_queue->queue(on_flush, 0); + } +} + +template +io::AioCompletion * +Replay::create_aio_modify_completion(Context *on_ready, + Context *on_safe, + io::aio_type_t aio_type, + bool *flush_required, + std::set &&filters) { + std::lock_guard locker{m_lock}; + CephContext *cct = m_image_ctx.cct; + ceph_assert(m_on_aio_ready == nullptr); + + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + on_ready->complete(0); + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + return nullptr; + } + + ++m_in_flight_aio_modify; + m_aio_modify_unsafe_contexts.push_back(on_safe); + + // FLUSH if we hit the low-water mark -- on_safe contexts are + // completed by flushes-only so that we don't move the journal + // commit position until safely on-disk + + *flush_required = (m_aio_modify_unsafe_contexts.size() == + IN_FLIGHT_IO_LOW_WATER_MARK); + if (*flush_required) { + ldout(cct, 10) << ": hit AIO replay low-water mark: scheduling flush" + << dendl; + } + + // READY for more events if: + // * not at high-water mark for IO + // * in-flight ops are at a consistent point (snap create has IO flushed, + // shrink has adjusted clip boundary, etc) -- should have already been + // flagged not-ready + if (m_in_flight_aio_modify == IN_FLIGHT_IO_HIGH_WATER_MARK) { + ldout(cct, 10) << ": hit AIO replay high-water mark: pausing replay" + << dendl; + ceph_assert(m_on_aio_ready == nullptr); + std::swap(m_on_aio_ready, on_ready); + } + + // when the modification is ACKed by librbd, we can process the next + // event. 
when flushed, the completion of the next flush will fire the + // on_safe callback + auto aio_comp = io::AioCompletion::create_and_start( + new C_AioModifyComplete(this, on_ready, on_safe, std::move(filters)), + util::get_image_ctx(&m_image_ctx), aio_type); + return aio_comp; +} + +template +io::AioCompletion *Replay::create_aio_flush_completion(Context *on_safe) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + CephContext *cct = m_image_ctx.cct; + if (m_shut_down) { + ldout(cct, 5) << ": ignoring event after shut down" << dendl; + if (on_safe != nullptr) { + m_image_ctx.op_work_queue->queue(on_safe, -ESHUTDOWN); + } + return nullptr; + } + + ++m_in_flight_aio_flush; + + // associate all prior write/discard ops to this flush request + auto aio_comp = io::AioCompletion::create_and_start( + new C_AioFlushComplete(this, on_safe, + std::move(m_aio_modify_unsafe_contexts)), + util::get_image_ctx(&m_image_ctx), io::AIO_TYPE_FLUSH); + m_aio_modify_unsafe_contexts.clear(); + return aio_comp; +} + +template +bool Replay::clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp) { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.image_lock.lock_shared(); + size_t image_size = m_image_ctx.size; + m_image_ctx.image_lock.unlock_shared(); + + if (image_offset >= image_size) { + // rbd-mirror image sync might race an IO event w/ associated resize between + // the point the peer is registered and the sync point is created, so no-op + // IO events beyond the current image extents since under normal conditions + // it wouldn't have been recorded in the journal + ldout(cct, 5) << ": no-op IO event beyond image size" << dendl; + aio_comp->get(); + aio_comp->set_request_count(0); + aio_comp->put(); + return true; + } + + return false; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::Replay; diff --git a/src/librbd/journal/Replay.h b/src/librbd/journal/Replay.h new file mode 100644 index 000000000..038601833 --- /dev/null +++ b/src/librbd/journal/Replay.h @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_REPLAY_H +#define CEPH_LIBRBD_JOURNAL_REPLAY_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/Context.h" +#include "common/ceph_mutex.h" +#include "librbd/io/Types.h" +#include "librbd/journal/Types.h" +#include +#include +#include +#include + +namespace librbd { + +class ImageCtx; +namespace io { struct AioCompletion; } + +namespace journal { + +template +class Replay { +public: + static Replay *create(ImageCtxT &image_ctx) { + return new Replay(image_ctx); + } + + Replay(ImageCtxT &image_ctx); + ~Replay(); + + int decode(bufferlist::const_iterator *it, EventEntry *event_entry); + void process(const EventEntry &event_entry, + Context *on_ready, Context *on_safe); + + void shut_down(bool cancel_ops, Context *on_finish); + void flush(Context *on_finish); + + void replay_op_ready(uint64_t op_tid, Context *on_resume); + +private: + typedef std::unordered_set ReturnValues; + + struct OpEvent { + bool op_in_progress = false; + bool finish_on_ready = false; + Context *on_op_finish_event = nullptr; + Context *on_start_ready = nullptr; + Context *on_start_safe = nullptr; + Context *on_finish_ready = nullptr; + Context *on_finish_safe = nullptr; + Context *on_op_complete = nullptr; + ReturnValues op_finish_error_codes; + ReturnValues ignore_error_codes; + }; + + typedef std::list OpTids; + typedef std::list Contexts; + typedef 
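clipped_io() above consults the image size under a shared lock and completes out-of-bounds replayed IO as a zero-request no-op. A compact sketch of the same check, assuming std::shared_mutex stands in for ceph::shared_mutex and using an invented Image struct:

#include <cstdint>
#include <iostream>
#include <shared_mutex>

struct Image {
  mutable std::shared_mutex lock;
  uint64_t size = 1 << 20;  // 1 MiB
};

// Returns true when a replayed IO should be completed as a no-op because
// it starts at or beyond the current image size.
bool clip_io(const Image& image, uint64_t offset) {
  std::shared_lock<std::shared_mutex> locker(image.lock);
  return offset >= image.size;
}

int main() {
  Image image;
  std::cout << clip_io(image, 512) << "\n";       // 0: dispatch normally
  std::cout << clip_io(image, 2u << 20) << "\n";  // 1: no-op the event
}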
std::unordered_set ContextSet; + typedef std::unordered_map OpEvents; + + struct C_OpOnComplete : public Context { + Replay *replay; + uint64_t op_tid; + C_OpOnComplete(Replay *replay, uint64_t op_tid) + : replay(replay), op_tid(op_tid) { + } + void finish(int r) override { + replay->handle_op_complete(op_tid, r); + } + }; + + struct C_AioModifyComplete : public Context { + Replay *replay; + Context *on_ready; + Context *on_safe; + std::set filters; + C_AioModifyComplete(Replay *replay, Context *on_ready, + Context *on_safe, std::set &&filters) + : replay(replay), on_ready(on_ready), on_safe(on_safe), + filters(std::move(filters)) { + } + void finish(int r) override { + replay->handle_aio_modify_complete(on_ready, on_safe, r, filters); + } + }; + + struct C_AioFlushComplete : public Context { + Replay *replay; + Context *on_flush_safe; + Contexts on_safe_ctxs; + C_AioFlushComplete(Replay *replay, Context *on_flush_safe, + Contexts &&on_safe_ctxs) + : replay(replay), on_flush_safe(on_flush_safe), + on_safe_ctxs(on_safe_ctxs) { + } + void finish(int r) override { + replay->handle_aio_flush_complete(on_flush_safe, on_safe_ctxs, r); + } + }; + + struct EventVisitor : public boost::static_visitor { + Replay *replay; + Context *on_ready; + Context *on_safe; + + EventVisitor(Replay *_replay, Context *_on_ready, Context *_on_safe) + : replay(_replay), on_ready(_on_ready), on_safe(_on_safe) { + } + + template + inline void operator()(const Event &event) const { + replay->handle_event(event, on_ready, on_safe); + } + }; + + ImageCtxT &m_image_ctx; + + ceph::mutex m_lock = ceph::make_mutex("Replay::m_lock"); + + uint64_t m_in_flight_aio_flush = 0; + uint64_t m_in_flight_aio_modify = 0; + Contexts m_aio_modify_unsafe_contexts; + ContextSet m_aio_modify_safe_contexts; + + OpEvents m_op_events; + uint64_t m_in_flight_op_events = 0; + + bool m_shut_down = false; + Context *m_flush_ctx = nullptr; + Context *m_on_aio_ready = nullptr; + + void handle_event(const AioDiscardEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioWriteEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioWriteSameEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioCompareAndWriteEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const AioFlushEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const OpFinishEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapCreateEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapRemoveEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapRenameEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapProtectEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapUnprotectEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapRollbackEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const RenameEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const ResizeEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const FlattenEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const DemotePromoteEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const SnapLimitEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const UpdateFeaturesEvent &event, Context 
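The C_OpOnComplete, C_AioModifyComplete, and C_AioFlushComplete helpers above are hand-written Context adapters: each captures its arguments at construction and forwards completion back into Replay. The same shape can be expressed generically; the sketch below is illustrative only (similar in spirit to Ceph's own lambda-based contexts, but all types here are assumptions of the sketch):

#include <iostream>
#include <utility>

// Fire-once completion callback, matching the Context shape above.
struct Context {
  virtual ~Context() = default;
  void complete(int r) {
    finish(r);
    delete this;  // single-shot: the context frees itself
  }
 protected:
  virtual void finish(int r) = 0;
};

// Generic adapter: any callable becomes a Context.
template <typename F>
struct LambdaContext : Context {
  explicit LambdaContext(F f) : f(std::move(f)) {}
  F f;
 protected:
  void finish(int r) override { f(r); }
};

template <typename F>
Context* make_context(F f) {
  return new LambdaContext<F>(std::move(f));
}

int main() {
  Context* ctx = make_context([](int r) { std::cout << "r=" << r << "\n"; });
  ctx->complete(0);  // prints r=0, then self-destructs
}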
*on_ready, + Context *on_safe); + void handle_event(const MetadataSetEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const MetadataRemoveEvent &event, Context *on_ready, + Context *on_safe); + void handle_event(const UnknownEvent &event, Context *on_ready, + Context *on_safe); + + void handle_aio_modify_complete(Context *on_ready, Context *on_safe, + int r, std::set &filters); + void handle_aio_flush_complete(Context *on_flush_safe, Contexts &on_safe_ctxs, + int r); + + Context *create_op_context_callback(uint64_t op_tid, Context *on_ready, + Context *on_safe, OpEvent **op_event); + void handle_op_complete(uint64_t op_tid, int r); + + io::AioCompletion *create_aio_modify_completion(Context *on_ready, + Context *on_safe, + io::aio_type_t aio_type, + bool *flush_required, + std::set &&filters); + io::AioCompletion *create_aio_flush_completion(Context *on_safe); + void handle_aio_completion(io::AioCompletion *aio_comp); + + bool clipped_io(uint64_t image_offset, io::AioCompletion *aio_comp); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::Replay; + +#endif // CEPH_LIBRBD_JOURNAL_REPLAY_H diff --git a/src/librbd/journal/ResetRequest.cc b/src/librbd/journal/ResetRequest.cc new file mode 100644 index 000000000..895d0046e --- /dev/null +++ b/src/librbd/journal/ResetRequest.cc @@ -0,0 +1,162 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/ResetRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "journal/Journaler.h" +#include "journal/Settings.h" +#include "include/ceph_assert.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::ResetRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace journal { + +using util::create_async_context_callback; +using util::create_context_callback; + +template +void ResetRequest::send() { + init_journaler(); +} + +template +void ResetRequest::init_journaler() { + ldout(m_cct, 10) << dendl; + + m_journaler = new Journaler(m_io_ctx, m_image_id, m_client_id, {}, nullptr); + Context *ctx = create_context_callback< + ResetRequest, &ResetRequest::handle_init_journaler>(this); + m_journaler->init(ctx); +} + +template +void ResetRequest::handle_init_journaler(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + ldout(m_cct, 5) << "journal does not exist" << dendl; + m_ret_val = r; + } else if (r < 0) { + lderr(m_cct) << "failed to init journaler: " << cpp_strerror(r) << dendl; + m_ret_val = r; + } else { + int64_t pool_id; + m_journaler->get_metadata(&m_order, &m_splay_width, &pool_id); + + if (pool_id != -1) { + librados::Rados rados(m_io_ctx); + r = rados.pool_reverse_lookup(pool_id, &m_object_pool_name); + if (r < 0) { + lderr(m_cct) << "failed to lookup data pool: " << cpp_strerror(r) + << dendl; + m_ret_val = r; + } + } + } + + shut_down_journaler(); +} + +template +void ResetRequest::shut_down_journaler() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_async_context_callback( + m_op_work_queue, create_context_callback< + ResetRequest, &ResetRequest::handle_journaler_shutdown>(this)); + m_journaler->shut_down(ctx); +} + +template +void ResetRequest::handle_journaler_shutdown(int 
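Each journal event resolves to one of the handle_event overloads above via EventVisitor, which boost::apply_visitor drives over the Event variant. A minimal modern analogue using std::variant and std::visit (the event types below are placeholders, not the real journal events):

#include <iostream>
#include <variant>

struct AioWrite {};
struct SnapCreate {};
using Event = std::variant<AioWrite, SnapCreate>;

struct Replayer {
  void handle_event(const AioWrite&) { std::cout << "write\n"; }
  void handle_event(const SnapCreate&) { std::cout << "snap create\n"; }

  // Generic lambda picks the matching overload for the active alternative.
  void process(const Event& event) {
    std::visit([this](const auto& e) { handle_event(e); }, event);
  }
};

int main() {
  Replayer r;
  r.process(AioWrite{});
  r.process(SnapCreate{});
}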
r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + delete m_journaler; + if (r < 0) { + lderr(m_cct) << "failed to shut down journaler: " << cpp_strerror(r) + << dendl; + if (m_ret_val == 0) { + m_ret_val = r; + } + } + + if (m_ret_val < 0) { + finish(m_ret_val); + return; + } + + remove_journal(); +} + +template +void ResetRequest::remove_journal() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_context_callback< + ResetRequest, &ResetRequest::handle_remove_journal>(this); + auto req = RemoveRequest::create(m_io_ctx, m_image_id, m_client_id, + m_op_work_queue, ctx); + req->send(); +} + +template +void ResetRequest::handle_remove_journal(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to remove journal: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + create_journal(); +} + +template +void ResetRequest::create_journal() { + ldout(m_cct, 10) << dendl; + + Context *ctx = create_context_callback< + ResetRequest, &ResetRequest::handle_create_journal>(this); + journal::TagData tag_data(m_mirror_uuid); + auto req = CreateRequest::create(m_io_ctx, m_image_id, m_order, + m_splay_width, m_object_pool_name, + cls::journal::Tag::TAG_CLASS_NEW, + tag_data, m_client_id, m_op_work_queue, + ctx); + req->send(); +} + +template +void ResetRequest::handle_create_journal(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create journal: " << cpp_strerror(r) << dendl; + } + finish(r); +} + +template +void ResetRequest::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::ResetRequest; diff --git a/src/librbd/journal/ResetRequest.h b/src/librbd/journal/ResetRequest.h new file mode 100644 index 000000000..f9331f644 --- /dev/null +++ b/src/librbd/journal/ResetRequest.h @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H +#define CEPH_LIBRBD_JOURNAL_RESET_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rbd/librbd.hpp" +#include "librbd/journal/TypeTraits.h" +#include "common/Timer.h" +#include + +class Context; +class ContextWQ; + +namespace journal { class Journaler; } + +namespace librbd { + +class ImageCtx; + +namespace journal { + +template +class ResetRequest { +public: + static ResetRequest *create(librados::IoCtx &io_ctx, + const std::string &image_id, + const std::string &client_id, + const std::string &mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) { + return new ResetRequest(io_ctx, image_id, client_id, mirror_uuid, + op_work_queue, on_finish); + } + + ResetRequest(librados::IoCtx &io_ctx, const std::string &image_id, + const std::string &client_id, const std::string &mirror_uuid, + ContextWQ *op_work_queue, Context *on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), m_client_id(client_id), + m_mirror_uuid(mirror_uuid), m_op_work_queue(op_work_queue), + m_on_finish(on_finish), + m_cct(reinterpret_cast(m_io_ctx.cct())) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * INIT_JOURNALER + * | + * v + * SHUT_DOWN_JOURNALER + * | + * v + * REMOVE_JOURNAL + * | + * v + * CREATE_JOURNAL + * | + * v + * + * + * @endverbatim + */ + typedef typename TypeTraits::Journaler Journaler; + + librados::IoCtx &m_io_ctx; + std::string 
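ResetRequest above follows the self-deleting request pattern used throughout librbd: send() kicks off the first step, each handle_* callback either advances to the next step or short-circuits on error, and finish() completes the caller's context and then destroys the request. A toy version under those assumptions (Context and every name below are invented for the sketch):

#include <iostream>

struct Context {
  virtual ~Context() = default;
  virtual void complete(int r) = 0;
};

class ResetLikeRequest {
 public:
  explicit ResetLikeRequest(Context* on_finish) : m_on_finish(on_finish) {}

  void send() { step_init(); }

 private:
  void step_init() { std::cout << "init\n"; handle_init(0); }
  void handle_init(int r) {
    if (r < 0) { finish(r); return; }  // short-circuit on failure
    step_create();
  }
  void step_create() { std::cout << "create\n"; handle_create(0); }
  void handle_create(int r) { finish(r); }

  void finish(int r) {
    m_on_finish->complete(r);
    delete this;  // the request owns itself until completion
  }

  Context* m_on_finish;
};

struct PrintResult : Context {
  void complete(int r) override {
    std::cout << "finished r=" << r << "\n";
    delete this;
  }
};

int main() {
  (new ResetLikeRequest(new PrintResult()))->send();
}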
m_image_id; + std::string m_client_id; + std::string m_mirror_uuid; + ContextWQ *m_op_work_queue; + Context *m_on_finish; + + CephContext *m_cct; + Journaler *m_journaler = nullptr; + int m_ret_val = 0; + + uint8_t m_order = 0; + uint8_t m_splay_width = 0; + std::string m_object_pool_name; + + void init_journaler(); + void handle_init_journaler(int r); + + void shut_down_journaler(); + void handle_journaler_shutdown(int r); + + void remove_journal(); + void handle_remove_journal(int r); + + void create_journal(); + void handle_create_journal(int r); + + void finish(int r); + +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::ResetRequest; + +#endif // CEPH_LIBRBD_JOURNAL_REMOVE_REQUEST_H diff --git a/src/librbd/journal/StandardPolicy.cc b/src/librbd/journal/StandardPolicy.cc new file mode 100644 index 000000000..7f124aeef --- /dev/null +++ b/src/librbd/journal/StandardPolicy.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/StandardPolicy.h" +#include "librbd/ImageCtx.h" +#include "librbd/Journal.h" +#include "librbd/asio/ContextWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::StandardPolicy: " + +namespace librbd { +namespace journal { + +template +void StandardPolicy::allocate_tag_on_lock(Context *on_finish) { + ceph_assert(m_image_ctx->journal != nullptr); + + if (!m_image_ctx->journal->is_tag_owner()) { + lderr(m_image_ctx->cct) << "local image not promoted" << dendl; + m_image_ctx->op_work_queue->queue(on_finish, -EPERM); + return; + } + + m_image_ctx->journal->allocate_local_tag(on_finish); +} + +} // namespace journal +} // namespace librbd + +template class librbd::journal::StandardPolicy; diff --git a/src/librbd/journal/StandardPolicy.h b/src/librbd/journal/StandardPolicy.h new file mode 100644 index 000000000..ec8d0148f --- /dev/null +++ b/src/librbd/journal/StandardPolicy.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H +#define CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H + +#include "librbd/journal/Policy.h" + +namespace librbd { + +struct ImageCtx; + +namespace journal { + +template +class StandardPolicy : public Policy { +public: + StandardPolicy(ImageCtxT *image_ctx) : m_image_ctx(image_ctx) { + } + + bool append_disabled() const override { + return false; + } + bool journal_disabled() const override { + return false; + } + void allocate_tag_on_lock(Context *on_finish) override; + +private: + ImageCtxT *m_image_ctx; +}; + +} // namespace journal +} // namespace librbd + +extern template class librbd::journal::StandardPolicy; + +#endif // CEPH_LIBRBD_JOURNAL_STANDARD_POLICY_H diff --git a/src/librbd/journal/TypeTraits.h b/src/librbd/journal/TypeTraits.h new file mode 100644 index 000000000..51b025f6d --- /dev/null +++ b/src/librbd/journal/TypeTraits.h @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H +#define CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H + +struct ContextWQ; + +namespace journal { +class Future; +class Journaler; +class ReplayEntry; +} + +namespace librbd { +namespace journal { + +template +struct TypeTraits { + typedef ::journal::Journaler Journaler; + typedef ::journal::Future Future; + typedef ::journal::ReplayEntry ReplayEntry; + typedef 
::ContextWQ ContextWQ; +}; + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_TYPE_TRAITS_H diff --git a/src/librbd/journal/Types.cc b/src/librbd/journal/Types.cc new file mode 100644 index 000000000..d76a15e55 --- /dev/null +++ b/src/librbd/journal/Types.cc @@ -0,0 +1,956 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Types.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "include/types.h" +#include "common/Formatter.h" + +namespace librbd { +namespace journal { + +using ceph::encode; +using ceph::decode; + +namespace { + +template +class GetTypeVisitor : public boost::static_visitor { +public: + template + inline E operator()(const T&) const { + return T::TYPE; + } +}; + +class EncodeVisitor : public boost::static_visitor { +public: + explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) { + } + + template + inline void operator()(const T& t) const { + encode(static_cast(T::TYPE), m_bl); + t.encode(m_bl); + } +private: + bufferlist &m_bl; +}; + +class DecodeVisitor : public boost::static_visitor { +public: + DecodeVisitor(__u8 version, bufferlist::const_iterator &iter) + : m_version(version), m_iter(iter) { + } + + template + inline void operator()(T& t) const { + t.decode(m_version, m_iter); + } +private: + __u8 m_version; + bufferlist::const_iterator &m_iter; +}; + +class DumpVisitor : public boost::static_visitor { +public: + explicit DumpVisitor(Formatter *formatter, const std::string &key) + : m_formatter(formatter), m_key(key) {} + + template + inline void operator()(const T& t) const { + auto type = T::TYPE; + m_formatter->dump_string(m_key.c_str(), stringify(type)); + t.dump(m_formatter); + } +private: + ceph::Formatter *m_formatter; + std::string m_key; +}; + +} // anonymous namespace + +void AioDiscardEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + bool skip_partial_discard = (discard_granularity_bytes > 0); + encode(skip_partial_discard, bl); + encode(discard_granularity_bytes, bl); +} + +void AioDiscardEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + + bool skip_partial_discard = false; + if (version >= 4) { + decode(skip_partial_discard, it); + } + + if (version >= 5) { + decode(discard_granularity_bytes, it); + } else { + if (skip_partial_discard) { + // use a size larger than the maximum object size which will + // truncated down to object size during IO processing + discard_granularity_bytes = std::numeric_limits::max(); + } else { + discard_granularity_bytes = 0; + } + } +} + +void AioDiscardEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); + f->dump_unsigned("discard_granularity_bytes", discard_granularity_bytes); +} + +uint32_t AioWriteEvent::get_fixed_size() { + return EventEntry::get_fixed_size() + 16 /* offset, length */; +} + +void AioWriteEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + encode(data, bl); +} + +void AioWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + decode(data, it); +} + +void AioWriteEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +void AioWriteSameEvent::encode(bufferlist& bl) const { + using 
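AioDiscardEvent::decode above shows the versioned-decode convention used by all of these events: fields appended by newer encoders are read only when the stream's version promises them, and older streams get derived defaults (here, a sentinel granularity when the legacy skip_partial_discard flag was set). A self-contained miniature, with a raw byte cursor standing in for ceph::bufferlist:

#include <cstdint>
#include <cstring>
#include <iostream>

struct Cursor {
  const uint8_t* p;
};

template <typename T>
T read(Cursor& c) {
  T v;
  std::memcpy(&v, c.p, sizeof(v));
  c.p += sizeof(v);
  return v;
}

struct DiscardEvent {
  uint64_t offset = 0;
  uint64_t length = 0;
  uint32_t granularity = 0;

  void decode(uint8_t version, Cursor& c) {
    offset = read<uint64_t>(c);
    length = read<uint64_t>(c);
    bool skip_partial = false;
    if (version >= 4) {
      skip_partial = read<uint8_t>(c) != 0;  // field added in v4
    }
    if (version >= 5) {
      granularity = read<uint32_t>(c);       // field added in v5
    } else {
      // Older stream: derive a default from the legacy flag.
      granularity = skip_partial ? UINT32_MAX : 0;
    }
  }
};

int main() {
  uint8_t buf[16] = {};
  uint64_t off = 4096, len = 512;
  std::memcpy(buf, &off, sizeof(off));
  std::memcpy(buf + 8, &len, sizeof(len));
  Cursor c{buf};
  DiscardEvent ev;
  ev.decode(3, c);  // a version-3 stream carries only offset and length
  std::cout << ev.offset << " " << ev.length << " " << ev.granularity << "\n";
}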
ceph::encode; + encode(offset, bl); + encode(length, bl); + encode(data, bl); +} + +void AioWriteSameEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + decode(data, it); +} + +void AioWriteSameEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +uint32_t AioCompareAndWriteEvent::get_fixed_size() { + return EventEntry::get_fixed_size() + 32 /* offset, length */; +} + +void AioCompareAndWriteEvent::encode(bufferlist& bl) const { + using ceph::encode; + encode(offset, bl); + encode(length, bl); + encode(cmp_data, bl); + encode(write_data, bl); +} + +void AioCompareAndWriteEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(offset, it); + decode(length, it); + decode(cmp_data, it); + decode(write_data, it); +} + +void AioCompareAndWriteEvent::dump(Formatter *f) const { + f->dump_unsigned("offset", offset); + f->dump_unsigned("length", length); +} + +void AioFlushEvent::encode(bufferlist& bl) const { +} + +void AioFlushEvent::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void AioFlushEvent::dump(Formatter *f) const { +} + +void OpEventBase::encode(bufferlist& bl) const { + using ceph::encode; + encode(op_tid, bl); +} + +void OpEventBase::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(op_tid, it); +} + +void OpEventBase::dump(Formatter *f) const { + f->dump_unsigned("op_tid", op_tid); +} + +void OpFinishEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(op_tid, bl); + encode(r, bl); +} + +void OpFinishEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(op_tid, it); + decode(r, it); +} + +void OpFinishEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("op_tid", op_tid); + f->dump_int("result", r); +} + +void SnapEventBase::encode(bufferlist& bl) const { + using ceph::encode; + OpEventBase::encode(bl); + encode(snap_name, bl); + encode(snap_namespace, bl); +} + +void SnapEventBase::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + OpEventBase::decode(version, it); + using ceph::decode; + decode(snap_name, it); + if (version >= 4) { + decode(snap_namespace, it); + } +} + +void SnapEventBase::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("snap_name", snap_name); + snap_namespace.dump(f); +} + +void SnapCreateEvent::encode(bufferlist &bl) const { + SnapEventBase::encode(bl); +} + +void SnapCreateEvent::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + SnapEventBase::decode(version, it); + if (version == 3) { + decode(snap_namespace, it); + } +} + +void SnapCreateEvent::dump(Formatter *f) const { + SnapEventBase::dump(f); +} + +void SnapLimitEvent::encode(bufferlist &bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(limit, bl); +} + +void SnapLimitEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(limit, it); +} + +void SnapLimitEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("limit", limit); +} + +void SnapRenameEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(dst_snap_name, bl); + encode(snap_id, bl); + encode(src_snap_name, bl); +} + +void SnapRenameEvent::decode(__u8 
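One detail worth noting in OpFinishEvent above: op_tid is written twice, once through OpEventBase::encode() and again directly, and the decoder reads it twice to match. The redundancy is therefore wire-compatible as it stands; changing either side alone would break decoding of journals written by existing clients.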
version, bufferlist::const_iterator& it) { + using ceph::decode; + OpEventBase::decode(version, it); + decode(dst_snap_name, it); + decode(snap_id, it); + if (version >= 2) { + decode(src_snap_name, it); + } +} + +void SnapRenameEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("src_snap_id", snap_id); + f->dump_string("src_snap_name", src_snap_name); + f->dump_string("dest_snap_name", dst_snap_name); +} + +void RenameEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(image_name, bl); +} + +void RenameEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(image_name, it); +} + +void RenameEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("image_name", image_name); +} + +void ResizeEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(size, bl); +} + +void ResizeEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(size, it); +} + +void ResizeEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("size", size); +} + +void DemotePromoteEvent::encode(bufferlist& bl) const { +} + +void DemotePromoteEvent::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void DemotePromoteEvent::dump(Formatter *f) const { +} + +void UpdateFeaturesEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(features, bl); + encode(enabled, bl); +} + +void UpdateFeaturesEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(features, it); + decode(enabled, it); +} + +void UpdateFeaturesEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_unsigned("features", features); + f->dump_bool("enabled", enabled); +} + +void MetadataSetEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(key, bl); + encode(value, bl); +} + +void MetadataSetEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(key, it); + decode(value, it); +} + +void MetadataSetEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("key", key); + f->dump_string("value", value); +} + +void MetadataRemoveEvent::encode(bufferlist& bl) const { + OpEventBase::encode(bl); + using ceph::encode; + encode(key, bl); +} + +void MetadataRemoveEvent::decode(__u8 version, bufferlist::const_iterator& it) { + OpEventBase::decode(version, it); + using ceph::decode; + decode(key, it); +} + +void MetadataRemoveEvent::dump(Formatter *f) const { + OpEventBase::dump(f); + f->dump_string("key", key); +} + +void UnknownEvent::encode(bufferlist& bl) const { + ceph_abort(); +} + +void UnknownEvent::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void UnknownEvent::dump(Formatter *f) const { +} + +EventType EventEntry::get_event_type() const { + return boost::apply_visitor(GetTypeVisitor(), event); +} + +void EventEntry::encode(bufferlist& bl) const { + ENCODE_START(5, 1, bl); + boost::apply_visitor(EncodeVisitor(bl), event); + ENCODE_FINISH(bl); + encode_metadata(bl); +} + +void EventEntry::decode(bufferlist::const_iterator& it) { + DECODE_START(1, it); + + uint32_t event_type; + decode(event_type, it); + + // select the correct payload variant based upon the encoded op + switch (event_type) { + 
case EVENT_TYPE_AIO_DISCARD: + event = AioDiscardEvent(); + break; + case EVENT_TYPE_AIO_WRITE: + event = AioWriteEvent(); + break; + case EVENT_TYPE_AIO_FLUSH: + event = AioFlushEvent(); + break; + case EVENT_TYPE_OP_FINISH: + event = OpFinishEvent(); + break; + case EVENT_TYPE_SNAP_CREATE: + event = SnapCreateEvent(); + break; + case EVENT_TYPE_SNAP_REMOVE: + event = SnapRemoveEvent(); + break; + case EVENT_TYPE_SNAP_RENAME: + event = SnapRenameEvent(); + break; + case EVENT_TYPE_SNAP_PROTECT: + event = SnapProtectEvent(); + break; + case EVENT_TYPE_SNAP_UNPROTECT: + event = SnapUnprotectEvent(); + break; + case EVENT_TYPE_SNAP_ROLLBACK: + event = SnapRollbackEvent(); + break; + case EVENT_TYPE_RENAME: + event = RenameEvent(); + break; + case EVENT_TYPE_RESIZE: + event = ResizeEvent(); + break; + case EVENT_TYPE_FLATTEN: + event = FlattenEvent(); + break; + case EVENT_TYPE_DEMOTE_PROMOTE: + event = DemotePromoteEvent(); + break; + case EVENT_TYPE_SNAP_LIMIT: + event = SnapLimitEvent(); + break; + case EVENT_TYPE_UPDATE_FEATURES: + event = UpdateFeaturesEvent(); + break; + case EVENT_TYPE_METADATA_SET: + event = MetadataSetEvent(); + break; + case EVENT_TYPE_METADATA_REMOVE: + event = MetadataRemoveEvent(); + break; + case EVENT_TYPE_AIO_WRITESAME: + event = AioWriteSameEvent(); + break; + case EVENT_TYPE_AIO_COMPARE_AND_WRITE: + event = AioCompareAndWriteEvent(); + break; + default: + event = UnknownEvent(); + break; + } + + boost::apply_visitor(DecodeVisitor(struct_v, it), event); + DECODE_FINISH(it); + if (struct_v >= 4) { + decode_metadata(it); + } +} + +void EventEntry::dump(Formatter *f) const { + boost::apply_visitor(DumpVisitor(f, "event_type"), event); + f->dump_stream("timestamp") << timestamp; +} + +void EventEntry::encode_metadata(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(timestamp, bl); + ENCODE_FINISH(bl); +} + +void EventEntry::decode_metadata(bufferlist::const_iterator& it) { + DECODE_START(1, it); + decode(timestamp, it); + DECODE_FINISH(it); +} + +void EventEntry::generate_test_instances(std::list &o) { + o.push_back(new EventEntry(AioDiscardEvent())); + o.push_back(new EventEntry(AioDiscardEvent(123, 345, 4096), utime_t(1, 1))); + + bufferlist bl; + bl.append(std::string(32, '1')); + o.push_back(new EventEntry(AioWriteEvent())); + o.push_back(new EventEntry(AioWriteEvent(123, 456, bl), utime_t(1, 1))); + + o.push_back(new EventEntry(AioFlushEvent())); + + o.push_back(new EventEntry(OpFinishEvent(123, -1), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapCreateEvent(), utime_t(1, 1))); + o.push_back(new EventEntry(SnapCreateEvent(234, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapRemoveEvent())); + o.push_back(new EventEntry(SnapRemoveEvent(345, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapRenameEvent())); + o.push_back(new EventEntry(SnapRenameEvent(456, 1, "src snap", "dest snap"), + utime_t(1, 1))); + + o.push_back(new EventEntry(SnapProtectEvent())); + o.push_back(new EventEntry(SnapProtectEvent(567, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapUnprotectEvent())); + o.push_back(new EventEntry(SnapUnprotectEvent(678, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(SnapRollbackEvent())); + o.push_back(new EventEntry(SnapRollbackEvent(789, cls::rbd::UserSnapshotNamespace(), "snap"), utime_t(1, 1))); + + o.push_back(new EventEntry(RenameEvent())); + 
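EventEntry::decode above first maps the encoded type id onto the matching variant alternative, then lets the version-aware DecodeVisitor fill it in, with UnknownEvent as the forward-compatibility fallback for ids written by newer code. The construction step in miniature with std::variant (the structs are placeholders; the ids 1 and 11 are borrowed from the event-type enum used here):

#include <cstdint>
#include <iostream>
#include <variant>

struct AioWrite {};
struct Resize {};
struct Unknown {};
using Event = std::variant<AioWrite, Resize, Unknown>;

// Select the variant alternative from the encoded type id, falling back
// to Unknown for ids introduced by newer encoders.
Event make_event(uint32_t wire_type) {
  switch (wire_type) {
  case 1:
    return AioWrite{};
  case 11:
    return Resize{};
  default:
    return Unknown{};
  }
}

int main() {
  std::cout << make_event(11).index() << "\n";  // 1: Resize
  std::cout << make_event(99).index() << "\n";  // 2: Unknown
}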
o.push_back(new EventEntry(RenameEvent(890, "image name"), utime_t(1, 1))); + + o.push_back(new EventEntry(ResizeEvent())); + o.push_back(new EventEntry(ResizeEvent(901, 1234), utime_t(1, 1))); + + o.push_back(new EventEntry(FlattenEvent(123), utime_t(1, 1))); + + o.push_back(new EventEntry(DemotePromoteEvent())); + + o.push_back(new EventEntry(UpdateFeaturesEvent())); + o.push_back(new EventEntry(UpdateFeaturesEvent(123, 127, true), utime_t(1, 1))); + + o.push_back(new EventEntry(MetadataSetEvent())); + o.push_back(new EventEntry(MetadataSetEvent(123, "key", "value"), utime_t(1, 1))); + + o.push_back(new EventEntry(MetadataRemoveEvent())); + o.push_back(new EventEntry(MetadataRemoveEvent(123, "key"), utime_t(1, 1))); +} + +// Journal Client + +void ImageClientMeta::encode(bufferlist& bl) const { + using ceph::encode; + encode(tag_class, bl); + encode(resync_requested, bl); +} + +void ImageClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(tag_class, it); + decode(resync_requested, it); +} + +void ImageClientMeta::dump(Formatter *f) const { + f->dump_unsigned("tag_class", tag_class); + f->dump_bool("resync_requested", resync_requested); +} + +void MirrorPeerSyncPoint::encode(bufferlist& bl) const { + using ceph::encode; + encode(snap_name, bl); + encode(from_snap_name, bl); + encode(object_number, bl); + encode(snap_namespace, bl); +} + +void MirrorPeerSyncPoint::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(snap_name, it); + decode(from_snap_name, it); + decode(object_number, it); + if (version >= 2) { + decode(snap_namespace, it); + } +} + +void MirrorPeerSyncPoint::dump(Formatter *f) const { + f->dump_string("snap_name", snap_name); + f->dump_string("from_snap_name", from_snap_name); + if (object_number) { + f->dump_unsigned("object_number", *object_number); + } + snap_namespace.dump(f); +} + +void MirrorPeerClientMeta::encode(bufferlist& bl) const { + using ceph::encode; + encode(image_id, bl); + encode(static_cast(state), bl); + encode(sync_object_count, bl); + encode(static_cast(sync_points.size()), bl); + for (auto &sync_point : sync_points) { + sync_point.encode(bl); + } + encode(snap_seqs, bl); +} + +void MirrorPeerClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { + using ceph::decode; + decode(image_id, it); + + uint32_t decode_state; + decode(decode_state, it); + state = static_cast(decode_state); + + decode(sync_object_count, it); + + uint32_t sync_point_count; + decode(sync_point_count, it); + sync_points.resize(sync_point_count); + for (auto &sync_point : sync_points) { + sync_point.decode(version, it); + } + + decode(snap_seqs, it); +} + +void MirrorPeerClientMeta::dump(Formatter *f) const { + f->dump_string("image_id", image_id); + f->dump_stream("state") << state; + f->dump_unsigned("sync_object_count", sync_object_count); + f->open_array_section("sync_points"); + for (auto &sync_point : sync_points) { + f->open_object_section("sync_point"); + sync_point.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("snap_seqs"); + for (auto &pair : snap_seqs) { + f->open_object_section("snap_seq"); + f->dump_unsigned("local_snap_seq", pair.first); + f->dump_unsigned("peer_snap_seq", pair.second); + f->close_section(); + } + f->close_section(); +} + +void CliClientMeta::encode(bufferlist& bl) const { +} + +void CliClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void CliClientMeta::dump(Formatter *f) const { +} + +void 
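MirrorPeerClientMeta below encodes its sync points as a length-prefixed sequence: a count followed by each element, mirrored exactly on decode. The idiom in isolation, with a uint64_t vector standing in for the wire format (names invented for the sketch):

#include <cstdint>
#include <iostream>
#include <vector>

void encode_list(const std::vector<uint64_t>& in, std::vector<uint64_t>& wire) {
  wire.push_back(in.size());            // count first
  for (auto v : in) wire.push_back(v);  // then each element
}

std::vector<uint64_t> decode_list(const std::vector<uint64_t>& wire) {
  std::vector<uint64_t> out(wire.at(0));  // resize from the count
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = wire.at(1 + i);
  }
  return out;
}

int main() {
  std::vector<uint64_t> wire;
  encode_list({7, 8, 9}, wire);
  std::cout << decode_list(wire).size() << "\n";  // 3
}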
UnknownClientMeta::encode(bufferlist& bl) const { + ceph_abort(); +} + +void UnknownClientMeta::decode(__u8 version, bufferlist::const_iterator& it) { +} + +void UnknownClientMeta::dump(Formatter *f) const { +} + +ClientMetaType ClientData::get_client_meta_type() const { + return boost::apply_visitor(GetTypeVisitor(), client_meta); +} + +void ClientData::encode(bufferlist& bl) const { + ENCODE_START(2, 1, bl); + boost::apply_visitor(EncodeVisitor(bl), client_meta); + ENCODE_FINISH(bl); +} + +void ClientData::decode(bufferlist::const_iterator& it) { + DECODE_START(1, it); + + uint32_t client_meta_type; + decode(client_meta_type, it); + + // select the correct payload variant based upon the encoded op + switch (client_meta_type) { + case IMAGE_CLIENT_META_TYPE: + client_meta = ImageClientMeta(); + break; + case MIRROR_PEER_CLIENT_META_TYPE: + client_meta = MirrorPeerClientMeta(); + break; + case CLI_CLIENT_META_TYPE: + client_meta = CliClientMeta(); + break; + default: + client_meta = UnknownClientMeta(); + break; + } + + boost::apply_visitor(DecodeVisitor(struct_v, it), client_meta); + DECODE_FINISH(it); +} + +void ClientData::dump(Formatter *f) const { + boost::apply_visitor(DumpVisitor(f, "client_meta_type"), client_meta); +} + +void ClientData::generate_test_instances(std::list &o) { + o.push_back(new ClientData(ImageClientMeta())); + o.push_back(new ClientData(ImageClientMeta(123))); + o.push_back(new ClientData(MirrorPeerClientMeta())); + o.push_back(new ClientData(MirrorPeerClientMeta("image_id", + {{{}, "snap 2", "snap 1", 123}}, + {{1, 2}, {3, 4}}))); + o.push_back(new ClientData(CliClientMeta())); +} + +// Journal Tag + +void TagPredecessor::encode(bufferlist& bl) const { + using ceph::encode; + encode(mirror_uuid, bl); + encode(commit_valid, bl); + encode(tag_tid, bl); + encode(entry_tid, bl); +} + +void TagPredecessor::decode(bufferlist::const_iterator& it) { + using ceph::decode; + decode(mirror_uuid, it); + decode(commit_valid, it); + decode(tag_tid, it); + decode(entry_tid, it); +} + +void TagPredecessor::dump(Formatter *f) const { + f->dump_string("mirror_uuid", mirror_uuid); + f->dump_string("commit_valid", commit_valid ? 
"true" : "false"); + f->dump_unsigned("tag_tid", tag_tid); + f->dump_unsigned("entry_tid", entry_tid); +} + +void TagData::encode(bufferlist& bl) const { + using ceph::encode; + encode(mirror_uuid, bl); + predecessor.encode(bl); +} + +void TagData::decode(bufferlist::const_iterator& it) { + using ceph::decode; + decode(mirror_uuid, it); + predecessor.decode(it); +} + +void TagData::dump(Formatter *f) const { + f->dump_string("mirror_uuid", mirror_uuid); + f->open_object_section("predecessor"); + predecessor.dump(f); + f->close_section(); +} + +void TagData::generate_test_instances(std::list &o) { + o.push_back(new TagData()); + o.push_back(new TagData("mirror-uuid")); + o.push_back(new TagData("mirror-uuid", "remote-mirror-uuid", true, 123, 234)); +} + +std::ostream &operator<<(std::ostream &out, const EventType &type) { + using namespace librbd::journal; + + switch (type) { + case EVENT_TYPE_AIO_DISCARD: + out << "AioDiscard"; + break; + case EVENT_TYPE_AIO_WRITE: + out << "AioWrite"; + break; + case EVENT_TYPE_AIO_FLUSH: + out << "AioFlush"; + break; + case EVENT_TYPE_OP_FINISH: + out << "OpFinish"; + break; + case EVENT_TYPE_SNAP_CREATE: + out << "SnapCreate"; + break; + case EVENT_TYPE_SNAP_REMOVE: + out << "SnapRemove"; + break; + case EVENT_TYPE_SNAP_RENAME: + out << "SnapRename"; + break; + case EVENT_TYPE_SNAP_PROTECT: + out << "SnapProtect"; + break; + case EVENT_TYPE_SNAP_UNPROTECT: + out << "SnapUnprotect"; + break; + case EVENT_TYPE_SNAP_ROLLBACK: + out << "SnapRollback"; + break; + case EVENT_TYPE_RENAME: + out << "Rename"; + break; + case EVENT_TYPE_RESIZE: + out << "Resize"; + break; + case EVENT_TYPE_FLATTEN: + out << "Flatten"; + break; + case EVENT_TYPE_DEMOTE_PROMOTE: + out << "Demote/Promote"; + break; + case EVENT_TYPE_SNAP_LIMIT: + out << "SnapLimit"; + break; + case EVENT_TYPE_UPDATE_FEATURES: + out << "UpdateFeatures"; + break; + case EVENT_TYPE_METADATA_SET: + out << "MetadataSet"; + break; + case EVENT_TYPE_METADATA_REMOVE: + out << "MetadataRemove"; + break; + case EVENT_TYPE_AIO_WRITESAME: + out << "AioWriteSame"; + break; + case EVENT_TYPE_AIO_COMPARE_AND_WRITE: + out << "AioCompareAndWrite"; + break; + default: + out << "Unknown (" << static_cast(type) << ")"; + break; + } + return out; +} + +std::ostream &operator<<(std::ostream &out, const ClientMetaType &type) { + using namespace librbd::journal; + + switch (type) { + case IMAGE_CLIENT_META_TYPE: + out << "Master Image"; + break; + case MIRROR_PEER_CLIENT_META_TYPE: + out << "Mirror Peer"; + break; + case CLI_CLIENT_META_TYPE: + out << "CLI Tool"; + break; + default: + out << "Unknown (" << static_cast(type) << ")"; + break; + } + return out; +} + +std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta) { + out << "[tag_class=" << meta.tag_class << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync) { + out << "[snap_name=" << sync.snap_name << ", " + << "from_snap_name=" << sync.from_snap_name; + if (sync.object_number) { + out << ", " << *sync.object_number; + } + out << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const MirrorPeerState &state) { + switch (state) { + case MIRROR_PEER_STATE_SYNCING: + out << "Syncing"; + break; + case MIRROR_PEER_STATE_REPLAYING: + out << "Replaying"; + break; + default: + out << "Unknown (" << static_cast(state) << ")"; + break; + } + return out; +} + +std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta) { + out << "[image_id=" << meta.image_id << ", " + << 
"state=" << meta.state << ", " + << "sync_object_count=" << meta.sync_object_count << ", " + << "sync_points=["; + std::string delimiter; + for (auto &sync_point : meta.sync_points) { + out << delimiter << "[" << sync_point << "]"; + delimiter = ", "; + } + out << "], snap_seqs=["; + delimiter = ""; + for (auto &pair : meta.snap_seqs) { + out << delimiter << "[" + << "local_snap_seq=" << pair.first << ", " + << "peer_snap_seq" << pair.second << "]"; + delimiter = ", "; + } + out << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor) { + out << "[" + << "mirror_uuid=" << predecessor.mirror_uuid; + if (predecessor.commit_valid) { + out << ", " + << "tag_tid=" << predecessor.tag_tid << ", " + << "entry_tid=" << predecessor.entry_tid; + } + out << "]"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const TagData &tag_data) { + out << "[" + << "mirror_uuid=" << tag_data.mirror_uuid << ", " + << "predecessor=" << tag_data.predecessor + << "]"; + return out; +} + +} // namespace journal +} // namespace librbd + diff --git a/src/librbd/journal/Types.h b/src/librbd/journal/Types.h new file mode 100644 index 000000000..d57858a15 --- /dev/null +++ b/src/librbd/journal/Types.h @@ -0,0 +1,685 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_TYPES_H +#define CEPH_LIBRBD_JOURNAL_TYPES_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/types.h" +#include "include/utime.h" +#include "librbd/Types.h" +#include +#include +#include +#include +#include +#include + +namespace ceph { +class Formatter; +} + +namespace librbd { +namespace journal { + +enum EventType { + EVENT_TYPE_AIO_DISCARD = 0, + EVENT_TYPE_AIO_WRITE = 1, + EVENT_TYPE_AIO_FLUSH = 2, + EVENT_TYPE_OP_FINISH = 3, + EVENT_TYPE_SNAP_CREATE = 4, + EVENT_TYPE_SNAP_REMOVE = 5, + EVENT_TYPE_SNAP_RENAME = 6, + EVENT_TYPE_SNAP_PROTECT = 7, + EVENT_TYPE_SNAP_UNPROTECT = 8, + EVENT_TYPE_SNAP_ROLLBACK = 9, + EVENT_TYPE_RENAME = 10, + EVENT_TYPE_RESIZE = 11, + EVENT_TYPE_FLATTEN = 12, + EVENT_TYPE_DEMOTE_PROMOTE = 13, + EVENT_TYPE_SNAP_LIMIT = 14, + EVENT_TYPE_UPDATE_FEATURES = 15, + EVENT_TYPE_METADATA_SET = 16, + EVENT_TYPE_METADATA_REMOVE = 17, + EVENT_TYPE_AIO_WRITESAME = 18, + EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19, +}; + +struct AioDiscardEvent { + static const EventType TYPE = EVENT_TYPE_AIO_DISCARD; + + uint64_t offset = 0; + uint64_t length = 0; + uint32_t discard_granularity_bytes = 0; + + AioDiscardEvent() { + } + AioDiscardEvent(uint64_t _offset, uint64_t _length, + uint32_t discard_granularity_bytes) + : offset(_offset), length(_length), + discard_granularity_bytes(discard_granularity_bytes) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioWriteEvent { + static const EventType TYPE = EVENT_TYPE_AIO_WRITE; + + uint64_t offset; + uint64_t length; + bufferlist data; + + static uint32_t get_fixed_size(); + + AioWriteEvent() : offset(0), length(0) { + } + AioWriteEvent(uint64_t _offset, uint64_t _length, const bufferlist &_data) + : offset(_offset), length(_length), data(_data) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioWriteSameEvent { + static const EventType TYPE = 
EVENT_TYPE_AIO_WRITESAME; + + uint64_t offset; + uint64_t length; + bufferlist data; + + AioWriteSameEvent() : offset(0), length(0) { + } + AioWriteSameEvent(uint64_t _offset, uint64_t _length, + const bufferlist &_data) + : offset(_offset), length(_length), data(_data) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioCompareAndWriteEvent { + static const EventType TYPE = EVENT_TYPE_AIO_COMPARE_AND_WRITE; + + uint64_t offset; + uint64_t length; + bufferlist cmp_data; + bufferlist write_data; + + static uint32_t get_fixed_size(); + + AioCompareAndWriteEvent() : offset(0), length(0) { + } + AioCompareAndWriteEvent(uint64_t _offset, uint64_t _length, + const bufferlist &_cmp_data, const bufferlist &_write_data) + : offset(_offset), length(_length), cmp_data(_cmp_data), write_data(_write_data) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct AioFlushEvent { + static const EventType TYPE = EVENT_TYPE_AIO_FLUSH; + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct OpEventBase { + uint64_t op_tid; + +protected: + OpEventBase() : op_tid(0) { + } + OpEventBase(uint64_t op_tid) : op_tid(op_tid) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct OpFinishEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_OP_FINISH; + + int r; + + OpFinishEvent() : r(0) { + } + OpFinishEvent(uint64_t op_tid, int r) : OpEventBase(op_tid), r(r) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapEventBase : public OpEventBase { + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + +protected: + SnapEventBase() { + } + SnapEventBase(uint64_t op_tid, const cls::rbd::SnapshotNamespace& _snap_namespace, + const std::string &_snap_name) + : OpEventBase(op_tid), + snap_namespace(_snap_namespace), + snap_name(_snap_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapCreateEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_CREATE; + + SnapCreateEvent() { + } + SnapCreateEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapRemoveEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_REMOVE; + + SnapRemoveEvent() { + } + SnapRemoveEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct SnapRenameEvent : public OpEventBase{ + static const EventType TYPE = EVENT_TYPE_SNAP_RENAME; + + uint64_t snap_id; + std::string src_snap_name; + std::string dst_snap_name; + + SnapRenameEvent() : snap_id(CEPH_NOSNAP) { + } + SnapRenameEvent(uint64_t op_tid, uint64_t 
src_snap_id, + const std::string &src_snap_name, + const std::string &dest_snap_name) + : OpEventBase(op_tid), + snap_id(src_snap_id), + src_snap_name(src_snap_name), + dst_snap_name(dest_snap_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapProtectEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_PROTECT; + + SnapProtectEvent() { + } + SnapProtectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct SnapUnprotectEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_UNPROTECT; + + SnapUnprotectEvent() { + } + SnapUnprotectEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct SnapLimitEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_LIMIT; + uint64_t limit; + + SnapLimitEvent() { + } + SnapLimitEvent(uint64_t op_tid, const uint64_t _limit) + : OpEventBase(op_tid), limit(_limit) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct SnapRollbackEvent : public SnapEventBase { + static const EventType TYPE = EVENT_TYPE_SNAP_ROLLBACK; + + SnapRollbackEvent() { + } + SnapRollbackEvent(uint64_t op_tid, const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name) + : SnapEventBase(op_tid, snap_namespace, snap_name) { + } + + using SnapEventBase::encode; + using SnapEventBase::decode; + using SnapEventBase::dump; +}; + +struct RenameEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_RENAME; + + std::string image_name; + + RenameEvent() { + } + RenameEvent(uint64_t op_tid, const std::string &_image_name) + : OpEventBase(op_tid), image_name(_image_name) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct ResizeEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_RESIZE; + + uint64_t size; + + ResizeEvent() : size(0) { + } + ResizeEvent(uint64_t op_tid, uint64_t _size) + : OpEventBase(op_tid), size(_size) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct FlattenEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_FLATTEN; + + FlattenEvent() { + } + FlattenEvent(uint64_t op_tid) : OpEventBase(op_tid) { + } + + using OpEventBase::encode; + using OpEventBase::decode; + using OpEventBase::dump; +}; + +struct DemotePromoteEvent { + static const EventType TYPE = static_cast( + EVENT_TYPE_DEMOTE_PROMOTE); + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct UpdateFeaturesEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_UPDATE_FEATURES; + + uint64_t features; + bool enabled; + + UpdateFeaturesEvent() : features(0), enabled(false) { + } + UpdateFeaturesEvent(uint64_t op_tid, uint64_t _features, bool 
_enabled) + : OpEventBase(op_tid), features(_features), enabled(_enabled) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct MetadataSetEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_METADATA_SET; + + std::string key; + std::string value; + + MetadataSetEvent() { + } + MetadataSetEvent(uint64_t op_tid, const std::string &_key, const std::string &_value) + : OpEventBase(op_tid), key(_key), value(_value) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct MetadataRemoveEvent : public OpEventBase { + static const EventType TYPE = EVENT_TYPE_METADATA_REMOVE; + + std::string key; + + MetadataRemoveEvent() { + } + MetadataRemoveEvent(uint64_t op_tid, const std::string &_key) + : OpEventBase(op_tid), key(_key) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct UnknownEvent { + static const EventType TYPE = static_cast(-1); + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +typedef boost::mpl::vector EventVector; +typedef boost::make_variant_over::type Event; + +struct EventEntry { + static uint32_t get_fixed_size() { + return EVENT_FIXED_SIZE + METADATA_FIXED_SIZE; + } + + EventEntry() : event(UnknownEvent()) { + } + EventEntry(const Event &_event, const utime_t &_timestamp = utime_t()) + : event(_event), timestamp(_timestamp) { + } + + Event event; + utime_t timestamp; + + EventType get_event_type() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list &o); + +private: + static const uint32_t EVENT_FIXED_SIZE = 14; /// version encoding, type + static const uint32_t METADATA_FIXED_SIZE = 14; /// version encoding, timestamp + + void encode_metadata(bufferlist& bl) const; + void decode_metadata(bufferlist::const_iterator& it); +}; + +// Journal Client data structures + +enum ClientMetaType { + IMAGE_CLIENT_META_TYPE = 0, + MIRROR_PEER_CLIENT_META_TYPE = 1, + CLI_CLIENT_META_TYPE = 2 +}; + +struct ImageClientMeta { + static const ClientMetaType TYPE = IMAGE_CLIENT_META_TYPE; + + uint64_t tag_class = 0; + bool resync_requested = false; + + ImageClientMeta() { + } + ImageClientMeta(uint64_t tag_class) : tag_class(tag_class) { + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct MirrorPeerSyncPoint { + typedef boost::optional ObjectNumber; + + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + std::string from_snap_name; + ObjectNumber object_number; + + MirrorPeerSyncPoint() : MirrorPeerSyncPoint({}, "", "", boost::none) { + } + MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + const ObjectNumber &object_number) + : MirrorPeerSyncPoint(snap_namespace, snap_name, "", object_number) { + } + MirrorPeerSyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string &snap_name, + const std::string &from_snap_name, + const ObjectNumber &object_number) + : snap_namespace(snap_namespace), snap_name(snap_name), + from_snap_name(from_snap_name), object_number(object_number) { + } + + inline 
bool operator==(const MirrorPeerSyncPoint &sync) const { + return (snap_name == sync.snap_name && + from_snap_name == sync.from_snap_name && + object_number == sync.object_number && + snap_namespace == sync.snap_namespace); + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +enum MirrorPeerState { + MIRROR_PEER_STATE_SYNCING, + MIRROR_PEER_STATE_REPLAYING +}; + +struct MirrorPeerClientMeta { + typedef std::list SyncPoints; + + static const ClientMetaType TYPE = MIRROR_PEER_CLIENT_META_TYPE; + + std::string image_id; + MirrorPeerState state = MIRROR_PEER_STATE_SYNCING; ///< replay state + uint64_t sync_object_count = 0; ///< maximum number of objects ever sync'ed + SyncPoints sync_points; ///< max two in-use snapshots for sync + SnapSeqs snap_seqs; ///< local to peer snap seq mapping + + MirrorPeerClientMeta() { + } + MirrorPeerClientMeta(const std::string &image_id, + const SyncPoints &sync_points = SyncPoints(), + const SnapSeqs &snap_seqs = SnapSeqs()) + : image_id(image_id), sync_points(sync_points), snap_seqs(snap_seqs) { + } + + inline bool operator==(const MirrorPeerClientMeta &meta) const { + return (image_id == meta.image_id && + state == meta.state && + sync_object_count == meta.sync_object_count && + sync_points == meta.sync_points && + snap_seqs == meta.snap_seqs); + } + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct CliClientMeta { + static const ClientMetaType TYPE = CLI_CLIENT_META_TYPE; + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct UnknownClientMeta { + static const ClientMetaType TYPE = static_cast(-1); + + void encode(bufferlist& bl) const; + void decode(__u8 version, bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +typedef boost::variant ClientMeta; + +struct ClientData { + ClientData() { + } + ClientData(const ClientMeta &client_meta) : client_meta(client_meta) { + } + + ClientMeta client_meta; + + ClientMetaType get_client_meta_type() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list &o); +}; + +// Journal Tag data structures + +struct TagPredecessor { + std::string mirror_uuid; // empty if local + bool commit_valid = false; + uint64_t tag_tid = 0; + uint64_t entry_tid = 0; + + TagPredecessor() { + } + TagPredecessor(const std::string &mirror_uuid, bool commit_valid, + uint64_t tag_tid, uint64_t entry_tid) + : mirror_uuid(mirror_uuid), commit_valid(commit_valid), tag_tid(tag_tid), + entry_tid(entry_tid) { + } + + inline bool operator==(const TagPredecessor &rhs) const { + return (mirror_uuid == rhs.mirror_uuid && + commit_valid == rhs.commit_valid && + tag_tid == rhs.tag_tid && + entry_tid == rhs.entry_tid); + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; +}; + +struct TagData { + // owner of the tag (exclusive lock epoch) + std::string mirror_uuid; // empty if local + + // mapping to last committed record of previous tag + TagPredecessor predecessor; + + TagData() { + } + TagData(const std::string &mirror_uuid) : mirror_uuid(mirror_uuid) { + } + TagData(const std::string &mirror_uuid, + const std::string &predecessor_mirror_uuid, + bool predecessor_commit_valid, 
+ uint64_t predecessor_tag_tid, uint64_t predecessor_entry_tid) + : mirror_uuid(mirror_uuid), + predecessor(predecessor_mirror_uuid, predecessor_commit_valid, + predecessor_tag_tid, predecessor_entry_tid) { + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list &o); +}; + +std::ostream &operator<<(std::ostream &out, const EventType &type); +std::ostream &operator<<(std::ostream &out, const ClientMetaType &type); +std::ostream &operator<<(std::ostream &out, const ImageClientMeta &meta); +std::ostream &operator<<(std::ostream &out, const MirrorPeerSyncPoint &sync); +std::ostream &operator<<(std::ostream &out, const MirrorPeerState &meta); +std::ostream &operator<<(std::ostream &out, const MirrorPeerClientMeta &meta); +std::ostream &operator<<(std::ostream &out, const TagPredecessor &predecessor); +std::ostream &operator<<(std::ostream &out, const TagData &tag_data); + +struct Listener { + virtual ~Listener() { + } + + /// invoked when journal close is requested + virtual void handle_close() = 0; + + /// invoked when journal is promoted to primary + virtual void handle_promoted() = 0; + + /// invoked when journal resync is requested + virtual void handle_resync() = 0; +}; + +WRITE_CLASS_ENCODER(EventEntry); +WRITE_CLASS_ENCODER(ClientData); +WRITE_CLASS_ENCODER(TagData); + +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_TYPES_H diff --git a/src/librbd/journal/Utils.cc b/src/librbd/journal/Utils.cc new file mode 100644 index 000000000..231bcae2d --- /dev/null +++ b/src/librbd/journal/Utils.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/journal/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/journal/Types.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::journal::" + +namespace librbd { +namespace journal { +namespace util { + +int C_DecodeTag::decode(bufferlist::const_iterator *it, TagData *tag_data) { + try { + using ceph::decode; + decode(*tag_data, *it); + } catch (const buffer::error &err) { + return -EBADMSG; + } + return 0; +} + +int C_DecodeTag::process(int r) { + if (r < 0) { + lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": " + << "failed to allocate tag: " << cpp_strerror(r) + << dendl; + return r; + } + + std::lock_guard locker{*lock}; + *tag_tid = tag.tid; + + auto data_it = tag.data.cbegin(); + r = decode(&data_it, tag_data); + if (r < 0) { + lderr(cct) << "C_DecodeTag: " << this << " " << __func__ << ": " + << "failed to decode allocated tag" << dendl; + return r; + } + + ldout(cct, 20) << "C_DecodeTag: " << this << " " << __func__ << ": " + << "allocated journal tag: " + << "tid=" << tag.tid << ", " + << "data=" << *tag_data << dendl; + return 0; +} + +int C_DecodeTags::process(int r) { + if (r < 0) { + lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "failed to retrieve journal tags: " << cpp_strerror(r) + << dendl; + return r; + } + + if (tags.empty()) { + lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "no journal tags retrieved" << dendl; + return -ENOENT; + } + + std::lock_guard locker{*lock}; + *tag_tid = tags.back().tid; + auto data_it = tags.back().data.cbegin(); + r = C_DecodeTag::decode(&data_it, tag_data); + if (r < 0) { + lderr(cct) << "C_DecodeTags: " << this << " " << __func__ << ": " 
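[annotation] C_DecodeTags::process() above only ever decodes the newest retrieved tag and maps each failure mode to a distinct errno. A standalone sketch of that control flow; Tag and TagData here are hypothetical stand-ins for cls::journal::Tag and librbd's TagData:

#include <cerrno>
#include <cstdint>
#include <list>
#include <string>

struct Tag     { uint64_t tid; std::string data; };
struct TagData { std::string mirror_uuid; };

// Fail fast with -ENOENT when nothing was retrieved, decode only the
// newest (back) entry, and map a parse failure to -EBADMSG.
int decode_most_recent(const std::list<Tag>& tags,
                       uint64_t* tag_tid, TagData* tag_data) {
  if (tags.empty()) {
    return -ENOENT;
  }
  const Tag& newest = tags.back();
  *tag_tid = newest.tid;
  if (newest.data.empty()) {  // stand-in for a buffer::error during decode
    return -EBADMSG;
  }
  tag_data->mirror_uuid = newest.data;
  return 0;
}

int main() {
  uint64_t tid; TagData td;
  return decode_most_recent({{1, "uuid-a"}, {2, "uuid-b"}}, &tid, &td);
}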
+ << "failed to decode journal tag" << dendl; + return r; + } + + ldout(cct, 20) << "C_DecodeTags: " << this << " " << __func__ << ": " + << "most recent journal tag: " + << "tid=" << *tag_tid << ", " + << "data=" << *tag_data << dendl; + return 0; +} + +} // namespace util +} // namespace journal +} // namespace librbd diff --git a/src/librbd/journal/Utils.h b/src/librbd/journal/Utils.h new file mode 100644 index 000000000..93643f9f9 --- /dev/null +++ b/src/librbd/journal/Utils.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_JOURNAL_UTILS_H +#define CEPH_LIBRBD_JOURNAL_UTILS_H + +#include "include/common_fwd.h" +#include "include/int_types.h" +#include "include/Context.h" +#include "cls/journal/cls_journal_types.h" +#include + + +namespace librbd { +namespace journal { + +struct TagData; + +namespace util { + +struct C_DecodeTag : public Context { + CephContext *cct; + ceph::mutex *lock; + uint64_t *tag_tid; + TagData *tag_data; + Context *on_finish; + + cls::journal::Tag tag; + + C_DecodeTag(CephContext *cct, ceph::mutex *lock, uint64_t *tag_tid, + TagData *tag_data, Context *on_finish) + : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data), + on_finish(on_finish) { + } + + void complete(int r) override { + on_finish->complete(process(r)); + Context::complete(0); + } + void finish(int r) override { + } + + int process(int r); + + static int decode(bufferlist::const_iterator *it, TagData *tag_data); + +}; + +struct C_DecodeTags : public Context { + typedef std::list Tags; + + CephContext *cct; + ceph::mutex *lock; + uint64_t *tag_tid; + TagData *tag_data; + Context *on_finish; + + Tags tags; + + C_DecodeTags(CephContext *cct, ceph::mutex *lock, uint64_t *tag_tid, + TagData *tag_data, Context *on_finish) + : cct(cct), lock(lock), tag_tid(tag_tid), tag_data(tag_data), + on_finish(on_finish) { + } + + void complete(int r) override { + on_finish->complete(process(r)); + Context::complete(0); + } + void finish(int r) override { + } + + int process(int r); +}; + +} // namespace util +} // namespace journal +} // namespace librbd + +#endif // CEPH_LIBRBD_JOURNAL_UTILS_H diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc new file mode 100644 index 000000000..54a2d0205 --- /dev/null +++ b/src/librbd/librbd.cc @@ -0,0 +1,7459 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "include/int_types.h" + +#include + +#include "common/deleter.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/TracepointProvider.h" +#include "include/Context.h" + +#include "cls/rbd/cls_rbd_client.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/Features.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/internal.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/api/Config.h" +#include "librbd/api/DiffIterate.h" +#include "librbd/api/Group.h" +#include "librbd/api/Image.h" +#include "librbd/api/Io.h" +#include "librbd/api/Migration.h" +#include "librbd/api/Mirror.h" +#include "librbd/api/Namespace.h" +#include "librbd/api/Pool.h" +#include "librbd/api/PoolMetadata.h" +#include "librbd/api/Snapshot.h" +#include "librbd/api/Trash.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ReadResult.h" +#include +#include +#include + +#ifdef WITH_LTTNG +#define TRACEPOINT_DEFINE +#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#include "tracing/librbd.h" +#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE +#undef TRACEPOINT_DEFINE +#else +#define tracepoint(...) +#endif + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd: " + +using std::list; +using std::map; +using std::pair; +using std::set; +using std::string; +using std::vector; + +using ceph::bufferlist; +using librados::snap_t; +using librados::IoCtx; + +namespace { + +TracepointProvider::Traits tracepoint_traits("librbd_tp.so", "rbd_tracing"); + +struct UserBufferDeleter : public deleter::impl { + CephContext* cct; + librbd::io::AioCompletion* aio_completion; + + UserBufferDeleter(CephContext* cct, librbd::io::AioCompletion* aio_completion) + : deleter::impl(deleter()), cct(cct), aio_completion(aio_completion) { + aio_completion->block(cct); + } + + ~UserBufferDeleter() override { + aio_completion->unblock(cct); + } +}; + +static auto create_write_raw(librbd::ImageCtx *ictx, const char *buf, + size_t len, + librbd::io::AioCompletion* aio_completion) { + if (ictx->disable_zero_copy || aio_completion == nullptr) { + // must copy the buffer if writeback/writearound cache is in-use (or using + // non-AIO) + return buffer::copy(buf, len); + } + + // avoid copying memory for AIO operations, but possibly delay completions + // until the last reference to the user's memory has been released + return ceph::unique_leakable_ptr( + buffer::claim_buffer( + len, const_cast(buf), + deleter(new UserBufferDeleter(ictx->cct, aio_completion)))); +} + +static int get_iovec_length(const struct iovec *iov, int iovcnt, size_t &len) +{ + len = 0; + + if (iovcnt <= 0) { + return -EINVAL; + } + + for (int i = 0; i < iovcnt; ++i) { + const struct iovec &io = iov[i]; + // check for overflow + if (len + io.iov_len < len) { + return -EINVAL; + } + len += io.iov_len; + } + + return 0; +} + +static bufferlist iovec_to_bufferlist(librbd::ImageCtx *ictx, + const struct iovec *iov, + int iovcnt, + librbd::io::AioCompletion* aio_completion) +{ + bufferlist bl; + for (int i = 0; i < iovcnt; ++i) { + const struct iovec &io = iov[i]; + bl.push_back(create_write_raw(ictx, static_cast(io.iov_base), + io.iov_len, aio_completion)); + } + return bl; +} + +CephContext* get_cct(IoCtx &io_ctx) { + return reinterpret_cast(io_ctx.cct()); +} + +librbd::io::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) { + return reinterpret_cast(comp->pc); +} + +struct C_AioCompletion : public Context { + CephContext *cct; + 
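[annotation] get_iovec_length() below guards against both a non-positive vector count and unsigned wrap-around while summing iov_len values. The same helper in isolation, compilable on its own:

#include <cerrno>
#include <cstddef>
#include <sys/uio.h>

int iovec_length(const struct iovec* iov, int iovcnt, size_t& len) {
  len = 0;
  if (iovcnt <= 0)
    return -EINVAL;
  for (int i = 0; i < iovcnt; ++i) {
    if (len + iov[i].iov_len < len)  // unsigned wrap-around check
      return -EINVAL;
    len += iov[i].iov_len;
  }
  return 0;
}

int main() {
  char a[16], b[32];
  struct iovec iov[2] = {{a, sizeof(a)}, {b, sizeof(b)}};
  size_t len = 0;
  return iovec_length(iov, 2, len) == 0 && len == 48 ? 0 : 1;
}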
librbd::io::aio_type_t aio_type; + librbd::io::AioCompletion* aio_comp; + + C_AioCompletion(librbd::ImageCtx *ictx, librbd::io::aio_type_t aio_type, + librbd::io::AioCompletion* aio_comp) + : cct(ictx->cct), aio_type(aio_type), aio_comp(aio_comp) { + aio_comp->init_time(ictx, aio_type); + aio_comp->get(); + } + virtual ~C_AioCompletion() { + aio_comp->put(); + } + + void finish(int r) override { + ldout(cct, 20) << "C_AioCompletion::finish: r=" << r << dendl; + if (r < 0) { + aio_comp->fail(r); + } else { + aio_comp->complete(); + } + } +}; + +struct C_OpenComplete : public C_AioCompletion { + librbd::ImageCtx *ictx; + void **ictxp; + C_OpenComplete(librbd::ImageCtx *ictx, librbd::io::AioCompletion* comp, + void **ictxp) + : C_AioCompletion(ictx, librbd::io::AIO_TYPE_OPEN, comp), + ictx(ictx), ictxp(ictxp) { + } + void finish(int r) override { + ldout(cct, 20) << "C_OpenComplete::finish: r=" << r << dendl; + if (r < 0) { + *ictxp = nullptr; + } else { + *ictxp = ictx; + } + + C_AioCompletion::finish(r); + } +}; + +struct C_OpenAfterCloseComplete : public Context { + librbd::ImageCtx *ictx; + librbd::io::AioCompletion* comp; + void **ictxp; + C_OpenAfterCloseComplete(librbd::ImageCtx *ictx, + librbd::io::AioCompletion* comp, + void **ictxp) + : ictx(ictx), comp(comp), ictxp(ictxp) { + } + void finish(int r) override { + ldout(ictx->cct, 20) << "C_OpenAfterCloseComplete::finish: r=" << r + << dendl; + *ictxp = nullptr; + + ictx->state->open(0, new C_OpenComplete(ictx, comp, ictxp)); + } +}; + +struct C_UpdateWatchCB : public librbd::UpdateWatchCtx { + rbd_update_callback_t watch_cb; + void *arg; + uint64_t handle = 0; + + C_UpdateWatchCB(rbd_update_callback_t watch_cb, void *arg) : + watch_cb(watch_cb), arg(arg) { + } + void handle_notify() override { + watch_cb(arg); + } +}; + +struct C_QuiesceWatchCB : public librbd::QuiesceWatchCtx { + rbd_update_callback_t quiesce_cb; + rbd_update_callback_t unquiesce_cb; + void *arg; + uint64_t handle = 0; + + C_QuiesceWatchCB(rbd_update_callback_t quiesce_cb, + rbd_update_callback_t unquiesce_cb, void *arg) : + quiesce_cb(quiesce_cb), unquiesce_cb(unquiesce_cb), arg(arg) { + } + void handle_quiesce() override { + quiesce_cb(arg); + } + void handle_unquiesce() override { + unquiesce_cb(arg); + } +}; + +void group_image_status_cpp_to_c(const librbd::group_image_info_t &cpp_info, + rbd_group_image_info_t *c_info) { + c_info->name = strdup(cpp_info.name.c_str()); + c_info->pool = cpp_info.pool; + c_info->state = cpp_info.state; +} + +void group_info_cpp_to_c(const librbd::group_info_t &cpp_info, + rbd_group_info_t *c_info) { + c_info->name = strdup(cpp_info.name.c_str()); + c_info->pool = cpp_info.pool; +} + +void group_snap_info_cpp_to_c(const librbd::group_snap_info_t &cpp_info, + rbd_group_snap_info_t *c_info) { + c_info->name = strdup(cpp_info.name.c_str()); + c_info->state = cpp_info.state; +} + +void mirror_image_info_cpp_to_c(const librbd::mirror_image_info_t &cpp_info, + rbd_mirror_image_info_t *c_info) { + c_info->global_id = strdup(cpp_info.global_id.c_str()); + c_info->state = cpp_info.state; + c_info->primary = cpp_info.primary; +} + +int get_local_mirror_image_site_status( + const librbd::mirror_image_global_status_t& status, + librbd::mirror_image_site_status_t* local_status) { + auto it = std::find_if(status.site_statuses.begin(), + status.site_statuses.end(), + [](const librbd::mirror_image_site_status_t& s) { + return (s.mirror_uuid == + cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID); + }); + if (it == status.site_statuses.end()) { + 
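[annotation] get_local_mirror_image_site_status() above locates the local site's entry with std::find_if keyed on a reserved mirror UUID and reports -ENOENT when it is absent. A standalone sketch; SiteStatus is a stand-in for librbd::mirror_image_site_status_t, and the empty string plays the role of the reserved LOCAL_MIRROR_UUID here:

#include <algorithm>
#include <cerrno>
#include <string>
#include <vector>

struct SiteStatus { std::string mirror_uuid; int state = 0; };

int local_site_status(const std::vector<SiteStatus>& sites, SiteStatus* out) {
  auto it = std::find_if(sites.begin(), sites.end(),
                         [](const SiteStatus& s) { return s.mirror_uuid.empty(); });
  if (it == sites.end())
    return -ENOENT;  // no local-site record present
  *out = *it;
  return 0;
}

int main() {
  SiteStatus local;
  return local_site_status({{"", 1}, {"remote-uuid", 2}}, &local);
}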
return -ENOENT; + } + + *local_status = *it; + return 0; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +int mirror_image_global_status_cpp_to_c( + const librbd::mirror_image_global_status_t &cpp_status, + rbd_mirror_image_status_t *c_status) { + c_status->name = strdup(cpp_status.name.c_str()); + mirror_image_info_cpp_to_c(cpp_status.info, &c_status->info); + + librbd::mirror_image_site_status_t local_status; + int r = get_local_mirror_image_site_status(cpp_status, &local_status); + if (r < 0) { + return r; + } + + c_status->state = local_status.state; + c_status->description = strdup(local_status.description.c_str()); + c_status->last_update = local_status.last_update; + c_status->up = local_status.up; + return 0; +} + +#pragma GCC diagnostic pop + +void mirror_image_global_status_cpp_to_c( + const librbd::mirror_image_global_status_t &cpp_status, + rbd_mirror_image_global_status_t *c_status) { + c_status->name = strdup(cpp_status.name.c_str()); + mirror_image_info_cpp_to_c(cpp_status.info, &c_status->info); + + c_status->site_statuses_count = cpp_status.site_statuses.size(); + c_status->site_statuses = (rbd_mirror_image_site_status_t*)calloc( + cpp_status.site_statuses.size(), sizeof(rbd_mirror_image_site_status_t)); + + auto idx = 0U; + for (auto it = cpp_status.site_statuses.begin(); + it != cpp_status.site_statuses.end(); ++it) { + auto& s_status = c_status->site_statuses[idx++]; + s_status.mirror_uuid = strdup(it->mirror_uuid.c_str()); + s_status.state = it->state; + s_status.description = strdup(it->description.c_str()); + s_status.last_update = it->last_update; + s_status.up = it->up; + } +} + +void trash_image_info_cpp_to_c(const librbd::trash_image_info_t &cpp_info, + rbd_trash_image_info_t *c_info) { + c_info->id = strdup(cpp_info.id.c_str()); + c_info->name = strdup(cpp_info.name.c_str()); + c_info->source = cpp_info.source; + c_info->deletion_time = cpp_info.deletion_time; + c_info->deferment_end_time = cpp_info.deferment_end_time; +} + +void config_option_cpp_to_c(const librbd::config_option_t &cpp_option, + rbd_config_option_t *c_option) { + c_option->name = strdup(cpp_option.name.c_str()); + c_option->value = strdup(cpp_option.value.c_str()); + c_option->source = cpp_option.source; +} + +void config_option_cleanup(rbd_config_option_t &option) { + free(option.name); + free(option.value); +} + +struct C_MirrorImageGetInfo : public Context { + rbd_mirror_image_info_t *mirror_image_info; + Context *on_finish; + + librbd::mirror_image_info_t cpp_mirror_image_info; + + C_MirrorImageGetInfo(rbd_mirror_image_info_t *mirror_image_info, + Context *on_finish) + : mirror_image_info(mirror_image_info), on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + mirror_image_info_cpp_to_c(cpp_mirror_image_info, mirror_image_info); + on_finish->complete(0); + } +}; + +struct C_MirrorImageGetGlobalStatus : public Context { + rbd_mirror_image_global_status_t *mirror_image_global_status; + Context *on_finish; + + librbd::mirror_image_global_status_t cpp_mirror_image_global_status; + + C_MirrorImageGetGlobalStatus( + rbd_mirror_image_global_status_t *mirror_image_global_status, + Context *on_finish) + : mirror_image_global_status(mirror_image_global_status), + on_finish(on_finish) { + } + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + mirror_image_global_status_cpp_to_c(cpp_mirror_image_global_status, + mirror_image_global_status); + 
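[annotation] Each *_cpp_to_c() converter above strdup()s every std::string into the C struct, and each has a matching cleanup that free()s those copies. The paired pattern in isolation; CppOption/COption are hypothetical stand-ins for the librbd types:

#include <cstdlib>
#include <cstring>
#include <string>

struct CppOption { std::string name, value; };
struct COption   { char* name; char* value; };

void option_cpp_to_c(const CppOption& in, COption* out) {
  out->name  = strdup(in.name.c_str());   // caller-owned C copies
  out->value = strdup(in.value.c_str());
}

void option_cleanup(COption& opt) {
  free(opt.name);    // strdup()ed memory must be released with free()
  free(opt.value);
}

int main() {
  COption c;
  option_cpp_to_c({"rbd_cache", "true"}, &c);
  option_cleanup(c);
}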
on_finish->complete(0); + } +}; + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +struct C_MirrorImageGetStatus : public Context { + librbd::mirror_image_status_t *mirror_image_status_cpp = nullptr; + rbd_mirror_image_status_t *mirror_image_status = nullptr; + Context *on_finish; + + librbd::mirror_image_global_status_t cpp_mirror_image_global_status; + + C_MirrorImageGetStatus(rbd_mirror_image_status_t *mirror_image_status, + Context *on_finish) + : mirror_image_status(mirror_image_status), on_finish(on_finish) { + } + C_MirrorImageGetStatus(librbd::mirror_image_status_t *mirror_image_status, + Context *on_finish) + : mirror_image_status_cpp(mirror_image_status), on_finish(on_finish) { + } + + + void finish(int r) override { + if (r < 0) { + on_finish->complete(r); + return; + } + + if (mirror_image_status != nullptr) { + r = mirror_image_global_status_cpp_to_c(cpp_mirror_image_global_status, + mirror_image_status); + } else if (mirror_image_status_cpp != nullptr) { + librbd::mirror_image_site_status_t local_status; + r = get_local_mirror_image_site_status(cpp_mirror_image_global_status, + &local_status); + if (r >= 0) { + *mirror_image_status_cpp = { + cpp_mirror_image_global_status.name, + cpp_mirror_image_global_status.info, + local_status.state, local_status.description, + local_status.last_update, local_status.up}; + } + } + on_finish->complete(r); + } +}; + +#pragma GCC diagnostic pop + +} // anonymous namespace + +namespace librbd { + ProgressContext::~ProgressContext() + { + } + + class CProgressContext : public ProgressContext + { + public: + CProgressContext(librbd_progress_fn_t fn, void *data) + : m_fn(fn), m_data(data) + { + } + int update_progress(uint64_t offset, uint64_t src_size) override + { + return m_fn(offset, src_size, m_data); + } + private: + librbd_progress_fn_t m_fn; + void *m_data; + }; + + /* + * Pool stats + */ + PoolStats::PoolStats() { + rbd_pool_stats_create(&pool_stats); + } + + PoolStats::~PoolStats() { + rbd_pool_stats_destroy(pool_stats); + } + + int PoolStats::add(rbd_pool_stat_option_t option, uint64_t* opt_val) { + return rbd_pool_stats_option_add_uint64(pool_stats, option, opt_val); + } + + /* + * RBD + */ + RBD::RBD() + { + } + + RBD::~RBD() + { + } + + void RBD::version(int *major, int *minor, int *extra) + { + rbd_version(major, minor, extra); + } + + int RBD::open(IoCtx& io_ctx, Image& image, const char *name) + { + return open(io_ctx, image, name, NULL); + } + + int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id) + { + return open_by_id(io_ctx, image, id, nullptr); + } + + int RBD::open(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != NULL) { + reinterpret_cast(image.ctx)->state->close(); + image.ctx = NULL; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_exit, 0); + return 0; + } + + int RBD::open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(), + 
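[annotation] CProgressContext below adapts a C function pointer plus opaque argument to the C++ ProgressContext interface, so internal code never touches the C ABI. A self-contained sketch of that adapter; progress_fn_t mirrors the shape of librbd_progress_fn_t:

#include <cstdint>
#include <cstdio>

typedef int (*progress_fn_t)(uint64_t offset, uint64_t total, void* data);

struct ProgressContext {
  virtual ~ProgressContext() = default;
  virtual int update_progress(uint64_t offset, uint64_t total) = 0;
};

class CProgress : public ProgressContext {
public:
  CProgress(progress_fn_t fn, void* data) : m_fn(fn), m_data(data) {}
  int update_progress(uint64_t offset, uint64_t total) override {
    return m_fn(offset, total, m_data);  // forward into the C callback
  }
private:
  progress_fn_t m_fn;
  void* m_data;
};

static int print_cb(uint64_t o, uint64_t t, void*) {
  std::printf("%llu/%llu\n", (unsigned long long)o, (unsigned long long)t);
  return 0;
}

int main() {
  CProgress p(print_cb, nullptr);
  return p.update_progress(512, 1024);
}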
ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != nullptr) { + reinterpret_cast(image.ctx)->state->close(); + image.ctx = nullptr; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_by_id_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_by_id_exit, 0); + return 0; + } + + int RBD::aio_open(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, false); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != NULL) { + reinterpret_cast(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_exit, 0); + return 0; + } + + int RBD::aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, false); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != nullptr) { + reinterpret_cast(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_by_id_exit, 0); + return 0; + } + + int RBD::open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != NULL) { + reinterpret_cast(image.ctx)->state->close(); + image.ctx = NULL; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_exit, 0); + return 0; + } + + int RBD::open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + if (image.ctx != nullptr) { + reinterpret_cast(image.ctx)->state->close(); + image.ctx = nullptr; + } + + int r = ictx->state->open(0); + if (r < 0) { + tracepoint(librbd, open_image_by_id_exit, r); + return r; + } + + image.ctx = (image_ctx_t) ictx; + tracepoint(librbd, open_image_by_id_exit, 0); + return 0; + } + + int RBD::aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx(name, "", snap_name, io_ctx, true); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != NULL) { + reinterpret_cast(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, 
get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_exit, 0); + return 0; + } + + int RBD::aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snap_name, RBD::AioCompletion *c) + { + ImageCtx *ictx = new ImageCtx("", id, snap_name, io_ctx, true); + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, aio_open_image_by_id_enter, ictx, ictx->id.c_str(), + ictx->snap_name.c_str(), ictx->read_only, c->pc); + + if (image.ctx != nullptr) { + reinterpret_cast(image.ctx)->state->close( + new C_OpenAfterCloseComplete(ictx, get_aio_completion(c), &image.ctx)); + } else { + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(c), + &image.ctx)); + } + tracepoint(librbd, aio_open_image_by_id_exit, 0); + return 0; + } + + int RBD::features_to_string(uint64_t features, std::string *str_features) + { + std::stringstream err; + *str_features = librbd::rbd_features_to_string(features, &err); + if (!err.str().empty()) { + return -EINVAL; + } + + return 0; + } + + int RBD::features_from_string(const std::string str_features, uint64_t *features) + { + std::stringstream err; + *features = librbd::rbd_features_from_string(str_features, &err); + if (!err.str().empty()) { + return -EINVAL; + } + + return 0; + } + + int RBD::create(IoCtx& io_ctx, const char *name, uint64_t size, int *order) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order); + int r = librbd::create(io_ctx, name, size, order); + tracepoint(librbd, create_exit, r, *order); + return r; + } + + int RBD::create2(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order); + int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0); + tracepoint(librbd, create2_exit, r, *order); + return r; + } + + int RBD::create3(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order, uint64_t stripe_unit, + uint64_t stripe_count) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, create3_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count); + int r = librbd::create(io_ctx, name, size, false, features, order, + stripe_unit, stripe_count); + tracepoint(librbd, create3_exit, r, *order); + return r; + } + + int RBD::create4(IoCtx& io_ctx, const char *name, uint64_t size, + ImageOptions& opts) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts.opts); + int r = librbd::create(io_ctx, name, "", size, opts, "", "", false); + tracepoint(librbd, create4_exit, r); + return r; + } + + int RBD::clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order) + { + TracepointProvider::initialize(get_cct(p_ioctx)); + tracepoint(librbd, clone_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features); + int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, + features, c_order, 0, 0); + tracepoint(librbd, 
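[annotation] features_to_string()/features_from_string() signal failure by letting the parser write into a std::stringstream and mapping any non-empty error text to -EINVAL. A sketch of that error-stream convention; parse_flags() is a wholly hypothetical stand-in for librbd::rbd_features_from_string:

#include <cerrno>
#include <cstdint>
#include <sstream>
#include <string>

uint64_t parse_flags(const std::string& s, std::stringstream* err) {
  if (s != "layering") {
    *err << "unknown feature: " << s;
    return 0;
  }
  return 1;  // hypothetical bit for the one feature this sketch knows
}

int flags_from_string(const std::string& s, uint64_t* flags) {
  std::stringstream err;
  *flags = parse_flags(s, &err);
  if (!err.str().empty())
    return -EINVAL;  // error text doubles as the failure signal
  return 0;
}

int main() {
  uint64_t flags = 0;
  return flags_from_string("layering", &flags);
}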
clone_exit, r, *c_order); + return r; + } + + int RBD::clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order, uint64_t stripe_unit, int stripe_count) + { + TracepointProvider::initialize(get_cct(p_ioctx)); + tracepoint(librbd, clone2_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features, stripe_unit, stripe_count); + int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, + features, c_order, stripe_unit, stripe_count); + tracepoint(librbd, clone2_exit, r, *c_order); + return r; + } + + int RBD::clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name, + IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts) + { + TracepointProvider::initialize(get_cct(p_ioctx)); + tracepoint(librbd, clone3_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, c_opts.opts); + int r = librbd::clone(p_ioctx, nullptr, p_name, p_snap_name, c_ioctx, + nullptr, c_name, c_opts, "", ""); + tracepoint(librbd, clone3_exit, r); + return r; + } + + int RBD::remove(IoCtx& io_ctx, const char *name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx); + tracepoint(librbd, remove_exit, r); + return r; + } + + int RBD::remove_with_progress(IoCtx& io_ctx, const char *name, + ProgressContext& pctx) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + int r = librbd::api::Image<>::remove(io_ctx, name, pctx); + tracepoint(librbd, remove_exit, r); + return r; + } + + int RBD::trash_move(IoCtx &io_ctx, const char *name, uint64_t delay) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + int r = librbd::api::Trash<>::move(io_ctx, RBD_TRASH_IMAGE_SOURCE_USER, + name, delay); + tracepoint(librbd, trash_move_exit, r); + return r; + } + + int RBD::trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info) { + return librbd::api::Trash<>::get(io_ctx, id, info); + } + + int RBD::trash_list(IoCtx &io_ctx, vector &entries) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_list_enter, + io_ctx.get_pool_name().c_str(), io_ctx.get_id()); + int r = librbd::api::Trash<>::list(io_ctx, entries, true); +#ifdef WITH_LTTNG + if (r >= 0) { + for (const auto& entry : entries) { + tracepoint(librbd, trash_list_entry, entry.id.c_str()); + } + } +#endif + tracepoint(librbd, trash_list_exit, r, r); + return r; + } + + int RBD::trash_remove(IoCtx &io_ctx, const char *image_id, bool force) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx); + tracepoint(librbd, trash_remove_exit, r); + return r; + } + + int RBD::trash_remove_with_progress(IoCtx &io_ctx, const char *image_id, + bool force, ProgressContext &pctx) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, 
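[annotation] Every public entry point above is bracketed by tracepoint(..._enter)/(..._exit) calls, and the preamble compiles the macro to nothing when LTTng is absent. A minimal version of that conditional no-op pattern; the header name under WITH_TRACING is hypothetical:

#ifdef WITH_TRACING
#include "tracing/probes.h"    // hypothetical provider header
#else
#define tracepoint(...)        // expands to nothing; tracing calls vanish
#endif

int image_resize(unsigned long size) {
  tracepoint(resize_enter, size);  // bracket the operation: enter...
  int r = 0;                       // stand-in for the real resize work
  tracepoint(resize_exit, r);      // ...and exit, carrying the result
  return r;
}

int main() { return image_resize(1UL << 20); }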
io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, pctx); + tracepoint(librbd, trash_remove_exit, r); + return r; + } + + int RBD::trash_restore(IoCtx &io_ctx, const char *id, const char *name) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), id, name); + int r = librbd::api::Trash<>::restore( + io_ctx, librbd::api::Trash<>::ALLOWED_RESTORE_SOURCES, id, name); + tracepoint(librbd, trash_undelete_exit, r); + return r; + } + + int RBD::trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + NoOpProgressContext nop_pctx; + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, nop_pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; + } + + int RBD::trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, + float threshold, ProgressContext &pctx) { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; + } + + int RBD::namespace_create(IoCtx& io_ctx, const char *namespace_name) { + return librbd::api::Namespace<>::create(io_ctx, namespace_name); + } + + int RBD::namespace_remove(IoCtx& io_ctx, const char *namespace_name) { + return librbd::api::Namespace<>::remove(io_ctx, namespace_name); + } + + int RBD::namespace_list(IoCtx& io_ctx, + std::vector* namespace_names) { + return librbd::api::Namespace<>::list(io_ctx, namespace_names); + } + + int RBD::namespace_exists(IoCtx& io_ctx, const char *namespace_name, + bool *exists) { + return librbd::api::Namespace<>::exists(io_ctx, namespace_name, exists); + } + + int RBD::pool_init(IoCtx& io_ctx, bool force) { + return librbd::api::Pool<>::init(io_ctx, force); + } + + int RBD::pool_stats_get(IoCtx& io_ctx, PoolStats* stats) { + auto pool_stat_options = + reinterpret_cast::StatOptions*>(stats->pool_stats); + return librbd::api::Pool<>::get_stats(io_ctx, pool_stat_options); + } + + int RBD::list(IoCtx& io_ctx, vector& names) + { + std::vector image_specs; + int r = list2(io_ctx, &image_specs); + if (r < 0) { + return r; + } + + names.clear(); + for (auto& it : image_specs) { + names.push_back(it.name); + } + return 0; + } + + int RBD::list2(IoCtx& io_ctx, std::vector *images) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + + int r = librbd::api::Image<>::list_images(io_ctx, images); +#ifdef WITH_LTTNG + if (r >= 0) { + for (auto& it : *images) { + tracepoint(librbd, list_entry, it.name.c_str()); + } + } +#endif + tracepoint(librbd, list_exit, r, r); + return r; + } + + int RBD::rename(IoCtx& src_io_ctx, const char *srcname, const char *destname) + { + TracepointProvider::initialize(get_cct(src_io_ctx)); + tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname); + int r = librbd::rename(src_io_ctx, srcname, destname); + tracepoint(librbd, rename_exit, r); + return r; + } + + int RBD::migration_prepare(IoCtx& io_ctx, const char *image_name, + IoCtx& dest_io_ctx, const char *dest_image_name, + 
ImageOptions& opts) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(), + dest_io_ctx.get_id(), dest_image_name, opts.opts); + int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx, + dest_image_name, opts); + tracepoint(librbd, migration_prepare_exit, r); + return r; + } + + int RBD::migration_prepare_import(const char *source_spec, IoCtx& dest_io_ctx, + const char *dest_image_name, + ImageOptions& opts) { + return librbd::api::Migration<>::prepare_import(source_spec, dest_io_ctx, + dest_image_name, opts); + } + + int RBD::migration_execute(IoCtx& io_ctx, const char *image_name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; + } + + int RBD::migration_execute_with_progress(IoCtx& io_ctx, + const char *image_name, + librbd::ProgressContext &prog_ctx) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; + } + + int RBD::migration_abort(IoCtx& io_ctx, const char *image_name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; + } + + int RBD::migration_abort_with_progress(IoCtx& io_ctx, const char *image_name, + librbd::ProgressContext &prog_ctx) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; + } + + int RBD::migration_commit(IoCtx& io_ctx, const char *image_name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; + } + + int RBD::migration_commit_with_progress(IoCtx& io_ctx, const char *image_name, + librbd::ProgressContext &prog_ctx) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; + } + + int RBD::migration_status(IoCtx& io_ctx, const char *image_name, + image_migration_status_t *status, + size_t status_size) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + + if (status_size != sizeof(image_migration_status_t)) { + 
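[annotation] RBD::list() above is a thin compatibility wrapper over list2(): it fetches the richer image specs and projects just the names. The adapter shape in isolation; ImageSpec stands in for librbd::image_spec_t:

#include <string>
#include <vector>

struct ImageSpec { std::string id, name; };

int list2(std::vector<ImageSpec>* images) {
  images->push_back({"10ab", "vm-disk-0"});  // stand-in for the cluster query
  return 0;
}

int list(std::vector<std::string>& names) {
  std::vector<ImageSpec> specs;
  int r = list2(&specs);
  if (r < 0)
    return r;
  names.clear();
  for (auto& s : specs)
    names.push_back(s.name);  // keep only the legacy field
  return 0;
}

int main() {
  std::vector<std::string> names;
  return list(names);
}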
tracepoint(librbd, migration_status_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Migration<>::status(io_ctx, image_name, status); + tracepoint(librbd, migration_status_exit, r); + return r; + } + + int RBD::mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode) { + return librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode); + } + + int RBD::mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode) { + return librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode); + } + + int RBD::mirror_uuid_get(IoCtx& io_ctx, std::string* mirror_uuid) { + return librbd::api::Mirror<>::uuid_get(io_ctx, mirror_uuid); + } + + int RBD::mirror_site_name_get(librados::Rados& rados, + std::string* site_name) { + return librbd::api::Mirror<>::site_name_get(rados, site_name); + } + + int RBD::mirror_site_name_set(librados::Rados& rados, + const std::string& site_name) { + return librbd::api::Mirror<>::site_name_set(rados, site_name); + } + + int RBD::mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token) { + return librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, token); + } + + int RBD::mirror_peer_bootstrap_import(IoCtx& io_ctx, + rbd_mirror_peer_direction_t direction, + const std::string& token) { + return librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction, + token); + } + + int RBD::mirror_peer_site_add(IoCtx& io_ctx, std::string *uuid, + mirror_peer_direction_t direction, + const std::string &site_name, + const std::string &client_name) { + return librbd::api::Mirror<>::peer_site_add( + io_ctx, uuid, direction, site_name, client_name); + } + + int RBD::mirror_peer_site_remove(IoCtx& io_ctx, const std::string &uuid) { + return librbd::api::Mirror<>::peer_site_remove(io_ctx, uuid); + } + + int RBD::mirror_peer_site_list( + IoCtx& io_ctx, std::vector *peer_sites) { + return librbd::api::Mirror<>::peer_site_list(io_ctx, peer_sites); + } + + int RBD::mirror_peer_site_set_client_name( + IoCtx& io_ctx, const std::string &uuid, const std::string &client_name) { + return librbd::api::Mirror<>::peer_site_set_client(io_ctx, uuid, + client_name); + } + + int RBD::mirror_peer_site_set_name(IoCtx& io_ctx, const std::string &uuid, + const std::string &site_name) { + return librbd::api::Mirror<>::peer_site_set_name(io_ctx, uuid, + site_name); + } + + int RBD::mirror_peer_site_set_direction(IoCtx& io_ctx, + const std::string& uuid, + mirror_peer_direction_t direction) { + return librbd::api::Mirror<>::peer_site_set_direction(io_ctx, uuid, + direction); + } + + int RBD::mirror_peer_site_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map *key_vals) { + return librbd::api::Mirror<>::peer_site_get_attributes(io_ctx, uuid, + key_vals); + } + + int RBD::mirror_peer_site_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map& key_vals) { + return librbd::api::Mirror<>::peer_site_set_attributes(io_ctx, uuid, + key_vals); + } + + int RBD::mirror_image_global_status_list( + IoCtx& io_ctx, const std::string &start_id, size_t max, + std::map *global_statuses) { + return librbd::api::Mirror<>::image_global_status_list( + io_ctx, start_id, max, global_statuses); + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + + int RBD::mirror_peer_add(IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name) { + return librbd::api::Mirror<>::peer_site_add( + io_ctx, uuid, RBD_MIRROR_PEER_DIRECTION_RX_TX, cluster_name, client_name); + } + + int 
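[annotation] migration_status() (and the group calls later) use a versioned-ABI guard: the caller passes sizeof of the struct it was compiled against, and any mismatch is rejected with -ERANGE before the library writes a byte. The guard in isolation, with a hypothetical status struct:

#include <cerrno>
#include <cstddef>

struct migration_status_v1 { int state; };

int migration_status(migration_status_v1* status, size_t status_size) {
  if (status_size != sizeof(migration_status_v1))
    return -ERANGE;  // caller was built against a different layout
  status->state = 0;
  return 0;
}

int main() {
  migration_status_v1 s;
  return migration_status(&s, sizeof(s));
}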
RBD::mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid) { + return librbd::api::Mirror<>::peer_site_remove(io_ctx, uuid); + } + + int RBD::mirror_peer_list(IoCtx& io_ctx, std::vector *peers) { + std::vector peer_sites; + int r = librbd::api::Mirror<>::peer_site_list(io_ctx, &peer_sites); + if (r < 0) { + return r; + } + + peers->clear(); + peers->reserve(peer_sites.size()); + for (auto& peer_site : peer_sites) { + peers->push_back({peer_site.uuid, peer_site.site_name, + peer_site.client_name}); + } + return 0; + } + + int RBD::mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name) { + return librbd::api::Mirror<>::peer_site_set_client(io_ctx, uuid, + client_name); + } + + int RBD::mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid, + const std::string &cluster_name) { + return librbd::api::Mirror<>::peer_site_set_name(io_ctx, uuid, + cluster_name); + } + + int RBD::mirror_peer_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map *key_vals) { + return librbd::api::Mirror<>::peer_site_get_attributes(io_ctx, uuid, + key_vals); + } + + int RBD::mirror_peer_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map& key_vals) { + return librbd::api::Mirror<>::peer_site_set_attributes(io_ctx, uuid, + key_vals); + } + + int RBD::mirror_image_status_list(IoCtx& io_ctx, const std::string &start_id, + size_t max, std::map *images) { + std::map global_statuses; + + int r = librbd::api::Mirror<>::image_global_status_list( + io_ctx, start_id, max, &global_statuses); + if (r < 0) { + return r; + } + + images->clear(); + for (auto &[id, global_status] : global_statuses) { + if (global_status.site_statuses.empty() || + global_status.site_statuses[0].mirror_uuid != + cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID) { + continue; + } + + auto& site_status = global_status.site_statuses[0]; + (*images)[id] = mirror_image_status_t{ + global_status.name, global_status.info, site_status.state, + site_status.description, site_status.last_update, site_status.up}; + } + + return 0; + } + +#pragma GCC diagnostic pop + + int RBD::mirror_image_status_summary(IoCtx& io_ctx, + std::map *states) { + return librbd::api::Mirror<>::image_status_summary(io_ctx, states); + } + + int RBD::mirror_image_instance_id_list(IoCtx& io_ctx, + const std::string &start_id, size_t max, + std::map *instance_ids) { + return librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id, max, + instance_ids); + } + + int RBD::mirror_image_info_list( + IoCtx& io_ctx, mirror_image_mode_t *mode_filter, + const std::string &start_id, size_t max, + std::map> *entries) { + return librbd::api::Mirror<>::image_info_list(io_ctx, mode_filter, start_id, + max, entries); + } + + int RBD::group_create(IoCtx& io_ctx, const char *group_name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), group_name); + int r = librbd::api::Group<>::create(io_ctx, group_name); + tracepoint(librbd, group_create_exit, r); + return r; + } + + int RBD::group_remove(IoCtx& io_ctx, const char *group_name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), group_name); + int r = librbd::api::Group<>::remove(io_ctx, group_name); + tracepoint(librbd, group_remove_exit, r); + return r; + } + + int RBD::group_list(IoCtx& io_ctx, vector *names) + { + 
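[annotation] The deprecated mirror_peer_list() above adapts the newer peer-site API: it fetches peer sites and repacks the subset of fields the old struct exposes. A standalone sketch of that shim; PeerSite/Peer are hypothetical stand-ins:

#include <string>
#include <vector>

struct PeerSite { std::string uuid, site_name, client_name; int direction; };
struct Peer     { std::string uuid, cluster_name, client_name; };

int peer_site_list(std::vector<PeerSite>* sites) {
  sites->push_back({"u1", "site-b", "client.rbd-mirror", 0});
  return 0;
}

int peer_list(std::vector<Peer>* peers) {
  std::vector<PeerSite> sites;
  int r = peer_site_list(&sites);
  if (r < 0)
    return r;
  peers->clear();
  peers->reserve(sites.size());
  for (auto& s : sites)
    peers->push_back({s.uuid, s.site_name, s.client_name});  // direction dropped
  return 0;
}

int main() {
  std::vector<Peer> peers;
  return peer_list(&peers);
}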
TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + + int r = librbd::api::Group<>::list(io_ctx, names); + if (r >= 0) { + for (auto itr : *names) { + tracepoint(librbd, group_list_entry, itr.c_str()); + } + } + tracepoint(librbd, group_list_exit, r); + return r; + } + + int RBD::group_rename(IoCtx& io_ctx, const char *src_name, + const char *dest_name) + { + TracepointProvider::initialize(get_cct(io_ctx)); + tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), src_name, dest_name); + int r = librbd::api::Group<>::rename(io_ctx, src_name, dest_name); + tracepoint(librbd, group_rename_exit, r); + return r; + } + + int RBD::group_image_add(IoCtx& group_ioctx, const char *group_name, + IoCtx& image_ioctx, const char *image_name) + { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_image_add_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_name); + int r = librbd::api::Group<>::image_add(group_ioctx, group_name, + image_ioctx, image_name); + tracepoint(librbd, group_image_add_exit, r); + return r; + } + + int RBD::group_image_remove(IoCtx& group_ioctx, const char *group_name, + IoCtx& image_ioctx, const char *image_name) + { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_image_remove_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_name); + int r = librbd::api::Group<>::image_remove(group_ioctx, group_name, + image_ioctx, image_name); + tracepoint(librbd, group_image_remove_exit, r); + return r; + } + + int RBD::group_image_remove_by_id(IoCtx& group_ioctx, const char *group_name, + IoCtx& image_ioctx, const char *image_id) + { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_image_remove_by_id_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, + image_ioctx.get_pool_name().c_str(), + image_ioctx.get_id(), image_id); + int r = librbd::api::Group<>::image_remove_by_id(group_ioctx, group_name, + image_ioctx, image_id); + tracepoint(librbd, group_image_remove_by_id_exit, r); + return r; + } + + int RBD::group_image_list(IoCtx& group_ioctx, const char *group_name, + std::vector *images, + size_t group_image_info_size) + { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_image_list_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + + if (group_image_info_size != sizeof(group_image_info_t)) { + tracepoint(librbd, group_image_list_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Group<>::image_list(group_ioctx, group_name, images); + tracepoint(librbd, group_image_list_exit, r); + return r; + } + + int RBD::group_snap_create(IoCtx& group_ioctx, const char *group_name, + const char *snap_name) { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_create_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, + snap_name, 0); + tracepoint(librbd, group_snap_create_exit, r); + return r; + } + + int RBD::group_snap_create2(IoCtx& group_ioctx, const char *group_name, + const char *snap_name, uint32_t flags) { + 
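[annotation] mirror_image_status_list() above walks the global-status map with structured bindings and keeps only images whose first site entry is the local one, identified by a reserved UUID. A sketch of that filter; the empty string again stands in for the reserved local UUID, and the types are hypothetical:

#include <map>
#include <string>
#include <vector>

struct SiteStatus   { std::string mirror_uuid; int state = 0; };
struct GlobalStatus { std::string name; std::vector<SiteStatus> sites; };

void filter_local(const std::map<std::string, GlobalStatus>& global,
                  std::map<std::string, int>* local) {
  local->clear();
  for (auto& [id, status] : global) {
    if (status.sites.empty() || !status.sites[0].mirror_uuid.empty())
      continue;  // no local-site record for this image
    (*local)[id] = status.sites[0].state;
  }
}

int main() {
  std::map<std::string, GlobalStatus> global = {
    {"img-1", {"vm-disk-0", {{"", 1}}}}};
  std::map<std::string, int> local;
  filter_local(global, &local);
  return local.size() == 1 ? 0 : 1;
}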
TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_create_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, + snap_name, flags); + tracepoint(librbd, group_snap_create_exit, r); + return r; + } + + int RBD::group_snap_remove(IoCtx& group_ioctx, const char *group_name, + const char *snap_name) { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_remove_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = librbd::api::Group<>::snap_remove(group_ioctx, group_name, + snap_name); + tracepoint(librbd, group_snap_remove_exit, r); + return r; + } + + int RBD::group_snap_list(IoCtx& group_ioctx, const char *group_name, + std::vector *snaps, + size_t group_snap_info_size) + { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_list_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + + if (group_snap_info_size != sizeof(group_snap_info_t)) { + tracepoint(librbd, group_snap_list_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, snaps); + tracepoint(librbd, group_snap_list_exit, r); + return r; + } + + int RBD::group_snap_rename(IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, + const char *new_snap_name) + { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rename_enter, + group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(), + group_name, old_snap_name, new_snap_name); + int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name, + old_snap_name, new_snap_name); + tracepoint(librbd, group_snap_list_exit, r); + return r; + } + + int RBD::group_snap_rollback(IoCtx& group_ioctx, const char *group_name, + const char *snap_name) { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + tracepoint(librbd, group_snap_rollback_exit, r); + return r; + } + + int RBD::group_snap_rollback_with_progress(IoCtx& group_ioctx, + const char *group_name, + const char *snap_name, + ProgressContext& prog_ctx) { + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + tracepoint(librbd, group_snap_rollback_exit, r); + return r; + } + + int RBD::pool_metadata_get(IoCtx& ioctx, const std::string &key, + std::string *value) + { + int r = librbd::api::PoolMetadata<>::get(ioctx, key, value); + return r; + } + + int RBD::pool_metadata_set(IoCtx& ioctx, const std::string &key, + const std::string &value) + { + int r = librbd::api::PoolMetadata<>::set(ioctx, key, value); + return r; + } + + int RBD::pool_metadata_remove(IoCtx& ioctx, const std::string &key) + { + int r = librbd::api::PoolMetadata<>::remove(ioctx, key); + return r; + } + + int RBD::pool_metadata_list(IoCtx& ioctx, const std::string &start, + uint64_t max, map *pairs) + { + int r = librbd::api::PoolMetadata<>::list(ioctx, start, max, pairs); + 
return r; + } + + int RBD::config_list(IoCtx& io_ctx, std::vector *options) { + return librbd::api::Config<>::list(io_ctx, options); + } + + RBD::AioCompletion::AioCompletion(void *cb_arg, callback_t complete_cb) + { + auto aio_comp = librbd::io::AioCompletion::create( + cb_arg, complete_cb, this); + aio_comp->external_callback = true; + pc = reinterpret_cast(aio_comp); + } + + bool RBD::AioCompletion::is_complete() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->is_complete(); + } + + int RBD::AioCompletion::wait_for_complete() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->wait_for_complete(); + } + + ssize_t RBD::AioCompletion::get_return_value() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->get_return_value(); + } + + void *RBD::AioCompletion::get_arg() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + return c->get_arg(); + } + + void RBD::AioCompletion::release() + { + librbd::io::AioCompletion *c = (librbd::io::AioCompletion *)pc; + c->release(); + delete this; + } + + /* + ImageOptions + */ + + ImageOptions::ImageOptions() + { + librbd::image_options_create(&opts); + } + + ImageOptions::ImageOptions(rbd_image_options_t opts_) + { + librbd::image_options_create_ref(&opts, opts_); + } + + ImageOptions::ImageOptions(const ImageOptions &imgopts) + { + librbd::image_options_copy(&opts, imgopts); + } + + ImageOptions::~ImageOptions() + { + librbd::image_options_destroy(opts); + } + + int ImageOptions::set(int optname, const std::string& optval) + { + return librbd::image_options_set(opts, optname, optval); + } + + int ImageOptions::set(int optname, uint64_t optval) + { + return librbd::image_options_set(opts, optname, optval); + } + + int ImageOptions::get(int optname, std::string* optval) const + { + return librbd::image_options_get(opts, optname, optval); + } + + int ImageOptions::get(int optname, uint64_t* optval) const + { + return librbd::image_options_get(opts, optname, optval); + } + + int ImageOptions::is_set(int optname, bool* is_set) + { + return librbd::image_options_is_set(opts, optname, is_set); + } + + int ImageOptions::unset(int optname) + { + return librbd::image_options_unset(opts, optname); + } + + void ImageOptions::clear() + { + librbd::image_options_clear(opts); + } + + bool ImageOptions::empty() const + { + return librbd::image_options_is_empty(opts); + } + + /* + Image + */ + + Image::Image() : ctx(NULL) + { + } + + Image::~Image() + { + close(); + } + + int Image::close() + { + int r = 0; + if (ctx) { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + + r = ictx->state->close(); + ctx = NULL; + + tracepoint(librbd, close_image_exit, r); + } + return r; + } + + int Image::aio_close(RBD::AioCompletion *c) + { + if (!ctx) { + return -EINVAL; + } + + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), c->pc); + + ictx->state->close(new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE, + get_aio_completion(c))); + ctx = NULL; + + tracepoint(librbd, aio_close_image_exit, 0); + return 0; + } + + int Image::resize(uint64_t size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->resize(size, true, prog_ctx); + tracepoint(librbd, resize_exit, 
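[annotation] ImageOptions above wraps a C-style opaque handle with RAII: the constructor allocates it and the destructor releases it, so every exit path is covered. A minimal analogue; opts_t and the create/destroy helpers are stand-ins for the C API, and the real class deep-copies rather than deleting its copy operations:

#include <map>
#include <string>

typedef void* opts_t;

static opts_t opts_create() { return new std::map<int, std::string>(); }
static void opts_destroy(opts_t o) {
  delete static_cast<std::map<int, std::string>*>(o);
}

class Options {
public:
  Options() { opts = opts_create(); }
  ~Options() { opts_destroy(opts); }
  Options(const Options&) = delete;
  Options& operator=(const Options&) = delete;
  opts_t opts;  // opaque handle, shared with the C entry points
};

int main() {
  Options o;  // released automatically, even on early return or throw
}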
r); + return r; + } + + int Image::resize2(uint64_t size, bool allow_shrink, librbd::ProgressContext& pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + int r = ictx->operations->resize(size, allow_shrink, pctx); + tracepoint(librbd, resize_exit, r); + return r; + } + + int Image::resize_with_progress(uint64_t size, librbd::ProgressContext& pctx) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + int r = ictx->operations->resize(size, true, pctx); + tracepoint(librbd, resize_exit, r); + return r; + } + + int Image::stat(image_info_t& info, size_t infosize) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::info(ictx, info, infosize); + tracepoint(librbd, stat_exit, r, &info); + return r; + } + + int Image::old_format(uint8_t *old) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_old_format(ictx, old); + tracepoint(librbd, get_old_format_exit, r, *old); + return r; + } + + int Image::size(uint64_t *size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_size(ictx, size); + tracepoint(librbd, get_size_exit, r, *size); + return r; + } + + int Image::get_group(group_info_t *group_info, size_t group_info_size) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, image_get_group_enter, ictx->name.c_str()); + + if (group_info_size != sizeof(group_info_t)) { + tracepoint(librbd, image_get_group_exit, -ERANGE); + return -ERANGE; + } + + int r = librbd::api::Group<>::image_get_group(ictx, group_info); + tracepoint(librbd, image_get_group_exit, r); + return r; + } + + int Image::features(uint64_t *features) + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_features(ictx, features); + tracepoint(librbd, get_features_exit, r, *features); + return r; + } + + int Image::update_features(uint64_t features, bool enabled) + { + ImageCtx *ictx = reinterpret_cast(ctx); + tracepoint(librbd, update_features_enter, ictx, features, enabled); + int r = ictx->operations->update_features(features, enabled); + tracepoint(librbd, update_features_exit, r); + return r; + } + + int Image::get_op_features(uint64_t *op_features) + { + ImageCtx *ictx = (ImageCtx *)ctx; + return librbd::api::Image<>::get_op_features(ictx, op_features); + } + + uint64_t Image::get_stripe_unit() const + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + uint64_t stripe_unit = ictx->get_stripe_unit(); + tracepoint(librbd, get_stripe_unit_exit, 0, stripe_unit); + return stripe_unit; + } + + uint64_t Image::get_stripe_count() const + { + ImageCtx *ictx = (ImageCtx *)ctx; + tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + uint64_t stripe_count = ictx->get_stripe_count(); + tracepoint(librbd, get_stripe_count_exit, 0, stripe_count); + return stripe_count; + } + + int Image::get_create_timestamp(struct 
+
+  int Image::get_create_timestamp(struct timespec *timestamp)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(),
+               ictx->read_only);
+    utime_t time = ictx->get_create_timestamp();
+    time.to_timespec(timestamp);
+    tracepoint(librbd, get_create_timestamp_exit, 0, timestamp);
+    return 0;
+  }
+
+  int Image::get_access_timestamp(struct timespec *timestamp)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(),
+               ictx->read_only);
+    {
+      std::shared_lock timestamp_locker{ictx->timestamp_lock};
+      utime_t time = ictx->get_access_timestamp();
+      time.to_timespec(timestamp);
+    }
+    tracepoint(librbd, get_access_timestamp_exit, 0, timestamp);
+    return 0;
+  }
+
+  int Image::get_modify_timestamp(struct timespec *timestamp)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(),
+               ictx->read_only);
+    {
+      std::shared_lock timestamp_locker{ictx->timestamp_lock};
+      utime_t time = ictx->get_modify_timestamp();
+      time.to_timespec(timestamp);
+    }
+    tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp);
+    return 0;
+  }
+
+  int Image::overlap(uint64_t *overlap)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+    int r = librbd::get_overlap(ictx, overlap);
+    tracepoint(librbd, get_overlap_exit, r, *overlap);
+    return r;
+  }
+
+  int Image::get_name(std::string *name)
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    *name = ictx->name;
+    return 0;
+  }
+
+  int Image::get_id(std::string *id)
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    if (ictx->old_format) {
+      return -EINVAL;
+    }
+    *id = ictx->id;
+    return 0;
+  }
+
+  std::string Image::get_block_name_prefix()
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    return ictx->object_prefix;
+  }
+
+  int64_t Image::get_data_pool_id()
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    return librbd::api::Image<>::get_data_pool_id(ictx);
+  }
+
+  int Image::parent_info(string *parent_pool_name, string *parent_name,
+                         string *parent_snap_name)
+  {
+    librbd::linked_image_spec_t parent_image;
+    librbd::snap_spec_t parent_snap;
+    int r = get_parent(&parent_image, &parent_snap);
+    if (r >= 0) {
+      if (parent_pool_name != nullptr) {
+        *parent_pool_name = parent_image.pool_name;
+      }
+      if (parent_name != nullptr) {
+        *parent_name = parent_image.image_name;
+      }
+      if (parent_snap_name != nullptr) {
+        *parent_snap_name = parent_snap.name;
+      }
+    }
+    return r;
+  }
+
+  int Image::parent_info2(string *parent_pool_name, string *parent_name,
+                          string *parent_id, string *parent_snap_name)
+  {
+    librbd::linked_image_spec_t parent_image;
+    librbd::snap_spec_t parent_snap;
+    int r = get_parent(&parent_image, &parent_snap);
+    if (r >= 0) {
+      if (parent_pool_name != nullptr) {
+        *parent_pool_name = parent_image.pool_name;
+      }
+      if (parent_name != nullptr) {
+        *parent_name = parent_image.image_name;
+      }
+      if (parent_id != nullptr) {
+        *parent_id = parent_image.image_id;
+      }
+      if (parent_snap_name != nullptr) {
+        *parent_snap_name = parent_snap.name;
+      }
+    }
+    return r;
+  }
+
+  int Image::get_parent(linked_image_spec_t *parent_image,
+                        snap_spec_t *parent_snap)
+  {
+    auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+    tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only);
+
+    int r = librbd::api::Image<>::get_parent(ictx, parent_image, parent_snap);
+
+    tracepoint(librbd, get_parent_info_exit, r,
+               parent_image->pool_name.c_str(),
+               parent_image->image_name.c_str(),
+               parent_image->image_id.c_str(),
+               parent_snap->name.c_str());
+    return r;
+  }
+
+  int Image::get_migration_source_spec(std::string* source_spec)
+  {
+    auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+    return librbd::api::Migration<>::get_source_spec(ictx, source_spec);
+  }
+
+  int Image::get_flags(uint64_t *flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, get_flags_enter, ictx);
+    int r = librbd::get_flags(ictx, flags);
+    tracepoint(librbd, get_flags_exit, ictx, r, *flags);
+    return r;
+  }
+
+  int Image::set_image_notification(int fd, int type)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, set_image_notification_enter, ictx, fd, type);
+    int r = librbd::set_image_notification(ictx, fd, type);
+    tracepoint(librbd, set_image_notification_exit, ictx, r);
+    return r;
+  }
+
+  int Image::is_exclusive_lock_owner(bool *is_owner)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, is_exclusive_lock_owner_enter, ictx);
+    int r = librbd::is_exclusive_lock_owner(ictx, is_owner);
+    tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner);
+    return r;
+  }
+
+  int Image::lock_acquire(rbd_lock_mode_t lock_mode)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, lock_acquire_enter, ictx, lock_mode);
+    int r = librbd::lock_acquire(ictx, lock_mode);
+    tracepoint(librbd, lock_acquire_exit, ictx, r);
+    return r;
+  }
+
+  int Image::lock_release()
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, lock_release_enter, ictx);
+    int r = librbd::lock_release(ictx);
+    tracepoint(librbd, lock_release_exit, ictx, r);
+    return r;
+  }
+
+  int Image::lock_get_owners(rbd_lock_mode_t *lock_mode,
+                             std::list<std::string> *lock_owners)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, lock_get_owners_enter, ictx);
+    int r = librbd::lock_get_owners(ictx, lock_mode, lock_owners);
+    tracepoint(librbd, lock_get_owners_exit, ictx, r);
+    return r;
+  }
+
+  int Image::lock_break(rbd_lock_mode_t lock_mode,
+                        const std::string &lock_owner)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner.c_str());
+    int r = librbd::lock_break(ictx, lock_mode, lock_owner);
+    tracepoint(librbd, lock_break_exit, ictx, r);
+    return r;
+  }
+
+  int Image::rebuild_object_map(ProgressContext &prog_ctx)
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    return ictx->operations->rebuild_object_map(prog_ctx);
+  }
+
+  int Image::check_object_map(ProgressContext &prog_ctx)
+  {
+    ImageCtx *ictx = reinterpret_cast<ImageCtx*>(ctx);
+    return ictx->operations->check_object_map(prog_ctx);
+  }
+
+  int Image::copy(IoCtx& dest_io_ctx, const char *destname)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+    ImageOptions opts;
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0);
+    tracepoint(librbd, copy_exit, r);
+    return r;
+  }
+
+  int Image::copy2(Image& dest)
+  {
+    ImageCtx *srcctx = (ImageCtx *)ctx;
+    ImageCtx *destctx = (ImageCtx *)dest.ctx;
+    tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(), srcctx->snap_name.c_str(), srcctx->read_only, destctx, destctx->name.c_str(), destctx->snap_name.c_str(), destctx->read_only);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::copy(srcctx, destctx, prog_ctx, 0);
+    tracepoint(librbd, copy2_exit, r);
+    return r;
+  }
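+
+  // Usage sketch for the cooperative lock API above; `image` is assumed
+  // to be an open librbd::Image on an image with exclusive-lock enabled:
+  //
+  //   int r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
+  //   if (r == 0) {
+  //     // ... perform I/O while holding the lock ...
+  //     image.lock_release();
+  //   }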
+
+  int Image::copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0);
+    tracepoint(librbd, copy3_exit, r);
+    return r;
+  }
+
+  int Image::copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts, size_t sparse_size)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts, sparse_size);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, sparse_size);
+    tracepoint(librbd, copy4_exit, r);
+    return r;
+  }
+
+  int Image::copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+                                librbd::ProgressContext &pctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+    ImageOptions opts;
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0);
+    tracepoint(librbd, copy_exit, r);
+    return r;
+  }
+
+  int Image::copy_with_progress2(Image& dest, librbd::ProgressContext &pctx)
+  {
+    ImageCtx *srcctx = (ImageCtx *)ctx;
+    ImageCtx *destctx = (ImageCtx *)dest.ctx;
+    tracepoint(librbd, copy2_enter, srcctx, srcctx->name.c_str(), srcctx->snap_name.c_str(), srcctx->read_only, destctx, destctx->name.c_str(), destctx->snap_name.c_str(), destctx->read_only);
+    int r = librbd::copy(srcctx, destctx, pctx, 0);
+    tracepoint(librbd, copy2_exit, r);
+    return r;
+  }
+
+  int Image::copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+                                 ImageOptions& opts,
+                                 librbd::ProgressContext &pctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts);
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, 0);
+    tracepoint(librbd, copy3_exit, r);
+    return r;
+  }
+
+  int Image::copy_with_progress4(IoCtx& dest_io_ctx, const char *destname,
+                                 ImageOptions& opts,
+                                 librbd::ProgressContext &pctx,
+                                 size_t sparse_size)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts, sparse_size);
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx, sparse_size);
+    tracepoint(librbd, copy4_exit, r);
+    return r;
+  }
+
+  int Image::deep_copy(IoCtx& dest_io_ctx, const char *destname,
+                       ImageOptions& opts)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only,
+               dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+               destname, opts.opts);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts,
+                                            prog_ctx);
+    tracepoint(librbd, deep_copy_exit, r);
+    return r;
+  }
+
+  int Image::deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
+                                     ImageOptions& opts,
+                                     librbd::ProgressContext &prog_ctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only,
+               dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(),
+               destname, opts.opts);
+    int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts,
+                                            prog_ctx);
+    tracepoint(librbd, deep_copy_exit, r);
+    return r;
+  }
+
+  int Image::encryption_format(encryption_format_t format,
+                               encryption_options_t opts,
+                               size_t opts_size)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Image<>::encryption_format(
+      ictx, format, opts, opts_size, false);
+  }
+
+  int Image::encryption_load(encryption_format_t format,
+                             encryption_options_t opts,
+                             size_t opts_size)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    encryption_spec_t spec = {format, opts, opts_size};
+    return librbd::api::Image<>::encryption_load(ictx, &spec, 1, false);
+  }
+
+  int Image::encryption_load2(const encryption_spec_t *specs, size_t spec_count)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Image<>::encryption_load(
+      ictx, specs, spec_count, false);
+  }
+
+  int Image::flatten()
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+    librbd::NoOpProgressContext prog_ctx;
+    int r = ictx->operations->flatten(prog_ctx);
+    tracepoint(librbd, flatten_exit, r);
+    return r;
+  }
+
+  int Image::flatten_with_progress(librbd::ProgressContext& prog_ctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str());
+    int r = ictx->operations->flatten(prog_ctx);
+    tracepoint(librbd, flatten_exit, r);
+    return r;
+  }
+
+  int Image::sparsify(size_t sparse_size)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+               ictx->id.c_str());
+    librbd::NoOpProgressContext prog_ctx;
+    int r = ictx->operations->sparsify(sparse_size, prog_ctx);
+    tracepoint(librbd, sparsify_exit, r);
+    return r;
+  }
+
+  int Image::sparsify_with_progress(size_t sparse_size,
+                                    librbd::ProgressContext& prog_ctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size,
+               ictx->id.c_str());
+    int r = ictx->operations->sparsify(sparse_size, prog_ctx);
+    tracepoint(librbd, sparsify_exit, r);
+    return r;
+  }
+
+  int Image::list_children(set<pair<string, string> > *children)
+  {
+    std::vector<linked_image_spec_t> images;
+    int r = list_children3(&images);
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto& image : images) {
+      if (!image.trash) {
+        children->insert({image.pool_name, image.image_name});
+      }
+    }
+    return 0;
+  }
+
+  int Image::list_children2(vector<librbd::child_info_t> *children)
+  {
+    std::vector<linked_image_spec_t> images;
+    int r = list_children3(&images);
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto& image : images) {
+      children->push_back({
+        .pool_name = image.pool_name,
+        .image_name = image.image_name,
+        .image_id = image.image_id,
+        .trash = image.trash});
+    }
+
+    return 0;
+  }
+
+  int Image::list_children3(std::vector<linked_image_spec_t> *images)
+  {
+    auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+    tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only);
+
+    int r = librbd::api::Image<>::list_children(ictx, images);
+#ifdef WITH_LTTNG
+    if (r >= 0) {
+      for (auto& it : *images) {
+        tracepoint(librbd, list_children_entry, it.pool_name.c_str(),
+                   it.image_name.c_str());
+      }
+    }
+#endif
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  int Image::list_descendants(std::vector<linked_image_spec_t> *images)
+  {
+    auto ictx = reinterpret_cast<ImageCtx*>(ctx);
+
+    images->clear();
+    int r = librbd::api::Image<>::list_descendants(ictx, {}, images);
+    return r;
+  }
+
+  int Image::list_lockers(std::list<librbd::locker_t> *lockers,
+                          bool *exclusive, string *tag)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+    int r = librbd::list_lockers(ictx, lockers, exclusive, tag);
+    if (r >= 0) {
+      for (std::list<librbd::locker_t>::const_iterator it = lockers->begin();
+           it != lockers->end(); ++it) {
+        tracepoint(librbd, list_lockers_entry, it->client.c_str(), it->cookie.c_str(), it->address.c_str());
+      }
+    }
+    tracepoint(librbd, list_lockers_exit, r);
+    return r;
+  }
+
+  int Image::lock_exclusive(const string& cookie)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str());
+    int r = librbd::lock(ictx, true, cookie, "");
+    tracepoint(librbd, lock_exclusive_exit, r);
+    return r;
+  }
+
+  int Image::lock_shared(const string& cookie, const std::string& tag)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str(), tag.c_str());
+    int r = librbd::lock(ictx, false, cookie, tag);
+    tracepoint(librbd, lock_shared_exit, r);
+    return r;
+  }
+
+  int Image::unlock(const string& cookie)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie.c_str());
+    int r = librbd::unlock(ictx, cookie);
+    tracepoint(librbd, unlock_exit, r);
+    return r;
+  }
+
+  int Image::break_lock(const string& client, const string& cookie)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, client.c_str(), cookie.c_str());
+    int r = librbd::break_lock(ictx, client, cookie);
+    tracepoint(librbd, break_lock_exit, r);
+    return r;
+  }
+
+  int Image::snap_create(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    auto flags = librbd::util::get_default_snap_create_flags(ictx);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+    tracepoint(librbd, snap_create_exit, r);
+    return r;
+  }
+
+  int Image::snap_create2(const char *snap_name, uint32_t flags,
+                          ProgressContext& prog_ctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+    tracepoint(librbd, snap_create_exit, r);
+    return r;
+  }
+
+  int Image::snap_remove(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::api::Snapshot<>::remove(ictx, snap_name, 0, prog_ctx);
+    tracepoint(librbd, snap_remove_exit, r);
+    return r;
+  }
+
+  int Image::snap_remove2(const char *snap_name, uint32_t flags, ProgressContext& pctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name, flags);
+    int r = librbd::api::Snapshot<>::remove(ictx, snap_name, flags, pctx);
+    tracepoint(librbd, snap_remove_exit, r);
+    return r;
+  }
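+
+  // Snapshot lifecycle sketch using the calls above; `image` is assumed
+  // open and the snapshot name is illustrative:
+  //
+  //   image.snap_create("before-upgrade");
+  //   image.snap_protect("before-upgrade");    // required before cloning
+  //   // ... clone, test, etc. ...
+  //   image.snap_unprotect("before-upgrade");
+  //   image.snap_remove("before-upgrade");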
+
+  int Image::snap_remove_by_id(uint64_t snap_id)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Snapshot<>::remove(ictx, snap_id);
+  }
+
+  int Image::snap_rollback(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+    tracepoint(librbd, snap_rollback_exit, r);
+    return r;
+  }
+
+  int Image::snap_rename(const char *srcname, const char *dstname)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+    int r = ictx->operations->snap_rename(srcname, dstname);
+    tracepoint(librbd, snap_rename_exit, r);
+    return r;
+  }
+
+  int Image::snap_rollback_with_progress(const char *snap_name,
+                                         ProgressContext& prog_ctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+    tracepoint(librbd, snap_rollback_exit, r);
+    return r;
+  }
+
+  int Image::snap_protect(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(), snap_name);
+    tracepoint(librbd, snap_protect_exit, r);
+    return r;
+  }
+
+  int Image::snap_unprotect(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name);
+    tracepoint(librbd, snap_unprotect_exit, r);
+    return r;
+  }
+
+  int Image::snap_is_protected(const char *snap_name, bool *is_protected)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = librbd::api::Snapshot<>::is_protected(ictx, snap_name, is_protected);
+    tracepoint(librbd, snap_is_protected_exit, r, *is_protected ? 1 : 0);
+    return r;
+  }
+
+  int Image::snap_list(vector<librbd::snap_info_t>& snaps)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, &snaps);
+    int r = librbd::api::Snapshot<>::list(ictx, snaps);
+    if (r >= 0) {
+      for (int i = 0, n = snaps.size(); i < n; i++) {
+        tracepoint(librbd, snap_list_entry, snaps[i].id, snaps[i].size, snaps[i].name.c_str());
+      }
+    }
+    tracepoint(librbd, snap_list_exit, r, snaps.size());
+    if (r >= 0) {
+      // A little ugly, but the C++ API doesn't need an Image::snap_list_end,
+      // and we want the tracepoints to mirror the C API
+      tracepoint(librbd, snap_list_end_enter, &snaps);
+      tracepoint(librbd, snap_list_end_exit);
+    }
+    return r;
+  }
+
+  bool Image::snap_exists(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    bool exists;
+    int r = librbd::api::Snapshot<>::exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, &exists);
+    tracepoint(librbd, snap_exists_exit, r, exists);
+    if (r < 0) {
+      // lie to caller since we don't know the real answer yet.
+      return false;
+    }
+    return exists;
+  }
+
+  // A safer version of snap_exists.
+  int Image::snap_exists2(const char *snap_name, bool *exists)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = librbd::api::Snapshot<>::exists(ictx, cls::rbd::UserSnapshotNamespace(), snap_name, exists);
+    tracepoint(librbd, snap_exists_exit, r, *exists);
+    return r;
+  }
+
+  int Image::snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str());
+    int r = librbd::api::Snapshot<>::get_timestamp(ictx, snap_id, timestamp);
+    tracepoint(librbd, snap_get_timestamp_exit, r);
+    return r;
+  }
+
+  int Image::snap_get_limit(uint64_t *limit)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str());
+    int r = librbd::api::Snapshot<>::get_limit(ictx, limit);
+    tracepoint(librbd, snap_get_limit_exit, r, *limit);
+    return r;
+  }
+
+  int Image::snap_get_namespace_type(uint64_t snap_id,
+                                     snap_namespace_type_t *namespace_type) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_get_namespace_type_enter, ictx, ictx->name.c_str());
+    int r = librbd::api::Snapshot<>::get_namespace_type(ictx, snap_id, namespace_type);
+    tracepoint(librbd, snap_get_namespace_type_exit, r);
+    return r;
+  }
+
+  int Image::snap_get_group_namespace(uint64_t snap_id,
+                                      snap_group_namespace_t *group_snap,
+                                      size_t group_snap_size) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_get_group_namespace_enter, ictx,
+               ictx->name.c_str());
+
+    if (group_snap_size != sizeof(snap_group_namespace_t)) {
+      tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE);
+      return -ERANGE;
+    }
+
+    int r = librbd::api::Snapshot<>::get_group_namespace(ictx, snap_id,
+                                                         group_snap);
+    tracepoint(librbd, snap_get_group_namespace_exit, r);
+    return r;
+  }
+
+  int Image::snap_get_trash_namespace(uint64_t snap_id,
+                                      std::string* original_name) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Snapshot<>::get_trash_namespace(ictx, snap_id,
+                                                        original_name);
+  }
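+
+  // snap_exists2() above separates "lookup failed" from "snapshot absent",
+  // which the bool-returning snap_exists() cannot. A minimal call sketch
+  // (`image` assumed open):
+  //
+  //   bool exists = false;
+  //   int r = image.snap_exists2("before-upgrade", &exists);
+  //   if (r < 0)       { /* lookup error: handle r */ }
+  //   else if (!exists) { /* snapshot genuinely absent */ }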
+
+  int Image::snap_get_mirror_namespace(
+      uint64_t snap_id, snap_mirror_namespace_t *mirror_snap,
+      size_t mirror_snap_size) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (mirror_snap_size != sizeof(snap_mirror_namespace_t)) {
+      return -ERANGE;
+    }
+
+    int r = librbd::api::Snapshot<>::get_mirror_namespace(
+      ictx, snap_id, mirror_snap);
+    return r;
+  }
+
+  int Image::snap_set_limit(uint64_t limit)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit);
+    int r = ictx->operations->snap_set_limit(limit);
+    tracepoint(librbd, snap_set_limit_exit, r);
+    return r;
+  }
+
+  int Image::snap_set(const char *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+    int r = librbd::api::Image<>::snap_set(
+      ictx, cls::rbd::UserSnapshotNamespace(), snap_name);
+    tracepoint(librbd, snap_set_exit, r);
+    return r;
+  }
+
+  int Image::snap_set_by_id(uint64_t snap_id)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Image<>::snap_set(ictx, snap_id);
+  }
+
+  int Image::snap_get_name(uint64_t snap_id, std::string *snap_name)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Snapshot<>::get_name(ictx, snap_id, snap_name);
+  }
+
+  int Image::snap_get_id(const std::string snap_name, uint64_t *snap_id)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Snapshot<>::get_id(ictx, snap_name, snap_id);
+  }
+
+  ssize_t Image::read(uint64_t ofs, size_t len, bufferlist& bl)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+
+    int r = api::Io<>::read(*ictx, ofs, len, io::ReadResult{&bl}, 0);
+    tracepoint(librbd, read_exit, r);
+    return r;
+  }
+
+  ssize_t Image::read2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+               ictx->read_only, ofs, len, op_flags);
+
+    int r = api::Io<>::read(*ictx, ofs, len, io::ReadResult{&bl}, op_flags);
+    tracepoint(librbd, read_exit, r);
+    return r;
+  }
+
+  int64_t Image::read_iterate(uint64_t ofs, size_t len,
+                              int (*cb)(uint64_t, size_t, const char *, void *),
+                              void *arg)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, read_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+
+    int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+    tracepoint(librbd, read_iterate_exit, r);
+    return r;
+  }
+
+  int Image::read_iterate2(uint64_t ofs, uint64_t len,
+                           int (*cb)(uint64_t, size_t, const char *, void *),
+                           void *arg)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, read_iterate2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+
+    int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+    if (r > 0)
+      r = 0;
+    tracepoint(librbd, read_iterate2_exit, r);
+    return (int)r;
+  }
+
+  int Image::diff_iterate(const char *fromsnapname,
+                          uint64_t ofs, uint64_t len,
+                          int (*cb)(uint64_t, size_t, int, void *),
+                          void *arg)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+               true, false);
+    int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+                                                     cls::rbd::UserSnapshotNamespace(),
+                                                     fromsnapname, ofs,
+                                                     len, true, false, cb, arg);
+    tracepoint(librbd, diff_iterate_exit, r);
+    return r;
+  }
+
+  int Image::diff_iterate2(const char *fromsnapname, uint64_t ofs, uint64_t len,
+                           bool include_parent, bool whole_object,
+                           int (*cb)(uint64_t, size_t, int, void *), void *arg)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+               include_parent, whole_object);
+    int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+                                                     cls::rbd::UserSnapshotNamespace(),
+                                                     fromsnapname, ofs,
+                                                     len, include_parent,
+                                                     whole_object, cb, arg);
+    tracepoint(librbd, diff_iterate_exit, r);
+    return r;
+  }
+
+  ssize_t Image::write(uint64_t ofs, size_t len, bufferlist& bl)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, bl.length() < len ? NULL : bl.c_str());
+    if (bl.length() < len) {
+      tracepoint(librbd, write_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    int r = api::Io<>::write(*ictx, ofs, len, bufferlist{bl}, 0);
+    tracepoint(librbd, write_exit, r);
+    return r;
+  }
+
+  ssize_t Image::write2(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only,
+               ofs, len, bl.length() < len ? NULL : bl.c_str(), op_flags);
+    if (bl.length() < len) {
+      tracepoint(librbd, write_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    int r = api::Io<>::write(*ictx, ofs, len, bufferlist{bl}, op_flags);
+    tracepoint(librbd, write_exit, r);
+    return r;
+  }
+
+  int Image::discard(uint64_t ofs, uint64_t len)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+    if (len > static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
+      tracepoint(librbd, discard_exit, -EINVAL);
+      return -EINVAL;
+    }
+    int r = api::Io<>::discard(
+      *ictx, ofs, len, ictx->discard_granularity_bytes);
+    tracepoint(librbd, discard_exit, r);
+    return r;
+  }
+
+  ssize_t Image::writesame(uint64_t ofs, size_t len, bufferlist& bl, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+               ictx->read_only, ofs, len, bl.length() == 0 ? NULL : bl.c_str(), bl.length(),
+               op_flags);
+    if (bl.length() == 0 || len % bl.length() ||
+        len > static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
+      tracepoint(librbd, writesame_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+    if (discard_zero && bl.is_zero()) {
+      int r = api::Io<>::write_zeroes(*ictx, ofs, len, 0U, op_flags);
+      tracepoint(librbd, writesame_exit, r);
+      return r;
+    }
+
+    int r = api::Io<>::write_same(*ictx, ofs, len, bufferlist{bl}, op_flags);
+    tracepoint(librbd, writesame_exit, r);
+    return r;
+  }
+
+  ssize_t Image::write_zeroes(uint64_t ofs, size_t len, int zero_flags,
+                              int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return api::Io<>::write_zeroes(*ictx, ofs, len, zero_flags, op_flags);
+  }
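+
+  // Synchronous I/O sketch using write()/read() above. Note the bufferlist
+  // must cover at least `len` bytes or the call returns -EINVAL. `image` is
+  // assumed to be an open librbd::Image:
+  //
+  //   ceph::bufferlist bl;
+  //   bl.append(std::string(4096, 'a'));
+  //   ssize_t wr = image.write(0, bl.length(), bl);   // bytes written or -errno
+  //   ceph::bufferlist out;
+  //   ssize_t rd = image.read(0, 4096, out);          // bytes read or -errno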
+
+  ssize_t Image::compare_and_write(uint64_t ofs, size_t len,
+                                   ceph::bufferlist &cmp_bl, ceph::bufferlist& bl,
+                                   uint64_t *mismatch_off, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(),
+               ictx->read_only, ofs, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(),
+               bl.length() < len ? NULL : bl.c_str(), op_flags);
+
+    if (bl.length() < len || cmp_bl.length() < len) {
+      tracepoint(librbd, compare_and_write_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    int r = api::Io<>::compare_and_write(
+      *ictx, ofs, len, bufferlist{cmp_bl}, bufferlist{bl}, mismatch_off,
+      op_flags);
+
+    tracepoint(librbd, compare_and_write_exit, r);
+
+    return r;
+  }
+
+  int Image::aio_write(uint64_t off, size_t len, bufferlist& bl,
+                       RBD::AioCompletion *c)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.length() < len ? NULL : bl.c_str(), c->pc);
+    if (bl.length() < len) {
+      tracepoint(librbd, aio_write_exit, -EINVAL);
+      return -EINVAL;
+    }
+    api::Io<>::aio_write(*ictx, get_aio_completion(c), off, len, bufferlist{bl},
+                         0, true);
+
+    tracepoint(librbd, aio_write_exit, 0);
+    return 0;
+  }
+
+  int Image::aio_write2(uint64_t off, size_t len, bufferlist& bl,
+                        RBD::AioCompletion *c, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+               ictx->read_only, off, len, bl.length() < len ? NULL : bl.c_str(), c->pc, op_flags);
+    if (bl.length() < len) {
+      tracepoint(librbd, aio_write_exit, -EINVAL);
+      return -EINVAL;
+    }
+    api::Io<>::aio_write(*ictx, get_aio_completion(c), off, len, bufferlist{bl},
+                         op_flags, true);
+
+    tracepoint(librbd, aio_write_exit, 0);
+    return 0;
+  }
+
+  int Image::aio_read(uint64_t off, size_t len, bufferlist& bl,
+                      RBD::AioCompletion *c)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.c_str(), c->pc);
+    ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
+                         << (void *)(bl.c_str() + len - 1) << dendl;
+
+    api::Io<>::aio_read(*ictx, get_aio_completion(c), off, len,
+                        io::ReadResult{&bl}, 0, true);
+    tracepoint(librbd, aio_read_exit, 0);
+    return 0;
+  }
+
+  int Image::aio_read2(uint64_t off, size_t len, bufferlist& bl,
+                       RBD::AioCompletion *c, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+               ictx->read_only, off, len, bl.c_str(), c->pc, op_flags);
+    ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
+                         << (void *)(bl.c_str() + len - 1) << dendl;
+
+    api::Io<>::aio_read(*ictx, get_aio_completion(c), off, len,
+                        io::ReadResult{&bl}, op_flags, true);
+    tracepoint(librbd, aio_read_exit, 0);
+    return 0;
+  }
+
+  int Image::flush()
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+    int r = api::Io<>::flush(*ictx);
+    tracepoint(librbd, flush_exit, r);
+    return r;
+  }
+
+  int Image::aio_flush(RBD::AioCompletion *c)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
+    api::Io<>::aio_flush(*ictx, get_aio_completion(c), true);
+    tracepoint(librbd, aio_flush_exit, 0);
+    return 0;
+  }
+
+  int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, c->pc);
+    api::Io<>::aio_discard(
+      *ictx, get_aio_completion(c), off, len, ictx->discard_granularity_bytes,
+      true);
+    tracepoint(librbd, aio_discard_exit, 0);
+    return 0;
+  }
+
+  int Image::aio_writesame(uint64_t off, size_t len, bufferlist& bl,
+                           RBD::AioCompletion *c, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+               ictx->read_only, off, len, bl.length() <= len ? NULL : bl.c_str(), bl.length(),
+               c->pc, op_flags);
+    if (bl.length() == 0 || len % bl.length()) {
+      tracepoint(librbd, aio_writesame_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+    if (discard_zero && bl.is_zero()) {
+      api::Io<>::aio_write_zeroes(*ictx, get_aio_completion(c), off, len, 0U,
+                                  op_flags, true);
+      tracepoint(librbd, aio_writesame_exit, 0);
+      return 0;
+    }
+
+    api::Io<>::aio_write_same(*ictx, get_aio_completion(c), off, len,
+                              bufferlist{bl}, op_flags, true);
+    tracepoint(librbd, aio_writesame_exit, 0);
+    return 0;
+  }
+
+  int Image::aio_write_zeroes(uint64_t off, size_t len, RBD::AioCompletion *c,
+                              int zero_flags, int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    api::Io<>::aio_write_zeroes(*ictx, get_aio_completion(c), off, len,
+                                zero_flags, op_flags, true);
+    return 0;
+  }
+
+  int Image::aio_compare_and_write(uint64_t off, size_t len,
+                                   ceph::bufferlist& cmp_bl, ceph::bufferlist& bl,
+                                   RBD::AioCompletion *c, uint64_t *mismatch_off,
+                                   int op_flags)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(),
+               ictx->snap_name.c_str(),
+               ictx->read_only, off, len, cmp_bl.length() < len ? NULL : cmp_bl.c_str(),
+               bl.length() < len ? NULL : bl.c_str(), c->pc, op_flags);
+
+    if (bl.length() < len || cmp_bl.length() < len) {
+      tracepoint(librbd, aio_compare_and_write_exit, -EINVAL);
+      return -EINVAL;
+    }
+
+    api::Io<>::aio_compare_and_write(*ictx, get_aio_completion(c), off, len,
+                                     bufferlist{cmp_bl}, bufferlist{bl},
+                                     mismatch_off, op_flags, false);
+
+    tracepoint(librbd, aio_compare_and_write_exit, 0);
+
+    return 0;
+  }
+
+  int Image::invalidate_cache()
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, invalidate_cache_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+    int r = librbd::invalidate_cache(ictx);
+    tracepoint(librbd, invalidate_cache_exit, r);
+    return r;
+  }
+
+  int Image::poll_io_events(RBD::AioCompletion **comps, int numcomp)
+  {
+    io::AioCompletion *cs[numcomp];
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+    int r = librbd::poll_io_events(ictx, cs, numcomp);
+    tracepoint(librbd, poll_io_events_exit, r);
+    if (r > 0) {
+      for (int i = 0; i < r; ++i)
+        comps[i] = (RBD::AioCompletion *)cs[i]->rbd_comp;
+    }
+    return r;
+  }
+
+  int Image::metadata_get(const std::string &key, std::string *value)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_get_enter, ictx, key.c_str());
+    int r = librbd::metadata_get(ictx, key, value);
+    if (r < 0) {
+      tracepoint(librbd, metadata_get_exit, r, key.c_str(), NULL);
+    } else {
+      tracepoint(librbd, metadata_get_exit, r, key.c_str(), value->c_str());
+    }
+    return r;
+  }
+
+  int Image::metadata_set(const std::string &key, const std::string &value)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_set_enter, ictx, key.c_str(), value.c_str());
+    int r = ictx->operations->metadata_set(key, value);
+    tracepoint(librbd, metadata_set_exit, r);
+    return r;
+  }
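+
+  // Async completion sketch for the aio_* calls above; `image` and `bl`
+  // are assumed from the earlier examples:
+  //
+  //   librbd::RBD::AioCompletion *comp =
+  //     new librbd::RBD::AioCompletion(nullptr, nullptr);
+  //   image.aio_write(0, bl.length(), bl, comp);
+  //   comp->wait_for_complete();
+  //   ssize_t r = comp->get_return_value();
+  //   comp->release();   // release() also deletes the completion wrapper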
+
+  int Image::metadata_remove(const std::string &key)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_remove_enter, ictx, key.c_str());
+    int r = ictx->operations->metadata_remove(key);
+    tracepoint(librbd, metadata_remove_exit, r);
+    return r;
+  }
+
+  int Image::metadata_list(const std::string &start, uint64_t max, map<string, bufferlist> *pairs)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, metadata_list_enter, ictx);
+    int r = librbd::metadata_list(ictx, start, max, pairs);
+    if (r >= 0) {
+      for (map<string, bufferlist>::iterator it = pairs->begin();
+           it != pairs->end(); ++it) {
+        tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str());
+      }
+    }
+    tracepoint(librbd, metadata_list_exit, r);
+    return r;
+  }
+
+  int Image::mirror_image_enable() {
+    return mirror_image_enable2(RBD_MIRROR_IMAGE_MODE_JOURNAL);
+  }
+
+  int Image::mirror_image_enable2(mirror_image_mode_t mode) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Mirror<>::image_enable(ictx, mode, false);
+  }
+
+  int Image::mirror_image_disable(bool force) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Mirror<>::image_disable(ictx, force);
+  }
+
+  int Image::mirror_image_promote(bool force) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Mirror<>::image_promote(ictx, force);
+  }
+
+  int Image::mirror_image_demote() {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Mirror<>::image_demote(ictx);
+  }
+
+  int Image::mirror_image_resync()
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Mirror<>::image_resync(ictx);
+  }
+
+  int Image::mirror_image_create_snapshot(uint64_t *snap_id)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    auto flags = librbd::util::get_default_snap_create_flags(ictx);
+    return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+  }
+
+  int Image::mirror_image_create_snapshot2(uint32_t flags, uint64_t *snap_id)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+  }
+
+  int Image::mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+                                   size_t info_size) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (sizeof(mirror_image_info_t) != info_size) {
+      return -ERANGE;
+    }
+
+    return librbd::api::Mirror<>::image_get_info(ictx, mirror_image_info);
+  }
+
+  int Image::mirror_image_get_mode(mirror_image_mode_t *mode) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    return librbd::api::Mirror<>::image_get_mode(ictx, mode);
+  }
+
+  int Image::mirror_image_get_global_status(
+      mirror_image_global_status_t *mirror_image_global_status,
+      size_t status_size) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (sizeof(mirror_image_global_status_t) != status_size) {
+      return -ERANGE;
+    }
+
+    return librbd::api::Mirror<>::image_get_global_status(
+      ictx, mirror_image_global_status);
+  }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+  int Image::mirror_image_get_status(mirror_image_status_t *mirror_image_status,
+                                     size_t status_size) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (sizeof(mirror_image_status_t) != status_size) {
+      return -ERANGE;
+    }
+
+    mirror_image_global_status_t mirror_image_global_status;
+    int r = librbd::api::Mirror<>::image_get_global_status(
+      ictx, &mirror_image_global_status);
+    if (r < 0) {
+      return r;
+    }
+
+    librbd::mirror_image_site_status_t local_status;
+    r = get_local_mirror_image_site_status(mirror_image_global_status,
+                                           &local_status);
+    if (r < 0) {
+      return r;
+    }
+
+    *mirror_image_status = mirror_image_status_t{
+      mirror_image_global_status.name, mirror_image_global_status.info,
+      local_status.state, local_status.description, local_status.last_update,
+      local_status.up};
+    return 0;
+  }
+
+#pragma GCC diagnostic pop
+
+  int Image::mirror_image_get_instance_id(std::string *instance_id) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    return librbd::api::Mirror<>::image_get_instance_id(ictx, instance_id);
+  }
+
+  int Image::aio_mirror_image_promote(bool force, RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    librbd::api::Mirror<>::image_promote(
+      ictx, force, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                       get_aio_completion(c)));
+    return 0;
+  }
+
+  int Image::aio_mirror_image_demote(RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    librbd::api::Mirror<>::image_demote(
+      ictx, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                get_aio_completion(c)));
+    return 0;
+  }
+
+  int Image::aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info,
+                                       size_t info_size,
+                                       RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (sizeof(mirror_image_info_t) != info_size) {
+      return -ERANGE;
+    }
+
+    librbd::api::Mirror<>::image_get_info(
+      ictx, mirror_image_info,
+      new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                          get_aio_completion(c)));
+    return 0;
+  }
+
+  int Image::aio_mirror_image_get_mode(mirror_image_mode_t *mode,
+                                       RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    librbd::api::Mirror<>::image_get_mode(
+      ictx, mode, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                      get_aio_completion(c)));
+    return 0;
+  }
+
+  int Image::aio_mirror_image_get_global_status(
+      mirror_image_global_status_t *status, size_t status_size,
+      RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (sizeof(mirror_image_global_status_t) != status_size) {
+      return -ERANGE;
+    }
+
+    librbd::api::Mirror<>::image_get_global_status(
+      ictx, status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                        get_aio_completion(c)));
+    return 0;
+  }
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+  int Image::aio_mirror_image_get_status(mirror_image_status_t *status,
+                                         size_t status_size,
+                                         RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    if (sizeof(mirror_image_status_t) != status_size) {
+      return -ERANGE;
+    }
+
+    auto ctx = new C_MirrorImageGetStatus(
+      status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                  get_aio_completion(c)));
+    librbd::api::Mirror<>::image_get_global_status(
+      ictx, &ctx->cpp_mirror_image_global_status, ctx);
+    return 0;
+  }
+
+#pragma GCC diagnostic pop
+
+  int Image::aio_mirror_image_create_snapshot(uint32_t flags, uint64_t *snap_id,
+                                              RBD::AioCompletion *c) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+
+    librbd::api::Mirror<>::image_snapshot_create(
+      ictx, flags, snap_id, new C_AioCompletion(ictx,
+                                                librbd::io::AIO_TYPE_GENERIC,
+                                                get_aio_completion(c)));
+    return 0;
+  }
+
+  int Image::update_watch(UpdateWatchCtx *wctx, uint64_t *handle) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, update_watch_enter, ictx, wctx);
+    int r = ictx->state->register_update_watcher(wctx, handle);
+    tracepoint(librbd, update_watch_exit, r, *handle);
+    return r;
+  }
+
+  int Image::update_unwatch(uint64_t handle) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, update_unwatch_enter, ictx, handle);
+    int r = ictx->state->unregister_update_watcher(handle);
+    tracepoint(librbd, update_unwatch_exit, r);
+    return r;
+  }
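+
+  // Watcher sketch built on update_watch()/update_unwatch() above; the
+  // subclass name is an assumption for illustration:
+  //
+  //   struct MyWatcher : public librbd::UpdateWatchCtx {
+  //     void handle_notify() override { /* image header changed */ }
+  //   };
+  //   MyWatcher watcher;
+  //   uint64_t handle = 0;
+  //   image.update_watch(&watcher, &handle);
+  //   // ... later, before destroying `watcher`:
+  //   image.update_unwatch(handle);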
+
+  int Image::list_watchers(std::list<librbd::image_watcher_t> &watchers) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, list_watchers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+    int r = librbd::list_watchers(ictx, watchers);
+#ifdef WITH_LTTNG
+    if (r >= 0) {
+      for (auto &watcher : watchers) {
+        tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id, watcher.cookie);
+      }
+    }
+#endif
+    tracepoint(librbd, list_watchers_exit, r, watchers.size());
+    return r;
+  }
+
+  int Image::config_list(std::vector<config_option_t> *options) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::api::Config<>::list(ictx, options);
+  }
+
+  int Image::quiesce_watch(QuiesceWatchCtx *wctx, uint64_t *handle) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    int r = ictx->state->register_quiesce_watcher(wctx, handle);
+    return r;
+  }
+
+  int Image::quiesce_unwatch(uint64_t handle) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    int r = ictx->state->unregister_quiesce_watcher(handle);
+    return r;
+  }
+
+  void Image::quiesce_complete(uint64_t handle, int r) {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    ictx->state->quiesce_complete(handle, r);
+  }
+
+} // namespace librbd
+
+extern "C" void rbd_version(int *major, int *minor, int *extra)
+{
+  if (major)
+    *major = LIBRBD_VER_MAJOR;
+  if (minor)
+    *minor = LIBRBD_VER_MINOR;
+  if (extra)
+    *extra = LIBRBD_VER_EXTRA;
+}
+
+extern "C" void rbd_image_options_create(rbd_image_options_t* opts)
+{
+  librbd::image_options_create(opts);
+}
+
+extern "C" void rbd_image_options_destroy(rbd_image_options_t opts)
+{
+  librbd::image_options_destroy(opts);
+}
+
+extern "C" int rbd_image_options_set_string(rbd_image_options_t opts, int optname,
+                                            const char* optval)
+{
+  return librbd::image_options_set(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_set_uint64(rbd_image_options_t opts, int optname,
+                                            uint64_t optval)
+{
+  return librbd::image_options_set(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_get_string(rbd_image_options_t opts, int optname,
+                                            char* optval, size_t maxlen)
+{
+  std::string optval_;
+
+  int r = librbd::image_options_get(opts, optname, &optval_);
+
+  if (r < 0) {
+    return r;
+  }
+
+  if (optval_.size() >= maxlen) {
+    return -E2BIG;
+  }
+
+  strncpy(optval, optval_.c_str(), maxlen);
+
+  return 0;
+}
+
+extern "C" int rbd_image_options_get_uint64(rbd_image_options_t opts, int optname,
+                                            uint64_t* optval)
+{
+  return librbd::image_options_get(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_is_set(rbd_image_options_t opts, int optname,
+                                        bool* is_set)
+{
+  return librbd::image_options_is_set(opts, optname, is_set);
+}
+
+extern "C" int rbd_image_options_unset(rbd_image_options_t opts, int optname)
+{
+  return librbd::image_options_unset(opts, optname);
+}
+
+extern "C" void rbd_image_options_clear(rbd_image_options_t opts)
+{
+  librbd::image_options_clear(opts);
+}
+
+extern "C" int rbd_image_options_is_empty(rbd_image_options_t opts)
+{
+  return librbd::image_options_is_empty(opts);
+}
+
+/* pool mirroring */
+extern "C" int rbd_mirror_site_name_get(rados_t cluster, char *name,
+                                        size_t *max_len) {
+  librados::Rados rados;
+  librados::Rados::from_rados_t(cluster, rados);
+
+  std::string site_name;
+  int r = librbd::api::Mirror<>::site_name_get(rados, &site_name);
+  if (r < 0) {
+    return r;
+  }
+
+  auto total_len = site_name.size() + 1;
+  if (*max_len < total_len) {
+    *max_len = total_len;
+    return -ERANGE;
+  }
+  *max_len = total_len;
+
+  strcpy(name, site_name.c_str());
+  return 0;
+}
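+
+/*
+ * Sketch of the size-probe convention used by rbd_mirror_site_name_get()
+ * above and the other *_get(buf, &max_len) calls: probe once with a zero
+ * length to learn the required size, then retry. `cluster` is assumed to
+ * be a connected rados_t handle.
+ *
+ *   size_t len = 0;
+ *   int r = rbd_mirror_site_name_get(cluster, NULL, &len);  // -ERANGE, len set
+ *   char *name = (char *)malloc(len);
+ *   r = rbd_mirror_site_name_get(cluster, name, &len);
+ */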
+
+extern "C" int rbd_mirror_site_name_set(rados_t cluster, const char *name) {
+  librados::Rados rados;
+  librados::Rados::from_rados_t(cluster, rados);
+  return librbd::api::Mirror<>::site_name_set(rados, name);
+}
+
+extern "C" int rbd_mirror_mode_get(rados_ioctx_t p,
+                                   rbd_mirror_mode_t *mirror_mode) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::api::Mirror<>::mode_get(io_ctx, mirror_mode);
+}
+
+extern "C" int rbd_mirror_mode_set(rados_ioctx_t p,
+                                   rbd_mirror_mode_t mirror_mode) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::api::Mirror<>::mode_set(io_ctx, mirror_mode);
+}
+
+extern "C" int rbd_mirror_uuid_get(rados_ioctx_t p,
+                                   char *mirror_uuid, size_t *max_len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::string mirror_uuid_str;
+  int r = librbd::api::Mirror<>::uuid_get(io_ctx, &mirror_uuid_str);
+  if (r < 0) {
+    return r;
+  }
+
+  auto total_len = mirror_uuid_str.size() + 1;
+  if (*max_len < total_len) {
+    *max_len = total_len;
+    return -ERANGE;
+  }
+  *max_len = total_len;
+
+  strcpy(mirror_uuid, mirror_uuid_str.c_str());
+  return 0;
+}
+
+extern "C" int rbd_mirror_peer_bootstrap_create(rados_ioctx_t p, char *token,
+                                                size_t *max_len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::string token_str;
+  int r = librbd::api::Mirror<>::peer_bootstrap_create(io_ctx, &token_str);
+  if (r < 0) {
+    return r;
+  }
+
+  auto total_len = token_str.size() + 1;
+  if (*max_len < total_len) {
+    *max_len = total_len;
+    return -ERANGE;
+  }
+  *max_len = total_len;
+
+  strcpy(token, token_str.c_str());
+  return 0;
+}
+
+extern "C" int rbd_mirror_peer_bootstrap_import(
+    rados_ioctx_t p, rbd_mirror_peer_direction_t direction,
+    const char *token) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  return librbd::api::Mirror<>::peer_bootstrap_import(io_ctx, direction, token);
+}
+
+extern "C" int rbd_mirror_peer_site_add(rados_ioctx_t p, char *uuid,
+                                        size_t uuid_max_length,
+                                        rbd_mirror_peer_direction_t direction,
+                                        const char *site_name,
+                                        const char *client_name) {
+  static const std::size_t UUID_LENGTH = 36;
+
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  if (uuid_max_length < UUID_LENGTH + 1) {
+    return -E2BIG;
+  }
+
+  std::string uuid_str;
+  int r = librbd::api::Mirror<>::peer_site_add(io_ctx, &uuid_str, direction,
+                                               site_name, client_name);
+  if (r >= 0) {
+    strncpy(uuid, uuid_str.c_str(), uuid_max_length);
+    uuid[uuid_max_length - 1] = '\0';
+  }
+  return r;
+}
+
+extern "C" int rbd_mirror_peer_site_remove(rados_ioctx_t p, const char *uuid) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  int r = librbd::api::Mirror<>::peer_site_remove(io_ctx, uuid);
+  return r;
+}
+
+extern "C" int rbd_mirror_peer_site_list(
+    rados_ioctx_t p, rbd_mirror_peer_site_t *peers, int *max_peers) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::vector<librbd::mirror_peer_site_t> peer_vector;
+  int r = librbd::api::Mirror<>::peer_site_list(io_ctx, &peer_vector);
+  if (r < 0) {
+    return r;
+  }
+
+  if (*max_peers < static_cast<int>(peer_vector.size())) {
+    *max_peers = static_cast<int>(peer_vector.size());
+    return -ERANGE;
+  }
+
+  for (int i = 0; i < static_cast<int>(peer_vector.size()); ++i) {
+    peers[i].uuid = strdup(peer_vector[i].uuid.c_str());
+    peers[i].direction = peer_vector[i].direction;
+    peers[i].site_name = strdup(peer_vector[i].site_name.c_str());
+    peers[i].mirror_uuid = strdup(peer_vector[i].mirror_uuid.c_str());
+    peers[i].client_name = strdup(peer_vector[i].client_name.c_str());
+  }
+  *max_peers = static_cast<int>(peer_vector.size());
+  return 0;
+}
+
+extern "C" void rbd_mirror_peer_site_list_cleanup(rbd_mirror_peer_site_t *peers,
+                                                  int max_peers) {
+  for (int i = 0; i < max_peers; ++i) {
+    free(peers[i].uuid);
+    free(peers[i].site_name);
+    free(peers[i].mirror_uuid);
+    free(peers[i].client_name);
+  }
+}
+
+extern "C" int rbd_mirror_peer_site_set_client_name(
+    rados_ioctx_t p, const char *uuid, const char *client_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::api::Mirror<>::peer_site_set_client(io_ctx, uuid, client_name);
+}
+
+extern "C" int rbd_mirror_peer_site_set_name(
+    rados_ioctx_t p, const char *uuid, const char *site_name) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::api::Mirror<>::peer_site_set_name(io_ctx, uuid, site_name);
+}
+
+extern "C" int rbd_mirror_peer_site_set_direction(
+    rados_ioctx_t p, const char *uuid, rbd_mirror_peer_direction_t direction) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  return librbd::api::Mirror<>::peer_site_set_direction(io_ctx, uuid,
+                                                        direction);
+}
+
+extern "C" int rbd_mirror_peer_site_get_attributes(
+    rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+    char *values, size_t *max_val_len, size_t *key_value_count) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::map<std::string, std::string> attributes;
+  int r = librbd::api::Mirror<>::peer_site_get_attributes(
+    io_ctx, uuid, &attributes);
+  if (r < 0) {
+    return r;
+  }
+
+  size_t key_total_len = 0, val_total_len = 0;
+  for (auto& it : attributes) {
+    key_total_len += it.first.size() + 1;
+    val_total_len += it.second.length() + 1;
+  }
+
+  bool too_short = ((*max_key_len < key_total_len) ||
+                    (*max_val_len < val_total_len));
+
+  *max_key_len = key_total_len;
+  *max_val_len = val_total_len;
+  *key_value_count = attributes.size();
+  if (too_short) {
+    return -ERANGE;
+  }
+
+  char *keys_p = keys;
+  char *values_p = values;
+  for (auto& it : attributes) {
+    strncpy(keys_p, it.first.c_str(), it.first.size() + 1);
+    keys_p += it.first.size() + 1;
+
+    strncpy(values_p, it.second.c_str(), it.second.length() + 1);
+    values_p += it.second.length() + 1;
+  }
+
+  return 0;
+}
+
+extern "C" int rbd_mirror_peer_site_set_attributes(
+    rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+    size_t count) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::map<std::string, std::string> attributes;
+
+  for (size_t i = 0; i < count; ++i) {
+    const char* key = keys;
+    keys += strlen(key) + 1;
+    const char* value = values;
+    values += strlen(value) + 1;
+    attributes[key] = value;
+  }
+
+  return librbd::api::Mirror<>::peer_site_set_attributes(
+    io_ctx, uuid, attributes);
+}
+
+extern "C" int rbd_mirror_image_global_status_list(rados_ioctx_t p,
+    const char *start_id, size_t max, char **image_ids,
+    rbd_mirror_image_global_status_t *images, size_t *len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  std::map<std::string, librbd::mirror_image_global_status_t> cpp_images;
+
+  int r = librbd::api::Mirror<>::image_global_status_list(
+    io_ctx, start_id, max, &cpp_images);
+  if (r < 0) {
+    return r;
+  }
+
+  size_t i = 0;
+  for (auto &it : cpp_images) {
+    ceph_assert(i < max);
+    const std::string &image_id = it.first;
+    image_ids[i] = strdup(image_id.c_str());
+    mirror_image_global_status_cpp_to_c(it.second, &images[i]);
+    i++;
+  }
+  *len = i;
+  return 0;
+}
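+
+/*
+ * The attribute setter above expects keys and values packed as consecutive
+ * NUL-terminated strings. A minimal packing sketch; the attribute names and
+ * values shown are illustrative assumptions:
+ *
+ *   const char keys[]   = "mon_host\0key\0";      // two keys, NUL-separated
+ *   const char values[] = "10.0.0.1\0secret\0";   // matching values
+ *   rbd_mirror_peer_site_set_attributes(ioctx, uuid, keys, values, 2);
+ */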
+
+extern "C" void rbd_mirror_image_global_status_cleanup(
+    rbd_mirror_image_global_status_t *global_status) {
+  free(global_status->name);
+  rbd_mirror_image_get_info_cleanup(&global_status->info);
+  for (auto idx = 0U; idx < global_status->site_statuses_count; ++idx) {
+    free(global_status->site_statuses[idx].mirror_uuid);
+    free(global_status->site_statuses[idx].description);
+  }
+  free(global_status->site_statuses);
+}
+
+extern "C" void rbd_mirror_image_global_status_list_cleanup(
+    char **image_ids, rbd_mirror_image_global_status_t *images, size_t len) {
+  for (size_t i = 0; i < len; i++) {
+    free(image_ids[i]);
+    rbd_mirror_image_global_status_cleanup(&images[i]);
+  }
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" int rbd_mirror_peer_add(rados_ioctx_t p, char *uuid,
+                                   size_t uuid_max_length,
+                                   const char *cluster_name,
+                                   const char *client_name) {
+  return rbd_mirror_peer_site_add(
+    p, uuid, uuid_max_length, RBD_MIRROR_PEER_DIRECTION_RX_TX, cluster_name,
+    client_name);
+}
+
+extern "C" int rbd_mirror_peer_remove(rados_ioctx_t p, const char *uuid) {
+  return rbd_mirror_peer_site_remove(p, uuid);
+}
+
+extern "C" int rbd_mirror_peer_list(rados_ioctx_t p,
+                                    rbd_mirror_peer_t *peers, int *max_peers) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+
+  std::vector<librbd::mirror_peer_site_t> peer_vector;
+  int r = librbd::api::Mirror<>::peer_site_list(io_ctx, &peer_vector);
+  if (r < 0) {
+    return r;
+  }
+
+  if (*max_peers < static_cast<int>(peer_vector.size())) {
+    *max_peers = static_cast<int>(peer_vector.size());
+    return -ERANGE;
+  }
+
+  for (int i = 0; i < static_cast<int>(peer_vector.size()); ++i) {
+    peers[i].uuid = strdup(peer_vector[i].uuid.c_str());
+    peers[i].cluster_name = strdup(peer_vector[i].site_name.c_str());
+    peers[i].client_name = strdup(peer_vector[i].client_name.c_str());
+  }
+  *max_peers = static_cast<int>(peer_vector.size());
+  return 0;
+}
+
+extern "C" void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers,
+                                             int max_peers) {
+  for (int i = 0; i < max_peers; ++i) {
+    free(peers[i].uuid);
+    free(peers[i].cluster_name);
+    free(peers[i].client_name);
+  }
+}
+
+extern "C" int rbd_mirror_peer_set_client(rados_ioctx_t p, const char *uuid,
+                                          const char *client_name) {
+  return rbd_mirror_peer_site_set_client_name(p, uuid, client_name);
+}
+
+extern "C" int rbd_mirror_peer_set_cluster(rados_ioctx_t p, const char *uuid,
+                                           const char *cluster_name) {
+  return rbd_mirror_peer_site_set_name(p, uuid, cluster_name);
+}
+
+extern "C" int rbd_mirror_peer_get_attributes(
+    rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len,
+    char *values, size_t *max_val_len, size_t *key_value_count) {
+  return rbd_mirror_peer_site_get_attributes(
+    p, uuid, keys, max_key_len, values, max_val_len, key_value_count);
+}
+
+extern "C" int rbd_mirror_peer_set_attributes(
+    rados_ioctx_t p, const char *uuid, const char *keys, const char *values,
+    size_t count) {
+  return rbd_mirror_peer_site_set_attributes(
+    p, uuid, keys, values, count);
+}
+
+extern "C" int rbd_mirror_image_status_list(rados_ioctx_t p,
+    const char *start_id, size_t max, char **image_ids,
+    rbd_mirror_image_status_t *images, size_t *len) {
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  std::map<std::string, librbd::mirror_image_global_status_t> cpp_images;
+
+  int r = librbd::api::Mirror<>::image_global_status_list(
+    io_ctx, start_id, max, &cpp_images);
+  if (r < 0) {
+    return r;
+  }
+
+  size_t i = 0;
+  for (auto &it : cpp_images) {
+    ceph_assert(i < max);
+    const std::string &image_id = it.first;
+    image_ids[i] = strdup(image_id.c_str());
+    mirror_image_global_status_cpp_to_c(it.second, &images[i]);
+    i++;
+  }
+  *len = i;
+  return 0;
+}
+extern "C" int rbd_mirror_image_status_list(rados_ioctx_t p, + const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_status_t *images, size_t *len) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + std::map<std::string, librbd::mirror_image_global_status_t> cpp_images; + + int r = librbd::api::Mirror<>::image_global_status_list( + io_ctx, start_id, max, &cpp_images); + if (r < 0) { + return r; + } + + size_t i = 0; + for (auto &it : cpp_images) { + ceph_assert(i < max); + const std::string &image_id = it.first; + image_ids[i] = strdup(image_id.c_str()); + mirror_image_global_status_cpp_to_c(it.second, &images[i]); + i++; + } + *len = i; + return 0; +}
+
+extern "C" void rbd_mirror_image_status_list_cleanup(char **image_ids, + rbd_mirror_image_status_t *images, size_t len) { + for (size_t i = 0; i < len; i++) { + free(image_ids[i]); + free(images[i].name); + rbd_mirror_image_get_info_cleanup(&images[i].info); + free(images[i].description); + } +}
+
+#pragma GCC diagnostic pop
+
+extern "C" int rbd_mirror_image_status_summary(rados_ioctx_t p, + rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen) { + + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::map<librbd::mirror_image_status_state_t, int> states_; + int r = librbd::api::Mirror<>::image_status_summary(io_ctx, &states_); + if (r < 0) { + return r; + } + + size_t i = 0; + for (auto &it : states_) { + if (i == *maxlen) { + return -ERANGE; + } + states[i] = it.first; + counts[i] = it.second; + i++; + } + *maxlen = i; + return 0; +}
+
+extern "C" int rbd_mirror_image_instance_id_list( + rados_ioctx_t p, const char *start_id, size_t max, char **image_ids, + char **instance_ids, size_t *len) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + std::map<std::string, std::string> cpp_instance_ids; + + int r = librbd::api::Mirror<>::image_instance_id_list(io_ctx, start_id, max, + &cpp_instance_ids); + if (r < 0) { + return r; + } + + size_t i = 0; + for (auto &it : cpp_instance_ids) { + ceph_assert(i < max); + image_ids[i] = strdup(it.first.c_str()); + instance_ids[i] = strdup(it.second.c_str()); + i++; + } + *len = i; + return 0; +}
+
+extern "C" void rbd_mirror_image_instance_id_list_cleanup( + char **image_ids, char **instance_ids, size_t len) { + for (size_t i = 0; i < len; i++) { + free(image_ids[i]); + free(instance_ids[i]); + } +}
+
+extern "C" int rbd_mirror_image_info_list( + rados_ioctx_t p, rbd_mirror_image_mode_t *mode_filter, + const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_mode_t *mode_entries, + rbd_mirror_image_info_t *info_entries, size_t *num_entries) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + std::map<std::string, std::pair<librbd::mirror_image_mode_t, librbd::mirror_image_info_t>> cpp_entries; + + int r = librbd::api::Mirror<>::image_info_list(io_ctx, mode_filter, start_id, + max, &cpp_entries); + if (r < 0) { + return r; + } + + ceph_assert(cpp_entries.size() <= max); + + for (auto &it : cpp_entries) { + *(image_ids++) = strdup(it.first.c_str()); + *(mode_entries++) = it.second.first; + mirror_image_info_cpp_to_c(it.second.second, info_entries++); + } + *num_entries = cpp_entries.size(); + + return 0; +}
+
+extern "C" void rbd_mirror_image_info_list_cleanup( + char **image_ids, rbd_mirror_image_info_t *info_entries, + size_t num_entries) { + for (size_t i = 0; i < num_entries; i++) { + free(*(image_ids++)); + rbd_mirror_image_get_info_cleanup(info_entries++); + } +}
+
+/* helpers */
+
+extern "C" void rbd_image_spec_cleanup(rbd_image_spec_t *image) +{ + free(image->id); + free(image->name); +}
+
+extern "C" void rbd_image_spec_list_cleanup(rbd_image_spec_t *images, + size_t num_images) +{ + for (size_t idx = 0; idx < num_images; ++idx) { + rbd_image_spec_cleanup(&images[idx]); + } +}
+
+extern "C" void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image) +{ + free(image->pool_name); + free(image->pool_namespace); + free(image->image_id); + free(image->image_name); +}
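Each list call above strdup()s its strings into caller-provided arrays, so every successful call must be paired with exactly one invocation of the matching *_cleanup helper. A hypothetical sketch for the instance-id listing (assumes an opened io ioctx and at most 64 entries):

  char *image_ids[64];
  char *instance_ids[64];
  size_t n = 0;
  int r = rbd_mirror_image_instance_id_list(io, "", 64, image_ids,
                                            instance_ids, &n);
  if (r == 0) {
    // ... consume image_ids[0..n) / instance_ids[0..n) ...
    rbd_mirror_image_instance_id_list_cleanup(image_ids, instance_ids, n);
  }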
+extern "C" void rbd_linked_image_spec_list_cleanup( + rbd_linked_image_spec_t *images, size_t num_images) +{ + for (size_t idx = 0; idx < num_images; ++idx) { + rbd_linked_image_spec_cleanup(&images[idx]); + } +}
+
+extern "C" void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap) +{ + free(snap->name); +}
+
+/* images */
+extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + std::vector<librbd::image_spec_t> cpp_image_specs; + int r = librbd::api::Image<>::list_images(io_ctx, &cpp_image_specs); + if (r < 0) { + tracepoint(librbd, list_exit, r, *size); + return r; + } + + size_t expected_size = 0; + + for (auto& it : cpp_image_specs) { + expected_size += it.name.size() + 1; + } + if (*size < expected_size) { + *size = expected_size; + tracepoint(librbd, list_exit, -ERANGE, *size); + return -ERANGE; + } + + if (names == NULL) { + tracepoint(librbd, list_exit, -EINVAL, *size); + return -EINVAL; + } + + for (auto& it : cpp_image_specs) { + const char* name = it.name.c_str(); + tracepoint(librbd, list_entry, name); + strcpy(names, name); + names += strlen(names) + 1; + } + tracepoint(librbd, list_exit, (int)expected_size, *size); + return (int)expected_size; +}
+
+extern "C" int rbd_list2(rados_ioctx_t p, rbd_image_spec_t *images, + size_t *size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id()); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(images, 0, sizeof(*images) * *size); + std::vector<librbd::image_spec_t> cpp_image_specs; + int r = librbd::api::Image<>::list_images(io_ctx, &cpp_image_specs); + if (r < 0) { + tracepoint(librbd, list_exit, r, *size); + return r; + } + + size_t expected_size = cpp_image_specs.size(); + if (*size < expected_size) { + *size = expected_size; + tracepoint(librbd, list_exit, -ERANGE, *size); + return -ERANGE; + } + + *size = expected_size; + for (size_t idx = 0; idx < expected_size; ++idx) { + images[idx].id = strdup(cpp_image_specs[idx].id.c_str()); + images[idx].name = strdup(cpp_image_specs[idx].name.c_str()); + } + tracepoint(librbd, list_exit, 0, *size); + return 0; +}
+
+extern "C" int rbd_create(rados_ioctx_t p, const char *name, uint64_t size, int *order) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, *order); + int r = librbd::create(io_ctx, name, size, order); + tracepoint(librbd, create_exit, r, *order); + return r; +}
+
+extern "C" int rbd_create2(rados_ioctx_t p, const char *name, + uint64_t size, uint64_t features, + int *order) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create2_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order); + int r = librbd::create(io_ctx, name, size, false, features, order, 0, 0); + tracepoint(librbd, create2_exit, r, *order); + return r; +}
+
+extern "C" int rbd_create3(rados_ioctx_t p, const char *name, + uint64_t size, uint64_t features, + int *order, + uint64_t stripe_unit, uint64_t stripe_count) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create3_enter,
io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, features, *order, stripe_unit, stripe_count); + int r = librbd::create(io_ctx, name, size, false, features, order, + stripe_unit, stripe_count); + tracepoint(librbd, create3_exit, r, *order); + return r; +}
+
+extern "C" int rbd_create4(rados_ioctx_t p, const char *name, + uint64_t size, rbd_image_options_t opts) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts); + librbd::ImageOptions opts_(opts); + int r = librbd::create(io_ctx, name, "", size, opts_, "", "", false); + tracepoint(librbd, create4_exit, r); + return r; +}
+
+extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snap_name, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order) +{ + librados::IoCtx p_ioc, c_ioc; + librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc); + librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc); + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc)); + tracepoint(librbd, clone_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features); + int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name, + features, c_order, 0, 0); + tracepoint(librbd, clone_exit, r, *c_order); + return r; +}
+
+extern "C" int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snap_name, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count) +{ + librados::IoCtx p_ioc, c_ioc; + librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc); + librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc); + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc)); + tracepoint(librbd, clone2_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, features, stripe_unit, stripe_count); + int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name, + features, c_order, stripe_unit, stripe_count); + tracepoint(librbd, clone2_exit, r, *c_order); + return r; +}
+
+extern "C" int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snap_name, rados_ioctx_t c_ioctx, + const char *c_name, rbd_image_options_t c_opts) +{ + librados::IoCtx p_ioc, c_ioc; + librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc); + librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc); + TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc)); + tracepoint(librbd, clone3_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, c_opts); + librbd::ImageOptions c_opts_(c_opts); + int r = librbd::clone(p_ioc, nullptr, p_name, p_snap_name, c_ioc, nullptr, + c_name, c_opts_, "", ""); + tracepoint(librbd, clone3_exit, r); + return r; +}
+
+extern "C" int rbd_remove(rados_ioctx_t p, const char *name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx); + tracepoint(librbd, remove_exit, r); + return r; +}
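All of the rbd_create* variants above funnel into librbd::create; the order parameter is in/out, where 2^order is the requested object size in bytes, 0 asks librbd to use its configured default, and the value actually used is written back. A hypothetical sketch (io assumed to be a valid rados_ioctx_t):

  // Hypothetical sketch: 1 GiB layered image with the default object size.
  int order = 0;
  int r = rbd_create2(io, "img", 1ULL << 30, RBD_FEATURE_LAYERING, &order);
  // on success, `order` holds the object order the image was created with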
+extern "C" int rbd_remove_with_progress(rados_ioctx_t p, const char *name, + librbd_progress_fn_t cb, void *cbdata) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, remove_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::api::Image<>::remove(io_ctx, name, prog_ctx); + tracepoint(librbd, remove_exit, r); + return r; +}
+
+extern "C" int rbd_trash_move(rados_ioctx_t p, const char *name, + uint64_t delay) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_move_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + int r = librbd::api::Trash<>::move(io_ctx, RBD_TRASH_IMAGE_SOURCE_USER, name, + delay); + tracepoint(librbd, trash_move_exit, r); + return r; +}
+
+extern "C" int rbd_trash_get(rados_ioctx_t io, const char *id, + rbd_trash_image_info_t *info) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + librbd::trash_image_info_t cpp_info; + int r = librbd::api::Trash<>::get(io_ctx, id, &cpp_info); + if (r < 0) { + return r; + } + + trash_image_info_cpp_to_c(cpp_info, info); + return 0; +}
+
+extern "C" void rbd_trash_get_cleanup(rbd_trash_image_info_t *info) { + free(info->id); + free(info->name); +}
+
+extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries, + size_t *num_entries) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_list_enter, + io_ctx.get_pool_name().c_str(), io_ctx.get_id()); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(entries, 0, sizeof(*entries) * *num_entries); + + vector<librbd::trash_image_info_t> cpp_entries; + int r = librbd::api::Trash<>::list(io_ctx, cpp_entries, true); + if (r < 0) { + tracepoint(librbd, trash_list_exit, r, *num_entries); + return r; + } + + if (*num_entries < cpp_entries.size()) { + *num_entries = cpp_entries.size(); + tracepoint(librbd, trash_list_exit, -ERANGE, *num_entries); + return -ERANGE; + } + + int i=0; + for (const auto &entry : cpp_entries) { + trash_image_info_cpp_to_c(entry, &entries[i++]); + } + *num_entries = cpp_entries.size(); + + return *num_entries; +}
+
+extern "C" void rbd_trash_list_cleanup(rbd_trash_image_info_t *entries, + size_t num_entries) { + for (size_t i=0; i < num_entries; i++) { + rbd_trash_get_cleanup(&entries[i]); + } +}
+
+extern "C" int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, + float threshold) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + librbd::NoOpProgressContext nop_pctx; + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, nop_pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; +}
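rbd_trash_list uses the same in/out count convention as the other list calls: -ERANGE reports the required entry count, and a non-negative return is the number of entries filled in. A hypothetical sketch with a fixed-size buffer (io assumed):

  rbd_trash_image_info_t entries[16];
  size_t n = 16;
  int r = rbd_trash_list(io, entries, &n);
  if (r == -ERANGE) {
    // n now holds the required count; retry with a larger array
  } else if (r >= 0) {
    // ... inspect entries[0..n) ...
    rbd_trash_list_cleanup(entries, n);
  }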
+extern "C" int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts, + float threshold, librbd_progress_fn_t cb, void* cbdata) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_purge_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), expire_ts, threshold); + librbd::CProgressContext pctx(cb, cbdata); + int r = librbd::api::Trash<>::purge(io_ctx, expire_ts, threshold, pctx); + tracepoint(librbd, trash_purge_exit, r); + return r; +}
+
+extern "C" int rbd_trash_remove(rados_ioctx_t p, const char *image_id, + bool force) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx); + tracepoint(librbd, trash_remove_exit, r); + return r; +}
+
+extern "C" int rbd_trash_remove_with_progress(rados_ioctx_t p, + const char *image_id, + bool force, + librbd_progress_fn_t cb, + void *cbdata) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_remove_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_id, force); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::api::Trash<>::remove(io_ctx, image_id, force, prog_ctx); + tracepoint(librbd, trash_remove_exit, r); + return r; +}
+
+extern "C" int rbd_trash_restore(rados_ioctx_t p, const char *id, + const char *name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, trash_undelete_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), id, name); + int r = librbd::api::Trash<>::restore( + io_ctx, librbd::api::Trash<>::ALLOWED_RESTORE_SOURCES, id, name); + tracepoint(librbd, trash_undelete_exit, r); + return r; +}
+
+extern "C" int rbd_namespace_create(rados_ioctx_t io, + const char *namespace_name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Namespace<>::create(io_ctx, namespace_name); +}
+
+extern "C" int rbd_namespace_remove(rados_ioctx_t io, + const char *namespace_name) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Namespace<>::remove(io_ctx, namespace_name); +}
+
+extern "C" int rbd_namespace_list(rados_ioctx_t io, char *names, size_t *size) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + if (names == nullptr || size == nullptr) { + return -EINVAL; + } + + std::vector<std::string> cpp_names; + int r = librbd::api::Namespace<>::list(io_ctx, &cpp_names); + if (r < 0) { + return r; + } + + size_t expected_size = 0; + for (size_t i = 0; i < cpp_names.size(); i++) { + expected_size += cpp_names[i].size() + 1; + } + if (*size < expected_size) { + *size = expected_size; + return -ERANGE; + } + + *size = expected_size; + for (int i = 0; i < (int)cpp_names.size(); i++) { + const char* name = cpp_names[i].c_str(); + strcpy(names, name); + names += strlen(names) + 1; + } + + return (int)expected_size; +}
+
+extern "C" int rbd_namespace_exists(rados_ioctx_t io, + const char *namespace_name, + bool *exists) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Namespace<>::exists(io_ctx, namespace_name, exists); +}
+
+extern "C" int rbd_pool_init(rados_ioctx_t io, bool force) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + return librbd::api::Pool<>::init(io_ctx, force); +}
+
+extern "C" void rbd_pool_stats_create(rbd_pool_stats_t *stats) { + *stats = reinterpret_cast<rbd_pool_stats_t>( + new librbd::api::Pool<>::StatOptions{}); +}
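rbd_pool_stats_t is an opaque wrapper around a heap-allocated Pool<>::StatOptions map; callers register an output pointer per statistic and then fetch them all in one call via the option/get/destroy entry points that follow. A hypothetical sketch (RBD_POOL_STAT_OPTION_IMAGES is one of the stat-option enumerators from librbd.h; io assumed):

  rbd_pool_stats_t stats;
  rbd_pool_stats_create(&stats);
  uint64_t image_count = 0;
  rbd_pool_stats_option_add_uint64(stats, RBD_POOL_STAT_OPTION_IMAGES,
                                   &image_count);
  int r = rbd_pool_stats_get(io, stats);  // fills image_count on success
  rbd_pool_stats_destroy(stats);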
+extern "C" void rbd_pool_stats_destroy(rbd_pool_stats_t stats) { + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats); + delete pool_stat_options; +}
+
+extern "C" int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats, + int stat_option, + uint64_t* stat_val) { + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(stats); + return librbd::api::Pool<>::add_stat_option( + pool_stat_options, static_cast<rbd_pool_stat_option_t>(stat_option), + stat_val); +}
+
+extern "C" int rbd_pool_stats_get( + rados_ioctx_t io, rbd_pool_stats_t pool_stats) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(io, io_ctx); + + auto pool_stat_options = + reinterpret_cast<librbd::api::Pool<>::StatOptions*>(pool_stats); + return librbd::api::Pool<>::get_stats(io_ctx, pool_stat_options); +}
+
+extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname); + librbd::ImageOptions opts; + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0); + tracepoint(librbd, copy_exit, r); + return r; +}
+
+extern "C" int rbd_copy2(rbd_image_t srcp, rbd_image_t destp) +{ + librbd::ImageCtx *src = (librbd::ImageCtx *)srcp; + librbd::ImageCtx *dest = (librbd::ImageCtx *)destp; + tracepoint(librbd, copy2_enter, src, src->name.c_str(), src->snap_name.c_str(), src->read_only, dest, dest->name.c_str(), dest->snap_name.c_str(), dest->read_only); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(src, dest, prog_ctx, 0); + tracepoint(librbd, copy2_exit, r); + return r; +}
+
+extern "C" int rbd_copy3(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, rbd_image_options_t c_opts) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts); + librbd::ImageOptions c_opts_(c_opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx, 0); + tracepoint(librbd, copy3_exit, r); + return r; +}
+
+extern "C" int rbd_copy4(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, rbd_image_options_t c_opts, size_t sparse_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts, sparse_size); + librbd::ImageOptions c_opts_(c_opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx, sparse_size); + tracepoint(librbd, copy4_exit, r); + return r; +}
+
+extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname); +
librbd::ImageOptions opts; + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx, 0); + tracepoint(librbd, copy_exit, ret); + return ret; +} + +extern "C" int rbd_copy_with_progress2(rbd_image_t srcp, rbd_image_t destp, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *src = (librbd::ImageCtx *)srcp; + librbd::ImageCtx *dest = (librbd::ImageCtx *)destp; + tracepoint(librbd, copy2_enter, src, src->name.c_str(), src->snap_name.c_str(), src->read_only, dest, dest->name.c_str(), dest->snap_name.c_str(), dest->read_only); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(src, dest, prog_ctx, 0); + tracepoint(librbd, copy2_exit, ret); + return ret; +} + +extern "C" int rbd_copy_with_progress3(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, dest_opts); + librbd::ImageOptions dest_opts_(dest_opts); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx, 0); + tracepoint(librbd, copy3_exit, ret); + return ret; +} + +extern "C" int rbd_copy_with_progress4(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t fn, void *data, size_t sparse_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, copy4_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, dest_opts, sparse_size); + librbd::ImageOptions dest_opts_(dest_opts); + librbd::CProgressContext prog_ctx(fn, data); + int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx, sparse_size); + tracepoint(librbd, copy4_exit, ret); + return ret; +} + +extern "C" int rbd_deep_copy(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, rbd_image_options_t c_opts) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, + dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), + destname, c_opts); + librbd::ImageOptions opts(c_opts); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts, + prog_ctx); + tracepoint(librbd, deep_copy_exit, r); + return r; +} + +extern "C" int rbd_deep_copy_with_progress(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t fn, void *data) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, deep_copy_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only, + dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), + destname, dest_opts); + librbd::ImageOptions opts(dest_opts); + librbd::CProgressContext 
prog_ctx(fn, data); + int ret = librbd::api::Image<>::deep_copy(ictx, dest_io_ctx, destname, opts, + prog_ctx); + tracepoint(librbd, deep_copy_exit, ret); + return ret; +}
+
+extern "C" int rbd_encryption_format(rbd_image_t image, + rbd_encryption_format_t format, + rbd_encryption_options_t opts, + size_t opts_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Image<>::encryption_format( + ictx, format, opts, opts_size, true); +}
+
+extern "C" int rbd_encryption_load(rbd_image_t image, + rbd_encryption_format_t format, + rbd_encryption_options_t opts, + size_t opts_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::encryption_spec_t spec = {format, opts, opts_size}; + return librbd::api::Image<>::encryption_load(ictx, &spec, 1, true); +}
+
+extern "C" int rbd_encryption_load2(rbd_image_t image, + const rbd_encryption_spec_t *specs, + size_t spec_count) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Image<>::encryption_load(ictx, specs, spec_count, true); +}
+
+extern "C" int rbd_flatten(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->flatten(prog_ctx); + tracepoint(librbd, flatten_exit, r); + return r; +}
+
+extern "C" int rbd_flatten_with_progress(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, flatten_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->flatten(prog_ctx); + tracepoint(librbd, flatten_exit, r); + return r; +}
+
+extern "C" int rbd_sparsify(rbd_image_t image, size_t sparse_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size, + ictx->id.c_str()); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->sparsify(sparse_size, prog_ctx); + tracepoint(librbd, sparsify_exit, r); + return r; +}
+
+extern "C" int rbd_sparsify_with_progress(rbd_image_t image, size_t sparse_size, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, sparsify_enter, ictx, ictx->name.c_str(), sparse_size, + ictx->id.c_str()); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->sparsify(sparse_size, prog_ctx); + tracepoint(librbd, sparsify_exit, r); + return r; +}
+
+extern "C" int rbd_rename(rados_ioctx_t src_p, const char *srcname, + const char *destname) +{ + librados::IoCtx src_io_ctx; + librados::IoCtx::from_rados_ioctx_t(src_p, src_io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(src_io_ctx)); + tracepoint(librbd, rename_enter, src_io_ctx.get_pool_name().c_str(), src_io_ctx.get_id(), srcname, destname); + int r = librbd::rename(src_io_ctx, srcname, destname); + tracepoint(librbd, rename_exit, r); + return r; +}
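Live migration is driven as a multi-step flow through the entry points that follow: prepare links a source to a destination image, execute copies the blocks, and commit (or abort) finalizes or rolls back; after prepare the migrating image is normally addressed by its destination pool and name. A hypothetical sketch (ioctx handles, option setup and error handling elided):

  rbd_image_options_t opts;
  rbd_image_options_create(&opts);
  int r = rbd_migration_prepare(src_io, "img", dst_io, "img_new", opts);
  if (r == 0) r = rbd_migration_execute(dst_io, "img_new");
  if (r == 0) r = rbd_migration_commit(dst_io, "img_new");
  // after a successful prepare, rbd_migration_abort(dst_io, "img_new") rolls back
  rbd_image_options_destroy(opts);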
+extern "C" int rbd_migration_prepare(rados_ioctx_t p, const char *image_name, + rados_ioctx_t dest_p, + const char *dest_image_name, + rbd_image_options_t opts_) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + tracepoint(librbd, migration_prepare_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name, dest_io_ctx.get_pool_name().c_str(), + dest_io_ctx.get_id(), dest_image_name, opts_); + librbd::ImageOptions opts(opts_); + int r = librbd::api::Migration<>::prepare(io_ctx, image_name, dest_io_ctx, + dest_image_name, opts); + tracepoint(librbd, migration_prepare_exit, r); + return r; +}
+
+extern "C" int rbd_migration_prepare_import( + const char *source_spec, rados_ioctx_t dest_p, + const char *dest_image_name, rbd_image_options_t opts_) { + librados::IoCtx dest_io_ctx; + librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx); + librbd::ImageOptions opts(opts_); + return librbd::api::Migration<>::prepare_import(source_spec, dest_io_ctx, + dest_image_name, opts); +}
+
+extern "C" int rbd_migration_execute(rados_ioctx_t p, const char *image_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::execute(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; +}
+
+extern "C" int rbd_migration_execute_with_progress(rados_ioctx_t p, + const char *name, + librbd_progress_fn_t fn, + void *data) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_execute_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(fn, data); + int r = librbd::api::Migration<>::execute(io_ctx, name, prog_ctx); + tracepoint(librbd, migration_execute_exit, r); + return r; +}
+
+extern "C" int rbd_migration_abort(rados_ioctx_t p, const char *image_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::abort(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; +}
+
+extern "C" int rbd_migration_abort_with_progress(rados_ioctx_t p, + const char *name, + librbd_progress_fn_t fn, + void *data) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_abort_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(fn, data); + int r = librbd::api::Migration<>::abort(io_ctx, name, prog_ctx); + tracepoint(librbd, migration_abort_exit, r); + return r; +}
+
+extern "C" int rbd_migration_commit(rados_ioctx_t p, const char *image_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Migration<>::commit(io_ctx, image_name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; +}
+
+extern "C" int rbd_migration_commit_with_progress(rados_ioctx_t p, + const char *name, + librbd_progress_fn_t fn, + void *data) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_commit_enter, io_ctx.get_pool_name().c_str(), +
io_ctx.get_id(), name); + librbd::CProgressContext prog_ctx(fn, data); + int r = librbd::api::Migration<>::commit(io_ctx, name, prog_ctx); + tracepoint(librbd, migration_commit_exit, r); + return r; +}
+
+extern "C" int rbd_migration_status(rados_ioctx_t p, const char *image_name, + rbd_image_migration_status_t *status, + size_t status_size) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + tracepoint(librbd, migration_status_enter, io_ctx.get_pool_name().c_str(), + io_ctx.get_id(), image_name); + + if (status_size != sizeof(rbd_image_migration_status_t)) { + tracepoint(librbd, migration_status_exit, -ERANGE); + return -ERANGE; + } + + librbd::image_migration_status_t cpp_status; + int r = librbd::api::Migration<>::status(io_ctx, image_name, &cpp_status); + if (r >= 0) { + status->source_pool_id = cpp_status.source_pool_id; + status->source_pool_namespace = + strdup(cpp_status.source_pool_namespace.c_str()); + status->source_image_name = strdup(cpp_status.source_image_name.c_str()); + status->source_image_id = strdup(cpp_status.source_image_id.c_str()); + status->dest_pool_id = cpp_status.dest_pool_id; + status->dest_pool_namespace = + strdup(cpp_status.dest_pool_namespace.c_str()); + status->dest_image_name = strdup(cpp_status.dest_image_name.c_str()); + status->dest_image_id = strdup(cpp_status.dest_image_id.c_str()); + status->state = cpp_status.state; + status->state_description = strdup(cpp_status.state_description.c_str()); + } + + tracepoint(librbd, migration_status_exit, r); + return r; +}
+
+extern "C" void rbd_migration_status_cleanup(rbd_image_migration_status_t *s) +{ + free(s->source_pool_namespace); + free(s->source_image_name); + free(s->source_image_id); + free(s->dest_pool_namespace); + free(s->dest_image_name); + free(s->dest_image_id); + free(s->state_description); +}
+
+extern "C" int rbd_pool_metadata_get(rados_ioctx_t p, const char *key, + char *value, size_t *vallen) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + string val_s; + int r = librbd::api::PoolMetadata<>::get(io_ctx, key, &val_s); + if (*vallen < val_s.size() + 1) { + r = -ERANGE; + *vallen = val_s.size() + 1; + } else { + strncpy(value, val_s.c_str(), val_s.size() + 1); + } + + return r; +}
+
+extern "C" int rbd_pool_metadata_set(rados_ioctx_t p, const char *key, + const char *value) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + int r = librbd::api::PoolMetadata<>::set(io_ctx, key, value); + return r; +}
+
+extern "C" int rbd_pool_metadata_remove(rados_ioctx_t p, const char *key) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + int r = librbd::api::PoolMetadata<>::remove(io_ctx, key); + return r; +}
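rbd_pool_metadata_get is another in/out buffer probe: vallen reports the required size (including the trailing NUL) alongside -ERANGE. A hypothetical sketch (the key name is illustrative only; io assumed):

  size_t len = 0;
  int r = rbd_pool_metadata_get(io, "conf_rbd_cache", nullptr, &len);
  if (r == -ERANGE) {
    char *value = static_cast<char*>(malloc(len));
    r = rbd_pool_metadata_get(io, "conf_rbd_cache", value, &len);
    // ... use value on success ...
    free(value);
  }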
+extern "C" int rbd_pool_metadata_list(rados_ioctx_t p, const char *start, + uint64_t max, char *key, size_t *key_len, + char *value, size_t *val_len) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + map<string, ceph::bufferlist> pairs; + int r = librbd::api::PoolMetadata<>::list(io_ctx, start, max, &pairs); + if (r < 0) { + return r; + } + size_t key_total_len = 0, val_total_len = 0; + for (auto &it : pairs) { + key_total_len += it.first.size() + 1; + val_total_len += it.second.length() + 1; + } + if (*key_len < key_total_len || *val_len < val_total_len) { + *key_len = key_total_len; + *val_len = val_total_len; + return -ERANGE; + } + *key_len = key_total_len; + *val_len = val_total_len; + + char *key_p = key, *value_p = value; + for (auto &it : pairs) { + strncpy(key_p, it.first.c_str(), it.first.size() + 1); + key_p += it.first.size() + 1; + strncpy(value_p, it.second.c_str(), it.second.length()); + value_p += it.second.length(); + *value_p = '\0'; + value_p++; + } + return 0; +}
+
+extern "C" int rbd_config_pool_list(rados_ioctx_t p, + rbd_config_option_t *options, + int *max_options) { + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + + std::vector<librbd::config_option_t> option_vector; + int r = librbd::api::Config<>::list(io_ctx, &option_vector); + if (r < 0) { + return r; + } + + if (*max_options < static_cast<int>(option_vector.size())) { + *max_options = static_cast<int>(option_vector.size()); + return -ERANGE; + } + + for (int i = 0; i < static_cast<int>(option_vector.size()); ++i) { + config_option_cpp_to_c(option_vector[i], &options[i]); + } + *max_options = static_cast<int>(option_vector.size()); + return 0; +}
+
+extern "C" void rbd_config_pool_list_cleanup(rbd_config_option_t *options, + int max_options) { + for (int i = 0; i < max_options; ++i) { + config_option_cleanup(options[i]); + } +}
+
+extern "C" int rbd_open(rados_ioctx_t p, const char *name, rbd_image_t *image, + const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + false); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +}
+
+extern "C" int rbd_open_by_id(rados_ioctx_t p, const char *id, + rbd_image_t *image, const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + false); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +}
+
+extern "C" int rbd_aio_open(rados_ioctx_t p, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + false); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +}
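The rbd_aio_open* variants return immediately and report the real open result through the supplied completion; the image handle is only usable once the completion fires with a non-negative return value. A hypothetical sketch (io assumed):

  rbd_completion_t c;
  rbd_aio_create_completion(nullptr, nullptr, &c);
  rbd_image_t img;
  rbd_aio_open(io, "img", &img, nullptr, c);
  rbd_aio_wait_for_complete(c);
  int r = rbd_aio_get_return_value(c);
  rbd_aio_release(c);
  // use img only if r >= 0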
+extern "C" int rbd_aio_open_by_id(rados_ioctx_t p, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + false); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, + comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +}
+
+extern "C" int rbd_open_read_only(rados_ioctx_t p, const char *name, + rbd_image_t *image, const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + true); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +}
+
+extern "C" int rbd_open_by_id_read_only(rados_ioctx_t p, const char *id, + rbd_image_t *image, const char *snap_name) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + true); + tracepoint(librbd, open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only); + + int r = ictx->state->open(0); + if (r >= 0) { + *image = (rbd_image_t)ictx; + } + tracepoint(librbd, open_image_exit, r); + return r; +}
+
+extern "C" int rbd_aio_open_read_only(rados_ioctx_t p, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx(name, "", snap_name, io_ctx, + true); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +}
+
+extern "C" int rbd_aio_open_by_id_read_only(rados_ioctx_t p, const char *id, + rbd_image_t *image, + const char *snap_name, + rbd_completion_t c) +{ + librados::IoCtx io_ctx; + librados::IoCtx::from_rados_ioctx_t(p, io_ctx); + TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx)); + librbd::ImageCtx *ictx = new librbd::ImageCtx("", id, snap_name, io_ctx, + true); + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_open_image_enter, ictx, ictx->name.c_str(), + ictx->id.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc); + ictx->state->open(0, new C_OpenComplete(ictx, get_aio_completion(comp), + image)); + tracepoint(librbd, aio_open_image_exit, 0); + return 0; +}
+
+extern "C" int rbd_features_to_string(uint64_t features, char *str_features, size_t *size) +{ + std::stringstream err; + std::string get_str_features = librbd::rbd_features_to_string(features, &err); + if (!err.str().empty()) { + return -EINVAL; + } + uint64_t expected_size = get_str_features.size(); + if (*size <= expected_size) { + *size = expected_size + 1; + return -ERANGE; + } + strncpy(str_features, get_str_features.c_str(), expected_size); + str_features[expected_size] = '\0'; + *size = expected_size + 1; + return 0; +}
+
+extern "C" int rbd_features_from_string(const char *str_features, uint64_t *features) +{ + std::stringstream err; + *features = librbd::rbd_features_from_string(str_features, &err); + if (!err.str().empty()) { + return -EINVAL; + }
+ + return 0; +} + +extern "C" int rbd_close(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str()); + + int r = ictx->state->close(); + + tracepoint(librbd, close_image_exit, r); + return r; +} + +extern "C" int rbd_aio_close(rbd_image_t image, rbd_completion_t c) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c; + tracepoint(librbd, aio_close_image_enter, ictx, ictx->name.c_str(), ictx->id.c_str(), comp->pc); + ictx->state->close(new C_AioCompletion(ictx, librbd::io::AIO_TYPE_CLOSE, + get_aio_completion(comp))); + tracepoint(librbd, aio_close_image_exit, 0); + return 0; +} + +extern "C" int rbd_resize(rbd_image_t image, uint64_t size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::NoOpProgressContext prog_ctx; + int r = ictx->operations->resize(size, true, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; +} + +extern "C" int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->resize(size, allow_shrink, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; +} + +extern "C" int rbd_resize_with_progress(rbd_image_t image, uint64_t size, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, resize_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, size); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = ictx->operations->resize(size, true, prog_ctx); + tracepoint(librbd, resize_exit, r); + return r; +} + +extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info, + size_t infosize) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, stat_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::info(ictx, *info, infosize); + tracepoint(librbd, stat_exit, r, info); + return r; +} + +extern "C" int rbd_get_old_format(rbd_image_t image, uint8_t *old) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_old_format_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_old_format(ictx, old); + tracepoint(librbd, get_old_format_exit, r, *old); + return r; +} + +extern "C" int rbd_get_size(rbd_image_t image, uint64_t *size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_size_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_size(ictx, size); + tracepoint(librbd, get_size_exit, r, *size); + return r; +} + +extern "C" int rbd_get_features(rbd_image_t image, uint64_t *features) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_features_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_features(ictx, features); + tracepoint(librbd, get_features_exit, r, *features); + return r; +} + +extern "C" int rbd_update_features(rbd_image_t image, uint64_t features, + uint8_t enabled) +{ + 
librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + bool features_enabled = enabled != 0; + tracepoint(librbd, update_features_enter, ictx, features, features_enabled); + int r = ictx->operations->update_features(features, features_enabled); + tracepoint(librbd, update_features_exit, r); + return r; +}
+
+extern "C" int rbd_get_op_features(rbd_image_t image, uint64_t *op_features) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + return librbd::api::Image<>::get_op_features(ictx, op_features); +}
+
+extern "C" int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_stripe_unit_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + *stripe_unit = ictx->get_stripe_unit(); + tracepoint(librbd, get_stripe_unit_exit, 0, *stripe_unit); + return 0; +}
+
+extern "C" int rbd_get_stripe_count(rbd_image_t image, uint64_t *stripe_count) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_stripe_count_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + *stripe_count = ictx->get_stripe_count(); + tracepoint(librbd, get_stripe_count_exit, 0, *stripe_count); + return 0; +}
+
+extern "C" int rbd_get_create_timestamp(rbd_image_t image, + struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_create_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + utime_t time = ictx->get_create_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_create_timestamp_exit, 0, timestamp); + return 0; +}
+
+extern "C" int rbd_get_access_timestamp(rbd_image_t image, + struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_access_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + utime_t time = ictx->get_access_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_access_timestamp_exit, 0, timestamp); + return 0; +}
+
+extern "C" int rbd_get_modify_timestamp(rbd_image_t image, + struct timespec *timestamp) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_modify_timestamp_enter, ictx, ictx->name.c_str(), + ictx->read_only); + utime_t time = ictx->get_modify_timestamp(); + time.to_timespec(timestamp); + tracepoint(librbd, get_modify_timestamp_exit, 0, timestamp); + return 0; +}
+
+
+extern "C" int rbd_get_overlap(rbd_image_t image, uint64_t *overlap) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_overlap_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only); + int r = librbd::get_overlap(ictx, overlap); + tracepoint(librbd, get_overlap_exit, r, *overlap); + return r; +}
+
+extern "C" int rbd_get_name(rbd_image_t image, char *name, size_t *name_len) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + if (*name_len <= ictx->name.size()) { + *name_len = ictx->name.size() + 1; + return -ERANGE; + } + + strncpy(name, ictx->name.c_str(), ictx->name.size()); + name[ictx->name.size()] = '\0'; + *name_len = ictx->name.size() + 1; + return 0; +}
+
+extern "C" int rbd_get_id(rbd_image_t image, char *id, size_t id_len) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + if (ictx->old_format) { + return -EINVAL; + } + if (ictx->id.size() >= id_len) { + return -ERANGE; + } + + strncpy(id, ictx->id.c_str(), id_len - 1); + id[id_len - 1] = '\0'; + return 0; +}
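rbd_get_name treats name_len as in/out and returns -ERANGE with the required size (including the NUL), while rbd_get_id takes a plain capacity; both NUL-terminate on success. A hypothetical probe-and-retry sketch (img assumed to be an open rbd_image_t):

  size_t len = 0;
  int r = rbd_get_name(img, nullptr, &len);  // probe: -ERANGE, len set
  if (r == -ERANGE) {
    char *name = static_cast<char*>(malloc(len));
    r = rbd_get_name(img, name, &len);
    // ... use name on success ...
    free(name);
  }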
+extern "C" int rbd_get_block_name_prefix(rbd_image_t image, char *prefix, + size_t prefix_len) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + if (ictx->object_prefix.size() >= prefix_len) { + return -ERANGE; + } + + strncpy(prefix, ictx->object_prefix.c_str(), prefix_len - 1); + prefix[prefix_len - 1] = '\0'; + return 0; +}
+
+extern "C" int64_t rbd_get_data_pool_id(rbd_image_t image) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + return librbd::api::Image<>::get_data_pool_id(ictx); +}
+
+extern "C" int rbd_get_parent_info(rbd_image_t image, + char *parent_pool_name, size_t ppool_namelen, + char *parent_name, size_t pnamelen, + char *parent_snap_name, size_t psnap_namelen) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap); + if (r >= 0) { + if (parent_pool_name) { + if (parent_image.pool_name.length() + 1 > ppool_namelen) { + r = -ERANGE; + } else { + strcpy(parent_pool_name, parent_image.pool_name.c_str()); + } + } + if (parent_name) { + if (parent_image.image_name.length() + 1 > pnamelen) { + r = -ERANGE; + } else { + strcpy(parent_name, parent_image.image_name.c_str()); + } + } + if (parent_snap_name) { + if (parent_snap.name.length() + 1 > psnap_namelen) { + r = -ERANGE; + } else { + strcpy(parent_snap_name, parent_snap.name.c_str()); + } + } + } + + if (r < 0) { + tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL); + return r; + } + + tracepoint(librbd, get_parent_info_exit, r, + parent_image.pool_name.c_str(), + parent_image.image_name.c_str(), + parent_image.image_id.c_str(), + parent_snap.name.c_str()); + return 0; +}
+
+extern "C" int rbd_get_parent_info2(rbd_image_t image, + char *parent_pool_name, + size_t ppool_namelen, + char *parent_name, size_t pnamelen, + char *parent_id, size_t pidlen, + char *parent_snap_name, + size_t psnap_namelen) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + librbd::linked_image_spec_t parent_image; + librbd::snap_spec_t parent_snap; + int r = librbd::api::Image<>::get_parent(ictx, &parent_image, &parent_snap); + if (r >= 0) { + if (parent_pool_name) { + if (parent_image.pool_name.length() + 1 > ppool_namelen) { + r = -ERANGE; + } else { + strcpy(parent_pool_name, parent_image.pool_name.c_str()); + } + } + if (parent_name) { + if (parent_image.image_name.length() + 1 > pnamelen) { + r = -ERANGE; + } else { + strcpy(parent_name, parent_image.image_name.c_str()); + } + } + if (parent_id) { + if (parent_image.image_id.length() + 1 > pidlen) { + r = -ERANGE; + } else { + strcpy(parent_id, parent_image.image_id.c_str()); + } + } + if (parent_snap_name) { + if (parent_snap.name.length() + 1 > psnap_namelen) { + r = -ERANGE; + } else { + strcpy(parent_snap_name, parent_snap.name.c_str()); + } + } + } + + if (r < 0) { + tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL); + return r; + } + + tracepoint(librbd, get_parent_info_exit, r, + parent_image.pool_name.c_str(), + parent_image.image_name.c_str(), + parent_image.image_id.c_str(), + parent_snap.name.c_str()); + return 0; +}
+
+extern "C" int rbd_get_parent(rbd_image_t image, + rbd_linked_image_spec_t *parent_image, + rbd_snap_spec_t *parent_snap) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); +
tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), + ictx->snap_name.c_str(), ictx->read_only); + + librbd::linked_image_spec_t cpp_parent_image; + librbd::snap_spec_t cpp_parent_snap; + int r = librbd::api::Image<>::get_parent(ictx, &cpp_parent_image, + &cpp_parent_snap); + if (r < 0) { + // FIPS zeroization audit 20191117: these memsets are not security related. + memset(parent_image, 0, sizeof(rbd_linked_image_spec_t)); + memset(parent_snap, 0, sizeof(rbd_snap_spec_t)); + } else { + *parent_image = { + .pool_id = cpp_parent_image.pool_id, + .pool_name = strdup(cpp_parent_image.pool_name.c_str()), + .pool_namespace = strdup(cpp_parent_image.pool_namespace.c_str()), + .image_id = strdup(cpp_parent_image.image_id.c_str()), + .image_name = strdup(cpp_parent_image.image_name.c_str()), + .trash = cpp_parent_image.trash}; + *parent_snap = { + .id = cpp_parent_snap.id, + .namespace_type = cpp_parent_snap.namespace_type, + .name = strdup(cpp_parent_snap.name.c_str())}; + } + + tracepoint(librbd, get_parent_info_exit, r, + parent_image->pool_name, + parent_image->image_name, + parent_image->image_id, + parent_snap->name); + return r; +}
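rbd_get_parent either zeroes the output specs (on error) or fills them with strdup()'d strings, so a successful call must be paired with the spec cleanup helpers defined earlier in this file. A hypothetical sketch (img assumed):

  rbd_linked_image_spec_t parent;
  rbd_snap_spec_t snap;
  int r = rbd_get_parent(img, &parent, &snap);
  if (r == 0) {
    // ... inspect parent.pool_name / parent.image_name / snap.name ...
    rbd_linked_image_spec_cleanup(&parent);
    rbd_snap_spec_cleanup(&snap);
  }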
+extern "C" int rbd_get_migration_source_spec(rbd_image_t image, + char* source_spec, + size_t* max_len) +{ + auto ictx = reinterpret_cast<librbd::ImageCtx*>(image); + + std::string cpp_source_spec; + int r = librbd::api::Migration<>::get_source_spec(ictx, &cpp_source_spec); + if (r < 0) { + return r; + } + + size_t expected_size = cpp_source_spec.size(); + if (expected_size >= *max_len) { + *max_len = expected_size + 1; + return -ERANGE; + } + + strncpy(source_spec, cpp_source_spec.c_str(), expected_size); + source_spec[expected_size] = '\0'; + *max_len = expected_size + 1; + + return 0; +}
+
+extern "C" int rbd_get_flags(rbd_image_t image, uint64_t *flags) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, get_flags_enter, ictx); + int r = librbd::get_flags(ictx, flags); + tracepoint(librbd, get_flags_exit, ictx, r, *flags); + return r; +}
+
+extern "C" int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info, + size_t group_info_size) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, image_get_group_enter, ictx->name.c_str()); + + if (group_info_size != sizeof(rbd_group_info_t)) { + tracepoint(librbd, image_get_group_exit, -ERANGE); + return -ERANGE; + } + + librbd::group_info_t cpp_group_info; + int r = librbd::api::Group<>::image_get_group(ictx, &cpp_group_info); + if (r >= 0) { + group_info_cpp_to_c(cpp_group_info, group_info); + } else { + group_info->name = NULL; + } + + tracepoint(librbd, image_get_group_exit, r); + return r; +}
+
+extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, set_image_notification_enter, ictx, fd, type); + int r = librbd::set_image_notification(ictx, fd, type); + tracepoint(librbd, set_image_notification_exit, ictx, r); + return r; +}
+
+extern "C" int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, is_exclusive_lock_owner_enter, ictx); + bool owner; + int r = librbd::is_exclusive_lock_owner(ictx, &owner); + *is_owner = owner ? 1 : 0; + tracepoint(librbd, is_exclusive_lock_owner_exit, ictx, r, *is_owner); + return r; +}
+
+extern "C" int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, lock_acquire_enter, ictx, lock_mode); + int r = librbd::lock_acquire(ictx, lock_mode); + tracepoint(librbd, lock_acquire_exit, ictx, r); + return r; +}
+
+extern "C" int rbd_lock_release(rbd_image_t image) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, lock_release_enter, ictx); + int r = librbd::lock_release(ictx); + tracepoint(librbd, lock_release_exit, ictx, r); + return r; +}
+
+extern "C" int rbd_lock_get_owners(rbd_image_t image, + rbd_lock_mode_t *lock_mode, + char **lock_owners, + size_t *max_lock_owners) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, lock_get_owners_enter, ictx); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(lock_owners, 0, sizeof(*lock_owners) * *max_lock_owners); + std::list<std::string> lock_owner_list; + int r = librbd::lock_get_owners(ictx, lock_mode, &lock_owner_list); + if (r >= 0) { + if (*max_lock_owners >= lock_owner_list.size()) { + *max_lock_owners = 0; + for (auto &lock_owner : lock_owner_list) { + lock_owners[(*max_lock_owners)++] = strdup(lock_owner.c_str()); + } + } else { + *max_lock_owners = lock_owner_list.size(); + r = -ERANGE; + } + } + tracepoint(librbd, lock_get_owners_exit, ictx, r); + return r; +}
+
+extern "C" void rbd_lock_get_owners_cleanup(char **lock_owners, + size_t lock_owner_count) +{ + for (size_t i = 0; i < lock_owner_count; ++i) { + free(lock_owners[i]); + } +}
+
+extern "C" int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode, + const char *lock_owner) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner); + int r = librbd::lock_break(ictx, lock_mode, lock_owner); + tracepoint(librbd, lock_break_exit, ictx, r); + return r; +}
+
+extern "C" int rbd_rebuild_object_map(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata) +{ + librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image); + librbd::CProgressContext prog_ctx(cb, cbdata); + return ictx->operations->rebuild_object_map(prog_ctx); +}
+
+/* snapshots */
+extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + auto flags = librbd::util::get_default_snap_create_flags(ictx); + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx); + tracepoint(librbd, snap_create_exit, r); + return r; +}
+
+extern "C" int rbd_snap_create2(rbd_image_t image, const char *snap_name, + uint32_t flags, librbd_progress_fn_t cb, + void *cbdata) +{ + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name); + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx); + tracepoint(librbd, snap_create_exit, r); + return r; +}
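rbd_snap_create2 exposes the snapshot-creation flags and a progress callback that the plain rbd_snap_create fills with defaults; the callback should be a valid function pointer since it is invoked for progress updates. A hypothetical sketch (the flag constant is the quiesce-skipping option from librbd.h; img assumed):

  static int noop_progress(uint64_t offset, uint64_t total, void *arg) {
    return 0;
  }

  int r = rbd_snap_create2(img, "before-upgrade",
                           RBD_SNAP_CREATE_SKIP_QUIESCE, noop_progress,
                           nullptr);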
+
+extern "C" int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode,
+                              const char *lock_owner)
+{
+  librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, lock_break_enter, ictx, lock_mode, lock_owner);
+  int r = librbd::lock_break(ictx, lock_mode, lock_owner);
+  tracepoint(librbd, lock_break_exit, ictx, r);
+  return r;
+}
+
+extern "C" int rbd_rebuild_object_map(rbd_image_t image,
+                                      librbd_progress_fn_t cb, void *cbdata)
+{
+  librbd::ImageCtx *ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  librbd::CProgressContext prog_ctx(cb, cbdata);
+  return ictx->operations->rebuild_object_map(prog_ctx);
+}
+
+/* snapshots */
+extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  auto flags = librbd::util::get_default_snap_create_flags(ictx);
+  librbd::NoOpProgressContext prog_ctx;
+  int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+  tracepoint(librbd, snap_create_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_create2(rbd_image_t image, const char *snap_name,
+                                uint32_t flags, librbd_progress_fn_t cb,
+                                void *cbdata)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_create_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  librbd::CProgressContext prog_ctx(cb, cbdata);
+  int r = librbd::api::Snapshot<>::create(ictx, snap_name, flags, prog_ctx);
+  tracepoint(librbd, snap_create_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_rename(rbd_image_t image, const char *srcname, const char *dstname)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, srcname, dstname);
+  int r = ictx->operations->snap_rename(srcname, dstname);
+  tracepoint(librbd, snap_rename_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_remove(rbd_image_t image, const char *snap_name)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_remove_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  librbd::NoOpProgressContext prog_ctx;
+  int r = librbd::api::Snapshot<>::remove(ictx, snap_name, 0, prog_ctx);
+  tracepoint(librbd, snap_remove_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_remove2(rbd_image_t image, const char *snap_name, uint32_t flags,
+                                librbd_progress_fn_t cb, void *cbdata)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_remove2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name, flags);
+  librbd::CProgressContext prog_ctx(cb, cbdata);
+  int r = librbd::api::Snapshot<>::remove(ictx, snap_name, flags, prog_ctx);
+  tracepoint(librbd, snap_remove_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Snapshot<>::remove(ictx, snap_id);
+}
+
+extern "C" int rbd_snap_rollback(rbd_image_t image, const char *snap_name)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  librbd::NoOpProgressContext prog_ctx;
+  int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+  tracepoint(librbd, snap_rollback_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_rollback_with_progress(rbd_image_t image,
+                                               const char *snap_name,
+                                               librbd_progress_fn_t cb,
+                                               void *cbdata)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_rollback_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  librbd::CProgressContext prog_ctx(cb, cbdata);
+  int r = ictx->operations->snap_rollback(cls::rbd::UserSnapshotNamespace(), snap_name, prog_ctx);
+  tracepoint(librbd, snap_rollback_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
+                             int *max_snaps)
+{
+  vector<librbd::snap_info_t> cpp_snaps;
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_list_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snaps);
+
+  if (!max_snaps) {
+    tracepoint(librbd, snap_list_exit, -EINVAL, 0);
+    return -EINVAL;
+  }
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(snaps, 0, sizeof(*snaps) * *max_snaps);
+
+  int r = librbd::api::Snapshot<>::list(ictx, cpp_snaps);
+  if (r == -ENOENT) {
+    tracepoint(librbd, snap_list_exit, 0, *max_snaps);
+    return 0;
+  }
+  if (r < 0) {
+    tracepoint(librbd, snap_list_exit, r, *max_snaps);
+    return r;
+  }
+  if (*max_snaps < (int)cpp_snaps.size() + 1) {
+    *max_snaps = (int)cpp_snaps.size() + 1;
+    tracepoint(librbd, snap_list_exit, -ERANGE, *max_snaps);
+    return -ERANGE;
+  }
+
+  int i;
+
+  for (i = 0; i < (int)cpp_snaps.size(); i++) {
+    snaps[i].id = cpp_snaps[i].id;
+    snaps[i].size = cpp_snaps[i].size;
+    snaps[i].name = strdup(cpp_snaps[i].name.c_str());
+    if (!snaps[i].name) {
+      for (int j = 0; j < i; j++)
+        free((void *)snaps[j].name);
+      tracepoint(librbd, snap_list_exit, -ENOMEM, *max_snaps);
+      return -ENOMEM;
+    }
+    tracepoint(librbd, snap_list_entry, snaps[i].id, snaps[i].size, snaps[i].name);
+  }
+  snaps[i].id = 0;
+  snaps[i].size = 0;
+  snaps[i].name = NULL;
+
+  r = (int)cpp_snaps.size();
+  tracepoint(librbd, snap_list_exit, r, *max_snaps);
+  return r;
+}
+
+extern "C" void rbd_snap_list_end(rbd_snap_info_t *snaps)
+{
+  tracepoint(librbd, snap_list_end_enter, snaps);
+  while (snaps->name) {
+    free((void *)snaps->name);
+    snaps++;
+  }
+  tracepoint(librbd, snap_list_end_exit);
+}
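+
+// Illustrative caller-side sketch (editorial addition, not upstream code):
+// rbd_snap_list() NULL-terminates the array (hence the "+ 1" capacity check
+// above) and strdup()'s each name, so every successful call must be paired
+// with rbd_snap_list_end():
+//
+//   int max_snaps = 16;
+//   rbd_snap_info_t snaps[16];
+//   int n = rbd_snap_list(image, snaps, &max_snaps);  // -ERANGE => retry
+//   if (n >= 0) {
+//     /* use snaps[0..n-1] */
+//     rbd_snap_list_end(snaps);
+//   }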
+
+extern "C" int rbd_snap_protect(rbd_image_t image, const char *snap_name)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_protect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  int r = ictx->operations->snap_protect(cls::rbd::UserSnapshotNamespace(), snap_name);
+  tracepoint(librbd, snap_protect_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_unprotect(rbd_image_t image, const char *snap_name)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_unprotect_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  int r = ictx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(), snap_name);
+  tracepoint(librbd, snap_unprotect_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_is_protected(rbd_image_t image, const char *snap_name,
+                                     int *is_protected)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_is_protected_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  bool protected_snap;
+  int r = librbd::api::Snapshot<>::is_protected(ictx, snap_name, &protected_snap);
+  if (r < 0) {
+    tracepoint(librbd, snap_is_protected_exit, r, *is_protected ? 1 : 0);
+    return r;
+  }
+  *is_protected = protected_snap ? 1 : 0;
+  tracepoint(librbd, snap_is_protected_exit, 0, *is_protected ? 1 : 0);
+  return 0;
+}
+
+extern "C" int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_get_limit_enter, ictx, ictx->name.c_str());
+  int r = librbd::api::Snapshot<>::get_limit(ictx, limit);
+  tracepoint(librbd, snap_get_limit_exit, r, *limit);
+  return r;
+}
+
+extern "C" int rbd_snap_exists(rbd_image_t image, const char *snapname, bool *exists)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_exists_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, snapname);
+  int r = librbd::api::Snapshot<>::exists(ictx, cls::rbd::UserSnapshotNamespace(), snapname, exists);
+  tracepoint(librbd, snap_exists_exit, r, *exists);
+  return r;
+}
+
+extern "C" int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_get_timestamp_enter, ictx, ictx->name.c_str());
+  int r = librbd::api::Snapshot<>::get_timestamp(ictx, snap_id, timestamp);
+  tracepoint(librbd, snap_get_timestamp_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_set_limit(rbd_image_t image, uint64_t limit)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_set_limit_enter, ictx, ictx->name.c_str(), limit);
+  int r = librbd::api::Snapshot<>::set_limit(ictx, limit);
+  tracepoint(librbd, snap_set_limit_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_set(rbd_image_t image, const char *snap_name)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_set_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, snap_name);
+  int r = librbd::api::Image<>::snap_set(
+    ictx, cls::rbd::UserSnapshotNamespace(), snap_name);
+  tracepoint(librbd, snap_set_exit, r);
+  return r;
+}
+
+extern "C" int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Image<>::snap_set(ictx, snap_id);
+}
+
+extern "C" int rbd_snap_get_name(rbd_image_t image, uint64_t snap_id, char *snapname, size_t *name_len)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  std::string snap_name;
+  int r = librbd::api::Snapshot<>::get_name(ictx, snap_id, &snap_name);
+  size_t expected_size = snap_name.size();
+  if (*name_len <= expected_size) {
+    *name_len = expected_size + 1;
+    return -ERANGE;
+  }
+  strncpy(snapname, snap_name.c_str(), expected_size);
+  snapname[expected_size] = '\0';
+  *name_len = expected_size + 1;
+  return r;
+}
+
+extern "C" int rbd_snap_get_id(rbd_image_t image, const char *snapname, uint64_t *snap_id)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Snapshot<>::get_id(ictx, snapname, snap_id);
+}
+
+extern "C" ssize_t rbd_list_children(rbd_image_t image, char *pools,
+                                     size_t *pools_len, char *images,
+                                     size_t *images_len)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+
+  std::vector<librbd::linked_image_spec_t> cpp_images;
+  int r = librbd::api::Image<>::list_children(ictx, &cpp_images);
+  if (r < 0) {
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  std::set<std::pair<std::string, std::string>> image_set;
+  for (auto& image : cpp_images) {
+    if (!image.trash) {
+      image_set.insert({image.pool_name, image.image_name});
+    }
+  }
+
+  size_t pools_total = 0;
+  size_t images_total = 0;
+  for (auto it : image_set) {
+    pools_total += it.first.length() + 1;
+    images_total += it.second.length() + 1;
+  }
+
+  bool too_short = false;
+  if (pools_total > *pools_len)
+    too_short = true;
+  if (images_total > *images_len)
+    too_short = true;
+  *pools_len = pools_total;
+  *images_len = images_total;
+  if (too_short) {
+    tracepoint(librbd, list_children_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  char *pools_p = pools;
+  char *images_p = images;
+  for (auto it : image_set) {
+    const char* pool = it.first.c_str();
+    strcpy(pools_p, pool);
+    pools_p += it.first.length() + 1;
+    const char* image = it.second.c_str();
+    strcpy(images_p, image);
+    images_p += it.second.length() + 1;
+    tracepoint(librbd, list_children_entry, pool, image);
+  }
+
+  ssize_t ret = image_set.size();
+  tracepoint(librbd, list_children_exit, ret);
+  return ret;
+}
+
+extern "C" int rbd_list_children2(rbd_image_t image,
+                                  rbd_child_info_t *children,
+                                  int *max_children)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(children, 0, sizeof(*children) * *max_children);
+
+  if (!max_children) {
+    tracepoint(librbd, list_children_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  std::vector<librbd::linked_image_spec_t> cpp_children;
+  int r = librbd::api::Image<>::list_children(ictx, &cpp_children);
+  if (r < 0) {
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  if (*max_children < (int)cpp_children.size() + 1) {
+    *max_children = (int)cpp_children.size() + 1;
+    tracepoint(librbd, list_children_exit, *max_children);
+    return -ERANGE;
+  }
+
+  int i;
+  for (i = 0; i < (int)cpp_children.size(); i++) {
+    children[i].pool_name = strdup(cpp_children[i].pool_name.c_str());
+    children[i].image_name = strdup(cpp_children[i].image_name.c_str());
+    children[i].image_id = strdup(cpp_children[i].image_id.c_str());
+    children[i].trash = cpp_children[i].trash;
+    tracepoint(librbd, list_children_entry, children[i].pool_name,
+               children[i].image_name);
+  }
+  children[i].pool_name = NULL;
+  children[i].image_name = NULL;
+  children[i].image_id = NULL;
+
+  r = (int)cpp_children.size();
+  tracepoint(librbd, list_children_exit, *max_children);
+  return r;
+}
+
+extern "C" void rbd_list_child_cleanup(rbd_child_info_t *child)
+{
+  free((void *)child->pool_name);
+  free((void *)child->image_name);
+  free((void *)child->image_id);
+}
+
+extern "C" void rbd_list_children_cleanup(rbd_child_info_t *children,
+                                          size_t num_children)
+{
+  for (size_t i=0; i < num_children; i++) {
+    free((void *)children[i].pool_name);
+    free((void *)children[i].image_name);
+    free((void *)children[i].image_id);
+  }
+}
+
+extern "C" int rbd_list_children3(rbd_image_t image,
+                                  rbd_linked_image_spec_t *images,
+                                  size_t *max_images)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  tracepoint(librbd, list_children_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(images, 0, sizeof(*images) * *max_images);
+
+  std::vector<librbd::linked_image_spec_t> cpp_children;
+  int r = librbd::api::Image<>::list_children(ictx, &cpp_children);
+  if (r < 0) {
+    tracepoint(librbd, list_children_exit, r);
+    return r;
+  }
+
+  if (*max_images < cpp_children.size()) {
+    *max_images = cpp_children.size();
+    return -ERANGE;
+  }
+
+  *max_images = cpp_children.size();
+  for (size_t idx = 0; idx < cpp_children.size(); ++idx) {
+    images[idx] = {
+      .pool_id = cpp_children[idx].pool_id,
+      .pool_name = strdup(cpp_children[idx].pool_name.c_str()),
+      .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()),
+      .image_id = strdup(cpp_children[idx].image_id.c_str()),
+      .image_name = strdup(cpp_children[idx].image_name.c_str()),
+      .trash = cpp_children[idx].trash};
+    tracepoint(librbd, list_children_entry, images[idx].pool_name,
+               images[idx].image_name);
+  }
+  return 0;
+}
+
+extern "C" int rbd_list_descendants(rbd_image_t image,
+                                    rbd_linked_image_spec_t *images,
+                                    size_t *max_images)
+{
+  auto ictx = reinterpret_cast<librbd::ImageCtx*>(image);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(images, 0, sizeof(*images) * *max_images);
+
+  std::vector<librbd::linked_image_spec_t> cpp_children;
+  int r = librbd::api::Image<>::list_descendants(ictx, {}, &cpp_children);
+  if (r < 0) {
+    return r;
+  }
+
+  if (*max_images < cpp_children.size()) {
+    *max_images = cpp_children.size();
+    return -ERANGE;
+  }
+
+  *max_images = cpp_children.size();
+  for (size_t idx = 0; idx < cpp_children.size(); ++idx) {
+    images[idx] = {
+      .pool_id = cpp_children[idx].pool_id,
+      .pool_name = strdup(cpp_children[idx].pool_name.c_str()),
+      .pool_namespace = strdup(cpp_children[idx].pool_namespace.c_str()),
+      .image_id = strdup(cpp_children[idx].image_id.c_str()),
+      .image_name = strdup(cpp_children[idx].image_name.c_str()),
+      .trash = cpp_children[idx].trash};
+  }
+  return 0;
+}
+
+extern "C" ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive,
+                                    char *tag, size_t *tag_len,
+                                    char *clients, size_t *clients_len,
+                                    char *cookies, size_t *cookies_len,
+                                    char *addrs, size_t *addrs_len)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, list_lockers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+  std::list<librbd::locker_t> lockers;
+  bool exclusive_bool;
+  string tag_str;
+
+  int r = list_lockers(ictx, &lockers, &exclusive_bool, &tag_str);
+  if (r < 0) {
+    tracepoint(librbd, list_lockers_exit, r);
+    return r;
+  }
+
+  ldout(ictx->cct, 20) << "list_lockers r = " << r << " lockers.size() = " << lockers.size() << dendl;
+
+  *exclusive = (int)exclusive_bool;
+  size_t clients_total = 0;
+  size_t cookies_total = 0;
+  size_t addrs_total = 0;
+  for (list<librbd::locker_t>::const_iterator it = lockers.begin();
+       it != lockers.end(); ++it) {
+    clients_total += it->client.length() + 1;
+    cookies_total += it->cookie.length() + 1;
+    addrs_total += it->address.length() + 1;
+  }
+
+  bool too_short = ((clients_total > *clients_len) ||
+                    (cookies_total > *cookies_len) ||
+                    (addrs_total > *addrs_len) ||
+                    (tag_str.length() + 1 > *tag_len));
+  *clients_len = clients_total;
+  *cookies_len = cookies_total;
+  *addrs_len = addrs_total;
+  *tag_len = tag_str.length() + 1;
+  if (too_short) {
+    tracepoint(librbd, list_lockers_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  strcpy(tag, tag_str.c_str());
+  char *clients_p = clients;
+  char *cookies_p = cookies;
+  char *addrs_p = addrs;
+  for (list<librbd::locker_t>::const_iterator it = lockers.begin();
+       it != lockers.end(); ++it) {
+    const char* client = it->client.c_str();
+    strcpy(clients_p, client);
+    clients_p += it->client.length() + 1;
+    const char* cookie = it->cookie.c_str();
+    strcpy(cookies_p, cookie);
+    cookies_p += it->cookie.length() + 1;
+    const char* address = it->address.c_str();
+    strcpy(addrs_p, address);
+    addrs_p += it->address.length() + 1;
+    tracepoint(librbd, list_lockers_entry, client, cookie, address);
+  }
+
+  ssize_t ret = lockers.size();
+  tracepoint(librbd, list_lockers_exit, ret);
+  return ret;
+}
+
+extern "C" int rbd_lock_exclusive(rbd_image_t image, const char *cookie)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, lock_exclusive_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie);
+  int r = librbd::lock(ictx, true, cookie ? cookie : "", "");
+  tracepoint(librbd, lock_exclusive_exit, r);
+  return r;
+}
+
+extern "C" int rbd_lock_shared(rbd_image_t image, const char *cookie,
+                               const char *tag)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, lock_shared_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie, tag);
+  int r = librbd::lock(ictx, false, cookie ? cookie : "", tag ? tag : "");
+  tracepoint(librbd, lock_shared_exit, r);
+  return r;
+}
+
+extern "C" int rbd_unlock(rbd_image_t image, const char *cookie)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, unlock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, cookie);
+  int r = librbd::unlock(ictx, cookie ? cookie : "");
+  tracepoint(librbd, unlock_exit, r);
+  return r;
+}
+
+extern "C" int rbd_break_lock(rbd_image_t image, const char *client,
+                              const char *cookie)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, break_lock_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, client, cookie);
+  int r = librbd::break_lock(ictx, client, cookie ? cookie : "");
+  tracepoint(librbd, break_lock_exit, r);
+  return r;
+}
+
+/* I/O */
+extern "C" ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
+                            char *buf)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+  int r = librbd::api::Io<>::read(
+    *ictx, ofs, len, librbd::io::ReadResult{buf, len}, 0);
+  tracepoint(librbd, read_exit, r);
+  return r;
+}
+
+extern "C" ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
+                             char *buf, int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, ofs, len, op_flags);
+  int r = librbd::api::Io<>::read(
+    *ictx, ofs, len, librbd::io::ReadResult{buf, len}, op_flags);
+  tracepoint(librbd, read_exit, r);
+  return r;
+}
+
+
+extern "C" int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len,
+                                    int (*cb)(uint64_t, size_t, const char *, void *),
+                                    void *arg)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, read_iterate_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+  int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+  tracepoint(librbd, read_iterate_exit, r);
+  return r;
+}
+
+extern "C" int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len,
+                                 int (*cb)(uint64_t, size_t, const char *, void *),
+                                 void *arg)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, read_iterate2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+  int64_t r = librbd::read_iterate(ictx, ofs, len, cb, arg);
+  if (r > 0)
+    r = 0;
+  tracepoint(librbd, read_iterate2_exit, r);
+  return (int)r;
+}
+
+extern "C" int rbd_diff_iterate(rbd_image_t image,
+                                const char *fromsnapname,
+                                uint64_t ofs, uint64_t len,
+                                int (*cb)(uint64_t, size_t, int, void *),
+                                void *arg)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+             true, false);
+  int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+                                                   cls::rbd::UserSnapshotNamespace(),
+                                                   fromsnapname, ofs, len,
+                                                   true, false, cb, arg);
+  tracepoint(librbd, diff_iterate_exit, r);
+  return r;
+}
+
+extern "C" int rbd_diff_iterate2(rbd_image_t image, const char *fromsnapname,
+                                 uint64_t ofs, uint64_t len,
+                                 uint8_t include_parent, uint8_t whole_object,
+                                 int (*cb)(uint64_t, size_t, int, void *),
+                                 void *arg)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, diff_iterate_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, fromsnapname, ofs, len,
+             include_parent != 0, whole_object != 0);
+  int r = librbd::api::DiffIterate<>::diff_iterate(ictx,
+                                                   cls::rbd::UserSnapshotNamespace(),
+                                                   fromsnapname, ofs, len,
+                                                   include_parent, whole_object,
+                                                   cb, arg);
+  tracepoint(librbd, diff_iterate_exit, r);
+  return r;
+}
+
+extern "C" ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
+                             const char *buf)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf);
+
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, len, nullptr));
+  int r = librbd::api::Io<>::write(*ictx, ofs, len, std::move(bl), 0);
+  tracepoint(librbd, write_exit, r);
+  return r;
+}
+
+extern "C" ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
+                              const char *buf, int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf, op_flags);
+
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, len, nullptr));
+  int r = librbd::api::Io<>::write(*ictx, ofs, len, std::move(bl), op_flags);
+  tracepoint(librbd, write_exit, r);
+  return r;
+}
+
+
+extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, ofs, len);
+  if (len > static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
+    tracepoint(librbd, discard_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  int r = librbd::api::Io<>::discard(
+    *ictx, ofs, len, ictx->discard_granularity_bytes);
+  tracepoint(librbd, discard_exit, r);
+  return r;
+}
+
+extern "C" ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len,
+                                 const char *buf, size_t data_len, int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, ofs, len, data_len == 0 ? NULL : buf, data_len, op_flags);
+
+  if (data_len == 0 || len % data_len ||
+      len > static_cast<uint64_t>(std::numeric_limits<int32_t>::max())) {
+    tracepoint(librbd, writesame_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+  if (discard_zero && mem_is_zero(buf, data_len)) {
+    int r = librbd::api::Io<>::write_zeroes(*ictx, ofs, len, 0, op_flags);
+    tracepoint(librbd, writesame_exit, r);
+    return r;
+  }
+
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, data_len, nullptr));
+  int r = librbd::api::Io<>::write_same(
+    *ictx, ofs, len, std::move(bl), op_flags);
+  tracepoint(librbd, writesame_exit, r);
+  return r;
+}
+
+extern "C" ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, size_t len,
+                                    int zero_flags, int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Io<>::write_zeroes(*ictx, ofs, len, zero_flags, op_flags);
+}
+
+extern "C" ssize_t rbd_compare_and_write(rbd_image_t image,
+                                         uint64_t ofs, size_t len,
+                                         const char *cmp_buf,
+                                         const char *buf,
+                                         uint64_t *mismatch_off,
+                                         int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, compare_and_write_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, ofs,
+             len, cmp_buf, buf, op_flags);
+
+  bufferlist cmp_bl;
+  cmp_bl.push_back(create_write_raw(ictx, cmp_buf, len, nullptr));
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, len, nullptr));
+
+  int r = librbd::api::Io<>::compare_and_write(
+    *ictx, ofs, len, std::move(cmp_bl), std::move(bl), mismatch_off, op_flags);
+  tracepoint(librbd, compare_and_write_exit, r);
+  return r;
+}
+
+extern "C" int rbd_aio_create_completion(void *cb_arg,
+                                         rbd_callback_t complete_cb,
+                                         rbd_completion_t *c)
+{
+  librbd::RBD::AioCompletion *rbd_comp =
+    new librbd::RBD::AioCompletion(cb_arg, complete_cb);
+  *c = (rbd_completion_t) rbd_comp;
+  return 0;
+}
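+
+// Illustrative lifecycle sketch (editorial addition, not upstream code):
+// every completion from rbd_aio_create_completion() must eventually be
+// released with rbd_aio_release() after the operation finishes:
+//
+//   rbd_completion_t c;
+//   rbd_aio_create_completion(NULL, NULL, &c);
+//   rbd_aio_write(image, off, len, buf, c);
+//   rbd_aio_wait_for_complete(c);
+//   ssize_t r = rbd_aio_get_return_value(c);
+//   rbd_aio_release(c);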
+
+extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
+                             const char *buf, rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc);
+
+  auto aio_completion = get_aio_completion(comp);
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, len, aio_completion));
+  librbd::api::Io<>::aio_write(
+    *ictx, aio_completion, off, len, std::move(bl), 0, true);
+  tracepoint(librbd, aio_write_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
+                              const char *buf, rbd_completion_t c, int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, off, len, buf, comp->pc, op_flags);
+
+  auto aio_completion = get_aio_completion(comp);
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, len, aio_completion));
+  librbd::api::Io<>::aio_write(
+    *ictx, aio_completion, off, len, std::move(bl), op_flags, true);
+  tracepoint(librbd, aio_write_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov,
+                              int iovcnt, uint64_t off, rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  size_t len;
+  int r = get_iovec_length(iov, iovcnt, len);
+
+  tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, off, len, NULL,
+             comp->pc);
+
+  if (r == 0) {
+    auto aio_completion = get_aio_completion(comp);
+    auto bl = iovec_to_bufferlist(ictx, iov, iovcnt, aio_completion);
+    librbd::api::Io<>::aio_write(
+      *ictx, aio_completion, off, len, std::move(bl), 0, true);
+  }
+  tracepoint(librbd, aio_write_exit, r);
+  return r;
+}
+
+extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
+                            char *buf, rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc);
+  librbd::api::Io<>::aio_read(
+    *ictx, get_aio_completion(comp), off, len, librbd::io::ReadResult{buf, len},
+    0, true);
+  tracepoint(librbd, aio_read_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
+                             char *buf, rbd_completion_t c, int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, off, len, buf, comp->pc, op_flags);
+  librbd::api::Io<>::aio_read(
+    *ictx, get_aio_completion(comp), off, len, librbd::io::ReadResult{buf, len},
+    op_flags, true);
+  tracepoint(librbd, aio_read_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_readv(rbd_image_t image, const struct iovec *iov,
+                             int iovcnt, uint64_t off, rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  size_t len;
+  int r = get_iovec_length(iov, iovcnt, len);
+
+  tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, off, len, NULL,
+             comp->pc);
+  if (r == 0) {
+    librbd::io::ReadResult read_result;
+    if (iovcnt == 1) {
+      read_result = librbd::io::ReadResult(
+        static_cast<char *>(iov[0].iov_base), iov[0].iov_len);
+    } else {
+      read_result = librbd::io::ReadResult(iov, iovcnt);
+    }
+    librbd::api::Io<>::aio_read(
+      *ictx, get_aio_completion(comp), off, len, std::move(read_result), 0,
+      true);
+  }
+  tracepoint(librbd, aio_read_exit, r);
+  return r;
+}
+
+extern "C" int rbd_flush(rbd_image_t image)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+  int r = librbd::api::Io<>::flush(*ictx);
+  tracepoint(librbd, flush_exit, r);
+  return r;
+}
+
+extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc);
+  librbd::api::Io<>::aio_flush(*ictx, get_aio_completion(comp), true);
+  tracepoint(librbd, aio_flush_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
+                               rbd_completion_t c)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, comp->pc);
+  librbd::api::Io<>::aio_discard(
+    *ictx, get_aio_completion(comp), off, len,
+    ictx->discard_granularity_bytes, true);
+  tracepoint(librbd, aio_discard_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len,
+                                 const char *buf, size_t data_len, rbd_completion_t c,
+                                 int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_writesame_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, off, len, data_len == 0 ? NULL : buf, data_len, comp->pc,
+             op_flags);
+
+  if (data_len == 0 || len % data_len) {
+    tracepoint(librbd, aio_writesame_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  bool discard_zero = ictx->config.get_val<bool>("rbd_discard_on_zeroed_write_same");
+  if (discard_zero && mem_is_zero(buf, data_len)) {
+    librbd::api::Io<>::aio_write_zeroes(
+      *ictx, get_aio_completion(comp), off, len, 0, op_flags, true);
+    tracepoint(librbd, aio_writesame_exit, 0);
+    return 0;
+  }
+
+  auto aio_completion = get_aio_completion(comp);
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, data_len, aio_completion));
+  librbd::api::Io<>::aio_write_same(
+    *ictx, aio_completion, off, len, std::move(bl), op_flags, true);
+  tracepoint(librbd, aio_writesame_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, size_t len,
+                                    rbd_completion_t c, int zero_flags,
+                                    int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  librbd::api::Io<>::aio_write_zeroes(*ictx, get_aio_completion(comp), off, len,
+                                      zero_flags, op_flags, true);
+  return 0;
+}
+
+extern "C" ssize_t rbd_aio_compare_and_write(rbd_image_t image, uint64_t off,
+                                             size_t len, const char *cmp_buf,
+                                             const char *buf, rbd_completion_t c,
+                                             uint64_t *mismatch_off,
+                                             int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
+             ictx->read_only, off, len, cmp_buf, buf, comp->pc, op_flags);
+
+  auto aio_completion = get_aio_completion(comp);
+  bufferlist cmp_bl;
+  cmp_bl.push_back(create_write_raw(ictx, cmp_buf, len, aio_completion));
+  bufferlist bl;
+  bl.push_back(create_write_raw(ictx, buf, len, aio_completion));
+  librbd::api::Io<>::aio_compare_and_write(
+    *ictx, aio_completion, off, len, std::move(cmp_bl), std::move(bl),
+    mismatch_off, op_flags, false);
+
+  tracepoint(librbd, aio_compare_and_write_exit, 0);
+  return 0;
+}
+
+extern "C" ssize_t rbd_aio_compare_and_writev(rbd_image_t image,
+                                              uint64_t off,
+                                              const struct iovec *cmp_iov,
+                                              int cmp_iovcnt,
+                                              const struct iovec *iov,
+                                              int iovcnt,
+                                              rbd_completion_t c,
+                                              uint64_t *mismatch_off,
+                                              int op_flags)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  size_t cmp_len;
+  int r = get_iovec_length(cmp_iov, cmp_iovcnt, cmp_len);
+
+  tracepoint(librbd, aio_compare_and_write_enter, ictx, ictx->name.c_str(),
+             ictx->snap_name.c_str(), ictx->read_only, off, cmp_len, NULL, NULL,
+             comp->pc, op_flags);
+  if (r != 0) {
+    tracepoint(librbd, aio_compare_and_write_exit, r);
+    return r;
+  }
+
+  size_t write_len;
+  r = get_iovec_length(iov, iovcnt, write_len);
+  if (r != 0) {
+    tracepoint(librbd, aio_compare_and_write_exit, r);
+    return r;
+  }
+  if (cmp_len != write_len) {
+    tracepoint(librbd, aio_compare_and_write_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  auto aio_completion = get_aio_completion(comp);
+  auto cmp_bl = iovec_to_bufferlist(ictx, cmp_iov, cmp_iovcnt, aio_completion);
+  auto bl = iovec_to_bufferlist(ictx, iov, iovcnt, aio_completion);
+  librbd::api::Io<>::aio_compare_and_write(*ictx, aio_completion, off, cmp_len,
+                                           std::move(cmp_bl), std::move(bl),
+                                           mismatch_off, op_flags, false);
+
+  tracepoint(librbd, aio_compare_and_write_exit, 0);
+  return 0;
+}
+
+extern "C" int rbd_invalidate_cache(rbd_image_t image)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, invalidate_cache_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+  int r = librbd::invalidate_cache(ictx);
+  tracepoint(librbd, invalidate_cache_exit, r);
+  return r;
+}
+
+extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::io::AioCompletion *cs[numcomp];
+  tracepoint(librbd, poll_io_events_enter, ictx, numcomp);
+  int r = librbd::poll_io_events(ictx, cs, numcomp);
+  tracepoint(librbd, poll_io_events_exit, r);
+  if (r > 0) {
+    for (int i = 0; i < r; ++i)
+      comps[i] = cs[i]->rbd_comp;
+  }
+  return r;
+}
+
+extern "C" int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *vallen)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  string val_s;
+  tracepoint(librbd, metadata_get_enter, ictx, key);
+  int r = librbd::metadata_get(ictx, key, &val_s);
+  if (r < 0) {
+    tracepoint(librbd, metadata_get_exit, r, key, NULL);
+    return r;
+  }
+  if (*vallen < val_s.size() + 1) {
+    r = -ERANGE;
+    *vallen = val_s.size() + 1;
+    tracepoint(librbd, metadata_get_exit, r, key, NULL);
+  } else {
+    strncpy(value, val_s.c_str(), val_s.size() + 1);
+    tracepoint(librbd, metadata_get_exit, r, key, value);
+  }
+  return r;
+}
+
+extern "C" int rbd_metadata_set(rbd_image_t image, const char *key, const char *value)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, metadata_set_enter, ictx, key, value);
+  int r = ictx->operations->metadata_set(key, value);
+  tracepoint(librbd, metadata_set_exit, r);
+  return r;
+}
+
+extern "C" int rbd_metadata_remove(rbd_image_t image, const char *key)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, metadata_remove_enter, ictx, key);
+  int r = ictx->operations->metadata_remove(key);
+  tracepoint(librbd, metadata_remove_exit, r);
+  return r;
+}
+
+extern "C" int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max,
+                                 char *key, size_t *key_len, char *value, size_t *val_len)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, metadata_list_enter, ictx);
+  map<string, bufferlist> pairs;
+  int r = librbd::metadata_list(ictx, start, max, &pairs);
+  size_t key_total_len = 0, val_total_len = 0;
+  bool too_short = false;
+  for (map<string, bufferlist>::iterator it = pairs.begin();
+       it != pairs.end(); ++it) {
+    key_total_len += it->first.size() + 1;
+    val_total_len += it->second.length() + 1;
+  }
+  if (*key_len < key_total_len || *val_len < val_total_len)
+    too_short = true;
+  *key_len = key_total_len;
+  *val_len = val_total_len;
+  if (too_short) {
+    tracepoint(librbd, metadata_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  char *key_p = key, *value_p = value;
+
+  for (map<string, bufferlist>::iterator it = pairs.begin();
+       it != pairs.end(); ++it) {
+    strncpy(key_p, it->first.c_str(), it->first.size() + 1);
+    key_p += it->first.size() + 1;
+    strncpy(value_p, it->second.c_str(), it->second.length());
+    value_p += it->second.length();
+    *value_p = '\0';
+    value_p++;
+    tracepoint(librbd, metadata_list_entry, it->first.c_str(), it->second.c_str());
+  }
+  tracepoint(librbd, metadata_list_exit, r);
+  return r;
+}
+
+extern "C" int rbd_mirror_image_enable(rbd_image_t image)
+{
+  return rbd_mirror_image_enable2(image, RBD_MIRROR_IMAGE_MODE_JOURNAL);
+}
+
+extern "C" int rbd_mirror_image_enable2(rbd_image_t image,
+                                        rbd_mirror_image_mode_t mode)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Mirror<>::image_enable(ictx, mode, false);
+}
+
+extern "C" int rbd_mirror_image_disable(rbd_image_t image, bool force)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Mirror<>::image_disable(ictx, force);
+}
+
+extern "C" int rbd_mirror_image_promote(rbd_image_t image, bool force)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Mirror<>::image_promote(ictx, force);
+}
+
+extern "C" int rbd_mirror_image_demote(rbd_image_t image)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Mirror<>::image_demote(ictx);
+}
+
+extern "C" int rbd_mirror_image_resync(rbd_image_t image)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Mirror<>::image_resync(ictx);
+}
+
+extern "C" int rbd_mirror_image_create_snapshot(rbd_image_t image,
+                                                uint64_t *snap_id)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  auto flags = librbd::util::get_default_snap_create_flags(ictx);
+  return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+}
+
+extern "C" int rbd_mirror_image_create_snapshot2(rbd_image_t image,
+                                                 uint32_t flags,
+                                                 uint64_t *snap_id)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::api::Mirror<>::image_snapshot_create(ictx, flags, snap_id);
+}
+
+extern "C" int rbd_mirror_image_get_info(rbd_image_t image,
+                                         rbd_mirror_image_info_t *mirror_image_info,
+                                         size_t info_size)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+  if (sizeof(rbd_mirror_image_info_t) != info_size) {
+    return -ERANGE;
+  }
+
+  librbd::mirror_image_info_t cpp_mirror_image;
+  int r = librbd::api::Mirror<>::image_get_info(ictx, &cpp_mirror_image);
+  if (r < 0) {
+    return r;
+  }
+
+  mirror_image_info_cpp_to_c(cpp_mirror_image, mirror_image_info);
+  return 0;
+}
+
+extern "C" void rbd_mirror_image_get_info_cleanup(
+    rbd_mirror_image_info_t *mirror_image_info)
+{
+  free(mirror_image_info->global_id);
+}
+
+extern "C" int rbd_mirror_image_get_mode(rbd_image_t image,
+                                         rbd_mirror_image_mode_t *mode)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+  return librbd::api::Mirror<>::image_get_mode(ictx, mode);
+}
+
+extern "C" int rbd_mirror_image_get_global_status(
+    rbd_image_t image, rbd_mirror_image_global_status_t *status,
+    size_t status_size)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+  if (sizeof(rbd_mirror_image_global_status_t) != status_size) {
+    return -ERANGE;
+  }
+
+  librbd::mirror_image_global_status_t cpp_status;
+  int r = librbd::api::Mirror<>::image_get_global_status(ictx, &cpp_status);
+  if (r < 0) {
+    return r;
+  }
+
+  mirror_image_global_status_cpp_to_c(cpp_status, status);
+  return 0;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" int rbd_mirror_image_get_status(rbd_image_t image,
+                                           rbd_mirror_image_status_t *status,
+                                           size_t status_size)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+  if (sizeof(rbd_mirror_image_status_t) != status_size) {
+    return -ERANGE;
+  }
+
+  librbd::mirror_image_global_status_t cpp_status;
+  int r = librbd::api::Mirror<>::image_get_global_status(ictx, &cpp_status);
+  if (r < 0) {
+    return r;
+  }
+
+  mirror_image_global_status_cpp_to_c(cpp_status, status);
+  return 0;
+}
+
+#pragma GCC diagnostic pop
+
+extern "C" int rbd_mirror_image_get_instance_id(rbd_image_t image,
+                                                char *instance_id,
+                                                size_t *instance_id_max_length)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+
+  std::string cpp_instance_id;
+  int r = librbd::api::Mirror<>::image_get_instance_id(ictx, &cpp_instance_id);
+  if (r < 0) {
+    return r;
+  }
+
+  if (cpp_instance_id.size() >= *instance_id_max_length) {
+    *instance_id_max_length = cpp_instance_id.size() + 1;
+    return -ERANGE;
+  }
+
+  strcpy(instance_id, cpp_instance_id.c_str());
+  *instance_id_max_length = cpp_instance_id.size() + 1;
+  return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_promote(rbd_image_t image, bool force,
+                                            rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  librbd::api::Mirror<>::image_promote(
+    ictx, force, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                     get_aio_completion(comp)));
+  return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_demote(rbd_image_t image,
+                                           rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  librbd::api::Mirror<>::image_demote(
+    ictx, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                              get_aio_completion(comp)));
+  return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_get_info(rbd_image_t image,
+                                             rbd_mirror_image_info_t *info,
+                                             size_t info_size,
+                                             rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  if (sizeof(rbd_mirror_image_info_t) != info_size) {
+    return -ERANGE;
+  }
+
+  auto ctx = new C_MirrorImageGetInfo(
+    info, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                              get_aio_completion(comp)));
+  librbd::api::Mirror<>::image_get_info(
+    ictx, &ctx->cpp_mirror_image_info, ctx);
+  return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_get_mode(rbd_image_t image,
+                                             rbd_mirror_image_mode_t *mode,
+                                             rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  librbd::api::Mirror<>::image_get_mode(
+    ictx, mode, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                    get_aio_completion(comp)));
+  return 0;
+}
+
+extern "C" int rbd_aio_mirror_image_get_global_status(
+    rbd_image_t image, rbd_mirror_image_global_status_t *status,
+    size_t status_size, rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  if (sizeof(rbd_mirror_image_global_status_t) != status_size) {
+    return -ERANGE;
+  }
+
+  auto ctx = new C_MirrorImageGetGlobalStatus(
+    status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                get_aio_completion(comp)));
+  librbd::api::Mirror<>::image_get_global_status(
+    ictx, &ctx->cpp_mirror_image_global_status, ctx);
+  return 0;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" int rbd_aio_mirror_image_get_status(
+    rbd_image_t image, rbd_mirror_image_status_t *status, size_t status_size,
+    rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  if (sizeof(rbd_mirror_image_status_t) != status_size) {
+    return -ERANGE;
+  }
+
+  auto ctx = new C_MirrorImageGetStatus(
+    status, new C_AioCompletion(ictx, librbd::io::AIO_TYPE_GENERIC,
+                                get_aio_completion(comp)));
+  librbd::api::Mirror<>::image_get_global_status(
+    ictx, &ctx->cpp_mirror_image_global_status, ctx);
+  return 0;
+}
+
+#pragma GCC diagnostic pop
+
+extern "C" int rbd_aio_mirror_image_create_snapshot(rbd_image_t image,
+                                                    uint32_t flags,
+                                                    uint64_t *snap_id,
+                                                    rbd_completion_t c) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+
+  librbd::api::Mirror<>::image_snapshot_create(
+    ictx, flags, snap_id, new C_AioCompletion(ictx,
+                                              librbd::io::AIO_TYPE_GENERIC,
+                                              get_aio_completion(comp)));
+  return 0;
+}
+
+extern "C" int rbd_update_watch(rbd_image_t image, uint64_t *handle,
+                                rbd_update_callback_t watch_cb, void *arg)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  C_UpdateWatchCB *wctx = new C_UpdateWatchCB(watch_cb, arg);
+  tracepoint(librbd, update_watch_enter, ictx, wctx);
+  int r = ictx->state->register_update_watcher(wctx, &wctx->handle);
+  tracepoint(librbd, update_watch_exit, r, wctx->handle);
+  *handle = reinterpret_cast<uint64_t>(wctx);
+  return r;
+}
+
+extern "C" int rbd_update_unwatch(rbd_image_t image, uint64_t handle)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  C_UpdateWatchCB *wctx = reinterpret_cast<C_UpdateWatchCB *>(handle);
+  tracepoint(librbd, update_unwatch_enter, ictx, wctx->handle);
+  int r = ictx->state->unregister_update_watcher(wctx->handle);
+  delete wctx;
+  tracepoint(librbd, update_unwatch_exit, r);
+  return r;
+}
+
+extern "C" int rbd_aio_is_complete(rbd_completion_t c)
+{
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  return comp->is_complete();
+}
+
+extern "C" int rbd_aio_wait_for_complete(rbd_completion_t c)
+{
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  return comp->wait_for_complete();
+}
+
+extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c)
+{
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  return comp->get_return_value();
+}
+
+extern "C" void *rbd_aio_get_arg(rbd_completion_t c)
+{
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  return comp->get_arg();
+}
+
+extern "C" void rbd_aio_release(rbd_completion_t c)
+{
+  librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+  comp->release();
+}
+
+extern "C" int rbd_group_create(rados_ioctx_t p, const char *name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, group_create_enter, io_ctx.get_pool_name().c_str(),
+             io_ctx.get_id(), name);
+  int r = librbd::api::Group<>::create(io_ctx, name);
+  tracepoint(librbd, group_create_exit, r);
+  return r;
+}
+
+extern "C" int rbd_group_remove(rados_ioctx_t p, const char *name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, group_remove_enter, io_ctx.get_pool_name().c_str(),
+             io_ctx.get_id(), name);
+  int r = librbd::api::Group<>::remove(io_ctx, name);
+  tracepoint(librbd, group_remove_exit, r);
+  return r;
+}
+
+extern "C" int rbd_group_list(rados_ioctx_t p, char *names, size_t *size)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, group_list_enter, io_ctx.get_pool_name().c_str(),
+             io_ctx.get_id());
+
+  vector<string> cpp_names;
+  int r = librbd::api::Group<>::list(io_ctx, &cpp_names);
+
+  if (r < 0) {
+    tracepoint(librbd, group_list_exit, r);
+    return r;
+  }
+
+  size_t expected_size = 0;
+
+  for (size_t i = 0; i < cpp_names.size(); i++) {
+    expected_size += cpp_names[i].size() + 1;
+  }
+  if (*size < expected_size) {
+    *size = expected_size;
+    tracepoint(librbd, group_list_exit, -ERANGE);
+    return -ERANGE;
+  }
+
+  if (names == NULL) {
+    tracepoint(librbd, group_list_exit, -EINVAL);
+    return -EINVAL;
+  }
+
+  for (int i = 0; i < (int)cpp_names.size(); i++) {
+    const char* name = cpp_names[i].c_str();
+    tracepoint(librbd, group_list_entry, name);
+    strcpy(names, name);
+    names += strlen(names) + 1;
+  }
+  tracepoint(librbd, group_list_exit, (int)expected_size);
+  return (int)expected_size;
+}
+
+extern "C" int rbd_group_rename(rados_ioctx_t p, const char *src_name,
+                                const char *dest_name)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, group_rename_enter, io_ctx.get_pool_name().c_str(),
+             io_ctx.get_id(), src_name, dest_name);
+  int r = librbd::api::Group<>::rename(io_ctx, src_name, dest_name);
+  tracepoint(librbd, group_rename_exit, r);
+  return r;
+}
+
+extern "C" int rbd_group_image_add(rados_ioctx_t group_p,
+                                   const char *group_name,
+                                   rados_ioctx_t image_p,
+                                   const char *image_name)
+{
+  librados::IoCtx group_ioctx;
+  librados::IoCtx image_ioctx;
+
+  librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+  tracepoint(librbd, group_image_add_enter, group_ioctx.get_pool_name().c_str(),
+             group_ioctx.get_id(), group_name, image_ioctx.get_pool_name().c_str(),
+             image_ioctx.get_id(), image_name);
+
+  int r = librbd::api::Group<>::image_add(group_ioctx, group_name, image_ioctx,
+                                          image_name);
+
+  tracepoint(librbd, group_image_add_exit, r);
+  return r;
+}
+
+extern "C" int rbd_group_image_remove(rados_ioctx_t group_p,
+                                      const char *group_name,
+                                      rados_ioctx_t image_p,
+                                      const char *image_name)
+{
+  librados::IoCtx group_ioctx;
+  librados::IoCtx image_ioctx;
+
+  librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+  tracepoint(librbd, group_image_remove_enter, group_ioctx.get_pool_name().c_str(),
+             group_ioctx.get_id(), group_name, image_ioctx.get_pool_name().c_str(),
+             image_ioctx.get_id(), image_name);
+
+  int r = librbd::api::Group<>::image_remove(group_ioctx, group_name,
+                                             image_ioctx, image_name);
+
+  tracepoint(librbd, group_image_remove_exit, r);
+  return r;
+}
+
+extern "C" int rbd_group_image_remove_by_id(rados_ioctx_t group_p,
+                                            const char *group_name,
+                                            rados_ioctx_t image_p,
+                                            const char *image_id)
+{
+  librados::IoCtx group_ioctx;
+  librados::IoCtx image_ioctx;
+
+  librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
+  librados::IoCtx::from_rados_ioctx_t(image_p, image_ioctx);
+
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(group_ioctx));
+  tracepoint(librbd, group_image_remove_by_id_enter,
+             group_ioctx.get_pool_name().c_str(),
+             group_ioctx.get_id(), group_name,
+             image_ioctx.get_pool_name().c_str(),
+             image_ioctx.get_id(), image_id);
+
+  int r = librbd::api::Group<>::image_remove_by_id(group_ioctx, group_name,
+                                                   image_ioctx, image_id);
+
+  tracepoint(librbd, group_image_remove_by_id_exit, r);
+  return r;
+}
group_image_list_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(images, 0, sizeof(*images) * *image_size); + + if (group_image_info_size != sizeof(rbd_group_image_info_t)) { + *image_size = 0; + tracepoint(librbd, group_image_list_exit, -ERANGE); + return -ERANGE; + } + + std::vector cpp_images; + int r = librbd::api::Group<>::image_list(group_ioctx, group_name, + &cpp_images); + + if (r == -ENOENT) { + tracepoint(librbd, group_image_list_exit, 0); + *image_size = 0; + return 0; + } + + if (r < 0) { + tracepoint(librbd, group_image_list_exit, r); + return r; + } + + if (*image_size < cpp_images.size()) { + *image_size = cpp_images.size(); + tracepoint(librbd, group_image_list_exit, -ERANGE); + return -ERANGE; + } + + for (size_t i = 0; i < cpp_images.size(); ++i) { + group_image_status_cpp_to_c(cpp_images[i], &images[i]); + } + + r = *image_size = cpp_images.size(); + tracepoint(librbd, group_image_list_exit, r); + return r; +} + +extern "C" int rbd_group_info_cleanup(rbd_group_info_t *group_info, + size_t group_info_size) { + if (group_info_size != sizeof(rbd_group_info_t)) { + return -ERANGE; + } + + free(group_info->name); + return 0; +} + +extern "C" int rbd_group_image_list_cleanup(rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t len) { + if (group_image_info_size != sizeof(rbd_group_image_info_t)) { + return -ERANGE; + } + + for (size_t i = 0; i < len; ++i) { + free(images[i].name); + } + return 0; +} + +extern "C" int rbd_group_snap_create(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_create_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, + snap_name, 0); + tracepoint(librbd, group_snap_create_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_create2(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + uint32_t flags) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_create_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + int r = librbd::api::Group<>::snap_create(group_ioctx, group_name, snap_name, + flags); + tracepoint(librbd, group_snap_create_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_remove(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_remove_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + int r = librbd::api::Group<>::snap_remove(group_ioctx, group_name, snap_name); + + tracepoint(librbd, group_snap_remove_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_rename(rados_ioctx_t group_p, + const char *group_name, + const char *old_snap_name, + const char *new_snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + 
TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rename_enter, + group_ioctx.get_pool_name().c_str(), group_ioctx.get_id(), + group_name, old_snap_name, new_snap_name); + + int r = librbd::api::Group<>::snap_rename(group_ioctx, group_name, + old_snap_name, new_snap_name); + + tracepoint(librbd, group_snap_list_exit, r); + return r; +} + +extern "C" int rbd_group_snap_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t *snaps_size) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_list_enter, group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(snaps, 0, sizeof(*snaps) * *snaps_size); + + if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) { + *snaps_size = 0; + tracepoint(librbd, group_snap_list_exit, -ERANGE); + return -ERANGE; + } + + std::vector cpp_snaps; + int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps); + + if (r == -ENOENT) { + *snaps_size = 0; + tracepoint(librbd, group_snap_list_exit, 0); + return 0; + } + + if (r < 0) { + tracepoint(librbd, group_snap_list_exit, r); + return r; + } + + if (*snaps_size < cpp_snaps.size()) { + *snaps_size = cpp_snaps.size(); + tracepoint(librbd, group_snap_list_exit, -ERANGE); + return -ERANGE; + } + + for (size_t i = 0; i < cpp_snaps.size(); ++i) { + group_snap_info_cpp_to_c(cpp_snaps[i], &snaps[i]); + } + + r = *snaps_size = cpp_snaps.size(); + tracepoint(librbd, group_snap_list_exit, r); + return r; +} + +extern "C" int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t len) { + if (group_snap_info_size != sizeof(rbd_group_snap_info_t)) { + return -ERANGE; + } + + for (size_t i = 0; i < len; ++i) { + free(snaps[i].name); + } + return 0; +} + +extern "C" int rbd_group_snap_rollback(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + librbd::NoOpProgressContext prog_ctx; + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + + tracepoint(librbd, group_snap_rollback_exit, r); + + return r; +} + +extern "C" int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + librbd_progress_fn_t cb, + void *cbdata) +{ + librados::IoCtx group_ioctx; + librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx); + + TracepointProvider::initialize(get_cct(group_ioctx)); + tracepoint(librbd, group_snap_rollback_enter, + group_ioctx.get_pool_name().c_str(), + group_ioctx.get_id(), group_name, snap_name); + + librbd::CProgressContext prog_ctx(cb, cbdata); + int r = librbd::api::Group<>::snap_rollback(group_ioctx, group_name, + snap_name, prog_ctx); + + tracepoint(librbd, group_snap_rollback_exit, r); + + return r; +} + +extern "C" int rbd_snap_get_namespace_type(rbd_image_t image, + uint64_t snap_id, + rbd_snap_namespace_type_t *namespace_type) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, 
snap_get_namespace_type_enter, ictx, ictx->name.c_str()); + int r = librbd::api::Snapshot<>::get_namespace_type(ictx, snap_id, + namespace_type); + tracepoint(librbd, snap_get_namespace_type_exit, r); + return r; +} + +extern "C" int rbd_snap_get_group_namespace(rbd_image_t image, uint64_t snap_id, + rbd_snap_group_namespace_t *group_snap, + size_t snap_group_namespace_size) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + tracepoint(librbd, snap_get_group_namespace_enter, ictx, + ictx->name.c_str()); + + if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) { + tracepoint(librbd, snap_get_group_namespace_exit, -ERANGE); + return -ERANGE; + } + + librbd::snap_group_namespace_t group_namespace; + int r = librbd::api::Snapshot<>::get_group_namespace(ictx, snap_id, + &group_namespace); + if (r >= 0) { + group_snap->group_pool = group_namespace.group_pool; + group_snap->group_name = strdup(group_namespace.group_name.c_str()); + group_snap->group_snap_name = + strdup(group_namespace.group_snap_name.c_str()); + } + + tracepoint(librbd, snap_get_group_namespace_exit, r); + return r; +} + +extern "C" int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap, + size_t snap_group_namespace_size) { + if (snap_group_namespace_size != sizeof(rbd_snap_group_namespace_t)) { + return -ERANGE; + } + + free(group_snap->group_name); + free(group_snap->group_snap_name); + return 0; +} + +extern "C" int rbd_snap_get_trash_namespace(rbd_image_t image, uint64_t snap_id, + char *original_name, + size_t max_length) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + + std::string cpp_original_name; + int r = librbd::api::Snapshot<>::get_trash_namespace(ictx, snap_id, + &cpp_original_name); + if (r < 0) { + return r; + } + + if (cpp_original_name.length() >= max_length) { + return -ERANGE; + } + + strcpy(original_name, cpp_original_name.c_str()); + return 0; +} + +extern "C" int rbd_snap_get_mirror_namespace( + rbd_image_t image, uint64_t snap_id, + rbd_snap_mirror_namespace_t *mirror_snap, + size_t mirror_snap_size) { + librbd::ImageCtx *ictx = (librbd::ImageCtx *)image; + + if (mirror_snap_size != sizeof(rbd_snap_mirror_namespace_t)) { + return -ERANGE; + } + + librbd::snap_mirror_namespace_t mirror_namespace; + int r = librbd::api::Snapshot<>::get_mirror_namespace( + ictx, snap_id, &mirror_namespace); + if (r < 0) { + return r; + } + + mirror_snap->state = mirror_namespace.state; + mirror_snap->primary_mirror_uuid = + strdup(mirror_namespace.primary_mirror_uuid.c_str()); + mirror_snap->primary_snap_id = mirror_namespace.primary_snap_id; + mirror_snap->mirror_peer_uuids_count = + mirror_namespace.mirror_peer_uuids.size(); + size_t len = 0; + for (auto &peer : mirror_namespace.mirror_peer_uuids) { + len += peer.size() + 1; + } + mirror_snap->mirror_peer_uuids = (char *)malloc(len); + char *p = mirror_snap->mirror_peer_uuids; + for (auto &peer : mirror_namespace.mirror_peer_uuids) { + strncpy(p, peer.c_str(), peer.size() + 1); + p += peer.size() + 1; + } + mirror_snap->complete = mirror_namespace.complete; + mirror_snap->last_copied_object_number = + mirror_namespace.last_copied_object_number; + + return 0; +} + +extern "C" int rbd_snap_mirror_namespace_cleanup( + rbd_snap_mirror_namespace_t *mirror_snap, + size_t mirror_snap_size) { + if (mirror_snap_size != sizeof(rbd_snap_mirror_namespace_t)) { + return -ERANGE; + } + + free(mirror_snap->primary_mirror_uuid); + free(mirror_snap->mirror_peer_uuids); + return 0; +} + +extern "C" int rbd_watchers_list(rbd_image_t 
image,
+                                 rbd_image_watcher_t *watchers,
+                                 size_t *max_watchers) {
+  std::list<obj_watch_t> watcher_list;
+  librbd::ImageCtx *ictx = (librbd::ImageCtx*)image;
+
+  tracepoint(librbd, list_watchers_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+  // FIPS zeroization audit 20191117: this memset is not security related.
+  memset(watchers, 0, sizeof(*watchers) * *max_watchers);
+  int r = librbd::list_watchers(ictx, watcher_list);
+  if (r < 0) {
+    tracepoint(librbd, list_watchers_exit, r, 0);
+    return r;
+  }
+
+  if (watcher_list.size() > *max_watchers) {
+    *max_watchers = watcher_list.size();
+    tracepoint(librbd, list_watchers_exit, -ERANGE, watcher_list.size());
+    return -ERANGE;
+  }
+
+  *max_watchers = 0;
+  for (auto &watcher : watcher_list) {
+    tracepoint(librbd, list_watchers_entry, watcher.addr.c_str(), watcher.id, watcher.cookie);
+    watchers[*max_watchers].addr = strdup(watcher.addr.c_str());
+    watchers[*max_watchers].id = watcher.id;
+    watchers[*max_watchers].cookie = watcher.cookie;
+    *max_watchers += 1;
+  }
+
+  tracepoint(librbd, list_watchers_exit, r, watcher_list.size());
+  return 0;
+}
+
+extern "C" void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers,
+                                          size_t num_watchers) {
+  for (size_t i = 0; i < num_watchers; ++i) {
+    free(watchers[i].addr);
+  }
+}
+
+extern "C" int rbd_config_image_list(rbd_image_t image,
+                                     rbd_config_option_t *options,
+                                     int *max_options) {
+  librbd::ImageCtx *ictx = (librbd::ImageCtx*)image;
+
+  std::vector<librbd::config_option_t> option_vector;
+  int r = librbd::api::Config<>::list(ictx, &option_vector);
+  if (r < 0) {
+    return r;
+  }
+
+  if (*max_options < static_cast<int>(option_vector.size())) {
+    *max_options = static_cast<int>(option_vector.size());
+    return -ERANGE;
+  }
+
+  for (int i = 0; i < static_cast<int>(option_vector.size()); ++i) {
+    config_option_cpp_to_c(option_vector[i], &options[i]);
+  }
+  *max_options = static_cast<int>(option_vector.size());
+  return 0;
+}
+
+extern "C" void rbd_config_image_list_cleanup(rbd_config_option_t *options,
+                                              int max_options) {
+  for (int i = 0; i < max_options; ++i) {
+    config_option_cleanup(options[i]);
+  }
+}
+
+extern "C" int rbd_quiesce_watch(rbd_image_t image,
+                                 rbd_update_callback_t quiesce_cb,
+                                 rbd_update_callback_t unquiesce_cb,
+                                 void *arg, uint64_t *handle)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  auto wctx = new C_QuiesceWatchCB(quiesce_cb, unquiesce_cb, arg);
+  int r = ictx->state->register_quiesce_watcher(wctx, &wctx->handle);
+  if (r < 0) {
+    delete wctx;
+    return r;
+  }
+  *handle = reinterpret_cast<uint64_t>(wctx);
+  return 0;
+}
+
+extern "C" int rbd_quiesce_unwatch(rbd_image_t image, uint64_t handle)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  auto *wctx = reinterpret_cast<C_QuiesceWatchCB *>(handle);
+  int r = ictx->state->unregister_quiesce_watcher(wctx->handle);
+  delete wctx;
+  return r;
+}
+
+extern "C" void rbd_quiesce_complete(rbd_image_t image, uint64_t handle, int r)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  ictx->state->quiesce_complete(handle, r);
+}
diff --git a/src/librbd/managed_lock/AcquireRequest.cc b/src/librbd/managed_lock/AcquireRequest.cc
new file mode 100644
index 000000000..79be0f25a
--- /dev/null
+++ b/src/librbd/managed_lock/AcquireRequest.cc
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/AcquireRequest.h"
+#include "librbd/Watcher.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
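// A caller-side sketch of the quiesce watch C API defined in librbd.cc just
// above. The callbacks only receive the opaque 'arg' pointer, so the watch
// handle is carried in an application-defined context struct; 'QuiesceCtx'
// and the callback bodies are illustrative, not part of librbd:
//
//   struct QuiesceCtx { rbd_image_t image; uint64_t handle; };
//
//   void quiesce_cb(void *arg) {
//     auto *ctx = static_cast<QuiesceCtx *>(arg);
//     // pause application I/O, then acknowledge the quiesce request
//     rbd_quiesce_complete(ctx->image, ctx->handle, 0);
//   }
//   void unquiesce_cb(void *arg) { /* resume application I/O */ }
//
//   QuiesceCtx ctx = {image, 0};
//   int r = rbd_quiesce_watch(image, quiesce_cb, unquiesce_cb, &ctx,
//                             &ctx.handle);
//   // ... snapshots taken while registered are quiesced via the callbacks
//   rbd_quiesce_unwatch(image, ctx.handle);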
+#include "include/stringify.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/managed_lock/BreakRequest.h"
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "librbd/managed_lock/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::managed_lock::AcquireRequest: " << this \
+                           << " " << __func__ << ": "
+
+using std::string;
+
+namespace librbd {
+
+using librbd::util::detail::C_AsyncCallback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace managed_lock {
+
+template <typename T>
+AcquireRequest<T>* AcquireRequest<T>::create(librados::IoCtx& ioctx,
+                                             Watcher *watcher,
+                                             AsioEngine& asio_engine,
+                                             const string& oid,
+                                             const string& cookie,
+                                             bool exclusive,
+                                             bool blocklist_on_break_lock,
+                                             uint32_t blocklist_expire_seconds,
+                                             Context *on_finish) {
+  return new AcquireRequest(ioctx, watcher, asio_engine, oid, cookie,
+                            exclusive, blocklist_on_break_lock,
+                            blocklist_expire_seconds, on_finish);
+}
+
+template <typename T>
+AcquireRequest<T>::AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher,
+                                  AsioEngine& asio_engine,
+                                  const string& oid,
+                                  const string& cookie, bool exclusive,
+                                  bool blocklist_on_break_lock,
+                                  uint32_t blocklist_expire_seconds,
+                                  Context *on_finish)
+  : m_ioctx(ioctx), m_watcher(watcher),
+    m_cct(reinterpret_cast<CephContext *>(m_ioctx.cct())),
+    m_asio_engine(asio_engine), m_oid(oid), m_cookie(cookie),
+    m_exclusive(exclusive),
+    m_blocklist_on_break_lock(blocklist_on_break_lock),
+    m_blocklist_expire_seconds(blocklist_expire_seconds),
+    m_on_finish(new C_AsyncCallback<asio::ContextWQ>(
+      asio_engine.get_work_queue(), on_finish)) {
+}
+
+template <typename T>
+AcquireRequest<T>::~AcquireRequest() {
+}
+
+template <typename T>
+void AcquireRequest<T>::send() {
+  send_get_locker();
+}
+
+template <typename T>
+void AcquireRequest<T>::send_get_locker() {
+  ldout(m_cct, 10) << dendl;
+
+  Context *ctx = create_context_callback<
+    AcquireRequest, &AcquireRequest::handle_get_locker>(this);
+  auto req = GetLockerRequest<T>::create(m_ioctx, m_oid, m_exclusive,
+                                         &m_locker, ctx);
+  req->send();
+}
+
+template <typename T>
+void AcquireRequest<T>::handle_get_locker(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    ldout(m_cct, 20) << "no lockers detected" << dendl;
+    m_locker = {};
+  } else if (r == -EBUSY) {
+    ldout(m_cct, 5) << "incompatible lock detected" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  send_lock();
+}
+
+template <typename T>
+void AcquireRequest<T>::send_lock() {
+  ldout(m_cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", "
+                   << "cookie=" << m_cookie << dendl;
+
+  librados::ObjectWriteOperation op;
+  rados::cls::lock::lock(&op, RBD_LOCK_NAME,
+                         m_exclusive ? ClsLockType::EXCLUSIVE : ClsLockType::SHARED,
+                         m_cookie, util::get_watcher_lock_tag(), "", utime_t(), 0);
+
+  using klass = AcquireRequest;
+  librados::AioCompletion *rados_completion =
+    create_rados_callback<klass, &klass::handle_lock>(this);
+  int r = m_ioctx.aio_operate(m_oid, rados_completion, &op);
+  ceph_assert(r == 0);
+  rados_completion->release();
+}
+
+template <typename T>
+void AcquireRequest<T>::handle_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r == 0) {
+    finish(0);
+    return;
+  } else if (r == -EBUSY && m_locker.cookie.empty()) {
+    ldout(m_cct, 5) << "already locked, refreshing locker" << dendl;
+    send_get_locker();
+    return;
+  } else if (r != -EBUSY) {
+    lderr(m_cct) << "failed to lock: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  send_break_lock();
+}
+
+template <typename T>
+void AcquireRequest<T>::send_break_lock() {
+  ldout(m_cct, 10) << dendl;
+
+  Context *ctx = create_context_callback<
+    AcquireRequest, &AcquireRequest::handle_break_lock>(this);
+  auto req = BreakRequest<T>::create(
+    m_ioctx, m_asio_engine, m_oid, m_locker, m_exclusive,
+    m_blocklist_on_break_lock, m_blocklist_expire_seconds, false, ctx);
+  req->send();
+}
+
+template <typename T>
+void AcquireRequest<T>::handle_break_lock(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r == -EAGAIN) {
+    ldout(m_cct, 5) << "lock owner is still alive" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(m_cct) << "failed to break lock: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  m_locker = {};
+  send_lock();
+}
+
+template <typename T>
+void AcquireRequest<T>::finish(int r) {
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::AcquireRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/AcquireRequest.h b/src/librbd/managed_lock/AcquireRequest.h
new file mode 100644
index 000000000..19424a422
--- /dev/null
+++ b/src/librbd/managed_lock/AcquireRequest.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "msg/msg_types.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/watcher/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class AsioEngine;
+class Watcher;
+
+namespace managed_lock {
+
+template <typename T>
+class AcquireRequest {
+private:
+  typedef watcher::Traits<T> TypeTraits;
+  typedef typename TypeTraits::Watcher Watcher;
+
+public:
+  static AcquireRequest* create(librados::IoCtx& ioctx, Watcher *watcher,
+                                AsioEngine& asio_engine,
+                                const std::string& oid,
+                                const std::string& cookie,
+                                bool exclusive,
+                                bool blocklist_on_break_lock,
+                                uint32_t blocklist_expire_seconds,
+                                Context *on_finish);
+
+  ~AcquireRequest();
+  void send();
+
+private:
+
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_LOCKER
+   *    |     ^
+   *    |     . (EBUSY && no cached locker)
+   *    |     .
+   *    |     . (EBUSY && cached locker)
+   *    \--> LOCK_IMAGE * * * * * * * * > BREAK_LOCK . . . . .
+   *             |   ^                        |              .
+   *             |   |                        | (success)    .
+   *             |   \-------------------------/             .
+   *             v                                           .
+   *         <finish> < . . . . . . . . . . . . . . . . . . .
+ * + * @endverbatim + */ + + AcquireRequest(librados::IoCtx& ioctx, Watcher *watcher, + AsioEngine& asio_engine, const std::string& oid, + const std::string& cookie, bool exclusive, + bool blocklist_on_break_lock, + uint32_t blocklist_expire_seconds, Context *on_finish); + + librados::IoCtx& m_ioctx; + Watcher *m_watcher; + CephContext *m_cct; + AsioEngine& m_asio_engine; + std::string m_oid; + std::string m_cookie; + bool m_exclusive; + bool m_blocklist_on_break_lock; + uint32_t m_blocklist_expire_seconds; + Context *m_on_finish; + + bufferlist m_out_bl; + + Locker m_locker; + + void send_get_locker(); + void handle_get_locker(int r); + + void send_lock(); + void handle_lock(int r); + + void send_break_lock(); + void handle_break_lock(int r); + + void finish(int r); +}; + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_ACQUIRE_REQUEST_H diff --git a/src/librbd/managed_lock/BreakRequest.cc b/src/librbd/managed_lock/BreakRequest.cc new file mode 100644 index 000000000..69cd35301 --- /dev/null +++ b/src/librbd/managed_lock/BreakRequest.cc @@ -0,0 +1,249 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/BreakRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/neorados/RADOS.hpp" +#include "include/stringify.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/asio/Utils.h" +#include "librbd/managed_lock/GetLockerRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::BreakRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace managed_lock { + +using util::create_context_callback; +using util::create_rados_callback; + +template +BreakRequest::BreakRequest(librados::IoCtx& ioctx, + AsioEngine& asio_engine, + const std::string& oid, const Locker &locker, + bool exclusive, bool blocklist_locker, + uint32_t blocklist_expire_seconds, + bool force_break_lock, Context *on_finish) + : m_ioctx(ioctx), m_cct(reinterpret_cast(m_ioctx.cct())), + m_asio_engine(asio_engine), m_oid(oid), m_locker(locker), + m_exclusive(exclusive), m_blocklist_locker(blocklist_locker), + m_blocklist_expire_seconds(blocklist_expire_seconds), + m_force_break_lock(force_break_lock), m_on_finish(on_finish) { +} + +template +void BreakRequest::send() { + send_get_watchers(); +} + +template +void BreakRequest::send_get_watchers() { + ldout(m_cct, 10) << dendl; + + librados::ObjectReadOperation op; + op.list_watchers(&m_watchers, &m_watchers_ret_val); + + using klass = BreakRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void BreakRequest::handle_get_watchers(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == 0) { + r = m_watchers_ret_val; + } + if (r < 0) { + lderr(m_cct) << "failed to retrieve watchers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + bool found_alive_locker = false; + for (auto &watcher : m_watchers) { + ldout(m_cct, 20) << "watcher=[" + << "addr=" << watcher.addr << ", " + << "entity=client." 
<< watcher.watcher_id << "]" << dendl; + + if ((strncmp(m_locker.address.c_str(), + watcher.addr, sizeof(watcher.addr)) == 0) && + (m_locker.handle == watcher.cookie)) { + ldout(m_cct, 10) << "lock owner is still alive" << dendl; + found_alive_locker = true; + } + } + + if (!m_force_break_lock && found_alive_locker) { + finish(-EAGAIN); + return; + } + + send_get_locker(); +} + +template +void BreakRequest::send_get_locker() { + ldout(m_cct, 10) << dendl; + + using klass = BreakRequest; + Context *ctx = create_context_callback( + this); + auto req = GetLockerRequest::create(m_ioctx, m_oid, m_exclusive, + &m_refreshed_locker, ctx); + req->send(); +} + +template +void BreakRequest::handle_get_locker(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + ldout(m_cct, 5) << "no lock owner" << dendl; + finish(0); + return; + } else if (r < 0 && r != -EBUSY) { + lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl; + finish(r); + return; + } else if (r < 0) { + m_refreshed_locker = {}; + } + + if (m_refreshed_locker != m_locker || m_refreshed_locker == Locker{}) { + ldout(m_cct, 5) << "no longer lock owner" << dendl; + finish(-EAGAIN); + return; + } + + send_blocklist(); +} + +template +void BreakRequest::send_blocklist() { + if (!m_blocklist_locker) { + send_break_lock(); + return; + } + + entity_name_t entity_name = entity_name_t::CLIENT(m_ioctx.get_instance_id()); + ldout(m_cct, 10) << "local entity=" << entity_name << ", " + << "locker entity=" << m_locker.entity << dendl; + + if (m_locker.entity == entity_name) { + lderr(m_cct) << "attempting to self-blocklist" << dendl; + finish(-EINVAL); + return; + } + + entity_addr_t locker_addr; + if (!locker_addr.parse(m_locker.address)) { + lderr(m_cct) << "unable to parse locker address: " << m_locker.address + << dendl; + finish(-EINVAL); + return; + } + + std::optional expire; + if (m_blocklist_expire_seconds != 0) { + expire = std::chrono::seconds(m_blocklist_expire_seconds); + } + m_asio_engine.get_rados_api().blocklist_add( + m_locker.address, expire, + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_blocklist(r); })); +} + +template +void BreakRequest::handle_blocklist(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to blocklist lock owner: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + wait_for_osd_map(); +} + +template +void BreakRequest::wait_for_osd_map() { + ldout(m_cct, 10) << dendl; + + m_asio_engine.get_rados_api().wait_for_latest_osd_map( + librbd::asio::util::get_callback_adapter( + [this](int r) { handle_wait_for_osd_map(r); })); +} + +template +void BreakRequest::handle_wait_for_osd_map(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to wait for updated OSD map: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + send_break_lock(); +} + +template +void BreakRequest::send_break_lock() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, m_locker.cookie, + m_locker.entity); + + using klass = BreakRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void BreakRequest::handle_break_lock(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to break lock: " << 
cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+template <typename T>
+void BreakRequest<T>::finish(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace managed_lock
+} // namespace librbd
+
+template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>;
diff --git a/src/librbd/managed_lock/BreakRequest.h b/src/librbd/managed_lock/BreakRequest.h
new file mode 100644
index 000000000..dd46bbcc5
--- /dev/null
+++ b/src/librbd/managed_lock/BreakRequest.h
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
+#define CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "msg/msg_types.h"
+#include
+#include
+#include
+#include "librbd/managed_lock/Types.h"
+
+class Context;
+class ContextWQ;
+class obj_watch_t;
+
+namespace librbd {
+
+class AsioEngine;
+class ImageCtx;
+template <typename> class Journal;
+namespace asio { struct ContextWQ; }
+
+namespace managed_lock {
+
+template <typename T>
+class BreakRequest {
+public:
+  static BreakRequest* create(librados::IoCtx& ioctx,
+                              AsioEngine& asio_engine,
+                              const std::string& oid, const Locker &locker,
+                              bool exclusive, bool blocklist_locker,
+                              uint32_t blocklist_expire_seconds,
+                              bool force_break_lock, Context *on_finish) {
+    return new BreakRequest(ioctx, asio_engine, oid, locker, exclusive,
+                            blocklist_locker, blocklist_expire_seconds,
+                            force_break_lock, on_finish);
+  }
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_WATCHERS
+   *    |
+   *    v
+   * GET_LOCKER
+   *    |
+   *    v
+   * BLOCKLIST (skip if disabled)
+   *    |
+   *    v
+   * WAIT_FOR_OSD_MAP
+   *    |
+   *    v
+   * BREAK_LOCK
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_ioctx;
+  CephContext *m_cct;
+  AsioEngine& m_asio_engine;
+  std::string m_oid;
+  Locker m_locker;
+  bool m_exclusive;
+  bool m_blocklist_locker;
+  uint32_t m_blocklist_expire_seconds;
+  bool m_force_break_lock;
+  Context *m_on_finish;
+
+  bufferlist m_out_bl;
+
+  std::list<obj_watch_t> m_watchers;
+  int m_watchers_ret_val;
+
+  Locker m_refreshed_locker;
+
+  BreakRequest(librados::IoCtx& ioctx, AsioEngine& asio_engine,
+               const std::string& oid, const Locker &locker,
+               bool exclusive, bool blocklist_locker,
+               uint32_t blocklist_expire_seconds, bool force_break_lock,
+               Context *on_finish);
+
+  void send_get_watchers();
+  void handle_get_watchers(int r);
+
+  void send_get_locker();
+  void handle_get_locker(int r);
+
+  void send_blocklist();
+  void handle_blocklist(int r);
+
+  void wait_for_osd_map();
+  void handle_wait_for_osd_map(int r);
+
+  void send_break_lock();
+  void handle_break_lock(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace managed_lock
+} // namespace librbd
+
+extern template class librbd::managed_lock::BreakRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MANAGED_LOCK_BREAK_REQUEST_H
diff --git a/src/librbd/managed_lock/GetLockerRequest.cc b/src/librbd/managed_lock/GetLockerRequest.cc
new file mode 100644
index 000000000..ea898ab96
--- /dev/null
+++ b/src/librbd/managed_lock/GetLockerRequest.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/managed_lock/GetLockerRequest.h"
+#include "cls/lock/cls_lock_client.h"
+#include "cls/lock/cls_lock_types.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include
"librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/managed_lock/Types.h" +#include "librbd/managed_lock/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::GetLockerRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace managed_lock { + +using librbd::util::create_rados_callback; + +template +GetLockerRequest::GetLockerRequest(librados::IoCtx& ioctx, + const std::string& oid, bool exclusive, + Locker *locker, Context *on_finish) + : m_ioctx(ioctx), m_cct(reinterpret_cast(m_ioctx.cct())), + m_oid(oid), m_exclusive(exclusive), m_locker(locker), + m_on_finish(on_finish) { +} + +template +void GetLockerRequest::send() { + send_get_lockers(); +} + +template +void GetLockerRequest::send_get_lockers() { + ldout(m_cct, 10) << dendl; + + librados::ObjectReadOperation op; + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = GetLockerRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + m_out_bl.clear(); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void GetLockerRequest::handle_get_lockers(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + std::map lockers; + ClsLockType lock_type = ClsLockType::NONE; + std::string lock_tag; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = rados::cls::lock::get_lock_info_finish(&it, &lockers, &lock_type, + &lock_tag); + } + + if (r < 0) { + lderr(m_cct) << "failed to retrieve lockers: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (lockers.empty()) { + ldout(m_cct, 20) << "no lockers detected" << dendl; + finish(-ENOENT); + return; + } + + if (lock_tag != util::get_watcher_lock_tag()) { + ldout(m_cct, 5) <<"locked by external mechanism: tag=" << lock_tag << dendl; + finish(-EBUSY); + return; + } + + if (m_exclusive && lock_type == ClsLockType::SHARED) { + ldout(m_cct, 5) << "incompatible shared lock type detected" << dendl; + finish(-EBUSY); + return; + } else if (!m_exclusive && lock_type == ClsLockType::EXCLUSIVE) { + ldout(m_cct, 5) << "incompatible exclusive lock type detected" << dendl; + finish(-EBUSY); + return; + } + + std::map::iterator iter = lockers.begin(); + if (!util::decode_lock_cookie(iter->first.cookie, &m_locker->handle)) { + ldout(m_cct, 5) << "locked by external mechanism: " + << "cookie=" << iter->first.cookie << dendl; + finish(-EBUSY); + return; + } + + if (iter->second.addr.is_blank_ip()) { + ldout(m_cct, 5) << "locker has a blank address" << dendl; + finish(-EBUSY); + return; + } + m_locker->entity = iter->first.locker; + m_locker->cookie = iter->first.cookie; + m_locker->address = iter->second.addr.get_legacy_str(); + + ldout(m_cct, 10) << "retrieved exclusive locker: " + << m_locker->entity << "@" << m_locker->address << dendl; + finish(0); +} + +template +void GetLockerRequest::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::GetLockerRequest; diff --git a/src/librbd/managed_lock/GetLockerRequest.h b/src/librbd/managed_lock/GetLockerRequest.h new file mode 100644 index 000000000..b8fd08f6e --- /dev/null +++ b/src/librbd/managed_lock/GetLockerRequest.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef 
CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "include/rados/librados_fwd.hpp" +#include + +class Context; + +namespace librbd { + +struct ImageCtx; + +namespace managed_lock { + +struct Locker; + +template +class GetLockerRequest { +public: + static GetLockerRequest* create(librados::IoCtx& ioctx, + const std::string& oid, bool exclusive, + Locker *locker, Context *on_finish) { + return new GetLockerRequest(ioctx, oid, exclusive, locker, on_finish); + } + + void send(); + +private: + librados::IoCtx &m_ioctx; + CephContext *m_cct; + std::string m_oid; + bool m_exclusive; + Locker *m_locker; + Context *m_on_finish; + + bufferlist m_out_bl; + + GetLockerRequest(librados::IoCtx& ioctx, const std::string& oid, + bool exclusive, Locker *locker, Context *on_finish); + + void send_get_lockers(); + void handle_get_lockers(int r); + + void finish(int r); + +}; + +} // namespace managed_lock +} // namespace librbd + +extern template class librbd::managed_lock::GetLockerRequest; + +#endif // CEPH_LIBRBD_MANAGED_LOCK_GET_LOCKER_REQUEST_H diff --git a/src/librbd/managed_lock/ReacquireRequest.cc b/src/librbd/managed_lock/ReacquireRequest.cc new file mode 100644 index 000000000..9eaa51569 --- /dev/null +++ b/src/librbd/managed_lock/ReacquireRequest.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/ReacquireRequest.h" +#include "librbd/Watcher.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/managed_lock/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::ReacquireRequest: " \ + << this << ": " << __func__ + +using std::string; + +namespace librbd { +namespace managed_lock { + +using librbd::util::create_rados_callback; + +template +ReacquireRequest::ReacquireRequest(librados::IoCtx& ioctx, + const string& oid, + const string& old_cookie, + const string &new_cookie, + bool exclusive, + Context *on_finish) + : m_ioctx(ioctx), m_oid(oid), m_old_cookie(old_cookie), + m_new_cookie(new_cookie), m_exclusive(exclusive), m_on_finish(on_finish) { +} + + +template +void ReacquireRequest::send() { + set_cookie(); +} + +template +void ReacquireRequest::set_cookie() { + CephContext *cct = reinterpret_cast(m_ioctx.cct()); + ldout(cct, 10) << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::set_cookie(&op, RBD_LOCK_NAME, + m_exclusive ? 
ClsLockType::EXCLUSIVE : ClsLockType::SHARED, + m_old_cookie, util::get_watcher_lock_tag(), + m_new_cookie); + + librados::AioCompletion *rados_completion = create_rados_callback< + ReacquireRequest, &ReacquireRequest::handle_set_cookie>(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void ReacquireRequest::handle_set_cookie(int r) { + CephContext *cct = reinterpret_cast(m_ioctx.cct()); + ldout(cct, 10) << ": r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + ldout(cct, 10) << ": OSD doesn't support updating lock" << dendl; + } else if (r < 0) { + lderr(cct) << ": failed to update lock: " << cpp_strerror(r) << dendl; + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::ReacquireRequest; diff --git a/src/librbd/managed_lock/ReacquireRequest.h b/src/librbd/managed_lock/ReacquireRequest.h new file mode 100644 index 000000000..3f2b7d7e2 --- /dev/null +++ b/src/librbd/managed_lock/ReacquireRequest.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "include/int_types.h" +#include + +class Context; + +namespace librbd { + +class Watcher; + +namespace managed_lock { + +template +class ReacquireRequest { +public: + + static ReacquireRequest *create(librados::IoCtx& ioctx, + const std::string& oid, + const std::string& old_cookie, + const std::string &new_cookie, + bool exclusive, + Context *on_finish) { + return new ReacquireRequest(ioctx, oid, old_cookie, new_cookie, exclusive, + on_finish); + } + + ReacquireRequest(librados::IoCtx& ioctx, const std::string& oid, + const std::string& old_cookie, + const std::string &new_cookie, bool exclusive, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * SET_COOKIE + * | + * v + * + * + * @endverbatim + */ + librados::IoCtx& m_ioctx; + std::string m_oid; + std::string m_old_cookie; + std::string m_new_cookie; + bool m_exclusive; + Context *m_on_finish; + + void set_cookie(); + void handle_set_cookie(int r); + +}; + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_REACQUIRE_REQUEST_H diff --git a/src/librbd/managed_lock/ReleaseRequest.cc b/src/librbd/managed_lock/ReleaseRequest.cc new file mode 100644 index 000000000..6707a149f --- /dev/null +++ b/src/librbd/managed_lock/ReleaseRequest.cc @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/managed_lock/ReleaseRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "cls/lock/cls_lock_types.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/Watcher.h" +#include "librbd/asio/ContextWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::managed_lock::ReleaseRequest: " \ + << this << " " << __func__ << ": " + +using std::string; + +namespace librbd { +namespace managed_lock { + +using util::detail::C_AsyncCallback; +using util::create_context_callback; +using util::create_rados_callback; + +template +ReleaseRequest* ReleaseRequest::create(librados::IoCtx& ioctx, + Watcher *watcher, + asio::ContextWQ *work_queue, 
+ const string& oid, + const string& cookie, + Context *on_finish) { + return new ReleaseRequest(ioctx, watcher, work_queue, oid, cookie, + on_finish); +} + +template +ReleaseRequest::ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher, + asio::ContextWQ *work_queue, + const string& oid, const string& cookie, + Context *on_finish) + : m_ioctx(ioctx), m_watcher(watcher), m_oid(oid), m_cookie(cookie), + m_on_finish(new C_AsyncCallback(work_queue, on_finish)) { +} + +template +ReleaseRequest::~ReleaseRequest() { +} + + +template +void ReleaseRequest::send() { + send_unlock(); +} + +template +void ReleaseRequest::send_unlock() { + CephContext *cct = reinterpret_cast(m_ioctx.cct()); + ldout(cct, 10) << "entity=client." << m_ioctx.get_instance_id() << ", " + << "cookie=" << m_cookie << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::unlock(&op, RBD_LOCK_NAME, m_cookie); + + using klass = ReleaseRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_ioctx.aio_operate(m_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void ReleaseRequest::handle_unlock(int r) { + CephContext *cct = reinterpret_cast(m_ioctx.cct()); + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to unlock: " << cpp_strerror(r) << dendl; + } + + finish(); +} + +template +void ReleaseRequest::finish() { + m_on_finish->complete(0); + delete this; +} + +} // namespace managed_lock +} // namespace librbd + +template class librbd::managed_lock::ReleaseRequest; + diff --git a/src/librbd/managed_lock/ReleaseRequest.h b/src/librbd/managed_lock/ReleaseRequest.h new file mode 100644 index 000000000..91d922282 --- /dev/null +++ b/src/librbd/managed_lock/ReleaseRequest.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H +#define CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "librbd/watcher/Types.h" +#include + +class Context; +class ContextWQ; + +namespace librbd { + +class Watcher; +namespace asio { struct ContextWQ; } + +namespace managed_lock { + +template +class ReleaseRequest { +private: + typedef watcher::Traits TypeTraits; + typedef typename TypeTraits::Watcher Watcher; + +public: + static ReleaseRequest* create(librados::IoCtx& ioctx, Watcher *watcher, + asio::ContextWQ *work_queue, + const std::string& oid, + const std::string& cookie, + Context *on_finish); + + ~ReleaseRequest(); + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * UNLOCK + * | + * v + * + * + * @endverbatim + */ + + ReleaseRequest(librados::IoCtx& ioctx, Watcher *watcher, + asio::ContextWQ *work_queue, const std::string& oid, + const std::string& cookie, Context *on_finish); + + librados::IoCtx& m_ioctx; + Watcher *m_watcher; + std::string m_oid; + std::string m_cookie; + Context *m_on_finish; + + void send_unlock(); + void handle_unlock(int r); + + void finish(); + +}; + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_RELEASE_REQUEST_H diff --git a/src/librbd/managed_lock/Types.h b/src/librbd/managed_lock/Types.h new file mode 100644 index 000000000..319789c83 --- /dev/null +++ b/src/librbd/managed_lock/Types.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_TYPES_H +#define 
CEPH_LIBRBD_MANAGED_LOCK_TYPES_H + +#include "msg/msg_types.h" +#include + +namespace librbd { +namespace managed_lock { + +struct Locker { + entity_name_t entity; + std::string cookie; + std::string address; + uint64_t handle = 0; + + Locker() { + } + Locker(const entity_name_t& entity, const std::string &cookie, + const std::string &address, uint64_t handle) + : entity(entity), cookie(cookie), address(address), handle(handle) { + } + + inline bool operator==(const Locker &rhs) const { + return (entity == rhs.entity && + cookie == rhs.cookie && + address == rhs.address && + handle == rhs.handle); + } + inline bool operator!=(const Locker &rhs) const { + return !(*this == rhs); + } +}; + +enum Mode { + EXCLUSIVE, + SHARED +}; + + +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_TYPES_H diff --git a/src/librbd/managed_lock/Utils.cc b/src/librbd/managed_lock/Utils.cc new file mode 100644 index 000000000..0b4f908dd --- /dev/null +++ b/src/librbd/managed_lock/Utils.cc @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/ceph_assert.h" +#include "librbd/managed_lock/Utils.h" +#include + +namespace librbd { +namespace managed_lock { +namespace util { + +namespace { + +const std::string WATCHER_LOCK_COOKIE_PREFIX = "auto"; +const std::string WATCHER_LOCK_TAG("internal"); + +} // anonymous namespace + +const std::string &get_watcher_lock_tag() { + return WATCHER_LOCK_TAG; +} + +bool decode_lock_cookie(const std::string &tag, uint64_t *handle) { + std::string prefix; + std::istringstream ss(tag); + if (!(ss >> prefix >> *handle) || prefix != WATCHER_LOCK_COOKIE_PREFIX) { + return false; + } + return true; +} + +std::string encode_lock_cookie(uint64_t watch_handle) { + ceph_assert(watch_handle != 0); + std::ostringstream ss; + ss << WATCHER_LOCK_COOKIE_PREFIX << " " << watch_handle; + return ss.str(); +} + +} // namespace util +} // namespace managed_lock +} // namespace librbd + + diff --git a/src/librbd/managed_lock/Utils.h b/src/librbd/managed_lock/Utils.h new file mode 100644 index 000000000..679cbfe8e --- /dev/null +++ b/src/librbd/managed_lock/Utils.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MANAGED_LOCK_UTILS_H +#define CEPH_LIBRBD_MANAGED_LOCK_UTILS_H + +#include "include/int_types.h" +#include + +namespace librbd { +namespace managed_lock { +namespace util { + +const std::string &get_watcher_lock_tag(); + +bool decode_lock_cookie(const std::string &tag, uint64_t *handle); +std::string encode_lock_cookie(uint64_t watch_handle); + +} // namespace util +} // namespace managed_lock +} // namespace librbd + +#endif // CEPH_LIBRBD_MANAGED_LOCK_UTILS_H diff --git a/src/librbd/migration/FileStream.cc b/src/librbd/migration/FileStream.cc new file mode 100644 index 000000000..63cd722dd --- /dev/null +++ b/src/librbd/migration/FileStream.cc @@ -0,0 +1,232 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _LARGEFILE64_SOURCE +#define _LARGEFILE64_SOURCE +#endif // _LARGEFILE64_SOURCE + +#include "librbd/migration/FileStream.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/Utils.h" +#include +#include +#include +#include +#include + +namespace librbd { +namespace migration { + +namespace { + +const std::string FILE_PATH 
{"file_path"}; + +} // anonymous namespace + +#ifdef BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::FileStream::ReadRequest " \ + << this << " " << __func__ << ": " + +template +struct FileStream::ReadRequest { + FileStream* file_stream; + io::Extents byte_extents; + bufferlist* data; + Context* on_finish; + size_t index = 0; + + ReadRequest(FileStream* file_stream, io::Extents&& byte_extents, + bufferlist* data, Context* on_finish) + : file_stream(file_stream), byte_extents(std::move(byte_extents)), + data(data), on_finish(on_finish) { + auto cct = file_stream->m_cct; + ldout(cct, 20) << dendl; + } + + void send() { + data->clear(); + read(); + } + + void read() { + auto cct = file_stream->m_cct; + if (index >= byte_extents.size()) { + finish(0); + return; + } + + auto& byte_extent = byte_extents[index++]; + ldout(cct, 20) << "byte_extent=" << byte_extent << dendl; + + auto ptr = buffer::ptr_node::create(buffer::create_small_page_aligned( + byte_extent.second)); + auto buffer = boost::asio::mutable_buffer( + ptr->c_str(), byte_extent.second); + data->push_back(std::move(ptr)); + + int r; + auto offset = lseek64(file_stream->m_file_no, byte_extent.first, SEEK_SET); + if (offset == -1) { + r = -errno; + lderr(cct) << "failed to seek file stream: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + boost::system::error_code ec; + size_t bytes_read = boost::asio::read( + *file_stream->m_stream_descriptor, std::move(buffer), ec); + r = -ec.value(); + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to read from file stream: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else if (bytes_read < byte_extent.second) { + lderr(cct) << "failed to read " << byte_extent.second << " bytes from " + << "file stream" << dendl; + finish(-ERANGE); + return; + } + + // re-queue the remainder of the read requests + boost::asio::post(file_stream->m_strand, [this]() { read(); }); + } + + void finish(int r) { + auto cct = file_stream->m_cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + data->clear(); + } + + on_finish->complete(r); + delete this; + } +}; + +#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::FileStream: " << this \ + << " " << __func__ << ": " + +template +FileStream::FileStream(I* image_ctx, const json_spirit::mObject& json_object) + : m_cct(image_ctx->cct), m_asio_engine(image_ctx->asio_engine), + m_json_object(json_object), + m_strand(boost::asio::make_strand(*m_asio_engine)) { +} + +template +FileStream::~FileStream() { + if (m_file_no != -1) { + ::close(m_file_no); + } +} + +#ifdef BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + +template +void FileStream::open(Context* on_finish) { + auto& file_path_value = m_json_object[FILE_PATH]; + if (file_path_value.type() != json_spirit::str_type) { + lderr(m_cct) << "failed to locate '" << FILE_PATH << "' key" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& file_path = file_path_value.get_str(); + ldout(m_cct, 10) << "file_path=" << file_path << dendl; + + m_file_no = ::open(file_path.c_str(), O_RDONLY); + if (m_file_no < 0) { + int r = -errno; + lderr(m_cct) << "failed to open file stream '" << file_path << "': " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + m_stream_descriptor = std::make_optional< + boost::asio::posix::stream_descriptor>(m_strand, m_file_no); + on_finish->complete(0); +} + +template 
+void FileStream::close(Context* on_finish) { + ldout(m_cct, 10) << dendl; + + m_stream_descriptor.reset(); + on_finish->complete(0); +} + +template +void FileStream::get_size(uint64_t* size, Context* on_finish) { + ldout(m_cct, 10) << dendl; + + // execute IO operations in a single strand to prevent seek races + boost::asio::post( + m_strand, [this, size, on_finish]() { + auto offset = lseek64(m_file_no, 0, SEEK_END); + if (offset == -1) { + int r = -errno; + lderr(m_cct) << "failed to seek to file end: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + ldout(m_cct, 10) << "size=" << offset << dendl; + *size = offset; + on_finish->complete(0); + }); +} + +template +void FileStream::read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) { + ldout(m_cct, 20) << byte_extents << dendl; + + auto ctx = new ReadRequest(this, std::move(byte_extents), data, on_finish); + + // execute IO operations in a single strand to prevent seek races + boost::asio::post(m_strand, [ctx]() { ctx->send(); }); +} + +#else // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + +template +void FileStream::open(Context* on_finish) { + on_finish->complete(-EIO); +} + +template +void FileStream::close(Context* on_finish) { + on_finish->complete(-EIO); +} + +template +void FileStream::get_size(uint64_t* size, Context* on_finish) { + on_finish->complete(-EIO); +} + +template +void FileStream::read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) { + on_finish->complete(-EIO); +} + +#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + +} // namespace migration +} // namespace librbd + +template class librbd::migration::FileStream; diff --git a/src/librbd/migration/FileStream.h b/src/librbd/migration/FileStream.h new file mode 100644 index 000000000..32face71e --- /dev/null +++ b/src/librbd/migration/FileStream.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_FILE_STREAM_H +#define CEPH_LIBRBD_MIGRATION_FILE_STREAM_H + +#include "include/int_types.h" +#include "librbd/migration/StreamInterface.h" +#include +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template +class FileStream : public StreamInterface { +public: + static FileStream* create(ImageCtxT* image_ctx, + const json_spirit::mObject& json_object) { + return new FileStream(image_ctx, json_object); + } + + FileStream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object); + ~FileStream() override; + + FileStream(const FileStream&) = delete; + FileStream& operator=(const FileStream&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_size(uint64_t* size, Context* on_finish) override; + + void read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) override; + +private: + CephContext* m_cct; + std::shared_ptr m_asio_engine; + json_spirit::mObject m_json_object; + + boost::asio::strand m_strand; +#ifdef BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + std::optional m_stream_descriptor; + + struct ReadRequest; + +#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR + + int m_file_no = -1; +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::FileStream; + +#endif // CEPH_LIBRBD_MIGRATION_FILE_STREAM_H diff --git a/src/librbd/migration/FormatInterface.h b/src/librbd/migration/FormatInterface.h 
new file mode 100644
index 000000000..d13521d11
--- /dev/null
+++ b/src/librbd/migration/FormatInterface.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_FORMAT_INTERFACE_H
+#define CEPH_LIBRBD_MIGRATION_FORMAT_INTERFACE_H
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include <map>
+
+struct Context;
+
+namespace librbd {
+
+namespace io {
+struct AioCompletion;
+struct ReadResult;
+} // namespace io
+
+namespace migration {
+
+struct FormatInterface {
+  typedef std::map<uint64_t, SnapInfo> SnapInfos;
+
+  virtual ~FormatInterface() {
+  }
+
+  virtual void open(Context* on_finish) = 0;
+  virtual void close(Context* on_finish) = 0;
+
+  virtual void get_snapshots(SnapInfos* snap_infos, Context* on_finish) = 0;
+  virtual void get_image_size(uint64_t snap_id, uint64_t* size,
+                              Context* on_finish) = 0;
+
+  virtual bool read(io::AioCompletion* aio_comp, uint64_t snap_id,
+                    io::Extents&& image_extents, io::ReadResult&& read_result,
+                    int op_flags, int read_flags,
+                    const ZTracer::Trace &parent_trace) = 0;
+
+  virtual void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids,
+                          int list_snaps_flags,
+                          io::SnapshotDelta* snapshot_delta,
+                          const ZTracer::Trace &parent_trace,
+                          Context* on_finish) = 0;
+};
+
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_FORMAT_INTERFACE_H
diff --git a/src/librbd/migration/HttpClient.cc b/src/librbd/migration/HttpClient.cc
new file mode 100644
index 000000000..90d5723ed
--- /dev/null
+++ b/src/librbd/migration/HttpClient.cc
@@ -0,0 +1,947 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/HttpClient.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/Utils.h"
+#include "librbd/io/AioCompletion.h"
+#include "librbd/io/ReadResult.h"
+#include "librbd/migration/Utils.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace librbd {
+namespace migration {
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::HttpClient::" \
+                           << "HttpSession " << this << " " << __func__ \
+                           << ": "
+
+/**
+ * boost::beast utilizes non-inheriting template classes for handling plain vs
+ * encrypted TCP streams. Utilize a base class for the majority of the logic
+ * for connecting, disconnecting, resetting, and sending requests.
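 *
 * A minimal sketch of that pattern (the names below are illustrative only,
 * not the actual session types in this file): a CRTP base calls
 * static_cast<D&>(*this).stream(), so the shared logic compiles once against
 * each concrete stream type.
 *
 *   template <typename D>
 *   struct SessionBase {
 *     template <typename Request, typename Handler>
 *     void issue(Request& request, Handler&& handler) {
 *       // resolves at compile time to the plain or encrypted stream
 *       boost::beast::http::async_write(
 *         static_cast<D&>(*this).stream(), request,
 *         std::forward<Handler>(handler));
 *     }
 *   };
 *   struct PlainSession : SessionBase<PlainSession> {
 *     boost::beast::tcp_stream& stream();
 *   };
 *   struct SslSession : SessionBase<SslSession> {
 *     boost::beast::ssl_stream<boost::beast::tcp_stream>& stream();
 *   };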
+ */
+
+template <typename I>
+template <typename D>
+class HttpClient<I>::HttpSession : public HttpSessionInterface {
+public:
+  void init(Context* on_finish) override {
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << dendl;
+
+    ceph_assert(m_state == STATE_UNINITIALIZED);
+    m_state = STATE_CONNECTING;
+
+    resolve_host(on_finish);
+  }
+
+  void shut_down(Context* on_finish) override {
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << dendl;
+
+    ceph_assert(on_finish != nullptr);
+    ceph_assert(m_on_shutdown == nullptr);
+    m_on_shutdown = on_finish;
+
+    auto current_state = m_state;
+    if (current_state == STATE_UNINITIALIZED) {
+      // never initialized or resolve/connect failed
+      on_finish->complete(0);
+      return;
+    }
+
+    m_state = STATE_SHUTTING_DOWN;
+    if (current_state != STATE_READY) {
+      // delay shutdown until current state transition completes
+      return;
+    }
+
+    disconnect(new LambdaContext([this](int r) { handle_shut_down(r); }));
+  }
+
+  void issue(std::shared_ptr<Work>&& work) override {
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 20) << "work=" << work.get() << dendl;
+
+    if (is_shutdown()) {
+      lderr(cct) << "cannot issue HTTP request, client is shutdown"
+                 << dendl;
+      work->complete(-ESHUTDOWN, {});
+      return;
+    }
+
+    bool first_issue = m_issue_queue.empty();
+    m_issue_queue.emplace_back(work);
+    if (m_state == STATE_READY && first_issue) {
+      ldout(cct, 20) << "sending http request: work=" << work.get() << dendl;
+      finalize_issue(std::move(work));
+    } else if (m_state == STATE_UNINITIALIZED) {
+      ldout(cct, 20) << "resetting HTTP session: work=" << work.get() << dendl;
+      m_state = STATE_RESET_CONNECTING;
+      resolve_host(nullptr);
+    } else {
+      ldout(cct, 20) << "queueing HTTP request: work=" << work.get() << dendl;
+    }
+  }
+
+  void finalize_issue(std::shared_ptr<Work>&& work) {
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 20) << "work=" << work.get() << dendl;
+
+    ++m_in_flight_requests;
+    (*work)(derived().stream());
+  }
+
+  void handle_issue(boost::system::error_code ec,
+                    std::shared_ptr<Work>&& work) override {
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 20) << "work=" << work.get() << ", r=" << -ec.value() << dendl;
+
+    ceph_assert(m_in_flight_requests > 0);
+    --m_in_flight_requests;
+    if (maybe_finalize_reset()) {
+      // previous request is attempting reset so this request will be resent
+      return;
+    }
+
+    ceph_assert(!m_issue_queue.empty());
+    m_issue_queue.pop_front();
+
+    if (is_shutdown()) {
+      lderr(cct) << "client shutdown during in-flight request" << dendl;
+      work->complete(-ESHUTDOWN, {});
+
+      maybe_finalize_shutdown();
+      return;
+    }
+
+    if (ec) {
+      if (ec == boost::asio::error::bad_descriptor ||
+          ec == boost::asio::error::broken_pipe ||
+          ec == boost::asio::error::connection_reset ||
+          ec == boost::asio::error::operation_aborted ||
+          ec == boost::asio::ssl::error::stream_truncated ||
+          ec == boost::beast::http::error::end_of_stream ||
+          ec == boost::beast::http::error::partial_message) {
+        ldout(cct, 5) << "remote peer stream closed, retrying request" << dendl;
+        m_issue_queue.push_front(work);
+      } else if (ec == boost::beast::error::timeout) {
+        lderr(cct) << "timed-out while issuing request" << dendl;
+        work->complete(-ETIMEDOUT, {});
+      } else {
+        lderr(cct) << "failed to issue request: " << ec.message() << dendl;
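        // the error code's raw value is negated into librbd's -errno return
        // convention below; for boost system-category errors this matches
        // errno directly, for beast/http categories it is a best-effort map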
work->complete(-ec.value(), {}); + } + + // attempt to recover the connection + reset(); + return; + } + + bool first_receive = m_receive_queue.empty(); + m_receive_queue.push_back(work); + if (first_receive) { + receive(std::move(work)); + } + + // TODO disable pipelining for non-idempotent requests + + // pipeline the next request into the stream + if (!m_issue_queue.empty()) { + work = m_issue_queue.front(); + ldout(cct, 20) << "sending http request: work=" << work.get() << dendl; + finalize_issue(std::move(work)); + } + } + +protected: + HttpClient* m_http_client; + + HttpSession(HttpClient* http_client) + : m_http_client(http_client), m_resolver(http_client->m_strand) { + } + + virtual void connect(boost::asio::ip::tcp::resolver::results_type results, + Context* on_finish) = 0; + virtual void disconnect(Context* on_finish) = 0; + + void close_socket() { + auto cct = m_http_client->m_cct; + ldout(cct, 15) << dendl; + + boost::system::error_code ec; + boost::beast::get_lowest_layer(derived().stream()).socket().close(ec); + } + +private: + enum State { + STATE_UNINITIALIZED, + STATE_CONNECTING, + STATE_READY, + STATE_RESET_PENDING, + STATE_RESET_DISCONNECTING, + STATE_RESET_CONNECTING, + STATE_SHUTTING_DOWN, + STATE_SHUTDOWN, + }; + + State m_state = STATE_UNINITIALIZED; + boost::asio::ip::tcp::resolver m_resolver; + + Context* m_on_shutdown = nullptr; + + uint64_t m_in_flight_requests = 0; + std::deque> m_issue_queue; + std::deque> m_receive_queue; + + boost::beast::flat_buffer m_buffer; + std::optional> m_header_parser; + std::optional> m_parser; + + D& derived() { + return static_cast(*this); + } + + void resolve_host(Context* on_finish) { + auto cct = m_http_client->m_cct; + ldout(cct, 15) << dendl; + + shutdown_socket(); + m_resolver.async_resolve( + m_http_client->m_url_spec.host, m_http_client->m_url_spec.port, + [this, on_finish](boost::system::error_code ec, auto results) { + handle_resolve_host(ec, results, on_finish); }); + } + + void handle_resolve_host( + boost::system::error_code ec, + boost::asio::ip::tcp::resolver::results_type results, + Context* on_finish) { + auto cct = m_http_client->m_cct; + int r = -ec.value(); + ldout(cct, 15) << "r=" << r << dendl; + + if (ec) { + if (ec == boost::asio::error::host_not_found) { + r = -ENOENT; + } else if (ec == boost::asio::error::host_not_found_try_again) { + // TODO: add retry throttle + r = -EAGAIN; + } + + lderr(cct) << "failed to resolve host '" + << m_http_client->m_url_spec.host << "': " + << cpp_strerror(r) << dendl; + advance_state(STATE_UNINITIALIZED, r, on_finish); + return; + } + + connect(results, new LambdaContext([this, on_finish](int r) { + handle_connect(r, on_finish); })); + } + + void handle_connect(int r, Context* on_finish) { + auto cct = m_http_client->m_cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to connect to host '" + << m_http_client->m_url_spec.host << "': " + << cpp_strerror(r) << dendl; + advance_state(STATE_UNINITIALIZED, r, on_finish); + return; + } + + advance_state(STATE_READY, 0, on_finish); + } + + void handle_shut_down(int r) { + auto cct = m_http_client->m_cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to disconnect stream: '" << cpp_strerror(r) + << dendl; + } + + // cancel all in-flight send/receives (if any) + shutdown_socket(); + + maybe_finalize_shutdown(); + } + + void maybe_finalize_shutdown() { + if (m_in_flight_requests > 0) { + return; + } + + // cancel any queued IOs + fail_queued_work(-ESHUTDOWN); + + 
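    // with no requests in flight and both queues drained, the session can
    // advance to its terminal state; advance_state() completes the context
    // captured earlier by shut_down()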
advance_state(STATE_SHUTDOWN, 0, nullptr);
+  }
+
+  bool is_shutdown() const {
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+    return (m_state == STATE_SHUTTING_DOWN || m_state == STATE_SHUTDOWN);
+  }
+
+  void reset() {
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+    ceph_assert(m_state == STATE_READY);
+
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << dendl;
+
+    m_state = STATE_RESET_PENDING;
+    maybe_finalize_reset();
+  }
+
+  bool maybe_finalize_reset() {
+    if (m_state != STATE_RESET_PENDING) {
+      return false;
+    }
+
+    if (m_in_flight_requests > 0) {
+      return true;
+    }
+
+    ceph_assert(m_http_client->m_strand.running_in_this_thread());
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << dendl;
+
+    m_buffer.clear();
+
+    // move in-flight request back to the front of the issue queue
+    m_issue_queue.insert(m_issue_queue.begin(),
+                         m_receive_queue.begin(), m_receive_queue.end());
+    m_receive_queue.clear();
+
+    m_state = STATE_RESET_DISCONNECTING;
+    disconnect(new LambdaContext([this](int r) { handle_reset(r); }));
+    return true;
+  }
+
+  void handle_reset(int r) {
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << "r=" << r << dendl;
+
+    if (r < 0) {
+      lderr(cct) << "failed to disconnect stream: " << cpp_strerror(r)
+                 << dendl;
+    }
+
+    advance_state(STATE_RESET_CONNECTING, r, nullptr);
+  }
+
+  int shutdown_socket() {
+    if (!boost::beast::get_lowest_layer(
+          derived().stream()).socket().is_open()) {
+      return 0;
+    }
+
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << dendl;
+
+    boost::system::error_code ec;
+    boost::beast::get_lowest_layer(derived().stream()).socket().shutdown(
+      boost::asio::ip::tcp::socket::shutdown_both, ec);
+
+    if (ec && ec != boost::beast::errc::not_connected) {
+      lderr(cct) << "failed to shutdown socket: " << ec.message() << dendl;
+      return -ec.value();
+    }
+
+    close_socket();
+    return 0;
+  }
+
+  void receive(std::shared_ptr<Work>&& work) {
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << "work=" << work.get() << dendl;
+
+    ceph_assert(!m_receive_queue.empty());
+    ++m_in_flight_requests;
+
+    // receive the response for this request
+    m_parser.emplace();
+    if (work->header_only()) {
+      // HEAD requests don't transfer data but the parser still cares about max
+      // content-length
+      m_header_parser.emplace();
+      m_header_parser->body_limit(std::numeric_limits<uint64_t>::max());
+
+      boost::beast::http::async_read_header(
+        derived().stream(), m_buffer, *m_header_parser,
+        [this, work=std::move(work)]
+        (boost::beast::error_code ec, std::size_t) mutable {
+          handle_receive(ec, std::move(work));
+        });
+    } else {
+      m_parser->body_limit(1 << 25); // max RBD object size
+      boost::beast::http::async_read(
+        derived().stream(), m_buffer, *m_parser,
+        [this, work=std::move(work)]
+        (boost::beast::error_code ec, std::size_t) mutable {
+          handle_receive(ec, std::move(work));
+        });
+    }
+  }
+
+  void handle_receive(boost::system::error_code ec,
+                      std::shared_ptr<Work>&& work) {
+    auto cct = m_http_client->m_cct;
+    ldout(cct, 15) << "work=" << work.get() << ", r=" << -ec.value() << dendl;
+
+    ceph_assert(m_in_flight_requests > 0);
+    --m_in_flight_requests;
+    if (maybe_finalize_reset()) {
+      // previous request is attempting reset, so this request will be resent
+      return;
+    }
+
+    ceph_assert(!m_receive_queue.empty());
+    m_receive_queue.pop_front();
+
+    if (is_shutdown()) {
+      lderr(cct) << "client shutdown with in-flight request" << dendl;
+      work->complete(-ESHUTDOWN, {});
+
+      maybe_finalize_shutdown();
+      return;
+    }
+
+    if (ec) {
+      if (ec == boost::asio::error::bad_descriptor ||
+          ec == boost::asio::error::broken_pipe ||
+          ec == boost::asio::error::connection_reset ||
+          ec == boost::asio::error::operation_aborted ||
+          ec == boost::asio::ssl::error::stream_truncated ||
+          ec == boost::beast::http::error::end_of_stream ||
+          ec == boost::beast::http::error::partial_message) {
+        ldout(cct, 5) << "remote peer stream closed, retrying request" << dendl;
+        m_receive_queue.push_front(work);
+      } else if (ec == boost::beast::error::timeout) {
+        lderr(cct) << "timed-out while receiving response" << dendl;
+        work->complete(-ETIMEDOUT, {});
+      } else {
+        lderr(cct) << "failed to receive response: " << ec.message() << dendl;
+        work->complete(-ec.value(), {});
+      }
+
+      // attempt to recover the connection
+      reset();
+      return;
+    }
+
+    Response response;
+    if (work->header_only()) {
+      m_parser.emplace(std::move(*m_header_parser));
+    }
+    response = m_parser->release();
+
+    // basic response code handling in a common location
+    int r = 0;
+    auto result = response.result();
+    if (result == boost::beast::http::status::not_found) {
+      lderr(cct) << "requested resource does not exist" << dendl;
+      r = -ENOENT;
+    } else if (result == boost::beast::http::status::forbidden) {
+      lderr(cct) << "permission denied attempting to access resource" << dendl;
+      r = -EACCES;
+    } else if (boost::beast::http::to_status_class(result) !=
+               boost::beast::http::status_class::successful) {
+      lderr(cct) << "request failed: HTTP " << result << dendl;
+      r = -EIO;
+    }
+
+    bool need_eof = response.need_eof();
+    if (r < 0) {
+      work->complete(r, {});
+    } else {
+      work->complete(0, std::move(response));
+    }
+
+    if (need_eof) {
+      ldout(cct, 20) << "reset required for non-pipelined response: "
+                     << "work=" << work.get() << dendl;
+      reset();
+    } else if (!m_receive_queue.empty()) {
+      auto work = m_receive_queue.front();
+      receive(std::move(work));
+    }
+  }
+
+  void advance_state(State next_state, int r, Context* on_finish) {
+    auto cct = m_http_client->m_cct;
+    auto current_state = m_state;
+    ldout(cct, 15) << "current_state=" << current_state << ", "
+                   << "next_state=" << next_state << ", "
+                   << "r=" << r << dendl;
+
+    m_state = next_state;
+    if (current_state == STATE_CONNECTING) {
+      if (next_state == STATE_UNINITIALIZED) {
+        shutdown_socket();
+        on_finish->complete(r);
+        return;
+      } else if (next_state == STATE_READY) {
+        on_finish->complete(r);
+        return;
+      }
+    } else if (current_state == STATE_SHUTTING_DOWN) {
+      if (next_state == STATE_READY) {
+        // shut down requested while connecting/resetting
+        disconnect(new LambdaContext([this](int r) { handle_shut_down(r); }));
+        return;
+      } else if (next_state == STATE_UNINITIALIZED ||
+                 next_state == STATE_SHUTDOWN ||
+                 next_state == STATE_RESET_CONNECTING) {
+        ceph_assert(m_on_shutdown != nullptr);
+        m_on_shutdown->complete(r);
+        return;
+      }
+    } else if (current_state == STATE_RESET_DISCONNECTING) {
+      // disconnected from peer -- ignore errors and reconnect
+      ceph_assert(next_state == STATE_RESET_CONNECTING);
+      ceph_assert(on_finish == nullptr);
+      shutdown_socket();
+      resolve_host(nullptr);
+      return;
+    } else if (current_state == STATE_RESET_CONNECTING) {
+      ceph_assert(on_finish == nullptr);
+      if (next_state == STATE_READY) {
+        // restart queued IO
+        if (!m_issue_queue.empty()) {
+          auto& work = m_issue_queue.front();
+          finalize_issue(std::move(work));
+        }
+        return;
+      } else if (next_state == STATE_UNINITIALIZED) {
+        shutdown_socket();
+
+        // fail all queued IO
+        fail_queued_work(r);
+        return;
+      }
+    }
+
+    lderr(cct) << "unexpected state transition: "
+               << "current_state=" <<
current_state << ", " + << "next_state=" << next_state << dendl; + ceph_assert(false); + } + + void complete_work(std::shared_ptr work, int r, Response&& response) { + auto cct = m_http_client->m_cct; + ldout(cct, 20) << "work=" << work.get() << ", r=" << r << dendl; + + work->complete(r, std::move(response)); + } + + void fail_queued_work(int r) { + auto cct = m_http_client->m_cct; + ldout(cct, 10) << "r=" << r << dendl; + + for (auto& work : m_issue_queue) { + complete_work(work, r, {}); + } + m_issue_queue.clear(); + ceph_assert(m_receive_queue.empty()); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::HttpClient::" \ + << "PlainHttpSession " << this << " " << __func__ \ + << ": " + +template +class HttpClient::PlainHttpSession : public HttpSession { +public: + PlainHttpSession(HttpClient* http_client) + : HttpSession(http_client), + m_stream(http_client->m_strand) { + } + ~PlainHttpSession() override { + this->close_socket(); + } + + inline boost::beast::tcp_stream& + stream() { + return m_stream; + } + +protected: + void connect(boost::asio::ip::tcp::resolver::results_type results, + Context* on_finish) override { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << dendl; + + m_stream.async_connect( + results, + [on_finish](boost::system::error_code ec, const auto& endpoint) { + on_finish->complete(-ec.value()); + }); + } + + void disconnect(Context* on_finish) override { + on_finish->complete(0); + } + +private: + boost::beast::tcp_stream m_stream; + +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::HttpClient::" \ + << "SslHttpSession " << this << " " << __func__ \ + << ": " + +template +class HttpClient::SslHttpSession : public HttpSession { +public: + SslHttpSession(HttpClient* http_client) + : HttpSession(http_client), + m_stream(http_client->m_strand, http_client->m_ssl_context) { + } + ~SslHttpSession() override { + this->close_socket(); + } + + inline boost::beast::ssl_stream& + stream() { + return m_stream; + } + +protected: + void connect(boost::asio::ip::tcp::resolver::results_type results, + Context* on_finish) override { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << dendl; + + boost::beast::get_lowest_layer(m_stream).async_connect( + results, + [this, on_finish](boost::system::error_code ec, const auto& endpoint) { + handle_connect(-ec.value(), on_finish); + }); + } + + void disconnect(Context* on_finish) override { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << dendl; + + if (!m_ssl_enabled) { + on_finish->complete(0); + return; + } + + m_stream.async_shutdown( + asio::util::get_callback_adapter([this, on_finish](int r) { + shutdown(r, on_finish); })); + } + +private: + boost::beast::ssl_stream m_stream; + bool m_ssl_enabled = false; + + void handle_connect(int r, Context* on_finish) { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << dendl; + + if (r < 0) { + lderr(cct) << "failed to connect to host '" + << http_client->m_url_spec.host << "': " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + handshake(on_finish); + } + + void handshake(Context* on_finish) { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << dendl; + + auto& host = http_client->m_url_spec.host; + m_stream.set_verify_mode( + boost::asio::ssl::verify_peer | + 
boost::asio::ssl::verify_fail_if_no_peer_cert); + m_stream.set_verify_callback( + [host, next=boost::asio::ssl::host_name_verification(host), + ignore_self_signed=http_client->m_ignore_self_signed_cert] + (bool preverified, boost::asio::ssl::verify_context& ctx) { + if (!preverified && ignore_self_signed) { + auto ec = X509_STORE_CTX_get_error(ctx.native_handle()); + switch (ec) { + case X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT: + case X509_V_ERR_SELF_SIGNED_CERT_IN_CHAIN: + // ignore self-signed cert issues + preverified = true; + break; + default: + break; + } + } + return next(preverified, ctx); + }); + + // Set SNI Hostname (many hosts need this to handshake successfully) + if(!SSL_set_tlsext_host_name(m_stream.native_handle(), + http_client->m_url_spec.host.c_str())) { + int r = -::ERR_get_error(); + lderr(cct) << "failed to initialize SNI hostname: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + // Perform the SSL/TLS handshake + m_stream.async_handshake( + boost::asio::ssl::stream_base::client, + asio::util::get_callback_adapter( + [this, on_finish](int r) { handle_handshake(r, on_finish); })); + } + + void handle_handshake(int r, Context* on_finish) { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to complete handshake: " << cpp_strerror(r) + << dendl; + disconnect(new LambdaContext([r, on_finish](int) { + on_finish->complete(r); })); + return; + } + + m_ssl_enabled = true; + on_finish->complete(0); + } + + void shutdown(int r, Context* on_finish) { + auto http_client = this->m_http_client; + auto cct = http_client->m_cct; + ldout(cct, 15) << "r=" << r << dendl; + + on_finish->complete(r); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::HttpClient: " << this \ + << " " << __func__ << ": " + +template +HttpClient::HttpClient(I* image_ctx, const std::string& url) + : m_cct(image_ctx->cct), m_image_ctx(image_ctx), + m_asio_engine(image_ctx->asio_engine), m_url(url), + m_strand(boost::asio::make_strand(*m_asio_engine)), + m_ssl_context(boost::asio::ssl::context::sslv23_client) { + m_ssl_context.set_default_verify_paths(); +} + +template +void HttpClient::open(Context* on_finish) { + ldout(m_cct, 10) << "url=" << m_url << dendl; + + int r = util::parse_url(m_cct, m_url, &m_url_spec); + if (r < 0) { + lderr(m_cct) << "failed to parse url '" << m_url << "': " << cpp_strerror(r) + << dendl; + on_finish->complete(-EINVAL); + return; + } + + boost::asio::post(m_strand, [this, on_finish]() mutable { + create_http_session(on_finish); }); +} + +template +void HttpClient::close(Context* on_finish) { + boost::asio::post(m_strand, [this, on_finish]() mutable { + shut_down_http_session(on_finish); }); +} + +template +void HttpClient::get_size(uint64_t* size, Context* on_finish) { + ldout(m_cct, 10) << dendl; + + Request req; + req.method(boost::beast::http::verb::head); + + issue( + std::move(req), [this, size, on_finish](int r, Response&& response) { + handle_get_size(r, std::move(response), size, on_finish); + }); +} + +template +void HttpClient::handle_get_size(int r, Response&& response, uint64_t* size, + Context* on_finish) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to retrieve size: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } else if (!response.has_content_length()) { + lderr(m_cct) << "failed to retrieve size: missing content-length" << dendl; + 
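    // For context, get_size() is the simplest consumer of the issue()
    // machinery: it sends a HEAD request and lifts the Content-Length field
    // out of the response. A hypothetical synchronous caller sketch
    // (C_SaferCond is the stock Ceph wait-on-Context helper; http_client
    // stands in for an opened HttpClient<ImageCtx>):
    //
    //   uint64_t size = 0;
    //   C_SaferCond ctx;
    //   http_client->get_size(&size, &ctx);
    //   int r = ctx.wait();  // 0 on success; size holds the Content-Length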
on_finish->complete(-EINVAL); + return; + } + + auto content_length = response[boost::beast::http::field::content_length]; + try { + *size = boost::lexical_cast(content_length); + } catch (boost::bad_lexical_cast&) { + lderr(m_cct) << "invalid content-length in response" << dendl; + on_finish->complete(-EBADMSG); + return; + } + + on_finish->complete(0); +} + +template +void HttpClient::read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) { + ldout(m_cct, 20) << dendl; + + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, librbd::util::get_image_ctx(m_image_ctx), io::AIO_TYPE_READ); + aio_comp->set_request_count(byte_extents.size()); + + // utilize ReadResult to assemble multiple byte extents into a single bl + // since boost::beast doesn't support multipart responses out-of-the-box + io::ReadResult read_result{data}; + aio_comp->read_result = std::move(read_result); + aio_comp->read_result.set_image_extents(byte_extents); + + // issue a range get request for each extent + uint64_t buffer_offset = 0; + for (auto [byte_offset, byte_length] : byte_extents) { + auto ctx = new io::ReadResult::C_ImageReadRequest( + aio_comp, buffer_offset, {{byte_offset, byte_length}}); + buffer_offset += byte_length; + + Request req; + req.method(boost::beast::http::verb::get); + + std::stringstream range; + ceph_assert(byte_length > 0); + range << "bytes=" << byte_offset << "-" << (byte_offset + byte_length - 1); + req.set(boost::beast::http::field::range, range.str()); + + issue( + std::move(req), + [this, byte_offset=byte_offset, byte_length=byte_length, ctx] + (int r, Response&& response) { + handle_read(r, std::move(response), byte_offset, byte_length, &ctx->bl, + ctx); + }); + } +} + +template +void HttpClient::handle_read(int r, Response&& response, + uint64_t byte_offset, uint64_t byte_length, + bufferlist* data, Context* on_finish) { + ldout(m_cct, 20) << "bytes=" << byte_offset << "~" << byte_length << ", " + << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to read requested byte range: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } else if (response.result() != boost::beast::http::status::partial_content) { + lderr(m_cct) << "failed to retrieve requested byte range: HTTP " + << response.result() << dendl; + on_finish->complete(-EIO); + return; + } else if (byte_length != response.body().size()) { + lderr(m_cct) << "unexpected short range read: " + << "wanted=" << byte_length << ", " + << "received=" << response.body().size() << dendl; + on_finish->complete(-EINVAL); + return; + } + + data->clear(); + data->append(response.body()); + on_finish->complete(data->length()); +} + +template +void HttpClient::issue(std::shared_ptr&& work) { + boost::asio::post(m_strand, [this, work=std::move(work)]() mutable { + m_http_session->issue(std::move(work)); }); +} + +template +void HttpClient::create_http_session(Context* on_finish) { + ldout(m_cct, 15) << dendl; + + ceph_assert(m_http_session == nullptr); + switch (m_url_spec.scheme) { + case URL_SCHEME_HTTP: + m_http_session = std::make_unique(this); + break; + case URL_SCHEME_HTTPS: + m_http_session = std::make_unique(this); + break; + default: + ceph_assert(false); + break; + } + + m_http_session->init(on_finish); +} + +template +void HttpClient::shut_down_http_session(Context* on_finish) { + ldout(m_cct, 15) << dendl; + + if (m_http_session == nullptr) { + on_finish->complete(0); + return; + } + + m_http_session->shut_down(on_finish); +} + +} // namespace migration +} // namespace 
librbd + +template class librbd::migration::HttpClient; diff --git a/src/librbd/migration/HttpClient.h b/src/librbd/migration/HttpClient.h new file mode 100644 index 000000000..3997e6159 --- /dev/null +++ b/src/librbd/migration/HttpClient.h @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_HTTP_CLIENT_H +#define CEPH_LIBRBD_MIGRATION_HTTP_CLIENT_H + +#include "include/common_fwd.h" +#include "include/int_types.h" +#include "librbd/io/Types.h" +#include "librbd/migration/HttpProcessorInterface.h" +#include "librbd/migration/Types.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template +class HttpClient { +public: + using EmptyBody = boost::beast::http::empty_body; + using StringBody = boost::beast::http::string_body; + using Request = boost::beast::http::request; + using Response = boost::beast::http::response; + + using RequestPreprocessor = std::function; + + static HttpClient* create(ImageCtxT* image_ctx, const std::string& url) { + return new HttpClient(image_ctx, url); + } + + HttpClient(ImageCtxT* image_ctx, const std::string& url); + HttpClient(const HttpClient&) = delete; + HttpClient& operator=(const HttpClient&) = delete; + + void open(Context* on_finish); + void close(Context* on_finish); + + void get_size(uint64_t* size, Context* on_finish); + + void read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish); + + void set_ignore_self_signed_cert(bool ignore) { + m_ignore_self_signed_cert = ignore; + } + + void set_http_processor(HttpProcessorInterface* http_processor) { + m_http_processor = http_processor; + } + + template + void issue(boost::beast::http::request&& request, + Completion&& completion) { + struct WorkImpl : Work { + HttpClient* http_client; + boost::beast::http::request request; + Completion completion; + + WorkImpl(HttpClient* http_client, + boost::beast::http::request&& request, + Completion&& completion) + : http_client(http_client), request(std::move(request)), + completion(std::move(completion)) { + } + WorkImpl(const WorkImpl&) = delete; + WorkImpl& operator=(const WorkImpl&) = delete; + + bool need_eof() const override { + return request.need_eof(); + } + + bool header_only() const override { + return (request.method() == boost::beast::http::verb::head); + } + + void complete(int r, Response&& response) override { + completion(r, std::move(response)); + } + + void operator()(boost::beast::tcp_stream& stream) override { + preprocess_request(); + + boost::beast::http::async_write( + stream, request, + [http_session=http_client->m_http_session.get(), + work=this->shared_from_this()] + (boost::beast::error_code ec, std::size_t) mutable { + http_session->handle_issue(ec, std::move(work)); + }); + } + + void operator()( + boost::beast::ssl_stream& stream) override { + preprocess_request(); + + boost::beast::http::async_write( + stream, request, + [http_session=http_client->m_http_session.get(), + work=this->shared_from_this()] + (boost::beast::error_code ec, std::size_t) mutable { + http_session->handle_issue(ec, std::move(work)); + }); + } + + void preprocess_request() { + if (http_client->m_http_processor) { + http_client->m_http_processor->process_request(request); + } + } + }; + + initialize_default_fields(request); + 
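  // Everything below type-erases the strongly-typed beast request into a
  // queueable Work item. A hypothetical caller sketch, mirroring how
  // get_size() and read() in HttpClient.cc drive this entry point:
  //
  //   HttpClient<ImageCtx>::Request req;  // empty-body request
  //   req.method(boost::beast::http::verb::get);
  //   req.set(boost::beast::http::field::range, "bytes=0-4095");
  //   http_client->issue(std::move(req),
  //                      [](int r, HttpClient<ImageCtx>::Response&& rsp) {
  //                        // r < 0 on error; rsp.body() holds the payload
  //                      });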
issue(std::make_shared(this, std::move(request), + std::move(completion))); + } + +private: + struct Work; + struct HttpSessionInterface { + virtual ~HttpSessionInterface() {} + + virtual void init(Context* on_finish) = 0; + virtual void shut_down(Context* on_finish) = 0; + + virtual void issue(std::shared_ptr&& work) = 0; + virtual void handle_issue(boost::system::error_code ec, + std::shared_ptr&& work) = 0; + }; + + struct Work : public std::enable_shared_from_this { + virtual ~Work() {} + virtual bool need_eof() const = 0; + virtual bool header_only() const = 0; + virtual void complete(int r, Response&&) = 0; + virtual void operator()(boost::beast::tcp_stream& stream) = 0; + virtual void operator()( + boost::beast::ssl_stream& stream) = 0; + }; + + template struct HttpSession; + struct PlainHttpSession; + struct SslHttpSession; + + CephContext* m_cct; + ImageCtxT* m_image_ctx; + std::shared_ptr m_asio_engine; + std::string m_url; + + UrlSpec m_url_spec; + + bool m_ignore_self_signed_cert = false; + + HttpProcessorInterface* m_http_processor = nullptr; + + boost::asio::strand m_strand; + + boost::asio::ssl::context m_ssl_context; + std::unique_ptr m_http_session; + + template + void initialize_default_fields(Fields& fields) const { + fields.target(m_url_spec.path); + fields.set(boost::beast::http::field::host, m_url_spec.host); + fields.set(boost::beast::http::field::user_agent, + BOOST_BEAST_VERSION_STRING); + } + + void handle_get_size(int r, Response&& response, uint64_t* size, + Context* on_finish); + + void handle_read(int r, Response&& response, uint64_t byte_offset, + uint64_t byte_length, bufferlist* data, Context* on_finish); + + void issue(std::shared_ptr&& work); + + void create_http_session(Context* on_finish); + void shut_down_http_session(Context* on_finish); +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::HttpClient; + +#endif // CEPH_LIBRBD_MIGRATION_HTTP_CLIENT_H diff --git a/src/librbd/migration/HttpProcessorInterface.h b/src/librbd/migration/HttpProcessorInterface.h new file mode 100644 index 000000000..3d9af88bd --- /dev/null +++ b/src/librbd/migration/HttpProcessorInterface.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_HTTP_PROCESSOR_INTERFACE_H +#define CEPH_LIBRBD_MIGRATION_HTTP_PROCESSOR_INTERFACE_H + +#include +#include + +namespace librbd { +namespace migration { + +struct HttpProcessorInterface { + using EmptyBody = boost::beast::http::empty_body; + using EmptyRequest = boost::beast::http::request; + + virtual ~HttpProcessorInterface() { + } + + virtual void process_request(EmptyRequest& request) = 0; + +}; + +} // namespace migration +} // namespace librbd + +#endif // CEPH_LIBRBD_MIGRATION_HTTP_PROCESSOR_INTERFACE_H diff --git a/src/librbd/migration/HttpStream.cc b/src/librbd/migration/HttpStream.cc new file mode 100644 index 000000000..fa3cc0032 --- /dev/null +++ b/src/librbd/migration/HttpStream.cc @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/HttpStream.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/Utils.h" +#include "librbd/migration/HttpClient.h" +#include + +namespace librbd { +namespace migration { + +namespace { + +const std::string URL_KEY {"url"}; + +} // anonymous namespace + +#define dout_subsys 
ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::HttpStream: " << this \ + << " " << __func__ << ": " + +template +HttpStream::HttpStream(I* image_ctx, const json_spirit::mObject& json_object) + : m_image_ctx(image_ctx), m_cct(image_ctx->cct), + m_asio_engine(image_ctx->asio_engine), m_json_object(json_object) { +} + +template +HttpStream::~HttpStream() { +} + +template +void HttpStream::open(Context* on_finish) { + auto& url_value = m_json_object[URL_KEY]; + if (url_value.type() != json_spirit::str_type) { + lderr(m_cct) << "failed to locate '" << URL_KEY << "' key" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_url = url_value.get_str(); + ldout(m_cct, 10) << "url=" << m_url << dendl; + + m_http_client.reset(HttpClient::create(m_image_ctx, m_url)); + m_http_client->open(on_finish); +} + +template +void HttpStream::close(Context* on_finish) { + ldout(m_cct, 10) << dendl; + + if (!m_http_client) { + on_finish->complete(0); + return; + } + + m_http_client->close(on_finish); +} + +template +void HttpStream::get_size(uint64_t* size, Context* on_finish) { + ldout(m_cct, 10) << dendl; + + m_http_client->get_size(size, on_finish); +} + +template +void HttpStream::read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) { + ldout(m_cct, 20) << "byte_extents=" << byte_extents << dendl; + + m_http_client->read(std::move(byte_extents), data, on_finish); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::HttpStream; diff --git a/src/librbd/migration/HttpStream.h b/src/librbd/migration/HttpStream.h new file mode 100644 index 000000000..01a583714 --- /dev/null +++ b/src/librbd/migration/HttpStream.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_HTTP_STREAM_H +#define CEPH_LIBRBD_MIGRATION_HTTP_STREAM_H + +#include "include/int_types.h" +#include "librbd/migration/StreamInterface.h" +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template class HttpClient; + +template +class HttpStream : public StreamInterface { +public: + static HttpStream* create(ImageCtxT* image_ctx, + const json_spirit::mObject& json_object) { + return new HttpStream(image_ctx, json_object); + } + + HttpStream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object); + ~HttpStream() override; + + HttpStream(const HttpStream&) = delete; + HttpStream& operator=(const HttpStream&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_size(uint64_t* size, Context* on_finish) override; + + void read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) override; + +private: + using HttpResponse = boost::beast::http::response< + boost::beast::http::string_body>; + + ImageCtxT* m_image_ctx; + CephContext* m_cct; + std::shared_ptr m_asio_engine; + json_spirit::mObject m_json_object; + + std::string m_url; + + std::unique_ptr> m_http_client; + +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::HttpStream; + +#endif // CEPH_LIBRBD_MIGRATION_HTTP_STREAM_H diff --git a/src/librbd/migration/ImageDispatch.cc b/src/librbd/migration/ImageDispatch.cc new file mode 100644 index 000000000..1ae143d78 --- /dev/null +++ b/src/librbd/migration/ImageDispatch.cc @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/ImageDispatch.h" +#include "include/neorados/RADOS.hpp" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/migration/FormatInterface.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::ImageDispatch: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace migration { + +template +ImageDispatch::ImageDispatch(I* image_ctx, + std::unique_ptr format) + : m_image_ctx(image_ctx), m_format(std::move(format)) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "ictx=" << image_ctx << dendl; +} + +template +void ImageDispatch::shut_down(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + on_finish->complete(0); +} + +template +bool ImageDispatch::read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + return m_format->read(aio_comp, io_context->read_snap().value_or(CEPH_NOSNAP), + std::move(image_extents), std::move(read_result), + op_flags, read_flags, parent_trace); +} + +template +bool ImageDispatch::write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + lderr(cct) << dendl; + + fail_io(-EROFS, aio_comp, dispatch_result); + return true; +} + +template +bool ImageDispatch::discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + lderr(cct) << dendl; + + fail_io(-EROFS, aio_comp, dispatch_result); + return true; +} + +template +bool ImageDispatch::write_same( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + lderr(cct) << dendl; + + fail_io(-EROFS, aio_comp, dispatch_result); + return true; +} + +template +bool ImageDispatch::compare_and_write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + lderr(cct) << dendl; + + fail_io(-EROFS, aio_comp, dispatch_result); + return true; +} + +template +bool ImageDispatch::flush( + io::AioCompletion* aio_comp, io::FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, 
Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + aio_comp->set_request_count(0); + return true; +} + +template +bool ImageDispatch::list_snaps( + io::AioCompletion* aio_comp, io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << dendl; + + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + + aio_comp->set_request_count(1); + auto ctx = new io::C_AioRequest(aio_comp); + + m_format->list_snaps(std::move(image_extents), std::move(snap_ids), + list_snaps_flags, snapshot_delta, parent_trace, + ctx); + return true; +} + +template +void ImageDispatch::fail_io(int r, io::AioCompletion* aio_comp, + io::DispatchResult* dispatch_result) { + *dispatch_result = io::DISPATCH_RESULT_COMPLETE; + aio_comp->fail(r); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::ImageDispatch; diff --git a/src/librbd/migration/ImageDispatch.h b/src/librbd/migration/ImageDispatch.h new file mode 100644 index 000000000..cd96141c0 --- /dev/null +++ b/src/librbd/migration/ImageDispatch.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_IMAGE_DISPATCH_H +#define CEPH_LIBRBD_MIGRATION_IMAGE_DISPATCH_H + +#include "librbd/io/ImageDispatchInterface.h" +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace migration { + +struct FormatInterface; + +template +class ImageDispatch : public io::ImageDispatchInterface { +public: + static ImageDispatch* create(ImageCtxT* image_ctx, + std::unique_ptr source) { + return new ImageDispatch(image_ctx, std::move(source)); + } + + ImageDispatch(ImageCtxT* image_ctx, std::unique_ptr source); + + void shut_down(Context* on_finish) override; + + io::ImageDispatchLayer get_dispatch_layer() const override { + return io::IMAGE_DISPATCH_LAYER_MIGRATION; + } + + bool read( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + io::ReadResult &&read_result, IOContext io_context, int op_flags, + int read_flags, const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool discard( + io::AioCompletion* aio_comp, io::Extents &&image_extents, + uint32_t discard_granularity_bytes, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool write_same( + io::AioCompletion* aio_comp, io::Extents &&image_extents, bufferlist &&bl, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool compare_and_write( + io::AioCompletion* aio_comp, io::Extents 
&&image_extents, + bufferlist &&cmp_bl, bufferlist &&bl, uint64_t *mismatch_offset, + int op_flags, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + bool flush( + io::AioCompletion* aio_comp, io::FlushSource flush_source, + const ZTracer::Trace &parent_trace, uint64_t tid, + std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool list_snaps( + io::AioCompletion* aio_comp, io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, const ZTracer::Trace &parent_trace, + uint64_t tid, std::atomic* image_dispatch_flags, + io::DispatchResult* dispatch_result, Context** on_finish, + Context* on_dispatched) override; + + bool invalidate_cache(Context* on_finish) override { + return false; + } + +private: + ImageCtxT* m_image_ctx; + std::unique_ptr m_format; + + void fail_io(int r, io::AioCompletion* aio_comp, + io::DispatchResult* dispatch_result); + +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::ImageDispatch; + +#endif // CEPH_LIBRBD_MIGRATION_IMAGE_DISPATCH_H diff --git a/src/librbd/migration/NativeFormat.cc b/src/librbd/migration/NativeFormat.cc new file mode 100644 index 000000000..51248b95e --- /dev/null +++ b/src/librbd/migration/NativeFormat.cc @@ -0,0 +1,309 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/NativeFormat.h" +#include "include/neorados/RADOS.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "json_spirit/json_spirit.h" +#include "boost/lexical_cast.hpp" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::NativeFormat: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace migration { + +namespace { + +const std::string TYPE_KEY{"type"}; +const std::string POOL_ID_KEY{"pool_id"}; +const std::string POOL_NAME_KEY{"pool_name"}; +const std::string POOL_NAMESPACE_KEY{"pool_namespace"}; +const std::string IMAGE_NAME_KEY{"image_name"}; +const std::string IMAGE_ID_KEY{"image_id"}; +const std::string SNAP_NAME_KEY{"snap_name"}; +const std::string SNAP_ID_KEY{"snap_id"}; + +} // anonymous namespace + +template +std::string NativeFormat::build_source_spec( + int64_t pool_id, const std::string& pool_namespace, + const std::string& image_name, const std::string& image_id) { + json_spirit::mObject source_spec; + source_spec[TYPE_KEY] = "native"; + source_spec[POOL_ID_KEY] = pool_id; + source_spec[POOL_NAMESPACE_KEY] = pool_namespace; + source_spec[IMAGE_NAME_KEY] = image_name; + if (!image_id.empty()) { + source_spec[IMAGE_ID_KEY] = image_id; + } + return json_spirit::write(source_spec); +} + +template +NativeFormat::NativeFormat( + I* image_ctx, const json_spirit::mObject& json_object, bool import_only) + : m_image_ctx(image_ctx), m_json_object(json_object), + m_import_only(import_only) { +} + +template +void NativeFormat::open(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto& pool_name_val = m_json_object[POOL_NAME_KEY]; + if (pool_name_val.type() == json_spirit::str_type) { + 
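    // The keys consumed in this function line up with a "native" source-spec
    // document of the following shape (illustrative values; pool_name/pool_id
    // and snap_name/snap_id are mutually exclusive pairs, as enforced below):
    //
    //   {
    //     "type": "native",
    //     "pool_name": "rbd",             // or "pool_id"
    //     "pool_namespace": "",
    //     "image_name": "source-image",
    //     "image_id": "1035fb8d2ad2",     // optional
    //     "snap_name": "migration-snap"   // or "snap_id"; required for import
    //   }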
librados::Rados rados(m_image_ctx->md_ctx); + librados::IoCtx io_ctx; + int r = rados.ioctx_create(pool_name_val.get_str().c_str(), io_ctx); + if (r < 0 ) { + lderr(cct) << "invalid pool name" << dendl; + on_finish->complete(r); + return; + } + + m_pool_id = io_ctx.get_id(); + } else if (pool_name_val.type() != json_spirit::null_type) { + lderr(cct) << "invalid pool name" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& pool_id_val = m_json_object[POOL_ID_KEY]; + if (m_pool_id != -1 && pool_id_val.type() != json_spirit::null_type) { + lderr(cct) << "cannot specify both pool name and pool id" << dendl; + on_finish->complete(-EINVAL); + return; + } else if (pool_id_val.type() == json_spirit::int_type) { + m_pool_id = pool_id_val.get_int64(); + } else if (pool_id_val.type() == json_spirit::str_type) { + try { + m_pool_id = boost::lexical_cast(pool_id_val.get_str()); + } catch (boost::bad_lexical_cast &) { + } + } + + if (m_pool_id == -1) { + lderr(cct) << "missing or invalid pool id" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& pool_namespace_val = m_json_object[POOL_NAMESPACE_KEY]; + if (pool_namespace_val.type() == json_spirit::str_type) { + m_pool_namespace = pool_namespace_val.get_str(); + } else if (pool_namespace_val.type() != json_spirit::null_type) { + lderr(cct) << "invalid pool namespace" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& image_name_val = m_json_object[IMAGE_NAME_KEY]; + if (image_name_val.type() != json_spirit::str_type) { + lderr(cct) << "missing or invalid image name" << dendl; + on_finish->complete(-EINVAL); + return; + } + m_image_name = image_name_val.get_str(); + + auto& image_id_val = m_json_object[IMAGE_ID_KEY]; + if (image_id_val.type() == json_spirit::str_type) { + m_image_id = image_id_val.get_str(); + } else if (image_id_val.type() != json_spirit::null_type) { + lderr(cct) << "invalid image id" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& snap_name_val = m_json_object[SNAP_NAME_KEY]; + if (snap_name_val.type() == json_spirit::str_type) { + m_snap_name = snap_name_val.get_str(); + } else if (snap_name_val.type() != json_spirit::null_type) { + lderr(cct) << "invalid snap name" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& snap_id_val = m_json_object[SNAP_ID_KEY]; + if (!m_snap_name.empty() && snap_id_val.type() != json_spirit::null_type) { + lderr(cct) << "cannot specify both snap name and snap id" << dendl; + on_finish->complete(-EINVAL); + return; + } else if (snap_id_val.type() == json_spirit::str_type) { + try { + m_snap_id = boost::lexical_cast(snap_id_val.get_str()); + } catch (boost::bad_lexical_cast &) { + } + } else if (snap_id_val.type() == json_spirit::int_type) { + m_snap_id = snap_id_val.get_uint64(); + } + + if (snap_id_val.type() != json_spirit::null_type && + m_snap_id == CEPH_NOSNAP) { + lderr(cct) << "invalid snap id" << dendl; + on_finish->complete(-EINVAL); + return; + } + + // snapshot is required for import to keep source read-only + if (m_import_only && m_snap_name.empty() && m_snap_id == CEPH_NOSNAP) { + lderr(cct) << "snapshot required for import" << dendl; + on_finish->complete(-EINVAL); + return; + } + + // TODO add support for external clusters + librados::IoCtx io_ctx; + int r = util::create_ioctx(m_image_ctx->md_ctx, "source image", + m_pool_id, m_pool_namespace, &io_ctx); + if (r < 0) { + on_finish->complete(r); + return; + } + + m_image_ctx->md_ctx.dup(io_ctx); + m_image_ctx->data_ctx.dup(io_ctx); + m_image_ctx->name = 
m_image_name; + + uint64_t flags = 0; + if (m_image_id.empty() && !m_import_only) { + flags |= OPEN_FLAG_OLD_FORMAT; + } else { + m_image_ctx->id = m_image_id; + } + + if (m_image_ctx->child != nullptr) { + // set rados flags for reading the parent image + if (m_image_ctx->child->config.template get_val("rbd_balance_parent_reads")) { + m_image_ctx->set_read_flag(librados::OPERATION_BALANCE_READS); + } else if (m_image_ctx->child->config.template get_val("rbd_localize_parent_reads")) { + m_image_ctx->set_read_flag(librados::OPERATION_LOCALIZE_READS); + } + } + + // open the source RBD image + on_finish = new LambdaContext([this, on_finish](int r) { + handle_open(r, on_finish); }); + m_image_ctx->state->open(flags, on_finish); +} + +template +void NativeFormat::handle_open(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to open image: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + if (m_snap_id == CEPH_NOSNAP && m_snap_name.empty()) { + on_finish->complete(0); + return; + } + + if (!m_snap_name.empty()) { + std::shared_lock image_locker{m_image_ctx->image_lock}; + m_snap_id = m_image_ctx->get_snap_id(cls::rbd::UserSnapshotNamespace{}, + m_snap_name); + } + + if (m_snap_id == CEPH_NOSNAP) { + lderr(cct) << "failed to locate snapshot " << m_snap_name << dendl; + on_finish = new LambdaContext([on_finish](int) { + on_finish->complete(-ENOENT); }); + m_image_ctx->state->close(on_finish); + return; + } + + on_finish = new LambdaContext([this, on_finish](int r) { + handle_snap_set(r, on_finish); }); + m_image_ctx->state->snap_set(m_snap_id, on_finish); +} + +template +void NativeFormat::handle_snap_set(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to set snapshot " << m_snap_id << ": " + << cpp_strerror(r) << dendl; + on_finish = new LambdaContext([r, on_finish](int) { + on_finish->complete(r); }); + m_image_ctx->state->close(on_finish); + return; + } + + on_finish->complete(0); +} + +template +void NativeFormat::close(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // the native librbd::image::CloseRequest handles all cleanup + on_finish->complete(0); +} + +template +void NativeFormat::get_snapshots(SnapInfos* snap_infos, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + m_image_ctx->image_lock.lock_shared(); + *snap_infos = m_image_ctx->snap_info; + m_image_ctx->image_lock.unlock_shared(); + + on_finish->complete(0); +} + +template +void NativeFormat::get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + m_image_ctx->image_lock.lock_shared(); + *size = m_image_ctx->get_image_size(snap_id); + m_image_ctx->image_lock.unlock_shared(); + + + on_finish->complete(0); +} + +template +void NativeFormat::list_snaps(io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + auto aio_comp = io::AioCompletion::create_and_start( + on_finish, util::get_image_ctx(m_image_ctx), io::AIO_TYPE_GENERIC); + auto req = io::ImageDispatchSpec::create_list_snaps( + *m_image_ctx, io::IMAGE_DISPATCH_LAYER_MIGRATION, aio_comp, + 
std::move(image_extents), io::ImageArea::DATA, std::move(snap_ids), + list_snaps_flags, snapshot_delta, {}); + req->send(); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::NativeFormat; diff --git a/src/librbd/migration/NativeFormat.h b/src/librbd/migration/NativeFormat.h new file mode 100644 index 000000000..e58c04121 --- /dev/null +++ b/src/librbd/migration/NativeFormat.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_NATIVE_FORMAT_H +#define CEPH_LIBRBD_MIGRATION_NATIVE_FORMAT_H + +#include "include/int_types.h" +#include "librbd/Types.h" +#include "librbd/migration/FormatInterface.h" +#include "json_spirit/json_spirit.h" +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template +class NativeFormat : public FormatInterface { +public: + static std::string build_source_spec(int64_t pool_id, + const std::string& pool_namespace, + const std::string& image_name, + const std::string& image_id); + + static NativeFormat* create(ImageCtxT* image_ctx, + const json_spirit::mObject& json_object, + bool import_only) { + return new NativeFormat(image_ctx, json_object, import_only); + } + + NativeFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + bool import_only); + NativeFormat(const NativeFormat&) = delete; + NativeFormat& operator=(const NativeFormat&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override; + void get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) override; + + bool read(io::AioCompletion* aio_comp, uint64_t snap_id, + io::Extents&& image_extents, io::ReadResult&& read_result, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) override { + return false; + } + + void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids, + int list_snaps_flags, io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) override; + +private: + ImageCtxT* m_image_ctx; + json_spirit::mObject m_json_object; + bool m_import_only; + + int64_t m_pool_id = -1; + std::string m_pool_namespace; + std::string m_image_name; + std::string m_image_id; + std::string m_snap_name; + uint64_t m_snap_id = CEPH_NOSNAP; + + void handle_open(int r, Context* on_finish); + void handle_snap_set(int r, Context* on_finish); + +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::NativeFormat; + +#endif // CEPH_LIBRBD_MIGRATION_NATIVE_FORMAT_H diff --git a/src/librbd/migration/OpenSourceImageRequest.cc b/src/librbd/migration/OpenSourceImageRequest.cc new file mode 100644 index 000000000..8abdedf33 --- /dev/null +++ b/src/librbd/migration/OpenSourceImageRequest.cc @@ -0,0 +1,249 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/OpenSourceImageRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/io/ImageDispatcher.h" +#include "librbd/migration/ImageDispatch.h" +#include "librbd/migration/NativeFormat.h" +#include "librbd/migration/SourceSpecBuilder.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << 
"librbd::migration::OpenSourceImageRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace migration { + +template +OpenSourceImageRequest::OpenSourceImageRequest( + librados::IoCtx& io_ctx, I* dst_image_ctx, uint64_t src_snap_id, + const MigrationInfo &migration_info, I** src_image_ctx, Context* on_finish) + : m_cct(reinterpret_cast(io_ctx.cct())), m_io_ctx(io_ctx), + m_dst_image_ctx(dst_image_ctx), m_src_snap_id(src_snap_id), + m_migration_info(migration_info), m_src_image_ctx(src_image_ctx), + m_on_finish(on_finish) { + ldout(m_cct, 10) << dendl; +} + +template +void OpenSourceImageRequest::send() { + open_source(); +} + +template +void OpenSourceImageRequest::open_source() { + ldout(m_cct, 10) << dendl; + + // note that all source image ctx properties are placeholders + *m_src_image_ctx = I::create("", "", CEPH_NOSNAP, m_io_ctx, true); + auto src_image_ctx = *m_src_image_ctx; + src_image_ctx->child = m_dst_image_ctx; + + // use default layout values (can be overridden by source layers later) + src_image_ctx->order = 22; + src_image_ctx->layout = file_layout_t(); + src_image_ctx->layout.stripe_count = 1; + src_image_ctx->layout.stripe_unit = 1ULL << src_image_ctx->order; + src_image_ctx->layout.object_size = 1Ull << src_image_ctx->order; + src_image_ctx->layout.pool_id = -1; + + bool import_only = true; + auto source_spec = m_migration_info.source_spec; + if (source_spec.empty()) { + // implies legacy migration from RBD image in same cluster + source_spec = NativeFormat::build_source_spec( + m_migration_info.pool_id, m_migration_info.pool_namespace, + m_migration_info.image_name, m_migration_info.image_id); + import_only = false; + } + + ldout(m_cct, 15) << "source_spec=" << source_spec << ", " + << "source_snap_id=" << m_src_snap_id << ", " + << "import_only=" << import_only << dendl; + + SourceSpecBuilder source_spec_builder{src_image_ctx}; + json_spirit::mObject source_spec_object; + int r = source_spec_builder.parse_source_spec(source_spec, + &source_spec_object); + if (r < 0) { + lderr(m_cct) << "failed to parse migration source-spec:" << cpp_strerror(r) + << dendl; + (*m_src_image_ctx)->state->close(); + finish(r); + return; + } + + r = source_spec_builder.build_format(source_spec_object, import_only, + &m_format); + if (r < 0) { + lderr(m_cct) << "failed to build migration format handler: " + << cpp_strerror(r) << dendl; + (*m_src_image_ctx)->state->close(); + finish(r); + return; + } + + auto ctx = util::create_context_callback< + OpenSourceImageRequest, + &OpenSourceImageRequest::handle_open_source>(this); + m_format->open(ctx); +} + +template +void OpenSourceImageRequest::handle_open_source(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to open migration source: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + get_image_size(); +} + +template +void OpenSourceImageRequest::get_image_size() { + ldout(m_cct, 10) << dendl; + + auto ctx = util::create_context_callback< + OpenSourceImageRequest, + &OpenSourceImageRequest::handle_get_image_size>(this); + m_format->get_image_size(CEPH_NOSNAP, &m_image_size, ctx); +} + +template +void OpenSourceImageRequest::handle_get_image_size(int r) { + ldout(m_cct, 10) << "r=" << r << ", " + << "image_size=" << m_image_size << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to retrieve image size: " << cpp_strerror(r) + << dendl; + close_image(r); + return; + } + + auto src_image_ctx = *m_src_image_ctx; + src_image_ctx->image_lock.lock(); + 
src_image_ctx->size = m_image_size; + src_image_ctx->image_lock.unlock(); + + get_snapshots(); +} + +template +void OpenSourceImageRequest::get_snapshots() { + ldout(m_cct, 10) << dendl; + + auto ctx = util::create_context_callback< + OpenSourceImageRequest, + &OpenSourceImageRequest::handle_get_snapshots>(this); + m_format->get_snapshots(&m_snap_infos, ctx); +} + +template +void OpenSourceImageRequest::handle_get_snapshots(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to retrieve snapshots: " << cpp_strerror(r) + << dendl; + close_image(r); + return; + } + + // copy snapshot metadata to image ctx + auto src_image_ctx = *m_src_image_ctx; + src_image_ctx->image_lock.lock(); + + src_image_ctx->snaps.clear(); + src_image_ctx->snap_info.clear(); + src_image_ctx->snap_ids.clear(); + + ::SnapContext snapc; + for (auto it = m_snap_infos.rbegin(); it != m_snap_infos.rend(); ++it) { + auto& [snap_id, snap_info] = *it; + snapc.snaps.push_back(snap_id); + + ldout(m_cct, 10) << "adding snap: ns=" << snap_info.snap_namespace << ", " + << "name=" << snap_info.name << ", " + << "id=" << snap_id << dendl; + src_image_ctx->add_snap( + snap_info.snap_namespace, snap_info.name, snap_id, + snap_info.size, snap_info.parent, snap_info.protection_status, + snap_info.flags, snap_info.timestamp); + } + if (!snapc.snaps.empty()) { + snapc.seq = snapc.snaps[0]; + } + src_image_ctx->snapc = snapc; + + ldout(m_cct, 15) << "read snap id: " << m_src_snap_id << ", " + << "write snapc={" + << "seq=" << snapc.seq << ", " + << "snaps=" << snapc.snaps << "}" << dendl; + + // ensure data_ctx and data_io_context are pointing to correct snapshot + if (m_src_snap_id != CEPH_NOSNAP) { + int r = src_image_ctx->snap_set(m_src_snap_id); + if (r < 0) { + src_image_ctx->image_lock.unlock(); + + lderr(m_cct) << "error setting source image snap id: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + } + + src_image_ctx->image_lock.unlock(); + + finish(0); +} + +template +void OpenSourceImageRequest::close_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + auto ctx = new LambdaContext([this, r](int) { + finish(r); + }); + (*m_src_image_ctx)->state->close(ctx); +} + +template +void OpenSourceImageRequest::register_image_dispatch() { + ldout(m_cct, 10) << dendl; + + // intercept any IO requests to the source image + auto io_image_dispatch = ImageDispatch::create( + *m_src_image_ctx, std::move(m_format)); + (*m_src_image_ctx)->io_image_dispatcher->register_dispatch(io_image_dispatch); +} + +template +void OpenSourceImageRequest::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + *m_src_image_ctx = nullptr; + } else { + register_image_dispatch(); + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::OpenSourceImageRequest; diff --git a/src/librbd/migration/OpenSourceImageRequest.h b/src/librbd/migration/OpenSourceImageRequest.h new file mode 100644 index 000000000..f0dab3ad9 --- /dev/null +++ b/src/librbd/migration/OpenSourceImageRequest.h @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_OPEN_SOURCE_IMAGE_REQUEST_H +#define CEPH_LIBRBD_MIGRATION_OPEN_SOURCE_IMAGE_REQUEST_H + +#include "include/rados/librados_fwd.hpp" +#include "librbd/Types.h" +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace migration { + +struct 
FormatInterface;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class OpenSourceImageRequest {
+public:
+  static OpenSourceImageRequest* create(librados::IoCtx& io_ctx,
+                                        ImageCtxT* destination_image_ctx,
+                                        uint64_t src_snap_id,
+                                        const MigrationInfo &migration_info,
+                                        ImageCtxT** source_image_ctx,
+                                        Context* on_finish) {
+    return new OpenSourceImageRequest(io_ctx, destination_image_ctx,
+                                      src_snap_id, migration_info,
+                                      source_image_ctx, on_finish);
+  }
+
+  OpenSourceImageRequest(librados::IoCtx& io_ctx,
+                         ImageCtxT* destination_image_ctx,
+                         uint64_t src_snap_id,
+                         const MigrationInfo &migration_info,
+                         ImageCtxT** source_image_ctx,
+                         Context* on_finish);
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * OPEN_SOURCE
+   *    |
+   *    v
+   * GET_IMAGE_SIZE * * * * * * *
+   *    |                       *
+   *    v                       v
+   * GET_SNAPSHOTS * * * * > CLOSE_IMAGE
+   *    |                       |
+   *    v                       |
+   * <finish> <----------------/
+   *
+   * @endverbatim
+   */
+
+  typedef std::map<uint64_t, SnapInfo> SnapInfos;
+
+  CephContext* m_cct;
+  librados::IoCtx& m_io_ctx;
+  ImageCtxT* m_dst_image_ctx;
+  uint64_t m_src_snap_id;
+  MigrationInfo m_migration_info;
+  ImageCtxT** m_src_image_ctx;
+  Context* m_on_finish;
+
+  std::unique_ptr<FormatInterface> m_format;
+
+  uint64_t m_image_size = 0;
+  SnapInfos m_snap_infos;
+
+  void open_source();
+  void handle_open_source(int r);
+
+  void get_image_size();
+  void handle_get_image_size(int r);
+
+  void get_snapshots();
+  void handle_get_snapshots(int r);
+
+  void close_image(int r);
+
+  void register_image_dispatch();
+
+  void finish(int r);
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::OpenSourceImageRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_OPEN_SOURCE_IMAGE_REQUEST_H
diff --git a/src/librbd/migration/QCOW.h b/src/librbd/migration/QCOW.h
new file mode 100644
index 000000000..23401e515
--- /dev/null
+++ b/src/librbd/migration/QCOW.h
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/* Based on QEMU block/qcow.cc and block/qcow2.h, which has this license: */
+
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ + +#ifndef CEPH_LIBRBD_MIGRATION_QCOW2_H +#define CEPH_LIBRBD_MIGRATION_QCOW2_H + +#include "include/ceph_assert.h" +#include "include/int_types.h" +#include "librbd/migration/QCOW.h" + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 +#define QCOW_CRYPT_LUKS 2 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* Field widths in qcow2 mean normal cluster offsets cannot reach + * 64PB; depending on cluster size, compressed clusters can have a + * smaller limit (64PB for up to 16k clusters, then ramps down to + * 512TB for 2M clusters). */ +#define QCOW_MAX_CLUSTER_OFFSET ((1ULL << 56) - 1) + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE (1ULL << 23) + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE (1ULL << 25) + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) + +/* Maximum amount of extra data per snapshot table entry to accept */ +#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024 + +/* Bitmap header extension constraints */ +#define QCOW2_MAX_BITMAPS 65535 +#define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS) + +/* Maximum of parallel sub-request per guest request */ +#define QCOW2_MAX_WORKERS 8 + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED (1ULL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1ULL << 0) + +#define QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER 32 + +/* The subcluster X [0..31] is allocated */ +#define QCOW_OFLAG_SUB_ALLOC(X) (1ULL << (X)) +/* The subcluster X [0..31] reads as zeroes */ +#define QCOW_OFLAG_SUB_ZERO(X) (QCOW_OFLAG_SUB_ALLOC(X) << 32) +/* Subclusters [X, Y) (0 <= X <= Y <= 32) are allocated */ +#define QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) \ + (QCOW_OFLAG_SUB_ALLOC(Y) - QCOW_OFLAG_SUB_ALLOC(X)) +/* Subclusters [X, Y) (0 <= X <= Y <= 32) read as zeroes */ +#define QCOW_OFLAG_SUB_ZERO_RANGE(X, Y) \ + (QCOW_OFLAG_SUB_ALLOC_RANGE(X, Y) << 32) +/* L2 entry bitmap with all allocation bits set */ +#define QCOW_L2_BITMAP_ALL_ALLOC (QCOW_OFLAG_SUB_ALLOC_RANGE(0, 32)) +/* L2 entry bitmap with all "read as zeroes" bits set */ +#define QCOW_L2_BITMAP_ALL_ZEROES (QCOW_OFLAG_SUB_ZERO_RANGE(0, 32)) + +/* Size of normal and extended L2 entries */ +#define QCOW_L2E_SIZE_NORMAL (sizeof(uint64_t)) +#define QCOW_L2E_SIZE_EXTENDED (sizeof(uint64_t) * 2) + +/* Size of L1 table entries */ +#define QCOW_L1E_SIZE (sizeof(uint64_t)) + +/* Size of reftable entries */ +#define QCOW_REFTABLE_ENTRY_SIZE (sizeof(uint64_t)) + +#define QCOW_MIN_CLUSTER_BITS 9 +#define QCOW_MAX_CLUSTER_BITS 21 + +/* Defined in the qcow2 spec (compressed cluster descriptor) */ +#define QCOW2_COMPRESSED_SECTOR_SIZE 512U +#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) + +#define QCOW_L2_CACHE_SIZE 16 + +/* Must be at least 2 to cover COW */ +#define QCOW_MIN_L2_CACHE_SIZE 2 /* cache entries */ + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define QCOW_MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ + +#define 
QCOW_DEFAULT_L2_CACHE_MAX_SIZE (1ULL << 25) +#define QCOW_DEFAULT_CACHE_CLEAN_INTERVAL 600 /* seconds */ + +#define QCOW_DEFAULT_CLUSTER_SIZE 65536 + +#define QCOW2_OPT_DATA_FILE "data-file" +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" +#define QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY "overlap-check.bitmap-directory" +#define QCOW2_OPT_CACHE_SIZE "cache-size" +#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size" +#define QCOW2_OPT_L2_CACHE_ENTRY_SIZE "l2-cache-entry-size" +#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size" +#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval" + +typedef struct QCowHeaderProbe { + uint32_t magic; + uint32_t version; +} __attribute__((__packed__)) QCowHeaderProbe; + +typedef struct QCowHeaderV1 +{ + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint16_t padding; + uint32_t crypt_method; + uint64_t l1_table_offset; +} __attribute__((__packed__)) QCowHeaderV1; + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + uint32_t l1_size; /* XXX: save number of clusters instead ? 
*/ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; + + /* The following fields are only valid for version >= 3 */ + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + uint32_t refcount_order; + uint32_t header_length; + + /* Additional fields */ + uint8_t compression_type; + + /* header must be a multiple of 8 */ + uint8_t padding[7]; +} __attribute__((__packed__)) QCowHeader; + +typedef struct QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} __attribute__((__packed__)) QCowSnapshotHeader; + +typedef struct QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; + uint64_t icount; +} __attribute__((__packed__)) QCowSnapshotExtraData; + + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint64_t disk_size; + uint64_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; + /* icount value for the moment when snapshot was taken */ + uint64_t icount; + /* Size of all extra data, including QCowSnapshotExtraData if available */ + uint32_t extra_data_size; + /* Data beyond QCowSnapshotExtraData, if any */ + void *unknown_extra_data; +} QCowSnapshot; + +typedef struct Qcow2CryptoHeaderExtension { + uint64_t offset; + uint64_t length; +} __attribute__((__packed__)) Qcow2CryptoHeaderExtension; + +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { + QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, + QCOW2_FEAT_TYPE_COMPATIBLE = 1, + QCOW2_FEAT_TYPE_AUTOCLEAR = 2, +}; + +/* Incompatible feature bits */ +enum { + QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_CORRUPT_BITNR = 1, + QCOW2_INCOMPAT_DATA_FILE_BITNR = 2, + QCOW2_INCOMPAT_COMPRESSION_BITNR = 3, + QCOW2_INCOMPAT_EXTL2_BITNR = 4, + QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, + QCOW2_INCOMPAT_DATA_FILE = 1 << QCOW2_INCOMPAT_DATA_FILE_BITNR, + QCOW2_INCOMPAT_COMPRESSION = 1 << QCOW2_INCOMPAT_COMPRESSION_BITNR, + QCOW2_INCOMPAT_EXTL2 = 1 << QCOW2_INCOMPAT_EXTL2_BITNR, + + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY + | QCOW2_INCOMPAT_CORRUPT + | QCOW2_INCOMPAT_DATA_FILE + | QCOW2_INCOMPAT_COMPRESSION + | QCOW2_INCOMPAT_EXTL2, +}; + +/* Compatible feature bits */ +enum { + QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, + QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + + QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +/* Autoclear feature bits */ +enum { + QCOW2_AUTOCLEAR_BITMAPS_BITNR = 0, + QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR = 1, + QCOW2_AUTOCLEAR_BITMAPS = 1 << QCOW2_AUTOCLEAR_BITMAPS_BITNR, + QCOW2_AUTOCLEAR_DATA_FILE_RAW = 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR, + + QCOW2_AUTOCLEAR_MASK = QCOW2_AUTOCLEAR_BITMAPS + | QCOW2_AUTOCLEAR_DATA_FILE_RAW, +}; + +enum qcow2_discard_type { + QCOW2_DISCARD_NEVER = 0, + QCOW2_DISCARD_ALWAYS, + QCOW2_DISCARD_REQUEST, + QCOW2_DISCARD_SNAPSHOT, + QCOW2_DISCARD_OTHER, + QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { + uint8_t type; + uint8_t bit; + 
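/* feature name, zero padded; together with the 1-byte type and 1-byte
+     bit fields this makes each feature-name-table entry 48 bytes, as laid
+     out in the QCOW2 spec */
+ 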
char name[46]; +} __attribute__((__packed__)) Qcow2Feature; + +typedef struct Qcow2DiscardRegion { + uint64_t offset; + uint64_t bytes; +} Qcow2DiscardRegion; + +typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, + uint64_t index); +typedef void Qcow2SetRefcountFunc(void *refcount_array, + uint64_t index, uint64_t value); + +typedef struct Qcow2BitmapHeaderExt { + uint32_t nb_bitmaps; + uint32_t reserved32; + uint64_t bitmap_directory_size; + uint64_t bitmap_directory_offset; +} __attribute__((__packed__)) Qcow2BitmapHeaderExt; + +#define QCOW_RC_CACHE_SIZE QCOW_L2_CACHE_SIZE; + +typedef struct Qcow2COWRegion { + /** + * Offset of the COW region in bytes from the start of the first cluster + * touched by the request. + */ + unsigned offset; + + /** Number of bytes to copy */ + unsigned nb_bytes; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ + /** Guest offset of the first newly allocated cluster */ + uint64_t offset; + + /** Host offset of the first newly allocated cluster */ + uint64_t alloc_offset; + + /** Number of newly allocated clusters */ + int nb_clusters; + + /** Do not free the old clusters */ + bool keep_old_clusters; + + /** + * The COW Region between the start of the first allocated cluster and the + * area the guest actually writes to. + */ + Qcow2COWRegion cow_start; + + /** + * The COW Region between the area the guest actually writes to and the + * end of the last allocated cluster. + */ + Qcow2COWRegion cow_end; + + /* + * Indicates that COW regions are already handled and do not require + * any more processing. + */ + bool skip_cow; + + /** + * Indicates that this is not a normal write request but a preallocation. + * If the image has extended L2 entries this means that no new individual + * subclusters will be marked as allocated in the L2 bitmap (but any + * existing contents of that bitmap will be kept). + */ + bool prealloc; + + /** Pointer to next L2Meta of the same write request */ + struct QCowL2Meta *next; +} QCowL2Meta; + +typedef enum QCow2ClusterType { + QCOW2_CLUSTER_UNALLOCATED, + QCOW2_CLUSTER_ZERO_PLAIN, + QCOW2_CLUSTER_ZERO_ALLOC, + QCOW2_CLUSTER_NORMAL, + QCOW2_CLUSTER_COMPRESSED, +} QCow2ClusterType; + +typedef enum QCow2MetadataOverlap { + QCOW2_OL_MAIN_HEADER_BITNR = 0, + QCOW2_OL_ACTIVE_L1_BITNR = 1, + QCOW2_OL_ACTIVE_L2_BITNR = 2, + QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, + QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, + QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, + QCOW2_OL_INACTIVE_L1_BITNR = 6, + QCOW2_OL_INACTIVE_L2_BITNR = 7, + QCOW2_OL_BITMAP_DIRECTORY_BITNR = 8, + + QCOW2_OL_MAX_BITNR = 9, + + QCOW2_OL_NONE = 0, + QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), + QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), + QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), + QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), + QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), + QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), + QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), + /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv + * reads. 
*/ + QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), + QCOW2_OL_BITMAP_DIRECTORY = (1 << QCOW2_OL_BITMAP_DIRECTORY_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ + (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ + QCOW2_OL_SNAPSHOT_TABLE | QCOW2_OL_BITMAP_DIRECTORY) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ + (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ + QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ + (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define QCOW_L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define QCOW_L2E_OFFSET_MASK 0x00fffffffffffe00ULL +#define QCOW_L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL + +#define INV_OFFSET (-1ULL) + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ + return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ + return m->offset + m->cow_end.offset + m->cow_end.nb_bytes; +} + +static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) +{ + return r1 > r2 ? r1 - r2 : r2 - r1; +} + +#endif // CEPH_LIBRBD_MIGRATION_QCOW2_H diff --git a/src/librbd/migration/QCOWFormat.cc b/src/librbd/migration/QCOWFormat.cc new file mode 100644 index 000000000..cedf9aa20 --- /dev/null +++ b/src/librbd/migration/QCOWFormat.cc @@ -0,0 +1,1545 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/QCOWFormat.h" +#include "common/Clock.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/intarith.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ReadResult.h" +#include "librbd/migration/SnapshotInterface.h" +#include "librbd/migration/SourceSpecBuilder.h" +#include "librbd/migration/StreamInterface.h" +#include "librbd/migration/Utils.h" +#include +#include +#include +#include +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd + +namespace librbd { +namespace migration { + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " \ + << __func__ << ": " + +using boost::endian::big_to_native; + +namespace qcow_format { + +struct ClusterExtent { + uint64_t cluster_offset; + uint64_t cluster_length; + uint64_t intra_cluster_offset; + uint64_t image_offset; + uint64_t buffer_offset; + + ClusterExtent(uint64_t cluster_offset, uint64_t cluster_length, + uint64_t intra_cluster_offset, uint64_t image_offset, + uint64_t buffer_offset) + : cluster_offset(cluster_offset), cluster_length(cluster_length), + intra_cluster_offset(intra_cluster_offset), image_offset(image_offset), + buffer_offset(buffer_offset) { + } +}; + +typedef std::vector ClusterExtents; + +void LookupTable::init() { + if (cluster_offsets == nullptr) { + cluster_offsets = reinterpret_cast(bl.c_str()); + } +} + +void LookupTable::decode() { + init(); + + // L2 tables are selectively byte-swapped on demand if only requesting a + // single cluster offset + if (decoded) { + return; + } + + // translate the lookup table (big-endian -> CPU endianess) + for (auto idx = 0UL; idx < size; ++idx) { + cluster_offsets[idx] = big_to_native(cluster_offsets[idx]); + } + + decoded = true; +} + +void 
populate_cluster_extents(CephContext* cct, uint64_t cluster_size, + const io::Extents& image_extents, + ClusterExtents* cluster_extents) { + uint64_t buffer_offset = 0; + for (auto [image_offset, image_length] : image_extents) { + while (image_length > 0) { + auto intra_cluster_offset = image_offset & (cluster_size - 1); + auto intra_cluster_length = cluster_size - intra_cluster_offset; + auto cluster_length = std::min(image_length, intra_cluster_length); + + ldout(cct, 20) << "image_offset=" << image_offset << ", " + << "image_length=" << image_length << ", " + << "cluster_length=" << cluster_length << dendl; + + + cluster_extents->emplace_back(0, cluster_length, intra_cluster_offset, + image_offset, buffer_offset); + + image_offset += cluster_length; + image_length -= cluster_length; + buffer_offset += cluster_length; + } + } +} + +} // namespace qcow_format + +using namespace qcow_format; + +template +struct QCOWFormat::Cluster { + const uint64_t cluster_offset; + bufferlist cluster_data_bl; + + Cluster(uint64_t cluster_offset) : cluster_offset(cluster_offset) { + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ClusterCache: " \ + << this << " " << __func__ << ": " + +template +class QCOWFormat::ClusterCache { +public: + ClusterCache(QCOWFormat* qcow_format) + : qcow_format(qcow_format), + m_strand(*qcow_format->m_image_ctx->asio_engine) { + } + + void get_cluster(uint64_t cluster_offset, uint64_t cluster_length, + uint64_t intra_cluster_offset, bufferlist* bl, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl; + + // cache state machine runs in a single strand thread + boost::asio::dispatch( + m_strand, + [this, cluster_offset, cluster_length, intra_cluster_offset, bl, + on_finish]() { + execute_get_cluster(cluster_offset, cluster_length, + intra_cluster_offset, bl, on_finish); + }); + } + +private: + typedef std::tuple Completion; + typedef std::list Completions; + + QCOWFormat* qcow_format; + boost::asio::io_context::strand m_strand; + + std::shared_ptr cluster; + std::unordered_map cluster_completions; + + void execute_get_cluster(uint64_t cluster_offset, uint64_t cluster_length, + uint64_t intra_cluster_offset, bufferlist* bl, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl; + + if (cluster && cluster->cluster_offset == cluster_offset) { + // most-recent cluster matches + bl->substr_of(cluster->cluster_data_bl, intra_cluster_offset, + cluster_length); + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [on_finish]() { on_finish->complete(0); }); + return; + } + + // record callback for cluster + bool new_request = (cluster_completions.count(cluster_offset) == 0); + cluster_completions[cluster_offset].emplace_back( + intra_cluster_offset, cluster_length, bl, on_finish); + if (new_request) { + // start the new read request + read_cluster(std::make_shared(cluster_offset)); + } + } + + void read_cluster(std::shared_ptr cluster) { + auto cct = qcow_format->m_image_ctx->cct; + + uint64_t stream_offset = cluster->cluster_offset; + uint64_t stream_length = qcow_format->m_cluster_size; + if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) { + // compressed clusters encode the compressed length in the lower bits + stream_offset = cluster->cluster_offset & + qcow_format->m_cluster_offset_mask; + stream_length = (cluster->cluster_offset >> + (63 - 
qcow_format->m_cluster_bits)) & + (qcow_format->m_cluster_size - 1); + } + + ldout(cct, 20) << "cluster_offset=" << cluster->cluster_offset << ", " + << "stream_offset=" << stream_offset << ", " + << "stream_length=" << stream_length << dendl; + + // read the cluster into the cache entry + auto ctx = new LambdaContext([this, cluster](int r) { + boost::asio::post(m_strand, [this, cluster, r]() { + handle_read_cluster(r, cluster); }); }); + qcow_format->m_stream->read({{stream_offset, stream_length}}, + &cluster->cluster_data_bl, ctx); + } + + void handle_read_cluster(int r, std::shared_ptr cluster) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "cluster_offset=" << cluster->cluster_offset << dendl; + + auto completions = std::move(cluster_completions[cluster->cluster_offset]); + cluster_completions.erase(cluster->cluster_offset); + + if (r < 0) { + lderr(cct) << "failed to read cluster offset " << cluster->cluster_offset + << ": " << cpp_strerror(r) << dendl; + } else { + if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) { + bufferlist compressed_bl{std::move(cluster->cluster_data_bl)}; + cluster->cluster_data_bl.clear(); + + // TODO + lderr(cct) << "support for compressed clusters is not available" + << dendl; + r = -EINVAL; + } else { + // cache the MRU cluster in case of sequential IO + this->cluster = cluster; + } + } + + // complete the IO back to caller + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [r, cluster, completions=std::move(completions)]() { + for (auto completion : completions) { + if (r >= 0) { + std::get<2>(completion)->substr_of( + cluster->cluster_data_bl, + std::get<0>(completion), + std::get<1>(completion)); + } + std::get<3>(completion)->complete(r); + } + }); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::L2TableCache: " \ + << this << " " << __func__ << ": " + +template +class QCOWFormat::L2TableCache { +public: + L2TableCache(QCOWFormat* qcow_format) + : qcow_format(qcow_format), + m_strand(*qcow_format->m_image_ctx->asio_engine), + l2_cache_entries(QCOW_L2_CACHE_SIZE) { + } + + void get_l2_table(const LookupTable* l1_table, uint64_t l2_table_offset, + std::shared_ptr* l2_table, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "l2_table_offset=" << l2_table_offset << dendl; + + // cache state machine runs in a single strand thread + Request request{l1_table, l2_table_offset, l2_table, on_finish}; + boost::asio::dispatch( + m_strand, [this, request=std::move(request)]() { + requests.push_back(std::move(request)); + }); + dispatch_request(); + } + + void get_cluster_offset(const LookupTable* l1_table, + uint64_t image_offset, uint64_t* cluster_offset, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + uint32_t l1_table_index = image_offset >> qcow_format->m_l1_shift; + uint64_t l2_table_offset = l1_table->cluster_offsets[std::min( + l1_table_index, l1_table->size - 1)] & + qcow_format->m_cluster_mask; + uint32_t l2_table_index = (image_offset >> qcow_format->m_cluster_bits) & + (qcow_format->m_l2_size - 1); + ldout(cct, 20) << "image_offset=" << image_offset << ", " + << "l1_table_index=" << l1_table_index << ", " + << "l2_table_offset=" << l2_table_offset << ", " + << "l2_table_index=" << l2_table_index << dendl; + + if (l1_table_index >= l1_table->size) { + lderr(cct) << "L1 index " << l1_table_index << " out-of-bounds" << dendl; + on_finish->complete(-ERANGE); + return; + } else if 
(l2_table_offset == 0) { + // L2 table has not been allocated for specified offset + ldout(cct, 20) << "image_offset=" << image_offset << ", " + << "cluster_offset=DNE" << dendl; + *cluster_offset = 0; + on_finish->complete(-ENOENT); + return; + } + + // cache state machine runs in a single strand thread + Request request{l1_table, l2_table_offset, l2_table_index, cluster_offset, + on_finish}; + boost::asio::dispatch( + m_strand, [this, request=std::move(request)]() { + requests.push_back(std::move(request)); + }); + dispatch_request(); + } + +private: + QCOWFormat* qcow_format; + + boost::asio::io_context::strand m_strand; + + struct Request { + const LookupTable* l1_table; + + uint64_t l2_table_offset; + + // get_cluster_offset request + uint32_t l2_table_index; + uint64_t* cluster_offset = nullptr; + + // get_l2_table request + std::shared_ptr* l2_table; + + Context* on_finish; + + Request(const LookupTable* l1_table, uint64_t l2_table_offset, + uint32_t l2_table_index, uint64_t* cluster_offset, + Context* on_finish) + : l1_table(l1_table), l2_table_offset(l2_table_offset), + l2_table_index(l2_table_index), cluster_offset(cluster_offset), + on_finish(on_finish) { + } + Request(const LookupTable* l1_table, uint64_t l2_table_offset, + std::shared_ptr* l2_table, Context* on_finish) + : l1_table(l1_table), l2_table_offset(l2_table_offset), + l2_table(l2_table), on_finish(on_finish) { + } + }; + + typedef std::deque Requests; + + struct L2Cache { + uint64_t l2_offset = 0; + std::shared_ptr l2_table; + + utime_t timestamp; + uint32_t count = 0; + bool in_flight = false; + + int ret_val = 0; + }; + std::vector l2_cache_entries; + + Requests requests; + + void dispatch_request() { + boost::asio::dispatch(m_strand, [this]() { execute_request(); }); + } + + void execute_request() { + auto cct = qcow_format->m_image_ctx->cct; + if (requests.empty()) { + return; + } + + auto request = requests.front(); + ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << dendl; + + std::shared_ptr l2_table; + int r = l2_table_lookup(request.l2_table_offset, &l2_table); + if (r < 0) { + lderr(cct) << "failed to load L2 table: l2_table_offset=" + << request.l2_table_offset << ": " + << cpp_strerror(r) << dendl; + } else if (l2_table == nullptr) { + // table not in cache -- will restart once its loaded + return; + } else if (request.cluster_offset != nullptr) { + auto cluster_offset = l2_table->cluster_offsets[request.l2_table_index]; + if (!l2_table->decoded) { + // table hasn't been byte-swapped + cluster_offset = big_to_native(cluster_offset); + } + + *request.cluster_offset = cluster_offset & qcow_format->m_cluster_mask; + if (*request.cluster_offset == QCOW_OFLAG_ZERO) { + ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << ", " + << "l2_table_index=" << request.l2_table_index << ", " + << "cluster_offset=zeroed" << dendl; + } else { + ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << ", " + << "l2_table_index=" << request.l2_table_index << ", " + << "cluster_offset=" << *request.cluster_offset + << dendl; + } + } else if (request.l2_table != nullptr) { + // ensure it's in the correct byte-order + l2_table->decode(); + *request.l2_table = l2_table; + } else { + ceph_assert(false); + } + + // complete the L2 cache request + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [r, ctx=request.on_finish]() { ctx->complete(r); }); + requests.pop_front(); + + // process next request (if any) + dispatch_request(); + } + + int l2_table_lookup(uint64_t l2_offset, + 
std::shared_ptr* l2_table) { + auto cct = qcow_format->m_image_ctx->cct; + + l2_table->reset(); + + // find a match in the existing cache + for (auto idx = 0U; idx < l2_cache_entries.size(); ++idx) { + auto& l2_cache = l2_cache_entries[idx]; + if (l2_cache.l2_offset == l2_offset) { + if (l2_cache.in_flight) { + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=" << idx << " (in-flight)" << dendl; + return 0; + } + + if (l2_cache.ret_val < 0) { + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=" << idx << " (error): " + << cpp_strerror(l2_cache.ret_val) << dendl; + int r = l2_cache.ret_val; + l2_cache = L2Cache{}; + + return r; + } + + ++l2_cache.count; + if (l2_cache.count == std::numeric_limits::max()) { + for (auto& entry : l2_cache_entries) { + entry.count >>= 1; + } + } + + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " << "index=" << idx + << dendl; + *l2_table = l2_cache.l2_table; + return 0; + } + } + + // find the least used entry + int32_t min_idx = -1; + uint32_t min_count = std::numeric_limits::max(); + utime_t min_timestamp; + for (uint32_t idx = 0U; idx < l2_cache_entries.size(); ++idx) { + auto& l2_cache = l2_cache_entries[idx]; + if (l2_cache.in_flight) { + continue; + } + + if (l2_cache.count > 0) { + --l2_cache.count; + } + + if (l2_cache.count <= min_count) { + if (min_idx == -1 || l2_cache.timestamp < min_timestamp) { + min_timestamp = l2_cache.timestamp; + min_count = l2_cache.count; + min_idx = idx; + } + } + } + + if (min_idx == -1) { + // no space in the cache due to in-flight requests + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=DNE (cache busy)" << dendl; + return 0; + } + + ldout(cct, 20) << "l2_offset=" << l2_offset << ", " + << "index=" << min_idx << " (loading)" << dendl; + auto& l2_cache = l2_cache_entries[min_idx]; + l2_cache.l2_table = std::make_shared(qcow_format->m_l2_size); + l2_cache.l2_offset = l2_offset; + l2_cache.timestamp = ceph_clock_now(); + l2_cache.count = 1; + l2_cache.in_flight = true; + + // read the L2 table into the L2 cache entry + auto ctx = new LambdaContext([this, index=min_idx, l2_offset](int r) { + boost::asio::post(m_strand, [this, index, l2_offset, r]() { + handle_l2_table_lookup(r, index, l2_offset); }); }); + qcow_format->m_stream->read( + {{l2_offset, qcow_format->m_l2_size * sizeof(uint64_t)}}, + &l2_cache.l2_table->bl, ctx); + return 0; + } + + void handle_l2_table_lookup(int r, uint32_t index, uint64_t l2_offset) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "l2_offset=" << l2_offset << ", " + << "index=" << index << dendl; + + auto& l2_cache = l2_cache_entries[index]; + ceph_assert(l2_cache.in_flight); + l2_cache.in_flight = false; + + if (r < 0) { + lderr(cct) << "failed to load L2 table: " + << "l2_offset=" << l2_cache.l2_offset << ": " + << cpp_strerror(r) << dendl; + l2_cache.ret_val = r; + } else { + // keep the L2 table in big-endian byte-order until the full table + // is requested + l2_cache.l2_table->init(); + } + + // restart the state machine + dispatch_request(); + } + +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::ReadRequest: " \ + << this << " " << __func__ << ": " + +template +class QCOWFormat::ReadRequest { +public: + ReadRequest(QCOWFormat* qcow_format, io::AioCompletion* aio_comp, + const LookupTable* l1_table, io::Extents&& image_extents) + : qcow_format(qcow_format), aio_comp(aio_comp), l1_table(l1_table), + image_extents(std::move(image_extents)) { + } + + void 
send() { + get_cluster_offsets(); + } + +private: + QCOWFormat* qcow_format; + io::AioCompletion* aio_comp; + + const LookupTable* l1_table; + io::Extents image_extents; + + size_t image_extents_idx = 0; + uint32_t image_extent_offset = 0; + + ClusterExtents cluster_extents; + + void get_cluster_offsets() { + auto cct = qcow_format->m_image_ctx->cct; + populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents, + &cluster_extents); + + ldout(cct, 20) << dendl; + auto ctx = new LambdaContext([this](int r) { + handle_get_cluster_offsets(r); }); + auto gather_ctx = new C_Gather(cct, ctx); + + for (auto& cluster_extent : cluster_extents) { + auto sub_ctx = new LambdaContext( + [this, &cluster_extent, on_finish=gather_ctx->new_sub()](int r) { + handle_get_cluster_offset(r, cluster_extent, on_finish); }); + qcow_format->m_l2_table_cache->get_cluster_offset( + l1_table, cluster_extent.image_offset, + &cluster_extent.cluster_offset, sub_ctx); + } + + gather_ctx->activate(); + } + + void handle_get_cluster_offset(int r, const ClusterExtent& cluster_extent, + Context* on_finish) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "image_offset=" << cluster_extent.image_offset << ", " + << "cluster_offset=" << cluster_extent.cluster_offset + << dendl; + + if (r == -ENOENT) { + ldout(cct, 20) << "image offset DNE in QCOW image" << dendl; + r = 0; + } else if (r < 0) { + lderr(cct) << "failed to map image offset " << cluster_extent.image_offset + << ": " << cpp_strerror(r) << dendl; + } + + on_finish->complete(r); + } + + void handle_get_cluster_offsets(int r) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve cluster extents: " << cpp_strerror(r) + << dendl; + aio_comp->fail(r); + delete this; + return; + } + + read_clusters(); + } + + void read_clusters() { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << dendl; + + aio_comp->set_request_count(cluster_extents.size()); + for (auto& cluster_extent : cluster_extents) { + auto read_ctx = new io::ReadResult::C_ImageReadRequest( + aio_comp, cluster_extent.buffer_offset, + {{cluster_extent.image_offset, cluster_extent.cluster_length}}); + read_ctx->ignore_enoent = true; + + auto log_ctx = new LambdaContext( + [this, cct=qcow_format->m_image_ctx->cct, + image_offset=cluster_extent.image_offset, + image_length=cluster_extent.cluster_length, ctx=read_ctx](int r) { + handle_read_cluster(cct, r, image_offset, image_length, ctx); + }); + + if (cluster_extent.cluster_offset == 0) { + // QCOW header is at offset 0, implies cluster DNE + log_ctx->complete(-ENOENT); + } else if (cluster_extent.cluster_offset == QCOW_OFLAG_ZERO) { + // explicitly zeroed section + read_ctx->bl.append_zero(cluster_extent.cluster_length); + log_ctx->complete(0); + } else { + // request the (sub)cluster from the cluster cache + qcow_format->m_cluster_cache->get_cluster( + cluster_extent.cluster_offset, cluster_extent.cluster_length, + cluster_extent.intra_cluster_offset, &read_ctx->bl, log_ctx); + } + } + + delete this; + } + + void handle_read_cluster(CephContext* cct, int r, uint64_t image_offset, + uint64_t image_length, Context* on_finish) const { + // NOTE: treat as static function, expect object has been deleted + + ldout(cct, 20) << "r=" << r << ", " + << "image_offset=" << image_offset << ", " + << "image_length=" << image_length << dendl; + + if (r != -ENOENT && r < 0) { + lderr(cct) << "failed to read image extent " << 
image_offset << "~" + << image_length << ": " << cpp_strerror(r) << dendl; + } + + on_finish->complete(r); + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat::" \ + << "ListSnapsRequest: " << this << " " \ + << __func__ << ": " + +template +class QCOWFormat::ListSnapsRequest { +public: + ListSnapsRequest( + QCOWFormat* qcow_format, uint32_t l1_table_index, + ClusterExtents&& cluster_extents, + const std::map& snap_id_to_l1_table, + io::SnapshotDelta* snapshot_delta, Context* on_finish) + : qcow_format(qcow_format), l1_table_index(l1_table_index), + cluster_extents(std::move(cluster_extents)), + snap_id_to_l1_table(snap_id_to_l1_table), snapshot_delta(snapshot_delta), + on_finish(on_finish) { + } + + void send() { + get_l2_table(); + } + +private: + QCOWFormat* qcow_format; + uint32_t l1_table_index; + ClusterExtents cluster_extents; + std::map snap_id_to_l1_table; + io::SnapshotDelta* snapshot_delta; + Context* on_finish; + + std::shared_ptr previous_l2_table; + std::shared_ptr l2_table; + + void get_l2_table() { + auto cct = qcow_format->m_image_ctx->cct; + if (snap_id_to_l1_table.empty()) { + finish(0); + return; + } + + auto it = snap_id_to_l1_table.begin(); + auto [snap_id, l1_table] = *it; + snap_id_to_l1_table.erase(it); + + previous_l2_table = l2_table; + l2_table.reset(); + + auto ctx = new LambdaContext([this, snap_id = snap_id](int r) { + boost::asio::post(qcow_format->m_strand, [this, snap_id, r]() { + handle_get_l2_table(r, snap_id); + }); + }); + + if (l1_table_index >= l1_table->size || + l1_table->cluster_offsets[l1_table_index] == 0) { + ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", " + << "snap_id=" << snap_id << ": DNE" << dendl; + ctx->complete(-ENOENT); + return; + } + + uint64_t l2_table_offset = l1_table->cluster_offsets[l1_table_index] & + qcow_format->m_cluster_mask; + + ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", " + << "snap_id=" << snap_id << ", " + << "l2_table_offset=" << l2_table_offset << dendl; + qcow_format->m_l2_table_cache->get_l2_table(l1_table, l2_table_offset, + &l2_table, ctx); + } + + void handle_get_l2_table(int r, uint64_t snap_id) { + ceph_assert(qcow_format->m_strand.running_in_this_thread()); + + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "snap_id=" << snap_id << dendl; + + if (r == -ENOENT) { + l2_table.reset(); + } else if (r < 0) { + lderr(cct) << "failed to retrieve L2 table for snapshot " << snap_id + << ": " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + // compare the cluster offsets at each requested L2 offset between + // the previous snapshot's L2 table and the current L2 table. 
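+    // e.g. with 64 KiB clusters (m_cluster_bits=16, m_l2_size=8192), image
+    // offset 0x30000 selects l2_table_index (0x30000 >> 16) & 8191 == 3; a
+    // changed mapping at that index is attributed to this snapshot as DATA,
+    // while a dropped or QCOW_OFLAG_ZERO mapping is attributed as ZEROED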
+ auto& sparse_extents = (*snapshot_delta)[{snap_id, snap_id}]; + for (auto& cluster_extent : cluster_extents) { + uint32_t l2_table_index = + (cluster_extent.image_offset >> qcow_format->m_cluster_bits) & + (qcow_format->m_l2_size - 1); + + std::optional cluster_offset; + if (l2_table && l2_table_index < l2_table->size) { + cluster_offset = l2_table->cluster_offsets[l2_table_index] & + qcow_format->m_cluster_offset_mask; + } + + std::optional prev_cluster_offset; + if (previous_l2_table && l2_table_index < previous_l2_table->size) { + prev_cluster_offset = + previous_l2_table->cluster_offsets[l2_table_index] & + qcow_format->m_cluster_offset_mask; + } + + ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", " + << "snap_id=" << snap_id << ", " + << "image_offset=" << cluster_extent.image_offset << ", " + << "l2_table_index=" << l2_table_index << ", " + << "cluster_offset=" << cluster_offset << ", " + << "prev_cluster_offset=" << prev_cluster_offset << dendl; + + auto state = io::SPARSE_EXTENT_STATE_DATA; + if (cluster_offset == prev_cluster_offset) { + continue; + } else if ((prev_cluster_offset && !cluster_offset) || + *cluster_offset == QCOW_OFLAG_ZERO) { + // explicitly zeroed or deallocated + state = io::SPARSE_EXTENT_STATE_ZEROED; + } + + sparse_extents.insert( + cluster_extent.image_offset, cluster_extent.cluster_length, + {state, cluster_extent.cluster_length}); + } + + ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", " + << "snap_id=" << snap_id << ", " + << "sparse_extents=" << sparse_extents << dendl; + + // continue processing the L2 table at this index for all snapshots + boost::asio::post(*qcow_format->m_image_ctx->asio_engine, + [this]() { get_l2_table(); }); + } + + + void finish(int r) { + auto cct = qcow_format->m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << dendl; + + on_finish->complete(r); + delete this; + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::QCOWFormat: " << this \ + << " " << __func__ << ": " + +template +QCOWFormat::QCOWFormat( + I* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder) + : m_image_ctx(image_ctx), m_json_object(json_object), + m_source_spec_builder(source_spec_builder), + m_strand(*image_ctx->asio_engine) { +} + +template +void QCOWFormat::open(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + int r = m_source_spec_builder->build_stream(m_json_object, &m_stream); + if (r < 0) { + lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_open(r, on_finish); }); + m_stream->open(ctx); +} + +template +void QCOWFormat::handle_open(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to open QCOW image: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + probe(on_finish); +} + +template +void QCOWFormat::probe(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_probe(r, on_finish); }); + m_bl.clear(); + m_stream->read({{0, 8}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_probe(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to probe QCOW image: " << cpp_strerror(r) + << 
dendl; + on_finish->complete(r); + return; + } + + auto header_probe = *reinterpret_cast( + m_bl.c_str()); + header_probe.magic = big_to_native(header_probe.magic); + header_probe.version = big_to_native(header_probe.version); + + if (header_probe.magic != QCOW_MAGIC) { + lderr(cct) << "invalid QCOW header magic" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_bl.clear(); + if (header_probe.version == 1) { +#ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + read_v1_header(on_finish); +#else // WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + lderr(cct) << "QCOW is not supported" << dendl; + on_finish->complete(-ENOTSUP); +#endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + return; + } else if (header_probe.version >= 2 && header_probe.version <= 3) { + read_v2_header(on_finish); + return; + } else { + lderr(cct) << "invalid QCOW header version " << header_probe.version + << dendl; + on_finish->complete(-EINVAL); + return; + } +} + +#ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + +template +void QCOWFormat::read_v1_header(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_v1_header(r, on_finish); }); + m_bl.clear(); + m_stream->read({{0, sizeof(QCowHeaderV1)}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_read_v1_header(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read QCOW header: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + auto header = *reinterpret_cast(m_bl.c_str()); + + // byte-swap important fields + header.magic = big_to_native(header.magic); + header.version = big_to_native(header.version); + header.backing_file_offset = big_to_native(header.backing_file_offset); + header.backing_file_size = big_to_native(header.backing_file_size); + header.size = big_to_native(header.size); + header.crypt_method = big_to_native(header.crypt_method); + header.l1_table_offset = big_to_native(header.l1_table_offset); + + if (header.magic != QCOW_MAGIC || header.version != 1) { + // honestly shouldn't happen since we've already validated it + lderr(cct) << "header is not QCOW" << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS || + header.cluster_bits > QCOW_MAX_CLUSTER_BITS) { + lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.l2_bits < (QCOW_MIN_CLUSTER_BITS - 3) || + header.l2_bits > (QCOW_MAX_CLUSTER_BITS - 3)) { + lderr(cct) << "invalid L2 bits: " << header.l2_bits << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.crypt_method != QCOW_CRYPT_NONE) { + lderr(cct) << "invalid or unsupported encryption method" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_size = header.size; + if (p2roundup(m_size, static_cast(512)) != m_size) { + lderr(cct) << "image size is not a multiple of block size" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_backing_file_offset = header.backing_file_offset; + m_backing_file_size = header.backing_file_size; + + m_cluster_bits = header.cluster_bits; + m_cluster_size = 1UL << header.cluster_bits; + m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1; + m_cluster_mask = ~QCOW_OFLAG_COMPRESSED; + + m_l2_bits = header.l2_bits; + m_l2_size = (1UL << m_l2_bits); + + m_l1_shift = m_cluster_bits + m_l2_bits; + m_l1_table.size = (m_size + (1LL << m_l1_shift) - 1) 
>> m_l1_shift; + m_l1_table_offset = header.l1_table_offset; + if (m_size > (std::numeric_limits::max() - (1ULL << m_l1_shift)) || + m_l1_table.size > + (std::numeric_limits::max() / sizeof(uint64_t))) { + lderr(cct) << "image size too big: " << m_size << dendl; + on_finish->complete(-EINVAL); + return; + } + + ldout(cct, 15) << "size=" << m_size << ", " + << "cluster_bits=" << m_cluster_bits << ", " + << "l2_bits=" << m_l2_bits << dendl; + + // allocate memory for L1 table and L2 + cluster caches + m_l2_table_cache = std::make_unique(this); + m_cluster_cache = std::make_unique(this); + + read_l1_table(on_finish); +} + +#endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + +template +void QCOWFormat::read_v2_header(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_v2_header(r, on_finish); }); + m_bl.clear(); + m_stream->read({{0, sizeof(QCowHeader)}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_read_v2_header(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read QCOW2 header: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + auto header = *reinterpret_cast(m_bl.c_str()); + + // byte-swap important fields + header.magic = big_to_native(header.magic); + header.version = big_to_native(header.version); + header.backing_file_offset = big_to_native(header.backing_file_offset); + header.backing_file_size = big_to_native(header.backing_file_size); + header.cluster_bits = big_to_native(header.cluster_bits); + header.size = big_to_native(header.size); + header.crypt_method = big_to_native(header.crypt_method); + header.l1_size = big_to_native(header.l1_size); + header.l1_table_offset = big_to_native(header.l1_table_offset); + header.nb_snapshots = big_to_native(header.nb_snapshots); + header.snapshots_offset = big_to_native(header.snapshots_offset); + + if (header.version == 2) { + // valid only for version >= 3 + header.incompatible_features = 0; + header.compatible_features = 0; + header.autoclear_features = 0; + header.header_length = 72; + header.compression_type = 0; + } else { + header.incompatible_features = big_to_native(header.incompatible_features); + header.compatible_features = big_to_native(header.compatible_features); + header.autoclear_features = big_to_native(header.autoclear_features); + header.header_length = big_to_native(header.header_length); + } + + if (header.magic != QCOW_MAGIC || header.version < 2 || header.version > 3) { + // honestly shouldn't happen since we've already validated it + lderr(cct) << "header is not QCOW2" << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS || + header.cluster_bits > QCOW_MAX_CLUSTER_BITS) { + lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.crypt_method != QCOW_CRYPT_NONE) { + lderr(cct) << "invalid or unsupported encryption method" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_size = header.size; + if (p2roundup(m_size, static_cast(512)) != m_size) { + lderr(cct) << "image size is not a multiple of block size" << dendl; + on_finish->complete(-EINVAL); + return; + } + + if (header.header_length <= offsetof(QCowHeader, compression_type)) { + header.compression_type = 0; + } + + if ((header.compression_type != 0) || + ((header.incompatible_features & 
QCOW2_INCOMPAT_COMPRESSION) != 0)) {
+    lderr(cct) << "invalid or unsupported compression type" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  if ((header.incompatible_features & QCOW2_INCOMPAT_DATA_FILE) != 0) {
+    lderr(cct) << "external data file feature not supported" << dendl;
+    on_finish->complete(-ENOTSUP);
+    return;
+  }
+
+  if ((header.incompatible_features & QCOW2_INCOMPAT_EXTL2) != 0) {
+    lderr(cct) << "extended L2 table feature not supported" << dendl;
+    on_finish->complete(-ENOTSUP);
+    return;
+  }
+
+  header.incompatible_features &= ~QCOW2_INCOMPAT_MASK;
+  if (header.incompatible_features != 0) {
+    lderr(cct) << "unknown incompatible feature enabled" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  m_backing_file_offset = header.backing_file_offset;
+  m_backing_file_size = header.backing_file_size;
+
+  m_cluster_bits = header.cluster_bits;
+  m_cluster_size = 1UL << header.cluster_bits;
+  m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1;
+  m_cluster_mask = ~(QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_COPIED);
+
+  // the L2 table is fixed at one (1) cluster block holding 8-byte (2^3) offsets
+  m_l2_bits = m_cluster_bits - 3;
+  m_l2_size = (1UL << m_l2_bits);
+
+  m_l1_shift = m_cluster_bits + m_l2_bits;
+  m_l1_table.size = (m_size + (1LL << m_l1_shift) - 1) >> m_l1_shift;
+  m_l1_table_offset = header.l1_table_offset;
+  if (m_size > (std::numeric_limits<uint64_t>::max() - (1ULL << m_l1_shift)) ||
+      m_l1_table.size >
+        (std::numeric_limits<int32_t>::max() / sizeof(uint64_t))) {
+    lderr(cct) << "image size too big: " << m_size << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  } else if (m_l1_table.size > header.l1_size) {
+    lderr(cct) << "invalid L1 table size in header (" << header.l1_size
+               << " < " << m_l1_table.size << ")" << dendl;
+    on_finish->complete(-EINVAL);
+    return;
+  }
+
+  m_snapshot_count = header.nb_snapshots;
+  m_snapshots_offset = header.snapshots_offset;
+
+  ldout(cct, 15) << "size=" << m_size << ", "
+                 << "cluster_bits=" << m_cluster_bits << ", "
+                 << "l1_table_offset=" << m_l1_table_offset << ", "
+                 << "snapshot_count=" << m_snapshot_count << ", "
+                 << "snapshots_offset=" << m_snapshots_offset << dendl;
+
+  // allocate memory for L1 table and L2 + cluster caches
+  m_l2_table_cache = std::make_unique<L2TableCache>(this);
+  m_cluster_cache = std::make_unique<ClusterCache>(this);
+
+  read_snapshot(on_finish);
+}
+
+template <typename I>
+void QCOWFormat<I>::read_snapshot(Context* on_finish) {
+  if (m_snapshots_offset == 0 || m_snapshots.size() == m_snapshot_count) {
+    read_l1_table(on_finish);
+    return;
+  }
+
+  // header is always aligned on 8 byte boundary
+  m_snapshots_offset = p2roundup(m_snapshots_offset, static_cast<uint64_t>(8));
+
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "snap_id=" << (m_snapshots.size() + 1) << ", "
+                 << "offset=" << m_snapshots_offset << dendl;
+
+  auto ctx = new LambdaContext([this, on_finish](int r) {
+    handle_read_snapshot(r, on_finish); });
+  m_bl.clear();
+  m_stream->read({{m_snapshots_offset, sizeof(QCowSnapshotHeader)}}, &m_bl,
+                 ctx);
+}
+
+template <typename I>
+void QCOWFormat<I>::handle_read_snapshot(int r, Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 10) << "r=" << r << ", "
+                 << "index=" << m_snapshots.size() << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to read QCOW2 snapshot header: " << cpp_strerror(r)
+               << dendl;
+    on_finish->complete(r);
+    return;
+  }
+
+  m_snapshots_offset += m_bl.length();
+  auto header = *reinterpret_cast<QCowSnapshotHeader*>(m_bl.c_str());
+
+  auto& snapshot = m_snapshots[m_snapshots.size() + 1];
+  snapshot.id.resize(big_to_native(header.id_str_size));
+ 
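// id_str_size/name_size only size the buffers here; the actual id and
+  // name bytes trail the snapshot extra data and are filled in by
+  // read_snapshot_extra()
+ 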
snapshot.name.resize(big_to_native(header.name_size)); + snapshot.l1_table_offset = big_to_native(header.l1_table_offset); + snapshot.l1_table.size = big_to_native(header.l1_size); + snapshot.timestamp.sec_ref() = big_to_native(header.date_sec); + snapshot.timestamp.nsec_ref() = big_to_native(header.date_nsec); + snapshot.extra_data_size = big_to_native(header.extra_data_size); + + ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", " + << "id_str_len=" << snapshot.id.size() << ", " + << "name_str_len=" << snapshot.name.size() << ", " + << "l1_table_offset=" << snapshot.l1_table_offset << ", " + << "l1_size=" << snapshot.l1_table.size << ", " + << "extra_data_size=" << snapshot.extra_data_size << dendl; + + read_snapshot_extra(on_finish); +} + +template +void QCOWFormat::read_snapshot_extra(Context* on_finish) { + ceph_assert(!m_snapshots.empty()); + auto& snapshot = m_snapshots.rbegin()->second; + + uint32_t length = snapshot.extra_data_size + + snapshot.id.size() + + snapshot.name.size(); + if (length == 0) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + snapshot.name = uuid_gen.to_string(); + + read_snapshot(on_finish); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", " + << "offset=" << m_snapshots_offset << ", " + << "length=" << length << dendl; + + auto offset = m_snapshots_offset; + m_snapshots_offset += length; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_snapshot_extra(r, on_finish); }); + m_bl.clear(); + m_stream->read({{offset, length}}, &m_bl, ctx); +} + +template +void QCOWFormat::handle_read_snapshot_extra(int r, Context* on_finish) { + ceph_assert(!m_snapshots.empty()); + auto& snapshot = m_snapshots.rbegin()->second; + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << ", " + << "snap_id=" << m_snapshots.size() << dendl; + + if (r < 0) { + lderr(cct) << "failed to read QCOW2 snapshot header extra: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + if (snapshot.extra_data_size >= + offsetof(QCowSnapshotExtraData, disk_size) + sizeof(uint64_t)) { + auto extra = reinterpret_cast(m_bl.c_str()); + snapshot.size = big_to_native(extra->disk_size); + } else { + snapshot.size = m_size; + } + + auto data = reinterpret_cast(m_bl.c_str()); + data += snapshot.extra_data_size; + + if (!snapshot.id.empty()) { + snapshot.id = std::string(data, snapshot.id.size()); + data += snapshot.id.size(); + } + + if (!snapshot.name.empty()) { + snapshot.name = std::string(data, snapshot.name.size()); + data += snapshot.name.size(); + } else { + uuid_d uuid_gen; + uuid_gen.generate_random(); + snapshot.name = uuid_gen.to_string(); + } + + ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", " + << "name=" << snapshot.name << ", " + << "size=" << snapshot.size << dendl; + read_snapshot_l1_table(on_finish); +} + +template +void QCOWFormat::read_snapshot_l1_table(Context* on_finish) { + ceph_assert(!m_snapshots.empty()); + auto& snapshot = m_snapshots.rbegin()->second; + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", " + << "l1_table_offset=" << snapshot.l1_table_offset + << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_snapshot_l1_table(r, on_finish); }); + m_stream->read({{snapshot.l1_table_offset, + snapshot.l1_table.size * sizeof(uint64_t)}}, + &snapshot.l1_table.bl, ctx); +} + +template +void QCOWFormat::handle_read_snapshot_l1_table(int r, Context* on_finish) { + 
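// the snapshot L1 table arrives big-endian from disk; on success it is
+  // byte-swapped via decode() before continuing with the next snapshot
+  // header
+ 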
ceph_assert(!m_snapshots.empty()); + auto& snapshot = m_snapshots.rbegin()->second; + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << ", " + << "snap_id=" << m_snapshots.size() << dendl; + + if (r < 0) { + lderr(cct) << "failed to read snapshot L1 table: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + snapshot.l1_table.decode(); + read_snapshot(on_finish); +} + +template +void QCOWFormat::read_l1_table(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_read_l1_table(r, on_finish); }); + m_stream->read({{m_l1_table_offset, + m_l1_table.size * sizeof(uint64_t)}}, + &m_l1_table.bl, ctx); +} + +template +void QCOWFormat::handle_read_l1_table(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read L1 table: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + m_l1_table.decode(); + read_backing_file(on_finish); +} + +template +void QCOWFormat::read_backing_file(Context* on_finish) { + if (m_backing_file_offset == 0 || m_backing_file_size == 0) { + // all data is within the specified file + on_finish->complete(0); + return; + } + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // TODO add support for backing files + on_finish->complete(-ENOTSUP); +} + +template +void QCOWFormat::close(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + m_stream->close(on_finish); +} + +template +void QCOWFormat::get_snapshots(SnapInfos* snap_infos, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + snap_infos->clear(); + for (auto& [snap_id, snapshot] : m_snapshots) { + SnapInfo snap_info(snapshot.name, cls::rbd::UserSnapshotNamespace{}, + snapshot.size, {}, 0, 0, snapshot.timestamp); + snap_infos->emplace(snap_id, snap_info); + } + + on_finish->complete(0); +} + +template +void QCOWFormat::get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "snap_id=" << snap_id << dendl; + + if (snap_id == CEPH_NOSNAP) { + *size = m_size; + } else { + auto snapshot_it = m_snapshots.find(snap_id); + if (snapshot_it == m_snapshots.end()) { + on_finish->complete(-ENOENT); + return; + } + + auto& snapshot = snapshot_it->second; + *size = snapshot.size; + } + + on_finish->complete(0); +} + +template +bool QCOWFormat::read( + io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents, + io::ReadResult&& read_result, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "snap_id=" << snap_id << ", " + << "image_extents=" << image_extents << dendl; + + const LookupTable* l1_table = nullptr; + if (snap_id == CEPH_NOSNAP) { + l1_table = &m_l1_table; + } else { + auto snapshot_it = m_snapshots.find(snap_id); + if (snapshot_it == m_snapshots.end()) { + aio_comp->fail(-ENOENT); + return true; + } + + auto& snapshot = snapshot_it->second; + l1_table = &snapshot.l1_table; + } + + aio_comp->read_result = std::move(read_result); + aio_comp->read_result.set_image_extents(image_extents); + + auto read_request = new ReadRequest(this, aio_comp, l1_table, + std::move(image_extents)); + read_request->send(); + + return true; +} + +template +void QCOWFormat::list_snaps(io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* 
snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + ClusterExtents cluster_extents; + populate_cluster_extents(cct, m_cluster_size, image_extents, + &cluster_extents); + + // map L1 table indexes to cluster extents + std::map l1_cluster_extents; + for (auto& cluster_extent : cluster_extents) { + uint32_t l1_table_index = cluster_extent.image_offset >> m_l1_shift; + auto& l1_cluster_extent = l1_cluster_extents[l1_table_index]; + l1_cluster_extent.reserve(cluster_extents.size()); + l1_cluster_extent.push_back(cluster_extent); + } + + std::map snap_id_to_l1_table; + for (auto& [snap_id, snapshot] : m_snapshots) { + snap_id_to_l1_table[snap_id] = &snapshot.l1_table; + } + snap_id_to_l1_table[CEPH_NOSNAP] = &m_l1_table; + + on_finish = new LambdaContext([this, image_extents, + snap_ids=std::move(snap_ids), + snapshot_delta, on_finish](int r) mutable { + handle_list_snaps(r, std::move(image_extents), std::move(snap_ids), + snapshot_delta, on_finish); + }); + + auto gather_ctx = new C_Gather(cct, on_finish); + + for (auto& [l1_table_index, cluster_extents] : l1_cluster_extents) { + auto list_snaps_request = new ListSnapsRequest( + this, l1_table_index, std::move(cluster_extents), snap_id_to_l1_table, + snapshot_delta, gather_ctx->new_sub()); + list_snaps_request->send(); + } + + gather_ctx->activate(); +} + +template +void QCOWFormat::handle_list_snaps(int r, io::Extents&& image_extents, + io::SnapIds&& snap_ids, + io::SnapshotDelta* snapshot_delta, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "snapshot_delta=" << *snapshot_delta << dendl; + + std::optional previous_size = std::nullopt; + for (auto& [snap_id, snapshot] : m_snapshots) { + auto sparse_extents = &(*snapshot_delta)[{snap_id, snap_id}]; + util::zero_shrunk_snapshot(cct, image_extents, snap_id, snapshot.size, + &previous_size, sparse_extents); + } + + auto sparse_extents = &(*snapshot_delta)[{CEPH_NOSNAP, CEPH_NOSNAP}]; + util::zero_shrunk_snapshot(cct, image_extents, CEPH_NOSNAP, m_size, + &previous_size, sparse_extents); + + util::merge_snapshot_delta(snap_ids, snapshot_delta); + on_finish->complete(r); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::QCOWFormat; diff --git a/src/librbd/migration/QCOWFormat.h b/src/librbd/migration/QCOWFormat.h new file mode 100644 index 000000000..b36506716 --- /dev/null +++ b/src/librbd/migration/QCOWFormat.h @@ -0,0 +1,211 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H +#define CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H + +#include "include/int_types.h" +#include "librbd/Types.h" +#include "librbd/migration/FormatInterface.h" +#include "librbd/migration/QCOW.h" +#include "acconfig.h" +#include "json_spirit/json_spirit.h" +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template struct SourceSpecBuilder; +struct StreamInterface; + +namespace qcow_format { + +struct LookupTable { + LookupTable() {} + LookupTable(uint32_t size) : size(size) {} + + bufferlist bl; + uint64_t* cluster_offsets = nullptr; + uint32_t size = 0; + bool decoded = false; + + void init(); + void decode(); +}; + +} // namespace qcow_format + +template +class QCOWFormat : public FormatInterface { +public: + 
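+  // A QCOWFormat is normally instantiated via
+  // SourceSpecBuilder<ImageCtxT>::build_format() when the migration
+  // source-spec JSON declares "type": "qcow". For illustration, a minimal
+  // source-spec of that shape (the "file_path" key is assumed from the
+  // file stream handler, which is not part of this hunk):
+  //
+  //   {
+  //     "type": "qcow",
+  //     "stream": {
+  //       "type": "file",
+  //       "file_path": "/mnt/image.qcow2"
+  //     }
+  //   }
+  //
+  // The "stream" sub-object is parsed separately by build_stream(), so the
+  // format logic is independent of whether bytes come from a file, HTTP,
+  // or S3.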
static QCOWFormat* create( + ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder) { + return new QCOWFormat(image_ctx, json_object, source_spec_builder); + } + + QCOWFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder); + QCOWFormat(const QCOWFormat&) = delete; + QCOWFormat& operator=(const QCOWFormat&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override; + void get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) override; + + bool read(io::AioCompletion* aio_comp, uint64_t snap_id, + io::Extents&& image_extents, io::ReadResult&& read_result, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) override; + + void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids, + int list_snaps_flags, io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) override; + +private: + /** + * @verbatim + * + * + * | + * v + * OPEN + * | + * v + * PROBE + * | + * |\---> READ V1 HEADER ----------\ + * | | + * \----> READ V2 HEADER | + * | | + * | /----------\ | + * | | | | + * v v | | + * READ SNAPSHOT | | + * | | | + * v | | + * READ SNAPSHOT EXTRA | | + * | | | + * v | | + * READ SNAPSHOT L1 TABLE | + * | | + * \--------------------\| + * | + * v + * READ L1 TABLE + * | + * v + * READ BACKING FILE + * | + * /-------------------------------/ + * | + * v + * + * + * @endverbatim + */ + + struct Cluster; + struct ClusterCache; + struct L2TableCache; + struct ReadRequest; + struct ListSnapsRequest; + + struct Snapshot { + std::string id; + std::string name; + + utime_t timestamp; + uint64_t size = 0; + + uint64_t l1_table_offset = 0; + qcow_format::LookupTable l1_table; + + uint32_t extra_data_size = 0; + }; + + ImageCtxT* m_image_ctx; + json_spirit::mObject m_json_object; + const SourceSpecBuilder* m_source_spec_builder; + + boost::asio::io_context::strand m_strand; + std::shared_ptr m_stream; + + bufferlist m_bl; + + uint64_t m_size = 0; + + uint64_t m_backing_file_offset = 0; + uint32_t m_backing_file_size = 0; + + uint32_t m_cluster_bits = 0; + uint32_t m_cluster_size = 0; + uint64_t m_cluster_offset_mask = 0; + uint64_t m_cluster_mask = 0; + + uint32_t m_l1_shift = 0; + uint64_t m_l1_table_offset = 0; + qcow_format::LookupTable m_l1_table; + + uint32_t m_l2_bits = 0; + uint32_t m_l2_size = 0; + + uint32_t m_snapshot_count = 0; + uint64_t m_snapshots_offset = 0; + std::map m_snapshots; + + std::unique_ptr m_l2_table_cache; + std::unique_ptr m_cluster_cache; + + void handle_open(int r, Context* on_finish); + + void probe(Context* on_finish); + void handle_probe(int r, Context* on_finish); + +#ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + void read_v1_header(Context* on_finish); + void handle_read_v1_header(int r, Context* on_finish); +#endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + + void read_v2_header(Context* on_finish); + void handle_read_v2_header(int r, Context* on_finish); + + void read_snapshot(Context* on_finish); + void handle_read_snapshot(int r, Context* on_finish); + + void read_snapshot_extra(Context* on_finish); + void handle_read_snapshot_extra(int r, Context* on_finish); + + void read_snapshot_l1_table(Context* on_finish); + void handle_read_snapshot_l1_table(int r, Context* on_finish); + + void read_l1_table(Context* on_finish); + void handle_read_l1_table(int r, Context* 
on_finish); + + void read_backing_file(Context* on_finish); + + void handle_list_snaps(int r, io::Extents&& image_extents, + io::SnapIds&& snap_ids, + io::SnapshotDelta* snapshot_delta, Context* on_finish); +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::QCOWFormat; + +#endif // CEPH_LIBRBD_MIGRATION_QCOW_FORMAT_H diff --git a/src/librbd/migration/RawFormat.cc b/src/librbd/migration/RawFormat.cc new file mode 100644 index 000000000..0b655d368 --- /dev/null +++ b/src/librbd/migration/RawFormat.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/RawFormat.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ReadResult.h" +#include "librbd/migration/SnapshotInterface.h" +#include "librbd/migration/SourceSpecBuilder.h" +#include "librbd/migration/Utils.h" + +namespace librbd { +namespace migration { + +namespace { + +static const std::string SNAPSHOTS_KEY {"snapshots"}; + + +} // anonymous namespace + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::RawFormat: " << this \ + << " " << __func__ << ": " + +template +RawFormat::RawFormat( + I* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder) + : m_image_ctx(image_ctx), m_json_object(json_object), + m_source_spec_builder(source_spec_builder) { +} + +template +void RawFormat::open(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + on_finish = new LambdaContext([this, on_finish](int r) { + handle_open(r, on_finish); }); + + // treat the base image as a HEAD-revision snapshot + Snapshots snapshots; + int r = m_source_spec_builder->build_snapshot(m_json_object, CEPH_NOSNAP, + &snapshots[CEPH_NOSNAP]); + if (r < 0) { + lderr(cct) << "failed to build HEAD revision handler: " << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + auto& snapshots_val = m_json_object[SNAPSHOTS_KEY]; + if (snapshots_val.type() == json_spirit::array_type) { + auto& snapshots_arr = snapshots_val.get_array(); + for (auto& snapshot_val : snapshots_arr) { + uint64_t index = snapshots.size(); + if (snapshot_val.type() != json_spirit::obj_type) { + lderr(cct) << "invalid snapshot " << index << " JSON: " + << cpp_strerror(r) << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& snapshot_obj = snapshot_val.get_obj(); + r = m_source_spec_builder->build_snapshot(snapshot_obj, index, + &snapshots[index]); + if (r < 0) { + lderr(cct) << "failed to build snapshot " << index << " handler: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + } + } else if (snapshots_val.type() != json_spirit::null_type) { + lderr(cct) << "invalid snapshots array" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_snapshots = std::move(snapshots); + + auto gather_ctx = new C_Gather(cct, on_finish); + SnapshotInterface* previous_snapshot = nullptr; + for (auto& [_, snapshot] : m_snapshots) { + snapshot->open(previous_snapshot, gather_ctx->new_sub()); + previous_snapshot = snapshot.get(); + } + gather_ctx->activate(); +} + +template +void RawFormat::handle_open(int r, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to open raw 
image: " << cpp_strerror(r) + << dendl; + + auto gather_ctx = new C_Gather(cct, on_finish); + for (auto& [_, snapshot] : m_snapshots) { + snapshot->close(gather_ctx->new_sub()); + } + + m_image_ctx->state->close(new LambdaContext( + [r, on_finish=gather_ctx->new_sub()](int _) { on_finish->complete(r); })); + + gather_ctx->activate(); + return; + } + + on_finish->complete(0); +} + +template +void RawFormat::close(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto gather_ctx = new C_Gather(cct, on_finish); + for (auto& [snap_id, snapshot] : m_snapshots) { + snapshot->close(gather_ctx->new_sub()); + } + + gather_ctx->activate(); +} + +template +void RawFormat::get_snapshots(SnapInfos* snap_infos, Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + snap_infos->clear(); + for (auto& [snap_id, snapshot] : m_snapshots) { + if (snap_id == CEPH_NOSNAP) { + continue; + } + snap_infos->emplace(snap_id, snapshot->get_snap_info()); + } + on_finish->complete(0); +} + +template +void RawFormat::get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto snapshot_it = m_snapshots.find(snap_id); + if (snapshot_it == m_snapshots.end()) { + on_finish->complete(-ENOENT); + return; + } + + *size = snapshot_it->second->get_snap_info().size; + on_finish->complete(0); +} + +template +bool RawFormat::read( + io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents, + io::ReadResult&& read_result, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "snap_id=" << snap_id << ", " + << "image_extents=" << image_extents << dendl; + + auto snapshot_it = m_snapshots.find(snap_id); + if (snapshot_it == m_snapshots.end()) { + aio_comp->fail(-ENOENT); + return true; + } + + snapshot_it->second->read(aio_comp, std::move(image_extents), + std::move(read_result), op_flags, read_flags, + parent_trace); + return true; +} + +template +void RawFormat::list_snaps(io::Extents&& image_extents, + io::SnapIds&& snap_ids, int list_snaps_flags, + io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "image_extents=" << image_extents << dendl; + + on_finish = new LambdaContext([this, snap_ids=std::move(snap_ids), + snapshot_delta, on_finish](int r) mutable { + handle_list_snaps(r, std::move(snap_ids), snapshot_delta, on_finish); + }); + + auto gather_ctx = new C_Gather(cct, on_finish); + + std::optional previous_size = std::nullopt; + for (auto& [snap_id, snapshot] : m_snapshots) { + auto& sparse_extents = (*snapshot_delta)[{snap_id, snap_id}]; + + // zero out any space between the previous snapshot end and this + // snapshot's end + auto& snap_info = snapshot->get_snap_info(); + util::zero_shrunk_snapshot(cct, image_extents, snap_id, snap_info.size, + &previous_size, &sparse_extents); + + // build set of data/zeroed extents for the current snapshot + snapshot->list_snap(io::Extents{image_extents}, list_snaps_flags, + &sparse_extents, parent_trace, gather_ctx->new_sub()); + } + + gather_ctx->activate(); +} + +template +void RawFormat::handle_list_snaps(int r, io::SnapIds&& snap_ids, + io::SnapshotDelta* snapshot_delta, + Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 20) << "r=" << r << ", " + << "snapshot_delta=" << snapshot_delta << dendl; + + util::merge_snapshot_delta(snap_ids, 
snapshot_delta); + on_finish->complete(r); +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::RawFormat; diff --git a/src/librbd/migration/RawFormat.h b/src/librbd/migration/RawFormat.h new file mode 100644 index 000000000..a20c0814f --- /dev/null +++ b/src/librbd/migration/RawFormat.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_RAW_FORMAT_H +#define CEPH_LIBRBD_MIGRATION_RAW_FORMAT_H + +#include "include/int_types.h" +#include "librbd/Types.h" +#include "librbd/migration/FormatInterface.h" +#include "json_spirit/json_spirit.h" +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template struct SourceSpecBuilder; +struct SnapshotInterface; + +template +class RawFormat : public FormatInterface { +public: + static RawFormat* create( + ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder) { + return new RawFormat(image_ctx, json_object, source_spec_builder); + } + + RawFormat(ImageCtxT* image_ctx, const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder); + RawFormat(const RawFormat&) = delete; + RawFormat& operator=(const RawFormat&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_snapshots(SnapInfos* snap_infos, Context* on_finish) override; + void get_image_size(uint64_t snap_id, uint64_t* size, + Context* on_finish) override; + + bool read(io::AioCompletion* aio_comp, uint64_t snap_id, + io::Extents&& image_extents, io::ReadResult&& read_result, + int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) override; + + void list_snaps(io::Extents&& image_extents, io::SnapIds&& snap_ids, + int list_snaps_flags, io::SnapshotDelta* snapshot_delta, + const ZTracer::Trace &parent_trace, + Context* on_finish) override; + +private: + typedef std::shared_ptr Snapshot; + typedef std::map Snapshots; + + ImageCtxT* m_image_ctx; + json_spirit::mObject m_json_object; + const SourceSpecBuilder* m_source_spec_builder; + + Snapshots m_snapshots; + + void handle_open(int r, Context* on_finish); + + void handle_list_snaps(int r, io::SnapIds&& snap_ids, + io::SnapshotDelta* snapshot_delta, Context* on_finish); +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::RawFormat; + +#endif // CEPH_LIBRBD_MIGRATION_RAW_FORMAT_H diff --git a/src/librbd/migration/RawSnapshot.cc b/src/librbd/migration/RawSnapshot.cc new file mode 100644 index 000000000..4a83fd8ad --- /dev/null +++ b/src/librbd/migration/RawSnapshot.cc @@ -0,0 +1,220 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/RawSnapshot.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ReadResult.h" +#include "librbd/migration/SourceSpecBuilder.h" +#include "librbd/migration/StreamInterface.h" + +namespace librbd { +namespace migration { + +namespace { + +const std::string NAME_KEY{"name"}; + +} // anonymous namespace + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::RawSnapshot::OpenRequest " \ + << this << " " << __func__ << ": " + +template +struct RawSnapshot::OpenRequest { + 
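+  // Open sequence, following the @verbatim diagrams used elsewhere in this
+  // patch: open_stream() -> get_image_size() -> finish(0). An open failure
+  // goes straight to finish(r); a get-size failure detours through
+  // close_stream()/handle_close_stream() so the stream is torn down before
+  // finish(r) runs.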
RawSnapshot* raw_snapshot; + Context* on_finish; + + OpenRequest(RawSnapshot* raw_snapshot, Context* on_finish) + : raw_snapshot(raw_snapshot), on_finish(on_finish) { + } + + void send() { + open_stream(); + } + + void open_stream() { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = util::create_context_callback< + OpenRequest, &OpenRequest::handle_open_stream>(this); + raw_snapshot->m_stream->open(ctx); + } + + void handle_open_stream(int r) { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to open stream: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + get_image_size(); + } + + void get_image_size() { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = util::create_context_callback< + OpenRequest, &OpenRequest::handle_get_image_size>(this); + raw_snapshot->m_stream->get_size(&raw_snapshot->m_snap_info.size, ctx); + } + + void handle_get_image_size(int r) { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << ", " + << "image_size=" << raw_snapshot->m_snap_info.size << dendl; + + if (r < 0) { + lderr(cct) << "failed to open stream: " << cpp_strerror(r) << dendl; + close_stream(r); + return; + } + + finish(0); + } + + void close_stream(int r) { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = new LambdaContext([this, r](int) { + handle_close_stream(r); + }); + raw_snapshot->m_stream->close(ctx); + } + + void handle_close_stream(int r) { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + raw_snapshot->m_stream.reset(); + + finish(r); + } + + void finish(int r) { + auto cct = raw_snapshot->m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + on_finish->complete(r); + delete this; + } +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::RawSnapshot: " << this \ + << " " << __func__ << ": " + +template +RawSnapshot::RawSnapshot(I* image_ctx, + const json_spirit::mObject& json_object, + const SourceSpecBuilder* source_spec_builder, + uint64_t index) + : m_image_ctx(image_ctx), m_json_object(json_object), + m_source_spec_builder(source_spec_builder), m_index(index), + m_snap_info({}, {}, 0, {}, 0, 0, {}) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; +} + +template +void RawSnapshot::open(SnapshotInterface* previous_snapshot, + Context* on_finish) { + auto cct = m_image_ctx->cct; + + // special-case for treating the HEAD revision as a snapshot + if (m_index != CEPH_NOSNAP) { + auto& name_val = m_json_object[NAME_KEY]; + if (name_val.type() == json_spirit::str_type) { + m_snap_info.name = name_val.get_str(); + } else if (name_val.type() == json_spirit::null_type) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + + m_snap_info.name = uuid_gen.to_string(); + } else { + lderr(cct) << "invalid snapshot name" << dendl; + on_finish->complete(-EINVAL); + return; + } + } + + ldout(cct, 10) << "name=" << m_snap_info.name << dendl; + + int r = m_source_spec_builder->build_stream(m_json_object, &m_stream); + if (r < 0) { + lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r) + << dendl; + on_finish->complete(r); + return; + } + + auto req = new OpenRequest(this, on_finish); + req->send(); +} + +template +void RawSnapshot::close(Context* on_finish) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + if (!m_stream) { + on_finish->complete(0); + return; + } + + 
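+  // OpenRequest::handle_close_stream() resets m_stream when the open
+  // sequence fails part-way, so a null stream here simply means there is
+  // nothing left to tear down (handled by the early return above).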
m_stream->close(on_finish);
+}
+
+template <typename I>
+void RawSnapshot<I>::read(io::AioCompletion* aio_comp,
+                          io::Extents&& image_extents,
+                          io::ReadResult&& read_result, int op_flags,
+                          int read_flags,
+                          const ZTracer::Trace &parent_trace) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+  aio_comp->read_result = std::move(read_result);
+  aio_comp->read_result.set_image_extents(image_extents);
+
+  aio_comp->set_request_count(1);
+  auto ctx = new io::ReadResult::C_ImageReadRequest(aio_comp, 0,
+                                                    image_extents);
+
+  // raw directly maps the image-extent IO down to a byte IO extent
+  m_stream->read(std::move(image_extents), &ctx->bl, ctx);
+}
+
+template <typename I>
+void RawSnapshot<I>::list_snap(io::Extents&& image_extents,
+                               int list_snaps_flags,
+                               io::SparseExtents* sparse_extents,
+                               const ZTracer::Trace &parent_trace,
+                               Context* on_finish) {
+  auto cct = m_image_ctx->cct;
+  ldout(cct, 20) << "image_extents=" << image_extents << dendl;
+
+  // raw does not support sparse extents, so list the full IO extent as a
+  // delta
+  for (auto& [image_offset, image_length] : image_extents) {
+    sparse_extents->insert(image_offset, image_length,
+                           {io::SPARSE_EXTENT_STATE_DATA, image_length});
+  }
+
+  on_finish->complete(0);
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::RawSnapshot<librbd::ImageCtx>;
diff --git a/src/librbd/migration/RawSnapshot.h b/src/librbd/migration/RawSnapshot.h
new file mode 100644
index 000000000..9f76d6878
--- /dev/null
+++ b/src/librbd/migration/RawSnapshot.h
@@ -0,0 +1,75 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_RAW_SNAPSHOT_H
+#define CEPH_LIBRBD_MIGRATION_RAW_SNAPSHOT_H
+
+#include "include/buffer_fwd.h"
+#include "include/int_types.h"
+#include "common/zipkin_trace.h"
+#include "librbd/Types.h"
+#include "librbd/io/Types.h"
+#include "librbd/migration/SnapshotInterface.h"
+#include "json_spirit/json_spirit.h"
+#include <memory>
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> struct SourceSpecBuilder;
+struct StreamInterface;
+
+template <typename ImageCtxT>
+class RawSnapshot : public SnapshotInterface {
+public:
+  static RawSnapshot* create(
+      ImageCtx* image_ctx, const json_spirit::mObject& json_object,
+      const SourceSpecBuilder<ImageCtxT>* source_spec_builder,
+      uint64_t index) {
+    return new RawSnapshot(image_ctx, json_object, source_spec_builder,
+                           index);
+  }
+
+  RawSnapshot(ImageCtxT* image_ctx, const json_spirit::mObject& json_object,
+              const SourceSpecBuilder<ImageCtxT>* source_spec_builder,
+              uint64_t index);
+  RawSnapshot(const RawSnapshot&) = delete;
+  RawSnapshot& operator=(const RawSnapshot&) = delete;
+
+  void open(SnapshotInterface* previous_snapshot, Context* on_finish) override;
+  void close(Context* on_finish) override;
+
+  const SnapInfo& get_snap_info() const override {
+    return m_snap_info;
+  }
+
+  void read(io::AioCompletion* aio_comp, io::Extents&& image_extents,
+            io::ReadResult&& read_result, int op_flags, int read_flags,
+            const ZTracer::Trace &parent_trace) override;
+
+  void list_snap(io::Extents&& image_extents, int list_snaps_flags,
+                 io::SparseExtents* sparse_extents,
+                 const ZTracer::Trace &parent_trace,
+                 Context* on_finish) override;
+
+private:
+  struct OpenRequest;
+
+  ImageCtxT* m_image_ctx;
+  json_spirit::mObject m_json_object;
+  const SourceSpecBuilder<ImageCtxT>* m_source_spec_builder;
+  uint64_t m_index = 0;
+
+  SnapInfo m_snap_info;
+
+  std::shared_ptr<StreamInterface> m_stream;
+
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern
template class librbd::migration::RawSnapshot; + +#endif // CEPH_LIBRBD_MIGRATION_RAW_SNAPSHOT_H diff --git a/src/librbd/migration/S3Stream.cc b/src/librbd/migration/S3Stream.cc new file mode 100644 index 000000000..a611e274a --- /dev/null +++ b/src/librbd/migration/S3Stream.cc @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/S3Stream.h" +#include "common/armor.h" +#include "common/ceph_crypto.h" +#include "common/ceph_time.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsioEngine.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/asio/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ReadResult.h" +#include "librbd/migration/HttpClient.h" +#include "librbd/migration/HttpProcessorInterface.h" +#include + +#include +#include + +#include + +namespace librbd { +namespace migration { + +using HttpRequest = boost::beast::http::request; + +namespace { + +const std::string URL_KEY {"url"}; +const std::string ACCESS_KEY {"access_key"}; +const std::string SECRET_KEY {"secret_key"}; + +} // anonymous namespace + +template +struct S3Stream::HttpProcessor : public HttpProcessorInterface { + S3Stream* s3stream; + + HttpProcessor(S3Stream* s3stream) : s3stream(s3stream) { + } + + void process_request(EmptyRequest& request) override { + s3stream->process_request(request); + } +}; + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::S3Stream: " << this \ + << " " << __func__ << ": " + +template +S3Stream::S3Stream(I* image_ctx, const json_spirit::mObject& json_object) + : m_image_ctx(image_ctx), m_cct(image_ctx->cct), + m_asio_engine(image_ctx->asio_engine), m_json_object(json_object), + m_http_processor(std::make_unique(this)) { +} + +template +S3Stream::~S3Stream() { +} + +template +void S3Stream::open(Context* on_finish) { + auto& url_value = m_json_object[URL_KEY]; + if (url_value.type() != json_spirit::str_type) { + lderr(m_cct) << "failed to locate '" << URL_KEY << "' key" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& access_key = m_json_object[ACCESS_KEY]; + if (access_key.type() != json_spirit::str_type) { + lderr(m_cct) << "failed to locate '" << ACCESS_KEY << "' key" << dendl; + on_finish->complete(-EINVAL); + return; + } + + auto& secret_key = m_json_object[SECRET_KEY]; + if (secret_key.type() != json_spirit::str_type) { + lderr(m_cct) << "failed to locate '" << SECRET_KEY << "' key" << dendl; + on_finish->complete(-EINVAL); + return; + } + + m_url = url_value.get_str(); + + librados::Rados rados(m_image_ctx->md_ctx); + int r = 0; + m_access_key = access_key.get_str(); + if (util::is_config_key_uri(m_access_key)) { + r = util::get_config_key(rados, m_access_key, &m_access_key); + if (r < 0) { + lderr(m_cct) << "failed to retrieve access key from config: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + } + + m_secret_key = secret_key.get_str(); + if (util::is_config_key_uri(m_secret_key)) { + r = util::get_config_key(rados, m_secret_key, &m_secret_key); + if (r < 0) { + lderr(m_cct) << "failed to retrieve secret key from config: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + } + + ldout(m_cct, 10) << "url=" << m_url << ", " + << "access_key=" << m_access_key << dendl; + + m_http_client.reset(HttpClient::create(m_image_ctx, m_url)); + m_http_client->set_http_processor(m_http_processor.get()); + 
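+  // Every request the HttpClient issues from this point on is routed
+  // through process_request() below, which stamps it with a Date header
+  // and an AWS v2-style signature, e.g. (illustrative values only):
+  //
+  //   Date: Tue, 01 Jun 2021 00:00:00 +0000
+  //   Authorization: AWS <access-key>:<base64 HMAC-SHA1 of string-to-sign>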
m_http_client->open(on_finish); +} + +template +void S3Stream::close(Context* on_finish) { + ldout(m_cct, 10) << dendl; + + if (!m_http_client) { + on_finish->complete(0); + return; + } + + m_http_client->close(on_finish); +} + +template +void S3Stream::get_size(uint64_t* size, Context* on_finish) { + ldout(m_cct, 10) << dendl; + + m_http_client->get_size(size, on_finish); +} + +template +void S3Stream::read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) { + ldout(m_cct, 20) << "byte_extents=" << byte_extents << dendl; + + m_http_client->read(std::move(byte_extents), data, on_finish); +} + +template +void S3Stream::process_request(HttpRequest& http_request) { + ldout(m_cct, 20) << dendl; + + // format RFC 1123 date/time + auto time = ceph::real_clock::to_time_t(ceph::real_clock::now()); + struct tm timeInfo; + gmtime_r(&time, &timeInfo); + + std::string date = fmt::format("{:%a, %d %b %Y %H:%M:%S %z}", timeInfo); + http_request.set(boost::beast::http::field::date, date); + + // note: we don't support S3 subresources + std::string canonicalized_resource = std::string(http_request.target()); + + std::string string_to_sign = fmt::format( + "{}\n\n\n{}\n{}", + std::string(boost::beast::http::to_string(http_request.method())), + date, canonicalized_resource); + + // create HMAC-SHA1 signature from secret key + string-to-sign + sha1_digest_t digest; + ceph::crypto::HMACSHA1 hmac( + reinterpret_cast(m_secret_key.data()), + m_secret_key.size()); + hmac.Update(reinterpret_cast(string_to_sign.data()), + string_to_sign.size()); + hmac.Final(reinterpret_cast(digest.v)); + + // base64 encode the result + char buf[64]; + int r = ceph_armor(std::begin(buf), std::begin(buf) + sizeof(buf), + reinterpret_cast(digest.v), + reinterpret_cast(digest.v + digest.SIZE)); + if (r < 0) { + ceph_abort("ceph_armor failed"); + } + + // store the access-key + signature in the HTTP authorization header + std::string signature = std::string(std::begin(buf), std::begin(buf) + r); + std::string authorization = fmt::format("AWS {}:{}", m_access_key, signature); + http_request.set(boost::beast::http::field::authorization, authorization); + + ldout(m_cct, 20) << "string_to_sign=" << string_to_sign << ", " + << "authorization=" << authorization << dendl; +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::S3Stream; diff --git a/src/librbd/migration/S3Stream.h b/src/librbd/migration/S3Stream.h new file mode 100644 index 000000000..586b21787 --- /dev/null +++ b/src/librbd/migration/S3Stream.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_S3_STREAM_H +#define CEPH_LIBRBD_MIGRATION_S3_STREAM_H + +#include "include/int_types.h" +#include "librbd/migration/StreamInterface.h" +#include +#include +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct AsioEngine; +struct ImageCtx; + +namespace migration { + +template class HttpClient; + +template +class S3Stream : public StreamInterface { +public: + static S3Stream* create(ImageCtxT* image_ctx, + const json_spirit::mObject& json_object) { + return new S3Stream(image_ctx, json_object); + } + + S3Stream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object); + ~S3Stream() override; + + S3Stream(const S3Stream&) = delete; + S3Stream& operator=(const S3Stream&) = delete; + + void open(Context* on_finish) override; + void close(Context* on_finish) override; + + void get_size(uint64_t* 
size, Context* on_finish) override; + + void read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) override; + +private: + using HttpRequest = boost::beast::http::request< + boost::beast::http::empty_body>; + using HttpResponse = boost::beast::http::response< + boost::beast::http::string_body>; + + struct HttpProcessor; + + ImageCtxT* m_image_ctx; + CephContext* m_cct; + std::shared_ptr m_asio_engine; + json_spirit::mObject m_json_object; + + std::string m_url; + std::string m_access_key; + std::string m_secret_key; + + std::unique_ptr m_http_processor; + std::unique_ptr> m_http_client; + + void process_request(HttpRequest& http_request); + +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::S3Stream; + +#endif // CEPH_LIBRBD_MIGRATION_S3_STREAM_H diff --git a/src/librbd/migration/SnapshotInterface.h b/src/librbd/migration/SnapshotInterface.h new file mode 100644 index 000000000..9990802c5 --- /dev/null +++ b/src/librbd/migration/SnapshotInterface.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_SNAPSHOT_INTERFACE_H +#define CEPH_LIBRBD_MIGRATION_SNAPSHOT_INTERFACE_H + +#include "include/buffer_fwd.h" +#include "include/int_types.h" +#include "common/zipkin_trace.h" +#include "librbd/Types.h" +#include "librbd/io/Types.h" +#include + +struct Context; + +namespace librbd { + +namespace io { +struct AioCompletion; +struct ReadResult; +} // namespace io + +namespace migration { + +struct SnapshotInterface { + virtual ~SnapshotInterface() { + } + + virtual void open(SnapshotInterface* previous_snapshot, + Context* on_finish) = 0; + virtual void close(Context* on_finish) = 0; + + virtual const SnapInfo& get_snap_info() const = 0; + + virtual void read(io::AioCompletion* aio_comp, io::Extents&& image_extents, + io::ReadResult&& read_result, int op_flags, int read_flags, + const ZTracer::Trace &parent_trace) = 0; + + virtual void list_snap(io::Extents&& image_extents, int list_snaps_flags, + io::SparseExtents* sparse_extents, + const ZTracer::Trace &parent_trace, + Context* on_finish) = 0; +}; + +} // namespace migration +} // namespace librbd + +#endif // CEPH_LIBRBD_MIGRATION_SNAPSHOT_INTERFACE_H diff --git a/src/librbd/migration/SourceSpecBuilder.cc b/src/librbd/migration/SourceSpecBuilder.cc new file mode 100644 index 000000000..214d7ce0e --- /dev/null +++ b/src/librbd/migration/SourceSpecBuilder.cc @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/SourceSpecBuilder.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/migration/FileStream.h" +#include "librbd/migration/HttpStream.h" +#include "librbd/migration/S3Stream.h" +#include "librbd/migration/NativeFormat.h" +#include "librbd/migration/QCOWFormat.h" +#include "librbd/migration/RawFormat.h" +#include "librbd/migration/RawSnapshot.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::SourceSpecBuilder: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace migration { + +namespace { + +const std::string STREAM_KEY{"stream"}; +const std::string TYPE_KEY{"type"}; + +} // anonymous namespace + +template +int SourceSpecBuilder::parse_source_spec( + const std::string& source_spec, + json_spirit::mObject* source_spec_object) const { + auto cct = m_image_ctx->cct; + ldout(cct, 10) 
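+                 // For illustration, the expected calling sequence for this
+                 // builder (hypothetical caller, not part of this patch):
+                 //
+                 //   json_spirit::mObject spec;
+                 //   SourceSpecBuilder<ImageCtx> builder(image_ctx);
+                 //   int r = builder.parse_source_spec(source_spec, &spec);
+                 //   std::unique_ptr<FormatInterface> format;
+                 //   if (r == 0) {
+                 //     r = builder.build_format(spec, import_only, &format);
+                 //   }
+                 //
+                 // -EBADMSG is reserved for malformed JSON here, while the
+                 // build_*() helpers below report -EINVAL or -ENOSYS.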
<< dendl; + + json_spirit::mValue json_root; + if(json_spirit::read(source_spec, json_root)) { + try { + *source_spec_object = json_root.get_obj(); + return 0; + } catch (std::runtime_error&) { + } + } + + lderr(cct) << "invalid source-spec JSON" << dendl; + return -EBADMSG; +} + +template +int SourceSpecBuilder::build_format( + const json_spirit::mObject& source_spec_object, bool import_only, + std::unique_ptr* format) const { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto type_value_it = source_spec_object.find(TYPE_KEY); + if (type_value_it == source_spec_object.end() || + type_value_it->second.type() != json_spirit::str_type) { + lderr(cct) << "failed to locate format type value" << dendl; + return -EINVAL; + } + + auto& type = type_value_it->second.get_str(); + if (type == "native") { + format->reset(NativeFormat::create(m_image_ctx, source_spec_object, + import_only)); + } else if (type == "qcow") { + format->reset(QCOWFormat::create(m_image_ctx, source_spec_object, this)); + } else if (type == "raw") { + format->reset(RawFormat::create(m_image_ctx, source_spec_object, this)); + } else { + lderr(cct) << "unknown or unsupported format type '" << type << "'" + << dendl; + return -ENOSYS; + } + return 0; +} + +template +int SourceSpecBuilder::build_snapshot( + const json_spirit::mObject& source_spec_object, uint64_t index, + std::shared_ptr* snapshot) const { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto type_value_it = source_spec_object.find(TYPE_KEY); + if (type_value_it == source_spec_object.end() || + type_value_it->second.type() != json_spirit::str_type) { + lderr(cct) << "failed to locate snapshot type value" << dendl; + return -EINVAL; + } + + auto& type = type_value_it->second.get_str(); + if (type == "raw") { + snapshot->reset(RawSnapshot::create(m_image_ctx, source_spec_object, + this, index)); + } else { + lderr(cct) << "unknown or unsupported format type '" << type << "'" + << dendl; + return -ENOSYS; + } + return 0; +} + +template +int SourceSpecBuilder::build_stream( + const json_spirit::mObject& source_spec_object, + std::shared_ptr* stream) const { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto stream_value_it = source_spec_object.find(STREAM_KEY); + if (stream_value_it == source_spec_object.end() || + stream_value_it->second.type() != json_spirit::obj_type) { + lderr(cct) << "failed to locate stream object" << dendl; + return -EINVAL; + } + + auto& stream_obj = stream_value_it->second.get_obj(); + auto type_value_it = stream_obj.find(TYPE_KEY); + if (type_value_it == stream_obj.end() || + type_value_it->second.type() != json_spirit::str_type) { + lderr(cct) << "failed to locate stream type value" << dendl; + return -EINVAL; + } + + auto& type = type_value_it->second.get_str(); + if (type == "file") { + stream->reset(FileStream::create(m_image_ctx, stream_obj)); + } else if (type == "http") { + stream->reset(HttpStream::create(m_image_ctx, stream_obj)); + } else if (type == "s3") { + stream->reset(S3Stream::create(m_image_ctx, stream_obj)); + } else { + lderr(cct) << "unknown or unsupported stream type '" << type << "'" + << dendl; + return -ENOSYS; + } + + return 0; +} + +} // namespace migration +} // namespace librbd + +template class librbd::migration::SourceSpecBuilder; diff --git a/src/librbd/migration/SourceSpecBuilder.h b/src/librbd/migration/SourceSpecBuilder.h new file mode 100644 index 000000000..191cb1cbd --- /dev/null +++ b/src/librbd/migration/SourceSpecBuilder.h @@ -0,0 +1,54 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_SOURCE_SPEC_BUILDER_H +#define CEPH_LIBRBD_MIGRATION_SOURCE_SPEC_BUILDER_H + +#include "include/int_types.h" +#include +#include +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace migration { + +struct FormatInterface; +struct SnapshotInterface; +struct StreamInterface; + +template +class SourceSpecBuilder { +public: + SourceSpecBuilder(ImageCtxT* image_ctx) : m_image_ctx(image_ctx) { + } + + int parse_source_spec(const std::string& source_spec, + json_spirit::mObject* source_spec_object) const; + + int build_format(const json_spirit::mObject& format_object, bool import_only, + std::unique_ptr* format) const; + + int build_snapshot(const json_spirit::mObject& source_spec_object, + uint64_t index, + std::shared_ptr* snapshot) const; + + int build_stream(const json_spirit::mObject& source_spec_object, + std::shared_ptr* stream) const; + +private: + ImageCtxT* m_image_ctx; + +}; + +} // namespace migration +} // namespace librbd + +extern template class librbd::migration::SourceSpecBuilder; + +#endif // CEPH_LIBRBD_MIGRATION_SOURCE_SPEC_BUILDER_H diff --git a/src/librbd/migration/StreamInterface.h b/src/librbd/migration/StreamInterface.h new file mode 100644 index 000000000..782a9a5f8 --- /dev/null +++ b/src/librbd/migration/StreamInterface.h @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_STREAM_INTERFACE_H +#define CEPH_LIBRBD_MIGRATION_STREAM_INTERFACE_H + +#include "include/buffer_fwd.h" +#include "include/int_types.h" +#include "librbd/io/Types.h" + +struct Context; + +namespace librbd { +namespace migration { + +struct StreamInterface { + virtual ~StreamInterface() { + } + + virtual void open(Context* on_finish) = 0; + virtual void close(Context* on_finish) = 0; + + virtual void get_size(uint64_t* size, Context* on_finish) = 0; + + virtual void read(io::Extents&& byte_extents, bufferlist* data, + Context* on_finish) = 0; +}; + +} // namespace migration +} // namespace librbd + +#endif // CEPH_LIBRBD_MIGRATION_STREAM_INTERFACE_H diff --git a/src/librbd/migration/Types.h b/src/librbd/migration/Types.h new file mode 100644 index 000000000..244dc28b7 --- /dev/null +++ b/src/librbd/migration/Types.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIGRATION_TYPES_H +#define CEPH_LIBRBD_MIGRATION_TYPES_H + +#include +#include + +namespace librbd { +namespace migration { + +enum UrlScheme { + URL_SCHEME_HTTP, + URL_SCHEME_HTTPS, +}; + +struct UrlSpec { + UrlSpec() {} + UrlSpec(UrlScheme scheme, const std::string& host, const std::string& port, + const std::string& path) + : scheme(scheme), host(host), port(port), path(path) { + } + + UrlScheme scheme = URL_SCHEME_HTTP; + std::string host; + std::string port = "80"; + std::string path = "/"; + +}; + +inline bool operator==(const UrlSpec& lhs, const UrlSpec& rhs) { + return (lhs.scheme == rhs.scheme && + lhs.host == rhs.host && + lhs.port == rhs.port && + lhs.path == rhs.path); +} + +} // namespace migration +} // namespace librbd + +#endif // CEPH_LIBRBD_MIGRATION_TYPES_H diff --git a/src/librbd/migration/Utils.cc b/src/librbd/migration/Utils.cc new file mode 100644 index 000000000..c5c1279d8 --- /dev/null +++ b/src/librbd/migration/Utils.cc @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/migration/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include +#include + +namespace librbd { +namespace migration { +namespace util { + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::migration::util::" << __func__ << ": " + +int parse_url(CephContext* cct, const std::string& url, UrlSpec* url_spec) { + ldout(cct, 10) << "url=" << url << dendl; + *url_spec = UrlSpec{}; + + // parse the provided URL (scheme, user, password, host, port, path, + // parameters, query, and fragment) + std::regex url_regex( + R"(^(?:([^:/]*)://)?(?:(\w+)(?::(\w+))?@)?([^/;\?:#]+)(?::([^/;\?#]+))?)" + R"((?:/([^;\?#]*))?(?:;([^\?#]+))?(?:\?([^#]+))?(?:#(\w+))?$)"); + std::smatch match; + if(!std::regex_match(url, match, url_regex)) { + lderr(cct) << "invalid url: '" << url << "'" << dendl; + return -EINVAL; + } + + auto& scheme = match[1]; + if (scheme == "http" || scheme == "") { + url_spec->scheme = URL_SCHEME_HTTP; + } else if (scheme == "https") { + url_spec->scheme = URL_SCHEME_HTTPS; + url_spec->port = "443"; + } else { + lderr(cct) << "invalid url scheme: '" << url << "'" << dendl; + return -EINVAL; + } + + url_spec->host = match[4]; + auto& port = match[5]; + if (port.matched) { + try { + boost::lexical_cast(port); + } catch (boost::bad_lexical_cast&) { + lderr(cct) << "invalid url port: '" << url << "'" << dendl; + return -EINVAL; + } + url_spec->port = port; + } + + auto& path = match[6]; + if (path.matched) { + url_spec->path += path; + } + return 0; +} + +void zero_shrunk_snapshot(CephContext* cct, const io::Extents& image_extents, + uint64_t snap_id, uint64_t new_size, + std::optional *previous_size, + io::SparseExtents* sparse_extents) { + if (*previous_size && **previous_size > new_size) { + ldout(cct, 20) << "snapshot resize " << **previous_size << " -> " + << new_size << dendl; + interval_set zero_interval; + zero_interval.insert(new_size, **previous_size - new_size); + + for (auto& image_extent : image_extents) { + interval_set image_interval; + image_interval.insert(image_extent.first, image_extent.second); + + image_interval.intersection_of(zero_interval); + for (auto [image_offset, image_length] : image_interval) { + ldout(cct, 20) << "zeroing extent " << image_offset << "~" + << image_length << " at snapshot " << snap_id << dendl; + sparse_extents->insert(image_offset, image_length, + {io::SPARSE_EXTENT_STATE_ZEROED, image_length}); + } + } + } + *previous_size = new_size; +} + +void merge_snapshot_delta(const io::SnapIds& snap_ids, + io::SnapshotDelta* snapshot_delta) { + io::SnapshotDelta orig_snapshot_delta = std::move(*snapshot_delta); + snapshot_delta->clear(); + + auto snap_id_it = snap_ids.begin(); + ceph_assert(snap_id_it != snap_ids.end()); + + // merge any snapshot intervals that were not requested + std::list pending_sparse_extents; + for (auto& [snap_key, sparse_extents] : orig_snapshot_delta) { + // advance to next valid requested snap id + while (snap_id_it != snap_ids.end() && *snap_id_it < snap_key.first) { + ++snap_id_it; + } + if (snap_id_it == snap_ids.end()) { + break; + } + + // loop through older write/read snapshot sparse extents to remove any + // overlaps with the current sparse extent + for (auto prev_sparse_extents : pending_sparse_extents) { + for (auto& sparse_extent : sparse_extents) { + prev_sparse_extents->erase(sparse_extent.get_off(), + sparse_extent.get_len()); + } + } + + auto write_read_snap_ids = 
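+    // Delta keys are (write snap id, read snap id) pairs; re-keying to
+    // *snap_id_it rolls an unrequested write snapshot forward into the
+    // next snapshot the caller actually asked for.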
std::make_pair(*snap_id_it, snap_key.second);
+    (*snapshot_delta)[write_read_snap_ids] = std::move(sparse_extents);
+
+    if (write_read_snap_ids.first > snap_key.first) {
+      // the current snapshot wasn't requested so it might need to get
+      // merged with a later snapshot
+      pending_sparse_extents.push_back(&(*snapshot_delta)[write_read_snap_ids]);
+    } else {
+      // we don't merge results past a valid requested snapshot
+      pending_sparse_extents.clear();
+    }
+  }
+}
+
+} // namespace util
+} // namespace migration
+} // namespace librbd
diff --git a/src/librbd/migration/Utils.h b/src/librbd/migration/Utils.h
new file mode 100644
index 000000000..afbadde7d
--- /dev/null
+++ b/src/librbd/migration/Utils.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_UTILS_H
+#define CEPH_LIBRBD_MIGRATION_UTILS_H
+
+#include "include/common_fwd.h"
+#include "librbd/io/Types.h"
+#include "librbd/migration/Types.h"
+#include <optional>
+#include <string>
+
+namespace librbd {
+namespace migration {
+namespace util {
+
+int parse_url(CephContext* cct, const std::string& url, UrlSpec* url_spec);
+
+void zero_shrunk_snapshot(CephContext* cct, const io::Extents& image_extents,
+                          uint64_t snap_id, uint64_t new_size,
+                          std::optional<uint64_t> *previous_size,
+                          io::SparseExtents* sparse_extents);
+void merge_snapshot_delta(const io::SnapIds& snap_ids,
+                          io::SnapshotDelta* snapshot_delta);
+
+} // namespace util
+} // namespace migration
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_MIGRATION_UTILS_H
diff --git a/src/librbd/mirror/DemoteRequest.cc b/src/librbd/mirror/DemoteRequest.cc
new file mode 100644
index 000000000..350a76d83
--- /dev/null
+++ b/src/librbd/mirror/DemoteRequest.cc
@@ -0,0 +1,216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/DemoteRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/mirror/snapshot/DemoteRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::DemoteRequest: " << this \
+                           << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+void DemoteRequest<I>::send() {
+  get_info();
+}
+
+template <typename I>
+void DemoteRequest<I>::get_info() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << dendl;
+
+  auto ctx = create_context_callback<
+    DemoteRequest<I>, &DemoteRequest<I>::handle_get_info>(this);
+  auto req = GetInfoRequest<I>::create(m_image_ctx, &m_mirror_image,
+                                       &m_promotion_state,
+                                       &m_primary_mirror_uuid, ctx);
+  req->send();
+}
+
+template <typename I>
+void DemoteRequest<I>::handle_get_info(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r)
+               << dendl;
+    finish(r);
+    return;
+  } else if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+    lderr(cct) << "mirroring is not currently enabled" << dendl;
+    finish(-EINVAL);
+    return;
+  } else if (m_promotion_state != PROMOTION_STATE_PRIMARY) {
+    lderr(cct) << "image is not primary" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  acquire_lock();
+}
+
+template <typename I>
+void
DemoteRequest::acquire_lock() { + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.owner_lock.lock_shared(); + if (m_image_ctx.exclusive_lock == nullptr) { + m_image_ctx.owner_lock.unlock_shared(); + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + lderr(cct) << "exclusive lock is not active" << dendl; + finish(-EINVAL); + } else { + demote(); + } + return; + } + + // avoid accepting new requests from peers while we demote + // the image + m_image_ctx.exclusive_lock->block_requests(0); + m_blocked_requests = true; + + if (m_image_ctx.exclusive_lock->is_lock_owner()) { + m_image_ctx.owner_lock.unlock_shared(); + demote(); + return; + } + + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest, + &DemoteRequest::handle_acquire_lock>(this, m_image_ctx.exclusive_lock); + m_image_ctx.exclusive_lock->acquire_lock(ctx); + m_image_ctx.owner_lock.unlock_shared(); +} + +template +void DemoteRequest::handle_acquire_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to lock image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + m_image_ctx.owner_lock.lock_shared(); + if (m_image_ctx.exclusive_lock != nullptr && + !m_image_ctx.exclusive_lock->is_lock_owner()) { + r = m_image_ctx.exclusive_lock->get_unlocked_op_error(); + m_image_ctx.owner_lock.unlock_shared(); + lderr(cct) << "failed to acquire exclusive lock" << dendl; + finish(r); + return; + } + m_image_ctx.owner_lock.unlock_shared(); + + demote(); +} + +template +void DemoteRequest::demote() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + DemoteRequest, &DemoteRequest::handle_demote>(this); + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + Journal::demote(&m_image_ctx, ctx); + } else if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + auto req = mirror::snapshot::DemoteRequest::create( + &m_image_ctx, m_mirror_image.global_image_id, ctx); + req->send(); + } else { + lderr(cct) << "unknown image mirror mode: " << m_mirror_image.mode << dendl; + m_ret_val = -EOPNOTSUPP; + release_lock(); + } +} + +template +void DemoteRequest::handle_demote(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + m_ret_val = r; + lderr(cct) << "failed to demote image: " << cpp_strerror(r) << dendl; + } + + release_lock(); +} + +template +void DemoteRequest::release_lock() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + m_image_ctx.owner_lock.lock_shared(); + if (m_image_ctx.exclusive_lock == nullptr) { + m_image_ctx.owner_lock.unlock_shared(); + finish(0); + return; + } + + auto ctx = create_context_callback< + DemoteRequest, + &DemoteRequest::handle_release_lock>(this, m_image_ctx.exclusive_lock); + m_image_ctx.exclusive_lock->release_lock(ctx); + m_image_ctx.owner_lock.unlock_shared(); +} + +template +void DemoteRequest::handle_release_lock(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template +void DemoteRequest::finish(int r) { + if (m_ret_val < 0) { + r = m_ret_val; + } + + { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + if (m_blocked_requests && m_image_ctx.exclusive_lock != nullptr) { + m_image_ctx.exclusive_lock->unblock_requests(); + } + } + + CephContext *cct = 
m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::DemoteRequest; diff --git a/src/librbd/mirror/DemoteRequest.h b/src/librbd/mirror/DemoteRequest.h new file mode 100644 index 000000000..ab9239068 --- /dev/null +++ b/src/librbd/mirror/DemoteRequest.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/mirror/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template +class DemoteRequest { +public: + static DemoteRequest *create(ImageCtxT &image_ctx, Context *on_finish) { + return new DemoteRequest(image_ctx, on_finish); + } + + DemoteRequest(ImageCtxT &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_INFO + * | + * v + * ACQUIRE_LOCK * * * * + * | * + * v * + * DEMOTE * + * | * + * v * + * RELEASE_LOCK * + * | * + * v * + * < * * * * * + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + int m_ret_val = 0; + bool m_blocked_requests = false; + + cls::rbd::MirrorImage m_mirror_image; + PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY; + std::string m_primary_mirror_uuid; + + void get_info(); + void handle_get_info(int r); + + void acquire_lock(); + void handle_acquire_lock(int r); + + void demote(); + void handle_demote(int r); + + void release_lock(); + void handle_release_lock(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::DemoteRequest; + +#endif // CEPH_LIBRBD_MIRROR_DEMOTE_REQUEST_H diff --git a/src/librbd/mirror/DisableRequest.cc b/src/librbd/mirror/DisableRequest.cc new file mode 100644 index 000000000..09378ce58 --- /dev/null +++ b/src/librbd/mirror/DisableRequest.cc @@ -0,0 +1,479 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/DisableRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/journal/cls_journal_client.h" +#include "journal/Journaler.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/journal/PromoteRequest.h" +#include "librbd/mirror/GetInfoRequest.h" +#include "librbd/mirror/ImageRemoveRequest.h" +#include "librbd/mirror/ImageStateUpdateRequest.h" +#include "librbd/mirror/snapshot/PromoteRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::DisableRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using util::create_rados_callback; + +template +DisableRequest::DisableRequest(I *image_ctx, bool force, bool remove, + Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_remove(remove), + m_on_finish(on_finish) { +} + +template +void DisableRequest::send() { + send_get_mirror_info(); +} + +template +void DisableRequest::send_get_mirror_info() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + + using klass = DisableRequest; + Context *ctx = 
util::create_context_callback< + klass, &klass::handle_get_mirror_info>(this); + + auto req = GetInfoRequest::create(*m_image_ctx, &m_mirror_image, + &m_promotion_state, + &m_primary_mirror_uuid, ctx); + req->send(); +} + +template +Context *DisableRequest::handle_get_mirror_info(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + if (*result < 0) { + if (*result == -ENOENT) { + ldout(cct, 20) << "mirroring is not enabled for this image" << dendl; + *result = 0; + } else { + lderr(cct) << "failed to get mirroring info: " << cpp_strerror(*result) + << dendl; + } + return m_on_finish; + } + + m_is_primary = (m_promotion_state == PROMOTION_STATE_PRIMARY || + m_promotion_state == PROMOTION_STATE_UNKNOWN); + + if (!m_is_primary && !m_force) { + lderr(cct) << "mirrored image is not primary, " + << "add force option to disable mirroring" << dendl; + *result = -EINVAL; + return m_on_finish; + } + + send_image_state_update(); + return nullptr; +} + +template +void DisableRequest::send_image_state_update() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = util::create_context_callback< + DisableRequest, + &DisableRequest::handle_image_state_update>(this); + auto req = ImageStateUpdateRequest::create( + m_image_ctx->md_ctx, m_image_ctx->id, + cls::rbd::MIRROR_IMAGE_STATE_DISABLING, m_mirror_image, ctx); + req->send(); +} + +template +Context *DisableRequest::handle_image_state_update(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to disable mirroring: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + send_promote_image(); + return nullptr; +} + +template +void DisableRequest::send_promote_image() { + if (m_is_primary) { + clean_mirror_state(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = util::create_context_callback< + DisableRequest, &DisableRequest::handle_promote_image>(this); + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + // Not primary -- shouldn't have the journal open + ceph_assert(m_image_ctx->journal == nullptr); + + auto req = journal::PromoteRequest::create(m_image_ctx, true, ctx); + req->send(); + } else if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + auto req = mirror::snapshot::PromoteRequest::create( + m_image_ctx, m_mirror_image.global_image_id, ctx); + req->send(); + } else { + lderr(cct) << "unknown image mirror mode: " << m_mirror_image.mode << dendl; + ctx->complete(-EOPNOTSUPP); + } +} + +template +Context *DisableRequest::handle_promote_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to promote image: " << cpp_strerror(*result) << dendl; + return m_on_finish; + } + + send_refresh_image(); + return nullptr; +} + +template +void DisableRequest::send_refresh_image() { + if (!m_image_ctx->state->is_refresh_required()) { + clean_mirror_state(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = util::create_context_callback< + DisableRequest, + &DisableRequest::handle_refresh_image>(this); + m_image_ctx->state->refresh(ctx); +} + +template +Context *DisableRequest::handle_refresh_image(int* result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to refresh 
image: " << cpp_strerror(*result) << dendl; + return m_on_finish; + } + + clean_mirror_state(); + return nullptr; +} + +template +void DisableRequest::clean_mirror_state() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + remove_mirror_snapshots(); + } else { + send_get_clients(); + } +} + +template +void DisableRequest::send_get_clients() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + using klass = DisableRequest; + Context *ctx = util::create_context_callback< + klass, &klass::handle_get_clients>(this); + + std::string header_oid = ::journal::Journaler::header_oid(m_image_ctx->id); + m_clients.clear(); + cls::journal::client::client_list(m_image_ctx->md_ctx, header_oid, &m_clients, + ctx); +} + +template +Context *DisableRequest::handle_get_clients(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + std::unique_lock locker{m_lock}; + ceph_assert(m_current_ops.empty()); + + if (*result < 0) { + lderr(cct) << "failed to get registered clients: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + for (auto client : m_clients) { + journal::ClientData client_data; + auto bl_it = client.data.cbegin(); + try { + using ceph::decode; + decode(client_data, bl_it); + } catch (const buffer::error &err) { + lderr(cct) << "failed to decode client data" << dendl; + m_error_result = -EBADMSG; + continue; + } + + journal::ClientMetaType type = client_data.get_client_meta_type(); + if (type != journal::ClientMetaType::MIRROR_PEER_CLIENT_META_TYPE) { + continue; + } + + if (m_current_ops.find(client.id) != m_current_ops.end()) { + // Should not happen. + lderr(cct) << "clients with the same id " + << client.id << dendl; + continue; + } + + m_current_ops[client.id] = 0; + m_ret[client.id] = 0; + + journal::MirrorPeerClientMeta client_meta = + boost::get(client_data.client_meta); + + for (const auto& sync : client_meta.sync_points) { + send_remove_snap(client.id, sync.snap_namespace, sync.snap_name); + } + + if (m_current_ops[client.id] == 0) { + // no snaps to remove + send_unregister_client(client.id); + } + } + + if (m_current_ops.empty()) { + if (m_error_result < 0) { + *result = m_error_result; + return m_on_finish; + } else if (!m_remove) { + return m_on_finish; + } + locker.unlock(); + + // no mirror clients to unregister + send_remove_mirror_image(); + } + + return nullptr; +} + +template +void DisableRequest::remove_mirror_snapshots() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // remove snapshot-based mirroring snapshots + bool removing_snapshots = false; + { + std::lock_guard locker{m_lock}; + std::shared_lock image_locker{m_image_ctx->image_lock}; + + for (auto &it : m_image_ctx->snap_info) { + auto &snap_info = it.second; + auto type = cls::rbd::get_snap_namespace_type( + snap_info.snap_namespace); + if (type == cls::rbd::SNAPSHOT_NAMESPACE_TYPE_MIRROR) { + send_remove_snap("", snap_info.snap_namespace, snap_info.name); + removing_snapshots = true; + } + } + } + + if (!removing_snapshots) { + send_remove_mirror_image(); + } +} + +template +void DisableRequest::send_remove_snap( + const std::string &client_id, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "client_id=" << client_id + << ", snap_name=" << snap_name << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + + 
m_current_ops[client_id]++; + + Context *ctx = create_context_callback( + &DisableRequest::handle_remove_snap, client_id); + + ctx = new LambdaContext([this, snap_namespace, snap_name, ctx](int r) { + m_image_ctx->operations->snap_remove(snap_namespace, + snap_name.c_str(), + ctx); + }); + + m_image_ctx->op_work_queue->queue(ctx, 0); +} + +template +Context *DisableRequest::handle_remove_snap(int *result, + const std::string &client_id) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + std::unique_lock locker{m_lock}; + + ceph_assert(m_current_ops[client_id] > 0); + m_current_ops[client_id]--; + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to remove mirroring snapshot: " + << cpp_strerror(*result) << dendl; + m_ret[client_id] = *result; + } + + if (m_current_ops[client_id] == 0) { + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + ceph_assert(client_id.empty()); + m_current_ops.erase(client_id); + if (m_ret[client_id] < 0) { + return m_on_finish; + } + locker.unlock(); + + send_remove_mirror_image(); + return nullptr; + } + + send_unregister_client(client_id); + } + + return nullptr; +} + +template +void DisableRequest::send_unregister_client( + const std::string &client_id) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + ceph_assert(ceph_mutex_is_locked(m_lock)); + ceph_assert(m_current_ops[client_id] == 0); + + Context *ctx = create_context_callback( + &DisableRequest::handle_unregister_client, client_id); + + if (m_ret[client_id] < 0) { + m_image_ctx->op_work_queue->queue(ctx, m_ret[client_id]); + return; + } + + librados::ObjectWriteOperation op; + cls::journal::client::client_unregister(&op, client_id); + std::string header_oid = ::journal::Journaler::header_oid(m_image_ctx->id); + librados::AioCompletion *comp = create_rados_callback(ctx); + + int r = m_image_ctx->md_ctx.aio_operate(header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *DisableRequest::handle_unregister_client( + int *result, const std::string &client_id) { + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + std::unique_lock locker{m_lock}; + ceph_assert(m_current_ops[client_id] == 0); + m_current_ops.erase(client_id); + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to unregister remote journal client: " + << cpp_strerror(*result) << dendl; + m_error_result = *result; + } + + if (!m_current_ops.empty()) { + return nullptr; + } + + if (m_error_result < 0) { + *result = m_error_result; + return m_on_finish; + } + locker.unlock(); + + send_get_clients(); + return nullptr; +} + +template +void DisableRequest::send_remove_mirror_image() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + auto ctx = util::create_context_callback< + DisableRequest, + &DisableRequest::handle_remove_mirror_image>(this); + auto req = ImageRemoveRequest::create( + m_image_ctx->md_ctx, m_mirror_image.global_image_id, m_image_ctx->id, + ctx); + req->send(); +} + +template +Context *DisableRequest::handle_remove_mirror_image(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to remove mirror image: " << cpp_strerror(*result) + << dendl; + return m_on_finish; + } + + ldout(cct, 20) << "removed image state from rbd_mirroring object" << dendl; + return m_on_finish; +} + +template +Context *DisableRequest::create_context_callback( + 
Context*(DisableRequest<I>::*handle)(int*, const std::string &client_id),
+  const std::string &client_id) {
+
+  return new LambdaContext([this, handle, client_id](int r) {
+      Context *on_finish = (this->*handle)(&r, client_id);
+      if (on_finish != nullptr) {
+        on_finish->complete(r);
+        delete this;
+      }
+    });
+}
+
+} // namespace mirror
+} // namespace librbd
+
+template class librbd::mirror::DisableRequest<librbd::ImageCtx>;
diff --git a/src/librbd/mirror/DisableRequest.h b/src/librbd/mirror/DisableRequest.h
new file mode 100644
index 000000000..f45d1a14c
--- /dev/null
+++ b/src/librbd/mirror/DisableRequest.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
+#define CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
+
+#include "include/buffer.h"
+#include "common/ceph_mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+#include <map>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace mirror {
+
+template <typename ImageCtxT = ImageCtx>
+class DisableRequest {
+public:
+  static DisableRequest *create(ImageCtxT *image_ctx, bool force,
+                                bool remove, Context *on_finish) {
+    return new DisableRequest(image_ctx, force, remove, on_finish);
+  }
+
+  DisableRequest(ImageCtxT *image_ctx, bool force, bool remove,
+                 Context *on_finish);
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_MIRROR_INFO * * * * * * * * * * * * * * * * * * * * * * *
+   *    |                                                        *
+   *    v                                                        *
+   * IMAGE_STATE_UPDATE * * * * * * * * * * * * * * * * * * * * *
+   *    |                                                        *
+   *    v                                                        *
+   * PROMOTE_IMAGE (skip if primary)                             *
+   *    |                                                        *
+   *    v                                                        *
+   * REFRESH_IMAGE (skip if not needed)                          *
+   *    |                                                        *
+   *    v                                                        *
+   * GET_CLIENTS <----------------------------------------\ * * * *
+   *    |     |  (unregister clients)                     |      * (on error)
+   *    |     |/----------------------------\             |      *
+   *    |     |                             |             |      *
+   *    |     |    /-----------\  (repeat   | (repeat     | (repeat
+   *    |     |    |           |  as needed)|  as needed) |  as needed)
+   *    |     v    v           |            |             |      *
+   *    |   REMOVE_SYNC_SNAP --/ * * * * * *|* * * * * * *|* * * *
+   *    |     |                             |             |      *
+   *    |     v                             |             |      *
+   *    |   UNREGISTER_CLIENT --------------/-------------/ * * * *
+   *    |                                                        *
+   *    |  (no more clients                                      *
+   *    |   to unregister)                                       *
+   *    v                                                        *
+   * REMOVE_MIRROR_IMAGE * * * * * * * * * * * * * * * * * * * * *
+   *    |  (skip if no remove)                                   *
+   *    v                                                        *
+   * <finish> < * * * * * * * * * * * * * * * * * * * * * * * * *
+   *
+   * @endverbatim
+   */
+
+  ImageCtxT *m_image_ctx;
+  bool m_force;
+  bool m_remove;
+  Context *m_on_finish;
+
+  bool m_is_primary = false;
+  cls::rbd::MirrorImage m_mirror_image;
+  PromotionState m_promotion_state = PROMOTION_STATE_NON_PRIMARY;
+  std::string m_primary_mirror_uuid;
+  std::set<cls::journal::Client> m_clients;
+  std::map<std::string, int> m_ret;
+  std::map<std::string, int> m_current_ops;
+  int m_error_result = 0;
+  mutable ceph::mutex m_lock =
+    ceph::make_mutex("mirror::DisableRequest::m_lock");
+
+  void send_get_mirror_info();
+  Context *handle_get_mirror_info(int *result);
+
+  void send_image_state_update();
+  Context *handle_image_state_update(int *result);
+
+  void send_notify_mirroring_watcher();
+  Context *handle_notify_mirroring_watcher(int *result);
+
+  void send_promote_image();
+  Context *handle_promote_image(int *result);
+
+  void send_refresh_image();
+  Context* handle_refresh_image(int* result);
+
+  void clean_mirror_state();
+
+  void send_get_clients();
+  Context *handle_get_clients(int *result);
+
+  void remove_mirror_snapshots();
+
+  void send_remove_snap(const std::string &client_id,
+                        const cls::rbd::SnapshotNamespace &snap_namespace,
+                        const std::string &snap_name);
+  Context *handle_remove_snap(int *result, const std::string &client_id);
+
+  void send_unregister_client(const std::string &client_id);
+  Context *handle_unregister_client(int *result, const std::string &client_id);
+
+  void send_remove_mirror_image();
+  Context *handle_remove_mirror_image(int *result);
+
+  void send_notify_mirroring_watcher_removed();
+  Context *handle_notify_mirroring_watcher_removed(int *result);
+
+  Context *create_context_callback(
+    Context*(DisableRequest<ImageCtxT>::*handle)(
+      int*, const std::string &client_id),
+    const std::string &client_id);
+
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::DisableRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_DISABLE_REQUEST_H
diff --git a/src/librbd/mirror/EnableRequest.cc b/src/librbd/mirror/EnableRequest.cc
new file mode 100644
index 000000000..fd74a25ba
--- /dev/null
+++ b/src/librbd/mirror/EnableRequest.cc
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/EnableRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/ImageStateUpdateRequest.h"
+#include "librbd/mirror/snapshot/CreatePrimaryRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::EnableRequest: " \
+                           << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+EnableRequest<I>::EnableRequest(librados::IoCtx &io_ctx,
+                                const std::string &image_id,
+                                I* image_ctx,
+                                cls::rbd::MirrorImageMode mode,
+                                const std::string &non_primary_global_image_id,
+                                bool image_clean,
+                                asio::ContextWQ *op_work_queue,
+                                Context *on_finish)
+  : m_io_ctx(io_ctx), m_image_id(image_id), m_image_ctx(image_ctx),
+    m_mode(mode), m_non_primary_global_image_id(non_primary_global_image_id),
+    m_image_clean(image_clean), m_op_work_queue(op_work_queue),
+    m_on_finish(on_finish),
+    m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+}
+
+template <typename I>
+void EnableRequest<I>::send() {
+  get_mirror_image();
+}
+
+template <typename I>
+void EnableRequest<I>::get_mirror_image() {
+  ldout(m_cct, 10) << dendl;
+
+  librados::ObjectReadOperation op;
+  cls_client::mirror_image_get_start(&op, m_image_id);
+
+  using klass = EnableRequest<I>;
+  librados::AioCompletion *comp =
+    create_rados_callback<klass, &klass::handle_get_mirror_image>(this);
+  m_out_bl.clear();
+  int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+template <typename I>
+void EnableRequest<I>::handle_get_mirror_image(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto iter = m_out_bl.cbegin();
+    r = cls_client::mirror_image_get_finish(&iter, &m_mirror_image);
+  }
+
+  if (r == 0 && m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_CREATING &&
+      !m_non_primary_global_image_id.empty()) {
+    // special case where rbd-mirror injects a disabled record to record the
+    // local image id prior to creating the image
+    ldout(m_cct, 10) << "enabling mirroring on in-progress image replication"
+                     << dendl;
+  } else if (r == 0) {
+    if (m_mirror_image.mode != m_mode) {
+      lderr(m_cct) << "invalid current image mirror mode" << dendl;
+      r = -EINVAL;
+    } else if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+      ldout(m_cct, 10) << "mirroring is already enabled" << dendl;
+    } else {
+
lderr(m_cct) << "currently disabling" << dendl; + r = -EINVAL; + } + finish(r); + return; + } else if (r != -ENOENT) { + lderr(m_cct) << "failed to retrieve mirror image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + r = 0; + m_mirror_image.mode = m_mode; + if (m_non_primary_global_image_id.empty()) { + uuid_d uuid_gen; + uuid_gen.generate_random(); + m_mirror_image.global_image_id = uuid_gen.to_string(); + } else { + m_mirror_image.global_image_id = m_non_primary_global_image_id; + } + + get_tag_owner(); +} + +template +void EnableRequest::get_tag_owner() { + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + open_image(); + return; + } else if (!m_non_primary_global_image_id.empty()) { + image_state_update(); + return; + } + + ldout(m_cct, 10) << dendl; + + using klass = EnableRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_get_tag_owner>(this); + librbd::Journal<>::is_tag_owner(m_io_ctx, m_image_id, &m_is_primary, + m_op_work_queue, ctx); +} + +template +void EnableRequest::handle_get_tag_owner(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to check tag ownership: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (!m_is_primary) { + lderr(m_cct) << "last journal tag not owned by local cluster" << dendl; + finish(-EINVAL); + return; + } + + image_state_update(); +} + +template +void EnableRequest::open_image() { + if (!m_non_primary_global_image_id.empty()) { + // special case for rbd-mirror creating a non-primary image + enable_non_primary_feature(); + return; + } else if (m_image_ctx != nullptr) { + create_primary_snapshot(); + return; + } + + ldout(m_cct, 10) << dendl; + + m_close_image = true; + m_image_ctx = I::create("", m_image_id, CEPH_NOSNAP, m_io_ctx, false); + + auto ctx = create_context_callback< + EnableRequest, &EnableRequest::handle_open_image>(this); + m_image_ctx->state->open(OPEN_FLAG_SKIP_OPEN_PARENT | + OPEN_FLAG_IGNORE_MIGRATING, ctx); +} + +template +void EnableRequest::handle_open_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to open image: " << cpp_strerror(r) << dendl; + m_image_ctx = nullptr; + finish(r); + return; + } + + create_primary_snapshot(); +} + +template +void EnableRequest::create_primary_snapshot() { + ldout(m_cct, 10) << dendl; + + ceph_assert(m_image_ctx != nullptr); + uint64_t snap_create_flags; + int r = util::snap_create_flags_api_to_internal( + m_cct, util::get_default_snap_create_flags(m_image_ctx), + &snap_create_flags); + ceph_assert(r == 0); + auto ctx = create_context_callback< + EnableRequest, + &EnableRequest::handle_create_primary_snapshot>(this); + auto req = snapshot::CreatePrimaryRequest::create( + m_image_ctx, m_mirror_image.global_image_id, + (m_image_clean ? 
0 : CEPH_NOSNAP), snap_create_flags, + snapshot::CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS, &m_snap_id, ctx); + req->send(); +} + +template +void EnableRequest::handle_create_primary_snapshot(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to create initial primary snapshot: " + << cpp_strerror(r) << dendl; + m_ret_val = r; + } + + close_image(); +} + +template +void EnableRequest::close_image() { + if (!m_close_image) { + if (m_ret_val < 0) { + finish(m_ret_val); + } else { + image_state_update(); + } + return; + } + + ldout(m_cct, 10) << dendl; + + auto ctx = create_context_callback< + EnableRequest, &EnableRequest::handle_close_image>(this); + m_image_ctx->state->close(ctx); +} + +template +void EnableRequest::handle_close_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_image_ctx = nullptr; + + if (r < 0) { + lderr(m_cct) << "failed to close image: " << cpp_strerror(r) << dendl; + if (m_ret_val == 0) { + m_ret_val = r; + } + } + + if (m_ret_val < 0) { + finish(m_ret_val); + return; + } + + image_state_update(); +} + + +template +void EnableRequest::enable_non_primary_feature() { + if (m_mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + image_state_update(); + return; + } + + ldout(m_cct, 10) << dendl; + + // ensure image is flagged with non-primary feature so that + // standard RBD clients cannot write to it. + librados::ObjectWriteOperation op; + cls_client::set_features(&op, RBD_FEATURE_NON_PRIMARY, + RBD_FEATURE_NON_PRIMARY); + + auto aio_comp = create_rados_callback< + EnableRequest, + &EnableRequest::handle_enable_non_primary_feature>(this); + int r = m_io_ctx.aio_operate(util::header_name(m_image_id), aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void EnableRequest::handle_enable_non_primary_feature(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to enable non-primary feature: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void EnableRequest::image_state_update() { + ldout(m_cct, 10) << dendl; + + auto ctx = create_context_callback< + EnableRequest, &EnableRequest::handle_image_state_update>(this); + auto req = ImageStateUpdateRequest::create( + m_io_ctx, m_image_id, cls::rbd::MIRROR_IMAGE_STATE_ENABLED, + m_mirror_image, ctx); + req->send(); +} + +template +void EnableRequest::handle_image_state_update(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to enable mirroring: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template +void EnableRequest::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::EnableRequest; diff --git a/src/librbd/mirror/EnableRequest.h b/src/librbd/mirror/EnableRequest.h new file mode 100644 index 000000000..391028e6e --- /dev/null +++ b/src/librbd/mirror/EnableRequest.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H + +#include "include/buffer_fwd.h" +#include "include/rados/librados_fwd.hpp" +#include "include/rbd/librbd.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/ImageCtx.h" +#include "librbd/mirror/Types.h" +#include +#include + +class Context; + +namespace librbd { + +namespace asio { struct 
ContextWQ; }
+
+namespace mirror {
+
+template <typename ImageCtxT = ImageCtx>
+class EnableRequest {
+public:
+  static EnableRequest *create(ImageCtxT *image_ctx,
+                               cls::rbd::MirrorImageMode mode,
+                               const std::string &non_primary_global_image_id,
+                               bool image_clean, Context *on_finish) {
+    return new EnableRequest(image_ctx->md_ctx, image_ctx->id, image_ctx, mode,
+                             non_primary_global_image_id, image_clean,
+                             image_ctx->op_work_queue, on_finish);
+  }
+  static EnableRequest *create(librados::IoCtx &io_ctx,
+                               const std::string &image_id,
+                               cls::rbd::MirrorImageMode mode,
+                               const std::string &non_primary_global_image_id,
+                               bool image_clean, asio::ContextWQ *op_work_queue,
+                               Context *on_finish) {
+    return new EnableRequest(io_ctx, image_id, nullptr, mode,
+                             non_primary_global_image_id, image_clean,
+                             op_work_queue, on_finish);
+  }
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_MIRROR_IMAGE  * * * * * * *
+   *    |                          * (on error)
+   *    v (skip if not needed)     *
+   * GET_TAG_OWNER * * * * * * * * *
+   *    |                          *
+   *    v (skip if not needed)     *
+   * OPEN_IMAGE                    *
+   *    |                          *
+   *    v (skip if not needed)     *
+   * CREATE_PRIMARY_SNAPSHOT * * * *
+   *    |                          *
+   *    v (skip if not opened)     *
+   * CLOSE_IMAGE                   *
+   *    |                          *
+   *    v (skip if not needed)     *
+   * ENABLE_NON_PRIMARY_FEATURE    *
+   *    |                          *
+   *    v (skip if not needed)     *
+   * IMAGE_STATE_UPDATE  * * * * * *
+   *    |                          *
+   *    v                          *
+   * <finish> < * * * * * * * * * *
+   *
+   * @endverbatim
+   */
+
+  EnableRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+                ImageCtxT* image_ctx, cls::rbd::MirrorImageMode mode,
+                const std::string &non_primary_global_image_id,
+                bool image_clean, asio::ContextWQ *op_work_queue,
+                Context *on_finish);
+
+  librados::IoCtx &m_io_ctx;
+  std::string m_image_id;
+  ImageCtxT* m_image_ctx;
+  cls::rbd::MirrorImageMode m_mode;
+  std::string m_non_primary_global_image_id;
+  bool m_image_clean;
+  asio::ContextWQ *m_op_work_queue;
+  Context *m_on_finish;
+
+  CephContext *m_cct = nullptr;
+  bufferlist m_out_bl;
+  cls::rbd::MirrorImage m_mirror_image;
+
+  int m_ret_val = 0;
+  bool m_close_image = false;
+
+  bool m_is_primary = false;
+  uint64_t m_snap_id = CEPH_NOSNAP;
+
+  void get_mirror_image();
+  void handle_get_mirror_image(int r);
+
+  void get_tag_owner();
+  void handle_get_tag_owner(int r);
+
+  void open_image();
+  void handle_open_image(int r);
+
+  void create_primary_snapshot();
+  void handle_create_primary_snapshot(int r);
+
+  void close_image();
+  void handle_close_image(int r);
+
+  void enable_non_primary_feature();
+  void handle_enable_non_primary_feature(int r);
+
+  void image_state_update();
+  void handle_image_state_update(int r);
+
+  void finish(int r);
+};
+
+} // namespace mirror
+} // namespace librbd
+
+extern template class librbd::mirror::EnableRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIRROR_ENABLE_REQUEST_H
diff --git a/src/librbd/mirror/GetInfoRequest.cc b/src/librbd/mirror/GetInfoRequest.cc
new file mode 100644
index 000000000..2db8aaa84
--- /dev/null
+++ b/src/librbd/mirror/GetInfoRequest.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/mirror/GetInfoRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::mirror::GetInfoRequest: " << this \
+                           << " " << __func__ << ": "
+
+namespace librbd {
+namespace mirror {
+
+using
librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +GetInfoRequest::GetInfoRequest(librados::IoCtx& io_ctx, + asio::ContextWQ *op_work_queue, + const std::string &image_id, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + std::string* primary_mirror_uuid, + Context *on_finish) + : m_io_ctx(io_ctx), m_op_work_queue(op_work_queue), m_image_id(image_id), + m_mirror_image(mirror_image), m_promotion_state(promotion_state), + m_primary_mirror_uuid(primary_mirror_uuid), m_on_finish(on_finish), + m_cct(reinterpret_cast(io_ctx.cct())) { +} + +template +GetInfoRequest::GetInfoRequest(I &image_ctx, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + std::string* primary_mirror_uuid, + Context *on_finish) + : m_image_ctx(&image_ctx), m_io_ctx(image_ctx.md_ctx), + m_op_work_queue(image_ctx.op_work_queue), m_image_id(image_ctx.id), + m_mirror_image(mirror_image), m_promotion_state(promotion_state), + m_primary_mirror_uuid(primary_mirror_uuid), m_on_finish(on_finish), + m_cct(image_ctx.cct) { +} + +template +void GetInfoRequest::send() { + get_mirror_image(); +} + +template +void GetInfoRequest::get_mirror_image() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, m_image_id); + + librados::AioCompletion *comp = create_rados_callback< + GetInfoRequest, &GetInfoRequest::handle_get_mirror_image>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void GetInfoRequest::handle_get_mirror_image(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_mirror_image->state = cls::rbd::MIRROR_IMAGE_STATE_DISABLED; + *m_promotion_state = PROMOTION_STATE_NON_PRIMARY; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_image_get_finish(&iter, m_mirror_image); + } + + if (r == -ENOENT) { + ldout(m_cct, 20) << "mirroring is disabled" << dendl; + finish(r); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_mirror_image->mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + get_journal_tag_owner(); + } else if (m_mirror_image->mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + get_snapcontext(); + } else { + ldout(m_cct, 20) << "unknown mirror image mode: " << m_mirror_image->mode + << dendl; + finish(-EOPNOTSUPP); + } +} + +template +void GetInfoRequest::get_journal_tag_owner() { + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + GetInfoRequest, &GetInfoRequest::handle_get_journal_tag_owner>(this); + Journal::get_tag_owner(m_io_ctx, m_image_id, &m_mirror_uuid, + m_op_work_queue, ctx); +} + +template +void GetInfoRequest::handle_get_journal_tag_owner(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to determine tag ownership: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_mirror_uuid == Journal<>::LOCAL_MIRROR_UUID) { + *m_promotion_state = PROMOTION_STATE_PRIMARY; + *m_primary_mirror_uuid = ""; + } else if (m_mirror_uuid == Journal<>::ORPHAN_MIRROR_UUID) { + *m_promotion_state = PROMOTION_STATE_ORPHAN; + *m_primary_mirror_uuid = ""; + } else { + *m_primary_mirror_uuid = m_mirror_uuid; + } + + finish(0); +} + +template +void GetInfoRequest::get_snapcontext() { + if (m_image_ctx != nullptr) { + { + std::shared_lock image_locker{m_image_ctx->image_lock}; + 
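// Descriptive note: an open image already caches its snapshot metadata
+      // in memory under image_lock, so the promotion state is derived
+      // locally here instead of re-reading the image header object.
+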
calc_promotion_state(m_image_ctx->snap_info); + } + finish(0); + return; + } + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + cls_client::get_snapcontext_start(&op); + + librados::AioCompletion *comp = create_rados_callback< + GetInfoRequest, &GetInfoRequest::handle_get_snapcontext>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(util::header_name(m_image_id), comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void GetInfoRequest::handle_get_snapcontext(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r >= 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::get_snapcontext_finish(&it, &m_snapc); + } + + if (r == -ENOENT && + m_mirror_image->state == cls::rbd::MIRROR_IMAGE_STATE_CREATING) { + // image doesn't exist but we have a mirror image record for it + ldout(m_cct, 10) << "image does not exist for mirror image id " + << m_image_id << dendl; + *m_promotion_state = PROMOTION_STATE_UNKNOWN; + *m_primary_mirror_uuid = ""; + finish(0); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to get snapcontext: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + get_snapshots(); +} + + +template +void GetInfoRequest::get_snapshots() { + ldout(m_cct, 20) << dendl; + + if (m_snapc.snaps.empty()) { + handle_get_snapshots(0); + return; + } + + librados::ObjectReadOperation op; + for (auto snap_id : m_snapc.snaps) { + cls_client::snapshot_get_start(&op, snap_id); + } + + librados::AioCompletion *comp = create_rados_callback< + GetInfoRequest, &GetInfoRequest::handle_get_snapshots>(this); + m_out_bl.clear(); + int r = m_io_ctx.aio_operate(util::header_name(m_image_id), comp, &op, + &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void GetInfoRequest::handle_get_snapshots(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + std::map snap_info; + + auto it = m_out_bl.cbegin(); + for (auto snap_id : m_snapc.snaps) { + cls::rbd::SnapshotInfo snap; + if (r >= 0) { + r = cls_client::snapshot_get_finish(&it, &snap); + } + snap_info.emplace( + snap_id, SnapInfo(snap.name, snap.snapshot_namespace, 0, {}, 0, 0, {})); + } + + if (r == -ENOENT) { + // restart + get_snapcontext(); + return; + } + + if (r < 0) { + lderr(m_cct) << "failed to get snapshots: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + calc_promotion_state(snap_info); + finish(0); +} + +template +void GetInfoRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +template +void GetInfoRequest::calc_promotion_state( + const std::map &snap_info) { + *m_promotion_state = PROMOTION_STATE_UNKNOWN; + *m_primary_mirror_uuid = ""; + + for (auto it = snap_info.rbegin(); it != snap_info.rend(); it++) { + auto mirror_ns = std::get_if( + &it->second.snap_namespace); + + if (mirror_ns != nullptr) { + switch (mirror_ns->state) { + case cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY: + *m_promotion_state = PROMOTION_STATE_PRIMARY; + break; + case cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY: + *m_promotion_state = PROMOTION_STATE_NON_PRIMARY; + *m_primary_mirror_uuid = mirror_ns->primary_mirror_uuid; + break; + case cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY_DEMOTED: + case cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY_DEMOTED: + *m_promotion_state = PROMOTION_STATE_ORPHAN; + break; + } + break; + } + } + + ldout(m_cct, 10) << "promotion_state=" << *m_promotion_state << ", " + << "primary_mirror_uuid=" << *m_primary_mirror_uuid << dendl; +} + +} // namespace mirror +} // namespace 
librbd + +template class librbd::mirror::GetInfoRequest; diff --git a/src/librbd/mirror/GetInfoRequest.h b/src/librbd/mirror/GetInfoRequest.h new file mode 100644 index 000000000..dcc6da7da --- /dev/null +++ b/src/librbd/mirror/GetInfoRequest.h @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H +#define CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H + +#include "common/snap_types.h" +#include "include/buffer.h" +#include "include/common_fwd.h" +#include "include/rados/librados.hpp" +#include "librbd/Types.h" +#include "librbd/mirror/Types.h" +#include + +struct Context; + +namespace cls { namespace rbd { struct MirrorImage; } } + +namespace librbd { + +struct ImageCtx; +namespace asio { struct ContextWQ; } + +namespace mirror { + +template +class GetInfoRequest { +public: + static GetInfoRequest *create(librados::IoCtx &io_ctx, + asio::ContextWQ *op_work_queue, + const std::string &image_id, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + std::string* primary_mirror_uuid, + Context *on_finish) { + return new GetInfoRequest(io_ctx, op_work_queue, image_id, mirror_image, + promotion_state, primary_mirror_uuid, on_finish); + } + static GetInfoRequest *create(ImageCtxT &image_ctx, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + std::string* primary_mirror_uuid, + Context *on_finish) { + return new GetInfoRequest(image_ctx, mirror_image, promotion_state, + primary_mirror_uuid, on_finish); + } + + GetInfoRequest(librados::IoCtx& io_ctx, asio::ContextWQ *op_work_queue, + const std::string &image_id, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + std::string* primary_mirror_uuid, Context *on_finish); + GetInfoRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + std::string* primary_mirror_uuid, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_MIRROR_IMAGE + * | + * (journal /--------/ \--------\ (snapshot + * mode) | | mode) + * v v + * GET_JOURNAL_TAG_OWNER GET_SNAPCONTEXT (skip if + * | | cached) + * | v + * | GET_SNAPSHOTS (skip if + * | | cached) + * \--------\ /--------/ + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx = nullptr; + librados::IoCtx &m_io_ctx; + asio::ContextWQ *m_op_work_queue; + std::string m_image_id; + cls::rbd::MirrorImage *m_mirror_image; + PromotionState *m_promotion_state; + std::string* m_primary_mirror_uuid; + Context *m_on_finish; + + CephContext *m_cct; + + bufferlist m_out_bl; + std::string m_mirror_uuid; + ::SnapContext m_snapc; + + void get_mirror_image(); + void handle_get_mirror_image(int r); + + void get_journal_tag_owner(); + void handle_get_journal_tag_owner(int r); + + void get_snapcontext(); + void handle_get_snapcontext(int r); + + void get_snapshots(); + void handle_get_snapshots(int r); + + void finish(int r); + + void calc_promotion_state( + const std::map &snap_info); +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::GetInfoRequest; + +#endif // CEPH_LIBRBD_MIRROR_GET_INFO_REQUEST_H + diff --git a/src/librbd/mirror/GetStatusRequest.cc b/src/librbd/mirror/GetStatusRequest.cc new file mode 100644 index 000000000..40d4a664b --- /dev/null +++ b/src/librbd/mirror/GetStatusRequest.cc @@ -0,0 +1,116 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + 
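+
+// GetStatusRequest resolves an image's mirroring info via GetInfoRequest
+// and, when mirroring is enabled, follows up with a mirror_image_status_get
+// call against the rbd_mirroring object to fetch the site status.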
+#include "librbd/mirror/GetStatusRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/mirror/GetInfoRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::GetStatusRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +void GetStatusRequest::send() { + *m_mirror_image_status = cls::rbd::MirrorImageStatus( + {{cls::rbd::MirrorImageSiteStatus::LOCAL_MIRROR_UUID, + cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN, "status not found"}}); + + get_info(); +} + +template +void GetStatusRequest::get_info() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + GetStatusRequest, &GetStatusRequest::handle_get_info>(this); + auto req = GetInfoRequest::create(m_image_ctx, m_mirror_image, + m_promotion_state, + &m_primary_mirror_uuid, ctx); + req->send(); +} + +template +void GetStatusRequest::handle_get_info(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + } + finish(r); + return; + } else if (m_mirror_image->state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + finish(0); + return; + } + + get_status(); +} + +template +void GetStatusRequest::get_status() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_status_get_start( + &op, m_mirror_image->global_image_id); + + librados::AioCompletion *comp = create_rados_callback< + GetStatusRequest, &GetStatusRequest::handle_get_status>(this); + int r = m_image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void GetStatusRequest::handle_get_status(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_image_status_get_finish(&iter, + m_mirror_image_status); + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirror image status: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void GetStatusRequest::finish(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::GetStatusRequest; diff --git a/src/librbd/mirror/GetStatusRequest.h b/src/librbd/mirror/GetStatusRequest.h new file mode 100644 index 000000000..581a0d667 --- /dev/null +++ b/src/librbd/mirror/GetStatusRequest.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H +#define CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H + +#include "include/buffer.h" +#include "librbd/mirror/Types.h" +#include + +struct Context; +namespace cls { namespace rbd { struct MirrorImage; } } +namespace cls { namespace rbd { struct MirrorImageStatus; } } + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template +class GetStatusRequest { 
+public: + static GetStatusRequest *create(ImageCtxT &image_ctx, + cls::rbd::MirrorImageStatus *status, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, + Context *on_finish) { + return new GetStatusRequest(image_ctx, status, mirror_image, + promotion_state, on_finish); + } + + GetStatusRequest(ImageCtxT &image_ctx, cls::rbd::MirrorImageStatus *status, + cls::rbd::MirrorImage *mirror_image, + PromotionState *promotion_state, Context *on_finish) + : m_image_ctx(image_ctx), m_mirror_image_status(status), + m_mirror_image(mirror_image), m_promotion_state(promotion_state), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_INFO + * | + * v + * GET_STATUS + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + cls::rbd::MirrorImageStatus *m_mirror_image_status; + cls::rbd::MirrorImage *m_mirror_image; + PromotionState *m_promotion_state; + Context *m_on_finish; + + bufferlist m_out_bl; + std::string m_primary_mirror_uuid; + + void get_info(); + void handle_get_info(int r); + + void get_status(); + void handle_get_status(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::GetStatusRequest; + +#endif // CEPH_LIBRBD_MIRROR_GET_STATUS_REQUEST_H + diff --git a/src/librbd/mirror/GetUuidRequest.cc b/src/librbd/mirror/GetUuidRequest.cc new file mode 100644 index 000000000..f8209f905 --- /dev/null +++ b/src/librbd/mirror/GetUuidRequest.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/GetUuidRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::GetUuidRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_rados_callback; + +template +GetUuidRequest::GetUuidRequest( + librados::IoCtx& io_ctx, std::string* mirror_uuid, Context* on_finish) + : m_mirror_uuid(mirror_uuid), m_on_finish(on_finish), + m_cct(reinterpret_cast(io_ctx.cct())) { + m_io_ctx.dup(io_ctx); + m_io_ctx.set_namespace(""); +} + +template +void GetUuidRequest::send() { + get_mirror_uuid(); +} + +template +void GetUuidRequest::get_mirror_uuid() { + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + librbd::cls_client::mirror_uuid_get_start(&op); + + auto aio_comp = create_rados_callback< + GetUuidRequest, &GetUuidRequest::handle_get_mirror_uuid>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void GetUuidRequest::handle_get_mirror_uuid(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r >= 0) { + auto it = m_out_bl.cbegin(); + r = librbd::cls_client::mirror_uuid_get_finish(&it, m_mirror_uuid); + if (r >= 0 && m_mirror_uuid->empty()) { + r = -ENOENT; + } + } + + if (r < 0) { + if (r == -ENOENT) { + ldout(m_cct, 5) << "mirror uuid missing" << dendl; + } else { + lderr(m_cct) << "failed to retrieve mirror uuid: " << cpp_strerror(r) + << dendl; + } + *m_mirror_uuid = ""; + } + + finish(r); +} + +template +void GetUuidRequest::finish(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template 
class librbd::mirror::GetUuidRequest; diff --git a/src/librbd/mirror/GetUuidRequest.h b/src/librbd/mirror/GetUuidRequest.h new file mode 100644 index 000000000..73cc2d5b2 --- /dev/null +++ b/src/librbd/mirror/GetUuidRequest.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_GET_UUID_REQUEST_H +#define CEPH_LIBRBD_MIRROR_GET_UUID_REQUEST_H + +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template +class GetUuidRequest { +public: + static GetUuidRequest *create(librados::IoCtx& io_ctx, + std::string* mirror_uuid, Context* on_finish) { + return new GetUuidRequest(io_ctx, mirror_uuid, on_finish); + } + + GetUuidRequest(librados::IoCtx& io_ctx, std::string* mirror_uuid, + Context* on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_MIRROR_UUID + * | + * v + * + * + * @endverbatim + */ + + librados::IoCtx m_io_ctx; + std::string* m_mirror_uuid; + Context* m_on_finish; + + CephContext* m_cct; + + bufferlist m_out_bl; + + void get_mirror_uuid(); + void handle_get_mirror_uuid(int r); + + void finish(int r); +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::GetUuidRequest; + +#endif // CEPH_LIBRBD_MIRROR_GET_UUID_REQUEST_H diff --git a/src/librbd/mirror/ImageRemoveRequest.cc b/src/librbd/mirror/ImageRemoveRequest.cc new file mode 100644 index 000000000..1aa265dae --- /dev/null +++ b/src/librbd/mirror/ImageRemoveRequest.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/ImageRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::ImageRemoveRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using util::create_rados_callback; + +template +ImageRemoveRequest::ImageRemoveRequest( + librados::IoCtx& io_ctx, const std::string& global_image_id, + const std::string& image_id, Context* on_finish) + : m_io_ctx(io_ctx), m_global_image_id(global_image_id), m_image_id(image_id), + m_on_finish(on_finish), m_cct(static_cast(m_io_ctx.cct())) { +} + +template +void ImageRemoveRequest::send() { + remove_mirror_image(); +} + +template +void ImageRemoveRequest::remove_mirror_image() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + cls_client::mirror_image_remove(&op, m_image_id); + + auto comp = create_rados_callback< + ImageRemoveRequest, + &ImageRemoveRequest::handle_remove_mirror_image>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void ImageRemoveRequest::handle_remove_mirror_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to remove mirroring image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + notify_mirroring_watcher(); +} + +template +void ImageRemoveRequest::notify_mirroring_watcher() { + ldout(m_cct, 10) << dendl; + + auto ctx = util::create_context_callback< + ImageRemoveRequest, + 
&ImageRemoveRequest::handle_notify_mirroring_watcher>(this); + MirroringWatcher::notify_image_updated( + m_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED, + m_image_id, m_global_image_id, ctx); +} + +template +void ImageRemoveRequest::handle_notify_mirroring_watcher(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to notify mirror image update: " << cpp_strerror(r) + << dendl; + } + + finish(0); +} + +template +void ImageRemoveRequest::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::ImageRemoveRequest; diff --git a/src/librbd/mirror/ImageRemoveRequest.h b/src/librbd/mirror/ImageRemoveRequest.h new file mode 100644 index 000000000..c04f9fadc --- /dev/null +++ b/src/librbd/mirror/ImageRemoveRequest.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_IMAGE_REMOVE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_IMAGE_REMOVE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "common/ceph_mutex.h" +#include "cls/rbd/cls_rbd_types.h" + +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace mirror { + +template +class ImageRemoveRequest { +public: + static ImageRemoveRequest *create(librados::IoCtx& io_ctx, + const std::string& global_image_id, + const std::string& image_id, + Context* on_finish) { + return new ImageRemoveRequest(io_ctx, global_image_id, image_id, on_finish); + } + + ImageRemoveRequest(librados::IoCtx& io_ctx, + const std::string& global_image_id, + const std::string& image_id, + Context* on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * REMOVE_MIRROR_IMAGE + * | + * v + * NOTIFY_MIRRORING_WATCHER + * | + * v + * + * + * @endverbatim + */ + + librados::IoCtx& m_io_ctx; + std::string m_global_image_id; + std::string m_image_id; + Context* m_on_finish; + + CephContext* m_cct; + + void remove_mirror_image(); + void handle_remove_mirror_image(int r); + + void notify_mirroring_watcher(); + void handle_notify_mirroring_watcher(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::ImageRemoveRequest; + +#endif // CEPH_LIBRBD_MIRROR_IMAGE_REMOVE_REQUEST_H diff --git a/src/librbd/mirror/ImageStateUpdateRequest.cc b/src/librbd/mirror/ImageStateUpdateRequest.cc new file mode 100644 index 000000000..98e987190 --- /dev/null +++ b/src/librbd/mirror/ImageStateUpdateRequest.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/ImageStateUpdateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/MirroringWatcher.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::ImageStateUpdateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using util::create_rados_callback; + +template +ImageStateUpdateRequest::ImageStateUpdateRequest( + librados::IoCtx& io_ctx, + const std::string& image_id, + cls::rbd::MirrorImageState mirror_image_state, + const cls::rbd::MirrorImage& mirror_image, + Context* on_finish) + : m_io_ctx(io_ctx), m_image_id(image_id), + m_mirror_image_state(mirror_image_state), m_mirror_image(mirror_image), 
+ m_on_finish(on_finish), m_cct(static_cast(m_io_ctx.cct())) { + ceph_assert(m_mirror_image_state != cls::rbd::MIRROR_IMAGE_STATE_DISABLED); +} + +template +void ImageStateUpdateRequest::send() { + get_mirror_image(); +} + +template +void ImageStateUpdateRequest::get_mirror_image() { + if (!m_mirror_image.global_image_id.empty()) { + set_mirror_image(); + return; + } + + ldout(m_cct, 10) << dendl; + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, m_image_id); + + auto comp = create_rados_callback< + ImageStateUpdateRequest, + &ImageStateUpdateRequest::handle_get_mirror_image>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void ImageStateUpdateRequest::handle_get_mirror_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_image_get_finish(&iter, &m_mirror_image); + } + + if (r == -ENOENT) { + ldout(m_cct, 20) << "mirroring is disabled" << dendl; + finish(0); + return; + } else if (r < 0) { + lderr(m_cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + set_mirror_image(); +} + +template +void ImageStateUpdateRequest::set_mirror_image() { + if (m_mirror_image.state == m_mirror_image_state) { + finish(0); + return; + } + + ldout(m_cct, 10) << dendl; + m_mirror_image.state = m_mirror_image_state; + + librados::ObjectWriteOperation op; + cls_client::mirror_image_set(&op, m_image_id, m_mirror_image); + + auto comp = create_rados_callback< + ImageStateUpdateRequest, + &ImageStateUpdateRequest::handle_set_mirror_image>(this); + int r = m_io_ctx.aio_operate(RBD_MIRRORING, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void ImageStateUpdateRequest::handle_set_mirror_image(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to disable mirroring image: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + notify_mirroring_watcher(); +} + +template +void ImageStateUpdateRequest::notify_mirroring_watcher() { + ldout(m_cct, 10) << dendl; + + auto ctx = util::create_context_callback< + ImageStateUpdateRequest, + &ImageStateUpdateRequest::handle_notify_mirroring_watcher>(this); + MirroringWatcher::notify_image_updated( + m_io_ctx, m_mirror_image_state, m_image_id, m_mirror_image.global_image_id, + ctx); +} + +template +void ImageStateUpdateRequest::handle_notify_mirroring_watcher(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to notify mirror image update: " << cpp_strerror(r) + << dendl; + } + + finish(0); +} + +template +void ImageStateUpdateRequest::finish(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::ImageStateUpdateRequest; diff --git a/src/librbd/mirror/ImageStateUpdateRequest.h b/src/librbd/mirror/ImageStateUpdateRequest.h new file mode 100644 index 000000000..9e0affe6a --- /dev/null +++ b/src/librbd/mirror/ImageStateUpdateRequest.h @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_IMAGE_STATE_UPDATE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_IMAGE_STATE_UPDATE_REQUEST_H + +#include "include/rados/librados.hpp" +#include "common/ceph_mutex.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/mirror/Types.h" 
+ +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace mirror { + +template +class ImageStateUpdateRequest { +public: + static ImageStateUpdateRequest *create( + librados::IoCtx& io_ctx, + const std::string& image_id, + cls::rbd::MirrorImageState mirror_image_state, + const cls::rbd::MirrorImage& mirror_image, + Context* on_finish) { + return new ImageStateUpdateRequest( + io_ctx, image_id, mirror_image_state, mirror_image, on_finish); + } + + ImageStateUpdateRequest( + librados::IoCtx& io_ctx, + const std::string& image_id, + cls::rbd::MirrorImageState mirror_image_state, + const cls::rbd::MirrorImage& mirror_image, + Context* on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v (skip if provided) + * GET_MIRROR_IMAGE + * | + * v + * SET_MIRROR_IMAGE + * | + * v + * NOTIFY_MIRRORING_WATCHER + * | + * v + * + * + * @endverbatim + */ + + librados::IoCtx& m_io_ctx; + std::string m_image_id; + cls::rbd::MirrorImageState m_mirror_image_state; + cls::rbd::MirrorImage m_mirror_image; + Context* m_on_finish; + + CephContext* m_cct; + bufferlist m_out_bl; + + void get_mirror_image(); + void handle_get_mirror_image(int r); + + void set_mirror_image(); + void handle_set_mirror_image(int r); + + void notify_mirroring_watcher(); + void handle_notify_mirroring_watcher(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::ImageStateUpdateRequest; + +#endif // CEPH_LIBRBD_MIRROR_IMAGE_STATE_UPDATE_REQUEST_H diff --git a/src/librbd/mirror/PromoteRequest.cc b/src/librbd/mirror/PromoteRequest.cc new file mode 100644 index 000000000..b119e4edc --- /dev/null +++ b/src/librbd/mirror/PromoteRequest.cc @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/PromoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/mirror/GetInfoRequest.h" +#include "librbd/mirror/snapshot/PromoteRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::PromoteRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { + +using librbd::util::create_context_callback; + +template +void PromoteRequest::send() { + get_info(); +} + +template +void PromoteRequest::get_info() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, &PromoteRequest::handle_get_info>(this); + auto req = GetInfoRequest::create(m_image_ctx, &m_mirror_image, + &m_promotion_state, + &m_primary_mirror_uuid, ctx); + req->send(); +} + +template +void PromoteRequest::handle_get_info(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else if (m_mirror_image.state != cls::rbd::MIRROR_IMAGE_STATE_ENABLED) { + lderr(cct) << "mirroring is not currently enabled" << dendl; + finish(-EINVAL); + return; + } else if (m_promotion_state == PROMOTION_STATE_PRIMARY) { + lderr(cct) << "image is already primary" << dendl; + finish(-EINVAL); + return; + } else if (m_promotion_state == PROMOTION_STATE_NON_PRIMARY && !m_force) { + lderr(cct) << "image is primary 
within a remote cluster or demotion is not propagated yet" + << dendl; + finish(-EBUSY); + return; + } + + promote(); +} + +template +void PromoteRequest::promote() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, &PromoteRequest::handle_promote>(this); + if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + Journal::promote(&m_image_ctx, ctx); + } else if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + auto req = mirror::snapshot::PromoteRequest::create( + &m_image_ctx, m_mirror_image.global_image_id, ctx); + req->send(); + } else { + lderr(cct) << "unknown image mirror mode: " << m_mirror_image.mode << dendl; + finish(-EOPNOTSUPP); + } +} + +template +void PromoteRequest::handle_promote(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to promote image: " << cpp_strerror(r) + << dendl; + } + + finish(r); +} + +template +void PromoteRequest::finish(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::PromoteRequest; diff --git a/src/librbd/mirror/PromoteRequest.h b/src/librbd/mirror/PromoteRequest.h new file mode 100644 index 000000000..c54f3bb76 --- /dev/null +++ b/src/librbd/mirror/PromoteRequest.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/mirror/Types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { + +template +class PromoteRequest { +public: + static PromoteRequest *create(ImageCtxT &image_ctx, bool force, + Context *on_finish) { + return new PromoteRequest(image_ctx, force, on_finish); + } + + PromoteRequest(ImageCtxT &image_ctx, bool force, Context *on_finish) + : m_image_ctx(image_ctx), m_force(force), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_INFO + * | + * v + * GET_TAG_OWNER + * | + * v + * PROMOTE + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + bool m_force; + Context *m_on_finish; + + cls::rbd::MirrorImage m_mirror_image; + PromotionState m_promotion_state = PROMOTION_STATE_PRIMARY; + std::string m_primary_mirror_uuid; + + void get_info(); + void handle_get_info(int r); + + void promote(); + void handle_promote(int r); + + void finish(int r); + +}; + +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::PromoteRequest; + +#endif // CEPH_LIBRBD_MIRROR_PROMOTE_REQUEST_H diff --git a/src/librbd/mirror/Types.h b/src/librbd/mirror/Types.h new file mode 100644 index 000000000..2388b74ef --- /dev/null +++ b/src/librbd/mirror/Types.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_TYPES_H +#define CEPH_LIBRBD_MIRROR_TYPES_H + +namespace librbd { +namespace mirror { + +enum PromotionState { + PROMOTION_STATE_UNKNOWN, + PROMOTION_STATE_PRIMARY, + PROMOTION_STATE_NON_PRIMARY, + PROMOTION_STATE_ORPHAN +}; + +} // namespace mirror +} // namespace librbd + +#endif // CEPH_LIBRBD_MIRROR_TYPES_H + diff --git a/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc 
b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc new file mode 100644 index 000000000..eed0aa506 --- /dev/null +++ b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.cc @@ -0,0 +1,273 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/CreateNonPrimaryRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/mirror/snapshot/Utils.h" +#include "librbd/mirror/snapshot/WriteImageStateRequest.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::CreateNonPrimaryRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +CreateNonPrimaryRequest::CreateNonPrimaryRequest( + I* image_ctx, bool demoted, const std::string &primary_mirror_uuid, + uint64_t primary_snap_id, const SnapSeqs& snap_seqs, + const ImageState &image_state, uint64_t *snap_id, Context *on_finish) + : m_image_ctx(image_ctx), m_demoted(demoted), + m_primary_mirror_uuid(primary_mirror_uuid), + m_primary_snap_id(primary_snap_id), m_snap_seqs(snap_seqs), + m_image_state(image_state), m_snap_id(snap_id), m_on_finish(on_finish) { + m_default_ns_ctx.dup(m_image_ctx->md_ctx); + m_default_ns_ctx.set_namespace(""); +} + +template +void CreateNonPrimaryRequest::send() { + refresh_image(); +} + +template +void CreateNonPrimaryRequest::refresh_image() { + if (!m_image_ctx->state->is_refresh_required()) { + get_mirror_image(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + CreateNonPrimaryRequest, + &CreateNonPrimaryRequest::handle_refresh_image>(this); + m_image_ctx->state->refresh(ctx); +} + +template +void CreateNonPrimaryRequest::handle_refresh_image(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + get_mirror_image(); +} + +template +void CreateNonPrimaryRequest::get_mirror_image() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, m_image_ctx->id); + + librados::AioCompletion *comp = create_rados_callback< + CreateNonPrimaryRequest, + &CreateNonPrimaryRequest::handle_get_mirror_image>(this); + int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreateNonPrimaryRequest::handle_get_mirror_image(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + cls::rbd::MirrorImage mirror_image; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_image_get_finish(&iter, &mirror_image); + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve mirroring state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (mirror_image.mode != cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) { + lderr(cct) << "snapshot based mirroring is not enabled" << dendl; + finish(-EINVAL); + return; + } + + if (!is_orphan() && !util::can_create_non_primary_snapshot(m_image_ctx)) { + 
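+    // [editorial sketch -- not part of the original patch] is_orphan()
+    // simply tests m_primary_mirror_uuid.empty(): an "orphan" snapshot is
+    // requested while promoting a formerly non-primary image, before any
+    // primary mirror uuid exists. snapshot::PromoteRequest reaches this
+    // path roughly as follows (arguments per the factory in the header):
+    //
+    //   auto req = CreateNonPrimaryRequest<I>::create(
+    //       image_ctx, false /* demoted */, "" /* no primary => orphan */,
+    //       CEPH_NOSNAP, {} /* snap_seqs */, {} /* image_state */,
+    //       nullptr /* snap_id */, on_finish);
+    //   req->send();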
finish(-EINVAL); + return; + } + + uuid_d uuid_gen; + uuid_gen.generate_random(); + m_snap_name = ".mirror.non_primary." + mirror_image.global_image_id + "." + + uuid_gen.to_string(); + + get_mirror_peers(); +} + +template +void CreateNonPrimaryRequest::get_mirror_peers() { + if (!m_demoted) { + create_snapshot(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_peer_list_start(&op); + + auto aio_comp = create_rados_callback< + CreateNonPrimaryRequest, + &CreateNonPrimaryRequest::handle_get_mirror_peers>(this); + m_out_bl.clear(); + int r = m_default_ns_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void CreateNonPrimaryRequest::handle_get_mirror_peers(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + std::vector peers; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_peer_list_finish(&iter, &peers); + } + + if (r < 0) { + lderr(cct) << "failed to retrieve mirror peers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + for (auto &peer : peers) { + if (peer.mirror_peer_direction == cls::rbd::MIRROR_PEER_DIRECTION_RX) { + continue; + } + m_mirror_peer_uuids.insert(peer.uuid); + } + + create_snapshot(); +} + +template +void CreateNonPrimaryRequest::create_snapshot() { + CephContext *cct = m_image_ctx->cct; + + cls::rbd::MirrorSnapshotNamespace ns{ + (m_demoted ? cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY_DEMOTED : + cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY), {}, + m_primary_mirror_uuid, m_primary_snap_id}; + if (m_demoted) { + ns.mirror_peer_uuids = m_mirror_peer_uuids; + } + ns.snap_seqs = m_snap_seqs; + ns.complete = is_orphan(); + ldout(cct, 15) << "ns=" << ns << dendl; + + auto ctx = create_context_callback< + CreateNonPrimaryRequest, + &CreateNonPrimaryRequest::handle_create_snapshot>(this); + m_image_ctx->operations->snap_create(ns, m_snap_name, 0, m_prog_ctx, ctx); +} + +template +void CreateNonPrimaryRequest::handle_create_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to create mirror snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + write_image_state(); +} + +template +void CreateNonPrimaryRequest::write_image_state() { + uint64_t snap_id; + { + std::shared_lock image_locker{m_image_ctx->image_lock}; + snap_id = m_image_ctx->get_snap_id( + cls::rbd::MirrorSnapshotNamespace{}, m_snap_name); + } + + if (m_snap_id != nullptr) { + *m_snap_id = snap_id; + } + + if (is_orphan()) { + finish(0); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + CreateNonPrimaryRequest, + &CreateNonPrimaryRequest::handle_write_image_state>(this); + + auto req = WriteImageStateRequest::create(m_image_ctx, snap_id, + m_image_state, ctx); + req->send(); +} + +template +void CreateNonPrimaryRequest::handle_write_image_state(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to write image state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void CreateNonPrimaryRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror 
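+// [editorial sketch -- not part of the original patch] The snapshot created
+// above is fully described by its cls namespace; a demoted non-primary
+// snapshot would carry, for example:
+//
+//   cls::rbd::MirrorSnapshotNamespace ns{
+//     cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY_DEMOTED,
+//     mirror_peer_uuids,      // TX peers that still reference this snapshot
+//     primary_mirror_uuid,    // cluster the data is replicated from
+//     primary_snap_id};       // matching snapshot id on the primary
+//   ns.snap_seqs = snap_seqs; // local -> primary snap_id mapping
+//   ns.complete = false;      // flipped once the data sync finishes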
+} // namespace librbd + +template class librbd::mirror::snapshot::CreateNonPrimaryRequest; diff --git a/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h new file mode 100644 index 000000000..36f155413 --- /dev/null +++ b/src/librbd/mirror/snapshot/CreateNonPrimaryRequest.h @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_NON_PRIMARY_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_NON_PRIMARY_REQUEST_H + +#include "include/buffer.h" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/Types.h" +#include "librbd/internal.h" +#include "librbd/mirror/snapshot/Types.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class CreateNonPrimaryRequest { +public: + static CreateNonPrimaryRequest *create(ImageCtxT *image_ctx, + bool demoted, + const std::string &primary_mirror_uuid, + uint64_t primary_snap_id, + const SnapSeqs& snap_seqs, + const ImageState &image_state, + uint64_t *snap_id, + Context *on_finish) { + return new CreateNonPrimaryRequest(image_ctx, demoted, primary_mirror_uuid, + primary_snap_id, snap_seqs, image_state, + snap_id, on_finish); + } + + CreateNonPrimaryRequest(ImageCtxT *image_ctx, + bool demoted, + const std::string &primary_mirror_uuid, + uint64_t primary_snap_id, + const SnapSeqs& snap_seqs, + const ImageState &image_state, uint64_t *snap_id, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * REFRESH_IMAGE + * | + * v + * GET_MIRROR_IMAGE + * | + * v (skip if not needed) + * GET_MIRROR_PEERS + * | + * v + * CREATE_SNAPSHOT + * | + * v + * WRITE_IMAGE_STATE + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + bool m_demoted; + std::string m_primary_mirror_uuid; + uint64_t m_primary_snap_id; + SnapSeqs m_snap_seqs; + ImageState m_image_state; + uint64_t *m_snap_id; + Context *m_on_finish; + + librados::IoCtx m_default_ns_ctx; + std::set m_mirror_peer_uuids; + + std::string m_snap_name; + + bufferlist m_out_bl; + NoOpProgressContext m_prog_ctx; + + bool is_orphan() const { + return m_primary_mirror_uuid.empty(); + } + + void refresh_image(); + void handle_refresh_image(int r); + + void get_mirror_image(); + void handle_get_mirror_image(int r); + + void get_mirror_peers(); + void handle_get_mirror_peers(int r); + + void create_snapshot(); + void handle_create_snapshot(int r); + + void write_image_state(); + void handle_write_image_state(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::CreateNonPrimaryRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_NON_PRIMARY_REQUEST_H diff --git a/src/librbd/mirror/snapshot/CreatePrimaryRequest.cc b/src/librbd/mirror/snapshot/CreatePrimaryRequest.cc new file mode 100644 index 000000000..c8e3a4fe7 --- /dev/null +++ b/src/librbd/mirror/snapshot/CreatePrimaryRequest.cc @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/CreatePrimaryRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include 
"librbd/mirror/snapshot/UnlinkPeerRequest.h" +#include "librbd/mirror/snapshot/Utils.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::CreatePrimaryRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +CreatePrimaryRequest::CreatePrimaryRequest( + I *image_ctx, const std::string& global_image_id, + uint64_t clean_since_snap_id, uint64_t snap_create_flags, uint32_t flags, + uint64_t *snap_id, Context *on_finish) + : m_image_ctx(image_ctx), m_global_image_id(global_image_id), + m_clean_since_snap_id(clean_since_snap_id), + m_snap_create_flags(snap_create_flags), m_flags(flags), m_snap_id(snap_id), + m_on_finish(on_finish) { + m_default_ns_ctx.dup(m_image_ctx->md_ctx); + m_default_ns_ctx.set_namespace(""); +} + +template +void CreatePrimaryRequest::send() { + if (!util::can_create_primary_snapshot( + m_image_ctx, + ((m_flags & CREATE_PRIMARY_FLAG_DEMOTED) != 0), + ((m_flags & CREATE_PRIMARY_FLAG_FORCE) != 0), nullptr, nullptr)) { + finish(-EINVAL); + return; + } + + uuid_d uuid_gen; + uuid_gen.generate_random(); + m_snap_name = ".mirror.primary." + m_global_image_id + "." + + uuid_gen.to_string(); + + get_mirror_peers(); +} + +template +void CreatePrimaryRequest::get_mirror_peers() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_peer_list_start(&op); + + librados::AioCompletion *comp = create_rados_callback< + CreatePrimaryRequest, + &CreatePrimaryRequest::handle_get_mirror_peers>(this); + m_out_bl.clear(); + int r = m_default_ns_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void CreatePrimaryRequest::handle_get_mirror_peers(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + std::vector peers; + if (r == 0) { + auto iter = m_out_bl.cbegin(); + r = cls_client::mirror_peer_list_finish(&iter, &peers); + } + + if (r < 0) { + lderr(cct) << "failed to retrieve mirror peers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + for (auto &peer : peers) { + if (peer.mirror_peer_direction == cls::rbd::MIRROR_PEER_DIRECTION_RX) { + continue; + } + m_mirror_peer_uuids.insert(peer.uuid); + } + + if (m_mirror_peer_uuids.empty() && + ((m_flags & CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS) == 0)) { + lderr(cct) << "no mirror tx peers configured for the pool" << dendl; + finish(-EINVAL); + return; + } + + create_snapshot(); +} + +template +void CreatePrimaryRequest::create_snapshot() { + cls::rbd::MirrorSnapshotNamespace ns{ + ((m_flags & CREATE_PRIMARY_FLAG_DEMOTED) != 0 ? 
+ cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY_DEMOTED : + cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY), + m_mirror_peer_uuids, "", m_clean_since_snap_id}; + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "name=" << m_snap_name << ", " + << "ns=" << ns << dendl; + auto ctx = create_context_callback< + CreatePrimaryRequest, + &CreatePrimaryRequest::handle_create_snapshot>(this); + m_image_ctx->operations->snap_create(ns, m_snap_name, m_snap_create_flags, + m_prog_ctx, ctx); +} + +template +void CreatePrimaryRequest::handle_create_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to create mirror snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + refresh_image(); +} + +template +void CreatePrimaryRequest::refresh_image() { + // refresh is required to retrieve the snapshot id (if snapshot + // created via remote RPC) and complete flag (regardless) + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + CreatePrimaryRequest, + &CreatePrimaryRequest::handle_refresh_image>(this); + m_image_ctx->state->refresh(ctx); +} + +template +void CreatePrimaryRequest::handle_refresh_image(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + if (m_snap_id != nullptr) { + std::shared_lock image_locker{m_image_ctx->image_lock}; + *m_snap_id = m_image_ctx->get_snap_id( + cls::rbd::MirrorSnapshotNamespace{}, m_snap_name); + ldout(cct, 15) << "snap_id=" << *m_snap_id << dendl; + } + + unlink_peer(); +} + +template +void CreatePrimaryRequest::unlink_peer() { + // TODO: Document semantics for unlink_peer + uint64_t max_snapshots = m_image_ctx->config.template get_val( + "rbd_mirroring_max_mirroring_snapshots"); + ceph_assert(max_snapshots >= 3); + + std::string peer_uuid; + uint64_t snap_id = CEPH_NOSNAP; + + { + std::shared_lock image_locker{m_image_ctx->image_lock}; + for (const auto& peer : m_mirror_peer_uuids) { + for (const auto& snap_info_pair : m_image_ctx->snap_info) { + auto info = std::get_if( + &snap_info_pair.second.snap_namespace); + if (info == nullptr) { + continue; + } + if (info->mirror_peer_uuids.empty() || + (info->mirror_peer_uuids.count(peer) != 0 && + info->is_primary() && !info->complete)) { + peer_uuid = peer; + snap_id = snap_info_pair.first; + goto do_unlink; + } + } + } + for (const auto& peer : m_mirror_peer_uuids) { + size_t count = 0; + uint64_t unlink_snap_id = 0; + for (const auto& snap_info_pair : m_image_ctx->snap_info) { + auto info = std::get_if( + &snap_info_pair.second.snap_namespace); + if (info == nullptr) { + continue; + } + if (info->state != cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY) { + // reset counters -- we count primary snapshots after the last + // promotion + count = 0; + unlink_snap_id = 0; + continue; + } + if (info->mirror_peer_uuids.count(peer) == 0) { + // snapshot is not linked with this peer + continue; + } + count++; + if (count == max_snapshots) { + unlink_snap_id = snap_info_pair.first; + } + if (count > max_snapshots) { + peer_uuid = peer; + snap_id = unlink_snap_id; + goto do_unlink; + } + } + } + } + + finish(0); + return; + +do_unlink: + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "peer=" << peer_uuid << ", snap_id=" << snap_id << dendl; + + auto ctx = create_context_callback< + CreatePrimaryRequest, + 
&CreatePrimaryRequest::handle_unlink_peer>(this); + auto req = UnlinkPeerRequest::create(m_image_ctx, snap_id, peer_uuid, true, + ctx); + req->send(); +} + +template +void CreatePrimaryRequest::handle_unlink_peer(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to unlink peer: " << cpp_strerror(r) << dendl; + finish(0); // not fatal + return; + } + + unlink_peer(); +} + +template +void CreatePrimaryRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::CreatePrimaryRequest; diff --git a/src/librbd/mirror/snapshot/CreatePrimaryRequest.h b/src/librbd/mirror/snapshot/CreatePrimaryRequest.h new file mode 100644 index 000000000..b8e84cf2b --- /dev/null +++ b/src/librbd/mirror/snapshot/CreatePrimaryRequest.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_PRIMARY_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_PRIMARY_REQUEST_H + +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/internal.h" +#include "librbd/mirror/snapshot/Types.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class CreatePrimaryRequest { +public: + static CreatePrimaryRequest *create(ImageCtxT *image_ctx, + const std::string& global_image_id, + uint64_t clean_since_snap_id, + uint64_t snap_create_flags, + uint32_t flags, uint64_t *snap_id, + Context *on_finish) { + return new CreatePrimaryRequest(image_ctx, global_image_id, + clean_since_snap_id, snap_create_flags, flags, + snap_id, on_finish); + } + + CreatePrimaryRequest(ImageCtxT *image_ctx, + const std::string& global_image_id, + uint64_t clean_since_snap_id, uint64_t snap_create_flags, + uint32_t flags, uint64_t *snap_id, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_MIRROR_PEERS + * | + * v + * CREATE_SNAPSHOT + * | + * v + * REFRESH_IMAGE + * | + * v + * UNLINK_PEER (skip if not needed, + * | repeat if needed) + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + std::string m_global_image_id; + uint64_t m_clean_since_snap_id; + const uint64_t m_snap_create_flags; + const uint32_t m_flags; + uint64_t *m_snap_id; + Context *m_on_finish; + + librados::IoCtx m_default_ns_ctx; + std::set m_mirror_peer_uuids; + std::string m_snap_name; + + bufferlist m_out_bl; + NoOpProgressContext m_prog_ctx; + + void get_mirror_peers(); + void handle_get_mirror_peers(int r); + + void create_snapshot(); + void handle_create_snapshot(int r); + + void refresh_image(); + void handle_refresh_image(int r); + + void unlink_peer(); + void handle_unlink_peer(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::CreatePrimaryRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_CREATE_PRIMARY_REQUEST_H diff --git a/src/librbd/mirror/snapshot/DemoteRequest.cc b/src/librbd/mirror/snapshot/DemoteRequest.cc new file mode 100644 index 000000000..ccaa33c83 --- /dev/null +++ b/src/librbd/mirror/snapshot/DemoteRequest.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/DemoteRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/mirror/snapshot/CreatePrimaryRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::DemoteRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +void DemoteRequest::send() { + enable_non_primary_feature(); +} + +template +void DemoteRequest::enable_non_primary_feature() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // ensure image is flagged with non-primary feature so that + // standard RBD clients cannot write to it. + librados::ObjectWriteOperation op; + cls_client::set_features(&op, RBD_FEATURE_NON_PRIMARY, + RBD_FEATURE_NON_PRIMARY); + + auto aio_comp = create_rados_callback< + DemoteRequest, + &DemoteRequest::handle_enable_non_primary_feature>(this); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DemoteRequest::handle_enable_non_primary_feature(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to enable non-primary feature: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + create_snapshot(); +} + +template +void DemoteRequest::create_snapshot() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + DemoteRequest, &DemoteRequest::handle_create_snapshot>(this); + + auto req = CreatePrimaryRequest::create( + m_image_ctx, m_global_image_id, CEPH_NOSNAP, + SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE, + (snapshot::CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS | + snapshot::CREATE_PRIMARY_FLAG_DEMOTED), nullptr, ctx); + req->send(); +} + +template +void DemoteRequest::handle_create_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to create mirror snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void DemoteRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::DemoteRequest; diff --git a/src/librbd/mirror/snapshot/DemoteRequest.h b/src/librbd/mirror/snapshot/DemoteRequest.h new file mode 100644 index 000000000..63c935645 --- /dev/null +++ b/src/librbd/mirror/snapshot/DemoteRequest.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_DEMOTE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_DEMOTE_REQUEST_H + +#include "include/buffer.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class DemoteRequest { +public: + static DemoteRequest *create(ImageCtxT *image_ctx, + const std::string& global_image_id, + Context 
*on_finish) { + return new DemoteRequest(image_ctx, global_image_id, on_finish); + } + + DemoteRequest(ImageCtxT *image_ctx, const std::string& global_image_id, + Context *on_finish) + : m_image_ctx(image_ctx), m_global_image_id(global_image_id), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * ENABLE_NON_PRIMARY_FEATURE + * | + * v + * CREATE_SNAPSHOT + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + std::string m_global_image_id; + Context *m_on_finish; + + void enable_non_primary_feature(); + void handle_enable_non_primary_feature(int r); + + void create_snapshot(); + void handle_create_snapshot(int r); + + void finish(int r); + +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::DemoteRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_DEMOTE_REQUEST_H diff --git a/src/librbd/mirror/snapshot/GetImageStateRequest.cc b/src/librbd/mirror/snapshot/GetImageStateRequest.cc new file mode 100644 index 000000000..4692f88cb --- /dev/null +++ b/src/librbd/mirror/snapshot/GetImageStateRequest.cc @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/GetImageStateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/mirror/snapshot/Types.h" +#include "librbd/mirror/snapshot/Utils.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::GetImageStateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_rados_callback; + +template +void GetImageStateRequest::send() { + read_object(); +} + + +template +void GetImageStateRequest::read_object() { + CephContext *cct = m_image_ctx->cct; + + auto oid = util::image_state_object_name(m_image_ctx, m_snap_id, + m_object_index); + ldout(cct, 15) << oid << dendl; + + librados::ObjectReadOperation op; + m_bl.clear(); + op.read(0, 0, &m_bl, nullptr); + + librados::AioCompletion *comp = create_rados_callback< + GetImageStateRequest, + &GetImageStateRequest::handle_read_object>(this); + int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template +void GetImageStateRequest::handle_read_object(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read image state object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + auto iter = m_bl.cbegin(); + + if (m_object_index == 0) { + ImageStateHeader header; + try { + using ceph::decode; + decode(header, iter); + } catch (const buffer::error &err) { + lderr(cct) << "failed to decode image state object header" << dendl; + finish(-EBADMSG); + return; + } + m_object_count = header.object_count; + } + + bufferlist bl; + bl.substr_of(m_bl, iter.get_off(), m_bl.length() - iter.get_off()); + m_state_bl.claim_append(bl); + + m_object_index++; + + if (m_object_index >= m_object_count) { + finish(0); + return; + } + + read_object(); +} + +template +void GetImageStateRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r == 0) { + try { + using ceph::decode; + decode(*m_image_state, m_state_bl); + } catch (const buffer::error &err) { + 
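+      // [editorial note -- not part of the original patch] m_state_bl is
+      // the payload accumulated in handle_read_object(): object 0 begins
+      // with an encoded ImageStateHeader (its object_count field drives the
+      // read loop) and the remaining bytes of every chunk object are
+      // appended before this single ImageState decode, conceptually:
+      //
+      //   obj0: header|state[0..a)  obj1: state[a..b)  ... -> ImageState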
lderr(cct) << "failed to decode image state" << dendl; + r = -EBADMSG; + } + } + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::GetImageStateRequest; diff --git a/src/librbd/mirror/snapshot/GetImageStateRequest.h b/src/librbd/mirror/snapshot/GetImageStateRequest.h new file mode 100644 index 000000000..483e3a228 --- /dev/null +++ b/src/librbd/mirror/snapshot/GetImageStateRequest.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_GET_IMAGE_STATE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_GET_IMAGE_STATE_REQUEST_H + +#include "include/buffer.h" +#include "include/types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +struct ImageState; + +template +class GetImageStateRequest { +public: + static GetImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id, + ImageState *image_state, + Context *on_finish) { + return new GetImageStateRequest(image_ctx, snap_id, image_state, on_finish); + } + + GetImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id, + ImageState *image_state, Context *on_finish) + : m_image_ctx(image_ctx), m_snap_id(snap_id), m_image_state(image_state), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * READ_OBJECT (repeat for + * | every object) + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + uint64_t m_snap_id; + ImageState *m_image_state; + Context *m_on_finish; + + bufferlist m_bl; + bufferlist m_state_bl; + + size_t m_object_count = 0; + size_t m_object_index = 0; + + void read_object(); + void handle_read_object(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::GetImageStateRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_GET_IMAGE_STATE_REQUEST_H diff --git a/src/librbd/mirror/snapshot/ImageMeta.cc b/src/librbd/mirror/snapshot/ImageMeta.cc new file mode 100644 index 000000000..826899775 --- /dev/null +++ b/src/librbd/mirror/snapshot/ImageMeta.cc @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/ImageMeta.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "json_spirit/json_spirit.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/WatchNotifyTypes.h" +#include "librbd/mirror/snapshot/Utils.h" +#include "librbd/watcher/Notifier.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::ImageMeta: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_rados_callback; +using librbd::mirror::snapshot::util::get_image_meta_key; + +template +ImageMeta::ImageMeta(I* image_ctx, const std::string& mirror_uuid) + : m_image_ctx(image_ctx), m_mirror_uuid(mirror_uuid) { +} + +template +void ImageMeta::load(Context* on_finish) { + ldout(m_image_ctx->cct, 15) << "oid=" << m_image_ctx->header_oid << ", " + << "key=" << get_image_meta_key(m_mirror_uuid) + << dendl; + + librados::ObjectReadOperation op; + cls_client::metadata_get_start(&op, get_image_meta_key(m_mirror_uuid)); + + m_out_bl.clear(); + 
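+  // [editorial note -- not part of the original patch] The value fetched
+  // here is the small JSON blob that save() below writes as image metadata
+  // under the mirror-uuid-scoped key, e.g.:
+  //
+  //   get_image_meta_key(mirror_uuid) -> {"resync_requested": false}
+  //
+  // handle_load() parses it with json_spirit, and only the boolean
+  // "resync_requested" field is recognized.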
auto ctx = new LambdaContext([this, on_finish](int r) { + handle_load(on_finish, r); + }); + auto aio_comp = create_rados_callback(ctx); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void ImageMeta::handle_load(Context* on_finish, int r) { + ldout(m_image_ctx->cct, 15) << "r=" << r << dendl; + + std::string data; + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::metadata_get_finish(&it, &data); + } + + if (r == -ENOENT) { + ldout(m_image_ctx->cct, 15) << "no snapshot-based mirroring image-meta: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } else if (r < 0) { + lderr(m_image_ctx->cct) << "failed to load snapshot-based mirroring " + << "image-meta: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + bool json_valid = false; + json_spirit::mValue json_root; + if (json_spirit::read(data, json_root)) { + try { + auto& json_obj = json_root.get_obj(); + resync_requested = json_obj["resync_requested"].get_bool(); + json_valid = true; + } catch (std::runtime_error&) { + } + } + + if (!json_valid) { + lderr(m_image_ctx->cct) << "invalid image-meta JSON received" << dendl; + on_finish->complete(-EBADMSG); + return; + } + + on_finish->complete(0); +} + +template +void ImageMeta::save(Context* on_finish) { + ldout(m_image_ctx->cct, 15) << "oid=" << m_image_ctx->header_oid << ", " + << "key=" << get_image_meta_key(m_mirror_uuid) + << dendl; + + // simple implementation for now + std::string json = "{\"resync_requested\": " + + std::string(resync_requested ? "true" : "false") + "}"; + + bufferlist bl; + bl.append(json); + + // avoid using built-in metadata_set operation since that would require + // opening the non-primary image in read/write mode which isn't supported + librados::ObjectWriteOperation op; + cls_client::metadata_set(&op, {{get_image_meta_key(m_mirror_uuid), bl}}); + + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_save(on_finish, r); + }); + auto aio_comp = create_rados_callback(ctx); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void ImageMeta::handle_save(Context* on_finish, int r) { + ldout(m_image_ctx->cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to save snapshot-based mirroring " + << "image-meta: " << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + notify_update(on_finish); +} + +template +void ImageMeta::notify_update(Context* on_finish) { + ldout(m_image_ctx->cct, 15) << dendl; + + // directly send header notification on image since you cannot + // open a non-primary image read/write and therefore cannot re-use + // the ImageWatcher to send the notification + bufferlist bl; + encode(watch_notify::NotifyMessage(new watch_notify::HeaderUpdatePayload()), + bl); + + m_out_bl.clear(); + auto ctx = new LambdaContext([this, on_finish](int r) { + handle_notify_update(on_finish, r); + }); + auto aio_comp = create_rados_callback(ctx); + int r = m_image_ctx->md_ctx.aio_notify( + m_image_ctx->header_oid, aio_comp, bl, watcher::Notifier::NOTIFY_TIMEOUT, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void ImageMeta::handle_notify_update(Context* on_finish, int r) { + ldout(m_image_ctx->cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_image_ctx->cct) << "failed to notify image update: " + << 
cpp_strerror(r) << dendl; + } + on_finish->complete(r); +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::ImageMeta; diff --git a/src/librbd/mirror/snapshot/ImageMeta.h b/src/librbd/mirror/snapshot/ImageMeta.h new file mode 100644 index 000000000..5d05f1927 --- /dev/null +++ b/src/librbd/mirror/snapshot/ImageMeta.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_IMAGE_META_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_IMAGE_META_H + +#include "include/rados/librados.hpp" +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class ImageMeta { +public: + static ImageMeta* create(ImageCtxT* image_ctx, + const std::string& mirror_uuid) { + return new ImageMeta(image_ctx, mirror_uuid); + } + + ImageMeta(ImageCtxT* image_ctx, const std::string& mirror_uuid); + + void load(Context* on_finish); + void save(Context* on_finish); + + bool resync_requested = false; + +private: + /** + * @verbatim + * + * + * | + * v + * METADATA_GET + * | + * v + * + * | + * v + * METADATA_SET + * | + * v + * NOTIFY_UPDATE + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT* m_image_ctx; + std::string m_mirror_uuid; + + bufferlist m_out_bl; + + void handle_load(Context* on_finish, int r); + + void handle_save(Context* on_finish, int r); + + void notify_update(Context* on_finish); + void handle_notify_update(Context* on_finish, int r); + +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::ImageMeta; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_IMAGE_META_H diff --git a/src/librbd/mirror/snapshot/PromoteRequest.cc b/src/librbd/mirror/snapshot/PromoteRequest.cc new file mode 100644 index 000000000..9718c299e --- /dev/null +++ b/src/librbd/mirror/snapshot/PromoteRequest.cc @@ -0,0 +1,405 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/PromoteRequest.h" +#include "common/Timer.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" +#include "librbd/asio/ContextWQ.h" +#include "librbd/image/ListWatchersRequest.h" +#include "librbd/mirror/snapshot/CreateNonPrimaryRequest.h" +#include "librbd/mirror/snapshot/CreatePrimaryRequest.h" +#include "librbd/mirror/snapshot/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::PromoteRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_async_context_callback; +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +void PromoteRequest::send() { + CephContext *cct = m_image_ctx->cct; + bool requires_orphan = false; + if (!util::can_create_primary_snapshot(m_image_ctx, false, true, + &requires_orphan, + &m_rollback_snap_id)) { + lderr(cct) << "cannot promote" << dendl; + finish(-EINVAL); + return; + } else if (m_rollback_snap_id == CEPH_NOSNAP && !requires_orphan) { + create_promote_snapshot(); + return; + } + + ldout(cct, 15) << "requires_orphan=" << requires_orphan << ", " + << 
"rollback_snap_id=" << m_rollback_snap_id << dendl; + create_orphan_snapshot(); +} + +template +void PromoteRequest::create_orphan_snapshot() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, + &PromoteRequest::handle_create_orphan_snapshot>(this); + + auto req = CreateNonPrimaryRequest::create( + m_image_ctx, false, "", CEPH_NOSNAP, {}, {}, nullptr, ctx); + req->send(); +} + +template +void PromoteRequest::handle_create_orphan_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to create orphan snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + list_watchers(); +} + +template +void PromoteRequest::list_watchers() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, + &PromoteRequest::handle_list_watchers>(this); + + m_watchers.clear(); + auto flags = librbd::image::LIST_WATCHERS_FILTER_OUT_MY_INSTANCE | + librbd::image::LIST_WATCHERS_MIRROR_INSTANCES_ONLY; + auto req = librbd::image::ListWatchersRequest::create( + *m_image_ctx, flags, &m_watchers, ctx); + req->send(); +} + +template +void PromoteRequest::handle_list_watchers(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to list watchers: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_watchers.empty()) { + acquire_exclusive_lock(); + return; + } + + wait_update_notify(); +} + +template +void PromoteRequest::wait_update_notify() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + ImageCtx::get_timer_instance(cct, &m_timer, &m_timer_lock); + + std::lock_guard timer_lock{*m_timer_lock}; + + m_scheduler_ticks = 5; + + int r = m_image_ctx->state->register_update_watcher(&m_update_watch_ctx, + &m_update_watcher_handle); + if (r < 0) { + lderr(cct) << "failed to register update watcher: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + scheduler_unregister_update_watcher(); +} + +template +void PromoteRequest::handle_update_notify() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + std::lock_guard timer_lock{*m_timer_lock}; + m_scheduler_ticks = 0; +} + +template +void PromoteRequest::scheduler_unregister_update_watcher() { + ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "scheduler_ticks=" << m_scheduler_ticks << dendl; + + if (m_scheduler_ticks > 0) { + m_scheduler_ticks--; + m_timer->add_event_after(1, new LambdaContext([this](int) { + scheduler_unregister_update_watcher(); + })); + return; + } + + m_image_ctx->op_work_queue->queue(new LambdaContext([this](int) { + unregister_update_watcher(); + }), 0); +} + +template +void PromoteRequest::unregister_update_watcher() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, + &PromoteRequest::handle_unregister_update_watcher>(this); + + m_image_ctx->state->unregister_update_watcher(m_update_watcher_handle, ctx); +} + +template +void PromoteRequest::handle_unregister_update_watcher(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to unregister update watcher: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + list_watchers(); +} + +template +void 
PromoteRequest::acquire_exclusive_lock() { + { + std::unique_lock locker{m_image_ctx->owner_lock}; + if (m_image_ctx->exclusive_lock != nullptr && + !m_image_ctx->exclusive_lock->is_lock_owner()) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + m_lock_acquired = true; + m_image_ctx->exclusive_lock->block_requests(0); + + auto ctx = create_context_callback< + PromoteRequest, + &PromoteRequest::handle_acquire_exclusive_lock>(this); + + m_image_ctx->exclusive_lock->acquire_lock(ctx); + return; + } + } + + rollback(); +} + +template +void PromoteRequest::handle_acquire_exclusive_lock(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to acquire exclusive lock: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } else { + std::unique_lock locker{m_image_ctx->owner_lock}; + if (m_image_ctx->exclusive_lock != nullptr && + !m_image_ctx->exclusive_lock->is_lock_owner()) { + lderr(cct) << "failed to acquire exclusive lock" << dendl; + r = m_image_ctx->exclusive_lock->get_unlocked_op_error(); + locker.unlock(); + finish(r); + return; + } + } + + rollback(); +} + +template +void PromoteRequest::rollback() { + if (m_rollback_snap_id == CEPH_NOSNAP) { + create_promote_snapshot(); + return; + } + + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + std::shared_lock owner_locker{m_image_ctx->owner_lock}; + std::shared_lock image_locker{m_image_ctx->image_lock}; + + auto info = m_image_ctx->get_snap_info(m_rollback_snap_id); + ceph_assert(info != nullptr); + auto snap_namespace = info->snap_namespace; + auto snap_name = info->name; + + image_locker.unlock(); + + auto ctx = create_async_context_callback( + *m_image_ctx, create_context_callback< + PromoteRequest, &PromoteRequest::handle_rollback>(this)); + + m_image_ctx->operations->execute_snap_rollback(snap_namespace, snap_name, + m_progress_ctx, ctx); +} + +template +void PromoteRequest::handle_rollback(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to rollback: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + create_promote_snapshot(); +} + +template +void PromoteRequest::create_promote_snapshot() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + PromoteRequest, + &PromoteRequest::handle_create_promote_snapshot>(this); + + auto req = CreatePrimaryRequest::create( + m_image_ctx, m_global_image_id, CEPH_NOSNAP, + SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE, + (snapshot::CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS | + snapshot::CREATE_PRIMARY_FLAG_FORCE), nullptr, ctx); + req->send(); +} + +template +void PromoteRequest::handle_create_promote_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to create promote snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + disable_non_primary_feature(); +} + +template +void PromoteRequest::disable_non_primary_feature() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << dendl; + + // remove the non-primary feature flag so that the image can be + // R/W by standard RBD clients + librados::ObjectWriteOperation op; + cls_client::set_features(&op, 0U, RBD_FEATURE_NON_PRIMARY); + + auto aio_comp = create_rados_callback< + PromoteRequest, + &PromoteRequest::handle_disable_non_primary_feature>(this); + int r = 
m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void PromoteRequest::handle_disable_non_primary_feature(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to disable non-primary feature: " + << cpp_strerror(r) << dendl; + finish(r); + return; + } + + release_exclusive_lock(); +} + +template +void PromoteRequest::release_exclusive_lock() { + if (m_lock_acquired) { + std::unique_lock locker{m_image_ctx->owner_lock}; + if (m_image_ctx->exclusive_lock != nullptr) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + m_image_ctx->exclusive_lock->unblock_requests(); + + auto ctx = create_context_callback< + PromoteRequest, + &PromoteRequest::handle_release_exclusive_lock>(this); + + m_image_ctx->exclusive_lock->release_lock(ctx); + return; + } + } + + finish(0); +} + +template +void PromoteRequest::handle_release_exclusive_lock(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to release exclusive lock: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void PromoteRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::PromoteRequest; diff --git a/src/librbd/mirror/snapshot/PromoteRequest.h b/src/librbd/mirror/snapshot/PromoteRequest.h new file mode 100644 index 000000000..1d9a862a0 --- /dev/null +++ b/src/librbd/mirror/snapshot/PromoteRequest.h @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_PROMOTE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_PROMOTE_REQUEST_H + +#include "include/buffer.h" +#include "include/rbd/librbd.hpp" +#include "common/ceph_mutex.h" +#include "common/Timer.h" +#include "librbd/internal.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class PromoteRequest { +public: + static PromoteRequest *create(ImageCtxT *image_ctx, + const std::string& global_image_id, + Context *on_finish) { + return new PromoteRequest(image_ctx, global_image_id, on_finish); + } + + PromoteRequest(ImageCtxT *image_ctx, const std::string& global_image_id, + Context *on_finish) + : m_image_ctx(image_ctx), m_global_image_id(global_image_id), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * | (can promote) + * |\----------------------------------------\ + * | | + * | | + * v (skip if not needed) | + * CREATE_ORPHAN_SNAPSHOT | + * | | + * | /-- UNREGISTER_UPDATE_WATCHER <-\ | + * v v | | + * LIST_WATCHERS ----> WAIT_UPDATE_NOTIFY --/ | + * | | + * | (no watchers) | + * v | + * ACQUIRE_EXCLUSIVE_LOCK | + * | (skip if not needed) | + * v | + * ROLLBACK | + * | | + * v | + * CREATE_PROMOTE_SNAPSHOT <--------------------/ + * | + * v + * DISABLE_NON_PRIMARY_FEATURE + * | + * v + * RELEASE_EXCLUSIVE_LOCK (skip if not needed) + * | + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + std::string m_global_image_id; + Context *m_on_finish; + + uint64_t m_rollback_snap_id = CEPH_NOSNAP; + bool m_lock_acquired = false; + NoOpProgressContext 
m_progress_ctx; + + class UpdateWatchCtx : public librbd::UpdateWatchCtx { + public: + UpdateWatchCtx(PromoteRequest *promote_request) + : promote_request(promote_request) { + } + + void handle_notify() { + promote_request->handle_update_notify(); + } + + private: + PromoteRequest *promote_request; + + } m_update_watch_ctx = {this}; + + std::list m_watchers; + uint64_t m_update_watcher_handle = 0; + uint64_t m_scheduler_ticks = 0; + SafeTimer *m_timer = nullptr; + ceph::mutex *m_timer_lock = nullptr; + + void refresh_image(); + void handle_refresh_image(int r); + + void create_orphan_snapshot(); + void handle_create_orphan_snapshot(int r); + + void list_watchers(); + void handle_list_watchers(int r); + + void wait_update_notify(); + void handle_update_notify(); + void scheduler_unregister_update_watcher(); + + void unregister_update_watcher(); + void handle_unregister_update_watcher(int r); + + void acquire_exclusive_lock(); + void handle_acquire_exclusive_lock(int r); + + void rollback(); + void handle_rollback(int r); + + void create_promote_snapshot(); + void handle_create_promote_snapshot(int r); + + void disable_non_primary_feature(); + void handle_disable_non_primary_feature(int r); + + void release_exclusive_lock(); + void handle_release_exclusive_lock(int r); + + void finish(int r); + +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::PromoteRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_PROMOTE_REQUEST_H diff --git a/src/librbd/mirror/snapshot/RemoveImageStateRequest.cc b/src/librbd/mirror/snapshot/RemoveImageStateRequest.cc new file mode 100644 index 000000000..204e0489a --- /dev/null +++ b/src/librbd/mirror/snapshot/RemoveImageStateRequest.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/RemoveImageStateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/mirror/snapshot/Types.h" +#include "librbd/mirror/snapshot/Utils.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::RemoveImageStateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_rados_callback; + +template +void RemoveImageStateRequest::send() { + get_object_count(); +} + + +template +void RemoveImageStateRequest::get_object_count() { + CephContext *cct = m_image_ctx->cct; + + auto oid = util::image_state_object_name(m_image_ctx, m_snap_id, 0); + ldout(cct, 15) << oid << dendl; + + librados::ObjectReadOperation op; + op.read(0, 0, &m_bl, nullptr); + + librados::AioCompletion *comp = create_rados_callback< + RemoveImageStateRequest, + &RemoveImageStateRequest::handle_get_object_count>(this); + int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op, nullptr); + ceph_assert(r == 0); + comp->release(); +} + +template +void RemoveImageStateRequest::handle_get_object_count(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to read image state object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ImageStateHeader header(1); + auto iter = m_bl.cbegin(); + try { + using ceph::decode; + + decode(header, iter); + } catch (const buffer::error &err) { + lderr(cct) << "failed to decode image state object header" 
<< dendl; + // still try to remove it + } + + m_object_count = header.object_count > 0 ? header.object_count : 1; + + remove_object(); +} + +template +void RemoveImageStateRequest::remove_object() { + CephContext *cct = m_image_ctx->cct; + + ceph_assert(m_object_count > 0); + m_object_count--; + + auto oid = util::image_state_object_name(m_image_ctx, m_snap_id, + m_object_count); + ldout(cct, 15) << oid << dendl; + + librados::ObjectWriteOperation op; + op.remove(); + + librados::AioCompletion *comp = create_rados_callback< + RemoveImageStateRequest, + &RemoveImageStateRequest::handle_remove_object>(this); + int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void RemoveImageStateRequest::handle_remove_object(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to remove image state object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_object_count == 0) { + finish(0); + return; + } + + remove_object(); +} + +template +void RemoveImageStateRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::RemoveImageStateRequest; diff --git a/src/librbd/mirror/snapshot/RemoveImageStateRequest.h b/src/librbd/mirror/snapshot/RemoveImageStateRequest.h new file mode 100644 index 000000000..be7dad8e0 --- /dev/null +++ b/src/librbd/mirror/snapshot/RemoveImageStateRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_REMOVE_IMAGE_STATE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_REMOVE_IMAGE_STATE_REQUEST_H + +#include "include/buffer.h" +#include "include/types.h" + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class RemoveImageStateRequest { +public: + static RemoveImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id, + Context *on_finish) { + return new RemoveImageStateRequest(image_ctx, snap_id, on_finish); + } + + RemoveImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id, + Context *on_finish) + : m_image_ctx(image_ctx), m_snap_id(snap_id), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * GET_OBJECT_COUNT + * | + * v + * REMOVE_OBJECT (repeat for + * | every object) + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + uint64_t m_snap_id; + Context *m_on_finish; + + bufferlist m_bl; + + size_t m_object_count = 0; + + void get_object_count(); + void handle_get_object_count(int r); + + void remove_object(); + void handle_remove_object(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::RemoveImageStateRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_REMOVE_IMAGE_STATE_REQUEST_H diff --git a/src/librbd/mirror/snapshot/SetImageStateRequest.cc b/src/librbd/mirror/snapshot/SetImageStateRequest.cc new file mode 100644 index 000000000..9fcee0322 --- /dev/null +++ b/src/librbd/mirror/snapshot/SetImageStateRequest.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include 
"librbd/mirror/snapshot/SetImageStateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/image/GetMetadataRequest.h" +#include "librbd/mirror/snapshot/WriteImageStateRequest.h" + +#include + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror_snapshot::SetImageStateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +void SetImageStateRequest::send() { + get_name(); +} + +template +void SetImageStateRequest::get_name() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::dir_get_name_start(&op, m_image_ctx->id); + + librados::AioCompletion *comp = create_rados_callback< + SetImageStateRequest, + &SetImageStateRequest::handle_get_name>(this); + m_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, &m_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void SetImageStateRequest::handle_get_name(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_bl.cbegin(); + r = cls_client::dir_get_name_finish(&it, &m_image_state.name); + } + + if (r < 0) { + lderr(cct) << "failed to retrieve image name: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ldout(cct, 15) << "name=" << m_image_state.name << dendl; + + get_snap_limit(); +} + +template +void SetImageStateRequest::get_snap_limit() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + librados::ObjectReadOperation op; + cls_client::snapshot_get_limit_start(&op); + + librados::AioCompletion *comp = create_rados_callback< + SetImageStateRequest, + &SetImageStateRequest::handle_get_snap_limit>(this); + m_bl.clear(); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, comp, &op, + &m_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void SetImageStateRequest::handle_get_snap_limit(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_bl.cbegin(); + r = cls_client::snapshot_get_limit_finish(&it, &m_image_state.snap_limit); + } + + if (r < 0) { + lderr(cct) << "failed to retrieve snapshot limit: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + ldout(cct, 15) << "snap_limit=" << m_image_state.snap_limit << dendl; + + get_metadata(); +} + +template +void SetImageStateRequest::get_metadata() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + SetImageStateRequest, + &SetImageStateRequest::handle_get_metadata>(this); + auto req = image::GetMetadataRequest::create( + m_image_ctx->md_ctx, m_image_ctx->header_oid, true, "", "", 0, + &m_image_state.metadata, ctx); + req->send(); +} + +template +void SetImageStateRequest::handle_get_metadata(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to retrieve metadata: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + { + std::shared_lock image_locker{m_image_ctx->image_lock}; + + m_image_state.features = + m_image_ctx->features & ~RBD_FEATURES_IMPLICIT_ENABLE; + + for (auto &[snap_id, snap_info] : m_image_ctx->snap_info) { 
+ auto type = cls::rbd::get_snap_namespace_type(snap_info.snap_namespace); + if (type != cls::rbd::SNAPSHOT_NAMESPACE_TYPE_USER) { + // only replicate user snapshots -- trash snapshots will be + // replicated by an implicit delete if required + continue; + } + m_image_state.snapshots[snap_id] = {snap_info.snap_namespace, + snap_info.name, + snap_info.protection_status}; + } + } + + write_image_state(); +} + +template <typename I> +void SetImageStateRequest<I>::write_image_state() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + SetImageStateRequest<I>, + &SetImageStateRequest<I>::handle_write_image_state>(this); + + auto req = WriteImageStateRequest<I>::create(m_image_ctx, m_snap_id, + m_image_state, ctx); + req->send(); +} + +template <typename I> +void SetImageStateRequest<I>::handle_write_image_state(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to write image state: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + update_primary_snapshot(); +} + +template <typename I> +void SetImageStateRequest<I>::update_primary_snapshot() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_image_snapshot_set_copy_progress( + &op, m_snap_id, true, 0); + + auto aio_comp = create_rados_callback< + SetImageStateRequest<I>, + &SetImageStateRequest<I>::handle_update_primary_snapshot>(this); + int r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, + &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template <typename I> +void SetImageStateRequest<I>::handle_update_primary_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to update primary snapshot: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + finish(0); +} + +template <typename I> +void SetImageStateRequest<I>::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::SetImageStateRequest<librbd::ImageCtx>; diff --git a/src/librbd/mirror/snapshot/SetImageStateRequest.h b/src/librbd/mirror/snapshot/SetImageStateRequest.h new file mode 100644 index 000000000..fd2815494 --- /dev/null +++ b/src/librbd/mirror/snapshot/SetImageStateRequest.h @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_SET_IMAGE_STATE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_SET_IMAGE_STATE_REQUEST_H + +#include "librbd/mirror/snapshot/Types.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template <typename ImageCtxT = librbd::ImageCtx> +class SetImageStateRequest { +public: + static SetImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id, + Context *on_finish) { + return new SetImageStateRequest(image_ctx, snap_id, on_finish); + } + + SetImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id, + Context *on_finish) + : m_image_ctx(image_ctx), m_snap_id(snap_id), m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * <start> + * | + * v + * GET_NAME + * | + * v + * GET_SNAP_LIMIT + * | + * v + * GET_METADATA + * | + * v + * WRITE_IMAGE_STATE + * | + * v + * UPDATE_PRIMARY_SNAPSHOT + * | + * v + * <finish> + * + * @endverbatim + */ + + ImageCtxT
*m_image_ctx; + uint64_t m_snap_id; + Context *m_on_finish; + + ImageState m_image_state; + + bufferlist m_bl; + bufferlist m_state_bl; + + void get_name(); + void handle_get_name(int r); + + void get_snap_limit(); + void handle_get_snap_limit(int r); + + void get_metadata(); + void handle_get_metadata(int r); + + void write_image_state(); + void handle_write_image_state(int r); + + void update_primary_snapshot(); + void handle_update_primary_snapshot(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::SetImageStateRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_SET_IMAGE_STATE_REQUEST_H diff --git a/src/librbd/mirror/snapshot/Types.cc b/src/librbd/mirror/snapshot/Types.cc new file mode 100644 index 000000000..866b4c3e2 --- /dev/null +++ b/src/librbd/mirror/snapshot/Types.cc @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/Formatter.h" +#include "include/encoding.h" +#include "include/stringify.h" +#include "librbd/mirror/snapshot/Types.h" + +namespace librbd { +namespace mirror { +namespace snapshot { + +void ImageStateHeader::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(object_count, bl); + ENCODE_FINISH(bl); +} + +void ImageStateHeader::decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(object_count, bl); + DECODE_FINISH(bl); +} + +void SnapState::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(snap_namespace, bl); + encode(name, bl); + encode(protection_status, bl); + ENCODE_FINISH(bl); +} + +void SnapState::decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(snap_namespace, bl); + decode(name, bl); + decode(protection_status, bl); + DECODE_FINISH(bl); +} + +void SnapState::dump(Formatter *f) const { + f->open_object_section("namespace"); + snap_namespace.dump(f); + f->close_section(); + f->dump_string("name", name); + f->dump_unsigned("protection_status", protection_status); +} + +std::ostream& operator<<(std::ostream& os, const SnapState& snap_state) { + os << "[" + << "namespace=" << snap_state.snap_namespace << ", " + << "name=" << snap_state.name << ", " + << "protection=" << static_cast(snap_state.protection_status) + << "]"; + return os; +} + +void ImageState::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(name, bl); + encode(features, bl); + encode(snap_limit, bl); + encode(snapshots, bl); + encode(metadata, bl); + ENCODE_FINISH(bl); +} + +void ImageState::decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(name, bl); + decode(features, bl); + decode(snap_limit, bl); + decode(snapshots, bl); + decode(metadata, bl); + DECODE_FINISH(bl); +} + +void ImageState::dump(Formatter *f) const { + f->dump_string("name", name); + f->dump_unsigned("features", features); + f->dump_unsigned("snap_limit", snap_limit); + f->open_array_section("snapshots"); + for (auto &[id, snap_state] : snapshots) { + f->open_object_section(stringify(id).c_str()); + snap_state.dump(f); + f->close_section(); // snap_state + } + f->close_section(); // snapshots + f->open_object_section("metadata"); + for (auto &it : metadata) { + f->dump_stream(it.first.c_str()) << it.second; + } + f->close_section(); // metadata +} + +std::ostream& operator<<(std::ostream& os, const ImageState& image_state) { + os << "[" + << "name=" << image_state.name << ", " + << "features=" << image_state.features << ", " + 
<< "snap_limit=" << image_state.snap_limit << ", " + << "snaps=" << image_state.snapshots << ", " + << "metadata_count=" << image_state.metadata.size() + << "]"; + return os; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd diff --git a/src/librbd/mirror/snapshot/Types.h b/src/librbd/mirror/snapshot/Types.h new file mode 100644 index 000000000..79947a5f8 --- /dev/null +++ b/src/librbd/mirror/snapshot/Types.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_TYPES_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_TYPES_H + +#include "cls/rbd/cls_rbd_types.h" +#include "include/buffer.h" +#include "include/types.h" + +#include +#include + +namespace librbd { +namespace mirror { +namespace snapshot { + +enum CreatePrimaryFlags { + CREATE_PRIMARY_FLAG_IGNORE_EMPTY_PEERS = (1 << 0), + CREATE_PRIMARY_FLAG_DEMOTED = (1 << 1), + CREATE_PRIMARY_FLAG_FORCE = (1 << 2) +}; + +struct ImageStateHeader { + uint32_t object_count = 0; + + ImageStateHeader() { + } + ImageStateHeader(uint32_t object_count) : object_count(object_count) { + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &it); +}; + +WRITE_CLASS_ENCODER(ImageStateHeader); + +struct SnapState { + cls::rbd::SnapshotNamespace snap_namespace; + std::string name; + uint8_t protection_status = 0; + + SnapState() { + } + SnapState(const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &name, uint8_t protection_status) + : snap_namespace(snap_namespace), name(name), + protection_status(protection_status) { + } + + bool operator==(const SnapState& rhs) const { + return snap_namespace == rhs.snap_namespace && + name == rhs.name && protection_status == rhs.protection_status; + } + + bool operator<(const SnapState& rhs) const { + if (snap_namespace != rhs.snap_namespace) { + return snap_namespace < rhs.snap_namespace; + } + if (name != rhs.name) { + return name < rhs.name; + } + return protection_status < rhs.protection_status; + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &it); + void dump(Formatter *f) const; +}; + +std::ostream& operator<<(std::ostream& os, const SnapState& snap_state); + +WRITE_CLASS_ENCODER(SnapState); + +struct ImageState { + std::string name; + uint64_t features = 0; + uint64_t snap_limit = 0; + std::map snapshots; + std::map metadata; + + ImageState() { + } + ImageState(const std::string &name, uint64_t features, uint64_t snap_limit, + const std::map &snapshots, + const std::map &metadata) + : name(name), features(features), snap_limit(snap_limit), + snapshots(snapshots), metadata(metadata) { + } + + bool operator==(const ImageState& rhs) const { + return name == rhs.name && features == rhs.features && + snap_limit == rhs.snap_limit && snapshots == rhs.snapshots; + } + + bool operator<(const ImageState& rhs) const { + if (name != rhs.name) { + return name < rhs.name; + } + if (features != rhs.features) { + return features < rhs.features; + } + if (snap_limit != rhs.snap_limit) { + return snap_limit < rhs.snap_limit; + } + return snapshots < rhs.snapshots; + } + + void encode(bufferlist &bl) const; + void decode(bufferlist::const_iterator &it); + void dump(Formatter *f) const; +}; + +std::ostream& operator<<(std::ostream& os, const ImageState& image_state); + +WRITE_CLASS_ENCODER(ImageState); + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_TYPES_H diff --git 
a/src/librbd/mirror/snapshot/UnlinkPeerRequest.cc b/src/librbd/mirror/snapshot/UnlinkPeerRequest.cc new file mode 100644 index 000000000..35313f627 --- /dev/null +++ b/src/librbd/mirror/snapshot/UnlinkPeerRequest.cc @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/UnlinkPeerRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::UnlinkPeerRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +using librbd::util::create_context_callback; +using librbd::util::create_rados_callback; + +template +void UnlinkPeerRequest::send() { + if (!m_image_ctx->state->is_refresh_required()) { + unlink_peer(); + return; + } + + refresh_image(); +} + +template +void UnlinkPeerRequest::refresh_image() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + UnlinkPeerRequest, &UnlinkPeerRequest::handle_refresh_image>(this); + m_image_ctx->state->refresh(ctx); +} + +template +void UnlinkPeerRequest::handle_refresh_image(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to refresh image: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + unlink_peer(); +} + +template +void UnlinkPeerRequest::unlink_peer() { + CephContext *cct = m_image_ctx->cct; + + m_image_ctx->image_lock.lock_shared(); + int r = -ENOENT; + cls::rbd::SnapshotNamespace snap_namespace; + std::string snap_name; + bool have_newer_mirror_snapshot = false; + for (auto snap_it = m_image_ctx->snap_info.find(m_snap_id); + snap_it != m_image_ctx->snap_info.end(); ++snap_it) { + if (snap_it->first == m_snap_id) { + r = 0; + snap_namespace = snap_it->second.snap_namespace; + snap_name = snap_it->second.name; + } else if (std::holds_alternative( + snap_it->second.snap_namespace)) { + ldout(cct, 15) << "located newer mirror snapshot" << dendl; + have_newer_mirror_snapshot = true; + break; + } + } + + if (r == -ENOENT) { + ldout(cct, 15) << "missing snapshot: snap_id=" << m_snap_id << dendl; + m_image_ctx->image_lock.unlock_shared(); + finish(r); + return; + } + + auto mirror_ns = std::get_if( + &snap_namespace); + if (mirror_ns == nullptr) { + lderr(cct) << "not mirror snapshot (snap_id=" << m_snap_id << ")" << dendl; + m_image_ctx->image_lock.unlock_shared(); + finish(-EINVAL); + return; + } + + // if there is or will be no more peers in the mirror snapshot and we have + // a more recent mirror snapshot, remove the older one + if ((mirror_ns->mirror_peer_uuids.empty() || + (mirror_ns->mirror_peer_uuids.size() == 1 && + mirror_ns->mirror_peer_uuids.count(m_mirror_peer_uuid) != 0)) && + have_newer_mirror_snapshot) { + if (m_allow_remove) { + m_image_ctx->image_lock.unlock_shared(); + remove_snapshot(snap_namespace, snap_name); + return; + } else { + ldout(cct, 15) << "skipping removal of snapshot: snap_id=" << m_snap_id + << ", mirror_peer_uuid=" << m_mirror_peer_uuid + << ", mirror_peer_uuids=" << mirror_ns->mirror_peer_uuids + << dendl; + } + } + + if (mirror_ns->mirror_peer_uuids.count(m_mirror_peer_uuid) == 0) { + ldout(cct, 15) << "no peer to unlink: snap_id=" << 
m_snap_id + << ", mirror_peer_uuid=" << m_mirror_peer_uuid + << ", mirror_peer_uuids=" << mirror_ns->mirror_peer_uuids + << dendl; + m_image_ctx->image_lock.unlock_shared(); + finish(0); + return; + } + + m_image_ctx->image_lock.unlock_shared(); + + ldout(cct, 15) << "snap_id=" << m_snap_id << ", " + << "mirror_peer_uuid=" << m_mirror_peer_uuid << dendl; + librados::ObjectWriteOperation op; + librbd::cls_client::mirror_image_snapshot_unlink_peer(&op, m_snap_id, + m_mirror_peer_uuid); + auto aio_comp = create_rados_callback< + UnlinkPeerRequest, &UnlinkPeerRequest::handle_unlink_peer>(this); + r = m_image_ctx->md_ctx.aio_operate(m_image_ctx->header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void UnlinkPeerRequest::handle_unlink_peer(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r == -ERESTART || r == -ENOENT) { + if (r == -ERESTART) { + ldout(cct, 15) << "unlinking last peer not supported" << dendl; + m_allow_remove = true; + } + refresh_image(); + return; + } + + if (r < 0) { + lderr(cct) << "failed to unlink peer: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + notify_update(); +} + +template +void UnlinkPeerRequest::notify_update() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + UnlinkPeerRequest, &UnlinkPeerRequest::handle_notify_update>(this); + m_image_ctx->notify_update(ctx); +} + +template +void UnlinkPeerRequest::handle_notify_update(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r == -ENOENT || r == -ETIMEDOUT) { + // non-fatel errors + lderr(cct) << "failed to notify update: " << cpp_strerror(r) << dendl; + } else if (r < 0) { + lderr(cct) << "failed to notify update: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + refresh_image(); +} + +template +void UnlinkPeerRequest::remove_snapshot( + const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << dendl; + + auto ctx = create_context_callback< + UnlinkPeerRequest, &UnlinkPeerRequest::handle_remove_snapshot>(this); + m_image_ctx->operations->snap_remove(snap_namespace, snap_name, ctx); +} + +template +void UnlinkPeerRequest::handle_remove_snapshot(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to remove snapshot: " << cpp_strerror(r) << dendl; + finish(r); + return; + } + + finish(0); +} + +template +void UnlinkPeerRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + auto on_finish = m_on_finish; + delete this; + on_finish->complete(r); +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::UnlinkPeerRequest; diff --git a/src/librbd/mirror/snapshot/UnlinkPeerRequest.h b/src/librbd/mirror/snapshot/UnlinkPeerRequest.h new file mode 100644 index 000000000..192b40d6e --- /dev/null +++ b/src/librbd/mirror/snapshot/UnlinkPeerRequest.h @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_UNLINK_PEER_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_UNLINK_PEER_REQUEST_H + +#include "include/buffer.h" +#include "cls/rbd/cls_rbd_client.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; 
+ +namespace mirror { +namespace snapshot { + +template +class UnlinkPeerRequest { +public: + static UnlinkPeerRequest *create(ImageCtxT *image_ctx, uint64_t snap_id, + const std::string &mirror_peer_uuid, + bool allow_remove, Context *on_finish) { + return new UnlinkPeerRequest(image_ctx, snap_id, mirror_peer_uuid, + allow_remove, on_finish); + } + + UnlinkPeerRequest(ImageCtxT *image_ctx, uint64_t snap_id, + const std::string &mirror_peer_uuid, bool allow_remove, + Context *on_finish) + : m_image_ctx(image_ctx), m_snap_id(snap_id), + m_mirror_peer_uuid(mirror_peer_uuid), m_allow_remove(allow_remove), + m_on_finish(on_finish) { + } + + void send(); + +private: + /* + * @verbatim + * + * + * | + * v + * REFRESH_IMAGE <--------------------------\ + * | ^ (not found | + * | * or last) | + * | * | + * |\---------------> UNLINK_PEER --> NOTIFY_UPDATE + * | (not last peer or + * | no newer mirror + * | snap exists) + * | + * |\---------------> REMOVE_SNAPSHOT + * | (last peer and | + * | newer mirror | + * | snap exists) | + * | | + * |(peer not found) | + * v | + * <---------------/ + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + uint64_t m_snap_id; + std::string m_mirror_peer_uuid; + bool m_allow_remove; + Context *m_on_finish; + + void refresh_image(); + void handle_refresh_image(int r); + + void unlink_peer(); + void handle_unlink_peer(int r); + + void notify_update(); + void handle_notify_update(int r); + + void remove_snapshot(const cls::rbd::SnapshotNamespace& snap_namespace, + const std::string& snap_name); + void handle_remove_snapshot(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::UnlinkPeerRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_UNLINK_PEER_REQUEST_H diff --git a/src/librbd/mirror/snapshot/Utils.cc b/src/librbd/mirror/snapshot/Utils.cc new file mode 100644 index 000000000..36d1558be --- /dev/null +++ b/src/librbd/mirror/snapshot/Utils.cc @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/dout.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "librbd/ImageCtx.h" +#include "librbd/mirror/snapshot/Utils.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::util: " \ + << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { +namespace util { + +namespace { + +const std::string IMAGE_STATE_OBJECT_PREFIX = "rbd_mirror_snapshot."; + +bool get_rollback_snap_id( + std::map::reverse_iterator it, + std::map::reverse_iterator end, + uint64_t *rollback_snap_id) { + + for (; it != end; it++) { + auto mirror_ns = std::get( + it->second.snap_namespace); + if (mirror_ns.state != cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY) { + break; + } + if (mirror_ns.complete) { + break; + } + } + + if (it != end) { + *rollback_snap_id = it->first; + return true; + } + + return false; +} + +} // anonymous namespace + +std::string get_image_meta_key(const std::string& mirror_uuid) { + return ".rbd_mirror." 
+ mirror_uuid; +} + +template +bool can_create_primary_snapshot(I *image_ctx, bool demoted, bool force, + bool* requires_orphan, + uint64_t *rollback_snap_id) { + CephContext *cct = image_ctx->cct; + + if (requires_orphan != nullptr) { + *requires_orphan = false; + } + if (rollback_snap_id) { + *rollback_snap_id = CEPH_NOSNAP; + } + + std::shared_lock image_locker{image_ctx->image_lock}; + + for (auto it = image_ctx->snap_info.rbegin(); + it != image_ctx->snap_info.rend(); it++) { + auto mirror_ns = std::get_if( + &it->second.snap_namespace); + if (mirror_ns == nullptr) { + continue; + } + ldout(cct, 20) << "previous snapshot snap_id=" << it->first << " " + << *mirror_ns << dendl; + if (mirror_ns->is_demoted() && !force) { + lderr(cct) << "trying to create primary snapshot without force " + << "when previous primary snapshot is demoted" + << dendl; + return false; + } + + if (mirror_ns->state == cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY) { + if (!force) { + lderr(cct) << "trying to create primary snapshot without force " + << "when previous snapshot is non-primary" + << dendl; + return false; + } + if (demoted) { + lderr(cct) << "trying to create primary demoted snapshot " + << "when previous snapshot is non-primary" + << dendl; + return false; + } + + if (requires_orphan != nullptr) { + *requires_orphan = !mirror_ns->is_demoted(); + } + if (!mirror_ns->complete) { + ldout(cct, 20) << "needs rollback" << dendl; + if (!rollback_snap_id) { + lderr(cct) << "trying to create primary snapshot " + << "when previous non-primary snapshot is not copied yet" + << dendl; + return false; + } + if (!get_rollback_snap_id(++it, image_ctx->snap_info.rend(), + rollback_snap_id)) { + lderr(cct) << "cannot rollback" << dendl; + return false; + } + ldout(cct, 20) << "rollback_snap_id=" << *rollback_snap_id << dendl; + } + return true; + } + + return true; + } + + ldout(cct, 20) << "no previous mirror snapshots found" << dendl; + return true; +} + +template +bool can_create_non_primary_snapshot(I *image_ctx) { + CephContext *cct = image_ctx->cct; + + std::shared_lock image_locker{image_ctx->image_lock}; + + for (auto it = image_ctx->snap_info.rbegin(); + it != image_ctx->snap_info.rend(); it++) { + auto mirror_ns = std::get_if( + &it->second.snap_namespace); + if (mirror_ns != nullptr) { + ldout(cct, 20) << "previous mirror snapshot snap_id=" << it->first << " " + << *mirror_ns << dendl; + + if (mirror_ns->state == cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY) { + if (!mirror_ns->complete) { + lderr(cct) << "trying to create non-primary snapshot " + << "when previous non-primary snapshot is not copied yet" + << dendl; + return false; + } + return true; + } + + if (mirror_ns->state == cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY) { + lderr(cct) << "trying to create non-primary snapshot " + << "when previous primary snapshot is not in demoted state" + << dendl; + return false; + } + return true; + } + } + + ldout(cct, 20) << "no previous mirror snapshots found" << dendl; + return true; +} + +template +std::string image_state_object_name(I *image_ctx, uint64_t snap_id, + uint64_t index) { + return IMAGE_STATE_OBJECT_PREFIX + image_ctx->id + "." + + stringify(snap_id) + "." 
+ stringify(index); +} + +} // namespace util +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template bool librbd::mirror::snapshot::util::can_create_primary_snapshot( + librbd::ImageCtx *image_ctx, bool demoted, bool force, + bool* requires_orphan, uint64_t *rollback_snap_id); + +template bool librbd::mirror::snapshot::util::can_create_non_primary_snapshot( + librbd::ImageCtx *image_ctx); + +template std::string librbd::mirror::snapshot::util::image_state_object_name( + librbd::ImageCtx *image_ctx, uint64_t snap_id, uint64_t index); diff --git a/src/librbd/mirror/snapshot/Utils.h b/src/librbd/mirror/snapshot/Utils.h new file mode 100644 index 000000000..127ec5865 --- /dev/null +++ b/src/librbd/mirror/snapshot/Utils.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_UTILS_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_UTILS_H + +#include "include/int_types.h" +#include "include/stringify.h" +#include + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { +namespace util { + +std::string get_image_meta_key(const std::string& mirror_uuid); + +template +bool can_create_primary_snapshot(ImageCtxT *image_ctx, bool demoted, bool force, + bool* requires_orphan, + uint64_t *rollback_snap_id); + +template +bool can_create_non_primary_snapshot(ImageCtxT *image_ctx); + +template +std::string image_state_object_name(ImageCtxT *image_ctx, uint64_t snap_id, + uint64_t index); + +} // namespace util +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_UTILS_H diff --git a/src/librbd/mirror/snapshot/WriteImageStateRequest.cc b/src/librbd/mirror/snapshot/WriteImageStateRequest.cc new file mode 100644 index 000000000..c79dd7e2c --- /dev/null +++ b/src/librbd/mirror/snapshot/WriteImageStateRequest.cc @@ -0,0 +1,120 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/mirror/snapshot/WriteImageStateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/mirror/snapshot/Utils.h" + +#define dout_subsys ceph_subsys_rbd + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::mirror::snapshot::WriteImageStateRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace mirror { +namespace snapshot { + +namespace { + +static size_t header_length() { + bufferlist bl; + ImageStateHeader header; + + using ceph::encode; + encode(header, bl); + + return bl.length(); +} + +} +using librbd::util::create_rados_callback; + +template +WriteImageStateRequest::WriteImageStateRequest(I *image_ctx, + uint64_t snap_id, + const ImageState &image_state, + Context *on_finish) + : m_image_ctx(image_ctx), m_snap_id(snap_id), m_image_state(image_state), + m_on_finish(on_finish), m_object_size( + 1 << image_ctx->config.template get_val("rbd_default_order")) { + bufferlist bl; + encode(m_image_state, bl); + + m_object_count = 1 + (header_length() + bl.length()) / m_object_size; + ImageStateHeader header(m_object_count); + + encode(header, m_bl); + m_bl.claim_append(bl); +} + +template +void WriteImageStateRequest::send() { + write_object(); +} + +template +void WriteImageStateRequest::write_object() { + CephContext *cct = m_image_ctx->cct; + ceph_assert(m_object_count > 0); + + m_object_count--; + + auto oid = 
util::image_state_object_name(m_image_ctx, m_snap_id, + m_object_count); + ldout(cct, 15) << oid << dendl; + + size_t off = m_object_count * m_object_size; + size_t len = std::min(m_bl.length() - off, m_object_size); + bufferlist bl; + bl.substr_of(m_bl, off, len); + + librados::ObjectWriteOperation op; + op.write_full(bl); + + librados::AioCompletion *comp = create_rados_callback< + WriteImageStateRequest, + &WriteImageStateRequest::handle_write_object>(this); + int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void WriteImageStateRequest::handle_write_object(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to write object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + if (m_object_count == 0) { + finish(0); + return; + } + + write_object(); +} + +template +void WriteImageStateRequest::finish(int r) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 15) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +template class librbd::mirror::snapshot::WriteImageStateRequest; diff --git a/src/librbd/mirror/snapshot/WriteImageStateRequest.h b/src/librbd/mirror/snapshot/WriteImageStateRequest.h new file mode 100644 index 000000000..d2c4a7f80 --- /dev/null +++ b/src/librbd/mirror/snapshot/WriteImageStateRequest.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_MIRROR_SNAPSHOT_WRITE_IMAGE_STATE_REQUEST_H +#define CEPH_LIBRBD_MIRROR_SNAPSHOT_WRITE_IMAGE_STATE_REQUEST_H + +#include "librbd/mirror/snapshot/Types.h" + +#include +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace mirror { +namespace snapshot { + +template +class WriteImageStateRequest { +public: + static WriteImageStateRequest *create(ImageCtxT *image_ctx, uint64_t snap_id, + const ImageState &image_state, + Context *on_finish) { + return new WriteImageStateRequest(image_ctx, snap_id, image_state, + on_finish); + } + + WriteImageStateRequest(ImageCtxT *image_ctx, uint64_t snap_id, + const ImageState &image_state, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * + * | + * v + * WRITE_OBJECT (repeat for + * | every object) + * v + * + * + * @endverbatim + */ + + ImageCtxT *m_image_ctx; + uint64_t m_snap_id; + ImageState m_image_state; + Context *m_on_finish; + + bufferlist m_bl; + + const size_t m_object_size; + size_t m_object_count = 0; + + void write_object(); + void handle_write_object(int r); + + void finish(int r); +}; + +} // namespace snapshot +} // namespace mirror +} // namespace librbd + +extern template class librbd::mirror::snapshot::WriteImageStateRequest; + +#endif // CEPH_LIBRBD_MIRROR_SNAPSHOT_WRITE_IMAGE_STATE_REQUEST_H diff --git a/src/librbd/mirroring_watcher/Types.cc b/src/librbd/mirroring_watcher/Types.cc new file mode 100644 index 000000000..3226b6352 --- /dev/null +++ b/src/librbd/mirroring_watcher/Types.cc @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/Formatter.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "librbd/mirroring_watcher/Types.h" +#include "librbd/watcher/Utils.h" + +namespace librbd { +namespace mirroring_watcher { + +namespace { + +class DumpPayloadVisitor : public 
boost::static_visitor { +public: + explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {} + + template + inline void operator()(const Payload &payload) const { + NotifyOp notify_op = Payload::NOTIFY_OP; + m_formatter->dump_string("notify_op", stringify(notify_op)); + payload.dump(m_formatter); + } + +private: + ceph::Formatter *m_formatter; +}; + +} // anonymous namespace + +void ModeUpdatedPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(static_cast(mirror_mode), bl); +} + +void ModeUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + uint32_t mirror_mode_decode; + decode(mirror_mode_decode, iter); + mirror_mode = static_cast(mirror_mode_decode); +} + +void ModeUpdatedPayload::dump(Formatter *f) const { + f->dump_stream("mirror_mode") << mirror_mode; +} + +void ImageUpdatedPayload::encode(bufferlist &bl) const { + using ceph::encode; + encode(static_cast(mirror_image_state), bl); + encode(image_id, bl); + encode(global_image_id, bl); +} + +void ImageUpdatedPayload::decode(__u8 version, bufferlist::const_iterator &iter) { + using ceph::decode; + uint32_t mirror_image_state_decode; + decode(mirror_image_state_decode, iter); + mirror_image_state = static_cast( + mirror_image_state_decode); + decode(image_id, iter); + decode(global_image_id, iter); +} + +void ImageUpdatedPayload::dump(Formatter *f) const { + f->dump_stream("mirror_image_state") << mirror_image_state; + f->dump_string("image_id", image_id); + f->dump_string("global_image_id", global_image_id); +} + +void UnknownPayload::encode(bufferlist &bl) const { + ceph_abort(); +} + +void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) { +} + +void UnknownPayload::dump(Formatter *f) const { +} + +void NotifyMessage::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload); + ENCODE_FINISH(bl); +} + +void NotifyMessage::decode(bufferlist::const_iterator& iter) { + DECODE_START(1, iter); + + uint32_t notify_op; + decode(notify_op, iter); + + // select the correct payload variant based upon the encoded op + switch (notify_op) { + case NOTIFY_OP_MODE_UPDATED: + payload = ModeUpdatedPayload(); + break; + case NOTIFY_OP_IMAGE_UPDATED: + payload = ImageUpdatedPayload(); + break; + default: + payload = UnknownPayload(); + break; + } + + apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload); + DECODE_FINISH(iter); +} + +void NotifyMessage::dump(Formatter *f) const { + apply_visitor(DumpPayloadVisitor(f), payload); +} + +void NotifyMessage::generate_test_instances(std::list &o) { + o.push_back(new NotifyMessage(ModeUpdatedPayload(cls::rbd::MIRROR_MODE_DISABLED))); + o.push_back(new NotifyMessage(ImageUpdatedPayload(cls::rbd::MIRROR_IMAGE_STATE_DISABLING, + "image id", "global image id"))); +} + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op) { + switch (op) { + case NOTIFY_OP_MODE_UPDATED: + out << "ModeUpdated"; + break; + case NOTIFY_OP_IMAGE_UPDATED: + out << "ImageUpdated"; + break; + default: + out << "Unknown (" << static_cast(op) << ")"; + break; + } + return out; +} + +} // namespace mirroring_watcher +} // namespace librbd diff --git a/src/librbd/mirroring_watcher/Types.h b/src/librbd/mirroring_watcher/Types.h new file mode 100644 index 000000000..1e096a9d3 --- /dev/null +++ b/src/librbd/mirroring_watcher/Types.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab + +#ifndef CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H +#define CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H + +#include "include/int_types.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "cls/rbd/cls_rbd_types.h" +#include +#include +#include +#include + +namespace ceph { class Formatter; } + +namespace librbd { +namespace mirroring_watcher { + +enum NotifyOp { + NOTIFY_OP_MODE_UPDATED = 0, + NOTIFY_OP_IMAGE_UPDATED = 1 +}; + +struct ModeUpdatedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_MODE_UPDATED; + + cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + + ModeUpdatedPayload() { + } + ModeUpdatedPayload(cls::rbd::MirrorMode mirror_mode) + : mirror_mode(mirror_mode) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct ImageUpdatedPayload { + static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_UPDATED; + + cls::rbd::MirrorImageState mirror_image_state = + cls::rbd::MIRROR_IMAGE_STATE_ENABLED; + std::string image_id; + std::string global_image_id; + + ImageUpdatedPayload() { + } + ImageUpdatedPayload(cls::rbd::MirrorImageState mirror_image_state, + const std::string &image_id, + const std::string &global_image_id) + : mirror_image_state(mirror_image_state), image_id(image_id), + global_image_id(global_image_id) { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +struct UnknownPayload { + static const NotifyOp NOTIFY_OP = static_cast(-1); + + UnknownPayload() { + } + + void encode(bufferlist &bl) const; + void decode(__u8 version, bufferlist::const_iterator &iter); + void dump(Formatter *f) const; +}; + +typedef boost::variant Payload; + +struct NotifyMessage { + NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) { + } + + Payload payload; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& it); + void dump(Formatter *f) const; + + static void generate_test_instances(std::list &o); +}; + +WRITE_CLASS_ENCODER(NotifyMessage); + +std::ostream &operator<<(std::ostream &out, const NotifyOp &op); + +} // namespace mirroring_watcher +} // namespace librbd + +using librbd::mirroring_watcher::encode; +using librbd::mirroring_watcher::decode; + +#endif // CEPH_LIBRBD_MIRRORING_WATCHER_TYPES_H diff --git a/src/librbd/object_map/CreateRequest.cc b/src/librbd/object_map/CreateRequest.cc new file mode 100644 index 000000000..d26f929fa --- /dev/null +++ b/src/librbd/object_map/CreateRequest.cc @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/CreateRequest.h" +#include "include/ceph_assert.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::CreateRequest: " + +namespace librbd { +namespace object_map { + +using util::create_context_callback; +using util::create_rados_callback; + +template +CreateRequest::CreateRequest(I *image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { +} + +template +void CreateRequest::send() { + CephContext *cct = m_image_ctx->cct; + + uint64_t max_size = m_image_ctx->size; + + { + 
std::unique_lock image_locker{m_image_ctx->image_lock}; + m_snap_ids.push_back(CEPH_NOSNAP); + for (auto it : m_image_ctx->snap_info) { + max_size = std::max(max_size, it.second.size); + m_snap_ids.push_back(it.first); + } + + if (ObjectMap<>::is_compatible(m_image_ctx->layout, max_size)) { + send_object_map_resize(); + return; + } + } + + lderr(cct) << "image size not compatible with object map" << dendl; + m_on_finish->complete(-EINVAL); +} + +template +void CreateRequest::send_object_map_resize() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + Context *ctx = create_context_callback< + CreateRequest, &CreateRequest::handle_object_map_resize>(this); + C_Gather *gather_ctx = new C_Gather(cct, ctx); + + for (auto snap_id : m_snap_ids) { + librados::ObjectWriteOperation op; + uint64_t snap_size = m_image_ctx->get_image_size(snap_id); + + cls_client::object_map_resize(&op, Striper::get_num_objects( + m_image_ctx->layout, snap_size), + OBJECT_NONEXISTENT); + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id)); + librados::AioCompletion *comp = create_rados_callback(gather_ctx->new_sub()); + int r = m_image_ctx->md_ctx.aio_operate(oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + gather_ctx->activate(); +} + +template +Context *CreateRequest::handle_object_map_resize(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "object map resize failed: " << cpp_strerror(*result) + << dendl; + } + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::CreateRequest; diff --git a/src/librbd/object_map/CreateRequest.h b/src/librbd/object_map/CreateRequest.h new file mode 100644 index 000000000..33984cda1 --- /dev/null +++ b/src/librbd/object_map/CreateRequest.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H + +#include "include/buffer.h" +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class CreateRequest { +public: + static CreateRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return new CreateRequest(image_ctx, on_finish); + } + + void send(); + +private: + /** + * @verbatim + * + * + * | . . . + * v v . + * OBJECT_MAP_RESIZE . (for every snapshot) + * | . . + * v . . . 
+ * + * + * @endverbatim + */ + + CreateRequest(ImageCtxT *image_ctx, Context *on_finish); + + ImageCtxT *m_image_ctx; + Context *m_on_finish; + + std::vector m_snap_ids; + + void send_object_map_resize(); + Context *handle_object_map_resize(int *result); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::CreateRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_CREATE_REQUEST_H diff --git a/src/librbd/object_map/DiffRequest.cc b/src/librbd/object_map/DiffRequest.cc new file mode 100644 index 000000000..606d48bbf --- /dev/null +++ b/src/librbd/object_map/DiffRequest.cc @@ -0,0 +1,258 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/DiffRequest.h" +#include "common/debug.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "osdc/Striper.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::DiffRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template +void DiffRequest::send() { + auto cct = m_image_ctx->cct; + + if (m_snap_id_start == CEPH_NOSNAP || m_snap_id_start > m_snap_id_end) { + lderr(cct) << "invalid start/end snap ids: " + << "snap_id_start=" << m_snap_id_start << ", " + << "snap_id_end=" << m_snap_id_end << dendl; + finish(-EINVAL); + return; + } else if (m_snap_id_start == m_snap_id_end) { + // no delta between the same snapshot + finish(0); + return; + } + + m_object_diff_state->clear(); + + // collect all the snap ids in the provided range (inclusive) + if (m_snap_id_start != 0) { + m_snap_ids.insert(m_snap_id_start); + } + + std::shared_lock image_locker{m_image_ctx->image_lock}; + auto snap_info_it = m_image_ctx->snap_info.upper_bound(m_snap_id_start); + auto snap_info_it_end = m_image_ctx->snap_info.lower_bound(m_snap_id_end); + for (; snap_info_it != snap_info_it_end; ++snap_info_it) { + m_snap_ids.insert(snap_info_it->first); + } + m_snap_ids.insert(m_snap_id_end); + + load_object_map(&image_locker); +} + +template +void DiffRequest::load_object_map( + std::shared_lock* image_locker) { + ceph_assert(ceph_mutex_is_locked(m_image_ctx->image_lock)); + + if (m_snap_ids.empty()) { + image_locker->unlock(); + + finish(0); + return; + } + + m_current_snap_id = *m_snap_ids.begin(); + m_snap_ids.erase(m_current_snap_id); + + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "snap_id=" << m_current_snap_id << dendl; + + if ((m_image_ctx->features & RBD_FEATURE_FAST_DIFF) == 0) { + image_locker->unlock(); + + ldout(cct, 10) << "fast-diff feature not enabled" << dendl; + finish(-EINVAL); + return; + } + + // ignore ENOENT with intermediate snapshots since deleted + // snaps will get merged with later snapshots + m_ignore_enoent = (m_current_snap_id != m_snap_id_start && + m_current_snap_id != m_snap_id_end); + + if (m_current_snap_id == CEPH_NOSNAP) { + m_current_size = m_image_ctx->size; + } else { + auto snap_it = m_image_ctx->snap_info.find(m_current_snap_id); + if (snap_it == m_image_ctx->snap_info.end()) { + ldout(cct, 10) << "snapshot " << m_current_snap_id << " does not exist" + << dendl; + if (!m_ignore_enoent) { + image_locker->unlock(); + + finish(-ENOENT); + return; + } + + load_object_map(image_locker); + return; + } + + m_current_size = snap_it->second.size; + } + + uint64_t flags = 0; + int r = 
m_image_ctx->get_flags(m_current_snap_id, &flags); + if (r < 0) { + image_locker->unlock(); + + lderr(cct) << "failed to retrieve image flags: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + image_locker->unlock(); + + if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) { + ldout(cct, 1) << "cannot perform fast diff on invalid object map" + << dendl; + finish(-EINVAL); + return; + } + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, + m_current_snap_id)); + + librados::ObjectReadOperation op; + cls_client::object_map_load_start(&op); + + m_out_bl.clear(); + auto aio_comp = create_rados_callback< + DiffRequest, &DiffRequest::handle_load_object_map>(this); + r = m_image_ctx->md_ctx.aio_operate(oid, aio_comp, &op, &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void DiffRequest::handle_load_object_map(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r == 0) { + auto bl_it = m_out_bl.cbegin(); + r = cls_client::object_map_load_finish(&bl_it, &m_object_map); + } + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, + m_current_snap_id)); + if (r == -ENOENT && m_ignore_enoent) { + ldout(cct, 10) << "object map " << oid << " does not exist" << dendl; + + std::shared_lock image_locker{m_image_ctx->image_lock}; + load_object_map(&image_locker); + return; + } else if (r < 0) { + lderr(cct) << "failed to load object map: " << oid << dendl; + finish(r); + return; + } + ldout(cct, 20) << "loaded object map " << oid << dendl; + + uint64_t num_objs = Striper::get_num_objects(m_image_ctx->layout, + m_current_size); + if (m_object_map.size() < num_objs) { + ldout(cct, 1) << "object map too small: " + << m_object_map.size() << " < " << num_objs << dendl; + finish(-EINVAL); + return; + } else { + m_object_map.resize(num_objs); + } + + uint64_t prev_object_diff_state_size = m_object_diff_state->size(); + if (prev_object_diff_state_size < num_objs) { + // the diff state should be the largest of all snapshots in the set + m_object_diff_state->resize(num_objs); + } + if (m_object_map.size() < m_object_diff_state->size()) { + // the image was shrunk so expanding the object map will flag end objects + // as non-existent and they will be compared against the previous object + // diff state + m_object_map.resize(m_object_diff_state->size()); + } + + uint64_t overlap = std::min(m_object_map.size(), prev_object_diff_state_size); + auto it = m_object_map.begin(); + auto overlap_end_it = it + overlap; + auto diff_it = m_object_diff_state->begin(); + uint64_t i = 0; + for (; it != overlap_end_it; ++it, ++diff_it, ++i) { + uint8_t object_map_state = *it; + uint8_t prev_object_diff_state = *diff_it; + if (object_map_state == OBJECT_EXISTS || + object_map_state == OBJECT_PENDING || + (object_map_state == OBJECT_EXISTS_CLEAN && + prev_object_diff_state != DIFF_STATE_DATA && + prev_object_diff_state != DIFF_STATE_DATA_UPDATED)) { + *diff_it = DIFF_STATE_DATA_UPDATED; + } else if (object_map_state == OBJECT_NONEXISTENT && + prev_object_diff_state != DIFF_STATE_HOLE && + prev_object_diff_state != DIFF_STATE_HOLE_UPDATED) { + *diff_it = DIFF_STATE_HOLE_UPDATED; + } + + ldout(cct, 20) << "object state: " << i << " " + << static_cast(prev_object_diff_state) + << "->" << static_cast(*diff_it) << " (" + << static_cast(object_map_state) << ")" + << dendl; + } + ldout(cct, 20) << "computed overlap diffs" << dendl; + + bool diff_from_start = (m_snap_id_start == 0); + auto end_it = m_object_map.end(); + if (m_object_map.size() > 
prev_object_diff_state_size) { + for (; it != end_it; ++it,++diff_it, ++i) { + uint8_t object_map_state = *it; + if (object_map_state == OBJECT_NONEXISTENT) { + *diff_it = DIFF_STATE_HOLE; + } else if (diff_from_start || + (m_object_diff_state_valid && + object_map_state != OBJECT_EXISTS_CLEAN)) { + *diff_it = DIFF_STATE_DATA_UPDATED; + } else { + *diff_it = DIFF_STATE_DATA; + } + + ldout(cct, 20) << "object state: " << i << " " + << "->" << static_cast(*diff_it) << " (" + << static_cast(*it) << ")" << dendl; + } + } + ldout(cct, 20) << "computed resize diffs" << dendl; + + m_object_diff_state_valid = true; + + std::shared_lock image_locker{m_image_ctx->image_lock}; + load_object_map(&image_locker); +} + +template +void DiffRequest::finish(int r) { + auto cct = m_image_ctx->cct; + ldout(cct, 10) << "r=" << r << dendl; + + m_on_finish->complete(r); + delete this; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::DiffRequest; diff --git a/src/librbd/object_map/DiffRequest.h b/src/librbd/object_map/DiffRequest.h new file mode 100644 index 000000000..e83a1629e --- /dev/null +++ b/src/librbd/object_map/DiffRequest.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_DIFF_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_DIFF_REQUEST_H + +#include "include/int_types.h" +#include "common/bit_vector.hpp" +#include "common/ceph_mutex.h" +#include "librbd/object_map/Types.h" +#include + +struct Context; + +namespace librbd { + +struct ImageCtx; + +namespace object_map { + +template +class DiffRequest { +public: + static DiffRequest* create(ImageCtxT* image_ctx, uint64_t snap_id_start, + uint64_t snap_id_end, + BitVector<2>* object_diff_state, + Context* on_finish) { + return new DiffRequest(image_ctx, snap_id_start, snap_id_end, + object_diff_state, on_finish); + } + + DiffRequest(ImageCtxT* image_ctx, uint64_t snap_id_start, + uint64_t snap_id_end, BitVector<2>* object_diff_state, + Context* on_finish) + : m_image_ctx(image_ctx), m_snap_id_start(snap_id_start), + m_snap_id_end(snap_id_end), m_object_diff_state(object_diff_state), + m_on_finish(on_finish) { + } + + void send(); + +private: + /** + * @verbatim + * + * + * | + * | /---------\ + * | | | + * v v | + * LOAD_OBJECT_MAP ---/ + * | + * v + * + * + * @endverbatim + */ + ImageCtxT* m_image_ctx; + uint64_t m_snap_id_start; + uint64_t m_snap_id_end; + BitVector<2>* m_object_diff_state; + Context* m_on_finish; + + std::set m_snap_ids; + uint64_t m_current_snap_id = 0; + bool m_ignore_enoent = false; + + uint64_t m_current_size = 0; + + BitVector<2> m_object_map; + bool m_object_diff_state_valid = false; + + bufferlist m_out_bl; + + void load_object_map(std::shared_lock* image_locker); + void handle_load_object_map(int r); + + void finish(int r); + +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::DiffRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_DIFF_REQUEST_H diff --git a/src/librbd/object_map/InvalidateRequest.cc b/src/librbd/object_map/InvalidateRequest.cc new file mode 100644 index 000000000..bf2db9660 --- /dev/null +++ b/src/librbd/object_map/InvalidateRequest.cc @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/InvalidateRequest.h" +#include "common/dout.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd 
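The overlap loop in DiffRequest<I>::handle_load_object_map() above folds one snapshot's object map into the diff state accumulated from earlier snapshots in the range. The per-object rule can be restated as a pure function, sketched below for clarity; the constant values are illustrative stand-ins for the real definitions in include/rbd/object_map_types.h and librbd/object_map/Types.h:

// Editorial sketch -- not part of this patch. Condenses the per-object
// transition applied for objects that overlap the previously
// accumulated diff state.
#include <cstdint>

namespace sketch {

constexpr uint8_t OBJECT_NONEXISTENT  = 0;  // values as in object_map_types.h
constexpr uint8_t OBJECT_EXISTS       = 1;
constexpr uint8_t OBJECT_PENDING      = 2;
constexpr uint8_t OBJECT_EXISTS_CLEAN = 3;

enum DiffState : uint8_t {  // ordering assumed here for illustration only
  DIFF_STATE_HOLE, DIFF_STATE_HOLE_UPDATED,
  DIFF_STATE_DATA, DIFF_STATE_DATA_UPDATED
};

// Returns the updated diff state for one object, given its object map
// state in the snapshot being folded in and the state accumulated from
// earlier snapshots in the requested range.
uint8_t next_diff_state(uint8_t object_map_state, uint8_t prev) {
  if (object_map_state == OBJECT_EXISTS ||
      object_map_state == OBJECT_PENDING ||
      (object_map_state == OBJECT_EXISTS_CLEAN &&
       prev != DIFF_STATE_DATA && prev != DIFF_STATE_DATA_UPDATED)) {
    return DIFF_STATE_DATA_UPDATED;  // object gained or changed data
  }
  if (object_map_state == OBJECT_NONEXISTENT &&
      prev != DIFF_STATE_HOLE && prev != DIFF_STATE_HOLE_UPDATED) {
    return DIFF_STATE_HOLE_UPDATED;  // object was removed in this snapshot
  }
  return prev;  // clean object: carry the accumulated state forward
}

} // namespace sketch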
+#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::InvalidateRequest: " + +namespace librbd { +namespace object_map { + +template +InvalidateRequest* InvalidateRequest::create(I &image_ctx, + uint64_t snap_id, bool force, + Context *on_finish) { + return new InvalidateRequest(image_ctx, snap_id, force, on_finish); +} + +template +void InvalidateRequest::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_wlocked(image_ctx.image_lock)); + + uint64_t snap_flags; + int r = image_ctx.get_flags(m_snap_id, &snap_flags); + if (r < 0 || ((snap_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0)) { + this->async_complete(r); + return; + } + + CephContext *cct = image_ctx.cct; + lderr(cct) << this << " invalidating object map in-memory" << dendl; + + // update in-memory flags + uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID; + if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + flags |= RBD_FLAG_FAST_DIFF_INVALID; + } + + r = image_ctx.update_flags(m_snap_id, flags, true); + if (r < 0) { + this->async_complete(r); + return; + } + + // do not update on-disk flags if not image owner + if (image_ctx.image_watcher == nullptr || + (!m_force && m_snap_id == CEPH_NOSNAP && + image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner())) { + this->async_complete(-EROFS); + return; + } + + lderr(cct) << this << " invalidating object map on-disk" << dendl; + librados::ObjectWriteOperation op; + cls_client::set_flags(&op, m_snap_id, flags, flags); + + librados::AioCompletion *rados_completion = + this->create_callback_completion(); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion, + &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +bool InvalidateRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + lderr(cct) << this << " " << __func__ << ": r=" << r << dendl; + return true; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::InvalidateRequest; diff --git a/src/librbd/object_map/InvalidateRequest.h b/src/librbd/object_map/InvalidateRequest.h new file mode 100644 index 000000000..ce15bb2d3 --- /dev/null +++ b/src/librbd/object_map/InvalidateRequest.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/AsyncRequest.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class InvalidateRequest : public AsyncRequest { +public: + static InvalidateRequest* create(ImageCtxT &image_ctx, uint64_t snap_id, + bool force, Context *on_finish); + + InvalidateRequest(ImageCtxT &image_ctx, uint64_t snap_id, bool force, + Context *on_finish) + : AsyncRequest(image_ctx, on_finish), + m_snap_id(snap_id), m_force(force) { + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + uint64_t m_snap_id; + bool m_force; +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::InvalidateRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_INVALIDATE_REQUEST_H diff --git a/src/librbd/object_map/LockRequest.cc b/src/librbd/object_map/LockRequest.cc new file mode 100644 index 000000000..d9e6ccfbc --- /dev/null +++ 
b/src/librbd/object_map/LockRequest.cc @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/LockRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::LockRequest: " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template +LockRequest::LockRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish), m_broke_lock(false) { +} + +template +void LockRequest::send() { + send_lock(); +} + +template +void LockRequest::send_lock() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::lock(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", "", "", + utime_t(), 0); + + using klass = LockRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *LockRequest::handle_lock(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val == 0) { + return m_on_finish; + } else if (*ret_val == -EEXIST) { + // already locked by myself + *ret_val = 0; + return m_on_finish; + } else if (m_broke_lock || *ret_val != -EBUSY) { + lderr(cct) << "failed to lock object map: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + return m_on_finish; + } + + send_get_lock_info(); + return nullptr; +} + +template +void LockRequest::send_get_lock_info() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectReadOperation op; + rados::cls::lock::get_lock_info_start(&op, RBD_LOCK_NAME); + + using klass = LockRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *LockRequest::handle_get_lock_info(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val == -ENOENT) { + send_lock(); + return nullptr; + } + + ClsLockType lock_type; + std::string lock_tag; + if (*ret_val == 0) { + auto it = m_out_bl.cbegin(); + *ret_val = rados::cls::lock::get_lock_info_finish(&it, &m_lockers, + &lock_type, &lock_tag); + } + if (*ret_val < 0) { + lderr(cct) << "failed to list object map locks: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + return m_on_finish; + } + + send_break_locks(); + return nullptr; +} + +template +void LockRequest::send_break_locks() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << ", " + << "num_lockers=" << m_lockers.size() << dendl; + + librados::ObjectWriteOperation op; + for (auto &locker : 
m_lockers) { + rados::cls::lock::break_lock(&op, RBD_LOCK_NAME, locker.first.cookie, + locker.first.locker); + } + + using klass = LockRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *LockRequest::handle_break_locks(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + m_broke_lock = true; + if (*ret_val == 0 || *ret_val == -ENOENT) { + send_lock(); + return nullptr; + } + + lderr(cct) << "failed to break object map lock: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::LockRequest; diff --git a/src/librbd/object_map/LockRequest.h b/src/librbd/object_map/LockRequest.h new file mode 100644 index 000000000..75d11235c --- /dev/null +++ b/src/librbd/object_map/LockRequest.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H + +#include "include/buffer.h" +#include "cls/lock/cls_lock_types.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class LockRequest { +public: + static LockRequest* create(ImageCtxT &image_ctx, Context *on_finish) { + return new LockRequest(image_ctx, on_finish); + } + LockRequest(ImageCtxT &image_ctx, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * /------------------------------------- BREAK_LOCKS * * * + * | | ^ * + * | | | * + * | | | * + * | v (EBUSY && !broke_lock) | * + * \---------> LOCK_OBJECT_MAP * * * * * * * * * * * > GET_LOCK_INFO * * + * | * ^ * * + * | * * * * + * | * * (ENOENT) * * + * | * * * * * * * * * * * * * * * * * * + * | * * + * | * (other errors) * + * | * * + * v v (other errors) * + * < * * * * * * * * * * * * * * * * * * * * * * * * + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + bool m_broke_lock; + std::map m_lockers; + bufferlist m_out_bl; + + void send_lock(); + Context *handle_lock(int *ret_val); + + void send_get_lock_info(); + Context *handle_get_lock_info(int *ret_val); + + void send_break_locks(); + Context *handle_break_locks(int *ret_val); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::LockRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_LOCK_REQUEST_H diff --git a/src/librbd/object_map/RefreshRequest.cc b/src/librbd/object_map/RefreshRequest.cc new file mode 100644 index 000000000..1527f07c7 --- /dev/null +++ b/src/librbd/object_map/RefreshRequest.cc @@ -0,0 +1,311 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/RefreshRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "librbd/object_map/LockRequest.h" +#include "librbd/object_map/ResizeRequest.h" +#include "librbd/Utils.h" +#include "osdc/Striper.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::RefreshRequest: " + +namespace librbd { + +using 
util::create_context_callback; +using util::create_rados_callback; + +namespace object_map { + +template +RefreshRequest::RefreshRequest(I &image_ctx, ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish) + : m_image_ctx(image_ctx), m_object_map_lock(object_map_lock), + m_object_map(object_map), m_snap_id(snap_id), m_on_finish(on_finish), + m_object_count(0), m_truncate_on_disk_object_map(false) { +} + +template +void RefreshRequest::send() { + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + m_object_count = Striper::get_num_objects( + m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id)); + } + + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": " + << "object_count=" << m_object_count << dendl; + send_lock(); +} + +template +void RefreshRequest::apply() { + uint64_t num_objs; + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + num_objs = Striper::get_num_objects( + m_image_ctx.layout, m_image_ctx.get_image_size(m_snap_id)); + } + ceph_assert(m_on_disk_object_map.size() >= num_objs); + + std::unique_lock object_map_locker{*m_object_map_lock}; + *m_object_map = m_on_disk_object_map; +} + +template +void RefreshRequest::send_lock() { + CephContext *cct = m_image_ctx.cct; + if (m_object_count > cls::rbd::MAX_OBJECT_MAP_OBJECT_COUNT) { + send_invalidate_and_close(); + return; + } else if (m_snap_id != CEPH_NOSNAP) { + send_load(); + return; + } + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_lock>(this); + + LockRequest *req = LockRequest::create(m_image_ctx, ctx); + req->send(); +} + +template +Context *RefreshRequest::handle_lock(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + ceph_assert(*ret_val == 0); + send_load(); + return nullptr; +} + +template +void RefreshRequest::send_load() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectReadOperation op; + cls_client::object_map_load_start(&op); + + using klass = RefreshRequest; + m_out_bl.clear(); + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *RefreshRequest::handle_load(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val == 0) { + auto bl_it = m_out_bl.cbegin(); + *ret_val = cls_client::object_map_load_finish(&bl_it, + &m_on_disk_object_map); + } + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + if (*ret_val == -EINVAL) { + // object map is corrupt on-disk -- clear it and properly size it + // so future IO can keep the object map in sync + lderr(cct) << "object map corrupt on-disk: " << oid << dendl; + m_truncate_on_disk_object_map = true; + send_resize_invalidate(); + return nullptr; + } else if (*ret_val < 0) { + lderr(cct) << "failed to load object map: " << oid << dendl; + if (*ret_val == -ETIMEDOUT && + !cct->_conf.get_val("rbd_invalidate_object_map_on_timeout")) { + return m_on_finish; + 
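+      // a slow OSD is not evidence of a corrupt map: without
+      // rbd_invalidate_object_map_on_timeout, surface -ETIMEDOUT to the
+      // caller instead of invalidating the object map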
} + + send_invalidate(); + return nullptr; + } + + if (m_on_disk_object_map.size() < m_object_count) { + lderr(cct) << "object map smaller than current object count: " + << m_on_disk_object_map.size() << " != " + << m_object_count << dendl; + send_resize_invalidate(); + return nullptr; + } + + ldout(cct, 20) << "refreshed object map: num_objs=" + << m_on_disk_object_map.size() << dendl; + if (m_on_disk_object_map.size() > m_object_count) { + // resize op might have been interrupted + ldout(cct, 1) << "object map larger than current object count: " + << m_on_disk_object_map.size() << " != " + << m_object_count << dendl; + } + + apply(); + return m_on_finish; +} + +template +void RefreshRequest::send_invalidate() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_disk_object_map.clear(); + object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count, + OBJECT_EXISTS); + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_invalidate>(this); + InvalidateRequest *req = InvalidateRequest::create( + m_image_ctx, m_snap_id, true, ctx); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + req->send(); +} + +template +Context *RefreshRequest::handle_invalidate(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val) + << dendl; + } + + apply(); + return m_on_finish; +} + +template +void RefreshRequest::send_resize_invalidate() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_on_disk_object_map.clear(); + object_map::ResizeRequest::resize(&m_on_disk_object_map, m_object_count, + OBJECT_EXISTS); + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_resize_invalidate>(this); + InvalidateRequest *req = InvalidateRequest::create( + m_image_ctx, m_snap_id, true, ctx); + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + req->send(); +} + +template +Context *RefreshRequest::handle_resize_invalidate(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val) + << dendl; + apply(); + return m_on_finish; + } + + send_resize(); + return nullptr; +} + +template +void RefreshRequest::send_resize() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + } + if (m_truncate_on_disk_object_map) { + op.truncate(0); + } + cls_client::object_map_resize(&op, m_object_count, OBJECT_NONEXISTENT); + + using klass = RefreshRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *RefreshRequest::handle_resize(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " 
" << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to adjust object map size: " << cpp_strerror(*ret_val) + << dendl; + *ret_val = 0; + } + + apply(); + return m_on_finish; +} + +template +void RefreshRequest::send_invalidate_and_close() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + using klass = RefreshRequest; + Context *ctx = create_context_callback< + klass, &klass::handle_invalidate_and_close>(this); + InvalidateRequest *req = InvalidateRequest::create( + m_image_ctx, m_snap_id, false, ctx); + + lderr(cct) << "object map too large: " << m_object_count << dendl; + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + req->send(); +} + +template +Context *RefreshRequest::handle_invalidate_and_close(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(*ret_val) + << dendl; + } else { + *ret_val = -EFBIG; + } + + std::unique_lock object_map_locker{*m_object_map_lock}; + m_object_map->clear(); + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::RefreshRequest; diff --git a/src/librbd/object_map/RefreshRequest.h b/src/librbd/object_map/RefreshRequest.h new file mode 100644 index 000000000..ddecc9604 --- /dev/null +++ b/src/librbd/object_map/RefreshRequest.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H + +#include "include/int_types.h" +#include "include/buffer.h" +#include "common/bit_vector.hpp" +#include "common/ceph_mutex.h" + +class Context; +class RWLock; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class RefreshRequest { +public: + static RefreshRequest *create(ImageCtxT &image_ctx, + ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, + uint64_t snap_id, Context *on_finish) { + return new RefreshRequest(image_ctx, object_map_lock, object_map, snap_id, + on_finish); + } + + RefreshRequest(ImageCtxT &image_ctx, ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, uint64_t snap_id, + Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * -----> LOCK (skip if snapshot) + * * | + * * v (other errors) + * * LOAD * * * * * * * > INVALIDATE ------------\ + * * | * | + * * | * (-EINVAL or too small) | + * * | * * * * * * > INVALIDATE_AND_RESIZE | + * * | | * | + * * | | * | + * * | v * | + * * | RESIZE * | + * * | | * | + * * | | * * * * * * * | + * * | | * | + * * | v v | + * * \--------------------> LOCK <-------------/ + * * | + * v v + * INVALIDATE_AND_CLOSE ---------------> + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + ceph::shared_mutex* m_object_map_lock; + ceph::BitVector<2> *m_object_map; + uint64_t m_snap_id; + Context *m_on_finish; + + uint64_t m_object_count; + ceph::BitVector<2> m_on_disk_object_map; + bool m_truncate_on_disk_object_map; + bufferlist m_out_bl; + + void send_lock(); + Context *handle_lock(int *ret_val); + + void send_load(); + Context *handle_load(int *ret_val); + + void send_invalidate(); + Context *handle_invalidate(int *ret_val); + + void send_resize_invalidate(); + Context *handle_resize_invalidate(int 
*ret_val); + + void send_resize(); + Context *handle_resize(int *ret_val); + + void send_invalidate_and_close(); + Context *handle_invalidate_and_close(int *ret_val); + + void apply(); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::RefreshRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_REFRESH_REQUEST_H diff --git a/src/librbd/object_map/RemoveRequest.cc b/src/librbd/object_map/RemoveRequest.cc new file mode 100644 index 000000000..a718d81fc --- /dev/null +++ b/src/librbd/object_map/RemoveRequest.cc @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/RemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::RemoveRequest: " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template +RemoveRequest::RemoveRequest(I *image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { +} + +template +void RemoveRequest::send() { + send_remove_object_map(); +} + +template +void RemoveRequest::send_remove_object_map() { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << dendl; + + std::unique_lock image_locker{m_image_ctx->image_lock}; + std::vector snap_ids; + snap_ids.push_back(CEPH_NOSNAP); + for (auto it : m_image_ctx->snap_info) { + snap_ids.push_back(it.first); + } + + std::lock_guard locker{m_lock}; + ceph_assert(m_ref_counter == 0); + + for (auto snap_id : snap_ids) { + m_ref_counter++; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx->id, snap_id)); + using klass = RemoveRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + + int r = m_image_ctx->md_ctx.aio_remove(oid, comp); + ceph_assert(r == 0); + comp->release(); + } +} + +template +Context *RemoveRequest::handle_remove_object_map(int *result) { + CephContext *cct = m_image_ctx->cct; + ldout(cct, 20) << __func__ << ": r=" << *result << dendl; + + { + std::lock_guard locker{m_lock}; + ceph_assert(m_ref_counter > 0); + m_ref_counter--; + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to remove object map: " << cpp_strerror(*result) + << dendl; + m_error_result = *result; + } + if (m_ref_counter > 0) { + return nullptr; + } + } + if (m_error_result < 0) { + *result = m_error_result; + } + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::RemoveRequest; diff --git a/src/librbd/object_map/RemoveRequest.h b/src/librbd/object_map/RemoveRequest.h new file mode 100644 index 000000000..ce82e603c --- /dev/null +++ b/src/librbd/object_map/RemoveRequest.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H + +#include "include/buffer.h" +#include "common/ceph_mutex.h" +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class RemoveRequest { +public: + static RemoveRequest *create(ImageCtxT *image_ctx, Context *on_finish) { + return new RemoveRequest(image_ctx, on_finish); + } + + void send(); + +private: + /** + * 
@verbatim + * + * + * | . . . + * v v . + * REMOVE_OBJECT_MAP . (for every snapshot) + * | . . + * v . . . + * + * + * @endverbatim + */ + + RemoveRequest(ImageCtxT *image_ctx, Context *on_finish); + + ImageCtxT *m_image_ctx; + Context *m_on_finish; + + int m_error_result = 0; + int m_ref_counter = 0; + mutable ceph::mutex m_lock = + ceph::make_mutex("object_map::RemoveRequest::m_lock"); + + void send_remove_object_map(); + Context *handle_remove_object_map(int *result); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::RemoveRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_REMOVE_REQUEST_H diff --git a/src/librbd/object_map/Request.cc b/src/librbd/object_map/Request.cc new file mode 100644 index 000000000..1e1aab2ae --- /dev/null +++ b/src/librbd/object_map/Request.cc @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/Request.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/RWLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/object_map/InvalidateRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::Request: " + +namespace librbd { +namespace object_map { + +bool Request::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 20) << this << " should_complete: r=" << r << dendl; + + switch (m_state) + { + case STATE_REQUEST: + if (r == -ETIMEDOUT && + !cct->_conf.get_val("rbd_invalidate_object_map_on_timeout")) { + m_state = STATE_TIMEOUT; + return true; + } else if (r < 0) { + lderr(cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + return invalidate(); + } + + finish_request(); + return true; + + case STATE_INVALIDATE: + ldout(cct, 20) << "INVALIDATE" << dendl; + if (r < 0) { + lderr(cct) << "failed to invalidate object map: " << cpp_strerror(r) + << dendl; + } + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + ceph_abort(); + break; + } + return false; +} + +bool Request::invalidate() { + bool flags_set; + int r = m_image_ctx.test_flags(m_snap_id, RBD_FLAG_OBJECT_MAP_INVALID, + &flags_set); + if (r < 0 || flags_set) { + return true; + } + + m_state = STATE_INVALIDATE; + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id, + true, + create_callback_context()); + req->send(); + return false; +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/Request.h b/src/librbd/object_map/Request.h new file mode 100644 index 000000000..7e9bfb88d --- /dev/null +++ b/src/librbd/object_map/Request.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_REQUEST_H + +#include "include/int_types.h" +#include "librbd/AsyncRequest.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +class Request : public AsyncRequest<> { +public: + Request(ImageCtx &image_ctx, uint64_t snap_id, Context *on_finish) + : AsyncRequest(image_ctx, on_finish), m_snap_id(snap_id), + m_state(STATE_REQUEST) + { + } + + void send() override = 0; + +protected: + const uint64_t m_snap_id; + + bool should_complete(int r) override; + int filter_return_code(int r) const override { 
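+    // the object map is advisory: a failed on-disk update is absorbed by
+    // invalidating the map (see should_complete) rather than failing the
+    // caller's operation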
+ if (m_state == STATE_REQUEST) { + // never propagate an error back to the caller + return 0; + } + return r; + } + virtual void finish_request() { + } + +private: + /** + * STATE_TIMEOUT --------\ + * ^ | + * | v + * ---> STATE_REQUEST ---> + * | ^ + * v | + * STATE_INVALIDATE -------/ + */ + enum State { + STATE_REQUEST, + STATE_TIMEOUT, + STATE_INVALIDATE + }; + + State m_state; + + bool invalidate(); +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_REQUEST_H diff --git a/src/librbd/object_map/ResizeRequest.cc b/src/librbd/object_map/ResizeRequest.cc new file mode 100644 index 000000000..91a3140ed --- /dev/null +++ b/src/librbd/object_map/ResizeRequest.cc @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/ResizeRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "cls/lock/cls_lock_client.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::ResizeRequest: " + +namespace librbd { +namespace object_map { + +void ResizeRequest::resize(ceph::BitVector<2> *object_map, uint64_t num_objs, + uint8_t default_state) { + size_t orig_object_map_size = object_map->size(); + object_map->resize(num_objs); + if (num_objs > orig_object_map_size) { + auto it = object_map->begin() + orig_object_map_size; + auto end_it = object_map->begin() + num_objs; + for (;it != end_it; ++it) { + *it = default_state; + } + } +} + +void ResizeRequest::send() { + CephContext *cct = m_image_ctx.cct; + + std::unique_lock l{*m_object_map_lock}; + m_num_objs = Striper::get_num_objects(m_image_ctx.layout, m_new_size); + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << this << " resizing on-disk object map: " + << "ictx=" << &m_image_ctx << ", " + << "oid=" << oid << ", num_objs=" << m_num_objs << dendl; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + } + cls_client::object_map_resize(&op, m_num_objs, m_default_object_state); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void ResizeRequest::finish_request() { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " resizing in-memory object map: " + << m_num_objs << dendl; + + std::unique_lock object_map_locker{*m_object_map_lock}; + resize(m_object_map, m_num_objs, m_default_object_state); +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/ResizeRequest.h b/src/librbd/object_map/ResizeRequest.h new file mode 100644 index 000000000..5d933bb53 --- /dev/null +++ b/src/librbd/object_map/ResizeRequest.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/object_map/Request.h" +#include "common/bit_vector.hpp" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +class ResizeRequest : public Request { +public: + ResizeRequest(ImageCtx &image_ctx, ceph::shared_mutex 
*object_map_lock, + ceph::BitVector<2> *object_map, uint64_t snap_id, + uint64_t new_size, uint8_t default_object_state, + Context *on_finish) + : Request(image_ctx, snap_id, on_finish), + m_object_map_lock(object_map_lock), m_object_map(object_map), + m_num_objs(0), m_new_size(new_size), + m_default_object_state(default_object_state) + { + } + + static void resize(ceph::BitVector<2> *object_map, uint64_t num_objs, + uint8_t default_state); + + void send() override; + +protected: + void finish_request() override; + +private: + ceph::shared_mutex* m_object_map_lock; + ceph::BitVector<2> *m_object_map; + uint64_t m_num_objs; + uint64_t m_new_size; + uint8_t m_default_object_state; +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_RESIZE_REQUEST_H diff --git a/src/librbd/object_map/SnapshotCreateRequest.cc b/src/librbd/object_map/SnapshotCreateRequest.cc new file mode 100644 index 000000000..3b2e7ee82 --- /dev/null +++ b/src/librbd/object_map/SnapshotCreateRequest.cc @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/SnapshotCreateRequest.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "cls/lock/cls_lock_client.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::SnapshotCreateRequest: " + +namespace librbd { +namespace object_map { + +namespace { + +std::ostream& operator<<(std::ostream& os, + const SnapshotCreateRequest::State& state) { + switch(state) { + case SnapshotCreateRequest::STATE_READ_MAP: + os << "READ_MAP"; + break; + case SnapshotCreateRequest::STATE_WRITE_MAP: + os << "WRITE_MAP"; + break; + case SnapshotCreateRequest::STATE_ADD_SNAPSHOT: + os << "ADD_SNAPSHOT"; + break; + default: + os << "UNKNOWN (" << static_cast(state) << ")"; + break; + } + return os; +} + +} // anonymous namespace + +void SnapshotCreateRequest::send() { + send_read_map(); +} + +bool SnapshotCreateRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0 && m_ret_val == 0) { + m_ret_val = r; + } + if (m_ret_val < 0) { + // pass errors down to base class to invalidate the object map + return Request::should_complete(r); + } + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + bool finished = false; + switch (m_state) { + case STATE_READ_MAP: + send_write_map(); + break; + case STATE_WRITE_MAP: + finished = send_add_snapshot(); + break; + case STATE_ADD_SNAPSHOT: + update_object_map(); + finished = true; + break; + default: + ceph_abort(); + break; + } + return finished; +} + +void SnapshotCreateRequest::send_read_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl; + m_state = STATE_READ_MAP; + + // IO is blocked due to the snapshot creation -- consistent to read from disk + librados::ObjectReadOperation op; + op.read(0, 0, NULL, NULL); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op, + &m_read_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotCreateRequest::send_write_map() { + CephContext 
*cct = m_image_ctx.cct; + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid + << dendl; + m_state = STATE_WRITE_MAP; + + librados::ObjectWriteOperation op; + op.write_full(m_read_bl); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +bool SnapshotCreateRequest::send_add_snapshot() { + std::shared_lock image_locker{m_image_ctx.image_lock}; + if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) == 0) { + return true; + } + + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 5) << this << " " << __func__ << ": oid=" << oid << dendl; + m_state = STATE_ADD_SNAPSHOT; + + librados::ObjectWriteOperation op; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + cls_client::object_map_snap_add(&op); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); + return false; +} + +void SnapshotCreateRequest::update_object_map() { + std::unique_lock object_map_locker{*m_object_map_lock}; + + auto it = m_object_map.begin(); + auto end_it = m_object_map.end(); + for (; it != end_it; ++it) { + if (*it == OBJECT_EXISTS) { + *it = OBJECT_EXISTS_CLEAN; + } + } +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/SnapshotCreateRequest.h b/src/librbd/object_map/SnapshotCreateRequest.h new file mode 100644 index 000000000..3074d059d --- /dev/null +++ b/src/librbd/object_map/SnapshotCreateRequest.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H + +#include "include/int_types.h" +#include "common/bit_vector.hpp" +#include "librbd/object_map/Request.h" + +class Context; +class RWLock; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +class SnapshotCreateRequest : public Request { +public: + /** + * Snapshot create goes through the following state machine: + * + * @verbatim + * + * + * | + * v + * STATE_READ_MAP + * | + * v (skip) + * STATE_WRITE_MAP . . . . . . . + * | . + * v v + * STATE_ADD_SNAPSHOT ---> + * + * @endverbatim + * + * The _ADD_SNAPSHOT state is skipped if the FAST_DIFF feature isn't enabled. 
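+   *
+   * Minimal usage sketch (illustrative only; the image context, lock and
+   * bit vector names on the caller side are assumed, not part of this
+   * change):
+   *
+   * @verbatim
+   *   C_SaferCond cond;              // common/Cond.h
+   *   auto *req = new SnapshotCreateRequest(*ictx, &map_lock, &map_bits,
+   *                                         snap_id, &cond);
+   *   req->send();                   // READ_MAP -> WRITE_MAP [-> ADD_SNAPSHOT]
+   *   int r = cond.wait();
+   * @endverbatim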
+ */ + enum State { + STATE_READ_MAP, + STATE_WRITE_MAP, + STATE_ADD_SNAPSHOT + }; + + SnapshotCreateRequest(ImageCtx &image_ctx, ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, uint64_t snap_id, + Context *on_finish) + : Request(image_ctx, snap_id, on_finish), + m_object_map_lock(object_map_lock), m_object_map(*object_map), + m_ret_val(0) { + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + ceph::shared_mutex* m_object_map_lock; + ceph::BitVector<2> &m_object_map; + + State m_state = STATE_READ_MAP; + bufferlist m_read_bl; + int m_ret_val; + + void send_read_map(); + void send_write_map(); + bool send_add_snapshot(); + + void update_object_map(); + +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_CREATE_REQUEST_H diff --git a/src/librbd/object_map/SnapshotRemoveRequest.cc b/src/librbd/object_map/SnapshotRemoveRequest.cc new file mode 100644 index 000000000..1c2ffc753 --- /dev/null +++ b/src/librbd/object_map/SnapshotRemoveRequest.cc @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/SnapshotRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "cls/lock/cls_lock_client.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::SnapshotRemoveRequest: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace object_map { + +void SnapshotRemoveRequest::send() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_wlocked(m_image_ctx.image_lock)); + + if ((m_image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0) { + int r = m_image_ctx.get_flags(m_snap_id, &m_flags); + ceph_assert(r == 0); + + compute_next_snap_id(); + load_map(); + } else { + remove_map(); + } +} + +void SnapshotRemoveRequest::load_map() { + CephContext *cct = m_image_ctx.cct; + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << "snap_oid=" << snap_oid << dendl; + + librados::ObjectReadOperation op; + cls_client::object_map_load_start(&op); + + auto rados_completion = librbd::util::create_rados_callback< + SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_load_map>(this); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op, + &m_out_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRemoveRequest::handle_load_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::object_map_load_finish(&it, &m_snap_object_map); + } + if (r == -ENOENT) { + // implies we have already deleted this snapshot and handled the + // necessary fast-diff cleanup + complete(0); + return; + } else if (r < 0) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + lderr(cct) << "failed to load object map " << oid << ": " + << cpp_strerror(r) << dendl; + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + invalidate_next_map(); + return; + } + + remove_snapshot(); +} + +void SnapshotRemoveRequest::remove_snapshot() { + if ((m_flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) { + // snapshot object map exists on disk but is 
invalid. cannot clean fast-diff + // on next snapshot if current snapshot was invalid. + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + invalidate_next_map(); + return; + } + + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_next_snap_id)); + ldout(cct, 5) << "oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + if (m_next_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + } + cls_client::object_map_snap_remove(&op, m_snap_object_map); + + auto rados_completion = librbd::util::create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_remove_snapshot>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRemoveRequest::handle_remove_snapshot(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0 && r != -ENOENT) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, + m_next_snap_id)); + lderr(cct) << "failed to remove object map snapshot " << oid << ": " + << cpp_strerror(r) << dendl; + + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + invalidate_next_map(); + return; + } + + std::shared_lock image_locker{m_image_ctx.image_lock}; + update_object_map(); + remove_map(); +} + +void SnapshotRemoveRequest::invalidate_next_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_wlocked(m_image_ctx.image_lock)); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = librbd::util::create_context_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_invalidate_next_map>(this); + InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, + m_next_snap_id, true, ctx); + req->send(); +} + +void SnapshotRemoveRequest::handle_invalidate_next_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, + m_next_snap_id)); + lderr(cct) << "failed to invalidate object map " << oid << ": " + << cpp_strerror(r) << dendl; + complete(r); + return; + } + + remove_map(); +} + +void SnapshotRemoveRequest::remove_map() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 5) << "oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + op.remove(); + + auto rados_completion = librbd::util::create_rados_callback< + SnapshotRemoveRequest, &SnapshotRemoveRequest::handle_remove_map>(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRemoveRequest::handle_remove_map(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + lderr(cct) << "failed to remove object map " << oid << ": " + << cpp_strerror(r) << dendl; + complete(r); + return; + } + + complete(0); +} + +void SnapshotRemoveRequest::compute_next_snap_id() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + + m_next_snap_id = CEPH_NOSNAP; + std::map::const_iterator it = + m_image_ctx.snap_info.find(m_snap_id); + 
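+  // snap_info is an ordered map keyed by snap id, so the entry after
+  // m_snap_id (when present) is the next newer snapshot; otherwise the
+  // fast-diff state falls through to HEAD (CEPH_NOSNAP)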
+  ceph_assert(it != m_image_ctx.snap_info.end());
+
+  ++it;
+  if (it != m_image_ctx.snap_info.end()) {
+    m_next_snap_id = it->first;
+  }
+}
+
+void SnapshotRemoveRequest::update_object_map() {
+  ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock));
+  std::unique_lock object_map_locker{*m_object_map_lock};
+  if (m_next_snap_id == m_image_ctx.snap_id && m_next_snap_id == CEPH_NOSNAP) {
+    CephContext *cct = m_image_ctx.cct;
+    ldout(cct, 5) << dendl;
+
+    auto it = m_object_map.begin();
+    auto end_it = m_object_map.end();
+    auto snap_it = m_snap_object_map.begin();
+    uint64_t i = 0;
+    for (; it != end_it; ++it) {
+      if (*it == OBJECT_EXISTS_CLEAN &&
+          (i >= m_snap_object_map.size() ||
+           *snap_it == OBJECT_EXISTS)) {
+        *it = OBJECT_EXISTS;
+      }
+      if (i < m_snap_object_map.size()) {
+        ++snap_it;
+      }
+      ++i;
+    }
+  }
+}
+
+} // namespace object_map
+} // namespace librbd
diff --git a/src/librbd/object_map/SnapshotRemoveRequest.h b/src/librbd/object_map/SnapshotRemoveRequest.h
new file mode 100644
index 000000000..1e9c75d81
--- /dev/null
+++ b/src/librbd/object_map/SnapshotRemoveRequest.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/bit_vector.hpp"
+#include "librbd/AsyncRequest.h"
+
+namespace librbd {
+namespace object_map {
+
+class SnapshotRemoveRequest : public AsyncRequest<> {
+public:
+  /**
+   * Snapshot removal goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start> -----------> STATE_LOAD_MAP ----\
+   *    .                      *             |
+   *    .                      * (error)     |
+   *    . (invalid object map) v             |
+   *    . . . > STATE_INVALIDATE_NEXT_MAP    |
+   *    .                      |             |
+   *    .                      |             |
+   *    . (fast diff disabled) v             v
+   *    . . . . . . . . . . > STATE_REMOVE_MAP
+   *                           |
+   *                           v
+   *                        <finish>
+   *
+   * @endverbatim
+   *
+   * The _LOAD_MAP state is skipped if the fast diff feature is disabled.
+   * If the fast diff feature is enabled and the snapshot is flagged as
+   * invalid, the next snapshot / HEAD object map is flagged as invalid;
+   * otherwise, the state machine proceeds to remove the object map.
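+   *
+   * Worked example (illustrative): with snapshots s1 < s2, removing s1
+   * folds its map into the next map (s2). An object tracked as EXISTS in
+   * s1's map and EXISTS_CLEAN in s2's map becomes EXISTS in s2's map: it
+   * can no longer be assumed unchanged relative to its new predecessor.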
+ */ + + SnapshotRemoveRequest(ImageCtx &image_ctx, ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, uint64_t snap_id, + Context *on_finish) + : AsyncRequest(image_ctx, on_finish), + m_object_map_lock(object_map_lock), m_object_map(*object_map), + m_snap_id(snap_id), m_next_snap_id(CEPH_NOSNAP) { + } + + void send() override; + +protected: + bool should_complete(int r) override { + return true; + } + +private: + ceph::shared_mutex* m_object_map_lock; + ceph::BitVector<2> &m_object_map; + uint64_t m_snap_id; + uint64_t m_next_snap_id; + + uint64_t m_flags = 0; + + ceph::BitVector<2> m_snap_object_map; + bufferlist m_out_bl; + + void load_map(); + void handle_load_map(int r); + + void remove_snapshot(); + void handle_remove_snapshot(int r); + + void invalidate_next_map(); + void handle_invalidate_next_map(int r); + + void remove_map(); + void handle_remove_map(int r); + + void compute_next_snap_id(); + void update_object_map(); +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_REMOVE_REQUEST_H diff --git a/src/librbd/object_map/SnapshotRollbackRequest.cc b/src/librbd/object_map/SnapshotRollbackRequest.cc new file mode 100644 index 000000000..7c2f441cc --- /dev/null +++ b/src/librbd/object_map/SnapshotRollbackRequest.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/SnapshotRollbackRequest.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "cls/lock/cls_lock_client.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::SnapshotRollbackRequest: " + +namespace librbd { +namespace object_map { + +namespace { + +std::ostream& operator<<(std::ostream& os, + const SnapshotRollbackRequest::State& state) { + switch(state) { + case SnapshotRollbackRequest::STATE_READ_MAP: + os << "READ_MAP"; + break; + case SnapshotRollbackRequest::STATE_INVALIDATE_MAP: + os << "INVALIDATE_MAP"; + break; + case SnapshotRollbackRequest::STATE_WRITE_MAP: + os << "WRITE_MAP"; + break; + default: + os << "UNKNOWN (" << static_cast(state) << ")"; + break; + } + return os; +} + +} // anonymous namespace + +void SnapshotRollbackRequest::send() { + send_read_map(); +} + +bool SnapshotRollbackRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0 && m_ret_val == 0) { + m_ret_val = r; + } + + bool finished = false; + switch (m_state) { + case STATE_READ_MAP: + if (r < 0) { + // invalidate the snapshot object map + send_invalidate_map(); + } else { + send_write_map(); + } + break; + case STATE_INVALIDATE_MAP: + // invalidate the HEAD object map as well + finished = Request::should_complete(m_ret_val); + break; + case STATE_WRITE_MAP: + finished = Request::should_complete(r); + break; + default: + ceph_abort(); + break; + } + return finished; +} + +void SnapshotRollbackRequest::send_read_map() { + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid + << dendl; + m_state = STATE_READ_MAP; + + librados::ObjectReadOperation op; + op.read(0, 0, NULL, NULL); + + librados::AioCompletion *rados_completion = create_callback_completion(); 
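+  // the snapshot map's contents land in m_read_bl and are replayed over
+  // the HEAD object map by send_write_map()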
+ int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op, + &m_read_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRollbackRequest::send_write_map() { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + + CephContext *cct = m_image_ctx.cct; + std::string snap_oid(ObjectMap<>::object_map_name(m_image_ctx.id, + CEPH_NOSNAP)); + ldout(cct, 5) << this << " " << __func__ << ": snap_oid=" << snap_oid + << dendl; + m_state = STATE_WRITE_MAP; + + librados::ObjectWriteOperation op; + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + op.write_full(m_read_bl); + + librados::AioCompletion *rados_completion = create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(snap_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +void SnapshotRollbackRequest::send_invalidate_map() { + std::shared_lock owner_locker{m_image_ctx.owner_lock}; + std::unique_lock image_locker{m_image_ctx.image_lock}; + + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_INVALIDATE_MAP; + + InvalidateRequest<> *req = new InvalidateRequest<>(m_image_ctx, m_snap_id, + false, + create_callback_context()); + req->send(); +} + +} // namespace object_map +} // namespace librbd diff --git a/src/librbd/object_map/SnapshotRollbackRequest.h b/src/librbd/object_map/SnapshotRollbackRequest.h new file mode 100644 index 000000000..e26b1e0a3 --- /dev/null +++ b/src/librbd/object_map/SnapshotRollbackRequest.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H + +#include "include/int_types.h" +#include "librbd/object_map/Request.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +class SnapshotRollbackRequest : public Request { +public: + /** + * Snapshot rollback goes through the following state machine: + * + * @verbatim + * + * + * | + * v (error) + * STATE_READ_MAP * * * * > STATE_INVALIDATE_MAP + * | | + * v v + * STATE_WRITE_MAP -------> + * + * @endverbatim + * + * If an error occurs within the READ_MAP state, the associated snapshot's + * object map will be flagged as invalid. Otherwise, an error from any state + * will result in the HEAD object map being flagged as invalid via the base + * class. 
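+   *
+   * Minimal usage sketch (illustrative only; caller-side names are
+   * assumed, not part of this change):
+   *
+   * @verbatim
+   *   C_SaferCond cond;              // common/Cond.h
+   *   auto *req = new SnapshotRollbackRequest(*ictx, snap_id, &cond);
+   *   req->send();                   // READ_MAP, then WRITE_MAP or INVALIDATE_MAP
+   *   int r = cond.wait();
+   * @endverbatim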
+ */ + enum State { + STATE_READ_MAP, + STATE_INVALIDATE_MAP, + STATE_WRITE_MAP + }; + + SnapshotRollbackRequest(ImageCtx &image_ctx, uint64_t snap_id, + Context *on_finish) + : Request(image_ctx, CEPH_NOSNAP, on_finish), + m_snap_id(snap_id), m_ret_val(0) { + ceph_assert(snap_id != CEPH_NOSNAP); + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + State m_state = STATE_READ_MAP; + uint64_t m_snap_id; + int m_ret_val; + + bufferlist m_read_bl; + + void send_read_map(); + void send_invalidate_map(); + void send_write_map(); + +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_SNAPSHOT_ROLLBACK_REQUEST_H diff --git a/src/librbd/object_map/Types.h b/src/librbd/object_map/Types.h new file mode 100644 index 000000000..0ce91bd96 --- /dev/null +++ b/src/librbd/object_map/Types.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_TYPES_H +#define CEPH_LIBRBD_OBJECT_MAP_TYPES_H + +namespace librbd { +namespace object_map { + +enum DiffState { + DIFF_STATE_HOLE = 0, /* unchanged hole */ + DIFF_STATE_DATA = 1, /* unchanged data */ + DIFF_STATE_HOLE_UPDATED = 2, /* new hole */ + DIFF_STATE_DATA_UPDATED = 3 /* new data */ +}; + +} // namespace object_map +} // namespace librbd + +#endif // CEPH_LIBRBD_OBJECT_MAP_TYPES_H diff --git a/src/librbd/object_map/UnlockRequest.cc b/src/librbd/object_map/UnlockRequest.cc new file mode 100644 index 000000000..47d1a870b --- /dev/null +++ b/src/librbd/object_map/UnlockRequest.cc @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/UnlockRequest.h" +#include "cls/lock/cls_lock_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::UnlockRequest: " + +namespace librbd { +namespace object_map { + +using util::create_rados_callback; + +template +UnlockRequest::UnlockRequest(I &image_ctx, Context *on_finish) + : m_image_ctx(image_ctx), m_on_finish(on_finish) { +} + +template +void UnlockRequest::send() { + send_unlock(); +} + +template +void UnlockRequest::send_unlock() { + CephContext *cct = m_image_ctx.cct; + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, CEPH_NOSNAP)); + ldout(cct, 10) << this << " " << __func__ << ": oid=" << oid << dendl; + + librados::ObjectWriteOperation op; + rados::cls::lock::unlock(&op, RBD_LOCK_NAME, ""); + + using klass = UnlockRequest; + librados::AioCompletion *rados_completion = + create_rados_callback(this); + int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *UnlockRequest::handle_unlock(int *ret_val) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << *ret_val << dendl; + + if (*ret_val < 0 && *ret_val != -ENOENT) { + lderr(m_image_ctx.cct) << "failed to release object map lock: " + << cpp_strerror(*ret_val) << dendl; + + } + + *ret_val = 0; + return m_on_finish; +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::UnlockRequest; diff --git a/src/librbd/object_map/UnlockRequest.h b/src/librbd/object_map/UnlockRequest.h new file mode 100644 index 000000000..d2d8dcd48 
--- /dev/null +++ b/src/librbd/object_map/UnlockRequest.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class UnlockRequest { +public: + static UnlockRequest *create(ImageCtxT &image_ctx, Context *on_finish) { + return new UnlockRequest(image_ctx, on_finish); + } + + UnlockRequest(ImageCtxT &image_ctx, Context *on_finish); + + void send(); + +private: + /** + * @verbatim + * + * ----> UNLOCK ----> + * + * @endverbatim + */ + + ImageCtxT &m_image_ctx; + Context *m_on_finish; + + void send_unlock(); + Context* handle_unlock(int *ret_val); +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::UnlockRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_UNLOCK_REQUEST_H diff --git a/src/librbd/object_map/UpdateRequest.cc b/src/librbd/object_map/UpdateRequest.cc new file mode 100644 index 000000000..30a1f2121 --- /dev/null +++ b/src/librbd/object_map/UpdateRequest.cc @@ -0,0 +1,129 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/object_map/UpdateRequest.h" +#include "include/rbd/object_map_types.h" +#include "include/stringify.h" +#include "common/dout.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "cls/lock/cls_lock_client.h" +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::object_map::UpdateRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace object_map { + +namespace { + +// keep aligned to bit_vector 4K block sizes +const uint64_t MAX_OBJECTS_PER_UPDATE = 256 * (1 << 10); + +} + +template +void UpdateRequest::send() { + update_object_map(); +} + +template +void UpdateRequest::update_object_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert(ceph_mutex_is_locked(*m_object_map_lock)); + CephContext *cct = m_image_ctx.cct; + + // break very large requests into manageable batches + m_update_end_object_no = std::min( + m_end_object_no, m_update_start_object_no + MAX_OBJECTS_PER_UPDATE); + + std::string oid(ObjectMap<>::object_map_name(m_image_ctx.id, m_snap_id)); + ldout(cct, 20) << "ictx=" << &m_image_ctx << ", oid=" << oid << ", " + << "[" << m_update_start_object_no << "," + << m_update_end_object_no << ") = " + << (m_current_state ? + stringify(static_cast(*m_current_state)) : "") + << "->" << static_cast(m_new_state) + << dendl; + + librados::ObjectWriteOperation op; + if (m_snap_id == CEPH_NOSNAP) { + rados::cls::lock::assert_locked(&op, RBD_LOCK_NAME, ClsLockType::EXCLUSIVE, "", ""); + } + cls_client::object_map_update(&op, m_update_start_object_no, + m_update_end_object_no, m_new_state, + m_current_state); + + auto rados_completion = librbd::util::create_rados_callback< + UpdateRequest, &UpdateRequest::handle_update_object_map>(this); + std::vector snaps; + int r = m_image_ctx.md_ctx.aio_operate( + oid, rados_completion, &op, 0, snaps, + (m_trace.valid() ? 
m_trace.get_info() : nullptr)); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void UpdateRequest::handle_update_object_map(int r) { + ldout(m_image_ctx.cct, 20) << "r=" << r << dendl; + + if (r == -ENOENT && m_ignore_enoent) { + r = 0; + } + if (r < 0 && m_ret_val == 0) { + m_ret_val = r; + } + + { + std::shared_lock image_locker{m_image_ctx.image_lock}; + std::unique_lock object_map_locker{*m_object_map_lock}; + update_in_memory_object_map(); + + if (m_update_end_object_no < m_end_object_no) { + m_update_start_object_no = m_update_end_object_no; + update_object_map(); + return; + } + } + + // no more batch updates to send + complete(m_ret_val); +} + +template +void UpdateRequest::update_in_memory_object_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + ceph_assert(ceph_mutex_is_locked(*m_object_map_lock)); + + // rebuilding the object map might update on-disk only + if (m_snap_id == m_image_ctx.snap_id) { + ldout(m_image_ctx.cct, 20) << dendl; + + auto it = m_object_map.begin() + + std::min(m_update_start_object_no, m_object_map.size()); + auto end_it = m_object_map.begin() + + std::min(m_update_end_object_no, m_object_map.size()); + for (; it != end_it; ++it) { + auto state_ref = *it; + uint8_t state = state_ref; + if (!m_current_state || state == *m_current_state || + (*m_current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) { + state_ref = m_new_state; + } + } + } +} + +template +void UpdateRequest::finish_request() { +} + +} // namespace object_map +} // namespace librbd + +template class librbd::object_map::UpdateRequest; diff --git a/src/librbd/object_map/UpdateRequest.h b/src/librbd/object_map/UpdateRequest.h new file mode 100644 index 000000000..b5a72d591 --- /dev/null +++ b/src/librbd/object_map/UpdateRequest.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H +#define CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H + +#include "include/int_types.h" +#include "librbd/object_map/Request.h" +#include "common/bit_vector.hpp" +#include "common/zipkin_trace.h" +#include "librbd/Utils.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace object_map { + +template +class UpdateRequest : public Request { +public: + static UpdateRequest *create(ImageCtx &image_ctx, + ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, + uint64_t snap_id, uint64_t start_object_no, + uint64_t end_object_no, uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, + bool ignore_enoent, Context *on_finish) { + return new UpdateRequest(image_ctx, object_map_lock, object_map, snap_id, + start_object_no, end_object_no, new_state, + current_state, parent_trace, ignore_enoent, + on_finish); + } + + UpdateRequest(ImageCtx &image_ctx, ceph::shared_mutex* object_map_lock, + ceph::BitVector<2> *object_map, uint64_t snap_id, + uint64_t start_object_no, uint64_t end_object_no, + uint8_t new_state, + const boost::optional ¤t_state, + const ZTracer::Trace &parent_trace, bool ignore_enoent, + Context *on_finish) + : Request(image_ctx, snap_id, on_finish), + m_object_map_lock(object_map_lock), m_object_map(*object_map), + m_start_object_no(start_object_no), m_end_object_no(end_object_no), + m_update_start_object_no(start_object_no), m_new_state(new_state), + m_current_state(current_state), + m_trace(util::create_trace(image_ctx, "update object map", parent_trace)), + 
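+      // the trace span brackets the whole batched update via the
+      // "start"/"finish" events emitted in the constructor and destructor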
m_ignore_enoent(ignore_enoent) + { + m_trace.event("start"); + } + virtual ~UpdateRequest() { + m_trace.event("finish"); + } + + void send() override; + +protected: + void finish_request() override; + +private: + /** + * @verbatim + * + * + * | + * |/------------------\ + * v | (repeat in batches) + * UPDATE_OBJECT_MAP -----/ + * | + * v + * + * + * @endverbatim + */ + + ceph::shared_mutex* m_object_map_lock; + ceph::BitVector<2> &m_object_map; + uint64_t m_start_object_no; + uint64_t m_end_object_no; + uint64_t m_update_start_object_no; + uint64_t m_update_end_object_no = 0; + uint8_t m_new_state; + boost::optional m_current_state; + ZTracer::Trace m_trace; + bool m_ignore_enoent; + + int m_ret_val = 0; + + void update_object_map(); + void handle_update_object_map(int r); + + void update_in_memory_object_map(); + +}; + +} // namespace object_map +} // namespace librbd + +extern template class librbd::object_map::UpdateRequest; + +#endif // CEPH_LIBRBD_OBJECT_MAP_UPDATE_REQUEST_H diff --git a/src/librbd/operation/DisableFeaturesRequest.cc b/src/librbd/operation/DisableFeaturesRequest.cc new file mode 100644 index 000000000..32db4b518 --- /dev/null +++ b/src/librbd/operation/DisableFeaturesRequest.cc @@ -0,0 +1,655 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/DisableFeaturesRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/image/SetFlagsRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/journal/RemoveRequest.h" +#include "librbd/journal/TypeTraits.h" +#include "librbd/mirror/DisableRequest.h" +#include "librbd/object_map/RemoveRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::DisableFeaturesRequest: " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template +DisableFeaturesRequest::DisableFeaturesRequest(I &image_ctx, + Context *on_finish, + uint64_t journal_op_tid, + uint64_t features, + bool force) + : Request(image_ctx, on_finish, journal_op_tid), m_features(features), + m_force(force) { +} + +template +void DisableFeaturesRequest::send_op() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features + << dendl; + + send_prepare_lock(); +} + +template +bool DisableFeaturesRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template +void DisableFeaturesRequest::send_prepare_lock() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + image_ctx.state->prepare_lock(create_async_context_callback( + image_ctx, create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_prepare_lock>(this))); +} + +template +Context *DisableFeaturesRequest::handle_prepare_lock(int *result) { + I &image_ctx = this->m_image_ctx; 
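+  // prepare_lock() has quiesced the image state machine; any failure here
+  // aborts before writes are blocked or features are touched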
+ CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_block_writes(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::unique_lock locker{image_ctx.owner_lock}; + image_ctx.io_image_dispatcher->block_writes(create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_block_writes>(this)); +} + +template +Context *DisableFeaturesRequest::handle_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + m_writes_blocked = true; + + { + std::unique_lock locker{image_ctx.owner_lock}; + // avoid accepting new requests from peers while we manipulate + // the image features + if (image_ctx.exclusive_lock != nullptr && + (image_ctx.journal == nullptr || + !image_ctx.journal->is_journal_replaying())) { + image_ctx.exclusive_lock->block_requests(0); + m_requests_blocked = true; + } + } + + return send_acquire_exclusive_lock(result); +} + +template +Context *DisableFeaturesRequest::send_acquire_exclusive_lock(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + { + std::unique_lock locker{image_ctx.owner_lock}; + // if disabling features w/ exclusive lock supported, we need to + // acquire the lock to temporarily block IO against the image + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + m_acquired_lock = true; + + Context *ctx = create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_acquire_exclusive_lock>( + this, image_ctx.exclusive_lock); + image_ctx.exclusive_lock->acquire_lock(ctx); + return nullptr; + } + } + + return handle_acquire_exclusive_lock(result); +} + +template +Context *DisableFeaturesRequest::handle_acquire_exclusive_lock(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + image_ctx.owner_lock.lock_shared(); + if (*result < 0) { + lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl; + image_ctx.owner_lock.unlock_shared(); + return handle_finish(*result); + } else if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + lderr(cct) << "failed to acquire exclusive lock" << dendl; + *result = image_ctx.exclusive_lock->get_unlocked_op_error(); + image_ctx.owner_lock.unlock_shared(); + return handle_finish(*result); + } + + do { + m_features &= image_ctx.features; + + // interlock object-map and fast-diff together + if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) || + ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) { + m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF); + } + + m_new_features = image_ctx.features & ~m_features; + m_features_mask = m_features; + + if ((m_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) { + if ((m_new_features & RBD_FEATURE_OBJECT_MAP) != 0 || + (m_new_features 
& RBD_FEATURE_JOURNALING) != 0) { + lderr(cct) << "cannot disable exclusive-lock. object-map " + "or journaling must be disabled before " + "disabling exclusive-lock." << dendl; + *result = -EINVAL; + break; + } + m_features_mask |= (RBD_FEATURE_OBJECT_MAP | + RBD_FEATURE_FAST_DIFF | + RBD_FEATURE_JOURNALING); + } + if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) { + m_disable_flags |= RBD_FLAG_FAST_DIFF_INVALID; + } + if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) { + m_disable_flags |= RBD_FLAG_OBJECT_MAP_INVALID; + } + } while (false); + image_ctx.owner_lock.unlock_shared(); + + if (*result < 0) { + return handle_finish(*result); + } + + send_get_mirror_mode(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_get_mirror_mode() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + send_append_op_event(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = DisableFeaturesRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *DisableFeaturesRequest::handle_get_mirror_mode(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::mirror_mode_get_finish(&it, &m_mirror_mode); + } + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to retrieve pool mirror mode: " + << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + ldout(cct, 20) << this << " " << __func__ << ": m_mirror_mode=" + << m_mirror_mode << dendl; + + send_get_mirror_image(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_get_mirror_image() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_mirror_mode != cls::rbd::MIRROR_MODE_IMAGE) { + send_disable_mirror_image(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_image_get_start(&op, image_ctx.id); + + using klass = DisableFeaturesRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *DisableFeaturesRequest::handle_get_mirror_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + cls::rbd::MirrorImage mirror_image; + + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::mirror_image_get_finish(&it, &mirror_image); + } + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to retrieve pool mirror image: " + << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED && + mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_JOURNAL && !m_force) { + lderr(cct) << "cannot disable journaling: journal-based mirroring " + << "enabled and mirror pool mode set to image" + << dendl; + *result = -EINVAL; + return handle_finish(*result); + } + + if (mirror_image.mode != 
cls::rbd::MIRROR_IMAGE_MODE_JOURNAL) { + send_close_journal(); + } else { + send_disable_mirror_image(); + } + return nullptr; +} + +template +void DisableFeaturesRequest::send_disable_mirror_image() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_disable_mirror_image>(this); + + mirror::DisableRequest *req = + mirror::DisableRequest::create(&image_ctx, m_force, true, ctx); + req->send(); +} + +template +Context *DisableFeaturesRequest::handle_disable_mirror_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to disable image mirroring: " << cpp_strerror(*result) + << dendl; + // not fatal + } + + send_close_journal(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_close_journal() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + { + std::unique_lock locker{image_ctx.owner_lock}; + if (image_ctx.journal != nullptr) { + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::swap(m_journal, image_ctx.journal); + Context *ctx = create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_close_journal>(this); + + m_journal->close(ctx); + return; + } + } + + send_remove_journal(); +} + +template +Context *DisableFeaturesRequest::handle_close_journal(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to close image journal: " << cpp_strerror(*result) + << dendl; + } + + ceph_assert(m_journal != nullptr); + m_journal->put(); + m_journal = nullptr; + + send_remove_journal(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_remove_journal() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_remove_journal>(this); + + typename journal::TypeTraits::ContextWQ* context_wq; + Journal::get_work_queue(cct, &context_wq); + + journal::RemoveRequest *req = journal::RemoveRequest::create( + image_ctx.md_ctx, image_ctx.id, librbd::Journal<>::IMAGE_CLIENT_ID, + context_wq, ctx); + + req->send(); +} + +template +Context *DisableFeaturesRequest::handle_remove_journal(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to remove image journal: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_append_op_event(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!this->template append_op_event< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_append_op_event>(this)) { + send_remove_object_map(); + } + + ldout(cct, 20) << this << " " << __func__ << dendl; +} + +template +Context *DisableFeaturesRequest::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << 
this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_remove_object_map(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_remove_object_map() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + if ((m_features & RBD_FEATURE_OBJECT_MAP) == 0) { + send_set_features(); + return; + } + + Context *ctx = create_context_callback< + DisableFeaturesRequest, + &DisableFeaturesRequest::handle_remove_object_map>(this); + + object_map::RemoveRequest *req = + object_map::RemoveRequest::create(&image_ctx, ctx); + req->send(); +} + +template +Context *DisableFeaturesRequest::handle_remove_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0 && *result != -ENOENT) { + lderr(cct) << "failed to remove object map: " << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + send_set_features(); + return nullptr; +} + +template +void DisableFeaturesRequest::send_set_features() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": new_features=" + << m_new_features << ", features_mask=" << m_features_mask + << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::set_features(&op, m_new_features, m_features_mask); + + using klass = DisableFeaturesRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *DisableFeaturesRequest::handle_set_features(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -EINVAL && (m_features_mask & RBD_FEATURE_JOURNALING) != 0) { + // NOTE: infernalis OSDs will not accept a mask with new features, so + // re-attempt with a reduced mask. 
+    ldout(cct, 5) << this << " " << __func__
+                  << ": re-attempt with a reduced mask" << dendl;
+    m_features_mask &= ~RBD_FEATURE_JOURNALING;
+    send_set_features();
+    return nullptr;
+  }
+
+  if (*result < 0) {
+    lderr(cct) << "failed to update features: " << cpp_strerror(*result)
+               << dendl;
+    return handle_finish(*result);
+  }
+
+  send_update_flags();
+  return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_update_flags() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+
+  if (m_disable_flags == 0) {
+    send_notify_update();
+    return;
+  }
+
+  ldout(cct, 20) << this << " " << __func__ << ": disable_flags="
+                 << m_disable_flags << dendl;
+
+  Context *ctx = create_context_callback<
+    DisableFeaturesRequest<I>,
+    &DisableFeaturesRequest<I>::handle_update_flags>(this);
+
+  image::SetFlagsRequest<I> *req =
+    image::SetFlagsRequest<I>::create(&image_ctx, 0, m_disable_flags, ctx);
+  req->send();
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_update_flags(int *result) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (*result < 0) {
+    lderr(cct) << "failed to update image flags: " << cpp_strerror(*result)
+               << dendl;
+    return handle_finish(*result);
+  }
+
+  send_notify_update();
+  return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_notify_update() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  Context *ctx = create_context_callback<
+    DisableFeaturesRequest<I>,
+    &DisableFeaturesRequest<I>::handle_notify_update>(this);
+
+  image_ctx.notify_update(ctx);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_notify_update(int *result) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  if (image_ctx.exclusive_lock == nullptr || !m_acquired_lock) {
+    return handle_finish(*result);
+  }
+
+  send_release_exclusive_lock();
+  return nullptr;
+}
+
+template <typename I>
+void DisableFeaturesRequest<I>::send_release_exclusive_lock() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  Context *ctx = create_context_callback<
+    DisableFeaturesRequest<I>,
+    &DisableFeaturesRequest<I>::handle_release_exclusive_lock>(
+      this, image_ctx.exclusive_lock);
+
+  image_ctx.exclusive_lock->release_lock(ctx);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_release_exclusive_lock(int *result) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl;
+
+  return handle_finish(*result);
+}
+
+template <typename I>
+Context *DisableFeaturesRequest<I>::handle_finish(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
+
+  {
+    std::unique_lock locker{image_ctx.owner_lock};
+    if (image_ctx.exclusive_lock != nullptr && m_requests_blocked) {
+      image_ctx.exclusive_lock->unblock_requests();
+    }
+
+    image_ctx.io_image_dispatcher->unblock_writes();
+  }
+  image_ctx.state->handle_prepare_lock_complete();
+
+  return this->create_context_finisher(r);
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::DisableFeaturesRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/DisableFeaturesRequest.h b/src/librbd/operation/DisableFeaturesRequest.h
new file mode 100644
index
000000000..719a03399
--- /dev/null
+++ b/src/librbd/operation/DisableFeaturesRequest.h
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H
+
+#include "librbd/ImageCtx.h"
+#include "librbd/operation/Request.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class DisableFeaturesRequest : public Request<ImageCtxT> {
+public:
+  static DisableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish,
+                                        uint64_t journal_op_tid,
+                                        uint64_t features, bool force) {
+    return new DisableFeaturesRequest(image_ctx, on_finish, journal_op_tid,
+                                      features, force);
+  }
+
+  DisableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish,
+                         uint64_t journal_op_tid, uint64_t features, bool force);
+
+protected:
+  void send_op() override;
+  bool should_complete(int r) override;
+  bool can_affect_io() const override {
+    return true;
+  }
+  journal::Event create_event(uint64_t op_tid) const override {
+    return journal::UpdateFeaturesEvent(op_tid, m_features, false);
+  }
+
+private:
+  /**
+   * DisableFeatures goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_PREPARE_LOCK
+   *    |
+   *    v
+   * STATE_BLOCK_WRITES
+   *    |
+   *    v
+   * STATE_ACQUIRE_EXCLUSIVE_LOCK (skip if not
+   *    |                          required)
+   *    | (disabling journaling)
+   *    \-------------------\
+   *    |                   |
+   *    |                   V
+   *    |             STATE_GET_MIRROR_MODE
+   *    |(not              |
+   *    | disabling        v
+   *    | journaling) STATE_GET_MIRROR_IMAGE
+   *    |                  |
+   *    |                  v
+   *    |             STATE_DISABLE_MIRROR_IMAGE (skip if not
+   *    |                  |                      required)
+   *    |                  v
+   *    |             STATE_CLOSE_JOURNAL
+   *    |                  |
+   *    |                  v
+   *    |             STATE_REMOVE_JOURNAL
+   *    |                  |
+   *    |/-------------------/
+   *    |
+   *    v
+   * STATE_APPEND_OP_EVENT (skip if journaling
+   *    |                   disabled)
+   *    v
+   * STATE_REMOVE_OBJECT_MAP (skip if not
+   *    |                     disabling object map)
+   *    v
+   * STATE_SET_FEATURES
+   *    |
+   *    v
+   * STATE_UPDATE_FLAGS
+   *    |
+   *    v
+   * STATE_NOTIFY_UPDATE
+   *    |
+   *    v
+   * STATE_RELEASE_EXCLUSIVE_LOCK (skip if not
+   *    |                          required)
+   *    | (unblock writes)
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   *
+   */
+
+  uint64_t m_features;
+  bool m_force;
+
+  bool m_acquired_lock = false;
+  bool m_writes_blocked = false;
+  bool m_image_lock_acquired = false;
+  bool m_requests_blocked = false;
+
+  uint64_t m_new_features = 0;
+  uint64_t m_disable_flags = 0;
+  uint64_t m_features_mask = 0;
+
+  decltype(ImageCtxT::journal) m_journal = nullptr;
+  cls::rbd::MirrorMode m_mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+  bufferlist m_out_bl;
+
+  void send_prepare_lock();
+  Context *handle_prepare_lock(int *result);
+
+  void send_block_writes();
+  Context *handle_block_writes(int *result);
+
+  Context *send_acquire_exclusive_lock(int *result);
+  Context *handle_acquire_exclusive_lock(int *result);
+
+  void send_get_mirror_mode();
+  Context *handle_get_mirror_mode(int *result);
+
+  void send_get_mirror_image();
+  Context *handle_get_mirror_image(int *result);
+
+  void send_disable_mirror_image();
+  Context *handle_disable_mirror_image(int *result);
+
+  void send_close_journal();
+  Context *handle_close_journal(int *result);
+
+  void send_remove_journal();
+  Context *handle_remove_journal(int *result);
+
+  void send_append_op_event();
+  Context *handle_append_op_event(int *result);
+
+  void send_remove_object_map();
+  Context *handle_remove_object_map(int *result);
+
+  void send_set_features();
+  Context *handle_set_features(int
*result); + + void send_update_flags(); + Context *handle_update_flags(int *result); + + void send_notify_update(); + Context *handle_notify_update(int *result); + + void send_release_exclusive_lock(); + Context *handle_release_exclusive_lock(int *result); + + Context *handle_finish(int r); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::DisableFeaturesRequest; + +#endif // CEPH_LIBRBD_OPERATION_DISABLE_FEATURES_REQUEST_H diff --git a/src/librbd/operation/EnableFeaturesRequest.cc b/src/librbd/operation/EnableFeaturesRequest.cc new file mode 100644 index 000000000..8e3dad94b --- /dev/null +++ b/src/librbd/operation/EnableFeaturesRequest.cc @@ -0,0 +1,494 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/EnableFeaturesRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Journal.h" +#include "librbd/Utils.h" +#include "librbd/image/SetFlagsRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/journal/CreateRequest.h" +#include "librbd/journal/TypeTraits.h" +#include "librbd/mirror/EnableRequest.h" +#include "librbd/object_map/CreateRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::EnableFeaturesRequest: " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template +EnableFeaturesRequest::EnableFeaturesRequest(I &image_ctx, + Context *on_finish, + uint64_t journal_op_tid, + uint64_t features) + : Request(image_ctx, on_finish, journal_op_tid), m_features(features) { +} + +template +void EnableFeaturesRequest::send_op() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + ldout(cct, 20) << this << " " << __func__ << ": features=" << m_features + << dendl; + send_prepare_lock(); +} + +template +bool EnableFeaturesRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template +void EnableFeaturesRequest::send_prepare_lock() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + image_ctx.state->prepare_lock(create_async_context_callback( + image_ctx, create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_prepare_lock>(this))); +} + +template +Context *EnableFeaturesRequest::handle_prepare_lock(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to lock image: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_block_writes(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + std::unique_lock locker{image_ctx.owner_lock}; + 
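+  // Writes are quiesced before any feature bit changes so that no in-flight
+  // IO can observe a half-initialized object-map or journal. Illustrative
+  // sketch only (editorial assumption, not part of this change): a client
+  // would normally reach this state machine through the public librbd API,
+  // e.g.
+  //
+  //   librbd::Image image;  // opened elsewhere
+  //   int r = image.update_features(RBD_FEATURE_JOURNALING, true);
+  //
+  // with update_features() dispatching here via the Operations state
+  // machinery.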
image_ctx.io_image_dispatcher->block_writes(create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_block_writes>(this)); +} + +template +Context *EnableFeaturesRequest::handle_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + m_writes_blocked = true; + + send_get_mirror_mode(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_get_mirror_mode() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if ((m_features & RBD_FEATURE_JOURNALING) == 0) { + Context *ctx = create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_get_mirror_mode>(this); + ctx->complete(-ENOENT); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + librados::ObjectReadOperation op; + cls_client::mirror_mode_get_start(&op); + + using klass = EnableFeaturesRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(RBD_MIRRORING, comp, &op, &m_out_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *EnableFeaturesRequest::handle_get_mirror_mode(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED; + if (*result == 0) { + auto it = m_out_bl.cbegin(); + *result = cls_client::mirror_mode_get_finish(&it, &mirror_mode); + } else if (*result == -ENOENT) { + *result = 0; + } + + if (*result < 0) { + lderr(cct) << "failed to retrieve pool mirror mode: " + << cpp_strerror(*result) << dendl; + return handle_finish(*result); + } + + m_enable_mirroring = (mirror_mode == cls::rbd::MIRROR_MODE_POOL); + + bool create_journal = false; + do { + std::unique_lock locker{image_ctx.owner_lock}; + + // avoid accepting new requests from peers while we manipulate + // the image features + if (image_ctx.exclusive_lock != nullptr && + (image_ctx.journal == nullptr || + !image_ctx.journal->is_journal_replaying())) { + image_ctx.exclusive_lock->block_requests(0); + m_requests_blocked = true; + } + + m_features &= ~image_ctx.features; + + // interlock object-map and fast-diff together + if (((m_features & RBD_FEATURE_OBJECT_MAP) != 0) || + ((m_features & RBD_FEATURE_FAST_DIFF) != 0)) { + m_features |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF); + } + + m_new_features = image_ctx.features | m_features; + m_features_mask = m_features; + + if ((m_features & RBD_FEATURE_OBJECT_MAP) != 0) { + if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + lderr(cct) << "cannot enable object-map. exclusive-lock must be " + "enabled before enabling object-map." << dendl; + *result = -EINVAL; + break; + } + m_enable_flags |= RBD_FLAG_OBJECT_MAP_INVALID; + m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_FAST_DIFF); + } + if ((m_features & RBD_FEATURE_FAST_DIFF) != 0) { + m_enable_flags |= RBD_FLAG_FAST_DIFF_INVALID; + m_features_mask |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP); + } + + if ((m_features & RBD_FEATURE_JOURNALING) != 0) { + if ((m_new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) { + lderr(cct) << "cannot enable journaling. 
exclusive-lock must be " + "enabled before enabling journaling." << dendl; + *result = -EINVAL; + break; + } + m_features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK; + create_journal = true; + } + } while (false); + + if (*result < 0) { + return handle_finish(*result); + } + if (create_journal) { + send_create_journal(); + return nullptr; + } + send_append_op_event(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_create_journal() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + ldout(cct, 20) << this << " " << __func__ << dendl; + + journal::TagData tag_data(librbd::Journal<>::LOCAL_MIRROR_UUID); + Context *ctx = create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_create_journal>(this); + + typename journal::TypeTraits::ContextWQ* context_wq; + Journal::get_work_queue(cct, &context_wq); + + journal::CreateRequest *req = journal::CreateRequest::create( + image_ctx.md_ctx, image_ctx.id, + image_ctx.config.template get_val("rbd_journal_order"), + image_ctx.config.template get_val("rbd_journal_splay_width"), + image_ctx.config.template get_val("rbd_journal_pool"), + cls::journal::Tag::TAG_CLASS_NEW, tag_data, + librbd::Journal<>::IMAGE_CLIENT_ID, context_wq, ctx); + + req->send(); +} + +template +Context *EnableFeaturesRequest::handle_create_journal(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to create journal: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_append_op_event(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!this->template append_op_event< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_append_op_event>(this)) { + send_update_flags(); + } + + ldout(cct, 20) << this << " " << __func__ << dendl; +} + +template +Context *EnableFeaturesRequest::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_update_flags(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_update_flags() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_enable_flags == 0) { + send_set_features(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << ": enable_flags=" + << m_enable_flags << dendl; + + Context *ctx = create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_update_flags>(this); + + image::SetFlagsRequest *req = + image::SetFlagsRequest::create(&image_ctx, m_enable_flags, + m_enable_flags, ctx); + req->send(); +} + +template +Context *EnableFeaturesRequest::handle_update_flags(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to update image flags: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_set_features(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_set_features() { + I &image_ctx = this->m_image_ctx; + 
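+  // cls_client::set_features() is applied server-side by the rbd object
+  // class: only the header feature bits selected by m_features_mask are
+  // rewritten, so unrelated features remain untouched even if they changed
+  // concurrently.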
CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": new_features=" + << m_new_features << ", features_mask=" << m_features_mask + << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::set_features(&op, m_new_features, m_features_mask); + + using klass = EnableFeaturesRequest; + librados::AioCompletion *comp = + create_rados_callback(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +Context *EnableFeaturesRequest::handle_set_features(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to update features: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_create_object_map(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_create_object_map() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (((image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0) || + ((m_features & RBD_FEATURE_OBJECT_MAP) == 0)) { + send_enable_mirror_image(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_create_object_map>(this); + + object_map::CreateRequest *req = + object_map::CreateRequest::create(&image_ctx, ctx); + req->send(); +} + +template +Context *EnableFeaturesRequest::handle_create_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to create object map: " << cpp_strerror(*result) + << dendl; + return handle_finish(*result); + } + + send_enable_mirror_image(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_enable_mirror_image() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!m_enable_mirroring) { + send_notify_update(); + return; + } + + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_enable_mirror_image>(this); + + auto req = mirror::EnableRequest::create( + &image_ctx, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, "", false, ctx); + req->send(); +} + +template +Context *EnableFeaturesRequest::handle_enable_mirror_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to enable mirroring: " << cpp_strerror(*result) + << dendl; + // not fatal + } + + send_notify_update(); + return nullptr; +} + +template +void EnableFeaturesRequest::send_notify_update() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + EnableFeaturesRequest, + &EnableFeaturesRequest::handle_notify_update>(this); + + image_ctx.notify_update(ctx); +} + +template +Context *EnableFeaturesRequest::handle_notify_update(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << *result << dendl; + + return handle_finish(*result); +} + +template +Context 
*EnableFeaturesRequest::handle_finish(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; + + { + std::unique_lock locker{image_ctx.owner_lock}; + + if (image_ctx.exclusive_lock != nullptr && m_requests_blocked) { + image_ctx.exclusive_lock->unblock_requests(); + } + if (m_writes_blocked) { + image_ctx.io_image_dispatcher->unblock_writes(); + } + } + image_ctx.state->handle_prepare_lock_complete(); + + return this->create_context_finisher(r); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::EnableFeaturesRequest; diff --git a/src/librbd/operation/EnableFeaturesRequest.h b/src/librbd/operation/EnableFeaturesRequest.h new file mode 100644 index 000000000..1c91b4dc7 --- /dev/null +++ b/src/librbd/operation/EnableFeaturesRequest.h @@ -0,0 +1,135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H +#define CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H + +#include "librbd/operation/Request.h" + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class EnableFeaturesRequest : public Request { +public: + static EnableFeaturesRequest *create(ImageCtxT &image_ctx, Context *on_finish, + uint64_t journal_op_tid, + uint64_t features) { + return new EnableFeaturesRequest(image_ctx, on_finish, journal_op_tid, + features); + } + + EnableFeaturesRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t journal_op_tid, uint64_t features); + +protected: + void send_op() override; + bool should_complete(int r) override; + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + return journal::UpdateFeaturesEvent(op_tid, m_features, true); + } + +private: + /** + * EnableFeatures goes through the following state machine: + * + * @verbatim + * + * + * | + * v + * STATE_PREPARE_LOCK + * | + * v + * STATE_BLOCK_WRITES + * | + * v + * STATE_GET_MIRROR_MODE + * | + * v + * STATE_CREATE_JOURNAL (skip if not + * | required) + * v + * STATE_APPEND_OP_EVENT (skip if journaling + * | disabled) + * v + * STATE_UPDATE_FLAGS + * | + * v + * STATE_SET_FEATURES + * | + * v + * STATE_CREATE_OBJECT_MAP (skip if not + * | required) + * v + * STATE_ENABLE_MIRROR_IMAGE + * | + * V + * STATE_NOTIFY_UPDATE + * | + * | (unblock writes) + * v + * + * @endverbatim + * + */ + + uint64_t m_features; + + bool m_enable_mirroring = false; + bool m_requests_blocked = false; + bool m_writes_blocked = false; + + uint64_t m_new_features = 0; + uint64_t m_enable_flags = 0; + uint64_t m_features_mask = 0; + + bufferlist m_out_bl; + + void send_prepare_lock(); + Context *handle_prepare_lock(int *result); + + void send_block_writes(); + Context *handle_block_writes(int *result); + + void send_get_mirror_mode(); + Context *handle_get_mirror_mode(int *result); + + void send_create_journal(); + Context *handle_create_journal(int *result); + + void send_append_op_event(); + Context *handle_append_op_event(int *result); + + void send_update_flags(); + Context *handle_update_flags(int *result); + + void send_set_features(); + Context *handle_set_features(int *result); + + void send_create_object_map(); + Context *handle_create_object_map(int *result); + + void send_enable_mirror_image(); + Context *handle_enable_mirror_image(int *result); + + void send_notify_update(); + Context 
*handle_notify_update(int *result); + + Context *handle_finish(int r); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::EnableFeaturesRequest; + +#endif // CEPH_LIBRBD_OPERATION_ENABLE_FEATURES_REQUEST_H diff --git a/src/librbd/operation/FlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc new file mode 100644 index 000000000..7bc346819 --- /dev/null +++ b/src/librbd/operation/FlattenRequest.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/FlattenRequest.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/crypto/CryptoInterface.h" +#include "librbd/crypto/EncryptionFormat.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/image/DetachParentRequest.h" +#include "librbd/Types.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/Utils.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::FlattenRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_rados_callback; + +template +class C_FlattenObject : public C_AsyncObjectThrottle { +public: + C_FlattenObject(AsyncObjectThrottle &throttle, I *image_ctx, + IOContext io_context, uint64_t object_no) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_io_context(io_context), + m_object_no(object_no) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + CephContext *cct = image_ctx.cct; + + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during flatten" << dendl; + return -ERESTART; + } + + { + std::shared_lock image_lock{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr && + !image_ctx.object_map->object_may_not_exist(m_object_no)) { + // can skip because the object already exists + return 1; + } + } + + if (!io::util::trigger_copyup( + &image_ctx, m_object_no, m_io_context, this)) { + // stop early if the parent went away - it just means + // another flatten finished first or the image was resized + return 1; + } + + return 0; + } + +private: + IOContext m_io_context; + uint64_t m_object_no; +}; + +template +bool FlattenRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template +void FlattenRequest::send_op() { + flatten_objects(); +} + +template +void FlattenRequest::flatten_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + auto ctx = create_context_callback< + FlattenRequest, + &FlattenRequest::handle_flatten_objects>(this); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, image_ctx.get_data_io_context(), + boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, image_ctx, 
context_factory, ctx, &m_prog_ctx, m_start_object_no, + m_start_object_no + m_overlap_objects); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +void FlattenRequest::handle_flatten_objects(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -ERESTART) { + ldout(cct, 5) << "flatten operation interrupted" << dendl; + this->complete(r); + return; + } else if (r < 0) { + lderr(cct) << "flatten encountered an error: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + crypto_flatten(); +} + + +template +void FlattenRequest::crypto_flatten() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + auto encryption_format = image_ctx.encryption_format.get(); + if (encryption_format == nullptr) { + detach_child(); + return; + } + + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + FlattenRequest, + &FlattenRequest::handle_crypto_flatten>(this); + encryption_format->flatten(&image_ctx, ctx); +} + +template +void FlattenRequest::handle_crypto_flatten(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "error flattening crypto: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + detach_child(); +} + +template +void FlattenRequest::detach_child() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + // should have been canceled prior to releasing lock + image_ctx.owner_lock.lock_shared(); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // if there are no snaps, remove from the children object as well + // (if snapshots remain, they have their own parent info, and the child + // will be removed when the last snap goes away) + image_ctx.image_lock.lock_shared(); + if ((image_ctx.features & RBD_FEATURE_DEEP_FLATTEN) == 0 && + !image_ctx.snaps.empty()) { + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + detach_parent(); + return; + } + image_ctx.image_lock.unlock_shared(); + + ldout(cct, 5) << dendl; + auto ctx = create_context_callback< + FlattenRequest, + &FlattenRequest::handle_detach_child>(this); + auto req = image::DetachChildRequest::create(image_ctx, ctx); + req->send(); + image_ctx.owner_lock.unlock_shared(); +} + +template +void FlattenRequest::handle_detach_child(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "detach encountered an error: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + detach_parent(); +} + +template +void FlattenRequest::detach_parent() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + // should have been canceled prior to releasing lock + image_ctx.owner_lock.lock_shared(); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // stop early if the parent went away - it just means + // another flatten finished first, so this one is useless. 
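+  // (the parent linkage is only stable while image_lock is held, so it is
+  // re-sampled under the lock below)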
+ image_ctx.image_lock.lock_shared(); + if (!image_ctx.parent) { + ldout(cct, 5) << "image already flattened" << dendl; + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + this->complete(0); + return; + } + image_ctx.image_lock.unlock_shared(); + + // remove parent from this (base) image + auto ctx = create_context_callback< + FlattenRequest, + &FlattenRequest::handle_detach_parent>(this); + auto req = image::DetachParentRequest::create(image_ctx, ctx); + req->send(); + image_ctx.owner_lock.unlock_shared(); +} + +template +void FlattenRequest::handle_detach_parent(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "remove parent encountered an error: " << cpp_strerror(r) + << dendl; + } + + this->complete(r); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::FlattenRequest; diff --git a/src/librbd/operation/FlattenRequest.h b/src/librbd/operation/FlattenRequest.h new file mode 100644 index 000000000..ec6a38a9d --- /dev/null +++ b/src/librbd/operation/FlattenRequest.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H +#define CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H + +#include "librbd/operation/Request.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template +class FlattenRequest : public Request +{ +public: + FlattenRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t start_object_no, uint64_t overlap_objects, + ProgressContext& prog_ctx) + : Request(image_ctx, on_finish), + m_start_object_no(start_object_no), + m_overlap_objects(overlap_objects), + m_prog_ctx(prog_ctx) {} + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::FlattenEvent(op_tid); + } + +private: + /** + * @verbatim + * + * + * | + * v + * FLATTEN_OBJECTS + * | + * v + * CRYPTO_FLATTEN + * | + * v + * DETACH_CHILD + * | + * v + * DETACH_PARENT + * | + * v + * + * + * @endverbatim + */ + + uint64_t m_start_object_no; + uint64_t m_overlap_objects; + ProgressContext &m_prog_ctx; + + void flatten_objects(); + void handle_flatten_objects(int r); + + + void crypto_flatten(); + void handle_crypto_flatten(int r); + + void detach_child(); + void handle_detach_child(int r); + + void detach_parent(); + void handle_detach_parent(int r); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::FlattenRequest; + +#endif // CEPH_LIBRBD_OPERATION_FLATTEN_REQUEST_H diff --git a/src/librbd/operation/MetadataRemoveRequest.cc b/src/librbd/operation/MetadataRemoveRequest.cc new file mode 100644 index 000000000..c5d6141ad --- /dev/null +++ b/src/librbd/operation/MetadataRemoveRequest.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/MetadataRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MetadataRemoveRequest: " + +namespace librbd { +namespace operation { + +template +MetadataRemoveRequest::MetadataRemoveRequest(I &image_ctx, + Context *on_finish, + const std::string &key) + : Request(image_ctx, on_finish), 
m_key(key) {
+}
+
+template <typename I>
+void MetadataRemoveRequest<I>::send_op() {
+  send_metadata_remove();
+}
+
+template <typename I>
+bool MetadataRemoveRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+template <typename I>
+void MetadataRemoveRequest<I>::send_metadata_remove() {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock));
+
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  librados::ObjectWriteOperation op;
+  cls_client::metadata_remove(&op, m_key);
+
+  librados::AioCompletion *comp = this->create_callback_completion();
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/MetadataRemoveRequest.h b/src/librbd/operation/MetadataRemoveRequest.h
new file mode 100644
index 000000000..1d7f2a46a
--- /dev/null
+++ b/src/librbd/operation/MetadataRemoveRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include <iosfwd>
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class MetadataRemoveRequest : public Request<ImageCtxT> {
+public:
+  MetadataRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+                        const std::string &key);
+
+protected:
+  void send_op() override;
+  bool should_complete(int r) override;
+
+  journal::Event create_event(uint64_t op_tid) const override {
+    return journal::MetadataRemoveEvent(op_tid, m_key);
+  }
+
+private:
+  std::string m_key;
+
+  void send_metadata_remove();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::MetadataRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_METADATA_REMOVE_REQUEST_H
diff --git a/src/librbd/operation/MetadataSetRequest.cc b/src/librbd/operation/MetadataSetRequest.cc
new file mode 100644
index 000000000..5fb939352
--- /dev/null
+++ b/src/librbd/operation/MetadataSetRequest.cc
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/MetadataSetRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::MetadataSetRequest: "
+
+namespace librbd {
+namespace operation {
+
+template <typename I>
+MetadataSetRequest<I>::MetadataSetRequest(I &image_ctx,
+                                          Context *on_finish,
+                                          const std::string &key,
+                                          const std::string &value)
+  : Request<I>(image_ctx, on_finish), m_key(key), m_value(value) {
+}
+
+template <typename I>
+void MetadataSetRequest<I>::send_op() {
+  send_metadata_set();
+}
+
+template <typename I>
+bool MetadataSetRequest<I>::should_complete(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl;
+  }
+  return true;
+}
+
+template <typename I>
+void MetadataSetRequest<I>::send_metadata_set() {
+  I
&image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << this << " " << __func__ << dendl; + + m_data[m_key].append(m_value); + librados::ObjectWriteOperation op; + cls_client::metadata_set(&op, m_data); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::MetadataSetRequest; diff --git a/src/librbd/operation/MetadataSetRequest.h b/src/librbd/operation/MetadataSetRequest.h new file mode 100644 index 000000000..5f8daa2f1 --- /dev/null +++ b/src/librbd/operation/MetadataSetRequest.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H +#define CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H + +#include "librbd/operation/Request.h" +#include "include/buffer.h" +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class MetadataSetRequest : public Request { +public: + MetadataSetRequest(ImageCtxT &image_ctx, Context *on_finish, + const std::string &key, const std::string &value); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::MetadataSetEvent(op_tid, m_key, m_value); + } + +private: + std::string m_key; + std::string m_value; + std::map m_data; + + void send_metadata_set(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::MetadataSetRequest; + +#endif // CEPH_LIBRBD_OPERATION_METADATA_SET_REQUEST_H diff --git a/src/librbd/operation/MigrateRequest.cc b/src/librbd/operation/MigrateRequest.cc new file mode 100644 index 000000000..2b9adb773 --- /dev/null +++ b/src/librbd/operation/MigrateRequest.cc @@ -0,0 +1,238 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/MigrateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/Utils.h" +#include "librbd/deep_copy/ObjectCopyRequest.h" +#include "librbd/io/AsyncOperation.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectRequest.h" +#include "osdc/Striper.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::MigrateRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_async_context_callback; + +namespace { + +template +class C_MigrateObject : public C_AsyncObjectThrottle { +public: + C_MigrateObject(AsyncObjectThrottle &throttle, I *image_ctx, + IOContext io_context, uint64_t object_no) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_io_context(io_context), + m_object_no(object_no) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + CephContext *cct = image_ctx.cct; + + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + ldout(cct, 1) << "lost exclusive lock during migrate" << 
dendl; + return -ERESTART; + } + + start_async_op(); + return 0; + } + +private: + IOContext m_io_context; + uint64_t m_object_no; + + io::AsyncOperation *m_async_op = nullptr; + + void start_async_op() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << dendl; + + ceph_assert(m_async_op == nullptr); + m_async_op = new io::AsyncOperation(); + m_async_op->start_op(image_ctx); + + if (!image_ctx.io_image_dispatcher->writes_blocked()) { + migrate_object(); + return; + } + + auto ctx = create_async_context_callback( + image_ctx, create_context_callback< + C_MigrateObject, &C_MigrateObject::handle_start_async_op>(this)); + m_async_op->finish_op(); + delete m_async_op; + m_async_op = nullptr; + image_ctx.io_image_dispatcher->wait_on_writes_unblocked(ctx); + } + + void handle_start_async_op(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to start async op: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + std::shared_lock owner_locker{image_ctx.owner_lock}; + start_async_op(); + } + + bool is_within_overlap_bounds() { + I &image_ctx = this->m_image_ctx; + std::shared_lock image_locker{image_ctx.image_lock}; + + auto overlap = std::min(image_ctx.size, image_ctx.migration_info.overlap); + return overlap > 0 && + Striper::get_num_objects(image_ctx.layout, overlap) > m_object_no; + } + + void migrate_object() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + CephContext *cct = image_ctx.cct; + + auto ctx = create_context_callback< + C_MigrateObject, &C_MigrateObject::handle_migrate_object>(this); + + if (is_within_overlap_bounds()) { + bufferlist bl; + auto req = new io::ObjectWriteRequest(&image_ctx, m_object_no, 0, + std::move(bl), m_io_context, 0, + 0, std::nullopt, {}, ctx); + + ldout(cct, 20) << "copyup object req " << req << ", object_no " + << m_object_no << dendl; + + req->send(); + } else { + ceph_assert(image_ctx.parent != nullptr); + + uint32_t flags = deep_copy::OBJECT_COPY_REQUEST_FLAG_MIGRATION; + if (image_ctx.migration_info.flatten) { + flags |= deep_copy::OBJECT_COPY_REQUEST_FLAG_FLATTEN; + } + + auto req = deep_copy::ObjectCopyRequest::create( + image_ctx.parent, &image_ctx, 0, 0, image_ctx.migration_info.snap_map, + m_object_no, flags, nullptr, ctx); + + ldout(cct, 20) << "deep copy object req " << req << ", object_no " + << m_object_no << dendl; + req->send(); + } + } + + void handle_migrate_object(int r) { + CephContext *cct = this->m_image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } + + m_async_op->finish_op(); + delete m_async_op; + this->complete(r); + } +}; + +} // anonymous namespace + +template +void MigrateRequest::send_op() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << dendl; + + migrate_objects(); +} + +template +bool MigrateRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + + return true; +} + +template +void MigrateRequest::migrate_objects() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + 
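+  // One C_MigrateObject is issued per object within the parent overlap, and
+  // AsyncObjectThrottle caps the fan-out at rbd_concurrent_management_ops
+  // (see start_ops() below), so large images are migrated in bounded
+  // batches.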
ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + uint64_t overlap_objects = get_num_overlap_objects(); + + ldout(cct, 10) << "from 0 to " << overlap_objects << dendl; + + auto ctx = create_context_callback< + MigrateRequest, &MigrateRequest::handle_migrate_objects>(this); + + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, image_ctx.get_data_io_context(), + boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, overlap_objects); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +void MigrateRequest::handle_migrate_objects(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to migrate objects: " << cpp_strerror(r) << dendl; + } + + this->complete(r); +} + +template +uint64_t MigrateRequest::get_num_overlap_objects() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << dendl; + + std::shared_lock image_locker{image_ctx.image_lock}; + + auto overlap = image_ctx.migration_info.overlap; + + return overlap > 0 ? + Striper::get_num_objects(image_ctx.layout, overlap) : 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::MigrateRequest; diff --git a/src/librbd/operation/MigrateRequest.h b/src/librbd/operation/MigrateRequest.h new file mode 100644 index 000000000..a143b579c --- /dev/null +++ b/src/librbd/operation/MigrateRequest.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H + +#include "librbd/operation/Request.h" +#include "librbd/Types.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template +class MigrateRequest : public Request +{ +public: + MigrateRequest(ImageCtxT &image_ctx, Context *on_finish, + ProgressContext &prog_ctx) + : Request(image_ctx, on_finish), m_prog_ctx(prog_ctx) { + } + +protected: + void send_op() override; + bool should_complete(int r) override; + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + ceph_abort(); + return journal::UnknownEvent(); + } + +private: + /** + * Migrate goes through the following state machine to copy objects + * from the parent (migrating source) image: + * + * @verbatim + * + * + * | + * v + * MIGRATE_OBJECTS + * | + * v + * + * + * @endverbatim + * + */ + + ProgressContext &m_prog_ctx; + + void migrate_objects(); + void handle_migrate_objects(int r); + + uint64_t get_num_overlap_objects(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::MigrateRequest; + +#endif // CEPH_LIBRBD_OPERATION_MIGRATE_REQUEST_H diff --git a/src/librbd/operation/ObjectMapIterate.cc b/src/librbd/operation/ObjectMapIterate.cc new file mode 100644 index 000000000..50db3df85 --- /dev/null +++ b/src/librbd/operation/ObjectMapIterate.cc @@ -0,0 +1,308 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/ObjectMapIterate.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include 
"librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/object_map/InvalidateRequest.h" +#include "librbd/Utils.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::ObjectMapIterateRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template +class C_VerifyObjectCallback : public C_AsyncObjectThrottle { +public: + C_VerifyObjectCallback(AsyncObjectThrottle &throttle, I *image_ctx, + uint64_t snap_id, uint64_t object_no, + ObjectIterateWork handle_mismatch, + std::atomic_flag *invalidate) + : C_AsyncObjectThrottle(throttle, *image_ctx), + m_snap_id(snap_id), m_object_no(object_no), + m_oid(image_ctx->get_object_name(m_object_no)), + m_handle_mismatch(handle_mismatch), + m_invalidate(invalidate) + { + m_io_ctx.dup(image_ctx->data_ctx); + m_io_ctx.snap_set_read(CEPH_SNAPDIR); + } + + void complete(int r) override { + I &image_ctx = this->m_image_ctx; + if (should_complete(r)) { + ldout(image_ctx.cct, 20) << m_oid << " C_VerifyObjectCallback completed " + << dendl; + m_io_ctx.close(); + + this->finish(r); + delete this; + } + } + + int send() override { + send_list_snaps(); + return 0; + } + +private: + librados::IoCtx m_io_ctx; + uint64_t m_snap_id; + uint64_t m_object_no; + std::string m_oid; + ObjectIterateWork m_handle_mismatch; + std::atomic_flag *m_invalidate; + + librados::snap_set_t m_snap_set; + int m_snap_list_ret = 0; + + bool should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + if (r == 0) { + r = m_snap_list_ret; + } + if (r < 0 && r != -ENOENT) { + lderr(cct) << m_oid << " C_VerifyObjectCallback::should_complete: " + << "encountered an error: " << cpp_strerror(r) << dendl; + return true; + } + + ldout(cct, 20) << m_oid << " C_VerifyObjectCallback::should_complete: " + << " r=" + << r << dendl; + return object_map_action(get_object_state()); + } + + void send_list_snaps() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + ldout(image_ctx.cct, 5) << m_oid + << " C_VerifyObjectCallback::send_list_snaps" + << dendl; + + librados::ObjectReadOperation op; + op.list_snaps(&m_snap_set, &m_snap_list_ret); + + librados::AioCompletion *comp = util::create_rados_callback(this); + int r = m_io_ctx.aio_operate(m_oid, comp, &op, NULL); + ceph_assert(r == 0); + comp->release(); + } + + uint8_t get_object_state() { + I &image_ctx = this->m_image_ctx; + std::shared_lock image_locker{image_ctx.image_lock}; + for (std::vector::const_iterator r = + m_snap_set.clones.begin(); r != m_snap_set.clones.end(); ++r) { + librados::snap_t from_snap_id; + librados::snap_t to_snap_id; + if (r->cloneid == librados::SNAP_HEAD) { + from_snap_id = next_valid_snap_id(m_snap_set.seq + 1); + to_snap_id = librados::SNAP_HEAD; + } else { + from_snap_id = next_valid_snap_id(r->snaps[0]); + to_snap_id = r->snaps[r->snaps.size()-1]; + } + + if (to_snap_id < m_snap_id) { + continue; + } else if (m_snap_id < from_snap_id) { + break; + } + + if ((image_ctx.features & RBD_FEATURE_FAST_DIFF) != 0 && + from_snap_id != m_snap_id) { + return OBJECT_EXISTS_CLEAN; + } + return OBJECT_EXISTS; + } + return OBJECT_NONEXISTENT; + } + + uint64_t next_valid_snap_id(uint64_t snap_id) { + I &image_ctx = this->m_image_ctx; + 
ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + std::map::iterator it = + image_ctx.snap_info.lower_bound(snap_id); + if (it == image_ctx.snap_info.end()) { + return CEPH_NOSNAP; + } + return it->first; + } + + bool object_map_action(uint8_t new_state) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + std::shared_lock owner_locker{image_ctx.owner_lock}; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + std::shared_lock image_locker{image_ctx.image_lock}; + ceph_assert(image_ctx.object_map != nullptr); + + uint8_t state = (*image_ctx.object_map)[m_object_no]; + ldout(cct, 10) << "C_VerifyObjectCallback::object_map_action" + << " object " << image_ctx.get_object_name(m_object_no) + << " state " << (int)state + << " new_state " << (int)new_state << dendl; + + if (state != new_state) { + int r = 0; + + ceph_assert(m_handle_mismatch); + r = m_handle_mismatch(image_ctx, m_object_no, state, new_state); + if (r) { + lderr(cct) << "object map error: object " + << image_ctx.get_object_name(m_object_no) + << " marked as " << (int)state << ", but should be " + << (int)new_state << dendl; + m_invalidate->test_and_set(); + } else { + ldout(cct, 1) << "object map inconsistent: object " + << image_ctx.get_object_name(m_object_no) + << " marked as " << (int)state << ", but should be " + << (int)new_state << dendl; + } + } + + return true; + } +}; + +} // anonymous namespace + +template +void ObjectMapIterateRequest::send() { + if (!m_image_ctx.data_ctx.is_valid()) { + this->async_complete(-ENODEV); + return; + } + + send_verify_objects(); +} + +template +bool ObjectMapIterateRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl; + + if (r == -ENODEV) { + lderr(cct) << "missing data pool" << dendl; + return true; + } + + if (r < 0) { + lderr(cct) << "object map operation encountered an error: " + << cpp_strerror(r) << dendl; + } + + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + switch (m_state) { + case STATE_VERIFY_OBJECTS: + if (m_invalidate.test_and_set()) { + send_invalidate_object_map(); + return false; + } else if (r == 0) { + return true; + } + break; + + case STATE_INVALIDATE_OBJECT_MAP: + if (r == 0) { + return true; + } + break; + + default: + ceph_abort(); + break; + } + + if (r < 0) { + return true; + } + + return false; +} + +template +void ObjectMapIterateRequest::send_verify_objects() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + CephContext *cct = m_image_ctx.cct; + + uint64_t snap_id; + uint64_t num_objects; + { + std::shared_lock l{m_image_ctx.image_lock}; + snap_id = m_image_ctx.snap_id; + num_objects = Striper::get_num_objects(m_image_ctx.layout, + m_image_ctx.get_image_size(snap_id)); + } + ldout(cct, 5) << this << " send_verify_objects" << dendl; + + m_state = STATE_VERIFY_OBJECTS; + + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &m_image_ctx, snap_id, + boost::lambda::_2, m_handle_mismatch, &m_invalidate)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, m_image_ctx, context_factory, this->create_callback_context(), + &m_prog_ctx, 0, num_objects); + throttle->start_ops( + m_image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +uint64_t ObjectMapIterateRequest::get_image_size() const { + 
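+  // For the HEAD revision, prefer the size targeted by the in-progress
+  // resize request (front of resize_reqs), presumably so the iteration
+  // covers the object map that resize is about to install; snapshots
+  // use their recorded size.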
ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + if (m_image_ctx.snap_id == CEPH_NOSNAP) { + if (!m_image_ctx.resize_reqs.empty()) { + return m_image_ctx.resize_reqs.front()->get_image_size(); + } else { + return m_image_ctx.size; + } + } + return m_image_ctx.get_image_size(m_image_ctx.snap_id); +} + +template +void ObjectMapIterateRequest::send_invalidate_object_map() { + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 5) << this << " send_invalidate_object_map" << dendl; + m_state = STATE_INVALIDATE_OBJECT_MAP; + + object_map::InvalidateRequest*req = + object_map::InvalidateRequest::create(m_image_ctx, m_image_ctx.snap_id, + true, + this->create_callback_context()); + + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + std::unique_lock image_locker{m_image_ctx.image_lock}; + req->send(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::ObjectMapIterateRequest; diff --git a/src/librbd/operation/ObjectMapIterate.h b/src/librbd/operation/ObjectMapIterate.h new file mode 100644 index 000000000..14215902a --- /dev/null +++ b/src/librbd/operation/ObjectMapIterate.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H +#define CEPH_LIBRBD_OPERATION_OBJECT_MAP_ITERATE_H + +#include +#include + +#include "include/int_types.h" +#include "include/rbd/object_map_types.h" +#include "librbd/AsyncRequest.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template +using ObjectIterateWork = bool(*)(ImageCtxT &image_ctx, + uint64_t object_no, + uint8_t current_state, + uint8_t new_state); + +template +class ObjectMapIterateRequest : public AsyncRequest { +public: + ObjectMapIterateRequest(ImageCtxT &image_ctx, Context *on_finish, + ProgressContext &prog_ctx, + ObjectIterateWork handle_mismatch) + : AsyncRequest(image_ctx, on_finish), m_image_ctx(image_ctx), + m_prog_ctx(prog_ctx), m_handle_mismatch(handle_mismatch) + { + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + enum State { + STATE_VERIFY_OBJECTS, + STATE_INVALIDATE_OBJECT_MAP + }; + + ImageCtxT &m_image_ctx; + ProgressContext &m_prog_ctx; + ObjectIterateWork m_handle_mismatch; + std::atomic_flag m_invalidate = ATOMIC_FLAG_INIT; + State m_state = STATE_VERIFY_OBJECTS; + + void send_verify_objects(); + void send_invalidate_object_map(); + + uint64_t get_image_size() const; +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::ObjectMapIterateRequest; + +#endif diff --git a/src/librbd/operation/RebuildObjectMapRequest.cc b/src/librbd/operation/RebuildObjectMapRequest.cc new file mode 100644 index 000000000..5deb182e5 --- /dev/null +++ b/src/librbd/operation/RebuildObjectMapRequest.cc @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/RebuildObjectMapRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/operation/ResizeRequest.h" +#include "librbd/operation/TrimRequest.h" +#include "librbd/operation/ObjectMapIterate.h" +#include "librbd/Utils.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef 
dout_prefix +#define dout_prefix *_dout << "librbd::RebuildObjectMapRequest: " + +namespace librbd { +namespace operation { + +using util::create_context_callback; + +template +void RebuildObjectMapRequest::send() { + send_resize_object_map(); +} + +template +bool RebuildObjectMapRequest::should_complete(int r) { + CephContext *cct = m_image_ctx.cct; + ldout(cct, 5) << this << " should_complete: " << " r=" << r << dendl; + + std::shared_lock owner_lock{m_image_ctx.owner_lock}; + switch (m_state) { + case STATE_RESIZE_OBJECT_MAP: + ldout(cct, 5) << "RESIZE_OBJECT_MAP" << dendl; + if (r == -ESTALE && !m_attempted_trim) { + // objects are still flagged as in-use -- delete them + m_attempted_trim = true; + send_trim_image(); + return false; + } else if (r == 0) { + send_verify_objects(); + } + break; + + case STATE_TRIM_IMAGE: + ldout(cct, 5) << "TRIM_IMAGE" << dendl; + if (r == 0) { + send_resize_object_map(); + } + break; + + case STATE_VERIFY_OBJECTS: + ldout(cct, 5) << "VERIFY_OBJECTS" << dendl; + if (r == 0) { + send_save_object_map(); + } + break; + + case STATE_SAVE_OBJECT_MAP: + ldout(cct, 5) << "SAVE_OBJECT_MAP" << dendl; + if (r == 0) { + send_update_header(); + } + break; + case STATE_UPDATE_HEADER: + ldout(cct, 5) << "UPDATE_HEADER" << dendl; + if (r == 0) { + return true; + } + break; + + default: + ceph_abort(); + break; + } + + if (r == -ERESTART) { + ldout(cct, 5) << "rebuild object map operation interrupted" << dendl; + return true; + } else if (r < 0) { + lderr(cct) << "rebuild object map encountered an error: " << cpp_strerror(r) + << dendl; + return true; + } + return false; +} + +template +void RebuildObjectMapRequest::send_resize_object_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + CephContext *cct = m_image_ctx.cct; + + m_image_ctx.image_lock.lock_shared(); + ceph_assert(m_image_ctx.object_map != nullptr); + + uint64_t size = get_image_size(); + uint64_t num_objects = Striper::get_num_objects(m_image_ctx.layout, size); + + if (m_image_ctx.object_map->size() == num_objects) { + m_image_ctx.image_lock.unlock_shared(); + send_verify_objects(); + return; + } + + ldout(cct, 5) << this << " send_resize_object_map" << dendl; + m_state = STATE_RESIZE_OBJECT_MAP; + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + m_image_ctx.object_map->aio_resize(size, OBJECT_NONEXISTENT, + this->create_callback_context()); + m_image_ctx.image_lock.unlock_shared(); +} + +template +void RebuildObjectMapRequest::send_trim_image() { + CephContext *cct = m_image_ctx.cct; + + std::shared_lock l{m_image_ctx.owner_lock}; + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + ldout(cct, 5) << this << " send_trim_image" << dendl; + m_state = STATE_TRIM_IMAGE; + + uint64_t new_size; + uint64_t orig_size; + { + std::shared_lock l{m_image_ctx.image_lock}; + ceph_assert(m_image_ctx.object_map != nullptr); + + new_size = get_image_size(); + orig_size = m_image_ctx.get_object_size() * + m_image_ctx.object_map->size(); + } + TrimRequest *req = TrimRequest::create(m_image_ctx, + this->create_callback_context(), + orig_size, new_size, m_prog_ctx); + req->send(); +} + +template +bool update_object_map(I& image_ctx, uint64_t object_no, uint8_t current_state, + uint8_t new_state) { + CephContext *cct = image_ctx.cct; + uint64_t snap_id = image_ctx.snap_id; + + current_state = 
(*image_ctx.object_map)[object_no]; + if (current_state == OBJECT_EXISTS && new_state == OBJECT_NONEXISTENT && + snap_id == CEPH_NOSNAP) { + // might be writing object to OSD concurrently + new_state = current_state; + } + + if (new_state != current_state) { + ldout(cct, 15) << image_ctx.get_object_name(object_no) + << " rebuild updating object map " + << static_cast(current_state) << "->" + << static_cast(new_state) << dendl; + image_ctx.object_map->set_state(object_no, new_state, current_state); + } + return false; +} + +template +void RebuildObjectMapRequest::send_verify_objects() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + CephContext *cct = m_image_ctx.cct; + + m_state = STATE_VERIFY_OBJECTS; + ldout(cct, 5) << this << " send_verify_objects" << dendl; + + ObjectMapIterateRequest *req = + new ObjectMapIterateRequest(m_image_ctx, + this->create_callback_context(), + m_prog_ctx, update_object_map); + + req->send(); +} + +template +void RebuildObjectMapRequest::send_save_object_map() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + CephContext *cct = m_image_ctx.cct; + + ldout(cct, 5) << this << " send_save_object_map" << dendl; + m_state = STATE_SAVE_OBJECT_MAP; + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + std::shared_lock image_locker{m_image_ctx.image_lock}; + ceph_assert(m_image_ctx.object_map != nullptr); + m_image_ctx.object_map->aio_save(this->create_callback_context()); +} + +template +void RebuildObjectMapRequest::send_update_header() { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.owner_lock)); + + // should have been canceled prior to releasing lock + ceph_assert(m_image_ctx.exclusive_lock == nullptr || + m_image_ctx.exclusive_lock->is_lock_owner()); + + ldout(m_image_ctx.cct, 5) << this << " send_update_header" << dendl; + m_state = STATE_UPDATE_HEADER; + + librados::ObjectWriteOperation op; + + uint64_t flags = RBD_FLAG_OBJECT_MAP_INVALID | RBD_FLAG_FAST_DIFF_INVALID; + cls_client::set_flags(&op, m_image_ctx.snap_id, 0, flags); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + + std::unique_lock image_locker{m_image_ctx.image_lock}; + m_image_ctx.update_flags(m_image_ctx.snap_id, flags, false); +} + +template +uint64_t RebuildObjectMapRequest::get_image_size() const { + ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); + if (m_image_ctx.snap_id == CEPH_NOSNAP) { + if (!m_image_ctx.resize_reqs.empty()) { + return m_image_ctx.resize_reqs.front()->get_image_size(); + } else { + return m_image_ctx.size; + } + } + return m_image_ctx.get_image_size(m_image_ctx.snap_id); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::RebuildObjectMapRequest; diff --git a/src/librbd/operation/RebuildObjectMapRequest.h b/src/librbd/operation/RebuildObjectMapRequest.h new file mode 100644 index 000000000..c7f1aa3b7 --- /dev/null +++ b/src/librbd/operation/RebuildObjectMapRequest.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H +#define CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H + +#include "include/int_types.h" +#include "librbd/AsyncRequest.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace 
operation { + +template +class RebuildObjectMapRequest : public AsyncRequest { +public: + + RebuildObjectMapRequest(ImageCtxT &image_ctx, Context *on_finish, + ProgressContext &prog_ctx) + : AsyncRequest(image_ctx, on_finish), m_image_ctx(image_ctx), + m_prog_ctx(prog_ctx), m_attempted_trim(false) + { + } + + void send() override; + +protected: + bool should_complete(int r) override; + +private: + /** + * Rebuild object map goes through the following state machine to + * verify per-object state: + * + * + * . | . . . . . . . . . . + * . | . . + * . v v . + * . STATE_RESIZE_OBJECT_MAP . . . > STATE_TRIM_IMAGE + * . | + * . v + * . . . > STATE_VERIFY_OBJECTS + * | + * v + * STATE_SAVE_OBJECT_MAP + * | + * v + * STATE_UPDATE_HEADER + * + * The _RESIZE_OBJECT_MAP state will be skipped if the object map + * is appropriately sized for the image. The _TRIM_IMAGE state will + * only be hit if the resize failed due to an in-use object. + */ + enum State { + STATE_RESIZE_OBJECT_MAP, + STATE_TRIM_IMAGE, + STATE_VERIFY_OBJECTS, + STATE_SAVE_OBJECT_MAP, + STATE_UPDATE_HEADER + }; + + ImageCtxT &m_image_ctx; + ProgressContext &m_prog_ctx; + State m_state = STATE_RESIZE_OBJECT_MAP; + bool m_attempted_trim; + + void send_resize_object_map(); + void send_trim_image(); + void send_verify_objects(); + void send_save_object_map(); + void send_update_header(); + + uint64_t get_image_size() const; + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::RebuildObjectMapRequest; + +#endif // CEPH_LIBRBD_OPERATION_REBUILD_OBJECT_MAP_REQUEST_H diff --git a/src/librbd/operation/RenameRequest.cc b/src/librbd/operation/RenameRequest.cc new file mode 100644 index 000000000..15bcd819c --- /dev/null +++ b/src/librbd/operation/RenameRequest.cc @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/RenameRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/rados/librados.hpp" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::RenameRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template +std::ostream& operator<<(std::ostream& os, + const typename RenameRequest::State& state) { + switch(state) { + case RenameRequest::STATE_READ_DIRECTORY: + os << "READ_DIRECTORY"; + break; + case RenameRequest::STATE_READ_SOURCE_HEADER: + os << "READ_SOURCE_HEADER"; + break; + case RenameRequest::STATE_WRITE_DEST_HEADER: + os << "WRITE_DEST_HEADER"; + break; + case RenameRequest::STATE_UPDATE_DIRECTORY: + os << "UPDATE_DIRECTORY"; + break; + case RenameRequest::STATE_REMOVE_SOURCE_HEADER: + os << "REMOVE_SOURCE_HEADER"; + break; + default: + os << "UNKNOWN (" << static_cast(state) << ")"; + break; + } + return os; +} + +} // anonymous namespace + +template +RenameRequest::RenameRequest(I &image_ctx, Context *on_finish, + const std::string &dest_name) + : Request(image_ctx, on_finish), m_dest_name(dest_name), + m_source_oid(image_ctx.old_format ? util::old_header_name(image_ctx.name) : + util::id_obj_name(image_ctx.name)), + m_dest_oid(image_ctx.old_format ? 
util::old_header_name(dest_name) : + util::id_obj_name(dest_name)) { +} + +template +void RenameRequest::send_op() { + I &image_ctx = this->m_image_ctx; + if (image_ctx.old_format) { + send_read_source_header(); + return; + } + send_read_directory(); +} + +template +bool RenameRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + r = filter_return_code(r); + if (r < 0) { + if (r == -EEXIST) { + ldout(cct, 1) << "image already exists" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; + } + + if (m_state == STATE_READ_DIRECTORY) { + std::string name; + auto it = m_source_name_bl.cbegin(); + r = cls_client::dir_get_name_finish(&it, &name); + if (r < 0) { + lderr(cct) << "could not read directory: " << cpp_strerror(r) << dendl; + return true; + } + bool update = false; + { + std::shared_lock image_locker{image_ctx.image_lock}; + update = image_ctx.name != name; + } + if (update) { + image_ctx.set_image_name(name); + m_source_oid = util::id_obj_name(name); + } + } else if (m_state == STATE_UPDATE_DIRECTORY) { + // update in-memory name before removing source header + apply(); + } else if (m_state == STATE_REMOVE_SOURCE_HEADER) { + return true; + } + + std::shared_lock owner_lock{image_ctx.owner_lock}; + switch (m_state) { + case STATE_READ_DIRECTORY: + send_read_source_header(); + break; + case STATE_READ_SOURCE_HEADER: + send_write_destination_header(); + break; + case STATE_WRITE_DEST_HEADER: + send_update_directory(); + break; + case STATE_UPDATE_DIRECTORY: + send_remove_source_header(); + break; + default: + ceph_abort(); + break; + } + return false; +} + +template +int RenameRequest::filter_return_code(int r) const { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_state == STATE_READ_SOURCE_HEADER && r == -ENOENT) { + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.name == m_dest_name) { + // signal that replay raced with itself + return -EEXIST; + } + } else if (m_state == STATE_REMOVE_SOURCE_HEADER && r < 0) { + if (r != -ENOENT) { + lderr(cct) << "warning: couldn't remove old source object (" + << m_source_oid << ")" << dendl; + } + return 0; + } + return r; +} + +template +void RenameRequest::send_read_directory() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_READ_DIRECTORY; + + librados::ObjectReadOperation op; + cls_client::dir_get_name_start(&op, image_ctx.id); + + auto comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(RBD_DIRECTORY, comp, &op, + &m_source_name_bl); + ceph_assert(r == 0); + comp->release(); +} + +template +void RenameRequest::send_read_source_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_READ_SOURCE_HEADER; + + librados::ObjectReadOperation op; + op.read(0, 0, NULL, NULL); + + // TODO: old code read omap values but there are no omap values on the + // old format header nor the new format id object + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op, + &m_header_bl); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void 
RenameRequest::send_write_destination_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_WRITE_DEST_HEADER; + + librados::ObjectWriteOperation op; + op.create(true); + op.write_full(m_header_bl); + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(m_dest_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RenameRequest::send_update_directory() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_UPDATE_DIRECTORY; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + bufferlist cmd_bl; + bufferlist empty_bl; + encode(static_cast<__u8>(CEPH_OSD_TMAP_SET), cmd_bl); + encode(m_dest_name, cmd_bl); + encode(empty_bl, cmd_bl); + encode(static_cast<__u8>(CEPH_OSD_TMAP_RM), cmd_bl); + encode(image_ctx.name, cmd_bl); + op.tmap_update(cmd_bl); + } else { + cls_client::dir_rename_image(&op, image_ctx.name, m_dest_name, + image_ctx.id); + } + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(RBD_DIRECTORY, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RenameRequest::send_remove_source_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_REMOVE_SOURCE_HEADER; + + librados::ObjectWriteOperation op; + op.remove(); + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(m_source_oid, rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +void RenameRequest::apply() { + I &image_ctx = this->m_image_ctx; + image_ctx.set_image_name(m_dest_name); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::RenameRequest; diff --git a/src/librbd/operation/RenameRequest.h b/src/librbd/operation/RenameRequest.h new file mode 100644 index 000000000..11fdec648 --- /dev/null +++ b/src/librbd/operation/RenameRequest.h @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_RENAME_REQUEST_H +#define CEPH_LIBRBD_RENAME_REQUEST_H + +#include "librbd/operation/Request.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class RenameRequest : public Request +{ +public: + /** + * Rename goes through the following state machine: + * + * @verbatim + * + * + * | + * v + * STATE_READ_DIRECTORY + * | + * v + * STATE_READ_SOURCE_HEADER + * | + * v + * STATE_WRITE_DEST_HEADER + * | + * v + * STATE_UPDATE_DIRECTORY + * | + * v + * STATE_REMOVE_SOURCE_HEADER + * | + * v + * + * + * @endverbatim + * + */ + enum State { + STATE_READ_DIRECTORY, + STATE_READ_SOURCE_HEADER, + STATE_WRITE_DEST_HEADER, + STATE_UPDATE_DIRECTORY, + STATE_REMOVE_SOURCE_HEADER + }; + + RenameRequest(ImageCtxT &image_ctx, Context *on_finish, + const std::string &dest_name); + +protected: + void send_op() override; + bool should_complete(int r) override; + int filter_return_code(int r) const override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::RenameEvent(op_tid, m_dest_name); + } + +private: + 
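+  // m_source_oid / m_dest_oid are initialized in the constructor from
+  // util::old_header_name() for v1 images or util::id_obj_name() for
+  // v2 images, i.e. the header or id object that must be moved.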
std::string m_dest_name; + + std::string m_source_oid; + std::string m_dest_oid; + + State m_state = STATE_READ_DIRECTORY; + + bufferlist m_source_name_bl; + bufferlist m_header_bl; + + void send_read_directory(); + void send_read_source_header(); + void send_write_destination_header(); + void send_update_directory(); + void send_remove_source_header(); + + void apply(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::RenameRequest; + +#endif // CEPH_LIBRBD_RENAME_REQUEST_H diff --git a/src/librbd/operation/Request.cc b/src/librbd/operation/Request.cc new file mode 100644 index 000000000..269c8a4f9 --- /dev/null +++ b/src/librbd/operation/Request.cc @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/Request.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" +#include "librbd/asio/ContextWQ.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::Request: " + +namespace librbd { +namespace operation { + +template +Request::Request(I &image_ctx, Context *on_finish, uint64_t journal_op_tid) + : AsyncRequest(image_ctx, on_finish), m_op_tid(journal_op_tid) { +} + +template +void Request::send() { + [[maybe_unused]] I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + // automatically create the event if we don't need to worry + // about affecting concurrent IO ops + if (can_affect_io() || !append_op_event()) { + send_op(); + } +} + +template +Context *Request::create_context_finisher(int r) { + // automatically commit the event if required (delete after commit) + if (m_appended_op_event && !m_committed_op_event && + commit_op_event(r)) { + return nullptr; + } + + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + return util::create_context_callback, &Request::finish>(this); +} + +template +void Request::finish_and_destroy(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + // automatically commit the event if required (delete after commit) + if (m_appended_op_event && !m_committed_op_event && + commit_op_event(r)) { + return; + } + + AsyncRequest::finish_and_destroy(r); +} + +template +void Request::finish(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + ceph_assert(!m_appended_op_event || m_committed_op_event); + AsyncRequest::finish(r); +} + +template +bool Request::append_op_event() { + I &image_ctx = this->m_image_ctx; + + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()) { + append_op_event(util::create_context_callback< + Request, &Request::handle_op_event_safe>(this)); + return true; + } + return false; +} + +template +bool Request::commit_op_event(int r) { + I &image_ctx = this->m_image_ctx; + std::shared_lock image_locker{image_ctx.image_lock}; + + if (!m_appended_op_event) { + return false; + } + + ceph_assert(m_op_tid != 0); + ceph_assert(!m_committed_op_event); + m_committed_op_event = true; + + if (image_ctx.journal != nullptr && + image_ctx.journal->is_journal_appending()) { + CephContext *cct 
= image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + // ops will be canceled / completed before closing journal + ceph_assert(image_ctx.journal->is_journal_ready()); + image_ctx.journal->commit_op_event(m_op_tid, r, + new C_CommitOpEvent(this, r)); + return true; + } + return false; +} + +template +void Request::handle_commit_op_event(int r, int original_ret_val) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r) + << dendl; + } + if (original_ret_val < 0) { + r = original_ret_val; + } + finish(r); +} + +template +void Request::replay_op_ready(Context *on_safe) { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + ceph_assert(m_op_tid != 0); + + m_appended_op_event = true; + image_ctx.journal->replay_op_ready( + m_op_tid, util::create_async_context_callback(image_ctx, on_safe)); +} + +template +void Request::append_op_event(Context *on_safe) { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << dendl; + + m_op_tid = image_ctx.journal->allocate_op_tid(); + image_ctx.journal->append_op_event( + m_op_tid, journal::EventEntry{create_event(m_op_tid)}, + new C_AppendOpEvent(this, on_safe)); +} + +template +void Request::handle_op_event_safe(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " " << __func__ << ": r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to commit op event to journal: " << cpp_strerror(r) + << dendl; + this->finish(r); + delete this; + } else { + ceph_assert(!can_affect_io()); + + // haven't started the request state machine yet + std::shared_lock owner_locker{image_ctx.owner_lock}; + send_op(); + } +} + +} // namespace operation +} // namespace librbd + +#ifndef TEST_F +template class librbd::operation::Request; +#endif diff --git a/src/librbd/operation/Request.h b/src/librbd/operation/Request.h new file mode 100644 index 000000000..a36d20857 --- /dev/null +++ b/src/librbd/operation/Request.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_REQUEST_H +#define CEPH_LIBRBD_OPERATION_REQUEST_H + +#include "librbd/AsyncRequest.h" +#include "include/Context.h" +#include "librbd/Utils.h" +#include "librbd/Journal.h" + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class Request : public AsyncRequest { +public: + Request(ImageCtxT &image_ctx, Context *on_finish, + uint64_t journal_op_tid = 0); + + void send(); + +protected: + void finish(int r) override; + virtual void send_op() = 0; + + virtual bool can_affect_io() const { + return false; + } + virtual journal::Event create_event(uint64_t op_tid) const = 0; + + template + bool append_op_event(T *request) { + ImageCtxT &image_ctx = this->m_image_ctx; + + ceph_assert(can_affect_io()); + std::scoped_lock locker{image_ctx.owner_lock, image_ctx.image_lock}; + if (image_ctx.journal != nullptr) { + if (image_ctx.journal->is_journal_replaying()) { + Context *ctx = util::create_context_callback(request); + replay_op_ready(ctx); + return 
true; + } else if (image_ctx.journal->is_journal_appending()) { + Context *ctx = util::create_context_callback(request); + append_op_event(ctx); + return true; + } + } + return false; + } + + bool append_op_event(); + + // NOTE: temporary until converted to new state machine format + Context *create_context_finisher(int r); + void finish_and_destroy(int r) override; + +private: + struct C_AppendOpEvent : public Context { + Request *request; + Context *on_safe; + C_AppendOpEvent(Request *request, Context *on_safe) + : request(request), on_safe(on_safe) { + } + void finish(int r) override { + if (r >= 0) { + request->m_appended_op_event = true; + } + on_safe->complete(r); + } + }; + + struct C_CommitOpEvent : public Context { + Request *request; + int ret_val; + C_CommitOpEvent(Request *request, int ret_val) + : request(request), ret_val(ret_val) { + } + void finish(int r) override { + request->handle_commit_op_event(r, ret_val); + delete request; + } + }; + + uint64_t m_op_tid = 0; + bool m_appended_op_event = false; + bool m_committed_op_event = false; + + void replay_op_ready(Context *on_safe); + void append_op_event(Context *on_safe); + void handle_op_event_safe(int r); + + bool commit_op_event(int r); + void handle_commit_op_event(int r, int original_ret_val); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::Request; + +#endif // CEPH_LIBRBD_OPERATION_REQUEST_H diff --git a/src/librbd/operation/ResizeRequest.cc b/src/librbd/operation/ResizeRequest.cc new file mode 100644 index 000000000..1f09ea308 --- /dev/null +++ b/src/librbd/operation/ResizeRequest.cc @@ -0,0 +1,466 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/ResizeRequest.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/ImageDispatchSpec.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/operation/TrimRequest.h" +#include "common/dout.h" +#include "common/errno.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::ResizeRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template +ResizeRequest::ResizeRequest(I &image_ctx, Context *on_finish, + uint64_t new_size, bool allow_shrink, ProgressContext &prog_ctx, + uint64_t journal_op_tid, bool disable_journal) + : Request(image_ctx, on_finish, journal_op_tid), + m_original_size(0), m_new_size(new_size), m_allow_shrink(allow_shrink), + m_prog_ctx(prog_ctx), m_new_parent_overlap(0), m_disable_journal(disable_journal), + m_xlist_item(this) +{ +} + +template +ResizeRequest::~ResizeRequest() { + I &image_ctx = this->m_image_ctx; + ResizeRequest *next_req = NULL; + { + std::unique_lock image_locker{image_ctx.image_lock}; + ceph_assert(m_xlist_item.remove_myself()); + if (!image_ctx.resize_reqs.empty()) { + next_req = image_ctx.resize_reqs.front(); + } + } + + if (next_req != NULL) { + std::shared_lock owner_locker{image_ctx.owner_lock}; + next_req->send(); + } +} + +template +void ResizeRequest::send() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + { 
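+    // Resize operations are serialized via image_ctx.resize_reqs: only
+    // the request at the front of the xlist proceeds, and a finished
+    // request's destructor kicks off the next queued one.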
+ std::unique_lock image_locker{image_ctx.image_lock}; + if (!m_xlist_item.is_on_list()) { + image_ctx.resize_reqs.push_back(&m_xlist_item); + if (image_ctx.resize_reqs.front() != this) { + return; + } + } + + ceph_assert(image_ctx.resize_reqs.front() == this); + m_original_size = image_ctx.size; + compute_parent_overlap(); + } + + Request::send(); +} + +template +void ResizeRequest::send_op() { + [[maybe_unused]] I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + if (this->is_canceled()) { + this->async_complete(-ERESTART); + } else { + send_pre_block_writes(); + } +} + +template +void ResizeRequest::send_pre_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + image_ctx.io_image_dispatcher->block_writes(create_context_callback< + ResizeRequest, &ResizeRequest::handle_pre_block_writes>(this)); +} + +template +Context *ResizeRequest::handle_pre_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + image_ctx.io_image_dispatcher->unblock_writes(); + return this->create_context_finisher(*result); + } + + return send_append_op_event(); +} + +template +Context *ResizeRequest::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_new_size < m_original_size && !m_allow_shrink) { + ldout(cct, 1) << "shrinking the image is not permitted" << dendl; + image_ctx.io_image_dispatcher->unblock_writes(); + this->async_complete(-EINVAL); + return nullptr; + } + + if (m_disable_journal || !this->template append_op_event< + ResizeRequest, &ResizeRequest::handle_append_op_event>(this)) { + return send_grow_object_map(); + } + + ldout(cct, 5) << dendl; + return nullptr; +} + +template +Context *ResizeRequest::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + image_ctx.io_image_dispatcher->unblock_writes(); + return this->create_context_finisher(*result); + } + + return send_grow_object_map(); +} + +template +void ResizeRequest::send_trim_image() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + TrimRequest *req = TrimRequest::create( + image_ctx, create_context_callback< + ResizeRequest, &ResizeRequest::handle_trim_image>(this), + m_original_size, m_new_size, m_prog_ctx); + req->send(); +} + +template +Context *ResizeRequest::handle_trim_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result == -ERESTART) { + ldout(cct, 5) << "resize operation interrupted" << dendl; + return this->create_context_finisher(*result); + } else if (*result < 0) { + lderr(cct) << "failed to trim image: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_post_block_writes(); + return nullptr; +} + +template +void ResizeRequest::send_flush_cache() { + I &image_ctx = this->m_image_ctx; + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + auto ctx = 
create_context_callback< + ResizeRequest, &ResizeRequest::handle_flush_cache>(this); + auto aio_comp = io::AioCompletion::create_and_start( + ctx, util::get_image_ctx(&image_ctx), io::AIO_TYPE_FLUSH); + auto req = io::ImageDispatchSpec::create_flush( + image_ctx, io::IMAGE_DISPATCH_LAYER_INTERNAL_START, aio_comp, + io::FLUSH_SOURCE_INTERNAL, {}); + req->send(); +} + +template +Context *ResizeRequest::handle_flush_cache(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to flush cache: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_invalidate_cache(); + return nullptr; +} + +template +void ResizeRequest::send_invalidate_cache() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + // need to invalidate since we're deleting objects, and + // ObjectCacher doesn't track non-existent objects + image_ctx.io_image_dispatcher->invalidate_cache(create_context_callback< + ResizeRequest, &ResizeRequest::handle_invalidate_cache>(this)); +} + +template +Context *ResizeRequest::handle_invalidate_cache(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + // ignore busy error -- writeback was successfully flushed so we might be + // wasting some cache space for trimmed objects, but they will get purged + // eventually. Most likely cause of the issue was a in-flight cache read + if (*result < 0 && *result != -EBUSY) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result) + << dendl; + return this->create_context_finisher(*result); + } + + send_trim_image(); + return nullptr; +} + +template +Context *ResizeRequest::send_grow_object_map() { + I &image_ctx = this->m_image_ctx; + + { + std::unique_lock image_locker{image_ctx.image_lock}; + m_shrink_size_visible = true; + } + + if (m_original_size == m_new_size) { + image_ctx.io_image_dispatcher->unblock_writes(); + return this->create_context_finisher(0); + } else if (m_new_size < m_original_size) { + image_ctx.io_image_dispatcher->unblock_writes(); + send_flush_cache(); + return nullptr; + } + + image_ctx.owner_lock.lock_shared(); + image_ctx.image_lock.lock_shared(); + if (image_ctx.object_map == nullptr) { + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + + // IO is still blocked + send_update_header(); + return nullptr; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + image_ctx.object_map->aio_resize( + m_new_size, OBJECT_NONEXISTENT, create_context_callback< + ResizeRequest, &ResizeRequest::handle_grow_object_map>(this)); + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + return nullptr; +} + +template +Context *ResizeRequest::handle_grow_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to resize object map: " + << cpp_strerror(*result) << dendl; + image_ctx.io_image_dispatcher->unblock_writes(); + return this->create_context_finisher(*result); + } + + // IO is still blocked + send_update_header(); + return nullptr; +} + +template +Context 
*ResizeRequest::send_shrink_object_map() { + I &image_ctx = this->m_image_ctx; + + image_ctx.owner_lock.lock_shared(); + image_ctx.image_lock.lock_shared(); + if (image_ctx.object_map == nullptr || m_new_size > m_original_size) { + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + + update_size_and_overlap(); + return this->create_context_finisher(0); + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "original_size=" << m_original_size << ", " + << "new_size=" << m_new_size << dendl; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + image_ctx.object_map->aio_resize( + m_new_size, OBJECT_NONEXISTENT, create_context_callback< + ResizeRequest, &ResizeRequest::handle_shrink_object_map>(this)); + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + return nullptr; +} + +template +Context *ResizeRequest::handle_shrink_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to resize object map: " + << cpp_strerror(*result) << dendl; + image_ctx.io_image_dispatcher->unblock_writes(); + return this->create_context_finisher(*result); + } + + update_size_and_overlap(); + return this->create_context_finisher(0); +} + +template +void ResizeRequest::send_post_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + image_ctx.io_image_dispatcher->block_writes(create_context_callback< + ResizeRequest, &ResizeRequest::handle_post_block_writes>(this)); +} + +template +Context *ResizeRequest::handle_post_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + image_ctx.io_image_dispatcher->unblock_writes(); + lderr(cct) << "failed to block writes prior to header update: " + << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_update_header(); + return nullptr; +} + +template +void ResizeRequest::send_update_header() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "original_size=" << m_original_size << ", " + << "new_size=" << m_new_size << dendl;; + + // should have been canceled prior to releasing lock + std::shared_lock owner_locker{image_ctx.owner_lock}; + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + // rewrite only the size field of the header + ceph_le64 new_size(m_new_size); + bufferlist bl; + bl.append(reinterpret_cast(&new_size), sizeof(new_size)); + op.write(offsetof(rbd_obj_header_ondisk, image_size), bl); + } else { + cls_client::set_size(&op, m_new_size); + } + + librados::AioCompletion *rados_completion = create_rados_callback< + ResizeRequest, &ResizeRequest::handle_update_header>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, + rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *ResizeRequest::handle_update_header(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << 
"failed to update image header: " << cpp_strerror(*result) + << dendl; + image_ctx.io_image_dispatcher->unblock_writes(); + return this->create_context_finisher(*result); + } + + return send_shrink_object_map(); +} + +template +void ResizeRequest::compute_parent_overlap() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + if (image_ctx.parent == NULL) { + m_new_parent_overlap = 0; + } else { + m_new_parent_overlap = std::min(m_new_size, image_ctx.parent_md.overlap); + } +} + +template +void ResizeRequest::update_size_and_overlap() { + I &image_ctx = this->m_image_ctx; + { + std::unique_lock image_locker{image_ctx.image_lock}; + image_ctx.size = m_new_size; + + if (image_ctx.parent != NULL && m_new_size < m_original_size) { + image_ctx.parent_md.overlap = m_new_parent_overlap; + } + } + + // blocked by PRE_BLOCK_WRITES (grow) or POST_BLOCK_WRITES (shrink) state + image_ctx.io_image_dispatcher->unblock_writes(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::ResizeRequest; diff --git a/src/librbd/operation/ResizeRequest.h b/src/librbd/operation/ResizeRequest.h new file mode 100644 index 000000000..f5e2f807f --- /dev/null +++ b/src/librbd/operation/ResizeRequest.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H +#define CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H + +#include "librbd/operation/Request.h" +#include "include/xlist.h" + +namespace librbd +{ + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template +class ResizeRequest : public Request { +public: + static ResizeRequest *create(ImageCtxT &image_ctx, Context *on_finish, + uint64_t new_size, bool allow_shrink, + ProgressContext &prog_ctx, uint64_t journal_op_tid, + bool disable_journal) { + return new ResizeRequest(image_ctx, on_finish, new_size, allow_shrink, prog_ctx, + journal_op_tid, disable_journal); + } + + ResizeRequest(ImageCtxT &image_ctx, Context *on_finish, uint64_t new_size, + bool allow_shrink, ProgressContext &prog_ctx, uint64_t journal_op_tid, + bool disable_journal); + ~ResizeRequest() override; + + inline bool shrinking() const { + return (m_shrink_size_visible && m_new_size < m_original_size); + } + + inline uint64_t get_image_size() const { + return m_new_size; + } + + void send() override; + +protected: + void send_op() override; + bool should_complete(int r) override { + return true; + } + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + return journal::ResizeEvent(op_tid, m_new_size); + } + +private: + /** + * Resize goes through the following state machine to resize the image + * and update the object map: + * + * @verbatim + * + * + * | + * v + * STATE_PRE_BLOCK_WRITES + * | + * v + * STATE_APPEND_OP_EVENT (skip if journaling + * | disabled) + * | + * | (grow) + * |\--------> STATE_GROW_OBJECT_MAP (skip if object map + * | | disabled) + * | v + * | STATE_UPDATE_HEADER ----------------------------\ + * | (unblock writes) | + * | | + * | (unblock writes) | + * | | + * | (shrink) | + * |\--------> STATE_FLUSH_CACHE | + * | | | + * | v | + * | STATE_INVALIDATE_CACHE | + * | | | + * | v | + * | STATE_TRIM_IMAGE | + * | | | + * | v | + * | STATE_POST_BLOCK_WRITES | + * | | | + * | v | + * | STATE_UPDATE_HEADER | + * | | | + * | v | + * | STATE_SHRINK_OBJECT_MAP (skip if object map | + * | | disabled) | + * | | (unblock 
writes) | + * | (no change) v | + * \------------> <-----------------------------------/ + * + * @endverbatim + * + * The _OBJECT_MAP states are skipped if the object map isn't enabled. + * The state machine will immediately transition to _FINISHED if there + * are no objects to trim. + */ + + uint64_t m_original_size; + uint64_t m_new_size; + bool m_allow_shrink = true; + ProgressContext &m_prog_ctx; + uint64_t m_new_parent_overlap; + bool m_shrink_size_visible = false; + bool m_disable_journal = false; + + typename xlist*>::item m_xlist_item; + + void send_pre_block_writes(); + Context *handle_pre_block_writes(int *result); + + Context *send_append_op_event(); + Context *handle_append_op_event(int *result); + + void send_flush_cache(); + Context *handle_flush_cache(int *result); + + void send_invalidate_cache(); + Context *handle_invalidate_cache(int *result); + + void send_trim_image(); + Context *handle_trim_image(int *result); + + Context *send_grow_object_map(); + Context *handle_grow_object_map(int *result); + + Context *send_shrink_object_map(); + Context *handle_shrink_object_map(int *result); + + void send_post_block_writes(); + Context *handle_post_block_writes(int *result); + + void send_update_header(); + Context *handle_update_header(int *result); + + void compute_parent_overlap(); + void update_size_and_overlap(); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::ResizeRequest; + +#endif // CEPH_LIBRBD_OPERATION_RESIZE_REQUEST_H diff --git a/src/librbd/operation/SnapshotCreateRequest.cc b/src/librbd/operation/SnapshotCreateRequest.cc new file mode 100644 index 000000000..8cea35d2f --- /dev/null +++ b/src/librbd/operation/SnapshotCreateRequest.cc @@ -0,0 +1,449 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cls/rbd/cls_rbd_types.h" +#include "librbd/operation/SnapshotCreateRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageWatcher.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/mirror/snapshot/SetImageStateRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotCreateRequest: " + +namespace librbd { +namespace operation { + +using util::create_async_context_callback; +using util::create_context_callback; +using util::create_rados_callback; + +template +SnapshotCreateRequest::SnapshotCreateRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t journal_op_tid, + uint64_t flags, + ProgressContext &prog_ctx) + : Request(image_ctx, on_finish, journal_op_tid), + m_snap_namespace(snap_namespace), m_snap_name(snap_name), + m_skip_object_map(flags & SNAP_CREATE_FLAG_SKIP_OBJECT_MAP), + m_skip_notify_quiesce(flags & SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE), + m_ignore_notify_quiesce_error(flags & SNAP_CREATE_FLAG_IGNORE_NOTIFY_QUIESCE_ERROR), + m_prog_ctx(prog_ctx) { +} + +template +void SnapshotCreateRequest::send_op() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(cct) << "missing data pool" << dendl; + this->async_complete(-ENODEV); + return; + } + + send_notify_quiesce(); +} + +template +void SnapshotCreateRequest::send_notify_quiesce() { + if (m_skip_notify_quiesce) { 
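+    // SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE: skip the watcher round-trip
+    // and go straight to suspending image I/O.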
+ send_suspend_requests(); + return; + } + + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + image_ctx.image_watcher->notify_quiesce( + &m_request_id, m_prog_ctx, create_async_context_callback( + image_ctx, create_context_callback, + &SnapshotCreateRequest::handle_notify_quiesce>(this))); +} + +template +Context *SnapshotCreateRequest::handle_notify_quiesce(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0 && !m_ignore_notify_quiesce_error) { + lderr(cct) << "failed to notify quiesce: " << cpp_strerror(*result) + << dendl; + save_result(result); + send_notify_unquiesce(); + return nullptr; + } + + std::shared_lock owner_locker{image_ctx.owner_lock}; + send_suspend_requests(); + return nullptr; +} + +template +void SnapshotCreateRequest::send_suspend_requests() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + // TODO suspend (shrink) resize to ensure consistent RBD mirror + send_suspend_aio(); +} + +template +Context *SnapshotCreateRequest::handle_suspend_requests(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + // TODO + send_suspend_aio(); + return nullptr; +} + +template +void SnapshotCreateRequest::send_suspend_aio() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + image_ctx.io_image_dispatcher->block_writes(create_context_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_suspend_aio>(this)); +} + +template +Context *SnapshotCreateRequest::handle_suspend_aio(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + save_result(result); + return send_notify_unquiesce(); + } + + m_writes_blocked = true; + + send_append_op_event(); + return nullptr; +} + +template +void SnapshotCreateRequest::send_append_op_event() { + I &image_ctx = this->m_image_ctx; + if (!this->template append_op_event< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_append_op_event>(this)) { + send_allocate_snap_id(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; +} + +template +Context *SnapshotCreateRequest::handle_append_op_event(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to commit journal entry: " << cpp_strerror(*result) + << dendl; + save_result(result); + return send_notify_unquiesce(); + } + + send_allocate_snap_id(); + return nullptr; +} + +template +void SnapshotCreateRequest::send_allocate_snap_id() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + librados::AioCompletion *rados_completion = create_rados_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_allocate_snap_id>(this); + 
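+  // allocate the snapshot id from the data pool as a RADOS self-managed
+  // snapshot; the cls_rbd snapshot record is written in a separate step,
+  // so an -ESTALE reply there loops back here for a fresh id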
image_ctx.data_ctx.aio_selfmanaged_snap_create(&m_snap_id, rados_completion); + rados_completion->release(); +} + +template +Context *SnapshotCreateRequest::handle_allocate_snap_id(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << ", " + << "snap_id=" << m_snap_id << dendl; + + if (*result < 0) { + lderr(cct) << "failed to allocate snapshot id: " << cpp_strerror(*result) + << dendl; + save_result(result); + return send_notify_unquiesce(); + } + + send_create_snap(); + return nullptr; +} + +template +void SnapshotCreateRequest::send_create_snap() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::shared_lock image_locker{image_ctx.image_lock}; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // save current size / parent info for creating snapshot record in ImageCtx + m_size = image_ctx.size; + m_parent_info = image_ctx.parent_md; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + cls_client::old_snapshot_add(&op, m_snap_id, m_snap_name); + } else { + cls_client::snapshot_add(&op, m_snap_id, m_snap_name, m_snap_namespace); + } + + librados::AioCompletion *rados_completion = create_rados_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_create_snap>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, + rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +template +Context *SnapshotCreateRequest::handle_create_snap(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -ESTALE) { + send_allocate_snap_id(); + return nullptr; + } else if (*result < 0) { + save_result(result); + send_release_snap_id(); + return nullptr; + } + + return send_create_object_map(); +} + +template +Context *SnapshotCreateRequest::send_create_object_map() { + I &image_ctx = this->m_image_ctx; + + image_ctx.image_lock.lock_shared(); + if (image_ctx.object_map == nullptr || m_skip_object_map) { + image_ctx.image_lock.unlock_shared(); + + return send_create_image_state(); + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + image_ctx.object_map->snapshot_add( + m_snap_id, create_context_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_create_object_map>(this)); + image_ctx.image_lock.unlock_shared(); + return nullptr; +} + +template +Context *SnapshotCreateRequest::handle_create_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to snapshot object map: " + << cpp_strerror(*result) << dendl; + + save_result(result); + update_snap_context(); + return send_notify_unquiesce(); + } + + return send_create_image_state(); +} + +template +Context *SnapshotCreateRequest::send_create_image_state() { + I &image_ctx = this->m_image_ctx; + auto mirror_ns = std::get_if( + &m_snap_namespace); + if (mirror_ns == nullptr || !mirror_ns->is_primary()) { + update_snap_context(); + return send_notify_unquiesce(); + } + + CephContext 
*cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + auto req = mirror::snapshot::SetImageStateRequest::create( + &image_ctx, m_snap_id, create_context_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_create_image_state>(this)); + req->send(); + return nullptr; +} + +template +Context *SnapshotCreateRequest::handle_create_image_state(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + update_snap_context(); + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to create image state: " + << cpp_strerror(*result) << dendl; + save_result(result); + } + + return send_notify_unquiesce(); +} + +template +void SnapshotCreateRequest::send_release_snap_id() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + ceph_assert(m_snap_id != CEPH_NOSNAP); + + librados::AioCompletion *rados_completion = create_rados_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_release_snap_id>(this); + image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, rados_completion); + rados_completion->release(); +} + +template +Context *SnapshotCreateRequest::handle_release_snap_id(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + return send_notify_unquiesce(); +} + +template +Context *SnapshotCreateRequest::send_notify_unquiesce() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (m_writes_blocked) { + image_ctx.io_image_dispatcher->unblock_writes(); + } + + if (m_skip_notify_quiesce) { + return this->create_context_finisher(m_ret_val); + } + + ldout(cct, 5) << this << " " << __func__ << dendl; + + image_ctx.image_watcher->notify_unquiesce( + m_request_id, create_context_callback< + SnapshotCreateRequest, + &SnapshotCreateRequest::handle_notify_unquiesce>(this)); + + return nullptr; +} + +template +Context *SnapshotCreateRequest::handle_notify_unquiesce(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to notify unquiesce: " << cpp_strerror(*result) + << dendl; + // ignore error + } + + *result = m_ret_val; + return this->create_context_finisher(m_ret_val); +} + +template +void SnapshotCreateRequest::update_snap_context() { + I &image_ctx = this->m_image_ctx; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::unique_lock image_locker{image_ctx.image_lock}; + if (image_ctx.get_snap_info(m_snap_id) != NULL) { + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + // immediately add a reference to the new snapshot + utime_t snap_time = ceph_clock_now(); + image_ctx.add_snap(m_snap_namespace, m_snap_name, m_snap_id, m_size, + m_parent_info, RBD_PROTECTION_STATUS_UNPROTECTED, + 0, snap_time); + + // immediately start using the new snap context if we + // own the exclusive lock + std::vector snaps; + snaps.push_back(m_snap_id); + snaps.insert(snaps.end(), image_ctx.snapc.snaps.begin(), + image_ctx.snapc.snaps.end()); + + image_ctx.snapc.seq 
= m_snap_id;
+  image_ctx.snapc.snaps.swap(snaps);
+  image_ctx.data_ctx.selfmanaged_snap_set_write_ctx(
+      image_ctx.snapc.seq, image_ctx.snaps);
+  image_ctx.rebuild_data_io_context();
+
+  if (!image_ctx.migration_info.empty()) {
+    auto it = image_ctx.migration_info.snap_map.find(CEPH_NOSNAP);
+    ceph_assert(it != image_ctx.migration_info.snap_map.end());
+    ceph_assert(!it->second.empty());
+    if (it->second[0] == CEPH_NOSNAP) {
+      ldout(cct, 5) << this << " " << __func__
+                    << ": updating migration snap_map" << dendl;
+      it->second[0] = m_snap_id;
+    }
+  }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotCreateRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotCreateRequest.h b/src/librbd/operation/SnapshotCreateRequest.h
new file mode 100644
index 000000000..d306ee21b
--- /dev/null
+++ b/src/librbd/operation/SnapshotCreateRequest.h
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Types.h"
+#include "librbd/operation/Request.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotCreateRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Create goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * STATE_NOTIFY_QUIESCE * * * * * * * * * * * * * * *
+   *    |                                             *
+   *    v                                             *
+   * STATE_SUSPEND_REQUESTS                           *
+   *    |                                             *
+   *    v                                             *
+   * STATE_SUSPEND_AIO * * * * * * * * * * * * * *    *
+   *    |                                         *   *
+   *    v                                         *   *
+   * STATE_APPEND_OP_EVENT (skip if journal       *   *
+   *    |                   disabled)             *   *
+   * (retry) v                                    *   *
+   * . . . > STATE_ALLOCATE_SNAP_ID               *   *
+   * .          |                                 *   *
+   * .          v                                 *   *
+   * . . . . STATE_CREATE_SNAP * * * * * * *      *   *
+   *            |                          *      *   *
+   *            v                          *      *   *
+   * STATE_CREATE_OBJECT_MAP (skip if      *      *   *
+   *    |                     disabled)    *      *   *
+   *    v                                  *      *   *
+   * STATE_CREATE_IMAGE_STATE (skip if     *      *   *
+   *    |              not mirror          *      *   *
+   *    |              snapshot)           *      *   *
+   *    |              v                   *      *   *
+   *    |       STATE_RELEASE_SNAP_ID < * *       *   *
+   *    |              |                          *   *
+   *    |              v                          *   *
+   *    \------> STATE_NOTIFY_UNQUIESCE < * * * * * * *
+   *                   |
+   *                   v
+   *               <finish>
+   *
+   * @endverbatim
+   *
+   * The _CREATE_SNAP state may repeat back to the _ALLOCATE_SNAP_ID state
+   * if a stale snapshot context is allocated. If the create operation needs
+   * to abort, the error path is followed to record the result in the journal
+   * (if enabled) and bubble the originating error code back to the client.
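+   *
+   * A minimal caller sketch (illustrative only; the real invocation,
+   * including journaling and exclusive-lock handling, lives in
+   * librbd::Operations -- image_ctx, on_finish, snap_namespace,
+   * journal_op_tid and prog_ctx are assumed locals):
+   *
+   * @verbatim
+   *   auto req = new SnapshotCreateRequest<>(
+   *       *image_ctx, on_finish, snap_namespace, "snap1",
+   *       journal_op_tid, 0, prog_ctx);  // flags == 0: nothing skipped
+   *   req->send();
+   * @endverbatim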
+ */ + SnapshotCreateRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, uint64_t journal_op_tid, + uint64_t flags, ProgressContext &prog_ctx); + +protected: + void send_op() override; + bool should_complete(int r) override { + return true; + } + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapCreateEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + bool m_skip_object_map; + bool m_skip_notify_quiesce; + bool m_ignore_notify_quiesce_error; + ProgressContext &m_prog_ctx; + + uint64_t m_request_id = 0; + int m_ret_val = 0; + bool m_writes_blocked = false; + + uint64_t m_snap_id = CEPH_NOSNAP; + uint64_t m_size; + ParentImageInfo m_parent_info; + + void send_notify_quiesce(); + Context *handle_notify_quiesce(int *result); + + void send_suspend_requests(); + Context *handle_suspend_requests(int *result); + + void send_suspend_aio(); + Context *handle_suspend_aio(int *result); + + void send_append_op_event(); + Context *handle_append_op_event(int *result); + + void send_allocate_snap_id(); + Context *handle_allocate_snap_id(int *result); + + void send_create_snap(); + Context *handle_create_snap(int *result); + + Context *send_create_object_map(); + Context *handle_create_object_map(int *result); + + Context *send_create_image_state(); + Context *handle_create_image_state(int *result); + + void send_release_snap_id(); + Context *handle_release_snap_id(int *result); + + Context *send_notify_unquiesce(); + Context *handle_notify_unquiesce(int *result); + + void update_snap_context(); + + void save_result(int *result) { + if (m_ret_val == 0 && *result < 0) { + m_ret_val = *result; + } + } +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotCreateRequest; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_CREATE_REQUEST_H diff --git a/src/librbd/operation/SnapshotLimitRequest.cc b/src/librbd/operation/SnapshotLimitRequest.cc new file mode 100644 index 000000000..17aed5f6a --- /dev/null +++ b/src/librbd/operation/SnapshotLimitRequest.cc @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotLimitRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotLimitRequest: " + +namespace librbd { +namespace operation { + +template +SnapshotLimitRequest::SnapshotLimitRequest(I &image_ctx, + Context *on_finish, + uint64_t limit) + : Request(image_ctx, on_finish), m_snap_limit(limit) { +} + +template +void SnapshotLimitRequest::send_op() { + send_limit_snaps(); +} + +template +bool SnapshotLimitRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << " r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template +void SnapshotLimitRequest::send_limit_snaps() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + { + std::shared_lock 
image_locker{image_ctx.image_lock}; + + librados::ObjectWriteOperation op; + cls_client::snapshot_set_limit(&op, m_snap_limit); + + librados::AioCompletion *rados_completion = + this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion, + &op); + ceph_assert(r == 0); + rados_completion->release(); + } +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotLimitRequest; diff --git a/src/librbd/operation/SnapshotLimitRequest.h b/src/librbd/operation/SnapshotLimitRequest.h new file mode 100644 index 000000000..09622a459 --- /dev/null +++ b/src/librbd/operation/SnapshotLimitRequest.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H + +#include "librbd/operation/Request.h" +#include +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class SnapshotLimitRequest : public Request { +public: + SnapshotLimitRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t limit); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapLimitEvent(op_tid, m_snap_limit); + } + +private: + uint64_t m_snap_limit; + + void send_limit_snaps(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotLimitRequest; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_LIMIT_REQUEST_H diff --git a/src/librbd/operation/SnapshotProtectRequest.cc b/src/librbd/operation/SnapshotProtectRequest.cc new file mode 100644 index 000000000..f3b9e7e0b --- /dev/null +++ b/src/librbd/operation/SnapshotProtectRequest.cc @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotProtectRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotProtectRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template +std::ostream& operator<<(std::ostream& os, + const typename SnapshotProtectRequest::State& state) { + switch(state) { + case SnapshotProtectRequest::STATE_PROTECT_SNAP: + os << "PROTECT_SNAP"; + break; + } + return os; +} + +} // anonymous namespace + +template +SnapshotProtectRequest::SnapshotProtectRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) + : Request(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_state(STATE_PROTECT_SNAP) { +} + +template +void SnapshotProtectRequest::send_op() { + send_protect_snap(); +} + +template +bool SnapshotProtectRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0) { + if (r == -EBUSY) { + ldout(cct, 1) << "snapshot is already protected" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + } + return true; +} + +template +void SnapshotProtectRequest::send_protect_snap() { + I &image_ctx = this->m_image_ctx; + 
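+  // the caller dispatches this request with the owner lock held; assert
+  // that before validating and submitting the protect operation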
ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + int r = verify_and_send_protect_snap(); + if (r < 0) { + this->async_complete(r); + return; + } +} + +template +int SnapshotProtectRequest::verify_and_send_protect_snap() { + I &image_ctx = this->m_image_ctx; + std::shared_lock image_locker{image_ctx.image_lock}; + + CephContext *cct = image_ctx.cct; + if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) { + lderr(cct) << "image must support layering" << dendl; + return -ENOSYS; + } + + uint64_t snap_id = image_ctx.get_snap_id(m_snap_namespace, m_snap_name); + if (snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + + bool is_protected; + int r = image_ctx.is_snap_protected(snap_id, &is_protected); + if (r < 0) { + return r; + } + + if (is_protected) { + return -EBUSY; + } + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, snap_id, + RBD_PROTECTION_STATUS_PROTECTED); + + librados::AioCompletion *rados_completion = + this->create_callback_completion(); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, rados_completion, + &op); + ceph_assert(r == 0); + rados_completion->release(); + return 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotProtectRequest; diff --git a/src/librbd/operation/SnapshotProtectRequest.h b/src/librbd/operation/SnapshotProtectRequest.h new file mode 100644 index 000000000..bef80229a --- /dev/null +++ b/src/librbd/operation/SnapshotProtectRequest.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H + +#include "librbd/operation/Request.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class SnapshotProtectRequest : public Request { +public: + /** + * Snap Protect goes through the following state machine: + * + * @verbatim + * + * + * | + * v + * STATE_PROTECT_SNAP + * | + * v + * + * + * @endverbatim + * + */ + enum State { + STATE_PROTECT_SNAP + }; + + SnapshotProtectRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name); + +protected: + void send_op() override; + bool should_complete(int r) override; + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapProtectEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + State m_state; + + void send_protect_snap(); + + int verify_and_send_protect_snap(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotProtectRequest; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_PROTECT_REQUEST_H diff --git a/src/librbd/operation/SnapshotRemoveRequest.cc b/src/librbd/operation/SnapshotRemoveRequest.cc new file mode 100644 index 000000000..f3b4dc62e --- /dev/null +++ b/src/librbd/operation/SnapshotRemoveRequest.cc @@ -0,0 +1,506 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotRemoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" 
+#include "librbd/Utils.h" +#include "librbd/image/DetachChildRequest.h" +#include "librbd/mirror/snapshot/RemoveImageStateRequest.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotRemoveRequest: " << this << " " \ + << __func__ << ": " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_rados_callback; + +template +SnapshotRemoveRequest::SnapshotRemoveRequest( + I &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, uint64_t snap_id) + : Request(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_snap_id(snap_id) { +} + +template +void SnapshotRemoveRequest::send_op() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + { + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.snap_info.find(m_snap_id) == image_ctx.snap_info.end()) { + lderr(cct) << "snapshot doesn't exist" << dendl; + this->async_complete(-ENOENT); + return; + } + } + + trash_snap(); +} + +template +bool SnapshotRemoveRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0 && r != -EBUSY) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template +void SnapshotRemoveRequest::trash_snap() { + I &image_ctx = this->m_image_ctx; + if (image_ctx.old_format) { + release_snap_id(); + return; + } else if (cls::rbd::get_snap_namespace_type(m_snap_namespace) == + cls::rbd::SNAPSHOT_NAMESPACE_TYPE_TRASH) { + get_snap(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + cls_client::snapshot_trash_add(&op, m_snap_id); + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_trash_snap>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void SnapshotRemoveRequest::handle_trash_snap(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + // trash / clone v2 not supported + detach_child(); + return; + } else if (r < 0 && r != -EEXIST) { + lderr(cct) << "failed to move snapshot to trash: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + m_trashed_snapshot = true; + get_snap(); +} + +template +void SnapshotRemoveRequest::get_snap() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::snapshot_get_start(&op, m_snap_id); + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_get_snap>(this); + m_out_bl.clear(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void SnapshotRemoveRequest::handle_get_snap(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == 0) { + cls::rbd::SnapshotInfo snap_info; + + auto it = m_out_bl.cbegin(); + r = cls_client::snapshot_get_finish(&it, &snap_info); + m_child_attached = (snap_info.child_count > 0); + if (r == 0 && 
m_child_attached) { + list_children(); + return; + } + } + + if (r < 0) { + lderr(cct) << "failed to retrieve snapshot: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + detach_child(); +} + +template +void SnapshotRemoveRequest::list_children() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectReadOperation op; + cls_client::children_list_start(&op, m_snap_id); + + m_out_bl.clear(); + m_child_images.clear(); + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_list_children>(this); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op, + &m_out_bl); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void SnapshotRemoveRequest::handle_list_children(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == 0) { + auto it = m_out_bl.cbegin(); + r = cls_client::children_list_finish(&it, &m_child_images); + } + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to retrieve child: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + detach_stale_child(); +} + +template +void SnapshotRemoveRequest::detach_stale_child() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + for (auto& child_image : m_child_images) { + m_child_attached = true; + IoCtx ioctx; + int r = util::create_ioctx(image_ctx.md_ctx, "child image", + child_image.pool_id, + child_image.pool_namespace, &ioctx); + if (r == -ENOENT) { + librados::ObjectWriteOperation op; + cls_client::child_detach(&op, m_snap_id, + {child_image.pool_id, + child_image.pool_namespace, + child_image.image_id}); + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_detach_stale_child>(this); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); + return; + } else if (r < 0) { + this->async_complete(r); + return; + } + } + + detach_child(); +} + +template +void SnapshotRemoveRequest::handle_detach_stale_child(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to detach stale child: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + m_child_attached = false; + list_children(); +} + +template +void SnapshotRemoveRequest::detach_child() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + bool detach_child = false; + { + std::shared_lock image_locker{image_ctx.image_lock}; + + cls::rbd::ParentImageSpec our_pspec; + int r = image_ctx.get_parent_spec(m_snap_id, &our_pspec); + if (r < 0) { + if (r == -ENOENT) { + ldout(cct, 1) << "No such snapshot" << dendl; + } else { + lderr(cct) << "failed to retrieve parent spec" << dendl; + } + + this->async_complete(r); + return; + } + + if (image_ctx.parent_md.spec != our_pspec && + (scan_for_parents(our_pspec) == -ENOENT)) { + // no other references to the parent image + detach_child = true; + } + } + + if (!detach_child) { + // HEAD image or other snapshots still associated with parent + remove_object_map(); + return; + } + + ldout(cct, 5) << dendl; + auto ctx = create_context_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_detach_child>(this); + auto req = image::DetachChildRequest::create(image_ctx, ctx); + 
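+  // DetachChildRequest removes this image's registration against the
+  // parent (clone v1 children directory or clone v2 child reference)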
req->send(); +} + +template +void SnapshotRemoveRequest::handle_detach_child(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to detach child from parent: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + remove_object_map(); +} + +template +void SnapshotRemoveRequest::remove_object_map() { + I &image_ctx = this->m_image_ctx; + if (m_child_attached) { + // if a clone v2 child is attached to this snapshot, we cannot + // proceed. It's only an error if the snap was already in the trash + this->complete(m_trashed_snapshot ? 0 : -EBUSY); + return; + } + + CephContext *cct = image_ctx.cct; + + { + std::shared_lock owner_lock{image_ctx.owner_lock}; + std::unique_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr) { + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_remove_object_map>(this); + image_ctx.object_map->snapshot_remove(m_snap_id, ctx); + return; + } + } + + // object map disabled + remove_image_state(); +} + +template +void SnapshotRemoveRequest::handle_remove_object_map(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove snapshot object map: " << cpp_strerror(r) + << dendl; + this->complete(r); + return; + } + + remove_image_state(); +} + +template +void SnapshotRemoveRequest::remove_image_state() { + I &image_ctx = this->m_image_ctx; + + const auto* info = std::get_if( + &m_snap_namespace); + if (info == nullptr || info->is_orphan()) { + release_snap_id(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + auto ctx = create_context_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_remove_image_state>(this); + auto req = mirror::snapshot::RemoveImageStateRequest::create( + &image_ctx, m_snap_id, ctx); + req->send(); +} + +template +void SnapshotRemoveRequest::handle_remove_image_state(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "failed to remove image state: " << cpp_strerror(r) + << dendl; + if (r != -ENOENT) { + this->complete(r); + return; + } + } + + release_snap_id(); +} + +template +void SnapshotRemoveRequest::release_snap_id() { + I &image_ctx = this->m_image_ctx; + + if (!image_ctx.data_ctx.is_valid()) { + remove_snap(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "snap_name=" << m_snap_name << ", " + << "snap_id=" << m_snap_id << dendl; + + auto aio_comp = create_rados_callback< + SnapshotRemoveRequest, + &SnapshotRemoveRequest::handle_release_snap_id>(this); + image_ctx.data_ctx.aio_selfmanaged_snap_remove(m_snap_id, aio_comp); + aio_comp->release(); +} + +template +void SnapshotRemoveRequest::handle_release_snap_id(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(cct) << "failed to release snap id: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + remove_snap(); +} + +template +void SnapshotRemoveRequest::remove_snap() { + I &image_ctx = this->m_image_ctx; + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + 
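+    // format 1 images track snapshots by name in the image header, while
+    // format 2 images track them by id via cls_rbd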
cls_client::old_snapshot_remove(&op, m_snap_name);
+  } else {
+    cls_client::snapshot_remove(&op, m_snap_id);
+  }
+
+  auto aio_comp = create_rados_callback<
+    SnapshotRemoveRequest<I>,
+    &SnapshotRemoveRequest<I>::handle_remove_snap>(this);
+  int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::handle_remove_snap(int r) {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << "r=" << r << dendl;
+
+  if (r < 0) {
+    lderr(cct) << "failed to remove snapshot: " << cpp_strerror(r) << dendl;
+    this->complete(r);
+    return;
+  }
+
+  remove_snap_context();
+  this->complete(0);
+}
+
+template <typename I>
+void SnapshotRemoveRequest<I>::remove_snap_context() {
+  I &image_ctx = this->m_image_ctx;
+  CephContext *cct = image_ctx.cct;
+  ldout(cct, 5) << dendl;
+
+  std::unique_lock image_locker{image_ctx.image_lock};
+  image_ctx.rm_snap(m_snap_namespace, m_snap_name, m_snap_id);
+}
+
+template <typename I>
+int SnapshotRemoveRequest<I>::scan_for_parents(
+    cls::rbd::ParentImageSpec &pspec) {
+  I &image_ctx = this->m_image_ctx;
+  ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock));
+
+  if (pspec.pool_id != -1) {
+    std::map<librados::snap_t, SnapInfo>::iterator it;
+    for (it = image_ctx.snap_info.begin();
+         it != image_ctx.snap_info.end(); ++it) {
+      // skip our snap id (if checking base image, CEPH_NOSNAP won't match)
+      if (it->first == m_snap_id) {
+        continue;
+      }
+      if (it->second.parent.spec == pspec) {
+        break;
+      }
+    }
+    if (it == image_ctx.snap_info.end()) {
+      return -ENOENT;
+    }
+  }
+  return 0;
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRemoveRequest.h b/src/librbd/operation/SnapshotRemoveRequest.h
new file mode 100644
index 000000000..17638a529
--- /dev/null
+++ b/src/librbd/operation/SnapshotRemoveRequest.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "include/buffer.h"
+#include "librbd/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ImageCtx;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRemoveRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * TRASH_SNAP
+   *    |
+   *    v (skip if unsupported)
+   * GET_SNAP
+   *    |
+   *    v (skip if unnecessary)
+   * LIST_CHILDREN <-------------\
+   *    |                        |
+   *    v (skip if unnecessary)  | (repeat as needed)
+   * DETACH_STALE_CHILD ---------/
+   *    |
+   *    v (skip if unnecessary)
+   * DETACH_CHILD
+   *    |
+   *    v (skip if disabled/in-use)
+   * REMOVE_OBJECT_MAP
+   *    |
+   *    v (skip if not mirror snapshot)
+   * REMOVE_IMAGE_STATE
+   *    |
+   *    v (skip if in-use)
+   * RELEASE_SNAP_ID
+   *    |
+   *    v (skip if in-use)
+   * REMOVE_SNAP
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  static SnapshotRemoveRequest *create(
+      ImageCtxT &image_ctx, const cls::rbd::SnapshotNamespace &snap_namespace,
+      const std::string &snap_name, uint64_t snap_id, Context *on_finish) {
+    return new SnapshotRemoveRequest(image_ctx, on_finish, snap_namespace,
+                                     snap_name, snap_id);
+  }
+
+  SnapshotRemoveRequest(ImageCtxT &image_ctx, Context *on_finish,
+                        const cls::rbd::SnapshotNamespace &snap_namespace,
+                        const std::string &snap_name,
+                        uint64_t snap_id);
+
+protected:
+  void send_op() override;
+  bool should_complete(int r) override;
+
+  journal::Event
create_event(uint64_t op_tid) const override { + return journal::SnapRemoveEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + cls::rbd::ChildImageSpecs m_child_images; + std::string m_snap_name; + uint64_t m_snap_id; + bool m_trashed_snapshot = false; + bool m_child_attached = false; + + ceph::bufferlist m_out_bl; + + void trash_snap(); + void handle_trash_snap(int r); + + void get_snap(); + void handle_get_snap(int r); + + void list_children(); + void handle_list_children(int r); + + void detach_stale_child(); + void handle_detach_stale_child(int r); + + void detach_child(); + void handle_detach_child(int r); + + void remove_object_map(); + void handle_remove_object_map(int r); + + void remove_image_state(); + void handle_remove_image_state(int r); + + void release_snap_id(); + void handle_release_snap_id(int r); + + void remove_snap(); + void handle_remove_snap(int r); + + void remove_snap_context(); + int scan_for_parents(cls::rbd::ParentImageSpec &pspec); + +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotRemoveRequest; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_REMOVE_REQUEST_H diff --git a/src/librbd/operation/SnapshotRenameRequest.cc b/src/librbd/operation/SnapshotRenameRequest.cc new file mode 100644 index 000000000..e9257f18c --- /dev/null +++ b/src/librbd/operation/SnapshotRenameRequest.cc @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotRenameRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/ImageCtx.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotRenameRequest: " + +namespace librbd { +namespace operation { + +namespace { + +template +std::ostream& operator<<(std::ostream& os, + const typename SnapshotRenameRequest::State& state) { + switch(state) { + case SnapshotRenameRequest::STATE_RENAME_SNAP: + os << "RENAME_SNAP"; + break; + } + return os; +} + +} // anonymous namespace + +template +SnapshotRenameRequest::SnapshotRenameRequest(I &image_ctx, + Context *on_finish, + uint64_t snap_id, + const std::string &snap_name) + : Request(image_ctx, on_finish), m_snap_id(snap_id), + m_snap_name(snap_name), m_state(STATE_RENAME_SNAP) { +} + +template +journal::Event SnapshotRenameRequest::create_event(uint64_t op_tid) const { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.image_lock)); + + std::string src_snap_name; + auto snap_info_it = image_ctx.snap_info.find(m_snap_id); + if (snap_info_it != image_ctx.snap_info.end()) { + src_snap_name = snap_info_it->second.name; + } + + return journal::SnapRenameEvent(op_tid, m_snap_id, src_snap_name, + m_snap_name); +} + +template +void SnapshotRenameRequest::send_op() { + send_rename_snap(); +} + +template +bool SnapshotRenameRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0) { + if (r == -EEXIST) { + ldout(cct, 1) << "snapshot already exists" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + } + return true; +} + +template +void SnapshotRenameRequest::send_rename_snap() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + std::shared_lock 
image_locker{image_ctx.image_lock}; + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + librados::ObjectWriteOperation op; + if (image_ctx.old_format) { + cls_client::old_snapshot_rename(&op, m_snap_id, m_snap_name); + } else { + cls_client::snapshot_rename(&op, m_snap_id, m_snap_name); + } + + librados::AioCompletion *rados_completion = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, + rados_completion, &op); + ceph_assert(r == 0); + rados_completion->release(); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotRenameRequest; diff --git a/src/librbd/operation/SnapshotRenameRequest.h b/src/librbd/operation/SnapshotRenameRequest.h new file mode 100644 index 000000000..697772e02 --- /dev/null +++ b/src/librbd/operation/SnapshotRenameRequest.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H + +#include "librbd/operation/Request.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class SnapshotRenameRequest : public Request { +public: + /** + * Snap Rename goes through the following state machine: + * + * @verbatim + * + * + * | + * v + * STATE_RENAME_SNAP + * | + * v + * + * + * @endverbatim + * + */ + enum State { + STATE_RENAME_SNAP + }; + + SnapshotRenameRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t snap_id, const std::string &snap_name); + + journal::Event create_event(uint64_t op_tid) const override; + +protected: + void send_op() override; + bool should_complete(int r) override; + +private: + uint64_t m_snap_id; + std::string m_snap_name; + State m_state; + + void send_rename_snap(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotRenameRequest; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_RENAME_REQUEST_H diff --git a/src/librbd/operation/SnapshotRollbackRequest.cc b/src/librbd/operation/SnapshotRollbackRequest.cc new file mode 100644 index 000000000..87c5212de --- /dev/null +++ b/src/librbd/operation/SnapshotRollbackRequest.cc @@ -0,0 +1,424 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SnapshotRollbackRequest.h" +#include "include/rados/librados.hpp" +#include "common/dout.h" +#include "common/errno.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ImageCtx.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "librbd/operation/ResizeRequest.h" +#include "osdc/Striper.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::SnapshotRollbackRequest: " + +namespace librbd { +namespace operation { + +using util::create_context_callback; +using util::create_rados_callback; + +namespace { + +template +class C_RollbackObject : public C_AsyncObjectThrottle { +public: + C_RollbackObject(AsyncObjectThrottle &throttle, I *image_ctx, + uint64_t snap_id, uint64_t object_num, + uint64_t head_num_objects, + decltype(I::object_map) snap_object_map) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_snap_id(snap_id), + m_object_num(object_num), 
m_head_num_objects(head_num_objects), + m_snap_object_map(snap_object_map) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 20) << "C_RollbackObject: " << __func__ << ": object_num=" + << m_object_num << dendl; + + { + std::shared_lock image_locker{image_ctx.image_lock}; + if (m_object_num < m_head_num_objects && + m_snap_object_map != nullptr && + !image_ctx.object_map->object_may_exist(m_object_num) && + !m_snap_object_map->object_may_exist(m_object_num)) { + return 1; + } + } + + std::string oid = image_ctx.get_object_name(m_object_num); + + librados::ObjectWriteOperation op; + op.selfmanaged_snap_rollback(m_snap_id); + + librados::AioCompletion *rados_completion = + util::create_rados_callback(this); + image_ctx.data_ctx.aio_operate(oid, rados_completion, &op); + rados_completion->release(); + return 0; + } + +private: + uint64_t m_snap_id; + uint64_t m_object_num; + uint64_t m_head_num_objects; + decltype(I::object_map) m_snap_object_map; +}; + +} // anonymous namespace + +template +SnapshotRollbackRequest::SnapshotRollbackRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name, + uint64_t snap_id, + uint64_t snap_size, + ProgressContext &prog_ctx) + : Request(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_snap_id(snap_id), + m_snap_size(snap_size), m_prog_ctx(prog_ctx), + m_object_map(nullptr), m_snap_object_map(nullptr) { +} + +template +SnapshotRollbackRequest::~SnapshotRollbackRequest() { + I &image_ctx = this->m_image_ctx; + if (m_blocking_writes) { + image_ctx.io_image_dispatcher->unblock_writes(); + } + if (m_object_map) { + m_object_map->put(); + m_object_map = nullptr; + } + if (m_snap_object_map) { + m_snap_object_map->put(); + m_snap_object_map = nullptr; + } +} + +template +void SnapshotRollbackRequest::send_op() { + send_block_writes(); +} + +template +void SnapshotRollbackRequest::send_block_writes() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_blocking_writes = true; + image_ctx.io_image_dispatcher->block_writes(create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_block_writes>(this)); +} + +template +Context *SnapshotRollbackRequest::handle_block_writes(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to block writes: " << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_resize_image(); + return nullptr; +} + +template +void SnapshotRollbackRequest::send_resize_image() { + I &image_ctx = this->m_image_ctx; + + uint64_t current_size; + { + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::shared_lock image_locker{image_ctx.image_lock}; + current_size = image_ctx.get_image_size(CEPH_NOSNAP); + } + + m_head_num_objects = Striper::get_num_objects(image_ctx.layout, current_size); + + if (current_size == m_snap_size) { + send_get_snap_object_map(); + return; + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_resize_image>(this); + ResizeRequest *req = 
ResizeRequest::create(image_ctx, ctx, m_snap_size, + true, m_no_op_prog_ctx, 0, true); + req->send(); +} + +template +Context *SnapshotRollbackRequest::handle_resize_image(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to resize image for rollback: " + << cpp_strerror(*result) << dendl; + return this->create_context_finisher(*result); + } + + send_get_snap_object_map(); + return nullptr; +} + +template +void SnapshotRollbackRequest::send_get_snap_object_map() { + I &image_ctx = this->m_image_ctx; + + uint64_t flags = 0; + bool object_map_enabled; + CephContext *cct = image_ctx.cct; + { + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::shared_lock image_locker{image_ctx.image_lock}; + object_map_enabled = (image_ctx.object_map != nullptr); + int r = image_ctx.get_flags(m_snap_id, &flags); + if (r < 0) { + object_map_enabled = false; + } + } + if (object_map_enabled && + (flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) { + lderr(cct) << "warning: object-map is invalid for snapshot" << dendl; + object_map_enabled = false; + } + if (!object_map_enabled) { + send_rollback_object_map(); + return; + } + + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_snap_object_map = image_ctx.create_object_map(m_snap_id); + + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_get_snap_object_map>(this); + m_snap_object_map->open(ctx); + return; +} + +template +Context *SnapshotRollbackRequest::handle_get_snap_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to open object map: " + << cpp_strerror(*result) << dendl; + m_snap_object_map->put(); + m_snap_object_map = nullptr; + } + + send_rollback_object_map(); + return nullptr; +} + +template +void SnapshotRollbackRequest::send_rollback_object_map() { + I &image_ctx = this->m_image_ctx; + + { + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr) { + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_rollback_object_map>(this); + image_ctx.object_map->rollback(m_snap_id, ctx); + return; + } + } + + send_rollback_objects(); +} + +template +Context *SnapshotRollbackRequest::handle_rollback_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to roll back object " + << "map: " << cpp_strerror(*result) << dendl; + + ceph_assert(m_object_map == nullptr); + apply(); + return this->create_context_finisher(*result); + } + + send_rollback_objects(); + return nullptr; +} + +template +void SnapshotRollbackRequest::send_rollback_objects() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + std::shared_lock owner_locker{image_ctx.owner_lock}; + uint64_t num_objects; + { + std::shared_lock image_locker{image_ctx.image_lock}; + num_objects = 
Striper::get_num_objects(image_ctx.layout, + image_ctx.get_current_size()); + } + + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_rollback_objects>(this); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, m_snap_id, boost::lambda::_2, + m_head_num_objects, m_snap_object_map)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, num_objects); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +Context *SnapshotRollbackRequest::handle_rollback_objects(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result == -ERESTART) { + ldout(cct, 5) << "snapshot rollback operation interrupted" << dendl; + return this->create_context_finisher(*result); + } else if (*result < 0) { + lderr(cct) << "failed to rollback objects: " << cpp_strerror(*result) + << dendl; + return this->create_context_finisher(*result); + } + + return send_refresh_object_map(); +} + +template +Context *SnapshotRollbackRequest::send_refresh_object_map() { + I &image_ctx = this->m_image_ctx; + + bool object_map_enabled; + { + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::shared_lock image_locker{image_ctx.image_lock}; + object_map_enabled = (image_ctx.object_map != nullptr); + } + if (!object_map_enabled) { + return send_invalidate_cache(); + } + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_object_map = image_ctx.create_object_map(CEPH_NOSNAP); + + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_refresh_object_map>(this); + m_object_map->open(ctx); + return nullptr; +} + +template +Context *SnapshotRollbackRequest::handle_refresh_object_map(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << this << " " << __func__ << ": failed to open object map: " + << cpp_strerror(*result) << dendl; + m_object_map->put(); + m_object_map = nullptr; + apply(); + + return this->create_context_finisher(*result); + } + + return send_invalidate_cache(); +} + +template +Context *SnapshotRollbackRequest::send_invalidate_cache() { + I &image_ctx = this->m_image_ctx; + + apply(); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + if(m_object_map != nullptr) { + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_invalidate_cache>(this, m_object_map); + image_ctx.io_image_dispatcher->invalidate_cache(ctx); + } + else { + Context *ctx = create_context_callback< + SnapshotRollbackRequest, + &SnapshotRollbackRequest::handle_invalidate_cache>(this); + image_ctx.io_image_dispatcher->invalidate_cache(ctx); + } + return nullptr; +} + +template +Context *SnapshotRollbackRequest::handle_invalidate_cache(int *result) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": r=" << *result << dendl; + + if (*result < 0) { + lderr(cct) << "failed to invalidate cache: " << cpp_strerror(*result) + << dendl; + } + return this->create_context_finisher(*result); +} + 
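+// apply() swaps the freshly opened HEAD object map into the ImageCtx so the
+// in-memory state matches the rolled-back image; the displaced reference is
+// released in this request's destructor.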
+template <typename I>
+void SnapshotRollbackRequest<I>::apply() {
+  I &image_ctx = this->m_image_ctx;
+
+  std::shared_lock owner_locker{image_ctx.owner_lock};
+  std::unique_lock image_locker{image_ctx.image_lock};
+  if (image_ctx.object_map != nullptr) {
+    std::swap(m_object_map, image_ctx.object_map);
+  }
+}
+
+} // namespace operation
+} // namespace librbd
+
+template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
diff --git a/src/librbd/operation/SnapshotRollbackRequest.h b/src/librbd/operation/SnapshotRollbackRequest.h
new file mode 100644
index 000000000..e58a618f2
--- /dev/null
+++ b/src/librbd/operation/SnapshotRollbackRequest.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+#define CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
+
+#include "librbd/operation/Request.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/journal/Types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+class ProgressContext;
+
+namespace operation {
+
+template <typename ImageCtxT = ImageCtx>
+class SnapshotRollbackRequest : public Request<ImageCtxT> {
+public:
+  /**
+   * Snap Rollback goes through the following state machine:
+   *
+   * @verbatim
+   *
+   * <start> ---------\
+   *                  |
+   *                  v
+   *          STATE_BLOCK_WRITES
+   *                  |
+   *                  v
+   *          STATE_RESIZE_IMAGE (skip if resize not
+   *                  |           required)
+   *                  v
+   *          STATE_GET_SNAP_OBJECT_MAP (skip if object
+   *                  |                  map disabled)
+   *                  v
+   *          STATE_ROLLBACK_OBJECT_MAP (skip if object
+   *                  |                  map disabled)
+   *                  v
+   *          STATE_ROLLBACK_OBJECTS
+   *                  |
+   *                  v
+   *          STATE_REFRESH_OBJECT_MAP (skip if object
+   *                  |                 map disabled)
+   *                  v
+   *          STATE_INVALIDATE_CACHE (skip if cache
+   *                  |               disabled)
+   *                  v
+   *              <finish>
+   *
+   * @endverbatim
+   *
+   * The _RESIZE_IMAGE state is skipped if the image doesn't need to be resized.
+   * The _ROLLBACK_OBJECT_MAP state is skipped if the object map isn't enabled.
+   * The _INVALIDATE_CACHE state is skipped if the cache isn't enabled.
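+   *
+   * A minimal caller sketch (illustrative only; rollback is normally
+   * driven through librbd::Operations with the exclusive lock held --
+   * image_ctx, on_finish, snap_namespace, snap_id and snap_size are
+   * assumed locals):
+   *
+   * @verbatim
+   *   librbd::NoOpProgressContext prog_ctx;
+   *   auto req = new SnapshotRollbackRequest<>(
+   *       *image_ctx, on_finish, snap_namespace, "snap1",
+   *       snap_id, snap_size, prog_ctx);
+   *   req->send();
+   * @endverbatim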
+   */
+
+  SnapshotRollbackRequest(ImageCtxT &image_ctx, Context *on_finish,
+                          const cls::rbd::SnapshotNamespace &snap_namespace,
+                          const std::string &snap_name,
+                          uint64_t snap_id,
+                          uint64_t snap_size, ProgressContext &prog_ctx);
+  ~SnapshotRollbackRequest() override;
+
+protected:
+  void send_op() override;
+  bool should_complete(int r) override {
+    return true;
+  }
+
+  journal::Event create_event(uint64_t op_tid) const override {
+    return journal::SnapRollbackEvent(op_tid, m_snap_namespace, m_snap_name);
+  }
+
+private:
+  cls::rbd::SnapshotNamespace m_snap_namespace;
+  std::string m_snap_name;
+  uint64_t m_snap_id;
+  uint64_t m_snap_size;
+  uint64_t m_head_num_objects;
+  ProgressContext &m_prog_ctx;
+
+  NoOpProgressContext m_no_op_prog_ctx;
+
+  bool m_blocking_writes = false;
+  decltype(ImageCtxT::object_map) m_object_map;
+  decltype(ImageCtxT::object_map) m_snap_object_map;
+
+  void send_block_writes();
+  Context *handle_block_writes(int *result);
+
+  void send_resize_image();
+  Context *handle_resize_image(int *result);
+
+  void send_get_snap_object_map();
+  Context *handle_get_snap_object_map(int *result);
+
+  void send_rollback_object_map();
+  Context *handle_rollback_object_map(int *result);
+
+  void send_rollback_objects();
+  Context *handle_rollback_objects(int *result);
+
+  Context *send_refresh_object_map();
+  Context *handle_refresh_object_map(int *result);
+
+  Context *send_invalidate_cache();
+  Context *handle_invalidate_cache(int *result);
+
+  void apply();
+};
+
+} // namespace operation
+} // namespace librbd
+
+extern template class librbd::operation::SnapshotRollbackRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_ROLLBACK_REQUEST_H
diff --git a/src/librbd/operation/SnapshotUnprotectRequest.cc b/src/librbd/operation/SnapshotUnprotectRequest.cc
new file mode 100644
index 000000000..76caf68f3
--- /dev/null
+++ b/src/librbd/operation/SnapshotUnprotectRequest.cc
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/operation/SnapshotUnprotectRequest.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsyncObjectThrottle.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/Types.h"
+#include "librbd/Utils.h"
+#include <list>
+#include <set>
+#include <vector>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::SnapshotUnprotectRequest: "
+
+namespace librbd {
+namespace operation {
+
+namespace {
+
+typedef std::pair<int64_t, std::string> Pool;
+typedef std::vector<Pool> Pools;
+
+template <typename I>
+std::ostream& operator<<(std::ostream& os,
+                         const typename SnapshotUnprotectRequest<I>::State& state) {
+  switch (state) {
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_START:
+    os << "UNPROTECT_SNAP_START";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_SCAN_POOL_CHILDREN:
+    os << "SCAN_POOL_CHILDREN";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_FINISH:
+    os << "UNPROTECT_SNAP_FINISH";
+    break;
+  case SnapshotUnprotectRequest<I>::STATE_UNPROTECT_SNAP_ROLLBACK:
+    os << "UNPROTECT_SNAP_ROLLBACK";
+    break;
+  default:
+    os << "UNKNOWN (" << static_cast<uint32_t>(state) << ")";
+    break;
+  }
+  return os;
+}
+
+template <typename I>
+class C_ScanPoolChildren : public C_AsyncObjectThrottle<I> {
+public:
+  C_ScanPoolChildren(AsyncObjectThrottle<I> &throttle, I *image_ctx,
+                     const cls::rbd::ParentImageSpec &pspec, const Pools &pools,
+                     size_t pool_idx)
+    :
C_AsyncObjectThrottle(throttle, *image_ctx), m_pspec(pspec), + m_pool(pools[pool_idx]) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " scanning pool '" << m_pool.second << "'" + << dendl; + + librados::Rados rados(image_ctx.md_ctx); + int64_t base_tier; + int r = rados.pool_get_base_tier(m_pool.first, &base_tier); + if (r == -ENOENT) { + ldout(cct, 1) << "pool '" << m_pool.second << "' no longer exists" + << dendl; + return 1; + } else if (r < 0) { + lderr(cct) << "error retrieving base tier for pool '" + << m_pool.second << "'" << dendl; + return r; + } + if (m_pool.first != base_tier) { + // pool is a cache; skip it + return 1; + } + + r = util::create_ioctx(image_ctx.md_ctx, "child image", m_pool.first, {}, + &m_pool_ioctx); + if (r == -ENOENT) { + return 1; + } else if (r < 0) { + return r; + } + + librados::ObjectReadOperation op; + cls_client::get_children_start(&op, m_pspec); + + librados::AioCompletion *rados_completion = + util::create_rados_callback(this); + r = m_pool_ioctx.aio_operate(RBD_CHILDREN, rados_completion, &op, + &m_children_bl); + ceph_assert(r == 0); + rados_completion->release(); + return 0; + } + +protected: + void finish(int r) override { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (r == 0) { + auto it = m_children_bl.cbegin(); + r= cls_client::get_children_finish(&it, &m_children); + } + + ldout(cct, 10) << this << " retrieved children: r=" << r << dendl; + if (r == -ENOENT) { + // no children -- proceed with unprotect + r = 0; + } else if (r < 0) { + lderr(cct) << "cannot get children for pool '" << m_pool.second << "'" + << dendl; + } else { + lderr(cct) << "cannot unprotect: at least " << m_children.size() << " " + << "child(ren) [" << joinify(m_children.begin(), + m_children.end(), + std::string(",")) << "] " + << "in pool '" << m_pool.second << "'" << dendl; + r = -EBUSY; + } + C_AsyncObjectThrottle::finish(r); + } + +private: + cls::rbd::ParentImageSpec m_pspec; + Pool m_pool; + + IoCtx m_pool_ioctx; + std::set m_children; + bufferlist m_children_bl; +}; + +} // anonymous namespace + +template +SnapshotUnprotectRequest::SnapshotUnprotectRequest(I &image_ctx, + Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name) + : Request(image_ctx, on_finish), m_snap_namespace(snap_namespace), + m_snap_name(snap_name), m_state(STATE_UNPROTECT_SNAP_START), + m_ret_val(0), m_snap_id(CEPH_NOSNAP) { +} + +template +void SnapshotUnprotectRequest::send_op() { + send_unprotect_snap_start(); +} + +template +bool SnapshotUnprotectRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << ": state=" << m_state << ", " + << "r=" << r << dendl; + if (r < 0) { + if (r == -EINVAL) { + ldout(cct, 1) << "snapshot is already unprotected" << dendl; + } else { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + if (m_ret_val == 0) { + m_ret_val = r; + } + } + + // use a different state machine once an error is encountered + if (m_ret_val < 0) { + return should_complete_error(); + } + + std::shared_lock owner_lock{image_ctx.owner_lock}; + bool finished = false; + switch (m_state) { + case STATE_UNPROTECT_SNAP_START: + send_scan_pool_children(); + break; + case STATE_SCAN_POOL_CHILDREN: + send_unprotect_snap_finish(); + break; + case 
STATE_UNPROTECT_SNAP_FINISH: + finished = true; + break; + default: + ceph_abort(); + break; + } + return finished; +} + +template +bool SnapshotUnprotectRequest::should_complete_error() { + I &image_ctx = this->m_image_ctx; + std::shared_lock owner_locker{image_ctx.owner_lock}; + CephContext *cct = image_ctx.cct; + lderr(cct) << this << " " << __func__ << ": " + << "ret_val=" << m_ret_val << dendl; + + bool finished = true; + if (m_state == STATE_SCAN_POOL_CHILDREN || + m_state == STATE_UNPROTECT_SNAP_FINISH) { + send_unprotect_snap_rollback(); + finished = false; + } + return finished; +} + +template +void SnapshotUnprotectRequest::send_unprotect_snap_start() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + int r = verify_and_send_unprotect_snap_start(); + if (r < 0) { + this->async_complete(r); + return; + } +} + +template +void SnapshotUnprotectRequest::send_scan_pool_children() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + m_state = STATE_SCAN_POOL_CHILDREN; + + // search all pools for children depending on this snapshot + // TODO add async version of wait_for_latest_osdmap + librados::Rados rados(image_ctx.md_ctx); + rados.wait_for_latest_osdmap(); + + // protect against pools being renamed/deleted + std::list pool_list; + rados.pool_list2(pool_list); + + cls::rbd::ParentImageSpec pspec(image_ctx.md_ctx.get_id(), + image_ctx.md_ctx.get_namespace(), + image_ctx.id, m_snap_id); + Pools pools(pool_list.begin(), pool_list.end()); + + Context *ctx = this->create_callback_context(); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, pspec, pools, boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + nullptr, image_ctx, context_factory, ctx, NULL, 0, pools.size()); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +void SnapshotUnprotectRequest::send_unprotect_snap_finish() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_state = STATE_UNPROTECT_SNAP_FINISH; + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, m_snap_id, + RBD_PROTECTION_STATUS_UNPROTECTED); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +void SnapshotUnprotectRequest::send_unprotect_snap_rollback() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " " << __func__ << dendl; + + m_state = STATE_UNPROTECT_SNAP_ROLLBACK; + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, m_snap_id, + RBD_PROTECTION_STATUS_PROTECTED); + + librados::AioCompletion *comp = this->create_callback_completion(); + int r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); +} + +template +int SnapshotUnprotectRequest::verify_and_send_unprotect_snap_start() { + I &image_ctx = 
this->m_image_ctx; + std::shared_lock image_locker{image_ctx.image_lock}; + + CephContext *cct = image_ctx.cct; + if ((image_ctx.features & RBD_FEATURE_LAYERING) == 0) { + lderr(cct) << "image must support layering" << dendl; + return -ENOSYS; + } + + m_snap_id = image_ctx.get_snap_id(m_snap_namespace, m_snap_name); + if (m_snap_id == CEPH_NOSNAP) { + return -ENOENT; + } + + bool is_unprotected; + int r = image_ctx.is_snap_unprotected(m_snap_id, &is_unprotected); + if (r < 0) { + return r; + } + + if (is_unprotected) { + lderr(cct) << "snapshot is already unprotected" << dendl; + return -EINVAL; + } + + librados::ObjectWriteOperation op; + cls_client::set_protection_status(&op, m_snap_id, + RBD_PROTECTION_STATUS_UNPROTECTING); + + librados::AioCompletion *comp = this->create_callback_completion(); + r = image_ctx.md_ctx.aio_operate(image_ctx.header_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + + // TODO legacy code threw a notification post UNPROTECTING update -- required? + return 0; +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SnapshotUnprotectRequest; diff --git a/src/librbd/operation/SnapshotUnprotectRequest.h b/src/librbd/operation/SnapshotUnprotectRequest.h new file mode 100644 index 000000000..19cc6d32b --- /dev/null +++ b/src/librbd/operation/SnapshotUnprotectRequest.h @@ -0,0 +1,94 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H + +#include "librbd/operation/Request.h" +#include + +class Context; + +namespace librbd { + +class ImageCtx; + +namespace operation { + +template +class SnapshotUnprotectRequest : public Request { +public: + /** + * Snap Unprotect goes through the following state machine: + * + * @verbatim + * + * + * | + * v + * STATE_UNPROTECT_SNAP_START + * | + * v + * STATE_SCAN_POOL_CHILDREN * * * * > STATE_UNPROTECT_SNAP_ROLLBACK + * | | + * v | + * STATE_UNPROTECT_SNAP_FINISH | + * | | + * v | + * <----------------------------/ + * + * @endverbatim + * + * If the unprotect operation needs to abort, the error path is followed + * to rollback the unprotect in-progress status on the image. 
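+   *
+   * The rollback path re-protects the snapshot, so a failed unprotect never
+   * leaves the image stuck in the UNPROTECTING state.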
+ */ + enum State { + STATE_UNPROTECT_SNAP_START, + STATE_SCAN_POOL_CHILDREN, + STATE_UNPROTECT_SNAP_FINISH, + STATE_UNPROTECT_SNAP_ROLLBACK + }; + + SnapshotUnprotectRequest(ImageCtxT &image_ctx, Context *on_finish, + const cls::rbd::SnapshotNamespace &snap_namespace, + const std::string &snap_name); + +protected: + void send_op() override; + bool should_complete(int r) override; + + int filter_return_code(int r) const override { + if (m_ret_val < 0) { + return m_ret_val; + } + return 0; + } + + journal::Event create_event(uint64_t op_tid) const override { + return journal::SnapUnprotectEvent(op_tid, m_snap_namespace, m_snap_name); + } + +private: + cls::rbd::SnapshotNamespace m_snap_namespace; + std::string m_snap_name; + State m_state; + + int m_ret_val; + uint64_t m_snap_id; + + bool should_complete_error(); + + void send_unprotect_snap_start(); + void send_scan_pool_children(); + void send_unprotect_snap_finish(); + void send_unprotect_snap_rollback(); + + int verify_and_send_unprotect_snap_start(); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SnapshotUnprotectRequest; + +#endif // CEPH_LIBRBD_OPERATION_SNAPSHOT_UNPROTECT_REQUEST_H diff --git a/src/librbd/operation/SparsifyRequest.cc b/src/librbd/operation/SparsifyRequest.cc new file mode 100644 index 000000000..ef7fc78f5 --- /dev/null +++ b/src/librbd/operation/SparsifyRequest.cc @@ -0,0 +1,519 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/SparsifyRequest.h" +#include "cls/rbd/cls_rbd_client.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/err.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/Types.h" +#include "librbd/io/ObjectRequest.h" +#include "librbd/io/Utils.h" +#include "osdc/Striper.h" +#include +#include + +#define dout_subsys ceph_subsys_rbd + +namespace librbd { +namespace operation { + +namespace { + +bool may_be_trimmed(const std::map &extent_map, + const bufferlist &bl, size_t sparse_size, + uint64_t *new_end_ptr) { + if (extent_map.empty()) { + *new_end_ptr = 0; + return true; + } + + uint64_t end = extent_map.rbegin()->first + extent_map.rbegin()->second; + uint64_t new_end = end; + uint64_t bl_off = bl.length(); + + for (auto it = extent_map.rbegin(); it != extent_map.rend(); it++) { + auto off = it->first; + auto len = it->second; + + new_end = p2roundup(off + len, sparse_size); + + uint64_t extent_left = len; + uint64_t sub_len = len % sparse_size; + if (sub_len == 0) { + sub_len = sparse_size; + } + while (extent_left > 0) { + ceph_assert(bl_off >= sub_len); + bl_off -= sub_len; + bufferlist sub_bl; + sub_bl.substr_of(bl, bl_off, sub_len); + if (!sub_bl.is_zero()) { + break; + } + new_end -= sparse_size; + extent_left -= sub_len; + sub_len = sparse_size; + } + if (extent_left > 0) { + break; + } + } + + if (new_end < end) { + *new_end_ptr = new_end; + return true; + } + + return false; +} + +} // anonymous namespace + +using util::create_context_callback; +using util::create_rados_callback; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::SparsifyObject: " << this \ + << " " << m_oid << " " << __func__ << ": " + +template +class C_SparsifyObject : public C_AsyncObjectThrottle { +public: + + /** + * @verbatim + * + * + * | + * v (not supported) + * SPARSIFY * * * * * * * * * * * * > READ < * * * * * * * * * * (concurrent + * | | * update is + * | 
(object map disabled) | (can trim) * detected) + * |------------------------\ V * + * | | PRE UPDATE OBJECT MAP * + * | (object map enabled) | | (if needed) * + * v | V * + * PRE UPDATE OBJECT MAP | TRIM * * * * * * * * * * * + * | | | + * v | V + * CHECK EXISTS | POST UPDATE OBJECT MAP + * | | | (if needed) + * v | | + * POST UPDATE OBJECT MAP | | + * | | | + * v | | + * <------------------/<-------/ + * + * @endverbatim + * + */ + + C_SparsifyObject(AsyncObjectThrottle &throttle, I *image_ctx, + uint64_t object_no, size_t sparse_size) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_cct(image_ctx->cct), + m_object_no(object_no), m_sparse_size(sparse_size), + m_oid(image_ctx->get_object_name(object_no)) { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + ldout(m_cct, 20) << dendl; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(m_cct) << "missing data pool" << dendl; + return -ENODEV; + } + + if (image_ctx.exclusive_lock != nullptr && + !image_ctx.exclusive_lock->is_lock_owner()) { + ldout(m_cct, 1) << "lost exclusive lock during sparsify" << dendl; + return -ERESTART; + } + + { + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr && + !image_ctx.object_map->object_may_exist(m_object_no)) { + // can skip because the object does not exist + return 1; + } + + uint64_t raw_overlap = 0; + uint64_t object_overlap = 0; + int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &raw_overlap); + ceph_assert(r == 0); + if (raw_overlap > 0) { + auto [parent_extents, area] = io::util::object_to_area_extents( + &image_ctx, m_object_no, {{0, image_ctx.layout.object_size}}); + object_overlap = image_ctx.prune_parent_extents(parent_extents, area, + raw_overlap, false); + } + m_remove_empty = object_overlap == 0; + } + + send_sparsify(); + return 0; + } + + void send_sparsify() { + I &image_ctx = this->m_image_ctx; + ldout(m_cct, 20) << dendl; + + librados::ObjectWriteOperation op; + cls_client::sparsify(&op, m_sparse_size, m_remove_empty); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_sparsify>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void handle_sparsify(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r == -EOPNOTSUPP) { + m_trying_trim = true; + send_read(); + return; + } + + if (r == -ENOENT) { + finish_op(0); + return; + } + + if (r < 0) { + lderr(m_cct) << "failed to sparsify: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + send_pre_update_object_map(); + } + + void send_pre_update_object_map() { + I &image_ctx = this->m_image_ctx; + + if (m_trying_trim) { + if (!m_remove_empty || m_new_end != 0 || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + send_trim(); + return; + } + } else if (!m_remove_empty || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + finish_op(0); + return; + } + + ldout(m_cct, 20) << dendl; + + image_ctx.owner_lock.lock_shared(); + image_ctx.image_lock.lock_shared(); + if (image_ctx.object_map == nullptr) { + // possible that exclusive lock was lost in background + lderr(m_cct) << "object map is not initialized" << dendl; + + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + finish_op(-EINVAL); + return; + } + + int r; + m_finish_op_ctx = image_ctx.exclusive_lock->start_op(&r); + if (m_finish_op_ctx == nullptr) { + lderr(m_cct) << "lost exclusive lock" << dendl; + 
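+      // unwind the shared locks before failing the op with start_op()'s error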
image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + finish_op(r); + return; + } + + auto ctx = create_context_callback< + C_SparsifyObject, + &C_SparsifyObject::handle_pre_update_object_map>(this); + + bool sent = image_ctx.object_map->template aio_update< + Context, &Context::complete>(CEPH_NOSNAP, m_object_no, OBJECT_PENDING, + OBJECT_EXISTS, {}, false, ctx); + + // NOTE: state machine might complete before we reach here + image_ctx.image_lock.unlock_shared(); + image_ctx.owner_lock.unlock_shared(); + if (!sent) { + finish_op(0); + } + } + + void handle_pre_update_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + finish_op(r); + return; + } + + if (m_trying_trim) { + send_trim(); + } else { + send_check_exists(); + } + } + + void send_check_exists() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + op.stat(NULL, NULL, NULL); + m_bl.clear(); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_check_exists>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl); + ceph_assert(r == 0); + comp->release(); + } + + void handle_check_exists(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "stat failed: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + send_post_update_object_map(r == 0); + } + + void send_post_update_object_map(bool exists) { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + auto ctx = create_context_callback< + C_SparsifyObject, + &C_SparsifyObject::handle_post_update_object_map>(this); + bool sent; + { + std::shared_lock owner_locker{image_ctx.owner_lock}; + std::shared_lock image_locker{image_ctx.image_lock}; + + assert(image_ctx.exclusive_lock->is_lock_owner()); + assert(image_ctx.object_map != nullptr); + + sent = image_ctx.object_map->template aio_update< + Context, &Context::complete>(CEPH_NOSNAP, m_object_no, + exists ? 
OBJECT_EXISTS : OBJECT_NONEXISTENT, + OBJECT_PENDING, {}, false, ctx); + } + if (!sent) { + ctx->complete(0); + } + } + + void handle_post_update_object_map(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + lderr(m_cct) << "failed to update object map: " << cpp_strerror(r) + << dendl; + finish_op(r); + return; + } + + finish_op(0); + } + + void send_read() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + librados::ObjectReadOperation op; + m_bl.clear(); + op.sparse_read(0, image_ctx.layout.object_size, &m_extent_map, &m_bl, + nullptr); + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_read>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op, &m_bl); + ceph_assert(r == 0); + comp->release(); + } + + void handle_read(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r < 0) { + if (r == -ENOENT) { + r = 0; + } else { + lderr(m_cct) << "failed to read object: " << cpp_strerror(r) << dendl; + } + finish_op(r); + return; + } + + if (!may_be_trimmed(m_extent_map, m_bl, m_sparse_size, &m_new_end)) { + finish_op(0); + return; + } + + send_pre_update_object_map(); + } + + void send_trim() { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << dendl; + + ceph_assert(m_new_end < image_ctx.layout.object_size); + + librados::ObjectWriteOperation op; + m_bl.clear(); + m_bl.append_zero(image_ctx.layout.object_size - m_new_end); + op.cmpext(m_new_end, m_bl, nullptr); + if (m_new_end == 0 && m_remove_empty) { + op.remove(); + } else { + op.truncate(m_new_end); + } + + auto comp = create_rados_callback< + C_SparsifyObject, &C_SparsifyObject::handle_trim>(this); + int r = image_ctx.data_ctx.aio_operate(m_oid, comp, &op); + ceph_assert(r == 0); + comp->release(); + } + + void handle_trim(int r) { + I &image_ctx = this->m_image_ctx; + + ldout(m_cct, 20) << "r=" << r << dendl; + + if (r <= -MAX_ERRNO) { + m_finish_op_ctx->complete(0); + m_finish_op_ctx = nullptr; + send_read(); + return; + } + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to trim: " << cpp_strerror(r) << dendl; + finish_op(r); + return; + } + + if (!m_remove_empty || m_new_end != 0 || + !image_ctx.test_features(RBD_FEATURE_OBJECT_MAP)) { + finish_op(0); + return; + } + + send_post_update_object_map(false); + } + + void finish_op(int r) { + ldout(m_cct, 20) << "r=" << r << dendl; + + if (m_finish_op_ctx != nullptr) { + m_finish_op_ctx->complete(0); + } + this->complete(r); + } + +private: + CephContext *m_cct; + uint64_t m_object_no; + size_t m_sparse_size; + std::string m_oid; + + bool m_remove_empty = false; + bool m_trying_trim = false; + bufferlist m_bl; + std::map m_extent_map; + uint64_t m_new_end = 0; + Context *m_finish_op_ctx = nullptr; +}; + +#undef dout_prefix +#define dout_prefix *_dout << "librbd::operation::SparsifyRequest: " << this \ + << " " << __func__ << ": " + +template +bool SparsifyRequest::should_complete(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + if (r < 0) { + lderr(cct) << "encountered error: " << cpp_strerror(r) << dendl; + } + return true; +} + +template +void SparsifyRequest::send_op() { + sparsify_objects(); +} + +template +void SparsifyRequest::sparsify_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << dendl; + + assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + uint64_t objects = 0; + { + 
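+    // sample the object count under the image lock, then drop the lock
+    // before fanning out the per-object sparsify requests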
std::shared_lock image_locker{image_ctx.image_lock}; + objects = image_ctx.get_object_count(CEPH_NOSNAP); + } + + auto ctx = create_context_callback< + SparsifyRequest, + &SparsifyRequest::handle_sparsify_objects>(this); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, boost::lambda::_2, m_sparse_size)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, image_ctx, context_factory, ctx, &m_prog_ctx, 0, objects); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +void SparsifyRequest::handle_sparsify_objects(int r) { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << "r=" << r << dendl; + + if (r == -ERESTART) { + ldout(cct, 5) << "sparsify operation interrupted" << dendl; + this->complete(r); + return; + } else if (r < 0) { + lderr(cct) << "sparsify encountered an error: " << cpp_strerror(r) << dendl; + this->complete(r); + return; + } + + this->complete(0); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::SparsifyRequest; diff --git a/src/librbd/operation/SparsifyRequest.h b/src/librbd/operation/SparsifyRequest.h new file mode 100644 index 000000000..74f9eb727 --- /dev/null +++ b/src/librbd/operation/SparsifyRequest.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H +#define CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H + +#include "librbd/operation/Request.h" +#include "common/snap_types.h" + +namespace librbd { + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template +class SparsifyRequest : public Request +{ +public: + SparsifyRequest(ImageCtxT &image_ctx, size_t sparse_size, Context *on_finish, + ProgressContext &prog_ctx) + : Request(image_ctx, on_finish), m_sparse_size(sparse_size), + m_prog_ctx(prog_ctx) { + } + +protected: + void send_op() override; + bool should_complete(int r) override; + bool can_affect_io() const override { + return true; + } + journal::Event create_event(uint64_t op_tid) const override { + ceph_abort(); + return journal::UnknownEvent(); + } + +private: + /** + * @verbatim + * + * + * | + * v + * SPARSIFY OBJECTS + * | + * v + * + * + * @endverbatim + */ + + size_t m_sparse_size; + ProgressContext &m_prog_ctx; + + void sparsify_objects(); + void handle_sparsify_objects(int r); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::SparsifyRequest; + +#endif // CEPH_LIBRBD_OPERATION_SPARSIFY_REQUEST_H diff --git a/src/librbd/operation/TrimRequest.cc b/src/librbd/operation/TrimRequest.cc new file mode 100644 index 000000000..6c6685f2b --- /dev/null +++ b/src/librbd/operation/TrimRequest.cc @@ -0,0 +1,382 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/operation/TrimRequest.h" +#include "librbd/AsyncObjectThrottle.h" +#include "librbd/ExclusiveLock.h" +#include "librbd/ImageCtx.h" +#include "librbd/internal.h" +#include "librbd/ObjectMap.h" +#include "librbd/Utils.h" +#include "librbd/io/ObjectDispatchSpec.h" +#include "librbd/io/ObjectDispatcherInterface.h" +#include "common/ContextCompletion.h" +#include "common/dout.h" +#include "common/errno.h" +#include "osdc/Striper.h" + +#include +#include +#include + +#define dout_subsys ceph_subsys_rbd +#undef 
dout_prefix +#define dout_prefix *_dout << "librbd::TrimRequest: " + +namespace librbd { +namespace operation { + +template +class C_CopyupObject : public C_AsyncObjectThrottle { +public: + C_CopyupObject(AsyncObjectThrottle &throttle, I *image_ctx, + IOContext io_context, uint64_t object_no) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_io_context(io_context), + m_object_no(object_no) + { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + std::string oid = image_ctx.get_object_name(m_object_no); + ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl; + + auto object_dispatch_spec = io::ObjectDispatchSpec::create_discard( + &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, m_object_no, 0, + image_ctx.layout.object_size, m_io_context, + io::OBJECT_DISCARD_FLAG_DISABLE_OBJECT_MAP_UPDATE, 0, {}, this); + object_dispatch_spec->send(); + return 0; + } +private: + IOContext m_io_context; + uint64_t m_object_no; +}; + +template +class C_RemoveObject : public C_AsyncObjectThrottle { +public: + C_RemoveObject(AsyncObjectThrottle &throttle, I *image_ctx, + uint64_t object_no) + : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no) + { + } + + int send() override { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + + { + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr && + !image_ctx.object_map->object_may_exist(m_object_no)) { + return 1; + } + } + + std::string oid = image_ctx.get_object_name(m_object_no); + ldout(image_ctx.cct, 10) << "removing " << oid << dendl; + + librados::AioCompletion *rados_completion = + util::create_rados_callback(this); + int r = image_ctx.data_ctx.aio_remove(oid, rados_completion); + ceph_assert(r == 0); + rados_completion->release(); + return 0; + } + +private: + uint64_t m_object_no; +}; + +template +TrimRequest::TrimRequest(I &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx) + : AsyncRequest(image_ctx, on_finish), m_new_size(new_size), + m_prog_ctx(prog_ctx) +{ + uint64_t period = image_ctx.get_stripe_period(); + uint64_t new_num_periods = ((m_new_size + period - 1) / period); + m_delete_off = std::min(new_num_periods * period, original_size); + // first object we can delete free and clear + m_delete_start = new_num_periods * image_ctx.get_stripe_count(); + m_delete_start_min = m_delete_start; + m_num_objects = Striper::get_num_objects(image_ctx.layout, original_size); + + CephContext *cct = image_ctx.cct; + ldout(cct, 10) << this << " trim image " << original_size << " -> " + << m_new_size << " periods " << new_num_periods + << " discard to offset " << m_delete_off + << " delete objects " << m_delete_start + << " to " << m_num_objects << dendl; +} + +template +bool TrimRequest::should_complete(int r) +{ + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + ldout(cct, 5) << this << " should_complete: r=" << r << dendl; + if (r == -ERESTART) { + ldout(cct, 5) << "trim operation interrupted" << dendl; + return true; + } else if (r < 0) { + lderr(cct) << "trim encountered an error: " << cpp_strerror(r) << dendl; + return true; + } + + std::shared_lock owner_lock{image_ctx.owner_lock}; + switch (m_state) { 
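+  // each completed stage dispatches the next send_*() while owner_lock is held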
+ case STATE_PRE_TRIM: + ldout(cct, 5) << " PRE_TRIM" << dendl; + send_copyup_objects(); + break; + + case STATE_COPYUP_OBJECTS: + ldout(cct, 5) << " COPYUP_OBJECTS" << dendl; + send_remove_objects(); + break; + + case STATE_REMOVE_OBJECTS: + ldout(cct, 5) << " REMOVE_OBJECTS" << dendl; + send_post_trim(); + break; + + case STATE_POST_TRIM: + ldout(cct, 5) << " POST_TRIM" << dendl; + send_clean_boundary(); + break; + + case STATE_CLEAN_BOUNDARY: + ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl; + send_finish(0); + break; + + case STATE_FINISHED: + ldout(cct, 5) << "FINISHED" << dendl; + return true; + + default: + lderr(cct) << "invalid state: " << m_state << dendl; + ceph_abort(); + break; + } + return false; +} + +template +void TrimRequest::send() { + I &image_ctx = this->m_image_ctx; + CephContext *cct = image_ctx.cct; + + if (!image_ctx.data_ctx.is_valid()) { + lderr(cct) << "missing data pool" << dendl; + send_finish(-ENODEV); + return; + } + + send_pre_trim(); +} + +template +void TrimRequest::send_pre_trim() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + if (m_delete_start >= m_num_objects) { + send_clean_boundary(); + return; + } + + { + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr) { + ldout(image_ctx.cct, 5) << this << " send_pre_trim: " + << " delete_start_min=" << m_delete_start_min + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_PRE_TRIM; + + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + if (image_ctx.object_map->template aio_update >( + CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_PENDING, + OBJECT_EXISTS, {}, false, this)) { + return; + } + } + } + + send_copyup_objects(); +} + +template +void TrimRequest::send_copyup_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + IOContext io_context; + bool has_snapshots; + uint64_t copyup_end; + { + std::shared_lock image_locker{image_ctx.image_lock}; + + io_context = image_ctx.get_data_io_context(); + has_snapshots = !image_ctx.snaps.empty(); + + uint64_t crypto_header_objects = Striper::get_num_objects( + image_ctx.layout, + image_ctx.get_area_size(io::ImageArea::CRYPTO_HEADER)); + + uint64_t raw_overlap; + int r = image_ctx.get_parent_overlap(CEPH_NOSNAP, &raw_overlap); + ceph_assert(r == 0); + auto overlap = image_ctx.reduce_parent_overlap(raw_overlap, false); + uint64_t data_overlap_objects = Striper::get_num_objects( + image_ctx.layout, + (overlap.second == io::ImageArea::DATA ? overlap.first : 0)); + + // copyup is only required for portion of image that overlaps parent + ceph_assert(m_delete_start >= crypto_header_objects); + copyup_end = crypto_header_objects + data_overlap_objects; + } + + // TODO: protect against concurrent shrink and snap create? + // skip to remove if no copyup is required. 
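+  // (copyup exists to preserve parent-backed data for existing snapshots;
+  // with no snapshots the objects can simply be removed)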
+ if (copyup_end <= m_delete_start || !has_snapshots) { + send_remove_objects(); + return; + } + + uint64_t copyup_start = m_delete_start; + m_delete_start = copyup_end; + + ldout(image_ctx.cct, 5) << this << " send_copyup_objects: " + << " start object=" << copyup_start << ", " + << " end object=" << copyup_end << dendl; + m_state = STATE_COPYUP_OBJECTS; + + Context *ctx = this->create_callback_context(); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, io_context, boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, image_ctx, context_factory, ctx, &m_prog_ctx, copyup_start, + copyup_end); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +void TrimRequest::send_remove_objects() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + ldout(image_ctx.cct, 5) << this << " send_remove_objects: " + << " delete_start=" << m_delete_start + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_REMOVE_OBJECTS; + + Context *ctx = this->create_callback_context(); + typename AsyncObjectThrottle::ContextFactory context_factory( + boost::lambda::bind(boost::lambda::new_ptr >(), + boost::lambda::_1, &image_ctx, boost::lambda::_2)); + AsyncObjectThrottle *throttle = new AsyncObjectThrottle( + this, image_ctx, context_factory, ctx, &m_prog_ctx, m_delete_start, + m_num_objects); + throttle->start_ops( + image_ctx.config.template get_val("rbd_concurrent_management_ops")); +} + +template +void TrimRequest::send_post_trim() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + + { + std::shared_lock image_locker{image_ctx.image_lock}; + if (image_ctx.object_map != nullptr) { + ldout(image_ctx.cct, 5) << this << " send_post_trim:" + << " delete_start_min=" << m_delete_start_min + << " num_objects=" << m_num_objects << dendl; + m_state = STATE_POST_TRIM; + + ceph_assert(image_ctx.exclusive_lock->is_lock_owner()); + + if (image_ctx.object_map->template aio_update >( + CEPH_NOSNAP, m_delete_start_min, m_num_objects, OBJECT_NONEXISTENT, + OBJECT_PENDING, {}, false, this)) { + return; + } + } + } + + send_clean_boundary(); +} + +template +void TrimRequest::send_clean_boundary() { + I &image_ctx = this->m_image_ctx; + ceph_assert(ceph_mutex_is_locked(image_ctx.owner_lock)); + CephContext *cct = image_ctx.cct; + if (m_delete_off <= m_new_size) { + send_finish(0); + return; + } + + // should have been canceled prior to releasing lock + ceph_assert(image_ctx.exclusive_lock == nullptr || + image_ctx.exclusive_lock->is_lock_owner()); + uint64_t delete_len = m_delete_off - m_new_size; + ldout(image_ctx.cct, 5) << this << " send_clean_boundary: " + << " delete_off=" << m_delete_off + << " length=" << delete_len << dendl; + m_state = STATE_CLEAN_BOUNDARY; + + IOContext io_context; + { + std::shared_lock image_locker{image_ctx.image_lock}; + io_context = image_ctx.get_data_io_context(); + } + + // discard the weird boundary + std::vector extents; + Striper::file_to_extents(cct, image_ctx.format_string, + &image_ctx.layout, m_new_size, delete_len, 0, + extents); + + ContextCompletion *completion = + new ContextCompletion(this->create_async_callback_context(), true); + for (auto& extent : extents) { + ldout(cct, 20) << " ex " << extent << dendl; + Context *req_comp = new C_ContextCompletion(*completion); + + if (extent.offset == 0) { + // 
treat as a full object delete on the boundary + extent.length = image_ctx.layout.object_size; + } + + auto object_dispatch_spec = io::ObjectDispatchSpec::create_discard( + &image_ctx, io::OBJECT_DISPATCH_LAYER_NONE, extent.objectno, extent.offset, + extent.length, io_context, 0, 0, {}, req_comp); + object_dispatch_spec->send(); + } + completion->finish_adding_requests(); +} + +template +void TrimRequest::send_finish(int r) { + m_state = STATE_FINISHED; + this->async_complete(r); +} + +} // namespace operation +} // namespace librbd + +template class librbd::operation::TrimRequest; diff --git a/src/librbd/operation/TrimRequest.h b/src/librbd/operation/TrimRequest.h new file mode 100644 index 000000000..8526046c9 --- /dev/null +++ b/src/librbd/operation/TrimRequest.h @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H +#define CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H + +#include "librbd/AsyncRequest.h" + +namespace librbd +{ + +class ImageCtx; +class ProgressContext; + +namespace operation { + +template +class TrimRequest : public AsyncRequest +{ +public: + static TrimRequest *create(ImageCtxT &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx) { + return new TrimRequest(image_ctx, on_finish, original_size, new_size, + prog_ctx); + } + + TrimRequest(ImageCtxT &image_ctx, Context *on_finish, + uint64_t original_size, uint64_t new_size, + ProgressContext &prog_ctx); + + void send() override; + +protected: + /** + * Trim goes through the following state machine to remove whole objects, + * clean partially trimmed objects, and update the object map: + * + * @verbatim + * + * . . . . . . . . . . . . . . . . . + * | . + * v (skip if not needed) . + * STATE_PRE_TRIM . + * | . + * v (skip if not needed) . + * STATE_COPYUP_OBJECTS . + * | . + * v (skip if not needed) . + * STATE_REMOVE_OBJECTS . + * | . + * v (skip if not needed) . + * STATE_POST_TRIM . + * | . + * v (skip if not needed) . + * STATE_CLEAN_BOUNDARY . + * | . + * v . + * STATE_FINISHED < . . . . . . . . . . . . . . . + * | + * v + * + * + * The _COPYUP_OBJECTS state is skipped if there is no parent overlap + * within the new image size and the image does not have any snapshots. + * The _PRE_TRIM/_POST_TRIM states are skipped if the object map + * isn't enabled. The _REMOVE_OBJECTS state is skipped if no whole objects + * are removed. The _CLEAN_BOUNDARY state is skipped if no boundary + * objects are cleaned. The state machine will immediately transition + * to _FINISHED state if there are no bytes to trim. 
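+   *
+   * The copyup and remove stages fan out through AsyncObjectThrottle, bounded
+   * by the rbd_concurrent_management_ops config option.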
+ */ + + enum State { + STATE_PRE_TRIM, + STATE_COPYUP_OBJECTS, + STATE_REMOVE_OBJECTS, + STATE_POST_TRIM, + STATE_CLEAN_BOUNDARY, + STATE_FINISHED + }; + + bool should_complete(int r) override; + + State m_state = STATE_PRE_TRIM; + +private: + uint64_t m_delete_start; + uint64_t m_delete_start_min = 0; + uint64_t m_num_objects; + uint64_t m_delete_off; + uint64_t m_new_size; + ProgressContext &m_prog_ctx; + + void send_pre_trim(); + void send_copyup_objects(); + void send_remove_objects(); + void send_post_trim(); + + void send_clean_boundary(); + void send_finish(int r); +}; + +} // namespace operation +} // namespace librbd + +extern template class librbd::operation::TrimRequest; + +#endif // CEPH_LIBRBD_OPERATION_TRIM_REQUEST_H diff --git a/src/librbd/plugin/Api.cc b/src/librbd/plugin/Api.cc new file mode 100644 index 000000000..67303be3f --- /dev/null +++ b/src/librbd/plugin/Api.cc @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/Timer.h" +#include "librbd/plugin/Api.h" +#include "librbd/ImageCtx.h" +#include "librbd/io/AioCompletion.h" +#include "librbd/io/Utils.h" +#include "librbd/Operations.h" +#include "librbd/Utils.h" + +namespace librbd { +namespace plugin { + +template +void Api::read_parent( + I *image_ctx, uint64_t object_no, io::ReadExtents* extents, + librados::snap_t snap_id, const ZTracer::Trace &trace, + Context* on_finish) { + io::util::read_parent(image_ctx, object_no, extents, snap_id, trace, + on_finish); +} + +template +void Api::execute_image_metadata_set( + I *image_ctx, const std::string &key, + const std::string &value, Context *on_finish) { + ImageCtx* ictx = util::get_image_ctx(image_ctx); + ictx->operations->execute_metadata_set(key, value, on_finish); +} + +template +void Api::execute_image_metadata_remove( + I *image_ctx, const std::string &key, Context *on_finish) { + ImageCtx* ictx = util::get_image_ctx(image_ctx); + ictx->operations->execute_metadata_remove(key, on_finish); +} + +template +void Api::get_image_timer_instance( + CephContext *cct, SafeTimer **timer, ceph::mutex **timer_lock) { + ImageCtx::get_timer_instance(cct, timer, timer_lock); +} + +template +bool Api::test_image_features(I *image_ctx, uint64_t features) { + return image_ctx->test_features(features); +} + +template +void Api::update_aio_comp(io::AioCompletion* aio_comp, + uint32_t request_count, + io::ReadResult &read_result, + io::Extents &image_extents) { + aio_comp->set_request_count(request_count); + aio_comp->read_result = std::move(read_result); + aio_comp->read_result.set_image_extents(image_extents); + start_in_flight_io(aio_comp); +} + +template +void Api::update_aio_comp( + io::AioCompletion* aio_comp, uint32_t request_count) { + aio_comp->set_request_count(request_count); + start_in_flight_io(aio_comp); +} + +template +io::ReadResult::C_ImageReadRequest* Api::create_image_read_request( + io::AioCompletion* aio_comp, uint64_t buffer_offset, + const Extents& image_extents) { + return new io::ReadResult::C_ImageReadRequest( + aio_comp, buffer_offset, image_extents); +} + +template +io::C_AioRequest* Api::create_aio_request(io::AioCompletion* aio_comp) { + io::C_AioRequest *req_comp = new io::C_AioRequest(aio_comp); + return req_comp; +} + +template +void Api::start_in_flight_io(io::AioCompletion* aio_comp) { + if (!aio_comp->async_op.started()) { + aio_comp->start_op(); + } +} + +} // namespace plugin +} // namespace librbd + +template class librbd::plugin::Api; diff --git 
a/src/librbd/plugin/Api.h b/src/librbd/plugin/Api.h new file mode 100644 index 000000000..04f77e5c3 --- /dev/null +++ b/src/librbd/plugin/Api.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_PLUGIN_API_H +#define CEPH_LIBRBD_PLUGIN_API_H + +#include "common/Timer.h" +#include "common/ceph_mutex.h" +#include "include/common_fwd.h" +#include "include/int_types.h" +#include "include/rados/librados.hpp" +#include "librbd/io/Types.h" +#include "librbd/io/ReadResult.h" + +namespace ZTracer { struct Trace; } + +namespace librbd { + +namespace io { +class AioCompletion; +class C_AioRequest; +} + +struct ImageCtx; + +namespace plugin { + +template +struct Api { + using Extents = librbd::io::Extents; + + Api() {} + virtual ~Api() {} + + virtual void read_parent( + ImageCtxT *image_ctx, uint64_t object_no, io::ReadExtents* extents, + librados::snap_t snap_id, const ZTracer::Trace &trace, + Context* on_finish); + + virtual void execute_image_metadata_set( + ImageCtxT *image_ctx, + const std::string &key, + const std::string &value, + Context *on_finish); + + virtual void execute_image_metadata_remove( + ImageCtxT *image_ctx, + const std::string &key, + Context *on_finish); + + virtual void get_image_timer_instance( + CephContext *cct, SafeTimer **timer, + ceph::mutex **timer_lock); + + virtual bool test_image_features( + ImageCtxT *image_ctx, + uint64_t features); + + virtual void update_aio_comp( + io::AioCompletion* aio_comp, + uint32_t request_count, + io::ReadResult& read_result, + io::Extents &image_extents); + + virtual void update_aio_comp( + io::AioCompletion* aio_comp, + uint32_t request_count); + + virtual io::ReadResult::C_ImageReadRequest* create_image_read_request( + io::AioCompletion* aio_comp, uint64_t buffer_offset, + const Extents& image_extents); + + virtual io::C_AioRequest* create_aio_request(io::AioCompletion* aio_comp); + +private: + void start_in_flight_io(io::AioCompletion* aio_comp); +}; + +} // namespace plugin +} // namespace librbd + +extern template class librbd::plugin::Api; + +#endif // CEPH_LIBRBD_PLUGIN_API_H diff --git a/src/librbd/plugin/ParentCache.cc b/src/librbd/plugin/ParentCache.cc new file mode 100644 index 000000000..3eba430ab --- /dev/null +++ b/src/librbd/plugin/ParentCache.cc @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/plugin/ParentCache.h" +#include "ceph_ver.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/PluginRegistry.h" +#include "librbd/ImageCtx.h" +#include "librbd/cache/ParentCacheObjectDispatch.h" + +extern "C" { + +const char *__ceph_plugin_version() { + return CEPH_GIT_NICE_VER; +} + +int __ceph_plugin_init(CephContext *cct, const std::string& type, + const std::string& name) { + auto plugin_registry = cct->get_plugin_registry(); + return plugin_registry->add( + type, name, new librbd::plugin::ParentCache(cct)); +} + +} // extern "C" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::plugin::ParentCache: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace plugin { + +template +void ParentCache::init(I* image_ctx, Api& api, + cache::ImageWritebackInterface& image_writeback, + PluginHookPoints& hook_points_list, + Context* on_finish) { + bool parent_cache_enabled = image_ctx->config.template get_val( + "rbd_parent_cache_enabled"); + if (image_ctx->child == nullptr || 
!parent_cache_enabled || + !image_ctx->data_ctx.is_valid()) { + on_finish->complete(0); + return; + } + + auto cct = image_ctx->cct; + ldout(cct, 5) << dendl; + + auto parent_cache = cache::ParentCacheObjectDispatch::create( + image_ctx, api); + on_finish = new LambdaContext([this, on_finish, parent_cache](int r) { + if (r < 0) { + // the object dispatcher will handle cleanup if successfully initialized + delete parent_cache; + } + + handle_init_parent_cache(r, on_finish); + }); + parent_cache->init(on_finish); +} + +template +void ParentCache::handle_init_parent_cache(int r, Context* on_finish) { + ldout(cct, 5) << "r=" << r << dendl; + + if (r < 0) { + lderr(cct) << "Failed to initialize parent cache object dispatch layer: " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + on_finish->complete(0); +} + +} // namespace plugin +} // namespace librbd + +template class librbd::plugin::ParentCache; diff --git a/src/librbd/plugin/ParentCache.h b/src/librbd/plugin/ParentCache.h new file mode 100644 index 000000000..1039efff9 --- /dev/null +++ b/src/librbd/plugin/ParentCache.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_PLUGIN_PARENT_CACHE_H +#define CEPH_LIBRBD_PLUGIN_PARENT_CACHE_H + +#include "librbd/plugin/Types.h" +#include "include/Context.h" + +namespace librbd { + +struct ImageCtx; + +namespace plugin { + +template +class ParentCache : public Interface { +public: + ParentCache(CephContext* cct) : Interface(cct) { + } + + void init(ImageCtxT* image_ctx, Api& api, + cache::ImageWritebackInterface& image_writeback, + PluginHookPoints& hook_points_list, + Context* on_finish) override; + +private: + void handle_init_parent_cache(int r, Context* on_finish); + using ceph::Plugin::cct; + +}; + +} // namespace plugin +} // namespace librbd + +extern template class librbd::plugin::ParentCache; + +#endif // CEPH_LIBRBD_PLUGIN_PARENT_CACHE_H diff --git a/src/librbd/plugin/Types.h b/src/librbd/plugin/Types.h new file mode 100644 index 000000000..b66d754ac --- /dev/null +++ b/src/librbd/plugin/Types.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_PLUGIN_TYPES_H +#define CEPH_LIBRBD_PLUGIN_TYPES_H + +#include "include/common_fwd.h" +#include "include/Context.h" +#include "common/PluginRegistry.h" +#include "librbd/cache/ImageWriteback.h" + +namespace librbd { +namespace plugin { + +template struct Api; + +struct HookPoints { + virtual ~HookPoints() { + } + virtual void acquired_exclusive_lock(Context* on_finish) = 0; + virtual void prerelease_exclusive_lock(Context* on_finish) = 0; + virtual void discard(Context* on_finish) { + on_finish->complete(0); + } +}; + +typedef std::list> PluginHookPoints; + +template +struct Interface : public ceph::Plugin { + Interface(CephContext* cct) : Plugin(cct) { + } + + virtual ~Interface() { + } + + virtual void init(ImageCtxT* image_ctx, Api& api, + librbd::cache::ImageWritebackInterface& image_writeback, + PluginHookPoints& hook_points_list, Context* on_finish) = 0; +}; + +} // namespace plugin +} // namespace librbd + +#endif // CEPH_LIBRBD_PLUGIN_TYPES_H diff --git a/src/librbd/plugin/WriteLogImageCache.cc b/src/librbd/plugin/WriteLogImageCache.cc new file mode 100644 index 000000000..308bb6a00 --- /dev/null +++ b/src/librbd/plugin/WriteLogImageCache.cc @@ -0,0 +1,104 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: 
ts=8 sw=2 smarttab + +#include "ceph_ver.h" +#include "common/dout.h" +#include "common/errno.h" +#include "common/PluginRegistry.h" +#include "librbd/ImageCtx.h" +#include "librbd/cache/WriteLogImageDispatch.h" +#include "librbd/cache/ImageWriteback.h" +#include "librbd/cache/Utils.h" +#include "librbd/cache/pwl/DiscardRequest.h" +#include "librbd/cache/pwl/InitRequest.h" +#include "librbd/io/ImageDispatcherInterface.h" +#include "librbd/plugin/WriteLogImageCache.h" + +extern "C" { + +const char *__ceph_plugin_version() { + return CEPH_GIT_NICE_VER; +} + +int __ceph_plugin_init(CephContext *cct, const std::string& type, + const std::string& name) { + auto plugin_registry = cct->get_plugin_registry(); + return plugin_registry->add( + type, name, new librbd::plugin::WriteLogImageCache(cct)); +} + +} // extern "C" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::plugin::WriteLogImageCache: " \ + << this << " " << __func__ << ": " + +namespace librbd { +namespace plugin { + +template +void WriteLogImageCache::init(I* image_ctx, Api& api, + cache::ImageWritebackInterface& image_writeback, + PluginHookPoints& hook_points_list, + Context* on_finish) { + bool pwl_enabled = librbd::cache::util::is_pwl_enabled(*image_ctx); + if (!pwl_enabled || !image_ctx->data_ctx.is_valid()) { + on_finish->complete(0); + return; + } + + auto cct = image_ctx->cct; + ldout(cct, 5) << dendl; + + auto hook_points = std::make_unique( + image_ctx, image_writeback, api); + hook_points_list.emplace_back(std::move(hook_points)); + + on_finish->complete(0); +} + +template +WriteLogImageCache::~WriteLogImageCache() { +} + +template +WriteLogImageCache::HookPoints::HookPoints( + I* image_ctx, cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api) + : m_image_ctx(image_ctx), m_image_writeback(image_writeback), + m_plugin_api(plugin_api) +{ +} + +template +WriteLogImageCache::HookPoints::~HookPoints() { +} + +template +void WriteLogImageCache::HookPoints::acquired_exclusive_lock( + Context* on_finish) { + cache::pwl::InitRequest *req = cache::pwl::InitRequest::create( + *m_image_ctx, m_image_writeback, m_plugin_api, on_finish); + req->send(); +} + +template +void WriteLogImageCache::HookPoints::prerelease_exclusive_lock( + Context* on_finish) { + m_image_ctx->io_image_dispatcher->shut_down_dispatch( + io::IMAGE_DISPATCH_LAYER_WRITEBACK_CACHE, on_finish); +} + +template +void WriteLogImageCache::HookPoints::discard( + Context* on_finish) { + cache::pwl::DiscardRequest *req = cache::pwl::DiscardRequest::create( + *m_image_ctx, m_plugin_api, on_finish); + req->send(); +} + +} // namespace plugin +} // namespace librbd + +template class librbd::plugin::WriteLogImageCache; diff --git a/src/librbd/plugin/WriteLogImageCache.h b/src/librbd/plugin/WriteLogImageCache.h new file mode 100644 index 000000000..2ceb87ec6 --- /dev/null +++ b/src/librbd/plugin/WriteLogImageCache.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_LIBRBD_PLUGIN_WRITELOG_IMAGE_CACHE_H +#define CEPH_LIBRBD_PLUGIN_WRITELOG_IMAGE_CACHE_H + +#include "librbd/plugin/Types.h" +#include "include/Context.h" + +namespace librbd { + +struct ImageCtx; + +namespace plugin { + +template +class WriteLogImageCache : public Interface { +public: + WriteLogImageCache(CephContext* cct) : Interface(cct) { + } + + ~WriteLogImageCache() override; + + void init(ImageCtxT* image_ctx, Api& api, + cache::ImageWritebackInterface& 
image_writeback, + PluginHookPoints& hook_points_list, + Context* on_finish) override; + + class HookPoints : public plugin::HookPoints { + public: + HookPoints(ImageCtxT* image_ctx, + cache::ImageWritebackInterface& image_writeback, + plugin::Api& plugin_api); + ~HookPoints() override; + + void acquired_exclusive_lock(Context* on_finish) override; + void prerelease_exclusive_lock(Context* on_finish) override; + void discard(Context* on_finish) override; + + private: + ImageCtxT* m_image_ctx; + cache::ImageWritebackInterface& m_image_writeback; + plugin::Api& m_plugin_api; + }; + +}; + +} // namespace plugin +} // namespace librbd + +extern template class librbd::plugin::WriteLogImageCache; + +#endif // CEPH_LIBRBD_PLUGIN_WRITELOG_IMAGE_CACHE_H diff --git a/src/librbd/trash/MoveRequest.cc b/src/librbd/trash/MoveRequest.cc new file mode 100644 index 000000000..7b7abe452 --- /dev/null +++ b/src/librbd/trash/MoveRequest.cc @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "librbd/trash/MoveRequest.h" +#include "common/dout.h" +#include "common/errno.h" +#include "cls/rbd/cls_rbd_client.h" +#include "librbd/ImageCtx.h" +#include "librbd/ImageState.h" +#include "librbd/Utils.h" + +#define dout_subsys ceph_subsys_rbd +#undef dout_prefix +#define dout_prefix *_dout << "librbd::trash::MoveRequest: " << this \ + << " " << __func__ << ": " + +namespace librbd { +namespace trash { + +using util::create_context_callback; +using util::create_rados_callback; + +template +void MoveRequest::send() { + trash_add(); +} + +template +void MoveRequest::trash_add() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::trash_add(&op, m_image_id, m_trash_image_spec); + + auto aio_comp = create_rados_callback< + MoveRequest, &MoveRequest::handle_trash_add>(this); + int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void MoveRequest::handle_trash_add(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -EEXIST) { + ldout(m_cct, 10) << "previous unfinished deferred remove for image: " + << m_image_id << dendl; + } else if (r < 0) { + lderr(m_cct) << "failed to add image to trash: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + remove_id(); +} + +template +void MoveRequest::remove_id() { + ldout(m_cct, 10) << dendl; + + auto aio_comp = create_rados_callback< + MoveRequest, &MoveRequest::handle_remove_id>(this); + int r = m_io_ctx.aio_remove(util::id_obj_name(m_trash_image_spec.name), + aio_comp); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void MoveRequest::handle_remove_id(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r < 0 && r != -ENOENT) { + lderr(m_cct) << "failed to remove image id object: " << cpp_strerror(r) + << dendl; + finish(r); + return; + } + + directory_remove(); +} + +template +void MoveRequest::directory_remove() { + ldout(m_cct, 10) << dendl; + + librados::ObjectWriteOperation op; + librbd::cls_client::dir_remove_image(&op, m_trash_image_spec.name, + m_image_id); + + auto aio_comp = create_rados_callback< + MoveRequest, &MoveRequest::handle_directory_remove>(this); + int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +template +void MoveRequest::handle_directory_remove(int r) { + ldout(m_cct, 10) << "r=" << r << dendl; + + if (r == -ENOENT) { + r = 0; + } + if (r < 0) { + lderr(m_cct) << "failed to 
+
+template <typename I>
+void MoveRequest<I>::finish(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace trash
+} // namespace librbd
+
+template class librbd::trash::MoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/trash/MoveRequest.h b/src/librbd/trash/MoveRequest.h
new file mode 100644
index 000000000..d08011e85
--- /dev/null
+++ b/src/librbd/trash/MoveRequest.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
+#define CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace trash {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MoveRequest {
+public:
+  static MoveRequest* create(librados::IoCtx& io_ctx,
+                             const std::string& image_id,
+                             const cls::rbd::TrashImageSpec& trash_image_spec,
+                             Context* on_finish) {
+    return new MoveRequest(io_ctx, image_id, trash_image_spec, on_finish);
+  }
+
+  MoveRequest(librados::IoCtx& io_ctx, const std::string& image_id,
+              const cls::rbd::TrashImageSpec& trash_image_spec,
+              Context* on_finish)
+    : m_io_ctx(io_ctx), m_image_id(image_id),
+      m_trash_image_spec(trash_image_spec), m_on_finish(on_finish),
+      m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+  }
+
+  void send();
+
+private:
+  /*
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * TRASH_ADD
+   *    |
+   *    v
+   * REMOVE_ID
+   *    |
+   *    v
+   * DIRECTORY_REMOVE
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_io_ctx;
+  std::string m_image_id;
+  cls::rbd::TrashImageSpec m_trash_image_spec;
+  Context *m_on_finish;
+
+  CephContext *m_cct;
+
+  void trash_add();
+  void handle_trash_add(int r);
+
+  void remove_id();
+  void handle_remove_id(int r);
+
+  void directory_remove();
+  void handle_directory_remove(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace trash
+} // namespace librbd
+
+extern template class librbd::trash::MoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_MOVE_REQUEST_H
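MoveRequest chains three RADOS updates (trash directory add, id object removal, rbd directory removal) and then frees itself, so a caller only supplies the completion context. A minimal usage sketch, illustration only: `io_ctx` and `image_id` are assumed, and the TrashImageSpec initializer mirrors the shape used by generate_test_instances() later in this patch.

#include "common/Cond.h"  // C_SaferCond
#include "librbd/ImageCtx.h"
#include "librbd/trash/MoveRequest.h"

int move_to_trash(librados::IoCtx& io_ctx, const std::string& image_id) {
  // source, original name, deferment end time, deletion time
  cls::rbd::TrashImageSpec spec{cls::rbd::TRASH_IMAGE_SOURCE_USER,
                                "image-name", {}, {}};
  C_SaferCond ctx;
  auto req = librbd::trash::MoveRequest<librbd::ImageCtx>::create(
    io_ctx, image_id, spec, &ctx);
  req->send();
  return ctx.wait();  // the request deletes itself in finish()
}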
diff --git a/src/librbd/trash/RemoveRequest.cc b/src/librbd/trash/RemoveRequest.cc
new file mode 100644
index 000000000..1149d1d80
--- /dev/null
+++ b/src/librbd/trash/RemoveRequest.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/trash/RemoveRequest.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/RemoveRequest.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::trash::RemoveRequest: " << this \
+                           << " " << __func__ << ": "
+
+namespace librbd {
+namespace trash {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+template <typename I>
+void RemoveRequest<I>::send() {
+  set_state();
+}
+
+template <typename I>
+void RemoveRequest<I>::set_state() {
+  ldout(m_cct, 10) << dendl;
+
+  librados::ObjectWriteOperation op;
+  cls_client::trash_state_set(&op, m_image_id, m_trash_set_state,
+                              m_trash_expect_state);
+
+  auto aio_comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_set_state>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_set_state(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r < 0 && r != -EOPNOTSUPP) {
+    lderr(m_cct) << "error setting trash image state: " << cpp_strerror(r)
+                 << dendl;
+    if (m_ret_val == 0) {
+      m_ret_val = r;
+    }
+    if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+      close_image();
+    } else {
+      finish(m_ret_val);
+    }
+    return;
+  }
+
+  if (m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+    remove_image();
+  } else {
+    ceph_assert(m_trash_set_state == cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+    finish(m_ret_val < 0 ? m_ret_val : r);
+  };
+}
+
+template <typename I>
+void RemoveRequest<I>::close_image() {
+  if (m_image_ctx == nullptr) {
+    finish(m_ret_val);
+    return;
+  }
+
+  ldout(m_cct, 10) << dendl;
+
+  auto ctx = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_close_image>(this);
+  m_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_close_image(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(m_cct, 5) << "failed to close image:" << cpp_strerror(r) << dendl;
+  }
+
+  m_image_ctx = nullptr;
+  finish(m_ret_val);
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_image() {
+  ldout(m_cct, 10) << dendl;
+
+  auto ctx = create_context_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_remove_image>(this);
+  if (m_image_ctx != nullptr) {
+    auto req = librbd::image::RemoveRequest<I>::create(
+      m_io_ctx, m_image_ctx, m_force, true, m_prog_ctx, m_op_work_queue, ctx);
+    req->send();
+  } else {
+    auto req = librbd::image::RemoveRequest<I>::create(
+      m_io_ctx, "", m_image_id, m_force, true, m_prog_ctx, m_op_work_queue,
+      ctx);
+    req->send();
+  }
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_remove_image(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(m_cct, 5) << "failed to remove image:" << cpp_strerror(r) << dendl;
+
+    m_ret_val = r;
+    m_trash_set_state = cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+    m_trash_expect_state = cls::rbd::TRASH_IMAGE_STATE_REMOVING;
+    set_state();
+    return;
+  }
+
+  m_image_ctx = nullptr;
+  remove_trash_entry();
+}
+
+template <typename I>
+void RemoveRequest<I>::remove_trash_entry() {
+  ldout(m_cct, 10) << dendl;
+
+  librados::ObjectWriteOperation op;
+  cls_client::trash_remove(&op, m_image_id);
+
+  auto aio_comp = create_rados_callback<
+    RemoveRequest<I>, &RemoveRequest<I>::handle_remove_trash_entry>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+void RemoveRequest<I>::handle_remove_trash_entry(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    lderr(m_cct) << "error removing trash entry: " << cpp_strerror(r) << dendl;
+  }
+
+  finish(0);
+}
+
+template <typename I>
+void RemoveRequest<I>::finish(int r) {
+  ldout(m_cct, 10) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace trash
+} // namespace librbd
+
+template class librbd::trash::RemoveRequest<librbd::ImageCtx>;
diff --git a/src/librbd/trash/RemoveRequest.h b/src/librbd/trash/RemoveRequest.h
new file mode 100644
index 000000000..86082ca49
--- /dev/null
+++ b/src/librbd/trash/RemoveRequest.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
+#define CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+class ProgressContext;
+namespace asio { struct ContextWQ; }
+
+namespace trash {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class RemoveRequest {
+public:
+  static RemoveRequest* create(librados::IoCtx &io_ctx,
+                               const std::string &image_id,
+                               asio::ContextWQ *op_work_queue, bool force,
+                               ProgressContext &prog_ctx, Context *on_finish) {
+    return new RemoveRequest(io_ctx, image_id, op_work_queue, force, prog_ctx,
+                             on_finish);
+  }
+
+  static RemoveRequest* create(librados::IoCtx &io_ctx, ImageCtxT *image_ctx,
+                               asio::ContextWQ *op_work_queue, bool force,
+                               ProgressContext &prog_ctx, Context *on_finish) {
+    return new RemoveRequest(io_ctx, image_ctx, op_work_queue, force, prog_ctx,
+                             on_finish);
+  }
+
+
+  RemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+                asio::ContextWQ *op_work_queue, bool force,
+                ProgressContext &prog_ctx, Context *on_finish)
+    : m_io_ctx(io_ctx), m_image_id(image_id), m_op_work_queue(op_work_queue),
+      m_force(force), m_prog_ctx(prog_ctx), m_on_finish(on_finish),
+      m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+  }
+
+  RemoveRequest(librados::IoCtx &io_ctx, ImageCtxT *image_ctx,
+                asio::ContextWQ *op_work_queue, bool force,
+                ProgressContext &prog_ctx, Context *on_finish)
+    : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(m_image_ctx->id),
+      m_op_work_queue(op_work_queue), m_force(force), m_prog_ctx(prog_ctx),
+      m_on_finish(on_finish),
+      m_cct(reinterpret_cast<CephContext*>(io_ctx.cct())) {
+  }
+
+  void send();
+
+private:
+  /*
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * SET_STATE (removing) * * * * * * *> CLOSE_IMAGE
+   *    |                                   |
+   *    v                                   |
+   * REMOVE_IMAGE * * *> SET_STATE (normal) |
+   *    |                    |              |
+   *    v                    |              |
+   * REMOVE_TRASH_ENTRY      |              |
+   *    |                    |              |
+   *    v                    v              |
+   * <finish> <-------------/<--------------/
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_io_ctx;
+  ImageCtxT *m_image_ctx = nullptr;
+  std::string m_image_id;
+  asio::ContextWQ *m_op_work_queue;
+  bool m_force;
+  ProgressContext &m_prog_ctx;
+  Context *m_on_finish;
+
+  CephContext *m_cct;
+
+  cls::rbd::TrashImageState m_trash_set_state =
+    cls::rbd::TRASH_IMAGE_STATE_REMOVING;
+  cls::rbd::TrashImageState m_trash_expect_state =
+    cls::rbd::TRASH_IMAGE_STATE_NORMAL;
+  int m_ret_val = 0;
+
+  void set_state();
+  void handle_set_state(int r);
+
+  void close_image();
+  void handle_close_image(int r);
+
+  void remove_image();
+  void handle_remove_image(int r);
+
+  void remove_trash_entry();
+  void handle_remove_trash_entry(int r);
+
+  void finish(int r);
+};
+
+} // namespace trash
+} // namespace librbd
+
+extern template class librbd::trash::RemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_TRASH_REMOVE_REQUEST_H
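As the state diagram shows, RemoveRequest flips the trash entry to REMOVING before deleting, and on failure rolls the state back to NORMAL while preserving the first error in m_ret_val. A sketch of the image-id flavour of create(), illustration only: `io_ctx`, `image_id` and `op_work_queue` are assumed to exist, and NoOpProgressContext (the stub progress callback from librbd/internal.h) stands in for a real progress handler.

#include "common/Cond.h"      // C_SaferCond
#include "librbd/internal.h"  // librbd::NoOpProgressContext (assumed location)
#include "librbd/trash/RemoveRequest.h"

int purge_from_trash(librados::IoCtx& io_ctx, const std::string& image_id,
                     librbd::asio::ContextWQ* op_work_queue) {
  librbd::NoOpProgressContext prog_ctx;
  C_SaferCond ctx;
  auto req = librbd::trash::RemoveRequest<librbd::ImageCtx>::create(
    io_ctx, image_id, op_work_queue, true /* force */, prog_ctx, &ctx);
  req->send();
  return ctx.wait();  // self-deleting, like the other requests in this patch
}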
diff --git a/src/librbd/trash_watcher/Types.cc b/src/librbd/trash_watcher/Types.cc
new file mode 100644
index 000000000..c95ea223b
--- /dev/null
+++ b/src/librbd/trash_watcher/Types.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "librbd/trash_watcher/Types.h"
+#include "librbd/watcher/Utils.h"
+
+namespace librbd {
+namespace trash_watcher {
+
+namespace {
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+  explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+  template <typename Payload>
+  inline void operator()(const Payload &payload) const {
+    NotifyOp notify_op = Payload::NOTIFY_OP;
+    m_formatter->dump_string("notify_op", stringify(notify_op));
+    payload.dump(m_formatter);
+  }
+
+private:
+  ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void ImageAddedPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(image_id, bl);
+  encode(trash_image_spec, bl);
+}
+
+void ImageAddedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(image_id, iter);
+  decode(trash_image_spec, iter);
+}
+
+void ImageAddedPayload::dump(Formatter *f) const {
+  f->dump_string("image_id", image_id);
+  f->open_object_section("trash_image_spec");
+  trash_image_spec.dump(f);
+  f->close_section();
+}
+
+void ImageRemovedPayload::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(image_id, bl);
+}
+
+void ImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(image_id, iter);
+}
+
+void ImageRemovedPayload::dump(Formatter *f) const {
+  f->dump_string("image_id", image_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+  ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  boost::apply_visitor(watcher::util::EncodePayloadVisitor(bl), payload);
+  ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+  DECODE_START(1, iter);
+
+  uint32_t notify_op;
+  decode(notify_op, iter);
+
+  // select the correct payload variant based upon the encoded op
+  switch (notify_op) {
+  case NOTIFY_OP_IMAGE_ADDED:
+    payload = ImageAddedPayload();
+    break;
+  case NOTIFY_OP_IMAGE_REMOVED:
+    payload = ImageRemovedPayload();
+    break;
+  default:
+    payload = UnknownPayload();
+    break;
+  }
+
+  apply_visitor(watcher::util::DecodePayloadVisitor(struct_v, iter), payload);
+  DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+  apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+  o.push_back(new NotifyMessage{ImageAddedPayload{
+    "id", {cls::rbd::TRASH_IMAGE_SOURCE_USER, "name", {}, {}}}});
+  o.push_back(new NotifyMessage{ImageRemovedPayload{"id"}});
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+  switch (op) {
+  case NOTIFY_OP_IMAGE_ADDED:
+    out << "ImageAdded";
+    break;
+  case NOTIFY_OP_IMAGE_REMOVED:
+    out << "ImageRemoved";
+    break;
+  default:
+    out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+    break;
+  }
+  return out;
+}
+
+} // namespace trash_watcher
+} // namespace librbd
diff --git a/src/librbd/trash_watcher/Types.h b/src/librbd/trash_watcher/Types.h
new file mode 100644
index 000000000..22c2b4375
--- /dev/null
+++ b/src/librbd/trash_watcher/Types.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
+#define CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+#include <boost/variant.hpp>
+
+
+namespace librbd {
+namespace trash_watcher {
+
+enum NotifyOp {
+  NOTIFY_OP_IMAGE_ADDED = 0,
+  NOTIFY_OP_IMAGE_REMOVED = 1
+};
+
+struct ImageAddedPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ADDED;
+
+  std::string image_id;
+  cls::rbd::TrashImageSpec trash_image_spec;
+
+  ImageAddedPayload() {
+  }
+  ImageAddedPayload(const std::string& image_id,
+                    const cls::rbd::TrashImageSpec& trash_image_spec)
+    : image_id(image_id), trash_image_spec(trash_image_spec) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+struct ImageRemovedPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_REMOVED;
+
+  std::string image_id;
+
+  ImageRemovedPayload() {
+  }
+  ImageRemovedPayload(const std::string& image_id)
+    : image_id(image_id) {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+struct UnknownPayload {
+  static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+  UnknownPayload() {
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::const_iterator &iter);
+  void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ImageAddedPayload,
+                       ImageRemovedPayload,
+                       UnknownPayload> Payload;
+
+struct NotifyMessage {
+  NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+  }
+
+  Payload payload;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace trash_watcher
+} // namespace librbd
+
+using librbd::trash_watcher::encode;
+using librbd::trash_watcher::decode;
+
+#endif // CEPH_LIBRBD_TRASH_WATCHER_TYPES_H
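Because WRITE_CLASS_ENCODER(NotifyMessage) generates free encode()/decode() functions (which this header additionally pulls to global scope), round-tripping a notification is a two-liner in each direction. A sketch, illustration only, assuming the usual Ceph bufferlist conventions:

#include "include/buffer.h"
#include "librbd/trash_watcher/Types.h"

librbd::trash_watcher::NotifyMessage roundtrip() {
  using namespace librbd::trash_watcher;

  ceph::bufferlist bl;
  NotifyMessage out{ImageAddedPayload{"image-id", {}}};
  encode(out, bl);

  NotifyMessage in;
  auto it = bl.cbegin();
  decode(in, it);  // decode() selects the payload variant from the op code
  return in;
}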
diff --git a/src/librbd/watcher/Notifier.cc b/src/librbd/watcher/Notifier.cc
new file mode 100644
index 000000000..9a4134402
--- /dev/null
+++ b/src/librbd/watcher/Notifier.cc
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/Notifier.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/watcher/Types.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::watcher::Notifier: " \
+                           << this << " " << __func__ << ": "
+
+namespace librbd {
+namespace watcher {
+
+const uint64_t Notifier::NOTIFY_TIMEOUT = 5000;
+
+Notifier::C_AioNotify::C_AioNotify(Notifier *notifier, NotifyResponse *response,
+                                   Context *on_finish)
+  : notifier(notifier), response(response), on_finish(on_finish) {
+}
+
+void Notifier::C_AioNotify::finish(int r) {
+  if (response != nullptr) {
+    if (r == 0 || r == -ETIMEDOUT) {
+      try {
+        auto it = out_bl.cbegin();
+        decode(*response, it);
+      } catch (const buffer::error &err) {
+        r = -EBADMSG;
+      }
+    }
+  }
+  notifier->handle_notify(r, on_finish);
+}
+
+Notifier::Notifier(asio::ContextWQ *work_queue, IoCtx &ioctx,
+                   const std::string &oid)
+  : m_work_queue(work_queue), m_ioctx(ioctx), m_oid(oid),
+    m_aio_notify_lock(ceph::make_mutex(util::unique_lock_name(
+      "librbd::object_watcher::Notifier::m_aio_notify_lock", this))) {
+  m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+Notifier::~Notifier() {
+  std::lock_guard aio_notify_locker{m_aio_notify_lock};
+  ceph_assert(m_pending_aio_notifies == 0);
+}
+
+void Notifier::flush(Context *on_finish) {
+  std::lock_guard aio_notify_locker{m_aio_notify_lock};
+  if (m_pending_aio_notifies == 0) {
+    m_work_queue->queue(on_finish, 0);
+    return;
+  }
+
+  m_aio_notify_flush_ctxs.push_back(on_finish);
+}
+
+void Notifier::notify(bufferlist &bl, NotifyResponse *response,
+                      Context *on_finish) {
+  {
+    std::lock_guard aio_notify_locker{m_aio_notify_lock};
+    ++m_pending_aio_notifies;
+
+    ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl;
+  }
+
+  C_AioNotify *ctx = new C_AioNotify(this, response, on_finish);
+  librados::AioCompletion *comp = util::create_rados_callback(ctx);
+  int r = m_ioctx.aio_notify(m_oid, comp, bl, NOTIFY_TIMEOUT, &ctx->out_bl);
+  ceph_assert(r == 0);
+  comp->release();
+}
+
+void Notifier::handle_notify(int r, Context *on_finish) {
+  ldout(m_cct, 20) << "r=" << r << dendl;
+
+  std::lock_guard aio_notify_locker{m_aio_notify_lock};
+  ceph_assert(m_pending_aio_notifies > 0);
+  --m_pending_aio_notifies;
+
+  ldout(m_cct, 20) << "pending=" << m_pending_aio_notifies << dendl;
+  if (m_pending_aio_notifies == 0) {
+    for (auto ctx : m_aio_notify_flush_ctxs) {
+      m_work_queue->queue(ctx, 0);
+    }
+    m_aio_notify_flush_ctxs.clear();
+  }
+
+  if (on_finish != nullptr) {
+    m_work_queue->queue(on_finish, r);
+  }
+}
+
+} // namespace watcher
+} // namespace librbd
diff --git a/src/librbd/watcher/Notifier.h b/src/librbd/watcher/Notifier.h
new file mode 100644
index 000000000..79546b505
--- /dev/null
+++ b/src/librbd/watcher/Notifier.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_NOTIFIER_H
+#define CEPH_LIBRBD_WATCHER_NOTIFIER_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include <list>
+
+namespace librbd {
+
+namespace asio { struct ContextWQ; }
+
+namespace watcher {
+
+struct NotifyResponse;
+
+class Notifier {
+public:
+  static const uint64_t NOTIFY_TIMEOUT;
+
+  Notifier(asio::ContextWQ *work_queue, librados::IoCtx &ioctx,
+           const std::string &oid);
+  ~Notifier();
+
+  void flush(Context *on_finish);
+  void notify(bufferlist &bl, NotifyResponse *response, Context *on_finish);
+
+private:
+  typedef std::list<Context*> Contexts;
+
+  struct C_AioNotify : public Context {
+    Notifier *notifier;
+    NotifyResponse *response;
+    Context *on_finish;
+    bufferlist out_bl;
+
+    C_AioNotify(Notifier *notifier, NotifyResponse *response,
+                Context *on_finish);
+
+    void finish(int r) override;
+  };
+
+  asio::ContextWQ *m_work_queue;
+  librados::IoCtx &m_ioctx;
+  CephContext *m_cct;
+  std::string m_oid;
+
+  ceph::mutex m_aio_notify_lock;
+  size_t m_pending_aio_notifies = 0;
+  Contexts m_aio_notify_flush_ctxs;
+
+  void handle_notify(int r, Context *on_finish);
+
+};
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_NOTIFIER_H
diff --git a/src/librbd/watcher/RewatchRequest.cc b/src/librbd/watcher/RewatchRequest.cc
new file mode 100644
index 000000000..b890cb3c5
--- /dev/null
+++ b/src/librbd/watcher/RewatchRequest.cc
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/RewatchRequest.h"
+#include "common/ceph_mutex.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::watcher::RewatchRequest: " \
+                           << this << " " << __func__ << " "
+
+namespace librbd {
+
+using util::create_context_callback;
+using util::create_rados_callback;
+
+namespace watcher {
+
+using std::string;
+
+RewatchRequest::RewatchRequest(librados::IoCtx& ioctx, const string& oid,
+                               ceph::shared_mutex &watch_lock,
+                               librados::WatchCtx2 *watch_ctx,
+                               uint64_t *watch_handle, Context *on_finish)
+  : m_ioctx(ioctx), m_oid(oid), m_watch_lock(watch_lock),
+    m_watch_ctx(watch_ctx), m_watch_handle(watch_handle),
+    m_on_finish(on_finish) {
+}
+
+void RewatchRequest::send() {
+  unwatch();
+}
+
+void RewatchRequest::unwatch() {
+  ceph_assert(ceph_mutex_is_wlocked(m_watch_lock));
+  if (*m_watch_handle == 0) {
+    rewatch();
+    return;
+  }
+
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << dendl;
+
+  uint64_t watch_handle = 0;
+  std::swap(*m_watch_handle, watch_handle);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    RewatchRequest, &RewatchRequest::handle_unwatch>(this);
+  int r = m_ioctx.aio_unwatch(watch_handle, aio_comp);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+void RewatchRequest::handle_unwatch(int r) {
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  if (r == -EBLOCKLISTED) {
+    lderr(cct) << "client blocklisted" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    lderr(cct) << "failed to unwatch: " << cpp_strerror(r) << dendl;
+  }
+  rewatch();
+}
+
+void RewatchRequest::rewatch() {
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << dendl;
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    RewatchRequest, &RewatchRequest::handle_rewatch>(this);
+  int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_rewatch_handle, m_watch_ctx);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+void RewatchRequest::handle_rewatch(int r) {
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << "r=" << r << dendl;
+  if (r < 0) {
+    lderr(cct) << "failed to watch object: " << cpp_strerror(r)
+               << dendl;
+    m_rewatch_handle = 0;
+  }
+
+  {
+    std::unique_lock watch_locker{m_watch_lock};
+    *m_watch_handle = m_rewatch_handle;
+  }
+
+  finish(r);
+}
+
+void RewatchRequest::finish(int r) {
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  ldout(cct, 10) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace watcher
+} // namespace librbd
+
diff --git a/src/librbd/watcher/RewatchRequest.h b/src/librbd/watcher/RewatchRequest.h
new file mode 100644
index 000000000..ce5e31539
--- /dev/null
+++ b/src/librbd/watcher/RewatchRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
+#define CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+struct Context;
+
+namespace librbd {
+
+namespace watcher {
+
+class RewatchRequest {
+public:
+
+  static RewatchRequest *create(librados::IoCtx& ioctx, const std::string& oid,
+                                ceph::shared_mutex &watch_lock,
+                                librados::WatchCtx2 *watch_ctx,
+                                uint64_t *watch_handle, Context *on_finish) {
+    return new RewatchRequest(ioctx, oid, watch_lock, watch_ctx, watch_handle,
+                              on_finish);
+  }
+
+  RewatchRequest(librados::IoCtx& ioctx, const std::string& oid,
+                 ceph::shared_mutex &watch_lock, librados::WatchCtx2 *watch_ctx,
+                 uint64_t *watch_handle, Context *on_finish);
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * UNWATCH
+   *    |
+   *    |  . . . .
+   *    |  .     . (recoverable error)
+   *    v  v     .
+   * REWATCH . . .
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx& m_ioctx;
+  std::string m_oid;
+  ceph::shared_mutex &m_watch_lock;
+  librados::WatchCtx2 *m_watch_ctx;
+  uint64_t *m_watch_handle;
+  Context *m_on_finish;
+
+  uint64_t m_rewatch_handle = 0;
+
+  void unwatch();
+  void handle_unwatch(int r);
+
+  void rewatch();
+  void handle_rewatch(int r);
+
+  void finish(int r);
+};
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_REWATCH_REQUEST_H
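RewatchRequest tears down a possibly stale watch handle and re-establishes it, looping back through REWATCH on recoverable errors. Since unwatch() asserts that the watch lock is write-held, the caller takes the lock around send(); a sketch, illustration only, with all names assumed to exist in the caller:

#include <mutex>
#include "librbd/watcher/RewatchRequest.h"

void restart_watch(librados::IoCtx& io_ctx, const std::string& oid,
                   ceph::shared_mutex& watch_lock,
                   librados::WatchCtx2* watch_ctx,
                   uint64_t* watch_handle, Context* on_finish) {
  // write-lock satisfies the ceph_mutex_is_wlocked() assertion in unwatch()
  std::unique_lock locker{watch_lock};
  auto req = librbd::watcher::RewatchRequest::create(
    io_ctx, oid, watch_lock, watch_ctx, watch_handle, on_finish);
  req->send();  // completes asynchronously; the request deletes itself
}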
diff --git a/src/librbd/watcher/Types.cc b/src/librbd/watcher/Types.cc
new file mode 100644
index 000000000..8f1991d7b
--- /dev/null
+++ b/src/librbd/watcher/Types.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/watcher/Types.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace watcher {
+
+void ClientId::encode(bufferlist &bl) const {
+  using ceph::encode;
+  encode(gid, bl);
+  encode(handle, bl);
+}
+
+void ClientId::decode(bufferlist::const_iterator &iter) {
+  using ceph::decode;
+  decode(gid, iter);
+  decode(handle, iter);
+}
+
+void ClientId::dump(Formatter *f) const {
+  f->dump_unsigned("gid", gid);
+  f->dump_unsigned("handle", handle);
+}
+
+void NotifyResponse::encode(bufferlist& bl) const {
+  using ceph::encode;
+  encode(acks, bl);
+  encode(timeouts, bl);
+}
+
+void NotifyResponse::decode(bufferlist::const_iterator& iter) {
+  using ceph::decode;
+  decode(acks, iter);
+  decode(timeouts, iter);
+}
+std::ostream &operator<<(std::ostream &out,
+                         const ClientId &client_id) {
+  out << "[" << client_id.gid << "," << client_id.handle << "]";
+  return out;
+}
+
+} // namespace watcher
+} // namespace librbd
diff --git a/src/librbd/watcher/Types.h b/src/librbd/watcher/Types.h
new file mode 100644
index 000000000..d1517fb0f
--- /dev/null
+++ b/src/librbd/watcher/Types.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_TYPES_H
+#define CEPH_LIBRBD_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+
+class Watcher;
+
+namespace watcher {
+
+struct ClientId {
+  uint64_t gid;
+  uint64_t handle;
+
+  ClientId() : gid(0), handle(0) {}
+  ClientId(uint64_t gid, uint64_t handle) : gid(gid), handle(handle) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  inline bool is_valid() const {
+    return (*this != ClientId());
+  }
+
+  inline bool operator==(const ClientId &rhs) const {
+    return (gid == rhs.gid && handle == rhs.handle);
+  }
+  inline bool operator!=(const ClientId &rhs) const {
+    return !(*this == rhs);
+  }
+  inline bool operator<(const ClientId &rhs) const {
+    if (gid != rhs.gid) {
+      return gid < rhs.gid;
+    } else {
+      return handle < rhs.handle;
+    }
+  }
+};
+
+struct NotifyResponse {
+  std::map<ClientId, bufferlist> acks;
+  std::vector<ClientId> timeouts;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+};
+
+template <typename ImageCtxT>
+struct Traits {
+  typedef librbd::Watcher Watcher;
+};
+
+std::ostream &operator<<(std::ostream &out,
+                         const ClientId &client);
+
+WRITE_CLASS_ENCODER(ClientId);
+WRITE_CLASS_ENCODER(NotifyResponse);
+
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_TYPES_H
diff --git a/src/librbd/watcher/Utils.h b/src/librbd/watcher/Utils.h
new file mode 100644
index 000000000..d2510aaf3
--- /dev/null
+++ b/src/librbd/watcher/Utils.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_WATCHER_UTILS_H
+#define CEPH_LIBRBD_WATCHER_UTILS_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/Context.h"
+#include "librbd/Watcher.h"
+
+namespace ceph { class Formatter; }
+
+namespace librbd {
+namespace watcher {
+namespace util {
+
+template <typename Watcher>
+struct HandlePayloadVisitor : public boost::static_visitor<void> {
+  Watcher *watcher;
+  uint64_t notify_id;
+  uint64_t handle;
+
+  HandlePayloadVisitor(Watcher *watcher_, uint64_t notify_id_,
+                       uint64_t handle_)
+    : watcher(watcher_), notify_id(notify_id_), handle(handle_)
+  {
+  }
+
+  template <typename P>
+  inline void operator()(const P &payload) const {
+    typename Watcher::C_NotifyAck *ctx =
+      new typename Watcher::C_NotifyAck(watcher, notify_id, handle);
+    if (watcher->handle_payload(payload, ctx)) {
+      ctx->complete(0);
+    }
+  }
+};
+
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+  explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+  template <typename P>
+  inline void operator()(const P &payload) const {
+    using ceph::encode;
+    encode(static_cast<uint32_t>(P::NOTIFY_OP), m_bl);
+    payload.encode(m_bl);
+  }
+
+private:
+  bufferlist &m_bl;
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+  DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+    : m_version(version), m_iter(iter) {}
+
+  template <typename P>
+  inline void operator()(P &payload) const {
+    payload.decode(m_version, m_iter);
+  }
+
+private:
+  __u8 m_version;
+  bufferlist::const_iterator &m_iter;
+};
+
+} // namespace util
+} // namespace watcher
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_WATCHER_UTILS_H
-- 
cgit v1.2.3