summaryrefslogtreecommitdiffstats
path: root/src/tools
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:45:59 +0000
commit19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/tools
parentInitial commit. (diff)
downloadceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/tools')
-rw-r--r--src/tools/CMakeLists.txt155
-rw-r--r--src/tools/RadosDump.cc166
-rw-r--r--src/tools/RadosDump.h409
-rw-r--r--src/tools/ceph-client-debug.cc190
-rw-r--r--src/tools/ceph-dencoder/CMakeLists.txt109
-rw-r--r--src/tools/ceph-dencoder/ceph_dencoder.cc272
-rw-r--r--src/tools/ceph-dencoder/ceph_time.h68
-rw-r--r--src/tools/ceph-dencoder/common_types.cc36
-rw-r--r--src/tools/ceph-dencoder/common_types.h457
-rw-r--r--src/tools/ceph-dencoder/denc_plugin.h82
-rw-r--r--src/tools/ceph-dencoder/denc_registry.h241
-rw-r--r--src/tools/ceph-dencoder/mds_types.cc36
-rw-r--r--src/tools/ceph-dencoder/mds_types.h111
-rw-r--r--src/tools/ceph-dencoder/osd_types.cc39
-rw-r--r--src/tools/ceph-dencoder/osd_types.h153
-rw-r--r--src/tools/ceph-dencoder/rbd_types.cc36
-rw-r--r--src/tools/ceph-dencoder/rbd_types.h52
-rw-r--r--src/tools/ceph-dencoder/rgw_types.cc36
-rw-r--r--src/tools/ceph-dencoder/rgw_types.h131
-rw-r--r--src/tools/ceph-dencoder/sstring.h40
-rw-r--r--src/tools/ceph-diff-sorted.cc173
-rw-r--r--src/tools/ceph-lazy/bash_completion.d/ceph-lazy27
-rwxr-xr-xsrc/tools/ceph-lazy/ceph-lazy709
-rwxr-xr-xsrc/tools/ceph-monstore-update-crush.sh174
-rw-r--r--src/tools/ceph_authtool.cc314
-rw-r--r--src/tools/ceph_conf.cc275
-rw-r--r--src/tools/ceph_dedup_tool.cc964
-rw-r--r--src/tools/ceph_kvstore_tool.cc356
-rw-r--r--src/tools/ceph_monstore_tool.cc1335
-rw-r--r--src/tools/ceph_objectstore_tool.cc4684
-rw-r--r--src/tools/ceph_objectstore_tool.h44
-rw-r--r--src/tools/ceph_osdomap_tool.cc211
-rw-r--r--src/tools/cephfs/CMakeLists.txt58
-rw-r--r--src/tools/cephfs/DataScan.cc2239
-rw-r--r--src/tools/cephfs/DataScan.h341
-rw-r--r--src/tools/cephfs/Dumper.cc431
-rw-r--r--src/tools/cephfs/Dumper.h45
-rw-r--r--src/tools/cephfs/EventOutput.cc153
-rw-r--r--src/tools/cephfs/EventOutput.h42
-rw-r--r--src/tools/cephfs/JournalFilter.cc315
-rw-r--r--src/tools/cephfs/JournalFilter.h73
-rw-r--r--src/tools/cephfs/JournalScanner.cc438
-rw-r--r--src/tools/cephfs/JournalScanner.h133
-rw-r--r--src/tools/cephfs/JournalTool.cc1266
-rw-r--r--src/tools/cephfs/JournalTool.h101
-rw-r--r--src/tools/cephfs/MDSUtility.cc155
-rw-r--r--src/tools/cephfs/MDSUtility.h60
-rw-r--r--src/tools/cephfs/MetaTool.cc999
-rw-r--r--src/tools/cephfs/MetaTool.h272
-rw-r--r--src/tools/cephfs/PgFiles.cc194
-rw-r--r--src/tools/cephfs/PgFiles.h51
-rw-r--r--src/tools/cephfs/Resetter.cc220
-rw-r--r--src/tools/cephfs/Resetter.h50
-rw-r--r--src/tools/cephfs/RoleSelector.cc59
-rw-r--r--src/tools/cephfs/RoleSelector.h36
-rw-r--r--src/tools/cephfs/TableTool.cc417
-rw-r--r--src/tools/cephfs/TableTool.h40
-rw-r--r--src/tools/cephfs/cephfs-data-scan.cc47
-rw-r--r--src/tools/cephfs/cephfs-journal-tool.cc58
-rw-r--r--src/tools/cephfs/cephfs-meta-injection.cc97
-rw-r--r--src/tools/cephfs/cephfs-table-tool.cc47
-rw-r--r--src/tools/cephfs/shell/CMakeLists.txt7
-rwxr-xr-xsrc/tools/cephfs/shell/cephfs-shell1684
-rw-r--r--src/tools/cephfs/shell/setup.py27
-rw-r--r--src/tools/cephfs/shell/tox.ini7
-rw-r--r--src/tools/cephfs/top/CMakeLists.txt7
-rwxr-xr-xsrc/tools/cephfs/top/cephfs-top888
-rw-r--r--src/tools/cephfs/top/setup.py25
-rw-r--r--src/tools/cephfs/top/tox.ini7
-rw-r--r--src/tools/cephfs/type_helper.hpp28
-rw-r--r--src/tools/cephfs_mirror/CMakeLists.txt30
-rw-r--r--src/tools/cephfs_mirror/ClusterWatcher.cc182
-rw-r--r--src/tools/cephfs_mirror/ClusterWatcher.h77
-rw-r--r--src/tools/cephfs_mirror/FSMirror.cc441
-rw-r--r--src/tools/cephfs_mirror/FSMirror.h158
-rw-r--r--src/tools/cephfs_mirror/InstanceWatcher.cc251
-rw-r--r--src/tools/cephfs_mirror/InstanceWatcher.h85
-rw-r--r--src/tools/cephfs_mirror/Mirror.cc602
-rw-r--r--src/tools/cephfs_mirror/Mirror.h140
-rw-r--r--src/tools/cephfs_mirror/MirrorWatcher.cc148
-rw-r--r--src/tools/cephfs_mirror/MirrorWatcher.h79
-rw-r--r--src/tools/cephfs_mirror/PeerReplayer.cc1552
-rw-r--r--src/tools/cephfs_mirror/PeerReplayer.h319
-rw-r--r--src/tools/cephfs_mirror/ServiceDaemon.cc225
-rw-r--r--src/tools/cephfs_mirror/ServiceDaemon.h62
-rw-r--r--src/tools/cephfs_mirror/Types.cc21
-rw-r--r--src/tools/cephfs_mirror/Types.h87
-rw-r--r--src/tools/cephfs_mirror/Utils.cc166
-rw-r--r--src/tools/cephfs_mirror/Utils.h22
-rw-r--r--src/tools/cephfs_mirror/Watcher.cc285
-rw-r--r--src/tools/cephfs_mirror/Watcher.h102
-rw-r--r--src/tools/cephfs_mirror/aio_utils.h53
-rw-r--r--src/tools/cephfs_mirror/main.cc124
-rw-r--r--src/tools/cephfs_mirror/watcher/RewatchRequest.cc102
-rw-r--r--src/tools/cephfs_mirror/watcher/RewatchRequest.h60
-rw-r--r--src/tools/crimson/CMakeLists.txt8
-rw-r--r--src/tools/crimson/perf_async_msgr.cc140
-rw-r--r--src/tools/crimson/perf_crimson_msgr.cc746
-rw-r--r--src/tools/crimson/perf_staged_fltree.cc129
-rw-r--r--src/tools/crushtool.cc1328
-rw-r--r--src/tools/erasure-code/CMakeLists.txt5
-rw-r--r--src/tools/erasure-code/ceph-erasure-code-tool.cc322
-rwxr-xr-xsrc/tools/histogram_dump.py174
-rw-r--r--src/tools/immutable_object_cache/CMakeLists.txt19
-rw-r--r--src/tools/immutable_object_cache/CacheClient.cc435
-rw-r--r--src/tools/immutable_object_cache/CacheClient.h84
-rw-r--r--src/tools/immutable_object_cache/CacheController.cc139
-rw-r--r--src/tools/immutable_object_cache/CacheController.h40
-rw-r--r--src/tools/immutable_object_cache/CacheServer.cc106
-rw-r--r--src/tools/immutable_object_cache/CacheServer.h45
-rw-r--r--src/tools/immutable_object_cache/CacheSession.cc140
-rw-r--r--src/tools/immutable_object_cache/CacheSession.h56
-rw-r--r--src/tools/immutable_object_cache/ObjectCacheStore.cc466
-rw-r--r--src/tools/immutable_object_cache/ObjectCacheStore.h85
-rw-r--r--src/tools/immutable_object_cache/Policy.h34
-rw-r--r--src/tools/immutable_object_cache/SimplePolicy.cc216
-rw-r--r--src/tools/immutable_object_cache/SimplePolicy.h68
-rw-r--r--src/tools/immutable_object_cache/SocketCommon.h31
-rw-r--r--src/tools/immutable_object_cache/Types.cc184
-rw-r--r--src/tools/immutable_object_cache/Types.h136
-rw-r--r--src/tools/immutable_object_cache/Utils.h31
-rw-r--r--src/tools/immutable_object_cache/main.cc84
-rw-r--r--src/tools/kvstore_tool.cc316
-rw-r--r--src/tools/kvstore_tool.h80
-rw-r--r--src/tools/monmaptool.cc478
-rw-r--r--src/tools/neorados.cc385
-rw-r--r--src/tools/osdmaptool.cc846
-rw-r--r--src/tools/psim.cc117
-rw-r--r--src/tools/rados/PoolDump.cc169
-rw-r--r--src/tools/rados/PoolDump.h29
-rw-r--r--src/tools/rados/RadosImport.cc399
-rw-r--r--src/tools/rados/RadosImport.h45
-rw-r--r--src/tools/rados/rados.cc4209
-rw-r--r--src/tools/radosacl.cc186
-rw-r--r--src/tools/rbd/ArgumentTypes.cc548
-rw-r--r--src/tools/rbd/ArgumentTypes.h233
-rw-r--r--src/tools/rbd/CMakeLists.txt80
-rw-r--r--src/tools/rbd/IndentStream.cc59
-rw-r--r--src/tools/rbd/IndentStream.h60
-rw-r--r--src/tools/rbd/MirrorDaemonServiceInfo.cc307
-rw-r--r--src/tools/rbd/MirrorDaemonServiceInfo.h78
-rw-r--r--src/tools/rbd/OptionPrinter.cc161
-rw-r--r--src/tools/rbd/OptionPrinter.h43
-rw-r--r--src/tools/rbd/Schedule.cc367
-rw-r--r--src/tools/rbd/Schedule.h67
-rw-r--r--src/tools/rbd/Shell.cc488
-rw-r--r--src/tools/rbd/Shell.h76
-rw-r--r--src/tools/rbd/Utils.cc1136
-rw-r--r--src/tools/rbd/Utils.h242
-rw-r--r--src/tools/rbd/action/Bench.cc588
-rw-r--r--src/tools/rbd/action/Children.cc167
-rw-r--r--src/tools/rbd/action/Clone.cc99
-rw-r--r--src/tools/rbd/action/Config.cc891
-rw-r--r--src/tools/rbd/action/Copy.cc195
-rw-r--r--src/tools/rbd/action/Create.cc261
-rw-r--r--src/tools/rbd/action/Device.cc280
-rw-r--r--src/tools/rbd/action/Diff.cc142
-rw-r--r--src/tools/rbd/action/DiskUsage.cc377
-rw-r--r--src/tools/rbd/action/Encryption.cc120
-rw-r--r--src/tools/rbd/action/Export.cc650
-rw-r--r--src/tools/rbd/action/Feature.cc116
-rw-r--r--src/tools/rbd/action/Flatten.cc74
-rw-r--r--src/tools/rbd/action/Ggate.cc180
-rw-r--r--src/tools/rbd/action/Group.cc912
-rw-r--r--src/tools/rbd/action/ImageMeta.cc345
-rw-r--r--src/tools/rbd/action/Import.cc1033
-rw-r--r--src/tools/rbd/action/Info.cc471
-rw-r--r--src/tools/rbd/action/Journal.cc1251
-rw-r--r--src/tools/rbd/action/Kernel.cc681
-rw-r--r--src/tools/rbd/action/List.cc346
-rw-r--r--src/tools/rbd/action/Lock.cc279
-rw-r--r--src/tools/rbd/action/MergeDiff.cc454
-rw-r--r--src/tools/rbd/action/Migration.cc429
-rw-r--r--src/tools/rbd/action/MirrorImage.cc605
-rw-r--r--src/tools/rbd/action/MirrorPool.cc1772
-rw-r--r--src/tools/rbd/action/MirrorSnapshotSchedule.cc322
-rw-r--r--src/tools/rbd/action/Namespace.cc191
-rw-r--r--src/tools/rbd/action/Nbd.cc369
-rw-r--r--src/tools/rbd/action/ObjectMap.cc131
-rw-r--r--src/tools/rbd/action/Perf.cc717
-rw-r--r--src/tools/rbd/action/PersistentCache.cc122
-rw-r--r--src/tools/rbd/action/Pool.cc162
-rw-r--r--src/tools/rbd/action/Remove.cc161
-rw-r--r--src/tools/rbd/action/Rename.cc94
-rw-r--r--src/tools/rbd/action/Resize.cc106
-rw-r--r--src/tools/rbd/action/Snap.cc972
-rw-r--r--src/tools/rbd/action/Sparsify.cc82
-rw-r--r--src/tools/rbd/action/Status.cc365
-rw-r--r--src/tools/rbd/action/Trash.cc540
-rw-r--r--src/tools/rbd/action/TrashPurgeSchedule.cc355
-rw-r--r--src/tools/rbd/action/Watch.cc149
-rw-r--r--src/tools/rbd/action/Wnbd.cc172
-rw-r--r--src/tools/rbd/rbd.cc10
-rw-r--r--src/tools/rbd_ggate/CMakeLists.txt9
-rw-r--r--src/tools/rbd_ggate/Driver.cc165
-rw-r--r--src/tools/rbd_ggate/Driver.h50
-rw-r--r--src/tools/rbd_ggate/Request.h55
-rw-r--r--src/tools/rbd_ggate/Server.cc262
-rw-r--r--src/tools/rbd_ggate/Server.h88
-rw-r--r--src/tools/rbd_ggate/Watcher.cc48
-rw-r--r--src/tools/rbd_ggate/Watcher.h34
-rw-r--r--src/tools/rbd_ggate/debug.cc55
-rw-r--r--src/tools/rbd_ggate/debug.h17
-rw-r--r--src/tools/rbd_ggate/ggate_drv.c379
-rw-r--r--src/tools/rbd_ggate/ggate_drv.h64
-rw-r--r--src/tools/rbd_ggate/main.cc521
-rw-r--r--src/tools/rbd_mirror/BaseRequest.h33
-rw-r--r--src/tools/rbd_mirror/CMakeLists.txt90
-rw-r--r--src/tools/rbd_mirror/CancelableRequest.h44
-rw-r--r--src/tools/rbd_mirror/ClusterWatcher.cc251
-rw-r--r--src/tools/rbd_mirror/ClusterWatcher.h73
-rw-r--r--src/tools/rbd_mirror/ImageDeleter.cc548
-rw-r--r--src/tools/rbd_mirror/ImageDeleter.h189
-rw-r--r--src/tools/rbd_mirror/ImageMap.cc602
-rw-r--r--src/tools/rbd_mirror/ImageMap.h175
-rw-r--r--src/tools/rbd_mirror/ImageReplayer.cc1190
-rw-r--r--src/tools/rbd_mirror/ImageReplayer.h273
-rw-r--r--src/tools/rbd_mirror/ImageSync.cc469
-rw-r--r--src/tools/rbd_mirror/ImageSync.h151
-rw-r--r--src/tools/rbd_mirror/InstanceReplayer.cc543
-rw-r--r--src/tools/rbd_mirror/InstanceReplayer.h138
-rw-r--r--src/tools/rbd_mirror/InstanceWatcher.cc1290
-rw-r--r--src/tools/rbd_mirror/InstanceWatcher.h269
-rw-r--r--src/tools/rbd_mirror/Instances.cc356
-rw-r--r--src/tools/rbd_mirror/Instances.h168
-rw-r--r--src/tools/rbd_mirror/LeaderWatcher.cc1069
-rw-r--r--src/tools/rbd_mirror/LeaderWatcher.h313
-rw-r--r--src/tools/rbd_mirror/Mirror.cc748
-rw-r--r--src/tools/rbd_mirror/Mirror.h89
-rw-r--r--src/tools/rbd_mirror/MirrorStatusUpdater.cc397
-rw-r--r--src/tools/rbd_mirror/MirrorStatusUpdater.h119
-rw-r--r--src/tools/rbd_mirror/MirrorStatusWatcher.cc74
-rw-r--r--src/tools/rbd_mirror/MirrorStatusWatcher.h43
-rw-r--r--src/tools/rbd_mirror/NamespaceReplayer.cc862
-rw-r--r--src/tools/rbd_mirror/NamespaceReplayer.h308
-rw-r--r--src/tools/rbd_mirror/PoolMetaCache.cc83
-rw-r--r--src/tools/rbd_mirror/PoolMetaCache.h47
-rw-r--r--src/tools/rbd_mirror/PoolReplayer.cc1109
-rw-r--r--src/tools/rbd_mirror/PoolReplayer.h288
-rw-r--r--src/tools/rbd_mirror/PoolWatcher.cc473
-rw-r--r--src/tools/rbd_mirror/PoolWatcher.h161
-rw-r--r--src/tools/rbd_mirror/ProgressContext.h21
-rw-r--r--src/tools/rbd_mirror/RemotePoolPoller.cc267
-rw-r--r--src/tools/rbd_mirror/RemotePoolPoller.h133
-rw-r--r--src/tools/rbd_mirror/ServiceDaemon.cc327
-rw-r--r--src/tools/rbd_mirror/ServiceDaemon.h94
-rw-r--r--src/tools/rbd_mirror/Threads.cc38
-rw-r--r--src/tools/rbd_mirror/Threads.h45
-rw-r--r--src/tools/rbd_mirror/Throttler.cc240
-rw-r--r--src/tools/rbd_mirror/Throttler.h74
-rw-r--r--src/tools/rbd_mirror/Types.cc32
-rw-r--r--src/tools/rbd_mirror/Types.h166
-rw-r--r--src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc299
-rw-r--r--src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h105
-rw-r--r--src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc419
-rw-r--r--src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h142
-rw-r--r--src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc265
-rw-r--r--src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h117
-rw-r--r--src/tools/rbd_mirror/image_deleter/TrashWatcher.cc384
-rw-r--r--src/tools/rbd_mirror/image_deleter/TrashWatcher.h139
-rw-r--r--src/tools/rbd_mirror/image_deleter/Types.h54
-rw-r--r--src/tools/rbd_mirror/image_map/LoadRequest.cc174
-rw-r--r--src/tools/rbd_mirror/image_map/LoadRequest.h77
-rw-r--r--src/tools/rbd_mirror/image_map/Policy.cc407
-rw-r--r--src/tools/rbd_mirror/image_map/Policy.h123
-rw-r--r--src/tools/rbd_mirror/image_map/SimplePolicy.cc89
-rw-r--r--src/tools/rbd_mirror/image_map/SimplePolicy.h39
-rw-r--r--src/tools/rbd_mirror/image_map/StateTransition.cc94
-rw-r--r--src/tools/rbd_mirror/image_map/StateTransition.h76
-rw-r--r--src/tools/rbd_mirror/image_map/Types.cc138
-rw-r--r--src/tools/rbd_mirror/image_map/Types.h130
-rw-r--r--src/tools/rbd_mirror/image_map/UpdateRequest.cc100
-rw-r--r--src/tools/rbd_mirror/image_map/UpdateRequest.h65
-rw-r--r--src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc485
-rw-r--r--src/tools/rbd_mirror/image_replayer/BootstrapRequest.h181
-rw-r--r--src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc62
-rw-r--r--src/tools/rbd_mirror/image_replayer/CloseImageRequest.h56
-rw-r--r--src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc451
-rw-r--r--src/tools/rbd_mirror/image_replayer/CreateImageRequest.h144
-rw-r--r--src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc85
-rw-r--r--src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h75
-rw-r--r--src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc79
-rw-r--r--src/tools/rbd_mirror/image_replayer/OpenImageRequest.h71
-rw-r--r--src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc292
-rw-r--r--src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h97
-rw-r--r--src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc197
-rw-r--r--src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h115
-rw-r--r--src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc283
-rw-r--r--src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h153
-rw-r--r--src/tools/rbd_mirror/image_replayer/Replayer.h39
-rw-r--r--src/tools/rbd_mirror/image_replayer/ReplayerListener.h21
-rw-r--r--src/tools/rbd_mirror/image_replayer/StateBuilder.cc138
-rw-r--r--src/tools/rbd_mirror/image_replayer/StateBuilder.h114
-rw-r--r--src/tools/rbd_mirror/image_replayer/TimeRollingMean.cc34
-rw-r--r--src/tools/rbd_mirror/image_replayer/TimeRollingMean.h40
-rw-r--r--src/tools/rbd_mirror/image_replayer/Types.h21
-rw-r--r--src/tools/rbd_mirror/image_replayer/Utils.cc61
-rw-r--r--src/tools/rbd_mirror/image_replayer/Utils.h29
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.cc162
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.h116
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.cc206
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.h127
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.cc316
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.h115
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.cc284
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.h70
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/Replayer.cc1303
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/Replayer.h323
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/StateBuilder.cc149
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/StateBuilder.h94
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.cc109
-rw-r--r--src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h55
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.cc658
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.h155
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.cc204
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.h121
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.cc70
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.h92
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc1586
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/Replayer.h346
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.cc120
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h93
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/Utils.cc65
-rw-r--r--src/tools/rbd_mirror/image_replayer/snapshot/Utils.h30
-rw-r--r--src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc172
-rw-r--r--src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h93
-rw-r--r--src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc213
-rw-r--r--src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h91
-rw-r--r--src/tools/rbd_mirror/image_sync/Types.h74
-rw-r--r--src/tools/rbd_mirror/image_sync/Utils.cc24
-rw-r--r--src/tools/rbd_mirror/image_sync/Utils.h16
-rw-r--r--src/tools/rbd_mirror/instance_watcher/Types.cc245
-rw-r--r--src/tools/rbd_mirror/instance_watcher/Types.h197
-rw-r--r--src/tools/rbd_mirror/instances/Types.h28
-rw-r--r--src/tools/rbd_mirror/leader_watcher/Types.cc161
-rw-r--r--src/tools/rbd_mirror/leader_watcher/Types.h117
-rw-r--r--src/tools/rbd_mirror/main.cc123
-rw-r--r--src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc89
-rw-r--r--src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h73
-rw-r--r--src/tools/rbd_mirror/pool_watcher/Types.h27
-rw-r--r--src/tools/rbd_mirror/service_daemon/Types.cc29
-rw-r--r--src/tools/rbd_mirror/service_daemon/Types.h33
-rw-r--r--src/tools/rbd_nbd/CMakeLists.txt4
-rw-r--r--src/tools/rbd_nbd/nbd-netlink.h100
-rw-r--r--src/tools/rbd_nbd/rbd-nbd.cc2304
-rwxr-xr-xsrc/tools/rbd_nbd/rbd-nbd_quiesce31
-rw-r--r--src/tools/rbd_recover_tool/FAQ16
-rw-r--r--src/tools/rbd_recover_tool/README97
-rw-r--r--src/tools/rbd_recover_tool/TODO2
-rw-r--r--src/tools/rbd_recover_tool/common_h412
-rw-r--r--src/tools/rbd_recover_tool/config/mds_host0
-rw-r--r--src/tools/rbd_recover_tool/config/mon_host0
-rw-r--r--src/tools/rbd_recover_tool/config/osd_host_path0
-rw-r--r--src/tools/rbd_recover_tool/database_h1134
-rw-r--r--src/tools/rbd_recover_tool/epoch_h119
-rw-r--r--src/tools/rbd_recover_tool/metadata_h368
-rwxr-xr-xsrc/tools/rbd_recover_tool/osd_job170
-rwxr-xr-xsrc/tools/rbd_recover_tool/rbd-recover-tool327
-rwxr-xr-xsrc/tools/rbd_recover_tool/test_rbd_recover_tool.sh542
-rw-r--r--src/tools/rbd_wnbd/CMakeLists.txt10
-rw-r--r--src/tools/rbd_wnbd/rbd_wnbd.cc1690
-rw-r--r--src/tools/rbd_wnbd/rbd_wnbd.h192
-rw-r--r--src/tools/rbd_wnbd/wnbd_handler.cc430
-rw-r--r--src/tools/rbd_wnbd/wnbd_handler.h186
-rw-r--r--src/tools/rebuild_mondb.cc351
-rw-r--r--src/tools/rebuild_mondb.h9
-rwxr-xr-xsrc/tools/rgw/parse-cr-dump.py168
-rw-r--r--src/tools/scratchtool.c319
-rw-r--r--src/tools/scratchtoolpp.cc293
-rwxr-xr-xsrc/tools/setup-virtualenv.sh101
370 files changed, 103895 insertions, 0 deletions
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
new file mode 100644
index 000000000..53314d138
--- /dev/null
+++ b/src/tools/CMakeLists.txt
@@ -0,0 +1,155 @@
+# Build rules for the standalone Ceph command-line tools (rados, crushtool,
+# monmaptool, osdmaptool, ceph-objectstore-tool, ...).
+set(rados_srcs
+ rados/rados.cc
+ RadosDump.cc
+ rados/RadosImport.cc
+ rados/PoolDump.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/common/obj_bencher.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ECUtil.cc)
+add_executable(rados ${rados_srcs})
+
+target_link_libraries(rados librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+if(WITH_LIBRADOSSTRIPER)
+ target_link_libraries(rados radosstriper)
+else()
+ target_link_libraries(rados cls_lock_client)
+endif()
+install(TARGETS rados DESTINATION bin)
+
+if(NOT WIN32)
+ set(neorados_srcs
+ neorados.cc)
+ add_executable(neorados ${neorados_srcs})
+ target_link_libraries(neorados libneorados spawn fmt::fmt ${CMAKE_DL_LIBS})
+ # neorados is built but deliberately not installed.
+ #install(TARGETS neorados DESTINATION bin)
+endif()
+
+# Test-only helper binaries.
+if(WITH_TESTS)
+add_executable(ceph_scratchtool scratchtool.c)
+target_link_libraries(ceph_scratchtool librados global)
+install(TARGETS ceph_scratchtool DESTINATION bin)
+
+add_executable(ceph_scratchtoolpp scratchtoolpp.cc)
+target_link_libraries(ceph_scratchtoolpp librados global)
+install(TARGETS ceph_scratchtoolpp DESTINATION bin)
+
+add_executable(ceph_radosacl radosacl.cc)
+target_link_libraries(ceph_radosacl librados global)
+install(TARGETS ceph_radosacl DESTINATION bin)
+
+install(PROGRAMS
+ ceph-monstore-update-crush.sh
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/ceph)
+endif(WITH_TESTS)
+
+add_executable(ceph-osdomap-tool ceph_osdomap_tool.cc)
+target_link_libraries(ceph-osdomap-tool os global Boost::program_options)
+install(TARGETS ceph-osdomap-tool DESTINATION bin)
+
+add_executable(ceph-monstore-tool
+ ceph_monstore_tool.cc
+ ../auth/cephx/CephxKeyServer.cc
+ ../mgr/mgr_commands.cc)
+target_link_libraries(ceph-monstore-tool os global Boost::program_options)
+install(TARGETS ceph-monstore-tool DESTINATION bin)
+
+add_executable(ceph-objectstore-tool
+ ceph_objectstore_tool.cc
+ rebuild_mondb.cc
+ RadosDump.cc)
+target_link_libraries(ceph-objectstore-tool osd os global Boost::program_options ${CMAKE_DL_LIBS})
+if(WITH_FUSE)
+ target_link_libraries(ceph-objectstore-tool FUSE::FUSE)
+endif(WITH_FUSE)
+install(TARGETS ceph-objectstore-tool DESTINATION bin)
+
+if(WITH_LIBCEPHFS)
+if(WITH_TESTS)
+ add_executable(ceph-client-debug ceph-client-debug.cc)
+ target_link_libraries(ceph-client-debug cephfs global client)
+ install(TARGETS ceph-client-debug DESTINATION bin)
+endif(WITH_TESTS)
+endif(WITH_LIBCEPHFS)
+
+add_executable(ceph-kvstore-tool
+ kvstore_tool.cc
+ ceph_kvstore_tool.cc)
+target_link_libraries(ceph-kvstore-tool os global)
+install(TARGETS ceph-kvstore-tool DESTINATION bin)
+
+set(ceph_conf_srcs ceph_conf.cc)
+add_executable(ceph-conf ${ceph_conf_srcs})
+target_link_libraries(ceph-conf global)
+install(TARGETS ceph-conf DESTINATION bin)
+
+set(crushtool_srcs crushtool.cc)
+add_executable(crushtool ${crushtool_srcs})
+target_link_libraries(crushtool global)
+install(TARGETS crushtool DESTINATION bin)
+
+set(monmaptool_srcs monmaptool.cc)
+add_executable(monmaptool ${monmaptool_srcs})
+target_link_libraries(monmaptool global)
+install(TARGETS monmaptool DESTINATION bin)
+
+set(osdmaptool_srcs osdmaptool.cc)
+add_executable(osdmaptool ${osdmaptool_srcs})
+target_link_libraries(osdmaptool global)
+install(TARGETS osdmaptool DESTINATION bin)
+
+set(ceph-diff-sorted_srcs ceph-diff-sorted.cc)
+add_executable(ceph-diff-sorted ${ceph-diff-sorted_srcs})
+# ceph-diff-sorted links no in-tree libraries, so strip any RPATH.
+set_target_properties(ceph-diff-sorted PROPERTIES
+ SKIP_RPATH TRUE
+ INSTALL_RPATH "")
+install(TARGETS ceph-diff-sorted DESTINATION bin)
+
+if(WITH_TESTS)
+set(ceph_psim_srcs psim.cc)
+add_executable(ceph_psim ${ceph_psim_srcs})
+target_link_libraries(ceph_psim global)
+install(TARGETS ceph_psim DESTINATION bin)
+endif(WITH_TESTS)
+
+set(ceph_authtool_srcs ceph_authtool.cc)
+add_executable(ceph-authtool ${ceph_authtool_srcs})
+target_link_libraries(ceph-authtool global ${EXTRALIBS} ${CRYPTO_LIBS})
+install(TARGETS ceph-authtool DESTINATION bin)
+
+if(WITH_TESTS)
+set(ceph_dedup_tool_srcs ceph_dedup_tool.cc)
+add_executable(ceph-dedup-tool ${ceph_dedup_tool_srcs})
+target_link_libraries(ceph-dedup-tool
+ librados
+ global
+ cls_cas_client
+ cls_cas_internal)
+install(TARGETS ceph-dedup-tool DESTINATION bin)
+endif(WITH_TESTS)
+
+# Per-subsystem tool trees, gated on the corresponding build options.
+if(WITH_CEPHFS)
+ add_subdirectory(cephfs)
+ add_subdirectory(cephfs_mirror)
+endif(WITH_CEPHFS)
+
+if(WITH_RBD)
+ add_subdirectory(rbd)
+ add_subdirectory(rbd_mirror)
+ if(LINUX)
+ add_subdirectory(rbd_nbd)
+ endif()
+ if(WIN32)
+ add_subdirectory(rbd_wnbd)
+ endif()
+ if(FREEBSD)
+ add_subdirectory(rbd_ggate)
+ endif()
+endif(WITH_RBD)
+
+if(WITH_SEASTAR)
+ add_subdirectory(crimson)
+endif()
+
+add_subdirectory(immutable_object_cache)
+add_subdirectory(ceph-dencoder)
+add_subdirectory(erasure-code)
diff --git a/src/tools/RadosDump.cc b/src/tools/RadosDump.cc
new file mode 100644
index 000000000..420cd9fc6
--- /dev/null
+++ b/src/tools/RadosDump.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "RadosDump.h"
+
+// Read and decode the fixed-length super_header from file_fd into this->sh.
+// Returns 0 on success, -EFAULT on a short read (truncated export stream).
+int RadosDump::read_super()
+{
+ bufferlist ebl;
+ auto ebliter = ebl.cbegin();
+ ssize_t bytes;
+
+ bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH);
+ // Anything less than the full fixed-length header means the stream ended early.
+ if ((size_t)bytes != super_header::FIXED_LENGTH) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ sh.decode(ebliter);
+
+ return 0;
+}
+
+
+// Read and decode one section header (sh.header_size bytes) from file_fd
+// into *h. Returns 0 on success, -EFAULT on a short read.
+int RadosDump::get_header(header *h)
+{
+ // Use ceph_assert for consistency with get_footer()/write_super(); plain
+ // assert() is compiled out under NDEBUG and would skip the NULL check.
+ ceph_assert(h != NULL);
+
+ bufferlist ebl;
+ auto ebliter = ebl.cbegin();
+ ssize_t bytes;
+
+ bytes = ebl.read_fd(file_fd, sh.header_size);
+ // A short read means the export stream was truncated mid-header.
+ if ((size_t)bytes != sh.header_size) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ h->decode(ebliter);
+
+ return 0;
+}
+
+// Read and decode one section footer (sh.footer_size bytes) from file_fd
+// into *f and validate its magic. Returns 0 on success, -EFAULT on a short
+// read or a magic mismatch.
+int RadosDump::get_footer(footer *f)
+{
+ ceph_assert(f != NULL);
+
+ bufferlist ebl;
+ auto ebliter = ebl.cbegin();
+ ssize_t bytes;
+
+ bytes = ebl.read_fd(file_fd, sh.footer_size);
+ if ((size_t)bytes != sh.footer_size) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ f->decode(ebliter);
+
+ // endmagic acts as a stream-integrity check between sections.
+ if (f->magic != endmagic) {
+ cerr << "Bad footer magic" << std::endl;
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+// Read the next section from file_fd: its header, its payload (replacing the
+// contents of *bl), and — for non-empty payloads — the trailing footer.
+// *type receives the section type from the header. Returns 0 on success, or
+// the error from get_header()/get_footer(), or -EFAULT on a short payload read.
+int RadosDump::read_section(sectiontype_t *type, bufferlist *bl)
+{
+ header hdr;
+ ssize_t bytes;
+
+ int ret = get_header(&hdr);
+ if (ret)
+ return ret;
+
+ *type = hdr.type;
+
+ bl->clear();
+ bytes = bl->read_fd(file_fd, hdr.size);
+ if (bytes != hdr.size) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ // Only sections with a non-empty payload are followed by a footer.
+ if (hdr.size > 0) {
+ footer ft;
+ ret = get_footer(&ft);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+
+// Consume and discard sections from the stream until TYPE_OBJECT_END is seen.
+// Section types at or beyond END_OF_TYPES are skipped with a notice; any
+// other unexpected type is an error (-EFAULT). Returns 0 on success.
+// NOTE(review): the 'bl' parameter is not used by this implementation — all
+// payloads are read into the local 'ebl'. Confirm whether callers rely on it.
+int RadosDump::skip_object(bufferlist &bl)
+{
+ bufferlist ebl;
+ bool done = false;
+ while(!done) {
+ sectiontype_t type;
+ int ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ // Types newer than this tool knows about are tolerated and skipped.
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown object section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_DATA:
+ case TYPE_ATTRS:
+ case TYPE_OMAP_HDR:
+ case TYPE_OMAP:
+#ifdef DIAGNOSTIC
+ cerr << "Skip type " << (int)type << std::endl;
+#endif
+ break;
+ case TYPE_OBJECT_END:
+ done = true;
+ break;
+ default:
+ cerr << "Can't skip unknown type: " << type << std::endl;
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+//Write super_header with its fixed 16 byte length
+void RadosDump::write_super()
+{
+ if (dry_run) {
+ return;
+ }
+
+ bufferlist superbl;
+ super_header sh;
+ footer ft;
+
+ header hdr(TYPE_NONE, 0);
+ hdr.encode(superbl);
+
+ sh.magic = super_header::super_magic;
+ sh.version = super_header::super_ver;
+ sh.header_size = superbl.length();
+ superbl.clear();
+ ft.encode(superbl);
+ sh.footer_size = superbl.length();
+ superbl.clear();
+
+ sh.encode(superbl);
+ ceph_assert(super_header::FIXED_LENGTH == superbl.length());
+ superbl.write_fd(file_fd);
+}
diff --git a/src/tools/RadosDump.h b/src/tools/RadosDump.h
new file mode 100644
index 000000000..83f02e69d
--- /dev/null
+++ b/src/tools/RadosDump.h
@@ -0,0 +1,409 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef RADOS_DUMP_H_
+#define RADOS_DUMP_H_
+
+#include <stdint.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+
+#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
+
+typedef uint8_t sectiontype_t;
+typedef uint32_t mymagic_t;
+typedef int64_t mysize_t;
+
+enum {
+ TYPE_NONE = 0,
+ TYPE_PG_BEGIN,
+ TYPE_PG_END,
+ TYPE_OBJECT_BEGIN,
+ TYPE_OBJECT_END,
+ TYPE_DATA,
+ TYPE_ATTRS,
+ TYPE_OMAP_HDR,
+ TYPE_OMAP,
+ TYPE_PG_METADATA,
+ TYPE_POOL_BEGIN,
+ TYPE_POOL_END,
+ END_OF_TYPES, //Keep at the end
+};
+
+const uint16_t shortmagic = 0xffce; //goes into stream as "ceff"
+//endmagic goes into stream as "ceff ffec"
+const mymagic_t endmagic = (0xecff << 16) | shortmagic;
+
+//The first FIXED_LENGTH bytes are a fixed
+//portion of the export output. This includes the overall
+//version number, and size of header and footer.
+//THIS STRUCTURE CAN ONLY BE APPENDED TO. If it needs to expand,
+//the version can be bumped and then anything
+//can be added to the export format.
+struct super_header {
+ static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
+ // ver = 1, Initial version
+ // ver = 2, Add OSDSuperblock to pg_begin
+ static const uint32_t super_ver = 2;
+ static const uint32_t FIXED_LENGTH = 16;
+ uint32_t magic;
+ uint32_t version;
+ uint32_t header_size;
+ uint32_t footer_size;
+
+ super_header() : magic(0), version(0), header_size(0), footer_size(0) { }
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(magic, bl);
+ encode(version, bl);
+ encode(header_size, bl);
+ encode(footer_size, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(magic, bl);
+ decode(version, bl);
+ decode(header_size, bl);
+ decode(footer_size, bl);
+ }
+};
+
+struct header {
+ sectiontype_t type;
+ mysize_t size;
+ header(sectiontype_t type, mysize_t size) :
+ type(type), size(size) { }
+ header(): type(0), size(0) { }
+
+ void encode(bufferlist& bl) const {
+ uint32_t debug_type = (type << 24) | (type << 16) | shortmagic;
+ ENCODE_START(1, 1, bl);
+ encode(debug_type, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ uint32_t debug_type;
+ DECODE_START(1, bl);
+ decode(debug_type, bl);
+ type = debug_type >> 24;
+ decode(size, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct footer {
+ mymagic_t magic;
+ footer() : magic(endmagic) { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(magic, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(magic, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct pg_begin {
+ spg_t pgid;
+ OSDSuperblock superblock;
+
+ pg_begin(spg_t pg, const OSDSuperblock& sb):
+ pgid(pg), superblock(sb) { }
+ pg_begin() { }
+
+ void encode(bufferlist& bl) const {
+ // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+ // shard will be NO_SHARD for a replicated pool. This means
+ // that we allow the decode by struct_v 2.
+ ENCODE_START(3, 2, bl);
+ encode(pgid.pgid, bl);
+ encode(superblock, bl);
+ encode(pgid.shard, bl);
+ ENCODE_FINISH(bl);
+ }
+ // NOTE: New super_ver prevents decode from ver 1
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(pgid.pgid, bl);
+ if (struct_v > 1) {
+ decode(superblock, bl);
+ }
+ if (struct_v > 2) {
+ decode(pgid.shard, bl);
+ } else {
+ pgid.shard = shard_id_t::NO_SHARD;
+ }
+ DECODE_FINISH(bl);
+ }
+};
+
+struct object_begin {
+ ghobject_t hoid;
+
+ // Duplicate what is in the OI_ATTR so we have it at the start
+ // of object processing.
+ object_info_t oi;
+
+ explicit object_begin(const ghobject_t &hoid): hoid(hoid) { }
+ object_begin() { }
+
+ // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+ // generation will be NO_GEN, shard_id will be NO_SHARD for a replicated
+ // pool. This means we will allow the decode by struct_v 1.
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(hoid.hobj, bl);
+ encode(hoid.generation, bl);
+ encode(hoid.shard_id, bl);
+ encode(oi, bl, -1); /* FIXME: we always encode with full features */
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(hoid.hobj, bl);
+ if (struct_v > 1) {
+ decode(hoid.generation, bl);
+ decode(hoid.shard_id, bl);
+ } else {
+ hoid.generation = ghobject_t::NO_GEN;
+ hoid.shard_id = shard_id_t::NO_SHARD;
+ }
+ if (struct_v > 2) {
+ decode(oi, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+};
+
+struct data_section {
+ uint64_t offset;
+ uint64_t len;
+ bufferlist databl;
+ data_section(uint64_t offset, uint64_t len, bufferlist bl):
+ offset(offset), len(len), databl(bl) { }
+ data_section(): offset(0), len(0) { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(offset, bl);
+ encode(len, bl);
+ encode(databl, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(offset, bl);
+ decode(len, bl);
+ decode(databl, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct attr_section {
+ map<string,bufferlist> data;
+ explicit attr_section(const map<string,bufferlist> &data) : data(data) { }
+ explicit attr_section(map<string, bufferptr> &data_)
+ {
+ for (std::map<std::string, bufferptr>::iterator i = data_.begin();
+ i != data_.end(); ++i) {
+ bufferlist bl;
+ bl.push_back(i->second);
+ data[i->first] = bl;
+ }
+ }
+
+ attr_section() { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(data, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct omap_hdr_section {
+ bufferlist hdr;
+ explicit omap_hdr_section(bufferlist hdr) : hdr(hdr) { }
+ omap_hdr_section() { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(hdr, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(hdr, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct omap_section {
+ map<string, bufferlist> omap;
+ explicit omap_section(const map<string, bufferlist> &omap) :
+ omap(omap) { }
+ omap_section() { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(omap, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(omap, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct metadata_section {
+ // struct_ver is the on-disk version of original pg
+ __u8 struct_ver; // for reference
+ epoch_t map_epoch;
+ pg_info_t info;
+ pg_log_t log;
+ PastIntervals past_intervals;
+ OSDMap osdmap;
+ bufferlist osdmap_bl; // Used in lieu of encoding osdmap due to crc checking
+ map<eversion_t, hobject_t> divergent_priors;
+ pg_missing_t missing;
+
+ metadata_section(
+ __u8 struct_ver,
+ epoch_t map_epoch,
+ const pg_info_t &info,
+ const pg_log_t &log,
+ const PastIntervals &past_intervals,
+ const pg_missing_t &missing)
+ : struct_ver(struct_ver),
+ map_epoch(map_epoch),
+ info(info),
+ log(log),
+ past_intervals(past_intervals),
+ missing(missing) {}
+ metadata_section()
+ : struct_ver(0),
+ map_epoch(0) { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(6, 6, bl);
+ encode(struct_ver, bl);
+ encode(map_epoch, bl);
+ encode(info, bl);
+ encode(log, bl);
+ encode(past_intervals, bl);
+ // Equivalent to osdmap.encode(bl, features); but
+ // preserving exact layout for CRC checking.
+ bl.append(osdmap_bl);
+ encode(divergent_priors, bl);
+ encode(missing, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(6, bl);
+ decode(struct_ver, bl);
+ decode(map_epoch, bl);
+ decode(info, bl);
+ decode(log, bl);
+ if (struct_v >= 6) {
+ decode(past_intervals, bl);
+ } else if (struct_v > 1) {
+ cout << "NOTICE: Older export with classic past_intervals" << std::endl;
+ } else {
+ cout << "NOTICE: Older export without past_intervals" << std::endl;
+ }
+ if (struct_v > 2) {
+ osdmap.decode(bl);
+ } else {
+ cout << "WARNING: Older export without OSDMap information" << std::endl;
+ }
+ if (struct_v > 3) {
+ decode(divergent_priors, bl);
+ }
+ if (struct_v > 4) {
+ decode(missing, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+};
+
+/**
+ * Superclass for classes that will need to handle a serialized RADOS
+ * dump. Requires that the serialized dump be opened with a known FD.
+ */
+class RadosDump
+{
+ protected:
+ int file_fd;
+ super_header sh;
+ bool dry_run;
+
+ public:
+ RadosDump(int file_fd_, bool dry_run_)
+ : file_fd(file_fd_), dry_run(dry_run_)
+ {}
+
+ int read_super();
+ int get_header(header *h);
+ int get_footer(footer *f);
+ int read_section(sectiontype_t *type, bufferlist *bl);
+ int skip_object(bufferlist &bl);
+ void write_super();
+
+ // Define this in .h because it's templated
+ template <typename T>
+ int write_section(sectiontype_t type, const T& obj, int fd) {
+ if (dry_run)
+ return 0;
+ bufferlist blhdr, bl, blftr;
+ obj.encode(bl);
+ header hdr(type, bl.length());
+ hdr.encode(blhdr);
+ footer ft;
+ ft.encode(blftr);
+
+ int ret = blhdr.write_fd(fd);
+ if (ret) return ret;
+ ret = bl.write_fd(fd);
+ if (ret) return ret;
+ ret = blftr.write_fd(fd);
+ return ret;
+ }
+
+ int write_simple(sectiontype_t type, int fd)
+ {
+ if (dry_run)
+ return 0;
+ bufferlist hbl;
+
+ header hdr(type, 0);
+ hdr.encode(hbl);
+ return hbl.write_fd(fd);
+ }
+};
+
+#endif
diff --git a/src/tools/ceph-client-debug.cc b/src/tools/ceph-client-debug.cc
new file mode 100644
index 000000000..7a43c9c2a
--- /dev/null
+++ b/src/tools/ceph-client-debug.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "client/Inode.h"
+#include "client/Dentry.h"
+#include "client/Dir.h"
+#include "include/cephfs/libcephfs.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_client
+
+void usage()
+{
+ std::cout << "Usage: ceph-client-debug [options] <inode number>" << std::endl;
+ generic_client_usage();
+}
+
+
+/**
+ * Given an inode, look up the path from the Client cache: assumes
+ * client cache is fully populated.
+ */
+void traverse_dentries(Inode *ino, std::vector<Dentry*> &parts)
+{
+ if (ino->dentries.empty()) {
+ return;
+ }
+
+ Dentry* dn = *(ino->dentries.begin());
+ parts.push_back(dn);
+ traverse_dentries(dn->dir->parent_inode, parts);
+}
+
+
+/**
+ * Given an inode, send lookup requests to the MDS for
+ * all its ancestors, such that the full trace will be
+ * populated in client cache.
+ */
+int lookup_trace(ceph_mount_info *client, inodeno_t const ino)
+{
+ Inode *inode;
+ int r = ceph_ll_lookup_inode(client, ino, &inode);
+ if (r != 0) {
+ return r;
+ } else {
+ if (!inode->dentries.empty()) {
+ Dentry *dn = *(inode->dentries.begin());
+ ceph_assert(dn->dir);
+ ceph_assert(dn->dir->parent_inode);
+ r = lookup_trace(client, dn->dir->parent_inode->ino);
+ if (r) {
+ return r;
+ }
+ } else {
+ // We reached the root of the tree
+ ceph_assert(inode->ino == CEPH_INO_ROOT);
+ }
+ }
+
+ return r;
+}
+
+
+int main(int argc, const char **argv)
+{
+ // Argument handling
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS|
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+ common_init_finish(g_ceph_context);
+
+ // Expect exactly one positional argument (inode number)
+ if (args.size() != 1) {
+    cerr << "missing positional argument (inode number)" << std::endl;
+ exit(1);
+ }
+ char const *inode_str = args[0];
+ inodeno_t inode = strtoll(inode_str, NULL, 0);
+ if (inode <= 0) {
+ derr << "Invalid inode: " << inode_str << dendl;
+ return -1;
+ }
+
+ // Initialize filesystem client
+ struct ceph_mount_info *client;
+ int r = ceph_create_with_context(&client, g_ceph_context);
+ if (r) {
+ derr << "Error initializing libcephfs: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_mount(client, "/");
+ if (r) {
+ derr << "Error mounting: " << cpp_strerror(r) << dendl;
+ ceph_shutdown(client);
+ return r;
+ }
+
+
+ // Populate client cache with inode of interest & ancestors
+ r = lookup_trace(client, inode);
+ if (r) {
+ derr << "Error looking up inode " << std::hex << inode << std::dec <<
+ ": " << cpp_strerror(r) << dendl;
+ return -1;
+ }
+
+ // Retrieve inode of interest
+ struct vinodeno_t vinode;
+ vinode.ino = inode;
+ vinode.snapid = CEPH_NOSNAP;
+ Inode *ino = ceph_ll_get_inode(client, vinode);
+
+ // Retrieve dentry trace
+ std::vector<Dentry*> path;
+ traverse_dentries(ino, path);
+
+ // Print inode and path as a JSON object
+ JSONFormatter jf(true);
+ jf.open_object_section("client_debug");
+ {
+ jf.open_object_section("inode");
+ {
+ ino->dump(&jf);
+ }
+ jf.close_section(); // inode
+ jf.open_array_section("path");
+ {
+ for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
+ jf.open_object_section("dentry");
+ {
+ (*p)->dump(&jf);
+ }
+ jf.close_section(); // dentry
+ }
+ }
+ jf.close_section(); // path
+ }
+ jf.close_section(); // client_debug
+ jf.flush(std::cout);
+ std::cout << std::endl;
+
+ // Release Inode references
+ ceph_ll_forget(client, ino, 1);
+ for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
+ ceph_ll_forget(client, (*p)->inode.get(), 1);
+ }
+ ino = NULL;
+ path.clear();
+
+ // Shut down
+ r = ceph_unmount(client);
+ if (r) {
+    derr << "Error unmounting: " << cpp_strerror(r) << dendl;
+ }
+ ceph_shutdown(client);
+
+ return r;
+}
diff --git a/src/tools/ceph-dencoder/CMakeLists.txt b/src/tools/ceph-dencoder/CMakeLists.txt
new file mode 100644
index 000000000..a92ac5e69
--- /dev/null
+++ b/src/tools/ceph-dencoder/CMakeLists.txt
@@ -0,0 +1,109 @@
+## dencoder
+set_source_files_properties(
+ ceph_dencoder.cc
+ APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h)
+
+if(HAS_VTA)
+ set_source_files_properties(ceph_dencoder.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+
+set(dencoder_srcs
+ ceph_dencoder.cc
+ ../../include/uuid.cc
+ ../../include/utime.cc
+ $<TARGET_OBJECTS:common_texttable_obj>)
+add_executable(ceph-dencoder ${dencoder_srcs})
+
+set(denc_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/denc)
+add_custom_target(ceph-dencoder-modules)
+
+function(add_denc_mod name)
+ add_library(${name} SHARED
+ ${ARGN})
+ set_target_properties(${name} PROPERTIES
+ PREFIX ""
+ OUTPUT_NAME ${name}
+ CXX_VISIBILITY_PRESET hidden
+ VISIBILITY_INLINES_HIDDEN ON)
+ install(
+ TARGETS ${name}
+ DESTINATION ${denc_plugin_dir})
+ add_dependencies(ceph-dencoder-modules
+ ${name})
+endfunction()
+
+add_denc_mod(denc-mod-common
+ common_types.cc)
+target_link_libraries(denc-mod-common
+ journal
+ cls_cas_internal
+ cls_lock_client
+ cls_refcount_client
+ cls_timeindex_client)
+add_denc_mod(denc-mod-osd
+ osd_types.cc)
+target_link_libraries(denc-mod-osd
+ os
+ osd
+ mon
+ erasure_code)
+
+if(WITH_RADOSGW)
+ add_denc_mod(denc-mod-rgw
+ rgw_types.cc
+ ${CMAKE_SOURCE_DIR}/src/rgw/rgw_dencoder.cc)
+ target_link_libraries(denc-mod-rgw
+ rgw_a
+ cls_rgw_client
+ cls_journal_client)
+ if(WITH_RADOSGW_AMQP_ENDPOINT)
+ target_link_libraries(denc-mod-rgw
+ rabbitmq ssl)
+ endif()
+ if(WITH_RADOSGW_KAFKA_ENDPOINT)
+ target_link_libraries(denc-mod-rgw
+ rdkafka)
+ endif()
+ if(WITH_RADOSGW_LUA_PACKAGES)
+ target_link_libraries(denc-mod-rgw
+ Boost::filesystem)
+ endif()
+endif()
+
+if(WITH_RBD)
+ add_denc_mod(denc-mod-rbd
+ rbd_types.cc)
+ target_link_libraries(denc-mod-rbd
+ cls_rbd_client
+ rbd_mirror_types
+ rbd_types
+ rbd_replay_types)
+ if(WITH_KRBD)
+ target_link_libraries(denc-mod-rbd
+ krbd)
+ endif()
+endif()
+
+if(WITH_CEPHFS)
+ add_denc_mod(denc-mod-cephfs
+ mds_types.cc)
+ target_link_libraries(denc-mod-cephfs
+ mds)
+endif()
+
+target_compile_definitions(ceph-dencoder PRIVATE
+ "CEPH_DENC_MOD_DIR=\"${denc_plugin_dir}\"")
+
+target_link_libraries(ceph-dencoder
+ StdFilesystem::filesystem
+ global
+ ${DENCODER_EXTRALIBS}
+ cls_log_client
+ cls_version_client
+ cls_user_client
+ cls_cas_client
+ ${ALLOC_LIBS}
+ ${EXTRALIBS}
+ ${CMAKE_DL_LIBS})
+install(TARGETS ceph-dencoder DESTINATION bin)
diff --git a/src/tools/ceph-dencoder/ceph_dencoder.cc b/src/tools/ceph-dencoder/ceph_dencoder.cc
new file mode 100644
index 000000000..42060f860
--- /dev/null
+++ b/src/tools/ceph-dencoder/ceph_dencoder.cc
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <errno.h>
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#endif
+
+#include "ceph_ver.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "denc_plugin.h"
+#include "denc_registry.h"
+
+#define MB(m) ((m) * 1024 * 1024)
+
+void usage(ostream &out)
+{
+ out << "usage: ceph-dencoder [commands ...]" << std::endl;
+ out << "\n";
+ out << " version print version string (to stdout)\n";
+ out << "\n";
+ out << " import <encfile> read encoded data from encfile\n";
+ out << " export <outfile> write encoded data to outfile\n";
+ out << "\n";
+ out << " set_features <num> set feature bits used for encoding\n";
+ out << " get_features print feature bits (int) to stdout\n";
+ out << "\n";
+ out << " list_types list supported types\n";
+ out << " type <classname> select in-memory type\n";
+ out << " skip <num> skip <num> leading bytes before decoding\n";
+ out << " decode decode into in-memory object\n";
+ out << " encode encode in-memory object\n";
+ out << " dump_json dump in-memory object as json (to stdout)\n";
+ out << " hexdump print encoded data in hex\n";
+ out << " get_struct_v print version of the encoded object\n";
+ out << " get_struct_compat print the oldest version of decoder that can decode the encoded object\n";
+ out << "\n";
+ out << " copy copy object (via operator=)\n";
+ out << " copy_ctor copy object (via copy ctor)\n";
+ out << "\n";
+ out << " count_tests print number of generated test objects (to stdout)\n";
+ out << " select_test <n> select generated test object as in-memory object\n";
+ out << " is_deterministic exit w/ success if type encodes deterministically\n";
+}
+
+vector<DencoderPlugin> load_plugins()
+{
+ fs::path mod_dir{CEPH_DENC_MOD_DIR};
+ if (auto ceph_lib = getenv("CEPH_LIB"); ceph_lib) {
+ mod_dir = ceph_lib;
+ } else if (fs::is_regular_file("CMakeCache.txt")) {
+ mod_dir = fs::canonical("lib");
+ }
+ vector<DencoderPlugin> dencoder_plugins;
+ for (auto& entry : fs::directory_iterator(mod_dir)) {
+ static const string_view DENC_MOD_PREFIX = "denc-mod-";
+ if (entry.path().stem().string().compare(0, DENC_MOD_PREFIX.size(),
+ DENC_MOD_PREFIX) != 0) {
+ continue;
+ }
+ DencoderPlugin plugin(entry);
+ if (!plugin.good()) {
+ continue;
+ }
+ dencoder_plugins.push_back(std::move(plugin));
+ }
+ return dencoder_plugins;
+}
+
+int main(int argc, const char **argv)
+{
+ vector<DencoderPlugin> plugins = load_plugins();
+ DencoderRegistry registry;
+ for (auto& plugin : plugins) {
+ for (auto& [name, denc] : plugin.register_dencoders()) {
+ registry.register_dencoder(name, denc);
+ }
+ }
+
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ env_to_vec(args);
+
+ Dencoder *den = NULL;
+ uint64_t features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ bufferlist encbl;
+ uint64_t skip = 0;
+
+ if (args.empty()) {
+ cerr << "-h for help" << std::endl;
+ return 1;
+ }
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ++i) {
+ string err;
+
+ auto& dencoders = registry.get();
+ if (*i == string("help") || *i == string("-h") || *i == string("--help")) {
+ usage(cout);
+ return 0;
+ } else if (*i == string("version")) {
+ cout << CEPH_GIT_NICE_VER << std::endl;
+ } else if (*i == string("list_types")) {
+ for (auto& dencoder : dencoders)
+ cout << dencoder.first << std::endl;
+ return 0;
+ } else if (*i == string("type")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting type" << std::endl;
+ return 1;
+ }
+ string cname = *i;
+ if (!dencoders.count(cname)) {
+ cerr << "class '" << cname << "' unknown" << std::endl;
+ return 1;
+ }
+ den = dencoders[cname];
+ den->generate();
+ } else if (*i == string("skip")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting byte count" << std::endl;
+ return 1;
+ }
+ skip = atoi(*i);
+ } else if (*i == string("get_features")) {
+ cout << CEPH_FEATURES_SUPPORTED_DEFAULT << std::endl;
+ return 0;
+ } else if (*i == string("set_features")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting features" << std::endl;
+ return 1;
+ }
+ features = atoll(*i);
+ } else if (*i == string("encode")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ den->encode(encbl, features | CEPH_FEATURE_RESERVED); // hack for OSDMap
+ } else if (*i == string("decode")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ err = den->decode(encbl, skip);
+ } else if (*i == string("copy_ctor")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ den->copy_ctor();
+ } else if (*i == string("copy")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ den->copy();
+ } else if (*i == string("dump_json")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ JSONFormatter jf(true);
+ jf.open_object_section("object");
+ den->dump(&jf);
+ jf.close_section();
+ jf.flush(cout);
+ cout << std::endl;
+
+ } else if (*i == string("hexdump")) {
+ encbl.hexdump(cout);
+ } else if (*i == string("get_struct_v")) {
+ std::cout << den->get_struct_v(encbl, 0) << std::endl;
+ } else if (*i == string("get_struct_compat")) {
+ std::cout << den->get_struct_v(encbl, sizeof(uint8_t)) << std::endl;
+ } else if (*i == string("import")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting filename" << std::endl;
+ return 1;
+ }
+ int r;
+ if (*i == string("-")) {
+ *i = "stdin";
+ // Read up to 1mb if stdin specified
+ r = encbl.read_fd(STDIN_FILENO, MB(1));
+ } else {
+ r = encbl.read_file(*i, &err);
+ }
+ if (r < 0) {
+ cerr << "error reading " << *i << ": " << err << std::endl;
+ return 1;
+ }
+
+ } else if (*i == string("export")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting filename" << std::endl;
+ return 1;
+ }
+ int fd = ::open(*i, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644);
+ if (fd < 0) {
+ cerr << "error opening " << *i << " for write: " << cpp_strerror(errno) << std::endl;
+ return 1;
+ }
+ int r = encbl.write_fd(fd);
+ if (r < 0) {
+ cerr << "error writing " << *i << ": " << cpp_strerror(errno) << std::endl;
+ return 1;
+ }
+ ::close(fd);
+
+ } else if (*i == string("count_tests")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ cout << den->num_generated() << std::endl;
+ } else if (*i == string("select_test")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting instance number" << std::endl;
+ return 1;
+ }
+ int n = atoi(*i);
+ err = den->select_generated(n);
+ } else if (*i == string("is_deterministic")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ return 1;
+ }
+ if (den->is_deterministic())
+ return 0;
+ else
+ return 1;
+ } else {
+ cerr << "unknown option '" << *i << "'" << std::endl;
+ return 1;
+ }
+ if (err.length()) {
+ cerr << "error: " << err << std::endl;
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/src/tools/ceph-dencoder/ceph_time.h b/src/tools/ceph-dencoder/ceph_time.h
new file mode 100644
index 000000000..c27cb5746
--- /dev/null
+++ b/src/tools/ceph-dencoder/ceph_time.h
@@ -0,0 +1,68 @@
+#ifndef TEST_CEPH_TIME_H
+#define TEST_CEPH_TIME_H
+
+#include <list>
+
+#include "include/encoding.h"
+#include "common/ceph_time.h"
+#include "common/Formatter.h"
+
+// wrapper for ceph::real_time that implements the dencoder interface
+template <typename Clock>
+class time_point_wrapper {
+ using time_point = typename Clock::time_point;
+ time_point t;
+ public:
+ time_point_wrapper() = default;
+ explicit time_point_wrapper(const time_point& t) : t(t) {}
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(t, bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ decode(t, p);
+ }
+ void dump(Formatter* f) {
+ auto epoch_time = Clock::to_time_t(t);
+ f->dump_string("time", std::ctime(&epoch_time));
+ }
+ static void generate_test_instances(std::list<time_point_wrapper*>& ls) {
+ constexpr time_t t{455500800}; // Ghostbusters release date
+ ls.push_back(new time_point_wrapper(Clock::from_time_t(t)));
+ }
+};
+
+using real_time_wrapper = time_point_wrapper<ceph::real_clock>;
+WRITE_CLASS_ENCODER(real_time_wrapper)
+
+using coarse_real_time_wrapper = time_point_wrapper<ceph::coarse_real_clock>;
+WRITE_CLASS_ENCODER(coarse_real_time_wrapper)
+
+// wrapper for ceph::timespan that implements the dencoder interface
+class timespan_wrapper {
+ ceph::timespan d;
+ public:
+ timespan_wrapper() = default;
+ explicit timespan_wrapper(const ceph::timespan& d) : d(d) {}
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(d, bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ decode(d, p);
+ }
+ void dump(Formatter* f) {
+ f->dump_int("timespan", d.count());
+ }
+ static void generate_test_instances(std::list<timespan_wrapper*>& ls) {
+ constexpr std::chrono::seconds d{7377}; // marathon world record (2:02:57)
+ ls.push_back(new timespan_wrapper(d));
+ }
+};
+WRITE_CLASS_ENCODER(timespan_wrapper)
+
+#endif
diff --git a/src/tools/ceph-dencoder/common_types.cc b/src/tools/ceph-dencoder/common_types.cc
new file mode 100644
index 000000000..fa763c3bb
--- /dev/null
+++ b/src/tools/ceph-dencoder/common_types.cc
@@ -0,0 +1,36 @@
+#include "acconfig.h"
+#include <cstdint>
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "common_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "common_types.h"
+}
+
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+ plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/common_types.h b/src/tools/ceph-dencoder/common_types.h
new file mode 100644
index 000000000..1eba16bb8
--- /dev/null
+++ b/src/tools/ceph-dencoder/common_types.h
@@ -0,0 +1,457 @@
+#include "ceph_time.h"
+TYPE(real_time_wrapper)
+TYPE(coarse_real_time_wrapper)
+TYPE(timespan_wrapper)
+
+#include "include/utime.h"
+TYPE(utime_t)
+
+#include "include/uuid.h"
+TYPE(uuid_d)
+
+#include "sstring.h"
+TYPE(sstring_wrapper)
+
+#include "include/CompatSet.h"
+TYPE(CompatSet)
+
+#include "include/filepath.h"
+TYPE(filepath)
+
+#include "include/fs_types.h"
+TYPE_FEATUREFUL(file_layout_t)
+
+#include "include/util.h"
+TYPE(ceph_data_stats)
+
+#include "common/bit_vector.hpp"
+TYPE(BitVector<2>)
+
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+TYPE(compressible_bloom_filter)
+
+#include "common/DecayCounter.h"
+TYPE(DecayCounter)
+
+#include "common/histogram.h"
+TYPE(pow2_hist_t)
+
+#include "common/hobject.h"
+TYPE(hobject_t)
+TYPE(ghobject_t)
+
+#include "common/LogEntry.h"
+TYPE_FEATUREFUL(LogEntry)
+TYPE_FEATUREFUL(LogSummary)
+
+#include "common/SloppyCRCMap.h"
+TYPE(SloppyCRCMap)
+
+#include "common/snap_types.h"
+TYPE(SnapContext)
+TYPE(SnapRealmInfo)
+
+#include "msg/msg_types.h"
+TYPE(entity_name_t)
+TYPE_FEATUREFUL(entity_addr_t)
+TYPE_FEATUREFUL(entity_addrvec_t)
+TYPE_FEATUREFUL(entity_inst_t)
+
+#include "crush/CrushWrapper.h"
+TYPE_FEATUREFUL_NOCOPY(CrushWrapper)
+
+#include "cls/cas/cls_cas_ops.h"
+TYPE(cls_cas_chunk_create_or_get_ref_op)
+TYPE(cls_cas_chunk_get_ref_op)
+TYPE(cls_cas_chunk_put_ref_op)
+
+#include "cls/cas/cls_cas_internal.h"
+TYPE(chunk_refs_t)
+
+#include "cls/lock/cls_lock_types.h"
+TYPE(rados::cls::lock::locker_id_t)
+TYPE_FEATUREFUL(rados::cls::lock::locker_info_t)
+TYPE_FEATUREFUL(rados::cls::lock::lock_info_t)
+
+#include "cls/lock/cls_lock_ops.h"
+TYPE(cls_lock_lock_op)
+TYPE(cls_lock_unlock_op)
+TYPE(cls_lock_break_op)
+TYPE(cls_lock_get_info_op)
+TYPE_FEATUREFUL(cls_lock_get_info_reply)
+TYPE(cls_lock_list_locks_reply)
+TYPE(cls_lock_assert_op)
+TYPE(cls_lock_set_cookie_op)
+
+#include "cls/refcount/cls_refcount_ops.h"
+TYPE(cls_refcount_get_op)
+TYPE(cls_refcount_put_op)
+TYPE(cls_refcount_set_op)
+TYPE(cls_refcount_read_op)
+TYPE(cls_refcount_read_ret)
+TYPE(obj_refcount)
+
+#include "cls/timeindex/cls_timeindex_types.h"
+TYPE(cls_timeindex_entry)
+
+#include "journal/Entry.h"
+TYPE(journal::Entry)
+
+// --- messages ---
+#include "messages/MAuth.h"
+MESSAGE(MAuth)
+
+#include "messages/MAuthReply.h"
+MESSAGE(MAuthReply)
+
+#include "messages/MCacheExpire.h"
+MESSAGE(MCacheExpire)
+
+#include "messages/MClientCapRelease.h"
+MESSAGE(MClientCapRelease)
+
+#include "messages/MClientCaps.h"
+MESSAGE(MClientCaps)
+
+#include "messages/MClientLease.h"
+MESSAGE(MClientLease)
+
+#include "messages/MClientReconnect.h"
+MESSAGE(MClientReconnect)
+
+#include "messages/MClientReply.h"
+MESSAGE(MClientReply)
+
+#include "messages/MClientRequest.h"
+MESSAGE(MClientRequest)
+
+#include "messages/MClientRequestForward.h"
+MESSAGE(MClientRequestForward)
+
+#include "messages/MClientQuota.h"
+MESSAGE(MClientQuota)
+
+#include "messages/MClientSession.h"
+MESSAGE(MClientSession)
+
+#include "messages/MClientSnap.h"
+MESSAGE(MClientSnap)
+
+#include "messages/MCommand.h"
+MESSAGE(MCommand)
+
+#include "messages/MCommandReply.h"
+MESSAGE(MCommandReply)
+
+#include "messages/MConfig.h"
+MESSAGE(MConfig)
+
+#include "messages/MDentryLink.h"
+MESSAGE(MDentryLink)
+
+#include "messages/MDentryUnlink.h"
+MESSAGE(MDentryUnlink)
+
+#include "messages/MDirUpdate.h"
+MESSAGE(MDirUpdate)
+
+#include "messages/MDiscover.h"
+MESSAGE(MDiscover)
+
+#include "messages/MDiscoverReply.h"
+MESSAGE(MDiscoverReply)
+
+#include "messages/MExportCaps.h"
+MESSAGE(MExportCaps)
+
+#include "messages/MExportCapsAck.h"
+MESSAGE(MExportCapsAck)
+
+#include "messages/MExportDir.h"
+MESSAGE(MExportDir)
+
+#include "messages/MExportDirAck.h"
+MESSAGE(MExportDirAck)
+
+#include "messages/MExportDirCancel.h"
+MESSAGE(MExportDirCancel)
+
+#include "messages/MExportDirDiscover.h"
+MESSAGE(MExportDirDiscover)
+
+#include "messages/MExportDirDiscoverAck.h"
+MESSAGE(MExportDirDiscoverAck)
+
+#include "messages/MExportDirFinish.h"
+MESSAGE(MExportDirFinish)
+
+#include "messages/MExportDirNotify.h"
+MESSAGE(MExportDirNotify)
+
+#include "messages/MExportDirNotifyAck.h"
+MESSAGE(MExportDirNotifyAck)
+
+#include "messages/MExportDirPrep.h"
+MESSAGE(MExportDirPrep)
+
+#include "messages/MExportDirPrepAck.h"
+MESSAGE(MExportDirPrepAck)
+
+#include "messages/MForward.h"
+MESSAGE(MForward)
+
+#include "messages/MFSMap.h"
+MESSAGE(MFSMap)
+
+#include "messages/MFSMapUser.h"
+MESSAGE(MFSMapUser)
+
+#include "messages/MGatherCaps.h"
+MESSAGE(MGatherCaps)
+
+#include "messages/MGenericMessage.h"
+MESSAGE(MGenericMessage)
+
+#include "messages/MGetConfig.h"
+MESSAGE(MGetConfig)
+
+#include "messages/MGetPoolStats.h"
+MESSAGE(MGetPoolStats)
+
+#include "messages/MGetPoolStatsReply.h"
+MESSAGE(MGetPoolStatsReply)
+
+#include "messages/MHeartbeat.h"
+MESSAGE(MHeartbeat)
+
+#include "messages/MInodeFileCaps.h"
+MESSAGE(MInodeFileCaps)
+
+#include "messages/MLock.h"
+MESSAGE(MLock)
+
+#include "messages/MLog.h"
+MESSAGE(MLog)
+
+#include "messages/MLogAck.h"
+MESSAGE(MLogAck)
+
+#include "messages/MMDSOpenIno.h"
+MESSAGE(MMDSOpenIno)
+
+#include "messages/MMDSOpenInoReply.h"
+MESSAGE(MMDSOpenInoReply)
+
+#include "messages/MMDSBeacon.h"
+MESSAGE(MMDSBeacon)
+
+#include "messages/MMDSCacheRejoin.h"
+MESSAGE(MMDSCacheRejoin)
+
+#include "messages/MMDSFindIno.h"
+MESSAGE(MMDSFindIno)
+
+#include "messages/MMDSFindInoReply.h"
+MESSAGE(MMDSFindInoReply)
+
+#include "messages/MMDSFragmentNotify.h"
+MESSAGE(MMDSFragmentNotify)
+
+#include "messages/MMDSLoadTargets.h"
+MESSAGE(MMDSLoadTargets)
+
+#include "messages/MMDSMap.h"
+MESSAGE(MMDSMap)
+
+#include "messages/MMgrReport.h"
+MESSAGE(MMgrReport)
+
+#include "messages/MMDSResolve.h"
+MESSAGE(MMDSResolve)
+
+#include "messages/MMDSResolveAck.h"
+MESSAGE(MMDSResolveAck)
+
+#include "messages/MMDSPeerRequest.h"
+MESSAGE(MMDSPeerRequest)
+
+#include "messages/MMDSSnapUpdate.h"
+MESSAGE(MMDSSnapUpdate)
+
+#include "messages/MMDSTableRequest.h"
+MESSAGE(MMDSTableRequest)
+
+#include "messages/MMgrClose.h"
+MESSAGE(MMgrClose)
+
+#include "messages/MMgrConfigure.h"
+MESSAGE(MMgrConfigure)
+
+#include "messages/MMgrDigest.h"
+MESSAGE(MMgrDigest)
+
+#include "messages/MMgrMap.h"
+MESSAGE(MMgrMap)
+
+#include "messages/MMgrOpen.h"
+MESSAGE(MMgrOpen)
+
+#include "messages/MMonCommand.h"
+MESSAGE(MMonCommand)
+
+#include "messages/MMonCommandAck.h"
+MESSAGE(MMonCommandAck)
+
+#include "messages/MMonElection.h"
+MESSAGE(MMonElection)
+
+#include "messages/MMonGetMap.h"
+MESSAGE(MMonGetMap)
+
+#include "messages/MMonGetVersion.h"
+MESSAGE(MMonGetVersion)
+
+#include "messages/MMonGetVersionReply.h"
+MESSAGE(MMonGetVersionReply)
+
+#include "messages/MMonGlobalID.h"
+MESSAGE(MMonGlobalID)
+
+#include "messages/MMonJoin.h"
+MESSAGE(MMonJoin)
+
+#include "messages/MMonMap.h"
+MESSAGE(MMonMap)
+
+#include "messages/MMonPaxos.h"
+MESSAGE(MMonPaxos)
+
+#include "messages/MMonProbe.h"
+MESSAGE(MMonProbe)
+
+#include "messages/MMonScrub.h"
+MESSAGE(MMonScrub)
+
+#include "messages/MMonSync.h"
+MESSAGE(MMonSync)
+
+#include "messages/MMonSubscribe.h"
+MESSAGE(MMonSubscribe)
+
+#include "messages/MMonSubscribeAck.h"
+MESSAGE(MMonSubscribeAck)
+
+#include "messages/MOSDAlive.h"
+MESSAGE(MOSDAlive)
+
+#include "messages/MOSDBoot.h"
+MESSAGE(MOSDBoot)
+
+#include "messages/MOSDFailure.h"
+MESSAGE(MOSDFailure)
+
+#include "messages/MOSDMap.h"
+MESSAGE(MOSDMap)
+
+#include "messages/MOSDOp.h"
+MESSAGE(MOSDOp)
+
+#include "messages/MOSDOpReply.h"
+MESSAGE(MOSDOpReply)
+
+#include "messages/MOSDPGBackfill.h"
+MESSAGE(MOSDPGBackfill)
+
+#include "messages/MOSDPGCreate.h"
+MESSAGE(MOSDPGCreate)
+
+#include "messages/MOSDPGCreate2.h"
+MESSAGE(MOSDPGCreate2)
+
+#include "messages/MOSDPGInfo.h"
+MESSAGE(MOSDPGInfo)
+
+#include "messages/MOSDPGLog.h"
+MESSAGE(MOSDPGLog)
+
+#include "messages/MOSDPGNotify.h"
+MESSAGE(MOSDPGNotify)
+
+#include "messages/MOSDPGQuery.h"
+MESSAGE(MOSDPGQuery)
+
+#include "messages/MOSDPGRemove.h"
+MESSAGE(MOSDPGRemove)
+
+#include "messages/MOSDPGRecoveryDelete.h"
+MESSAGE(MOSDPGRecoveryDelete)
+
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+MESSAGE(MOSDPGRecoveryDeleteReply)
+
+#include "messages/MOSDPGScan.h"
+MESSAGE(MOSDPGScan)
+
+#include "messages/MOSDPGTemp.h"
+MESSAGE(MOSDPGTemp)
+
+#include "messages/MOSDPGTrim.h"
+MESSAGE(MOSDPGTrim)
+
+#include "messages/MOSDPing.h"
+MESSAGE(MOSDPing)
+
+#include "messages/MOSDRepScrub.h"
+MESSAGE(MOSDRepScrub)
+
+#include "messages/MOSDScrub.h"
+MESSAGE(MOSDScrub)
+
+#include "messages/MOSDScrub2.h"
+MESSAGE(MOSDScrub2)
+
+#include "messages/MOSDForceRecovery.h"
+MESSAGE(MOSDForceRecovery)
+
+#include "messages/MPGStats.h"
+MESSAGE(MPGStats)
+
+#include "messages/MPGStatsAck.h"
+MESSAGE(MPGStatsAck)
+
+#include "messages/MPing.h"
+MESSAGE(MPing)
+
+#include "messages/MPoolOp.h"
+MESSAGE(MPoolOp)
+
+#include "messages/MPoolOpReply.h"
+MESSAGE(MPoolOpReply)
+
+#include "messages/MRemoveSnaps.h"
+MESSAGE(MRemoveSnaps)
+
+#include "messages/MRoute.h"
+MESSAGE(MRoute)
+
+#include "messages/MServiceMap.h"
+MESSAGE(MServiceMap)
+
+#include "messages/MStatfs.h"
+MESSAGE(MStatfs)
+
+#include "messages/MStatfsReply.h"
+MESSAGE(MStatfsReply)
+
+#include "messages/MTimeCheck.h"
+MESSAGE(MTimeCheck)
+
+#include "messages/MTimeCheck2.h"
+MESSAGE(MTimeCheck2)
+
+#include "messages/MWatchNotify.h"
+MESSAGE(MWatchNotify)
+
+#include "messages/MMgrUpdate.h"
+MESSAGE(MMgrUpdate)
diff --git a/src/tools/ceph-dencoder/denc_plugin.h b/src/tools/ceph-dencoder/denc_plugin.h
new file mode 100644
index 000000000..58690a4b9
--- /dev/null
+++ b/src/tools/ceph-dencoder/denc_plugin.h
@@ -0,0 +1,82 @@
+#include <dlfcn.h>
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#endif
+#include <vector>
+
+#include "denc_registry.h"
+
+// RAII wrapper around one dlopen()ed dencoder shared object.  The module is
+// expected to export register_dencoders()/unregister_dencoders() (declared
+// DENC_API below); the plugin owns the Dencoder instances created via
+// emplace() until unregister_dencoders() is called.
+class DencoderPlugin {
+ // (name, dencoder) pairs in registration order; pointers are owned here
+ using dencoders_t = std::vector<std::pair<std::string, Dencoder*>>;
+public:
+ DencoderPlugin(const fs::path& path) {
+ mod = dlopen(path.c_str(), RTLD_NOW);
+ if (mod == nullptr) {
+ // not fatal here; callers must check good() before using the plugin
+ std::cerr << "failed to dlopen(" << path << "): " << dlerror() << std::endl;
+ }
+ }
+ // move-only: transfers the dlopen handle so it is closed exactly once
+ DencoderPlugin(DencoderPlugin&& other)
+ : mod{other.mod},
+ dencoders{std::move(other.dencoders)}
+ {
+ other.mod = nullptr;
+ other.dencoders.clear();
+ }
+ ~DencoderPlugin() {
+#if !defined(__FreeBSD__)
+ // NOTE(review): dlclose() is deliberately skipped on FreeBSD, leaking the
+ // handle there -- presumably to avoid unload problems; confirm rationale
+ if (mod) {
+ dlclose(mod);
+ }
+#endif
+ }
+ // Resolve and invoke the module's register_dencoders() symbol, then return
+ // the (possibly empty, on dlsym failure) list of registered dencoders.
+ const dencoders_t& register_dencoders() {
+ // NOTE(review): the trailing "\0" is redundant (string literals are
+ // already NUL-terminated); unqualified string_view relies on the
+ // including .cc's `using namespace std;`
+ static constexpr string_view REGISTER_DENCODERS_FUNCTION = "register_dencoders\0";
+
+ assert(mod);
+ using register_dencoders_t = void (*)(DencoderPlugin*);
+ const auto do_register =
+ reinterpret_cast<register_dencoders_t>(dlsym(mod, REGISTER_DENCODERS_FUNCTION.data()));
+ if (do_register == nullptr) {
+ std::cerr << "failed to dlsym(" << REGISTER_DENCODERS_FUNCTION << "): "
+ << dlerror() << std::endl;
+ return dencoders;
+ }
+ do_register(this);
+ return dencoders;
+ }
+
+ // true iff dlopen() succeeded in the constructor
+ bool good() const {
+ return mod != nullptr;
+ }
+
+ // Free every registered dencoder; called from the module's
+ // unregister_dencoders() entry point.
+ void unregister_dencoders() {
+ while (!dencoders.empty()) {
+ delete dencoders.back().second;
+ dencoders.pop_back();
+ }
+ }
+ // Construct a DencoderT and record it under `name` (used by the TYPE*
+ // and MESSAGE macros below).
+ template<typename DencoderT, typename...Args>
+ void emplace(const char* name, Args&&...args) {
+ dencoders.emplace_back(name, new DencoderT(std::forward<Args>(args)...));
+ }
+
+private:
+ void *mod = nullptr; // dlopen handle; nullptr when load failed or moved-from
+ dencoders_t dencoders; // owned dencoders created by emplace()
+};
+
+#define TYPE(t) plugin->emplace<DencoderImplNoFeature<t>>(#t, false, false);
+#define TYPE_STRAYDATA(t) plugin->emplace<DencoderImplNoFeature<t>>(#t, true, false);
+#define TYPE_NONDETERMINISTIC(t) plugin->emplace<DencoderImplNoFeature<t>>(#t, false, true);
+#define TYPE_FEATUREFUL(t) plugin->emplace<DencoderImplFeatureful<t>>(#t, false, false);
+#define TYPE_FEATUREFUL_STRAYDATA(t) plugin->emplace<DencoderImplFeatureful<t>>(#t, true, false);
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) plugin->emplace<DencoderImplFeatureful<t>>(#t, false, true);
+#define TYPE_FEATUREFUL_NOCOPY(t) plugin->emplace<DencoderImplFeaturefulNoCopy<t>>(#t, false, false);
+#define TYPE_NOCOPY(t) plugin->emplace<DencoderImplNoFeatureNoCopy<t>>(#t, false, false);
+#define MESSAGE(t) plugin->emplace<MessageDencoderImpl<t>>(#t);
+
+#define DENC_API extern "C" [[gnu::visibility("default")]]
diff --git a/src/tools/ceph-dencoder/denc_registry.h b/src/tools/ceph-dencoder/denc_registry.h
new file mode 100644
index 000000000..dc1db36d3
--- /dev/null
+++ b/src/tools/ceph-dencoder/denc_registry.h
@@ -0,0 +1,241 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include "include/buffer_fwd.h"
+#include "msg/Message.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+// Abstract interface for one encodable/decodable type known to ceph-dencoder.
+struct Dencoder {
+ virtual ~Dencoder() {}
+ // Decode the current object from bl starting at `seek`; returns an error
+ // message, or the empty string on success.
+ virtual std::string decode(bufferlist bl, uint64_t seek) = 0;
+ // Encode the current object into `out` with the given feature bits.
+ virtual void encode(bufferlist& out, uint64_t features) = 0;
+ virtual void dump(ceph::Formatter *f) = 0;
+ // copy()/copy_ctor() exercise assignment/copy-construction; the defaults
+ // just report that the operation is unsupported for this type
+ virtual void copy() {
+ std::cerr << "copy operator= not supported" << std::endl;
+ }
+ virtual void copy_ctor() {
+ std::cerr << "copy ctor not supported" << std::endl;
+ }
+ // Populate the internal list of test instances (see num_generated /
+ // select_generated).
+ virtual void generate() = 0;
+ virtual int num_generated() = 0;
+ // Make generated instance `n` current; returns an error string on bad `n`.
+ virtual std::string select_generated(unsigned n) = 0;
+ // false when encoding may differ between runs (e.g. hash-ordered content)
+ virtual bool is_deterministic() = 0;
+ // Peek at the leading struct_v byte of an encoded blob without decoding it.
+ unsigned get_struct_v(bufferlist bl, uint64_t seek) const {
+ auto p = bl.cbegin(seek);
+ uint8_t struct_v = 0;
+ ceph::decode(struct_v, p);
+ return struct_v;
+ }
+ //virtual void print(ostream& out) = 0;
+};
+
+// Common Dencoder implementation for plain structs: owns a current object,
+// a list of generated test instances, and the stray-data/determinism flags
+// supplied by the TYPE* macros.  encode() is left abstract because the
+// featureful and feature-less variants differ.
+template<class T>
+class DencoderBase : public Dencoder {
+protected:
+ T* m_object; // current object (decoded into / encoded from)
+ list<T*> m_list; // instances from generate_test_instances()
+ bool stray_okay; // tolerate trailing bytes after decode
+ bool nondeterministic; // encoding may vary between runs
+
+public:
+ DencoderBase(bool stray_okay, bool nondeterministic)
+ : m_object(new T),
+ stray_okay(stray_okay),
+ nondeterministic(nondeterministic) {}
+ ~DencoderBase() override {
+ // NOTE(review): only the current object is freed; instances remaining in
+ // m_list are leaked (tolerated in this short-lived tool)
+ delete m_object;
+ }
+
+ std::string decode(bufferlist bl, uint64_t seek) override {
+ auto p = bl.cbegin();
+ p.seek(seek);
+ try {
+ using ceph::decode;
+ decode(*m_object, p);
+ }
+ catch (buffer::error& e) {
+ return e.what();
+ }
+ // unconsumed bytes are an error unless the type was registered
+ // with TYPE_STRAYDATA
+ if (!stray_okay && !p.end()) {
+ ostringstream ss;
+ ss << "stray data at end of buffer, offset " << p.get_off();
+ return ss.str();
+ }
+ return {};
+ }
+
+ void encode(bufferlist& out, uint64_t features) override = 0;
+
+ void dump(ceph::Formatter *f) override {
+ m_object->dump(f);
+ }
+ void generate() override {
+ T::generate_test_instances(m_list);
+ }
+ int num_generated() override {
+ return m_list.size();
+ }
+ string select_generated(unsigned i) override {
+ // allow 0- or 1-based (by wrapping)
+ if (i == 0)
+ i = m_list.size();
+ if ((i == 0) || (i > m_list.size()))
+ return "invalid id for generated object";
+ // NOTE(review): the previous m_object is overwritten without delete and
+ // thus leaked; the selected pointer stays in m_list as well
+ m_object = *(std::next(m_list.begin(), i-1));
+ return string();
+ }
+
+ bool is_deterministic() override {
+ return !nondeterministic;
+ }
+};
+
+// Dencoder for types whose encode() takes no feature bits and which are not
+// copyable (TYPE_NOCOPY); `features` is intentionally ignored.
+template<class T>
+class DencoderImplNoFeatureNoCopy : public DencoderBase<T> {
+public:
+ DencoderImplNoFeatureNoCopy(bool stray_ok, bool nondeterministic)
+ : DencoderBase<T>(stray_ok, nondeterministic) {}
+ void encode(bufferlist& out, uint64_t features) override {
+ out.clear();
+ using ceph::encode;
+ encode(*this->m_object, out);
+ }
+};
+
+// Feature-less dencoder for copyable types (TYPE / TYPE_STRAYDATA /
+// TYPE_NONDETERMINISTIC): adds copy() via assignment and copy_ctor() via
+// copy-construction, replacing the current object each time.
+template<class T>
+class DencoderImplNoFeature : public DencoderImplNoFeatureNoCopy<T> {
+public:
+ DencoderImplNoFeature(bool stray_ok, bool nondeterministic)
+ : DencoderImplNoFeatureNoCopy<T>(stray_ok, nondeterministic) {}
+ void copy() override {
+ T *n = new T;
+ *n = *this->m_object;
+ delete this->m_object;
+ this->m_object = n;
+ }
+ void copy_ctor() override {
+ T *n = new T(*this->m_object);
+ delete this->m_object;
+ this->m_object = n;
+ }
+};
+
+// Dencoder for non-copyable types whose encode() takes feature bits
+// (TYPE_FEATUREFUL_NOCOPY): forwards `features` to the type's encoder.
+template<class T>
+class DencoderImplFeaturefulNoCopy : public DencoderBase<T> {
+public:
+ DencoderImplFeaturefulNoCopy(bool stray_ok, bool nondeterministic)
+ : DencoderBase<T>(stray_ok, nondeterministic) {}
+ void encode(bufferlist& out, uint64_t features) override {
+ out.clear();
+ using ceph::encode;
+ encode(*(this->m_object), out, features);
+ }
+};
+
+// Featureful dencoder for copyable types (TYPE_FEATUREFUL and variants):
+// same copy()/copy_ctor() pattern as DencoderImplNoFeature.
+template<class T>
+class DencoderImplFeatureful : public DencoderImplFeaturefulNoCopy<T> {
+public:
+ DencoderImplFeatureful(bool stray_ok, bool nondeterministic)
+ : DencoderImplFeaturefulNoCopy<T>(stray_ok, nondeterministic) {}
+ void copy() override {
+ T *n = new T;
+ *n = *this->m_object;
+ delete this->m_object;
+ this->m_object = n;
+ }
+ void copy_ctor() override {
+ T *n = new T(*this->m_object);
+ delete this->m_object;
+ this->m_object = n;
+ }
+};
+
+// Dencoder for Message subclasses (the MESSAGE macro).  Uses the generic
+// decode_message()/encode_message() envelope machinery rather than the
+// type's own encode/decode, and ref-counted message pointers for ownership.
+template<class T>
+class MessageDencoderImpl : public Dencoder {
+ ref_t<T> m_object; // current message
+ list<ref_t<T>> m_list; // unused: generate() is disabled below
+
+public:
+ MessageDencoderImpl() : m_object{make_message<T>()} {}
+ ~MessageDencoderImpl() override {}
+
+ string decode(bufferlist bl, uint64_t seek) override {
+ auto p = bl.cbegin();
+ p.seek(seek);
+ try {
+ // `false` adopts decode_message()'s reference instead of adding one
+ ref_t<Message> n(decode_message(g_ceph_context, 0, p), false);
+ if (!n)
+ throw std::runtime_error("failed to decode");
+ // reject blobs that decode to a different message type
+ if (n->get_type() != m_object->get_type()) {
+ stringstream ss;
+ ss << "decoded type " << n->get_type() << " instead of expected " << m_object->get_type();
+ throw std::runtime_error(ss.str());
+ }
+ m_object = ref_cast<T>(n);
+ }
+ catch (buffer::error& e) {
+ return e.what();
+ }
+ // messages never allow stray trailing data
+ if (!p.end()) {
+ ostringstream ss;
+ ss << "stray data at end of buffer, offset " << p.get_off();
+ return ss.str();
+ }
+ return string();
+ }
+
+ void encode(bufferlist& out, uint64_t features) override {
+ out.clear();
+ encode_message(m_object.get(), features, out);
+ }
+
+ void dump(ceph::Formatter *f) override {
+ m_object->dump(f);
+ }
+ // intentionally a no-op: messages have no generate_test_instances(), so
+ // num_generated() always returns 0 and select_generated() always fails
+ void generate() override {
+ //T::generate_test_instances(m_list);
+ }
+ int num_generated() override {
+ return m_list.size();
+ }
+ string select_generated(unsigned i) override {
+ // allow 0- or 1-based (by wrapping)
+ if (i == 0)
+ i = m_list.size();
+ if ((i == 0) || (i > m_list.size()))
+ return "invalid id for generated object";
+ m_object = *(std::next(m_list.begin(), i-1));
+ return string();
+ }
+ bool is_deterministic() override {
+ return true;
+ }
+
+ //void print(ostream& out) {
+ //out << m_object << std::endl;
+ //}
+};
+
+// Name -> dencoder lookup table used by the main program.  Non-owning: the
+// Dencoder instances (and the name strings the string_view keys refer to)
+// are owned by their DencoderPlugin, which must outlive this registry.
+class DencoderRegistry
+{
+ using dencoders_t = std::map<std::string_view, Dencoder*>;
+
+public:
+ dencoders_t& get() {
+ return dencoders;
+ }
+ // map::emplace keeps the first entry on duplicate names; later
+ // registrations of the same name are silently ignored
+ void register_dencoder(std::string_view name, Dencoder* denc) {
+ dencoders.emplace(name, denc);
+ }
+private:
+ dencoders_t dencoders;
+};
diff --git a/src/tools/ceph-dencoder/mds_types.cc b/src/tools/ceph-dencoder/mds_types.cc
new file mode 100644
index 000000000..94280477a
--- /dev/null
+++ b/src/tools/ceph-dencoder/mds_types.cc
@@ -0,0 +1,36 @@
+#include "acconfig.h"
+#include <cstdint>
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "mds_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+// Plugin entry point (resolved via dlsym): re-expands mds_types.h with the
+// registering TYPE*/MESSAGE macros from denc_plugin.h.
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "mds_types.h"
+}
+
+// Plugin exit point: destroys every dencoder created by register_dencoders().
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+ plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/mds_types.h b/src/tools/ceph-dencoder/mds_types.h
new file mode 100644
index 000000000..9406bf88b
--- /dev/null
+++ b/src/tools/ceph-dencoder/mds_types.h
@@ -0,0 +1,111 @@
+#ifdef WITH_CEPHFS
+#include "mds/JournalPointer.h"
+TYPE(JournalPointer)
+
+#include "osdc/Journaler.h"
+TYPE(Journaler::Header)
+
+#include "mds/snap.h"
+TYPE(SnapInfo)
+TYPE(snaplink_t)
+TYPE(sr_t)
+
+#include "mds/mdstypes.h"
+TYPE(frag_info_t)
+TYPE(nest_info_t)
+TYPE(quota_info_t)
+TYPE(client_writeable_range_t)
+TYPE_FEATUREFUL(inode_t<std::allocator>)
+TYPE_FEATUREFUL(old_inode_t<std::allocator>)
+TYPE(fnode_t)
+TYPE(old_rstat_t)
+TYPE_FEATUREFUL(session_info_t)
+TYPE(string_snap_t)
+TYPE(MDSCacheObjectInfo)
+TYPE(mds_table_pending_t)
+TYPE(cap_reconnect_t)
+TYPE(inode_load_vec_t)
+TYPE(dirfrag_load_vec_t)
+TYPE(mds_load_t)
+TYPE(MDSCacheObjectInfo) // NOTE(review): duplicate of MDSCacheObjectInfo above -- registered twice
+TYPE(inode_backtrace_t)
+TYPE(inode_backpointer_t)
+
+#include "mds/CInode.h"
+TYPE_FEATUREFUL(InodeStore)
+TYPE_FEATUREFUL(InodeStoreBare)
+
+#include "mds/MDSMap.h"
+TYPE_FEATUREFUL(MDSMap)
+TYPE_FEATUREFUL(MDSMap::mds_info_t)
+
+#include "mds/FSMap.h"
+//TYPE_FEATUREFUL(Filesystem)
+TYPE_FEATUREFUL(FSMap)
+
+#include "mds/Capability.h"
+TYPE_NOCOPY(Capability)
+
+#include "mds/inode_backtrace.h"
+TYPE(inode_backpointer_t)
+TYPE(inode_backtrace_t)
+
+#include "mds/InoTable.h"
+TYPE(InoTable)
+
+#include "mds/SnapServer.h"
+TYPE_STRAYDATA(SnapServer)
+
+#include "mds/events/ECommitted.h"
+TYPE_FEATUREFUL_NOCOPY(ECommitted)
+
+#include "mds/events/EExport.h"
+TYPE_FEATUREFUL_NOCOPY(EExport)
+
+#include "mds/events/EFragment.h"
+TYPE_FEATUREFUL_NOCOPY(EFragment)
+
+#include "mds/events/EImportFinish.h"
+TYPE_FEATUREFUL_NOCOPY(EImportFinish)
+
+#include "mds/events/EImportStart.h"
+TYPE_FEATUREFUL_NOCOPY(EImportStart)
+
+#include "mds/events/EMetaBlob.h"
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob::fullbit)
+TYPE(EMetaBlob::remotebit)
+TYPE(EMetaBlob::nullbit)
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob::dirlump)
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob)
+
+#include "mds/events/EOpen.h"
+TYPE_FEATUREFUL_NOCOPY(EOpen)
+
+#include "mds/events/EResetJournal.h"
+TYPE_FEATUREFUL_NOCOPY(EResetJournal)
+
+#include "mds/events/ESession.h"
+TYPE_FEATUREFUL_NOCOPY(ESession)
+
+#include "mds/events/ESessions.h"
+TYPE_FEATUREFUL_NOCOPY(ESessions)
+
+#include "mds/events/EPeerUpdate.h"
+TYPE(link_rollback)
+TYPE(rmdir_rollback)
+TYPE(rename_rollback::drec)
+TYPE(rename_rollback)
+TYPE_FEATUREFUL_NOCOPY(EPeerUpdate)
+
+#include "mds/events/ESubtreeMap.h"
+TYPE_FEATUREFUL_NOCOPY(ESubtreeMap)
+
+#include "mds/events/ETableClient.h"
+TYPE_FEATUREFUL_NOCOPY(ETableClient)
+
+#include "mds/events/ETableServer.h"
+TYPE_FEATUREFUL_NOCOPY(ETableServer)
+
+#include "mds/events/EUpdate.h"
+TYPE_FEATUREFUL_NOCOPY(EUpdate)
+#endif // WITH_CEPHFS
diff --git a/src/tools/ceph-dencoder/osd_types.cc b/src/tools/ceph-dencoder/osd_types.cc
new file mode 100644
index 000000000..13a90685b
--- /dev/null
+++ b/src/tools/ceph-dencoder/osd_types.cc
@@ -0,0 +1,39 @@
+#include "acconfig.h"
+#include <cstdint>
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "osd_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+// cannot initialize dencoders when initializing static variables, as some of
+// the types are allocated using mempool, and the mempools are initialized as
+// static variables.
+// Plugin entry point (resolved via dlsym): re-expands osd_types.h with the
+// registering TYPE*/MESSAGE macros from denc_plugin.h.
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "osd_types.h"
+}
+
+// Plugin exit point: destroys every dencoder created by register_dencoders().
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+ plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/osd_types.h b/src/tools/ceph-dencoder/osd_types.h
new file mode 100644
index 000000000..d60a7b5a8
--- /dev/null
+++ b/src/tools/ceph-dencoder/osd_types.h
@@ -0,0 +1,153 @@
+#include "osd/OSDMap.h"
+TYPE(osd_info_t)
+TYPE_FEATUREFUL(osd_xinfo_t)
+TYPE_FEATUREFUL_NOCOPY(OSDMap)
+TYPE_FEATUREFUL_STRAYDATA(OSDMap::Incremental)
+
+#include "osd/osd_types.h"
+TYPE(osd_reqid_t)
+TYPE(object_locator_t)
+TYPE(request_redirect_t)
+TYPE(pg_t)
+TYPE(coll_t)
+TYPE_FEATUREFUL(objectstore_perf_stat_t)
+TYPE_FEATUREFUL(osd_stat_t)
+TYPE(OSDSuperblock)
+TYPE_FEATUREFUL(pool_snap_info_t)
+TYPE_FEATUREFUL(pg_pool_t)
+TYPE(object_stat_sum_t)
+TYPE(object_stat_collection_t)
+TYPE(pg_stat_t)
+TYPE_FEATUREFUL(pool_stat_t)
+TYPE(pg_hit_set_info_t)
+TYPE(pg_hit_set_history_t)
+TYPE(pg_history_t)
+TYPE(pg_info_t)
+TYPE(PastIntervals)
+TYPE_FEATUREFUL(pg_query_t)
+TYPE(ObjectModDesc)
+TYPE(pg_log_entry_t)
+TYPE(pg_log_dup_t)
+TYPE(pg_log_t)
+TYPE_FEATUREFUL(pg_missing_item)
+TYPE_FEATUREFUL(pg_missing_t)
+TYPE(pg_nls_response_t)
+TYPE(pg_ls_response_t)
+TYPE(object_copy_cursor_t)
+TYPE_FEATUREFUL(object_copy_data_t)
+TYPE(pg_create_t)
+TYPE(OSDSuperblock) // NOTE(review): duplicate -- OSDSuperblock is already registered above
+TYPE(SnapSet)
+TYPE_FEATUREFUL(watch_info_t)
+TYPE_FEATUREFUL(watch_item_t)
+TYPE(object_manifest_t)
+TYPE_FEATUREFUL(object_info_t)
+TYPE(SnapSet) // NOTE(review): duplicate of SnapSet above -- registered twice
+TYPE_FEATUREFUL(ObjectRecoveryInfo)
+TYPE(ObjectRecoveryProgress)
+TYPE(PushReplyOp)
+TYPE_FEATUREFUL(PullOp)
+TYPE_FEATUREFUL(PushOp)
+TYPE(ScrubMap::object)
+TYPE(ScrubMap)
+TYPE_FEATUREFUL(obj_list_watch_response_t)
+TYPE(clone_info)
+TYPE(obj_list_snap_response_t)
+TYPE(pool_pg_num_history_t)
+
+#include "osd/ECUtil.h"
+// TYPE(stripe_info_t) non-standard encoding/decoding functions
+TYPE(ECUtil::HashInfo)
+
+#include "osd/ECMsgTypes.h"
+TYPE_NOCOPY(ECSubWrite)
+TYPE(ECSubWriteReply)
+TYPE_FEATUREFUL(ECSubRead)
+TYPE(ECSubReadReply)
+
+#include "osd/HitSet.h"
+TYPE_NONDETERMINISTIC(ExplicitHashHitSet)
+TYPE_NONDETERMINISTIC(ExplicitObjectHitSet)
+TYPE(BloomHitSet)
+TYPE_NONDETERMINISTIC(HitSet) // because some subclasses are
+TYPE(HitSet::Params)
+
+#include "os/ObjectStore.h"
+TYPE(ObjectStore::Transaction)
+
+#include "os/filestore/SequencerPosition.h"
+TYPE(SequencerPosition)
+
+#ifdef WITH_BLUESTORE
+#include "os/bluestore/bluestore_types.h"
+TYPE(bluestore_bdev_label_t)
+TYPE(bluestore_cnode_t)
+TYPE(bluestore_compression_header_t)
+TYPE(bluestore_extent_ref_map_t)
+TYPE(bluestore_pextent_t)
+TYPE(bluestore_blob_use_tracker_t)
+// TODO: bluestore_blob_t repurposes the "feature" param of encode() for its
+// struct_v. at a higher level, BlueStore::ExtentMap encodes the extends using
+// a different interface than the normal ones. see
+// BlueStore::ExtentMap::encode_some(). maybe we can test it using another
+// approach.
+// TYPE_FEATUREFUL(bluestore_blob_t)
+// TYPE(bluestore_shared_blob_t) there is no encode here
+TYPE(bluestore_onode_t)
+TYPE(bluestore_deferred_op_t)
+TYPE(bluestore_deferred_transaction_t)
+// TYPE(bluestore_compression_header_t) there is no encode here
+
+#include "os/bluestore/bluefs_types.h"
+TYPE(bluefs_extent_t)
+TYPE(bluefs_fnode_t)
+TYPE(bluefs_super_t)
+TYPE(bluefs_transaction_t)
+#endif
+
+#include "mon/AuthMonitor.h"
+TYPE_FEATUREFUL(AuthMonitor::Incremental)
+
+#include "mon/PGMap.h"
+TYPE_FEATUREFUL_NONDETERMINISTIC(PGMapDigest)
+TYPE_FEATUREFUL_NONDETERMINISTIC(PGMap)
+
+#include "mon/MonitorDBStore.h"
+TYPE(MonitorDBStore::Transaction)
+TYPE(MonitorDBStore::Op)
+
+#include "mon/MonMap.h"
+TYPE_FEATUREFUL(MonMap)
+
+#include "mon/MonCap.h"
+TYPE(MonCap)
+
+#include "mon/MgrMap.h"
+TYPE_FEATUREFUL(MgrMap)
+
+#include "mon/mon_types.h"
+TYPE(LevelDBStoreStats)
+TYPE(ScrubResult)
+
+#include "mon/CreatingPGs.h"
+TYPE_FEATUREFUL(creating_pgs_t)
+
+#include "mgr/ServiceMap.h"
+TYPE_FEATUREFUL(ServiceMap)
+TYPE_FEATUREFUL(ServiceMap::Service)
+TYPE_FEATUREFUL(ServiceMap::Daemon)
+
+#include "mon/ConnectionTracker.h"
+TYPE(ConnectionReport);
+TYPE(ConnectionTracker);
+
+#include "os/filestore/DBObjectMap.h"
+TYPE(DBObjectMap::_Header)
+TYPE(DBObjectMap::State)
+
+#include "os/filestore/FileStore.h"
+TYPE(FSSuperblock)
+
+#include "os/kstore/kstore_types.h"
+TYPE(kstore_cnode_t)
+TYPE(kstore_onode_t)
diff --git a/src/tools/ceph-dencoder/rbd_types.cc b/src/tools/ceph-dencoder/rbd_types.cc
new file mode 100644
index 000000000..e04efc30d
--- /dev/null
+++ b/src/tools/ceph-dencoder/rbd_types.cc
@@ -0,0 +1,36 @@
+#include "acconfig.h"
+#include <cstdint>
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "rbd_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+// Plugin entry point (resolved via dlsym): re-expands rbd_types.h with the
+// registering TYPE*/MESSAGE macros from denc_plugin.h.
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "rbd_types.h"
+}
+
+// Plugin exit point: destroys every dencoder created by register_dencoders().
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+ plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/rbd_types.h b/src/tools/ceph-dencoder/rbd_types.h
new file mode 100644
index 000000000..6fb84dea6
--- /dev/null
+++ b/src/tools/ceph-dencoder/rbd_types.h
@@ -0,0 +1,52 @@
+#ifdef WITH_RBD
+#include "librbd/journal/Types.h"
+TYPE(librbd::journal::EventEntry)
+TYPE(librbd::journal::ClientData)
+TYPE(librbd::journal::TagData)
+#include "librbd/mirroring_watcher/Types.h"
+TYPE(librbd::mirroring_watcher::NotifyMessage)
+#include "librbd/trash_watcher/Types.h"
+TYPE(librbd::mirroring_watcher::NotifyMessage) // NOTE(review): re-registers the mirroring_watcher type; given the include above, librbd::trash_watcher::NotifyMessage was presumably intended -- confirm upstream
+#include "librbd/WatchNotifyTypes.h"
+TYPE_NOCOPY(librbd::watch_notify::NotifyMessage)
+TYPE(librbd::watch_notify::ResponseMessage)
+
+#include "rbd_replay/ActionTypes.h"
+TYPE(rbd_replay::action::Dependency)
+TYPE(rbd_replay::action::ActionEntry)
+
+#include "tools/rbd_mirror/image_map/Types.h"
+TYPE(rbd::mirror::image_map::PolicyData)
+#endif
+
+#if defined(WITH_RBD) && defined(WITH_RBD_SSD_CACHE)
+#include "librbd/cache/pwl/Types.h"
+#include "librbd/cache/pwl/ssd/Types.h"
+TYPE(librbd::cache::pwl::WriteLogCacheEntry)
+TYPE(librbd::cache::pwl::WriteLogPoolRoot)
+TYPE(librbd::cache::pwl::ssd::SuperBlock)
+#endif
+
+#ifdef WITH_RBD
+#include "cls/rbd/cls_rbd.h"
+TYPE_FEATUREFUL(cls_rbd_parent)
+TYPE_FEATUREFUL(cls_rbd_snap)
+
+#include "cls/rbd/cls_rbd_types.h"
+TYPE(cls::rbd::ParentImageSpec)
+TYPE(cls::rbd::ChildImageSpec)
+TYPE(cls::rbd::MigrationSpec)
+TYPE(cls::rbd::MirrorPeer)
+TYPE(cls::rbd::MirrorImage)
+TYPE(cls::rbd::MirrorImageMap)
+TYPE(cls::rbd::MirrorImageStatus)
+TYPE(cls::rbd::MirrorImageSiteStatus)
+TYPE_FEATUREFUL(cls::rbd::MirrorImageSiteStatusOnDisk)
+TYPE(cls::rbd::GroupImageSpec)
+TYPE(cls::rbd::GroupImageStatus)
+TYPE(cls::rbd::GroupSnapshot)
+TYPE(cls::rbd::GroupSpec)
+TYPE(cls::rbd::ImageSnapshotSpec)
+TYPE(cls::rbd::SnapshotInfo)
+TYPE(cls::rbd::SnapshotNamespace)
+#endif
diff --git a/src/tools/ceph-dencoder/rgw_types.cc b/src/tools/ceph-dencoder/rgw_types.cc
new file mode 100644
index 000000000..79688b534
--- /dev/null
+++ b/src/tools/ceph-dencoder/rgw_types.cc
@@ -0,0 +1,36 @@
+#include "acconfig.h"
+#include <cstdint>
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "rgw_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+// Plugin entry point (resolved via dlsym): re-expands rgw_types.h with the
+// registering TYPE*/MESSAGE macros from denc_plugin.h.
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "rgw_types.h"
+}
+
+// Plugin exit point: destroys every dencoder created by register_dencoders().
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+ plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/rgw_types.h b/src/tools/ceph-dencoder/rgw_types.h
new file mode 100644
index 000000000..bd1443ddf
--- /dev/null
+++ b/src/tools/ceph-dencoder/rgw_types.h
@@ -0,0 +1,131 @@
+#ifdef WITH_RADOSGW
+
+#include "rgw/rgw_rados.h"
+TYPE(RGWOLHInfo)
+TYPE(RGWObjManifestPart)
+TYPE(RGWObjManifest)
+TYPE(objexp_hint_entry)
+
+#include "rgw/rgw_zone.h"
+TYPE(RGWZoneParams)
+TYPE(RGWZone)
+TYPE(RGWZoneGroup)
+TYPE(RGWRealm)
+TYPE(RGWPeriod)
+TYPE(RGWPeriodLatestEpochInfo)
+
+#include "rgw/rgw_acl.h"
+TYPE(ACLPermission)
+TYPE(ACLGranteeType)
+TYPE(ACLGrant)
+TYPE(RGWAccessControlList)
+TYPE(ACLOwner)
+TYPE(RGWAccessControlPolicy)
+
+#include "rgw/rgw_cache.h"
+TYPE(ObjectMetaInfo)
+TYPE(ObjectCacheInfo)
+TYPE(RGWCacheNotifyInfo)
+
+#include "rgw/rgw_lc.h"
+TYPE(RGWLifecycleConfiguration)
+
+#include "cls/rgw/cls_rgw_types.h"
+TYPE(rgw_bucket_pending_info)
+TYPE(rgw_bucket_dir_entry_meta)
+TYPE(rgw_bucket_entry_ver)
+TYPE(rgw_bucket_dir_entry)
+TYPE(rgw_bucket_category_stats)
+TYPE(rgw_bucket_dir_header)
+TYPE(rgw_bucket_dir)
+TYPE(rgw_bucket_entry_ver)
+TYPE(cls_rgw_obj_key)
+TYPE(rgw_bucket_olh_log_entry)
+TYPE(rgw_usage_log_entry)
+
+#include "cls/rgw/cls_rgw_ops.h"
+TYPE(rgw_cls_obj_prepare_op)
+TYPE(rgw_cls_obj_complete_op)
+TYPE(rgw_cls_list_op)
+TYPE(rgw_cls_list_ret)
+TYPE(cls_rgw_gc_defer_entry_op)
+TYPE(cls_rgw_gc_list_op)
+TYPE(cls_rgw_gc_list_ret)
+TYPE(cls_rgw_gc_obj_info)
+TYPE(cls_rgw_gc_remove_op)
+TYPE(cls_rgw_gc_set_entry_op)
+TYPE(cls_rgw_obj)
+TYPE(cls_rgw_obj_chain)
+TYPE(rgw_cls_tag_timeout_op)
+TYPE(cls_rgw_bi_log_list_op)
+TYPE(cls_rgw_bi_log_trim_op)
+TYPE(cls_rgw_bi_log_list_ret)
+TYPE(rgw_cls_link_olh_op)
+TYPE(rgw_cls_unlink_instance_op)
+TYPE(rgw_cls_read_olh_log_op)
+TYPE(rgw_cls_read_olh_log_ret)
+TYPE(rgw_cls_trim_olh_log_op)
+TYPE(rgw_cls_bucket_clear_olh_op)
+TYPE(rgw_cls_check_index_ret)
+TYPE(cls_rgw_reshard_add_op)
+TYPE(cls_rgw_reshard_list_op)
+TYPE(cls_rgw_reshard_list_ret)
+TYPE(cls_rgw_reshard_get_op)
+TYPE(cls_rgw_reshard_get_ret)
+TYPE(cls_rgw_reshard_remove_op)
+TYPE(cls_rgw_set_bucket_resharding_op)
+TYPE(cls_rgw_clear_bucket_resharding_op)
+TYPE(cls_rgw_lc_obj_head)
+
+#include "cls/rgw/cls_rgw_client.h"
+TYPE(rgw_bi_log_entry)
+TYPE(cls_rgw_reshard_entry)
+TYPE(cls_rgw_bucket_instance_entry)
+
+#include "cls/user/cls_user_types.h"
+TYPE(cls_user_bucket)
+TYPE(cls_user_bucket_entry)
+TYPE(cls_user_stats)
+TYPE(cls_user_header)
+
+#include "cls/user/cls_user_ops.h"
+TYPE(cls_user_set_buckets_op)
+TYPE(cls_user_remove_bucket_op)
+TYPE(cls_user_list_buckets_op)
+TYPE(cls_user_list_buckets_ret)
+TYPE(cls_user_get_header_op)
+TYPE(cls_user_get_header_ret)
+TYPE(cls_user_complete_stats_sync_op)
+
+#include "cls/journal/cls_journal_types.h"
+TYPE(cls::journal::ObjectPosition)
+TYPE(cls::journal::ObjectSetPosition)
+TYPE(cls::journal::Client)
+TYPE(cls::journal::Tag)
+
+#include "rgw/rgw_common.h"
+TYPE(RGWAccessKey)
+TYPE(RGWSubUser)
+TYPE(RGWUserInfo)
+TYPE(rgw_bucket)
+TYPE(RGWBucketInfo)
+TYPE(RGWBucketEnt)
+TYPE(rgw_obj)
+
+#include "rgw/rgw_log.h"
+TYPE(rgw_log_entry)
+
+#include "rgw/rgw_meta_sync_status.h"
+TYPE(rgw_meta_sync_info)
+TYPE(rgw_meta_sync_marker)
+TYPE(rgw_meta_sync_status)
+
+#include "rgw/rgw_multi.h"
+TYPE(RGWUploadPartInfo)
+
+#include "rgw/rgw_data_sync.h"
+TYPE(rgw_data_sync_info)
+TYPE(rgw_data_sync_marker)
+TYPE(rgw_data_sync_status)
+
+#endif
diff --git a/src/tools/ceph-dencoder/sstring.h b/src/tools/ceph-dencoder/sstring.h
new file mode 100644
index 000000000..c2493c10e
--- /dev/null
+++ b/src/tools/ceph-dencoder/sstring.h
@@ -0,0 +1,40 @@
+#ifndef TEST_SSTRING_H
+#define TEST_SSTRING_H
+
+#include "common/sstring.hh"
+
+// wrapper for sstring that implements the dencoder interface
+class sstring_wrapper {
+  using sstring16 = basic_sstring<char, uint32_t, 16>;  // 16 = in-object capacity, presumably -- see common/sstring.hh
+  sstring16 s1;
+  using sstring24 = basic_sstring<unsigned char, uint16_t, 24>;  // different char/size types to exercise more instantiations
+  sstring24 s2;
+ public:
+  sstring_wrapper() = default;
+  sstring_wrapper(sstring16&& s1, sstring24&& s2)
+    : s1(std::move(s1)), s2(std::move(s2))
+  {}
+
+  DENC(sstring_wrapper, w, p) {  // encode/decode both strings inside a v1/v1 envelope
+    DENC_START(1, 1, p);
+    denc(w.s1, p);
+    denc(w.s2, p);
+    DENC_FINISH(p);
+  }
+  void dump(Formatter* f) {
+    f->dump_string("s1", s1.c_str());
+    f->dump_string("s2", reinterpret_cast<const char*>(s2.c_str()));  // s2 holds unsigned char; cast for the char* API
+  }
+  static void generate_test_instances(std::list<sstring_wrapper*>& ls) {
+    ls.push_back(new sstring_wrapper());
+    // initialize sstrings that fit in internal storage
+    constexpr auto cstr6 = "abcdef";
+    ls.push_back(new sstring_wrapper(sstring16{cstr6}, sstring24{cstr6}));
+    // initialize sstrings that overflow into external storage
+    constexpr auto cstr26 = "abcdefghijklmnopqrstuvwxyz";
+    ls.push_back(new sstring_wrapper(sstring16{cstr26}, sstring24{cstr26}));
+  }
+};
+WRITE_CLASS_DENC(sstring_wrapper)
+
+#endif
diff --git a/src/tools/ceph-diff-sorted.cc b/src/tools/ceph-diff-sorted.cc
new file mode 100644
index 000000000..f8e4c28e6
--- /dev/null
+++ b/src/tools/ceph-diff-sorted.cc
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * diffsorted -- a utility to compute a line-by-line diff on two
+ * sorted input files
+ *
+ * Copyright © 2019 Red Hat
+ *
+ * Author: J. Eric Ivancich
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.
+ */
+
+
+/*
+ * SUMMARY
+ *
+ * The `diffsorted` utility does a line-by-line diff on two sorted text
+ * files and indicating lines that are in one file but not the other
+ * using diff-style notation (although line numbers are not indicated).
+ *
+ * USAGE
+ *
+ * rgw-diff-sorted file1.txt file2.txt
+ *
+ * NOTES
+ *
+ * Each files should have its lines in sorted order and should have no
+ * empty lines.
+ *
+ * A potential input file can be sorted using the `sort` utility provided
+ * that LANG=C to insure byte lexical order. For example:
+ *
+ * LANG=C sort unsorted.txt >sorted.txt
+ *
+ * or:
+ *
+ * export LANG=C
+ * sort unsorted.txt >sorted.txt
+ *
+ * EXIT STATUS
+ *
+ * 0 : files same
+ * 1 : files different
+ * 2 : usage problem (e.g., wrong number of command-line arguments)
+ * 3 : problem opening input file
+ * 4 : bad file content (e.g., unsorted order or empty lines)
+ */
+
+
+#include <iostream>
+#include <fstream>
+
+
+struct FileOfLines {
+  const char* filename;
+  std::ifstream input;
+  std::string this_line, prev_line;
+  bool next_eof;  // the final (newline-less) line has been read; next advance() ends
+  bool is_eof;    // all lines consumed; line() is no longer valid
+
+  FileOfLines(const char* _filename) :
+    filename(_filename),
+    input(filename),
+    next_eof(false),
+    is_eof(false)
+  { }
+
+  void dump(const std::string& prefix) {  // print current + remaining lines, diff-style prefixed
+    do {
+      std::cout << prefix << this_line << std::endl;
+      advance();
+    } while (!eof());
+  }
+
+  bool eof() const {
+    return is_eof;
+  }
+
+  bool good() const {
+    return input.good();
+  }
+
+  void advance() {  // load next line; exit(4) on empty lines or unsorted input
+    if (next_eof) {
+      is_eof = true;
+      return;
+    }
+
+    prev_line = this_line;
+    std::getline(input, this_line);
+    if (this_line.empty()) {
+      if (!input.eof()) {  // an empty line mid-file is invalid input
+	std::cerr << "Error: " << filename << " has an empty line." <<
+	  std::endl;
+	exit(4);
+      }
+      is_eof = true;  // empty read at eof: file ended with a newline
+      return;
+    } else if (input.eof()) {  // non-empty read at eof: last line lacked a newline
+      next_eof = true;
+    }
+
+    if (this_line < prev_line) {  // enforce sorted order (byte-lexical, cf. LANG=C sort)
+      std::cerr << "Error: " << filename << " is not in sorted order; \"" <<
+	this_line << "\" follows \"" << prev_line << "\"." << std::endl;
+      exit(4);
+    }
+  }
+
+  const std::string line() const {
+    return this_line;
+  }
+};
+
+int main(int argc, const char* argv[]) {
+  if (argc != 3) {
+    std::cerr << "Usage: " << argv[0] << " <file1> <file2>" << std::endl;
+    exit(2);
+  }
+
+  FileOfLines input1(argv[1]);
+  if (!input1.good()) {
+    std::cerr << "Error opening " << argv[1] <<
+      "." << std::endl;
+    exit(3);
+  }
+
+  FileOfLines input2(argv[2]);
+  if (!input2.good()) {
+    std::cerr << "Error opening " << argv[2] <<
+      "." << std::endl;
+    exit(3);
+  }
+
+  bool files_same = true;  // cleared on the first divergent line
+
+  input1.advance();  // prime the first line of each input
+  input2.advance();
+
+  while (!input1.eof() && !input2.eof()) {  // sorted-merge walk over both files
+    if (input1.line() == input2.line()) {
+      input1.advance();
+      input2.advance();
+    } else if (input1.line() < input2.line()) {
+      files_same = false;
+      std::cout << "< " << input1.line() << std::endl;  // only in file1
+      input1.advance();
+    } else {
+      files_same = false;
+      std::cout << "> " << input2.line() << std::endl;  // only in file2
+      input2.advance();
+    }
+  }
+
+  if (!input1.eof()) {  // at most one file still has trailing lines
+    files_same = false;
+    input1.dump("< ");
+  } else if (!input2.eof()) {
+    files_same = false;
+    input2.dump("> ");
+  }
+
+  if (files_same) {
+    exit(0);
+  } else {
+    exit(1);
+  }
+}
diff --git a/src/tools/ceph-lazy/bash_completion.d/ceph-lazy b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy
new file mode 100644
index 000000000..4429def42
--- /dev/null
+++ b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy
@@ -0,0 +1,27 @@
+_ceph-lazy()
+{
+  local cur prev all_opts commands
+  COMPREPLY=()
+  cur="${COMP_WORDS[COMP_CWORD]}"  # word currently being completed
+  prev="${COMP_WORDS[COMP_CWORD-1]}"  # word just before it
+
+  commands="host-get-osd host-get-nodes host-osd-usage host-all-usage pg-get-host pg-most-write pg-less-write pg-most-write-kb pg-less-write-kb pg-most-read pg-less-read pg-most-read-kb pg-less-read-kb pg-empty rbd-prefix rbd-count rbd-host rbd-osd rbd-size rbd-all-size osd-most-used osd-less-used osd-get-ppg osd-get-pg object-get-host"
+
+  all_opts="$commands -d -h"
+
+
+
+# If first option is -d keep completing without -d & -h
+  if [[ ${prev} == "-d" && ${#COMP_WORDS[@]} -eq 3 ]] ; then
+    COMPREPLY=( $(compgen -W "${commands}" -- ${cur}) )
+    return 0
+# Do completion for first args
+  elif [[ ${#COMP_WORDS[@]} -eq 2 ]]; then
+    COMPREPLY=( $(compgen -W "${all_opts}" -- ${cur}) )
+    return 0
+# Else do nothing
+  else
+    return 0
+  fi
+}
+complete -F _ceph-lazy ceph-lazy
diff --git a/src/tools/ceph-lazy/ceph-lazy b/src/tools/ceph-lazy/ceph-lazy
new file mode 100755
index 000000000..39a331921
--- /dev/null
+++ b/src/tools/ceph-lazy/ceph-lazy
@@ -0,0 +1,709 @@
+#!/usr/bin/env bash
+#
+# ceph-lazy : Be efficient, be lazy !
+#
+# Author: Gregory Charot <gcharot@redhat.com>
+#
+# This is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+
+# Specify exta option for ceph like the username/keyring/etc. Can also be done with CEPH_ARGS global variable
+#CEPH_OPT="-n client.username"
+VERSION="1.1.2"
+
+#
+# Print info message to stderr
+#
+
+function echoinfo() {
+  printf "INFO: %s\n" "$*" >&2;  # diagnostics go to stderr so stdout stays parseable
+}
+
+
+#
+# Print error message to stderr
+#
+
+function echoerr() {
+  printf "ERROR: %s\n" "$*" >&2;  # errors go to stderr, matching echoinfo
+}
+
+
+function help() {  # entire usage text is one quoted string, sent to stderr
+  >&2 echo "Usage : ceph-lazy [-d | -h] [command] [parameters]
+
+Ceph complex querying tool - Version $VERSION
+
+OPTIONS
+========
+  -d   Activate debug mode
+  -h   Print help
+
+COMMANDS
+=========
+
+  Host
+  -----
+  host-get-osd      hostname                  List all OSD IDs attached to a particular node.
+  host-get-nodes                              List all storage nodes.
+  host-osd-usage    hostname                  Show total OSD space usage of a particular node (-d for details).
+  host-all-usage                              Show total OSD space usage of each nodes (-d for details)
+
+  Placement groups
+  -----------------
+  pg-get-host       pgid                      Find PG storage hosts (first is primary)
+  pg-most-write                               Find most written PG (nb operations)
+  pg-less-write                               Find less written PG (nb operations)
+  pg-most-write-kb                            Find most written PG (data written)
+  pg-less-write-kb                            Find less written PG (data written)
+  pg-most-read                                Find most read PG (nb operations)
+  pg-less-read                                Find less read PG (nb operations)
+  pg-most-read-kb                             Find most read PG (data read)
+  pg-less-read-kb                             Find less read PG (data read)
+  pg-empty                                    Find empty PGs (no stored object)
+
+  RBD
+  ----
+  rbd-prefix        pool_name image_name      Return RBD image prefix
+  rbd-count         pool_name image_name      Count number of objects in a RBD image
+  rbd-host          pool_name image_name      Find RBD primary storage hosts
+  rbd-osd           pool_name image_name      Find RBD primary OSDs
+  rbd-size          pool_name image_name      Print RBD image real size
+  rbd-all-size      pool_name                 Print all RBD images size (Top first)
+
+  OSD
+  ----
+  osd-most-used                               Show the most used OSD (capacity)
+  osd-less-used                               Show the less used OSD (capacity)
+  osd-get-ppg       osd_id                    Show all primaries PGS hosted on a OSD
+  osd-get-pg        osd_id                    Show all PGS hosted on a OSD
+
+  Objects
+  --------
+  object-get-host   pool_name object_id       Find object storage hosts (first is primary)
+ "
+
+}
+
+#
+# Check dependencies
+#
+function check_requirements()
+{
+
+  # List of command dependencies
+  local bin_dep="ceph rados rbd osdmaptool jq"
+
+  for cmd in $bin_dep; do
+    [ $DEBUG -eq 1 ] && echoinfo "Checking for $cmd..."
+    $cmd --version >/dev/null 2>&1 || { echoerr "$cmd cannot be found... Aborting."; return 1; }
+  done
+
+  CEPH="ceph $CEPH_OPT"  # global: ceph invocation including user-supplied auth options
+
+  [ $DEBUG -eq 1 ] && echoinfo "Checking Ceph connectivity & basic permissions..."
+
+  if ! $CEPH -s &> /dev/null; then
+    echoerr "Cannot connect to cluster, please check your username & permissions"
+    echoerr "Command $CEPH -s failed"
+    return 1
+  fi
+
+  JQ="jq -M --raw-output"  # global: monochrome, unquoted jq output used throughout
+}
+
+#
+# Print the host that hosts a specific PG
+#
+function find_host_from_pg() {
+
+  if [ $# -eq 1 ]; then
+    local PGID=$1
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "PG $PGID has been found at (first is primary) : "
+
+  for osd in $($CEPH pg $PGID query | $JQ -cr .up[]); do  # .up[] lists the PG's OSDs, primary first
+    echo -n "OSD:osd.$osd | Host:"
+    $CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host
+  done
+}
+
+
+#
+# Print the host that hosts a specific object
+#
+function find_host_from_object() {
+
+  if [ $# -eq 2 ]; then
+    local pool=$1
+    local objid=$2
+  else
+    echoerr "This command requires two arguments"
+    help
+    exit 1
+  fi
+
+  local pgid=$($CEPH osd map $pool $objid --format json 2> /dev/null | $JQ -cr .pgid)  # object name -> placement group
+
+  [ $DEBUG -eq 1 ] && echoinfo $objid found into PG $pgid
+
+  while read host; do  # prepend the PG id to each replica host line
+    echo "PG:$pgid | $host"
+  done < <(find_host_from_pg $pgid)
+}
+
+
+#
+# Print all primary pgs hosted by an OSD
+#
+function find_prim_pg_from_osd() {
+
+  if [ $# -eq 1 ]; then
+    local posd=$1
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Looking for primary PGs belonging to OSD $posd"
+  $CEPH pg dump pgs --format json 2>/dev/null | $JQ --argjson posd $posd '.[] | select(.acting_primary==$posd).pgid'  # only PGs whose acting primary is this OSD
+}
+
+
+#
+# Print all pgs (primary & secondary) hosted by an OSD
+#
+function find_all_pg_from_osd() {
+
+  if [ $# -eq 1 ]; then
+    local osd=$1
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Looking for all PGs mapped to OSD $osd"
+  $CEPH pg dump pgs --format json 2> /dev/null | $JQ -M --argjson osd $osd '.[] | select(.up[]==$osd).pgid'  # .up[] match catches primary and replica placements alike
+}
+
+
+#
+# Check if a given image exists
+#
+function check_rbd_exists(){
+
+  local pool=$1  # 'local' added: these used to leak into the caller's scope
+  local rbd=$2
+
+  if ! rbd info -p "$pool" "$rbd" &> /dev/null; then  # probe only; output discarded
+    echoerr "Unable to find image $pool/$rbd"
+    exit 1
+  fi
+}
+
+
+#
+# Return RBD prefix from image name
+#
+function get_rbd_prefix() {
+
+  if [ $# -eq 2 ]; then
+    local pool=$1
+    local rbd=$2
+  else
+    echoerr "This command requires two arguments"
+    help
+    exit 1
+  fi
+
+  check_rbd_exists "$pool" "$rbd"
+
+  # block_name_prefix is the common prefix of every RADOS object backing the image
+  local prefix=$(rbd --image "$rbd" -p "$pool" info --format json 2> /dev/null | jq --raw-output .block_name_prefix)
+  if [ -z "$prefix" ]; then  # quoted: the old unquoted test only worked by accident
+    echoerr "Unable to find RBD Prefix for image $pool/$rbd"
+    exit 1
+  else
+    echo "$prefix"
+  fi
+
+}
+
+
+#
+# Count number of object in a RBD image
+#
+function count_rbd_object() {
+
+  if [ $# -eq 2 ]; then
+    local pool=$1
+    local rbd=$2
+  else
+    echoerr "This command requires two arguments"
+    help
+    exit 1
+  fi
+
+  check_rbd_exists "$pool" "$rbd"
+
+  local rbd_prefix=$(get_rbd_prefix "$pool" "$rbd")
+
+  [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now counting objects..."
+
+  # grep -c counts matching object names directly (was: grep | wc -l)
+  local nb_obj=$(rados -p "$pool" ls | grep -c -- "$rbd_prefix")
+
+  [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has $nb_obj objects"
+  echo "$nb_obj"
+}
+
+
+#
+# Find primary storage host for a given RBD image
+#
+function find_prim_host_from_rbd() {
+
+  if [ $# -eq 2 ]; then
+    local pool=$1
+    local rbd=$2
+  else
+    echoerr "This command requires two arguments"
+    help
+    exit 1
+  fi
+
+  check_rbd_exists "$pool" "$rbd"
+
+  local osd="null"
+  local osdmap_t=$(mktemp)
+  local osdtree_t=$(mktemp)
+  # Get RBD image prefix
+  local rbd_prefix=$(get_rbd_prefix "$pool" "$rbd")
+# Exit if we received an empty prefix -- remove the temp files first
+  [ -z "$rbd_prefix" ] && { rm -f "$osdtree_t" "$osdmap_t"; exit 1; }
+
+# Get pool ID from pool name ($CEPH, not bare ceph, so CEPH_OPT credentials apply)
+  local pool_id=$($CEPH osd lspools -f json | $JQ -M --arg pool "$pool" '.[]|select(.poolname==$pool).poolnum')
+
+  [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary host..."
+
+  [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t"
+  if ! $CEPH osd getmap > "$osdmap_t" 2> /dev/null; then
+    echoerr "Failed to retrieve OSD map"
+    rm -f "$osdtree_t" "$osdmap_t"; exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD tree to $osdtree_t"
+
+  if ! $CEPH osd tree --format json > "$osdtree_t"; then
+    echoerr "Failed to retrieve OSD tree"
+    rm -f "$osdtree_t" "$osdmap_t"; exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Looking for hosts..."
+
+# For each object in the RBD image
+  for obj in $(rados -p "$pool" ls | grep -- "$rbd_prefix");
+  do
+# Map object to osd. osdmaptool does not support json output so using dirty sed.
+    osd=$(osdmaptool --test-map-object "$obj" --pool "$pool_id" "$osdmap_t" 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool)
+# Map osd to host
+    $JQ --argjson osd $osd '.nodes[] | select(.type=="host") | select(.children[] == $osd).name' "$osdtree_t"
+  done | sort -u
+
+# Cleaning files
+  rm -f "$osdtree_t" "$osdmap_t"
+}
+
+
+#
+# Find primary OSDs for a given RBD image
+#
+function find_prim_osd_from_rbd() {
+
+  if [ $# -eq 2 ]; then
+    local pool=$1
+    local rbd=$2
+  else
+    echoerr "This command requires two arguments"
+    help
+    exit 1
+  fi
+
+  check_rbd_exists "$pool" "$rbd"
+
+  local osd="null"
+  local osdmap_t=$(mktemp)
+  # (fixed: a second temp file used to be created here but was never used nor removed)
+  # Get RBD image prefix
+  local rbd_prefix=$(get_rbd_prefix "$pool" "$rbd")
+
+# Exit if we received an empty prefix -- remove the temp file first
+  [ -z "$rbd_prefix" ] && { rm -f "$osdmap_t"; exit 1; }
+
+  [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary OSDs..."
+
+  [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t"
+  if ! $CEPH osd getmap > "$osdmap_t" 2> /dev/null; then
+    echoerr "Failed to retrieve OSD map"
+    rm -f "$osdmap_t"; exit 1
+  fi
+
+# For each object in the RBD image
+  for obj in $(rados -p "$pool" ls | grep -- "$rbd_prefix");
+  do
+# Map object to osd. osdmaptool does not support json output so using dirty sed.
+    osd=$(osdmaptool --test-map-object "$obj" "$osdmap_t" 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool)
+    echo "osd.${osd}"
+  done | sort -u
+
+# Cleaning files
+  rm -f "$osdmap_t"
+}
+
+
+#
+# Print RBD image real size - Source http://ceph.com/planet/real-size-of-a-ceph-rbd-image/
+#
+
+function print_rbd_real_size {
+
+  if [ $# -eq 2 ]; then
+    local pool=$1
+    local rbd=$2
+  else
+    echoerr "This command requires two arguments"
+    help
+    exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Checking if RBD image exists..."
+
+  check_rbd_exists $pool $rbd
+
+  rbd diff $pool/$rbd | awk '{ SUM += $2 } END { print SUM/1024/1024 " MB" }'  # sum the extent-length column of 'rbd diff'
+
+}
+
+
+#
+# Print all RBD image real sizes - Top first
+#
+
+function list_all_rbd_real_size {
+
+  if [ $# -eq 1 ]; then
+    local pool=$1
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Looking for RBD images in pool $pool"
+
+  while read rbd; do
+    [ $DEBUG -eq 1 ] && echoinfo "Inspecting image $rbd"
+    rbd diff $pool/$rbd | awk -v rbd="$rbd" '{ SUM += $2 } END { print SUM/1024/1024 " MB - " rbd }'  # sum extent lengths per image
+  done < <(rbd -p $pool ls) | sort -rV  # biggest image first
+}
+
+
+#
+# Print OSDs belonging to a particular storage host
+#
+
+function list_osd_from_host() {
+
+  if [ $# -eq 1 ]; then
+    local host=$1
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  $CEPH osd tree --format json-pretty 2> /dev/null | $JQ --arg host $host '.nodes[] | select(.type=="host") | select(.name == $host).children[]' | sort -V  # .children[] of the host bucket are its OSD ids
+
+}
+
+
+#
+# List all OSD nodes
+#
+
+function list_all_nodes() {
+
+
+  $CEPH osd tree --format json | $JQ -M --raw-output '.nodes[] | select(.type=="host") | .name' | sort -V  # every "host" bucket in the CRUSH tree
+
+}
+
+
+#
+# Print Total OSD usage of a particular storage host
+#
+
+function show_host_osd_usage() {
+
+  if [ $# -eq 1 ]; then
+    local host=$1
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  local pgmap_t=$(mktemp)  # cache the osd stats dump; queried three times per OSD below
+
+  local osd_used_kb=0
+  local total_used_kb=0
+
+  local total_available_kb=0
+  local osd_available_kb=0
+
+  local total_size_kb=0
+  local osd_size_kb=0
+  local nb_osd=0
+
+  [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..."
+  if ! $CEPH pg dump osds --format json 2>/dev/null > $pgmap_t; then
+    echoerr "Failed to retrieve PG map"
+    exit 1
+  fi
+
+  [ $DEBUG -eq 1 ] && echoinfo "Looking for all OSDs on host $host..."
+
+  for osd in $(list_osd_from_host $host); do
+
+    osd_used_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_used' $pgmap_t)
+    osd_available_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_avail' $pgmap_t)
+    osd_size_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb' $pgmap_t)
+
+    [ $DEBUG -eq 1 ] && echoinfo "OSD:$osd | Size:$(echo "scale=1;$osd_size_kb/1024/1024" | bc -l)GB | Used:$(echo "scale=1;$osd_used_kb /1024/1024" | bc -l)GB | Available:$(echo "scale=1;$osd_available_kb/1024/1024" | bc -l)GB"
+
+    let "total_used_kb=total_used_kb+osd_used_kb"
+    let "total_available_kb=total_available_kb+osd_available_kb"
+    let "total_size_kb=total_size_kb+osd_size_kb"
+    let "nb_osd++"
+
+  done
+
+  echo "Host:$host | OSDs:$nb_osd | Total_Size:$(echo "scale=1;$total_size_kb/1024/1024" | bc -l)GB | Total_Used:$(echo "scale=1;$total_used_kb /1024/1024" | bc -l)GB | Total_Available:$(echo "scale=1;$total_available_kb/1024/1024" | bc -l)GB"
+
+  rm -f $pgmap_t
+}
+
+
+#
+# Print Total OSD usage of all nodes
+#
+
+function list_all_nodes_osd_usage() {
+
+
+  for host in $(list_all_nodes); do
+
+    [ $DEBUG -eq 1 ] && echoinfo "Looking at node $host..."
+
+    show_host_osd_usage $host  # one summary line per host
+  done
+
+}
+
+
+#
+# Find most used (space) OSD
+#
+
+function find_most_used_osd() {
+
+  local osd=$($CEPH pg dump osds --format json 2> /dev/null| $JQ 'max_by(.kb_used) | .osd')  # OSD with the highest kb_used
+  local host=$($CEPH osd find $osd 2> /dev/null | $JQ .crush_location.host)
+
+  echo "OSD:osd.${osd} | host:$host"
+}
+
+
+#
+# Find less used (space) OSD
+#
+
+function find_less_used_osd() {
+
+  local osd=$($CEPH pg dump osds --format json 2> /dev/null| $JQ 'min_by(.kb_used) | .osd')  # OSD with the lowest kb_used
+  local host=$($CEPH osd find $osd 2> /dev/null | $JQ .crush_location.host)
+
+  echo "OSD:osd.${osd} | host:$host"
+}
+
+
+#
+# Query PG stats
+#
+
+function pg_stat_query() {
+
+  if [ $# -eq 1 ]; then
+    local query_type=$1  # a complete jq filter, e.g. 'max_by(.stat_sum.num_write).pgid'
+  else
+    echoerr "This command requires one argument"
+    help
+    exit 1
+  fi
+
+  local pgmap_t=$(mktemp)  # cache the PG dump; queried twice below
+
+  [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..."
+  if ! $CEPH pg dump pgs --format json 2>/dev/null > "$pgmap_t"; then
+    echoerr "Failed to retrieve PG map"
+    rm -f "$pgmap_t"; exit 1
+  fi
+
+  # the query IS the jq program; the former '--arg query_type' was redundant
+  local pgid=$($JQ "$query_type" "$pgmap_t")
+  [ $DEBUG -eq 1 ] && echoinfo "Found PGID $pgid"
+
+  local osd=$($JQ --arg pgid "$pgid" '.[] | select(.pgid == $pgid).acting_primary' "$pgmap_t")
+  [ $DEBUG -eq 1 ] && echoinfo "Found OSD $osd"
+
+  local host=$($CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host)
+  [ $DEBUG -eq 1 ] && echoinfo "Found host $host"
+
+  echo "PG:$pgid | OSD:osd.$osd | Host:$host"
+
+  rm -f "$pgmap_t"
+}
+
+
+#
+# Find empty pgs (no object stored)
+#
+
+function find_empty_pg() {
+
+  $CEPH pg dump pgs --format json 2>/dev/null | $JQ '.[] | select(.stat_sum.num_objects == 0).pgid'  # PGs that hold zero objects
+
+}
+
+
+#
+# MAIN
+#
+
+
+# Print help if no argument is given
+if [ $# -eq 0 ]; then
+  help
+  exit 1
+fi
+
+# Activate debug mode if -d is specified as first parameter
+if [ "$1" = "-d" ]; then
+  echoinfo "Debug mode activated"
+  DEBUG=1
+  shift
+else
+  DEBUG=0
+fi
+
+
+# Check if all requirements are met
+check_requirements || exit 1
+
+
+# Call proper function
+case $1 in  # dispatch on the sub-command name; extra args passed through positionally
+  "-h")
+    help
+    exit 0
+    ;;
+  "host-get-osd")
+    list_osd_from_host $2
+    ;;
+  "host-get-nodes")
+    list_all_nodes
+    ;;
+  "host-osd-usage")
+    show_host_osd_usage $2
+    ;;
+  "host-all-usage")
+    list_all_nodes_osd_usage
+    ;;
+  "pg-get-host")
+    find_host_from_pg $2
+    ;;
+  "pg-most-write")
+    pg_stat_query "max_by(.stat_sum.num_write).pgid"
+    ;;
+  "pg-less-write")
+    pg_stat_query "min_by(.stat_sum.num_write).pgid"
+    ;;
+  "pg-most-write-kb")
+    pg_stat_query "max_by(.stat_sum.num_write_kb).pgid"
+    ;;
+  "pg-less-write-kb")
+    pg_stat_query "min_by(.stat_sum.num_write_kb).pgid"
+    ;;
+  "pg-most-read")
+    pg_stat_query "max_by(.stat_sum.num_read).pgid"
+    ;;
+  "pg-less-read")
+    pg_stat_query "min_by(.stat_sum.num_read).pgid"
+    ;;
+  "pg-most-read-kb")
+    pg_stat_query "max_by(.stat_sum.num_read_kb).pgid"
+    ;;
+  "pg-less-read-kb")
+    pg_stat_query "min_by(.stat_sum.num_read_kb).pgid"
+    ;;
+  "rbd-prefix")
+    get_rbd_prefix $2 $3
+    ;;
+  "rbd-count")
+    count_rbd_object $2 $3
+    ;;
+  "rbd-host")
+    find_prim_host_from_rbd $2 $3
+    ;;
+  "rbd-osd")
+    find_prim_osd_from_rbd $2 $3
+    ;;
+  "rbd-size")
+    print_rbd_real_size $2 $3
+    ;;
+  "rbd-all-size")
+    list_all_rbd_real_size $2
+    ;;
+  "osd-most-used")
+    find_most_used_osd
+    ;;
+  "osd-less-used")
+    find_less_used_osd
+    ;;
+  "osd-get-ppg")
+    find_prim_pg_from_osd $2
+    ;;
+  "osd-get-pg")
+    find_all_pg_from_osd $2
+    ;;
+  "pg-empty")
+    find_empty_pg
+    ;;
+  "object-get-host")
+    find_host_from_object $2 $3
+    ;;
+  *)
+    echoerr "Unknown command : $1"
+    help
+    exit 1
+    ;;
+esac
+
diff --git a/src/tools/ceph-monstore-update-crush.sh b/src/tools/ceph-monstore-update-crush.sh
new file mode 100755
index 000000000..5adfacdc2
--- /dev/null
+++ b/src/tools/ceph-monstore-update-crush.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: Kefu Chai <kchai@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+verbose=
+
+test -d ../src && export PATH=$PATH:.
+
+# 'which' prints the resolved path on stdout; discard it -- only the exit
+# status matters, and stdout must stay clean for the crush-map output
+if ! which jq >/dev/null ; then
+  echo "Missing jq binary!"
+  exit 1
+fi
+
+if [ `uname` = FreeBSD ]; then
+  GETOPT=/usr/local/bin/getopt  # base getopt on FreeBSD lacks long-option support
+else
+  GETOPT=getopt
+fi
+
+function osdmap_get() {
+  local store_path=$1
+  local query=$2
+  local epoch=${3:+-v $3}  # becomes "-v <epoch>" if a 3rd arg is given, empty otherwise
+  local osdmap=`mktemp`
+
+  $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \
+    $epoch -o $osdmap > /dev/null || return
+
+  echo $($CEPH_BIN/osdmaptool --dump json $osdmap 2> /dev/null | \
+    jq "$query")
+
+  rm -f $osdmap
+}
+
+function test_crush() {
+  local store_path=$1
+  local epoch=$2
+  local max_osd=$3
+  local crush=$4
+  local osdmap=`mktemp`
+
+  $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \
+    -v $epoch -o $osdmap > /dev/null
+  $CEPH_BIN/osdmaptool --export-crush $crush $osdmap &> /dev/null
+
+  if $CEPH_BIN/crushtool --test --check $max_osd -i $crush > /dev/null; then  # --check validates mappings against max_osd
+    good=true
+  else
+    good=false
+  fi
+  rm -f $osdmap
+  $good || return 1  # status deferred so the temp osdmap is removed on both paths
+}
+
+function die() {
+  local retval=$?  # preserve the exit status of the command that just failed
+  echo "$@" >&2
+  exit $retval
+}
+
+function usage() {
+  [ $# -gt 0 ] && echo -e "\n$@"  # a message argument signals an error path
+  cat <<EOF
+
+Usage: $0 [options ...] <mon-store>
+
+Search backward for a latest known-good epoch in monstore. Rewrite the osdmap
+epochs after it with the crush map in the found epoch if asked to do so. By
+default, print out the crush map in the good epoch.
+
+  [-h|--help]            display this message
+  [--out]                write the found crush map to given file (default: stdout)
+  [--rewrite]            rewrite the monitor storage with the found crush map
+  [--verbose]            be more chatty
+EOF
+  [ $# -gt 0 ] && exit 1  # exit non-zero when invoked with an error message
+  exit 0
+}
+
+function main() {
+  local temp
+  temp=$($GETOPT -o h --long verbose,help,mon-store:,out:,rewrite -n $0 -- "$@") || return 1
+
+  eval set -- "$temp"
+  local rewrite
+  while [ "$1" != "--" ]; do
+    case "$1" in
+      --verbose)
+        verbose=true
+        # set -xe
+        # PS4='${FUNCNAME[0]}: $LINENO: '
+        shift;;
+      -h|--help)
+        usage
+        return 0;;
+      --out)
+        output=$2
+        shift 2;;
+      --osdmap-epoch)
+        # NOTE(review): dead arm -- 'osdmap-epoch:' is not declared in the
+        # $GETOPT spec above, so this option never reaches this case
+        osdmap_epoch=$2
+        shift 2;;
+      --rewrite)
+        rewrite=true
+        shift;;
+      *)
+        usage "unexpected argument $1"
+        shift;;
+    esac
+  done
+  shift
+
+  local store_path="$1"
+  test $store_path || usage "I need the path to mon-store."
+
+  # try accessing the store; if it fails, likely means a mon is running
+  local last_osdmap_epoch
+  local max_osd
+  last_osdmap_epoch=$(osdmap_get $store_path ".epoch") || \
+    die "error accessing mon store at $store_path"
+  # get the max_osd # in last osdmap epoch, crushtool will use it to check
+  # the crush maps in previous osdmaps
+  max_osd=$(osdmap_get $store_path ".max_osd" $last_osdmap_epoch)
+
+  local good_crush
+  local good_epoch
+  test $verbose && echo "the latest osdmap epoch is $last_osdmap_epoch"
+  for epoch in `seq $last_osdmap_epoch -1 1`; do
+    local crush_path=`mktemp`
+    test $verbose && echo "checking crush map #$epoch"
+    if test_crush $store_path $epoch $max_osd $crush_path; then
+      # (was "#$epoch works with osdmap epoch #$osdmap_epoch": $osdmap_epoch is never set)
+      test $verbose && echo "crush map version #$epoch is usable"
+      good_epoch=$epoch
+      good_crush=$crush_path
+      break
+    fi
+    rm -f $crush_path
+  done
+
+  if test $good_epoch; then
+    echo "good crush map found at epoch $good_epoch/$last_osdmap_epoch"
+  else
+    # was 'echo ... 2>&1', a no-op: diagnostics belong on stderr (>&2)
+    echo "Unable to find a usable crush map in epochs 1..$last_osdmap_epoch." >&2
+    return 1
+  fi
+
+  if test $good_epoch -eq $last_osdmap_epoch; then
+    echo "and mon store has no faulty crush maps."
+  elif test $output; then
+    $CEPH_BIN/crushtool --decompile $good_crush --outfn $output
+  elif test $rewrite; then
+    $CEPH_BIN/ceph-monstore-tool $store_path rewrite-crush -- \
+      --crush $good_crush \
+      --good-epoch $good_epoch
+  else
+    echo
+    $CEPH_BIN/crushtool --decompile $good_crush
+  fi
+  rm -f $good_crush
+}
+
+main "$@"
diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc
new file mode 100644
index 000000000..c650cc880
--- /dev/null
+++ b/src/tools/ceph_authtool.cc
@@ -0,0 +1,314 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ConfUtils.h"
+#include "common/ceph_argparse.h"
+#include "common/config_proxy.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+
+#include "auth/Crypto.h"
+#include "auth/Auth.h"
+#include "auth/KeyRing.h"
+
+void usage()
+{
+ cout << "usage: ceph-authtool keyringfile [OPTIONS]...\n"
+ << "where the options are:\n"
+ << " -l, --list will list all keys and capabilities present in\n"
+ << " the keyring\n"
+ << " -p, --print-key will print an encoded key for the specified\n"
+ << " entityname. This is suitable for the\n"
+ << " 'mount -o secret=..' argument\n"
+ << " -C, --create-keyring will create a new keyring, overwriting any\n"
+ << " existing keyringfile\n"
+ << " -g, --gen-key will generate a new secret key for the\n"
+ << " specified entityname\n"
+       << "  --gen-print-key               will generate a new secret key without setting it\n"
+ << " to the keyringfile, prints the secret to stdout\n"
+ << " --import-keyring FILE will import the content of a given keyring\n"
+ << " into the keyringfile\n"
+ << " -n NAME, --name NAME specify entityname to operate on\n"
+ << " -a BASE64, --add-key BASE64 will add an encoded key to the keyring\n"
+ << " --cap SUBSYSTEM CAPABILITY will set the capability for given subsystem\n"
+ << " --caps CAPSFILE will set all of capabilities associated with a\n"
+ << " given key, for all subsystems\n"
+ << " --mode MODE will set the desired file mode to the keyring\n"
+ << " e.g: '0644', defaults to '0600'"
+ << std::endl;
+ exit(1);
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ std::string add_key;
+ std::string caps_fn;
+ std::string import_keyring;
+ map<string,bufferlist> caps;
+ std::string fn;
+
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+ bool gen_key = false;
+ bool gen_print_key = false;
+ bool list = false;
+ bool print_key = false;
+ bool create_keyring = false;
+ int mode = 0600; // keyring file mode
+ std::vector<const char*>::iterator i;
+
+ /* Handle options unique to ceph-authtool
+ * -n NAME, --name NAME is handled by global_init
+ * */
+ for (i = args.begin(); i != args.end(); ) {
+ std::string val;
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_flag(args, i, "-g", "--gen-key", (char*)NULL)) {
+ gen_key = true;
+ } else if (ceph_argparse_flag(args, i, "--gen-print-key", (char*)NULL)) {
+ gen_print_key = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "-a", "--add-key", (char*)NULL)) {
+ if (val.empty()) {
+ cerr << "Option --add-key requires an argument" << std::endl;
+ exit(1);
+ }
+ add_key = val;
+ } else if (ceph_argparse_flag(args, i, "-l", "--list", (char*)NULL)) {
+ list = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
+ caps_fn = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--cap", (char*)NULL)) {
+ std::string my_key = val;
+ if (i == args.end()) {
+ cerr << "must give two arguments to --cap: key and val." << std::endl;
+ exit(1);
+ }
+ std::string my_val = *i;
+ ++i;
+ encode(my_val, caps[my_key]);
+ } else if (ceph_argparse_flag(args, i, "-p", "--print-key", (char*)NULL)) {
+ print_key = true;
+ } else if (ceph_argparse_flag(args, i, "-C", "--create-keyring", (char*)NULL)) {
+ create_keyring = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--import-keyring", (char*)NULL)) {
+ import_keyring = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) {
+ std::string err;
+ mode = strict_strtoll(val.c_str(), 8, &err);
+ if (!err.empty()) {
+	cerr << "Option --mode: invalid octal value '" << val << "': " << err << std::endl;
+ exit(1);
+ }
+ } else if (fn.empty()) {
+ fn = *i++;
+ } else {
+ cerr << argv[0] << ": unexpected '" << *i << "'" << std::endl;
+ usage();
+ }
+ }
+
+ if (fn.empty() && !gen_print_key) {
+ cerr << argv[0] << ": must specify filename" << std::endl;
+ usage();
+ }
+ if (!(gen_key ||
+ gen_print_key ||
+ !add_key.empty() ||
+ list ||
+ !caps_fn.empty() ||
+ !caps.empty() ||
+ print_key ||
+ create_keyring ||
+ !import_keyring.empty())) {
+ cerr << "no command specified" << std::endl;
+ usage();
+ }
+ if (gen_key && (!add_key.empty())) {
+ cerr << "can't both gen-key and add-key" << std::endl;
+ usage();
+ }
+
+ common_init_finish(g_ceph_context);
+ EntityName ename(g_conf()->name);
+
+ // Enforce the use of gen-key or add-key when creating to avoid ending up
+ // with an "empty" key (key = AAAAAAAAAAAAAAAA)
+ if (create_keyring && !gen_key && add_key.empty() && !caps.empty()) {
+ cerr << "must specify either gen-key or add-key when creating" << std::endl;
+ usage();
+ }
+
+ if (gen_print_key) {
+ CryptoKey key;
+ key.create(g_ceph_context, CEPH_CRYPTO_AES);
+ cout << key << std::endl;
+ return 0;
+ }
+
+ // keyring --------
+ bool modified = false;
+ bool added_entity = false;
+ KeyRing keyring;
+
+ bufferlist bl;
+ int r = 0;
+ if (create_keyring) {
+ cout << "creating " << fn << std::endl;
+ modified = true;
+ } else {
+ std::string err;
+ r = bl.read_file(fn.c_str(), &err);
+ if (r >= 0) {
+ try {
+ auto iter = bl.cbegin();
+ decode(keyring, iter);
+ } catch (const buffer::error &err) {
+ cerr << "error reading file " << fn << std::endl;
+ exit(1);
+ }
+ } else {
+ cerr << "can't open " << fn << ": " << err << std::endl;
+ exit(1);
+ }
+ }
+
+ // Validate that "name" actually has an existing key in this keyring if we
+ // have not given gen-key or add-key options
+ if (!gen_key && add_key.empty() && !caps.empty()) {
+ CryptoKey key;
+ if (!keyring.get_secret(ename, key)) {
+ cerr << "can't find existing key for " << ename
+ << " and neither gen-key nor add-key specified" << std::endl;
+ exit(1);
+ }
+ }
+
+ // write commands
+ if (!import_keyring.empty()) {
+ KeyRing other;
+ bufferlist obl;
+ std::string err;
+ int r = obl.read_file(import_keyring.c_str(), &err);
+ if (r >= 0) {
+ try {
+ auto iter = obl.cbegin();
+ decode(other, iter);
+ } catch (const buffer::error &err) {
+ cerr << "error reading file " << import_keyring << std::endl;
+ exit(1);
+ }
+
+ cout << "importing contents of " << import_keyring << " into " << fn << std::endl;
+ //other.print(cout);
+ keyring.import(g_ceph_context, other);
+ modified = true;
+ } else {
+ cerr << "can't open " << import_keyring << ": " << err << std::endl;
+ exit(1);
+ }
+ }
+ if (gen_key) {
+ EntityAuth eauth;
+ eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+ keyring.add(ename, eauth);
+ modified = true;
+ }
+ if (!add_key.empty()) {
+ EntityAuth eauth;
+ try {
+ eauth.key.decode_base64(add_key);
+ } catch (const buffer::error &err) {
+ cerr << "can't decode key '" << add_key << "'" << std::endl;
+ exit(1);
+ }
+ keyring.add(ename, eauth);
+ modified = true;
+ cout << "added entity " << ename << " " << eauth << std::endl;
+ added_entity = true;
+ }
+ if (!caps_fn.empty()) {
+ ConfFile cf;
+ if (cf.parse_file(caps_fn, &cerr) != 0) {
+ cerr << "could not parse caps file " << caps_fn << std::endl;
+ exit(1);
+ }
+ map<string, bufferlist> caps;
+ const char *key_names[] = { "mon", "osd", "mds", "mgr", NULL };
+ for (int i=0; key_names[i]; i++) {
+ std::string val;
+ if (cf.read("global", key_names[i], val) == 0) {
+ bufferlist bl;
+ encode(val, bl);
+ string s(key_names[i]);
+ caps[s] = bl;
+ }
+ }
+ keyring.set_caps(ename, caps);
+ modified = true;
+ }
+ if (!caps.empty()) {
+ keyring.set_caps(ename, caps);
+ modified = true;
+ }
+ if (added_entity && caps.size() > 0) {
+ cout << "added " << caps.size() << " caps to entity " << ename << std::endl;
+ }
+
+ // read commands
+ if (list) {
+ try {
+ keyring.print(cout);
+ } catch (ceph::buffer::end_of_buffer &eob) {
+ cout << "Exception (end_of_buffer) in print(), exit." << std::endl;
+ exit(1);
+ }
+ }
+ if (print_key) {
+ CryptoKey key;
+ if (keyring.get_secret(ename, key)) {
+ cout << key << std::endl;
+ } else {
+ cerr << "entity " << ename << " not found" << std::endl;
+ exit(1);
+ }
+ }
+
+ // write result?
+ if (modified) {
+ bufferlist bl;
+ keyring.encode_plaintext(bl);
+ r = bl.write_file(fn.c_str(), mode);
+ if (r < 0) {
+ cerr << "could not write " << fn << std::endl;
+ exit(1);
+ }
+ //cout << "wrote " << bl.length() << " bytes to " << fn << std::endl;
+ }
+ return 0;
+}
diff --git a/src/tools/ceph_conf.cc b/src/tools/ceph_conf.cc
new file mode 100644
index 000000000..d26cbb039
--- /dev/null
+++ b/src/tools/ceph_conf.cc
@@ -0,0 +1,275 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iomanip>
+#include <string>
+
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "mon/AuthMonitor.h"
+#include "common/Formatter.h"
+
+using std::deque;
+using std::string;
+
+static void usage(std::ostream& out)
+{
+ // TODO: add generic_usage once cerr/derr issues are resolved
+ out << R"(Ceph configuration query tool
+
+USAGE
+ceph-conf <flags> <action>
+
+ACTIONS
+ -L|--list-all-sections List all sections
+ -l|--list-sections <prefix> List sections with the given prefix
+ --filter-key <key> Filter section list to only include sections
+ with given key defined.
+ --filter-key-value <key>=<val> Filter section list to only include sections
+ with given key/value pair.
+ --lookup <key> Print a configuration setting to stdout.
+ Returns 0 (success) if the configuration setting is
+ found; 1 otherwise.
+ -r|--resolve-search search for the first file that exists and
+ can be opened in the resulted comma
+ delimited search list.
+ -D|--dump-all dump all variables.
+ --show-config-value <key> Print the corresponding ceph.conf value
+ that matches the specified key. Also searches
+ global defaults.
+
+FLAGS
+ --name name Set type.id
+ [-s <section>] Add to list of sections to search
+ [--format plain|json|json-pretty]
+ dump variables in plain text, json or pretty
+ json
+ [--pid <pid>] Override the $pid when expanding options
+
+If there is no action given, the action will default to --lookup.
+
+EXAMPLES
+$ ceph-conf --name mon.0 -c /etc/ceph/ceph.conf 'mon addr'
+Find out what the value of 'mon addr' is for monitor 0.
+
+$ ceph-conf -l mon
+List sections beginning with 'mon'.
+
+RETURN CODE
+Return code will be 0 on success; error code otherwise.
+)";
+}
+
+static int list_sections(const std::string &prefix,
+ const std::list<string>& filter_key,
+ const std::map<string,string>& filter_key_value)
+{
+ std::vector <std::string> sections;
+ int ret = g_conf().get_all_sections(sections);
+ if (ret)
+ return 2;
+ for (std::vector<std::string>::const_iterator p = sections.begin();
+ p != sections.end(); ++p) {
+ if (strncmp(prefix.c_str(), p->c_str(), prefix.size()))
+ continue;
+
+ std::vector<std::string> sec;
+ sec.push_back(*p);
+
+ int r = 0;
+ for (std::list<string>::const_iterator q = filter_key.begin(); q != filter_key.end(); ++q) {
+ string v;
+ r = g_conf().get_val_from_conf_file(sec, q->c_str(), v, false);
+ if (r < 0)
+ break;
+ }
+ if (r < 0)
+ continue;
+
+ for (std::map<string,string>::const_iterator q = filter_key_value.begin();
+ q != filter_key_value.end();
+ ++q) {
+ string v;
+ r = g_conf().get_val_from_conf_file(sec, q->first.c_str(), v, false);
+ if (r < 0 || v != q->second) {
+ r = -1;
+ break;
+ }
+ }
+ if (r < 0)
+ continue;
+
+ cout << *p << std::endl;
+ }
+ return 0;
+}
+
+static int lookup(const std::deque<std::string> &sections,
+ const std::string &key, bool resolve_search)
+{
+ std::vector<std::string> my_sections{sections.begin(), sections.end()};
+ for (auto& section : g_conf().get_my_sections()) {
+ my_sections.push_back(section);
+ }
+ std::string val;
+ int ret = g_conf().get_val_from_conf_file(my_sections, key.c_str(), val, true);
+ if (ret == -ENOENT)
+ return 1;
+ else if (ret == 0) {
+ if (resolve_search) {
+ string result;
+ ret = ceph_resolve_file_search(val, result);
+ if (!ret)
+ puts(result.c_str());
+ }
+ else {
+ puts(val.c_str());
+ }
+ return 0;
+ }
+ else {
+ cerr << "error looking up '" << key << "': error " << ret << std::endl;
+ return 2;
+ }
+}
+
+static int dump_all(const string& format)
+{
+ if (format == "" || format == "plain") {
+ g_conf().show_config(std::cout);
+ return 0;
+ } else {
+ unique_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_object_section("ceph-conf");
+ g_conf().show_config(f.get());
+ f->close_section();
+ f->flush(std::cout);
+ return 0;
+ }
+ cerr << "format '" << format << "' not recognized." << std::endl;
+ usage(cerr);
+ return 1;
+ }
+}
+
+static void maybe_override_pid(vector<const char*>& args)
+{
+ for (auto i = args.begin(); i != args.end(); ++i) {
+ string val;
+ if (ceph_argparse_witharg(args, i, &val, "--pid", (char*)NULL)) {
+ setenv("PID", val.c_str(), 1);
+ break;
+ }
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ deque<std::string> sections;
+ bool resolve_search = false;
+ std::string action;
+ std::string lookup_key;
+ std::string section_list_prefix;
+ std::list<string> filter_key;
+ std::map<string,string> filter_key_value;
+ std::string dump_format;
+
+ argv_to_vec(argc, argv, args);
+
+ auto orig_args = args;
+ auto cct = [&args] {
+ // override the PID before options are expanded
+ maybe_override_pid(args);
+ std::map<std::string,std::string> defaults = {{"log_to_file", "false"}};
+ return global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_NO_DAEMON_ACTIONS |
+ CINIT_FLAG_NO_MON_CONFIG);
+ }();
+
+ // do not common_init_finish(); do not start threads; do not do any of thing
+ // wonky things the daemon whose conf we are examining would do (like initialize
+ // the admin socket).
+ //common_init_finish(g_ceph_context);
+
+ std::string val;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "-s", "--section", (char*)NULL)) {
+ sections.push_back(val);
+ } else if (ceph_argparse_flag(args, i, "-r", "--resolve_search", (char*)NULL)) {
+ resolve_search = true;
+ } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+ action = "help";
+ } else if (ceph_argparse_witharg(args, i, &val, "--lookup", (char*)NULL)) {
+ action = "lookup";
+ lookup_key = val;
+ } else if (ceph_argparse_flag(args, i, "-L", "--list_all_sections", (char*)NULL)) {
+ action = "list-sections";
+ section_list_prefix = "";
+ } else if (ceph_argparse_witharg(args, i, &val, "-l", "--list_sections", (char*)NULL)) {
+ action = "list-sections";
+ section_list_prefix = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--filter_key", (char*)NULL)) {
+ filter_key.push_back(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--filter_key_value", (char*)NULL)) {
+ size_t pos = val.find_first_of('=');
+ if (pos == string::npos) {
+ cerr << "expecting argument like 'key=value' for --filter-key-value (not '" << val << "')" << std::endl;
+ usage(cerr);
+ return EXIT_FAILURE;
+ }
+ string key(val, 0, pos);
+ string value(val, pos+1);
+ filter_key_value[key] = value;
+ } else if (ceph_argparse_flag(args, i, "-D", "--dump_all", (char*)NULL)) {
+ action = "dumpall";
+ } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) {
+ dump_format = val;
+ } else {
+ if (((action == "lookup") || (action == "")) && (lookup_key.empty())) {
+ action = "lookup";
+ lookup_key = *i++;
+ } else {
+ cerr << "unable to parse option: '" << *i << "'" << std::endl;
+ cerr << "args:";
+ for (auto arg : orig_args) {
+ cerr << " " << quoted(arg);
+ }
+ cerr << std::endl;
+ usage(cerr);
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ cct->_log->flush();
+ if (action == "help") {
+ usage(cout);
+ return EXIT_SUCCESS;
+ } else if (action == "list-sections") {
+ return list_sections(section_list_prefix, filter_key, filter_key_value);
+ } else if (action == "lookup") {
+ return lookup(sections, lookup_key, resolve_search);
+ } else if (action == "dumpall") {
+ return dump_all(dump_format);
+ } else {
+ cerr << "You must give an action, such as --lookup or --list-all-sections." << std::endl;
+ cerr << "Pass --help for more help." << std::endl;
+ return EXIT_FAILURE;
+ }
+}
diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
new file mode 100644
index 000000000..fff294374
--- /dev/null
+++ b/src/tools/ceph_dedup_tool.cc
@@ -0,0 +1,964 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Myoungwon Oh <ohmyoungwon@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/types.h"
+
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rados/rados_types.hpp"
+
+#include "acconfig.h"
+
+#include "common/Cond.h"
+#include "common/Formatter.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_crypto.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/obj_bencher.h"
+#include "global/global_init.h"
+
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <time.h>
+#include <sstream>
+#include <errno.h>
+#include <dirent.h>
+#include <stdexcept>
+#include <climits>
+#include <locale>
+#include <memory>
+#include <math.h>
+
+#include "tools/RadosDump.h"
+#include "cls/cas/cls_cas_client.h"
+#include "cls/cas/cls_cas_internal.h"
+#include "include/stringify.h"
+#include "global/signal_handler.h"
+#include "common/CDC.h"
+
+struct EstimateResult {
+ std::unique_ptr<CDC> cdc;
+
+ uint64_t chunk_size;
+
+ ceph::mutex lock = ceph::make_mutex("EstimateResult::lock");
+
+ // < key, <count, chunk_size> >
+ map< string, pair <uint64_t, uint64_t> > chunk_statistics;
+ uint64_t total_bytes = 0;
+ std::atomic<uint64_t> total_objects = {0};
+
+ EstimateResult(std::string alg, int chunk_size)
+ : cdc(CDC::create(alg, chunk_size)),
+ chunk_size(1ull << chunk_size) {}
+
+ void add_chunk(bufferlist& chunk, const std::string& fp_algo) {
+ string fp;
+ if (fp_algo == "sha1") {
+ sha1_digest_t sha1_val = crypto::digest<crypto::SHA1>(chunk);
+ fp = sha1_val.to_str();
+ } else if (fp_algo == "sha256") {
+ sha256_digest_t sha256_val = crypto::digest<crypto::SHA256>(chunk);
+ fp = sha256_val.to_str();
+ } else if (fp_algo == "sha512") {
+ sha512_digest_t sha512_val = crypto::digest<crypto::SHA512>(chunk);
+ fp = sha512_val.to_str();
+ } else {
+      ceph_assert(0 == "unsupported fingerprint algorithm");
+ }
+
+ std::lock_guard l(lock);
+ auto p = chunk_statistics.find(fp);
+ if (p != chunk_statistics.end()) {
+ p->second.first++;
+ if (p->second.second != chunk.length()) {
+ cerr << "warning: hash collision on " << fp
+ << ": was " << p->second.second
+ << " now " << chunk.length() << std::endl;
+ }
+ } else {
+ chunk_statistics[fp] = make_pair(1, chunk.length());
+ }
+ total_bytes += chunk.length();
+ }
+
+ void dump(Formatter *f) const {
+ f->dump_unsigned("target_chunk_size", chunk_size);
+
+ uint64_t dedup_bytes = 0;
+ uint64_t dedup_objects = chunk_statistics.size();
+ for (auto& j : chunk_statistics) {
+ dedup_bytes += j.second.second;
+ }
+ //f->dump_unsigned("dedup_bytes", dedup_bytes);
+ //f->dump_unsigned("original_bytes", total_bytes);
+ f->dump_float("dedup_bytes_ratio",
+ (double)dedup_bytes / (double)total_bytes);
+ f->dump_float("dedup_objects_ratio",
+ (double)dedup_objects / (double)total_objects);
+
+ uint64_t avg = total_bytes / dedup_objects;
+ uint64_t sqsum = 0;
+ for (auto& j : chunk_statistics) {
+ sqsum += (avg - j.second.second) * (avg - j.second.second);
+ }
+ uint64_t stddev = sqrt(sqsum / dedup_objects);
+ f->dump_unsigned("chunk_size_average", avg);
+ f->dump_unsigned("chunk_size_stddev", stddev);
+ }
+};
+
+map<uint64_t, EstimateResult> dedup_estimates; // chunk size -> result
+
+using namespace librados;
+unsigned default_op_size = 1 << 26;
+unsigned default_max_thread = 2;
+int32_t default_report_period = 10;
+ceph::mutex glock = ceph::make_mutex("glock");
+
+void usage()
+{
+ cout << " usage: [--op <estimate|chunk-scrub|chunk-get-ref|chunk-put-ref|dump-chunk-refs>] [--pool <pool_name> ] " << std::endl;
+ cout << " --object <object_name> " << std::endl;
+ cout << " --chunk-size <size> chunk-size (byte) " << std::endl;
+ cout << " --chunk-algorithm <fixed|fastcdc> " << std::endl;
+ cout << " --fingerprint-algorithm <sha1|sha256|sha512> " << std::endl;
+ cout << " --chunk-pool <pool name> " << std::endl;
+ cout << " --max-thread <threads> " << std::endl;
+ cout << " --report-period <seconds> " << std::endl;
+ cout << " --max-seconds <seconds>" << std::endl;
+ cout << " --max-read-size <bytes> " << std::endl;
+ exit(1);
+}
+
+template <typename I, typename T>
+static int rados_sistrtoll(I &i, T *val) {
+ std::string err;
+ *val = strict_iecstrtoll(i->second.c_str(), &err);
+ if (err != "") {
+ cerr << "Invalid value for " << i->first << ": " << err << std::endl;
+ return -EINVAL;
+ } else {
+ return 0;
+ }
+}
+
+class EstimateDedupRatio;
+class ChunkScrub;
+class CrawlerThread : public Thread
+{
+ IoCtx io_ctx;
+ int n;
+ int m;
+ ObjectCursor begin;
+ ObjectCursor end;
+ ceph::mutex m_lock = ceph::make_mutex("CrawlerThread::Locker");
+ ceph::condition_variable m_cond;
+ int32_t report_period;
+ bool m_stop = false;
+ uint64_t total_bytes = 0;
+ uint64_t total_objects = 0;
+ uint64_t examined_objects = 0;
+ uint64_t examined_bytes = 0;
+ uint64_t max_read_size = 0;
+ bool debug = false;
+#define COND_WAIT_INTERVAL 10
+
+public:
+ CrawlerThread(IoCtx& io_ctx, int n, int m,
+ ObjectCursor begin, ObjectCursor end, int32_t report_period,
+ uint64_t num_objects, uint64_t max_read_size = default_op_size):
+ io_ctx(io_ctx), n(n), m(m), begin(begin), end(end),
+ report_period(report_period), total_objects(num_objects), max_read_size(max_read_size)
+ {}
+ void signal(int signum) {
+ std::lock_guard l{m_lock};
+ m_stop = true;
+ m_cond.notify_all();
+ }
+ virtual void print_status(Formatter *f, ostream &out) {}
+ uint64_t get_examined_objects() { return examined_objects; }
+ uint64_t get_examined_bytes() { return examined_bytes; }
+ uint64_t get_total_bytes() { return total_bytes; }
+ uint64_t get_total_objects() { return total_objects; }
+ void set_debug(const bool debug_) { debug = debug_; }
+ friend class EstimateDedupRatio;
+ friend class ChunkScrub;
+};
+
+class EstimateDedupRatio : public CrawlerThread
+{
+ string chunk_algo;
+ string fp_algo;
+ uint64_t chunk_size;
+ uint64_t max_seconds;
+
+public:
+ EstimateDedupRatio(
+ IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end,
+ string chunk_algo, string fp_algo, uint64_t chunk_size, int32_t report_period,
+ uint64_t num_objects, uint64_t max_read_size,
+ uint64_t max_seconds):
+ CrawlerThread(io_ctx, n, m, begin, end, report_period, num_objects,
+ max_read_size),
+ chunk_algo(chunk_algo),
+ fp_algo(fp_algo),
+ chunk_size(chunk_size),
+ max_seconds(max_seconds) {
+ }
+
+ void* entry() {
+ estimate_dedup_ratio();
+ return NULL;
+ }
+ void estimate_dedup_ratio();
+};
+
+class ChunkScrub: public CrawlerThread
+{
+ IoCtx chunk_io_ctx;
+ int damaged_objects = 0;
+
+public:
+ ChunkScrub(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end,
+ IoCtx& chunk_io_ctx, int32_t report_period, uint64_t num_objects):
+ CrawlerThread(io_ctx, n, m, begin, end, report_period, num_objects), chunk_io_ctx(chunk_io_ctx)
+ { }
+ void* entry() {
+ chunk_scrub_common();
+ return NULL;
+ }
+ void chunk_scrub_common();
+ int get_damaged_objects() { return damaged_objects; }
+ void print_status(Formatter *f, ostream &out);
+};
+
+vector<std::unique_ptr<CrawlerThread>> estimate_threads;
+
+static void print_dedup_estimate(std::ostream& out, std::string chunk_algo)
+{
+ /*
+ uint64_t total_bytes = 0;
+ uint64_t total_objects = 0;
+ */
+ uint64_t examined_objects = 0;
+ uint64_t examined_bytes = 0;
+
+ for (auto &et : estimate_threads) {
+ examined_objects += et->get_examined_objects();
+ examined_bytes += et->get_examined_bytes();
+ }
+
+ auto f = Formatter::create("json-pretty");
+ f->open_object_section("results");
+ f->dump_string("chunk_algo", chunk_algo);
+ f->open_array_section("chunk_sizes");
+ for (auto& i : dedup_estimates) {
+ f->dump_object("chunker", i.second);
+ }
+ f->close_section();
+
+ f->open_object_section("summary");
+ f->dump_unsigned("examined_objects", examined_objects);
+ f->dump_unsigned("examined_bytes", examined_bytes);
+ /*
+ f->dump_unsigned("total_objects", total_objects);
+ f->dump_unsigned("total_bytes", total_bytes);
+ f->dump_float("examined_ratio", (float)examined_bytes / (float)total_bytes);
+ */
+ f->close_section();
+ f->close_section();
+ f->flush(out);
+}
+
+static void handle_signal(int signum)
+{
+ std::lock_guard l{glock};
+ for (auto &p : estimate_threads) {
+ p->signal(signum);
+ }
+}
+
+void EstimateDedupRatio::estimate_dedup_ratio()
+{
+ ObjectCursor shard_start;
+ ObjectCursor shard_end;
+
+ io_ctx.object_list_slice(
+ begin,
+ end,
+ n,
+ m,
+ &shard_start,
+ &shard_end);
+
+ utime_t start = ceph_clock_now();
+ utime_t end;
+ if (max_seconds) {
+ end = start;
+ end += max_seconds;
+ }
+
+ utime_t next_report;
+ if (report_period) {
+ next_report = start;
+ next_report += report_period;
+ }
+
+ ObjectCursor c(shard_start);
+ while (c < shard_end)
+ {
+ std::vector<ObjectItem> result;
+ int r = io_ctx.object_list(c, shard_end, 12, {}, &result, &c);
+ if (r < 0 ){
+ cerr << "error object_list : " << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ unsigned op_size = max_read_size;
+
+ for (const auto & i : result) {
+ const auto &oid = i.oid;
+
+ utime_t now = ceph_clock_now();
+ if (max_seconds && now > end) {
+ m_stop = true;
+ }
+ if (m_stop) {
+ return;
+ }
+
+ if (n == 0 && // first thread only
+ next_report != utime_t() && now > next_report) {
+ cerr << (int)(now - start) << "s : read "
+ << dedup_estimates.begin()->second.total_bytes << " bytes so far..."
+ << std::endl;
+ print_dedup_estimate(cerr, chunk_algo);
+ next_report = now;
+ next_report += report_period;
+ }
+
+ // read entire object
+ bufferlist bl;
+ uint64_t offset = 0;
+ while (true) {
+ bufferlist t;
+ int ret = io_ctx.read(oid, t, op_size, offset);
+ if (ret <= 0) {
+ break;
+ }
+ offset += ret;
+ bl.claim_append(t);
+ }
+ examined_objects++;
+ examined_bytes += bl.length();
+
+ // do the chunking
+ for (auto& i : dedup_estimates) {
+ vector<pair<uint64_t, uint64_t>> chunks;
+ i.second.cdc->calc_chunks(bl, &chunks);
+ for (auto& p : chunks) {
+ bufferlist chunk;
+ chunk.substr_of(bl, p.first, p.second);
+ i.second.add_chunk(chunk, fp_algo);
+ if (debug) {
+ cout << " " << oid << " " << p.first << "~" << p.second << std::endl;
+ }
+ }
+ ++i.second.total_objects;
+ }
+ }
+ }
+}
+
+void ChunkScrub::chunk_scrub_common()
+{
+ ObjectCursor shard_start;
+ ObjectCursor shard_end;
+ int ret;
+ Rados rados;
+
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ return;
+ }
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ return;
+ }
+
+ chunk_io_ctx.object_list_slice(
+ begin,
+ end,
+ n,
+ m,
+ &shard_start,
+ &shard_end);
+
+ ObjectCursor c(shard_start);
+ while(c < shard_end)
+ {
+ std::vector<ObjectItem> result;
+ int r = chunk_io_ctx.object_list(c, shard_end, 12, {}, &result, &c);
+ if (r < 0 ){
+ cerr << "error object_list : " << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ for (const auto & i : result) {
+ std::unique_lock l{m_lock};
+ if (m_stop) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ print_status(formatter, cout);
+ delete formatter;
+ return;
+ }
+ auto oid = i.oid;
+ cout << oid << std::endl;
+ chunk_refs_t refs;
+ {
+ bufferlist t;
+ ret = chunk_io_ctx.getxattr(oid, CHUNK_REFCOUNT_ATTR, t);
+ if (ret < 0) {
+ continue;
+ }
+ auto p = t.cbegin();
+ decode(refs, p);
+ }
+
+ examined_objects++;
+ if (refs.get_type() != chunk_refs_t::TYPE_BY_OBJECT) {
+ // we can't do anything here
+ continue;
+ }
+
+ // check all objects
+ chunk_refs_by_object_t *byo =
+ static_cast<chunk_refs_by_object_t*>(refs.r.get());
+ set<hobject_t> real_refs;
+
+ uint64_t pool_missing = 0;
+ uint64_t object_missing = 0;
+ uint64_t does_not_ref = 0;
+ for (auto& pp : byo->by_object) {
+ IoCtx target_io_ctx;
+ ret = rados.ioctx_create2(pp.pool, target_io_ctx);
+ if (ret < 0) {
+ cerr << oid << " ref " << pp
+ << ": referencing pool does not exist" << std::endl;
+ ++pool_missing;
+ continue;
+ }
+
+ ret = cls_cas_references_chunk(target_io_ctx, pp.oid.name, oid);
+ if (ret == -ENOENT) {
+ cerr << oid << " ref " << pp
+ << ": referencing object missing" << std::endl;
+ ++object_missing;
+ } else if (ret == -ENOLINK) {
+ cerr << oid << " ref " << pp
+ << ": referencing object does not reference chunk"
+ << std::endl;
+ ++does_not_ref;
+ }
+ }
+ if (pool_missing || object_missing || does_not_ref) {
+ ++damaged_objects;
+ }
+ }
+ }
+ cout << "--done--" << std::endl;
+}
+
+void ChunkScrub::print_status(Formatter *f, ostream &out)
+{
+ if (f) {
+ f->open_array_section("chunk_scrub");
+ f->dump_string("PID", stringify(get_pid()));
+ f->open_object_section("Status");
+ f->dump_string("Total object", stringify(total_objects));
+ f->dump_string("Examined objects", stringify(examined_objects));
+ f->dump_string("damaged objects", stringify(damaged_objects));
+ f->close_section();
+ f->flush(out);
+ cout << std::endl;
+ }
+}
+
+int estimate_dedup_ratio(const std::map < std::string, std::string > &opts,
+ std::vector<const char*> &nargs)
+{
+ Rados rados;
+ IoCtx io_ctx;
+ std::string chunk_algo = "fastcdc";
+ string fp_algo = "sha1";
+ string pool_name;
+ uint64_t chunk_size = 0;
+ uint64_t min_chunk_size = 8192;
+ uint64_t max_chunk_size = 4*1024*1024;
+ unsigned max_thread = default_max_thread;
+ uint32_t report_period = default_report_period;
+ uint64_t max_read_size = default_op_size;
+ uint64_t max_seconds = 0;
+ int ret;
+ std::map<std::string, std::string>::const_iterator i;
+ bool debug = false;
+ ObjectCursor begin;
+ ObjectCursor end;
+ librados::pool_stat_t s;
+ list<string> pool_names;
+ map<string, librados::pool_stat_t> stats;
+
+ i = opts.find("pool");
+ if (i != opts.end()) {
+ pool_name = i->second.c_str();
+ }
+ i = opts.find("chunk-algorithm");
+ if (i != opts.end()) {
+ chunk_algo = i->second.c_str();
+ if (!CDC::create(chunk_algo, 12)) {
+ cerr << "unrecognized chunk-algorithm " << chunk_algo << std::endl;
+ exit(1);
+ }
+ } else {
+ cerr << "must specify chunk-algorithm" << std::endl;
+ exit(1);
+ }
+
+ i = opts.find("fingerprint-algorithm");
+ if (i != opts.end()) {
+ fp_algo = i->second.c_str();
+ if (fp_algo != "sha1"
+ && fp_algo != "sha256" && fp_algo != "sha512") {
+ cerr << "unrecognized fingerprint-algorithm " << fp_algo << std::endl;
+ exit(1);
+ }
+ }
+
+ i = opts.find("chunk-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &chunk_size)) {
+ return -EINVAL;
+ }
+ }
+
+ i = opts.find("min-chunk-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &min_chunk_size)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-chunk-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_chunk_size)) {
+ return -EINVAL;
+ }
+ }
+
+ i = opts.find("max-thread");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_thread)) {
+ return -EINVAL;
+ }
+ }
+
+ i = opts.find("report-period");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &report_period)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-seconds");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_seconds)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-read-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_read_size)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("debug");
+ if (i != opts.end()) {
+ debug = true;
+ }
+
+ i = opts.find("pgid");
+ boost::optional<pg_t> pgid(i != opts.end(), pg_t());
+
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ ret = -1;
+ goto out;
+ }
+ if (pool_name.empty()) {
+ cerr << "--create-pool requested but pool_name was not specified!" << std::endl;
+ exit(1);
+ }
+ ret = rados.ioctx_create(pool_name.c_str(), io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ // set up chunkers
+ if (chunk_size) {
+ dedup_estimates.emplace(std::piecewise_construct,
+ std::forward_as_tuple(chunk_size),
+ std::forward_as_tuple(chunk_algo, cbits(chunk_size)-1));
+ } else {
+ for (size_t cs = min_chunk_size; cs <= max_chunk_size; cs *= 2) {
+ dedup_estimates.emplace(std::piecewise_construct,
+ std::forward_as_tuple(cs),
+ std::forward_as_tuple(chunk_algo, cbits(cs)-1));
+ }
+ }
+
+ glock.lock();
+ begin = io_ctx.object_list_begin();
+ end = io_ctx.object_list_end();
+ pool_names.push_back(pool_name);
+ ret = rados.get_pool_stats(pool_names, stats);
+ if (ret < 0) {
+ cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl;
+ glock.unlock();
+ return ret;
+ }
+ if (stats.find(pool_name) == stats.end()) {
+ cerr << "stats can not find pool name: " << pool_name << std::endl;
+ glock.unlock();
+ return ret;
+ }
+ s = stats[pool_name];
+
+ for (unsigned i = 0; i < max_thread; i++) {
+ std::unique_ptr<CrawlerThread> ptr (
+ new EstimateDedupRatio(io_ctx, i, max_thread, begin, end,
+ chunk_algo, fp_algo, chunk_size,
+ report_period, s.num_objects, max_read_size,
+ max_seconds));
+ ptr->create("estimate_thread");
+ ptr->set_debug(debug);
+ estimate_threads.push_back(move(ptr));
+ }
+ glock.unlock();
+
+ for (auto &p : estimate_threads) {
+ p->join();
+ }
+
+ print_dedup_estimate(cout, chunk_algo);
+
+ out:
+ return (ret < 0) ? 1 : 0;
+}
+
+static void print_chunk_scrub()
+{
+ uint64_t total_objects = 0;
+ uint64_t examined_objects = 0;
+ int damaged_objects = 0;
+
+ for (auto &et : estimate_threads) {
+ if (!total_objects) {
+ total_objects = et->get_total_objects();
+ }
+ examined_objects += et->get_examined_objects();
+ ChunkScrub *ptr = static_cast<ChunkScrub*>(et.get());
+ damaged_objects += ptr->get_damaged_objects();
+ }
+
+ cout << " Total object : " << total_objects << std::endl;
+ cout << " Examined object : " << examined_objects << std::endl;
+ cout << " Damaged object : " << damaged_objects << std::endl;
+}
+
+int chunk_scrub_common(const std::map < std::string, std::string > &opts,
+ std::vector<const char*> &nargs)
+{
+ Rados rados;
+ IoCtx io_ctx, chunk_io_ctx;
+ std::string object_name, target_object_name;
+ string chunk_pool_name, op_name;
+ int ret;
+ unsigned max_thread = default_max_thread;
+ std::map<std::string, std::string>::const_iterator i;
+ uint32_t report_period = default_report_period;
+ ObjectCursor begin;
+ ObjectCursor end;
+ librados::pool_stat_t s;
+ list<string> pool_names;
+ map<string, librados::pool_stat_t> stats;
+
+ i = opts.find("op_name");
+ if (i != opts.end()) {
+ op_name= i->second.c_str();
+ } else {
+ cerr << "must specify op" << std::endl;
+ exit(1);
+ }
+
+ i = opts.find("chunk-pool");
+ if (i != opts.end()) {
+ chunk_pool_name = i->second.c_str();
+ } else {
+ cerr << "must specify --chunk-pool" << std::endl;
+ exit(1);
+ }
+ i = opts.find("max-thread");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_thread)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("report-period");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &report_period)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("pgid");
+ boost::optional<pg_t> pgid(i != opts.end(), pg_t());
+
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ ret = -1;
+ goto out;
+ }
+ ret = rados.ioctx_create(chunk_pool_name.c_str(), chunk_io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << chunk_pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ if (op_name == "chunk-get-ref" ||
+ op_name == "chunk-put-ref") {
+ string target_object_name;
+ uint64_t pool_id;
+ i = opts.find("object");
+ if (i != opts.end()) {
+ object_name = i->second.c_str();
+ } else {
+ cerr << "must specify object" << std::endl;
+ exit(1);
+ }
+ i = opts.find("target-ref");
+ if (i != opts.end()) {
+ target_object_name = i->second.c_str();
+ } else {
+ cerr << "must specify target ref" << std::endl;
+ exit(1);
+ }
+ i = opts.find("target-ref-pool-id");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &pool_id)) {
+ return -EINVAL;
+ }
+ } else {
+ cerr << "must specify target-ref-pool-id" << std::endl;
+ exit(1);
+ }
+
+ uint32_t hash;
+ ret = chunk_io_ctx.get_object_hash_position2(object_name, &hash);
+ if (ret < 0) {
+ return ret;
+ }
+ hobject_t oid(sobject_t(target_object_name, CEPH_NOSNAP), "", hash, pool_id, "");
+
+ ObjectWriteOperation op;
+ if (op_name == "chunk-get-ref") {
+ cls_cas_chunk_get_ref(op, oid);
+ } else {
+ cls_cas_chunk_put_ref(op, oid);
+ }
+ ret = chunk_io_ctx.operate(object_name, &op);
+ if (ret < 0) {
+ cerr << " operate fail : " << cpp_strerror(ret) << std::endl;
+ }
+
+ return ret;
+
+ } else if (op_name == "dump-chunk-refs") {
+ i = opts.find("object");
+ if (i != opts.end()) {
+ object_name = i->second.c_str();
+ } else {
+ cerr << "must specify object" << std::endl;
+ exit(1);
+ }
+ bufferlist t;
+ ret = chunk_io_ctx.getxattr(object_name, CHUNK_REFCOUNT_ATTR, t);
+ if (ret < 0) {
+ return ret;
+ }
+ chunk_refs_t refs;
+ auto p = t.cbegin();
+ decode(refs, p);
+ auto f = Formatter::create("json-pretty");
+ f->dump_object("refs", refs);
+ f->flush(cout);
+ return 0;
+ }
+
+ glock.lock();
+ begin = chunk_io_ctx.object_list_begin();
+ end = chunk_io_ctx.object_list_end();
+ pool_names.push_back(chunk_pool_name);
+ ret = rados.get_pool_stats(pool_names, stats);
+ if (ret < 0) {
+ cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl;
+ glock.unlock();
+ return ret;
+ }
+ if (stats.find(chunk_pool_name) == stats.end()) {
+ cerr << "stats can not find pool name: " << chunk_pool_name << std::endl;
+ glock.unlock();
+ return ret;
+ }
+ s = stats[chunk_pool_name];
+
+ for (unsigned i = 0; i < max_thread; i++) {
+ std::unique_ptr<CrawlerThread> ptr (
+ new ChunkScrub(io_ctx, i, max_thread, begin, end, chunk_io_ctx,
+ report_period, s.num_objects));
+ ptr->create("estimate_thread");
+ estimate_threads.push_back(move(ptr));
+ }
+ glock.unlock();
+
+ for (auto &p : estimate_threads) {
+ cout << "join " << std::endl;
+ p->join();
+ cout << "joined " << std::endl;
+ }
+
+ print_chunk_scrub();
+
+out:
+ return (ret < 0) ? 1 : 0;
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ std::string fn;
+ string op_name;
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+ init_async_signal_handler();
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+ std::map < std::string, std::string > opts;
+ std::string val;
+ std::vector<const char*>::iterator i;
+ for (i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "--op", (char*)NULL)) {
+ opts["op_name"] = val;
+ op_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--pool", (char*)NULL)) {
+ opts["pool"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--object", (char*)NULL)) {
+ opts["object"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--chunk-algorithm", (char*)NULL)) {
+ opts["chunk-algorithm"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--chunk-size", (char*)NULL)) {
+ opts["chunk-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--fingerprint-algorithm", (char*)NULL)) {
+ opts["fingerprint-algorithm"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--chunk-pool", (char*)NULL)) {
+ opts["chunk-pool"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-ref", (char*)NULL)) {
+ opts["target-ref"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-ref-pool-id", (char*)NULL)) {
+ opts["target-ref-pool-id"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-thread", (char*)NULL)) {
+ opts["max-thread"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--report-period", (char*)NULL)) {
+ opts["report-period"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-read-size", (char*)NULL)) {
+ opts["max-seconds"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-seconds", (char*)NULL)) {
+ opts["max-seconds"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--min-chunk-size", (char*)NULL)) {
+ opts["min-chunk-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-chunk-size", (char*)NULL)) {
+ opts["max-chunk-size"] = val;
+ } else if (ceph_argparse_flag(args, i, "--debug", (char*)NULL)) {
+ opts["debug"] = "true";
+ } else {
+ if (val[0] == '-') {
+ cerr << "unrecognized option " << val << std::endl;
+ exit(1);
+ }
+ ++i;
+ }
+ }
+
+ if (op_name == "estimate") {
+ return estimate_dedup_ratio(opts, args);
+ } else if (op_name == "chunk-scrub") {
+ return chunk_scrub_common(opts, args);
+ } else if (op_name == "chunk-get-ref" ||
+ op_name == "chunk-put-ref") {
+ return chunk_scrub_common(opts, args);
+ } else if (op_name == "dump-chunk-refs") {
+ return chunk_scrub_common(opts, args);
+ } else {
+ cerr << "unrecognized op " << op_name << std::endl;
+ exit(1);
+ }
+
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ return 0;
+}
diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc
new file mode 100644
index 000000000..a50666850
--- /dev/null
+++ b/src/tools/ceph_kvstore_tool.cc
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <map>
+#include <set>
+#include <string>
+#include <fstream>
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/url_escape.h"
+
+#include "global/global_context.h"
+#include "global/global_init.h"
+
+#include "kvstore_tool.h"
+
/*
 * Print the command-line synopsis for ceph-kvstore-tool to stdout.
 * 'pname' is the program name to show in the usage line.
 */
void usage(const char *pname)
{
  static const char *command_help =
    "\n"
    "Commands:\n"
    " list [prefix]\n"
    " list-crc [prefix]\n"
    " dump [prefix]\n"
    " exists <prefix> [key]\n"
    " get <prefix> <key> [out <file>]\n"
    " crc <prefix> <key>\n"
    " get-size [<prefix> <key>]\n"
    " set <prefix> <key> [ver <N>|in <file>]\n"
    " rm <prefix> <key>\n"
    " rm-prefix <prefix>\n"
    " store-copy <path> [num-keys-per-tx] [leveldb|rocksdb|...] \n"
    " store-crc <path>\n"
    " compact\n"
    " compact-prefix <prefix>\n"
    " compact-range <prefix> <start> <end>\n"
    " destructive-repair (use only as last resort! may corrupt healthy data)\n"
    " stats\n";
  std::cout << "Usage: " << pname
            << " <leveldb|rocksdb|bluestore-kv> <store path> command [args...]\n"
            << command_help
            << std::endl;
}
+
+int main(int argc, const char *argv[])
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage(argv[0]);
+ exit(0);
+ }
+
+ map<string,string> defaults = {
+ { "debug_rocksdb", "2" }
+ };
+
+ auto cct = global_init(
+ &defaults, args,
+ CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ ceph_assert((int)args.size() < argc);
+ for(size_t i=0; i<args.size(); i++)
+ argv[i+1] = args[i];
+ argc = args.size() + 1;
+
+ if (args.size() < 3) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ string type(args[0]);
+ string path(args[1]);
+ string cmd(args[2]);
+
+ if (type != "leveldb" &&
+ type != "rocksdb" &&
+ type != "bluestore-kv") {
+
+ std::cerr << "Unrecognized type: " << args[0] << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool to_repair = (cmd == "destructive-repair");
+ bool need_stats = (cmd == "stats");
+ StoreTool st(type, path, to_repair, need_stats);
+
+ if (cmd == "destructive-repair") {
+ int ret = st.destructive_repair();
+ if (!ret) {
+ std::cout << "destructive-repair completed without reporting an error"
+ << std::endl;
+ } else {
+ std::cout << "destructive-repair failed with " << cpp_strerror(ret)
+ << std::endl;
+ }
+ return ret;
+ } else if (cmd == "list" || cmd == "list-crc") {
+ string prefix;
+ if (argc > 4)
+ prefix = url_unescape(argv[4]);
+
+ bool do_crc = (cmd == "list-crc");
+ st.list(prefix, do_crc, false);
+
+ } else if (cmd == "dump") {
+ string prefix;
+ if (argc > 4)
+ prefix = url_unescape(argv[4]);
+ st.list(prefix, false, true);
+
+ } else if (cmd == "exists") {
+ string key;
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ if (argc > 5)
+ key = url_unescape(argv[5]);
+
+ bool ret = st.exists(prefix, key);
+ std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") "
+ << (ret ? "exists" : "does not exist")
+ << std::endl;
+ return (ret ? 0 : 1);
+
+ } else if (cmd == "get") {
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ")";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << std::endl;
+
+ if (argc >= 7) {
+ string subcmd(argv[6]);
+ if (subcmd != "out") {
+ std::cerr << "unrecognized subcmd '" << subcmd << "'"
+ << std::endl;
+ return 1;
+ }
+ if (argc < 8) {
+ std::cerr << "output path not specified" << std::endl;
+ return 1;
+ }
+ string out(argv[7]);
+
+ if (out.empty()) {
+ std::cerr << "unspecified out file" << std::endl;
+ return 1;
+ }
+
+ int err = bl.write_file(argv[7], 0644);
+ if (err < 0) {
+ std::cerr << "error writing value to '" << out << "': "
+ << cpp_strerror(err) << std::endl;
+ return 1;
+ }
+ } else {
+ ostringstream os;
+ bl.hexdump(os);
+ std::cout << os.str() << std::endl;
+ }
+
+ } else if (cmd == "crc") {
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") ";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << " crc " << bl.crc32c(0) << std::endl;
+
+ } else if (cmd == "get-size") {
+ std::cout << "estimated store size: " << st.get_size() << std::endl;
+
+ if (argc < 5)
+ return 0;
+
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ if (!exists) {
+ std::cerr << "(" << url_escape(prefix) << "," << url_escape(key)
+ << ") does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << "(" << url_escape(prefix) << "," << url_escape(key)
+ << ") size " << byte_u_t(bl.length()) << std::endl;
+
+ } else if (cmd == "set") {
+ if (argc < 8) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+ string subcmd(argv[6]);
+
+ bufferlist val;
+ string errstr;
+ if (subcmd == "ver") {
+ version_t v = (version_t) strict_strtoll(argv[7], 10, &errstr);
+ if (!errstr.empty()) {
+ std::cerr << "error reading version: " << errstr << std::endl;
+ return 1;
+ }
+ encode(v, val);
+ } else if (subcmd == "in") {
+ int ret = val.read_file(argv[7], &errstr);
+ if (ret < 0 || !errstr.empty()) {
+ std::cerr << "error reading file: " << errstr << std::endl;
+ return 1;
+ }
+ } else {
+ std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool ret = st.set(prefix, key, val);
+ if (!ret) {
+ std::cerr << "error setting ("
+ << url_escape(prefix) << "," << url_escape(key) << ")" << std::endl;
+ return 1;
+ }
+ } else if (cmd == "rm") {
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool ret = st.rm(prefix, key);
+ if (!ret) {
+ std::cerr << "error removing ("
+ << url_escape(prefix) << "," << url_escape(key) << ")"
+ << std::endl;
+ return 1;
+ }
+ } else if (cmd == "rm-prefix") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+
+ bool ret = st.rm_prefix(prefix);
+ if (!ret) {
+ std::cerr << "error removing prefix ("
+ << url_escape(prefix) << ")"
+ << std::endl;
+ return 1;
+ }
+ } else if (cmd == "store-copy") {
+ int num_keys_per_tx = 128; // magic number that just feels right.
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ } else if (argc > 5) {
+ string err;
+ num_keys_per_tx = strict_strtol(argv[5], 10, &err);
+ if (!err.empty()) {
+ std::cerr << "invalid num_keys_per_tx: " << err << std::endl;
+ return 1;
+ }
+ }
+ string other_store_type = argv[1];
+ if (argc > 6) {
+ other_store_type = argv[6];
+ }
+
+ int ret = st.copy_store_to(argv[1], argv[4], num_keys_per_tx, other_store_type);
+ if (ret < 0) {
+ std::cerr << "error copying store to path '" << argv[4]
+ << "': " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ } else if (cmd == "store-crc") {
+ if (argc < 4) {
+ usage(argv[0]);
+ return 1;
+ }
+ std::ofstream fs(argv[4]);
+ uint32_t crc = st.traverse(string(), true, false, &fs);
+ std::cout << "store at '" << argv[4] << "' crc " << crc << std::endl;
+
+ } else if (cmd == "compact") {
+ st.compact();
+ } else if (cmd == "compact-prefix") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ st.compact_prefix(prefix);
+ } else if (cmd == "compact-range") {
+ if (argc < 7) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string start(url_unescape(argv[5]));
+ string end(url_unescape(argv[6]));
+ st.compact_range(prefix, start, end);
+ } else if (cmd == "stats") {
+ st.print_stats();
+ } else {
+ std::cerr << "Unrecognized command: " << cmd << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc
new file mode 100644
index 000000000..6614c9b88
--- /dev/null
+++ b/src/tools/ceph_monstore_tool.cc
@@ -0,0 +1,1335 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/scope_exit.hpp>
+
+#include <stdlib.h>
+#include <string>
+
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "auth/KeyRing.h"
+#include "auth/cephx/CephxKeyServer.h"
+#include "global/global_init.h"
+#include "include/stringify.h"
+#include "mgr/mgr_commands.h"
+#include "mon/AuthMonitor.h"
+#include "mon/MonitorDBStore.h"
+#include "mon/Paxos.h"
+#include "mon/MonMap.h"
+#include "mds/FSMap.h"
+#include "mon/MgrMap.h"
+#include "osd/OSDMap.h"
+#include "crush/CrushCompiler.h"
+#include "mon/CreatingPGs.h"
+
+namespace po = boost::program_options;
+
// Sequentially decodes MonitorDBStore transactions from a trace file.
// Each on-disk record is framed as: u8 version, u8 version2, u32 length,
// followed by 'length' bytes holding one encoded Transaction.
// Usage: construct, init(), then cur()/next() while valid().
class TraceIter {
  int fd;        // trace file descriptor; set to -1 on EOF or any read error
  unsigned idx;  // index of the current record; initialized to (unsigned)-1
                 // on purpose so the first next() wraps it to 0
  MonitorDBStore::TransactionRef t;
public:
  explicit TraceIter(string fname) : fd(-1), idx(-1) {
    fd = ::open(fname.c_str(), O_RDONLY|O_BINARY);
    t.reset(new MonitorDBStore::Transaction);
  }
  // True while the file is open and no read error/EOF has occurred.
  bool valid() {
    return fd != -1;
  }
  // The most recently decoded transaction; only meaningful while valid().
  MonitorDBStore::TransactionRef cur() {
    ceph_assert(valid());
    return t;
  }
  unsigned num() { return idx; }
  // Read and decode the next record.  On error or short read the fd is
  // closed and the iterator becomes permanently invalid.
  void next() {
    ++idx;
    bufferlist bl;
    int r = bl.read_fd(fd, 6);  // 6 = two version bytes + u32 payload length
    if (r < 0) {
      std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
		<< std::endl;
      ::close(fd);
      fd = -1;
      return;
    } else if ((unsigned)r < 6) {
      std::cerr << "short read" << std::endl;
      ::close(fd);
      fd = -1;
      return;
    }
    auto bliter = bl.cbegin();
    uint8_t ver, ver2;
    decode(ver, bliter);
    decode(ver2, bliter);
    uint32_t len;
    decode(len, bliter);
    r = bl.read_fd(fd, len);
    if (r < 0) {
      std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
		<< std::endl;
      ::close(fd);
      fd = -1;
      return;
    } else if ((unsigned)r < len) {
      std::cerr << "short read" << std::endl;
      ::close(fd);
      fd = -1;
      return;
    }
    // NOTE(review): the payload is appended after the 6 header bytes in the
    // same bufferlist, yet decoding restarts at cbegin() -- presumably
    // Transaction::decode skips/absorbs the header bytes; confirm.
    bliter = bl.cbegin();
    t.reset(new MonitorDBStore::Transaction);
    t->decode(bliter);
  }
  // Prime the iterator by loading the first record.
  void init() {
    next();
  }
  ~TraceIter() {
    if (fd != -1) {
      ::close(fd);
      fd = -1;
    }
  }
};
+
+
+int parse_cmd_args(
+ po::options_description *desc, /// < visible options description
+ po::options_description *hidden_desc, /// < hidden options description
+ po::positional_options_description *positional, /// < positional args
+ vector<string> &cmd_args, /// < arguments to be parsed
+ po::variables_map *vm /// > post-parsing variable map
+ )
+{
+ // desc_all will aggregate all visible and hidden options for parsing.
+ //
+ // From boost's program_options point of view, there is absolutely no
+ // distinction between 'desc' and 'hidden_desc'. This is a distinction
+ // that is only useful to us: 'desc' is whatever we are willing to show
+ // on 'usage()', whereas 'hidden_desc' refers to parameters we wish to
+ // take advantage of but do not wish to show on 'usage()'.
+ //
+ // For example, consider that program_options matches positional arguments
+ // (specified via 'positional') against the paramenters defined on a
+ // given 'po::options_description' class. This is performed below,
+ // supplying both the description and the positional arguments to the
+ // parser. However, we do not want the parameters that are mapped to
+ // positional arguments to be shown on usage, as that makes for ugly and
+ // confusing usage messages. Therefore we dissociate the options'
+ // description that is to be used as an aid to the user from those options
+ // that are nothing but useful for internal purposes (i.e., mapping options
+ // to positional arguments). We still need to aggregate them before parsing
+ // and that's what 'desc_all' is all about.
+ //
+
+ ceph_assert(desc != NULL);
+
+ po::options_description desc_all;
+ desc_all.add(*desc);
+ if (hidden_desc != NULL)
+ desc_all.add(*hidden_desc);
+
+ try {
+ po::command_line_parser parser = po::command_line_parser(cmd_args).
+ options(desc_all);
+
+ if (positional) {
+ parser = parser.positional(*positional);
+ }
+
+ po::parsed_options parsed = parser.run();
+ po::store(parsed, *vm);
+ po::notify(*vm);
+ } catch (po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+
+/**
+ * usage: ceph-monstore-tool <store-path> <command> [options]
+ *
+ * commands:
+ *
+ * store-copy < --out arg >
+ * dump-keys
+ * compact
+ * getmonmap < --out arg [ --version arg ] >
+ * getosdmap < --out arg [ --version arg ] >
+ * dump-paxos <--dump-start VER> <--dump-end VER>
+ * dump-trace < --trace-file arg >
+ * replay-trace
+ * random-gen
+ * rewrite-crush
+ *
+ * wanted syntax:
+ *
+ * ceph-monstore-tool PATH CMD [options]
+ *
+ * ceph-monstore-tool PATH store-copy <PATH2 | -o PATH2>
+ * ceph-monstore-tool PATH dump-keys
+ * ceph-monstore-tool PATH compact
+ * ceph-monstore-tool PATH get monmap [VER]
+ * ceph-monstore-tool PATH get osdmap [VER]
+ * ceph-monstore-tool PATH dump-paxos STARTVER ENDVER
+ *
+ *
+ */
+void usage(const char *n, po::options_description &d)
+{
+ std::cerr <<
+ "usage: " << n << " <store-path> <cmd> [args|options]\n"
+ << "\n"
+ << "Commands:\n"
+ << " store-copy PATH copies store to PATH\n"
+ << " compact compacts the store\n"
+ << " get monmap [-- options] get monmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get osdmap [-- options] get osdmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get mdsmap [-- options] get mdsmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get mgr [-- options] get mgr map (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get crushmap [-- options] get crushmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " show-versions [-- options] show the first&last committed version of map\n"
+ << " (show-versions -- --help for more info)\n"
+ << " dump-keys dumps store keys to FILE\n"
+ << " (default: stdout)\n"
+ << " dump-paxos [-- options] dump paxos transactions\n"
+ << " (dump-paxos -- --help for more info)\n"
+ << " dump-trace FILE [-- options] dump contents of trace file FILE\n"
+ << " (dump-trace -- --help for more info)\n"
+ << " replay-trace FILE [-- options] replay trace from FILE\n"
+ << " (replay-trace -- --help for more info)\n"
+ << " random-gen [-- options] add randomly generated ops to the store\n"
+ << " (random-gen -- --help for more info)\n"
+ << " rewrite-crush [-- options] add a rewrite commit to the store\n"
+ << " (rewrite-crush -- --help for more info)\n"
+ << " rebuild rebuild store\n"
+ << " (rebuild -- --help for more info)\n"
+ << std::endl;
+ std::cerr << d << std::endl;
+ std::cerr
+ << "\nPlease Note:\n"
+ << "* Ceph-specific options should be in the format --option-name=VAL\n"
+ << " (specifically, do not forget the '='!!)\n"
+ << "* Command-specific options need to be passed after a '--'\n"
+ << " e.g., 'get monmap -- --version 10 --out /tmp/foo'"
+ << std::endl;
+}
+
+int update_osdmap(MonitorDBStore& store, version_t ver, bool copy,
+ std::shared_ptr<CrushWrapper> crush,
+ MonitorDBStore::Transaction* t) {
+ const string prefix("osdmap");
+
+ // full
+ bufferlist bl;
+ int r = 0;
+ r = store.get(prefix, store.combine_strings("full", ver), bl);
+ if (r) {
+ std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ osdmap.crush = crush;
+ if (copy) {
+ osdmap.inc_epoch();
+ }
+ bl.clear();
+ // be consistent with OSDMonitor::update_from_paxos()
+ osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+ t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl);
+
+ // incremental
+ OSDMap::Incremental inc;
+ if (copy) {
+ inc.epoch = osdmap.get_epoch();
+ inc.fsid = osdmap.get_fsid();
+ } else {
+ bl.clear();
+ r = store.get(prefix, ver, bl);
+ if (r) {
+ std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ OSDMap::Incremental inc(bl);
+ if (inc.crush.length()) {
+ inc.crush.clear();
+ crush->encode(inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+ if (inc.fullmap.length()) {
+ OSDMap fullmap;
+ fullmap.decode(inc.fullmap);
+ fullmap.crush = crush;
+ inc.fullmap.clear();
+ fullmap.encode(inc.fullmap);
+ }
+ }
+ ceph_assert(osdmap.have_crc());
+ inc.full_crc = osdmap.get_crc();
+ bl.clear();
+ // be consistent with OSDMonitor::update_from_paxos()
+ inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+ t->put(prefix, inc.epoch, bl);
+ return 0;
+}
+
+int rewrite_transaction(MonitorDBStore& store, int version,
+ const string& crush_file,
+ MonitorDBStore::Transaction* t) {
+ const string prefix("osdmap");
+
+ // calc the known-good epoch
+ version_t last_committed = store.get(prefix, "last_committed");
+ version_t good_version = 0;
+ if (version <= 0) {
+ if (last_committed >= (unsigned)-version) {
+ good_version = last_committed + version;
+ } else {
+ std::cerr << "osdmap-version is less than: -" << last_committed << std::endl;
+ return EINVAL;
+ }
+ } else {
+ good_version = version;
+ }
+ if (good_version >= last_committed) {
+ std::cout << "good epoch is greater or equal to the last committed one: "
+ << good_version << " >= " << last_committed << std::endl;
+ return 0;
+ }
+
+ // load/extract the crush map
+ int r = 0;
+ std::shared_ptr<CrushWrapper> crush(new CrushWrapper);
+ if (crush_file.empty()) {
+ bufferlist bl;
+ r = store.get(prefix, store.combine_strings("full", good_version), bl);
+ if (r) {
+ std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ crush = osdmap.crush;
+ } else {
+ string err;
+ bufferlist bl;
+ r = bl.read_file(crush_file.c_str(), &err);
+ if (r) {
+ std::cerr << err << ": " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ auto p = bl.cbegin();
+ crush->decode(p);
+ }
+
+ // prepare a transaction to rewrite the epochs
+ // (good_version, last_committed]
+ // with the good crush map.
+ // XXX: may need to break this into several paxos versions?
+ ceph_assert(good_version < last_committed);
+ for (version_t v = good_version + 1; v <= last_committed; v++) {
+ cout << "rewriting epoch #" << v << "/" << last_committed << std::endl;
+ r = update_osdmap(store, v, false, crush, t);
+ if (r)
+ return r;
+ }
+
+ // add a new osdmap epoch to store, so monitors will update their current osdmap
+ // in addition to the ones stored in epochs.
+ //
+ // This is needed due to the way the monitor updates from paxos and the
+ // facilities we are leveraging to push this update to the rest of the
+ // quorum.
+ //
+ // In a nutshell, we are generating a good version of the osdmap, with a
+ // proper crush, and building a transaction that will replace the bad
+ // osdmaps with good osdmaps. But this transaction needs to be applied on
+ // all nodes, so that the monitors will have good osdmaps to share with
+ // clients. We thus leverage Paxos, specifically the recovery mechanism, by
+ // creating a pending value that will be committed once the monitors form an
+ // initial quorum after being brought back to life.
+ //
+ // However, the way the monitor works has the paxos services, including the
+ // OSDMonitor, updating their state from disk *prior* to the recovery phase
+ // begins (so they have an up to date state in memory). This means the
+ // OSDMonitor will see the old, broken map, before the new paxos version is
+ // applied to disk, and the old version is cached. Even though we have the
+ // good map now, and we share the good map with clients, we will still be
+ // working on the old broken map. Instead of mucking around the monitor to
+ // make this work, we instead opt for adding the same osdmap but with a
+ // newer version, so that the OSDMonitor picks up on it when it updates from
+ // paxos after the proposal has been committed. This is not elegant, but
+ // avoids further unpleasantness that would arise from kludging around the
+ // current behavior. Also, has the added benefit of making sure the clients
+ // get an updated version of the map (because last_committed+1 >
+ // last_committed) :)
+ //
+ cout << "adding a new epoch #" << last_committed+1 << std::endl;
+ r = update_osdmap(store, last_committed++, true, crush, t);
+ if (r)
+ return r;
+ t->put(prefix, store.combine_strings("full", "latest"), last_committed);
+ t->put(prefix, "last_committed", last_committed);
+ return 0;
+}
+
+/**
+ * create a new paxos version which carries a proposal to rewrite all epochs
+ * of incremental and full map of "osdmap" after a faulty crush map is injected.
+ * so the leader will trigger a recovery and propagate this fix to its peons,
+ * after the proposal is accepted, and the transaction in it is applied. all
+ * monitors will rewrite the bad crush map with the good one, and have a new
+ * osdmap epoch with the good crush map in it.
+ */
+int rewrite_crush(const char* progname,
+ vector<string>& subcmds,
+ MonitorDBStore& store) {
+ po::options_description op_desc("Allowed 'rewrite-crush' options");
+ int version = -1;
+ string crush_file;
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("crush", po::value<string>(&crush_file),
+ ("path to the crush map file "
+ "(default: will instead extract it from the known-good osdmap)"))
+ ("good-epoch", po::value<int>(&version),
+ "known-good epoch of osdmap, if a negative number '-N' is given, the "
+ "$last_committed-N is used instead (default: -1). "
+ "Please note, -1 is not necessarily a good epoch, because there are "
+ "good chance that we have more epochs slipped into the monstore after "
+ "the one where the crushmap is firstly injected.")
+ ;
+ po::variables_map op_vm;
+ // NOTE(review): presumably parse_cmd_args() returns a negative errno on
+ // failure, hence the sign flip below -- confirm against its definition.
+ int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm);
+ if (r) {
+ return -r;
+ }
+ if (op_vm.count("help")) {
+ usage(progname, op_desc);
+ return 0;
+ }
+
+ // build the transaction that replaces the bad epochs with good ones
+ MonitorDBStore::Transaction rewrite_txn;
+ r = rewrite_transaction(store, version, crush_file, &rewrite_txn);
+ if (r) {
+ return r;
+ }
+
+ // store the transaction into store as a proposal
+ const string prefix("paxos");
+ version_t pending_v = store.get(prefix, "last_committed") + 1;
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ bufferlist bl;
+ rewrite_txn.encode(bl);
+ cout << "adding pending commit " << pending_v
+ << " " << bl.length() << " bytes" << std::endl;
+ t->put(prefix, pending_v, bl);
+ t->put(prefix, "pending_v", pending_v);
+ // a large enough yet unique proposal number will probably do the trick
+ version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1;
+ t->put(prefix, "pending_pn", pending_pn);
+ store.apply_transaction(t);
+ return 0;
+}
+
+// Rebuild the "auth" service from a keyring file: encode one
+// AuthMonitor::Incremental per imported key, plus one incremental carrying
+// freshly generated rotating service keys, into a single bufferlist, and
+// commit it as the next committed "auth" version.
+// Returns 0 on success, negative errno on failure.
+static int update_auth(MonitorDBStore& st, const string& keyring_path)
+{
+ // import all keyrings stored in the keyring file
+ KeyRing keyring;
+ int r = keyring.load(g_ceph_context, keyring_path);
+ if (r < 0) {
+ cerr << "unable to load admin keyring: " << keyring_path << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ // leading format-version byte for the encoded payload
+ // (NOTE(review): presumably matches AuthMonitor's on-disk encoding -- confirm)
+ __u8 v = 1;
+ encode(v, bl);
+
+ for (const auto& k : keyring.get_keys()) {
+ KeyServerData::Incremental auth_inc;
+ auth_inc.name = k.first;
+ auth_inc.auth = k.second;
+ // refuse cap-less entities: such a key would be useless and likely
+ // indicates a truncated or wrong keyring
+ if (auth_inc.auth.caps.empty()) {
+ cerr << "no caps granted to: " << auth_inc.name << std::endl;
+ return -EINVAL;
+ }
+ // decode each cap blob into a printable string, for logging only
+ map<string,string> caps;
+ std::transform(begin(auth_inc.auth.caps), end(auth_inc.auth.caps),
+ inserter(caps, end(caps)),
+ [](auto& cap) {
+ string c;
+ auto p = cap.second.cbegin();
+ decode(c, p);
+ return make_pair(cap.first, c);
+ });
+ cout << "adding auth for '"
+ << auth_inc.name << "': " << auth_inc.auth
+ << " with caps(" << caps << ")" << std::endl;
+ auth_inc.op = KeyServerData::AUTH_INC_ADD;
+
+ AuthMonitor::Incremental inc;
+ inc.inc_type = AuthMonitor::AUTH_DATA;
+ encode(auth_inc, inc.auth_data);
+ inc.auth_type = CEPH_AUTH_CEPHX;
+ // append to the shared bufferlist: all keys land in one auth version
+ inc.encode(bl, CEPH_FEATURES_ALL);
+ }
+
+ // prime rotating secrets
+ {
+ KeyServer ks(g_ceph_context, nullptr);
+ KeyServerData::Incremental auth_inc;
+ auth_inc.op = KeyServerData::AUTH_INC_SET_ROTATING;
+ bool r = ks.prepare_rotating_update(auth_inc.rotating_bl);
+ ceph_assert(r);
+ AuthMonitor::Incremental inc;
+ inc.inc_type = AuthMonitor::AUTH_DATA;
+ encode(auth_inc, inc.auth_data);
+ inc.auth_type = CEPH_AUTH_CEPHX;
+ inc.encode(bl, CEPH_FEATURES_ALL);
+ }
+
+ // commit everything as the next "auth" version, seeding first_committed
+ // if the service had no history at all
+ const string prefix("auth");
+ auto last_committed = st.get(prefix, "last_committed") + 1;
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, last_committed, bl);
+ t->put(prefix, "last_committed", last_committed);
+ auto first_committed = st.get(prefix, "first_committed");
+ if (!first_committed) {
+ t->put(prefix, "first_committed", last_committed);
+ }
+ st.apply_transaction(t);
+ return 0;
+}
+
+// Write the initial monmap into the "mkfs" prefix, either decoded from
+// @monmap_path or generated from the configured mon_host list. When
+// generating, auto-named "noname-*" monitors are renamed to the supplied
+// @mon_ids (or 'a', 'b', ... when none are given).
+// Returns 0 on success, negative errno on failure.
+static int update_mkfs(MonitorDBStore& st,
+ const string& monmap_path,
+ const vector<string>& mon_ids)
+{
+ MonMap monmap;
+ if (!monmap_path.empty()) {
+ cout << __func__ << " pulling initial monmap from " << monmap_path << std::endl;
+ bufferlist bl;
+ string err;
+ int r = bl.read_file(monmap_path.c_str(), &err);
+ if (r < 0) {
+ cerr << "failed to read monmap from " << monmap_path << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ monmap.decode(bl);
+ } else {
+ cout << __func__ << " generating seed initial monmap" << std::endl;
+ // derive the seed monmap from the mon_host configuration
+ int r = monmap.build_initial(g_ceph_context, true, cerr);
+ if (r) {
+ cerr << "no initial monitors" << std::endl;
+ return -EINVAL;
+ }
+ vector<string> new_names;
+ if (!mon_ids.empty()) {
+ // caller-provided ids must map 1:1 onto the discovered mon hosts
+ if (mon_ids.size() != monmap.size()) {
+ cerr << "Please pass the same number of <mon-ids> to name the hosts "
+ << "listed in 'mon_host'. "
+ << mon_ids.size() << " mon-id(s) specified, "
+ << "while you have " << monmap.size() << " mon hosts." << std::endl;
+ return -EINVAL;
+ }
+ new_names = mon_ids;
+ } else {
+ // default names: 'a' for rank 0, 'b' for rank 1, ...
+ for (unsigned rank = 0; rank < monmap.size(); rank++) {
+ string new_name{"a"};
+ new_name[0] += rank;
+ new_names.push_back(std::move(new_name));
+ }
+ }
+ // only rename the placeholder names produced by build_initial()
+ for (unsigned rank = 0; rank < monmap.size(); rank++) {
+ auto name = monmap.get_name(rank);
+ if (name.compare(0, 7, "noname-") == 0) {
+ monmap.rename(name, new_names[rank]);
+ }
+ }
+ }
+ monmap.print(cout);
+ bufferlist bl;
+ // NOTE(review): the map is encoded *before* set_epoch(0), so the stored
+ // "mkfs" monmap keeps whatever epoch it had above -- confirm intended.
+ monmap.encode(bl, CEPH_FEATURES_ALL);
+ monmap.set_epoch(0);
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put("mkfs", "monmap", bl);
+ st.apply_transaction(t);
+ return 0;
+}
+
+// Write the on-disk magic under the "monitor" prefix so the rebuilt store is
+// recognized as a valid monitor store. Always returns 0.
+static int update_monitor(MonitorDBStore& st)
+{
+ const string prefix("monitor");
+ // a stripped-down Monitor::mkfs()
+ bufferlist bl;
+ bl.append(CEPH_MON_ONDISK_MAGIC "\n");
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, "magic", bl);
+ st.apply_transaction(t);
+ return 0;
+}
+
+// rebuild
+// - creating_pgs
+// Mark every pool present in the latest full osdmap as already created, so a
+// rebuilt monitor does not try to re-create PGs for existing pools.
+// Returns 0 on success, negative errno if the osdmap cannot be loaded.
+static int update_creating_pgs(MonitorDBStore& st)
+{
+ bufferlist bl;
+ auto last_osdmap_epoch = st.get("osdmap", "last_committed");
+ int r = st.get("osdmap", st.combine_strings("full", last_osdmap_epoch), bl);
+ if (r < 0) {
+ cerr << "unable to load osdmap e" << last_osdmap_epoch << std::endl;
+ return r;
+ }
+
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ creating_pgs_t creating;
+ for (auto& i : osdmap.get_pools()) {
+ creating.created_pools.insert(i.first);
+ }
+ // record the epoch we scanned so the monitor resumes from there
+ creating.last_scan_epoch = last_osdmap_epoch;
+
+ bufferlist newbl;
+ encode(creating, newbl, CEPH_FEATURES_ALL);
+
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put("osd_pg_creating", "creating", newbl);
+ st.apply_transaction(t);
+ return 0;
+}
+
+// rebuild
+// - mgr
+// - mgr_command_desc
+// Seed a fresh MgrMap (epoch 1, with the configured mgr_initial_modules
+// enabled) under "mgr", and the full mgr command description table under
+// "mgr_command_descs". Returns apply_transaction()'s result.
+static int update_mgrmap(MonitorDBStore& st)
+{
+ auto t = make_shared<MonitorDBStore::Transaction>();
+
+ {
+ MgrMap map;
+ // mgr expects epoch > 1
+ map.epoch++;
+ auto initial_modules =
+ get_str_vec(g_ceph_context->_conf.get_val<string>("mgr_initial_modules"));
+ copy(begin(initial_modules),
+ end(initial_modules),
+ inserter(map.modules, end(map.modules)));
+ bufferlist bl;
+ map.encode(bl, CEPH_FEATURES_ALL);
+ t->put("mgr", map.epoch, bl);
+ t->put("mgr", "last_committed", map.epoch);
+ }
+ {
+ // store every known mgr command, tagged with the FLAG_MGR bit
+ auto mgr_command_descs = mgr_commands;
+ for (auto& c : mgr_command_descs) {
+ c.set_flag(MonCommand::FLAG_MGR);
+ }
+ bufferlist bl;
+ encode(mgr_command_descs, bl);
+ t->put("mgr_command_descs", "", bl);
+ }
+ return st.apply_transaction(t);
+}
+
+// Rebuild the "paxos" prefix: commit a run of no-op paxos transactions
+// ending at an artificially high version, then stash a pending proposal
+// that replays every rebuilt k/v pair so peers receive them via the paxos
+// recovery path on the next election. Always returns 0.
+static int update_paxos(MonitorDBStore& st)
+{
+ const string prefix("paxos");
+ // a large enough version greater than the maximum possible `last_committed`
+ // that could be replied by the peons when the leader is collecting paxos
+ // transactions during recovery
+ constexpr version_t first_committed = 0x42;
+ constexpr version_t last_committed = first_committed;
+ for (version_t v = first_committed; v < last_committed + 1; v++) {
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ if (v == first_committed) {
+ t->put(prefix, "first_committed", v);
+ }
+ bufferlist proposal;
+ MonitorDBStore::Transaction empty_txn;
+ empty_txn.encode(proposal);
+ t->put(prefix, v, proposal);
+ t->put(prefix, "last_committed", v);
+ st.apply_transaction(t);
+ }
+ // build a pending paxos proposal from all non-permanent k/v pairs. once the
+ // proposal is committed, it will get applied. on the sync provider side, it
+ // will be a no-op, but on its peers, the paxos commit will help to build up
+ // the necessary epochs.
+ bufferlist pending_proposal;
+ {
+ MonitorDBStore::Transaction t;
+ // bugfix: this list must match the prefixes written by update_auth() /
+ // update_mgrmap(); the command descriptions are stored under
+ // "mgr_command_descs" (plural) -- the previous "mgr_command_desc" never
+ // matched anything, so the descs were silently left out of the proposal.
+ vector<string> prefixes = {"auth", "osdmap",
+ "mgr", "mgr_command_descs"};
+ for (const auto& prefix : prefixes) {
+ for (auto i = st.get_iterator(prefix); i->valid(); i->next()) {
+ auto key = i->raw_key();
+ auto val = i->value();
+ t.put(key.first, key.second, val);
+ }
+ }
+ t.encode(pending_proposal);
+ }
+ // stash the proposal as pending_v/pending_pn so it is committed by the
+ // first quorum formed after the rebuild
+ auto pending_v = last_committed + 1;
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, pending_v, pending_proposal);
+ t->put(prefix, "pending_v", pending_v);
+ t->put(prefix, "pending_pn", 400);
+ st.apply_transaction(t);
+ return 0;
+}
+
+// 'rebuild' command handler: reconstruct a monitor store in @st from the
+// osdmap history it already contains, plus (optionally) an admin keyring
+// and an initial monmap. Each sub-step returns 0 or a negative errno which
+// is propagated to the caller; the steps must run in this order since
+// update_paxos() snapshots the prefixes the earlier steps populate.
+int rebuild_monstore(const char* progname,
+ vector<string>& subcmds,
+ MonitorDBStore& st)
+{
+ po::options_description op_desc("Allowed 'rebuild' options");
+ string keyring_path;
+ string monmap_path;
+ vector<string> mon_ids;
+ op_desc.add_options()
+ ("keyring", po::value<string>(&keyring_path),
+ "path to the client.admin key")
+ ("monmap", po::value<string>(&monmap_path),
+ "path to the initial monmap")
+ ("mon-ids", po::value<vector<string>>(&mon_ids)->multitoken(),
+ "mon ids, use 'a', 'b', ... if not specified");
+ po::positional_options_description pos_desc;
+ pos_desc.add("mon-ids", -1);
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, nullptr, &pos_desc, subcmds, &op_vm);
+ if (r) {
+ return -r;
+ }
+ if (op_vm.count("help")) {
+ usage(progname, op_desc);
+ return 0;
+ }
+ // bugfix: update_auth()'s return value used to be ignored, so a bad
+ // keyring silently produced a store with no auth data in it.
+ if (!keyring_path.empty() && (r = update_auth(st, keyring_path))) {
+ return r;
+ }
+ if ((r = update_creating_pgs(st))) {
+ return r;
+ }
+ if ((r = update_mgrmap(st))) {
+ return r;
+ }
+ if ((r = update_paxos(st))) {
+ return r;
+ }
+ if ((r = update_mkfs(st, monmap_path, mon_ids))) {
+ return r;
+ }
+ if ((r = update_monitor(st))) {
+ return r;
+ }
+ return 0;
+}
+
+// Tool entry point: parse "<store-path> <command> [subcmds...]", forward any
+// unrecognized options to global_init(), open the MonitorDBStore, and
+// dispatch to the per-command handler. Exit status is 0 on success, non-zero
+// (errno-style or 1) on failure.
+int main(int argc, char **argv) {
+ int err = 0;
+ po::options_description desc("Allowed options");
+ string store_path, cmd;
+ vector<string> subcmds;
+ desc.add_options()
+ ("help,h", "produce help message")
+ ;
+
+ /* Dear Future Developer:
+ *
+ * for further improvement, should you need to pass specific options to
+ * a command (e.g., get osdmap VER --hex), you can expand the current
+ * format by creating additional 'po::option_description' and passing
+ * 'subcmds' to 'po::command_line_parser', much like what is currently
+ * done by default. However, beware: in order to differentiate a
+ * command-specific option from the generic/global options, you will need
+ * to pass '--' in the command line (so that the first parser, the one
+ * below, assumes it has reached the end of all options); e.g.,
+ * 'get osdmap VER -- --hex'. Not pretty; far from intuitive; it was as
+ * far as I got with this library. Improvements on this format will be
+ * left as an exercise for the reader. -Joao
+ */
+ po::options_description positional_desc("Positional argument options");
+ positional_desc.add_options()
+ ("store-path", po::value<string>(&store_path),
+ "path to monitor's store")
+ ("command", po::value<string>(&cmd),
+ "Command")
+ ("subcmd", po::value<vector<string> >(&subcmds),
+ "Command arguments/Sub-Commands")
+ ;
+ po::positional_options_description positional;
+ positional.add("store-path", 1);
+ positional.add("command", 1);
+ positional.add("subcmd", -1);
+
+ po::options_description all_desc("All options");
+ all_desc.add(desc).add(positional_desc);
+
+ vector<string> ceph_option_strings;
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).
+ options(all_desc).
+ positional(positional).
+ allow_unregistered().run();
+
+ po::store(
+ parsed,
+ vm);
+ po::notify(vm);
+
+ // Specifying po::include_positional would have our positional arguments
+ // being collected (thus being part of ceph_option_strings and eventually
+ // passed on to global_init() below).
+ // Instead we specify po::exclude_positional, which has the upside of
+ // completely avoid this, but the downside of having to specify ceph
+ // options as --VAR=VAL (note the '='); otherwise we will capture the
+ // positional 'VAL' as belonging to us, never being collected.
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::exclude_positional);
+
+ } catch(po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+
+ // parse command structure before calling global_init() and friends.
+
+ if (vm.empty() || vm.count("help") ||
+ store_path.empty() || cmd.empty() ||
+ *cmd.begin() == '-') {
+ usage(argv[0], desc);
+ return 1;
+ }
+
+ // hand the leftover options to global_init() as argv-style C strings
+ vector<const char *> ceph_options;
+ ceph_options.reserve(ceph_option_strings.size());
+ for (vector<string>::iterator i = ceph_option_strings.begin();
+ i != ceph_option_strings.end();
+ ++i) {
+ ceph_options.push_back(i->c_str());
+ }
+
+ auto cct = global_init(
+ NULL, ceph_options, CEPH_ENTITY_TYPE_MON,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+ common_init_finish(g_ceph_context);
+ cct->_conf.apply_changes(nullptr);
+
+ // this is where we'll write *whatever*, on a per-command basis.
+ // not all commands require some place to write their things.
+ MonitorDBStore st(store_path);
+ if (store_path.size()) {
+ stringstream ss;
+ int r = st.open(ss);
+ if (r < 0) {
+ std::cerr << ss.str() << std::endl;
+ return EINVAL;
+ }
+ }
+
+ // command dispatch: each branch parses its own sub-options from 'subcmds'
+ if (cmd == "dump-keys") {
+ KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
+ while (iter->valid()) {
+ pair<string,string> key(iter->raw_key());
+ cout << key.first << " / " << key.second << std::endl;
+ iter->next();
+ }
+ } else if (cmd == "compact") {
+ st.compact();
+ } else if (cmd == "get") {
+ unsigned v = 0;
+ string outpath;
+ string map_type;
+ // visible options for this command
+ po::options_description op_desc("Allowed 'get' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("out,o", po::value<string>(&outpath),
+ "output file (default: stdout)")
+ ("version,v", po::value<unsigned>(&v),
+ "map version to obtain")
+ ("readable,r", "print the map information in human readable format")
+ ;
+ // this is going to be a positional argument; we don't want to show
+ // it as an option during --help, but we do want to have it captured
+ // when parsing.
+ po::options_description hidden_op_desc("Hidden 'get' options");
+ hidden_op_desc.add_options()
+ ("map-type", po::value<string>(&map_type),
+ "map-type")
+ ;
+ po::positional_options_description op_positional;
+ op_positional.add("map-type", 1);
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help") || map_type.empty()) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ // default to the latest committed version of the requested map
+ if (v == 0) {
+ if (map_type == "crushmap") {
+ v = st.get("osdmap", "last_committed");
+ } else {
+ v = st.get(map_type, "last_committed");
+ }
+ }
+
+ int fd = STDOUT_FILENO;
+ if (!outpath.empty()){
+ fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0666);
+ if (fd < 0) {
+ std::cerr << "error opening output file: "
+ << cpp_strerror(errno) << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+ }
+
+ // on scope exit: close fd and unlink a partially-written output file.
+ // NOTE(review): with no outpath this also closes STDOUT_FILENO right
+ // before process exit -- confirm that is harmless here.
+ BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) {
+ ::close(fd);
+ if (r < 0 && fd != STDOUT_FILENO) {
+ ::remove(outpath.c_str());
+ }
+ } BOOST_SCOPE_EXIT_END
+
+ bufferlist bl;
+ r = 0;
+ if (map_type == "osdmap") {
+ r = st.get(map_type, st.combine_strings("full", v), bl);
+ } else if (map_type == "crushmap") {
+ // the crush map is not stored on its own; extract it from the full
+ // osdmap of the requested epoch
+ bufferlist tmp;
+ r = st.get("osdmap", st.combine_strings("full", v), tmp);
+ if (r >= 0) {
+ OSDMap osdmap;
+ osdmap.decode(tmp);
+ osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+ } else {
+ r = st.get(map_type, v, bl);
+ }
+ if (r < 0) {
+ std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ if (op_vm.count("readable")) {
+ stringstream ss;
+ bufferlist out;
+ try {
+ if (map_type == "monmap") {
+ MonMap monmap;
+ monmap.decode(bl);
+ monmap.print(ss);
+ } else if (map_type == "osdmap") {
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ osdmap.print(ss);
+ } else if (map_type == "mdsmap") {
+ FSMap fs_map;
+ fs_map.decode(bl);
+ fs_map.print(ss);
+ } else if (map_type == "mgr") {
+ MgrMap mgr_map;
+ auto p = bl.cbegin();
+ mgr_map.decode(p);
+ JSONFormatter f;
+ f.dump_object("mgrmap", mgr_map);
+ f.flush(ss);
+ } else if (map_type == "crushmap") {
+ CrushWrapper cw;
+ auto it = bl.cbegin();
+ cw.decode(it);
+ CrushCompiler cc(cw, std::cerr, 0);
+ cc.decompile(ss);
+ } else {
+ std::cerr << "This type of readable map does not exist: " << map_type
+ << std::endl << "You can only specify[osdmap|monmap|mdsmap"
+ "|crushmap|mgr]" << std::endl;
+ }
+ } catch (const buffer::error &err) {
+ std::cerr << "Could not decode for human readable output (you may still"
+ " use non-readable mode). Detail: " << err.what() << std::endl;
+ }
+
+ out.append(ss);
+ out.write_fd(fd);
+ } else {
+ bl.write_fd(fd);
+ }
+
+ if (!outpath.empty()) {
+ std::cout << "wrote " << map_type
+ << " version " << v << " to " << outpath
+ << std::endl;
+ }
+ } else if (cmd == "show-versions") {
+ string map_type; //map type:osdmap,monmap...
+ // visible options for this command
+ po::options_description op_desc("Allowed 'show-versions' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("map-type", po::value<string>(&map_type), "map_type");
+
+ po::positional_options_description op_positional;
+ op_positional.add("map-type", 1);
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, NULL, &op_positional,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help") || map_type.empty()) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ unsigned int v_first = 0;
+ unsigned int v_last = 0;
+ v_first = st.get(map_type, "first_committed");
+ v_last = st.get(map_type, "last_committed");
+
+ std::cout << "first committed:\t" << v_first << "\n"
+ << "last committed:\t" << v_last << std::endl;
+ } else if (cmd == "dump-paxos") {
+ unsigned dstart = 0;
+ unsigned dstop = ~0;
+ po::options_description op_desc("Allowed 'dump-paxos' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("start,s", po::value<unsigned>(&dstart),
+ "starting version (default: 0)")
+ ("end,e", po::value<unsigned>(&dstop),
+ "finish version (default: ~0)")
+ ;
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, NULL, NULL,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (dstart > dstop) {
+ std::cerr << "error: 'start' version (value: " << dstart << ") "
+ << " is greater than 'end' version (value: " << dstop << ")"
+ << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ // dump consecutive paxos transactions until a gap or dstop is reached
+ version_t v = dstart;
+ for (; v <= dstop; ++v) {
+ bufferlist bl;
+ st.get("paxos", v, bl);
+ if (bl.length() == 0)
+ break;
+ cout << "\n--- " << v << " ---" << std::endl;
+ auto tx(std::make_shared<MonitorDBStore::Transaction>());
+ Paxos::decode_append_transaction(tx, bl);
+ JSONFormatter f(true);
+ tx->dump(&f);
+ f.flush(cout);
+ }
+
+ std::cout << "dumped " << v << " paxos versions" << std::endl;
+
+ } else if (cmd == "dump-trace") {
+ unsigned dstart = 0;
+ unsigned dstop = ~0;
+ string outpath;
+
+ // visible options for this command
+ po::options_description op_desc("Allowed 'dump-trace' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("start,s", po::value<unsigned>(&dstart),
+ "starting version (default: 0)")
+ ("end,e", po::value<unsigned>(&dstop),
+ "finish version (default: ~0)")
+ ;
+ // this is going to be a positional argument; we don't want to show
+ // it as an option during --help, but we do want to have it captured
+ // when parsing.
+ po::options_description hidden_op_desc("Hidden 'dump-trace' options");
+ hidden_op_desc.add_options()
+ ("out,o", po::value<string>(&outpath),
+ "file to write the dump to")
+ ;
+ po::positional_options_description op_positional;
+ op_positional.add("out", 1);
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (outpath.empty()) {
+ usage(argv[0], op_desc);
+ err = EINVAL;
+ goto done;
+ }
+
+ if (dstart > dstop) {
+ std::cerr << "error: 'start' version (value: " << dstart << ") "
+ << " is greater than 'stop' version (value: " << dstop << ")"
+ << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ // walk the trace file, dumping transactions in [dstart, dstop)
+ TraceIter iter(outpath.c_str());
+ iter.init();
+ while (true) {
+ if (!iter.valid())
+ break;
+ if (iter.num() >= dstop) {
+ break;
+ }
+ if (iter.num() >= dstart) {
+ JSONFormatter f(true);
+ iter.cur()->dump(&f, false);
+ f.flush(std::cout);
+ std::cout << std::endl;
+ }
+ iter.next();
+ }
+ std::cerr << "Read up to transaction " << iter.num() << std::endl;
+ } else if (cmd == "replay-trace") {
+ string inpath;
+ unsigned num_replays = 1;
+ // visible options for this command
+ po::options_description op_desc("Allowed 'replay-trace' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("num-replays,n", po::value<unsigned>(&num_replays),
+ "finish version (default: 1)")
+ ;
+ // this is going to be a positional argument; we don't want to show
+ // it as an option during --help, but we do want to have it captured
+ // when parsing.
+ po::options_description hidden_op_desc("Hidden 'replay-trace' options");
+ hidden_op_desc.add_options()
+ ("in,i", po::value<string>(&inpath),
+ "file to write the dump to")
+ ;
+ po::positional_options_description op_positional;
+ op_positional.add("in", 1);
+
+ // op_desc_all will aggregate all visible and hidden options for parsing.
+ // when we call 'usage()' we just pass 'op_desc', as that's the description
+ // holding the visible options.
+ po::options_description op_desc_all;
+ op_desc_all.add(op_desc).add(hidden_op_desc);
+
+ po::variables_map op_vm;
+ try {
+ po::parsed_options op_parsed = po::command_line_parser(subcmds).
+ options(op_desc_all).positional(op_positional).run();
+ po::store(op_parsed, op_vm);
+ po::notify(op_vm);
+ } catch (po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (inpath.empty()) {
+ usage(argv[0], op_desc);
+ err = EINVAL;
+ goto done;
+ }
+
+ // apply every transaction from the trace file, num_replays times over
+ unsigned num = 0;
+ for (unsigned i = 0; i < num_replays; ++i) {
+ TraceIter iter(inpath.c_str());
+ iter.init();
+ while (true) {
+ if (!iter.valid())
+ break;
+ std::cerr << "Replaying trans num " << num << std::endl;
+ st.apply_transaction(iter.cur());
+ iter.next();
+ ++num;
+ }
+ std::cerr << "Read up to transaction " << iter.num() << std::endl;
+ }
+ } else if (cmd == "random-gen") {
+ unsigned tsize = 200;
+ unsigned tvalsize = 1024;
+ unsigned ntrans = 100;
+ po::options_description op_desc("Allowed 'random-gen' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("num-keys,k", po::value<unsigned>(&tsize),
+ "keys to write in each transaction (default: 200)")
+ ("size,s", po::value<unsigned>(&tvalsize),
+ "size (in bytes) of the value to write in each key (default: 1024)")
+ ("ntrans,n", po::value<unsigned>(&ntrans),
+ "number of transactions to run (default: 100)")
+ ;
+
+ po::variables_map op_vm;
+ try {
+ po::parsed_options op_parsed = po::command_line_parser(subcmds).
+ options(op_desc).run();
+ po::store(op_parsed, op_vm);
+ po::notify(op_vm);
+ } catch (po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ // fill the store with ntrans transactions of tsize random-valued keys,
+ // cycling through single-letter prefixes 'a'..'z'
+ unsigned num = 0;
+ for (unsigned i = 0; i < ntrans; ++i) {
+ std::cerr << "Applying trans " << i << std::endl;
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ string prefix;
+ prefix.push_back((i%26)+'a');
+ for (unsigned j = 0; j < tsize; ++j) {
+ stringstream os;
+ os << num;
+ bufferlist bl;
+ for (unsigned k = 0; k < tvalsize; ++k) bl.append(rand());
+ t->put(prefix, os.str(), bl);
+ ++num;
+ }
+ t->compact_prefix(prefix);
+ st.apply_transaction(t);
+ }
+ } else if (cmd == "store-copy") {
+ if (subcmds.size() < 1 || subcmds[0].empty()) {
+ usage(argv[0], desc);
+ err = EINVAL;
+ goto done;
+ }
+
+ string out_path = subcmds[0];
+
+ MonitorDBStore out_store(out_path);
+ {
+ stringstream ss;
+ int r = out_store.create_and_open(ss);
+ if (r < 0) {
+ std::cerr << ss.str() << std::endl;
+ // bugfix: propagate the failure -- err was previously left at 0,
+ // so the tool exited successfully without copying anything.
+ err = EINVAL;
+ goto done;
+ }
+ }
+
+
+ // copy everything over in batches of up to 128 keys per transaction
+ KeyValueDB::WholeSpaceIterator it = st.get_iterator();
+ uint64_t total_keys = 0;
+ uint64_t total_size = 0;
+ uint64_t total_tx = 0;
+
+ do {
+ uint64_t num_keys = 0;
+
+ auto tx(std::make_shared<MonitorDBStore::Transaction>());
+
+ while (it->valid() && num_keys < 128) {
+ pair<string,string> k = it->raw_key();
+ bufferlist v = it->value();
+ tx->put(k.first, k.second, v);
+
+ num_keys ++;
+ total_tx ++;
+ total_size += v.length();
+
+ it->next();
+ }
+
+ total_keys += num_keys;
+
+ if (!tx->empty())
+ out_store.apply_transaction(tx);
+
+ std::cout << "copied " << total_keys << " keys so far ("
+ << stringify(byte_u_t(total_size)) << ")" << std::endl;
+
+ } while (it->valid());
+ out_store.close();
+ std::cout << "summary: copied " << total_keys << " keys, using "
+ << total_tx << " transactions, totalling "
+ << stringify(byte_u_t(total_size)) << std::endl;
+ std::cout << "from '" << store_path << "' to '" << out_path << "'"
+ << std::endl;
+ } else if (cmd == "rewrite-crush") {
+ err = rewrite_crush(argv[0], subcmds, st);
+ } else if (cmd == "rebuild") {
+ err = rebuild_monstore(argv[0], subcmds, st);
+ } else {
+ std::cerr << "Unrecognized command: " << cmd << std::endl;
+ usage(argv[0], desc);
+ // bugfix: exit non-zero on unknown commands (err was left at 0 before)
+ err = EINVAL;
+ goto done;
+ }
+
+ done:
+ st.close();
+ return err;
+}
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
new file mode 100644
index 000000000..590b4a344
--- /dev/null
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -0,0 +1,4684 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/optional.hpp>
+#include <fstream>
+
+#include <stdlib.h>
+
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include "common/url_escape.h"
+
+#include "global/global_init.h"
+
+#include "os/ObjectStore.h"
+#include "os/filestore/FileJournal.h"
+#include "os/filestore/FileStore.h"
+#ifdef HAVE_LIBFUSE
+#include "os/FuseStore.h"
+#endif
+
+#include "osd/PGLog.h"
+#include "osd/OSD.h"
+#include "osd/PG.h"
+#include "osd/ECUtil.h"
+
+#include "json_spirit/json_spirit_value.h"
+#include "json_spirit/json_spirit_reader.h"
+
+#include "rebuild_mondb.h"
+#include "ceph_objectstore_tool.h"
+#include "include/compat.h"
+#include "include/util.h"
+
+namespace po = boost::program_options;
+
+#ifdef INTERNAL_TEST
+// Build the CompatSet an OSD data store is expected to carry, for test
+// builds only (compiled under INTERNAL_TEST).  All listed features are
+// "incompat": a store lacking any of them cannot be opened.
+CompatSet get_test_compat_set() {
+  CompatSet::FeatureSet ceph_osd_feature_compat;
+  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+  CompatSet::FeatureSet ceph_osd_feature_incompat;
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+#ifdef INTERNAL_TEST2
+  // SNAPMAPPER/SHARDS only in the second-level test configuration.
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+                   ceph_osd_feature_incompat);
+}
+#endif
+
+// Chunk size (1 MiB) used when streaming object data to/from a file.
+const ssize_t max_read = 1024 * 1024;
+// Sentinel for "no file descriptor open".
+const int fd_none = INT_MIN;
+bool outistty;  // stdout is a terminal (affects raw-data output)
+bool dry_run;   // when set, no transaction is queued to the store
+
+// Visitor interface: call() is invoked once per object by the
+// action_on_all_objects* iteration helpers below.
+struct action_on_object_t {
+  virtual ~action_on_object_t() {}
+  virtual void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) = 0;
+};
+
+// Invoke `action` on every object in a single collection, listing in
+// batches of LIST_AT_A_TIME.  pgmeta objects are skipped.  For non-meta
+// collections the object's object_info_t is decoded from OI_ATTR; a
+// getattr/decode failure only prints an error -- the action is still
+// called with a default-constructed oi and iteration continues
+// (attr errors are deliberately non-fatal here).
+// Returns 0, or a negative error from collection_list().
+int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
+{
+  auto ch = store->open_collection(coll);
+  unsigned LIST_AT_A_TIME = 100;
+  ghobject_t next;
+  while (!next.is_max()) {
+    vector<ghobject_t> list;
+    int r = store->collection_list(ch,
+                                   next,
+                                   ghobject_t::get_max(),
+                                   LIST_AT_A_TIME,
+                                   &list,
+                                   &next);
+    if (r < 0) {
+      cerr << "Error listing collection: " << coll << ", "
+           << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    for (vector<ghobject_t>::iterator obj = list.begin();
+         obj != list.end();
+         ++obj) {
+      if (obj->is_pgmeta())
+        continue;
+      object_info_t oi;
+      if (coll != coll_t::meta()) {
+        bufferlist attr;
+        r = store->getattr(ch, *obj, OI_ATTR, attr);
+        if (r < 0) {
+          cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
+               << cpp_strerror(r) << std::endl;
+        } else {
+          auto bp = attr.cbegin();
+          try {
+            decode(oi, bp);
+          } catch (...) {
+            r = -EINVAL;
+            cerr << "Error decoding attr on : " << make_pair(coll, *obj) << ", "
+                 << cpp_strerror(r) << std::endl;
+          }
+        }
+      }
+      action.call(store, coll, *obj, oi);
+    }
+  }
+  return 0;
+}
+
+// Run `action` over every object of the PG named by `pgidstr`.  Because
+// an EC pool PG is stored as one collection per shard, all collections
+// are scanned and any whose pgid matches (exactly, or ignoring shard
+// when no shard was specified) is visited.
+// NOTE(review): the return value of pgid.parse() is ignored -- an
+// unparseable pgid silently matches nothing; confirm callers validate
+// the string beforehand.
+int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug)
+{
+  spg_t pgid;
+  // Scan collections in case this is an ec pool but no shard specified
+  unsigned scanned = 0;
+  int r = 0;
+  vector<coll_t> colls_to_check;
+  vector<coll_t> candidates;
+  r = store->list_collections(candidates);
+  if (r < 0) {
+    cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  pgid.parse(pgidstr.c_str());
+  for (vector<coll_t>::iterator i = candidates.begin();
+       i != candidates.end();
+       ++i) {
+    spg_t cand_pgid;
+    if (!i->is_pg(&cand_pgid))
+      continue;
+
+    // If an exact match or treat no shard as any shard
+    if (cand_pgid == pgid ||
+        (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) {
+      colls_to_check.push_back(*i);
+    }
+  }
+
+  if (debug)
+    cerr << colls_to_check.size() << " pgs to scan" << std::endl;
+  for (vector<coll_t>::iterator i = colls_to_check.begin();
+       i != colls_to_check.end();
+       ++i, ++scanned) {
+    if (debug)
+      cerr << "Scanning " << *i << ", " << scanned << "/"
+           << colls_to_check.size() << " completed" << std::endl;
+    // Stop at the first failing collection and propagate its error.
+    r = _action_on_all_objects_in_pg(store, *i, action, debug);
+    if (r < 0)
+      break;
+  }
+  return r;
+}
+
+// Convenience wrapper: visit every object of exactly one collection,
+// with no shard scanning.  Propagates the helper's return value.
+int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
+{
+  return _action_on_all_objects_in_pg(store, coll, action, debug);
+}
+
+// Invoke `action` on every object of every PG collection in the store.
+// Non-PG collections (e.g. meta, temp) are skipped.  Unlike the
+// per-pgid variant above, a failure in one collection aborts the whole
+// scan immediately.
+int _action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug)
+{
+  unsigned scanned = 0;
+  int r = 0;
+  vector<coll_t> colls_to_check;
+  vector<coll_t> candidates;
+  r = store->list_collections(candidates);
+  if (r < 0) {
+    cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  for (vector<coll_t>::iterator i = candidates.begin();
+       i != candidates.end();
+       ++i) {
+    if (i->is_pg()) {
+      colls_to_check.push_back(*i);
+    }
+  }
+
+  if (debug)
+    cerr << colls_to_check.size() << " pgs to scan" << std::endl;
+  for (vector<coll_t>::iterator i = colls_to_check.begin();
+       i != colls_to_check.end();
+       ++i, ++scanned) {
+    if (debug)
+      cerr << "Scanning " << *i << ", " << scanned << "/"
+           << colls_to_check.size() << " completed" << std::endl;
+    r = _action_on_all_objects_in_pg(store, *i, action, debug);
+    if (r < 0)
+      return r;
+  }
+  return 0;
+}
+
+// Public entry point: apply `action` to all objects in all PG
+// collections of `store`.
+int action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug)
+{
+  return _action_on_all_objects(store, action, debug);
+}
+
+// Accumulates (collection, object) pairs and dumps them via a
+// Formatter.  In human-readable mode each entry is flushed on its own
+// line; otherwise everything is wrapped in one JSON array.
+struct pgid_object_list {
+  list<pair<coll_t, ghobject_t> > _objects;
+
+  void insert(coll_t coll, ghobject_t &ghobj) {
+    _objects.push_back(make_pair(coll, ghobj));
+  }
+
+  void dump(Formatter *f, bool human_readable) const {
+    if (!human_readable)
+      f->open_array_section("pgid_objects");
+    for (list<pair<coll_t, ghobject_t> >::const_iterator i = _objects.begin();
+         i != _objects.end();
+         ++i) {
+      f->open_array_section("pgid_object");
+      spg_t pgid;
+      bool is_pg = i->first.is_pg(&pgid);
+      // Prefer the pgid when the collection is a PG; fall back to the
+      // raw collection name (always emitted in machine-readable mode).
+      if (is_pg)
+        f->dump_string("pgid", stringify(pgid));
+      if (!is_pg || !human_readable)
+        f->dump_string("coll", i->first.to_str());
+      f->open_object_section("ghobject");
+      i->second.dump(f);
+      f->close_section();
+      f->close_section();
+      if (human_readable) {
+        f->flush(cout);
+        cout << std::endl;
+      }
+    }
+    if (!human_readable) {
+      f->close_section();
+      f->flush(cout);
+      cout << std::endl;
+    }
+  }
+};
+
+// Visitor that collects objects matching a name (empty name matches
+// all) and an optional namespace; optionally restricted to objects
+// that can carry a SnapSet.  Used by the list/lookup commands.
+struct lookup_ghobject : public action_on_object_t {
+  pgid_object_list _objects;
+  const string _name;
+  const boost::optional<std::string> _namespace;
+  bool _need_snapset;
+
+  lookup_ghobject(const string& name, const boost::optional<std::string>& nspace, bool need_snapset = false) : _name(name),
+    _namespace(nspace), _need_snapset(need_snapset) { }
+
+  void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) override {
+    if (_need_snapset && !ghobj.hobj.has_snapset())
+      return;
+    // Empty _name acts as a wildcard; unset _namespace matches any.
+    if ((_name.length() == 0 || ghobj.hobj.oid.name == _name) &&
+        (!_namespace || ghobj.hobj.nspace == _namespace))
+      _objects.insert(coll, ghobj);
+    return;
+  }
+
+  int size() const {
+    return _objects._objects.size();
+  }
+
+  // Remove and return the first collected match (caller checks size()).
+  pair<coll_t, ghobject_t> pop() {
+    pair<coll_t, ghobject_t> front = _objects._objects.front();
+    _objects._objects.pop_front();
+    return front;
+  }
+
+  void dump(Formatter *f, bool human_readable) const {
+    _objects.dump(f, human_readable);
+  }
+};
+
+// Visitor that finds objects whose full omap iteration takes longer
+// than `threshold` seconds ("slow omap" diagnostics).  For each object
+// it records the time of the first seek, the time of the last next()
+// call, the total scan time, and the iterator's tail key.
+struct lookup_slow_ghobject : public action_on_object_t {
+  list<tuple<
+    coll_t,
+    ghobject_t,
+    ceph::signedspan,
+    ceph::signedspan,
+    ceph::signedspan,
+    string> > _objects;
+  const string _name;
+  double threshold;  // seconds; objects scanning longer are reported
+
+  coll_t last_coll;  // last collection announced on stderr
+
+  lookup_slow_ghobject(const string& name, double _threshold) :
+    _name(name), threshold(_threshold) { }
+
+  void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) override {
+    ObjectMap::ObjectMapIterator iter;
+    auto start1 = mono_clock::now();
+    // Zero-duration initializers of the correct type.
+    ceph::signedspan first_seek_time = start1 - start1;
+    ceph::signedspan last_seek_time = first_seek_time;
+    ceph::signedspan total_time = first_seek_time;
+    {
+      auto ch = store->open_collection(coll);
+      iter = store->get_omap_iterator(ch, ghobj);
+      if (!iter) {
+        cerr << "omap_get_iterator: " << cpp_strerror(ENOENT)
+             << " obj:" << ghobj
+             << std::endl;
+        return;
+      }
+      auto start = mono_clock::now();
+      iter->seek_to_first();
+      first_seek_time = mono_clock::now() - start;
+
+      // Walk the whole omap; each iteration overwrites last_seek_time,
+      // so it ends up holding the duration of the final next() call.
+      while(iter->valid()) {
+        start = mono_clock::now();
+        iter->next();
+        last_seek_time = mono_clock::now() - start;
+      }
+    }
+
+    if (coll != last_coll) {
+      cerr << ">>> inspecting coll" << coll << std::endl;
+      last_coll = coll;
+    }
+
+    total_time = mono_clock::now() - start1;
+    if ( total_time >= make_timespan(threshold)) {
+      _objects.emplace_back(coll, ghobj,
+                            first_seek_time, last_seek_time, total_time,
+                            url_escape(iter->tail_key()));
+      cerr << ">>>>> found obj " << ghobj
+           << " first_seek_time "
+           << std::chrono::duration_cast<std::chrono::seconds>(first_seek_time).count()
+           << " last_seek_time "
+           << std::chrono::duration_cast<std::chrono::seconds>(last_seek_time).count()
+           << " total_time "
+           << std::chrono::duration_cast<std::chrono::seconds>(total_time).count()
+           << " tail key: " << url_escape(iter->tail_key())
+           << std::endl;
+    }
+    return;
+  }
+
+  int size() const {
+    return _objects.size();
+  }
+
+  // Dump collected slow objects; mirrors pgid_object_list::dump layout,
+  // with per-object timing (whole seconds) added.
+  void dump(Formatter *f, bool human_readable) const {
+    if (!human_readable)
+      f->open_array_section("objects");
+    for (auto i = _objects.begin();
+         i != _objects.end();
+         ++i) {
+      f->open_array_section("object");
+      coll_t coll;
+      ghobject_t ghobj;
+      ceph::signedspan first_seek_time;
+      ceph::signedspan last_seek_time;
+      ceph::signedspan total_time;
+      string tail_key;
+      std::tie(coll, ghobj, first_seek_time, last_seek_time, total_time, tail_key) = *i;
+
+      spg_t pgid;
+      bool is_pg = coll.is_pg(&pgid);
+      if (is_pg)
+        f->dump_string("pgid", stringify(pgid));
+      if (!is_pg || !human_readable)
+        f->dump_string("coll", coll.to_str());
+      f->dump_object("ghobject", ghobj);
+      f->open_object_section("times");
+      f->dump_int("first_seek_time",
+                  std::chrono::duration_cast<std::chrono::seconds>(first_seek_time).count());
+      f->dump_int("last_seek_time",
+                  std::chrono::duration_cast<std::chrono::seconds>
+                  (last_seek_time).count());
+      f->dump_int("total_time",
+                  std::chrono::duration_cast<std::chrono::seconds>(total_time).count());
+      f->dump_string("tail_key", tail_key);
+      f->close_section();
+
+      f->close_section();
+      if (human_readable) {
+        f->flush(cout);
+        cout << std::endl;
+      }
+    }
+    if (!human_readable) {
+      f->close_section();
+      f->flush(cout);
+      cout << std::endl;
+    }
+  }
+};
+
+// Fd of the --file export/import target; fd_none means not open.
+int file_fd = fd_none;
+bool debug;
+bool force = false;
+bool no_superblock = false;
+
+// Export-file super header, written/read by write_super()/read_super().
+super_header sh;
+
+// Slurp the entire contents of `fd` into `bl`, reading at most
+// max_read bytes per call until EOF.  Returns 0 on success or the
+// negative error from bufferlist::read_fd().
+static int get_fd_data(int fd, bufferlist &bl)
+{
+  uint64_t total = 0;
+  do {
+    ssize_t bytes = bl.read_fd(fd, max_read);
+    if (bytes < 0) {
+      cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+      return bytes;
+    }
+
+    if (bytes == 0)
+      break;  // EOF
+
+    total += bytes;
+  } while(true);
+
+  ceph_assert(bl.length() == total);
+  return 0;
+}
+
+// Read the PG log and missing set for `pgid` from its pgmeta object.
+// Returns 0 on success, -ENOENT if the collection cannot be opened,
+// or -EFAULT when decoding throws a buffer::error.
+int get_log(CephContext *cct, ObjectStore *fs, __u8 struct_ver,
+            spg_t pgid, const pg_info_t &info,
+            PGLog::IndexedLog &log, pg_missing_t &missing)
+{
+  try {
+    auto ch = fs->open_collection(coll_t(pgid));
+    if (!ch) {
+      return -ENOENT;
+    }
+    ostringstream oss;
+    ceph_assert(struct_ver > 0);
+    PGLog::read_log_and_missing(
+      cct, fs, ch,
+      pgid.make_pgmeta_oid(),
+      info, log, missing,
+      oss,
+      g_ceph_context->_conf->osd_ignore_stale_divergent_priors);
+    // Any warnings read_log_and_missing produced go to stderr in debug mode.
+    if (debug && oss.str().size())
+      cerr << oss.str() << std::endl;
+  }
+  catch (const buffer::error &e) {
+    cerr << "read_log_and_missing threw exception error " << e.what() << std::endl;
+    return -EFAULT;
+  }
+  return 0;
+}
+
+// Dump a pg_log_t and its pg_missing_t to `out` as one "op_log"
+// formatter object (flushed midway so very large logs stream out).
+void dump_log(Formatter *formatter, ostream &out, pg_log_t &log,
+              pg_missing_t &missing)
+{
+  formatter->open_object_section("op_log");
+  formatter->open_object_section("pg_log_t");
+  log.dump(formatter);
+  formatter->close_section();
+  formatter->flush(out);
+  formatter->open_object_section("pg_missing_t");
+  missing.dump(formatter);
+  formatter->close_section();
+  formatter->close_section();
+  formatter->flush(out);
+}
+
+//Based on part of OSD::load_pgs()
+int finish_remove_pgs(ObjectStore *store)
+{
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ if (r < 0) {
+ cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ for (vector<coll_t>::iterator it = ls.begin();
+ it != ls.end();
+ ++it) {
+ spg_t pgid;
+
+ if (it->is_temp(&pgid) ||
+ (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
+ cout << "finish_remove_pgs " << *it << " removing " << pgid << std::endl;
+ OSD::recursive_remove_collection(g_ceph_context, store, pgid, *it);
+ continue;
+ }
+
+ //cout << "finish_remove_pgs ignoring unrecognized " << *it << std::endl;
+ }
+ return 0;
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+// Stage removal of a PG by setting the "_remove" key in its pgmeta
+// omap (the same flag the OSD checks on startup); the actual deletion
+// happens later in finish_remove_pgs().  A peek_map_epoch failure is
+// only a warning, but a read_info failure aborts.
+int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t)
+{
+  pg_info_t info(pgid);
+  coll_t coll(pgid);
+  ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
+
+  epoch_t map_epoch = 0;
+  int r = PG::peek_map_epoch(fs, pgid, &map_epoch);
+  if (r < 0)
+    cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl;
+  PastIntervals past_intervals;
+  __u8 struct_v;
+  r = PG::read_info(fs, pgid, coll, info, past_intervals, struct_v);
+  if (r < 0) {
+    cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  // Only on-disk formats >= 8 use the omap removal flag.
+  ceph_assert(struct_v >= 8);
+  // new omap key
+  cout << "setting '_remove' omap key" << std::endl;
+  map<string,bufferlist> values;
+  encode((char)1, values["_remove"]);
+  t->omap_setkeys(coll, pgmeta_oid, values);
+  return 0;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+// Register a completion on `txn`, run `func` (which is expected to
+// queue the transaction), then block until the completion callback
+// fires.  The mutex/condvar pair guards `finished` so the notify
+// cannot be missed between the predicate check and the wait.
+template<typename Func>
+void wait_until_done(ObjectStore::Transaction* txn, Func&& func)
+{
+  bool finished = false;
+  std::condition_variable cond;
+  std::mutex m;
+  txn->register_on_complete(make_lambda_context([&](int) {
+    std::unique_lock lock{m};
+    finished = true;
+    cond.notify_one();
+  }));
+  std::move(func)();
+  std::unique_lock lock{m};
+  cond.wait(lock, [&] {return finished;});
+}
+
+// Remove a PG: first sweep any previously-staged removals, then mark
+// this PG's collection for removal and sweep again.  In dry-run mode
+// nothing is modified.  Returns -ENOENT if the collection is absent.
+int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid)
+{
+  if (!dry_run)
+    finish_remove_pgs(store);
+  if (!store->collection_exists(coll_t(r_pgid)))
+    return -ENOENT;
+
+  cout << " marking collection for removal" << std::endl;
+  if (dry_run)
+    return 0;
+  ObjectStore::Transaction rmt;
+  int r = mark_pg_for_removal(store, r_pgid, &rmt);
+  if (r < 0) {
+    return r;
+  }
+  ObjectStore::CollectionHandle ch = store->open_collection(coll_t(r_pgid));
+  store->queue_transaction(ch, std::move(rmt));
+  finish_remove_pgs(store);
+  return r;
+}
+
+// Stage the pgmeta info keys for `info` into transaction `t`, using
+// prepare_info_keymap() with a blank last_written_info so everything
+// is rewritten.  Returns prepare_info_keymap()'s status.
+// NOTE(review): on a non-zero ret the keys are still staged into `t`
+// before returning the error -- confirm callers discard the txn.
+int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
+               PastIntervals &past_intervals)
+{
+  //Empty for this
+  coll_t coll(info.pgid);
+  ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
+  map<string,bufferlist> km;
+  string key_to_remove;
+  pg_info_t last_written_info;
+  int ret = prepare_info_keymap(
+    g_ceph_context,
+    &km, &key_to_remove,
+    epoch,
+    info,
+    last_written_info,
+    past_intervals,
+    true, true, false);
+  if (ret) cerr << "Failed to write info" << std::endl;
+  t.omap_setkeys(coll, pgmeta_oid, km);
+  if (!key_to_remove.empty()) {
+    t.omap_rmkey(coll, pgmeta_oid, key_to_remove);
+  }
+  return ret;
+}
+
+typedef map<eversion_t, hobject_t> divergent_priors_t;
+
+// Stage a full PG metadata rewrite (info + log + missing) into `t`.
+// Legacy imports carry divergent_priors instead of a missing set, so
+// the two are mutually exclusive: with divergent entries the log is
+// written without a missing set, otherwise the tracked missing set is
+// written alongside the log.
+int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
+             pg_log_t &log, PastIntervals &past_intervals,
+             divergent_priors_t &divergent,
+             pg_missing_t &missing)
+{
+  cout << __func__ << " epoch " << epoch << " info " << info << std::endl;
+  int ret = write_info(t, epoch, info, past_intervals);
+  if (ret)
+    return ret;
+  coll_t coll(info.pgid);
+  map<string,bufferlist> km;
+
+  if (!divergent.empty()) {
+    // Legacy path: divergent priors imply no explicit missing set.
+    ceph_assert(missing.get_items().empty());
+    PGLog::write_log_and_missing_wo_missing(
+      t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
+  } else {
+    pg_missing_tracker_t tmissing(missing);
+    bool rebuilt_missing_set_with_deletes = missing.may_include_deletes;
+    PGLog::write_log_and_missing(
+      t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true,
+      &rebuilt_missing_set_with_deletes);
+  }
+  t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
+  return 0;
+}
+
+// Trim a PG's on-disk log down to osd_max_pg_log_entries, removing the
+// oldest omap-stored entries in chunks of osd_pg_log_trim_max.  The
+// pgmeta object must exist and have zero data size (all state lives in
+// omap).  Non-log keys (leading '_', special markers, missing/dup
+// entries) are never touched.  Finishes by updating info.log_tail and
+// compacting the store.
+int do_trim_pg_log(ObjectStore *store, const coll_t &coll,
+                   pg_info_t &info, const spg_t &pgid,
+                   epoch_t map_epoch,
+                   PastIntervals &past_intervals)
+{
+  ghobject_t oid = pgid.make_pgmeta_oid();
+  struct stat st;
+  auto ch = store->open_collection(coll);
+  int r = store->stat(ch, oid, &st);
+  ceph_assert(r == 0);
+  ceph_assert(st.st_size == 0);
+
+  cerr << "Log bounds are: " << "(" << info.log_tail << ","
+       << info.last_update << "]" << std::endl;
+
+  uint64_t max_entries = g_ceph_context->_conf->osd_max_pg_log_entries;
+  if (info.last_update.version - info.log_tail.version <= max_entries) {
+    cerr << "Log not larger than osd_max_pg_log_entries " << max_entries << std::endl;
+    return 0;
+  }
+
+  ceph_assert(info.last_update.version > max_entries);
+  version_t trim_to = info.last_update.version - max_entries;
+  size_t trim_at_once = g_ceph_context->_conf->osd_pg_log_trim_max;
+  eversion_t new_tail;
+  bool done = false;
+
+  while (!done) {
+    // gather keys so we can delete them in a batch without
+    // affecting the iterator
+    set<string> keys_to_trim;
+    {
+      ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid);
+      if (!p)
+        break;
+      for (p->seek_to_first(); p->valid(); p->next()) {
+        // Skip everything that is not a plain log entry key.
+        if (p->key()[0] == '_')
+          continue;
+        if (p->key() == "can_rollback_to")
+          continue;
+        if (p->key() == "divergent_priors")
+          continue;
+        if (p->key() == "rollback_info_trimmed_to")
+          continue;
+        if (p->key() == "may_include_deletes_in_missing")
+          continue;
+        if (p->key().substr(0, 7) == string("missing"))
+          continue;
+        if (p->key().substr(0, 4) == string("dup_"))
+          continue;
+
+        bufferlist bl = p->value();
+        auto bp = bl.cbegin();
+        pg_log_entry_t e;
+        try {
+          e.decode_with_checksum(bp);
+        } catch (const buffer::error &e) {
+          // NOTE(review): a failed decode is only reported; the
+          // (partially decoded) entry still drives the trim decision
+          // below -- pre-existing behavior, confirm acceptable.
+          cerr << "Error reading pg log entry: " << e.what() << std::endl;
+        }
+        if (debug) {
+          cerr << "read entry " << e << std::endl;
+        }
+        if (e.version.version > trim_to) {
+          done = true;
+          break;
+        }
+        keys_to_trim.insert(p->key());
+        new_tail = e.version;
+        if (keys_to_trim.size() >= trim_at_once)
+          break;
+      }
+
+      if (!p->valid())
+        done = true;
+    } // deconstruct ObjectMapIterator
+
+    // delete the keys
+    if (!dry_run && !keys_to_trim.empty()) {
+      cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl;
+      ObjectStore::Transaction t;
+      t.omap_rmkeys(coll, oid, keys_to_trim);
+      store->queue_transaction(ch, std::move(t));
+      ch->flush();
+    }
+  }
+
+  // update pg info with new tail
+  if (!dry_run && new_tail != eversion_t()) {
+    info.log_tail = new_tail;
+    ObjectStore::Transaction t;
+    int ret = write_info(t, map_epoch, info, past_intervals);
+    if (ret)
+      return ret;
+    store->queue_transaction(ch, std::move(t));
+    ch->flush();
+  }
+
+  // compact the db since we just removed a bunch of data
+  cerr << "Finished trimming, now compacting..." << std::endl;
+  if (!dry_run)
+    store->compact();
+  return 0;
+}
+
+// Trim a PG's log *dup* entries down to osd_pg_log_dups_tracked, in
+// chunks of osd_pg_log_trim_max.  A sliding window (keys_to_keep) of
+// the newest dup keys is maintained; anything pushed out of the window
+// is deleted.  Loops until a chunk comes back smaller than the chunk
+// size -- which, as warned below, never happens under dry-run.
+int do_trim_pg_log_dups(ObjectStore *store, const coll_t &coll,
+                        pg_info_t &info, const spg_t &pgid,
+                        epoch_t map_epoch,
+                        PastIntervals &past_intervals)
+{
+  ghobject_t oid = pgid.make_pgmeta_oid();
+  struct stat st;
+  auto ch = store->open_collection(coll);
+  int r = store->stat(ch, oid, &st);
+  ceph_assert(r == 0);
+  ceph_assert(st.st_size == 0);
+
+  const size_t max_dup_entries = g_ceph_context->_conf->osd_pg_log_dups_tracked;
+  ceph_assert(max_dup_entries > 0);
+  const size_t max_chunk_size = g_ceph_context->_conf->osd_pg_log_trim_max;
+  ceph_assert(max_chunk_size > 0);
+
+  cout << "max_dup_entries=" << max_dup_entries
+       << " max_chunk_size=" << max_chunk_size << std::endl;
+  if (dry_run) {
+    cout << "Dry run enabled, so when many chunks are needed,"
+         << " the trimming will never stop!" << std::endl;
+  }
+
+  set<string> keys_to_keep;
+  size_t num_removed = 0;
+  do {
+    set<string> keys_to_trim;
+    {
+      ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid);
+      if (!p)
+        break;
+      for (p->seek_to_first(); p->valid(); p->next()) {
+        // Only "dup_*" keys participate; all other pgmeta keys are kept.
+        if (p->key()[0] == '_')
+          continue;
+        if (p->key() == "can_rollback_to")
+          continue;
+        if (p->key() == "divergent_priors")
+          continue;
+        if (p->key() == "rollback_info_trimmed_to")
+          continue;
+        if (p->key() == "may_include_deletes_in_missing")
+          continue;
+        if (p->key().substr(0, 7) == string("missing"))
+          continue;
+        if (p->key().substr(0, 4) != string("dup_"))
+          continue;
+        keys_to_keep.insert(p->key());
+        // Window full: evict the oldest key into the trim set.
+        if (keys_to_keep.size() > max_dup_entries) {
+          auto oldest_to_keep = keys_to_keep.begin();
+          keys_to_trim.emplace(*oldest_to_keep);
+          keys_to_keep.erase(oldest_to_keep);
+        }
+        if (keys_to_trim.size() >= max_chunk_size) {
+          break;
+        }
+      }
+    } // deconstruct ObjectMapIterator
+    // delete the keys
+    num_removed = keys_to_trim.size();
+    if (!dry_run && !keys_to_trim.empty()) {
+      cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl;
+      ObjectStore::Transaction t;
+      t.omap_rmkeys(coll, oid, keys_to_trim);
+      store->queue_transaction(ch, std::move(t));
+      ch->flush();
+    }
+  } while (num_removed == max_chunk_size);
+
+  // compact the db since we just removed a bunch of data
+  cerr << "Finished trimming, now compacting..." << std::endl;
+  if (!dry_run)
+    store->compact();
+  return 0;
+}
+
+// Number of omap key/value pairs pulled per batch during export.
+const int OMAP_BATCH_SIZE = 25;
+// Fill `oset` with up to OMAP_BATCH_SIZE entries from `iter`, clearing
+// it first; the iterator is advanced past the consumed entries.
+void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist> &oset)
+{
+  oset.clear();
+  for (int count = OMAP_BATCH_SIZE; count && iter->valid(); --count, iter->next()) {
+    oset.insert(pair<string, bufferlist>(iter->key(), iter->value()));
+  }
+}
+
+// Serialize one object to the export stream (file_fd) as a sequence of
+// sections: OBJECT_BEGIN (with decoded object_info_t), DATA chunks of
+// at most max_read bytes, ATTRS, OMAP_HDR, OMAP batches, OBJECT_END.
+// Returns 0 on success or a negative store/write error.
+int ObjectStoreTool::export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
+{
+  struct stat st;
+  mysize_t total;
+  footer ft;
+
+  auto ch = store->open_collection(cid);
+  int ret = store->stat(ch, obj, &st);
+  if (ret < 0)
+    return ret;
+
+  cerr << "Read " << obj << std::endl;
+
+  total = st.st_size;
+  if (debug)
+    cerr << "size=" << total << std::endl;
+
+  object_begin objb(obj);
+
+  {
+    bufferptr bp;
+    bufferlist bl;
+    ret = store->getattr(ch, obj, OI_ATTR, bp);
+    if (ret < 0) {
+      cerr << "getattr failure object_info " << ret << std::endl;
+      return ret;
+    }
+    bl.push_back(bp);
+    decode(objb.oi, bl);
+    if (debug)
+      cerr << "object_info: " << objb.oi << std::endl;
+  }
+
+  // NOTE: we include whiteouts, lost, etc.
+
+  ret = write_section(TYPE_OBJECT_BEGIN, objb, file_fd);
+  if (ret < 0)
+    return ret;
+
+  // Stream the object data in max_read-sized chunks.  A zero-byte read
+  // before `total` is exhausted means the stat size was wrong.
+  uint64_t offset = 0;
+  bufferlist rawdatabl;
+  while(total > 0) {
+    rawdatabl.clear();
+    mysize_t len = max_read;
+    if (len > total)
+      len = total;
+
+    ret = store->read(ch, obj, offset, len, rawdatabl);
+    if (ret < 0)
+      return ret;
+    if (ret == 0)
+      return -EINVAL;
+
+    data_section dblock(offset, len, rawdatabl);
+    if (debug)
+      cerr << "data section offset=" << offset << " len=" << len << std::endl;
+
+    total -= ret;
+    offset += ret;
+
+    ret = write_section(TYPE_DATA, dblock, file_fd);
+    if (ret) return ret;
+  }
+
+  //Handle attrs for this object
+  map<string,bufferptr> aset;
+  ret = store->getattrs(ch, obj, aset);
+  if (ret) return ret;
+  attr_section as(aset);
+  ret = write_section(TYPE_ATTRS, as, file_fd);
+  if (ret)
+    return ret;
+
+  if (debug) {
+    cerr << "attrs size " << aset.size() << std::endl;
+  }
+
+  //Handle omap information
+  bufferlist hdrbuf;
+  ret = store->omap_get_header(ch, obj, &hdrbuf, true);
+  if (ret < 0) {
+    cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl;
+    return ret;
+  }
+
+  omap_hdr_section ohs(hdrbuf);
+  ret = write_section(TYPE_OMAP_HDR, ohs, file_fd);
+  if (ret)
+    return ret;
+
+  // Dump the omap in OMAP_BATCH_SIZE batches, one section per batch.
+  ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, obj);
+  if (!iter) {
+    ret = -ENOENT;
+    cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl;
+    return ret;
+  }
+  iter->seek_to_first();
+  int mapcount = 0;
+  map<string, bufferlist> out;
+  while(iter->valid()) {
+    get_omap_batch(iter, out);
+
+    if (out.empty()) break;
+
+    mapcount += out.size();
+    omap_section oms(out);
+    ret = write_section(TYPE_OMAP, oms, file_fd);
+    if (ret)
+      return ret;
+  }
+  if (debug)
+    cerr << "omap map size " << mapcount << std::endl;
+
+  ret = write_simple(TYPE_OBJECT_END, file_fd);
+  if (ret)
+    return ret;
+
+  return 0;
+}
+
+// Export every eligible object of `coll`, listing in batches of 300.
+// pgmeta, temp, and generation (rollback) objects are skipped; meta
+// objects are asserted absent in a PG collection.
+int ObjectStoreTool::export_files(ObjectStore *store, coll_t coll)
+{
+  ghobject_t next;
+  auto ch = store->open_collection(coll);
+  while (!next.is_max()) {
+    vector<ghobject_t> objects;
+    int r = store->collection_list(ch, next, ghobject_t::get_max(), 300,
+                                   &objects, &next);
+    if (r < 0)
+      return r;
+    for (vector<ghobject_t>::iterator i = objects.begin();
+         i != objects.end();
+         ++i) {
+      ceph_assert(!i->hobj.is_meta());
+      if (i->is_pgmeta() || i->hobj.is_temp() || !i->is_no_gen()) {
+        continue;
+      }
+      r = export_file(store, coll, *i);
+      if (r < 0)
+        return r;
+    }
+  }
+  return 0;
+}
+
+// Overwrite (or with --force, create) the incremental osdmap object
+// for epoch `e` in the meta collection with the blob in `bl`.  With
+// e == 0 the epoch is taken from the decoded incremental; an epoch
+// mismatch is fatal unless `force`.  Dry-run stops before writing.
+int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) {
+  OSDMap::Incremental inc;
+  auto it = bl.cbegin();
+  inc.decode(it);
+  if (e == 0) {
+    e = inc.epoch;
+  } else if (e != inc.epoch) {
+    cerr << "incremental.epoch mismatch: "
+         << inc.epoch << " != " << e << std::endl;
+    if (force) {
+      cerr << "But will continue anyway." << std::endl;
+    } else {
+      return -EINVAL;
+    }
+  }
+  auto ch = store->open_collection(coll_t::meta());
+  const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e);
+  if (!store->exists(ch, inc_oid)) {
+    cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl;
+    if (!force) {
+      return -ENOENT;
+    }
+    cout << "Creating a new epoch." << std::endl;
+  }
+  if (dry_run)
+    return 0;
+  ObjectStore::Transaction t;
+  t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl);
+  // Truncate in case the previous map object was longer than bl.
+  t.truncate(coll_t::meta(), inc_oid, bl.length());
+  store->queue_transaction(ch, std::move(t));
+  return 0;
+}
+
+// Read the incremental osdmap blob for epoch `e` from the meta
+// collection into `bl`.  Returns 0 on success, -ENOENT on any read
+// failure.
+int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl)
+{
+  auto ch = store->open_collection(coll_t::meta());
+  int r = store->read(ch, OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl);
+  if (r < 0)
+    return -ENOENT;
+  return 0;
+}
+
+// Overwrite (or with --force, create) the full osdmap object for epoch
+// `e` with the blob in `bl`.  Mirrors set_inc_osdmap(): e == 0 takes
+// the epoch from the decoded map; mismatch is fatal unless `force`;
+// dry-run stops before writing.
+int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) {
+  OSDMap osdmap;
+  osdmap.decode(bl);
+  if (e == 0) {
+    e = osdmap.get_epoch();
+  } else if (e != osdmap.get_epoch()) {
+    cerr << "osdmap.epoch mismatch: "
+         << e << " != " << osdmap.get_epoch() << std::endl;
+    if (force) {
+      cerr << "But will continue anyway." << std::endl;
+    } else {
+      return -EINVAL;
+    }
+  }
+  auto ch = store->open_collection(coll_t::meta());
+  const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e);
+  if (!store->exists(ch, full_oid)) {
+    cerr << "osdmap (" << full_oid << ") does not exist." << std::endl;
+    if (!force) {
+      return -ENOENT;
+    }
+    cout << "Creating a new epoch." << std::endl;
+  }
+  if (dry_run)
+    return 0;
+  ObjectStore::Transaction t;
+  t.write(coll_t::meta(), full_oid, 0, bl.length(), bl);
+  // Truncate in case the previous map object was longer than bl.
+  t.truncate(coll_t::meta(), full_oid, bl.length());
+  store->queue_transaction(ch, std::move(t));
+  return 0;
+}
+
+// Read and decode the full OSDMap for epoch `e` from the meta
+// collection; the raw blob is also returned in `bl`.  Returns 0 on
+// success, -ENOENT when the map object cannot be read.
+int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl)
+{
+  ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta());
+  bool found = store->read(
+    ch, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
+  if (!found) {
+    cerr << "Can't find OSDMap for pg epoch " << e << std::endl;
+    return -ENOENT;
+  }
+  osdmap.decode(bl);
+  if (debug)
+    cerr << osdmap << std::endl;
+  return 0;
+}
+
+// Best-effort load of the pool pg_num history object from the meta
+// collection into *h.  A missing or empty object is silently ignored
+// (read errors are not reported) and *h keeps its prior contents;
+// always returns 0.
+int get_pg_num_history(ObjectStore *store, pool_pg_num_history_t *h)
+{
+  ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta());
+  bufferlist bl;
+  auto pghist = OSD::make_pg_num_history_oid();
+  int r = store->read(ch, pghist, 0, 0, bl, 0);
+  if (r >= 0 && bl.length() > 0) {
+    auto p = bl.cbegin();
+    decode(*h, p);
+  }
+  cout << __func__ << " pg_num_history " << *h << std::endl;
+  return 0;
+}
+
+// Populate ms.osdmap / ms.osdmap_bl for the metadata section's epoch.
+int add_osdmap(ObjectStore *store, metadata_section &ms)
+{
+  return get_osdmap(store, ms.map_epoch, ms.osdmap, ms.osdmap_bl);
+}
+
+// Export one PG to the export stream: super header, PG_BEGIN (with the
+// OSD superblock, SHARDS feature stripped for replicated PGs so any
+// OSD can import), metadata section (placed before object data so a
+// bad import aborts early), all objects, then PG_END.
+int ObjectStoreTool::do_export(
+  CephContext *cct, ObjectStore *fs, coll_t coll, spg_t pgid,
+  pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+  const OSDSuperblock& superblock,
+  PastIntervals &past_intervals)
+{
+  PGLog::IndexedLog log;
+  pg_missing_t missing;
+
+  cerr << "Exporting " << pgid << " info " << info << std::endl;
+
+  int ret = get_log(cct, fs, struct_ver, pgid, info, log, missing);
+  // NOTE(review): get_log returns 0 or a negative errno, so this
+  // `ret > 0` check never fires and log-read failures are ignored --
+  // confirm whether `ret < 0` was intended.
+  if (ret > 0)
+    return ret;
+
+  if (debug) {
+    Formatter *formatter = Formatter::create("json-pretty");
+    ceph_assert(formatter);
+    dump_log(formatter, cerr, log, missing);
+    delete formatter;
+  }
+  write_super();
+
+  pg_begin pgb(pgid, superblock);
+  // Special case: If replicated pg don't require the importing OSD to have shard feature
+  if (pgid.is_no_shard()) {
+    pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+  }
+  ret = write_section(TYPE_PG_BEGIN, pgb, file_fd);
+  if (ret)
+    return ret;
+
+  // The metadata_section is now before files, so import can detect
+  // errors and abort without wasting time.
+  metadata_section ms(
+    struct_ver,
+    map_epoch,
+    info,
+    log,
+    past_intervals,
+    missing);
+  ret = add_osdmap(fs, ms);
+  if (ret)
+    return ret;
+  ret = write_section(TYPE_PG_METADATA, ms, file_fd);
+  if (ret)
+    return ret;
+
+  ret = export_files(fs, coll);
+  if (ret) {
+    cerr << "export_files error " << ret << std::endl;
+    return ret;
+  }
+
+  ret = write_simple(TYPE_PG_END, file_fd);
+  if (ret)
+    return ret;
+
+  return 0;
+}
+
+// Decode a DATA section from `bl` and dump only its offset/length
+// (the payload bytes themselves are not emitted).  Always returns 0.
+int dump_data(Formatter *formatter, bufferlist &bl)
+{
+  auto ebliter = bl.cbegin();
+  data_section ds;
+  ds.decode(ebliter);
+
+  formatter->open_object_section("data_block");
+  formatter->dump_unsigned("offset", ds.offset);
+  formatter->dump_unsigned("len", ds.len);
+  // XXX: Add option to dump data like od -cx ?
+  formatter->close_section();
+  formatter->flush(cout);
+  return 0;
+}
+
+// Decode a DATA section from `bl` and stage the corresponding write
+// into transaction `t` for object `hoid`.  Always returns 0.
+int get_data(ObjectStore *store, coll_t coll, ghobject_t hoid,
+             ObjectStore::Transaction *t, bufferlist &bl)
+{
+  auto ebliter = bl.cbegin();
+  data_section ds;
+  ds.decode(ebliter);
+
+  if (debug)
+    cerr << "\tdata: offset " << ds.offset << " len " << ds.len << std::endl;
+  t->write(coll, hoid, ds.offset, ds.len, ds.databl);
+  return 0;
+}
+
+// Decode an ATTRS section and dump it: the SnapSet (for head objects
+// with no generation), then attributes split into "user" (leading '_'
+// stripped, values cleaned/Base64-flagged) and "system" (names only).
+// Always returns 0.
+int dump_attrs(
+  Formatter *formatter, ghobject_t hoid,
+  bufferlist &bl)
+{
+  auto ebliter = bl.cbegin();
+  attr_section as;
+  as.decode(ebliter);
+
+  // This could have been handled in the caller if we didn't need to
+  // support exports that didn't include object_info_t in object_begin.
+  if (hoid.generation == ghobject_t::NO_GEN &&
+      hoid.hobj.is_head()) {
+    map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR);
+    if (mi != as.data.end()) {
+      SnapSet snapset;
+      auto p = mi->second.cbegin();
+      snapset.decode(p);
+      formatter->open_object_section("snapset");
+      snapset.dump(formatter);
+      formatter->close_section();
+    } else {
+      formatter->open_object_section("snapset");
+      formatter->dump_string("error", "missing SS_ATTR");
+      formatter->close_section();
+    }
+  }
+
+  formatter->open_object_section("attrs");
+  formatter->open_array_section("user");
+  for (auto kv : as.data) {
+    // Skip system attributes
+    if (('_' != kv.first.at(0)) || kv.first.size() == 1)
+      continue;
+    formatter->open_object_section("user_attr");
+    formatter->dump_string("name", kv.first.substr(1));
+    bool b64;
+    formatter->dump_string("value", cleanbin(kv.second, b64));
+    formatter->dump_bool("Base64", b64);
+    formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->open_array_section("system");
+  for (auto kv : as.data) {
+    // Skip user attributes
+    if (('_' == kv.first.at(0)) && kv.first.size() != 1)
+      continue;
+    formatter->open_object_section("sys_attr");
+    formatter->dump_string("name", kv.first);
+    formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->close_section();
+  formatter->flush(cout);
+
+  return 0;
+}
+
+// Decode a TYPE_ATTRS section and queue the xattrs onto the transaction.
+// For a head object, also decode the SnapSet from SS_ATTR and register
+// each existing clone's snaps with the snap mapper so the OSD's snap
+// trimming metadata is rebuilt on import.
+int get_attrs(
+  ObjectStore *store, coll_t coll, ghobject_t hoid,
+  ObjectStore::Transaction *t, bufferlist &bl,
+  OSDriver &driver, SnapMapper &snap_mapper)
+{
+  auto ebliter = bl.cbegin();
+  attr_section as;
+  as.decode(ebliter);
+
+  auto ch = store->open_collection(coll);
+  if (debug)
+    cerr << "\tattrs: len " << as.data.size() << std::endl;
+  t->setattrs(coll, hoid, as.data);
+
+  // This could have been handled in the caller if we didn't need to
+  // support exports that didn't include object_info_t in object_begin.
+  if (hoid.generation == ghobject_t::NO_GEN &&
+      hoid.hobj.is_head()) {
+    map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR);
+    if (mi != as.data.end()) {
+      SnapSet snapset;
+      auto p = mi->second.cbegin();
+      snapset.decode(p);
+      cout << "snapset " << snapset << std::endl;
+      for (auto& p : snapset.clone_snaps) {
+        ghobject_t clone = hoid;
+        clone.hobj.snap = p.first;
+        set<snapid_t> snaps(p.second.begin(), p.second.end());
+        if (!store->exists(ch, clone)) {
+          // no clone, skip. this is probably a cache pool. this works
+          // because we use a separate transaction per object and clones
+          // come before head in the archive.
+          if (debug)
+            cerr << "\tskipping missing " << clone << " (snaps "
+                 << snaps << ")" << std::endl;
+          continue;
+        }
+        if (debug)
+          cerr << "\tsetting " << clone.hobj << " snaps " << snaps
+               << std::endl;
+        OSDriver::OSTransaction _t(driver.get_transaction(t));
+        ceph_assert(!snaps.empty());
+        snap_mapper.add_oid(clone.hobj, snaps, &_t);
+      }
+    } else {
+      // head object without a SnapSet attribute is unexpected but not fatal
+      cerr << "missing SS_ATTR on " << hoid << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Decode a TYPE_OMAP_HDR section and print the omap header value.
+int dump_omap_hdr(Formatter *formatter, bufferlist &bl)
+{
+  omap_hdr_section section;
+  auto cursor = bl.cbegin();
+  section.decode(cursor);
+
+  string header(section.hdr.c_str(), section.hdr.length());
+  formatter->open_object_section("omap_header");
+  formatter->dump_string("value", header);
+  formatter->close_section();
+  formatter->flush(cout);
+  return 0;
+}
+
+// Decode a TYPE_OMAP_HDR section and queue it as the object's omap header.
+int get_omap_hdr(ObjectStore *store, coll_t coll, ghobject_t hoid,
+                 ObjectStore::Transaction *t, bufferlist &bl)
+{
+  omap_hdr_section section;
+  auto cursor = bl.cbegin();
+  section.decode(cursor);
+
+  if (debug) {
+    cerr << "\tomap header: " << string(section.hdr.c_str(), section.hdr.length())
+         << std::endl;
+  }
+  t->omap_setheader(coll, hoid, section.hdr);
+  return 0;
+}
+
+// Decode a TYPE_OMAP section and print every key/value pair; values are
+// run through cleanbin() and flagged when base64 encoding was required.
+int dump_omap(Formatter *formatter, bufferlist &bl)
+{
+  omap_section section;
+  auto cursor = bl.cbegin();
+  section.decode(cursor);
+
+  formatter->open_object_section("omaps");
+  formatter->dump_unsigned("count", section.omap.size());
+  formatter->open_array_section("data");
+  for (auto entry : section.omap) {
+    formatter->open_object_section("omap");
+    formatter->dump_string("name", entry.first);
+    bool base64;
+    formatter->dump_string("value", cleanbin(entry.second, base64));
+    formatter->dump_bool("Base64", base64);
+    formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->close_section();
+  formatter->flush(cout);
+  return 0;
+}
+
+// Decode a TYPE_OMAP section and queue its key/value pairs onto the
+// transaction as omap entries for the object.
+int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid,
+             ObjectStore::Transaction *t, bufferlist &bl)
+{
+  omap_section section;
+  auto cursor = bl.cbegin();
+  section.decode(cursor);
+
+  if (debug)
+    cerr << "\tomap: size " << section.omap.size() << std::endl;
+  t->omap_setkeys(coll, hoid, section.omap);
+  return 0;
+}
+
+// Decode an object_begin header from bl, pretty-print the oid and
+// object_info_t, then read and dump the object's per-section stream
+// (data/attrs/omap header/omap) from the export until TYPE_OBJECT_END.
+// Temporary objects are rejected: a valid export never contains them.
+// Returns 0 on success, negative errno on failure.
+int ObjectStoreTool::dump_object(Formatter *formatter,
+                                bufferlist &bl)
+{
+  auto ebliter = bl.cbegin();
+  object_begin ob;
+  ob.decode(ebliter);
+
+  if (ob.hoid.hobj.is_temp()) {
+    cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
+    return -EFAULT;
+  }
+
+  formatter->open_object_section("object");
+  formatter->open_object_section("oid");
+  ob.hoid.dump(formatter);
+  formatter->close_section();
+  formatter->open_object_section("object_info");
+  ob.oi.dump(formatter);
+  formatter->close_section();
+
+  bufferlist ebl;
+  bool done = false;
+  while(!done) {
+    sectiontype_t type;
+    int ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+    //cout << "\t\tsection size " << ebl.length() << std::endl;
+    // Unknown (future) section types are skipped rather than fatal so
+    // newer exports can still be partially dumped.
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown object section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_DATA:
+      if (dry_run) break;
+      ret = dump_data(formatter, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_ATTRS:
+      if (dry_run) break;
+      ret = dump_attrs(formatter, ob.hoid, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_OMAP_HDR:
+      if (dry_run) break;
+      ret = dump_omap_hdr(formatter, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_OMAP:
+      if (dry_run) break;
+      ret = dump_omap(formatter, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_OBJECT_END:
+      done = true;
+      break;
+    default:
+      cerr << "Unknown section type " << type << std::endl;
+      return -EFAULT;
+    }
+  }
+  formatter->close_section();
+  return 0;
+}
+
+// Import a single object from the export stream into collection coll.
+// Decodes the object_begin header from bl, validates that the object maps
+// to this PG/shard under origmap (objects that hash elsewhere are skipped
+// and *skipped_objects is set), then replays the object's section stream
+// into one transaction which is queued at the end.  Hit-set-namespace
+// objects bypass the placement check.  Returns 0 on success or skip,
+// negative errno on failure.
+int ObjectStoreTool::get_object(ObjectStore *store,
+				OSDriver& driver,
+				SnapMapper& mapper,
+				coll_t coll,
+				bufferlist &bl, OSDMap &origmap,
+				bool *skipped_objects)
+{
+  ObjectStore::Transaction tran;
+  ObjectStore::Transaction *t = &tran;
+  auto ebliter = bl.cbegin();
+  object_begin ob;
+  ob.decode(ebliter);
+
+  if (ob.hoid.hobj.is_temp()) {
+    cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
+    return -EFAULT;
+  }
+  ceph_assert(g_ceph_context);
+
+  auto ch = store->open_collection(coll);
+  if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) {
+    object_t oid = ob.hoid.hobj.oid;
+    object_locator_t loc(ob.hoid.hobj);
+    pg_t raw_pgid = origmap.object_locator_to_pg(oid, loc);
+    pg_t pgid = origmap.raw_pg_to_pg(raw_pgid);
+
+    spg_t coll_pgid;
+    if (coll.is_pg(&coll_pgid) == false) {
+      cerr << "INTERNAL ERROR: Bad collection during import" << std::endl;
+      return -EFAULT;
+    }
+    if (coll_pgid.shard != ob.hoid.shard_id) {
+      cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard
+	   << " but object shard is " << ob.hoid.shard_id << std::endl;
+      return -EFAULT;
+    }
+
+    // Object hashes to a different PG under the export's map (e.g. the
+    // export predates a split); drain its sections and move on.
+    if (coll_pgid.pgid != pgid) {
+      cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl;
+      *skipped_objects = true;
+      skip_object(bl);
+      return 0;
+    }
+  }
+
+  if (!dry_run)
+    t->touch(coll, ob.hoid);
+
+  cout << "Write " << ob.hoid << std::endl;
+
+  bufferlist ebl;
+  bool done = false;
+  while(!done) {
+    sectiontype_t type;
+    int ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+    //cout << "\t\tsection size " << ebl.length() << std::endl;
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown object section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_DATA:
+      if (dry_run) break;
+      ret = get_data(store, coll, ob.hoid, t, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_ATTRS:
+      if (dry_run) break;
+      ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper);
+      if (ret) return ret;
+      break;
+    case TYPE_OMAP_HDR:
+      if (dry_run) break;
+      ret = get_omap_hdr(store, coll, ob.hoid, t, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_OMAP:
+      if (dry_run) break;
+      ret = get_omap(store, coll, ob.hoid, t, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_OBJECT_END:
+      done = true;
+      break;
+    default:
+      cerr << "Unknown section type " << type << std::endl;
+      return -EFAULT;
+    }
+  }
+  if (!dry_run) {
+    wait_until_done(t, [&] {
+      store->queue_transaction(ch, std::move(*t));
+      ch->flush();
+    });
+  }
+  return 0;
+}
+
+// Decode a TYPE_PG_METADATA section into ms and pretty-print its pieces
+// (struct version, map epoch, OSDMap, pg_info_t, log, missing set).
+// Fails if the embedded OSDMap's epoch disagrees with the recorded
+// map_epoch (epoch 0 means an old export without an OSDMap).
+int dump_pg_metadata(Formatter *formatter, bufferlist &bl, metadata_section &ms)
+{
+  auto ebliter = bl.cbegin();
+  ms.decode(ebliter);
+
+  formatter->open_object_section("metadata_section");
+
+  formatter->dump_unsigned("pg_disk_version", (int)ms.struct_ver);
+  formatter->dump_unsigned("map_epoch", ms.map_epoch);
+
+  formatter->open_object_section("OSDMap");
+  ms.osdmap.dump(formatter);
+  formatter->close_section();
+  formatter->flush(cout);
+  cout << std::endl;
+
+  formatter->open_object_section("info");
+  ms.info.dump(formatter);
+  formatter->close_section();
+  formatter->flush(cout);
+
+  formatter->open_object_section("log");
+  ms.log.dump(formatter);
+  formatter->close_section();
+  formatter->flush(cout);
+
+  formatter->open_object_section("pg_missing_t");
+  ms.missing.dump(formatter);
+  formatter->close_section();
+
+  // XXX: ms.past_intervals?
+
+  formatter->close_section();
+  formatter->flush(cout);
+
+  if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) {
+    cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl;
+    return -EFAULT;
+  }
+
+  return 0;
+}
+
+// Decode a TYPE_PG_METADATA section into ms and validate it against the
+// local OSD superblock: the embedded OSDMap epoch must match map_epoch,
+// must not be newer than the OSD's current epoch, and (unless --force)
+// must not predate the OSD's oldest map.  ms.info.pgid is overwritten
+// with the target pgid so the rest of the import uses the local PG id.
+int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms,
+    const OSDSuperblock& sb, spg_t pgid)
+{
+  auto ebliter = bl.cbegin();
+  ms.decode(ebliter);
+  spg_t old_pgid = ms.info.pgid;
+  ms.info.pgid = pgid;
+
+  if (debug) {
+    cout << "export pgid " << old_pgid << std::endl;
+    cout << "struct_v " << (int)ms.struct_ver << std::endl;
+    cout << "map epoch " << ms.map_epoch << std::endl;
+
+#ifdef DIAGNOSTIC
+    Formatter *formatter = new JSONFormatter(true);
+    formatter->open_object_section("stuff");
+
+    formatter->open_object_section("importing OSDMap");
+    ms.osdmap.dump(formatter);
+    formatter->close_section();
+    formatter->flush(cout);
+    cout << std::endl;
+
+    cout << "osd current epoch " << sb.current_epoch << std::endl;
+
+    formatter->open_object_section("info");
+    ms.info.dump(formatter);
+    formatter->close_section();
+    formatter->flush(cout);
+    cout << std::endl;
+
+    formatter->open_object_section("log");
+    ms.log.dump(formatter);
+    formatter->close_section();
+    formatter->flush(cout);
+    cout << std::endl;
+
+    formatter->close_section();
+    formatter->flush(cout);
+    cout << std::endl;
+#endif
+  }
+
+  if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) {
+    cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl;
+    return -EFAULT;
+  }
+
+  if (ms.map_epoch > sb.current_epoch) {
+    cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl;
+    cerr << "The OSD you are using is older than the exported PG" << std::endl;
+    cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl;
+    return -EINVAL;
+  }
+
+  // Old exports didn't include OSDMap
+  if (ms.osdmap.get_epoch() == 0) {
+    cerr << "WARNING: No OSDMap in old export, this is an ancient export."
+      " Not supported." << std::endl;
+    return -EINVAL;
+  }
+
+  if (ms.osdmap.get_epoch() < sb.oldest_map) {
+    cerr << "PG export's map " << ms.osdmap.get_epoch()
+	 << " is older than OSD's oldest_map " << sb.oldest_map << std::endl;
+    if (!force) {
+      cerr << " pass --force to proceed anyway (with incomplete PastIntervals)"
+	   << std::endl;
+      return -EINVAL;
+    }
+  }
+  if (debug) {
+    cerr << "Import pgid " << ms.info.pgid << std::endl;
+    cerr << "Previous past_intervals " << ms.past_intervals << std::endl;
+    cerr << "history.same_interval_since "
+	 << ms.info.history.same_interval_since << std::endl;
+  }
+
+  return 0;
+}
+
+// Partition divergent priors into those that still map to import_pgid
+// under curmap ("out") and those that do not ("reject").  Temporary
+// objects are always rejected; hit-set-namespace objects are kept
+// unconditionally since they are not remapped by placement.
+void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap,
+    const string &hit_set_namespace, const divergent_priors_t &in,
+    divergent_priors_t &out, divergent_priors_t &reject)
+{
+  out.clear();
+  reject.clear();
+
+  for (const auto &prior : in) {
+    // Reject divergent priors for temporary objects
+    if (prior.second.is_temp()) {
+      reject.insert(prior);
+      continue;
+    }
+
+    if (prior.second.nspace == hit_set_namespace) {
+      out.insert(prior);
+      continue;
+    }
+
+    object_t oid = prior.second.oid;
+    object_locator_t loc(prior.second);
+    pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
+    pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
+
+    if (import_pgid.pgid == pgid)
+      out.insert(prior);
+    else
+      reject.insert(prior);
+  }
+}
+
+// Pretty-print an entire PG export stream: validate the super header,
+// require the first section to be TYPE_PG_BEGIN (pool exports are not
+// supported), then dump objects and the metadata section until
+// TYPE_PG_END.  A missing metadata section is a fatal error.
+int ObjectStoreTool::dump_export(Formatter *formatter)
+{
+  bufferlist ebl;
+  pg_info_t info;
+  PGLog::IndexedLog log;
+  //bool skipped_objects = false;
+
+  int ret = read_super();
+  if (ret)
+    return ret;
+
+  if (sh.magic != super_header::super_magic) {
+    cerr << "Invalid magic number" << std::endl;
+    return -EFAULT;
+  }
+
+  if (sh.version > super_header::super_ver) {
+    cerr << "Can't handle export format version=" << sh.version << std::endl;
+    return -EINVAL;
+  }
+
+  formatter->open_object_section("Export");
+
+  //First section must be TYPE_PG_BEGIN
+  sectiontype_t type;
+  ret = read_section(&type, &ebl);
+  if (ret)
+    return ret;
+  if (type == TYPE_POOL_BEGIN) {
+    cerr << "Dump of pool exports not supported" << std::endl;
+    return -EINVAL;
+  } else if (type != TYPE_PG_BEGIN) {
+    cerr << "Invalid first section type " << std::to_string(type) << std::endl;
+    return -EFAULT;
+  }
+
+  auto ebliter = ebl.cbegin();
+  pg_begin pgb;
+  pgb.decode(ebliter);
+  spg_t pgid = pgb.pgid;
+
+  formatter->dump_string("pgid", stringify(pgid));
+  formatter->dump_string("cluster_fsid", stringify(pgb.superblock.cluster_fsid));
+  formatter->dump_string("features", stringify(pgb.superblock.compat_features));
+
+  bool done = false;
+  bool found_metadata = false;
+  metadata_section ms;
+  // The "objects" array is opened lazily on the first TYPE_OBJECT_BEGIN
+  // so an export with no objects produces no empty array.
+  bool objects_started = false;
+  while(!done) {
+    ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    if (debug) {
+      cerr << "dump_export: Section type " << std::to_string(type) << std::endl;
+    }
+    if (type >= END_OF_TYPES) {
+      cerr << "Skipping unknown section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_OBJECT_BEGIN:
+      if (!objects_started) {
+	formatter->open_array_section("objects");
+	objects_started = true;
+      }
+      ret = dump_object(formatter, ebl);
+      if (ret) return ret;
+      break;
+    case TYPE_PG_METADATA:
+      if (objects_started)
+	cerr << "WARNING: metadata_section out of order" << std::endl;
+      ret = dump_pg_metadata(formatter, ebl, ms);
+      if (ret) return ret;
+      found_metadata = true;
+      break;
+    case TYPE_PG_END:
+      if (objects_started) {
+	formatter->close_section();
+      }
+      done = true;
+      break;
+    default:
+      cerr << "Unknown section type " << std::to_string(type) << std::endl;
+      return -EFAULT;
+    }
+  }
+
+  if (!found_metadata) {
+    cerr << "Missing metadata section" << std::endl;
+    return -EFAULT;
+  }
+
+  formatter->close_section();
+  formatter->flush(cout);
+
+  return 0;
+}
+
+// Import a PG from the export stream into the local ObjectStore.
+//
+// Validates the export (magic, version, first-section type, cluster fsid,
+// compat features, OSDMap epochs), refuses to import if the pool is gone
+// or the PG collection already exists, checks for conflicting splits or
+// merges, creates the PG collection marked "_remove" so a partial import
+// is cleaned up, replays every object, filters log/divergent priors/
+// missing down to entries that map to this PG, writes PG metadata, and
+// finally clears the removal flag.
+//
+// Positive return values are exit statuses used by test code:
+//   10 = pool no longer exists, 11 = incompatible features (no --force),
+//   12 = split/merge target collision.  Negative returns are errnos.
+int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
+			       bool force, std::string pgidstr)
+{
+  bufferlist ebl;
+  pg_info_t info;
+  PGLog::IndexedLog log;
+  bool skipped_objects = false;
+
+  if (!dry_run)
+    finish_remove_pgs(store);
+
+  int ret = read_super();
+  if (ret)
+    return ret;
+
+  if (sh.magic != super_header::super_magic) {
+    cerr << "Invalid magic number" << std::endl;
+    return -EFAULT;
+  }
+
+  if (sh.version > super_header::super_ver) {
+    cerr << "Can't handle export format version=" << sh.version << std::endl;
+    return -EINVAL;
+  }
+
+  //First section must be TYPE_PG_BEGIN
+  sectiontype_t type;
+  ret = read_section(&type, &ebl);
+  if (ret)
+    return ret;
+  if (type == TYPE_POOL_BEGIN) {
+    cerr << "Pool exports cannot be imported into a PG" << std::endl;
+    return -EINVAL;
+  } else if (type != TYPE_PG_BEGIN) {
+    cerr << "Invalid first section type " << std::to_string(type) << std::endl;
+    return -EFAULT;
+  }
+
+  auto ebliter = ebl.cbegin();
+  pg_begin pgb;
+  pgb.decode(ebliter);
+  spg_t pgid = pgb.pgid;
+
+  if (pgidstr.length()) {
+    spg_t user_pgid;
+
+    bool ok = user_pgid.parse(pgidstr.c_str());
+    // This succeeded in main() already
+    ceph_assert(ok);
+    if (pgid != user_pgid) {
+      cerr << "specified pgid " << user_pgid
+	   << " does not match actual pgid " << pgid << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  if (!pgb.superblock.cluster_fsid.is_zero()
+      && pgb.superblock.cluster_fsid != sb.cluster_fsid) {
+    cerr << "Export came from different cluster with fsid "
+         << pgb.superblock.cluster_fsid << std::endl;
+    return -EINVAL;
+  }
+
+  if (debug) {
+    cerr << "Exported features: " << pgb.superblock.compat_features << std::endl;
+  }
+
+  // Special case: Old export has SHARDS incompat feature on replicated pg, remove it
+  if (pgid.is_no_shard())
+    pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+
+  if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+    CompatSet unsupported = sb.compat_features.unsupported(pgb.superblock.compat_features);
+
+    cerr << "Export has incompatible features set " << unsupported << std::endl;
+
+    // Let them import if they specify the --force option
+    if (!force)
+        return 11;  // Positive return means exit status
+  }
+
+  // we need the latest OSDMap to check for collisions
+  OSDMap curmap;
+  bufferlist bl;
+  ret = get_osdmap(store, sb.current_epoch, curmap, bl);
+  if (ret) {
+    cerr << "Can't find latest local OSDMap " << sb.current_epoch << std::endl;
+    return ret;
+  }
+  if (!curmap.have_pg_pool(pgid.pgid.m_pool)) {
+    cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl;
+    // Special exit code for this error, used by test code
+    return 10;  // Positive return means exit status
+  }
+
+  pool_pg_num_history_t pg_num_history;
+  get_pg_num_history(store, &pg_num_history);
+
+  ghobject_t pgmeta_oid = pgid.make_pgmeta_oid();
+
+  // Check for PG already present.
+  coll_t coll(pgid);
+  if (store->collection_exists(coll)) {
+    cerr << "pgid " << pgid << " already exists" << std::endl;
+    return -EEXIST;
+  }
+
+  ObjectStore::CollectionHandle ch;
+
+  OSDriver driver(
+    store,
+    coll_t(),
+    OSD::make_snapmapper_oid());
+  SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pgid.shard);
+
+  cout << "Importing pgid " << pgid;
+  cout << std::endl;
+
+  bool done = false;
+  bool found_metadata = false;
+  metadata_section ms;
+  while(!done) {
+    ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    if (debug) {
+      cout << __func__ << ": Section type " << std::to_string(type) << std::endl;
+    }
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_OBJECT_BEGIN:
+      // Objects must follow the metadata section; we need ms.osdmap to
+      // verify each object's placement.
+      ceph_assert(found_metadata);
+      ret = get_object(store, driver, mapper, coll, ebl, ms.osdmap,
+		       &skipped_objects);
+      if (ret) return ret;
+      break;
+    case TYPE_PG_METADATA:
+      ret = get_pg_metadata(store, ebl, ms, sb, pgid);
+      if (ret) return ret;
+      found_metadata = true;
+
+      if (pgid != ms.info.pgid) {
+	cerr << "specified pgid " << pgid << " does not match import file pgid "
+	     << ms.info.pgid << std::endl;
+	return -EINVAL;
+      }
+
+      // make sure there are no conflicting splits or merges
+      if (ms.osdmap.have_pg_pool(pgid.pgid.pool())) {
+	auto p = pg_num_history.pg_nums.find(pgid.pgid.m_pool);
+	if (p != pg_num_history.pg_nums.end() &&
+	    !p->second.empty()) {
+	  unsigned start_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool());
+	  unsigned pg_num = start_pg_num;
+	  // Walk every local pg_num change since the export's epoch and
+	  // refuse to import if any resulting merge target or split child
+	  // collection already exists on this OSD.
+	  for (auto q = p->second.lower_bound(ms.map_epoch);
+	       q != p->second.end();
+	       ++q) {
+	    unsigned new_pg_num = q->second;
+	    cout << "pool " << pgid.pgid.pool() << " pg_num " << pg_num
+		 << " -> " << new_pg_num << std::endl;
+
+	    // check for merge target
+	    spg_t target;
+	    if (pgid.is_merge_source(pg_num, new_pg_num, &target)) {
+	      // FIXME: this checks assumes the OSD's PG is at the OSD's
+	      // map epoch; it could be, say, at *our* epoch, pre-merge.
+	      coll_t coll(target);
+	      if (store->collection_exists(coll)) {
+		cerr << "pgid " << pgid << " merges to target " << target
+		     << " which already exists" << std::endl;
+		return 12;
+	      }
+	    }
+
+	    // check for split children
+	    set<spg_t> children;
+	    if (pgid.is_split(start_pg_num, new_pg_num, &children)) {
+	      cerr << " children are " << children << std::endl;
+	      for (auto child : children) {
+		coll_t coll(child);
+		if (store->collection_exists(coll)) {
+		  cerr << "pgid " << pgid << " splits to " << children
+		       << " and " << child << " exists" << std::endl;
+		  return 12;
+		}
+	      }
+	    }
+	    pg_num = new_pg_num;
+	  }
+	}
+      } else {
+	cout << "pool " << pgid.pgid.pool() << " doesn't exist, not checking"
+	     << " for splits or merges" << std::endl;
+      }
+
+      if (!dry_run) {
+	ObjectStore::Transaction t;
+	ch = store->create_new_collection(coll);
+	create_pg_collection(
+	  t, pgid,
+	  pgid.get_split_bits(ms.osdmap.get_pg_pool(pgid.pool())->get_pg_num()));
+	init_pg_ondisk(t, pgid, NULL);
+
+	// mark this coll for removal until we're done
+	map<string,bufferlist> values;
+	encode((char)1, values["_remove"]);
+	t.omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
+
+	store->queue_transaction(ch, std::move(t));
+      }
+
+      break;
+    case TYPE_PG_END:
+      ceph_assert(found_metadata);
+      done = true;
+      break;
+    default:
+      cerr << "Unknown section type " << std::to_string(type) << std::endl;
+      return -EFAULT;
+    }
+  }
+
+  if (!found_metadata) {
+    cerr << "Missing metadata section" << std::endl;
+    return -EFAULT;
+  }
+
+  ObjectStore::Transaction t;
+  if (!dry_run) {
+    pg_log_t newlog, reject;
+    pg_log_t::filter_log(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace,
+      ms.log, newlog, reject);
+    if (debug) {
+      for (list<pg_log_entry_t>::iterator i = newlog.log.begin();
+           i != newlog.log.end(); ++i)
+        cerr << "Keeping log entry " << *i << std::endl;
+      for (list<pg_log_entry_t>::iterator i = reject.log.begin();
+           i != reject.log.end(); ++i)
+        cerr << "Skipping log entry " << *i << std::endl;
+    }
+
+    divergent_priors_t newdp, rejectdp;
+    filter_divergent_priors(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace,
+      ms.divergent_priors, newdp, rejectdp);
+    ms.divergent_priors = newdp;
+    if (debug) {
+      for (divergent_priors_t::iterator i = newdp.begin();
+           i != newdp.end(); ++i)
+        cerr << "Keeping divergent_prior " << *i << std::endl;
+      for (divergent_priors_t::iterator i = rejectdp.begin();
+           i != rejectdp.end(); ++i)
+        cerr << "Skipping divergent_prior " << *i << std::endl;
+    }
+
+    // Drop missing-set entries that no longer map to this PG (hit-set
+    // objects are always kept).
+    ms.missing.filter_objects([&](const hobject_t &obj) {
+      if (obj.nspace == g_ceph_context->_conf->osd_hit_set_namespace)
+        return false;
+      ceph_assert(!obj.is_temp());
+      object_t oid = obj.oid;
+      object_locator_t loc(obj);
+      pg_t raw_pgid = ms.osdmap.object_locator_to_pg(oid, loc);
+      pg_t _pgid = ms.osdmap.raw_pg_to_pg(raw_pgid);
+
+      return pgid.pgid != _pgid;
+    });
+
+
+    if (debug) {
+      Formatter *formatter = Formatter::create("json-pretty");
+      dump_log(formatter, cerr, newlog, ms.missing);
+      delete formatter;
+    }
+
+    // Just like a split invalidate stats since the object count is changed
+    if (skipped_objects)
+      ms.info.stats.stats_invalid = true;
+
+    ret = write_pg(
+      t,
+      ms.map_epoch,
+      ms.info,
+      newlog,
+      ms.past_intervals,
+      ms.divergent_priors,
+      ms.missing);
+    if (ret) return ret;
+  }
+
+  // done, clear removal flag
+  if (debug)
+    cerr << "done, clearing removal flag" << std::endl;
+
+  if (!dry_run) {
+    t.omap_rmkey(coll, pgid.make_pgmeta_oid(), "_remove");
+    wait_until_done(&t, [&] {
+      store->queue_transaction(ch, std::move(t));
+      // make sure we flush onreadable items before mapper/driver are destroyed.
+      ch->flush();
+    });
+  }
+  return 0;
+}
+
+// List objects matching 'object'/'nspace' (across all PGs, or just the
+// one named by pgidstr) and dump the matches via the formatter.
+int do_list(ObjectStore *store, string pgidstr, string object, boost::optional<std::string> nspace,
+	    Formatter *formatter, bool debug, bool human_readable, bool head)
+{
+  lookup_ghobject lookup(object, nspace, head);
+  int r = pgidstr.empty()
+    ? action_on_all_objects(store, lookup, debug)
+    : action_on_all_objects_in_pg(store, pgidstr, lookup, debug);
+  if (r)
+    return r;
+  lookup.dump(formatter, human_readable);
+  formatter->flush(cout);
+  return 0;
+}
+
+// List objects whose access exceeds 'threshold' (across all PGs, or just
+// the one named by pgidstr) and dump the matches via the formatter.
+int do_list_slow(ObjectStore *store, string pgidstr, string object,
+	    double threshold, Formatter *formatter, bool debug, bool human_readable)
+{
+  lookup_slow_ghobject lookup(object, threshold);
+  int r = pgidstr.empty()
+    ? action_on_all_objects(store, lookup, debug)
+    : action_on_all_objects_in_pg(store, pgidstr, lookup, debug);
+  if (r)
+    return r;
+  lookup.dump(formatter, human_readable);
+  formatter->flush(cout);
+  return 0;
+}
+
+// List objects in the meta collection matching 'object' and dump them.
+int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable)
+{
+  boost::optional<std::string> nspace; // Not specified
+  lookup_ghobject lookup(object, nspace);
+  int r = action_on_all_objects_in_exact_pg(store, coll_t::meta(), lookup, debug);
+  if (r)
+    return r;
+  lookup.dump(formatter, human_readable);
+  formatter->flush(cout);
+  return 0;
+}
+
+// What remove_object()/do_remove_object() should delete.
+enum rmtype {
+  BOTH,       // remove the object and its snap-mapper entries
+  SNAPMAP,    // remove only the snap-mapper entries
+  NOSNAPMAP   // remove only the object, leaving snap-mapper entries
+};
+
+// Queue removal of an object and/or its snap-mapper entries, depending
+// on 'type'.  A missing snap-mapper entry (-ENOENT) is not an error.
+int remove_object(coll_t coll, ghobject_t &ghobj,
+  SnapMapper &mapper,
+  MapCacher::Transaction<std::string, bufferlist> *_t,
+  ObjectStore::Transaction *t,
+  enum rmtype type)
+{
+  const bool unmap_snaps = (type == BOTH || type == SNAPMAP);
+  const bool remove_obj = (type == BOTH || type == NOSNAPMAP);
+
+  if (unmap_snaps) {
+    int r = mapper.remove_oid(ghobj.hobj, _t);
+    if (r < 0 && r != -ENOENT) {
+      cerr << "remove_oid returned " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+  }
+
+  if (remove_obj)
+    t->remove(coll, ghobj);
+  return 0;
+}
+
+int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent);
+
+// Remove an object (and, with 'all', its clones) from the store.
+// With snapset present: refuses to remove a head that still has clones
+// unless 'all' (remove clones too) or 'force' (remove head only) is set.
+// 'force' also allows proceeding when the snapset itself is unreadable.
+// 'type' selects whether the object, its snap-mapper entries, or both
+// are removed.
+int do_remove_object(ObjectStore *store, coll_t coll,
+		     ghobject_t &ghobj, bool all, bool force, enum rmtype type)
+{
+  auto ch = store->open_collection(coll);
+  spg_t pg;
+  coll.is_pg_prefix(&pg);
+  OSDriver driver(
+    store,
+    coll_t(),
+    OSD::make_snapmapper_oid());
+  SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pg.shard);
+  struct stat st;
+
+  int r = store->stat(ch, ghobj, &st);
+  if (r < 0) {
+    cerr << "remove: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  SnapSet ss;
+  if (ghobj.hobj.has_snapset()) {
+    r = get_snapset(store, coll, ghobj, ss, false);
+    if (r < 0) {
+      cerr << "Can't get snapset error " << cpp_strerror(r) << std::endl;
+      // If --force and bad snapset let them remove the head
+      if (!(force && !all))
+	return r;
+    }
+//    cout << "snapset " << ss << std::endl;
+    if (!ss.clone_snaps.empty() && !all) {
+      if (force) {
+	cout << "WARNING: only removing "
+	     << (ghobj.hobj.is_head() ? "head" : "snapdir")
+	     << " with clones present" << std::endl;
+	// clearing clone_snaps makes the loop below a no-op, so only the
+	// head/snapdir itself is removed
+	ss.clone_snaps.clear();
+      } else {
+	cerr << "Clones are present, use removeall to delete everything"
+	     << std::endl;
+	return -EINVAL;
+      }
+    }
+  }
+
+  ObjectStore::Transaction t;
+  OSDriver::OSTransaction _t(driver.get_transaction(&t));
+
+  // remove clones first, then the head object itself
+  ghobject_t snapobj = ghobj;
+  for (auto& p : ss.clone_snaps) {
+    snapobj.hobj.snap = p.first;
+    cout << "remove clone " << snapobj << std::endl;
+    if (!dry_run) {
+      r = remove_object(coll, snapobj, mapper, &_t, &t, type);
+      if (r < 0)
+	return r;
+    }
+  }
+
+  cout << "remove " << ghobj << std::endl;
+
+  if (!dry_run) {
+    r = remove_object(coll, ghobj, mapper, &_t, &t, type);
+    if (r < 0)
+      return r;
+  }
+
+  if (!dry_run) {
+    wait_until_done(&t, [&] {
+      store->queue_transaction(ch, std::move(t));
+      ch->flush();
+    });
+  }
+  return 0;
+}
+
+// Print the name of every xattr on an object, one per line; names are
+// cleaned of binary characters when stdout is a tty.
+int do_list_attrs(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+{
+  auto ch = store->open_collection(coll);
+  map<string,bufferptr> attrs;
+  int r = store->getattrs(ch, ghobj, attrs);
+  if (r < 0) {
+    cerr << "getattrs: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  for (auto &attr : attrs) {
+    string key = attr.first;
+    if (outistty)
+      key = cleanbin(key);
+    cout << key << std::endl;
+  }
+  return 0;
+}
+
+// Print the name of every omap key on an object, one per line, reading
+// the keys in batches via get_omap_batch().
+int do_list_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+{
+  auto ch = store->open_collection(coll);
+  ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, ghobj);
+  if (!iter) {
+    cerr << "omap_get_iterator: " << cpp_strerror(ENOENT) << std::endl;
+    return -ENOENT;
+  }
+  iter->seek_to_first();
+  map<string, bufferlist> batch;
+  while (iter->valid()) {
+    get_omap_batch(iter, batch);
+
+    for (auto &entry : batch) {
+      string key = entry.first;
+      if (outistty)
+	key = cleanbin(key);
+      cout << key << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Stream an object's data to fd in max_read-sized chunks.
+// Returns 0 on success, negative errno on failure.
+int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
+{
+  auto ch = store->open_collection(coll);
+  struct stat st;
+  mysize_t total;
+
+  int ret = store->stat(ch, ghobj, &st);
+  if (ret < 0) {
+    cerr << "get-bytes: " << cpp_strerror(ret) << std::endl;
+    return ret;
+  }
+
+  total = st.st_size;
+  if (debug)
+    cerr << "size=" << total << std::endl;
+
+  uint64_t offset = 0;
+  bufferlist rawdatabl;
+  while(total > 0) {
+    rawdatabl.clear();
+    mysize_t len = max_read;
+    if (len > total)
+      len = total;
+
+    ret = store->read(ch, ghobj, offset, len, rawdatabl);
+    if (ret < 0)
+      return ret;
+    if (ret == 0)
+      return -EINVAL;   // object shorter than stat() reported; avoid spinning
+
+    if (debug)
+      cerr << "data section offset=" << offset << " len=" << len << std::endl;
+
+    total -= ret;
+    offset += ret;
+
+    // write(2) may write fewer bytes than requested; loop until the whole
+    // chunk is flushed and retry on EINTR.  (The previous code used the
+    // result of a single write() call and silently dropped the remainder
+    // of a short write.)
+    const char *data = rawdatabl.c_str();
+    ssize_t remaining = ret;
+    while (remaining > 0) {
+      ssize_t written = write(fd, data, remaining);
+      if (written == -1) {
+	if (errno == EINTR)
+	  continue;
+	perror("write");
+	return -errno;
+      }
+      data += written;
+      remaining -= written;
+    }
+  }
+
+  return 0;
+}
+
+// Replace an object's data with the contents of fd: truncate to zero,
+// then append chunks of up to max_read bytes until EOF.  The whole
+// operation is queued as a single transaction.
+int do_set_bytes(ObjectStore *store, coll_t coll,
+		 ghobject_t &ghobj, int fd)
+{
+  ObjectStore::Transaction tran;
+  ObjectStore::Transaction *t = &tran;
+
+  if (debug)
+    cerr << "Write " << ghobj << std::endl;
+
+  if (!dry_run) {
+    t->touch(coll, ghobj);
+    t->truncate(coll, ghobj, 0);
+  }
+
+  uint64_t offset = 0;
+  bufferlist rawdatabl;
+  do {
+    rawdatabl.clear();
+    ssize_t bytes = rawdatabl.read_fd(fd, max_read);
+    if (bytes < 0) {
+      cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+      return bytes;
+    }
+
+    // EOF
+    if (bytes == 0)
+      break;
+
+    if (debug)
+      cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl;
+    if (!dry_run)
+      t->write(coll, ghobj, offset, bytes, rawdatabl);
+
+    offset += bytes;
+    // XXX: Should we queue_transaction() every once in a while for very large files
+  } while(true);
+
+  auto ch = store->open_collection(coll);
+  if (!dry_run)
+    store->queue_transaction(ch, std::move(*t));
+  return 0;
+}
+
+// Print the value of a single xattr on stdout.  When stdout is a tty the
+// value is passed through cleanbin() and newline-terminated so binary
+// data does not garble the terminal.
+int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
+{
+  bufferptr bp;
+  auto ch = store->open_collection(coll);
+
+  int r = store->getattr(ch, ghobj, key.c_str(), bp);
+  if (r < 0) {
+    cerr << "getattr: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  string value(bp.c_str(), bp.length());
+  if (!outistty) {
+    cout << value;
+    return 0;
+  }
+
+  value = cleanbin(value);
+  value.push_back('\n');
+  cout << value;
+  return 0;
+}
+
+// Set a single xattr on an object, reading the value from fd.
+int do_set_attr(ObjectStore *store, coll_t coll,
+		ghobject_t &ghobj, string key, int fd)
+{
+  bufferlist value;
+
+  if (debug)
+    cerr << "Setattr " << ghobj << std::endl;
+
+  int ret = get_fd_data(fd, value);
+  if (ret < 0)
+    return ret;
+
+  if (dry_run)
+    return 0;
+
+  ObjectStore::Transaction txn;
+  txn.touch(coll, ghobj);
+  txn.setattr(coll, ghobj, key, value);
+
+  auto ch = store->open_collection(coll);
+  store->queue_transaction(ch, std::move(txn));
+  return 0;
+}
+
+// Remove a single xattr from an object.
+int do_rm_attr(ObjectStore *store, coll_t coll,
+	       ghobject_t &ghobj, string key)
+{
+  if (debug)
+    cerr << "Rmattr " << ghobj << std::endl;
+
+  if (dry_run)
+    return 0;
+
+  ObjectStore::Transaction txn;
+  txn.rmattr(coll, ghobj, key);
+
+  auto ch = store->open_collection(coll);
+  store->queue_transaction(ch, std::move(txn));
+  return 0;
+}
+
+// Print the value of a single omap key on stdout; binary values are
+// cleaned and newline-terminated when stdout is a tty.
+int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
+{
+  auto ch = store->open_collection(coll);
+  set<string> keys{key};
+  map<string, bufferlist> values;
+
+  int r = store->omap_get_values(ch, ghobj, keys, &values);
+  if (r < 0) {
+    cerr << "omap_get_values: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (values.empty()) {
+    cerr << "Key not found" << std::endl;
+    return -ENOENT;
+  }
+
+  ceph_assert(values.size() == 1);
+
+  bufferlist bl = values.begin()->second;
+  string value(bl.c_str(), bl.length());
+  if (outistty) {
+    value = cleanbin(value);
+    value.push_back('\n');
+  }
+  cout << value;
+
+  return 0;
+}
+
+int do_set_omap(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, string key, int fd)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ map<string, bufferlist> attrset;
+ bufferlist valbl;
+
+ if (debug)
+ cerr << "Set_omap " << ghobj << std::endl;
+
+ int ret = get_fd_data(fd, valbl);
+ if (ret < 0)
+ return ret;
+
+ attrset.insert(pair<string, bufferlist>(key, valbl));
+
+ if (dry_run)
+ return 0;
+
+ t->touch(coll, ghobj);
+
+ t->omap_setkeys(coll, ghobj, attrset);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_rm_omap(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, string key)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (debug)
+ cerr << "Rm_omap " << ghobj << std::endl;
+
+ if (dry_run)
+ return 0;
+
+ t->omap_rmkey(coll, ghobj, key);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+{
+ auto ch = store->open_collection(coll);
+ bufferlist hdrbl;
+
+ int r = store->omap_get_header(ch, ghobj, &hdrbl, true);
+ if (r < 0) {
+ cerr << "omap_get_header: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ string header(hdrbl.c_str(), hdrbl.length());
+ if (outistty) {
+ header = cleanbin(header);
+ header.push_back('\n');
+ }
+ cout << header;
+
+ return 0;
+}
+
+int do_set_omaphdr(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, int fd)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ bufferlist hdrbl;
+
+ if (debug)
+ cerr << "Omap_setheader " << ghobj << std::endl;
+
+ int ret = get_fd_data(fd, hdrbl);
+ if (ret)
+ return ret;
+
+ if (dry_run)
+ return 0;
+
+ t->touch(coll, ghobj);
+
+ t->omap_setheader(coll, ghobj, hdrbl);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
// Visitor for "--op fix-lost": for each object whose object_info_t has the
// LOST flag set, clears the flag and rewrites the OI_ATTR xattr.
// Under --dry-run only reports what would be fixed.
struct do_fix_lost : public action_on_object_t {
  void call(ObjectStore *store, coll_t coll,
            ghobject_t &ghobj, object_info_t &oi) override {
    if (oi.is_lost()) {
      cout << coll << "/" << ghobj << " is lost";
      if (!dry_run)
        cout << ", fixing";
      cout << std::endl;
      if (dry_run)
        return;
      oi.clear_flag(object_info_t::FLAG_LOST);
      bufferlist bl;
      encode(oi, bl, -1); /* fixme: using full features */
      ObjectStore::Transaction t;
      t.setattr(coll, ghobj, OI_ATTR, bl);
      auto ch = store->open_collection(coll);
      store->queue_transaction(ch, std::move(t));
    }
    return;
  }
};
+
+int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false)
+{
+ auto ch = store->open_collection(coll);
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, SS_ATTR, attr);
+ if (r < 0) {
+ if (!silent)
+ cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ auto bp = attr.cbegin();
+ try {
+ decode(ss, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
// Dump everything known about one object to `formatter`: its id, decoded
// object_info_t, store stat(), SnapSet (head/snapdir objects only),
// erasure-coding hash info if present, and the backend onode.
// Each piece is best-effort: an error is reported to stderr but the dump
// continues; the last error code observed is returned (0 if none).
int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
{
  auto ch = store->open_collection(coll);
  int r = 0;
  formatter->open_object_section("obj");
  formatter->open_object_section("id");
  ghobj.dump(formatter);
  formatter->close_section();

  // object_info_t from the OI_ATTR xattr
  bufferlist attr;
  int gr = store->getattr(ch, ghobj, OI_ATTR, attr);
  if (gr < 0) {
    r = gr;
    cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
         << cpp_strerror(r) << std::endl;
  } else {
    object_info_t oi;
    auto bp = attr.cbegin();
    try {
      decode(oi, bp);
      formatter->open_object_section("info");
      oi.dump(formatter);
      formatter->close_section();
    } catch (...) {
      r = -EINVAL;
      cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
           << cpp_strerror(r) << std::endl;
    }
  }
  // backend stat (size/blocks as the objectstore sees them)
  struct stat st;
  int sr = store->stat(ch, ghobj, &st, true);
  if (sr < 0) {
    r = sr;
    cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
         << cpp_strerror(r) << std::endl;
  } else {
    formatter->open_object_section("stat");
    formatter->dump_int("size", st.st_size);
    formatter->dump_int("blksize", st.st_blksize);
    formatter->dump_int("blocks", st.st_blocks);
    formatter->dump_int("nlink", st.st_nlink);
    formatter->close_section();
  }

  // SnapSet only exists on head/snapdir objects
  if (ghobj.hobj.has_snapset()) {
    SnapSet ss;
    int snr = get_snapset(store, coll, ghobj, ss);
    if (snr < 0) {
      r = snr;
    } else {
      formatter->open_object_section("SnapSet");
      ss.dump(formatter);
      formatter->close_section();
    }
  }
  // EC hash info attr, present only on erasure-coded pool shards;
  // its absence (gr != 0) is not an error
  bufferlist hattr;
  gr = store->getattr(ch, ghobj, ECUtil::get_hinfo_key(), hattr);
  if (gr == 0) {
    ECUtil::HashInfo hinfo;
    auto hp = hattr.cbegin();
    try {
      decode(hinfo, hp);
      formatter->open_object_section("hinfo");
      hinfo.dump(formatter);
      formatter->close_section();
    } catch (...) {
      r = -EINVAL;
      cerr << "Error decoding hinfo on : " << make_pair(coll, ghobj) << ", "
           << cpp_strerror(r) << std::endl;
    }
  }
  // backend onode dump; return value intentionally not propagated
  gr = store->dump_onode(ch, ghobj, "onode", formatter);

  formatter->close_section();
  formatter->flush(cout);
  cout << std::endl;
  return r;
}
+
+int corrupt_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
+{
+ auto ch = store->open_collection(coll);
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (!dry_run) {
+ attr.clear();
+ oi.alloc_hint_flags += 0xff;
+ ObjectStore::Transaction t;
+ encode(oi, attr, -1); /* fixme: using full features */
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ auto ch = store->open_collection(coll);
+ r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+int set_size(
+ ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter,
+ bool corrupt)
+{
+ auto ch = store->open_collection(coll);
+ if (ghobj.hobj.is_snapdir()) {
+ cerr << "Can't set the size of a snapdir" << std::endl;
+ return -EINVAL;
+ }
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ struct stat st;
+ r = store->stat(ch, ghobj, &st, true);
+ if (r < 0) {
+ cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ ghobject_t head(ghobj);
+ SnapSet ss;
+ bool found_head = true;
+ map<snapid_t, uint64_t>::iterator csi;
+ bool is_snap = ghobj.hobj.is_snap();
+ if (is_snap) {
+ head.hobj = head.hobj.get_head();
+ r = get_snapset(store, coll, head, ss, true);
+ if (r < 0 && r != -ENOENT) {
+ // Requested get_snapset() silent, so if not -ENOENT show error
+ cerr << "Error getting snapset on : " << make_pair(coll, head) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (r == -ENOENT) {
+ head.hobj = head.hobj.get_snapdir();
+ r = get_snapset(store, coll, head, ss);
+ if (r < 0)
+ return r;
+ found_head = false;
+ } else {
+ found_head = true;
+ }
+ csi = ss.clone_size.find(ghobj.hobj.snap);
+ if (csi == ss.clone_size.end()) {
+ cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl;
+ return -EINVAL;
+ }
+ }
+ if ((uint64_t)st.st_size == setsize && oi.size == setsize
+ && (!is_snap || csi->second == setsize)) {
+ cout << "Size of object is already " << setsize << std::endl;
+ return 0;
+ }
+ cout << "Setting size to " << setsize << ", stat size " << st.st_size
+ << ", obj info size " << oi.size;
+ if (is_snap) {
+ cout << ", " << (found_head ? "head" : "snapdir")
+ << " clone_size " << csi->second;
+ csi->second = setsize;
+ }
+ cout << std::endl;
+ if (!dry_run) {
+ attr.clear();
+ oi.size = setsize;
+ ObjectStore::Transaction t;
+ // Only modify object info if we want to corrupt it
+ if (!corrupt && (uint64_t)st.st_size != setsize) {
+ t.truncate(coll, ghobj, setsize);
+ // Changing objectstore size will invalidate data_digest, so clear it.
+ oi.clear_data_digest();
+ }
+ encode(oi, attr, -1); /* fixme: using full features */
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ if (is_snap) {
+ bufferlist snapattr;
+ snapattr.clear();
+ encode(ss, snapattr);
+ t.setattr(coll, head, SS_ATTR, snapattr);
+ }
+ auto ch = store->open_collection(coll);
+ r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+int clear_data_digest(ObjectStore *store, coll_t coll, ghobject_t &ghobj) {
+ auto ch = store->open_collection(coll);
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (!dry_run) {
+ attr.clear();
+ oi.clear_data_digest();
+ encode(oi, attr, -1); /* fixme: using full features */
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ auto ch = store->open_collection(coll);
+ r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
// Deliberately damage parts of an object's SnapSet (TEST USE).  `arg`
// selects what to clear/corrupt:
//   "corrupt"  - zero seq, clear clone_snaps, clones, clone_overlap and
//                clone_size (everything below)
//   "seq"      - zero SnapSet.seq only
//   "snaps"    - clear SnapSet.clone_snaps only
//   ""         - clear clones, clone_overlap and clone_size
//   "clones" / "clone_overlap" / "clone_size" - clear just that member
//   "size"     - increment every recorded clone size by 1
// Honors --dry-run.
int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj,
                  string arg)
{
  SnapSet ss;
  int ret = get_snapset(store, coll, ghobj, ss);
  if (ret < 0)
    return ret;

  // Use "corrupt" to clear entire SnapSet
  // Use "seq" to just corrupt SnapSet.seq
  if (arg == "corrupt" || arg == "seq")
    ss.seq = 0;
  // Use "snaps" to just clear SnapSet.clone_snaps
  if (arg == "corrupt" || arg == "snaps")
    ss.clone_snaps.clear();
  // By default just clear clone, clone_overlap and clone_size
  if (arg == "corrupt")
    arg = "";
  if (arg == "" || arg == "clones")
    ss.clones.clear();
  if (arg == "" || arg == "clone_overlap")
    ss.clone_overlap.clear();
  if (arg == "" || arg == "clone_size")
    ss.clone_size.clear();
  // Break all clone sizes by adding 1
  if (arg == "size") {
    for (map<snapid_t, uint64_t>::iterator i = ss.clone_size.begin();
         i != ss.clone_size.end(); ++i)
      ++(i->second);
  }

  if (!dry_run) {
    bufferlist bl;
    encode(ss, bl);
    ObjectStore::Transaction t;
    t.setattr(coll, ghobj, SS_ATTR, bl);
    auto ch = store->open_collection(coll);
    int r = store->queue_transaction(ch, std::move(t));
    if (r < 0) {
      cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
           << cpp_strerror(r) << std::endl;
      return r;
    }
  }
  return 0;
}
+
+vector<snapid_t>::iterator find(vector<snapid_t> &v, snapid_t clid)
+{
+ return std::find(v.begin(), v.end(), clid);
+}
+
+map<snapid_t, interval_set<uint64_t> >::iterator
+find(map<snapid_t, interval_set<uint64_t> > &m, snapid_t clid)
+{
+ return m.find(clid);
+}
+
+map<snapid_t, uint64_t>::iterator find(map<snapid_t, uint64_t> &m,
+ snapid_t clid)
+{
+ return m.find(clid);
+}
+
+template<class T>
+int remove_from(T &mv, string name, snapid_t cloneid, bool force)
+{
+ typename T::iterator i = find(mv, cloneid);
+ if (i != mv.end()) {
+ mv.erase(i);
+ } else {
+ cerr << "Clone " << cloneid << " doesn't exist in " << name;
+ if (force) {
+ cerr << " (ignored)" << std::endl;
+ return 0;
+ }
+ cerr << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int remove_clone(
+ ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force)
+{
+ // XXX: Don't allow this if in a cache tier or former cache tier
+ // bool allow_incomplete_clones() const {
+ // return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+
+ SnapSet snapset;
+ int ret = get_snapset(store, coll, ghobj, snapset);
+ if (ret < 0)
+ return ret;
+
+ // Derived from trim_object()
+ // ...from snapset
+ vector<snapid_t>::iterator p;
+ for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p)
+ if (*p == cloneid)
+ break;
+ if (p == snapset.clones.end()) {
+ cerr << "Clone " << cloneid << " not present";
+ return -ENOENT;
+ }
+ if (p != snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = ghobj.hobj;
+ prev_coid.snap = *n;
+ //bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+ //if (adjust_prev_bytes)
+ // ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
+
+ snapset.clone_overlap[*n].intersection_of(
+ snapset.clone_overlap[*p]);
+
+ //if (adjust_prev_bytes)
+ // ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
+ }
+
+ ret = remove_from(snapset.clones, "clones", cloneid, force);
+ if (ret) return ret;
+ ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force);
+ if (ret) return ret;
+ ret = remove_from(snapset.clone_size, "clone_size", cloneid, force);
+ if (ret) return ret;
+
+ if (dry_run)
+ return 0;
+
+ bufferlist bl;
+ encode(snapset, bl);
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, SS_ATTR, bl);
+ auto ch = store->open_collection(coll);
+ int r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ cout << "Removal of clone " << cloneid << " complete" << std::endl;
+ cout << "Use pg repair after OSD restarted to correct stat information" << std::endl;
+ return 0;
+}
+
+int dup(string srcpath, ObjectStore *src, string dstpath, ObjectStore *dst)
+{
+ cout << "dup from " << src->get_type() << ": " << srcpath << "\n"
+ << " to " << dst->get_type() << ": " << dstpath
+ << std::endl;
+ int num, i;
+ vector<coll_t> collections;
+ int r;
+
+ r = src->mount();
+ if (r < 0) {
+ cerr << "failed to mount src: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ r = dst->mount();
+ if (r < 0) {
+ cerr << "failed to mount dst: " << cpp_strerror(r) << std::endl;
+ goto out_src;
+ }
+
+ if (src->get_fsid() != dst->get_fsid()) {
+ cerr << "src fsid " << src->get_fsid() << " != dest " << dst->get_fsid()
+ << std::endl;
+ goto out;
+ }
+ cout << "fsid " << src->get_fsid() << std::endl;
+
+ // make sure dst is empty
+ r = dst->list_collections(collections);
+ if (r < 0) {
+ cerr << "error listing collections on dst: " << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+ if (!collections.empty()) {
+ cerr << "destination store is not empty" << std::endl;
+ goto out;
+ }
+
+ r = src->list_collections(collections);
+ if (r < 0) {
+ cerr << "error listing collections on src: " << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+
+ num = collections.size();
+ cout << num << " collections" << std::endl;
+ i = 1;
+ for (auto cid : collections) {
+ cout << i++ << "/" << num << " " << cid << std::endl;
+ auto ch = src->open_collection(cid);
+ auto dch = dst->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ int bits = src->collection_bits(ch);
+ if (bits < 0) {
+ if (src->get_type() == "filestore" && cid.is_meta()) {
+ bits = 0;
+ } else {
+ cerr << "cannot get bit count for collection " << cid << ": "
+ << cpp_strerror(bits) << std::endl;
+ goto out;
+ }
+ }
+ t.create_collection(cid, bits);
+ dst->queue_transaction(dch, std::move(t));
+ }
+
+ ghobject_t pos;
+ uint64_t n = 0;
+ uint64_t bytes = 0, keys = 0;
+ while (true) {
+ vector<ghobject_t> ls;
+ r = src->collection_list(ch, pos, ghobject_t::get_max(), 1000, &ls, &pos);
+ if (r < 0) {
+ cerr << "collection_list on " << cid << " from " << pos << " got: "
+ << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+ if (ls.empty()) {
+ break;
+ }
+
+ for (auto& oid : ls) {
+ //cout << " " << cid << " " << oid << std::endl;
+ if (n % 100 == 0) {
+ cout << " " << std::setw(16) << n << " objects, "
+ << std::setw(16) << bytes << " bytes, "
+ << std::setw(16) << keys << " keys"
+ << std::setw(1) << "\r" << std::flush;
+ }
+ n++;
+
+ ObjectStore::Transaction t;
+ t.touch(cid, oid);
+
+ map<string,bufferptr> attrs;
+ src->getattrs(ch, oid, attrs);
+ if (!attrs.empty()) {
+ t.setattrs(cid, oid, attrs);
+ }
+
+ bufferlist bl;
+ src->read(ch, oid, 0, 0, bl);
+ if (bl.length()) {
+ t.write(cid, oid, 0, bl.length(), bl);
+ bytes += bl.length();
+ }
+
+ bufferlist header;
+ map<string,bufferlist> omap;
+ src->omap_get(ch, oid, &header, &omap);
+ if (header.length()) {
+ t.omap_setheader(cid, oid, header);
+ ++keys;
+ }
+ if (!omap.empty()) {
+ keys += omap.size();
+ t.omap_setkeys(cid, oid, omap);
+ }
+
+ dst->queue_transaction(dch, std::move(t));
+ }
+ }
+ cout << " " << std::setw(16) << n << " objects, "
+ << std::setw(16) << bytes << " bytes, "
+ << std::setw(16) << keys << " keys"
+ << std::setw(1) << std::endl;
+ }
+
+ // keyring
+ cout << "keyring" << std::endl;
+ {
+ bufferlist bl;
+ string s = srcpath + "/keyring";
+ string err;
+ r = bl.read_file(s.c_str(), &err);
+ if (r < 0) {
+ cerr << "failed to copy " << s << ": " << err << std::endl;
+ } else {
+ string d = dstpath + "/keyring";
+ bl.write_file(d.c_str(), 0600);
+ }
+ }
+
+ // osd metadata
+ cout << "duping osd metadata" << std::endl;
+ {
+ for (auto k : {"magic", "whoami", "ceph_fsid", "fsid"}) {
+ string val;
+ src->read_meta(k, &val);
+ dst->write_meta(k, val);
+ }
+ }
+
+ dst->write_meta("ready", "ready");
+
+ cout << "done." << std::endl;
+ r = 0;
+ out:
+ dst->umount();
+ out_src:
+ src->umount();
+ return r;
+}
+
+
+const int ceph_entity_name_type(const string name)
+{
+ if (name == "mds") return CEPH_ENTITY_TYPE_MDS;
+ if (name == "osd") return CEPH_ENTITY_TYPE_OSD;
+ if (name == "mon") return CEPH_ENTITY_TYPE_MON;
+ if (name == "client") return CEPH_ENTITY_TYPE_CLIENT;
+ if (name == "mgr") return CEPH_ENTITY_TYPE_MGR;
+ if (name == "auth") return CEPH_ENTITY_TYPE_AUTH;
+ return -1;
+}
+
+eversion_t get_eversion_from_str(const string& s) {
+ eversion_t e;
+ vector<string> result;
+ boost::split(result, s, boost::is_any_of("'"));
+ if (result.size() != 2) {
+ cerr << "eversion_t: invalid format: '" << s << "'" << std::endl;
+ return e;
+ }
+ e.epoch = atoi(result[0].c_str());
+ e.version = atoi(result[1].c_str());
+ return e;
+}
+
+osd_reqid_t get_reqid_from_str(const string& s) {
+ osd_reqid_t reqid;
+
+ vector<string> result;
+ boost::split(result, s, boost::is_any_of(".:"));
+ if (result.size() != 4) {
+ cerr << "reqid: invalid format " << s << std::endl;
+ return osd_reqid_t();
+ }
+ reqid.name._type = ceph_entity_name_type(result[0]);
+ reqid.name._num = atoi(result[1].c_str());
+
+ reqid.inc = atoi(result[2].c_str());
+ reqid.tid = atoi(result[3].c_str());
+ return reqid;
+}
+
// Flush the accumulated dup entries into the PG's pgmeta object with a
// single omap_setkeys transaction, then clear the map for reuse.
// NOTE: the misspelled name ("transction") is kept; callers use it.
void do_dups_inject_transction(ObjectStore *store, spg_t r_pgid, map<string,bufferlist> *new_dups)
{
  ObjectStore::Transaction t;
  coll_t coll(r_pgid);
  cerr << "injecting dups into pgid:" << r_pgid << " num of dups:" << new_dups->size() << std::endl;
  t.omap_setkeys(coll, r_pgid.make_pgmeta_oid(), (*new_dups));
  auto ch = store->open_collection(coll);
  store->queue_transaction(ch, std::move(t));
  new_dups->clear();
}
+
// Build pg_log_dup_t entries from one JSON object and stage them in
// *new_dups (keyed by pg_log_dup_t::get_key_name()).  Required fields:
// "reqid", "version", "user_version", "return_code"; optional "generate"=N
// creates N entries with consecutive versions instead of one.
// Returns 1 if a required field is missing, 0 otherwise.
int do_dups_inject_object(ObjectStore *store, spg_t r_pgid, json_spirit::mObject &in_json_obj,
                          map<string,bufferlist> *new_dups, bool debug) {
  std::map<std::string, json_spirit::mValue>::const_iterator it = in_json_obj.find("generate");
  int32_t generate = 0;
  if (it != in_json_obj.end()) {
    generate = atoi(it->second.get_str().c_str());
  }

  it = in_json_obj.find("reqid");
  if (it == in_json_obj.end()) {
    return 1;
  }
  osd_reqid_t reqid(get_reqid_from_str(it->second.get_str()));
  it = in_json_obj.find("version");
  if (it == in_json_obj.end()) {
    return 1;
  }
  eversion_t version(get_eversion_from_str(it->second.get_str()));
  it = in_json_obj.find("user_version");
  if (it == in_json_obj.end()) {
    return 1;
  }
  version_t user_version = atoi(it->second.get_str().c_str());
  it = in_json_obj.find("return_code");
  if (it == in_json_obj.end()) {
    return 1;
  }
  int32_t return_code = atoi(it->second.get_str().c_str());
  if (generate) {
    // bulk mode: synthesize `generate` dups with consecutive versions
    for(auto i = 0; i < generate; ++i) {
      version.version++;
      if (debug) {
        cout << "generate dups reqid " << reqid << " v=" << version << std::endl;
      }
      pg_log_dup_t tmp(version, user_version, reqid, return_code);
      bufferlist bl;
      encode(tmp, bl);
      (*new_dups)[tmp.get_key_name()] = std::move(bl);
      // flush periodically so the staging map doesn't grow unbounded
      if ( new_dups->size() > 50000 ) {
        do_dups_inject_transction(store, r_pgid, new_dups);
        cout << "inject of " << i << " dups into pgid:" << r_pgid << " done..." << std::endl;
      }
    }
    return 0;
  } else {
    // single-entry mode
    pg_log_dup_t tmp(version, user_version, reqid, return_code);
    if (debug) {
      cout << "adding dup: " << tmp << "into key:" << tmp.get_key_name() << std::endl;
    }
    bufferlist bl;
    encode(tmp, bl);
    (*new_dups)[tmp.get_key_name()] = std::move(bl);
  }
  return 0;
}
+
+void do_dups_inject_from_json(ObjectStore *store, spg_t r_pgid, json_spirit::mValue &inJson, bool debug)
+{
+ map<string,bufferlist> new_dups;
+ const vector<json_spirit::mValue>& o = inJson.get_array();
+ for (const auto& obj : o) {
+ if (obj.type() == json_spirit::obj_type) {
+ json_spirit::mObject Mobj = obj.get_obj();
+ do_dups_inject_object(store, r_pgid, Mobj, &new_dups, debug);
+ } else {
+ throw std::runtime_error("JSON array/object not allowed type:" + obj.type());
+ return;
+ }
+ }
+ if (new_dups.size() > 0) {
+ do_dups_inject_transction(store, r_pgid, &new_dups);
+ }
+
+
+ return ;
+}
+
+void usage(po::options_description &desc)
+{
+ cerr << std::endl;
+ cerr << desc << std::endl;
+ cerr << std::endl;
+ cerr << "Positional syntax:" << std::endl;
+ cerr << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> (get|set)-bytes [file]" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-(attr|omap) <key> [file]" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> (get|rm)-(attr|omap) <key>" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> get-omaphdr" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-omaphdr [file]" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> remove|removeall" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> dump" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-size" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> clear-data-digest" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>" << std::endl;
+ cerr << std::endl;
+ cerr << "<object> can be a JSON object description as displayed" << std::endl;
+ cerr << "by --op list." << std::endl;
+ cerr << "<object> can be an object name which will be looked up in all" << std::endl;
+ cerr << "the OSD's PGs." << std::endl;
+ cerr << "<object> can be the empty string ('') which with a provided pgid " << std::endl;
+ cerr << "specifies the pgmeta object" << std::endl;
+ cerr << std::endl;
+ cerr << "The optional [file] argument will read stdin or write stdout" << std::endl;
+ cerr << "if not specified or if '-' specified." << std::endl;
+}
+
// True if `check` ends with the suffix `ending` (empty suffix matches).
bool ends_with(const string& check, const string& ending)
{
  if (check.size() < ending.size())
    return false;
  return check.compare(check.size() - ending.size(), ending.size(), ending) == 0;
}
+
// Based on FileStore::dump_journal(), set-up enough to only dump
// Dumps a filestore journal at `journalpath` into formatter `f` without
// mounting the store.  Returns _fdump()'s result, or -EINVAL for an empty
// path.  NOTE(review): the raw new/delete pair leaks if _fdump throws —
// acceptable for a one-shot CLI path.
int mydump_journal(Formatter *f, string journalpath, bool m_journal_dio)
{
  int r;

  if (!journalpath.length())
    return -EINVAL;

  FileJournal *journal = new FileJournal(g_ceph_context, uuid_d(), NULL, NULL,
                                         journalpath.c_str(), m_journal_dio);
  r = journal->_fdump(*f, false);
  delete journal;
  return r;
}
+
// Apply the current filestore layout (directory split/merge) settings to
// every PG collection selected by `pool_name` or `pgid`, descending to
// `target_level`.  A no-op (returning success) for non-filestore backends.
// Honors dry_run (prints what would be done).
int apply_layout_settings(ObjectStore *os, const OSDSuperblock &superblock,
                          const string &pool_name, const spg_t &pgid, bool dry_run,
                          int target_level)
{
  int r = 0;

  FileStore *fs = dynamic_cast<FileStore*>(os);
  if (!fs) {
    cerr << "Nothing to do for non-filestore backend" << std::endl;
    return 0; // making this return success makes testing easier
  }

  // need the OSDMap to translate a pool name into a pool id
  OSDMap curmap;
  bufferlist bl;
  r = get_osdmap(os, superblock.current_epoch, curmap, bl);
  if (r) {
    cerr << "Can't find local OSDMap: " << cpp_strerror(r) << std::endl;
    return r;
  }

  int64_t poolid = -1;
  if (pool_name.length()) {
    poolid = curmap.lookup_pg_pool_name(pool_name);
    if (poolid < 0) {
      cerr << "Couldn't find pool " << pool_name << ": " << cpp_strerror(poolid)
           << std::endl;
      return poolid;
    }
  }

  vector<coll_t> collections, filtered_colls;
  r = os->list_collections(collections);
  if (r < 0) {
    cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
    return r;
  }

  // keep only PG collections matching the requested pool or the exact pgid
  for (auto const &coll : collections) {
    spg_t coll_pgid;
    if (coll.is_pg(&coll_pgid) &&
        ((poolid >= 0 && coll_pgid.pool() == (uint64_t)poolid) ||
          coll_pgid == pgid)) {
      filtered_colls.push_back(coll);
    }
  }

  size_t done = 0, total = filtered_colls.size();
  for (auto const &coll : filtered_colls) {
    if (dry_run) {
      cerr << "Would apply layout settings to " << coll << std::endl;
    } else {
      cerr << "Finished " << done << "/" << total << " collections" << "\r";
      r = fs->apply_layout_settings(coll, target_level);
      if (r < 0) {
        cerr << "Error applying layout settings to " << coll << std::endl;
        return r;
      }
    }
    ++done;
  }

  cerr << "Finished " << total << "/" << total << " collections" << "\r" << std::endl;
  return r;
}
+
+int main(int argc, char **argv)
+{
+ string dpath, jpath, pgidstr, op, file, mountpoint, mon_store_path, object;
+ string target_data_path, fsid;
+ string objcmd, arg1, arg2, type, format, argnspace, pool, rmtypestr;
+ boost::optional<std::string> nspace;
+ spg_t pgid;
+ unsigned epoch = 0;
+ unsigned slow_threshold = 16;
+ ghobject_t ghobj;
+ bool human_readable;
+ Formatter *formatter;
+ bool head, tty;
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help", "produce help message")
+ ("type", po::value<string>(&type),
+ "Arg is one of [bluestore (default), filestore, memstore]")
+ ("data-path", po::value<string>(&dpath),
+ "path to object store, mandatory")
+ ("journal-path", po::value<string>(&jpath),
+ "path to journal, use if tool can't find it")
+ ("pgid", po::value<string>(&pgidstr),
+ "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, trim-pg-log-dups and mandatory for apply-layout-settings if --pool is not specified")
+ ("pool", po::value<string>(&pool),
+ "Pool name, mandatory for apply-layout-settings if --pgid is not specified")
+ ("op", po::value<string>(&op),
+ "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, list-slow-omap, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, trim-pg-log-dups statfs]")
+ ("epoch", po::value<unsigned>(&epoch),
+ "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
+ ("file", po::value<string>(&file),
+ "path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
+ ("mon-store-path", po::value<string>(&mon_store_path),
+ "path of monstore to update-mon-db")
+ ("fsid", po::value<string>(&fsid),
+ "fsid for new store created by mkfs")
+ ("target-data-path", po::value<string>(&target_data_path),
+ "path of target object store (for --op dup)")
+ ("mountpoint", po::value<string>(&mountpoint),
+ "fuse mountpoint")
+ ("format", po::value<string>(&format)->default_value("json-pretty"),
+ "Output format which may be json, json-pretty, xml, xml-pretty")
+ ("debug", "Enable diagnostic output to stderr")
+ ("no-mon-config", "Do not contact mons for config")
+ ("no-superblock", "Do not read superblock")
+ ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE")
+ ("skip-journal-replay", "Disable journal replay")
+ ("skip-mount-omap", "Disable mounting of omap")
+ ("head", "Find head/snapdir when searching for objects by name")
+ ("dry-run", "Don't modify the objectstore")
+ ("tty", "Treat stdout as a tty (no binary data)")
+ ("namespace", po::value<string>(&argnspace), "Specify namespace when searching for objects")
+ ("rmtype", po::value<string>(&rmtypestr), "Specify corrupting object removal 'snapmap' or 'nosnapmap' - TESTING USE ONLY")
+ ("slow-omap-threshold", po::value<unsigned>(&slow_threshold),
+ "Threshold (in seconds) to consider omap listing slow (for op=list-slow-omap)")
+ ;
+
+ po::options_description positional("Positional options");
+ positional.add_options()
+ ("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json")
+ ("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]")
+ ("arg1", po::value<string>(&arg1), "arg1 based on cmd")
+ ("arg2", po::value<string>(&arg2), "arg2 based on cmd")
+ ;
+
+ po::options_description all;
+ all.add(desc).add(positional);
+
+ po::positional_options_description pd;
+ pd.add("object", 1).add("objcmd", 1).add("arg1", 1).add("arg2", 1);
+
+ vector<string> ceph_option_strings;
+
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).options(all).allow_unregistered().positional(pd).run();
+ po::store( parsed, vm);
+ po::notify(vm);
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+ } catch(po::error &e) {
+ std::cerr << e.what() << std::endl;
+ return 1;
+ }
+
+ if (vm.count("help")) {
+ usage(desc);
+ return 1;
+ }
+
+ // Compatibility with previous option name
+ if (op == "dump-import")
+ op = "dump-export";
+
+ debug = (vm.count("debug") > 0);
+
+ force = (vm.count("force") > 0);
+
+ no_superblock = (vm.count("no-superblock") > 0);
+
+ if (vm.count("namespace"))
+ nspace = argnspace;
+
+ dry_run = (vm.count("dry-run") > 0);
+ tty = (vm.count("tty") > 0);
+
+ osflagbits_t flags = 0;
+ if (dry_run || vm.count("skip-journal-replay"))
+ flags |= SKIP_JOURNAL_REPLAY;
+ if (vm.count("skip-mount-omap"))
+ flags |= SKIP_MOUNT_OMAP;
+ if (op == "update-mon-db")
+ flags |= SKIP_JOURNAL_REPLAY;
+
+ head = (vm.count("head") > 0);
+
+ // infer osd id so we can authenticate
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/whoami", dpath.c_str());
+ int fd = ::open(fn, O_RDONLY);
+ if (fd >= 0) {
+ bufferlist bl;
+ bl.read_fd(fd, 64);
+ string s(bl.c_str(), bl.length());
+ int whoami = atoi(s.c_str());
+ vector<string> tmp;
+ // identify ourselves as this osd so we can auth and fetch our configs
+ tmp.push_back("-n");
+ tmp.push_back(string("osd.") + stringify(whoami));
+ // populate osd_data so that the default keyring location works
+ tmp.push_back("--osd-data");
+ tmp.push_back(dpath);
+ tmp.insert(tmp.end(), ceph_option_strings.begin(),
+ ceph_option_strings.end());
+ tmp.swap(ceph_option_strings);
+ }
+
+ vector<const char *> ceph_options;
+ ceph_options.reserve(ceph_options.size() + ceph_option_strings.size());
+ for (vector<string>::iterator i = ceph_option_strings.begin();
+ i != ceph_option_strings.end();
+ ++i) {
+ ceph_options.push_back(i->c_str());
+ }
+
+ snprintf(fn, sizeof(fn), "%s/type", dpath.c_str());
+ fd = ::open(fn, O_RDONLY);
+ if (fd >= 0) {
+ bufferlist bl;
+ bl.read_fd(fd, 64);
+ if (bl.length()) {
+ string dp_type = string(bl.c_str(), bl.length() - 1); // drop \n
+ if (vm.count("type") && dp_type != "" && type != dp_type)
+ cerr << "WARNING: Ignoring type \"" << type << "\" - found data-path type \""
+ << dp_type << "\"" << std::endl;
+ type = dp_type;
+ //cout << "object store type is " << type << std::endl;
+ }
+ ::close(fd);
+ }
+
+ if (!vm.count("type") && type == "") {
+ type = "bluestore";
+ }
+ if (!vm.count("data-path") &&
+ op != "dump-export" &&
+ !(op == "dump-journal" && type == "filestore")) {
+ cerr << "Must provide --data-path" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (type == "filestore" && !vm.count("journal-path")) {
+ jpath = dpath + "/journal";
+ }
+ if (!vm.count("op") && !vm.count("object")) {
+ cerr << "Must provide --op or object command..." << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op != "list" && op != "apply-layout-settings" &&
+ vm.count("op") && vm.count("object")) {
+ cerr << "Can't specify both --op and object command syntax" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op == "apply-layout-settings" && !(vm.count("pool") ^ vm.count("pgid"))) {
+ cerr << "apply-layout-settings requires either --pool or --pgid"
+ << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op != "list" && op != "apply-layout-settings" && vm.count("object") && !vm.count("objcmd")) {
+ cerr << "Invalid syntax, missing command" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op == "fuse" && mountpoint.length() == 0) {
+ cerr << "Missing fuse mountpoint" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ outistty = isatty(STDOUT_FILENO) || tty;
+
+ file_fd = fd_none;
+ if ((op == "export" || op == "export-remove" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
+ if (!vm.count("file") || file == "-") {
+ if (outistty) {
+ cerr << "stdout is a tty and no --file filename specified" << std::endl;
+ return 1;
+ }
+ file_fd = STDOUT_FILENO;
+ } else {
+ file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
+ }
+ } else if (op == "import" || op == "dump-export" || op == "set-osdmap" || op == "set-inc-osdmap" || op == "pg-log-inject-dups") {
+ if (!vm.count("file") || file == "-") {
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no --file filename specified" << std::endl;
+ return 1;
+ }
+ file_fd = STDIN_FILENO;
+ } else {
+ file_fd = open(file.c_str(), O_RDONLY);
+ }
+ }
+
+ ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run);
+
+ if (vm.count("file") && file_fd == fd_none && !dry_run) {
+ cerr << "--file option only applies to import, dump-export, export, export-remove, "
+ << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl;
+ return 1;
+ }
+
+ if (file_fd != fd_none && file_fd < 0) {
+ string err = string("file: ") + file;
+ perror(err.c_str());
+ return 1;
+ }
+ int init_flags = 0;
+ if (vm.count("no-mon-config") > 0) {
+ init_flags |= CINIT_FLAG_NO_MON_CONFIG;
+ }
+
+ auto cct = global_init(
+ NULL, ceph_options,
+ CEPH_ENTITY_TYPE_OSD,
+ CODE_ENVIRONMENT_UTILITY_NODOUT,
+ init_flags);
+ common_init_finish(g_ceph_context);
+ if (debug) {
+ g_conf().set_val_or_die("log_to_stderr", "true");
+ g_conf().set_val_or_die("err_to_stderr", "true");
+ }
+ g_conf().apply_changes(nullptr);
+
+ // Special list handling. Treating pretty_format as human readable,
+ // with one object per line and not an enclosing array.
+ human_readable = ends_with(format, "-pretty");
+ if ((op == "list" || op == "meta-list") && human_readable) {
+ // Remove -pretty from end of format which we know is there
+ format = format.substr(0, format.size() - strlen("-pretty"));
+ }
+
+ formatter = Formatter::create(format);
+ if (formatter == NULL) {
+ cerr << "unrecognized format: " << format << std::endl;
+ return 1;
+ }
+
+ // Special handling for filestore journal, so we can dump it without mounting
+ if (op == "dump-journal" && type == "filestore") {
+ int ret = mydump_journal(formatter, jpath, g_conf()->journal_dio);
+ if (ret < 0) {
+ cerr << "journal-path: " << jpath << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ formatter->flush(cout);
+ return 0;
+ }
+
+ if (op == "dump-export") {
+ int ret = tool.dump_export(formatter);
+ if (ret < 0) {
+ cerr << "dump-export: "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ return 0;
+ }
+
+ //Verify that data-path really exists
+ struct stat st;
+ if (::stat(dpath.c_str(), &st) == -1) {
+ string err = string("data-path: ") + dpath;
+ perror(err.c_str());
+ return 1;
+ }
+
+ if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) {
+ cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
+ return 1;
+ }
+
+ //Verify that the journal-path really exists
+ if (type == "filestore") {
+ if (::stat(jpath.c_str(), &st) == -1) {
+ string err = string("journal-path: ") + jpath;
+ perror(err.c_str());
+ return 1;
+ }
+ if (S_ISDIR(st.st_mode)) {
+ cerr << "journal-path: " << jpath << ": "
+ << cpp_strerror(EISDIR) << std::endl;
+ return 1;
+ }
+ }
+
+ ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
+ if (fs == NULL) {
+ cerr << "Unable to create store of type " << type << std::endl;
+ return 1;
+ }
+
+ if (op == "fsck" || op == "fsck-deep") {
+ int r = fs->fsck(op == "fsck-deep");
+ if (r < 0) {
+ cerr << "fsck failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ if (r > 0) {
+ cerr << "fsck status: " << r << " remaining error(s) and warning(s)" << std::endl;
+ return 1;
+ }
+ cout << "fsck success" << std::endl;
+ return 0;
+ }
+ if (op == "repair" || op == "repair-deep") {
+ int r = fs->repair(op == "repair-deep");
+ if (r < 0) {
+ cerr << "repair failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ if (r > 0) {
+ cerr << "repair status: " << r << " remaining error(s) and warning(s)" << std::endl;
+ return 1;
+ }
+ cout << "repair success" << std::endl;
+ return 0;
+ }
+ if (op == "mkfs") {
+ if (fsid.length()) {
+ uuid_d f;
+ bool r = f.parse(fsid.c_str());
+ if (!r) {
+ cerr << "failed to parse uuid '" << fsid << "'" << std::endl;
+ return 1;
+ }
+ fs->set_fsid(f);
+ }
+ int r = fs->mkfs();
+ if (r < 0) {
+ cerr << "mkfs failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ return 0;
+ }
+ if (op == "dup") {
+ string target_type;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/type", target_data_path.c_str());
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ cerr << "Unable to open " << target_data_path << "/type" << std::endl;
+ exit(1);
+ }
+ bufferlist bl;
+ bl.read_fd(fd, 64);
+ if (bl.length()) {
+ target_type = string(bl.c_str(), bl.length() - 1); // drop \n
+ }
+ ::close(fd);
+ ObjectStore *targetfs = ObjectStore::create(
+ g_ceph_context, target_type,
+ target_data_path, "", 0);
+ if (targetfs == NULL) {
+ cerr << "Unable to open store of type " << target_type << std::endl;
+ return 1;
+ }
+ int r = dup(dpath, fs, target_data_path, targetfs);
+ if (r < 0) {
+ cerr << "dup failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ return 0;
+ }
+
+ int ret = fs->mount();
+ if (ret < 0) {
+ if (ret == -EBUSY) {
+ cerr << "OSD has the store locked" << std::endl;
+ } else {
+ cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl;
+ }
+ return 1;
+ }
+
+ if (op == "fuse") {
+#ifdef HAVE_LIBFUSE
+ FuseStore fuse(fs, mountpoint);
+ cout << "mounting fuse at " << mountpoint << " ..." << std::endl;
+ int r = fuse.main();
+ if (r < 0) {
+ cerr << "failed to mount fuse: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+#else
+ cerr << "fuse support not enabled" << std::endl;
+#endif
+ return 0;
+ }
+
+ vector<coll_t> ls;
+ vector<coll_t>::iterator it;
+ CompatSet supported;
+
+#ifdef INTERNAL_TEST
+ supported = get_test_compat_set();
+#else
+ supported = OSD::get_osd_compat_set();
+#endif
+
+ bufferlist bl;
+ auto ch = fs->open_collection(coll_t::meta());
+ std::unique_ptr<OSDSuperblock> superblock;
+ if (!no_superblock) {
+ superblock.reset(new OSDSuperblock);
+ bufferlist::const_iterator p;
+ ret = fs->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
+ if (ret < 0) {
+ cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ p = bl.cbegin();
+ decode(*superblock, p);
+
+ if (debug) {
+ cerr << "Cluster fsid=" << superblock->cluster_fsid << std::endl;
+ }
+
+ if (debug) {
+ cerr << "Supported features: " << supported << std::endl;
+ cerr << "On-disk features: " << superblock->compat_features << std::endl;
+ }
+ if (supported.compare(superblock->compat_features) == -1) {
+ CompatSet unsupported = supported.unsupported(superblock->compat_features);
+ cerr << "On-disk OSD incompatible features set "
+ << unsupported << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (op == "apply-layout-settings") {
+ int target_level = 0;
+ // Single positional argument with apply-layout-settings
+ // for target_level.
+ if (vm.count("object") && isdigit(object[0])) {
+ target_level = atoi(object.c_str());
+ // This requires --arg1 to be specified since
+ // this is the third positional argument and normally
+ // used with object operations.
+ } else if (vm.count("arg1") && isdigit(arg1[0])) {
+ target_level = atoi(arg1.c_str());
+ }
+ ceph_assert(superblock != nullptr);
+ ret = apply_layout_settings(fs, *superblock, pool, pgid, dry_run, target_level);
+ goto out;
+ }
+
+ if (op != "list" && vm.count("object")) {
+ // Special case: Create pgmeta_oid if empty string specified
+ // This can't conflict with any actual object names.
+ if (object == "") {
+ ghobj = pgid.make_pgmeta_oid();
+ } else {
+ json_spirit::Value v;
+ try {
+ if (!json_spirit::read(object, v) ||
+ (v.type() != json_spirit::array_type && v.type() != json_spirit::obj_type)) {
+ // Special: Need head/snapdir so set even if user didn't specify
+ if (vm.count("objcmd") && (objcmd == "remove-clone-metadata"))
+ head = true;
+ lookup_ghobject lookup(object, nspace, head);
+ if (pgidstr.length())
+ ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), lookup, debug);
+ else
+ ret = action_on_all_objects(fs, lookup, debug);
+ if (ret) {
+ throw std::runtime_error("Internal error");
+ } else {
+ if (lookup.size() != 1) {
+ stringstream ss;
+ if (lookup.size() == 0)
+ ss << "No object id '" << object << "' found or invalid JSON specified";
+ else
+ ss << "Found " << lookup.size() << " objects with id '" << object
+ << "', please use a JSON spec from --op list instead";
+ throw std::runtime_error(ss.str());
+ }
+ pair<coll_t, ghobject_t> found = lookup.pop();
+ pgidstr = found.first.to_str();
+ pgid.parse(pgidstr.c_str());
+ ghobj = found.second;
+ }
+ } else {
+ stringstream ss;
+ if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) {
+ ss << "Without --pgid the object '" << object
+ << "' must be a JSON array";
+ throw std::runtime_error(ss.str());
+ }
+ if (v.type() == json_spirit::array_type) {
+ json_spirit::Array array = v.get_array();
+ if (array.size() != 2) {
+ ss << "Object '" << object
+ << "' must be a JSON array with 2 elements";
+ throw std::runtime_error(ss.str());
+ }
+ vector<json_spirit::Value>::iterator i = array.begin();
+ ceph_assert(i != array.end());
+ if (i->type() != json_spirit::str_type) {
+ ss << "Object '" << object
+ << "' must be a JSON array with the first element a string";
+ throw std::runtime_error(ss.str());
+ }
+ string object_pgidstr = i->get_str();
+ if (object_pgidstr != "meta") {
+ spg_t object_pgid;
+ object_pgid.parse(object_pgidstr.c_str());
+ if (pgidstr.length() > 0) {
+ if (object_pgid != pgid) {
+ ss << "object '" << object
+ << "' has a pgid different from the --pgid="
+ << pgidstr << " option";
+ throw std::runtime_error(ss.str());
+ }
+ } else {
+ pgidstr = object_pgidstr;
+ pgid = object_pgid;
+ }
+ } else {
+ pgidstr = object_pgidstr;
+ }
+ ++i;
+ v = *i;
+ }
+ try {
+ ghobj.decode(v);
+ } catch (std::runtime_error& e) {
+ ss << "Decode object JSON error: " << e.what();
+ throw std::runtime_error(ss.str());
+ }
+ if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
+ cerr << "Object pool and pgid pool don't match" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ if (pgidstr != "meta") {
+ auto ch = fs->open_collection(coll_t(pgid));
+ if (!ghobj.match(fs->collection_bits(ch), pgid.ps())) {
+ stringstream ss;
+ ss << "object " << ghobj << " not contained by pg " << pgid;
+ throw std::runtime_error(ss.str());
+ }
+ }
+ }
+ } catch (std::runtime_error& e) {
+ cerr << e.what() << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ }
+
+ // The ops which require --pgid option are checked here and
+ // mentioned in the usage for --pgid.
+ if ((op == "info" || op == "log" || op == "remove" || op == "export"
+ || op == "export-remove" || op == "mark-complete"
+ || op == "reset-last-complete"
+ || op == "trim-pg-log"
+ || op == "pg-log-inject-dups") &&
+ pgidstr.length() == 0) {
+ cerr << "Must provide pgid" << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+
+ if (op == "import") {
+ ceph_assert(superblock != nullptr);
+ try {
+ ret = tool.do_import(fs, *superblock, force, pgidstr);
+ }
+ catch (const buffer::error &e) {
+ cerr << "do_import threw exception error " << e.what() << std::endl;
+ ret = -EFAULT;
+ }
+ if (ret == -EFAULT) {
+ cerr << "Corrupt input for import" << std::endl;
+ }
+ if (ret == 0)
+ cout << "Import successful" << std::endl;
+ goto out;
+ } else if (op == "dump-journal-mount") {
+ // Undocumented feature to dump journal with mounted fs
+ // This doesn't support the format option, but it uses the
+ // ObjectStore::dump_journal() and mounts to get replay to run.
+ ret = fs->dump_journal(cout);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl;
+ } else {
+ cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl;
+ }
+ }
+ goto out;
+ } else if (op == "get-osdmap") {
+ bufferlist bl;
+ OSDMap osdmap;
+ if (epoch == 0) {
+ ceph_assert(superblock != nullptr);
+ epoch = superblock->current_epoch;
+ }
+ ret = get_osdmap(fs, epoch, osdmap, bl);
+ if (ret) {
+ cerr << "Failed to get osdmap#" << epoch << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = bl.write_fd(file_fd);
+ if (ret) {
+ cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+ } else {
+ cout << "osdmap#" << epoch << " exported." << std::endl;
+ }
+ goto out;
+ } else if (op == "set-osdmap") {
+ bufferlist bl;
+ ret = get_fd_data(file_fd, bl);
+ if (ret < 0) {
+ cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl;
+ } else {
+ ret = set_osdmap(fs, epoch, bl, force);
+ }
+ goto out;
+ } else if (op == "get-inc-osdmap") {
+ bufferlist bl;
+ if (epoch == 0) {
+ ceph_assert(superblock != nullptr);
+ epoch = superblock->current_epoch;
+ }
+ ret = get_inc_osdmap(fs, epoch, bl);
+ if (ret < 0) {
+ cerr << "Failed to get incremental osdmap# " << epoch << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = bl.write_fd(file_fd);
+ if (ret) {
+ cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+ } else {
+ cout << "inc-osdmap#" << epoch << " exported." << std::endl;
+ }
+ goto out;
+ } else if (op == "set-inc-osdmap") {
+ bufferlist bl;
+ ret = get_fd_data(file_fd, bl);
+ if (ret < 0) {
+ cerr << "Failed to read incremental osdmap " << cpp_strerror(ret) << std::endl;
+ goto out;
+ } else {
+ ret = set_inc_osdmap(fs, epoch, bl, force);
+ }
+ goto out;
+ } else if (op == "update-mon-db") {
+ if (!vm.count("mon-store-path")) {
+ cerr << "Please specify the path to monitor db to update" << std::endl;
+ ret = -EINVAL;
+ } else {
+ ceph_assert(superblock != nullptr);
+ ret = update_mon_db(*fs, *superblock, dpath + "/keyring", mon_store_path);
+ }
+ goto out;
+ }
+
+ if (op == "remove") {
+ if (!force && !dry_run) {
+ cerr << "Please use export-remove or you must use --force option" << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = initiate_new_remove_pg(fs, pgid);
+ if (ret < 0) {
+ cerr << "PG '" << pgid << "' not found" << std::endl;
+ goto out;
+ }
+ cout << "Remove successful" << std::endl;
+ goto out;
+ }
+
+ if (op == "fix-lost") {
+ boost::scoped_ptr<action_on_object_t> action;
+ action.reset(new do_fix_lost());
+ if (pgidstr.length())
+ ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), *action, debug);
+ else
+ ret = action_on_all_objects(fs, *action, debug);
+ goto out;
+ }
+
+ if (op == "list") {
+ ret = do_list(fs, pgidstr, object, nspace, formatter, debug,
+ human_readable, head);
+ if (ret < 0) {
+ cerr << "do_list failed: " << cpp_strerror(ret) << std::endl;
+ }
+ goto out;
+ }
+ if (op == "list-slow-omap") {
+ ret = do_list_slow(fs, pgidstr, object, slow_threshold, formatter, debug,
+ human_readable);
+ if (ret < 0) {
+ cerr << "do_list failed: " << cpp_strerror(ret) << std::endl;
+ }
+ goto out;
+ }
+
+ if (op == "dump-super") {
+ ceph_assert(superblock != nullptr);
+ formatter->open_object_section("superblock");
+ superblock->dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ goto out;
+ }
+
+ if (op == "statfs") {
+ store_statfs_t statsbuf;
+ ret = fs->statfs(&statsbuf);
+ if (ret < 0) {
+ cerr << "error from statfs: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ formatter->open_object_section("statfs");
+ statsbuf.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ goto out;
+ }
+
+ if (op == "meta-list") {
+ ret = do_meta(fs, object, formatter, debug, human_readable);
+ if (ret < 0) {
+ cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl;
+ }
+ goto out;
+ }
+
+ ret = fs->list_collections(ls);
+ if (ret < 0) {
+ cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ if (debug && op == "list-pgs")
+ cout << "Performing list-pgs operation" << std::endl;
+
+ // Find pg
+ for (it = ls.begin(); it != ls.end(); ++it) {
+ spg_t tmppgid;
+
+ if (pgidstr == "meta") {
+ if (it->to_str() == "meta")
+ break;
+ else
+ continue;
+ }
+
+ if (!it->is_pg(&tmppgid)) {
+ continue;
+ }
+
+ if (it->is_temp(&tmppgid)) {
+ continue;
+ }
+
+ if (op != "list-pgs" && tmppgid != pgid) {
+ continue;
+ }
+
+ if (op != "list-pgs") {
+ //Found!
+ break;
+ }
+
+ cout << tmppgid << std::endl;
+ }
+
+ if (op == "list-pgs") {
+ ret = 0;
+ goto out;
+ }
+
+ // If not an object command nor any of the ops handled below, then output this usage
+ // before complaining about a bad pgid
+ if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log" && op != "trim-pg-log-dups" && op != "pg-log-inject-dups") {
+ cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, trim-pg-log-dups statfs)"
+ << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ epoch_t map_epoch;
+// The following code for export, info, log require omap or !skip-mount-omap
+ if (it != ls.end()) {
+
+ coll_t coll = *it;
+
+ if (vm.count("objcmd")) {
+ ret = 0;
+ if (objcmd == "remove" || objcmd == "removeall") {
+ bool all = (objcmd == "removeall");
+ enum rmtype type = BOTH;
+ if (rmtypestr == "nosnapmap")
+ type = NOSNAPMAP;
+ else if (rmtypestr == "snapmap")
+ type = SNAPMAP;
+ ret = do_remove_object(fs, coll, ghobj, all, force, type);
+ goto out;
+ } else if (objcmd == "list-attrs") {
+ ret = do_list_attrs(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "list-omap") {
+ ret = do_list_omap(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "get-bytes" || objcmd == "set-bytes") {
+ if (objcmd == "get-bytes") {
+ int fd;
+ if (vm.count("arg1") == 0 || arg1 == "-") {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = open(arg1.c_str(), O_WRONLY|O_TRUNC|O_CREAT|O_EXCL|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_get_bytes(fs, coll, ghobj, fd);
+ if (fd != STDOUT_FILENO)
+ close(fd);
+ } else {
+ int fd;
+ if (vm.count("arg1") == 0 || arg1 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_bytes(fs, coll, ghobj, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ }
+ goto out;
+ } else if (objcmd == "get-attr") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_attr(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "set-attr") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ }
+
+ int fd;
+ if (vm.count("arg2") == 0 || arg2 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_attr(fs, coll, ghobj, arg1, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ goto out;
+ } else if (objcmd == "rm-attr") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_rm_attr(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "get-omap") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_omap(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "set-omap") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ int fd;
+ if (vm.count("arg2") == 0 || arg2 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_omap(fs, coll, ghobj, arg1, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ goto out;
+ } else if (objcmd == "rm-omap") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_rm_omap(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "get-omaphdr") {
+ if (vm.count("arg1")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_omaphdr(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "set-omaphdr") {
+ // Extra arg
+ if (vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ int fd;
+ if (vm.count("arg1") == 0 || arg1 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_omaphdr(fs, coll, ghobj, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ goto out;
+ } else if (objcmd == "dump") {
+ // There should not be any other arguments
+ if (vm.count("arg1") || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = print_obj_info(fs, coll, ghobj, formatter);
+ goto out;
+ } else if (objcmd == "corrupt-info") { // Undocumented testing feature
+ // There should not be any other arguments
+ if (vm.count("arg1") || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = corrupt_info(fs, coll, ghobj, formatter);
+ goto out;
+ } else if (objcmd == "set-size" || objcmd == "corrupt-size") {
+ // Undocumented testing feature
+ bool corrupt = (objcmd == "corrupt-size");
+ // Extra arg
+ if (vm.count("arg1") == 0 || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+ cerr << "Invalid size '" << arg1 << "' specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ uint64_t size = atoll(arg1.c_str());
+ ret = set_size(fs, coll, ghobj, size, formatter, corrupt);
+ goto out;
+ } else if (objcmd == "clear-data-digest") {
+ ret = clear_data_digest(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "clear-snapset") {
+ // UNDOCUMENTED: For testing zap SnapSet
+ // IGNORE extra args since not in usage anyway
+ if (!ghobj.hobj.has_snapset()) {
+ cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ ret = clear_snapset(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "remove-clone-metadata") {
+ // Extra arg
+ if (vm.count("arg1") == 0 || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ if (!ghobj.hobj.has_snapset()) {
+ cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+ cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ snapid_t cloneid = atoi(arg1.c_str());
+ ret = remove_clone(fs, coll, ghobj, cloneid, force);
+ goto out;
+ }
+ cerr << "Unknown object command '" << objcmd << "'" << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+
+ map_epoch = 0;
+ ret = PG::peek_map_epoch(fs, pgid, &map_epoch);
+ if (ret < 0)
+ cerr << "peek_map_epoch reports error" << std::endl;
+ if (debug)
+ cerr << "map_epoch " << map_epoch << std::endl;
+
+ pg_info_t info(pgid);
+ PastIntervals past_intervals;
+ __u8 struct_ver;
+ ret = PG::read_info(fs, pgid, coll, info, past_intervals, struct_ver);
+ if (ret < 0) {
+ cerr << "read_info error " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ if (struct_ver < PG::get_compat_struct_v()) {
+ cerr << "PG is too old to upgrade, use older Ceph version" << std::endl;
+ ret = -EFAULT;
+ goto out;
+ }
+ if (debug)
+ cerr << "struct_v " << (int)struct_ver << std::endl;
+
+ if (op == "export" || op == "export-remove") {
+ ceph_assert(superblock != nullptr);
+ ret = tool.do_export(cct.get(), fs, coll, pgid, info, map_epoch, struct_ver, *superblock, past_intervals);
+ if (ret == 0) {
+ cerr << "Export successful" << std::endl;
+ if (op == "export-remove") {
+ ret = initiate_new_remove_pg(fs, pgid);
+ // Export succeeded, so pgid is there
+ ceph_assert(ret == 0);
+ cerr << "Remove successful" << std::endl;
+ }
+ }
+ } else if (op == "info") {
+ formatter->open_object_section("info");
+ info.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ } else if (op == "log") {
+ PGLog::IndexedLog log;
+ pg_missing_t missing;
+ ret = get_log(cct.get(), fs, struct_ver, pgid, info, log, missing);
+ if (ret < 0)
+ goto out;
+
+ dump_log(formatter, cout, log, missing);
+ } else if (op == "mark-complete") {
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (struct_ver < PG::get_compat_struct_v()) {
+ cerr << "Can't mark-complete, version mismatch " << (int)struct_ver
+ << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)"
+ << std::endl;
+ ret = 1;
+ goto out;
+ }
+
+ cout << "Marking complete " << std::endl;
+
+ ceph_assert(superblock != nullptr);
+ info.last_update = eversion_t(superblock->current_epoch, info.last_update.version + 1);
+ info.last_backfill = hobject_t::get_max();
+ info.last_epoch_started = superblock->current_epoch;
+ info.history.last_epoch_started = superblock->current_epoch;
+ info.history.last_epoch_clean = superblock->current_epoch;
+ past_intervals.clear();
+
+ if (!dry_run) {
+ ret = write_info(*t, map_epoch, info, past_intervals);
+ if (ret != 0)
+ goto out;
+ auto ch = fs->open_collection(coll_t(pgid));
+ fs->queue_transaction(ch, std::move(*t));
+ }
+ cout << "Marking complete succeeded" << std::endl;
+ } else if (op == "trim-pg-log") {
+ ret = do_trim_pg_log(fs, coll, info, pgid,
+ map_epoch, past_intervals);
+ if (ret < 0) {
+ cerr << "Error trimming pg log: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ cout << "Finished trimming pg log" << std::endl;
+ goto out;
+ } else if (op == "trim-pg-log-dups") {
+ ret = do_trim_pg_log_dups(fs, coll, info, pgid,
+ map_epoch, past_intervals);
+ if (ret < 0) {
+ cerr << "Error trimming pg log dups: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ cout << "Finished trimming pg log dups" << std::endl;
+ goto out;
+ } else if (op == "reset-last-complete") {
+ if (!force) {
+ std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost "
+ << "certain to lead to permanent data loss unless you know exactly "
+ << "what you are doing. Pass --force to proceed anyway."
+ << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (struct_ver < PG::get_compat_struct_v()) {
+ cerr << "Can't reset-last-complete, version mismatch " << (int)struct_ver
+ << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)"
+ << std::endl;
+ ret = 1;
+ goto out;
+ }
+
+ cout << "Reseting last_complete " << std::endl;
+
+ info.last_complete = info.last_update;
+
+ if (!dry_run) {
+ ret = write_info(*t, map_epoch, info, past_intervals);
+ if (ret != 0)
+ goto out;
+ fs->queue_transaction(ch, std::move(*t));
+ }
+ cout << "Reseting last_complete succeeded" << std::endl;
+
+ } else if (op == "pg-log-inject-dups") {
+ if (!vm.count("file") || file == "-") {
+ cerr << "Must provide file containing JSON dups entries" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ if (debug)
+ cerr << "opening file " << file << std::endl;
+
+ ifstream json_file_stream(file , std::ifstream::in);
+ if (!json_file_stream.is_open()) {
+ cerr << "unable to open file " << file << std::endl;
+ ret = -1;
+ goto out;
+ }
+ json_spirit::mValue result;
+ try {
+ if (!json_spirit::read(json_file_stream, result))
+ throw std::runtime_error("unparseable JSON " + file);
+ if (result.type() != json_spirit::array_type) {
+ cerr << "result is not an array_type - type=" << result.type() << std::endl;
+ throw std::runtime_error("not JSON array_type " + file);
+ }
+ do_dups_inject_from_json(fs, pgid, result, debug);
+ } catch (const std::runtime_error &e) {
+ cerr << e.what() << std::endl;;
+ return -1;
+ }
+ } else {
+ ceph_assert(!"Should have already checked for valid --op");
+ }
+ } else {
+ cerr << "PG '" << pgid << "' not found" << std::endl;
+ ret = -ENOENT;
+ }
+
+out:
+ if (debug) {
+ ostringstream ostr;
+ Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty");
+ cct->get_perfcounters_collection()->dump_formatted(f, false);
+ ostr << "ceph-objectstore-tool ";
+ f->flush(ostr);
+ delete f;
+ cout << ostr.str() << std::endl;
+ }
+
+ int r = fs->umount();
+ if (r < 0) {
+ cerr << "umount failed: " << cpp_strerror(r) << std::endl;
+ // If no previous error, then use umount() error
+ if (ret == 0)
+ ret = r;
+ }
+
+ if (dry_run) {
+ // Export output can go to stdout, so put this message on stderr
+ if (op == "export")
+ cerr << "dry-run: Nothing changed" << std::endl;
+ else
+ cout << "dry-run: Nothing changed" << std::endl;
+ }
+
+ if (ret < 0)
+ ret = 1;
+ return ret;
+}
diff --git a/src/tools/ceph_objectstore_tool.h b/src/tools/ceph_objectstore_tool.h
new file mode 100644
index 000000000..82aa83e5d
--- /dev/null
+++ b/src/tools/ceph_objectstore_tool.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECTSTORE_TOOL_H_
+#define CEPH_OBJECTSTORE_TOOL_H_
+
+#include "RadosDump.h"
+
+// Import/export and inspection helper used by ceph-objectstore-tool.
+// Inherits the export-stream file descriptor and the dry-run flag from
+// RadosDump (see RadosDump.h).
+class ObjectStoreTool : public RadosDump
+{
+  public:
+  // file_fd: fd of the export stream to read/write; dry_run: when set,
+  // mutating operations are expected to be skipped (enforced by callees).
+  ObjectStoreTool(int file_fd, bool dry_run)
+    : RadosDump(file_fd, dry_run)
+  {}
+
+  // Dump a previously written export stream in human-readable form.
+  int dump_export(Formatter *formatter);
+  // Import a PG from the export stream into 'store'; 'force' overrides
+  // safety checks, 'pgidstr' optionally overrides the PG id.
+  int do_import(ObjectStore *store, OSDSuperblock& sb, bool force,
+		std::string pgidstr);
+  // Export one PG (collection 'coll') to the stream, including its info,
+  // map epoch and past intervals.
+  int do_export(CephContext *cct, ObjectStore *fs, coll_t coll, spg_t pgid,
+	    pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+	    const OSDSuperblock& superblock,
+	    PastIntervals &past_intervals);
+  // Dump a single encoded object record from an export stream.
+  int dump_object(Formatter *formatter,
+		  bufferlist &bl);
+  // Decode one object record from 'bl' and write it into 'store';
+  // *skipped_objects is set when an object cannot be restored.
+  int get_object(
+    ObjectStore *store, OSDriver& driver, SnapMapper& mapper, coll_t coll,
+    bufferlist &bl, OSDMap &curmap, bool *skipped_objects);
+  // Write one object (export_file) or all objects of a collection
+  // (export_files) to the export stream.
+  int export_file(
+    ObjectStore *store, coll_t cid, ghobject_t &obj);
+  int export_files(ObjectStore *store, coll_t coll);
+};
+
+#endif // CEPH_OBJECTSTORE_TOOL_H_
diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc
new file mode 100644
index 000000000..8e15851d8
--- /dev/null
+++ b/src/tools/ceph_osdomap_tool.cc
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <stdlib.h>
+#include <string>
+
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "os/filestore/DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+namespace po = boost::program_options;
+
+// Offline inspection/repair tool for a filestore OSD's DBObjectMap.
+// Opens the omap KeyValueDB at --omap-path and dispatches on --command.
+// Returns 0 on success, 1 on usage or check errors, or a negative errno
+// from store operations.
+int main(int argc, char **argv) {
+  po::options_description desc("Allowed options");
+  string store_path, cmd, oid, backend;
+  bool debug = false;
+  desc.add_options()
+    ("help", "produce help message")
+    ("omap-path", po::value<string>(&store_path),
+     "path to omap directory, mandatory (current/omap usually)")
+    ("paranoid", "use paranoid checking")
+    ("debug", "Additional debug output from DBObjectMap")
+    ("oid", po::value<string>(&oid), "Restrict to this object id when dumping objects")
+    ("command", po::value<string>(&cmd),
+     "command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check, dump-headers, repair, resetv2, compact], mandatory")
+    ("backend", po::value<string>(&backend),
+     "DB backend (default rocksdb)")
+    ;
+  po::positional_options_description p;
+  p.add("command", 1);
+
+  vector<string> ceph_option_strings;
+  po::variables_map vm;
+  try {
+    po::parsed_options parsed =
+      po::command_line_parser(argc, argv).options(desc).positional(p).allow_unregistered().run();
+    po::store(
+      parsed,
+      vm);
+    po::notify(vm);
+
+    // Anything boost didn't recognize is forwarded to global_init() as
+    // a ceph configuration option.
+    ceph_option_strings = po::collect_unrecognized(parsed.options,
+						   po::include_positional);
+  } catch(po::error &e) {
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+
+  vector<const char *> ceph_options;
+  ceph_options.reserve(ceph_option_strings.size());
+  for (vector<string>::iterator i = ceph_option_strings.begin();
+       i != ceph_option_strings.end();
+       ++i) {
+    ceph_options.push_back(i->c_str());
+  }
+
+  if (vm.count("debug")) debug = true;
+
+  if (vm.count("help")) {
+    std::cerr << desc << std::endl;
+    return 1;
+  }
+
+  auto cct = global_init(
+    NULL, ceph_options, CEPH_ENTITY_TYPE_OSD,
+    CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+  common_init_finish(g_ceph_context);
+  cct->_conf.apply_changes(nullptr);
+  if (debug) {
+    g_conf().set_val_or_die("log_to_stderr", "true");
+    g_conf().set_val_or_die("err_to_stderr", "true");
+  }
+  g_conf().apply_changes(nullptr);
+
+  if (vm.count("omap-path") == 0) {
+    std::cerr << "Required argument --omap-path" << std::endl;
+    return 1;
+  }
+
+  if (vm.count("command") == 0) {
+    std::cerr << "Required argument --command" << std::endl;
+    return 1;
+  }
+
+  if (vm.count("backend") == 0) {
+    backend = "rocksdb";
+  }
+
+  // NOTE(review): 'store' and 'omap' are reclaimed at process exit;
+  // ownership of 'store' by DBObjectMap is not shown here — confirm in
+  // DBObjectMap before adding an explicit delete.
+  KeyValueDB* store(KeyValueDB::create(g_ceph_context, backend, store_path));
+  if (store == NULL) {
+    std::cerr << "Invalid backend '" << backend << "' specified" << std::endl;
+    return 1;
+  }
+  /*if (vm.count("paranoid")) {
+    std::cerr << "Enabling paranoid checks" << std::endl;
+    store->options.paranoid_checks = true;
+  }*/
+  DBObjectMap omap(cct.get(), store);
+  stringstream out;
+  int r = store->open(out);
+  if (r < 0) {
+    std::cerr << "Store open got: " << cpp_strerror(r) << std::endl;
+    std::cerr << "Output: " << out.str() << std::endl;
+    return r;
+  }
+  // We don't call omap.init() here because it will repair
+  // the DBObjectMap which we might want to examine for diagnostic
+  // reasons.  Instead use --command repair.
+
+  omap.get_state();
+  std::cout << "Version: " << (int)omap.state.v << std::endl;
+  std::cout << "Seq: " << omap.state.seq << std::endl;
+  std::cout << "legacy: " << (omap.state.legacy ? "true" : "false") << std::endl;
+
+  if (cmd == "dump-raw-keys") {
+    KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator();
+    for (i->seek_to_first(); i->valid(); i->next()) {
+      std::cout << i->raw_key() << std::endl;
+    }
+    return 0;
+  } else if (cmd == "dump-raw-key-vals") {
+    KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator();
+    for (i->seek_to_first(); i->valid(); i->next()) {
+      std::cout << i->raw_key() << std::endl;
+      i->value().hexdump(std::cout);
+    }
+    return 0;
+  } else if (cmd == "dump-objects") {
+    vector<ghobject_t> objects;
+    r = omap.list_objects(&objects);
+    if (r < 0) {
+      std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    for (vector<ghobject_t>::iterator i = objects.begin();
+	 i != objects.end();
+	 ++i) {
+      if (vm.count("oid") != 0 && i->hobj.oid.name != oid)
+	continue;
+      std::cout << *i << std::endl;
+    }
+    return 0;
+  } else if (cmd == "dump-objects-with-keys") {
+    vector<ghobject_t> objects;
+    r = omap.list_objects(&objects);
+    if (r < 0) {
+      std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    for (vector<ghobject_t>::iterator i = objects.begin();
+	 i != objects.end();
+	 ++i) {
+      if (vm.count("oid") != 0 && i->hobj.oid.name != oid)
+	continue;
+      std::cout << "Object: " << *i << std::endl;
+      ObjectMap::ObjectMapIterator j = omap.get_iterator(ghobject_t(i->hobj));
+      for (j->seek_to_first(); j->valid(); j->next()) {
+	std::cout << j->key() << std::endl;
+	j->value().hexdump(std::cout);
+      }
+    }
+    return 0;
+  } else if (cmd == "check" || cmd == "repair") {
+    ostringstream ss;
+    bool repair = (cmd == "repair");
+    r = omap.check(ss, repair, true);
+    if (r) {
+      std::cerr << ss.str() << std::endl;
+      if (r > 0) {
+	std::cerr << "check got " << r << " error(s)" << std::endl;
+      }
+      // Bug fix: a negative (operational) error from check() previously
+      // fell through, printed "succeeded" and exited 0.
+      return 1;
+    }
+    std::cout << (repair ? "repair" : "check") << " succeeded" << std::endl;
+    return 0;
+  } else if (cmd == "dump-headers") {
+    vector<DBObjectMap::_Header> headers;
+    r = omap.list_object_headers(&headers);
+    if (r < 0) {
+      std::cerr << "list_object_headers got: " << cpp_strerror(r) << std::endl;
+      return 1;
+    }
+    // const ref: avoid copying each _Header
+    for (const auto &i : headers)
+      std::cout << i << std::endl;
+    return 0;
+  } else if (cmd == "resetv2") {
+    omap.state.v = 2;
+    omap.state.legacy = false;
+    omap.set_state();
+    return 0;
+  } else if (cmd == "compact") {
+    omap.compact();
+    return 0;
+  } else {
+    std::cerr << "Did not recognize command " << cmd << std::endl;
+    return 1;
+  }
+}
diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt
new file mode 100644
index 000000000..5d40f8ffb
--- /dev/null
+++ b/src/tools/cephfs/CMakeLists.txt
@@ -0,0 +1,58 @@
+# cephfs-journal-tool: offline MDS journal inspection, dump and reset.
+set(cephfs_journal_tool_srcs
+  cephfs-journal-tool.cc
+  JournalTool.cc
+  JournalFilter.cc
+  JournalScanner.cc
+  EventOutput.cc
+  Dumper.cc
+  Resetter.cc
+  RoleSelector.cc
+  MDSUtility.cc)
+add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs})
+target_link_libraries(cephfs-journal-tool librados mds osdc global
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# cephfs-meta-injection: metadata injection helper (not installed below).
+set(cephfs-meta-injection_srcs
+  cephfs-meta-injection.cc
+  MetaTool.cc
+  RoleSelector.cc
+  MDSUtility.cc)
+add_executable(cephfs-meta-injection ${cephfs-meta-injection_srcs})
+target_link_libraries(cephfs-meta-injection librados mds osdc global
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# cephfs-table-tool: MDS table (inode/session/snap) manipulation.
+set(cephfs_table_tool_srcs
+  cephfs-table-tool.cc
+  TableTool.cc
+  RoleSelector.cc
+  MDSUtility.cc)
+add_executable(cephfs-table-tool ${cephfs_table_tool_srcs})
+target_link_libraries(cephfs-table-tool librados mds osdc global
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+# cephfs-data-scan: disaster-recovery scanner (see DataScan.cc); needs
+# the cls_cephfs client for server-side accumulation.
+set(cephfs_data_scan_srcs
+  cephfs-data-scan.cc
+  DataScan.cc
+  RoleSelector.cc
+  PgFiles.cc
+  MDSUtility.cc)
+add_executable(cephfs-data-scan ${cephfs_data_scan_srcs})
+target_link_libraries(cephfs-data-scan librados cephfs mds osdc global
+  cls_cephfs_client
+  ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+install(TARGETS
+  cephfs-journal-tool
+  cephfs-table-tool
+  cephfs-data-scan
+  DESTINATION bin)
+
+option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF)
+if(WITH_CEPHFS_SHELL)
+  add_subdirectory(shell)
+endif()
+
+option(WITH_CEPHFS_TOP "install cephfs-top utility" ON)
+if(WITH_CEPHFS_TOP)
+  add_subdirectory(top)
+endif()
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
new file mode 100644
index 000000000..9f942964d
--- /dev/null
+++ b/src/tools/cephfs/DataScan.cc
@@ -0,0 +1,2239 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include <fstream>
+#include "include/util.h"
+#include "include/ceph_fs.h"
+
+#include "mds/CDentry.h"
+#include "mds/CInode.h"
+#include "mds/CDentry.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+
+#include "PgFiles.h"
+#include "DataScan.h"
+#include "include/compat.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "datascan." << __func__ << ": "
+
+// Print the cephfs-data-scan command summary to stdout, followed by the
+// generic client options (generic_client_usage()).
+void DataScan::usage()
+{
+  std::cout << "Usage: \n"
+    << "  cephfs-data-scan init [--force-init]\n"
+    << "  cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
+    << "  cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
+    << "  cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
+    << "  cephfs-data-scan scan_links\n"
+    << "\n"
+    // Bug fix: help text previously read "overrite".
+    << "    --force-corrupt: overwrite apparently corrupt structures\n"
+    << "    --force-init: write root inodes even if they exist\n"
+    << "    --force-pool: use data pool even if it is not in FSMap\n"
+    << "    --worker_m: Maximum number of workers\n"
+    << "    --worker_n: Worker number, range 0-(worker_m-1)\n"
+    << "\n"
+    << "  cephfs-data-scan scan_frags [--force-corrupt]\n"
+    << "  cephfs-data-scan cleanup <data pool name>\n"
+    << std::endl;
+
+  generic_client_usage();
+}
+
+// Consume one "--key value" argument pair; on entry @i points at the
+// key (the caller advances past the value iff we return true).
+// Returns true iff the key was recognized and its value accepted.
+// On a recognized key with a bad value, returns false with *r set to a
+// negative errno — callers must check *r before treating a false
+// return as "not a kwarg".
+bool DataScan::parse_kwarg(
+  const std::vector<const char*> &args,
+  std::vector<const char *>::const_iterator &i,
+  int *r)
+{
+  // Every kwarg needs a following value token.
+  if (i + 1 == args.end()) {
+    return false;
+  }
+
+  const std::string arg(*i);
+  const std::string val(*(i + 1));
+
+  if (arg == std::string("--output-dir")) {
+    // Only one output driver may be selected per invocation.
+    if (driver != NULL) {
+      derr << "Unexpected --output-dir: output already selected!" << dendl;
+      *r = -EINVAL;
+      return false;
+    }
+    dout(4) << "Using local file output to '" << val << "'" << dendl;
+    driver = new LocalFileDriver(val, data_io);
+    return true;
+  } else if (arg == std::string("--worker_n")) {
+    // Worker index 'n' for sharded parallel scans (see forall_objects).
+    std::string err;
+    n = strict_strtoll(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      std::cerr << "Invalid worker number '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    return true;
+  } else if (arg == std::string("--worker_m")) {
+    // Total worker count 'm' for sharded parallel scans.
+    std::string err;
+    m = strict_strtoll(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      std::cerr << "Invalid worker count '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    return true;
+  } else if (arg == std::string("--filter-tag")) {
+    // Tag used to skip already-scanned objects (legacy filtering path).
+    filter_tag = val;
+    dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
+    return true;
+  } else if (arg == std::string("--filesystem")) {
+    // Resolve a filesystem name to its FSCID via the FSMap.
+    std::shared_ptr<const Filesystem> fs;
+    *r = fsmap->parse_filesystem(val, &fs);
+    if (*r != 0) {
+      std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
+      return false;
+    }
+    fscid = fs->fscid;
+    return true;
+  } else if (arg == std::string("--alternate-pool")) {
+    // Override the metadata pool used for output.
+    metadata_pool_name = val;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Consume a single valueless --flag argument pointed at by @i.
+// Returns true iff the flag was recognized (and the matching member
+// force flag was set).
+bool DataScan::parse_arg(
+  const std::vector<const char*> &args,
+  std::vector<const char *>::const_iterator &i)
+{
+  const std::string flag(*i);
+
+  bool recognized = true;
+  if (flag == "--force-pool") {
+    force_pool = true;
+  } else if (flag == "--force-corrupt") {
+    force_corrupt = true;
+  } else if (flag == "--force-init") {
+    force_init = true;
+  } else {
+    recognized = false;
+  }
+  return recognized;
+}
+
+// Entry point for cephfs-data-scan. Parses the positional command and
+// its arguments, connects to RADOS, selects/initializes the output
+// driver and the data/metadata IoCtxs each command needs, then
+// dispatches. Returns 0 on success or a negative errno.
+int DataScan::main(const std::vector<const char*> &args)
+{
+  // Parse args
+  // ==========
+  if (args.size() < 1) {
+    cerr << "missing position argument" << std::endl;
+    return -EINVAL;
+  }
+
+  // Common RADOS init: open metadata pool
+  // =====================================
+  librados::Rados rados;
+  int r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable" << dendl;
+    return r;
+  }
+
+  std::string const &command = args[0];
+  std::string data_pool_name;
+
+  std::string pg_files_path;
+  std::set<pg_t> pg_files_pgs;
+
+  // Consume any known --key val or --flag arguments
+  for (std::vector<const char *>::const_iterator i = args.begin() + 1;
+       i != args.end(); ++i) {
+    if (parse_kwarg(args, i, &r)) {
+      // Skip the kwarg value field
+      ++i;
+      continue;
+    } else if (r) {
+      // parse_kwarg recognized the key but rejected the value.
+      return r;
+    }
+
+    if (parse_arg(args, i)) {
+      continue;
+    }
+
+    // Trailing positional argument
+    if (i + 1 == args.end() &&
+        (command == "scan_inodes"
+         || command == "scan_extents"
+         || command == "cleanup")) {
+      data_pool_name = *i;
+      continue;
+    }
+
+    // pg_files takes a path then one or more PG ids as positionals.
+    if (command == "pg_files") {
+      if (i == args.begin() + 1) {
+        pg_files_path = *i;
+        continue;
+      } else {
+        pg_t pg;
+        bool parsed = pg.parse(*i);
+        if (!parsed) {
+          std::cerr << "Invalid PG '" << *i << "'" << std::endl;
+          return -EINVAL;
+        } else {
+          pg_files_pgs.insert(pg);
+          continue;
+        }
+      }
+
+    }
+
+    // Fall through: unhandled
+    std::cerr << "Unknown argument '" << *i << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // If caller didn't specify a namespace, try to pick
+  // one if only one exists
+  if (fscid == FS_CLUSTER_ID_NONE) {
+    if (fsmap->filesystem_count() == 1) {
+      fscid = fsmap->get_filesystem()->fscid;
+    } else {
+      std::cerr << "Specify a filesystem with --filesystem" << std::endl;
+      return -EINVAL;
+    }
+  }
+  auto fs =  fsmap->get_filesystem(fscid);
+  ceph_assert(fs != nullptr);
+
+  // Default to output to metadata pool
+  if (driver == NULL) {
+    driver = new MetadataDriver();
+    driver->set_force_corrupt(force_corrupt);
+    driver->set_force_init(force_init);
+    dout(4) << "Using metadata pool output" << dendl;
+  }
+
+  dout(4) << "connecting to RADOS..." << dendl;
+  r = rados.connect();
+  if (r < 0) {
+    std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  r = driver->init(rados, metadata_pool_name, fsmap, fscid);
+  if (r < 0) {
+    return r;
+  }
+
+  if (command == "pg_files") {
+    auto pge = PgFiles(objecter, pg_files_pgs);
+    pge.init();
+    return pge.scan_path(pg_files_path);
+  }
+
+  // Initialize data_io for those commands that need it
+  if (command == "scan_inodes" ||
+      command == "scan_extents" ||
+      command == "cleanup") {
+    if (data_pool_name.empty()) {
+      std::cerr << "Data pool not specified" << std::endl;
+      return -EINVAL;
+    }
+
+    data_pool_id = rados.pool_lookup(data_pool_name.c_str());
+    if (data_pool_id < 0) {
+      std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
+      return -ENOENT;
+    } else {
+      dout(4) << "data pool '" << data_pool_name
+        << "' has ID " << data_pool_id << dendl;
+    }
+
+    // Refuse pools outside the filesystem's data-pool set unless the
+    // user explicitly forces it.
+    if (!fs->mds_map.is_data_pool(data_pool_id)) {
+      std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
+        "CephFS data pool!" << std::endl;
+      if (!force_pool) {
+        std::cerr << "Use --force-pool to continue" << std::endl;
+        return -EINVAL;
+      }
+    }
+
+    dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
+    r = rados.ioctx_create(data_pool_name.c_str(), data_io);
+    if (r != 0) {
+      return r;
+    }
+  }
+
+  // Initialize metadata_io from MDSMap for scan_frags
+  if (command == "scan_frags" || command == "scan_links") {
+    const auto fs = fsmap->get_filesystem(fscid);
+    if (fs == nullptr) {
+      std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
+      return -ENOENT;
+    }
+    int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+    dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+    int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+    if (r < 0) {
+      std::cerr << "Pool " << metadata_pool_id
+        << " identified in MDS map not found in RADOS!" << std::endl;
+      return r;
+    }
+
+    r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+    if (r != 0) {
+      return r;
+    }
+
+    data_pools = fs->mds_map.get_data_pools();
+  }
+
+  // Finally, dispatch command
+  if (command == "scan_inodes") {
+    return scan_inodes();
+  } else if (command == "scan_extents") {
+    return scan_extents();
+  } else if (command == "scan_frags") {
+    return scan_frags();
+  } else if (command == "scan_links") {
+    return scan_links();
+  } else if (command == "cleanup") {
+    return cleanup();
+  } else if (command == "init") {
+    return driver->init_roots(fs->mds_map.get_first_data_pool());
+  } else {
+    std::cerr << "Unknown command '" << command << "'" << std::endl;
+    return -EINVAL;
+  }
+}
+
+// Write a synthetic, unlinked inode object ("<ino>.inode") into the
+// metadata pool for system inodes (root, mdsdir). Skips the write if
+// the object already exists unless force_init is set. Returns 0 on
+// success (including the skip case) or a negative errno.
+int MetadataDriver::inject_unlinked_inode(
+    inodeno_t inono, int mode, int64_t data_pool_id)
+{
+  const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
+
+  // Skip if exists
+  bool already_exists = false;
+  int r = root_exists(inono, &already_exists);
+  if (r) {
+    return r;
+  }
+  if (already_exists && !force_init) {
+    std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
+               " exists, skipping create.  Use --force-init to overwrite"
+               " the existing object." << std::endl;
+    return 0;
+  }
+
+  // Compose
+  InodeStore inode_data;
+  auto inode = inode_data.get_inode();
+  inode->ino = inono;
+  inode->version = 1;
+  inode->xattr_version = 1;
+  inode->mode = 0500 | mode;
+  // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
+  // (we won't actually give the *correct* dirstat here though)
+  inode->dirstat.nfiles = 1;
+
+  inode->ctime = inode->mtime = ceph_clock_now();
+  inode->nlink = 1;
+  inode->truncate_size = -1ull;
+  inode->truncate_seq = 1;
+  inode->uid = g_conf()->mds_root_ino_uid;
+  inode->gid = g_conf()->mds_root_ino_gid;
+
+  // Force layout to default: should we let users override this so that
+  // they don't have to mount the filesystem to correct it?
+  inode->layout = file_layout_t::get_default();
+  inode->layout.pool_id = data_pool_id;
+  inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+  // Assume that we will get our stats wrong, and that we may
+  // be ignoring dirfrags that exist
+  inode_data.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
+
+  // Root and mdsdir inodes carry a snaprealm.
+  if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
+    sr_t srnode;
+    srnode.seq = 1;
+    encode(srnode, inode_data.snap_blob);
+  }
+
+  // Serialize: on-disk format is the FS magic string followed by the
+  // encoded InodeStore (order matters for decoding).
+  bufferlist inode_bl;
+  encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
+  inode_data.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+  // Write
+  r = metadata_io.write_full(oid.name, inode_bl);
+  if (r != 0) {
+    derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return r;
+}
+
+// Check whether the ".inode" object for @ino exists in the metadata
+// pool. Sets *result and returns 0 on a definitive answer; other stat
+// errors are propagated unchanged.
+int MetadataDriver::root_exists(inodeno_t ino, bool *result)
+{
+  const object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+
+  uint64_t unused_size;
+  time_t unused_mtime;
+  const int r = metadata_io.stat(oid.name, &unused_size, &unused_mtime);
+  if (r == -ENOENT) {
+    *result = false;
+    return 0;
+  }
+  if (r < 0) {
+    return r;
+  }
+
+  *result = true;
+  return 0;
+}
+
+// Create (if absent) the root inode, mds0's mdsdir inode, and mds0's
+// root dirfrag. Returns 0 on success or the first error encountered.
+int MetadataDriver::init_roots(int64_t data_pool_id)
+{
+  int r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id);
+  if (r != 0) {
+    return r;
+  }
+
+  r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
+  if (r != 0) {
+    return r;
+  }
+
+  // find_or_create_dirfrag reports via this out-param whether it had
+  // to create the dirfrag; only success/failure matters here.
+  bool created = false;
+  return find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
+}
+
+// Verify that both system root inodes (CEPH_INO_ROOT and mds0's
+// mdsdir) exist. On return *result is true only if both are present;
+// RADOS errors from the existence checks are returned unchanged.
+int MetadataDriver::check_roots(bool *result)
+{
+  const inodeno_t roots[] = {CEPH_INO_ROOT, MDS_INO_MDSDIR(0)};
+  for (const auto ino : roots) {
+    int r = root_exists(ino, result);
+    if (r != 0) {
+      return r;
+    }
+    if (!*result) {
+      // First missing root is conclusive: report "absent".
+      return 0;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Stages:
+ *
+ * SERIAL init
+ * 0. Create root inodes if don't exist
+ * PARALLEL scan_extents
+ * 1. Size and mtime recovery: scan ALL objects, and update 0th
+ * objects with max size and max mtime seen.
+ * PARALLEL scan_inodes
+ * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
+ * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
+ * or rstats at this stage. Inodes without backtraces go into
+ * lost+found
+ * TODO: SERIAL "recover stats"
+ * 3. Dirfrag statistics: depth first traverse into metadata tree,
+ * rebuilding dir sizes.
+ * TODO PARALLEL "clean up"
+ * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
+ * anything onto them) and remove any of the xattrs that we
+ * used for accumulating.
+ */
+
+
+// Split a data-pool object name of the form "<ino_hex>.<index_hex>"
+// into inode number and object index. Returns -EINVAL if there is no
+// '.', the '.' is the final character, or either field fails hex
+// parsing; returns 0 on success.
+int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
+{
+  const size_t dot = oid.find(".");
+  if (dot == std::string::npos || dot == oid.size() - 1) {
+    return -EINVAL;
+  }
+
+  std::string err;
+  *inode_no = strict_strtoll(oid.substr(0, dot).c_str(), 16, &err);
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  *obj_id = strict_strtoll(oid.substr(dot + 1).c_str(), 16, &err);
+  if (!err.empty()) {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+
+// Phase 1 of recovery: visit every object in the data pool (this
+// worker's slice) and fold each object's size/mtime into the
+// accumulated metadata on the inode's 0th object via the cls_cephfs
+// class. scan_inodes later reads these accumulations back.
+int DataScan::scan_extents()
+{
+  return forall_objects(data_io, false, [this](
+        std::string const &oid,
+        uint64_t obj_name_ino,
+        uint64_t obj_name_offset) -> int
+  {
+    // Read size
+    uint64_t size;
+    time_t mtime;
+    int r = data_io.stat(oid, &size, &mtime);
+    dout(10) << "handling object " << obj_name_ino
+             << "." << obj_name_offset << dendl;
+    if (r != 0) {
+      dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
+      return r;
+    }
+
+    // I need to keep track of
+    //  * The highest object ID seen
+    //  * The size of the highest object ID seen
+    //  * The largest object seen
+    //
+    //  Given those things, I can later infer the object chunking
+    //  size, the offset of the last object (chunk size * highest ID seen)
+    //  and the actual size (offset of last object + size of highest ID seen)
+    //
+    //  This logic doesn't take account of striping.
+    r = ClsCephFSClient::accumulate_inode_metadata(
+        data_io,
+        obj_name_ino,
+        obj_name_offset,
+        size,
+        mtime);
+    if (r < 0) {
+      derr << "Failed to accumulate metadata data from '"
+        << oid << "': " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    return r;
+  });
+}
+
+// Probe whether the OSDs accept the cephfs PGLS filter by issuing a
+// one-object filtered listing with a dummy tag. Returns nonzero if the
+// filtered listing succeeds, 0 if it is rejected (legacy client-side
+// filtering must be used). Note: int return is used as a boolean by
+// forall_objects().
+int DataScan::probe_filter(librados::IoCtx &ioctx)
+{
+  bufferlist filter_bl;
+  ClsCephFSClient::build_tag_filter("test", &filter_bl);
+
+  // (Removed two unused ObjectCursor locals that were never read.)
+  std::vector<librados::ObjectItem> tmp_result;
+  librados::ObjectCursor tmp_next;
+  int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
+                            1, filter_bl, &tmp_result, &tmp_next);
+
+  return r >= 0;
+}
+
+// Iterate over this worker's slice (worker n of m) of the objects in
+// @ioctx, calling @handler(oid, inode_no, object_index) for each object
+// whose name parses as "<ino>.<index>". When @untagged_only is set,
+// objects already carrying this scan's filter tag are skipped — either
+// server-side via the cephfs PGLS filter, or client-side on legacy OSDs
+// that don't support it (see probe_filter).
+//
+// Returns a listing error immediately; otherwise returns the first
+// error reported by @handler (iteration continues past handler
+// errors), or 0.
+int DataScan::forall_objects(
+    librados::IoCtx &ioctx,
+    bool untagged_only,
+    std::function<int(std::string, uint64_t, uint64_t)> handler
+    )
+{
+  librados::ObjectCursor range_i;
+  librados::ObjectCursor range_end;
+  ioctx.object_list_slice(
+      ioctx.object_list_begin(),
+      ioctx.object_list_end(),
+      n,
+      m,
+      &range_i,
+      &range_end);
+
+  bufferlist filter_bl;
+
+  bool legacy_filtering = false;
+  if (untagged_only) {
+    // probe to deal with older OSDs that don't support
+    // the cephfs pgls filtering mode
+    legacy_filtering = !probe_filter(ioctx);
+    if (!legacy_filtering) {
+      ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
+    }
+  }
+
+  // First handler error seen; bug fix: the listing result used to be
+  // stored in an inner 'int r' that shadowed this accumulator, so
+  // handler errors were silently discarded and we always returned 0.
+  int r = 0;
+  while(range_i < range_end) {
+    std::vector<librados::ObjectItem> result;
+    int list_r = ioctx.object_list(range_i, range_end, 1,
+                                   filter_bl, &result, &range_i);
+    if (list_r < 0) {
+      derr << "Unexpected error listing objects: " << cpp_strerror(list_r) << dendl;
+      return list_r;
+    }
+
+    for (const auto &i : result) {
+      const std::string &oid = i.oid;
+      uint64_t obj_name_ino = 0;
+      uint64_t obj_name_offset = 0;
+      int parse_r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+      if (parse_r != 0) {
+        dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+        continue;
+      }
+
+      if (untagged_only && legacy_filtering) {
+        dout(20) << "Applying filter to " << oid << dendl;
+
+        // We are only interested in 0th objects during this phase: we touched
+        // the other objects during scan_extents
+        if (obj_name_offset != 0) {
+          dout(20) << "Non-zeroth object" << dendl;
+          continue;
+        }
+
+        bufferlist scrub_tag_bl;
+        int xattr_r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
+        if (xattr_r >= 0) {
+          std::string read_tag;
+          auto q = scrub_tag_bl.cbegin();
+          try {
+            decode(read_tag, q);
+            if (read_tag == filter_tag) {
+              dout(20) << "skipping " << oid << " because it has the filter_tag"
+                       << dendl;
+              continue;
+            }
+          } catch (const buffer::error &err) {
+          }
+          dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
+        } else {
+          dout(20) << "no tag read (" << xattr_r << ")" << dendl;
+        }
+
+      } else if (untagged_only) {
+        // Server-side filter already excluded tagged/non-zeroth objects.
+        ceph_assert(obj_name_offset == 0);
+        dout(20) << "OSD matched oid " << oid << dendl;
+      }
+
+      int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
+      if (r == 0 && this_oid_r < 0) {
+        r = this_oid_r;
+      }
+    }
+  }
+
+  return r;
+}
+
+int DataScan::scan_inodes()
+{
+ bool roots_present;
+ int r = driver->check_roots(&roots_present);
+ if (r != 0) {
+ derr << "Unexpected error checking roots: '"
+ << cpp_strerror(r) << "'" << dendl;
+ return r;
+ }
+
+ if (!roots_present) {
+ std::cerr << "Some or all system inodes are absent. Run 'init' from "
+ "one node before running 'scan_inodes'" << std::endl;
+ return -EIO;
+ }
+
+ return forall_objects(data_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+
+ dout(10) << "handling object "
+ << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
+ << dendl;
+
+ AccumulateResult accum_res;
+ inode_backtrace_t backtrace;
+ file_layout_t loaded_layout = file_layout_t::get_default();
+ r = ClsCephFSClient::fetch_inode_accumulate_result(
+ data_io, oid, &backtrace, &loaded_layout, &accum_res);
+
+ if (r == -EINVAL) {
+ dout(4) << "Accumulated metadata missing from '"
+ << oid << ", did you run scan_extents?" << dendl;
+ return r;
+ } else if (r < 0) {
+ dout(4) << "Unexpected error loading accumulated metadata from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ // FIXME: this creates situation where if a client has a corrupt
+ // backtrace/layout, we will fail to inject it. We should (optionally)
+ // proceed if the backtrace/layout is corrupt but we have valid
+ // accumulated metadata.
+ return r;
+ }
+
+ const time_t file_mtime = accum_res.max_mtime;
+ uint64_t file_size = 0;
+ bool have_backtrace = !(backtrace.ancestors.empty());
+
+ // This is the layout we will use for injection, populated either
+ // from loaded_layout or from best guesses
+ file_layout_t guessed_layout;
+ guessed_layout.pool_id = data_pool_id;
+
+ // Calculate file_size, guess the layout
+ if (accum_res.ceiling_obj_index > 0) {
+ uint32_t chunk_size = file_layout_t::get_default().object_size;
+ // When there are multiple objects, the largest object probably
+ // indicates the chunk size. But not necessarily, because files
+ // can be sparse. Only make this assumption if size seen
+ // is a power of two, as chunk sizes typically are.
+ if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
+ chunk_size = accum_res.max_obj_size;
+ }
+
+ if (loaded_layout.pool_id == -1) {
+ // If no stashed layout was found, guess it
+ guessed_layout.object_size = chunk_size;
+ guessed_layout.stripe_unit = chunk_size;
+ guessed_layout.stripe_count = 1;
+ } else if (!loaded_layout.is_valid() ||
+ loaded_layout.object_size < accum_res.max_obj_size) {
+ // If the max size seen exceeds what the stashed layout claims, then
+ // disbelieve it. Guess instead. Same for invalid layouts on disk.
+ dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
+ << std::dec << ", ignoring in favour of best guess" << dendl;
+ guessed_layout.object_size = chunk_size;
+ guessed_layout.stripe_unit = chunk_size;
+ guessed_layout.stripe_count = 1;
+ } else {
+ // We have a stashed layout that we can't disprove, so apply it
+ guessed_layout = loaded_layout;
+ dout(20) << "loaded layout from xattr:"
+ << " os: " << guessed_layout.object_size
+ << " sc: " << guessed_layout.stripe_count
+ << " su: " << guessed_layout.stripe_unit
+ << dendl;
+ // User might have transplanted files from a pool with a different
+ // ID, so whatever the loaded_layout says, we'll force the injected
+ // layout to point to the pool we really read from
+ guessed_layout.pool_id = data_pool_id;
+ }
+
+ if (guessed_layout.stripe_count == 1) {
+ // Unstriped file: simple chunking
+ file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
+ + accum_res.ceiling_obj_size;
+ } else {
+ // Striped file: need to examine the last stripe_count objects
+ // in the file to determine the size.
+
+ // How many complete (i.e. not last stripe) objects?
+ uint64_t complete_objs = 0;
+ if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
+ complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
+ } else {
+ complete_objs = 0;
+ }
+
+ // How many potentially-short objects (i.e. last stripe set) objects?
+ uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
+
+ dout(10) << "calculating striped size from complete objs: "
+ << complete_objs << ", partial objs: " << partial_objs
+ << dendl;
+
+ // Maximum amount of data that may be in the incomplete objects
+ uint64_t incomplete_size = 0;
+
+ // For each short object, calculate the max file size within it
+ // and accumulate the maximum
+ for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
+ char buf[60];
+ snprintf(buf, sizeof(buf), "%llx.%08llx",
+ (long long unsigned)obj_name_ino, (long long unsigned)i);
+
+ uint64_t osize(0);
+ time_t omtime(0);
+ r = data_io.stat(std::string(buf), &osize, &omtime);
+ if (r == 0) {
+ if (osize > 0) {
+ // Upper bound within this object
+ uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
+ * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
+ + (i % guessed_layout.stripe_count)
+ * guessed_layout.stripe_unit + (osize - 1)
+ % guessed_layout.stripe_unit + 1;
+ incomplete_size = std::max(incomplete_size, upper_size);
+ }
+ } else if (r == -ENOENT) {
+ // Absent object, treat as size 0 and ignore.
+ } else {
+ // Unexpected error, carry r to outer scope for handling.
+ break;
+ }
+ }
+ if (r != 0 && r != -ENOENT) {
+ derr << "Unexpected error checking size of ino 0x" << std::hex
+ << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ file_size = complete_objs * guessed_layout.object_size
+ + incomplete_size;
+ }
+ } else {
+ file_size = accum_res.ceiling_obj_size;
+ if (loaded_layout.pool_id < 0
+ || loaded_layout.object_size < accum_res.max_obj_size) {
+ // No layout loaded, or inconsistent layout, use default
+ guessed_layout = file_layout_t::get_default();
+ guessed_layout.pool_id = data_pool_id;
+ } else {
+ guessed_layout = loaded_layout;
+ }
+ }
+
+  // Sanity checking backtrace ino against object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ InodeStore dentry;
+ build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
+
+ // Inject inode to the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
+
+int DataScan::cleanup()
+{
+  // Strip the accumulated scan metadata off the data pool again.  Only
+  // zeroth objects are visited (second argument 'true'), since that is
+  // where the scan results were accumulated.
+  return forall_objects(data_io, true, [this](
+      std::string const &oid,
+      uint64_t obj_name_ino,
+      uint64_t obj_name_offset) -> int
+  {
+    const int r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
+    if (r < 0) {
+      dout(4) << "Error deleting accumulated metadata from '"
+              << oid << "': " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  });
+}
+
+// Return true if `ino` plausibly belongs to this filesystem: either it is
+// in the regular allocated range (>= 2^40 -- elsewhere in this tool inos
+// below 1<<40 are treated as system inos), or it is one of the special
+// MDS-owned inodes (stray dir, mdsdir, root, or CEPH_INO_CEPH).
+bool DataScan::valid_ino(inodeno_t ino) const
+{
+  return (ino >= inodeno_t((1ull << 40)))
+    || (MDS_INO_IS_STRAY(ino))
+    || (MDS_INO_IS_MDSDIR(ino))
+    || ino == CEPH_INO_ROOT
+    || ino == CEPH_INO_CEPH;
+}
+
+/**
+ * Check and repair dentry linkage across the whole metadata pool.
+ *
+ * Two passes over every dirfrag object (SCAN_INOS then CHECK_LINK):
+ *  - SCAN_INOS records which inos are used by primary dentries (and which
+ *    are multiply linked), plus remote-link reference counts.
+ *  - CHECK_LINK finds bad nlink counts, duplicate primaries and dangling
+ *    remote links, and gathers snapshot metadata.
+ * Afterwards: duplicate/dangling dentries are removed, bad nlinks and
+ * previously-injected dentries are rewritten, and the per-rank inotables
+ * and the snaptable are brought up to date.
+ *
+ * Requires the metadata driver (i.e. no --output-dir); returns 0 on
+ * success or a negative errno.
+ */
+int DataScan::scan_links()
+{
+  MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
+  if (!metadata_driver) {
+    derr << "Unexpected --output-dir option for scan_links" << dendl;
+    return -EINVAL;
+  }
+
+  interval_set<uint64_t> used_inos;      // inos seen in primary dentries
+  map<inodeno_t, int> remote_links;      // ino -> count of remote links to it
+  map<snapid_t, SnapInfo> snaps;         // snapshots gathered from snaprealms
+  snapid_t last_snap = 1;
+  snapid_t snaprealm_v2_since = 2;
+
+  // Where a dentry for an inode lives, plus enough of the inode to pick
+  // the newest copy among duplicates.
+  struct link_info_t {
+    inodeno_t dirino;
+    frag_t frag;
+    string name;
+    version_t version;
+    int nlink;
+    bool is_dir;
+    map<snapid_t, SnapInfo> snaps;
+    link_info_t() : version(0), nlink(0), is_dir(false) {}
+    link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) :
+      dirino(di), frag(df), name(n),
+      version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {}
+    dirfrag_t dirfrag() const {
+      return dirfrag_t(dirino, frag);
+    }
+  };
+  map<inodeno_t, list<link_info_t> > dup_primaries;  // ino -> all primary dentries found for it
+  map<inodeno_t, link_info_t> bad_nlink_inos;        // inos whose stored nlink is wrong
+  map<inodeno_t, link_info_t> injected_inos;         // dentries injected by earlier scans (dnfirst == CEPH_NOSNAP)
+
+  map<dirfrag_t, set<string> > to_remove;            // dentry keys to delete per dirfrag
+
+  enum {
+    SCAN_INOS = 1,
+    CHECK_LINK,
+  };
+
+  for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
+    const librados::NObjectIterator it_end = metadata_io.nobjects_end();
+    for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
+      const std::string oid = it->get_oid();
+
+      dout(10) << "step " << step << ": handling object " << oid << dendl;
+
+      uint64_t dir_ino = 0;
+      uint64_t frag_id = 0;
+      int r = parse_oid(oid, &dir_ino, &frag_id);
+      if (r == -EINVAL) {
+        dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
+        continue;
+      } else {
+        // parse_oid can only do 0 or -EINVAL
+        ceph_assert(r == 0);
+      }
+
+      if (!valid_ino(dir_ino)) {
+        dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
+        continue;
+      }
+
+      // Each omap entry is one dentry in this dirfrag.
+      std::map<std::string, bufferlist> items;
+      r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
+      if (r < 0) {
+        derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      for (auto& p : items) {
+        auto q = p.second.cbegin();
+        string dname;
+        snapid_t last;
+        dentry_key_t::decode_helper(p.first, dname, last);
+
+        // Only HEAD dentries (last == CEPH_NOSNAP) are checked for
+        // linkage; snapshotted dentries just advance last_snap.
+        if (last != CEPH_NOSNAP) {
+          if (last > last_snap)
+            last_snap = last;
+          continue;
+        }
+
+        try {
+          snapid_t dnfirst;
+          decode(dnfirst, q);
+          if (dnfirst <= CEPH_MAXSNAP) {
+            if (dnfirst - 1 > last_snap)
+              last_snap = dnfirst - 1;
+          }
+          char dentry_type;
+          decode(dentry_type, q);
+          mempool::mds_co::string alternate_name;
+          if (dentry_type == 'I' || dentry_type == 'i') {
+            // Primary dentry: embeds the full inode ('i' is the newer
+            // versioned encoding carrying an alternate_name).
+            InodeStore inode;
+            if (dentry_type == 'i') {
+              DECODE_START(2, q);
+              if (struct_v >= 2)
+                decode(alternate_name, q);
+              inode.decode(q);
+              DECODE_FINISH(q);
+            } else {
+              inode.decode_bare(q);
+            }
+
+            inodeno_t ino = inode.inode->ino;
+
+            if (step == SCAN_INOS) {
+              if (used_inos.contains(ino, 1)) {
+                // Second primary for this ino: operator[] creates the
+                // (empty) duplicate list; size() result is discarded.
+                dup_primaries[ino].size();
+              } else {
+                used_inos.insert(ino);
+              }
+            } else if (step == CHECK_LINK) {
+              sr_t srnode;
+              if (inode.snap_blob.length()) {
+                // Decode the snaprealm and drop entries that do not
+                // belong to this inode / snapid.
+                auto p = inode.snap_blob.cbegin();
+                decode(srnode, p);
+                for (auto it = srnode.snaps.begin();
+                     it != srnode.snaps.end(); ) {
+                  if (it->second.ino != ino ||
+                      it->second.snapid != it->first) {
+                    srnode.snaps.erase(it++);
+                  } else {
+                    ++it;
+                  }
+                }
+                if (!srnode.past_parents.empty()) {
+                  snapid_t last = srnode.past_parents.rbegin()->first;
+                  if (last + 1 > snaprealm_v2_since)
+                    snaprealm_v2_since = last + 1;
+                }
+              }
+              if (inode.old_inodes && !inode.old_inodes->empty()) {
+                auto _last_snap = inode.old_inodes->rbegin()->first;
+                if (_last_snap > last_snap)
+                  last_snap = _last_snap;
+              }
+              auto q = dup_primaries.find(ino);
+              if (q != dup_primaries.end()) {
+                // Ino has multiple primaries: remember them all, resolve later.
+                q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
+                q->second.back().snaps.swap(srnode.snaps);
+              } else {
+                // Expected nlink = remote links + 1 for the primary,
+                // unless the primary lives in a stray dir.
+                int nlink = 0;
+                auto r = remote_links.find(ino);
+                if (r != remote_links.end())
+                  nlink = r->second;
+                if (!MDS_INO_IS_STRAY(dir_ino))
+                  nlink++;
+                if (inode.inode->nlink != nlink) {
+                  derr << "Bad nlink on " << ino << " expected " << nlink
+                       << " has " << inode.inode->nlink << dendl;
+                  bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+                  bad_nlink_inos[ino].nlink = nlink;
+                }
+                snaps.insert(make_move_iterator(begin(srnode.snaps)),
+                             make_move_iterator(end(srnode.snaps)));
+              }
+              // dnfirst == CEPH_NOSNAP marks dentries injected by a
+              // previous recovery run; they get a proper 'first' later.
+              if (dnfirst == CEPH_NOSNAP)
+                injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+            }
+          } else if (dentry_type == 'L' || dentry_type == 'l') {
+            // Remote (hard link) dentry: references an ino by number.
+            inodeno_t ino;
+            unsigned char d_type;
+            CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q);
+
+            if (step == SCAN_INOS) {
+              remote_links[ino]++;
+            } else if (step == CHECK_LINK) {
+              if (!used_inos.contains(ino, 1)) {
+                // Dangling remote link: target has no primary; remove it.
+                derr << "Bad remote link dentry 0x" << std::hex << dir_ino
+                     << std::dec << "/" << dname
+                     << ", ino " << ino << " not found" << dendl;
+                std::string key;
+                dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+                dn_key.encode(key);
+                to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
+              }
+            }
+          } else {
+            derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
+                 << std::dec << "/" << dname << dendl;
+            return -EINVAL;
+          }
+        } catch (const buffer::error &err) {
+          derr << "Error decoding dentry 0x" << std::hex << dir_ino
+               << std::dec << "/" << dname << dendl;
+          return -EINVAL;
+        }
+      }
+    }
+  }
+
+  // Derive, per MDS rank, the highest ino that must be marked consumed
+  // in that rank's inotable.  Inos appear to be partitioned by rank in
+  // the top bits (rank = (ino >> 40) - 1) -- NOTE(review): inferred from
+  // the shifts below, confirm against the ino allocation scheme.
+  map<unsigned, uint64_t> max_ino_map;
+  {
+    auto prev_max_ino = (uint64_t)1 << 40;
+    for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
+      auto cur_max = p.get_start() + p.get_len() - 1;
+      if (cur_max < prev_max_ino)
+        continue; // system inodes
+
+      if ((prev_max_ino >> 40) != (cur_max >> 40)) {
+        unsigned rank = (prev_max_ino >> 40) - 1;
+        max_ino_map[rank] = prev_max_ino;
+      } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
+        unsigned rank = (p.get_start() >> 40) - 1;
+        max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
+      }
+      prev_max_ino = cur_max;
+    }
+    unsigned rank = (prev_max_ino >> 40) - 1;
+    max_ino_map[rank] = prev_max_ino;
+  }
+
+  used_inos.clear();
+
+  dout(10) << "processing " << dup_primaries.size() << " dup_primaries, "
+           << remote_links.size() << " remote_links" << dendl;
+
+  for (auto& p : dup_primaries) {
+
+    dout(10) << "handling dup " << p.first << dendl;
+
+    // Keep the highest-version dentry; on a tie prefer a non-stray
+    // location over a stray one.
+    link_info_t newest;
+    for (auto& q : p.second) {
+      if (q.version > newest.version) {
+        newest = q;
+      } else if (q.version == newest.version &&
+                 !MDS_INO_IS_STRAY(q.dirino) &&
+                 MDS_INO_IS_STRAY(newest.dirino)) {
+        newest = q;
+      }
+    }
+
+    for (auto& q : p.second) {
+      // in the middle of dir fragmentation?
+      if (newest.dirino == q.dirino && newest.name == q.name) {
+        snaps.insert(make_move_iterator(begin(q.snaps)),
+                     make_move_iterator(end(q.snaps)));
+        continue;
+      }
+
+      // Schedule every other copy for removal.
+      std::string key;
+      dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
+      dn_key.encode(key);
+      to_remove[q.dirfrag()].insert(key);
+      derr << "Remove duplicated ino 0x" << p.first << " from "
+           << q.dirfrag() << "/" << q.name << dendl;
+    }
+
+    // Re-derive the expected nlink for the surviving dentry.
+    int nlink = 0;
+    auto q = remote_links.find(p.first);
+    if (q != remote_links.end())
+      nlink = q->second;
+    if (!MDS_INO_IS_STRAY(newest.dirino))
+      nlink++;
+
+    if (nlink != newest.nlink) {
+      derr << "Bad nlink on " << p.first << " expected " << nlink
+           << " has " << newest.nlink << dendl;
+      bad_nlink_inos[p.first] = newest;
+      bad_nlink_inos[p.first].nlink = nlink;
+    }
+  }
+  dup_primaries.clear();
+  remote_links.clear();
+
+  {
+    // last_snap must also cover snap sequence numbers used by the data
+    // pools and any snapshots we collected above.
+    objecter->with_osdmap([&](const OSDMap& o) {
+      for (auto p : data_pools) {
+        const pg_pool_t *pi = o.get_pg_pool(p);
+        if (!pi)
+          continue;
+        if (pi->snap_seq > last_snap)
+          last_snap = pi->snap_seq;
+      }
+    });
+
+    if (!snaps.empty()) {
+      if (snaps.rbegin()->first > last_snap)
+        last_snap = snaps.rbegin()->first;
+    }
+  }
+
+  dout(10) << "removing dup dentries from " << to_remove.size() << " objects"
+           << dendl;
+
+  for (auto& p : to_remove) {
+    object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
+
+    dout(10) << "removing dup dentries from " << p.first << dendl;
+
+    int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
+    if (r != 0) {
+      derr << "Error removing duplicated dentries from " << p.first << dendl;
+      return r;
+    }
+  }
+  to_remove.clear();
+
+  dout(10) << "processing " << bad_nlink_inos.size() << " bad_nlink_inos"
+           << dendl;
+
+  for (auto &p : bad_nlink_inos) {
+    dout(10) << "handling bad_nlink_ino " << p.first << dendl;
+
+    InodeStore inode;
+    snapid_t first;
+    int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+    if (r < 0) {
+      derr << "Unexpected error reading dentry "
+           << p.second.dirfrag() << "/" << p.second.name
+           << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    // Skip if the dentry changed since we scanned it.
+    if (inode.inode->ino != p.first || inode.inode->version != p.second.version)
+      continue;
+
+    inode.get_inode()->nlink = p.second.nlink;
+    r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+    if (r < 0)
+      return r;
+  }
+
+  dout(10) << "processing " << injected_inos.size() << " injected_inos"
+           << dendl;
+
+  // Give previously-injected dentries (first == CEPH_NOSNAP) a real
+  // 'first' snapid beyond every known snapshot.
+  for (auto &p : injected_inos) {
+    dout(10) << "handling injected_ino " << p.first << dendl;
+
+    InodeStore inode;
+    snapid_t first;
+    int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+    if (r < 0) {
+      derr << "Unexpected error reading dentry "
+           << p.second.dirfrag() << "/" << p.second.name
+           << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if (first != CEPH_NOSNAP)
+      continue;
+
+    first = last_snap + 1;
+    r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+    if (r < 0)
+      return r;
+  }
+
+  dout(10) << "updating inotable" << dendl;
+
+  // Ensure every rank's inotable regards all observed inos as consumed.
+  for (auto& p : max_ino_map) {
+    InoTable inotable(nullptr);
+    inotable.set_rank(p.first);
+    bool dirty = false;
+    int r = metadata_driver->load_table(&inotable);
+    if (r < 0) {
+      inotable.reset_state();
+      dirty = true;
+    }
+    if (inotable.force_consume_to(p.second))
+      dirty = true;
+    if (dirty) {
+      r = metadata_driver->save_table(&inotable);
+      if (r < 0)
+        return r;
+    }
+  }
+
+  dout(10) << "updating snaptable" << dendl;
+
+  {
+    SnapServer snaptable;
+    snaptable.set_rank(0);
+    bool dirty = false;
+    int r = metadata_driver->load_table(&snaptable);
+    if (r < 0) {
+      snaptable.reset_state();
+      dirty = true;
+    }
+    if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
+      dirty = true;
+    if (dirty) {
+      r = metadata_driver->save_table(&snaptable);
+      if (r < 0)
+        return r;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Scan the metadata pool for zeroth dirfrag objects and re-inject a
+ * dentry for each directory inode found: at the location indicated by
+ * its backtrace when one is present, otherwise into lost+found.
+ *
+ * Returns 0 (individual per-object failures are logged by the lambda),
+ * or a negative errno from the initial roots check / object iteration.
+ */
+int DataScan::scan_frags()
+{
+  bool roots_present;
+  int r = driver->check_roots(&roots_present);
+  if (r != 0) {
+    derr << "Unexpected error checking roots: '"
+         << cpp_strerror(r) << "'" << dendl;
+    return r;
+  }
+
+  if (!roots_present) {
+    std::cerr << "Some or all system inodes are absent. Run 'init' from "
+      "one node before running 'scan_inodes'" << std::endl;
+    return -EIO;
+  }
+
+  return forall_objects(metadata_io, true, [this](
+      std::string const &oid,
+      uint64_t obj_name_ino,
+      uint64_t obj_name_offset) -> int
+  {
+    int r = 0;
+    r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+    if (r != 0) {
+      dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+      return r;
+    }
+
+    if (obj_name_ino < (1ULL << 40)) {
+      // FIXME: we're skipping stray dirs here: if they're
+      // orphaned then we should be resetting them some other
+      // way
+      dout(10) << "Skipping system ino " << obj_name_ino << dendl;
+      return 0;
+    }
+
+    AccumulateResult accum_res;
+    inode_backtrace_t backtrace;
+
+    // Default to inherit layout (i.e. no explicit layout on dir) which is
+    // expressed as a zeroed layout struct (see inode_t::has_layout)
+    file_layout_t loaded_layout;
+
+    // Read the "parent" (backtrace) and "layout" xattrs in one round trip.
+    int parent_r = 0;
+    bufferlist parent_bl;
+    int layout_r = 0;
+    bufferlist layout_bl;
+    bufferlist op_bl;
+
+    librados::ObjectReadOperation op;
+    op.getxattr("parent", &parent_bl, &parent_r);
+    op.getxattr("layout", &layout_bl, &layout_r);
+    r = metadata_io.operate(oid, &op, &op_bl);
+    if (r != 0 && r != -ENODATA) {
+      // Report the status of the failing operate() itself, not the
+      // per-xattr rcode (which was the bug here previously).
+      derr << "Unexpected error reading backtrace: " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    if (parent_r != -ENODATA) {
+      try {
+        auto q = parent_bl.cbegin();
+        backtrace.decode(q);
+      } catch (buffer::error &e) {
+        dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl;
+        if (!force_corrupt) {
+          return -EINVAL;
+        } else {
+          // Treat backtrace as absent: we'll inject into lost+found
+          backtrace = inode_backtrace_t();
+        }
+      }
+    }
+
+    if (layout_r != -ENODATA) {
+      try {
+        auto q = layout_bl.cbegin();
+        decode(loaded_layout, q);
+      } catch (buffer::error &e) {
+        dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl;
+        if (!force_corrupt) {
+          return -EINVAL;
+        }
+      }
+    }
+
+    bool have_backtrace = !(backtrace.ancestors.empty());
+
+    // Sanity checking backtrace ino against object name
+    if (have_backtrace && backtrace.ino != obj_name_ino) {
+      dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+              << " doesn't match object name ino 0x" << obj_name_ino
+              << std::dec << dendl;
+      have_backtrace = false;
+    }
+
+    // Load the fnode from the dirfrag's omap header to recover dirstats.
+    uint64_t fnode_version = 0;
+    fnode_t fnode;
+    r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
+    if (r == -EINVAL) {
+      derr << "Corrupt fnode on " << oid << dendl;
+      if (force_corrupt) {
+        // Fabricate minimal, non-empty stats so the dir isn't ignored.
+        fnode.fragstat.mtime = 0;
+        fnode.fragstat.nfiles = 1;
+        fnode.fragstat.nsubdirs = 0;
+        fnode.accounted_fragstat = fnode.fragstat;
+      } else {
+        return r;
+      }
+    }
+
+    InodeStore dentry;
+    build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
+                     loaded_layout, &dentry);
+
+    // Inject inode to the metadata pool
+    if (have_backtrace) {
+      inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+      if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+        /* Special case for strays: even if we have a good backtrace,
+         * don't put it in the stray dir, because while that would technically
+         * give it linkage it would still be invisible to the user */
+        r = driver->inject_lost_and_found(obj_name_ino, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+                  << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      } else {
+        /* Happy case: we will inject a named dentry for this inode */
+        r = driver->inject_with_backtrace(backtrace, dentry);
+        if (r < 0) {
+          dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+                  << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+          if (r == -EINVAL) {
+            dout(4) << "Use --force-corrupt to overwrite structures that "
+                       "appear to be corrupt" << dendl;
+          }
+        }
+      }
+    } else {
+      /* Backtrace-less case: we will inject a lost+found dentry */
+      r = driver->inject_lost_and_found(
+          obj_name_ino, dentry);
+      if (r < 0) {
+        dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+                << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+        if (r == -EINVAL) {
+          dout(4) << "Use --force-corrupt to overwrite structures that "
+                     "appear to be corrupt" << dendl;
+        }
+      }
+    }
+
+    return r;
+  });
+}
+
+// Load the fnode of dirfrag (ino, frag) out of the omap header of the
+// corresponding dirfrag object.  *last_version is always set from the
+// IoCtx (even when the read fails).  Returns 0, a rados error from the
+// omap read, or -EINVAL if the header cannot be decoded.
+int MetadataTool::read_fnode(
+    inodeno_t ino, frag_t frag, fnode_t *fnode,
+    uint64_t *last_version)
+{
+  ceph_assert(fnode != NULL);
+
+  const object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
+  bufferlist header_bl;
+  const int r = metadata_io.omap_get_header(frag_oid.name, &header_bl);
+  *last_version = metadata_io.get_last_version();
+  if (r < 0) {
+    return r;
+  }
+
+  try {
+    auto it = header_bl.cbegin();
+    fnode->decode(it);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/**
+ * Read the HEAD (CEPH_NOSNAP) dentry `dname` out of dirfrag
+ * (parent_ino, frag) and decode its embedded inode into *inode.
+ * If dnfirst is non-null it receives the dentry's 'first' snapid.
+ *
+ * Returns 0 on success, -ENOENT if the dentry is absent, -EINVAL if it
+ * is not a primary ('I'/'i') dentry or fails to decode, or a rados
+ * error from the omap read.
+ */
+int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
+                const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
+{
+  ceph_assert(inode != NULL);
+
+  std::string key;
+  dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+  dn_key.encode(key);
+
+  std::set<std::string> keys;
+  keys.insert(key);
+  std::map<std::string, bufferlist> vals;
+  object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
+  int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
+  dout(20) << "oid=" << frag_oid.name
+           << " dname=" << dname
+           << " frag=" << frag
+           << ", r=" << r << dendl;
+  if (r < 0) {
+    return r;
+  }
+
+  // Single lookup instead of find() followed by operator[].
+  auto it = vals.find(key);
+  if (it == vals.end()) {
+    dout(20) << key << " not found in result" << dendl;
+    return -ENOENT;
+  }
+
+  try {
+    auto q = it->second.cbegin();
+    snapid_t first;
+    decode(first, q);
+    char dentry_type;
+    decode(dentry_type, q);
+    if (dentry_type == 'I' || dentry_type == 'i') {
+      if (dentry_type == 'i') {
+        // Newer versioned encoding: may carry an alternate_name.
+        mempool::mds_co::string alternate_name;
+
+        DECODE_START(2, q);
+        if (struct_v >= 2)
+          decode(alternate_name, q);
+        inode->decode(q);
+        DECODE_FINISH(q);
+      } else {
+        inode->decode_bare(q);
+      }
+    } else {
+      // Fixed missing space between the adjacent string literals
+      // (previously logged as "cannotread").
+      dout(20) << "dentry type '" << dentry_type << "': cannot "
+                  "read an inode out of that" << dendl;
+      return -EINVAL;
+    }
+    if (dnfirst)
+      *dnfirst = first;
+  } catch (const buffer::error &err) {
+    dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
+             << std::dec << "/" << dname << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Read an MDS table object (e.g. inotable, snaptable) in full and decode
+// it into *table, replaying the on-disk version.  Returns 0, a rados
+// read error, or -EIO on decode failure.
+int MetadataDriver::load_table(MDSTable *table)
+{
+  const object_t table_oid = table->get_object_name();
+
+  bufferlist bl;
+  const int read_r = metadata_io.read(table_oid.name, bl, 0, 0);
+  if (read_r < 0) {
+    derr << "unable to read mds table '" << table_oid.name << "': "
+         << cpp_strerror(read_r) << dendl;
+    return read_r;
+  }
+
+  try {
+    auto p = bl.cbegin();
+    version_t table_ver;
+    decode(table_ver, p);
+    table->decode_state(p);
+    table->force_replay_version(table_ver);
+  } catch (const buffer::error &err) {
+    derr << "unable to decode mds table '" << table_oid.name << "': "
+         << err.what() << dendl;
+    return -EIO;
+  }
+  return 0;
+}
+
+// Serialize *table (version followed by state) and overwrite its object
+// in the metadata pool.  Returns 0 or the rados write error.
+int MetadataDriver::save_table(MDSTable *table)
+{
+  const object_t table_oid = table->get_object_name();
+
+  bufferlist bl;
+  encode(table->get_version(), bl);
+  table->encode_state(bl);
+
+  const int r = metadata_io.write_full(table_oid.name, bl);
+  if (r != 0) {
+    derr << "error updating mds table " << table_oid.name
+         << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Link `dentry` (the recovered inode `ino`) into the lost+found
+ * directory, creating /lost+found (ino CEPH_INO_LOST_AND_FOUND) and its
+ * zeroth dirfrag on first use.  Returns 0 or a negative errno.
+ */
+int MetadataDriver::inject_lost_and_found(
+    inodeno_t ino, const InodeStore &dentry)
+{
+  // Create lost+found if doesn't exist
+  bool created = false;
+  int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
+  if (r < 0) {
+    return r;
+  }
+  InodeStore lf_ino;
+  r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
+  if (r == -ENOENT || r == -EINVAL) {
+    if (r == -EINVAL && !force_corrupt) {
+      return r;
+    }
+
+    // To have a directory not specify a layout, give it zeros (see
+    // inode_t::has_layout)
+    file_layout_t inherit_layout;
+
+    // Construct LF inode
+    frag_info_t fragstat;
+    fragstat.nfiles = 1;  // was "= 1," (comma operator) -- typo fixed
+    build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
+
+    // Inject link to LF inode in the root dir
+    r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
+    if (r < 0) {
+      return r;
+    }
+  } else {
+    // NOTE(review): other negative errors from read_dentry also land
+    // here with lf_ino left default-constructed; preserved as-is.
+    if (!(lf_ino.inode->mode & S_IFDIR)) {
+      derr << "lost+found exists but is not a directory!" << dendl;
+      // In this case we error out, and the user should do something about
+      // this problem.
+      return -EINVAL;
+    }
+  }
+
+  r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
+  if (r < 0) {
+    return r;
+  }
+
+  const std::string dname = lost_found_dname(ino);
+
+  // Write dentry into lost+found dirfrag
+  return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry);
+}
+
+
+/**
+ * Determine which fragment of directory `dirino` the dentry
+ * `target_dname` belongs in, by locating dirino's own primary dentry
+ * (via its backtrace, recursing up the tree if the parent is itself
+ * fragmented) and consulting its fragtree.
+ *
+ * Returns 0 with *result_ft set, -ENOENT when the fragtree cannot be
+ * determined (no/corrupt backtrace or dentry), or another negative
+ * errno on unexpected errors.
+ */
+int MetadataDriver::get_frag_of(
+    inodeno_t dirino,
+    const std::string &target_dname,
+    frag_t *result_ft)
+{
+  object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
+
+  dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
+
+  // Find and load fragtree if existing dirfrag
+  // ==========================================
+  bool have_backtrace = false;
+  bufferlist parent_bl;
+  int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
+  if (r == -ENODATA) {
+    dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
+  } else if (r < 0) {
+    dout(4) << "Unexpected error on '" << root_frag_oid << "': "
+            << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // Deserialize backtrace
+  inode_backtrace_t backtrace;
+  if (parent_bl.length()) {
+    try {
+      auto q = parent_bl.cbegin();
+      backtrace.decode(q);
+      have_backtrace = true;
+    } catch (buffer::error &e) {
+      dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': "
+              << e.what() << dendl;
+    }
+  }
+
+  if (!(have_backtrace && backtrace.ancestors.size())) {
+    // Can't work out fragtree without a backtrace
+    dout(4) << "No backtrace on '" << root_frag_oid
+            << "': cannot determine fragtree" << dendl;
+    return -ENOENT;
+  }
+
+  // The parentage of dirino
+  const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
+
+  // The inode of dirino's parent
+  const inodeno_t parent_ino = bp.dirino;
+
+  // The dname of dirino in its parent.
+  const std::string &parent_dname = bp.dname;
+
+  dout(20) << "got backtrace parent " << parent_ino << "/"
+           << parent_dname << dendl;
+
+  // The primary dentry for dirino
+  InodeStore existing_dentry;
+
+  // See if we can find ourselves in dirfrag zero of the parent: this
+  // is a fast path that avoids needing to go further up the tree
+  // if the parent isn't fragmented (worst case we would have to
+  // go all the way to the root)
+  r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
+  if (r >= 0) {
+    // Great, fast path: return the fragtree from here
+    if (existing_dentry.inode->ino != dirino) {
+      dout(4) << "Unexpected inode in dentry! 0x" << std::hex
+              << existing_dentry.inode->ino
+              << " vs expected 0x" << dirino << std::dec << dendl;
+      return -ENOENT;
+    }
+    dout(20) << "fast path, fragtree is "
+             << existing_dentry.dirfragtree << dendl;
+    *result_ft = existing_dentry.pick_dirfrag(target_dname);
+    dout(20) << "frag is " << *result_ft << dendl;
+    return 0;
+  } else if (r == -ENOENT) {
+    // BUGFIX: this condition was inverted ('r != -ENOENT'), which sent
+    // -EINVAL down this path, made the -EINVAL branch below unreachable,
+    // and returned without recursing when the dentry was merely absent.
+    // Dentry not present in 0th frag, must read parent's fragtree
+    frag_t parent_frag;
+    r = get_frag_of(parent_ino, parent_dname, &parent_frag);
+    if (r == 0) {
+      // We have the parent fragtree, so try again to load our dentry
+      r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
+      if (r >= 0) {
+        // Got it!
+        *result_ft = existing_dentry.pick_dirfrag(target_dname);
+        dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
+        return 0;
+      } else {
+        if (r == -EINVAL || r == -ENOENT) {
+          return -ENOENT;  // dentry missing or corrupt, so frag is missing
+        } else {
+          return r;
+        }
+      }
+    } else {
+      // Couldn't resolve parent fragtree, so can't find ours.
+      return r;
+    }
+  } else if (r == -EINVAL) {
+    // Unreadable dentry, can't know the fragtree.
+    return -ENOENT;
+  } else {
+    // Unexpected error, raise it
+    return r;
+  }
+}
+
+
+/**
+ * Walk `backtrace` from the recovered inode upwards, injecting `dentry`
+ * at its proper name in its parent dirfrag and fabricating ancestor
+ * directories as needed.  Stops climbing as soon as an ancestor dirfrag
+ * already existed.  Falls back to lost+found if a conflicting dentry is
+ * found.  Returns 0 or a negative errno.
+ */
+int MetadataDriver::inject_with_backtrace(
+    const inode_backtrace_t &backtrace, const InodeStore &dentry)
+
+{
+
+  // On dirfrags
+  // ===========
+  // In order to insert something into a directory, we first (ideally)
+  // need to know the fragtree for the directory.  Sometimes we can't
+  // get that, in which case we just go ahead and insert it into
+  // fragment zero for a good chance of that being the right thing
+  // anyway (most moderate-sized dirs aren't fragmented!)
+
+  // On ancestry
+  // ===========
+  // My immediate ancestry should be correct, so if we can find that
+  // directory's dirfrag then go inject it there.  This works well
+  // in the case that this inode's dentry was somehow lost and we
+  // are recreating it, because the rest of the hierarchy
+  // will probably still exist.
+  //
+  // It's more of a "better than nothing" approach when rebuilding
+  // a whole tree, as backtraces will in general not be up to date
+  // beyond the first parent, if anything in the trace was ever
+  // moved after the file was created.
+
+  // On inode numbers
+  // ================
+  // The backtrace tells us inodes for each of the parents.  If we are
+  // creating those parent dirfrags, then there is a risk that somehow
+  // the inode indicated here was also used for data (not a dirfrag) at
+  // some stage.  That would be a zany situation, and we don't check
+  // for it here, because to do so would require extra IOs for everything
+  // we inject, and anyway wouldn't guarantee that the inode number
+  // wasn't in use in some dentry elsewhere in the metadata tree that
+  // just happened not to have any data objects.
+
+  // On multiple workers touching the same traces
+  // ============================================
+  // When creating linkage for a directory, *only* create it if we are
+  // also creating the object.  That way, we might not manage to get the
+  // *right* linkage for a directory, but at least we won't multiply link
+  // it.  We assume that if a root dirfrag exists for a directory, then
+  // it is linked somewhere (i.e. that the metadata pool is not already
+  // inconsistent).
+  //
+  // Making sure *that* is true is someone else's job!  Probably someone
+  // who is not going to run in parallel, so that they can self-consistently
+  // look at versions and move things around as they go.
+  // Note this isn't 100% safe: if we die immediately after creating dirfrag
+  // object, next run will fail to create linkage for the dirfrag object
+  // and leave it orphaned.
+
+  inodeno_t ino = backtrace.ino;
+  dout(10) << "  inode: 0x" << std::hex << ino << std::dec << dendl;
+  // Iterate ancestors from the immediate parent towards the root.
+  for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
+       i != backtrace.ancestors.end(); ++i) {
+    const inode_backpointer_t &backptr = *i;
+    dout(10) << "  backptr: 0x" << std::hex << backptr.dirino << std::dec
+             << "/" << backptr.dname << dendl;
+
+    // Examine root dirfrag for parent
+    const inodeno_t parent_ino = backptr.dirino;
+    const std::string dname = backptr.dname;
+
+    frag_t fragment;
+    int r = get_frag_of(parent_ino, dname, &fragment);
+    if (r == -ENOENT) {
+      // Don't know fragment, fall back to assuming root
+      dout(20) << "don't know fragment for 0x" << std::hex <<
+        parent_ino << std::dec << "/" << dname << ", will insert to root"
+        << dendl;
+    }
+    // NOTE(review): other negative returns from get_frag_of also fall
+    // through here with `fragment` default-constructed.
+
+    // Find or create dirfrag
+    // ======================
+    bool created_dirfrag;
+    r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
+    if (r < 0) {
+      return r;
+    }
+
+    // Check if dentry already exists
+    // ==============================
+    InodeStore existing_dentry;
+    r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
+    bool write_dentry = false;
+    if (r == -ENOENT || r == -EINVAL) {
+      if (r == -EINVAL && !force_corrupt) {
+        return r;
+      }
+      // Missing or corrupt dentry
+      write_dentry = true;
+    } else if (r < 0) {
+      derr << "Unexpected error reading dentry 0x" << std::hex
+           << parent_ino << std::dec << "/"
+           << dname << ": " << cpp_strerror(r) << dendl;
+      break;
+    } else {
+      // Dentry already present, does it link to me?
+      if (existing_dentry.inode->ino == ino) {
+        dout(20) << "Dentry 0x" << std::hex
+                 << parent_ino << std::dec << "/"
+                 << dname << " already exists and points to me" << dendl;
+      } else {
+        derr << "Dentry 0x" << std::hex
+             << parent_ino << std::dec << "/"
+             << dname << " already exists but points to 0x"
+             << std::hex << existing_dentry.inode->ino << std::dec << dendl;
+        // Fall back to lost+found!
+        return inject_lost_and_found(backtrace.ino, dentry);
+      }
+    }
+
+    // Inject linkage
+    // ==============
+
+    if (write_dentry) {
+      if (i == backtrace.ancestors.begin()) {
+        // This is the linkage for the file of interest
+        dout(10) << "Linking inode 0x" << std::hex << ino
+                 << " at 0x" << parent_ino << "/" << dname << std::dec
+                 << " with size=" << dentry.inode->size << " bytes" << dendl;
+
+        r = inject_linkage(parent_ino, dname, fragment, dentry);
+      } else {
+        // This is the linkage for an ancestor directory: fabricate a
+        // minimal directory inode carrying the backtrace's ino.
+        InodeStore ancestor_dentry;
+        auto inode = ancestor_dentry.get_inode();
+        inode->mode = 0755 | S_IFDIR;
+
+        // Set nfiles to something non-zero, to fool any other code
+        // that tries to ignore 'empty' directories.  This won't be
+        // accurate, but it should avoid functional issues.
+
+        inode->dirstat.nfiles = 1;
+        inode->dir_layout.dl_dir_hash =
+          g_conf()->mds_default_dir_hash;
+
+        inode->nlink = 1;
+        inode->ino = ino;
+        inode->uid = g_conf()->mds_root_ino_uid;
+        inode->gid = g_conf()->mds_root_ino_gid;
+        inode->version = 1;
+        inode->backtrace_version = 1;
+        r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
+      }
+
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    if (!created_dirfrag) {
+      // If the parent dirfrag already existed, then stop traversing the
+      // backtrace: assume that the other ancestors already exist too.  This
+      // is an assumption rather than a truth, but it's a convenient way
+      // to avoid the risk of creating multiply-linked directories while
+      // injecting data.  If there are in fact missing ancestors, this
+      // should be fixed up using a separate tool scanning the metadata
+      // pool.
+      break;
+    } else {
+      // Proceed up the backtrace, creating parents
+      ino = parent_ino;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Ensure a dirfrag object exists for ino.fragment, creating a blank
+ * fnode (flagged as damaged/non-empty) if the existing one is missing
+ * or unreadably corrupt.
+ *
+ * Creation is race-safe against parallel data-scan workers: a corrupt
+ * fnode is overwritten under assert_version, a missing one is created
+ * with an exclusive create, and losing either race is success.
+ *
+ * @param created set to true iff this call created the dirfrag object;
+ *                false if it already existed or another worker won the
+ *                creation race.
+ * @return 0 on success; -EINVAL for a corrupt fnode when force_corrupt
+ *         is not set; other negative errors from RADOS.
+ */
+int MetadataDriver::find_or_create_dirfrag(
+ inodeno_t ino,
+ frag_t fragment,
+ bool *created)
+{
+ ceph_assert(created != NULL);
+
+ fnode_t existing_fnode;
+ *created = false;
+
+ uint64_t read_version = 0;
+ int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
+ dout(10) << "read_version = " << read_version << dendl;
+
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+ // Missing or corrupt fnode, create afresh
+ bufferlist fnode_bl;
+ fnode_t blank_fnode;
+ blank_fnode.version = 1;
+ // mark it as non-empty
+ blank_fnode.fragstat.nfiles = 1;
+ blank_fnode.accounted_fragstat = blank_fnode.fragstat;
+ // The stats above are fabricated, so mark them damaged for later repair
+ blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
+ blank_fnode.encode(fnode_bl);
+
+
+ librados::ObjectWriteOperation op;
+
+ if (read_version) {
+ ceph_assert(r == -EINVAL);
+ // Case A: We must assert that the version isn't changed since we saw the object
+ // was unreadable, to avoid the possibility of two data-scan processes
+ // both creating the frag.
+ op.assert_version(read_version);
+ } else {
+ ceph_assert(r == -ENOENT);
+ // Case B: The object didn't exist in read_fnode, so while creating it we must
+ // use an exclusive create to correctly populate *creating with
+ // whether we created it ourselves or someone beat us to it.
+ op.create(true);
+ }
+
+ object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
+ op.omap_set_header(fnode_bl);
+ r = metadata_io.operate(frag_oid.name, &op);
+ // -EOVERFLOW: assert_version failed (case A); -EEXIST: exclusive
+ // create failed (case B). Either way another worker got there first.
+ if (r == -EOVERFLOW || r == -EEXIST) {
+ // Someone else wrote it (see case A above)
+ dout(10) << "Dirfrag creation race: 0x" << std::hex
+ << ino << " " << fragment << std::dec << dendl;
+ *created = false;
+ return 0;
+ } else if (r < 0) {
+ // We were unable to create or write it, error out
+ derr << "Failed to create dirfrag 0x" << std::hex
+ << ino << std::dec << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ // Success: the dirfrag object now exists with a value header
+ dout(10) << "Created dirfrag: 0x" << std::hex
+ << ino << std::dec << dendl;
+ *created = true;
+ }
+ } else if (r < 0) {
+ derr << "Unexpected error reading dirfrag 0x" << std::hex
+ << ino << std::dec << " : " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(20) << "Dirfrag already exists: 0x" << std::hex
+ << ino << " " << fragment << std::dec << dendl;
+ }
+
+ return 0;
+}
+
+/**
+ * Write a single head dentry named `dname` into the omap of the
+ * dirfrag object identified by dir_ino/fragment, pointing at `inode`.
+ *
+ * The omap key is the standard head dentry key (snapid CEPH_NOSNAP);
+ * `dnfirst` is encoded into the value as the dentry's first snapid.
+ *
+ * @return 0 on success, negative error code from the omap write.
+ */
+int MetadataDriver::inject_linkage(
+ inodeno_t dir_ino, const std::string &dname,
+ const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
+{
+ object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
+
+ // Compose the omap key for the head dentry
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+
+ // Value layout: [first snapid]['I' = primary inode dentry][bare InodeStore]
+ bufferlist dentry_bl;
+ encode(dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ // Write out
+ std::map<std::string, bufferlist> vals;
+ vals[key] = dentry_bl;
+ int r = metadata_io.omap_set(frag_oid.name, vals);
+ if (r != 0) {
+ derr << "Error writing dentry 0x" << std::hex
+ << dir_ino << std::dec << "/"
+ << dname << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(20) << "Injected dentry 0x" << std::hex
+ << dir_ino << "/" << dname << " pointing to 0x"
+ << inode.inode->ino << std::dec << dendl;
+ return 0;
+ }
+}
+
+
+/**
+ * Open metadata_io on the filesystem's metadata pool.
+ *
+ * If metadata_pool_name is empty it is resolved (and written back to
+ * the caller's string) from the MDS map of the filesystem identified
+ * by fscid; otherwise the given name is used as-is ("forced").
+ *
+ * @return 0 on success, negative error if the pool cannot be resolved
+ *         or the IoCtx cannot be created.
+ */
+int MetadataDriver::init(
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
+{
+ if (metadata_pool_name.empty()) {
+ auto fs = fsmap->get_filesystem(fscid);
+ ceph_assert(fs != nullptr);
+ int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+ dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+ int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+ if (r < 0) {
+ derr << "Pool " << metadata_pool_id
+ << " identified in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+ dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
+ } else {
+ dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
+ }
+ return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+}
+
+// No-op: LocalFileDriver writes recovered files to the local
+// filesystem, so there is no metadata pool to open. The parameters
+// are accepted (and ignored) to satisfy the RecoveryDriver interface.
+int LocalFileDriver::init(
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
+{
+ return 0;
+}
+
+/**
+ * Copy a file's contents out of the data pool into the local file at
+ * `file_path`, reading one chunk-sized RADOS object at a time and
+ * seeking past missing objects (holes in a sparse file).
+ *
+ * NOTE(review): assumes a simple layout where object N of inode `ino`
+ * is named "<ino>.<N>" and holds bytes [N*chunk_size, (N+1)*chunk_size)
+ * -- striped layouts are not handled (see FIXME at the call site).
+ *
+ * @return 0 on success, negative error code on a failed object read.
+ */
+int LocalFileDriver::inject_data(
+ const std::string &file_path,
+ uint64_t size,
+ uint32_t chunk_size,
+ inodeno_t ino)
+{
+ // Scrape the file contents out of the data pool and into the
+ // local filesystem
+ std::fstream f;
+ f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
+
+ for (uint64_t offset = 0; offset < size; offset += chunk_size) {
+ bufferlist bl;
+
+ // Object name is "<ino hex>.<chunk index, 8 hex digits>"
+ char buf[32];
+ snprintf(buf, sizeof(buf),
+ "%llx.%08llx",
+ (unsigned long long)ino,
+ (unsigned long long)(offset / chunk_size));
+ std::string oid(buf);
+
+ int r = data_io.read(oid, bl, chunk_size, 0);
+
+ // -ENOENT is tolerated: a missing object is a hole in a sparse file.
+ // NOTE(review): r == 0 (zero-length read) takes this error path but
+ // then returns 0 (success) to the caller -- confirm intent.
+ if (r <= 0 && r != -ENOENT) {
+ derr << "error reading data object '" << oid << "': "
+ << cpp_strerror(r) << dendl;
+ f.close();
+ return r;
+ } else if (r >=0) {
+
+ f.seekp(offset);
+ bl.write_stream(f);
+ }
+ }
+ f.close();
+
+ return 0;
+}
+
+
+/**
+ * Materialize a file under `path` on the local filesystem, creating
+ * each ancestor directory named in the backtrace and then scraping the
+ * file's contents out of the data pool via inject_data().
+ *
+ * @param bt     backtrace recovered from the data pool (ancestors are
+ *               ordered leaf-first, hence the reverse iteration here)
+ * @param dentry recovered inode metadata (size and layout are used)
+ * @return 0 on success, negative error code otherwise.
+ */
+int LocalFileDriver::inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry)
+{
+ std::string path_builder = path;
+
+ // Iterate through backtrace creating directory parents
+ std::vector<inode_backpointer_t>::const_reverse_iterator i;
+ for (i = bt.ancestors.rbegin();
+ i != bt.ancestors.rend(); ++i) {
+
+ const inode_backpointer_t &backptr = *i;
+ path_builder += "/";
+ path_builder += backptr.dname;
+
+ // Last entry is the filename itself
+ bool is_file = (i + 1 == bt.ancestors.rend());
+ if (is_file) {
+ // FIXME: inject_data won't cope with interesting (i.e. striped)
+ // layouts (need a librados-compatible Filer to read these)
+ int r = inject_data(path_builder, dentry.inode->size,
+ dentry.inode->layout.object_size, bt.ino);
+ // Propagate data-scrape failures (previously ignored)
+ if (r != 0) {
+ return r;
+ }
+ } else {
+ // mkdir(2) returns -1 with the reason in errno; the old check
+ // against -EPERM could never fire because -EPERM == -1, so all
+ // mkdir errors were silently ignored. Tolerate EEXIST (ancestor
+ // already created, e.g. by a parallel worker); fail otherwise.
+ if (mkdir(path_builder.c_str(), 0755) != 0 && errno != EEXIST) {
+ int err = -errno;
+ derr << "error creating directory: '" << path_builder << "': "
+ << cpp_strerror(err) << dendl;
+ return err;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Recover a file for which only the inode is known: place its contents
+ * under <path>/lost+found/<ino hex>.
+ *
+ * @return 0 on success, negative error code otherwise.
+ */
+int LocalFileDriver::inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry)
+{
+ std::string lf_path = path + "/lost+found";
+ // mkdir(2) returns -1 with the reason in errno; the old check
+ // against -EPERM could never fire because -EPERM == -1, so all
+ // mkdir errors were silently ignored. An already-existing
+ // lost+found directory is fine; anything else is fatal.
+ if (mkdir(lf_path.c_str(), 0755) != 0 && errno != EEXIST) {
+ int err = -errno;
+ derr << "error creating directory: '" << lf_path << "': "
+ << cpp_strerror(err) << dendl;
+ return err;
+ }
+
+ std::string file_path = lf_path + "/" + lost_found_dname(ino);
+ return inject_data(file_path, dentry.inode->size,
+ dentry.inode->layout.object_size, ino);
+}
+
+/**
+ * Ensure the output directory `path` exists, creating it if absent.
+ * data_pool_id is unused by the local-file backend.
+ *
+ * @return 0 on success, negative errno on failure to create the
+ *         directory.
+ */
+int LocalFileDriver::init_roots(int64_t data_pool_id)
+{
+ // Ensure that the path exists and is a directory
+ bool exists;
+ int r = check_roots(&exists);
+ if (r != 0) {
+ return r;
+ }
+
+ if (exists) {
+ return 0;
+ }
+
+ // ::mkdir returns -1 on failure with the reason in errno; convert
+ // to a negative errno so callers receive a meaningful error code
+ // rather than a bare -1.
+ if (::mkdir(path.c_str(), 0755) != 0) {
+ return -errno;
+ }
+ return 0;
+}
+
+// Report whether `path` currently exists as an openable directory.
+// *result is set to true only when the directory can be opened and
+// cleanly closed again; any probe failure is reported as "absent"
+// rather than as an error, so the return value is always 0.
+int LocalFileDriver::check_roots(bool *result)
+{
+ *result = false;
+
+ DIR *handle = ::opendir(path.c_str());
+ if (handle != NULL) {
+ // Weird, but maybe possible with e.g. stale FD on NFS mount:
+ // only count the directory as present if closedir succeeds too.
+ *result = (closedir(handle) == 0);
+ }
+
+ return 0;
+}
+
+/**
+ * Construct a synthetic InodeStore for a regular file recovered from
+ * the data pool: mode 0500|S_IFREG, all three timestamps set to
+ * file_mtime, ownership from mds_root_ino_uid/gid, nlink 1.
+ */
+void MetadataTool::build_file_dentry(
+ inodeno_t ino, uint64_t file_size, time_t file_mtime,
+ const file_layout_t &layout, InodeStore *out)
+{
+ ceph_assert(out != NULL);
+
+ auto inode = out->get_inode();
+ // 0500: presumably read-only to discourage modification of recovered
+ // data -- TODO confirm
+ inode->mode = 0500 | S_IFREG;
+ inode->size = file_size;
+ inode->max_size_ever = file_size;
+ // Only mtime is known from the scan; reuse it for atime/ctime
+ inode->mtime.tv.tv_sec = file_mtime;
+ inode->atime.tv.tv_sec = file_mtime;
+ inode->ctime.tv.tv_sec = file_mtime;
+
+ inode->layout = layout;
+
+ inode->truncate_seq = 1;
+ inode->truncate_size = -1ull;
+
+ inode->inline_data.version = CEPH_INLINE_NONE;
+
+ inode->nlink = 1;
+ inode->ino = ino;
+ inode->version = 1;
+ inode->backtrace_version = 1;
+ inode->uid = g_conf()->mds_root_ino_uid;
+ inode->gid = g_conf()->mds_root_ino_gid;
+}
+
+/**
+ * Construct a synthetic InodeStore for a directory: mode 0755|S_IFDIR,
+ * dirstat taken from `fragstat`, timestamps from fragstat.mtime,
+ * default dir hash, ownership from mds_root_ino_uid/gid.
+ */
+void MetadataTool::build_dir_dentry(
+ inodeno_t ino, const frag_info_t &fragstat,
+ const file_layout_t &layout, InodeStore *out)
+{
+ ceph_assert(out != NULL);
+
+ auto inode = out->get_inode();
+ inode->mode = 0755 | S_IFDIR;
+ inode->dirstat = fragstat;
+ // Only the fragstat mtime is known; reuse it for atime/ctime
+ inode->mtime.tv.tv_sec = fragstat.mtime;
+ inode->atime.tv.tv_sec = fragstat.mtime;
+ inode->ctime.tv.tv_sec = fragstat.mtime;
+
+ inode->layout = layout;
+ inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+ inode->truncate_seq = 1;
+ inode->truncate_size = -1ull;
+
+ inode->inline_data.version = CEPH_INLINE_NONE;
+
+ inode->nlink = 1;
+ inode->ino = ino;
+ inode->version = 1;
+ inode->backtrace_version = 1;
+ inode->uid = g_conf()->mds_root_ino_uid;
+ inode->gid = g_conf()->mds_root_ino_gid;
+}
+
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
new file mode 100644
index 000000000..5c87fe2bd
--- /dev/null
+++ b/src/tools/cephfs/DataScan.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDSUtility.h"
+#include "include/rados/librados.hpp"
+
+class InodeStore;
+class MDSTable;
+
+/**
+ * Abstract backend for cephfs-data-scan recovery: given inodes and
+ * backtraces scraped from the data pool, a driver either re-links them
+ * into a metadata pool (MetadataDriver) or writes them out to a local
+ * filesystem (LocalFileDriver).
+ */
+class RecoveryDriver {
+ protected:
+ // If true, overwrite structures that generate decoding errors.
+ bool force_corrupt;
+
+ // If true, overwrite root objects during init_roots even if they
+ // exist
+ bool force_init;
+
+ public:
+ // Prepare any backend I/O contexts; must be called before use.
+ virtual int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) = 0;
+
+ void set_force_corrupt(const bool val)
+ {
+ force_corrupt = val;
+ }
+
+ void set_force_init(const bool val)
+ {
+ force_init = val;
+ }
+
+
+ /**
+ * Inject an inode + dentry parents into the metadata pool,
+ * based on a backtrace recovered from the data pool
+ */
+ virtual int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) = 0;
+
+ /**
+ * Inject an inode + dentry into the lost+found directory,
+ * when all we know about a file is its inode.
+ */
+ virtual int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) = 0;
+
+ /**
+ * Create any missing roots (i.e. mydir, strays, root inode)
+ */
+ virtual int init_roots(
+ int64_t data_pool_id) = 0;
+
+ /**
+ * Pre-injection check that all the roots are present in
+ * the metadata pool. Used to avoid parallel workers interfering
+ * with one another, by cueing the user to go run 'init' on a
+ * single node before running a parallel scan.
+ *
+ * @param result: set to true if roots are present, else set to false
+ * @returns 0 on no unexpected errors, else error code. Missing objects
+ * are not considered an unexpected error: check *result for
+ * this case.
+ */
+ virtual int check_roots(bool *result) = 0;
+
+ /**
+ * Helper to compose dnames for links to lost+found
+ * inodes.
+ */
+ std::string lost_found_dname(inodeno_t ino)
+ {
+ // 20 bytes is sufficient: a 64-bit value in %llx is at most
+ // 16 characters plus the NUL terminator.
+ char s[20];
+ snprintf(s, sizeof(s), "%llx", (unsigned long long)ino);
+ return std::string(s);
+ }
+
+ RecoveryDriver()
+ : force_corrupt(false),
+ force_init(false)
+ {}
+
+ virtual ~RecoveryDriver() {}
+};
+
+/**
+ * RecoveryDriver that writes recovered files out to a directory tree
+ * on the local filesystem instead of re-linking them into a metadata
+ * pool.
+ */
+class LocalFileDriver : public RecoveryDriver
+{
+ protected:
+ // Root of the local output tree
+ const std::string path;
+ // Data pool to scrape file contents from
+ librados::IoCtx &data_io;
+
+ // Copy one file's objects from the data pool into file_path
+ int inject_data(
+ const std::string &file_path,
+ uint64_t size,
+ uint32_t chunk_size,
+ inodeno_t ino);
+ public:
+
+ LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_)
+ : RecoveryDriver(), path(path_), data_io(data_io_)
+ {}
+
+ // Implement RecoveryDriver interface
+ int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) override;
+
+ int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) override;
+
+ int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) override;
+
+ int init_roots(int64_t data_pool_id) override;
+
+ int check_roots(bool *result) override;
+};
+
+/**
+ * A class that knows how to work with objects in a CephFS
+ * metadata pool.
+ */
+class MetadataTool
+{
+ protected:
+
+ // IoCtx on the metadata pool; opened by subclasses (e.g. in init())
+ librados::IoCtx metadata_io;
+
+ /**
+ * Construct a synthetic InodeStore for a normal file
+ */
+ void build_file_dentry(
+ inodeno_t ino, uint64_t file_size, time_t file_mtime,
+ const file_layout_t &layout,
+ InodeStore *out);
+
+ /**
+ * Construct a synthetic InodeStore for a directory
+ */
+ void build_dir_dentry(
+ inodeno_t ino,
+ const frag_info_t &fragstat,
+ const file_layout_t &layout,
+ InodeStore *out);
+
+ /**
+ * Try and read an fnode from a dirfrag
+ */
+ int read_fnode(inodeno_t ino, frag_t frag,
+ fnode_t *fnode, uint64_t *read_version);
+
+ /**
+ * Try and read a dentry from a dirfrag
+ */
+ int read_dentry(inodeno_t parent_ino, frag_t frag,
+ const std::string &dname, InodeStore *inode, snapid_t *dnfirst=nullptr);
+};
+
+/**
+ * A class that knows how to manipulate CephFS metadata pools
+ */
+class MetadataDriver : public RecoveryDriver, public MetadataTool
+{
+ protected:
+ /**
+ * Create a .inode object, i.e. root or mydir
+ */
+ int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id);
+
+ /**
+ * Check for existence of .inode objects, before
+ * trying to go ahead and inject metadata.
+ */
+ int root_exists(inodeno_t ino, bool *result);
+ // Ensure a dirfrag object exists, creating it (race-safely) if
+ // missing or corrupt; *created reports whether we created it.
+ int find_or_create_dirfrag(
+ inodeno_t ino,
+ frag_t fragment,
+ bool *created);
+
+
+ /**
+ * Work out which fragment of a directory should contain a named
+ * dentry, recursing up the trace as necessary to retrieve
+ * fragtrees.
+ */
+ int get_frag_of(
+ inodeno_t dirino,
+ const std::string &dname,
+ frag_t *result_ft);
+
+ public:
+
+ // Implement RecoveryDriver interface
+ int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) override;
+
+ // Write a head dentry into a dirfrag's omap, pointing at `inode`
+ int inject_linkage(
+ inodeno_t dir_ino, const std::string &dname,
+ const frag_t fragment, const InodeStore &inode, snapid_t dnfirst=CEPH_NOSNAP);
+
+ int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) override;
+
+ int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) override;
+
+ int init_roots(int64_t data_pool_id) override;
+
+ int check_roots(bool *result) override;
+
+ // Load/store an MDSTable (e.g. inotable) from/to the metadata pool
+ int load_table(MDSTable *table);
+ int save_table(MDSTable *table);
+};
+
+/**
+ * Entry point for the cephfs-data-scan tool: scans data/metadata pools
+ * and drives a RecoveryDriver to rebuild lost metadata.
+ */
+class DataScan : public MDSUtility, public MetadataTool
+{
+ protected:
+ // Recovery backend (metadata pool or local files); owned, freed in dtor
+ RecoveryDriver *driver;
+ fs_cluster_id_t fscid;
+
+ string metadata_pool_name;
+ std::vector<int64_t> data_pools;
+
+ // IoCtx for data pool (where we scrape file backtraces from)
+ librados::IoCtx data_io;
+ // Remember the data pool ID for use in layouts
+ int64_t data_pool_id;
+
+ // NOTE(review): presumably worker index (n) of worker count (m) for
+ // parallel scans -- confirm against the .cc implementation
+ uint32_t n;
+ uint32_t m;
+
+ /**
+ * Scan data pool for backtraces, and inject inodes to metadata pool
+ */
+ int scan_inodes();
+
+ /**
+ * Scan data pool for file sizes and mtimes
+ */
+ int scan_extents();
+
+ /**
+ * Scan metadata pool for 0th dirfrags to link orphaned
+ * directory inodes.
+ */
+ int scan_frags();
+
+ /**
+ * Cleanup xattrs from data pool
+ */
+ int cleanup();
+
+ /**
+ * Check if an inode number is in the permitted ranges
+ */
+ bool valid_ino(inodeno_t ino) const;
+
+
+ int scan_links();
+
+ // Accept pools which are not in the FSMap
+ bool force_pool;
+ // Respond to decode errors by overwriting
+ bool force_corrupt;
+ // Overwrite root objects even if they exist
+ bool force_init;
+ // Only scan inodes without this scrub tag
+ string filter_tag;
+
+ /**
+ * @param r set to error on valid key with invalid value
+ * @return true if argument consumed, else false
+ */
+ bool parse_kwarg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i,
+ int *r);
+
+ /**
+ * @return true if argument consumed, else false
+ */
+ bool parse_arg(
+ const std::vector<const char*> &arg,
+ std::vector<const char *>::const_iterator &i);
+
+ int probe_filter(librados::IoCtx &ioctx);
+
+ /**
+ * Apply a function to all objects in an ioctx's pool, optionally
+ * restricted to only those objects with a 00000000 offset and
+ * no tag matching DataScan::scrub_tag.
+ */
+ int forall_objects(
+ librados::IoCtx &ioctx,
+ bool untagged_only,
+ std::function<int(std::string, uint64_t, uint64_t)> handler);
+
+ public:
+ static void usage();
+ int main(const std::vector<const char *> &args);
+
+ DataScan()
+ : driver(NULL), fscid(FS_CLUSTER_ID_NONE),
+ data_pool_id(-1), n(0), m(1),
+ force_pool(false), force_corrupt(false),
+ force_init(false)
+ {
+ }
+
+ ~DataScan() override
+ {
+ delete driver;
+ }
+};
+
diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc
new file mode 100644
index 000000000..f3b07c551
--- /dev/null
+++ b/src/tools/cephfs/Dumper.cc
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
+#endif
+
+#include "include/compat.h"
+#include "include/fs_types.h"
+#include "common/entity_name.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/JournalPointer.h"
+#include "osdc/Journaler.h"
+#include "mon/MonClient.h"
+
+#include "Dumper.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+#define HEADER_LEN 4096
+
+/**
+ * Resolve the journal inode to operate on for `role_`.
+ *
+ * type == "mdlog": follow the rank's JournalPointer to find the
+ * current journal ino. type == "purge_queue": use the fixed
+ * per-rank purge queue ino. Any other type aborts (callers are
+ * expected to validate the type beforehand).
+ */
+int Dumper::init(mds_role_t role_, const std::string &type)
+{
+ role = role_;
+
+ int r = MDSUtility::init();
+ if (r < 0) {
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ if (type == "mdlog") {
+ JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+ int jp_load_result = jp.load(objecter);
+ if (jp_load_result != 0) {
+ std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl;
+ return jp_load_result;
+ } else {
+ ino = jp.front;
+ }
+ } else if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + role.rank;
+ } else {
+ ceph_abort(); // should not get here
+ }
+ return 0;
+}
+
+
+// Synchronously recover the journal's committed positions: kick off
+// Journaler::recover() under the lock, then block until its completion
+// callback fires. Returns 0 on success, or the negative error code
+// reported by the recovery callback.
+int Dumper::recover_journal(Journaler *journaler)
+{
+ C_SaferCond recover_wait;
+
+ lock.lock();
+ journaler->recover(&recover_wait);
+ lock.unlock();
+
+ const int rc = recover_wait.wait();
+ if (rc < 0) { // Error
+ derr << "error on recovery: " << cpp_strerror(rc) << dendl;
+ return rc;
+ }
+
+ dout(10) << "completed journal recovery" << dendl;
+ return 0;
+}
+
+
+/**
+ * Dump the journal region [read_pos, write_pos) to `dump_file` as a
+ * sparse file: a HEADER_LEN-byte text header (offsets, layout, fsid)
+ * at offset 0, then the journal bytes written at their original
+ * offsets. undump() parses this header to restore the journal.
+ *
+ * @return 0 on success, negative error code otherwise.
+ */
+int Dumper::dump(const char *dump_file)
+{
+ int r = 0;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+ &finisher);
+ r = recover_journal(&journaler);
+ if (r) {
+ return r;
+ }
+ uint64_t start = journaler.get_read_pos();
+ uint64_t end = journaler.get_write_pos();
+ uint64_t len = end-start;
+
+ Filer filer(objecter, &finisher);
+
+ cout << "journal is " << start << "~" << len << std::endl;
+
+ int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644);
+ if (fd >= 0) {
+ // include an informative header
+ uuid_d fsid = monc->get_fsid();
+ char fsid_str[40];
+ fsid.print(fsid_str);
+ char buf[HEADER_LEN];
+ memset(buf, 0, sizeof(buf));
+ // NOTE(review): the final %c is written with the value 4 --
+ // presumably an EOT byte terminating the text header; confirm
+ // against undump()'s parsing.
+ snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\
+ length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\
+ trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\
+ object_size %lu (0x%lx)\n fsid %s\n%c",
+ role.rank,
+ (unsigned long long)start, (unsigned long long)start,
+ (unsigned long long)len, (unsigned long long)len,
+ (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos,
+ (unsigned long long)journaler.last_committed.stream_format,
+ (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos,
+ (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit,
+ (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count,
+ (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size,
+ fsid_str,
+ 4);
+ // Always write the full zero-padded HEADER_LEN bytes so the
+ // journal data can start at a fixed offset
+ r = safe_write(fd, buf, sizeof(buf));
+ if (r) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ // write the data
+ off64_t seeked = ::lseek64(fd, start, SEEK_SET);
+ if (seeked == (off64_t)-1) {
+ r = errno;
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl;
+ ::close(fd);
+ return r;
+ }
+
+
+ // Read and write 32MB chunks. Slower than it could be because we're not
+ // streaming, but that's okay because this is just a debug/disaster tool.
+ const uint32_t chunk_size = 32 * 1024 * 1024;
+
+ for (uint64_t pos = start; pos < start + len; pos += chunk_size) {
+ bufferlist bl;
+ dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl;
+
+ const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos);
+
+ C_SaferCond cond;
+ lock.lock();
+ filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP,
+ pos, read_size, &bl, 0, &cond);
+ lock.unlock();
+ r = cond.wait();
+ if (r < 0) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") reading "
+ "journal at offset 0x" << std::hex << pos << std::dec << dendl;
+ ::close(fd);
+ return r;
+ }
+ dout(10) << "Got 0x" << std::hex << bl.length() << std::dec
+ << " bytes" << dendl;
+
+ r = bl.write_fd(fd);
+ if (r) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl;
+ ::close(fd);
+ return r;
+ }
+ }
+
+ r = ::close(fd);
+ if (r) {
+ r = errno;
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl;
+ return r;
+ }
+
+ cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n"
+ << "NOTE: this is a _sparse_ file; you can\n"
+ << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n"
+ << " to efficiently compress it while preserving sparseness." << std::endl;
+ return 0;
+ } else {
+ int err = errno;
+ derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl;
+ return err;
+ }
+}
+
+/**
+ * Restore a journal previously written by dump(): parse the text
+ * header for offsets/layout/fsid, write a fresh journal header object,
+ * purge/zero any stale trailing objects, then stream the payload from
+ * the dump file back into RADOS in 1MB writes.
+ *
+ * @param force skip the fsid match check against the online cluster
+ * @return 0 on success, negative error code otherwise.
+ */
+int Dumper::undump(const char *dump_file, bool force)
+{
+ cout << "undump " << dump_file << std::endl;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ int r = 0;
+ // try get layout info from cluster
+ Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+ &finisher);
+ int recovered = recover_journal(&journaler);
+ if (recovered != 0) {
+ derr << "recover_journal failed, try to get header from dump file " << dendl;
+ }
+
+ int fd = ::open(dump_file, O_RDONLY|O_BINARY);
+ if (fd < 0) {
+ r = errno;
+ derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Example header produced by dump():
+ // Ceph mds0 journal dump
+ // start offset 232401996 (0xdda2c4c)
+ // length 1097504 (0x10bf20)
+
+ char buf[HEADER_LEN];
+ r = safe_read(fd, buf, sizeof(buf));
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+
+ // NOTE(review): strstr() returns NULL if a field is absent from a
+ // malformed/truncated header, which would crash sscanf -- consider
+ // validating the header before parsing.
+ long long unsigned start, len, write_pos, format, trimmed_pos;
+ long unsigned stripe_unit, stripe_count, object_size;
+ sscanf(strstr(buf, "start offset"), "start offset %llu", &start);
+ sscanf(strstr(buf, "length"), "length %llu", &len);
+ sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos);
+ sscanf(strstr(buf, "format"), "format %llu", &format);
+
+ if (!force) {
+ // need to check if the fsid matches the online cluster fsid
+ if (strstr(buf, "fsid")) {
+ uuid_d fsid;
+ char fsid_str[40];
+ sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str);
+ r = fsid.parse(fsid_str);
+ if (!r) {
+ derr << "Invalid fsid" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ if (fsid != monc->get_fsid()) {
+ derr << "Imported journal fsid does not match online cluster fsid" << dendl;
+ derr << "Use --force to skip fsid check" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+ } else {
+ derr << "Invalid header, no fsid embeded" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+ }
+
+ // Prefer the live cluster's layout; fall back to the header's, then
+ // to the filesystem default, field by field.
+ if (recovered == 0) {
+ stripe_unit = journaler.last_committed.layout.stripe_unit;
+ stripe_count = journaler.last_committed.layout.stripe_count;
+ object_size = journaler.last_committed.layout.object_size;
+ } else {
+ // try to get layout from dump file header, if failed set layout to default
+ if (strstr(buf, "stripe_unit")) {
+ sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit);
+ } else {
+ stripe_unit = file_layout_t::get_default().stripe_unit;
+ }
+ if (strstr(buf, "stripe_count")) {
+ sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count);
+ } else {
+ stripe_count = file_layout_t::get_default().stripe_count;
+ }
+ if (strstr(buf, "object_size")) {
+ sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size);
+ } else {
+ object_size = file_layout_t::get_default().object_size;
+ }
+ }
+
+ if (strstr(buf, "trimmed_pos")) {
+ sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos);
+ } else {
+ // Old format dump, any untrimmed objects before expire_pos will
+ // be discarded as trash.
+ trimmed_pos = start - (start % object_size);
+ }
+
+ // Sanity: invariant is trimmed_pos <= expire_pos (start) <= write_pos
+ if (trimmed_pos > start) {
+ derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos
+ << " > expire 0x" << start << std::dec << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ if (start > write_pos) {
+ derr << std::hex << "Invalid header (expire 0x" << start
+ << " > write 0x" << write_pos << std::dec << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ cout << "start " << start <<
+ " len " << len <<
+ " write_pos " << write_pos <<
+ " format " << format <<
+ " trimmed_pos " << trimmed_pos <<
+ " stripe_unit " << stripe_unit <<
+ " stripe_count " << stripe_count <<
+ " object_size " << object_size << std::endl;
+
+ // Rebuild and persist the journal header object
+ Journaler::Header h;
+ h.trimmed_pos = trimmed_pos;
+ h.expire_pos = start;
+ h.write_pos = write_pos;
+ h.stream_format = format;
+ h.magic = CEPH_FS_ONDISK_MAGIC;
+
+ h.layout.stripe_unit = stripe_unit;
+ h.layout.stripe_count = stripe_count;
+ h.layout.object_size = object_size;
+ h.layout.pool_id = fs->mds_map.get_metadata_pool();
+
+ bufferlist hbl;
+ encode(h, hbl);
+
+ object_t oid = file_object_t(ino, 0);
+ object_locator_t oloc(fs->mds_map.get_metadata_pool());
+ SnapContext snapc;
+
+ cout << "writing header " << oid << std::endl;
+ C_SaferCond header_cond;
+ lock.lock();
+ objecter->write_full(oid, oloc, snapc, hbl,
+ ceph::real_clock::now(), 0,
+ &header_cond);
+ lock.unlock();
+
+ r = header_cond.wait();
+ if (r != 0) {
+ derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ Filer filer(objecter, &finisher);
+
+ /* Erase any objects at the end of the region to which we shall write
+ * the new log data. This is to avoid leaving trailing junk after
+ * the newly written data. Any junk more than one object ahead
+ * will be taken care of during normal operation by Journaler's
+ * prezeroing behaviour */
+ {
+ uint32_t const object_size = h.layout.object_size;
+ ceph_assert(object_size > 0);
+ uint64_t last_obj = h.write_pos / object_size;
+ uint64_t purge_count = 2;
+ /* When the length is zero, the last_obj should be zeroed
+ * from the offset determined by the new write_pos instead of being purged.
+ */
+ if (!len) {
+ purge_count = 1;
+ ++last_obj;
+ }
+ C_SaferCond purge_cond;
+ cout << "Purging " << purge_count << " objects from " << last_obj << std::endl;
+ lock.lock();
+ filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count,
+ ceph::real_clock::now(), 0, &purge_cond);
+ lock.unlock();
+ purge_cond.wait();
+ }
+ /* When the length is zero, zero the last object
+ * from the offset determined by the new write_pos.
+ */
+ if (!len) {
+ uint64_t offset_in_obj = h.write_pos % h.layout.object_size;
+ uint64_t len = h.layout.object_size - offset_in_obj;
+ C_SaferCond zero_cond;
+ cout << "Zeroing " << len << " bytes in the last object." << std::endl;
+
+ lock.lock();
+ filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond);
+ lock.unlock();
+ zero_cond.wait();
+ }
+
+ // Stream from `fd` to `filer`
+ uint64_t pos = start;
+ uint64_t left = len;
+ while (left > 0) {
+ // Read
+ bufferlist j;
+ lseek64(fd, pos, SEEK_SET);
+ uint64_t l = std::min<uint64_t>(left, 1024*1024);
+ j.read_fd(fd, l);
+
+ // Write
+ cout << " writing " << pos << "~" << l << std::endl;
+ C_SaferCond write_cond;
+ lock.lock();
+ filer.write(ino, &h.layout, snapc, pos, l, j,
+ ceph::real_clock::now(), 0, &write_cond);
+ lock.unlock();
+
+ r = write_cond.wait();
+ if (r != 0) {
+ // NOTE(review): copy-pasted message -- this reports a failure of
+ // the journal *data* write, not the header write above.
+ derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ // Advance
+ pos += l;
+ left -= l;
+ }
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ cout << "done." << std::endl;
+ return 0;
+}
+
diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h
new file mode 100644
index 000000000..758f3cdea
--- /dev/null
+++ b/src/tools/cephfs/Dumper.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef JOURNAL_DUMPER_H_
+#define JOURNAL_DUMPER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you dump out an mds journal for troubleshooting or whatever.
+ *
+ * It was built to work with cmds so some of the design choices are random.
+ * To use, create a Dumper, call init(), and then call dump() with the name
+ * of the file to dump to.
+ */
+
+class Dumper : public MDSUtility {
+private:
+ // Which filesystem/rank we are operating on
+ mds_role_t role;
+ // Journal inode resolved by init(); -1 until then
+ inodeno_t ino;
+
+public:
+ Dumper() : ino(-1)
+ {}
+
+ // Resolve the journal ino for role_; type is "mdlog" or "purge_queue"
+ int init(mds_role_t role_, const std::string &type);
+ // Synchronously recover the journal's committed positions
+ int recover_journal(Journaler *journaler);
+ // Write the journal to a sparse local file with a text header
+ int dump(const char *dumpfile);
+ // Restore a journal from a file previously written by dump()
+ int undump(const char *dumpfile, bool force);
+};
+
+#endif /* JOURNAL_DUMPER_H_ */
diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc
new file mode 100644
index 000000000..8cb235a82
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.cc
@@ -0,0 +1,153 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include <iostream>
+#include <fstream>
+
+#include "common/errno.h"
+#include "mds/mdstypes.h"
+#include "mds/events/EUpdate.h"
+#include "mds/LogEvent.h"
+#include "JournalScanner.h"
+
+#include "EventOutput.h"
+
+
+int EventOutput::binary() const
+{
+  // Write each scanned event as its raw encoded blob, one file per event,
+  // named "<hex offset>_<type>.bin" under directory 'path'.
+  // Returns 0 on success, a negative errno on directory-creation failure,
+  // or -EIO if writing any output file failed.
+  int r = ::mkdir(path.c_str(), 0755);
+  if (r != 0) {
+    r = -errno;
+    if (r != -EEXIST) {
+      std::cerr << "Error creating output directory: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+  }
+
+  for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+    bufferlist bin;
+    std::stringstream filename;
+    if (auto& le = i->second.log_event; le) {
+      le->encode(bin, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      filename << "0x" << std::hex << i->first << std::dec << "_" << le->get_type_str() << ".bin";
+    } else if (auto& pi = i->second.pi; pi) {
+      pi->encode(bin);
+      filename << "0x" << std::hex << i->first << std::dec << "_" << pi->get_type_str() << ".bin";
+    } else {
+      // Defensive: an EventRecord should hold either a log event or a purge
+      // item; previously a record with neither produced an empty blob in a
+      // file whose name was just the directory path + "/".  Skip it instead.
+      continue;
+    }
+
+    std::string const file_path = path + std::string("/") + filename.str();
+    std::ofstream bin_file(file_path.c_str(), std::ofstream::out | std::ofstream::binary);
+    bin.write_stream(bin_file);
+    bin_file.close();
+    // fail() also covers the case where the file could not be opened.
+    if (bin_file.fail()) {
+      return -EIO;
+    }
+  }
+  std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl;
+
+  return 0;
+}
+
+// Write all scanned events to file 'path' as one pretty-printed JSON array,
+// one object per event.  Returns 0 on success, -EIO if the stream failed.
+int EventOutput::json() const
+{
+  JSONFormatter jf(true);
+  std::ofstream out_file(path.c_str(), std::ofstream::out);
+  jf.open_array_section("journal");
+  {
+    for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+      if (auto& le = i->second.log_event; le) {
+        jf.open_object_section("log_event");
+        le->dump(&jf);
+        jf.close_section();  // log_event
+      } else if (auto& pi = i->second.pi; pi) {
+        jf.open_object_section("purge_action");
+        pi->dump(&jf);
+        jf.close_section();
+      }
+    }
+  }
+  jf.close_section();  // journal
+  jf.flush(out_file);
+  out_file.close();
+
+  // fail() after close() also reflects an open failure at construction time.
+  if (out_file.fail()) {
+    return -EIO;
+  } else {
+    std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl;
+    return 0;
+  }
+}
+
+// Print one human-readable line per event (timestamp, offset, type) to
+// stdout, followed by any file paths the event's metablob touches.
+void EventOutput::list() const
+{
+  for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+    if (auto& le = i->second.log_event; le) {
+      std::vector<std::string> ev_paths;
+      EMetaBlob const *emb = le->get_metablob();
+      if (emb) {
+        emb->get_paths(ev_paths);
+      }
+
+      std::string detail;
+      if (le->get_type() == EVENT_UPDATE) {
+        // Checked downcast: the type test above guarantees this is an
+        // EUpdate, so static_cast is the correct (and safer) cast here.
+        auto& eu = static_cast<EUpdate&>(*le);
+        detail = eu.type;
+      }
+
+      std::cout << le->get_stamp() << " 0x"
+                << std::hex << i->first << std::dec << " "
+                << le->get_type_str() << ": "
+                << " (" << detail << ")" << std::endl;
+      // 'p' renamed from 'i' to stop shadowing the outer event iterator.
+      for (std::vector<std::string>::iterator p = ev_paths.begin(); p != ev_paths.end(); ++p) {
+        std::cout << "  " << *p << std::endl;
+      }
+    } else if (auto& pi = i->second.pi; pi) {
+      std::cout << pi->stamp << " 0x"
+                << std::hex << i->first << std::dec << " "
+                << pi->get_type_str() << std::endl;
+    }
+  }
+}
+
+// Print per-type event counts, then the scanner's recorded per-event errors.
+void EventOutput::summary() const
+{
+  std::map<std::string, int> type_count;
+  for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+    std::string type;
+    if (auto& le = i->second.log_event; le)
+      type = le->get_type_str();
+    else if (auto& pi = i->second.pi; pi)
+      type = pi->get_type_str();
+    // std::map::operator[] value-initializes a missing entry to 0, so the
+    // previous explicit count()/insert existence check was redundant.
+    type_count[type] += 1;
+  }
+
+  std::cout << "Events by type:" << std::endl;
+  for (std::map<std::string, int>::iterator i = type_count.begin(); i != type_count.end(); ++i) {
+    std::cout << "  " << i->first << ": " << i->second << std::endl;
+  }
+
+  std::cout << "Errors: " << scan.errors.size() << std::endl;
+  if (!scan.errors.empty()) {
+    for (JournalScanner::ErrorMap::const_iterator i = scan.errors.begin();
+         i != scan.errors.end(); ++i) {
+      std::cout << "  0x" << std::hex << i->first << std::dec
+                << ": " << i->second.r << " "
+                << i->second.description << std::endl;
+    }
+  }
+}
diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h
new file mode 100644
index 000000000..65d968409
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.h
@@ -0,0 +1,42 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#ifndef EVENT_OUTPUT_H
+#define EVENT_OUTPUT_H
+
+#include <string>
+
+class JournalScanner;
+
+/**
+ * Different output formats for the results of a journal scan
+ */
+class EventOutput
+{
+  private:
+    JournalScanner const &scan;  // Source of events/errors to render
+    std::string const path;      // Output file (json) or directory (binary)
+
+  public:
+    // Holds a reference to 'scan_'; the scanner must outlive this object.
+    EventOutput(JournalScanner const &scan_, std::string const &path_)
+      : scan(scan_), path(path_) {}
+
+    void summary() const;  // counts by type, to stdout
+    void list() const;     // one line per event, to stdout
+    int json() const;      // JSON array written to 'path'
+    int binary() const;    // raw encoded blobs, one file per event, under 'path'
+};
+
+#endif // EVENT_OUTPUT_H
+
diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc
new file mode 100644
index 000000000..266d7fccb
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.cc
@@ -0,0 +1,315 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include "JournalFilter.h"
+
+#include "common/ceph_argparse.h"
+
+#include "mds/events/ESession.h"
+#include "mds/events/EUpdate.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+const string JournalFilter::range_separator("..");
+
+// Decide whether a purge-queue item at journal offset 'pos' passes this
+// filter.  All configured conditions must hold (AND semantics).
+bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const
+{
+  // The offset must lie inside the requested journal range.
+  if (pos < range_start || pos >= range_end) {
+    return false;
+  }
+
+  // When an action filter is configured, the item's action must match it.
+  const bool action_ok =
+      (purge_action == PurgeItem::NONE) || (pi.action == purge_action);
+
+  // When an inode filter is configured, the item's inode must match it.
+  const bool inode_ok = (!inode) || (pi.ino == inode);
+
+  return action_ok && inode_ok;
+}
+
+/*
+ * Return whether a LogEvent is to be included or excluded.
+ *
+ * The filter parameters are applied on an AND basis: if any
+ * condition is not met, the event is excluded. Try to do
+ * the fastest checks first.
+ */
+bool JournalFilter::apply(uint64_t pos, LogEvent &le) const
+{
+  /* Filtering by journal offset range */
+  if (pos < range_start || pos >= range_end) {
+    return false;
+  }
+
+  /* Filtering by event type */
+  if (event_type != 0) {
+    if (le.get_type() != event_type) {
+      return false;
+    }
+  }
+
+  /* Filtering by client */
+  if (client_name.num()) {
+    EMetaBlob const *metablob = le.get_metablob();
+    if (metablob) {
+      if (metablob->get_client_name() != client_name) {
+        return false;
+      }
+    } else if (le.get_type() == EVENT_SESSION) {
+      // Session events carry no metablob; match against the session's own
+      // client identity instead.
+      ESession *es = reinterpret_cast<ESession*>(&le);
+      if (es->get_client_inst().name != client_name) {
+        return false;
+      }
+    } else {
+      // No way to attribute this event to a client: exclude it.
+      return false;
+    }
+  }
+
+  /* Filtering by inode */
+  if (inode) {
+    EMetaBlob const *metablob = le.get_metablob();
+    if (metablob) {
+      std::set<inodeno_t> inodes;
+      metablob->get_inodes(inodes);
+      bool match_any = false;
+      for (std::set<inodeno_t>::iterator i = inodes.begin(); i != inodes.end(); ++i) {
+        if (*i == inode) {
+          match_any = true;
+          break;
+        }
+      }
+      if (!match_any) {
+        return false;
+      }
+    } else {
+      return false;
+    }
+  }
+
+  /* Filtering by frag and dentry */
+  if (!frag_dentry.empty() || frag.ino) {
+    EMetaBlob const *metablob = le.get_metablob();
+    if (metablob) {
+      std::map<dirfrag_t, std::set<std::string> > dentries;
+      metablob->get_dentries(dentries);
+
+      // Frag filter: event must touch a dentry in the requested dirfrag.
+      if (frag.ino) {
+        bool match_any = false;
+        for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin();
+            i != dentries.end(); ++i) {
+          if (i->first == frag) {
+            match_any = true;
+            break;
+          }
+        }
+        if (!match_any) {
+          return false;
+        }
+      }
+
+      // Dentry-name filter: any touched dentry (in any frag) may match.
+      if (!frag_dentry.empty()) {
+        bool match_any = false;
+        for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin();
+            i != dentries.end() && !match_any; ++i) {
+          std::set<std::string> const &names = i->second;
+          for (std::set<std::string>::iterator j = names.begin();
+              j != names.end() && !match_any; ++j) {
+            if (*j == frag_dentry) {
+              match_any = true;
+            }
+          }
+        }
+        if (!match_any) {
+          return false;
+        }
+      }
+
+    } else {
+      return false;
+    }
+  }
+
+  /* Filtering by file path */
+  if (!path_expr.empty()) {
+    EMetaBlob const *metablob = le.get_metablob();
+    if (metablob) {
+      std::vector<std::string> paths;
+      metablob->get_paths(paths);
+      bool match_any = false;
+      for (std::vector<std::string>::iterator p = paths.begin(); p != paths.end(); ++p) {
+        // Substring match, not a regex or glob.
+        if ((*p).find(path_expr) != std::string::npos) {
+          match_any = true;
+          break;
+        }
+      }
+      if (!match_any) {
+        return false;
+      }
+    } else {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+
+// Consume filter-related options from argv starting at 'arg'.  Each
+// recognized option is removed/advanced past by ceph_argparse_witharg;
+// the loop stops at the first argument the filter does not understand,
+// leaving 'arg' pointing there for the caller.  Returns 0 on success or
+// -EINVAL for a malformed or inapplicable option value.
+int JournalFilter::parse_args(
+  std::vector<const char*> &argv,
+  std::vector<const char*>::iterator &arg)
+{
+  while(arg != argv.end()) {
+    std::string arg_str;
+    if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) {
+      // Format: "<start>..<end>", either bound optional.
+      size_t sep_loc = arg_str.find(JournalFilter::range_separator);
+      if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) {
+        derr << "Invalid range '" << arg_str << "'" << dendl;
+        return -EINVAL;
+      }
+
+      // We have a lower bound
+      if (sep_loc > 0) {
+        std::string range_start_str = arg_str.substr(0, sep_loc);
+        std::string parse_err;
+        range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err);
+        if (!parse_err.empty()) {
+          derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl;
+          return -EINVAL;
+        }
+      }
+
+      // We have an upper bound
+      if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) {
+        std::string range_end_str = arg_str.substr(sep_loc + range_separator.size());
+        std::string parse_err;
+        range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err);
+        if (!parse_err.empty()) {
+          derr << "Invalid upper bound '" << range_end_str << "': " << parse_err << dendl;
+          return -EINVAL;
+        }
+      }
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+      // Path filtering only makes sense for mdlog journals.
+      if (!type.compare("purge_queue")) {
+        derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl;
+        return -EINVAL;
+      }
+      dout(4) << "Filtering by path '" << arg_str << "'" << dendl;
+      path_expr = arg_str;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) {
+      dout(4) << "Filtering by inode '" << arg_str << "'" << dendl;
+      std::string parse_err;
+      inode = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl;
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) {
+      // The type string is journal-kind specific: LogEvent names for mdlog,
+      // PurgeItem action names for purge_queue.
+      try {
+        if (!type.compare("mdlog")) {
+          event_type = LogEvent::str_to_type(arg_str);
+        } else if (!type.compare("purge_queue")) {
+          purge_action = PurgeItem::str_to_type(arg_str);
+        }
+      } catch (const std::out_of_range&) {
+        derr << "Invalid event type '" << arg_str << "'" << dendl;
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {
+        derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." << dendl;
+        return -EINVAL;
+      }
+      // Format: "<ino>" or "<ino>.<frag>"; a missing frag part means frag 0.
+      std::string const frag_sep = ".";
+      size_t sep_loc = arg_str.find(frag_sep);
+      std::string inode_str;
+      std::string frag_str;
+      if (sep_loc != std::string::npos) {
+        inode_str = arg_str.substr(0, sep_loc);
+        frag_str = arg_str.substr(sep_loc + 1);
+      } else {
+        inode_str = arg_str;
+        frag_str = "0";
+      }
+
+      std::string parse_err;
+      inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl;
+        return -EINVAL;
+      }
+
+      uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl;
+        return -EINVAL;
+      }
+
+      frag = dirfrag_t(frag_ino, frag_t(frag_enc));
+      dout(4) << "dirfrag filter: '" << frag << "'" << dendl;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {
+        derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl;
+        return -EINVAL;
+      }
+      frag_dentry = arg_str;
+      dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl;
+    } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) {
+      if (!type.compare("purge_queue")) {
+        derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl;
+        return -EINVAL;
+      }
+
+      std::string parse_err;
+      int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+      if (!parse_err.empty()) {
+        derr << "Invalid client number " << arg_str << dendl;
+        return -EINVAL;
+      }
+      client_name = entity_name_t::CLIENT(client_num);
+    } else {
+      // We're done with args the filter understands
+      break;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * If the filter params are only range, then return
+ * true and set start & end. Else return false.
+ *
+ * Use this to discover if the user has requested a contiguous range
+ * rather than any per-event filtering.
+ */
+// Report the configured offset range via 'start'/'end', but only when the
+// range is the *sole* active filter: if any per-event filter is set (or no
+// range was actually given), callers cannot shortcut to a contiguous span.
+bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const
+{
+  // Any of these means per-event filtering is in effect.
+  const bool per_event_filter_active =
+         !path_expr.empty()
+      || inode != 0
+      || event_type != 0
+      || frag.ino != 0
+      || client_name.num() != 0;
+
+  // The fully-default range (0..max) counts as "no range requested".
+  const bool range_is_default =
+      (range_start == 0 && range_end == (uint64_t)(-1));
+
+  if (per_event_filter_active || range_is_default) {
+    return false;
+  }
+
+  start = range_start;
+  end = range_end;
+  return true;
+}
diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h
new file mode 100644
index 000000000..f7a2db614
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.h
@@ -0,0 +1,73 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#ifndef JOURNAL_FILTER_H
+#define JOURNAL_FILTER_H
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/PurgeQueue.h"
+
+/**
+ * A set of conditions for narrowing down a search through the journal
+ */
+class JournalFilter
+{
+  private:
+
+  /* Filtering by journal offset range */
+  uint64_t range_start;
+  uint64_t range_end;
+  static const std::string range_separator;  // ".." in "--range=<a>..<b>"
+
+  /* Filtering by file (sub) path */
+  std::string path_expr;
+
+  /* Filtering by inode */
+  inodeno_t inode;
+
+  /* Filtering by type */
+  LogEvent::EventType event_type;
+
+  // Journal kind this filter was built for ("mdlog" or "purge_queue");
+  // controls which options parse_args() accepts.
+  std::string type;
+
+  /* Filtering by PurgeItem::Action */
+  PurgeItem::Action purge_action;
+
+  /* Filtering by dirfrag */
+  dirfrag_t frag;
+  std::string frag_dentry; ///< optional, filter dentry name within fragment
+
+  /* Filtering by metablob client name */
+  entity_name_t client_name;
+
+  public:
+  // Defaults accept everything: range 0..max, no inode/type/client filters.
+  JournalFilter(std::string t) :
+    range_start(0),
+    range_end(-1),
+    inode(0),
+    event_type(0),
+    type(t),
+    purge_action(PurgeItem::NONE) {}
+
+  bool get_range(uint64_t &start, uint64_t &end) const;
+  bool apply(uint64_t pos, LogEvent &le) const;
+  bool apply(uint64_t pos, PurgeItem &pi) const;
+  int parse_args(
+    std::vector<const char*> &argv,
+    std::vector<const char*>::iterator &arg);
+};
+
+#endif // JOURNAL_FILTER_H
+
diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc
new file mode 100644
index 000000000..e72542fd4
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.cc
@@ -0,0 +1,438 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include "include/rados/librados.hpp"
+#include "mds/JournalPointer.h"
+
+#include "mds/events/ESubtreeMap.h"
+#include "mds/PurgeQueue.h"
+
+#include "JournalScanner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+/**
+ * Read journal header, followed by sequential scan through journal space.
+ *
+ * Return 0 on success, else error code. Note that success has the special meaning
+ * that we were able to apply our checks, it does *not* mean that the journal is
+ * healthy.
+ */
+int JournalScanner::scan(bool const full)
+{
+  int r = 0;
+
+  // Resolve the journal inode (reads the pointer object for mdlog).
+  r = set_journal_ino();
+  if (r < 0) {
+    return r;
+  }
+
+  // For mdlog, a missing pointer means there is no header to look for;
+  // purge_queue journals have no pointer and always get a header scan.
+  if (!is_mdlog || pointer_present) {
+    r = scan_header();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // Only walk the event stream when asked to and when a header was found
+  // (scan_events dereferences the decoded header).
+  if (full && header_present) {
+    r = scan_events();
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+
+// Determine the journal inode for this rank: purge_queue inodes are fixed
+// (MDS_INO_PURGE_QUEUE + rank); mdlog inodes are discovered by reading the
+// journal pointer object (which also sets is_mdlog/pointer_* state).
+int JournalScanner::set_journal_ino()
+{
+  int r = 0;
+  if (type == "purge_queue") {
+    ino = MDS_INO_PURGE_QUEUE + rank;
+  }
+  else if (type == "mdlog"){
+    r = scan_pointer();
+    is_mdlog = true;
+  }
+  else {
+    ceph_abort(); // should not get here
+  }
+  return r;
+}
+
+// Read and decode the mdlog journal pointer object, setting
+// pointer_present/pointer_valid and (on success) 'ino'.  A missing or
+// corrupt pointer is itself a valid scan result, so those paths return 0;
+// only an unexpected read error is propagated.
+int JournalScanner::scan_pointer()
+{
+  // Issue read
+  std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0);
+  bufferlist pointer_bl;
+  int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0);
+  if (r == -ENOENT) {
+    // 'Successfully' discovered the pointer is missing.
+    derr << "Pointer " << pointer_oid << " is absent" << dendl;
+    return 0;
+  } else if (r < 0) {
+    // Error preventing us interrogating pointer
+    derr << "Pointer " << pointer_oid << " is unreadable" << dendl;
+    return r;
+  } else {
+    dout(4) << "Pointer " << pointer_oid << " is readable" << dendl;
+    pointer_present = true;
+
+    JournalPointer jp;
+    try {
+      auto q = pointer_bl.cbegin();
+      jp.decode(q);
+    } catch(buffer::error &e) {
+      // Corrupt pointer: record the fact and carry on (return success).
+      derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl;
+      return 0;
+    }
+
+    pointer_valid = true;
+    // The pointer's front journal is the one we will scan.
+    ino = jp.front;
+    return 0;
+  }
+}
+
+
+// Read and decode the journal header object (object 0 of the journal ino),
+// setting header_present/header_valid and, on success, 'header' (heap
+// allocated; freed in the destructor).  As with scan_pointer, discovering
+// a missing/corrupt header is a successful scan outcome and returns 0.
+int JournalScanner::scan_header()
+{
+  int r;
+
+  bufferlist header_bl;
+  std::string header_name = obj_name(0);
+  dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl;
+  r = io.read(header_name, header_bl, INT_MAX, 0);
+  if (r < 0) {
+    derr << "Header " << header_name << " is unreadable" << dendl;
+    return 0;  // "Successfully" found an error
+  } else {
+    header_present = true;
+  }
+
+  auto header_bl_i = header_bl.cbegin();
+  header = new Journaler::Header();
+  try
+  {
+    header->decode(header_bl_i);
+  }
+  catch (buffer::error &e)
+  {
+    derr << "Header is corrupt (" << e.what() << ")" << dendl;
+    delete header;
+    header = NULL;
+    return 0;  // "Successfully" found an error
+  }
+
+  // Sanity-check decoded contents: magic string and offset ordering
+  // trimmed_pos <= expire_pos <= write_pos must hold.
+  if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) {
+    derr << "Header is corrupt (bad magic)" << dendl;
+    return 0;  // "Successfully" found an error
+  }
+  if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) {
+    derr << "Header is invalid (inconsistent offsets)" << dendl;
+    return 0;  // "Successfully" found an error
+  }
+  header_valid = true;
+
+  return 0;
+}
+
+
+// Sequentially scan journal objects from expire_pos, decoding entries and
+// recording valid events, missing objects, and invalid (gap) ranges.  The
+// scan is resilient: on corruption it searches forward byte-by-byte for the
+// next entry sentinel rather than giving up.  Requires 'header' to be set.
+int JournalScanner::scan_events()
+{
+  uint64_t object_size = g_conf()->mds_log_segment_size;
+  if (object_size == 0) {
+    // Default layout object size
+    object_size = file_layout_t::get_default().object_size;
+  }
+
+  uint64_t read_offset = header->expire_pos;
+  dout(10) << std::hex << "Header 0x"
+    << header->trimmed_pos << " 0x"
+    << header->expire_pos << " 0x"
+    << header->write_pos << std::dec << dendl;
+  dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl;
+
+  // TODO also check for extraneous objects before the trimmed pos or after the write pos,
+  // which would indicate a bogus header.
+
+  bufferlist read_buf;
+  bool gap = false;            // currently inside a corrupt/unreadable region?
+  uint64_t gap_start = -1;     // journal offset where the current gap began
+  for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) {
+    uint64_t offset_in_obj = 0;
+    if (obj_offset * object_size < header->expire_pos) {
+      // Skip up to expire_pos from start of the object
+      // (happens for the first object we read)
+      offset_in_obj = header->expire_pos - obj_offset * object_size;
+    }
+
+    // Read this journal segment
+    bufferlist this_object;
+    std::string const oid = obj_name(obj_offset);
+    int r = io.read(oid, this_object, INT_MAX, offset_in_obj);
+
+    // Handle absent journal segments
+    if (r < 0) {
+      if (obj_offset > (header->write_pos / object_size)) {
+        dout(4) << "Reached end of journal objects" << dendl;
+        break;
+      } else {
+        derr << "Missing object " << oid << dendl;
+      }
+
+      // Record the hole and advance read_offset past it, discarding any
+      // partially-buffered data that can no longer form a valid entry.
+      objects_missing.push_back(obj_offset);
+      if (!gap) {
+        gap_start = read_offset;
+        gap = true;
+      }
+      if (read_buf.length() > 0) {
+        read_offset += read_buf.length();
+        read_buf.clear();
+      }
+      read_offset += object_size - offset_in_obj;
+      continue;
+    } else {
+      dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec
+          << " bytes from " << oid << " gap=" << gap << dendl;
+      objects_valid.push_back(oid);
+      this_object.begin().copy(this_object.length(), read_buf);
+    }
+
+    if (gap) {
+      // No valid data at the current read offset, scan forward until we find something valid looking
+      // or have to drop out to load another object.
+      dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset
+          << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl;
+
+      // NOTE(review): the first decode below runs before the length guard at
+      // the bottom of the do-while; if fewer than sizeof(sentinel) bytes are
+      // buffered an uncaught buffer::error could propagate — confirm.
+      do {
+        auto p = read_buf.cbegin();
+        uint64_t candidate_sentinel;
+        decode(candidate_sentinel, p);
+
+        dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl;
+
+        if (candidate_sentinel == JournalStream::sentinel) {
+          dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl;
+          ranges_invalid.push_back(Range(gap_start, read_offset));
+          gap = false;
+          break;
+        } else {
+          // No sentinel, discard this byte
+          read_buf.splice(0, 1);
+          read_offset += 1;
+        }
+      } while (read_buf.length() >= sizeof(JournalStream::sentinel));
+      dout(4) << "read_buf size is " << read_buf.length() << dendl;
+    }
+    {
+      dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl;
+      while(true) {
+        // TODO: detect and handle legacy format journals: can do many things
+        // on them but on read errors have to give up instead of searching
+        // for sentinels.
+        JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT);
+        bool readable = false;
+        try {
+          uint64_t need;
+          readable = journal_stream.readable(read_buf, &need);
+        } catch (buffer::error &e) {
+          readable = false;
+          dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl;
+          gap = true;
+          gap_start = read_offset;
+          read_buf.splice(0, 1);
+          read_offset += 1;
+          break;
+        }
+
+        if (!readable) {
+          // Out of data, continue to read next object
+          break;
+        }
+
+        bufferlist le_bl;  //< Serialized LogEvent blob
+        dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl;
+        // This cannot fail to decode because we pre-checked that a serialized entry
+        // blob would be readable.
+        uint64_t start_ptr = 0;
+        uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr);
+        dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl;
+        if (start_ptr != read_offset) {
+          // Entry's self-recorded position disagrees with where we read it.
+          derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x"
+              << read_offset << std::dec << dendl;
+          gap = true;
+          gap_start = read_offset;
+          // FIXME: given that entry was invalid, should we be skipping over it?
+          // maybe push bytes back onto start of read_buf and just advance one byte
+          // to start scanning instead.  e.g. if a bogus size value is found it can
+          // cause us to consume and thus skip a bunch of following valid events.
+          read_offset += consumed;
+          break;
+        }
+        bool valid_entry = true;
+        if (is_mdlog) {
+          auto le = LogEvent::decode_event(le_bl.cbegin());
+
+          if (le) {
+            dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+
+            // An ESubtreeMap claiming an expire_pos beyond its own position
+            // is self-inconsistent; record it as an event error.
+            if (le->get_type() == EVENT_SUBTREEMAP
+                || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+              auto&& sle = dynamic_cast<ESubtreeMap&>(*le);
+              if (sle.expire_pos > read_offset) {
+                errors.insert(std::make_pair(
+                      read_offset, EventError(
+                        -ERANGE,
+                        "ESubtreeMap has expire_pos ahead of its own position")));
+              }
+            }
+
+            if (filter.apply(read_offset, *le)) {
+              events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed));
+            }
+          } else {
+            valid_entry = false;
+          }
+        } else if (type == "purge_queue"){
+          auto pi = std::make_unique<PurgeItem>();
+          try {
+            auto q = le_bl.cbegin();
+            pi->decode(q);
+            if (filter.apply(read_offset, *pi)) {
+              events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed));
+            }
+          } catch (const buffer::error &err) {
+            valid_entry = false;
+          }
+        } else {
+          ceph_abort(); // should not get here
+        }
+        if (!valid_entry) {
+          dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+          gap = true;
+          gap_start = read_offset;
+          read_offset += consumed;
+          break;
+        } else {
+          events_valid.push_back(read_offset);
+          read_offset += consumed;
+        }
+      }
+    }
+  }
+
+  if (gap) {
+    // Ended on a gap, assume it ran to end
+    ranges_invalid.push_back(Range(gap_start, -1));
+  }
+
+  dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl;
+  dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl;
+  dout(4) << "Found " << events_valid.size() << " valid events" << dendl;
+  dout(4) << "Selected " << events.size() << " events events for processing" << dendl;
+
+  return 0;
+}
+
+
+// Release the heap-allocated header (if any) and drop scanned events.
+JournalScanner::~JournalScanner()
+{
+  // delete on a null pointer is a no-op, so no explicit guard is needed.
+  delete header;
+  header = NULL;
+  dout(4) << events.size() << " events" << dendl;
+  events.clear();
+}
+
+
+/**
+ * Whether the journal data looks valid and replayable
+ */
+bool JournalScanner::is_healthy() const
+{
+  // Healthy = pointer chain intact (mdlog only), header decoded cleanly,
+  // and no corrupt ranges or missing objects found during the scan.
+  return ((!is_mdlog || (pointer_present && pointer_valid))
+    && header_present && header_valid
+    && ranges_invalid.empty()
+    && objects_missing.empty());
+}
+
+
+/**
+ * Whether the journal data can be read from RADOS
+ */
+bool JournalScanner::is_readable() const
+{
+  // Weaker than is_healthy(): tolerates an absent/invalid pointer and
+  // corrupt ranges, requiring only a valid header and no missing objects.
+  return (header_present && header_valid && objects_missing.empty());
+}
+
+
+/**
+ * Calculate the object name for a given offset
+ */
+std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const
+{
+  // Object names follow the "<ino hex>.<offset hex, 8 digits>" convention;
+  // 60 bytes comfortably holds two 16-digit hex fields plus separator/NUL.
+  char name[60];
+  snprintf(name, sizeof(name), "%llx.%08llx",
+      (unsigned long long)(ino),
+      (unsigned long long)offset);
+  return std::string(name);
+}
+
+
+// Convenience overload using the scanner's own journal inode.
+std::string JournalScanner::obj_name(uint64_t offset) const
+{
+  return obj_name(ino, offset);
+}
+
+
+/*
+ * Write a human readable summary of the journal health
+ */
+void JournalScanner::report(std::ostream &out) const
+{
+  out << "Overall journal integrity: " << (is_healthy() ? "OK" : "DAMAGED") << std::endl;
+
+  // Pointer state only applies to mdlog journals.
+  if (is_mdlog) {
+    if (!pointer_present) {
+      out << "Pointer not found" << std::endl;
+    } else if (!pointer_valid) {
+      out << "Pointer could not be decoded" << std::endl;
+    }
+  }
+  if (!header_present) {
+    out << "Header not found" << std::endl;
+  } else if (!header_valid) {
+    out << "Header could not be decoded" << std::endl;
+  }
+
+  if (objects_missing.size()) {
+    out << "Objects missing:" << std::endl;
+    for (std::vector<uint64_t>::const_iterator om = objects_missing.begin();
+         om != objects_missing.end(); ++om) {
+      out << "  0x" << std::hex << *om << std::dec << std::endl;
+    }
+  }
+
+  if (ranges_invalid.size()) {
+    out << "Corrupt regions:" << std::endl;
+    for (std::vector<Range>::const_iterator r = ranges_invalid.begin();
+         r != ranges_invalid.end(); ++r) {
+      out << "  0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl;
+    }
+  }
+}
+
diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h
new file mode 100644
index 000000000..9197b5596
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.h
@@ -0,0 +1,133 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+#ifndef JOURNAL_SCANNER_H
+#define JOURNAL_SCANNER_H
+
+#include "include/rados/librados_fwd.hpp"
+
+// For Journaler::Header, can't forward-declare nested classes
+#include <osdc/Journaler.h>
+
+#include "JournalFilter.h"
+
+/**
+ * A simple sequential reader for metadata journals. Unlike
+ * the MDS Journaler class, this is written to detect, record,
+ * and read past corruptions and missing objects. It is also
+ * less efficient but more plainly written.
+ */
+class JournalScanner
+{
+  private:
+  librados::IoCtx &io;  // Metadata pool context journal objects are read from
+
+  // Input constraints
+  const int rank;             // MDS rank whose journal to scan
+  std::string type;           // "mdlog" or "purge_queue"
+  JournalFilter const filter; // Event filter applied during scan_events()
+
+  void gap_advance();
+
+  public:
+  JournalScanner(
+      librados::IoCtx &io_,
+      int rank_,
+      const std::string &type_,
+      JournalFilter const &filter_) :
+    io(io_),
+    rank(rank_),
+    type(type_),
+    filter(filter_),
+    is_mdlog(false),
+    pointer_present(false),
+    pointer_valid(false),
+    header_present(false),
+    header_valid(false),
+    header(NULL) {};
+
+  // Convenience constructor: builds an accept-everything filter for 'type_'.
+  JournalScanner(
+      librados::IoCtx &io_,
+      int rank_,
+      const std::string &type_) :
+    io(io_),
+    rank(rank_),
+    type(type_),
+    filter(type_),
+    is_mdlog(false),
+    pointer_present(false),
+    pointer_valid(false),
+    header_present(false),
+    header_valid(false),
+    header(NULL) {};
+
+  ~JournalScanner();
+
+  int set_journal_ino();
+  int scan(bool const full=true);
+  int scan_pointer();
+  int scan_header();
+  int scan_events();
+  void report(std::ostream &out) const;
+
+  std::string obj_name(uint64_t offset) const;
+  std::string obj_name(inodeno_t ino, uint64_t offset) const;
+
+  // The results of the scan
+  inodeno_t ino;  // Corresponds to journal ino according their type
+  // One scanned journal entry: exactly one of log_event (mdlog) or
+  // pi (purge_queue) is set.
+  struct EventRecord {
+    EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {}
+    EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {}
+    std::unique_ptr<LogEvent> log_event;
+    std::unique_ptr<PurgeItem> pi;
+    uint32_t raw_size = 0;  ///< Size from start offset including all encoding overhead
+  };
+
+  // An error found while handling an otherwise-decodable event.
+  class EventError {
+    public:
+    int r;
+    std::string description;
+    EventError(int r_, const std::string &desc_)
+      : r(r_), description(desc_) {}
+  };
+
+  typedef std::map<uint64_t, EventRecord> EventMap;
+  typedef std::map<uint64_t, EventError> ErrorMap;
+  typedef std::pair<uint64_t, uint64_t> Range;  // [start, end) journal offsets
+  bool is_mdlog;
+  bool pointer_present; //mdlog specific
+  bool pointer_valid;   //mdlog specific
+  bool header_present;
+  bool header_valid;
+  Journaler::Header *header;  // Heap-allocated by scan_header(); freed in dtor
+
+  bool is_healthy() const;
+  bool is_readable() const;
+  std::vector<std::string> objects_valid;
+  std::vector<uint64_t> objects_missing;
+  std::vector<Range> ranges_invalid;
+  std::vector<uint64_t> events_valid;
+  EventMap events;
+
+  // For events present in ::events (i.e. scanned successfully),
+  // any subsequent errors handling them (e.g. replaying)
+  ErrorMap errors;
+
+
+  private:
+  // Forbid copy construction because I have ptr members
+  JournalScanner(const JournalScanner &rhs);
+};
+
+#endif // JOURNAL_SCANNER_H
+
diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc
new file mode 100644
index 000000000..ec9860980
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.cc
@@ -0,0 +1,1266 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include <sstream>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "JournalScanner.h"
+#include "EventOutput.h"
+#include "Dumper.h"
+#include "Resetter.h"
+
+#include "JournalTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+
+
+/**
+ * Print command-line usage for the three tool modes (journal, header,
+ * event) to stdout, followed by the generic client options.
+ */
+void JournalTool::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-journal-tool [options] journal <command>\n"
+ << " <command>:\n"
+ << " inspect\n"
+ << " import <path> [--force]\n"
+ << " export <path>\n"
+ << " reset [--force]\n"
+ << " cephfs-journal-tool [options] header <get|set> <field> <value>\n"
+ << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
+ << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
+ << " <selector>:\n"
+ << " --range=<start>..<end>\n"
+ << " --path=<substring>\n"
+ << " --inode=<integer>\n"
+ << " --type=<UPDATE|OPEN|SESSION...>\n"
+ << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
+ << " --client=<session id integer>\n"
+ << " <effect>: [get|recover_dentries|splice]\n"
+ << " <output>: [summary|list|binary|json] [--path <path>]\n"
+ << "\n"
+ << "General options:\n"
+ << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n"
+ << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n"
+ << " this journal is used to queue for purge operation,\n"
+ << " default is mdlog, and only mdlog support event mode)\n"
+ << "\n"
+ << "Special options:\n"
+ << " --alternate-pool <name> Alternative metadata pool to target\n"
+ << " when using recover_dentries.\n";
+
+ generic_client_usage();
+}
+
+
+/**
+ * Handle arguments and hand off to journal/header/event mode
+ *
+ * Parses the common options (--rank, --journal), connects to RADOS,
+ * resolves the metadata pool of the selected filesystem, then runs the
+ * requested mode once for each selected MDS rank.
+ *
+ * @param argv remaining command-line arguments; consumed as parsed
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::main(std::vector<const char*> &argv)
+{
+ int r;
+
+ dout(10) << "JournalTool::main " << dendl;
+ // Common arg parsing
+ // ==================
+ if (argv.empty()) {
+ cerr << "missing positional argument" << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+
+ // --rank is mandatory: it selects the filesystem and the rank(s).
+ std::string rank_str;
+ if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+ derr << "missing mandatory \"--rank\" argument" << dendl;
+ return -EINVAL;
+ }
+
+ if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) {
+ // Default is mdlog
+ type = "mdlog";
+ }
+
+ // Only "mdlog" and "purge_queue" are accepted journal types.
+ r = validate_type(type);
+ if (r != 0) {
+ derr << "journal type is not correct." << dendl;
+ return r;
+ }
+
+ r = role_selector.parse(*fsmap, rank_str, false);
+ if (r != 0) {
+ derr << "Couldn't determine MDS rank." << dendl;
+ return r;
+ }
+
+ // The first remaining positional argument is the mode.
+ std::string mode;
+ if (arg == argv.end()) {
+ derr << "Missing mode [journal|header|event]" << dendl;
+ return -EINVAL;
+ }
+ mode = std::string(*arg);
+ arg = argv.erase(arg);
+
+ // RADOS init
+ // ==========
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Resolve the metadata pool named in the MDS map for this filesystem.
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ ceph_assert(fs != nullptr);
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: creating IoCtx.." << dendl;
+ r = rados.ioctx_create(pool_name.c_str(), input);
+ ceph_assert(r == 0);
+ // Reads and writes target the same pool unless --alternate-pool later
+ // re-points the output IoCtx (event mode).
+ output.dup(input);
+
+ // Execution
+ // =========
+ // journal and header are general journal mode
+ // event mode is only specific for mdlog
+ auto roles = role_selector.get_roles();
+ if (roles.size() > 1) {
+ const std::string &command = argv[0];
+ bool allowed = can_execute_for_all_ranks(mode, command);
+ if (!allowed) {
+ derr << "operation not allowed for all ranks" << dendl;
+ return -EINVAL;
+ }
+
+ all_ranks = true;
+ }
+ // Run once per rank; each iteration gets a private copy of the
+ // remaining args because the mode handlers consume/erase them.
+ for (auto role : roles) {
+ rank = role.rank;
+ std::vector<const char *> rank_argv(argv);
+ dout(4) << "Executing for rank " << rank << dendl;
+ if (mode == std::string("journal")) {
+ r = main_journal(rank_argv);
+ } else if (mode == std::string("header")) {
+ r = main_header(rank_argv);
+ } else if (mode == std::string("event")) {
+ r = main_event(rank_argv);
+ } else {
+ cerr << "Bad command '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+/**
+ * Check that the requested journal type is one the tool knows about.
+ *
+ * @param type journal type name supplied on the command line
+ * @returns 0 for "mdlog" or "purge_queue", -1 otherwise
+ */
+int JournalTool::validate_type(const std::string &type)
+{
+  return (type == "mdlog" || type == "purge_queue") ? 0 : -1;
+}
+
+/**
+ * Build the output file path for a dump.  When operating on all ranks
+ * at once, append ".<rank>" so each rank's dump lands in its own file.
+ *
+ * @param prefix user-supplied output path
+ * @returns the path to actually write to for the current rank
+ */
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+  if (all_ranks) {
+    return prefix + "." + std::to_string(rank);
+  }
+  return prefix;
+}
+
+/**
+ * Whether a command may be applied to every rank in one invocation.
+ * "journal import" is the only mode/command pair that must target a
+ * single rank.
+ *
+ * @param mode tool mode ("journal", "header" or "event")
+ * @param command sub-command within that mode
+ * @returns true if execution across all ranks is permitted
+ */
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+                                            const std::string &command) {
+  return !(mode == "journal" && command == "import");
+}
+
+/**
+ * Handle arguments for 'journal' mode
+ *
+ * This is for operations that act on the journal as a whole:
+ *   inspect, export <path>, import <path> [--force], reset [--force].
+ *
+ * @param argv remaining command-line arguments
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::main_journal(std::vector<const char*> &argv)
+{
+  if (argv.empty()) {
+    derr << "Missing journal command, please see help" << dendl;
+    return -EINVAL;
+  }
+
+  std::string command = argv[0];
+  if (command == "inspect") {
+    return journal_inspect();
+  } else if (command == "export" || command == "import") {
+    bool force = false;
+    if (argv.size() >= 2) {
+      std::string const path = argv[1];
+      if (argv.size() == 3) {
+        if (std::string(argv[2]) == "--force") {
+          force = true;
+        } else {
+          // Report the argument that was actually rejected (argv[2]),
+          // not the path in argv[1].
+          std::cerr << "Unknown argument " << argv[2] << std::endl;
+          return -EINVAL;
+        }
+      }
+      return journal_export(path, command == "import", force);
+    } else {
+      derr << "Missing path" << dendl;
+      return -EINVAL;
+    }
+  } else if (command == "reset") {
+    bool force = false;
+    if (argv.size() == 2) {
+      if (std::string(argv[1]) == "--force") {
+        force = true;
+      } else {
+        std::cerr << "Unknown argument " << argv[1] << std::endl;
+        return -EINVAL;
+      }
+    } else if (argv.size() > 2) {
+      std::cerr << "Too many arguments!" << std::endl;
+      return -EINVAL;
+    }
+    return journal_reset(force);
+  } else {
+    derr << "Bad journal command '" << command << "'" << dendl;
+    return -EINVAL;
+  }
+}
+
+
+/**
+ * Parse arguments and execute for 'header' mode
+ *
+ * This is for operations that act on the header only.
+ * Supports "get" (dump the header as JSON to stdout) and
+ * "set <field> <value>" (rewrite one numeric field and persist it).
+ *
+ * @param argv remaining command-line arguments
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::main_header(std::vector<const char*> &argv)
+{
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ // full=false: read only the header object, do not scan events.
+ int r = js.scan(false);
+ if (r < 0) {
+ std::cerr << "Unable to scan journal" << std::endl;
+ return r;
+ }
+
+ if (!js.header_present) {
+ std::cerr << "Header object not found!" << std::endl;
+ return -ENOENT;
+ } else if (!js.header_valid && js.header == NULL) {
+ // Can't do a read or a single-field write without a copy of the original
+ derr << "Header could not be read!" << dendl;
+ return -ENOENT;
+ } else {
+ ceph_assert(js.header != NULL);
+ }
+
+ if (argv.empty()) {
+ derr << "Missing header command, must be [get|set]" << dendl;
+ return -EINVAL;
+ }
+ std::vector<const char *>::iterator arg = argv.begin();
+ std::string const command = *arg;
+ arg = argv.erase(arg);
+
+ if (command == std::string("get")) {
+ // Write JSON journal dump to stdout
+ JSONFormatter jf(true);
+ js.header->dump(&jf);
+ jf.flush(std::cout);
+ std::cout << std::endl;
+ } else if (command == std::string("set")) {
+ // Need two more args <key> <val>
+ if (argv.size() != 2) {
+ derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
+ return -EINVAL;
+ }
+
+ std::string const field_name = *arg;
+ arg = argv.erase(arg);
+
+ std::string const value_str = *arg;
+ arg = argv.erase(arg);
+ ceph_assert(argv.empty());
+
+ std::string parse_err;
+ uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ // Map the field name onto a pointer into the in-memory header copy.
+ uint64_t *field = NULL;
+ if (field_name == "trimmed_pos") {
+ field = &(js.header->trimmed_pos);
+ } else if (field_name == "expire_pos") {
+ field = &(js.header->expire_pos);
+ } else if (field_name == "write_pos") {
+ field = &(js.header->write_pos);
+ } else if (field_name == "pool_id") {
+ // NOTE(review): layout.pool_id appears to be a signed 64-bit field;
+ // writing through a uint64_t* assumes identical size/representation
+ // -- confirm against file_layout_t.
+ field = (uint64_t*)(&(js.header->layout.pool_id));
+ } else {
+ derr << "Invalid field '" << field_name << "'" << dendl;
+ return -EINVAL;
+ }
+
+ std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
+ *field = new_val;
+
+ // Re-encode the whole header and overwrite object 0 of this journal.
+ dout(4) << "Writing object..." << dendl;
+ bufferlist header_bl;
+ encode(*(js.header), header_bl);
+ output.write_full(js.obj_name(0), header_bl);
+ dout(4) << "Write complete." << dendl;
+ std::cout << "Successfully updated header." << std::endl;
+ } else {
+ derr << "Bad header command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Parse arguments and execute for 'event' mode
+ *
+ * This is for operations that act on LogEvents within the log:
+ * "get" (read), "recover_dentries" (replay metadata into the backing
+ * store) and "splice" (erase regions/events), followed by an output
+ * command (summary|list|binary|json).
+ *
+ * @param argv remaining command-line arguments
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::main_event(std::vector<const char*> &argv)
+{
+ int r;
+
+ if (argv.empty()) {
+ derr << "Missing event command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+ bool dry_run = false;
+
+ std::string command = *(arg++);
+ if (command != "get" && command != "splice" && command != "recover_dentries") {
+ derr << "Unknown argument '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ // recover_dentries is mdlog-only and accepts an optional --dry_run.
+ if (command == "recover_dentries") {
+ if (type != "mdlog") {
+ derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl;
+ return -EINVAL;
+ } else {
+ if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
+ dry_run = true;
+ }
+ }
+ }
+
+ if (arg == argv.end()) {
+ derr << "Incomplete command line" << dendl;
+ return -EINVAL;
+ }
+
+ // Parse filter options
+ // ====================
+ JournalFilter filter(type);
+ r = filter.parse_args(argv, arg);
+ if (r) {
+ return r;
+ }
+
+ // Parse output options
+ // ====================
+ if (arg == argv.end()) {
+ cerr << "Missing output command" << std::endl;
+ return -EINVAL;
+ }
+ std::string output_style = *(arg++);
+ if (output_style != "binary" && output_style != "json" &&
+ output_style != "summary" && output_style != "list") {
+ cerr << "Unknown argument: '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string output_path = "dump";
+ while(arg != argv.end()) {
+ std::string arg_str;
+ if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+ output_path = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
+ nullptr)) {
+ // Redirect all writes to a different pool than the one we read from.
+ dout(1) << "Using alternate pool " << arg_str << dendl;
+ int r = rados.ioctx_create(arg_str.c_str(), output);
+ ceph_assert(r == 0);
+ other_pool = true;
+ } else {
+ cerr << "Unknown argument: '" << *arg << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ const std::string dump_path = gen_dump_file_path(output_path);
+
+ // Execute command
+ // ===============
+ JournalScanner js(input, rank, type, filter);
+ if (command == "get") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+ } else if (command == "recover_dentries") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ /**
+ * Iterate over log entries, attempting to scavenge from each one
+ */
+ std::set<inodeno_t> consumed_inos;
+ for (JournalScanner::EventMap::iterator i = js.events.begin();
+ i != js.events.end(); ++i) {
+ auto& le = i->second.log_event;
+ EMetaBlob const *mb = le->get_metablob();
+ if (mb) {
+ int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
+ if (scav_r) {
+ dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
+ if (r == 0) {
+ r = scav_r;
+ }
+ // Our goal is to read all we can, so don't stop on errors, but
+ // do record them for possible later output
+ // NOTE(review): the description uses cpp_strerror(r) -- the first
+ // recorded error -- not cpp_strerror(scav_r) for this event; confirm
+ // whether that is intentional.
+ js.errors.insert(std::make_pair(i->first,
+ JournalScanner::EventError(scav_r, cpp_strerror(r))));
+ }
+ }
+ }
+
+ /**
+ * Update InoTable to reflect any inode numbers consumed during scavenge
+ */
+ dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
+ if (consumed_inos.size() && !dry_run) {
+ int consume_r = consume_inos(consumed_inos);
+ if (consume_r) {
+ dout(1) << "Error updating InoTable for " << consumed_inos.size()
+ << " consume inos: " << cpp_strerror(consume_r) << dendl;
+ if (r == 0) {
+ r = consume_r;
+ }
+ }
+ }
+
+ // Remove consumed dentries from lost+found.
+ if (other_pool && !dry_run) {
+ std::set<std::string> found;
+
+ for (auto i : consumed_inos) {
+ char s[20];
+
+ // NOTE(review): "%llx_head" can need up to 16 hex digits + "_head"
+ // + NUL = 22 bytes; a 20-byte buffer truncates inos wider than 14
+ // hex digits -- confirm the realistic ino range.
+ snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
+ dout(20) << "removing " << s << dendl;
+ found.insert(std::string(s));
+ }
+
+ object_t frag_oid;
+ frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
+ frag_t(), "");
+ output.omap_rm_keys(frag_oid.name, found);
+ }
+ } else if (command == "splice") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ uint64_t start, end;
+ if (filter.get_range(start, end)) {
+ // Special case for range filter: erase a numeric range in the log
+ uint64_t range = end - start;
+ int r = erase_region(js, start, range);
+ if (r) {
+ derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else {
+ // General case: erase a collection of individual entries in the log
+ for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
+ dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
+
+ int r = erase_region(js, i->first, i->second.raw_size);
+ if (r) {
+ derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+
+ } else {
+ cerr << "Unknown argument '" << command << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // Generate output
+ // ===============
+ EventOutput output(js, dump_path);
+ int output_result = 0;
+ if (output_style == "binary") {
+ output_result = output.binary();
+ } else if (output_style == "json") {
+ output_result = output.json();
+ } else if (output_style == "summary") {
+ output.summary();
+ } else if (output_style == "list") {
+ output.list();
+ } else {
+ std::cerr << "Bad output command '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (output_result != 0) {
+ std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
+ }
+
+ return output_result;
+}
+
+/**
+ * Provide the user with information about the condition of the journal,
+ * especially indicating what range of log events is available and where
+ * any gaps or corruptions in the journal are.
+ *
+ * @returns 0 on success, else negative error code from the scan
+ */
+int JournalTool::journal_inspect()
+{
+  JournalFilter filter(type);
+  JournalScanner scanner(input, rank, type, filter);
+  int ret = scanner.scan();
+  if (ret) {
+    std::cerr << "Failed to scan journal (" << cpp_strerror(ret) << ")" << std::endl;
+    return ret;
+  }
+
+  scanner.report(std::cout);
+  return 0;
+}
+
+
+/**
+ * Attempt to export a binary dump of the journal, or import one.
+ *
+ * An export is refused if the header is malformed or objects are
+ * inaccessible; in that case the user can fall back to manually
+ * listing RADOS objects and extracting them with the ``rados`` CLI.
+ *
+ * @param path local file to write to (export) or read from (import)
+ * @param import true to undump into RADOS, false to dump to disk
+ * @param force passed through to the undump path
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::journal_export(std::string const &path, bool import, bool force)
+{
+  JournalScanner scanner(input, rank, type);
+
+  if (!import) {
+    // Exporting: verify the header is valid and no objects are missing
+    // before attempting a dump.
+    int scan_r = scanner.scan();
+    if (scan_r < 0) {
+      derr << "Unable to scan journal, assuming badly damaged" << dendl;
+      return scan_r;
+    }
+    if (!scanner.is_readable()) {
+      derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
+      return -EIO;
+    }
+  }
+
+  // The journal data reads cleanly (or we are importing); hand the
+  // transfer off to the Dumper.
+  Dumper dumper;
+  int ret = dumper.init(mds_role_t(role_selector.get_ns(), rank), type);
+  if (ret < 0) {
+    derr << "dumper::init failed: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  if (import) {
+    return dumper.undump(path.c_str(), force);
+  }
+  const std::string ex_path = gen_dump_file_path(path);
+  return dumper.dump(ex_path.c_str());
+}
+
+
+/**
+ * Truncate journal and insert EResetJournal
+ *
+ * @param hard if true, perform a hard reset rather than a normal one
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::journal_reset(bool hard)
+{
+  Resetter resetter;
+  int ret = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard);
+  if (ret < 0) {
+    derr << "resetter::init failed: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+
+  return hard ? resetter.reset_hard() : resetter.reset();
+}
+
+
+/**
+ * Selective offline replay which only reads out dentries and writes
+ * them to the backing store iff their version is > what is currently
+ * in the backing store.
+ *
+ * In order to write dentries to the backing store, we may create the
+ * required enclosing dirfrag objects.
+ *
+ * Test this by running scavenge on an unflushed journal, then nuking
+ * it offline, then starting an MDS and seeing that the dentries are
+ * visible.
+ *
+ * @param metablob an EMetaBlob retrieved from the journal
+ * @param dry_run if true, do no writes to RADOS
+ * @param consumed_inos output, populated with any inos inserted
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos)
+{
+ ceph_assert(consumed_inos != NULL);
+
+ int r = 0;
+
+ // Replay fullbits (dentry+inode)
+ for (const auto& frag : metablob.lump_order) {
+ EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
+ // Ensure the lump's dentry lists (dfull/dremote/dnull) are decoded.
+ lump._decode_bits();
+ object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
+
+ dout(4) << "inspecting lump " << frag_oid.name << dendl;
+
+
+ // We will record old fnode version for use in hard link handling
+ // If we don't read an old fnode, take version as zero and write in
+ // all hardlinks we find.
+ version_t old_fnode_version = 0;
+
+ // Update fnode in omap header of dirfrag object
+ bool write_fnode = false;
+ bufferlist old_fnode_bl;
+ r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
+ if (r == -ENOENT) {
+ // Creating dirfrag from scratch
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ write_fnode = true;
+ // Note: creating the dirfrag *without* a backtrace, relying on
+ // MDS to regenerate backtraces on read or in FSCK
+ } else if (r == 0) {
+ // Conditionally update existing omap header
+ fnode_t old_fnode;
+ auto old_fnode_iter = old_fnode_bl.cbegin();
+ try {
+ old_fnode.decode(old_fnode_iter);
+ dout(4) << "frag " << frag_oid.name << " fnode old v" <<
+ old_fnode.version << " vs new v" << lump.fnode->version << dendl;
+ old_fnode_version = old_fnode.version;
+ // Only overwrite if the journal's copy is strictly newer.
+ write_fnode = old_fnode_version < lump.fnode->version;
+ } catch (const buffer::error &err) {
+ dout(1) << "frag " << frag_oid.name
+ << " is corrupt, overwriting" << dendl;
+ write_fnode = true;
+ }
+ } else {
+ // Unexpected error
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if ((other_pool || write_fnode) && !dry_run) {
+ dout(4) << "writing fnode to omap header" << dendl;
+ bufferlist fnode_bl;
+ lump.fnode->encode(fnode_bl);
+ // When scavenging into an alternate pool, only system inodes get
+ // their fnode header written there.
+ if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
+ r = output.omap_set_header(frag_oid.name, fnode_bl);
+ }
+ if (r != 0) {
+ derr << "Failed to write fnode for frag object "
+ << frag_oid.name << dendl;
+ return r;
+ }
+ }
+
+ std::set<std::string> read_keys;
+
+ // Compose list of potentially-existing dentries we would like to fetch
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for (const auto& nb : lump.get_dnull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ // Perform bulk read of existing dentries
+ std::map<std::string, bufferlist> read_vals;
+ r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ if (r == -ENOENT && other_pool) {
+ // Fragment may already live only in the alternate pool.
+ r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ }
+ if (r != 0) {
+ derr << "unexpected error reading fragment object "
+ << frag_oid.name << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Compose list of dentries we will write back
+ std::map<std::string, bufferlist> write_vals;
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ // dnfirst is decoded to advance the stream; value is unused here.
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ // leave write_dentry false, we have no version to
+ // compare with in a hardlink, so it's not safe to
+ // squash over it with what's in this fullbit
+ dout(10) << "Existing remote inode in slot to be (maybe) written "
+ << "by a full inode from the journal dn '" << fb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ // Read out inode version to compare with backing store
+ InodeStore inode;
+ if (dentry_type == 'i') {
+ // 'i' is the newer envelope carrying an alternate_name.
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode.decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode.decode_bare(q);
+ }
+ dout(4) << "decoded embedded inode version "
+ << inode.inode->version << " vs fullbit version "
+ << fb.inode->version << dendl;
+ if (inode.inode->version < fb.inode->version) {
+ write_dentry = true;
+ }
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing I dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(fb.dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ encode_fullbit_as_inode(fb, true, &dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(fb.inode->ino);
+ }
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ // No per-inode version for hardlinks: compare fnode versions.
+ dout(10) << "Existing hardlink inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ dout(10) << "Existing full inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing L dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(rb.dnfirst, dentry_bl);
+ encode('L', dentry_bl);
+ encode(rb.ino, dentry_bl);
+ encode(rb.d_type, dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(rb.ino);
+ }
+ }
+
+ // Collect keys of dentries that null bits say should be removed.
+ std::set<std::string> null_vals;
+ for (const auto& nb : lump.get_dnull()) {
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
+ << dendl;
+
+ auto it = read_vals.find(key);
+ if (it != read_vals.end()) {
+ dout(4) << "dentry exists, will remove" << dendl;
+
+ auto q = it->second.cbegin();
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ bool remove_dentry = false;
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ dout(10) << "Existing full inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode->version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, will remove" << dendl;
+ remove_dentry = true;
+ }
+
+ if (remove_dentry)
+ null_vals.insert(key);
+ }
+ }
+
+ // Write back any new/changed dentries
+ if (!write_vals.empty()) {
+ r = output.omap_set(frag_oid.name, write_vals);
+ if (r != 0) {
+ derr << "error writing dentries to " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // remove any null dentries
+ if (!null_vals.empty()) {
+ r = output.omap_rm_keys(frag_oid.name, null_vals);
+ if (r != 0) {
+ derr << "error removing dentries from " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ /* Now that we've looked at the dirlumps, we finally pay attention to
+ * the roots (i.e. inodes without ancestry). This is necessary in order
+ * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
+ * important because clients use them to infer completeness
+ * of directories
+ */
+ for (const auto& fb : metablob.roots) {
+ inodeno_t ino = fb.inode->ino;
+ dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
+
+ object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+ dout(4) << "object id " << root_oid.name << dendl;
+
+ bool write_root_ino = false;
+ bufferlist old_root_ino_bl;
+ // (1<<22) = 4MiB read cap for the existing root inode object.
+ r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
+ if (r == -ENOENT) {
+ dout(4) << "root does not exist, will create" << dendl;
+ write_root_ino = true;
+ } else if (r >= 0) {
+ r = 0;
+ InodeStore old_inode;
+ dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
+ << ")" << dendl;
+ auto inode_bl_iter = old_root_ino_bl.cbegin();
+ std::string magic;
+ decode(magic, inode_bl_iter);
+ if (magic == CEPH_FS_ONDISK_MAGIC) {
+ dout(4) << "magic ok" << dendl;
+ old_inode.decode(inode_bl_iter);
+
+ if (old_inode.inode->version < fb.inode->version) {
+ write_root_ino = true;
+ }
+ } else {
+ dout(4) << "magic bad: '" << magic << "'" << dendl;
+ write_root_ino = true;
+ }
+ } else {
+ derr << "error reading root inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (write_root_ino && !dry_run) {
+ dout(4) << "writing root ino " << root_oid.name
+ << " version " << fb.inode->version << dendl;
+
+ // Compose: root ino format is magic,InodeStore(bare=false)
+ bufferlist new_root_ino_bl;
+ encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
+ encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
+
+ // Write to RADOS
+ r = output.write_full(root_oid.name, new_root_ino_bl);
+ if (r != 0) {
+ derr << "error writing inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Erase a region of the log by overwriting it with ENoOp
+ *
+ */
+int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
+{
+ // To erase this region, we use our preamble, the encoding overhead
+ // of an ENoOp, and our trailing start ptr. Calculate how much padding
+ // is needed inside the ENoOp to make up the difference.
+ bufferlist tmp;
+ if (type == "mdlog") {
+ ENoOp enoop(0);
+ enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.encode(tmp);
+ }
+
+ dout(4) << "erase_region " << pos << " len=" << length << dendl;
+
+ // FIXME: get the preamble/postamble length via JournalStream
+ // Framing overhead assumed here: u32 length + u64 seq + u64 start ptr.
+ int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
+ dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
+
+ // A region shorter than the minimum encoded entry cannot be erased.
+ if (padding < 0) {
+ derr << "Erase region " << length << " too short" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist entry;
+ if (type == "mdlog") {
+ // Serialize an ENoOp with the correct amount of padding
+ ENoOp enoop(padding);
+ enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.pad_size = padding;
+ pi.encode(entry);
+ }
+ JournalStream stream(JOURNAL_FORMAT_RESILIENT);
+ // Serialize region of log stream
+ bufferlist log_data;
+ stream.write(entry, &log_data, pos);
+
+ dout(4) << "erase_region data length " << log_data.length() << dendl;
+ // The padded entry must exactly fill the region being erased.
+ ceph_assert(log_data.length() == length);
+
+ // Write log stream region to RADOS
+ // FIXME: get object size somewhere common to scan_events
+ uint32_t object_size = g_conf()->mds_log_segment_size;
+ if (object_size == 0) {
+ // Default layout object size
+ object_size = file_layout_t::get_default().object_size;
+ }
+
+ // The region may span multiple journal objects: write piecewise,
+ // splitting at object boundaries.
+ uint64_t write_offset = pos;
+ uint64_t obj_offset = (pos / object_size);
+ int r = 0;
+ while(log_data.length()) {
+ std::string const oid = js.obj_name(obj_offset);
+ uint32_t offset_in_obj = write_offset % object_size;
+ uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
+
+ r = output.write(oid, log_data, write_len, offset_in_obj);
+ if (r < 0) {
+ return r;
+ } else {
+ dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
+ r = 0;
+ }
+
+ // Drop the bytes just written and advance to the next object.
+ log_data.splice(0, write_len);
+ write_offset += write_len;
+ obj_offset++;
+ }
+
+ return r;
+}
+
+/**
+ * Given an EMetaBlob::fullbit containing an inode, write out
+ * the encoded inode in the format used by InodeStore (i.e. the
+ * backing store format)
+ *
+ * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
+ * on an offline InodeStore instance. It's way simpler, because we are just
+ * uncritically hauling the data between structs.
+ *
+ * @param fb a fullbit extracted from a journal entry
+ * @param bare if true, leave out [EN|DE]CODE_START decoration
+ * @param out_bl output, write serialized inode to this bufferlist
+ */
+void JournalTool::encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl)
+{
+ ceph_assert(out_bl != NULL);
+
+ // Compose InodeStore
+ // Straight field-for-field copy from the journal fullbit into a fresh
+ // InodeStore; no validation or transformation is performed here.
+ InodeStore new_inode;
+ new_inode.inode = fb.inode;
+ new_inode.xattrs = fb.xattrs;
+ new_inode.dirfragtree = fb.dirfragtree;
+ new_inode.snap_blob = fb.snapbl;
+ new_inode.symlink = fb.symlink;
+ new_inode.old_inodes = fb.old_inodes;
+
+ // Serialize InodeStore
+ // `bare` selects the encoding without the versioned framing (see the
+ // function doc comment above): encode_bare vs full encode.
+ if (bare) {
+ new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else {
+ new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+}
+
+/**
+ * Given a list of inode numbers known to be in use by
+ * inodes in the backing store, ensure that none of these
+ * numbers are listed as free in the InoTables in the
+ * backing store.
+ *
+ * Used after injecting inodes into the backing store, to
+ * ensure that the same inode numbers are not subsequently
+ * used for new files during ordinary operation.
+ *
+ * @param inos list of inode numbers to be removed from
+ * free lists in InoTables
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
+{
+ // Best-effort across all ranks: remember the first error seen in `r`
+ // but keep processing the remaining ranks.
+ int r = 0;
+
+ // InoTable is a per-MDS structure, so iterate over assigned ranks
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ std::set<mds_rank_t> in_ranks;
+ fs->mds_map.get_mds_set(in_ranks);
+
+ for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
+ rank_i != in_ranks.end(); ++rank_i)
+ {
+ // Compose object name
+ std::ostringstream oss;
+ oss << "mds" << *rank_i << "_inotable";
+ object_t inotable_oid = object_t(oss.str());
+
+ // Read object
+ bufferlist inotable_bl;
+ int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
+ if (read_r < 0) {
+ // Things are really bad if we can't read inotable. Beyond our powers.
+ derr << "unable to read inotable '" << inotable_oid.name << "': "
+ << cpp_strerror(read_r) << dendl;
+ r = r ? r : read_r;
+ continue;
+ }
+
+ // Deserialize InoTable
+ version_t inotable_ver;
+ auto q = inotable_bl.cbegin();
+ decode(inotable_ver, q);
+ InoTable ino_table(NULL);
+ ino_table.decode(q);
+
+ // Update InoTable in memory: take each ino off the free list if it
+ // was marked free there.
+ bool inotable_modified = false;
+ for (std::set<inodeno_t>::iterator i = inos.begin();
+ i != inos.end(); ++i)
+ {
+ const inodeno_t ino = *i;
+ if (ino_table.force_consume(ino)) {
+ dout(4) << "Used ino 0x" << std::hex << ino << std::dec
+ << " requires inotable update" << dendl;
+ inotable_modified = true;
+ }
+ }
+
+ // Serialize and write InoTable
+ if (inotable_modified) {
+ inotable_ver += 1;
+ dout(4) << "writing modified inotable version " << inotable_ver << dendl;
+ bufferlist inotable_new_bl;
+ encode(inotable_ver, inotable_new_bl);
+ ino_table.encode_state(inotable_new_bl);
+ int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
+ if (write_r != 0) {
+ derr << "error writing modified inotable " << inotable_oid.name
+ << ": " << cpp_strerror(write_r) << dendl;
+ // Fix: propagate the write error, not the (successful) read result.
+ r = r ? r : write_r;
+ continue;
+ }
+ }
+ }
+
+ return r;
+}
+
diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h
new file mode 100644
index 000000000..8d610a866
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+
+#include "JournalFilter.h"
+
+class JournalScanner;
+
+
+/**
+ * Command line tool for investigating and repairing filesystems
+ * with damaged metadata logs
+ */
+class JournalTool : public MDSUtility
+{
+ private:
+ MDSRoleSelector role_selector;
+ // Bit hacky, use this `rank` member to control behaviour of the
+ // various main_ functions.
+ mds_rank_t rank;
+ // when set, generate per rank dump file path
+ bool all_ranks = false;
+
+ // Journal type selector: "mdlog" or "purge_queue" (see erase_region).
+ std::string type;
+
+ // Entry points
+ int main_journal(std::vector<const char*> &argv);
+ int main_header(std::vector<const char*> &argv);
+ int main_event(std::vector<const char*> &argv);
+
+ // Shared functionality
+ int recover_journal();
+
+ // Journal operations
+ int journal_inspect();
+ int journal_export(std::string const &path, bool import, bool force);
+ int journal_reset(bool hard);
+
+ // Header operations
+ int header_set();
+
+ // I/O handles
+ // `input` reads from the metadata pool; `output` receives writes
+ // (possibly a different pool when other_pool is set -- see below).
+ librados::Rados rados;
+ librados::IoCtx input;
+ librados::IoCtx output;
+
+ // True when output is directed at a pool other than the input pool.
+ bool other_pool;
+
+ // Metadata backing store manipulation
+ int read_lost_found(std::set<std::string> &lost);
+ int recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos);
+
+ // Splicing
+ int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length);
+
+ // Backing store helpers
+ void encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl);
+ int consume_inos(const std::set<inodeno_t> &inos);
+
+ //validate type
+ int validate_type(const std::string &type);
+
+ // generate output file path for dump/export
+ std::string gen_dump_file_path(const std::string &prefix);
+
+ // check if an operation (mode, command) is safe to be
+ // executed on all ranks.
+ bool can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command);
+ public:
+ static void usage();
+ JournalTool() :
+ rank(0), other_pool(false) {}
+ int main(std::vector<const char*> &argv);
+};
+
diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc
new file mode 100644
index 000000000..54386d219
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.cc
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "mon/MonClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+// Construct the client-side plumbing (MonClient, Messenger, Objecter,
+// FSMap). No network activity happens here; see init().
+MDSUtility::MDSUtility() :
+ Dispatcher(g_ceph_context),
+ objecter(NULL),
+ finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"),
+ waiting_for_mds_map(NULL),
+ inited(false)
+{
+ monc = new MonClient(g_ceph_context, poolctx);
+ messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
+ fsmap = new FSMap();
+ objecter = new Objecter(g_ceph_context, messenger, monc, poolctx);
+}
+
+
+// Tear down in reverse order of construction; shutdown() is only run if
+// init() completed (inited == true).
+MDSUtility::~MDSUtility()
+{
+ if (inited) {
+ shutdown();
+ }
+ delete objecter;
+ delete monc;
+ delete messenger;
+ delete fsmap;
+ // handle_fs_map must have consumed (or we never created) the waiter.
+ ceph_assert(waiting_for_mds_map == NULL);
+}
+
+
+// Bring up messenger/monc/objecter, authenticate, and block until an
+// initial FSMap has been received. Returns 0 on success, negative on
+// failure (with partial teardown already performed).
+int MDSUtility::init()
+{
+ // Initialize Messenger
+ poolctx.start(1);
+ messenger->start();
+
+ objecter->set_client_incarnation(0);
+ objecter->init();
+
+ // Connect dispatchers before starting objecter
+ messenger->add_dispatcher_tail(objecter);
+ messenger->add_dispatcher_tail(this);
+
+ // Initialize MonClient
+ if (monc->build_initial_monmap() < 0) {
+ objecter->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ return -1;
+ }
+
+ monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS);
+ monc->set_messenger(messenger);
+ monc->init();
+ int r = monc->authenticate();
+ if (r < 0) {
+ derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl;
+ monc->shutdown();
+ objecter->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ return r;
+ }
+
+ client_t whoami = monc->get_global_id();
+ messenger->set_myname(entity_name_t::CLIENT(whoami.v));
+
+ // Start Objecter and wait for OSD map
+ objecter->start();
+ objecter->wait_for_osd_map();
+
+ // Prepare to receive MDS map and request it
+ // NOTE: init_lock/cond/done are stack locals captured by the C_SafeCond
+ // completion; this is safe because we block below until it has fired.
+ ceph::mutex init_lock = ceph::make_mutex("MDSUtility:init");
+ ceph::condition_variable cond;
+ bool done = false;
+ ceph_assert(!fsmap->get_epoch());
+ // `lock` guards waiting_for_mds_map against concurrent ms_dispatch.
+ lock.lock();
+ waiting_for_mds_map = new C_SafeCond(init_lock, cond, &done, NULL);
+ lock.unlock();
+ monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME);
+ monc->renew_subs();
+
+ // Wait for MDS map
+ dout(4) << "waiting for MDS map..." << dendl;
+ {
+ std::unique_lock locker{init_lock};
+ cond.wait(locker, [&done] { return done; });
+ }
+ dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl;
+
+ finisher.start();
+
+ inited = true;
+ return 0;
+}
+
+
+// Stop all services started by init(), in reverse dependency order.
+void MDSUtility::shutdown()
+{
+ finisher.stop();
+
+ lock.lock();
+ objecter->shutdown();
+ lock.unlock();
+ monc->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ poolctx.finish();
+}
+
+
+// Messenger callback: consume FSMap messages (and drop OSD maps, which
+// the Objecter dispatcher ahead of us already handles). Returns false
+// for message types we do not own so other dispatchers can claim them.
+bool MDSUtility::ms_dispatch(Message *m)
+{
+ std::lock_guard locker{lock};
+ switch (m->get_type()) {
+ case CEPH_MSG_FS_MAP:
+ handle_fs_map((MFSMap*)m);
+ break;
+ case CEPH_MSG_OSD_MAP:
+ break;
+ default:
+ return false;
+ }
+ // We handled it: release our message reference.
+ m->put();
+ return true;
+}
+
+
+// Copy the received FSMap into our local copy and wake the init() waiter
+// (if any). Caller (ms_dispatch) holds `lock`.
+void MDSUtility::handle_fs_map(MFSMap* m)
+{
+ *fsmap = m->get_fsmap();
+ if (waiting_for_mds_map) {
+ waiting_for_mds_map->complete(0);
+ waiting_for_mds_map = NULL;
+ }
+}
+
+
diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h
new file mode 100644
index 000000000..09f1918ba
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MDS_UTILITY_H_
+#define MDS_UTILITY_H_
+
+#include "osdc/Objecter.h"
+#include "mds/FSMap.h"
+#include "messages/MFSMap.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "auth/Auth.h"
+#include "common/async/context_pool.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+
+/// MDS Utility
+/**
+ * This class is the parent for MDS utilities, i.e. classes that
+ * need access the objects belonging to the MDS without actually
+ * acting as an MDS daemon themselves.
+ */
+class MDSUtility : public Dispatcher {
+protected:
+ // Owned raw pointers, allocated in the constructor, deleted in the dtor.
+ Objecter *objecter;
+ FSMap *fsmap;
+ Messenger *messenger;
+ MonClient *monc;
+
+ // Guards waiting_for_mds_map and objecter shutdown vs. ms_dispatch.
+ ceph::mutex lock = ceph::make_mutex("MDSUtility::lock");
+ Finisher finisher;
+ ceph::async::io_context_pool poolctx;
+
+ // Completion armed by init(), fired by handle_fs_map().
+ Context *waiting_for_mds_map;
+
+ // True once init() succeeded; gates shutdown() in the destructor.
+ bool inited;
+public:
+ MDSUtility();
+ ~MDSUtility() override;
+
+ void handle_fs_map(MFSMap* m);
+ bool ms_dispatch(Message *m) override;
+ bool ms_handle_reset(Connection *con) override { return false; }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override { return false; }
+ int init();
+ void shutdown();
+};
+
+#endif /* MDS_UTILITY_H_ */
diff --git a/src/tools/cephfs/MetaTool.cc b/src/tools/cephfs/MetaTool.cc
new file mode 100644
index 000000000..527ae636f
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.cc
@@ -0,0 +1,999 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <string.h>
+#include <map>
+#include <sstream>
+#include <fstream>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+#include "mds/CDentry.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "mds/JournalPointer.h"
+// #include "JournalScanner.h"
+// #include "EventOutput.h"
+// #include "Dumper.h"
+// #include "Resetter.h"
+
+// #include "JournalTool.h"
+#include "MetaTool.h"
+#include "type_helper.hpp"
+#include "include/object.h"
+
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(unsigned char)
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+
+// Free all heap state owned by this op: the cached InodeStore-wrapping
+// inode_meta_t values and any sub_ops still queued on the stack.
+void MetaTool::meta_op::release()
+{
+ for (const auto& i : inodes) {
+ delete i.second;
+ }
+
+ while (!sub_ops.empty()) {
+ delete sub_ops.top();
+ sub_ops.pop();
+ }
+}
+
+// Populate this inode_meta_t from the JSON produced by _show_meta:
+// "snapid_t" -> _f, "itype" -> _t, "store" -> the InodeStore (_i,
+// lazily allocated here if not already present).
+void MetaTool::inode_meta_t::decode_json(JSONObj *obj)
+{
+ unsigned long long tmp;
+ JSONDecoder::decode_json("snapid_t", tmp, obj, true);
+ _f.val = tmp;
+ JSONDecoder::decode_json("itype", tmp, obj, true);
+ _t = tmp;
+ if (NULL == _i)
+ _i = new InodeStore;
+ JSONDecoder::decode_json("store", *_i, obj, true);
+}
+
+// Print the generic client usage banner; MetaTool has no extra options here.
+void MetaTool::usage()
+{
+ generic_client_usage();
+}
+
+// Entry point: connect to RADOS, resolve the metadata/data pools either
+// from the FSMap (normal mode) or from the user-supplied "minfo" string
+// ("mpool dpool rank" -> manual mode), then run `process()` for each
+// selected rank. Returns 0 on success or a negative error code.
+int MetaTool::main(string& mode,
+ string& rank_str,
+ string& minfo,
+ string&ino,
+ string& out,
+ string& in,
+ bool confirm
+ )
+{
+ int r = 0;
+
+ // Manual mode bypasses the FSMap: the user names the pools and rank.
+ std::string manual_meta_pool;
+ std::string manual_data_pool;
+ std::string manual_rank_num;
+ bool manual_mode = false;
+ if (minfo != "") {
+ vector<string> v;
+ string_split(minfo, v);
+ manual_meta_pool = v.size() >= 1 ? v[0] : "";
+ manual_data_pool = v.size() >= 2 ? v[1] : "";
+ manual_rank_num = v.size() >= 3 ? v[2] : "";
+ std::cout << "("<< minfo<< ")=>"
+ << " mpool: " << manual_meta_pool
+ << " dpool: " << manual_data_pool
+ << " rank: " << manual_rank_num
+ << std::endl;
+ if (!manual_meta_pool.empty() && !manual_data_pool.empty() && !manual_rank_num.empty()) {
+ std::cout << "you specify rank: " << manual_rank_num
+ << " mpool: " << manual_meta_pool
+ << " dpool: " << manual_data_pool
+ << "\nstart manual mode!!"<< std::endl;
+ manual_mode = true;
+ }
+ }
+
+ // RADOS init
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ cerr << "RADOS unavailable" << std::endl;
+ return r;
+ }
+
+ if (_debug)
+ cout << "MetaTool: connecting to RADOS..." << std::endl;
+ r = rados.connect();
+ if (r < 0) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (!manual_mode) {
+ r = role_selector.parse(*fsmap, rank_str);
+ if (r != 0) {
+ cerr << "Couldn't determine MDS rank." << std::endl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ assert(fs != nullptr);
+
+ // prepare io for meta pool
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ features = fs->mds_map.get_up_features();
+ if (features == 0)
+ features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ else if (features != CEPH_FEATURES_SUPPORTED_DEFAULT) {
+ cout << "I think we need to check the feature! : " << features << std::endl;
+ return -1;
+ }
+
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl;
+ return r;
+ }
+
+ if (_debug)
+ cout << "MetaTool: creating IoCtx.." << std::endl;
+ r = rados.ioctx_create(pool_name.c_str(), io_meta);
+ assert(r == 0);
+ output.dup(io_meta);
+
+ // prepare io for data pool
+ // NOTE: the IoCtx objects pushed onto io_data_v are heap-allocated and
+ // owned by io_data_v for the lifetime of the tool.
+ for (const auto p : fs->mds_map.get_data_pools()) {
+ r = rados.pool_reverse_lookup(p, &pool_name);
+ if (r < 0) {
+ // Fix: report the data pool id `p` that failed lookup, not the
+ // (already-resolved) metadata pool_id.
+ cerr << "Pool " << p << " named in MDS map not found in RADOS!" << std::endl;
+ return r;
+ }
+ librados::IoCtx* io_data = new librados::IoCtx;
+ r = rados.ioctx_create(pool_name.c_str(), *io_data);
+ assert(r == 0);
+ io_data_v.push_back(io_data);
+ }
+
+ // Run the requested operation once per selected rank.
+ for (auto role : role_selector.get_roles()) {
+ rank = role.rank;
+
+ r = process(mode, ino, out, in, confirm);
+ cout << "executing for rank " << rank << " op[" <<mode<< "] ret : " << r << std::endl;
+ }
+
+ } else {
+ features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ r = rados.ioctx_create(manual_meta_pool.c_str(), io_meta);
+ assert(r == 0);
+
+ librados::IoCtx* io_data = new librados::IoCtx;
+ r = rados.ioctx_create(manual_data_pool.c_str(), *io_data);
+ assert(r == 0);
+ io_data_v.push_back(io_data);
+
+
+ rank = conv_t<int>(manual_rank_num);
+ r = process(mode, ino, out, in, confirm);
+ cout << "op[" << mode << "] ret : " << r << std::endl;
+ }
+ return r;
+}
+
+// Dispatch a command-line mode string to the corresponding operation.
+// Returns the operation's result, or -EINVAL for an unknown mode.
+int MetaTool::process(string& mode, string& ino, string out, string in, bool confirm)
+{
+ if (mode == "showm") {
+ return show_meta_info(ino, out);
+ } else if (mode == "showfn") {
+ return show_fnode(ino, out);
+ } else if (mode == "listc") {
+ return list_meta_info(ino, out);
+ } else if (mode == "amend") {
+ return amend_meta_info(ino, in, confirm);
+ } else if (mode == "amendfn") {
+ return amend_fnode(in, confirm);
+ } else {
+ cerr << "bad command '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+}
+// The five helpers below all follow the same shape: parse the ino string
+// (base auto-detected by stoull, so "0x..." works), build a meta_op with
+// a single sub_op describing the requested operation, and hand it to
+// op_process(). An ino of "0" is rejected as a parameter error.
+int MetaTool::show_fnode(string& ino, string& out)
+{
+ if (ino != "0") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, out);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_SHOW_FN;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+// Amend a directory fnode from JSON file `in`; the target oids are named
+// inside the JSON itself (see _amend_fn), hence ino is fixed to 0 here.
+int MetaTool::amend_fnode(string& in, bool confirm)
+{
+ meta_op op(_debug, "", in, confirm);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_AMEND_FN;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = 0;
+ op.push_op(nsop);
+ return op_process(op);
+}
+// Amend an inode's backing-store record from JSON file `in`.
+int MetaTool::amend_meta_info(string& ino, string& in, bool confirm)
+{
+ if (ino != "0" && in != "") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, "", in, confirm);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_AMEND;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+// List the children (dentries) of a directory inode.
+int MetaTool::list_meta_info(string& ino, string& out)
+{
+ if (ino != "0") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, out);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+// Dump an inode's backing-store record as JSON (to `out` or stdout).
+int MetaTool::show_meta_info(string& ino, string& out)
+{
+ if (ino != "0") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, out);
+
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_SHOW;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+
+// Drive the sub_op stack to completion. Handlers return 0 (done: pop),
+// >0 (a prerequisite sub_op was pushed: retry later), or <0 (fatal:
+// abandon all queued sub_ops). Releases op-owned resources on exit.
+int MetaTool::op_process(meta_op& op)
+{
+ int r = 0;
+ while (!op.no_sops()) {
+ if (_debug)
+ std::cout << "process : " << op.top_op()->detail() << std::endl;
+ switch(op.top_op()->sub_op_t) {
+ case meta_op::OP_LIST:
+ r = list_meta(op);
+ break;
+ case meta_op::OP_LTRACE:
+ r = file_meta(op);
+ break;
+ case meta_op::OP_SHOW:
+ r = show_meta(op);
+ break;
+ case meta_op::OP_AMEND:
+ r = amend_meta(op);
+ break;
+ case meta_op::OP_SHOW_FN:
+ r = show_fn(op);
+ break;
+ case meta_op::OP_AMEND_FN:
+ r = amend_fn(op);
+ break;
+ default:
+ // NOTE(review): message contains a typo ("unknow") but is a runtime
+ // string, left unchanged; `r` keeps its previous value here.
+ cerr << "unknow op" << std::endl;
+ }
+ if (r == 0)
+ op.pop_op();
+ else if (r < 0)
+ op.clear_sops();
+ }
+ op.release();
+ return r;
+}
+
+// OP_AMEND handler: if the target inode and its omap key were already
+// located (by a previous OP_LIST), perform the amend; otherwise push an
+// OP_LIST to locate them first and return 1 (retry after it completes).
+int MetaTool::amend_meta(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+ auto item = op.inodes.find(sop->ino);
+ auto item_k = op.okeys.find(sop->ino);
+ if (item != op.inodes.end() && item_k != op.okeys.end()) {
+ if (_amend_meta(item_k->second, *(item->second), op.infile(), op) < 0)
+ return -1;
+ } else {
+ if (op.inodes.empty()) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = 0;
+ nsop->ino_c = sop->ino;
+ op.push_op(nsop);
+ return 1;
+ } else {
+ // A lookup already ran but did not find this inode: give up.
+ return -1;
+ }
+ }
+ return 0;
+}
+
+// Re-serialize this dentry value in backing-store layout: snapid (_f),
+// item type (_t), then the bare-encoded InodeStore. Mirrors the layout
+// consumed by show_child/list_meta when reading dentries.
+void MetaTool::inode_meta_t::encode(::ceph::bufferlist& bl, uint64_t features)
+{
+ ::encode(_f, bl);
+ ::encode(_t, bl);
+ _i->encode_bare(bl, features);
+}
+// Parse the user's JSON file `fn`, overwrite the in-memory inode record,
+// then write it back into the parent dirfrag's omap under key `k`.
+// Requires --yes-i-really-really-mean-it (op.confirm_chg()) to proceed.
+int MetaTool::_amend_meta(string& k, inode_meta_t& inode_meta, const string& fn, meta_op& op)
+{
+ JSONParser parser;
+ if (!parser.parse(fn.c_str())) {
+ cout << "Error parsing create user response" << std::endl;
+ return -1;
+ }
+
+ try {
+ inode_meta.decode_json(&parser);
+ } catch (JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ return -1;
+ }
+
+ // Echo what is about to be written when unconfirmed or in debug mode.
+ if (!op.confirm_chg() || op.is_debug()) {
+ cout << "you will amend info of inode ==>: " << std::endl;
+ _show_meta(inode_meta, "");
+ }
+
+ if (!op.confirm_chg()) {
+ cout << "warning: this operation is irreversibl!!!\n"
+ << " You must confirm that all logs of mds have been flushed!!!\n"
+ << " if you want amend it, please add --yes-i-really-really-mean-it!!!"
+ << std::endl;
+ return -1;
+ }
+
+ // Locate the parent dirfrag object via the inode's first ancestor
+ // backpointer, then set the dentry key in its omap.
+ bufferlist bl;
+ inode_meta.encode(bl, features);
+ map<string, bufferlist> to_set;
+ to_set[k].swap(bl);
+ inode_backpointer_t bp;
+ if (!op.top_op()->get_ancestor(bp))
+ return -1;
+ frag_t frag;
+ auto item = op.inodes.find(bp.dirino);
+ if (item != op.inodes.end()) {
+ frag = item->second->get_meta()->pick_dirfrag(bp.dname);
+ }
+ string oid = obj_name(bp.dirino, frag);
+ int ret = io_meta.omap_set(oid, to_set);
+ to_set.clear();
+ return ret;
+}
+// OP_SHOW_FN handler: dump the fnode(s) of the target directory inode,
+// pushing an OP_LIST first if the inode has not been located yet.
+int MetaTool::show_fn(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+ auto item = op.inodes.find(sop->ino);
+ if (item != op.inodes.end()) {
+ if (_show_fn(*(item->second), op.outfile()) < 0)
+ return -1;
+ } else {
+ if (op.inodes.empty()) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = 0;
+ nsop->ino_c = sop->ino;
+ op.push_op(nsop);
+ return 1;
+ } else
+ return -1;
+ }
+ return 0;
+}
+// Read and JSON-dump the fnode header of every dirfrag leaf of the given
+// directory inode; output goes to file `fn` if non-empty, else stdout.
+// NOTE(review): the Formatter `f` is heap-allocated and never deleted on
+// any path -- looks like a leak; confirm and fix upstream.
+int MetaTool::_show_fn(inode_meta_t& inode_meta, const string& fn)
+{
+ std::list<frag_t> frags;
+ inode_meta.get_meta()->dirfragtree.get_leaves(frags);
+ std::stringstream ds;
+ std::string format = "json";
+ std::string oids;
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->open_object_section("fnodes");
+ for (const auto &frag : frags) {
+ bufferlist hbl;
+ string oid = obj_name(inode_meta.get_meta()->inode->ino, frag);
+ int ret = io_meta.omap_get_header(oid, &hbl);
+ if (ret < 0) {
+ std::cerr << __func__ << " : can't find oid("<< oid << ")" << std::endl;
+ return -1;
+ }
+ {
+ fnode_t got_fnode;
+ try {
+ auto p = hbl.cbegin();
+ ::decode(got_fnode, p);
+ } catch (const buffer::error &err) {
+ cerr << "corrupt fnode header in " << oid
+ << ": " << err.what() << std::endl;
+ return -1;
+ }
+ // Accumulate a comma-separated oid list for the summary field.
+ if (!oids.empty())
+ oids += ",";
+ oids += oid;
+ f->open_object_section(oid.c_str());
+ got_fnode.dump(f);
+ f->close_section();
+ }
+ }
+ f->dump_string("oids", oids.c_str());
+ f->close_section();
+ f->flush(ds);
+ if (fn != "") {
+ ofstream o;
+ o.open(fn);
+ if (o) {
+ o << ds.str();
+ o.close();
+ } else {
+ cout << "out to file (" << fn << ") failed" << std::endl;
+ cout << ds.str() << std::endl;
+ }
+ } else
+ std::cout << ds.str() << std::endl;
+ return 0;
+}
+// OP_AMEND_FN handler: thin wrapper over _amend_fn.
+int MetaTool::amend_fn(meta_op &op)
+{
+ if (_amend_fn(op.infile(), op.confirm_chg()) < 0)
+ return -1;
+ return 0;
+}
+// Parse a JSON file in the format written by _show_fn: an "oids" field
+// holding a comma-separated oid list, plus one fnode object per oid.
+// Each fnode is re-encoded and written back as that object's omap header.
+// Requires `confirm` (--yes-i-really-really-mean-it) to proceed.
+int MetaTool::_amend_fn(const string& fn, bool confirm)
+{
+ JSONParser parser;
+ if (!parser.parse(fn.c_str())) {
+ cout << "Error parsing create user response : " << fn << std::endl;
+ return -1;
+ }
+ if (!confirm) {
+ cout << "warning: this operation is irreversibl!!!\n"
+ << " You must confirm that all logs of mds have been flushed!!!\n"
+ << " if you want amend it, please add --yes-i-really-really-mean-it!!!"
+ << std::endl;
+ return -1;
+ }
+ try {
+ string tmp;
+ JSONDecoder::decode_json("oids", tmp, &parser, true);
+ // Manual split of the comma-separated oid list into `v`.
+ string::size_type pos1, pos2;
+ vector<string> v;
+ string c = ",";
+ pos2 = tmp.find(c);
+ pos1 = 0;
+ while (string::npos != pos2) {
+ v.push_back(tmp.substr(pos1, pos2-pos1));
+ pos1 = pos2 + c.size();
+ pos2 = tmp.find(c, pos1);
+ }
+ if (pos1 != tmp.length())
+ v.push_back(tmp.substr(pos1));
+ int ret = 0;
+ for (auto i : v) {
+ cout << "amend frag : " << i << "..." << std::endl;
+ fnode_t fnode;
+ JSONDecoder::decode_json(i.c_str(), fnode, &parser, true);
+ bufferlist bl;
+ fnode.encode(bl);
+ ret = io_meta.omap_set_header(i, bl);
+ if (ret < 0)
+ return ret;
+ }
+ } catch (JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ return -1;
+ }
+ return 0;
+}
+// OP_SHOW handler: dump the target inode's record, pushing an OP_LIST
+// first if the inode has not been located yet (same pattern as show_fn).
+int MetaTool::show_meta(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+ auto item = op.inodes.find(sop->ino);
+ if (item != op.inodes.end()) {
+ if (_show_meta(*(item->second), op.outfile()) < 0)
+ return -1;
+ } else {
+ if (op.inodes.empty()) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = 0;
+ nsop->ino_c = sop->ino;
+ op.push_op(nsop);
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ return 0;
+}
+// JSON-dump an inode record (snapid, item type, InodeStore, and the
+// decoded snap_blob if present) to file `fn` or stdout. This output is
+// what inode_meta_t::decode_json later re-reads for "amend".
+// NOTE(review): Formatter `f` is never deleted -- apparent leak; confirm.
+int MetaTool::_show_meta(inode_meta_t& inode_meta, const string& fn)
+{
+ std::stringstream ds;
+ std::string format = "json";
+ InodeStore& inode_data = *inode_meta.get_meta();
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->open_object_section("meta");
+ f->dump_unsigned("snapid_t", inode_meta.get_snapid());
+ f->dump_unsigned("itype", inode_meta.get_type());
+ f->open_object_section("store");
+ inode_data.dump(f);
+ try {
+ if (inode_data.snap_blob.length()) {
+ sr_t srnode;
+ auto p = inode_data.snap_blob.cbegin();
+ decode(srnode, p);
+ f->open_object_section("snap_blob");
+ srnode.dump(f);
+ f->close_section();
+ }
+ } catch (const buffer::error &err) {
+ cerr << "corrupt decode in snap_blob"
+ << ": " << err.what() << std::endl;
+ return -1;
+ }
+
+ f->close_section();
+ f->close_section();
+ f->flush(ds);
+
+ if (fn != "") {
+ ofstream o;
+ o.open(fn);
+ if (o) {
+ o << ds.str();
+ o.close();
+ } else {
+ cout << "out to file (" << fn << ") failed" << std::endl;
+ cout << ds.str() << std::endl;
+ }
+
+ } else
+ std::cout << ds.str() << std::endl;
+ return 0;
+}
+// OP_LIST handler. Two modes:
+//  - "listall" (sop->ino_c == 0): enumerate all dentries of dirfrag
+//    objects of sop->ino;
+//  - "info" (sop->ino_c != 0): walk a parent dirfrag (found via the
+//    child's ancestor backpointer) looking for the child's dentry.
+// Returns 0 when done, 1 when a prerequisite sub_op was pushed, -1 on error.
+int MetaTool::list_meta(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+
+ bool list_all = false;
+ string oid;
+ inodeno_t ino = sop->ino_c;
+ frag_t frag = sop->frag;
+
+ if (sop->ino_c == 0) {
+ list_all = true;
+ oid = obj_name(sop->ino, frag);
+ } else {
+ if (_debug)
+ std::cout << __func__ << " : " << sop->trace_level << " " << op.ancestors.size() << std::endl;
+ inode_backpointer_t bp;
+ if (sop->get_c_ancestor(bp)) {
+ auto item = op.inodes.find(bp.dirino);
+ if (item != op.inodes.end()) {
+ frag = item->second->get_meta()->pick_dirfrag(bp.dname);
+ }
+ oid = obj_name(bp.dirino, frag);
+ } else {
+ // No ancestor known yet: push an OP_LTRACE to fetch the backtrace.
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino = sop->ino_c;
+ nsop->sub_op_t = meta_op::OP_LTRACE;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ op.push_op(nsop);
+ return 1;
+ }
+ }
+ if (_debug)
+ std::cout << __func__ << " : " << string(list_all?"listall ":"info ") << oid << " "<< ino << std::endl;
+ bufferlist hbl;
+ int ret = io_meta.omap_get_header(oid, &hbl);
+ if (ret < 0) {
+ std::cerr << __func__ << " : can't find it, maybe it (ino:"<< sop->ino<< ")isn't a normal dir!" << std::endl;
+ return -1;
+ }
+
+ // An empty omap header means the dirfrag has been split: re-queue one
+ // OP_LIST per leaf frag (listall) or climb to the ancestor (info mode).
+ if (hbl.length() == 0) { // obj has splite
+ if (list_all) {
+ if (frag == frag_t()) {
+ auto item = op.inodes.find(sop->ino);
+ if (item != op.inodes.end()) {
+ inodeno_t tmp = sop->ino;
+ op.pop_op();
+ std::list<frag_t> frags;
+ item->second->get_meta()->dirfragtree.get_leaves(frags);
+ for (const auto &frag : frags) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino = tmp;
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->frag = frag;
+ op.push_op(nsop);
+ }
+ } else {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino_c = sop->ino;
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ op.push_op(nsop);
+ }
+ return 1;
+ } else {
+ cerr << __func__ << " missing some data (" << oid << ")???" << std::endl;
+ return -1;
+ }
+ } else {
+ if (frag == frag_t()) {
+ inode_backpointer_t bp;
+ if (sop->get_c_ancestor(bp)) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino_c = bp.dirino;
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = sop->trace_level + 1;
+ op.push_op(nsop);
+ return 1;
+ } else {
+ cerr << __func__ << "can't find obj(" << oid << ") ,miss ancestors or miss some objs??? " << std::endl;
+ return -1;
+ }
+ } else {
+ cerr << __func__ << "missing some objs(" << oid << ")??? " << std::endl;
+ return -1;
+ }
+ }
+ }
+
+ fnode_t got_fnode;
+ try {
+ auto p = hbl.cbegin();
+ ::decode(got_fnode, p);
+ } catch (const buffer::error &err) {
+ cerr << "corrupt fnode header in " << oid
+ << ": " << err.what() << std::endl;
+ return -1;
+ }
+
+ if (_debug) {
+ std::string format = "json";
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->dump_string("type", "--fnode--");
+ f->open_object_section("fnode");
+ got_fnode.dump(f);
+ f->close_section();
+ f->flush(std::cout);
+ std::cout << std::endl;
+ }
+
+ // print children
+ // Dentries are paged out of the dirfrag omap max_vals at a time.
+ std::map<string, bufferlist> out_vals;
+ int max_vals = 5;
+ io_meta.omap_get_vals(oid, "", max_vals, &out_vals);
+
+ bool force_dirty = false;
+ const set<snapid_t> *snaps = NULL;
+ // NOTE(review): pos underflows (wraps) when out_vals is empty; harmless
+ // today because the loop body then never runs, but fragile.
+ unsigned pos = out_vals.size() - 1;
+ std::string last_dname;
+ for (map<string, bufferlist>::iterator p = out_vals.begin();
+ p != out_vals.end();
+ ++p, --pos) {
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p->first, dname, last);
+ // NOTE(review): last_dname is the pagination cursor for the while-loop
+ // below, yet here it is only updated under _debug -- with >max_vals
+ // entries and !_debug the next omap_get_vals restarts from "". Looks
+ // like the guard is a bug; confirm before changing.
+ if (_debug)
+ last_dname = dname;
+ try {
+ if (!list_all) {
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty, ino, &op) == 1) {
+ return 0;
+ }
+ } else {
+ cout << "dname : " << dname << " " << last << std::endl;
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty) == 1)
+ return 0;
+ }
+ } catch (const buffer::error &err) {
+ derr << "Corrupt dentry '" << dname << "' : "
+ << err.what() << "(" << "" << ")" << dendl;
+ return -1;
+ }
+ }
+ // Keep paging while full pages come back.
+ while (out_vals.size() == (size_t)max_vals) {
+ out_vals.clear();
+ io_meta.omap_get_vals(oid, last_dname, max_vals, &out_vals);
+ pos = out_vals.size() - 1;
+ // NOTE(review): ++begin() skips the first returned entry, presumably
+ // assuming the page overlaps the cursor key -- verify against the
+ // omap_get_vals start_after semantics (exclusive start would make this
+ // drop an entry per page).
+ for (map<string, bufferlist>::iterator p = (++out_vals.begin());
+ p != out_vals.end();
+ ++p, --pos) {
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p->first, dname, last);
+ last_dname = dname;
+ try {
+ if (!list_all) {
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty, ino, &op) == 1) {
+ return 0;
+ }
+ } else {
+ cout << "dname : " << dname << " " << last << std::endl;
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty) == 1)
+ return 0;
+ }
+ } catch (const buffer::error &err) {
+ derr << "Corrupt dentry '" << dname << "' : "
+ << err.what() << "(" << "" << ")" << dendl;
+ return -1;
+ }
+ }
+ }
+
+ // In "info" mode, reaching here means the child dentry was not found.
+ if (!list_all) {
+ cerr << __func__ << "miss obj(ino:" << ino << ")??? " << std::endl;
+ return -1;
+ }
+ return 0;
+}
+
+int MetaTool::file_meta(meta_op &op)
+{
+ int r = 0;
+ if (op.top_op()->sub_ino_t == meta_op::INO_DIR) {
+ r = _file_meta(op, io_meta);
+ } else if (op.top_op()->sub_ino_t == meta_op::INO_F) {
+ for (auto i = io_data_v.begin(); i != io_data_v.end(); ++i)
+ if ((r = _file_meta(op, **i)) == 1)
+ break;
+ }
+ if (r == 1) {
+ inode_backpointer_t bp;
+ if (op.top_op()->get_ancestor(bp)) {
+ return 0;
+ } else {
+ std::cerr << "no trace for obj (ino:" << op.top_op()->ino <<")??" << std::endl;
+ return -1;
+ }
+ } else if (op.top_op()->sub_ino_t == meta_op::INO_DIR) {
+ std::cerr << "\tmaybe it's a file(ino:" << op.top_op()->ino << ")" << std::endl;
+ op.top_op()->sub_ino_t = meta_op::INO_F;
+ return 1;
+ }
+
+ std::cerr << "can't get (ino:" << op.top_op()->ino <<")trace??" << std::endl;
+ return -1;
+}
+
+int MetaTool::_file_meta(meta_op &op, librados::IoCtx& io)
+{
+ inodeno_t ino = op.top_op()->ino;
+ std::string oid = obj_name(ino);
+ bufferlist pointer_bl;
+ std::map<std::string, bufferlist> attrset;
+ int r = 0;
+ bool have_data = false;
+ r = io.getxattrs (oid.c_str(), attrset);
+ if (0 == r) {
+ std::stringstream ds;
+ std::string format = "json";
+ Formatter* f = Formatter::create(format);
+ auto item = attrset.find("parent");
+ if (item != attrset.end()) {
+ inode_backtrace_t i_bt;
+ try {
+ bufferlist::const_iterator q = item->second.cbegin();
+ i_bt.decode(q);
+ f->open_array_section("info");
+ have_data = true;
+ if (i_bt.ancestors.size() > 0)
+ op.ancestors[ino] = i_bt.ancestors[0];
+ f->dump_string("type", "--i_bt--");
+ f->open_object_section("parent");
+ i_bt.dump(f);
+ f->close_section();
+ } catch (buffer::error &e) {
+ cerr << "failed to decode parent of " << oid << std::endl;
+ return -1;
+ }
+ } else {
+ cerr << oid << " in " << io.get_pool_name() << " , but no parent" << std::endl;
+ return -1;
+ }
+
+ item = attrset.find("layout");
+ if (item != attrset.end()) {
+ file_layout_t layout;
+ try {
+ auto q = item->second.cbegin();
+ layout.decode(q);
+ f->dump_string("type", "--layout--");
+ f->open_object_section("layout");
+ layout.dump(f);
+ f->close_section();
+
+ } catch (buffer::error &e) {
+ cerr << "failed to decode layout of " << oid << std::endl;
+ return -1;
+ }
+ } else {
+ cerr << oid << " in " << io.get_pool_name() << " , but no layout" << std::endl;
+ }
+ if (have_data) {
+ f->close_section();
+ f->flush(ds);
+ if (_debug)
+ cout << ino << " : "<< ds.str() << std::endl;
+ return 1;
+ }
+ }
+ return 0;
+}
+std::string MetaTool::obj_name(inodeno_t ino, uint64_t offset, const char *suffix) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)offset, suffix ? suffix : "");
+ return std::string(name);
+}
+std::string MetaTool::obj_name(inodeno_t ino, frag_t fg, const char *suffix) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)fg, suffix ? suffix : "");
+ return std::string(name);
+}
+
+std::string MetaTool::obj_name(const char* ino, uint64_t offset, const char *suffix) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%s.%08llx%s", ino, (long long unsigned)offset, suffix ? suffix : "");
+ std::string out = name;
+ transform(out.begin(), out.end(), out.begin(),::tolower);
+ return out;
+}
+
+int MetaTool::show_child(std::string_view key,
+ std::string_view dname,
+ const snapid_t last,
+ bufferlist &bl,
+ const int pos,
+ const std::set<snapid_t> *snaps,
+ bool *force_dirty,
+ inodeno_t sp_ino,
+ meta_op* op)
+{
+ bufferlist::const_iterator q = bl.cbegin();
+
+ snapid_t first;
+ ::decode(first, q);
+
+ // marker
+ char type;
+ ::decode(type, q);
+
+ if (_debug)
+ std::cout << pos << " type '" << type << "' dname '" << dname
+ << " [" << first << "," << last << "]"
+ << std::endl;
+ // bool stale = false;
+ if (snaps && last != CEPH_NOSNAP) {
+ derr << "!!!! erro !!!!" << dendl;
+ return -1;
+ }
+
+ // CDentry *dn = NULL;
+ // look for existing dentry for _last_ snap, can't process snap of obj
+ //if *(stale)
+ // dn = lookup_exact_snap(dname, last);
+ //else
+ // dn = lookup(dname, last);
+ if (type == 'L' || type == 'l') {
+ // hard link
+ inodeno_t ino;
+ unsigned char d_type;
+ mempool::mds_co::string alternate_name;
+
+ CDentry::decode_remote(type, ino, d_type, alternate_name, q);
+
+ if (sp_ino > 0) {
+ if (sp_ino == ino) {
+ std::cout << "find hard link : " << ino << "," << d_type << std::endl;
+ return 1;
+ }
+ }
+
+ std::cout << "hard link : " << ino << "," << d_type << std::endl;
+ } else if (type == 'I' || type == 'i') {
+ // inode
+    // load inode data before looking up or constructing CInode
+ InodeStore& inode_data = *(new InodeStore);
+ if (type == 'i') {
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode_data.decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode_data.decode_bare(q);
+ }
+
+ std::stringstream ds;
+ std::string format = "json";
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->open_object_section("meta");
+ f->dump_unsigned("snapid_t", first);
+ f->dump_unsigned("itype", type);
+ f->open_object_section("store");
+ inode_data.dump(f);
+ try {
+ if (inode_data.snap_blob.length()) {
+ sr_t srnode;
+ auto p = inode_data.snap_blob.cbegin();
+ srnode.decode(p);
+ f->open_object_section("snap_blob");
+ srnode.dump(f);
+ f->close_section();
+ }
+ } catch (const buffer::error &err) {
+ cerr << "corrupt decode in snap_blob"
+ << ": " << err.what() << std::endl;
+ }
+ f->close_section();
+ f->close_section();
+ f->flush(ds);
+
+ if (sp_ino > 0 && op != NULL && sp_ino == inode_data.inode->ino) {
+ inode_meta_t* tmp = new inode_meta_t(first, type, &inode_data);
+ op->inodes[inode_data.inode->ino] = tmp;
+ op->okeys[inode_data.inode->ino] = key.data();
+ return 1;
+ } else {
+ delete &inode_data;
+ }
+
+ if (sp_ino == 0) {
+ cout << ds.str() << std::endl;
+ }
+ } else {
+ std::cerr << __func__ << "unknow type : " << dname << "," << type << std::endl;
+ }
+ return 0;
+}
diff --git a/src/tools/cephfs/MetaTool.h b/src/tools/cephfs/MetaTool.h
new file mode 100644
index 000000000..510be6552
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef METATOOL_H__
+#define METATOOL_H__
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+#include <stack>
+using std::stack;
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_json.h"
+
+using ::ceph::bufferlist;
+class MetaTool : public MDSUtility
+{
+public:
+ class inode_meta_t {
+ public:
+ inode_meta_t(snapid_t f = CEPH_NOSNAP, char t = char(255), InodeStore* i = NULL):
+ _f(f),_t(t),_i(i) {
+ };
+ snapid_t get_snapid() const {
+ return _f;
+ }
+ InodeStore* get_meta() const {
+ if (_t == 'I')
+ return _i;
+ else
+ return NULL;
+ }
+ int get_type() const {
+ return _t;
+ }
+ void decode_json(JSONObj *obj);
+ void encode(::ceph::bufferlist& bl, uint64_t features);
+ private:
+ snapid_t _f;
+ char _t;
+ InodeStore* _i;
+ };
+private:
+ class meta_op {
+ public:
+ meta_op(bool debug = false, string out = "", string in = "", bool confirm = false):
+ _debug(debug),
+ _out(out),
+ _in(in),
+ _confirm(confirm)
+ {}
+ void release();
+ typedef enum {
+ OP_LIST = 0,
+ OP_LTRACE,
+ OP_SHOW,
+ OP_AMEND,
+ OP_SHOW_FN,
+ OP_AMEND_FN,
+ OP_NO
+ } op_type;
+
+ typedef enum {
+ INO_DIR = 0,
+ INO_F
+ } ino_type;
+
+ static string op_type_name(op_type& t) {
+ string name;
+ switch (t) {
+ case OP_LIST:
+ name = "list dir";
+ break;
+ case OP_LTRACE:
+ name = "load trace";
+ break;
+ case OP_SHOW:
+ name = "show info";
+ break;
+ case OP_AMEND:
+ name = "amend info";
+ break;
+ case OP_SHOW_FN:
+ name = "show fnode";
+ break;
+ case OP_AMEND_FN:
+ name = "amend fnode";
+ break;
+ case OP_NO:
+ name = "noop";
+ break;
+ default:
+ name = "unknow op type";
+ }
+ return name;
+ }
+ static string ino_type_name(ino_type& t) {
+ string name;
+ switch (t) {
+ case INO_DIR:
+ name = "dir";
+ break;
+ case INO_F:
+ name = "file";
+ break;
+ default:
+ name = "unknow file type";
+ }
+ return name;
+ }
+ class sub_op {
+ public:
+ sub_op(meta_op* mop):
+ trace_level(0),
+ _proc(false),
+ _mop(mop)
+ {}
+ void print() {
+ std::cout << detail() << std::endl;
+ }
+ string detail() {
+ std::stringstream ds;
+ ds << " [sub_op]" << op_type_name(sub_op_t) << "|"
+ << ino_type_name(sub_ino_t) << "|"
+ << ino << "|"
+ << frag << "|"
+ << ino_c << "|"
+ << trace_level << "|"
+ << name;
+ return ds.str();
+ }
+ bool get_c_ancestor(inode_backpointer_t& bp) {
+ if (!_mop || !ino_c)
+ return false;
+ auto item = _mop->ancestors.find(ino_c);
+ if (item != _mop->ancestors.end()) {
+ bp = item->second;
+ return true;
+ } else
+ return false;
+ }
+ bool get_ancestor(inode_backpointer_t& bp) {
+ if (!_mop || !ino)
+ return false;
+ auto item = _mop->ancestors.find(ino);
+ if (item != _mop->ancestors.end()) {
+ bp = item->second;
+ return true;
+ } else
+ return false;
+ }
+ op_type sub_op_t;
+ ino_type sub_ino_t;
+ inodeno_t ino;
+ frag_t frag;
+ inodeno_t ino_c;
+ unsigned trace_level;
+ std::string name;
+ bool _proc;
+ meta_op* _mop;
+ };
+
+ std::map<inodeno_t, inode_backpointer_t > ancestors;
+ std::map<inodeno_t, inode_meta_t* > inodes;
+ std::map<inodeno_t, string > okeys;
+
+ void clear_sops() {
+ while(!no_sops())
+ pop_op();
+ }
+ bool no_sops() {
+ return sub_ops.empty();
+ }
+ void push_op(sub_op* sop) {
+ if (_debug)
+ std::cout << "<<====" << sop->detail() << std::endl;
+ sub_ops.push(sop);
+ }
+ sub_op* top_op() {
+ return sub_ops.top();
+ }
+ void pop_op() {
+ sub_op* sop = sub_ops.top();
+ if (_debug)
+ std::cout << "====>>" << sop->detail() << std::endl;;
+ delete sop;
+ sub_ops.pop();
+ }
+ string outfile() {
+ return _out;
+ }
+ string infile() {
+ return _in;
+ }
+ bool is_debug() {
+ return _debug;
+ }
+ bool confirm_chg() {
+ return _confirm;
+ }
+ private:
+ stack<sub_op*> sub_ops;
+ bool _debug;
+ string _out;
+ string _in;
+ bool _confirm;
+ };
+ MDSRoleSelector role_selector;
+ mds_rank_t rank;
+
+ // I/O handles
+ librados::Rados rados;
+ librados::IoCtx io_meta;
+ std::vector<librados::IoCtx*> io_data_v;
+ librados::IoCtx output;
+ bool _debug;
+ uint64_t features;
+
+ std::string obj_name(inodeno_t ino, frag_t fg = frag_t(), const char *suffix = NULL) const;
+ std::string obj_name(inodeno_t ino, uint64_t offset, const char *suffix = NULL) const;
+ std::string obj_name(const char* ino, uint64_t offset, const char *suffix = NULL) const;
+
+  // return 0 : not the target; continue searching
+  // return 1 : target found; stop searching
+ int show_child(std::string_view key,
+ std::string_view dname,
+ const snapid_t last,
+ bufferlist &bl,
+ const int pos,
+ const std::set<snapid_t> *snaps,
+ bool *force_dirty,
+ inodeno_t sp_ino = 0,
+ meta_op* op = NULL
+ );
+
+ int process(string& mode, string& ino, string out, string in, bool confirm);
+ int show_meta_info(string& ino, string& out);
+ int list_meta_info(string& ino, string& out);
+ int amend_meta_info(string& ino, string& in, bool confirm);
+ int show_fnode(string& ino, string& out);
+ int amend_fnode(string& in, bool confirm);
+ int op_process(meta_op &op);
+ int list_meta(meta_op &op);
+ int file_meta(meta_op &op);
+ int show_meta(meta_op &op);
+ int amend_meta(meta_op &op);
+ int show_fn(meta_op &op);
+ int amend_fn(meta_op &op);
+ public:
+ int _file_meta(meta_op &op, librados::IoCtx& io);
+ int _show_meta(inode_meta_t& i, const string& fn);
+ int _amend_meta(string &k, inode_meta_t& i, const string& fn, meta_op& op);
+ int _show_fn(inode_meta_t& i, const string& fn);
+ int _amend_fn(const string& fn, bool confirm);
+ void usage();
+ MetaTool(bool debug=false):
+ _debug(debug) {}
+ ~MetaTool() {}
+
+ int main(string& mode,
+ string& rank_str,
+ string& minfo,
+ string&ino,
+ string& out,
+ string& in,
+ bool confirm = false
+ );
+};
+#endif // METATOOL_H__
diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc
new file mode 100644
index 000000000..2abca7223
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.cc
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include "PgFiles.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "pgeffects." << __func__ << ": "
+
+int PgFiles::init()
+{
+ int r = ceph_create_with_context(&cmount, g_ceph_context);
+ if (r != 0) {
+ return r;
+ }
+
+ return ceph_init(cmount);
+}
+
+PgFiles::PgFiles(Objecter *o, const std::set<pg_t> &pgs_)
+ : objecter(o), pgs(pgs_)
+{
+ for (const auto &i : pgs) {
+ pools.insert(i.m_pool);
+ }
+}
+
+PgFiles::~PgFiles()
+{
+ ceph_release(cmount);
+}
+
+void PgFiles::hit_dir(std::string const &path)
+{
+ dout(10) << "entering " << path << dendl;
+
+ ceph_dir_result *dr = nullptr;
+ int r = ceph_opendir(cmount, path.c_str(), &dr);
+ if (r != 0) {
+ derr << "Failed to open path: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ struct dirent de;
+ while((r = ceph_readdir_r(cmount, dr, &de)) != 0) {
+ if (r < 0) {
+ derr << "Error reading path " << path << ": " << cpp_strerror(r)
+ << dendl;
+ ceph_closedir(cmount, dr); // best effort, ignore r
+ return;
+ }
+
+ if (std::string(de.d_name) == "." || std::string(de.d_name) == "..") {
+ continue;
+ }
+
+ struct ceph_statx stx;
+ std::string de_path = (path + std::string("/") + de.d_name);
+ r = ceph_statx(cmount, de_path.c_str(), &stx,
+ CEPH_STATX_INO|CEPH_STATX_SIZE, 0);
+ if (r != 0) {
+ derr << "Failed to stat path " << de_path << ": "
+ << cpp_strerror(r) << dendl;
+ // Don't hold up the whole process for one bad inode
+ continue;
+ }
+
+ if (S_ISREG(stx.stx_mode)) {
+ hit_file(de_path, stx);
+ } else if (S_ISDIR(stx.stx_mode)) {
+ hit_dir(de_path);
+ } else {
+ dout(20) << "Skipping non reg/dir file: " << de_path << dendl;
+ }
+ }
+
+ r = ceph_closedir(cmount, dr);
+ if (r != 0) {
+ derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl;
+ return;
+ }
+}
+
+void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx)
+{
+ ceph_assert(S_ISREG(stx.stx_mode));
+
+ dout(20) << "Hitting file '" << path << "'" << dendl;
+
+ int l_stripe_unit = 0;
+ int l_stripe_count = 0;
+ int l_object_size = 0;
+ int l_pool_id = 0;
+ int r = ceph_get_path_layout(cmount, path.c_str(), &l_stripe_unit,
+ &l_stripe_count, &l_object_size,
+ &l_pool_id);
+ if (r != 0) {
+ derr << "Error reading layout on " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+
+ struct file_layout_t layout;
+ layout.stripe_unit = l_stripe_unit;
+ layout.stripe_count = l_stripe_count;
+ layout.object_size = l_object_size;
+ layout.pool_id = l_pool_id;
+
+ // Avoid calculating PG if the layout targeted a completely different pool
+ if (pools.count(layout.pool_id) == 0) {
+ dout(20) << "Fast check missed: pool " << layout.pool_id << " not in "
+ "target set" << dendl;
+ return;
+ }
+
+ auto num_objects = Striper::get_num_objects(layout, stx.stx_size);
+
+ for (uint64_t i = 0; i < num_objects; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)stx.stx_ino,
+ (long long unsigned int)i);
+ dout(20) << " object " << std::string(buf) << dendl;
+
+ pg_t target;
+ object_t oid;
+ object_locator_t loc;
+ loc.pool = layout.pool_id;
+ loc.key = std::string(buf);
+
+ unsigned pg_num_mask = 0;
+ unsigned pg_num = 0;
+
+ int r = 0;
+ objecter->with_osdmap([&r, oid, loc, &target, &pg_num_mask, &pg_num]
+ (const OSDMap &osd_map) {
+ r = osd_map.object_locator_to_pg(oid, loc, target);
+ if (r == 0) {
+ auto pool = osd_map.get_pg_pool(loc.pool);
+ pg_num_mask = pool->get_pg_num_mask();
+ pg_num = pool->get_pg_num();
+ }
+ });
+ if (r != 0) {
+ // Can happen if layout pointed to pool not in osdmap, for example
+ continue;
+ }
+
+ target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask);
+
+ dout(20) << " target " << target << dendl;
+
+ if (pgs.count(target)) {
+ std::cout << path << std::endl;
+ return;
+ }
+ }
+
+}
+
+int PgFiles::scan_path(std::string const &path)
+{
+ int r = ceph_mount(cmount, "/");
+ if (r != 0) {
+ derr << "Failed to mount: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ hit_dir(path);
+
+ r = ceph_unmount(cmount);
+ if (r != 0) {
+ derr << "Failed to unmount: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+}
+
diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h
new file mode 100644
index 000000000..1ba4b3d28
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PG_EFFECTS_H_
+#define PG_EFFECTS_H_
+
+#include "include/cephfs/libcephfs.h"
+#include "osd/osd_types.h"
+#include <set>
+#include "osdc/Objecter.h"
+
+/**
+ * This utility scans the files (via an online MDS) and works out
+ * which ones rely on named PGs. For use when someone has
+ * some bad/damaged PGs and wants to see which files might be
+ * affected.
+ */
+class PgFiles
+{
+private:
+ Objecter *objecter;
+ struct ceph_mount_info *cmount = nullptr;
+
+ std::set<pg_t> pgs;
+ std::set<uint64_t> pools;
+
+ void hit_file(std::string const &path, const struct ceph_statx &stx);
+ void hit_dir(std::string const &path);
+
+
+public:
+ PgFiles(Objecter *o, const std::set<pg_t> &pgs_);
+ ~PgFiles();
+
+ int init();
+ int scan_path(std::string const &path);
+};
+
+#endif
+
diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc
new file mode 100644
index 000000000..278a48767
--- /dev/null
+++ b/src/tools/cephfs/Resetter.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <memory>
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/JournalPointer.h"
+
+#include "mds/mdstypes.h"
+#include "mds/MDCache.h"
+#include "mon/MonClient.h"
+#include "mds/events/EResetJournal.h"
+
+#include "Resetter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+int Resetter::init(mds_role_t role_, const std::string &type, bool hard)
+{
+ role = role_;
+ int r = MDSUtility::init();
+ if (r < 0) {
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(nullptr != fs);
+
+ is_mdlog = false;
+ if (type == "mdlog") {
+ JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+ int rt = 0;
+ if (hard) {
+ jp.front = role.rank + MDS_INO_LOG_OFFSET;
+ jp.back = 0;
+ rt = jp.save(objecter);
+ if (rt != 0) {
+ derr << "Error writing journal pointer: " << cpp_strerror(rt) << dendl;
+ return rt;
+ }
+ ino = jp.front; // only need to reset ino for mdlog
+ } else {
+ rt = jp.load(objecter);
+ if (rt != 0) {
+ std::cerr << "Error loading journal: " << cpp_strerror(rt) <<
+ ", pass --force to forcibly reset this journal" << std::endl;
+ return rt;
+ } else {
+ ino = jp.front;
+ }
+ }
+ is_mdlog = true;
+ } else if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + role.rank;
+ } else {
+ ceph_abort(); // should not get here
+ }
+ return 0;
+}
+
+int Resetter::reset()
+{
+ ceph::mutex mylock = ceph::make_mutex("Resetter::reset::lock");
+ ceph::condition_variable cond;
+ bool done;
+ int r;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ Journaler journaler("resetter", ino,
+ fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC,
+ objecter, 0, 0, &finisher);
+ {
+ std::lock_guard locker{lock};
+ journaler.recover(new C_SafeCond(mylock, cond, &done, &r));
+ }
+ {
+ std::unique_lock locker{mylock};
+ cond.wait(locker, [&done] { return done; });
+ }
+ if (r != 0) {
+ if (r == -ENOENT) {
+ cerr << "journal does not exist on-disk. Did you set a bad rank?"
+ << std::endl;
+ std::cerr << "Error loading journal: " << cpp_strerror(r) <<
+ ", pass --force to forcibly reset this journal" << std::endl;
+ return r;
+ } else {
+ cerr << "got error " << r << "from Journaler, failing" << std::endl;
+ return r;
+ }
+ }
+
+ lock.lock();
+ uint64_t old_start = journaler.get_read_pos();
+ uint64_t old_end = journaler.get_write_pos();
+ uint64_t old_len = old_end - old_start;
+ cout << "old journal was " << old_start << "~" << old_len << std::endl;
+
+ uint64_t new_start = round_up_to(old_end+1, journaler.get_layout_period());
+ cout << "new journal start will be " << new_start
+ << " (" << (new_start - old_end) << " bytes past old end)" << std::endl;
+
+ journaler.set_read_pos(new_start);
+ journaler.set_write_pos(new_start);
+ journaler.set_expire_pos(new_start);
+ journaler.set_trimmed_pos(new_start);
+ journaler.set_writeable();
+
+ cout << "writing journal head" << std::endl;
+ journaler.write_head(new C_SafeCond(mylock, cond, &done, &r));
+ lock.unlock();
+ {
+ std::unique_lock locker{mylock};
+ cond.wait(locker, [&done] { return done; });
+ }
+ std::lock_guard l{lock};
+ if (r != 0) {
+ return r;
+ }
+
+ if (is_mdlog) {
+    r = _write_reset_event(&journaler); // reset event is specific for mdlog journal
+ if (r != 0) {
+ return r;
+ }
+ }
+ cout << "done" << std::endl;
+
+ return 0;
+}
+
+int Resetter::reset_hard()
+{
+ auto fs = fsmap->get_filesystem(role.fscid);
+
+ Journaler journaler("resetter", ino,
+ fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC,
+ objecter, 0, 0, &finisher);
+ journaler.set_writeable();
+
+ file_layout_t default_log_layout = MDCache::gen_default_log_layout(
+ fsmap->get_filesystem(role.fscid)->mds_map);
+ journaler.create(&default_log_layout, g_conf()->mds_journal_format);
+
+ C_SaferCond cond;
+ {
+ std::lock_guard l{lock};
+ journaler.write_head(&cond);
+ }
+
+ int r = cond.wait();
+ if (r != 0) {
+ derr << "Error writing journal header: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (is_mdlog) // reset event is specific for mdlog journal
+ {
+ std::lock_guard l{lock};
+ r = _write_reset_event(&journaler);
+ if (r != 0) {
+ derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (is_mdlog) {
+ dout(4) << "Successfully wrote new journal pointer and header for rank "
+ << role << dendl;
+ } else {
+ dout(4) << "Successfully wrote header for rank " << role << dendl;
+ }
+ return 0;
+}
+
+int Resetter::_write_reset_event(Journaler *journaler)
+{
+ ceph_assert(journaler != NULL);
+
+ auto le = std::make_unique<EResetJournal>();
+
+ bufferlist bl;
+ le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ cout << "writing EResetJournal entry" << std::endl;
+ journaler->append_entry(bl);
+
+ int ret;
+ {
+ C_SaferCond cond;
+ journaler->flush(&cond);
+ ret = cond.wait();
+ if (ret < 0)
+ return ret;
+ }
+ {
+ // wait until all journal prezero ops are done
+ C_SaferCond cond;
+ journaler->wait_for_prezero(&cond);
+ cond.wait();
+ }
+
+ return ret;
+}
+
diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h
new file mode 100644
index 000000000..6998e4598
--- /dev/null
+++ b/src/tools/cephfs/Resetter.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef JOURNAL_RESETTER_H_
+#define JOURNAL_RESETTER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you reset an mds journal for troubleshooting or whatever.
+ *
+ * To use, create a Resetter, call init(), and then call reset() with the name
+ * of the file to dump to.
+ */
+class Resetter : public MDSUtility {
+private:
+ mds_role_t role;
+ inodeno_t ino;
+ bool is_mdlog;
+
+protected:
+ int _write_reset_event(Journaler *journaler);
+
+public:
+ Resetter() {}
+ ~Resetter() {}
+
+ int init(mds_role_t role_, const std::string &type, bool hard);
+ /**
+ * For use when no journal header/pointer was present: write one
+ * out from scratch.
+ */
+ int reset_hard();
+ int reset();
+};
+
+#endif /* JOURNAL_RESETTER_H_ */
diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc
new file mode 100644
index 000000000..e2d53b86e
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.cc
@@ -0,0 +1,59 @@
+
+#include "RoleSelector.h"
+
+int MDSRoleSelector::parse_rank(
+ const FSMap &fsmap,
+ std::string const &str)
+{
+ if (str == "all" || str == "*") {
+ std::set<mds_rank_t> in;
+ const MDSMap &mds_map = fsmap.get_filesystem(fscid)->mds_map;
+ mds_map.get_mds_set(in);
+
+ for (auto rank : in) {
+ roles.push_back(mds_role_t(fscid, rank));
+ }
+
+ return 0;
+ } else {
+ std::string rank_err;
+ mds_rank_t rank = strict_strtol(str.c_str(), 10, &rank_err);
+ if (!rank_err.empty()) {
+ return -EINVAL;
+ }
+ if (fsmap.get_filesystem(fscid)->mds_map.is_dne(rank)) {
+ return -ENOENT;
+ }
+ roles.push_back(mds_role_t(fscid, rank));
+ return 0;
+ }
+}
+
+int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank)
+{
+ auto colon_pos = str.find(":");
+ if (colon_pos == std::string::npos) {
+ // An unqualified rank. Only valid if there is only one
+ // namespace.
+ if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) {
+ fscid = fsmap.get_filesystem()->fscid;
+ return parse_rank(fsmap, str);
+ } else {
+ return -EINVAL;
+ }
+ } else if (colon_pos == 0 || colon_pos == str.size() - 1) {
+ return -EINVAL;
+ } else {
+ const std::string ns_str = str.substr(0, colon_pos);
+ const std::string rank_str = str.substr(colon_pos + 1);
+ std::shared_ptr<const Filesystem> fs_ptr;
+ int r = fsmap.parse_filesystem(ns_str, &fs_ptr);
+ if (r != 0) {
+ return r;
+ }
+ fscid = fs_ptr->fscid;
+ return parse_rank(fsmap, rank_str);
+ }
+}
+
diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h
new file mode 100644
index 000000000..9090b7200
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.h
@@ -0,0 +1,36 @@
+
+#ifndef ROLE_SELECTOR_H_
+#define ROLE_SELECTOR_H_
+
+#include <string>
+#include <vector>
+#include "mds/mdstypes.h"
+#include "mds/FSMap.h"
+
+/**
+ * When you want to let the user act on a single rank in a namespace,
+ * or all of them.
+ */
+class MDSRoleSelector
+{
+ public:
+ const std::vector<mds_role_t> &get_roles() const {return roles;}
+ int parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank=true);
+ MDSRoleSelector()
+ : fscid(FS_CLUSTER_ID_NONE)
+ {}
+ fs_cluster_id_t get_ns() const
+ {
+ return fscid;
+ }
+ protected:
+ int parse_rank(
+ const FSMap &fsmap,
+ std::string const &str);
+ std::vector<mds_role_t> roles;
+ fs_cluster_id_t fscid;
+};
+
+#endif // ROLE_SELECTOR_H_
+
diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc
new file mode 100644
index 000000000..e779b4b66
--- /dev/null
+++ b/src/tools/cephfs/TableTool.cc
@@ -0,0 +1,417 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "mds/SessionMap.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+
+#include "TableTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+// Print command-line usage for cephfs-table-tool, followed by the
+// generic ceph client options.
+void TableTool::usage()
+{
+  // Each invocation form gets its own line; previously the first literal
+  // lacked a trailing '\n' so both forms printed concatenated on one line.
+  std::cout << "Usage: \n"
+    << "  cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>\n"
+    << "  cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>"
+    << std::endl;
+
+  generic_client_usage();
+}
+
+
+/**
+ * For a function that takes an MDS role as an argument and
+ * returns an error code, execute it on the roles specified
+ * by `role_selector`.
+ *
+ * Writes one section per rank into `f`, each holding a "data" section
+ * (whatever `fptr` dumped) plus a "result" integer.  Returns the first
+ * non-zero status seen, or 0 if every role succeeded.
+ */
+int TableTool::apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f)
+{
+  ceph_assert(f != NULL);
+
+  int r = 0;
+
+  f->open_object_section("ranks");
+
+  for (auto role : role_selector.get_roles()) {
+    std::ostringstream rank_str;
+    rank_str << role.rank;
+    f->open_object_section(rank_str.str().c_str());
+
+    f->open_object_section("data");
+    int rank_r = fptr(role, f);
+    f->close_section();
+    // Remember the first failure, but keep iterating so every remaining
+    // rank still gets its "result" entry in the output.
+    r = r ? r : rank_r;
+
+    f->dump_int("result", rank_r);
+    f->close_section();
+
+
+  }
+
+  f->close_section();
+
+  return r;
+}
+
+
+/**
+ * This class wraps an MDS table class (SessionMap, SnapServer, InoTable)
+ * with offline load/store code such that we can do offline dumps and resets
+ * on those tables.
+ */
+template <typename A>
+class TableHandler
+{
+protected:
+  // The RADOS object ID for the table
+  std::string object_name;
+
+  // The role in question (may be NONE)
+  mds_role_t role;
+
+  // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+  bool mds_table;
+
+public:
+  // Builds the RADOS object name "mds<rank>_<name>", or "mds_<name>"
+  // when the role is NONE (used for the global snap table).
+  TableHandler(mds_role_t r, std::string const &name, bool mds_table_)
+    : role(r), mds_table(mds_table_)
+  {
+    // Compose object name of the table we will dump
+    std::ostringstream oss;
+    oss << "mds";
+    if (!role.is_none()) {
+      oss << role.rank;
+    }
+    oss << "_" << name;
+    object_name = oss.str();
+  }
+
+  // Read the table object from `io`, decode it, and dump it into `f`.
+  // Returns 0 on success, -EIO if decoding throws, or the read error.
+  int load_and_dump(librados::IoCtx *io, Formatter *f)
+  {
+    ceph_assert(io != NULL);
+    ceph_assert(f != NULL);
+
+    // Attempt read (librados treats len==0 as "read the whole object")
+    bufferlist table_bl;
+    int read_r = io->read(object_name, table_bl, 0, 0);
+    if (read_r >= 0) {
+      auto q = table_bl.cbegin();
+      try {
+        if (mds_table) {
+          // MDSTable-derived tables carry a leading version_t.
+          version_t version;
+          decode(version, q);
+          f->dump_int("version", version);
+        }
+        A table_inst;
+        table_inst.set_rank(role.rank);
+        table_inst.decode(q);
+        table_inst.dump(f);
+
+        return 0;
+      } catch (buffer::error &e) {
+        derr << "table " << object_name << " is corrupt" << dendl;
+        return -EIO;
+      }
+    } else {
+      derr << "error reading table object " << object_name
+        << ": " << cpp_strerror(read_r) << dendl;
+      return read_r;
+    }
+  }
+
+  // Replace the stored table with a freshly initialised (blank) one.
+  int reset(librados::IoCtx *io)
+  {
+    A table_inst;
+    // Compose new (blank) table
+    table_inst.set_rank(role.rank);
+    table_inst.reset_state();
+    // Write the table out
+    return write(table_inst, io);
+  }
+
+protected:
+
+  // Encode `table_inst` (prefixed with version 1 when mds_table) and
+  // write_full() it to the table object.  Returns the write status.
+  int write(const A &table_inst, librados::IoCtx *io)
+  {
+    bufferlist new_bl;
+    if (mds_table) {
+      version_t version = 1;
+      encode(version, new_bl);
+    }
+    table_inst.encode_state(new_bl);
+
+    // Write out new table
+    int r = io->write_full(object_name, new_bl);
+    if (r != 0) {
+      derr << "error writing table object " << object_name
+        << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    return r;
+  }
+};
+
+// Like TableHandler, but for tables stored as an omap header plus omap
+// key/value pairs (i.e. the session map) rather than a single blob.
+template <typename A>
+class TableHandlerOmap
+{
+private:
+  // The RADOS object ID for the table
+  std::string object_name;
+
+  // The role (rank may be NONE)
+  mds_role_t role;
+
+  // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+  bool mds_table;
+
+public:
+  TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_)
+    : role(r), mds_table(mds_table_)
+  {
+    // Compose object name of the table we will dump
+    std::ostringstream oss;
+    oss << "mds";
+    if (!role.is_none()) {
+      oss << role.rank;
+    }
+    oss << "_" << name;
+    object_name = oss.str();
+  }
+
+  // Decode the omap header, then stream the omap values in chunks of
+  // mds_sessionmap_keys_per_op, and finally dump the table into `f`.
+  // Returns 0 on success, -EIO on decode failure, or the RADOS error.
+  int load_and_dump(librados::IoCtx *io, Formatter *f)
+  {
+    ceph_assert(io != NULL);
+    ceph_assert(f != NULL);
+
+    // Read in the header
+    bufferlist header_bl;
+    int r = io->omap_get_header(object_name, &header_bl);
+    if (r != 0) {
+      derr << "error reading header on '" << object_name << "': "
+        << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    // Decode the header
+    A table_inst;
+    table_inst.set_rank(role.rank);
+    try {
+      table_inst.decode_header(header_bl);
+    } catch (buffer::error &e) {
+      derr << "table " << object_name << " is corrupt" << dendl;
+      return -EIO;
+    }
+
+    // Read and decode OMAP values in chunks
+    std::string last_key = "";
+    while(true) {
+      std::map<std::string, bufferlist> values;
+      int r = io->omap_get_vals(object_name, last_key,
+          g_conf()->mds_sessionmap_keys_per_op, &values);
+
+      if (r != 0) {
+        derr << "error reading values: " << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      if (values.empty()) {
+        break;
+      }
+
+      try {
+        table_inst.decode_values(values);
+      } catch (buffer::error &e) {
+        derr << "table " << object_name << " is corrupt" << dendl;
+        return -EIO;
+      }
+      // Resume the next omap_get_vals call after the last key we saw.
+      last_key = values.rbegin()->first;
+    }
+
+    table_inst.dump(f);
+
+    return 0;
+  }
+
+  // Atomically clear all omap keys and install a blank table header.
+  int reset(librados::IoCtx *io)
+  {
+    A table_inst;
+    table_inst.set_rank(role.rank);
+    table_inst.reset_state();
+    bufferlist header_bl;
+    table_inst.encode_header(&header_bl);
+
+    // Compose a transaction to clear and write header
+    librados::ObjectWriteOperation op;
+    op.omap_clear();
+    // FAILOK: tolerate the clear failing -- presumably so reset still
+    // succeeds when the object does not exist yet (TODO confirm).
+    op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
+    op.omap_set_header(header_bl);
+
+    return io->operate(object_name, &op);
+  }
+};
+
+// TableHandler specialisation adding the "take_inos" operation for the
+// per-rank inode table.
+class InoTableHandler : public TableHandler<InoTable>
+{
+  public:
+  explicit InoTableHandler(mds_role_t r)
+    : TableHandler(r, "inotable", true)
+  {}
+
+  // Build a blank table, consume inode numbers up to `max` (presumably
+  // marking them as used -- force_consume_to is defined in InoTable),
+  // and persist only if that changed the table.  Dumps the resulting
+  // version and state into `f`; returns the write status (0 if no
+  // write was needed).
+  int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f)
+  {
+    InoTable inst;
+    inst.set_rank(role.rank);
+    inst.reset_state();
+
+    int r = 0;
+    if (inst.force_consume_to(max)) {
+      r = write(inst, io);
+    }
+
+    f->dump_int("version", inst.get_version());
+    inst.dump(f);
+
+    return r;
+  }
+};
+
+
+// Entry point after MDSUtility::init(): connect to RADOS, resolve the
+// metadata pool for the selected filesystem, then dispatch on
+// <role> <mode> <table|max_ino>.  Results are emitted as pretty JSON.
+int TableTool::main(std::vector<const char*> &argv)
+{
+  int r;
+
+  dout(10) << __func__ << dendl;
+
+  // RADOS init
+  // ==========
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+    return r;
+  }
+
+  dout(4) << "connecting to RADOS..." << dendl;
+  r = rados.connect();
+  if (r < 0) {
+    derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // Require at least 3 args <rank> <mode> <arg> [args...]
+  if (argv.size() < 3) {
+    cerr << "missing required 3 arguments" << std::endl;
+    return -EINVAL;
+  }
+
+  const std::string role_str = std::string(argv[0]);
+  const std::string mode = std::string(argv[1]);
+  // For reset/show this names a table; for take_inos it is the max ino.
+  const std::string table = std::string(argv[2]);
+
+  r = role_selector.parse(*fsmap, role_str);
+  if (r < 0) {
+    // Quote the user's input on both sides (the opening ' was missing).
+    derr << "Bad rank selection: '" << role_str << "'" << dendl;
+    return r;
+  }
+
+  // Resolve the metadata pool of the selected filesystem to a pool name.
+  auto fs = fsmap->get_filesystem(role_selector.get_ns());
+  ceph_assert(fs != nullptr);
+  int64_t const pool_id = fs->mds_map.get_metadata_pool();
+  dout(4) << "resolving pool " << pool_id << dendl;
+  std::string pool_name;
+  r = rados.pool_reverse_lookup(pool_id, &pool_name);
+  if (r < 0) {
+    derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!"
+      << dendl;
+    return r;
+  }
+
+  dout(4) << "creating IoCtx.." << dendl;
+  r = rados.ioctx_create(pool_name.c_str(), io);
+  if (r != 0) {
+    return r;
+  }
+
+  JSONFormatter jf(true);
+  if (mode == "reset") {
+    // (previously re-declared `table` here, shadowing the outer copy)
+    if (table == "session") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
+      }, &jf);
+    } else if (table == "inode") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandler<InoTable>(rank, "inotable", true).reset(&io);
+      }, &jf);
+    } else if (table == "snap") {
+      // The snap table is global (one per filesystem), not per-rank,
+      // hence no apply_role_fn here.
+      r = TableHandler<SnapServer>(mds_role_t(), "snaptable", true).reset(&io);
+      jf.open_object_section("reset_snap_status");
+      jf.dump_int("result", r);
+      jf.close_section();
+    } else {
+      cerr << "Invalid table '" << table << "'" << std::endl;
+      return -EINVAL;
+    }
+  } else if (mode == "show") {
+    if (table == "session") {
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
+      }, &jf);
+    } else if (table == "inode") {
+      // (dropped a stray double semicolon after this call)
+      r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);
+      }, &jf);
+    } else if (table == "snap") {
+      jf.open_object_section("show_snap_table");
+      {
+        r = TableHandler<SnapServer>(
+            mds_role_t(), "snaptable", true).load_and_dump(&io, &jf);
+        jf.dump_int("result", r);
+      }
+      jf.close_section();
+    } else {
+      cerr << "Invalid table '" << table << "'" << std::endl;
+      return -EINVAL;
+    }
+  } else if (mode == "take_inos") {
+    const std::string ino_str = std::string(argv[2]);
+    std::string ino_err;
+    inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err);
+    if (!ino_err.empty()) {
+      derr << "Bad ino '" << ino_str << "'" << dendl;
+      return -EINVAL;
+    }
+    r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int {
+      return InoTableHandler(rank).take_inos(&io, ino, f);
+    }, &jf);
+  } else {
+    cerr << "Invalid mode '" << mode << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // Subcommand should have written to formatter, flush it
+  jf.flush(std::cout);
+  std::cout << std::endl;
+  return r;
+}
+
diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h
new file mode 100644
index 000000000..bf9b95c12
--- /dev/null
+++ b/src/tools/cephfs/TableTool.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+// This header previously had no include guard; multiple inclusion would
+// redefine class TableTool.
+#pragma once
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+
+#include "include/rados/librados.hpp"
+
+/**
+ * Command line tool for debugging the backing store of
+ * MDSTable instances.
+ */
+class TableTool : public MDSUtility
+{
+  private:
+    // Filesystem/rank(s) selected on the command line.
+    MDSRoleSelector role_selector;
+
+    // I/O handles
+    librados::Rados rados;
+    librados::IoCtx io;
+
+    // Run `fptr` once per selected role, aggregating output into `f`.
+    int apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f);
+
+  public:
+    static void usage();
+    int main(std::vector<const char*> &argv);
+
+};
+
+
diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc
new file mode 100644
index 000000000..e6efff66c
--- /dev/null
+++ b/src/tools/cephfs/cephfs-data-scan.cc
@@ -0,0 +1,47 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "DataScan.h"
+
+
+// Tool entry point: bootstrap a ceph client context, then hand control
+// to DataScan::init()/main().
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    DataScan::usage();
+    exit(0);
+  }
+
+  // Standard ceph tool bootstrap: build a CephContext and strip the
+  // common options out of `args`.
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  DataScan data_scan;
+
+  // Connect to mon cluster, download MDS map etc
+  int rc = data_scan.init();
+  if (rc != 0) {
+    std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+    return rc;
+  }
+
+  // Finally, execute the user's commands
+  rc = data_scan.main(args);
+  if (rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+  }
+
+
+  return rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc
new file mode 100644
index 000000000..290cb305b
--- /dev/null
+++ b/src/tools/cephfs/cephfs-journal-tool.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "JournalTool.h"
+
+
+// Tool entry point: bootstrap a ceph client context, then hand control
+// to JournalTool::init()/main().
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    JournalTool::usage();
+    exit(0);
+  }
+
+  // Standard ceph tool bootstrap: build a CephContext and strip the
+  // common options out of `args`.
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  JournalTool jt;
+
+  // Connect to mon cluster, download MDS map etc
+  int rc = jt.init();
+  if (rc != 0) {
+    std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+    return rc;
+  }
+
+  // Finally, execute the user's commands
+  rc = jt.main(args);
+  if (rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+  }
+
+  return rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-meta-injection.cc b/src/tools/cephfs/cephfs-meta-injection.cc
new file mode 100644
index 000000000..5768d3869
--- /dev/null
+++ b/src/tools/cephfs/cephfs-meta-injection.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <include/types.h>
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "MetaTool.h"
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+using std::string;
+using namespace std;
+static string version = "cephfs-meta-injection v1.1";
+
+// Tool entry point: parse options with boost::program_options, then run
+// MetaTool with the selected mode.
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  env_to_vec(args);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  string rank_str, minfo, ino, out,in;
+  po::options_description general("general options");
+  general.add_options()
+    ("help,h", "produce help message")
+    ("debug", "show debug info")
+    ("rank,r", po::value<string>(&rank_str), "the rank of cephfs, default(0) (e.g. -r cephfs_a:0)")
+    ("minfo", po::value<string>(&minfo), "specify metapool, datapools and rank (e.g. cephfs_metadata_a:cephfs_data_a:0)")
+    ("ino,i", po::value<string>(&ino), "specify inode. e.g. 1099511627776 or 0x10000000000, you can find it with cmd, 'ls -i'")
+    ("out,o", po::value<string>(&out), "output file")
+    ("in", po::value<string>(&in), "input file")
+    ("yes-i-really-really-mean-it", "need by amend info")
+    ;
+
+  string mode;
+  po::options_description modeoptions("mode options");
+  modeoptions.add_options()
+    ("mode", po::value<string>(&mode),
+     "\tlistc : list all obj of dir\n" \
+     "\tshowm : show the info of ino\n" \
+     "\tshowfn : show the fnode of dir\n" \
+     "\tamend : amend part of the meta data\n" \
+     "\tamendfn : amend fnode from file\n"
+    );
+
+  po::positional_options_description p;
+  p.add("mode", 1);
+
+  po::options_description all("all options");
+  all.add(modeoptions).add(general);
+  po::variables_map vm;
+  try {
+    po::store(po::command_line_parser(argc, argv).options(all).positional(p).allow_unregistered().run(), vm);
+  } catch(exception &e) {
+    cerr << "error : " << e.what() << std::endl;
+    return -1;
+  } catch(...) {
+    // Previously returned 0 (success) despite the parse failure.
+    cout << "param error" << std::endl;
+    return -1;
+  }
+
+  boost::program_options::notify(vm);
+  if (vm.count("help")) {
+    std::cout << version << std::endl;
+    std::cout << "usage : \n"
+              << "  cephfs-meta-injection <listc|showm|showfn|amend|amendfn> -r <fsname:rank> -i <ino>"
+              << std::endl;
+    // The example must use the option actually declared above
+    // (--yes-i-really-really-mean-it); the old text advertised a
+    // non-existent --yes-i-really-mean-it flag.
+    std::cout << "example : \n"
+              << "  amend info of inode(1099531628828)\n"
+              << "  cephfs-meta-injection showm -r cephfs_a:0 -i 1099531628828 -o out\n"
+              << "  alter file\n"
+              << "  cephfs-meta-injection amend -r cephfs_a:0 -i 1099531628828 --in out --yes-i-really-really-mean-it"
+              << std::endl;
+    std::cout << all << std::endl;
+    return 0;
+  }
+
+  MetaTool mt(vm.count("debug"));
+  int rc = mt.init();
+  if (rc != 0) {
+    std::cerr << "error in initialization: " << cpp_strerror(rc) << std::endl;
+    return rc;
+  }
+  rc = mt.main(mode, rank_str, minfo, ino, out, in, vm.count("yes-i-really-really-mean-it"));
+  if (rc != 0) {
+    std::cerr << "error (" << cpp_strerror(rc) << ")" << std::endl;
+    return -1;
+  }
+  return rc;
+}
diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc
new file mode 100644
index 000000000..47b475dd0
--- /dev/null
+++ b/src/tools/cephfs/cephfs-table-tool.cc
@@ -0,0 +1,47 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "TableTool.h"
+
+
+// Tool entry point: bootstrap a ceph client context, then hand control
+// to TableTool::init()/main().
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    TableTool::usage();
+    exit(0);
+  }
+
+  // Standard ceph tool bootstrap: build a CephContext and strip the
+  // common options out of `args`.
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  TableTool tt;
+
+  // Connect to mon cluster, download MDS map etc
+  int rc = tt.init();
+  if (rc != 0) {
+    std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+    return rc;
+  }
+
+  // Finally, execute the user's commands
+  rc = tt.main(args);
+  if (rc != 0) {
+    std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+  }
+
+  return rc;
+}
+
+
+
diff --git a/src/tools/cephfs/shell/CMakeLists.txt b/src/tools/cephfs/shell/CMakeLists.txt
new file mode 100644
index 000000000..5a1f6ad80
--- /dev/null
+++ b/src/tools/cephfs/shell/CMakeLists.txt
@@ -0,0 +1,7 @@
+include(Distutils)
+distutils_install_module(cephfs-shell)
+
+if(WITH_TESTS)
+ include(AddCephTest)
+ add_tox_test(cephfs-shell)
+endif()
diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell
new file mode 100755
index 000000000..51bd569e0
--- /dev/null
+++ b/src/tools/cephfs/shell/cephfs-shell
@@ -0,0 +1,1684 @@
+#!/usr/bin/python3
+# coding = utf-8
+
+import argparse
+import os
+import os.path
+import sys
+import cephfs as libcephfs
+import shutil
+import traceback
+import colorama
+import fnmatch
+import math
+import re
+import shlex
+import stat
+import errno
+
+from cmd2 import Cmd
+from cmd2 import __version__ as cmd2_version
+from distutils.version import LooseVersion
+
+# cephfs-shell relies on python3-only behaviour throughout; refuse to run
+# under python2.
+if sys.version_info.major < 3:
+    raise RuntimeError("cephfs-shell is only compatible with python3")
+
+try:
+    from cmd2 import with_argparser
+except ImportError:
+    # Older cmd2 releases do not ship with_argparser; provide a minimal
+    # stand-in that parses the raw command line with the given argparser
+    # before invoking the wrapped do_* handler.
+    def with_argparser(argparser):
+        import functools
+
+        def argparser_decorator(func):
+            @functools.wraps(func)
+            def wrapper(thiz, cmdline):
+                # do not split if it's already a list
+                if isinstance(cmdline, list):
+                    arglist = cmdline
+                else:
+                    arglist = shlex.split(cmdline, posix=False)
+                # in case user quotes the command args
+                arglist = [arg.strip('\'""') for arg in arglist]
+                try:
+                    args = argparser.parse_args(arglist)
+                except SystemExit:
+                    shell.exit_code = 1
+                    # argparse exits at seeing bad arguments
+                    return
+                else:
+                    return func(thiz, args)
+            # Derive the parser's prog/description from the do_* method name
+            # and docstring, mirroring cmd2's own with_argparser.
+            argparser.prog = func.__name__[3:]
+            if argparser.description is None and func.__doc__:
+                argparser.description = func.__doc__
+
+            return wrapper
+
+        return argparser_decorator
+
+
+cephfs = None  # holds CephFS Python bindings
+shell = None  # holds instance of class CephFSShell
+# Map error conditions (symbolic names plus errno values) to the process
+# exit codes reported by cephfs-shell.
+exit_codes = {'Misc': 1,
+              'KeyboardInterrupt': 2,
+              errno.EPERM: 3,
+              errno.EACCES: 4,
+              errno.ENOENT: 5,
+              errno.EIO: 6,
+              errno.ENOSPC: 7,
+              errno.EEXIST: 8,
+              errno.ENODATA: 9,
+              errno.EINVAL: 10,
+              errno.EOPNOTSUPP: 11,
+              errno.ERANGE: 12,
+              errno.EWOULDBLOCK: 13,
+              errno.ENOTEMPTY: 14,
+              errno.ENOTDIR: 15,
+              errno.EDQUOT: 16,
+              errno.EPIPE: 17,
+              errno.ESHUTDOWN: 18,
+              errno.ECONNABORTED: 19,
+              errno.ECONNREFUSED: 20,
+              errno.ECONNRESET: 21,
+              errno.EINTR: 22}
+
+
+#########################################################################
+#
+# Following are methods are generically useful through class CephFSShell
+#
+#######################################################################
+
+
+def poutput(s, end='\n'):
+    # Module-level print helper delegating to the active shell instance.
+    shell.poutput(s, end=end)
+
+
+def perror(msg, **kwargs):
+    # Module-level error-print helper delegating to the active shell.
+    shell.perror(msg, **kwargs)
+
+
+def set_exit_code_msg(errcode='Misc', msg=''):
+    """
+    Set exit code and print error message
+    """
+    if isinstance(msg, libcephfs.Error):
+        # Map the libcephfs errno onto the exit_codes table.
+        # NOTE(review): raises KeyError if the error code is not in
+        # exit_codes -- confirm all libcephfs error codes are covered.
+        shell.exit_code = exit_codes[msg.get_error_code()]
+    else:
+        shell.exit_code = exit_codes[errcode]
+    if msg:
+        perror(msg)
+
+
+def mode_notation(mode):
+    """
+    Render a numeric st_mode as an ls-style string, e.g.
+    0o100644 -> '-rw-r--r--'.  Recognises only '-', 'd' (directory,
+    0o04xxxx) and 'l' (symlink, 0o12xxxx) file types.
+    """
+    permission_bits = {'0': '---',
+                       '1': '--x',
+                       '2': '-w-',
+                       '3': '-wx',
+                       '4': 'r--',
+                       '5': 'r-x',
+                       '6': 'rw-',
+                       '7': 'rwx'}
+    mode = str(oct(mode))
+    notation = '-'
+    # oct() yields '0o4....' for directories and '0o12....' for symlinks.
+    if mode[2] == '4':
+        notation = 'd'
+    elif mode[2:4] == '12':
+        notation = 'l'
+    # The last three octal digits are the user/group/other permissions.
+    for i in mode[-3:]:
+        notation += permission_bits[i]
+    return notation
+
+
+def get_chunks(file_size):
+    # Generator of (offset, length) pairs covering file_size in 128 KiB
+    # chunks; the final pair holds whatever remains.
+    chunk_start = 0
+    chunk_size = 0x20000  # 131072 bytes, default max ssl buffer size
+    while chunk_start + chunk_size < file_size:
+        yield chunk_start, chunk_size
+        chunk_start += chunk_size
+    final_chunk_size = file_size - chunk_start
+    yield chunk_start, final_chunk_size
+
+
+def to_bytes(param):
+    """
+    Coerce str/int/float (and lists thereof, recursively) to UTF-8 bytes.
+    bytes pass through unchanged; None stays None; any other type falls
+    off the end and implicitly returns None.
+    """
+    # don't convert as follows as it can lead unusable results like coverting
+    # [1, 2, 3, 4] to '[1, 2, 3, 4]' -
+    # str(param).encode('utf-8')
+    if isinstance(param, bytes):
+        return param
+    elif isinstance(param, str):
+        return bytes(param, encoding='utf-8')
+    elif isinstance(param, list):
+        return [i.encode('utf-8') if isinstance(i, str) else to_bytes(i) for
+                i in param]
+    elif isinstance(param, int) or isinstance(param, float):
+        return str(param).encode('utf-8')
+    elif param is None:
+        return None
+
+
+def ls(path, opts=''):
+    """
+    Generator yielding the dirents of `path`.  opts tries to be like
+    /bin/ls opts: 'A' ("almost all") skips the '.' and '..' entries.
+    On a missing path, records the error via set_exit_code_msg and
+    yields nothing.
+    """
+    almost_all = 'A' in opts
+    try:
+        with cephfs.opendir(path) as d:
+            while True:
+                dent = cephfs.readdir(d)
+                if dent is None:
+                    return
+                elif almost_all and dent.d_name in (b'.', b'..'):
+                    continue
+                yield dent
+    except libcephfs.ObjectNotFound as e:
+        set_exit_code_msg(msg=e)
+
+
+def glob(path, pattern):
+    """
+    Return the entries of directory `path` (bytes) whose names match the
+    fnmatch `pattern`, as full paths.  A non-existent path yields [].
+    """
+    paths = []
+    parent_dir = os.path.dirname(path)
+    if parent_dir == b'':
+        parent_dir = b'/'
+    if path == b'/' or is_dir_exists(os.path.basename(path), parent_dir):
+        for i in ls(path, opts='A'):
+            if fnmatch.fnmatch(i.d_name, pattern):
+                paths.append(os.path.join(path, i.d_name))
+    return paths
+
+
+def locate_file(name, case_sensitive=True):
+    # Walk everything under the cwd and return the paths whose text
+    # contains `name` (substring match, optionally case-insensitive).
+    dir_list = sorted(set(dirwalk(cephfs.getcwd())))
+    if not case_sensitive:
+        return [dname for dname in dir_list if name.lower() in dname.lower()]
+    else:
+        return [dname for dname in dir_list if name in dname]
+
+
+def get_all_possible_paths(pattern):
+    """
+    Expand a glob `pattern` (bytes; absolute or relative to the cwd)
+    component by component, returning every existing path that matches
+    the full pattern.
+    """
+    complete_pattern = pattern[:]
+    paths = []
+    is_rel_path = not os.path.isabs(pattern)
+    if is_rel_path:
+        dir_ = cephfs.getcwd()
+    else:
+        dir_ = b'/'
+        pattern = pattern[1:]
+    patterns = pattern.split(b'/')
+    # Seed with matches for the first component, then extend `paths`
+    # while iterating so deeper components are expanded as well.
+    paths.extend(glob(dir_, patterns[0]))
+    patterns.pop(0)
+    for pattern in patterns:
+        for path in paths:
+            paths.extend(glob(path, pattern))
+    if is_rel_path:
+        complete_pattern = os.path.join(cephfs.getcwd(), complete_pattern)
+    return [path for path in paths if fnmatch.fnmatch(path, complete_pattern)]
+
+
+# Unit suffixes for humansize(), in increasing powers of 1024.
+suffixes = ['B', 'K', 'M', 'G', 'T', 'P']
+
+
+def humansize(nbytes):
+    # Format a byte count with the largest suffix keeping the value
+    # >= 1, rounding the scaled value up, e.g. 1536 -> '2K'.
+    i = 0
+    while nbytes >= 1024 and i < len(suffixes) - 1:
+        nbytes /= 1024.
+        i += 1
+    nbytes = math.ceil(nbytes)
+    # NOTE(review): the rstrip('.') is a no-op -- '%d' never produces a
+    # dot; kept for byte-identical behaviour.
+    f = ('%d' % nbytes).rstrip('.')
+    return '%s%s' % (f, suffixes[i])
+
+
+def style_listing(path, is_dir, is_symlink, ls_long=False):
+    """
+    Colourise a listing entry: cyan for symlinks (plus '-> target' in
+    long mode), blue with a trailing '/' for directories; plain names
+    are returned unchanged.
+    """
+    if not (is_dir or is_symlink):
+        return path
+    pretty = colorama.Style.BRIGHT
+    if is_symlink:
+        pretty += colorama.Fore.CYAN + path
+        if ls_long:
+            # Add target path
+            pretty += ' -> ' + cephfs.readlink(path, size=255).decode('utf-8')
+    elif is_dir:
+        pretty += colorama.Fore.BLUE + path + '/'
+    pretty += colorama.Style.RESET_ALL
+    return pretty
+
+
+def print_long(path, is_dir, is_symlink, human_readable):
+    # Emit one `ls -l`-style line: mode string, size (optionally
+    # humanised), uid, gid, mtime and the styled name.
+    info = cephfs.stat(path, follow_symlink=(not is_symlink))
+    pretty = style_listing(os.path.basename(path.decode('utf-8')), is_dir, is_symlink, True)
+    if human_readable:
+        sizefmt = '\t {:10s}'.format(humansize(info.st_size))
+    else:
+        sizefmt = '{:12d}'.format(info.st_size)
+    poutput(f'{mode_notation(info.st_mode)} {sizefmt} {info.st_uid} {info.st_gid} {info.st_mtime}'
+            f' {pretty}')
+
+
+def word_len(word):
+    """
+    Returns the word length, minus any color codes.
+    """
+    # Assumes a styled word carries exactly 9 characters of ANSI escape
+    # overhead (as produced by style_listing) -- TODO confirm.
+    if word[0] == '\x1b':
+        return len(word) - 9
+    return len(word)
+
+
+def is_dir_exists(path, dir_=b''):
+    # True iff dir_/path exists and its mode has the S_IFDIR bit
+    # (0o0040000) set; stat failures count as "does not exist".
+    path_to_stat = os.path.join(dir_, path)
+    try:
+        return ((cephfs.stat(path_to_stat).st_mode & 0o0040000) != 0)
+    except libcephfs.Error:
+        return False
+
+
+def is_file_exists(path, dir_=b''):
+    try:
+        # if its not a directory, then its a file
+        return ((cephfs.stat(os.path.join(dir_, path)).st_mode & 0o0040000) == 0)
+    except libcephfs.Error:
+        return False
+
+
+def print_list(words, termwidth=79):
+    """
+    Print `words` in vertical columns sized to the longest entry,
+    filling `termwidth` characters per row (like ls's default layout).
+    """
+    if not words:
+        return
+    words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in words]
+    width = max([word_len(word) for word in words]) + 2
+    nwords = len(words)
+    ncols = max(1, (termwidth + 1) // (width + 1))
+    nrows = (nwords + ncols - 1) // ncols
+    for row in range(nrows):
+        for i in range(row, nwords, nrows):
+            word = words[i]
+            print_width = width
+            # Widen the field for colourised entries so the invisible
+            # escape codes don't wreck the column alignment.
+            if word[0] == '\x1b':
+                print_width = print_width + 10
+
+            poutput('%-*s' % (print_width, words[i]),
+                    end='\n' if i + nrows >= nwords else '')
+
+
+def copy_from_local(local_path, remote_path):
+    """
+    Copy a local file (or stdin when local_path is b'-') into the CephFS
+    file at remote_path, writing in 64 KiB pieces.  Errors set the shell
+    exit code instead of raising.
+    """
+    stdin = -1
+    file_ = None
+    fd = None
+    convert_to_bytes = False
+    if local_path == b'-':
+        file_ = sys.stdin
+        # stdin yields str, so each chunk must be encoded before writing.
+        convert_to_bytes = True
+    else:
+        try:
+            file_ = open(local_path, 'rb')
+        except PermissionError as e:
+            set_exit_code_msg(e.errno, 'error: no permission to read local file {}'.format(
+                local_path.decode('utf-8')))
+            return
+        stdin = 1
+    try:
+        fd = cephfs.open(remote_path, 'w', 0o666)
+    except libcephfs.Error as e:
+        set_exit_code_msg(msg=e)
+        return
+    progress = 0
+    while True:
+        data = file_.read(65536)
+        if not data or len(data) == 0:
+            break
+        if convert_to_bytes:
+            data = to_bytes(data)
+        # `progress` doubles as the write offset into the remote file.
+        wrote = cephfs.write(fd, data, progress)
+        if wrote < 0:
+            break
+        progress += wrote
+    cephfs.close(fd)
+    # Only close file_ when we opened it ourselves (not for stdin).
+    if stdin > 0:
+        file_.close()
+    poutput('')
+
+
+def copy_to_local(remote_path, local_path):
+    """
+    Copy the CephFS file at remote_path to a local file (or stdout when
+    local_path is b'-'), reading in get_chunks()-sized pieces.  Missing
+    local parent directories are created; empty sources copy nothing.
+    """
+    fd = None
+    if local_path != b'-':
+        local_dir = os.path.dirname(local_path)
+        dir_list = remote_path.rsplit(b'/', 1)
+        if not os.path.exists(local_dir):
+            os.makedirs(local_dir)
+        if len(dir_list) > 2 and dir_list[1] == b'':
+            return
+        fd = open(local_path, 'wb+')
+    file_ = cephfs.open(remote_path, 'r')
+    file_size = cephfs.stat(remote_path).st_size
+    if file_size <= 0:
+        return
+    progress = 0
+    for chunk_start, chunk_size in get_chunks(file_size):
+        file_chunk = cephfs.read(file_, chunk_start, chunk_size)
+        progress += len(file_chunk)
+        if fd:
+            fd.write(file_chunk)
+        else:
+            # No local fd means destination is stdout.
+            poutput(file_chunk.decode('utf-8'))
+    cephfs.close(file_)
+    if fd:
+        fd.close()
+
+
+def dirwalk(path):
+    """
+    walk a directory tree, using a generator
+    """
+    path = os.path.normpath(path)
+    for item in ls(path, opts='A'):
+        fullpath = os.path.join(path, item.d_name)
+        src_path = fullpath.rsplit(b'/', 1)[0]
+
+        yield os.path.normpath(fullpath)
+        # Recurse depth-first into subdirectories.
+        if is_dir_exists(item.d_name, src_path):
+            for x in dirwalk(fullpath):
+                yield x
+
+
+##################################################################
+#
+# Following methods are implementation for CephFS Shell commands
+#
+#################################################################
+
+class CephFSShell(Cmd):
+
+ def __init__(self):
+ super().__init__(use_ipython=False)
+ self.working_dir = cephfs.getcwd().decode('utf-8')
+ self.set_prompt()
+ self.interactive = False
+ self.umask = '2'
+
+ def default(self, line):
+ perror('Unrecognized command')
+
+ def set_prompt(self):
+ self.prompt = ('\033[01;33mCephFS:~' + colorama.Fore.LIGHTCYAN_EX
+ + self.working_dir + colorama.Style.RESET_ALL
+ + '\033[01;33m>>>\033[00m ')
+
+ def create_argparser(self, command):
+ try:
+ argparse_args = getattr(self, 'argparse_' + command)
+ except AttributeError:
+ set_exit_code_msg()
+ return None
+ doc_lines = getattr(
+ self, 'do_' + command).__doc__.expandtabs().splitlines()
+ if '' in doc_lines:
+ blank_idx = doc_lines.index('')
+ usage = doc_lines[:blank_idx]
+ description = doc_lines[blank_idx + 1:]
+ else:
+ usage = doc_lines
+ description = []
+ parser = argparse.ArgumentParser(
+ prog=command,
+ usage='\n'.join(usage),
+ description='\n'.join(description),
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ for args, kwargs in argparse_args:
+ parser.add_argument(*args, **kwargs)
+ return parser
+
+ def complete_filenames(self, text, line, begidx, endidx):
+ if not text:
+ completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir())
+ for x in ls(b".", opts='A')]
+ else:
+ if text.count('/') > 0:
+ completions = [text.rsplit('/', 1)[0] + '/'
+ + x.d_name.decode('utf-8') + '/'
+ * int(x.is_dir()) for x in ls('/'
+ + text.rsplit('/', 1)[0], opts='A')
+ if x.d_name.decode('utf-8').startswith(
+ text.rsplit('/', 1)[1])]
+ else:
+ completions = [x.d_name.decode('utf-8') + '/'
+ * int(x.is_dir()) for x in ls(b".", opts='A')
+ if x.d_name.decode('utf-8').startswith(text)]
+ if len(completions) == 1 and completions[0][-1] == '/':
+ dir_, file_ = completions[0].rsplit('/', 1)
+ completions.extend([dir_ + '/' + x.d_name.decode('utf-8')
+ + '/' * int(x.is_dir()) for x in
+ ls('/' + dir_, opts='A')
+ if x.d_name.decode('utf-8').startswith(file_)])
+ return self.delimiter_complete(text, line, begidx, endidx, completions, '/')
+ return completions
+
+ def onecmd(self, line, **kwargs):
+ """
+ Global error catcher
+ """
+ try:
+ res = Cmd.onecmd(self, line, **kwargs)
+ if self.interactive:
+ self.set_prompt()
+ return res
+ except ConnectionError as e:
+ set_exit_code_msg(e.errno, f'***\n{e}')
+ except KeyboardInterrupt:
+ set_exit_code_msg('KeyboardInterrupt', 'Command aborted')
+ except (libcephfs.Error, Exception) as e:
+ if shell.debug:
+ traceback.print_exc(file=sys.stdout)
+ set_exit_code_msg(msg=e)
+
+ class path_to_bytes(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ values = to_bytes(values)
+ setattr(namespace, self.dest, values)
+
+ # TODO: move the necessary contents from here to `class path_to_bytes`.
+ class get_list_of_bytes_path(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ values = to_bytes(values)
+
+ if values == b'.':
+ values = cephfs.getcwd()
+ else:
+ for i in values:
+ if i == b'.':
+ values[values.index(i)] = cephfs.getcwd()
+
+ setattr(namespace, self.dest, values)
+
def complete_mkdir(self, text, line, begidx, endidx):
    """Tab-complete remote directory names for the mkdir command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
class ModeAction(argparse.Action):
    """
    argparse action that validates a chmod-style mode argument.

    Accepts either a numeric octal literal (e.g. '755') or a symbolic
    assignment of the form [ugo|a]=[rwx] (e.g. 'u=rwx', 'a=r').  Only
    assignment ('=') is supported, not '+'/'-'.  The validated mode is
    stored on the namespace as an octal string such as '0o755'.
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None and nargs != '?':
            raise ValueError("more than one modes not allowed")
        super().__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        o_mode = 0
        res = None
        try:
            o_mode = int(values, base=8)
        except ValueError:
            # BUG FIX: the pattern is now anchored with '$' so trailing
            # garbage (e.g. 'u=rwxq') is rejected instead of being
            # silently truncated to the matching prefix.
            res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)$', values)
            if res is None:
                parser.error("invalid mode: %s\n"
                             "mode must be a numeric octal literal\n"
                             "or ((u?g?o?)|(a?))(=)(r?w?x?)" %
                             values)
            else:
                # we are supporting only assignment of mode and not + or -
                # as is generally available with the chmod command
                # eg.
                # >>> res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=')
                # >>> res.groups()
                # ('go', 'go', None, '=', '')
                val = res.groups()

                if val[3] != '=':
                    parser.error("need assignment operator between user "
                                 "and mode specifiers")
                if val[4] == '':
                    parser.error("invalid mode: %s\n"
                                 "mode must be combination of: r | w | x" %
                                 values)
                # val[1] is the 'u?g?o?' alternative, val[2] the 'a?' one.
                users = ''
                if val[2] is None:
                    users = val[1]
                else:
                    users = val[2]

                t_mode = 0
                if users == 'a':
                    users = 'ugo'

                if 'r' in val[4]:
                    t_mode |= 4
                if 'w' in val[4]:
                    t_mode |= 2
                if 'x' in val[4]:
                    t_mode |= 1

                if 'u' in users:
                    o_mode |= (t_mode << 6)
                if 'g' in users:
                    o_mode |= (t_mode << 3)
                if 'o' in users:
                    o_mode |= t_mode

        # Range checks apply to both the numeric and the symbolic forms.
        if o_mode < 0:
            parser.error("invalid mode: %s\n"
                         "mode cannot be negative" % values)
        if o_mode > 0o777:
            parser.error("invalid mode: %s\n"
                         "mode cannot be greater than octal 0777" % values)

        setattr(namespace, self.dest, str(oct(o_mode)))
+
# Argument parser for the `mkdir` command.
mkdir_parser = argparse.ArgumentParser(
    description='Create the directory(ies), if they do not already exist.')
mkdir_parser.add_argument('dirs', type=str,
                          action=path_to_bytes,
                          metavar='DIR_NAME',
                          help='Name of new_directory.',
                          nargs='+')
mkdir_parser.add_argument('-m', '--mode', type=str,
                          action=ModeAction,
                          help='Sets the access mode for the new directory.')
mkdir_parser.add_argument('-p', '--parent', action='store_true',
                          help='Create parent directories as necessary. '
                               'When this option is specified, no error is '
                               'reported if a directory already exists.')
+
@with_argparser(mkdir_parser)
def do_mkdir(self, args):
    """
    Create directory(ies) at the given remote path(s).
    """
    # The mode applies to the whole command; parse it once instead of
    # re-parsing it for every path in the loop.
    if args.mode:
        permission = int(args.mode, 8)
    else:
        permission = 0o777
    for path in args.dirs:
        if args.parent:
            # mkdirs creates missing parents and tolerates existing dirs.
            cephfs.mkdirs(path, permission)
        else:
            try:
                cephfs.mkdir(path, permission)
            except libcephfs.Error as e:
                # keyword form for consistency with every other handler
                set_exit_code_msg(msg=e)
+
def complete_put(self, text, line, begidx, endidx):
    """Tab-complete the local source path for the put command."""
    return self.index_based_complete(text, line, begidx, endidx,
                                    {1: self.path_complete})
+
# Argument parser for the `put` command (local -> CephFS copy).
put_parser = argparse.ArgumentParser(
    description='Copy a file/directory to Ceph File System from Local File System.')
put_parser.add_argument('local_path', action=path_to_bytes, type=str,
                        help='Path of the file in the local system')
put_parser.add_argument('remote_path', action=path_to_bytes, type=str,
                        help='Path of the file in the remote system')
put_parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrites the destination if it already exists.')
+
@with_argparser(put_parser)
def do_put(self, args):
    """
    Copy a local file/directory to CephFS.

    '-' as the local path means "read from stdin"; directories are
    copied recursively via os.walk().
    """
    if args.local_path != b'-' and not os.path.isfile(args.local_path) \
            and not os.path.isdir(args.local_path):
        set_exit_code_msg(errno.ENOENT,
                          msg=f"error: "
                              f"{args.local_path.decode('utf-8')}: "
                              f"No such file or directory")
        return

    if (is_file_exists(args.remote_path) or is_dir_exists(
            args.remote_path)) and not args.force:
        set_exit_code_msg(msg=f"error: file/directory "
                              f"{args.remote_path.decode('utf-8')} "
                              f"exists, use --force to overwrite")
        return

    root_src_dir = args.local_path
    root_dst_dir = args.remote_path
    # Normalize the local source to an absolute path.
    if args.local_path == b'.' or args.local_path == b'./':
        root_src_dir = os.getcwdb()
    elif len(args.local_path.rsplit(b'/', 1)) < 2:
        root_src_dir = os.path.join(os.getcwdb(), args.local_path)
    else:
        p = args.local_path.split(b'/')
        if p[0] == b'.':
            root_src_dir = os.getcwdb()
            p.pop(0)
            while len(p) > 0:
                root_src_dir += b'/' + p.pop(0)

    if root_dst_dir == b'.':
        if args.local_path != b'-':
            # Default destination name: last component of the source.
            root_dst_dir = root_src_dir.rsplit(b'/', 1)[1]
            if root_dst_dir == b'':
                root_dst_dir = root_src_dir.rsplit(b'/', 1)[0]
                a = root_dst_dir.rsplit(b'/', 1)
                if len(a) > 1:
                    root_dst_dir = a[1]
                else:
                    root_dst_dir = a[0]
        else:
            set_exit_code_msg(errno.EINVAL, 'error: no filename specified '
                              'for destination')
            return

    # BUG FIX: `root_dst_dir[-1] != b'/'` compared an int (bytes indexing)
    # to bytes and was therefore always True, appending '/' even when one
    # was already present (and raising IndexError on an empty value).
    # A one-byte slice compares bytes-to-bytes correctly.
    if root_dst_dir[-1:] != b'/':
        root_dst_dir += b'/'

    if args.local_path == b'-' or os.path.isfile(root_src_dir):
        # Single file (or stdin) copy.
        if args.local_path == b'-':
            root_src_dir = b'-'
        copy_from_local(root_src_dir, root_dst_dir)
    else:
        # Recursive directory copy: mirror the tree under root_dst_dir.
        for src_dir, dirs, files in os.walk(root_src_dir):
            if isinstance(src_dir, str):
                src_dir = to_bytes(src_dir)
            dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1)
            dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd()
                             + dst_dir)
            if args.force and dst_dir != b'/' and not is_dir_exists(
                    dst_dir[:-1]) and not locate_file(dst_dir):
                try:
                    cephfs.mkdirs(dst_dir, 0o777)
                except libcephfs.Error:
                    pass
            if (not args.force) and dst_dir != b'/' and not is_dir_exists(
                    dst_dir) and not os.path.isfile(root_src_dir):
                try:
                    cephfs.mkdirs(dst_dir, 0o777)
                except libcephfs.Error:
                    # TODO: perhaps, set retval to 1?
                    pass

            for dir_ in dirs:
                dir_name = os.path.join(dst_dir, dir_)
                if not is_dir_exists(dir_name):
                    try:
                        cephfs.mkdirs(dir_name, 0o777)
                    except libcephfs.Error:
                        # TODO: perhaps, set retval to 1?
                        pass

            for file_ in files:
                src_file = os.path.join(src_dir, file_)
                dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_)
                # NOTE(review): returning here aborts the whole copy on
                # the first pre-existing file — presumably intentional,
                # but `continue` may be what was meant; confirm.
                if (not args.force) and is_file_exists(dst_file):
                    return
                copy_from_local(src_file, os.path.join(cephfs.getcwd(),
                                                       dst_file))
+
def complete_get(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the get command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `get` command (CephFS -> local copy).
get_parser = argparse.ArgumentParser(
    description='Copy a file from Ceph File System to Local Directory.')
get_parser.add_argument('remote_path', action=path_to_bytes, type=str,
                        help='Path of the file in the remote system')
get_parser.add_argument('local_path', action=path_to_bytes, type=str,
                        help='Path of the file in the local system')
get_parser.add_argument('-f', '--force', action='store_true',
                        help='Overwrites the destination if it already exists.')
+
@with_argparser(get_parser)
def do_get(self, args):
    """
    Copy a file/directory from CephFS to given path.

    '-' as the local path writes the remote file to stdout; directories
    are copied recursively via dirwalk().
    """
    if not is_file_exists(args.remote_path) and not \
            is_dir_exists(args.remote_path):
        set_exit_code_msg(errno.ENOENT, "error: no file/directory"
                          " found at specified remote "
                          "path")
        return
    if (os.path.isfile(args.local_path) or os.path.isdir(
            args.local_path)) and not args.force:
        set_exit_code_msg(msg=f"error: file/directory "
                          f"{args.local_path.decode('utf-8')}"
                          f" already exists, use --force to "
                          f"overwrite")
        return
    root_src_dir = args.remote_path
    root_dst_dir = args.local_path
    # fname = [dirname, basename] when the remote path contains a '/'.
    fname = root_src_dir.rsplit(b'/', 1)
    if args.local_path == b'.':
        root_dst_dir = os.getcwdb()
    if args.remote_path == b'.':
        root_src_dir = cephfs.getcwd()
    if args.local_path == b'-':
        if args.remote_path == b'.' or args.remote_path == b'./':
            set_exit_code_msg(errno.EINVAL, 'error: no remote file name specified')
            return
        copy_to_local(root_src_dir, b'-')
    elif is_file_exists(args.remote_path):
        copy_to_local(root_src_dir, root_dst_dir)
    elif b'/' in root_src_dir and is_file_exists(fname[1], fname[0]):
        # File addressed relative to an explicit remote directory.
        copy_to_local(root_src_dir, root_dst_dir)
    else:
        # Directory copy: walk deepest-first so files are created after
        # their parent directories are handled below.
        files = list(reversed(sorted(dirwalk(root_src_dir))))
        for file_ in files:
            dst_dirpath, dst_file = file_.rsplit(b'/', 1)
            if dst_dirpath in files:
                files.remove(dst_dirpath)
            # NOTE(review): if dst_dirpath is absolute, os.path.join
            # discards root_dst_dir — presumably dirwalk() yields paths
            # that make this work; confirm.
            dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file)
            dst_path = os.path.normpath(dst_path)
            if is_dir_exists(file_):
                try:
                    os.makedirs(dst_path)
                except OSError:
                    pass
            else:
                copy_to_local(file_, dst_path)

    return 0
+
def complete_ls(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the ls command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `ls` command.
# BUG FIX: the description previously read 'Copy a file from Ceph File
# System from Local Directory.' — a copy-paste from the get parser.
# The default is kept as bytes so it is safe even when the argparse
# action is not applied to default values (matches du_parser).
ls_parser = argparse.ArgumentParser(
    description='List files and directories in a Ceph File System directory.')
ls_parser.add_argument('-l', '--long', action='store_true',
                       help='Detailed list of items in the directory.')
ls_parser.add_argument('-r', '--reverse', action='store_true',
                       help='Reverse order of listing items in the directory.')
ls_parser.add_argument('-H', action='store_true', help='Human Readable')
ls_parser.add_argument('-a', '--all', action='store_true',
                       help='Do not Ignore entries starting with .')
ls_parser.add_argument('-S', action='store_true', help='Sort by file_size')
ls_parser.add_argument('paths', help='Name of Directories',
                       action=path_to_bytes, nargs='*', default=[b'.'])
+
@with_argparser(ls_parser)
def do_ls(self, args):
    """
    List all the files and directories in the current working directory
    """
    paths = args.paths
    for path in paths:
        values = []
        items = []
        try:
            if path.count(b'*') > 0:
                # Glob handling: expand the pattern, list matching files
                # here and queue matching directories for later passes
                # (the loop iterates over `paths` while extending it).
                all_items = get_all_possible_paths(path)
                if len(all_items) == 0:
                    continue
                path = all_items[0].rsplit(b'/', 1)[0]
                if path == b'':
                    path = b'/'
                dirs = []
                for i in all_items:
                    for item in ls(path):
                        d_name = item.d_name
                        if os.path.basename(i) == d_name:
                            if item.is_dir():
                                dirs.append(os.path.join(path, d_name))
                            else:
                                items.append(item)
                if dirs:
                    paths.extend(dirs)
                else:
                    poutput(path.decode('utf-8'), end=':\n')
                items = sorted(items, key=lambda item: item.d_name)
            else:
                # Print a "<dir>:" header only when listing several paths.
                if path != b'' and path != cephfs.getcwd() and len(paths) > 1:
                    poutput(path.decode('utf-8'), end=':\n')
                items = sorted(ls(path), key=lambda item: item.d_name)
            if not args.all:
                # Hide dotfiles unless -a/--all was given.
                items = [i for i in items if not i.d_name.startswith(b'.')]
            if args.S:
                # Sort by size; stat() follows symlinks except for links.
                items = sorted(items, key=lambda item: cephfs.stat(
                    path + b'/' + item.d_name, follow_symlink=(
                        not item.is_symbol_file())).st_size)
            if args.reverse:
                items = reversed(items)
            for item in items:
                filepath = item.d_name
                is_dir = item.is_dir()
                is_sym_lnk = item.is_symbol_file()
                try:
                    if args.long and args.H:
                        print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir,
                                   is_sym_lnk, True)
                    elif args.long:
                        print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir,
                                   is_sym_lnk, False)
                    elif is_sym_lnk or is_dir:
                        values.append(style_listing(filepath.decode('utf-8'), is_dir,
                                      is_sym_lnk))
                    else:
                        values.append(filepath)
                except libcephfs.Error as e:
                    set_exit_code_msg(msg=e)
            if not args.long:
                # Short format: columnize to the terminal width.
                print_list(values, shutil.get_terminal_size().columns)
            if path != paths[-1]:
                poutput('')
        except libcephfs.Error as e:
            set_exit_code_msg(msg=e)
+
def complete_rmdir(self, text, line, begidx, endidx):
    """Tab-complete remote directory names for the rmdir command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `rmdir` command.
rmdir_parser = argparse.ArgumentParser(description='Remove Directory.')
rmdir_parser.add_argument('paths', action=path_to_bytes, nargs='+',
                          help='Directory Path.')
rmdir_parser.add_argument('-p', '--parent', action='store_true',
                          help='Remove parent directories as necessary. '
                               'When this option is specified, no error '
                               'is reported if a directory has any '
                               'sub-directories, files')
+
@with_argparser(rmdir_parser)
def do_rmdir(self, args):
    """Remove directory(ies); the real work happens in do_rmdir_helper."""
    self.do_rmdir_helper(args)
+
def do_rmdir_helper(self, args):
    """
    Remove a specific Directory

    Shared by do_rmdir and do_snap (snapshot deletion).  Handles '*'
    glob patterns by expanding them and queueing the matched directories
    onto the list being iterated.
    """
    is_pattern = False
    paths = args.paths
    for path in paths:
        if path.count(b'*') > 0:
            is_pattern = True
            all_items = get_all_possible_paths(path)
            if len(all_items) > 0:
                path = all_items[0].rsplit(b'/', 1)[0]
                if path == b'':
                    path = b'/'
                dirs = []
                for i in all_items:
                    for item in ls(path):
                        d_name = item.d_name
                        if os.path.basename(i) == d_name:
                            if item.is_dir():
                                dirs.append(os.path.join(path, d_name))
                # Queue matched directories; they get removed on later
                # iterations of the outer loop.
                paths.extend(dirs)
                continue
        else:
            is_pattern = False

        if args.parent:
            # Remove the whole tree, deepest entries first.
            path = os.path.join(cephfs.getcwd(), path.rsplit(b'/')[0])
            files = list(sorted(set(dirwalk(path)), reverse=True))
            if not files:
                # Nothing under the path: fall through to the plain
                # rmdir below with a no-op path.
                path = b'.'
            for filepath in files:
                try:
                    cephfs.rmdir(os.path.normpath(filepath))
                except libcephfs.Error as e:
                    perror(e)
                    path = b'.'
                    break
        else:
            path = os.path.normpath(os.path.join(cephfs.getcwd(), path))
        # normpath(b'') == b'.', so this skips the sentinel set above.
        if not is_pattern and path != os.path.normpath(b''):
            try:
                cephfs.rmdir(path)
            except libcephfs.Error as e:
                set_exit_code_msg(msg=e)
+
def complete_rm(self, text, line, begidx, endidx):
    """Tab-complete remote file names for the rm command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `rm` command.
rm_parser = argparse.ArgumentParser(description='Remove File.')
rm_parser.add_argument('paths', action=path_to_bytes, nargs='+',
                       help='File Path.')
+
@with_argparser(rm_parser)
def do_rm(self, args):
    """
    Remove a specific file
    """
    file_paths = args.paths
    for path in file_paths:
        if path.count(b'*') > 0:
            # Glob: append the matching files to the list being
            # iterated; they are unlinked on later iterations.
            file_paths.extend([i for i in get_all_possible_paths(
                path) if is_file_exists(i)])
        else:
            try:
                cephfs.unlink(path)
            except libcephfs.Error as e:
                # NOTE: perhaps we need a better msg here
                set_exit_code_msg(msg=e)
+
def complete_mv(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the mv command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `mv` command.
mv_parser = argparse.ArgumentParser(description='Move File.')
mv_parser.add_argument('src_path', action=path_to_bytes, type=str,
                       help='Source File Path.')
mv_parser.add_argument('dest_path', action=path_to_bytes, type=str,
                       help='Destination File Path.')
+
@with_argparser(mv_parser)
def do_mv(self, args):
    """
    Rename a file or Move a file from source path to the destination
    """
    try:
        cephfs.rename(args.src_path, args.dest_path)
    except libcephfs.Error as e:
        # Consistent with the other command handlers: report the error
        # here instead of relying on the generic onecmd() catch-all.
        set_exit_code_msg(msg=e)
+
def complete_cd(self, text, line, begidx, endidx):
    """Tab-complete remote directory names for the cd command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `cd` command; defaults to the root directory.
cd_parser = argparse.ArgumentParser(description='Change working directory')
cd_parser.add_argument('path', action=path_to_bytes, type=str, nargs='?',
                       default='/', help='Name of the directory.')
+
@with_argparser(cd_parser)
def do_cd(self, args):
    """
    Change working directory
    """
    try:
        cephfs.chdir(args.path)
    except libcephfs.Error as e:
        # Report the failure like the other commands instead of letting
        # the exception escape to the generic onecmd() handler; the
        # prompt/working_dir are only updated on success (as before).
        set_exit_code_msg(msg=e)
        return
    self.working_dir = cephfs.getcwd().decode('utf-8')
    self.set_prompt()
+
def do_cwd(self, arglist):
    """
    Get current working directory.
    """
    cwd = cephfs.getcwd()
    poutput(cwd.decode('utf-8'))
+
def complete_chmod(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the chmod command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# BUG FIX: the description previously read 'Create Directory.' — a
# copy-paste from the mkdir parser.
chmod_parser = argparse.ArgumentParser(
    description='Change the permission mode of file(s).')
chmod_parser.add_argument('mode', type=str, action=ModeAction, help='Mode')
chmod_parser.add_argument('paths', type=str, action=path_to_bytes,
                          help='Name of the file', nargs='+')
+
@with_argparser(chmod_parser)
def do_chmod(self, args):
    """
    Change permission of a file
    """
    # The mode string is command-wide; parse it once rather than once
    # per path inside the loop.
    mode = int(args.mode, base=8)
    for path in args.paths:
        try:
            cephfs.chmod(path, mode)
        except libcephfs.Error as e:
            set_exit_code_msg(msg=e)
+
def complete_cat(self, text, line, begidx, endidx):
    """Tab-complete remote file names for the cat command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# BUG FIX: the description was previously empty.
cat_parser = argparse.ArgumentParser(
    description='Print the contents of file(s).')
cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes,
                        nargs='+')
+
@with_argparser(cat_parser)
def do_cat(self, args):
    """
    Print contents of a file
    """
    for path in args.paths:
        if not is_file_exists(path):
            set_exit_code_msg(errno.ENOENT, '{}: no such file'.format(
                path.decode('utf-8')))
        else:
            # '-' destination streams the file to stdout.
            copy_to_local(path, b'-')
+
# Argument parser for the `umask` command; an empty mode means "show".
umask_parser = argparse.ArgumentParser(description='Set umask value.')
umask_parser.add_argument('mode', action=ModeAction, type=str, nargs='?',
                          default='', help='Mode')
+
@with_argparser(umask_parser)
def do_umask(self, args):
    """
    Set Umask value.
    """
    if args.mode != '':
        # Apply the new mask and remember it as a bare octal string.
        new_mask = int(args.mode, 8)
        self.umask = str(oct(cephfs.umask(new_mask))[2:])
    else:
        # No argument: display the current mask, zero-padded.
        poutput(self.umask.zfill(4))
+
def complete_write(self, text, line, begidx, endidx):
    """Tab-complete remote file names for the write command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `write` command.
write_parser = argparse.ArgumentParser(description='Writes data into a file')
write_parser.add_argument('path', action=path_to_bytes, type=str,
                          help='Name of File')
+
@with_argparser(write_parser)
def do_write(self, args):
    """
    Write data into a file.
    """
    # '-' as the source means "read the data from stdin".
    copy_from_local(b'-', args.path)
+
def complete_lcd(self, text, line, begidx, endidx):
    """Tab-complete local directory names for the lcd command."""
    return self.index_based_complete(text, line, begidx, endidx,
                                    {1: self.path_complete})
+
# BUG FIX: the description was previously empty.
lcd_parser = argparse.ArgumentParser(
    description='Change the current local working directory.')
lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path')
+
@with_argparser(lcd_parser)
def do_lcd(self, args):
    """
    Moves into the given local directory
    """
    try:
        # expanduser() handles a leading '~' in the (bytes) path.
        os.chdir(os.path.expanduser(args.path))
    except OSError as e:
        # NOTE(review): assumes e.filename is bytes and non-None; an
        # OSError without a filename would raise here — confirm.
        set_exit_code_msg(e.errno, "Cannot change to "
                          f"{e.filename.decode('utf-8')}: {e.strerror}")
+
def complete_lls(self, text, line, begidx, endidx):
    """Tab-complete local path names for the lls command."""
    return self.index_based_complete(text, line, begidx, endidx,
                                    {1: self.path_complete})
+
# Argument parser for the `lls` (local listing) command.
lls_parser = argparse.ArgumentParser(
    description='List files in local system.')
lls_parser.add_argument('paths', action=path_to_bytes, nargs='*',
                        help='Paths')
+
@with_argparser(lls_parser)
def do_lls(self, args):
    """
    Lists all files and folders in the current local directory
    """
    if not args.paths:
        print_list(os.listdir(os.getcwdb()))
    else:
        for path in args.paths:
            try:
                items = os.listdir(path)
                poutput("{}:".format(path.decode('utf-8')))
                print_list(items)
            except OSError as e:
                set_exit_code_msg(e.errno, f"{e.filename.decode('utf-8')}: "
                                  f"{e.strerror}")
    # Arguments to the with_argparser decorator function are sticky.
    # The items in args.paths do not get overwritten in subsequent calls.
    # The arguments remain in args.paths after the function exits and we
    # need to clean it up to ensure the next call works as expected.
    args.paths.clear()
+
def do_lpwd(self, arglist):
    """
    Prints the absolute path of the current local directory
    """
    local_cwd = os.getcwd()
    poutput(local_cwd)
+
def complete_df(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the df command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# `file` defaults to the CWD; the default is kept as *bytes* so that the
# `b'.' == arglist.file[0]` check in do_df() matches even when argparse
# does not run the path_to_bytes action over the default value.
df_parser = argparse.ArgumentParser(
    description='Show information about the amount of available disk space')
df_parser.add_argument('file', help='Name of the file', nargs='*',
                       default=[b'.'], action=path_to_bytes)
+
@with_argparser(df_parser)
def do_df(self, arglist):
    """
    Display the amount of available disk space for file systems
    """
    header = True  # Set to true for printing header only once
    if b'.' == arglist.file[0]:
        # Default argument: report every entry in the CWD instead.
        arglist.file = ls(b'.')

    for file in arglist.file:
        if isinstance(file, libcephfs.DirEntry):
            file = file.d_name
        if file == b'.' or file == b'..':
            continue
        try:
            statfs = cephfs.statfs(file)
            stat = cephfs.stat(file)
            # Total capacity in 1K blocks.
            block_size = (statfs['f_blocks'] * statfs['f_bsize']) // 1024
            # NOTE(review): "available" is capacity minus this entry's
            # st_size, and Use% is st_size/capacity — this mirrors the
            # entry's own usage rather than filesystem free space; the
            # semantics look intentional but unusual — confirm.
            available = block_size - stat.st_size
            use = 0

            if block_size > 0:
                use = (stat.st_size * 100) // block_size

            if header:
                header = False
                poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format(
                    "1K-blocks", "Used", "Available", "Use%",
                    "Stored on"))

            poutput('{:d}\t{:18d}\t{:8d}\t{:10s} {}'.format(block_size,
                    stat.st_size, available, str(int(use)) + '%',
                    file.decode('utf-8')))
        except libcephfs.OSError as e:
            set_exit_code_msg(e.get_error_code(), "could not statfs {}: {}".format(
                file.decode('utf-8'), e.strerror))
+
# Argument parser for the `locate` command.
locate_parser = argparse.ArgumentParser(
    description='Find file within file system')
locate_parser.add_argument('name', action=path_to_bytes, type=str,
                           help='name')
locate_parser.add_argument('-c', '--count', action='store_true',
                           help='Count list of items located.')
locate_parser.add_argument('-i', '--ignorecase', action='store_true',
                           help='Ignore case')
+
@with_argparser(locate_parser)
def do_locate(self, args):
    """
    Find a file within the File System
    """
    if args.name.count(b'*') == 1:
        # BUG FIX: indexing bytes yields an *int*, so the previous
        # comparisons `args.name[0] == b'*'` (int vs bytes) and
        # `args.name[-1] == '*'` (int vs str) were always False and
        # both branches were dead.  One-byte slices compare correctly.
        if args.name[:1] == b'*':
            args.name += b'/'
        elif args.name[-1:] == b'*':
            args.name = b'/' + args.name
        args.name = args.name.replace(b'*', b'')
    if args.ignorecase:
        locations = locate_file(args.name, False)
    else:
        locations = locate_file(args.name)
    if args.count:
        poutput(len(locations))
    else:
        poutput((b'\n'.join(locations)).decode('utf-8'))
+
def complete_du(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the du command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `du` command; paths default to the CWD.
du_parser = argparse.ArgumentParser(
    description='Disk Usage of a Directory')
du_parser.add_argument('paths', action=get_list_of_bytes_path, type=str,
                       nargs='*', default=[b'.'],
                       help='Name of the directory.')
du_parser.add_argument('-r', action='store_true',
                       help='Recursive Disk usage of all directories.')
+
@with_argparser(du_parser)
def do_du(self, args):
    """
    Print disk usage of a given path(s).
    """
    def print_disk_usage(files):
        # Accept a single bytes path or an iterable of them.
        if isinstance(files, bytes):
            files = (files, )

        for f in files:
            try:
                st = cephfs.lstat(f)

                if stat.S_ISDIR(st.st_mode):
                    # Recursive byte count maintained by CephFS.
                    dusage = int(cephfs.getxattr(f,
                                 'ceph.dir.rbytes').decode('utf-8'))
                else:
                    dusage = st.st_size

                # print path in local context
                f = os.path.normpath(f)
                # BUG FIX: previously `if f[0] is ord('/')` — an *identity*
                # comparison between ints that only worked because CPython
                # caches small integers.  Use a proper bytes prefix test.
                if f.startswith(b'/'):
                    f = b'.' + f
                poutput('{:10s} {}'.format(humansize(dusage),
                        f.decode('utf-8')))
            except libcephfs.Error as e:
                set_exit_code_msg(msg=e)
                continue

    for path in args.paths:
        if args.r:
            print_disk_usage(sorted(set(dirwalk(path)).union({path})))
        else:
            print_disk_usage(path)
+
# Argument parser for the `quota` command.
quota_parser = argparse.ArgumentParser(
    description='Quota management for a Directory')
quota_parser.add_argument('op', choices=['get', 'set'],
                          help='Quota operation type.')
quota_parser.add_argument('path', action=path_to_bytes, type=str,
                          help='Name of the directory.')
quota_parser.add_argument('--max_bytes', type=int, nargs='?', default=-1,
                          help='Max cumulative size of the data under '
                               'this directory.')
quota_parser.add_argument('--max_files', type=int, nargs='?', default=-1,
                          help='Total number of files under this '
                               'directory tree.')
+
@with_argparser(quota_parser)
def do_quota(self, args):
    """
    Quota management.

    `set` stores quota limits as the 'ceph.quota.max_bytes' /
    'ceph.quota.max_files' xattrs on the directory; `get` reads them.
    """
    if not is_dir_exists(args.path):
        set_exit_code_msg(errno.ENOENT, 'error: no such directory {}'.format(
            args.path.decode('utf-8')))
        return

    if args.op == 'set':
        if (args.max_bytes == -1) and (args.max_files == -1):
            set_exit_code_msg(errno.EINVAL, 'please specify either '
                              '--max_bytes or --max_files or both')
            return

        if args.max_bytes >= 0:
            max_bytes = to_bytes(str(args.max_bytes))
            try:
                # XATTR_CREATE fails if the attribute already exists;
                # in that case fall back to replacing the existing value.
                cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
                                max_bytes, os.XATTR_CREATE)
                poutput('max_bytes set to %d' % args.max_bytes)
            except libcephfs.Error as e:
                cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
                                max_bytes, os.XATTR_REPLACE)
                set_exit_code_msg(e.get_error_code(), 'max_bytes reset to '
                                  f'{args.max_bytes}')

        if args.max_files >= 0:
            max_files = to_bytes(str(args.max_files))
            try:
                cephfs.setxattr(args.path, 'ceph.quota.max_files',
                                max_files, os.XATTR_CREATE)
                poutput('max_files set to %d' % args.max_files)
            except libcephfs.Error as e:
                cephfs.setxattr(args.path, 'ceph.quota.max_files',
                                max_files, os.XATTR_REPLACE)
                set_exit_code_msg(e.get_error_code(), 'max_files reset to '
                                  f'{args.max_files}')
    elif args.op == 'get':
        max_bytes = '0'
        max_files = '0'
        try:
            max_bytes = cephfs.getxattr(args.path, 'ceph.quota.max_bytes')
            poutput('max_bytes: {}'.format(max_bytes.decode('utf-8')))
        except libcephfs.Error as e:
            # Missing xattr simply means no quota is configured.
            set_exit_code_msg(e.get_error_code(), 'max_bytes is not set')

        try:
            max_files = cephfs.getxattr(args.path, 'ceph.quota.max_files')
            poutput('max_files: {}'.format(max_files.decode('utf-8')))
        except libcephfs.Error as e:
            set_exit_code_msg(e.get_error_code(), 'max_files is not set')
+
# Argument parser for the `snap` command.
snap_parser = argparse.ArgumentParser(description='Snapshot Management')
snap_parser.add_argument('op', type=str,
                         help='Snapshot operation: create or delete')
snap_parser.add_argument('name', action=path_to_bytes, type=str,
                         help='Name of snapshot')
snap_parser.add_argument('dir', action=path_to_bytes, type=str,
                         help='Directory for which snapshot '
                              'needs to be created or deleted')
+
@with_argparser(snap_parser)
def do_snap(self, args):
    """
    Snapshot management for the volume

    Snapshots are directories under the (configurable) '.snap'
    directory; `create` mkdirs one, `delete` removes it recursively.
    """
    # setting self.colors to None turns off colorizing and
    # perror emits plain text
    self.colors = None

    snapdir = '.snap'
    # Honor a non-default snapshot directory name if configured.
    conf_snapdir = cephfs.conf_get('client_snapdir')
    if conf_snapdir is not None:
        snapdir = conf_snapdir
    snapdir = to_bytes(snapdir)
    if args.op == 'create':
        try:
            if is_dir_exists(args.dir):
                cephfs.mkdir(os.path.join(args.dir, snapdir, args.name), 0o755)
            else:
                set_exit_code_msg(errno.ENOENT, "'{}': no such directory".format(
                    args.dir.decode('utf-8')))
        except libcephfs.Error as e:
            # NOTE(review): any mkdir failure is reported as "already
            # exists" — presumably EEXIST is the dominant case; confirm.
            set_exit_code_msg(e.get_error_code(),
                              "snapshot '{}' already exists".format(
                                  args.name.decode('utf-8')))
    elif args.op == 'delete':
        snap_dir = os.path.join(args.dir, snapdir, args.name)
        try:
            if is_dir_exists(snap_dir):
                # Reuse the rmdir machinery with a synthetic namespace.
                newargs = argparse.Namespace(paths=[snap_dir], parent=False)
                self.do_rmdir_helper(newargs)
            else:
                set_exit_code_msg(errno.ENOENT, "'{}': no such snapshot".format(
                    args.name.decode('utf-8')))
        except libcephfs.Error as e:
            set_exit_code_msg(e.get_error_code(), "error while deleting "
                              "'{}'".format(snap_dir.decode('utf-8')))
    else:
        set_exit_code_msg(errno.EINVAL, "snapshot can only be created or "
                          "deleted; check - help snap")
+
def do_help(self, line):
    """
    Get details about a command.
    Usage: help <cmd> - for a specific command
    help all - for all the commands
    """
    if line == 'all':
        # Walk every do_* handler and print its help, separated by rulers.
        for attr in dir(self):
            if not attr.startswith('do_'):
                continue
            poutput('-' * 80)
            super().do_help(attr[3:])
        return
    parser = self.create_argparser(line)
    if parser is not None:
        parser.print_help()
    else:
        super().do_help(line)
+
def complete_stat(self, text, line, begidx, endidx):
    """Tab-complete remote path names for the stat command."""
    return self.complete_filenames(text, line, begidx, endidx)
+
# Argument parser for the `stat` command.
stat_parser = argparse.ArgumentParser(
    description='Display file or file system status')
stat_parser.add_argument('paths', action=path_to_bytes, type=str, nargs='+',
                         help='file paths')
+
@with_argparser(stat_parser)
def do_stat(self, args):
    """
    Display file or file system status
    """
    for path in args.paths:
        try:
            # Renamed from `stat` to `st` so the local no longer shadows
            # the `stat` module used elsewhere in this file.
            st = cephfs.stat(path)
            atime = st.st_atime.isoformat(' ')
            mtime = st.st_mtime.isoformat(' ')
            # BUG FIX: the change time was previously computed from
            # st_mtime, so "Change:" always duplicated "Modify:".
            ctime = st.st_ctime.isoformat(' ')

            poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n"
                    "Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: "
                    "{:o}/{}\tUid: {:d}\tGid: {:d}\nAccess: {}\nModify: "
                    "{}\nChange: {}".format(path.decode('utf-8'),
                                            st.st_size, st.st_blocks,
                                            st.st_blksize, st.st_dev,
                                            st.st_ino, st.st_nlink,
                                            st.st_mode,
                                            mode_notation(st.st_mode),
                                            st.st_uid, st.st_gid, atime,
                                            mtime, ctime))
        except libcephfs.Error as e:
            set_exit_code_msg(msg=e)
+
# Argument parser for the `setxattr` command.
setxattr_parser = argparse.ArgumentParser(
    description='Set extended attribute for a file')
setxattr_parser.add_argument('path', action=path_to_bytes, type=str,
                             help='Name of the file')
setxattr_parser.add_argument('name', type=str,
                             help='Extended attribute name')
setxattr_parser.add_argument('value', type=str,
                             help='Extended attribute value')
+
@with_argparser(setxattr_parser)
def do_setxattr(self, args):
    """
    Set extended attribute for a file
    """
    val_bytes = to_bytes(args.value)
    name_bytes = to_bytes(args.name)
    try:
        # XATTR_CREATE fails if the attribute already exists ...
        cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_CREATE)
        poutput('{} is successfully set to {}'.format(args.name, args.value))
    except libcephfs.ObjectExists:
        # ... in which case overwrite the existing value instead.
        cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_REPLACE)
        poutput('{} is successfully reset to {}'.format(args.name, args.value))
    except libcephfs.Error as e:
        set_exit_code_msg(msg=e)
+
# Argument parser for the `getxattr` command.
getxattr_parser = argparse.ArgumentParser(
    description='Get extended attribute set for a file')
getxattr_parser.add_argument('path', action=path_to_bytes, type=str,
                             help='Name of the file')
getxattr_parser.add_argument('name', type=str,
                             help='Extended attribute name')
+
@with_argparser(getxattr_parser)
def do_getxattr(self, args):
    """
    Get extended attribute for a file
    """
    try:
        value = cephfs.getxattr(args.path, to_bytes(args.name))
        poutput('{}'.format(value.decode('utf-8')))
    except libcephfs.Error as e:
        set_exit_code_msg(msg=e)
+
# Argument parser for the `listxattr` command.
listxattr_parser = argparse.ArgumentParser(
    description='List extended attributes set for a file')
listxattr_parser.add_argument('path', action=path_to_bytes, type=str,
                              help='Name of the file')
+
@with_argparser(listxattr_parser)
def do_listxattr(self, args):
    """
    List extended attributes for a file
    """
    try:
        size, xattr_list = cephfs.listxattr(args.path)
        if size <= 0:
            poutput('No extended attribute is set')
        else:
            # Entries are NUL-separated; display them space-separated.
            poutput('{}'.format(xattr_list.replace(b'\x00', b' ').decode('utf-8')))
    except libcephfs.Error as e:
        set_exit_code_msg(msg=e)
+
+
+#######################################################
+#
+# Following are methods that get cephfs-shell started.
+#
+#####################################################
+
def setup_cephfs():
    """
    Mounting a cephfs

    Initializes the module-global `cephfs` handle; exits the process on
    failure with the underlying error code.
    """
    global cephfs
    try:
        cephfs = libcephfs.LibCephFS(conffile='')
        cephfs.mount()
    except libcephfs.ObjectNotFound as e:
        # BUG FIX: message previously read "couldn't find ceph
        # configuration not found" (two phrasings merged together).
        print('ceph configuration not found')
        sys.exit(e.get_error_code())
    except libcephfs.Error as e:
        print(e)
        sys.exit(e.get_error_code())
+
+
def str_to_bool(val):
    """
    Map strings like 'true'/'yes' to True and 'false'/'no' to False
    (case-insensitively).  Non-strings are returned unchanged; other
    strings are returned with newlines stripped.
    """
    if not isinstance(val, str):
        return val

    stripped = val.replace('\n', '')
    lowered = stripped.lower()
    if lowered in ('true', 'yes'):
        return True
    if lowered in ('false', 'no'):
        return False
    return stripped
+
+
def read_shell_conf(shell, shell_conf_file):
    # Apply settings from the cephfs-shell config file onto the shell
    # object.  Which option names are recognized depends on the
    # installed cmd2 version.
    import configparser

    sec = 'cephfs-shell'
    opts = []
    if LooseVersion(cmd2_version) >= LooseVersion("0.10.0"):
        # cmd2 >= 0.10.0 exposes its settings through `settables`.
        for attr in shell.settables.keys():
            opts.append(attr)
    else:
        if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"):
            # hardcoding options for 0.7.9 because -
            # 1. we use cmd2 v0.7.9 with teuthology and
            # 2. there's no way distinguish between a shell setting and shell
            # object attribute until v0.10.0
            opts = ['abbrev', 'autorun_on_edit', 'colors',
                    'continuation_prompt', 'debug', 'echo', 'editor',
                    'feedback_to_output', 'locals_in_py', 'prompt', 'quiet',
                    'timing']
        elif LooseVersion(cmd2_version) >= LooseVersion("0.9.23"):
            # NOTE(review): for 0.9.23 <= version < 0.10.0 only
            # 'allow_style' ends up in opts (the hardcoded list above is
            # not reused) — looks intentional but confirm.
            opts.append('allow_style')
        # no equivalent option was defined by cmd2.
        else:
            pass

    # default and only section in our conf file.
    cp = configparser.ConfigParser(default_section=sec, strict=False)
    cp.read(shell_conf_file)
    for opt in opts:
        if cp.has_option(sec, opt):
            setattr(shell, opt, str_to_bool(cp.get(sec, opt)))
+
+
def get_shell_conffile_path(arg_conf=''):
    """
    Return the first existing cephfs-shell.conf path, or '' if none exists.

    Lookup priority: explicit argument, then $CEPHFS_SHELL_CONF, then
    ~/.cephfs-shell.conf.
    """
    conf_filename = 'cephfs-shell.conf'
    env_var = 'CEPHFS_SHELL_CONF'

    candidates = (
        arg_conf if arg_conf else '',
        os.environ.get(env_var, ''),
        os.path.expanduser('~/.' + conf_filename),
    )
    for candidate in candidates:
        if os.path.isfile(candidate):
            return candidate
    return ''
+
+
def manage_args():
    """
    Parse the command line and decide how the shell should run.

    Returns the argparse namespace augmented with an 'exe_and_quit' flag:
    True when commands were passed on the command line and should be
    executed before quitting instead of launching an interactive session.
    """
    main_parser = argparse.ArgumentParser(description='')
    main_parser.add_argument('-c', '--config', action='store',
                             help='Path to Ceph configuration file.',
                             type=str)
    main_parser.add_argument('-b', '--batch', action='store',
                             help='Path to CephFS shell script/batch file'
                             'containing CephFS shell commands',
                             type=str)
    main_parser.add_argument('-t', '--test', action='store',
                             help='Test against transcript(s) in FILE',
                             nargs='+')
    main_parser.add_argument('commands', nargs='*', help='Comma delimited '
                             'commands. The shell executes the given command '
                             'and quits immediately with the return value of '
                             'command. In case no commands are provided, the '
                             'shell is launched.', default=[])

    args = main_parser.parse_args()
    args.exe_and_quit = False  # Execute and quit, don't launch the shell.

    if args.batch:
        # cmd2 renamed 'load' to 'run_script' after 0.9.13.
        if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"):
            args.commands = ['load ' + args.batch, ',quit']
        else:
            args.commands = ['run_script ' + args.batch, ',quit']
    if args.test:
        # commands are comma-delimited downstream; keep the separators.
        args.commands.extend(['-t,'] + [arg + ',' for arg in args.test])
    if not args.batch and len(args.commands) > 0:
        args.exe_and_quit = True

    # rewrite sys.argv for cmd2 and mount the filesystem.
    manage_sys_argv(args)

    return args
+
+
def manage_sys_argv(args):
    """
    Rewrite sys.argv with the comma-separated commands from args, then
    mount the filesystem for the upcoming shell session.
    """
    prog = sys.argv[0]
    del sys.argv[:]
    sys.argv.append(prog)
    joined = ' '.join(args.commands)
    sys.argv.extend(part.strip() for part in joined.split(','))

    setup_cephfs()
+
+
def execute_cmd_args(args):
    """
    Launch a shell session if no arguments were passed, else just execute
    the given argument as a shell command and exit the shell session
    immediately at (last) command's termination with the (last) command's
    return value.
    """
    if args.exe_and_quit:
        return execute_cmds_and_quit(args)
    return shell.cmdloop()
+
+
def execute_cmds_and_quit(args):
    """
    Multiple commands might be passed separated by commas, feed onecmd()
    one command at a time.

    Returns the first failing command's return value (or exit code), or
    the result of the final command.
    """
    # do_* methods triggered by cephfs-shell commands return None when they
    # complete running successfully. Until 0.9.6, shell.onecmd() returned this
    # value to indicate whether the execution of the commands should stop, but
    # since 0.9.7 it returns the return value of do_* methods only if it's
    # not None. When it is None it returns False instead of None.
    if LooseVersion(cmd2_version) <= LooseVersion("0.9.6"):
        stop_exec_val = None
    else:
        stop_exec_val = False

    args_to_onecmd = ''
    if len(args.commands) <= 1:
        # a single string may still contain several space-separated words.
        args.commands = args.commands[0].split(' ')
    for cmdarg in args.commands:
        if ',' in cmdarg:
            # a trailing comma terminates one command; strip it and run.
            args_to_onecmd += ' ' + cmdarg[0:-1]
            onecmd_retval = shell.onecmd(args_to_onecmd)
            # if the current command failed, let's abort the execution of
            # series of commands passed.
            if onecmd_retval is not stop_exec_val:
                return onecmd_retval
            if shell.exit_code != 0:
                return shell.exit_code

            args_to_onecmd = ''
            continue

        args_to_onecmd += ' ' + cmdarg
    # run whatever words accumulated after the last comma.
    return shell.onecmd(args_to_onecmd)
+
+
if __name__ == '__main__':
    # parse CLI args, rewrite sys.argv and mount the filesystem.
    args = manage_args()

    shell = CephFSShell()
    # TODO: perhaps, we should add an option to pass ceph.conf?
    read_shell_conf(shell, get_shell_conffile_path(args.config))
    # XXX: setting shell.exit_code to zero so that in case there are no errors
    # and exceptions, it is not set by any method or function of cephfs-shell
    # and return values from shell.cmdloop() or shell.onecmd() is not an
    # integer, we can treat it as the return value of cephfs-shell.
    shell.exit_code = 0

    retval = execute_cmd_args(args)
    # prefer the command's own return value; fall back to the shell's
    # accumulated exit code when the command returned a falsy value.
    sys.exit(retval if retval else shell.exit_code)
diff --git a/src/tools/cephfs/shell/setup.py b/src/tools/cephfs/shell/setup.py
new file mode 100644
index 000000000..8cf7f28f7
--- /dev/null
+++ b/src/tools/cephfs/shell/setup.py
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

from setuptools import setup

# package version; bumped manually on release.
__version__ = '0.0.1'

# setuptools packaging for the cephfs-shell console script.
setup(
    name='cephfs-shell',
    version=__version__,
    description='Interactive shell for Ceph file system',
    keywords='cephfs, shell',
    scripts=['cephfs-shell'],
    install_requires=[
        'cephfs',
        'cmd2',
        'colorama',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Environment :: Console',
        'Intended Audience :: System Administrators',
        'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python :: 3'
    ],
    license='LGPLv2+',
)
diff --git a/src/tools/cephfs/shell/tox.ini b/src/tools/cephfs/shell/tox.ini
new file mode 100644
index 000000000..c1cbff051
--- /dev/null
+++ b/src/tools/cephfs/shell/tox.ini
@@ -0,0 +1,7 @@
# Tox configuration: lint the cephfs-shell script with flake8 on Python 3.
[tox]
envlist = py3
skipsdist = true

[testenv:py3]
deps = flake8
# W503 (line break before binary operator) conflicts with W504; skip it.
commands = flake8 --ignore=W503 --max-line-length=100 cephfs-shell
diff --git a/src/tools/cephfs/top/CMakeLists.txt b/src/tools/cephfs/top/CMakeLists.txt
new file mode 100644
index 000000000..49750c850
--- /dev/null
+++ b/src/tools/cephfs/top/CMakeLists.txt
@@ -0,0 +1,7 @@
# Install the cephfs-top script via the distutils helper.
include(Distutils)
distutils_install_module(cephfs-top)

# Wire the tox (flake8) run into the test suite when tests are enabled.
if(WITH_TESTS)
  include(AddCephTest)
  add_tox_test(cephfs-top)
endif()
diff --git a/src/tools/cephfs/top/cephfs-top b/src/tools/cephfs/top/cephfs-top
new file mode 100755
index 000000000..d57c3ab83
--- /dev/null
+++ b/src/tools/cephfs/top/cephfs-top
@@ -0,0 +1,888 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+import curses
+import errno
+import json
+import signal
+import time
+import math
+import threading
+
+from collections import OrderedDict
+from datetime import datetime
+from enum import Enum, unique
+
+import rados
+
+
class FSTopException(Exception):
    """Fstop-level failure carrying a human-readable message.

    Callers read the message back through get_error_msg() rather than
    str(exc).
    """

    def __init__(self, msg=''):
        self.error_msg = msg

    def get_error_msg(self):
        """Return the message this exception was raised with."""
        return self.error_msg
+
+
@unique
class MetricType(Enum):
    """Display category of a counter; selects the unit suffix and the
    formatting helper used when rendering its value."""
    METRIC_TYPE_NONE = 0
    METRIC_TYPE_PERCENTAGE = 1
    METRIC_TYPE_LATENCY = 2
    METRIC_TYPE_SIZE = 3
    METRIC_TYPE_STDEV = 4
+
+
FS_TOP_PROG_STR = 'cephfs-top'
# identifiers for the two screens: all-filesystems vs a selected filesystem.
FS_TOP_ALL_FS_APP = 'ALL_FS_APP'
FS_TOP_FS_SELECTED_APP = 'SELECTED_FS_APP'

# version match b/w fstop and stats emitted by mgr/stats
FS_TOP_SUPPORTED_VER = 2

ITEMS_PAD_LEN = 3
ITEMS_PAD = " " * ITEMS_PAD_LEN
DEFAULT_REFRESH_INTERVAL = 1
# min refresh interval allowed
MIN_REFRESH_INTERVAL = 0.5

# metadata provided by mgr/stats
FS_TOP_MAIN_WINDOW_COL_CLIENT_ID = "client_id"
FS_TOP_MAIN_WINDOW_COL_MNT_ROOT = "mount_root"
FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR = "mount_point@host/addr"

MAIN_WINDOW_TOP_LINE_ITEMS_START = [ITEMS_PAD,
                                    FS_TOP_MAIN_WINDOW_COL_CLIENT_ID,
                                    FS_TOP_MAIN_WINDOW_COL_MNT_ROOT]
MAIN_WINDOW_TOP_LINE_ITEMS_END = [FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR]

# metrics that are no longer displayed as dedicated columns.
MAIN_WINDOW_TOP_LINE_METRICS_LEGACY = ["READ_LATENCY",
                                       "WRITE_LATENCY",
                                       "METADATA_LATENCY"
                                       ]

# adjust this map according to stats version and maintain order
# as emitted by mgr/stats
MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([
    ("CAP_HIT", MetricType.METRIC_TYPE_PERCENTAGE),
    ("READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
    ("WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
    ("METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
    ("DENTRY_LEASE", MetricType.METRIC_TYPE_PERCENTAGE),
    ("OPENED_FILES", MetricType.METRIC_TYPE_NONE),
    ("PINNED_ICAPS", MetricType.METRIC_TYPE_NONE),
    ("OPENED_INODES", MetricType.METRIC_TYPE_NONE),
    ("READ_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
    ("WRITE_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
    ("AVG_READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
    ("STDEV_READ_LATENCY", MetricType.METRIC_TYPE_STDEV),
    ("AVG_WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
    ("STDEV_WRITE_LATENCY", MetricType.METRIC_TYPE_STDEV),
    ("AVG_METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
    ("STDEV_METADATA_LATENCY", MetricType.METRIC_TYPE_STDEV),
])
MGR_STATS_COUNTERS = list(MAIN_WINDOW_TOP_LINE_METRICS.keys())

FS_TOP_VERSION_HEADER_FMT = '{prog_name} - {now}'
FS_TOP_CLIENT_HEADER_FMT = 'Total Client(s): {num_clients} - '\
    '{num_mounts} FUSE, {num_kclients} kclient, {num_libs} libcephfs'
FS_TOP_NAME_TOPL_FMT = 'Filesystem: {fs_name} - {client_count} client(s)'

# keys into the JSON returned by 'ceph fs perf stats'.
CLIENT_METADATA_KEY = "client_metadata"
CLIENT_METADATA_MOUNT_POINT_KEY = "mount_point"
CLIENT_METADATA_MOUNT_ROOT_KEY = "root"
CLIENT_METADATA_IP_KEY = "IP"
CLIENT_METADATA_HOSTNAME_KEY = "hostname"
CLIENT_METADATA_VALID_METRICS_KEY = "valid_metrics"

GLOBAL_METRICS_KEY = "global_metrics"
GLOBAL_COUNTERS_KEY = "global_counters"

# timestamp of the previous refresh and previous cumulative IO sizes,
# used to compute per-interval throughput deltas.
last_time = time.time()
last_read_size = {}
last_write_size = {}

fs_list = []
+
+
def calc_perc(c):
    """Return c[0] as a percentage of c[0] + c[1], rounded to 2 decimals.

    Returns 0.0 when both counters are zero (avoids division by zero).
    """
    hits, misses = c[0], c[1]
    if hits == 0 and misses == 0:
        return 0.0
    return round((hits / (hits + misses)) * 100, 2)
+
+
def calc_lat(c):
    """Convert a (seconds, nanoseconds) pair to milliseconds (2 decimals)."""
    secs, nsecs = c[0], c[1]
    return round(secs * 1000 + nsecs / 1000000, 2)
+
+
def calc_stdev(c):
    """Sample standard deviation, scaled to ms, rounded to 2 decimals.

    c[0] is the accumulated squared deviation, c[1] the sample count;
    fewer than two samples yields 0.0.
    """
    if c[1] <= 1:
        return 0.0
    return round(math.sqrt(c[0] / (c[1] - 1)) / 1000000, 2)
+
+
# in MB
def calc_size(c):
    """Return the byte counter c[1] expressed in MB (2 decimals)."""
    total_bytes = c[1]
    return round(total_bytes / 1048576, 2)
+
+
# in MB
def calc_avg_size(c):
    """Average op size in MB: total bytes c[1] over op count c[0].

    Returns 0.0 when no ops were recorded.
    """
    ops, total_bytes = c[0], c[1]
    if not ops:
        return 0.0
    return round(total_bytes / (ops * 1024 * 1024), 2)
+
+
# in MB/s
def calc_speed(size, duration):
    """Throughput in MB/s for `size` bytes over `duration` seconds.

    Returns 0.0 for a zero duration (no elapsed time to divide by).
    """
    if not duration:
        return 0.0
    return round(size / (duration * 1024 * 1024), 2)
+
+
def wrap(s, sl):
    """Truncate s to fit within sl columns, marking truncation with '+'.

    Strings that already fit (len(s) <= sl) are returned unchanged.  The
    original comparison was '<', which needlessly replaced the last char
    of a string of exactly sl characters with '+'.
    """
    if len(s) <= sl:
        return s
    return f'{s[0:sl - 1]}+'
+
+
class FSTop(object):
    """
    Curses-based top(1)-like viewer for CephFS client performance metrics.

    Pulls per-client counters from the mgr 'stats' module via
    'fs perf stats' and renders them on a scrollable curses pad, either
    for all filesystems or for a single selected one.
    """
    def __init__(self, args):
        self.rados = None
        self.stdscr = None  # curses instance
        self.current_screen = ""
        self.client_name = args.id
        self.cluster_name = args.cluster
        self.conffile = args.conffile
        self.refresh_interval_secs = args.delay
        self.PAD_HEIGHT = 10000  # height of the fstop_pad
        self.PAD_WIDTH = 300  # width of the fstop_pad
        self.exit_ev = threading.Event()

    def handle_signal(self, signum, _):
        """SIGINT/SIGTERM handler: request a clean exit from the loop."""
        self.exit_ev.set()

    def init(self):
        """Connect to the cluster, verify the 'stats' mgr module and
        install signal handlers. Raises FSTopException on failure."""
        try:
            if self.conffile:
                r_rados = rados.Rados(rados_id=self.client_name,
                                      clustername=self.cluster_name,
                                      conffile=self.conffile)
            else:
                r_rados = rados.Rados(rados_id=self.client_name,
                                      clustername=self.cluster_name)
                r_rados.conf_read_file()
            r_rados.connect()
            self.rados = r_rados
        except rados.Error as e:
            if e.errno == errno.ENOENT:
                raise FSTopException(f'cluster {self.cluster_name}'
                                     ' does not exist')
            else:
                raise FSTopException(f'error connecting to cluster: {e}')
        self.verify_perf_stats_support()
        signal.signal(signal.SIGTERM, self.handle_signal)
        signal.signal(signal.SIGINT, self.handle_signal)

    def fini(self):
        """Shut down the rados connection (idempotent)."""
        if self.rados:
            self.rados.shutdown()
            self.rados = None

    def selftest(self):
        """Sanity-check the perf stats version and counter names."""
        stats_json = self.perf_stats_query()
        if not stats_json['version'] == FS_TOP_SUPPORTED_VER:
            raise FSTopException('perf stats version mismatch!')
        missing = [m for m in stats_json["global_counters"]
                   if m.upper() not in MGR_STATS_COUNTERS]
        if missing:
            raise FSTopException('Cannot handle unknown metrics from'
                                 f'\'ceph fs perf stats\': {missing}')

    def get_fs_names(self):
        """Refresh and return the global fs_list from 'fs ls'."""
        mon_cmd = {'prefix': 'fs ls', 'format': 'json'}
        try:
            ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'')
        except Exception as e:
            raise FSTopException(f'Error in fs ls: {e}')
        fs_map = json.loads(buf.decode('utf-8'))
        global fs_list
        fs_list.clear()
        for filesystem in fs_map:
            fs = filesystem['name']
            fs_list.append(fs)
        return fs_list

    def setup_curses(self, win):
        """curses.wrapper() entry point: set up the pad and start the
        all-filesystems display loop."""
        self.stdscr = win
        self.stdscr.keypad(True)
        curses.use_default_colors()
        curses.start_color()
        try:
            curses.curs_set(0)
        except curses.error:
            # If the terminal do not support the visibility
            # requested it will raise an exception
            pass
        self.fstop_pad = curses.newpad(self.PAD_HEIGHT, self.PAD_WIDTH)
        self.run_all_display()

    def display_fs_menu(self, stdscr, selected_row_idx):
        """Draw the filesystem selection menu, highlighting the row at
        selected_row_idx."""
        stdscr.clear()
        h, w = stdscr.getmaxyx()
        global fs_list
        if not fs_list:
            title = ['No filesystem available',
                     'Press "q" to go back to home (All Filesystem Info) screen']
            pos_x1 = w // 2 - len(title[0]) // 2
            pos_x2 = w // 2 - len(title[1]) // 2
            stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD)
            stdscr.addstr(3, pos_x2, title[1])
        else:
            title = ['Filesystems', 'Press "q" to go back to home (All Filesystem Info) screen']
            pos_x1 = w // 2 - len(title[0]) // 2
            pos_x2 = w // 2 - len(title[1]) // 2
            stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD)
            stdscr.addstr(3, pos_x2, title[1])
            for index, name in enumerate(fs_list):
                x = w // 2 - len(name) // 2
                y = h // 2 - len(fs_list) // 2 + index
                if index == selected_row_idx:
                    stdscr.attron(curses.color_pair(1))
                    stdscr.addstr(y, x, name)
                    stdscr.attroff(curses.color_pair(1))
                else:
                    stdscr.addstr(y, x, name)
            stdscr.refresh()

    def set_key(self, stdscr):
        """Menu event loop: arrow keys move the selection, Enter opens
        the selected filesystem, 'q' returns to the all-fs screen."""
        curses.curs_set(0)
        curses.init_pair(1, curses.COLOR_BLACK, curses.COLOR_WHITE)
        curr_row = 0
        key = 0
        endmenu = False
        while not endmenu:
            global fs_list
            fs_list = self.get_fs_names()

            if key == curses.KEY_UP and curr_row > 0:
                curr_row -= 1
            elif key == curses.KEY_DOWN and curr_row < len(fs_list) - 1:
                curr_row += 1
            elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list:
                self.stdscr.erase()
                self.run_display(fs_list[curr_row])
                endmenu = True
            elif key == ord('q'):
                self.stdscr.erase()
                self.run_all_display()
                endmenu = True

            try:
                self.display_fs_menu(stdscr, curr_row)
            except curses.error:
                pass
            # NOTE(review): halfdelay() takes tenths of a second; the
            # display loops scale by 10 (refresh_interval_secs * 10) but
            # this call does not -- confirm whether that is intended.
            curses.halfdelay(self.refresh_interval_secs)
            key = stdscr.getch()

    def set_option(self, opt):
        """Dispatch a top-level key: 'm' opens the fs menu, 'q' quits or
        returns to the all-fs screen."""
        if opt == ord('m'):
            curses.wrapper(self.set_key)
        elif opt == ord('q'):
            if self.current_screen == FS_TOP_ALL_FS_APP:
                quit()
            else:
                self.run_all_display()

    def verify_perf_stats_support(self):
        """Raise FSTopException unless the mgr 'stats' module is enabled."""
        mon_cmd = {'prefix': 'mgr module ls', 'format': 'json'}
        try:
            ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'')
        except Exception as e:
            raise FSTopException(f'error checking \'stats\' module: {e}')
        if ret != 0:
            raise FSTopException(f'error checking \'stats\' module: {out}')
        if 'stats' not in json.loads(buf.decode('utf-8'))['enabled_modules']:
            raise FSTopException('\'stats\' module not enabled. Use'
                                 '\'ceph mgr module enable stats\' to enable')

    def perf_stats_query(self):
        """Run 'fs perf stats' against the mgr and return the parsed JSON."""
        mgr_cmd = {'prefix': 'fs perf stats', 'format': 'json'}
        try:
            ret, buf, out = self.rados.mgr_command(json.dumps(mgr_cmd), b'')
        except Exception as e:
            raise FSTopException(f'error in \'perf stats\' query: {e}')
        if ret != 0:
            raise FSTopException(f'error in \'perf stats\' query: {out}')
        return json.loads(buf.decode('utf-8'))

    def items(self, item):
        """Map a counter name to its short column label ('' if unknown)."""
        if item == "CAP_HIT":
            return "chit"
        if item == "READ_LATENCY":
            return "rlat"
        if item == "WRITE_LATENCY":
            return "wlat"
        if item == "METADATA_LATENCY":
            return "mlat"
        if item == "DENTRY_LEASE":
            return "dlease"
        if item == "OPENED_FILES":
            return "ofiles"
        if item == "PINNED_ICAPS":
            return "oicaps"
        if item == "OPENED_INODES":
            return "oinodes"
        if item == "READ_IO_SIZES":
            return "rtio"
        if item == "WRITE_IO_SIZES":
            return "wtio"
        if item == 'AVG_READ_LATENCY':
            return 'rlatavg'
        if item == 'STDEV_READ_LATENCY':
            return 'rlatsd'
        if item == 'AVG_WRITE_LATENCY':
            return 'wlatavg'
        if item == 'STDEV_WRITE_LATENCY':
            return 'wlatsd'
        if item == 'AVG_METADATA_LATENCY':
            return 'mlatavg'
        if item == 'STDEV_METADATA_LATENCY':
            return 'mlatsd'
        else:
            # return empty string for none type
            return ''

    def mtype(self, typ):
        """Return the unit suffix shown next to a column of this type."""
        if typ == MetricType.METRIC_TYPE_PERCENTAGE:
            return "(%)"
        elif typ == MetricType.METRIC_TYPE_LATENCY:
            return "(ms)"
        elif typ == MetricType.METRIC_TYPE_SIZE:
            return "(MB)"
        elif typ == MetricType.METRIC_TYPE_STDEV:
            return "(ms)"
        else:
            # return empty string for none type
            return ''

    def avg_items(self, item):
        """Column label for the derived average-IO-size column."""
        if item == "READ_IO_SIZES":
            return "raio"
        if item == "WRITE_IO_SIZES":
            return "waio"
        else:
            # return empty string for none type
            return ''

    def speed_items(self, item):
        """Column label for the derived IO-speed column."""
        if item == "READ_IO_SIZES":
            return "rsp"
        if item == "WRITE_IO_SIZES":
            return "wsp"
        else:
            # return empty string for none type
            return ''

    def speed_mtype(self, typ):
        """Unit suffix for the derived IO-speed column."""
        if typ == MetricType.METRIC_TYPE_SIZE:
            return "(MB/s)"
        else:
            # return empty string for none type
            return ''

    @staticmethod
    def has_metric(metadata, metrics_key):
        """True if metrics_key is present in the metadata mapping."""
        return metrics_key in metadata

    @staticmethod
    def has_metrics(metadata, metrics_keys):
        """True only if every key in metrics_keys is present in metadata."""
        for key in metrics_keys:
            if not FSTop.has_metric(metadata, key):
                return False
        return True

    def create_top_line_and_build_coord(self):
        """Draw the table header row and return a map of column name ->
        (x offset, width) used to position per-client cells."""
        xp = 0
        x_coord_map = {}

        heading = []
        for item in MAIN_WINDOW_TOP_LINE_ITEMS_START:
            heading.append(item)
            nlen = len(item) + len(ITEMS_PAD)
            x_coord_map[item] = (xp, nlen)
            xp += nlen

        for item, typ in MAIN_WINDOW_TOP_LINE_METRICS.items():
            if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
                continue
            it = f'{self.items(item)}{self.mtype(typ)}'
            heading.append(it)
            nlen = len(it) + len(ITEMS_PAD)
            x_coord_map[item] = (xp, nlen)
            xp += nlen

            if item == "READ_IO_SIZES" or item == "WRITE_IO_SIZES":
                # average io sizes
                it = f'{self.avg_items(item)}{self.mtype(typ)}'
                heading.append(it)
                nlen = len(it) + len(ITEMS_PAD)
                if item == "READ_IO_SIZES":
                    x_coord_map["READ_IO_AVG"] = (xp, nlen)
                if item == "WRITE_IO_SIZES":
                    x_coord_map["WRITE_IO_AVG"] = (xp, nlen)
                xp += nlen

                # io speeds
                it = f'{self.speed_items(item)}{self.speed_mtype(typ)}'
                heading.append(it)
                nlen = len(it) + len(ITEMS_PAD)
                if item == "READ_IO_SIZES":
                    x_coord_map["READ_IO_SPEED"] = (xp, nlen)
                if item == "WRITE_IO_SIZES":
                    x_coord_map["WRITE_IO_SPEED"] = (xp, nlen)
                xp += nlen

        for item in MAIN_WINDOW_TOP_LINE_ITEMS_END:
            heading.append(item)
            nlen = len(item) + len(ITEMS_PAD)
            x_coord_map[item] = (xp, nlen)
            xp += nlen
        title = ITEMS_PAD.join(heading)
        self.fsstats.addstr(self.tablehead_y, 0, title, curses.A_STANDOUT | curses.A_BOLD)
        return x_coord_map

    def create_client(self, client_id, metrics, counters,
                      client_meta, x_coord_map, y_coord):
        """Render one client's row of metric cells at y_coord."""
        # NOTE(review): last_time is module-global and updated once per
        # client row, so 'duration' shrinks for every subsequent row in
        # the same refresh -- confirm whether that is intended.
        global last_time
        size = 0
        cur_time = time.time()
        duration = cur_time - last_time
        last_time = cur_time
        for item in MAIN_WINDOW_TOP_LINE_ITEMS_START:
            coord = x_coord_map[item]
            hlen = coord[1] - 1
            if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID:
                self.fsstats.addstr(y_coord, coord[0],
                                    wrap(client_id.split('.')[1], hlen), curses.A_DIM)
            elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT:
                if FSTop.has_metric(client_meta,
                                    CLIENT_METADATA_MOUNT_ROOT_KEY):
                    self.fsstats.addstr(
                        y_coord, coord[0],
                        wrap(client_meta[CLIENT_METADATA_MOUNT_ROOT_KEY], hlen), curses.A_DIM)
                else:
                    self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM)

        cidx = 0
        for item in counters:
            if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
                cidx += 1
                continue
            coord = x_coord_map[item]
            m = metrics[cidx]
            key = MGR_STATS_COUNTERS[cidx]
            typ = MAIN_WINDOW_TOP_LINE_METRICS[key]
            if item.lower() in client_meta.get(
                    CLIENT_METADATA_VALID_METRICS_KEY, []):
                if typ == MetricType.METRIC_TYPE_PERCENTAGE:
                    self.fsstats.addstr(y_coord, coord[0],
                                        f'{calc_perc(m)}', curses.A_DIM)
                elif typ == MetricType.METRIC_TYPE_LATENCY:
                    self.fsstats.addstr(y_coord, coord[0],
                                        f'{calc_lat(m)}', curses.A_DIM)
                elif typ == MetricType.METRIC_TYPE_STDEV:
                    self.fsstats.addstr(y_coord, coord[0],
                                        f'{calc_stdev(m)}', curses.A_DIM)
                elif typ == MetricType.METRIC_TYPE_SIZE:
                    self.fsstats.addstr(y_coord, coord[0],
                                        f'{calc_size(m)}', curses.A_DIM)

                    # average io sizes
                    if key == "READ_IO_SIZES":
                        coord = x_coord_map["READ_IO_AVG"]
                    elif key == "WRITE_IO_SIZES":
                        coord = x_coord_map["WRITE_IO_AVG"]
                    self.fsstats.addstr(y_coord, coord[0],
                                        f'{calc_avg_size(m)}', curses.A_DIM)

                    # io speeds
                    if key == "READ_IO_SIZES":
                        coord = x_coord_map["READ_IO_SPEED"]
                    elif key == "WRITE_IO_SIZES":
                        coord = x_coord_map["WRITE_IO_SPEED"]
                    size = 0
                    if key == "READ_IO_SIZES":
                        if m[1] > 0:
                            global last_read_size
                            last_size = last_read_size.get(client_id, 0)
                            size = m[1] - last_size
                            last_read_size[client_id] = m[1]
                    if key == "WRITE_IO_SIZES":
                        if m[1] > 0:
                            global last_write_size
                            last_size = last_write_size.get(client_id, 0)
                            size = m[1] - last_size
                            last_write_size[client_id] = m[1]
                    self.fsstats.addstr(y_coord, coord[0],
                                        f'{calc_speed(abs(size), duration)}', curses.A_DIM)
                else:
                    # display 0th element from metric tuple
                    self.fsstats.addstr(y_coord, coord[0], f'{m[0]}', curses.A_DIM)
            else:
                self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM)
            cidx += 1

        for item in MAIN_WINDOW_TOP_LINE_ITEMS_END:
            coord = x_coord_map[item]
            wrapLen = self.PAD_WIDTH - coord[0]
            # always place the FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR in the
            # last, it will be a very long string to display
            if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR:
                if FSTop.has_metrics(client_meta,
                                     [CLIENT_METADATA_MOUNT_POINT_KEY,
                                      CLIENT_METADATA_HOSTNAME_KEY,
                                      CLIENT_METADATA_IP_KEY]):
                    mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}@'\
                        f'{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\
                        f'{client_meta[CLIENT_METADATA_IP_KEY]}'
                    self.fsstats.addstr(
                        y_coord, coord[0],
                        wrap(mount_point, wrapLen), curses.A_DIM)
                else:
                    self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM)

    def create_clients(self, x_coord_map, stats_json, fs_name):
        """Render the per-filesystem title line plus one row per client."""
        counters = [m.upper() for m in stats_json[GLOBAL_COUNTERS_KEY]]
        self.tablehead_y += 2
        res = stats_json[CLIENT_METADATA_KEY].get(fs_name, {})
        client_cnt = len(res)
        self.fsstats.addstr(self.tablehead_y, 0, FS_TOP_NAME_TOPL_FMT.format(
            fs_name=fs_name, client_count=client_cnt), curses.A_BOLD | curses.A_ITALIC)
        self.tablehead_y += 2
        if client_cnt:
            for client_id, metrics in \
                    stats_json[GLOBAL_METRICS_KEY][fs_name].items():
                self.create_client(
                    client_id, metrics, counters, res[client_id],
                    x_coord_map, self.tablehead_y)
                self.tablehead_y += 1

    def create_header(self, stats_json, help, screen_title="", color_id=0):
        """Render the header pad (program name, client totals, help line).

        Returns False (and shows an error) on a stats version mismatch,
        True otherwise."""
        num_clients, num_mounts, num_kclients, num_libs = 0, 0, 0, 0
        if not stats_json['version'] == FS_TOP_SUPPORTED_VER:
            self.header.addstr(0, 0, 'perf stats version mismatch!', curses.A_BOLD)
            return False
        global fs_list
        for fs_name in fs_list:
            client_metadata = stats_json[CLIENT_METADATA_KEY].get(fs_name, {})
            client_cnt = len(client_metadata)
            if client_cnt:
                num_clients = num_clients + client_cnt
                num_mounts = num_mounts + len(
                    [client for client, metadata in client_metadata.items() if
                     CLIENT_METADATA_MOUNT_POINT_KEY in metadata
                     and metadata[CLIENT_METADATA_MOUNT_POINT_KEY] != 'N/A'])
                num_kclients = num_kclients + len(
                    [client for client, metadata in client_metadata.items() if
                     "kernel_version" in metadata])
        num_libs = num_clients - (num_mounts + num_kclients)
        now = datetime.now().ctime()
        self.header.addstr(0, 0, FS_TOP_VERSION_HEADER_FMT.format(prog_name=FS_TOP_PROG_STR,
                                                                  now=now), curses.A_BOLD)
        self.header.addstr(2, 0, screen_title, curses.color_pair(color_id) | curses.A_BOLD)
        self.header.addstr(3, 0, FS_TOP_CLIENT_HEADER_FMT.format(num_clients=num_clients,
                                                                 num_mounts=num_mounts,
                                                                 num_kclients=num_kclients,
                                                                 num_libs=num_libs), curses.A_DIM)
        self.header.addstr(4, 0, help, curses.A_DIM)
        return True

    def run_display(self, fs):
        """Refresh loop for the single-filesystem screen, handling
        scrolling, resize and the 'm'/'q' screen-switch keys."""
        # clear the pads to have a smooth refresh
        self.header.erase()
        self.fsstats.erase()

        self.current_screen = FS_TOP_FS_SELECTED_APP
        screen_title = "Selected Filesystem Info"
        help_commands = "Press 'q' to go back to home (All Filesystem Info) screen"\
            " | Press 'm' to select another filesystem"
        curses.init_pair(3, curses.COLOR_MAGENTA, -1)

        top, left = 0, 0  # where to place pad
        vscrollOffset, hscrollOffset = 0, 0  # scroll offsets

        # calculate the initial viewport height and width
        windowsize = self.stdscr.getmaxyx()
        self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1

        # create header subpad
        self.header_height = 7
        self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0)

        # create fsstats subpad
        fsstats_begin_y = self.header_height
        fsstats_height = self.PAD_HEIGHT - self.header_height
        self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0)

        curses.halfdelay(1)
        cmd = self.stdscr.getch()
        while not self.exit_ev.is_set():
            if cmd in [ord('m'), ord('q')]:
                self.set_option(cmd)
                self.exit_ev.set()

            # header display
            global fs_list
            fs_list = self.get_fs_names()
            stats_json = self.perf_stats_query()
            vscrollEnd = 0
            if fs not in fs_list:
                help = "Error: The selected filesystem is not available now. " + help_commands
                self.header.erase()  # erase previous text
                self.create_header(stats_json, help, screen_title, 3)
            else:
                self.tablehead_y = 0
                help = "HELP: " + help_commands
                self.fsstats.erase()  # erase previous text

                vscrollEnd = len(stats_json[CLIENT_METADATA_KEY].get(fs, {}))
                if self.create_header(stats_json, help, screen_title, 3):
                    x_coord_map = self.create_top_line_and_build_coord()
                    self.create_clients(x_coord_map, stats_json, fs)

            # scroll and refresh
            if cmd == curses.KEY_DOWN:
                if (vscrollEnd - vscrollOffset) > 1:
                    vscrollOffset += 1
                else:
                    vscrollOffset = vscrollEnd
            elif cmd == curses.KEY_UP:
                if vscrollOffset > 0:
                    vscrollOffset -= 1
            elif cmd == curses.KEY_NPAGE:
                if (vscrollEnd - vscrollOffset) / 20 > 1:
                    vscrollOffset += 20
                else:
                    vscrollOffset = vscrollEnd
            elif cmd == curses.KEY_PPAGE:
                if vscrollOffset / 20 >= 1:
                    vscrollOffset -= 20
                else:
                    vscrollOffset = 0
            elif cmd == curses.KEY_RIGHT:
                if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1:
                    hscrollOffset += 1
            elif cmd == curses.KEY_LEFT:
                if hscrollOffset > 0:
                    hscrollOffset -= 1
            elif cmd == curses.KEY_HOME:
                hscrollOffset = 0
            elif cmd == curses.KEY_END:
                hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1
            elif cmd == curses.KEY_RESIZE:
                # terminal resize event. Update the viewport dimensions
                windowsize = self.stdscr.getmaxyx()
                self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1

            if cmd:
                try:
                    # refresh the viewport for the header portion
                    if cmd not in [curses.KEY_DOWN,
                                   curses.KEY_UP,
                                   curses.KEY_NPAGE,
                                   curses.KEY_PPAGE,
                                   curses.KEY_RIGHT,
                                   curses.KEY_LEFT]:
                        self.fstop_pad.refresh(0, 0,
                                               top, left,
                                               top + self.header_height, left + self.viewportWidth)
                    # refresh the viewport for the current table header portion in the fsstats pad
                    if cmd not in [curses.KEY_DOWN,
                                   curses.KEY_UP,
                                   curses.KEY_NPAGE,
                                   curses.KEY_PPAGE]:
                        self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset,
                                               top + fsstats_begin_y, left,
                                               7, left + self.viewportWidth)
                    # refresh the viewport for the current client records portion in the fsstats pad
                    self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset,
                                           top + fsstats_begin_y + 2, left,
                                           top + self.viewportHeight, left + self.viewportWidth)
                except curses.error:
                    # This happens when the user switches to a terminal of different zoom size.
                    # just retry it.
                    pass
            # End scroll and refresh

            curses.halfdelay(self.refresh_interval_secs * 10)
            cmd = self.stdscr.getch()

    def run_all_display(self):
        """Refresh loop for the all-filesystems screen; same key handling
        and scroll logic as run_display() but iterates every filesystem."""
        # clear text from the previous screen
        if self.current_screen == FS_TOP_FS_SELECTED_APP:
            self.header.erase()

        self.current_screen = FS_TOP_ALL_FS_APP
        screen_title = "All Filesystem Info"
        curses.init_pair(2, curses.COLOR_CYAN, -1)

        top, left = 0, 0  # where to place pad
        vscrollOffset, hscrollOffset = 0, 0  # scroll offsets

        # calculate the initial viewport height and width
        windowsize = self.stdscr.getmaxyx()
        self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1

        # create header subpad
        self.header_height = 7
        self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0)

        # create fsstats subpad
        fsstats_begin_y = self.header_height
        fsstats_height = self.PAD_HEIGHT - self.header_height
        self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0)

        curses.halfdelay(1)
        cmd = self.stdscr.getch()
        while not self.exit_ev.is_set():
            if cmd in [ord('m'), ord('q')]:
                self.set_option(cmd)
                self.exit_ev.set()

            # header display
            global fs_list
            fs_list = self.get_fs_names()
            stats_json = self.perf_stats_query()
            vscrollEnd = 0
            if not fs_list:
                help = "INFO: No filesystem is available [Press 'q' to quit]"
                self.header.erase()  # erase previous text
                self.fsstats.erase()
                self.create_header(stats_json, help, screen_title, 2)
            else:
                self.tablehead_y = 0
                help = "HELP: Press 'm' to select a filesystem | Press 'q' to quit"
                self.fsstats.erase()  # erase previous text
                for index, fs in enumerate(fs_list):
                    # Get the vscrollEnd in advance
                    vscrollEnd += len(stats_json[CLIENT_METADATA_KEY].get(fs, {}))
                    if self.create_header(stats_json, help, screen_title, 2):
                        if not index:  # do it only for the first fs
                            x_coord_map = self.create_top_line_and_build_coord()
                        self.create_clients(x_coord_map, stats_json, fs)

            # scroll and refresh
            if cmd == curses.KEY_DOWN:
                if (vscrollEnd - vscrollOffset) > 1:
                    vscrollOffset += 1
                else:
                    vscrollOffset = vscrollEnd
            elif cmd == curses.KEY_UP:
                if vscrollOffset > 0:
                    vscrollOffset -= 1
            elif cmd == curses.KEY_NPAGE:
                if (vscrollEnd - vscrollOffset) / 20 > 1:
                    vscrollOffset += 20
                else:
                    vscrollOffset = vscrollEnd
            elif cmd == curses.KEY_PPAGE:
                if vscrollOffset / 20 >= 1:
                    vscrollOffset -= 20
                else:
                    vscrollOffset = 0
            elif cmd == curses.KEY_RIGHT:
                if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1:
                    hscrollOffset += 1
            elif cmd == curses.KEY_LEFT:
                if hscrollOffset > 0:
                    hscrollOffset -= 1
            elif cmd == curses.KEY_HOME:
                hscrollOffset = 0
            elif cmd == curses.KEY_END:
                hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1
            elif cmd == curses.KEY_RESIZE:
                # terminal resize event. Update the viewport dimensions
                windowsize = self.stdscr.getmaxyx()
                self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
            if cmd:
                try:
                    # refresh the viewport for the header portion
                    if cmd not in [curses.KEY_DOWN,
                                   curses.KEY_UP,
                                   curses.KEY_NPAGE,
                                   curses.KEY_PPAGE,
                                   curses.KEY_RIGHT,
                                   curses.KEY_LEFT]:
                        self.fstop_pad.refresh(0, 0,
                                               top, left,
                                               top + self.header_height, left + self.viewportWidth)
                    # refresh the viewport for the current table header portion in the fsstats pad
                    if cmd not in [curses.KEY_DOWN,
                                   curses.KEY_UP,
                                   curses.KEY_NPAGE,
                                   curses.KEY_PPAGE]:
                        self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset,
                                               top + fsstats_begin_y, left,
                                               7, left + self.viewportWidth)
                    # refresh the viewport for the current client records portion in the fsstats pad
                    self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset,
                                           top + fsstats_begin_y + 2, left,
                                           top + self.viewportHeight, left + self.viewportWidth)
                except curses.error:
                    # This happens when the user switches to a terminal of different zoom size.
                    # just retry it.
                    pass
            # End scroll and refresh

            curses.halfdelay(self.refresh_interval_secs * 10)
            cmd = self.stdscr.getch()
+# End class FSTop
+
+
if __name__ == '__main__':
    def float_greater_than(x):
        """argparse type: refresh interval >= MIN_REFRESH_INTERVAL."""
        value = float(x)
        if value < MIN_REFRESH_INTERVAL:
            raise argparse.ArgumentTypeError(
                'Refresh interval should be greater than or equal to'
                f' {MIN_REFRESH_INTERVAL}')
        return value

    parser = argparse.ArgumentParser(description='Ceph Filesystem top utility')
    # bug fix: help text typos corrected ("defualt" -> "default",
    # "to use to connection" -> "to use for connection").
    parser.add_argument('--cluster', nargs='?', const='ceph', default='ceph',
                        help='Ceph cluster to connect (default: ceph)')
    parser.add_argument('--id', nargs='?', const='fstop', default='fstop',
                        help='Ceph user to use for connection (default: fstop)')
    parser.add_argument('--conffile', nargs='?', default=None,
                        help='Path to cluster configuration file')
    parser.add_argument('--selftest', dest='selftest', action='store_true',
                        help='Run in selftest mode')
    parser.add_argument('-d', '--delay', nargs='?',
                        default=DEFAULT_REFRESH_INTERVAL,
                        type=float_greater_than,
                        help='Refresh interval in seconds '
                        f'(default: {DEFAULT_REFRESH_INTERVAL})')

    args = parser.parse_args()
    err = False
    ft = FSTop(args)
    try:
        ft.init()
        if args.selftest:
            ft.selftest()
            sys.stdout.write("selftest ok\n")
        else:
            curses.wrapper(ft.setup_curses)
    except FSTopException as fst:
        err = True
        sys.stderr.write(f'{fst.get_error_msg()}\n')
    except Exception as e:
        err = True
        sys.stderr.write(f'exception: {e}\n')
    finally:
        # always release the rados connection, then report success/failure.
        ft.fini()
        sys.exit(0 if not err else -1)
diff --git a/src/tools/cephfs/top/setup.py b/src/tools/cephfs/top/setup.py
new file mode 100644
index 000000000..92fbd964c
--- /dev/null
+++ b/src/tools/cephfs/top/setup.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+__version__ = '0.0.1'
+
+setup(
+ name='cephfs-top',
+ version=__version__,
+ description='top(1) like utility for Ceph Filesystem',
+ keywords='cephfs, top',
+ scripts=['cephfs-top'],
+ install_requires=[
+ 'rados',
+ ],
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Environment :: Console',
+ 'Intended Audience :: System Administrators',
+ 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+ 'Operating System :: POSIX :: Linux',
+ 'Programming Language :: Python :: 3'
+ ],
+ license='LGPLv2+',
+)
diff --git a/src/tools/cephfs/top/tox.ini b/src/tools/cephfs/top/tox.ini
new file mode 100644
index 000000000..b125c0bc8
--- /dev/null
+++ b/src/tools/cephfs/top/tox.ini
@@ -0,0 +1,7 @@
+[tox]
+envlist = py3
+skipsdist = true
+
+[testenv:py3]
+deps = flake8
+commands = flake8 --ignore=W503 --max-line-length=100 cephfs-top
diff --git a/src/tools/cephfs/type_helper.hpp b/src/tools/cephfs/type_helper.hpp
new file mode 100644
index 000000000..06dfa0afe
--- /dev/null
+++ b/src/tools/cephfs/type_helper.hpp
@@ -0,0 +1,28 @@
+#ifndef TYPE_HELPER_HPP__
+#define TYPE_HELPER_HPP__
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Convert between two streamable types (e.g. std::string <-> int) via stringstream.
+template<typename T1, typename T2>
+T1 conv_t(T2 s){
+  T1 target;
+  std::stringstream conv;
+  conv << s;
+  conv >> target;
+  return target;
+}
+
+// Split 'str' on 'split', appending pieces to 'out'; a trailing delimiter yields no empty tail.
+void string_split(std::string str, std::vector<std::string> &out, std::string split = ":"){
+  auto pos = str.find(split);
+  while (pos != std::string::npos) {
+    out.push_back(str.substr(0, pos));
+    if (str.size() <= pos + split.size()) return; // delimiter ends the string
+    str = str.substr(pos + split.size());
+    pos = str.find(split);
+  }
+  out.push_back(str);
+}
+#endif // TYPE_HELPER_HPP__
diff --git a/src/tools/cephfs_mirror/CMakeLists.txt b/src/tools/cephfs_mirror/CMakeLists.txt
new file mode 100644
index 000000000..4b6dea7a1
--- /dev/null
+++ b/src/tools/cephfs_mirror/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(cephfs_mirror_internal
+ ClusterWatcher.cc
+ Mirror.cc
+ FSMirror.cc
+ InstanceWatcher.cc
+ MirrorWatcher.cc
+ PeerReplayer.cc
+ ServiceDaemon.cc
+ Types.cc
+ Utils.cc
+ Watcher.cc
+ watcher/RewatchRequest.cc)
+
+add_executable(cephfs-mirror
+ main.cc)
+
+add_library(cephfs_mirror_internal STATIC
+ ${cephfs_mirror_internal})
+
+target_link_libraries(cephfs-mirror
+ cephfs_mirror_internal
+ global
+ ceph-common
+ cls_cephfs_client
+ librados
+ mds
+ cephfs
+ ${ALLOC_LIBS})
+
+install(TARGETS cephfs-mirror DESTINATION bin)
diff --git a/src/tools/cephfs_mirror/ClusterWatcher.cc b/src/tools/cephfs_mirror/ClusterWatcher.cc
new file mode 100644
index 000000000..b5f6f81d7
--- /dev/null
+++ b/src/tools/cephfs_mirror/ClusterWatcher.cc
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <mutex>
+#include <vector>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "mon/MonClient.h"
+
+#include "ClusterWatcher.h"
+#include "ServiceDaemon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::ClusterWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+ClusterWatcher::ClusterWatcher(CephContext *cct, MonClient *monc, ServiceDaemon *service_daemon,
+ Listener &listener)
+ : Dispatcher(cct),
+ m_monc(monc),
+ m_service_daemon(service_daemon),
+ m_listener(listener) {
+}
+
+ClusterWatcher::~ClusterWatcher() {
+}
+
+bool ClusterWatcher::ms_can_fast_dispatch2(const cref_t<Message> &m) const {
+ return m->get_type() == CEPH_MSG_FS_MAP;
+}
+
+void ClusterWatcher::ms_fast_dispatch2(const ref_t<Message> &m) {
+ bool handled = ms_dispatch2(m);
+ ceph_assert(handled);
+}
+
+bool ClusterWatcher::ms_dispatch2(const ref_t<Message> &m) {
+ if (m->get_type() == CEPH_MSG_FS_MAP) {
+ if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ handle_fsmap(ref_cast<MFSMap>(m));
+ }
+ return true;
+ }
+
+ return false;
+}
+
+int ClusterWatcher::init() {
+ dout(20) << dendl;
+
+ bool sub = m_monc->sub_want("fsmap", 0, 0);
+ if (!sub) {
+ derr << ": failed subscribing to FSMap" << dendl;
+ return -1;
+ }
+
+ m_monc->renew_subs();
+ dout(10) << ": subscribed to FSMap" << dendl;
+ return 0;
+}
+
+void ClusterWatcher::shutdown() {
+ dout(20) << dendl;
+ std::scoped_lock locker(m_lock);
+ m_stopping = true;
+ m_monc->sub_unwant("fsmap");
+}
+
+void ClusterWatcher::handle_fsmap(const cref_t<MFSMap> &m) {
+ dout(20) << dendl;
+
+ auto fsmap = m->get_fsmap();
+ auto filesystems = fsmap.get_filesystems();
+
+ std::vector<Filesystem> mirroring_enabled;
+ std::vector<Filesystem> mirroring_disabled;
+ std::map<Filesystem, Peers> peers_added;
+ std::map<Filesystem, Peers> peers_removed;
+ std::map<Filesystem, uint64_t> fs_metadata_pools;
+ {
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ return;
+ }
+
+ // deleted filesystems are considered mirroring disabled
+ for (auto it = m_filesystem_peers.begin(); it != m_filesystem_peers.end();) {
+ if (!fsmap.filesystem_exists(it->first.fscid)) {
+ mirroring_disabled.emplace_back(it->first);
+ it = m_filesystem_peers.erase(it);
+ continue;
+ }
+ ++it;
+ }
+
+ for (auto &filesystem : filesystems) {
+ auto fs = Filesystem{filesystem->fscid,
+ std::string(filesystem->mds_map.get_fs_name())};
+ auto pool_id = filesystem->mds_map.get_metadata_pool();
+ auto &mirror_info = filesystem->mirror_info;
+
+ if (!mirror_info.is_mirrored()) {
+ auto it = m_filesystem_peers.find(fs);
+ if (it != m_filesystem_peers.end()) {
+ mirroring_disabled.emplace_back(fs);
+ m_filesystem_peers.erase(it);
+ }
+ } else {
+ auto [fspeersit, enabled] = m_filesystem_peers.emplace(fs, Peers{});
+ auto &peers = fspeersit->second;
+
+ if (enabled) {
+ mirroring_enabled.emplace_back(fs);
+ fs_metadata_pools.emplace(fs, pool_id);
+ }
+
+ // peers added
+ Peers added;
+ std::set_difference(mirror_info.peers.begin(), mirror_info.peers.end(),
+ peers.begin(), peers.end(), std::inserter(added, added.end()));
+
+ // peers removed
+ Peers removed;
+ std::set_difference(peers.begin(), peers.end(),
+ mirror_info.peers.begin(), mirror_info.peers.end(),
+ std::inserter(removed, removed.end()));
+
+ // update set
+ if (!added.empty()) {
+ peers_added.emplace(fs, added);
+ peers.insert(added.begin(), added.end());
+ }
+ if (!removed.empty()) {
+ peers_removed.emplace(fs, removed);
+ for (auto &p : removed) {
+ peers.erase(p);
+ }
+ }
+ }
+ }
+ }
+
+ dout(5) << ": mirroring enabled=" << mirroring_enabled << ", mirroring_disabled="
+ << mirroring_disabled << dendl;
+ for (auto &fs : mirroring_enabled) {
+ m_service_daemon->add_filesystem(fs.fscid, fs.fs_name);
+ m_listener.handle_mirroring_enabled(FilesystemSpec(fs, fs_metadata_pools.at(fs)));
+ }
+ for (auto &fs : mirroring_disabled) {
+ m_service_daemon->remove_filesystem(fs.fscid);
+ m_listener.handle_mirroring_disabled(fs);
+ }
+
+ dout(5) << ": peers added=" << peers_added << ", peers removed=" << peers_removed << dendl;
+
+ for (auto &[fs, peers] : peers_added) {
+ for (auto &peer : peers) {
+ m_service_daemon->add_peer(fs.fscid, peer);
+ m_listener.handle_peers_added(fs, peer);
+ }
+ }
+ for (auto &[fs, peers] : peers_removed) {
+ for (auto &peer : peers) {
+ m_service_daemon->remove_peer(fs.fscid, peer);
+ m_listener.handle_peers_removed(fs, peer);
+ }
+ }
+
+ std::scoped_lock locker(m_lock);
+ if (!m_stopping) {
+ m_monc->sub_got("fsmap", fsmap.get_epoch());
+ } // else we have already done a sub_unwant()
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/ClusterWatcher.h b/src/tools/cephfs_mirror/ClusterWatcher.h
new file mode 100644
index 000000000..a418898f5
--- /dev/null
+++ b/src/tools/cephfs_mirror/ClusterWatcher.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_CLUSTER_WATCHER_H
+#define CEPHFS_MIRROR_CLUSTER_WATCHER_H
+
+#include <map>
+
+#include "common/ceph_mutex.h"
+#include "common/async/context_pool.h"
+#include "messages/MFSMap.h"
+#include "msg/Dispatcher.h"
+#include "Types.h"
+
+class MonClient;
+
+namespace cephfs {
+namespace mirror {
+
+class ServiceDaemon;
+
+// watch peer changes for filesystems via FSMap updates
+
+class ClusterWatcher : public Dispatcher {
+public:
+ struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void handle_mirroring_enabled(const FilesystemSpec &spec) = 0;
+ virtual void handle_mirroring_disabled(const Filesystem &filesystem) = 0;
+
+ virtual void handle_peers_added(const Filesystem &filesystem, const Peer &peer) = 0;
+ virtual void handle_peers_removed(const Filesystem &filesystem, const Peer &peer) = 0;
+ };
+
+ ClusterWatcher(CephContext *cct, MonClient *monc, ServiceDaemon *service_daemon,
+ Listener &listener);
+ ~ClusterWatcher();
+
+ bool ms_can_fast_dispatch_any() const override {
+ return true;
+ }
+ bool ms_can_fast_dispatch2(const cref_t<Message> &m) const override;
+ void ms_fast_dispatch2(const ref_t<Message> &m) override;
+ bool ms_dispatch2(const ref_t<Message> &m) override;
+
+ void ms_handle_connect(Connection *c) override {
+ }
+ bool ms_handle_reset(Connection *c) override {
+ return false;
+ }
+ void ms_handle_remote_reset(Connection *c) override {
+ }
+ bool ms_handle_refused(Connection *c) override {
+ return false;
+ }
+
+ int init();
+ void shutdown();
+
+private:
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::cluster_watcher");
+ MonClient *m_monc;
+ ServiceDaemon *m_service_daemon;
+ Listener &m_listener;
+
+ bool m_stopping = false;
+ std::map<Filesystem, Peers> m_filesystem_peers;
+
+ void handle_fsmap(const cref_t<MFSMap> &m);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_CLUSTER_WATCHER_H
diff --git a/src/tools/cephfs_mirror/FSMirror.cc b/src/tools/cephfs_mirror/FSMirror.cc
new file mode 100644
index 000000000..76dcc11f6
--- /dev/null
+++ b/src/tools/cephfs_mirror/FSMirror.cc
@@ -0,0 +1,441 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "msg/Messenger.h"
+#include "FSMirror.h"
+#include "PeerReplayer.h"
+#include "aio_utils.h"
+#include "ServiceDaemon.h"
+#include "Utils.h"
+
+#include "common/Cond.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::FSMirror " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string SERVICE_DAEMON_DIR_COUNT_KEY("directory_count");
+const std::string SERVICE_DAEMON_PEER_INIT_FAILED_KEY("peer_init_failed");
+
+class MirrorAdminSocketCommand {
+public:
+ virtual ~MirrorAdminSocketCommand() {
+ }
+ virtual int call(Formatter *f) = 0;
+};
+
+class StatusCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StatusCommand(FSMirror *fs_mirror)
+ : fs_mirror(fs_mirror) {
+ }
+
+ int call(Formatter *f) override {
+ fs_mirror->mirror_status(f);
+ return 0;
+ }
+
+private:
+ FSMirror *fs_mirror;
+};
+
+} // anonymous namespace
+
+class MirrorAdminSocketHook : public AdminSocketHook {
+public:
+ MirrorAdminSocketHook(CephContext *cct, const Filesystem &filesystem, FSMirror *fs_mirror)
+ : admin_socket(cct->get_admin_socket()) {
+ int r;
+ std::string cmd;
+
+ // mirror status format is name@fscid
+ cmd = "fs mirror status " + stringify(filesystem.fs_name) + "@" + stringify(filesystem.fscid);
+ r = admin_socket->register_command(
+ cmd, this, "get filesystem mirror status");
+ if (r == 0) {
+ commands[cmd] = new StatusCommand(fs_mirror);
+ }
+ }
+
+ ~MirrorAdminSocketHook() override {
+ admin_socket->unregister_commands(this);
+ for (auto &[command, cmdptr] : commands) {
+ delete cmdptr;
+ }
+ }
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f, std::ostream &errss, bufferlist &out) override {
+ auto p = commands.at(std::string(command));
+ return p->call(f);
+ }
+
+private:
+ typedef std::map<std::string, MirrorAdminSocketCommand*, std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+FSMirror::FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool_id,
+ ServiceDaemon *service_daemon, std::vector<const char*> args,
+ ContextWQ *work_queue)
+ : m_cct(cct),
+ m_filesystem(filesystem),
+ m_pool_id(pool_id),
+ m_service_daemon(service_daemon),
+ m_args(args),
+ m_work_queue(work_queue),
+ m_snap_listener(this),
+ m_asok_hook(new MirrorAdminSocketHook(cct, filesystem, this)) {
+ m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
+ (uint64_t)0);
+}
+
+FSMirror::~FSMirror() {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ delete m_instance_watcher;
+ delete m_mirror_watcher;
+ }
+ // outside the lock so that in-progress commands can acquire
+ // lock and finish executing.
+ delete m_asok_hook;
+}
+
+int FSMirror::init_replayer(PeerReplayer *peer_replayer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return peer_replayer->init();
+}
+
+void FSMirror::shutdown_replayer(PeerReplayer *peer_replayer) {
+ peer_replayer->shutdown();
+}
+
+void FSMirror::cleanup() {
+ dout(20) << dendl;
+ ceph_unmount(m_mount);
+ ceph_release(m_mount);
+ m_ioctx.close();
+ m_cluster.reset();
+}
+
+void FSMirror::reopen_logs() {
+ std::scoped_lock locker(m_lock);
+
+ if (m_cluster) {
+ reinterpret_cast<CephContext *>(m_cluster->cct())->reopen_logs();
+ }
+ for (auto &[peer, replayer] : m_peer_replayers) {
+ replayer->reopen_logs();
+ }
+}
+
+void FSMirror::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ int r = connect(g_ceph_context->_conf->name.to_str(),
+ g_ceph_context->_conf->cluster, &m_cluster, "", "", m_args);
+ if (r < 0) {
+ m_init_failed = true;
+ on_finish->complete(r);
+ return;
+ }
+
+ r = m_cluster->ioctx_create2(m_pool_id, m_ioctx);
+ if (r < 0) {
+ m_init_failed = true;
+ m_cluster.reset();
+ derr << ": error accessing local pool (id=" << m_pool_id << "): "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ r = mount(m_cluster, m_filesystem, true, &m_mount);
+ if (r < 0) {
+ m_init_failed = true;
+ m_ioctx.close();
+ m_cluster.reset();
+ on_finish->complete(r);
+ return;
+ }
+
+ m_addrs = m_cluster->get_addrs();
+ dout(10) << ": rados addrs=" << m_addrs << dendl;
+
+ init_instance_watcher(on_finish);
+}
+
+void FSMirror::shutdown(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_stopping = true;
+ if (m_on_init_finish != nullptr) {
+ dout(10) << ": delaying shutdown -- init in progress" << dendl;
+ m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+ if (r < 0) {
+ on_finish->complete(0);
+ return;
+ }
+ m_on_shutdown_finish = on_finish;
+ shutdown_peer_replayers();
+ });
+ return;
+ }
+
+ m_on_shutdown_finish = on_finish;
+ }
+
+ shutdown_peer_replayers();
+}
+
+void FSMirror::shutdown_peer_replayers() {
+ dout(20) << dendl;
+
+ for (auto &[peer, peer_replayer] : m_peer_replayers) {
+ dout(5) << ": shutting down replayer for peer=" << peer << dendl;
+ shutdown_replayer(peer_replayer.get());
+ }
+ m_peer_replayers.clear();
+
+ shutdown_mirror_watcher();
+}
+
+void FSMirror::init_instance_watcher(Context *on_finish) {
+ dout(20) << dendl;
+
+ m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ m_init_failed = true;
+ }
+ }
+ on_finish->complete(r);
+ if (m_on_shutdown_finish != nullptr) {
+ m_on_shutdown_finish->complete(r);
+ }
+ });
+
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_init_instance_watcher>(this);
+ m_instance_watcher = InstanceWatcher::create(m_ioctx, m_snap_listener, m_work_queue);
+ m_instance_watcher->init(ctx);
+}
+
+void FSMirror::handle_init_instance_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ init_mirror_watcher();
+}
+
+void FSMirror::init_mirror_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_init_mirror_watcher>(this);
+ m_mirror_watcher = MirrorWatcher::create(m_ioctx, this, m_work_queue);
+ m_mirror_watcher->init(ctx);
+}
+
+void FSMirror::handle_init_mirror_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r == 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ m_retval = r; // save errcode for init context callback
+ shutdown_instance_watcher();
+}
+
+void FSMirror::shutdown_mirror_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_shutdown_mirror_watcher>(this);
+ m_mirror_watcher->shutdown(ctx);
+}
+
+void FSMirror::handle_shutdown_mirror_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ shutdown_instance_watcher();
+}
+
+void FSMirror::shutdown_instance_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_shutdown_instance_watcher>(this);
+ m_instance_watcher->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, ctx));
+}
+
+void FSMirror::handle_shutdown_instance_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ cleanup();
+
+ Context *on_init_finish = nullptr;
+ Context *on_shutdown_finish = nullptr;
+
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(m_retval);
+ }
+ if (on_shutdown_finish != nullptr) {
+ on_shutdown_finish->complete(r);
+ }
+}
+
+void FSMirror::handle_acquire_directory(string_view dir_path) {
+ dout(5) << ": dir_path=" << dir_path << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_directories.emplace(dir_path);
+ m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
+ m_directories.size());
+
+ for (auto &[peer, peer_replayer] : m_peer_replayers) {
+ dout(10) << ": peer=" << peer << dendl;
+ peer_replayer->add_directory(dir_path);
+ }
+ }
+}
+
+void FSMirror::handle_release_directory(string_view dir_path) {
+ dout(5) << ": dir_path=" << dir_path << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto it = m_directories.find(dir_path);
+ if (it != m_directories.end()) {
+ m_directories.erase(it);
+ m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
+ m_directories.size());
+ for (auto &[peer, peer_replayer] : m_peer_replayers) {
+ dout(10) << ": peer=" << peer << dendl;
+ peer_replayer->remove_directory(dir_path);
+ }
+ }
+ }
+}
+
+void FSMirror::add_peer(const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_all_peers.emplace(peer);
+ if (m_peer_replayers.find(peer) != m_peer_replayers.end()) {
+ return;
+ }
+
+ auto replayer = std::make_unique<PeerReplayer>(
+ m_cct, this, m_cluster, m_filesystem, peer, m_directories, m_mount, m_service_daemon);
+ int r = init_replayer(replayer.get());
+ if (r < 0) {
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, peer,
+ SERVICE_DAEMON_PEER_INIT_FAILED_KEY,
+ true);
+ return;
+ }
+ m_peer_replayers.emplace(peer, std::move(replayer));
+ ceph_assert(m_peer_replayers.size() == 1); // support only a single peer
+}
+
+void FSMirror::remove_peer(const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ std::unique_ptr<PeerReplayer> replayer;
+ {
+ std::scoped_lock locker(m_lock);
+ m_all_peers.erase(peer);
+ auto it = m_peer_replayers.find(peer);
+ if (it != m_peer_replayers.end()) {
+ replayer = std::move(it->second);
+ m_peer_replayers.erase(it);
+ }
+ }
+
+ if (replayer) {
+    dout(5) << ": shutting down replayer for peer=" << peer << dendl;
+ shutdown_replayer(replayer.get());
+ }
+}
+
+void FSMirror::mirror_status(Formatter *f) {
+ std::scoped_lock locker(m_lock);
+ f->open_object_section("status");
+ if (m_init_failed) {
+ f->dump_string("state", "failed");
+ } else if (is_blocklisted(locker)) {
+ f->dump_string("state", "blocklisted");
+ } else {
+ // dump rados addr for blocklist test
+ f->dump_string("rados_inst", m_addrs);
+ f->open_object_section("peers");
+ for ([[maybe_unused]] auto &[peer, peer_replayer] : m_peer_replayers) {
+ peer.dump(f);
+ }
+ f->close_section(); // peers
+ f->open_object_section("snap_dirs");
+ f->dump_int("dir_count", m_directories.size());
+ f->close_section(); // snap_dirs
+ }
+ f->close_section(); // status
+}
+
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/FSMirror.h b/src/tools/cephfs_mirror/FSMirror.h
new file mode 100644
index 000000000..bae5a38e1
--- /dev/null
+++ b/src/tools/cephfs_mirror/FSMirror.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_FS_MIRROR_H
+#define CEPHFS_MIRROR_FS_MIRROR_H
+
+#include "common/Formatter.h"
+#include "common/Thread.h"
+#include "mds/FSMap.h"
+#include "Types.h"
+#include "InstanceWatcher.h"
+#include "MirrorWatcher.h"
+
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+class MirrorAdminSocketHook;
+class PeerReplayer;
+class ServiceDaemon;
+
+// handle mirroring for a filesystem to a set of peers
+
+class FSMirror {
+public:
+ FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool_id,
+ ServiceDaemon *service_daemon, std::vector<const char*> args,
+ ContextWQ *work_queue);
+ ~FSMirror();
+
+ void init(Context *on_finish);
+ void shutdown(Context *on_finish);
+
+ void add_peer(const Peer &peer);
+ void remove_peer(const Peer &peer);
+
+ bool is_stopping() {
+ std::scoped_lock locker(m_lock);
+ return m_stopping;
+ }
+
+ bool is_init_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_init_failed;
+ }
+
+ bool is_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_init_failed ||
+ m_instance_watcher->is_failed() ||
+ m_mirror_watcher->is_failed();
+ }
+
+ bool is_blocklisted() {
+ std::scoped_lock locker(m_lock);
+ return is_blocklisted(locker);
+ }
+
+ Peers get_peers() {
+ std::scoped_lock locker(m_lock);
+ return m_all_peers;
+ }
+
+ std::string get_instance_addr() {
+ std::scoped_lock locker(m_lock);
+ return m_addrs;
+ }
+
+ // admin socket helpers
+ void mirror_status(Formatter *f);
+
+ void reopen_logs();
+
+private:
+ bool is_blocklisted(const std::scoped_lock<ceph::mutex> &locker) const {
+ bool blocklisted = false;
+ if (m_instance_watcher) {
+ blocklisted = m_instance_watcher->is_blocklisted();
+ }
+ if (m_mirror_watcher) {
+ blocklisted |= m_mirror_watcher->is_blocklisted();
+ }
+
+ return blocklisted;
+ }
+
+ struct SnapListener : public InstanceWatcher::Listener {
+ FSMirror *fs_mirror;
+
+ SnapListener(FSMirror *fs_mirror)
+ : fs_mirror(fs_mirror) {
+ }
+
+ void acquire_directory(string_view dir_path) override {
+ fs_mirror->handle_acquire_directory(dir_path);
+ }
+
+ void release_directory(string_view dir_path) override {
+ fs_mirror->handle_release_directory(dir_path);
+ }
+ };
+
+ CephContext *m_cct;
+ Filesystem m_filesystem;
+ uint64_t m_pool_id;
+ ServiceDaemon *m_service_daemon;
+ std::vector<const char *> m_args;
+ ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::fs_mirror");
+ SnapListener m_snap_listener;
+ std::set<std::string, std::less<>> m_directories;
+ Peers m_all_peers;
+ std::map<Peer, std::unique_ptr<PeerReplayer>> m_peer_replayers;
+
+ RadosRef m_cluster;
+ std::string m_addrs;
+ librados::IoCtx m_ioctx;
+ InstanceWatcher *m_instance_watcher = nullptr;
+ MirrorWatcher *m_mirror_watcher = nullptr;
+
+ int m_retval = 0;
+ bool m_stopping = false;
+ bool m_init_failed = false;
+ Context *m_on_init_finish = nullptr;
+ Context *m_on_shutdown_finish = nullptr;
+
+ MirrorAdminSocketHook *m_asok_hook = nullptr;
+
+ MountRef m_mount;
+
+ int init_replayer(PeerReplayer *peer_replayer);
+ void shutdown_replayer(PeerReplayer *peer_replayer);
+ void cleanup();
+
+ void init_instance_watcher(Context *on_finish);
+ void handle_init_instance_watcher(int r);
+
+ void init_mirror_watcher();
+ void handle_init_mirror_watcher(int r);
+
+ void shutdown_peer_replayers();
+
+ void shutdown_mirror_watcher();
+ void handle_shutdown_mirror_watcher(int r);
+
+ void shutdown_instance_watcher();
+ void handle_shutdown_instance_watcher(int r);
+
+ void handle_acquire_directory(string_view dir_path);
+ void handle_release_directory(string_view dir_path);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_FS_MIRROR_H
diff --git a/src/tools/cephfs_mirror/InstanceWatcher.cc b/src/tools/cephfs_mirror/InstanceWatcher.cc
new file mode 100644
index 000000000..9c357da31
--- /dev/null
+++ b/src/tools/cephfs_mirror/InstanceWatcher.cc
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+#include "include/stringify.h"
+#include "aio_utils.h"
+#include "InstanceWatcher.h"
+#include "Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::InstanceWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+std::string instance_oid(const std::string &instance_id) {
+ return CEPHFS_MIRROR_OBJECT + "." + instance_id;
+}
+
+} // anonymous namespace
+
+InstanceWatcher::InstanceWatcher(librados::IoCtx &ioctx,
+ Listener &listener, ContextWQ *work_queue)
+ : Watcher(ioctx, instance_oid(stringify(ioctx.get_instance_id())), work_queue),
+ m_ioctx(ioctx),
+ m_listener(listener),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_mutex("cephfs::mirror::instance_watcher")) {
+}
+
+InstanceWatcher::~InstanceWatcher() {
+}
+
+void InstanceWatcher::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_init_finish == nullptr);
+ m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+ on_finish->complete(r);
+ if (m_on_shutdown_finish != nullptr) {
+ m_on_shutdown_finish->complete(0);
+ }
+ });
+ }
+
+ create_instance();
+}
+
+void InstanceWatcher::shutdown(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_shutdown_finish == nullptr);
+ if (m_on_init_finish != nullptr) {
+ dout(10) << ": delaying shutdown -- init in progress" << dendl;
+ m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+ m_on_shutdown_finish = nullptr;
+ shutdown(on_finish);
+ });
+ return;
+ }
+
+ m_on_shutdown_finish = on_finish;
+ }
+
+ unregister_watcher();
+}
+
+void InstanceWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ dout(20) << dendl;
+
+ std::string dir_path;
+ std::string mode;
+ try {
+ JSONDecoder jd(bl);
+ JSONDecoder::decode_json("dir_path", dir_path, &jd.parser, true);
+ JSONDecoder::decode_json("mode", mode, &jd.parser, true);
+ } catch (const JSONDecoder::err &e) {
+ derr << ": failed to decode notify json: " << e.what() << dendl;
+ }
+
+ dout(20) << ": notifier_id=" << notifier_id << ", dir_path=" << dir_path
+ << ", mode=" << mode << dendl;
+
+ if (mode == "acquire") {
+ m_listener.acquire_directory(dir_path);
+ } else if (mode == "release") {
+ m_listener.release_directory(dir_path);
+ } else {
+ derr << ": unknown mode" << dendl;
+ }
+
+ bufferlist outbl;
+ acknowledge_notify(notify_id, handle, outbl);
+}
+
+void InstanceWatcher::handle_rewatch_complete(int r) {
+  dout(5) << ": r=" << r << dendl;
+  // hold m_lock for all state writes: m_failed is also read under it (is_failed())
+  std::scoped_lock locker(m_lock);
+  if (r == -EBLOCKLISTED) {
+    dout(0) << ": client blocklisted" << dendl;
+    m_blocklisted = true;
+  } else if (r == -ENOENT) {
+    derr << ": mirroring object deleted" << dendl;
+    m_failed = true;
+  } else if (r < 0) {
+    derr << ": rewatch error: " << cpp_strerror(r) << dendl;
+    m_failed = true;
+  }
+}
+
+void InstanceWatcher::create_instance() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ librados::ObjectWriteOperation op;
+ op.create(false);
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<InstanceWatcher, &InstanceWatcher::handle_create_instance>);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void InstanceWatcher::handle_create_instance(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ register_watcher();
+}
+
+void InstanceWatcher::register_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ InstanceWatcher, &InstanceWatcher::handle_register_watcher>(this);
+ register_watch(on_finish);
+}
+
+void InstanceWatcher::handle_register_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r == 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ remove_instance();
+}
+
+void InstanceWatcher::unregister_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ InstanceWatcher, &InstanceWatcher::handle_unregister_watcher>(this);
+ unregister_watch(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void InstanceWatcher::handle_unregister_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_shutdown_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+ }
+
+ if (on_shutdown_finish != nullptr) {
+ on_shutdown_finish->complete(r);
+ return;
+ }
+
+ remove_instance();
+}
+
+void InstanceWatcher::remove_instance() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<InstanceWatcher, &InstanceWatcher::handle_remove_instance>);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void InstanceWatcher::handle_remove_instance(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ Context *on_shutdown_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+ if (on_shutdown_finish != nullptr) {
+ on_shutdown_finish->complete(r);
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/InstanceWatcher.h b/src/tools/cephfs_mirror/InstanceWatcher.h
new file mode 100644
index 000000000..06edf5da9
--- /dev/null
+++ b/src/tools/cephfs_mirror/InstanceWatcher.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_INSTANCE_WATCHER_H
+#define CEPHFS_MIRROR_INSTANCE_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "Watcher.h"
+
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// watch for directory update notifications via a per-daemon rados
+// object and invoke the listener callback.
+
+class InstanceWatcher : public Watcher {
+public:
+ struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void acquire_directory(string_view dir_path) = 0;
+ virtual void release_directory(string_view dir_path) = 0;
+ };
+
+ static InstanceWatcher *create(librados::IoCtx &ioctx,
+ Listener &listener, ContextWQ *work_queue) {
+ return new InstanceWatcher(ioctx, listener, work_queue);
+ }
+
+ InstanceWatcher(librados::IoCtx &ioctx, Listener &listener, ContextWQ *work_queue);
+ ~InstanceWatcher();
+
+ void init(Context *on_finish);
+ void shutdown(Context *on_finish);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) override;
+ void handle_rewatch_complete(int r) override;
+
+ bool is_blocklisted() {
+ std::scoped_lock locker(m_lock);
+ return m_blocklisted;
+ }
+
+ bool is_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_failed;
+ }
+
+private:
+ librados::IoCtx &m_ioctx;
+ Listener &m_listener;
+ ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock;
+ Context *m_on_init_finish = nullptr;
+ Context *m_on_shutdown_finish = nullptr;
+
+ bool m_blocklisted = false;
+ bool m_failed = false;
+
+ void create_instance();
+ void handle_create_instance(int r);
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+
+ void remove_instance();
+ void handle_remove_instance(int r);
+
+ void unregister_watcher();
+ void handle_unregister_watcher(int r);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_INSTANCE_WATCHER_H
diff --git a/src/tools/cephfs_mirror/Mirror.cc b/src/tools/cephfs_mirror/Mirror.cc
new file mode 100644
index 000000000..890805764
--- /dev/null
+++ b/src/tools/cephfs_mirror/Mirror.cc
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "include/types.h"
+#include "mon/MonClient.h"
+#include "msg/Messenger.h"
+#include "aio_utils.h"
+#include "Mirror.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::Mirror " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY("mirroring_failed");
+
+class SafeTimerSingleton : public CommonSafeTimer<ceph::mutex> {
+public:
+ ceph::mutex timer_lock = ceph::make_mutex("cephfs::mirror::timer_lock");
+
+ explicit SafeTimerSingleton(CephContext *cct)
+ : SafeTimer(cct, timer_lock, true) {
+ init();
+ }
+};
+
+class ThreadPoolSingleton : public ThreadPool {
+public:
+ ContextWQ *work_queue = nullptr;
+
+ explicit ThreadPoolSingleton(CephContext *cct)
+ : ThreadPool(cct, "Mirror::thread_pool", "tp_mirror", 1) {
+ work_queue = new ContextWQ("Mirror::work_queue", ceph::make_timespan(60), this);
+
+ start();
+ }
+};
+
+} // anonymous namespace
+
+struct Mirror::C_EnableMirroring : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+ uint64_t pool_id;
+
+ C_EnableMirroring(Mirror *mirror, const Filesystem &filesystem, uint64_t pool_id)
+ : mirror(mirror),
+ filesystem(filesystem),
+ pool_id(pool_id) {
+ }
+
+ void finish(int r) override {
+ enable_mirroring();
+ }
+
+ void enable_mirroring() {
+ Context *ctx = new C_CallbackAdapter<C_EnableMirroring,
+ &C_EnableMirroring::handle_enable_mirroring>(this);
+ mirror->enable_mirroring(filesystem, pool_id, ctx);
+ }
+
+ void handle_enable_mirroring(int r) {
+ mirror->handle_enable_mirroring(filesystem, r);
+ delete this;
+ }
+
+ // context needs to live post completion
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+struct Mirror::C_DisableMirroring : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+
+ C_DisableMirroring(Mirror *mirror, const Filesystem &filesystem)
+ : mirror(mirror),
+ filesystem(filesystem) {
+ }
+
+ void finish(int r) override {
+ disable_mirroring();
+ }
+
+ void disable_mirroring() {
+ Context *ctx = new C_CallbackAdapter<C_DisableMirroring,
+ &C_DisableMirroring::handle_disable_mirroring>(this);
+ mirror->disable_mirroring(filesystem, ctx);
+ }
+
+ void handle_disable_mirroring(int r) {
+ mirror->handle_disable_mirroring(filesystem, r);
+ delete this;
+ }
+
+ // context needs to live post completion
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+struct Mirror::C_PeerUpdate : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+ Peer peer;
+ bool remove = false;
+
+ C_PeerUpdate(Mirror *mirror, const Filesystem &filesystem,
+ const Peer &peer)
+ : mirror(mirror),
+ filesystem(filesystem),
+ peer(peer) {
+ }
+ C_PeerUpdate(Mirror *mirror, const Filesystem &filesystem,
+ const Peer &peer, bool remove)
+ : mirror(mirror),
+ filesystem(filesystem),
+ peer(peer),
+ remove(remove) {
+ }
+
+ void finish(int r) override {
+ if (remove) {
+ mirror->remove_peer(filesystem, peer);
+ } else {
+ mirror->add_peer(filesystem, peer);
+ }
+ }
+};
+
+struct Mirror::C_RestartMirroring : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+ uint64_t pool_id;
+ Peers peers;
+
+ C_RestartMirroring(Mirror *mirror, const Filesystem &filesystem,
+ uint64_t pool_id, const Peers &peers)
+ : mirror(mirror),
+ filesystem(filesystem),
+ pool_id(pool_id),
+ peers(peers) {
+ }
+
+ void finish(int r) override {
+ disable_mirroring();
+ }
+
+ void disable_mirroring() {
+ Context *ctx = new C_CallbackAdapter<C_RestartMirroring,
+ &C_RestartMirroring::handle_disable_mirroring>(this);
+ mirror->disable_mirroring(filesystem, ctx);
+ }
+
+ void handle_disable_mirroring(int r) {
+ enable_mirroring();
+ }
+
+ void enable_mirroring() {
+ std::scoped_lock locker(mirror->m_lock);
+ Context *ctx = new C_CallbackAdapter<C_RestartMirroring,
+ &C_RestartMirroring::handle_enable_mirroring>(this);
+ mirror->enable_mirroring(filesystem, pool_id, ctx, true);
+ }
+
+ void handle_enable_mirroring(int r) {
+ mirror->handle_enable_mirroring(filesystem, peers, r);
+ delete this;
+ }
+
+ // context needs to live post completion
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+Mirror::Mirror(CephContext *cct, const std::vector<const char*> &args,
+ MonClient *monc, Messenger *msgr)
+ : m_cct(cct),
+ m_args(args),
+ m_monc(monc),
+ m_msgr(msgr),
+ m_listener(this),
+ m_last_blocklist_check(ceph_clock_now()),
+ m_last_failure_check(ceph_clock_now()),
+ m_local(new librados::Rados()) {
+ auto thread_pool = &(cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+ "cephfs::mirror::thread_pool", false, cct));
+ auto safe_timer = &(cct->lookup_or_create_singleton_object<SafeTimerSingleton>(
+ "cephfs::mirror::safe_timer", false, cct));
+ m_thread_pool = thread_pool;
+ m_work_queue = thread_pool->work_queue;
+ m_timer = safe_timer;
+ m_timer_lock = &safe_timer->timer_lock;
+ std::scoped_lock timer_lock(*m_timer_lock);
+ schedule_mirror_update_task();
+}
+
+Mirror::~Mirror() {
+ dout(10) << dendl;
+ {
+ std::scoped_lock timer_lock(*m_timer_lock);
+ m_timer->shutdown();
+ }
+
+ m_work_queue->drain();
+ delete m_work_queue;
+ {
+ std::scoped_lock locker(m_lock);
+ m_thread_pool->stop();
+ }
+}
+
+int Mirror::init_mon_client() {
+ dout(20) << dendl;
+
+ m_monc->set_messenger(m_msgr);
+ m_msgr->add_dispatcher_head(m_monc);
+ m_monc->set_want_keys(CEPH_ENTITY_TYPE_MON);
+
+ int r = m_monc->init();
+ if (r < 0) {
+ derr << ": failed to init mon client: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = m_monc->authenticate(std::chrono::duration<double>(m_cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count());
+ if (r < 0) {
+ derr << ": failed to authenticate to monitor: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ client_t me = m_monc->get_global_id();
+ m_msgr->set_myname(entity_name_t::CLIENT(me.v));
+ return 0;
+}
+
+int Mirror::init(std::string &reason) {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+
+ int r = m_local->init_with_context(m_cct);
+ if (r < 0) {
+ derr << ": could not initialize rados handler" << dendl;
+ return r;
+ }
+
+ r = m_local->connect();
+ if (r < 0) {
+ derr << ": error connecting to local cluster" << dendl;
+ return r;
+ }
+
+ m_service_daemon = std::make_unique<ServiceDaemon>(m_cct, m_local);
+ r = m_service_daemon->init();
+ if (r < 0) {
+ derr << ": error registering service daemon: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = init_mon_client();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+void Mirror::shutdown() {
+ dout(20) << dendl;
+ m_stopping = true;
+ m_cluster_watcher->shutdown();
+ m_cond.notify_all();
+}
+
+void Mirror::reopen_logs() {
+ for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+ mirror_action.fs_mirror->reopen_logs();
+ }
+ g_ceph_context->reopen_logs();
+}
+
+void Mirror::handle_signal(int signum) {
+ dout(10) << ": signal=" << signum << dendl;
+
+ std::scoped_lock locker(m_lock);
+ switch (signum) {
+ case SIGHUP:
+ reopen_logs();
+ break;
+ case SIGINT:
+ case SIGTERM:
+ shutdown();
+ break;
+ default:
+ ceph_abort_msgf("unexpected signal %d", signum);
+ }
+}
+
+void Mirror::handle_enable_mirroring(const Filesystem &filesystem,
+ const Peers &peers, int r) {
+ dout(20) << ": filesystem=" << filesystem << ", peers=" << peers
+ << ", r=" << r << dendl;
+
+ std::scoped_lock locker(m_lock);
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.action_in_progress);
+
+ mirror_action.action_in_progress = false;
+ m_cond.notify_all();
+ if (r < 0) {
+ derr << ": failed to initialize FSMirror for filesystem=" << filesystem
+ << ": " << cpp_strerror(r) << dendl;
+ m_service_daemon->add_or_update_fs_attribute(filesystem.fscid,
+ SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY,
+ true);
+ return;
+ }
+
+ for (auto &peer : peers) {
+ mirror_action.fs_mirror->add_peer(peer);
+ }
+
+ dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl;
+}
+
+void Mirror::handle_enable_mirroring(const Filesystem &filesystem, int r) {
+ dout(20) << ": filesystem=" << filesystem << ", r=" << r << dendl;
+
+ std::scoped_lock locker(m_lock);
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.action_in_progress);
+
+ mirror_action.action_in_progress = false;
+ m_cond.notify_all();
+ if (r < 0) {
+ derr << ": failed to initialize FSMirror for filesystem=" << filesystem
+ << ": " << cpp_strerror(r) << dendl;
+ m_service_daemon->add_or_update_fs_attribute(filesystem.fscid,
+ SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY,
+ true);
+ return;
+ }
+
+ dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl;
+}
+
+void Mirror::enable_mirroring(const Filesystem &filesystem, uint64_t local_pool_id,
+ Context *on_finish, bool is_restart) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ if (is_restart) {
+ mirror_action.fs_mirror.reset();
+ } else {
+ ceph_assert(!mirror_action.action_in_progress);
+ }
+
+ ceph_assert(!mirror_action.fs_mirror);
+
+ dout(10) << ": starting FSMirror: filesystem=" << filesystem << dendl;
+
+ mirror_action.action_in_progress = true;
+ mirror_action.fs_mirror = std::make_unique<FSMirror>(m_cct, filesystem, local_pool_id,
+ m_service_daemon.get(), m_args, m_work_queue);
+ mirror_action.fs_mirror->init(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void Mirror::mirroring_enabled(const Filesystem &filesystem, uint64_t local_pool_id) {
+ dout(10) << ": filesystem=" << filesystem << ", pool_id=" << local_pool_id << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ return;
+ }
+
+ auto p = m_mirror_actions.emplace(filesystem, MirrorAction(local_pool_id));
+ auto &mirror_action = p.first->second;
+ mirror_action.action_ctxs.push_back(new C_EnableMirroring(this, filesystem, local_pool_id));
+}
+
+void Mirror::handle_disable_mirroring(const Filesystem &filesystem, int r) {
+ dout(10) << ": filesystem=" << filesystem << ", r=" << r << dendl;
+
+ std::scoped_lock locker(m_lock);
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+
+ if (!mirror_action.fs_mirror->is_init_failed()) {
+ ceph_assert(mirror_action.action_in_progress);
+ mirror_action.action_in_progress = false;
+ m_cond.notify_all();
+ }
+
+ if (!m_stopping) {
+ mirror_action.fs_mirror.reset();
+ if (mirror_action.action_ctxs.empty()) {
+ dout(10) << ": no pending actions for filesystem=" << filesystem << dendl;
+ m_mirror_actions.erase(filesystem);
+ }
+ }
+}
+
+void Mirror::disable_mirroring(const Filesystem &filesystem, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.fs_mirror);
+ ceph_assert(!mirror_action.action_in_progress);
+
+ if (mirror_action.fs_mirror->is_init_failed()) {
+ dout(10) << ": init failed for filesystem=" << filesystem << dendl;
+ m_work_queue->queue(on_finish, -EINVAL);
+ return;
+ }
+
+ mirror_action.action_in_progress = true;
+ mirror_action.fs_mirror->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void Mirror::mirroring_disabled(const Filesystem &filesystem) {
+ dout(10) << ": filesystem=" << filesystem << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ dout(5) << "shutting down" << dendl;
+ return;
+ }
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ mirror_action.action_ctxs.push_back(new C_DisableMirroring(this, filesystem));
+}
+
+void Mirror::add_peer(const Filesystem &filesystem, const Peer &peer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.fs_mirror);
+ ceph_assert(!mirror_action.action_in_progress);
+
+ mirror_action.fs_mirror->add_peer(peer);
+}
+
+void Mirror::peer_added(const Filesystem &filesystem, const Peer &peer) {
+ dout(20) << ": filesystem=" << filesystem << ", peer=" << peer << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ dout(5) << "shutting down" << dendl;
+ return;
+ }
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ mirror_action.action_ctxs.push_back(new C_PeerUpdate(this, filesystem, peer));
+}
+
+void Mirror::remove_peer(const Filesystem &filesystem, const Peer &peer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.fs_mirror);
+ ceph_assert(!mirror_action.action_in_progress);
+
+ mirror_action.fs_mirror->remove_peer(peer);
+}
+
+void Mirror::peer_removed(const Filesystem &filesystem, const Peer &peer) {
+ dout(20) << ": filesystem=" << filesystem << ", peer=" << peer << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ dout(5) << "shutting down" << dendl;
+ return;
+ }
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ mirror_action.action_ctxs.push_back(new C_PeerUpdate(this, filesystem, peer, true));
+}
+
+void Mirror::update_fs_mirrors() {
+ dout(20) << dendl;
+
+ auto now = ceph_clock_now();
+ double blocklist_interval = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_restart_mirror_on_blocklist_interval").count();
+ bool check_blocklist = blocklist_interval > 0 && ((now - m_last_blocklist_check) >= blocklist_interval);
+
+ double failed_interval = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_restart_mirror_on_failure_interval").count();
+ bool check_failure = failed_interval > 0 && ((now - m_last_failure_check) >= failed_interval);
+
+ {
+ std::scoped_lock locker(m_lock);
+ for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+ auto failed = mirror_action.fs_mirror && mirror_action.fs_mirror->is_failed();
+ auto blocklisted = mirror_action.fs_mirror && mirror_action.fs_mirror->is_blocklisted();
+
+ if (check_failure && !mirror_action.action_in_progress && failed) {
+ // about to restart failed mirror instance -- nothing
+ // should interfere
+ dout(5) << ": filesystem=" << filesystem << " failed mirroring -- restarting" << dendl;
+ auto peers = mirror_action.fs_mirror->get_peers();
+ auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers);
+ ctx->complete(0);
+ } else if (check_blocklist && !mirror_action.action_in_progress && blocklisted) {
+ // about to restart blocklisted mirror instance -- nothing
+ // should interfere
+ dout(5) << ": filesystem=" << filesystem << " is blocklisted -- restarting" << dendl;
+ auto peers = mirror_action.fs_mirror->get_peers();
+ auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers);
+ ctx->complete(0);
+ }
+ if (!failed && !blocklisted && !mirror_action.action_ctxs.empty()
+ && !mirror_action.action_in_progress) {
+ auto ctx = std::move(mirror_action.action_ctxs.front());
+ mirror_action.action_ctxs.pop_front();
+ ctx->complete(0);
+ }
+ }
+
+ if (check_blocklist) {
+ m_last_blocklist_check = now;
+ }
+ if (check_failure) {
+ m_last_failure_check = now;
+ }
+ }
+
+ schedule_mirror_update_task();
+}
+
+void Mirror::schedule_mirror_update_task() {
+ ceph_assert(m_timer_task == nullptr);
+ ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
+
+ m_timer_task = new LambdaContext([this](int _) {
+ m_timer_task = nullptr;
+ update_fs_mirrors();
+ });
+ double after = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_action_update_interval").count();
+ dout(20) << ": scheduling fs mirror update (" << m_timer_task << ") after "
+ << after << " seconds" << dendl;
+ m_timer->add_event_after(after, m_timer_task);
+}
+
+void Mirror::run() {
+ dout(20) << dendl;
+
+ std::unique_lock locker(m_lock);
+ m_cluster_watcher.reset(new ClusterWatcher(m_cct, m_monc, m_service_daemon.get(), m_listener));
+ m_msgr->add_dispatcher_tail(m_cluster_watcher.get());
+
+ m_cluster_watcher->init();
+ m_cond.wait(locker, [this]{return m_stopping;});
+
+ locker.unlock();
+ {
+ std::scoped_lock timer_lock(*m_timer_lock);
+ if (m_timer_task != nullptr) {
+ dout(10) << ": canceling timer task=" << m_timer_task << dendl;
+ m_timer->cancel_event(m_timer_task);
+ m_timer_task = nullptr;
+ }
+ }
+ locker.lock();
+
+ for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+ dout(10) << ": trying to shutdown filesystem=" << filesystem << dendl;
+ // wait for in-progress action and shutdown
+ m_cond.wait(locker, [&mirror_action=mirror_action]
+ {return !mirror_action.action_in_progress;});
+ if (mirror_action.fs_mirror &&
+ !mirror_action.fs_mirror->is_stopping() &&
+ !mirror_action.fs_mirror->is_init_failed()) {
+ C_SaferCond cond;
+ mirror_action.fs_mirror->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, &cond));
+ int r = cond.wait();
+ dout(10) << ": shutdown filesystem=" << filesystem << ", r=" << r << dendl;
+ }
+
+ mirror_action.fs_mirror.reset();
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
+
diff --git a/src/tools/cephfs_mirror/Mirror.h b/src/tools/cephfs_mirror/Mirror.h
new file mode 100644
index 000000000..f0ffdd516
--- /dev/null
+++ b/src/tools/cephfs_mirror/Mirror.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_H
+#define CEPHFS_MIRROR_H
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "common/ceph_mutex.h"
+#include "common/WorkQueue.h"
+#include "mds/FSMap.h"
+#include "ClusterWatcher.h"
+#include "FSMirror.h"
+#include "ServiceDaemon.h"
+#include "Types.h"
+
+class Messenger;
+class MonClient;
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// this wraps up ClusterWatcher and FSMirrors to implement mirroring
+// for ceph filesystems.
+
+class Mirror {
+public:
+ Mirror(CephContext *cct, const std::vector<const char*> &args,
+ MonClient *monc, Messenger *msgr);
+ ~Mirror();
+
+ int init(std::string &reason);
+ void shutdown();
+ void run();
+
+ void handle_signal(int signum);
+
+private:
+ static constexpr std::string_view MIRRORING_MODULE = "mirroring";
+
+ struct C_EnableMirroring;
+ struct C_DisableMirroring;
+ struct C_PeerUpdate;
+ struct C_RestartMirroring;
+
+ struct ClusterListener : ClusterWatcher::Listener {
+ Mirror *mirror;
+
+ ClusterListener(Mirror *mirror)
+ : mirror(mirror) {
+ }
+
+ void handle_mirroring_enabled(const FilesystemSpec &spec) override {
+ mirror->mirroring_enabled(spec.filesystem, spec.pool_id);
+ }
+
+ void handle_mirroring_disabled(const Filesystem &filesystem) override {
+ mirror->mirroring_disabled(filesystem);
+ }
+
+ void handle_peers_added(const Filesystem &filesystem, const Peer &peer) override {
+ mirror->peer_added(filesystem, peer);
+ }
+
+ void handle_peers_removed(const Filesystem &filesystem, const Peer &peer) override {
+ mirror->peer_removed(filesystem, peer);
+ }
+ };
+
+ struct MirrorAction {
+ MirrorAction(uint64_t pool_id) :
+ pool_id(pool_id) {
+ }
+
+ uint64_t pool_id; // for restarting blocklisted mirror instance
+ bool action_in_progress = false;
+ std::list<Context *> action_ctxs;
+ std::unique_ptr<FSMirror> fs_mirror;
+ };
+
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::Mirror");
+ ceph::condition_variable m_cond;
+
+ CephContext *m_cct;
+ std::vector<const char *> m_args;
+ MonClient *m_monc;
+ Messenger *m_msgr;
+ ClusterListener m_listener;
+
+ ThreadPool *m_thread_pool = nullptr;
+ ContextWQ *m_work_queue = nullptr;
+ SafeTimer *m_timer = nullptr;
+ ceph::mutex *m_timer_lock = nullptr;
+ Context *m_timer_task = nullptr;
+
+ bool m_stopping = false;
+ std::unique_ptr<ClusterWatcher> m_cluster_watcher;
+ std::map<Filesystem, MirrorAction> m_mirror_actions;
+
+ utime_t m_last_blocklist_check;
+ utime_t m_last_failure_check;
+
+ RadosRef m_local;
+ std::unique_ptr<ServiceDaemon> m_service_daemon;
+
+ int init_mon_client();
+
+ // called via listener
+ void mirroring_enabled(const Filesystem &filesystem, uint64_t local_pool_id);
+ void mirroring_disabled(const Filesystem &filesystem);
+ void peer_added(const Filesystem &filesystem, const Peer &peer);
+ void peer_removed(const Filesystem &filesystem, const Peer &peer);
+
+ // mirror enable callback
+ void enable_mirroring(const Filesystem &filesystem, uint64_t local_pool_id,
+ Context *on_finish, bool is_restart=false);
+ void handle_enable_mirroring(const Filesystem &filesystem, int r);
+ void handle_enable_mirroring(const Filesystem &filesystem, const Peers &peers, int r);
+
+ // mirror disable callback
+ void disable_mirroring(const Filesystem &filesystem, Context *on_finish);
+ void handle_disable_mirroring(const Filesystem &filesystem, int r);
+
+ // peer update callback
+ void add_peer(const Filesystem &filesystem, const Peer &peer);
+ void remove_peer(const Filesystem &filesystem, const Peer &peer);
+
+ void schedule_mirror_update_task();
+ void update_fs_mirrors();
+
+ void reopen_logs();
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_H
diff --git a/src/tools/cephfs_mirror/MirrorWatcher.cc b/src/tools/cephfs_mirror/MirrorWatcher.cc
new file mode 100644
index 000000000..26b88d077
--- /dev/null
+++ b/src/tools/cephfs_mirror/MirrorWatcher.cc
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "msg/Messenger.h"
+#include "aio_utils.h"
+#include "MirrorWatcher.h"
+#include "FSMirror.h"
+#include "Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::MirrorWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+MirrorWatcher::MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+ ContextWQ *work_queue)
+ : Watcher(ioctx, CEPHFS_MIRROR_OBJECT, work_queue),
+ m_ioctx(ioctx),
+ m_fs_mirror(fs_mirror),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_mutex("cephfs::mirror::mirror_watcher")),
+ m_instance_id(stringify(m_ioctx.get_instance_id())) {
+}
+
+MirrorWatcher::~MirrorWatcher() {
+}
+
+void MirrorWatcher::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_init_finish == nullptr);
+ m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+ on_finish->complete(r);
+ if (m_on_shutdown_finish != nullptr) {
+ m_on_shutdown_finish->complete(0);
+ }
+ });
+ }
+
+ register_watcher();
+}
+
+void MirrorWatcher::shutdown(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_shutdown_finish == nullptr);
+ if (m_on_init_finish != nullptr) {
+ dout(10) << ": delaying shutdown -- init in progress" << dendl;
+ m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+ m_on_shutdown_finish = nullptr;
+ shutdown(on_finish);
+ });
+ return;
+ }
+
+ m_on_shutdown_finish = on_finish;
+ }
+
+ unregister_watcher();
+}
+
+void MirrorWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ dout(20) << dendl;
+
+ JSONFormatter f;
+ f.open_object_section("info");
+ encode_json("addr", m_fs_mirror->get_instance_addr(), &f);
+ f.close_section();
+
+ bufferlist outbl;
+ f.flush(outbl);
+ acknowledge_notify(notify_id, handle, outbl);
+}
+
+void MirrorWatcher::handle_rewatch_complete(int r) {
+ dout(5) << ": r=" << r << dendl;
+
+ if (r == -EBLOCKLISTED) {
+ dout(0) << ": client blocklisted" <<dendl;
+ std::scoped_lock locker(m_lock);
+ m_blocklisted = true;
+ } else if (r == -ENOENT) {
+ derr << ": mirroring object deleted" << dendl;
+ m_failed = true;
+ } else if (r < 0) {
+ derr << ": rewatch error: " << cpp_strerror(r) << dendl;
+ m_failed = true;
+ }
+}
+
+void MirrorWatcher::register_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ MirrorWatcher, &MirrorWatcher::handle_register_watcher>(this);
+ register_watch(on_finish);
+}
+
+void MirrorWatcher::handle_register_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+
+ on_init_finish->complete(r);
+}
+
+void MirrorWatcher::unregister_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ MirrorWatcher, &MirrorWatcher::handle_unregister_watcher>(this);
+ unregister_watch(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void MirrorWatcher::handle_unregister_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_shutdown_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+
+ on_shutdown_finish->complete(r);
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/MirrorWatcher.h b/src/tools/cephfs_mirror/MirrorWatcher.h
new file mode 100644
index 000000000..c4d4f4522
--- /dev/null
+++ b/src/tools/cephfs_mirror/MirrorWatcher.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_MIRROR_WATCHER_H
+#define CEPHFS_MIRROR_MIRROR_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "Watcher.h"
+
+class ContextWQ;
+class Messenger;
+
+namespace cephfs {
+namespace mirror {
+
+class FSMirror;
+
+// watch for notifications via the cephfs_mirror object (in the
+// metadata pool). this is used for sending keepalives, with the
+// keepalive payload being the rados instance address (used by the
+// manager module to blocklist when needed).
+
+class MirrorWatcher : public Watcher {
+public:
+ static MirrorWatcher *create(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+ ContextWQ *work_queue) {
+ return new MirrorWatcher(ioctx, fs_mirror, work_queue);
+ }
+
+ MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+ ContextWQ *work_queue);
+ ~MirrorWatcher();
+
+ void init(Context *on_finish);
+ void shutdown(Context *on_finish);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) override;
+ void handle_rewatch_complete(int r) override;
+
+ bool is_blocklisted() {
+ std::scoped_lock locker(m_lock);
+ return m_blocklisted;
+ }
+
+ bool is_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_failed;
+ }
+
+private:
+ librados::IoCtx &m_ioctx;
+ FSMirror *m_fs_mirror;
+ ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock;
+ std::string m_instance_id;
+
+ Context *m_on_init_finish = nullptr;
+ Context *m_on_shutdown_finish = nullptr;
+
+ bool m_blocklisted = false;
+ bool m_failed = false;
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+
+ void unregister_watcher();
+ void handle_unregister_watcher(int r);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_MIRROR_WATCHER_H
diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc
new file mode 100644
index 000000000..aaf97b868
--- /dev/null
+++ b/src/tools/cephfs_mirror/PeerReplayer.cc
@@ -0,0 +1,1552 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stack>
+#include <fcntl.h>
+#include <algorithm>
+#include <sys/time.h>
+#include <sys/file.h>
+#include <boost/scope_exit.hpp>
+
+#include "common/admin_socket.h"
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "FSMirror.h"
+#include "PeerReplayer.h"
+#include "Utils.h"
+
+#include "json_spirit/json_spirit.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::PeerReplayer(" \
+ << m_peer.uuid << ") " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string PEER_CONFIG_KEY_PREFIX = "cephfs/mirror/peer";
+
+std::string snapshot_dir_path(CephContext *cct, const std::string &path) {
+  // path of the (hidden) snapshot directory under a directory root,
+  // e.g. "<path>/.snap" (name taken from the client_snapdir config).
+  std::string snap_dir{path};
+  snap_dir.append("/").append(cct->_conf->client_snapdir);
+  return snap_dir;
+}
+
+std::string snapshot_path(const std::string &snap_dir, const std::string &snap_name) {
+  // "<snap_dir>/<snap_name>"
+  std::string p{snap_dir};
+  p.append("/").append(snap_name);
+  return p;
+}
+
+std::string snapshot_path(CephContext *cct, const std::string &path, const std::string &snap_name) {
+  // full path of a named snapshot: "<path>/<client_snapdir>/<snap_name>".
+  // reuse the helpers above instead of duplicating their concatenation
+  // logic -- keeps the snapdir handling in one place.
+  return snapshot_path(snapshot_dir_path(cct, path), snap_name);
+}
+
+std::string entry_path(const std::string &dir, const std::string &name) {
+  // join a directory path and an entry name with a '/' separator
+  std::string joined{dir};
+  joined.append("/").append(name);
+  return joined;
+}
+
+std::map<std::string, std::string> decode_snap_metadata(snap_metadata *snap_metadata,
+                                                        size_t nr_snap_metadata) {
+  // convert a raw snap_metadata array (key/value C strings) into a map
+  std::map<std::string, std::string> metadata;
+  for (auto *entry = snap_metadata; entry != snap_metadata + nr_snap_metadata; ++entry) {
+    metadata.emplace(entry->key, entry->value);
+  }
+  return metadata;
+}
+
+std::string peer_config_key(const std::string &fs_name, const std::string &uuid) {
+  // mon config-key under which remote-peer connection details are stored:
+  // "cephfs/mirror/peer/<fs_name>/<uuid>"
+  std::string key{PEER_CONFIG_KEY_PREFIX};
+  key.append("/").append(fs_name).append("/").append(uuid);
+  return key;
+}
+
+// interface for admin-socket commands registered by the peer replayer
+class PeerAdminSocketCommand {
+public:
+ virtual ~PeerAdminSocketCommand() {
+ }
+ // execute the command, dumping output via the formatter
+ virtual int call(Formatter *f) = 0;
+};
+
+// admin-socket command that dumps the replayer's peer status
+class StatusCommand : public PeerAdminSocketCommand {
+public:
+  explicit StatusCommand(PeerReplayer *replayer)
+    : m_replayer(replayer) {
+  }
+
+  int call(Formatter *f) override {
+    m_replayer->peer_status(f);
+    return 0;
+  }
+
+private:
+  PeerReplayer *m_replayer;
+};
+
+// helper to open a directory relative to a file descriptor
+// open a directory (as a ceph_dir_result) relative to an open directory fd.
+// returns 0 and sets *dirp on success, negative errno otherwise.
+int opendirat(MountRef mnt, int dirfd, const std::string &relpath, int flags,
+              ceph_dir_result **dirp) {
+  int fd = ceph_openat(mnt, dirfd, relpath.c_str(), flags, 0);
+  if (fd < 0) {
+    return fd;
+  }
+
+  // the fd is handed to ceph_fdopendir and is no longer needed after
+  // that call (success or failure), so close it unconditionally.
+  int r = ceph_fdopendir(mnt, fd, dirp);
+  ceph_close(mnt, fd);
+  return r;
+}
+
+} // anonymous namespace
+
+// registers per-peer admin socket commands and dispatches them to the
+// owning PeerReplayer
+class PeerReplayerAdminSocketHook : public AdminSocketHook {
+public:
+ PeerReplayerAdminSocketHook(CephContext *cct, const Filesystem &filesystem,
+ const Peer &peer, PeerReplayer *peer_replayer)
+ : admin_socket(cct->get_admin_socket()) {
+ int r;
+ std::string cmd;
+
+ // mirror peer status format is name@id uuid
+ cmd = "fs mirror peer status "
+ + stringify(filesystem.fs_name) + "@" + stringify(filesystem.fscid)
+ + " "
+ + stringify(peer.uuid);
+ r = admin_socket->register_command(
+ cmd, this, "get peer mirror status");
+ if (r == 0) {
+ // only track the command if registration succeeded; a failed
+ // registration is silently skipped
+ commands[cmd] = new StatusCommand(peer_replayer);
+ }
+ }
+
+ ~PeerReplayerAdminSocketHook() override {
+ // unregister before deleting handlers so no dispatch can race with
+ // the deletes
+ admin_socket->unregister_commands(this);
+ for (auto &[command, cmdptr] : commands) {
+ delete cmdptr;
+ }
+ }
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f, std::ostream &errss, bufferlist &out) override {
+ // NOTE(review): commands.at() throws if an unknown command arrives;
+ // assumes the admin socket only dispatches commands registered above
+ auto p = commands.at(std::string(command));
+ return p->call(f);
+ }
+
+private:
+ typedef std::map<std::string, PeerAdminSocketCommand*, std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+// construct a replayer for a single (filesystem, peer) pair; registers
+// the admin socket hook and resets the peer's sync counters reported
+// via the service daemon
+PeerReplayer::PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
+ RadosRef local_cluster, const Filesystem &filesystem,
+ const Peer &peer, const std::set<std::string, std::less<>> &directories,
+ MountRef mount, ServiceDaemon *service_daemon)
+ : m_cct(cct),
+ m_fs_mirror(fs_mirror),
+ m_local_cluster(local_cluster),
+ m_filesystem(filesystem),
+ m_peer(peer),
+ m_directories(directories.begin(), directories.end()),
+ m_local_mount(mount),
+ m_service_daemon(service_daemon),
+ m_asok_hook(new PeerReplayerAdminSocketHook(cct, filesystem, peer, this)),
+ m_lock(ceph::make_mutex("cephfs::mirror::PeerReplayer::" + stringify(peer.uuid))) {
+ // reset sync stats sent via service daemon
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_FAILED_DIR_COUNT_KEY, (uint64_t)0);
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY, (uint64_t)0);
+}
+
+PeerReplayer::~PeerReplayer() {
+ // deleting the hook unregisters the admin socket commands
+ delete m_asok_hook;
+}
+
+// connect and mount the remote peer cluster/filesystem and spawn the
+// snapshot replayer threads. returns 0 on success, negative errno on
+// failure (nothing is left mounted on failure).
+int PeerReplayer::init() {
+ dout(20) << ": initial dir list=[" << m_directories << "]" << dendl;
+ for (auto &dir_root : m_directories) {
+ m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
+ }
+
+ auto &remote_client = m_peer.remote.client_name;
+ auto &remote_cluster = m_peer.remote.cluster_name;
+ auto remote_filesystem = Filesystem{0, m_peer.remote.fs_name};
+
+ // fetch (optional) bootstrap details -- mon host + cephx key -- stored
+ // under the peer's config-key by the manager module
+ std::string key = peer_config_key(m_filesystem.fs_name, m_peer.uuid);
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config-key get\", "
+ "\"key\": \"" + key + "\""
+ "}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ int r = m_local_cluster->mon_command(cmd, in_bl, &out_bl, nullptr);
+ dout(5) << ": mon command r=" << r << dendl;
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ // -ENOENT simply means no bootstrap config-key exists; fall back to
+ // connecting with empty mon_host/cephx_key
+ std::string mon_host;
+ std::string cephx_key;
+ if (!r) {
+ json_spirit::mValue root;
+ if (!json_spirit::read(out_bl.to_str(), root)) {
+ derr << ": invalid config-key JSON" << dendl;
+ return -EBADMSG;
+ }
+ try {
+ auto &root_obj = root.get_obj();
+ mon_host = root_obj.at("mon_host").get_str();
+ cephx_key = root_obj.at("key").get_str();
+ dout(0) << ": remote monitor host=" << mon_host << dendl;
+ } catch (std::runtime_error&) {
+ derr << ": unexpected JSON received" << dendl;
+ return -EBADMSG;
+ }
+ }
+
+ r = connect(remote_client, remote_cluster, &m_remote_cluster, mon_host, cephx_key);
+ if (r < 0) {
+ derr << ": error connecting to remote cluster: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = mount(m_remote_cluster, remote_filesystem, false, &m_remote_mount);
+ if (r < 0) {
+ // drop the cluster handle so a failed init leaves no remote state
+ m_remote_cluster.reset();
+ derr << ": error mounting remote filesystem=" << remote_filesystem << dendl;
+ return r;
+ }
+
+ std::scoped_lock locker(m_lock);
+ auto nr_replayers = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_max_concurrent_directory_syncs");
+ dout(20) << ": spawning " << nr_replayers << " snapshot replayer(s)" << dendl;
+
+ while (nr_replayers-- > 0) {
+ std::unique_ptr<SnapshotReplayerThread> replayer(
+ new SnapshotReplayerThread(this));
+ std::string name("replayer-" + stringify(nr_replayers));
+ replayer->create(name.c_str());
+ m_replayers.push_back(std::move(replayer));
+ }
+
+ return 0;
+}
+
+// stop all replayer threads and unmount/release the remote cluster.
+// must be called at most once (asserts !m_stopping).
+void PeerReplayer::shutdown() {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(!m_stopping);
+ m_stopping = true;
+ // wake up replayer threads waiting on m_cond so they observe
+ // m_stopping and exit
+ m_cond.notify_all();
+ }
+
+ // join outside the lock -- replayers take m_lock themselves
+ for (auto &replayer : m_replayers) {
+ replayer->join();
+ }
+ m_replayers.clear();
+ ceph_unmount(m_remote_mount);
+ ceph_release(m_remote_mount);
+ m_remote_mount = nullptr;
+ m_remote_cluster.reset();
+}
+
+// start mirroring a directory root; wakes up replayer threads so one of
+// them can pick it up.
+void PeerReplayer::add_directory(string_view dir_root) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+
+  std::scoped_lock locker(m_lock);
+  m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
+  m_directories.emplace_back(dir_root);
+  m_cond.notify_all();
+}
+
+// stop mirroring a directory root. if a replayer is mid-sync on it, the
+// registration is flagged canceled and stats are kept until the replayer
+// deregisters; otherwise stats are dropped immediately.
+void PeerReplayer::remove_directory(string_view dir_root) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+  auto dpath = std::string(dir_root);
+
+  std::scoped_lock locker(m_lock);
+  auto dit = std::find(m_directories.begin(), m_directories.end(), dpath);
+  if (dit != m_directories.end()) {
+    m_directories.erase(dit);
+  }
+
+  auto rit = m_registered.find(dpath);
+  if (rit == m_registered.end()) {
+    m_snap_sync_stats.erase(dpath);
+  } else {
+    rit->second.canceled = true;
+  }
+  m_cond.notify_all();
+}
+
+// pick the next unregistered directory root to sync, skipping entries
+// whose last failure is more recent than the retry interval. the list is
+// rotated by one so selection is round-robin across calls. returns
+// boost::none when nothing is eligible.
+boost::optional<std::string> PeerReplayer::pick_directory() {
+  dout(20) << dendl;
+
+  auto now = clock::now();
+  auto retry_timo = g_ceph_context->_conf.get_val<uint64_t>(
+    "cephfs_mirror_retry_failed_directories_interval");
+
+  boost::optional<std::string> candidate;
+  for (auto &dir_root : m_directories) {
+    auto &sync_stat = m_snap_sync_stats.at(dir_root);
+    if (sync_stat.failed) {
+      std::chrono::duration<double> d = now - *sync_stat.last_failed;
+      if (d.count() < retry_timo) {
+        continue;
+      }
+    }
+    if (!m_registered.count(dir_root)) {
+      candidate = dir_root;
+      break;
+    }
+  }
+
+  // guard the rotate: begin()+1 is past-the-end for an empty vector,
+  // which makes std::rotate undefined behavior.
+  if (!m_directories.empty()) {
+    std::rotate(m_directories.begin(), m_directories.begin() + 1, m_directories.end());
+  }
+  return candidate;
+}
+
+// claim a directory root for the given replayer thread by taking the
+// remote lock; on success the registration is recorded in m_registered.
+int PeerReplayer::register_directory(const std::string &dir_root,
+                                     SnapshotReplayerThread *replayer) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+  ceph_assert(m_registered.find(dir_root) == m_registered.end());
+
+  DirRegistry registry;
+  if (int r = try_lock_directory(dir_root, replayer, &registry); r < 0) {
+    return r;
+  }
+
+  dout(5) << ": dir_root=" << dir_root << " registered with replayer="
+          << replayer << dendl;
+  m_registered.emplace(dir_root, std::move(registry));
+  return 0;
+}
+
+// release a previously registered directory root: unlock the remote
+// dir_root and, if the directory was removed while syncing, drop its
+// stats as well
+void PeerReplayer::unregister_directory(const std::string &dir_root) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ auto it = m_registered.find(dir_root);
+ ceph_assert(it != m_registered.end());
+
+ unlock_directory(it->first, it->second);
+ m_registered.erase(it);
+ // stats survive only for directories still in the mirror list
+ if (std::find(m_directories.begin(), m_directories.end(), dir_root) == m_directories.end()) {
+ m_snap_sync_stats.erase(dir_root);
+ }
+}
+
+// open (creating if necessary) the remote dir_root and take a
+// non-blocking exclusive flock on it so only one mirror daemon syncs it
+// at a time. on success the open fd and owning replayer are stored in
+// *registry; the fd stays open for the lifetime of the registration.
+int PeerReplayer::try_lock_directory(const std::string &dir_root,
+ SnapshotReplayerThread *replayer, DirRegistry *registry) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ int r = ceph_open(m_remote_mount, dir_root.c_str(), O_RDONLY | O_DIRECTORY, 0);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to open remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (r == -ENOENT) {
+ // we snap under dir_root, so mode does not matter much
+ r = ceph_mkdirs(m_remote_mount, dir_root.c_str(), 0755);
+ if (r < 0) {
+ derr << ": failed to create remote directory=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = ceph_open(m_remote_mount, dir_root.c_str(), O_RDONLY | O_DIRECTORY, 0);
+ if (r < 0) {
+ derr << ": failed to open remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ int fd = r;
+ // LOCK_NB: -EWOULDBLOCK means another mirror daemon holds the lock
+ r = ceph_flock(m_remote_mount, fd, LOCK_EX | LOCK_NB, (uint64_t)replayer->get_thread_id());
+ if (r != 0) {
+ if (r == -EWOULDBLOCK) {
+ dout(5) << ": dir_root=" << dir_root << " is locked by cephfs-mirror, "
+ << "will retry again" << dendl;
+ } else {
+ derr << ": failed to lock dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ }
+
+ if (ceph_close(m_remote_mount, fd) < 0) {
+ derr << ": failed to close (cleanup) remote dir_root=" << dir_root << ": "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+
+ dout(10) << ": dir_root=" << dir_root << " locked" << dendl;
+
+ registry->fd = fd;
+ registry->replayer = replayer;
+ return 0;
+}
+
+// drop the exclusive lock on a remote dir_root and close its fd.
+// best-effort: failures are logged but not propagated.
+void PeerReplayer::unlock_directory(const std::string &dir_root, const DirRegistry &registry) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+
+  int r = ceph_flock(m_remote_mount, registry.fd, LOCK_UN,
+                     (uint64_t)registry.replayer->get_thread_id());
+  if (r < 0) {
+    derr << ": failed to unlock remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+         << dendl;
+    // fall through and close the fd anyway -- returning early here would
+    // leak the descriptor, and closing it releases the flock as well.
+  }
+
+  r = ceph_close(m_remote_mount, registry.fd);
+  if (r < 0) {
+    derr << ": failed to close remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+         << dendl;
+  }
+
+  dout(10) << ": dir_root=" << dir_root << " unlocked" << dendl;
+}
+
+// enumerate snapshots under dir_root's snapdir and fill *snap_map keyed
+// by snap-id. for remote snapshots the id is taken from the
+// PRIMARY_SNAP_ID_KEY metadata (the primary's id), so local and remote
+// maps can be compared. a missing remote snapdir is treated as "no
+// snapshots" (returns 0).
+int PeerReplayer::build_snap_map(const std::string &dir_root,
+ std::map<uint64_t, std::string> *snap_map, bool is_remote) {
+ auto snap_dir = snapshot_dir_path(m_cct, dir_root);
+ dout(20) << ": dir_root=" << dir_root << ", snap_dir=" << snap_dir
+ << ", is_remote=" << is_remote << dendl;
+
+ auto lr_str = is_remote ? "remote" : "local";
+ auto mnt = is_remote ? m_remote_mount : m_local_mount;
+
+ ceph_dir_result *dirp = nullptr;
+ int r = ceph_opendir(mnt, snap_dir.c_str(), &dirp);
+ if (r < 0) {
+ if (is_remote && r == -ENOENT) {
+ return 0;
+ }
+ derr << ": failed to open " << lr_str << " snap directory=" << snap_dir
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // names starting with '_' are skipped (along with "." / "..")
+ std::set<std::string> snaps;
+ auto entry = ceph_readdir(mnt, dirp);
+ while (entry != NULL) {
+ auto d_name = std::string(entry->d_name);
+ dout(20) << ": entry=" << d_name << dendl;
+ if (d_name != "." && d_name != ".." && d_name.rfind("_", 0) != 0) {
+ snaps.emplace(d_name);
+ }
+
+ entry = ceph_readdir(mnt, dirp);
+ }
+
+ int rv = 0;
+ for (auto &snap : snaps) {
+ snap_info info;
+ auto snap_path = snapshot_path(snap_dir, snap);
+ r = ceph_get_snap_info(mnt, snap_path.c_str(), &info);
+ if (r < 0) {
+ derr << ": failed to fetch " << lr_str << " snap info for snap_path=" << snap_path
+ << ": " << cpp_strerror(r) << dendl;
+ rv = r;
+ break;
+ }
+
+ uint64_t snap_id;
+ if (is_remote) {
+ if (!info.nr_snap_metadata) {
+ derr << ": snap_path=" << snap_path << " has invalid metadata in remote snapshot"
+ << dendl;
+ rv = -EINVAL;
+ } else {
+ auto metadata = decode_snap_metadata(info.snap_metadata, info.nr_snap_metadata);
+ dout(20) << ": snap_path=" << snap_path << ", metadata=" << metadata << dendl;
+ auto it = metadata.find(PRIMARY_SNAP_ID_KEY);
+ if (it == metadata.end()) {
+ derr << ": snap_path=" << snap_path << " has missing \"" << PRIMARY_SNAP_ID_KEY
+ << "\" in metadata" << dendl;
+ rv = -EINVAL;
+ } else {
+ // NOTE(review): std::stoull throws on non-numeric metadata --
+ // malformed remote metadata would abort; confirm upstream handling
+ snap_id = std::stoull(it->second);
+ }
+ ceph_free_snap_info_buffer(&info);
+ }
+ } else {
+ snap_id = info.id;
+ }
+
+ // snap_id is only consumed when rv == 0, i.e. when it was assigned above
+ if (rv != 0) {
+ break;
+ }
+ snap_map->emplace(snap_id, snap);
+ }
+
+ r = ceph_closedir(mnt, dirp);
+ if (r < 0) {
+ derr << ": failed to close " << lr_str << " snap directory=" << snap_dir
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+ dout(10) << ": " << lr_str << " snap_map=" << *snap_map << dendl;
+ return rv;
+}
+
+// delete the given snapshots on the remote dir_root, bumping the
+// deleted-snap counter per removal. stops at the first failure.
+int PeerReplayer::propagate_snap_deletes(const std::string &dir_root,
+                                         const std::set<std::string> &snaps) {
+  dout(5) << ": dir_root=" << dir_root << ", deleted snapshots=" << snaps << dendl;
+
+  for (auto &snap : snaps) {
+    dout(20) << ": deleting dir_root=" << dir_root << ", snapshot=" << snap
+             << dendl;
+    int r = ceph_rmsnap(m_remote_mount, dir_root.c_str(), snap.c_str());
+    if (r < 0) {
+      // log the snapshot that actually failed (not the whole set)
+      derr << ": failed to delete remote snap dir_root=" << dir_root
+           << ", snapshot=" << snap << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    inc_deleted_snap(dir_root);
+  }
+
+  return 0;
+}
+
+// mirror snapshot renames on the remote: each pair in snaps is
+// (old-name, new-name). stops at the first failure.
+int PeerReplayer::propagate_snap_renames(
+    const std::string &dir_root,
+    const std::set<std::pair<std::string,std::string>> &snaps) {
+  dout(10) << ": dir_root=" << dir_root << ", renamed snapshots=" << snaps << dendl;
+
+  for (auto &[from_name, to_name] : snaps) {
+    auto from = snapshot_path(m_cct, dir_root, from_name);
+    auto to = snapshot_path(m_cct, dir_root, to_name);
+    dout(20) << ": renaming dir_root=" << dir_root << ", snapshot from="
+             << from << ", to=" << to << dendl;
+    int r = ceph_rename(m_remote_mount, from.c_str(), to.c_str());
+    if (r < 0) {
+      derr << ": failed to rename remote snap dir_root=" << dir_root
+           << ", snapshot from =" << from << ", to=" << to << ": "
+           << cpp_strerror(r) << dendl;
+      return r;
+    }
+    inc_renamed_snap(dir_root);
+  }
+
+  return 0;
+}
+
+// create a directory on the remote (relative to the remote dir_root fd)
+// and replicate ownership, mode and [am]times from the local statx.
+// an already-existing directory is not an error.
+int PeerReplayer::remote_mkdir(const std::string &epath, const struct ceph_statx &stx,
+ const FHandles &fh) {
+ dout(10) << ": remote epath=" << epath << dendl;
+
+ // NOTE(review): the mask here is ~S_IFDIR (not ~S_IFMT as used for
+ // chmod below) -- presumably intended to strip the type bit; confirm
+ int r = ceph_mkdirat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFDIR);
+ if (r < 0 && r != -EEXIST) {
+ derr << ": failed to create remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = ceph_chownat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_uid, stx.stx_gid,
+ AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to chown remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // chmod with the type bits masked off (permission bits only)
+ r = ceph_chmodat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFMT,
+ AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to chmod remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ struct timespec times[] = {{stx.stx_atime.tv_sec, stx.stx_atime.tv_nsec},
+ {stx.stx_mtime.tv_sec, stx.stx_mtime.tv_nsec}};
+ r = ceph_utimensat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), times, AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to change [am]time on remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+#define NR_IOVECS 8 // # iovecs
+#define IOVEC_SIZE (8 * 1024 * 1024) // buffer size for each iovec
+// bulk-copy a regular file from the current local snapshot to the remote,
+// reading/writing NR_IOVECS * IOVEC_SIZE bytes per round trip. checks
+// should_backoff() between rounds so a canceled/stopping sync aborts.
+// uses goto-based cleanup for the fds and the transfer buffer.
+int PeerReplayer::copy_to_remote(const std::string &dir_root, const std::string &epath,
+ const struct ceph_statx &stx, const FHandles &fh) {
+ dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << dendl;
+ int l_fd;
+ int r_fd;
+ void *ptr;
+ struct iovec iov[NR_IOVECS];
+
+ int r = ceph_openat(m_local_mount, fh.c_fd, epath.c_str(), O_RDONLY | O_NOFOLLOW, 0);
+ if (r < 0) {
+ derr << ": failed to open local file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ l_fd = r;
+ r = ceph_openat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(),
+ O_CREAT | O_TRUNC | O_WRONLY | O_NOFOLLOW, stx.stx_mode);
+ if (r < 0) {
+ derr << ": failed to create remote file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ goto close_local_fd;
+ }
+
+ r_fd = r;
+ ptr = malloc(NR_IOVECS * IOVEC_SIZE);
+ if (!ptr) {
+ r = -ENOMEM;
+ derr << ": failed to allocate memory" << dendl;
+ goto close_remote_fd;
+ }
+
+ while (true) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ // reset iovec lengths each round -- the short-read adjustment below
+ // may have shrunk the last one
+ for (int i = 0; i < NR_IOVECS; ++i) {
+ iov[i].iov_base = (char*)ptr + IOVEC_SIZE*i;
+ iov[i].iov_len = IOVEC_SIZE;
+ }
+
+ r = ceph_preadv(m_local_mount, l_fd, iov, NR_IOVECS, -1);
+ if (r < 0) {
+ derr << ": failed to read local file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ if (r == 0) {
+ // EOF -- loop exits with r == 0 (success)
+ break;
+ }
+
+ // trim the iovec list to exactly the bytes read
+ int iovs = (int)(r / IOVEC_SIZE);
+ int t = r % IOVEC_SIZE;
+ if (t) {
+ iov[iovs].iov_len = t;
+ ++iovs;
+ }
+
+ r = ceph_pwritev(m_remote_mount, r_fd, iov, iovs, -1);
+ if (r < 0) {
+ derr << ": failed to write remote file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+
+ if (r == 0) {
+ r = ceph_fsync(m_remote_mount, r_fd, 0);
+ if (r < 0) {
+ derr << ": failed to sync data for file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+
+ free(ptr);
+
+close_remote_fd:
+ // NOTE: a close failure returns -EINVAL even if the copy succeeded
+ if (ceph_close(m_remote_mount, r_fd) < 0) {
+ derr << ": failed to close remote fd path=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return -EINVAL;
+ }
+
+close_local_fd:
+ if (ceph_close(m_local_mount, l_fd) < 0) {
+ derr << ": failed to close local fd path=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return -EINVAL;
+ }
+
+ return r == 0 ? 0 : r;
+}
+
+// synchronize a single (non-directory) entry to the remote: copy file
+// data / re-create symlinks when need_data_sync, and replicate owner,
+// mode and [am]times when need_attr_sync. unsupported file types are
+// skipped (returns 0).
+int PeerReplayer::remote_file_op(const std::string &dir_root, const std::string &epath,
+                                 const struct ceph_statx &stx, const FHandles &fh,
+                                 bool need_data_sync, bool need_attr_sync) {
+  dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << ", need_data_sync=" << need_data_sync
+           << ", need_attr_sync=" << need_attr_sync << dendl;
+
+  int r;
+  if (need_data_sync) {
+    if (S_ISREG(stx.stx_mode)) {
+      r = copy_to_remote(dir_root, epath, stx, fh);
+      if (r < 0) {
+        derr << ": failed to copy path=" << epath << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else if (S_ISLNK(stx.stx_mode)) {
+      // free the remote link before relinking
+      r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), 0);
+      if (r < 0 && r != -ENOENT) {
+        derr << ": failed to remove remote symlink=" << epath << dendl;
+        return r;
+      }
+      char *target = (char *)alloca(stx.stx_size+1);
+      r = ceph_readlinkat(m_local_mount, fh.c_fd, epath.c_str(), target, stx.stx_size);
+      if (r < 0) {
+        derr << ": failed to readlink local path=" << epath << ": " << cpp_strerror(r)
+             << dendl;
+        return r;
+      }
+
+      target[stx.stx_size] = '\0';
+      // libcephfs returns negative errnos -- compare against -EEXIST
+      // (the original positive EEXIST comparison could never match and
+      // turned a benign "already exists" into a hard failure)
+      r = ceph_symlinkat(m_remote_mount, target, fh.r_fd_dir_root, epath.c_str());
+      if (r < 0 && r != -EEXIST) {
+        derr << ": failed to symlink remote path=" << epath << " to target=" << target
+             << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else {
+      dout(5) << ": skipping entry=" << epath << ": unsupported mode=" << stx.stx_mode
+              << dendl;
+      return 0;
+    }
+  }
+
+  if (need_attr_sync) {
+    r = ceph_chownat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_uid, stx.stx_gid,
+                     AT_SYMLINK_NOFOLLOW);
+    if (r < 0) {
+      derr << ": failed to chown remote directory=" << epath << ": " << cpp_strerror(r)
+           << dendl;
+      return r;
+    }
+
+    // permission bits only (type bits masked off)
+    r = ceph_chmodat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFMT,
+                     AT_SYMLINK_NOFOLLOW);
+    if (r < 0) {
+      derr << ": failed to chmod remote directory=" << epath << ": " << cpp_strerror(r)
+           << dendl;
+      return r;
+    }
+
+    struct timespec times[] = {{stx.stx_atime.tv_sec, stx.stx_atime.tv_nsec},
+                               {stx.stx_mtime.tv_sec, stx.stx_mtime.tv_nsec}};
+    r = ceph_utimensat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), times, AT_SYMLINK_NOFOLLOW);
+    if (r < 0) {
+      derr << ": failed to change [am]time on remote directory=" << epath << ": "
+           << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// recursively delete a directory tree on the remote using an explicit
+// stack (depth-first): files are unlinked as encountered; a directory is
+// removed (AT_REMOVEDIR) only once its readdir is exhausted. honors
+// should_backoff() so a canceled sync aborts mid-walk; any dir handles
+// still on the stack are closed in the trailing drain loop.
+int PeerReplayer::cleanup_remote_dir(const std::string &dir_root,
+ const std::string &epath, const FHandles &fh) {
+ dout(20) << ": dir_root=" << dir_root << ", epath=" << epath
+ << dendl;
+
+ struct ceph_statx tstx;
+ int r = ceph_statxat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), &tstx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to stat remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ceph_dir_result *tdirp;
+ r = opendirat(m_remote_mount, fh.r_fd_dir_root, epath, AT_SYMLINK_NOFOLLOW,
+ &tdirp);
+ if (r < 0) {
+ derr << ": failed to open remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::stack<SyncEntry> rm_stack;
+ rm_stack.emplace(SyncEntry(epath, tdirp, tstx));
+ while (!rm_stack.empty()) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ dout(20) << ": " << rm_stack.size() << " entries in stack" << dendl;
+ std::string e_name;
+ auto &entry = rm_stack.top();
+ dout(20) << ": top of stack path=" << entry.epath << dendl;
+ if (entry.is_directory()) {
+ // pull the next real child (skipping "." / ".."); r == 0 with an
+ // empty e_name means the directory is exhausted
+ struct ceph_statx stx;
+ struct dirent de;
+ while (true) {
+ r = ceph_readdirplus_r(m_remote_mount, entry.dirp, &de, &stx,
+ CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW, NULL);
+ if (r < 0) {
+ derr << ": failed to read remote directory=" << entry.epath << dendl;
+ break;
+ }
+ if (r == 0) {
+ break;
+ }
+
+ auto d_name = std::string(de.d_name);
+ if (d_name != "." && d_name != "..") {
+ e_name = d_name;
+ break;
+ }
+ }
+
+ if (r == 0) {
+ // directory fully enumerated -- remove it and pop
+ r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, entry.epath.c_str(), AT_REMOVEDIR);
+ if (r < 0) {
+ derr << ": failed to remove remote directory=" << entry.epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+
+ dout(10) << ": done for remote directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_remote_mount, entry.dirp) < 0) {
+ derr << ": failed to close remote directory=" << entry.epath << dendl;
+ }
+ rm_stack.pop();
+ continue;
+ }
+ if (r < 0) {
+ break;
+ }
+
+ // descend into child: directories push an open handle, files push
+ // a plain entry
+ auto epath = entry_path(entry.epath, e_name);
+ if (S_ISDIR(stx.stx_mode)) {
+ ceph_dir_result *dirp;
+ r = opendirat(m_remote_mount, fh.r_fd_dir_root, epath, AT_SYMLINK_NOFOLLOW,
+ &dirp);
+ if (r < 0) {
+ derr << ": failed to open remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ rm_stack.emplace(SyncEntry(epath, dirp, stx));
+ } else {
+ rm_stack.emplace(SyncEntry(epath, stx));
+ }
+ } else {
+ r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, entry.epath.c_str(), 0);
+ if (r < 0) {
+ derr << ": failed to remove remote directory=" << entry.epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ dout(10) << ": done for remote file=" << entry.epath << dendl;
+ rm_stack.pop();
+ }
+ }
+
+ // drain: close any directory handles left open by an aborted walk
+ while (!rm_stack.empty()) {
+ auto &entry = rm_stack.top();
+ if (entry.is_directory()) {
+ dout(20) << ": closing remote directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_remote_mount, entry.dirp) < 0) {
+ derr << ": failed to close remote directory=" << entry.epath << dendl;
+ }
+ }
+
+ rm_stack.pop();
+ }
+
+ return r;
+}
+
+// decide whether an entry needs data and/or attribute sync by comparing
+// the current-snapshot statx (cstx) against the same path in the
+// "previous" handle (previous local snapshot, or the remote dir_root on
+// a full sync). missing-in-prev or a type change forces a full sync.
+int PeerReplayer::should_sync_entry(const std::string &epath, const struct ceph_statx &cstx,
+ const FHandles &fh, bool *need_data_sync, bool *need_attr_sync) {
+ dout(10) << ": epath=" << epath << dendl;
+
+ *need_data_sync = false;
+ *need_attr_sync = false;
+ struct ceph_statx pstx;
+ int r = ceph_statxat(fh.p_mnt, fh.p_fd, epath.c_str(), &pstx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_CTIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0 && r != -ENOENT && r != -ENOTDIR) {
+ derr << ": failed to stat prev entry= " << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (r < 0) {
+ // inode does not exist in prev snapshot or file type has changed
+ // (file was S_IFREG earlier, S_IFDIR now).
+ dout(5) << ": entry=" << epath << ", r=" << r << dendl;
+ *need_data_sync = true;
+ *need_attr_sync = true;
+ return 0;
+ }
+
+ dout(10) << ": local cur statx: mode=" << cstx.stx_mode << ", uid=" << cstx.stx_uid
+ << ", gid=" << cstx.stx_gid << ", size=" << cstx.stx_size << ", ctime="
+ << cstx.stx_ctime << ", mtime=" << cstx.stx_mtime << dendl;
+ dout(10) << ": local prev statx: mode=" << pstx.stx_mode << ", uid=" << pstx.stx_uid
+ << ", gid=" << pstx.stx_gid << ", size=" << pstx.stx_size << ", ctime="
+ << pstx.stx_ctime << ", mtime=" << pstx.stx_mtime << dendl;
+ if ((cstx.stx_mode & S_IFMT) != (pstx.stx_mode & S_IFMT)) {
+ dout(5) << ": entry=" << epath << " has mode mismatch" << dendl;
+ *need_data_sync = true;
+ *need_attr_sync = true;
+ } else {
+ // size/mtime drift implies content change; ctime drift implies
+ // metadata (attr) change
+ *need_data_sync = (cstx.stx_size != pstx.stx_size) || (cstx.stx_mtime != pstx.stx_mtime);
+ *need_attr_sync = (cstx.stx_ctime != pstx.stx_ctime);
+ }
+
+ return 0;
+}
+
+// walk the "previous" view of epath and delete, on the remote, every
+// entry that vanished (or changed inode type) in the current snapshot.
+// a -ENOENT while reading the previous view is flipped to -EINVAL to
+// signal the snapshot was likely deleted mid-scan.
+int PeerReplayer::propagate_deleted_entries(const std::string &dir_root,
+                                            const std::string &epath, const FHandles &fh) {
+  dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << dendl;
+
+  ceph_dir_result *dirp;
+  int r = opendirat(fh.p_mnt, fh.p_fd, epath, AT_SYMLINK_NOFOLLOW, &dirp);
+  if (r < 0) {
+    if (r == -ELOOP) {
+      dout(5) << ": epath=" << epath << " is a symbolic link -- mode sync"
+              << " done when traversing parent" << dendl;
+      return 0;
+    }
+    if (r == -ENOTDIR) {
+      dout(5) << ": epath=" << epath << " is not a directory -- mode sync"
+              << " done when traversing parent" << dendl;
+      return 0;
+    }
+    if (r == -ENOENT) {
+      dout(5) << ": epath=" << epath << " missing in previous-snap/remote dir-root"
+              << dendl;
+    }
+    return r;
+  }
+
+  struct dirent *dire = (struct dirent *)alloca(512 * sizeof(struct dirent));
+  while (true) {
+    if (should_backoff(dir_root, &r)) {
+      dout(0) << ": backing off r=" << r << dendl;
+      break;
+    }
+
+    int len = ceph_getdents(fh.p_mnt, dirp, (char *)dire, 512);
+    if (len < 0) {
+      derr << ": failed to read directory entries: " << cpp_strerror(len) << dendl;
+      r = len;
+      // flip errno to signal that we got an err (possibly the
+      // snapshot got deleted mid-scan).
+      if (r == -ENOENT) {
+        r = -EINVAL;
+      }
+      break;
+    }
+    if (len == 0) {
+      dout(10) << ": reached EOD" << dendl;
+      break;
+    }
+    int nr = len / sizeof(struct dirent);
+    for (int i = 0; i < nr; ++i) {
+      if (should_backoff(dir_root, &r)) {
+        dout(0) << ": backing off r=" << r << dendl;
+        break;
+      }
+      std::string d_name = std::string(dire[i].d_name);
+      if (d_name == "." || d_name == "..") {
+        continue;
+      }
+
+      struct ceph_statx pstx;
+      auto dpath = entry_path(epath, d_name);
+      r = ceph_statxat(fh.p_mnt, fh.p_fd, dpath.c_str(), &pstx,
+                       CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+      if (r < 0) {
+        derr << ": failed to stat (prev) directory=" << dpath << ": "
+             << cpp_strerror(r) << dendl;
+        // flip errno to signal that we got an err (possibly the
+        // snapshot got deleted mid-scan).
+        if (r == -ENOENT) {
+          r = -EINVAL;
+        }
+        // close the dir handle before bailing out -- the original code
+        // leaked it on this path
+        ceph_closedir(fh.p_mnt, dirp);
+        return r;
+      }
+
+      struct ceph_statx cstx;
+      r = ceph_statxat(m_local_mount, fh.c_fd, dpath.c_str(), &cstx,
+                       CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+      if (r < 0 && r != -ENOENT) {
+        derr << ": failed to stat local (cur) directory=" << dpath << ": "
+             << cpp_strerror(r) << dendl;
+        ceph_closedir(fh.p_mnt, dirp);
+        return r;
+      }
+
+      bool purge_remote = true;
+      if (r == 0) {
+        // directory entry present in both snapshots -- check inode
+        // type
+        if ((pstx.stx_mode & S_IFMT) == (cstx.stx_mode & S_IFMT)) {
+          dout(5) << ": mode matches for entry=" << d_name << dendl;
+          purge_remote = false;
+        } else {
+          dout(5) << ": mode mismatch for entry=" << d_name << dendl;
+        }
+      } else {
+        dout(5) << ": entry=" << d_name << " missing in current snapshot" << dendl;
+      }
+
+      if (purge_remote) {
+        dout(5) << ": purging remote entry=" << dpath << dendl;
+        if (S_ISDIR(pstx.stx_mode)) {
+          r = cleanup_remote_dir(dir_root, dpath, fh);
+        } else {
+          r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, dpath.c_str(), 0);
+        }
+
+        if (r < 0 && r != -ENOENT) {
+          derr << ": failed to cleanup remote entry=" << d_name << ": "
+               << cpp_strerror(r) << dendl;
+          ceph_closedir(fh.p_mnt, dirp);
+          return r;
+        }
+      }
+    }
+  }
+
+  ceph_closedir(fh.p_mnt, dirp);
+  return r;
+}
+
+// open a directory and (optionally) verify its snapshot id matches the
+// expected one; a mismatch (snapshot recreated) yields -EINVAL. returns
+// the open fd on success, negative errno on failure.
+int PeerReplayer::open_dir(MountRef mnt, const std::string &dir_path,
+ boost::optional<uint64_t> snap_id) {
+ dout(20) << ": dir_path=" << dir_path << dendl;
+ if (snap_id) {
+ dout(20) << ": expected snapshot id=" << *snap_id << dendl;
+ }
+
+ int fd = ceph_open(mnt, dir_path.c_str(), O_DIRECTORY | O_RDONLY, 0);
+ if (fd < 0) {
+ derr << ": cannot open dir_path=" << dir_path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+
+ if (!snap_id) {
+ return fd;
+ }
+
+ // NOTE(review): unlike build_snap_map, this path never calls
+ // ceph_free_snap_info_buffer on info -- confirm whether local snap
+ // info can carry an allocated metadata buffer here
+ snap_info info;
+ int r = ceph_get_snap_info(mnt, dir_path.c_str(), &info);
+ if (r < 0) {
+ derr << ": failed to fetch snap_info for path=" << dir_path
+ << ": " << cpp_strerror(r) << dendl;
+ ceph_close(mnt, fd);
+ return r;
+ }
+
+ if (info.id != *snap_id) {
+ dout(5) << ": got mismatching snapshot id for path=" << dir_path << " (" << info.id
+ << " vs " << *snap_id << ") -- possible recreate" << dendl;
+ ceph_close(mnt, fd);
+ return -EINVAL;
+ }
+
+ return fd;
+}
+
+// open the file handles needed for a sync run: the current local
+// snapshot (fh->c_fd), a "previous" handle (fh->p_fd/p_mnt -- the prior
+// local snapshot for incremental sync, or the remote dir_root for a full
+// sync), and the already-locked remote dir_root fd (fh->r_fd_dir_root,
+// borrowed from the registration). on any failure all fds opened here
+// are closed before returning.
+int PeerReplayer::pre_sync_check_and_open_handles(
+ const std::string &dir_root,
+ const Snapshot &current, boost::optional<Snapshot> prev,
+ FHandles *fh) {
+ dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+ if (prev) {
+ dout(20) << ": prev=" << prev << dendl;
+ }
+
+ auto cur_snap_path = snapshot_path(m_cct, dir_root, current.first);
+ auto fd = open_dir(m_local_mount, cur_snap_path, current.second);
+ if (fd < 0) {
+ return fd;
+ }
+
+ // current snapshot file descriptor
+ fh->c_fd = fd;
+
+ MountRef mnt;
+ if (prev) {
+ mnt = m_local_mount;
+ auto prev_snap_path = snapshot_path(m_cct, dir_root, (*prev).first);
+ fd = open_dir(mnt, prev_snap_path, (*prev).second);
+ } else {
+ mnt = m_remote_mount;
+ fd = open_dir(mnt, dir_root, boost::none);
+ }
+
+ if (fd < 0) {
+ if (!prev || fd != -ENOENT) {
+ ceph_close(m_local_mount, fh->c_fd);
+ return fd;
+ }
+
+ // ENOENT of previous snap -- fall back to a full sync against the
+ // remote dir_root
+ dout(5) << ": previous snapshot=" << *prev << " missing" << dendl;
+ mnt = m_remote_mount;
+ fd = open_dir(mnt, dir_root, boost::none);
+ if (fd < 0) {
+ ceph_close(m_local_mount, fh->c_fd);
+ return fd;
+ }
+ }
+
+ // "previous" snapshot or dir_root file descriptor
+ fh->p_fd = fd;
+ fh->p_mnt = mnt;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto it = m_registered.find(dir_root);
+ ceph_assert(it != m_registered.end());
+ fh->r_fd_dir_root = it->second.fd;
+ }
+
+ dout(5) << ": using " << ((fh->p_mnt == m_local_mount) ? "local (previous) snapshot" : "remote dir_root")
+ << " for incremental transfer" << dendl;
+ return 0;
+}
+
+// Close the fds opened by pre_sync_check_and_open_handles(). Counterpart
+// invoked via BOOST_SCOPE_EXIT_ALL in do_synchronize().
+void PeerReplayer::post_sync_close_handles(const FHandles &fh) {
+ dout(20) << dendl;
+
+ // @FHandles.r_fd_dir_root is closed in @unregister_directory since
+ // its used to acquire an exclusive lock on remote dir_root.
+ ceph_close(m_local_mount, fh.c_fd);
+ ceph_close(fh.p_mnt, fh.p_fd);
+}
+
+// Synchronize snapshot @current of @dir_root to the remote filesystem,
+// using @prev (when provided) as the baseline for incremental transfer.
+// Marks the remote dir_root "dirty" (ceph.mirror.dirty_snap_id xattr)
+// before touching data, then walks the local snapshot iteratively with an
+// explicit stack: directories propagate deletes and are descended into,
+// files are data/attr synced as needed. Returns 0 on success, negative
+// errno on error or backoff (blocklist/shutdown/cancel).
+int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev) {
+ dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+ if (prev) {
+ dout(20) << ": incremental sync check from prev=" << prev << dendl;
+ }
+
+ FHandles fh;
+ int r = pre_sync_check_and_open_handles(dir_root, current, prev, &fh);
+ if (r < 0) {
+ dout(5) << ": cannot proceed with sync: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // ensure handles are closed on every exit path
+ BOOST_SCOPE_EXIT_ALL( (this)(&fh) ) {
+ post_sync_close_handles(fh);
+ };
+
+ // record that we are going to "dirty" the data under this
+ // directory root
+ auto snap_id_str{stringify(current.second)};
+ r = ceph_fsetxattr(m_remote_mount, fh.r_fd_dir_root, "ceph.mirror.dirty_snap_id",
+ snap_id_str.c_str(), snap_id_str.size(), 0);
+ if (r < 0) {
+ derr << ": error setting \"ceph.mirror.dirty_snap_id\" on dir_root=" << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ struct ceph_statx tstx;
+ r = ceph_fstatx(m_local_mount, fh.c_fd, &tstx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to stat snap=" << current.first << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ceph_dir_result *tdirp;
+ r = ceph_fdopendir(m_local_mount, fh.c_fd, &tdirp);
+ if (r < 0) {
+ derr << ": failed to open local snap=" << current.first << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // iterative DFS over the snapshot; stack entries own the open dirp
+ std::stack<SyncEntry> sync_stack;
+ sync_stack.emplace(SyncEntry(".", tdirp, tstx));
+ while (!sync_stack.empty()) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ dout(20) << ": " << sync_stack.size() << " entries in stack" << dendl;
+ std::string e_name;
+ auto &entry = sync_stack.top();
+ dout(20) << ": top of stack path=" << entry.epath << dendl;
+ if (entry.is_directory()) {
+ // entry is a directory -- propagate deletes for missing entries
+ // (and changed inode types) to the remote filesystem. done once
+ // per directory (guarded by the remote_synced flag).
+ if (!entry.needs_remote_sync()) {
+ r = propagate_deleted_entries(dir_root, entry.epath, fh);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to propagate missing dirs: " << cpp_strerror(r) << dendl;
+ break;
+ }
+ entry.set_remote_synced();
+ }
+
+ struct ceph_statx stx;
+ struct dirent de;
+ // fetch the next entry, skipping "." and ".."; r == 0 => end of dir
+ while (true) {
+ r = ceph_readdirplus_r(m_local_mount, entry.dirp, &de, &stx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW, NULL);
+ if (r < 0) {
+ derr << ": failed to local read directory=" << entry.epath << dendl;
+ break;
+ }
+ if (r == 0) {
+ break;
+ }
+
+ auto d_name = std::string(de.d_name);
+ if (d_name != "." && d_name != "..") {
+ e_name = d_name;
+ break;
+ }
+ }
+
+ if (r == 0) {
+ // directory exhausted -- close and pop
+ dout(10) << ": done for directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_local_mount, entry.dirp) < 0) {
+ derr << ": failed to close local directory=" << entry.epath << dendl;
+ }
+ sync_stack.pop();
+ continue;
+ }
+ if (r < 0) {
+ break;
+ }
+
+ auto epath = entry_path(entry.epath, e_name);
+ if (S_ISDIR(stx.stx_mode)) {
+ // create remote dir and push the local dir for traversal
+ r = remote_mkdir(epath, stx, fh);
+ if (r < 0) {
+ break;
+ }
+ ceph_dir_result *dirp;
+ r = opendirat(m_local_mount, fh.c_fd, epath, AT_SYMLINK_NOFOLLOW, &dirp);
+ if (r < 0) {
+ derr << ": failed to open local directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ sync_stack.emplace(SyncEntry(epath, dirp, stx));
+ } else {
+ sync_stack.emplace(SyncEntry(epath, stx));
+ }
+ } else {
+ // non-directory entry -- check whether data and/or attrs need sync
+ bool need_data_sync = true;
+ bool need_attr_sync = true;
+ r = should_sync_entry(entry.epath, entry.stx, fh,
+ &need_data_sync, &need_attr_sync);
+ if (r < 0) {
+ break;
+ }
+
+ dout(5) << ": entry=" << entry.epath << ", data_sync=" << need_data_sync
+ << ", attr_sync=" << need_attr_sync << dendl;
+ if (need_data_sync || need_attr_sync) {
+ r = remote_file_op(dir_root, entry.epath, entry.stx, fh, need_data_sync,
+ need_attr_sync);
+ if (r < 0) {
+ break;
+ }
+ }
+ dout(10) << ": done for epath=" << entry.epath << dendl;
+ sync_stack.pop();
+ }
+ }
+
+ // drain the stack on early exit, closing any still-open directories
+ while (!sync_stack.empty()) {
+ auto &entry = sync_stack.top();
+ if (entry.is_directory()) {
+ dout(20) << ": closing local directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_local_mount, entry.dirp) < 0) {
+ derr << ": failed to close local directory=" << entry.epath << dendl;
+ }
+ }
+
+ sync_stack.pop();
+ }
+
+ return r;
+}
+
+// Top-level sync of snapshot @current of @dir_root. Reads the remote
+// "ceph.mirror.dirty_snap_id" xattr to decide whether an incremental sync
+// against @prev is safe (local scan) or whether a remote-scan sync is
+// required, runs do_synchronize() accordingly, and finally creates the
+// matching snapshot on the remote (tagged with the primary snap-id).
+int PeerReplayer::synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev) {
+ dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+ if (prev) {
+ dout(20) << ": prev=" << prev << dendl;
+ }
+
+ // first call probes the xattr length (nullptr buffer)
+ int r = ceph_getxattr(m_remote_mount, dir_root.c_str(), "ceph.mirror.dirty_snap_id", nullptr, 0);
+ if (r < 0 && r != -ENODATA) {
+ derr << ": failed to fetch primary_snap_id length from dir_root=" << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // no xattr, can't determine which snap the data belongs to!
+ if (r < 0) {
+ dout(5) << ": missing \"ceph.mirror.dirty_snap_id\" xattr on remote -- using"
+ << " incremental sync with remote scan" << dendl;
+ r = do_synchronize(dir_root, current, boost::none);
+ } else {
+ size_t xlen = r;
+ char *val = (char *)alloca(xlen+1);
+ r = ceph_getxattr(m_remote_mount, dir_root.c_str(), "ceph.mirror.dirty_snap_id", (void*)val, xlen);
+ if (r < 0) {
+ derr << ": failed to fetch \"dirty_snap_id\" for dir_root: " << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ val[xlen] = '\0';
+ uint64_t dirty_snap_id = atoll(val);
+
+ dout(20) << ": dirty_snap_id: " << dirty_snap_id << " vs (" << current.second
+ << "," << (prev ? stringify((*prev).second) : "~") << ")" << dendl;
+ // remote data matches prev (or a partially-synced current) -- safe to
+ // do a local incremental scan; otherwise fall back to remote scan.
+ if (prev && (dirty_snap_id == (*prev).second || dirty_snap_id == current.second)) {
+ dout(5) << ": match -- using incremental sync with local scan" << dendl;
+ r = do_synchronize(dir_root, current, prev);
+ } else {
+ dout(5) << ": mismatch -- using incremental sync with remote scan" << dendl;
+ r = do_synchronize(dir_root, current, boost::none);
+ }
+ }
+
+ // snap sync failed -- bail out!
+ if (r < 0) {
+ return r;
+ }
+
+ // snapshot the remote dir_root, recording the primary snap-id as
+ // snapshot metadata so future runs can map remote snaps back to local ones
+ auto cur_snap_id_str{stringify(current.second)};
+ snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}};
+ r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755,
+ snap_meta, sizeof(snap_meta)/sizeof(snap_metadata));
+ if (r < 0) {
+ derr << ": failed to snap remote directory dir_root=" << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+ return r;
+}
+
+// Synchronize the snapshots of @dir_root: build local and remote snap
+// maps (keyed by primary snap-id), propagate snapshot deletes/renames to
+// the remote, then sync up to "cephfs_mirror_max_snapshot_sync_per_cycle"
+// local snapshots newer than the last one already transferred. Returns 0
+// on success, negative errno on the first failure.
+int PeerReplayer::do_sync_snaps(const std::string &dir_root) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ std::map<uint64_t, std::string> local_snap_map;
+ std::map<uint64_t, std::string> remote_snap_map;
+
+ int r = build_snap_map(dir_root, &local_snap_map);
+ if (r < 0) {
+ derr << ": failed to build local snap map" << dendl;
+ return r;
+ }
+
+ r = build_snap_map(dir_root, &remote_snap_map, true);
+ if (r < 0) {
+ derr << ": failed to build remote snap map" << dendl;
+ return r;
+ }
+
+ // infer deleted and renamed snapshots from local and remote
+ // snap maps
+ std::set<std::string> snaps_deleted;
+ std::set<std::pair<std::string,std::string>> snaps_renamed;
+ for (auto &[primary_snap_id, snap_name] : remote_snap_map) {
+ auto it = local_snap_map.find(primary_snap_id);
+ if (it == local_snap_map.end()) {
+ snaps_deleted.emplace(snap_name);
+ } else if (it->second != snap_name) {
+ snaps_renamed.emplace(std::make_pair(snap_name, it->second));
+ }
+ }
+
+ r = propagate_snap_deletes(dir_root, snaps_deleted);
+ if (r < 0) {
+ derr << ": failed to propagate deleted snapshots" << dendl;
+ return r;
+ }
+
+ r = propagate_snap_renames(dir_root, snaps_renamed);
+ if (r < 0) {
+ derr << ": failed to propagate renamed snapshots" << dendl;
+ return r;
+ }
+
+ // start mirroring snapshots from the last snap-id synchronized
+ uint64_t last_snap_id = 0;
+ std::string last_snap_name;
+ if (!remote_snap_map.empty()) {
+ auto last = remote_snap_map.rbegin();
+ last_snap_id = last->first;
+ last_snap_name = last->second;
+ set_last_synced_snap(dir_root, last_snap_id, last_snap_name);
+ }
+
+ dout(5) << ": last snap-id transferred=" << last_snap_id << dendl;
+ auto it = local_snap_map.upper_bound(last_snap_id);
+ if (it == local_snap_map.end()) {
+ dout(20) << ": nothing to synchronize" << dendl;
+ return 0;
+ }
+
+ auto snaps_per_cycle = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_max_snapshot_sync_per_cycle");
+
+ dout(10) << ": synchronizing from snap-id=" << it->first << dendl;
+ for (; it != local_snap_map.end(); ++it) {
+ set_current_syncing_snap(dir_root, it->first, it->second);
+ auto start = clock::now();
+ // previous snapshot (if any) is the baseline for incremental sync
+ boost::optional<Snapshot> prev = boost::none;
+ if (last_snap_id != 0) {
+ prev = std::make_pair(last_snap_name, last_snap_id);
+ }
+ r = synchronize(dir_root, std::make_pair(it->second, it->first), prev);
+ if (r < 0) {
+ derr << ": failed to synchronize dir_root=" << dir_root
+ << ", snapshot=" << it->second << dendl;
+ clear_current_syncing_snap(dir_root);
+ return r;
+ }
+ std::chrono::duration<double> duration = clock::now() - start;
+ set_last_synced_stat(dir_root, it->first, it->second, duration.count());
+ if (--snaps_per_cycle == 0) {
+ break;
+ }
+
+ last_snap_name = it->second;
+ last_snap_id = it->first;
+ }
+
+ return 0;
+}
+
+// Run do_sync_snaps() for @dir_root with m_lock dropped (it is a long
+// operation), then reacquire the lock and update the per-directory
+// failure accounting. @locker must be held on entry and is held on exit.
+void PeerReplayer::sync_snaps(const std::string &dir_root,
+ std::unique_lock<ceph::mutex> &locker) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+ locker.unlock();
+ int r = do_sync_snaps(dir_root);
+ if (r < 0) {
+ derr << ": failed to sync snapshots for dir_root=" << dir_root << dendl;
+ }
+ locker.lock();
+ if (r < 0) {
+ _inc_failed_count(dir_root);
+ } else {
+ _reset_failed_count(dir_root);
+ }
+}
+
+// Main loop of a snapshot replayer thread: poll (1s cadence, woken early
+// on shutdown) until stopping or blocklisted; every
+// "cephfs_mirror_directory_scan_interval" seconds pick a directory,
+// register it (acquiring the remote lock), sync its snapshots and
+// unregister it.
+void PeerReplayer::run(SnapshotReplayerThread *replayer) {
+ dout(10) << ": snapshot replayer=" << replayer << dendl;
+
+ time last_directory_scan = clock::zero();
+ auto scan_interval = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_directory_scan_interval");
+
+ std::unique_lock locker(m_lock);
+ while (true) {
+ // do not check if client is blocklisted under lock
+ m_cond.wait_for(locker, 1s, [this]{return is_stopping();});
+ if (is_stopping()) {
+ dout(5) << ": exiting" << dendl;
+ break;
+ }
+
+ locker.unlock();
+
+ if (m_fs_mirror->is_blocklisted()) {
+ dout(5) << ": exiting as client is blocklisted" << dendl;
+ break;
+ }
+
+ locker.lock();
+
+ auto now = clock::now();
+ std::chrono::duration<double> timo = now - last_directory_scan;
+ if (timo.count() >= scan_interval && m_directories.size()) {
+ dout(20) << ": trying to pick from " << m_directories.size() << " directories" << dendl;
+ auto dir_root = pick_directory();
+ if (dir_root) {
+ dout(5) << ": picked dir_root=" << *dir_root << dendl;
+ int r = register_directory(*dir_root, replayer);
+ if (r == 0) {
+ // sync_snaps() drops and reacquires the lock internally
+ sync_snaps(*dir_root, locker);
+ unregister_directory(*dir_root);
+ }
+ }
+
+ last_directory_scan = now;
+ }
+ }
+}
+
+// Admin-socket helper: dump per-directory sync state and counters as a
+// formatted "stats" object.
+void PeerReplayer::peer_status(Formatter *f) {
+ std::scoped_lock locker(m_lock);
+ f->open_object_section("stats");
+ for (auto &[dir_root, sync_stat] : m_snap_sync_stats) {
+ f->open_object_section(dir_root);
+ if (sync_stat.failed) {
+ f->dump_string("state", "failed");
+ } else if (!sync_stat.current_syncing_snap) {
+ f->dump_string("state", "idle");
+ } else {
+ f->dump_string("state", "syncing");
+ // NOTE(review): "current_sycning_snap" is misspelled but is an
+ // emitted JSON key -- renaming it would break consumers; left as-is.
+ f->open_object_section("current_sycning_snap");
+ f->dump_unsigned("id", (*sync_stat.current_syncing_snap).first);
+ f->dump_string("name", (*sync_stat.current_syncing_snap).second);
+ f->close_section();
+ }
+ if (sync_stat.last_synced_snap) {
+ f->open_object_section("last_synced_snap");
+ f->dump_unsigned("id", (*sync_stat.last_synced_snap).first);
+ f->dump_string("name", (*sync_stat.last_synced_snap).second);
+ if (sync_stat.last_sync_duration) {
+ f->dump_float("sync_duration", *sync_stat.last_sync_duration);
+ f->dump_stream("sync_time_stamp") << sync_stat.last_synced;
+ }
+ f->close_section();
+ }
+ f->dump_unsigned("snaps_synced", sync_stat.synced_snap_count);
+ f->dump_unsigned("snaps_deleted", sync_stat.deleted_snap_count);
+ f->dump_unsigned("snaps_renamed", sync_stat.renamed_snap_count);
+ f->close_section(); // dir_root
+ }
+ f->close_section(); // stats
+}
+
+// Reopen log files of the remote cluster's CephContext (e.g. after log
+// rotation); no-op if the remote cluster handle is not set.
+void PeerReplayer::reopen_logs() {
+ std::scoped_lock locker(m_lock);
+
+ if (m_remote_cluster) {
+ reinterpret_cast<CephContext *>(m_remote_cluster->cct())->reopen_logs();
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/PeerReplayer.h b/src/tools/cephfs_mirror/PeerReplayer.h
new file mode 100644
index 000000000..886c95329
--- /dev/null
+++ b/src/tools/cephfs_mirror/PeerReplayer.h
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_PEER_REPLAYER_H
+#define CEPHFS_MIRROR_PEER_REPLAYER_H
+
+#include "common/Formatter.h"
+#include "common/Thread.h"
+#include "mds/FSMap.h"
+#include "ServiceDaemon.h"
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+class FSMirror;
+class PeerReplayerAdminSocketHook;
+
+// Replays (mirrors) snapshots of a set of registered directories from the
+// local filesystem to a single remote peer, using a pool of
+// SnapshotReplayerThread workers. Also tracks per-directory sync
+// statistics exposed via the admin socket and the service daemon.
+class PeerReplayer {
+public:
+ PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
+ RadosRef local_cluster, const Filesystem &filesystem,
+ const Peer &peer, const std::set<std::string, std::less<>> &directories,
+ MountRef mount, ServiceDaemon *service_daemon);
+ ~PeerReplayer();
+
+ // initialize replayer for a peer
+ int init();
+
+ // shutdown replayer for a peer
+ void shutdown();
+
+ // add a directory to mirror queue
+ void add_directory(string_view dir_root);
+
+ // remove a directory from queue
+ void remove_directory(string_view dir_root);
+
+ // admin socket helpers
+ void peer_status(Formatter *f);
+
+ // reopen logs
+ void reopen_logs();
+
+private:
+ // snapshot-metadata key recording the primary (local) snap-id on
+ // remote snapshots
+ inline static const std::string PRIMARY_SNAP_ID_KEY = "primary_snap_id";
+
+ inline static const std::string SERVICE_DAEMON_FAILED_DIR_COUNT_KEY = "failure_count";
+ inline static const std::string SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY = "recovery_count";
+
+ // (snapshot name, snapshot id)
+ using Snapshot = std::pair<std::string, uint64_t>;
+
+ // file descriptor "triplet" for synchronizing a snapshot
+ // w/ an added MountRef for accessing "previous" snapshot.
+ struct FHandles {
+ // open file descriptor on the snap directory for snapshot
+ // currently being synchronized. Always use this fd with
+ // @m_local_mount.
+ int c_fd;
+
+ // open file descriptor on the "previous" snapshot or on
+ // dir_root on remote filesystem (based on if the snapshot
+ // can be used for incremental transfer). Always use this
+ // fd with p_mnt which either points to @m_local_mount (
+ // for local incremental comparison) or @m_remote_mount (
+ // for remote incremental comparison).
+ int p_fd;
+ MountRef p_mnt;
+
+ // open file descriptor on dir_root on remote filesystem.
+ // Always use this fd with @m_remote_mount.
+ int r_fd_dir_root;
+ };
+
+ bool is_stopping() {
+ return m_stopping;
+ }
+
+ struct Replayer;
+ // worker thread -- simply runs PeerReplayer::run()
+ class SnapshotReplayerThread : public Thread {
+ public:
+ SnapshotReplayerThread(PeerReplayer *peer_replayer)
+ : m_peer_replayer(peer_replayer) {
+ }
+
+ void *entry() override {
+ m_peer_replayer->run(this);
+ return 0;
+ }
+
+ private:
+ PeerReplayer *m_peer_replayer;
+ };
+
+ // per-directory registration state (fd holds the remote dir_root lock)
+ struct DirRegistry {
+ int fd;
+ bool canceled = false;
+ SnapshotReplayerThread *replayer;
+ };
+
+ // one element of the iterative traversal stack in do_synchronize()
+ struct SyncEntry {
+ std::string epath;
+ ceph_dir_result *dirp; // valid for directories
+ struct ceph_statx stx;
+ // set by incremental sync _after_ ensuring missing entries
+ // in the currently synced snapshot have been propagated to
+ // the remote filesystem.
+ bool remote_synced = false;
+
+ SyncEntry(std::string_view path,
+ const struct ceph_statx &stx)
+ : epath(path),
+ stx(stx) {
+ }
+ SyncEntry(std::string_view path,
+ ceph_dir_result *dirp,
+ const struct ceph_statx &stx)
+ : epath(path),
+ dirp(dirp),
+ stx(stx) {
+ }
+
+ bool is_directory() const {
+ return S_ISDIR(stx.stx_mode);
+ }
+
+ // NOTE(review): despite the name this returns true once the remote
+ // propagation step has already run (callers test !needs_remote_sync()
+ // before propagating) -- the name reads inverted; behavior is correct.
+ bool needs_remote_sync() const {
+ return remote_synced;
+ }
+ void set_remote_synced() {
+ remote_synced = true;
+ }
+ };
+
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+ // stats sent to service daemon
+ struct ServiceDaemonStats {
+ uint64_t failed_dir_count = 0;
+ uint64_t recovered_dir_count = 0;
+ };
+
+ struct SnapSyncStat {
+ uint64_t nr_failures = 0; // number of consecutive failures
+ boost::optional<time> last_failed; // last failed timestamp
+ bool failed = false; // hit upper cap for consecutive failures
+ boost::optional<std::pair<uint64_t, std::string>> last_synced_snap;
+ boost::optional<std::pair<uint64_t, std::string>> current_syncing_snap;
+ uint64_t synced_snap_count = 0;
+ uint64_t deleted_snap_count = 0;
+ uint64_t renamed_snap_count = 0;
+ time last_synced = clock::zero();
+ boost::optional<double> last_sync_duration;
+ };
+
+ // record a sync failure; marks the directory "failed" (and notifies
+ // the service daemon) once the configured consecutive-failure cap is
+ // hit. caller must hold m_lock.
+ void _inc_failed_count(const std::string &dir_root) {
+ auto max_failures = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_max_consecutive_failures_per_directory");
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.last_failed = clock::now();
+ if (++sync_stat.nr_failures >= max_failures && !sync_stat.failed) {
+ sync_stat.failed = true;
+ ++m_service_daemon_stats.failed_dir_count;
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_FAILED_DIR_COUNT_KEY,
+ m_service_daemon_stats.failed_dir_count);
+ }
+ }
+ // clear failure state after a successful sync; bumps the recovery
+ // counter if the directory had been marked failed. caller must hold m_lock.
+ void _reset_failed_count(const std::string &dir_root) {
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ if (sync_stat.failed) {
+ ++m_service_daemon_stats.recovered_dir_count;
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY,
+ m_service_daemon_stats.recovered_dir_count);
+ }
+ sync_stat.nr_failures = 0;
+ sync_stat.failed = false;
+ sync_stat.last_failed = boost::none;
+ }
+
+ // caller must hold m_lock
+ void _set_last_synced_snap(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name) {
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.last_synced_snap = std::make_pair(snap_id, snap_name);
+ sync_stat.current_syncing_snap = boost::none;
+ }
+ void set_last_synced_snap(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name) {
+ std::scoped_lock locker(m_lock);
+ _set_last_synced_snap(dir_root, snap_id, snap_name);
+ }
+ void set_current_syncing_snap(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.current_syncing_snap = std::make_pair(snap_id, snap_name);
+ }
+ void clear_current_syncing_snap(const std::string &dir_root) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.current_syncing_snap = boost::none;
+ }
+ void inc_deleted_snap(const std::string &dir_root) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ ++sync_stat.deleted_snap_count;
+ }
+ void inc_renamed_snap(const std::string &dir_root) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ ++sync_stat.renamed_snap_count;
+ }
+ // record a completed snapshot sync and its duration (seconds)
+ void set_last_synced_stat(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name, double duration) {
+ std::scoped_lock locker(m_lock);
+ _set_last_synced_snap(dir_root, snap_id, snap_name);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.last_synced = clock::now();
+ sync_stat.last_sync_duration = duration;
+ ++sync_stat.synced_snap_count;
+ }
+
+ // returns true (with *retval set) if an in-progress sync should abort:
+ // client blocklisted, replayer stopping, or the directory canceled.
+ bool should_backoff(const std::string &dir_root, int *retval) {
+ if (m_fs_mirror->is_blocklisted()) {
+ *retval = -EBLOCKLISTED;
+ return true;
+ }
+
+ std::scoped_lock locker(m_lock);
+ if (is_stopping()) {
+ // ceph defines EBLOCKLISTED to ESHUTDOWN (108). so use
+ // EINPROGRESS to identify shutdown.
+ *retval = -EINPROGRESS;
+ return true;
+ }
+ auto &dr = m_registered.at(dir_root);
+ if (dr.canceled) {
+ *retval = -ECANCELED;
+ return true;
+ }
+
+ *retval = 0;
+ return false;
+ }
+
+ typedef std::vector<std::unique_ptr<SnapshotReplayerThread>> SnapshotReplayers;
+
+ CephContext *m_cct;
+ FSMirror *m_fs_mirror;
+ RadosRef m_local_cluster;
+ Filesystem m_filesystem;
+ Peer m_peer;
+ // probably need to be encapsulated when supporting cancelations
+ std::map<std::string, DirRegistry> m_registered;
+ std::vector<std::string> m_directories;
+ std::map<std::string, SnapSyncStat> m_snap_sync_stats;
+ MountRef m_local_mount;
+ ServiceDaemon *m_service_daemon;
+ PeerReplayerAdminSocketHook *m_asok_hook = nullptr;
+
+ ceph::mutex m_lock;
+ ceph::condition_variable m_cond;
+ RadosRef m_remote_cluster;
+ MountRef m_remote_mount;
+ bool m_stopping = false;
+ SnapshotReplayers m_replayers;
+
+ ServiceDaemonStats m_service_daemon_stats;
+
+ void run(SnapshotReplayerThread *replayer);
+
+ boost::optional<std::string> pick_directory();
+ int register_directory(const std::string &dir_root, SnapshotReplayerThread *replayer);
+ void unregister_directory(const std::string &dir_root);
+ int try_lock_directory(const std::string &dir_root, SnapshotReplayerThread *replayer,
+ DirRegistry *registry);
+ void unlock_directory(const std::string &dir_root, const DirRegistry &registry);
+ void sync_snaps(const std::string &dir_root, std::unique_lock<ceph::mutex> &locker);
+
+
+ int build_snap_map(const std::string &dir_root, std::map<uint64_t, std::string> *snap_map,
+ bool is_remote=false);
+
+ int propagate_snap_deletes(const std::string &dir_root, const std::set<std::string> &snaps);
+ int propagate_snap_renames(const std::string &dir_root,
+ const std::set<std::pair<std::string,std::string>> &snaps);
+ int propagate_deleted_entries(const std::string &dir_root, const std::string &epath,
+ const FHandles &fh);
+ int cleanup_remote_dir(const std::string &dir_root, const std::string &epath,
+ const FHandles &fh);
+
+ int should_sync_entry(const std::string &epath, const struct ceph_statx &cstx,
+ const FHandles &fh, bool *need_data_sync, bool *need_attr_sync);
+
+ int open_dir(MountRef mnt, const std::string &dir_path, boost::optional<uint64_t> snap_id);
+ int pre_sync_check_and_open_handles(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev, FHandles *fh);
+ void post_sync_close_handles(const FHandles &fh);
+
+ int do_synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev);
+
+ int synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev);
+ int do_sync_snaps(const std::string &dir_root);
+
+ int remote_mkdir(const std::string &epath, const struct ceph_statx &stx, const FHandles &fh);
+ int remote_file_op(const std::string &dir_root, const std::string &epath, const struct ceph_statx &stx,
+ const FHandles &fh, bool need_data_sync, bool need_attr_sync);
+ int copy_to_remote(const std::string &dir_root, const std::string &epath, const struct ceph_statx &stx,
+ const FHandles &fh);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_PEER_REPLAYER_H
diff --git a/src/tools/cephfs_mirror/ServiceDaemon.cc b/src/tools/cephfs_mirror/ServiceDaemon.cc
new file mode 100644
index 000000000..f66dd46bf
--- /dev/null
+++ b/src/tools/cephfs_mirror/ServiceDaemon.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "include/stringify.h"
+#include "ServiceDaemon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::ServiceDaemon: " << this << " " \
+ << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+// boost::variant visitor that dumps an AttributeValue (bool/uint64/string)
+// into a Formatter under the given attribute name.
+struct AttributeDumpVisitor : public boost::static_visitor<void> {
+ ceph::Formatter *f;
+ std::string name;
+
+ AttributeDumpVisitor(ceph::Formatter *f, std::string_view name)
+ : f(f), name(name) {
+ }
+
+ void operator()(bool val) const {
+ f->dump_bool(name.c_str(), val);
+ }
+ void operator()(uint64_t val) const {
+ f->dump_unsigned(name.c_str(), val);
+ }
+ void operator()(const std::string &val) const {
+ f->dump_string(name.c_str(), val);
+ }
+};
+
+} // anonymous namespace
+
+// Construct the service daemon helper and start its SafeTimer, which is
+// used to debounce status updates (see schedule_update_status()).
+ServiceDaemon::ServiceDaemon(CephContext *cct, RadosRef rados)
+ : m_cct(cct),
+ m_rados(rados),
+ m_timer(new SafeTimer(cct, m_timer_lock, true)) {
+ m_timer->init();
+}
+
+// Cancel any pending status-update event and shut the timer down before
+// destroying it.
+ServiceDaemon::~ServiceDaemon() {
+ dout(10) << dendl;
+ {
+ std::scoped_lock timer_lock(m_timer_lock);
+ if (m_timer_ctx != nullptr) {
+ dout(5) << ": canceling timer task=" << m_timer_ctx << dendl;
+ m_timer->cancel_event(m_timer_ctx);
+ }
+ m_timer->shutdown();
+ }
+
+ delete m_timer;
+}
+
+// Register this process as a "cephfs-mirror" service daemon with the
+// cluster, using the auth id (sans the "cephfs-mirror." prefix) and the
+// rados instance id as metadata. Returns 0 on success, negative errno on
+// failure.
+int ServiceDaemon::init() {
+ dout(20) << dendl;
+
+ std::string id = m_cct->_conf->name.get_id();
+ if (id.find(CEPHFS_MIRROR_AUTH_ID_PREFIX) == 0) {
+ id = id.substr(CEPHFS_MIRROR_AUTH_ID_PREFIX.size());
+ }
+ std::string instance_id = stringify(m_rados->get_instance_id());
+
+ std::map<std::string, std::string> service_metadata = {{"id", id},
+ {"instance_id", instance_id}};
+ int r = m_rados->service_daemon_register("cephfs-mirror", instance_id,
+ service_metadata);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+// Track a filesystem for status reporting and schedule a status refresh.
+void ServiceDaemon::add_filesystem(fs_cluster_id_t fscid, std::string_view fs_name) {
+ dout(10) << ": fscid=" << fscid << ", fs_name=" << fs_name << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_filesystems.emplace(fscid, Filesystem(fs_name));
+ }
+ schedule_update_status();
+}
+
+// Stop tracking a filesystem and schedule a status refresh.
+void ServiceDaemon::remove_filesystem(fs_cluster_id_t fscid) {
+ dout(10) << ": fscid=" << fscid << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_filesystems.erase(fscid);
+ }
+ schedule_update_status();
+}
+
+// Track a peer under the given filesystem (no-op if the filesystem is
+// unknown) and schedule a status refresh.
+void ServiceDaemon::add_peer(fs_cluster_id_t fscid, const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+ fs_it->second.peer_attributes.emplace(peer, Attributes{});
+ }
+ schedule_update_status();
+}
+
+// Stop tracking a peer under the given filesystem (no-op if the
+// filesystem is unknown) and schedule a status refresh.
+void ServiceDaemon::remove_peer(fs_cluster_id_t fscid, const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+ fs_it->second.peer_attributes.erase(peer);
+ }
+ schedule_update_status();
+}
+
+// Set (or overwrite) a per-filesystem status attribute; silently ignored
+// if the filesystem is unknown. Schedules a status refresh.
+void ServiceDaemon::add_or_update_fs_attribute(fs_cluster_id_t fscid, std::string_view key,
+ AttributeValue value) {
+ dout(10) << ": fscid=" << fscid << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+
+ fs_it->second.fs_attributes[std::string(key)] = value;
+ }
+ schedule_update_status();
+}
+
+// Set (or overwrite) a per-peer status attribute; silently ignored if the
+// filesystem or peer is unknown. Schedules a status refresh.
+void ServiceDaemon::add_or_update_peer_attribute(fs_cluster_id_t fscid, const Peer &peer,
+ std::string_view key, AttributeValue value) {
+ dout(10) << ": fscid=" << fscid << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+
+ auto peer_it = fs_it->second.peer_attributes.find(peer);
+ if (peer_it == fs_it->second.peer_attributes.end()) {
+ return;
+ }
+
+ peer_it->second[std::string(key)] = value;
+ }
+ schedule_update_status();
+}
+
+// Debounced status push: schedule update_status() to run 1 second from
+// now unless an update is already pending (m_timer_ctx non-null).
+void ServiceDaemon::schedule_update_status() {
+ dout(10) << dendl;
+
+ std::scoped_lock timer_lock(m_timer_lock);
+ if (m_timer_ctx != nullptr) {
+ return;
+ }
+
+ m_timer_ctx = new LambdaContext([this] {
+ // runs under m_timer_lock (SafeTimer callback); allow re-scheduling
+ m_timer_ctx = nullptr;
+ update_status();
+ });
+ m_timer->add_event_after(1, m_timer_ctx);
+}
+
+// Serialize the tracked filesystems, their attributes and per-peer stats
+// to JSON and push it to the cluster as the daemon's "status_json".
+void ServiceDaemon::update_status() {
+ dout(20) << ": " << m_filesystems.size() << " filesystem(s)" << dendl;
+
+ ceph::JSONFormatter f;
+ {
+ std::scoped_lock locker(m_lock);
+ f.open_object_section("filesystems");
+ for (auto &[fscid, filesystem] : m_filesystems) {
+ f.open_object_section(stringify(fscid).c_str());
+ f.dump_string("name", filesystem.fs_name);
+ for (auto &[attr_name, attr_value] : filesystem.fs_attributes) {
+ AttributeDumpVisitor visitor(&f, attr_name);
+ boost::apply_visitor(visitor, attr_value);
+ }
+ f.open_object_section("peers");
+ for (auto &[peer, attributes] : filesystem.peer_attributes) {
+ f.open_object_section(peer.uuid);
+ f.dump_object("remote", peer.remote);
+ f.open_object_section("stats");
+ for (auto &[attr_name, attr_value] : attributes) {
+ AttributeDumpVisitor visitor(&f, attr_name);
+ boost::apply_visitor(visitor, attr_value);
+ }
+ f.close_section(); // stats
+ f.close_section(); // peer.uuid
+ }
+ f.close_section(); // peers
+ f.close_section(); // fscid
+ }
+ f.close_section(); // filesystems
+ }
+
+ std::stringstream ss;
+ f.flush(ss);
+
+ // best-effort: failure is logged, not propagated
+ int r = m_rados->service_daemon_update_status({{"status_json", ss.str()}});
+ if (r < 0) {
+ derr << ": failed to update service daemon status: " << cpp_strerror(r)
+ << dendl;
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/ServiceDaemon.h b/src/tools/cephfs_mirror/ServiceDaemon.h
new file mode 100644
index 000000000..83eee286d
--- /dev/null
+++ b/src/tools/cephfs_mirror/ServiceDaemon.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_SERVICE_DAEMON_H
+#define CEPHFS_MIRROR_SERVICE_DAEMON_H
+
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "mds/FSMap.h"
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+// Registers the cephfs-mirror process as a cluster service daemon and
+// periodically publishes per-filesystem/per-peer status attributes
+// (debounced via a SafeTimer).
+class ServiceDaemon {
+public:
+ ServiceDaemon(CephContext *cct, RadosRef rados);
+ ~ServiceDaemon();
+
+ int init();
+
+ void add_filesystem(fs_cluster_id_t fscid, std::string_view fs_name);
+ void remove_filesystem(fs_cluster_id_t fscid);
+
+ void add_peer(fs_cluster_id_t fscid, const Peer &peer);
+ void remove_peer(fs_cluster_id_t fscid, const Peer &peer);
+
+ void add_or_update_fs_attribute(fs_cluster_id_t fscid, std::string_view key,
+ AttributeValue value);
+ void add_or_update_peer_attribute(fs_cluster_id_t fscid, const Peer &peer,
+ std::string_view key, AttributeValue value);
+
+private:
+ // per-filesystem status: name, free-form attributes and per-peer
+ // attribute maps
+ struct Filesystem {
+ std::string fs_name;
+ Attributes fs_attributes;
+ std::map<Peer, Attributes> peer_attributes;
+
+ Filesystem(std::string_view fs_name)
+ : fs_name(fs_name) {
+ }
+ };
+
+ const std::string CEPHFS_MIRROR_AUTH_ID_PREFIX = "cephfs-mirror.";
+
+ CephContext *m_cct;
+ RadosRef m_rados;
+ SafeTimer *m_timer;
+ ceph::mutex m_timer_lock = ceph::make_mutex("cephfs::mirror::ServiceDaemon");
+
+ // m_lock guards m_filesystems; m_timer_lock guards the timer state
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::service_daemon");
+ Context *m_timer_ctx = nullptr;
+ std::map<fs_cluster_id_t, Filesystem> m_filesystems;
+
+ void schedule_update_status();
+ void update_status();
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_SERVICE_DAEMON_H
diff --git a/src/tools/cephfs_mirror/Types.cc b/src/tools/cephfs_mirror/Types.cc
new file mode 100644
index 000000000..0049f9d79
--- /dev/null
+++ b/src/tools/cephfs_mirror/Types.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+std::ostream& operator<<(std::ostream& out, const Filesystem &filesystem) {
+ out << "{fscid=" << filesystem.fscid << ", fs_name=" << filesystem.fs_name << "}";
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const FilesystemSpec &spec) {
+ out << "{filesystem=" << spec.filesystem << ", pool_id=" << spec.pool_id << "}";
+ return out;
+}
+
+} // namespace mirror
+} // namespace cephfs
+
diff --git a/src/tools/cephfs_mirror/Types.h b/src/tools/cephfs_mirror/Types.h
new file mode 100644
index 000000000..016a8dc86
--- /dev/null
+++ b/src/tools/cephfs_mirror/Types.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_TYPES_H
+#define CEPHFS_MIRROR_TYPES_H
+
+#include <set>
+#include <iostream>
+#include <string_view>
+
+#include "include/rados/librados.hpp"
+#include "include/cephfs/libcephfs.h"
+#include "mds/mdstypes.h"
+
+namespace cephfs {
+namespace mirror {
+
+static const std::string CEPHFS_MIRROR_OBJECT("cephfs_mirror");
+
+typedef boost::variant<bool, uint64_t, std::string> AttributeValue;
+typedef std::map<std::string, AttributeValue> Attributes;
+
+// distinct filesystem identifier
+struct Filesystem {
+ fs_cluster_id_t fscid;
+ std::string fs_name;
+
+ bool operator==(const Filesystem &rhs) const {
+ return (fscid == rhs.fscid &&
+ fs_name == rhs.fs_name);
+ }
+
+ bool operator!=(const Filesystem &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator<(const Filesystem &rhs) const {
+ if (fscid != rhs.fscid) {
+ return fscid < rhs.fscid;
+ }
+
+ return fs_name < rhs.fs_name;
+ }
+};
+
+// specification of a filesystem -- pool id the metadata pool id.
+struct FilesystemSpec {
+ FilesystemSpec() = default;
+ FilesystemSpec(const Filesystem &filesystem, uint64_t pool_id)
+ : filesystem(filesystem),
+ pool_id(pool_id) {
+ }
+ FilesystemSpec(fs_cluster_id_t fscid, std::string_view fs_name, uint64_t pool_id)
+ : filesystem(Filesystem{fscid, std::string(fs_name)}),
+ pool_id(pool_id) {
+ }
+
+ Filesystem filesystem;
+  uint64_t pool_id = 0;  // initialized: read by operator==/operator< even when default-constructed
+
+ bool operator==(const FilesystemSpec &rhs) const {
+ return (filesystem == rhs.filesystem &&
+ pool_id == rhs.pool_id);
+ }
+
+ bool operator<(const FilesystemSpec &rhs) const {
+ if (filesystem != rhs.filesystem) {
+ return filesystem < rhs.filesystem;
+ }
+
+ return pool_id < rhs.pool_id;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const Filesystem &filesystem);
+std::ostream& operator<<(std::ostream& out, const FilesystemSpec &spec);
+
+typedef std::shared_ptr<librados::Rados> RadosRef;
+typedef std::shared_ptr<librados::IoCtx> IoCtxRef;
+
+// not a shared_ptr since the type is incomplete
+typedef ceph_mount_info *MountRef;
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_TYPES_H
diff --git a/src/tools/cephfs_mirror/Utils.cc b/src/tools/cephfs_mirror/Utils.cc
new file mode 100644
index 000000000..1a8b8e0ac
--- /dev/null
+++ b/src/tools/cephfs_mirror/Utils.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::Utils " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+int connect(std::string_view client_name, std::string_view cluster_name,
+ RadosRef *cluster, std::string_view mon_host, std::string_view cephx_key,
+ std::vector<const char *> args) {
+ dout(20) << ": connecting to cluster=" << cluster_name << ", client=" << client_name
+ << ", mon_host=" << mon_host << dendl;
+
+ CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
+ if (client_name.empty() || !iparams.name.from_str(client_name)) {
+ derr << ": error initializing cluster handle for " << cluster_name << dendl;
+ return -EINVAL;
+ }
+
+ CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+ if (mon_host.empty()) {
+ cct->_conf->cluster = cluster_name;
+ }
+
+ int r = cct->_conf.parse_config_files(nullptr, nullptr, 0);
+ if (r < 0 && r != -ENOENT) {
+    derr << ": could not read ceph conf: " << cpp_strerror(r) << dendl;
+    cct->put(); return r;  // release CephContext ref like the later error paths
+ }
+
+ cct->_conf.parse_env(cct->get_module_type());
+
+ if (!args.empty()) {
+ r = cct->_conf.parse_argv(args);
+ if (r < 0) {
+ derr << ": could not parse command line args: " << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+ cct->_conf.parse_env(cct->get_module_type());
+
+ if (!mon_host.empty()) {
+ r = cct->_conf.set_val("mon_host", std::string(mon_host));
+ if (r < 0) {
+ derr << "failed to set mon_host config: " << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+ if (!cephx_key.empty()) {
+ r = cct->_conf.set_val("key", std::string(cephx_key));
+ if (r < 0) {
+ derr << "failed to set key config: " << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+
+ dout(10) << ": using mon addr=" << cct->_conf.get_val<std::string>("mon_host") << dendl;
+
+ cluster->reset(new librados::Rados());
+
+ r = (*cluster)->init_with_context(cct);
+ ceph_assert(r == 0);
+ cct->put();
+
+ r = (*cluster)->connect();
+ if (r < 0) {
+ derr << ": error connecting to " << cluster_name << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ dout(10) << ": connected to cluster=" << cluster_name << " using client="
+ << client_name << dendl;
+
+ return 0;
+}
+
+int mount(RadosRef cluster, const Filesystem &filesystem, bool cross_check_fscid,
+ MountRef *mount) {
+ dout(20) << ": filesystem=" << filesystem << dendl;
+
+ ceph_mount_info *cmi;
+ int r = ceph_create_with_context(&cmi, reinterpret_cast<CephContext*>(cluster->cct()));
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_conf_set(cmi, "client_mount_uid", "0");
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_conf_set(cmi, "client_mount_gid", "0");
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // mount timeout applies for local and remote mounts.
+ auto mount_timeout = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_mount_timeout").count();
+ r = ceph_set_mount_timeout(cmi, mount_timeout);
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_init(cmi);
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_select_filesystem(cmi, filesystem.fs_name.c_str());
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_mount(cmi, NULL);
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto fs_id = ceph_get_fs_cid(cmi);
+ if (cross_check_fscid && fs_id != filesystem.fscid) {
+ // this can happen in the most remotest possibility when a
+ // filesystem is deleted and recreated with the same name.
+ // since all this is driven asynchronously, we were able to
+ // mount the recreated filesystem. so bubble up the error.
+ // cleanup will eventually happen since a mirror disable event
+ // would have been queued.
+ derr << ": filesystem-id mismatch " << fs_id << " vs " << filesystem.fscid
+ << dendl;
+ // ignore errors, we are shutting down anyway.
+ ceph_unmount(cmi);
+ return -EINVAL;
+ }
+
+ dout(10) << ": mounted filesystem=" << filesystem << dendl;
+
+ *mount = cmi;
+ return 0;
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/Utils.h b/src/tools/cephfs_mirror/Utils.h
new file mode 100644
index 000000000..76b0c0726
--- /dev/null
+++ b/src/tools/cephfs_mirror/Utils.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_UTILS_H
+#define CEPHFS_MIRROR_UTILS_H
+
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+int connect(std::string_view client_name, std::string_view cluster_name,
+ RadosRef *cluster, std::string_view mon_host={}, std::string_view cephx_key={},
+ std::vector<const char *> args={});
+
+int mount(RadosRef cluster, const Filesystem &filesystem, bool cross_check_fscid,
+ MountRef *mount);
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_UTILS_H
diff --git a/src/tools/cephfs_mirror/Watcher.cc b/src/tools/cephfs_mirror/Watcher.cc
new file mode 100644
index 000000000..1445fce5f
--- /dev/null
+++ b/src/tools/cephfs_mirror/Watcher.cc
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "aio_utils.h"
+#include "watcher/RewatchRequest.h"
+#include "Watcher.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::Watcher " << __func__
+
+using cephfs::mirror::watcher::RewatchRequest;
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+struct C_UnwatchAndFlush : public Context {
+ librados::Rados rados;
+ Context *on_finish;
+ bool flushing = false;
+ int ret_val = 0;
+
+ C_UnwatchAndFlush(librados::IoCtx &ioctx, Context *on_finish)
+ : rados(ioctx), on_finish(on_finish) {
+ }
+
+ void complete(int r) override {
+ if (ret_val == 0 && r < 0) {
+ ret_val = r;
+ }
+
+ if (!flushing) {
+ flushing = true;
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<Context, &Context::complete>);
+ r = rados.aio_watch_flush(aio_comp);
+
+ ceph_assert(r == 0);
+ aio_comp->release();
+ return;
+ }
+
+ // ensure our reference to the RadosClient is released prior
+ // to completing the callback to avoid racing an explicit
+ // librados shutdown
+ Context *ctx = on_finish;
+ r = ret_val;
+ delete this;
+
+ ctx->complete(r);
+ }
+
+ void finish(int r) override {
+ }
+};
+
+} // anonymous namespace
+
+Watcher::Watcher(librados::IoCtx &ioctx, std::string_view oid, ContextWQ *work_queue)
+ : m_oid(oid),
+ m_ioctx(ioctx),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_shared_mutex("cephfs::mirror::snap_watcher")),
+ m_state(STATE_IDLE),
+ m_watch_ctx(*this) {
+}
+
+Watcher::~Watcher() {
+}
+
+void Watcher::register_watch(Context *on_finish) {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_state = STATE_REGISTERING;
+
+ on_finish = new C_RegisterWatch(this, on_finish);
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(on_finish, &rados_callback<Context, &Context::complete>);
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void Watcher::handle_register_watch(int r, Context *on_finish) {
+ dout(20) << ": r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REGISTERING);
+
+ m_state = STATE_IDLE;
+ if (r < 0) {
+ derr << ": failed to register watch: " << cpp_strerror(r) << dendl;
+ m_watch_handle = 0;
+ }
+
+ if (m_unregister_watch_ctx != nullptr) {
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == 0 && m_watch_error) {
+ derr << ": re-registering after watch error" << dendl;
+ m_state = STATE_REGISTERING;
+ watch_error = true;
+ } else {
+ m_watch_blocklisted = (r == -EBLOCKLISTED);
+ }
+ }
+
+ on_finish->complete(r);
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::unregister_watch(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ if (m_state != STATE_IDLE) {
+ dout(10) << ": delaying unregister -- watch register in progress" << dendl;
+ ceph_assert(m_unregister_watch_ctx == nullptr);
+ m_unregister_watch_ctx = new LambdaContext([this, on_finish](int r) {
+ unregister_watch(on_finish);
+ });
+ return;
+ } else if (is_registered()) {
+ // watch is registered -- unwatch
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(new C_UnwatchAndFlush(m_ioctx, on_finish),
+ &rados_callback<Context, &Context::complete>);
+ int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ m_watch_handle = 0;
+ m_watch_blocklisted = false;
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+}
+
+void Watcher::handle_error(uint64_t handle, int err) {
+ derr << ": handle=" << handle << ": " << cpp_strerror(err) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_watch_error = true;
+
+ if (is_registered()) {
+ m_state = STATE_REWATCHING;
+ if (err == -EBLOCKLISTED) {
+ m_watch_blocklisted = true;
+ }
+ m_work_queue->queue(new LambdaContext([this] {
+ rewatch();
+ }), 0);
+ }
+}
+
+void Watcher::rewatch() {
+ dout(20) << dendl;
+
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::unique_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_state = STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else {
+ m_watch_error = false;
+ Context *ctx = new C_CallbackAdapter<Watcher, &Watcher::handle_rewatch>(this);
+ auto req = RewatchRequest::create(m_ioctx, m_oid, m_lock,
+ &m_watch_ctx, &m_watch_handle, ctx);
+ req->send();
+ return;
+ }
+ }
+
+ unregister_watch_ctx->complete(0);
+}
+
+void Watcher::handle_rewatch(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REWATCHING);
+
+ m_watch_blocklisted = false;
+ if (m_unregister_watch_ctx != nullptr) {
+ dout(10) << ": skipping rewatch -- unregistering" << dendl;
+ m_state = STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLOCKLISTED) {
+ m_watch_blocklisted = true;
+ derr << ": client blocklisted" << dendl;
+ } else if (r == -ENOENT) {
+ dout(5) << ": object " << m_oid << " does not exist" << dendl;
+ } else if (r < 0) {
+ derr << ": failed to rewatch: " << cpp_strerror(r) << dendl;
+ watch_error = true;
+ } else if (m_watch_error) {
+ derr << ": re-registering watch after error" << dendl;
+ watch_error = true;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ return;
+ } else if (watch_error) {
+ rewatch();
+ return;
+ }
+
+ Context *ctx = new C_CallbackAdapter<Watcher, &Watcher::handle_rewatch_callback>(this);
+ m_work_queue->queue(ctx, r);
+}
+
+void Watcher::handle_rewatch_callback(int r) {
+ dout(10) << ": r=" << r << dendl;
+ handle_rewatch_complete(r);
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_state = STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLOCKLISTED || r == -ENOENT) {
+ m_state = STATE_IDLE;
+ } else if (r < 0 || m_watch_error) {
+ watch_error = true;
+ } else {
+ m_state = STATE_IDLE;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle, bufferlist &bl) {
+ m_ioctx.notify_ack(m_oid, notify_id, handle, bl);
+}
+
+void Watcher::WatchCtx::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ dout(20) << ": notify_id=" << notify_id << ", handle=" << handle
+ << ", notifier_id=" << notifier_id << dendl;
+ watcher.handle_notify(notify_id, handle, notifier_id, bl);
+}
+
+void Watcher::WatchCtx::handle_error(uint64_t handle, int err) {
+ dout(20) << dendl;
+ watcher.handle_error(handle, err);
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/Watcher.h b/src/tools/cephfs_mirror/Watcher.h
new file mode 100644
index 000000000..9e7c54eeb
--- /dev/null
+++ b/src/tools/cephfs_mirror/Watcher.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_WATCHER_H
+#define CEPHFS_MIRROR_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// generic watcher class -- establish watch on a given rados object
+// and invoke handle_notify() when notified. On notify error, try
+// to re-establish the watch. Errors during rewatch are notified via
+// handle_rewatch_complete().
+
+class Watcher {
+public:
+ Watcher(librados::IoCtx &ioctx, std::string_view oid, ContextWQ *work_queue);
+ virtual ~Watcher();
+
+ void register_watch(Context *on_finish);
+ void unregister_watch(Context *on_finish);
+
+protected:
+ std::string m_oid;
+
+  void acknowledge_notify(uint64_t notify_id, uint64_t handle, bufferlist &bl);
+
+ bool is_registered() const {
+ return m_state == STATE_IDLE && m_watch_handle != 0;
+ }
+ bool is_unregistered() const {
+ return m_state == STATE_IDLE && m_watch_handle == 0;
+ }
+
+ virtual void handle_rewatch_complete(int r) { }
+
+private:
+ enum State {
+ STATE_IDLE,
+ STATE_REGISTERING,
+ STATE_REWATCHING
+ };
+
+ struct WatchCtx : public librados::WatchCtx2 {
+ Watcher &watcher;
+
+ WatchCtx(Watcher &parent) : watcher(parent) {}
+
+ void handle_notify(uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ bufferlist& bl) override;
+ void handle_error(uint64_t handle, int err) override;
+ };
+
+ struct C_RegisterWatch : public Context {
+ Watcher *watcher;
+ Context *on_finish;
+
+ C_RegisterWatch(Watcher *watcher, Context *on_finish)
+ : watcher(watcher),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ watcher->handle_register_watch(r, on_finish);
+ }
+ };
+
+ librados::IoCtx &m_ioctx;
+ ContextWQ *m_work_queue;
+
+ mutable ceph::shared_mutex m_lock;
+ State m_state;
+ bool m_watch_error = false;
+ bool m_watch_blocklisted = false;
+  uint64_t m_watch_handle = 0;
+ WatchCtx m_watch_ctx;
+ Context *m_unregister_watch_ctx = nullptr;
+
+ virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) = 0;
+ void handle_error(uint64_t handle, int err);
+
+ void rewatch();
+ void handle_rewatch(int r);
+ void handle_rewatch_callback(int r);
+ void handle_register_watch(int r, Context *on_finish);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_WATCHER_H
diff --git a/src/tools/cephfs_mirror/aio_utils.h b/src/tools/cephfs_mirror/aio_utils.h
new file mode 100644
index 000000000..43f356381
--- /dev/null
+++ b/src/tools/cephfs_mirror/aio_utils.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_AIO_UTILS_H
+#define CEPHFS_MIRROR_AIO_UTILS_H
+
+#include "include/rados/librados.hpp"
+
+namespace cephfs {
+namespace mirror {
+
+template <typename T, void(T::*MF)(int)>
+void rados_callback(rados_completion_t c, void *arg) {
+ T *obj = reinterpret_cast<T*>(arg);
+ int r = rados_aio_get_return_value(c);
+ (obj->*MF)(r);
+}
+
+template <typename T, void (T::*MF)(int)>
+class C_CallbackAdapter : public Context {
+ T *obj;
+public:
+ C_CallbackAdapter(T *obj)
+ : obj(obj) {
+ }
+
+protected:
+ void finish(int r) override {
+ (obj->*MF)(r);
+ }
+};
+
+template <typename WQ>
+struct C_AsyncCallback : public Context {
+ WQ *op_work_queue;
+ Context *on_finish;
+
+ C_AsyncCallback(WQ *op_work_queue, Context *on_finish)
+ : op_work_queue(op_work_queue), on_finish(on_finish) {
+ }
+ ~C_AsyncCallback() override {
+ delete on_finish;
+ }
+ void finish(int r) override {
+ op_work_queue->queue(on_finish, r);
+ on_finish = nullptr;
+ }
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_AIO_UTILS_H
diff --git a/src/tools/cephfs_mirror/main.cc b/src/tools/cephfs_mirror/main.cc
new file mode 100644
index 000000000..efaa89c35
--- /dev/null
+++ b/src/tools/cephfs_mirror/main.cc
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/async/context_pool.h"
+#include "common/Preforker.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "mon/MonClient.h"
+#include "msg/Messenger.h"
+#include "Mirror.h"
+
+#include <vector>
+
+void usage() {
+ std::cout << "usage: cephfs-mirror [options...]" << std::endl;
+ std::cout << "options:\n";
+ std::cout << " --mon-host monaddress[:port] connect to specified monitor\n";
+ std::cout << " --keyring=<path> path to keyring for local cluster\n";
+ std::cout << " --log-file=<logfile> file to log debug output\n";
+ std::cout << " --debug-cephfs-mirror=<log-level>/<memory-level> set cephfs-mirror debug level\n";
+ generic_server_usage();
+}
+
+cephfs::mirror::Mirror *mirror = nullptr;
+
+static void handle_signal(int signum) {
+ if (mirror) {
+ mirror->handle_signal(signum);
+ }
+}
+
+int main(int argc, const char **argv) {
+ std::vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ ::exit(1);
+ }
+
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ ::exit(0);
+ }
+
+ auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+ Preforker forker;
+ if (global_init_prefork(g_ceph_context) >= 0) {
+ std::string err;
+ int r = forker.prefork(err);
+ if (r < 0) {
+ cerr << err << std::endl;
+ return r;
+ }
+ if (forker.is_parent()) {
+ g_ceph_context->_log->start();
+ if (forker.parent_wait(err) != 0) {
+ return -ENXIO;
+ }
+ return 0;
+ }
+ global_init_postfork_start(g_ceph_context);
+ }
+
+ common_init_finish(g_ceph_context);
+
+ bool daemonize = g_conf().get_val<bool>("daemonize");
+ if (daemonize) {
+ global_init_postfork_finish(g_ceph_context);
+ forker.daemonize();
+ }
+
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, handle_signal);
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+ std::vector<const char*> cmd_args;
+ argv_to_vec(argc, argv, cmd_args);
+
+ Messenger *msgr = Messenger::create_client_messenger(g_ceph_context, "client");
+ msgr->set_default_policy(Messenger::Policy::lossy_client(0));
+
+ std::string reason;
+ ceph::async::io_context_pool ctxpool(1);
+  MonClient monc(g_ceph_context, ctxpool);
+ int r = monc.build_initial_monmap();
+ if (r < 0) {
+ cerr << "failed to generate initial monmap" << std::endl;
+ goto cleanup_messenger;
+ }
+
+ msgr->start();
+
+ mirror = new cephfs::mirror::Mirror(g_ceph_context, cmd_args, &monc, msgr);
+ r = mirror->init(reason);
+ if (r < 0) {
+ std::cerr << "failed to initialize cephfs-mirror: " << reason << std::endl;
+    delete mirror; mirror = nullptr; goto cleanup;
+ }
+
+ mirror->run();
+  delete mirror; mirror = nullptr;
+
+cleanup:
+ monc.shutdown();
+cleanup_messenger:
+ msgr->shutdown();
+ msgr->wait();
+ delete msgr;
+
+ unregister_async_signal_handler(SIGHUP, handle_signal);
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ return forker.signal_exit(r);
+}
diff --git a/src/tools/cephfs_mirror/watcher/RewatchRequest.cc b/src/tools/cephfs_mirror/watcher/RewatchRequest.cc
new file mode 100644
index 000000000..3070e6f8b
--- /dev/null
+++ b/src/tools/cephfs_mirror/watcher/RewatchRequest.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_mutex.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/Context.h"
+#include "tools/cephfs_mirror/aio_utils.h"
+#include "RewatchRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::watcher:RewatchRequest " << __func__
+
+namespace cephfs {
+namespace mirror {
+namespace watcher {
+
+RewatchRequest::RewatchRequest(librados::IoCtx &ioctx, const std::string &oid,
+ ceph::shared_mutex &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish)
+ : m_ioctx(ioctx), m_oid(oid), m_lock(watch_lock),
+ m_watch_ctx(watch_ctx), m_watch_handle(watch_handle),
+ m_on_finish(on_finish) {
+}
+
+void RewatchRequest::send() {
+ unwatch();
+}
+
+void RewatchRequest::unwatch() {
+ ceph_assert(ceph_mutex_is_wlocked(m_lock));
+ if (*m_watch_handle == 0) {
+ rewatch();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ uint64_t watch_handle = 0;
+ std::swap(*m_watch_handle, watch_handle);
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<RewatchRequest, &RewatchRequest::handle_unwatch>);
+ int r = m_ioctx.aio_unwatch(watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void RewatchRequest::handle_unwatch(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r == -EBLOCKLISTED) {
+ derr << ": client blocklisted" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << ": failed to unwatch: " << cpp_strerror(r) << dendl;
+ }
+
+ rewatch();
+}
+
+void RewatchRequest::rewatch() {
+ dout(20) << dendl;
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<RewatchRequest, &RewatchRequest::handle_rewatch>);
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_rewatch_handle, m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void RewatchRequest::handle_rewatch(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to watch object: " << cpp_strerror(r) << dendl;
+ m_rewatch_handle = 0;
+ }
+
+ {
+ std::unique_lock locker(m_lock);
+ *m_watch_handle = m_rewatch_handle;
+ }
+
+ finish(r);
+}
+
+void RewatchRequest::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace watcher
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/watcher/RewatchRequest.h b/src/tools/cephfs_mirror/watcher/RewatchRequest.h
new file mode 100644
index 000000000..453fcb219
--- /dev/null
+++ b/src/tools/cephfs_mirror/watcher/RewatchRequest.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_WATCHER_REWATCH_REQUEST_H
+#define CEPHFS_MIRROR_WATCHER_REWATCH_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+struct Context;
+
+namespace cephfs {
+namespace mirror {
+namespace watcher {
+
+// Rewatch an existing watch -- the watch can be in an operational
+// or error state.
+
+class RewatchRequest {
+public:
+
+ static RewatchRequest *create(librados::IoCtx &ioctx, const std::string &oid,
+ ceph::shared_mutex &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish) {
+ return new RewatchRequest(ioctx, oid, watch_lock, watch_ctx, watch_handle,
+ on_finish);
+ }
+
+ RewatchRequest(librados::IoCtx &ioctx, const std::string &oid,
+ ceph::shared_mutex &watch_lock, librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish);
+
+ void send();
+
+private:
+ librados::IoCtx& m_ioctx;
+ std::string m_oid;
+ ceph::shared_mutex &m_lock;
+ librados::WatchCtx2 *m_watch_ctx;
+ uint64_t *m_watch_handle;
+ Context *m_on_finish;
+
+ uint64_t m_rewatch_handle = 0;
+
+ void unwatch();
+ void handle_unwatch(int r);
+
+ void rewatch();
+ void handle_rewatch(int r);
+
+ void finish(int r);
+};
+
+} // namespace watcher
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_WATCHER_REWATCH_REQUEST_H
diff --git a/src/tools/crimson/CMakeLists.txt b/src/tools/crimson/CMakeLists.txt
new file mode 100644
index 000000000..19a2cfa91
--- /dev/null
+++ b/src/tools/crimson/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(perf-crimson-msgr perf_crimson_msgr.cc)
+target_link_libraries(perf-crimson-msgr crimson)
+
+add_executable(perf-async-msgr perf_async_msgr.cc)
+target_link_libraries(perf-async-msgr ceph-common global ${ALLOC_LIBS})
+
+add_executable(perf-staged-fltree perf_staged_fltree.cc)
+target_link_libraries(perf-staged-fltree crimson-seastore)
diff --git a/src/tools/crimson/perf_async_msgr.cc b/src/tools/crimson/perf_async_msgr.cc
new file mode 100644
index 000000000..25d1d410e
--- /dev/null
+++ b/src/tools/crimson/perf_async_msgr.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include "auth/Auth.h"
+#include "global/global_init.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "messages/MOSDOp.h"
+
+#include "auth/DummyAuth.h"
+
+namespace {
+
+constexpr int CEPH_OSD_PROTOCOL = 10;
+
+struct Server {
+ Server(CephContext* cct, unsigned msg_len)
+ : dummy_auth(cct), dispatcher(cct, msg_len)
+ {
+ msgr.reset(Messenger::create(cct, "async", entity_name_t::OSD(0), "server", 0));
+ dummy_auth.auth_registry.refresh_config();
+ msgr->set_cluster_protocol(CEPH_OSD_PROTOCOL);
+ msgr->set_default_policy(Messenger::Policy::stateless_server(0));
+ msgr->set_auth_client(&dummy_auth);
+ msgr->set_auth_server(&dummy_auth);
+ msgr->set_require_authorizer(false);
+ }
+ DummyAuthClientServer dummy_auth;
+ unique_ptr<Messenger> msgr;
+ struct ServerDispatcher : Dispatcher {
+ unsigned msg_len = 0;
+ bufferlist msg_data;
+
+ ServerDispatcher(CephContext* cct, unsigned msg_len)
+ : Dispatcher(cct), msg_len(msg_len)
+ {
+ msg_data.append_zero(msg_len);
+ }
+ bool ms_can_fast_dispatch_any() const override {
+ return true;
+ }
+ bool ms_can_fast_dispatch(const Message* m) const override {
+ return m->get_type() == CEPH_MSG_OSD_OP;
+ }
+ void ms_fast_dispatch(Message* m) override {
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ const static pg_t pgid;
+ const static object_locator_t oloc;
+ const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(),
+ pgid.pool(), oloc.nspace);
+ static spg_t spgid(pgid);
+ MOSDOp *rep = new MOSDOp(0, 0, hobj, spgid, 0, 0, 0);
+ bufferlist data(msg_data);
+ rep->write(0, msg_len, data);
+ rep->set_tid(m->get_tid());
+ m->get_connection()->send_message(rep);
+ m->put();
+ }
+ bool ms_dispatch(Message*) override {
+ ceph_abort();
+ }
+ bool ms_handle_reset(Connection*) override {
+ return true;
+ }
+ void ms_handle_remote_reset(Connection*) override {
+ }
+ bool ms_handle_refused(Connection*) override {
+ return true;
+ }
+ } dispatcher;
+};
+
+}
+
+static void run(CephContext* cct, entity_addr_t addr, unsigned bs)
+{
+ std::cout << "async server listening at " << addr << std::endl;
+ Server server{cct, bs};
+ server.msgr->bind(addr);
+ server.msgr->add_dispatcher_head(&server.dispatcher);
+ server.msgr->start();
+ server.msgr->wait();
+}
+
+int main(int argc, char** argv)
+{
+ namespace po = boost::program_options;
+ po::options_description desc{"Allowed options"};
+ desc.add_options()
+ ("help,h", "show help message")
+ ("addr", po::value<std::string>()->default_value("v1:127.0.0.1:9010"),
+ "server address")
+ ("bs", po::value<unsigned>()->default_value(0),
+ "server block size")
+ ("v1-crc-enabled", po::value<bool>()->default_value(false),
+ "enable v1 CRC checks");
+ po::variables_map vm;
+ std::vector<std::string> unrecognized_options;
+ try {
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ if (vm.count("help")) {
+ std::cout << desc << std::endl;
+ return 0;
+ }
+ po::notify(vm);
+ unrecognized_options = po::collect_unrecognized(parsed.options, po::include_positional);
+ } catch(const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+
+ auto addr = vm["addr"].as<std::string>();
+ entity_addr_t target_addr;
+ target_addr.parse(addr.c_str(), nullptr);
+ auto bs = vm["bs"].as<unsigned>();
+ auto v1_crc_enabled = vm["v1-crc-enabled"].as<bool>();
+
+ std::vector<const char*> args(argv, argv + argc);
+ auto cct = global_init(nullptr, args,
+ CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+ common_init_finish(cct.get());
+
+ if (v1_crc_enabled) {
+ cct->_conf.set_val("ms_crc_header", "true");
+ cct->_conf.set_val("ms_crc_data", "true");
+ } else {
+ cct->_conf.set_val("ms_crc_header", "false");
+ cct->_conf.set_val("ms_crc_data", "false");
+ }
+
+ run(cct.get(), target_addr, bs);
+}
diff --git a/src/tools/crimson/perf_crimson_msgr.cc b/src/tools/crimson/perf_crimson_msgr.cc
new file mode 100644
index 000000000..e76f273a9
--- /dev/null
+++ b/src/tools/crimson/perf_crimson_msgr.cc
@@ -0,0 +1,746 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <map>
+#include <random>
+#include <boost/program_options.hpp>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/smp.hh>
+
+#include "common/ceph_time.h"
+#include "messages/MOSDOp.h"
+
+#include "crimson/auth/DummyAuth.h"
+#include "crimson/common/log.h"
+#include "crimson/net/Connection.h"
+#include "crimson/net/Dispatcher.h"
+#include "crimson/net/Messenger.h"
+
+namespace bpo = boost::program_options;
+
+namespace {
+
+template<typename Message>
+using Ref = boost::intrusive_ptr<Message>;
+
+// shortcut to the messenger-subsystem seastar logger
+seastar::logger& logger() {
+  return crimson::get_logger(ceph_subsys_ms);
+}
+
+// Construct a seastar::sharded<T> service and hand back a pointer to the
+// instance local to the *calling* CPU.  The sharded container itself is
+// kept alive solely by the lw_shared_ptr copies captured below; the
+// at_exit() hook both stops all shards and releases the last reference.
+template <typename T, typename... Args>
+seastar::future<T*> create_sharded(Args... args) {
+  // seems we should only construct/stop shards on #0
+  return seastar::smp::submit_to(0, [=] {
+    auto sharded_obj = seastar::make_lw_shared<seastar::sharded<T>>();
+    return sharded_obj->start(args...).then([sharded_obj]() {
+      seastar::engine().at_exit([sharded_obj]() {
+        // the extra capture keeps sharded_obj alive until stop() resolves
+        return sharded_obj->stop().then([sharded_obj] {});
+      });
+      return sharded_obj.get();
+    });
+  }).then([] (seastar::sharded<T> *ptr_shard) {
+    // return the pointer valid for the caller CPU
+    return &ptr_shard->local();
+  });
+}
+
+// which role(s) this process plays in the perf run; numeric values must
+// stay in sync with the --mode option (0: both, 1: client, 2: server)
+enum class perf_mode_t {
+  both,
+  client,
+  server
+};
+
+// aggregated command-line settings for the client side of the perf run
+struct client_config {
+  entity_addr_t server_addr;
+  unsigned block_size;
+  unsigned ramptime;
+  unsigned msgtime;
+  unsigned jobs;
+  unsigned depth;
+  bool v1_crc_enabled;
+
+  // render the settings in a single line for logging
+  std::string str() const {
+    std::ostringstream oss;
+    oss << "client[>> " << server_addr
+        << "](bs=" << block_size
+        << ", ramptime=" << ramptime
+        << ", msgtime=" << msgtime
+        << ", jobs=" << jobs
+        << ", depth=" << depth
+        << ", v1-crc-enabled=" << v1_crc_enabled
+        << ")";
+    return oss.str();
+  }
+
+  // build a client_config from parsed program options; aborts on a
+  // malformed address or a depth that does not divide evenly over jobs
+  static client_config load(bpo::variables_map& options) {
+    client_config cfg;
+    entity_addr_t parsed_addr;
+    ceph_assert(parsed_addr.parse(options["addr"].as<std::string>().c_str(), nullptr));
+    cfg.server_addr = parsed_addr;
+    cfg.block_size = options["cbs"].as<unsigned>();
+    cfg.ramptime = options["ramptime"].as<unsigned>();
+    cfg.msgtime = options["msgtime"].as<unsigned>();
+    cfg.jobs = options["jobs"].as<unsigned>();
+    cfg.depth = options["depth"].as<unsigned>();
+    // the io depth is split evenly across the client jobs
+    ceph_assert(cfg.depth % cfg.jobs == 0);
+    cfg.v1_crc_enabled = options["v1-crc-enabled"].as<bool>();
+    return cfg;
+  }
+};
+
+// aggregated command-line settings for the server side of the perf run
+struct server_config {
+  entity_addr_t addr;
+  unsigned block_size;
+  unsigned core;
+  bool v1_crc_enabled;
+
+  // render the settings in a single line for logging
+  std::string str() const {
+    std::ostringstream oss;
+    oss << "server[" << addr
+        << "](bs=" << block_size
+        << ", core=" << core
+        << ", v1-crc-enabled=" << v1_crc_enabled
+        << ")";
+    return oss.str();
+  }
+
+  // build a server_config from parsed program options; aborts on a
+  // malformed address
+  static server_config load(bpo::variables_map& options) {
+    server_config cfg;
+    entity_addr_t parsed_addr;
+    ceph_assert(parsed_addr.parse(options["addr"].as<std::string>().c_str(), nullptr));
+    cfg.addr = parsed_addr;
+    cfg.block_size = options["sbs"].as<unsigned>();
+    cfg.core = options["core"].as<unsigned>();
+    cfg.v1_crc_enabled = options["v1-crc-enabled"].as<bool>();
+    return cfg;
+  }
+};
+
+const unsigned SAMPLE_RATE = 7;
+
+static seastar::future<> run(
+ perf_mode_t mode,
+ const client_config& client_conf,
+ const server_config& server_conf)
+{
+ struct test_state {
+ struct Server;
+ using ServerFRef = seastar::foreign_ptr<std::unique_ptr<Server>>;
+
+    // Echo-style perf server: for every received MOSDOp it replies with a
+    // fresh MOSDOp carrying msg_len bytes of zeroed payload, so the client
+    // measures a full request/response round trip.  All messenger work is
+    // pinned to the shard (msgr_sid) the Server was created on.
+    struct Server final
+      : public crimson::net::Dispatcher {
+      crimson::net::MessengerRef msgr;
+      crimson::auth::DummyAuthClientServer dummy_auth;
+      const seastar::shard_id msgr_sid;  // shard owning msgr; all ops submit_to() it
+      std::string lname;                 // log name, "server#<shard>"
+      unsigned msg_len;                  // reply payload size in bytes
+      bufferlist msg_data;               // pre-built zeroed payload, copied per reply
+
+      Server(unsigned msg_len)
+        : msgr_sid{seastar::this_shard_id()},
+          msg_len{msg_len} {
+        lname = "server#";
+        lname += std::to_string(msgr_sid);
+        msg_data.append_zero(msg_len);
+      }
+
+      std::optional<seastar::future<>> ms_dispatch(
+          crimson::net::ConnectionRef c, MessageRef m) override {
+        ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+        // server replies with MOSDOp to generate server-side write workload
+        const static pg_t pgid;
+        const static object_locator_t oloc;
+        const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(),
+                                    pgid.pool(), oloc.nspace);
+        static spg_t spgid(pgid);
+        auto rep = make_message<MOSDOp>(0, 0, hobj, spgid, 0, 0, 0);
+        bufferlist data(msg_data);
+        rep->write(0, msg_len, data);
+        // echo the tid so the client can match the reply to its request
+        rep->set_tid(m->get_tid());
+        // fire-and-forget; send failures are ignored in this perf tool
+        std::ignore = c->send(std::move(rep));
+        return {seastar::now()};
+      }
+
+      // Create the messenger on the owning shard, bind to addr and start
+      // dispatching.  Aborts if the address is already in use.
+      seastar::future<> init(bool v1_crc_enabled, const entity_addr_t& addr) {
+        return seastar::smp::submit_to(msgr_sid, [v1_crc_enabled, addr, this] {
+          // server msgr is always with nonce 0
+          msgr = crimson::net::Messenger::create(entity_name_t::OSD(msgr_sid), lname, 0);
+          msgr->set_default_policy(crimson::net::SocketPolicy::stateless_server(0));
+          msgr->set_auth_client(&dummy_auth);
+          msgr->set_auth_server(&dummy_auth);
+          if (v1_crc_enabled) {
+            msgr->set_crc_header();
+            msgr->set_crc_data();
+          }
+          return msgr->bind(entity_addrvec_t{addr}).safe_then([this] {
+            return msgr->start({this});
+          }, crimson::net::Messenger::bind_ertr::all_same_way(
+              [addr] (const std::error_code& e) {
+            logger().error("Server: "
+                           "there is another instance running at {}", addr);
+            ceph_abort();
+          }));
+        });
+      }
+      seastar::future<> shutdown() {
+        logger().info("{} shutdown...", lname);
+        return seastar::smp::submit_to(msgr_sid, [this] {
+          ceph_assert(msgr);
+          msgr->stop();
+          return msgr->shutdown();
+        });
+      }
+      // Block until the messenger terminates (server-only mode runs forever).
+      seastar::future<> wait() {
+        return seastar::smp::submit_to(msgr_sid, [this] {
+          ceph_assert(msgr);
+          return msgr->wait();
+        });
+      }
+
+      // Construct a Server on the given shard and return it as a foreign
+      // pointer owned by the calling shard.
+      static seastar::future<ServerFRef> create(seastar::shard_id msgr_sid, unsigned msg_len) {
+        return seastar::smp::submit_to(msgr_sid, [msg_len] {
+          return seastar::make_foreign(std::make_unique<Server>(msg_len));
+        });
+      }
+    };
+
+ struct Client final
+ : public crimson::net::Dispatcher,
+ public seastar::peering_sharded_service<Client> {
+
+ struct ConnStats {
+ mono_time connecting_time = mono_clock::zero();
+ mono_time connected_time = mono_clock::zero();
+ unsigned received_count = 0u;
+
+ mono_time start_time = mono_clock::zero();
+ unsigned start_count = 0u;
+
+ unsigned sampled_count = 0u;
+ double total_lat_s = 0.0;
+
+ // for reporting only
+ mono_time finish_time = mono_clock::zero();
+
+ void start() {
+ start_time = mono_clock::now();
+ start_count = received_count;
+ sampled_count = 0u;
+ total_lat_s = 0.0;
+ finish_time = mono_clock::zero();
+ }
+ };
+ ConnStats conn_stats;
+
+ struct PeriodStats {
+ mono_time start_time = mono_clock::zero();
+ unsigned start_count = 0u;
+ unsigned sampled_count = 0u;
+ double total_lat_s = 0.0;
+
+ // for reporting only
+ mono_time finish_time = mono_clock::zero();
+ unsigned finish_count = 0u;
+ unsigned depth = 0u;
+
+ void reset(unsigned received_count, PeriodStats* snap = nullptr) {
+ if (snap) {
+ snap->start_time = start_time;
+ snap->start_count = start_count;
+ snap->sampled_count = sampled_count;
+ snap->total_lat_s = total_lat_s;
+ snap->finish_time = mono_clock::now();
+ snap->finish_count = received_count;
+ }
+ start_time = mono_clock::now();
+ start_count = received_count;
+ sampled_count = 0u;
+ total_lat_s = 0.0;
+ }
+ };
+ PeriodStats period_stats;
+
+ const seastar::shard_id sid;
+ std::string lname;
+
+ const unsigned jobs;
+ crimson::net::MessengerRef msgr;
+ const unsigned msg_len;
+ bufferlist msg_data;
+ const unsigned nr_depth;
+ seastar::semaphore depth;
+ std::vector<mono_time> time_msgs_sent;
+ crimson::auth::DummyAuthClientServer dummy_auth;
+
+ unsigned sent_count = 0u;
+ crimson::net::ConnectionRef active_conn = nullptr;
+
+ bool stop_send = false;
+ seastar::promise<> stopped_send_promise;
+
+ Client(unsigned jobs, unsigned msg_len, unsigned depth)
+ : sid{seastar::this_shard_id()},
+ jobs{jobs},
+ msg_len{msg_len},
+ nr_depth{depth/jobs},
+ depth{nr_depth},
+ time_msgs_sent{depth/jobs, mono_clock::zero()} {
+ lname = "client#";
+ lname += std::to_string(sid);
+ msg_data.append_zero(msg_len);
+ }
+
+ unsigned get_current_depth() const {
+ ceph_assert(depth.available_units() >= 0);
+ return nr_depth - depth.current();
+ }
+
+ void ms_handle_connect(crimson::net::ConnectionRef conn) override {
+ conn_stats.connected_time = mono_clock::now();
+ }
+ std::optional<seastar::future<>> ms_dispatch(
+ crimson::net::ConnectionRef, MessageRef m) override {
+ // server replies with MOSDOp to generate server-side write workload
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+ auto msg_id = m->get_tid();
+ if (msg_id % SAMPLE_RATE == 0) {
+ auto index = msg_id % time_msgs_sent.size();
+ ceph_assert(time_msgs_sent[index] != mono_clock::zero());
+ std::chrono::duration<double> cur_latency = mono_clock::now() - time_msgs_sent[index];
+ conn_stats.total_lat_s += cur_latency.count();
+ ++(conn_stats.sampled_count);
+ period_stats.total_lat_s += cur_latency.count();
+ ++(period_stats.sampled_count);
+ time_msgs_sent[index] = mono_clock::zero();
+ }
+
+ ++(conn_stats.received_count);
+ depth.signal(1);
+
+ return {seastar::now()};
+ }
+
+ // should start messenger at this shard?
+ bool is_active() {
+ ceph_assert(seastar::this_shard_id() == sid);
+ return sid != 0 && sid <= jobs;
+ }
+
+ seastar::future<> init(bool v1_crc_enabled) {
+ return container().invoke_on_all([v1_crc_enabled] (auto& client) {
+ if (client.is_active()) {
+ client.msgr = crimson::net::Messenger::create(entity_name_t::OSD(client.sid), client.lname, client.sid);
+ client.msgr->set_default_policy(crimson::net::SocketPolicy::lossy_client(0));
+ client.msgr->set_require_authorizer(false);
+ client.msgr->set_auth_client(&client.dummy_auth);
+ client.msgr->set_auth_server(&client.dummy_auth);
+ if (v1_crc_enabled) {
+ client.msgr->set_crc_header();
+ client.msgr->set_crc_data();
+ }
+ return client.msgr->start({&client});
+ }
+ return seastar::now();
+ });
+ }
+
+ seastar::future<> shutdown() {
+ return container().invoke_on_all([] (auto& client) {
+ if (client.is_active()) {
+ logger().info("{} shutdown...", client.lname);
+ ceph_assert(client.msgr);
+ client.msgr->stop();
+ return client.msgr->shutdown().then([&client] {
+ return client.stop_dispatch_messages();
+ });
+ }
+ return seastar::now();
+ });
+ }
+
+ seastar::future<> connect_wait_verify(const entity_addr_t& peer_addr) {
+ return container().invoke_on_all([peer_addr] (auto& client) {
+ // start clients in active cores (#1 ~ #jobs)
+ if (client.is_active()) {
+ mono_time start_time = mono_clock::now();
+ client.active_conn = client.msgr->connect(peer_addr, entity_name_t::TYPE_OSD);
+ // make sure handshake won't hurt the performance
+ return seastar::sleep(1s).then([&client, start_time] {
+ if (client.conn_stats.connected_time == mono_clock::zero()) {
+ logger().error("\n{} not connected after 1s!\n", client.lname);
+ ceph_assert(false);
+ }
+ client.conn_stats.connecting_time = start_time;
+ });
+ }
+ return seastar::now();
+ });
+ }
+
+ private:
+ class TimerReport {
+ private:
+ const unsigned jobs;
+ const unsigned msgtime;
+ const unsigned bytes_of_block;
+
+ unsigned elapsed = 0u;
+ std::vector<mono_time> start_times;
+ std::vector<PeriodStats> snaps;
+ std::vector<ConnStats> summaries;
+
+ public:
+ TimerReport(unsigned jobs, unsigned msgtime, unsigned bs)
+ : jobs{jobs},
+ msgtime{msgtime},
+ bytes_of_block{bs},
+ start_times{jobs, mono_clock::zero()},
+ snaps{jobs},
+ summaries{jobs} {}
+
+ unsigned get_elapsed() const { return elapsed; }
+
+ PeriodStats& get_snap_by_job(seastar::shard_id sid) {
+ ceph_assert(sid >= 1 && sid <= jobs);
+ return snaps[sid - 1];
+ }
+
+ ConnStats& get_summary_by_job(seastar::shard_id sid) {
+ ceph_assert(sid >= 1 && sid <= jobs);
+ return summaries[sid - 1];
+ }
+
+ bool should_stop() const {
+ return elapsed >= msgtime;
+ }
+
+ seastar::future<> ticktock() {
+ return seastar::sleep(1s).then([this] {
+ ++elapsed;
+ });
+ }
+
+ void report_header() {
+ std::ostringstream sout;
+ sout << std::setfill(' ')
+ << std::setw(7) << "sec"
+ << std::setw(6) << "depth"
+ << std::setw(8) << "IOPS"
+ << std::setw(8) << "MB/s"
+ << std::setw(8) << "lat(ms)";
+ std::cout << sout.str() << std::endl;
+ }
+
+ void report_period() {
+ if (elapsed == 1) {
+ // init this->start_times at the first period
+ for (unsigned i=0; i<jobs; ++i) {
+ start_times[i] = snaps[i].start_time;
+ }
+ }
+ std::chrono::duration<double> elapsed_d = 0s;
+ unsigned depth = 0u;
+ unsigned ops = 0u;
+ unsigned sampled_count = 0u;
+ double total_lat_s = 0.0;
+ for (const auto& snap: snaps) {
+ elapsed_d += (snap.finish_time - snap.start_time);
+ depth += snap.depth;
+ ops += (snap.finish_count - snap.start_count);
+ sampled_count += snap.sampled_count;
+ total_lat_s += snap.total_lat_s;
+ }
+ double elapsed_s = elapsed_d.count() / jobs;
+ double iops = ops/elapsed_s;
+ std::ostringstream sout;
+ sout << setfill(' ')
+ << std::setw(7) << elapsed_s
+ << std::setw(6) << depth
+ << std::setw(8) << iops
+ << std::setw(8) << iops * bytes_of_block / 1048576
+ << std::setw(8) << (total_lat_s / sampled_count * 1000);
+ std::cout << sout.str() << std::endl;
+ }
+
+ void report_summary() const {
+ std::chrono::duration<double> elapsed_d = 0s;
+ unsigned ops = 0u;
+ unsigned sampled_count = 0u;
+ double total_lat_s = 0.0;
+ for (const auto& summary: summaries) {
+ elapsed_d += (summary.finish_time - summary.start_time);
+ ops += (summary.received_count - summary.start_count);
+ sampled_count += summary.sampled_count;
+ total_lat_s += summary.total_lat_s;
+ }
+ double elapsed_s = elapsed_d.count() / jobs;
+ double iops = ops / elapsed_s;
+ std::ostringstream sout;
+ sout << "--------------"
+ << " summary "
+ << "--------------\n"
+ << setfill(' ')
+ << std::setw(7) << elapsed_s
+ << std::setw(6) << "-"
+ << std::setw(8) << iops
+ << std::setw(8) << iops * bytes_of_block / 1048576
+ << std::setw(8) << (total_lat_s / sampled_count * 1000)
+ << "\n";
+ std::cout << sout.str() << std::endl;
+ }
+ };
+
+ seastar::future<> report_period(TimerReport& report) {
+ return container().invoke_on_all([&report] (auto& client) {
+ if (client.is_active()) {
+ PeriodStats& snap = report.get_snap_by_job(client.sid);
+ client.period_stats.reset(client.conn_stats.received_count,
+ &snap);
+ snap.depth = client.get_current_depth();
+ }
+ }).then([&report] {
+ report.report_period();
+ });
+ }
+
+ seastar::future<> report_summary(TimerReport& report) {
+ return container().invoke_on_all([&report] (auto& client) {
+ if (client.is_active()) {
+ ConnStats& summary = report.get_summary_by_job(client.sid);
+ summary = client.conn_stats;
+ summary.finish_time = mono_clock::now();
+ }
+ }).then([&report] {
+ report.report_summary();
+ });
+ }
+
+ public:
+ seastar::future<> dispatch_with_timer(unsigned ramptime, unsigned msgtime) {
+ logger().info("[all clients]: start sending MOSDOps from {} clients", jobs);
+ return container().invoke_on_all([] (auto& client) {
+ if (client.is_active()) {
+ client.do_dispatch_messages(client.active_conn.get());
+ }
+ }).then([this, ramptime] {
+ logger().info("[all clients]: ramping up {} seconds...", ramptime);
+ return seastar::sleep(std::chrono::seconds(ramptime));
+ }).then([this] {
+ return container().invoke_on_all([] (auto& client) {
+ if (client.is_active()) {
+ client.conn_stats.start();
+ client.period_stats.reset(client.conn_stats.received_count);
+ }
+ });
+ }).then([this, msgtime] {
+ logger().info("[all clients]: reporting {} seconds...\n", msgtime);
+ return seastar::do_with(
+ TimerReport(jobs, msgtime, msg_len), [this] (auto& report) {
+ report.report_header();
+ return seastar::do_until(
+ [&report] { return report.should_stop(); },
+ [&report, this] {
+ return report.ticktock().then([&report, this] {
+ // report period every 1s
+ return report_period(report);
+ }).then([&report, this] {
+ // report summary every 10s
+ if (report.get_elapsed() % 10 == 0) {
+ return report_summary(report);
+ } else {
+ return seastar::now();
+ }
+ });
+ }
+ ).then([&report, this] {
+ // report the final summary
+ if (report.get_elapsed() % 10 != 0) {
+ return report_summary(report);
+ } else {
+ return seastar::now();
+ }
+ });
+ });
+ });
+ }
+
+ private:
+ seastar::future<> send_msg(crimson::net::Connection* conn) {
+ ceph_assert(seastar::this_shard_id() == sid);
+ return depth.wait(1).then([this, conn] {
+ const static pg_t pgid;
+ const static object_locator_t oloc;
+ const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(),
+ pgid.pool(), oloc.nspace);
+ static spg_t spgid(pgid);
+ auto m = make_message<MOSDOp>(0, 0, hobj, spgid, 0, 0, 0);
+ bufferlist data(msg_data);
+ m->write(0, msg_len, data);
+ // use tid as the identity of each round
+ m->set_tid(sent_count);
+
+ // sample message latency
+ if (sent_count % SAMPLE_RATE == 0) {
+ auto index = sent_count % time_msgs_sent.size();
+ ceph_assert(time_msgs_sent[index] == mono_clock::zero());
+ time_msgs_sent[index] = mono_clock::now();
+ }
+
+ return conn->send(std::move(m));
+ });
+ }
+
+ class DepthBroken: public std::exception {};
+
+ seastar::future<> stop_dispatch_messages() {
+ stop_send = true;
+ depth.broken(DepthBroken());
+ return stopped_send_promise.get_future();
+ }
+
+ void do_dispatch_messages(crimson::net::Connection* conn) {
+ ceph_assert(seastar::this_shard_id() == sid);
+ ceph_assert(sent_count == 0);
+ conn_stats.start_time = mono_clock::now();
+ // forwarded to stopped_send_promise
+ (void) seastar::do_until(
+ [this] { return stop_send; },
+ [this, conn] {
+ sent_count += 1;
+ return send_msg(conn);
+ }
+ ).handle_exception_type([] (const DepthBroken& e) {
+ // ok, stopped by stop_dispatch_messages()
+ }).then([this, conn] {
+ std::chrono::duration<double> dur_conn = conn_stats.connected_time - conn_stats.connecting_time;
+ std::chrono::duration<double> dur_msg = mono_clock::now() - conn_stats.start_time;
+ unsigned ops = conn_stats.received_count - conn_stats.start_count;
+ logger().info("{}: stopped sending OSDOPs.\n"
+ "{}(depth={}):\n"
+ " connect time: {}s\n"
+ " messages received: {}\n"
+ " messaging time: {}s\n"
+ " latency: {}ms\n"
+ " IOPS: {}\n"
+ " throughput: {}MB/s\n",
+ *conn,
+ lname,
+ nr_depth,
+ dur_conn.count(),
+ ops,
+ dur_msg.count(),
+ conn_stats.total_lat_s / conn_stats.sampled_count * 1000,
+ ops / dur_msg.count(),
+ ops / dur_msg.count() * msg_len / 1048576);
+ stopped_send_promise.set_value();
+ });
+ }
+ };
+ };
+
+ return seastar::when_all(
+ test_state::Server::create(server_conf.core, server_conf.block_size),
+ create_sharded<test_state::Client>(client_conf.jobs, client_conf.block_size, client_conf.depth)
+ ).then([=](auto&& ret) {
+ auto fp_server = std::move(std::get<0>(ret).get0());
+ auto client = std::move(std::get<1>(ret).get0());
+ test_state::Server* server = fp_server.get();
+ if (mode == perf_mode_t::both) {
+ logger().info("\nperf settings:\n {}\n {}\n",
+ client_conf.str(), server_conf.str());
+ ceph_assert(seastar::smp::count >= 1+client_conf.jobs);
+ ceph_assert(client_conf.jobs > 0);
+ ceph_assert(seastar::smp::count >= 1+server_conf.core);
+ ceph_assert(server_conf.core == 0 || server_conf.core > client_conf.jobs);
+ return seastar::when_all_succeed(
+ server->init(server_conf.v1_crc_enabled, server_conf.addr),
+ client->init(client_conf.v1_crc_enabled)
+ ).then_unpack([client, addr = client_conf.server_addr] {
+ return client->connect_wait_verify(addr);
+ }).then([client, ramptime = client_conf.ramptime,
+ msgtime = client_conf.msgtime] {
+ return client->dispatch_with_timer(ramptime, msgtime);
+ }).then([client] {
+ return client->shutdown();
+ }).then([server, fp_server = std::move(fp_server)] () mutable {
+ return server->shutdown().then([cleanup = std::move(fp_server)] {});
+ });
+ } else if (mode == perf_mode_t::client) {
+ logger().info("\nperf settings:\n {}\n", client_conf.str());
+ ceph_assert(seastar::smp::count >= 1+client_conf.jobs);
+ ceph_assert(client_conf.jobs > 0);
+ return client->init(client_conf.v1_crc_enabled
+ ).then([client, addr = client_conf.server_addr] {
+ return client->connect_wait_verify(addr);
+ }).then([client, ramptime = client_conf.ramptime,
+ msgtime = client_conf.msgtime] {
+ return client->dispatch_with_timer(ramptime, msgtime);
+ }).then([client] {
+ return client->shutdown();
+ });
+ } else { // mode == perf_mode_t::server
+ ceph_assert(seastar::smp::count >= 1+server_conf.core);
+ logger().info("\nperf settings:\n {}\n", server_conf.str());
+ return server->init(server_conf.v1_crc_enabled, server_conf.addr
+ // dispatch ops
+ ).then([server] {
+ return server->wait();
+ // shutdown
+ }).then([server, fp_server = std::move(fp_server)] () mutable {
+ return server->shutdown().then([cleanup = std::move(fp_server)] {});
+ });
+ }
+ });
+}
+
+}
+
+// Entry point: register perf options on the seastar app template, decode
+// the run mode, and drive run() with both client and server configs.
+int main(int argc, char** argv)
+{
+  seastar::app_template app;
+  app.add_options()
+    ("mode", bpo::value<unsigned>()->default_value(0),
+     "0: both, 1:client, 2:server")
+    ("addr", bpo::value<std::string>()->default_value("v1:127.0.0.1:9010"),
+     "server address")
+    ("ramptime", bpo::value<unsigned>()->default_value(5),
+     "seconds of client ramp-up time")
+    ("msgtime", bpo::value<unsigned>()->default_value(15),
+     "seconds of client messaging time")
+    ("jobs", bpo::value<unsigned>()->default_value(1),
+     "number of client jobs (messengers)")
+    ("cbs", bpo::value<unsigned>()->default_value(4096),
+     "client block size")
+    ("depth", bpo::value<unsigned>()->default_value(512),
+     "client io depth")
+    ("core", bpo::value<unsigned>()->default_value(0),
+     "server running core")
+    ("sbs", bpo::value<unsigned>()->default_value(0),
+     "server block size")
+    ("v1-crc-enabled", bpo::value<bool>()->default_value(false),
+     "enable v1 CRC checks");
+  return app.run(argc, argv, [&app] {
+    auto&& config = app.configuration();
+    auto mode = config["mode"].as<unsigned>();
+    // must map cleanly onto perf_mode_t {both=0, client=1, server=2}
+    ceph_assert(mode <= 2);
+    auto _mode = static_cast<perf_mode_t>(mode);
+    // both configs are loaded unconditionally; run() ignores the unused one
+    auto server_conf = server_config::load(config);
+    auto client_conf = client_config::load(config);
+    return run(_mode, client_conf, server_conf).then([] {
+      logger().info("\nsuccessful!\n");
+    }).handle_exception([] (auto eptr) {
+      logger().info("\nfailed!\n");
+      return seastar::make_exception_future<>(eptr);
+    });
+  });
+}
diff --git a/src/tools/crimson/perf_staged_fltree.cc b/src/tools/crimson/perf_staged_fltree.cc
new file mode 100644
index 000000000..14f863508
--- /dev/null
+++ b/src/tools/crimson/perf_staged_fltree.cc
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/program_options.hpp>
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/thread.hh>
+
+#include "crimson/common/log.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h"
+#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h"
+#include "test/crimson/seastore/transaction_manager_test_state.h"
+
+using namespace crimson::os::seastore::onode;
+namespace bpo = boost::program_options;
+
+// shortcut to the test-subsystem seastar logger
+seastar::logger& logger() {
+  return crimson::get_logger(ceph_subsys_test);
+}
+
+// Benchmark harness for the staged flat-tree onode manager.  TRACK toggles
+// cursor tracking in TreeBuilder; is_dummy selects the in-memory dummy
+// extent manager instead of the real seastore backend.
+template <bool TRACK>
+class PerfTree : public TMTestState {
+ public:
+  PerfTree(bool is_dummy) : is_dummy{is_dummy} {}
+
+  // Bootstrap a tree, bulk-insert kvs (timing only the commit), collect
+  // stats, then validate the tree contents before tearing down the TM.
+  seastar::future<> run(KVPool& kvs) {
+    return tm_setup().then([this, &kvs] {
+      return seastar::async([this, &kvs] {
+        auto tree = std::make_unique<TreeBuilder<TRACK>>(kvs,
+            (is_dummy ? NodeExtentManager::create_dummy(true)
+                      : NodeExtentManager::create_seastore(*tm)));
+        {
+          // create the empty root node
+          auto t = tm->create_transaction();
+          tree->bootstrap(*t).unsafe_get();
+          tm->submit_transaction(std::move(t)).unsafe_get();
+        }
+        {
+          // all inserts go into one transaction; only the commit is timed
+          auto t = tm->create_transaction();
+          tree->insert(*t).unsafe_get();
+          auto start_time = mono_clock::now();
+          tm->submit_transaction(std::move(t)).unsafe_get();
+          std::chrono::duration<double> duration = mono_clock::now() - start_time;
+          logger().warn("submit_transaction() done! {}s", duration.count());
+        }
+        {
+          // stats pass; transaction is committed to keep the TM consistent
+          auto t = tm->create_transaction();
+          tree->get_stats(*t).unsafe_get();
+          tm->submit_transaction(std::move(t)).unsafe_get();
+        }
+        {
+          // Note: tm->create_weak_transaction() can also work, but too slow.
+          auto t = tm->create_transaction();
+          tree->validate(*t).unsafe_get();
+        }
+        tree.reset();
+      });
+    }).then([this] {
+      return tm_teardown();
+    });
+  }
+
+ private:
+  bool is_dummy;  // true: dummy extent manager; false: seastore backend
+};
+
+// Decode the perf options and execute one PerfTree run in a seastar
+// thread.  TRACK is forwarded to the TreeBuilder cursor-tracking mode.
+template <bool TRACK>
+seastar::future<> run(const bpo::variables_map& config) {
+  return seastar::async([&config] {
+    auto backend = config["backend"].as<std::string>();
+    bool is_dummy;
+    if (backend == "dummy") {
+      is_dummy = true;
+    } else if (backend == "seastore") {
+      is_dummy = false;
+    } else {
+      // the ceph_abort() macro discards its arguments, so the original
+      // "false && ..." message was never reported; ceph_abort_msg()
+      // actually includes the reason in the abort output
+      ceph_abort_msg("invalid backend");
+    }
+    auto str_sizes = config["str-sizes"].as<std::vector<size_t>>();
+    auto onode_sizes = config["onode-sizes"].as<std::vector<size_t>>();
+    // each range option must supply exactly two values: [start, end)
+    auto range2 = config["range2"].as<std::vector<int>>();
+    ceph_assert(range2.size() == 2);
+    auto range1 = config["range1"].as<std::vector<unsigned>>();
+    ceph_assert(range1.size() == 2);
+    auto range0 = config["range0"].as<std::vector<unsigned>>();
+    ceph_assert(range0.size() == 2);
+
+    KVPool kvs{str_sizes, onode_sizes,
+               {range2[0], range2[1]},
+               {range1[0], range1[1]},
+               {range0[0], range0[1]}};
+    PerfTree<TRACK> perf{is_dummy};
+    perf.run(kvs).get0();
+  });
+}
+
+
+// Entry point: register the fltree perf options and dispatch to the
+// tracked or untracked run<>() instantiation.
+int main(int argc, char** argv)
+{
+  seastar::app_template app;
+  app.add_options()
+    ("backend", bpo::value<std::string>()->default_value("dummy"),
+     "tree backend: dummy, seastore")
+    ("tracked", bpo::value<bool>()->default_value(false),
+     "track inserted cursors")
+    ("str-sizes", bpo::value<std::vector<size_t>>()->default_value(
+        {8, 11, 64, 256, 301, 320}),
+     "sizes of ns/oid strings")
+    ("onode-sizes", bpo::value<std::vector<size_t>>()->default_value(
+        {8, 16, 128, 512, 576, 640}),
+     "sizes of onode")
+    ("range2", bpo::value<std::vector<int>>()->default_value(
+        {0, 128}),
+     "range of shard-pool-crush [a, b)")
+    ("range1", bpo::value<std::vector<unsigned>>()->default_value(
+        {0, 10}),
+     "range of ns-oid strings [a, b)")
+    ("range0", bpo::value<std::vector<unsigned>>()->default_value(
+        {0, 4}),
+     "range of snap-gen [a, b)");
+  return app.run(argc, argv, [&app] {
+    auto&& config = app.configuration();
+    auto tracked = config["tracked"].as<bool>();
+    // TRACK is a template parameter, so select the instantiation here
+    if (tracked) {
+      return run<true>(config);
+    } else {
+      return run<false>(config);
+    }
+  });
+}
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
new file mode 100644
index 000000000..a4defb048
--- /dev/null
+++ b/src/tools/crushtool.cc
@@ -0,0 +1,1328 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <fstream>
+#include <type_traits>
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/Formatter.h"
+
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "osd/OSDMap.h"
+#include "crush/CrushWrapper.h"
+#include "crush/CrushCompiler.h"
+#include "crush/CrushTester.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_crush
+
+
+// Default input-name placeholder.
+// NOTE(review): main() declares a local std::string 'infn' that shadows this
+// file-scope variable -- confirm this global is still referenced anywhere.
+const char *infn = "stdin";
+
+// Slurp everything available on fd into bl, reading in 1 MiB chunks until
+// EOF.  Returns 0 on success; on a read error, logs to stderr and returns -1.
+static int get_fd_data(int fd, bufferlist &bl)
+{
+  uint64_t read_so_far = 0;
+  for (;;) {
+    ssize_t n = bl.read_fd(fd, 1024*1024);
+    if (n < 0) {
+      cerr << "read_fd error " << cpp_strerror(-n) << "\n";
+      return -1;
+    }
+    if (n == 0) {
+      break;  // EOF
+    }
+    read_so_far += n;
+  }
+  // Sanity check: the bufferlist must contain exactly the bytes we counted.
+  ceph_assert(bl.length() == read_so_far);
+  return 0;
+}
+
+////////////////////////////////////////////////////////////////////////////
+
+// Print (to stdout) a description of every data file the CRUSH testing
+// routine can emit when --output-csv is selected.  Shown via --help-output.
+void data_analysis_usage()
+{
+cout << "data output from testing routine ...\n";
+cout << "  absolute_weights\n";
+cout << "   the decimal weight of each OSD\n";
+cout << "   data layout: ROW MAJOR\n";
+cout << "                OSD id (int), weight (int)\n";
+cout << "  batch_device_expected_utilization_all\n";
+cout << "         the expected number of objects each OSD should receive per placement batch\n";
+cout << "         which may be a decimal value\n";
+cout << "         data layout: COLUMN MAJOR\n";
+cout << "                      round (int), objects expected on OSD 0...OSD n (float)\n";
+cout << "  batch_device_utilization_all\n";
+cout << "         the number of objects stored on each OSD during each placement round\n";
+cout << "         data layout: COLUMN MAJOR\n";
+cout << "                      round (int), objects stored on OSD 0...OSD n (int)\n";
+cout << "  device_utilization_all\n";
+cout << "         the number of objects stored on each OSD at the end of placements\n";
+cout << "         data_layout: ROW MAJOR\n";
+cout << "                      OSD id (int), objects stored (int), objects expected (float)\n";
+cout << "  device_utilization\n";
+cout << "         the number of objects stored on each OSD marked 'up' at the end of placements\n";
+cout << "         data_layout: ROW MAJOR\n";
+cout << "                      OSD id (int), objects stored (int), objects expected (float)\n";
+cout << "  placement_information\n";
+cout << "         the map of input -> OSD\n";
+cout << "         data_layout: ROW MAJOR\n";
+cout << "                      input (int), OSD's mapped (int)\n";
+cout << "  proportional_weights_all\n";
+cout << "         the proportional weight of each OSD specified in the CRUSH map\n";
+cout << "         data_layout: ROW MAJOR\n";
+cout << "                      OSD id (int), proportional weight (float)\n";
+cout << "  proportional_weights\n";
+cout << "         the proportional weight of each 'up' OSD specified in the CRUSH map\n";
+cout << "         data_layout: ROW MAJOR\n";
+cout << "                      OSD id (int), proportional weight (float)\n";
+}
+
+// Print the crushtool command-line reference, organized by the five
+// processing stages (input/build, tunables adjustments, modifications,
+// display/test, output).  Invoked for -h/--help.
+// Fixes two help-text typos: "for for (de)compilation" and
+// "bucket bucket name".
+void usage()
+{
+  cout << "usage: crushtool ...\n";
+  cout << "\n";
+  cout << "Display, modify and test a crush map\n";
+  cout << "\n";
+  cout << "There are five stages, running one after the other:\n";
+  cout << "\n";
+  cout << " - input/build\n";
+  cout << " - tunables adjustments\n";
+  cout << " - modifications\n";
+  cout << " - display/test\n";
+  cout << " - output\n";
+  cout << "\n";
+  cout << "Options that are not specific to a stage.\n";
+  cout << "\n";
+  cout << "   [--infn|-i infile]\n";
+  cout << "                         read the crush map from infile\n";
+  cout << "\n";
+  cout << "Options for the input/build stage\n";
+  cout << "\n";
+  cout << "   --decompile|-d map    decompile a crush map to source\n";
+  cout << "   [--outfn|-o outfile]\n";
+  cout << "                         specify output for (de)compilation\n";
+  cout << "   --compile|-c map.txt  compile a map from source\n";
+  cout << "   --enable-unsafe-tunables\n";
+  cout << "                         compile with unsafe tunables\n";
+  cout << "   --build --num_osds N layer1 ...\n";
+  cout << "                         build a new map, where each 'layer' is\n";
+  cout << "                         'name (uniform|straw2|straw|list|tree) size'\n";
+  cout << "\n";
+  cout << "Options for the tunables adjustments stage\n";
+  cout << "\n";
+  cout << "   --set-choose-local-tries N\n";
+  cout << "                         set choose local retries before re-descent\n";
+  cout << "   --set-choose-local-fallback-tries N\n";
+  cout << "                         set choose local retries using fallback\n";
+  cout << "                         permutation before re-descent\n";
+  cout << "   --set-choose-total-tries N\n";
+  cout << "                         set choose total descent attempts\n";
+  cout << "   --set-chooseleaf-descend-once <0|1>\n";
+  cout << "                         set chooseleaf to (not) retry the recursive descent\n";
+  cout << "   --set-chooseleaf-vary-r <0|1>\n";
+  cout << "                         set chooseleaf to (not) vary r based on parent\n";
+  cout << "   --set-chooseleaf-stable <0|1>\n";
+  cout << "                         set chooseleaf firstn to (not) return stable results\n";
+  cout << "\n";
+  cout << "Options for the modifications stage\n";
+  cout << "\n";
+  cout << "   -i mapfn --add-item id weight name [--loc type name ...]\n";
+  cout << "                         insert an item into the hierarchy at the\n";
+  cout << "                         given location\n";
+  cout << "   -i mapfn --update-item id weight name [--loc type name ...]\n";
+  cout << "                         insert or move an item into the hierarchy at the\n";
+  cout << "                         given location\n";
+  cout << "   -i mapfn --remove-item name\n"
+       << "                         remove the given item\n";
+  cout << "   -i mapfn --reweight-item name weight\n";
+  cout << "                         reweight a given item (and adjust ancestor\n"
+       << "                         weights as needed)\n";
+  cout << "   -i mapfn --add-bucket name type [--loc type name ...]\n"
+       << "                         insert a bucket into the hierarchy at the given\n"
+       << "                         location\n";
+  cout << "   -i mapfn --move       name --loc type name ...\n"
+       << "                         move the given item to specified location\n";
+  cout << "   -i mapfn --reweight   recalculate all bucket weights\n";
+  cout << "   -i mapfn --rebuild-class-roots\n";
+  cout << "                         rebuild the per-class shadow trees (normally a no-op)\n";
+  cout << "   -i mapfn --create-simple-rule name root type mode\n"
+       << "                         create crush rule <name> to start from <root>,\n"
+       << "                         replicate across buckets of type <type>, using\n"
+       << "                         a choose mode of <firstn|indep>\n";
+  cout << "   -i mapfn --create-replicated-rule name root type\n"
+       << "                         create crush rule <name> to start from <root>,\n"
+       << "                         replicate across buckets of type <type>\n";
+  cout << "   --device-class <class>\n";
+  cout << "                         use device class <class> for new rule\n";
+  cout << "   -i mapfn --remove-rule name\n"
+       << "                         remove the specified crush rule\n";
+  cout << "\n";
+  cout << "Options for the display/test stage\n";
+  cout << "\n";
+  cout << "   -f --format           the format of --dump, defaults to json-pretty\n";
+  cout << "                         can be one of json, json-pretty, xml, xml-pretty,\n";
+  cout << "                         table, table-kv, html, html-pretty\n";
+  cout << "   --dump                dump the crush map\n";
+  cout << "   --tree                print map summary as a tree\n";
+  cout << "   --bucket-tree         print bucket map summary as a tree\n";
+  cout << "   --bucket-name         specify bucket name for bucket-tree\n";
+  cout << "   --check [max_id]      check if any item is referencing an unknown name/type\n";
+  cout << "   -i mapfn --show-location id\n";
+  cout << "                         show location for given device id\n";
+  cout << "   -i mapfn --test       test a range of inputs on the map\n";
+  cout << "      [--min-x x] [--max-x x] [--x x]\n";
+  cout << "      [--min-rule r] [--max-rule r] [--rule r] [--ruleset rs]\n";
+  cout << "      [--num-rep n]\n";
+  cout << "      [--pool-id n]      specifies pool id\n";
+  cout << "      [--batches b]      split the CRUSH mapping into b > 1 rounds\n";
+  cout << "      [--weight|-w devno weight]\n";
+  cout << "                         where weight is 0 to 1.0\n";
+  cout << "      [--simulate]       simulate placements using a random\n";
+  cout << "                         number generator in place of the CRUSH\n";
+  cout << "                         algorithm\n";
+  cout << "   --show-utilization    show OSD usage\n";
+  cout << "   --show-utilization-all\n";
+  cout << "                         include zero weight items\n";
+  cout << "   --show-statistics     show chi squared statistics\n";
+  cout << "   --show-mappings       show mappings\n";
+  cout << "   --show-bad-mappings   show bad mappings\n";
+  cout << "   --show-choose-tries   show choose tries histogram\n";
+  cout << "   --output-name name\n";
+  cout << "                         prepend the data file(s) generated during the\n";
+  cout << "                         testing routine with name\n";
+  cout << "   --output-csv\n";
+  cout << "                         export select data generated during testing routine\n";
+  cout << "                         to CSV files for off-line post-processing\n";
+  cout << "                         use --help-output for more information\n";
+  cout << "   --reclassify          transform legacy CRUSH map buckets and rules\n";
+  cout << "                         by adding classes\n";
+  cout << "      --reclassify-bucket <bucket-match> <class> <default-parent>\n";
+  cout << "      --reclassify-root <bucket-name> <class>\n";
+  cout << "   --set-subtree-class <bucket-name> <class>\n";
+  cout << "                         set class for all items beneath bucket-name\n";
+  cout << "   --compare <otherfile> compare two maps using --test parameters\n";
+  cout << "\n";
+  cout << "Options for the output stage\n";
+  cout << "\n";
+  cout << "   [--outfn|-o outfile]\n";
+  cout << "                         specify output for modified crush map\n";
+  cout << "\n";
+}
+
+// Table mapping the bucket-algorithm names accepted on the command line
+// (for --build layers) to their CRUSH_BUCKET_* ids; terminated by a
+// null-name sentinel entry.
+struct bucket_types_t {
+  const char *name;
+  int type;
+} bucket_types[] = {
+  { "uniform", CRUSH_BUCKET_UNIFORM },
+  { "list", CRUSH_BUCKET_LIST },
+  { "straw", CRUSH_BUCKET_STRAW },
+  { "straw2", CRUSH_BUCKET_STRAW2 },
+  { "tree", CRUSH_BUCKET_TREE },
+  { 0, 0 },
+};
+
+// One "--build" layer specification: bucket name prefix, bucket algorithm
+// name, and fan-out per bucket (the build loop treats size == 0 as "put all
+// remaining items into a single bucket").
+struct layer_t {
+  const char *name;       // bucket name prefix for this layer
+  const char *buckettype; // one of the names in bucket_types[]
+  int size;               // items per bucket; 0 = unbounded
+};
+
+// Parse an option that takes several positional values, e.g.
+// "--add-bucket <name> <type>".  If 'opt' is present in 'args' it is
+// consumed, then each following argument is parsed into the corresponding
+// out-pointer in 'opts' (supported element types: string, int, float) and
+// erased from 'args'.  Returns false when 'opt' was not present.  Returns
+// true whenever the flag matched -- even if a value failed to parse; in
+// that case the error text is written to 'oss' and the caller must check it.
+template<typename... Args>
+bool argparse_withargs(std::vector<const char*> &args,
+		       std::vector<const char*>::iterator& i,
+		       std::ostream& oss,
+		       const char* opt,
+		       Args*... opts)
+{
+  if (!ceph_argparse_flag(args, i, opt, nullptr)) {
+    return false;
+  }
+  // Consume one argument for a single out-parameter, dispatching on its
+  // static type.  'i' is advanced by erasing the consumed element.
+  auto parse = [&](auto& opt) {
+    if (i == args.end()) {
+      oss << "expecting additional argument to " << opt;
+      return false;
+    }
+    using opt_t = std::remove_pointer_t<decay_t<decltype(opt)>>;
+    string err;
+    if constexpr (std::is_same_v<opt_t, string>) {
+      opt->assign(*i);
+    } else if constexpr (is_same_v<opt_t, int>) {
+      *opt = strict_strtol(*i, 10, &err);
+    } else if constexpr (is_same_v<opt_t, float>) {
+      *opt = strict_strtof(*i, &err);
+    }
+    i = args.erase(i);
+    if (err.empty())
+      return true;
+    else {
+      oss << err;
+      return false;
+    }
+  };
+  // Left fold over all out-parameters; && stops at the first parse failure.
+  (... && parse(opts));
+  return true;
+}
+
+// Create an empty bucket named 'add_name' of type 'add_type' and, when a
+// location map ('--loc' pairs) was given, move it there in the hierarchy.
+// Returns 0 on success, -EEXIST if the name is already taken, -EINVAL for
+// an unknown bucket type, or the error from the underlying crush call.
+int do_add_bucket(CephContext* cct,
+		  const char* me,
+		  CrushWrapper& crush,
+		  const string& add_name,
+		  const string& add_type,
+		  const map<string,string>& add_loc) {
+  int bucketno;
+  if (crush.name_exists(add_name)) {
+    cerr << me << " bucket '" << add_name << "' already exists" << std::endl;
+    return -EEXIST;
+  }
+  int type = crush.get_type_id(add_type);
+  // type id 0 is reserved for devices ("osd"), so only ids > 0 are buckets.
+  if (type <= 0) {
+    cerr << me << " bad bucket type: " << add_type << std::endl;
+    return -EINVAL;
+  }
+  // Create the bucket empty (no items/weights); alg 0 selects the default.
+  if (int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, type, 0, nullptr, nullptr, &bucketno);
+      r < 0) {
+    cerr << me << " unable to add bucket: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  if (int r = crush.set_item_name(bucketno, add_name); r < 0) {
+    cerr << me << " bad bucket name: " << add_name << std::endl;
+    return r;
+  }
+  if (!add_loc.empty()) {
+    // Only move the new bucket if it is not already at the requested location.
+    if (!crush.check_item_loc(cct, bucketno, add_loc, (int*)nullptr)) {
+      if (int r = crush.move_bucket(cct, bucketno, add_loc); r < 0) {
+	cerr << me << " error moving bucket '" << add_name << "' to " << add_loc << std::endl;
+	return r;
+      }
+    }
+  }
+  return 0;
+}
+
+// Move the existing item 'name' (device or bucket) to the location given by
+// the '--loc' pairs in 'loc'.
+// return 1 for no change, 0 for successful change, negative on error
+int do_move_item(CephContext* cct,
+		 const char *me,
+		 CrushWrapper& crush,
+		 const string& name,
+		 const map<string,string>& loc)
+{
+  if (!crush.name_exists(name)) {
+    cerr << me << " item '" << name << "' does not exist" << std::endl;
+    return -ENOENT;
+  }
+  int id = crush.get_item_id(name);
+  if (loc.empty()) {
+    cerr << me << " expecting additional --loc argument to --move" << std::endl;
+    return -EINVAL;
+  }
+  if (crush.check_item_loc(cct, id, loc, (int*)nullptr)) {
+    // it's already there
+    cerr << me << " item '" << name << "' already at " << loc << std::endl;
+    return 1;
+  }
+  if (id >= 0) {
+    // Non-negative ids are devices.  create_or_move_item uses the opposite
+    // convention (0 = no change, 1 = moved), so remap its result to this
+    // function's contract documented above.
+    switch (int r = crush.create_or_move_item(cct, id, 0, name, loc)) {
+    case 0:
+      return 1;
+    case 1:
+      return 0;
+    default:
+      return r;
+    }
+  } else {
+    // Negative ids are buckets; relocate the whole subtree.
+    return crush.move_bucket(cct, id, loc);
+  }
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ const char *me = argv[0];
+
+ std::string infn, srcfn, outfn, add_name, add_type, remove_name,
+ reweight_name, bucket_name;
+ std::string move_name;
+ bool compile = false;
+ bool decompile = false;
+ bool check = false;
+ int max_id = -1;
+ bool test = false;
+ bool display = false;
+ bool tree = false;
+ bool bucket_tree = false;
+ string dump_format = "json-pretty";
+ bool dump = false;
+ int full_location = -1;
+ bool write_to_file = false;
+ int verbose = 0;
+ bool unsafe_tunables = false;
+
+ bool rebuild_class_roots = false;
+
+ bool reweight = false;
+ int add_item = -1;
+ bool add_bucket = false;
+ bool update_item = false;
+ bool move_item = false;
+ bool add_rule = false;
+ std::string rule_name, rule_root, rule_type, rule_mode, rule_device_class;
+ bool del_rule = false;
+ float add_weight = 0;
+ map<string,string> add_loc;
+ float reweight_weight = 0;
+
+ bool adjust = false;
+
+ int build = 0;
+ int num_osds =0;
+ vector<layer_t> layers;
+
+ int choose_local_tries = -1;
+ int choose_local_fallback_tries = -1;
+ int choose_total_tries = -1;
+ int chooseleaf_descend_once = -1;
+ int chooseleaf_vary_r = -1;
+ int chooseleaf_stable = -1;
+ int straw_calc_version = -1;
+ int allowed_bucket_algs = -1;
+
+ bool reclassify = false;
+ map<string,pair<string,string>> reclassify_bucket; // %suffix or prefix% -> class, default_root
+ map<string,string> reclassify_root; // bucket -> class
+ map<string,string> set_subtree_class; // bucket -> class
+
+ string compare;
+
+ CrushWrapper crush;
+
+ CrushTester tester(crush, cout);
+
+ // we use -c, don't confuse the generic arg parsing
+ // only parse arguments from CEPH_ARGS, if in the environment
+ vector<const char *> empty_args;
+ auto cct = global_init(NULL, empty_args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ // crushtool times out occasionally when quits. so do not
+ // release the g_ceph_context.
+ cct->get();
+ common_init_finish(g_ceph_context);
+
+ int x;
+ float y;
+ long long z;
+
+ std::string val;
+ std::ostringstream err;
+ int tmp;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "-d", "--decompile", (char*)NULL)) {
+ infn = val;
+ decompile = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infn", (char*)NULL)) {
+ infn = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-o", "--outfn", (char*)NULL)) {
+ outfn = val;
+ } else if (ceph_argparse_flag(args, i, "-v", "--verbose", (char*)NULL)) {
+ verbose += 1;
+ } else if (ceph_argparse_witharg(args, i, &val, "--compare", (char*)NULL)) {
+ compare = val;
+ } else if (ceph_argparse_flag(args, i, "--reclassify", (char*)NULL)) {
+ reclassify = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-bucket",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ string c = *i;
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reclassify_bucket[val] = make_pair(c, *i);
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-root",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reclassify_root[val] = *i;
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--set-subtree-class",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ set_subtree_class[val] = *i;
+ i = args.erase(i);
+ } else if (ceph_argparse_flag(args, i, "--tree", (char*)NULL)) {
+ tree = true;
+ } else if (ceph_argparse_flag(args, i, "--bucket-tree", (char*)NULL)) {
+ bucket_tree = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "-b", "--bucket-name", (char*)NULL)) {
+ bucket_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) {
+ dump_format = val;
+ } else if (ceph_argparse_flag(args, i, "--dump", (char*)NULL)) {
+ dump = true;
+ } else if (ceph_argparse_flag(args, i, "--show_utilization", (char*)NULL)) {
+ display = true;
+ tester.set_output_utilization(true);
+ } else if (ceph_argparse_flag(args, i, "--show_utilization_all", (char*)NULL)) {
+ display = true;
+ tester.set_output_utilization_all(true);
+ } else if (ceph_argparse_flag(args, i, "--show_statistics", (char*)NULL)) {
+ display = true;
+ tester.set_output_statistics(true);
+ } else if (ceph_argparse_flag(args, i, "--show_mappings", (char*)NULL)) {
+ display = true;
+ tester.set_output_mappings(true);
+ } else if (ceph_argparse_flag(args, i, "--show_bad_mappings", (char*)NULL)) {
+ display = true;
+ tester.set_output_bad_mappings(true);
+ } else if (ceph_argparse_flag(args, i, "--show_choose_tries", (char*)NULL)) {
+ display = true;
+ tester.set_output_choose_tries(true);
+ } else if (ceph_argparse_witharg(args, i, &val, "-c", "--compile", (char*)NULL)) {
+ srcfn = val;
+ compile = true;
+ } else if (ceph_argparse_witharg(args, i, &max_id, err, "--check", (char*)NULL)) {
+ check = true;
+ } else if (ceph_argparse_flag(args, i, "-t", "--test", (char*)NULL)) {
+ test = true;
+ } else if (ceph_argparse_witharg(args, i, &full_location, err, "--show-location", (char*)NULL)) {
+ } else if (ceph_argparse_flag(args, i, "-s", "--simulate", (char*)NULL)) {
+ tester.set_random_placement();
+ } else if (ceph_argparse_flag(args, i, "--enable-unsafe-tunables", (char*)NULL)) {
+ unsafe_tunables = true;
+ } else if (ceph_argparse_witharg(args, i, &choose_local_tries, err,
+ "--set_choose_local_tries", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &choose_local_fallback_tries, err,
+ "--set_choose_local_fallback_tries", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &choose_total_tries, err,
+ "--set_choose_total_tries", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &chooseleaf_descend_once, err,
+ "--set_chooseleaf_descend_once", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &chooseleaf_vary_r, err,
+ "--set_chooseleaf_vary_r", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &chooseleaf_stable, err,
+ "--set_chooseleaf_stable", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &straw_calc_version, err,
+ "--set_straw_calc_version", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &allowed_bucket_algs, err,
+ "--set_allowed_bucket_algs", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
+ reweight = true;
+ } else if (ceph_argparse_flag(args, i, "--rebuild-class-roots", (char*)NULL)) {
+ rebuild_class_roots = true;
+ } else if (ceph_argparse_witharg(args, i, &add_item, err, "--add_item", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --add-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_weight = atof(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --add-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_name.assign(*i);
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &add_item, err, "--update_item", (char*)NULL)) {
+ update_item = true;
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --update-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_weight = atof(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --update-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_name.assign(*i);
+ i = args.erase(i);
+ } else if (argparse_withargs(args, i, err, "--add-bucket",
+ &add_name, &add_type)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_bucket = true;
+ } else if (argparse_withargs(args, i, err, "--move",
+ &move_name)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ move_item = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--create-simple-rule", (char*)NULL)) {
+ rule_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-simple-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_root.assign(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-simple-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_type.assign(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-simple-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_mode.assign(*i);
+ i = args.erase(i);
+
+ cout << "--create-simple-rule:"
+ << " name=" << rule_name
+ << " root=" << rule_root
+ << " type=" << rule_type
+ << " mode=" << rule_mode
+ << std::endl;
+ add_rule = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--create-replicated-rule", (char*)NULL)) {
+ rule_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-replicated-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_root.assign(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-replicated-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_type.assign(*i);
+ i = args.erase(i);
+ rule_mode = "firstn";
+
+ cout << "--create-replicated-rule:"
+ << " name=" << rule_name
+ << " root=" << rule_root
+ << " type=" << rule_type
+ << std::endl;
+ add_rule = true;
+
+ } else if (ceph_argparse_witharg(args, i, &val, "--device-class", (char*)NULL)) {
+ rule_device_class.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--remove-rule", (char*)NULL)) {
+ rule_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ del_rule = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--loc", (char*)NULL)) {
+ std::string type(val);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --loc" << std::endl;
+ return EXIT_FAILURE;
+ }
+ std::string name(*i);
+ i = args.erase(i);
+ add_loc[type] = name;
+ } else if (ceph_argparse_flag(args, i, "--output-csv", (char*)NULL)) {
+ write_to_file = true;
+ tester.set_output_data_file(true);
+ tester.set_output_csv(true);
+ } else if (ceph_argparse_flag(args, i, "--help-output", (char*)NULL)) {
+ data_analysis_usage();
+ return EXIT_SUCCESS;
+ } else if (ceph_argparse_witharg(args, i, &val, "--output-name", (char*)NULL)) {
+ std::string name(val);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --output-name" << std::endl;
+ return EXIT_FAILURE;
+ }
+ else {
+ tester.set_output_data_file_name(name + "-");
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--remove_item", (char*)NULL)) {
+ remove_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--reweight_item", (char*)NULL)) {
+ reweight_name = val;
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --reweight-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reweight_weight = atof(*i);
+ i = args.erase(i);
+ } else if (ceph_argparse_flag(args, i, "--build", (char*)NULL)) {
+ build = true;
+ } else if (ceph_argparse_witharg(args, i, &num_osds, err, "--num_osds", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--num_rep", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_num_rep(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--max_x", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_max_x(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--min_x", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_min_x(x);
+ } else if (ceph_argparse_witharg(args, i, &z, err, "--pool_id", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_pool_id(z);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--x", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_x(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--max_rule", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_max_rule(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--min_rule", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_min_rule(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--rule", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_rule(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--ruleset", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_ruleset(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--batches", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_batches(x);
+ } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-ratio", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_device_down_ratio(y);
+ } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-bucket-ratio", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_bucket_down_ratio(y);
+ } else if (ceph_argparse_witharg(args, i, &tmp, err, "--weight", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ int dev = tmp;
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --weight" << std::endl;
+ return EXIT_FAILURE;
+ }
+ float f = atof(*i);
+ i = args.erase(i);
+ tester.set_device_weight(dev, f);
+ }
+ else {
+ ++i;
+ }
+ }
+
+ if (test && !check && !display && !write_to_file && compare.empty()) {
+ cerr << "WARNING: no output selected; use --output-csv or --show-X" << std::endl;
+ }
+
+ if (decompile + compile + build > 1) {
+ cerr << "cannot specify more than one of compile, decompile, and build" << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (!check && !compile && !decompile && !build && !test && !reweight && !adjust && !tree && !dump &&
+ add_item < 0 && !add_bucket && !move_item && !add_rule && !del_rule && full_location < 0 &&
+ !bucket_tree &&
+ !reclassify && !rebuild_class_roots &&
+ compare.empty() &&
+
+ remove_name.empty() && reweight_name.empty()) {
+ cerr << "no action specified; -h for help" << std::endl;
+ return EXIT_FAILURE;
+ }
+ if ((!build) && (!args.empty())) {
+ cerr << "unrecognized arguments: " << args << std::endl;
+ return EXIT_FAILURE;
+ }
+ else {
+ if ((args.size() % 3) != 0U) {
+ cerr << "remaining args: " << args << std::endl;
+ cerr << "layers must be specified with 3-tuples of (name, buckettype, size)"
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ for (size_t j = 0; j < args.size(); j += 3) {
+ layer_t l;
+ l.name = args[j];
+ l.buckettype = args[j+1];
+ l.size = atoi(args[j+2]);
+ layers.push_back(l);
+ }
+ }
+
+ /*
+ if (outfn) cout << "outfn " << outfn << std::endl;
+ if (cinfn) cout << "cinfn " << cinfn << std::endl;
+ if (dinfn) cout << "dinfn " << dinfn << std::endl;
+ */
+
+ bool modified = false;
+
+ // input ----
+
+ if (!infn.empty()) {
+ bufferlist bl;
+ std::string error;
+
+ int r = 0;
+ if (infn == "-") {
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin must not be from a tty" << std::endl;
+ return EXIT_FAILURE;
+ }
+ r = get_fd_data(STDIN_FILENO, bl);
+ if (r < 0) {
+ cerr << "error reading data from STDIN" << std::endl;
+ return EXIT_FAILURE;
+ }
+ } else {
+ r = bl.read_file(infn.c_str(), &error);
+ if (r < 0) {
+ cerr << me << ": error reading '" << infn << "': "
+ << error << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+ auto p = bl.cbegin();
+ try {
+ crush.decode(p);
+ } catch(...) {
+ cerr << me << ": unable to decode " << infn << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (compile) {
+ crush.create();
+
+ // read the file
+ ifstream in(srcfn.c_str());
+ if (!in.is_open()) {
+ cerr << "input file " << srcfn << " not found" << std::endl;
+ return -ENOENT;
+ }
+
+ CrushCompiler cc(crush, cerr, verbose);
+ if (unsafe_tunables)
+ cc.enable_unsafe_tunables();
+ int r = cc.compile(in, srcfn.c_str());
+ if (r < 0)
+ return EXIT_FAILURE;
+
+ modified = true;
+ }
+
+ if (build) {
+ if (layers.empty()) {
+ cerr << me << ": must specify at least one layer" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ crush.create();
+
+ vector<int> lower_items;
+ vector<int> lower_weights;
+
+ crush.set_max_devices(num_osds);
+ for (int i=0; i<num_osds; i++) {
+ lower_items.push_back(i);
+ lower_weights.push_back(0x10000);
+ crush.set_item_name(i, "osd." + stringify(i));
+ }
+
+ crush.set_type_name(0, "osd");
+ int type = 1;
+ for (vector<layer_t>::iterator p = layers.begin(); p != layers.end(); ++p, type++) {
+ layer_t &l = *p;
+
+ dout(2) << "layer " << type
+ << " " << l.name
+ << " bucket type " << l.buckettype
+ << " " << l.size
+ << dendl;
+
+ crush.set_type_name(type, l.name);
+
+ int buckettype = -1;
+ for (int i = 0; bucket_types[i].name; i++)
+ if (l.buckettype && strcmp(l.buckettype, bucket_types[i].name) == 0) {
+ buckettype = bucket_types[i].type;
+ break;
+ }
+ if (buckettype < 0) {
+ cerr << "unknown bucket type '" << l.buckettype << "'" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ // build items
+ vector<int> cur_items;
+ vector<int> cur_weights;
+ unsigned lower_pos = 0; // lower pos
+
+ dout(2) << "lower_items " << lower_items << dendl;
+ dout(2) << "lower_weights " << lower_weights << dendl;
+
+ int i = 0;
+ while (1) {
+ if (lower_pos == lower_items.size())
+ break;
+
+ int items[num_osds];
+ int weights[num_osds];
+
+ int weight = 0;
+ int j;
+ for (j=0; j<l.size || l.size==0; j++) {
+ if (lower_pos == lower_items.size())
+ break;
+ items[j] = lower_items[lower_pos];
+ weights[j] = lower_weights[lower_pos];
+ weight += weights[j];
+ lower_pos++;
+ dout(2) << " item " << items[j] << " weight " << weights[j] << dendl;
+ }
+
+ int id;
+ int r = crush.add_bucket(0, buckettype, CRUSH_HASH_DEFAULT, type, j, items, weights, &id);
+ if (r < 0) {
+ cerr << " Couldn't add bucket: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ char format[20];
+ format[sizeof(format)-1] = '\0';
+ if (l.size)
+ snprintf(format, sizeof(format)-1, "%s%%d", l.name);
+ else
+ strncpy(format, l.name, sizeof(format)-1);
+ char name[20];
+ snprintf(name, sizeof(name), format, i);
+ crush.set_item_name(id, name);
+
+ dout(2) << " in bucket " << id << " '" << name << "' size " << j << " weight " << weight << dendl;
+
+ cur_items.push_back(id);
+ cur_weights.push_back(weight);
+ i++;
+ }
+
+ lower_items.swap(cur_items);
+ lower_weights.swap(cur_weights);
+ }
+
+ string root = layers.back().size == 0 ? layers.back().name :
+ string(layers.back().name) + "0";
+
+ {
+ set<int> roots;
+ crush.find_roots(&roots);
+ if (roots.size() > 1) {
+ cerr << "The crush rulesets will use the root " << root << "\n"
+ << "and ignore the others.\n"
+ << "There are " << roots.size() << " roots, they can be\n"
+ << "grouped into a single root by appending something like:\n"
+ << " root straw 0\n"
+ << std::endl;
+ }
+ }
+
+ if (OSDMap::build_simple_crush_rules(g_ceph_context, crush, root, &cerr))
+ return EXIT_FAILURE;
+
+ modified = true;
+ }
+
+ // mutate ----
+
+ if (choose_local_tries >= 0) {
+ crush.set_choose_local_tries(choose_local_tries);
+ modified = true;
+ }
+ if (choose_local_fallback_tries >= 0) {
+ crush.set_choose_local_fallback_tries(choose_local_fallback_tries);
+ modified = true;
+ }
+ if (choose_total_tries >= 0) {
+ crush.set_choose_total_tries(choose_total_tries);
+ modified = true;
+ }
+ if (chooseleaf_descend_once >= 0) {
+ crush.set_chooseleaf_descend_once(chooseleaf_descend_once);
+ modified = true;
+ }
+ if (chooseleaf_vary_r >= 0) {
+ crush.set_chooseleaf_vary_r(chooseleaf_vary_r);
+ modified = true;
+ }
+ if (chooseleaf_stable >= 0) {
+ crush.set_chooseleaf_stable(chooseleaf_stable);
+ modified = true;
+ }
+ if (straw_calc_version >= 0) {
+ crush.set_straw_calc_version(straw_calc_version);
+ modified = true;
+ }
+ if (allowed_bucket_algs >= 0) {
+ crush.set_allowed_bucket_algs(allowed_bucket_algs);
+ modified = true;
+ }
+
+ if (!reweight_name.empty()) {
+ cout << me << " reweighting item " << reweight_name << " to " << reweight_weight << std::endl;
+ int r;
+ if (!crush.name_exists(reweight_name)) {
+ cerr << " name " << reweight_name << " dne" << std::endl;
+ r = -ENOENT;
+ } else {
+ int item = crush.get_item_id(reweight_name);
+ r = crush.adjust_item_weightf(g_ceph_context, item, reweight_weight);
+ }
+ if (r >= 0)
+ modified = true;
+ else {
+ cerr << me << " " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (!remove_name.empty()) {
+ cout << me << " removing item " << remove_name << std::endl;
+ int r;
+ if (!crush.name_exists(remove_name)) {
+ cerr << " name " << remove_name << " dne" << std::endl;
+ r = -ENOENT;
+ } else {
+ int remove_item = crush.get_item_id(remove_name);
+ r = crush.remove_item(g_ceph_context, remove_item, false);
+ }
+ if (r == 0)
+ modified = true;
+ else {
+ cerr << me << " " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (add_item >= 0) {
+ int r;
+ if (update_item) {
+ r = crush.update_item(g_ceph_context, add_item, add_weight, add_name.c_str(), add_loc);
+ } else {
+ r = crush.insert_item(g_ceph_context, add_item, add_weight, add_name.c_str(), add_loc);
+ }
+ if (r >= 0) {
+ modified = true;
+ } else {
+ cerr << me << " " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (add_bucket) {
+ if (int r = do_add_bucket(cct.get(), me, crush, add_name, add_type, add_loc); !r) {
+ modified = true;
+ } else {
+ return r;
+ }
+ }
+
+ if (move_item) {
+ if (int r = do_move_item(cct.get(), me, crush, move_name, add_loc); !r) {
+ modified = true;
+ } else {
+ return r;
+ }
+ }
+ if (add_rule) {
+ if (crush.rule_exists(rule_name)) {
+ cerr << "rule " << rule_name << " already exists" << std::endl;
+ return EXIT_FAILURE;
+ }
+ int r = crush.add_simple_rule(rule_name, rule_root, rule_type,
+ rule_device_class,
+ rule_mode, pg_pool_t::TYPE_REPLICATED, &err);
+ if (r < 0) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ if (del_rule) {
+ if (!crush.rule_exists(rule_name)) {
+ cerr << "rule " << rule_name << " does not exist" << std::endl;
+ return 0;
+ }
+ int ruleno = crush.get_rule_id(rule_name);
+ ceph_assert(ruleno >= 0);
+ int r = crush.remove_rule(ruleno);
+ if (r < 0) {
+      cerr << "failed to remove rule " << rule_name << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ if (reweight) {
+ crush.reweight(g_ceph_context);
+ modified = true;
+ }
+ if (rebuild_class_roots) {
+ int r = crush.rebuild_roots_with_classes(g_ceph_context);
+ if (r < 0) {
+      cerr << "failed to rebuild roots with classes" << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ for (auto& i : set_subtree_class) {
+ crush.set_subtree_class(i.first, i.second);
+ modified = true;
+ }
+ if (reclassify) {
+ int r = crush.reclassify(
+ g_ceph_context,
+ cout,
+ reclassify_root,
+ reclassify_bucket);
+ if (r < 0) {
+ cerr << "failed to reclassify map" << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ // display ---
+ if (full_location >= 0) {
+ map<string, string> loc = crush.get_full_location(full_location);
+ for (map<string,string>::iterator p = loc.begin();
+ p != loc.end();
+ ++p) {
+ cout << p->first << "\t" << p->second << std::endl;
+ }
+ }
+
+ if (tree) {
+ crush.dump_tree(&cout, NULL, {}, true);
+ }
+
+ if (bucket_tree) {
+ if (bucket_name.empty()) {
+ cerr << ": error bucket_name is empty" << std::endl;
+ }
+ else {
+ set<int> osd_ids;
+ crush.get_leaves(bucket_name.c_str(), &osd_ids);
+ for (auto &id : osd_ids) {
+ cout << "osd." << id << std::endl;
+ }
+ }
+ }
+
+ if (dump) {
+ boost::scoped_ptr<Formatter> f(Formatter::create(dump_format, "json-pretty", "json-pretty"));
+ f->open_object_section("crush_map");
+ crush.dump(f.get());
+ f->close_section();
+ f->flush(cout);
+ cout << "\n";
+ }
+
+ if (decompile) {
+ CrushCompiler cc(crush, cerr, verbose);
+ if (!outfn.empty()) {
+ ofstream o;
+ o.open(outfn.c_str(), ios::out | ios::binary | ios::trunc);
+ if (!o.is_open()) {
+ cerr << me << ": error writing '" << outfn << "'" << std::endl;
+ return EXIT_FAILURE;
+ }
+ cc.decompile(o);
+ o.close();
+ } else {
+ cc.decompile(cout);
+ }
+ }
+
+ if (check) {
+ tester.check_overlapped_rules();
+ if (max_id >= 0) {
+ if (!tester.check_name_maps(max_id)) {
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ if (test) {
+ if (tester.get_output_utilization_all() ||
+ tester.get_output_utilization())
+ tester.set_output_statistics(true);
+
+ int r = tester.test();
+ if (r < 0)
+ return EXIT_FAILURE;
+ }
+
+ if (compare.size()) {
+ CrushWrapper crush2;
+ bufferlist in;
+ string error;
+ int r = in.read_file(compare.c_str(), &error);
+ if (r < 0) {
+ cerr << me << ": error reading '" << compare << "': "
+ << error << std::endl;
+ return EXIT_FAILURE;
+ }
+ auto p = in.cbegin();
+ try {
+ crush2.decode(p);
+ } catch(...) {
+ cerr << me << ": unable to decode " << compare << std::endl;
+ return EXIT_FAILURE;
+ }
+ r = tester.compare(crush2);
+ if (r < 0)
+ return EXIT_FAILURE;
+ }
+
+ // output ---
+ if (modified) {
+ crush.finalize();
+
+ if (outfn.empty()) {
+ cout << me << " successfully built or modified map. Use '-o <file>' to write it out." << std::endl;
+ } else {
+ bufferlist bl;
+ crush.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ int r = bl.write_file(outfn.c_str());
+ if (r < 0) {
+ cerr << me << ": error writing '" << outfn << "': " << cpp_strerror(r) << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (verbose)
+ cout << "wrote crush map to " << outfn << std::endl;
+ }
+ }
+
+ return 0;
+}
+/*
+ * Local Variables:
+ * compile-command: "cd .. ; make crushtool && test/run-cli-tests"
+ * End:
+ */
diff --git a/src/tools/erasure-code/CMakeLists.txt b/src/tools/erasure-code/CMakeLists.txt
new file mode 100644
index 000000000..3583733f5
--- /dev/null
+++ b/src/tools/erasure-code/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(ceph-erasure-code-tool
+ ${PROJECT_SOURCE_DIR}/src/osd/ECUtil.cc
+ ceph-erasure-code-tool.cc)
+target_link_libraries(ceph-erasure-code-tool global ceph-common)
+install(TARGETS ceph-erasure-code-tool DESTINATION bin)
diff --git a/src/tools/erasure-code/ceph-erasure-code-tool.cc b/src/tools/erasure-code/ceph-erasure-code-tool.cc
new file mode 100644
index 000000000..6c99abf46
--- /dev/null
+++ b/src/tools/erasure-code/ceph-erasure-code-tool.cc
@@ -0,0 +1,322 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/buffer.h"
+#include "include/stringify.h"
+#include "common/ceph_argparse.h"
+#include "common/config_proxy.h"
+#include "common/errno.h"
+#include "erasure-code/ErasureCode.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "osd/ECUtil.h"
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+std::vector<std::string> display_params = {
+ "chunk_count", "data_chunk_count", "coding_chunk_count"
+};
+
+void usage(const std::string message, ostream &out) {
+ if (!message.empty()) {
+ out << message << std::endl;
+ out << "" << std::endl;
+ }
+ out << "usage: ceph-erasure-code-tool test-plugin-exists <plugin>" << std::endl;
+ out << " ceph-erasure-code-tool validate-profile <profile> [<display-param> ...]" << std::endl;
+ out << " ceph-erasure-code-tool calc-chunk-size <profile> <object_size>" << std::endl;
+ out << " ceph-erasure-code-tool encode <profile> <stripe_unit> <want_to_encode> <fname>" << std::endl;
+ out << " ceph-erasure-code-tool decode <profile> <stripe_unit> <want_to_decode> <fname>" << std::endl;
+ out << "" << std::endl;
+ out << " plugin - plugin name" << std::endl;
+ out << " profile - comma separated list of erasure-code profile settings" << std::endl;
+ out << " example: plugin=jerasure,technique=reed_sol_van,k=3,m=2" << std::endl;
+ out << " display-param - parameter to display (display all if empty)" << std::endl;
+ out << " may be: " << display_params << std::endl;
+ out << " object_size - object size" << std::endl;
+ out << " stripe_unit - stripe unit" << std::endl;
+ out << " want_to_encode - comma separated list of shards to encode" << std::endl;
+ out << " want_to_decode - comma separated list of shards to decode" << std::endl;
+ out << " fname - name for input/output files" << std::endl;
+  out << "                   when encoding input is read from {fname} file," << std::endl;
+ out << " result is stored in {fname}.{shard} files" << std::endl;
+  out << "                   when decoding input is read from {fname}.{shard} files," << std::endl;
+ out << " result is stored in {fname} file" << std::endl;
+}
+
+int ec_init(const std::string &profile_str,
+ const std::string &stripe_unit_str,
+ ceph::ErasureCodeInterfaceRef *ec_impl,
+ std::unique_ptr<ECUtil::stripe_info_t> *sinfo) {
+ ceph::ErasureCodeProfile profile;
+ std::vector<std::string> opts;
+ boost::split(opts, profile_str, boost::is_any_of(", "));
+ for (auto &opt_str : opts) {
+ std::vector<std::string> opt;
+ boost::split(opt, opt_str, boost::is_any_of("="));
+ if (opt.size() <= 1) {
+ usage("invalid profile", std::cerr);
+ return 1;
+ }
+ profile[opt[0]] = opt[1];
+ }
+ auto plugin = profile.find("plugin");
+ if (plugin == profile.end()) {
+ usage("invalid profile: plugin not specified", std::cerr);
+ return 1;
+ }
+
+ stringstream ss;
+ ceph::ErasureCodePluginRegistry::instance().factory(
+ plugin->second, g_conf().get_val<std::string>("erasure_code_dir"),
+ profile, ec_impl, &ss);
+ if (!*ec_impl) {
+ usage("invalid profile: " + ss.str(), std::cerr);
+ return 1;
+ }
+
+ if (sinfo == nullptr) {
+ return 0;
+ }
+
+ uint64_t stripe_unit = atoi(stripe_unit_str.c_str());
+ if (stripe_unit <= 0) {
+ usage("invalid stripe unit", std::cerr);
+ return 1;
+ }
+
+ uint64_t stripe_size = atoi(profile["k"].c_str());
+ ceph_assert(stripe_size > 0);
+ uint64_t stripe_width = stripe_size * stripe_unit;
+ sinfo->reset(new ECUtil::stripe_info_t(stripe_size, stripe_width));
+
+ return 0;
+}
+
+int do_test_plugin_exists(const std::vector<const char*> &args) {
+ if (args.size() < 1) {
+    usage("not enough arguments", std::cerr);
+ return 1;
+ }
+
+ ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
+ ErasureCodePlugin *plugin;
+ stringstream ss;
+
+ std::lock_guard l{instance.lock};
+ int r = instance.load(
+ args[0], g_conf().get_val<std::string>("erasure_code_dir"), &plugin, &ss);
+ std::cerr << ss.str() << endl;
+ return r;
+}
+
+int do_validate_profile(const std::vector<const char*> &args) {
+ if (args.size() < 1) {
+    usage("not enough arguments", std::cerr);
+ return 1;
+ }
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ int r = ec_init(args[0], {}, &ec_impl, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ if (args.size() > 1) {
+ std::set<std::string> valid_params(display_params.begin(),
+ display_params.end());
+ display_params.clear();
+ for (size_t i = 1; i < args.size(); i++) {
+ if (!valid_params.count(args[i])) {
+ usage("invalid display param: " + std::string(args[i]), std::cerr);
+ return 1;
+ }
+ display_params.push_back(args[i]);
+ }
+ }
+
+ for (auto &param : display_params) {
+ if (display_params.size() > 1) {
+ std::cout << param << ": ";
+ }
+ if (param == "chunk_count") {
+ std::cout << ec_impl->get_chunk_count() << std::endl;
+ } else if (param == "data_chunk_count") {
+ std::cout << ec_impl->get_data_chunk_count() << std::endl;
+ } else if (param == "coding_chunk_count") {
+ std::cout << ec_impl->get_coding_chunk_count() << std::endl;
+ } else {
+ ceph_abort_msgf("unknown display_param: %s", param.c_str());
+ }
+ }
+
+ return 0;
+}
+
+int do_calc_chunk_size(const std::vector<const char*> &args) {
+ if (args.size() < 2) {
+    usage("not enough arguments", std::cerr);
+ return 1;
+ }
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ int r = ec_init(args[0], {}, &ec_impl, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t object_size = atoi(args[1]);
+ if (object_size <= 0) {
+ usage("invalid object size", std::cerr);
+ return 1;
+ }
+
+ std::cout << ec_impl->get_chunk_size(object_size) << std::endl;
+ return 0;
+}
+
+int do_encode(const std::vector<const char*> &args) {
+ if (args.size() < 4) {
+    usage("not enough arguments", std::cerr);
+ return 1;
+ }
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ std::unique_ptr<ECUtil::stripe_info_t> sinfo;
+ int r = ec_init(args[0], args[1], &ec_impl, &sinfo);
+ if (r < 0) {
+ return r;
+ }
+
+ std::set<int> want;
+ std::vector<std::string> shards;
+ boost::split(shards, args[2], boost::is_any_of(","));
+ for (auto &shard : shards) {
+ want.insert(atoi(shard.c_str()));
+ }
+ ceph::bufferlist decoded_data;
+ std::string fname = args[3];
+
+ std::string error;
+ r = decoded_data.read_file(fname.c_str(), &error);
+ if (r < 0) {
+ std::cerr << "failed to read " << fname << ": " << error << std::endl;
+ return 1;
+ }
+
+ uint64_t stripe_width = sinfo->get_stripe_width();
+ if (decoded_data.length() % stripe_width != 0) {
+ uint64_t pad = stripe_width - decoded_data.length() % stripe_width;
+ decoded_data.append_zero(pad);
+ }
+
+ std::map<int, ceph::bufferlist> encoded_data;
+ r = ECUtil::encode(*sinfo, ec_impl, decoded_data, want, &encoded_data);
+ if (r < 0) {
+ std::cerr << "failed to encode: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+
+ for (auto &[shard, bl] : encoded_data) {
+ std::string name = fname + "." + stringify(shard);
+ r = bl.write_file(name.c_str());
+ if (r < 0) {
+ std::cerr << "failed to write " << name << ": " << cpp_strerror(r)
+ << std::endl;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+int do_decode(const std::vector<const char*> &args) {
+ if (args.size() < 4) {
+    usage("not enough arguments", std::cerr);
+ return 1;
+ }
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ std::unique_ptr<ECUtil::stripe_info_t> sinfo;
+ int r = ec_init(args[0], args[1], &ec_impl, &sinfo);
+ if (r < 0) {
+ return r;
+ }
+
+ std::map<int, ceph::bufferlist> encoded_data;
+ std::vector<std::string> shards;
+ boost::split(shards, args[2], boost::is_any_of(","));
+ for (auto &shard : shards) {
+ encoded_data[atoi(shard.c_str())] = {};
+ }
+ ceph::bufferlist decoded_data;
+ std::string fname = args[3];
+
+ for (auto &[shard, bl] : encoded_data) {
+ std::string name = fname + "." + stringify(shard);
+ std::string error;
+ r = bl.read_file(name.c_str(), &error);
+ if (r < 0) {
+ std::cerr << "failed to read " << name << ": " << error << std::endl;
+ return 1;
+ }
+ }
+
+ r = ECUtil::decode(*sinfo, ec_impl, encoded_data, &decoded_data);
+ if (r < 0) {
+ std::cerr << "failed to decode: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+
+ r = decoded_data.write_file(fname.c_str());
+ if (r < 0) {
+ std::cerr << "failed to write " << fname << ": " << cpp_strerror(r)
+ << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
+
+int main(int argc, const char **argv) {
+ std::vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+
+ if (args.empty() || args[0] == std::string("-h") ||
+ args[0] == std::string("--help")) {
+ usage("", std::cout);
+ return 0;
+ }
+
+ if (args.size() < 1) {
+    usage("not enough arguments", std::cerr);
+ return 1;
+ }
+
+ std::string cmd = args[0];
+ std::vector<const char*> cmd_args(args.begin() + 1, args.end());
+
+ if (cmd == "test-plugin-exists") {
+ return do_test_plugin_exists(cmd_args);
+ } else if (cmd == "validate-profile") {
+ return do_validate_profile(cmd_args);
+ } else if (cmd == "calc-chunk-size") {
+ return do_calc_chunk_size(cmd_args);
+ } else if (cmd == "encode") {
+ return do_encode(cmd_args);
+ } else if (cmd == "decode") {
+ return do_decode(cmd_args);
+ }
+
+ usage("invalid command: " + cmd, std::cerr);
+ return 1;
+}
diff --git a/src/tools/histogram_dump.py b/src/tools/histogram_dump.py
new file mode 100755
index 000000000..cc22fef5e
--- /dev/null
+++ b/src/tools/histogram_dump.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+# coding: utf-8
+#
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2017 OVH
+# Copyright (C) 2020 Marc Schöchlin <ms-github@256bit.org>
+#
+# This is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License version 2, as published by the Free Software
+# Foundation. See file COPYING.
+#
+
+import json
+import subprocess
+import time
+import os
+import argparse
+import glob
+import sys
+import textwrap
+import datetime
+
+
+def shorten(val):
+ if isinstance(val, str):
+ return val
+ for u in ((3, ''), (6, 'k'), (9, 'M'), (12, 'G'), (15, 'T')):
+ if val < 10**u[0]:
+ return "{}{}".format(int(val / (10 ** (u[0]-3))), u[1])
+ return val
+
+
+def create_histogram(sockets, counter, last, seconds, batch):
+
+ current_datasets = {}
+ json_d = {}
+ for socket in sockets:
+ try:
+ out = subprocess.check_output(
+ "ceph --admin-daemon {} perf histogram dump".format(socket),
+ shell=True)
+ json_d = json.loads(out.decode('utf-8'))
+ except Exception as e:
+ return (last,
+ "Couldn't connect to admin socket, result: \n{}".format(e))
+ current_datasets[socket] = json_d['osd'][counter]['values']
+
+
+ axes = json_d['osd'][counter]['axes']
+
+ if batch:
+ content = "{} : Counter: {} for {}\n\n\n".format(
+ datetime.datetime.now().isoformat(), counter,", ".join(sockets))
+ else:
+ content = "Counter: {} for {}\n(create statistics every {} seconds)\n\n".format(
+ counter,", ".join(sockets),seconds)
+
+ content += "{}:\n".format(axes[1]['name'])
+ for r in axes[1]['ranges']:
+ content += "{0: >4} ".format(
+ shorten(r['min']) if 'min' in r else '')
+ content += "\n"
+ for r in axes[1]['ranges']:
+ content += "{0: >4} ".format(
+ shorten(r['max']) if 'max' in r else '')
+ content += "\n"
+
+ content += ("{0: >"+str(len(axes[1]['ranges'])*5+14)+"}:\n").format(
+ axes[0]['name'])
+
+ if batch:
+ COL = ''
+ ENDC = ''
+ else:
+ COL = '\033[91m'
+ ENDC = '\033[0m'
+
+ current = []
+
+    # initialize with zeros
+ for i in range(len(current_datasets[socket])):
+ current.append([])
+ for j in range(len(current_datasets[socket][i])):
+ current[i].append(0)
+
+ # combine data
+ for socket, data in current_datasets.items():
+ for i in range(len(data)):
+ for j in range(len(data[i])):
+ current[i][j] += data[i][j]
+
+ for i in range(len(current)):
+ for j in range(len(current[i])):
+ try:
+ diff = current[i][j] - last[i][j]
+ except IndexError:
+ diff = '-'
+
+ if diff != "-" and diff != 0:
+ content += "{0}{1: >4}{2} ".format(COL,shorten(diff),ENDC)
+ else:
+ content += "{0: >4} ".format(shorten(diff))
+
+ r = axes[0]['ranges'][i]
+ content += "{0: >6} : {1}\n".format(
+ shorten(r['min']) if 'min' in r else '',
+ shorten(r['max']) if 'max' in r else '')
+ return (current, content)
+
+
+def loop_print(sockets, counter, loop_seconds, batch):
+ last = []
+
+ try:
+ while True:
+ last, content = create_histogram(sockets, counter, last, loop_seconds, batch)
+ if not batch:
+ print(chr(27) + "[2J")
+ print(content)
+ time.sleep(loop_seconds)
+ except KeyboardInterrupt:
+        print("...interrupted")
+ sys.exit(0)
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='Continuously display ceph performance histogram for selected osd operations')
+ parser.add_argument(
+ '--asok',
+ type=str,
+ default=['/var/run/ceph/*.asok'],
+ nargs='+',
+ help='Path to asok file, you can use wildcards')
+ parser.add_argument(
+ '--counter',
+ type=str,
+ help=textwrap.dedent('''\
+ Specify name of the counter to calculate statistics
+ see "ceph --admin-daemon /var/run/ceph/<osd>.asok perf histogram dump"
+ '''),
+ default='op_w_latency_in_bytes_histogram')
+ parser.add_argument(
+ '--batch',
+ help='Disable colors and add timestamps',
+ action='store_true',
+ )
+ parser.add_argument(
+ '--loop_seconds',
+ type=int,
+ help='Cycle time in seconds for statistics generation',
+ default=5)
+
+ args = parser.parse_args()
+
+ if not sys.stdout.isatty():
+ print("Not running with a tty, automatically switching to batch mode")
+ args.batch = True
+
+ sockets = []
+ for asok in args.asok:
+ sockets = glob.glob(asok) + sockets
+
+ if len(sockets) == 0:
+ print("no suitable socket at {}".format(args.asok))
+ sys.exit(1)
+
+ loop_print(sockets, args.counter, args.loop_seconds, args.batch)
+
+if __name__ == '__main__':
+ main()
diff --git a/src/tools/immutable_object_cache/CMakeLists.txt b/src/tools/immutable_object_cache/CMakeLists.txt
new file mode 100644
index 000000000..ed118ed6f
--- /dev/null
+++ b/src/tools/immutable_object_cache/CMakeLists.txt
@@ -0,0 +1,19 @@
+set(ceph_immutable_object_cache_files
+ ObjectCacheStore.cc
+ CacheController.cc
+ CacheServer.cc
+ CacheClient.cc
+ CacheSession.cc
+ SimplePolicy.cc
+ Types.cc
+ )
+add_library(ceph_immutable_object_cache_lib STATIC ${ceph_immutable_object_cache_files})
+
+add_executable(ceph-immutable-object-cache
+ main.cc)
+target_link_libraries(ceph-immutable-object-cache
+ ceph_immutable_object_cache_lib
+ librados
+ StdFilesystem::filesystem
+ global)
+install(TARGETS ceph-immutable-object-cache DESTINATION bin)
diff --git a/src/tools/immutable_object_cache/CacheClient.cc b/src/tools/immutable_object_cache/CacheClient.cc
new file mode 100644
index 000000000..2b837be51
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheClient.cc
@@ -0,0 +1,435 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/bind/bind.hpp>
+#include "CacheClient.h"
+#include "common/Cond.h"
+#include "common/version.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::CacheClient: " << this << " " \
+ << __func__ << ": "
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+ CacheClient::CacheClient(const std::string& file, CephContext* ceph_ctx)
+ : m_cct(ceph_ctx), m_io_service_work(m_io_service),
+ m_dm_socket(m_io_service), m_ep(stream_protocol::endpoint(file)),
+ m_io_thread(nullptr), m_session_work(false), m_writing(false),
+ m_reading(false), m_sequence_id(0) {
+ m_worker_thread_num =
+ m_cct->_conf.get_val<uint64_t>(
+ "immutable_object_cache_client_dedicated_thread_num");
+
+ if (m_worker_thread_num != 0) {
+ m_worker = new boost::asio::io_service();
+ m_worker_io_service_work = new boost::asio::io_service::work(*m_worker);
+ for (uint64_t i = 0; i < m_worker_thread_num; i++) {
+ std::thread* thd = new std::thread([this](){m_worker->run();});
+ m_worker_threads.push_back(thd);
+ }
+ }
+ m_bp_header = buffer::create(get_header_size());
+ }
+
+ CacheClient::~CacheClient() {
+ stop();
+ }
+
+ void CacheClient::run() {
+ m_io_thread.reset(new std::thread([this](){m_io_service.run(); }));
+ }
+
+ bool CacheClient::is_session_work() {
+ return m_session_work.load() == true;
+ }
+
+ int CacheClient::stop() {
+ m_session_work.store(false);
+ m_io_service.stop();
+
+ if (m_io_thread != nullptr) {
+ m_io_thread->join();
+ }
+ if (m_worker_thread_num != 0) {
+ m_worker->stop();
+ for (auto thd : m_worker_threads) {
+ thd->join();
+ delete thd;
+ }
+ delete m_worker_io_service_work;
+ delete m_worker;
+ }
+ return 0;
+ }
+
+ // close domain socket
+ void CacheClient::close() {
+ m_session_work.store(false);
+ boost::system::error_code close_ec;
+ m_dm_socket.close(close_ec);
+ if (close_ec) {
+ ldout(m_cct, 20) << "close: " << close_ec.message() << dendl;
+ }
+ }
+
+ // sync connect
+ int CacheClient::connect() {
+ int ret = -1;
+ C_SaferCond cond;
+ Context* on_finish = new LambdaContext([&cond, &ret](int err) {
+ ret = err;
+ cond.complete(err);
+ });
+
+ connect(on_finish);
+ cond.wait();
+
+ return ret;
+ }
+
+ // async connect
+ void CacheClient::connect(Context* on_finish) {
+ m_dm_socket.async_connect(m_ep,
+ boost::bind(&CacheClient::handle_connect, this,
+ on_finish, boost::asio::placeholders::error));
+ }
+
+ void CacheClient::handle_connect(Context* on_finish,
+ const boost::system::error_code& err) {
+ if (err) {
+ ldout(m_cct, 20) << "fails to connect to cache server. error : "
+ << err.message() << dendl;
+ fault(ASIO_ERROR_CONNECT, err);
+ on_finish->complete(-1);
+ return;
+ }
+
+ ldout(m_cct, 20) << "successfully connected to cache server." << dendl;
+ on_finish->complete(0);
+ }
+
+ void CacheClient::lookup_object(std::string pool_nspace, uint64_t pool_id,
+ uint64_t snap_id, uint64_t object_size,
+ std::string oid,
+ CacheGenContextURef&& on_finish) {
+ ldout(m_cct, 20) << dendl;
+ ObjectCacheRequest* req = new ObjectCacheReadData(RBDSC_READ,
+ ++m_sequence_id, 0, 0, pool_id,
+ snap_id, object_size, oid, pool_nspace);
+ req->process_msg = std::move(on_finish);
+ req->encode();
+
+ {
+ std::lock_guard locker{m_lock};
+ m_outcoming_bl.append(req->get_payload_bufferlist());
+ ceph_assert(m_seq_to_req.find(req->seq) == m_seq_to_req.end());
+ m_seq_to_req[req->seq] = req;
+ }
+
+ // try to send message to server.
+ try_send();
+
+ // try to receive ack from server.
+ try_receive();
+ }
+
+ void CacheClient::try_send() {
+ ldout(m_cct, 20) << dendl;
+ if (!m_writing.load()) {
+ m_writing.store(true);
+ send_message();
+ }
+ }
+
+ void CacheClient::send_message() {
+ ldout(m_cct, 20) << dendl;
+ bufferlist bl;
+ {
+ std::lock_guard locker{m_lock};
+ bl.swap(m_outcoming_bl);
+ ceph_assert(m_outcoming_bl.length() == 0);
+ }
+
+ // send bytes as many as possible.
+ boost::asio::async_write(m_dm_socket,
+ boost::asio::buffer(bl.c_str(), bl.length()),
+ boost::asio::transfer_exactly(bl.length()),
+ [this, bl](const boost::system::error_code& err, size_t cb) {
+ if (err || cb != bl.length()) {
+ fault(ASIO_ERROR_WRITE, err);
+ return;
+ }
+
+ ceph_assert(cb == bl.length());
+
+ {
+ std::lock_guard locker{m_lock};
+ if (m_outcoming_bl.length() == 0) {
+ m_writing.store(false);
+ return;
+ }
+ }
+
+ // still have left bytes, continue to send.
+ send_message();
+ });
+ try_receive();
+ }
+
+ void CacheClient::try_receive() {
+ ldout(m_cct, 20) << dendl;
+ if (!m_reading.load()) {
+ m_reading.store(true);
+ receive_message();
+ }
+ }
+
+ void CacheClient::receive_message() {
+ ldout(m_cct, 20) << dendl;
+ ceph_assert(m_reading.load());
+ read_reply_header();
+ }
+
+ void CacheClient::read_reply_header() {
+ ldout(m_cct, 20) << dendl;
+ /* create new head buffer for every reply */
+ bufferptr bp_head(buffer::create(get_header_size()));
+ auto raw_ptr = bp_head.c_str();
+
+ boost::asio::async_read(m_dm_socket,
+ boost::asio::buffer(raw_ptr, get_header_size()),
+ boost::asio::transfer_exactly(get_header_size()),
+ boost::bind(&CacheClient::handle_reply_header,
+ this, bp_head,
+ boost::asio::placeholders::error,
+ boost::asio::placeholders::bytes_transferred));
+ }
+
+ void CacheClient::handle_reply_header(bufferptr bp_head,
+ const boost::system::error_code& ec,
+ size_t bytes_transferred) {
+ ldout(m_cct, 20) << dendl;
+ if (ec || bytes_transferred != get_header_size()) {
+ fault(ASIO_ERROR_READ, ec);
+ return;
+ }
+
+ ceph_assert(bytes_transferred == bp_head.length());
+
+ uint32_t data_len = get_data_len(bp_head.c_str());
+
+ bufferptr bp_data(buffer::create(data_len));
+ read_reply_data(std::move(bp_head), std::move(bp_data), data_len);
+ }
+
+ void CacheClient::read_reply_data(bufferptr&& bp_head,
+ bufferptr&& bp_data,
+ const uint64_t data_len) {
+ ldout(m_cct, 20) << dendl;
+ auto raw_ptr = bp_data.c_str();
+ boost::asio::async_read(m_dm_socket, boost::asio::buffer(raw_ptr, data_len),
+ boost::asio::transfer_exactly(data_len),
+ boost::bind(&CacheClient::handle_reply_data,
+ this, std::move(bp_head), std::move(bp_data), data_len,
+ boost::asio::placeholders::error,
+ boost::asio::placeholders::bytes_transferred));
+ }
+
+ void CacheClient::handle_reply_data(bufferptr bp_head,
+ bufferptr bp_data,
+ const uint64_t data_len,
+ const boost::system::error_code& ec,
+ size_t bytes_transferred) {
+ ldout(m_cct, 20) << dendl;
+ if (ec || bytes_transferred != data_len) {
+ fault(ASIO_ERROR_WRITE, ec);
+ return;
+ }
+ ceph_assert(bp_data.length() == data_len);
+
+ bufferlist data_buffer;
+ data_buffer.append(std::move(bp_head));
+ data_buffer.append(std::move(bp_data));
+
+ ObjectCacheRequest* reply = decode_object_cache_request(data_buffer);
+ data_buffer.clear();
+ ceph_assert(data_buffer.length() == 0);
+
+ process(reply, reply->seq);
+
+ {
+ std::lock_guard locker{m_lock};
+ if (m_seq_to_req.size() == 0 && m_outcoming_bl.length()) {
+ m_reading.store(false);
+ return;
+ }
+ }
+ if (is_session_work()) {
+ receive_message();
+ }
+ }
+
+ void CacheClient::process(ObjectCacheRequest* reply, uint64_t seq_id) {
+ ldout(m_cct, 20) << dendl;
+ ObjectCacheRequest* current_request = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_seq_to_req.find(seq_id) != m_seq_to_req.end());
+ current_request = m_seq_to_req[seq_id];
+ m_seq_to_req.erase(seq_id);
+ }
+
+ ceph_assert(current_request != nullptr);
+ auto process_reply = new LambdaContext([current_request, reply]
+ (bool dedicated) {
+ if (dedicated) {
+        // dedicated thread to execute this context.
+ }
+ current_request->process_msg.release()->complete(reply);
+ delete current_request;
+ delete reply;
+ });
+
+ if (m_worker_thread_num != 0) {
+ m_worker->post([process_reply]() {
+ process_reply->complete(true);
+ });
+ } else {
+ process_reply->complete(false);
+ }
+ }
+
+ // if there is one request fails, just execute fault, then shutdown RO.
+ void CacheClient::fault(const int err_type,
+ const boost::system::error_code& ec) {
+ ldout(m_cct, 20) << "fault." << ec.message() << dendl;
+
+ if (err_type == ASIO_ERROR_CONNECT) {
+ ceph_assert(!m_session_work.load());
+ if (ec == boost::asio::error::connection_refused) {
+      ldout(m_cct, 20) << "Connecting RO daemon fails : "<< ec.message()
+ << ". Immutable-object-cache daemon is down ? "
+ << "Data will be read from ceph cluster " << dendl;
+ } else {
+ ldout(m_cct, 20) << "Connecting RO daemon fails : "
+ << ec.message() << dendl;
+ }
+
+ if (m_dm_socket.is_open()) {
+ // Set to indicate what error occurred, if any.
+ // Note that, even if the function indicates an error,
+ // the underlying descriptor is closed.
+ boost::system::error_code close_ec;
+ m_dm_socket.close(close_ec);
+ if (close_ec) {
+ ldout(m_cct, 20) << "close: " << close_ec.message() << dendl;
+ }
+ }
+ return;
+ }
+
+ if (!m_session_work.load()) {
+ return;
+ }
+
+    /* When the current session is not working, ASIO will not receive any
+     * new requests from the hook. Pending requests already submitted to
+     * ASIO are cancelled and their callbacks invoked; requests cancelled
+     * by this method will be re-dispatched to the RADOS layer.
+     * Make sure only one thread modifies/executes the code below. */
+ m_session_work.store(false);
+
+ if (err_type == ASIO_ERROR_MSG_INCOMPLETE) {
+ ldout(m_cct, 20) << "ASIO In-complete message." << ec.message() << dendl;
+ ceph_assert(0);
+ }
+
+ if (err_type == ASIO_ERROR_READ) {
+ ldout(m_cct, 20) << "ASIO async read fails : " << ec.message() << dendl;
+ }
+
+ if (err_type == ASIO_ERROR_WRITE) {
+      ldout(m_cct, 20) << "ASIO async write fails : " << ec.message() << dendl;
+ // CacheClient should not occur this error.
+ ceph_assert(0);
+ }
+
+ // currently, for any asio error, just shutdown RO.
+ close();
+
+ /* all pending request, which have entered into ASIO,
+ * will be re-dispatched to RADOS.*/
+ {
+ std::lock_guard locker{m_lock};
+ for (auto it : m_seq_to_req) {
+ it.second->type = RBDSC_READ_RADOS;
+ it.second->process_msg->complete(it.second);
+ }
+ m_seq_to_req.clear();
+ }
+
+ ldout(m_cct, 20) << "Because ASIO domain socket fails, just shutdown RO.\
+ Later all reading will be re-dispatched RADOS layer"
+ << ec.message() << dendl;
+ }
+
+ // TODO : re-implement this method
+ int CacheClient::register_client(Context* on_finish) {
+ ObjectCacheRequest* reg_req = new ObjectCacheRegData(RBDSC_REGISTER,
+ m_sequence_id++,
+ ceph_version_to_str());
+ reg_req->encode();
+
+ bufferlist bl;
+ bl.append(reg_req->get_payload_bufferlist());
+
+ uint64_t ret;
+ boost::system::error_code ec;
+
+ ret = boost::asio::write(m_dm_socket,
+ boost::asio::buffer(bl.c_str(), bl.length()), ec);
+
+ if (ec || ret != bl.length()) {
+ fault(ASIO_ERROR_WRITE, ec);
+ return -1;
+ }
+ delete reg_req;
+
+ ret = boost::asio::read(m_dm_socket,
+ boost::asio::buffer(m_bp_header.c_str(), get_header_size()), ec);
+ if (ec || ret != get_header_size()) {
+ fault(ASIO_ERROR_READ, ec);
+ return -1;
+ }
+
+ uint64_t data_len = get_data_len(m_bp_header.c_str());
+ bufferptr bp_data(buffer::create(data_len));
+
+ ret = boost::asio::read(m_dm_socket, boost::asio::buffer(bp_data.c_str(),
+ data_len), ec);
+ if (ec || ret != data_len) {
+ fault(ASIO_ERROR_READ, ec);
+ return -1;
+ }
+
+ bufferlist data_buffer;
+ data_buffer.append(m_bp_header);
+ data_buffer.append(std::move(bp_data));
+ ObjectCacheRequest* req = decode_object_cache_request(data_buffer);
+ if (req->type == RBDSC_REGISTER_REPLY) {
+ m_session_work.store(true);
+ on_finish->complete(0);
+ } else {
+ on_finish->complete(-1);
+ }
+
+ delete req;
+ return 0;
+ }
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/CacheClient.h b/src/tools/immutable_object_cache/CacheClient.h
new file mode 100644
index 000000000..b2f749631
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheClient.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_CACHE_CLIENT_H
+#define CEPH_CACHE_CACHE_CLIENT_H
+
+#include <atomic>
+#include <boost/asio.hpp>
+#include <boost/asio/error.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "include/ceph_assert.h"
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "Types.h"
+#include "SocketCommon.h"
+
+
+using boost::asio::local::stream_protocol;
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Client side of the immutable-object-cache domain-socket protocol.
+// Sends lookup/register requests to the cache daemon and dispatches the
+// replies back to per-request completion contexts.
+class CacheClient {
+ public:
+  CacheClient(const std::string& file, CephContext* ceph_ctx);
+  ~CacheClient();
+  void run();
+  bool is_session_work();
+  void close();
+  int stop();
+  int connect();
+  void connect(Context* on_finish);
+  // Queue an asynchronous cache lookup for one RADOS object.
+  void lookup_object(std::string pool_nspace, uint64_t pool_id,
+                     uint64_t snap_id, uint64_t object_size, std::string oid,
+                     CacheGenContextURef&& on_finish);
+  // Synchronous register handshake; marks the session usable on success.
+  int register_client(Context* on_finish);
+
+ private:
+  void send_message();
+  void try_send();
+  void fault(const int err_type, const boost::system::error_code& err);
+  void handle_connect(Context* on_finish, const boost::system::error_code& err);
+  void try_receive();
+  void receive_message();
+  void process(ObjectCacheRequest* reply, uint64_t seq_id);
+  void read_reply_header();
+  void handle_reply_header(bufferptr bp_head,
+                           const boost::system::error_code& ec,
+                           size_t bytes_transferred);
+  void read_reply_data(bufferptr&& bp_head, bufferptr&& bp_data,
+                       const uint64_t data_len);
+  void handle_reply_data(bufferptr bp_head, bufferptr bp_data,
+                         const uint64_t data_len,
+                         const boost::system::error_code& ec,
+                         size_t bytes_transferred);
+
+ private:
+  CephContext* m_cct;
+  boost::asio::io_service m_io_service;
+  boost::asio::io_service::work m_io_service_work;
+  stream_protocol::socket m_dm_socket;    // unix domain socket to the daemon
+  stream_protocol::endpoint m_ep;
+  std::shared_ptr<std::thread> m_io_thread;
+  std::atomic<bool> m_session_work;       // set true by register_client()
+
+  uint64_t m_worker_thread_num;
+  boost::asio::io_service* m_worker;
+  std::vector<std::thread*> m_worker_threads;
+  boost::asio::io_service::work* m_worker_io_service_work;
+
+  std::atomic<bool> m_writing;            // a socket write is in flight
+  std::atomic<bool> m_reading;            // a socket read is in flight
+  std::atomic<uint64_t> m_sequence_id;    // next request sequence number
+  ceph::mutex m_lock =
+    ceph::make_mutex("ceph::cache::cacheclient::m_lock");
+  // In-flight requests keyed by sequence id; guarded by m_lock.
+  std::map<uint64_t, ObjectCacheRequest*> m_seq_to_req;
+  bufferlist m_outcoming_bl;              // bytes queued for sending
+  bufferptr m_bp_header;                  // reusable reply-header buffer
+};
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif // CEPH_CACHE_CACHE_CLIENT_H
diff --git a/src/tools/immutable_object_cache/CacheController.cc b/src/tools/immutable_object_cache/CacheController.cc
new file mode 100644
index 000000000..ae1636839
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheController.cc
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CacheController.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::CacheController: " << this << " " \
+ << __func__ << ": "
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Store the command-line args and CephContext; real setup happens in init().
+CacheController::CacheController(CephContext *cct,
+                                 const std::vector<const char*> &args):
+  m_args(args), m_cct(cct) {
+  ldout(m_cct, 20) << dendl;
+}
+
+// Both members may be nullptr if init()/run() never ran; deleting a null
+// pointer is a no-op, so no guards are needed here.
+CacheController::~CacheController() {
+  delete m_cache_server;
+  delete m_object_cache_store;
+}
+
+// Create the backing ObjectCacheStore and initialize it.
+// Returns 0 on success, a negative errno otherwise.
+int CacheController::init() {
+  ldout(m_cct, 20) << dendl;
+  m_object_cache_store = new ObjectCacheStore(m_cct);
+  // TODO(dehao): make this configurable
+  int r = m_object_cache_store->init(true);
+  if (r < 0) {
+    lderr(m_cct) << "init error\n" << dendl;
+    return r;
+  }
+
+  r = m_object_cache_store->init_cache();
+  if (r < 0) {
+    // Distinct message so store-init and cache-init failures can be told
+    // apart in the log (both previously logged the same "init error").
+    lderr(m_cct) << "init cache error\n" << dendl;
+  }
+
+  return r;
+}
+
+// Stop the socket server (if one was started) and shut the store down.
+// Returns 0 on success, the first negative error encountered otherwise.
+int CacheController::shutdown() {
+  ldout(m_cct, 20) << dendl;
+
+  int r = 0;
+  if (m_cache_server != nullptr) {
+    r = m_cache_server->stop();
+    if (r < 0) {
+      lderr(m_cct) << "stop error\n" << dendl;
+      return r;
+    }
+  }
+
+  // Guard the store the same way the server is guarded above: shutdown()
+  // can be reached via handle_signal() before init() ever ran, in which
+  // case m_object_cache_store is still nullptr.
+  if (m_object_cache_store != nullptr) {
+    r = m_object_cache_store->shutdown();
+    if (r < 0) {
+      lderr(m_cct) << "stop error\n" << dendl;
+      return r;
+    }
+  }
+
+  return r;
+}
+
+// Signal hook (e.g. SIGTERM): simply tears the controller down.
+void CacheController::handle_signal(int signum) {
+  shutdown();
+}
+
+// Resolve the socket path from config, remove any stale socket file,
+// create the CacheServer and run its accept loop (blocks until stopped).
+// Returns 0 on success, -EINVAL if the socket path is unset, -EFAULT on
+// an unexpected exception, or the server's error code.
+int CacheController::run() {
+  try {
+    std::string controller_path =
+      m_cct->_conf.get_val<std::string>("immutable_object_cache_sock");
+    if (controller_path.empty()) {
+      lderr(m_cct) << "'immutable_object_cache_sock' path not set" << dendl;
+      return -EINVAL;
+    }
+
+    // Remove a stale socket file left over from a previous run so that
+    // bind() below does not fail with EADDRINUSE.
+    std::remove(controller_path.c_str());
+
+    m_cache_server = new CacheServer(m_cct, controller_path,
+      std::bind(&CacheController::handle_request, this,
+                std::placeholders::_1, std::placeholders::_2));
+
+    int ret = m_cache_server->run();
+    if (ret != 0) {
+      return ret;
+    }
+
+    return 0;
+  } catch (std::exception& e) {
+    lderr(m_cct) << "Exception: " << e.what() << dendl;
+    return -EFAULT;
+  }
+}
+
+// Dispatch one decoded request from a client session.
+// RBDSC_REGISTER records the client version and acks; RBDSC_READ consults
+// the local store and replies with either a cache path or a redirect to
+// RADOS.  Replies are freed by CacheSession::send() after transmission.
+void CacheController::handle_request(CacheSession* session,
+                                     ObjectCacheRequest* req) {
+  ldout(m_cct, 20) << dendl;
+
+  switch (req->get_request_type()) {
+    case RBDSC_REGISTER: {
+      // TODO(dehao): skip register and allow clients to lookup directly
+
+      // static_cast is the correct idiom for a checked-by-type downcast
+      // within a class hierarchy; reinterpret_cast (used previously) gives
+      // no base-to-derived pointer adjustment guarantees.
+      auto req_reg_data = static_cast<ObjectCacheRegData*>(req);
+      session->set_client_version(req_reg_data->version);
+
+      ObjectCacheRequest* reply = new ObjectCacheRegReplyData(
+        RBDSC_REGISTER_REPLY, req->seq);
+      session->send(reply);
+      break;
+    }
+    case RBDSC_READ: {
+      // lookup object in local cache store
+      std::string cache_path;
+      ObjectCacheReadData* req_read_data =
+        static_cast<ObjectCacheReadData*>(req);
+      // Pre-versioned clients cannot handle DNE paths; see lookup_object.
+      bool return_dne_path = session->client_version().empty();
+      int ret = m_object_cache_store->lookup_object(
+        req_read_data->pool_namespace, req_read_data->pool_id,
+        req_read_data->snap_id, req_read_data->object_size,
+        req_read_data->oid, return_dne_path, cache_path);
+      ObjectCacheRequest* reply = nullptr;
+      if (ret != OBJ_CACHE_PROMOTED && ret != OBJ_CACHE_DNE) {
+        reply = new ObjectCacheReadRadosData(RBDSC_READ_RADOS, req->seq);
+      } else {
+        reply = new ObjectCacheReadReplyData(RBDSC_READ_REPLY,
+                                             req->seq, cache_path);
+      }
+      session->send(reply);
+      break;
+    }
+    default:
+      ldout(m_cct, 5) << "can't recognize request" << dendl;
+      ceph_assert(0);
+  }
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/CacheController.h b/src/tools/immutable_object_cache/CacheController.h
new file mode 100644
index 000000000..f70f6bb1c
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheController.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_CACHE_CONTROLLER_H
+#define CEPH_CACHE_CACHE_CONTROLLER_H
+
+#include "common/ceph_context.h"
+#include "common/WorkQueue.h"
+#include "CacheServer.h"
+#include "ObjectCacheStore.h"
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Top-level daemon object: owns the domain-socket server and the on-disk
+// object store, and routes incoming client requests between them.
+class CacheController {
+ public:
+  CacheController(CephContext *cct, const std::vector<const char*> &args);
+  ~CacheController();
+
+  // Create and initialize the backing ObjectCacheStore.
+  int init();
+
+  // Stop the socket server (if running) and shut the store down.
+  int shutdown();
+
+  // Signal hook; shuts the controller down.  (Parameter name fixed from
+  // the misspelled "sinnum"; declaration-only rename, no caller impact.)
+  void handle_signal(int signum);
+
+  // Bind the domain socket and block in the accept loop until stopped.
+  int run();
+
+  // Dispatch one decoded request arriving on a client session.
+  void handle_request(CacheSession* session, ObjectCacheRequest* msg);
+
+ private:
+  CacheServer *m_cache_server = nullptr;
+  std::vector<const char*> m_args;
+  CephContext *m_cct;
+  ObjectCacheStore *m_object_cache_store = nullptr;
+};
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+
+#endif // CEPH_CACHE_CACHE_CONTROLLER_H
diff --git a/src/tools/immutable_object_cache/CacheServer.cc b/src/tools/immutable_object_cache/CacheServer.cc
new file mode 100644
index 000000000..e94a47c7a
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheServer.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/bind/bind.hpp>
+#include "common/debug.h"
+#include "common/ceph_context.h"
+#include "CacheServer.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::CacheServer: " << this << " " \
+ << __func__ << ": "
+
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// 'file' is the unix-domain-socket path; 'processmsg' is the controller
+// callback invoked for every decoded request.
+CacheServer::CacheServer(CephContext* cct, const std::string& file,
+                         ProcessMsg processmsg)
+  : cct(cct), m_server_process_msg(processmsg),
+    m_local_path(file), m_acceptor(m_io_service) {}
+
+// Stops the io_service so run() unblocks; safe to call if never started.
+CacheServer::~CacheServer() {
+  stop();
+}
+
+// Arm the acceptor and block inside the io_service event loop.
+// Returns 0 when the loop exits cleanly, negative on setup/loop failure.
+int CacheServer::run() {
+  ldout(cct, 20) << dendl;
+
+  int ret = start_accept();
+  if (ret != 0) {
+    return ret;
+  }
+
+  boost::system::error_code ec;
+  ret = m_io_service.run(ec);
+  if (ec) {
+    ldout(cct, 1) << "m_io_service run fails: " << ec.message() << dendl;
+    return -1;
+  }
+  return 0;
+}
+
+// Ask the event loop to stop; run() then returns.  Always succeeds.
+int CacheServer::stop() {
+  m_io_service.stop();
+  return 0;
+}
+
+// open/bind/listen on the unix domain socket, then queue the first
+// asynchronous accept.  Returns 0 on success, -errno on failure.
+int CacheServer::start_accept() {
+  ldout(cct, 20) << dendl;
+
+  boost::system::error_code ec;
+  m_acceptor.open(m_local_path.protocol(), ec);
+  if (ec) {
+    lderr(cct) << "failed to open domain socket: " << ec.message() << dendl;
+    return -ec.value();
+  }
+
+  m_acceptor.bind(m_local_path, ec);
+  if (ec) {
+    lderr(cct) << "failed to bind to domain socket '"
+               << m_local_path << "': " << ec.message() << dendl;
+    return -ec.value();
+  }
+
+  m_acceptor.listen(boost::asio::socket_base::max_connections, ec);
+  if (ec) {
+    lderr(cct) << "failed to listen on domain socket: " << ec.message()
+               << dendl;
+    return -ec.value();
+  }
+
+  accept();
+  return 0;
+}
+
+// Create a session for the next incoming connection and wait for it
+// asynchronously; handle_accept() re-arms the acceptor afterwards.
+void CacheServer::accept() {
+  auto fresh_session = CacheSessionPtr(
+      new CacheSession(m_io_service, m_server_process_msg, cct));
+
+  m_acceptor.async_accept(
+      fresh_session->socket(),
+      boost::bind(&CacheServer::handle_accept, this, fresh_session,
+                  boost::asio::placeholders::error));
+}
+
+// Accept-completion handler: start the new session's read loop and queue
+// the next accept.  The session stays alive via shared_from_this() inside
+// its own async handlers.
+void CacheServer::handle_accept(CacheSessionPtr new_session,
+                                const boost::system::error_code& error) {
+  ldout(cct, 20) << dendl;
+  if (error) {
+    // e.g. operation_aborted when the acceptor is being shut down
+    lderr(cct) << "async accept fails : " << error.message() << dendl;
+    return;
+  }
+
+  // TODO(dehao) : session setting
+  new_session->start();
+
+  // launch next accept
+  accept();
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/CacheServer.h b/src/tools/immutable_object_cache/CacheServer.h
new file mode 100644
index 000000000..31d859934
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheServer.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_CACHE_SERVER_H
+#define CEPH_CACHE_CACHE_SERVER_H
+
+#include <boost/asio.hpp>
+#include <boost/asio/error.hpp>
+
+#include "Types.h"
+#include "SocketCommon.h"
+#include "CacheSession.h"
+
+
+using boost::asio::local::stream_protocol;
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Accepts connections on a unix domain socket and spawns a CacheSession
+// per client; decoded requests are forwarded to the controller callback.
+class CacheServer {
+ public:
+  CacheServer(CephContext* cct, const std::string& file, ProcessMsg processmsg);
+  ~CacheServer();
+
+  int run();           // blocks in the io_service loop
+  int start_accept();  // open/bind/listen + first async accept
+  int stop();          // stops the io_service; run() returns
+
+ private:
+  void accept();
+  void handle_accept(CacheSessionPtr new_session,
+                     const boost::system::error_code& error);
+
+ private:
+  CephContext* cct;
+  boost::asio::io_service m_io_service;
+  ProcessMsg m_server_process_msg;      // controller request dispatcher
+  stream_protocol::endpoint m_local_path;
+  stream_protocol::acceptor m_acceptor;
+};
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+
+#endif
diff --git a/src/tools/immutable_object_cache/CacheSession.cc b/src/tools/immutable_object_cache/CacheSession.cc
new file mode 100644
index 000000000..38c38c97d
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheSession.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/bind/bind.hpp>
+#include "common/debug.h"
+#include "common/ceph_context.h"
+#include "CacheSession.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::CacheSession: " << this << " " \
+ << __func__ << ": "
+
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Pre-allocates the fixed-size header buffer reused for every request.
+CacheSession::CacheSession(io_service& io_service,
+                           ProcessMsg processmsg,
+                           CephContext* cct)
+    : m_dm_socket(io_service),
+      m_server_process_msg(processmsg), m_cct(cct) {
+  m_bp_header = buffer::create(get_header_size());
+}
+
+// Closing the socket here makes session teardown implicit in destruction.
+CacheSession::~CacheSession() {
+  close();
+}
+
+// Expose the raw socket so the server's acceptor can connect it.
+stream_protocol::socket& CacheSession::socket() {
+  return m_dm_socket;
+}
+
+// Record the version string the client sent during registration.
+void CacheSession::set_client_version(const std::string &version) {
+  m_client_version = version;
+}
+
+// Empty until a RBDSC_REGISTER request has been processed.
+const std::string &CacheSession::client_version() const {
+  return m_client_version;
+}
+
+// Close the socket if still open; failures are only logged since there is
+// nothing further to do during teardown.
+void CacheSession::close() {
+  if (m_dm_socket.is_open()) {
+    boost::system::error_code close_ec;
+    m_dm_socket.close(close_ec);
+    if (close_ec) {
+      ldout(m_cct, 20) << "close: " << close_ec.message() << dendl;
+    }
+  }
+}
+
+// Kick off the request read loop (header -> data -> process -> header...).
+void CacheSession::start() {
+  read_request_header();
+}
+
+// Asynchronously read exactly one fixed-size request header into the
+// reusable m_bp_header buffer; shared_from_this() keeps the session alive
+// while the read is pending.
+void CacheSession::read_request_header() {
+  ldout(m_cct, 20) << dendl;
+  boost::asio::async_read(m_dm_socket,
+    boost::asio::buffer(m_bp_header.c_str(), get_header_size()),
+    boost::asio::transfer_exactly(get_header_size()),
+    boost::bind(&CacheSession::handle_request_header,
+                shared_from_this(), boost::asio::placeholders::error,
+                boost::asio::placeholders::bytes_transferred));
+}
+
+// Header read complete: on success continue with the payload whose length
+// the header advertises; on error drop the session via fault().
+void CacheSession::handle_request_header(const boost::system::error_code& err,
+                                         size_t bytes_transferred) {
+  ldout(m_cct, 20) << dendl;
+  if (err || bytes_transferred != get_header_size()) {
+    fault(err);
+    return;
+  }
+
+  read_request_data(get_data_len(m_bp_header.c_str()));
+}
+
+// Asynchronously read the request payload into a freshly-allocated
+// bufferptr; the bufferptr is passed by value into the handler so the
+// underlying memory stays alive for the duration of the read.
+void CacheSession::read_request_data(uint64_t data_len) {
+  ldout(m_cct, 20) << dendl;
+  bufferptr bp_data(buffer::create(data_len));
+  boost::asio::async_read(m_dm_socket,
+    boost::asio::buffer(bp_data.c_str(), bp_data.length()),
+    boost::asio::transfer_exactly(data_len),
+    boost::bind(&CacheSession::handle_request_data,
+                shared_from_this(), bp_data, data_len,
+                boost::asio::placeholders::error,
+                boost::asio::placeholders::bytes_transferred));
+}
+
+// Payload read complete: reassemble header + payload, decode into an
+// ObjectCacheRequest, hand it to the controller, then re-arm the read loop
+// for the next request on this session.
+void CacheSession::handle_request_data(bufferptr bp, uint64_t data_len,
+                                       const boost::system::error_code& err,
+                                       size_t bytes_transferred) {
+  ldout(m_cct, 20) << dendl;
+  if (err || bytes_transferred != data_len) {
+    fault(err);
+    return;
+  }
+
+  bufferlist bl_data;
+
+  bl_data.append(m_bp_header);
+  bl_data.append(std::move(bp));
+
+  ObjectCacheRequest* req = decode_object_cache_request(bl_data);
+
+  // process() is synchronous; the request can be freed as soon as the
+  // controller returns (replies are separate heap objects).
+  process(req);
+  delete req;
+  read_request_header();
+}
+
+// Forward the decoded request to the controller's dispatch callback.
+void CacheSession::process(ObjectCacheRequest* req) {
+  ldout(m_cct, 20) << dendl;
+  m_server_process_msg(this, req);
+}
+
+// Encode 'reply' and write it asynchronously; takes ownership of 'reply'
+// and deletes it when the write completes (success or failure).
+// NOTE(review): the buffer passed to async_write points into the local
+// 'bl'; the lambda's by-value capture of bl appears to be what keeps the
+// underlying (ref-counted) memory alive until completion, assuming
+// bl.c_str() does not reallocate after the capture — confirm.
+void CacheSession::send(ObjectCacheRequest* reply) {
+  ldout(m_cct, 20) << dendl;
+  bufferlist bl;
+  reply->encode();
+  bl.append(reply->get_payload_bufferlist());
+
+  boost::asio::async_write(m_dm_socket,
+    boost::asio::buffer(bl.c_str(), bl.length()),
+    boost::asio::transfer_exactly(bl.length()),
+    [this, bl, reply](const boost::system::error_code& err,
+                      size_t bytes_transferred) {
+      delete reply;
+      if (err || bytes_transferred != bl.length()) {
+        fault(err);
+        return;
+      }
+    });
+}
+
+// Error sink for this session: only logs.  Not re-arming the read loop is
+// what effectively ends the session; the socket closes on destruction.
+void CacheSession::fault(const boost::system::error_code& ec) {
+  ldout(m_cct, 20) << "session fault : " << ec.message() << dendl;
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/CacheSession.h b/src/tools/immutable_object_cache/CacheSession.h
new file mode 100644
index 000000000..0826e8a2b
--- /dev/null
+++ b/src/tools/immutable_object_cache/CacheSession.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_SESSION_H
+#define CEPH_CACHE_SESSION_H
+
+#include <boost/asio.hpp>
+#include <boost/asio/error.hpp>
+
+#include "Types.h"
+#include "SocketCommon.h"
+
+using boost::asio::local::stream_protocol;
+using boost::asio::io_service;
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// One accepted client connection: runs an async header/payload read loop,
+// hands decoded requests to the controller, and writes replies back.
+// enable_shared_from_this keeps the session alive across async handlers.
+class CacheSession : public std::enable_shared_from_this<CacheSession> {
+ public:
+  CacheSession(io_service& io_service, ProcessMsg process_msg,
+               CephContext* ctx);
+  ~CacheSession();
+  stream_protocol::socket& socket();
+  void close();
+  void start();
+  void read_request_header();
+  void handle_request_header(const boost::system::error_code& err,
+                             size_t bytes_transferred);
+  void read_request_data(uint64_t data_len);
+  void handle_request_data(bufferptr bp, uint64_t data_len,
+                           const boost::system::error_code& err,
+                           size_t bytes_transferred);
+  void process(ObjectCacheRequest* req);
+  void fault(const boost::system::error_code& ec);
+  // Takes ownership of 'msg'; deleted after the async write completes.
+  void send(ObjectCacheRequest* msg);
+
+  void set_client_version(const std::string &version);
+  const std::string &client_version() const;
+
+ private:
+  stream_protocol::socket m_dm_socket;
+  ProcessMsg m_server_process_msg;   // controller dispatch callback
+  CephContext* m_cct;
+
+  std::string m_client_version;      // set by RBDSC_REGISTER handling
+
+  bufferptr m_bp_header;             // reusable request-header buffer
+};
+
+typedef std::shared_ptr<CacheSession> CacheSessionPtr;
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+
+#endif // CEPH_CACHE_SESSION_H
diff --git a/src/tools/immutable_object_cache/ObjectCacheStore.cc b/src/tools/immutable_object_cache/ObjectCacheStore.cc
new file mode 100644
index 000000000..1b1eef1e3
--- /dev/null
+++ b/src/tools/immutable_object_cache/ObjectCacheStore.cc
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ObjectCacheStore.h"
+#include "Utils.h"
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#endif
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::ObjectCacheStore: " << this << " " \
+ << __func__ << ": "
+
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+namespace {
+
+// Process-wide SafeTimer wrapper registered via CephContext's singleton
+// registry; used by the QoS throttles for scheduling token refills.
+class SafeTimerSingleton : public CommonSafeTimer<ceph::mutex> {
+public:
+  ceph::mutex lock = ceph::make_mutex
+    ("ceph::immutable_object_cache::SafeTimerSingleton::lock");
+
+  explicit SafeTimerSingleton(CephContext *cct)
+      : CommonSafeTimer(cct, lock, true) {
+    init();
+  }
+  ~SafeTimerSingleton() {
+    // SafeTimer::shutdown() requires its lock to be held by the caller.
+    std::lock_guard locker{lock};
+    shutdown();
+  }
+};
+
+} // anonymous namespace
+
+enum ThrottleTargetCode {
+ ROC_QOS_IOPS_THROTTLE = 1,
+ ROC_QOS_BPS_THROTTLE = 2
+};
+
+// Read all cache-related config (root dir, size, watermark, QoS limits),
+// install IOPS/BPS token-bucket throttles if limits are configured, and
+// build the eviction policy.  RADOS is not contacted until init().
+ObjectCacheStore::ObjectCacheStore(CephContext *cct)
+      : m_cct(cct), m_rados(new librados::Rados()) {
+
+  m_cache_root_dir =
+    m_cct->_conf.get_val<std::string>("immutable_object_cache_path");
+
+  // Normalize so later path concatenation can always just append.
+  if (m_cache_root_dir.back() != '/') {
+    m_cache_root_dir += "/";
+  }
+
+  uint64_t cache_max_size =
+    m_cct->_conf.get_val<Option::size_t>("immutable_object_cache_max_size");
+
+  double cache_watermark =
+    m_cct->_conf.get_val<double>("immutable_object_cache_watermark");
+
+  uint64_t max_inflight_ops =
+    m_cct->_conf.get_val<uint64_t>("immutable_object_cache_max_inflight_ops");
+
+  // A limit of 0 means "throttle disabled"; only build a throttle when a
+  // non-zero limit is configured.
+  uint64_t limit = 0;
+  if ((limit = m_cct->_conf.get_val<uint64_t>
+               ("immutable_object_cache_qos_iops_limit")) != 0) {
+    apply_qos_tick_and_limit(ROC_QOS_IOPS_THROTTLE,
+      m_cct->_conf.get_val<std::chrono::milliseconds>
+        ("immutable_object_cache_qos_schedule_tick_min"),
+      limit,
+      m_cct->_conf.get_val<uint64_t>
+        ("immutable_object_cache_qos_iops_burst"),
+      m_cct->_conf.get_val<std::chrono::seconds>
+        ("immutable_object_cache_qos_iops_burst_seconds"));
+  }
+  if ((limit = m_cct->_conf.get_val<uint64_t>
+               ("immutable_object_cache_qos_bps_limit")) != 0) {
+    apply_qos_tick_and_limit(ROC_QOS_BPS_THROTTLE,
+      m_cct->_conf.get_val<std::chrono::milliseconds>
+        ("immutable_object_cache_qos_schedule_tick_min"),
+      limit,
+      m_cct->_conf.get_val<uint64_t>
+        ("immutable_object_cache_qos_bps_burst"),
+      m_cct->_conf.get_val<std::chrono::seconds>
+        ("immutable_object_cache_qos_bps_burst_seconds"));
+  }
+
+  // Clamp a nonsensical watermark back to the documented default.
+  if ((cache_watermark <= 0) || (cache_watermark > 1)) {
+    lderr(m_cct) << "Invalid water mark provided, set it to default." << dendl;
+    cache_watermark = 0.9;
+  }
+  m_policy = new SimplePolicy(m_cct, cache_max_size, max_inflight_ops,
+                              cache_watermark);
+}
+
+// Free the policy and any throttles that were created in the constructor;
+// m_qos_enabled_flag records exactly which throttles exist.
+ObjectCacheStore::~ObjectCacheStore() {
+  delete m_policy;
+  if (m_qos_enabled_flag & ROC_QOS_IOPS_THROTTLE) {
+    ceph_assert(m_throttles[ROC_QOS_IOPS_THROTTLE] != nullptr);
+    delete m_throttles[ROC_QOS_IOPS_THROTTLE];
+  }
+  if (m_qos_enabled_flag & ROC_QOS_BPS_THROTTLE) {
+    ceph_assert(m_throttles[ROC_QOS_BPS_THROTTLE] != nullptr);
+    delete m_throttles[ROC_QOS_BPS_THROTTLE];
+  }
+}
+
+// Connect to the RADOS cluster and (when 'reset' is true) wipe/create the
+// on-disk cache directory tree.  Returns 0 on success, negative errno on
+// cluster-connect or filesystem failure.
+int ObjectCacheStore::init(bool reset) {
+  ldout(m_cct, 20) << dendl;
+
+  int ret = m_rados->init_with_context(m_cct);
+  if (ret < 0) {
+    lderr(m_cct) << "fail to init Ceph context" << dendl;
+    return ret;
+  }
+
+  ret = m_rados->connect();
+  if (ret < 0) {
+    lderr(m_cct) << "fail to connect to cluster" << dendl;
+    return ret;
+  }
+
+  // TODO(dehao): fsck and reuse existing cache objects
+  if (reset) {
+    try {
+      if (fs::exists(m_cache_root_dir)) {
+        // remove all sub folders but keep the root itself
+        for (auto& p : fs::directory_iterator(m_cache_root_dir)) {
+          fs::remove_all(p.path());
+        }
+      } else {
+        fs::create_directories(m_cache_root_dir);
+      }
+    } catch (const fs::filesystem_error& e) {
+      lderr(m_cct) << "failed to initialize cache store directory: "
+                   << e.what() << dendl;
+      return -e.code().value();
+    }
+  }
+  return 0;
+}
+
+// Tear down the RADOS connection; the on-disk cache is left in place.
+int ObjectCacheStore::shutdown() {
+  ldout(m_cct, 20) << dendl;
+
+  m_rados->shutdown();
+  return 0;
+}
+
+// Secondary initialization hook; currently nothing to do beyond logging
+// (the cache directory tree is prepared by init()).  Removed an unused
+// local copy of m_cache_root_dir.
+int ObjectCacheStore::init_cache() {
+  ldout(m_cct, 20) << dendl;
+
+  return 0;
+}
+
+// Start an asynchronous promotion: fetch the object from RADOS (caching a
+// per-pool IoCtx on first use) and let handle_promote_callback() persist
+// it.  Returns 0 if the read was issued, negative errno otherwise.
+int ObjectCacheStore::do_promote(std::string pool_nspace, uint64_t pool_id,
+                                 uint64_t snap_id, std::string object_name) {
+  ldout(m_cct, 20) << "to promote object: " << object_name
+                   << " from pool id: " << pool_id
+                   << " namespace: " << pool_nspace
+                   << " snapshot: " << snap_id << dendl;
+
+  int ret = 0;
+  std::string cache_file_name =
+    get_cache_file_name(pool_nspace, pool_id, snap_id, object_name);
+  librados::IoCtx ioctx;
+  {
+    // IoCtxs are cached per pool; the map is shared across lookups.
+    std::lock_guard _locker{m_ioctx_map_lock};
+    if (m_ioctx_map.find(pool_id) == m_ioctx_map.end()) {
+      ret = m_rados->ioctx_create2(pool_id, ioctx);
+      if (ret < 0) {
+        lderr(m_cct) << "fail to create ioctx" << dendl;
+        return ret;
+      }
+      m_ioctx_map.emplace(pool_id, ioctx);
+    } else {
+      ioctx = m_ioctx_map[pool_id];
+    }
+  }
+
+  ioctx.set_namespace(pool_nspace);
+  ioctx.snap_set_read(snap_id);
+
+  // Owned by the callback chain; freed in handle_promote_callback().
+  librados::bufferlist* read_buf = new librados::bufferlist();
+
+  auto ctx = new LambdaContext([this, read_buf, cache_file_name](int ret) {
+    handle_promote_callback(ret, read_buf, cache_file_name);
+  });
+
+  return promote_object(&ioctx, object_name, read_buf, ctx);
+}
+
+// Completion of the RADOS read issued by do_promote(): write the data to
+// the cache file (or mark the object as non-existent), update the policy's
+// metadata, free the read buffer on every path, and trigger eviction.
+int ObjectCacheStore::handle_promote_callback(int ret, bufferlist* read_buf,
+                                              std::string cache_file_name) {
+  ldout(m_cct, 20) << " cache_file_name: " << cache_file_name << dendl;
+
+  // rados read error (ENOENT is not an error: object simply doesn't exist)
+  if (ret != -ENOENT && ret < 0) {
+    lderr(m_cct) << "fail to read from rados" << dendl;
+
+    m_policy->update_status(cache_file_name, OBJ_CACHE_NONE);
+    delete read_buf;
+    return ret;
+  }
+
+  auto state = OBJ_CACHE_PROMOTED;
+  if (ret == -ENOENT) {
+    // object is empty / does not exist: remember that instead of caching
+    state = OBJ_CACHE_DNE;
+    ret = 0;
+  } else {
+    // mkdir=true: create the hashed subdirectory if needed
+    std::string cache_file_path = get_cache_file_path(cache_file_name, true);
+    if (cache_file_path == "") {
+      lderr(m_cct) << "fail to write cache file" << dendl;
+      m_policy->update_status(cache_file_name, OBJ_CACHE_NONE);
+      delete read_buf;
+      return -ENOSPC;
+    }
+
+    ret = read_buf->write_file(cache_file_path.c_str());
+    if (ret < 0) {
+      lderr(m_cct) << "fail to write cache file" << dendl;
+
+      m_policy->update_status(cache_file_name, OBJ_CACHE_NONE);
+      delete read_buf;
+      return ret;
+    }
+  }
+
+  // update metadata: entry must still be in the in-flight (SKIP) state
+  ceph_assert(OBJ_CACHE_SKIP == m_policy->get_status(cache_file_name));
+  m_policy->update_status(cache_file_name, state, read_buf->length());
+  ceph_assert(state == m_policy->get_status(cache_file_name));
+
+  delete read_buf;
+
+  // promotion may have pushed the cache over its watermark
+  evict_objects();
+
+  return ret;
+}
+
+// Look a RADOS object up in the local cache.  Returns the cache status:
+// PROMOTED/DNE fill in target_cache_file_path (DNE only when
+// return_dne_path, i.e. for version-aware clients); NONE kicks off an
+// asynchronous promotion when the QoS throttles allow it; SKIP means a
+// promotion is already in flight.
+int ObjectCacheStore::lookup_object(std::string pool_nspace, uint64_t pool_id,
+                                    uint64_t snap_id, uint64_t object_size,
+                                    std::string object_name,
+                                    bool return_dne_path,
+                                    std::string& target_cache_file_path) {
+  ldout(m_cct, 20) << "object name = " << object_name
+                   << " in pool ID : " << pool_id << dendl;
+
+  int pret = -1;
+  std::string cache_file_name =
+    get_cache_file_name(pool_nspace, pool_id, snap_id, object_name);
+
+  // NOTE: lookup_object() also transitions NONE -> SKIP internally;
+  // the NONE branch below must reset the status when throttled.
+  cache_status_t ret = m_policy->lookup_object(cache_file_name);
+
+  switch (ret) {
+    case OBJ_CACHE_NONE: {
+      if (take_token_from_throttle(object_size, 1)) {
+        pret = do_promote(pool_nspace, pool_id, snap_id, object_name);
+        if (pret < 0) {
+          lderr(m_cct) << "fail to start promote" << dendl;
+        }
+      } else {
+        // throttled: undo the in-flight marking so a later lookup retries
+        m_policy->update_status(cache_file_name, OBJ_CACHE_NONE);
+      }
+      return ret;
+    }
+    case OBJ_CACHE_PROMOTED:
+      target_cache_file_path = get_cache_file_path(cache_file_name);
+      return ret;
+    case OBJ_CACHE_DNE:
+      if (return_dne_path) {
+        target_cache_file_path = get_cache_file_path(cache_file_name);
+      }
+      return ret;
+    case OBJ_CACHE_SKIP:
+      return ret;
+    default:
+      lderr(m_cct) << "unrecognized object cache status" << dendl;
+      ceph_assert(0);
+  }
+}
+
+// Issue the asynchronous RADOS read that fetches the whole object;
+// on_finish fires with the read's result.  Returns aio_read()'s status.
+int ObjectCacheStore::promote_object(librados::IoCtx* ioctx,
+                                     std::string object_name,
+                                     librados::bufferlist* read_buf,
+                                     Context* on_finish) {
+  ldout(m_cct, 20) << "object name = " << object_name << dendl;
+
+  librados::AioCompletion* read_completion = create_rados_callback(on_finish);
+  // issue a zero-sized read req to get the entire obj
+  int ret = ioctx->aio_read(object_name, read_completion, read_buf, 0, 0);
+  if (ret < 0) {
+    lderr(m_cct) << "failed to read from rados" << dendl;
+  }
+  // release our ref; the in-flight op keeps the completion alive
+  read_completion->release();
+
+  return ret;
+}
+
+// Ask the policy which entries should go, then evict each in turn.
+int ObjectCacheStore::evict_objects() {
+  ldout(m_cct, 20) << dendl;
+
+  std::list<std::string> victims;
+  m_policy->get_evict_list(&victims);
+  for (auto& victim : victims) {
+    do_evict(victim);
+  }
+  return 0;
+}
+
+// Remove one cached file from disk and, only if the unlink succeeded,
+// drop its policy metadata.  Returns 0 on success (or empty name),
+// std::remove()'s non-zero result otherwise.
+int ObjectCacheStore::do_evict(std::string cache_file) {
+  ldout(m_cct, 20) << "file = " << cache_file << dendl;
+
+  // An empty name means there is nothing to evict.
+  if (cache_file.empty()) {
+    return 0;
+  }
+
+  std::string cache_file_path = get_cache_file_path(cache_file);
+
+  ldout(m_cct, 20) << "evict cache: " << cache_file_path << dendl;
+
+  // TODO(dehao): possible race on read?
+  int ret = std::remove(cache_file_path.c_str());
+  // evict metadata only once the backing file is really gone
+  if (ret == 0) {
+    m_policy->update_status(cache_file, OBJ_CACHE_SKIP);
+    m_policy->evict_entry(cache_file);
+  }
+
+  return ret;
+}
+
+// Build the cache key: "<namespace>:<pool-id>:<snap-id>:<object-id>".
+std::string ObjectCacheStore::get_cache_file_name(std::string pool_nspace,
+                                                  uint64_t pool_id,
+                                                  uint64_t snap_id,
+                                                  std::string oid) {
+  std::string key = pool_nspace;
+  key += ":";
+  key += std::to_string(pool_id);
+  key += ":";
+  key += std::to_string(snap_id);
+  key += ":";
+  key += oid;
+  return key;
+}
+
+// Map a cache key to its on-disk path.  Files are spread over 100 hashed
+// subdirectories (crc32c(name) % 100) to keep directory sizes bounded.
+// With mkdir=true the subdirectory is created if missing; returns "" if
+// that creation fails.
+std::string ObjectCacheStore::get_cache_file_path(std::string cache_file_name,
+                                                  bool mkdir) {
+  ldout(m_cct, 20) << cache_file_name <<dendl;
+
+  uint32_t crc = 0;
+  crc = ceph_crc32c(0, (unsigned char *)cache_file_name.c_str(),
+                    cache_file_name.length());
+
+  std::string cache_file_dir = std::to_string(crc % 100) + "/";
+
+  if (mkdir) {
+    ldout(m_cct, 20) << "creating cache dir: " << cache_file_dir <<dendl;
+    std::error_code ec;
+    std::string new_dir = m_cache_root_dir + cache_file_dir;
+    if (fs::exists(new_dir, ec)) {
+      ldout(m_cct, 20) << "cache dir exists: " << cache_file_dir <<dendl;
+      return new_dir + cache_file_name;
+    }
+
+    if (!fs::create_directories(new_dir, ec)) {
+      ldout(m_cct, 5) << "fail to create cache dir: " << new_dir
+                      << "error: " << ec.message() << dendl;
+      return "";
+    }
+  }
+
+  // m_cache_root_dir is normalized to end with '/' in the constructor.
+  return m_cache_root_dir + cache_file_dir + cache_file_name;
+}
+
+// Throttle refill callback: credit the granted tokens to the matching
+// bucket and clear the throttled flag so lookups may take tokens again.
+// NOTE(review): m_io_throttled is cleared before m_throttle_lock is taken;
+// ordering preserved from the original — confirm the flag's declaration
+// makes this safe (atomic / advisory only).
+void ObjectCacheStore::handle_throttle_ready(uint64_t tokens, uint64_t type) {
+  m_io_throttled = false;
+  std::lock_guard lock(m_throttle_lock);
+  if (type & ROC_QOS_IOPS_THROTTLE) {
+    m_iops_tokens += tokens;
+  } else if (type & ROC_QOS_BPS_THROTTLE) {
+    m_bps_tokens += tokens;
+  } else {
+    lderr(m_cct) << "unknown throttle type." << dendl;
+  }
+}
+
+// Try to take 'object_num' IOPS tokens and 'object_size' BPS tokens.
+// Returns true when both enabled throttles had enough tokens; returns
+// false (and sets m_io_throttled until handle_throttle_ready fires) when
+// either bucket had to queue a refill request.
+bool ObjectCacheStore::take_token_from_throttle(uint64_t object_size,
+                                                uint64_t object_num) {
+  if (m_io_throttled == true) {
+    return false;
+  }
+
+  // flag == 1 records that the IOPS tokens were already consumed, so they
+  // can be returned if the BPS bucket subsequently blocks.
+  int flag = 0;
+  bool wait = false;
+  if (!wait && (m_qos_enabled_flag & ROC_QOS_IOPS_THROTTLE)) {
+    std::lock_guard lock(m_throttle_lock);
+    if (object_num > m_iops_tokens) {
+      // not enough local tokens: ask the bucket; 'wait' is true if queued
+      wait = m_throttles[ROC_QOS_IOPS_THROTTLE]->get(object_num, this,
+        &ObjectCacheStore::handle_throttle_ready, object_num,
+        ROC_QOS_IOPS_THROTTLE);
+    } else {
+      m_iops_tokens -= object_num;
+      flag = 1;
+    }
+  }
+  if (!wait && (m_qos_enabled_flag & ROC_QOS_BPS_THROTTLE)) {
+    std::lock_guard lock(m_throttle_lock);
+    if (object_size > m_bps_tokens) {
+      wait = m_throttles[ROC_QOS_BPS_THROTTLE]->get(object_size, this,
+        &ObjectCacheStore::handle_throttle_ready, object_size,
+        ROC_QOS_BPS_THROTTLE);
+    } else {
+      m_bps_tokens -= object_size;
+    }
+  }
+
+  if (wait) {
+    m_io_throttled = true;
+    // when passing iops throttle, but limit in bps throttle, recovery
+    if (flag == 1) {
+      std::lock_guard lock(m_throttle_lock);
+      m_iops_tokens += object_num;
+    }
+  }
+
+  return !wait;
+}
+
+// Human-readable names for the throttle flags, used when constructing the
+// TokenBucketThrottle instances in apply_qos_tick_and_limit().
+static const std::map<uint64_t, std::string> THROTTLE_FLAGS = {
+  { ROC_QOS_IOPS_THROTTLE, "roc_qos_iops_throttle" },
+  { ROC_QOS_BPS_THROTTLE, "roc_qos_bps_throttle" }
+};
+
+// Create and register a TokenBucketThrottle for the given flag (IOPS or
+// BPS), backed by the process-wide SafeTimer singleton.  Invalid burst
+// settings (burst < limit) are rejected by set_limit(); in that case the
+// throttle falls back to limit-only mode.
+void ObjectCacheStore::apply_qos_tick_and_limit(
+    const uint64_t flag,
+    std::chrono::milliseconds min_tick,
+    uint64_t limit,
+    uint64_t burst,
+    std::chrono::seconds burst_seconds) {
+  SafeTimerSingleton* safe_timer_singleton = nullptr;
+  TokenBucketThrottle* throttle = nullptr;
+  // one shared timer per process, created on first use
+  safe_timer_singleton =
+    &m_cct->lookup_or_create_singleton_object<SafeTimerSingleton>(
+      "tools::immutable_object_cache", false, m_cct);
+  SafeTimer* timer = safe_timer_singleton;
+  ceph::mutex* timer_lock = &safe_timer_singleton->lock;
+  m_qos_enabled_flag |= flag;
+  auto throttle_flags_it = THROTTLE_FLAGS.find(flag);
+  ceph_assert(throttle_flags_it != THROTTLE_FLAGS.end());
+  throttle = new TokenBucketThrottle(m_cct, throttle_flags_it->second,
+                                     0, 0, timer, timer_lock);
+  throttle->set_schedule_tick_min(min_tick.count());
+  int ret = throttle->set_limit(limit, burst, burst_seconds.count());
+  if (ret < 0) {
+    lderr(m_cct) << throttle->get_name() << ": invalid qos parameter: "
+                 << "burst(" << burst << ") is less than "
+                 << "limit(" << limit << ")" << dendl;
+    // fall back to no burst allowance
+    throttle->set_limit(limit, 0, 1);
+  }
+
+  // each flag may only be configured once
+  ceph_assert(m_throttles.find(flag) == m_throttles.end());
+  m_throttles.insert({flag, throttle});
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/ObjectCacheStore.h b/src/tools/immutable_object_cache/ObjectCacheStore.h
new file mode 100644
index 000000000..51e5a77b8
--- /dev/null
+++ b/src/tools/immutable_object_cache/ObjectCacheStore.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_OBJECT_CACHE_STORE_H
+#define CEPH_CACHE_OBJECT_CACHE_STORE_H
+
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "common/Throttle.h"
+#include "common/Cond.h"
+#include "include/rados/librados.hpp"
+
+#include "SimplePolicy.h"
+
+
+using librados::Rados;
+using librados::IoCtx;
+class Context;
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+typedef shared_ptr<librados::Rados> RadosRef;
+typedef shared_ptr<librados::IoCtx> IoCtxRef;
+
+// Backing store of the immutable object cache daemon: promotes RADOS
+// objects into local cache files, serves lookup requests, evicts cold
+// files, and optionally rate-limits promotion with IOPS/BPS throttles.
+class ObjectCacheStore {
+ public:
+  ObjectCacheStore(CephContext *cct);
+  ~ObjectCacheStore();
+  int init(bool reset);
+  int shutdown();
+  int init_cache();
+  // Resolve an object to a local cache file path; returns the DNE path
+  // for non-existent objects when return_dne_path is set.
+  int lookup_object(std::string pool_nspace,
+                    uint64_t pool_id, uint64_t snap_id,
+                    uint64_t object_size,
+                    std::string object_name,
+                    bool return_dne_path,
+                    std::string& target_cache_file_path);
+ private:
+  enum ThrottleTypeCode {
+    THROTTLE_CODE_BYTE,
+    THROTTLE_CODE_OBJECT
+  };
+
+  std::string get_cache_file_name(std::string pool_nspace, uint64_t pool_id,
+                                  uint64_t snap_id, std::string oid);
+  std::string get_cache_file_path(std::string cache_file_name,
+                                  bool mkdir = false);
+  int evict_objects();
+  int do_promote(std::string pool_nspace, uint64_t pool_id,
+                 uint64_t snap_id, std::string object_name);
+  int promote_object(librados::IoCtx*, std::string object_name,
+                     librados::bufferlist* read_buf,
+                     Context* on_finish);
+  int handle_promote_callback(int, bufferlist*, std::string);
+  int do_evict(std::string cache_file);
+
+  bool take_token_from_throttle(uint64_t object_size, uint64_t object_num);
+  void handle_throttle_ready(uint64_t tokens, uint64_t type);
+  void apply_qos_tick_and_limit(const uint64_t flag,
+                                std::chrono::milliseconds min_tick,
+                                uint64_t limit, uint64_t burst,
+                                std::chrono::seconds burst_seconds);
+
+  CephContext *m_cct;
+  RadosRef m_rados;
+  std::map<uint64_t, librados::IoCtx> m_ioctx_map;
+  ceph::mutex m_ioctx_map_lock =
+    ceph::make_mutex("ceph::cache::ObjectCacheStore::m_ioctx_map_lock");
+  Policy* m_policy;
+  std::string m_cache_root_dir;
+  // throttle mechanism
+  uint64_t m_qos_enabled_flag{0};
+  std::map<uint64_t, TokenBucketThrottle*> m_throttles;
+  bool m_io_throttled{false};
+  // note: a stray second ';' was removed from the end of this declaration
+  ceph::mutex m_throttle_lock =
+    ceph::make_mutex("ceph::cache::ObjectCacheStore::m_throttle_lock");
+  uint64_t m_iops_tokens{0};
+  uint64_t m_bps_tokens{0};
+};
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif // CEPH_CACHE_OBJECT_CACHE_STORE_H
diff --git a/src/tools/immutable_object_cache/Policy.h b/src/tools/immutable_object_cache/Policy.h
new file mode 100644
index 000000000..7924a8919
--- /dev/null
+++ b/src/tools/immutable_object_cache/Policy.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_POLICY_H
+#define CEPH_CACHE_POLICY_H
+
+#include <list>
+#include <string>
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Lifecycle states of a cached object as tracked by a Policy.
+typedef enum {
+  OBJ_CACHE_NONE = 0,   // not cached / slot is free
+  OBJ_CACHE_PROMOTED,   // promotion finished; local file available
+  OBJ_CACHE_SKIP,       // promotion in flight; caller should read from RADOS
+  OBJ_CACHE_DNE,        // presumably "does not exist" in the cluster -- confirm
+} cache_status_t;
+
+// Abstract cache-replacement policy: tracks per-object status and decides
+// which entries to evict.
+class Policy {
+ public:
+  Policy() {}
+  virtual ~Policy() {}
+  virtual cache_status_t lookup_object(std::string) = 0;
+  virtual int evict_entry(std::string) = 0;
+  virtual void update_status(std::string, cache_status_t,
+                             uint64_t size = 0) = 0;
+  virtual cache_status_t get_status(std::string) = 0;
+  virtual void get_evict_list(std::list<std::string>* obj_list) = 0;
+};
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif
diff --git a/src/tools/immutable_object_cache/SimplePolicy.cc b/src/tools/immutable_object_cache/SimplePolicy.cc
new file mode 100644
index 000000000..3a7375ba9
--- /dev/null
+++ b/src/tools/immutable_object_cache/SimplePolicy.cc
@@ -0,0 +1,216 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "SimplePolicy.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::SimplePolicy: " << this << " " \
+ << __func__ << ": "
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Construct with a byte-capacity budget, a cap on concurrent promotions,
+// and an eviction watermark (fraction of capacity).
+SimplePolicy::SimplePolicy(CephContext *cct, uint64_t cache_size,
+                           uint64_t max_inflight, double watermark)
+  : cct(cct), m_watermark(watermark), m_max_inflight_ops(max_inflight),
+    m_max_cache_size(cache_size) {
+
+  ldout(cct, 20) << "max cache size= " << m_max_cache_size
+                 << " ,watermark= " << m_watermark
+                 << " ,max inflight ops= " << m_max_inflight_ops << dendl;
+
+  m_cache_size = 0;
+
+}
+
+// Free all owned Entry objects.  NOTE(review): entries still linked into
+// m_promoted_lru are deleted here without being removed from the LRU first;
+// presumably LRUObject unlinks itself on destruction -- confirm.
+SimplePolicy::~SimplePolicy() {
+  ldout(cct, 20) << dendl;
+
+  for (auto it : m_cache_map) {
+    Entry* entry = (it.second);
+    delete entry;
+  }
+}
+
+// Reserve a cache slot for file_name.  Returns OBJ_CACHE_NONE to tell the
+// caller to start a promotion, or OBJ_CACHE_SKIP when the object is already
+// being promoted or no capacity/inflight budget remains (caller reads the
+// object from RADOS instead).
+cache_status_t SimplePolicy::alloc_entry(std::string file_name) {
+  ldout(cct, 20) << "alloc entry for: " << file_name << dendl;
+
+  std::unique_lock wlocker{m_cache_map_lock};
+
+  // cache hit when promoting
+  if (m_cache_map.find(file_name) != m_cache_map.end()) {
+    ldout(cct, 20) << "object is under promoting: " << file_name << dendl;
+    return OBJ_CACHE_SKIP;
+  }
+
+  if ((m_cache_size < m_max_cache_size) &&
+      (inflight_ops < m_max_inflight_ops)) {
+    // plain 'new' throws on failure, so the old null-assert was dead code
+    Entry* entry = new Entry();
+    m_cache_map[file_name] = entry;
+    // drop the lock before re-entering update_status(), which locks again
+    wlocker.unlock();
+    update_status(file_name, OBJ_CACHE_SKIP);
+    return OBJ_CACHE_NONE; // start promotion request
+  }
+
+  // if there's no free entry, return skip to read from rados
+  return OBJ_CACHE_SKIP;
+}
+
+// Look up the cache status of file_name; unknown objects trigger a slot
+// allocation (promote-on-first-lookup).  Promoted/DNE hits are refreshed
+// in the LRU.
+cache_status_t SimplePolicy::lookup_object(std::string file_name) {
+  ldout(cct, 20) << "lookup: " << file_name << dendl;
+
+  std::shared_lock rlocker{m_cache_map_lock};
+
+  auto entry_it = m_cache_map.find(file_name);
+  // simply promote on first lookup
+  if (entry_it == m_cache_map.end()) {
+    rlocker.unlock();
+    return alloc_entry(file_name);
+  }
+
+  Entry* entry = entry_it->second;
+
+  if (entry->status == OBJ_CACHE_PROMOTED || entry->status == OBJ_CACHE_DNE) {
+    // bump pos in lru on hit
+    // NOTE(review): lru_touch happens under only a shared lock; confirm the
+    // LRU structure tolerates concurrent touches.
+    m_promoted_lru.lru_touch(entry);
+  }
+
+  return entry->status;
+}
+
+// Drive the entry for file_name through the promote/evict state machine,
+// keeping m_cache_size, inflight_ops, and the promoted LRU in sync.
+// 'size' is only meaningful when a promotion completes.  Unknown files and
+// unrecognized transitions are ignored.
+void SimplePolicy::update_status(std::string file_name,
+                                 cache_status_t new_status, uint64_t size) {
+  ldout(cct, 20) << "update status for: " << file_name
+                 << " new status = " << new_status << dendl;
+
+  std::unique_lock locker{m_cache_map_lock};
+
+  auto entry_it = m_cache_map.find(file_name);
+  if (entry_it == m_cache_map.end()) {
+    return;
+  }
+  // (removed a redundant ceph_assert duplicating the check above)
+
+  Entry* entry = entry_it->second;
+
+  // to promote: NONE -> SKIP marks a promotion in flight
+  if (entry->status == OBJ_CACHE_NONE && new_status == OBJ_CACHE_SKIP) {
+    entry->status = new_status;
+    entry->file_name = file_name;
+    inflight_ops++;
+    return;
+  }
+
+  // promoting done: SKIP -> PROMOTED/DNE; account size, insert into LRU
+  if (entry->status == OBJ_CACHE_SKIP && (new_status == OBJ_CACHE_PROMOTED ||
+                                          new_status == OBJ_CACHE_DNE)) {
+    m_promoted_lru.lru_insert_top(entry);
+    entry->status = new_status;
+    entry->size = size;
+    m_cache_size += entry->size;
+    inflight_ops--;
+    return;
+  }
+
+  // promoting failed: SKIP -> NONE; free the slot
+  if (entry->status == OBJ_CACHE_SKIP && new_status == OBJ_CACHE_NONE) {
+    // mark this entry as free
+    entry->file_name = "";
+    entry->status = new_status;
+
+    m_cache_map.erase(entry_it);
+    inflight_ops--;
+    delete entry;
+    return;
+  }
+
+  // to evict: PROMOTED/DNE -> NONE; release accounted bytes
+  if ((entry->status == OBJ_CACHE_PROMOTED || entry->status == OBJ_CACHE_DNE) &&
+      new_status == OBJ_CACHE_NONE) {
+    // mark this entry as free (renamed local so the 'size' parameter is
+    // no longer shadowed)
+    uint64_t evicted_size = entry->size;
+    entry->file_name = "";
+    entry->size = 0;
+    entry->status = new_status;
+
+    m_promoted_lru.lru_remove(entry);
+    m_cache_map.erase(entry_it);
+    m_cache_size -= evicted_size;
+    delete entry;
+    return;
+  }
+}
+
+// Evict one file: delegates to the PROMOTED/DNE -> NONE transition of
+// update_status().  Always returns 0.
+int SimplePolicy::evict_entry(std::string file_name) {
+  ldout(cct, 20) << "to evict: " << file_name << dendl;
+
+  update_status(file_name, OBJ_CACHE_NONE);
+
+  return 0;
+}
+
+// Report the current status of file_name; unknown files read as NONE.
+cache_status_t SimplePolicy::get_status(std::string file_name) {
+  ldout(cct, 20) << file_name << dendl;
+
+  std::shared_lock locker{m_cache_map_lock};
+  auto entry_it = m_cache_map.find(file_name);
+  if (entry_it == m_cache_map.end()) {
+    return OBJ_CACHE_NONE;
+  }
+
+  return entry_it->second->status;
+}
+
+// Collect eviction candidates: once the accounted cache size exceeds the
+// watermark fraction of capacity, pop up to 10% of the entries from the
+// cold end of the promoted LRU into obj_list.
+void SimplePolicy::get_evict_list(std::list<std::string>* obj_list) {
+  ldout(cct, 20) << dendl;
+
+  std::unique_lock locker{m_cache_map_lock};
+  // check free ratio, pop entries from LRU
+  if ((double)m_cache_size > m_max_cache_size * m_watermark) {
+    // TODO(dehao): make this configurable
+    int evict_num = m_cache_map.size() * 0.1;
+    for (int i = 0; i < evict_num; i++) {
+      Entry* entry = reinterpret_cast<Entry*>(m_promoted_lru.lru_expire());
+      if (entry == nullptr) {
+        continue;
+      }
+      std::string file_name = entry->file_name;
+      obj_list->push_back(file_name);
+    }
+  }
+}
+
+// for unit test
+// Remaining byte budget before the cache is full.
+uint64_t SimplePolicy::get_free_size() {
+  return m_max_cache_size - m_cache_size;
+}
+
+// Number of entries with an in-flight promotion (status == SKIP).
+uint64_t SimplePolicy::get_promoting_entry_num() {
+  uint64_t index = 0;
+  std::shared_lock rlocker{m_cache_map_lock};
+  for (auto it : m_cache_map) {
+    if (it.second->status == OBJ_CACHE_SKIP) {
+      index++;
+    }
+  }
+  return index;
+}
+
+// Number of entries currently linked into the promoted LRU.
+uint64_t SimplePolicy::get_promoted_entry_num() {
+  return m_promoted_lru.lru_get_size();
+}
+
+// File name of the next LRU eviction candidate, or "" when the LRU is empty.
+std::string SimplePolicy::get_evict_entry() {
+  Entry* entry = reinterpret_cast<Entry*>(m_promoted_lru.lru_get_next_expire());
+  if (entry == nullptr) {
+    return "";
+  }
+  return entry->file_name;
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/SimplePolicy.h b/src/tools/immutable_object_cache/SimplePolicy.h
new file mode 100644
index 000000000..671cbd518
--- /dev/null
+++ b/src/tools/immutable_object_cache/SimplePolicy.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_SIMPLE_POLICY_H
+#define CEPH_CACHE_SIMPLE_POLICY_H
+
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "include/lru.h"
+#include "Policy.h"
+
+#include <unordered_map>
+#include <string>
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// A simple promotion/eviction policy: promote on first lookup, keep
+// promoted entries in an LRU, and evict ~10% of entries once accounted
+// size crosses the configured watermark (see get_evict_list()).
+class SimplePolicy : public Policy {
+ public:
+  SimplePolicy(CephContext *cct, uint64_t block_num, uint64_t max_inflight,
+               double watermark);
+  ~SimplePolicy();
+
+  // 'override' added to the Policy virtuals so signature drift is caught
+  // at compile time.
+  cache_status_t lookup_object(std::string file_name) override;
+  cache_status_t get_status(std::string file_name) override;
+
+  void update_status(std::string file_name,
+                     cache_status_t new_status,
+                     uint64_t size = 0) override;
+
+  int evict_entry(std::string file_name) override;
+
+  void get_evict_list(std::list<std::string>* obj_list) override;
+
+  // accessors used by unit tests
+  uint64_t get_free_size();
+  uint64_t get_promoting_entry_num();
+  uint64_t get_promoted_entry_num();
+  std::string get_evict_entry();
+
+ private:
+  cache_status_t alloc_entry(std::string file_name);
+
+  // One cached object; linked into m_promoted_lru once promoted.
+  class Entry : public LRUObject {
+   public:
+    cache_status_t status;
+    Entry() : status(OBJ_CACHE_NONE) {}
+    std::string file_name;
+    uint64_t size = 0;  // bytes; was previously left uninitialized
+  };
+
+  CephContext* cct;
+  double m_watermark;
+  uint64_t m_max_inflight_ops;
+  uint64_t m_max_cache_size;
+  std::atomic<uint64_t> inflight_ops = 0;
+
+  std::unordered_map<std::string, Entry*> m_cache_map;
+  ceph::shared_mutex m_cache_map_lock =
+    ceph::make_shared_mutex("rbd::cache::SimplePolicy::m_cache_map_lock");
+
+  std::atomic<uint64_t> m_cache_size;
+
+  LRU m_promoted_lru;
+};
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif // CEPH_CACHE_SIMPLE_POLICY_H
diff --git a/src/tools/immutable_object_cache/SocketCommon.h b/src/tools/immutable_object_cache/SocketCommon.h
new file mode 100644
index 000000000..99acf3609
--- /dev/null
+++ b/src/tools/immutable_object_cache/SocketCommon.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_SOCKET_COMMON_H
+#define CEPH_CACHE_SOCKET_COMMON_H
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+// Message type codes exchanged between the cache daemon and its clients.
+static const int RBDSC_REGISTER = 0X11;
+static const int RBDSC_READ = 0X12;
+static const int RBDSC_REGISTER_REPLY = 0X13;
+static const int RBDSC_READ_REPLY = 0X14;
+static const int RBDSC_READ_RADOS = 0X15;
+
+// Error codes reported by the ASIO-based socket layer.
+static const int ASIO_ERROR_READ = 0X01;
+static const int ASIO_ERROR_WRITE = 0X02;
+static const int ASIO_ERROR_CONNECT = 0X03;
+static const int ASIO_ERROR_ACCEPT = 0X04;
+static const int ASIO_ERROR_MSG_INCOMPLETE = 0X05;
+
+class ObjectCacheRequest;
+class CacheSession;
+
+// NOTE(review): GenContextURef and std::function are used without a visible
+// include here; presumably every includer provides them first -- confirm.
+typedef GenContextURef<ObjectCacheRequest*> CacheGenContextURef;
+
+typedef std::function<void(CacheSession*, ObjectCacheRequest*)> ProcessMsg;
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif // CEPH_CACHE_SOCKET_COMMON_H
diff --git a/src/tools/immutable_object_cache/Types.cc b/src/tools/immutable_object_cache/Types.cc
new file mode 100644
index 000000000..860017d6a
--- /dev/null
+++ b/src/tools/immutable_object_cache/Types.cc
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "SocketCommon.h"
+
+#define dout_subsys ceph_subsys_immutable_obj_cache
+#undef dout_prefix
+#define dout_prefix *_dout << "ceph::cache::Types: " << __func__ << ": "
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+ObjectCacheRequest::ObjectCacheRequest() {}
+ObjectCacheRequest::ObjectCacheRequest(uint16_t t, uint64_t s)
+  : type(t), seq(s) {}
+ObjectCacheRequest::~ObjectCacheRequest() {}
+
+// Serialize into 'payload': a versioned envelope (v2, compat 1) holding
+// type and seq, followed by a subclass payload when payload_empty() is
+// false.
+void ObjectCacheRequest::encode() {
+  ENCODE_START(2, 1, payload);
+  ceph::encode(type, payload);
+  ceph::encode(seq, payload);
+  if (!payload_empty()) {
+    encode_payload();
+  }
+  ENCODE_FINISH(payload);
+}
+
+// Inverse of encode(); struct_v is forwarded to the subclass so it can
+// handle fields added in later encode versions.
+void ObjectCacheRequest::decode(bufferlist& bl) {
+  auto i = bl.cbegin();
+  DECODE_START(2, i);
+  ceph::decode(type, i);
+  ceph::decode(seq, i);
+  if (!payload_empty()) {
+    decode_payload(i, struct_v);
+  }
+  DECODE_FINISH(i);
+}
+
+// Client -> daemon registration message; payload is the client version.
+ObjectCacheRegData::ObjectCacheRegData() {}
+ObjectCacheRegData::ObjectCacheRegData(uint16_t t, uint64_t s)
+  : ObjectCacheRequest(t, s) {}
+ObjectCacheRegData::ObjectCacheRegData(uint16_t t, uint64_t s,
+                                       const std::string &version)
+  : ObjectCacheRequest(t, s),
+    version(version) {
+}
+
+ObjectCacheRegData::~ObjectCacheRegData() {}
+
+void ObjectCacheRegData::encode_payload() {
+  ceph::encode(version, payload);
+}
+
+void ObjectCacheRegData::decode_payload(bufferlist::const_iterator i,
+                                        __u8 encode_version) {
+  // tolerate an empty payload (peer sent no version string)
+  if (i.end()) {
+    return;
+  }
+  ceph::decode(version, i);
+}
+
+// Daemon -> client registration acknowledgement; carries no payload.
+ObjectCacheRegReplyData::ObjectCacheRegReplyData() {}
+ObjectCacheRegReplyData::ObjectCacheRegReplyData(uint16_t t, uint64_t s)
+  : ObjectCacheRequest(t, s) {}
+
+ObjectCacheRegReplyData::~ObjectCacheRegReplyData() {}
+
+void ObjectCacheRegReplyData::encode_payload() {}
+
+void ObjectCacheRegReplyData::decode_payload(bufferlist::const_iterator bl,
+                                             __u8 encode_version) {}
+
+// Client -> daemon read request identifying an object by pool/snap/oid.
+ObjectCacheReadData::ObjectCacheReadData(uint16_t t, uint64_t s,
+                                         uint64_t read_offset,
+                                         uint64_t read_len,
+                                         uint64_t pool_id, uint64_t snap_id,
+                                         uint64_t object_size,
+                                         std::string oid,
+                                         std::string pool_namespace)
+  : ObjectCacheRequest(t, s), read_offset(read_offset),
+    read_len(read_len), pool_id(pool_id), snap_id(snap_id),
+    object_size(object_size), oid(oid), pool_namespace(pool_namespace)
+{}
+
+// Decode-side constructor: fields are filled in by decode_payload().
+ObjectCacheReadData::ObjectCacheReadData(uint16_t t, uint64_t s)
+  : ObjectCacheRequest(t, s) {}
+
+ObjectCacheReadData::~ObjectCacheReadData() {}
+
+// Field order here is the wire format and must match decode_payload().
+void ObjectCacheReadData::encode_payload() {
+  ceph::encode(read_offset, payload);
+  ceph::encode(read_len, payload);
+  ceph::encode(pool_id, payload);
+  ceph::encode(snap_id, payload);
+  ceph::encode(oid, payload);
+  ceph::encode(pool_namespace, payload);
+  ceph::encode(object_size, payload);
+}
+
+void ObjectCacheReadData::decode_payload(bufferlist::const_iterator i,
+                                         __u8 encode_version) {
+  ceph::decode(read_offset, i);
+  ceph::decode(read_len, i);
+  ceph::decode(pool_id, i);
+  ceph::decode(snap_id, i);
+  ceph::decode(oid, i);
+  ceph::decode(pool_namespace, i);
+  // object_size was appended in encode version 2; older peers omit it
+  // (the member keeps its default of 0 in that case)
+  if (encode_version >= 2) {
+    ceph::decode(object_size, i);
+  }
+}
+
+// Daemon -> client read reply; payload is the local cache file path.
+ObjectCacheReadReplyData::ObjectCacheReadReplyData(uint16_t t, uint64_t s,
+                                                   string cache_path)
+  : ObjectCacheRequest(t, s), cache_path(cache_path) {}
+ObjectCacheReadReplyData::ObjectCacheReadReplyData(uint16_t t, uint64_t s)
+  : ObjectCacheRequest(t, s) {}
+
+ObjectCacheReadReplyData::~ObjectCacheReadReplyData() {}
+
+void ObjectCacheReadReplyData::encode_payload() {
+  ceph::encode(cache_path, payload);
+}
+
+void ObjectCacheReadReplyData::decode_payload(bufferlist::const_iterator i,
+                                              __u8 encode_version) {
+  ceph::decode(cache_path, i);
+}
+
+// Daemon -> client instruction to read directly from RADOS; no payload.
+ObjectCacheReadRadosData::ObjectCacheReadRadosData() {}
+ObjectCacheReadRadosData::ObjectCacheReadRadosData(uint16_t t, uint64_t s)
+  : ObjectCacheRequest(t, s) {}
+
+ObjectCacheReadRadosData::~ObjectCacheReadRadosData() {}
+
+void ObjectCacheReadRadosData::encode_payload() {}
+
+void ObjectCacheReadRadosData::decode_payload(bufferlist::const_iterator i,
+                                              __u8 encode_version) {}
+
+// Factory: peek at the common envelope to learn the message type, then
+// construct the matching subclass and let it decode the full buffer.
+// Caller owns the returned object.  Unknown types abort.
+ObjectCacheRequest* decode_object_cache_request(bufferlist payload_buffer) {
+  ObjectCacheRequest* req = nullptr;
+
+  uint16_t type;
+  uint64_t seq;
+  auto i = payload_buffer.cbegin();
+  DECODE_START(1, i);
+  ceph::decode(type, i);
+  ceph::decode(seq, i);
+  DECODE_FINISH(i);
+
+  switch (type) {
+    case RBDSC_REGISTER: {
+      req = new ObjectCacheRegData(type, seq);
+      break;
+    }
+    case RBDSC_READ: {
+      req = new ObjectCacheReadData(type, seq);
+      break;
+    }
+    case RBDSC_REGISTER_REPLY: {
+      req = new ObjectCacheRegReplyData(type, seq);
+      break;
+    }
+    case RBDSC_READ_REPLY: {
+      req = new ObjectCacheReadReplyData(type, seq);
+      break;
+    }
+    case RBDSC_READ_RADOS: {
+      req = new ObjectCacheReadRadosData(type, seq);
+      break;
+    }
+    default:
+      ceph_assert(0);
+  }
+
+  // re-decode from the start; the subclass consumes envelope + payload
+  req->decode(payload_buffer);
+
+  return req;
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
diff --git a/src/tools/immutable_object_cache/Types.h b/src/tools/immutable_object_cache/Types.h
new file mode 100644
index 000000000..05394d843
--- /dev/null
+++ b/src/tools/immutable_object_cache/Types.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_TYPES_H
+#define CEPH_CACHE_TYPES_H
+
+#include "include/encoding.h"
+#include "include/Context.h"
+#include "SocketCommon.h"
+
+namespace ceph {
+namespace immutable_obj_cache {
+
+namespace {
+// Mirrors the on-wire prefix produced by ENCODE_START: one byte each for
+// struct version and compat version, then a 32-bit little-endian length.
+struct HeaderHelper {
+  uint8_t v;
+  uint8_t c_v;
+  ceph_le32 len;
+}__attribute__((packed));
+
+inline uint8_t get_header_size() {
+  return sizeof(HeaderHelper);
+}
+
+// Read the encoded body length out of a raw message header.
+inline uint32_t get_data_len(char* buf) {
+  HeaderHelper* header = reinterpret_cast<HeaderHelper*>(buf);
+  return header->len;
+}
+} // namespace
+
+// Base class for all cache IPC messages: a (type, seq) envelope plus an
+// optional subclass-specific payload.
+class ObjectCacheRequest {
+ public:
+  uint16_t type;
+  uint64_t seq;
+
+  bufferlist payload;
+
+  CacheGenContextURef process_msg;
+
+  ObjectCacheRequest();
+  ObjectCacheRequest(uint16_t type, uint64_t seq);
+  virtual ~ObjectCacheRequest();
+
+  // encode consists of two steps
+  // step 1 : directly encode common bits using encode method of base class.
+  // step 2 : according to payload_empty, determine whether additional bits
+  //          need to be encoded, which is implemented by the child class.
+  void encode();
+  void decode(bufferlist& bl);
+  bufferlist get_payload_bufferlist() { return payload; }
+
+  virtual void encode_payload() = 0;
+  virtual void decode_payload(bufferlist::const_iterator bl_it,
+                              __u8 encode_version) = 0;
+  virtual uint16_t get_request_type() = 0;
+  virtual bool payload_empty() = 0;
+};
+
+// Client registration: carries the client's version string.
+class ObjectCacheRegData : public ObjectCacheRequest {
+ public:
+  std::string version;
+  ObjectCacheRegData();
+  ObjectCacheRegData(uint16_t t, uint64_t s, const std::string &version);
+  ObjectCacheRegData(uint16_t t, uint64_t s);
+  ~ObjectCacheRegData() override;
+  void encode_payload() override;
+  void decode_payload(bufferlist::const_iterator bl,
+                      __u8 encode_version) override;
+  uint16_t get_request_type() override { return RBDSC_REGISTER; }
+  bool payload_empty() override { return false; }
+};
+
+// Registration acknowledgement; envelope only.
+class ObjectCacheRegReplyData : public ObjectCacheRequest {
+ public:
+  ObjectCacheRegReplyData();
+  ObjectCacheRegReplyData(uint16_t t, uint64_t s);
+  ~ObjectCacheRegReplyData() override;
+  void encode_payload() override;
+  void decode_payload(bufferlist::const_iterator iter,
+                      __u8 encode_version) override;
+  uint16_t get_request_type() override { return RBDSC_REGISTER_REPLY; }
+  bool payload_empty() override { return true; }
+};
+
+// Read request identifying an object by pool/namespace/snap/oid.
+class ObjectCacheReadData : public ObjectCacheRequest {
+ public:
+  uint64_t read_offset;
+  uint64_t read_len;
+  uint64_t pool_id;
+  uint64_t snap_id;
+  uint64_t object_size = 0;  // absent before encode version 2
+  std::string oid;
+  std::string pool_namespace;
+  ObjectCacheReadData(uint16_t t, uint64_t s, uint64_t read_offset,
+                      uint64_t read_len, uint64_t pool_id,
+                      uint64_t snap_id, uint64_t object_size,
+                      std::string oid, std::string pool_namespace);
+  ObjectCacheReadData(uint16_t t, uint64_t s);
+  ~ObjectCacheReadData() override;
+  void encode_payload() override;
+  void decode_payload(bufferlist::const_iterator bl,
+                      __u8 encode_version) override;
+  uint16_t get_request_type() override { return RBDSC_READ; }
+  bool payload_empty() override { return false; }
+};
+
+// Read reply carrying the local cache file path.
+class ObjectCacheReadReplyData : public ObjectCacheRequest {
+ public:
+  std::string cache_path;
+  ObjectCacheReadReplyData(uint16_t t, uint64_t s, std::string cache_path);
+  ObjectCacheReadReplyData(uint16_t t, uint64_t s);
+  ~ObjectCacheReadReplyData() override;
+  void encode_payload() override;
+  void decode_payload(bufferlist::const_iterator bl,
+                      __u8 encode_version) override;
+  uint16_t get_request_type() override { return RBDSC_READ_REPLY; }
+  bool payload_empty() override { return false; }
+};
+
+// Instruction to fall back to reading directly from RADOS; envelope only.
+class ObjectCacheReadRadosData : public ObjectCacheRequest {
+ public:
+  ObjectCacheReadRadosData();
+  ObjectCacheReadRadosData(uint16_t t, uint64_t s);
+  ~ObjectCacheReadRadosData() override;
+  void encode_payload() override;
+  void decode_payload(bufferlist::const_iterator bl,
+                      __u8 encode_version) override;
+  uint16_t get_request_type() override { return RBDSC_READ_RADOS; }
+  bool payload_empty() override { return true; }
+};
+
+// Factory; caller owns the returned request.
+ObjectCacheRequest* decode_object_cache_request(bufferlist payload_buffer);
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif // CEPH_CACHE_TYPES_H
diff --git a/src/tools/immutable_object_cache/Utils.h b/src/tools/immutable_object_cache/Utils.h
new file mode 100644
index 000000000..3c68cfa7b
--- /dev/null
+++ b/src/tools/immutable_object_cache/Utils.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CACHE_UTILS_H
+#define CEPH_CACHE_UTILS_H
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+
+namespace ceph {
+namespace immutable_obj_cache {
+namespace detail {
+
+// Trampoline: forwards a C-style rados completion to member function MF,
+// passing the operation's return value.
+template <typename T, void(T::*MF)(int)>
+void rados_callback(rados_completion_t c, void *arg) {
+  T *obj = reinterpret_cast<T*>(arg);
+  int r = rados_aio_get_return_value(c);
+  (obj->*MF)(r);
+}
+
+} // namespace detail
+
+// Build an AioCompletion whose callback invokes obj->MF(return_value);
+// MF defaults to T::complete.
+template <typename T, void(T::*MF)(int)=&T::complete>
+librados::AioCompletion *create_rados_callback(T *obj) {
+  return librados::Rados::aio_create_completion(
+    obj, &detail::rados_callback<T, MF>);
+}
+
+} // namespace immutable_obj_cache
+} // namespace ceph
+#endif // CEPH_CACHE_UTILS_H
diff --git a/src/tools/immutable_object_cache/main.cc b/src/tools/immutable_object_cache/main.cc
new file mode 100644
index 000000000..55b0d087a
--- /dev/null
+++ b/src/tools/immutable_object_cache/main.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "CacheController.h"
+
+#include <vector>
+
+ceph::immutable_obj_cache::CacheController *cachectl = nullptr;
+
+// Print command-line help for the daemon, then the generic server options.
+void usage() {
+  std::cout << "usage: ceph-immutable-object-cache [options...]" << std::endl;
+  std::cout << "options:\n";
+  std::cout << "  -m monaddress[:port]      connect to specified monitor\n";
+  std::cout << "  --keyring=<path>          path to keyring for local "
+            << "cluster\n";
+  std::cout << "  --log-file=<logfile>      file to log debug output\n";
+  std::cout << "  --debug-immutable-obj-cache=<log-level>/<memory-level>  "
+            << "set debug level\n";
+  generic_server_usage();
+}
+
+// Async-signal handler: forward SIGINT/SIGTERM to the controller, if any.
+static void handle_signal(int signum) {
+  if (cachectl)
+    cachectl->handle_signal(signum);
+}
+
+// Daemon entry point: parse arguments, initialize the cluster context,
+// install signal handlers, then run the cache controller until it exits.
+int main(int argc, const char **argv) {
+  std::vector<const char*> args;
+  env_to_vec(args);
+  argv_to_vec(argc, argv, args);
+
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_DAEMON,
+                         CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+  if (g_conf()->daemonize) {
+    global_init_daemonize(g_ceph_context);
+  }
+
+  common_init_finish(g_ceph_context);
+  global_init_chdir(g_ceph_context);
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGHUP, sighup_handler);
+  register_async_signal_handler_oneshot(SIGINT, handle_signal);
+  register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+  std::vector<const char*> cmd_args;
+  argv_to_vec(argc, argv, cmd_args);
+
+  cachectl = new ceph::immutable_obj_cache::CacheController(g_ceph_context,
+                                                            cmd_args);
+  int r = cachectl->init();
+  if (r < 0) {
+    std::cerr << "failed to initialize: " << cpp_strerror(r) << std::endl;
+    goto cleanup;
+  }
+
+  r = cachectl->run();
+  if (r < 0) {
+    goto cleanup;
+  }
+
+ cleanup:
+  unregister_async_signal_handler(SIGHUP, sighup_handler);
+  unregister_async_signal_handler(SIGINT, handle_signal);
+  unregister_async_signal_handler(SIGTERM, handle_signal);
+  shutdown_async_signal_handler();
+
+  delete cachectl;
+
+  // BUG FIX: success/failure were swapped -- a negative return code must
+  // map to EXIT_FAILURE, not EXIT_SUCCESS.
+  return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/src/tools/kvstore_tool.cc b/src/tools/kvstore_tool.cc
new file mode 100644
index 000000000..d26a19588
--- /dev/null
+++ b/src/tools/kvstore_tool.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "kvstore_tool.h"
+
+#include <iostream>
+
+#include "common/errno.h"
+#include "common/url_escape.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+
+// Open (or prepare for repair) the backing key-value store.  "bluestore-kv"
+// mounts the store through BlueStore; any other type creates a plain
+// KeyValueDB which is opened immediately unless to_repair is set.
+StoreTool::StoreTool(const string& type,
+                     const string& path,
+                     bool to_repair,
+                     bool need_stats)
+  : store_path(path)
+{
+
+  if (need_stats) {
+    g_conf()->rocksdb_perf = true;
+    g_conf()->rocksdb_collect_compaction_stats = true;
+  }
+
+  if (type == "bluestore-kv") {
+#ifdef WITH_BLUESTORE
+    if (load_bluestore(path, to_repair) != 0)
+      exit(1);
+#else
+    cerr << "bluestore not compiled in" << std::endl;
+    exit(1);
+#endif
+  } else {
+    auto db_ptr = KeyValueDB::create(g_ceph_context, type, path);
+    if (!to_repair) {
+      if (int r = db_ptr->open(std::cerr); r < 0) {
+        cerr << "failed to open type " << type << " path " << path << ": "
+             << cpp_strerror(r) << std::endl;
+        exit(1);
+      }
+    }
+    // BUG FIX: take ownership unconditionally.  Previously db.reset() was
+    // inside the !to_repair branch, so repair mode leaked db_ptr and left
+    // 'db' null, crashing later calls such as destructive_repair().
+    db.reset(db_ptr);
+  }
+}
+
+// Mount a BlueStore instance at 'path' and adopt its embedded KeyValueDB;
+// the custom Deleter later unmounts BlueStore instead of deleting the db.
+// Returns 0 on success, -EINVAL if the environment cannot be opened.
+int StoreTool::load_bluestore(const string& path, bool to_repair)
+{
+  auto bluestore = new BlueStore(g_ceph_context, path);
+  KeyValueDB *db_ptr;
+  int r = bluestore->open_db_environment(&db_ptr, to_repair);
+  if (r < 0) {
+    return -EINVAL;
+  }
+  db = decltype(db){db_ptr, Deleter(bluestore)};
+  return 0;
+}
+
+// Walk the keyspace (optionally restricted to one prefix), printing each
+// url-escaped key to *out (if non-null), optionally its per-record CRC and
+// a hexdump of its value.  Returns a running CRC over all visited records,
+// seeded with -1 (0xffffffff).
+uint32_t StoreTool::traverse(const string& prefix,
+                             const bool do_crc,
+                             const bool do_value_dump,
+                             ostream *out)
+{
+  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+
+  if (prefix.empty())
+    iter->seek_to_first();
+  else
+    iter->seek_to_first(prefix);
+
+  uint32_t crc = -1;
+
+  while (iter->valid()) {
+    pair<string,string> rk = iter->raw_key();
+    if (!prefix.empty() && (rk.first != prefix))
+      break;
+
+    if (out)
+      *out << url_escape(rk.first) << "\t" << url_escape(rk.second);
+    if (do_crc) {
+      bufferlist bl;
+      bl.append(rk.first);
+      bl.append(rk.second);
+      bl.append(iter->value());
+
+      // fold into the running checksum; print this record's own CRC
+      // (seeded with 0) alongside the key
+      crc = bl.crc32c(crc);
+      if (out) {
+        *out << "\t" << bl.crc32c(0);
+      }
+    }
+    if (out)
+      *out << std::endl;
+    if (out && do_value_dump) {
+      bufferptr bp = iter->value_as_ptr();
+      bufferlist value;
+      value.append(bp);
+      ostringstream os;
+      value.hexdump(os);
+      std::cout << os.str() << std::endl;
+    }
+    iter->next();
+  }
+
+  return crc;
+}
+
+// Convenience wrapper: dump the keyspace to stdout via traverse().
+void StoreTool::list(const string& prefix, const bool do_crc,
+                     const bool do_value_dump)
+{
+  traverse(prefix, do_crc, do_value_dump, &std::cout);
+}
+
+// True if at least one key exists under 'prefix'.
+bool StoreTool::exists(const string& prefix)
+{
+  ceph_assert(!prefix.empty());
+  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+  iter->seek_to_first(prefix);
+  return (iter->valid() && (iter->raw_key().first == prefix));
+}
+
+// True if (prefix, key) exists; an empty key degrades to a prefix check.
+bool StoreTool::exists(const string& prefix, const string& key)
+{
+  ceph_assert(!prefix.empty());
+
+  if (key.empty()) {
+    return exists(prefix);
+  }
+  bool exists = false;
+  get(prefix, key, exists);
+  return exists;
+}
+
+// Fetch the value of (prefix, key); 'exists' reports whether it was found
+// (an empty bufferlist is returned when it was not).
+bufferlist StoreTool::get(const string& prefix,
+                          const string& key,
+                          bool& exists)
+{
+  ceph_assert(!prefix.empty() && !key.empty());
+
+  map<string,bufferlist> result;
+  std::set<std::string> keys;
+  keys.insert(key);
+  db->get(prefix, keys, &result);
+
+  if (result.count(key) > 0) {
+    exists = true;
+    return result[key];
+  } else {
+    exists = false;
+    return bufferlist();
+  }
+}
+
+// Print per-component size estimates and return the store's total
+// estimated size in bytes.
+uint64_t StoreTool::get_size()
+{
+  map<string,uint64_t> extras;
+  uint64_t s = db->get_estimated_size(extras);
+  for (auto& [name, size] : extras) {
+    std::cout << name << " - " << size << std::endl;
+  }
+  std::cout << "total: " << s << std::endl;
+  return s;
+}
+
+// Write (prefix, key) = val in a single synchronous transaction.
+bool StoreTool::set(const string &prefix, const string &key, bufferlist &val)
+{
+  ceph_assert(!prefix.empty());
+  ceph_assert(!key.empty());
+  ceph_assert(val.length() > 0);
+
+  KeyValueDB::Transaction tx = db->get_transaction();
+  tx->set(prefix, key, val);
+  int ret = db->submit_transaction_sync(tx);
+
+  return (ret == 0);
+}
+
+// Remove one key in a single synchronous transaction.
+bool StoreTool::rm(const string& prefix, const string& key)
+{
+  ceph_assert(!prefix.empty());
+  ceph_assert(!key.empty());
+
+  KeyValueDB::Transaction tx = db->get_transaction();
+  tx->rmkey(prefix, key);
+  int ret = db->submit_transaction_sync(tx);
+
+  return (ret == 0);
+}
+
+// Remove every key under 'prefix' in a single synchronous transaction.
+bool StoreTool::rm_prefix(const string& prefix)
+{
+  ceph_assert(!prefix.empty());
+
+  KeyValueDB::Transaction tx = db->get_transaction();
+  tx->rmkeys_by_prefix(prefix);
+  int ret = db->submit_transaction_sync(tx);
+
+  return (ret == 0);
+}
+
+// Print a human-readable summary of a completed copy_store_to() run.
+void StoreTool::print_summary(const uint64_t total_keys, const uint64_t total_size,
+                              const uint64_t total_txs, const string& store_path,
+                              const string& other_path, const int duration) const
+{
+  std::cout << "summary:" << std::endl;
+  std::cout << "  copied " << total_keys << " keys" << std::endl;
+  std::cout << "  used " << total_txs << " transactions" << std::endl;
+  std::cout << "  total size " << byte_u_t(total_size) << std::endl;
+  std::cout << "  from '" << store_path << "' to '" << other_path << "'"
+            << std::endl;
+  std::cout << "  duration " << duration << " seconds" << std::endl;
+}
+
+// Dump DB statistics as pretty JSON.  Returns 0 when rocksdb_perf is
+// enabled (stats collected), -1 otherwise.
+int StoreTool::print_stats() const
+{
+  ostringstream ostr;
+  Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty");
+  int ret = -1;
+  if (g_conf()->rocksdb_perf) {
+    db->get_statistics(f);
+    ostr << "db_statistics ";
+    f->flush(ostr);
+    ret = 0;
+  } else {
+    ostr << "db_statistics not enabled";
+    f->flush(ostr);
+  }
+  std::cout << ostr.str() << std::endl;
+  delete f;
+  return ret;
+}
+
+// Bulk-copy every key/value pair of this store into a (possibly new)
+// store of 'other_type' at 'other_path', batching num_keys_per_tx keys
+// per synchronous transaction and reporting progress as it goes.
+// Returns 0 on success or a negative error code.
+int StoreTool::copy_store_to(const string& type, const string& other_path,
+                             const int num_keys_per_tx,
+                             const string& other_type)
+{
+  if (num_keys_per_tx <= 0) {
+    std::cerr << "must specify a number of keys/tx > 0" << std::endl;
+    return -EINVAL;
+  }
+
+  // open or create the destination key-value store at @p other_path
+  boost::scoped_ptr<KeyValueDB> other;
+  KeyValueDB *other_ptr = KeyValueDB::create(g_ceph_context,
+                                             other_type,
+                                             other_path);
+  if (int err = other_ptr->create_and_open(std::cerr); err < 0) {
+    return err;
+  }
+  other.reset(other_ptr);
+
+  KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+  it->seek_to_first();
+  uint64_t total_keys = 0;
+  uint64_t total_size = 0;
+  uint64_t total_txs = 0;
+
+  // elapsed seconds since this lambda was created
+  auto duration = [start=coarse_mono_clock::now()] {
+    const auto now = coarse_mono_clock::now();
+    auto seconds = std::chrono::duration<double>(now - start);
+    return seconds.count();
+  };
+
+  do {
+    int num_keys = 0;
+
+    KeyValueDB::Transaction tx = other->get_transaction();
+
+    while (it->valid() && num_keys < num_keys_per_tx) {
+      auto [prefix, key] = it->raw_key();
+      bufferlist v = it->value();
+      tx->set(prefix, key, v);
+
+      num_keys++;
+      total_size += v.length();
+
+      it->next();
+    }
+
+    total_txs++;
+    total_keys += num_keys;
+
+    if (num_keys > 0)
+      other->submit_transaction_sync(tx);
+
+    std::cout << "ts = " << duration() << "s, copied " << total_keys
+              << " keys so far (" << byte_u_t(total_size) << ")"
+              << std::endl;
+
+  } while (it->valid());
+
+  print_summary(total_keys, total_size, total_txs, store_path, other_path,
+                duration());
+
+  return 0;
+}
+
+// Compact the entire store.
+void StoreTool::compact()
+{
+  db->compact();
+}
+
+// Compact only the keys under 'prefix'.
+void StoreTool::compact_prefix(const string& prefix)
+{
+  db->compact_prefix(prefix);
+}
+
+// Compact the key range [start, end) under 'prefix'.
+void StoreTool::compact_range(const string& prefix,
+                              const string& start,
+                              const string& end)
+{
+  db->compact_range(prefix, start, end);
+}
+
+// Run the backend's destructive repair, logging to stdout.
+int StoreTool::destructive_repair()
+{
+  return db->repair(std::cout);
+}
diff --git a/src/tools/kvstore_tool.h b/src/tools/kvstore_tool.h
new file mode 100644
index 000000000..d8c896613
--- /dev/null
+++ b/src/tools/kvstore_tool.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "acconfig.h"
+#include "include/buffer_fwd.h"
+#ifdef WITH_BLUESTORE
+#include "os/bluestore/BlueStore.h"
+#endif
+
+class KeyValueDB;
+
+// CLI-oriented wrapper around a KeyValueDB instance (opened directly, or
+// embedded in a mounted BlueStore when WITH_BLUESTORE) providing the
+// inspection, mutation, copy and compaction operations used by
+// ceph-kvstore-tool.
+class StoreTool
+{
+#ifdef WITH_BLUESTORE
+  // Custom deleter: a DB obtained from a mounted BlueStore must be torn
+  // down by umounting and deleting the store, not by deleting the DB
+  // handle itself.
+  struct Deleter {
+    BlueStore *bluestore;
+    Deleter()
+      : bluestore(nullptr) {}
+    Deleter(BlueStore *store)
+      : bluestore(store) {}
+    void operator()(KeyValueDB *db) {
+      if (bluestore) {
+        bluestore->umount();
+        delete bluestore;
+      } else {
+        delete db;
+      }
+    }
+  };
+  std::unique_ptr<KeyValueDB, Deleter> db;
+#else
+  std::unique_ptr<KeyValueDB> db;
+#endif
+
+  const std::string store_path;
+
+public:
+  StoreTool(const std::string& type,
+            const std::string& path,
+            bool need_open_db = true,
+            bool need_stats = false);
+  int load_bluestore(const std::string& path, bool need_open_db);
+  // Walk all keys under 'prefix' (all keyspaces if empty), optionally
+  // CRC-ing and/or dumping values to *out; returns the accumulated CRC.
+  uint32_t traverse(const std::string& prefix,
+                    const bool do_crc,
+                    const bool do_value_dump,
+                    ostream *out);
+  void list(const std::string& prefix,
+            const bool do_crc,
+            const bool do_value_dump);
+  bool exists(const std::string& prefix);
+  bool exists(const std::string& prefix, const std::string& key);
+  // Fetch one value; 'exists' is set to whether the key was found.
+  ceph::bufferlist get(const std::string& prefix,
+                      const std::string& key,
+                      bool& exists);
+  uint64_t get_size();
+  bool set(const std::string& prefix,
+           const std::string& key,
+           ceph::bufferlist& val);
+  bool rm(const std::string& prefix, const std::string& key);
+  bool rm_prefix(const std::string& prefix);
+  // Report totals for a completed copy_store_to() run.
+  void print_summary(const uint64_t total_keys, const uint64_t total_size,
+                     const uint64_t total_txs, const std::string& store_path,
+                     const std::string& other_path, const int duration) const;
+  // Copy every key/value pair into another store, batching writes in
+  // transactions of num_keys_per_tx keys.  Returns 0 or -errno.
+  int copy_store_to(const std::string& type, const std::string& other_path,
+                    const int num_keys_per_tx, const std::string& other_type);
+  void compact();
+  void compact_prefix(const std::string& prefix);
+  void compact_range(const std::string& prefix,
+                     const std::string& start,
+                     const std::string& end);
+  int destructive_repair();
+
+  int print_stats() const;
+};
diff --git a/src/tools/monmaptool.cc b/src/tools/monmaptool.cc
new file mode 100644
index 000000000..667e333bc
--- /dev/null
+++ b/src/tools/monmaptool.cc
@@ -0,0 +1,478 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <string>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+#include "include/str_list.h"
+#include "mon/MonMap.h"
+
+
+// Print monmaptool command-line usage to stdout.
+void usage()
+{
+  cout << "usage: monmaptool [--print] [--create [--clobber] [--fsid uuid]]\n"
+       << " [--enable-all-features]\n"
+       << " [--generate] [--set-initial-members]\n"
+       << " [--add name 1.2.3.4:567] [--rm name]\n"
+       << " [--addv name [v2:1.2.4.5:567,v1:1.2.3.4:568]]\n"
+       << " [--feature-list [plain|parseable]]\n"
+       << " [--feature-set <value> [--optional|--persistent]]\n"
+       << " [--feature-unset <value> [--optional|--persistent]]\n"
+       << " [--set-min-mon-release <release-major-number>]\n"
+       << " <mapfilename>"
+       << std::endl;
+}
+
+// Point the user at -h and terminate with a failure status.
+void helpful_exit()
+{
+  cerr << "monmaptool -h for usage" << std::endl;
+  exit(1);
+}
+
+// One feature-related command-line operation (set / unset / list) together
+// with the feature value it carries and the class of features it applies to
+// (persistent vs optional for set/unset; plain vs parseable for list).
+struct feature_op_t {
+  enum type_t {
+    PERSISTENT,
+    OPTIONAL,
+    PLAIN,
+    PARSEABLE,
+    NONE
+  };
+
+  enum op_t {
+    OP_SET,
+    OP_UNSET,
+    OP_LIST
+  };
+
+  op_t op;
+  type_t type;
+  mon_feature_t feature;
+
+  feature_op_t() : op(OP_LIST), type(NONE) { }
+  // default to 'persistent' feature if not specified
+  feature_op_t(op_t o) : op(o), type(PERSISTENT) { }
+  feature_op_t(op_t o, type_t t) : op(o), type(t) { }
+  // BUGFIX: initialize 'feature' from the mon_feature_t argument 'f'.
+  // The original initialized it from the type enum 't', silently dropping
+  // the caller-supplied feature value.
+  feature_op_t(op_t o, type_t t, mon_feature_t &f) :
+    op(o), type(t), feature(f) { }
+
+  void set_optional() {
+    type = OPTIONAL;
+  }
+  void set_persistent() {
+    type = PERSISTENT;
+  }
+  // Parse 's' as either a known feature name or a base-10 numeric feature
+  // value into 'feature'.  Returns false (reporting to *errout when
+  // provided) if it is neither.
+  bool parse_value(string &s, ostream *errout = NULL) {
+
+    feature = ceph::features::mon::get_feature_by_name(s);
+    if (feature != ceph::features::mon::FEATURE_NONE) {
+      return true;
+    }
+
+    // try parsing as numerical value
+    uint64_t feature_val;
+    string interr;
+    feature_val = strict_strtoll(s.c_str(), 10, &interr);
+    if (!interr.empty()) {
+      if (errout) {
+        *errout << "unknown features name '" << s
+                << "' or unable to parse value: " << interr << std::endl;
+      }
+      return false;
+    }
+    feature = mon_feature_t(feature_val);
+    return true;
+  }
+};
+
+// Print the monmap's feature sets plus the features this binary supports,
+// either human-readable (PLAIN) or machine-parseable (PARSEABLE).  Any
+// other type prints nothing.  The parseable format is an external contract
+// consumed by scripts; do not change the layout.
+void features_list(feature_op_t &f, MonMap &m)
+{
+  if (f.type == feature_op_t::type_t::PLAIN) {
+
+    cout << "MONMAP FEATURES:" << std::endl;
+    cout << " persistent: ";
+    m.persistent_features.print_with_value(cout);
+    cout << std::endl;
+    cout << " optional: ";
+    m.optional_features.print_with_value(cout);
+    cout << std::endl;
+    cout << " required: ";
+    m.get_required_features().print_with_value(cout);
+    cout << std::endl;
+
+    cout << std::endl;
+    cout << "AVAILABLE FEATURES:" << std::endl;
+    cout << " supported: ";
+    ceph::features::mon::get_supported().print_with_value(cout);
+    cout << std::endl;
+    cout << " persistent: ";
+    ceph::features::mon::get_persistent().print_with_value(cout);
+    cout << std::endl;
+  } else if (f.type == feature_op_t::type_t::PARSEABLE) {
+
+    cout << "monmap:persistent:";
+    m.persistent_features.print_with_value(cout);
+    cout << std::endl;
+    cout << "monmap:optional:";
+    m.optional_features.print_with_value(cout);
+    cout << std::endl;
+    cout << "monmap:required:";
+    m.get_required_features().print_with_value(cout);
+    cout << std::endl;
+    cout << "available:supported:";
+    ceph::features::mon::get_supported().print_with_value(cout);
+    cout << std::endl;
+    cout << "available:persistent:";
+    ceph::features::mon::get_persistent().print_with_value(cout);
+    cout << std::endl;
+  }
+}
+
+// Apply every queued feature operation to monmap 'm'.  Returns true iff the
+// map was modified (at least one set/unset was applied); OP_LIST alone does
+// not mark the map modified, by design.
+bool handle_features(list<feature_op_t>& lst, MonMap &m)
+{
+  if (lst.empty())
+    return false;
+
+  bool modified = false;
+
+  for (auto &f : lst) {
+    if (f.op == feature_op_t::op_t::OP_LIST) {
+      features_list(f, m);
+    } else if (f.op == feature_op_t::op_t::OP_SET ||
+               f.op == feature_op_t::op_t::OP_UNSET) {
+
+      modified = true;
+
+      // select the target feature set by reference, so set/unset below
+      // mutates the monmap in place
+      mon_feature_t &target =
+        ( f.type == feature_op_t::type_t::OPTIONAL ?
+          m.optional_features : m.persistent_features );
+
+      if (f.op == feature_op_t::op_t::OP_SET) {
+        target.set_feature(f.feature);
+      } else {
+        target.unset_feature(f.feature);
+      }
+    } else {
+      cerr << "unknown feature operation type '" << f.op << "'" << std::endl;
+    }
+  }
+  return modified;
+}
+
+// monmaptool entry point: parse flags, load (or create) the monmap at
+// <mapfilename>, apply the requested mutations, and write it back if
+// anything changed.
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+
+  const char *me = argv[0];
+
+  std::string fn;
+  bool print = false;
+  bool create = false;
+  bool enable_all_features = false;
+  bool clobber = false;
+  bool modified = false;
+  bool show_features = false;
+  bool generate = false;
+  bool filter = false;
+  ceph_release_t min_mon_release{0};  // 0 == ceph_release_t::unknown
+  map<string,entity_addr_t> add;
+  map<string,entity_addrvec_t> addv;
+  list<string> rm;
+  list<feature_op_t> features;
+
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+  std::string val;
+  // Option-parsing phase: recognized flags are consumed (erased) from
+  // 'args'; whatever is left over is treated as the monmap filename below.
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) {
+      print = true;
+    } else if (ceph_argparse_flag(args, i, "--create", (char*)NULL)) {
+      create = true;
+    } else if (ceph_argparse_flag(args, i, "--enable-all-features", (char*)NULL)) {
+      enable_all_features = true;
+    } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) {
+      clobber = true;
+    } else if (ceph_argparse_flag(args, i, "--generate", (char*)NULL)) {
+      generate = true;
+    } else if (ceph_argparse_flag(args, i, "--set-initial-members", (char*)NULL)) {
+      filter = true;
+    } else if (ceph_argparse_witharg(args, i, &val, "--set-min-mon-release",
+                                     (char*)NULL)) {
+      min_mon_release = ceph_release_from_name(val);
+    } else if (ceph_argparse_flag(args, i, "--add", (char*)NULL)) {
+      // NOTE(review): if --add is the last token, *i dereferences
+      // args.end() before the end check below — latent UB; same applies
+      // to --addv and --feature-list.
+      string name = *i;
+      i = args.erase(i);
+      if (i == args.end())
+        helpful_exit();
+      entity_addr_t addr;
+      if (!addr.parse(string_view{*i})) {
+        // Either we couldn't parse the address or we didn't consume the entire token
+        cerr << me << ": invalid ip:port '" << *i << "'" << std::endl;
+        return -1;
+      }
+      add[name] = addr;
+      modified = true;
+      i = args.erase(i);
+    } else if (ceph_argparse_flag(args, i, "--addv", (char*)NULL)) {
+      string name = *i;
+      i = args.erase(i);
+      if (i == args.end())
+        helpful_exit();
+      entity_addrvec_t addrs;
+      if (!addrs.parse(*i)) {
+        cerr << me << ": invalid ip:port '" << *i << "'" << std::endl;
+        return -1;
+      }
+      addv[name] = addrs;
+      modified = true;
+      i = args.erase(i);
+    } else if (ceph_argparse_witharg(args, i, &val, "--rm", (char*)NULL)) {
+      rm.push_back(val);
+      modified = true;
+    } else if (ceph_argparse_flag(args, i, "--feature-list", (char*)NULL)) {
+      // optional trailing format token; anything else means "plain"
+      string format = *i;
+      if (format == "plain" || format == "parseable") {
+        i = args.erase(i);
+      } else {
+        format = "plain";
+      }
+
+      feature_op_t f(feature_op_t::op_t::OP_LIST,
+                     feature_op_t::type_t::PLAIN);
+
+      if (format == "parseable") {
+        f.type = feature_op_t::type_t::PARSEABLE;
+      } else if (format != "plain") {
+        // NOTE(review): unreachable ('format' is forced to "plain" above),
+        // and 'val' is stale here — should report 'format' if ever reached.
+        cerr << "invalid format type for list: '" << val << "'" << std::endl;
+        helpful_exit();
+      }
+
+      features.push_back(f);
+      show_features = true;
+    } else if (ceph_argparse_witharg(args, i, &val,
+                                     "--feature-set", (char*)NULL)) {
+      // parse value
+      feature_op_t f(feature_op_t::op_t::OP_SET);
+      if (!f.parse_value(val, &cerr)) {
+        helpful_exit();
+      }
+      features.push_back(f);
+
+    } else if (ceph_argparse_witharg(args, i, &val,
+                                     "--feature-unset", (char*)NULL)) {
+      // parse value
+      feature_op_t f(feature_op_t::op_t::OP_UNSET);
+      if (!f.parse_value(val, &cerr)) {
+        helpful_exit();
+      }
+      features.push_back(f);
+    } else if (ceph_argparse_flag(args, i, "--optional", (char*)NULL)) {
+      // --optional/--persistent qualify the most recent feature op
+      if (features.empty()) {
+        helpful_exit();
+      }
+      features.back().set_optional();
+    } else if (ceph_argparse_flag(args, i, "--persistent", (char*)NULL)) {
+      if (features.empty()) {
+        helpful_exit();
+      }
+      features.back().set_persistent();
+    } else {
+      ++i;
+    }
+  }
+  if (args.empty()) {
+    cerr << me << ": must specify monmap filename" << std::endl;
+    helpful_exit();
+  }
+  else if (args.size() > 1) {
+    cerr << me << ": too many arguments" << std::endl;
+    helpful_exit();
+  }
+  fn = args[0];
+
+  MonMap monmap;
+
+  cout << me << ": monmap file " << fn << std::endl;
+
+  // Load the existing map unless we are clobbering it with a fresh one.
+  int r = 0;
+  if (!(create && clobber)) {
+    try {
+      r = monmap.read(fn.c_str());
+    } catch (...) {
+      cerr << me << ": unable to read monmap file" << std::endl;
+      return -1;
+    }
+  }
+
+  if (!create && r < 0) {
+    cerr << me << ": couldn't open " << fn << ": " << cpp_strerror(r) << std::endl;
+    return -1;
+  }
+  else if (create && !clobber && r == 0) {
+    cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl;
+    return -1;
+  }
+
+  if (create) {
+    monmap.epoch = 0;
+    monmap.created = ceph_clock_now();
+    monmap.last_changed = monmap.created;
+    srand(getpid() + time(0));
+    if (g_conf().get_val<uuid_d>("fsid").is_zero()) {
+      monmap.generate_fsid();
+      cout << me << ": generated fsid " << monmap.fsid << std::endl;
+    }
+    monmap.strategy = static_cast<MonMap::election_strategy>(
+      g_conf().get_val<uint64_t>("mon_election_default_strategy"));
+    // TODO: why do we not use build_initial in our normal path here!?!?!
+    modified = true;
+  }
+  if (enable_all_features) {
+    // populate persistent features, too
+    monmap.persistent_features = ceph::features::mon::get_persistent();
+    modified = true;
+  }
+
+  if (generate) {
+    int r = monmap.build_initial(g_ceph_context, true, cerr);
+    if (r < 0)
+      return r;
+  }
+
+  if (min_mon_release != ceph_release_t::unknown) {
+    monmap.min_mon_release = min_mon_release;
+    cout << "setting min_mon_release = " << min_mon_release << std::endl;
+    modified = true;
+  }
+
+  if (filter) {
+    // apply initial members
+    list<string> initial_members;
+    get_str_list(g_conf()->mon_initial_members, initial_members);
+    if (!initial_members.empty()) {
+      cout << "initial_members " << initial_members << ", filtering seed monmap" << std::endl;
+      set<entity_addrvec_t> removed;
+      monmap.set_initial_members(g_ceph_context, initial_members,
+                                 string(), entity_addrvec_t(),
+                                 &removed);
+      cout << "removed " << removed << std::endl;
+    }
+    modified = true;
+  }
+
+  if (!g_conf().get_val<uuid_d>("fsid").is_zero()) {
+    monmap.fsid = g_conf().get_val<uuid_d>("fsid");
+    cout << me << ": set fsid to " << monmap.fsid << std::endl;
+    modified = true;
+  }
+
+  // Apply --add entries, expanding a bare/legacy address into the proper
+  // v2+v1 (or legacy-only) address vector depending on monmap features.
+  for (auto& p : add) {
+    entity_addr_t addr = p.second;
+    entity_addrvec_t addrs;
+    if (monmap.contains(p.first)) {
+      cerr << me << ": map already contains mon." << p.first << std::endl;
+      helpful_exit();
+    }
+    if (addr.get_port() == 0) {
+      if (monmap.persistent_features.contains_all(
+            ceph::features::mon::FEATURE_NAUTILUS)) {
+        addr.set_type(entity_addr_t::TYPE_MSGR2);
+        addr.set_port(CEPH_MON_PORT_IANA);
+        addrs.v.push_back(addr);
+        addr.set_type(entity_addr_t::TYPE_LEGACY);
+        addr.set_port(CEPH_MON_PORT_LEGACY);
+        addrs.v.push_back(addr);
+      } else {
+        addr.set_type(entity_addr_t::TYPE_LEGACY);
+        addr.set_port(CEPH_MON_PORT_LEGACY);
+        addrs.v.push_back(addr);
+      }
+    } else if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
+      addr.set_type(entity_addr_t::TYPE_LEGACY);
+      addrs.v.push_back(addr);
+    } else {
+      if (monmap.persistent_features.contains_all(
+            ceph::features::mon::FEATURE_NAUTILUS)) {
+        addr.set_type(entity_addr_t::TYPE_MSGR2);
+      }
+      addrs.v.push_back(addr);
+    }
+    if (monmap.contains(addrs)) {
+      cerr << me << ": map already contains " << addrs << std::endl;
+      helpful_exit();
+    }
+    monmap.add(p.first, addrs);
+  }
+  // --addv entries are taken verbatim, no expansion.
+  for (auto& p : addv) {
+    if (monmap.contains(p.first)) {
+      cerr << me << ": map already contains mon." << p.first << std::endl;
+      helpful_exit();
+    }
+    if (monmap.contains(p.second)) {
+      cerr << me << ": map already contains " << p.second << std::endl;
+      helpful_exit();
+    }
+    monmap.add(p.first, p.second);
+  }
+  for (auto& p : rm) {
+    cout << me << ": removing " << p << std::endl;
+    if (!monmap.contains(p)) {
+      cerr << me << ": map does not contain " << p << std::endl;
+      helpful_exit();
+    }
+    monmap.remove(p);
+  }
+
+  if (handle_features(features, monmap)) {
+    modified = true;
+  }
+
+  if (!print && !modified && !show_features) {
+    cerr << "no action specified" << std::endl;
+    helpful_exit();
+  }
+
+  if (print)
+    monmap.print(cout);
+
+  if (modified) {
+    // write it out
+    cout << me << ": writing epoch " << monmap.epoch
+         << " to " << fn
+         << " (" << monmap.size() << " monitors)"
+         << std::endl;
+    int r = monmap.write(fn.c_str());
+    if (r < 0) {
+      cerr << "monmaptool: error writing to '" << fn << "': " << cpp_strerror(r) << std::endl;
+      return 1;
+    }
+  }
+
+
+  return 0;
+}
diff --git a/src/tools/neorados.cc b/src/tools/neorados.cc
new file mode 100644
index 000000000..516dfbce7
--- /dev/null
+++ b/src/tools/neorados.cc
@@ -0,0 +1,385 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#define BOOST_COROUTINES_NO_DEPRECATION_WARNING
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include <boost/asio.hpp>
+#include <boost/io/ios_state.hpp>
+#include <boost/program_options.hpp>
+#include <boost/system/system_error.hpp>
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include <spawn/spawn.hpp>
+
+#include "include/buffer.h" // :(
+
+#include "include/neorados/RADOS.hpp"
+
+using namespace std::literals;
+
+namespace ba = boost::asio;
+namespace bs = boost::system;
+namespace R = neorados;
+namespace s = spawn;
+
+// Render a (major, minor, patch) version tuple as "vX.Y.Z".
+std::string verstr(const std::tuple<uint32_t, uint32_t, uint32_t>& v)
+{
+  const auto [maj, min, p] = v;
+  return fmt::format("v{}.{}.{}", maj, min, p);
+}
+
+// Print each element of sequence 'v' to stream 'm', one per line.
+template<typename V>
+void printseq(const V& v, std::ostream& m)
+{
+  std::for_each(v.cbegin(), v.cend(),
+                [&m](const auto& e) {
+                  fmt::print(m, "{}\n", e);
+                });
+}
+
+// Like printseq above, but prints f(e) for each element — 'f' projects the
+// element into the value to display.
+template<typename V, typename F>
+void printseq(const V& v, std::ostream& m, F&& f)
+{
+  std::for_each(v.cbegin(), v.cend(),
+                [&m, &f](const auto& e) {
+                  fmt::print(m, "{}\n", f(e));
+                });
+}
+
+// Resolve a pool name to its numeric pool id; throws bs::system_error with
+// context on failure.
+std::int64_t lookup_pool(R::RADOS& r, const std::string& pname,
+                         s::yield_context y)
+{
+  bs::error_code ec;
+  auto p = r.lookup_pool(pname, y[ec]);
+  if (ec)
+    throw bs::system_error(
+      ec, fmt::format("when looking up '{}'", pname));
+  return p;
+}
+
+
+// 'lspools' command: print the name of every pool, one per line.  The
+// parameter vector is unused (arity 0).
+void lspools(R::RADOS& r, const std::vector<std::string>&,
+             s::yield_context y)
+{
+  const auto l = r.list_pools(y);
+  // list_pools yields (id, name) pairs; print only the name
+  printseq(l, std::cout, [](const auto& p) -> const std::string& {
+    return p.second;
+  });
+}
+
+
+// 'ls' command: enumerate every object in POOL (p[0]) across all
+// namespaces, fetching up to 1000 entries per round trip.
+void ls(R::RADOS& r, const std::vector<std::string>& p, s::yield_context y)
+{
+  const auto& pname = p[0];
+  const auto pool = lookup_pool(r, pname, y);
+
+  std::vector<R::Entry> ls;
+  R::Cursor next = R::Cursor::begin();
+  bs::error_code ec;
+  do {
+    std::tie(ls, next) = r.enumerate_objects(pool, next, R::Cursor::end(),
+                                             1000, {}, y[ec], R::all_nspaces);
+    if (ec)
+      throw bs::system_error(ec, fmt::format("when listing {}", pname));
+    printseq(ls, std::cout);
+    ls.clear();
+  } while (next != R::Cursor::end());
+}
+
+// 'mkpool' command: create pool p[0] with default parameters.
+void mkpool(R::RADOS& r, const std::vector<std::string>& p,
+            s::yield_context y)
+{
+  const auto& pname = p[0];
+  bs::error_code ec;
+  r.create_pool(pname, std::nullopt, y[ec]);
+  if (ec)
+    throw bs::system_error(ec, fmt::format("when creating pool '{}'", pname));
+}
+
+// 'rmpool' command: delete pool p[0].
+void rmpool(R::RADOS& r, const std::vector<std::string>& p,
+            s::yield_context y)
+{
+  const auto& pname = p[0];
+  bs::error_code ec;
+  r.delete_pool(pname, y[ec]);
+  if (ec)
+    throw bs::system_error(ec, fmt::format("when removing pool '{}'", pname));
+}
+
+// 'create' command: exclusively create OBJECT (p[1]) in POOL (p[0]);
+// fails if the object already exists.
+void create(R::RADOS& r, const std::vector<std::string>& p,
+            s::yield_context y)
+{
+  const auto& pname = p[0];
+  const R::Object obj = p[1];
+  const auto pool = lookup_pool(r, pname, y);
+
+  bs::error_code ec;
+  R::WriteOp op;
+  op.create(true); // true => exclusive create
+  r.execute(obj, pool, std::move(op), y[ec]);
+  if (ec)
+    throw bs::system_error(ec,
+                           fmt::format(
+                             "when creating object '{}' in pool '{}'",
+                             obj, pname));
+}
+
+// Chunk size used by the write/read streaming commands: 4 MiB.
+inline constexpr std::size_t io_size = 4 << 20;
+
+// 'write' command: stream standard input into OBJECT (p[1]) in POOL (p[0])
+// in io_size chunks, at sequential offsets.
+void write(R::RADOS& r, const std::vector<std::string>& p, s::yield_context y)
+{
+  const auto& pname = p[0];
+  const R::Object obj(p[1]);
+  const auto pool = lookup_pool(r, pname, y);
+
+  bs::error_code ec;
+  std::unique_ptr<char[]> buf = std::make_unique<char[]>(io_size);
+  std::size_t off = 0;
+  // restore std::cin's exception mask on scope exit
+  boost::io::ios_exception_saver ies(std::cin);
+
+  std::cin.exceptions(std::istream::badbit);
+  std::cin.clear();
+
+  while (!std::cin.eof()) {
+    auto curoff = off;
+    std::cin.read(buf.get(), io_size);
+    auto len = std::cin.gcount();
+    off += len;
+    if (len == 0)
+      break; // Nothin' to do.
+
+    ceph::buffer::list bl;
+    // create_static wraps 'buf' without copying; safe to reuse the buffer
+    // next iteration because execute() suspends this coroutine until the
+    // write completes.
+    bl.append(buffer::create_static(len, buf.get()));
+    R::WriteOp op;
+    op.write(curoff, std::move(bl));
+    r.execute(obj, pool, std::move(op), y[ec]);
+
+    if (ec)
+      throw bs::system_error(ec, fmt::format(
+        "when writing object '{}' in pool '{}'",
+        obj, pname));
+  }
+}
+
+// 'read' command: stream the contents of OBJECT (p[1]) in POOL (p[0]) to
+// standard output, in chunks of at most io_size bytes.
+void read(R::RADOS& r, const std::vector<std::string>& p, s::yield_context y)
+{
+  const auto& pname = p[0];
+  const R::Object obj(p[1]);
+  const auto pool = lookup_pool(r, pname, y);
+
+  bs::error_code ec;
+  std::uint64_t len;
+  {
+    // stat first so we know how many bytes to fetch
+    R::ReadOp op;
+    op.stat(&len, nullptr);
+    r.execute(obj, pool, std::move(op),
+              nullptr, y[ec]);
+    if (ec)
+      throw bs::system_error(
+        ec,
+        fmt::format("when getting length of object '{}' in pool '{}'",
+                    obj, pname));
+  }
+
+  std::size_t off = 0;
+  ceph::buffer::list bl;
+  // BUGFIX: cap each chunk with std::min.  The original used
+  // std::max(len - off, io_size), which never evaluates to zero, so the
+  // loop could not terminate once off reached len (and spun forever on a
+  // zero-length object).
+  while (auto toread = std::min(len - off, io_size)) {
+    R::ReadOp op;
+    op.read(off, toread, &bl);
+    r.execute(obj, pool, std::move(op), nullptr, y[ec]);
+    if (ec)
+      throw bs::system_error(
+        ec,
+        // BUGFIX: report the pool *name* (pname), not the numeric pool id,
+        // for consistency with every other error message in this tool.
+        fmt::format("when reading from object '{}' in pool '{}'",
+                    obj, pname));
+
+    off += bl.length();
+    bl.write_stream(std::cout);
+    bl.clear();
+  }
+}
+
+// 'rm' command: remove OBJECT (p[1]) from POOL (p[0]).
+void rm(R::RADOS& r, const std::vector<std::string>& p, s::yield_context y)
+{
+  const auto& pname = p[0];
+  const R::Object obj = p[1];
+  const auto pool = lookup_pool(r, pname, y);
+
+  bs::error_code ec;
+  R::WriteOp op;
+  op.remove();
+  r.execute(obj, pool, std::move(op), y[ec]);
+  if (ec)
+    throw bs::system_error(ec, fmt::format(
+      "when removing object '{}' in pool '{}'",
+      obj, pname));
+}
+
+// Version of this tool (distinct from the RADOS library version).
+static constexpr auto version = std::make_tuple(0ul, 0ul, 1ul);
+
+// Signature shared by every command handler.
+using cmdfunc = void (*)(R::RADOS& r, const std::vector<std::string>& p,
+                         s::yield_context);
+
+// One dispatch-table entry: command name, required argument count, handler,
+// and the usage/description strings shown by --help.
+struct cmdesc {
+  std::string_view name;
+  std::size_t arity;
+  cmdfunc f;
+  std::string_view usage;
+  std::string_view desc;
+};
+
+// The command dispatch table, searched linearly by name in main().
+const std::array commands = {
+  // Pools operations ;)
+
+  cmdesc{ "lspools"sv,
+          0, &lspools,
+          ""sv,
+          "List all pools"sv },
+
+  // Pool operations
+
+  cmdesc{ "ls"sv,
+          1, &ls,
+          "POOL"sv,
+          "list all objects in POOL"sv },
+  cmdesc{ "mkpool"sv,
+          1, &mkpool,
+          "POOL"sv,
+          "create POOL"sv },
+  cmdesc{ "rmpool"sv,
+          1, &rmpool,
+          "POOL"sv,
+          "remove POOL"sv },
+
+  // Object operations
+
+  cmdesc{ "create"sv,
+          2, &create,
+          "POOL OBJECT"sv,
+          "exclusively create OBJECT in POOL"sv },
+  cmdesc{ "write"sv,
+          2, &write,
+          "POOL OBJECT"sv,
+          "write to OBJECT in POOL from standard input"sv },
+  cmdesc{ "read"sv,
+          2, &read,
+          "POOL OBJECT"sv,
+          "read contents of OBJECT in POOL to standard out"sv },
+  cmdesc{ "rm"sv,
+          2, &rm,
+          "POOL OBJECT"sv,
+          "remove OBJECT in POOL"sv }
+};
+
+// neorados tool entry point: parse options, look up the requested command
+// in the dispatch table, and run its handler inside a stackful coroutine on
+// a boost::asio io_context.
+int main(int argc, char* argv[])
+{
+  const std::string_view prog(argv[0]);
+  std::string command;
+  namespace po = boost::program_options;
+  try {
+    std::vector<std::string> parameters;
+
+    po::options_description desc(fmt::format("{} options", prog));
+    desc.add_options()
+      ("help", "show help")
+      ("version", "show version")
+      ("command", po::value<std::string>(&command), "the operation to perform")
+      ("parameters", po::value<std::vector<std::string>>(&parameters),
+       "parameters to the command");
+
+    // positional args: first token is the command, the rest its parameters
+    po::positional_options_description p;
+    p.add("command", 1);
+    p.add("parameters", -1);
+
+    po::variables_map vm;
+
+    po::store(po::command_line_parser(argc, argv).
+              options(desc).positional(p).run(), vm);
+
+    po::notify(vm);
+
+    if (vm.count("help")) {
+      fmt::print("{}", desc);
+      fmt::print("Commands:\n");
+      for (const auto& cmd : commands) {
+        // crude column alignment: short name+usage pairs get two tabs
+        fmt::print(" {} {}{}{}\n",
+                   cmd.name, cmd.usage,
+                   cmd.name.length() + cmd.usage.length() < 13 ?
+                   "\t\t"sv : "\t"sv,
+                   cmd.desc);
+      }
+      return 0;
+    }
+
+    if (vm.count("version")) {
+      fmt::print(
+        "{}: RADOS command exerciser, {},\n"
+        "RADOS library version {}\n"
+        "Copyright (C) 2019 Red Hat <contact@redhat.com>\n"
+        "This is free software; you can redistribute it and/or\n"
+        "modify it under the terms of the GNU Lesser General Public\n"
+        "License version 2.1, as published by the Free Software\n"
+        "Foundation. See file COPYING.\n", prog,
+        verstr(version), verstr(R::RADOS::version()));
+      return 0;
+    }
+
+    if (vm.find("command") == vm.end()) {
+      fmt::print(std::cerr, "{}: a command is required\n", prog);
+      return 1;
+    }
+
+    ba::io_context c;
+
+    // linear search of the dispatch table by command name
+    if (auto ci = std::find_if(commands.begin(), commands.end(),
+                               [&command](const cmdesc& c) {
+                                 return c.name == command;
+                               }); ci != commands.end()) {
+      if (parameters.size() < ci->arity) {
+        fmt::print(std::cerr, "{}: {}: too few arguments\n\t{} {}\n",
+                   prog, command, ci->name, ci->usage);
+        return 1;
+      }
+      if (parameters.size() > ci->arity) {
+        fmt::print(std::cerr, "{}: {}: too many arguments\n\t{} {}\n",
+                   prog, command, ci->name, ci->usage);
+        return 1;
+      }
+      // run the handler in a coroutine; c.run() below drives it
+      s::spawn(c, [&](s::yield_context y) {
+        auto r = R::RADOS::Builder{}.build(c, y);
+        ci->f(r, parameters, y);
+      });
+    } else {
+      fmt::print(std::cerr, "{}: {}: unknown command\n", prog, command);
+      return 1;
+    }
+    c.run();
+  } catch (const std::exception& e) {
+    fmt::print(std::cerr, "{}: {}: {}\n", prog, command, e.what());
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc
new file mode 100644
index 000000000..9bbe40f4d
--- /dev/null
+++ b/src/tools/osdmaptool.cc
@@ -0,0 +1,846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+#include <sys/stat.h>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/random.h"
+#include "mon/health_check.h"
+#include <time.h>
+#include <algorithm>
+
+#include "global/global_init.h"
+#include "osd/OSDMap.h"
+
+
+// Print the osdmaptool command-line reference to stdout and terminate.
+// Note: always exits with status 1 (callers rely on this to abort on
+// bad arguments as well as for --help).
+void usage()
+{
+ cout << " usage: [--print] <mapfilename>" << std::endl;
+ cout << " --create-from-conf creates an osd map with default configurations" << std::endl;
+ cout << " --createsimple <numosd> [--clobber] [--pg-bits <bitsperosd>] [--pgp-bits <bits>] creates a relatively generic OSD map with <numosd> devices" << std::endl;
+ cout << " --pgp-bits <bits> pgp_num map attribute will be shifted by <bits>" << std::endl;
+ cout << " --pg-bits <bits> pg_num map attribute will be shifted by <bits>" << std::endl;
+ cout << " --clobber allows osdmaptool to overwrite <mapfilename> if it already exists" << std::endl;
+ cout << " --export-crush <file> write osdmap's crush map to <file>" << std::endl;
+ cout << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl;
+ cout << " --health dump health checks" << std::endl;
+ cout << " --test-map-pgs [--pool <poolid>] [--pg_num <pg_num>] [--range-first <first> --range-last <last>] map all pgs" << std::endl;
+ cout << " --test-map-pgs-dump [--pool <poolid>] [--range-first <first> --range-last <last>] map all pgs" << std::endl;
+ cout << " --test-map-pgs-dump-all [--pool <poolid>] [--range-first <first> --range-last <last>] map all pgs to osds" << std::endl;
+ cout << " --mark-up-in mark osds up and in (but do not persist)" << std::endl;
+ cout << " --mark-out <osdid> mark an osd as out (but do not persist)" << std::endl;
+ cout << " --mark-up <osdid> mark an osd as up (but do not persist)" << std::endl;
+ cout << " --mark-in <osdid> mark an osd as in (but do not persist)" << std::endl;
+ cout << " --with-default-pool include default pool when creating map" << std::endl;
+ cout << " --clear-temp clear pg_temp and primary_temp" << std::endl;
+ cout << " --clean-temps clean pg_temps" << std::endl;
+ cout << " --test-random do random placements" << std::endl;
+ cout << " --test-map-pg <pgid> map a pgid to osds" << std::endl;
+ cout << " --test-map-object <objectname> [--pool <poolid>] map an object to osds"
+ << std::endl;
+ cout << " --upmap-cleanup <file> clean up pg_upmap[_items] entries, writing" << std::endl;
+ cout << " commands to <file> [default: - for stdout]" << std::endl;
+ cout << " --upmap <file> calculate pg upmap entries to balance pg layout" << std::endl;
+ cout << " writing commands to <file> [default: - for stdout]" << std::endl;
+ cout << " --upmap-max <max-count> set max upmap entries to calculate [default: 10]" << std::endl;
+ cout << " --upmap-deviation <max-deviation>" << std::endl;
+ cout << " max deviation from target [default: 5]" << std::endl;
+ cout << " --upmap-pool <poolname> restrict upmap balancing to 1 or more pools" << std::endl;
+ cout << " --upmap-active Act like an active balancer, keep applying changes until balanced" << std::endl;
+ cout << " --dump <format> displays the map in plain text when <format> is 'plain', 'json' if specified format is not supported" << std::endl;
+ cout << " --tree displays a tree of the map" << std::endl;
+ cout << " --test-crush [--range-first <first> --range-last <last>] map pgs to acting osds" << std::endl;
+ cout << " --adjust-crush-weight <osdid:weight>[,<osdid:weight>,<...>] change <osdid> CRUSH <weight> (but do not persist)" << std::endl;
+ cout << " --save write modified osdmap with upmap or crush-adjust changes" << std::endl;
+ exit(1);
+}
+
+// Render the pg-upmap changes held in 'pending_inc' as a sequence of
+// "ceph osd ..." CLI commands and write them to file descriptor 'fd'
+// (stdout or a file opened by the caller). Exits the process on a
+// write error; does not close 'fd'.
+void print_inc_upmaps(const OSDMap::Incremental& pending_inc, int fd)
+{
+ ostringstream ss;
+ // removals of whole-PG remappings
+ for (auto& i : pending_inc.old_pg_upmap) {
+ ss << "ceph osd rm-pg-upmap " << i << std::endl;
+ }
+ // new whole-PG remappings: pgid followed by the target OSD list
+ for (auto& i : pending_inc.new_pg_upmap) {
+ ss << "ceph osd pg-upmap " << i.first;
+ for (auto osd : i.second) {
+ ss << " " << osd;
+ }
+ ss << std::endl;
+ }
+ // removals of per-item (from,to) remappings
+ for (auto& i : pending_inc.old_pg_upmap_items) {
+ ss << "ceph osd rm-pg-upmap-items " << i << std::endl;
+ }
+ // new per-item remappings: pgid followed by from/to OSD pairs
+ for (auto& i : pending_inc.new_pg_upmap_items) {
+ ss << "ceph osd pg-upmap-items " << i.first;
+ for (auto p : i.second) {
+ ss << " " << p.first << " " << p.second;
+ }
+ ss << std::endl;
+ }
+ string s = ss.str();
+ // single write of the whole buffer; safe_write retries short writes
+ int r = safe_write(fd, s.c_str(), s.size());
+ if (r < 0) {
+ cerr << "error writing output: " << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ const char *me = argv[0];
+
+ std::string fn;
+ bool print = false;
+ boost::scoped_ptr<Formatter> print_formatter;
+ bool tree = false;
+ boost::scoped_ptr<Formatter> tree_formatter;
+ bool createsimple = false;
+ bool createpool = false;
+ bool create_from_conf = false;
+ int num_osd = 0;
+ int pg_bits = 6;
+ int pgp_bits = 6;
+ bool clobber = false;
+ bool modified = false;
+ std::string export_crush, import_crush, test_map_pg, test_map_object, adjust_crush_weight;
+ bool test_crush = false;
+ int range_first = -1;
+ int range_last = -1;
+ int pool = -1;
+ bool mark_up_in = false;
+ int marked_out = -1;
+ int marked_up = -1;
+ int marked_in = -1;
+ bool clear_temp = false;
+ bool clean_temps = false;
+ bool test_map_pgs = false;
+ bool test_map_pgs_dump = false;
+ bool test_random = false;
+ bool upmap_cleanup = false;
+ bool upmap = false;
+ bool health = false;
+ std::string upmap_file = "-";
+ int upmap_max = 10;
+ int upmap_deviation = 5;
+ bool upmap_active = false;
+ std::set<std::string> upmap_pools;
+ int64_t pg_num = -1;
+ bool test_map_pgs_dump_all = false;
+ bool save = false;
+
+ std::string val;
+ std::ostringstream err;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) {
+ print = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--dump", (char*)NULL)) {
+ print = true;
+ if (!val.empty() && val != "plain") {
+ print_formatter.reset(Formatter::create(val, "", "json"));
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--tree", (char*)NULL)) {
+ tree = true;
+ if (!val.empty() && val != "plain") {
+ tree_formatter.reset(Formatter::create(val, "", "json"));
+ }
+ } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--osd-pg-bits", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--osd-pgp-bits", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap-cleanup", (char*)NULL)) {
+ upmap_cleanup = true;
+ } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap", (char*)NULL)) {
+ upmap_cleanup = true;
+ upmap = true;
+ } else if (ceph_argparse_witharg(args, i, &upmap_max, err, "--upmap-max", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &upmap_deviation, err, "--upmap-deviation", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &val, "--upmap-pool", (char*)NULL)) {
+ upmap_pools.insert(val);
+ } else if (ceph_argparse_witharg(args, i, &num_osd, err, "--createsimple", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ createsimple = true;
+ } else if (ceph_argparse_flag(args, i, "--upmap-active", (char*)NULL)) {
+ upmap_active = true;
+ } else if (ceph_argparse_flag(args, i, "--health", (char*)NULL)) {
+ health = true;
+ } else if (ceph_argparse_flag(args, i, "--with-default-pool", (char*)NULL)) {
+ createpool = true;
+ } else if (ceph_argparse_flag(args, i, "--create-from-conf", (char*)NULL)) {
+ create_from_conf = true;
+ } else if (ceph_argparse_flag(args, i, "--mark-up-in", (char*)NULL)) {
+ mark_up_in = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--mark-out", (char*)NULL)) {
+ marked_out = std::stoi(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--mark-up", (char*)NULL)) {
+ // OSD ids are integers: parse with stoi like --mark-out does.
+ // (previously std::stod, which silently truncated e.g. "2.9" to 2)
+ marked_up = std::stoi(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--mark-in", (char*)NULL)) {
+ marked_in = std::stoi(val);
+ } else if (ceph_argparse_flag(args, i, "--clear-temp", (char*)NULL)) {
+ clear_temp = true;
+ } else if (ceph_argparse_flag(args, i, "--clean-temps", (char*)NULL)) {
+ clean_temps = true;
+ } else if (ceph_argparse_flag(args, i, "--test-map-pgs", (char*)NULL)) {
+ test_map_pgs = true;
+ } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump", (char*)NULL)) {
+ test_map_pgs_dump = true;
+ } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump-all", (char*)NULL)) {
+ test_map_pgs_dump_all = true;
+ } else if (ceph_argparse_flag(args, i, "--test-random", (char*)NULL)) {
+ test_random = true;
+ } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) {
+ clobber = true;
+ } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--pg_bits", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--pgp_bits", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--export_crush", (char*)NULL)) {
+ export_crush = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--import_crush", (char*)NULL)) {
+ import_crush = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--test_map_pg", (char*)NULL)) {
+ test_map_pg = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--test_map_object", (char*)NULL)) {
+ test_map_object = val;
+ } else if (ceph_argparse_flag(args, i, "--test_crush", (char*)NULL)) {
+ test_crush = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--pg_num", (char*)NULL)) {
+ string interr;
+ pg_num = strict_strtoll(val.c_str(), 10, &interr);
+ if (interr.length() > 0) {
+ cerr << "error parsing integer value " << interr << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &range_first, err, "--range_first", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &range_last, err, "--range_last", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &pool, err, "--pool", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--adjust-crush-weight", (char*)NULL)) {
+ adjust_crush_weight = val;
+ } else if (ceph_argparse_flag(args, i, "--save", (char*)NULL)) {
+ save = true;
+ } else {
+ ++i;
+ }
+ }
+ if (args.empty()) {
+ cerr << me << ": must specify osdmap filename" << std::endl;
+ usage();
+ }
+ else if (args.size() > 1) {
+ cerr << me << ": too many arguments" << std::endl;
+ usage();
+ }
+ if (upmap_deviation < 1) {
+ cerr << me << ": upmap-deviation must be >= 1" << std::endl;
+ usage();
+ }
+ fn = args[0];
+
+ if (range_first >= 0 && range_last >= 0) {
+ set<OSDMap*> maps;
+ OSDMap *prev = NULL;
+ for (int i=range_first; i <= range_last; i++) {
+ ostringstream f;
+ f << fn << "/" << i;
+ bufferlist bl;
+ string error, s = f.str();
+ int r = bl.read_file(s.c_str(), &error);
+ if (r < 0) {
+ cerr << "unable to read " << s << ": " << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
+ cout << s << " got " << bl.length() << " bytes" << std::endl;
+ OSDMap *o = new OSDMap;
+ o->decode(bl);
+ maps.insert(o);
+ if (prev)
+ OSDMap::dedup(prev, o);
+ prev = o;
+ }
+ exit(0);
+ }
+
+ OSDMap osdmap;
+ bufferlist bl;
+
+ cerr << me << ": osdmap file '" << fn << "'" << std::endl;
+
+ int r = 0;
+ struct stat st;
+ if (!createsimple && !create_from_conf && !clobber) {
+ std::string error;
+ r = bl.read_file(fn.c_str(), &error);
+ if (r == 0) {
+ try {
+ osdmap.decode(bl);
+ }
+ catch (const buffer::error &e) {
+ cerr << me << ": error decoding osdmap '" << fn << "'" << std::endl;
+ return -1;
+ }
+ }
+ else {
+ cerr << me << ": couldn't open " << fn << ": " << error << std::endl;
+ return -1;
+ }
+ }
+ else if ((createsimple || create_from_conf) && !clobber && ::stat(fn.c_str(), &st) == 0) {
+ cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl;
+ return -1;
+ }
+
+ if (createsimple || create_from_conf) {
+ if (createsimple) {
+ if (num_osd < 1) {
+ cerr << me << ": osd count must be > 0" << std::endl;
+ exit(1);
+ }
+ } else {
+ num_osd = -1;
+ }
+ uuid_d fsid;
+ if (createpool) {
+ osdmap.build_simple_with_pool(
+ g_ceph_context, 0, fsid, num_osd, pg_bits, pgp_bits);
+ } else {
+ osdmap.build_simple(g_ceph_context, 0, fsid, num_osd);
+ }
+ modified = true;
+ }
+
+ if (mark_up_in) {
+ cout << "marking all OSDs up and in" << std::endl;
+ int n = osdmap.get_max_osd();
+ for (int i=0; i<n; i++) {
+ osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP);
+ osdmap.set_weight(i, CEPH_OSD_IN);
+ if (osdmap.crush->get_item_weight(i) == 0 ) {
+ osdmap.crush->adjust_item_weightf(g_ceph_context, i, 1.0);
+ }
+ }
+ }
+
+ if (marked_out >=0 && marked_out < osdmap.get_max_osd()) {
+ cout << "marking OSD@" << marked_out << " as out" << std::endl;
+ int id = marked_out;
+ osdmap.set_state(id, osdmap.get_state(id) | CEPH_OSD_UP);
+ osdmap.set_weight(id, CEPH_OSD_OUT);
+ }
+
+ if (marked_up >=0 && marked_up < osdmap.get_max_osd()) {
+ cout << "marking OSD@" << marked_up << " as up" << std::endl;
+ int id = marked_up;
+ osdmap.set_state(id, osdmap.get_state(id) | CEPH_OSD_UP);
+ }
+
+ // Fixed copy-paste bug: this branch previously logged and operated on
+ // marked_up, so --mark-in marked the wrong OSD and printed "as up".
+ if (marked_in >=0 && marked_in < osdmap.get_max_osd()) {
+ cout << "marking OSD@" << marked_in << " as in" << std::endl;
+ int id = marked_in;
+ osdmap.set_weight(id, CEPH_OSD_IN);
+ }
+
+ for_each_substr(adjust_crush_weight, ",", [&](auto osd_to_adjust) {
+ std::string_view osd_to_weight_delimiter{":"};
+ size_t pos = osd_to_adjust.find(osd_to_weight_delimiter);
+ if (pos == osd_to_adjust.npos) {
+ cerr << me << ": use ':' as separator of osd id and its weight"
+ << std::endl;
+ usage();
+ }
+ int osd_id = std::stoi(string(osd_to_adjust.substr(0, pos)));
+ float new_weight = std::stof(string(osd_to_adjust.substr(pos + 1)));
+ osdmap.crush->adjust_item_weightf(g_ceph_context, osd_id, new_weight);
+ std::cout << "Adjusted osd." << osd_id << " CRUSH weight to " << new_weight
+ << std::endl;
+ if (save) {
+ OSDMap::Incremental inc;
+ inc.fsid = osdmap.get_fsid();
+ inc.epoch = osdmap.get_epoch() + 1;
+ osdmap.apply_incremental(inc);
+ modified = true;
+ }
+ });
+
+ if (clear_temp) {
+ cout << "clearing pg/primary temp" << std::endl;
+ osdmap.clear_temp();
+ }
+ if (clean_temps) {
+ cout << "cleaning pg temps" << std::endl;
+ OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
+ OSDMap tmpmap;
+ tmpmap.deepish_copy_from(osdmap);
+ tmpmap.apply_incremental(pending_inc);
+ OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
+ }
+ int upmap_fd = STDOUT_FILENO;
+ if (upmap || upmap_cleanup) {
+ if (upmap_file != "-") {
+ upmap_fd = ::open(upmap_file.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
+ if (upmap_fd < 0) {
+ cerr << "error opening " << upmap_file << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(1);
+ }
+ cout << "writing upmap command output to: " << upmap_file << std::endl;
+ }
+ }
+ if (upmap_cleanup) {
+ cout << "checking for upmap cleanups" << std::endl;
+ OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
+ pending_inc.fsid = osdmap.get_fsid();
+ int r = osdmap.clean_pg_upmaps(g_ceph_context, &pending_inc);
+ if (r > 0) {
+ print_inc_upmaps(pending_inc, upmap_fd);
+ r = osdmap.apply_incremental(pending_inc);
+ ceph_assert(r == 0);
+ }
+ }
+ if (upmap) {
+ cout << "upmap, max-count " << upmap_max
+ << ", max deviation " << upmap_deviation
+ << std::endl;
+ vector<int64_t> pools;
+ set<int64_t> upmap_pool_nums;
+ for (auto& s : upmap_pools) {
+ int64_t p = osdmap.lookup_pg_pool_name(s);
+ if (p < 0) {
+ cerr << " pool " << s << " does not exist" << std::endl;
+ exit(1);
+ }
+ pools.push_back(p);
+ upmap_pool_nums.insert(p);
+ }
+ if (!pools.empty()) {
+ cout << " limiting to pools " << upmap_pools << " (" << pools << ")"
+ << std::endl;
+ } else {
+ mempool::osdmap::map<int64_t,pg_pool_t> opools = osdmap.get_pools();
+ for (auto& i : opools) {
+ pools.push_back(i.first);
+ }
+ }
+ if (pools.empty()) {
+ cout << "No pools available" << std::endl;
+ goto skip_upmap;
+ }
+ int rounds = 0;
+ struct timespec round_start;
+ int r = clock_gettime(CLOCK_MONOTONIC, &round_start);
+ assert(r == 0);
+ do {
+ random_device_t rd;
+ std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()});
+ cout << "pools ";
+ for (auto& i: pools)
+ cout << osdmap.get_pool_name(i) << " ";
+ cout << std::endl;
+ OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
+ pending_inc.fsid = osdmap.get_fsid();
+ int total_did = 0;
+ int left = upmap_max;
+ struct timespec begin, end;
+ r = clock_gettime(CLOCK_MONOTONIC, &begin);
+ assert(r == 0);
+ for (auto& i: pools) {
+ set<int64_t> one_pool;
+ one_pool.insert(i);
+ int did = osdmap.calc_pg_upmaps(
+ g_ceph_context, upmap_deviation,
+ left, one_pool,
+ &pending_inc);
+ total_did += did;
+ left -= did;
+ if (left <= 0)
+ break;
+ }
+ r = clock_gettime(CLOCK_MONOTONIC, &end);
+ assert(r == 0);
+ cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl;
+ float elapsed_time = (end.tv_sec - begin.tv_sec) + 1.0e-9*(end.tv_nsec - begin.tv_nsec);
+ if (upmap_active)
+ cout << "Time elapsed " << elapsed_time << " secs" << std::endl;
+ if (total_did > 0) {
+ print_inc_upmaps(pending_inc, upmap_fd);
+ if (save || upmap_active) {
+ int r = osdmap.apply_incremental(pending_inc);
+ ceph_assert(r == 0);
+ if (save)
+ modified = true;
+ }
+ } else {
+ cout << "Unable to find further optimization, "
+ << "or distribution is already perfect"
+ << std::endl;
+ if (upmap_active) {
+ map<int,set<pg_t>> pgs_by_osd;
+ for (auto& i : osdmap.get_pools()) {
+ if (!upmap_pool_nums.empty() && !upmap_pool_nums.count(i.first))
+ continue;
+ for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+ pg_t pg(ps, i.first);
+ vector<int> up;
+ osdmap.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
+ //ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
+ for (auto osd : up) {
+ if (osd != CRUSH_ITEM_NONE)
+ pgs_by_osd[osd].insert(pg);
+ }
+ }
+ }
+ for (auto& i : pgs_by_osd)
+ cout << "osd." << i.first << " pgs " << i.second.size() << std::endl;
+ float elapsed_time = (end.tv_sec - round_start.tv_sec) + 1.0e-9*(end.tv_nsec - round_start.tv_nsec);
+ cout << "Total time elapsed " << elapsed_time << " secs, " << rounds << " rounds" << std::endl;
+ }
+ break;
+ }
+ ++rounds;
+ } while(upmap_active);
+ }
+skip_upmap:
+ if (upmap_file != "-") {
+ ::close(upmap_fd);
+ }
+
+ if (!import_crush.empty()) {
+ bufferlist cbl;
+ std::string error;
+ r = cbl.read_file(import_crush.c_str(), &error);
+ if (r) {
+ cerr << me << ": error reading crush map from " << import_crush
+ << ": " << error << std::endl;
+ exit(1);
+ }
+
+ // validate
+ CrushWrapper cw;
+ auto p = cbl.cbegin();
+ cw.decode(p);
+
+ if (cw.get_max_devices() > osdmap.get_max_osd()) {
+ cerr << me << ": crushmap max_devices " << cw.get_max_devices()
+ << " > osdmap max_osd " << osdmap.get_max_osd() << std::endl;
+ exit(1);
+ }
+
+ // apply
+ OSDMap::Incremental inc;
+ inc.fsid = osdmap.get_fsid();
+ inc.epoch = osdmap.get_epoch()+1;
+ inc.crush = cbl;
+ osdmap.apply_incremental(inc);
+ cout << me << ": imported " << cbl.length() << " byte crush map from " << import_crush << std::endl;
+ modified = true;
+ }
+
+ if (!export_crush.empty()) {
+ bufferlist cbl;
+ osdmap.crush->encode(cbl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ r = cbl.write_file(export_crush.c_str());
+ if (r < 0) {
+ // report the file we actually tried to write; the message previously
+ // named import_crush (typically empty here), hiding the real path
+ cerr << me << ": error writing crush map to " << export_crush
+ << ": " << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
+ cout << me << ": exported crush map to " << export_crush << std::endl;
+ }
+
+ if (!test_map_object.empty()) {
+ object_t oid(test_map_object);
+ if (pool == -1) {
+ cout << me << ": assuming pool 1 (use --pool to override)" << std::endl;
+ pool = 1;
+ }
+ if (!osdmap.have_pg_pool(pool)) {
+ cerr << "There is no pool " << pool << std::endl;
+ exit(1);
+ }
+ object_locator_t loc(pool);
+ pg_t raw_pgid = osdmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = osdmap.raw_pg_to_pg(raw_pgid);
+
+ vector<int> acting;
+ osdmap.pg_to_acting_osds(pgid, acting);
+ cout << " object '" << oid
+ << "' -> " << pgid
+ << " -> " << acting
+ << std::endl;
+ }
+ if (!test_map_pg.empty()) {
+ pg_t pgid;
+ if (!pgid.parse(test_map_pg.c_str())) {
+ // close the quote around the pg string (was left unbalanced)
+ cerr << me << ": failed to parse pg '" << test_map_pg << "'" << std::endl;
+ usage();
+ }
+ cout << " parsed '" << test_map_pg << "' -> " << pgid << std::endl;
+
+ vector<int> raw, up, acting;
+ int raw_primary, up_primary, acting_primary;
+ osdmap.pg_to_raw_osds(pgid, &raw, &raw_primary);
+ osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ cout << pgid << " raw (" << raw << ", p" << raw_primary
+ << ") up (" << up << ", p" << up_primary
+ << ") acting (" << acting << ", p" << acting_primary << ")"
+ << std::endl;
+ }
+ if (test_map_pgs || test_map_pgs_dump || test_map_pgs_dump_all) {
+ if (pool != -1 && !osdmap.have_pg_pool(pool)) {
+ cerr << "There is no pool " << pool << std::endl;
+ exit(1);
+ }
+ int n = osdmap.get_max_osd();
+ vector<int> count(n, 0);
+ vector<int> first_count(n, 0);
+ vector<int> primary_count(n, 0);
+ vector<int> size(30, 0);
+ int max_size = 0;
+ if (test_random)
+ srand(getpid());
+ auto& pools = osdmap.get_pools();
+ for (auto p = pools.begin(); p != pools.end(); ++p) {
+ if (pool != -1 && p->first != pool)
+ continue;
+ if (pg_num > 0)
+ p->second.set_pg_num(pg_num);
+
+ cout << "pool " << p->first
+ << " pg_num " << p->second.get_pg_num() << std::endl;
+ for (unsigned i = 0; i < p->second.get_pg_num(); ++i) {
+ pg_t pgid = pg_t(i, p->first);
+
+ vector<int> osds, raw, up, acting;
+ int primary, calced_primary, up_primary, acting_primary;
+ if (test_random) {
+ osds.resize(p->second.size);
+ for (unsigned i=0; i<osds.size(); ++i) {
+ osds[i] = rand() % osdmap.get_max_osd();
+ }
+ primary = osds[0];
+ } else if (test_map_pgs_dump_all) {
+ osdmap.pg_to_raw_osds(pgid, &raw, &calced_primary);
+ osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ osds = acting;
+ primary = acting_primary;
+ } else {
+ osdmap.pg_to_acting_osds(pgid, &osds, &primary);
+ }
+ size[osds.size()]++;
+ if ((unsigned)max_size < osds.size())
+ max_size = osds.size();
+
+ if (test_map_pgs_dump) {
+ cout << pgid << "\t" << osds << "\t" << primary << std::endl;
+ } else if (test_map_pgs_dump_all) {
+ cout << pgid << " raw (" << raw << ", p" << calced_primary
+ << ") up (" << up << ", p" << up_primary
+ << ") acting (" << acting << ", p" << acting_primary << ")"
+ << std::endl;
+ }
+
+ for (unsigned i=0; i<osds.size(); i++) {
+ //cout << " rep " << i << " on " << osds[i] << std::endl;
+ count[osds[i]]++;
+ }
+ if (osds.size())
+ first_count[osds[0]]++;
+ if (primary >= 0)
+ primary_count[primary]++;
+ }
+ }
+
+ uint64_t total = 0;
+ int in = 0;
+ int min_osd = -1;
+ int max_osd = -1;
+ cout << "#osd\tcount\tfirst\tprimary\tc wt\twt\n";
+ for (int i=0; i<n; i++) {
+ if (!osdmap.is_in(i))
+ continue;
+ if (osdmap.crush->get_item_weight(i) <= 0)
+ continue;
+ in++;
+ cout << "osd." << i
+ << "\t" << count[i]
+ << "\t" << first_count[i]
+ << "\t" << primary_count[i]
+ << "\t" << osdmap.crush->get_item_weightf(i)
+ << "\t" << osdmap.get_weightf(i)
+ << std::endl;
+ total += count[i];
+ if (count[i] &&
+ (min_osd < 0 ||
+ count[i] < count[min_osd]))
+ min_osd = i;
+ if (count[i] &&
+ (max_osd < 0 ||
+ count[i] > count[max_osd]))
+ max_osd = i;
+
+ }
+ uint64_t avg = in ? (total / in) : 0;
+ double dev = 0;
+ for (int i=0; i<n; i++) {
+ if (!osdmap.is_in(i))
+ continue;
+ if (osdmap.crush->get_item_weight(i) <= 0)
+ continue;
+ // Subtract in double: avg is uint64_t, so (avg - count[i]) wrapped to
+ // a huge unsigned value whenever count[i] > avg, corrupting the
+ // variance accumulated here.
+ double d = (double)avg - (double)count[i];
+ dev += d * d;
+ }
+ // NOTE(review): 'in' may be 0 if no OSD is in+weighted; dev becomes NaN
+ // then, matching the original behavior.
+ dev /= in;
+ dev = sqrt(dev);
+
+ //double edev = sqrt(pgavg) * (double)avg / pgavg;
+ double edev = sqrt((double)total / (double)in * (1.0 - (1.0 / (double)in)));
+ cout << " in " << in << std::endl;
+ cout << " avg " << avg
+ << " stddev " << dev
+ << " (" << (dev/avg) << "x)"
+ << " (expected " << edev << " " << (edev/avg) << "x))"
+ << std::endl;
+
+ if (min_osd >= 0)
+ cout << " min osd." << min_osd << " " << count[min_osd] << std::endl;
+ if (max_osd >= 0)
+ cout << " max osd." << max_osd << " " << count[max_osd] << std::endl;
+
+ for (int i=0; i<=max_size; i++) {
+ if (size[i])
+ cout << "size " << i << "\t" << size[i] << std::endl;
+ }
+ }
+ if (test_crush) {
+ int pass = 0;
+ while (1) {
+ cout << "pass " << ++pass << std::endl;
+
+ ceph::unordered_map<pg_t,vector<int> > m;
+ for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
+ p != osdmap.get_pools().end();
+ ++p) {
+ const pg_pool_t *pool = osdmap.get_pg_pool(p->first);
+ for (ps_t ps = 0; ps < pool->get_pg_num(); ps++) {
+ pg_t pgid(ps, p->first);
+ for (int i=0; i<100; i++) {
+ cout << pgid << " attempt " << i << std::endl;
+
+ vector<int> r;
+ osdmap.pg_to_acting_osds(pgid, r);
+ //cout << pgid << " " << r << std::endl;
+ if (m.count(pgid)) {
+ if (m[pgid] != r) {
+ cout << pgid << " had " << m[pgid] << " now " << r << std::endl;
+ ceph_abort();
+ }
+ } else
+ m[pgid] = r;
+ }
+ }
+ }
+ }
+ }
+
+ if (!print && !health && !tree && !modified &&
+ export_crush.empty() && import_crush.empty() &&
+ test_map_pg.empty() && test_map_object.empty() &&
+ !test_map_pgs && !test_map_pgs_dump && !test_map_pgs_dump_all &&
+ adjust_crush_weight.empty() && !upmap && !upmap_cleanup) {
+ cerr << me << ": no action specified?" << std::endl;
+ usage();
+ }
+
+ if (modified)
+ osdmap.inc_epoch();
+
+ if (health) {
+ health_check_map_t checks;
+ osdmap.check_health(cct.get(), &checks);
+ JSONFormatter jf(true);
+ jf.dump_object("checks", checks);
+ jf.flush(cout);
+ }
+ if (print) {
+ if (print_formatter) {
+ print_formatter->open_object_section("osdmap");
+ osdmap.dump(print_formatter.get());
+ print_formatter->close_section();
+ print_formatter->flush(cout);
+ } else {
+ osdmap.print(cout);
+ }
+ }
+
+ if (tree) {
+ if (tree_formatter) {
+ tree_formatter->open_object_section("tree");
+ osdmap.print_tree(tree_formatter.get(), NULL);
+ tree_formatter->close_section();
+ tree_formatter->flush(cout);
+ cout << std::endl;
+ } else {
+ osdmap.print_tree(NULL, &cout);
+ }
+ }
+ if (modified) {
+ bl.clear();
+ osdmap.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT | CEPH_FEATURE_RESERVED);
+
+ // write it out
+ cout << me << ": writing epoch " << osdmap.get_epoch()
+ << " to " << fn
+ << std::endl;
+ int r = bl.write_file(fn.c_str());
+ if (r) {
+ cerr << "osdmaptool: error writing to '" << fn << "': "
+ << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ }
+
+
+ return 0;
+}
diff --git a/src/tools/psim.cc b/src/tools/psim.cc
new file mode 100644
index 000000000..90e6fb958
--- /dev/null
+++ b/src/tools/psim.cc
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd/OSDMap.h"
+#include "include/buffer.h"
+
+// Placement simulator: loads an osdmap snapshot from ".ceph_osdmap" in the
+// current directory, maps a synthetic object population to PGs/OSDs, and
+// prints per-OSD placement statistics. Returns 0 on success, 1 on failure.
+int main(int argc, char **argv)
+{
+ /*
+ * you need to create a suitable osdmap first. e.g., for 40 osds,
+ * $ ./osdmaptool --createsimple 40 --clobber .ceph_osdmap
+ */
+ bufferlist bl;
+ std::string error;
+ if (bl.read_file(".ceph_osdmap", &error)) {
+ cout << argv[0] << ": error reading .ceph_osdmap: " << error << std::endl;
+ return 1;
+ }
+ OSDMap osdmap;
+
+ try {
+ osdmap.decode(bl);
+ } catch (ceph::buffer::end_of_buffer &eob) {
+ cout << "Exception (end_of_buffer) in decode(), exit." << std::endl;
+ exit(1);
+ }
+
+ //osdmap.set_primary_affinity(0, 0x8000);
+ //osdmap.set_primary_affinity(3, 0);
+
+ int n = osdmap.get_max_osd();
+ if (n <= 0) {
+ // guard the per-OSD vectors below and the final "avg /= n" division
+ cout << argv[0] << ": osdmap contains no OSDs" << std::endl;
+ return 1;
+ }
+ // std::vector instead of C99-style VLAs (int count[n] is a compiler
+ // extension, not standard C++); zero-initialized, no memset needed.
+ vector<int> count(n, 0);
+ vector<int> first_count(n, 0);
+ vector<int> primary_count(n, 0);
+ // histogram of acting-set sizes, grown on demand: the old fixed
+ // int size[4] wrote out of bounds for replication factors >= 4
+ vector<int> size(4, 0);
+
+ for (int i=0; i<n; i++) {
+ osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP);
+ //if (i<12)
+ osdmap.set_weight(i, CEPH_OSD_IN);
+ }
+
+ //pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(0);
+ //p->type = pg_pool_t::TYPE_ERASURE;
+
+ // loop variable renamed from 'n' to 'ns': it shadowed the OSD count above
+ for (int ns = 0; ns < 10; ns++) { // namespaces
+ char nspace[20];
+ snprintf(nspace, sizeof(nspace), "n%d", ns);
+ for (int f = 0; f < 5000; f++) { // files
+ for (int b = 0; b < 4; b++) { // blocks
+ char foo[20];
+ snprintf(foo, sizeof(foo), "%d.%d", f, b);
+ object_t oid(foo);
+ ceph_object_layout l = osdmap.make_object_layout(oid, 0, nspace);
+ vector<int> osds;
+ pg_t pgid = pg_t(l.ol_pgid);
+ //pgid.u.ps = f * 4 + b;
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &osds, &primary);
+ if (osds.size() >= size.size())
+ size.resize(osds.size() + 1, 0);
+ size[osds.size()]++;
+#if 0
+ if (0) {
+ hash<object_t> H;
+ int x = H(oid);
+ x = ceph_stable_mod(x, 1023, 1023);
+ int s = crush_hash32(x) % 15;
+ //cout << "ceph_psim: x = " << x << " s = " << s << std::endl;
+ //osds[0] = s;
+ }
+#endif
+ //osds[0] = crush_hash32(f) % n;
+ //cout << "oid " << oid << " pgid " << pgid << " on " << osds << std::endl;
+ for (unsigned i=0; i<osds.size(); i++) {
+ //cout << " rep " << i << " on " << osds[i] << std::endl;
+ count[osds[i]]++;
+ }
+ if (osds.size())
+ first_count[osds[0]]++;
+ if (primary >= 0)
+ primary_count[primary]++;
+ }
+ }
+ }
+
+ uint64_t avg = 0;
+ for (int i=0; i<n; i++) {
+ cout << "osd." << i << "\t" << count[i]
+ << "\t" << first_count[i]
+ << "\t" << primary_count[i]
+ << std::endl;
+ avg += count[i];
+ }
+ avg /= n;
+ double dev = 0;
+ for (int i=0; i<n; i++) {
+ // subtract in double: avg is unsigned, so (avg - count[i]) wrapped
+ // to a huge value whenever count[i] > avg, corrupting the variance
+ double d = (double)avg - (double)count[i];
+ dev += d * d;
+ }
+ dev /= n;
+ dev = sqrt(dev);
+
+ cout << " avg " << avg << " stddev " << dev;
+ // the tool assumes pool 0 exists; guard in case it was deleted
+ const pg_pool_t *pool0 = osdmap.get_pg_pool(0);
+ if (pool0) {
+ double pgavg = (double)pool0->get_pg_num() / (double)n;
+ double edev = sqrt(pgavg) * (double)avg / pgavg;
+ cout << " (expected " << edev << ")";
+ }
+ cout << " (indep object placement would be " << sqrt((double)avg) << ")" << std::endl;
+
+ for (unsigned i=0; i<size.size(); i++) {
+ cout << "size" << i << "\t" << size[i] << std::endl;
+ }
+
+ return 0;
+}
diff --git a/src/tools/rados/PoolDump.cc b/src/tools/rados/PoolDump.cc
new file mode 100644
index 000000000..9bfafa107
--- /dev/null
+++ b/src/tools/rados/PoolDump.cc
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+
+#include "PoolDump.h"
+
+using namespace librados;
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rados
+
+/**
+ * Export RADOS objects from a live cluster
+ * to a serialized format via a file descriptor.
+ *
+ * @returns 0 on success, else error code
+ */
+int PoolDump::dump(IoCtx *io_ctx)
+{
+  ceph_assert(io_ctx != NULL);
+
+  int r = 0;
+  write_super();  // the export stream starts with the superblock header
+
+  r = write_simple(TYPE_POOL_BEGIN, file_fd);
+  if (r != 0) {
+    return r;
+  }
+
+  io_ctx->set_namespace(all_nspaces);  // enumerate objects across every namespace in the pool
+  librados::NObjectIterator i = io_ctx->nobjects_begin();
+
+  librados::NObjectIterator i_end = io_ctx->nobjects_end();
+  for (; i != i_end; ++i) {
+    const std::string oid = i->get_oid();
+    dout(10) << "OID '" << oid << "'" << dendl;
+
+    // Compose OBJECT_BEGIN
+    // ====================
+    object_begin obj_begin;
+    obj_begin.hoid.hobj.oid = i->get_oid();
+    obj_begin.hoid.hobj.nspace = i->get_nspace();
+    obj_begin.hoid.hobj.set_key(i->get_locator());
+
+    // Only output head, RadosImport only wants that
+    obj_begin.hoid.hobj.snap = CEPH_NOSNAP;
+
+    // Skip setting object_begin.oi, RadosImport doesn't care
+
+    r = write_section(TYPE_OBJECT_BEGIN, obj_begin, file_fd);
+    if (r != 0) {
+      return r;
+    }
+
+    // Compose TYPE_DATA chunks
+    // ========================
+    const uint32_t op_size = 4096 * 1024;  // read the object in 4 MiB chunks
+    uint64_t offset = 0;
+    io_ctx->set_namespace(i->get_nspace());  // point reads at this object's own namespace (iterator itself is unaffected)
+    io_ctx->locator_set_key(i->get_locator());
+    while (true) {
+      bufferlist outdata;
+      r = io_ctx->read(oid, outdata, op_size, offset);  // r > 0: bytes read; 0: EOF; < 0: error
+      if (r <= 0) {
+        // Error or no data
+        break;
+      }
+
+      r = write_section(TYPE_DATA,
+          data_section(offset, outdata.length(), outdata), file_fd);
+      if (r != 0) {
+        // Output stream error
+        return r;
+      }
+
+      if (outdata.length() < op_size) {
+        // No more data
+        break;
+      }
+      offset += outdata.length();
+    }
+
+    // Compose TYPE_ATTRS chunk
+    // ========================
+    std::map<std::string, bufferlist> raw_xattrs;
+    std::map<std::string, bufferlist> xattrs;
+    r = io_ctx->getxattrs(oid, raw_xattrs);
+    if (r < 0) {
+      cerr << "error getting xattr set " << oid << ": " << cpp_strerror(r)
+           << std::endl;
+      return r;
+    }
+    // Prepend "_" to mimic how user keys are represented in a pg export
+    for (std::map<std::string, bufferlist>::iterator i = raw_xattrs.begin();  // NB: this 'i' shadows the object iterator above
+        i != raw_xattrs.end(); ++i) {
+      std::pair< std::string, bufferlist> item(std::string("_") + std::string(i->first.c_str()), i->second);
+      xattrs.insert(item);
+    }
+    r = write_section(TYPE_ATTRS, attr_section(xattrs), file_fd);
+    if (r != 0) {
+      return r;
+    }
+
+    // Compose TYPE_OMAP_HDR section
+    // =============================
+    bufferlist omap_header;
+    r = io_ctx->omap_get_header(oid, &omap_header);
+    if (r < 0) {
+      cerr << "error getting omap header " << oid
+           << ": " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    r = write_section(TYPE_OMAP_HDR, omap_hdr_section(omap_header), file_fd);
+    if (r != 0) {
+      return r;
+    }
+
+    // Compose TYPE_OMAP
+    int MAX_READ = 512;  // omap keys fetched per request (pagination size)
+    string last_read = "";
+    do {
+      map<string, bufferlist> values;
+      r = io_ctx->omap_get_vals(oid, last_read, MAX_READ, &values);
+      if (r < 0) {
+        cerr << "error getting omap keys " << oid << ": "
+             << cpp_strerror(r) << std::endl;
+        return r;
+      }
+      if (values.size()) {
+        last_read = values.rbegin()->first;  // resume the next fetch after the last key seen
+      } else {
+        break;
+      }
+
+      r = write_section(TYPE_OMAP, omap_section(values), file_fd);
+      if (r != 0) {
+        return r;
+      }
+      r = values.size();  // a short batch (r < MAX_READ) terminates the loop condition below
+    } while (r == MAX_READ);
+
+    // Close object
+    // =============
+    r = write_simple(TYPE_OBJECT_END, file_fd);
+    if (r != 0) {
+      return r;
+    }
+  }
+
+  r = write_simple(TYPE_POOL_END, file_fd);
+#if defined(__linux__)
+  if (file_fd != STDOUT_FILENO)
+    posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED);  // drop the export's page-cache footprint
+#endif
+  return r;
+}
diff --git a/src/tools/rados/PoolDump.h b/src/tools/rados/PoolDump.h
new file mode 100644
index 000000000..33abd8868
--- /dev/null
+++ b/src/tools/rados/PoolDump.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef POOL_DUMP_H_
+#define POOL_DUMP_H_
+
+#include "include/rados/librados_fwd.hpp"
+#include "tools/RadosDump.h"
+
+class PoolDump : public RadosDump  // serializes a live pool's objects to file_fd; see dump()
+{
+  public:
+  explicit PoolDump(int file_fd_) : RadosDump(file_fd_, false) {}  // false => dump is never a dry run
+  int dump(librados::IoCtx *io_ctx);  // returns 0 on success, negative errno on failure
+};
+
+#endif // POOL_DUMP_H_
diff --git a/src/tools/rados/RadosImport.cc b/src/tools/rados/RadosImport.cc
new file mode 100644
index 000000000..0a901b709
--- /dev/null
+++ b/src/tools/rados/RadosImport.cc
@@ -0,0 +1,399 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/errno.h"
+
+#include "osd/PGLog.h"
+#include "RadosImport.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rados
+
+int RadosImport::import(std::string pool, bool no_overwrite)  // connect to the cluster, then import the stream into <pool>
+{
+  librados::IoCtx ioctx;
+  librados::Rados cluster;
+
+  char *id = getenv("CEPH_CLIENT_ID");  // optional client-id override from the environment
+  if (id) cerr << "Client id is: " << id << std::endl;
+  int ret = cluster.init(id);
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.init" << std::endl;
+    return ret;
+  }
+  ret = cluster.conf_read_file(NULL);
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.conf_read_file" << std::endl;
+    return ret;
+  }
+  ret = cluster.conf_parse_env(NULL);
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.conf_parse_env" << std::endl;  // fixed: message previously named the wrong call ("conf_read_env")
+    return ret;
+  }
+  ret = cluster.connect();
+  if (ret) {
+    cerr << "Error " << ret << " in cluster.connect" << std::endl;
+    return ret;
+  }
+
+  ret = cluster.ioctx_create(pool.c_str(), ioctx);
+  if (ret < 0) {
+    cerr << "ioctx_create " << pool << " failed with " << ret << std::endl;
+    return ret;
+  }
+
+  return import(ioctx, no_overwrite);  // delegate to the IoCtx overload for the actual replay
+}
+
+int RadosImport::import(librados::IoCtx &io_ctx, bool no_overwrite)  // replay a pg/pool export stream into io_ctx
+{
+  bufferlist ebl;
+  pg_info_t info;  // NOTE(review): unused in this function
+  PGLog::IndexedLog log;  // NOTE(review): unused in this function
+
+  int ret = read_super();
+  if (ret)
+    return ret;
+
+  if (sh.magic != super_header::super_magic) {
+    cerr << "Invalid magic number: 0x"
+         << std::hex << sh.magic << " vs. 0x" << super_header::super_magic
+         << std::dec << std::endl;
+    return -EFAULT;
+  }
+
+  if (sh.version > super_header::super_ver) {
+    cerr << "Can't handle export format version=" << sh.version << std::endl;
+    return -EINVAL;
+  }
+
+  //First section must be TYPE_PG_BEGIN
+  sectiontype_t type;
+  ret = read_section(&type, &ebl);
+  if (ret)
+    return ret;
+
+  bool pool_mode = false;
+  if (type == TYPE_POOL_BEGIN) {
+    pool_mode = true;  // pool exports carry no PG metadata section
+    cout << "Importing pool" << std::endl;
+  } else if (type == TYPE_PG_BEGIN) {
+    auto ebliter = ebl.cbegin();
+    pg_begin pgb;
+    pgb.decode(ebliter);
+    spg_t pgid = pgb.pgid;  // fixed: stray empty statement (";;") removed
+    if (!pgid.is_no_shard()) {
+      cerr << "Importing Erasure Coded shard is not supported" << std::endl;
+      return -EOPNOTSUPP;
+    }
+    dout(10) << "Exported features: " << pgb.superblock.compat_features << dendl;
+    cout << "Importing from pgid " << pgid << std::endl;
+  } else {
+    cerr << "Invalid initial section code " << type << std::endl;
+    return -EFAULT;
+  }
+
+  // XXX: How to check export features?
+#if 0
+  if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+    cerr << "Export has incompatible features set "
+         << pgb.superblock.compat_features << std::endl;
+    return -EINVAL;
+  }
+#endif
+
+#if defined(__linux__)
+  if (file_fd != STDIN_FILENO)
+    posix_fadvise(file_fd, 0, 0, POSIX_FADV_SEQUENTIAL);  // hint: the stream is consumed front-to-back
+#endif
+
+  bool done = false;
+  bool found_metadata = false;
+  while(!done) {
+    ret = read_section(&type, &ebl);
+    if (ret)
+      return ret;
+
+    //cout << "do_import: Section type " << hex << type << dec << std::endl;
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_OBJECT_BEGIN:
+      ret = get_object_rados(io_ctx, ebl, no_overwrite);
+      if (ret) {
+        cerr << "Error inserting object: " << ret << std::endl;
+        return ret;
+      }
+      break;
+    case TYPE_PG_METADATA:
+      dout(10) << "Don't care about the old metadata" << dendl;
+      found_metadata = true;
+      break;
+    case TYPE_PG_END:
+      done = true;
+      break;
+    case TYPE_POOL_END:
+      done = true;
+      break;
+    default:
+      return -EFAULT;
+    }
+  }
+
+  if (!(pool_mode || found_metadata)) {
+    cerr << "Missing metadata section!" << std::endl;  // warning only; the import is still considered successful
+  }
+
+#if defined(__linux__)
+  if (file_fd != STDIN_FILENO)
+    posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED);  // drop the cached import stream
+#endif
+  return 0;
+}
+
+int RadosImport::get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite)
+{
+  auto ebliter = bl.cbegin();
+  object_begin ob;
+  ob.decode(ebliter);  // 'bl' carries the OBJECT_BEGIN section already read by the caller
+  map<string,bufferlist>::iterator i;
+  bufferlist abl;
+  bool skipping;
+
+  data_section ds;
+  attr_section as;
+  omap_hdr_section oh;
+  omap_section os;
+
+  ceph_assert(g_ceph_context);
+  if (ob.hoid.hobj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) {
+    cout << "Skipping internal object " << ob.hoid << std::endl;
+    skip_object(bl);  // drain this object's remaining sections from the stream
+    return 0;
+  }
+
+  if (!ob.hoid.hobj.is_head()) {
+    cout << "Skipping non-head for " << ob.hoid << std::endl;
+    skip_object(bl);
+    return 0;
+  }
+
+  ioctx.set_namespace(ob.hoid.hobj.get_namespace());
+  ioctx.locator_set_key(ob.hoid.hobj.get_key());
+
+  string msg("Write");
+  skipping = false;
+  if (dry_run) {
+    uint64_t psize;
+    time_t pmtime;
+    int ret = ioctx.stat(ob.hoid.hobj.oid.name, &psize, &pmtime);  // probe for existence only
+    if (ret == 0) {
+      if (no_overwrite)
+        // Could set skipping, but dry-run doesn't change anything either
+        msg = "Skipping existing";
+      else
+        msg = "***Overwrite***";
+    }
+  } else {
+    int ret = ioctx.create(ob.hoid.hobj.oid.name, true);  // exclusive create detects pre-existing objects
+    if (ret && ret != -EEXIST) {
+      cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+    if (ret == -EEXIST) {
+      if (no_overwrite) {
+        msg = "Skipping existing";
+        skipping = true;
+      } else {
+        msg = "***Overwrite***";
+        ret = ioctx.remove(ob.hoid.hobj.oid.name);
+        if (ret < 0) {
+          cerr << "remove failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+        ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+        // If object re-appeared after removal, let's just skip it
+        if (ret == -EEXIST) {
+          skipping = true;
+          msg = "Skipping in-use object";
+          ret = 0;
+        }
+        if (ret < 0) {
+          cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+      }
+    }
+  }
+
+  cout << msg << " " << ob.hoid << std::endl;
+
+  bool need_align = false;
+  uint64_t alignment = 0;
+  if (align) {
+    need_align = true;  // caller forced an alignment on the command line
+    alignment = align;
+  } else {
+    int ret = ioctx.pool_requires_alignment2(&need_align);  // e.g. EC pools require aligned appends
+    if (ret < 0) {
+      cerr << "pool_requires_alignment2 failed: " << cpp_strerror(ret)
+           << std::endl;
+      return ret;
+    }
+
+    if (need_align) {
+      ret = ioctx.pool_required_alignment2(&alignment);
+      if (ret < 0) {
+        cerr << "pool_required_alignment2 failed: " << cpp_strerror(ret)
+             << std::endl;
+        return ret;
+      }
+      ceph_assert(alignment != 0);
+    }
+  }
+
+  if (need_align) {
+    dout(10) << "alignment = " << alignment << dendl;
+  }
+
+  bufferlist ebl, databl;
+  uint64_t in_offset = 0, out_offset = 0;  // in_offset tracks the stream position, out_offset the aligned writes issued
+  bool done = false;
+  while(!done) {
+    sectiontype_t type;
+    int ret = read_section(&type, &ebl);
+    if (ret) {
+      cerr << "Error reading section: " << ret << std::endl;
+      return ret;
+    }
+
+    ebliter = ebl.cbegin();
+    //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+    //cout << "\t\tsection size " << ebl.length() << std::endl;
+    if (type >= END_OF_TYPES) {
+      cout << "Skipping unknown object section type" << std::endl;
+      continue;
+    }
+    switch(type) {
+    case TYPE_DATA:
+      ds.decode(ebliter);
+      dout(10) << "\tdata: offset " << ds.offset << " len " << ds.len << dendl;
+      if (need_align) {  // aligned path: buffer chunks and write only whole multiples of 'alignment'
+        if (ds.offset != in_offset) {
+          cerr << "Discontiguous object data in export" << std::endl;
+          return -EFAULT;
+        }
+        ceph_assert(ds.databl.length() == ds.len);
+        databl.claim_append(ds.databl);
+        in_offset += ds.len;
+        if (databl.length() >= alignment) {
+          uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment;  // largest aligned prefix of the buffer
+          dout(10) << "write offset=" << out_offset << " len=" << rndlen << dendl;
+          if (!dry_run && !skipping) {
+            ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset);
+            if (ret) {
+              cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+              return ret;
+            }
+          }
+          out_offset += rndlen;
+          bufferlist n;
+          if (databl.length() > rndlen) {
+            ceph_assert(databl.length() - rndlen < alignment);
+            n.substr_of(databl, rndlen, databl.length() - rndlen);  // carry the unaligned tail into the next round
+          }
+          databl = n;
+        }
+        break;  // aligned path done; skip the direct-write path below
+      }
+      if (!dry_run && !skipping) {
+        ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset);
+        if (ret) {
+          cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+      }
+      break;
+    case TYPE_ATTRS:
+      as.decode(ebliter);
+
+      dout(10) << "\tattrs: len " << as.data.size() << dendl;
+      if (dry_run || skipping)
+        break;
+      for (std::map<string,bufferlist>::iterator i = as.data.begin();  // NB: shadows the unused outer 'i'
+          i != as.data.end(); ++i) {
+        // The user xattrs that we want all begin with "_" with length > 1.
+        // Drop key "_" and all attributes that do not start with '_'
+        if (i->first == "_" || i->first[0] != '_')
+          continue;
+        ret = ioctx.setxattr(ob.hoid.hobj.oid.name, i->first.substr(1).c_str(), i->second);
+        if (ret) {
+          cerr << "setxattr failed: " << cpp_strerror(ret) << std::endl;
+          if (ret != -EOPNOTSUPP)  // unsupported xattrs are tolerated; other errors abort
+            return ret;
+        }
+      }
+      break;
+    case TYPE_OMAP_HDR:
+      oh.decode(ebliter);
+
+      dout(10) << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length())
+               << dendl;
+      if (dry_run || skipping)
+        break;
+      ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr);
+      if (ret) {
+        cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl;
+        if (ret != -EOPNOTSUPP)
+          return ret;
+      }
+      break;
+    case TYPE_OMAP:
+      os.decode(ebliter);
+
+      dout(10) << "\tomap: size " << os.omap.size() << dendl;
+      if (dry_run || skipping)
+        break;
+      ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap);
+      if (ret) {
+        cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl;
+        if (ret != -EOPNOTSUPP)
+          return ret;
+      }
+      break;
+    case TYPE_OBJECT_END:
+      done = true;
+      if (need_align && databl.length() > 0) {  // flush the buffered unaligned tail
+        ceph_assert(databl.length() < alignment);
+        dout(10) << "END write offset=" << out_offset << " len=" << databl.length() << dendl;
+        if (dry_run || skipping)
+          break;
+        ret = ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset);
+        if (ret) {
+          cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+          return ret;
+        }
+      }
+      break;
+    default:
+      cerr << "Unexpected section type " << type << std::endl;
+      return -EFAULT;
+    }
+  }
+  return 0;
+}
diff --git a/src/tools/rados/RadosImport.h b/src/tools/rados/RadosImport.h
new file mode 100644
index 000000000..3a5166306
--- /dev/null
+++ b/src/tools/rados/RadosImport.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef RADOS_IMPORT_H_
+#define RADOS_IMPORT_H_
+
+#include <string>
+
+#include "include/rados/librados.hpp"
+#include "include/buffer_fwd.h"
+
+#include "tools/RadosDump.h"
+
+/**
+ * Specialization of RadosDump that adds
+ * methods for importing objects from a stream
+ * to a live cluster.
+ */
+class RadosImport : public RadosDump
+{
+  protected:
+  uint64_t align;  // forced write alignment in bytes; 0 = query the target pool instead
+  int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite);  // import one object from the stream
+
+  public:
+  RadosImport(int file_fd_, uint64_t align_, bool dry_run_)
+    : RadosDump(file_fd_, dry_run_), align(align_)
+  {}
+
+  int import(std::string pool, bool no_overwrite);  // connect to the cluster, then import into the named pool
+  int import(librados::IoCtx &io_ctx, bool no_overwrite);  // import into an already-open IoCtx
+};
+
+#endif // RADOS_IMPORT_H_
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
new file mode 100644
index 000000000..7564fc7f0
--- /dev/null
+++ b/src/tools/rados/rados.cc
@@ -0,0 +1,4209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rados/rados_types.hpp"
+
+#include "acconfig.h"
+#ifdef WITH_LIBRADOSSTRIPER
+ #include "include/radosstriper/libradosstriper.hpp"
+ using namespace libradosstriper;
+#endif
+
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/obj_bencher.h"
+#include "common/TextTable.h"
+#include "include/stringify.h"
+#include "mds/inode_backtrace.h"
+#include "include/random.h"
+#include <iostream>
+#include <fstream>
+
+#include <stdlib.h>
+#include <time.h>
+#include <sstream>
+#include <errno.h>
+#include <dirent.h>
+#include <stdexcept>
+#include <climits>
+#include <locale>
+#include <memory>
+#include <optional>
+
+#include "cls/lock/cls_lock_client.h"
+#include "include/compat.h"
+#include "include/util.h"
+#include "common/hobject.h"
+
+#include "PoolDump.h"
+#include "RadosImport.h"
+
+#include "osd/ECUtil.h"
+
+using namespace librados;
+using ceph::util::generate_random_number;
+
+// two steps seem to be necessary to do this right
+#define STR(x) _STR(x)
+#define _STR(x) #x
+
+void usage(ostream& out)  // print the full rados(8) command-line help to 'out'
+{
+  out << \
+"usage: rados [options] [commands]\n"
+"POOL COMMANDS\n"
+" lspools list pools\n"
+" cppool <pool-name> <dest-pool> copy content of a pool\n"
+" purge <pool-name> --yes-i-really-really-mean-it\n"
+" remove all objects from pool <pool-name> without removing it\n"
+" df show per-pool and total usage\n"
+" ls list objects in pool\n\n"
+"\n"
+"POOL SNAP COMMANDS\n"
+" lssnap list snaps\n"
+" mksnap <snap-name> create snap <snap-name>\n"
+" rmsnap <snap-name> remove snap <snap-name>\n"
+"\n"
+"OBJECT COMMANDS\n"
+" get <obj-name> <outfile> fetch object\n"
+" put <obj-name> <infile> [--offset offset]\n"
+" write object with start offset (default:0)\n"
+" append <obj-name> <infile> append object\n"
+" truncate <obj-name> length truncate object\n"
+" create <obj-name> create object\n"
+" rm <obj-name> ...[--force-full] [force no matter full or not]remove object(s)\n"
+" cp <obj-name> [target-obj] copy object\n"
+" listxattr <obj-name>\n"
+" getxattr <obj-name> attr\n"
+" setxattr <obj-name> attr val\n"
+" rmxattr <obj-name> attr\n"
+" stat <obj-name> stat the named object\n"
+" stat2 <obj-name> stat2 the named object (with high precision time)\n"
+" touch <obj-name> [timestamp] change the named object modification time\n"
+" mapext <obj-name>\n"
+" rollback <obj-name> <snap-name> roll back object to snap <snap-name>\n"
+"\n"
+" listsnaps <obj-name> list the snapshots of this object\n"
+" bench <seconds> write|seq|rand [-t concurrent_operations] [--no-cleanup] [--run-name run_name] [--no-hints] [--reuse-bench]\n"
+" default is 16 concurrent IOs and 4 MB ops\n"
+" default is to clean up after write benchmark\n"
+" default run-name is 'benchmark_last_metadata'\n"
+" cleanup [--run-name run_name] [--prefix prefix]\n"
+" clean up a previous benchmark operation\n"
+" default run-name is 'benchmark_last_metadata'\n"
+" load-gen [options] generate load on the cluster\n"
+" listomapkeys <obj-name> list the keys in the object map\n"
+" listomapvals <obj-name> list the keys and vals in the object map \n"
+" getomapval <obj-name> <key> [file] show the value for the specified key\n"
+" in the object's object map\n"
+" setomapval <obj-name> <key> <val | --input-file file>\n"
+" rmomapkey <obj-name> <key>\n"
+" clearomap <obj-name> [obj-name2 obj-name3...] clear all the omap keys for the specified objects\n"
+" getomapheader <obj-name> [file]\n"
+" setomapheader <obj-name> <val>\n"
+" watch <obj-name> add watcher on this object\n"
+" notify <obj-name> <message> notify watcher of this object with message\n"
+" listwatchers <obj-name> list the watchers of this object\n"
+" set-alloc-hint <obj-name> <expected-object-size> <expected-write-size>\n"
+" set allocation hint for an object\n"
+" set-redirect <object A> --target-pool <caspool> <target object A> [--with-reference]\n"
+" set redirect target\n"
+" set-chunk <object A> <offset> <length> --target-pool <caspool> <target object A> <target-offset> [--with-reference]\n"
+" convert an object to chunked object\n"
+" tier-promote <obj-name> promote the object to the base tier\n"
+" unset-manifest <obj-name> unset redirect or chunked object\n"
+" tier-flush <obj-name> flush the chunked object\n"
+" tier-evict <obj-name> evict the chunked object\n"
+"\n"
+"IMPORT AND EXPORT\n"
+" export [filename]\n"
+" Serialize pool contents to a file or standard out.\n"
+" import [--dry-run] [--no-overwrite] < filename | - >\n"
+" Load pool contents from a file or standard in\n"
+"\n"
+"ADVISORY LOCKS\n"
+" lock list <obj-name>\n"
+" List all advisory locks on an object\n"
+" lock get <obj-name> <lock-name> [--lock-cookie locker-cookie] [--lock-tag locker-tag] [--lock-description locker-desc] [--lock-duration locker-dur] [--lock-type locker-type]\n"
+" Try to acquire a lock\n"
+" lock break <obj-name> <lock-name> <locker-name> [--lock-cookie locker-cookie]\n"
+" Try to break a lock acquired by another client\n"
+" lock info <obj-name> <lock-name>\n"
+" Show lock information\n"
+" options:\n"
+" --lock-tag Lock tag, all locks operation should use\n"
+" the same tag\n"
+" --lock-cookie Locker cookie\n"
+" --lock-description Description of lock\n"
+" --lock-duration Lock duration (in seconds)\n"
+" --lock-type Lock type (shared, exclusive)\n"
+"\n"
+"SCRUB AND REPAIR:\n"
+" list-inconsistent-pg <pool> list inconsistent PGs in given pool\n"
+" list-inconsistent-obj <pgid> list inconsistent objects in given PG\n"
+" list-inconsistent-snapset <pgid> list inconsistent snapsets in the given PG\n"
+"\n"
+"CACHE POOLS: (for testing/development only)\n"
+" cache-flush <obj-name> flush cache pool object (blocking)\n"
+" cache-try-flush <obj-name> flush cache pool object (non-blocking)\n"
+" cache-evict <obj-name> evict cache pool object\n"
+" cache-flush-evict-all flush+evict all objects\n"
+" cache-try-flush-evict-all try-flush+evict all objects\n"
+"\n"
+"GLOBAL OPTIONS:\n"
+" --object-locator object_locator\n"
+" set object_locator for operation\n"
+" -p pool\n"
+" --pool=pool\n"
+" select given pool by name\n"
+" --target-pool=pool\n"
+" select target pool by name\n"
+" --pgid PG id\n"
+" select given PG id\n"
+" -f [--format plain|json|json-pretty]\n"
+" --format=[--format plain|json|json-pretty]\n"
+" -b op_size\n"
+" set the block size for put/get ops and for write benchmarking\n"
+" -O object_size\n"
+" set the object size for put/get ops and for write benchmarking\n"
+" --max-objects\n"
+" set the max number of objects for write benchmarking\n"
+" --obj-name-file file\n"
+" use the content of the specified file in place of <obj-name>\n"
+" -s name\n"
+" --snap name\n"
+" select given snap name for (read) IO\n"
+" --input-file file\n"
+" use the content of the specified file in place of <val>\n"
+" --create\n"
+" create the pool or directory that was specified\n"
+" -N namespace\n"
+" --namespace=namespace\n"
+" specify the namespace to use for the object\n"
+" --all\n"
+" Use with ls to list objects in all namespaces\n"
+" Put in CEPH_ARGS environment variable to make this the default\n"
+" --default\n"
+" Use with ls to list objects in default namespace\n"
+" Takes precedence over --all in case --all is in environment\n"
+" --target-locator\n"
+" Use with cp to specify the locator of the new object\n"
+" --target-nspace\n"
+" Use with cp to specify the namespace of the new object\n"
+#ifdef WITH_LIBRADOSSTRIPER
+" --striper\n"
+" Use radosstriper interface rather than pure rados\n"
+" Available for stat, get, put, truncate, rm, ls and \n"
+" all xattr related operations\n"
+#endif
+"\n"
+"BENCH OPTIONS:\n"
+" -t N\n"
+" --concurrent-ios=N\n"
+" Set number of concurrent I/O operations\n"
+" --show-time\n"
+" prefix output with date/time\n"
+" --no-verify\n"
+" do not verify contents of read objects\n"
+" --write-object\n"
+" write contents to the objects\n"
+" --write-omap\n"
+" write contents to the omap\n"
+" --write-xattr\n"
+" write contents to the extended attributes\n"
+"\n"
+"LOAD GEN OPTIONS:\n"
+" --num-objects total number of objects\n"
+" --min-object-size min object size\n"
+" --max-object-size max object size\n"
+" --min-op-len min io size of operations\n"
+" --max-op-len max io size of operations\n"
+" --max-ops max number of operations\n"
+" --max-backlog max backlog size\n"
+" --read-percent percent of operations that are read\n"
+" --target-throughput target throughput (in bytes)\n"
+" --run-length total time (in seconds)\n"
+" --offset-align at what boundary to align random op offsets\n"
+"\n"
+"CACHE POOLS OPTIONS:\n"
+" --with-clones include clones when doing flush or evict\n"
+"\n"
+"OMAP OPTIONS:\n"
+" --omap-key-file file read the omap key from a file\n"
+"\n"
+"GENERIC OPTIONS:\n";
+  generic_client_usage();  // appends the options shared by all ceph client tools
+}
+
+namespace detail {  // thin shims that route each op to plain librados or, when --striper is active, libradosstriper
+
+#ifdef WITH_LIBRADOSSTRIPER
+RadosStriper& striper()
+{
+  static RadosStriper s;  // lazily-constructed, process-wide striper handle
+  return s;
+}
+#endif
+
+int read([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& out_data, const unsigned op_size, const uint64_t offset, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().read(oid, &out_data, op_size, offset);
+#endif
+
+  return io_ctx.read(oid, out_data, op_size, offset);
+}
+
+int write([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& indata, const uint64_t count, const uint64_t offset, [[maybe_unused]] const bool use_striper)
+{
+ #ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().write(oid, indata, count, offset);
+#endif
+
+  return io_ctx.write(oid, indata, count, offset);
+}
+
+int write_full([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, bufferlist& indata, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().write_full(oid, indata);
+#endif
+
+  return io_ctx.write_full(oid, indata);
+}
+
+int trunc([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const uint64_t offset, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().trunc(oid, offset);
+#endif
+
+  return io_ctx.trunc(oid, offset);
+}
+
+int append([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& indata, const uint64_t count, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().append(oid, indata, count);
+#endif
+
+  return io_ctx.append(oid, indata, count);
+}
+
+int setxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, buffer::list& bl, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().setxattr(oid, attr_name.c_str(), bl);
+#endif
+
+  return io_ctx.setxattr(oid, attr_name.c_str(), bl);
+}
+
+int getxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, buffer::list& bl, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().getxattr(oid, attr_name.c_str(), bl);
+#endif
+
+  return io_ctx.getxattr(oid, attr_name.c_str(), bl);
+}
+
+int rmxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().rmxattr(oid, attr_name.c_str());
+#endif
+
+  return io_ctx.rmxattr(oid, attr_name.c_str());
+}
+
+int getxattrs([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, std::map<std::string, buffer::list>& attrset, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().getxattrs(oid, attrset);
+#endif
+
+  return io_ctx.getxattrs(oid, attrset);
+}
+
+int remove([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const int flags, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().remove(oid, flags);
+#endif
+
+  return io_ctx.remove(oid, flags);
+}
+
+int remove([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().remove(oid);
+#endif
+
+  return io_ctx.remove(oid);
+}
+
+std::string get_oid(librados::NObjectIterator& i, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return i->get_oid().substr(0, i->get_oid().length()-17);  // NOTE(review): presumably strips a 17-char striper suffix from the internal name — confirm against libradosstriper's naming scheme
+#endif
+
+  return i->get_oid();
+}
+
+int stat([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, uint64_t& size, time_t& mtime, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().stat(oid, &size, &mtime);
+#endif
+
+  return io_ctx.stat(oid, &size, &mtime);
+}
+
+int stat2([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, uint64_t& size, timespec& mtime, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper)
+    return striper().stat2(oid, &size, &mtime);
+#endif
+
+  return io_ctx.stat2(oid, &size, &mtime);
+}
+
+void dump_name(Formatter *formatter, const librados::NObjectIterator& i, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+  if (use_striper) {
+    formatter->dump_string("name", i->get_oid().substr(0, i->get_oid().length()-17));  // NOTE(review): same suffix-stripping assumption as get_oid() above
+    return;
+  }
+#endif
+
+  formatter->dump_string("name", i->get_oid());
+}
+
+} // namespace detail
+
+unsigned default_op_size = 1 << 22;  // default per-op I/O chunk: 4 MiB
+static const unsigned MAX_OMAP_BYTES_PER_REQUEST = 1 << 10;  // cap on omap bytes fetched per request: 1 KiB
+
+[[noreturn]] static void usage_exit()
+{
+  usage(cerr);  // print the full help to stderr, then terminate with failure
+  exit(1);
+}
+
+
+template <typename I, typename T>
+static int rados_sistrtoll(I &i, T *val) {  // parse an option value (I = map-style iterator of name/value) accepting IEC size suffixes
+  std::string err;
+  *val = strict_iecstrtoll(i->second.c_str(), &err);
+  if (err != "") {  // strict_iecstrtoll reports failure via a non-empty error string
+    cerr << "Invalid value for " << i->first << ": " << err << std::endl;
+    return -EINVAL;
+  } else {
+    return 0;
+  }
+}
+
+
+static int dump_data(std::string const &filename, bufferlist const &data)
+{
+  int fd;
+  if (filename == "-") {
+    fd = STDOUT_FILENO;  // "-" means write to stdout
+  } else {
+    fd = TEMP_FAILURE_RETRY(::open(filename.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644));
+    if (fd < 0) {
+      int err = errno;
+      cerr << "failed to open file: " << cpp_strerror(err) << std::endl;
+      return -err;
+    }
+  }
+
+  int r = data.write_fd(fd);
+
+  if (fd != 1) {  // NOTE(review): compares the literal 1 rather than STDOUT_FILENO used above
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+
+  return r;
+}
+
+
+// Read object `oid` in op_size chunks and write it to `outfile`
+// ("-" means stdout).  Returns 0 on success or a negative error code.
+static int do_get(IoCtx& io_ctx, const std::string& oid, const char *outfile, unsigned op_size, [[maybe_unused]] const bool use_striper)
+{
+  int fd;
+  if (strcmp(outfile, "-") == 0) {
+    fd = STDOUT_FILENO;
+  } else {
+    fd = TEMP_FAILURE_RETRY(::open(outfile, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644));
+    if (fd < 0) {
+      int err = errno;
+      cerr << "failed to open file: " << cpp_strerror(err) << std::endl;
+      return -err;
+    }
+  }
+
+  uint64_t offset = 0;
+  int ret;
+  while (true) {
+    bufferlist outdata;
+
+    ret = detail::read(io_ctx, oid, outdata, op_size, offset, use_striper);
+    if (ret <= 0) {
+      // ret == 0 is clean EOF; ret < 0 propagates the read error.
+      goto out;
+    }
+    ret = outdata.write_fd(fd);
+    if (ret < 0) {
+      cerr << "error writing to file: " << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+    // A short read means we reached the end of the object.
+    if (outdata.length() < op_size)
+      break;
+    offset += outdata.length();
+  }
+  ret = 0;
+
+ out:
+  // Close only fds we opened ourselves; never close stdout.
+  // (Was a literal `1`; use the named constant for consistency.)
+  if (fd != STDOUT_FILENO)
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return ret;
+}
+
+// Server-side copy of `objname` (in io_ctx) to `target_obj` (in
+// target_ctx).  Fadvise hints mark the source stream sequential/no-cache
+// and the destination sequential/dont-need.
+static int do_copy(IoCtx& io_ctx, const char *objname,
+		   IoCtx& target_ctx, const char *target_obj)
+{
+  const uint32_t src_flags =
+    LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+  const uint32_t dest_flags =
+    LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+
+  ObjectWriteOperation op;
+  op.copy_from(objname, io_ctx, 0, src_flags);
+  op.set_op_flags2(dest_flags);
+  return target_ctx.operate(target_obj, &op);
+}
+
+// Copy every object from src_pool to target_pool, preserving namespace
+// and locator key.  Returns 0 on success or the first error encountered.
+static int do_copy_pool(Rados& rados, const char *src_pool, const char *target_pool)
+{
+  IoCtx src_ctx, target_ctx;
+  int ret = rados.ioctx_create(src_pool, src_ctx);
+  if (ret < 0) {
+    cerr << "cannot open source pool: " << src_pool << std::endl;
+    return ret;
+  }
+  ret = rados.ioctx_create(target_pool, target_ctx);
+  if (ret < 0) {
+    cerr << "cannot open target pool: " << target_pool << std::endl;
+    return ret;
+  }
+  // Enumerate across every namespace of the source pool.
+  src_ctx.set_namespace(all_nspaces);
+  librados::NObjectIterator i = src_ctx.nobjects_begin();
+  librados::NObjectIterator i_end = src_ctx.nobjects_end();
+  for (; i != i_end; ++i) {
+    string nspace = i->get_nspace();
+    string oid = i->get_oid();
+    string locator = i->get_locator();
+
+    string target_name = (nspace.size() ? nspace + "/" : "") + oid;
+    string src_name = target_name;
+    if (locator.size())
+      src_name += "(@" + locator + ")";
+    cout << src_pool << ":" << src_name << " => "
+         << target_pool << ":" << target_name << std::endl;
+
+    src_ctx.locator_set_key(locator);
+    src_ctx.set_namespace(nspace);
+    target_ctx.set_namespace(nspace);
+    ret = do_copy(src_ctx, oid.c_str(), target_ctx, oid.c_str());
+    if (ret < 0) {
+      // BUGFIX: report the librados return code, not the global errno —
+      // librados returns negative errno values and does not reliably set
+      // errno, so cpp_strerror(errno) printed a stale/unrelated message.
+      cerr << "error copying object: " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
+// Write the contents of `infile` ("-" means stdin) into object `oid` in
+// op_size chunks starting at obj_offset.  When writing from offset 0 with
+// create_object set, the first chunk goes in via write_full so a larger
+// pre-existing object is fully replaced.  An empty input still creates an
+// empty object (then truncated up to obj_offset to punch a leading hole).
+// Returns 0 on success, 1 if the input file cannot be opened, or a
+// negative error code.
+static int do_put(IoCtx& io_ctx,
+		  const std::string& oid, const char *infile, int op_size,
+		  uint64_t obj_offset, bool create_object,
+		  const bool use_striper)
+{
+  bool stdio = (strcmp(infile, "-") == 0);
+  int ret = 0;
+  int fd = STDIN_FILENO;
+  if (!stdio)
+    fd = open(infile, O_RDONLY|O_BINARY);
+  if (fd < 0) {
+    cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl;
+    return 1;
+  }
+  int count = op_size;
+  uint64_t offset = obj_offset;
+  while (count != 0) {
+    bufferlist indata;
+    count = indata.read_fd(fd, op_size);
+    if (count < 0) {
+      ret = -errno;
+      cerr << "error reading input file " << infile << ": " << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+
+    if (count == 0) {
+      if (offset == obj_offset) { // in case we have to create an empty object & if obj_offset > 0 do a hole
+        ret = detail::write_full(io_ctx, oid, indata, use_striper); // indata is empty
+
+        if (ret < 0) {
+          goto out;
+        }
+
+        if (offset) {
+          ret = detail::trunc(io_ctx, oid, offset, use_striper); // before truncate, object must be existed.
+
+          if (ret < 0) {
+            goto out;
+          }
+        }
+      }
+      continue;
+    }
+
+    if (0 == offset && create_object)
+      ret = detail::write_full(io_ctx, oid, indata, use_striper);
+    else
+      ret = detail::write(io_ctx, oid, indata, count, offset, use_striper);
+
+    if (ret < 0) {
+      goto out;
+    }
+    offset += count;
+  }
+  ret = 0;
+ out:
+  // BUGFIX: close only the fd we opened.  The old guard compared the
+  // *input* fd against STDOUT_FILENO, so reading from "-" closed stdin.
+  if (fd != STDIN_FILENO)
+    VOID_TEMP_FAILURE_RETRY(close(fd));
+  return ret;
+}
+
+// Append the contents of `infile` ("-" means stdin) to object `oid`,
+// reading and appending in op_size chunks.  Returns 0 on success, 1 if
+// the input file cannot be opened, or a negative error code.
+static int do_append(IoCtx& io_ctx,
+		     const std::string& oid, const char *infile, int op_size,
+		     const bool use_striper)
+{
+  bool stdio = (strcmp(infile, "-") == 0);
+  int ret = 0;
+  int fd = STDIN_FILENO;
+  if (!stdio)
+    fd = open(infile, O_RDONLY|O_BINARY);
+  if (fd < 0) {
+    cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl;
+    return 1;
+  }
+  int count = op_size;
+  while (count != 0) {
+    bufferlist indata;
+    count = indata.read_fd(fd, op_size);
+    if (count < 0) {
+      ret = -errno;
+      cerr << "error reading input file " << infile << ": " << cpp_strerror(ret) << std::endl;
+      goto out;
+    }
+    ret = detail::append(io_ctx, oid, indata, count, use_striper);
+
+    if (ret < 0) {
+      goto out;
+    }
+  }
+  ret = 0;
+out:
+  // BUGFIX: close only the fd we opened.  The old guard compared the
+  // *input* fd against STDOUT_FILENO, so reading from "-" closed stdin.
+  if (fd != STDIN_FILENO)
+    VOID_TEMP_FAILURE_RETRY(close(fd));
+  return ret;
+}
+
+// Watch handler for "rados watch": prints every notify (with a hex dump
+// of the payload) and acks it, and prints watch errors.
+class RadosWatchCtx : public librados::WatchCtx2 {
+  IoCtx& ioctx;
+  string name;   // object being watched
+public:
+  RadosWatchCtx(IoCtx& io, const char *imgname) : ioctx(io), name(imgname) {}
+  ~RadosWatchCtx() override {}
+  // Called for each notify on the watched object; acks so the notifier
+  // does not time out waiting on us.
+  void handle_notify(uint64_t notify_id,
+		     uint64_t cookie,
+		     uint64_t notifier_id,
+		     bufferlist& bl) override {
+    cout << "NOTIFY"
+	 << " cookie " << cookie
+	 << " notify_id " << notify_id
+	 << " from " << notifier_id
+	 << std::endl;
+    bl.hexdump(cout);
+    ioctx.notify_ack(name, notify_id, cookie, bl);
+  }
+  // Called when the watch breaks (e.g. connection loss); report only.
+  void handle_error(uint64_t cookie, int err) override {
+    cout << "ERROR"
+	 << " cookie " << cookie
+	 << " err " << cpp_strerror(err)
+	 << std::endl;
+  }
+};
+
+// 64-character alphabet used for random object-name generation.
+static const char alphanum_table[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+
+// Fill dest with (size - 1) random characters from alphanum_table plus a
+// trailing NUL.
+void gen_rand_alphanumeric(char *dest, int size) /* size should be the required string size + 1 */
+{
+  // Highest valid index into the table (exclude the terminating NUL).
+  const int max = sizeof(alphanum_table) - 2;
+
+  int i;
+  for (i=0; i<size - 1; i++) {
+    int pos = generate_random_number(0, max);
+    dest[i] = alphanum_table[pos];
+  }
+  dest[i] = '\0';
+}
+
+// One synthetic object created by LoadGen::bootstrap(): its name and its
+// (randomly chosen) logical length.
+struct obj_info {
+  string name;
+  size_t len;
+};
+
+// Synthetic load generator behind "rados load-gen": bootstrap() creates
+// num_objs random objects, run() issues a read/write mix against them
+// throttled toward target_throughput for run_length seconds, and
+// cleanup() removes them again.  Completion callbacks run on librados
+// threads; `lock` guards the shared counters and pending_ops map.
+class LoadGen {
+  size_t total_sent;        // bytes submitted so far (guarded by lock)
+  size_t total_completed;   // bytes whose AIO completed (guarded by lock)
+
+  IoCtx io_ctx;
+  Rados *rados;
+
+  map<int, obj_info> objs;  // op-index -> object created by bootstrap()
+
+  utime_t start_time;
+
+  bool going_down;          // set by run() during shutdown drain
+
+public:
+  // Tunables, populated from command-line options (defaults in ctor).
+  int read_percent;
+  int num_objs;
+  size_t min_obj_len;
+  size_t max_obj_len;
+  size_t min_op_len;
+  size_t max_op_len;
+  size_t max_ops;
+  size_t max_backlog;
+  size_t target_throughput;
+  size_t offset_align = 0;
+  int run_length;
+
+  enum {
+    OP_READ,
+    OP_WRITE,
+  };
+
+  // One in-flight I/O; deleted by io_cb() when its completion fires.
+  struct LoadGenOp {
+    int id;
+    int type;
+    string oid;
+    size_t off;
+    size_t len;
+    bufferlist bl;
+    LoadGen *lg;
+    librados::AioCompletion *completion;
+
+    LoadGenOp() : id(0), type(0), off(0), len(0), lg(NULL), completion(NULL) {}
+    explicit LoadGenOp(LoadGen *_lg) : id(0), type(0), off(0), len(0), lg(_lg), completion(NULL) {}
+  };
+
+  int max_op;   // next op id to hand out
+
+  map<int, LoadGenOp *> pending_ops;   // guarded by lock
+
+  void gen_op(LoadGenOp *op);
+  uint64_t gen_next_op();
+  void run_op(LoadGenOp *op);
+
+  // Rates in bytes/sec since start_time.
+  uint64_t cur_sent_rate() {
+    return total_sent / time_passed();
+  }
+
+  uint64_t cur_completed_rate() {
+    return total_completed / time_passed();
+  }
+
+  // Bytes we *should* have sent by now at target_throughput.
+  uint64_t total_expected() {
+    return target_throughput * time_passed();
+  }
+
+  // Seconds elapsed since start_time, as a float.
+  float time_passed() {
+    utime_t now = ceph_clock_now();
+    now -= start_time;
+    uint64_t ns = now.nsec();
+    float total = (float) ns / 1000000000.0;
+    total += now.sec();
+    return total;
+  }
+
+  ceph::mutex lock = ceph::make_mutex("LoadGen");
+  ceph::condition_variable cond;
+
+  explicit LoadGen(Rados *_rados) : rados(_rados), going_down(false) {
+    read_percent = 80;
+    min_obj_len = 1024;
+    max_obj_len = 5ull * 1024ull * 1024ull * 1024ull;
+    min_op_len = 1024;
+    target_throughput = 5 * 1024 * 1024; // B/sec
+    max_op_len = 2 * 1024 * 1024;
+    max_ops = 16;
+    max_backlog = target_throughput * 2;
+    run_length = 60;
+
+    total_sent = 0;
+    total_completed = 0;
+    num_objs = 200;
+    max_op = 0;
+  }
+  int bootstrap(const char *pool);
+  int run();
+  void cleanup();
+
+  // AIO completion handler (runs on a librados callback thread): accounts
+  // the completed bytes, prints throughput, removes the op from
+  // pending_ops and deletes it.  During shutdown (going_down) run() holds
+  // references to the completions, so they are released there instead.
+  void io_cb(completion_t c, LoadGenOp *op) {
+    std::lock_guard l{lock};
+
+    total_completed += op->len;
+
+    double rate = (double)cur_completed_rate() / (1024 * 1024);
+    std::streamsize original_precision = cout.precision();
+    cout.precision(3);
+    cout << "op " << op->id << " completed, throughput=" << rate << "MB/sec" << std::endl;
+    cout.precision(original_precision);
+
+    map<int, LoadGenOp *>::iterator iter = pending_ops.find(op->id);
+    if (iter != pending_ops.end())
+      pending_ops.erase(iter);
+
+    if (!going_down)
+      op->completion->release();
+
+    delete op;
+
+    cond.notify_all();
+  }
+};
+
+// C-style trampoline for librados AIO completions: recover the LoadGenOp
+// from the opaque param and forward to its owning LoadGen.
+static void _load_gen_cb(completion_t c, void *param)
+{
+  LoadGen::LoadGenOp *op = (LoadGen::LoadGenOp *)param;
+  op->lg->io_cb(c, op);
+}
+
+// Create num_objs randomly named objects of random length in `pool`.
+// Each object is materialized sparsely by writing a single zero byte at
+// (len - 1), so creation is cheap regardless of object size.  At most
+// max_ops creations are kept in flight; all are drained before return.
+// Returns 0 on success or the first error encountered.
+int LoadGen::bootstrap(const char *pool)
+{
+  char buf[128];
+  int i;
+
+  if (!pool) {
+    cerr << "ERROR: pool name was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  int ret = rados->ioctx_create(pool, io_ctx);
+  if (ret < 0) {
+    cerr << "error opening pool " << pool << ": " << cpp_strerror(ret) << std::endl;
+    return ret;
+  }
+
+  // One-byte zero buffer written at the tail of each object.
+  int buf_len = 1;
+  bufferptr p = buffer::create(buf_len);
+  bufferlist bl;
+  memset(p.c_str(), 0, buf_len);
+  bl.push_back(p);
+
+  list<librados::AioCompletion *> completions;
+  for (i = 0; i < num_objs; i++) {
+    obj_info info;
+    gen_rand_alphanumeric(buf, 16);
+    info.name = "obj-";
+    info.name.append(buf);
+    info.len = generate_random_number(min_obj_len, max_obj_len);
+
+    // throttle...
+    while (completions.size() > max_ops) {
+      AioCompletion *c = completions.front();
+      c->wait_for_complete();
+      ret = c->get_return_value();
+      c->release();
+      completions.pop_front();
+      if (ret < 0) {
+	cerr << "aio_write failed" << std::endl;
+	return ret;
+      }
+    }
+
+    librados::AioCompletion *c = rados->aio_create_completion(nullptr, nullptr);
+    completions.push_back(c);
+    // generate object
+    ret = io_ctx.aio_write(info.name, c, bl, buf_len, info.len - buf_len);
+    if (ret < 0) {
+      cerr << "couldn't write obj: " << info.name << " ret=" << ret << std::endl;
+      return ret;
+    }
+    objs[i] = info;
+  }
+
+  // Drain everything still in flight before declaring success.
+  list<librados::AioCompletion *>::iterator iter;
+  for (iter = completions.begin(); iter != completions.end(); ++iter) {
+    AioCompletion *c = *iter;
+    c->wait_for_complete();
+    ret = c->get_return_value();
+    c->release();
+    if (ret < 0) { // yes, we leak.
+      cerr << "aio_write failed" << std::endl;
+      return ret;
+    }
+  }
+  return 0;
+}
+
+// Launch one previously generated op asynchronously.  For writes, a
+// zero-filled buffer of op->len bytes is attached first.  NOTE: once the
+// AIO is submitted, io_cb() may delete *op at any time on a callback
+// thread, so callers must not touch op afterwards.
+void LoadGen::run_op(LoadGenOp *op)
+{
+  op->completion = rados->aio_create_completion(op, _load_gen_cb);
+
+  switch (op->type) {
+  case OP_READ:
+    io_ctx.aio_read(op->oid, op->completion, &op->bl, op->len, op->off);
+    break;
+  case OP_WRITE:
+    bufferptr p = buffer::create(op->len);
+    memset(p.c_str(), 0, op->len);
+    op->bl.push_back(p);
+
+    io_ctx.aio_write(op->oid, op->completion, op->bl, op->len, op->off);
+    break;
+  }
+
+  total_sent += op->len;
+}
+
+// Fill in a fresh op: pick a random target object, a random length in
+// [min_op_len, max_op_len] clamped to the object size, a random offset
+// clamped so the op stays inside the object (optionally aligned), and a
+// read/write type according to read_percent.
+void LoadGen::gen_op(LoadGenOp *op)
+{
+  int i = generate_random_number<int>(0, objs.size() - 1);
+  obj_info& info = objs[i];
+  op->oid = info.name;
+
+  size_t len = generate_random_number(min_op_len, max_op_len);
+  if (len > info.len)
+    len = info.len;
+  size_t off = generate_random_number<size_t>(0, info.len);
+
+  // Pull the op back so [off, off+len) fits inside the object.
+  if (off + len > info.len)
+    off = info.len - len;
+
+  if (offset_align)
+    off = p2align(off, offset_align);
+
+  op->off = off;
+  op->len = len;
+
+  // read_percent% of ops are reads, the rest writes.
+  i = generate_random_number(1, 100);
+  if (i > read_percent)
+    op->type = OP_WRITE;
+  else
+    op->type = OP_READ;
+
+  cout << (op->type == OP_READ ? "READ" : "WRITE") << " : oid=" << op->oid << " off=" << op->off << " len=" << op->len << std::endl;
+}
+
+// Create, register and launch the next op; returns the number of bytes
+// it will transfer.
+uint64_t LoadGen::gen_next_op()
+{
+  LoadGenOp *op = new LoadGenOp(this);
+  uint64_t len;
+
+  lock.lock();
+  gen_op(op);
+  op->id = max_op++;
+  pending_ops[op->id] = op;
+  // BUGFIX: capture the length *before* submitting the AIO.  The old code
+  // returned op->len after run_op(), but the completion can fire on a
+  // librados thread immediately and io_cb() deletes the op — a
+  // use-after-free race.
+  len = op->len;
+  lock.unlock();
+
+  run_op(op);
+
+  return len;
+}
+
+// Main load loop: wake up roughly once a second (or when a completion
+// signals `cond`), print a throughput line each second, and top up ops
+// while we are behind the target rate and under the backlog/inflight
+// caps.  After run_length seconds, stop issuing, flag going_down so
+// io_cb() leaves completion release to us, and drain all pending I/O.
+int LoadGen::run()
+{
+  start_time = ceph_clock_now();
+  utime_t end_time = start_time;
+  end_time += run_length;
+  utime_t stamp_time = start_time;
+  uint32_t total_sec = 0;
+
+  while (1) {
+    {
+      std::unique_lock l{lock};
+      cond.wait_for(l, 1s);
+    }
+    utime_t now = ceph_clock_now();
+
+    if (now > end_time)
+      break;
+
+    uint64_t expected = total_expected();
+    // Snapshot the shared counters under the lock.
+    lock.lock();
+    uint64_t sent = total_sent;
+    uint64_t completed = total_completed;
+    lock.unlock();
+
+    // Once-a-second progress line.
+    if (now - stamp_time >= utime_t(1, 0)) {
+      double rate = (double)cur_completed_rate() / (1024 * 1024);
+      ++total_sec;
+      std::streamsize original_precision = cout.precision();
+      cout.precision(3);
+      cout << setw(5) << total_sec << ": throughput=" << rate << "MB/sec" << " pending data=" << sent - completed << std::endl;
+      cout.precision(original_precision);
+      stamp_time = now;
+    }
+
+    // Issue ops until we catch up to the target rate, without exceeding
+    // the byte backlog or the in-flight op cap.
+    while (sent < expected &&
+	   sent - completed < max_backlog &&
+	   pending_ops.size() < max_ops) {
+      sent += gen_next_op();
+    }
+  }
+
+  // get a reference to all pending requests
+  vector<librados::AioCompletion *> completions;
+  lock.lock();
+  going_down = true;
+  map<int, LoadGenOp *>::iterator iter;
+  for (iter = pending_ops.begin(); iter != pending_ops.end(); ++iter) {
+    LoadGenOp *op = iter->second;
+    completions.push_back(op->completion);
+  }
+  lock.unlock();
+
+  cout << "waiting for all operations to complete" << std::endl;
+
+  // now wait on all the pending requests
+  for (vector<librados::AioCompletion *>::iterator citer = completions.begin(); citer != completions.end(); ++citer) {
+    librados::AioCompletion *c = *citer;
+    c->wait_for_complete();
+    c->release();
+  }
+
+  return 0;
+}
+
+// Best-effort removal of every object created by bootstrap(); failures
+// are reported but do not stop the sweep.
+void LoadGen::cleanup()
+{
+  cout << "cleaning up objects" << std::endl;
+  for (auto& kv : objs) {
+    obj_info& info = kv.second;
+    int r = io_ctx.remove(info.name);
+    if (r < 0)
+      cerr << "couldn't remove obj: " << info.name << " ret=" << r << std::endl;
+  }
+}
+
+// Bitmask selecting where "rados bench" writes land.  Values are
+// distinct power-of-two flags (2, 4, 8) so they can be OR'd together.
+enum OpWriteDest {
+  OP_WRITE_DEST_OBJ = 2 << 0,
+  OP_WRITE_DEST_OMAP = 2 << 1,
+  OP_WRITE_DEST_XATTR = 2 << 2,
+};
+
+// ObjBencher backend for "rados bench": implements the AIO/sync I/O
+// hooks on top of librados.  Writes can be directed at object data,
+// omap and/or xattrs via set_write_destination().
+class RadosBencher : public ObjBencher {
+  librados::AioCompletion **completions;   // one slot per concurrent io
+  librados::Rados& rados;
+  librados::IoCtx& io_ctx;
+  librados::NObjectIterator oi;            // cursor for get_objects()
+  bool iterator_valid;
+  OpWriteDest write_destination;
+
+protected:
+  // Completion-slot lifecycle -----------------------------------------
+  int completions_init(int concurrentios) override {
+    completions = new librados::AioCompletion *[concurrentios];
+    return 0;
+  }
+  void completions_done() override {
+    delete[] completions;
+    completions = NULL;
+  }
+  int create_completion(int slot, void (*cb)(void *, void*), void *arg) override {
+    completions[slot] = rados.aio_create_completion((void *) arg, cb);
+
+    if (!completions[slot])
+      return -EINVAL;
+
+    return 0;
+  }
+  void release_completion(int slot) override {
+    completions[slot]->release();
+    completions[slot] = 0;
+  }
+
+  // Async I/O ----------------------------------------------------------
+  int aio_read(const std::string& oid, int slot, bufferlist *pbl, size_t len,
+	       size_t offset) override {
+    return io_ctx.aio_read(oid, completions[slot], pbl, len, offset);
+  }
+
+  // Builds a compound write op touching each configured destination
+  // (object data with alloc hints, omap entry, and/or xattr).
+  int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len,
+		size_t offset) override {
+    librados::ObjectWriteOperation op;
+
+    if (write_destination & OP_WRITE_DEST_OBJ) {
+      if (data.hints)
+	op.set_alloc_hint2(data.object_size, data.op_size,
+			   ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
+			   ALLOC_HINT_FLAG_SEQUENTIAL_READ |
+			   ALLOC_HINT_FLAG_APPEND_ONLY |
+			   ALLOC_HINT_FLAG_IMMUTABLE);
+      op.write(offset, bl);
+    }
+
+    if (write_destination & OP_WRITE_DEST_OMAP) {
+      std::map<std::string, librados::bufferlist> omap;
+      omap[string("bench-omap-key-") + stringify(offset)] = bl;
+      op.omap_set(omap);
+    }
+
+    if (write_destination & OP_WRITE_DEST_XATTR) {
+      char key[80];
+      snprintf(key, sizeof(key), "bench-xattr-key-%d", (int)offset);
+      op.setxattr(key, bl);
+    }
+
+    return io_ctx.aio_operate(oid, completions[slot], &op);
+  }
+
+  int aio_remove(const std::string& oid, int slot) override {
+    return io_ctx.aio_remove(oid, completions[slot]);
+  }
+
+  // Synchronous I/O -----------------------------------------------------
+  int sync_read(const std::string& oid, bufferlist& bl, size_t len) override {
+    return io_ctx.read(oid, bl, len, 0);
+  }
+  int sync_write(const std::string& oid, bufferlist& bl, size_t len) override {
+    return io_ctx.write_full(oid, bl);
+  }
+
+  int sync_remove(const std::string& oid) override {
+    return io_ctx.remove(oid);
+  }
+
+  bool completion_is_done(int slot) override {
+    return completions[slot] && completions[slot]->is_complete();
+  }
+
+  int completion_wait(int slot) override {
+    return completions[slot]->wait_for_complete_and_cb();
+  }
+  int completion_ret(int slot) override {
+    return completions[slot]->get_return_value();
+  }
+
+  // Incrementally list up to `num` objects from the pool; returns false
+  // when the listing is exhausted (and resets the cursor).
+  bool get_objects(std::list<Object>* objects, int num) override {
+    int count = 0;
+
+    if (!iterator_valid) {
+      oi = io_ctx.nobjects_begin();
+      iterator_valid = true;
+    }
+
+    librados::NObjectIterator ei = io_ctx.nobjects_end();
+
+    if (oi == ei) {
+      iterator_valid = false;
+      return false;
+    }
+
+    objects->clear();
+    for ( ; oi != ei && count < num; ++oi) {
+      Object obj(oi->get_oid(), oi->get_nspace());
+      objects->push_back(obj);
+      ++count;
+    }
+
+    return true;
+  }
+
+  void set_namespace( const std::string& ns) override {
+    io_ctx.set_namespace(ns);
+  }
+
+public:
+  RadosBencher(CephContext *cct_, librados::Rados& _r, librados::IoCtx& _i)
+    : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false), write_destination(OP_WRITE_DEST_OBJ) {}
+  ~RadosBencher() override { }
+
+  void set_write_destination(OpWriteDest dest) {
+    write_destination = dest;
+  }
+};
+
+// Implements "rados lock <cmd> <oid> ...": dispatches the subcommands
+//   list           — dump all lock names on the object
+//   info <name>    — dump type/tag/lockers of one lock
+//   get <name>     — take the lock (shared/exclusive per --lock-type)
+//   break <name> <locker> — forcibly break another client's lock
+// Options (tag, cookie, description, duration, type) are read from
+// `opts`.  Returns 0 on success, a negative error code otherwise;
+// calls usage_exit() on malformed argument lists.
+static int do_lock_cmd(std::vector<const char*> &nargs,
+                       const std::map < std::string, std::string > &opts,
+                       IoCtx *ioctx,
+                       Formatter *formatter)
+{
+  if (nargs.size() < 3)
+    usage_exit();
+
+  string cmd(nargs[1]);
+  string oid(nargs[2]);
+
+  // Pull the optional lock parameters out of the parsed options map.
+  string lock_tag;
+  string lock_cookie;
+  string lock_description;
+  int lock_duration = 0;
+  ClsLockType lock_type = ClsLockType::EXCLUSIVE;
+
+  map<string, string>::const_iterator i;
+  i = opts.find("lock-tag");
+  if (i != opts.end()) {
+    lock_tag = i->second;
+  }
+  i = opts.find("lock-cookie");
+  if (i != opts.end()) {
+    lock_cookie = i->second;
+  }
+  i = opts.find("lock-description");
+  if (i != opts.end()) {
+    lock_description = i->second;
+  }
+  i = opts.find("lock-duration");
+  if (i != opts.end()) {
+    if (rados_sistrtoll(i, &lock_duration)) {
+      return -EINVAL;
+    }
+  }
+  i = opts.find("lock-type");
+  if (i != opts.end()) {
+    const string& type_str = i->second;
+    if (type_str.compare("exclusive") == 0) {
+      lock_type = ClsLockType::EXCLUSIVE;
+    } else if (type_str.compare("shared") == 0) {
+      lock_type = ClsLockType::SHARED;
+    } else {
+      cerr << "unknown lock type was specified, aborting" << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  // "list" needs no lock name.
+  if (cmd.compare("list") == 0) {
+    list<string> locks;
+    int ret = rados::cls::lock::list_locks(ioctx, oid, &locks);
+    if (ret < 0) {
+      cerr << "ERROR: rados_list_locks(): " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+
+    formatter->open_object_section("object");
+    formatter->dump_string("objname", oid);
+    formatter->open_array_section("locks");
+    list<string>::iterator iter;
+    for (iter = locks.begin(); iter != locks.end(); ++iter) {
+      formatter->open_object_section("lock");
+      formatter->dump_string("name", *iter);
+      formatter->close_section();
+    }
+    formatter->close_section();
+    formatter->close_section();
+    formatter->flush(cout);
+    return 0;
+  }
+
+  // Remaining subcommands require a lock name argument.
+  if (nargs.size() < 4)
+    usage_exit();
+
+  string lock_name(nargs[3]);
+
+  if (cmd.compare("info") == 0) {
+    map<rados::cls::lock::locker_id_t, rados::cls::lock::locker_info_t> lockers;
+    ClsLockType type = ClsLockType::NONE;
+    string tag;
+    int ret = rados::cls::lock::get_lock_info(ioctx, oid, lock_name, &lockers, &type, &tag);
+    if (ret < 0) {
+      cerr << "ERROR: rados_lock_get_lock_info(): " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+
+    formatter->open_object_section("lock");
+    formatter->dump_string("name", lock_name);
+    formatter->dump_string("type", cls_lock_type_str(type));
+    formatter->dump_string("tag", tag);
+    formatter->open_array_section("lockers");
+    map<rados::cls::lock::locker_id_t, rados::cls::lock::locker_info_t>::iterator iter;
+    for (iter = lockers.begin(); iter != lockers.end(); ++iter) {
+      const rados::cls::lock::locker_id_t& id = iter->first;
+      const rados::cls::lock::locker_info_t& info = iter->second;
+      formatter->open_object_section("locker");
+      formatter->dump_stream("name") << id.locker;
+      formatter->dump_string("cookie", id.cookie);
+      formatter->dump_string("description", info.description);
+      formatter->dump_stream("expiration") << info.expiration;
+      formatter->dump_stream("addr") << info.addr.get_legacy_str();
+      formatter->close_section();
+    }
+    formatter->close_section();
+    formatter->close_section();
+    formatter->flush(cout);
+
+    return ret;
+  } else if (cmd.compare("get") == 0) {
+    rados::cls::lock::Lock l(lock_name);
+    l.set_cookie(lock_cookie);
+    l.set_tag(lock_tag);
+    l.set_duration(utime_t(lock_duration, 0));
+    l.set_description(lock_description);
+    int ret;
+    switch (lock_type) {
+    case ClsLockType::SHARED:
+      ret = l.lock_shared(ioctx, oid);
+      break;
+    default:
+      ret = l.lock_exclusive(ioctx, oid);
+    }
+    if (ret < 0) {
+      cerr << "ERROR: failed locking: " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+
+    return ret;
+  }
+
+  // "break" additionally requires the locker's entity name.
+  if (nargs.size() < 5)
+    usage_exit();
+
+  if (cmd.compare("break") == 0) {
+    string locker(nargs[4]);
+    rados::cls::lock::Lock l(lock_name);
+    l.set_cookie(lock_cookie);
+    l.set_tag(lock_tag);
+    entity_name_t name;
+    if (!name.parse(locker)) {
+      cerr << "ERROR: failed to parse locker name (" << locker << ")" << std::endl;
+      return -EINVAL;
+    }
+    int ret = l.break_lock(ioctx, oid, name);
+    if (ret < 0) {
+      cerr << "ERROR: failed breaking lock: " << cpp_strerror(ret) << std::endl;
+      return ret;
+    }
+  } else {
+    usage_exit();
+  }
+
+  return 0;
+}
+
+// Synchronously flush one object out of a cache tier (blocking flush).
+static int do_cache_flush(IoCtx& io_ctx, string oid)
+{
+  ObjectReadOperation op;
+  op.cache_flush();
+
+  librados::AioCompletion *c = librados::Rados::aio_create_completion();
+  io_ctx.aio_operate(oid.c_str(), c, &op,
+		     librados::OPERATION_IGNORE_CACHE |
+		     librados::OPERATION_IGNORE_OVERLAY,
+		     NULL);
+  c->wait_for_complete();
+  const int ret = c->get_return_value();
+  c->release();
+  return ret;
+}
+
+// Non-blocking variant of do_cache_flush(): SKIPRWLOCKS lets the flush
+// proceed without waiting on in-progress client I/O.
+static int do_cache_try_flush(IoCtx& io_ctx, string oid)
+{
+  ObjectReadOperation op;
+  op.cache_try_flush();
+
+  librados::AioCompletion *c = librados::Rados::aio_create_completion();
+  io_ctx.aio_operate(oid.c_str(), c, &op,
+		     librados::OPERATION_IGNORE_CACHE |
+		     librados::OPERATION_IGNORE_OVERLAY |
+		     librados::OPERATION_SKIPRWLOCKS,
+		     NULL);
+  c->wait_for_complete();
+  const int ret = c->get_return_value();
+  c->release();
+  return ret;
+}
+
+// Evict one (already clean) object from a cache tier.
+static int do_cache_evict(IoCtx& io_ctx, string oid)
+{
+  ObjectReadOperation op;
+  op.cache_evict();
+
+  librados::AioCompletion *c = librados::Rados::aio_create_completion();
+  io_ctx.aio_operate(oid.c_str(), c, &op,
+		     librados::OPERATION_IGNORE_CACHE |
+		     librados::OPERATION_IGNORE_OVERLAY |
+		     librados::OPERATION_SKIPRWLOCKS,
+		     NULL);
+  c->wait_for_complete();
+  const int ret = c->get_return_value();
+  c->release();
+  return ret;
+}
+
+// Flush (blocking or try-flush per `blocking`) and evict every object in
+// the cache pool, across all namespaces, including every snapshot clone
+// of each object.  Per-object failures are counted and reported but the
+// sweep continues.  Returns 0 if everything succeeded, -1 otherwise.
+static int do_cache_flush_evict_all(IoCtx& io_ctx, bool blocking)
+{
+  int errors = 0;
+  io_ctx.set_namespace(all_nspaces);
+  try {
+    librados::NObjectIterator i = io_ctx.nobjects_begin();
+    librados::NObjectIterator i_end = io_ctx.nobjects_end();
+    for (; i != i_end; ++i) {
+      int r;
+      cout << i->get_nspace() << "\t" << i->get_oid() << "\t" << i->get_locator() << std::endl;
+      if (i->get_locator().size()) {
+	io_ctx.locator_set_key(i->get_locator());
+      } else {
+	io_ctx.locator_set_key(string());
+      }
+      io_ctx.set_namespace(i->get_nspace());
+      // List the object's snapshot clones to know what needs flushing.
+      snap_set_t ls;
+      io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+      r = io_ctx.list_snaps(i->get_oid(), &ls);
+      if (r < 0) {
+	cerr << "error listing snap shots " << i->get_nspace() << "/" << i->get_oid() << ": "
+	     << cpp_strerror(r) << std::endl;
+	++errors;
+	continue;
+      }
+      std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+      // no snapshots
+      if (ci == ls.clones.end()) {
+	io_ctx.snap_set_read(CEPH_NOSNAP);
+	if (blocking)
+	  r = do_cache_flush(io_ctx, i->get_oid());
+	else
+	  r = do_cache_try_flush(io_ctx, i->get_oid());
+	if (r < 0) {
+	  cerr << "failed to flush " << i->get_nspace() << "/" << i->get_oid() << ": "
+	       << cpp_strerror(r) << std::endl;
+	  ++errors;
+	  continue;
+	}
+	r = do_cache_evict(io_ctx, i->get_oid());
+	if (r < 0) {
+	  cerr << "failed to evict " << i->get_nspace() << "/" << i->get_oid() << ": "
+	       << cpp_strerror(r) << std::endl;
+	  ++errors;
+	  continue;
+	}
+      } else {
+	// has snapshots
+	// NOTE: this inner `ci` intentionally shadows the one above and
+	// walks every clone, flushing/evicting each in turn.
+	for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+	     ci != ls.clones.end(); ++ci) {
+	  io_ctx.snap_set_read(ci->cloneid);
+	  if (blocking)
+	    r = do_cache_flush(io_ctx, i->get_oid());
+	  else
+	    r = do_cache_try_flush(io_ctx, i->get_oid());
+	  if (r < 0) {
+	    cerr << "failed to flush " << i->get_nspace() << "/" << i->get_oid() << ": "
+		 << cpp_strerror(r) << std::endl;
+	    ++errors;
+	    break;
+	  }
+	  r = do_cache_evict(io_ctx, i->get_oid());
+	  if (r < 0) {
+	    cerr << "failed to evict " << i->get_nspace() << "/" << i->get_oid() << ": "
+		 << cpp_strerror(r) << std::endl;
+	    ++errors;
+	    break;
+	  }
+	}
+      }
+    }
+  }
+  catch (const std::exception& e) {
+    cerr << e.what() << std::endl;
+    return -1;
+  }
+  return errors ? -1 : 0;
+}
+
+// Dump the list of inconsistent PGs of the pool named in nargs[1] as a
+// formatted "pgs" array on stdout.  Returns 0 on success or a negative
+// error code; exits via usage_exit() when the pool argument is missing.
+static int do_get_inconsistent_pg_cmd(const std::vector<const char*> &nargs,
+				      Rados& rados,
+				      Formatter& formatter)
+{
+  if (nargs.size() < 2)
+    usage_exit();
+
+  const int64_t pool_id = rados.pool_lookup(nargs[1]);
+  if (pool_id < 0) {
+    cerr << "pool \"" << nargs[1] << "\" not found" << std::endl;
+    return (int)pool_id;
+  }
+
+  std::vector<PlacementGroup> pgs;
+  const int r = rados.get_inconsistent_pgs(pool_id, &pgs);
+  if (r)
+    return r;
+
+  formatter.open_array_section("pgs");
+  for (const auto& pg : pgs)
+    formatter.dump_stream("pg") << pg;
+  formatter.close_section();
+  formatter.flush(cout);
+  cout << std::endl;
+  return 0;
+}
+
+static void dump_errors(const err_t &err, Formatter &f, const char *name)
+{
+ f.open_array_section(name);
+ if (err.has_shard_missing())
+ f.dump_string("error", "missing");
+ if (err.has_stat_error())
+ f.dump_string("error", "stat_error");
+ if (err.has_read_error())
+ f.dump_string("error", "read_error");
+ if (err.has_data_digest_mismatch_info())
+ f.dump_string("error", "data_digest_mismatch_info");
+ if (err.has_omap_digest_mismatch_info())
+ f.dump_string("error", "omap_digest_mismatch_info");
+ if (err.has_size_mismatch_info())
+ f.dump_string("error", "size_mismatch_info");
+ if (err.has_ec_hash_error())
+ f.dump_string("error", "ec_hash_error");
+ if (err.has_ec_size_error())
+ f.dump_string("error", "ec_size_error");
+ if (err.has_info_missing())
+ f.dump_string("error", "info_missing");
+ if (err.has_info_corrupted())
+ f.dump_string("error", "info_corrupted");
+ if (err.has_obj_size_info_mismatch())
+ f.dump_string("error", "obj_size_info_mismatch");
+ if (err.has_snapset_missing())
+ f.dump_string("error", "snapset_missing");
+ if (err.has_snapset_corrupted())
+ f.dump_string("error", "snapset_corrupted");
+ if (err.has_hinfo_missing())
+ f.dump_string("error", "hinfo_missing");
+ if (err.has_hinfo_corrupted())
+ f.dump_string("error", "hinfo_corrupted");
+ f.close_section();
+}
+
+// Dump one shard of an inconsistent object: its per-shard errors, size
+// and digests, plus decoded object_info/snapset/hashinfo attrs when the
+// inconsistency involves them, and the user xattrs on attr mismatches.
+// Corrupted attrs are emitted as (possibly base64) raw strings instead
+// of decoded structures.
+static void dump_shard(const shard_info_t& shard,
+		       const inconsistent_obj_t& inc,
+		       Formatter &f)
+{
+  dump_errors(shard, f, "errors");
+
+  // A missing shard has nothing further to report.
+  if (shard.has_shard_missing())
+    return;
+
+  if (!shard.has_stat_error())
+    f.dump_unsigned("size", shard.size);
+  if (shard.omap_digest_present) {
+    f.dump_format("omap_digest", "0x%08x", shard.omap_digest);
+  }
+  if (shard.data_digest_present) {
+    f.dump_format("data_digest", "0x%08x", shard.data_digest);
+  }
+
+  // Object-info attr: decode and dump when relevant and present.
+  // (const_cast is only to use map::find on the non-const attrs member.)
+  if ((inc.union_shards.has_info_missing()
+     || inc.union_shards.has_info_corrupted()
+     || inc.has_object_info_inconsistency()
+     || shard.has_obj_size_info_mismatch()) &&
+        !shard.has_info_missing()) {
+    map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(OI_ATTR);
+    ceph_assert(k != shard.attrs.end()); // Can't be missing
+    if (!shard.has_info_corrupted()) {
+      object_info_t oi;
+      bufferlist bl;
+      auto bliter = k->second.cbegin();
+      decode(oi, bliter);  // Can't be corrupted
+      f.open_object_section("object_info");
+      oi.dump(&f);
+      f.close_section();
+    } else {
+      bool b64;
+      f.dump_string("object_info", cleanbin(k->second, b64));
+    }
+  }
+  // Snapset attr: same pattern.
+  if ((inc.union_shards.has_snapset_missing()
+     || inc.union_shards.has_snapset_corrupted()
+     || inc.has_snapset_inconsistency()) &&
+        !shard.has_snapset_missing()) {
+    map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(SS_ATTR);
+    ceph_assert(k != shard.attrs.end()); // Can't be missing
+    if (!shard.has_snapset_corrupted()) {
+      SnapSet ss;
+      bufferlist bl;
+      auto bliter = k->second.cbegin();
+      decode(ss, bliter);  // Can't be corrupted
+      f.open_object_section("snapset");
+      ss.dump(&f);
+      f.close_section();
+    } else {
+      bool b64;
+      f.dump_string("snapset", cleanbin(k->second, b64));
+    }
+  }
+  // Erasure-coding hash-info attr: same pattern.
+  if ((inc.union_shards.has_hinfo_missing()
+     || inc.union_shards.has_hinfo_corrupted()
+     || inc.has_hinfo_inconsistency()) &&
+       !shard.has_hinfo_missing()) {
+    map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(ECUtil::get_hinfo_key());
+    ceph_assert(k != shard.attrs.end()); // Can't be missing
+    if (!shard.has_hinfo_corrupted()) {
+      ECUtil::HashInfo hi;
+      bufferlist bl;
+      auto bliter = k->second.cbegin();
+      decode(hi, bliter);  // Can't be corrupted
+      f.open_object_section("hashinfo");
+      hi.dump(&f);
+      f.close_section();
+    } else {
+      bool b64;
+      f.dump_string("hashinfo", cleanbin(k->second, b64));
+    }
+  }
+  // On attr mismatches, dump the user-visible xattrs (leading '_'
+  // stripped); system attrs were handled above.
+  if (inc.has_attr_name_mismatch() || inc.has_attr_value_mismatch()) {
+    f.open_array_section("attrs");
+    for (auto kv : shard.attrs) {
+      // System attribute handled above
+      if (kv.first == OI_ATTR || kv.first[0] != '_')
+        continue;
+      f.open_object_section("attr");
+      // Skip leading underscore since only giving user attrs
+      f.dump_string("name", kv.first.substr(1));
+      bool b64;
+      f.dump_string("value", cleanbin(kv.second, b64));
+      f.dump_bool("Base64", b64);
+      f.close_section();
+    }
+    f.close_section();
+  }
+}
+
+static void dump_obj_errors(const obj_err_t &err, Formatter &f)
+{
+ f.open_array_section("errors");
+ if (err.has_object_info_inconsistency())
+ f.dump_string("error", "object_info_inconsistency");
+ if (err.has_data_digest_mismatch())
+ f.dump_string("error", "data_digest_mismatch");
+ if (err.has_omap_digest_mismatch())
+ f.dump_string("error", "omap_digest_mismatch");
+ if (err.has_size_mismatch())
+ f.dump_string("error", "size_mismatch");
+ if (err.has_attr_value_mismatch())
+ f.dump_string("error", "attr_value_mismatch");
+ if (err.has_attr_name_mismatch())
+ f.dump_string("error", "attr_name_mismatch");
+ if (err.has_snapset_inconsistency())
+ f.dump_string("error", "snapset_inconsistency");
+ if (err.has_hinfo_inconsistency())
+ f.dump_string("error", "hinfo_inconsistency");
+ if (err.has_size_too_large())
+ f.dump_string("error", "size_too_large");
+ f.close_section();
+}
+
+// Emit the identifying fields of an object: name, namespace, locator and
+// snapshot ("head"/"snapdir" for the sentinel snap ids, numeric otherwise).
+static void dump_object_id(const object_id_t& object,
+			   Formatter &f)
+{
+  f.dump_string("name", object.name);
+  f.dump_string("nspace", object.nspace);
+  f.dump_string("locator", object.locator);
+  if (object.snap == CEPH_NOSNAP) {
+    f.dump_string("snap", "head");
+  } else if (object.snap == CEPH_SNAPDIR) {
+    f.dump_string("snap", "snapdir");
+  } else {
+    f.dump_unsigned("snap", object.snap);
+  }
+}
+
+// Dump one inconsistent object: its id/version, object- and shard-level
+// error summaries, the selected (authoritative) object_info if any shard
+// carries one, and then a per-shard breakdown via dump_shard().
+static void dump_inconsistent(const inconsistent_obj_t& inc,
+			      Formatter &f)
+{
+  f.open_object_section("object");
+  dump_object_id(inc.object, f);
+  f.dump_unsigned("version", inc.version);
+  f.close_section();
+
+  dump_obj_errors(inc, f);
+  dump_errors(inc.union_shards, f, "union_shard_errors");
+  // Find the first shard flagged as holding the selected object_info and
+  // dump its decoded OI once.
+  for (const auto& shard_info : inc.shards) {
+    shard_info_t shard = const_cast<shard_info_t&>(shard_info.second);
+    if (shard.selected_oi) {
+      object_info_t oi;
+      bufferlist bl;
+      auto k = shard.attrs.find(OI_ATTR);
+      ceph_assert(k != shard.attrs.end()); // Can't be missing
+      auto bliter = k->second.cbegin();
+      decode(oi, bliter);  // Can't be corrupted
+      f.open_object_section("selected_object_info");
+      oi.dump(&f);
+      f.close_section();
+      break;
+    }
+  }
+  f.open_array_section("shards");
+  for (const auto& shard_info : inc.shards) {
+    f.open_object_section("shard");
+    auto& osd_shard = shard_info.first;
+    f.dump_int("osd", osd_shard.osd);
+    f.dump_bool("primary", shard_info.second.primary);
+    auto shard = osd_shard.shard;
+    if (shard != shard_id_t::NO_SHARD)
+      f.dump_unsigned("shard", shard);
+    dump_shard(shard_info.second, inc, f);
+    f.close_section();
+  }
+  f.close_section();
+}
+
+// Dump one inconsistent snapset: the object's identity, its decoded
+// SnapSet (when a valid one was captured), the snapset-level error flags,
+// and — when applicable — the lists of unexpected and missing clones.
+static void dump_inconsistent(const inconsistent_snapset_t& inc,
+			      Formatter &f)
+{
+  dump_object_id(inc.object, f);
+
+  // A non-empty ss_bl holds an encoded SnapSet captured by the scrub.
+  if (inc.ss_bl.length()) {
+    SnapSet ss;
+    bufferlist bl = inc.ss_bl;
+    auto bliter = bl.cbegin();
+    decode(ss, bliter); // Can't be corrupted
+    f.open_object_section("snapset");
+    ss.dump(&f);
+    f.close_section();
+  }
+  // One {"error": <name>} entry per set error flag.
+  f.open_array_section("errors");
+  if (inc.snapset_missing())
+    f.dump_string("error", "snapset_missing");
+  if (inc.snapset_corrupted())
+    f.dump_string("error", "snapset_corrupted");
+  if (inc.info_missing())
+    f.dump_string("error", "info_missing");
+  if (inc.info_corrupted())
+    f.dump_string("error", "info_corrupted");
+  if (inc.snapset_error())
+    f.dump_string("error", "snapset_error");
+  if (inc.headless())
+    f.dump_string("error", "headless");
+  if (inc.size_mismatch())
+    f.dump_string("error", "size_mismatch");
+  if (inc.extra_clones())
+    f.dump_string("error", "extra_clones");
+  if (inc.clone_missing())
+    f.dump_string("error", "clone_missing");
+  f.close_section();
+
+  // Clones present on disk but not listed in the snapset.
+  if (inc.extra_clones()) {
+    f.open_array_section("extra clones");
+    for (auto snap : inc.clones) {
+      f.dump_unsigned("snap", snap);
+    }
+    f.close_section();
+  }
+
+  // Clones listed in the snapset but not found on disk.
+  if (inc.clone_missing()) {
+    f.open_array_section("missing");
+    for (auto snap : inc.missing) {
+      f.dump_unsigned("snap", snap);
+    }
+    f.close_section();
+  }
+}
+
+// Overload pair that lets the templated do_get_inconsistent_cmd() below
+// dispatch to the right librados pager by item type.
+//
+// Fetch up to max_return inconsistent *objects* of 'pg', resuming after
+// 'start'; completion 'c' signals when 'objs' and 'interval' are filled.
+static int do_get_inconsistent(Rados& rados,
+			       const PlacementGroup& pg,
+			       const librados::object_id_t &start,
+			       unsigned max_return,
+			       AioCompletion *c,
+			       std::vector<inconsistent_obj_t>* objs,
+			       uint32_t* interval)
+{
+  return rados.get_inconsistent_objects(pg, start, max_return, c,
+					objs, interval);
+}
+
+// As above, but fetches inconsistent *snapsets* instead of objects.
+static int do_get_inconsistent(Rados& rados,
+			       const PlacementGroup& pg,
+			       const librados::object_id_t &start,
+			       unsigned max_return,
+			       AioCompletion *c,
+			       std::vector<inconsistent_snapset_t>* snapsets,
+			       uint32_t* interval)
+{
+  return rados.get_inconsistent_snapsets(pg, start, max_return, c,
+					 snapsets, interval);
+}
+
+// Page through the scrub's inconsistency report for one PG and dump every
+// entry with the given formatter.  T selects what is listed
+// (inconsistent_obj_t or inconsistent_snapset_t) via the
+// do_get_inconsistent() overloads above.
+//
+// nargs[1] must name the PG.  Returns 0 on success, a negative errno on
+// failure (-EAGAIN if the PG's interval changed mid-listing, -ENOENT if
+// no scrub information is available).
+template <typename T>
+static int do_get_inconsistent_cmd(const std::vector<const char*> &nargs,
+				   Rados& rados,
+				   Formatter& formatter)
+{
+  if (nargs.size() < 2) {
+    usage_exit();
+  }
+  PlacementGroup pg;
+  int ret = pg.parse(nargs[1]);
+  if (!ret) {
+    cerr << "bad pg: " << nargs[1] << std::endl;
+    // parse() yields 0 on failure; return a real error instead of
+    // propagating that misleading "success" value to the caller.
+    return -EINVAL;
+  }
+  uint32_t interval = 0, first_interval = 0;
+  const unsigned max_item_num = 32;
+  bool opened = false;
+  for (librados::object_id_t start;;) {
+    std::vector<T> items;
+    auto completion = librados::Rados::aio_create_completion();
+    ret = do_get_inconsistent(rados, pg, start, max_item_num, completion,
+			      &items, &interval);
+    completion->wait_for_complete();
+    ret = completion->get_return_value();
+    completion->release();
+    if (ret < 0) {
+      if (ret == -EAGAIN)
+        cerr << "interval#" << interval << " expired." << std::endl;
+      else if (ret == -ENOENT)
+        cerr << "No scrub information available for pg " << pg << std::endl;
+      break;
+    }
+    // It must be the same interval every time. EAGAIN would
+    // occur if interval changes.
+    ceph_assert(start.name.empty() || first_interval == interval);
+    if (start.name.empty()) {
+      // First page: remember the interval and open the output sections.
+      first_interval = interval;
+      formatter.open_object_section("info");
+      formatter.dump_int("epoch", interval);
+      formatter.open_array_section("inconsistents");
+      opened = true;
+    }
+    for (auto& inc : items) {
+      formatter.open_object_section("inconsistent");
+      dump_inconsistent(inc, formatter);
+      formatter.close_section();
+    }
+    if (items.size() < max_item_num) {
+      // Short page: the listing is complete.
+      break;
+    }
+    if (!items.empty()) {
+      start = items.back().object;
+    }
+    items.clear();
+  }
+  if (opened) {
+    // Close both "inconsistents" and "info".  Doing both here (instead of
+    // one inside the loop) also balances the sections when we broke out
+    // early on an error after output had already started.
+    formatter.close_section();
+    formatter.close_section();
+    formatter.flush(cout);
+  }
+  return ret;
+}
+
+// Return 's' unchanged when every byte is printable, otherwise the
+// placeholder "(binary key)" so binary object/key names cannot corrupt
+// terminal output.
+static std::string prettify(const std::string& s)
+{
+  // Route through unsigned char: passing a negative (plain) char to
+  // isprint() is undefined behavior.
+  auto printable = [](unsigned char c) { return std::isprint(c) != 0; };
+  if (std::all_of(s.begin(), s.end(), printable)) {
+    return s;
+  }
+  return "(binary key)";
+}
+
+/**********************************************
+ * rados_tool_common: option parsing and
+ * dispatch for the 'rados' CLI subcommands.
+ **********************************************/
+static int rados_tool_common(const std::map < std::string, std::string > &opts,
+ std::vector<const char*> &nargs)
+{
+ int ret;
+ bool create_pool = false;
+ const char *pool_name = NULL;
+ const char *target_pool_name = NULL;
+ string oloc, target_oloc, nspace, target_nspace;
+ int concurrent_ios = 16;
+ unsigned op_size = default_op_size;
+ unsigned object_size = 0;
+ unsigned max_objects = 0;
+ uint64_t obj_offset = 0;
+ bool obj_offset_specified = false;
+ bool block_size_specified = false;
+ int bench_write_dest = 0;
+ bool cleanup = true;
+ bool hints = true; // for rados bench
+ bool reuse_bench = false;
+ bool no_verify = false;
+ bool use_striper = false;
+ bool with_clones = false;
+ const char *snapname = NULL;
+ snap_t snapid = CEPH_NOSNAP;
+ std::map<std::string, std::string>::const_iterator i;
+
+ uint64_t offset_align = 0;
+ uint64_t min_obj_len = 0;
+ uint64_t max_obj_len = 0;
+ uint64_t min_op_len = 0;
+ uint64_t max_op_len = 0;
+ uint64_t max_ops = 0;
+ uint64_t max_backlog = 0;
+ uint64_t target_throughput = 0;
+ int64_t read_percent = -1;
+ uint64_t num_objs = 0;
+ int run_length = 0;
+
+ bool show_time = false;
+ bool wildcard = false;
+
+ std::string run_name;
+ std::string prefix;
+ bool forcefull = false;
+ unique_ptr<Formatter> formatter = nullptr;
+ bool pretty_format = false;
+ const char *output = NULL;
+ std::optional<std::string> omap_key;
+ std::optional<std::string> obj_name;
+ std::string input_file;
+ bool with_reference = false;
+
+ Rados rados;
+ IoCtx io_ctx;
+
+ i = opts.find("create");
+ if (i != opts.end()) {
+ create_pool = true;
+ }
+ i = opts.find("pool");
+ if (i != opts.end()) {
+ pool_name = i->second.c_str();
+ }
+ i = opts.find("target_pool");
+ if (i != opts.end()) {
+ target_pool_name = i->second.c_str();
+ }
+ i = opts.find("object_locator");
+ if (i != opts.end()) {
+ oloc = i->second;
+ }
+ i = opts.find("target_locator");
+ if (i != opts.end()) {
+ target_oloc = i->second;
+ }
+ i = opts.find("target_nspace");
+ if (i != opts.end()) {
+ target_nspace = i->second;
+ }
+ i = opts.find("concurrent-ios");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &concurrent_ios)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("run-name");
+ if (i != opts.end()) {
+ run_name = i->second;
+ }
+
+ i = opts.find("force-full");
+ if (i != opts.end()) {
+ forcefull = true;
+ }
+ i = opts.find("prefix");
+ if (i != opts.end()) {
+ prefix = i->second;
+ }
+ i = opts.find("block-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &op_size)) {
+ return -EINVAL;
+ }
+ block_size_specified = true;
+ }
+ i = opts.find("object-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &object_size)) {
+ return -EINVAL;
+ }
+ block_size_specified = true;
+ }
+ i = opts.find("max-objects");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_objects)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("offset");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &obj_offset)) {
+ return -EINVAL;
+ }
+ obj_offset_specified = true;
+ }
+ i = opts.find("snap");
+ if (i != opts.end()) {
+ snapname = i->second.c_str();
+ }
+ i = opts.find("snapid");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &snapid)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("min-object-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &min_obj_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-object-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_obj_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("min-op-len");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &min_op_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-op-len");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_op_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-ops");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_ops)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-backlog");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_backlog)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("target-throughput");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &target_throughput)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("read-percent");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &read_percent)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("num-objects");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &num_objs)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("run-length");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &run_length)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("show-time");
+ if (i != opts.end()) {
+ show_time = true;
+ }
+ i = opts.find("no-cleanup");
+ if (i != opts.end()) {
+ cleanup = false;
+ }
+ i = opts.find("no-hints");
+ if (i != opts.end()) {
+ hints = false;
+ }
+ i = opts.find("reuse-bench");
+ if (i != opts.end()) {
+ reuse_bench = true;
+ }
+ i = opts.find("pretty-format");
+ if (i != opts.end()) {
+ pretty_format = true;
+ }
+ i = opts.find("format");
+ if (i != opts.end()) {
+ const char *format = i->second.c_str();
+ formatter.reset(Formatter::create(format));
+ if (!formatter) {
+ cerr << "unrecognized format: " << format << std::endl;
+ return -EINVAL;
+ }
+ }
+ i = opts.find("namespace");
+ if (i != opts.end()) {
+ nspace = i->second;
+ }
+ i = opts.find("no-verify");
+ if (i != opts.end()) {
+ no_verify = true;
+ }
+ i = opts.find("output");
+ if (i != opts.end()) {
+ output = i->second.c_str();
+ }
+ i = opts.find("write-dest-obj");
+ if (i != opts.end()) {
+ bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OBJ);
+ }
+ i = opts.find("write-dest-omap");
+ if (i != opts.end()) {
+ bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OMAP);
+ }
+ i = opts.find("write-dest-xattr");
+ if (i != opts.end()) {
+ bench_write_dest |= static_cast<int>(OP_WRITE_DEST_XATTR);
+ }
+ i = opts.find("with-clones");
+ if (i != opts.end()) {
+ with_clones = true;
+ }
+ i = opts.find("omap-key-file");
+ if (i != opts.end()) {
+ string err;
+ bufferlist indata;
+ ret = indata.read_file(i->second.c_str(), &err);
+ if (ret < 0) {
+ cerr << err << std::endl;
+ return 1;
+ }
+ omap_key = std::string(indata.c_str(), indata.length());
+ }
+ i = opts.find("obj-name-file");
+ if (i != opts.end()) {
+ string err;
+ bufferlist indata;
+ ret = indata.read_file(i->second.c_str(), &err);
+ if (ret < 0) {
+ cerr << err << std::endl;
+ return 1;
+ }
+ obj_name = std::string(indata.c_str(), indata.length());
+ }
+ i = opts.find("offset_align");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &offset_align)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("with-reference");
+ if (i != opts.end()) {
+ with_reference = true;
+ }
+ i = opts.find("input_file");
+ if (i != opts.end()) {
+ input_file = i->second;
+ }
+
+ // open rados
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ if (create_pool && !pool_name) {
+ cerr << "--create-pool requested but pool_name was not specified!" << std::endl;
+ usage(cerr);
+ return 1;
+ }
+
+ if (create_pool) {
+ ret = rados.pool_create(pool_name);
+ if (ret < 0) {
+ cerr << "error creating pool " << pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+
+ i = opts.find("pgid");
+ boost::optional<pg_t> pgid(i != opts.end(), pg_t());
+ if (pgid && (!pgid->parse(i->second.c_str()) || (pool_name && rados.pool_lookup(pool_name) != pgid->pool()))) {
+ cerr << "invalid pgid" << std::endl;
+ return 1;
+ }
+
+ // open io context.
+ if (pool_name || pgid) {
+ ret = pool_name ? rados.ioctx_create(pool_name, io_ctx) : rados.ioctx_create2(pgid->pool(), io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << (pool_name ? pool_name : std::string("with id ") + std::to_string(pgid->pool())) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ // align op_size
+ {
+ bool requires;
+ ret = io_ctx.pool_requires_alignment2(&requires);
+ if (ret < 0) {
+ cerr << "error checking pool alignment requirement"
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ if (requires) {
+ uint64_t align = 0;
+ ret = io_ctx.pool_required_alignment2(&align);
+ if (ret < 0) {
+ cerr << "error getting pool alignment"
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ const uint64_t prev_op_size = op_size;
+ op_size = uint64_t((op_size + align - 1) / align) * align;
+ // Warn: if user specified and it was rounded
+ if (prev_op_size != default_op_size && prev_op_size != op_size)
+ cerr << "INFO: op_size has been rounded to " << op_size << std::endl;
+ }
+ }
+
+#ifdef WITH_LIBRADOSSTRIPER
+ // create striper interface
+ if (opts.find("striper") != opts.end()) {
+ // Note that this call does a tricky thing by reaching into a "singleton". We count
+ // on this happening only once:
+ ret = RadosStriper::striper_create(io_ctx, &detail::striper());
+ if (0 != ret) {
+ cerr << "error opening pool " << pool_name << " with striper interface: "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ use_striper = true;
+ }
+#endif // USE_LIBRADOSSTRIPER
+ }
+
+ // snapname?
+ if (snapname) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --snap" << std::endl;
+ return 1;
+ }
+ ret = io_ctx.snap_lookup(snapname, &snapid);
+ if (ret < 0) {
+ cerr << "error looking up snap '" << snapname << "': " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ if (oloc.size()) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --object-locator" << std::endl;
+ return 1;
+ }
+ io_ctx.locator_set_key(oloc);
+ }
+ // Use namespace from command line if specified
+ if (opts.find("namespace") != opts.end()) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --namespace" << std::endl;
+ return 1;
+ }
+ io_ctx.set_namespace(nspace);
+ // Use wildcard if --all specified and --default NOT specified
+ } else if (opts.find("all") != opts.end() && opts.find("default") == opts.end()) {
+ // Only the ls should ever set namespace to special value
+ wildcard = true;
+ }
+ if (snapid != CEPH_NOSNAP) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --snapid" << std::endl;
+ return 1;
+ }
+ string name;
+ ret = io_ctx.snap_get_name(snapid, &name);
+ if (ret < 0) {
+ cerr << "snapid " << snapid << " doesn't exist in pool "
+ << io_ctx.get_pool_name() << std::endl;
+ return 1;
+ }
+ io_ctx.snap_set_read(snapid);
+ cout << "selected snap " << snapid << " '" << name << "'" << std::endl;
+ }
+
+ ceph_assert(!nargs.empty());
+
+ // list pools?
+ if (strcmp(nargs[0], "lspools") == 0) {
+ list<string> vec;
+ ret = rados.pool_list(vec);
+ if (ret < 0) {
+ cerr << "error listing pools: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (list<string>::iterator i = vec.begin(); i != vec.end(); ++i)
+ cout << *i << std::endl;
+ }
+ else if (strcmp(nargs[0], "df") == 0) {
+ // pools
+ list<string> vec;
+
+ if (!pool_name) {
+ ret = rados.pool_list(vec);
+ if (ret < 0) {
+ cerr << "error listing pools: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else {
+ vec.push_back(pool_name);
+ }
+
+ map<string,librados::pool_stat_t> stats;
+ ret = rados.get_pool_stats(vec, stats);
+ if (ret < 0) {
+ cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ TextTable tab;
+
+ if (!formatter) {
+ tab.define_column("POOL_NAME", TextTable::LEFT, TextTable::LEFT);
+ tab.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("OBJECTS", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("CLONES", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("COPIES", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("MISSING_ON_PRIMARY", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("UNFOUND", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("DEGRADED", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("RD_OPS", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("RD", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("WR_OPS", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("WR", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("USED COMPR", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("UNDER COMPR", TextTable::RIGHT, TextTable::RIGHT);
+ } else {
+ formatter->open_object_section("stats");
+ formatter->open_array_section("pools");
+ }
+ for (map<string,librados::pool_stat_t>::iterator i = stats.begin();
+ i != stats.end();
+ ++i) {
+ const char *pool_name = i->first.c_str();
+ librados::pool_stat_t& s = i->second;
+ if (!formatter) {
+ tab << pool_name
+ << byte_u_t(s.num_bytes)
+ << s.num_objects
+ << s.num_object_clones
+ << s.num_object_copies
+ << s.num_objects_missing_on_primary
+ << s.num_objects_unfound
+ << s.num_objects_degraded
+ << s.num_rd
+ << byte_u_t(s.num_rd_kb << 10)
+ << s.num_wr
+ << byte_u_t(s.num_wr_kb << 10)
+ << byte_u_t(s.compressed_bytes_alloc)
+ << byte_u_t(s.compressed_bytes_orig)
+ << TextTable::endrow;
+ } else {
+ formatter->open_object_section("pool");
+ int64_t pool_id = rados.pool_lookup(pool_name);
+ formatter->dump_string("name", pool_name);
+ if (pool_id >= 0)
+ formatter->dump_int("id", pool_id);
+ else
+ cerr << "ERROR: lookup_pg_pool_name for name=" << pool_name
+ << " returned " << pool_id << std::endl;
+ formatter->dump_int("size_bytes",s.num_bytes);
+ formatter->dump_int("size_kb", s.num_kb);
+ formatter->dump_int("num_objects", s.num_objects);
+ formatter->dump_int("num_object_clones", s.num_object_clones);
+ formatter->dump_int("num_object_copies", s.num_object_copies);
+ formatter->dump_int("num_objects_missing_on_primary", s.num_objects_missing_on_primary);
+ formatter->dump_int("num_objects_unfound", s.num_objects_unfound);
+ formatter->dump_int("num_objects_degraded", s.num_objects_degraded);
+ formatter->dump_int("read_ops", s.num_rd);
+ formatter->dump_int("read_bytes", s.num_rd_kb * 1024ull);
+ formatter->dump_int("write_ops", s.num_wr);
+ formatter->dump_int("write_bytes", s.num_wr_kb * 1024ull);
+ formatter->dump_int("compress_bytes_used", s.compressed_bytes_alloc);
+ formatter->dump_int("compress_under_bytes", s.compressed_bytes_orig);
+ formatter->close_section();
+ }
+ }
+
+ if (!formatter) {
+ cout << tab;
+ }
+
+ // total
+ cluster_stat_t tstats;
+ ret = rados.cluster_stat(tstats);
+ if (ret < 0) {
+ cerr << "error getting total cluster usage: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ if (!formatter) {
+ cout << std::endl;
+ cout << "total_objects " << tstats.num_objects
+ << std::endl;
+ cout << "total_used " << byte_u_t(tstats.kb_used << 10)
+ << std::endl;
+ cout << "total_avail " << byte_u_t(tstats.kb_avail << 10)
+ << std::endl;
+ cout << "total_space " << byte_u_t(tstats.kb << 10)
+ << std::endl;
+ } else {
+ formatter->close_section();
+ formatter->dump_int("total_objects", tstats.num_objects);
+ formatter->dump_int("total_used", tstats.kb_used);
+ formatter->dump_int("total_avail", tstats.kb_avail);
+ formatter->dump_int("total_space", tstats.kb);
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+ }
+
+ else if (strcmp(nargs[0], "ls") == 0) {
+ if (!pool_name && !pgid) {
+ cerr << "either pool name or pg id needs to be specified" << std::endl;
+ return 1;
+ }
+
+ if (wildcard) {
+ io_ctx.set_namespace(all_nspaces);
+ }
+ bool use_stdout = (!output && (nargs.size() < 2 || (strcmp(nargs[1], "-") == 0)));
+ if (!use_stdout && !output) {
+ cerr << "Please use --output to specify the output file name" << std::endl;
+ return 1;
+ }
+
+ ostream *outstream;
+ if (use_stdout) {
+ outstream = &cout;
+ } else {
+ outstream = new ofstream(output);
+ }
+
+ {
+ if (formatter) {
+ formatter->open_array_section("objects");
+ }
+ try {
+ librados::NObjectIterator i = pgid ? io_ctx.nobjects_begin(pgid->ps()) : io_ctx.nobjects_begin();
+ const librados::NObjectIterator i_end = io_ctx.nobjects_end();
+ for (; i != i_end; ++i) {
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper) {
+ // in case of --striper option, we only list striped
+ // objects, so we only display the first object of
+ // each, without its suffix '.000...000'
+ size_t l = i->get_oid().length();
+ if (l <= 17 ||
+ (0 != i->get_oid().compare(l-17, 17,".0000000000000000"))) {
+ continue;
+ }
+ }
+#endif // WITH_LIBRADOSSTRIPER
+ if (pgid) {
+ uint32_t ps;
+ if (io_ctx.get_object_pg_hash_position2(i->get_oid(), &ps) || pgid->ps() != ps) {
+ break;
+ }
+ }
+ if (!formatter) {
+ // Only include namespace in output when wildcard specified
+ if (wildcard) {
+ *outstream << i->get_nspace() << "\t";
+ }
+ *outstream << detail::get_oid(i, use_striper);
+ if (i->get_locator().size()) {
+ *outstream << "\t" << i->get_locator();
+ }
+ *outstream << std::endl;
+ } else {
+ formatter->open_object_section("object");
+ formatter->dump_string("namespace", i->get_nspace());
+
+ detail::dump_name(formatter.get(), i, use_striper);
+
+ if (i->get_locator().size()) {
+ formatter->dump_string("locator", i->get_locator());
+ }
+ formatter->close_section(); //object
+
+ constexpr int TARGET_BYTES_PER_FLUSH = 4096;
+ if (formatter->get_len() >= TARGET_BYTES_PER_FLUSH) {
+ formatter->flush(*outstream);
+ }
+ }
+ }
+ }
+ catch (const std::exception& e) {
+ cerr << e.what() << std::endl;
+ return 1;
+ }
+ }
+ if (formatter) {
+ formatter->close_section(); //objects
+ formatter->flush(*outstream);
+ if (pretty_format) {
+ *outstream << std::endl;
+ }
+ formatter->flush(*outstream);
+ }
+ if (!stdout) {
+ delete outstream;
+ }
+ }
+ else if (strcmp(nargs[0], "mapext") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ std::map<uint64_t,uint64_t> m;
+ ret = io_ctx.mapext(*obj_name, 0, -1, m);
+ if (ret < 0) {
+ cerr << "mapext error on " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ std::map<uint64_t,uint64_t>::iterator iter;
+ for (iter = m.begin(); iter != m.end(); ++iter) {
+ cout << hex << iter->first << "\t" << iter->second << dec << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "stat") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ uint64_t size;
+ time_t mtime;
+
+ ret = detail::stat(io_ctx, *obj_name, size, mtime, use_striper);
+
+ if (ret < 0) {
+ cerr << " error stat-ing " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ utime_t t(mtime, 0);
+ cout << pool_name << "/" << prettify(*obj_name)
+ << " mtime " << t << ", size " << size << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "stat2") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ uint64_t size;
+ struct timespec mtime;
+
+ ret = detail::stat2(io_ctx, *obj_name, size, mtime, use_striper);
+
+ if (ret < 0) {
+ cerr << " error stat-ing " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ utime_t t(mtime);
+ cout << pool_name << "/" << prettify(*obj_name)
+ << " mtime " << t << ", size " << size << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "touch") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ time_t timestamp = time(NULL);
+ if (nargs.size() > (obj_name ? 1 : 2)) {
+ char* endptr = NULL;
+ timestamp = static_cast<time_t>(strtoll(nargs[obj_name ? 1 : 2], &endptr, 10));
+ if (*endptr) {
+ cerr << "Invalid value for timestamp: '" << nargs[obj_name ? 1 : 2] << "'" << std::endl;
+ ret = -EINVAL;
+ return 1;
+ }
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectWriteOperation op;
+ op.create(false);
+ op.mtime(&timestamp);
+ ret = io_ctx.operate(*obj_name, &op);
+ if (ret < 0) {
+ cerr << " error touch-ing " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "get") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ const char* out_filename;
+ if (obj_name) {
+ out_filename = nargs[1];
+ } else {
+ obj_name = nargs[1];
+ out_filename = nargs[2];
+ }
+ ret = do_get(io_ctx, *obj_name, out_filename, op_size, use_striper);
+ if (ret < 0) {
+ cerr << "error getting " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "put") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ const char* in_filename;
+ if (obj_name) {
+ in_filename = nargs[1];
+ } else {
+ obj_name = nargs[1];
+ in_filename = nargs[2];
+ }
+ bool create_object = !obj_offset_specified;
+ ret = do_put(io_ctx, *obj_name, in_filename, op_size, obj_offset, create_object, use_striper);
+ if (ret < 0) {
+ cerr << "error putting " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "append") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ const char* in_filename;
+ if (obj_name) {
+ in_filename = nargs[1];
+ } else {
+ obj_name = nargs[1];
+ in_filename = nargs[2];
+ }
+ ret = do_append(io_ctx, *obj_name, in_filename, op_size, use_striper);
+ if (ret < 0) {
+ cerr << "error appending " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "truncate") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+
+ char* endptr = NULL;
+ long size;
+ if (!obj_name) {
+ obj_name = nargs[1];
+ size = strtoll(nargs[2], &endptr, 10);
+ } else {
+ size = strtoll(nargs[1], &endptr, 10);
+ }
+ if (*endptr) {
+ cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl;
+ ret = -EINVAL;
+ return 1;
+ }
+ if (size < 0) {
+ cerr << "error, cannot truncate to negative value" << std::endl;
+ usage(cerr);
+ return 1;
+ }
+
+ ret = detail::trunc(io_ctx, *obj_name, size, use_striper);
+
+ if (ret < 0) {
+ cerr << "error truncating oid "
+ << prettify(*obj_name) << " to " << size << ": "
+ << cpp_strerror(ret) << std::endl;
+ } else {
+ ret = 0;
+ }
+ }
+ else if (strcmp(nargs[0], "setxattr") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3) ||
+ nargs.size() > (obj_name ? 3 : 4)) {
+ usage(cerr);
+ return 1;
+ }
+ string attr_name(nargs[obj_name ? 1 : 2]);
+ bufferlist bl;
+ if (nargs.size() == (obj_name ? 3 : 4)) {
+ string attr_val(nargs[obj_name ? 2 : 3]);
+ bl.append(attr_val.c_str(), attr_val.length());
+ } else {
+ do {
+ ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin
+ if (ret < 0)
+ return 1;
+ } while (ret > 0);
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+
+ ret = detail::setxattr(io_ctx, *obj_name, attr_name, bl, use_striper);
+
+ if (ret < 0) {
+ cerr << "error setting xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+ }
+ else if (strcmp(nargs[0], "getxattr") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ string attr_name(nargs[obj_name ? 1 : 2]);
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ bufferlist bl;
+ ret = detail::getxattr(io_ctx, *obj_name, attr_name, bl, use_striper);
+
+ if (ret < 0) {
+ cerr << "error getting xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+ string s(bl.c_str(), bl.length());
+ cout << s;
+ } else if (strcmp(nargs[0], "rmxattr") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+
+ string attr_name(nargs[obj_name ? 1 : 2]);
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ret = detail::rmxattr(io_ctx, *obj_name, attr_name, use_striper);
+
+ if (ret < 0) {
+ cerr << "error removing xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "listxattr") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ bufferlist bl;
+ map<std::string, bufferlist> attrset;
+
+ ret = detail::getxattrs(io_ctx, *obj_name, attrset, use_striper);
+
+ if (ret < 0) {
+ cerr << "error getting xattr set " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ for (map<std::string, bufferlist>::iterator iter = attrset.begin();
+ iter != attrset.end(); ++iter) {
+ cout << iter->first << std::endl;
+ }
+ } else if (strcmp(nargs[0], "getomapheader") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ string outfile;
+ if (nargs.size() >= (obj_name ? 2 : 3)) {
+ outfile = nargs[obj_name ? 1 : 2];
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ bufferlist header;
+ ret = io_ctx.omap_get_header(*obj_name, &header);
+ if (ret < 0) {
+ cerr << "error getting omap header " << pool_name << "/" << prettify(*obj_name)
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ if (!outfile.empty()) {
+ cerr << "Writing to " << outfile << std::endl;
+ dump_data(outfile, header);
+ } else {
+ cout << "header (" << header.length() << " bytes) :\n";
+ header.hexdump(cout);
+ cout << std::endl;
+ }
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "setomapheader") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+
+ bufferlist bl;
+ if (!obj_name) {
+ obj_name = nargs[1];
+ bl.append(nargs[2]); // val
+ } else {
+ bl.append(nargs[1]); // val
+ }
+ ret = io_ctx.omap_set_header(*obj_name, bl);
+ if (ret < 0) {
+ cerr << "error setting omap value " << pool_name << "/" << prettify(*obj_name)
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "setomapval") == 0) {
+ uint32_t min_args = (omap_key ? 2 : 3);
+ if (!pool_name || nargs.size() < min_args || nargs.size() > min_args + 1) {
+ usage(cerr);
+ return 1;
+ }
+
+ string oid(nargs[1]);
+ if (!omap_key) {
+ omap_key = nargs[2];
+ }
+
+ bufferlist bl;
+ if (!input_file.empty()) {
+ string err;
+ ret = bl.read_file(input_file.c_str(), &err);
+ if (ret < 0) {
+ cerr << "error reading file " << input_file.c_str() << ": " << err << std::endl;
+ return 1;
+ }
+ } else if (nargs.size() > min_args) {
+ string val(nargs[min_args]);
+ bl.append(val);
+ } else {
+ do {
+ ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin
+ if (ret < 0) {
+ return 1;
+ }
+ } while (ret > 0);
+ }
+
+ map<string, bufferlist> values;
+ values[*omap_key] = bl;
+
+ ret = io_ctx.omap_set(oid, values);
+ if (ret < 0) {
+ cerr << "error setting omap value " << pool_name << "/" << oid << "/"
+ << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "getomapval") == 0) {
+ uint32_t min_args = (omap_key ? (obj_name ? 1 : 2)
+ : (obj_name ? 2 : 3));
+ if (!pool_name || nargs.size() < min_args || nargs.size() > min_args + 1) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (!omap_key) {
+ omap_key = nargs[obj_name ? 1 : 2];
+ }
+
+ set<string> keys;
+ keys.insert(*omap_key);
+
+ std::string outfile;
+ if (nargs.size() > min_args) {
+ outfile = nargs[min_args];
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+
+ map<string, bufferlist> values;
+ ret = io_ctx.omap_get_vals_by_keys(*obj_name, keys, &values);
+ if (ret < 0) {
+ cerr << "error getting omap value " << pool_name << "/" << prettify(*obj_name) << "/"
+ << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+
+ if (values.size() && values.begin()->first == *omap_key) {
+ if (!outfile.empty()) {
+ cerr << "Writing to " << outfile << std::endl;
+ dump_data(outfile, values.begin()->second);
+ } else {
+ cout << "value (" << values.begin()->second.length() << " bytes) :\n";
+ values.begin()->second.hexdump(cout);
+ cout << std::endl;
+ }
+ ret = 0;
+ } else {
+ cout << "No such key: " << pool_name << "/" << prettify(*obj_name) << "/"
+ << prettify(*omap_key) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "rmomapkey") == 0) {
+ uint32_t num_args = (omap_key ? (obj_name ? 1 : 2)
+ : (obj_name ? 2 : 3));
+ if (!pool_name || nargs.size() != num_args) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (!omap_key) {
+ omap_key = nargs[obj_name ? 1 : 2];
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ set<string> keys;
+ keys.insert(*omap_key);
+
+ ret = io_ctx.omap_rm_keys(*obj_name, keys);
+ if (ret < 0) {
+ cerr << "error removing omap key " << pool_name << "/" << prettify(*obj_name) << "/"
+ << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "clearomap") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ // strip nargs[0] which is "clearomap"
+ std::vector<std::string> oids(std::next(std::begin(nargs)),
+ std::end(nargs));
+ if (obj_name) {
+ oids.push_back(*obj_name);
+ }
+
+ for (const auto& oid : oids) {
+ ret = io_ctx.omap_clear(oid);
+ if (ret < 0) {
+ cerr << "error clearing omap keys " << pool_name << "/" << prettify(*obj_name) << "/"
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ ret = 0;
+ } else if (strcmp(nargs[0], "listomapvals") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ string last_read = "";
+ do {
+ map<string, bufferlist> values;
+ ret = io_ctx.omap_get_vals(*obj_name, last_read, MAX_OMAP_BYTES_PER_REQUEST, &values);
+ if (ret < 0) {
+ cerr << "error getting omap keys " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ ret = values.size();
+ for (map<string, bufferlist>::const_iterator it = values.begin();
+ it != values.end(); ++it) {
+ last_read = it->first;
+ // dump key in hex if it contains nonprintable characters
+ if (std::count_if(it->first.begin(), it->first.end(),
+ (int (*)(int))isprint) < (int)it->first.length()) {
+ cout << "key (" << it->first.length() << " bytes):\n";
+ bufferlist keybl;
+ keybl.append(it->first);
+ keybl.hexdump(cout);
+ } else {
+ cout << it->first;
+ }
+ cout << std::endl;
+ cout << "value (" << it->second.length() << " bytes) :\n";
+ it->second.hexdump(cout);
+ cout << std::endl;
+ }
+ } while (ret == MAX_OMAP_BYTES_PER_REQUEST);
+ ret = 0;
+ }
+ else if (strcmp(nargs[0], "cp") == 0) {
+ // XXX: binary names aren't supported for this operation
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (nargs.size() < 2 || nargs.size() > 3) {
+ usage(cerr);
+ return 1;
+ }
+
+ const char *target = target_pool_name;
+ if (!target)
+ target = pool_name;
+
+ const char *target_obj;
+ if (nargs.size() < 3) {
+ if (strcmp(target, pool_name) == 0) {
+ cerr << "cannot copy object into itself" << std::endl;
+ return 1;
+ }
+ target_obj = nargs[1];
+ } else {
+ target_obj = nargs[2];
+ }
+
+ // open io context.
+ IoCtx target_ctx;
+ ret = rados.ioctx_create(target, target_ctx);
+ if (ret < 0) {
+ cerr << "error opening target pool " << target << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ if (target_oloc.size()) {
+ target_ctx.locator_set_key(target_oloc);
+ }
+ if (target_nspace.size()) {
+ target_ctx.set_namespace(target_nspace);
+ }
+
+ ret = do_copy(io_ctx, nargs[1], target_ctx, target_obj);
+ if (ret < 0) {
+ cerr << "error copying " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "rm") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ // strip nargs[0] which is "rm"
+ std::vector<std::string> oids(std::next(std::begin(nargs)),
+ std::end(nargs));
+ if (obj_name) {
+ oids.push_back(*obj_name);
+ }
+ for (const auto& oid : oids) {
+ if (forcefull) {
+ ret = detail::remove(io_ctx, oid, (CEPH_OSD_FLAG_FULL_FORCE |
+ CEPH_OSD_FLAG_FULL_TRY), use_striper);
+ } else {
+ ret = detail::remove(io_ctx, oid, use_striper);
+ }
+
+ if (ret < 0) {
+ string name = (nspace.size() ? nspace + "/" : "" ) + prettify(oid);
+ cerr << "error removing " << pool_name << ">" << name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ }
+ else if (strcmp(nargs[0], "create") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ret = io_ctx.create(*obj_name, true);
+ if (ret < 0) {
+ cerr << "error creating " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "cppool") == 0) {
+ bool force = nargs.size() == 4 && !strcmp(nargs[3], "--yes-i-really-mean-it");
+ if (nargs.size() != 3 && !(nargs.size() == 4 && force)) {
+ usage(cerr);
+ return 1;
+ }
+ const char *src_pool = nargs[1];
+ const char *target_pool = nargs[2];
+
+ if (strcmp(src_pool, target_pool) == 0) {
+ cerr << "cannot copy pool into itself" << std::endl;
+ return 1;
+ }
+
+ cerr << "WARNING: pool copy does not preserve user_version, which some "
+ << " apps may rely on." << std::endl;
+
+ if (rados.get_pool_is_selfmanaged_snaps_mode(src_pool)) {
+ cerr << "WARNING: pool " << src_pool << " has selfmanaged snaps, which are not preserved\n"
+ << " by the cppool operation. This will break any snapshot user."
+ << std::endl;
+ if (!force) {
+ cerr << " If you insist on making a broken copy, you can pass\n"
+ << " --yes-i-really-mean-it to proceed anyway."
+ << std::endl;
+ exit(1);
+ }
+ }
+
+ ret = do_copy_pool(rados, src_pool, target_pool);
+ if (ret < 0) {
+ cerr << "error copying pool " << src_pool << " => " << target_pool << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "successfully copied pool " << nargs[1] << std::endl;
+ }
+ else if (strcmp(nargs[0], "purge") == 0) {
+ if (nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+ if (nargs.size() < 3 ||
+ strcmp(nargs[2], "--yes-i-really-really-mean-it") != 0) {
+ cerr << "WARNING:\n"
+ << " This will PERMANENTLY DESTROY all objects from a pool with no way back.\n"
+ << " To confirm, follow pool with --yes-i-really-really-mean-it" << std::endl;
+ return 1;
+ }
+ ret = rados.ioctx_create(nargs[1], io_ctx);
+ if (ret < 0) {
+ cerr << "error pool " << nargs[1] << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ io_ctx.set_namespace(all_nspaces);
+ io_ctx.set_pool_full_try();
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
+ ret = bencher.clean_up_slow("", concurrent_ios);
+ if (ret >= 0) {
+ cout << "successfully purged pool " << nargs[1] << std::endl;
+ } else { //error
+ cerr << "pool " << nargs[1] << " could not be purged" << std::endl;
+ cerr << "Check your monitor configuration - `mon allow pool delete` is set to false by default,"
+ << " change it to true to allow deletion of pools" << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "lssnap") == 0) {
+ if (!pool_name || nargs.size() != 1) {
+ usage(cerr);
+ return 1;
+ }
+
+ vector<snap_t> snaps;
+ io_ctx.snap_list(&snaps);
+ for (vector<snap_t>::iterator i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ string s;
+ time_t t;
+ if (io_ctx.snap_get_name(*i, &s) < 0)
+ continue;
+ if (io_ctx.snap_get_stamp(*i, &t) < 0)
+ continue;
+ struct tm bdt;
+ localtime_r(&t, &bdt);
+ cout << *i << "\t" << s << "\t";
+
+ std::ios_base::fmtflags original_flags = cout.flags();
+ cout.setf(std::ios::right);
+ cout.fill('0');
+ cout << std::setw(4) << (bdt.tm_year+1900)
+ << '.' << std::setw(2) << (bdt.tm_mon+1)
+ << '.' << std::setw(2) << bdt.tm_mday
+ << ' '
+ << std::setw(2) << bdt.tm_hour
+ << ':' << std::setw(2) << bdt.tm_min
+ << ':' << std::setw(2) << bdt.tm_sec
+ << std::endl;
+ cout.flags(original_flags);
+ }
+ cout << snaps.size() << " snaps" << std::endl;
+ }
+
+ else if (strcmp(nargs[0], "mksnap") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (rados.get_pool_is_selfmanaged_snaps_mode(pool_name)) {
+ cerr << "can't create snapshot: pool " << pool_name
+ << " is in selfmanaged snaps mode" << std::endl;
+ return 1;
+ }
+
+ ret = io_ctx.snap_create(nargs[1]);
+ if (ret < 0) {
+ cerr << "error creating pool " << pool_name << " snapshot " << nargs[1]
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "created pool " << pool_name << " snap " << nargs[1] << std::endl;
+ }
+
+ else if (strcmp(nargs[0], "rmsnap") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ ret = io_ctx.snap_remove(nargs[1]);
+ if (ret < 0) {
+ cerr << "error removing pool " << pool_name << " snapshot " << nargs[1]
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "removed pool " << pool_name << " snap " << nargs[1] << std::endl;
+ }
+
+ else if (strcmp(nargs[0], "rollback") == 0) {
+ if (!pool_name || nargs.size() < 3) {
+ usage(cerr);
+ return 1;
+ }
+
+ ret = io_ctx.snap_rollback(nargs[1], nargs[2]);
+ if (ret < 0) {
+ cerr << "error rolling back pool " << pool_name << " to snapshot " << nargs[1]
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "rolled back pool " << pool_name
+ << " to snapshot " << nargs[2] << std::endl;
+ }
+ else if (strcmp(nargs[0], "bench") == 0) {
+ if (!pool_name || nargs.size() < 3) {
+ usage(cerr);
+ return 1;
+ }
+ char* endptr = NULL;
+ int seconds = strtol(nargs[1], &endptr, 10);
+ if (*endptr) {
+ cerr << "Invalid value for seconds: '" << nargs[1] << "'" << std::endl;
+ return 1;
+ }
+ int operation = 0;
+ if (strcmp(nargs[2], "write") == 0)
+ operation = OP_WRITE;
+ else if (strcmp(nargs[2], "seq") == 0)
+ operation = OP_SEQ_READ;
+ else if (strcmp(nargs[2], "rand") == 0)
+ operation = OP_RAND_READ;
+ else {
+ usage(cerr);
+ return 1;
+ }
+ if (operation != OP_WRITE) {
+ if (block_size_specified) {
+ cerr << "-b|--block_size option can be used only with 'write' bench test"
+ << std::endl;
+ return 1;
+ }
+ if (bench_write_dest != 0) {
+ cerr << "--write-object, --write-omap and --write-xattr options can "
+ "only be used with the 'write' bench test"
+ << std::endl;
+ return 1;
+ }
+ }
+ else if (bench_write_dest == 0) {
+ bench_write_dest = OP_WRITE_DEST_OBJ;
+ }
+
+ if (!formatter && output) {
+ cerr << "-o|--output option can only be used with '--format' option"
+ << std::endl;
+ return 1;
+ }
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
+ bencher.set_show_time(show_time);
+ bencher.set_write_destination(static_cast<OpWriteDest>(bench_write_dest));
+
+ ostream *outstream = NULL;
+ if (formatter) {
+ bencher.set_formatter(formatter.get());
+ if (output)
+ outstream = new ofstream(output);
+ else
+ outstream = &cout;
+ bencher.set_outstream(*outstream);
+ }
+ if (!object_size)
+ object_size = op_size;
+ else if (object_size < op_size)
+ op_size = object_size;
+ cout << "hints = " << (int)hints << std::endl;
+ ret = bencher.aio_bench(operation, seconds,
+ concurrent_ios, op_size, object_size,
+ max_objects, cleanup, hints, run_name, reuse_bench, no_verify);
+ if (ret != 0)
+ cerr << "error during benchmark: " << cpp_strerror(ret) << std::endl;
+ if (formatter && output)
+ delete outstream;
+ }
+ else if (strcmp(nargs[0], "cleanup") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+ if (wildcard)
+ io_ctx.set_namespace(all_nspaces);
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
+ ret = bencher.clean_up(prefix, concurrent_ios, run_name);
+ if (ret != 0)
+ cerr << "error during cleanup: " << cpp_strerror(ret) << std::endl;
+ }
+ else if (strcmp(nargs[0], "watch") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+ string oid(nargs[1]);
+ RadosWatchCtx ctx(io_ctx, oid.c_str());
+ uint64_t cookie;
+ ret = io_ctx.watch2(oid, &cookie, &ctx);
+ if (ret != 0)
+ cerr << "error calling watch: " << cpp_strerror(ret) << std::endl;
+ else {
+ cout << "press enter to exit..." << std::endl;
+ getchar();
+ io_ctx.unwatch2(cookie);
+ rados.watch_flush();
+ }
+ }
+ else if (strcmp(nargs[0], "notify") == 0) {
+ if (!pool_name || nargs.size() < 3) {
+ usage(cerr);
+ return 1;
+ }
+ string oid(nargs[1]);
+ string msg(nargs[2]);
+ bufferlist bl, replybl;
+ encode(msg, bl);
+ ret = io_ctx.notify2(oid, bl, 10000, &replybl);
+ if (ret != 0)
+ cerr << "error calling notify: " << cpp_strerror(ret) << std::endl;
+ if (replybl.length()) {
+ map<pair<uint64_t,uint64_t>,bufferlist> rm;
+ set<pair<uint64_t,uint64_t> > missed;
+ auto p = replybl.cbegin();
+ decode(rm, p);
+ decode(missed, p);
+ for (map<pair<uint64_t,uint64_t>,bufferlist>::iterator p = rm.begin();
+ p != rm.end();
+ ++p) {
+ cout << "reply client." << p->first.first
+ << " cookie " << p->first.second
+ << " : " << p->second.length() << " bytes" << std::endl;
+ if (p->second.length())
+ p->second.hexdump(cout);
+ }
+ for (multiset<pair<uint64_t,uint64_t> >::iterator p = missed.begin();
+ p != missed.end(); ++p) {
+ cout << "timeout client." << p->first
+ << " cookie " << p->second << std::endl;
+ }
+ }
+ } else if (strcmp(nargs[0], "set-alloc-hint") == 0) {
+ // cmd, [oid, ] obj_size, write_size
+ if (!pool_name || nargs.size() < (obj_name ? 3 : 4)) {
+ usage(cerr);
+ return 1;
+ }
+ string err;
+ uint64_t expected_object_size = strict_strtoll(nargs[obj_name ? 1 : 2], 10, &err);
+ if (!err.empty()) {
+ cerr << "couldn't parse expected_object_size: " << err << std::endl;
+ usage(cerr);
+ return 1;
+ }
+ uint64_t expected_write_size = strict_strtoll(nargs[obj_name ? 2 : 3], 10, &err);
+ if (!err.empty()) {
+ cerr << "couldn't parse expected_write_size: " << err << std::endl;
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ret = io_ctx.set_alloc_hint(*obj_name, expected_object_size, expected_write_size);
+ if (ret < 0) {
+ cerr << "error setting alloc-hint " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "load-gen") == 0) {
+ if (!pool_name) {
+ cerr << "error: must specify pool" << std::endl;
+ usage(cerr);
+ return 1;
+ }
+ LoadGen lg(&rados);
+ if (min_obj_len)
+ lg.min_obj_len = min_obj_len;
+ if (max_obj_len)
+ lg.max_obj_len = max_obj_len;
+ if (min_op_len)
+ lg.min_op_len = min_op_len;
+ if (max_op_len)
+ lg.max_op_len = max_op_len;
+ if (max_ops)
+ lg.max_ops = max_ops;
+ if (max_backlog)
+ lg.max_backlog = max_backlog;
+ if (target_throughput)
+ lg.target_throughput = target_throughput;
+ if (read_percent >= 0)
+ lg.read_percent = read_percent;
+ if (num_objs)
+ lg.num_objs = num_objs;
+ if (run_length)
+ lg.run_length = run_length;
+ if (offset_align)
+ lg.offset_align = offset_align;
+
+ cout << "run length " << run_length << " seconds" << std::endl;
+ cout << "preparing " << lg.num_objs << " objects" << std::endl;
+ ret = lg.bootstrap(pool_name);
+ if (ret < 0) {
+ cerr << "load-gen bootstrap failed" << std::endl;
+ return 1;
+ }
+ cout << "load-gen will run " << lg.run_length << " seconds" << std::endl;
+ lg.run();
+ lg.cleanup();
+ } else if (strcmp(nargs[0], "listomapkeys") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ string last_read;
+ bool more = true;
+ do {
+ set<string> out_keys;
+ ret = io_ctx.omap_get_keys2(*obj_name, last_read, MAX_OMAP_BYTES_PER_REQUEST, &out_keys, &more);
+ if (ret < 0) {
+ cerr << "error getting omap key set " << pool_name << "/"
+ << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ for (auto &key : out_keys) {
+ cout << key << std::endl;
+ last_read = std::move(key);
+ }
+ } while (more);
+ } else if (strcmp(nargs[0], "lock") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_lock_cmd(nargs, opts, &io_ctx, formatter.get());
+ } else if (strcmp(nargs[0], "listwatchers") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ string oid(nargs[1]);
+ std::list<obj_watch_t> lw;
+
+ ret = io_ctx.list_watchers(oid, &lw);
+ if (ret < 0) {
+ cerr << "error listing watchers " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+
+ for (std::list<obj_watch_t>::iterator i = lw.begin(); i != lw.end(); ++i) {
+ cout << "watcher=" << i->addr << " client." << i->watcher_id << " cookie=" << i->cookie << std::endl;
+ }
+ } else if (strcmp(nargs[0], "listsnaps") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snap shots " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+
+ map<snap_t,string> snamemap;
+ if (formatter || pretty_format) {
+ vector<snap_t> snaps;
+ io_ctx.snap_list(&snaps);
+ for (vector<snap_t>::iterator i = snaps.begin();
+ i != snaps.end(); ++i) {
+ string s;
+ if (io_ctx.snap_get_name(*i, &s) < 0)
+ continue;
+ snamemap.insert(pair<snap_t,string>(*i, s));
+ }
+ }
+
+ if (formatter) {
+ formatter->open_object_section("object");
+ formatter->dump_string("name", *obj_name);
+ formatter->open_array_section("clones");
+ } else {
+ cout << prettify(*obj_name) << ":" << std::endl;
+ cout << "cloneid snaps size overlap" << std::endl;
+ }
+
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+
+ if (formatter) formatter->open_object_section("clone");
+
+ if (ci->cloneid == librados::SNAP_HEAD) {
+ if (formatter)
+ formatter->dump_string("id", "head");
+ else
+ cout << "head";
+ } else {
+ if (formatter)
+ formatter->dump_unsigned("id", ci->cloneid);
+ else
+ cout << ci->cloneid;
+ }
+
+ if (formatter)
+ formatter->open_array_section("snapshots");
+ else
+ cout << "\t";
+
+ if (!formatter && ci->snaps.empty()) {
+ cout << "-";
+ }
+ for (std::vector<snap_t>::const_iterator snapindex = ci->snaps.begin();
+ snapindex != ci->snaps.end(); ++snapindex) {
+
+ map<snap_t,string>::iterator si;
+
+ if (formatter || pretty_format) si = snamemap.find(*snapindex);
+
+ if (formatter) {
+ formatter->open_object_section("snapshot");
+ formatter->dump_unsigned("id", *snapindex);
+ if (si != snamemap.end())
+ formatter->dump_string("name", si->second);
+ formatter->close_section(); //snapshot
+ } else {
+ if (snapindex != ci->snaps.begin()) cout << ",";
+ if (!pretty_format || (si == snamemap.end()))
+ cout << *snapindex;
+ else
+ cout << si->second << "(" << *snapindex << ")";
+ }
+ }
+
+ if (formatter) {
+ formatter->close_section(); //Snapshots
+ formatter->dump_unsigned("size", ci->size);
+ } else {
+ cout << "\t" << ci->size;
+ }
+
+ if (ci->cloneid != librados::SNAP_HEAD) {
+ if (formatter)
+ formatter->open_array_section("overlaps");
+ else
+ cout << "\t[";
+
+ for (std::vector< std::pair<uint64_t,uint64_t> >::iterator ovi = ci->overlap.begin();
+ ovi != ci->overlap.end(); ++ovi) {
+ if (formatter) {
+ formatter->open_object_section("section");
+ formatter->dump_unsigned("start", ovi->first);
+ formatter->dump_unsigned("length", ovi->second);
+ formatter->close_section(); //section
+ } else {
+ if (ovi != ci->overlap.begin()) cout << ",";
+ cout << ovi->first << "~" << ovi->second;
+ }
+ }
+ if (formatter)
+ formatter->close_section(); //overlaps
+ else
+ cout << "]" << std::endl;
+ }
+ if (formatter) formatter->close_section(); //clone
+ }
+ if (formatter) {
+ formatter->close_section(); //clones
+ formatter->close_section(); //object
+ formatter->flush(cout);
+ } else {
+ cout << std::endl;
+ }
+ } else if (strcmp(nargs[0], "list-inconsistent-pg") == 0) {
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_get_inconsistent_pg_cmd(nargs, rados, *formatter);
+ } else if (strcmp(nargs[0], "list-inconsistent-obj") == 0) {
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_get_inconsistent_cmd<inconsistent_obj_t>(nargs, rados, *formatter);
+ } else if (strcmp(nargs[0], "list-inconsistent-snapset") == 0) {
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_get_inconsistent_cmd<inconsistent_snapset_t>(nargs, rados, *formatter);
+ } else if (strcmp(nargs[0], "cache-flush") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ if (with_clones) {
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ if (snapid != CEPH_NOSNAP && ci->cloneid > snapid)
+ break;
+ io_ctx.snap_set_read(ci->cloneid);
+ ret = do_cache_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else {
+ ret = do_cache_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else if (strcmp(nargs[0], "cache-try-flush") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ if (with_clones) {
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ if (snapid != CEPH_NOSNAP && ci->cloneid > snapid)
+ break;
+ io_ctx.snap_set_read(ci->cloneid);
+ ret = do_cache_try_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else {
+ ret = do_cache_try_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else if (strcmp(nargs[0], "cache-evict") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ if (with_clones) {
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ if (snapid != CEPH_NOSNAP && ci->cloneid > snapid)
+ break;
+ io_ctx.snap_set_read(ci->cloneid);
+ ret = do_cache_evict(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else {
+ ret = do_cache_evict(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else if (strcmp(nargs[0], "cache-flush-evict-all") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+ ret = do_cache_flush_evict_all(io_ctx, true);
+ if (ret < 0) {
+ cerr << "cache-flush-evict-all finished with errors" << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "cache-try-flush-evict-all") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+ ret = do_cache_flush_evict_all(io_ctx, false);
+ if (ret < 0) {
+ cerr << "cache-try-flush-evict-all finished with errors" << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "set-redirect") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ const char *target = target_pool_name;
+ if (!target)
+ target = pool_name;
+
+ const char *target_obj;
+ if (nargs.size() < 3) {
+ if (strcmp(target, pool_name) == 0) {
+ cerr << "cannot copy object into itself" << std::endl;
+ return 1;
+ }
+ target_obj = nargs[1];
+ } else {
+ target_obj = nargs[2];
+ }
+
+ IoCtx target_ctx;
+ ret = rados.ioctx_create(target, target_ctx);
+ if (target_oloc.size()) {
+ target_ctx.locator_set_key(target_oloc);
+ }
+ if (target_nspace.size()) {
+ target_ctx.set_namespace(target_nspace);
+ }
+
+ ObjectWriteOperation op;
+ if (with_reference) {
+ op.set_redirect(target_obj, target_ctx, 0, CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+ } else {
+ op.set_redirect(target_obj, target_ctx, 0);
+ }
+ ret = io_ctx.operate(nargs[1], &op);
+ if (ret < 0) {
+ cerr << "error set-redirect " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "set-chunk") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ const char *target = target_pool_name;
+ if (!target)
+ target = pool_name;
+
+ uint64_t offset;
+ uint64_t length;
+ uint64_t tgt_offset;
+ string tgt_oid;
+ if (nargs.size() < 6) {
+ usage(cerr);
+ return 1;
+ } else {
+ char* endptr = NULL;
+ offset = strtoull(nargs[2], &endptr, 10);
+ if (*endptr) {
+ cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl;
+ return 1;
+ }
+ length = strtoull(nargs[3], &endptr, 10);
+ if (*endptr) {
+ cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl;
+ return 1;
+ }
+ tgt_oid = string(nargs[4]);
+ tgt_offset = strtoull(nargs[5], &endptr, 10);
+ if (*endptr) {
+ cerr << "Invalid value for size: '" << nargs[2] << "'" << std::endl;
+ return 1;
+ }
+ }
+
+ IoCtx target_ctx;
+ ret = rados.ioctx_create(target, target_ctx);
+ ObjectReadOperation op;
+ op.set_chunk(offset, length, target_ctx, tgt_oid, tgt_offset, CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+ ret = io_ctx.operate(nargs[1], &op, NULL);
+ if (ret < 0) {
+ cerr << "error set-chunk " << pool_name << "/" << nargs[1] << " " << " offset " << offset
+ << " length " << length << " target_pool " << target
+ << "tgt_offset: " << tgt_offset << " : " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "tier-promote") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectWriteOperation op;
+ op.tier_promote();
+ ret = io_ctx.operate(*obj_name, &op);
+ if (ret < 0) {
+ cerr << "error tier-promote " << pool_name << "/" << prettify(*obj_name) << " : "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "unset-manifest") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectWriteOperation op;
+ op.unset_manifest();
+ ret = io_ctx.operate(*obj_name, &op);
+ if (ret < 0) {
+ cerr << "error unset-manifest " << pool_name << "/" << prettify(*obj_name) << " : "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "tier-flush") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectReadOperation op;
+ op.tier_flush();
+ librados::AioCompletion *completion =
+ librados::Rados::aio_create_completion();
+ io_ctx.aio_operate(*obj_name, completion, &op,
+ librados::OPERATION_IGNORE_CACHE |
+ librados::OPERATION_IGNORE_OVERLAY,
+ NULL);
+ completion->wait_for_complete();
+ ret = completion->get_return_value();
+ completion->release();
+ if (ret < 0) {
+ cerr << "error tier-flush " << pool_name << "/" << prettify(*obj_name) << " : "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "tier-evict") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectReadOperation op;
+ op.tier_evict();
+ librados::AioCompletion *completion =
+ librados::Rados::aio_create_completion();
+ io_ctx.aio_operate(*obj_name, completion, &op,
+ librados::OPERATION_IGNORE_CACHE |
+ librados::OPERATION_IGNORE_OVERLAY,
+ NULL);
+ completion->wait_for_complete();
+ ret = completion->get_return_value();
+ completion->release();
+ if (ret < 0) {
+ cerr << "error tier-evict " << pool_name << "/" << prettify(*obj_name) << " : "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+  } else if (strcmp(nargs[0], "export") == 0) {
+    // export [filename] -- dump the whole pool to a file (or stdout).
+    if (!pool_name || nargs.size() > 2) {
+      usage(cerr);
+      return 1;
+    }
+
+    int file_fd;
+    if (nargs.size() < 2 || std::string(nargs[1]) == "-") {
+      file_fd = STDOUT_FILENO;
+    } else {
+      file_fd = open(nargs[1], O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0666);
+      if (file_fd < 0) {
+        // open(2) reports the failure reason via errno; its return value is
+        // just -1, so passing it to cpp_strerror() printed the wrong message.
+        int err = errno;
+        cerr << "Error opening '" << nargs[1] << "': "
+             << cpp_strerror(err) << std::endl;
+        return 1;
+      }
+    }
+
+    ret = PoolDump(file_fd).dump(&io_ctx);
+
+    // Close only descriptors we opened ourselves.  The previous test against
+    // STDIN_FILENO incorrectly closed fd 1 when exporting to stdout.
+    if (file_fd != STDOUT_FILENO) {
+      VOID_TEMP_FAILURE_RETRY(::close(file_fd));
+    }
+
+    if (ret < 0) {
+      cerr << "error from export: "
+           << cpp_strerror(ret) << std::endl;
+      return 1;
+    }
+  } else if (strcmp(nargs[0], "import") == 0) {
+    // import [--no-overwrite] [--dry-run] <filename | - >
+    if (!pool_name || nargs.size() > 4 || nargs.size() < 2) {
+      usage(cerr);
+      return 1;
+    }
+
+    // Last arg is the filename
+    std::string const filename = nargs[nargs.size() - 1];
+
+    // All other args may be flags
+    bool dry_run = false;
+    bool no_overwrite = false;
+    for (unsigned i = 1; i < nargs.size() - 1; ++i) {
+      std::string arg(nargs[i]);
+
+      if (arg == std::string("--no-overwrite")) {
+        no_overwrite = true;
+      } else if (arg == std::string("--dry-run")) {
+        dry_run = true;
+      } else {
+        std::cerr << "Invalid argument '" << arg << "'" << std::endl;
+        return 1;
+      }
+    }
+
+    int file_fd;
+    if (filename == "-") {
+      file_fd = STDIN_FILENO;
+    } else {
+      file_fd = open(filename.c_str(), O_RDONLY|O_BINARY);
+      if (file_fd < 0) {
+        // As above: errno carries the failure reason, not the return value.
+        int err = errno;
+        cerr << "Error opening '" << filename << "': "
+             << cpp_strerror(err) << std::endl;
+        return 1;
+      }
+    }
+
+    ret = RadosImport(file_fd, 0, dry_run).import(io_ctx, no_overwrite);
+
+    // Close only descriptors we opened; never close stdin.
+    if (file_fd != STDIN_FILENO) {
+      VOID_TEMP_FAILURE_RETRY(::close(file_fd));
+    }
+
+    if (ret < 0) {
+      cerr << "error from import: "
+           << cpp_strerror(ret) << std::endl;
+      return 1;
+    }
+  } else {
+    cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl;
+    ret = -EINVAL;
+  }
+
+  if (ret < 0)
+    cerr << "error " << (-ret) << ": " << cpp_strerror(ret) << std::endl;
+
+  return (ret < 0) ? 1 : 0;
+}
+
+// Program entry point: handles -h/--help, pre-scans for the -f formatter
+// option (which global_init would otherwise consume), initializes the ceph
+// runtime, collects all generic rados options into `opts` and hands the
+// remaining positional arguments to rados_tool_common().
+int main(int argc, const char **argv)
+{
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage(cout);
+    exit(0);
+  }
+
+  std::map < std::string, std::string > opts;
+  std::string val;
+
+  // Necessary to support usage of -f for formatting,
+  // since global_init will remove the -f using ceph
+  // argparse procedures.
+  for (auto j = args.begin(); j != args.end(); ++j) {
+    if (strcmp(*j, "--") == 0) {
+      break;
+    } else if ((j+1) == args.end()) {
+      // This can't be a formatting call (no format arg)
+      break;
+    } else if (strcmp(*j, "-f") == 0) {
+      val = *(j+1);
+      unique_ptr<Formatter> formatter(Formatter::create(val.c_str()));
+
+      if (formatter) {
+        j = args.erase(j);
+        opts["format"] = val;
+
+        j = args.erase(j);
+        break;
+      }
+    }
+  }
+
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  std::vector<const char*>::iterator i;
+  for (i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    } else if (ceph_argparse_flag(args, i, "--force-full", (char*)NULL)) {
+      opts["force-full"] = "true";
+    } else if (ceph_argparse_flag(args, i, "-d", "--delete-after", (char*)NULL)) {
+      opts["delete-after"] = "true";
+    } else if (ceph_argparse_flag(args, i, "-C", "--create", "--create-pool",
+                                  (char*)NULL)) {
+      opts["create"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--pretty-format", (char*)NULL)) {
+      opts["pretty-format"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--show-time", (char*)NULL)) {
+      opts["show-time"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--no-cleanup", (char*)NULL)) {
+      opts["no-cleanup"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--no-hints", (char*)NULL)) {
+      opts["no-hints"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--reuse-bench", (char*)NULL)) {
+      opts["reuse-bench"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--no-verify", (char*)NULL)) {
+      opts["no-verify"] = "true";
+    } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) {
+      opts["run-name"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
+      opts["prefix"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
+      opts["pool"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--target-pool", (char*)NULL)) {
+      opts["target_pool"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--object-locator" , (char *)NULL)) {
+      opts["object_locator"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--target-locator" , (char *)NULL)) {
+      opts["target_locator"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--target-nspace" , (char *)NULL)) {
+      opts["target_nspace"] = val;
+#ifdef WITH_LIBRADOSSTRIPER
+    } else if (ceph_argparse_flag(args, i, "--striper" , (char *)NULL)) {
+      opts["striper"] = "true";
+#endif
+    } else if (ceph_argparse_witharg(args, i, &val, "-t", "--concurrent-ios", (char*)NULL)) {
+      opts["concurrent-ios"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--block-size", (char*)NULL)) {
+      opts["block-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-b", (char*)NULL)) {
+      opts["block-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--object-size", (char*)NULL)) {
+      opts["object-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) {
+      opts["max-objects"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--offset", (char*)NULL)) {
+      opts["offset"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-O", (char*)NULL)) {
+      opts["object-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-s", "--snap", (char*)NULL)) {
+      opts["snap"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-S", "--snapid", (char*)NULL)) {
+      opts["snapid"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--min-object-size", (char*)NULL)) {
+      opts["min-object-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--max-object-size", (char*)NULL)) {
+      opts["max-object-size"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--min-op-len", (char*)NULL)) {
+      opts["min-op-len"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--max-op-len", (char*)NULL)) {
+      opts["max-op-len"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--max-ops", (char*)NULL)) {
+      opts["max-ops"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--max-backlog", (char*)NULL)) {
+      opts["max-backlog"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--target-throughput", (char*)NULL)) {
+      opts["target-throughput"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--offset-align", (char*)NULL)) {
+      opts["offset_align"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--read-percent", (char*)NULL)) {
+      opts["read-percent"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--num-objects", (char*)NULL)) {
+      opts["num-objects"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--run-length", (char*)NULL)) {
+      opts["run-length"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--workers", (char*)NULL)) {
+      opts["workers"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) {
+      opts["format"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--lock-tag", (char*)NULL)) {
+      opts["lock-tag"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--lock-cookie", (char*)NULL)) {
+      opts["lock-cookie"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--lock-description", (char*)NULL)) {
+      opts["lock-description"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--lock-duration", (char*)NULL)) {
+      opts["lock-duration"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--lock-type", (char*)NULL)) {
+      opts["lock-type"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "-N", "--namespace", (char*)NULL)) {
+      opts["namespace"] = val;
+    } else if (ceph_argparse_flag(args, i, "--all", (char*)NULL)) {
+      opts["all"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--default", (char*)NULL)) {
+      opts["default"] = "true";
+    } else if (ceph_argparse_witharg(args, i, &val, "-o", "--output", (char*)NULL)) {
+      opts["output"] = val;
+    } else if (ceph_argparse_flag(args, i, "--write-omap", (char*)NULL)) {
+      opts["write-dest-omap"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--write-object", (char*)NULL)) {
+      opts["write-dest-obj"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--write-xattr", (char*)NULL)) {
+      opts["write-dest-xattr"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--with-clones", (char*)NULL)) {
+      opts["with-clones"] = "true";
+    } else if (ceph_argparse_witharg(args, i, &val, "--omap-key-file", (char*)NULL)) {
+      opts["omap-key-file"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--obj-name-file", (char*)NULL)) {
+      opts["obj-name-file"] = val;
+    } else if (ceph_argparse_flag(args, i, "--with-reference", (char*)NULL)) {
+      opts["with-reference"] = "true";
+    } else if (ceph_argparse_witharg(args, i, &val, "--pgid", (char*)NULL)) {
+      opts["pgid"] = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--input-file", (char*)NULL)) {
+      opts["input_file"] = val;
+    } else {
+      // Reject unrecognized dash-options.  The previous code tested
+      // `val[0]`, which holds the value of the *last* parsed option (or is
+      // empty), so unknown options could be silently accepted.
+      if ((*i)[0] == '-')
+        usage_exit();
+      ++i;
+    }
+  }
+
+  if (args.empty()) {
+    cerr << "rados: you must give an action. Try --help" << std::endl;
+    return 1;
+  }
+
+  return rados_tool_common(opts, args);
+}
diff --git a/src/tools/radosacl.cc b/src/tools/radosacl.cc
new file mode 100644
index 000000000..3b071705b
--- /dev/null
+++ b/src/tools/radosacl.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+
+using namespace librados;
+
+// Render `len` bytes from `buf` as lowercase hex digits into `str`.
+// `str` must provide room for 2*len + 1 characters (including the NUL).
+void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+  char *out = str;
+  for (int i = 0; i < len; ++i) {
+    out += sprintf(out, "%02x", (int)buf[i]);
+  }
+  *out = '\0';  // also yields "" for len == 0
+}
+
+
+// Fixed width of a principal id (hex string, not counting the NUL).
+#define ID_SIZE 8
+
+// Permission bits stored per principal id.
+#define ACL_RD 0x1
+#define ACL_WR 0x2
+
+// Fixed-size principal identifier.  Encoded as exactly ID_SIZE raw bytes;
+// the trailing NUL byte of `id` is NOT part of the wire format.
+struct ACLID {
+  char id[ID_SIZE + 1];
+
+  void encode(bufferlist& bl) const {
+    bl.append((const char *)id, ID_SIZE);
+  }
+  void decode(bufferlist::const_iterator& iter) {
+    iter.copy(ID_SIZE, (char *)id);
+  }
+};
+WRITE_CLASS_ENCODER(ACLID)
+
+// Permission flag word (combination of ACL_* bits).
+typedef __u32 ACLFlags;
+
+
+// Byte-wise ordering so ACLID can be used as a std::map key.
+// NOTE(review): memcmp over &l relies on `id` being the first member of
+// ACLID; comparing l.id / r.id directly would be more explicit.
+inline bool operator<(const ACLID& l, const ACLID& r)
+{
+  return (memcmp(&l, &r, ID_SIZE) < 0);
+}
+
+// A single (id, flags) ACL entry.
+struct ACLPair {
+  ACLID id;
+  ACLFlags flags;
+};
+
+// Per-object ACL table mapping principal ids to permission flags.
+// encode/decode use ceph's standard map encoding; main() below exchanges
+// these blobs with the "acl" object class via io_ctx.exec().
+class ObjectACLs {
+  map<ACLID, ACLFlags> acls_map;
+
+public:
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(acls_map, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(acls_map, bl);
+  }
+
+  // Fetch the flags stored for `id`; see definition for error codes.
+  int read_acl(ACLID& id, ACLFlags *flags);
+  // Insert or overwrite the entry for `id`.
+  void set_acl(ACLID& id, ACLFlags flags);
+};
+WRITE_CLASS_ENCODER(ObjectACLs)
+
+// Look up the flags stored for `id`.
+// Returns 0 and fills *flags on success, -EINVAL if `flags` is null,
+// -ENOENT if the id has no entry.
+int ObjectACLs::read_acl(ACLID& id, ACLFlags *flags)
+{
+  if (flags == nullptr)
+    return -EINVAL;
+
+  auto found = acls_map.find(id);
+  if (found == acls_map.end())
+    return -ENOENT;
+
+  *flags = found->second;
+  return 0;
+}
+
+// Insert a new entry for `id`, or overwrite the existing one.
+void ObjectACLs::set_acl(ACLID& id, ACLFlags flags)
+{
+  acls_map.insert_or_assign(id, flags);
+}
+
+
+
+// Placeholder for a principal entity (user or group).  Fields are not
+// read anywhere in this tool yet.
+class ACLEntity
+{
+  string name;
+  map<ACLID, ACLEntity> groups;
+};
+
+typedef map<ACLID, ACLEntity> tACLIDEntityMap;
+
+// Global principal registries; never populated in this tool.
+static map<ACLID, ACLEntity> users;
+static map<ACLID, ACLEntity> groups;
+
+// Stub: presumably intended to resolve `aclid` into *entity -- currently
+// a no-op (body is commented out).
+void get_user(ACLID& aclid, ACLEntity *entity)
+{
+  //users.find(aclid);
+}
+
+
+
+
+
+// Smoke-test driver for the "acl" object class: reads the ACL blob from
+// object "bar" in pool "data", adds a read ACL for a fixed id, writes the
+// table back, then reads the object data itself.
+int main(int argc, const char **argv)
+{
+  Rados rados;
+  if (rados.init(NULL) < 0) {
+    cerr << "couldn't initialize rados!" << std::endl;
+    exit(1);
+  }
+  if (rados.conf_read_file(NULL)) {
+    cerr << "couldn't read Ceph configuration file!" << std::endl;
+    exit(1);
+  }
+  if (rados.connect() < 0) {
+    cerr << "couldn't connect to cluster!" << std::endl;
+    exit(1);
+  }
+
+  time_t tm;
+  bufferlist bl, bl2;
+  char buf[128];
+
+  time(&tm);
+  snprintf(buf, 128, "%s", ctime(&tm));
+  bl.append(buf, strlen(buf));
+
+  const char *oid = "bar";
+
+  IoCtx io_ctx;
+  int r = rados.ioctx_create("data", io_ctx);
+  cout << "open io_ctx result = " << r << " pool = " << io_ctx.get_pool_name() << std::endl;
+  if (r < 0) {
+    // Nothing below can work without a valid io context; previously the
+    // failure was printed but execution continued anyway.
+    cerr << "couldn't open pool 'data'!" << std::endl;
+    return 1;
+  }
+
+  ACLID id;
+
+  snprintf(id.id, sizeof(id.id), "%.8x", 0x1234);
+  cout << "id=" << id.id << std::endl;
+
+  r = io_ctx.exec(oid, "acl", "get", bl, bl2);
+  cout << "exec(acl get) returned " << r
+       << " len=" << bl2.length() << std::endl;
+  ObjectACLs oa;
+  if (r >= 0) {
+    auto iter = bl2.cbegin();
+    oa.decode(iter);
+  }
+
+  oa.set_acl(id, ACL_RD);
+  bl.clear();
+  oa.encode(bl);
+  r = io_ctx.exec(oid, "acl", "set", bl, bl2);
+  cout << "exec(acl set) returned " << r
+       << " len=" << bl2.length() << std::endl;
+
+  const unsigned char *md5 = (const unsigned char *)bl2.c_str();
+  char md5_str[bl2.length()*2 + 1];  // NOTE: VLA (GCC extension)
+  buf_to_hex(md5, bl2.length(), md5_str);
+  cout << "md5 result=" << md5_str << std::endl;
+
+  int size = io_ctx.read(oid, bl2, 128, 0);
+  if (size < 0) {
+    // Report read errors instead of printing a stale/empty buffer.
+    cerr << "read failed: " << size << std::endl;
+    return 1;
+  }
+  cout << "read result=" << bl2.c_str() << std::endl;
+  cout << "size=" << size << std::endl;
+
+  return 0;
+}
+
diff --git a/src/tools/rbd/ArgumentTypes.cc b/src/tools/rbd/ArgumentTypes.cc
new file mode 100644
index 000000000..7b111b811
--- /dev/null
+++ b/src/tools/rbd/ArgumentTypes.cc
@@ -0,0 +1,548 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd/features.h"
+#include "common/config_proxy.h"
+#include "common/strtol.h"
+#include "common/Formatter.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/tokenizer.hpp>
+
+namespace rbd {
+namespace argument_types {
+
+namespace po = boost::program_options;
+
+// Feature-bit -> human-readable-name mapping for RBD image features.
+// Drives both feature-name parsing (validate(ImageFeatures)) and the
+// generated help text (get_short_features_help()).
+const std::map<uint64_t, std::string> ImageFeatures::FEATURE_MAPPING = {
+  {RBD_FEATURE_LAYERING, RBD_FEATURE_NAME_LAYERING},
+  {RBD_FEATURE_STRIPINGV2, RBD_FEATURE_NAME_STRIPINGV2},
+  {RBD_FEATURE_EXCLUSIVE_LOCK, RBD_FEATURE_NAME_EXCLUSIVE_LOCK},
+  {RBD_FEATURE_OBJECT_MAP, RBD_FEATURE_NAME_OBJECT_MAP},
+  {RBD_FEATURE_FAST_DIFF, RBD_FEATURE_NAME_FAST_DIFF},
+  {RBD_FEATURE_DEEP_FLATTEN, RBD_FEATURE_NAME_DEEP_FLATTEN},
+  {RBD_FEATURE_JOURNALING, RBD_FEATURE_NAME_JOURNALING},
+  {RBD_FEATURE_DATA_POOL, RBD_FEATURE_NAME_DATA_POOL},
+  {RBD_FEATURE_OPERATIONS, RBD_FEATURE_NAME_OPERATIONS},
+  {RBD_FEATURE_MIGRATING, RBD_FEATURE_NAME_MIGRATING},
+  {RBD_FEATURE_NON_PRIMARY, RBD_FEATURE_NAME_NON_PRIMARY},
+};
+
+// Build a ceph::Formatter matching this Format's value ("json" or "xml").
+// Any other value (i.e. "plain") yields an empty Formatter handle.
+Format::Formatter Format::create_formatter(bool pretty) const {
+  if (value == "xml") {
+    return Formatter(new XMLFormatter(pretty));
+  }
+  if (value == "json") {
+    return Formatter(new JSONFormatter(pretty));
+  }
+  return Formatter();
+}
+
+// Map an argument modifier to the option-name prefix ("source-"/"dest-").
+std::string get_name_prefix(ArgumentModifier modifier) {
+  if (modifier == ARGUMENT_MODIFIER_SOURCE) {
+    return SOURCE_PREFIX;
+  }
+  if (modifier == ARGUMENT_MODIFIER_DEST) {
+    return DEST_PREFIX;
+  }
+  return "";
+}
+
+// Map an argument modifier to the human-readable description prefix.
+std::string get_description_prefix(ArgumentModifier modifier) {
+  if (modifier == ARGUMENT_MODIFIER_SOURCE) {
+    return "source ";
+  }
+  if (modifier == ARGUMENT_MODIFIER_DEST) {
+    return "destination ";
+  }
+  return "";
+}
+
+// Register -p/--pool (or --dest-pool for ARGUMENT_MODIFIER_DEST) as a
+// string option, with the description adjusted for the modifier.
+void add_pool_option(po::options_description *opt,
+                     ArgumentModifier modifier,
+                     const std::string &desc_suffix) {
+  std::string name = POOL_NAME + ",p";
+  std::string description = "pool name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_POOL_NAME;
+    description = "destination " + description;
+    break;
+  }
+  description += desc_suffix;
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Register --namespace (or --dest-namespace) as a string option.
+void add_namespace_option(boost::program_options::options_description *opt,
+                          ArgumentModifier modifier) {
+  std::string name = NAMESPACE_NAME;
+  std::string description = "namespace name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_NAMESPACE_NAME;
+    description = "destination " + description;
+    break;
+  }
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Register --image (or --dest) as a string option.
+void add_image_option(po::options_description *opt,
+                      ArgumentModifier modifier,
+                      const std::string &desc_suffix) {
+  std::string name = IMAGE_NAME;
+  std::string description = "image name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_IMAGE_NAME;
+    description = "destination " + description;
+    break;
+  }
+  description += desc_suffix;
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Register --image-id as a string option (no modifier variants).
+void add_image_id_option(po::options_description *opt,
+                         const std::string &desc_suffix) {
+  std::string name = IMAGE_ID;
+  std::string description = "image id";
+  description += desc_suffix;
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Register --snap (or --dest-snap) as a string option.
+void add_snap_option(po::options_description *opt,
+                     ArgumentModifier modifier) {
+
+  std::string name = SNAPSHOT_NAME;
+  std::string description = "snapshot name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_SNAPSHOT_NAME;
+    description = "destination " + description;
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  }
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Register --snap-id as a numeric option.
+void add_snap_id_option(po::options_description *opt) {
+  opt->add_options()
+    (SNAPSHOT_ID.c_str(), po::value<uint64_t>(), "snapshot id");
+}
+
+// Register the pool arguments for a command: -p/--pool as a named option
+// plus a positional pool/pool-spec argument (with --namespace when
+// namespaces are supported).
+void add_pool_options(boost::program_options::options_description *pos,
+                      boost::program_options::options_description *opt,
+                      bool namespaces_supported) {
+  opt->add_options()
+    ((POOL_NAME + ",p").c_str(), po::value<std::string>(), "pool name");
+  if (namespaces_supported) {
+    add_namespace_option(opt, ARGUMENT_MODIFIER_NONE);
+    pos->add_options()
+      ("pool-spec", "pool specification\n"
+       "(example: <pool-name>[/<namespace>])");  // fixed: help text was missing the closing ')'
+  } else {
+    pos->add_options()
+      ("pool-name", "pool name");
+  }
+}
+
+// Register a positional image-spec argument plus the equivalent named
+// options (--pool/--namespace/--image) for the given modifier.
+void add_image_spec_options(po::options_description *pos,
+                            po::options_description *opt,
+                            ArgumentModifier modifier) {
+  pos->add_options()
+    ((get_name_prefix(modifier) + IMAGE_SPEC).c_str(),
+     (get_description_prefix(modifier) + "image specification\n" +
+      "(example: [<pool-name>/[<namespace>/]]<image-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_namespace_option(opt, modifier);
+  add_image_option(opt, modifier);
+}
+
+// Register a positional snapshot-spec argument plus the equivalent named
+// options (--pool/--namespace/--image/--snap) for the given modifier.
+void add_snap_spec_options(po::options_description *pos,
+                           po::options_description *opt,
+                           ArgumentModifier modifier) {
+  pos->add_options()
+    ((get_name_prefix(modifier) + SNAPSHOT_SPEC).c_str(),
+     (get_description_prefix(modifier) + "snapshot specification\n" +
+      "(example: [<pool-name>/[<namespace>/]]<image-name>@<snap-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_namespace_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_snap_option(opt, modifier);
+}
+
+// Register a positional argument that accepts either an image or a
+// snapshot spec, plus the equivalent named options.
+void add_image_or_snap_spec_options(po::options_description *pos,
+                                    po::options_description *opt,
+                                    ArgumentModifier modifier) {
+  pos->add_options()
+    ((get_name_prefix(modifier) + IMAGE_OR_SNAPSHOT_SPEC).c_str(),
+     (get_description_prefix(modifier) + "image or snapshot specification\n" +
+      "(example: [<pool-name>/[<namespace>/]]<image-name>[@<snap-name>])").c_str());
+  add_pool_option(opt, modifier);
+  add_namespace_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_snap_option(opt, modifier);
+}
+
+// Register the full set of image-creation options (format, order/object
+// size, features, striping, data pool, mirror mode, journal settings).
+void add_create_image_options(po::options_description *opt,
+                              bool include_format) {
+  // TODO get default image format from conf
+  if (include_format) {
+    opt->add_options()
+      (IMAGE_FORMAT.c_str(), po::value<ImageFormat>(),
+       "image format [default: 2]")
+      (IMAGE_NEW_FORMAT.c_str(),
+       po::value<ImageNewFormat>()->zero_tokens(),
+       "deprecated[:image-format 2]");
+  }
+
+  opt->add_options()
+    (IMAGE_ORDER.c_str(), po::value<ImageOrder>(),
+     "deprecated[:object-size]")
+    (IMAGE_OBJECT_SIZE.c_str(), po::value<ImageObjectSize>(),
+     "object size in B/K/M [4K <= object size <= 32M]")
+    (IMAGE_FEATURES.c_str(), po::value<ImageFeatures>()->composing(),
+     ("image features\n" + get_short_features_help(true)).c_str())
+    (IMAGE_SHARED.c_str(), po::bool_switch(), "shared image")
+    (IMAGE_STRIPE_UNIT.c_str(), po::value<ImageObjectSize>(), "stripe unit in B/K/M")
+    (IMAGE_STRIPE_COUNT.c_str(), po::value<uint64_t>(), "stripe count")
+    (IMAGE_DATA_POOL.c_str(), po::value<std::string>(), "data pool")
+    (IMAGE_MIRROR_IMAGE_MODE.c_str(), po::value<MirrorImageMode>(),
+     "mirror image mode [journal or snapshot]");
+
+  add_create_journal_options(opt);
+}
+
+// Register the journaling-related creation options.
+void add_create_journal_options(po::options_description *opt) {
+  opt->add_options()
+    (JOURNAL_SPLAY_WIDTH.c_str(), po::value<uint64_t>(),
+     "number of active journal objects")
+    (JOURNAL_OBJECT_SIZE.c_str(), po::value<JournalObjectSize>(),
+     "size of journal objects [4K <= size <= 64M]")
+    (JOURNAL_POOL.c_str(), po::value<std::string>(),
+     "pool for journal objects");
+}
+
+// Register the required -s/--size option.
+void add_size_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    ((IMAGE_SIZE + ",s").c_str(), po::value<ImageSize>()->required(),
+     "image size (in M/G/T) [default: M]");
+}
+
+// Register --sparse-size.
+void add_sparse_size_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (IMAGE_SPARSE_SIZE.c_str(), po::value<ImageObjectSize>(),
+     "sparse size in B/K/M [default: 4K]");
+}
+
+// Register the path both as a positional argument and as --path.
+void add_path_options(boost::program_options::options_description *pos,
+                      boost::program_options::options_description *opt,
+                      const std::string &description) {
+  pos->add_options()
+    (PATH_NAME.c_str(), po::value<std::string>(), description.c_str());
+  opt->add_options()
+    (PATH.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Register --limit (snapshot count limit).
+void add_limit_option(po::options_description *opt) {
+  std::string description = "maximum allowed snapshot count";
+
+  opt->add_options()
+    (LIMIT.c_str(), po::value<uint64_t>(), description.c_str());
+}
+
+// Register --no-progress.
+void add_no_progress_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (NO_PROGRESS.c_str(), po::bool_switch(), "disable progress output");
+}
+
+// Register --format and --pretty-format output options.
+void add_format_options(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (FORMAT.c_str(), po::value<Format>(), "output format (plain, json, or xml) [default: plain]")
+    (PRETTY_FORMAT.c_str(), po::bool_switch(),
+     "pretty formatting (json and xml)");
+}
+
+// Register --verbose.
+void add_verbose_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (VERBOSE.c_str(), po::bool_switch(), "be verbose");
+}
+
+// Register --no-error (continue after per-item failures).
+void add_no_error_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (NO_ERR.c_str(), po::bool_switch(), "continue after error");
+}
+
+// Register --export-format.
+// NOTE(review): uses a string literal instead of a named constant like the
+// other options here.
+void add_export_format_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    ("export-format", po::value<ExportFormat>(), "format of image file");
+}
+
+// Register --flatten.
+void add_flatten_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (IMAGE_FLATTEN.c_str(), po::bool_switch(),
+     "fill clone with parent data (make it independent)");
+}
+
+// Register the snapshot-creation quiesce options.
+void add_snap_create_options(po::options_description *opt) {
+  opt->add_options()
+    (SKIP_QUIESCE.c_str(), po::bool_switch(), "do not run quiesce hooks")
+    (IGNORE_QUIESCE_ERROR.c_str(), po::bool_switch(),
+     "ignore quiesce hook error");
+}
+
+// Build the one-line bracketed feature list for help output, e.g.
+// "[layering(+), ...]".  With append_suffix=true each feature carries
+// suffix markers ('+' enabled by default, '*' mutable, '-' disable-only);
+// with append_suffix=false only mutable features are listed (used by the
+// 'rbd feature' command).  Implicitly-enabled features are always hidden.
+std::string get_short_features_help(bool append_suffix) {
+  std::ostringstream oss;
+  bool first_feature = true;
+  oss << "[";
+  for (auto &pair : ImageFeatures::FEATURE_MAPPING) {
+    if ((pair.first & RBD_FEATURES_IMPLICIT_ENABLE) != 0ULL) {
+      // hide implicitly enabled features from list
+      continue;
+    } else if (!append_suffix && (pair.first & RBD_FEATURES_MUTABLE) == 0ULL) {
+      // hide non-mutable features for the 'rbd feature XYZ' command
+      continue;
+    }
+
+    if (!first_feature) {
+      oss << ", ";
+    }
+    first_feature = false;
+
+    std::string suffix;
+    if (append_suffix) {
+      if ((pair.first & rbd::utils::get_rbd_default_features(g_ceph_context)) != 0) {
+        suffix += "+";
+      }
+      if ((pair.first & RBD_FEATURES_MUTABLE) != 0) {
+        suffix += "*";
+      } else if ((pair.first & RBD_FEATURES_DISABLE_ONLY) != 0) {
+        suffix += "-";
+      }
+      if (!suffix.empty()) {
+        suffix = "(" + suffix + ")";
+      }
+    }
+    oss << pair.second << suffix;
+  }
+  oss << "]";
+  return oss.str();
+}
+
+// Build the multi-line legend explaining the suffix markers used by
+// get_short_features_help().
+std::string get_long_features_help() {
+  std::ostringstream oss;
+  oss << "Image Features:" << std::endl
+      << " (*) supports enabling/disabling on existing images" << std::endl
+      << " (-) supports disabling-only on existing images" << std::endl
+      << " (+) enabled by default for new images if features not specified"
+      << std::endl;
+  return oss.str();
+}
+
+// program_options validator for ImageSize: parses an IEC size string; a
+// bare number (no unit suffix) is interpreted as megabytes.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageSize *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
+  if (!parse_error.empty()) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+
+  //NOTE: We can remove below given three lines of code once all applications,
+  //which use this CLI will adopt B/K/M/G/T/P/E with size value
+  if (isdigit(*s.rbegin())) {
+    size = size << 20; // Default MB to Bytes
+  }
+  v = boost::any(size);
+}
+
+// Validator for ImageOrder: accepts an integer order in [12, 25].
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageOrder *target_type, int dummy) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  try {
+    uint64_t order = boost::lexical_cast<uint64_t>(s);
+    if (order >= 12 && order <= 25) {
+      v = boost::any(order);
+      return;
+    }
+  } catch (const boost::bad_lexical_cast &) {
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+// Validator for ImageObjectSize: any IEC size string (range is checked
+// elsewhere).
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageObjectSize *target_type, int dummy) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t objectsize = strict_iecstrtoll(s.c_str(), &parse_error);
+  if (!parse_error.empty()) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+  v = boost::any(objectsize);
+}
+
+// Validator for ImageFormat: only formats 1 and 2 exist.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageFormat *target_type, int dummy) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  try {
+    uint32_t format = boost::lexical_cast<uint32_t>(s);
+    if (format == 1 || format == 2) {
+      v = boost::any(format);
+      return;
+    }
+  } catch (const boost::bad_lexical_cast &) {
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+// Validator for the zero-token --new-format flag: presence means true.
+// NOTE(review): unlike the other validators this one does not call
+// check_first_occurrence -- presumably intentional for a bool flag; verify.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageNewFormat *target_type, int dummy) {
+  v = boost::any(true);
+}
+
+// Validator for ImageFeatures: comma-separated feature names (composing,
+// so repeated occurrences OR their bits together) looked up against
+// FEATURE_MAPPING.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageFeatures *target_type, int) {
+  if (v.empty()) {
+    v = boost::any(static_cast<uint64_t>(0));
+  }
+
+  uint64_t &features = boost::any_cast<uint64_t &>(v);
+  for (auto &value : values) {
+    boost::char_separator<char> sep(",");
+    boost::tokenizer<boost::char_separator<char> > tok(value, sep);
+    for (auto &token : tok) {
+      bool matched = false;
+      for (auto &it : ImageFeatures::FEATURE_MAPPING) {
+        if (token == it.second) {
+          features |= it.first;
+          matched = true;
+          break;
+        }
+      }
+
+      if (!matched) {
+        throw po::validation_error(po::validation_error::invalid_option_value);
+      }
+    }
+  }
+}
+
+// Validator for MirrorImageMode: "journal" or "snapshot".
+void validate(boost::any& v, const std::vector<std::string>& values,
+              MirrorImageMode* mirror_image_mode, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  if (s == "journal") {
+    v = boost::any(RBD_MIRROR_IMAGE_MODE_JOURNAL);
+  } else if (s == "snapshot") {
+    v = boost::any(RBD_MIRROR_IMAGE_MODE_SNAPSHOT);
+  } else {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+}
+
+// Validator for the output Format: "plain", "json" or "xml".
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Format *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  if (s == "plain" || s == "json" || s == "xml") {
+    v = boost::any(Format(s));
+  } else {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+}
+
+// Validator for JournalObjectSize: IEC size in [4K, 64M].
+void validate(boost::any& v, const std::vector<std::string>& values,
+              JournalObjectSize *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
+  if (parse_error.empty() && (size >= (1 << 12)) && (size <= (1 << 26))) {
+    v = boost::any(size);
+    return;
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+// Validator for EncryptionAlgorithm: "aes-128" or "aes-256".
+void validate(boost::any& v, const std::vector<std::string>& values,
+              EncryptionAlgorithm *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  if (s == "aes-128") {
+    v = boost::any(RBD_ENCRYPTION_ALGORITHM_AES128);
+  } else if (s == "aes-256") {
+    v = boost::any(RBD_ENCRYPTION_ALGORITHM_AES256);
+  } else {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+}
+
+// Validator for ExportFormat: only versions 1 and 2 are accepted.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ExportFormat *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t format = strict_iecstrtoll(s.c_str(), &parse_error);
+  if (!parse_error.empty() || (format != 1 && format != 2)) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+
+  v = boost::any(format);
+}
+
+// Validator for Secret: stores the value and, as a side effect, sets the
+// global "keyfile" config option (aborts via set_val_or_die on failure).
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Secret *target_type, int) {
+
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  g_conf().set_val_or_die("keyfile", s.c_str());
+  v = boost::any(s);
+}
+
+} // namespace argument_types
+} // namespace rbd
diff --git a/src/tools/rbd/ArgumentTypes.h b/src/tools/rbd/ArgumentTypes.h
new file mode 100644
index 000000000..39d374c64
--- /dev/null
+++ b/src/tools/rbd/ArgumentTypes.h
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_ARGUMENT_TYPES_H
+#define CEPH_RBD_ARGUMENT_TYPES_H
+
+#include "include/int_types.h"
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/any.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace argument_types {
+
+enum ArgumentModifier {
+ ARGUMENT_MODIFIER_NONE,
+ ARGUMENT_MODIFIER_SOURCE,
+ ARGUMENT_MODIFIER_DEST
+};
+
+enum SpecFormat {
+ SPEC_FORMAT_IMAGE,
+ SPEC_FORMAT_SNAPSHOT,
+ SPEC_FORMAT_IMAGE_OR_SNAPSHOT
+};
+
+static const std::string SOURCE_PREFIX("source-");
+static const std::string DEST_PREFIX("dest-");
+
+// positional arguments
+static const std::string POSITIONAL_COMMAND_SPEC("positional-command-spec");
+static const std::string POSITIONAL_ARGUMENTS("positional-arguments");
+static const std::string IMAGE_SPEC("image-spec");
+static const std::string SNAPSHOT_SPEC("snap-spec");
+static const std::string IMAGE_OR_SNAPSHOT_SPEC("image-or-snap-spec");
+static const std::string PATH_NAME("path-name");
+static const std::string IMAGE_ID("image-id");
+
+// optional arguments
+static const std::string CONFIG_PATH("conf");
+static const std::string POOL_NAME("pool");
+static const std::string DEST_POOL_NAME("dest-pool");
+static const std::string NAMESPACE_NAME("namespace");
+static const std::string DEST_NAMESPACE_NAME("dest-namespace");
+static const std::string IMAGE_NAME("image");
+static const std::string DEST_IMAGE_NAME("dest");
+static const std::string SNAPSHOT_NAME("snap");
+static const std::string SNAPSHOT_ID("snap-id");
+static const std::string DEST_SNAPSHOT_NAME("dest-snap");
+static const std::string PATH("path");
+static const std::string FROM_SNAPSHOT_NAME("from-snap");
+static const std::string WHOLE_OBJECT("whole-object");
+
+static const std::string IMAGE_FORMAT("image-format");
+static const std::string IMAGE_NEW_FORMAT("new-format");
+static const std::string IMAGE_ORDER("order");
+static const std::string IMAGE_OBJECT_SIZE("object-size");
+static const std::string IMAGE_FEATURES("image-feature");
+static const std::string IMAGE_SHARED("image-shared");
+static const std::string IMAGE_SIZE("size");
+static const std::string IMAGE_STRIPE_UNIT("stripe-unit");
+static const std::string IMAGE_STRIPE_COUNT("stripe-count");
+static const std::string IMAGE_DATA_POOL("data-pool");
+static const std::string IMAGE_SPARSE_SIZE("sparse-size");
+static const std::string IMAGE_THICK_PROVISION("thick-provision");
+static const std::string IMAGE_FLATTEN("flatten");
+static const std::string IMAGE_MIRROR_IMAGE_MODE("mirror-image-mode");
+
+static const std::string JOURNAL_OBJECT_SIZE("journal-object-size");
+static const std::string JOURNAL_SPLAY_WIDTH("journal-splay-width");
+static const std::string JOURNAL_POOL("journal-pool");
+
+static const std::string NO_PROGRESS("no-progress");
+static const std::string FORMAT("format");
+static const std::string PRETTY_FORMAT("pretty-format");
+static const std::string VERBOSE("verbose");
+static const std::string NO_ERR("no-error");
+
+static const std::string LIMIT("limit");
+
+static const std::string SKIP_QUIESCE("skip-quiesce");
+static const std::string IGNORE_QUIESCE_ERROR("ignore-quiesce-error");
+
+static const std::set<std::string> SWITCH_ARGUMENTS = {
+ WHOLE_OBJECT, IMAGE_SHARED, IMAGE_THICK_PROVISION, IMAGE_FLATTEN,
+ NO_PROGRESS, PRETTY_FORMAT, VERBOSE, NO_ERR, SKIP_QUIESCE,
+ IGNORE_QUIESCE_ERROR
+};
+
+struct ImageSize {};
+struct ImageOrder {};
+struct ImageObjectSize {};
+struct ImageFormat {};
+struct ImageNewFormat {};
+
+struct ImageFeatures {
+ static const std::map<uint64_t, std::string> FEATURE_MAPPING;
+
+ uint64_t features;
+};
+
+struct MirrorImageMode {};
+
+template <typename T>
+struct TypedValue {
+ T value;
+ TypedValue(const T& t) : value(t) {}
+};
+
+struct Format : public TypedValue<std::string> {
+ typedef boost::shared_ptr<ceph::Formatter> Formatter;
+
+ Format(const std::string &format) : TypedValue<std::string>(format) {}
+
+ Formatter create_formatter(bool pretty) const;
+};
+
+struct JournalObjectSize {};
+
+struct ExportFormat {};
+
+struct Secret {};
+
+struct EncryptionAlgorithm {};
+
+void add_export_format_option(boost::program_options::options_description *opt);
+
+std::string get_name_prefix(ArgumentModifier modifier);
+std::string get_description_prefix(ArgumentModifier modifier);
+
+void add_all_option(boost::program_options::options_description *opt,
+ std::string description);
+
+void add_pool_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier,
+ const std::string &desc_suffix = "");
+void add_namespace_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_image_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier,
+ const std::string &desc_suffix = "");
+
+void add_image_id_option(boost::program_options::options_description *opt,
+ const std::string &desc_suffix = "");
+
+void add_snap_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+void add_snap_id_option(boost::program_options::options_description *opt);
+
+void add_pool_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ bool namespaces_supported);
+
+void add_image_spec_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_snap_spec_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_image_or_snap_spec_options(
+ boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_create_image_options(boost::program_options::options_description *opt,
+ bool include_format);
+
+void add_create_journal_options(
+ boost::program_options::options_description *opt);
+
+void add_size_option(boost::program_options::options_description *opt);
+
+void add_sparse_size_option(boost::program_options::options_description *opt);
+
+void add_path_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ const std::string &description);
+
+void add_limit_option(boost::program_options::options_description *opt);
+
+void add_no_progress_option(boost::program_options::options_description *opt);
+
+void add_format_options(boost::program_options::options_description *opt);
+
+void add_verbose_option(boost::program_options::options_description *opt);
+
+void add_no_error_option(boost::program_options::options_description *opt);
+
+void add_flatten_option(boost::program_options::options_description *opt);
+
+void add_snap_create_options(boost::program_options::options_description *opt);
+
+std::string get_short_features_help(bool append_suffix);
+std::string get_long_features_help();
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ExportFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageOrder *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageObjectSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageNewFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageFeatures *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Format *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ JournalObjectSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ EncryptionAlgorithm *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Secret *target_type, int);
+
+
+std::ostream &operator<<(std::ostream &os, const ImageFeatures &features);
+
+} // namespace argument_types
+} // namespace rbd
+
+#endif // CEPH_RBD_ARGUMENT_TYPES_H
diff --git a/src/tools/rbd/CMakeLists.txt b/src/tools/rbd/CMakeLists.txt
new file mode 100644
index 000000000..5a895354d
--- /dev/null
+++ b/src/tools/rbd/CMakeLists.txt
@@ -0,0 +1,80 @@
+set(CURSES_NEED_NCURSES TRUE)
+# libcurses may not be available on some platforms (e.g. Windows).
+find_package(Curses)
+
+set(rbd_srcs
+ rbd.cc
+ ArgumentTypes.cc
+ IndentStream.cc
+ MirrorDaemonServiceInfo.cc
+ OptionPrinter.cc
+ Schedule.cc
+ Shell.cc
+ Utils.cc
+ action/Bench.cc
+ action/Children.cc
+ action/Clone.cc
+ action/Config.cc
+ action/Copy.cc
+ action/Create.cc
+ action/Device.cc
+ action/Diff.cc
+ action/DiskUsage.cc
+ action/Encryption.cc
+ action/Export.cc
+ action/Feature.cc
+ action/Flatten.cc
+ action/Ggate.cc
+ action/Group.cc
+ action/ImageMeta.cc
+ action/Import.cc
+ action/Info.cc
+ action/Journal.cc
+ action/Kernel.cc
+ action/List.cc
+ action/Lock.cc
+ action/MergeDiff.cc
+ action/Migration.cc
+ action/MirrorImage.cc
+ action/MirrorPool.cc
+ action/MirrorSnapshotSchedule.cc
+ action/Namespace.cc
+ action/Nbd.cc
+ action/ObjectMap.cc
+ action/Perf.cc
+ action/PersistentCache.cc
+ action/Pool.cc
+ action/Remove.cc
+ action/Rename.cc
+ action/Resize.cc
+ action/Snap.cc
+ action/Sparsify.cc
+ action/Status.cc
+ action/TrashPurgeSchedule.cc
+ action/Trash.cc
+ action/Watch.cc
+ action/Wnbd.cc)
+
+add_executable(rbd ${rbd_srcs}
+ $<TARGET_OBJECTS:common_texttable_obj>)
+set_target_properties(rbd PROPERTIES OUTPUT_NAME rbd)
+target_link_libraries(rbd
+ cls_journal_client
+ cls_rbd_client
+ rbd_types
+ librbd
+ journal
+ libneorados
+ librados
+ ceph-common global
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+if(CURSES_FOUND)
+ target_compile_definitions(rbd PRIVATE HAVE_CURSES)
+ target_link_libraries(rbd ${CURSES_LIBRARIES})
+endif()
+if(WITH_KRBD)
+ target_link_libraries(rbd
+ krbd)
+endif()
+
+install(TARGETS rbd DESTINATION bin)
diff --git a/src/tools/rbd/IndentStream.cc b/src/tools/rbd/IndentStream.cc
new file mode 100644
index 000000000..83591a8cb
--- /dev/null
+++ b/src/tools/rbd/IndentStream.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/IndentStream.h"
+
+namespace rbd {
+
+int IndentBuffer::overflow (int c) {
+ if (traits_type::eq_int_type(traits_type::eof(), c)) {
+ return traits_type::not_eof(c);
+ }
+
+ int r;
+ switch (c) {
+ case '\n':
+ m_buffer += c;
+ flush_line();
+ r = m_streambuf->sputn(m_buffer.c_str(), m_buffer.size());
+ m_buffer.clear();
+ return r;
+ case '\t':
+ // convert tab to single space and fall-through
+ c = ' ';
+ default:
+ if (m_indent + m_buffer.size() >= m_line_length) {
+ size_t word_offset = m_buffer.find_last_of(m_delim);
+ bool space_delim = (m_delim == " ");
+ if (word_offset == std::string::npos && !space_delim) {
+ word_offset = m_buffer.find_last_of(" ");
+ }
+
+ if (word_offset != std::string::npos) {
+ flush_line();
+ m_streambuf->sputn(m_buffer.c_str(), word_offset);
+ m_buffer = std::string(m_buffer,
+ word_offset + (space_delim ? 1 : 0));
+ } else {
+ flush_line();
+ m_streambuf->sputn(m_buffer.c_str(), m_buffer.size());
+ m_buffer.clear();
+ }
+ m_streambuf->sputc('\n');
+ }
+ m_buffer += c;
+ return c;
+ }
+}
+
+void IndentBuffer::flush_line() {
+ if (m_initial_offset >= m_indent) {
+ m_initial_offset = 0;
+ m_streambuf->sputc('\n');
+ }
+
+ m_streambuf->sputn(m_indent_prefix.c_str(), m_indent - m_initial_offset);
+ m_initial_offset = 0;
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/IndentStream.h b/src/tools/rbd/IndentStream.h
new file mode 100644
index 000000000..85ccc85b3
--- /dev/null
+++ b/src/tools/rbd/IndentStream.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_INDENT_STREAM_H
+#define CEPH_RBD_INDENT_STREAM_H
+
+#include "include/int_types.h"
+#include <iostream>
+#include <streambuf>
+#include <iomanip>
+
+namespace rbd {
+
+class IndentBuffer : public std::streambuf {
+public:
+ IndentBuffer(size_t indent, size_t initial_offset, size_t line_length,
+ std::streambuf *streambuf)
+ : m_indent(indent), m_initial_offset(initial_offset),
+ m_line_length(line_length), m_streambuf(streambuf),
+ m_delim(" "), m_indent_prefix(m_indent, ' ') {
+ }
+
+ void set_delimiter(const std::string &delim) {
+ m_delim = delim;
+ }
+
+protected:
+ int overflow (int c) override;
+
+private:
+ size_t m_indent;
+ size_t m_initial_offset;
+ size_t m_line_length;
+ std::streambuf *m_streambuf;
+
+ std::string m_delim;
+ std::string m_indent_prefix;
+ std::string m_buffer;
+
+ void flush_line();
+};
+
+class IndentStream : public std::ostream {
+public:
+ IndentStream(size_t indent, size_t initial_offset, size_t line_length,
+ std::ostream &os)
+ : std::ostream(&m_indent_buffer),
+ m_indent_buffer(indent, initial_offset, line_length, os.rdbuf()) {
+ }
+
+ void set_delimiter(const std::string &delim) {
+ m_indent_buffer.set_delimiter(delim);
+ }
+private:
+ IndentBuffer m_indent_buffer;
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_INDENT_STREAM_ITERATOR_H
diff --git a/src/tools/rbd/MirrorDaemonServiceInfo.cc b/src/tools/rbd/MirrorDaemonServiceInfo.cc
new file mode 100644
index 000000000..e7422e66a
--- /dev/null
+++ b/src/tools/rbd/MirrorDaemonServiceInfo.cc
@@ -0,0 +1,307 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "tools/rbd/MirrorDaemonServiceInfo.h"
+
+#include <boost/scope_exit.hpp>
+#include <iostream>
+
+#include "json_spirit/json_spirit.h"
+
+namespace rbd {
+
+std::ostream& operator<<(std::ostream& os, MirrorHealth mirror_health) {
+ switch (mirror_health) {
+ case MIRROR_HEALTH_OK:
+ os << "OK";
+ break;
+ case MIRROR_HEALTH_UNKNOWN:
+ os << "UNKNOWN";
+ break;
+ case MIRROR_HEALTH_WARNING:
+ os << "WARNING";
+ break;
+ case MIRROR_HEALTH_ERROR:
+ os << "ERROR";
+ break;
+ }
+ return os;
+}
+
+std::string MirrorService::get_image_description() const {
+ std::string description = (!client_id.empty() ? client_id :
+ stringify(service_id));
+ if (!hostname.empty()) {
+ description += " on " + hostname;
+ }
+ return description;
+}
+
+void MirrorService::dump_image(
+ argument_types::Format::Formatter formatter) const {
+ formatter->open_object_section("daemon_service");
+ formatter->dump_string("service_id", service_id);
+ formatter->dump_string("instance_id", instance_id);
+ formatter->dump_string("daemon_id", client_id);
+ formatter->dump_string("hostname", hostname);
+ formatter->close_section();
+}
+
+int MirrorDaemonServiceInfo::init() {
+ int r = get_mirror_service_dump();
+ if (r < 0) {
+ return r;
+ } else if (m_mirror_services.empty()) {
+ return 0;
+ }
+
+ r = get_mirror_service_status();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+const MirrorService* MirrorDaemonServiceInfo::get_by_service_id(
+ const std::string& service_id) const {
+ auto it = m_mirror_services.find(service_id);
+ if (it == m_mirror_services.end()) {
+ return nullptr;
+ }
+
+ return &it->second;
+}
+
+const MirrorService* MirrorDaemonServiceInfo::get_by_instance_id(
+ const std::string& instance_id) const {
+ auto it = m_instance_to_service_ids.find(instance_id);
+ if (it == m_instance_to_service_ids.end()) {
+ return nullptr;
+ }
+
+ return get_by_service_id(it->second);
+}
+
+MirrorServices MirrorDaemonServiceInfo::get_mirror_services() const {
+ MirrorServices mirror_services;
+ for (auto& it : m_mirror_services) {
+ mirror_services.push_back(it.second);
+ }
+ return mirror_services;
+}
+
+int MirrorDaemonServiceInfo::get_mirror_service_dump() {
+ librados::Rados rados(m_io_ctx);
+ std::string cmd = R"({"prefix": "service dump", "format": "json"})";
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query services: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ json_spirit::mValue json_root;
+ if(!json_spirit::read(out_bl.to_str(), json_root)) {
+ std::cerr << "rbd: invalid service dump JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ try {
+ auto& services = json_root.get_obj()["services"];
+ if (services.is_null()) {
+ std::cerr << "rbd: missing services in service dump JSON" << std::endl;
+ return -EBADMSG;
+ }
+
+ auto& service = services.get_obj()["rbd-mirror"];
+ if (service.is_null()) {
+ // no rbd-mirror daemons running
+ return 0;
+ }
+
+ auto& daemons = service.get_obj()["daemons"];
+ if (daemons.is_null()) {
+ return 0;
+ }
+
+ for (auto& daemon_pair : daemons.get_obj()) {
+ // rbd-mirror instances will always be integers but other objects
+ // are included
+ auto& service_id = daemon_pair.first;
+ if (daemon_pair.second.type() != json_spirit::obj_type) {
+ continue;
+ }
+
+ auto& daemon = daemon_pair.second.get_obj();
+ auto& metadata_val = daemon["metadata"];
+ if (metadata_val.is_null()) {
+ continue;
+ }
+ auto& metadata = metadata_val.get_obj();
+
+ MirrorService mirror_service{service_id};
+
+ auto& client_id = metadata["id"];
+ if (!client_id.is_null()) {
+ mirror_service.client_id = client_id.get_str();
+ }
+
+ auto& ceph_version = metadata["ceph_version_short"];
+ if (!ceph_version.is_null()) {
+ mirror_service.ceph_version = ceph_version.get_str();
+ }
+
+ auto& hostname = metadata["hostname"];
+ if (!hostname.is_null()) {
+ mirror_service.hostname = hostname.get_str();
+ }
+
+ m_mirror_services[service_id] = mirror_service;
+ }
+
+ } catch (std::runtime_error&) {
+ std::cerr << "rbd: unexpected service dump JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+int MirrorDaemonServiceInfo::get_mirror_service_status() {
+ librados::Rados rados(m_io_ctx);
+ std::string cmd = R"({"prefix": "service status", "format": "json"})";
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ int r = rados.mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query service status: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ json_spirit::mValue json_root;
+ if(!json_spirit::read(out_bl.to_str(), json_root)) {
+ std::cerr << "rbd: invalid service status JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ bool found_leader = false;
+ bool found_pool = false;
+
+ try {
+ auto& service = json_root.get_obj()["rbd-mirror"];
+ if (service.is_null()) {
+ return 0;
+ }
+
+ for (auto& daemon_pair : service.get_obj()) {
+ std::string service_id = daemon_pair.first;
+ auto it = m_mirror_services.find(service_id);
+ if (it == m_mirror_services.end()) {
+ continue;
+ }
+
+ auto& mirror_service = it->second;
+ auto& daemon = daemon_pair.second.get_obj();
+ auto& status = daemon["status"];
+ if (status.is_null()) {
+ mirror_service.callouts.push_back("not reporting status");
+ mirror_service.health = MIRROR_HEALTH_WARNING;
+ continue;
+ }
+
+ auto& json = status.get_obj()["json"];
+ if (json.is_null()) {
+ mirror_service.callouts.push_back("not reporting status");
+ mirror_service.health = MIRROR_HEALTH_WARNING;
+ continue;
+ }
+
+ json_spirit::mValue json_status;
+ if(!json_spirit::read(json.get_str(), json_status)) {
+ std::cerr << "rbd: invalid service status daemon status JSON received"
+ << std::endl;
+ return -EBADMSG;
+ }
+
+ auto& pool_val = json_status.get_obj()[stringify(m_io_ctx.get_id())];
+ if (pool_val.is_null()) {
+ mirror_service.callouts.push_back("not reporting status for pool");
+ mirror_service.health = MIRROR_HEALTH_WARNING;
+ continue;
+ }
+
+ auto& pool = pool_val.get_obj();
+ found_pool = true;
+
+ auto& instance_id = pool["instance_id"];
+ if (!instance_id.is_null()) {
+ mirror_service.instance_id = instance_id.get_str();
+ m_instance_to_service_ids[mirror_service.instance_id] = service_id;
+ }
+
+ auto& leader = pool["leader"];
+ if (!leader.is_null() && leader.get_bool()) {
+ mirror_service.leader = true;
+ found_leader = true;
+ }
+
+ MirrorHealth mirror_service_health = MIRROR_HEALTH_OK;
+ auto& callouts = pool["callouts"];
+ if (!callouts.is_null()) {
+ for (auto& callout_pair : callouts.get_obj()) {
+ auto& callout = callout_pair.second.get_obj();
+ auto& level = callout["level"];
+ if (level.is_null()) {
+ continue;
+ }
+
+ auto& level_str = level.get_str();
+ if (mirror_service_health < MIRROR_HEALTH_ERROR &&
+ level_str == "error") {
+ mirror_service_health = MIRROR_HEALTH_ERROR;
+ } else if (mirror_service_health < MIRROR_HEALTH_WARNING &&
+ level_str == "warning") {
+ mirror_service_health = MIRROR_HEALTH_WARNING;
+ }
+
+ auto& text = callout["text"];
+ if (!text.is_null()) {
+ mirror_service.callouts.push_back(text.get_str());
+ }
+ }
+ }
+ mirror_service.health = mirror_service_health;
+ }
+ } catch (std::runtime_error&) {
+ std::cerr << "rbd: unexpected service status JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ // compute overall daemon health
+ m_daemon_health = MIRROR_HEALTH_OK;
+ if (!found_pool) {
+ // no daemons are reporting status for this pool
+ m_daemon_health = MIRROR_HEALTH_ERROR;
+ } else if (!found_leader) {
+ // no daemons are reporting leader role for this pool
+ m_daemon_health = MIRROR_HEALTH_WARNING;
+ }
+
+ for (auto& pair : m_mirror_services) {
+ m_daemon_health = std::max(m_daemon_health, pair.second.health);
+ }
+
+ return 0;
+}
+
+} // namespace rbd
+
diff --git a/src/tools/rbd/MirrorDaemonServiceInfo.h b/src/tools/rbd/MirrorDaemonServiceInfo.h
new file mode 100644
index 000000000..d667332e5
--- /dev/null
+++ b/src/tools/rbd/MirrorDaemonServiceInfo.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H
+#define CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "tools/rbd/ArgumentTypes.h"
+
+#include <iosfwd>
+#include <list>
+#include <map>
+#include <string>
+
+namespace rbd {
+
+enum MirrorHealth {
+ MIRROR_HEALTH_OK = 0,
+ MIRROR_HEALTH_UNKNOWN = 1,
+ MIRROR_HEALTH_WARNING = 2,
+ MIRROR_HEALTH_ERROR = 3
+};
+
+std::ostream& operator<<(std::ostream& os, MirrorHealth mirror_health);
+
+struct MirrorService {
+ MirrorService() {}
+ explicit MirrorService(const std::string& service_id)
+ : service_id(service_id) {
+ }
+
+ std::string service_id;
+ std::string instance_id;
+ bool leader = false;
+ std::string client_id;
+ std::string ceph_version;
+ std::string hostname;
+ std::list<std::string> callouts;
+
+ MirrorHealth health = MIRROR_HEALTH_UNKNOWN;
+
+ std::string get_image_description() const;
+ void dump_image(argument_types::Format::Formatter formatter) const;
+};
+
+typedef std::list<MirrorService> MirrorServices;
+
+class MirrorDaemonServiceInfo {
+public:
+ MirrorDaemonServiceInfo(librados::IoCtx &io_ctx) : m_io_ctx(io_ctx) {
+ }
+
+ int init();
+
+ const MirrorService* get_by_service_id(const std::string& service_id) const;
+ const MirrorService* get_by_instance_id(const std::string& instance_id) const;
+
+ MirrorServices get_mirror_services() const;
+ MirrorHealth get_daemon_health() const {
+ return m_daemon_health;
+ }
+
+private:
+ librados::IoCtx &m_io_ctx;
+
+ std::map<std::string, MirrorService> m_mirror_services;
+ std::map<std::string, std::string> m_instance_to_service_ids;
+
+ MirrorHealth m_daemon_health = MIRROR_HEALTH_UNKNOWN;
+
+ int get_mirror_service_dump();
+ int get_mirror_service_status();
+
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H
diff --git a/src/tools/rbd/OptionPrinter.cc b/src/tools/rbd/OptionPrinter.cc
new file mode 100644
index 000000000..0fea6b691
--- /dev/null
+++ b/src/tools/rbd/OptionPrinter.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/OptionPrinter.h"
+#include "tools/rbd/IndentStream.h"
+#include "include/ceph_assert.h"
+
+namespace rbd {
+
+namespace po = boost::program_options;
+
+const std::string OptionPrinter::POSITIONAL_ARGUMENTS("Positional arguments");
+const std::string OptionPrinter::OPTIONAL_ARGUMENTS("Optional arguments");
+
+const size_t OptionPrinter::MAX_DESCRIPTION_OFFSET;
+
+OptionPrinter::OptionPrinter(const OptionsDescription &positional,
+ const OptionsDescription &optional)
+ : m_positional(positional), m_optional(optional) {
+}
+
+void OptionPrinter::print_short(std::ostream &os, size_t initial_offset) {
+ size_t max_option_width = 0;
+ std::vector<std::string> optionals;
+ for (size_t i = 0; i < m_optional.options().size(); ++i) {
+ std::stringstream option;
+
+ bool required = m_optional.options()[i]->semantic()->is_required();
+ if (!required) {
+ option << "[";
+ }
+ option << "--" << m_optional.options()[i]->long_name();
+ if (m_optional.options()[i]->semantic()->max_tokens() != 0) {
+ option << " <" << m_optional.options()[i]->long_name() << ">";
+ }
+ if (!required) {
+ option << "]";
+ }
+ max_option_width = std::max(max_option_width, option.str().size());
+ optionals.emplace_back(option.str());
+ }
+
+ std::vector<std::string> positionals;
+ for (size_t i = 0; i < m_positional.options().size(); ++i) {
+ std::stringstream option;
+
+ // we overload po::value<std::string>()->default_value("") to signify
+ // an optional positional argument (purely for help printing purposes)
+ boost::any v;
+ bool required = !m_positional.options()[i]->semantic()->apply_default(v);
+ if (!required) {
+ auto ptr = boost::any_cast<std::string>(&v);
+ ceph_assert(ptr && ptr->empty());
+ option << "[";
+ }
+ option << "<" << m_positional.options()[i]->long_name() << ">";
+ if (m_positional.options()[i]->semantic()->max_tokens() > 1) {
+ option << " [<" << m_positional.options()[i]->long_name() << "> ...]";
+ }
+ if (!required) {
+ option << "]";
+ }
+
+ max_option_width = std::max(max_option_width, option.str().size());
+ positionals.emplace_back(option.str());
+
+ if (m_positional.options()[i]->semantic()->max_tokens() > 1) {
+ break;
+ }
+ }
+
+ size_t indent = std::min(initial_offset, MAX_DESCRIPTION_OFFSET) + 1;
+ if (indent + max_option_width + 2 > LINE_WIDTH) {
+ // decrease the indent so that we don't wrap past the end of the line
+ indent = LINE_WIDTH - max_option_width - 2;
+ }
+
+ IndentStream indent_stream(indent, initial_offset, LINE_WIDTH, os);
+ indent_stream.set_delimiter("[");
+ for (auto& option : optionals) {
+ indent_stream << option << " ";
+ }
+
+ if (optionals.size() > 0 || positionals.size() == 0) {
+ indent_stream << std::endl;
+ }
+
+ if (positionals.size() > 0) {
+ indent_stream.set_delimiter(" ");
+ for (auto& option : positionals) {
+ indent_stream << option << " ";
+ }
+ indent_stream << std::endl;
+ }
+}
+
+void OptionPrinter::print_optional(const OptionsDescription &global_opts,
+ size_t &name_width, std::ostream &os) {
+ std::string indent2(2, ' ');
+
+ for (size_t i = 0; i < global_opts.options().size(); ++i) {
+ std::string description = global_opts.options()[i]->description();
+ auto result = boost::find_first(description, "deprecated");
+ if (!result.empty()) {
+ continue;
+ }
+ std::stringstream ss;
+ ss << indent2
+ << global_opts.options()[i]->format_name() << " "
+ << global_opts.options()[i]->format_parameter();
+
+ std::cout << ss.str();
+ IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, std::cout);
+ indent_stream << global_opts.options()[i]->description() << std::endl;
+ }
+
+}
+
+void OptionPrinter::print_detailed(std::ostream &os) {
+ std::string indent_prefix(2, ' ');
+ size_t name_width = compute_name_width(indent_prefix.size());
+
+ if (m_positional.options().size() > 0) {
+ std::cout << POSITIONAL_ARGUMENTS << std::endl;
+ for (size_t i = 0; i < m_positional.options().size(); ++i) {
+ std::stringstream ss;
+ ss << indent_prefix << "<" << m_positional.options()[i]->long_name()
+ << ">";
+
+ std::cout << ss.str();
+ IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os);
+ indent_stream << m_positional.options()[i]->description() << std::endl;
+ }
+ std::cout << std::endl;
+ }
+
+ if (m_optional.options().size() > 0) {
+ std::cout << OPTIONAL_ARGUMENTS << std::endl;
+ print_optional(m_optional, name_width, os);
+ std::cout << std::endl;
+ }
+}
+
+size_t OptionPrinter::compute_name_width(size_t indent) {
+ size_t width = MIN_NAME_WIDTH;
+ std::vector<OptionsDescription> descs = {m_positional, m_optional};
+ for (size_t desc_idx = 0; desc_idx < descs.size(); ++desc_idx) {
+ const OptionsDescription &desc = descs[desc_idx];
+ for (size_t opt_idx = 0; opt_idx < desc.options().size(); ++opt_idx) {
+ size_t name_width = desc.options()[opt_idx]->format_name().size() +
+ desc.options()[opt_idx]->format_parameter().size()
+ + 1;
+ width = std::max(width, name_width);
+ }
+ }
+ width += indent;
+ width = std::min(width, MAX_DESCRIPTION_OFFSET) + 1;
+ return width;
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/OptionPrinter.h b/src/tools/rbd/OptionPrinter.h
new file mode 100644
index 000000000..06d3a3c99
--- /dev/null
+++ b/src/tools/rbd/OptionPrinter.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_OPTION_PRINTER_H
+#define CEPH_RBD_OPTION_PRINTER_H
+
+#include "include/int_types.h"
+#include <string>
+#include <vector>
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+
+class OptionPrinter {
+public:
+ typedef boost::program_options::options_description OptionsDescription;
+
+ static const std::string POSITIONAL_ARGUMENTS;
+ static const std::string OPTIONAL_ARGUMENTS;
+
+ static const size_t LINE_WIDTH = 80;
+ static const size_t MIN_NAME_WIDTH = 20;
+ static const size_t MAX_DESCRIPTION_OFFSET = 37;
+
+ OptionPrinter(const OptionsDescription &positional,
+ const OptionsDescription &optional);
+
+ void print_short(std::ostream &os, size_t initial_offset);
+ void print_detailed(std::ostream &os);
+ static void print_optional(const OptionsDescription &global_opts,
+ size_t &name_width, std::ostream &os);
+
+private:
+ const OptionsDescription &m_positional;
+ const OptionsDescription &m_optional;
+
+ size_t compute_name_width(size_t indent);
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_OPTION_PRINTER_H
diff --git a/src/tools/rbd/Schedule.cc b/src/tools/rbd/Schedule.cc
new file mode 100644
index 000000000..15dda3aee
--- /dev/null
+++ b/src/tools/rbd/Schedule.cc
@@ -0,0 +1,367 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/ceph_json.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Schedule.h"
+#include "tools/rbd/Utils.h"
+
+#include <iostream>
+#include <regex>
+
+namespace rbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+int parse_schedule_name(const std::string &name, bool allow_images,
+ std::string *pool_name, std::string *namespace_name,
+ std::string *image_name) {
+ // parse names like:
+ // '', 'rbd/', 'rbd/ns/', 'rbd/image', 'rbd/ns/image'
+ std::regex pattern("^(?:([^/]+)/(?:(?:([^/]+)/|)(?:([^/@]+))?)?)?$");
+ std::smatch match;
+ if (!std::regex_match(name, match, pattern)) {
+ return -EINVAL;
+ }
+
+ if (match[1].matched) {
+ *pool_name = match[1];
+ } else {
+ *pool_name = "-";
+ }
+
+ if (match[2].matched) {
+ *namespace_name = match[2];
+ } else if (match[3].matched) {
+ *namespace_name = "";
+ } else {
+ *namespace_name = "-";
+ }
+
+ if (match[3].matched) {
+ if (!allow_images) {
+ return -EINVAL;
+ }
+ *image_name = match[3];
+ } else {
+ *image_name = "-";
+ }
+
+ return 0;
+}
+
+} // anonymous namespace
+
+void add_level_spec_options(po::options_description *options,
+ bool allow_image) {
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE);
+ if (allow_image) {
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+ }
+}
+
+int get_level_spec_args(const po::variables_map &vm,
+ std::map<std::string, std::string> *args) {
+ if (vm.count(at::IMAGE_NAME)) {
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+
+ int r = utils::extract_spec(vm[at::IMAGE_NAME].as<std::string>(),
+ &pool_name, &namespace_name, &image_name,
+ nullptr, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!pool_name.empty()) {
+ if (vm.count(at::POOL_NAME)) {
+ std::cerr << "rbd: pool is specified both via pool and image options"
+ << std::endl;
+ return -EINVAL;
+ }
+ if (vm.count(at::NAMESPACE_NAME)) {
+ std::cerr << "rbd: namespace is specified both via namespace and image"
+ << " options" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (vm.count(at::POOL_NAME)) {
+ pool_name = vm[at::POOL_NAME].as<std::string>();
+ }
+
+ if (vm.count(at::NAMESPACE_NAME)) {
+ namespace_name = vm[at::NAMESPACE_NAME].as<std::string>();
+ }
+
+ if (namespace_name.empty()) {
+ (*args)["level_spec"] = pool_name + "/" + image_name;
+ } else {
+ (*args)["level_spec"] = pool_name + "/" + namespace_name + "/" +
+ image_name;
+ }
+ return 0;
+ }
+
+ if (vm.count(at::NAMESPACE_NAME)) {
+ std::string pool_name;
+ std::string namespace_name;
+
+ if (vm.count(at::POOL_NAME)) {
+ pool_name = vm[at::POOL_NAME].as<std::string>();
+ }
+
+ namespace_name = vm[at::NAMESPACE_NAME].as<std::string>();
+
+ (*args)["level_spec"] = pool_name + "/" + namespace_name + "/";
+
+ return 0;
+ }
+
+ if (vm.count(at::POOL_NAME)) {
+ std::string pool_name = vm[at::POOL_NAME].as<std::string>();
+
+ (*args)["level_spec"] = pool_name + "/";
+
+ return 0;
+ }
+
+ (*args)["level_spec"] = "";
+
+ return 0;
+}
+
+void normalize_level_spec_args(std::map<std::string, std::string> *args) {
+ std::map<std::string, std::string> raw_args;
+ std::swap(raw_args, *args);
+
+ auto default_pool_name = utils::get_default_pool_name();
+ for (auto [key, value] : raw_args) {
+ if (key == "level_spec" && !value.empty() && value[0] == '/') {
+ value = default_pool_name + value;
+ }
+
+ (*args)[key] = value;
+ }
+}
+
+void add_schedule_options(po::options_description *positional,
+ bool mandatory) {
+ if (mandatory) {
+ positional->add_options()
+ ("interval", "schedule interval");
+ } else {
+ positional->add_options()
+ ("interval", po::value<std::string>()->default_value(""),
+ "schedule interval");
+ }
+ positional->add_options()
+ ("start-time", po::value<std::string>()->default_value(""),
+ "schedule start time");
+}
+
+int get_schedule_args(const po::variables_map &vm, bool mandatory,
+ std::map<std::string, std::string> *args) {
+ size_t arg_index = 0;
+
+ std::string interval = utils::get_positional_argument(vm, arg_index++);
+ if (interval.empty()) {
+ if (mandatory) {
+ std::cerr << "rbd: missing 'interval' argument" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+ }
+ (*args)["interval"] = interval;
+
+ std::string start_time = utils::get_positional_argument(vm, arg_index++);
+ if (!start_time.empty()) {
+ (*args)["start_time"] = start_time;
+ }
+
+ return 0;
+}
+
+int Schedule::parse(json_spirit::mValue &schedule_val) {
+ if (schedule_val.type() != json_spirit::array_type) {
+ std::cerr << "rbd: unexpected schedule JSON received: "
+ << "schedule is not array" << std::endl;
+ return -EBADMSG;
+ }
+
+ try {
+ for (auto &item_val : schedule_val.get_array()) {
+ if (item_val.type() != json_spirit::obj_type) {
+ std::cerr << "rbd: unexpected schedule JSON received: "
+ << "schedule item is not object" << std::endl;
+ return -EBADMSG;
+ }
+
+ auto &item = item_val.get_obj();
+
+ if (item["interval"].type() != json_spirit::str_type) {
+ std::cerr << "rbd: unexpected schedule JSON received: "
+ << "interval is not string" << std::endl;
+ return -EBADMSG;
+ }
+ auto interval = item["interval"].get_str();
+
+ std::string start_time;
+ if (item["start_time"].type() == json_spirit::str_type) {
+ start_time = item["start_time"].get_str();
+ }
+
+ items.push_back({interval, start_time});
+ }
+
+ } catch (std::runtime_error &) {
+ std::cerr << "rbd: invalid schedule JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+void Schedule::dump(ceph::Formatter *f) {
+ f->open_array_section("items");
+ for (auto &item : items) {
+ f->open_object_section("item");
+ f->dump_string("interval", item.first);
+ f->dump_string("start_time", item.second);
+ f->close_section(); // item
+ }
+ f->close_section(); // items
+}
+
+std::ostream& operator<<(std::ostream& os, Schedule &s) {
+ std::string delimiter;
+ for (auto &item : s.items) {
+ os << delimiter << "every " << item.first;
+ if (!item.second.empty()) {
+ os << " starting at " << item.second;
+ }
+ delimiter = ", ";
+ }
+ return os;
+}
+
+int ScheduleList::parse(const std::string &list) {
+ json_spirit::mValue json_root;
+ if (!json_spirit::read(list, json_root)) {
+ std::cerr << "rbd: invalid schedule list JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ try {
+ for (auto &[id, schedule_val] : json_root.get_obj()) {
+ if (schedule_val.type() != json_spirit::obj_type) {
+ std::cerr << "rbd: unexpected schedule list JSON received: "
+ << "schedule_val is not object" << std::endl;
+ return -EBADMSG;
+ }
+ auto &schedule = schedule_val.get_obj();
+ if (schedule["name"].type() != json_spirit::str_type) {
+ std::cerr << "rbd: unexpected schedule list JSON received: "
+ << "schedule name is not string" << std::endl;
+ return -EBADMSG;
+ }
+ auto name = schedule["name"].get_str();
+
+ if (schedule["schedule"].type() != json_spirit::array_type) {
+ std::cerr << "rbd: unexpected schedule list JSON received: "
+ << "schedule is not array" << std::endl;
+ return -EBADMSG;
+ }
+
+ Schedule s;
+ int r = s.parse(schedule["schedule"]);
+ if (r < 0) {
+ return r;
+ }
+ schedules[name] = s;
+ }
+ } catch (std::runtime_error &) {
+ std::cerr << "rbd: invalid schedule list JSON received" << std::endl;
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+Schedule *ScheduleList::find(const std::string &name) {
+ auto it = schedules.find(name);
+ if (it == schedules.end()) {
+ return nullptr;
+ }
+
+ return &it->second;
+}
+
+void ScheduleList::dump(ceph::Formatter *f) {
+ f->open_array_section("schedules");
+ for (auto &[name, s] : schedules) {
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+
+ int r = parse_schedule_name(name, allow_images, &pool_name, &namespace_name,
+ &image_name);
+ if (r < 0) {
+ continue;
+ }
+
+ f->open_object_section("schedule");
+ f->dump_string("pool", pool_name);
+ f->dump_string("namespace", namespace_name);
+ if (allow_images) {
+ f->dump_string("image", image_name);
+ }
+ s.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+std::ostream& operator<<(std::ostream& os, ScheduleList &l) {
+ TextTable tbl;
+ tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("NAMESPACE", TextTable::LEFT, TextTable::LEFT);
+ if (l.allow_images) {
+ tbl.define_column("IMAGE", TextTable::LEFT, TextTable::LEFT);
+ }
+ tbl.define_column("SCHEDULE", TextTable::LEFT, TextTable::LEFT);
+
+ for (auto &[name, s] : l.schedules) {
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+
+ int r = parse_schedule_name(name, l.allow_images, &pool_name,
+ &namespace_name, &image_name);
+ if (r < 0) {
+ continue;
+ }
+
+ std::stringstream ss;
+ ss << s;
+
+ tbl << pool_name << namespace_name;
+ if (l.allow_images) {
+ tbl << image_name;
+ }
+ tbl << ss.str() << TextTable::endrow;
+ }
+
+ os << tbl;
+ return os;
+}
+
+} // namespace rbd
+
diff --git a/src/tools/rbd/Schedule.h b/src/tools/rbd/Schedule.h
new file mode 100644
index 000000000..bf0964bb1
--- /dev/null
+++ b/src/tools/rbd/Schedule.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_SCHEDULE_H
+#define CEPH_RBD_SCHEDULE_H
+
+#include "json_spirit/json_spirit.h"
+
+#include <iostream>
+#include <list>
+#include <map>
+#include <string>
+#include <boost/program_options.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+
+void add_level_spec_options(
+ boost::program_options::options_description *options, bool allow_image=true);
+int get_level_spec_args(const boost::program_options::variables_map &vm,
+ std::map<std::string, std::string> *args);
+void normalize_level_spec_args(std::map<std::string, std::string> *args);
+
+void add_schedule_options(
+ boost::program_options::options_description *positional, bool mandatory);
+int get_schedule_args(const boost::program_options::variables_map &vm,
+ bool mandatory, std::map<std::string, std::string> *args);
+
+class Schedule {
+public:
+ Schedule() {
+ }
+
+ int parse(json_spirit::mValue &schedule_val);
+ void dump(ceph::Formatter *f);
+
+ friend std::ostream& operator<<(std::ostream& os, Schedule &s);
+
+private:
+ std::string name;
+ std::list<std::pair<std::string, std::string>> items;
+};
+
+std::ostream& operator<<(std::ostream& os, Schedule &s);
+
+class ScheduleList {
+public:
+ ScheduleList(bool allow_images=true) : allow_images(allow_images) {
+ }
+
+ int parse(const std::string &list);
+ Schedule *find(const std::string &name);
+ void dump(ceph::Formatter *f);
+
+ friend std::ostream& operator<<(std::ostream& os, ScheduleList &l);
+
+private:
+ bool allow_images;
+ std::map<std::string, Schedule> schedules;
+};
+
+std::ostream& operator<<(std::ostream& os, ScheduleList &l);
+
+} // namespace rbd
+
+#endif // CEPH_RBD_SCHEDULE_H
diff --git a/src/tools/rbd/Shell.cc b/src/tools/rbd/Shell.cc
new file mode 100644
index 000000000..05052ff98
--- /dev/null
+++ b/src/tools/rbd/Shell.cc
@@ -0,0 +1,488 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/IndentStream.h"
+#include "tools/rbd/OptionPrinter.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "include/stringify.h"
+#include <algorithm>
+#include <iostream>
+#include <set>
+
+namespace rbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+static const std::string APP_NAME("rbd");
+static const std::string HELP_SPEC("help");
+static const std::string BASH_COMPLETION_SPEC("bash-completion");
+
+boost::intrusive_ptr<CephContext> global_init(
+ int argc, const char **argv, std::vector<std::string> *command_args,
+ std::vector<std::string> *global_init_args) {
+ std::vector<const char*> cmd_args;
+ argv_to_vec(argc, argv, cmd_args);
+ std::vector<const char*> args(cmd_args);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+
+ *command_args = {args.begin(), args.end()};
+
+ // Scan command line arguments for ceph global init args (those are
+ // filtered out from args vector by global_init).
+
+ auto cursor = args.begin();
+ for (auto &arg : cmd_args) {
+ auto iter = cursor;
+ for (; iter != args.end(); iter++) {
+ if (*iter == arg) {
+ break;
+ }
+ }
+ if (iter == args.end()) {
+ // filtered out by global_init
+ global_init_args->push_back(arg);
+ } else {
+ cursor = ++iter;
+ }
+ }
+
+ return cct;
+}
+
+std::string format_command_spec(const Shell::CommandSpec &spec) {
+ return joinify<std::string>(spec.begin(), spec.end(), " ");
+}
+
+std::string format_alias_spec(const Shell::CommandSpec &spec,
+ const Shell::CommandSpec &alias_spec) {
+ auto spec_it = spec.begin();
+ auto alias_it = alias_spec.begin();
+ int level = 0;
+ while (spec_it != spec.end() && alias_it != alias_spec.end() &&
+ *spec_it == *alias_it) {
+ spec_it++;
+ alias_it++;
+ level++;
+ }
+ ceph_assert(spec_it != spec.end() && alias_it != alias_spec.end());
+
+ if (level < 2) {
+ return joinify<std::string>(alias_spec.begin(), alias_spec.end(), " ");
+ } else {
+ return "... " + joinify<std::string>(alias_it, alias_spec.end(), " ");
+ }
+}
+
+std::string format_command_name(const Shell::CommandSpec &spec,
+ const Shell::CommandSpec &alias_spec) {
+ std::string name = format_command_spec(spec);
+ if (!alias_spec.empty()) {
+ name += " (" + format_alias_spec(spec, alias_spec) + ")";
+ }
+ return name;
+}
+
+std::string format_option_suffix(
+ const boost::shared_ptr<po::option_description> &option) {
+ std::string suffix;
+ if (option->semantic()->max_tokens() != 0) {
+ if (option->description().find("path") != std::string::npos ||
+ option->description().find("file") != std::string::npos) {
+ suffix += " path";
+ } else if (option->description().find("host") != std::string::npos) {
+ suffix += " host";
+ } else {
+ suffix += " arg";
+ }
+ }
+ return suffix;
+}
+
+} // anonymous namespace
+
+std::vector<Shell::Action *>& Shell::get_actions() {
+ static std::vector<Action *> actions;
+
+ return actions;
+}
+
+std::set<std::string>& Shell::get_switch_arguments() {
+ static std::set<std::string> switch_arguments;
+
+ return switch_arguments;
+}
+
+void print_deprecated_warning(po::option_description option, std::string description) {
+ auto pos = description.find_first_of(":");
+ if (pos != std::string::npos) {
+ std::string param = description.substr(pos + 1, description.size() - pos - 2);
+ std::cerr << "rbd: " << option.format_name() << " is deprecated, use --"
+ << param << std::endl;
+ }
+}
+
+int Shell::execute(int argc, const char **argv) {
+ std::vector<std::string> arguments;
+ std::vector<std::string> ceph_global_init_args;
+ auto cct = global_init(argc, argv, &arguments, &ceph_global_init_args);
+
+ std::vector<std::string> command_spec;
+ get_command_spec(arguments, &command_spec);
+ bool is_alias = true;
+
+ if (command_spec.empty() || command_spec == CommandSpec({"help"})) {
+ // list all available actions
+ print_help();
+ return 0;
+ } else if (command_spec[0] == HELP_SPEC) {
+ // list help for specific action
+ command_spec.erase(command_spec.begin());
+ Action *action = find_action(command_spec, NULL, &is_alias);
+ if (action == NULL) {
+ print_unknown_action(command_spec);
+ return EXIT_FAILURE;
+ } else {
+ print_action_help(action, is_alias);
+ return 0;
+ }
+ } else if (command_spec[0] == BASH_COMPLETION_SPEC) {
+ command_spec.erase(command_spec.begin());
+ print_bash_completion(command_spec);
+ return 0;
+ }
+
+ CommandSpec *matching_spec;
+ Action *action = find_action(command_spec, &matching_spec, &is_alias);
+ if (action == NULL) {
+ print_unknown_action(command_spec);
+ return EXIT_FAILURE;
+ }
+
+ po::variables_map vm;
+ try {
+ po::options_description positional_opts;
+ po::options_description command_opts;
+ (*action->get_arguments)(&positional_opts, &command_opts);
+
+ // dynamically allocate options for our command (e.g. snap list) and
+ // its associated positional arguments
+ po::options_description argument_opts;
+ argument_opts.add_options()
+ (at::POSITIONAL_COMMAND_SPEC.c_str(),
+ po::value<std::vector<std::string> >()->required(), "")
+ (at::POSITIONAL_ARGUMENTS.c_str(),
+ po::value<std::vector<std::string> >(), "");
+
+ po::positional_options_description positional_options;
+ positional_options.add(at::POSITIONAL_COMMAND_SPEC.c_str(),
+ matching_spec->size());
+ if (!positional_opts.options().empty()) {
+ int max_count = positional_opts.options().size();
+ if (positional_opts.options().back()->semantic()->max_tokens() > 1)
+ max_count = -1;
+ positional_options.add(at::POSITIONAL_ARGUMENTS.c_str(), max_count);
+ }
+
+ po::options_description group_opts;
+ group_opts.add(command_opts)
+ .add(argument_opts);
+
+ po::store(po::command_line_parser(arguments)
+ .style(po::command_line_style::default_style &
+ ~po::command_line_style::allow_guessing)
+ .options(group_opts)
+ .positional(positional_options)
+ .run(), vm);
+
+ if (vm[at::POSITIONAL_COMMAND_SPEC].as<std::vector<std::string> >() !=
+ *matching_spec) {
+ std::cerr << "rbd: failed to parse command" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ int r = (*action->execute)(vm, ceph_global_init_args);
+
+ if (vm.size() > 0) {
+ for (auto opt : vm) {
+ try {
+ auto option = command_opts.find(opt.first, false);
+ auto description = option.description();
+ auto result = boost::find_first(description, "deprecated");
+ if (!result.empty()) {
+ print_deprecated_warning(option, description);
+ }
+ } catch (std::exception& e) {
+ continue;
+ }
+ }
+ }
+
+ po::options_description global_opts;
+ get_global_options(&global_opts);
+ auto it = ceph_global_init_args.begin();
+ for ( ; it != ceph_global_init_args.end(); ++it) {
+ auto pos = (*it).find_last_of("-");
+ auto prefix_style = po::command_line_style::allow_long;
+ if (pos == 0) {
+ prefix_style = po::command_line_style::allow_dash_for_short;
+ } else if (pos == std::string::npos) {
+ continue;
+ }
+
+ for (size_t i = 0; i < global_opts.options().size(); ++i) {
+ std::string param_name = global_opts.options()[i]->canonical_display_name(
+ prefix_style);
+ auto description = global_opts.options()[i]->description();
+ auto result = boost::find_first(description, "deprecated");
+ if (!result.empty() && *it == param_name) {
+ print_deprecated_warning(*global_opts.options()[i], description);
+ break;
+ }
+ }
+ }
+
+ if (r != 0) {
+ return std::abs(r);
+ }
+ } catch (po::required_option& e) {
+ std::cerr << "rbd: " << e.what() << std::endl;
+ return EXIT_FAILURE;
+ } catch (po::too_many_positional_options_error& e) {
+ std::cerr << "rbd: too many arguments" << std::endl;
+ return EXIT_FAILURE;
+ } catch (po::error& e) {
+ std::cerr << "rbd: " << e.what() << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ return 0;
+}
+
+void Shell::get_command_spec(const std::vector<std::string> &arguments,
+ std::vector<std::string> *command_spec) {
+ for (size_t i = 0; i < arguments.size(); ++i) {
+ std::string arg(arguments[i]);
+ if (arg == "-h" || arg == "--help") {
+ *command_spec = {HELP_SPEC};
+ return;
+ } else if (arg == "--") {
+ // all arguments after a double-dash are positional
+ if (i + 1 < arguments.size()) {
+ command_spec->insert(command_spec->end(),
+ arguments.data() + i + 1,
+ arguments.data() + arguments.size());
+ }
+ return;
+ } else if (arg[0] == '-') {
+ // if the option is not a switch, skip its value
+ if (arg.size() >= 2 &&
+ (arg[1] == '-' ||
+ get_switch_arguments().count(arg.substr(1, 1)) == 0) &&
+ (arg[1] != '-' ||
+ get_switch_arguments().count(arg.substr(2, std::string::npos)) == 0) &&
+ at::SWITCH_ARGUMENTS.count(arg.substr(2, std::string::npos)) == 0 &&
+ arg.find('=') == std::string::npos) {
+ ++i;
+ }
+ } else {
+ command_spec->push_back(arg);
+ }
+ }
+}
+
+Shell::Action *Shell::find_action(const CommandSpec &command_spec,
+ CommandSpec **matching_spec, bool *is_alias) {
+ // sort such that all "trash purge schedule ..." actions come before
+ // "trash purge"
+ std::vector<Action *> actions(get_actions());
+ std::sort(actions.begin(), actions.end(), [](auto lhs, auto rhs) {
+ return lhs->command_spec.size() > rhs->command_spec.size();
+ });
+
+ for (Action *action : actions) {
+ if (action->command_spec.size() <= command_spec.size()) {
+ if (std::equal(action->command_spec.begin(),
+ action->command_spec.end(),
+ command_spec.begin())) {
+ if (matching_spec != NULL) {
+ *matching_spec = &action->command_spec;
+ }
+ *is_alias = false;
+ return action;
+ }
+ }
+ if (!action->alias_command_spec.empty() &&
+ action->alias_command_spec.size() <= command_spec.size()) {
+ if (std::equal(action->alias_command_spec.begin(),
+ action->alias_command_spec.end(),
+ command_spec.begin())) {
+ if (matching_spec != NULL) {
+ *matching_spec = &action->alias_command_spec;
+ }
+ *is_alias = true;
+ return action;
+ }
+ }
+ }
+ return NULL;
+}
+
+void Shell::get_global_options(po::options_description *opts) {
+ opts->add_options()
+ ((at::CONFIG_PATH + ",c").c_str(), po::value<std::string>(), "path to cluster configuration")
+ ("cluster", po::value<std::string>(), "cluster name")
+ ("id", po::value<std::string>(), "client id (without 'client.' prefix)")
+ ("user", po::value<std::string>(), "deprecated[:id]")
+ ("name,n", po::value<std::string>(), "client name")
+ ("mon_host,m", po::value<std::string>(), "monitor host")
+ ("secret", po::value<at::Secret>(), "deprecated[:keyfile]")
+ ("keyfile,K", po::value<std::string>(), "path to secret key")
+ ("keyring,k", po::value<std::string>(), "path to keyring");
+}
+
+void Shell::print_help() {
+ std::cout << "usage: " << APP_NAME << " <command> ..."
+ << std::endl << std::endl
+ << "Command-line interface for managing Ceph RBD images."
+ << std::endl << std::endl;
+
+ std::vector<Action *> actions(get_actions());
+ std::sort(actions.begin(), actions.end(),
+ [](Action *lhs, Action *rhs) { return lhs->command_spec <
+ rhs->command_spec; });
+
+ std::cout << OptionPrinter::POSITIONAL_ARGUMENTS << ":" << std::endl
+ << " <command>" << std::endl;
+
+ // since the commands have spaces, we have to build our own formatter
+ std::string indent(4, ' ');
+ size_t name_width = OptionPrinter::MIN_NAME_WIDTH;
+ for (size_t i = 0; i < actions.size(); ++i) {
+ Action *action = actions[i];
+ std::string name = format_command_name(action->command_spec,
+ action->alias_command_spec);
+ name_width = std::max(name_width, name.size());
+ }
+ name_width += indent.size();
+ name_width = std::min(name_width, OptionPrinter::MAX_DESCRIPTION_OFFSET) + 1;
+
+ for (size_t i = 0; i < actions.size(); ++i) {
+ Action *action = actions[i];
+ if (!action->visible)
+ continue;
+ std::stringstream ss;
+ ss << indent
+ << format_command_name(action->command_spec, action->alias_command_spec);
+
+ std::cout << ss.str();
+ if (!action->description.empty()) {
+ IndentStream indent_stream(name_width, ss.str().size(),
+ OptionPrinter::LINE_WIDTH,
+ std::cout);
+ indent_stream << action->description << std::endl;
+ } else {
+ std::cout << std::endl;
+ }
+ }
+
+ po::options_description global_opts;
+ get_global_options(&global_opts);
+
+ std::cout << std::endl << OptionPrinter::OPTIONAL_ARGUMENTS << ":" << std::endl;
+ OptionPrinter::print_optional(global_opts, name_width, std::cout);
+
+ std::cout << std::endl
+ << "See '" << APP_NAME << " help <command>' for help on a specific "
+ << "command." << std::endl;
+}
+
+void Shell::print_action_help(Action *action, bool is_alias) {
+ std::stringstream ss;
+ ss << "usage: " << APP_NAME << " "
+ << format_command_spec(is_alias ? action->alias_command_spec : action->command_spec);
+ std::cout << ss.str();
+
+ po::options_description positional;
+ po::options_description options;
+ (*action->get_arguments)(&positional, &options);
+
+ OptionPrinter option_printer(positional, options);
+ option_printer.print_short(std::cout, ss.str().size());
+
+ if (!action->description.empty()) {
+ std::cout << std::endl << action->description << std::endl;
+ }
+
+ std::cout << std::endl;
+ option_printer.print_detailed(std::cout);
+
+ if (!action->help.empty()) {
+ std::cout << action->help << std::endl;
+ }
+}
+
+void Shell::print_unknown_action(const std::vector<std::string> &command_spec) {
+ std::cerr << "error: unknown option '"
+ << joinify<std::string>(command_spec.begin(),
+ command_spec.end(), " ") << "'"
+ << std::endl << std::endl;
+ print_help();
+}
+
+void Shell::print_bash_completion(const CommandSpec &command_spec) {
+
+ bool is_alias = true;
+
+ Action *action = find_action(command_spec, NULL, &is_alias);
+ po::options_description global_opts;
+ get_global_options(&global_opts);
+ print_bash_completion_options(global_opts);
+
+ if (action != nullptr) {
+ po::options_description positional_opts;
+ po::options_description command_opts;
+ (*action->get_arguments)(&positional_opts, &command_opts);
+ print_bash_completion_options(command_opts);
+ } else {
+ std::cout << "|help";
+ for (size_t i = 0; i < get_actions().size(); ++i) {
+ Action *action = get_actions()[i];
+ std::cout << "|"
+ << joinify<std::string>(action->command_spec.begin(),
+ action->command_spec.end(), " ");
+ if (!action->alias_command_spec.empty()) {
+ std::cout << "|"
+ << joinify<std::string>(action->alias_command_spec.begin(),
+ action->alias_command_spec.end(),
+ " ");
+ }
+ }
+ }
+ std::cout << "|" << std::endl;
+}
+
+void Shell::print_bash_completion_options(const po::options_description &ops) {
+ for (size_t i = 0; i < ops.options().size(); ++i) {
+ auto option = ops.options()[i];
+ std::string long_name(option->canonical_display_name(0));
+ std::string short_name(option->canonical_display_name(
+ po::command_line_style::allow_dash_for_short));
+
+ std::cout << "|--" << long_name << format_option_suffix(option);
+ if (long_name != short_name) {
+ std::cout << "|" << short_name << format_option_suffix(option);
+ }
+ }
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/Shell.h b/src/tools/rbd/Shell.h
new file mode 100644
index 000000000..fe3dee46b
--- /dev/null
+++ b/src/tools/rbd/Shell.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_SHELL_H
+#define CEPH_RBD_SHELL_H
+
+#include "include/int_types.h"
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+
+class Shell {
+public:
+ typedef std::vector<std::string> CommandSpec;
+
+ struct Action {
+ typedef void (*GetArguments)(boost::program_options::options_description *,
+ boost::program_options::options_description *);
+ typedef int (*Execute)(const boost::program_options::variables_map &,
+ const std::vector<std::string> &);
+
+ CommandSpec command_spec;
+ CommandSpec alias_command_spec;
+ const std::string description;
+ const std::string help;
+ GetArguments get_arguments;
+ Execute execute;
+ bool visible;
+
+ template <typename Args, typename Execute>
+ Action(const std::initializer_list<std::string> &command_spec,
+ const std::initializer_list<std::string> &alias_command_spec,
+ const std::string &description, const std::string &help,
+ Args args, Execute execute, bool visible = true)
+ : command_spec(command_spec), alias_command_spec(alias_command_spec),
+ description(description), help(help), get_arguments(args),
+ execute(execute), visible(visible) {
+ Shell::get_actions().push_back(this);
+ }
+
+ };
+
+ struct SwitchArguments {
+ SwitchArguments(const std::initializer_list<std::string> &arguments) {
+ Shell::get_switch_arguments().insert(arguments.begin(), arguments.end());
+ }
+ };
+
+ int execute(int argc, const char **argv);
+
+private:
+ static std::vector<Action *>& get_actions();
+ static std::set<std::string>& get_switch_arguments();
+
+ void get_command_spec(const std::vector<std::string> &arguments,
+ std::vector<std::string> *command_spec);
+ Action *find_action(const CommandSpec &command_spec,
+ CommandSpec **matching_spec, bool *is_alias);
+
+ void get_global_options(boost::program_options::options_description *opts);
+
+ void print_help();
+ void print_action_help(Action *action, bool is_alias);
+ void print_unknown_action(const CommandSpec &command_spec);
+
+ void print_bash_completion(const CommandSpec &command_spec);
+ void print_bash_completion_options(
+ const boost::program_options::options_description &ops);
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_SHELL_H
diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc
new file mode 100644
index 000000000..2d250fa6b
--- /dev/null
+++ b/src/tools/rbd/Utils.cc
@@ -0,0 +1,1136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Utils.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "include/encoding.h"
+#include "common/common_init.h"
+#include "include/stringify.h"
+#include "include/rbd/features.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/escape.h"
+#include "common/safe_io.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <regex>
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+namespace rbd {
+namespace utils {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
// Render a key/value map as a comma-separated list of JSON '"key": "value"'
// pairs (no surrounding braces).  Values are JSON-escaped; keys are assumed
// to need no escaping.  Used to build the argument portion of the mgr
// command payload in mgr_command().
static std::string mgr_command_args_to_str(
    const std::map<std::string, std::string> &args) {
  std::string out = "";

  std::string delimiter;
  for (auto &it : args) {
    out += delimiter + "\"" + it.first + "\": \"" +
           stringify(json_stream_escaper(it.second)) + "\"";
    delimiter = ",\n";
  }

  return out;
}
+
+} // anonymous namespace
+
// librbd::ProgressContext callback: print "\r<operation>: NN% complete..."
// to stderr whenever the integer percentage advances past the last printed
// value.  Silent when progress reporting is disabled.  Always returns 0
// (the operation is never cancelled from here).
int ProgressContext::update_progress(uint64_t offset, uint64_t total) {
  if (progress) {
    int pc = get_percentage(offset, total);
    if (pc > last_pc) {
      cerr << "\r" << operation << ": "
           << pc << "% complete...";
      cerr.flush();
      last_pc = pc;
    }
  }
  return 0;
}
+
// Print the terminal "100% complete...done." progress line (if progress
// reporting is enabled).
void ProgressContext::finish() {
  if (progress) {
    cerr << "\r" << operation << ": 100% complete...done." << std::endl;
  }
}
+
// Print a final "...failed." progress line showing the last percentage
// reached (if progress reporting is enabled).
void ProgressContext::fail() {
  if (progress) {
    cerr << "\r" << operation << ": " << last_pc << "% complete...failed."
         << std::endl;
  }
}
+
// Integer percentage of 'part' relative to 'whole' (truncating division);
// defined as 0 when 'whole' is 0 to avoid dividing by zero.
int get_percentage(uint64_t part, uint64_t whole) {
  if (whole == 0) {
    return 0;
  }
  return static_cast<int>(100 * part / whole);
}
+
// Bridge from a librbd AIO completion to a Ceph Context: completes the
// Context passed via 'arg' with the AIO return value, then releases the
// completion.  Intended for use as the C-style librbd completion callback.
void aio_context_callback(librbd::completion_t completion, void *arg)
{
  librbd::RBD::AioCompletion *aio_completion =
    reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
  Context *context = reinterpret_cast<Context *>(arg);
  context->complete(aio_completion->get_return_value());
  aio_completion->release();
}
+
+int read_string(int fd, unsigned max, std::string *out) {
+ char buf[4];
+
+ int r = safe_read_exact(fd, buf, 4);
+ if (r < 0)
+ return r;
+
+ bufferlist bl;
+ bl.append(buf, 4);
+ auto p = bl.cbegin();
+ uint32_t len;
+ decode(len, p);
+ if (len > max)
+ return -EINVAL;
+
+ char sbuf[len];
+ r = safe_read_exact(fd, sbuf, len);
+ if (r < 0)
+ return r;
+ out->assign(sbuf, len);
+ return len;
+}
+
// Parse an image spec of the form "[pool/[namespace/]]name[@snap]" into its
// components.  Each out-pointer may be nullptr; supplying that component in
// 'spec' anyway is an error.  'spec_validation' selects how strict the
// character rules are, and is downgraded to SPEC_VALIDATION_NONE when the
// rbd_validate_names config option is false.
// Returns 0 on success or -EINVAL on parse/validation failure.
int extract_spec(const std::string &spec, std::string *pool_name,
                 std::string *namespace_name, std::string *name,
                 std::string *snap_name, SpecValidation spec_validation) {
  if (!g_ceph_context->_conf.get_val<bool>("rbd_validate_names")) {
    spec_validation = SPEC_VALIDATION_NONE;
  }

  std::regex pattern;
  switch (spec_validation) {
  case SPEC_VALIDATION_FULL:
    // disallow "/" and "@" in all names
    pattern = "^(?:([^/@]+)/(?:([^/@]+)/)?)?([^/@]+)(?:@([^/@]+))?$";
    break;
  case SPEC_VALIDATION_SNAP:
    // disallow "/" and "@" in snap name
    pattern = "^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$";
    break;
  case SPEC_VALIDATION_NONE:
    // relaxed pattern assumes pool is before first "/",
    // namespace is before second "/", and snap name is after first "@"
    pattern = "^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@(.+))?$";
    break;
  default:
    ceph_abort();
    break;
  }

  std::smatch match;
  if (!std::regex_match(spec, match, pattern)) {
    std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl;
    return -EINVAL;
  }

  // capture group 1: pool name (optional)
  if (match[1].matched) {
    if (pool_name != nullptr) {
      *pool_name = match[1];
    } else {
      std::cerr << "rbd: pool name specified for a command that doesn't use it"
                << std::endl;
      return -EINVAL;
    }
  }

  // capture group 2: namespace name (optional)
  if (match[2].matched) {
    if (namespace_name != nullptr) {
      *namespace_name = match[2];
    } else {
      std::cerr << "rbd: namespace name specified for a command that doesn't "
                << "use it" << std::endl;
      return -EINVAL;
    }
  }

  // capture group 3: image (or other generic) name — always present on match
  if (name != nullptr) {
    *name = match[3];
  }

  // capture group 4: snapshot name (optional)
  if (match[4].matched) {
    if (snap_name != nullptr) {
      *snap_name = match[4];
    } else {
      std::cerr << "rbd: snapshot name specified for a command that doesn't "
                << "use it" << std::endl;
      return -EINVAL;
    }
  }
  return 0;
}
+
// Return the positional argument at 'index', or an empty string when there
// are no positional arguments or the index is out of range.
std::string get_positional_argument(const po::variables_map &vm, size_t index) {
  if (vm.count(at::POSITIONAL_ARGUMENTS) == 0) {
    return "";
  }

  const std::vector<std::string> &args =
      boost::any_cast<std::vector<std::string> >(
        vm[at::POSITIONAL_ARGUMENTS].value());
  if (index < args.size()) {
    return args[index];
  }
  return "";
}
+
+void normalize_pool_name(std::string* pool_name) {
+ if (pool_name->empty()) {
+ *pool_name = get_default_pool_name();
+ }
+}
+
// Read the rbd_default_pool config option from the global context.
std::string get_default_pool_name() {
  return g_ceph_context->_conf.get_val<std::string>("rbd_default_pool");
}
+
// Determine pool and (optionally) namespace names from the --pool /
// --namespace options, or from a positional "pool[/namespace]" argument,
// advancing *arg_index when a positional argument is consumed.  Character
// validation is skipped when rbd_validate_names is false.
// Returns 0 on success or -EINVAL on an invalid pool/namespace name.
int get_pool_and_namespace_names(
    const boost::program_options::variables_map &vm, bool validate_pool_name,
    std::string* pool_name, std::string* namespace_name, size_t *arg_index) {
  if (namespace_name != nullptr && vm.count(at::NAMESPACE_NAME)) {
    *namespace_name = vm[at::NAMESPACE_NAME].as<std::string>();
  }

  if (vm.count(at::POOL_NAME)) {
    *pool_name = vm[at::POOL_NAME].as<std::string>();
  } else {
    *pool_name = get_positional_argument(vm, *arg_index);
    if (!pool_name->empty()) {
      if (namespace_name != nullptr) {
        // split "pool/namespace" at the last '/'; with no '/' present,
        // substr(0, npos) leaves the pool name unchanged
        auto slash_pos = pool_name->find_last_of('/');
        if (slash_pos != std::string::npos) {
          *namespace_name = pool_name->substr(slash_pos + 1);
        }
        *pool_name = pool_name->substr(0, slash_pos);
      }
      ++(*arg_index);
    }
  }

  if (!g_ceph_context->_conf.get_val<bool>("rbd_validate_names")) {
    validate_pool_name = false;
  }

  if (validate_pool_name &&
      pool_name->find_first_of("/@") != std::string::npos) {
    std::cerr << "rbd: invalid pool '" << *pool_name << "'" << std::endl;
    return -EINVAL;
  } else if (namespace_name != nullptr &&
             namespace_name->find_first_of("/@") != std::string::npos) {
    std::cerr << "rbd: invalid namespace '" << *namespace_name << "'"
              << std::endl;
    return -EINVAL;
  }

  return 0;
}
+
// Determine pool, namespace and image *id* (not name) from the explicit
// options or from a positional spec argument (parsed with full validation).
// *spec_arg_index is advanced past the consumed positional argument.
// Returns 0 on success, -EINVAL when no image id could be determined.
int get_pool_image_id(const po::variables_map &vm,
                      size_t *spec_arg_index,
                      std::string *pool_name,
                      std::string *namespace_name,
                      std::string *image_id) {

  if (vm.count(at::POOL_NAME) && pool_name != nullptr) {
    *pool_name = vm[at::POOL_NAME].as<std::string>();
  }
  if (vm.count(at::NAMESPACE_NAME) && namespace_name != nullptr) {
    *namespace_name = vm[at::NAMESPACE_NAME].as<std::string>();
  }
  if (vm.count(at::IMAGE_ID) && image_id != nullptr) {
    *image_id = vm[at::IMAGE_ID].as<std::string>();
  }

  int r;
  if (image_id != nullptr && spec_arg_index != nullptr && image_id->empty()) {
    // fall back to a positional "pool/namespace/id" spec
    std::string spec = get_positional_argument(vm, (*spec_arg_index)++);
    if (!spec.empty()) {
      r = extract_spec(spec, pool_name, namespace_name, image_id, nullptr,
                       SPEC_VALIDATION_FULL);
      if (r < 0) {
        return r;
      }
    }
  }

  if (image_id != nullptr && image_id->empty()) {
    std::cerr << "rbd: image id was not specified" << std::endl;
    return -EINVAL;
  }

  return 0;
}
+
// Reconstruct a canonical "pool/[namespace/]image[@snap]" spec string from
// the command-line arguments.  If no pool was given, connects to the cluster
// so the configured default pool can be substituted.
// Appends to *spec; returns 0 on success or a negative errno.
int get_image_or_snap_spec(const po::variables_map &vm, std::string *spec) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string nspace_name;
  std::string image_name;
  std::string snap_name;
  int r = get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
      &image_name, &snap_name, true, SNAPSHOT_PRESENCE_PERMITTED,
      SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  if (pool_name.empty()) {
    // connect to the cluster to get the default pool
    librados::Rados rados;
    r = init_rados(&rados);
    if (r < 0) {
      return r;
    }

    normalize_pool_name(&pool_name);
  }

  spec->append(pool_name);
  spec->append("/");
  if (!nspace_name.empty()) {
    spec->append(nspace_name);
    spec->append("/");
  }
  spec->append(image_name);
  if (!snap_name.empty()) {
    spec->append("@");
    spec->append(snap_name);
  }

  return 0;
}
+
// Expand each comma-separated option group into individual "--<opt>"
// arguments appended to *args (e.g. "a,b" -> "--a", "--b").  Empty segments
// (including an empty group) still yield a bare "--" prefix, matching the
// original comma-splitting behavior.
void append_options_as_args(const std::vector<std::string> &options,
                            std::vector<std::string> *args) {
  for (auto &opt_group : options) {
    std::string::size_type start = 0;
    while (true) {
      std::string::size_type comma = opt_group.find(',', start);
      if (comma == std::string::npos) {
        args->push_back("--" + opt_group.substr(start));
        break;
      }
      args->push_back("--" + opt_group.substr(start, comma - start));
      start = comma + 1;
    }
  }
}
+
+int get_pool_image_snapshot_names(const po::variables_map &vm,
+ at::ArgumentModifier mod,
+ size_t *spec_arg_index,
+ std::string *pool_name,
+ std::string *namespace_name,
+ std::string *image_name,
+ std::string *snap_name,
+ bool image_name_required,
+ SnapshotPresence snapshot_presence,
+ SpecValidation spec_validation) {
+ std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_POOL_NAME : at::POOL_NAME);
+ std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+ return get_pool_generic_snapshot_names(vm, mod, spec_arg_index, pool_key,
+ pool_name, namespace_name, image_key,
+ "image", image_name, snap_name,
+ image_name_required, snapshot_presence,
+ spec_validation);
+}
+
// Core name-resolution routine shared by image/group/etc. commands.
// Resolution order: explicit options (pool/namespace/<generic>/snap keys),
// then — for image keys — a combined spec embedded in the image option
// itself, then a positional spec argument.  After resolution, names are
// validated per 'spec_validation' and the snapshot presence requirement.
// Returns 0 on success or -EINVAL.
int get_pool_generic_snapshot_names(const po::variables_map &vm,
                                    at::ArgumentModifier mod,
                                    size_t *spec_arg_index,
                                    const std::string& pool_key,
                                    std::string *pool_name,
                                    std::string *namespace_name,
                                    const std::string& generic_key,
                                    const std::string& generic_key_desc,
                                    std::string *generic_name,
                                    std::string *snap_name,
                                    bool generic_name_required,
                                    SnapshotPresence snapshot_presence,
                                    SpecValidation spec_validation) {
  std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
    at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME);
  std::string snap_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
	at::DEST_SNAPSHOT_NAME : at::SNAPSHOT_NAME);

  if (vm.count(pool_key) && pool_name != nullptr) {
    *pool_name = vm[pool_key].as<std::string>();
  }
  if (vm.count(namespace_key) && namespace_name != nullptr) {
    *namespace_name = vm[namespace_key].as<std::string>();
  }
  if (vm.count(generic_key) && generic_name != nullptr) {
    *generic_name = vm[generic_key].as<std::string>();
  }
  if (vm.count(snap_key) && snap_name != nullptr) {
    *snap_name = vm[snap_key].as<std::string>();
  }

  int r;
  if ((generic_key == at::IMAGE_NAME || generic_key == at::DEST_IMAGE_NAME) &&
      generic_name != nullptr && !generic_name->empty()) {
    // despite the separate pool and snapshot name options,
    // we can also specify them via the image option
    std::string image_name_copy(*generic_name);
    r = extract_spec(image_name_copy, pool_name, namespace_name, generic_name,
                     snap_name, spec_validation);
    if (r < 0) {
      return r;
    }
  }

  if (generic_name != nullptr && spec_arg_index != nullptr &&
      generic_name->empty()) {
    // fall back to a positional spec argument
    std::string spec = get_positional_argument(vm, (*spec_arg_index)++);
    if (!spec.empty()) {
      r = extract_spec(spec, pool_name, namespace_name, generic_name, snap_name,
                       spec_validation);
      if (r < 0) {
        return r;
      }
    }
  }

  if (generic_name != nullptr && generic_name_required &&
      generic_name->empty()) {
    std::string prefix = at::get_description_prefix(mod);
    std::cerr << "rbd: "
              << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
              << generic_key_desc << " name was not specified" << std::endl;
    return -EINVAL;
  }

  // pattern forbidding '@' and '/' in a single name component
  std::regex pattern("^[^@/]*?$");
  if (spec_validation == SPEC_VALIDATION_FULL) {
    // validate pool name while creating/renaming/copying/cloning/importing/etc
    if ((pool_name != nullptr) && !std::regex_match (*pool_name, pattern)) {
      std::cerr << "rbd: invalid pool name '" << *pool_name << "'" << std::endl;
      return -EINVAL;
    }
  }

  if (namespace_name != nullptr && !namespace_name->empty() &&
      !std::regex_match (*namespace_name, pattern)) {
    std::cerr << "rbd: invalid namespace name '" << *namespace_name << "'"
              << std::endl;
    return -EINVAL;
  }

  if (snap_name != nullptr) {
    r = validate_snapshot_name(mod, *snap_name, snapshot_presence,
                               spec_validation);
    if (r < 0) {
      return r;
    }
  }
  return 0;
}
+
// Check a snapshot name against the command's expectation: forbidden,
// optional, or required.  With SPEC_VALIDATION_SNAP the name must also
// contain neither '/' nor '@'.  Returns 0 on success or -EINVAL.
int validate_snapshot_name(at::ArgumentModifier mod,
                           const std::string &snap_name,
                           SnapshotPresence snapshot_presence,
                           SpecValidation spec_validation) {
  std::string prefix = at::get_description_prefix(mod);
  switch (snapshot_presence) {
  case SNAPSHOT_PRESENCE_PERMITTED:
    break;
  case SNAPSHOT_PRESENCE_NONE:
    if (!snap_name.empty()) {
      std::cerr << "rbd: "
                << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
                << "snapshot name specified for a command that doesn't use it"
                << std::endl;
      return -EINVAL;
    }
    break;
  case SNAPSHOT_PRESENCE_REQUIRED:
    if (snap_name.empty()) {
      std::cerr << "rbd: "
                << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
                << "snapshot name was not specified" << std::endl;
      return -EINVAL;
    }
    break;
  }

  if (spec_validation == SPEC_VALIDATION_SNAP) {
    // disallow "/" and "@" in snap name
    std::regex pattern("^[^@/]*?$");
    if (!std::regex_match (snap_name, pattern)) {
      std::cerr << "rbd: invalid snap name '" << snap_name << "'" << std::endl;
      return -EINVAL;
    }
  }
  return 0;
}
+
// Translate image creation/copy command-line options from 'vm' into 'opts':
// order/object-size, feature set/clear masks, striping, data pool, image
// format (only when 'get_format' is true), plus journal, flatten and
// mirror-mode sub-options.  Enforces format-1 incompatibilities.
// Returns 0 on success or -EINVAL for invalid combinations.
int get_image_options(const boost::program_options::variables_map &vm,
		      bool get_format, librbd::ImageOptions *opts) {
  uint64_t order = 0, stripe_unit = 0, stripe_count = 0, object_size = 0;
  uint64_t features = 0, features_clear = 0;
  std::string data_pool;
  bool order_specified = true;
  bool features_specified = false;
  bool features_clear_specified = false;
  bool stripe_specified = false;

  if (vm.count(at::IMAGE_ORDER)) {
    order = vm[at::IMAGE_ORDER].as<uint64_t>();
  } else if (vm.count(at::IMAGE_OBJECT_SIZE)) {
    // object size is 2^order; derive the order from the requested size
    object_size = vm[at::IMAGE_OBJECT_SIZE].as<uint64_t>();
    order = std::round(std::log2(object_size));
  } else {
    order_specified = false;
  }

  if (vm.count(at::IMAGE_FEATURES)) {
    features = vm[at::IMAGE_FEATURES].as<uint64_t>();
    features_specified = true;
  }

  if (vm.count(at::IMAGE_STRIPE_UNIT)) {
    stripe_unit = vm[at::IMAGE_STRIPE_UNIT].as<uint64_t>();
    stripe_specified = true;
  }

  if (vm.count(at::IMAGE_STRIPE_COUNT)) {
    stripe_count = vm[at::IMAGE_STRIPE_COUNT].as<uint64_t>();
    stripe_specified = true;
  }

  if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) {
    // a shared image must not use single-client-only features: strip them
    // from an explicit feature set, otherwise request they be cleared
    if (features_specified) {
      features &= ~RBD_FEATURES_SINGLE_CLIENT;
    } else {
      features_clear |= RBD_FEATURES_SINGLE_CLIENT;
      features_clear_specified = true;
    }
  }

  if (vm.count(at::IMAGE_DATA_POOL)) {
    data_pool = vm[at::IMAGE_DATA_POOL].as<std::string>();
  }

  if (get_format) {
    uint64_t format = 0;
    bool format_specified = false;
    if (vm.count(at::IMAGE_NEW_FORMAT)) {
      format = 2;
      format_specified = true;
    } else if (vm.count(at::IMAGE_FORMAT)) {
      format = vm[at::IMAGE_FORMAT].as<uint32_t>();
      format_specified = true;
    }
    if (format == 1) {
      std::cerr << "rbd: image format 1 is deprecated" << std::endl;
    }

    // features, non-default striping and a separate data pool all require
    // format 2: error if format 1 was requested explicitly, otherwise
    // upgrade the format implicitly
    if (features_specified && features != 0) {
      if (format_specified && format == 1) {
        std::cerr << "rbd: features not allowed with format 1; "
                  << "use --image-format 2" << std::endl;
        return -EINVAL;
      } else {
        format = 2;
        format_specified = true;
      }
    }

    if ((stripe_unit || stripe_count) &&
        (stripe_unit != (1ull << order) && stripe_count != 1)) {
      if (format_specified && format == 1) {
        std::cerr << "rbd: non-default striping not allowed with format 1; "
                  << "use --image-format 2" << std::endl;
        return -EINVAL;
      } else {
        format = 2;
        format_specified = true;
      }
    }

    if (!data_pool.empty()) {
      if (format_specified && format == 1) {
        std::cerr << "rbd: data pool not allowed with format 1; "
                  << "use --image-format 2" << std::endl;
        return -EINVAL;
      } else {
        format = 2;
        format_specified = true;
      }
    }

    if (format_specified) {
      int r = g_conf().set_val("rbd_default_format", stringify(format));
      ceph_assert(r == 0);
      opts->set(RBD_IMAGE_OPTION_FORMAT, format);
    }
  }

  if (order_specified)
    opts->set(RBD_IMAGE_OPTION_ORDER, order);
  if (features_specified)
    opts->set(RBD_IMAGE_OPTION_FEATURES, features);
  if (features_clear_specified) {
    opts->set(RBD_IMAGE_OPTION_FEATURES_CLEAR, features_clear);
  }
  if (stripe_specified) {
    opts->set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
    opts->set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
  }
  if (!data_pool.empty()) {
    opts->set(RBD_IMAGE_OPTION_DATA_POOL, data_pool);
  }
  int r = get_journal_options(vm, opts);
  if (r < 0) {
    return r;
  }

  r = get_flatten_option(vm, opts);
  if (r < 0) {
    return r;
  }

  if (vm.count(at::IMAGE_MIRROR_IMAGE_MODE)) {
    opts->set(RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE,
              vm[at::IMAGE_MIRROR_IMAGE_MODE].as<librbd::mirror_image_mode_t>());
  }

  return 0;
}
+
// Translate journal command-line options into image options, mirroring each
// value into the corresponding global config setting.  Always returns 0.
int get_journal_options(const boost::program_options::variables_map &vm,
			librbd::ImageOptions *opts) {

  if (vm.count(at::JOURNAL_OBJECT_SIZE)) {
    uint64_t size = vm[at::JOURNAL_OBJECT_SIZE].as<uint64_t>();
    // journal order is the smallest power-of-two exponent (>= 12, i.e. 4K)
    // whose value covers the requested object size
    uint64_t order = 12;
    while ((1ULL << order) < size) {
      order++;
    }
    opts->set(RBD_IMAGE_OPTION_JOURNAL_ORDER, order);

    int r = g_conf().set_val("rbd_journal_order", stringify(order));
    ceph_assert(r == 0);
  }
  if (vm.count(at::JOURNAL_SPLAY_WIDTH)) {
    opts->set(RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
              vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>());

    int r = g_conf().set_val("rbd_journal_splay_width",
                             stringify(
                               vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>()));
    ceph_assert(r == 0);
  }
  if (vm.count(at::JOURNAL_POOL)) {
    opts->set(RBD_IMAGE_OPTION_JOURNAL_POOL,
              vm[at::JOURNAL_POOL].as<std::string>());

    int r = g_conf().set_val("rbd_journal_pool",
                             vm[at::JOURNAL_POOL].as<std::string>());
    ceph_assert(r == 0);
  }

  return 0;
}
+
+int get_flatten_option(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts) {
+ if (vm.count(at::IMAGE_FLATTEN) && vm[at::IMAGE_FLATTEN].as<bool>()) {
+ uint64_t flatten = 1;
+ opts->set(RBD_IMAGE_OPTION_FLATTEN, flatten);
+ }
+ return 0;
+}
+
// Fetch the mandatory --size option into *size.
// Returns 0 on success or -EINVAL when --size is missing.
int get_image_size(const boost::program_options::variables_map &vm,
                   uint64_t *size) {
  if (vm.count(at::IMAGE_SIZE) == 0) {
    std::cerr << "rbd: must specify --size <M/G/T>" << std::endl;
    return -EINVAL;
  }

  *size = vm[at::IMAGE_SIZE].as<uint64_t>();
  return 0;
}
+
+int get_path(const boost::program_options::variables_map &vm,
+ size_t *arg_index, std::string *path) {
+ if (vm.count(at::PATH)) {
+ *path = vm[at::PATH].as<std::string>();
+ } else {
+ *path = get_positional_argument(vm, *arg_index);
+ if (!path->empty()) {
+ ++(*arg_index);
+ }
+ }
+
+ if (path->empty()) {
+ std::cerr << "rbd: path was not specified" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
// Construct the output formatter from --format / --pretty-format.
// --pretty-format is only valid with json or xml output; non-pretty
// formatters get line breaks enabled.  *formatter stays null for the
// default (plain) output.  Returns 0 or -EINVAL.
int get_formatter(const po::variables_map &vm,
                  at::Format::Formatter *formatter) {
  if (vm.count(at::FORMAT)) {
    bool pretty = vm[at::PRETTY_FORMAT].as<bool>();
    *formatter = vm[at::FORMAT].as<at::Format>().create_formatter(pretty);
    if (*formatter == nullptr && pretty) {
      std::cerr << "rbd: --pretty-format only works when --format "
                << "is json or xml" << std::endl;
      return -EINVAL;
    } else if (*formatter != nullptr && !pretty) {
      formatter->get()->enable_line_break();
    }
  } else if (vm[at::PRETTY_FORMAT].as<bool>()) {
    std::cerr << "rbd: --pretty-format only works when --format "
              << "is json or xml" << std::endl;
    return -EINVAL;
  }
  return 0;
}
+
// Build snapshot-creation flags from --skip-quiesce / --ignore-quiesce-error.
// The two options are mutually exclusive (ignoring quiesce errors makes no
// sense when quiescing is skipped).  Returns 0 or -EINVAL.
int get_snap_create_flags(const po::variables_map &vm, uint32_t *flags) {
  if (vm[at::SKIP_QUIESCE].as<bool>() &&
      vm[at::IGNORE_QUIESCE_ERROR].as<bool>()) {
    std::cerr << "rbd: " << at::IGNORE_QUIESCE_ERROR
              << " cannot be used together with " << at::SKIP_QUIESCE
              << std::endl;
    return -EINVAL;
  }

  *flags = 0;
  if (vm[at::SKIP_QUIESCE].as<bool>()) {
    *flags |= RBD_SNAP_CREATE_SKIP_QUIESCE;
  } else if (vm[at::IGNORE_QUIESCE_ERROR].as<bool>()) {
    *flags |= RBD_SNAP_CREATE_IGNORE_QUIESCE_ERROR;
  }
  return 0;
}
+
// Apply rbd-CLI-specific config overrides to the global context:
// force rbd_cache_writethrough_until_flush off, then apply the changes.
void init_context() {
  g_conf().set_val_or_die("rbd_cache_writethrough_until_flush", "false");
  g_conf().apply_changes(nullptr);
}
+
// Initialize a librados handle from the global context and connect to the
// cluster.  Prints a diagnostic and returns a negative errno on failure.
int init_rados(librados::Rados *rados) {
  init_context();

  int r = rados->init_with_context(g_ceph_context);
  if (r < 0) {
    std::cerr << "rbd: couldn't initialize rados!" << std::endl;
    return r;
  }

  r = rados->connect();
  if (r < 0) {
    std::cerr << "rbd: couldn't connect to the cluster!" << std::endl;
    return r;
  }

  return 0;
}
+
+int init(const std::string &pool_name, const std::string& namespace_name,
+ librados::Rados *rados, librados::IoCtx *io_ctx) {
+ init_context();
+
+ int r = init_rados(rados);
+ if (r < 0) {
+ return r;
+ }
+
+ r = init_io_ctx(*rados, pool_name, namespace_name, io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
// Open an IoCtx on the given pool (substituting the default pool when the
// name is empty) and select the namespace.  Prints a friendlier hint when
// the default pool itself is missing.  Returns 0 or a negative errno.
int init_io_ctx(librados::Rados &rados, std::string pool_name,
                const std::string& namespace_name, librados::IoCtx *io_ctx) {
  normalize_pool_name(&pool_name);

  int r = rados.ioctx_create(pool_name.c_str(), *io_ctx);
  if (r < 0) {
    if (r == -ENOENT && pool_name == get_default_pool_name()) {
      std::cerr << "rbd: error opening default pool "
                << "'" << pool_name << "'" << std::endl
                << "Ensure that the default pool has been created or specify "
                << "an alternate pool name." << std::endl;
    } else {
      std::cerr << "rbd: error opening pool '" << pool_name << "': "
                << cpp_strerror(r) << std::endl;
    }
    return r;
  }

  return set_namespace(namespace_name, io_ctx);
}
+
// Select a namespace on the IoCtx, first verifying (for non-empty names)
// that the namespace actually exists.  Returns 0, -ENOENT when the
// namespace is missing, or another negative errno on lookup failure.
int set_namespace(const std::string& namespace_name, librados::IoCtx *io_ctx) {
  if (!namespace_name.empty()) {
    librbd::RBD rbd;
    bool exists = false;
    int r = rbd.namespace_exists(*io_ctx, namespace_name.c_str(), &exists);
    if (r < 0) {
      std::cerr << "rbd: error asserting namespace: "
                << cpp_strerror(r) << std::endl;
      return r;
    }
    if (!exists) {
      std::cerr << "rbd: namespace '" << namespace_name << "' does not exist."
                << std::endl;
      return -ENOENT;
    }
  }
  io_ctx->set_namespace(namespace_name);
  return 0;
}
+
// Force the rbd_cache config option off for this process.
void disable_cache() {
  g_conf().set_val_or_die("rbd_cache", "false");
}
+
+int open_image(librados::IoCtx &io_ctx, const std::string &image_name,
+ bool read_only, librbd::Image *image) {
+ int r;
+ librbd::RBD rbd;
+ if (read_only) {
+ r = rbd.open_read_only(io_ctx, *image, image_name.c_str(), NULL);
+ } else {
+ r = rbd.open(io_ctx, *image, image_name.c_str());
+ }
+
+ if (r < 0) {
+ std::cerr << "rbd: error opening image " << image_name << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
// Open an RBD image by its id (rather than its name), read-only or
// read-write as requested.  Prints a diagnostic and returns a negative
// errno on failure, 0 on success.
int open_image_by_id(librados::IoCtx &io_ctx, const std::string &image_id,
                     bool read_only, librbd::Image *image) {
  int r;
  librbd::RBD rbd;
  if (read_only) {
    r = rbd.open_by_id_read_only(io_ctx, *image, image_id.c_str(), NULL);
  } else {
    r = rbd.open_by_id(io_ctx, *image, image_id.c_str());
  }

  if (r < 0) {
    std::cerr << "rbd: error opening image with id " << image_id << ": "
              << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
// One-shot helper: connect to the cluster, open pool/namespace, open the
// image (by id when 'image_id' is non-empty, otherwise by name), and set
// the snapshot context if a snapshot name was given.
// Returns 0 on success or a negative errno from the first failing step.
int init_and_open_image(const std::string &pool_name,
                        const std::string &namespace_name,
                        const std::string &image_name,
                        const std::string &image_id,
                        const std::string &snap_name, bool read_only,
                        librados::Rados *rados, librados::IoCtx *io_ctx,
                        librbd::Image *image) {
  int r = init(pool_name, namespace_name, rados, io_ctx);
  if (r < 0) {
    return r;
  }

  if (image_id.empty()) {
    r = open_image(*io_ctx, image_name, read_only, image);
  } else {
    r = open_image_by_id(*io_ctx, image_id, read_only, image);
  }
  if (r < 0) {
    return r;
  }

  if (!snap_name.empty()) {
    r = snap_set(*image, snap_name);
    if (r < 0) {
      return r;
    }
  }
  return 0;
}
+
// Set the image's snapshot read context to the named snapshot.
// Prints a diagnostic and returns a negative errno on failure.
int snap_set(librbd::Image &image, const std::string &snap_name) {
  int r = image.snap_set(snap_name.c_str());
  if (r < 0) {
    std::cerr << "error setting snapshot context: " << cpp_strerror(r)
              << std::endl;
    return r;
  }
  return 0;
}
+
// Scan the buffer from 'buffer_offset' in 'sparse_size'-sized chunks and
// compute the longest initial run of chunks that are uniformly zeroed or
// uniformly non-zero, so callers can turn all-zero runs into sparse writes.
// On return *write_length is the run's byte length and *zeroed whether it is
// all zeroes.  With sparse_size == 0, sparse detection is disabled and the
// whole buffer_length is reported as a single non-zero extent.
void calc_sparse_extent(const bufferptr &bp,
                        size_t sparse_size,
                        size_t buffer_offset,
                        uint64_t buffer_length,
                        size_t *write_length,
                        bool *zeroed) {
  if (sparse_size == 0) {
    // sparse writes are disabled -- write the full extent
    ceph_assert(buffer_offset == 0);
    *write_length = buffer_length;
    *zeroed = false;
    return;
  }

  *write_length = 0;
  size_t original_offset = buffer_offset;
  while (buffer_offset < buffer_length) {
    size_t extent_size = std::min<size_t>(
      sparse_size, buffer_length - buffer_offset);

    bufferptr extent(bp, buffer_offset, extent_size);

    bool extent_is_zero = extent.is_zero();
    if (original_offset == buffer_offset) {
      // first chunk establishes the run's zeroed-ness
      *zeroed = extent_is_zero;
    } else if (*zeroed != extent_is_zero) {
      // zeroed-ness flipped: the run ends at the previous chunk
      ceph_assert(*write_length > 0);
      return;
    }

    buffer_offset += extent_size;
    *write_length += extent_size;
  }
}
+
+std::string image_id(librbd::Image& image) {
+ std::string id;
+ int r = image.get_id(&id);
+ if (r < 0) {
+ return std::string();
+ }
+ return id;
+}
+
// Human-readable name for a mirror image mode.
std::string mirror_image_mode(librbd::mirror_image_mode_t mode) {
  switch (mode) {
    case RBD_MIRROR_IMAGE_MODE_JOURNAL:
      return "journal";
    case RBD_MIRROR_IMAGE_MODE_SNAPSHOT:
      return "snapshot";
    default:
      return "unknown";
  }
}
+
// Human-readable name for a mirror image state.
std::string mirror_image_state(librbd::mirror_image_state_t state) {
  switch (state) {
    case RBD_MIRROR_IMAGE_DISABLING:
      return "disabling";
    case RBD_MIRROR_IMAGE_ENABLED:
      return "enabled";
    case RBD_MIRROR_IMAGE_DISABLED:
      return "disabled";
    default:
      return "unknown";
  }
}
+
// Human-readable name for a mirror image status state; unrecognized values
// are rendered as "unknown (<numeric value>)".
std::string mirror_image_status_state(
    librbd::mirror_image_status_state_t state) {
  switch (state) {
  case MIRROR_IMAGE_STATUS_STATE_UNKNOWN:
    return "unknown";
  case MIRROR_IMAGE_STATUS_STATE_ERROR:
    return "error";
  case MIRROR_IMAGE_STATUS_STATE_SYNCING:
    return "syncing";
  case MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY:
    return "starting_replay";
  case MIRROR_IMAGE_STATUS_STATE_REPLAYING:
    return "replaying";
  case MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY:
    return "stopping_replay";
  case MIRROR_IMAGE_STATUS_STATE_STOPPED:
    return "stopped";
  default:
    return "unknown (" + stringify(static_cast<uint32_t>(state)) + ")";
  }
}
+
// Render a site status as "up+<state>" or "down+<state>".
std::string mirror_image_site_status_state(
    const librbd::mirror_image_site_status_t& status) {
  return (status.up ? "up+" : "down+") +
         mirror_image_status_state(status.state);
}
+
// Render the local site's status from a global status; falls back to
// "down+unknown" when no local site status is present.
std::string mirror_image_global_status_state(
    const librbd::mirror_image_global_status_t& status) {
  librbd::mirror_image_site_status_t local_status;
  int r = get_local_mirror_image_status(status, &local_status);
  if (r < 0) {
    return "down+unknown";
  }

  return mirror_image_site_status_state(local_status);
}
+
+int get_local_mirror_image_status(
+ const librbd::mirror_image_global_status_t& status,
+ librbd::mirror_image_site_status_t* local_status) {
+ auto it = std::find_if(status.site_statuses.begin(),
+ status.site_statuses.end(),
+ [](auto& site_status) {
+ return (site_status.mirror_uuid ==
+ RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID);
+ });
+ if (it == status.site_statuses.end()) {
+ return -ENOENT;
+ }
+
+ *local_status = *it;
+ return 0;
+}
+
// Format a time_t as "YYYY-MM-DD HH:MM:SS" in the local timezone.
// The epoch value 0 is treated as "unset" and yields an empty string.
std::string timestr(time_t t) {
  if (t == 0) {
    return "";
  }

  struct tm tm_local;
  localtime_r(&t, &tm_local);

  char formatted[32];
  strftime(formatted, sizeof(formatted), "%Y-%m-%d %H:%M:%S", &tm_local);
  return formatted;
}
+
// Read the rbd_default_features config value and convert it to a feature
// bitmask.
// NOTE(review): assumes the config value is numeric; boost::lexical_cast
// would throw for named feature strings — confirm the option is normalized
// to a number before it reaches here.
uint64_t get_rbd_default_features(CephContext* cct) {
  auto features = cct->_conf.get_val<std::string>("rbd_default_features");
  return boost::lexical_cast<uint64_t>(features);
}
+
// True when the snapshot belongs to a non-user namespace (e.g. internally
// managed snapshots).  Lookup failures are treated as "user" (returns
// false) so callers don't skip snapshots on transient errors.
bool is_not_user_snap_namespace(librbd::Image* image,
                                const librbd::snap_info_t &snap_info)
{
  librbd::snap_namespace_type_t namespace_type;
  int r = image->snap_get_namespace_type(snap_info.id, &namespace_type);
  if (r < 0) {
    return false;
  }
  return namespace_type != RBD_SNAP_NAMESPACE_TYPE_USER;
}
+
// List the pool's mirror peer sites into *mirror_peers.  Peers are stored
// in the pool's default namespace, so the query runs on a duplicate IoCtx
// with the namespace cleared.  Errors (other than -ENOENT) are reported to
// stderr but leave *mirror_peers empty rather than failing the caller.
void get_mirror_peer_sites(
    librados::IoCtx& io_ctx,
    std::vector<librbd::mirror_peer_site_t>* mirror_peers) {
  librados::IoCtx default_io_ctx;
  default_io_ctx.dup(io_ctx);
  default_io_ctx.set_namespace("");

  mirror_peers->clear();

  librbd::RBD rbd;
  int r = rbd.mirror_peer_site_list(default_io_ctx, mirror_peers);
  if (r < 0 && r != -ENOENT) {
    std::cerr << "rbd: failed to list mirror peers" << std::endl;
  }
}
+
+void get_mirror_peer_mirror_uuids_to_names(
+ const std::vector<librbd::mirror_peer_site_t>& mirror_peers,
+ std::map<std::string, std::string>* mirror_uuids_to_name) {
+ mirror_uuids_to_name->clear();
+ for (auto& peer : mirror_peers) {
+ if (!peer.mirror_uuid.empty() && !peer.site_name.empty()) {
+ (*mirror_uuids_to_name)[peer.mirror_uuid] = peer.site_name;
+ }
+ }
+}
+
// Ensure global_status->site_statuses has an entry for every peer that is
// expected to report status: derive the effective peer direction, collect
// the mirror uuids of non-TX peers (plus the local site unless the
// direction is TX-only), keep the matching statuses, and add placeholder
// "status not found" entries for the rest.
void populate_unknown_mirror_image_site_statuses(
    const std::vector<librbd::mirror_peer_site_t>& mirror_peers,
    librbd::mirror_image_global_status_t* global_status) {
  std::set<std::string> missing_mirror_uuids;
  librbd::mirror_peer_direction_t mirror_peer_direction =
    RBD_MIRROR_PEER_DIRECTION_RX_TX;
  for (auto& peer : mirror_peers) {
    // first peer seeds the direction; any disagreement collapses it
    // back to RX_TX
    if (peer.uuid == mirror_peers.begin()->uuid) {
      mirror_peer_direction = peer.direction;
    } else if (mirror_peer_direction != RBD_MIRROR_PEER_DIRECTION_RX_TX &&
               mirror_peer_direction != peer.direction) {
      mirror_peer_direction = RBD_MIRROR_PEER_DIRECTION_RX_TX;
    }

    // TX-only peers don't report status back to us
    if (!peer.mirror_uuid.empty() &&
        peer.direction != RBD_MIRROR_PEER_DIRECTION_TX) {
      missing_mirror_uuids.insert(peer.mirror_uuid);
    }
  }

  if (mirror_peer_direction != RBD_MIRROR_PEER_DIRECTION_TX) {
    missing_mirror_uuids.insert(RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID);
  }

  std::vector<librbd::mirror_image_site_status_t> site_statuses;
  site_statuses.reserve(missing_mirror_uuids.size());

  // keep the statuses that correspond to an expected peer
  for (auto& site_status : global_status->site_statuses) {
    if (missing_mirror_uuids.count(site_status.mirror_uuid) > 0) {
      missing_mirror_uuids.erase(site_status.mirror_uuid);
      site_statuses.push_back(site_status);
    }
  }

  // synthesize placeholder entries for peers that reported nothing
  for (auto& mirror_uuid : missing_mirror_uuids) {
    site_statuses.push_back({mirror_uuid, MIRROR_IMAGE_STATUS_STATE_UNKNOWN,
                             "status not found", 0, false});
  }

  std::swap(global_status->site_statuses, site_statuses);
}
+
// Execute a ceph-mgr command: build the JSON payload from 'cmd' (the
// "prefix" field) and 'args', send it via rados.mgr_command(), and stream
// command output to *out_os and diagnostics to *err_os.
// Returns 0 on success or the negative errno from the mgr.
int mgr_command(librados::Rados& rados, const std::string& cmd,
                const std::map<std::string, std::string> &args,
                std::ostream *out_os, std::ostream *err_os) {
  std::string command = R"(
    {
      "prefix": ")" + cmd + R"(", )" + mgr_command_args_to_str(args) + R"(
    })";

  bufferlist in_bl;
  bufferlist out_bl;
  std::string outs;
  int r = rados.mgr_command(command, in_bl, &out_bl, &outs);
  if (r < 0) {
    (*err_os) << "rbd: " << cmd << " failed: " << cpp_strerror(r);
    if (!outs.empty()) {
      (*err_os) << ": " << outs;
    }
    (*err_os) << std::endl;
    return r;
  }

  if (out_bl.length() != 0) {
    (*out_os) << out_bl.c_str();
  }

  return 0;
}
+
+} // namespace utils
+} // namespace rbd
diff --git a/src/tools/rbd/Utils.h b/src/tools/rbd/Utils.h
new file mode 100644
index 000000000..bd00765f8
--- /dev/null
+++ b/src/tools/rbd/Utils.h
@@ -0,0 +1,242 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_UTILS_H
+#define CEPH_RBD_UTILS_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "tools/rbd/ArgumentTypes.h"
+#include <map>
+#include <string>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace utils {
+
+namespace detail {
+
// Trampoline adapting a librbd AIO completion to a member-function callback.
// Extracts the completion's return value, releases the completion, and then
// invokes (t->*MF)(r).  Note the AioCompletion is already freed by the time
// the member function runs.
template <typename T, void(T::*MF)(int)>
void aio_completion_callback(librbd::completion_t completion,
                             void *arg) {
  librbd::RBD::AioCompletion *aio_completion =
    reinterpret_cast<librbd::RBD::AioCompletion*>(completion);

  // complete the AIO callback in separate thread context
  T *t = reinterpret_cast<T *>(arg);
  int r = aio_completion->get_return_value();
  aio_completion->release();

  (t->*MF)(r);
}
+
+} // namespace detail
+
+static const std::string RBD_DIFF_BANNER ("rbd diff v1\n");
+static const size_t RBD_DEFAULT_SPARSE_SIZE = 4096;
+
+static const std::string RBD_IMAGE_BANNER_V2 ("rbd image v2\n");
+static const std::string RBD_IMAGE_DIFFS_BANNER_V2 ("rbd image diffs v2\n");
+static const std::string RBD_DIFF_BANNER_V2 ("rbd diff v2\n");
+
+#define RBD_DIFF_FROM_SNAP 'f'
+#define RBD_DIFF_TO_SNAP 't'
+#define RBD_DIFF_IMAGE_SIZE 's'
+#define RBD_DIFF_WRITE 'w'
+#define RBD_DIFF_ZERO 'z'
+#define RBD_DIFF_END 'e'
+
+#define RBD_SNAP_PROTECTION_STATUS 'p'
+
+#define RBD_EXPORT_IMAGE_ORDER 'O'
+#define RBD_EXPORT_IMAGE_FEATURES 'T'
+#define RBD_EXPORT_IMAGE_STRIPE_UNIT 'U'
+#define RBD_EXPORT_IMAGE_STRIPE_COUNT 'C'
+#define RBD_EXPORT_IMAGE_META 'M'
+#define RBD_EXPORT_IMAGE_END 'E'
+
// Whether a command's image spec forbids, permits, or requires a snapshot
// component (consumed by validate_snapshot_name and the spec parsers below).
enum SnapshotPresence {
  SNAPSHOT_PRESENCE_NONE,       // a snapshot must not be supplied
  SNAPSHOT_PRESENCE_PERMITTED,  // a snapshot is optional
  SNAPSHOT_PRESENCE_REQUIRED    // a snapshot must be supplied
};

// How strictly a supplied image/snapshot spec is validated.
// NOTE(review): exact per-mode rules live in extract_spec's implementation.
enum SpecValidation {
  SPEC_VALIDATION_FULL,
  SPEC_VALIDATION_SNAP,
  SPEC_VALIDATION_NONE
};
+
// CLI progress reporter for long-running image operations.
// Implements librbd::ProgressContext so librbd can push progress updates.
struct ProgressContext : public librbd::ProgressContext {
  const char *operation;  // label printed with the progress output
  bool progress;          // false suppresses output (set via no_progress)
  int last_pc;            // last percentage value seen; presumably used to
                          // avoid reprinting the same value -- see .cc

  ProgressContext(const char *o, bool no_progress)
    : operation(o), progress(!no_progress), last_pc(0) {
  }

  int update_progress(uint64_t offset, uint64_t total) override;
  void finish();
  void fail();
};
+
+int get_percentage(uint64_t part, uint64_t whole);
+
// Create a librbd AioCompletion whose callback forwards the AIO's return
// value to the member function MF on object 't' (see
// detail::aio_completion_callback).  The caller does not need to release
// the completion; the trampoline releases it before invoking MF.
template <typename T, void(T::*MF)(int)>
librbd::RBD::AioCompletion *create_aio_completion(T *t) {
  return new librbd::RBD::AioCompletion(
    t, &detail::aio_completion_callback<T, MF>);
}
+
+void aio_context_callback(librbd::completion_t completion, void *arg);
+
+int read_string(int fd, unsigned max, std::string *out);
+
+int extract_spec(const std::string &spec, std::string *pool_name,
+ std::string *namespace_name, std::string *name,
+ std::string *snap_name, SpecValidation spec_validation);
+
+std::string get_positional_argument(
+ const boost::program_options::variables_map &vm, size_t index);
+
+void normalize_pool_name(std::string* pool_name);
+std::string get_default_pool_name();
+
+int get_image_or_snap_spec(const boost::program_options::variables_map &vm,
+ std::string *spec);
+
+void append_options_as_args(const std::vector<std::string> &options,
+ std::vector<std::string> *args);
+
+int get_pool_and_namespace_names(
+ const boost::program_options::variables_map &vm, bool validate_pool_name,
+ std::string* pool_name, std::string* namespace_name, size_t *arg_index);
+
+int get_pool_image_snapshot_names(
+ const boost::program_options::variables_map &vm,
+ argument_types::ArgumentModifier mod, size_t *spec_arg_index,
+ std::string *pool_name, std::string *namespace_name,
+ std::string *image_name, std::string *snap_name, bool image_name_required,
+ SnapshotPresence snapshot_presence, SpecValidation spec_validation);
+
+int get_pool_generic_snapshot_names(
+ const boost::program_options::variables_map &vm,
+ argument_types::ArgumentModifier mod, size_t *spec_arg_index,
+ const std::string& pool_key, std::string *pool_name,
+ std::string *namespace_name, const std::string& generic_key,
+ const std::string& generic_key_desc, std::string *generic_name,
+ std::string *snap_name, bool generic_name_required,
+ SnapshotPresence snapshot_presence, SpecValidation spec_validation);
+
+int get_pool_image_id(const boost::program_options::variables_map &vm,
+ size_t *spec_arg_index,
+ std::string *pool_name,
+ std::string *namespace_name,
+ std::string *image_id);
+
+int validate_snapshot_name(argument_types::ArgumentModifier mod,
+ const std::string &snap_name,
+ SnapshotPresence snapshot_presence,
+ SpecValidation spec_validation);
+
+int get_image_options(const boost::program_options::variables_map &vm,
+ bool get_format, librbd::ImageOptions* opts);
+
+int get_journal_options(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts);
+
+int get_flatten_option(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts);
+
+int get_image_size(const boost::program_options::variables_map &vm,
+ uint64_t *size);
+
+int get_path(const boost::program_options::variables_map &vm,
+ size_t *arg_index, std::string *path);
+
+int get_formatter(const boost::program_options::variables_map &vm,
+ argument_types::Format::Formatter *formatter);
+
+int get_snap_create_flags(const boost::program_options::variables_map &vm,
+ uint32_t *flags);
+
+void init_context();
+
+int init_rados(librados::Rados *rados);
+
+int init(const std::string& pool_name, const std::string& namespace_name,
+ librados::Rados *rados, librados::IoCtx *io_ctx);
+int init_io_ctx(librados::Rados &rados, std::string pool_name,
+ const std::string& namespace_name, librados::IoCtx *io_ctx);
+int set_namespace(const std::string& namespace_name, librados::IoCtx *io_ctx);
+
+void disable_cache();
+
+int open_image(librados::IoCtx &io_ctx, const std::string &image_name,
+ bool read_only, librbd::Image *image);
+
+int open_image_by_id(librados::IoCtx &io_ctx, const std::string &image_id,
+ bool read_only, librbd::Image *image);
+
+int init_and_open_image(const std::string &pool_name,
+ const std::string &namespace_name,
+ const std::string &image_name,
+ const std::string &image_id,
+ const std::string &snap_name, bool read_only,
+ librados::Rados *rados, librados::IoCtx *io_ctx,
+ librbd::Image *image);
+
+int snap_set(librbd::Image &image, const std::string &snap_name);
+
+void calc_sparse_extent(const bufferptr &bp,
+ size_t sparse_size,
+ size_t buffer_offset,
+ uint64_t length,
+ size_t *write_length,
+ bool *zeroed);
+
+bool is_not_user_snap_namespace(librbd::Image* image,
+ const librbd::snap_info_t &snap_info);
+
+std::string image_id(librbd::Image& image);
+
+std::string mirror_image_mode(
+ librbd::mirror_image_mode_t mirror_image_mode);
+std::string mirror_image_state(
+ librbd::mirror_image_state_t mirror_image_state);
+std::string mirror_image_status_state(
+ librbd::mirror_image_status_state_t state);
+std::string mirror_image_site_status_state(
+ const librbd::mirror_image_site_status_t& status);
+std::string mirror_image_global_status_state(
+ const librbd::mirror_image_global_status_t& status);
+
+int get_local_mirror_image_status(
+ const librbd::mirror_image_global_status_t& status,
+ librbd::mirror_image_site_status_t* local_status);
+
+std::string timestr(time_t t);
+
+// duplicate here to not include librbd_internal lib
+uint64_t get_rbd_default_features(CephContext* cct);
+
+void get_mirror_peer_sites(
+ librados::IoCtx& io_ctx,
+ std::vector<librbd::mirror_peer_site_t>* mirror_peers);
+void get_mirror_peer_mirror_uuids_to_names(
+ const std::vector<librbd::mirror_peer_site_t>& mirror_peers,
+ std::map<std::string, std::string>* fsid_to_name);
+void populate_unknown_mirror_image_site_statuses(
+ const std::vector<librbd::mirror_peer_site_t>& mirror_peers,
+ librbd::mirror_image_global_status_t* global_status);
+
+int mgr_command(librados::Rados& rados, const std::string& cmd,
+ const std::map<std::string, std::string> &args,
+ std::ostream *out_os, std::ostream *err_os);
+
+} // namespace utils
+} // namespace rbd
+
+#endif // CEPH_RBD_UTILS_H
diff --git a/src/tools/rbd/action/Bench.cc b/src/tools/rbd/action/Bench.cc
new file mode 100644
index 000000000..304b5c229
--- /dev/null
+++ b/src/tools/rbd/action/Bench.cc
@@ -0,0 +1,588 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/ceph_mutex.h"
+#include "include/types.h"
+#include "global/signal_handler.h"
+#include <atomic>
+#include <chrono>
+#include <iostream>
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/rolling_sum.hpp>
+#include <boost/program_options.hpp>
+
+using namespace std::chrono;
+
+static std::atomic<bool> terminating;
// Async signal handler: record that SIGINT/SIGTERM arrived so the benchmark
// loop can abort cleanly (rbd_bencher::wait_for polls 'terminating').
static void handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  terminating = true;
}
+
+namespace rbd {
+namespace action {
+namespace bench {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+enum io_type_t {
+ IO_TYPE_READ = 0,
+ IO_TYPE_WRITE,
+ IO_TYPE_RW,
+
+ IO_TYPE_NUM,
+};
+
+enum io_pattern_t {
+ IO_PATTERN_RAND,
+ IO_PATTERN_SEQ,
+ IO_PATTERN_FULL_SEQ
+};
+
+struct IOType {};
+struct Size {};
+struct IOPattern {};
+
// Boost.Program_options custom validator for the Size option tag: parses an
// IEC size string (e.g. "4K", "1G") into a uint64_t byte count via
// strict_iecstrtoll, rejecting anything that does not parse cleanly.
void validate(boost::any& v, const std::vector<std::string>& values,
              Size *target_type, int) {
  po::validators::check_first_occurrence(v);
  const std::string &s = po::validators::get_single_string(values);

  std::string parse_error;
  uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
  if (!parse_error.empty()) {
    throw po::validation_error(po::validation_error::invalid_option_value);
  }
  v = boost::any(size);
}
+
// Boost.Program_options custom validator for the IOPattern option tag:
// maps "rand" / "seq" / "full-seq" to the io_pattern_t enum.
void validate(boost::any& v, const std::vector<std::string>& values,
              IOPattern *target_type, int) {
  po::validators::check_first_occurrence(v);
  const std::string &s = po::validators::get_single_string(values);
  if (s == "rand") {
    v = IO_PATTERN_RAND;
  } else if (s == "seq") {
    v = IO_PATTERN_SEQ;
  } else if (s == "full-seq") {
    v = IO_PATTERN_FULL_SEQ;
  } else {
    throw po::validation_error(po::validation_error::invalid_option_value);
  }
}
+
+io_type_t get_io_type(std::string io_type_string) {
+ if (io_type_string == "read")
+ return IO_TYPE_READ;
+ else if (io_type_string == "write")
+ return IO_TYPE_WRITE;
+ else if (io_type_string == "readwrite" || io_type_string == "rw")
+ return IO_TYPE_RW;
+ else
+ return IO_TYPE_NUM;
+}
+
// Boost.Program_options custom validator for the IOType option tag:
// accepts "read", "write", "readwrite"/"rw" (via get_io_type), rejecting
// everything else.
void validate(boost::any& v, const std::vector<std::string>& values,
              IOType *target_type, int) {
  po::validators::check_first_occurrence(v);
  const std::string &s = po::validators::get_single_string(values);
  io_type_t io_type = get_io_type(s);
  if (io_type >= IO_TYPE_NUM)
    throw po::validation_error(po::validation_error::invalid_option_value);
  else
    v = boost::any(io_type);
}
+
+} // anonymous namespace
+
+static void rbd_bencher_completion(void *c, void *pc);
+struct rbd_bencher;
+
+struct bencher_completer {
+ rbd_bencher *bencher;
+ bufferlist *bl;
+
+public:
+ bencher_completer(rbd_bencher *bencher, bufferlist *bl)
+ : bencher(bencher), bl(bl)
+ { }
+
+ ~bencher_completer()
+ {
+ if (bl)
+ delete bl;
+ }
+};
+
// Issues and throttles asynchronous benchmark IOs against one image.
// 'in_flight' is protected by 'lock'; completions decrement it in
// rbd_bencher_completion() and signal 'cond' to wake wait_for().
struct rbd_bencher {
  librbd::Image *image;
  ceph::mutex lock = ceph::make_mutex("rbd_bencher::lock");
  ceph::condition_variable cond;
  int in_flight;        // AIOs issued but not yet completed
  io_type_t io_type;
  uint64_t io_size;     // bytes per IO
  bufferlist write_bl;  // single pre-filled payload reused by all writes

  explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size)
    : image(i),
      in_flight(0),
      io_type(io_type),
      io_size(io_size)
  {
    // Writes share one buffer filled with a single random byte value.
    if (io_type == IO_TYPE_WRITE || io_type == IO_TYPE_RW) {
      bufferptr bp(io_size);
      memset(bp.c_str(), rand() & 0xff, io_size);
      write_bl.push_back(bp);
    }
  }

  // Issue a single read or write AIO of 'len' bytes at offset 'off'.
  // NOTE(review): 'max' is unused here; throttling is done by the caller
  // via wait_for() before each start_io() call.
  void start_io(int max, uint64_t off, uint64_t len, int op_flags, bool read_flag)
  {
    {
      std::lock_guard l{lock};
      in_flight++;
    }

    librbd::RBD::AioCompletion *c;
    if (read_flag) {
      // The read buffer is owned (and freed) by the bencher_completer.
      bufferlist *read_bl = new bufferlist();
      c = new librbd::RBD::AioCompletion((void *)(new bencher_completer(this, read_bl)),
                                         rbd_bencher_completion);
      image->aio_read2(off, len, *read_bl, c, op_flags);
    } else {
      c = new librbd::RBD::AioCompletion((void *)(new bencher_completer(this, NULL)),
                                         rbd_bencher_completion);
      image->aio_write2(off, len, write_bl, c, op_flags);
    }
  }

  // Block (polling every 200ms) until at most 'max' IOs remain in flight.
  // If 'interrupt_on_terminating' is set the wait also ends on
  // SIGINT/SIGTERM; returns -EINTR whenever termination was requested,
  // 0 otherwise.
  int wait_for(int max, bool interrupt_on_terminating) {
    std::unique_lock l{lock};
    while (in_flight > max && !(terminating && interrupt_on_terminating)) {
      cond.wait_for(l, 200ms);
    }

    return terminating ? -EINTR : 0;
  }

};
+
+void rbd_bencher_completion(void *vc, void *pc)
+{
+ librbd::RBD::AioCompletion *c = (librbd::RBD::AioCompletion *)vc;
+ bencher_completer *bc = static_cast<bencher_completer *>(pc);
+ rbd_bencher *b = bc->bencher;
+ //cout << "complete " << c << std::endl;
+ int ret = c->get_return_value();
+ if (b->io_type == IO_TYPE_WRITE && ret != 0) {
+ std::cout << "write error: " << cpp_strerror(ret) << std::endl;
+ exit(ret < 0 ? -ret : ret);
+ } else if (b->io_type == IO_TYPE_READ && (unsigned int)ret != b->io_size) {
+ cout << "read error: " << cpp_strerror(ret) << std::endl;
+ exit(ret < 0 ? -ret : ret);
+ }
+ b->lock.lock();
+ b->in_flight--;
+ b->cond.notify_all();
+ b->lock.unlock();
+ c->release();
+ delete bc;
+}
+
// Decide whether the next benchmark IO should be a read: true with
// probability read_proportion/100 (0 => never, 100 => always).
bool should_read(uint64_t read_proportion)
{
  return static_cast<uint64_t>(rand() % 100) < read_proportion;
}
+
+int do_bench(librbd::Image& image, io_type_t io_type,
+ uint64_t io_size, uint64_t io_threads,
+ uint64_t io_bytes, io_pattern_t io_pattern,
+ uint64_t read_proportion)
+{
+ uint64_t size = 0;
+ image.size(&size);
+ if (io_size > size) {
+ std::cerr << "rbd: io-size " << byte_u_t(io_size) << " "
+ << "larger than image size " << byte_u_t(size) << std::endl;
+ return -EINVAL;
+ }
+
+ if (io_size > std::numeric_limits<uint32_t>::max()) {
+ std::cerr << "rbd: io-size should be less than 4G" << std::endl;
+ return -EINVAL;
+ }
+
+ int r = image.flush();
+ if (r < 0 && (r != -EROFS || io_type != IO_TYPE_READ)) {
+ std::cerr << "rbd: failed to flush: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ rbd_bencher b(&image, io_type, io_size);
+
+ std::cout << "bench "
+ << " type " << (io_type == IO_TYPE_READ ? "read" :
+ io_type == IO_TYPE_WRITE ? "write" : "readwrite")
+ << (io_type == IO_TYPE_RW ? " read:write=" +
+ to_string(read_proportion) + ":" + to_string(100 - read_proportion) : "")
+ << " io_size " << io_size
+ << " io_threads " << io_threads
+ << " bytes " << io_bytes
+ << " pattern ";
+ switch (io_pattern) {
+ case IO_PATTERN_RAND:
+ std::cout << "random";
+ break;
+ case IO_PATTERN_SEQ:
+ std::cout << "sequential";
+ break;
+ case IO_PATTERN_FULL_SEQ:
+ std::cout << "full sequential";
+ break;
+ default:
+ ceph_assert(false);
+ break;
+ }
+ std::cout << std::endl;
+
+ srand(time(NULL) % (unsigned long) -1);
+
+ coarse_mono_time start = coarse_mono_clock::now();
+ chrono::duration<double> last = chrono::duration<double>::zero();
+ unsigned ios = 0;
+
+ vector<uint64_t> thread_offset;
+ uint64_t i;
+ uint64_t seq_chunk_length = (size / io_size / io_threads) * io_size;;
+
+ // disturb all thread's offset
+ for (i = 0; i < io_threads; i++) {
+ uint64_t start_pos = 0;
+ switch (io_pattern) {
+ case IO_PATTERN_RAND:
+ start_pos = (rand() % (size / io_size)) * io_size;
+ break;
+ case IO_PATTERN_SEQ:
+ start_pos = seq_chunk_length * i;
+ break;
+ case IO_PATTERN_FULL_SEQ:
+ start_pos = i * io_size;
+ break;
+ default:
+ break;
+ }
+ thread_offset.push_back(start_pos);
+ }
+
+ const int WINDOW_SIZE = 5;
+ typedef boost::accumulators::accumulator_set<
+ double, boost::accumulators::stats<
+ boost::accumulators::tag::rolling_sum> > RollingSum;
+
+ RollingSum time_acc(
+ boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+ RollingSum ios_acc(
+ boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+ RollingSum off_acc(
+ boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+ uint64_t cur_ios = 0;
+ uint64_t cur_off = 0;
+
+ int op_flags;
+ if (io_pattern == IO_PATTERN_RAND) {
+ op_flags = LIBRADOS_OP_FLAG_FADVISE_RANDOM;
+ } else {
+ op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+ }
+
+ printf(" SEC OPS OPS/SEC BYTES/SEC\n");
+ uint64_t off;
+ int read_ops = 0;
+ int write_ops = 0;
+
+ for (off = 0; off < io_bytes; ) {
+ // Issue I/O
+ i = 0;
+ int r = 0;
+ while (i < io_threads && off < io_bytes) {
+ bool read_flag = should_read(read_proportion);
+
+ r = b.wait_for(io_threads - 1, true);
+ if (r < 0) {
+ break;
+ }
+ b.start_io(io_threads, thread_offset[i], io_size, op_flags, read_flag);
+
+ ++i;
+ ++ios;
+ off += io_size;
+
+ ++cur_ios;
+ cur_off += io_size;
+
+ if (read_flag)
+ read_ops++;
+ else
+ write_ops++;
+ }
+
+ if (r < 0) {
+ break;
+ }
+
+ // Set the thread_offsets of next I/O
+ for (i = 0; i < io_threads; ++i) {
+ switch (io_pattern) {
+ case IO_PATTERN_RAND:
+ thread_offset[i] = (rand() % (size / io_size)) * io_size;
+ continue;
+ case IO_PATTERN_SEQ:
+ if (off < (seq_chunk_length * io_threads)) {
+ thread_offset[i] += io_size;
+ } else {
+ // thread_offset is adjusted to the chunks unassigned to threads.
+ thread_offset[i] = off + (i * io_size);
+ }
+ if (thread_offset[i] + io_size > size) {
+ thread_offset[i] = seq_chunk_length * i;
+ }
+ break;
+ case IO_PATTERN_FULL_SEQ:
+ thread_offset[i] += (io_size * io_threads);
+ if (thread_offset[i] >= size) {
+ thread_offset[i] = i * io_size;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ coarse_mono_time now = coarse_mono_clock::now();
+ chrono::duration<double> elapsed = now - start;
+ if (last == chrono::duration<double>::zero()) {
+ last = elapsed;
+ } else if ((int)elapsed.count() != (int)last.count()) {
+ time_acc((elapsed - last).count());
+ ios_acc(static_cast<double>(cur_ios));
+ off_acc(static_cast<double>(cur_off));
+ cur_ios = 0;
+ cur_off = 0;
+
+ double time_sum = boost::accumulators::rolling_sum(time_acc);
+ std::cout.width(5);
+ std::cout << (int)elapsed.count();
+ std::cout.width(10);
+ std::cout << (int)(ios - io_threads);
+ std::cout.width(10);
+ std::cout << boost::accumulators::rolling_sum(ios_acc) / time_sum;
+ std::cout.width(10);
+ std::cout << byte_u_t(boost::accumulators::rolling_sum(off_acc) / time_sum) << "/s"
+ << std::endl;
+ last = elapsed;
+ }
+ }
+ b.wait_for(0, false);
+
+ if (io_type != IO_TYPE_READ) {
+ r = image.flush();
+ if (r < 0) {
+ std::cerr << "rbd: failed to flush at the end: " << cpp_strerror(r)
+ << std::endl;
+ }
+ }
+
+ coarse_mono_time now = coarse_mono_clock::now();
+ chrono::duration<double> elapsed = now - start;
+
+ std::cout << "elapsed: " << (int)elapsed.count() << " "
+ << "ops: " << ios << " "
+ << "ops/sec: " << (double)ios / elapsed.count() << " "
+ << "bytes/sec: " << byte_u_t((double)off / elapsed.count()) << "/s"
+ << std::endl;
+
+ if (io_type == IO_TYPE_RW) {
+ std::cout << "read_ops: " << read_ops << " "
+ << "read_ops/sec: " << (double)read_ops / elapsed.count() << " "
+ << "read_bytes/sec: " << byte_u_t((double)read_ops * io_size / elapsed.count()) << "/s"
+ << std::endl;
+
+ std::cout << "write_ops: " << write_ops << " "
+ << "write_ops/sec: " << (double)write_ops / elapsed.count() << " "
+ << "write_bytes/sec: " << byte_u_t((double)write_ops * io_size / elapsed.count()) << "/s"
+ << std::endl;
+
+ }
+
+ return 0;
+}
+
// Register the options shared by "rbd bench" and the deprecated
// "rbd bench-write": the image spec positional plus the IO tuning flags.
void add_bench_common_options(po::options_description *positional,
                              po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);

  options->add_options()
    ("io-size", po::value<Size>(), "IO size (in B/K/M/G/T) [default: 4K]")
    ("io-threads", po::value<uint32_t>(), "ios in flight [default: 16]")
    ("io-total", po::value<Size>(), "total size for IO (in B/K/M/G/T) [default: 1G]")
    ("io-pattern", po::value<IOPattern>(), "IO pattern (rand, seq, or full-seq) [default: seq]")
    ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]");
}
+
// Argument registration for the deprecated "rbd bench-write" command.
void get_arguments_for_write(po::options_description *positional,
                             po::options_description *options) {
  add_bench_common_options(positional, options);
}
+
// Argument registration for "rbd bench": common options plus the required
// --io-type selector.
void get_arguments_for_bench(po::options_description *positional,
                             po::options_description *options) {
  add_bench_common_options(positional, options);

  options->add_options()
    ("io-type", po::value<IOType>()->required(), "IO type (read, write, or readwrite(rw))");
}
+
// Shared driver for "rbd bench" and "rbd bench-write": parse and validate
// options (applying defaults), open the image, install SIGINT/SIGTERM
// handlers, run do_bench(), then tear everything down.
// Returns 0 on success or a negative errno.
int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  // only read benchmarks may run against a snapshot
  utils::SnapshotPresence snap_presence = utils::SNAPSHOT_PRESENCE_NONE;
  if (bench_io_type == IO_TYPE_READ)
    snap_presence = utils::SNAPSHOT_PRESENCE_PERMITTED;

  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, snap_presence, utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  // --io-size [default 4K]; zero is rejected
  uint64_t bench_io_size;
  if (vm.count("io-size")) {
    bench_io_size = vm["io-size"].as<uint64_t>();
  } else {
    bench_io_size = 4096;
  }
  if (bench_io_size == 0) {
    std::cerr << "rbd: --io-size should be greater than zero." << std::endl;
    return -EINVAL;
  }

  // --io-threads [default 16]; zero is rejected
  uint32_t bench_io_threads;
  if (vm.count("io-threads")) {
    bench_io_threads = vm["io-threads"].as<uint32_t>();
  } else {
    bench_io_threads = 16;
  }
  if (bench_io_threads == 0) {
    std::cerr << "rbd: --io-threads should be greater than zero." << std::endl;
    return -EINVAL;
  }

  // --io-total [default 1G]
  uint64_t bench_bytes;
  if (vm.count("io-total")) {
    bench_bytes = vm["io-total"].as<uint64_t>();
  } else {
    bench_bytes = 1 << 30;
  }

  // --io-pattern [default seq]
  io_pattern_t bench_pattern;
  if (vm.count("io-pattern")) {
    bench_pattern = vm["io-pattern"].as<io_pattern_t>();
  } else {
    bench_pattern = IO_PATTERN_SEQ;
  }

  // read percentage: forced for pure read/write runs, --rw-mix-read
  // (default 50, capped at 100) for readwrite runs
  uint64_t bench_read_proportion;
  if (bench_io_type == IO_TYPE_READ) {
    bench_read_proportion = 100;
  } else if (bench_io_type == IO_TYPE_WRITE) {
    bench_read_proportion = 0;
  } else {
    if (vm.count("rw-mix-read")) {
      bench_read_proportion = vm["rw-mix-read"].as<uint64_t>();
    } else {
      bench_read_proportion = 50;
    }

    if (bench_read_proportion > 100) {
      std::cerr << "rbd: --rw-mix-read should not be larger than 100." << std::endl;
      return -EINVAL;
    }
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
                                 snap_name, false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  // handle SIGINT/SIGTERM so an interrupted run still drains its IOs
  init_async_signal_handler();
  register_async_signal_handler(SIGHUP, sighup_handler);
  register_async_signal_handler_oneshot(SIGINT, handle_signal);
  register_async_signal_handler_oneshot(SIGTERM, handle_signal);

  r = do_bench(image, bench_io_type, bench_io_size, bench_io_threads,
               bench_bytes, bench_pattern, bench_read_proportion);

  unregister_async_signal_handler(SIGHUP, sighup_handler);
  unregister_async_signal_handler(SIGINT, handle_signal);
  unregister_async_signal_handler(SIGTERM, handle_signal);
  shutdown_async_signal_handler();

  if (r < 0) {
    std::cerr << "bench failed: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
// Entry point for the deprecated "rbd bench-write": warn, then run the
// common driver with a forced write IO type.
int execute_for_write(const po::variables_map &vm,
                      const std::vector<std::string> &ceph_global_init_args) {
  std::cerr << "rbd: bench-write is deprecated, use rbd bench --io-type write ..." << std::endl;
  return bench_execute(vm, IO_TYPE_WRITE);
}
+
+int execute_for_bench(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ io_type_t bench_io_type;
+ if (vm.count("io-type")) {
+ bench_io_type = vm["io-type"].as<io_type_t>();
+ } else {
+ std::cerr << "rbd: --io-type must be specified." << std::endl;
+ return -EINVAL;
+ }
+
+ return bench_execute(vm, bench_io_type);
+}
+
// Register the deprecated "bench-write" command and the "bench" command
// with the rbd shell dispatcher.
Shell::Action action_write(
  {"bench-write"}, {}, "Simple write benchmark. (Deprecated, please use `rbd bench --io-type write` instead.)",
  "", &get_arguments_for_write, &execute_for_write, false);

Shell::Action action_bench(
  {"bench"}, {}, "Simple benchmark.", "", &get_arguments_for_bench, &execute_for_bench);
+
+} // namespace bench
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Children.cc b/src/tools/rbd/action/Children.cc
new file mode 100644
index 000000000..58e861b69
--- /dev/null
+++ b/src/tools/rbd/action/Children.cc
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace children {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_list_children(librados::IoCtx &io_ctx, librbd::Image &image,
+ bool all_flag, bool descendants_flag, Formatter *f)
+{
+ std::vector<librbd::linked_image_spec_t> children;
+ librbd::RBD rbd;
+ int r;
+ if (descendants_flag) {
+ r = image.list_descendants(&children);
+ } else {
+ r = image.list_children3(&children);
+ }
+ if (r < 0)
+ return r;
+
+ if (f)
+ f->open_array_section("children");
+
+ for (auto& child : children) {
+ bool trash = child.trash;
+ if (f) {
+ if (all_flag) {
+ f->open_object_section("child");
+ f->dump_string("pool", child.pool_name);
+ f->dump_string("pool_namespace", child.pool_namespace);
+ f->dump_string("image", child.image_name);
+ f->dump_string("id", child.image_id);
+ f->dump_bool("trash", child.trash);
+ f->close_section();
+ } else if (!trash) {
+ f->open_object_section("child");
+ f->dump_string("pool", child.pool_name);
+ f->dump_string("pool_namespace", child.pool_namespace);
+ f->dump_string("image", child.image_name);
+ f->close_section();
+ }
+ } else if (all_flag || !trash) {
+ if (child.pool_name.empty()) {
+ std::cout << "(child missing " << child.pool_id << "/";
+ } else {
+ std::cout << child.pool_name << "/";
+ }
+ if (!child.pool_namespace.empty()) {
+ std::cout << child.pool_namespace << "/";
+ }
+ if (child.image_name.empty()) {
+ std::cout << child.image_id << ")";
+ } else {
+ std::cout << child.image_name;
+ if (trash) {
+ std::cout << " (trash " << child.image_id << ")";
+ }
+ }
+ std::cout << std::endl;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
// Register "rbd children" arguments: image-or-snapshot spec, --snap-id,
// --all/-a (include trashed children), --descendants, and format options.
void get_arguments(po::options_description *positional,
                   po::options_description *options) {
  at::add_image_or_snap_spec_options(positional, options,
                                     at::ARGUMENT_MODIFIER_NONE);
  at::add_snap_id_option(options);
  options->add_options()
    ("all,a", po::bool_switch(), "list all children (include trash)");
  options->add_options()
    ("descendants", po::bool_switch(), "include all descendants");
  at::add_format_options(options);
}
+
// "rbd children" handler: resolve the image, optionally select a snapshot
// (by name or by --snap-id, mutually exclusive), then list its children.
// Returns 0 on success or a negative errno.
int execute(const po::variables_map &vm,
            const std::vector<std::string> &ceph_global_init_args) {
  uint64_t snap_id = LIBRADOS_SNAP_HEAD;
  if (vm.count(at::SNAPSHOT_ID)) {
    snap_id = vm[at::SNAPSHOT_ID].as<uint64_t>();
  }

  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  // a snapshot may be addressed by name or id, but not both at once
  if (snap_id != LIBRADOS_SNAP_HEAD && !snap_name.empty()) {
    std::cerr << "rbd: trying to access snapshot using both name and id."
              << std::endl;
    return -EINVAL;
  }

  at::Format::Formatter formatter;
  r = utils::get_formatter(vm, &formatter);
  if (r < 0) {
    return r;
  }

  // open read-only: listing children never modifies the image
  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 true, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  if (!snap_name.empty()) {
    r = image.snap_set(snap_name.c_str());
  } else if (snap_id != LIBRADOS_SNAP_HEAD) {
    r = image.snap_set_by_id(snap_id);
  }
  if (r == -ENOENT) {
    std::cerr << "rbd: snapshot does not exist." << std::endl;
    return r;
  } else if (r < 0) {
    std::cerr << "rbd: error setting snapshot: " << cpp_strerror(r)
              << std::endl;
    return r;
  }

  r = do_list_children(io_ctx, image, vm["all"].as<bool>(),
                       vm["descendants"].as<bool>(), formatter.get());
  if (r < 0) {
    std::cerr << "rbd: listing children failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  }
  return 0;
}
+
// Register the boolean switches and the "children" command with the rbd
// shell dispatcher.
Shell::SwitchArguments switched_arguments({"all", "a", "descendants"});
Shell::Action action(
  {"children"}, {}, "Display children of an image or its snapshot.", "",
  &get_arguments, &execute);
+
+} // namespace children
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Clone.cc b/src/tools/rbd/action/Clone.cc
new file mode 100644
index 000000000..6406c957e
--- /dev/null
+++ b/src/tools/rbd/action/Clone.cc
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace clone {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
// Thin wrapper over librbd clone3(): clone p_name@p_snapname into
// c_ioctx/c_name using the supplied image options.  Returns librbd's
// result code directly.
int do_clone(librbd::RBD &rbd, librados::IoCtx &p_ioctx,
             const char *p_name, const char *p_snapname,
             librados::IoCtx &c_ioctx, const char *c_name,
             librbd::ImageOptions& opts) {
  return rbd.clone3(p_ioctx, p_name, p_snapname, c_ioctx, c_name, opts);
}
+
// Register "rbd clone" arguments: source snapshot spec, destination image
// spec, and the image-creation options (format is fixed to 2 in execute()).
void get_arguments(po::options_description *positional,
                   po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
  at::add_create_image_options(options, false);
}
+
// "rbd clone" handler: parse source snapshot and destination image specs,
// force format-2 image options, open both IO contexts, and perform the
// clone.  Returns 0 on success or a negative errno.
int execute(const po::variables_map &vm,
            const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  // source must name an existing snapshot
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  // destination must be a fully-validated image name without a snapshot
  std::string dst_pool_name;
  std::string dst_namespace_name;
  std::string dst_image_name;
  std::string dst_snap_name;
  r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
    &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
    utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
  if (r < 0) {
    return r;
  }

  librbd::ImageOptions opts;
  r = utils::get_image_options(vm, false, &opts);
  if (r < 0) {
    return r;
  }
  // clones are only supported for format-2 images
  opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2));

  librados::Rados rados;
  librados::IoCtx io_ctx;
  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  librados::IoCtx dst_io_ctx;
  r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx);
  if (r < 0) {
    return r;
  }

  librbd::RBD rbd;
  r = do_clone(rbd, io_ctx, image_name.c_str(), snap_name.c_str(), dst_io_ctx,
               dst_image_name.c_str(), opts);
  if (r == -EXDEV) {
    std::cerr << "rbd: clone v2 required for cross-namespace clones."
              << std::endl;
    return r;
  } else if (r < 0) {
    std::cerr << "rbd: clone error: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
// Register the "clone" command with the rbd shell dispatcher.
Shell::Action action(
  {"clone"}, {}, "Clone a snapshot into a CoW child image.",
  at::get_long_features_help(), &get_arguments, &execute);
+
+} // namespace clone
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Config.cc b/src/tools/rbd/action/Config.cc
new file mode 100644
index 000000000..b038485ce
--- /dev/null
+++ b/src/tools/rbd/action/Config.cc
@@ -0,0 +1,891 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/escape.h"
+#include "common/errno.h"
+#include "common/options.h"
+#include "global/global_context.h"
+#include "include/stringify.h"
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+
+#include <iostream>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+#include "json_spirit/json_spirit.h"
+
+namespace rbd {
+namespace action {
+namespace config {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+const std::string METADATA_CONF_PREFIX = "conf_";
+
// Append the positional <config-entity> argument ("global", "client" or
// "client.<id>") used by the "config global ..." subcommands.
void add_config_entity_option(
    boost::program_options::options_description *positional) {
  positional->add_options()
    ("config-entity", "config entity (global, client, client.<id>)");
}

// Append the positional <pool-name> argument used by the
// "config pool ..." subcommands.
void add_pool_option(boost::program_options::options_description *positional) {
  positional->add_options()
    ("pool-name", "pool name");
}

// Append the positional <key> argument naming an rbd_* config option.
void add_key_option(po::options_description *positional) {
  positional->add_options()
    ("key", "config key");
}
+
+int get_config_entity(const po::variables_map &vm, std::string *config_entity) {
+ *config_entity = utils::get_positional_argument(vm, 0);
+
+ if (*config_entity != "global" && *config_entity != "client" &&
+ !boost::starts_with(*config_entity, ("client."))) {
+ std::cerr << "rbd: invalid config entity: " << *config_entity
+ << " (must be global, client or client.<id>)" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_pool(const po::variables_map &vm, std::string *pool_name) {
+ *pool_name = utils::get_positional_argument(vm, 0);
+ if (pool_name->empty()) {
+ std::cerr << "rbd: pool name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_key(const po::variables_map &vm, size_t *arg_index,
+ std::string *key) {
+ *key = utils::get_positional_argument(vm, *arg_index);
+ if (key->empty()) {
+ std::cerr << "rbd: config key was not specified" << std::endl;
+ return -EINVAL;
+ } else {
+ ++(*arg_index);
+ }
+
+ if (!boost::starts_with(*key, "rbd_")) {
+ std::cerr << "rbd: not rbd option: " << *key << std::endl;
+ return -EINVAL;
+ }
+
+ std::string value;
+ int r = g_ceph_context->_conf.get_val(key->c_str(), &value);
+ if (r < 0) {
+ std::cerr << "rbd: invalid config key: " << *key << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const librbd::config_source_t& source) {
+ switch (source) {
+ case RBD_CONFIG_SOURCE_CONFIG:
+ os << "config";
+ break;
+ case RBD_CONFIG_SOURCE_POOL:
+ os << "pool";
+ break;
+ case RBD_CONFIG_SOURCE_IMAGE:
+ os << "image";
+ break;
+ default:
+ os << "unknown (" << static_cast<uint32_t>(source) << ")";
+ break;
+ }
+ return os;
+}
+
// Fetch all rbd_* configuration overrides relevant to |config_entity| via
// the monitor "config dump" command.
//
// *options maps option name -> {value, section}.  When the same option is
// set in multiple sections, the most specific matching section wins:
// "global" < "client" < "client.<id>".  Returns 0 on success or a negative
// errno on monitor/parse failure.
int config_global_list(
    librados::Rados &rados, const std::string &config_entity,
    std::map<std::string, std::pair<std::string, std::string>> *options) {
  // true when the caller asked for a specific client instance entity
  bool client_id_config_entity =
    boost::starts_with(config_entity, ("client."));
  std::string cmd =
    "{"
    "\"prefix\": \"config dump\", "
    "\"format\": \"json\" "
    "}";
  bufferlist in_bl;
  bufferlist out_bl;
  std::string ss;
  int r = rados.mon_command(cmd, in_bl, &out_bl, &ss);
  if (r < 0) {
    std::cerr << "rbd: error reading config: " << ss << std::endl;
    return r;
  }

  json_spirit::mValue json_root;
  if (!json_spirit::read(out_bl.to_str(), json_root)) {
    std::cerr << "rbd: error parsing config dump" << std::endl;
    return -EINVAL;
  }

  // json_spirit accessors throw std::runtime_error on type mismatches.
  try {
    auto &json_array = json_root.get_array();
    for (auto& e : json_array) {
      auto &json_obj = e.get_obj();
      std::string section;
      std::string name;
      std::string value;

      for (auto &pairs : json_obj) {
        if (pairs.first == "section") {
          section = pairs.second.get_str();
        } else if (pairs.first == "name") {
          name = pairs.second.get_str();
        } else if (pairs.first == "value") {
          value = pairs.second.get_str();
        }
      }

      // Only rbd options are of interest here.
      if (!boost::starts_with(name, "rbd_")) {
        continue;
      }
      // Keep entries from "global", "client", or the exact client.<id>
      // section when one was requested.
      if (section != "global" && section != "client" &&
          (!client_id_config_entity || section != config_entity)) {
        continue;
      }
      // For the "global" entity, only the global section applies.
      if (config_entity == "global" && section != "global") {
        continue;
      }
      auto it = options->find(name);
      if (it == options->end()) {
        (*options)[name] = {value, section};
        continue;
      }
      // Precedence: "client" overrides "global"; an exact client.<id>
      // section overrides both.
      if (section == "client") {
        if (it->second.second == "global") {
          it->second = {value, section};
        }
      } else if (client_id_config_entity) {
        it->second = {value, section};
      }
    }
  } catch (std::runtime_error &e) {
    std::cerr << "rbd: error parsing config dump: " << e.what() << std::endl;
    return -EINVAL;
  }

  return 0;
}
+
+} // anonymous namespace
+
// Positional args for "config global get": <config-entity> <key>.
void get_global_get_arguments(po::options_description *positional,
                              po::options_description *options) {
  add_config_entity_option(positional);
  add_key_option(positional);
}

// rbd config global get <config-entity> <key>
// Prints the override's value on stdout; returns -ENOENT unless the key is
// set at exactly the requested entity level.
int execute_global_get(const po::variables_map &vm,
                       const std::vector<std::string> &ceph_global_init_args) {
  std::string config_entity;
  int r = get_config_entity(vm, &config_entity);
  if (r < 0) {
    return r;
  }

  std::string key;
  size_t arg_index = 1;  // entity consumed positional slot 0
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  r = utils::init_rados(&rados);
  if (r < 0) {
    return r;
  }

  std::map<std::string, std::pair<std::string, std::string>> options;
  r = config_global_list(rados, config_entity, &options);
  if (r < 0) {
    return r;
  }

  auto it = options.find(key);

  // A value inherited from a less specific section does not count as set
  // for this entity.
  if (it == options.end() || it->second.second != config_entity) {
    std::cerr << "rbd: " << key << " is not set" << std::endl;
    return -ENOENT;
  }

  std::cout << it->second.first << std::endl;
  return 0;
}
+
// Positional args for "config global set": <config-entity> <key> <value>.
void get_global_set_arguments(po::options_description *positional,
                              po::options_description *options) {
  add_config_entity_option(positional);
  add_key_option(positional);
  positional->add_options()
    ("value", "config value");
}

// rbd config global set <config-entity> <key> <value>
// Stores the override in the cluster config via the monitor "config set"
// command.  An omitted <value> positional sets an empty value.
int execute_global_set(const po::variables_map &vm,
                       const std::vector<std::string> &ceph_global_init_args) {
  std::string config_entity;
  int r = get_config_entity(vm, &config_entity);
  if (r < 0) {
    return r;
  }

  std::string key;
  size_t arg_index = 1;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  r = utils::init_rados(&rados);
  if (r < 0) {
    return r;
  }

  std::string value = utils::get_positional_argument(vm, 2);
  // Hand-built JSON; entity and value are user-controlled so they are
  // escaped, while the key was already validated to be a known rbd_ option.
  std::string cmd =
    "{"
      "\"prefix\": \"config set\", "
      "\"who\": \"" + stringify(json_stream_escaper(config_entity)) + "\", "
      "\"name\": \"" + key + "\", "
      "\"value\": \"" + stringify(json_stream_escaper(value)) + "\""
    "}";
  bufferlist in_bl;
  std::string ss;
  r = rados.mon_command(cmd, in_bl, nullptr, &ss);
  if (r < 0) {
    std::cerr << "rbd: error setting " << key << ": " << ss << std::endl;
    return r;
  }

  return 0;
}
+
// Positional args for "config global remove": <config-entity> <key>.
void get_global_remove_arguments(po::options_description *positional,
                                 po::options_description *options) {
  add_config_entity_option(positional);
  add_key_option(positional);
}

// rbd config global remove <config-entity> <key>
// Deletes the override from the cluster config via the monitor "config rm"
// command.
int execute_global_remove(
    const po::variables_map &vm,
    const std::vector<std::string> &ceph_global_init_args) {
  std::string config_entity;
  int r = get_config_entity(vm, &config_entity);
  if (r < 0) {
    return r;
  }

  std::string key;
  size_t arg_index = 1;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  r = utils::init_rados(&rados);
  if (r < 0) {
    return r;
  }

  // Entity is user-controlled and therefore escaped; the key was validated.
  std::string cmd =
    "{"
      "\"prefix\": \"config rm\", "
      "\"who\": \"" + stringify(json_stream_escaper(config_entity)) + "\", "
      "\"name\": \"" + key + "\""
    "}";
  bufferlist in_bl;
  std::string ss;
  r = rados.mon_command(cmd, in_bl, nullptr, &ss);
  if (r < 0) {
    std::cerr << "rbd: error removing " << key << ": " << ss << std::endl;
    return r;
  }

  return 0;
}
+
// Positional args for "config global list": <config-entity>; plus the
// common --format/--pretty-format options.
void get_global_list_arguments(po::options_description *positional,
                               po::options_description *options) {
  add_config_entity_option(positional);
  at::add_format_options(options);
}

// rbd config global list <config-entity>
// Lists all rbd_* overrides visible to the entity, either as a text table
// or through the selected formatter (JSON/XML).
int execute_global_list(const po::variables_map &vm,
                        const std::vector<std::string> &ceph_global_init_args) {
  std::string config_entity;
  int r = get_config_entity(vm, &config_entity);
  if (r < 0) {
    return r;
  }

  at::Format::Formatter f;
  r = utils::get_formatter(vm, &f);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  r = utils::init_rados(&rados);
  if (r < 0) {
    return r;
  }

  std::map<std::string, std::pair<std::string, std::string>> options;
  r = config_global_list(rados, config_entity, &options);
  if (r < 0) {
    return r;
  }

  // Plain-text mode prints nothing for an empty result; a formatter still
  // emits an empty array so machine consumers get valid output.
  if (options.empty() && !f) {
    return 0;
  }

  TextTable tbl;

  if (f) {
    f->open_array_section("config");
  } else {
    tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("Section", TextTable::LEFT, TextTable::LEFT);
  }

  for (const auto &it : options) {
    if (f) {
      f->open_object_section("option");
      f->dump_string("name", it.first);
      f->dump_string("value", it.second.first);
      f->dump_string("section", it.second.second);
      f->close_section();
    } else {
      tbl << it.first << it.second.first << it.second.second
          << TextTable::endrow;
    }
  }

  if (f) {
    f->close_section();
    f->flush(std::cout);
  } else {
    std::cout << tbl;
  }

  return 0;
}
+
// Positional args for "config pool get": <pool-name> <key>.
void get_pool_get_arguments(po::options_description *positional,
                            po::options_description *options) {
  add_pool_option(positional);
  add_key_option(positional);
}

// rbd config pool get <pool-name> <key>
// Reads the pool-level override, stored as pool metadata under the
// "conf_" prefix; -ENOENT when the key is not set on the pool.
int execute_pool_get(const po::variables_map &vm,
                     const std::vector<std::string> &ceph_global_init_args) {
  std::string pool_name;
  int r = get_pool(vm, &pool_name);
  if (r < 0) {
    return r;
  }

  std::string key;
  size_t arg_index = 1;  // pool name consumed positional slot 0
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  r = utils::init(pool_name, "", &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  librbd::RBD rbd;
  std::string value;

  r = rbd.pool_metadata_get(io_ctx, METADATA_CONF_PREFIX + key, &value);
  if (r < 0) {
    if (r == -ENOENT) {
      std::cerr << "rbd: " << key << " is not set" << std::endl;
    } else {
      std::cerr << "rbd: failed to get " << key << ": " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }

  std::cout << value << std::endl;
  return 0;
}
+
// Positional args for "config pool set": <pool-name> <key> <value>.
void get_pool_set_arguments(po::options_description *positional,
                            po::options_description *options) {
  add_pool_option(positional);
  add_key_option(positional);
  positional->add_options()
    ("value", "config value");
}

// rbd config pool set <pool-name> <key> <value>
// Stores the override as "conf_<key>" pool metadata.  An omitted <value>
// positional sets an empty value.
int execute_pool_set(const po::variables_map &vm,
                     const std::vector<std::string> &ceph_global_init_args) {
  std::string pool_name;
  int r = get_pool(vm, &pool_name);
  if (r < 0) {
    return r;
  }

  std::string key;
  size_t arg_index = 1;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  std::string value = utils::get_positional_argument(vm, 2);

  librados::Rados rados;
  librados::IoCtx io_ctx;
  r = utils::init(pool_name, "", &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  librbd::RBD rbd;
  r = rbd.pool_metadata_set(io_ctx, METADATA_CONF_PREFIX + key, value);
  if (r < 0) {
    std::cerr << "rbd: failed to set " << key << ": " << cpp_strerror(r)
              << std::endl;
    return r;
  }

  return 0;
}
+
// Positional args for "config pool remove": <pool-name> <key>.
void get_pool_remove_arguments(po::options_description *positional,
                               po::options_description *options) {
  add_pool_option(positional);
  add_key_option(positional);
}

// rbd config pool remove <pool-name> <key>
// Deletes the "conf_<key>" pool metadata entry.
int execute_pool_remove(const po::variables_map &vm,
                        const std::vector<std::string> &ceph_global_init_args) {
  std::string pool_name;
  int r = get_pool(vm, &pool_name);
  if (r < 0) {
    return r;
  }

  std::string key;
  size_t arg_index = 1;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  r = utils::init(pool_name, "", &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  librbd::RBD rbd;
  r = rbd.pool_metadata_remove(io_ctx, METADATA_CONF_PREFIX + key);
  if (r < 0) {
    std::cerr << "rbd: failed to remove " << key << ": " << cpp_strerror(r)
              << std::endl;
    return r;
  }

  return 0;
}
+
// Positional args for "config pool list": <pool-name>; plus the common
// --format/--pretty-format options.
void get_pool_list_arguments(po::options_description *positional,
                             po::options_description *options) {
  add_pool_option(positional);
  at::add_format_options(options);
}

// rbd config pool list <pool-name>
// Lists effective config options for the pool (name, value, source) either
// as a text table or through the selected formatter.
int execute_pool_list(const po::variables_map &vm,
                      const std::vector<std::string> &ceph_global_init_args) {
  std::string pool_name;
  int r = get_pool(vm, &pool_name);
  if (r < 0) {
    return r;
  }

  at::Format::Formatter f;
  r = utils::get_formatter(vm, &f);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  r = utils::init(pool_name, "", &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  TextTable tbl;
  librbd::RBD rbd;
  std::vector<librbd::config_option_t> options;

  r = rbd.config_list(io_ctx, &options);
  if (r < 0) {
    std::cerr << "rbd: failed to list config: " << cpp_strerror(r) << std::endl;
    return r;
  }

  if (f) {
    f->open_array_section("config");
  } else {
    tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("Source", TextTable::LEFT, TextTable::LEFT);
  }

  for (auto &option : options) {
    if (f) {
      f->open_object_section("option");
      f->dump_string("name", option.name);
      f->dump_string("value", option.value);
      // uses the local operator<< for config_source_t
      f->dump_stream("source") << option.source;
      f->close_section();
    } else {
      std::ostringstream source;
      source << option.source;
      tbl << option.name << option.value << source.str() << TextTable::endrow;
    }
  }

  if (f) {
    f->close_section();
    f->flush(std::cout);
  } else {
    std::cout << tbl;
  }

  return 0;
}
+
// Positional args for "config image get": <image-spec> <key>.
void get_image_get_arguments(po::options_description *positional,
                             po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  add_key_option(positional);
}

// rbd config image get <image-spec> <key>
// Reads the image-level override, stored as image metadata under the
// "conf_" prefix; -ENOENT when the key is not set on the image.
int execute_image_get(const po::variables_map &vm,
                      const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
      &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
      utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  // arg_index was advanced past the image spec by the call above.
  std::string key;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  std::string value;

  r = image.metadata_get(METADATA_CONF_PREFIX + key, &value);
  if (r < 0) {
    if (r == -ENOENT) {
      std::cerr << "rbd: " << key << " is not set" << std::endl;
    } else {
      std::cerr << "rbd: failed to get " << key << ": " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }

  std::cout << value << std::endl;
  return 0;
}
+
// Positional args for "config image set": <image-spec> <key> <value>.
void get_image_set_arguments(po::options_description *positional,
                             po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  add_key_option(positional);
  positional->add_options()
    ("value", "config value");
}

// rbd config image set <image-spec> <key> <value>
// Stores the override as "conf_<key>" image metadata.  Unlike the pool
// variant, an empty value is rejected here.
int execute_image_set(const po::variables_map &vm,
                      const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
      &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
      utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  std::string key;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  std::string value = utils::get_positional_argument(vm, arg_index);
  if (value.empty()) {
    std::cerr << "rbd: image config value was not specified" << std::endl;
    return -EINVAL;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = image.metadata_set(METADATA_CONF_PREFIX + key, value);
  if (r < 0) {
    std::cerr << "rbd: failed to set " << key << ": " << cpp_strerror(r)
              << std::endl;
    return r;
  }

  return 0;
}
+
// Positional args for "config image remove": <image-spec> <key>.
void get_image_remove_arguments(po::options_description *positional,
                                po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  add_key_option(positional);
}

// rbd config image remove <image-spec> <key>
// Deletes the "conf_<key>" image metadata entry.
int execute_image_remove(
    const po::variables_map &vm,
    const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
      &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
      utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  std::string key;
  r = get_key(vm, &arg_index, &key);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = image.metadata_remove(METADATA_CONF_PREFIX + key);
  if (r < 0) {
    std::cerr << "rbd: failed to remove " << key << ": " << cpp_strerror(r)
              << std::endl;
    return r;
  }

  return 0;
}
+
// Positional args for "config image list": <image-spec>; plus the common
// --format/--pretty-format options.
void get_image_list_arguments(po::options_description *positional,
                              po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_format_options(options);
}

// rbd config image list <image-spec>
// Lists effective config options for the image (name, value, source) either
// as a text table (with a count header) or through the selected formatter.
int execute_image_list(const po::variables_map &vm,
                       const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
      &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
      utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  at::Format::Formatter f;
  r = utils::get_formatter(vm, &f);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  TextTable tbl;
  std::vector<librbd::config_option_t> options;

  r = image.config_list(&options);
  if (r < 0) {
    std::cerr << "rbd: failed to list config: " << cpp_strerror(r) << std::endl;
    return r;
  }

  // Text mode gets an explicit "no values" message; formatter mode emits
  // nothing for an empty result.
  if (options.empty()) {
    if (f == nullptr) {
      std::cout << "There are no values" << std::endl;
    }
    return 0;
  }

  if (f) {
    f->open_array_section("config");
  } else {
    tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("Source", TextTable::LEFT, TextTable::LEFT);
  }

  for (auto &option : options) {
    if (f) {
      f->open_object_section("option");
      f->dump_string("name", option.name);
      f->dump_string("value", option.value);
      // uses the local operator<< for config_source_t
      f->dump_stream("source") << option.source;
      f->close_section();
    } else {
      std::ostringstream source;
      source << option.source;
      tbl << option.name << option.value << source.str() << TextTable::endrow;
    }
  }

  // Count header precedes the table in text mode.
  if (f == nullptr) {
    bool single = (options.size() == 1);
    std::cout << "There " << (single ? "is" : "are") << " " << options.size()
              << " " << (single ? "value" : "values") << ":" << std::endl;
  }

  if (f) {
    f->close_section();
    f->flush(std::cout);
  } else {
    std::cout << tbl;
  }

  return 0;
}
+
// Shell command registrations for all "rbd config ..." subcommands,
// grouped by scope (global / pool / image).  "remove" variants also
// register an "rm" alias; "list" variants an "ls" alias.
Shell::Action action_global_get(
  {"config", "global", "get"}, {},
  "Get a global-level configuration override.", "",
  &get_global_get_arguments, &execute_global_get);
Shell::Action action_global_set(
  {"config", "global", "set"}, {},
  "Set a global-level configuration override.", "",
  &get_global_set_arguments, &execute_global_set);
Shell::Action action_global_remove(
  {"config", "global", "remove"}, {"config", "global", "rm"},
  "Remove a global-level configuration override.", "",
  &get_global_remove_arguments, &execute_global_remove);
Shell::Action action_global_list(
  {"config", "global", "list"}, {"config", "global", "ls"},
  "List global-level configuration overrides.", "",
  &get_global_list_arguments, &execute_global_list);

Shell::Action action_pool_get(
  {"config", "pool", "get"}, {}, "Get a pool-level configuration override.", "",
  &get_pool_get_arguments, &execute_pool_get);
Shell::Action action_pool_set(
  {"config", "pool", "set"}, {}, "Set a pool-level configuration override.", "",
  &get_pool_set_arguments, &execute_pool_set);
Shell::Action action_pool_remove(
  {"config", "pool", "remove"}, {"config", "pool", "rm"},
  "Remove a pool-level configuration override.", "",
  &get_pool_remove_arguments, &execute_pool_remove);
Shell::Action action_pool_list(
  {"config", "pool", "list"}, {"config", "pool", "ls"},
  "List pool-level configuration overrides.", "",
  &get_pool_list_arguments, &execute_pool_list);

Shell::Action action_image_get(
  {"config", "image", "get"}, {}, "Get an image-level configuration override.",
  "", &get_image_get_arguments, &execute_image_get);
Shell::Action action_image_set(
  {"config", "image", "set"}, {}, "Set an image-level configuration override.",
  "", &get_image_set_arguments, &execute_image_set);
Shell::Action action_image_remove(
  {"config", "image", "remove"}, {"config", "image", "rm"},
  "Remove an image-level configuration override.", "",
  &get_image_remove_arguments, &execute_image_remove);
Shell::Action action_image_list(
  {"config", "image", "list"}, {"config", "image", "ls"},
  "List image-level configuration overrides.", "",
  &get_image_list_arguments, &execute_image_list);
+
+} // namespace config
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Copy.cc b/src/tools/rbd/action/Copy.cc
new file mode 100644
index 000000000..9a2484371
--- /dev/null
+++ b/src/tools/rbd/action/Copy.cc
@@ -0,0 +1,195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace copy {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp,
+ const char *destname, librbd::ImageOptions& opts,
+ bool no_progress,
+ size_t sparse_size)
+{
+ utils::ProgressContext pc("Image copy", no_progress);
+ int r = src.copy_with_progress4(dest_pp, destname, opts, pc, sparse_size);
+ if (r < 0){
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
// Arguments for "rbd copy": source image/snap spec, destination image spec,
// image-creation options, --sparse-size and --no-progress.
void get_arguments(po::options_description *positional,
                   po::options_description *options) {
  at::add_image_or_snap_spec_options(positional, options,
                                     at::ARGUMENT_MODIFIER_SOURCE);
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
  at::add_create_image_options(options, false);
  at::add_sparse_size_option(options);
  at::add_no_progress_option(options);
}
+
// rbd copy <src-image-or-snap-spec> <dest-image-spec>
// Full (shallow) copy of a source image or snapshot into a new destination
// image, optionally in a different pool/namespace.
int execute(const po::variables_map &vm,
            const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  // Source may be an image or a snapshot (SNAPSHOT_PRESENCE_PERMITTED).
  int r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
      &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
      utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  std::string dst_pool_name;
  std::string dst_namespace_name;
  std::string dst_image_name;
  std::string dst_snap_name;
  // Destination must be a plain image name and is fully validated.
  r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
      &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
      utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
  if (r < 0) {
    return r;
  }

  librbd::ImageOptions opts;
  r = utils::get_image_options(vm, false, &opts);
  if (r < 0) {
    return r;
  }

  // Open the source read-only (at the snapshot, if one was given).
  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
                                 snap_name, true, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  librados::IoCtx dst_io_ctx;
  r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx);
  if (r < 0) {
    return r;
  }

  size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
  if (vm.count(at::IMAGE_SPARSE_SIZE)) {
    sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
  }
  r = do_copy(image, dst_io_ctx, dst_image_name.c_str(), opts,
              vm[at::NO_PROGRESS].as<bool>(), sparse_size);
  if (r < 0) {
    std::cerr << "rbd: copy failed: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}

// Register "copy" (alias "cp") with the shell.
Shell::Action action(
  {"copy"}, {"cp"}, "Copy src image to dest.", at::get_long_features_help(),
  &get_arguments, &execute);
+
+static int do_deep_copy(librbd::Image &src, librados::IoCtx& dest_pp,
+ const char *destname, librbd::ImageOptions& opts,
+ bool no_progress)
+{
+ utils::ProgressContext pc("Image deep copy", no_progress);
+ int r = src.deep_copy_with_progress(dest_pp, destname, opts, pc);
+ if (r < 0){
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
// Arguments for "rbd deep copy": source image/snap spec, destination image
// spec, image-creation options, --flatten and --no-progress.
void get_arguments_deep(po::options_description *positional,
                        po::options_description *options) {
  at::add_image_or_snap_spec_options(positional, options,
                                     at::ARGUMENT_MODIFIER_SOURCE);
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
  at::add_create_image_options(options, false);
  at::add_flatten_option(options);
  at::add_no_progress_option(options);
}
+
// rbd deep copy <src-image-or-snap-spec> <dest-image-spec>
// Deep copy of a source image or snapshot (snapshots included) into a new
// destination image, optionally in a different pool/namespace.
int execute_deep(const po::variables_map &vm,
                 const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  // Source may be an image or a snapshot (SNAPSHOT_PRESENCE_PERMITTED).
  int r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
      &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
      utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  std::string dst_pool_name;
  std::string dst_namespace_name;
  std::string dst_image_name;
  std::string dst_snap_name;
  // Destination must be a plain image name and is fully validated.
  r = utils::get_pool_image_snapshot_names(
      vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
      &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
      utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
  if (r < 0) {
    return r;
  }

  librbd::ImageOptions opts;
  r = utils::get_image_options(vm, false, &opts);
  if (r < 0) {
    return r;
  }

  // Open the source read-only (at the snapshot, if one was given).
  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
                                 snap_name, true, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  librados::IoCtx dst_io_ctx;
  r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx);
  if (r < 0) {
    return r;
  }

  r = do_deep_copy(image, dst_io_ctx, dst_image_name.c_str(), opts,
                   vm[at::NO_PROGRESS].as<bool>());
  if (r < 0) {
    std::cerr << "rbd: deep copy failed: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}

// Register "deep copy" (alias "deep cp") with the shell.
Shell::Action action_deep(
  {"deep", "copy"}, {"deep", "cp"}, "Deep copy src image to dest.",
  at::get_long_features_help(), &get_arguments_deep, &execute_deep);
+
+} // namespace copy
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Create.cc b/src/tools/rbd/action/Create.cc
new file mode 100644
index 000000000..2199e009c
--- /dev/null
+++ b/src/tools/rbd/action/Create.cc
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/ceph_mutex.h"
+#include "common/config_proxy.h"
+#include "common/errno.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace create {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Thin wrapper over librbd's create4(); returns 0 or a negative errno.
+static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+ const char *imgname, uint64_t size,
+ librbd::ImageOptions& opts) {
+ return rbd.create4(io_ctx, imgname, size, opts);
+}
+
+// Register the positional image spec plus image-creation options,
+// --thick-provision, --size and --no-progress for "rbd create".
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_create_image_options(options, true);
+ options->add_options()
+ (at::IMAGE_THICK_PROVISION.c_str(), po::bool_switch(), "fully allocate storage and zero image");
+ at::add_size_option(options);
+ at::add_no_progress_option(options);
+}
+
+// Completion callback, defined below; forward-declared so the writer
+// struct can hand it to librbd AIO completions.
+void thick_provision_writer_completion(rbd_completion_t, void *);
+
+// Issues zeroing AIO writes across an image to force full allocation
+// ("thick provisioning"), keeping at most `concurr` AIOs in flight.
+struct thick_provision_writer {
+  librbd::Image *image;
+  ceph::mutex lock = ceph::make_mutex("thick_provision_writer::lock");
+  ceph::condition_variable cond;
+  uint64_t chunk_size;  // bytes zeroed per AIO
+  uint64_t concurr;     // max concurrent AIOs
+  struct {
+    uint64_t in_flight; // outstanding AIOs (protected by lock)
+    int io_error;       // first async error reported by a completion
+  } io_status;
+
+  // Constructor
+  explicit thick_provision_writer(librbd::Image *i, librbd::ImageOptions &o)
+    : image(i)
+  {
+    // If error cases occur, the code is aborted, because
+    // constructor cannot return error value.
+    ceph_assert(g_ceph_context != nullptr);
+
+    librbd::image_info_t info;
+    int r = image->stat(info, sizeof(info));
+    ceph_assert(r >= 0);
+
+    uint64_t order = info.order;
+    if (order == 0) {
+      order = g_conf().get_val<uint64_t>("rbd_default_order");
+    }
+
+    // One chunk covers a full stripe set so writes stay aligned.
+    auto stripe_count = std::max<uint64_t>(1U, image->get_stripe_count());
+    chunk_size = (1ull << order) * stripe_count;
+
+    concurr = std::max<uint64_t>(
+      1U, g_conf().get_val<uint64_t>("rbd_concurrent_management_ops") /
+            stripe_count);
+
+    io_status.in_flight = 0;
+    io_status.io_error = 0;
+  }
+
+  // Launch one zeroing AIO at write_offset. Returns 0 on success,
+  // -EINVAL if the concurrency limit would be exceeded (callers are
+  // expected to throttle with wait_for() first), or the librbd error.
+  int start_io(uint64_t write_offset)
+  {
+    {
+      std::lock_guard l{lock};
+      io_status.in_flight++;
+      if (io_status.in_flight > concurr) {
+        io_status.in_flight--;
+        return -EINVAL;
+      }
+    }
+
+    librbd::RBD::AioCompletion *c;
+    c = new librbd::RBD::AioCompletion(this, thick_provision_writer_completion);
+    int r;
+    r = image->aio_write_zeroes(write_offset, chunk_size, c,
+                                RBD_WRITE_ZEROES_FLAG_THICK_PROVISION,
+                                LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL);
+    if (r < 0) {
+      std::lock_guard l{lock};
+      io_status.io_error = r;
+    }
+    return r;
+  }
+
+  // Block until the in-flight count drops to `max`. Returns the first
+  // async I/O error observed, if any. The error is sampled *after*
+  // waiting so that errors reported by the completions we just waited
+  // for are not missed (previously it was read before the wait).
+  int wait_for(uint64_t max) {
+    std::unique_lock l{lock};
+
+    while (io_status.in_flight > max) {
+      cond.wait_for(l, 200ms);
+    }
+    return io_status.io_error;
+  }
+};
+
+void thick_provision_writer_completion(rbd_completion_t rc, void *pc) {
+ librbd::RBD::AioCompletion *ac = (librbd::RBD::AioCompletion *)rc;
+ thick_provision_writer *tc = static_cast<thick_provision_writer *>(pc);
+
+ int r = ac->get_return_value();
+ tc->lock.lock();
+ if (r < 0 && tc->io_status.io_error >= 0) {
+ tc->io_status.io_error = r;
+ }
+ tc->io_status.in_flight--;
+ tc->cond.notify_all();
+ tc->lock.unlock();
+ ac->release();
+}
+
+// Zero-fill the whole image through a thick_provision_writer, showing
+// progress unless no_progress is set. Returns 0 or a negative errno.
+int write_data(librbd::Image &image, librbd::ImageOptions &opts,
+               bool no_progress) {
+  uint64_t image_size;
+  int r = 0;
+  utils::ProgressContext pc("Thick provisioning", no_progress);
+
+  if (image.size(&image_size) != 0) {
+    return -EINVAL;
+  }
+
+  thick_provision_writer tpw(&image, opts);
+  uint64_t off;
+  uint64_t i;
+  for (off = 0; off < image_size;) {
+    i = 0;
+    while (i < tpw.concurr && off < image_size) {
+      // Throttle to leave one slot free and surface any error already
+      // reported by a completed AIO. Previously this return value was
+      // ignored, so asynchronous write errors were silently dropped.
+      r = tpw.wait_for(tpw.concurr - 1);
+      if (r < 0) {
+        goto err_writesame;
+      }
+      r = tpw.start_io(off);
+      if (r != 0) {
+        goto err_writesame;
+      }
+      ++i;
+      off += tpw.chunk_size;
+      if (off > image_size) {
+        // clamp so the progress bar never overshoots 100%
+        off = image_size;
+      }
+      pc.update_progress(off, image_size);
+    }
+  }
+
+  // Drain all outstanding AIOs and check for late errors before the
+  // final flush (this return value was also ignored before).
+  r = tpw.wait_for(0);
+  if (r < 0) {
+    goto err_writesame;
+  }
+  r = image.flush();
+  if (r < 0) {
+    std::cerr << "rbd: failed to flush at the end: " << cpp_strerror(r)
+              << std::endl;
+    goto err_writesame;
+  }
+  pc.finish();
+
+  return r;
+
+err_writesame:
+  // best-effort drain so no completion fires after tpw is destroyed
+  tpw.wait_for(0);
+  pc.fail();
+
+  return r;
+}
+
+// Thick-provision an existing image: disable the writesame zero-discard
+// optimization, open the image, zero-fill it via write_data(), close it.
+// Returns 0 on success or a negative errno.
+int thick_write(const std::string &image_name,librados::IoCtx &io_ctx,
+                librbd::ImageOptions &opts, bool no_progress) {
+  // To prevent writesame from discarding data, thick_write sets
+  // the rbd_discard_on_zeroed_write_same option to false.
+  ceph_assert(g_ceph_context != nullptr);
+  int rc = g_conf().set_val("rbd_discard_on_zeroed_write_same", "false");
+  ceph_assert(rc == 0);
+
+  librbd::Image image;
+  rc = utils::open_image(io_ctx, image_name, false, &image);
+  if (rc < 0) {
+    return rc;
+  }
+
+  rc = write_data(image, opts, no_progress);
+  image.close();
+  return rc;
+}
+
+// Entry point for "rbd create": parse the image spec, creation options
+// and size, create the image, and optionally thick-provision it.
+// Returns 0 on success or a negative errno.
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, true, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t size;
+ r = utils::get_image_size(vm, &size);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = do_create(rbd, io_ctx, image_name.c_str(), size, opts);
+ // -ENOENT together with a namespace almost certainly means the
+ // namespace itself is missing; print an actionable message.
+ if (!namespace_name.empty() && r == -ENOENT) {
+ std::cerr << "rbd: namespace not found - it must be created with "
+ << "'rbd namespace create' before creating an image."
+ << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ // Optional post-create zero-fill; the image already exists at this
+ // point, so a failure here leaves the (sparse) image behind.
+ if (vm.count(at::IMAGE_THICK_PROVISION) && vm[at::IMAGE_THICK_PROVISION].as<bool>()) {
+ r = thick_write(image_name, io_ctx, opts, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: image created but error encountered during thick provisioning: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+// Shell registration: "rbd create".
+Shell::Action action(
+ {"create"}, {}, "Create an empty image.", at::get_long_features_help(),
+ &get_arguments, &execute);
+
+} // namespace create
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Device.cc b/src/tools/rbd/action/Device.cc
new file mode 100644
index 000000000..078f944cc
--- /dev/null
+++ b/src/tools/rbd/action/Device.cc
@@ -0,0 +1,280 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+
+#include <boost/program_options.hpp>
+
+#include "include/ceph_assert.h"
+
+namespace rbd {
+namespace action {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Forward-declares the five per-driver device operations inside the
+// given namespace; each driver (ggate/kernel/nbd/wnbd) implements them
+// in its own translation unit.
+#define DECLARE_DEVICE_OPERATIONS(ns) \
+ namespace ns { \
+ int execute_list(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ int execute_map(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ int execute_unmap(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ int execute_attach(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ int execute_detach(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ }
+
+DECLARE_DEVICE_OPERATIONS(ggate);
+DECLARE_DEVICE_OPERATIONS(kernel);
+DECLARE_DEVICE_OPERATIONS(nbd);
+DECLARE_DEVICE_OPERATIONS(wnbd);
+
+namespace device {
+
+namespace {
+
+// vtable-style bundle of function pointers implementing the five
+// device commands for one driver.
+struct DeviceOperations {
+ int (*execute_list)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+ int (*execute_map)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+ int (*execute_unmap)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+ int (*execute_attach)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+ int (*execute_detach)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+};
+
+// Per-driver dispatch tables, selected in get_device_operations().
+const DeviceOperations ggate_operations = {
+ ggate::execute_list,
+ ggate::execute_map,
+ ggate::execute_unmap,
+ ggate::execute_attach,
+ ggate::execute_detach,
+};
+
+const DeviceOperations krbd_operations = {
+ kernel::execute_list,
+ kernel::execute_map,
+ kernel::execute_unmap,
+ kernel::execute_attach,
+ kernel::execute_detach,
+};
+
+const DeviceOperations nbd_operations = {
+ nbd::execute_list,
+ nbd::execute_map,
+ nbd::execute_unmap,
+ nbd::execute_attach,
+ nbd::execute_detach,
+};
+
+const DeviceOperations wnbd_operations = {
+ wnbd::execute_list,
+ wnbd::execute_map,
+ wnbd::execute_unmap,
+ wnbd::execute_attach,
+ wnbd::execute_detach,
+};
+
+// Supported device driver kinds.
+enum device_type_t {
+ DEVICE_TYPE_GGATE,
+ DEVICE_TYPE_KRBD,
+ DEVICE_TYPE_NBD,
+ DEVICE_TYPE_WNBD,
+};
+
+// Tag type used only to give boost::program_options a distinct
+// validate() overload for --device-type.
+struct DeviceType {};
+
+// program_options custom validator: map the --device-type string onto a
+// device_type_t. NB: the if/else chain is deliberately spliced by the
+// preprocessor so that only the platform-appropriate branches compile
+// (wnbd on Windows; nbd/ggate/krbd elsewhere) while the trailing `else`
+// is shared and rejects any unknown value.
+void validate(boost::any& v, const std::vector<std::string>& values,
+ DeviceType *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ #ifdef _WIN32
+ if (s == "wnbd") {
+ v = boost::any(DEVICE_TYPE_WNBD);
+ #else
+ if (s == "nbd") {
+ v = boost::any(DEVICE_TYPE_NBD);
+ } else if (s == "ggate") {
+ v = boost::any(DEVICE_TYPE_GGATE);
+ } else if (s == "krbd") {
+ v = boost::any(DEVICE_TYPE_KRBD);
+ #endif /* _WIN32 */
+ } else {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
+// Add -t/--device-type with platform-specific help text.
+void add_device_type_option(po::options_description *options) {
+ options->add_options()
+ ("device-type,t", po::value<DeviceType>(),
+#ifdef _WIN32
+ "device type [wnbd]");
+#else
+ "device type [ggate, krbd (default), nbd]");
+#endif
+}
+
+// Add -o/--options for passing driver-specific settings straight
+// through to the selected device driver.
+void add_device_specific_options(po::options_description *options) {
+ options->add_options()
+ ("options,o", po::value<std::vector<std::string>>(),
+ "device specific options");
+}
+
+// Resolve the requested device type: an explicit --device-type wins,
+// otherwise use the platform default (wnbd on Windows, krbd elsewhere).
+device_type_t get_device_type(const po::variables_map &vm) {
+  if (vm.count("device-type")) {
+    return vm["device-type"].as<device_type_t>();
+  }
+#ifdef _WIN32
+  return DEVICE_TYPE_WNBD;
+#else
+  return DEVICE_TYPE_KRBD;
+#endif
+}
+
+// Look up the dispatch table for the selected device type; aborts on an
+// impossible enum value.
+const DeviceOperations *get_device_operations(const po::variables_map &vm) {
+ switch (get_device_type(vm)) {
+ case DEVICE_TYPE_GGATE:
+ return &ggate_operations;
+ case DEVICE_TYPE_KRBD:
+ return &krbd_operations;
+ case DEVICE_TYPE_NBD:
+ return &nbd_operations;
+ case DEVICE_TYPE_WNBD:
+ return &wnbd_operations;
+ default:
+ ceph_abort();
+ return nullptr;
+ }
+}
+
+} // anonymous namespace
+
+// Arguments for "device list": device type and output format.
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ at::add_format_options(options);
+}
+
+// Dispatch "device list" to the selected driver implementation.
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_list)(vm, ceph_global_init_args);
+}
+
+// Arguments for "device map": image/snap spec plus mapping flags.
+void get_map_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ ("show-cookie", po::bool_switch(), "show device cookie")
+ ("cookie", po::value<std::string>(), "specify device cookie")
+ ("read-only", po::bool_switch(), "map read-only")
+ ("exclusive", po::bool_switch(), "disable automatic exclusive lock transitions")
+ ("quiesce", po::bool_switch(), "use quiesce hooks")
+ ("quiesce-hook", po::value<std::string>(), "quiesce hook path");
+ add_device_specific_options(options);
+}
+
+// Dispatch "device map" to the selected driver implementation.
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_map)(vm, ceph_global_init_args);
+}
+
+// Arguments for "device unmap": accepts either an image/snap spec or a
+// raw device path as the positional argument.
+void get_unmap_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ positional->add_options()
+ ("image-or-snap-or-device-spec",
+ "image, snapshot, or device specification\n"
+ "[<pool-name>/[<namespace>/]]<image-name>[@<snap-name>] or <device-path>");
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE);
+ add_device_specific_options(options);
+}
+
+// Dispatch "device unmap" to the selected driver implementation.
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_unmap)(vm, ceph_global_init_args);
+}
+
+// Arguments for "device attach": like map, but a target --device path
+// is required and --force is available.
+void get_attach_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ ("device", po::value<std::string>()->required(), "specify device path")
+ ("show-cookie", po::bool_switch(), "show device cookie")
+ ("cookie", po::value<std::string>(), "specify device cookie")
+ ("read-only", po::bool_switch(), "attach read-only")
+ ("force", po::bool_switch(), "force attach")
+ ("exclusive", po::bool_switch(), "disable automatic exclusive lock transitions")
+ ("quiesce", po::bool_switch(), "use quiesce hooks")
+ ("quiesce-hook", po::value<std::string>(), "quiesce hook path");
+ add_device_specific_options(options);
+}
+
+// Dispatch "device attach" to the selected driver implementation.
+int execute_attach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_attach)(vm, ceph_global_init_args);
+}
+
+// Arguments for "device detach": accepts either an image/snap spec or
+// a raw device path as the positional argument.
+void get_detach_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ positional->add_options()
+ ("image-or-snap-or-device-spec",
+ "image, snapshot, or device specification\n"
+ "[<pool-name>/]<image-name>[@<snap-name>] or <device-path>");
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE);
+ add_device_specific_options(options);
+}
+
+// Dispatch "device detach" to the selected driver implementation.
+int execute_detach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_detach)(vm, ceph_global_init_args);
+}
+
+// Flag-style (no-argument) options shared by the device commands.
+Shell::SwitchArguments switched_arguments({"exclusive", "force", "quiesce",
+ "read-only", "show-cookie"});
+
+// Shell registrations; the legacy top-level aliases (showmapped, map,
+// unmap) are kept for backward compatibility.
+Shell::Action action_list(
+ {"device", "list"}, {"showmapped"}, "List mapped rbd images.", "",
+ &get_list_arguments, &execute_list);
+// yet another alias for list command
+Shell::Action action_ls(
+ {"device", "ls"}, {}, "List mapped rbd images.", "",
+ &get_list_arguments, &execute_list, false);
+
+Shell::Action action_map(
+ {"device", "map"}, {"map"}, "Map an image to a block device.", "",
+ &get_map_arguments, &execute_map);
+
+Shell::Action action_unmap(
+ {"device", "unmap"}, {"unmap"}, "Unmap a rbd device.", "",
+ &get_unmap_arguments, &execute_unmap);
+
+Shell::Action action_attach(
+ {"device", "attach"}, {}, "Attach image to device.", "",
+ &get_attach_arguments, &execute_attach);
+
+Shell::Action action_detach(
+ {"device", "detach"}, {}, "Detach image from device.", "",
+ &get_detach_arguments, &execute_detach);
+
+} // namespace device
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Diff.cc b/src/tools/rbd/action/Diff.cc
new file mode 100644
index 000000000..838ef6cc5
--- /dev/null
+++ b/src/tools/rbd/action/Diff.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Accumulator handed to the diff callback: exactly one of `f`
+// (structured formatter) or `t` (plain-text table) is non-null;
+// `empty` records whether any extent was emitted.
+struct output_method {
+  Formatter *f = nullptr;
+  TextTable *t = nullptr;
+  bool empty = true;
+};
+
+// diff_iterate2 callback: emit one extent, either as a formatter
+// object or as a table row, and mark the output as non-empty.
+static int diff_cb(uint64_t ofs, size_t len, int exists, void *arg)
+{
+ output_method *om = static_cast<output_method *>(arg);
+ om->empty = false;
+ if (om->f) {
+ om->f->open_object_section("extent");
+ om->f->dump_unsigned("offset", ofs);
+ om->f->dump_unsigned("length", len);
+ om->f->dump_string("exists", exists ? "true" : "false");
+ om->f->close_section();
+ } else {
+ ceph_assert(om->t);
+ *(om->t) << ofs << len << (exists ? "data" : "zero") << TextTable::endrow;
+ }
+ return 0;
+}
+
+// Walk the extents that changed since `fromsnapname` (or since image
+// creation when null) across the whole image and print them via the
+// formatter, or as a text table when no formatter was requested.
+// Returns 0 on success or a negative errno.
+static int do_diff(librbd::Image& image, const char *fromsnapname,
+ bool whole_object, Formatter *f)
+{
+ int r;
+ librbd::image_info_t info;
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ output_method om;
+ if (f) {
+ om.f = f;
+ f->open_array_section("extents");
+ } else {
+ om.t = new TextTable();
+ om.t->define_column("Offset", TextTable::LEFT, TextTable::LEFT);
+ om.t->define_column("Length", TextTable::LEFT, TextTable::LEFT);
+ om.t->define_column("Type", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+ diff_cb, &om);
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ // only print the table when at least one extent was reported
+ if (!om.empty)
+ std::cout << *om.t;
+ delete om.t;
+ }
+ return r;
+}
+
+// Arguments for "rbd diff": image/snap spec plus --from-snapshot,
+// --whole-object and the output-format options.
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+ "snapshot starting point")
+ (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object");
+ at::add_format_options(options);
+}
+
+// Entry point for "rbd diff": open the image (optionally at a
+// snapshot) and print the extents changed since --from-snapshot or
+// since image creation.
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string from_snap_name;
+ if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+ from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ bool diff_whole_object = vm[at::WHOLE_OBJECT].as<bool>();
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_diff(image, from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+ diff_whole_object, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: diff error: " << cpp_strerror(r) << std::endl;
+ // NOTE(review): this returns the positive errno (-r) while every
+ // other action here returns the negative value r — confirm whether
+ // exit-code consumers depend on this before normalizing it.
+ return -r;
+ }
+ return 0;
+}
+
+// Shell registration: "rbd diff".
+Shell::Action action(
+ {"diff"}, {},
+ "Print extents that differ since a previous snap, or image creation.", "",
+ &get_arguments, &execute);
+
+} // namespace diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/DiskUsage.cc b/src/tools/rbd/action/DiskUsage.cc
new file mode 100644
index 000000000..12fb8cfde
--- /dev/null
+++ b/src/tools/rbd/action/DiskUsage.cc
@@ -0,0 +1,377 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <algorithm>
+#include <iostream>
+#include <boost/bind/bind.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace disk_usage {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+using namespace boost::placeholders;
+
+// diff_iterate2 callback: accumulate the length of every extent that
+// actually exists into the uint64_t counter passed via `arg`.
+static int disk_usage_callback(uint64_t offset, size_t len, int exists,
+                               void *arg) {
+  auto *used_size = reinterpret_cast<uint64_t *>(arg);
+  if (exists) {
+    *used_size += len;
+  }
+  return 0;
+}
+
+// Compute the used (allocated) byte count of one image or snapshot by
+// diff-iterating from `from_snap_name` (or from the beginning when
+// empty). `exact` disables the faster whole-object approximation.
+// Warns when the fast-diff map is flagged invalid. Returns 0 or a
+// negative errno; *used_size is only meaningful on success.
+static int get_image_disk_usage(const std::string& name,
+ const std::string& snap_name,
+ const std::string& from_snap_name,
+ librbd::Image &image,
+ bool exact,
+ uint64_t size,
+ uint64_t *used_size){
+
+ const char* from = NULL;
+ if (!from_snap_name.empty()) {
+ from = from_snap_name.c_str();
+ }
+
+ uint64_t flags;
+ int r = image.get_flags(&flags);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve image flags: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+ std::cerr << "warning: fast-diff map is invalid for " << name
+ << (snap_name.empty() ? "" : "@" + snap_name) << ". "
+ << "operation may be slow." << std::endl;
+ }
+
+ *used_size = 0;
+ // 4th/5th args: presumably include_parent=false, whole_object=!exact
+ // — TODO confirm against the librbd diff_iterate2 signature.
+ r = image.diff_iterate2(from, 0, size, false, !exact,
+ &disk_usage_callback, used_size);
+ if (r < 0) {
+ std::cerr << "rbd: failed to iterate diffs: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Emit one image/snapshot usage row: structured fields when a
+// formatter is present, otherwise a human-readable table row
+// ("name@snap", provisioned size, used size).
+void format_image_disk_usage(const std::string& name,
+ const std::string& id,
+ const std::string& snap_name,
+ uint64_t snap_id,
+ uint64_t size,
+ uint64_t used_size,
+ TextTable& tbl, Formatter *f) {
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("name", name);
+ f->dump_string("id", id);
+ if (!snap_name.empty()) {
+ f->dump_string("snapshot", snap_name);
+ f->dump_unsigned("snapshot_id", snap_id);
+ }
+ f->dump_unsigned("provisioned_size", size);
+ f->dump_unsigned("used_size" , used_size);
+ f->close_section();
+ } else {
+ std::string full_name = name;
+ if (!snap_name.empty()) {
+ full_name += "@" + snap_name;
+ }
+ tbl << full_name
+ << stringify(byte_u_t(size))
+ << stringify(byte_u_t(used_size))
+ << TextTable::endrow;
+ }
+}
+
+// Core of "rbd du": list images in the pool (or just `imgname`), and
+// for each one report per-snapshot and HEAD usage, optionally limited
+// to the [from_snapname, snapname] range and optionally merging the
+// snapshot sizes into the image row (merge_snap). Totals are appended
+// when more than one row was printed. Returns 0 or a negative errno.
+static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
+ const char *imgname, const char *snapname,
+ const char *from_snapname, bool exact, Formatter *f,
+ bool merge_snap) {
+ std::vector<librbd::image_spec_t> images;
+ int r = rbd.list2(io_ctx, &images);
+ // an empty pool is not an error
+ if (r == -ENOENT) {
+ r = 0;
+ } else if (r < 0) {
+ return r;
+ }
+
+ TextTable tbl;
+ if (f) {
+ f->open_object_section("stats");
+ f->open_array_section("images");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("PROVISIONED", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
+ }
+
+ uint32_t count = 0;
+ uint64_t used_size = 0;
+ uint64_t total_prov = 0;
+ uint64_t total_used = 0;
+ uint64_t snap_id = CEPH_NOSNAP;
+ uint64_t from_id = CEPH_NOSNAP;
+ bool found = false;
+ for (auto& image_spec : images) {
+ // when a specific image was requested, skip all others
+ if (imgname != NULL && image_spec.name != imgname) {
+ continue;
+ }
+ found = true;
+
+ librbd::Image image;
+ r = rbd.open_read_only(io_ctx, image, image_spec.name.c_str(), NULL);
+ if (r < 0) {
+ // images may disappear between list and open; skip quietly
+ if (r != -ENOENT) {
+ std::cerr << "rbd: error opening " << image_spec.name << ": "
+ << cpp_strerror(r) << std::endl;
+ }
+ continue;
+ }
+
+ uint64_t features;
+ r = image.features(&features);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r)
+ << std::endl;
+ goto out;
+ }
+ if ((features & RBD_FEATURE_FAST_DIFF) == 0) {
+ std::cerr << "warning: fast-diff map is not enabled for "
+ << image_spec.name << ". " << "operation may be slow."
+ << std::endl;
+ }
+
+ librbd::image_info_t info;
+ if (image.stat(info, sizeof(info)) < 0) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ std::vector<librbd::snap_info_t> snap_list;
+ r = image.snap_list(snap_list);
+ if (r < 0) {
+ std::cerr << "rbd: error opening " << image_spec.name << " snapshots: "
+ << cpp_strerror(r) << std::endl;
+ continue;
+ }
+
+ // only user-created snapshots participate in the accounting
+ snap_list.erase(remove_if(snap_list.begin(),
+ snap_list.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &image, _1)),
+ snap_list.end());
+
+ bool found_from_snap = (from_snapname == nullptr);
+ bool found_snap = (snapname == nullptr);
+ bool found_from = (from_snapname == nullptr);
+ std::string last_snap_name;
+ // iterate snapshots oldest-first so diffs chain correctly
+ std::sort(snap_list.begin(), snap_list.end(),
+ boost::bind(&librbd::snap_info_t::id, _1) <
+ boost::bind(&librbd::snap_info_t::id, _2));
+ // resolve the ids of the requested snapshot boundaries (if any)
+ if (!found_snap || !found_from) {
+ for (auto &snap_info : snap_list) {
+ if (!found_snap && snap_info.name == snapname) {
+ snap_id = snap_info.id;
+ found_snap = true;
+ }
+ if (!found_from && snap_info.name == from_snapname) {
+ from_id = snap_info.id;
+ found_from = true;
+ }
+ if (found_snap && found_from) {
+ break;
+ }
+ }
+ }
+ if ((snapname != nullptr && snap_id == CEPH_NOSNAP) ||
+ (from_snapname != nullptr && from_id == CEPH_NOSNAP)) {
+ std::cerr << "specified snapshot is not found." << std::endl;
+ return -ENOENT;
+ }
+ // the range endpoints must be distinct and correctly ordered
+ if (snap_id != CEPH_NOSNAP && from_id != CEPH_NOSNAP) {
+ if (from_id == snap_id) {
+ // no diskusage.
+ return 0;
+ }
+ if (from_id >= snap_id) {
+ return -EINVAL;
+ }
+ }
+
+ uint64_t image_full_used_size = 0;
+
+ for (std::vector<librbd::snap_info_t>::const_iterator snap =
+ snap_list.begin(); snap != snap_list.end(); ++snap) {
+ librbd::Image snap_image;
+ r = rbd.open_read_only(io_ctx, snap_image, image_spec.name.c_str(),
+ snap->name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: error opening snapshot " << image_spec.name << "@"
+ << snap->name << ": " << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+
+ // NOTE(review): the third condition is subsumed by the second
+ // (found_from_snap) — looks redundant; confirm intent.
+ if (imgname == nullptr || found_from_snap ||
+ (found_from_snap && snapname != nullptr && snap->name == snapname)) {
+
+ // usage of this snapshot relative to the previous one
+ r = get_image_disk_usage(image_spec.name, snap->name, last_snap_name, snap_image, exact, snap->size, &used_size);
+ if (r < 0) {
+ goto out;
+ }
+ if (!merge_snap) {
+ format_image_disk_usage(image_spec.name, image_spec.id, snap->name,
+ snap->id, snap->size, used_size, tbl, f);
+ }
+
+ image_full_used_size += used_size;
+
+ if (snapname != NULL) {
+ total_prov += snap->size;
+ }
+ total_used += used_size;
+ ++count;
+ }
+
+ if (!found_from_snap && from_snapname != nullptr &&
+ snap->name == from_snapname) {
+ found_from_snap = true;
+ }
+ // stop once the requested end snapshot has been processed
+ if (snapname != nullptr && snap->name == snapname) {
+ break;
+ }
+ last_snap_name = snap->name;
+ }
+
+ // HEAD usage (relative to the newest snapshot, if any)
+ if (snapname == NULL) {
+ r = get_image_disk_usage(image_spec.name, "", last_snap_name, image, exact, info.size, &used_size);
+ if (r < 0) {
+ goto out;
+ }
+
+ image_full_used_size += used_size;
+
+ if (!merge_snap) {
+ format_image_disk_usage(image_spec.name, image_spec.id, "", CEPH_NOSNAP,
+ info.size, used_size, tbl, f);
+ } else {
+ // single row carrying the image's snapshots' usage as well
+ format_image_disk_usage(image_spec.name, image_spec.id, "", CEPH_NOSNAP,
+ info.size, image_full_used_size, tbl, f);
+ }
+
+ total_prov += info.size;
+ total_used += used_size;
+ ++count;
+ }
+ }
+ if (imgname != nullptr && !found) {
+ std::cerr << "specified image " << imgname << " is not found." << std::endl;
+ return -ENOENT;
+ }
+
+out:
+ if (f) {
+ f->close_section();
+ if (imgname == NULL) {
+ f->dump_unsigned("total_provisioned_size", total_prov);
+ f->dump_unsigned("total_used_size", total_used);
+ }
+ f->close_section();
+ f->flush(std::cout);
+ } else if (!images.empty()) {
+ // only print a totals row when more than one row was emitted
+ if (count > 1) {
+ tbl << "<TOTAL>"
+ << stringify(byte_u_t(total_prov))
+ << stringify(byte_u_t(total_used))
+ << TextTable::endrow;
+ }
+ std::cout << tbl;
+ }
+
+ return r < 0 ? r : 0;
+}
+
+// Arguments for "rbd du": optional image/snap spec, output format,
+// --from-snapshot, --exact and --merge-snapshots.
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+ options->add_options()
+ (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+ "snapshot starting point")
+ ("exact", po::bool_switch(), "compute exact disk usage (slow)")
+ ("merge-snapshots", po::bool_switch(),
+ "merge snapshot sizes with its image");
+}
+
+// Entry point for "rbd disk-usage"/"du".
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ // the bool arg is tied to --from-snapshot being present — presumably
+ // the "image spec required" flag; confirm against utils
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, vm.count(at::FROM_SNAPSHOT_NAME),
+ utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string from_snap_name;
+ if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+ from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ librbd::RBD rbd;
+ // empty names become nullptr: "scan everything" for do_disk_usage
+ r = do_disk_usage(rbd, io_ctx,
+ image_name.empty() ? nullptr: image_name.c_str(),
+ snap_name.empty() ? nullptr : snap_name.c_str(),
+ from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+ vm["exact"].as<bool>(), formatter.get(),
+ vm["merge-snapshots"].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: du failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Flag-style options, then shell registration: "rbd disk-usage" ("du").
+Shell::SwitchArguments switched_arguments({"exact", "merge-snapshots"});
+Shell::Action action(
+ {"disk-usage"}, {"du"}, "Show disk usage stats for pool, image or snapshot.",
+ "", &get_arguments, &execute);
+
+} // namespace disk_usage
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Encryption.cc b/src/tools/rbd/action/Encryption.cc
new file mode 100644
index 000000000..a997fe701
--- /dev/null
+++ b/src/tools/rbd/action/Encryption.cc
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "include/scope_guard.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <fstream>
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace encryption {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ positional->add_options()
+ ("format", "encryption format [possible values: luks1, luks2]")
+ ("passphrase-file",
+ "path of file containing passphrase for unlocking the image");
+ options->add_options()
+ ("cipher-alg", po::value<at::EncryptionAlgorithm>(),
+ "encryption algorithm [possible values: aes-128, aes-256 (default)]");
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string format_str = utils::get_positional_argument(vm, arg_index++);
+ if (format_str.empty()) {
+ std::cerr << "rbd: must specify format." << std::endl;
+ return -EINVAL;
+ }
+
+ std::string passphrase_file =
+ utils::get_positional_argument(vm, arg_index++);
+ if (passphrase_file.empty()) {
+ std::cerr << "rbd: must specify passphrase-file." << std::endl;
+ return -EINVAL;
+ }
+
+ std::ifstream file(passphrase_file.c_str());
+ if (file.fail()) {
+ std::cerr << "rbd: unable to open passphrase file " << passphrase_file
+ << ": " << cpp_strerror(errno) << std::endl;
+ return -errno;
+ }
+ std::string passphrase((std::istreambuf_iterator<char>(file)),
+ (std::istreambuf_iterator<char>()));
+ auto sg = make_scope_guard([&] {
+ ceph_memzero_s(&passphrase[0], passphrase.size(), passphrase.size()); });
+ file.close();
+ if (!passphrase.empty() && passphrase[passphrase.length() - 1] == '\n') {
+ passphrase.erase(passphrase.length() - 1);
+ }
+
+ auto alg = RBD_ENCRYPTION_ALGORITHM_AES256;
+ if (vm.count("cipher-alg")) {
+ alg = vm["cipher-alg"].as<librbd::encryption_algorithm_t>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ if (format_str == "luks1") {
+ librbd::encryption_luks1_format_options_t opts = {};
+ opts.alg = alg;
+ opts.passphrase = passphrase;
+ r = image.encryption_format(
+ RBD_ENCRYPTION_FORMAT_LUKS1, &opts, sizeof(opts));
+ } else if (format_str == "luks2") {
+ librbd::encryption_luks2_format_options_t opts = {};
+ opts.alg = alg;
+ opts.passphrase = passphrase;
+ r = image.encryption_format(
+ RBD_ENCRYPTION_FORMAT_LUKS2, &opts, sizeof(opts));
+ } else {
+ std::cerr << "rbd: unsupported encryption format" << std::endl;
+ return -ENOTSUP;
+ }
+
+ if (r < 0) {
+ std::cerr << "rbd: encryption format error: " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+}
+
+Shell::Action action(
+ {"encryption", "format"}, {}, "Format image to an encrypted format.", "",
+ &get_arguments, &execute);
+
+} // namespace encryption
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Export.cc b/src/tools/rbd/action/Export.cc
new file mode 100644
index 000000000..f40c587cc
--- /dev/null
+++ b/src/tools/rbd/action/Export.cc
@@ -0,0 +1,650 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "include/encoding.h"
+#include <iostream>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <boost/program_options.hpp>
+#include <boost/scope_exit.hpp>
+
+namespace rbd {
+namespace action {
+namespace export_full {
+
+struct ExportDiffContext {
+ librbd::Image *image;
+ int fd;
+ int export_format;
+ uint64_t totalsize;
+ utils::ProgressContext pc;
+ OrderedThrottle throttle;
+
+ ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops,
+ bool no_progress, int eformat) :
+ image(i), fd(f), export_format(eformat), totalsize(t), pc("Exporting image", no_progress),
+ throttle(max_ops, true) {
+ }
+};
+
+class C_ExportDiff : public Context {
+public:
+ C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length,
+ bool exists, int export_format)
+ : m_export_diff_context(edc), m_offset(offset), m_length(length),
+ m_exists(exists), m_export_format(export_format) {
+ }
+
+ int send() {
+ if (m_export_diff_context->throttle.pending_error()) {
+ return m_export_diff_context->throttle.wait_for_ret();
+ }
+
+ C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this);
+ if (m_exists) {
+ librbd::RBD::AioCompletion *aio_completion =
+ new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback);
+
+ int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ int r = m_export_diff_context->image->aio_read2(
+ m_offset, m_length, m_read_data, aio_completion, op_flags);
+ if (r < 0) {
+ aio_completion->release();
+ ctx->complete(r);
+ }
+ } else {
+ ctx->complete(0);
+ }
+ return 0;
+ }
+
+ static int export_diff_cb(uint64_t offset, size_t length, int exists,
+ void *arg) {
+ ExportDiffContext *edc = reinterpret_cast<ExportDiffContext *>(arg);
+
+ C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists, edc->export_format);
+ return context->send();
+ }
+
+protected:
+ void finish(int r) override {
+ if (r >= 0) {
+ if (m_exists) {
+ m_exists = !m_read_data.is_zero();
+ }
+ r = write_extent(m_export_diff_context, m_offset, m_length, m_exists, m_export_format);
+ if (r == 0 && m_exists) {
+ r = m_read_data.write_fd(m_export_diff_context->fd);
+ }
+ }
+ m_export_diff_context->throttle.end_op(r);
+ }
+
+private:
+ ExportDiffContext *m_export_diff_context;
+ uint64_t m_offset;
+ uint64_t m_length;
+ bool m_exists;
+ int m_export_format;
+ bufferlist m_read_data;
+
+ static int write_extent(ExportDiffContext *edc, uint64_t offset,
+ uint64_t length, bool exists, int export_format) {
+ // extent
+ bufferlist bl;
+ __u8 tag = exists ? RBD_DIFF_WRITE : RBD_DIFF_ZERO;
+ uint64_t len = 0;
+ encode(tag, bl);
+ if (export_format == 2) {
+ if (tag == RBD_DIFF_WRITE)
+ len = 8 + 8 + length;
+ else
+ len = 8 + 8;
+ encode(len, bl);
+ }
+ encode(offset, bl);
+ encode(length, bl);
+ int r = bl.write_fd(edc->fd);
+
+ edc->pc.update_progress(offset, edc->totalsize);
+ return r;
+ }
+};
+
+
+int do_export_diff_fd(librbd::Image& image, const char *fromsnapname,
+ const char *endsnapname, bool whole_object,
+ int fd, bool no_progress, int export_format)
+{
+ int r;
+ librbd::image_info_t info;
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ {
+ // header
+ bufferlist bl;
+ if (export_format == 1)
+ bl.append(utils::RBD_DIFF_BANNER);
+ else
+ bl.append(utils::RBD_DIFF_BANNER_V2);
+
+ __u8 tag;
+ uint64_t len = 0;
+ if (fromsnapname) {
+ tag = RBD_DIFF_FROM_SNAP;
+ encode(tag, bl);
+ std::string from(fromsnapname);
+ if (export_format == 2) {
+ len = from.length() + 4;
+ encode(len, bl);
+ }
+ encode(from, bl);
+ }
+
+ if (endsnapname) {
+ tag = RBD_DIFF_TO_SNAP;
+ encode(tag, bl);
+ std::string to(endsnapname);
+ if (export_format == 2) {
+ len = to.length() + 4;
+ encode(len, bl);
+ }
+ encode(to, bl);
+ }
+
+ if (endsnapname && export_format == 2) {
+ tag = RBD_SNAP_PROTECTION_STATUS;
+ encode(tag, bl);
+ bool is_protected = false;
+ r = image.snap_is_protected(endsnapname, &is_protected);
+ if (r < 0) {
+ return r;
+ }
+ len = 8;
+ encode(len, bl);
+ encode(is_protected, bl);
+ }
+
+ tag = RBD_DIFF_IMAGE_SIZE;
+ encode(tag, bl);
+ uint64_t endsize = info.size;
+ if (export_format == 2) {
+ len = 8;
+ encode(len, bl);
+ }
+ encode(endsize, bl);
+
+ r = bl.write_fd(fd);
+ if (r < 0) {
+ return r;
+ }
+ }
+ ExportDiffContext edc(&image, fd, info.size,
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ no_progress, export_format);
+ r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+ &C_ExportDiff::export_diff_cb, (void *)&edc);
+ if (r < 0) {
+ goto out;
+ }
+
+ r = edc.throttle.wait_for_ret();
+ if (r < 0) {
+ goto out;
+ }
+
+ {
+ __u8 tag = RBD_DIFF_END;
+ bufferlist bl;
+ encode(tag, bl);
+ r = bl.write_fd(fd);
+ }
+
+out:
+ if (r < 0)
+ edc.pc.fail();
+ else
+ edc.pc.finish();
+
+ return r;
+}
+
+int do_export_diff(librbd::Image& image, const char *fromsnapname,
+ const char *endsnapname, bool whole_object,
+ const char *path, bool no_progress)
+{
+ int r;
+ int fd;
+
+ if (strcmp(path, "-") == 0)
+ fd = STDOUT_FILENO;
+ else
+ fd = open(path, O_WRONLY | O_CREAT | O_EXCL | O_BINARY, 0644);
+ if (fd < 0)
+ return -errno;
+
+ r = do_export_diff_fd(image, fromsnapname, endsnapname, whole_object, fd, no_progress, 1);
+
+ if (fd != 1)
+ close(fd);
+ if (r < 0 && fd != 1) {
+ remove(path);
+ }
+
+ return r;
+}
+
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments_diff(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_path_options(positional, options,
+ "export file (or '-' for stdout)");
+ options->add_options()
+ (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+ "snapshot starting point")
+ (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object");
+ at::add_no_progress_option(options);
+}
+
+int execute_diff(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string path;
+ r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string from_snap_name;
+ if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+ from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_export_diff(image,
+ from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+ snap_name.empty() ? nullptr : snap_name.c_str(),
+ vm[at::WHOLE_OBJECT].as<bool>(), path.c_str(),
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: export-diff error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_diff(
+ {"export-diff"}, {}, "Export incremental diff to file.", "",
+ &get_arguments_diff, &execute_diff);
+
+class C_Export : public Context
+{
+public:
+ C_Export(OrderedThrottle &ordered_throttle, librbd::Image &image,
+ uint64_t fd_offset, uint64_t offset, uint64_t length, int fd)
+ : m_throttle(ordered_throttle), m_image(image), m_dest_offset(fd_offset),
+ m_offset(offset), m_length(length), m_fd(fd)
+ {
+ }
+
+ void send()
+ {
+ auto ctx = m_throttle.start_op(this);
+ auto aio_completion = new librbd::RBD::AioCompletion(
+ ctx, &utils::aio_context_callback);
+ int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ int r = m_image.aio_read2(m_offset, m_length, m_bufferlist,
+ aio_completion, op_flags);
+ if (r < 0) {
+ cerr << "rbd: error requesting read from source image" << std::endl;
+ aio_completion->release();
+ m_throttle.end_op(r);
+ }
+ }
+
+ void finish(int r) override
+ {
+ BOOST_SCOPE_EXIT((&m_throttle) (&r))
+ {
+ m_throttle.end_op(r);
+ } BOOST_SCOPE_EXIT_END
+
+ if (r < 0) {
+ cerr << "rbd: error reading from source image at offset "
+ << m_offset << ": " << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ ceph_assert(m_bufferlist.length() == static_cast<size_t>(r));
+ if (m_fd != STDOUT_FILENO) {
+ if (m_bufferlist.is_zero()) {
+ return;
+ }
+
+ uint64_t chkret = lseek64(m_fd, m_dest_offset, SEEK_SET);
+ if (chkret != m_dest_offset) {
+ cerr << "rbd: error seeking destination image to offset "
+ << m_dest_offset << std::endl;
+ r = -errno;
+ return;
+ }
+ }
+
+ r = m_bufferlist.write_fd(m_fd);
+ if (r < 0) {
+ cerr << "rbd: error writing to destination image at offset "
+ << m_dest_offset << std::endl;
+ }
+ }
+
+private:
+ OrderedThrottle &m_throttle;
+ librbd::Image &m_image;
+ bufferlist m_bufferlist;
+ uint64_t m_dest_offset;
+ uint64_t m_offset;
+ uint64_t m_length;
+ int m_fd;
+};
+
+const uint32_t MAX_KEYS = 64;
+
+static int do_export_v2(librbd::Image& image, librbd::image_info_t &info, int fd,
+ uint64_t period, int max_concurrent_ops, utils::ProgressContext &pc)
+{
+ int r = 0;
+ // header
+ bufferlist bl;
+ bl.append(utils::RBD_IMAGE_BANNER_V2);
+
+ __u8 tag;
+ uint64_t length;
+ // encode order
+ tag = RBD_EXPORT_IMAGE_ORDER;
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(uint64_t(info.order), bl);
+
+ // encode features
+ tag = RBD_EXPORT_IMAGE_FEATURES;
+ uint64_t features;
+ image.features(&features);
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(features, bl);
+
+ // encode stripe_unit and stripe_count
+ tag = RBD_EXPORT_IMAGE_STRIPE_UNIT;
+ uint64_t stripe_unit;
+ stripe_unit = image.get_stripe_unit();
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(stripe_unit, bl);
+
+ tag = RBD_EXPORT_IMAGE_STRIPE_COUNT;
+ uint64_t stripe_count;
+ stripe_count = image.get_stripe_count();
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(stripe_count, bl);
+
+ //retrieve metadata of image
+ std::map<std::string, string> imagemetas;
+ std::string last_key;
+ bool more_results = true;
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+ r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+ if (r < 0) {
+ std::cerr << "failed to retrieve metadata of image : " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ if (!pairs.empty()) {
+ last_key = pairs.rbegin()->first;
+
+ for (auto kv : pairs) {
+ std::string key = kv.first;
+ std::string val(kv.second.c_str(), kv.second.length());
+ imagemetas[key] = val;
+ }
+ }
+ more_results = (pairs.size() == MAX_KEYS);
+ }
+
+ //encode imageMeta key and value
+ for (std::map<std::string, string>::iterator it = imagemetas.begin();
+ it != imagemetas.end(); ++it) {
+ string key = it->first;
+ string value = it->second;
+
+ tag = RBD_EXPORT_IMAGE_META;
+ length = key.length() + value.length() + 4 * 2;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(key, bl);
+ encode(value, bl);
+ }
+
+ // encode end tag
+ tag = RBD_EXPORT_IMAGE_END;
+ encode(tag, bl);
+
+ // write bl to fd.
+ r = bl.write_fd(fd);
+ if (r < 0) {
+ return r;
+ }
+
+ // header for snapshots
+ bl.clear();
+ bl.append(utils::RBD_IMAGE_DIFFS_BANNER_V2);
+
+ std::vector<librbd::snap_info_t> snaps;
+ r = image.snap_list(snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t diff_num = snaps.size() + 1;
+ encode(diff_num, bl);
+
+ r = bl.write_fd(fd);
+ if (r < 0) {
+ return r;
+ }
+
+ const char *last_snap = NULL;
+ for (size_t i = 0; i < snaps.size(); ++i) {
+ utils::snap_set(image, snaps[i].name.c_str());
+ r = do_export_diff_fd(image, last_snap, snaps[i].name.c_str(), false, fd, true, 2);
+ if (r < 0) {
+ return r;
+ }
+ pc.update_progress(i, snaps.size() + 1);
+ last_snap = snaps[i].name.c_str();
+ }
+ utils::snap_set(image, std::string(""));
+ r = do_export_diff_fd(image, last_snap, nullptr, false, fd, true, 2);
+ if (r < 0) {
+ return r;
+ }
+ pc.update_progress(snaps.size() + 1, snaps.size() + 1);
+ return r;
+}
+
+static int do_export_v1(librbd::Image& image, librbd::image_info_t &info,
+ int fd, uint64_t period, int max_concurrent_ops,
+ utils::ProgressContext &pc)
+{
+ int r = 0;
+ size_t file_size = 0;
+ OrderedThrottle throttle(max_concurrent_ops, false);
+ for (uint64_t offset = 0; offset < info.size; offset += period) {
+ if (throttle.pending_error()) {
+ break;
+ }
+
+ uint64_t length = min(period, info.size - offset);
+ C_Export *ctx = new C_Export(throttle, image, file_size + offset, offset,
+ length, fd);
+ ctx->send();
+
+ pc.update_progress(offset, info.size);
+ }
+
+ file_size += info.size;
+ r = throttle.wait_for_ret();
+ if (fd != 1) {
+ if (r >= 0) {
+ r = ftruncate(fd, file_size);
+ if (r < 0)
+ return r;
+
+ uint64_t chkret = lseek64(fd, file_size, SEEK_SET);
+ if (chkret != file_size)
+ r = errno;
+ }
+ }
+ return r;
+}
+
+static int do_export(librbd::Image& image, const char *path, bool no_progress,
+ int export_format)
+{
+ librbd::image_info_t info;
+ int64_t r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ int fd;
+ int max_concurrent_ops = g_conf().get_val<uint64_t>("rbd_concurrent_management_ops");
+ bool to_stdout = (strcmp(path, "-") == 0);
+ if (to_stdout) {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = open(path, O_WRONLY | O_CREAT | O_EXCL | O_BINARY, 0644);
+ if (fd < 0) {
+ return -errno;
+ }
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+ }
+
+ utils::ProgressContext pc("Exporting image", no_progress);
+ uint64_t period = image.get_stripe_count() * (1ull << info.order);
+
+ if (export_format == 1)
+ r = do_export_v1(image, info, fd, period, max_concurrent_ops, pc);
+ else
+ r = do_export_v2(image, info, fd, period, max_concurrent_ops, pc);
+
+ if (r < 0)
+ pc.fail();
+ else
+ pc.finish();
+ if (!to_stdout)
+ close(fd);
+ return r;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_path_options(positional, options,
+ "export file (or '-' for stdout)");
+ at::add_no_progress_option(options);
+ at::add_export_format_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string path;
+ r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ int format = 1;
+ if (vm.count("export-format"))
+ format = vm["export-format"].as<uint64_t>();
+
+ r = do_export(image, path.c_str(), vm[at::NO_PROGRESS].as<bool>(), format);
+ if (r < 0) {
+ std::cerr << "rbd: export error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"export"}, {}, "Export image to file.", "", &get_arguments, &execute);
+
+} // namespace export_full
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Feature.cc b/src/tools/rbd/action/Feature.cc
new file mode 100644
index 000000000..13a7b6ea7
--- /dev/null
+++ b/src/tools/rbd/action/Feature.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include <iostream>
+#include <map>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace feature {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options, bool enabled) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ positional->add_options()
+ ("features", po::value<at::ImageFeatures>()->multitoken(),
+ ("image features\n" + at::get_short_features_help(false)).c_str());
+ if (enabled) {
+ at::add_create_journal_options(options);
+ }
+}
+
+void get_arguments_disable(po::options_description *positional,
+ po::options_description *options) {
+ get_arguments(positional, options, false);
+}
+
+void get_arguments_enable(po::options_description *positional,
+ po::options_description *options) {
+ get_arguments(positional, options, true);
+}
+
+int execute(const po::variables_map &vm, bool enabled) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_journal_options(vm, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ std::vector<std::string> feature_names;
+ if (vm.count(at::POSITIONAL_ARGUMENTS)) {
+ const std::vector<std::string> &args =
+ vm[at::POSITIONAL_ARGUMENTS].as<std::vector<std::string> >();
+ feature_names.insert(feature_names.end(), args.begin() + arg_index,
+ args.end());
+ }
+
+ if (feature_names.empty()) {
+ std::cerr << "rbd: at least one feature name must be specified"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ boost::any features_any(static_cast<uint64_t>(0));
+ at::ImageFeatures image_features;
+ at::validate(features_any, feature_names, &image_features, 0);
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.update_features(boost::any_cast<uint64_t>(features_any), enabled);
+ if (r < 0) {
+ std::cerr << "rbd: failed to update image features: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int execute_disable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute(vm, false);
+}
+
+int execute_enable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute(vm, true);
+}
+
+Shell::Action action_disable(
+ {"feature", "disable"}, {}, "Disable the specified image feature.", "",
+ &get_arguments_disable, &execute_disable);
+Shell::Action action_enable(
+ {"feature", "enable"}, {}, "Enable the specified image feature.", "",
+ &get_arguments_enable, &execute_enable);
+
+} // namespace feature
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Flatten.cc b/src/tools/rbd/action/Flatten.cc
new file mode 100644
index 000000000..ec4e837a8
--- /dev/null
+++ b/src/tools/rbd/action/Flatten.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace flatten {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_flatten(librbd::Image& image, bool no_progress)
+{
+ utils::ProgressContext pc("Image flatten", no_progress);
+ int r = image.flatten_with_progress(pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_flatten(image, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: flatten error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"flatten"}, {}, "Fill clone with parent data (make it independent).", "",
+ &get_arguments, &execute);
+
+} // namespace flatten
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Ggate.cc b/src/tools/rbd/action/Ggate.cc
new file mode 100644
index 000000000..11782d70a
--- /dev/null
+++ b/src/tools/rbd/action/Ggate.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/param.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "include/stringify.h"
+#include "common/SubProcess.h"
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+#include <iostream>
+
+namespace rbd {
+namespace action {
+namespace ggate {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+#if defined(__FreeBSD__)
+static int call_ggate_cmd(const po::variables_map &vm,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &ceph_global_args) {
+ SubProcess process("rbd-ggate", SubProcess::KEEP, SubProcess::KEEP,
+ SubProcess::KEEP);
+
+ for (auto &arg : ceph_global_args) {
+ process.add_cmd_arg(arg.c_str());
+ }
+
+ for (auto &arg : args) {
+ process.add_cmd_arg(arg.c_str());
+ }
+
+ if (process.spawn()) {
+ std::cerr << "rbd: failed to run rbd-ggate: " << process.err() << std::endl;
+ return -EINVAL;
+ } else if (process.join()) {
+ std::cerr << "rbd: rbd-ggate failed with error: " << process.err()
+ << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+ return -EOPNOTSUPP;
+#else
+ std::vector<std::string> args;
+
+ args.push_back("list");
+
+ if (vm.count("format")) {
+ args.push_back("--format");
+ args.push_back(vm["format"].as<at::Format>().value);
+ }
+ if (vm["pretty-format"].as<bool>()) {
+ args.push_back("--pretty-format");
+ }
+
+ return call_ggate_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+ return -EOPNOTSUPP;
+#else
+ std::vector<std::string> args;
+
+ args.push_back("map");
+ std::string img;
+ int r = utils::get_image_or_snap_spec(vm, &img);
+ if (r < 0) {
+ return r;
+ }
+ args.push_back(img);
+
+ if (vm["quiesce"].as<bool>()) {
+ std::cerr << "rbd: warning: quiesce is not supported" << std::endl;
+ }
+
+ if (vm["read-only"].as<bool>()) {
+ args.push_back("--read-only");
+ }
+
+ if (vm["exclusive"].as<bool>()) {
+ args.push_back("--exclusive");
+ }
+
+ if (vm.count("quiesce-hook")) {
+ std::cerr << "rbd: warning: quiesce-hook is not supported" << std::endl;
+ }
+
+ if (vm.count("options")) {
+ utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
+ &args);
+ }
+
+ return call_ggate_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+ return -EOPNOTSUPP;
+#else
+ std::string device_name = utils::get_positional_argument(vm, 0);
+ if (!boost::starts_with(device_name, "/dev/")) {
+ device_name.clear();
+ }
+
+ std::string image_name;
+ if (device_name.empty()) {
+ int r = utils::get_image_or_snap_spec(vm, &image_name);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (device_name.empty() && image_name.empty()) {
+ std::cerr << "rbd: unmap requires either image name or device path"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<std::string> args;
+
+ args.push_back("unmap");
+ args.push_back(device_name.empty() ? image_name : device_name);
+
+ if (vm.count("options")) {
+ utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
+ &args);
+ }
+
+ return call_ggate_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+int execute_attach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+#else
+ std::cerr << "rbd: ggate attach command not supported" << std::endl;
+#endif
+ return -EOPNOTSUPP;
+}
+
+int execute_detach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+#else
+ std::cerr << "rbd: ggate detach command not supported" << std::endl;
+#endif
+ return -EOPNOTSUPP;
+}
+
+} // namespace ggate
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Group.cc b/src/tools/rbd/action/Group.cc
new file mode 100644
index 000000000..5c2232a6f
--- /dev/null
+++ b/src/tools/rbd/action/Group.cc
@@ -0,0 +1,912 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+
+namespace rbd {
+namespace action {
+namespace group {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static const std::string GROUP_SPEC("group-spec");
+static const std::string GROUP_SNAP_SPEC("group-snap-spec");
+
+static const std::string GROUP_NAME("group");
+static const std::string DEST_GROUP_NAME("dest-group");
+
+static const std::string GROUP_POOL_NAME("group-" + at::POOL_NAME);
+static const std::string IMAGE_POOL_NAME("image-" + at::POOL_NAME);
+
+void add_group_option(po::options_description *opt,
+ at::ArgumentModifier modifier) {
+ std::string name = GROUP_NAME;
+ std::string description = at::get_description_prefix(modifier) + "group name";
+ switch (modifier) {
+ case at::ARGUMENT_MODIFIER_NONE:
+ case at::ARGUMENT_MODIFIER_SOURCE:
+ break;
+ case at::ARGUMENT_MODIFIER_DEST:
+ name = DEST_GROUP_NAME;
+ break;
+ }
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_prefixed_pool_option(po::options_description *opt,
+ const std::string &prefix) {
+ std::string name = prefix + "-" + at::POOL_NAME;
+ std::string description = prefix + " pool name";
+
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_prefixed_namespace_option(po::options_description *opt,
+ const std::string &prefix) {
+ std::string name = prefix + "-" + at::NAMESPACE_NAME;
+ std::string description = prefix + " namespace name";
+
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_group_spec_options(po::options_description *pos,
+ po::options_description *opt,
+ at::ArgumentModifier modifier,
+ bool snap) {
+ at::add_pool_option(opt, modifier);
+ at::add_namespace_option(opt, modifier);
+ add_group_option(opt, modifier);
+ if (!snap) {
+ pos->add_options()
+ ((get_name_prefix(modifier) + GROUP_SPEC).c_str(),
+ (get_description_prefix(modifier) + "group specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<group-name>)").c_str());
+ } else {
+ add_snap_option(opt, modifier);
+ pos->add_options()
+ ((get_name_prefix(modifier) + GROUP_SNAP_SPEC).c_str(),
+ (get_description_prefix(modifier) + "group specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<group-name>@<snap-name>)").c_str());
+ }
+}
+
+int execute_create(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ librbd::RBD rbd;
+ r = rbd.group_create(io_ctx, group_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+ Formatter *f = formatter.get();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<std::string> names;
+ r = rbd.group_list(io_ctx, &names);
+ if (r < 0)
+ return r;
+
+ if (f)
+ f->open_array_section("groups");
+ for (auto i : names) {
+ if (f)
+ f->dump_string("name", i);
+ else
+ std::cout << i << std::endl;
+ }
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ librbd::RBD rbd;
+
+ r = rbd.group_remove(io_ctx, group_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: remove error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_rename(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dest_pool_name;
+ std::string dest_namespace_name;
+ std::string dest_group_name;
+
+ r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, at::DEST_POOL_NAME,
+ &dest_pool_name, &dest_namespace_name, DEST_GROUP_NAME, "group",
+ &dest_group_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (pool_name != dest_pool_name) {
+ std::cerr << "rbd: group rename across pools not supported" << std::endl
+ << "source pool: " << pool_name << ", dest pool: "
+ << dest_pool_name << std::endl;
+ return -EINVAL;
+ } else if (namespace_name != dest_namespace_name) {
+ std::cerr << "rbd: group rename across namespaces not supported"
+ << std::endl
+ << "source namespace: " << namespace_name << ", dest namespace: "
+ << dest_namespace_name << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_rename(io_ctx, group_name.c_str(),
+ dest_group_name.c_str());
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to rename group: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_add(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ // Parse group data.
+ std::string group_pool_name;
+ std::string group_namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME,
+ &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name,
+ nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string image_pool_name;
+ std::string image_namespace_name;
+ std::string image_name;
+
+ r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME,
+ &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image",
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (group_namespace_name != image_namespace_name) {
+ std::cerr << "rbd: group and image namespace must match." << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx cg_io_ctx;
+ r = utils::init(group_pool_name, group_namespace_name, &rados, &cg_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx image_io_ctx;
+ r = utils::init(image_pool_name, group_namespace_name, &rados, &image_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_image_add(cg_io_ctx, group_name.c_str(),
+ image_io_ctx, image_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: add image error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_remove_image(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string group_pool_name;
+ std::string group_namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME,
+ &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name,
+ nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string image_pool_name;
+ std::string image_namespace_name;
+ std::string image_name;
+ std::string image_id;
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME,
+ &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image",
+ &image_name, nullptr, image_id.empty(), utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (group_namespace_name != image_namespace_name) {
+ std::cerr << "rbd: group and image namespace must match." << std::endl;
+ return -EINVAL;
+ } else if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id. "
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx cg_io_ctx;
+ r = utils::init(group_pool_name, group_namespace_name, &rados, &cg_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx image_io_ctx;
+ r = utils::init(image_pool_name, group_namespace_name, &rados, &image_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ if (image_id.empty()) {
+ r = rbd.group_image_remove(cg_io_ctx, group_name.c_str(),
+ image_io_ctx, image_name.c_str());
+ } else {
+ r = rbd.group_image_remove_by_id(cg_io_ctx, group_name.c_str(),
+ image_io_ctx, image_id.c_str());
+ }
+ if (r < 0) {
+ std::cerr << "rbd: remove image error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_list_images(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+ Formatter *f = formatter.get();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<librbd::group_image_info_t> images;
+
+ r = rbd.group_image_list(io_ctx, group_name.c_str(), &images,
+ sizeof(librbd::group_image_info_t));
+
+ if (r == -ENOENT)
+ r = 0;
+
+ if (r < 0)
+ return r;
+
+ std::sort(images.begin(), images.end(),
+ [](const librbd::group_image_info_t &lhs,
+ const librbd::group_image_info_t &rhs) {
+ if (lhs.pool != rhs.pool) {
+ return lhs.pool < rhs.pool;
+ }
+ return lhs.name < rhs.name;
+ }
+ );
+
+ if (f)
+ f->open_array_section("images");
+
+ for (auto image : images) {
+ std::string image_name = image.name;
+ int state = image.state;
+ std::string state_string;
+ if (RBD_GROUP_IMAGE_STATE_INCOMPLETE == state) {
+ state_string = "incomplete";
+ }
+
+ std::string pool_name = "";
+
+ librados::Rados rados(io_ctx);
+ librados::IoCtx pool_io_ctx;
+ r = rados.ioctx_create2(image.pool, pool_io_ctx);
+ if (r < 0) {
+ pool_name = "<missing image pool " + stringify(image.pool) + ">";
+ } else {
+ pool_name = pool_io_ctx.get_pool_name();
+ }
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("image", image_name);
+ f->dump_string("pool", pool_name);
+ f->dump_string("namespace", io_ctx.get_namespace());
+ f->dump_int("state", state);
+ f->close_section();
+ } else {
+ std::cout << pool_name << "/";
+ if (!io_ctx.get_namespace().empty()) {
+ std::cout << io_ctx.get_namespace() << "/";
+ }
+ std::cout << image_name << " " << state_string << std::endl;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+int execute_group_snap_create(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ uint32_t flags;
+ r = utils::get_snap_create_flags(vm, &flags);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx io_ctx;
+ librados::Rados rados;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_snap_create2(io_ctx, group_name.c_str(), snap_name.c_str(),
+ flags);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_group_snap_remove(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx io_ctx;
+ librados::Rados rados;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_snap_remove(io_ctx, group_name.c_str(), snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: failed to remove group snapshot: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_group_snap_rename(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+ std::string source_snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &source_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dest_snap_name;
+ if (vm.count(at::DEST_SNAPSHOT_NAME)) {
+ dest_snap_name = vm[at::DEST_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ if (dest_snap_name.empty()) {
+ dest_snap_name = utils::get_positional_argument(vm, arg_index++);
+ }
+
+ if (dest_snap_name.empty()) {
+ std::cerr << "rbd: destination snapshot name was not specified"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ r = utils::validate_snapshot_name(at::ARGUMENT_MODIFIER_DEST, dest_snap_name,
+ utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_SNAP);
+ if (r < 0) {
+ return r;
+ }
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_snap_rename(io_ctx, group_name.c_str(),
+ source_snap_name.c_str(), dest_snap_name.c_str());
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to rename group snapshot: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_group_snap_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+ Formatter *f = formatter.get();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<librbd::group_snap_info_t> snaps;
+
+ r = rbd.group_snap_list(io_ctx, group_name.c_str(), &snaps,
+ sizeof(librbd::group_snap_info_t));
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ TextTable t;
+ if (f) {
+ f->open_array_section("group_snaps");
+ } else {
+ t.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ t.define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
+ }
+
+ for (auto i : snaps) {
+ std::string snap_name = i.name;
+ int state = i.state;
+ std::string state_string;
+ if (RBD_GROUP_SNAP_STATE_INCOMPLETE == state) {
+ state_string = "incomplete";
+ } else {
+ state_string = "ok";
+ }
+ if (r < 0) {
+ return r;
+ }
+ if (f) {
+ f->open_object_section("group_snap");
+ f->dump_string("snapshot", snap_name);
+ f->dump_string("state", state_string);
+ f->close_section();
+ } else {
+ t << snap_name << state_string << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (snaps.size()) {
+ std::cout << t;
+ }
+ return 0;
+}
+
+int execute_group_snap_rollback(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string group_name;
+ std::string namespace_name;
+ std::string pool_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx io_ctx;
+ librados::Rados rados;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ utils::ProgressContext pc("Rolling back to group snapshot",
+ vm[at::NO_PROGRESS].as<bool>());
+ r = rbd.group_snap_rollback_with_progress(io_ctx, group_name.c_str(),
+ snap_name.c_str(), pc);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: rollback group to snapshot failed: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ pc.finish();
+ return 0;
+}
+
+void get_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+}
+
+void get_rename_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE,
+ false);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST,
+ false);
+}
+
+void get_add_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ (GROUP_SPEC.c_str(),
+ "group specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<group-name>)");
+
+ add_prefixed_pool_option(options, "group");
+ add_prefixed_namespace_option(options, "group");
+ add_group_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ positional->add_options()
+ (at::IMAGE_SPEC.c_str(),
+ "image specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<image-name>)");
+
+ add_prefixed_pool_option(options, "image");
+ add_prefixed_namespace_option(options, "image");
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE,
+ " unless overridden");
+}
+
+void get_remove_image_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ (GROUP_SPEC.c_str(),
+ "group specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<group-name>)");
+
+ add_prefixed_pool_option(options, "group");
+ add_prefixed_namespace_option(options, "group");
+ add_group_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ positional->add_options()
+ (at::IMAGE_SPEC.c_str(),
+ "image specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<image-name>)");
+
+ add_prefixed_pool_option(options, "image");
+ add_prefixed_namespace_option(options, "image");
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE,
+ " unless overridden");
+ at::add_image_id_option(options);
+}
+
+void get_list_images_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_format_options(options);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_group_snap_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+ at::add_snap_create_options(options);
+}
+
+void get_group_snap_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+}
+
+void get_group_snap_rename_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+
+ positional->add_options()
+ (at::DEST_SNAPSHOT_NAME.c_str(),
+ "destination snapshot name\n(example: <snap-name>)");
+ at::add_snap_option(options, at::ARGUMENT_MODIFIER_DEST);
+}
+
+void get_group_snap_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_format_options(options);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_group_snap_rollback_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_no_progress_option(options);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+}
+
+Shell::Action action_create(
+ {"group", "create"}, {}, "Create a group.",
+ "", &get_create_arguments, &execute_create);
+Shell::Action action_remove(
+ {"group", "remove"}, {"group", "rm"}, "Delete a group.",
+ "", &get_remove_arguments, &execute_remove);
+Shell::Action action_list(
+ {"group", "list"}, {"group", "ls"}, "List rbd groups.",
+ "", &get_list_arguments, &execute_list);
+Shell::Action action_rename(
+ {"group", "rename"}, {}, "Rename a group within pool.",
+ "", &get_rename_arguments, &execute_rename);
+Shell::Action action_add(
+ {"group", "image", "add"}, {}, "Add an image to a group.",
+ "", &get_add_arguments, &execute_add);
+Shell::Action action_remove_image(
+ {"group", "image", "remove"}, {"group", "image", "rm"},
+ "Remove an image from a group.", "",
+ &get_remove_image_arguments, &execute_remove_image);
+Shell::Action action_list_images(
+ {"group", "image", "list"}, {"group", "image", "ls"},
+ "List images in a group.", "",
+ &get_list_images_arguments, &execute_list_images);
+Shell::Action action_group_snap_create(
+ {"group", "snap", "create"}, {}, "Make a snapshot of a group.",
+ "", &get_group_snap_create_arguments, &execute_group_snap_create);
+Shell::Action action_group_snap_remove(
+ {"group", "snap", "remove"}, {"group", "snap", "rm"},
+ "Remove a snapshot from a group.",
+ "", &get_group_snap_remove_arguments, &execute_group_snap_remove);
+Shell::Action action_group_snap_rename(
+ {"group", "snap", "rename"}, {}, "Rename group's snapshot.",
+ "", &get_group_snap_rename_arguments, &execute_group_snap_rename);
+Shell::Action action_group_snap_list(
+ {"group", "snap", "list"}, {"group", "snap", "ls"},
+ "List snapshots of a group.",
+ "", &get_group_snap_list_arguments, &execute_group_snap_list);
+Shell::Action action_group_snap_rollback(
+ {"group", "snap", "rollback"}, {},
+ "Rollback group to snapshot.",
+ "", &get_group_snap_rollback_arguments, &execute_group_snap_rollback);
+
+} // namespace group
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ImageMeta.cc b/src/tools/rbd/action/ImageMeta.cc
new file mode 100644
index 000000000..20c4555da
--- /dev/null
+++ b/src/tools/rbd/action/ImageMeta.cc
@@ -0,0 +1,345 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace image_meta {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+void add_key_option(po::options_description *positional) {
+ positional->add_options()
+ ("key", "image meta key");
+}
+
+int get_key(const po::variables_map &vm, size_t *arg_index,
+ std::string *key) {
+ *key = utils::get_positional_argument(vm, *arg_index);
+ if (key->empty()) {
+ std::cerr << "rbd: metadata key was not specified" << std::endl;
+ return -EINVAL;
+ } else {
+ ++(*arg_index);
+ }
+ return 0;
+}
+
+const uint32_t MAX_KEYS = 64;
+
+} // anonymous namespace
+
+static int do_metadata_list(librbd::Image& image, Formatter *f)
+{
+ int r;
+ TextTable tbl;
+
+ size_t count = 0;
+ std::string last_key;
+ bool more_results = true;
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+ r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+ if (r < 0) {
+ std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ more_results = (pairs.size() == MAX_KEYS);
+ if (!pairs.empty()) {
+ if (count == 0) {
+ if (f) {
+ f->open_object_section("metadatas");
+ } else {
+ tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+ }
+ }
+
+ last_key = pairs.rbegin()->first;
+ count += pairs.size();
+
+ for (auto kv : pairs) {
+ std::string val(kv.second.c_str(), kv.second.length());
+ if (f) {
+ f->dump_string(kv.first.c_str(), val.c_str());
+ } else {
+ tbl << kv.first << val << TextTable::endrow;
+ }
+ }
+ }
+ }
+
+ if (f == nullptr) {
+ bool single = (count == 1);
+ std::cout << "There " << (single ? "is" : "are") << " " << count << " "
+ << (single ? "metadatum" : "metadata") << " on this image"
+ << (count == 0 ? "." : ":") << std::endl;
+ }
+
+ if (count > 0) {
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << std::endl << tbl;
+ }
+ }
+ return 0;
+}
+
+static int do_metadata_set(librbd::Image& image, std::string &key,
+ std::string &value)
+{
+ int r = image.metadata_set(key, value);
+ if (r < 0) {
+ std::cerr << "failed to set metadata " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ }
+ return r;
+}
+
+static int do_metadata_remove(librbd::Image& image, std::string &key)
+{
+ int r = image.metadata_remove(key);
+ if (r == -ENOENT) {
+ std::cerr << "rbd: no existing metadata key " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ } else if(r < 0) {
+ std::cerr << "failed to remove metadata " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ }
+ return r;
+}
+
+static int do_metadata_get(librbd::Image& image, std::string &key)
+{
+ std::string s;
+ int r = image.metadata_get(key, &s);
+ if (r < 0) {
+ std::cerr << "failed to get metadata " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ std::cout << s << std::endl;
+ return r;
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_list(image, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: listing metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+}
+
+int execute_get(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_get(image, key);
+ if (r < 0) {
+ std::cerr << "rbd: getting metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+ positional->add_options()
+ ("value", "image meta value");
+}
+
+int execute_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string value = utils::get_positional_argument(vm, arg_index);
+ if (value.empty()) {
+ std::cerr << "rbd: metadata value was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_set(image, key, value);
+ if (r < 0) {
+ std::cerr << "rbd: setting metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_remove(image, key);
+ if (r < 0) {
+ std::cerr << "rbd: removing metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_list(
+ {"image-meta", "list"}, {"image-meta", "ls"}, "Image metadata list keys with values.", "",
+ &get_list_arguments, &execute_list);
+Shell::Action action_get(
+ {"image-meta", "get"}, {},
+ "Image metadata get the value associated with the key.", "",
+ &get_get_arguments, &execute_get);
+Shell::Action action_set(
+ {"image-meta", "set"}, {}, "Image metadata set key with value.", "",
+ &get_set_arguments, &execute_set);
+Shell::Action action_remove(
+ {"image-meta", "remove"}, {"image-meta", "rm"},
+ "Image metadata remove the key and value associated.", "",
+ &get_remove_arguments, &execute_remove);
+
+} // namespace image_meta
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Import.cc b/src/tools/rbd/action/Import.cc
new file mode 100644
index 000000000..0cdf3b713
--- /dev/null
+++ b/src/tools/rbd/action/Import.cc
@@ -0,0 +1,1033 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "common/blkdev.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "include/compat.h"
+#include "include/encoding.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/scoped_ptr.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+
+namespace rbd {
+namespace action {
+namespace import {
+
+// Shared state for a single import-diff run: the target image, the input
+// fd, the (possibly growing) expected stream size, a progress reporter,
+// and an ordered throttle that bounds in-flight async writes while
+// completing them in submission order.
+struct ImportDiffContext {
+ librbd::Image *image;
+ int fd;
+ size_t size;
+ utils::ProgressContext pc;
+ OrderedThrottle throttle;
+ uint64_t last_offset;
+
+ // When reading from stdin the throttle is limited to a single
+ // in-flight op; otherwise it uses rbd_concurrent_management_ops.
+ ImportDiffContext(librbd::Image *image, int fd, size_t size, bool no_progress)
+ : image(image), fd(fd), size(size), pc("Importing image diff", no_progress),
+ throttle((fd == STDIN_FILENO) ? 1 :
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ false),
+ last_offset(0) {
+ }
+
+ // For stdin streams the total size is unknown up front; it is updated
+ // from the stream's IMAGE_SIZE record as it is decoded.
+ void update_size(size_t new_size)
+ {
+ if (fd == STDIN_FILENO) {
+ size = new_size;
+ }
+ }
+
+ // Report progress at an explicit offset (only if a size is known).
+ void update_progress(uint64_t off)
+ {
+ if (size) {
+ pc.update_progress(off, size);
+ last_offset = off;
+ }
+ }
+
+ // Report progress at the current position: the file offset for a
+ // regular file, or the last recorded offset for stdin (not seekable).
+ void update_progress()
+ {
+ uint64_t off = last_offset;
+ if (fd != STDIN_FILENO) {
+ off = lseek(fd, 0, SEEK_CUR);
+ }
+
+ update_progress(off);
+ }
+
+ // Finalize the progress display according to the overall result.
+ void finish(int r)
+ {
+ if (r < 0) {
+ pc.fail();
+ } else {
+ pc.finish();
+ }
+ }
+};
+
+// Completion context for one async write (or write-zeroes) issued while
+// importing a diff. send() starts the op under the ordered throttle;
+// finish() updates progress and releases the throttle slot.
+class C_ImportDiff : public Context {
+public:
+ C_ImportDiff(ImportDiffContext *idiffctx, bufferlist data, uint64_t offset,
+ uint64_t length, bool write_zeroes)
+ : m_idiffctx(idiffctx), m_data(data), m_offset(offset), m_length(length),
+ m_write_zeroes(write_zeroes) {
+ // use block offset (stdin) or import file position to report
+ // progress.
+ if (m_idiffctx->fd == STDIN_FILENO) {
+ m_prog_offset = offset;
+ } else {
+ m_prog_offset = lseek(m_idiffctx->fd, 0, SEEK_CUR);
+ }
+ }
+
+ // Submit the async write. Returns a negative errno if a previous op
+ // already failed (short-circuits) or if submission itself fails.
+ int send()
+ {
+ if (m_idiffctx->throttle.pending_error()) {
+ return m_idiffctx->throttle.wait_for_ret();
+ }
+
+ C_OrderedThrottle *ctx = m_idiffctx->throttle.start_op(this);
+ librbd::RBD::AioCompletion *aio_completion =
+ new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback);
+
+ int r;
+ if (m_write_zeroes) {
+ r = m_idiffctx->image->aio_write_zeroes(m_offset, m_length,
+ aio_completion, 0U,
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ } else {
+ r = m_idiffctx->image->aio_write2(m_offset, m_length, m_data,
+ aio_completion,
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ }
+
+ if (r < 0) {
+ // submission failed: the completion will never fire, so release it
+ // and complete the throttle op with the error ourselves.
+ aio_completion->release();
+ ctx->complete(r);
+ }
+
+ return r;
+ }
+
+ // Called (in submission order) when the async write completes.
+ void finish(int r) override
+ {
+ m_idiffctx->update_progress(m_prog_offset);
+ m_idiffctx->throttle.end_op(r);
+ }
+
+private:
+ ImportDiffContext *m_idiffctx;
+ bufferlist m_data;
+ uint64_t m_offset;
+ uint64_t m_length;
+ bool m_write_zeroes;
+ uint64_t m_prog_offset;
+};
+
+// Decode the FROM_SNAP record of a diff stream and verify that the named
+// start snapshot exists on the target image; -EINVAL if it does not.
+static int do_image_snap_from(ImportDiffContext *idiffctx)
+{
+ int r;
+ string from;
+ r = utils::read_string(idiffctx->fd, 4096, &from); // 4k limit to make sure we don't get a garbage string
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode start snap name" << std::endl;
+ return r;
+ }
+
+ bool exists;
+ r = idiffctx->image->snap_exists2(from.c_str(), &exists);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query start snap state" << std::endl;
+ return r;
+ }
+
+ if (!exists) {
+ std::cerr << "start snapshot '" << from
+ << "' does not exist in the image, aborting" << std::endl;
+ return -EINVAL;
+ }
+
+ idiffctx->update_progress();
+ return 0;
+}
+
+// Decode the TO_SNAP record and verify the end snapshot does NOT yet
+// exist (-EEXIST otherwise). The name is returned via *tosnap so the
+// caller can create it after all data has been applied.
+static int do_image_snap_to(ImportDiffContext *idiffctx, std::string *tosnap)
+{
+ int r;
+ string to;
+ r = utils::read_string(idiffctx->fd, 4096, &to); // 4k limit to make sure we don't get a garbage string
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode end snap name" << std::endl;
+ return r;
+ }
+
+ bool exists;
+ r = idiffctx->image->snap_exists2(to.c_str(), &exists);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query end snap state" << std::endl;
+ return r;
+ }
+
+ if (exists) {
+ std::cerr << "end snapshot '" << to << "' already exists, aborting"
+ << std::endl;
+ return -EEXIST;
+ }
+
+ *tosnap = to;
+ idiffctx->update_progress();
+
+ return 0;
+}
+
+// Decode the one-byte snapshot protection flag from the stream.
+// *is_protected is set true for any non-zero byte.
+static int get_snap_protection_status(ImportDiffContext *idiffctx,
+ bool *is_protected)
+{
+ int r;
+ char buf[sizeof(__u8)];
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode snap protection status" << std::endl;
+ return r;
+ }
+
+ *is_protected = (buf[0] != 0);
+ idiffctx->update_progress();
+
+ return 0;
+}
+
+// Decode the IMAGE_SIZE record (little-endian uint64 via ceph decode)
+// and resize the target image to match if it differs. Also feeds the
+// new size into the progress context for stdin streams.
+// NOTE(review): errors from image->resize() are ignored here — the
+// return value is not checked; confirm whether that is intentional.
+static int do_image_resize(ImportDiffContext *idiffctx)
+{
+ int r;
+ char buf[sizeof(uint64_t)];
+ uint64_t end_size;
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode image size" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+ decode(end_size, p);
+
+ uint64_t cur_size;
+ idiffctx->image->size(&cur_size);
+ if (cur_size != end_size) {
+ idiffctx->image->resize(end_size);
+ }
+
+ idiffctx->update_size(end_size);
+ idiffctx->update_progress();
+ return 0;
+}
+
+// Decode one WRITE or ZERO record: a 16-byte header (offset + length,
+// both uint64) followed, for writes, by the payload. The payload is
+// re-sparsified in sparse_size chunks — runs of zeros are submitted as
+// write-zeroes ops instead of data writes. Each chunk is issued as an
+// async C_ImportDiff op under the context's ordered throttle.
+static int do_image_io(ImportDiffContext *idiffctx, bool write_zeroes,
+ size_t sparse_size)
+{
+ int r;
+ char buf[16];
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode IO length" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+
+ uint64_t image_offset, buffer_length;
+ decode(image_offset, p);
+ decode(buffer_length, p);
+
+ if (!write_zeroes) {
+ bufferptr bp = buffer::create(buffer_length);
+ r = safe_read_exact(idiffctx->fd, bp.c_str(), buffer_length);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode write data" << std::endl;
+ return r;
+ }
+
+ // Walk the payload and split it into zeroed / non-zeroed extents.
+ size_t buffer_offset = 0;
+ while (buffer_offset < buffer_length) {
+ size_t write_length = 0;
+ bool zeroed = false;
+ utils::calc_sparse_extent(bp, sparse_size, buffer_offset, buffer_length,
+ &write_length, &zeroed);
+ ceph_assert(write_length > 0);
+
+ bufferlist write_bl;
+ if (!zeroed) {
+ bufferptr write_ptr(bp, buffer_offset, write_length);
+ write_bl.push_back(write_ptr);
+ ceph_assert(write_bl.length() == write_length);
+ }
+
+ // ctx is owned by the throttle and self-deletes on completion.
+ C_ImportDiff *ctx = new C_ImportDiff(idiffctx, write_bl,
+ image_offset + buffer_offset,
+ write_length, zeroed);
+ r = ctx->send();
+ if (r < 0) {
+ return r;
+ }
+
+ buffer_offset += write_length;
+ }
+ } else {
+ // ZERO record: no payload to read, issue a single write-zeroes op.
+ bufferlist data;
+ C_ImportDiff *ctx = new C_ImportDiff(idiffctx, data, image_offset,
+ buffer_length, true);
+ return ctx->send();
+ }
+ return r;
+}
+
+// Read banner.size() bytes from fd and compare them against the
+// expected banner string; -EINVAL on mismatch. Used to recognize the
+// v1/v2 export and diff stream headers.
+static int validate_banner(int fd, std::string banner)
+{
+ int r;
+ char buf[banner.size() + 1];
+ memset(buf, 0, sizeof(buf));
+ r = safe_read_exact(fd, buf, banner.size());
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode diff banner" << std::endl;
+ return r;
+ }
+
+ buf[banner.size()] = '\0';
+ if (strcmp(buf, banner.c_str())) {
+ std::cerr << "rbd: invalid or unexpected diff banner" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+// Skip over `length` bytes of an unrecognized tag's payload: by reading
+// and discarding in 4 KiB chunks when fd is stdin (not seekable), or by
+// a single lseek64 otherwise.
+static int skip_tag(int fd, uint64_t length)
+{
+ int r;
+
+ if (fd == STDIN_FILENO) {
+ // read the appending data out to skip this tag.
+ char buf[4096];
+ uint64_t len = min<uint64_t>(length, sizeof(buf));
+ while (len > 0) {
+ r = safe_read_exact(fd, buf, len);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode skipped tag data" << std::endl;
+ return r;
+ }
+ length -= len;
+ len = min<uint64_t>(length, sizeof(buf));
+ }
+ } else {
+ // lseek to skip this tag
+ off64_t offs = lseek64(fd, length, SEEK_CUR);
+ if (offs < 0) {
+ return -errno;
+ }
+ }
+
+ return 0;
+}
+
+// Read the next record tag byte from the stream. In format 2, every tag
+// other than the end tag is followed by a uint64 payload length, which
+// is decoded into *readlen (format 1 records carry no length field).
+static int read_tag(int fd, __u8 end_tag, int format, __u8 *tag, uint64_t *readlen)
+{
+ int r;
+ __u8 read_tag;
+
+ r = safe_read_exact(fd, &read_tag, sizeof(read_tag));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode tag" << std::endl;
+ return r;
+ }
+
+ *tag = read_tag;
+ if (read_tag != end_tag && format == 2) {
+ char buf[sizeof(uint64_t)];
+ r = safe_read_exact(fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode tag length" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+ decode(*readlen, p);
+ }
+
+ return 0;
+}
+
+// Apply one diff stream from fd to the image: validate the banner for
+// the given format (1 or 2), then loop over tagged records (from-snap,
+// to-snap, protection status, resize, write/zero data) until the end
+// tag. After draining all async writes, create — and optionally
+// protect — the end snapshot recorded in the stream.
+int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd,
+ bool no_progress, int format, size_t sparse_size)
+{
+ int r;
+
+ uint64_t size = 0;
+ bool from_stdin = (fd == STDIN_FILENO);
+ if (!from_stdin) {
+ struct stat stat_buf;
+ r = ::fstat(fd, &stat_buf);
+ if (r < 0) {
+ std::cerr << "rbd: failed to stat specified diff file" << std::endl;
+ return r;
+ }
+ size = (uint64_t)stat_buf.st_size;
+ }
+
+ r = validate_banner(fd, (format == 1 ? utils::RBD_DIFF_BANNER :
+ utils::RBD_DIFF_BANNER_V2));
+ if (r < 0) {
+ return r;
+ }
+
+ // begin image import
+ std::string tosnap;
+ bool is_protected = false;
+ ImportDiffContext idiffctx(&image, fd, size, no_progress);
+ while (r == 0) {
+ __u8 tag;
+ uint64_t length = 0;
+
+ r = read_tag(fd, RBD_DIFF_END, format, &tag, &length);
+ if (r < 0 || tag == RBD_DIFF_END) {
+ break;
+ }
+
+ // Dispatch on the record tag; unknown tags are skipped so newer
+ // streams remain readable.
+ if (tag == RBD_DIFF_FROM_SNAP) {
+ r = do_image_snap_from(&idiffctx);
+ } else if (tag == RBD_DIFF_TO_SNAP) {
+ r = do_image_snap_to(&idiffctx, &tosnap);
+ } else if (tag == RBD_SNAP_PROTECTION_STATUS) {
+ r = get_snap_protection_status(&idiffctx, &is_protected);
+ } else if (tag == RBD_DIFF_IMAGE_SIZE) {
+ r = do_image_resize(&idiffctx);
+ } else if (tag == RBD_DIFF_WRITE || tag == RBD_DIFF_ZERO) {
+ r = do_image_io(&idiffctx, (tag == RBD_DIFF_ZERO), sparse_size);
+ } else {
+ std::cerr << "unrecognized tag byte " << (int)tag << " in stream; skipping"
+ << std::endl;
+ r = skip_tag(fd, length);
+ }
+ }
+
+ // Wait for all in-flight async writes before snapshotting.
+ int temp_r = idiffctx.throttle.wait_for_ret();
+ r = (r < 0) ? r : temp_r; // preserve original error
+ if (r == 0 && tosnap.length()) {
+ r = idiffctx.image->snap_create(tosnap.c_str());
+ if (r == 0 && is_protected) {
+ r = idiffctx.image->snap_protect(tosnap.c_str());
+ }
+ }
+
+ idiffctx.finish(r);
+ return r;
+}
+
+// Open the diff source ("-" means stdin) and delegate to
+// do_import_diff_fd with format 1 (standalone import-diff streams are
+// v1 framed). The fd is closed afterwards for non-stdin sources.
+// NOTE(review): the close guard tests `fd != 0` rather than
+// `fd != STDIN_FILENO`; equivalent in practice since STDIN_FILENO is 0,
+// but inconsistent with the from_stdin idiom used elsewhere — confirm.
+int do_import_diff(librados::Rados &rados, librbd::Image &image,
+ const char *path, bool no_progress, size_t sparse_size)
+{
+ int r;
+ int fd;
+
+ if (strcmp(path, "-") == 0) {
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(path, O_RDONLY|O_BINARY);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << path << std::endl;
+ return r;
+ }
+ }
+ r = do_import_diff_fd(rados, image, fd, no_progress, 1, sparse_size);
+
+ if (fd != 0)
+ close(fd);
+ return r;
+}
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Define the CLI arguments for "rbd import-diff": a source path (or '-'
+// for stdin), the destination image spec, plus the sparse-size and
+// no-progress options.
+void get_arguments_diff(po::options_description *positional,
+ po::options_description *options) {
+ at::add_path_options(positional, options,
+ "import file (or '-' for stdin)");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_sparse_size_option(options);
+ at::add_no_progress_option(options);
+}
+
+// Handler for "rbd import-diff": resolve the source path and destination
+// image from the arguments, open the image, and apply the diff stream.
+// An -EDOM result from the import (decode failure) is reported to the
+// caller as -EBADMSG.
+int execute_diff(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string path;
+ size_t arg_index = 0;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ // Sparse-write granularity; defaults when --sparse-size is absent.
+ size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+ if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+ sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_import_diff(rados, image, path.c_str(),
+ vm[at::NO_PROGRESS].as<bool>(), sparse_size);
+ if (r == -EDOM) {
+ r = -EBADMSG;
+ }
+ if (r < 0) {
+ cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Register the "import-diff" command with the rbd shell dispatcher.
+Shell::Action action_diff(
+ {"import-diff"}, {}, "Import an incremental diff.", "", &get_arguments_diff,
+ &execute_diff);
+
+// Completion context for one async write during a full (v1) image
+// import. Unlike C_ImportDiff this uses a SimpleThrottle: ops complete
+// in any order; errors surface through throttle.wait_for_ret().
+class C_Import : public Context {
+public:
+ C_Import(SimpleThrottle &simple_throttle, librbd::Image &image,
+ bufferlist &bl, uint64_t offset)
+ : m_throttle(simple_throttle), m_image(image),
+ m_aio_completion(
+ new librbd::RBD::AioCompletion(this, &utils::aio_context_callback)),
+ m_bufferlist(bl), m_offset(offset)
+ {
+ }
+
+ // Submit the async write; on submission failure the error is routed
+ // through the throttle so the importer's wait_for_ret() observes it.
+ void send()
+ {
+ m_throttle.start_op();
+
+ int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
+ m_aio_completion, op_flags);
+ if (r < 0) {
+ std::cerr << "rbd: error requesting write to destination image"
+ << std::endl;
+ m_aio_completion->release();
+ m_throttle.end_op(r);
+ }
+ }
+
+ // Async-write completion: report failures and release the throttle slot.
+ void finish(int r) override
+ {
+ if (r < 0) {
+ std::cerr << "rbd: error writing to destination image at offset "
+ << m_offset << ": " << cpp_strerror(r) << std::endl;
+ }
+ m_throttle.end_op(r);
+ }
+
+private:
+ SimpleThrottle &m_throttle;
+ librbd::Image &m_image;
+ librbd::RBD::AioCompletion *m_aio_completion;
+ bufferlist m_bufferlist;
+ uint64_t m_offset;
+};
+
+// Decode a uint64 image-option value from the stream and store it into
+// opts under `imageopt` — but only if the option is not already set
+// (opts.get() failing means "unset"), so explicit CLI options win over
+// values recorded in the export header.
+static int decode_and_set_image_option(int fd, uint64_t imageopt, librbd::ImageOptions& opts)
+{
+ int r;
+ char buf[sizeof(uint64_t)];
+
+ r = safe_read_exact(fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode image option" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto it = bl.cbegin();
+
+ uint64_t val;
+ decode(val, it);
+
+ // get() overwrites val when the option is already set; only in the
+ // "not set" case does the decoded value get applied.
+ if (opts.get(imageopt, &val) != 0) {
+ opts.set(imageopt, val);
+ }
+
+ return 0;
+}
+
+// Apply the key/value metadata collected from a v2 export header to the
+// newly created image. v1 streams carry no metadata, so this is a no-op
+// for import_format == 1. Stops at the first metadata_set failure.
+static int do_import_metadata(int import_format, librbd::Image& image,
+ const std::map<std::string, std::string> &imagemetas)
+{
+ int r = 0;
+
+ //v1 format
+ if (import_format == 1) {
+ return 0;
+ }
+
+ for (std::map<std::string, std::string>::const_iterator it = imagemetas.begin();
+ it != imagemetas.end(); ++it) {
+ r = image.metadata_set(it->first, it->second);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+// Decode one IMAGE_META record: a length-prefixed key string followed by
+// a length-prefixed value string, inserted into *imagemetas. `length`
+// is used as the per-string read cap passed to utils::read_string.
+static int decode_imagemeta(int fd, uint64_t length, std::map<std::string, std::string>* imagemetas)
+{
+ int r;
+ string key;
+ string value;
+
+ r = utils::read_string(fd, length, &key);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode metadata key" << std::endl;
+ return r;
+ }
+
+ r = utils::read_string(fd, length, &value);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode metadata value" << std::endl;
+ return r;
+ }
+
+ (*imagemetas)[key] = value;
+ return 0;
+}
+
+// Parse the v2 export image header: validate the banner, then consume
+// tagged property records (order, features, stripe unit/count, metadata)
+// until the IMAGE_END tag, filling opts and *imagemetas. v1 streams have
+// no header, so import_format == 1 returns immediately.
+static int do_import_header(int fd, int import_format, librbd::ImageOptions& opts,
+ std::map<std::string, std::string>* imagemetas)
+{
+ // There is no header in v1 image.
+ if (import_format == 1) {
+ return 0;
+ }
+
+ int r;
+ r = validate_banner(fd, utils::RBD_IMAGE_BANNER_V2);
+ if (r < 0) {
+ return r;
+ }
+
+ // As V1 format for image is already deprecated, import image in V2 by default.
+ uint64_t image_format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &image_format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, image_format);
+ }
+
+ // NOTE(review): image_format (the rbd image format) is passed to
+ // read_tag as the stream framing format — these happen to both be 2
+ // here, but the parameter semantics differ; confirm.
+ while (r == 0) {
+ __u8 tag;
+ uint64_t length = 0;
+ r = read_tag(fd, RBD_EXPORT_IMAGE_END, image_format, &tag, &length);
+ if (r < 0 || tag == RBD_EXPORT_IMAGE_END) {
+ break;
+ }
+
+ if (tag == RBD_EXPORT_IMAGE_ORDER) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_ORDER, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_FEATURES) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_FEATURES, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_STRIPE_UNIT) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_UNIT, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_STRIPE_COUNT) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_COUNT, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_META) {
+ r = decode_imagemeta(fd, length, imagemetas);
+ } else {
+ std::cerr << "rbd: invalid tag in image properties zone: " << tag << "Skip it."
+ << std::endl;
+ r = skip_tag(fd, length);
+ }
+ }
+
+ return r;
+}
+
+// Import the data section of a v2 export stream: after the diffs banner,
+// a uint64 diff count is decoded and that many diff streams are applied
+// in sequence via do_import_diff_fd (format 2, per-diff progress
+// suppressed; overall progress reported per completed diff).
+static int do_import_v2(librados::Rados &rados, int fd, librbd::Image &image,
+ uint64_t size, size_t imgblklen,
+ utils::ProgressContext &pc, size_t sparse_size)
+{
+ int r = 0;
+ r = validate_banner(fd, utils::RBD_IMAGE_DIFFS_BANNER_V2);
+ if (r < 0) {
+ return r;
+ }
+
+ char buf[sizeof(uint64_t)];
+ r = safe_read_exact(fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode diff count" << std::endl;
+ return r;
+ }
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+ uint64_t diff_num;
+ decode(diff_num, p);
+ for (size_t i = 0; i < diff_num; i++) {
+ r = do_import_diff_fd(rados, image, fd, true, 2, sparse_size);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ pc.update_progress(i + 1, diff_num);
+ }
+
+ return r;
+}
+
+// Import raw (v1) image data: read the source in imgblklen-sized blocks,
+// sparsify each block (skip all-zero extents), and write the rest via
+// async C_Import ops under a SimpleThrottle. For stdin the destination
+// image is grown by doubling as data arrives and trimmed to the exact
+// byte count at EOF; stdin also forces single-op throttling so the
+// final resize is ordered after all writes.
+static int do_import_v1(int fd, librbd::Image &image, uint64_t size,
+ size_t imgblklen, utils::ProgressContext &pc,
+ size_t sparse_size)
+{
+ int r = 0;
+ size_t reqlen = imgblklen; // amount requested from read
+ ssize_t readlen; // amount received from one read
+ size_t blklen = 0; // amount accumulated from reads to fill blk
+ char *p = new char[imgblklen];
+ uint64_t image_pos = 0;
+ bool from_stdin = (fd == STDIN_FILENO);
+ boost::scoped_ptr<SimpleThrottle> throttle;
+
+ if (from_stdin) {
+ throttle.reset(new SimpleThrottle(1, false));
+ } else {
+ throttle.reset(new SimpleThrottle(
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), false));
+ }
+
+ reqlen = min<uint64_t>(reqlen, size);
+ // loop body handles 0 return, as we may have a block to flush
+ while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) {
+ if (throttle->pending_error()) {
+ break;
+ }
+
+ blklen += readlen;
+ // if read was short, try again to fill the block before writing
+ if (readlen && ((size_t)readlen < reqlen)) {
+ reqlen -= readlen;
+ continue;
+ }
+ if (!from_stdin)
+ pc.update_progress(image_pos, size);
+
+ bufferptr blkptr(p, blklen);
+ // resize output image by binary expansion as we go for stdin
+ if (from_stdin && (image_pos + (size_t)blklen) > size) {
+ size *= 2;
+ r = image.resize(size);
+ if (r < 0) {
+ std::cerr << "rbd: can't resize image during import" << std::endl;
+ goto out;
+ }
+ }
+
+ // write as much as we got; perhaps less than imgblklen
+ // but skip writing zeros to create sparse images
+ size_t buffer_offset = 0;
+ while (buffer_offset < blklen) {
+ size_t write_length = 0;
+ bool zeroed = false;
+ utils::calc_sparse_extent(blkptr, sparse_size, buffer_offset, blklen,
+ &write_length, &zeroed);
+
+ if (!zeroed) {
+ bufferlist write_bl;
+ bufferptr write_ptr(blkptr, buffer_offset, write_length);
+ write_bl.push_back(write_ptr);
+ ceph_assert(write_bl.length() == write_length);
+
+ // ctx self-deletes via its completion callback.
+ C_Import *ctx = new C_Import(*throttle, image, write_bl,
+ image_pos + buffer_offset);
+ ctx->send();
+ }
+
+ buffer_offset += write_length;
+ }
+
+ // done with whole block, whether written or not
+ image_pos += blklen;
+ if (!from_stdin && image_pos >= size)
+ break;
+ // if read had returned 0, we're at EOF and should quit
+ if (readlen == 0)
+ break;
+ blklen = 0;
+ reqlen = imgblklen;
+ }
+ r = throttle->wait_for_ret();
+ if (r < 0) {
+ goto out;
+ }
+
+ // stdin: shrink the doubled image back to the bytes actually read.
+ if (fd == STDIN_FILENO) {
+ r = image.resize(image_pos);
+ if (r < 0) {
+ std::cerr << "rbd: final image resize failed" << std::endl;
+ goto out;
+ }
+ }
+out:
+ delete[] p;
+ return r;
+}
+
+// Top-level import driver: open the source (file, block device, or
+// stdin), determine its size, parse the v2 header if present, create the
+// destination image, apply metadata, then stream the data via the v1 or
+// v2 path. On any failure after creation the partial image is removed.
+// Uses goto-based cleanup labels (err/done/done2) for the open fd and
+// progress context.
+static int do_import(librados::Rados &rados, librbd::RBD &rbd,
+ librados::IoCtx& io_ctx, const char *imgname,
+ const char *path, librbd::ImageOptions& opts,
+ bool no_progress, int import_format, size_t sparse_size)
+{
+ int fd, r;
+ struct stat stat_buf;
+ utils::ProgressContext pc("Importing image", no_progress);
+ std::map<std::string, std::string> imagemetas;
+
+ ceph_assert(imgname);
+
+ uint64_t order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ order = g_conf().get_val<uint64_t>("rbd_default_order");
+ }
+
+ // try to fill whole imgblklen blocks for sparsification
+ size_t imgblklen = 1 << order;
+ librbd::Image image;
+ uint64_t size = 0;
+
+ bool from_stdin = !strcmp(path, "-");
+ if (from_stdin) {
+ fd = STDIN_FILENO;
+ // stdin size is unknown; start with one object and grow as needed.
+ size = 1ULL << order;
+ } else {
+ if ((fd = open(path, O_RDONLY|O_BINARY)) < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << path << std::endl;
+ goto done2;
+ }
+
+ if ((fstat(fd, &stat_buf)) < 0) {
+ r = -errno;
+ std::cerr << "rbd: stat error " << path << std::endl;
+ goto done;
+ }
+ if (S_ISDIR(stat_buf.st_mode)) {
+ r = -EISDIR;
+ std::cerr << "rbd: cannot import a directory" << std::endl;
+ goto done;
+ }
+ if (stat_buf.st_size)
+ size = (uint64_t)stat_buf.st_size;
+
+ // st_size is 0 for block devices; query the device size instead.
+ if (!size) {
+ int64_t bdev_size = 0;
+ BlkDev blkdev(fd);
+ r = blkdev.get_size(&bdev_size);
+ if (r < 0) {
+ std::cerr << "rbd: unable to get size of file/block device"
+ << std::endl;
+ goto done;
+ }
+ ceph_assert(bdev_size >= 0);
+ size = (uint64_t) bdev_size;
+ }
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+ }
+
+ r = do_import_header(fd, import_format, opts, &imagemetas);
+ if (r < 0) {
+ std::cerr << "rbd: import header failed." << std::endl;
+ goto done;
+ }
+
+ r = rbd.create4(io_ctx, imgname, size, opts);
+ if (r < 0) {
+ std::cerr << "rbd: image creation failed" << std::endl;
+ goto done;
+ }
+
+ r = rbd.open(io_ctx, image, imgname);
+ if (r < 0) {
+ std::cerr << "rbd: failed to open image" << std::endl;
+ goto err;
+ }
+
+ r = do_import_metadata(import_format, image, imagemetas);
+ if (r < 0) {
+ std::cerr << "rbd: failed to import image-meta" << std::endl;
+ goto err;
+ }
+
+ if (import_format == 1) {
+ r = do_import_v1(fd, image, size, imgblklen, pc, sparse_size);
+ } else {
+ r = do_import_v2(rados, fd, image, size, imgblklen, pc, sparse_size);
+ }
+ if (r < 0) {
+ std::cerr << "rbd: failed to import image" << std::endl;
+ image.close();
+ goto err;
+ }
+
+ r = image.close();
+err:
+ // Don't leave a half-imported image behind on failure.
+ if (r < 0)
+ rbd.remove(io_ctx, imgname);
+done:
+ if (r < 0)
+ pc.fail();
+ else
+ pc.finish();
+ if (!from_stdin)
+ close(fd);
+done2:
+ return r;
+}
+
+// Define the CLI arguments for "rbd import": source path, destination
+// image spec, image-creation options, sparse-size, no-progress, and the
+// export-format selector. Deprecated 'pool'/'image' options are kept
+// for backward compatibility with the legacy argument style.
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_path_options(positional, options,
+ "import file (or '-' for stdin)");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_create_image_options(options, true);
+ at::add_sparse_size_option(options);
+ at::add_no_progress_option(options);
+ at::add_export_format_option(options);
+
+ // TODO legacy rbd allowed import to accept both 'image'/'dest' and
+ // 'pool'/'dest-pool'
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, " deprecated[:dest-pool]");
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, " deprecated[:dest]");
+}
+
+// Handler for "rbd import": resolve source path and destination image
+// (falling back to the source basename and deprecated pool/image options
+// for legacy invocations), gather image-creation options, connect, and
+// run do_import with the selected stream format (default v1).
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string path;
+ size_t arg_index = 0;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ // odd check to support legacy / deprecated behavior of import
+ std::string deprecated_pool_name;
+ if (vm.count(at::POOL_NAME)) {
+ deprecated_pool_name = vm[at::POOL_NAME].as<std::string>();
+ }
+
+ std::string deprecated_image_name;
+ if (vm.count(at::IMAGE_NAME)) {
+ deprecated_image_name = vm[at::IMAGE_NAME].as<std::string>();
+ } else {
+ // default destination name: basename of the source path
+ deprecated_image_name = path.substr(path.find_last_of("/\\") + 1);
+ }
+
+ std::string deprecated_snap_name;
+ r = utils::extract_spec(deprecated_image_name, &deprecated_pool_name,
+ nullptr, &deprecated_image_name,
+ &deprecated_snap_name, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+ if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+ sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+ }
+
+ // Explicit --dest-* arguments override the deprecated/derived values.
+ std::string pool_name = deprecated_pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name = deprecated_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, false, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (image_name.empty()) {
+ image_name = deprecated_image_name;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, true, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ int format = 1;
+ if (vm.count("export-format"))
+ format = vm["export-format"].as<uint64_t>();
+
+ librbd::RBD rbd;
+ r = do_import(rados, rbd, io_ctx, image_name.c_str(), path.c_str(),
+ opts, vm[at::NO_PROGRESS].as<bool>(), format, sparse_size);
+ if (r < 0) {
+ std::cerr << "rbd: import failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Register the "import" command with the rbd shell dispatcher.
+Shell::Action action(
+ {"import"}, {}, "Import image from file.", at::get_long_features_help(),
+ &get_arguments, &execute);
+
+} // namespace import
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Info.cc b/src/tools/rbd/action/Info.cc
new file mode 100644
index 000000000..f8d053cd7
--- /dev/null
+++ b/src/tools/rbd/action/Info.cc
@@ -0,0 +1,471 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#include "common/Clock.h"
+
+namespace rbd {
+namespace action {
+namespace info {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Render the set bits of `bitmask` using `mapping` (bit value -> label).
+// With a Formatter, emits an array section named "<name>s" containing
+// one "<name>" string per set bit; without one, prints a tab-indented
+// comma-separated line to stdout.
+static void format_bitmask(Formatter *f, const std::string &name,
+ const std::map<uint64_t, std::string>& mapping,
+ uint64_t bitmask)
+{
+ int count = 0;
+ std::string group_name(name + "s");
+ if (f == NULL) {
+ std::cout << "\t" << group_name << ": ";
+ } else {
+ f->open_array_section(group_name.c_str());
+ }
+ for (std::map<uint64_t, std::string>::const_iterator it = mapping.begin();
+ it != mapping.end(); ++it) {
+ if ((it->first & bitmask) == 0) {
+ continue;
+ }
+
+ if (f == NULL) {
+ if (count++ > 0) {
+ std::cout << ", ";
+ }
+ std::cout << it->second;
+ } else {
+ f->dump_string(name.c_str(), it->second);
+ }
+ }
+ if (f == NULL) {
+ std::cout << std::endl;
+ } else {
+ f->close_section();
+ }
+}
+
+// Render the image feature bits using the shared feature-name mapping.
+static void format_features(Formatter *f, uint64_t features)
+{
+ format_bitmask(f, "feature", at::ImageFeatures::FEATURE_MAPPING, features);
+}
+
+// Render the operation-feature bits (clone parent/child, group,
+// snap-trash) by name.
+static void format_op_features(Formatter *f, uint64_t op_features)
+{
+ static std::map<uint64_t, std::string> mapping = {
+ {RBD_OPERATION_FEATURE_CLONE_PARENT, RBD_OPERATION_FEATURE_NAME_CLONE_PARENT},
+ {RBD_OPERATION_FEATURE_CLONE_CHILD, RBD_OPERATION_FEATURE_NAME_CLONE_CHILD},
+ {RBD_OPERATION_FEATURE_GROUP, RBD_OPERATION_FEATURE_NAME_GROUP},
+ {RBD_OPERATION_FEATURE_SNAP_TRASH, RBD_OPERATION_FEATURE_NAME_SNAP_TRASH}};
+ format_bitmask(f, "op_feature", mapping, op_features);
+}
+
+// Render the image flag bits (invalid object map / fast-diff) by name.
+static void format_flags(Formatter *f, uint64_t flags)
+{
+ std::map<uint64_t, std::string> mapping = {
+ {RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid"},
+ {RBD_FLAG_FAST_DIFF_INVALID, "fast diff invalid"}};
+ format_bitmask(f, "flag", mapping, flags);
+}
+
+// Convert a timespec to a ctime()-style string with the trailing newline
+// stripped. A zero tv_sec means "unset" and leaves timestamp_str
+// untouched (callers test for empty).
+void format_timestamp(struct timespec timestamp, std::string &timestamp_str) {
+ if(timestamp.tv_sec != 0) {
+ time_t ts = timestamp.tv_sec;
+ timestamp_str = ctime(&ts);
+ timestamp_str = timestamp_str.substr(0, timestamp_str.length() - 1);
+ }
+}
+
+static int do_show_info(librados::IoCtx &io_ctx, librbd::Image& image,
+ const std::string &snapname, Formatter *f)
+{
+ librbd::image_info_t info;
+ uint8_t old_format;
+ uint64_t overlap, features, flags, snap_limit;
+ bool snap_protected = false;
+ librbd::mirror_image_info_t mirror_image;
+ librbd::mirror_image_mode_t mirror_mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
+ std::vector<librbd::snap_info_t> snaps;
+ int r;
+
+ std::string imgname;
+ r = image.get_name(&imgname);
+ if (r < 0)
+ return r;
+
+ r = image.snap_list(snaps);
+ if (r < 0)
+ return r;
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ r = image.old_format(&old_format);
+ if (r < 0)
+ return r;
+
+ std::string imgid;
+ if (!old_format) {
+ r = image.get_id(&imgid);
+ if (r < 0)
+ return r;
+ }
+
+ std::string data_pool;
+ if (!old_format) {
+ int64_t data_pool_id = image.get_data_pool_id();
+ if (data_pool_id != io_ctx.get_id()) {
+ librados::Rados rados(io_ctx);
+ librados::IoCtx data_io_ctx;
+ r = rados.ioctx_create2(data_pool_id, data_io_ctx);
+ if (r < 0) {
+ data_pool = "<missing data pool " + stringify(data_pool_id) + ">";
+ } else {
+ data_pool = data_io_ctx.get_pool_name();
+ }
+ }
+ }
+
+ r = image.overlap(&overlap);
+ if (r < 0)
+ return r;
+
+ r = image.features(&features);
+ if (r < 0)
+ return r;
+
+ uint64_t op_features;
+ r = image.get_op_features(&op_features);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.get_flags(&flags);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!snapname.empty()) {
+ r = image.snap_is_protected(snapname.c_str(), &snap_protected);
+ if (r < 0)
+ return r;
+ }
+
+ mirror_image.state = RBD_MIRROR_IMAGE_DISABLED;
+ r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image));
+ if (r < 0) {
+ return r;
+ }
+
+ if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+ r = image.mirror_image_get_mode(&mirror_mode);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ r = image.snap_get_limit(&snap_limit);
+ if (r < 0)
+ return r;
+
+ std::string prefix = image.get_block_name_prefix();
+
+ librbd::group_info_t group_info;
+ r = image.get_group(&group_info, sizeof(group_info));
+ if (r < 0) {
+ return r;
+ }
+
+ std::string group_string = "";
+ if (RBD_GROUP_INVALID_POOL != group_info.pool) {
+ std::string group_pool;
+ librados::Rados rados(io_ctx);
+ librados::IoCtx group_io_ctx;
+ r = rados.ioctx_create2(group_info.pool, group_io_ctx);
+ if (r < 0) {
+ group_pool = "<missing group pool " + stringify(group_info.pool) + ">";
+ } else {
+ group_pool = group_io_ctx.get_pool_name();
+ }
+
+ group_string = group_pool + "/";
+ if (!io_ctx.get_namespace().empty()) {
+ group_string += io_ctx.get_namespace() + "/";
+ }
+ group_string += group_info.name;
+ }
+
+ struct timespec create_timestamp;
+ image.get_create_timestamp(&create_timestamp);
+
+ std::string create_timestamp_str = "";
+ format_timestamp(create_timestamp, create_timestamp_str);
+
+ struct timespec access_timestamp;
+ image.get_access_timestamp(&access_timestamp);
+
+ std::string access_timestamp_str = "";
+ format_timestamp(access_timestamp, access_timestamp_str);
+
+ struct timespec modify_timestamp;
+ image.get_modify_timestamp(&modify_timestamp);
+
+ std::string modify_timestamp_str = "";
+ format_timestamp(modify_timestamp, modify_timestamp_str);
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("name", imgname);
+ f->dump_string("id", imgid);
+ f->dump_unsigned("size", info.size);
+ f->dump_unsigned("objects", info.num_objs);
+ f->dump_int("order", info.order);
+ f->dump_unsigned("object_size", info.obj_size);
+ f->dump_int("snapshot_count", snaps.size());
+ if (!data_pool.empty()) {
+ f->dump_string("data_pool", data_pool);
+ }
+ f->dump_string("block_name_prefix", prefix);
+ f->dump_int("format", (old_format ? 1 : 2));
+ } else {
+ std::cout << "rbd image '" << imgname << "':\n"
+ << "\tsize " << byte_u_t(info.size) << " in "
+ << info.num_objs << " objects"
+ << std::endl
+ << "\torder " << info.order
+ << " (" << byte_u_t(info.obj_size) << " objects)"
+ << std::endl
+ << "\tsnapshot_count: " << snaps.size()
+ << std::endl;
+ if (!imgid.empty()) {
+ std::cout << "\tid: " << imgid << std::endl;
+ }
+ if (!data_pool.empty()) {
+ std::cout << "\tdata_pool: " << data_pool << std::endl;
+ }
+ std::cout << "\tblock_name_prefix: " << prefix
+ << std::endl
+ << "\tformat: " << (old_format ? "1" : "2")
+ << std::endl;
+ }
+
+ if (!old_format) {
+ format_features(f, features);
+ format_op_features(f, op_features);
+ format_flags(f, flags);
+ }
+
+ if (!group_string.empty()) {
+ if (f) {
+ f->dump_string("group", group_string);
+ } else {
+ std::cout << "\tgroup: " << group_string
+ << std::endl;
+ }
+ }
+
+ if (!create_timestamp_str.empty()) {
+ if (f) {
+ f->dump_string("create_timestamp", create_timestamp_str);
+ } else {
+ std::cout << "\tcreate_timestamp: " << create_timestamp_str
+ << std::endl;
+ }
+ }
+
+ if (!access_timestamp_str.empty()) {
+ if (f) {
+ f->dump_string("access_timestamp", access_timestamp_str);
+ } else {
+ std::cout << "\taccess_timestamp: " << access_timestamp_str
+ << std::endl;
+ }
+ }
+
+ if (!modify_timestamp_str.empty()) {
+ if (f) {
+ f->dump_string("modify_timestamp", modify_timestamp_str);
+ } else {
+ std::cout << "\tmodify_timestamp: " << modify_timestamp_str
+ << std::endl;
+ }
+ }
+
+ // snapshot info, if present
+ if (!snapname.empty()) {
+ if (f) {
+ f->dump_string("protected", snap_protected ? "true" : "false");
+ } else {
+ std::cout << "\tprotected: " << (snap_protected ? "True" : "False")
+ << std::endl;
+ }
+ }
+
+ if (snap_limit < UINT64_MAX) {
+ if (f) {
+ f->dump_unsigned("snapshot_limit", snap_limit);
+ } else {
+ std::cout << "\tsnapshot_limit: " << snap_limit << std::endl;
+ }
+ }
+
+ // parent info, if present
+ librbd::linked_image_spec_t parent_image_spec;
+ librbd::snap_spec_t parent_snap_spec;
+ if ((image.get_parent(&parent_image_spec, &parent_snap_spec) == 0) &&
+ (parent_image_spec.image_name.length() > 0)) {
+ if (f) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image_spec.pool_name);
+ f->dump_string("pool_namespace", parent_image_spec.pool_namespace);
+ f->dump_string("image", parent_image_spec.image_name);
+ f->dump_string("id", parent_image_spec.image_id);
+ f->dump_string("snapshot", parent_snap_spec.name);
+ f->dump_bool("trash", parent_image_spec.trash);
+ f->dump_unsigned("overlap", overlap);
+ f->close_section();
+ } else {
+ std::cout << "\tparent: " << parent_image_spec.pool_name << "/";
+ if (!parent_image_spec.pool_namespace.empty()) {
+ std::cout << parent_image_spec.pool_namespace << "/";
+ }
+ std::cout << parent_image_spec.image_name << "@"
+ << parent_snap_spec.name;
+ if (parent_image_spec.trash) {
+ std::cout << " (trash " << parent_image_spec.image_id << ")";
+ }
+ std::cout << std::endl;
+ std::cout << "\toverlap: " << byte_u_t(overlap) << std::endl;
+ }
+ }
+
+ // striping info, if feature is set
+ if (features & RBD_FEATURE_STRIPINGV2) {
+ if (f) {
+ f->dump_unsigned("stripe_unit", image.get_stripe_unit());
+ f->dump_unsigned("stripe_count", image.get_stripe_count());
+ } else {
+ std::cout << "\tstripe unit: " << byte_u_t(image.get_stripe_unit())
+ << std::endl
+ << "\tstripe count: " << image.get_stripe_count() << std::endl;
+ }
+ }
+
+ if (features & RBD_FEATURE_JOURNALING) {
+ if (f) {
+ f->dump_string("journal", utils::image_id(image));
+ } else {
+ std::cout << "\tjournal: " << utils::image_id(image) << std::endl;
+ }
+ }
+
+ if (features & RBD_FEATURE_JOURNALING ||
+ mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+ if (f) {
+ f->open_object_section("mirroring");
+ f->dump_string("mode",
+ utils::mirror_image_mode(mirror_mode));
+ f->dump_string("state",
+ utils::mirror_image_state(mirror_image.state));
+ if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+ f->dump_string("global_id", mirror_image.global_id);
+ f->dump_bool("primary", mirror_image.primary);
+ }
+ f->close_section();
+ } else {
+ std::cout << "\tmirroring state: "
+ << utils::mirror_image_state(mirror_image.state) << std::endl;
+ if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+ std::cout << "\tmirroring mode: "
+ << utils::mirror_image_mode(mirror_mode) << std::endl
+ << "\tmirroring global id: " << mirror_image.global_id
+ << std::endl
+ << "\tmirroring primary: "
+ << (mirror_image.primary ? "true" : "false") <<std::endl;
+ }
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+// Register CLI arguments for "rbd info": positional image-or-snap spec,
+// the optional --image-id flag, and the common --format/--pretty-format
+// output options.
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_id_option(options);
+  at::add_format_options(options);
+}
+
+// Entry point for "rbd info": resolves the image (by name or id, not both),
+// opens it read-only, and delegates to do_show_info().  Returns 0 on
+// success or a negative errno.
+int execute(const po::variables_map &vm,
+  const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  std::string image_id;
+
+  if (vm.count(at::IMAGE_ID)) {
+    image_id = vm[at::IMAGE_ID].as<std::string>();
+  }
+
+  // image name is only required when no --image-id was supplied
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, image_id.empty(),
+    utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  // name and id are mutually exclusive ways to address the image
+  if (!image_id.empty() && !image_name.empty()) {
+    std::cerr << "rbd: trying to access image using both name and id. "
+              << std::endl;
+    return -EINVAL;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  // open read-only (last bool): "info" never mutates the image
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name,
+                                 image_id, snap_name, true, &rados, &io_ctx,
+                                 &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_info(io_ctx, image, snap_name, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: info: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Register the "info" command with the rbd shell dispatcher.
+Shell::Action action(
+  {"info"}, {}, "Show information about image size, striping, etc.", "",
+  &get_arguments, &execute);
+
+} // namespace info
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Journal.cc b/src/tools/rbd/action/Journal.cc
new file mode 100644
index 000000000..08606fcc3
--- /dev/null
+++ b/src/tools/rbd/action/Journal.cc
@@ -0,0 +1,1251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/Cond.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/stringify.h"
+#include <fstream>
+#include <sstream>
+#include <boost/program_options.hpp>
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/journal/cls_journal_client.h"
+
+#include "journal/Journaler.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include "journal/Settings.h"
+#include "librbd/journal/Types.h"
+
+namespace rbd {
+namespace action {
+namespace journal {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// CLI argument keys: the positional journal spec, plus the source and
+// destination --journal option names.
+static const std::string JOURNAL_SPEC("journal-spec");
+static const std::string JOURNAL_NAME("journal");
+static const std::string DEST_JOURNAL_NAME("dest-journal");
+
+// Add a --journal (or --dest-journal for ARGUMENT_MODIFIER_DEST) string
+// option to `opt`, with a description prefixed per the modifier.
+void add_journal_option(po::options_description *opt,
+                        at::ArgumentModifier modifier) {
+  std::string name = JOURNAL_NAME;
+  std::string description = at::get_description_prefix(modifier) +
+                            "journal name";
+  switch (modifier) {
+  case at::ARGUMENT_MODIFIER_NONE:
+  case at::ARGUMENT_MODIFIER_SOURCE:
+    break;
+  case at::ARGUMENT_MODIFIER_DEST:
+    name = DEST_JOURNAL_NAME;
+    break;
+  }
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+// Add the positional journal-spec argument plus the pool / namespace /
+// image / journal options shared by all journal subcommands.
+void add_journal_spec_options(po::options_description *pos,
+                              po::options_description *opt,
+                              at::ArgumentModifier modifier) {
+
+  pos->add_options()
+    ((get_name_prefix(modifier) + JOURNAL_SPEC).c_str(),
+     (get_description_prefix(modifier) + "journal specification\n" +
+      "(example: [<pool-name>/[<namespace>/]]<journal-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_namespace_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_journal_option(opt, modifier);
+}
+
+// Resolve pool / namespace / journal names from the CLI, in precedence
+// order: explicit --pool/--namespace/--journal options, a spec embedded in
+// the --journal or --image value, the positional spec argument, and
+// finally — when only an image was given — the journal id looked up from
+// the image (which must have journaling enabled).  Returns 0 on success or
+// a negative errno; fails with -EINVAL if no journal can be determined.
+int get_pool_journal_names(const po::variables_map &vm,
+                           at::ArgumentModifier mod,
+                           size_t *spec_arg_index,
+                           std::string *pool_name,
+                           std::string *namespace_name,
+                           std::string *journal_name) {
+  // source vs destination commands use different option keys
+  std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_POOL_NAME : at::POOL_NAME);
+  std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME);
+  std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+  std::string journal_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    DEST_JOURNAL_NAME : JOURNAL_NAME);
+
+  if (vm.count(pool_key) && pool_name != nullptr) {
+    *pool_name = vm[pool_key].as<std::string>();
+  }
+  if (vm.count(namespace_key) && namespace_name != nullptr) {
+    *namespace_name = vm[namespace_key].as<std::string>();
+  }
+  if (vm.count(journal_key) && journal_name != nullptr) {
+    *journal_name = vm[journal_key].as<std::string>();
+  }
+
+  std::string image_name;
+  if (vm.count(image_key)) {
+    image_name = vm[image_key].as<std::string>();
+  }
+
+  int r;
+  if (journal_name != nullptr && !journal_name->empty()) {
+    // despite the separate pool option,
+    // we can also specify them via the journal option
+    std::string journal_name_copy(*journal_name);
+    r = extract_spec(journal_name_copy, pool_name, namespace_name, journal_name,
+                     nullptr, utils::SPEC_VALIDATION_FULL);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (!image_name.empty()) {
+    // despite the separate pool option,
+    // we can also specify them via the image option
+    std::string image_name_copy(image_name);
+    r = extract_spec(image_name_copy, pool_name, namespace_name, &image_name,
+                     nullptr, utils::SPEC_VALIDATION_NONE);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // fall back to the positional spec argument, consuming one index
+  if (journal_name != nullptr && spec_arg_index != nullptr &&
+      journal_name->empty()) {
+    std::string spec = utils::get_positional_argument(vm, (*spec_arg_index)++);
+    if (!spec.empty()) {
+      r = extract_spec(spec, pool_name, namespace_name, journal_name, nullptr,
+                       utils::SPEC_VALIDATION_FULL);
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+
+  if (pool_name != nullptr && pool_name->empty()) {
+    *pool_name = utils::get_default_pool_name();
+  }
+
+  if (pool_name != nullptr && namespace_name != nullptr &&
+      journal_name != nullptr && journal_name->empty() && !image_name.empty()) {
+    // Try to get journal name from image info.
+    librados::Rados rados;
+    librados::IoCtx io_ctx;
+    librbd::Image image;
+    int r = utils::init_and_open_image(*pool_name, *namespace_name, image_name,
+                                       "", "", true, &rados, &io_ctx, &image);
+    if (r < 0) {
+      std::cerr << "rbd: failed to open image " << image_name
+                << " to get journal name: " << cpp_strerror(r) << std::endl;
+      return r;
+    }
+
+    uint64_t features;
+    r = image.features(&features);
+    if (r < 0) {
+      return r;
+    }
+    if ((features & RBD_FEATURE_JOURNALING) == 0) {
+      std::cerr << "rbd: journaling is not enabled for image " << image_name
+                << std::endl;
+      return -EINVAL;
+    }
+    // the journal id is the image id
+    *journal_name = utils::image_id(image);
+  }
+
+  if (journal_name != nullptr && journal_name->empty()) {
+    std::string prefix = at::get_description_prefix(mod);
+    std::cerr << "rbd: "
+              << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+              << "journal was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Show the immutable journal metadata (order, splay width, object pool)
+// for `journal_id`, either via the formatter `f` or as plain text.
+// Returns 0 on success or a negative errno.
+static int do_show_journal_info(librados::Rados& rados, librados::IoCtx& io_ctx,
+                                const std::string& journal_id, Formatter *f)
+{
+  int r;
+  C_SaferCond cond;
+
+  std::string header_oid = ::journal::Journaler::header_oid(journal_id);
+  std::string object_oid_prefix = ::journal::Journaler::object_oid_prefix(
+    io_ctx.get_id(), journal_id);
+  uint8_t order;
+  uint8_t splay_width;
+  int64_t pool_id;
+
+  cls::journal::client::get_immutable_metadata(io_ctx, header_oid, &order,
+                                               &splay_width, &pool_id, &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "failed to get journal metadata: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  // pool_id < 0 means journal data lives in the same pool; a failed
+  // reverse lookup is non-fatal (the name is simply omitted)
+  std::string object_pool_name;
+  if (pool_id >= 0) {
+    r = rados.pool_reverse_lookup(pool_id, &object_pool_name);
+    if (r < 0) {
+      std::cerr << "error looking up pool name for pool_id=" << pool_id << ": "
+                << cpp_strerror(r) << std::endl;
+    }
+  }
+
+  if (f) {
+    f->open_object_section("journal");
+    f->dump_string("journal_id", journal_id);
+    f->dump_string("header_oid", header_oid);
+    f->dump_string("object_oid_prefix", object_oid_prefix);
+    f->dump_int("order", order);
+    f->dump_int("splay_width", splay_width);
+    if (!object_pool_name.empty()) {
+      f->dump_string("object_pool", object_pool_name);
+    }
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    std::cout << "rbd journal '" << journal_id << "':" << std::endl;
+    std::cout << "\theader_oid: " << header_oid << std::endl;
+    std::cout << "\tobject_oid_prefix: " << object_oid_prefix << std::endl;
+    std::cout << "\torder: " << static_cast<int>(order) << " ("
+              << byte_u_t(1ull << order) << " objects)"<< std::endl;
+    std::cout << "\tsplay_width: " << static_cast<int>(splay_width) << std::endl;
+    if (!object_pool_name.empty()) {
+      std::cout << "\tobject_pool: " << object_pool_name << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Show the mutable journal metadata (minimum/active object sets and the
+// registered clients) for `journal_id`, formatted via `f` or as text.
+// Returns 0 on success or a negative errno.
+static int do_show_journal_status(librados::IoCtx& io_ctx,
+                                  const std::string& journal_id, Formatter *f)
+{
+  int r;
+
+  C_SaferCond cond;
+  uint64_t minimum_set;
+  uint64_t active_set;
+  std::set<cls::journal::Client> registered_clients;
+  std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+  cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+                                             &active_set, &registered_clients,
+                                             &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "warning: failed to get journal metadata" << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("status");
+    f->dump_unsigned("minimum_set", minimum_set);
+    f->dump_unsigned("active_set", active_set);
+    f->open_array_section("registered_clients");
+    for (std::set<cls::journal::Client>::iterator c =
+          registered_clients.begin(); c != registered_clients.end(); ++c) {
+      f->open_object_section("client");
+      c->dump(f);
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    std::cout << "minimum_set: " << minimum_set << std::endl;
+    std::cout << "active_set: " << active_set << std::endl;
+    std::cout << "registered clients: " << std::endl;
+    for (std::set<cls::journal::Client>::iterator c =
+          registered_clients.begin(); c != registered_clients.end(); ++c) {
+      std::cout << "\t" << *c << std::endl;
+    }
+  }
+  return 0;
+}
+
+// Reset a journal by toggling the journaling feature off and back on for
+// the owning image (the journal id is the image id, so the image name is
+// resolved from the rbd directory).  Returns 0 on success or a negative
+// errno.
+static int do_reset_journal(librados::IoCtx& io_ctx,
+                            const std::string& journal_id)
+{
+  // disable/re-enable journaling to delete/re-create the journal
+  // to properly handle mirroring constraints
+  std::string image_name;
+  int r = librbd::cls_client::dir_get_name(&io_ctx, RBD_DIRECTORY, journal_id,
+                                           &image_name);
+  if (r < 0) {
+    std::cerr << "failed to locate journal's image: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  librbd::Image image;
+  r = utils::open_image(io_ctx, image_name, false, &image);
+  if (r < 0) {
+    std::cerr << "failed to open image: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  r = image.update_features(RBD_FEATURE_JOURNALING, false);
+  if (r < 0) {
+    std::cerr << "failed to disable image journaling: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  r = image.update_features(RBD_FEATURE_JOURNALING, true);
+  if (r < 0) {
+    std::cerr << "failed to re-enable image journaling: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Mark registered journal clients as disconnected so the journal can be
+// trimmed past their commit positions.  If `client_id` is empty, all
+// clients except the image's own (empty-id) client are disconnected;
+// otherwise only the matching client is.  Watchers are notified of the
+// state change.  Returns 0 on success, -ENOENT if no matching client was
+// found, or another negative errno.
+static int do_disconnect_journal_client(librados::IoCtx& io_ctx,
+                                        const std::string& journal_id,
+                                        const std::string& client_id)
+{
+  int r;
+
+  C_SaferCond cond;
+  uint64_t minimum_set;
+  uint64_t active_set;
+  std::set<cls::journal::Client> registered_clients;
+  std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+  cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+                                             &active_set, &registered_clients,
+                                             &cond);
+  r = cond.wait();
+  if (r < 0) {
+    std::cerr << "warning: failed to get journal metadata" << std::endl;
+    return r;
+  }
+
+  // the image itself registers under the empty client id; never disconnect it
+  static const std::string IMAGE_CLIENT_ID("");
+
+  bool found = false;
+  for (auto &c : registered_clients) {
+    if (c.id == IMAGE_CLIENT_ID || (!client_id.empty() && client_id != c.id)) {
+      continue;
+    }
+    r = cls::journal::client::client_update_state(io_ctx, oid, c.id,
+      cls::journal::CLIENT_STATE_DISCONNECTED);
+    if (r < 0) {
+      std::cerr << "warning: failed to disconnect client " << c.id << ": "
+                << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    std::cout << "client " << c.id << " disconnected" << std::endl;
+    found = true;
+  }
+
+  if (!found) {
+    if (!client_id.empty()) {
+      std::cerr << "warning: client " << client_id << " is not registered"
+                << std::endl;
+    } else {
+      std::cerr << "no registered clients to disconnect" << std::endl;
+    }
+    return -ENOENT;
+  }
+
+  // wake any watchers so they observe the disconnected state promptly
+  bufferlist bl;
+  r = io_ctx.notify2(oid, bl, 5000, NULL);
+  if (r < 0) {
+    // fixed: message previously emitted a duplicated colon
+    // ("...state change:" << ": " ...)
+    std::cerr << "warning: failed to notify state change: "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Thin wrapper over ::journal::Journaler that registers a throw-away
+// client on init() and unregisters it on shut_down(), so CLI tools can
+// replay/append a journal without leaving a registration behind.
+class Journaler : public ::journal::Journaler {
+public:
+  Journaler(librados::IoCtx& io_ctx, const std::string& journal_id,
+            const std::string &client_id) :
+    ::journal::Journaler(io_ctx, journal_id, client_id, {}, nullptr) {
+  }
+
+  // Register the client and initialize the journaler synchronously.
+  // Returns 0 on success or a negative errno (after best-effort cleanup).
+  int init() {
+    int r;
+
+    // TODO register with librbd payload
+    r = register_client(bufferlist());
+    if (r < 0) {
+      std::cerr << "failed to register client: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+
+    C_SaferCond cond;
+
+    ::journal::Journaler::init(&cond);
+    r = cond.wait();
+    if (r < 0) {
+      std::cerr << "failed to initialize journal: " << cpp_strerror(r)
+                << std::endl;
+      // best effort: drop the registration created above
+      (void) unregister_client();
+      return r;
+    }
+
+    return 0;
+  }
+
+  // Unregister the client and shut down; returns the unregister result
+  // (shut_down itself reports no status here).
+  int shut_down() {
+    int r = unregister_client();
+    if (r < 0) {
+      std::cerr << "rbd: failed to unregister journal client: "
+                << cpp_strerror(r) << std::endl;
+    }
+    ::journal::Journaler::shut_down();
+
+    return r;
+  }
+};
+
+// Base class for journal-reading tools (inspect/export): drives a replay
+// of all journal entries and hands each one to the subclass via
+// process_entry().  m_r accumulates the first error seen; m_cond is
+// signalled when replay stops.
+class JournalPlayer {
+public:
+  JournalPlayer(librados::IoCtx& io_ctx, const std::string& journal_id,
+                const std::string &client_id) :
+    m_journaler(io_ctx, journal_id, client_id),
+    m_cond(),
+    m_r(0) {
+  }
+
+  virtual ~JournalPlayer() {}
+
+  // Replay the whole journal; blocks until replay completes.  Returns the
+  // first error recorded (0 if none).
+  virtual int exec() {
+    int r;
+
+    r = m_journaler.init();
+    if (r < 0) {
+      return r;
+    }
+
+    ReplayHandler replay_handler(this);
+
+    m_journaler.start_replay(&replay_handler);
+
+    // wait for handle_replay_complete() -> stop_replay() to finish
+    r = m_cond.wait();
+    if (r < 0) {
+      std::cerr << "rbd: failed to process journal: " << cpp_strerror(r)
+                << std::endl;
+      if (m_r == 0) {
+        m_r = r;
+      }
+    }
+    return m_r;
+  }
+
+  int shut_down() {
+    return m_journaler.shut_down();
+  }
+
+protected:
+  // Bridges journaler callbacks back into this player instance.
+  struct ReplayHandler : public ::journal::ReplayHandler {
+    JournalPlayer *journal;
+    explicit ReplayHandler(JournalPlayer *_journal) : journal(_journal) {}
+
+    void handle_entries_available() override {
+      journal->handle_replay_ready();
+    }
+    void handle_complete(int r) override {
+      journal->handle_replay_complete(r);
+    }
+  };
+
+  // Drain all currently available entries, stopping early on the first
+  // process_entry() error.
+  void handle_replay_ready() {
+    int r = 0;
+    while (true) {
+      ::journal::ReplayEntry replay_entry;
+      uint64_t tag_id;
+      if (!m_journaler.try_pop_front(&replay_entry, &tag_id)) {
+        break;
+      }
+
+      r = process_entry(replay_entry, tag_id);
+      if (r < 0) {
+        break;
+      }
+    }
+  }
+
+  // Subclass hook: handle one replayed entry; return <0 to stop draining.
+  virtual int process_entry(::journal::ReplayEntry replay_entry,
+                            uint64_t tag_id) = 0;
+
+  // Record the first error and stop the replay (signals m_cond).
+  void handle_replay_complete(int r) {
+    if (m_r == 0 && r < 0) {
+      m_r = r;
+    }
+    m_journaler.stop_replay(&m_cond);
+  }
+
+  Journaler m_journaler;
+  C_SaferCond m_cond;
+  int m_r;
+};
+
+// Decode a journal data blob into `event_entry`; optionally dump it as
+// pretty JSON when `verbose`.  Returns 0 on success or -EINVAL if the
+// blob cannot be decoded.
+static int inspect_entry(bufferlist& data,
+                         librbd::journal::EventEntry& event_entry,
+                         bool verbose) {
+  try {
+    auto it = data.cbegin();
+    decode(event_entry, it);
+  } catch (const buffer::error &err) {
+    std::cerr << "failed to decode event entry: " << err.what() << std::endl;
+    return -EINVAL;
+  }
+  if (verbose) {
+    JSONFormatter f(true);
+    f.open_object_section("event_entry");
+    event_entry.dump(&f);
+    f.close_section();
+    f.flush(std::cout);
+  }
+  return 0;
+}
+
+// "journal inspect" player: decodes every entry, counting totals and
+// decode errors, and prints a summary after replay.
+class JournalInspector : public JournalPlayer {
+public:
+  JournalInspector(librados::IoCtx& io_ctx, const std::string& journal_id,
+                   bool verbose) :
+    JournalPlayer(io_ctx, journal_id, "INSPECT"),
+    m_verbose(verbose),
+    m_s() {
+  }
+
+  int exec() override {
+    int r = JournalPlayer::exec();
+    m_s.print();
+    return r;
+  }
+
+private:
+  // Running tally of inspected entries and decode failures.
+  struct Stats {
+    Stats() : total(0), error(0) {}
+
+    void print() {
+      std::cout << "Summary:" << std::endl
+                << "  " << total << " entries inspected, " << error << " errors"
+                << std::endl;
+    }
+
+    int total;
+    int error;
+  };
+
+  // Always returns 0 so replay continues past corrupt entries; decode
+  // failures are recorded in m_r / m_s.error instead.
+  int process_entry(::journal::ReplayEntry replay_entry,
+                    uint64_t tag_id) override {
+    m_s.total++;
+    if (m_verbose) {
+      std::cout << "Entry: tag_id=" << tag_id << ", commit_tid="
+                << replay_entry.get_commit_tid() << std::endl;
+    }
+    bufferlist data = replay_entry.get_data();
+    librbd::journal::EventEntry event_entry;
+    int r = inspect_entry(data, event_entry, m_verbose);
+    if (r < 0) {
+      m_r = r;
+      m_s.error++;
+    }
+    return 0;
+  }
+
+  bool m_verbose;
+  Stats m_s;
+};
+
+// Run a JournalInspector over `journal_id`, always shutting it down;
+// returns the first error from exec() or shut_down(), 0 on success.
+static int do_inspect_journal(librados::IoCtx& io_ctx,
+                              const std::string& journal_id,
+                              bool verbose) {
+  JournalInspector inspector(io_ctx, journal_id, verbose);
+  int r = inspector.exec();
+  if (r < 0) {
+    // exec failed: still shut down, but report the exec error
+    inspector.shut_down();
+    return r;
+  }
+
+  r = inspector.shut_down();
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+// On-disk (JSON) representation of one exported journal entry: the tag
+// id, commit tid, decoded event type, and the raw encoded entry bytes.
+// Used by both the exporter (dump) and importer (decode_json).
+struct ExportEntry {
+  uint64_t tag_id;
+  uint64_t commit_tid;
+  int type;
+  bufferlist entry;
+
+  ExportEntry() : tag_id(0), commit_tid(0), type(0), entry() {}
+
+  ExportEntry(uint64_t tag_id, uint64_t commit_tid, int type,
+              const bufferlist& entry)
+    : tag_id(tag_id), commit_tid(commit_tid), type(type), entry(entry) {
+  }
+
+  void dump(Formatter *f) const {
+    ::encode_json("tag_id", tag_id, f);
+    ::encode_json("commit_tid", commit_tid, f);
+    ::encode_json("type", type, f);
+    ::encode_json("entry", entry, f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("tag_id", tag_id, obj);
+    JSONDecoder::decode_json("commit_tid", commit_tid, obj);
+    JSONDecoder::decode_json("type", type, obj);
+    JSONDecoder::decode_json("entry", entry, obj);
+  }
+};
+
+// "journal export" player: writes a "# journal_id: ..." header followed by
+// one length-prefixed JSON ExportEntry per journal entry to fd m_fd.  With
+// m_no_error, corrupt entries are counted but do not abort the export.
+class JournalExporter : public JournalPlayer {
+public:
+  JournalExporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+                  int fd, bool no_error, bool verbose) :
+    JournalPlayer(io_ctx, journal_id, "EXPORT"),
+    m_journal_id(journal_id),
+    m_fd(fd),
+    m_no_error(no_error),
+    m_verbose(verbose),
+    m_s() {
+  }
+
+  int exec() override {
+    std::string header("# journal_id: " + m_journal_id + "\n");
+    int r;
+    r = safe_write(m_fd, header.c_str(), header.size());
+    if (r < 0) {
+      std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+    r = JournalPlayer::exec();
+    m_s.print();
+    return r;
+  }
+
+private:
+  // Running tally of processed entries and errors.
+  struct Stats {
+    Stats() : total(0), error(0) {}
+
+    void print() {
+      std::cout << total << " entries processed, " << error << " errors"
+                << std::endl;
+    }
+
+    int total;
+    int error;
+  };
+
+  // Serialize one entry as "<size> <json>\n"; safe_write returns 0 on
+  // success, so the chained r == 0 checks write the three pieces in order.
+  int process_entry(::journal::ReplayEntry replay_entry,
+                    uint64_t tag_id) override {
+    m_s.total++;
+    int type = -1;
+    bufferlist entry = replay_entry.get_data();
+    librbd::journal::EventEntry event_entry;
+    int r = inspect_entry(entry, event_entry, m_verbose);
+    if (r < 0) {
+      m_s.error++;
+      m_r = r;
+      // with --no-error keep exporting the raw (undecodable) entry skipped
+      return m_no_error ? 0 : r;
+    } else {
+      type = event_entry.get_event_type();
+    }
+    ExportEntry export_entry(tag_id, replay_entry.get_commit_tid(), type,
+                             entry);
+    JSONFormatter f;
+    ::encode_json("event_entry", export_entry, &f);
+    std::ostringstream oss;
+    f.flush(oss);
+    std::string objstr = oss.str();
+    std::string header = stringify(objstr.size()) + " ";
+    r = safe_write(m_fd, header.c_str(), header.size());
+    if (r == 0) {
+      r = safe_write(m_fd, objstr.c_str(), objstr.size());
+    }
+    if (r == 0) {
+      r = safe_write(m_fd, "\n", 1);
+    }
+    if (r < 0) {
+      std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+                << std::endl;
+      m_s.error++;
+      return r;
+    }
+    return 0;
+  }
+
+  std::string m_journal_id;
+  int m_fd;
+  bool m_no_error;
+  bool m_verbose;
+  Stats m_s;
+};
+
+// Export `journal_id` to `path` ("-" = stdout).  The destination file is
+// created exclusively (O_EXCL) so an existing file is never clobbered.
+// Returns 0 on success or a negative errno.
+static int do_export_journal(librados::IoCtx& io_ctx,
+                             const std::string& journal_id,
+                             const std::string& path,
+                             bool no_error, bool verbose) {
+  int r;
+  int fd;
+  bool to_stdout = path == "-";
+  if (to_stdout) {
+    fd = STDOUT_FILENO;
+  } else {
+    fd = open(path.c_str(), O_WRONLY | O_CREAT | O_EXCL | O_BINARY, 0644);
+    if (fd < 0) {
+      r = -errno;
+      std::cerr << "rbd: error creating " << path << std::endl;
+      return r;
+    }
+#ifdef HAVE_POSIX_FADVISE
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+  }
+
+  JournalExporter exporter(io_ctx, journal_id, fd, no_error, verbose);
+  r = exporter.exec();
+
+  if (!to_stdout) {
+    close(fd);
+  }
+
+  // shut down regardless; only surface its error if exec succeeded
+  int shut_down_r = exporter.shut_down();
+  if (r == 0 && shut_down_r < 0) {
+    r = shut_down_r;
+  }
+
+  return r;
+}
+
+// "journal import" driver: parses the export format ("NNN {json}" records,
+// '#' comment lines, blank lines) from fd m_fd and appends each decoded
+// entry to the journal.  With m_no_error, bad records are skipped and the
+// first error is returned at the end.
+class JournalImporter {
+public:
+  JournalImporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+                  int fd, bool no_error, bool verbose) :
+    m_journaler(io_ctx, journal_id, "IMPORT"),
+    m_fd(fd),
+    m_no_error(no_error),
+    m_verbose(verbose) {
+  }
+
+  // Read the next record's JSON payload into `bl`.  Returns true on
+  // success; false on EOF (r set to 0) or error (r set negative).
+  bool read_entry(bufferlist& bl, int& r) {
+    // Entries are stored in the file using the following format:
+    //
+    //   # Optional comments
+    //   NNN {json encoded entry}
+    //   ...
+    //
+    // Where NNN is the encoded entry size.
+    bl.clear();
+    char buf[80];
+    // Skip line feed and comments (lines started with #).
+    while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+      if (buf[0] == '\n') {
+        continue;
+      } else if (buf[0] == '#') {
+        while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+          if (buf[0] == '\n') {
+            break;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    if (r < 0) {
+      // safe_read_exact reports short read (EOF) as -EDOM; treat as clean EOF
+      if (r == -EDOM) {
+        r = 0;
+      }
+      return false;
+    }
+    // Read entry size to buf.
+    if (!isdigit(buf[0])) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (digit expected)"
+                << std::endl;
+      return false;
+    }
+    // read digits until the separating space; NUL-terminate for atoi
+    for (size_t i = 1; i < sizeof(buf); i++) {
+      r = safe_read_exact(m_fd, buf + i, 1);
+      if (r < 0) {
+        std::cerr << "rbd: error reading import data" << std::endl;
+        return false;
+      }
+      if (!isdigit(buf[i])) {
+        if (buf[i] != ' ') {
+          r = -EINVAL;
+          std::cerr << "rbd: import data invalid format (space expected)"
+                    << std::endl;
+          return false;
+        }
+        buf[i] = '\0';
+        break;
+      }
+    }
+    int entry_size = atoi(buf);
+    if (entry_size == 0) {
+      r = -EINVAL;
+      std::cerr << "rbd: import data invalid format (zero entry size)"
+                << std::endl;
+      return false;
+    }
+    ceph_assert(entry_size > 0);
+    // Read entry.
+    r = bl.read_fd(m_fd, entry_size);
+    if (r < 0) {
+      std::cerr << "rbd: error reading from stdin: " << cpp_strerror(r)
+                << std::endl;
+      return false;
+    }
+    if (r != entry_size) {
+      std::cerr << "rbd: error reading from stdin: truncated"
+                << std::endl;
+      r = -EINVAL;
+      return false;
+    }
+    r = 0;
+    return true;
+  }
+
+  // Parse every record and append it; waits for all appends to flush.
+  // Returns 0 on success or the first error encountered.
+  int exec() {
+    int r = m_journaler.init();
+    if (r < 0) {
+      return r;
+    }
+    m_journaler.start_append(0);
+
+    int r1 = 0;
+    bufferlist bl;
+    int n = 0;
+    int error_count = 0;
+    while (read_entry(bl, r)) {
+      n++;
+      // presume failure; decremented again once the entry is appended
+      error_count++;
+      JSONParser p;
+      if (!p.parse(bl.c_str(), bl.length())) {
+        std::cerr << "rbd: error parsing input (entry " << n << ")"
+                  << std::endl;
+        r = -EINVAL;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      ExportEntry e;
+      try {
+        decode_json_obj(e, &p);
+      } catch (const JSONDecoder::err& err) {
+        std::cerr << "rbd: error json decoding import data (entry " << n << "):"
+                  << err.what() << std::endl;
+        r = -EINVAL;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      librbd::journal::EventEntry event_entry;
+      r = inspect_entry(e.entry, event_entry, m_verbose);
+      if (r < 0) {
+        std::cerr << "rbd: corrupted entry " << n << ": tag_tid=" << e.tag_id
+                  << ", commit_tid=" << e.commit_tid << std::endl;
+        if (m_no_error) {
+          r1 = r;
+          continue;
+        } else {
+          break;
+        }
+      }
+      m_journaler.append(e.tag_id, e.entry);
+      error_count--;
+    }
+
+    std::cout << n << " entries processed, " << error_count << " errors" << std::endl;
+
+    std::cout << "Waiting for journal append to complete..."  << std::endl;
+
+    C_SaferCond cond;
+    m_journaler.stop_append(&cond);
+    r = cond.wait();
+
+    if (r < 0) {
+      std::cerr << "failed to append journal: " << cpp_strerror(r) << std::endl;
+    }
+
+    // surface the first per-entry error if the final flush succeeded
+    if (r1 < 0 && r == 0) {
+      r = r1;
+    }
+    return r;
+  }
+
+  int shut_down() {
+    return m_journaler.shut_down();
+  }
+
+private:
+  Journaler m_journaler;
+  int m_fd;
+  bool m_no_error;
+  bool m_verbose;
+};
+
+// Import journal entries into `journal_id` from `path` ("-" = stdin).
+// Returns 0 on success or a negative errno.
+static int do_import_journal(librados::IoCtx& io_ctx,
+                             const std::string& journal_id,
+                             const std::string& path,
+                             bool no_error, bool verbose) {
+  int r;
+
+  int fd;
+  bool from_stdin = path == "-";
+  if (from_stdin) {
+    fd = STDIN_FILENO;
+  } else {
+    if ((fd = open(path.c_str(), O_RDONLY|O_BINARY)) < 0) {
+      r = -errno;
+      std::cerr << "rbd: error opening " << path << std::endl;
+      return r;
+    }
+#ifdef HAVE_POSIX_FADVISE
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+  }
+
+  JournalImporter importer(io_ctx, journal_id, fd, no_error, verbose);
+  r = importer.exec();
+
+  if (!from_stdin) {
+    close(fd);
+  }
+
+  // shut down regardless; only surface its error if exec succeeded
+  int shut_down_r = importer.shut_down();
+  if (r == 0 && shut_down_r < 0) {
+    r = shut_down_r;
+  }
+
+  return r;
+}
+
+// Register CLI arguments for "rbd journal info": journal spec plus the
+// common output format options.
+void get_info_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Entry point for "rbd journal info": resolve the journal, connect to the
+// cluster, and print its immutable metadata.  Returns 0 or a negative errno.
+int execute_info(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_journal_info(rados, io_ctx, journal_name, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: journal info: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+
+}
+
+// Register CLI arguments for "rbd journal status": journal spec plus the
+// common output format options.
+void get_status_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Entry point for "rbd journal status": resolve the journal, connect, and
+// print its mutable metadata.  Returns 0 or a negative errno.
+int execute_status(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string journal_name;
+  int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+                                 &pool_name, &namespace_name, &journal_name);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_journal_status(io_ctx, journal_name, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: journal status: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Register CLI arguments for `rbd journal reset`: only the journal spec,
+// no formatter (reset produces no structured output).
+void get_reset_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+// Handler for `rbd journal reset`: resolve the journal spec and delegate
+// to do_reset_journal. Returns 0 on success or a negative errno.
+int execute_reset(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_reset_journal(io_ctx, journal_name);
+ if (r < 0) {
+ std::cerr << "rbd: journal reset: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Register CLI arguments for `rbd journal client disconnect`: journal spec
+// plus an optional --client-id (omitting it disconnects all clients).
+void get_client_disconnect_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ ("client-id", po::value<std::string>(),
+ "client ID (or leave unspecified to disconnect all)");
+}
+
+// Handler for `rbd journal client disconnect`: flags the given journal
+// client (or every client when --client-id is absent, i.e. client_id stays
+// empty) as disconnected. Returns 0 on success or a negative errno.
+int execute_client_disconnect(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string client_id;
+ if (vm.count("client-id")) {
+ client_id = vm["client-id"].as<std::string>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_disconnect_journal_client(io_ctx, journal_name, client_id);
+ if (r < 0) {
+ std::cerr << "rbd: journal client disconnect: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Register CLI arguments for `rbd journal inspect`: journal spec plus the
+// shared --verbose switch.
+void get_inspect_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_verbose_option(options);
+}
+
+// Handler for `rbd journal inspect`: walks the journal checking for
+// structural errors via do_inspect_journal. Returns 0 or a negative errno.
+int execute_inspect(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_inspect_journal(io_ctx, journal_name, vm[at::VERBOSE].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: journal inspect: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Register CLI arguments for `rbd journal export`: source journal spec, a
+// destination path ('-' means stdout), plus --verbose and --no-error.
+void get_export_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_path_options(positional, options,
+ "export file (or '-' for stdout)");
+ at::add_verbose_option(options);
+ at::add_no_error_option(options);
+}
+
+// Handler for `rbd journal export`: dumps journal entries to the given
+// path. NO_ERR makes the export continue past per-entry failures.
+// Returns 0 on success or a negative errno on failure.
+int execute_export(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string path;
+ r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_export_journal(io_ctx, journal_name, path, vm[at::NO_ERR].as<bool>(),
+ vm[at::VERBOSE].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: journal export: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Register CLI arguments for `rbd journal import`: source path first
+// ('-' means stdin), then the destination journal spec, plus --verbose
+// and --no-error.
+void get_import_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_path_options(positional, options,
+ "import file (or '-' for stdin)");
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_verbose_option(options);
+ at::add_no_error_option(options);
+}
+
+// Handler for `rbd journal import`: reads entries from a file (or stdin)
+// into the destination journal. Note the path is parsed before the journal
+// spec, mirroring the positional-argument order declared above.
+// Returns 0 on success or a negative errno on failure.
+int execute_import(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string path;
+ size_t arg_index = 0;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_DEST, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_import_journal(io_ctx, journal_name, path, vm[at::NO_ERR].as<bool>(),
+ vm[at::VERBOSE].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: journal import: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Static registration of the `rbd journal ...` subcommands with the Shell
+// dispatcher; constructing each Shell::Action at namespace scope wires the
+// command name to its argument-builder and handler.
+Shell::Action action_info(
+ {"journal", "info"}, {}, "Show information about image journal.", "",
+ &get_info_arguments, &execute_info);
+
+Shell::Action action_status(
+ {"journal", "status"}, {}, "Show status of image journal.", "",
+ &get_status_arguments, &execute_status);
+
+Shell::Action action_reset(
+ {"journal", "reset"}, {}, "Reset image journal.", "",
+ &get_reset_arguments, &execute_reset);
+
+Shell::Action action_inspect(
+ {"journal", "inspect"}, {}, "Inspect image journal for structural errors.", "",
+ &get_inspect_arguments, &execute_inspect);
+
+Shell::Action action_export(
+ {"journal", "export"}, {}, "Export image journal.", "",
+ &get_export_arguments, &execute_export);
+
+Shell::Action action_import(
+ {"journal", "import"}, {}, "Import image journal.", "",
+ &get_import_arguments, &execute_import);
+
+Shell::Action action_disconnect(
+ {"journal", "client", "disconnect"}, {},
+ "Flag image journal client as disconnected.", "",
+ &get_client_disconnect_arguments, &execute_client_disconnect);
+
+} // namespace journal
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc
new file mode 100644
index 000000000..02ceb7723
--- /dev/null
+++ b/src/tools/rbd/action/Kernel.cc
@@ -0,0 +1,681 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/krbd.h"
+#include "include/stringify.h"
+#include "include/uuid.h"
+#include "common/config_proxy.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/strtol.h"
+#include "common/Formatter.h"
+#include "msg/msg_types.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/scope_exit.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace kernel {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+typedef std::map<std::string, std::string> MapOptions;
+
+// Validate a map-option value as a UUID; returns the canonical string
+// form, or "" when the value does not parse (callers treat "" as invalid).
+static std::string map_option_uuid_cb(const char *value_char)
+{
+ uuid_d u;
+ if (!u.parse(value_char))
+ return "";
+
+ return stringify(u);
+}
+
+// Validate a map-option value as an IP address; the whole string must be
+// consumed by the parse. Returns the sockaddr's string form, or "" on
+// invalid input.
+static std::string map_option_ip_cb(const char *value_char)
+{
+ entity_addr_t a;
+ const char *endptr;
+ if (!a.parse(value_char, &endptr) ||
+ endptr != value_char + strlen(value_char)) {
+ return "";
+ }
+
+ return stringify(a.get_sockaddr());
+}
+
+// Validate a map-option value as a non-negative base-10 integer; returns
+// its normalized string form, or "" on parse error or negative value.
+static std::string map_option_int_cb(const char *value_char)
+{
+ std::string err;
+ int d = strict_strtol(value_char, 10, &err);
+ if (!err.empty() || d < 0)
+ return "";
+
+ return stringify(d);
+}
+
+// Pass-through validator: accepts any string value unchanged.
+static std::string map_option_string_cb(const char *value_char)
+{
+ return value_char;
+}
+
+// Validator for read_from_replica: only "no", "balance" or "localize" are
+// accepted; anything else yields "" (rejected by the caller).
+static std::string map_option_read_from_replica_cb(const char *value_char)
+{
+ if (!strcmp(value_char, "no") || !strcmp(value_char, "balance") ||
+ !strcmp(value_char, "localize")) {
+ return value_char;
+ }
+ return "";
+}
+
+// Validator for compression_hint: only "none", "compressible" or
+// "incompressible" are accepted; anything else yields "".
+static std::string map_option_compression_hint_cb(const char *value_char)
+{
+ if (!strcmp(value_char, "none") || !strcmp(value_char, "compressible") ||
+ !strcmp(value_char, "incompressible")) {
+ return value_char;
+ }
+ return "";
+}
+
+// Validator for ms_mode (messenger mode): accepts the fixed set of kernel
+// messenger modes; anything else yields "".
+static std::string map_option_ms_mode_cb(const char *value_char)
+{
+ if (!strcmp(value_char, "legacy") || !strcmp(value_char, "crc") ||
+ !strcmp(value_char, "secure") || !strcmp(value_char, "prefer-crc") ||
+ !strcmp(value_char, "prefer-secure")) {
+ return value_char;
+ }
+ return "";
+}
+
+// Store (or overwrite) a map option under `key`; `val` is the literal text
+// later joined into the kernel options string.
+static void put_map_option(const std::string &key, const std::string &val,
+ MapOptions* map_options)
+{
+ (*map_options)[key] = val;
+}
+
+// Validate a key=value map option through `parse_cb` and store it as
+// "opt=value". Returns 0 on success, -EINVAL when the value is missing or
+// rejected by the validator (which signals rejection by returning "").
+static int put_map_option_value(const std::string &opt, const char *value_char,
+ std::string (*parse_cb)(const char *),
+ MapOptions* map_options)
+{
+ if (!value_char || *value_char == '\0') {
+ std::cerr << "rbd: " << opt << " option requires a value" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string value = parse_cb(value_char);
+ if (value.empty()) {
+ std::cerr << "rbd: invalid " << opt << " value '" << value_char << "'"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ put_map_option(opt, opt + "=" + value, map_options);
+ return 0;
+}
+
+// Parse a comma-separated `rbd map -o ...` option string into *map_options.
+// Each recognized key is validated with the matching map_option_*_cb;
+// flag-style options (share/noshare, rw/ro, ...) are stored verbatim under
+// a shared key so the last occurrence wins. Returns 0 on success, -EINVAL
+// on any unknown option or invalid value.
+static int parse_map_options(const std::string &options_string,
+ MapOptions* map_options)
+{
+ // strtok mutates its input, so work on a heap copy freed on every exit
+ // path via BOOST_SCOPE_EXIT.
+ char *options = strdup(options_string.c_str());
+ BOOST_SCOPE_EXIT(options) {
+ free(options);
+ } BOOST_SCOPE_EXIT_END;
+
+ // NOTE(review): first call splits on ", " while subsequent calls split
+ // on "," only — presumably to tolerate a leading "opt1, opt2" form;
+ // confirm against upstream intent.
+ for (char *this_char = strtok(options, ", ");
+ this_char != NULL;
+ this_char = strtok(NULL, ",")) {
+ char *value_char;
+
+ // split "key=value" in place; value_char is NULL for bare flags
+ if ((value_char = strchr(this_char, '=')) != NULL)
+ *value_char++ = '\0';
+
+ if (!strcmp(this_char, "fsid")) {
+ if (put_map_option_value("fsid", value_char, map_option_uuid_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "ip")) {
+ if (put_map_option_value("ip", value_char, map_option_ip_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "share") || !strcmp(this_char, "noshare")) {
+ put_map_option("share", this_char, map_options);
+ } else if (!strcmp(this_char, "crc") || !strcmp(this_char, "nocrc")) {
+ put_map_option("crc", this_char, map_options);
+ } else if (!strcmp(this_char, "cephx_require_signatures") ||
+ !strcmp(this_char, "nocephx_require_signatures")) {
+ put_map_option("cephx_require_signatures", this_char, map_options);
+ } else if (!strcmp(this_char, "tcp_nodelay") ||
+ !strcmp(this_char, "notcp_nodelay")) {
+ put_map_option("tcp_nodelay", this_char, map_options);
+ } else if (!strcmp(this_char, "cephx_sign_messages") ||
+ !strcmp(this_char, "nocephx_sign_messages")) {
+ put_map_option("cephx_sign_messages", this_char, map_options);
+ } else if (!strcmp(this_char, "mount_timeout")) {
+ if (put_map_option_value("mount_timeout", value_char, map_option_int_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "osd_request_timeout")) {
+ if (put_map_option_value("osd_request_timeout", value_char,
+ map_option_int_cb, map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "lock_timeout")) {
+ if (put_map_option_value("lock_timeout", value_char, map_option_int_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "osdkeepalive")) {
+ if (put_map_option_value("osdkeepalive", value_char, map_option_int_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "osd_idle_ttl")) {
+ if (put_map_option_value("osd_idle_ttl", value_char, map_option_int_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "rw") || !strcmp(this_char, "ro")) {
+ put_map_option("rw", this_char, map_options);
+ } else if (!strcmp(this_char, "queue_depth")) {
+ if (put_map_option_value("queue_depth", value_char, map_option_int_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "lock_on_read")) {
+ put_map_option("lock_on_read", this_char, map_options);
+ } else if (!strcmp(this_char, "exclusive")) {
+ put_map_option("exclusive", this_char, map_options);
+ } else if (!strcmp(this_char, "notrim")) {
+ put_map_option("notrim", this_char, map_options);
+ } else if (!strcmp(this_char, "abort_on_full")) {
+ put_map_option("abort_on_full", this_char, map_options);
+ } else if (!strcmp(this_char, "alloc_size")) {
+ if (put_map_option_value("alloc_size", value_char, map_option_int_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "crush_location")) {
+ if (put_map_option_value("crush_location", value_char,
+ map_option_string_cb, map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "read_from_replica")) {
+ if (put_map_option_value("read_from_replica", value_char,
+ map_option_read_from_replica_cb, map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "compression_hint")) {
+ if (put_map_option_value("compression_hint", value_char,
+ map_option_compression_hint_cb, map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "ms_mode")) {
+ if (put_map_option_value("ms_mode", value_char, map_option_ms_mode_cb,
+ map_options))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "rxbounce")) {
+ put_map_option("rxbounce", this_char, map_options);
+ } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) {
+ put_map_option("udev", this_char, map_options);
+ } else {
+ std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+// Parse a comma-separated `rbd unmap -o ...` option string. Only "force"
+// and "udev"/"noudev" are recognized; anything else is rejected with
+// -EINVAL. Mirrors parse_map_options' strtok/strdup structure.
+static int parse_unmap_options(const std::string &options_string,
+ MapOptions* unmap_options)
+{
+ char *options = strdup(options_string.c_str());
+ BOOST_SCOPE_EXIT(options) {
+ free(options);
+ } BOOST_SCOPE_EXIT_END;
+
+ for (char *this_char = strtok(options, ", ");
+ this_char != NULL;
+ this_char = strtok(NULL, ",")) {
+ char *value_char;
+
+ // split "key=value" in place; all recognized unmap options are flags
+ if ((value_char = strchr(this_char, '=')) != NULL)
+ *value_char++ = '\0';
+
+ if (!strcmp(this_char, "force")) {
+ put_map_option("force", this_char, unmap_options);
+ } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) {
+ put_map_option("udev", this_char, unmap_options);
+ } else {
+ std::cerr << "rbd: unknown unmap option '" << this_char << "'"
+ << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+// List currently mapped krbd devices via krbd_showmapped. Compiled to a
+// stub returning -EOPNOTSUPP when krbd support is disabled at build time.
+static int do_kernel_list(Formatter *f) {
+#if defined(WITH_KRBD)
+ struct krbd_ctx *krbd;
+ int r;
+
+ r = krbd_create_from_context(g_ceph_context, 0, &krbd);
+ if (r < 0)
+ return r;
+
+ r = krbd_showmapped(krbd, f);
+
+ krbd_destroy(krbd);
+ return r;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+}
+
+// Compute which of the image's enabled features the running kernel does
+// not support, by reading the hex bitmask from
+// /sys/bus/rbd/supported_features and masking it against image.features().
+// Returns 0 and fills *unsupported_features, or a negative errno.
+static int get_unsupported_features(librbd::Image &image,
+ uint64_t *unsupported_features)
+{
+ char buf[20];
+ uint64_t features, supported_features;
+ int r;
+
+ r = safe_read_file("/sys/bus/rbd/", "supported_features", buf,
+ sizeof(buf) - 1);
+ if (r < 0)
+ return r;
+
+ // NUL-terminate the sysfs payload before parsing it as hex
+ buf[r] = '\0';
+ try {
+ supported_features = std::stoull(buf, nullptr, 16);
+ } catch (...) {
+ return -EINVAL;
+ }
+
+ r = image.features(&features);
+ if (r < 0)
+ return r;
+
+ *unsupported_features = features & ~supported_features;
+ return 0;
+}
+
+/*
+ * hint user to check syslog for krbd related messages and provide suggestions
+ * based on errno return by krbd_map(). also note that even if some librbd calls
+ * fail, we at least dump the "try dmesg..." message to aid debugging.
+ */
+// Best-effort diagnostics only: every failure path jumps to `done`, which
+// still prints the dmesg hint. For -ENXIO on a v2-format image, the image
+// is re-opened read-only to report which features the kernel lacks.
+static void print_error_description(const char *poolname,
+ const char *nspace_name,
+ const char *imgname,
+ const char *snapname,
+ int maperrno)
+{
+ int r;
+ uint8_t oldformat;
+ librados::Rados rados;
+ librados::IoCtx ioctx;
+ librbd::Image image;
+
+ // nothing useful to add when the image simply does not exist
+ if (maperrno == -ENOENT)
+ goto done;
+
+ r = utils::init_and_open_image(poolname, nspace_name, imgname, "", snapname,
+ true, &rados, &ioctx, &image);
+ if (r < 0)
+ goto done;
+
+ r = image.old_format(&oldformat);
+ if (r < 0)
+ goto done;
+
+ /*
+ * kernel returns -ENXIO when mapping a V2 image due to unsupported feature
+ * set - so, hint about that too...
+ */
+ if (!oldformat && (maperrno == -ENXIO)) {
+ uint64_t unsupported_features;
+ bool need_terminate = true;
+
+ std::cout << "RBD image feature set mismatch. ";
+ r = get_unsupported_features(image, &unsupported_features);
+ if (r == 0 && (unsupported_features & ~RBD_FEATURES_ALL) == 0) {
+ // distinguish features that cannot be disabled from ones the user
+ // could turn off with `rbd feature disable`
+ uint64_t immutable = RBD_FEATURES_ALL & ~(RBD_FEATURES_MUTABLE |
+ RBD_FEATURES_DISABLE_ONLY);
+ if (unsupported_features & immutable) {
+ std::cout << "This image cannot be mapped because the following "
+ << "immutable features are unsupported by the kernel:";
+ unsupported_features &= immutable;
+ need_terminate = false;
+ } else {
+ std::cout << "You can disable features unsupported by the kernel "
+ << "with \"rbd feature disable ";
+ if (poolname != utils::get_default_pool_name() || *nspace_name) {
+ std::cout << poolname << "/";
+ }
+ if (*nspace_name) {
+ std::cout << nspace_name << "/";
+ }
+ std::cout << imgname;
+ }
+ } else {
+ std::cout << "Try disabling features unsupported by the kernel "
+ << "with \"rbd feature disable";
+ unsupported_features = 0;
+ }
+ // print the human-readable name of each offending feature bit
+ for (auto it : at::ImageFeatures::FEATURE_MAPPING) {
+ if (it.first & unsupported_features) {
+ std::cout << " " << it.second;
+ }
+ }
+ if (need_terminate)
+ std::cout << "\"";
+ std::cout << "." << std::endl;
+ }
+
+ done:
+ std::cout << "In some cases useful info is found in syslog - try \"dmesg | tail\"." << std::endl;
+}
+
+// Map an image through the kernel rbd driver. Consumes map_options:
+// "rw"/"udev" entries are translated or dropped, the remainder are joined
+// into the comma-separated option string passed to krbd_map. Prints the
+// resulting device node on success; returns 0 or a negative errno.
+static int do_kernel_map(const char *poolname, const char *nspace_name,
+ const char *imgname, const char *snapname,
+ MapOptions&& map_options)
+{
+#if defined(WITH_KRBD)
+ struct krbd_ctx *krbd;
+ std::ostringstream oss;
+ uint32_t flags = 0;
+ char *devnode;
+ int r;
+
+ for (auto it = map_options.begin(); it != map_options.end(); ) {
+ // for compatibility with < 3.7 kernels, assume that rw is on by
+ // default and omit it even if it was specified by the user
+ // (see ceph.git commit fb0f1986449b)
+ if (it->first == "rw" && it->second == "rw") {
+ it = map_options.erase(it);
+ } else if (it->first == "udev") {
+ // udev handling is a krbd context flag, not a kernel option string
+ if (it->second == "noudev") {
+ flags |= KRBD_CTX_F_NOUDEV;
+ }
+ it = map_options.erase(it);
+ } else {
+ if (it != map_options.begin())
+ oss << ",";
+ oss << it->second;
+ ++it;
+ }
+ }
+
+ r = krbd_create_from_context(g_ceph_context, flags, &krbd);
+ if (r < 0)
+ return r;
+
+ // warn (but do not abort) if the image is already mapped elsewhere
+ r = krbd_is_mapped(krbd, poolname, nspace_name, imgname, snapname, &devnode);
+ if (r < 0) {
+ std::cerr << "rbd: warning: can't get image map information: "
+ << cpp_strerror(r) << std::endl;
+ } else if (r > 0) {
+ std::cerr << "rbd: warning: image already mapped as " << devnode
+ << std::endl;
+ free(devnode);
+ }
+
+ r = krbd_map(krbd, poolname, nspace_name, imgname, snapname,
+ oss.str().c_str(), &devnode);
+ if (r < 0) {
+ print_error_description(poolname, nspace_name, imgname, snapname, r);
+ goto out;
+ }
+
+ std::cout << devnode << std::endl;
+
+ free(devnode);
+out:
+ krbd_destroy(krbd);
+ return r;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+}
+
+// Unmap a krbd device either by device node (`dev` non-NULL) or by image
+// spec. "udev" options become the KRBD_CTX_F_NOUDEV context flag; any
+// remaining options (e.g. "force") are joined and passed through.
+// Returns 0 or a negative errno; stubbed out without krbd support.
+static int do_kernel_unmap(const char *dev, const char *poolname,
+ const char *nspace_name, const char *imgname,
+ const char *snapname, MapOptions&& unmap_options)
+{
+#if defined(WITH_KRBD)
+ struct krbd_ctx *krbd;
+ std::ostringstream oss;
+ uint32_t flags = 0;
+ int r;
+
+ for (auto it = unmap_options.begin(); it != unmap_options.end(); ) {
+ if (it->first == "udev") {
+ if (it->second == "noudev") {
+ flags |= KRBD_CTX_F_NOUDEV;
+ }
+ it = unmap_options.erase(it);
+ } else {
+ if (it != unmap_options.begin())
+ oss << ",";
+ oss << it->second;
+ ++it;
+ }
+ }
+
+ r = krbd_create_from_context(g_ceph_context, flags, &krbd);
+ if (r < 0)
+ return r;
+
+ if (dev)
+ r = krbd_unmap(krbd, dev, oss.str().c_str());
+ else
+ r = krbd_unmap_by_spec(krbd, poolname, nspace_name, imgname, snapname,
+ oss.str().c_str());
+
+ krbd_destroy(krbd);
+ return r;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+}
+
+// Handler for `rbd device list` (krbd): lists mapped devices without
+// connecting to the cluster (only the local context is initialized).
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ at::Format::Formatter formatter;
+ int r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::init_context();
+
+ r = do_kernel_list(formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: device list failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// Handler for `rbd device map` (krbd): parses -o map options, layers the
+// common --read-only/--exclusive switches on top, merges in the image's
+// rbd_default_map_options config (explicit options win), then maps via
+// do_kernel_map. Returns 0 on success or a negative errno.
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string nspace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ MapOptions map_options;
+ if (vm.count("options")) {
+ for (auto &options : vm["options"].as<std::vector<std::string>>()) {
+ r = parse_map_options(options, &map_options);
+ if (r < 0) {
+ std::cerr << "rbd: couldn't parse map options" << std::endl;
+ return r;
+ }
+ }
+ }
+
+ // parse options common to all device types after parsing krbd-specific
+ // options so that common options win (in particular "-o rw --read-only"
+ // should result in read-only mapping)
+ if (vm["read-only"].as<bool>()) {
+ put_map_option("rw", "ro", &map_options);
+ }
+ if (vm["exclusive"].as<bool>()) {
+ put_map_option("exclusive", "exclusive", &map_options);
+ }
+ // quiesce hooks are an rbd-nbd feature; warn rather than fail for krbd
+ if (vm["quiesce"].as<bool>()) {
+ std::cerr << "rbd: warning: quiesce is not supported" << std::endl;
+ }
+ if (vm.count("quiesce-hook")) {
+ std::cerr << "rbd: warning: quiesce-hook is not supported" << std::endl;
+ }
+
+ // connect to the cluster to get the default pool and the default map
+ // options
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::normalize_pool_name(&pool_name);
+
+ librados::IoCtx ioctx;
+ librbd::Image image;
+ r = utils::init_io_ctx(rados, pool_name, nspace_name, &ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = utils::open_image(ioctx, image_name, true, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ // pull the per-image/pool/global default map options from config
+ MapOptions default_map_options;
+ std::vector<librbd::config_option_t> options;
+ image.config_list(&options);
+ for (const auto &option : options) {
+ if (option.name == "rbd_default_map_options") {
+ r = parse_map_options(option.value, &default_map_options);
+ if (r < 0) {
+ std::cerr << "rbd: couldn't parse default map options" << std::endl;
+ return r;
+ }
+
+ break;
+ }
+ }
+
+ // defaults only fill gaps; user-specified options take precedence
+ for (auto& [key, value] : default_map_options) {
+ if (map_options.count(key) == 0) {
+ map_options[key] = value;
+ }
+ }
+
+ r = do_kernel_map(pool_name.c_str(), nspace_name.c_str(), image_name.c_str(),
+ snap_name.c_str(), std::move(map_options));
+ if (r < 0) {
+ std::cerr << "rbd: map failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Handler for `rbd device unmap` (krbd): accepts either a /dev/... device
+// path or an image/snap spec (exactly one is required), parses -o unmap
+// options, and delegates to do_kernel_unmap. Returns 0 or a negative errno.
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ // a first positional argument starting with /dev/ selects unmap-by-device
+ std::string device_name = utils::get_positional_argument(vm, 0);
+ if (!boost::starts_with(device_name, "/dev/")) {
+ device_name.clear();
+ }
+
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string nspace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r;
+ if (device_name.empty()) {
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
+ &image_name, &snap_name, false, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (device_name.empty() && image_name.empty()) {
+ std::cerr << "rbd: unmap requires either image name or device path"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ MapOptions unmap_options;
+ if (vm.count("options")) {
+ for (auto &options : vm["options"].as<std::vector<std::string>>()) {
+ r = parse_unmap_options(options, &unmap_options);
+ if (r < 0) {
+ std::cerr << "rbd: couldn't parse unmap options" << std::endl;
+ return r;
+ }
+ }
+ }
+
+ if (device_name.empty() && pool_name.empty()) {
+ // connect to the cluster to get the default pool
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::normalize_pool_name(&pool_name);
+ }
+
+ r = do_kernel_unmap(device_name.empty() ? nullptr : device_name.c_str(),
+ pool_name.c_str(), nspace_name.c_str(),
+ image_name.c_str(), snap_name.c_str(),
+ std::move(unmap_options));
+ if (r < 0) {
+ std::cerr << "rbd: unmap failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// `rbd device attach` is not implemented for the krbd backend; always
+// reports -EOPNOTSUPP with a backend-appropriate message.
+int execute_attach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if defined(WITH_KRBD)
+ std::cerr << "rbd: krbd does not support attach" << std::endl;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+#endif
+ return -EOPNOTSUPP;
+}
+
+// `rbd device detach` is not implemented for the krbd backend; always
+// reports -EOPNOTSUPP with a backend-appropriate message.
+int execute_detach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if defined(WITH_KRBD)
+ std::cerr << "rbd: krbd does not support detach" << std::endl;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+#endif
+ return -EOPNOTSUPP;
+}
+
+} // namespace kernel
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/List.cc b/src/tools/rbd/action/List.cc
new file mode 100644
index 000000000..e70835102
--- /dev/null
+++ b/src/tools/rbd/action/List.cc
@@ -0,0 +1,346 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "include/types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/bind/bind.hpp>
+#include <boost/program_options.hpp>
+#include "global/global_context.h"
+
+namespace rbd {
+
+namespace action {
+namespace list {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+using namespace boost::placeholders;
+
+// Per-worker lifecycle for the async image-open pipeline in do_list:
+// IDLE -> OPENED (aio open issued/completed) -> DONE (aio close issued).
+enum WorkerState {
+ STATE_IDLE = 0,
+ STATE_OPENED,
+ STATE_DONE
+} ;
+
+// One slot in do_list's bounded pool of in-flight image opens: holds the
+// image handle, the pending aio completion (owned until released), the
+// state-machine state, and the image name/id being processed.
+struct WorkerEntry {
+ librbd::Image img;
+ librbd::RBD::AioCompletion* completion;
+ WorkerState state;
+ string name;
+ string id;
+
+ WorkerEntry() {
+ state = STATE_IDLE;
+ completion = nullptr;
+ }
+};
+
+
+// Emit one image's long-listing row(s): the image itself plus one row per
+// user-namespace snapshot, including parent (clone) info, format, protect
+// and lock status. Output goes to the Formatter `f` if non-null, otherwise
+// to the TextTable `tbl`. Returns 0 on success or a negative errno.
+int list_process_image(librados::Rados* rados, WorkerEntry* w, bool lflag, Formatter *f, TextTable &tbl)
+{
+ int r = 0;
+ librbd::image_info_t info;
+ std::string parent;
+
+ // handle second-nth trips through loop
+ librbd::linked_image_spec_t parent_image_spec;
+ librbd::snap_spec_t parent_snap_spec;
+ r = w->img.get_parent(&parent_image_spec, &parent_snap_spec);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ // -ENOENT simply means the image is not a clone
+ bool has_parent = false;
+ if (r != -ENOENT) {
+ parent = parent_image_spec.pool_name + "/";
+ if (!parent_image_spec.pool_namespace.empty()) {
+ parent += parent_image_spec.pool_namespace + "/";
+ }
+ parent += parent_image_spec.image_name + "@" + parent_snap_spec.name;
+ has_parent = true;
+ }
+
+ if (w->img.stat(info, sizeof(info)) < 0) {
+ return -EINVAL;
+ }
+
+ uint8_t old_format;
+ w->img.old_format(&old_format);
+
+ std::list<librbd::locker_t> lockers;
+ bool exclusive;
+ r = w->img.list_lockers(&lockers, &exclusive, NULL);
+ if (r < 0)
+ return r;
+ std::string lockstr;
+ if (!lockers.empty()) {
+ lockstr = (exclusive) ? "excl" : "shr";
+ }
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("image", w->name);
+ f->dump_string("id", w->id);
+ f->dump_unsigned("size", info.size);
+ if (has_parent) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image_spec.pool_name);
+ f->dump_string("pool_namespace", parent_image_spec.pool_namespace);
+ f->dump_string("image", parent_image_spec.image_name);
+ f->dump_string("snapshot", parent_snap_spec.name);
+ f->close_section();
+ }
+ f->dump_int("format", old_format ? 1 : 2);
+ if (!lockers.empty())
+ f->dump_string("lock_type", exclusive ? "exclusive" : "shared");
+ f->close_section();
+ } else {
+ tbl << w->name
+ << stringify(byte_u_t(info.size))
+ << parent
+ << ((old_format) ? '1' : '2')
+ << "" // protect doesn't apply to images
+ << lockstr
+ << TextTable::endrow;
+ }
+
+ // now repeat per snapshot, filtering out non-user snapshot namespaces
+ std::vector<librbd::snap_info_t> snaplist;
+ if (w->img.snap_list(snaplist) >= 0 && !snaplist.empty()) {
+ snaplist.erase(remove_if(snaplist.begin(),
+ snaplist.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &w->img, _1)),
+ snaplist.end());
+ for (std::vector<librbd::snap_info_t>::iterator s = snaplist.begin();
+ s != snaplist.end(); ++s) {
+ bool is_protected;
+ bool has_parent = false;
+ parent.clear();
+ // switch the image handle to this snapshot before querying it
+ w->img.snap_set(s->name.c_str());
+ r = w->img.snap_is_protected(s->name.c_str(), &is_protected);
+ if (r < 0)
+ return r;
+ if (w->img.get_parent(&parent_image_spec, &parent_snap_spec) >= 0) {
+ parent = parent_image_spec.pool_name + "/";
+ if (!parent_image_spec.pool_namespace.empty()) {
+ parent += parent_image_spec.pool_namespace + "/";
+ }
+ parent += parent_image_spec.image_name + "@" + parent_snap_spec.name;
+ has_parent = true;
+ }
+ if (f) {
+ f->open_object_section("snapshot");
+ f->dump_string("image", w->name);
+ f->dump_string("id", w->id);
+ f->dump_string("snapshot", s->name);
+ f->dump_unsigned("snapshot_id", s->id);
+ f->dump_unsigned("size", s->size);
+ if (has_parent) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image_spec.pool_name);
+ f->dump_string("pool_namespace", parent_image_spec.pool_namespace);
+ f->dump_string("image", parent_image_spec.image_name);
+ f->dump_string("snapshot", parent_snap_spec.name);
+ f->close_section();
+ }
+ f->dump_int("format", old_format ? 1 : 2);
+ f->dump_string("protected", is_protected ? "true" : "false");
+ f->close_section();
+ } else {
+ tbl << w->name + "@" + s->name
+ << stringify(byte_u_t(s->size))
+ << parent
+ << ((old_format) ? '1' : '2')
+ << (is_protected ? "yes" : "")
+ << "" // locks don't apply to snaps
+ << TextTable::endrow;
+ }
+ }
+ }
+
+ return 0;
+}
+
+// List images in a pool/namespace. Without -l, just prints names. With -l,
+// opens images read-only through a bounded pool of async workers (capped
+// at rbd_concurrent_management_ops, clamped to [1,32]) and emits a
+// detailed row per image/snapshot. Returns 0 or a negative errno.
+int do_list(const std::string &pool_name, const std::string& namespace_name,
+ bool lflag, Formatter *f) {
+ std::vector<WorkerEntry*> workers;
+ std::vector<librbd::image_spec_t> images;
+ librados::Rados rados;
+ librbd::RBD rbd;
+ librados::IoCtx ioctx;
+
+ int r = utils::init(pool_name, namespace_name, &rados, &ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ // clamp worker-pool size to a sane range
+ int threads = g_conf().get_val<uint64_t>("rbd_concurrent_management_ops");
+ if (threads < 1) {
+ threads = 1;
+ }
+ if (threads > 32) {
+ threads = 32;
+ }
+
+ utils::disable_cache();
+
+ r = rbd.list2(ioctx, &images);
+ if (r < 0)
+ return r;
+
+ // short (name-only) listing needs no image opens at all
+ if (!lflag) {
+ if (f)
+ f->open_array_section("images");
+ for (auto& image : images) {
+ if (f)
+ f->dump_string("name", image.name);
+ else
+ std::cout << image.name << std::endl;
+ }
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+ return 0;
+ }
+
+ TextTable tbl;
+
+ if (f) {
+ f->open_array_section("images");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("FMT", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("PROT", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("LOCK", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (size_t left = 0; left < std::min<size_t>(threads, images.size());
+ left++) {
+ workers.push_back(new WorkerEntry());
+ }
+
+ // drive each worker's IDLE -> OPENED -> DONE state machine until every
+ // image has been consumed and all workers are idle again
+ auto i = images.begin();
+ while (true) {
+ size_t workers_idle = 0;
+ for (auto comp : workers) {
+ switch (comp->state) {
+ case STATE_DONE:
+ comp->completion->wait_for_complete();
+ comp->state = STATE_IDLE;
+ comp->completion->release();
+ comp->completion = nullptr;
+ // we want it to fall through in this case
+ case STATE_IDLE:
+ if (i == images.end()) {
+ workers_idle++;
+ continue;
+ }
+ comp->name = i->name;
+ comp->id = i->id;
+ comp->completion = new librbd::RBD::AioCompletion(nullptr, nullptr);
+ r = rbd.aio_open_read_only(ioctx, comp->img, i->name.c_str(), nullptr,
+ comp->completion);
+ i++;
+ comp->state = STATE_OPENED;
+ break;
+ case STATE_OPENED:
+ comp->completion->wait_for_complete();
+ // image might disappear between rbd.list() and rbd.open(); ignore
+ // that, warn about other possible errors (EPERM, say, for opening
+ // an old-format image, because you need execute permission for the
+ // class method)
+ r = comp->completion->get_return_value();
+ comp->completion->release();
+ if (r < 0) {
+ std::cerr << "rbd: error opening " << comp->name << ": "
+ << cpp_strerror(r) << std::endl;
+
+ // in any event, continue to next image
+ comp->state = STATE_IDLE;
+ continue;
+ }
+ r = list_process_image(&rados, comp, lflag, f, tbl);
+ if (r < 0) {
+ std::cerr << "rbd: error processing image " << comp->name << ": "
+ << cpp_strerror(r) << std::endl;
+ }
+ comp->completion = new librbd::RBD::AioCompletion(nullptr, nullptr);
+ r = comp->img.aio_close(comp->completion);
+ comp->state = STATE_DONE;
+ break;
+ }
+ }
+ if (workers_idle == workers.size()) {
+ break;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (!images.empty()) {
+ std::cout << tbl;
+ }
+
+ rados.shutdown();
+
+ for (auto comp : workers) {
+ delete comp;
+ }
+
+ return r < 0 ? r : 0;
+}
+
+// Register CLI arguments for `rbd list`: the --long/-l switch, optional
+// pool/namespace positionals, and formatter options.
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("long,l", po::bool_switch(), "long listing format");
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+}
+
+// Handler for `rbd list`/`rbd ls`: resolves the pool/namespace and
+// formatter, then delegates to do_list. Returns 0 or a negative errno.
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_list(pool_name, namespace_name, vm["long"].as<bool>(),
+ formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: listing images failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+// Register the `list` command (alias `ls`) and declare its boolean
+// switches with the Shell dispatcher at static-initialization time.
+Shell::SwitchArguments switched_arguments({"long", "l"});
+Shell::Action action(
+ {"list"}, {"ls"}, "List rbd images.", "", &get_arguments, &execute);
+
+} // namespace list
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Lock.cc b/src/tools/rbd/action/Lock.cc
new file mode 100644
index 000000000..754cb384c
--- /dev/null
+++ b/src/tools/rbd/action/Lock.cc
@@ -0,0 +1,279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace lock {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+// Append the positional "lock-id" argument shared by "lock add"/"lock remove".
+void add_id_option(po::options_description *positional) {
+  positional->add_options()
+    ("lock-id", "unique lock id");
+}
+
+// Extract the lock id from the positional argument at *arg_index.
+// On success stores it in *id, advances *arg_index and returns 0;
+// returns -EINVAL if the argument is missing or empty.
+int get_id(const po::variables_map &vm, size_t *arg_index,
+           std::string *id) {
+  *id = utils::get_positional_argument(vm, *arg_index);
+  if (id->empty()) {
+    std::cerr << "rbd: lock id was not specified" << std::endl;
+    return -EINVAL;
+  } else {
+    ++(*arg_index);
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+// List advisory lockers on an image. With a Formatter emits a JSON/XML
+// "locks" array; otherwise prints a human-readable summary and TextTable.
+// Returns 0 on success or a negative errno from list_lockers().
+static int do_lock_list(librbd::Image& image, Formatter *f)
+{
+  std::list<librbd::locker_t> lockers;
+  bool exclusive;
+  std::string tag;
+  TextTable tbl;
+  int r;
+
+  r = image.list_lockers(&lockers, &exclusive, &tag);
+  if (r < 0)
+    return r;
+
+  if (f) {
+    f->open_array_section("locks");
+  } else {
+    tbl.define_column("Locker", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("Address", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  if (lockers.size()) {
+    bool one = (lockers.size() == 1);
+
+    if (!f) {
+      std::cout << "There " << (one ? "is " : "are ") << lockers.size()
+                << (exclusive ? " exclusive" : " shared")
+                << " lock" << (one ? "" : "s") << " on this image.\n";
+      // The lock tag is only meaningful for shared locks.
+      if (!exclusive)
+        std::cout << "Lock tag: " << tag << "\n";
+    }
+
+    for (std::list<librbd::locker_t>::const_iterator it = lockers.begin();
+         it != lockers.end(); ++it) {
+      if (f) {
+        f->open_object_section("lock");
+        f->dump_string("id", it->cookie);
+        f->dump_string("locker", it->client);
+        f->dump_string("address", it->address);
+        f->close_section();
+      } else {
+        tbl << it->client << it->cookie << it->address << TextTable::endrow;
+      }
+    }
+    if (!f)
+      std::cout << tbl;
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+  return 0;
+}
+
+// Acquire an advisory lock with the given cookie: a shared lock when a
+// tag is supplied, otherwise an exclusive lock. Returns librbd's errno.
+static int do_lock_add(librbd::Image& image, const char *cookie,
+                       const char *tag)
+{
+  if (tag)
+    return image.lock_shared(cookie, tag);
+  else
+    return image.lock_exclusive(cookie);
+}
+
+// Forcibly release another client's lock (break it) by client id + cookie.
+static int do_lock_remove(librbd::Image& image, const char *client,
+                          const char *cookie)
+{
+  return image.break_lock(client, cookie);
+}
+
+// Register CLI options for "rbd lock ls": image spec plus format options.
+void get_list_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Entry point for "rbd lock ls": open the image read-only and print its
+// lockers via do_lock_list().
+int execute_list(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  // read_only=true: listing lockers does not modify the image.
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 true, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_lock_list(image, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: listing locks failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Register CLI options for "rbd lock add": image spec, positional lock-id,
+// and an optional --shared tag (presence selects a shared lock).
+void get_add_arguments(po::options_description *positional,
+                       po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_id_option(positional);
+  options->add_options()
+    ("shared", po::value<std::string>(), "shared lock tag");
+}
+
+// Entry point for "rbd lock add": open the image read-write and take an
+// exclusive (or, with --shared <tag>, a shared) advisory lock.
+int execute_add(const po::variables_map &vm,
+                const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_cookie;
+  r = get_id(vm, &arg_index, &lock_cookie);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_tag;
+  if (vm.count("shared")) {
+    lock_tag = vm["shared"].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  // Empty tag -> exclusive lock (see do_lock_add).
+  r = do_lock_add(image, lock_cookie.c_str(),
+                  lock_tag.empty() ? nullptr : lock_tag.c_str());
+  if (r < 0) {
+    if (r == -EBUSY || r == -EEXIST) {
+      if (!lock_tag.empty()) {
+        std::cerr << "rbd: lock is already held by someone else"
+                  << " with a different tag" << std::endl;
+      } else {
+        std::cerr << "rbd: lock is already held by someone else" << std::endl;
+      }
+    } else {
+      std::cerr << "rbd: taking lock failed: " << cpp_strerror(r) << std::endl;
+    }
+    return r;
+  }
+  return 0;
+}
+
+// Register CLI options for "rbd lock rm": image spec, positional lock-id,
+// then the positional locker client id.
+void get_remove_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_id_option(positional);
+  positional->add_options()
+    ("locker", "locker client");
+}
+
+// Entry point for "rbd lock rm": break the lock identified by the
+// positional lock-id and locker client id.
+int execute_remove(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_cookie;
+  r = get_id(vm, &arg_index, &lock_cookie);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_client = utils::get_positional_argument(vm, arg_index);
+  if (lock_client.empty()) {
+    std::cerr << "rbd: locker was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_lock_remove(image, lock_client.c_str(), lock_cookie.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Register the "lock list"/"lock ls", "lock add" and "lock remove"/"lock rm"
+// subcommands with the command shell.
+Shell::Action action_list(
+  {"lock", "list"}, {"lock", "ls"}, "Show locks held on an image.", "",
+  &get_list_arguments, &execute_list);
+Shell::Action action_add(
+  {"lock", "add"}, {}, "Take a lock on an image.", "",
+  &get_add_arguments, &execute_add);
+Shell::Action action_remove(
+  {"lock", "remove"}, {"lock", "rm"}, "Release a lock on an image.", "",
+  &get_remove_arguments, &execute_remove);
+
+} // namespace lock
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MergeDiff.cc b/src/tools/rbd/action/MergeDiff.cc
new file mode 100644
index 000000000..d33d1c11a
--- /dev/null
+++ b/src/tools/rbd/action/MergeDiff.cc
@@ -0,0 +1,454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#define _LARGEFILE64_SOURCE
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "include/compat.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/safe_io.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+
+namespace rbd {
+namespace action {
+namespace merge_diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Read and validate an "rbd diff v1" stream header from fd: the banner,
+// then optional FROM-snap, TO-snap and image-size records. Stops at the
+// first unrecognized tag, leaving it in *tag for the body parser.
+// Returns 0 on success or a negative errno on read/validation failure.
+static int parse_diff_header(int fd, __u8 *tag, string *from, string *to, uint64_t *size)
+{
+  int r;
+
+  {//header
+    char buf[utils::RBD_DIFF_BANNER.size() + 1];
+    r = safe_read_exact(fd, buf, utils::RBD_DIFF_BANNER.size());
+    if (r < 0)
+      return r;
+
+    buf[utils::RBD_DIFF_BANNER.size()] = '\0';
+    if (strcmp(buf, utils::RBD_DIFF_BANNER.c_str())) {
+      std::cerr << "invalid banner '" << buf << "', expected '"
+                << utils::RBD_DIFF_BANNER << "'" << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  while (true) {
+    r = safe_read_exact(fd, tag, 1);
+    if (r < 0)
+      return r;
+
+    if (*tag == RBD_DIFF_FROM_SNAP) {
+      r = utils::read_string(fd, 4096, from);   // 4k limit to make sure we don't get a garbage string
+      if (r < 0)
+        return r;
+      dout(2) << " from snap " << *from << dendl;
+    } else if (*tag == RBD_DIFF_TO_SNAP) {
+      r = utils::read_string(fd, 4096, to);   // 4k limit to make sure we don't get a garbage string
+      if (r < 0)
+        return r;
+      dout(2) << " to snap " << *to << dendl;
+    } else if (*tag == RBD_DIFF_IMAGE_SIZE) {
+      // Image size is encoded as a little-endian 64-bit value.
+      char buf[8];
+      r = safe_read_exact(fd, buf, 8);
+      if (r < 0)
+        return r;
+
+      bufferlist bl;
+      bl.append(buf, 8);
+      auto p = bl.cbegin();
+      decode(*size, p);
+    } else {
+      break;
+    }
+  }
+
+  return 0;
+}
+
+// Read the next data-record header from an rbd diff stream. If *tag is
+// already populated (left over from the header parser) it is consumed
+// as-is; otherwise one tag byte is read from fd. For WRITE/ZERO records
+// the (offset, length) pair is decoded into the output parameters; an
+// END record yields zeroed outputs. Returns 0 on success, -ENOTSUP for
+// unknown tags or zero-length records, or a negative read errno.
+static int parse_diff_body(int fd, __u8 *tag, uint64_t *offset, uint64_t *length)
+{
+  int r;
+
+  if (!(*tag)) {
+    r = safe_read_exact(fd, tag, 1);
+    if (r < 0)
+      return r;
+  }
+
+  if (*tag == RBD_DIFF_END) {
+    // Fix: zero the caller's output values. The previous code assigned
+    // to the pointer parameters themselves ("offset = 0; length = 0;"),
+    // which had no effect on the caller.
+    *offset = 0;
+    *length = 0;
+    return 0;
+  }
+
+  if (*tag != RBD_DIFF_WRITE && *tag != RBD_DIFF_ZERO)
+    return -ENOTSUP;
+
+  char buf[16];
+  r = safe_read_exact(fd, buf, 16);
+  if (r < 0)
+    return r;
+
+  bufferlist bl;
+  bl.append(buf, 16);
+  auto p = bl.cbegin();
+  decode(*offset, p);
+  decode(*length, p);
+
+  // A zero-length WRITE/ZERO record is malformed.
+  if (!(*length))
+    return -ENOTSUP;
+
+  return 0;
+}
+
+/*
+ * fd: the diff file to read from
+ * pd: the diff file to be written into
+ */
+static int accept_diff_body(int fd, int pd, __u8 tag, uint64_t offset, uint64_t length)
+{
+ if (tag == RBD_DIFF_END)
+ return 0;
+
+ bufferlist bl;
+ encode(tag, bl);
+ encode(offset, bl);
+ encode(length, bl);
+ int r;
+ r = bl.write_fd(pd);
+ if (r < 0)
+ return r;
+
+ if (tag == RBD_DIFF_WRITE) {
+ bufferptr bp = buffer::create(length);
+ r = safe_read_exact(fd, bp.c_str(), length);
+ if (r < 0)
+ return r;
+ bufferlist data;
+ data.append(bp);
+ r = data.write_fd(pd);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * Merge two diff files into one single file
+ * Note: It does not do the merging work if
+ * either of the source diff files is stripped,
+ * since which complicates the process and is
+ * rarely used
+ */
+static int do_merge_diff(const char *first, const char *second,
+ const char *path, bool no_progress)
+{
+ utils::ProgressContext pc("Merging image diff", no_progress);
+ int fd = -1, sd = -1, pd = -1, r;
+
+ string f_from, f_to;
+ string s_from, s_to;
+ uint64_t f_size = 0;
+ uint64_t s_size = 0;
+ uint64_t pc_size;
+
+ __u8 f_tag = 0, s_tag = 0;
+ uint64_t f_off = 0, f_len = 0;
+ uint64_t s_off = 0, s_len = 0;
+ bool f_end = false, s_end = false;
+
+ bool first_stdin = !strcmp(first, "-");
+ if (first_stdin) {
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(first, O_RDONLY|O_BINARY);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << first << std::endl;
+ goto done;
+ }
+ }
+
+ sd = open(second, O_RDONLY|O_BINARY);
+ if (sd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << second << std::endl;
+ goto done;
+ }
+
+ if (strcmp(path, "-") == 0) {
+ pd = 1;
+ } else {
+ pd = open(path, O_WRONLY | O_CREAT | O_EXCL | O_BINARY, 0644);
+ if (pd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error create " << path << std::endl;
+ goto done;
+ }
+ }
+
+ //We just handle the case like 'banner, [ftag], [ttag], stag, [wztag]*,etag',
+ // and the (offset,length) in wztag must be ascending order.
+ r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size);
+ if (r < 0) {
+ std::cerr << "rbd: failed to parse first diff header" << std::endl;
+ goto done;
+ }
+
+ r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size);
+ if (r < 0) {
+ std::cerr << "rbd: failed to parse second diff header" << std::endl;
+ goto done;
+ }
+
+ if (f_to != s_from) {
+ r = -EINVAL;
+ std::cerr << "The first TO snapshot must be equal with the second FROM "
+ << "snapshot, aborting" << std::endl;
+ goto done;
+ }
+
+ {
+ // header
+ bufferlist bl;
+ bl.append(utils::RBD_DIFF_BANNER);
+
+ __u8 tag;
+ if (f_from.size()) {
+ tag = RBD_DIFF_FROM_SNAP;
+ encode(tag, bl);
+ encode(f_from, bl);
+ }
+
+ if (s_to.size()) {
+ tag = RBD_DIFF_TO_SNAP;
+ encode(tag, bl);
+ encode(s_to, bl);
+ }
+
+ tag = RBD_DIFF_IMAGE_SIZE;
+ encode(tag, bl);
+ encode(s_size, bl);
+
+ r = bl.write_fd(pd);
+ if (r < 0) {
+ std::cerr << "rbd: failed to write merged diff header" << std::endl;
+ goto done;
+ }
+ }
+ if (f_size > s_size)
+ pc_size = f_size << 1;
+ else
+ pc_size = s_size << 1;
+
+ //data block
+ while (!f_end || !s_end) {
+ // progress through input
+ pc.update_progress(f_off + s_off, pc_size);
+
+ if (!f_end && !f_len) {
+ uint64_t last_off = f_off;
+
+ r = parse_diff_body(fd, &f_tag, &f_off, &f_len);
+ dout(2) << "first diff data chunk: tag=" << f_tag << ", "
+ << "off=" << f_off << ", "
+ << "len=" << f_len << dendl;
+ if (r < 0) {
+ std::cerr << "rbd: failed to read first diff data chunk header"
+ << std::endl;
+ goto done;
+ }
+
+ if (f_tag == RBD_DIFF_END) {
+ f_end = true;
+ f_tag = RBD_DIFF_ZERO;
+ f_off = f_size;
+ if (f_size < s_size)
+ f_len = s_size - f_size;
+ else
+ f_len = 0;
+ }
+
+ if (last_off > f_off) {
+ r = -ENOTSUP;
+ std::cerr << "rbd: out-of-order offset from first diff ("
+ << last_off << " > " << f_off << ")" << std::endl;
+ goto done;
+ }
+ }
+
+ if (!s_end && !s_len) {
+ uint64_t last_off = s_off;
+
+ r = parse_diff_body(sd, &s_tag, &s_off, &s_len);
+ dout(2) << "second diff data chunk: tag=" << s_tag << ", "
+ << "off=" << s_off << ", "
+ << "len=" << s_len << dendl;
+ if (r < 0) {
+ std::cerr << "rbd: failed to read second diff data chunk header"
+ << std::endl;
+ goto done;
+ }
+
+ if (s_tag == RBD_DIFF_END) {
+ s_end = true;
+ s_off = s_size;
+ if (s_size < f_size)
+ s_len = f_size - s_size;
+ else
+ s_len = 0;
+ }
+
+ if (last_off > s_off) {
+ r = -ENOTSUP;
+ std::cerr << "rbd: out-of-order offset from second diff ("
+ << last_off << " > " << s_off << ")" << std::endl;
+ goto done;
+ }
+ }
+
+ if (f_off < s_off && f_len) {
+ uint64_t delta = s_off - f_off;
+ if (delta > f_len)
+ delta = f_len;
+ r = accept_diff_body(fd, pd, f_tag, f_off, delta);
+ if (r < 0) {
+ std::cerr << "rbd: failed to merge diff chunk" << std::endl;
+ goto done;
+ }
+ f_off += delta;
+ f_len -= delta;
+
+ if (!f_len) {
+ f_tag = 0;
+ continue;
+ }
+ }
+ ceph_assert(f_off >= s_off);
+
+ if (f_off < s_off + s_len && f_len) {
+ uint64_t delta = s_off + s_len - f_off;
+ if (delta > f_len)
+ delta = f_len;
+ if (f_tag == RBD_DIFF_WRITE) {
+ if (first_stdin) {
+ bufferptr bp = buffer::create(delta);
+ r = safe_read_exact(fd, bp.c_str(), delta);
+ } else {
+ off64_t l = lseek64(fd, delta, SEEK_CUR);
+ r = l < 0 ? -errno : 0;
+ }
+ if (r < 0) {
+ std::cerr << "rbd: failed to skip first diff data" << std::endl;
+ goto done;
+ }
+ }
+ f_off += delta;
+ f_len -= delta;
+
+ if (!f_len) {
+ f_tag = 0;
+ continue;
+ }
+ }
+ ceph_assert(f_off >= s_off + s_len);
+ if (s_len) {
+ r = accept_diff_body(sd, pd, s_tag, s_off, s_len);
+ if (r < 0) {
+ std::cerr << "rbd: failed to merge diff chunk" << std::endl;
+ goto done;
+ }
+ s_off += s_len;
+ s_len = 0;
+ s_tag = 0;
+ } else {
+ ceph_assert(f_end && s_end);
+ }
+ continue;
+ }
+
+ {//tail
+ __u8 tag = RBD_DIFF_END;
+ bufferlist bl;
+ encode(tag, bl);
+ r = bl.write_fd(pd);
+ }
+
+done:
+ if (pd > 2)
+ close(pd);
+ if (sd > 2)
+ close(sd);
+ if (fd > 2)
+ close(fd);
+
+ if(r < 0) {
+ pc.fail();
+ if (pd > 2)
+ unlink(path);
+ } else
+ pc.finish();
+
+ return r;
+}
+
+// Register CLI options for "rbd merge-diff": two positional input diff
+// paths, the output path, and --no-progress.
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  positional->add_options()
+    ("diff1-path", "path to first diff (or '-' for stdin)")
+    ("diff2-path", "path to second diff");
+  at::add_path_options(positional, options,
+                       "path to merged diff (or '-' for stdout)");
+  at::add_no_progress_option(options);
+}
+
+// Entry point for "rbd merge-diff": validate the two input paths and the
+// output path, then delegate to do_merge_diff().
+int execute(const po::variables_map &vm,
+  const std::vector<std::string> &ceph_global_init_args) {
+  std::string first_diff = utils::get_positional_argument(vm, 0);
+  if (first_diff.empty()) {
+    std::cerr << "rbd: first diff was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  std::string second_diff = utils::get_positional_argument(vm, 1);
+  if (second_diff.empty()) {
+    std::cerr << "rbd: second diff was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  std::string path;
+  size_t arg_index = 2;
+  int r = utils::get_path(vm, &arg_index, &path);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_merge_diff(first_diff.c_str(), second_diff.c_str(), path.c_str(),
+                    vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    // Fix: qualify cerr explicitly; the bare "cerr" relied on a
+    // transitive using-declaration and was inconsistent with every other
+    // diagnostic in this file.
+    std::cerr << "rbd: merge-diff error" << std::endl;
+    // NOTE(review): "-r" inverts the sign, unlike the other execute()
+    // functions which return the negative errno — presumably intentional
+    // for the shell's exit-code handling; confirm before changing.
+    return -r;
+  }
+
+  return 0;
+}
+
+// Register the "merge-diff" subcommand with the command shell.
+Shell::Action action(
+  {"merge-diff"}, {}, "Merge two diff exports together.", "",
+  &get_arguments, &execute);
+
+} // namespace merge_diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Migration.cc b/src/tools/rbd/action/Migration.cc
new file mode 100644
index 000000000..1ce6201d9
--- /dev/null
+++ b/src/tools/rbd/action/Migration.cc
@@ -0,0 +1,429 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace migration {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Run the data-copy phase of a prepared migration, reporting progress
+// unless no_progress is set. Returns 0 or librbd's negative errno.
+static int do_execute(librados::IoCtx& io_ctx, const std::string &image_name,
+                      bool no_progress) {
+  utils::ProgressContext pc("Image migration", no_progress);
+  int r = librbd::RBD().migration_execute_with_progress(io_ctx,
+                                                        image_name.c_str(), pc);
+  if (r < 0) {
+    pc.fail();
+    std::cerr << "rbd: migration failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+// Roll back an in-progress or interrupted migration, reporting progress
+// unless no_progress is set. Returns 0 or librbd's negative errno.
+static int do_abort(librados::IoCtx& io_ctx, const std::string &image_name,
+                    bool no_progress) {
+  utils::ProgressContext pc("Abort image migration", no_progress);
+  int r = librbd::RBD().migration_abort_with_progress(io_ctx,
+                                                      image_name.c_str(), pc);
+  if (r < 0) {
+    pc.fail();
+    std::cerr << "rbd: aborting migration failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+// Finalize a migration: look up its status, open the destination image,
+// and refuse to commit if the image still has descendants unless `force`
+// is set (in-use read-only descendants would miss the parent update).
+// Returns 0, -EBUSY when blocked by descendants, or librbd's errno.
+static int do_commit(librados::IoCtx& io_ctx, const std::string &image_name,
+                     bool force, bool no_progress) {
+  librbd::image_migration_status_t migration_status;
+  int r = librbd::RBD().migration_status(io_ctx, image_name.c_str(),
+                                         &migration_status,
+                                         sizeof(migration_status));
+  if (r < 0) {
+    std::cerr << "rbd: getting migration status failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  librados::IoCtx dst_io_ctx;
+  r = librados::Rados(io_ctx).ioctx_create2(migration_status.dest_pool_id, dst_io_ctx);
+  if (r < 0) {
+    // Fix: this accesses the *destination* pool (dest_pool_id); the old
+    // message misleadingly said "source pool".
+    std::cerr << "rbd: accessing destination pool id="
+              << migration_status.dest_pool_id << " failed: "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  r = utils::set_namespace(migration_status.dest_pool_namespace, &dst_io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::Image image;
+  r = utils::open_image_by_id(dst_io_ctx, migration_status.dest_image_id,
+                              true, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  std::vector<librbd::linked_image_spec_t> children;
+  r = image.list_descendants(&children);
+  if (r < 0) {
+    std::cerr << "rbd: listing descendants failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  if (children.size() > 0) {
+    std::cerr << "rbd: the image has "
+              << (children.size() == 1 ? "a descendant" : "descendants") << ": "
+              << std::endl;
+    for (auto& child : children) {
+      std::cerr << "  " << child.pool_name << "/";
+      if (!child.pool_namespace.empty()) {
+        std::cerr << child.pool_namespace << "/";
+      }
+      std::cerr << child.image_name;
+      if (child.trash) {
+        std::cerr << " (trash " << child.image_id << ")";
+      }
+      std::cerr << std::endl;
+    }
+    std::cerr << "Warning: in-use, read-only descendant images"
+              << " will not detect the parent update." << std::endl;
+    if (force) {
+      std::cerr << "Proceeding anyway due to force flag set." << std::endl;
+    } else {
+      std::cerr << "Ensure no descendant images are opened read-only"
+                << " and run again with force flag." << std::endl;
+      return -EBUSY;
+    }
+  }
+
+  utils::ProgressContext pc("Commit image migration", no_progress);
+  r = librbd::RBD().migration_commit_with_progress(io_ctx, image_name.c_str(),
+                                                   pc);
+  if (r < 0) {
+    pc.fail();
+    std::cerr << "rbd: committing migration failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+// Register CLI options for "rbd migration prepare": native source/dest
+// image specs plus the import-only / source-spec variants, image-create
+// options and --flatten.
+void get_prepare_arguments(po::options_description *positional,
+                           po::options_description *options) {
+  options->add_options()
+    ("import-only", po::bool_switch(), "only import data from source")
+    ("source-spec-path", po::value<std::string>(),
+     "source-spec file (or '-' for stdin)")
+    ("source-spec", po::value<std::string>(),
+     "source-spec");
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_create_image_options(options, true);
+  at::add_flatten_option(options);
+}
+
+// Entry point for "rbd migration prepare". Two modes:
+//  - native live migration (no source-spec): prepare src -> dst image;
+//  - import-only: build or accept a JSON source-spec and prepare an
+//    import into the destination image.
+// Validates the mutually-exclusive source-spec/source-spec-path options
+// and the snapshot requirements for each mode.
+int execute_prepare(const po::variables_map &vm,
+                    const std::vector<std::string> &ceph_global_init_args) {
+  bool import_only = vm["import-only"].as<bool>();
+
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  // A snapshot is only permitted (and later required) in import-only mode.
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+    &image_name, import_only ? &snap_name : nullptr, true,
+    import_only ? utils::SNAPSHOT_PRESENCE_PERMITTED :
+                  utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string dst_pool_name;
+  std::string dst_namespace_name;
+  std::string dst_image_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
+    &dst_namespace_name, &dst_image_name, nullptr, false,
+    utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string source_spec;
+  if (vm.count("source-spec") && vm.count("source-spec-path")) {
+    std::cerr << "rbd: cannot specify both source-spec and source-spec-path"
+              << std::endl;
+    return -EINVAL;
+  } else if (vm.count("source-spec-path")) {
+    std::string source_spec_path = vm["source-spec-path"].as<std::string>();
+
+    int fd = STDIN_FILENO;
+    if (source_spec_path != "-") {
+      fd = open(source_spec_path.c_str(), O_RDONLY);
+      if (fd < 0) {
+        r = -errno;
+        std::cerr << "rbd: error opening " << source_spec_path << std::endl;
+        return r;
+      }
+    }
+
+    // Source-spec files are capped at 4 KiB (minus NUL terminator room).
+    source_spec.resize(4096);
+    r = safe_read(fd, source_spec.data(), source_spec.size() - 1);
+    if (fd != STDIN_FILENO) {
+      VOID_TEMP_FAILURE_RETRY(close(fd));
+    }
+
+    if (r >= 0) {
+      source_spec.resize(r);
+    } else {
+      std::cerr << "rbd: error reading source-spec file: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+  } else if (vm.count("source-spec")) {
+    source_spec = vm["source-spec"].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::IoCtx dst_io_ctx;
+  if (source_spec.empty()) {
+    utils::normalize_pool_name(&dst_pool_name);
+    r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name,
+                           &dst_io_ctx);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (import_only && source_spec.empty()) {
+    // import-only without an explicit spec: synthesize a "native" spec
+    // from the source image@snap, then retarget io_ctx/image_name at the
+    // destination for the prepare-import call below.
+    if (snap_name.empty()) {
+      std::cerr << "rbd: snapshot name was not specified" << std::endl;
+      return -EINVAL;
+    }
+
+    std::stringstream ss;
+    ss << R"({)"
+       << R"("type":"native",)"
+       << R"("pool_id":)" << io_ctx.get_id() << R"(,)"
+       << R"("pool_namespace":")" << io_ctx.get_namespace() << R"(",)"
+       << R"("image_name":")" << image_name << R"(",)"
+       << R"("snap_name":")" << snap_name << R"(")"
+       << R"(})";
+    source_spec = ss.str();
+
+    if (dst_image_name.empty()) {
+      std::cerr << "rbd: destination image name must be provided" << std::endl;
+      return -EINVAL;
+    }
+    io_ctx = dst_io_ctx;
+    image_name = dst_image_name;
+    snap_name = "";
+  } else if (!import_only && !source_spec.empty()) {
+    std::cerr << "rbd: --import-only must be used in combination with "
+              << "source-spec/source-spec-path" << std::endl;
+    return -EINVAL;
+  }
+
+  if (!snap_name.empty()) {
+    std::cerr << "rbd: snapshot name specified for a command that doesn't "
+              << "use it" << std::endl;
+    return -EINVAL;
+  }
+
+  librbd::ImageOptions opts;
+  r = utils::get_image_options(vm, true, &opts);
+  if (r < 0) {
+    return r;
+  }
+
+  if (source_spec.empty()) {
+    if (dst_image_name.empty()) {
+      dst_image_name = image_name;
+    }
+
+    // Fix: reuse the function-scope r instead of declaring a shadowing
+    // "int r" local, which masked the outer variable.
+    r = librbd::RBD().migration_prepare(io_ctx, image_name.c_str(),
+                                        dst_io_ctx, dst_image_name.c_str(),
+                                        opts);
+    if (r < 0) {
+      std::cerr << "rbd: preparing migration failed: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+  } else {
+    ceph_assert(import_only);
+    r = librbd::RBD().migration_prepare_import(source_spec.c_str(), io_ctx,
+                                               image_name.c_str(), opts);
+    if (r < 0) {
+      std::cerr << "rbd: preparing import migration failed: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+// Register CLI options for "rbd migration execute": image spec + --no-progress.
+void get_execute_arguments(po::options_description *positional,
+                           po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+// Entry point for "rbd migration execute": connect to the pool and run
+// the migration data copy via do_execute().
+int execute_execute(const po::variables_map &vm,
+                    const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+  // Allow the operation to proceed even on a full pool (it frees space).
+  io_ctx.set_pool_full_try();
+
+  r = do_execute(io_ctx, image_name, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Register CLI options for "rbd migration abort": image spec + --no-progress.
+void get_abort_arguments(po::options_description *positional,
+                         po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+// Entry point for "rbd migration abort": connect to the pool and roll the
+// migration back via do_abort().
+int execute_abort(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+  // Allow the operation to proceed even on a full pool (it frees space).
+  io_ctx.set_pool_full_try();
+
+  r = do_abort(io_ctx, image_name, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Register CLI options for "rbd migration commit": image spec,
+// --no-progress, and --force (commit despite descendant images).
+void get_commit_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+  options->add_options()
+    ("force", po::bool_switch(), "proceed even if the image has children");
+}
+
+// Entry point for "rbd migration commit": connect to the pool and
+// finalize the migration via do_commit().
+int execute_commit(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+  // Allow the operation to proceed even on a full pool (it frees space).
+  io_ctx.set_pool_full_try();
+
+  r = do_commit(io_ctx, image_name, vm["force"].as<bool>(),
+                vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// "import-only" takes no value; register the four migration subcommands
+// (prepare / execute / abort / commit) with the command shell.
+Shell::SwitchArguments switched_arguments({"import-only"});
+
+Shell::Action action_prepare(
+  {"migration", "prepare"}, {}, "Prepare image migration.",
+  at::get_long_features_help(), &get_prepare_arguments, &execute_prepare);
+
+Shell::Action action_execute(
+  {"migration", "execute"}, {}, "Execute image migration.", "",
+  &get_execute_arguments, &execute_execute);
+
+Shell::Action action_abort(
+  {"migration", "abort"}, {}, "Cancel interrupted image migration.", "",
+  &get_abort_arguments, &execute_abort);
+
+Shell::Action action_commit(
+  {"migration", "commit"}, {}, "Commit image migration.", "",
+  &get_commit_arguments, &execute_commit);
+
+} // namespace migration
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MirrorImage.cc b/src/tools/rbd/action/MirrorImage.cc
new file mode 100644
index 000000000..505d377f4
--- /dev/null
+++ b/src/tools/rbd/action/MirrorImage.cc
@@ -0,0 +1,605 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/MirrorDaemonServiceInfo.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace mirror_image {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+int validate_mirroring_enabled(librbd::Image &image, bool snapshot = false) {
+ librbd::mirror_image_info_t mirror_image;
+ int r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image));
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror info: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_image.state != RBD_MIRROR_IMAGE_ENABLED) {
+ std::cerr << "rbd: mirroring not enabled on the image" << std::endl;
+ return -EINVAL;
+ }
+
+ if (snapshot) {
+ librbd::mirror_image_mode_t mode;
+ r = image.mirror_image_get_mode(&mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mode != RBD_MIRROR_IMAGE_MODE_SNAPSHOT) {
+ std::cerr << "rbd: snapshot based mirroring not enabled on the image"
+ << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+} // anonymous namespace
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+void get_arguments_enable(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ positional->add_options()
+ ("mode", po::value<std::string>()->default_value(""),
+ "mirror image mode (journal or snapshot) [default: journal]");
+}
+
+void get_arguments_disable(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("force", po::bool_switch(), "disable even if not primary");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_enable_disable(const po::variables_map &vm, bool enable,
+ bool force) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ if (enable) {
+ librbd::mirror_image_mode_t mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
+ std::string mode_arg = utils::get_positional_argument(vm, arg_index++);
+ if (mode_arg == "journal") {
+ mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
+ } else if (mode_arg == "snapshot") {
+ mode = RBD_MIRROR_IMAGE_MODE_SNAPSHOT;
+ } else if (!mode_arg.empty()) {
+ std::cerr << "rbd: invalid mode name: " << mode_arg << std::endl;
+ return -EINVAL;
+ }
+ r = image.mirror_image_enable2(mode);
+ } else {
+ r = image.mirror_image_disable(force);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ std::cout << (enable ? "Mirroring enabled" : "Mirroring disabled")
+ << std::endl;
+
+ return 0;
+}
+
+int execute_disable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute_enable_disable(vm, false, vm["force"].as<bool>());
+}
+
+int execute_enable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute_enable_disable(vm, true, false);
+}
+
+void get_arguments_promote(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("force", po::bool_switch(), "promote even if not cleanly demoted by remote cluster");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_promote(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ bool force = vm["force"].as<bool>();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.mirror_image_promote(force);
+ if (r < 0) {
+ std::cerr << "rbd: error promoting image to primary" << std::endl;
+ return r;
+ }
+
+ std::cout << "Image promoted to primary" << std::endl;
+ return 0;
+}
+
+int execute_demote(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.mirror_image_demote();
+ if (r < 0) {
+ std::cerr << "rbd: error demoting image to non-primary" << std::endl;
+ return r;
+ }
+
+ std::cout << "Image demoted to non-primary" << std::endl;
+ return 0;
+}
+
+int execute_resync(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.mirror_image_resync();
+ if (r < 0) {
+ std::cerr << "rbd: error flagging image resync" << std::endl;
+ return r;
+ }
+
+ std::cout << "Flagged image for resync from primary" << std::endl;
+ return 0;
+}
+
+void get_status_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_status(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ at::Format::Formatter formatter;
+ int r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx default_ns_io_ctx;
+ default_ns_io_ctx.dup(io_ctx);
+ default_ns_io_ctx.set_namespace("");
+
+ std::vector<librbd::mirror_peer_site_t> mirror_peers;
+ utils::get_mirror_peer_sites(default_ns_io_ctx, &mirror_peers);
+
+ std::map<std::string, std::string> peer_mirror_uuids_to_name;
+ utils::get_mirror_peer_mirror_uuids_to_names(mirror_peers,
+ &peer_mirror_uuids_to_name);
+
+ librbd::mirror_image_global_status_t status;
+ r = image.mirror_image_get_global_status(&status, sizeof(status));
+ if (r < 0) {
+ std::cerr << "rbd: failed to get status for image " << image_name << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ utils::populate_unknown_mirror_image_site_statuses(mirror_peers, &status);
+
+ std::string instance_id;
+ MirrorDaemonServiceInfo daemon_service_info(io_ctx);
+
+ librbd::mirror_image_site_status_t local_status;
+ int local_site_r = utils::get_local_mirror_image_status(
+ status, &local_status);
+ status.site_statuses.erase(
+ std::remove_if(status.site_statuses.begin(),
+ status.site_statuses.end(),
+ [](auto& status) {
+ return (status.mirror_uuid ==
+ RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID);
+ }),
+ status.site_statuses.end());
+
+ if (local_site_r >= 0 && local_status.up) {
+ r = image.mirror_image_get_instance_id(&instance_id);
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: newer release of Ceph OSDs required to map image "
+ << "to rbd-mirror daemon instance" << std::endl;
+ // not fatal
+ } else if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to get service id for image "
+ << image_name << ": " << cpp_strerror(r) << std::endl;
+ // not fatal
+ } else if (!instance_id.empty()) {
+ daemon_service_info.init();
+ }
+ }
+
+ std::vector<librbd::snap_info_t> snaps;
+ if (status.info.primary && status.info.state == RBD_MIRROR_IMAGE_ENABLED) {
+ librbd::mirror_image_mode_t mode = RBD_MIRROR_IMAGE_MODE_JOURNAL;
+ r = image.mirror_image_get_mode(&mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ // not fatal
+ }
+
+ if (mode == RBD_MIRROR_IMAGE_MODE_SNAPSHOT) {
+ image.snap_list(snaps);
+ snaps.erase(
+ remove_if(snaps.begin(),
+ snaps.end(),
+ [&image](const librbd::snap_info_t &snap) {
+ librbd::snap_namespace_type_t type;
+ int r = image.snap_get_namespace_type(snap.id, &type);
+ if (r < 0) {
+ return false;
+ }
+ return type != RBD_SNAP_NAMESPACE_TYPE_MIRROR;
+ }),
+ snaps.end());
+ }
+ }
+
+ auto mirror_service = daemon_service_info.get_by_instance_id(instance_id);
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("image");
+ formatter->dump_string("name", image_name);
+ formatter->dump_string("global_id", status.info.global_id);
+ if (local_site_r >= 0) {
+ formatter->dump_string("state", utils::mirror_image_site_status_state(
+ local_status));
+ formatter->dump_string("description", local_status.description);
+ if (mirror_service != nullptr) {
+ mirror_service->dump_image(formatter);
+ }
+ formatter->dump_string("last_update", utils::timestr(
+ local_status.last_update));
+ }
+ if (!status.site_statuses.empty()) {
+ formatter->open_array_section("peer_sites");
+ for (auto& status : status.site_statuses) {
+ formatter->open_object_section("peer_site");
+
+ auto name_it = peer_mirror_uuids_to_name.find(status.mirror_uuid);
+ formatter->dump_string("site_name",
+ (name_it != peer_mirror_uuids_to_name.end() ? name_it->second : ""));
+ formatter->dump_string("mirror_uuids", status.mirror_uuid);
+
+ formatter->dump_string("state", utils::mirror_image_site_status_state(
+ status));
+ formatter->dump_string("description", status.description);
+ formatter->dump_string("last_update", utils::timestr(
+ status.last_update));
+ formatter->close_section(); // peer_site
+ }
+ formatter->close_section(); // peer_sites
+ }
+ if (!snaps.empty()) {
+ formatter->open_array_section("snapshots");
+ for (auto &snap : snaps) {
+ librbd::snap_mirror_namespace_t info;
+ r = image.snap_get_mirror_namespace(snap.id, &info, sizeof(info));
+ if (r < 0 ||
+ (info.state != RBD_SNAP_MIRROR_STATE_PRIMARY &&
+ info.state != RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED)) {
+ continue;
+ }
+ formatter->open_object_section("snapshot");
+ formatter->dump_unsigned("id", snap.id);
+ formatter->dump_string("name", snap.name);
+ formatter->dump_bool("demoted",
+ info.state == RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED);
+ formatter->open_array_section("mirror_peer_uuids");
+ for (auto &peer : info.mirror_peer_uuids) {
+ formatter->dump_string("peer_uuid", peer);
+ }
+ formatter->close_section(); // mirror_peer_uuids
+ formatter->close_section(); // snapshot
+ }
+ formatter->close_section(); // snapshots
+ }
+ formatter->close_section(); // image
+ formatter->flush(std::cout);
+ } else {
+ std::cout << image_name << ":\n"
+ << " global_id: " << status.info.global_id << "\n";
+ if (local_site_r >= 0) {
+ std::cout << " state: " << utils::mirror_image_site_status_state(
+ local_status) << "\n"
+ << " description: " << local_status.description << "\n";
+ if (mirror_service != nullptr) {
+ std::cout << " service: " <<
+ mirror_service->get_image_description() << "\n";
+ }
+ std::cout << " last_update: " << utils::timestr(
+ local_status.last_update) << std::endl;
+ }
+ if (!status.site_statuses.empty()) {
+ std::cout << " peer_sites:" << std::endl;
+
+ bool first_site = true;
+ for (auto& site : status.site_statuses) {
+ if (!first_site) {
+ std::cout << std::endl;
+ }
+ first_site = false;
+
+ auto name_it = peer_mirror_uuids_to_name.find(site.mirror_uuid);
+ std::cout << " name: "
+ << (name_it != peer_mirror_uuids_to_name.end() ?
+ name_it->second : site.mirror_uuid)
+ << std::endl
+ << " state: " << utils::mirror_image_site_status_state(
+ site) << std::endl
+ << " description: " << site.description << std::endl
+ << " last_update: " << utils::timestr(
+ site.last_update) << std::endl;
+ }
+ }
+ if (!snaps.empty()) {
+ std::cout << " snapshots:" << std::endl;
+
+ bool first_site = true;
+ for (auto &snap : snaps) {
+ librbd::snap_mirror_namespace_t info;
+ r = image.snap_get_mirror_namespace(snap.id, &info, sizeof(info));
+ if (r < 0 ||
+ (info.state != RBD_SNAP_MIRROR_STATE_PRIMARY &&
+ info.state != RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED)) {
+ continue;
+ }
+
+ if (!first_site) {
+ std::cout << std::endl;
+ }
+
+ first_site = false;
+ std::cout << " " << snap.id << " " << snap.name << " ("
+ << (info.state == RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED ?
+ "demoted " : "")
+ << "peer_uuids:[" << info.mirror_peer_uuids << "])";
+ }
+ std::cout << std::endl;
+ }
+ }
+
+ return 0;
+}
+
+void get_snapshot_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_snap_create_options(options);
+}
+
+int execute_snapshot(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ uint32_t flags;
+ r = utils::get_snap_create_flags(vm, &flags);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image, true);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t snap_id;
+ r = image.mirror_image_create_snapshot2(flags, &snap_id);
+ if (r < 0) {
+ std::cerr << "rbd: error creating snapshot: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ std::cout << "Snapshot ID: " << snap_id << std::endl;
+ return 0;
+}
+
+Shell::Action action_enable(
+ {"mirror", "image", "enable"}, {},
+ "Enable RBD mirroring for an image.", "",
+ &get_arguments_enable, &execute_enable);
+Shell::Action action_disable(
+ {"mirror", "image", "disable"}, {},
+ "Disable RBD mirroring for an image.", "",
+ &get_arguments_disable, &execute_disable);
+Shell::Action action_promote(
+ {"mirror", "image", "promote"}, {},
+ "Promote an image to primary for RBD mirroring.", "",
+ &get_arguments_promote, &execute_promote);
+Shell::Action action_demote(
+ {"mirror", "image", "demote"}, {},
+ "Demote an image to non-primary for RBD mirroring.", "",
+ &get_arguments, &execute_demote);
+Shell::Action action_resync(
+ {"mirror", "image", "resync"}, {},
+ "Force resync to primary image for RBD mirroring.", "",
+ &get_arguments, &execute_resync);
+Shell::Action action_status(
+ {"mirror", "image", "status"}, {},
+ "Show RBD mirroring status for an image.", "",
+ &get_status_arguments, &execute_status);
+Shell::Action action_snapshot(
+ {"mirror", "image", "snapshot"}, {},
+ "Create RBD mirroring image snapshot.", "",
+ &get_snapshot_arguments, &execute_snapshot);
+
+} // namespace mirror_image
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MirrorPool.cc b/src/tools/rbd/action/MirrorPool.cc
new file mode 100644
index 000000000..b714c3bab
--- /dev/null
+++ b/src/tools/rbd/action/MirrorPool.cc
@@ -0,0 +1,1772 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/MirrorDaemonServiceInfo.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "include/rbd/librbd.hpp"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/Throttle.h"
+#include "global/global_context.h"
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <regex>
+#include <set>
+#include <boost/program_options.hpp>
+#include "include/ceph_assert.h"
+
+#include <atomic>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::action::MirrorPool: "
+
+namespace rbd {
+namespace action {
+namespace mirror_pool {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static const std::string ALL_NAME("all");
+static const std::string SITE_NAME("site-name");
+
+namespace {
+
+void add_site_name_optional(po::options_description *options) {
+ options->add_options()
+ (SITE_NAME.c_str(), po::value<std::string>(), "local site name");
+}
+
+int set_site_name(librados::Rados& rados, const std::string& site_name) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_site_name_set(rados, site_name);
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: cluster does not support site names" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to set site name" << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+struct MirrorPeerDirection {};
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ MirrorPeerDirection *target_type, int permit_tx) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ if (s == "rx-only") {
+ v = boost::any(RBD_MIRROR_PEER_DIRECTION_RX);
+ } else if (s == "rx-tx") {
+ v = boost::any(RBD_MIRROR_PEER_DIRECTION_RX_TX);
+ } else if (permit_tx != 0 && s == "tx-only") {
+ v = boost::any(RBD_MIRROR_PEER_DIRECTION_TX);
+ } else {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
+void add_direction_optional(po::options_description *options) {
+ options->add_options()
+ ("direction", po::value<MirrorPeerDirection>(),
+ "mirroring direction (rx-only, rx-tx)\n"
+ "[default: rx-tx]");
+}
+
+int validate_mirroring_enabled(librados::IoCtx& io_ctx) {
+ librbd::RBD rbd;
+ rbd_mirror_mode_t mirror_mode;
+ int r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
+ std::cerr << "rbd: mirroring not enabled on the pool" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int validate_uuid(const std::string &uuid) {
+ std::regex pattern("^[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}$",
+ std::regex::icase);
+ std::smatch match;
+ if (!std::regex_match(uuid, match, pattern)) {
+ std::cerr << "rbd: invalid uuid '" << uuid << "'" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int read_key_file(std::string path, std::string* key) {
+ std::ifstream key_file;
+ key_file.open(path);
+ if (key_file.fail()) {
+ std::cerr << "rbd: failed to open " << path << std::endl;
+ return -EINVAL;
+ }
+
+ std::getline(key_file, *key);
+ if (key_file.bad()) {
+ std::cerr << "rbd: failed to read key from " << path << std::endl;
+ return -EINVAL;
+ }
+
+ key_file.close();
+ return 0;
+}
+
+void add_uuid_option(po::options_description *positional) {
+ positional->add_options()
+ ("uuid", po::value<std::string>(), "peer uuid");
+}
+
+int get_uuid(const po::variables_map &vm, size_t arg_index,
+ std::string *uuid) {
+ *uuid = utils::get_positional_argument(vm, arg_index);
+ if (uuid->empty()) {
+ std::cerr << "rbd: must specify peer uuid" << std::endl;
+ return -EINVAL;
+ }
+ return validate_uuid(*uuid);
+}
+
+int get_remote_cluster_spec(const po::variables_map &vm,
+ const std::string &spec,
+ std::string *remote_client_name,
+ std::string *remote_cluster,
+ std::map<std::string, std::string>* attributes) {
+ if (vm.count("remote-client-name")) {
+ *remote_client_name = vm["remote-client-name"].as<std::string>();
+ }
+ if (vm.count("remote-cluster")) {
+ *remote_cluster = vm["remote-cluster"].as<std::string>();
+ }
+ if (vm.count("remote-mon-host")) {
+ (*attributes)["mon_host"] = vm["remote-mon-host"].as<std::string>();
+ }
+ if (vm.count("remote-key-file")) {
+ std::string key;
+ int r = read_key_file(vm["remote-key-file"].as<std::string>(), &key);
+ if (r < 0) {
+ return r;
+ }
+ (*attributes)["key"] = key;
+ }
+
+ if (!spec.empty()) {
+ std::regex pattern("^(?:(client\\.[^@]+)@)?([^/@]+)$");
+ std::smatch match;
+ if (!std::regex_match(spec, match, pattern)) {
+ std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl;
+ return -EINVAL;
+ }
+ if (match[1].matched) {
+ *remote_client_name = match[1];
+ }
+ *remote_cluster = match[2];
+ }
+
+ if (remote_cluster->empty()) {
+ std::cerr << "rbd: remote cluster was not specified" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int set_peer_config_key(librados::IoCtx& io_ctx, const std::string& peer_uuid,
+ std::map<std::string, std::string>&& attributes) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_peer_site_set_attributes(io_ctx, peer_uuid, attributes);
+ if (r == -EPERM) {
+ std::cerr << "rbd: permission denied attempting to set peer "
+ << "config-key secrets in the monitor" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to update mirroring peer config: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int get_peer_config_key(librados::IoCtx& io_ctx, const std::string& peer_uuid,
+ std::map<std::string, std::string>* attributes) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_peer_site_get_attributes(io_ctx, peer_uuid, attributes);
+ if (r == -ENOENT) {
+ return r;
+ } else if (r == -EPERM) {
+ std::cerr << "rbd: permission denied attempting to access peer "
+ << "config-key secrets from the monitor" << std::endl;
+ return r;
+ } else if (r == -EINVAL) {
+ std::cerr << "rbd: corrupt mirroring peer config" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: error reading mirroring peer config: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int update_peer_config_key(librados::IoCtx& io_ctx,
+ const std::string& peer_uuid,
+ const std::string& key,
+ const std::string& value) {
+ std::map<std::string, std::string> attributes;
+ int r = get_peer_config_key(io_ctx, peer_uuid, &attributes);
+ if (r == -ENOENT) {
+ return set_peer_config_key(io_ctx, peer_uuid, {{key, value}});
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (value.empty()) {
+ attributes.erase(key);
+ } else {
+ attributes[key] = value;
+ }
+ return set_peer_config_key(io_ctx, peer_uuid, std::move(attributes));
+}
+
+int format_mirror_peers(librados::IoCtx& io_ctx,
+ at::Format::Formatter formatter,
+ const std::vector<librbd::mirror_peer_site_t> &peers,
+ bool config_key) {
+ if (formatter != nullptr) {
+ formatter->open_array_section("peers");
+ } else {
+ std::cout << "Peer Sites: ";
+ if (peers.empty()) {
+ std::cout << "none";
+ }
+ std::cout << std::endl;
+ }
+
+ for (auto &peer : peers) {
+ std::map<std::string, std::string> attributes;
+ if (config_key) {
+ int r = get_peer_config_key(io_ctx, peer.uuid, &attributes);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ }
+
+ std::string direction;
+ switch (peer.direction) {
+ case RBD_MIRROR_PEER_DIRECTION_RX:
+ direction = "rx-only";
+ break;
+ case RBD_MIRROR_PEER_DIRECTION_TX:
+ direction = "tx-only";
+ break;
+ case RBD_MIRROR_PEER_DIRECTION_RX_TX:
+ direction = "rx-tx";
+ break;
+ default:
+ direction = "unknown";
+ break;
+ }
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("peer");
+ formatter->dump_string("uuid", peer.uuid);
+ formatter->dump_string("direction", direction);
+ formatter->dump_string("site_name", peer.site_name);
+ formatter->dump_string("mirror_uuid", peer.mirror_uuid);
+ formatter->dump_string("client_name", peer.client_name);
+ for (auto& pair : attributes) {
+ formatter->dump_string(pair.first.c_str(), pair.second);
+ }
+ formatter->close_section();
+ } else {
+ std::cout << std::endl
+ << "UUID: " << peer.uuid << std::endl
+ << "Name: " << peer.site_name << std::endl;
+ if (peer.direction != RBD_MIRROR_PEER_DIRECTION_RX ||
+ !peer.mirror_uuid.empty()) {
+ std::cout << "Mirror UUID: " << peer.mirror_uuid << std::endl;
+ }
+ std::cout << "Direction: " << direction << std::endl;
+ if (peer.direction != RBD_MIRROR_PEER_DIRECTION_TX ||
+ !peer.client_name.empty()) {
+ std::cout << "Client: " << peer.client_name << std::endl;
+ }
+ if (config_key) {
+ std::cout << "Mon Host: " << attributes["mon_host"] << std::endl
+ << "Key: " << attributes["key"] << std::endl;
+ }
+ if (peer.site_name != peers.rbegin()->site_name) {
+ std::cout << std::endl;
+ }
+ }
+ }
+
+ if (formatter != nullptr) {
+ formatter->close_section();
+ }
+ return 0;
+}
+
+class ImageRequestBase {
+public:
+ void send() {
+ dout(20) << this << " " << __func__ << ": image_name=" << m_image_name
+ << dendl;
+
+ auto ctx = new LambdaContext([this](int r) {
+ handle_finalize(r);
+ });
+
+ // will pause here until slots are available
+ m_finalize_ctx = m_throttle.start_op(ctx);
+
+ open_image();
+ }
+
+protected:
+ ImageRequestBase(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name)
+ : m_io_ctx(io_ctx), m_throttle(throttle), m_image_name(image_name) {
+ }
+ virtual ~ImageRequestBase() {
+ }
+
+ virtual bool skip_get_info() const {
+ return false;
+ }
+ virtual void get_info(librbd::Image &image, librbd::mirror_image_info_t *info,
+ librbd::RBD::AioCompletion *aio_comp) {
+ image.aio_mirror_image_get_info(info, sizeof(librbd::mirror_image_info_t),
+ aio_comp);
+ }
+
+ virtual bool skip_action(const librbd::mirror_image_info_t &info) const {
+ return false;
+ }
+ virtual void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) = 0;
+ virtual void handle_execute_action(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to " << get_action_type() << " image "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ m_ret_val = r;
+ }
+
+ close_image();
+ }
+
+ virtual void finalize_action() {
+ }
+ virtual std::string get_action_type() const = 0;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * EXECUTE_ACTION
+ * |
+ * v
+ * CLOSE_IMAGE
+ * |
+ * v
+ * FINALIZE_ACTION
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ OrderedThrottle &m_throttle;
+ const std::string m_image_name;
+
+ librbd::Image m_image;
+ Context *m_finalize_ctx = nullptr;
+
+ librbd::mirror_image_info_t m_mirror_image_info;
+
+ int m_ret_val = 0;
+
+ void open_image() {
+ dout(20) << this << " " << __func__ << dendl;
+
+ librbd::RBD rbd;
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_open_image>(this);
+ rbd.aio_open(m_io_ctx, m_image, m_image_name.c_str(), nullptr,
+ aio_completion);
+ }
+
+ void handle_open_image(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to open image "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ m_finalize_ctx->complete(r);
+ return;
+ }
+
+ get_info();
+ }
+
+ void get_info() {
+ if (skip_get_info()) {
+ execute_action();
+ return;
+ }
+ dout(20) << this << " " << __func__ << dendl;
+
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_get_info>(this);
+ get_info(m_image, &m_mirror_image_info, aio_completion);
+ }
+
+ void handle_get_info(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ close_image();
+ return;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror image info for "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ execute_action();
+ }
+
+ void execute_action() {
+ if (skip_action(m_mirror_image_info)) {
+ close_image();
+ return;
+ }
+ dout(20) << this << " " << __func__ << dendl;
+
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_execute_action>(this);
+ execute_action(m_image, aio_completion);
+ }
+
+ void close_image() {
+ dout(20) << this << " " << __func__ << dendl;
+
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_close_image>(this);
+ m_image.aio_close(aio_completion);
+ }
+
+ void handle_close_image(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to close image "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ }
+
+ m_finalize_ctx->complete(r);
+ }
+
+ void handle_finalize(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r == 0 && m_ret_val < 0) {
+ r = m_ret_val;
+ }
+ if (r >= 0) {
+ finalize_action();
+ }
+ m_throttle.end_op(r);
+ delete this;
+ }
+
+};
+
+// Per-image request that promotes a mirrored image to primary.
+// Images that are not mirror-enabled or are already primary are
+// skipped; each successful promotion increments the shared counter.
+class PromoteImageRequest : public ImageRequestBase {
+public:
+ PromoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name, std::atomic<unsigned> *counter,
+ bool force)
+ : ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter),
+ m_force(force) {
+ }
+
+protected:
+ // Nothing to do if mirroring is not enabled or the image is already
+ // primary.
+ bool skip_action(const librbd::mirror_image_info_t &info) const override {
+ return (info.state != RBD_MIRROR_IMAGE_ENABLED || info.primary);
+ }
+
+ void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) override {
+ image.aio_mirror_image_promote(m_force, aio_comp);
+ }
+
+ // Count only successful promotions before delegating to the base
+ // class for the rest of the completion handling.
+ void handle_execute_action(int r) override {
+ if (r >= 0) {
+ (*m_counter)++;
+ }
+ ImageRequestBase::handle_execute_action(r);
+ }
+
+ std::string get_action_type() const override {
+ return "promote";
+ }
+
+private:
+ std::atomic<unsigned> *m_counter = nullptr;
+ bool m_force;
+};
+
+// Per-image request that demotes a primary mirrored image.  Mirrors
+// PromoteImageRequest: non-enabled or non-primary images are skipped
+// and successful demotions bump the shared counter.
+class DemoteImageRequest : public ImageRequestBase {
+public:
+ DemoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name, std::atomic<unsigned> *counter)
+ : ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter) {
+ }
+
+protected:
+ // Nothing to do if mirroring is not enabled or the image is not
+ // primary.
+ bool skip_action(const librbd::mirror_image_info_t &info) const override {
+ return (info.state != RBD_MIRROR_IMAGE_ENABLED || !info.primary);
+ }
+
+ void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) override {
+ image.aio_mirror_image_demote(aio_comp);
+ }
+ // Count only successful demotions before delegating to the base
+ // class completion handling.
+ void handle_execute_action(int r) override {
+ if (r >= 0) {
+ (*m_counter)++;
+ }
+ ImageRequestBase::handle_execute_action(r);
+ }
+
+ std::string get_action_type() const override {
+ return "demote";
+ }
+
+private:
+ std::atomic<unsigned> *m_counter = nullptr;
+};
+
+// Per-image request that fetches the global mirror status of an image
+// and renders it either through the formatter (JSON/XML) or as plain
+// text.  Read-only: it never changes mirroring state.
+class StatusImageRequest : public ImageRequestBase {
+public:
+ StatusImageRequest(
+ librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name,
+ const std::map<std::string, std::string> &instance_ids,
+ const std::vector<librbd::mirror_peer_site_t>& mirror_peers,
+ const std::map<std::string, std::string> &peer_mirror_uuids_to_name,
+ const MirrorDaemonServiceInfo &daemon_service_info,
+ at::Format::Formatter formatter)
+ : ImageRequestBase(io_ctx, throttle, image_name),
+ m_instance_ids(instance_ids), m_mirror_peers(mirror_peers),
+ m_peer_mirror_uuids_to_name(peer_mirror_uuids_to_name),
+ m_daemon_service_info(daemon_service_info), m_formatter(formatter) {
+ }
+
+protected:
+ // The global status call below already returns the mirror image
+ // info, so the base class's separate get-info step is unnecessary.
+ bool skip_get_info() const override {
+ return true;
+ }
+
+ void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) override {
+ image.get_id(&m_image_id);
+ image.aio_mirror_image_get_global_status(
+ &m_mirror_image_global_status, sizeof(m_mirror_image_global_status),
+ aio_comp);
+ }
+
+ // Render the collected status.  An empty global id means the image
+ // is not mirrored, so nothing is printed for it.
+ void finalize_action() override {
+ if (m_mirror_image_global_status.info.global_id.empty()) {
+ return;
+ }
+
+ utils::populate_unknown_mirror_image_site_statuses(
+ m_mirror_peers, &m_mirror_image_global_status);
+
+ // Extract the local site's status, then drop it from the per-site
+ // list so only remote peers remain for the "peer sites" section.
+ librbd::mirror_image_site_status_t local_status;
+ int local_site_r = utils::get_local_mirror_image_status(
+ m_mirror_image_global_status, &local_status);
+ m_mirror_image_global_status.site_statuses.erase(
+ std::remove_if(m_mirror_image_global_status.site_statuses.begin(),
+ m_mirror_image_global_status.site_statuses.end(),
+ [](auto& status) {
+ return (status.mirror_uuid ==
+ RBD_MIRROR_IMAGE_STATUS_LOCAL_MIRROR_UUID);
+ }),
+ m_mirror_image_global_status.site_statuses.end());
+
+ // Only resolve the daemon instance when the local daemon is up and
+ // we know which instance owns this image id.
+ std::string instance_id = (local_site_r >= 0 && local_status.up &&
+ m_instance_ids.count(m_image_id)) ?
+ m_instance_ids.find(m_image_id)->second : "";
+
+ auto mirror_service = m_daemon_service_info.get_by_instance_id(instance_id);
+ if (m_formatter != nullptr) {
+ // Structured (JSON/XML) output path.
+ m_formatter->open_object_section("image");
+ m_formatter->dump_string("name", m_mirror_image_global_status.name);
+ m_formatter->dump_string(
+ "global_id", m_mirror_image_global_status.info.global_id);
+ if (local_site_r >= 0) {
+ m_formatter->dump_string("state", utils::mirror_image_site_status_state(
+ local_status));
+ m_formatter->dump_string("description", local_status.description);
+ if (mirror_service != nullptr) {
+ mirror_service->dump_image(m_formatter);
+ }
+ m_formatter->dump_string("last_update", utils::timestr(
+ local_status.last_update));
+ }
+ if (!m_mirror_image_global_status.site_statuses.empty()) {
+ m_formatter->open_array_section("peer_sites");
+ for (auto& status : m_mirror_image_global_status.site_statuses) {
+ m_formatter->open_object_section("peer_site");
+
+ auto name_it = m_peer_mirror_uuids_to_name.find(status.mirror_uuid);
+ m_formatter->dump_string("site_name",
+ (name_it != m_peer_mirror_uuids_to_name.end() ?
+ name_it->second : ""));
+ m_formatter->dump_string("mirror_uuids", status.mirror_uuid);
+
+ m_formatter->dump_string(
+ "state", utils::mirror_image_site_status_state(status));
+ m_formatter->dump_string("description", status.description);
+ m_formatter->dump_string("last_update", utils::timestr(
+ status.last_update));
+ m_formatter->close_section(); // peer_site
+ }
+ m_formatter->close_section(); // peer_sites
+ }
+ m_formatter->close_section(); // image
+ } else {
+ // Plain-text output path; mirrors the structured output above.
+ std::cout << std::endl
+ << m_mirror_image_global_status.name << ":" << std::endl
+ << " global_id: "
+ << m_mirror_image_global_status.info.global_id << std::endl;
+ if (local_site_r >= 0) {
+ std::cout << " state: " << utils::mirror_image_site_status_state(
+ local_status) << std::endl
+ << " description: " << local_status.description << std::endl;
+ if (mirror_service != nullptr) {
+ std::cout << " service: " <<
+ mirror_service->get_image_description() << std::endl;
+ }
+ std::cout << " last_update: " << utils::timestr(
+ local_status.last_update) << std::endl;
+ }
+ if (!m_mirror_image_global_status.site_statuses.empty()) {
+ std::cout << " peer_sites:" << std::endl;
+ bool first_site = true;
+ for (auto& site : m_mirror_image_global_status.site_statuses) {
+ if (!first_site) {
+ std::cout << std::endl;
+ }
+ first_site = false;
+
+ auto name_it = m_peer_mirror_uuids_to_name.find(site.mirror_uuid);
+ std::cout << " name: "
+ << (name_it != m_peer_mirror_uuids_to_name.end() ?
+ name_it->second : site.mirror_uuid)
+ << std::endl
+ << " state: " << utils::mirror_image_site_status_state(
+ site) << std::endl
+ << " description: " << site.description << std::endl
+ << " last_update: " << utils::timestr(
+ site.last_update) << std::endl;
+ }
+ }
+ }
+ }
+
+ std::string get_action_type() const override {
+ return "status";
+ }
+
+private:
+ // All reference members are owned by the caller (execute_status) and
+ // must outlive this request.
+ const std::map<std::string, std::string> &m_instance_ids;
+ const std::vector<librbd::mirror_peer_site_t> &m_mirror_peers;
+ const std::map<std::string, std::string> &m_peer_mirror_uuids_to_name;
+ const MirrorDaemonServiceInfo &m_daemon_service_info;
+ at::Format::Formatter m_formatter;
+ std::string m_image_id;
+ librbd::mirror_image_global_status_t m_mirror_image_global_status;
+};
+
+// Factory functor: heap-allocates a RequestT, forwarding any
+// action-specific trailing arguments to its constructor.  The request
+// deletes itself on completion (see ImageRequestBase::handle_finalize).
+template <typename RequestT>
+class ImageRequestAllocator {
+public:
+ template <class... Args>
+ RequestT *operator()(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name, Args&&... args) {
+ return new RequestT(io_ctx, throttle, image_name,
+ std::forward<Args>(args)...);
+ }
+};
+
+// Drives a RequestT over every image in the pool/namespace, bounded by
+// an OrderedThrottle sized from rbd_concurrent_management_ops.
+template <typename RequestT>
+class ImageRequestGenerator {
+public:
+ template <class... Args>
+ ImageRequestGenerator(librados::IoCtx &io_ctx, Args&&... args)
+ : m_io_ctx(io_ctx),
+ m_factory(std::bind(ImageRequestAllocator<RequestT>(),
+ std::ref(m_io_ctx), std::ref(m_throttle),
+ std::placeholders::_1, std::forward<Args>(args)...)),
+ // NOTE(review): m_throttle is constructed after m_factory (member
+ // declaration order); this is safe only because the bind above
+ // stores a std::ref and never dereferences it during construction.
+ m_throttle(g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ true) {
+ }
+
+ // List all images and fire one request per image; returns the first
+ // failure observed by the throttle (or 0).
+ int execute() {
+ // use the alphabetical list of image names for pool-level
+ // mirror image operations
+ librbd::RBD rbd;
+ int r = rbd.list2(m_io_ctx, &m_images);
+ if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to list images within pool" << std::endl;
+ return r;
+ }
+
+ for (auto &image : m_images) {
+ auto request = m_factory(image.name);
+ request->send();
+ }
+
+ return m_throttle.wait_for_ret();
+ }
+private:
+ typedef std::function<RequestT*(const std::string&)> Factory;
+
+ librados::IoCtx &m_io_ctx;
+ Factory m_factory;
+
+ OrderedThrottle m_throttle;
+
+ std::vector<librbd::image_spec_t> m_images;
+
+};
+
+// Summarize per-state mirrored-image counts and derive an aggregate
+// image health: any state other than REPLAYING/STOPPED is at least
+// WARNING, and any ERROR state escalates to ERROR.
+// NOTE(review): *total_images is only accumulated (+=) here — the
+// caller is expected to zero-initialize it (execute_status does).
+int get_mirror_image_status(
+ librados::IoCtx& io_ctx, uint32_t* total_images,
+ std::map<librbd::mirror_image_status_state_t, int>* mirror_image_states,
+ MirrorHealth* mirror_image_health) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_image_status_summary(io_ctx, mirror_image_states);
+ if (r < 0) {
+ std::cerr << "rbd: failed to get status summary for mirrored images: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ *mirror_image_health = MIRROR_HEALTH_OK;
+ for (auto &it : *mirror_image_states) {
+ auto &state = it.first;
+ if (*mirror_image_health < MIRROR_HEALTH_WARNING &&
+ (state != MIRROR_IMAGE_STATUS_STATE_REPLAYING &&
+ state != MIRROR_IMAGE_STATUS_STATE_STOPPED)) {
+ *mirror_image_health = MIRROR_HEALTH_WARNING;
+ }
+ if (*mirror_image_health < MIRROR_HEALTH_ERROR &&
+ state == MIRROR_IMAGE_STATUS_STATE_ERROR) {
+ *mirror_image_health = MIRROR_HEALTH_ERROR;
+ }
+ *total_images += it.second;
+ }
+
+ return 0;
+}
+
+} // anonymous namespace
+
+// CLI arguments for 'mirror pool peer bootstrap create': pool (no
+// namespace) plus an optional site name.
+void get_peer_bootstrap_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ add_site_name_optional(options);
+}
+
+// Create a mirroring bootstrap token for the pool and print it on
+// stdout so it can be imported at the remote site.  Requires mirroring
+// to already be enabled on the pool; optionally sets the site name
+// first.  Returns 0 on success or a negative errno.
+int execute_peer_bootstrap_create(
+ const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ if (vm.count(SITE_NAME)) {
+ r = set_site_name(rados, vm[SITE_NAME].as<std::string>());
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ librbd::RBD rbd;
+ std::string token;
+ r = rbd.mirror_peer_bootstrap_create(io_ctx, &token);
+ if (r == -EEXIST) {
+ std::cerr << "rbd: mismatch with pre-existing RBD mirroring peer user caps"
+ << std::endl;
+ // BUG FIX: previously control fell through after printing the
+ // diagnostic, emitting an empty token and exiting with status 0.
+ // Propagate the error instead.
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to create mirroring bootstrap token: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ std::cout << token << std::endl;
+ return 0;
+}
+
+// CLI arguments for 'mirror pool peer bootstrap import': pool, optional
+// site name, the token path (accepted both positionally and as
+// --token-path), and an optional replication direction.
+void get_peer_bootstrap_import_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ add_site_name_optional(options);
+ positional->add_options()
+ ("token-path", po::value<std::string>(),
+ "bootstrap token file (or '-' for stdin)");
+ options->add_options()
+ ("token-path", po::value<std::string>(),
+ "bootstrap token file (or '-' for stdin)");
+ add_direction_optional(options);
+}
+
+// Import a bootstrap token (from a file or stdin) to establish a peer
+// relationship with the remote site.  Defaults to rx-tx direction
+// unless --direction was given.  Returns 0 on success or a negative
+// errno.
+int execute_peer_bootstrap_import(
+ const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ // Token path may be given as an option or as the next positional arg.
+ std::string token_path;
+ if (vm.count("token-path")) {
+ token_path = vm["token-path"].as<std::string>();
+ } else {
+ token_path = utils::get_positional_argument(vm, arg_index++);
+ }
+
+ if (token_path.empty()) {
+ std::cerr << "rbd: token path was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ rbd_mirror_peer_direction_t mirror_peer_direction =
+ RBD_MIRROR_PEER_DIRECTION_RX_TX;
+ if (vm.count("direction")) {
+ mirror_peer_direction = vm["direction"].as<rbd_mirror_peer_direction_t>();
+ }
+
+ // '-' selects stdin; otherwise open the token file read-only.
+ int fd = STDIN_FILENO;
+ if (token_path != "-") {
+ fd = open(token_path.c_str(), O_RDONLY|O_BINARY);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << token_path << std::endl;
+ return r;
+ }
+ }
+
+ // Tokens are small; a fixed 1023-byte read (NUL-terminated buffer)
+ // is sufficient.
+ char token[1024];
+ memset(token, 0, sizeof(token));
+ r = safe_read(fd, token, sizeof(token) - 1);
+ if (fd != STDIN_FILENO) {
+ VOID_TEMP_FAILURE_RETRY(close(fd));
+ }
+
+ if (r < 0) {
+ std::cerr << "rbd: error reading token file: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ if (vm.count(SITE_NAME)) {
+ r = set_site_name(rados, vm[SITE_NAME].as<std::string>());
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ librbd::RBD rbd;
+ r = rbd.mirror_peer_bootstrap_import(io_ctx, mirror_peer_direction, token);
+ if (r == -ENOSYS) {
+ std::cerr << "rbd: mirroring is not enabled on remote peer" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to import peer bootstrap token" << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+// CLI arguments for 'mirror pool peer add': pool, the remote cluster
+// spec, optional remote connection attributes, and direction.
+void get_peer_add_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ positional->add_options()
+ ("remote-cluster-spec", "remote cluster spec\n"
+ "(example: [<client name>@]<cluster name>)");
+ options->add_options()
+ ("remote-client-name", po::value<std::string>(), "remote client name")
+ ("remote-cluster", po::value<std::string>(), "remote cluster name")
+ ("remote-mon-host", po::value<std::string>(), "remote mon host(s)")
+ ("remote-key-file", po::value<std::string>(),
+ "path to file containing remote key");
+ add_direction_optional(options);
+}
+
+// Manually register a mirror peer for the pool (the non-bootstrap
+// path).  Enforces a single RX-capable peer, optionally stores
+// mon-host/key attributes in the config-key store, and prints the new
+// peer UUID on success.
+int execute_peer_add(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ // Default remote client name to the local client name; the spec
+ // argument and options may override it.
+ std::string remote_client_name = g_ceph_context->_conf->name.to_str();
+ std::string remote_cluster;
+ std::map<std::string, std::string> attributes;
+ r = get_remote_cluster_spec(
+ vm, utils::get_positional_argument(vm, arg_index),
+ &remote_client_name, &remote_cluster, &attributes);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO: temporary restriction to prevent adding multiple peers
+ // until rbd-mirror daemon can properly handle the scenario
+ librbd::RBD rbd;
+ std::vector<librbd::mirror_peer_site_t> mirror_peers;
+ r = rbd.mirror_peer_site_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ std::cerr << "rbd: failed to list mirror peers" << std::endl;
+ return r;
+ }
+
+ // ignore tx-only peers since the restriction is for rx
+ mirror_peers.erase(
+ std::remove_if(
+ mirror_peers.begin(), mirror_peers.end(),
+ [](const librbd::mirror_peer_site_t& peer) {
+ return (peer.direction == RBD_MIRROR_PEER_DIRECTION_TX);
+ }),
+ mirror_peers.end());
+
+ if (!mirror_peers.empty()) {
+ std::cerr << "rbd: multiple RX peers are not currently supported"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ rbd_mirror_peer_direction_t mirror_peer_direction =
+ RBD_MIRROR_PEER_DIRECTION_RX_TX;
+ if (vm.count("direction")) {
+ mirror_peer_direction = vm["direction"].as<rbd_mirror_peer_direction_t>();
+ }
+
+ std::string uuid;
+ r = rbd.mirror_peer_site_add(
+ io_ctx, &uuid, mirror_peer_direction, remote_cluster, remote_client_name);
+ if (r == -EEXIST) {
+ std::cerr << "rbd: mirror peer already exists" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: error adding mirror peer" << std::endl;
+ return r;
+ }
+
+ // Persist optional connection attributes (mon host / key) alongside
+ // the new peer.
+ if (!attributes.empty()) {
+ r = set_peer_config_key(io_ctx, uuid, std::move(attributes));
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ std::cout << uuid << std::endl;
+ return 0;
+}
+
+// CLI arguments for 'mirror pool peer remove': pool plus the peer UUID.
+void get_peer_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ add_uuid_option(positional);
+}
+
+// Remove the mirror peer identified by UUID from the pool.  Returns 0
+// on success or a negative errno.
+int execute_peer_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string uuid;
+ r = get_uuid(vm, arg_index, &uuid);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.mirror_peer_site_remove(io_ctx, uuid);
+ if (r < 0) {
+ std::cerr << "rbd: error removing mirror peer" << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+// CLI arguments for 'mirror pool peer set': pool, peer UUID, and the
+// key/value pair to update.
+void get_peer_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ add_uuid_option(positional);
+ positional->add_options()
+ ("key", "peer parameter\n"
+ "(direction, site-name, client, mon-host, key-file)")
+ ("value", "new value for specified key\n"
+ "(rx-only, tx-only, or rx-tx for direction)");
+}
+
+// Update a single attribute of an existing mirror peer.  Supported
+// keys: direction, site-name/cluster, client, mon-host, key-file (the
+// latter is read into the "key" config-key attribute).  Changing the
+// direction to an RX-capable mode is rejected if another RX peer
+// already exists.  Returns 0 on success or a negative errno.
+int execute_peer_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string uuid;
+ r = get_uuid(vm, arg_index++, &uuid);
+ if (r < 0) {
+ return r;
+ }
+
+ std::set<std::string> valid_keys{{"direction", "site-name", "cluster",
+ "client", "mon-host", "key-file"}};
+ std::string key = utils::get_positional_argument(vm, arg_index++);
+ if (valid_keys.find(key) == valid_keys.end()) {
+ std::cerr << "rbd: must specify ";
+ for (auto& valid_key : valid_keys) {
+ std::cerr << "'" << valid_key << "'";
+ if (&valid_key != &(*valid_keys.rbegin())) {
+ std::cerr << ", ";
+ }
+ }
+ std::cerr << " key." << std::endl;
+ return -EINVAL;
+ }
+
+ std::string value = utils::get_positional_argument(vm, arg_index++);
+ if (value.empty() && (key == "client" || key == "cluster")) {
+ std::cerr << "rbd: must specify new " << key << " value." << std::endl;
+ // BUG FIX: previously execution fell through after the diagnostic
+ // and attempted to apply the empty value; treat a missing value as
+ // a usage error instead.
+ return -EINVAL;
+ } else if (key == "key-file") {
+ // Translate the file path into the stored "key" attribute value.
+ key = "key";
+ r = read_key_file(value, &value);
+ if (r < 0) {
+ return r;
+ }
+ } else if (key == "mon-host") {
+ key = "mon_host";
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ if (key == "client") {
+ r = rbd.mirror_peer_site_set_client_name(io_ctx, uuid.c_str(),
+ value.c_str());
+ } else if (key == "site-name" || key == "cluster") {
+ r = rbd.mirror_peer_site_set_name(io_ctx, uuid.c_str(), value.c_str());
+ } else if (key == "direction") {
+ // Re-use the program_options validator to parse the direction value.
+ MirrorPeerDirection tag;
+ boost::any direction;
+ try {
+ validate(direction, {value}, &tag, 1);
+ } catch (...) {
+ std::cerr << "rbd: invalid direction" << std::endl;
+ return -EINVAL;
+ }
+
+ auto peer_direction = boost::any_cast<rbd_mirror_peer_direction_t>(
+ direction);
+ if (peer_direction != RBD_MIRROR_PEER_DIRECTION_TX) {
+ // TODO: temporary restriction to prevent adding multiple peers
+ // until rbd-mirror daemon can properly handle the scenario
+ std::vector<librbd::mirror_peer_site_t> mirror_peers;
+ r = rbd.mirror_peer_site_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ std::cerr << "rbd: failed to list mirror peers" << std::endl;
+ return r;
+ }
+
+ // ignore peer to be updated and tx-only peers since the restriction is
+ // for rx
+ mirror_peers.erase(
+ std::remove_if(
+ mirror_peers.begin(), mirror_peers.end(),
+ [uuid](const librbd::mirror_peer_site_t& peer) {
+ return (peer.uuid == uuid ||
+ peer.direction == RBD_MIRROR_PEER_DIRECTION_TX);
+ }),
+ mirror_peers.end());
+
+ if (!mirror_peers.empty()) {
+ std::cerr << "rbd: multiple RX peers are not currently supported"
+ << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ r = rbd.mirror_peer_site_set_direction(io_ctx, uuid, peer_direction);
+ } else {
+ // Remaining keys (mon_host, key) live in the config-key store.
+ r = update_peer_config_key(io_ctx, uuid, key, value);
+ }
+
+ if (r == -ENOENT) {
+ std::cerr << "rbd: mirror peer " << uuid << " does not exist"
+ << std::endl;
+ }
+
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+// CLI arguments for 'mirror pool disable': pool (namespace allowed).
+void get_disable_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+}
+
+// CLI arguments for 'mirror pool enable': pool (namespace allowed),
+// the mirroring mode, and an optional site name.
+void get_enable_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ positional->add_options()
+ ("mode", "mirror mode [image or pool]");
+ add_site_name_optional(options);
+}
+
+// Shared implementation for enable/disable: transition the pool's
+// mirror mode to next_mirror_mode.  A no-op transition prints an
+// informational message unless ignore_no_update is set (used when the
+// site name changed and the message would be misleading).
+int execute_enable_disable(librados::IoCtx& io_ctx,
+ rbd_mirror_mode_t next_mirror_mode,
+ const std::string &mode, bool ignore_no_update) {
+ librbd::RBD rbd;
+ rbd_mirror_mode_t current_mirror_mode;
+ int r = rbd.mirror_mode_get(io_ctx, &current_mirror_mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (current_mirror_mode == next_mirror_mode) {
+ if (!ignore_no_update) {
+ if (mode == "disabled") {
+ std::cout << "rbd: mirroring is already " << mode << std::endl;
+ } else {
+ std::cout << "rbd: mirroring is already configured for "
+ << mode << " mode" << std::endl;
+ }
+ }
+ return 0;
+ } else if (next_mirror_mode == RBD_MIRROR_MODE_IMAGE &&
+ current_mirror_mode == RBD_MIRROR_MODE_POOL) {
+ std::cout << "note: changing mirroring mode from pool to image"
+ << std::endl;
+ } else if (next_mirror_mode == RBD_MIRROR_MODE_POOL &&
+ current_mirror_mode == RBD_MIRROR_MODE_IMAGE) {
+ std::cout << "note: changing mirroring mode from image to pool"
+ << std::endl;
+ }
+
+ r = rbd.mirror_mode_set(io_ctx, next_mirror_mode);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+// 'mirror pool disable': thin wrapper that resolves the pool/namespace
+// and delegates to execute_enable_disable() with DISABLED mode.
+int execute_disable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ return execute_enable_disable(io_ctx, RBD_MIRROR_MODE_DISABLED, "disabled",
+ false);
+}
+
+// 'mirror pool enable': parse the requested mode (image/pool),
+// optionally update the site name, then delegate to
+// execute_enable_disable().  The "already configured" message is
+// suppressed when only the site name changed.
+int execute_enable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ rbd_mirror_mode_t mirror_mode;
+ std::string mode = utils::get_positional_argument(vm, arg_index++);
+ if (mode == "image") {
+ mirror_mode = RBD_MIRROR_MODE_IMAGE;
+ } else if (mode == "pool") {
+ mirror_mode = RBD_MIRROR_MODE_POOL;
+ } else {
+ std::cerr << "rbd: must specify 'image' or 'pool' mode." << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ bool updated = false;
+ if (vm.count(SITE_NAME)) {
+ librbd::RBD rbd;
+
+ // Detect whether the site name actually changes; a lookup failure
+ // here is ignored because set_site_name() below will surface it.
+ auto site_name = vm[SITE_NAME].as<std::string>();
+ std::string original_site_name;
+ r = rbd.mirror_site_name_get(rados, &original_site_name);
+ updated = (r >= 0 && site_name != original_site_name);
+
+ r = set_site_name(rados, site_name);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return execute_enable_disable(io_ctx, mirror_mode, mode, updated);
+}
+
+// CLI arguments for 'mirror pool info': pool (namespace allowed),
+// output format, and --all to include peer secrets/attributes.
+void get_info_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+ options->add_options()
+ (ALL_NAME.c_str(), po::bool_switch(), "list all attributes");
+}
+
+// 'mirror pool info': print the pool's mirror mode and, when enabled
+// at the pool level (not per-namespace), the site name and peer list.
+int execute_info(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ rbd_mirror_mode_t mirror_mode;
+ r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+ if (r < 0) {
+ return r;
+ }
+
+ // -EOPNOTSUPP is tolerated: older clusters have no site name support.
+ std::string site_name;
+ r = rbd.mirror_site_name_get(rados, &site_name);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+
+ // Peers are a pool-level concept; skip when a namespace was given.
+ std::vector<librbd::mirror_peer_site_t> mirror_peers;
+ if (namespace_name.empty()) {
+ r = rbd.mirror_peer_site_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ std::string mirror_mode_desc;
+ switch (mirror_mode) {
+ case RBD_MIRROR_MODE_DISABLED:
+ mirror_mode_desc = "disabled";
+ break;
+ case RBD_MIRROR_MODE_IMAGE:
+ mirror_mode_desc = "image";
+ break;
+ case RBD_MIRROR_MODE_POOL:
+ mirror_mode_desc = "pool";
+ break;
+ default:
+ mirror_mode_desc = "unknown";
+ break;
+ }
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("mirror");
+ formatter->dump_string("mode", mirror_mode_desc);
+ } else {
+ std::cout << "Mode: " << mirror_mode_desc << std::endl;
+ }
+
+ if (mirror_mode != RBD_MIRROR_MODE_DISABLED && namespace_name.empty()) {
+ if (formatter != nullptr) {
+ formatter->dump_string("site_name", site_name);
+ } else {
+ std::cout << "Site Name: " << site_name << std::endl
+ << std::endl;
+ }
+
+ r = format_mirror_peers(io_ctx, formatter, mirror_peers,
+ vm[ALL_NAME].as<bool>());
+ if (r < 0) {
+ return r;
+ }
+ }
+ if (formatter != nullptr) {
+ formatter->close_section();
+ formatter->flush(std::cout);
+ }
+ return 0;
+}
+
+// CLI arguments for 'mirror pool status': pool (namespace allowed),
+// output format, and --verbose for per-daemon/per-image detail.
+void get_status_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+ at::add_verbose_option(options);
+}
+
+// 'mirror pool status': print an aggregate health summary (image +
+// daemon health, per-state image counts) and, with --verbose, a
+// per-daemon section followed by a per-image section driven by
+// StatusImageRequest via ImageRequestGenerator.
+int execute_status(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ bool verbose = vm[at::VERBOSE].as<bool>();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+
+ // Aggregate per-state counts and derive image health.
+ uint32_t total_images = 0;
+ std::map<librbd::mirror_image_status_state_t, int> mirror_image_states;
+ MirrorHealth mirror_image_health = MIRROR_HEALTH_UNKNOWN;
+ r = get_mirror_image_status(io_ctx, &total_images, &mirror_image_states,
+ &mirror_image_health);
+ if (r < 0) {
+ return r;
+ }
+
+ MirrorDaemonServiceInfo daemon_service_info(io_ctx);
+ daemon_service_info.init();
+
+ MirrorHealth mirror_daemon_health = daemon_service_info.get_daemon_health();
+ auto mirror_services = daemon_service_info.get_mirror_services();
+
+ // Overall health is the worse of image and daemon health.
+ auto mirror_health = std::max(mirror_image_health, mirror_daemon_health);
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("status");
+ formatter->open_object_section("summary");
+ formatter->dump_stream("health") << mirror_health;
+ formatter->dump_stream("daemon_health") << mirror_daemon_health;
+ formatter->dump_stream("image_health") << mirror_image_health;
+ formatter->open_object_section("states");
+ for (auto &it : mirror_image_states) {
+ std::string state_name = utils::mirror_image_status_state(it.first);
+ formatter->dump_int(state_name.c_str(), it.second);
+ }
+ formatter->close_section(); // states
+ formatter->close_section(); // summary
+ } else {
+ std::cout << "health: " << mirror_health << std::endl;
+ std::cout << "daemon health: " << mirror_daemon_health << std::endl;
+ std::cout << "image health: " << mirror_image_health << std::endl;
+ std::cout << "images: " << total_images << " total" << std::endl;
+ for (auto &it : mirror_image_states) {
+ std::cout << " " << it.second << " "
+ << utils::mirror_image_status_state(it.first) << std::endl;
+ }
+ }
+
+ int ret = 0;
+
+ if (verbose) {
+ // dump per-daemon status
+ if (formatter != nullptr) {
+ formatter->open_array_section("daemons");
+ for (auto& mirror_service : mirror_services) {
+ formatter->open_object_section("daemon");
+ formatter->dump_string("service_id", mirror_service.service_id);
+ formatter->dump_string("instance_id", mirror_service.instance_id);
+ formatter->dump_string("client_id", mirror_service.client_id);
+ formatter->dump_string("hostname", mirror_service.hostname);
+ formatter->dump_string("ceph_version", mirror_service.ceph_version);
+ formatter->dump_bool("leader", mirror_service.leader);
+ formatter->dump_stream("health") << mirror_service.health;
+ if (!mirror_service.callouts.empty()) {
+ formatter->open_array_section("callouts");
+ for (auto& callout : mirror_service.callouts) {
+ formatter->dump_string("callout", callout);
+ }
+ formatter->close_section(); // callouts
+ }
+ formatter->close_section(); // daemon
+ }
+ formatter->close_section(); // daemons
+ } else {
+ std::cout << std::endl << "DAEMONS" << std::endl;
+ if (mirror_services.empty()) {
+ std::cout << " none" << std::endl;
+ }
+ for (auto& mirror_service : mirror_services) {
+ std::cout << "service " << mirror_service.service_id << ":"
+ << std::endl
+ << " instance_id: " << mirror_service.instance_id
+ << std::endl
+ << " client_id: " << mirror_service.client_id << std::endl
+ << " hostname: " << mirror_service.hostname << std::endl
+ << " version: " << mirror_service.ceph_version << std::endl
+ << " leader: " << (mirror_service.leader ? "true" : "false")
+ << std::endl
+ << " health: " << mirror_service.health << std::endl;
+ if (!mirror_service.callouts.empty()) {
+ std::cout << " callouts: " << mirror_service.callouts << std::endl;
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ }
+
+ // dump per-image status
+ // Peer sites are recorded in the default namespace, so duplicate
+ // the io_ctx and clear its namespace for the lookup.
+ librados::IoCtx default_ns_io_ctx;
+ default_ns_io_ctx.dup(io_ctx);
+ default_ns_io_ctx.set_namespace("");
+ std::vector<librbd::mirror_peer_site_t> mirror_peers;
+ utils::get_mirror_peer_sites(default_ns_io_ctx, &mirror_peers);
+
+ std::map<std::string, std::string> peer_mirror_uuids_to_name;
+ utils::get_mirror_peer_mirror_uuids_to_names(mirror_peers,
+ &peer_mirror_uuids_to_name);
+
+ if (formatter != nullptr) {
+ formatter->open_array_section("images");
+ } else {
+ std::cout << "IMAGES";
+ }
+
+ std::map<std::string, std::string> instance_ids;
+
+ // Page through the image-id -> rbd-mirror instance-id mapping in
+ // batches of 1024; a failure here is reported but non-fatal.
+ std::string start_image_id;
+ while (true) {
+ std::map<std::string, std::string> ids;
+ r = rbd.mirror_image_instance_id_list(io_ctx, start_image_id, 1024, &ids);
+ if (r < 0) {
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: newer release of Ceph OSDs required to map image "
+ << "to rbd-mirror daemon instance" << std::endl;
+ } else {
+ std::cerr << "rbd: failed to get instance id list: "
+ << cpp_strerror(r) << std::endl;
+ }
+ // not fatal
+ break;
+ }
+ if (ids.empty()) {
+ break;
+ }
+ instance_ids.insert(ids.begin(), ids.end());
+ start_image_id = ids.rbegin()->first;
+ }
+
+ ImageRequestGenerator<StatusImageRequest> generator(
+ io_ctx, instance_ids, mirror_peers, peer_mirror_uuids_to_name,
+ daemon_service_info, formatter);
+ ret = generator.execute();
+
+ if (formatter != nullptr) {
+ formatter->close_section(); // images
+ }
+ }
+
+ if (formatter != nullptr) {
+ formatter->close_section(); // status
+ formatter->flush(std::cout);
+ }
+
+ return ret;
+}
+
+// CLI arguments for 'mirror pool promote': optional --force plus pool
+// (namespace allowed).
+void get_promote_arguments(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("force", po::bool_switch(),
+ "promote even if not cleanly demoted by remote cluster");
+ at::add_pool_options(positional, options, true);
+}
+
+// Promote every non-primary mirrored image in the selected pool/namespace.
+// Returns 0 on success or a negative errno from the first failed step.
+int execute_promote(const po::variables_map &vm,
+                    const std::vector<std::string> &ceph_global_init_args) {
+  // Resolve the pool/namespace specification from the command line.
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int ret = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+                                                &namespace_name, &arg_index);
+  if (ret < 0) {
+    return ret;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  ret = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = validate_mirroring_enabled(io_ctx);
+  if (ret < 0) {
+    return ret;
+  }
+
+  utils::disable_cache();
+
+  // Tally of images actually promoted by the parallel per-image requests.
+  std::atomic<unsigned> promoted_count{0};
+  ImageRequestGenerator<PromoteImageRequest> generator(
+    io_ctx, &promoted_count, vm["force"].as<bool>());
+  ret = generator.execute();
+
+  std::cout << "Promoted " << promoted_count.load() << " mirrored images" << std::endl;
+  return ret;
+}
+
+// "mirror pool demote" takes only the standard pool/namespace arguments.
+void get_demote_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+}
+
+// Demote every primary mirrored image in the selected pool/namespace.
+// Returns 0 on success or a negative errno from the first failed step.
+int execute_demote(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  // Resolve the pool/namespace specification from the command line.
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int ret = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+                                                &namespace_name, &arg_index);
+  if (ret < 0) {
+    return ret;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  ret = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = validate_mirroring_enabled(io_ctx);
+  if (ret < 0) {
+    return ret;
+  }
+
+  utils::disable_cache();
+
+  // Tally of images actually demoted by the parallel per-image requests.
+  std::atomic<unsigned> demoted_count{0};
+  ImageRequestGenerator<DemoteImageRequest> generator(io_ctx, &demoted_count);
+  ret = generator.execute();
+
+  std::cout << "Demoted " << demoted_count.load() << " mirrored images" << std::endl;
+  return ret;
+}
+
+// Static shell registration for the peer bootstrap token commands.
+Shell::Action action_bootstrap_create(
+  {"mirror", "pool", "peer", "bootstrap", "create"}, {},
+  "Create a peer bootstrap token to import in a remote cluster", "",
+  &get_peer_bootstrap_create_arguments, &execute_peer_bootstrap_create);
+// Fixed typo in the registration variable name ("bootstreap" -> "bootstrap");
+// the identifier is only referenced by this static initializer.
+Shell::Action action_bootstrap_import(
+  {"mirror", "pool", "peer", "bootstrap", "import"}, {},
+  "Import a peer bootstrap token created from a remote cluster", "",
+  &get_peer_bootstrap_import_arguments, &execute_peer_bootstrap_import);
+
+// Peer management commands.
+Shell::Action action_add(
+  {"mirror", "pool", "peer", "add"}, {},
+  "Add a mirroring peer to a pool.", "",
+  &get_peer_add_arguments, &execute_peer_add);
+Shell::Action action_remove(
+  {"mirror", "pool", "peer", "remove"}, {},
+  "Remove a mirroring peer from a pool.", "",
+  &get_peer_remove_arguments, &execute_peer_remove);
+Shell::Action action_set(
+  {"mirror", "pool", "peer", "set"}, {},
+  "Update mirroring peer settings.", "",
+  &get_peer_set_arguments, &execute_peer_set);
+
+// Pool-level enable/disable, reporting, and bulk promotion/demotion commands.
+Shell::Action action_disable(
+  {"mirror", "pool", "disable"}, {},
+  "Disable RBD mirroring by default within a pool.", "",
+  &get_disable_arguments, &execute_disable);
+Shell::Action action_enable(
+  {"mirror", "pool", "enable"}, {},
+  "Enable RBD mirroring by default within a pool.", "",
+  &get_enable_arguments, &execute_enable);
+Shell::Action action_info(
+  {"mirror", "pool", "info"}, {},
+  "Show information about the pool mirroring configuration.", {},
+  &get_info_arguments, &execute_info);
+Shell::Action action_status(
+  {"mirror", "pool", "status"}, {},
+  "Show status for all mirrored images in the pool.", {},
+  &get_status_arguments, &execute_status);
+Shell::Action action_promote(
+  {"mirror", "pool", "promote"}, {},
+  "Promote all non-primary images in the pool.", {},
+  &get_promote_arguments, &execute_promote);
+Shell::Action action_demote(
+  {"mirror", "pool", "demote"}, {},
+  "Demote all primary images in the pool.", {},
+  &get_demote_arguments, &execute_demote);
+
+} // namespace mirror_pool
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MirrorSnapshotSchedule.cc b/src/tools/rbd/action/MirrorSnapshotSchedule.cc
new file mode 100644
index 000000000..3f269c2ad
--- /dev/null
+++ b/src/tools/rbd/action/MirrorSnapshotSchedule.cc
@@ -0,0 +1,322 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Schedule.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#include "include/stringify.h"
+
+#include <iostream>
+#include <list>
+#include <map>
+#include <string>
+#include <boost/program_options.hpp>
+
+#include "json_spirit/json_spirit.h"
+
+namespace rbd {
+namespace action {
+namespace mirror_snapshot_schedule {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+// Parsed representation of the JSON returned by the rbd_support mgr module
+// for "rbd mirror snapshot schedule status".
+class ScheduleStatus {
+public:
+  ScheduleStatus() {
+  }
+
+  // Parse the mgr module's JSON payload.  Returns 0 on success, or -EBADMSG
+  // when the payload is not valid JSON or does not have the expected shape.
+  int parse(const std::string &status) {
+    json_spirit::mValue json_root;
+    if(!json_spirit::read(status, json_root)) {
+      std::cerr << "rbd: invalid schedule status JSON received" << std::endl;
+      return -EBADMSG;
+    }
+
+    try {
+      auto &s = json_root.get_obj();
+
+      if (s["scheduled_images"].type() != json_spirit::array_type) {
+        std::cerr << "rbd: unexpected schedule JSON received: "
+                  << "scheduled_images is not array" << std::endl;
+        return -EBADMSG;
+      }
+
+      // Each entry is an object with "schedule_time" and "image" strings.
+      for (auto &item_val : s["scheduled_images"].get_array()) {
+        if (item_val.type() != json_spirit::obj_type) {
+          std::cerr << "rbd: unexpected schedule status JSON received: "
+                    << "schedule item is not object" << std::endl;
+          return -EBADMSG;
+        }
+
+        auto &item = item_val.get_obj();
+
+        if (item["schedule_time"].type() != json_spirit::str_type) {
+          std::cerr << "rbd: unexpected schedule JSON received: "
+                    << "schedule_time is not string" << std::endl;
+          return -EBADMSG;
+        }
+        auto schedule_time = item["schedule_time"].get_str();
+
+        if (item["image"].type() != json_spirit::str_type) {
+          std::cerr << "rbd: unexpected schedule JSON received: "
+                    << "image is not string" << std::endl;
+          return -EBADMSG;
+        }
+        auto image = item["image"].get_str();
+
+        scheduled_images.push_back({schedule_time, image});
+      }
+
+    } catch (std::runtime_error &) {
+      // json_spirit throws on type accesses not guarded by the checks above.
+      std::cerr << "rbd: invalid schedule JSON received" << std::endl;
+      return -EBADMSG;
+    }
+
+    return 0;
+  }
+
+  // Emit the parsed entries via a ceph Formatter (for --format json/xml).
+  void dump(Formatter *f) {
+    f->open_array_section("scheduled_images");
+    for (auto &image : scheduled_images) {
+      f->open_object_section("image");
+      f->dump_string("schedule_time", image.first);
+      f->dump_string("image", image.second);
+      f->close_section(); // image
+    }
+    f->close_section(); // scheduled_images
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, ScheduleStatus &d);
+
+private:
+
+  // (schedule_time, image name) pairs in the order received from the mgr.
+  std::list<std::pair<std::string, std::string>> scheduled_images;
+};
+
+// Render the scheduled images as a two-column plain-text table.
+std::ostream& operator<<(std::ostream& os, ScheduleStatus &s) {
+  TextTable tbl;
+  tbl.define_column("SCHEDULE TIME", TextTable::LEFT, TextTable::LEFT);
+  tbl.define_column("IMAGE", TextTable::LEFT, TextTable::LEFT);
+
+  for (auto &entry : s.scheduled_images) {
+    tbl << entry.first << entry.second << TextTable::endrow;
+  }
+
+  os << tbl;
+  return os;
+}
+
+} // anonymous namespace
+
+// "schedule add" arguments: level-spec option plus the positional schedule
+// specification (interval required).
+void get_arguments_add(po::options_description *positional,
+                       po::options_description *options) {
+  add_level_spec_options(options);
+  add_schedule_options(positional, true);
+}
+
+// Forward a "mirror snapshot schedule add" request to the mgr module.
+int execute_add(const po::variables_map &vm,
+                const std::vector<std::string> &ceph_global_init_args) {
+  // Build the mgr command arguments from the level spec and schedule options.
+  std::map<std::string, std::string> args;
+
+  int ret = get_level_spec_args(vm, &args);
+  if (ret >= 0) {
+    ret = get_schedule_args(vm, true, &args);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  librados::Rados rados;
+  ret = utils::init_rados(&rados);
+  if (ret < 0) {
+    return ret;
+  }
+
+  normalize_level_spec_args(&args);
+  ret = utils::mgr_command(rados, "rbd mirror snapshot schedule add", args,
+                           &std::cout, &std::cerr);
+  return ret < 0 ? ret : 0;
+}
+
+// "schedule remove" arguments: level-spec option plus the positional schedule
+// specification (interval optional).
+void get_arguments_remove(po::options_description *positional,
+                          po::options_description *options) {
+  add_level_spec_options(options);
+  add_schedule_options(positional, false);
+}
+
+// Forward a "mirror snapshot schedule remove" request to the mgr module.
+int execute_remove(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  // Build the mgr command arguments from the level spec and schedule options.
+  std::map<std::string, std::string> args;
+
+  int ret = get_level_spec_args(vm, &args);
+  if (ret >= 0) {
+    ret = get_schedule_args(vm, false, &args);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  librados::Rados rados;
+  ret = utils::init_rados(&rados);
+  if (ret < 0) {
+    return ret;
+  }
+
+  normalize_level_spec_args(&args);
+  ret = utils::mgr_command(rados, "rbd mirror snapshot schedule remove", args,
+                           &std::cout, &std::cerr);
+  return ret < 0 ? ret : 0;
+}
+
+// "schedule list" arguments: level spec, --recursive/-R, and output format.
+void get_arguments_list(po::options_description *positional,
+                        po::options_description *options) {
+  add_level_spec_options(options);
+  options->add_options()
+    ("recursive,R", po::bool_switch(), "list all schedules");
+  at::add_format_options(options);
+}
+
+// List mirror snapshot schedules: query the mgr module, then show either the
+// full hierarchy (-R) or only the schedule at the exact level spec.
+int execute_list(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  std::map<std::string, std::string> args;
+
+  int r = get_level_spec_args(vm, &args);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  r = utils::init_rados(&rados);
+  if (r < 0) {
+    return r;
+  }
+
+  // Ask the rbd_support mgr module for all configured schedules.
+  normalize_level_spec_args(&args);
+  std::stringstream out;
+  r = utils::mgr_command(rados, "rbd mirror snapshot schedule list", args, &out,
+                       &std::cerr);
+  if (r < 0) {
+    return r;
+  }
+
+  ScheduleList schedule_list;
+  r = schedule_list.parse(out.str());
+  if (r < 0) {
+    return r;
+  }
+
+  if (vm["recursive"].as<bool>()) {
+    // -R: dump every schedule at or below the requested level.
+    if (formatter.get()) {
+      schedule_list.dump(formatter.get());
+      formatter->flush(std::cout);
+    } else {
+      std::cout << schedule_list;
+    }
+  } else {
+    // Default: show only the schedule attached to the exact level spec.
+    auto schedule = schedule_list.find(args["level_spec"]);
+    if (schedule == nullptr) {
+      return -ENOENT;
+    }
+
+    if (formatter.get()) {
+      schedule->dump(formatter.get());
+      formatter->flush(std::cout);
+    } else {
+      std::cout << *schedule << std::endl;
+    }
+  }
+
+  return 0;
+}
+
+// "schedule status" arguments: level spec and output format options.
+void get_arguments_status(po::options_description *positional,
+                          po::options_description *options) {
+  add_level_spec_options(options);
+  at::add_format_options(options);
+}
+
+// Show upcoming scheduled mirror snapshots, as reported by the mgr module.
+// Returns 0 on success or a negative errno.
+int execute_status(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  std::map<std::string, std::string> args;
+
+  int r = get_level_spec_args(vm, &args);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  r = utils::init_rados(&rados);
+  if (r < 0) {
+    return r;
+  }
+
+  normalize_level_spec_args(&args);
+  std::stringstream out;
+  r = utils::mgr_command(rados, "rbd mirror snapshot schedule status", args,
+                         &out, &std::cerr);
+  // BUG FIX: the mgr_command result was previously ignored, so a failed
+  // command fell through to parsing an empty/partial buffer and surfaced a
+  // misleading "invalid JSON" error instead of the real errno.
+  if (r < 0) {
+    return r;
+  }
+
+  ScheduleStatus schedule_status;
+  r = schedule_status.parse(out.str());
+  if (r < 0) {
+    return r;
+  }
+
+  if (formatter.get()) {
+    schedule_status.dump(formatter.get());
+    formatter->flush(std::cout);
+  } else {
+    std::cout << schedule_status;
+  }
+
+  return 0;
+}
+
+// Static shell registration for the "mirror snapshot schedule" subcommands.
+Shell::Action add_action(
+  {"mirror", "snapshot", "schedule", "add"}, {},
+  "Add mirror snapshot schedule.", "", &get_arguments_add, &execute_add);
+Shell::Action remove_action(
+  {"mirror", "snapshot", "schedule", "remove"},
+  {"mirror", "snapshot", "schedule", "rm"}, "Remove mirror snapshot schedule.",
+  "", &get_arguments_remove, &execute_remove);
+Shell::Action list_action(
+  {"mirror", "snapshot", "schedule", "list"},
+  {"mirror", "snapshot", "schedule", "ls"}, "List mirror snapshot schedule.",
+  "", &get_arguments_list, &execute_list);
+Shell::Action status_action(
+  {"mirror", "snapshot", "schedule", "status"}, {},
+  "Show mirror snapshot schedule status.", "", &get_arguments_status, &execute_status);
+
+} // namespace mirror_snapshot_schedule
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Namespace.cc b/src/tools/rbd/action/Namespace.cc
new file mode 100644
index 000000000..12d92bff8
--- /dev/null
+++ b/src/tools/rbd/action/Namespace.cc
@@ -0,0 +1,191 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <algorithm>
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace ns {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// "namespace create" takes the standard pool spec with a namespace argument.
+void get_create_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+}
+
+// Create an RBD image namespace inside a pool.  Returns 0 on success or a
+// negative errno.
+int execute_create(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+                                              &namespace_name, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  if (namespace_name.empty()) {
+    std::cerr << "rbd: namespace name was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  // Open the pool's default namespace; the new namespace is created in it.
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, "", &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = rbd.namespace_create(io_ctx, namespace_name.c_str());
+  if (r < 0) {
+    // BUG FIX: corrected grammar in the user-facing error message
+    // ("failed to created namespace" -> "failed to create namespace").
+    std::cerr << "rbd: failed to create namespace: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// "namespace remove" takes the standard pool spec with a namespace argument.
+void get_remove_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+}
+
+// Remove an (empty) RBD image namespace from a pool.  Returns 0 on success
+// or a negative errno; common failures get dedicated error messages.
+int execute_remove(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+                                              &namespace_name, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  if (namespace_name.empty()) {
+    std::cerr << "rbd: namespace name was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  // Connect to the pool's default namespace to perform the removal.
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, "", &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = rbd.namespace_remove(io_ctx, namespace_name.c_str());
+  // Translate the common failure modes into friendlier messages.
+  if (r == -EBUSY) {
+    std::cerr << "rbd: namespace contains images which must be deleted first."
+              << std::endl;
+    return r;
+  } else if (r == -ENOENT) {
+    std::cerr << "rbd: namespace does not exist." << std::endl;
+    return r;
+  } else if (r < 0) {
+    std::cerr << "rbd: failed to remove namespace: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// "namespace list" takes a pool spec (no namespace) plus format options.
+void get_list_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_pool_options(positional, options, false);
+  at::add_format_options(options);
+}
+
+// List the namespaces in a pool, either formatted (json/xml) or as a table.
+int execute_list(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  size_t arg_index = 0;
+  // Only the pool is resolved; the namespace output pointer is null because
+  // this command enumerates namespaces rather than operating inside one.
+  int r = utils::get_pool_and_namespace_names(vm, true, &pool_name,
+                                              nullptr, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, "", &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  std::vector<std::string> names;
+  // -ENOENT just means no namespaces exist yet; treat it as an empty list.
+  r = rbd.namespace_list(io_ctx, &names);
+  if (r < 0 && r != -ENOENT) {
+    std::cerr << "rbd: failed to list namespaces: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  std::sort(names.begin(), names.end());
+
+  TextTable tbl;
+  if (formatter) {
+    formatter->open_array_section("namespaces");
+  } else {
+    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  for (auto& name : names) {
+    if (formatter) {
+      formatter->open_object_section("namespace");
+      formatter->dump_string("name", name);
+      formatter->close_section();
+    } else {
+      tbl << name << TextTable::endrow;
+    }
+  }
+
+  if (formatter) {
+    formatter->close_section();
+    formatter->flush(std::cout);
+  } else if (!names.empty()) {
+    std::cout << tbl;
+  }
+
+  return 0;
+}
+
+// Static shell registration for the "namespace" subcommands.
+Shell::Action action_create(
+  {"namespace", "create"}, {},
+  "Create an RBD image namespace.", "",
+  &get_create_arguments, &execute_create);
+
+Shell::Action action_remove(
+  {"namespace", "remove"}, {"namespace", "rm"},
+  "Remove an RBD image namespace.", "",
+  &get_remove_arguments, &execute_remove);
+
+Shell::Action action_list(
+  {"namespace", "list"}, {"namespace", "ls"}, "List RBD image namespaces.", "",
+  &get_list_arguments, &execute_list);
+
+} // namespace ns
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Nbd.cc b/src/tools/rbd/action/Nbd.cc
new file mode 100644
index 000000000..061fdfa30
--- /dev/null
+++ b/src/tools/rbd/action/Nbd.cc
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/SubProcess.h"
+#include <iostream>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace nbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Spawn the rbd-nbd helper binary with the given subcommand arguments,
+// forwarding the global ceph arguments first.  Returns 0 on success,
+// -EINVAL on spawn/exit failure, -EOVERFLOW if the helper path would not
+// fit, or -EOPNOTSUPP on platforms without nbd support.
+static int call_nbd_cmd(const po::variables_map &vm,
+                        const std::vector<std::string> &args,
+                        const std::vector<std::string> &ceph_global_init_args) {
+  #if defined(__FreeBSD__) || defined(_WIN32)
+  std::cerr << "rbd: nbd device is not supported" << std::endl;
+  return -EOPNOTSUPP;
+  #else
+  // Locate the helper next to the running binary: resolve /proc/self/exe and
+  // append "-nbd"; fall back to a PATH lookup of "rbd-nbd".
+  char exe_path[PATH_MAX];
+  ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path,
+				    sizeof(exe_path) - 1);
+  if (exe_path_bytes < 0) {
+    strcpy(exe_path, "rbd-nbd");
+  } else {
+    // BUG FIX: snprintf() reports truncation via a return value >= the
+    // remaining buffer space, not a negative value; the original "< 0" check
+    // only caught encoding errors and could silently execute a truncated
+    // helper path.
+    int suffix_len = snprintf(exe_path + exe_path_bytes,
+                              sizeof(exe_path) - exe_path_bytes,
+                              "-nbd");
+    if (suffix_len < 0 ||
+        static_cast<size_t>(suffix_len) >=
+          sizeof(exe_path) - exe_path_bytes) {
+      return -EOVERFLOW;
+    }
+  }
+
+  SubProcess process(exe_path, SubProcess::KEEP, SubProcess::KEEP, SubProcess::KEEP);
+
+  // Global ceph arguments precede the subcommand arguments.
+  for (auto &arg : ceph_global_init_args) {
+    process.add_cmd_arg(arg.c_str());
+  }
+
+  for (auto &arg : args) {
+    process.add_cmd_arg(arg.c_str());
+  }
+
+  if (process.spawn()) {
+    std::cerr << "rbd: failed to run rbd-nbd: " << process.err() << std::endl;
+    return -EINVAL;
+  } else if (process.join()) {
+    std::cerr << "rbd: rbd-nbd failed with error: " << process.err() << std::endl;
+    return -EINVAL;
+  }
+
+  return 0;
+  #endif
+}
+
+// "device list -t nbd": delegate to "rbd-nbd list-mapped", forwarding any
+// output-format options.
+int execute_list(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__) || defined(_WIN32)
+  std::cerr << "rbd: nbd device is not supported" << std::endl;
+  return -EOPNOTSUPP;
+#else
+  std::vector<std::string> args{"list-mapped"};
+
+  if (vm.count("format")) {
+    args.push_back("--format");
+    args.push_back(vm["format"].as<at::Format>().value);
+  }
+  if (vm["pretty-format"].as<bool>()) {
+    args.push_back("--pretty-format");
+  }
+
+  return call_nbd_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+// "device attach -t nbd": re-attach an image to an existing nbd device.
+// Builds the rbd-nbd argument vector in a fixed order and delegates to it.
+int execute_attach(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__) || defined(_WIN32)
+  std::cerr << "rbd: nbd device is not supported" << std::endl;
+  return -EOPNOTSUPP;
+#else
+  std::vector<std::string> args;
+  std::string device_path;
+
+  args.push_back("attach");
+  std::string img;
+  int r = utils::get_image_or_snap_spec(vm, &img);
+  if (r < 0) {
+    return r;
+  }
+  args.push_back(img);
+
+  // A target device is mandatory for attach (unlike map).
+  if (vm.count("device")) {
+    device_path = vm["device"].as<std::string>();
+    args.push_back("--device");
+    args.push_back(device_path);
+  } else {
+    std::cerr << "rbd: device was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  if (vm["show-cookie"].as<bool>()) {
+    args.push_back("--show-cookie");
+  }
+
+  // Attaching the wrong image to a device can corrupt data, so require
+  // either the cookie recorded at map time or an explicit --force override.
+  if (vm.count("cookie")) {
+    args.push_back("--cookie");
+    args.push_back(vm["cookie"].as<std::string>());
+  } else if (!vm["force"].as<bool>()) {
+    std::cerr << "rbd: could not validate attach request\n";
+    std::cerr << "rbd: mismatching the image and the device may lead to data corruption\n";
+    std::cerr << "rbd: must specify --cookie <arg> or --force to proceed" << std::endl;
+    return -EINVAL;
+  }
+
+  if (vm["quiesce"].as<bool>()) {
+    args.push_back("--quiesce");
+  }
+
+  if (vm["read-only"].as<bool>()) {
+    args.push_back("--read-only");
+  }
+
+  if (vm["exclusive"].as<bool>()) {
+    args.push_back("--exclusive");
+  }
+
+  if (vm.count("quiesce-hook")) {
+    args.push_back("--quiesce-hook");
+    args.push_back(vm["quiesce-hook"].as<std::string>());
+  }
+
+  // Forward any generic -o/--options values as additional rbd-nbd arguments.
+  if (vm.count("options")) {
+    utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
+                                  &args);
+  }
+
+  return call_nbd_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+// "device detach -t nbd": detach by device path (/dev/...) or image spec.
+int execute_detach(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__) || defined(_WIN32)
+  std::cerr << "rbd: nbd device is not supported" << std::endl;
+  return -EOPNOTSUPP;
+#else
+  // A positional argument is only treated as a device if it looks like one.
+  std::string device = utils::get_positional_argument(vm, 0);
+  if (!boost::starts_with(device, "/dev/")) {
+    device.clear();
+  }
+
+  // Otherwise fall back to resolving an image/snapshot specification.
+  std::string image;
+  if (device.empty()) {
+    int r = utils::get_image_or_snap_spec(vm, &image);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (device.empty() && image.empty()) {
+    std::cerr << "rbd: detach requires either image name or device path"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  std::vector<std::string> args{"detach",
+                                device.empty() ? image : device};
+
+  if (vm.count("options")) {
+    utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
+                                  &args);
+  }
+
+  return call_nbd_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+// "device map -t nbd": map an image/snapshot to a new nbd device, forwarding
+// the recognized flags to rbd-nbd in a fixed order.
+int execute_map(const po::variables_map &vm,
+                const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__) || defined(_WIN32)
+  std::cerr << "rbd: nbd device is not supported" << std::endl;
+  return -EOPNOTSUPP;
+#else
+  std::vector<std::string> args;
+
+  args.push_back("map");
+  std::string img;
+  int r = utils::get_image_or_snap_spec(vm, &img);
+  if (r < 0) {
+    return r;
+  }
+  args.push_back(img);
+
+  if (vm["quiesce"].as<bool>()) {
+    args.push_back("--quiesce");
+  }
+
+  if (vm["show-cookie"].as<bool>()) {
+    args.push_back("--show-cookie");
+  }
+
+  if (vm.count("cookie")) {
+    args.push_back("--cookie");
+    args.push_back(vm["cookie"].as<std::string>());
+  }
+
+  if (vm["read-only"].as<bool>()) {
+    args.push_back("--read-only");
+  }
+
+  if (vm["exclusive"].as<bool>()) {
+    args.push_back("--exclusive");
+  }
+
+  if (vm.count("quiesce-hook")) {
+    args.push_back("--quiesce-hook");
+    args.push_back(vm["quiesce-hook"].as<std::string>());
+  }
+
+  // Forward any generic -o/--options values as additional rbd-nbd arguments.
+  if (vm.count("options")) {
+    utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
+                                  &args);
+  }
+
+  return call_nbd_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+// "device unmap -t nbd": unmap by device path (/dev/...) or image spec.
+int execute_unmap(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__) || defined(_WIN32)
+  std::cerr << "rbd: nbd device is not supported" << std::endl;
+  return -EOPNOTSUPP;
+#else
+  // A positional argument is only treated as a device if it looks like one.
+  std::string device = utils::get_positional_argument(vm, 0);
+  if (!boost::starts_with(device, "/dev/")) {
+    device.clear();
+  }
+
+  // Otherwise fall back to resolving an image/snapshot specification.
+  std::string image;
+  if (device.empty()) {
+    int r = utils::get_image_or_snap_spec(vm, &image);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (device.empty() && image.empty()) {
+    std::cerr << "rbd: unmap requires either image name or device path"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  std::vector<std::string> args{"unmap",
+                                device.empty() ? image : device};
+
+  if (vm.count("options")) {
+    utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
+                                  &args);
+  }
+
+  return call_nbd_cmd(vm, args, ceph_global_init_args);
+#endif
+}
+
+// Deprecated "nbd list": only output-format options are accepted.
+void get_list_arguments_deprecated(po::options_description *positional,
+                                   po::options_description *options) {
+  at::add_format_options(options);
+}
+
+// Deprecated alias: warn and forward to the new "device list" handler.
+int execute_list_deprecated(const po::variables_map &vm,
+                            const std::vector<std::string> &ceph_global_args) {
+  std::cerr << "rbd: 'nbd list' command is deprecated, "
+            << "use 'device list -t nbd' instead" << std::endl;
+  return execute_list(vm, ceph_global_args);
+}
+
+// Legacy "nbd map" options; superseded by "device map -t nbd" with -o
+// key=value options (see execute_map_deprecated for the translation).
+void get_map_arguments_deprecated(po::options_description *positional,
+                                  po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  options->add_options()
+    ("read-only", po::bool_switch(), "map read-only")
+    ("exclusive", po::bool_switch(), "forbid writes by other clients")
+    ("device", po::value<std::string>(), "specify nbd device")
+    ("nbds_max", po::value<std::string>(), "override module param nbds_max")
+    ("max_part", po::value<std::string>(), "override module param max_part")
+    ("timeout", po::value<std::string>(), "set nbd request timeout (seconds)");
+}
+
+// Deprecated "nbd map": translate the legacy flags into the -o/--options
+// vector that the modern execute_map() expects, then delegate to it.
+int execute_map_deprecated(const po::variables_map &vm_deprecated,
+                           const std::vector<std::string> &ceph_global_args) {
+  std::cerr << "rbd: 'nbd map' command is deprecated, "
+            << "use 'device map -t nbd' instead" << std::endl;
+
+  // The deprecated command never registered an "options" argument; declare
+  // it here and run an empty parse so the copied variables_map gains an
+  // (empty) "options" entry that can be overwritten below.
+  po::options_description options;
+  options.add_options()
+    ("options,o", po::value<std::vector<std::string>>()
+     ->default_value(std::vector<std::string>(), ""), "");
+
+  po::variables_map vm = vm_deprecated;
+  po::store(po::command_line_parser({}).options(options).run(), vm);
+
+  // Re-encode the legacy flags as key=value option strings.
+  std::vector<std::string> opts;
+  if (vm_deprecated.count("device")) {
+    opts.push_back("device=" + vm_deprecated["device"].as<std::string>());
+  }
+  if (vm_deprecated.count("nbds_max")) {
+    opts.push_back("nbds_max=" + vm_deprecated["nbds_max"].as<std::string>());
+  }
+  if (vm_deprecated.count("max_part")) {
+    opts.push_back("max_part=" + vm_deprecated["max_part"].as<std::string>());
+  }
+  if (vm_deprecated.count("timeout")) {
+    opts.push_back("timeout=" + vm_deprecated["timeout"].as<std::string>());
+  }
+
+  // NOTE(review): overwriting the stored boost::any directly bypasses
+  // program_options validation; it relies on the "options" entry created
+  // by the empty parse above.
+  vm.at("options").value() = boost::any(opts);
+
+  return execute_map(vm, ceph_global_args);
+}
+
+// Legacy "nbd unmap" arguments: one positional that may be an image,
+// snapshot, or device path, plus the individual pool/image/snap options.
+void get_unmap_arguments_deprecated(po::options_description *positional,
+                                    po::options_description *options) {
+  positional->add_options()
+    ("image-or-snap-or-device-spec",
+     "image, snapshot, or device specification\n"
+     "[<pool-name>/]<image-name>[@<snap-name>] or <device-path>");
+  at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+// Deprecated alias: warn and forward to the new "device unmap" handler.
+int execute_unmap_deprecated(const po::variables_map &vm,
+                             const std::vector<std::string> &ceph_global_args) {
+  std::cerr << "rbd: 'nbd unmap' command is deprecated, "
+            << "use 'device unmap -t nbd' instead" << std::endl;
+  return execute_unmap(vm, ceph_global_args);
+}
+
+// Deprecated command registrations; the trailing 'false' presumably marks
+// them as non-visible in help output — confirm against Shell::Action.
+Shell::Action action_show_deprecated(
+  {"nbd", "list"}, {"nbd", "ls"}, "List the nbd devices already used.", "",
+  &get_list_arguments_deprecated, &execute_list_deprecated, false);
+
+Shell::Action action_map_deprecated(
+  {"nbd", "map"}, {}, "Map image to a nbd device.", "",
+  &get_map_arguments_deprecated, &execute_map_deprecated, false);
+
+Shell::Action action_unmap_deprecated(
+  {"nbd", "unmap"}, {}, "Unmap a nbd device.", "",
+  &get_unmap_arguments_deprecated, &execute_unmap_deprecated, false);
+
+} // namespace nbd
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ObjectMap.cc b/src/tools/rbd/action/ObjectMap.cc
new file mode 100644
index 000000000..40ee2d472
--- /dev/null
+++ b/src/tools/rbd/action/ObjectMap.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace object_map {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Drive a full object-map rebuild on an open image, reporting progress
+// unless suppressed.  Returns 0 on success or librbd's negative errno.
+static int do_object_map_rebuild(librbd::Image &image, bool no_progress)
+{
+  utils::ProgressContext progress("Object Map Rebuild", no_progress);
+  int ret = image.rebuild_object_map(progress);
+  if (ret < 0) {
+    progress.fail();
+    return ret;
+  }
+  progress.finish();
+  return 0;
+}
+
+// "object-map rebuild" takes an image/snap spec plus --no-progress.
+void get_rebuild_arguments(po::options_description *positional,
+                           po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+// Open the requested image (or snapshot) and rebuild its object map.
+int execute_rebuild(const po::variables_map &vm,
+                    const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  // A snapshot spec is permitted: snapshot object maps can be rebuilt too.
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+                                 snap_name, false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_object_map_rebuild(image, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: rebuilding object map failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Verify the object map of an open image against the on-disk objects,
+// reporting progress unless suppressed.  Returns 0 on success or librbd's
+// negative errno.
+static int do_object_map_check(librbd::Image &image, bool no_progress)
+{
+  utils::ProgressContext progress("Object Map Check", no_progress);
+  int ret = image.check_object_map(progress);
+  if (ret < 0) {
+    progress.fail();
+    return ret;
+  }
+  progress.finish();
+  return 0;
+}
+
+// "object-map check" takes an image/snap spec plus --no-progress.
+void get_check_arguments(po::options_description *positional,
+                         po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+// Open the requested image (or snapshot) and verify its object map.
+int execute_check(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  // A snapshot spec is permitted: snapshot object maps can be checked too.
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+                                 snap_name, false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_object_map_check(image, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: checking object map failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Static shell registration for the object-map maintenance commands.
+Shell::Action action_rebuild(
+  {"object-map", "rebuild"}, {}, "Rebuild an invalid object map.", "",
+  &get_rebuild_arguments, &execute_rebuild);
+Shell::Action action_check(
+  {"object-map", "check"}, {}, "Verify the object map is correct.", "",
+  &get_check_arguments, &execute_check);
+
+} // namespace object_map
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Perf.cc b/src/tools/rbd/action/Perf.cc
new file mode 100644
index 000000000..b39beac91
--- /dev/null
+++ b/src/tools/rbd/action/Perf.cc
@@ -0,0 +1,717 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#ifdef HAVE_CURSES
+#include <ncurses.h>
+#endif
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <iostream>
+#include <vector>
+#include <boost/algorithm/string.hpp>
+#include <boost/assign.hpp>
+#include <boost/bimap.hpp>
+#include <boost/program_options.hpp>
+#include "json_spirit/json_spirit.h"
+
+namespace rbd {
+namespace action {
+namespace perf {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+// Per-image IO metrics; the enumerator values double as indices into
+// ImageStat::stats (WRITE_OPS is explicitly 0, the rest follow in order).
+enum class StatDescriptor {
+  WRITE_OPS = 0,
+  READ_OPS,
+  WRITE_BYTES,
+  READ_BYTES,
+  WRITE_LATENCY,
+  READ_LATENCY
+};
+
+// Bidirectional mapping between metric enumerators and their JSON wire
+// names as used by the "rbd perf image stats" mgr command.
+typedef boost::bimap<StatDescriptor, std::string> StatDescriptors;
+
+static const StatDescriptors STAT_DESCRIPTORS =
+  boost::assign::list_of<StatDescriptors::relation>
+    (StatDescriptor::WRITE_OPS, "write_ops")
+    (StatDescriptor::READ_OPS, "read_ops")
+    (StatDescriptor::WRITE_BYTES, "write_bytes")
+    (StatDescriptor::READ_BYTES, "read_bytes")
+    (StatDescriptor::WRITE_LATENCY, "write_latency")
+    (StatDescriptor::READ_LATENCY, "read_latency");
+
+// Stream a StatDescriptor as its wire name (used by program_options when
+// printing the --sort-by default).
+std::ostream& operator<<(std::ostream& os, const StatDescriptor& val) {
+  auto it = STAT_DESCRIPTORS.left.find(val);
+  if (it == STAT_DESCRIPTORS.left.end()) {
+    os << "unknown (" << static_cast<int>(val) << ")";
+  } else {
+    os << it->second;
+  }
+  return os;
+}
+
+// program_options custom validator for --sort-by. Only the dash-separated
+// spelling (e.g. "write-ops") is accepted: underscores are first turned
+// into spaces (invalidating them), then dashes into underscores to match
+// the wire names in STAT_DESCRIPTORS.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              StatDescriptor *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  std::string s = po::validators::get_single_string(values);
+  boost::replace_all(s, "_", " ");
+  boost::replace_all(s, "-", "_");
+
+  auto it = STAT_DESCRIPTORS.right.find(s);
+  if (it == STAT_DESCRIPTORS.right.end()) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+  v = boost::any(it->second);
+}
+
+// Stats for a single image; `stats` is indexed by StatDescriptor and is
+// pre-sized to hold one value per descriptor.
+struct ImageStat {
+  ImageStat(const std::string& pool_name, const std::string& pool_namespace,
+            const std::string& image_name)
+    : pool_name(pool_name), pool_namespace(pool_namespace),
+      image_name(image_name) {
+    stats.resize(STAT_DESCRIPTORS.size());
+  }
+
+  std::string pool_name;
+  std::string pool_namespace;
+  std::string image_name;
+  std::vector<double> stats;
+};
+
+typedef std::vector<ImageStat> ImageStats;
+
+// (pool, namespace) or (pool, image) pair, depending on context.
+typedef std::pair<std::string, std::string> SpecPair;
+
+// Build "pool" or "pool/namespace" for display / mgr queries.
+std::string format_pool_spec(const std::string& pool,
+                             const std::string& pool_namespace) {
+  std::string pool_spec{pool};
+  if (!pool_namespace.empty()) {
+    pool_spec += "/" + pool_namespace;
+  }
+  return pool_spec;
+}
+
+// Query the "rbd_support" mgr module for per-image IO stats and parse the
+// JSON response into *image_stats. pool_spec may be empty (all pools).
+// Returns 0 on success or a negative errno; diagnostics go to err_os.
+int query_iostats(librados::Rados& rados, const std::string& pool_spec,
+                  StatDescriptor sort_by, ImageStats* image_stats,
+                  std::ostream& err_os) {
+  auto sort_by_str = STAT_DESCRIPTORS.left.find(sort_by)->second;
+
+  // NOTE: the raw string must end with })"; — a stray '"' before the
+  // closing brace previously corrupted the JSON command payload.
+  std::string cmd = R"(
+    {
+      "prefix": "rbd perf image stats",
+      "pool_spec": ")" + pool_spec + R"(",
+      "sort_by": ")" + sort_by_str + R"(",
+      "format": "json"
+    })";
+
+  bufferlist in_bl;
+  bufferlist out_bl;
+  std::string outs;
+  int r = rados.mgr_command(cmd, in_bl, &out_bl, &outs);
+  if (r == -EOPNOTSUPP) {
+    err_os << "rbd: 'rbd_support' mgr module is not enabled."
+           << std::endl << std::endl
+           << "Use 'ceph mgr module enable rbd_support' to enable."
+           << std::endl;
+    return r;
+  } else if (r < 0) {
+    err_os << "rbd: mgr command failed: " << cpp_strerror(r);
+    if (!outs.empty()) {
+      err_os << ": " << outs;
+    }
+    err_os << std::endl;
+    return r;
+  }
+
+  json_spirit::mValue json_root;
+  if (!json_spirit::read(out_bl.to_str(), json_root)) {
+    err_os << "rbd: error parsing perf stats" << std::endl;
+    return -EINVAL;
+  }
+
+  image_stats->clear();
+  try {
+    auto& root = json_root.get_obj();
+
+    // map JSON stat descriptor order to our internal order
+    std::map<uint32_t, uint32_t> json_to_internal_stats;
+    auto& json_stat_descriptors = root["stat_descriptors"].get_array();
+    for (size_t idx = 0; idx < json_stat_descriptors.size(); ++idx) {
+      auto it = STAT_DESCRIPTORS.right.find(
+        json_stat_descriptors[idx].get_str());
+      if (it == STAT_DESCRIPTORS.right.end()) {
+        // unknown metric from a newer mgr: ignore it
+        continue;
+      }
+      json_to_internal_stats[idx] = static_cast<uint32_t>(it->second);
+    }
+
+    // cache a mapping from pool descriptors back to pool-specs
+    std::map<std::string, SpecPair> json_to_internal_pools;
+    auto& pool_descriptors = root["pool_descriptors"].get_obj();
+    for (auto& pool : pool_descriptors) {
+      auto& pool_spec = pool.second.get_str();
+      // rfind: the namespace (if any) follows the last '/'
+      auto pos = pool_spec.rfind("/");
+
+      SpecPair pair{pool_spec.substr(0, pos), ""};
+      if (pos != std::string::npos) {
+        pair.second = pool_spec.substr(pos + 1);
+      }
+
+      json_to_internal_pools[pool.first] = pair;
+    }
+
+    auto& stats = root["stats"].get_array();
+    for (auto& stat : stats) {
+      // each entry is {"<pool id>/<image>": [metric values...]}
+      auto& stat_obj = stat.get_obj();
+      if (!stat_obj.empty()) {
+        auto& image_spec = stat_obj.begin()->first;
+
+        auto pos = image_spec.find("/");
+        SpecPair pair{image_spec.substr(0, pos), ""};
+        if (pos != std::string::npos) {
+          pair.second = image_spec.substr(pos + 1);
+        }
+
+        const auto pool_it = json_to_internal_pools.find(pair.first);
+        if (pool_it == json_to_internal_pools.end()) {
+          continue;
+        }
+
+        image_stats->emplace_back(
+          pool_it->second.first, pool_it->second.second, pair.second);
+
+        auto& image_stat = image_stats->back();
+        auto& data = stat_obj.begin()->second.get_array();
+        for (auto& indexes : json_to_internal_stats) {
+          image_stat.stats[indexes.second] = data[indexes.first].get_real();
+        }
+      }
+    }
+  } catch (std::runtime_error &e) {
+    err_os << "rbd: error parsing perf stats: " << e.what() << std::endl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Pretty-print one metric value: ops as SI-rate, bytes as byte-rate, and
+// latency (nanoseconds) scaled up to us/ms/s as appropriate.
+void format_stat(StatDescriptor stat_descriptor, double stat,
+                 std::ostream& os) {
+  switch (stat_descriptor) {
+  case StatDescriptor::WRITE_OPS:
+  case StatDescriptor::READ_OPS:
+    os << si_u_t(stat) << "/s";
+    break;
+  case StatDescriptor::WRITE_BYTES:
+  case StatDescriptor::READ_BYTES:
+    os << byte_u_t(stat) << "/s";
+    break;
+  case StatDescriptor::WRITE_LATENCY:
+  case StatDescriptor::READ_LATENCY:
+    os << std::fixed << std::setprecision(2);
+    if (stat >= 1000000000) {
+      os << (stat / 1000000000) << " s";
+    } else if (stat >= 1000000) {
+      os << (stat / 1000000) << " ms";
+    } else if (stat >= 1000) {
+      os << (stat / 1000) << " us";
+    } else {
+      os << stat << " ns";
+    }
+    break;
+  default:
+    ceph_assert(false);
+    break;
+  }
+}
+
+} // anonymous namespace
+
+namespace iostat {
+
+// Tag type so program_options picks the validator below for --iterations.
+struct Iterations {};
+
+// Validate --iterations: must parse as a uint32_t strictly greater than 0.
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Iterations *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  auto& s = po::validators::get_single_string(values);
+
+  try {
+    auto iterations = boost::lexical_cast<uint32_t>(s);
+    if (iterations > 0) {
+      v = boost::any(iterations);
+      return;
+    }
+  } catch (const boost::bad_lexical_cast &) {
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+// Emit image_stats either through a Formatter (JSON/XML) or as a text
+// table. When global_search is set (empty pool spec), image names are
+// prefixed with "pool/[namespace/]".
+void format(const ImageStats& image_stats, Formatter* f, bool global_search) {
+  TextTable tbl;
+  if (f) {
+    f->open_array_section("images");
+  } else {
+    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+    for (auto& stat : STAT_DESCRIPTORS.left) {
+      std::string title;
+      switch (stat.first) {
+      case StatDescriptor::WRITE_OPS:
+        title = "WR ";
+        break;
+      case StatDescriptor::READ_OPS:
+        title = "RD ";
+        break;
+      case StatDescriptor::WRITE_BYTES:
+        title = "WR_BYTES ";
+        break;
+      case StatDescriptor::READ_BYTES:
+        title = "RD_BYTES ";
+        break;
+      case StatDescriptor::WRITE_LATENCY:
+        title = "WR_LAT ";
+        break;
+      case StatDescriptor::READ_LATENCY:
+        title = "RD_LAT ";
+        break;
+      default:
+        ceph_assert(false);
+        break;
+      }
+      tbl.define_column(title, TextTable::RIGHT, TextTable::RIGHT);
+    }
+  }
+
+  for (auto& image_stat : image_stats) {
+    if (f) {
+      f->open_object_section("image");
+      f->dump_string("pool", image_stat.pool_name);
+      f->dump_string("pool_namespace", image_stat.pool_namespace);
+      f->dump_string("image", image_stat.image_name);
+      for (auto& pair : STAT_DESCRIPTORS.left) {
+        f->dump_float(pair.second.c_str(),
+                      image_stat.stats[static_cast<size_t>(pair.first)]);
+      }
+      f->close_section();
+    } else {
+      std::string name;
+      if (global_search) {
+        name += image_stat.pool_name + "/";
+        if (!image_stat.pool_namespace.empty()) {
+          name += image_stat.pool_namespace + "/";
+        }
+      }
+      name += image_stat.image_name;
+
+      tbl << name;
+      for (auto& pair : STAT_DESCRIPTORS.left) {
+        std::stringstream str;
+        format_stat(pair.first,
+                    image_stat.stats[static_cast<size_t>(pair.first)], str);
+        str << ' ';
+        tbl << str.str();
+      }
+      tbl << TextTable::endrow;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    std::cout << tbl << std::endl;
+  }
+}
+
+} // namespace iostat
+
+#ifdef HAVE_CURSES
+namespace iotop {
+
+// Full-screen ncurses "iotop"-style viewer of per-image IO stats. The
+// left/right arrow (or '<'/'>') keys change the sort column; 'q' quits.
+class MainWindow {
+public:
+  // Sets up the ncurses screen (non-blocking input, no echo/cursor).
+  MainWindow(librados::Rados& rados, const std::string& pool_spec)
+    : m_rados(rados), m_pool_spec(pool_spec) {
+    initscr();
+    curs_set(0);
+    cbreak();
+    noecho();
+    keypad(stdscr, TRUE);
+    nodelay(stdscr, TRUE);
+
+    init_columns();
+  }
+
+  // Main loop: poll stats, redraw, and react to key presses. Returns 0 on
+  // user exit or a negative errno if the stats query failed; endwin() is
+  // always called before the error text is printed.
+  int run() {
+    redraw();
+
+    int r = 0;
+    std::stringstream err_str;
+    while (true) {
+      r = query_iostats(m_rados, m_pool_spec, m_sort_by, &m_image_stats,
+                        err_str);
+      if (r < 0) {
+        // fall through to endwin() below; the unreachable "return r;"
+        // that used to follow this break has been removed
+        break;
+      }
+
+      redraw();
+      wait_for_key_or_delay();
+
+      int ch = getch();
+      if (ch == 'q' || ch == 'Q') {
+        break;
+      } else if (ch == '<' || ch == KEY_LEFT) {
+        auto it = STAT_DESCRIPTORS.left.find(m_sort_by);
+        if (it != STAT_DESCRIPTORS.left.begin()) {
+          m_sort_by = (--it)->first;
+        }
+      } else if (ch == '>' || ch == KEY_RIGHT) {
+        auto it = STAT_DESCRIPTORS.left.find(m_sort_by);
+        if (it != STAT_DESCRIPTORS.left.end() &&
+            ++it != STAT_DESCRIPTORS.left.end()) {
+          m_sort_by = it->first;
+        }
+      }
+    }
+
+    endwin();
+
+    if (r < 0) {
+      std::cerr << err_str.str() << std::endl;
+    }
+    return r;
+  }
+
+private:
+  static const size_t STAT_COLUMN_WIDTH = 12;
+
+  librados::Rados& m_rados;
+  std::string m_pool_spec;
+
+  ImageStats m_image_stats;
+  StatDescriptor m_sort_by = StatDescriptor::WRITE_OPS;
+
+  // "waiting for stats" popup state
+  bool m_pending_win_opened = false;
+  WINDOW* m_pending_win = nullptr;
+
+  // current terminal dimensions, refreshed on every redraw
+  int m_height = 1;
+  int m_width = 1;
+
+  std::map<StatDescriptor, std::string> m_columns;
+
+  // Populate the column-title map for each stat descriptor.
+  void init_columns() {
+    m_columns.clear();
+    for (auto& pair : STAT_DESCRIPTORS.left) {
+      std::string title;
+      switch (pair.first) {
+      case StatDescriptor::WRITE_OPS:
+        title = "WRITES OPS";
+        break;
+      case StatDescriptor::READ_OPS:
+        title = "READS OPS";
+        break;
+      case StatDescriptor::WRITE_BYTES:
+        title = "WRITE BYTES";
+        break;
+      case StatDescriptor::READ_BYTES:
+        title = "READ BYTES";
+        break;
+      case StatDescriptor::WRITE_LATENCY:
+        title = "WRITE LAT";
+        break;
+      case StatDescriptor::READ_LATENCY:
+        title = "READ LAT";
+        break;
+      default:
+        ceph_assert(false);
+        break;
+      }
+      m_columns[pair.first] = (title);
+    }
+  }
+
+  // Refresh terminal size and repaint both the main and pending windows.
+  void redraw() {
+    getmaxyx(stdscr, m_height, m_width);
+
+    redraw_main_window();
+    redraw_pending_window();
+
+    doupdate();
+  }
+
+  // Paint the column header and one line per image.
+  void redraw_main_window() {
+    werase(stdscr);
+    mvhline(0, 0, ' ' | A_REVERSE, m_width);
+
+    // print header for all metrics
+    int remaining_cols = m_width;
+    std::stringstream str;
+    for (auto& pair : m_columns) {
+      int attr = A_REVERSE;
+      std::string title;
+      if (pair.first == m_sort_by) {
+        // highlight the active sort column
+        title += '>';
+        attr |= A_BOLD;
+      } else {
+        title += ' ';
+      }
+      title += pair.second;
+
+      str.str("");
+      str << std::right << std::setfill(' ')
+          << std::setw(STAT_COLUMN_WIDTH)
+          << title << ' ';
+
+      attrset(attr);
+      addstr(str.str().c_str());
+      remaining_cols -= title.size();
+    }
+
+    attrset(A_REVERSE);
+    addstr("IMAGE");
+    attrset(A_NORMAL);
+
+    // print each image (one per line)
+    int row = 1;
+    int remaining_lines = m_height - 1;
+    for (auto& image_stat : m_image_stats) {
+      if (remaining_lines <= 0) {
+        break;
+      }
+      --remaining_lines;
+
+      move(row++, 0);
+      for (auto& pair : m_columns) {
+        str.str("");
+        format_stat(pair.first,
+                    image_stat.stats[static_cast<size_t>(pair.first)], str);
+        auto value = str.str().substr(0, STAT_COLUMN_WIDTH);
+
+        str.str("");
+        str << std::right << std::setfill(' ')
+            << std::setw(STAT_COLUMN_WIDTH)
+            << value << ' ';
+        addstr(str.str().c_str());
+      }
+
+      std::string image;
+      if (m_pool_spec.empty()) {
+        image = format_pool_spec(image_stat.pool_name,
+                                 image_stat.pool_namespace) + "/";
+      }
+      image += image_stat.image_name;
+      addstr(image.substr(0, remaining_cols).c_str());
+    }
+
+    wnoutrefresh(stdscr);
+  }
+
+  // Show (and later tear down) a centered popup while no stats have
+  // arrived yet.
+  void redraw_pending_window() {
+    // draw a "please be patient" window while waiting
+    const char* msg = "Waiting for initial stats";
+    int height = 5;
+    int width = strlen(msg) + 4;
+    int starty = (m_height - height) / 2;
+    int startx = (m_width - width) / 2;
+
+    if (m_image_stats.empty() && !m_pending_win_opened) {
+      m_pending_win_opened = true;
+      m_pending_win = newwin(height, width, starty, startx);
+    }
+
+    if (m_pending_win != nullptr) {
+      if (m_image_stats.empty()) {
+        box(m_pending_win, 0 , 0);
+        mvwaddstr(m_pending_win, 2, 2, msg);
+        wnoutrefresh(m_pending_win);
+      } else {
+        delwin(m_pending_win);
+        m_pending_win = nullptr;
+      }
+    }
+  }
+
+  // Block on stdin for up to min(10s, mgr_stats_period) so key presses
+  // are handled promptly without refreshing faster than new data arrives.
+  void wait_for_key_or_delay() {
+    fd_set fds;
+    FD_ZERO(&fds);
+    FD_SET(STDIN_FILENO, &fds);
+
+    // no point to refreshing faster than the stats period
+    struct timeval tval;
+    tval.tv_sec = std::min<uint32_t>(
+      10, g_conf().get_val<int64_t>("mgr_stats_period"));
+    tval.tv_usec = 0;
+
+    select(STDIN_FILENO + 1, &fds, NULL, NULL, &tval);
+  }
+};
+
+} // namespace iotop
+#endif // HAVE_CURSES
+
+
+// Option parsing for "rbd perf image iostat": pool spec, --iterations,
+// --sort-by and the common --format/--pretty-format options.
+void get_arguments_iostat(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+  options->add_options()
+    ("iterations", po::value<iostat::Iterations>(),
+     "iterations of metric collection [> 0]")
+    ("sort-by", po::value<StatDescriptor>()->default_value(StatDescriptor::WRITE_OPS),
+     "sort-by IO metric "
+     "(write-ops, read-ops, write-bytes, read-bytes, write-latency, read-latency) "
+     "[default: write-ops]");
+  at::add_format_options(options);
+}
+
+// "rbd perf image iostat": repeatedly query the mgr for image IO stats and
+// print them. iterations == 0 means run forever; formatted output is
+// limited to a single sample.
+int execute_iostat(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool;
+  std::string pool_namespace;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, false, &pool,
+                                              &pool_namespace, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  uint32_t iterations = 0;
+  if (vm.count("iterations")) {
+    iterations = vm["iterations"].as<uint32_t>();
+  }
+  auto sort_by = vm["sort-by"].as<StatDescriptor>();
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  auto f = formatter.get();
+  if (iterations > 1 && f != nullptr) {
+    std::cerr << "rbd: specifing iterations is not valid with formatted output"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  librados::Rados rados;
+  r = utils::init_rados(&rados);
+  if (r < 0) {
+    return r;
+  }
+
+  r = rados.wait_for_latest_osdmap();
+  if (r < 0) {
+    std::cerr << "rbd: failed to retrieve OSD map" << std::endl;
+    return r;
+  }
+
+  if (!pool_namespace.empty()) {
+    // default empty pool name only if namespace is specified to allow
+    // for an empty pool_spec (-> GLOBAL_POOL_KEY)
+    utils::normalize_pool_name(&pool);
+  }
+  std::string pool_spec = format_pool_spec(pool, pool_namespace);
+
+  // no point to refreshing faster than the stats period
+  auto delay = std::min<uint32_t>(10, g_conf().get_val<int64_t>("mgr_stats_period"));
+
+  ImageStats image_stats;
+  uint32_t count = 0;
+  bool printed_notice = false;
+  while (count++ < iterations || iterations == 0) {
+    r = query_iostats(rados, pool_spec, sort_by, &image_stats, std::cerr);
+    if (r < 0) {
+      return r;
+    }
+
+    if (count == 1 && image_stats.empty()) {
+      // no stats yet: reset the counter and keep polling until they appear
+      count = 0;
+      if (!printed_notice) {
+        std::cerr << "rbd: waiting for initial image stats"
+                  << std::endl << std::endl;;
+        printed_notice = true;
+      }
+    } else {
+      iostat::format(image_stats, f, pool_spec.empty());
+      if (f != nullptr) {
+        break;
+      }
+    }
+
+    sleep(delay);
+  }
+
+  return 0;
+}
+
+#ifdef HAVE_CURSES
+// Option parsing for "rbd perf image iotop": pool spec only.
+void get_arguments_iotop(po::options_description *positional,
+                         po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+}
+
+// "rbd perf image iotop": run the interactive ncurses stats viewer.
+int execute_iotop(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool;
+  std::string pool_namespace;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, false, &pool,
+                                              &pool_namespace, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  r = utils::init_rados(&rados);
+  if (r < 0) {
+    return r;
+  }
+
+  r = rados.wait_for_latest_osdmap();
+  if (r < 0) {
+    std::cerr << "rbd: failed to retrieve OSD map" << std::endl;
+    return r;
+  }
+
+  if (!pool_namespace.empty()) {
+    // default empty pool name only if namespace is specified to allow
+    // for an empty pool_spec (-> GLOBAL_POOL_KEY)
+    utils::normalize_pool_name(&pool);
+  }
+  iotop::MainWindow mainWindow(rados, format_pool_spec(pool, pool_namespace));
+  r = mainWindow.run();
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Command registration: "rbd perf image iotop" (ncurses builds only).
+Shell::Action top_action(
+  {"perf", "image", "iotop"}, {}, "Display a top-like IO monitor.", "",
+  &get_arguments_iotop, &execute_iotop);
+
+#endif // HAVE_CURSES
+
+// Command registration: "rbd perf image iostat".
+Shell::Action stat_action(
+  {"perf", "image", "iostat"}, {}, "Display image IO statistics.", "",
+  &get_arguments_iostat, &execute_iostat);
+} // namespace perf
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/PersistentCache.cc b/src/tools/rbd/action/PersistentCache.cc
new file mode 100644
index 000000000..949006b82
--- /dev/null
+++ b/src/tools/rbd/action/PersistentCache.cc
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/rbd_types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace persistent_cache {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Option parsing for "rbd persistent-cache invalidate".
+void get_arguments_invalidate(po::options_description *positional,
+                              po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_id_option(options);
+}
+
+// "rbd persistent-cache invalidate": open the image and discard its
+// (possibly dirty) cache contents via invalidate_cache().
+int execute_invalidate(const po::variables_map &vm,
+                       const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = image.invalidate_cache();
+  if (r < 0) {
+    std::cerr << "rbd: invalidating persistent cache failed: "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Option parsing for "rbd persistent-cache flush".
+void get_arguments_flush(po::options_description *positional,
+                         po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_id_option(options);
+}
+
+// "rbd persistent-cache flush": flush the image only if the DIRTY_CACHE
+// feature bit is set; otherwise report the cache as clean/disabled.
+int execute_flush(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t features;
+  r = image.features(&features);
+  if (r < 0) {
+    return r;
+  }
+
+  if (features & RBD_FEATURE_DIRTY_CACHE) {
+    r = image.flush();
+    if (r < 0) {
+      std::cerr << "rbd: flushing persistent cache failed: "
+                << cpp_strerror(r) << std::endl;
+      return r;
+    }
+  } else {
+    std::cout << "rbd: persistent cache is clean or disabled" << std::endl;
+  }
+
+  return 0;
+}
+
+// Command registrations: "rbd persistent-cache invalidate" / "... flush".
+Shell::Action action_invalidate(
+  {"persistent-cache", "invalidate"}, {},
+  "Invalidate (discard) existing / dirty persistent cache.", "",
+  &get_arguments_invalidate, &execute_invalidate);
+Shell::Action action_flush(
+  {"persistent-cache", "flush"}, {}, "Flush persistent cache.", "",
+  &get_arguments_flush, &execute_flush);
+
+} // namespace persistent_cache
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Pool.cc b/src/tools/rbd/action/Pool.cc
new file mode 100644
index 000000000..2ad8e17ff
--- /dev/null
+++ b/src/tools/rbd/action/Pool.cc
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace pool {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Option parsing for "rbd pool init": pool spec plus --force.
+void get_arguments_init(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_pool_options(positional, options, false);
+  options->add_options()
+    ("force", po::bool_switch(),
+     "force initialize pool for RBD use if registered by another application");
+}
+
+// "rbd pool init": register the pool for RBD use via RBD::pool_init().
+// Returns 0 on success or the negative errno from pool_init().
+int execute_init(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+                                              nullptr, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, "", &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = rbd.pool_init(io_ctx, vm["force"].as<bool>());
+  if (r == -EOPNOTSUPP) {
+    std::cerr << "rbd: luminous or later release required." << std::endl;
+  } else if (r == -EPERM) {
+    std::cerr << "rbd: pool already registered to a different application."
+              << std::endl;
+  } else if (r < 0) {
+    std::cerr << "rbd: error registered application: " << cpp_strerror(r)
+              << std::endl;
+  }
+
+  // propagate pool_init's status: returning 0 here used to report success
+  // to the shell even after printing one of the error messages above
+  return r;
+}
+
+// Option parsing for "rbd pool stats": pool/namespace spec plus formatter.
+void get_arguments_stats(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+  at::add_format_options(options);
+}
+
+// "rbd pool stats": gather image/trash counts, snapshot counts and
+// provisioned sizes for a pool and print them formatted or as text.
+int execute_stats(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+                                              &namespace_name, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  uint64_t image_count;
+  uint64_t provisioned_bytes;
+  uint64_t snap_count;
+  uint64_t trash_count;
+  uint64_t trash_provisioned_bytes;
+  uint64_t trash_snap_count;
+
+  // register each output variable with the stat options to query
+  librbd::PoolStats pool_stats;
+  pool_stats.add(RBD_POOL_STAT_OPTION_IMAGES, &image_count);
+  pool_stats.add(RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+                 &provisioned_bytes);
+  pool_stats.add(RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snap_count);
+  pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_IMAGES, &trash_count);
+  pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+                 &trash_provisioned_bytes);
+  pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &trash_snap_count);
+
+  r = rbd.pool_stats_get(io_ctx, &pool_stats);
+  if (r < 0) {
+    std::cerr << "rbd: failed to query pool stats: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  if (formatter) {
+    formatter->open_object_section("stats");
+    formatter->open_object_section("images");
+    formatter->dump_unsigned("count", image_count);
+    formatter->dump_unsigned("provisioned_bytes", provisioned_bytes);
+    formatter->dump_unsigned("snap_count", snap_count);
+    formatter->close_section();
+    formatter->open_object_section("trash");
+    formatter->dump_unsigned("count", trash_count);
+    formatter->dump_unsigned("provisioned_bytes", trash_provisioned_bytes);
+    formatter->dump_unsigned("snap_count", trash_snap_count);
+    formatter->close_section();
+    formatter->close_section();
+    formatter->flush(std::cout);
+  } else {
+    std::cout << "Total Images: " << image_count;
+    if (trash_count > 0) {
+      std::cout << " (" << trash_count << " in trash)";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Total Snapshots: " << snap_count;
+    if (trash_count > 0) {
+      std::cout << " (" << trash_snap_count << " in trash)";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Provisioned Size: " << byte_u_t(provisioned_bytes);
+    if (trash_count > 0) {
+      std::cout << " (" << byte_u_t(trash_provisioned_bytes) << " in trash)";
+    }
+    std::cout << std::endl;
+  }
+
+  return 0;
+}
+
+// Command registrations: "rbd pool init" / "rbd pool stats".
+Shell::Action init_action(
+  {"pool", "init"}, {}, "Initialize pool for use by RBD.", "",
+  &get_arguments_init, &execute_init);
+Shell::Action stat_action(
+  {"pool", "stats"}, {}, "Display pool statistics.",
+  "Note: legacy v1 images are not included in stats",
+  &get_arguments_stats, &execute_stats);
+
+} // namespace pool
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Remove.cc b/src/tools/rbd/action/Remove.cc
new file mode 100644
index 000000000..c5dcf2323
--- /dev/null
+++ b/src/tools/rbd/action/Remove.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace remove {
+
+namespace {
+
+// Return true if the snapshot lives in the trash namespace (i.e. it will
+// be cleaned up automatically and should not block image removal).
+// Errors from snap_get_namespace_type are treated as "not auto-delete".
+bool is_auto_delete_snapshot(librbd::Image* image,
+                             const librbd::snap_info_t &snap_info) {
+  librbd::snap_namespace_type_t namespace_type;
+  int r = image->snap_get_namespace_type(snap_info.id, &namespace_type);
+  if (r < 0) {
+    return false;
+  }
+
+  switch (namespace_type) {
+  case RBD_SNAP_NAMESPACE_TYPE_TRASH:
+    return true;
+  default:
+    return false;
+  }
+}
+
+} // anonymous namespace
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Remove the image, reporting progress unless suppressed. Returns 0 on
+// success or the negative errno from remove_with_progress().
+static int do_delete(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+                     const char *imgname, bool no_progress)
+{
+  utils::ProgressContext pc("Removing image", no_progress);
+  int r = rbd.remove_with_progress(io_ctx, imgname, pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+// Option parsing for "rbd remove": image spec plus --no-progress.
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+// "rbd remove": delete the image, translating the common failure errnos
+// into actionable error messages (snapshots present, watchers, group
+// membership).
+int execute(const po::variables_map &vm,
+            const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  // allow removal to free space even when the pool is full
+  io_ctx.set_pool_full_try();
+
+  librbd::RBD rbd;
+  r = do_delete(rbd, io_ctx, image_name.c_str(),
+                vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    if (r == -ENOTEMPTY) {
+      // image still has snapshots; distinguish user snapshots (removable
+      // via "rbd snap purge") from trashed ones kept alive by clones
+      librbd::Image image;
+      std::vector<librbd::snap_info_t> snaps;
+      int image_r = utils::open_image(io_ctx, image_name, true, &image);
+      if (image_r >= 0) {
+        image_r = image.snap_list(snaps);
+      }
+      if (image_r >= 0) {
+        snaps.erase(std::remove_if(snaps.begin(), snaps.end(),
+                      [&image](const librbd::snap_info_t& snap) {
+                        return is_auto_delete_snapshot(&image,
+                                                       snap);
+                      }),
+                    snaps.end());
+      }
+
+      if (!snaps.empty()) {
+        std::cerr << "rbd: image has snapshots - these must be deleted"
+                  << " with 'rbd snap purge' before the image can be removed."
+                  << std::endl;
+      } else {
+        std::cerr << "rbd: image has snapshots with linked clones - these must "
+                  << "be deleted or flattened before the image can be removed."
+                  << std::endl;
+      }
+    } else if (r == -EBUSY) {
+      std::cerr << "rbd: error: image still has watchers"
+                << std::endl
+                << "This means the image is still open or the client using "
+                << "it crashed. Try again after closing/unmapping it or "
+                << "waiting 30s for the crashed client to timeout."
+                << std::endl;
+    } else if (r == -EMLINK) {
+      // image belongs to a group; try to resolve and report the group name
+      librbd::Image image;
+      int image_r = utils::open_image(io_ctx, image_name, true, &image);
+      librbd::group_info_t group_info;
+      if (image_r == 0) {
+        image_r = image.get_group(&group_info, sizeof(group_info));
+      }
+      if (image_r == 0) {
+        std::string pool_name = "";
+        librados::Rados rados(io_ctx);
+        librados::IoCtx pool_io_ctx;
+        image_r = rados.ioctx_create2(group_info.pool, pool_io_ctx);
+        if (image_r < 0) {
+          pool_name = "<missing group pool " + stringify(group_info.pool) + ">";
+        } else {
+          pool_name = pool_io_ctx.get_pool_name();
+        }
+        std::cerr << "rbd: error: image belongs to a group "
+                  << pool_name << "/";
+        if (!io_ctx.get_namespace().empty()) {
+          std::cerr << io_ctx.get_namespace() << "/";
+        }
+        std::cerr << group_info.name;
+      } else
+        std::cerr << "rbd: error: image belongs to a group";
+
+      std::cerr << std::endl
+                << "Remove the image from the group and try again."
+                << std::endl;
+      image.close();
+    } else {
+      std::cerr << "rbd: delete error: " << cpp_strerror(r) << std::endl;
+    }
+    return r;
+  }
+  return 0;
+}
+
+// Command registration: "rbd remove" (alias "rm").
+Shell::Action action(
+  {"remove"}, {"rm"}, "Delete an image.", "", &get_arguments, &execute);
+
+} // namespace remove
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Rename.cc b/src/tools/rbd/action/Rename.cc
new file mode 100644
index 000000000..b4954bcbb
--- /dev/null
+++ b/src/tools/rbd/action/Rename.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace rename {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_rename(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+ const char *imgname, const char *destname)
+{
+ int r = rbd.rename(io_ctx, imgname, destname);
+ if (r < 0)
+ return r;
+ return 0;
+}
+
// Register CLI arguments: a source image spec followed by a destination
// image spec (pool/[namespace/]image).
void get_arguments(po::options_description *positional,
                   po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
}
+
// CLI entry point for "rbd rename"/"rbd mv": renames an image in place.
// Cross-pool and cross-namespace renames are rejected with -EINVAL.
// Returns 0 on success or a negative error code.
int execute(const po::variables_map &vm,
            const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  // parse the source spec; snapshots are not permitted on either side
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  // destination defaults to the source pool/namespace unless overridden;
  // the new image name gets full spec validation
  std::string dst_image_name;
  std::string dst_snap_name;
  std::string dst_pool_name = pool_name;
  std::string dst_namespace_name = namespace_name;
  r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
    &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
    utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
  if (r < 0) {
    return r;
  }

  // rename cannot move an image between pools or namespaces
  if (pool_name != dst_pool_name) {
    std::cerr << "rbd: mv/rename across pools not supported" << std::endl
              << "source pool: " << pool_name << " dest pool: " << dst_pool_name
              << std::endl;
    return -EINVAL;
  } else if (namespace_name != dst_namespace_name) {
    std::cerr << "rbd: mv/rename across namespaces not supported" << std::endl
              << "source namespace: " << namespace_name << " dest namespace: "
              << dst_namespace_name << std::endl;
    return -EINVAL;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  librbd::RBD rbd;
  r = do_rename(rbd, io_ctx, image_name.c_str(), dst_image_name.c_str());
  if (r < 0) {
    std::cerr << "rbd: rename error: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
// Register the "rename" command (alias "mv") with the rbd shell dispatcher.
Shell::Action action(
  {"rename"}, {"mv"}, "Rename image within pool.", "", &get_arguments,
  &execute);
+
+} // namespace rename
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Resize.cc b/src/tools/rbd/action/Resize.cc
new file mode 100644
index 000000000..60c16429b
--- /dev/null
+++ b/src/tools/rbd/action/Resize.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace resize {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_resize(librbd::Image& image, uint64_t size, bool allow_shrink, bool no_progress)
+{
+ utils::ProgressContext pc("Resizing image", no_progress);
+ int r = image.resize2(size, allow_shrink, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
// Register CLI arguments: image spec, --size, --allow-shrink switch and
// --no-progress.
void get_arguments(po::options_description *positional,
                   po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_size_option(options);
  options->add_options()
    ("allow-shrink", po::bool_switch(), "permit shrinking");
  at::add_no_progress_option(options);
}
+
// CLI entry point for "rbd resize": grows (or, with --allow-shrink,
// shrinks) an image to the requested size.  A no-op resize is rejected.
int execute(const po::variables_map &vm,
            const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  uint64_t size;
  r = utils::get_image_size(vm, &size);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
                                 snap_name, false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  // fetch the current size so no-op and shrink requests can be detected
  librbd::image_info_t info;
  r = image.stat(info, sizeof(info));
  if (r < 0) {
    std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl;
    return r;
  }

  if (info.size == size) {
    std::cerr << "rbd: new size is equal to original size " << std::endl;
    return -EINVAL;
  }

  // shrinking requires an explicit opt-in via --allow-shrink
  if (info.size > size && !vm["allow-shrink"].as<bool>()) {
    r = -EINVAL;
  } else {
    r = do_resize(image, size, vm["allow-shrink"].as<bool>(), vm[at::NO_PROGRESS].as<bool>());
  }

  if (r < 0) {
    // NOTE(review): an -EINVAL returned by do_resize itself would also hit
    // this branch when --allow-shrink is unset — confirm that is acceptable
    if (r == -EINVAL && !vm["allow-shrink"].as<bool>()) {
      std::cerr << "rbd: shrinking an image is only allowed with the "
                << "--allow-shrink flag" << std::endl;
      return r;
    }
    std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
// Declare boolean switches and register the "resize" command with the
// rbd shell dispatcher.
Shell::SwitchArguments switched_arguments({"allow-shrink"});
Shell::Action action(
  {"resize"}, {}, "Resize (expand or shrink) image.", "", &get_arguments,
  &execute);
+
+} // namespace resize
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Snap.cc b/src/tools/rbd/action/Snap.cc
new file mode 100644
index 000000000..e8a9cb1b8
--- /dev/null
+++ b/src/tools/rbd/action/Snap.cc
@@ -0,0 +1,972 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/bind/bind.hpp>
+
+namespace rbd {
+namespace action {
+namespace snap {
+
+using namespace boost::placeholders;
+
+static const std::string ALL_NAME("all");
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_list_snaps(librbd::Image& image, Formatter *f, bool all_snaps, librados::Rados& rados)
+{
+ std::vector<librbd::snap_info_t> snaps;
+ TextTable t;
+ int r;
+
+ r = image.snap_list(snaps);
+ if (r < 0) {
+ std::cerr << "rbd: unable to list snapshots" << std::endl;
+ return r;
+ }
+
+ librbd::image_info_t info;
+ if (!all_snaps) {
+ snaps.erase(remove_if(snaps.begin(),
+ snaps.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &image, _1)),
+ snaps.end());
+ } else if (!f) {
+ r = image.stat(info, sizeof(info));
+ if (r < 0) {
+ std::cerr << "rbd: unable to get image info" << std::endl;
+ return r;
+ }
+ }
+
+ if (f) {
+ f->open_array_section("snapshots");
+ } else {
+ t.define_column("SNAPID", TextTable::LEFT, TextTable::RIGHT);
+ t.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ t.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ t.define_column("PROTECTED", TextTable::LEFT, TextTable::LEFT);
+ t.define_column("TIMESTAMP", TextTable::LEFT, TextTable::RIGHT);
+ if (all_snaps) {
+ t.define_column("NAMESPACE", TextTable::LEFT, TextTable::LEFT);
+ }
+ }
+
+ std::list<std::pair<int64_t, std::string>> pool_list;
+ rados.pool_list2(pool_list);
+ std::map<int64_t, std::string> pool_map(pool_list.begin(), pool_list.end());
+
+ for (std::vector<librbd::snap_info_t>::iterator s = snaps.begin();
+ s != snaps.end(); ++s) {
+ struct timespec timestamp;
+ bool snap_protected = false;
+ image.snap_get_timestamp(s->id, &timestamp);
+ string tt_str = "";
+ if(timestamp.tv_sec != 0) {
+ time_t tt = timestamp.tv_sec;
+ tt_str = ctime(&tt);
+ tt_str = tt_str.substr(0, tt_str.length() - 1);
+ }
+
+ librbd::snap_namespace_type_t snap_namespace;
+ r = image.snap_get_namespace_type(s->id, &snap_namespace);
+ if (r < 0) {
+ std::cerr << "rbd: unable to retrieve snap namespace" << std::endl;
+ return r;
+ }
+
+ std::string snap_namespace_name = "Unknown";
+ switch (snap_namespace) {
+ case RBD_SNAP_NAMESPACE_TYPE_USER:
+ snap_namespace_name = "user";
+ break;
+ case RBD_SNAP_NAMESPACE_TYPE_GROUP:
+ snap_namespace_name = "group";
+ break;
+ case RBD_SNAP_NAMESPACE_TYPE_TRASH:
+ snap_namespace_name = "trash";
+ break;
+ case RBD_SNAP_NAMESPACE_TYPE_MIRROR:
+ snap_namespace_name = "mirror";
+ break;
+ }
+
+ int get_trash_res = -ENOENT;
+ std::string trash_original_name;
+ int get_group_res = -ENOENT;
+ librbd::snap_group_namespace_t group_snap;
+ int get_mirror_res = -ENOENT;
+ librbd::snap_mirror_namespace_t mirror_snap;
+ std::string mirror_snap_state = "unknown";
+ if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_GROUP) {
+ get_group_res = image.snap_get_group_namespace(s->id, &group_snap,
+ sizeof(group_snap));
+ } else if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_TRASH) {
+ get_trash_res = image.snap_get_trash_namespace(
+ s->id, &trash_original_name);
+ } else if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_MIRROR) {
+ get_mirror_res = image.snap_get_mirror_namespace(
+ s->id, &mirror_snap, sizeof(mirror_snap));
+
+ switch (mirror_snap.state) {
+ case RBD_SNAP_MIRROR_STATE_PRIMARY:
+ mirror_snap_state = "primary";
+ break;
+ case RBD_SNAP_MIRROR_STATE_NON_PRIMARY:
+ mirror_snap_state = "non-primary";
+ break;
+ case RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED:
+ case RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED:
+ mirror_snap_state = "demoted";
+ break;
+ }
+ }
+
+ std::string protected_str = "";
+ if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_USER) {
+ r = image.snap_is_protected(s->name.c_str(), &snap_protected);
+ if (r < 0) {
+ std::cerr << "rbd: unable to retrieve snap protection" << std::endl;
+ return r;
+ }
+ }
+
+ if (f) {
+ protected_str = snap_protected ? "true" : "false";
+ f->open_object_section("snapshot");
+ f->dump_unsigned("id", s->id);
+ f->dump_string("name", s->name);
+ f->dump_unsigned("size", s->size);
+ f->dump_string("protected", protected_str);
+ f->dump_string("timestamp", tt_str);
+ if (all_snaps) {
+ f->open_object_section("namespace");
+ f->dump_string("type", snap_namespace_name);
+ if (get_group_res == 0) {
+ std::string pool_name = pool_map[group_snap.group_pool];
+ f->dump_string("pool", pool_name);
+ f->dump_string("group", group_snap.group_name);
+ f->dump_string("group snap", group_snap.group_snap_name);
+ } else if (get_trash_res == 0) {
+ f->dump_string("original_name", trash_original_name);
+ } else if (get_mirror_res == 0) {
+ f->dump_string("state", mirror_snap_state);
+ f->open_array_section("mirror_peer_uuids");
+ for (auto &uuid : mirror_snap.mirror_peer_uuids) {
+ f->dump_string("peer_uuid", uuid);
+ }
+ f->close_section();
+ f->dump_bool("complete", mirror_snap.complete);
+ if (mirror_snap.state == RBD_SNAP_MIRROR_STATE_NON_PRIMARY ||
+ mirror_snap.state == RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED) {
+ f->dump_string("primary_mirror_uuid",
+ mirror_snap.primary_mirror_uuid);
+ f->dump_unsigned("primary_snap_id",
+ mirror_snap.primary_snap_id);
+ f->dump_unsigned("last_copied_object_number",
+ mirror_snap.last_copied_object_number);
+ }
+ }
+ f->close_section();
+ }
+ f->close_section();
+ } else {
+ protected_str = snap_protected ? "yes" : "";
+ t << s->id << s->name << stringify(byte_u_t(s->size)) << protected_str << tt_str;
+
+ if (all_snaps) {
+ ostringstream oss;
+ oss << snap_namespace_name;
+
+ if (get_group_res == 0) {
+ std::string pool_name = pool_map[group_snap.group_pool];
+ oss << " (" << pool_name << "/"
+ << group_snap.group_name << "@"
+ << group_snap.group_snap_name << ")";
+ } else if (get_trash_res == 0) {
+ oss << " (" << trash_original_name << ")";
+ } else if (get_mirror_res == 0) {
+ oss << " (" << mirror_snap_state << " "
+ << "peer_uuids:[" << mirror_snap.mirror_peer_uuids << "]";
+ if (mirror_snap.state == RBD_SNAP_MIRROR_STATE_NON_PRIMARY ||
+ mirror_snap.state == RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED) {
+ oss << " " << mirror_snap.primary_mirror_uuid << ":"
+ << mirror_snap.primary_snap_id << " ";
+ if (!mirror_snap.complete) {
+ if (info.num_objs > 0) {
+ auto progress = std::min<uint64_t>(
+ 100, 100 * mirror_snap.last_copied_object_number /
+ info.num_objs);
+ oss << progress << "% ";
+ } else {
+ oss << "not ";
+ }
+ }
+ oss << "copied";
+ }
+ oss << ")";
+ }
+
+ t << oss.str();
+ }
+ t << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (snaps.size()) {
+ std::cout << t;
+ }
+
+ return 0;
+}
+
+int do_add_snap(librbd::Image& image, const char *snapname,
+ uint32_t flags, bool no_progress)
+{
+ utils::ProgressContext pc("Creating snap", no_progress);
+
+ int r = image.snap_create2(snapname, flags, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+
+ pc.finish();
+ return 0;
+}
+
+int do_remove_snap(librbd::Image& image, const char *snapname, bool force,
+ bool no_progress)
+{
+ uint32_t flags = force? RBD_SNAP_REMOVE_FORCE : 0;
+ int r = 0;
+ utils::ProgressContext pc("Removing snap", no_progress);
+
+ r = image.snap_remove2(snapname, flags, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+
+ pc.finish();
+ return 0;
+}
+
+int do_rollback_snap(librbd::Image& image, const char *snapname,
+ bool no_progress)
+{
+ utils::ProgressContext pc("Rolling back to snapshot", no_progress);
+ int r = image.snap_rollback_with_progress(snapname, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+int do_purge_snaps(librbd::Image& image, bool no_progress)
+{
+ utils::ProgressContext pc("Removing all snapshots", no_progress);
+ std::vector<librbd::snap_info_t> snaps;
+ bool is_protected = false;
+ int r = image.snap_list(snaps);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ } else if (0 == snaps.size()) {
+ return 0;
+ } else {
+ list<std::string> protect;
+ snaps.erase(remove_if(snaps.begin(),
+ snaps.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &image, _1)),
+ snaps.end());
+ for (auto it = snaps.begin(); it != snaps.end();) {
+ r = image.snap_is_protected(it->name.c_str(), &is_protected);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ } else if (is_protected == true) {
+ protect.push_back(it->name.c_str());
+ snaps.erase(it);
+ } else {
+ ++it;
+ }
+ }
+
+ if (!protect.empty()) {
+ std::cout << "rbd: error removing snapshot(s) '" << protect << "', which "
+ << (1 == protect.size() ? "is" : "are")
+ << " protected - these must be unprotected with "
+ << "`rbd snap unprotect`."
+ << std::endl;
+ }
+ for (size_t i = 0; i < snaps.size(); ++i) {
+ r = image.snap_remove(snaps[i].name.c_str());
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.update_progress(i + 1, snaps.size() + protect.size());
+ }
+
+ if (!protect.empty()) {
+ pc.fail();
+ } else if (snaps.size() > 0) {
+ pc.finish();
+ }
+
+ return 0;
+ }
+}
+
+int do_protect_snap(librbd::Image& image, const char *snapname)
+{
+ int r = image.snap_protect(snapname);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int do_unprotect_snap(librbd::Image& image, const char *snapname)
+{
+ int r = image.snap_unprotect(snapname);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
// Set the maximum number of snapshots permitted on the image.
// Returns the librbd status code directly.
int do_set_limit(librbd::Image& image, uint64_t limit)
{
  return image.snap_set_limit(limit);
}
+
// Register CLI arguments for "snap list": image spec, --image-id, format
// options, and the "--all/-a" switch for listing every snapshot namespace.
void get_list_arguments(po::options_description *positional,
                        po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_image_id_option(options);
  at::add_format_options(options);

  // "all,a" registers both the long option and its short alias
  std::string name = ALL_NAME + ",a";

  options->add_options()
    (name.c_str(), po::bool_switch(), "list snapshots from all namespaces");
}
+
// CLI entry point for "rbd snap list": opens the image (read-only, by name
// or by id — not both) and dumps its snapshots.
int execute_list(const po::variables_map &vm,
                 const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  std::string image_id;

  if (vm.count(at::IMAGE_ID)) {
    image_id = vm[at::IMAGE_ID].as<std::string>();
  }

  // the image-name positional is only required when no --image-id was given
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, image_id.empty(),
    utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  if (!image_id.empty() && !image_name.empty()) {
    std::cerr << "rbd: trying to access image using both name and id. "
              << std::endl;
    return -EINVAL;
  }

  at::Format::Formatter formatter;
  r = utils::get_formatter(vm, &formatter);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  // open read-only: listing does not modify the image
  r = utils::init_and_open_image(pool_name, namespace_name, image_name,
                                 image_id, "", true, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  bool all_snaps = vm[ALL_NAME].as<bool>();
  r = do_list_snaps(image, formatter.get(), all_snaps, rados);
  if (r < 0) {
    cerr << "rbd: failed to list snapshots: " << cpp_strerror(r)
         << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap create": snapshot spec, snapshot
// creation flags and --no-progress.
void get_create_arguments(po::options_description *positional,
                          po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_snap_create_options(options);
  at::add_no_progress_option(options);
}
+
// CLI entry point for "rbd snap create": creates a snapshot on the image.
// A snapshot name is required and gets snapshot-spec validation.
int execute_create(const po::variables_map &vm,
                   const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
    utils::SPEC_VALIDATION_SNAP);
  if (r < 0) {
    return r;
  }

  uint32_t flags;
  r = utils::get_snap_create_flags(vm, &flags);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  // open read-write: snapshot creation modifies the image
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = do_add_snap(image, snap_name.c_str(), flags,
                  vm[at::NO_PROGRESS].as<bool>());
  if (r < 0) {
    cerr << "rbd: failed to create snapshot: " << cpp_strerror(r)
         << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap remove": snapshot spec, --image-id,
// --snap-id, --no-progress and the --force switch.
void get_remove_arguments(po::options_description *positional,
                          po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_image_id_option(options);
  at::add_snap_id_option(options);
  at::add_no_progress_option(options);

  options->add_options()
    ("force", po::bool_switch(), "flatten children and unprotect snapshot if needed.");
}
+
// CLI entry point for "rbd snap remove": deletes one snapshot, addressed
// either by name or by --snap-id (mutually exclusive).  Removal by id does
// not support --force or --no-progress.
int execute_remove(const po::variables_map &vm,
                   const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  std::string image_id;
  uint64_t snap_id = CEPH_NOSNAP;
  bool force = vm["force"].as<bool>();
  bool no_progress = vm[at::NO_PROGRESS].as<bool>();

  if (vm.count(at::IMAGE_ID)) {
    image_id = vm[at::IMAGE_ID].as<std::string>();
  }
  if (vm.count(at::SNAPSHOT_ID)) {
    snap_id = vm[at::SNAPSHOT_ID].as<uint64_t>();
  }

  // a snapshot name is only required when no --snap-id was supplied
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, image_id.empty(),
    (snap_id == CEPH_NOSNAP ? utils::SNAPSHOT_PRESENCE_REQUIRED :
                              utils::SNAPSHOT_PRESENCE_PERMITTED),
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  if (!image_id.empty() && !image_name.empty()) {
    std::cerr << "rbd: trying to access image using both name and id."
              << std::endl;
    return -EINVAL;
  } else if (!snap_name.empty() && snap_id != CEPH_NOSNAP) {
    std::cerr << "rbd: trying to access snapshot using both name and id."
              << std::endl;
    return -EINVAL;
  } else if ((force || no_progress) && snap_id != CEPH_NOSNAP) {
    std::cerr << "rbd: force and no-progress options not permitted when "
              << "removing by id." << std::endl;
    return -EINVAL;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  // snapshot removal frees space, so allow it even on a full pool
  io_ctx.set_pool_full_try();
  if (image_id.empty()) {
    r = utils::open_image(io_ctx, image_name, false, &image);
  } else {
    r = utils::open_image_by_id(io_ctx, image_id, false, &image);
  }
  if (r < 0) {
    return r;
  }

  if (!snap_name.empty()) {
    r = do_remove_snap(image, snap_name.c_str(), force, no_progress);
  } else {
    r = image.snap_remove_by_id(snap_id);
  }

  if (r < 0) {
    // -EBUSY indicates the snapshot is protected
    if (r == -EBUSY) {
      std::cerr << "rbd: snapshot "
                << (snap_name.empty() ? std::string("id ") + stringify(snap_id) :
                    std::string("'") + snap_name + "'")
                << " is protected from removal." << std::endl;
    } else {
      std::cerr << "rbd: failed to remove snapshot: " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap purge": image spec, --image-id and
// --no-progress.
void get_purge_arguments(po::options_description *positional,
                         po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_image_id_option(options);
  at::add_no_progress_option(options);
}
+
// CLI entry point for "rbd snap purge": removes all unprotected snapshots
// from an image addressed by name or by --image-id (not both).
int execute_purge(const po::variables_map &vm,
                  const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  std::string image_id;

  if (vm.count(at::IMAGE_ID)) {
    image_id = vm[at::IMAGE_ID].as<std::string>();
  }

  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, image_id.empty(),
    utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  if (!image_id.empty() && !image_name.empty()) {
    std::cerr << "rbd: trying to access image using both name and id. "
              << std::endl;
    return -EINVAL;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  // purge frees space, so allow it even on a full pool
  io_ctx.set_pool_full_try();
  if (image_id.empty()) {
    r = utils::open_image(io_ctx, image_name, false, &image);
  } else {
    r = utils::open_image_by_id(io_ctx, image_id, false, &image);
  }
  if (r < 0) {
    return r;
  }

  r = do_purge_snaps(image, vm[at::NO_PROGRESS].as<bool>());
  if (r < 0) {
    // -EBUSY (protected snapshots) was already reported by do_purge_snaps
    if (r != -EBUSY) {
      std::cerr << "rbd: removing snaps failed: " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap rollback": snapshot spec and
// --no-progress.
void get_rollback_arguments(po::options_description *positional,
                            po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_no_progress_option(options);
}
+
// CLI entry point for "rbd snap rollback": reverts the image's data to the
// named snapshot.
int execute_rollback(const po::variables_map &vm,
                     const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  // open read-write: rollback rewrites image data
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = do_rollback_snap(image, snap_name.c_str(),
                       vm[at::NO_PROGRESS].as<bool>());
  if (r < 0) {
    std::cerr << "rbd: rollback failed: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap protect": just the snapshot spec.
void get_protect_arguments(po::options_description *positional,
                           po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
}
+
// CLI entry point for "rbd snap protect": marks a snapshot as protected.
// Protecting an already-protected snapshot is reported and fails -EBUSY.
int execute_protect(const po::variables_map &vm,
                    const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  // check current state first to give a clearer error than the API would
  bool is_protected = false;
  r = image.snap_is_protected(snap_name.c_str(), &is_protected);
  if (r < 0) {
    std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  } else if (is_protected) {
    std::cerr << "rbd: snap is already protected" << std::endl;
    return -EBUSY;
  }

  r = do_protect_snap(image, snap_name.c_str());
  if (r < 0) {
    std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap unprotect": snapshot spec and
// --image-id.
void get_unprotect_arguments(po::options_description *positional,
                             po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
  at::add_image_id_option(options);
}
+
// CLI entry point for "rbd snap unprotect": clears a snapshot's protection
// flag.  The image is addressed by name or --image-id (not both);
// unprotecting an already-unprotected snapshot fails with -EINVAL.
int execute_unprotect(const po::variables_map &vm,
                      const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  std::string image_id;

  if (vm.count(at::IMAGE_ID)) {
    image_id = vm[at::IMAGE_ID].as<std::string>();
  }

  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, image_id.empty(),
    utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  if (!image_id.empty() && !image_name.empty()) {
    std::cerr << "rbd: trying to access image using both name and id. "
              << std::endl;
    return -EINVAL;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
  if (r < 0) {
    return r;
  }

  // unprotect is a prerequisite for snapshot removal (which frees space),
  // so allow it even on a full pool
  io_ctx.set_pool_full_try();
  if (image_id.empty()) {
    r = utils::open_image(io_ctx, image_name, false, &image);
  } else {
    r = utils::open_image_by_id(io_ctx, image_id, false, &image);
  }
  if (r < 0) {
    return r;
  }

  // check current state first to give a clearer error than the API would
  bool is_protected = false;
  r = image.snap_is_protected(snap_name.c_str(), &is_protected);
  if (r < 0) {
    std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  } else if (!is_protected) {
    std::cerr << "rbd: snap is already unprotected" << std::endl;
    return -EINVAL;
  }

  r = do_unprotect_snap(image, snap_name.c_str());
  if (r < 0) {
    std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap limit set": image spec and --limit.
void get_set_limit_arguments(po::options_description *pos,
                             po::options_description *opt) {
  at::add_image_spec_options(pos, opt, at::ARGUMENT_MODIFIER_NONE);
  at::add_limit_option(opt);
}
+
// CLI entry point for "rbd snap limit set": caps the number of snapshots
// an image may have.  --limit is mandatory.
int execute_set_limit(const po::variables_map &vm,
                      const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  uint64_t limit;

  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  if (vm.count(at::LIMIT)) {
    limit = vm[at::LIMIT].as<uint64_t>();
  } else {
    std::cerr << "rbd: must specify --limit <num>" << std::endl;
    return -ERANGE;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = do_set_limit(image, limit);
  if (r < 0) {
    std::cerr << "rbd: setting snapshot limit failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap limit clear": just the image spec.
void get_clear_limit_arguments(po::options_description *pos,
                               po::options_description *opt) {
  at::add_image_spec_options(pos, opt, at::ARGUMENT_MODIFIER_NONE);
}
+
// CLI entry point for "rbd snap limit clear": removes any snapshot-count
// limit by setting it to UINT64_MAX (librbd's "unlimited" sentinel).
int execute_clear_limit(const po::variables_map &vm,
                        const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;

  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 false, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = do_set_limit(image, UINT64_MAX);
  if (r < 0) {
    std::cerr << "rbd: clearing snapshot limit failed: " << cpp_strerror(r)
              << std::endl;
    return r;
  }
  return 0;
}
+
// Register CLI arguments for "snap rename": source snapshot spec followed
// by destination snapshot spec.
void get_rename_arguments(po::options_description *positional,
                          po::options_description *options) {
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
}
+
+int execute_rename(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string src_snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &src_snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return -r;
+ }
+
+ std::string dest_pool_name(pool_name);
+ std::string dest_namespace_name(namespace_name);
+ std::string dest_image_name;
+ std::string dest_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dest_pool_name,
+ &dest_namespace_name, &dest_image_name, &dest_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_SNAP);
+ if (r < 0) {
+ return -r;
+ }
+
+ if (pool_name != dest_pool_name) {
+ std::cerr << "rbd: source and destination pool must be the same"
+ << std::endl;
+ return -EINVAL;
+ } else if (namespace_name != dest_namespace_name) {
+ std::cerr << "rbd: source and destination namespace must be the same"
+ << std::endl;
+ return -EINVAL;
+ } else if (image_name != dest_image_name) {
+ std::cerr << "rbd: source and destination image name must be the same"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.snap_rename(src_snap_name.c_str(), dest_snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: renaming snap failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
// Register all "rbd snap ..." subcommands with the rbd shell dispatcher.
Shell::Action action_list(
  {"snap", "list"}, {"snap", "ls"}, "Dump list of image snapshots.", "",
  &get_list_arguments, &execute_list);
Shell::Action action_create(
  {"snap", "create"}, {"snap", "add"}, "Create a snapshot.", "",
  &get_create_arguments, &execute_create);
Shell::Action action_remove(
  {"snap", "remove"}, {"snap", "rm"}, "Delete a snapshot.", "",
  &get_remove_arguments, &execute_remove);
Shell::Action action_purge(
  {"snap", "purge"}, {}, "Delete all unprotected snapshots.", "",
  &get_purge_arguments, &execute_purge);
Shell::Action action_rollback(
  {"snap", "rollback"}, {"snap", "revert"}, "Rollback image to snapshot.", "",
  &get_rollback_arguments, &execute_rollback);
Shell::Action action_protect(
  {"snap", "protect"}, {}, "Prevent a snapshot from being deleted.", "",
  &get_protect_arguments, &execute_protect);
Shell::Action action_unprotect(
  {"snap", "unprotect"}, {}, "Allow a snapshot to be deleted.", "",
  &get_unprotect_arguments, &execute_unprotect);
Shell::Action action_set_limit(
  {"snap", "limit", "set"}, {}, "Limit the number of snapshots.", "",
  &get_set_limit_arguments, &execute_set_limit);
Shell::Action action_clear_limit(
  {"snap", "limit", "clear"}, {}, "Remove snapshot limit.", "",
  &get_clear_limit_arguments, &execute_clear_limit);
Shell::Action action_rename(
  {"snap", "rename"}, {}, "Rename a snapshot.", "",
  &get_rename_arguments, &execute_rename);
+
+} // namespace snap
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Sparsify.cc b/src/tools/rbd/action/Sparsify.cc
new file mode 100644
index 000000000..a345f920b
--- /dev/null
+++ b/src/tools/rbd/action/Sparsify.cc
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace sparsify {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Sparsify an already-open image, reporting progress on the CLI unless
+// no_progress is set.  sparse_size is the extent granularity passed to
+// librbd for detecting zeroed regions.  On error the progress context
+// is marked failed so the user sees the aborted state; returns
+// librbd's result code (negative errno on failure, 0 on success).
+static int do_sparsify(librbd::Image& image, size_t sparse_size,
+                       bool no_progress)
+{
+  utils::ProgressContext pc("Image sparsify", no_progress);
+  int r = image.sparsify_with_progress(sparse_size, pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+// Declare CLI arguments for "rbd sparsify": the positional image spec
+// plus the --no-progress and --sparse-size options.
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+  at::add_sparse_size_option(options);
+}
+
+// Entry point for "rbd sparsify": parse the image spec from the
+// command line, open the image, and run do_sparsify() with the
+// user-supplied (or default) sparse size.  Returns 0 on success or a
+// negative errno, after printing a diagnostic to stderr on failure.
+int execute(const po::variables_map &vm,
+            const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 false, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  // Fall back to the tool-wide default when --sparse-size was not given.
+  size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+  if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+    sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+  }
+
+  r = do_sparsify(image, sparse_size, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: sparsify error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Register the "rbd sparsify" command with the CLI shell.
+Shell::Action action(
+  {"sparsify"}, {},
+  "Reclaim space for zeroed image extents.", "",
+  &get_arguments, &execute);
+
+} // namespace sparsify
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Status.cc b/src/tools/rbd/action/Status.cc
new file mode 100644
index 000000000..958a686c4
--- /dev/null
+++ b/src/tools/rbd/action/Status.cc
@@ -0,0 +1,365 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "json_spirit/json_spirit.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "include/stringify.h"
+#include "librbd/cache/Types.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace status {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Show the status of an open image: current watchers, live-migration
+// progress (when RBD_FEATURE_MIGRATING is set on the image) and
+// persistent cache statistics (when RBD_FEATURE_DIRTY_CACHE is set).
+// Output goes to the Formatter when one is supplied (machine-readable
+// JSON/XML), otherwise human-readable text on std::cout.  Migration
+// and cache lookup failures are reported to stderr but are not fatal.
+static int do_show_status(librados::IoCtx& io_ctx, const std::string &image_name,
+                          librbd::Image &image, Formatter *f)
+{
+  int r;
+  std::list<librbd::image_watcher_t> watchers;
+
+  r = image.list_watchers(watchers);
+  if (r < 0)
+    return r;
+
+  uint64_t features;
+  r = image.features(&features);
+  if (r < 0) {
+    return r;
+  }
+
+  // Resolve migration source/destination pool names and map the raw
+  // migration state enum to a human-readable label.  An empty
+  // migration_state afterwards means "no migration info to print".
+  librbd::image_migration_status_t migration_status;
+  std::string source_spec;
+  std::string source_pool_name;
+  std::string dest_pool_name;
+  std::string migration_state;
+  if ((features & RBD_FEATURE_MIGRATING) != 0) {
+    r = librbd::RBD().migration_status(io_ctx, image_name.c_str(),
+                                       &migration_status,
+                                       sizeof(migration_status));
+    if (r < 0) {
+      std::cerr << "rbd: getting migration status failed: " << cpp_strerror(r)
+                << std::endl;
+      // not fatal
+    } else {
+      // A negative source pool id means the source is an external
+      // (spec-described) image rather than a rados pool.
+      if (migration_status.source_pool_id >= 0) {
+        librados::IoCtx src_io_ctx;
+        r = librados::Rados(io_ctx).ioctx_create2(migration_status.source_pool_id, src_io_ctx);
+        if (r < 0) {
+          // Pool may have been deleted; fall back to the numeric id.
+          source_pool_name = stringify(migration_status.source_pool_id);
+        } else {
+          source_pool_name = src_io_ctx.get_pool_name();
+        }
+      } else {
+        r = image.get_migration_source_spec(&source_spec);
+        if (r < 0) {
+          std::cerr << "rbd: getting migration source spec failed: "
+                    << cpp_strerror(r) << std::endl;
+        }
+      }
+
+      librados::IoCtx dst_io_ctx;
+      r = librados::Rados(io_ctx).ioctx_create2(migration_status.dest_pool_id, dst_io_ctx);
+      if (r < 0) {
+        dest_pool_name = stringify(migration_status.dest_pool_id);
+      } else {
+        dest_pool_name = dst_io_ctx.get_pool_name();
+      }
+
+      switch (migration_status.state) {
+      case RBD_IMAGE_MIGRATION_STATE_ERROR:
+        migration_state = "error";
+        break;
+      case RBD_IMAGE_MIGRATION_STATE_PREPARING:
+        migration_state = "preparing";
+        break;
+      case RBD_IMAGE_MIGRATION_STATE_PREPARED:
+        migration_state = "prepared";
+        break;
+      case RBD_IMAGE_MIGRATION_STATE_EXECUTING:
+        migration_state = "executing";
+        break;
+      case RBD_IMAGE_MIGRATION_STATE_EXECUTED:
+        migration_state = "executed";
+        break;
+      case RBD_IMAGE_MIGRATION_STATE_ABORTING:
+        migration_state = "aborting";
+        break;
+      default:
+        migration_state = "unknown";
+      }
+    }
+  }
+
+  // Persistent cache statistics decoded from the image metadata JSON,
+  // plus values derived from them.  NOTE(review): this struct is not
+  // zero-initialized; see the note at the derived-totals computation.
+  struct {
+    // decoded
+    std::string host;
+    std::string path;
+    uint64_t size;
+    std::string mode;
+    std::string stats_timestamp;
+    bool present;
+    bool empty;
+    bool clean;
+    uint64_t allocated_bytes;
+    uint64_t cached_bytes;
+    uint64_t dirty_bytes;
+    uint64_t free_bytes;
+    uint64_t hits_full;
+    uint64_t hits_partial;
+    uint64_t misses;
+    uint64_t hit_bytes;
+    uint64_t miss_bytes;
+
+    // calculated
+    uint64_t total_read_ops;
+    uint64_t total_read_bytes;
+    int hits_full_percent;
+    int hits_partial_percent;
+    int hit_bytes_percent;
+  } cache_state;
+  // An empty cache_str after this section means "no cache state to print".
+  std::string cache_str;
+  if (features & RBD_FEATURE_DIRTY_CACHE) {
+    r = image.metadata_get(librbd::cache::PERSISTENT_CACHE_STATE, &cache_str);
+    if (r < 0) {
+      std::cerr << "rbd: getting persistent cache state failed: " << cpp_strerror(r)
+                << std::endl;
+      // not fatal
+    }
+    // NOTE(review): when metadata_get failed, cache_str is empty and the
+    // parse below is still attempted, producing a second error message —
+    // presumably intentional best-effort handling; confirm.
+    json_spirit::mValue json_root;
+    if (!json_spirit::read(cache_str.c_str(), json_root)) {
+      std::cerr << "rbd: parsing persistent cache state failed" << std::endl;
+      cache_str.clear();
+    } else {
+      try {
+        auto& o = json_root.get_obj();
+        cache_state.host = o["host"].get_str();
+        cache_state.path = o["path"].get_str();
+        cache_state.size = o["size"].get_uint64();
+        cache_state.mode = o["mode"].get_str();
+        time_t stats_timestamp_sec = o["stats_timestamp"].get_uint64();
+        cache_state.stats_timestamp = ctime(&stats_timestamp_sec);
+        // ctime() appends a trailing '\n'; strip it.
+        cache_state.stats_timestamp.pop_back();
+        cache_state.present = o["present"].get_bool();
+        cache_state.empty = o["empty"].get_bool();
+        cache_state.clean = o["clean"].get_bool();
+        cache_state.allocated_bytes = o["allocated_bytes"].get_uint64();
+        cache_state.cached_bytes = o["cached_bytes"].get_uint64();
+        cache_state.dirty_bytes = o["dirty_bytes"].get_uint64();
+        cache_state.free_bytes = o["free_bytes"].get_uint64();
+        cache_state.hits_full = o["hits_full"].get_uint64();
+        cache_state.hits_partial = o["hits_partial"].get_uint64();
+        cache_state.misses = o["misses"].get_uint64();
+        cache_state.hit_bytes = o["hit_bytes"].get_uint64();
+        cache_state.miss_bytes = o["miss_bytes"].get_uint64();
+      } catch (std::runtime_error &e) {
+        std::cerr << "rbd: parsing persistent cache state failed: " << e.what()
+                  << std::endl;
+        cache_str.clear();
+      }
+      // NOTE(review): these derived totals are computed even when the
+      // catch above fired, reading members that were never assigned
+      // (the struct is uninitialized).  Nothing is printed in that case
+      // because cache_str was cleared, but the reads are indeterminate —
+      // consider moving this into the try block or zero-initializing.
+      cache_state.total_read_ops = cache_state.hits_full +
+        cache_state.hits_partial + cache_state.misses;
+      cache_state.total_read_bytes = cache_state.hit_bytes +
+        cache_state.miss_bytes;
+      cache_state.hits_full_percent = utils::get_percentage(
+        cache_state.hits_full, cache_state.total_read_ops);
+      cache_state.hits_partial_percent = utils::get_percentage(
+        cache_state.hits_partial, cache_state.total_read_ops);
+      cache_state.hit_bytes_percent = utils::get_percentage(
+        cache_state.hit_bytes, cache_state.total_read_bytes);
+    }
+  }
+
+  if (f)
+    f->open_object_section("status");
+
+  // Structured (Formatter) output path.
+  if (f) {
+    f->open_array_section("watchers");
+    for (auto &watcher : watchers) {
+      f->open_object_section("watcher");
+      f->dump_string("address", watcher.addr);
+      f->dump_unsigned("client", watcher.id);
+      f->dump_unsigned("cookie", watcher.cookie);
+      f->close_section();
+    }
+    f->close_section(); // watchers
+    if (!migration_state.empty()) {
+      f->open_object_section("migration");
+      if (!source_spec.empty()) {
+        f->dump_string("source_spec", source_spec);
+      } else {
+        f->dump_string("source_pool_name", source_pool_name);
+        f->dump_string("source_pool_namespace",
+                       migration_status.source_pool_namespace);
+        f->dump_string("source_image_name", migration_status.source_image_name);
+        f->dump_string("source_image_id", migration_status.source_image_id);
+      }
+      f->dump_string("dest_pool_name", dest_pool_name);
+      f->dump_string("dest_pool_namespace",
+                     migration_status.dest_pool_namespace);
+      f->dump_string("dest_image_name", migration_status.dest_image_name);
+      f->dump_string("dest_image_id", migration_status.dest_image_id);
+      f->dump_string("state", migration_state);
+      f->dump_string("state_description", migration_status.state_description);
+      f->close_section(); // migration
+    }
+    if (!cache_str.empty()) {
+      f->open_object_section("persistent_cache");
+      f->dump_string("host", cache_state.host);
+      f->dump_string("path", cache_state.path);
+      f->dump_unsigned("size", cache_state.size);
+      f->dump_string("mode", cache_state.mode);
+      f->dump_string("stats_timestamp", cache_state.stats_timestamp);
+      f->dump_bool("present", cache_state.present);
+      f->dump_bool("empty", cache_state.empty);
+      f->dump_bool("clean", cache_state.clean);
+      f->dump_unsigned("allocated_bytes", cache_state.allocated_bytes);
+      f->dump_unsigned("cached_bytes", cache_state.cached_bytes);
+      f->dump_unsigned("dirty_bytes", cache_state.dirty_bytes);
+      f->dump_unsigned("free_bytes", cache_state.free_bytes);
+      f->dump_unsigned("hits_full", cache_state.hits_full);
+      f->dump_int("hits_full_percent", cache_state.hits_full_percent);
+      f->dump_unsigned("hits_partial", cache_state.hits_partial);
+      f->dump_int("hits_partial_percent", cache_state.hits_partial_percent);
+      f->dump_unsigned("misses", cache_state.misses);
+      f->dump_unsigned("hit_bytes", cache_state.hit_bytes);
+      f->dump_int("hit_bytes_percent", cache_state.hit_bytes_percent);
+      f->dump_unsigned("miss_bytes", cache_state.miss_bytes);
+      f->close_section(); // persistent_cache
+    }
+  } else {
+    // Plain-text output path.
+    if (watchers.size()) {
+      std::cout << "Watchers:" << std::endl;
+      for (auto &watcher : watchers) {
+        std::cout << "\twatcher=" << watcher.addr << " client." << watcher.id
+                  << " cookie=" << watcher.cookie << std::endl;
+      }
+    } else {
+      std::cout << "Watchers: none" << std::endl;
+    }
+    if (!migration_state.empty()) {
+      if (!migration_status.source_pool_namespace.empty()) {
+        source_pool_name += ("/" + migration_status.source_pool_namespace);
+      }
+      if (!migration_status.dest_pool_namespace.empty()) {
+        dest_pool_name += ("/" + migration_status.dest_pool_namespace);
+      }
+
+      std::cout << "Migration:" << std::endl;
+      std::cout << "\tsource: ";
+      if (!source_spec.empty()) {
+        std::cout << source_spec;
+      } else {
+        std::cout << source_pool_name << "/"
+                  << migration_status.source_image_name;
+        if (!migration_status.source_image_id.empty()) {
+          std::cout << " (" << migration_status.source_image_id << ")";
+        }
+      }
+      std::cout << std::endl;
+      std::cout << "\tdestination: " << dest_pool_name << "/"
+                << migration_status.dest_image_name << " ("
+                << migration_status.dest_image_id << ")" << std::endl;
+      std::cout << "\tstate: " << migration_state;
+      if (!migration_status.state_description.empty()) {
+        std::cout << " (" << migration_status.state_description << ")";
+      }
+      std::cout << std::endl;
+    }
+    if (!cache_str.empty()) {
+      std::cout << "Persistent cache state:" << std::endl;
+      std::cout << "\thost: " << cache_state.host << std::endl;
+      std::cout << "\tpath: " << cache_state.path << std::endl;
+      std::cout << "\tsize: " << byte_u_t(cache_state.size) << std::endl;
+      std::cout << "\tmode: " << cache_state.mode << std::endl;
+      std::cout << "\tstats_timestamp: " << cache_state.stats_timestamp
+                << std::endl;
+      std::cout << "\tpresent: " << (cache_state.present ? "true" : "false")
+                << "\tempty: " << (cache_state.empty ? "true" : "false")
+                << "\tclean: " << (cache_state.clean ? "true" : "false")
+                << std::endl;
+      std::cout << "\tallocated: " << byte_u_t(cache_state.allocated_bytes)
+                << std::endl;
+      std::cout << "\tcached: " << byte_u_t(cache_state.cached_bytes)
+                << std::endl;
+      std::cout << "\tdirty: " << byte_u_t(cache_state.dirty_bytes) << std::endl;
+      std::cout << "\tfree: " << byte_u_t(cache_state.free_bytes) << std::endl;
+      std::cout << "\thits_full: " << cache_state.hits_full << " / "
+                << cache_state.hits_full_percent << "%" << std::endl;
+      std::cout << "\thits_partial: " << cache_state.hits_partial << " / "
+                << cache_state.hits_partial_percent << "%" << std::endl;
+      std::cout << "\tmisses: " << cache_state.misses << std::endl;
+      std::cout << "\thit_bytes: " << byte_u_t(cache_state.hit_bytes) << " / "
+                << cache_state.hit_bytes_percent << "%" << std::endl;
+      std::cout << "\tmiss_bytes: " << byte_u_t(cache_state.miss_bytes)
+                << std::endl;
+    }
+  }
+
+  if (f) {
+    f->close_section(); // status
+    f->flush(std::cout);
+  }
+
+  return 0;
+}
+
+// Declare CLI arguments for "rbd status": the positional image spec
+// plus the common output-format options (--format/--pretty-format).
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+// Entry point for "rbd status": parse the image spec and optional
+// formatter, open the image (the 'true' passed to init_and_open_image
+// presumably requests a read-only open — confirm against Utils.h),
+// then delegate to do_show_status().
+int execute(const po::variables_map &vm,
+            const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+                                 true, &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_status(io_ctx, image_name, image, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: show status failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+// Register the "rbd status" command with the CLI shell.
+Shell::Action action(
+  {"status"}, {}, "Show the status of this image.", "", &get_arguments,
+  &execute);
+
+} // namespace status
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Trash.cc b/src/tools/rbd/action/Trash.cc
new file mode 100644
index 000000000..6765fbb3e
--- /dev/null
+++ b/src/tools/rbd/action/Trash.cc
@@ -0,0 +1,540 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/Clock.h"
+#include <iostream>
+#include <sstream>
+#include <boost/program_options.hpp>
+#include <boost/bind/bind.hpp>
+
+namespace rbd {
+namespace action {
+namespace trash {
+using namespace boost::placeholders;
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+//Optional arguments used only by this set of commands (rbd trash *)
+static const std::string EXPIRES_AT("expires-at");
+static const std::string EXPIRED_BEFORE("expired-before");
+static const std::string THRESHOLD("threshold");
+
+// Predicate used to filter trash listings: true when the entry was NOT
+// moved to the trash directly by the user (or as a user-trashed
+// image's parent), i.e. it came from mirroring/migration/removal.
+static bool is_not_trash_user(const librbd::trash_image_info_t &trash_info) {
+  return trash_info.source != RBD_TRASH_IMAGE_SOURCE_USER &&
+         trash_info.source != RBD_TRASH_IMAGE_SOURCE_USER_PARENT;
+}
+
+// Declare CLI arguments for "rbd trash move": the positional image
+// spec and --expires-at (defaults to "now", i.e. no deferment).
+void get_move_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  options->add_options()
+    (EXPIRES_AT.c_str(), po::value<std::string>()->default_value("now"),
+     "set the expiration time of an image so it can be purged when it is stale");
+}
+
+// Entry point for "rbd trash move": move an image into the pool's
+// trash with a deferment delay derived from --expires-at.  The delay
+// is the difference between the parsed expiration date and now;
+// dates in the past are rejected with EINVAL.
+int execute_move(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_name;
+  std::string snap_name;
+
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+    utils::SPEC_VALIDATION_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  // NOTE(review): EXPIRES_AT has a default_value ("now"), so this
+  // vm.find() guard is presumably always true — confirm; the default
+  // yields exp_time == now and therefore a zero deferment.
+  utime_t now = ceph_clock_now();
+  utime_t exp_time = now;
+  std::string expires_at;
+  if (vm.find(EXPIRES_AT) != vm.end()) {
+    expires_at = vm[EXPIRES_AT].as<std::string>();
+    // invoke_date shells out to /bin/date to parse free-form dates.
+    r = utime_t::invoke_date(expires_at, &exp_time);
+    if (r < 0) {
+      std::cerr << "rbd: error calling /bin/date: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+  }
+
+  time_t dt = (exp_time - now).sec();
+  if(dt < 0) {
+    std::cerr << "rbd: cannot use a date in the past as an expiration date"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  librbd::RBD rbd;
+  r = rbd.trash_move(io_ctx, image_name.c_str(), dt);
+  if (r < 0) {
+    std::cerr << "rbd: deferred delete error: " << cpp_strerror(r)
+              << std::endl;
+  }
+
+  return r;
+}
+
+// Declare CLI arguments for "rbd trash remove": a positional image id
+// (trash entries are addressed by id, not name), pool/namespace/id
+// options, --no-progress, and --force to delete before the deferment
+// period expires.
+void get_remove_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  positional->add_options()
+    (at::IMAGE_ID.c_str(), "image id\n(example: [<pool-name>/[<namespace>/]]<image-id>)");
+  at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_id_option(options);
+
+  at::add_no_progress_option(options);
+  options->add_options()
+    ("force", po::bool_switch(), "force remove of non-expired delayed images");
+}
+
+// Entry point for "rbd trash remove": permanently delete a trashed
+// image by id, translating the most common librbd error codes into
+// actionable user-facing messages.
+int execute_remove(const po::variables_map &vm,
+                   const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_id;
+  int r = utils::get_pool_image_id(vm, &arg_index, &pool_name, &namespace_name,
+                                   &image_id);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  // Deleting frees space, so allow the op even on a (nearly) full pool.
+  io_ctx.set_pool_full_try();
+  librbd::RBD rbd;
+
+  utils::ProgressContext pc("Removing image", vm[at::NO_PROGRESS].as<bool>());
+  r = rbd.trash_remove_with_progress(io_ctx, image_id.c_str(),
+                                     vm["force"].as<bool>(), pc);
+  if (r < 0) {
+    // Map well-known failure modes to explanatory messages.
+    if (r == -ENOTEMPTY) {
+      std::cerr << "rbd: image has snapshots - these must be deleted"
+                << " with 'rbd snap purge' before the image can be removed."
+                << std::endl;
+    } else if (r == -EUCLEAN) {
+      std::cerr << "rbd: error: image not fully moved to trash."
+                << std::endl;
+    } else if (r == -EBUSY) {
+      std::cerr << "rbd: error: image still has watchers"
+                << std::endl
+                << "This means the image is still open or the client using "
+                << "it crashed. Try again after closing/unmapping it or "
+                << "waiting 30s for the crashed client to timeout."
+                << std::endl;
+    } else if (r == -EMLINK) {
+      std::cerr << std::endl
+                << "Remove the image from the group and try again."
+                << std::endl;
+    } else if (r == -EPERM) {
+      std::cerr << std::endl
+                << "Deferment time has not expired, please use --force if you "
+                << "really want to remove the image"
+                << std::endl;
+    } else {
+      std::cerr << "rbd: remove error: " << cpp_strerror(r) << std::endl;
+    }
+    pc.fail();
+    return r;
+  }
+
+  pc.finish();
+
+  return r;
+}
+
+// Render a trash entry's deferment deadline as a human-readable
+// status: "protected until <time>" while the deferment window is
+// still open, "expired at <time>" afterwards.
+std::string delete_status(time_t deferment_end_time) {
+  time_t now = time(nullptr);
+
+  // ctime() appends a trailing '\n'; strip it.
+  std::string time_str = ctime(&deferment_end_time);
+  time_str = time_str.substr(0, time_str.length() - 1);
+
+  std::stringstream ss;
+  if (now < deferment_end_time) {
+    ss << "protected until " << time_str;
+  } else {
+    ss << "expired at " << time_str;
+  }
+
+  return ss.str();
+}
+
+// List the pool's trashed images.  Unless all_flag is set, entries not
+// trashed by the user are filtered out.  Short form prints "id name"
+// pairs; long form opens each image and adds source, deletion time,
+// deferment status, and parent info, as a table or via the Formatter.
+int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool long_flag,
+            bool all_flag, Formatter *f) {
+  std::vector<librbd::trash_image_info_t> trash_entries;
+  int r = rbd.trash_list(io_ctx, trash_entries);
+  if (r < 0) {
+    return r;
+  }
+
+  if (!all_flag) {
+    trash_entries.erase(remove_if(trash_entries.begin(),
+                                  trash_entries.end(),
+                                  boost::bind(is_not_trash_user, _1)),
+                        trash_entries.end());
+  }
+
+  // Short listing: no per-image opens required.
+  if (!long_flag) {
+    if (f) {
+      f->open_array_section("trash");
+    }
+    for (const auto& entry : trash_entries) {
+      if (f) {
+        f->open_object_section("image");
+        f->dump_string("id", entry.id);
+        f->dump_string("name", entry.name);
+        f->close_section();
+      } else {
+        std::cout << entry.id << " " << entry.name << std::endl;
+      }
+    }
+    if (f) {
+      f->close_section();
+      f->flush(std::cout);
+    }
+    return 0;
+  }
+
+  TextTable tbl;
+
+  if (f) {
+    f->open_array_section("trash");
+  } else {
+    tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("DELETED_AT", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("STATUS", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  for (const auto& entry : trash_entries) {
+    librbd::Image im;
+
+    r = rbd.open_by_id_read_only(io_ctx, im, entry.id.c_str(), NULL);
+    // image might disappear between rbd.list() and rbd.open(); ignore
+    // that, warn about other possible errors (EPERM, say, for opening
+    // an old-format image, because you need execute permission for the
+    // class method)
+    if (r < 0) {
+      if (r != -ENOENT) {
+        std::cerr << "rbd: error opening " << entry.id << ": "
+                  << cpp_strerror(r) << std::endl;
+      }
+      // in any event, continue to next image
+      continue;
+    }
+
+    // NOTE(review): no default case — an unrecognized source enum
+    // leaves del_source empty in the output.
+    std::string del_source;
+    switch (entry.source) {
+    case RBD_TRASH_IMAGE_SOURCE_USER:
+      del_source = "USER";
+      break;
+    case RBD_TRASH_IMAGE_SOURCE_MIRRORING:
+      del_source = "MIRRORING";
+      break;
+    case RBD_TRASH_IMAGE_SOURCE_MIGRATION:
+      del_source = "MIGRATION";
+      break;
+    case RBD_TRASH_IMAGE_SOURCE_REMOVING:
+      del_source = "REMOVING";
+      break;
+    case RBD_TRASH_IMAGE_SOURCE_USER_PARENT:
+      del_source = "USER_PARENT";
+      break;
+    }
+
+    // ctime() appends a trailing '\n'; strip it.
+    std::string time_str = ctime(&entry.deletion_time);
+    time_str = time_str.substr(0, time_str.length() - 1);
+
+    // Build "pool[/namespace]/image@snap" for the parent, if any.
+    bool has_parent = false;
+    std::string parent;
+    librbd::linked_image_spec_t parent_image;
+    librbd::snap_spec_t parent_snap;
+    r = im.get_parent(&parent_image, &parent_snap);
+    if (r == -ENOENT) {
+      r = 0;
+    } else if (r < 0) {
+      return r;
+    } else {
+      parent = parent_image.pool_name + "/";
+      if (!parent_image.pool_namespace.empty()) {
+        parent += parent_image.pool_namespace + "/";
+      }
+      parent += parent_image.image_name + "@" + parent_snap.name;
+      has_parent = true;
+    }
+
+    if (f) {
+      f->open_object_section("image");
+      f->dump_string("id", entry.id);
+      f->dump_string("name", entry.name);
+      f->dump_string("source", del_source);
+      f->dump_string("deleted_at", time_str);
+      f->dump_string("status",
+                     delete_status(entry.deferment_end_time));
+      if (has_parent) {
+        f->open_object_section("parent");
+        f->dump_string("pool", parent_image.pool_name);
+        f->dump_string("pool_namespace", parent_image.pool_namespace);
+        f->dump_string("image", parent_image.image_name);
+        f->dump_string("snapshot", parent_snap.name);
+        f->close_section();
+      }
+      f->close_section();
+    } else {
+      tbl << entry.id
+          << entry.name
+          << del_source
+          << time_str
+          << delete_status(entry.deferment_end_time);
+      if (has_parent)
+        tbl << parent;
+      tbl << TextTable::endrow;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  } else if (!trash_entries.empty()) {
+    std::cout << tbl;
+  }
+
+  return r < 0 ? r : 0;
+}
+
+// Declare CLI arguments for "rbd trash list": pool/namespace options,
+// --all (include non-user sources), --long, and format options.
+void get_list_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+  options->add_options()
+    ("all,a", po::bool_switch(), "list images from all sources");
+  options->add_options()
+    ("long,l", po::bool_switch(), "long listing format");
+  at::add_format_options(options);
+}
+
+// Entry point for "rbd trash list": resolve pool/namespace and
+// formatter, connect, and delegate to do_list() with the --long/--all
+// flags.  The image cache is disabled for the listing.
+int execute_list(const po::variables_map &vm,
+                 const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+                                              &namespace_name, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  utils::disable_cache();
+
+  librbd::RBD rbd;
+  r = do_list(rbd, io_ctx, vm["long"].as<bool>(), vm["all"].as<bool>(),
+              formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: trash list: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Declare CLI arguments for "rbd trash purge": pool/namespace options,
+// --no-progress, plus --expired-before (date cutoff) and --threshold
+// (target pool-usage ratio).
+void get_purge_arguments(po::options_description *positional,
+                         po::options_description *options) {
+  at::add_pool_options(positional, options, true);
+  at::add_no_progress_option(options);
+
+  options->add_options()
+      (EXPIRED_BEFORE.c_str(), po::value<std::string>()->value_name("date"),
+       "purges images that expired before the given date");
+  options->add_options()
+      (THRESHOLD.c_str(), po::value<float>(),
+       "purges images until the current pool data usage is reduced to X%, "
+       "value range: 0.0-1.0");
+}
+
+// Entry point for "rbd trash purge": bulk-remove expired trash
+// entries.  --threshold takes precedence over --expired-before (the
+// date is only parsed when no threshold was given); with neither,
+// expire_ts == 0 purges everything already past its deferment.
+int execute_purge(const po::variables_map &vm,
+                  const std::vector<std::string> &ceph_global_init_args) {
+  std::string pool_name;
+  std::string namespace_name;
+  size_t arg_index = 0;
+  int r = utils::get_pool_and_namespace_names(vm, false, &pool_name,
+                                              &namespace_name, &arg_index);
+  if (r < 0) {
+    return r;
+  }
+
+  utils::disable_cache();
+
+  librbd::RBD rbd;
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  // Purging frees space, so allow the op even on a (nearly) full pool.
+  io_ctx.set_pool_full_try();
+
+  float threshold = -1;
+  time_t expire_ts = 0;
+
+  if (vm.find(THRESHOLD) != vm.end()) {
+    threshold = vm[THRESHOLD].as<float>();
+  } else {
+    if (vm.find(EXPIRED_BEFORE) != vm.end()) {
+      utime_t new_time;
+      // invoke_date shells out to /bin/date to parse free-form dates.
+      r = utime_t::invoke_date(vm[EXPIRED_BEFORE].as<std::string>(), &new_time);
+      if (r < 0) {
+        std::cerr << "rbd: error calling /bin/date: " << cpp_strerror(r)
+                  << std::endl;
+        return r;
+      }
+      expire_ts = new_time.sec();
+    }
+  }
+
+  utils::ProgressContext pc("Removing images", vm[at::NO_PROGRESS].as<bool>());
+  r = rbd.trash_purge_with_progress(io_ctx, expire_ts, threshold, pc);
+  if (r < 0) {
+    pc.fail();
+    // Partial-failure codes get a consolidated remediation hint.
+    if (r == -ENOTEMPTY || r == -EBUSY || r == -EMLINK || r == -EUCLEAN) {
+      std::cerr << "rbd: some expired images could not be removed"
+                << std::endl
+                << "Ensure that they are closed/unmapped, do not have "
+                << "snapshots (including trashed snapshots with linked "
+                << "clones), are not in a group and were moved to the "
+                << "trash successfully."
+                << std::endl;
+    }
+    return r;
+  }
+
+  pc.finish();
+  return 0;
+}
+
+// Declare CLI arguments for "rbd trash restore": a positional image
+// id, pool/namespace/id options, and an optional --image to restore
+// under a new name.
+void get_restore_arguments(po::options_description *positional,
+                           po::options_description *options) {
+  positional->add_options()
+    (at::IMAGE_ID.c_str(), "image id\n(example: [<pool-name>/]<image-id>)");
+  at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_id_option(options);
+  at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, "");
+}
+
+// Entry point for "rbd trash restore": move a trashed image (by id)
+// back into the pool, optionally under a new name (--image).  An
+// empty name string lets librbd restore the original name.
+int execute_restore(const po::variables_map &vm,
+                    const std::vector<std::string> &ceph_global_init_args) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string namespace_name;
+  std::string image_id;
+  int r = utils::get_pool_image_id(vm, &arg_index, &pool_name, &namespace_name,
+                                   &image_id);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string name;
+  if (vm.find(at::IMAGE_NAME) != vm.end()) {
+    name = vm[at::IMAGE_NAME].as<std::string>();
+  }
+
+  librbd::RBD rbd;
+  r = rbd.trash_restore(io_ctx, image_id.c_str(), name.c_str());
+  if (r < 0) {
+    // Map the two expected failure modes to actionable messages.
+    if (r == -ENOENT) {
+      std::cerr << "rbd: error: image does not exist in trash"
+                << std::endl;
+    } else if (r == -EEXIST) {
+      std::cerr << "rbd: error: an image with the same name already exists, "
+                << "try again with a different name"
+                << std::endl;
+    } else {
+      std::cerr << "rbd: restore error: " << cpp_strerror(r) << std::endl;
+    }
+    return r;
+  }
+
+  return r;
+}
+
+// Registration of every "rbd trash ..." subcommand with the CLI shell.
+Shell::Action action_move(
+  {"trash", "move"}, {"trash", "mv"}, "Move an image to the trash.", "",
+  &get_move_arguments, &execute_move);
+
+Shell::Action action_remove(
+  {"trash", "remove"}, {"trash", "rm"}, "Remove an image from trash.", "",
+  &get_remove_arguments, &execute_remove);
+
+Shell::Action action_purge(
+  {"trash", "purge"}, {}, "Remove all expired images from trash.", "",
+  &get_purge_arguments, &execute_purge);
+
+Shell::Action action_list(
+  {"trash", "list"}, {"trash", "ls"}, "List trash images.", "",
+  &get_list_arguments, &execute_list);
+
+Shell::Action action_restore(
+  {"trash", "restore"}, {}, "Restore an image from trash.", "",
+  &get_restore_arguments, &execute_restore);
+
+} // namespace trash
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/TrashPurgeSchedule.cc b/src/tools/rbd/action/TrashPurgeSchedule.cc
new file mode 100644
index 000000000..5c133c295
--- /dev/null
+++ b/src/tools/rbd/action/TrashPurgeSchedule.cc
@@ -0,0 +1,355 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Schedule.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#include "include/stringify.h"
+
+#include <iostream>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <boost/program_options.hpp>
+
+#include "json_spirit/json_spirit.h"
+
+namespace rbd {
+namespace action {
+namespace trash_purge_schedule {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
// In-memory representation of the JSON status returned by the
// "rbd trash purge schedule status" mgr command: one entry per
// (pool, namespace) pair with its next scheduled purge time.
class ScheduleStatus {
public:
  ScheduleStatus() {
  }

  // Parse the mgr command output.  Expects JSON of the shape
  // {"scheduled": [{"pool_name": str, "namespace": str,
  //                 "schedule_time": str}, ...]}.
  // Returns 0 on success, -EBADMSG on malformed or unexpected JSON.
  int parse(const std::string &status) {
    json_spirit::mValue json_root;
    if(!json_spirit::read(status, json_root)) {
      std::cerr << "rbd: invalid schedule status JSON received" << std::endl;
      return -EBADMSG;
    }

    try {
      auto &s = json_root.get_obj();

      if (s["scheduled"].type() != json_spirit::array_type) {
        std::cerr << "rbd: unexpected schedule JSON received: "
                  << "scheduled is not array" << std::endl;
        return -EBADMSG;
      }

      for (auto &item_val : s["scheduled"].get_array()) {
        if (item_val.type() != json_spirit::obj_type) {
          std::cerr << "rbd: unexpected schedule status JSON received: "
                    << "schedule item is not object" << std::endl;
          return -EBADMSG;
        }

        auto &item = item_val.get_obj();

        if (item["pool_name"].type() != json_spirit::str_type) {
          std::cerr << "rbd: unexpected schedule JSON received: "
                    << "pool_name is not string" << std::endl;
          return -EBADMSG;
        }
        auto pool_name = item["pool_name"].get_str();

        if (item["namespace"].type() != json_spirit::str_type) {
          std::cerr << "rbd: unexpected schedule JSON received: "
                    << "namespace is not string" << std::endl;
          return -EBADMSG;
        }
        auto namespace_name = item["namespace"].get_str();

        if (item["schedule_time"].type() != json_spirit::str_type) {
          std::cerr << "rbd: unexpected schedule JSON received: "
                    << "schedule_time is not string" << std::endl;
          return -EBADMSG;
        }
        auto schedule_time = item["schedule_time"].get_str();

        scheduled.insert({pool_name, namespace_name, schedule_time});
      }

    } catch (std::runtime_error &) {
      // json_spirit throws on type mismatches (e.g. root not an object).
      std::cerr << "rbd: invalid schedule JSON received" << std::endl;
      return -EBADMSG;
    }

    return 0;
  }

  // Emit the parsed entries through a ceph Formatter (json/xml output).
  void dump(Formatter *f) {
    f->open_array_section("scheduled");
    for (auto &item : scheduled) {
      f->open_object_section("item");
      f->dump_string("pool", item.pool_name);
      f->dump_string("namespace", item.namespace_name);
      f->dump_string("schedule_time", item.schedule_time);
      f->close_section(); // item
    }
    f->close_section(); // scheduled
  }

  friend std::ostream& operator<<(std::ostream& os, ScheduleStatus &d);

private:

  struct Item {
    std::string pool_name;
    std::string namespace_name;
    std::string schedule_time;

    Item(const std::string &pool_name, const std::string &namespace_name,
         const std::string &schedule_time)
      : pool_name(pool_name), namespace_name(namespace_name),
        schedule_time(schedule_time) {
    }

    // Order (and set uniqueness) by (pool, namespace) only — at most one
    // status entry per pool/namespace pair is retained.
    bool operator<(const Item &rhs) const {
      if (pool_name != rhs.pool_name) {
        return pool_name < rhs.pool_name;
      }
      return namespace_name < rhs.namespace_name;
    }
  };

  std::set<Item> scheduled;
};
+
// Plain-text (non-formatter) rendering: a three-column table of
// pool / namespace / next schedule time.
std::ostream& operator<<(std::ostream& os, ScheduleStatus &s) {
  TextTable tbl;
  tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
  tbl.define_column("NAMESPACE", TextTable::LEFT, TextTable::LEFT);
  tbl.define_column("SCHEDULE TIME", TextTable::LEFT, TextTable::LEFT);

  for (auto &item : s.scheduled) {
    tbl << item.pool_name << item.namespace_name << item.schedule_time
        << TextTable::endrow;
  }

  os << tbl;
  return os;
}
+
+} // anonymous namespace
+
// Argument setup for "trash purge schedule add": optional level spec
// (pool/namespace) option plus positional schedule arguments
// (the 'true' presumably marks the interval as mandatory — see
// add_schedule_options; TODO confirm).
void get_arguments_add(po::options_description *positional,
                       po::options_description *options) {
  add_level_spec_options(options, false);
  add_schedule_options(positional, true);
}
+
+int execute_add(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::map<std::string, std::string> args;
+
+ int r = get_level_spec_args(vm, &args);
+ if (r < 0) {
+ return r;
+ }
+ r = get_schedule_args(vm, true, &args);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ normalize_level_spec_args(&args);
+ r = utils::mgr_command(rados, "rbd trash purge schedule add", args,
+ &std::cout, &std::cerr);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
// Argument setup for "trash purge schedule remove": optional level spec
// plus optional positional schedule arguments identifying which
// schedule to drop.
void get_arguments_remove(po::options_description *positional,
                          po::options_description *options) {
  add_level_spec_options(options, false);
  add_schedule_options(positional, false);
}

// Remove a trash purge schedule via the "rbd trash purge schedule
// remove" mgr command.  Returns 0 on success or a negative error code.
int execute_remove(const po::variables_map &vm,
                   const std::vector<std::string> &ceph_global_init_args) {
  std::map<std::string, std::string> args;

  int r = get_level_spec_args(vm, &args);
  if (r < 0) {
    return r;
  }
  r = get_schedule_args(vm, false, &args);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  r = utils::init_rados(&rados);
  if (r < 0) {
    return r;
  }

  // Normalize the level spec before passing it to the mgr module.
  normalize_level_spec_args(&args);
  r = utils::mgr_command(rados, "rbd trash purge schedule remove", args,
                         &std::cout, &std::cerr);
  if (r < 0) {
    return r;
  }

  return 0;
}
+
// Argument setup for "trash purge schedule list": optional level spec,
// -R/--recursive to list every schedule, and output format options.
void get_arguments_list(po::options_description *positional,
                        po::options_description *options) {
  add_level_spec_options(options, false);
  options->add_options()
    ("recursive,R", po::bool_switch(), "list all schedules");
  at::add_format_options(options);
}

// List trash purge schedules via the "rbd trash purge schedule list"
// mgr command.  With --recursive all parsed schedules are printed;
// otherwise only the schedule exactly matching the given level spec is
// printed (-ENOENT if there is none).
int execute_list(const po::variables_map &vm,
                 const std::vector<std::string> &ceph_global_init_args) {
  std::map<std::string, std::string> args;

  int r = get_level_spec_args(vm, &args);
  if (r < 0) {
    return r;
  }

  at::Format::Formatter formatter;
  r = utils::get_formatter(vm, &formatter);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  r = utils::init_rados(&rados);
  if (r < 0) {
    return r;
  }

  normalize_level_spec_args(&args);
  std::stringstream out;
  r = utils::mgr_command(rados, "rbd trash purge schedule list", args, &out,
                         &std::cerr);
  if (r < 0) {
    return r;
  }

  ScheduleList schedule_list(false);
  r = schedule_list.parse(out.str());
  if (r < 0) {
    return r;
  }

  if (vm["recursive"].as<bool>()) {
    // Dump the complete list, formatted or plain.
    if (formatter.get()) {
      schedule_list.dump(formatter.get());
      formatter->flush(std::cout);
    } else {
      std::cout << schedule_list;
    }
  } else {
    // Look up only the schedule for the requested level spec.
    auto schedule = schedule_list.find(args["level_spec"]);
    if (schedule == nullptr) {
      return -ENOENT;
    }

    if (formatter.get()) {
      schedule->dump(formatter.get());
      formatter->flush(std::cout);
    } else {
      std::cout << *schedule << std::endl;
    }
  }

  return 0;
}
+
// Argument setup for "trash purge schedule status": optional level spec
// plus output format options.
void get_arguments_status(po::options_description *positional,
                          po::options_description *options) {
  add_level_spec_options(options, false);
  at::add_format_options(options);
}
+
+int execute_status(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::map<std::string, std::string> args;
+
+ int r = get_level_spec_args(vm, &args);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ normalize_level_spec_args(&args);
+ std::stringstream out;
+ r = utils::mgr_command(rados, "rbd trash purge schedule status", args, &out,
+ &std::cerr);
+ ScheduleStatus schedule_status;
+ r = schedule_status.parse(out.str());
+ if (r < 0) {
+ return r;
+ }
+
+ if (formatter.get()) {
+ schedule_status.dump(formatter.get());
+ formatter->flush(std::cout);
+ } else {
+ std::cout << schedule_status;
+ }
+
+ return 0;
+}
+
// Register boolean switches so the shell doesn't expect values for them.
Shell::SwitchArguments switched_arguments({"recursive", "R"});

// Register the "rbd trash purge schedule ..." subcommands with the shell.
Shell::Action add_action(
  {"trash", "purge", "schedule", "add"}, {}, "Add trash purge schedule.", "",
  &get_arguments_add, &execute_add);
Shell::Action remove_action(
  {"trash", "purge", "schedule", "remove"},
  {"trash", "purge", "schedule", "rm"}, "Remove trash purge schedule.",
  "", &get_arguments_remove, &execute_remove);
Shell::Action list_action(
  {"trash", "purge", "schedule", "list"},
  {"trash", "purge", "schedule", "ls"}, "List trash purge schedule.",
  "", &get_arguments_list, &execute_list);
Shell::Action status_action(
  {"trash", "purge", "schedule", "status"}, {},
  "Show trash purge schedule status.", "", &get_arguments_status,
  &execute_status);
+
+} // namespace trash_purge_schedule
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Watch.cc b/src/tools/rbd/action/Watch.cc
new file mode 100644
index 000000000..98697bc28
--- /dev/null
+++ b/src/tools/rbd/action/Watch.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace watch {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
// librados watch handler used by "rbd watch": prints every notification
// received on an image's header object and acks it.
class RbdWatchCtx : public librados::WatchCtx2 {
public:
  RbdWatchCtx(librados::IoCtx& io_ctx, const char *image_name,
              const std::string &header_oid)
    : m_io_ctx(io_ctx), m_image_name(image_name), m_header_oid(header_oid)
  {
  }

  ~RbdWatchCtx() override {}

  // Decode and print the notification, then ack it so the notifier is
  // not left waiting for the notify timeout.
  void handle_notify(uint64_t notify_id,
                     uint64_t cookie,
                     uint64_t notifier_id,
                     bufferlist& bl) override {
    using namespace librbd::watch_notify;
    NotifyMessage notify_message;
    if (bl.length() == 0) {
      // An empty payload is treated as a plain header-update notification.
      notify_message = NotifyMessage(new HeaderUpdatePayload());
    } else {
      try {
        auto iter = bl.cbegin();
        notify_message.decode(iter);
      } catch (const buffer::error &err) {
        // Best effort: report the decode failure but still print and ack.
        std::cerr << "rbd: failed to decode image notification" << std::endl;
      }
    }

    std::cout << m_image_name << " received notification: notify_id="
              << notify_id << ", cookie=" << cookie << ", notifier_id="
              << notifier_id << ", bl.length=" << bl.length() << ", notify_op="
              << notify_message.get_notify_op() << std::endl;
    bufferlist reply;
    m_io_ctx.notify_ack(m_header_oid, notify_id, cookie, reply);
  }

  void handle_error(uint64_t cookie, int err) override {
    std::cerr << m_image_name << " received error: cookie=" << cookie << ", "
              << "err=" << cpp_strerror(err) << std::endl;
  }
private:
  librados::IoCtx m_io_ctx;
  const char *m_image_name;  // not owned; must outlive the watch
  std::string m_header_oid;
};
+
// Watch the image's header object and print notifications until the
// user presses enter.  Derives the header object name from the image
// format (v1: image name + RBD_SUFFIX, v2: RBD_HEADER_PREFIX + image id).
// Returns 0 on success or a negative error code.
static int do_watch(librados::IoCtx& pp, librbd::Image &image,
                    const char *imgname)
{
  uint8_t old_format;
  int r = image.old_format(&old_format);
  if (r < 0) {
    std::cerr << "failed to query format" << std::endl;
    return r;
  }

  std::string header_oid;
  if (old_format != 0) {
    header_oid = std::string(imgname) + RBD_SUFFIX;
  } else {
    std::string id;
    r = image.get_id(&id);
    if (r < 0) {
      return r;
    }

    header_oid = RBD_HEADER_PREFIX + id;
  }

  uint64_t cookie;
  RbdWatchCtx ctx(pp, imgname, header_oid);
  r = pp.watch2(header_oid, &cookie, &ctx);
  if (r < 0) {
    std::cerr << "rbd: watch failed" << std::endl;
    return r;
  }

  // Block until the user terminates the watch interactively.
  std::cout << "press enter to exit..." << std::endl;
  getchar();

  r = pp.unwatch2(cookie);
  if (r < 0) {
    std::cerr << "rbd: unwatch failed" << std::endl;
    return r;
  }
  return 0;
}
+
// Argument setup for "rbd watch": a single positional image spec.
void get_arguments(po::options_description *positional,
                   po::options_description *options) {
  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
}

// Entry point for "rbd watch": resolve the image spec, open the image
// read-only, and run the interactive watch loop.
int execute(const po::variables_map &vm,
            const std::vector<std::string> &ceph_global_init_args) {
  size_t arg_index = 0;
  std::string pool_name;
  std::string namespace_name;
  std::string image_name;
  std::string snap_name;
  int r = utils::get_pool_image_snapshot_names(
    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
    &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
    utils::SPEC_VALIDATION_NONE);
  if (r < 0) {
    return r;
  }

  librados::Rados rados;
  librados::IoCtx io_ctx;
  librbd::Image image;
  // 'true' opens the image read-only; watching doesn't modify it.
  r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
                                 true, &rados, &io_ctx, &image);
  if (r < 0) {
    return r;
  }

  r = do_watch(io_ctx, image, image_name.c_str());
  if (r < 0) {
    std::cerr << "rbd: watch failed: " << cpp_strerror(r) << std::endl;
    return r;
  }
  return 0;
}
+
+Shell::Action action(
+ {"watch"}, {}, "Watch events on image.", "", &get_arguments, &execute);
+
+} // namespace watch
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Wnbd.cc b/src/tools/rbd/action/Wnbd.cc
new file mode 100644
index 000000000..85d2c7057
--- /dev/null
+++ b/src/tools/rbd/action/Wnbd.cc
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/SubProcess.h"
+#include <iostream>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace wnbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
#if defined(_WIN32)
// Locate the rbd-wnbd helper (expected next to the rbd executable,
// named "<rbd>-wnbd") and run it with the given subcommand arguments
// plus the ceph global arguments.  Returns the helper's exit code,
// -EINVAL if it could not be spawned, or -EOVERFLOW if the helper path
// cannot be constructed.
static int call_wnbd_cmd(const po::variables_map &vm,
                    const std::vector<std::string> &args,
                    const std::vector<std::string> &ceph_global_init_args) {
  char exe_path[PATH_MAX];
  ssize_t exe_path_bytes = get_self_exe_path(exe_path, PATH_MAX);

  // Drop the .exe suffix as we're going to add the "-wnbd" suffix.
  // BUGFIX: only strip the last 4 bytes when they actually are ".exe";
  // previously any executable path longer than 4 bytes was truncated.
  if (exe_path_bytes > 4 &&
      boost::iends_with(exe_path, ".exe")) {
    exe_path[exe_path_bytes - 4] = '\0';
    exe_path_bytes -= 4;
  }

  if (exe_path_bytes < 0) {
    // Couldn't resolve our own path; fall back to a PATH lookup.
    strcpy(exe_path, "rbd-wnbd");
  } else {
    if (snprintf(exe_path + exe_path_bytes,
                 sizeof(exe_path) - exe_path_bytes,
                 "-wnbd") < 0) {
      return -EOVERFLOW;
    }
  }

  SubProcess process(exe_path, SubProcess::KEEP, SubProcess::KEEP, SubProcess::KEEP);

  // Global ceph args (e.g. --conf, --id) go first, then the subcommand.
  for (auto &arg : ceph_global_init_args) {
    process.add_cmd_arg(arg.c_str());
  }

  for (auto &arg : args) {
    process.add_cmd_arg(arg.c_str());
  }

  if (process.spawn()) {
    std::cerr << "rbd: failed to run rbd-wnbd: " << process.err() << std::endl;
    return -EINVAL;
  }
  int exit_code = process.join();
  if (exit_code) {
    std::cerr << "rbd: rbd-wnbd failed with error: " << process.err() << std::endl;
    return exit_code;
  }

  return 0;
}
#endif
+
// "rbd wnbd list": forward to the rbd-wnbd helper (Windows only),
// passing through format options.
int execute_list(const po::variables_map &vm,
                 const std::vector<std::string> &ceph_global_init_args) {
#if !defined(_WIN32)
  std::cerr << "rbd: wnbd is only supported on Windows" << std::endl;
  return -EOPNOTSUPP;
#else
  std::vector<std::string> args;

  args.push_back("list");

  if (vm.count("format")) {
    args.push_back("--format");
    args.push_back(vm["format"].as<at::Format>().value);
  }
  if (vm["pretty-format"].as<bool>()) {
    args.push_back("--pretty-format");
  }

  return call_wnbd_cmd(vm, args, ceph_global_init_args);
#endif
}

// "rbd wnbd map": forward the image/snap spec and map options to the
// rbd-wnbd helper (Windows only).
int execute_map(const po::variables_map &vm,
                const std::vector<std::string> &ceph_global_init_args) {
#if !defined(_WIN32)
  std::cerr << "rbd: wnbd is only supported on Windows" << std::endl;
  return -EOPNOTSUPP;
#else
  std::vector<std::string> args;

  args.push_back("map");
  std::string img;
  int r = utils::get_image_or_snap_spec(vm, &img);
  if (r < 0) {
    return r;
  }
  args.push_back(img);

  if (vm["read-only"].as<bool>()) {
    args.push_back("--read-only");
  }

  if (vm["exclusive"].as<bool>()) {
    args.push_back("--exclusive");
  }

  if (vm.count("options")) {
    // -o key=value options are re-expanded into helper arguments.
    utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
                                  &args);
  }

  return call_wnbd_cmd(vm, args, ceph_global_init_args);
#endif
}

// "rbd wnbd unmap": forward the image/snap spec to the rbd-wnbd helper
// (Windows only).
int execute_unmap(const po::variables_map &vm,
                  const std::vector<std::string> &ceph_global_init_args) {
#if !defined(_WIN32)
  std::cerr << "rbd: wnbd is only supported on Windows" << std::endl;
  return -EOPNOTSUPP;
#else
  std::string image_name;

  int r = utils::get_image_or_snap_spec(vm, &image_name);
  if (r < 0) {
    return r;
  }

  std::vector<std::string> args;

  args.push_back("unmap");
  args.push_back(image_name);

  if (vm.count("options")) {
    utils::append_options_as_args(vm["options"].as<std::vector<std::string>>(),
                                  &args);
  }

  return call_wnbd_cmd(vm, args, ceph_global_init_args);
#endif
}
+
+int execute_attach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(_WIN32)
+ std::cerr << "rbd: wnbd is only supported on Windows" << std::endl;
+#else
+ std::cerr << "rbd: wnbd attach command not supported" << std::endl;
+#endif
+ return -EOPNOTSUPP;
+}
+
+int execute_detach(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(_WIN32)
+ std::cerr << "rbd: wnbd is only supported on Windows" << std::endl;
+#else
+ std::cerr << "rbd: wnbd detach command not supported" << std::endl;
+#endif
+ return -EOPNOTSUPP;
+}
+
+} // namespace wnbd
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/rbd.cc b/src/tools/rbd/rbd.cc
new file mode 100644
index 000000000..a8c59d575
--- /dev/null
+++ b/src/tools/rbd/rbd.cc
@@ -0,0 +1,10 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Shell.h"
+
// Entry point for the "rbd" CLI; all subcommands are dispatched through
// the rbd::Shell action registry.
int main(int argc, const char **argv)
{
  rbd::Shell shell;
  return shell.execute(argc, argv);
}
diff --git a/src/tools/rbd_ggate/CMakeLists.txt b/src/tools/rbd_ggate/CMakeLists.txt
new file mode 100644
index 000000000..5c5572c48
--- /dev/null
+++ b/src/tools/rbd_ggate/CMakeLists.txt
@@ -0,0 +1,9 @@
# Build the rbd-ggate daemon: exposes RBD images as block devices via
# the FreeBSD GEOM Gate (ggate) kernel facility.
add_executable(rbd-ggate
  Driver.cc
  Server.cc
  Watcher.cc
  debug.cc
  ggate_drv.c
  main.cc)
# 'geom' is the FreeBSD libgeom used by the ggate_drv C shim.
target_link_libraries(rbd-ggate geom librbd librados global)
install(TARGETS rbd-ggate DESTINATION bin)
diff --git a/src/tools/rbd_ggate/Driver.cc b/src/tools/rbd_ggate/Driver.cc
new file mode 100644
index 000000000..80acfe00c
--- /dev/null
+++ b/src/tools/rbd_ggate/Driver.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stdlib.h>
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "Driver.h"
+#include "Request.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate::Driver: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace ggate {
+
+int Driver::load() {
+
+ return ggate_drv_load();
+}
+
+int Driver::kill(const std::string &devname) {
+
+ int r = ggate_drv_kill(devname.c_str());
+
+ return r;
+}
+
+int Driver::list(std::map<std::string, DevInfo> *devices) {
+ size_t size = 1024;
+ ggate_drv_info *devs = nullptr;
+ int r;
+
+ while (size <= 1024 * 1024) {
+ devs = static_cast<ggate_drv_info *>(
+ realloc(static_cast<void *>(devs), size * sizeof(*devs)));
+ r = ggate_drv_list(devs, &size);
+ if (r != -ERANGE) {
+ break;
+ }
+ }
+ if (r < 0) {
+ goto free;
+ }
+
+ devices->clear();
+ for (size_t i = 0; i < size; i++) {
+ auto &dev = devs[i];
+ (*devices)[dev.id] = {dev.name, dev.info};
+ }
+
+free:
+ free(devs);
+
+ return r;
+}
+
// Record the requested device parameters; the kernel device itself is
// created later by init().
Driver::Driver(const std::string &devname, size_t sectorsize, size_t mediasize,
               bool readonly, const std::string &info)
  : m_devname(devname), m_sectorsize(sectorsize), m_mediasize(mediasize),
    m_readonly(readonly), m_info(info) {
}

// Create the ggate kernel device.  If no device name was supplied the
// driver allocates one, which is stored back into m_devname.
// Returns 0 on success, -ENAMETOOLONG if the requested name does not
// fit in PATH_MAX, or the ggate_drv_create() error.
int Driver::init() {
  dout(20) << dendl;

  char name[PATH_MAX];
  size_t namelen;

  if (m_devname.empty()) {
    // Empty name: let the driver pick one; namelen here is presumably
    // the buffer capacity for the returned name — TODO confirm.
    name[0] = '\0';
    namelen = PATH_MAX;
  } else {
    namelen = m_devname.size();
    if (namelen >= PATH_MAX) {
      return -ENAMETOOLONG;
    }
    // namelen + 1 also copies the terminating NUL (fits: namelen < PATH_MAX).
    strncpy(name, m_devname.c_str(), namelen + 1);
  }

  int r = ggate_drv_create(name, namelen, m_sectorsize, m_mediasize, m_readonly,
                           m_info.c_str(), &m_drv);
  if (r < 0) {
    return r;
  }

  if (m_devname.empty()) {
    m_devname = name;
  }

  return 0;
}

// Name of the kernel device backing this driver (valid after init()).
std::string Driver::get_devname() const {
  dout(30) << m_devname << dendl;

  return m_devname;
}

// Tear down the kernel device created by init().
void Driver::shut_down() {
  dout(20) << dendl;

  ggate_drv_destroy(m_drv);
}
+
+int Driver::resize(size_t newsize) {
+ dout(20) << "newsize=" << newsize << dendl;
+
+ int r = ggate_drv_resize(m_drv, newsize);
+ if (r < 0) {
+ return r;
+ }
+
+ m_mediasize = newsize;
+ return 0;
+}
+
// Receive the next I/O request from the kernel, wrapping it in a
// heap-allocated Request owned by the caller.  For write requests the
// payload buffer is claimed (no copy) from the C shim into req->bl.
// Returns 0 on success or the ggate_drv_recv() error.
int Driver::recv(Request **req) {
  dout(20) << dendl;

  ggate_drv_req_t req_;

  int r = ggate_drv_recv(m_drv, &req_);
  if (r < 0) {
    return r;
  }

  *req = new Request(req_);

  dout(20) << "req=" << *req << dendl;

  if (ggate_drv_req_cmd(req_) == GGATE_DRV_CMD_WRITE) {
    // Transfer buffer ownership: the bufferptr frees the malloc'd data.
    bufferptr ptr(buffer::claim_malloc(
      ggate_drv_req_length(req_),
      static_cast<char *>(ggate_drv_req_release_buf(req_))));
    (*req)->bl.push_back(ptr);
  }

  return 0;
}

// Complete a request back to the kernel.  For successful reads the
// result data is first copied into the kernel-provided buffer.
// Consumes (deletes) req regardless of the outcome.
int Driver::send(Request *req) {
  dout(20) << "req=" << req << dendl;

  if (ggate_drv_req_cmd(req->req) == GGATE_DRV_CMD_READ &&
      ggate_drv_req_error(req->req) == 0) {
    ceph_assert(req->bl.length() == ggate_drv_req_length(req->req));
    // TODO: avoid copying?
    req->bl.begin().copy(ggate_drv_req_length(req->req),
                         static_cast<char *>(ggate_drv_req_buf(req->req)));
    dout(20) << "copied resulting " << req->bl.length() << " bytes to "
             << ggate_drv_req_buf(req->req) << dendl;
  }

  int r = ggate_drv_send(m_drv, req->req);

  delete req;
  return r;
}
+
+} // namespace ggate
+} // namespace rbd
diff --git a/src/tools/rbd_ggate/Driver.h b/src/tools/rbd_ggate/Driver.h
new file mode 100644
index 000000000..50be72b9c
--- /dev/null
+++ b/src/tools/rbd_ggate/Driver.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_DRIVER_H
+#define CEPH_RBD_GGATE_DRIVER_H
+
+#include <map>
+#include <string>
+
+#include "ggate_drv.h"
+
+namespace rbd {
+namespace ggate {
+
+struct Request;
+
// C++ wrapper around the ggate_drv C shim: manages one GEOM Gate kernel
// device and shuttles I/O requests between the kernel and the caller.
class Driver {
public:
  // device id -> (name, info) as reported by list().
  typedef std::pair<std::string, std::string> DevInfo;
  static int load();
  static int kill(const std::string &devname);
  static int list(std::map<std::string, DevInfo> *devices);

  Driver(const std::string &devname, size_t sectorsize, size_t mediasize,
         bool readonly, const std::string &info);

  int init();
  void shut_down();

  std::string get_devname() const;

  // recv() allocates a Request owned by the caller; send() consumes it.
  int recv(Request **req);
  int send(Request *req);

  int resize(size_t newsize);

private:
  std::string m_devname;   // kernel device name (filled in by init() if empty)
  size_t m_sectorsize;
  size_t m_mediasize;
  bool m_readonly;
  std::string m_info;
  ggate_drv_t m_drv = 0;   // opaque handle from ggate_drv_create()
};
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_DRIVER_H
+
diff --git a/src/tools/rbd_ggate/Request.h b/src/tools/rbd_ggate/Request.h
new file mode 100644
index 000000000..66f219858
--- /dev/null
+++ b/src/tools/rbd_ggate/Request.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_REQUEST_H
+#define CEPH_RBD_GGATE_REQUEST_H
+
+#include "ggate_drv.h"
+
+namespace rbd {
+namespace ggate {
+
// Thin accessor wrapper around one kernel ggate I/O request plus the
// bufferlist carrying its data payload (write input / read output).
struct Request {
  // Mirrors the GGATE_DRV_CMD_* values from the C shim — TODO confirm
  // the numeric mapping stays in sync with ggate_drv.h.
  enum Command {
    Unknown = 0,
    Write = 1,
    Read = 2,
    Flush = 3,
    Discard = 4,
  };

  ggate_drv_req_t req;  // opaque kernel request handle (not owned here)
  bufferlist bl;        // data payload associated with the request

  Request(ggate_drv_req_t req) : req(req) {
  }

  uint64_t get_id() {
    return ggate_drv_req_id(req);
  }

  Command get_cmd() {
    return static_cast<Command>(ggate_drv_req_cmd(req));
  }

  size_t get_length() {
    return ggate_drv_req_length(req);
  }

  uint64_t get_offset() {
    return ggate_drv_req_offset(req);
  }

  // NOTE(review): returns uint64_t although errors are set as int via
  // set_error() — presumably always a small errno value; verify.
  uint64_t get_error() {
    return ggate_drv_req_error(req);
  }

  void set_error(int error) {
    ggate_drv_req_set_error(req, error);
  }
};
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_REQUEST_H
diff --git a/src/tools/rbd_ggate/Server.cc b/src/tools/rbd_ggate/Server.cc
new file mode 100644
index 000000000..2565ba10f
--- /dev/null
+++ b/src/tools/rbd_ggate/Server.cc
@@ -0,0 +1,262 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "Driver.h"
+#include "Server.h"
+#include "Request.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate::Server: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace ggate {
+
// Bind the reader (kernel -> librbd) and writer (librbd -> kernel)
// thread entry points; the threads are started by start().
Server::Server(Driver *drv, librbd::Image& image)
  : m_drv(drv), m_image(image),
    m_reader_thread(this, &Server::reader_entry),
    m_writer_thread(this, &Server::writer_entry) {
}

// Main loop: start both worker threads, block until one of them sets
// m_stopping (driver error or shutdown), then join and drain.
void Server::run() {
  dout(10) << dendl;

  int r = start();
  ceph_assert(r == 0);

  dout(20) << "entering run loop" << dendl;

  {
    std::unique_lock locker{m_lock};
    m_cond.wait(locker, [this] { return m_stopping;});
  }

  dout(20) << "exiting run loop" << dendl;

  stop();
}

// Spawn the reader and writer threads.  Always returns 0.
int Server::start() {
  dout(10) << dendl;

  m_reader_thread.create("rbd_reader");
  m_writer_thread.create("rbd_writer");
  return 0;
}

// Join the worker threads and wait for in-flight AIO to drain.
// Must only be called after m_stopping has been set (asserted below) —
// the workers exit on their own when they observe it.
void Server::stop() {
  dout(10) << dendl;

  {
    std::lock_guard locker{m_lock};
    ceph_assert(m_stopping);
  }

  m_reader_thread.join();
  m_writer_thread.join();

  wait_clean();
}

// Track a newly submitted request on the pending list.
void Server::io_start(IOContext *ctx) {
  dout(20) << ctx << dendl;

  std::lock_guard locker{m_lock};
  m_io_pending.push_back(&ctx->item);
}

// Move a completed request from pending to finished and wake the
// writer thread (and wait_clean()).
void Server::io_finish(IOContext *ctx) {
  dout(20) << ctx << dendl;

  std::lock_guard locker{m_lock};
  ceph_assert(ctx->item.is_on_list());

  ctx->item.remove_myself();
  m_io_finished.push_back(&ctx->item);
  m_cond.notify_all();
}

// Block until a finished request is available or shutdown begins.
// Returns the next finished context, or nullptr when stopping with an
// empty finished queue.
Server::IOContext *Server::wait_io_finish() {
  dout(20) << dendl;

  std::unique_lock locker{m_lock};
  m_cond.wait(locker, [this] { return !m_io_finished.empty() || m_stopping;});

  if (m_io_finished.empty()) {
    return nullptr;
  }

  IOContext *ret = m_io_finished.front();
  m_io_finished.pop_front();

  return ret;
}

// Wait for every in-flight AIO to complete, then free the leftover
// finished contexts.  Requires the reader thread to be joined already
// (no new requests can be submitted).
void Server::wait_clean() {
  dout(20) << dendl;

  ceph_assert(!m_reader_thread.is_started());

  std::unique_lock locker{m_lock};
  m_cond.wait(locker, [this] { return m_io_pending.empty();});

  while (!m_io_finished.empty()) {
    std::unique_ptr<IOContext> free_ctx(m_io_finished.front());
    m_io_finished.pop_front();
  }
}
+
// librbd AIO completion trampoline: forward the result to the owning
// server's handle_aio(), then release the completion object.
void Server::aio_callback(librbd::completion_t cb, void *arg) {
  librbd::RBD::AioCompletion *aio_completion =
    reinterpret_cast<librbd::RBD::AioCompletion*>(cb);

  IOContext *ctx = reinterpret_cast<IOContext *>(arg);
  int r = aio_completion->get_return_value();

  ctx->server->handle_aio(ctx, r);
  aio_completion->release();
}

// Translate the AIO result into the request's error/payload state and
// queue it for the writer thread.
void Server::handle_aio(IOContext *ctx, int r) {
  dout(20) << ctx << ": r=" << r << dendl;

  if (r == -EINVAL) {
    // if shrinking an image, a pagecache writeback might reference
    // extents outside of the range of the new image extents
    dout(5) << "masking IO out-of-bounds error" << dendl;
    ctx->req->bl.clear();
    r = 0;
  }

  if (r < 0) {
    ctx->req->set_error(-r);
  } else if ((ctx->req->get_cmd() == Request::Read) &&
             r != static_cast<int>(ctx->req->get_length())) {
    // Short read (e.g. at end of image): zero-pad up to the requested
    // length so the kernel always gets a full buffer.
    int pad_byte_count = static_cast<int> (ctx->req->get_length()) - r;
    ctx->req->bl.append_zero(pad_byte_count);
    dout(20) << ctx << ": pad byte count: " << pad_byte_count << dendl;
    ctx->req->set_error(0);
  } else {
    ctx->req->set_error(0);
  }
  io_finish(ctx);
}
+
// Reader thread: pull requests from the kernel device and submit the
// matching librbd AIO.  Any receive error or unknown command flags
// shutdown (m_stopping) and terminates the loop.
void Server::reader_entry() {
  dout(20) << dendl;

  while (!m_stopping) {
    std::unique_ptr<IOContext> ctx(new IOContext(this));

    dout(20) << "waiting for ggate request" << dendl;

    int r = m_drv->recv(&ctx->req);
    if (r < 0) {
      // -ECANCELED is the expected result of a clean device teardown.
      if (r != -ECANCELED) {
        derr << "recv: " << cpp_strerror(r) << dendl;
      }
      std::lock_guard locker{m_lock};
      m_stopping = true;
      m_cond.notify_all();
      return;
    }

    // Ownership passes to the pending list; freed after send().
    IOContext *pctx = ctx.release();

    dout(20) << pctx << ": start: " << *pctx << dendl;

    io_start(pctx);
    librbd::RBD::AioCompletion *c =
      new librbd::RBD::AioCompletion(pctx, aio_callback);
    switch (pctx->req->get_cmd())
    {
    case rbd::ggate::Request::Write:
      m_image.aio_write(pctx->req->get_offset(), pctx->req->get_length(),
                        pctx->req->bl, c);
      break;
    case rbd::ggate::Request::Read:
      m_image.aio_read(pctx->req->get_offset(), pctx->req->get_length(),
                       pctx->req->bl, c);
      break;
    case rbd::ggate::Request::Flush:
      m_image.aio_flush(c);
      break;
    case rbd::ggate::Request::Discard:
      m_image.aio_discard(pctx->req->get_offset(), pctx->req->get_length(), c);
      break;
    default:
      derr << pctx << ": invalid request command: " << pctx->req->get_cmd()
           << dendl;
      c->release();
      std::lock_guard locker{m_lock};
      m_stopping = true;
      m_cond.notify_all();
      return;
    }
  }
  dout(20) << "terminated" << dendl;
}

// Writer thread: take finished requests and send the completions back
// to the kernel.  A send error flags shutdown; a nullptr from
// wait_io_finish() means shutdown is already in progress.
void Server::writer_entry() {
  dout(20) << dendl;

  while (!m_stopping) {
    dout(20) << "waiting for io request" << dendl;

    std::unique_ptr<IOContext> ctx(wait_io_finish());
    if (!ctx) {
      dout(20) << "no io requests, terminating" << dendl;
      return;
    }

    dout(20) << ctx.get() << ": got: " << *ctx << dendl;

    int r = m_drv->send(ctx->req);
    if (r < 0) {
      derr << ctx.get() << ": send: " << cpp_strerror(r) << dendl;
      std::lock_guard locker{m_lock};
      m_stopping = true;
      m_cond.notify_all();
      return;
    }
    dout(20) << ctx.get() << " finish" << dendl;
  }
  dout(20) << "terminated" << dendl;
}
+
+std::ostream &operator<<(std::ostream &os, const Server::IOContext &ctx) {
+
+ os << "[" << ctx.req->get_id();
+
+ switch (ctx.req->get_cmd())
+ {
+ case rbd::ggate::Request::Write:
+ os << " Write ";
+ break;
+ case rbd::ggate::Request::Read:
+ os << " Read ";
+ break;
+ case rbd::ggate::Request::Flush:
+ os << " Flush ";
+ break;
+ case rbd::ggate::Request::Discard:
+ os << " Discard ";
+ break;
+ default:
+ os << " Unknow(" << ctx.req->get_cmd() << ") ";
+ break;
+ }
+
+ os << ctx.req->get_offset() << "~" << ctx.req->get_length() << " "
+ << ctx.req->get_error() << "]";
+
+ return os;
+}
+
+} // namespace ggate
+} // namespace rbd
+
diff --git a/src/tools/rbd_ggate/Server.h b/src/tools/rbd_ggate/Server.h
new file mode 100644
index 000000000..bb31b89f7
--- /dev/null
+++ b/src/tools/rbd_ggate/Server.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_SERVER_H
+#define CEPH_RBD_GGATE_SERVER_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/xlist.h"
+#include "common/ceph_mutex.h"
+#include "common/Thread.h"
+
+namespace rbd {
+namespace ggate {
+
+class Driver;
+struct Request;
+
// Pumps I/O between a ggate Driver and a librbd::Image using two
// threads: a reader (kernel request -> librbd AIO) and a writer
// (completed AIO -> kernel reply).  run() blocks until shutdown.
class Server {
public:
  Server(Driver *drv, librbd::Image& image);

  void run();

private:
  // One in-flight kernel request; lives on m_io_pending while the AIO
  // is outstanding, then m_io_finished until the writer sends it back.
  struct IOContext {
    xlist<IOContext*>::item item;
    Server *server;
    Request *req = nullptr;

    IOContext(Server *server) : item(this), server(server) {
    }
  };

  // Adapts a Server member function into a Thread entry point.
  class ThreadHelper : public Thread {
  public:
    typedef void (Server::*entry_func)();

    ThreadHelper(Server *server, entry_func func)
      : server(server), func(func) {
    }

  protected:
    virtual void* entry() {
      (server->*func)();
      return nullptr;
    }

  private:
    Server *server;
    entry_func func;
  };

  friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx);

  Driver *m_drv;            // not owned
  librbd::Image &m_image;

  // m_lock guards m_stopping and both xlists; m_cond signals finished
  // I/O and shutdown.
  mutable ceph::mutex m_lock =
    ceph::make_mutex("rbd::ggate::Server::m_lock");
  ceph::condition_variable m_cond;
  bool m_stopping = false;
  ThreadHelper m_reader_thread, m_writer_thread;
  xlist<IOContext*> m_io_pending;
  xlist<IOContext*> m_io_finished;

  static void aio_callback(librbd::completion_t cb, void *arg);

  int start();
  void stop();

  void reader_entry();
  void writer_entry();

  void io_start(IOContext *ctx);
  void io_finish(IOContext *ctx);

  IOContext *wait_io_finish();
  void wait_clean();

  void handle_aio(IOContext *ctx, int r);
};

std::ostream &operator<<(std::ostream &os, const Server::IOContext &ctx);
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_SERVER_H
diff --git a/src/tools/rbd_ggate/Watcher.cc b/src/tools/rbd_ggate/Watcher.cc
new file mode 100644
index 000000000..57b3f960e
--- /dev/null
+++ b/src/tools/rbd_ggate/Watcher.cc
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "Driver.h"
+#include "Watcher.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate::Watcher: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace ggate {
+
+Watcher::Watcher(Driver *drv, librados::IoCtx &ioctx, librbd::Image &image,
+                 size_t size)
+  : m_drv(drv), m_ioctx(ioctx), m_image(image), m_size(size) {
+}
+
+// librbd update-watch callback: re-stats the image and, if its size
+// changed, resizes the ggate device and invalidates the rbd cache.
+void Watcher::handle_notify() {
+  dout(20) << dendl;
+
+  librbd::image_info_t info;
+
+  // A failed stat is silently ignored; the next notification retries.
+  if (m_image.stat(info, sizeof(info)) == 0) {
+    size_t new_size = info.size;
+
+    if (new_size != m_size) {
+      int r = m_drv->resize(new_size);
+      if (r < 0) {
+        derr << "resize failed: " << cpp_strerror(r) << dendl;
+        m_drv->shut_down();
+      }
+      // NOTE(review): cache invalidation and the m_size update still run
+      // even after a failed resize triggered shut_down() above — confirm
+      // this fall-through is intentional.
+      r = m_image.invalidate_cache();
+      if (r < 0) {
+        derr << "invalidate rbd cache failed: " << cpp_strerror(r) << dendl;
+        m_drv->shut_down();
+      }
+      m_size = new_size;
+    }
+  }
+}
+
+} // namespace ggate
+} // namespace rbd
diff --git a/src/tools/rbd_ggate/Watcher.h b/src/tools/rbd_ggate/Watcher.h
new file mode 100644
index 000000000..8f524b43f
--- /dev/null
+++ b/src/tools/rbd_ggate/Watcher.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_WATCHER_H
+#define CEPH_RBD_GGATE_WATCHER_H
+
+#include "include/rbd/librbd.hpp"
+
+namespace rbd {
+namespace ggate {
+
+class Driver;
+
+// Update watcher for a mapped image: on notification, propagates image
+// size changes to the ggate driver (see Watcher.cc).
+class Watcher : public librbd::UpdateWatchCtx
+{
+public:
+  // First parameter renamed from the misleading member-style 'm_drv' to
+  // 'drv', matching the definition in Watcher.cc.  None of the
+  // references are owned; all must outlive the Watcher.
+  Watcher(Driver *drv, librados::IoCtx &ioctx, librbd::Image &image,
+          size_t size);
+
+  void handle_notify() override;
+
+private:
+  Driver *m_drv;
+  librados::IoCtx &m_ioctx;
+  librbd::Image &m_image;
+  size_t m_size;  // last image size propagated to the driver
+};
+
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_WATCHER_H
+
diff --git a/src/tools/rbd_ggate/debug.cc b/src/tools/rbd_ggate/debug.cc
new file mode 100644
index 000000000..b675ba5b3
--- /dev/null
+++ b/src/tools/rbd_ggate/debug.cc
@@ -0,0 +1,55 @@
+#include "common/debug.h"
+#include "common/errno.h"
+#include "debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate: "
+
+// Formats 'fmt'/'ap' and emits the message at 'level' through the ceph
+// debug log.  No-op before g_ceph_context is initialized.  Preserves
+// errno so callers may log inside error paths.
+extern "C" void debugv(int level, const char *fmt, va_list ap) {
+  char *msg;
+  int saved_errno = errno;
+
+  if (g_ceph_context == nullptr) {
+    return;
+  }
+
+  if (vasprintf(&msg, fmt, ap) < 0) {
+    // Previously unchecked: on failure 'msg' is indeterminate and was
+    // logged and freed anyway.
+    errno = saved_errno;
+    return;
+  }
+
+  dout(ceph::dout::need_dynamic(level)) << msg << dendl;
+
+  free(msg);
+  errno = saved_errno;
+}
+
+// Varargs front-end for debugv().
+extern "C" void debug(int level, const char *fmt, ...) {
+  va_list args;
+
+  va_start(args, fmt);
+  debugv(level, fmt, args);
+  va_end(args);
+}
+
+extern "C" void errx(const char *fmt, ...) {
+ va_list ap;
+
+ va_start(ap, fmt);
+ debugv(-1, fmt, ap);
+ va_end(ap);
+}
+
+// Like errx(), but appends the description of the current errno.
+// errno observed on entry is the one reported and is preserved.
+extern "C" void err(const char *fmt, ...) {
+  va_list ap;
+  char *msg;
+  int saved_errno = errno;
+
+  va_start(ap, fmt);
+  if (vasprintf(&msg, fmt, ap) < 0) {
+    // Previously unchecked: 'msg' would be indeterminate on failure.
+    // Fall back to the raw format string so something is still logged.
+    msg = nullptr;
+  }
+  va_end(ap);
+  errno = saved_errno;
+
+  errx("%s: %s", msg != nullptr ? msg : fmt, cpp_strerror(errno).c_str());
+
+  free(msg);
+}
diff --git a/src/tools/rbd_ggate/debug.h b/src/tools/rbd_ggate/debug.h
new file mode 100644
index 000000000..da9b46a38
--- /dev/null
+++ b/src/tools/rbd_ggate/debug.h
@@ -0,0 +1,17 @@
+#ifndef CEPH_RBD_GGATE_DEBUG_H
+#define CEPH_RBD_GGATE_DEBUG_H
+
+#include <stdarg.h>  /* va_list; previously relied on a transitive include */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Logging helpers shared by the C driver glue and the C++ code.
+ * debug()/debugv() log at the given level; errx() logs at error level
+ * and err() additionally appends strerror(errno) (see debug.cc).
+ */
+void debug(int level, const char *fmt, ...) __printflike(2, 3);
+void debugv(int level, const char *fmt, va_list ap) __printflike(2, 0);
+void err(const char *fmt, ...) __printflike(1, 2);
+void errx(const char *fmt, ...) __printflike(1, 2);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CEPH_RBD_GGATE_DEBUG_H
new file mode 100644
index 000000000..b1faccd25
--- /dev/null
+++ b/src/tools/rbd_ggate/ggate_drv.c
@@ -0,0 +1,379 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/linker.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <geom/gate/g_gate.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgeom.h>
+
+#include "debug.h"
+#include "ggate_drv.h"
+
+/* Accessors exposing struct g_gate_ctl_io fields behind the opaque
+ * ggate_drv_req_t handle used by the C++ side. */
+
+/* Kernel-assigned sequence number identifying this request. */
+uint64_t ggate_drv_req_id(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  return ggio->gctl_seq;
+}
+
+/* Maps the BIO command from the kernel onto GGATE_DRV_CMD_*. */
+int ggate_drv_req_cmd(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  switch (ggio->gctl_cmd) {
+  case BIO_WRITE:
+    return GGATE_DRV_CMD_WRITE;
+  case BIO_READ:
+    return GGATE_DRV_CMD_READ;
+  case BIO_FLUSH:
+    return GGATE_DRV_CMD_FLUSH;
+  case BIO_DELETE:
+    return GGATE_DRV_CMD_DISCARD;
+  default:
+    return GGATE_DRV_CMD_UNKNOWN;
+  }
+}
+
+/* Byte offset of the I/O within the device. */
+uint64_t ggate_drv_req_offset(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  return ggio->gctl_offset;
+}
+
+/* Length in bytes of the I/O. */
+size_t ggate_drv_req_length(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  return ggio->gctl_length;
+}
+
+/* Data buffer for the request (owned by the request). */
+void *ggate_drv_req_buf(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  return ggio->gctl_error;
+}
+
+/* Completion status to be reported back to the kernel. */
+int ggate_drv_req_error(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  return ggio->gctl_error;
+}
+
+void ggate_drv_req_set_error(ggate_drv_req_t req, int error) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  ggio->gctl_error = error;
+}
+
+/* Detaches the data buffer from the request; ownership passes to the
+ * caller (ggate_drv_send() then skips freeing it via gctl_data). */
+void *ggate_drv_req_release_buf(ggate_drv_req_t req) {
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+  void *data = ggio->gctl_data;
+  ggio->gctl_data = NULL;
+
+  return data;
+}
+
+/* Handle for one ggate unit: control-device descriptor plus unit number. */
+struct ggate_drv {
+  int fd;
+  int unit;
+};
+
+/* Ensures the geom_gate kernel module is available.  A concurrent load
+ * (kldload failing with EEXIST) is treated as success. */
+int ggate_drv_load() {
+  if (modfind("g_gate") != -1) {
+    /* Present in kernel. */
+    return 0;
+  }
+
+  if (kldload("geom_gate") == -1 || modfind("g_gate") == -1) {
+    if (errno != EEXIST) {
+      err("failed to load geom_gate module");
+      return -errno;
+    }
+  }
+  return 0;
+}
+
+/*
+ * Creates a new ggate unit.  'name' may request a specific provider
+ * name ("" selects an automatic unit); on success the resulting name is
+ * written back into the 'name' buffer ('namelen' bytes).  Returns 0 and
+ * stores the handle in *drv_, or a negative errno on failure.
+ */
+int ggate_drv_create(char *name, size_t namelen, size_t sectorsize,
+    size_t mediasize, bool readonly, const char *info, ggate_drv_t *drv_) {
+  struct ggate_drv *drv;
+  struct g_gate_ctl_create ggiocreate;
+
+  debug(20, "%s: name=%s, sectorsize=%zd, mediasize=%zd, readonly=%d, info=%s",
+      __func__, name, sectorsize, mediasize, (int)readonly, info);
+
+  if (*name != '\0') {
+    /*
+     * NOTE(review): this rejects based on the buffer capacity, not
+     * strlen(name); a large buffer holding a short name is refused —
+     * confirm callers always pass namelen == strlen(name) + 1.
+     */
+    if (namelen > sizeof(ggiocreate.gctl_name) - 1) {
+      return -ENAMETOOLONG;
+    }
+  }
+
+  /*
+   * We communicate with ggate via /dev/ggctl. Open it.
+   */
+  int fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+  if (fd == -1) {
+    err("failed to open /dev/" G_GATE_CTL_NAME);
+    return -errno;
+  }
+
+  drv = calloc(1, sizeof(*drv));
+  if (drv == NULL) {
+    /* Was 'errno = -ENOMEM': the negation below then produced a
+     * positive return value, unlike every other failure path. */
+    errno = ENOMEM;
+    goto fail_close;
+  }
+
+  /*
+   * Create provider.
+   */
+  memset(&ggiocreate, 0, sizeof(ggiocreate));
+  ggiocreate.gctl_version = G_GATE_VERSION;
+  ggiocreate.gctl_mediasize = mediasize;
+  ggiocreate.gctl_sectorsize = sectorsize;
+  ggiocreate.gctl_flags = readonly ? G_GATE_FLAG_READONLY : 0;
+  ggiocreate.gctl_maxcount = 0;
+  ggiocreate.gctl_timeout = 0;
+  if (*name != '\0') {
+    ggiocreate.gctl_unit = G_GATE_NAME_GIVEN;
+    strlcpy(ggiocreate.gctl_name, name, sizeof(ggiocreate.gctl_name));
+  } else {
+    ggiocreate.gctl_unit = G_GATE_UNIT_AUTO;
+  }
+  strlcpy(ggiocreate.gctl_info, info, sizeof(ggiocreate.gctl_info));
+  if (ioctl(fd, G_GATE_CMD_CREATE, &ggiocreate) == -1) {
+    err("failed to create " G_GATE_PROVIDER_NAME " device");
+    goto fail;
+  }
+
+  debug(20, "%s: created, unit: %d, name: %s", __func__, ggiocreate.gctl_unit,
+      ggiocreate.gctl_name);
+
+  drv->fd = fd;
+  drv->unit = ggiocreate.gctl_unit;
+  *drv_ = drv;
+
+  /* Report the auto-assigned provider name back to the caller. */
+  if (*name == '\0') {
+    snprintf(name, namelen, "%s%d", G_GATE_PROVIDER_NAME, drv->unit);
+  }
+
+  return 0;
+
+fail:
+  free(drv);
+fail_close:
+  close(fd);
+  return -errno;
+}
+
+/* Forcibly destroys the unit and frees the handle.  errno is saved and
+ * restored because this is often called from error paths that still
+ * need the original errno. */
+void ggate_drv_destroy(ggate_drv_t drv_) {
+  struct ggate_drv *drv = (struct ggate_drv *)drv_;
+  struct g_gate_ctl_destroy ggiodestroy;
+
+  debug(20, "%s %p", __func__, drv);
+
+  memset(&ggiodestroy, 0, sizeof(ggiodestroy));
+  ggiodestroy.gctl_version = G_GATE_VERSION;
+  ggiodestroy.gctl_unit = drv->unit;
+  ggiodestroy.gctl_force = 1;
+
+  // Remember errno.
+  int rerrno = errno;
+
+  int r = ioctl(drv->fd, G_GATE_CMD_DESTROY, &ggiodestroy);
+  if (r == -1) {
+    err("failed to destroy /dev/%s%d device", G_GATE_PROVIDER_NAME,
+        drv->unit);
+  }
+  // Restore errno.
+  errno = rerrno;
+
+  free(drv);
+}
+
+/* Updates the media size of an existing ggate unit.  Returns the ioctl
+ * result (0) on success, negative errno on failure. */
+int ggate_drv_resize(ggate_drv_t drv_, size_t newsize) {
+  struct ggate_drv *drv = (struct ggate_drv *)drv_;
+  struct g_gate_ctl_modify mod;
+  int ret;
+
+  debug(20, "%s %p: newsize=%zd", __func__, drv, newsize);
+
+  memset(&mod, 0, sizeof(mod));
+  mod.gctl_version = G_GATE_VERSION;
+  mod.gctl_unit = drv->unit;
+  mod.gctl_modify = GG_MODIFY_MEDIASIZE;
+  mod.gctl_mediasize = newsize;
+
+  ret = ioctl(drv->fd, G_GATE_CMD_MODIFY, &mod);
+  if (ret == -1) {
+    ret = -errno;
+    err("failed to resize /dev/%s%d device", G_GATE_PROVIDER_NAME, drv->unit);
+  }
+  return ret;
+}
+
+/* Forcibly destroys the unit identified by provider name (not by an
+ * open handle), via a fresh connection to the control device. */
+int ggate_drv_kill(const char *devname) {
+  debug(20, "%s %s", __func__, devname);
+
+  int fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+  if (fd == -1) {
+    err("failed to open /dev/" G_GATE_CTL_NAME);
+    return -errno;
+  }
+
+  struct g_gate_ctl_destroy ggiodestroy;
+  memset(&ggiodestroy, 0, sizeof(ggiodestroy));
+  ggiodestroy.gctl_version = G_GATE_VERSION;
+  /* G_GATE_NAME_GIVEN: select the unit by gctl_name rather than number. */
+  ggiodestroy.gctl_unit = G_GATE_NAME_GIVEN;
+  ggiodestroy.gctl_force = 1;
+
+  strlcpy(ggiodestroy.gctl_name, devname, sizeof(ggiodestroy.gctl_name));
+
+  int r = ioctl(fd, G_GATE_CMD_DESTROY, &ggiodestroy);
+  if (r == -1) {
+    r = -errno;
+    err("failed to destroy %s device", devname);
+  }
+
+  close(fd);
+  return r;
+}
+
+/*
+ * Blocks until the kernel delivers the next I/O request for this unit.
+ * On success stores the request in *req; the caller must complete it
+ * with ggate_drv_send(), which frees it.  Returns 0 or negative errno
+ * (-ECANCELED signals a graceful shutdown).
+ */
+int ggate_drv_recv(ggate_drv_t drv_, ggate_drv_req_t *req) {
+  struct ggate_drv *drv = (struct ggate_drv *)drv_;
+  struct g_gate_ctl_io *ggio;
+  int error, r;
+
+  debug(20, "%s", __func__);
+
+  ggio = calloc(1, sizeof(*ggio));
+  if (ggio == NULL) {
+    return -ENOMEM;
+  }
+
+  ggio->gctl_version = G_GATE_VERSION;
+  ggio->gctl_unit = drv->unit;
+  ggio->gctl_data = malloc(MAXPHYS);
+  if (ggio->gctl_data == NULL) {
+    /* Previously unchecked: a NULL buffer was handed to the kernel. */
+    free(ggio);
+    return -ENOMEM;
+  }
+  ggio->gctl_length = MAXPHYS;
+
+  debug(20, "%s: waiting for request from kernel", __func__);
+  if (ioctl(drv->fd, G_GATE_CMD_START, ggio) == -1) {
+    err("%s: G_GATE_CMD_START failed", __func__);
+    /* Was 'return -errno', leaking ggio and its data buffer. */
+    r = -errno;
+    goto fail;
+  }
+
+  debug(20, "%s: got request from kernel: "
+      "unit=%d, seq=%ju, cmd=%u, offset=%ju, length=%ju, error=%d, data=%p",
+      __func__, ggio->gctl_unit, (uintmax_t)ggio->gctl_seq, ggio->gctl_cmd,
+      (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length,
+      ggio->gctl_error, ggio->gctl_data);
+
+  error = ggio->gctl_error;
+  switch (error) {
+  case 0:
+    break;
+  case ECANCELED:
+    debug(10, "%s: canceled: exit gracefully", __func__);
+    r = -error;
+    goto fail;
+  case ENOMEM:
+    /*
+     * Buffer too small? Impossible, we allocate MAXPHYS
+     * bytes - request can't be bigger than that.
+     */
+    /* FALLTHROUGH */
+  case ENXIO:
+  default:
+    errno = error;
+    err("%s: G_GATE_CMD_START failed", __func__);
+    r = -error;
+    goto fail;
+  }
+
+  *req = ggio;
+  return 0;
+
+fail:
+  free(ggio->gctl_data);
+  free(ggio);
+  return r;
+}
+
+/* Completes a request back to the kernel and frees it.  The data
+ * buffer is freed too unless it was detached earlier with
+ * ggate_drv_req_release_buf() (gctl_data set to NULL). */
+int ggate_drv_send(ggate_drv_t drv_, ggate_drv_req_t req) {
+  struct ggate_drv *drv = (struct ggate_drv *)drv_;
+  struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+  int r = 0;
+
+  debug(20, "%s: send request to kernel: "
+      "unit=%d, seq=%ju, cmd=%u, offset=%ju, length=%ju, error=%d, data=%p",
+      __func__, ggio->gctl_unit, (uintmax_t)ggio->gctl_seq, ggio->gctl_cmd,
+      (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length,
+      ggio->gctl_error, ggio->gctl_data);
+
+  if (ioctl(drv->fd, G_GATE_CMD_DONE, ggio) == -1) {
+    err("%s: G_GATE_CMD_DONE failed", __func__);
+    r = -errno;
+  }
+
+  free(ggio->gctl_data);
+  free(ggio);
+  return r;
+}
+
+/* Looks up a geom config value by name; returns "" when absent. */
+static const char * get_conf(struct ggeom *gp, const char *name) {
+  struct gconfig *cfg;
+
+  LIST_FOREACH(cfg, &gp->lg_config, lg_config) {
+    if (strcmp(cfg->lg_name, name) == 0) {
+      return cfg->lg_val;
+    }
+  }
+  return "";
+}
+
+/*
+ * Enumerates all geoms of the ggate class via libgeom.  On entry *size
+ * is the capacity of the 'info' array; on return it holds the number of
+ * devices found.  Returns 0 on success, -ERANGE when the array is too
+ * small (with *size set to the required count), or -errno.
+ */
+int ggate_drv_list(struct ggate_drv_info *info, size_t *size) {
+  struct gmesh mesh;
+  struct gclass *class;
+  struct ggeom *gp;
+  int r;
+  size_t max_size;
+
+  r = geom_gettree(&mesh);
+  if (r != 0) {
+    return -errno;
+  }
+
+  max_size = *size;
+  *size = 0;
+
+  LIST_FOREACH(class, &mesh.lg_class, lg_class) {
+    if (strcmp(class->lg_name, G_GATE_CLASS_NAME) == 0) {
+      /* First pass: count, so capacity can be validated before filling. */
+      LIST_FOREACH(gp, &class->lg_geom, lg_geom) {
+        (*size)++;
+      }
+      if (*size > max_size) {
+        r = -ERANGE;
+        goto done;
+      }
+      LIST_FOREACH(gp, &class->lg_geom, lg_geom) {
+        strlcpy(info->id, get_conf(gp, "unit"), sizeof(info->id));
+        strlcpy(info->name, gp->lg_name, sizeof(info->name));
+        strlcpy(info->info, get_conf(gp, "info"), sizeof(info->info));
+        info++;
+      }
+    }
+  }
+
+done:
+  geom_deletetree(&mesh);
+  return r;
+}
diff --git a/src/tools/rbd_ggate/ggate_drv.h b/src/tools/rbd_ggate/ggate_drv.h
new file mode 100644
index 000000000..a32f51138
--- /dev/null
+++ b/src/tools/rbd_ggate/ggate_drv.h
@@ -0,0 +1,64 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_GGATE_DRV_H
+#define CEPH_RBD_GGATE_GGATE_DRV_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Opaque handles: a ggate unit and one in-flight I/O request. */
+typedef void *ggate_drv_t;
+typedef void *ggate_drv_req_t;
+
+/*
+ * GGATE driver commands. They are mapped to GgateReq::Command.
+ */
+enum {
+  GGATE_DRV_CMD_UNKNOWN = 0,
+  GGATE_DRV_CMD_WRITE = 1,
+  GGATE_DRV_CMD_READ = 2,
+  GGATE_DRV_CMD_FLUSH = 3,
+  GGATE_DRV_CMD_DISCARD = 4,
+};
+
+/* One entry returned by ggate_drv_list(). */
+struct ggate_drv_info {
+  char id[16];
+  char name[NAME_MAX];
+  char info[2048]; /* G_GATE_INFOSIZE */
+};
+
+/* Request accessors; see ggate_drv.c for semantics. */
+uint64_t ggate_drv_req_id(ggate_drv_req_t req);
+int ggate_drv_req_cmd(ggate_drv_req_t req);
+void *ggate_drv_req_buf(ggate_drv_req_t req);
+size_t ggate_drv_req_length(ggate_drv_req_t req);
+uint64_t ggate_drv_req_offset(ggate_drv_req_t req);
+int ggate_drv_req_error(ggate_drv_req_t req);
+
+void ggate_drv_req_set_error(ggate_drv_req_t req, int error);
+void *ggate_drv_req_release_buf(ggate_drv_req_t req);
+
+/* Loads the geom_gate kernel module if not already present. */
+int ggate_drv_load();
+
+/* Unit lifecycle and I/O; all return 0/handle on success, negative
+ * errno on failure. */
+int ggate_drv_create(char *name, size_t namelen, size_t sectorsize,
+    size_t mediasize, bool readonly, const char *info, ggate_drv_t *drv);
+void ggate_drv_destroy(ggate_drv_t drv);
+
+int ggate_drv_recv(ggate_drv_t drv, ggate_drv_req_t *req);
+int ggate_drv_send(ggate_drv_t drv, ggate_drv_req_t req);
+
+int ggate_drv_resize(ggate_drv_t drv, size_t newsize);
+
+int ggate_drv_kill(const char *devname);
+int ggate_drv_list(struct ggate_drv_info *info, size_t *size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CEPH_RBD_GGATE_GGATE_DRV_H
diff --git a/src/tools/rbd_ggate/main.cc b/src/tools/rbd_ggate/main.cc
new file mode 100644
index 000000000..5ed582fbf
--- /dev/null
+++ b/src/tools/rbd_ggate/main.cc
@@ -0,0 +1,521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+
+#include <iostream>
+#include <memory>
+#include <boost/algorithm/string/predicate.hpp>
+#include <regex>
+
+#include "common/Formatter.h"
+#include "common/Preforker.h"
+#include "common/TextTable.h"
+#include "common/ceph_argparse.h"
+#include "common/config_proxy.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "include/stringify.h"
+
+#include "Driver.h"
+#include "Server.h"
+#include "Watcher.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd-ggate: " << __func__ << ": "
+
+// Prints command-line usage to stdout, then the generic server options.
+static void usage() {
+  std::cout << "Usage: rbd-ggate [options] map <image-or-snap-spec>  Map an image to ggate device\n"
+            << "                           unmap <device path>       Unmap ggate device\n"
+            << "                           list                      List mapped ggate devices\n"
+            << "\n"
+            << "Map options:\n"
+            << "  --device <device path>  Specify ggate device path\n"
+            << "  --read-only             Map readonly\n"
+            << "  --exclusive             Forbid writes by other clients\n"
+            << "\n"
+            << "List options:\n"
+            << "  --format plain|json|xml Output format (default: plain)\n"
+            << "  --pretty-format         Pretty formatting (json and xml)\n"
+            << std::endl;
+  generic_server_usage();
+}
+
+// Parsed command-line state, shared by the command handlers below.
+static std::string devpath, poolname, nsname, imgname, snapname;
+static bool readonly = false;
+static bool exclusive = false;
+
+// The active driver; set by do_map() and used by the signal handler.
+static std::unique_ptr<rbd::ggate::Driver> drv;
+
+// SIGINT/SIGTERM handler: requests driver shutdown so Server::run()
+// in do_map() returns and the daemon exits cleanly.
+static void handle_signal(int signum)
+{
+  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
+
+  ceph_assert(signum == SIGINT || signum == SIGTERM);
+  ceph_assert(drv);
+
+  drv->shut_down();
+}
+
+// Maps the image selected by the global pool/ns/image/snap names onto a
+// new ggate device, daemonizes, and serves I/O until shut down.
+// Returns via forker.exit(); the trailing return is unreachable.
+static int do_map(int argc, const char *argv[])
+{
+  int r;
+
+  librados::Rados rados;
+  librbd::RBD rbd;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+
+  librbd::image_info_t info;
+  std::string desc;
+
+  Preforker forker;
+
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_DAEMON,
+                         CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+  g_ceph_context->_conf.set_val_or_die("pid_file", "");
+
+  // Fork early so the parent can report mapping success/failure.
+  if (global_init_prefork(g_ceph_context) >= 0) {
+    std::string err;
+    r = forker.prefork(err);
+    if (r < 0) {
+      std::cerr << err << std::endl;
+      return r;
+    }
+    if (forker.is_parent()) {
+      if (forker.parent_wait(err) != 0) {
+        return -ENXIO;
+      }
+      return 0;
+    }
+    global_init_postfork_start(g_ceph_context);
+  }
+
+  common_init_finish(g_ceph_context);
+  global_init_chdir(g_ceph_context);
+
+  if (poolname.empty()) {
+    poolname = g_ceph_context->_conf.get_val<std::string>("rbd_default_pool");
+  }
+
+  std::string devname = boost::starts_with(devpath, "/dev/") ?
+    devpath.substr(5) : devpath;
+  std::unique_ptr<rbd::ggate::Watcher> watcher;
+  uint64_t handle;
+
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    goto done;
+  }
+
+  r = rados.connect();
+  if (r < 0) {
+    std::cerr << "rbd-ggate: failed to connect to cluster: " << cpp_strerror(r)
+              << std::endl;
+    goto done;
+  }
+
+  r = rados.ioctx_create(poolname.c_str(), io_ctx);
+  if (r < 0) {
+    // Fixed typo in user-facing message ("acces" -> "access").
+    std::cerr << "rbd-ggate: failed to access pool " << poolname << ": "
+              << cpp_strerror(r) << std::endl;
+    goto done;
+  }
+
+  io_ctx.set_namespace(nsname);
+
+  r = rbd.open(io_ctx, image, imgname.c_str());
+  if (r < 0) {
+    std::cerr << "rbd-ggate: failed to open image " << imgname << ": "
+              << cpp_strerror(r) << std::endl;
+    goto done;
+  }
+
+  if (exclusive) {
+    r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
+    if (r < 0) {
+      std::cerr << "rbd-ggate: failed to acquire exclusive lock: "
+                << cpp_strerror(r) << std::endl;
+      goto done;
+    }
+  }
+
+  desc = "RBD " + poolname + "/" + (nsname.empty() ? "" : nsname + "/") +
+    imgname;
+
+  if (!snapname.empty()) {
+    r = image.snap_set(snapname.c_str());
+    if (r < 0) {
+      std::cerr << "rbd-ggate: failed to set snapshot " << snapname << ": "
+                << cpp_strerror(r) << std::endl;
+      goto done;
+    }
+    // Snapshots are mapped read-only.
+    readonly = true;
+    desc += "@" + snapname;
+  }
+
+  r = image.stat(info, sizeof(info));
+  if (r < 0) {
+    std::cerr << "rbd-ggate: image stat failed: " << cpp_strerror(r)
+              << std::endl;
+    goto done;
+  }
+
+  rbd::ggate::Driver::load();
+  drv.reset(new rbd::ggate::Driver(devname, 512, info.size, readonly, desc));
+  r = drv->init();
+  if (r < 0) {
+    // NOTE(review): init()'s return value is replaced by -errno here —
+    // confirm Driver::init() sets errno rather than returning -errno.
+    r = -errno;
+    std::cerr << "rbd-ggate: failed to create ggate device: " << cpp_strerror(r)
+              << std::endl;
+    goto done;
+  }
+
+  // Track image updates (e.g. resize) while mapped.
+  watcher.reset(new rbd::ggate::Watcher(drv.get(), io_ctx, image, info.size));
+  r = image.update_watch(watcher.get(), &handle);
+  if (r < 0) {
+    std::cerr << "rbd-ggate: failed to set watcher: " << cpp_strerror(r)
+              << std::endl;
+    drv->shut_down();
+    goto done;
+  }
+
+  std::cout << "/dev/" << drv->get_devname() << std::endl;
+
+  if (g_conf()->daemonize) {
+    global_init_postfork_finish(g_ceph_context);
+    forker.daemonize();
+  }
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGHUP, sighup_handler);
+  register_async_signal_handler_oneshot(SIGINT, handle_signal);
+  register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+  // Blocks serving I/O until the driver is shut down.
+  rbd::ggate::Server(drv.get(), image).run();
+
+  unregister_async_signal_handler(SIGHUP, sighup_handler);
+  unregister_async_signal_handler(SIGINT, handle_signal);
+  unregister_async_signal_handler(SIGTERM, handle_signal);
+  shutdown_async_signal_handler();
+
+  r = image.update_unwatch(handle);
+  ceph_assert(r == 0);
+
+done:
+  image.close();
+  io_ctx.close();
+  rados.shutdown();
+
+  if (r < 0) {
+    std::cerr << "rbd-ggate: failed to map: " << cpp_strerror(r) << std::endl;
+  }
+
+  forker.exit(r < 0 ? EXIT_FAILURE : 0);
+  // Unreachable;
+  return r;
+}
+
+// Destroys the ggate device named by the global 'devpath'.
+// Returns 0 on success, negative error code on failure.
+static int do_unmap()
+{
+  std::string devname = devpath;
+  if (boost::starts_with(devname, "/dev/")) {
+    devname = devname.substr(5);
+  }
+
+  int r = rbd::ggate::Driver::kill(devname);
+  if (r < 0) {
+    cerr << "rbd-ggate: failed to destroy " << devname << ": "
+         << cpp_strerror(r) << std::endl;
+  }
+  return r;
+}
+
+// Splits "[pool/[ns/]]image[@snap]" into its components.  Each output
+// string is written only when the corresponding component is present.
+// Returns 0 on success, -EINVAL for a malformed spec.
+static int parse_imgpath(const std::string &imgpath, std::string *poolname,
+                         std::string *nsname, std::string *imgname,
+                         std::string *snapname) {
+  std::regex pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$");
+  std::smatch match;
+  if (!std::regex_match(imgpath, match, pattern)) {
+    std::cerr << "rbd-ggate: invalid spec '" << imgpath << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // Capture groups 1..4 map onto pool, namespace, image, snapshot.
+  std::string *outputs[] = {poolname, nsname, imgname, snapname};
+  for (size_t idx = 0; idx < 4; ++idx) {
+    if (match[idx + 1].matched) {
+      *outputs[idx] = match[idx + 1];
+    }
+  }
+
+  return 0;
+}
+
+// Resolves an image spec to the name of an already-mapped ggate device
+// by matching against the "RBD <pool>/[ns/]<image>[@snap]" description
+// each mapping stores in its device info.  Returns true and sets
+// *devname on a match.
+static bool find_mapped_dev_by_spec(const std::string &spec,
+                                    std::string *devname) {
+  std::string poolname, nsname, imgname, snapname;
+  int r = parse_imgpath(spec, &poolname, &nsname, &imgname, &snapname);
+  if (r < 0) {
+    return false;
+  }
+  if (poolname.empty()) {
+    // We could use rbd_default_pool config to set pool name but then
+    // we would need to initialize the global context. So right now it
+    // is mandatory for the user to specify a pool. Fortunately the
+    // preferred way for users to call rbd-ggate is via rbd, which
+    // cares to set the pool name.
+    return false;
+  }
+
+  std::map<std::string, rbd::ggate::Driver::DevInfo> devs;
+  r = rbd::ggate::Driver::list(&devs);
+  if (r < 0) {
+    return false;
+  }
+
+  for (auto &it : devs) {
+    auto &name = it.second.first;
+    auto &info = it.second.second;
+    // Only consider devices created by rbd-ggate (info starts "RBD ").
+    if (!boost::starts_with(info, "RBD ")) {
+      continue;
+    }
+
+    std::string p, n, i, s;
+    parse_imgpath(info.substr(4), &p, &n, &i, &s);
+    if (p == poolname && n == nsname && i == imgname && s == snapname) {
+      *devname = name;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Lists mapped rbd-ggate devices in plain, json, or xml format.
+// Returns 0 on success, a negative error code on failure.
+static int do_list(const std::string &format, bool pretty_format)
+{
+  rbd::ggate::Driver::load();
+
+  std::map<std::string, rbd::ggate::Driver::DevInfo> devs;
+  int r = rbd::ggate::Driver::list(&devs);
+  if (r < 0) {
+    // Was 'return -r': that flipped the negative error to a positive
+    // value, so main()'s 'r < 0' check treated list failures as success.
+    return r;
+  }
+
+  std::unique_ptr<ceph::Formatter> f;
+  TextTable tbl;
+
+  if (format == "json") {
+    f.reset(new JSONFormatter(pretty_format));
+  } else if (format == "xml") {
+    f.reset(new XMLFormatter(pretty_format));
+  } else if (!format.empty() && format != "plain") {
+    std::cerr << "rbd-ggate: invalid output format: " << format << std::endl;
+    return -EINVAL;
+  }
+
+  if (f) {
+    f->open_array_section("devices");
+  } else {
+    tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  int count = 0;
+
+  for (auto &it : devs) {
+    auto &id = it.first;
+    auto &name = it.second.first;
+    auto &info = it.second.second;
+    // Only devices created by rbd-ggate carry the "RBD " info prefix.
+    if (!boost::starts_with(info, "RBD ")) {
+      continue;
+    }
+
+    std::string poolname;
+    std::string nsname;
+    std::string imgname;
+    std::string snapname(f ? "" : "-");
+    parse_imgpath(info.substr(4), &poolname, &nsname, &imgname, &snapname);
+
+    if (f) {
+      f->open_object_section("device");
+      f->dump_string("id", id);
+      f->dump_string("pool", poolname);
+      f->dump_string("namespace", nsname);
+      f->dump_string("image", imgname);
+      f->dump_string("snap", snapname);
+      f->dump_string("device", "/dev/" + name);
+      f->close_section();
+    } else {
+      tbl << id << poolname << nsname << imgname << snapname << "/dev/" + name
+          << TextTable::endrow;
+    }
+    count++;
+  }
+
+  if (f) {
+    f->close_section(); // devices
+    f->flush(std::cout);
+  } else if (count > 0) {
+    std::cout << tbl;
+  }
+
+  return 0;
+}
+
+// Entry point: parses the command word (map/unmap/list) and its options,
+// then dispatches to do_map/do_unmap/do_list.
+int main(int argc, const char *argv[]) {
+  int r;
+  enum {
+    None,
+    Connect,
+    Disconnect,
+    List
+  } cmd = None;
+
+  vector<const char*> args;
+
+  argv_to_vec(argc, argv, args);
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+  // filter out ceph config options
+  ConfigProxy{false}.parse_argv(args);
+
+  std::string format;
+  bool pretty_format = false;
+  std::vector<const char*>::iterator i;
+
+  // Consume rbd-ggate's own options; anything unrecognized is left in
+  // 'args' for the positional handling below.
+  for (i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+      usage();
+      return 0;
+    } else if (ceph_argparse_witharg(args, i, &devpath, "--device",
+                                     (char *)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+      readonly = true;
+    } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) {
+      exclusive = true;
+    } else if (ceph_argparse_witharg(args, i, &format, "--format",
+                                     (char *)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) {
+      pretty_format = true;
+    } else {
+      ++i;
+    }
+  }
+
+  // First remaining positional argument selects the command.
+  if (args.begin() != args.end()) {
+    if (strcmp(*args.begin(), "map") == 0) {
+      cmd = Connect;
+    } else if (strcmp(*args.begin(), "unmap") == 0) {
+      cmd = Disconnect;
+    } else if (strcmp(*args.begin(), "list") == 0) {
+      cmd = List;
+    } else {
+      cerr << "rbd-ggate: unknown command: " << *args.begin() << std::endl;
+      return EXIT_FAILURE;
+    }
+    args.erase(args.begin());
+  }
+
+  if (cmd == None) {
+    cerr << "rbd-ggate: must specify command" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  // Per-command positional arguments.
+  switch (cmd) {
+    case Connect:
+      if (args.begin() == args.end()) {
+        cerr << "rbd-ggate: must specify image-or-snap-spec" << std::endl;
+        return EXIT_FAILURE;
+      }
+      if (parse_imgpath(*args.begin(), &poolname, &nsname, &imgname,
+                        &snapname) < 0) {
+        return EXIT_FAILURE;
+      }
+      args.erase(args.begin());
+      break;
+    case Disconnect:
+      if (args.begin() == args.end()) {
+        std::cerr << "rbd-ggate: must specify ggate device or image-or-snap-spec"
+                  << std::endl;
+        return EXIT_FAILURE;
+      }
+      // Accept either a device path or an image spec resolvable to one.
+      if (boost::starts_with(*args.begin(), "/dev/") ||
+          !find_mapped_dev_by_spec(*args.begin(), &devpath)) {
+        devpath = *args.begin();
+      }
+      args.erase(args.begin());
+      break;
+    default:
+      break;
+  }
+
+  if (args.begin() != args.end()) {
+    cerr << "rbd-ggate: unknown args: " << *args.begin() << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  // Dispatch.
+  switch (cmd) {
+    case Connect:
+      if (imgname.empty()) {
+        cerr << "rbd-ggate: image name was not specified" << std::endl;
+        return EXIT_FAILURE;
+      }
+
+      r = do_map(argc, argv);
+      if (r < 0)
+        return EXIT_FAILURE;
+      break;
+    case Disconnect:
+      r = do_unmap();
+      if (r < 0)
+        return EXIT_FAILURE;
+      break;
+    case List:
+      r = do_list(format, pretty_format);
+      if (r < 0)
+        return EXIT_FAILURE;
+      break;
+    default:
+      usage();
+      return EXIT_FAILURE;
+  }
+
+  return 0;
+}
diff --git a/src/tools/rbd_mirror/BaseRequest.h b/src/tools/rbd_mirror/BaseRequest.h
new file mode 100644
index 000000000..0da98651d
--- /dev/null
+++ b/src/tools/rbd_mirror/BaseRequest.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_BASE_REQUEST_H
+#define CEPH_RBD_MIRROR_BASE_REQUEST_H
+
+#include "include/Context.h"
+
+namespace rbd {
+namespace mirror {
+
+// Minimal base for one-shot async requests: subclasses implement
+// send(); finish() completes the stored Context and self-deletes, so
+// instances must be heap-allocated.
+class BaseRequest {
+public:
+  BaseRequest(Context *on_finish) : m_on_finish(on_finish) {
+  }
+  virtual ~BaseRequest() {}
+
+  virtual void send() = 0;
+
+protected:
+  // Completes the callback and destroys this request.
+  virtual void finish(int r) {
+    m_on_finish->complete(r);
+    delete this;
+  }
+
+private:
+  Context *m_on_finish;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_BASE_REQUEST_H
diff --git a/src/tools/rbd_mirror/CMakeLists.txt b/src/tools/rbd_mirror/CMakeLists.txt
new file mode 100644
index 000000000..f260d9786
--- /dev/null
+++ b/src/tools/rbd_mirror/CMakeLists.txt
@@ -0,0 +1,90 @@
+# Shared type definitions used by both rbd-mirror and its tests.
+add_library(rbd_mirror_types STATIC
+  image_map/Types.cc
+  instance_watcher/Types.cc
+  leader_watcher/Types.cc)
+
+# Core rbd-mirror implementation sources.
+set(rbd_mirror_internal
+  ClusterWatcher.cc
+  ImageDeleter.cc
+  ImageMap.cc
+  ImageReplayer.cc
+  ImageSync.cc
+  InstanceReplayer.cc
+  InstanceWatcher.cc
+  Instances.cc
+  LeaderWatcher.cc
+  Mirror.cc
+  MirrorStatusUpdater.cc
+  MirrorStatusWatcher.cc
+  NamespaceReplayer.cc
+  PoolMetaCache.cc
+  PoolReplayer.cc
+  PoolWatcher.cc
+  RemotePoolPoller.cc
+  ServiceDaemon.cc
+  Threads.cc
+  Throttler.cc
+  Types.cc
+  image_deleter/SnapshotPurgeRequest.cc
+  image_deleter/TrashMoveRequest.cc
+  image_deleter/TrashRemoveRequest.cc
+  image_deleter/TrashWatcher.cc
+  image_map/LoadRequest.cc
+  image_map/Policy.cc
+  image_map/SimplePolicy.cc
+  image_map/StateTransition.cc
+  image_map/UpdateRequest.cc
+  image_replayer/BootstrapRequest.cc
+  image_replayer/CloseImageRequest.cc
+  image_replayer/CreateImageRequest.cc
+  image_replayer/GetMirrorImageIdRequest.cc
+  image_replayer/OpenImageRequest.cc
+  image_replayer/OpenLocalImageRequest.cc
+  image_replayer/PrepareLocalImageRequest.cc
+  image_replayer/PrepareRemoteImageRequest.cc
+  image_replayer/StateBuilder.cc
+  image_replayer/TimeRollingMean.cc
+  image_replayer/Utils.cc
+  image_replayer/journal/CreateLocalImageRequest.cc
+  image_replayer/journal/EventPreprocessor.cc
+  image_replayer/journal/PrepareReplayRequest.cc
+  image_replayer/journal/Replayer.cc
+  image_replayer/journal/ReplayStatusFormatter.cc
+  image_replayer/journal/StateBuilder.cc
+  image_replayer/journal/SyncPointHandler.cc
+  image_replayer/snapshot/ApplyImageStateRequest.cc
+  image_replayer/snapshot/CreateLocalImageRequest.cc
+  image_replayer/snapshot/PrepareReplayRequest.cc
+  image_replayer/snapshot/Replayer.cc
+  image_replayer/snapshot/StateBuilder.cc
+  image_replayer/snapshot/Utils.cc
+  image_sync/SyncPointCreateRequest.cc
+  image_sync/SyncPointPruneRequest.cc
+  image_sync/Utils.cc
+  pool_watcher/RefreshImagesRequest.cc
+  service_daemon/Types.cc)
+
+add_library(rbd_mirror_internal STATIC
+  ${rbd_mirror_internal}
+  $<TARGET_OBJECTS:common_prioritycache_obj>)
+
+# The rbd-mirror daemon executable and its link dependencies.
+add_executable(rbd-mirror
+  main.cc)
+target_link_libraries(rbd-mirror
+  rbd_mirror_internal
+  rbd_mirror_types
+  rbd_api
+  rbd_internal
+  rbd_types
+  journal
+  libneorados
+  librados
+  osdc
+  cls_rbd_client
+  cls_lock_client
+  cls_journal_client
+  global
+  heap_profiler
+  ${ALLOC_LIBS}
+  OpenSSL::SSL)
+install(TARGETS rbd-mirror DESTINATION bin)
diff --git a/src/tools/rbd_mirror/CancelableRequest.h b/src/tools/rbd_mirror/CancelableRequest.h
new file mode 100644
index 000000000..26e8dcb5b
--- /dev/null
+++ b/src/tools/rbd_mirror/CancelableRequest.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_CANCELABLE_REQUEST_H
+#define CEPH_RBD_MIRROR_CANCELABLE_REQUEST_H
+
+#include "common/RefCountedObj.h"
+#include "include/Context.h"
+
+namespace rbd {
+namespace mirror {
+
+class CancelableRequest : public RefCountedObject {
+public:
+ CancelableRequest(const std::string& name, CephContext *cct,
+ Context *on_finish)
+ : RefCountedObject(cct), m_name(name), m_cct(cct),
+ m_on_finish(on_finish) {
+ }
+
+ virtual void send() = 0;
+ virtual void cancel() {}
+
+protected:
+ virtual void finish(int r) {
+ if (m_cct) {
+ lsubdout(m_cct, rbd_mirror, 20) << m_name << "::finish: r=" << r << dendl;
+ }
+ if (m_on_finish) {
+ m_on_finish->complete(r);
+ }
+ put();
+ }
+
+private:
+ const std::string m_name;
+ CephContext *m_cct;
+ Context *m_on_finish;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_CANCELABLE_REQUEST_H
diff --git a/src/tools/rbd_mirror/ClusterWatcher.cc b/src/tools/rbd_mirror/ClusterWatcher.cc
new file mode 100644
index 000000000..2ae1306be
--- /dev/null
+++ b/src/tools/rbd_mirror/ClusterWatcher.cc
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ClusterWatcher.h"
+#include "include/stringify.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/internal.h"
+#include "librbd/api/Mirror.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
+#include "json_spirit/json_spirit.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ClusterWatcher:" << this << " " \
+ << __func__ << ": "
+
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using librados::Rados;
+using librados::IoCtx;
+
+namespace rbd {
+namespace mirror {
+
+ClusterWatcher::ClusterWatcher(RadosRef cluster, ceph::mutex &lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon)
+ : m_cluster(cluster), m_lock(lock), m_service_daemon(service_daemon)
+{
+}
+
+const ClusterWatcher::PoolPeers& ClusterWatcher::get_pool_peers() const
+{
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_pool_peers;
+}
+
+std::string ClusterWatcher::get_site_name() const {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return m_site_name;
+}
+
+void ClusterWatcher::refresh_pools()
+{
+ dout(20) << "enter" << dendl;
+
+ PoolPeers pool_peers;
+ read_pool_peers(&pool_peers);
+
+ std::string site_name;
+ int r = read_site_name(&site_name);
+
+ std::lock_guard l{m_lock};
+ m_pool_peers = pool_peers;
+
+ if (r >= 0) {
+ m_site_name = site_name;
+ }
+
+ // TODO: perhaps use a workqueue instead, once we get notifications
+ // about config changes for existing pools
+}
+
+void ClusterWatcher::read_pool_peers(PoolPeers *pool_peers)
+{
+ int r = m_cluster->wait_for_latest_osdmap();
+ if (r < 0) {
+ derr << "error waiting for OSD map: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ list<pair<int64_t, string> > pools;
+ r = m_cluster->pool_list2(pools);
+ if (r < 0) {
+ derr << "error listing pools: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ std::set<int64_t> service_pool_ids;
+ for (auto& kv : pools) {
+ int64_t pool_id = kv.first;
+ auto& pool_name = kv.second;
+ int64_t base_tier;
+ r = m_cluster->pool_get_base_tier(pool_id, &base_tier);
+ if (r == -ENOENT) {
+ dout(10) << "pool " << pool_name << " no longer exists" << dendl;
+ continue;
+ } else if (r < 0) {
+ derr << "Error retrieving base tier for pool " << pool_name << dendl;
+ continue;
+ }
+ if (pool_id != base_tier) {
+ // pool is a cache; skip it
+ continue;
+ }
+
+ IoCtx ioctx;
+ r = m_cluster->ioctx_create2(pool_id, ioctx);
+ if (r == -ENOENT) {
+ dout(10) << "pool " << pool_id << " no longer exists" << dendl;
+ continue;
+ } else if (r < 0) {
+ derr << "Error accessing pool " << pool_name << cpp_strerror(r) << dendl;
+ continue;
+ }
+
+ cls::rbd::MirrorMode mirror_mode_internal;
+ r = librbd::cls_client::mirror_mode_get(&ioctx, &mirror_mode_internal);
+ if (r == 0 && mirror_mode_internal == cls::rbd::MIRROR_MODE_DISABLED) {
+ dout(10) << "mirroring is disabled for pool " << pool_name << dendl;
+ continue;
+ }
+
+ service_pool_ids.insert(pool_id);
+ if (m_service_pools.find(pool_id) == m_service_pools.end()) {
+ m_service_pools[pool_id] = {};
+ m_service_daemon->add_pool(pool_id, pool_name);
+ }
+
+ if (r == -EPERM) {
+ dout(10) << "access denied querying pool " << pool_name << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING, "access denied");
+ continue;
+ } else if (r < 0) {
+ derr << "could not tell whether mirroring was enabled for " << pool_name
+ << " : " << cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING, "mirroring mode query failed");
+ continue;
+ }
+
+ vector<librbd::mirror_peer_site_t> configs;
+ r = librbd::api::Mirror<>::peer_site_list(ioctx, &configs);
+ if (r < 0) {
+ derr << "error reading mirroring config for pool " << pool_name
+ << cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_ERROR, "mirroring peer list failed");
+ continue;
+ }
+
+ std::vector<PeerSpec> peers;
+ peers.reserve(configs.size());
+ for (auto& peer : configs) {
+ if (peer.direction != RBD_MIRROR_PEER_DIRECTION_TX) {
+ peers.push_back(peer);
+ }
+ }
+
+ for (auto& peer : peers) {
+ r = resolve_peer_site_config_keys(pool_id, pool_name, &peer);
+ if (r < 0) {
+ break;
+ }
+ }
+
+ if (m_service_pools[pool_id] != service_daemon::CALLOUT_ID_NONE) {
+ m_service_daemon->remove_callout(pool_id, m_service_pools[pool_id]);
+ m_service_pools[pool_id] = service_daemon::CALLOUT_ID_NONE;
+ }
+
+ pool_peers->emplace(pool_id, Peers{peers.begin(), peers.end()});
+ }
+
+ for (auto it = m_service_pools.begin(); it != m_service_pools.end(); ) {
+ auto current_it(it++);
+ if (service_pool_ids.find(current_it->first) == service_pool_ids.end()) {
+ m_service_daemon->remove_pool(current_it->first);
+ m_service_pools.erase(current_it->first);
+ }
+ }
+}
+
+int ClusterWatcher::read_site_name(std::string* site_name) {
+ dout(10) << dendl;
+
+ librbd::RBD rbd;
+ return rbd.mirror_site_name_get(*m_cluster, site_name);
+}
+
+int ClusterWatcher::resolve_peer_site_config_keys(int64_t pool_id,
+ const std::string& pool_name,
+ PeerSpec* peer) {
+ dout(10) << "retrieving config-key: pool_id=" << pool_id << ", "
+ << "pool_name=" << pool_name << ", "
+ << "peer_uuid=" << peer->uuid << dendl;
+
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config-key get\", "
+ "\"key\": \"" RBD_MIRROR_PEER_CONFIG_KEY_PREFIX + stringify(pool_id) +
+ "/" + peer->uuid + "\""
+ "}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+ int r = m_cluster->mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r == -ENOENT || out_bl.length() == 0) {
+ return 0;
+ } else if (r < 0) {
+ derr << "error reading mirroring peer config for pool " << pool_name << ": "
+ << cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING,
+ "mirroring peer config-key query failed");
+ return r;
+ }
+
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+ if(json_spirit::read(out_bl.to_str(), json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ if (json_obj.count("mon_host")) {
+ peer->mon_host = json_obj["mon_host"].get_str();
+ }
+ if (json_obj.count("key")) {
+ peer->key = json_obj["key"].get_str();
+ }
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ derr << "error parsing mirroring peer config for pool " << pool_name << ", "
+ << "peer " << peer->uuid << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING,
+ "mirroring peer config-key decode failed");
+ }
+
+ return 0;
+}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/ClusterWatcher.h b/src/tools/rbd_mirror/ClusterWatcher.h
new file mode 100644
index 000000000..93356fec6
--- /dev/null
+++ b/src/tools/rbd_mirror/ClusterWatcher.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_CLUSTER_WATCHER_H
+#define CEPH_RBD_MIRROR_CLUSTER_WATCHER_H
+
+#include <map>
+#include <memory>
+#include <set>
+
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <unordered_map>
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ServiceDaemon;
+
+/**
+ * Tracks mirroring configuration for pools in a single
+ * cluster.
+ */
+class ClusterWatcher {
+public:
+ struct PeerSpecCompare {
+ bool operator()(const PeerSpec& lhs, const PeerSpec& rhs) const {
+ return (lhs.uuid < rhs.uuid);
+ }
+ };
+ typedef std::set<PeerSpec, PeerSpecCompare> Peers;
+ typedef std::map<int64_t, Peers> PoolPeers;
+
+ ClusterWatcher(RadosRef cluster, ceph::mutex &lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon);
+ ~ClusterWatcher() = default;
+ ClusterWatcher(const ClusterWatcher&) = delete;
+ ClusterWatcher& operator=(const ClusterWatcher&) = delete;
+
+ // Caller controls frequency of calls
+ void refresh_pools();
+ const PoolPeers& get_pool_peers() const;
+ std::string get_site_name() const;
+
+private:
+ typedef std::unordered_map<int64_t, service_daemon::CalloutId> ServicePools;
+
+ RadosRef m_cluster;
+ ceph::mutex &m_lock;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
+
+ ServicePools m_service_pools;
+ PoolPeers m_pool_peers;
+ std::string m_site_name;
+
+ void read_pool_peers(PoolPeers *pool_peers);
+
+ int read_site_name(std::string* site_name);
+
+ int resolve_peer_site_config_keys(
+ int64_t pool_id, const std::string& pool_name, PeerSpec* peer);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_CLUSTER_WATCHER_H
diff --git a/src/tools/rbd_mirror/ImageDeleter.cc b/src/tools/rbd_mirror/ImageDeleter.cc
new file mode 100644
index 000000000..fcdd1baad
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageDeleter.cc
@@ -0,0 +1,548 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "global/global_context.h"
+#include "librbd/internal.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/asio/ContextWQ.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Utils.h"
+#include "ImageDeleter.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/Throttler.h"
+#include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/TrashWatcher.h"
+#include <map>
+#include <sstream>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+
+using std::string;
+using std::stringstream;
+using std::vector;
+using std::pair;
+using std::make_pair;
+
+using librados::IoCtx;
+using namespace librbd;
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_async_context_callback;
+
+namespace {
+
+class ImageDeleterAdminSocketCommand {
+public:
+ virtual ~ImageDeleterAdminSocketCommand() {}
+ virtual int call(Formatter *f) = 0;
+};
+
+template <typename I>
+class StatusCommand : public ImageDeleterAdminSocketCommand {
+public:
+ explicit StatusCommand(ImageDeleter<I> *image_del) : image_del(image_del) {}
+
+ int call(Formatter *f) override {
+ image_del->print_status(f);
+ return 0;
+ }
+
+private:
+ ImageDeleter<I> *image_del;
+};
+
+} // anonymous namespace
+
+template <typename I>
+class ImageDeleterAdminSocketHook : public AdminSocketHook {
+public:
+ ImageDeleterAdminSocketHook(CephContext *cct, const std::string& pool_name,
+ ImageDeleter<I> *image_del) :
+ admin_socket(cct->get_admin_socket()) {
+
+ std::string command;
+ int r;
+
+ command = "rbd mirror deletion status " + pool_name;
+ r = admin_socket->register_command(command, this,
+ "get status for image deleter");
+ if (r == 0) {
+ commands[command] = new StatusCommand<I>(image_del);
+ }
+
+ }
+
+ ~ImageDeleterAdminSocketHook() override {
+ (void)admin_socket->unregister_commands(this);
+ for (Commands::const_iterator i = commands.begin(); i != commands.end();
+ ++i) {
+ delete i->second;
+ }
+ }
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ bufferlist& out) override {
+ Commands::const_iterator i = commands.find(command);
+ ceph_assert(i != commands.end());
+ return i->second->call(f);
+ }
+
+private:
+ typedef std::map<std::string, ImageDeleterAdminSocketCommand*,
+ std::less<>> Commands;
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+template <typename I>
+ImageDeleter<I>::ImageDeleter(
+ librados::IoCtx& local_io_ctx, Threads<librbd::ImageCtx>* threads,
+ Throttler<librbd::ImageCtx>* image_deletion_throttler,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon)
+ : m_local_io_ctx(local_io_ctx), m_threads(threads),
+ m_image_deletion_throttler(image_deletion_throttler),
+ m_service_daemon(service_daemon), m_trash_listener(this),
+ m_lock(ceph::make_mutex(
+ librbd::util::unique_lock_name("rbd::mirror::ImageDeleter::m_lock",
+ this))) {
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageDeleter: " << " " \
+ << __func__ << ": "
+
+template <typename I>
+void ImageDeleter<I>::trash_move(librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ bool resync,
+ librbd::asio::ContextWQ* work_queue,
+ Context* on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "resync=" << resync << dendl;
+
+ auto req = rbd::mirror::image_deleter::TrashMoveRequest<>::create(
+ local_io_ctx, global_image_id, resync, work_queue, on_finish);
+ req->send();
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageDeleter: " << this << " " \
+ << __func__ << ": "
+
+template <typename I>
+void ImageDeleter<I>::init(Context* on_finish) {
+ dout(10) << dendl;
+
+ m_asok_hook = new ImageDeleterAdminSocketHook<I>(
+ g_ceph_context, m_local_io_ctx.get_pool_name(), this);
+
+ m_trash_watcher = image_deleter::TrashWatcher<I>::create(m_local_io_ctx,
+ m_threads,
+ m_trash_listener);
+ m_trash_watcher->init(on_finish);
+}
+
+template <typename I>
+void ImageDeleter<I>::shut_down(Context* on_finish) {
+ dout(10) << dendl;
+
+ delete m_asok_hook;
+ m_asok_hook = nullptr;
+
+ m_image_deletion_throttler->drain(m_local_io_ctx.get_namespace(),
+ -ESTALE);
+
+ shut_down_trash_watcher(on_finish);
+}
+
+template <typename I>
+void ImageDeleter<I>::shut_down_trash_watcher(Context* on_finish) {
+ dout(10) << dendl;
+ ceph_assert(m_trash_watcher);
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ delete m_trash_watcher;
+ m_trash_watcher = nullptr;
+
+ wait_for_ops(on_finish);
+ });
+ m_trash_watcher->shut_down(ctx);
+}
+
+template <typename I>
+void ImageDeleter<I>::wait_for_ops(Context* on_finish) {
+ {
+ std::scoped_lock locker{m_threads->timer_lock, m_lock};
+ m_running = false;
+ cancel_retry_timer();
+ }
+
+ auto ctx = new LambdaContext([this, on_finish](int) {
+ cancel_all_deletions(on_finish);
+ });
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void ImageDeleter<I>::cancel_all_deletions(Context* on_finish) {
+ m_image_deletion_throttler->drain(m_local_io_ctx.get_namespace(),
+ -ECANCELED);
+ {
+ std::lock_guard locker{m_lock};
+ // wake up any external state machines waiting on deletions
+ ceph_assert(m_in_flight_delete_queue.empty());
+ for (auto& queue : {&m_delete_queue, &m_retry_delete_queue}) {
+ for (auto& info : *queue) {
+ notify_on_delete(info->image_id, -ECANCELED);
+ }
+ queue->clear();
+ }
+ }
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ImageDeleter<I>::wait_for_deletion(const std::string& image_id,
+ bool scheduled_only,
+ Context* on_finish) {
+ dout(5) << "image_id=" << image_id << dendl;
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ m_threads->work_queue->queue(on_finish, r);
+ });
+
+ std::lock_guard locker{m_lock};
+ auto del_info = find_delete_info(image_id);
+ if (!del_info && scheduled_only) {
+ // image not scheduled for deletion
+ on_finish->complete(0);
+ return;
+ }
+
+ notify_on_delete(image_id, -ESTALE);
+ m_on_delete_contexts[image_id] = on_finish;
+}
+
+template <typename I>
+void ImageDeleter<I>::complete_active_delete(DeleteInfoRef* delete_info,
+ int r) {
+ dout(20) << "info=" << *delete_info << ", r=" << r << dendl;
+ std::lock_guard locker{m_lock};
+ notify_on_delete((*delete_info)->image_id, r);
+ delete_info->reset();
+}
+
+template <typename I>
+void ImageDeleter<I>::enqueue_failed_delete(DeleteInfoRef* delete_info,
+ int error_code,
+ double retry_delay) {
+ dout(20) << "info=" << *delete_info << ", r=" << error_code << dendl;
+ if (error_code == -EBLOCKLISTED) {
+ std::lock_guard locker{m_lock};
+ derr << "blocklisted while deleting local image" << dendl;
+ complete_active_delete(delete_info, error_code);
+ return;
+ }
+
+ std::scoped_lock locker{m_threads->timer_lock, m_lock};
+ auto& delete_info_ref = *delete_info;
+ notify_on_delete(delete_info_ref->image_id, error_code);
+ delete_info_ref->error_code = error_code;
+ ++delete_info_ref->retries;
+ delete_info_ref->retry_time = (clock_t::now() +
+ ceph::make_timespan(retry_delay));
+ m_retry_delete_queue.push_back(delete_info_ref);
+
+ schedule_retry_timer();
+}
+
+template <typename I>
+typename ImageDeleter<I>::DeleteInfoRef
+ImageDeleter<I>::find_delete_info(const std::string &image_id) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ DeleteQueue delete_queues[] = {m_in_flight_delete_queue,
+ m_retry_delete_queue,
+ m_delete_queue};
+
+ DeleteInfo delete_info{image_id};
+ for (auto& queue : delete_queues) {
+ auto it = std::find_if(queue.begin(), queue.end(),
+ [&delete_info](const DeleteInfoRef& ref) {
+ return delete_info == *ref;
+ });
+ if (it != queue.end()) {
+ return *it;
+ }
+ }
+ return {};
+}
+
+template <typename I>
+void ImageDeleter<I>::print_status(Formatter *f) {
+ dout(20) << dendl;
+
+ f->open_object_section("image_deleter_status");
+ f->open_array_section("delete_images_queue");
+
+ std::lock_guard l{m_lock};
+ for (const auto& image : m_delete_queue) {
+ image->print_status(f);
+ }
+
+ f->close_section();
+ f->open_array_section("failed_deletes_queue");
+ for (const auto& image : m_retry_delete_queue) {
+ image->print_status(f, true);
+ }
+
+ f->close_section();
+ f->close_section();
+}
+
+template <typename I>
+vector<string> ImageDeleter<I>::get_delete_queue_items() {
+ vector<string> items;
+
+ std::lock_guard l{m_lock};
+ for (const auto& del_info : m_delete_queue) {
+ items.push_back(del_info->image_id);
+ }
+
+ return items;
+}
+
+template <typename I>
+vector<pair<string, int> > ImageDeleter<I>::get_failed_queue_items() {
+ vector<pair<string, int> > items;
+
+ std::lock_guard l{m_lock};
+ for (const auto& del_info : m_retry_delete_queue) {
+ items.push_back(make_pair(del_info->image_id,
+ del_info->error_code));
+ }
+
+ return items;
+}
+
+template <typename I>
+void ImageDeleter<I>::remove_images() {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+ while (m_running && !m_delete_queue.empty()) {
+
+ DeleteInfoRef delete_info = m_delete_queue.front();
+ m_delete_queue.pop_front();
+
+ ceph_assert(delete_info);
+
+ auto on_start = create_async_context_callback(
+ m_threads->work_queue, new LambdaContext(
+ [this, delete_info](int r) {
+ if (r < 0) {
+ notify_on_delete(delete_info->image_id, r);
+ return;
+ }
+ remove_image(delete_info);
+ }));
+
+ m_image_deletion_throttler->start_op(m_local_io_ctx.get_namespace(),
+ delete_info->image_id, on_start);
+ }
+}
+
+template <typename I>
+void ImageDeleter<I>::remove_image(DeleteInfoRef delete_info) {
+ dout(10) << "info=" << *delete_info << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_in_flight_delete_queue.push_back(delete_info);
+ m_async_op_tracker.start_op();
+
+ auto ctx = new LambdaContext([this, delete_info](int r) {
+ handle_remove_image(delete_info, r);
+ m_async_op_tracker.finish_op();
+ });
+
+ auto req = image_deleter::TrashRemoveRequest<I>::create(
+ m_local_io_ctx, delete_info->image_id, &delete_info->error_result,
+ m_threads->work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageDeleter<I>::handle_remove_image(DeleteInfoRef delete_info,
+ int r) {
+ dout(10) << "info=" << *delete_info << ", r=" << r << dendl;
+
+ m_image_deletion_throttler->finish_op(m_local_io_ctx.get_namespace(),
+ delete_info->image_id);
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ auto it = std::find(m_in_flight_delete_queue.begin(),
+ m_in_flight_delete_queue.end(), delete_info);
+ ceph_assert(it != m_in_flight_delete_queue.end());
+ m_in_flight_delete_queue.erase(it);
+ }
+
+ if (r < 0) {
+ if (delete_info->error_result == image_deleter::ERROR_RESULT_COMPLETE) {
+ complete_active_delete(&delete_info, r);
+ } else if (delete_info->error_result ==
+ image_deleter::ERROR_RESULT_RETRY_IMMEDIATELY) {
+ enqueue_failed_delete(&delete_info, r, m_busy_interval);
+ } else {
+ auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+ double failed_interval = cct->_conf.get_val<double>(
+ "rbd_mirror_delete_retry_interval");
+ enqueue_failed_delete(&delete_info, r, failed_interval);
+ }
+ } else {
+ complete_active_delete(&delete_info, 0);
+ }
+
+ // process the next queued image to delete
+ remove_images();
+}
+
+template <typename I>
+void ImageDeleter<I>::schedule_retry_timer() {
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ if (!m_running || m_timer_ctx != nullptr || m_retry_delete_queue.empty()) {
+ return;
+ }
+
+ dout(10) << dendl;
+ auto &delete_info = m_retry_delete_queue.front();
+ m_timer_ctx = new LambdaContext([this](int r) {
+ handle_retry_timer();
+ });
+ m_threads->timer->add_event_at(delete_info->retry_time, m_timer_ctx);
+}
+
+template <typename I>
+void ImageDeleter<I>::cancel_retry_timer() {
+ dout(10) << dendl;
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ if (m_timer_ctx != nullptr) {
+ bool canceled = m_threads->timer->cancel_event(m_timer_ctx);
+ m_timer_ctx = nullptr;
+ ceph_assert(canceled);
+ }
+}
+
+template <typename I>
+void ImageDeleter<I>::handle_retry_timer() {
+ dout(10) << dendl;
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_timer_ctx != nullptr);
+ m_timer_ctx = nullptr;
+
+ ceph_assert(m_running);
+ ceph_assert(!m_retry_delete_queue.empty());
+
+ // move all ready-to-ready items back to main queue
+ auto now = clock_t::now();
+ while (!m_retry_delete_queue.empty()) {
+ auto &delete_info = m_retry_delete_queue.front();
+ if (delete_info->retry_time > now) {
+ break;
+ }
+
+ m_delete_queue.push_back(delete_info);
+ m_retry_delete_queue.pop_front();
+ }
+
+ // schedule wake up for any future retries
+ schedule_retry_timer();
+
+ // start (concurrent) removal of images
+ m_async_op_tracker.start_op();
+ auto ctx = new LambdaContext([this](int r) {
+ remove_images();
+ m_async_op_tracker.finish_op();
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void ImageDeleter<I>::handle_trash_image(const std::string& image_id,
+ const ImageDeleter<I>::clock_t::time_point& deferment_end_time) {
+ std::scoped_lock locker{m_threads->timer_lock, m_lock};
+
+ auto del_info = find_delete_info(image_id);
+ if (del_info != nullptr) {
+ dout(20) << "image " << image_id << " "
+ << "was already scheduled for deletion" << dendl;
+ return;
+ }
+
+ dout(10) << "image_id=" << image_id << ", "
+ << "deferment_end_time=" << utime_t{deferment_end_time} << dendl;
+
+ del_info.reset(new DeleteInfo(image_id));
+ del_info->retry_time = deferment_end_time;
+ m_retry_delete_queue.push_back(del_info);
+
+ schedule_retry_timer();
+}
+
+template <typename I>
+void ImageDeleter<I>::notify_on_delete(const std::string& image_id,
+ int r) {
+ dout(10) << "image_id=" << image_id << ", r=" << r << dendl;
+ auto it = m_on_delete_contexts.find(image_id);
+ if (it == m_on_delete_contexts.end()) {
+ return;
+ }
+
+ it->second->complete(r);
+ m_on_delete_contexts.erase(it);
+}
+
+template <typename I>
+void ImageDeleter<I>::DeleteInfo::print_status(Formatter *f,
+ bool print_failure_info) {
+ f->open_object_section("delete_info");
+ f->dump_string("image_id", image_id);
+ if (print_failure_info) {
+ f->dump_string("error_code", cpp_strerror(error_code));
+ f->dump_int("retries", retries);
+ }
+ f->close_section();
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageDeleter<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageDeleter.h b/src/tools/rbd_mirror/ImageDeleter.h
new file mode 100644
index 000000000..5fe79496b
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageDeleter.h
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_H
+
+#include "include/utime.h"
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+#include <atomic>
+#include <deque>
+#include <iosfwd>
+#include <map>
+#include <memory>
+#include <vector>
+
+class AdminSocketHook;
+class Context;
+namespace librbd {
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ServiceDaemon;
+template <typename> class Threads;
+template <typename> class Throttler;
+
+namespace image_deleter { template <typename> struct TrashWatcher; }
+
+/**
+ * Manage deletion of non-primary images.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageDeleter {
+public:
+ static ImageDeleter* create(
+ librados::IoCtx& local_io_ctx, Threads<librbd::ImageCtx>* threads,
+ Throttler<librbd::ImageCtx>* image_deletion_throttler,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon) {
+ return new ImageDeleter(local_io_ctx, threads, image_deletion_throttler,
+ service_daemon);
+ }
+
+ ImageDeleter(librados::IoCtx& local_io_ctx,
+ Threads<librbd::ImageCtx>* threads,
+ Throttler<librbd::ImageCtx>* image_deletion_throttler,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon);
+
+ ImageDeleter(const ImageDeleter&) = delete;
+ ImageDeleter& operator=(const ImageDeleter&) = delete;
+
+ static void trash_move(librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id, bool resync,
+ librbd::asio::ContextWQ* work_queue,
+ Context* on_finish);
+
+ void init(Context* on_finish);
+ void shut_down(Context* on_finish);
+
+ void print_status(Formatter *f);
+
+ // for testing purposes
+ void wait_for_deletion(const std::string &image_id,
+ bool scheduled_only, Context* on_finish);
+
+ std::vector<std::string> get_delete_queue_items();
+ std::vector<std::pair<std::string, int> > get_failed_queue_items();
+
+ inline void set_busy_timer_interval(double interval) {
+ m_busy_interval = interval;
+ }
+
+private:
+ using clock_t = ceph::real_clock;
+ struct TrashListener : public image_deleter::TrashListener {
+ ImageDeleter *image_deleter;
+
+ TrashListener(ImageDeleter *image_deleter) : image_deleter(image_deleter) {
+ }
+
+ void handle_trash_image(const std::string& image_id,
+ const ceph::real_clock::time_point& deferment_end_time) override {
+ image_deleter->handle_trash_image(image_id, deferment_end_time);
+ }
+ };
+
+ struct DeleteInfo {
+ std::string image_id;
+
+ image_deleter::ErrorResult error_result = {};
+ int error_code = 0;
+ clock_t::time_point retry_time;
+ int retries = 0;
+
+ DeleteInfo(const std::string& image_id)
+ : image_id(image_id) {
+ }
+
+ inline bool operator==(const DeleteInfo& delete_info) const {
+ return (image_id == delete_info.image_id);
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, DeleteInfo& delete_info) {
+ os << "[image_id=" << delete_info.image_id << "]";
+ return os;
+ }
+
+ void print_status(Formatter *f,
+ bool print_failure_info=false);
+ };
+ typedef std::shared_ptr<DeleteInfo> DeleteInfoRef;
+ typedef std::deque<DeleteInfoRef> DeleteQueue;
+ typedef std::map<std::string, Context*> OnDeleteContexts;
+
+ librados::IoCtx& m_local_io_ctx;
+ Threads<librbd::ImageCtx>* m_threads;
+ Throttler<librbd::ImageCtx>* m_image_deletion_throttler;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
+
+ image_deleter::TrashWatcher<ImageCtxT>* m_trash_watcher = nullptr;
+ TrashListener m_trash_listener;
+
+ std::atomic<unsigned> m_running { 1 };
+
+ double m_busy_interval = 1;
+
+ AsyncOpTracker m_async_op_tracker;
+
+ ceph::mutex m_lock;
+ DeleteQueue m_delete_queue;
+ DeleteQueue m_retry_delete_queue;
+ DeleteQueue m_in_flight_delete_queue;
+
+ OnDeleteContexts m_on_delete_contexts;
+
+ AdminSocketHook *m_asok_hook = nullptr;
+
+ Context *m_timer_ctx = nullptr;
+
+ bool process_image_delete();
+
+ void complete_active_delete(DeleteInfoRef* delete_info, int r);
+ void enqueue_failed_delete(DeleteInfoRef* delete_info, int error_code,
+ double retry_delay);
+
+ DeleteInfoRef find_delete_info(const std::string &image_id);
+
+ void remove_images();
+ void remove_image(DeleteInfoRef delete_info);
+ void handle_remove_image(DeleteInfoRef delete_info, int r);
+
+ void schedule_retry_timer();
+ void cancel_retry_timer();
+ void handle_retry_timer();
+
+ void handle_trash_image(const std::string& image_id,
+ const clock_t::time_point& deferment_end_time);
+
+ void shut_down_trash_watcher(Context* on_finish);
+ void wait_for_ops(Context* on_finish);
+ void cancel_all_deletions(Context* on_finish);
+
+ void notify_on_delete(const std::string& image_id, int r);
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageDeleter<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_H
diff --git a/src/tools/rbd_mirror/ImageMap.cc b/src/tools/rbd_mirror/ImageMap.cc
new file mode 100644
index 000000000..d352fcb2c
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageMap.cc
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "tools/rbd_mirror/Threads.h"
+
+#include "ImageMap.h"
+#include "image_map/LoadRequest.h"
+#include "image_map/SimplePolicy.h"
+#include "image_map/UpdateRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageMap: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using ::operator<<;
+using image_map::Policy;
+
+using librbd::util::unique_lock_name;
+using librbd::util::create_async_context_callback;
+
+// Completion context handed to the Listener for acquire/release/remove
+// notifications.  Holds an in-flight async op on the owning ImageMap for
+// its whole lifetime so shut_down() waits for outstanding acks.
+template <typename I>
+struct ImageMap<I>::C_NotifyInstance : public Context {
+  ImageMap* image_map;
+  std::string global_image_id;
+  bool acquire_release;  // true: acquire/release ack; false: remove ack
+
+  C_NotifyInstance(ImageMap* image_map, const std::string& global_image_id,
+                   bool acquire_release)
+    : image_map(image_map), global_image_id(global_image_id),
+      acquire_release(acquire_release) {
+    // registered as pending work until finish() runs
+    image_map->start_async_op();
+  }
+
+  void finish(int r) override {
+    if (acquire_release) {
+      image_map->handle_peer_ack(global_image_id, r);
+    } else {
+      image_map->handle_peer_ack_remove(global_image_id, r);
+    }
+    image_map->finish_async_op();
+  }
+};
+
+// Construct an ImageMap for the pool behind ioctx.  instance_id names the
+// local rbd-mirror daemon instance; listener receives mapping callbacks.
+template <typename I>
+ImageMap<I>::ImageMap(librados::IoCtx &ioctx, Threads<I> *threads,
+                      const std::string& instance_id,
+                      image_map::Listener &listener)
+  : m_ioctx(ioctx), m_threads(threads), m_instance_id(instance_id),
+    m_listener(listener),
+    m_lock(ceph::make_mutex(
+        unique_lock_name("rbd::mirror::ImageMap::m_lock", this))) {
+}
+
+// Destructor: shut_down() must have completed first, so no async ops or
+// scheduled timer/rebalance tasks can still be outstanding.
+template <typename I>
+ImageMap<I>::~ImageMap() {
+  ceph_assert(m_async_op_tracker.empty());
+  ceph_assert(m_timer_task == nullptr);
+  ceph_assert(m_rebalance_task == nullptr);
+}
+
+// Advance the policy state machine for each image after an action
+// finished with result r; re-queue any image the policy still has work
+// for, then kick the update timer.
+template <typename I>
+void ImageMap<I>::continue_action(const std::set<std::string> &global_image_ids,
+                                  int r) {
+  dout(20) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    if (m_shutting_down) {
+      return;
+    }
+
+    for (auto const &global_image_id : global_image_ids) {
+      // finish_action() returns true when a follow-up action is pending
+      bool schedule = m_policy->finish_action(global_image_id, r);
+      if (schedule) {
+        schedule_action(global_image_id);
+      }
+    }
+  }
+
+  schedule_update_task();
+}
+
+// Completion of an on-disk map update: fold the updated and removed ids
+// into one set and resume their policy actions with the request result.
+template <typename I>
+void ImageMap<I>::handle_update_request(
+    const Updates &updates,
+    const std::set<std::string> &remove_global_image_ids, int r) {
+  dout(20) << "r=" << r << dendl;
+
+  std::set<std::string> global_image_ids;
+
+  global_image_ids.insert(remove_global_image_ids.begin(),
+                          remove_global_image_ids.end());
+  for (auto const &update : updates) {
+    global_image_ids.insert(update.global_image_id);
+  }
+
+  continue_action(global_image_ids, r);
+}
+
+// Persist pending map updates/removals to disk via UpdateRequest.  The
+// completion re-enters handle_update_request() on the work queue and the
+// whole round-trip is tracked as one async op.
+template <typename I>
+void ImageMap<I>::update_image_mapping(Updates&& map_updates,
+                                       std::set<std::string>&& map_removals) {
+  if (map_updates.empty() && map_removals.empty()) {
+    return;
+  }
+
+  dout(5) << "updates=[" << map_updates << "], "
+          << "removes=[" << map_removals << "]" << dendl;
+
+  // capture copies for the completion; the moved-from originals feed the
+  // request below
+  Context *on_finish = new LambdaContext(
+    [this, map_updates, map_removals](int r) {
+      handle_update_request(map_updates, map_removals, r);
+      finish_async_op();
+    });
+  on_finish = create_async_context_callback(m_threads->work_queue, on_finish);
+
+  // empty meta policy for now..
+  image_map::PolicyMetaNone policy_meta;
+
+  bufferlist bl;
+  encode(image_map::PolicyData(policy_meta), bl);
+
+  // prepare update map
+  std::map<std::string, cls::rbd::MirrorImageMap> update_mapping;
+  for (auto const &update : map_updates) {
+    update_mapping.emplace(
+      update.global_image_id, cls::rbd::MirrorImageMap(update.instance_id,
+                                                       update.mapped_time, bl));
+  }
+
+  start_async_op();
+  image_map::UpdateRequest<I> *req = image_map::UpdateRequest<I>::create(
+    m_ioctx, std::move(update_mapping), std::move(map_removals), on_finish);
+  req->send();
+}
+
+// Timer-driven worker: drain m_global_image_ids, start the next policy
+// action for each image, and dispatch the gathered work (listener
+// notifications + on-disk map updates).  Runs with the timer lock held.
+template <typename I>
+void ImageMap<I>::process_updates() {
+  dout(20) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(m_timer_task == nullptr);
+
+  Updates map_updates;
+  std::set<std::string> map_removals;
+  Updates acquire_updates;
+  Updates release_updates;
+
+  // gather updates by advancing the state machine
+  m_lock.lock();
+  for (auto const &global_image_id : m_global_image_ids) {
+    image_map::ActionType action_type =
+      m_policy->start_action(global_image_id);
+    image_map::LookupInfo info = m_policy->lookup(global_image_id);
+
+    dout(15) << "global_image_id=" << global_image_id << ", "
+             << "action=" << action_type << ", "
+             << "instance=" << info.instance_id << dendl;
+    switch (action_type) {
+    case image_map::ACTION_TYPE_NONE:
+      continue;
+    case image_map::ACTION_TYPE_MAP_UPDATE:
+      ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+      map_updates.emplace_back(global_image_id, info.instance_id,
+                               info.mapped_time);
+      break;
+    case image_map::ACTION_TYPE_MAP_REMOVE:
+      map_removals.emplace(global_image_id);
+      break;
+    case image_map::ACTION_TYPE_ACQUIRE:
+      ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+      acquire_updates.emplace_back(global_image_id, info.instance_id);
+      break;
+    case image_map::ACTION_TYPE_RELEASE:
+      ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+      release_updates.emplace_back(global_image_id, info.instance_id);
+      break;
+    }
+  }
+  m_global_image_ids.clear();
+  m_lock.unlock();
+
+  // notify listener (acquire, release) and update on-disk map. note
+  // that its safe to process this outside m_lock as we still hold
+  // timer lock.
+  notify_listener_acquire_release_images(acquire_updates, release_updates);
+  update_image_mapping(std::move(map_updates), std::move(map_removals));
+}
+
+// Convenience wrapper: take the timer lock, then schedule the update task.
+template <typename I>
+void ImageMap<I>::schedule_update_task() {
+  std::lock_guard timer_lock{m_threads->timer_lock};
+  schedule_update_task(m_threads->timer_lock);
+}
+
+// Schedule process_updates() on the timer (caller holds the timer lock;
+// the parameter only documents that requirement).  Also (re)schedules the
+// rebalance task.  No-op if a task is already queued or there is nothing
+// to process.
+template <typename I>
+void ImageMap<I>::schedule_update_task(const ceph::mutex &timer_lock) {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+
+  schedule_rebalance_task();
+
+  if (m_timer_task != nullptr) {
+    return;
+  }
+
+  {
+    std::lock_guard locker{m_lock};
+    if (m_global_image_ids.empty()) {
+      return;
+    }
+  }
+
+  m_timer_task = new LambdaContext([this](int r) {
+      ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+      m_timer_task = nullptr;
+
+      process_updates();
+    });
+
+  // throttle interval is re-read each time so config changes take effect
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  double after = cct->_conf.get_val<double>("rbd_mirror_image_policy_update_throttle_interval");
+
+  dout(20) << "scheduling image check update (" << m_timer_task << ")"
+           << " after " << after << " second(s)" << dendl;
+  m_threads->timer->add_event_after(after, m_timer_task);
+}
+
+// Periodic rebalance: only when the map is quiescent (no in-flight ops,
+// no queued actions) ask the policy to redistribute images, queueing any
+// that need remapping.  Runs from the timer, so the timer lock is held.
+template <typename I>
+void ImageMap<I>::rebalance() {
+  ceph_assert(m_rebalance_task == nullptr);
+
+  {
+    std::lock_guard locker{m_lock};
+    if (m_async_op_tracker.empty() && m_global_image_ids.empty()){
+      dout(20) << "starting rebalance" << dendl;
+
+      std::set<std::string> remap_global_image_ids;
+      // empty instance list == "rebalance across current instances"
+      m_policy->add_instances({}, &remap_global_image_ids);
+
+      for (auto const &global_image_id : remap_global_image_ids) {
+        schedule_action(global_image_id);
+      }
+    }
+  }
+
+  schedule_update_task(m_threads->timer_lock);
+}
+
+// (Re)schedule the rebalance timer.  A timeout of 0 disables rebalancing;
+// an already-queued task is cancelled first so the timeout restarts from
+// now.  Caller holds the timer lock.
+template <typename I>
+void ImageMap<I>::schedule_rebalance_task() {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+
+  // fetch the updated value of idle timeout for (re)scheduling
+  double resched_after = cct->_conf.get_val<double>(
+    "rbd_mirror_image_policy_rebalance_timeout");
+  if (!resched_after) {
+    return;
+  }
+
+  // cancel existing rebalance task if any before scheduling
+  if (m_rebalance_task != nullptr) {
+    m_threads->timer->cancel_event(m_rebalance_task);
+  }
+
+  m_rebalance_task = new LambdaContext([this](int _) {
+      ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+      m_rebalance_task = nullptr;
+
+      rebalance();
+    });
+
+  dout(20) << "scheduling rebalance (" << m_rebalance_task << ")"
+           << " after " << resched_after << " second(s)" << dendl;
+  m_threads->timer->add_event_after(resched_after, m_rebalance_task);
+}
+
+// Queue an image for processing on the next update tick.  Caller holds
+// m_lock; duplicates are absorbed by the set.
+template <typename I>
+void ImageMap<I>::schedule_action(const std::string &global_image_id) {
+  dout(20) << "global_image_id=" << global_image_id << dendl;
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  m_global_image_ids.emplace(global_image_id);
+}
+
+// Tell the listener which images this daemon should acquire or release.
+// Both kinds of ack route back through handle_peer_ack(), hence
+// acquire_release=true for both C_NotifyInstance callbacks.
+template <typename I>
+void ImageMap<I>::notify_listener_acquire_release_images(
+    const Updates &acquire, const Updates &release) {
+  if (acquire.empty() && release.empty()) {
+    return;
+  }
+
+  dout(5) << "acquire=[" << acquire << "], "
+          << "release=[" << release << "]" << dendl;
+
+  for (auto const &update : acquire) {
+    m_listener.acquire_image(
+      update.global_image_id, update.instance_id,
+      create_async_context_callback(
+        m_threads->work_queue,
+        new C_NotifyInstance(this, update.global_image_id, true)));
+  }
+
+  for (auto const &update : release) {
+    m_listener.release_image(
+      update.global_image_id, update.instance_id,
+      create_async_context_callback(
+        m_threads->work_queue,
+        new C_NotifyInstance(this, update.global_image_id, true)));
+  }
+}
+
+// Tell the listener that peer images were deleted; acks route back via
+// handle_peer_ack_remove() (acquire_release=false).
+template <typename I>
+void ImageMap<I>::notify_listener_remove_images(const std::string &peer_uuid,
+                                                const Updates &remove) {
+  dout(5) << "peer_uuid=" << peer_uuid << ", "
+          << "remove=[" << remove << "]" << dendl;
+
+  for (auto const &update : remove) {
+    m_listener.remove_image(
+      peer_uuid, update.global_image_id, update.instance_id,
+      create_async_context_callback(
+        m_threads->work_queue,
+        new C_NotifyInstance(this, update.global_image_id, false)));
+  }
+}
+
+// Seed the policy with the on-disk mapping loaded at init time and queue
+// every known image so its state is (re)evaluated.
+template <typename I>
+void ImageMap<I>::handle_load(const std::map<std::string,
+                              cls::rbd::MirrorImageMap> &image_mapping) {
+  dout(20) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    m_policy->init(image_mapping);
+
+    for (auto& pair : image_mapping) {
+      schedule_action(pair.first);
+    }
+  }
+  schedule_update_task();
+}
+
+// Listener finished removing an image: drop its peer-tracking entry.
+// A failure is logged but the entry is erased regardless.
+template <typename I>
+void ImageMap<I>::handle_peer_ack_remove(const std::string &global_image_id,
+                                         int r) {
+  std::lock_guard locker{m_lock};
+  dout(5) << "global_image_id=" << global_image_id << dendl;
+
+  if (r < 0) {
+    derr << "failed to remove global_image_id=" << global_image_id << dendl;
+  }
+
+  auto peer_it = m_peer_map.find(global_image_id);
+  if (peer_it == m_peer_map.end()) {
+    return;
+  }
+
+  m_peer_map.erase(peer_it);
+}
+
+// Record newly discovered images for a peer.  Only the first peer to
+// report an image triggers a policy add; later peers just join the
+// tracking set.  Caller holds m_lock.
+template <typename I>
+void ImageMap<I>::update_images_added(
+    const std::string &peer_uuid,
+    const std::set<std::string> &global_image_ids) {
+  dout(5) << "peer_uuid=" << peer_uuid << ", "
+          << "global_image_ids=[" << global_image_ids << "]" << dendl;
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  for (auto const &global_image_id : global_image_ids) {
+    auto result = m_peer_map[global_image_id].insert(peer_uuid);
+    // size()==1 after a successful insert => this was the first peer
+    if (result.second && m_peer_map[global_image_id].size() == 1) {
+      if (m_policy->add_image(global_image_id)) {
+        schedule_action(global_image_id);
+      }
+    }
+  }
+}
+
+// Handle images disappearing from a peer.  If the last tracked peer for
+// a mapped image goes away the image is removed from the policy; a
+// non-local peer removal of a mapped image additionally notifies the
+// listener so the replica can be cleaned up.  Caller holds m_lock.
+template <typename I>
+void ImageMap<I>::update_images_removed(
+    const std::string &peer_uuid,
+    const std::set<std::string> &global_image_ids) {
+  dout(5) << "peer_uuid=" << peer_uuid << ", "
+          << "global_image_ids=[" << global_image_ids << "]" << dendl;
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Updates to_remove;
+  for (auto const &global_image_id : global_image_ids) {
+    image_map::LookupInfo info = m_policy->lookup(global_image_id);
+    bool image_mapped = (info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+
+    bool image_removed = image_mapped;
+    bool peer_removed = false;
+    auto peer_it = m_peer_map.find(global_image_id);
+    if (peer_it != m_peer_map.end()) {
+      auto& peer_set = peer_it->second;
+      peer_removed = peer_set.erase(peer_uuid);
+      image_removed = peer_removed && peer_set.empty();
+    }
+
+    // "" denotes the local peer -- only remote deletions are forwarded
+    if (image_mapped && peer_removed && !peer_uuid.empty()) {
+      // peer image has been deleted
+      to_remove.emplace_back(global_image_id, info.instance_id);
+    }
+
+    if (image_removed) {
+      // local and peer images have been deleted
+      if (m_policy->remove_image(global_image_id)) {
+        schedule_action(global_image_id);
+      }
+    }
+  }
+
+  if (!to_remove.empty()) {
+    // removal notification will be notified instantly. this is safe
+    // even after scheduling action for images as we still hold m_lock
+    notify_listener_remove_images(peer_uuid, to_remove);
+  }
+}
+
+// New daemon instances came online: filter per policy mode, hand them to
+// the policy, and queue any images that should be remapped onto them.
+template <typename I>
+void ImageMap<I>::update_instances_added(
+    const std::vector<std::string> &instance_ids) {
+  {
+    std::lock_guard locker{m_lock};
+    if (m_shutting_down) {
+      return;
+    }
+
+    std::vector<std::string> filtered_instance_ids;
+    filter_instance_ids(instance_ids, &filtered_instance_ids, false);
+    if (filtered_instance_ids.empty()) {
+      return;
+    }
+
+    dout(20) << "instance_ids=" << filtered_instance_ids << dendl;
+
+    std::set<std::string> remap_global_image_ids;
+    m_policy->add_instances(filtered_instance_ids, &remap_global_image_ids);
+
+    for (auto const &global_image_id : remap_global_image_ids) {
+      schedule_action(global_image_id);
+    }
+  }
+
+  schedule_update_task();
+}
+
+// Daemon instances went away: mirror image of update_instances_added(),
+// with removal filtering semantics.
+template <typename I>
+void ImageMap<I>::update_instances_removed(
+    const std::vector<std::string> &instance_ids) {
+  {
+    std::lock_guard locker{m_lock};
+    if (m_shutting_down) {
+      return;
+    }
+
+    std::vector<std::string> filtered_instance_ids;
+    filter_instance_ids(instance_ids, &filtered_instance_ids, true);
+    if (filtered_instance_ids.empty()) {
+      return;
+    }
+
+    dout(20) << "instance_ids=" << filtered_instance_ids << dendl;
+
+    std::set<std::string> remap_global_image_ids;
+    m_policy->remove_instances(filtered_instance_ids, &remap_global_image_ids);
+
+    for (auto const &global_image_id : remap_global_image_ids) {
+      schedule_action(global_image_id);
+    }
+  }
+
+  schedule_update_task();
+}
+
+// Batch add/remove of images reported for one peer.  Removals are
+// processed before additions so an image present in both sets ends up
+// tracked.
+template <typename I>
+void ImageMap<I>::update_images(const std::string &peer_uuid,
+                                std::set<std::string> &&added_global_image_ids,
+                                std::set<std::string> &&removed_global_image_ids) {
+  dout(5) << "peer_uuid=" << peer_uuid << ", " << "added_count="
+          << added_global_image_ids.size() << ", " << "removed_count="
+          << removed_global_image_ids.size() << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    if (m_shutting_down) {
+      return;
+    }
+
+    if (!removed_global_image_ids.empty()) {
+      update_images_removed(peer_uuid, removed_global_image_ids);
+    }
+    if (!added_global_image_ids.empty()) {
+      update_images_added(peer_uuid, added_global_image_ids);
+    }
+  }
+
+  schedule_update_task();
+}
+
+// An acquire/release notification was acknowledged -- continue the
+// image's policy action with the ack result.
+template <typename I>
+void ImageMap<I>::handle_peer_ack(const std::string &global_image_id, int r) {
+  dout (20) << "global_image_id=" << global_image_id << ", r=" << r
+            << dendl;
+
+  continue_action({global_image_id}, r);
+}
+
+// Initialize: pick the mapping policy ("none" and "simple" both use
+// SimplePolicy; any other value is a config error and aborts), then load
+// the persisted image map from disk.  on_finish completes with the load
+// result.
+template <typename I>
+void ImageMap<I>::init(Context *on_finish) {
+  dout(20) << dendl;
+
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  std::string policy_type = cct->_conf.get_val<string>("rbd_mirror_image_policy_type");
+
+  if (policy_type == "none" || policy_type == "simple") {
+    m_policy.reset(image_map::SimplePolicy::create(m_ioctx));
+  } else {
+    ceph_abort(); // not really needed as such, but catch it.
+  }
+
+  dout(20) << "mapping policy=" << policy_type << dendl;
+
+  start_async_op();
+  C_LoadMap *ctx = new C_LoadMap(this, on_finish);
+  image_map::LoadRequest<I> *req = image_map::LoadRequest<I>::create(
+    m_ioctx, &ctx->image_mapping, ctx);
+  req->send();
+}
+
+// Shut down: mark shutting-down (blocking new work), drop the policy,
+// cancel pending timer tasks, then complete on_finish once all in-flight
+// async ops have drained.  Lock order: timer lock before m_lock.
+template <typename I>
+void ImageMap<I>::shut_down(Context *on_finish) {
+  dout(20) << dendl;
+
+  {
+    std::lock_guard timer_lock{m_threads->timer_lock};
+
+    {
+      std::lock_guard locker{m_lock};
+      ceph_assert(!m_shutting_down);
+
+      m_shutting_down = true;
+      m_policy.reset();
+    }
+
+    if (m_timer_task != nullptr) {
+      m_threads->timer->cancel_event(m_timer_task);
+      m_timer_task = nullptr;
+    }
+    if (m_rebalance_task != nullptr) {
+      m_threads->timer->cancel_event(m_rebalance_task);
+      m_rebalance_task = nullptr;
+    }
+  }
+
+  wait_for_async_ops(on_finish);
+}
+
+// Restrict instance add/remove events according to the policy mode.
+// For any real policy all instances pass through; for "none" (no
+// distribution) only the local instance may be added, while removals of
+// other (external) instances are still propagated.
+template <typename I>
+void ImageMap<I>::filter_instance_ids(
+    const std::vector<std::string> &instance_ids,
+    std::vector<std::string> *filtered_instance_ids, bool removal) const {
+  CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+  std::string policy_type = cct->_conf.get_val<string>("rbd_mirror_image_policy_type");
+
+  if (policy_type != "none") {
+    *filtered_instance_ids = instance_ids;
+    return;
+  }
+
+  if (removal) {
+    // propagate removals for external instances
+    for (auto& instance_id : instance_ids) {
+      if (instance_id != m_instance_id) {
+        filtered_instance_ids->push_back(instance_id);
+      }
+    }
+  } else if (std::find(instance_ids.begin(), instance_ids.end(),
+                       m_instance_id) != instance_ids.end()) {
+    // propagate addition only for local instance
+    filtered_instance_ids->push_back(m_instance_id);
+  }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageMap<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageMap.h b/src/tools/rbd_mirror/ImageMap.h
new file mode 100644
index 000000000..9dd61ee0d
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageMap.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_H
+
+#include <vector>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+
+#include "image_map/Policy.h"
+#include "image_map/Types.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+// Tracks the assignment of mirrored images (by global image id) to
+// rbd-mirror daemon instances, persisting the mapping via the cls_rbd
+// MirrorImageMap records.  Mapping decisions are delegated to an
+// image_map::Policy; the Listener is told when this daemon must acquire,
+// release, or remove an image.
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageMap {
+public:
+  static ImageMap *create(librados::IoCtx &ioctx, Threads<ImageCtxT> *threads,
+                          const std::string& instance_id,
+                          image_map::Listener &listener) {
+    return new ImageMap(ioctx, threads, instance_id, listener);
+  }
+
+  ~ImageMap();
+
+  // init (load) the instance map from disk
+  void init(Context *on_finish);
+
+  // shut down map operations
+  void shut_down(Context *on_finish);
+
+  // update (add/remove) images
+  void update_images(const std::string &peer_uuid,
+                     std::set<std::string> &&added_global_image_ids,
+                     std::set<std::string> &&removed_global_image_ids);
+
+  // add/remove instances
+  void update_instances_added(const std::vector<std::string> &instances);
+  void update_instances_removed(const std::vector<std::string> &instances);
+
+private:
+  struct C_NotifyInstance;
+
+  ImageMap(librados::IoCtx &ioctx, Threads<ImageCtxT> *threads,
+           const std::string& instance_id, image_map::Listener &listener);
+
+  // One pending mapping change: image -> instance, plus the time the
+  // mapping was made.
+  struct Update {
+    std::string global_image_id;
+    std::string instance_id;
+    utime_t mapped_time;
+
+    Update(const std::string &global_image_id, const std::string &instance_id,
+           utime_t mapped_time)
+      : global_image_id(global_image_id),
+        instance_id(instance_id),
+        mapped_time(mapped_time) {
+    }
+    Update(const std::string &global_image_id, const std::string &instance_id)
+      : Update(global_image_id, instance_id, ceph_clock_now()) {
+    }
+
+    friend std::ostream& operator<<(std::ostream& os,
+                                    const Update& update) {
+      os << "{global_image_id=" << update.global_image_id << ", "
+         << "instance_id=" << update.instance_id << "}";
+      return os;
+    }
+
+  };
+  typedef std::list<Update> Updates;
+
+  // Lock ordering: m_threads->timer_lock, m_lock
+
+  librados::IoCtx &m_ioctx;
+  Threads<ImageCtxT> *m_threads;
+  std::string m_instance_id;       // local daemon instance id
+  image_map::Listener &m_listener;
+
+  std::unique_ptr<image_map::Policy> m_policy; // our mapping policy
+
+  Context *m_timer_task = nullptr; // pending process_updates() tick
+  ceph::mutex m_lock;
+  bool m_shutting_down = false;
+  AsyncOpTracker m_async_op_tracker;
+
+  // global_image_id -> registered peers ("" == local, remote otherwise)
+  std::map<std::string, std::set<std::string> > m_peer_map;
+
+  // images queued for processing on the next update tick
+  std::set<std::string> m_global_image_ids;
+
+  Context *m_rebalance_task = nullptr;
+
+  // Completion for the initial on-disk map load; forwards the loaded
+  // mapping to handle_load() on success.
+  struct C_LoadMap : Context {
+    ImageMap *image_map;
+    Context *on_finish;
+
+    std::map<std::string, cls::rbd::MirrorImageMap> image_mapping;
+
+    C_LoadMap(ImageMap *image_map, Context *on_finish)
+      : image_map(image_map),
+        on_finish(on_finish) {
+    }
+
+    void finish(int r) override {
+      if (r == 0) {
+        image_map->handle_load(image_mapping);
+      }
+
+      image_map->finish_async_op();
+      on_finish->complete(r);
+    }
+  };
+
+  // async op-tracker helper routines
+  void start_async_op() {
+    m_async_op_tracker.start_op();
+  }
+  void finish_async_op() {
+    m_async_op_tracker.finish_op();
+  }
+  void wait_for_async_ops(Context *on_finish) {
+    m_async_op_tracker.wait_for_ops(on_finish);
+  }
+
+  void handle_peer_ack(const std::string &global_image_id, int r);
+  void handle_peer_ack_remove(const std::string &global_image_id, int r);
+
+  void handle_load(const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping);
+  void handle_update_request(const Updates &updates,
+                             const std::set<std::string> &remove_global_image_ids, int r);
+
+  // continue (retry or resume depending on state machine) processing
+  // current action.
+  void continue_action(const std::set<std::string> &global_image_ids, int r);
+
+  // schedule an image for update
+  void schedule_action(const std::string &global_image_id);
+
+  void schedule_update_task();
+  void schedule_update_task(const ceph::mutex &timer_lock);
+  void process_updates();
+  void update_image_mapping(Updates&& map_updates,
+                            std::set<std::string>&& map_removals);
+
+  void rebalance();
+  void schedule_rebalance_task();
+
+  void notify_listener_acquire_release_images(const Updates &acquire, const Updates &release);
+  void notify_listener_remove_images(const std::string &peer_uuid, const Updates &remove);
+
+  void update_images_added(const std::string &peer_uuid,
+                           const std::set<std::string> &global_image_ids);
+  void update_images_removed(const std::string &peer_uuid,
+                             const std::set<std::string> &global_image_ids);
+
+  void filter_instance_ids(const std::vector<std::string> &instance_ids,
+                           std::vector<std::string> *filtered_instance_ids,
+                           bool removal) const;
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_H
diff --git a/src/tools/rbd_mirror/ImageReplayer.cc b/src/tools/rbd_mirror/ImageReplayer.cc
new file mode 100644
index 000000000..ee22b8d34
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageReplayer.cc
@@ -0,0 +1,1190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/Timer.h"
+#include "global/global_context.h"
+#include "journal/Journaler.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "ImageDeleter.h"
+#include "ImageReplayer.h"
+#include "MirrorStatusUpdater.h"
+#include "Threads.h"
+#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
+#include "tools/rbd_mirror/image_replayer/ReplayerListener.h"
+#include "tools/rbd_mirror/image_replayer/StateBuilder.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "tools/rbd_mirror/image_replayer/journal/Replayer.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+#include <map>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::" << *this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+std::ostream &operator<<(std::ostream &os,
+ const typename ImageReplayer<I>::State &state);
+
+namespace {
+
+// Base class for per-replayer admin socket commands; stores the command
+// description and target replayer, and tracks registration state.
+template <typename I>
+class ImageReplayerAdminSocketCommand {
+public:
+  ImageReplayerAdminSocketCommand(const std::string &desc,
+                                  ImageReplayer<I> *replayer)
+    : desc(desc), replayer(replayer) {
+  }
+  virtual ~ImageReplayerAdminSocketCommand() {}
+  virtual int call(Formatter *f) = 0;
+
+  std::string desc;
+  ImageReplayer<I> *replayer;
+  bool registered = false;
+};
+
+// "rbd mirror status": dump the replayer status to the formatter.
+template <typename I>
+class StatusCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+  explicit StatusCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->replayer->print_status(f);
+    return 0;
+  }
+};
+
+// "rbd mirror start": start replay; manual=true overrides a manual stop.
+template <typename I>
+class StartCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+  explicit StartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->replayer->start(nullptr, true);
+    return 0;
+  }
+};
+
+// "rbd mirror stop": stop replay as a manual (sticky) stop.
+template <typename I>
+class StopCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+  explicit StopCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->replayer->stop(nullptr, true);
+    return 0;
+  }
+};
+
+// "rbd mirror restart": stop then start the replayer.
+template <typename I>
+class RestartCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+  explicit RestartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->replayer->restart();
+    return 0;
+  }
+};
+
+// "rbd mirror flush": flush pending replay to the local image.
+template <typename I>
+class FlushCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+  explicit FlushCommand(const std::string &desc, ImageReplayer<I> *replayer)
+    : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->replayer->flush();
+    return 0;
+  }
+};
+
+// Registers the per-image admin socket commands (flush/restart/start/
+// status/stop, suffixed with the image name) and dispatches incoming
+// calls to the matching command object.
+template <typename I>
+class ImageReplayerAdminSocketHook : public AdminSocketHook {
+public:
+  ImageReplayerAdminSocketHook(CephContext *cct, const std::string &name,
+                               ImageReplayer<I> *replayer)
+    : admin_socket(cct->get_admin_socket()),
+      commands{{"rbd mirror flush " + name,
+                new FlushCommand<I>("flush rbd mirror " + name, replayer)},
+               {"rbd mirror restart " + name,
+                new RestartCommand<I>("restart rbd mirror " + name, replayer)},
+               {"rbd mirror start " + name,
+                new StartCommand<I>("start rbd mirror " + name, replayer)},
+               {"rbd mirror status " + name,
+                new StatusCommand<I>("get status for rbd mirror " + name, replayer)},
+               {"rbd mirror stop " + name,
+                new StopCommand<I>("stop rbd mirror " + name, replayer)}} {
+  }
+
+  // Register all commands; stops at the first failure (the destructor
+  // unregisters whatever was registered).
+  int register_commands() {
+    for (auto &it : commands) {
+      int r = admin_socket->register_command(it.first, this,
+                                             it.second->desc);
+      if (r < 0) {
+        return r;
+      }
+      it.second->registered = true;
+    }
+    return 0;
+  }
+
+  ~ImageReplayerAdminSocketHook() override {
+    admin_socket->unregister_commands(this);
+    for (auto &it : commands) {
+      delete it.second;
+    }
+    commands.clear();
+  }
+
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+           Formatter *f,
+           std::ostream& errss,
+           bufferlist& out) override {
+    auto i = commands.find(command);
+    // only registered commands can reach this hook
+    ceph_assert(i != commands.end());
+    return i->second->call(f);
+  }
+
+private:
+  // std::less<> enables heterogeneous lookup with string_view keys
+  typedef std::map<std::string, ImageReplayerAdminSocketCommand<I>*,
+                   std::less<>> Commands;
+
+  AdminSocket *admin_socket;
+  Commands commands;
+};
+
+} // anonymous namespace
+
+// Reflect bootstrap progress into the replayer's state description and
+// optionally push an immediate mirror-status update.
+template <typename I>
+void ImageReplayer<I>::BootstrapProgressContext::update_progress(
+  const std::string &description, bool flush)
+{
+  const std::string desc = "bootstrapping, " + description;
+  replayer->set_state_description(0, desc);
+  if (flush) {
+    replayer->update_mirror_image_status(false, boost::none);
+  }
+}
+
+// Adapter forwarding replayer notifications back into the owning
+// ImageReplayer.
+template <typename I>
+struct ImageReplayer<I>::ReplayerListener
+  : public image_replayer::ReplayerListener {
+  ImageReplayer<I>* image_replayer;
+
+  ReplayerListener(ImageReplayer<I>* image_replayer)
+    : image_replayer(image_replayer) {
+  }
+
+  void handle_notification() override {
+    image_replayer->handle_replayer_notification();
+  }
+};
+
+// Construct a replayer for one mirrored image (identified by its global
+// image id) and register its admin socket commands under a provisional
+// name.
+template <typename I>
+ImageReplayer<I>::ImageReplayer(
+    librados::IoCtx &local_io_ctx, const std::string &local_mirror_uuid,
+    const std::string &global_image_id, Threads<I> *threads,
+    InstanceWatcher<I> *instance_watcher,
+    MirrorStatusUpdater<I>* local_status_updater,
+    journal::CacheManagerHandler *cache_manager_handler,
+    PoolMetaCache* pool_meta_cache) :
+  m_local_io_ctx(local_io_ctx), m_local_mirror_uuid(local_mirror_uuid),
+  m_global_image_id(global_image_id), m_threads(threads),
+  m_instance_watcher(instance_watcher),
+  m_local_status_updater(local_status_updater),
+  m_cache_manager_handler(cache_manager_handler),
+  m_pool_meta_cache(pool_meta_cache),
+  m_local_image_name(global_image_id),
+  m_lock(ceph::make_mutex("rbd::mirror::ImageReplayer " +
+      stringify(local_io_ctx.get_id()) + " " + global_image_id)),
+  m_progress_cxt(this),
+  m_replayer_listener(new ReplayerListener(this))
+{
+  // Register asok commands using a temporary "remote_pool_name/global_image_id"
+  // name. When the image name becomes known on start the asok commands will be
+  // re-registered using "remote_pool_name/remote_image_name" name.
+
+  m_image_spec = image_replayer::util::compute_image_spec(
+    local_io_ctx, global_image_id);
+  register_admin_socket_hook();
+}
+
+// Destructor: the replayer must be fully stopped -- no state builder,
+// pending start/stop callbacks, bootstrap request, or status task.
+template <typename I>
+ImageReplayer<I>::~ImageReplayer()
+{
+  unregister_admin_socket_hook();
+  ceph_assert(m_state_builder == nullptr);
+  ceph_assert(m_on_start_finish == nullptr);
+  ceph_assert(m_on_stop_contexts.empty());
+  ceph_assert(m_bootstrap_request == nullptr);
+  ceph_assert(m_update_status_task == nullptr);
+  delete m_replayer_listener;
+}
+
+// Map the last reported mirror-image status to a health state: no status
+// yet => OK, syncing/unknown => WARNING, anything else => ERROR.
+template <typename I>
+image_replayer::HealthState ImageReplayer<I>::get_health_state() const {
+  std::lock_guard locker{m_lock};
+
+  if (!m_mirror_image_status_state) {
+    return image_replayer::HEALTH_STATE_OK;
+  } else if (*m_mirror_image_status_state ==
+               cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING ||
+             *m_mirror_image_status_state ==
+               cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN) {
+    return image_replayer::HEALTH_STATE_WARNING;
+  }
+  return image_replayer::HEALTH_STATE_ERROR;
+}
+
+// Register a remote peer; duplicates are ignored.
+template <typename I>
+void ImageReplayer<I>::add_peer(const Peer<I>& peer) {
+  dout(10) << "peer=" << peer << dendl;
+
+  std::lock_guard locker{m_lock};
+  auto it = m_peers.find(peer);
+  if (it == m_peers.end()) {
+    m_peers.insert(peer);
+  }
+}
+
+// Record the latest result code and human-readable state description
+// (reported via status updates).
+template <typename I>
+void ImageReplayer<I>::set_state_description(int r, const std::string &desc) {
+  dout(10) << "r=" << r << ", desc=" << desc << dendl;
+
+  std::lock_guard l{m_lock};
+  m_last_r = r;
+  m_state_desc = desc;
+}
+
+// Begin replay.  Fails with -EINVAL if already running, -EPERM if the
+// image was stopped manually and manual=false, -ECANCELED if restart was
+// requested but later canceled; otherwise transitions to STARTING and
+// kicks off bootstrap.  on_finish (optional) completes with the result.
+template <typename I>
+void ImageReplayer<I>::start(Context *on_finish, bool manual, bool restart)
+{
+  dout(10) << "on_finish=" << on_finish << dendl;
+
+  int r = 0;
+  {
+    std::lock_guard locker{m_lock};
+    if (!is_stopped_()) {
+      derr << "already running" << dendl;
+      r = -EINVAL;
+    } else if (m_manual_stop && !manual) {
+      dout(5) << "stopped manually, ignoring start without manual flag"
+              << dendl;
+      r = -EPERM;
+    } else if (restart && !m_restart_requested) {
+      dout(10) << "canceled restart" << dendl;
+      r = -ECANCELED;
+    } else {
+      // reset per-run state for the new replay attempt
+      m_state = STATE_STARTING;
+      m_last_r = 0;
+      m_state_desc.clear();
+      m_manual_stop = false;
+      m_delete_requested = false;
+      m_restart_requested = false;
+      m_status_removed = false;
+
+      if (on_finish != nullptr) {
+        ceph_assert(m_on_start_finish == nullptr);
+        m_on_start_finish = on_finish;
+      }
+      ceph_assert(m_on_stop_contexts.empty());
+    }
+  }
+
+  if (r < 0) {
+    if (on_finish) {
+      on_finish->complete(r);
+    }
+    return;
+  }
+
+  bootstrap();
+}
+
+// Launch the bootstrap request against the (single, for now) remote
+// peer.  Fails fast when no peers are configured or the start was
+// interrupted; completion lands in handle_bootstrap().
+template <typename I>
+void ImageReplayer<I>::bootstrap() {
+  dout(10) << dendl;
+
+  std::unique_lock locker{m_lock};
+  if (m_peers.empty()) {
+    locker.unlock();
+
+    dout(5) << "no peer clusters" << dendl;
+    on_start_fail(-ENOENT, "no peer clusters");
+    return;
+  }
+
+  // TODO need to support multiple remote images
+  ceph_assert(!m_peers.empty());
+  m_remote_image_peer = *m_peers.begin();
+
+  // NOTE(review): on_start_interrupted(m_lock) appears to consume the
+  // held lock on the interrupted path -- confirm against its definition
+  if (on_start_interrupted(m_lock)) {
+    return;
+  }
+
+  ceph_assert(m_state_builder == nullptr);
+  auto ctx = create_context_callback<
+      ImageReplayer, &ImageReplayer<I>::handle_bootstrap>(this);
+  auto request = image_replayer::BootstrapRequest<I>::create(
+      m_threads, m_local_io_ctx, m_remote_image_peer.io_ctx, m_instance_watcher,
+      m_global_image_id, m_local_mirror_uuid,
+      m_remote_image_peer.remote_pool_meta, m_cache_manager_handler,
+      m_pool_meta_cache, &m_progress_cxt, &m_state_builder, &m_resync_requested,
+      ctx);
+
+  // hold a ref so stop() can cancel the in-flight request
+  request->get();
+  m_bootstrap_request = request;
+  locker.unlock();
+
+  update_mirror_image_status(false, boost::none);
+  request->send();
+}
+
+// Completion handler for the bootstrap request.  Several negative codes are
+// expected conditions (primary-ness, split-brain, deletion, resync) and are
+// translated into descriptive start failures rather than hard errors.
+template <typename I>
+void ImageReplayer<I>::handle_bootstrap(int r) {
+  dout(10) << "r=" << r << dendl;
+  {
+    std::lock_guard locker{m_lock};
+    // release the ref taken in bootstrap()
+    m_bootstrap_request->put();
+    m_bootstrap_request = nullptr;
+  }
+
+  if (on_start_interrupted()) {
+    return;
+  } else if (r == -ENOMSG) {
+    dout(5) << "local image is primary" << dendl;
+    on_start_fail(0, "local image is primary");
+    return;
+  } else if (r == -EREMOTEIO) {
+    dout(5) << "remote image is not primary" << dendl;
+    on_start_fail(-EREMOTEIO, "remote image is not primary");
+    return;
+  } else if (r == -EEXIST) {
+    on_start_fail(r, "split-brain detected");
+    return;
+  } else if (r == -ENOLINK) {
+    // remote side is gone -- schedule local deletion during shut down
+    m_delete_requested = true;
+    on_start_fail(0, "remote image no longer exists");
+    return;
+  } else if (r == -ERESTART) {
+    on_start_fail(r, "image in transient state, try again");
+    return;
+  } else if (r < 0) {
+    on_start_fail(r, "error bootstrapping replay");
+    return;
+  } else if (m_resync_requested) {
+    on_start_fail(0, "resync requested");
+    return;
+  }
+
+  start_replay();
+}
+
+// Create the replayer produced by the state builder and initialize it.
+// NOTE(review): m_lock is still held when init() is invoked; the completion
+// fires asynchronously via handle_start_replay().
+template <typename I>
+void ImageReplayer<I>::start_replay() {
+  dout(10) << dendl;
+
+  std::unique_lock locker{m_lock};
+  ceph_assert(m_replayer == nullptr);
+  m_replayer = m_state_builder->create_replayer(m_threads, m_instance_watcher,
+                                                m_local_mirror_uuid,
+                                                m_pool_meta_cache,
+                                                m_replayer_listener);
+
+  auto ctx = create_context_callback<
+    ImageReplayer<I>, &ImageReplayer<I>::handle_start_replay>(this);
+  m_replayer->init(ctx);
+}
+
+// Completion handler for replayer init: on failure tear the replayer down
+// (shut down is not required when init fails); on success move to
+// STATE_REPLAYING, arm the periodic status timer and complete the start
+// callback.
+template <typename I>
+void ImageReplayer<I>::handle_start_replay(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (on_start_interrupted()) {
+    return;
+  } else if (r < 0) {
+    std::string error_description = m_replayer->get_error_description();
+    if (r == -ENOTCONN && m_replayer->is_resync_requested()) {
+      std::unique_lock locker{m_lock};
+      m_resync_requested = true;
+    }
+
+    // shut down not required if init failed
+    m_replayer->destroy();
+    m_replayer = nullptr;
+
+    derr << "error starting replay: " << cpp_strerror(r) << dendl;
+    on_start_fail(r, error_description);
+    return;
+  }
+
+  Context *on_finish = nullptr;
+  {
+    std::unique_lock locker{m_lock};
+    ceph_assert(m_state == STATE_STARTING);
+    m_state = STATE_REPLAYING;
+    // claim the start callback so it is completed exactly once
+    std::swap(m_on_start_finish, on_finish);
+
+    std::unique_lock timer_locker{m_threads->timer_lock};
+    schedule_update_mirror_image_replay_status();
+  }
+
+  update_mirror_image_status(true, boost::none);
+  if (on_replay_interrupted()) {
+    if (on_finish != nullptr) {
+      on_finish->complete(r);
+    }
+    return;
+  }
+
+  dout(10) << "start succeeded" << dendl;
+  if (on_finish != nullptr) {
+    dout(10) << "on finish complete, r=" << r << dendl;
+    on_finish->complete(r);
+  }
+}
+
+// Abort a start in progress.  Queued onto the work queue so the failure path
+// runs outside the caller's locks; transitions STARTING -> STOPPING and then
+// performs the regular shut-down sequence.
+template <typename I>
+void ImageReplayer<I>::on_start_fail(int r, const std::string &desc)
+{
+  dout(10) << "r=" << r << ", desc=" << desc << dendl;
+  Context *ctx = new LambdaContext([this, r, desc](int _r) {
+      {
+        std::lock_guard locker{m_lock};
+        ceph_assert(m_state == STATE_STARTING);
+        m_state = STATE_STOPPING;
+        // these codes represent expected cancellations, not hard failures
+        if (r < 0 && r != -ECANCELED && r != -EREMOTEIO && r != -ENOENT) {
+          derr << "start failed: " << cpp_strerror(r) << dendl;
+        } else {
+          dout(10) << "start canceled" << dendl;
+        }
+      }
+
+      set_state_description(r, desc);
+      update_mirror_image_status(false, boost::none);
+      shut_down(r);
+    });
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Convenience wrapper: acquire m_lock and delegate to the locked variant.
+template <typename I>
+bool ImageReplayer<I>::on_start_interrupted() {
+  std::lock_guard locker{m_lock};
+  return on_start_interrupted(m_lock);
+}
+
+// Check (with the lock already held) whether a stop was requested while the
+// replayer was still STARTING; if so, convert it into a canceled start.
+// Returns true when the start has been aborted.
+template <typename I>
+bool ImageReplayer<I>::on_start_interrupted(ceph::mutex& lock) {
+  // Assert on the caller-supplied lock rather than ignoring the parameter:
+  // the argument documents which mutex must be held, so enforce exactly
+  // that one (all current callers pass m_lock, preserving behavior).
+  ceph_assert(ceph_mutex_is_locked(lock));
+  ceph_assert(m_state == STATE_STARTING);
+  if (!m_stop_requested) {
+    return false;
+  }
+
+  on_start_fail(-ECANCELED, "");
+  return true;
+}
+
+// Request that replication stop.  When already stopping, the callback joins
+// the in-flight stop; when already stopped, the callback is completed with
+// -EINVAL.  A STARTING replayer has its bootstrap canceled; a REPLAYING one
+// has its journal replay interrupted.
+template <typename I>
+void ImageReplayer<I>::stop(Context *on_finish, bool manual, bool restart)
+{
+  dout(10) << "on_finish=" << on_finish << ", manual=" << manual
+           << ", restart=" << restart << dendl;
+
+  image_replayer::BootstrapRequest<I> *bootstrap_request = nullptr;
+  bool shut_down_replay = false;
+  bool is_stopped = false;
+  {
+    std::lock_guard locker{m_lock};
+
+    if (!is_running_()) {
+      // not running: just record flags and/or join the existing stop
+      if (manual && !m_manual_stop) {
+        dout(10) << "marking manual" << dendl;
+        m_manual_stop = true;
+      }
+      if (!restart && m_restart_requested) {
+        dout(10) << "canceling restart" << dendl;
+        m_restart_requested = false;
+      }
+      if (is_stopped_()) {
+        dout(10) << "already stopped" << dendl;
+        is_stopped = true;
+      } else {
+        dout(10) << "joining in-flight stop" << dendl;
+        if (on_finish != nullptr) {
+          m_on_stop_contexts.push_back(on_finish);
+        }
+      }
+    } else {
+      if (m_state == STATE_STARTING) {
+        dout(10) << "canceling start" << dendl;
+        if (m_bootstrap_request != nullptr) {
+          // take a ref so the request survives until canceled below
+          bootstrap_request = m_bootstrap_request;
+          bootstrap_request->get();
+        }
+      } else {
+        dout(10) << "interrupting replay" << dendl;
+        shut_down_replay = true;
+      }
+
+      ceph_assert(m_on_stop_contexts.empty());
+      if (on_finish != nullptr) {
+        m_on_stop_contexts.push_back(on_finish);
+      }
+      m_stop_requested = true;
+      m_manual_stop = manual;
+    }
+  }
+
+  if (is_stopped) {
+    if (on_finish) {
+      on_finish->complete(-EINVAL);
+    }
+    return;
+  }
+
+  // avoid holding lock since bootstrap request will update status
+  if (bootstrap_request != nullptr) {
+    dout(10) << "canceling bootstrap" << dendl;
+    bootstrap_request->cancel();
+    bootstrap_request->put();
+  }
+
+  if (shut_down_replay) {
+    on_stop_journal_replay();
+  }
+}
+
+// Transition REPLAYING -> STOPPING and begin the shut-down sequence.  Safe
+// to call repeatedly: subsequent invocations while already stopping return
+// early.
+template <typename I>
+void ImageReplayer<I>::on_stop_journal_replay(int r, const std::string &desc)
+{
+  dout(10) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    if (m_state != STATE_REPLAYING) {
+      // might be invoked multiple times while stopping
+      return;
+    }
+
+    m_stop_requested = true;
+    m_state = STATE_STOPPING;
+  }
+
+  cancel_update_mirror_image_replay_status();
+  set_state_description(r, desc);
+  update_mirror_image_status(true, boost::none);
+  shut_down(0);
+}
+
+// Restart replication: flag the restart, stop, then start again once the
+// stop completes (the start is attempted even if the stop failed).
+template <typename I>
+void ImageReplayer<I>::restart(Context *on_finish)
+{
+  {
+    std::lock_guard locker{m_lock};
+    m_restart_requested = true;
+  }
+
+  auto ctx = new LambdaContext(
+    [this, on_finish](int r) {
+      if (r < 0) {
+        // Try start anyway.
+      }
+      start(on_finish, true, true);
+    });
+  stop(ctx, false, true);
+}
+
+// Synchronously flush the replayer (no-op unless actively replaying) and,
+// on success, push an updated mirror image status.
+template <typename I>
+void ImageReplayer<I>::flush()
+{
+  C_SaferCond ctx;
+
+  {
+    std::unique_lock locker{m_lock};
+    if (m_state != STATE_REPLAYING) {
+      return;
+    }
+
+    dout(10) << dendl;
+    ceph_assert(m_replayer != nullptr);
+    m_replayer->flush(&ctx);
+  }
+
+  // block until the flush completes
+  int r = ctx.wait();
+  if (r >= 0) {
+    update_mirror_image_status(false, boost::none);
+  }
+}
+
+// Check whether a stop was requested while replaying; if so, begin stopping
+// journal replay.  Returns true when a shut down was initiated.
+template <typename I>
+bool ImageReplayer<I>::on_replay_interrupted()
+{
+  // Snapshot the flag under the lock, then act on it outside the lock since
+  // on_stop_journal_replay() acquires m_lock itself.
+  bool stop_requested = false;
+  {
+    std::lock_guard locker{m_lock};
+    stop_requested = m_stop_requested;
+  }
+
+  if (stop_requested) {
+    on_stop_journal_replay();
+  }
+  return stop_requested;
+}
+
+// Dump a brief structured status (name + state) for admin-socket queries.
+template <typename I>
+void ImageReplayer<I>::print_status(Formatter *f)
+{
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  f->open_object_section("image_replayer");
+  f->dump_string("name", m_image_spec);
+  f->dump_string("state", to_string(m_state));
+  f->close_section();
+}
+
+// Arm the periodic (10s) status-refresh timer.  Caller must hold both
+// m_lock and the timer lock; skipped entirely unless actively replaying.
+template <typename I>
+void ImageReplayer<I>::schedule_update_mirror_image_replay_status() {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+  ceph_assert(ceph_mutex_is_locked_by_me(m_threads->timer_lock));
+  if (m_state != STATE_REPLAYING) {
+    return;
+  }
+
+  dout(10) << dendl;
+
+  // periodically update the replaying status even if nothing changes
+  // so that we can adjust our performance stats
+  ceph_assert(m_update_status_task == nullptr);
+  m_update_status_task = create_context_callback<
+    ImageReplayer<I>,
+    &ImageReplayer<I>::handle_update_mirror_image_replay_status>(this);
+  m_threads->timer->add_event_after(10, m_update_status_task);
+}
+
+// Timer callback (runs under the timer lock): bounce the actual status
+// update to the work queue, then re-arm the timer.  The op tracker keeps
+// shut down from racing with the queued work.
+template <typename I>
+void ImageReplayer<I>::handle_update_mirror_image_replay_status(int r) {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked_by_me(m_threads->timer_lock));
+
+  ceph_assert(m_update_status_task != nullptr);
+  m_update_status_task = nullptr;
+
+  auto ctx = new LambdaContext([this](int) {
+      update_mirror_image_status(false, boost::none);
+
+      std::unique_lock locker{m_lock};
+      std::unique_lock timer_locker{m_threads->timer_lock};
+
+      schedule_update_mirror_image_replay_status();
+      m_in_flight_op_tracker.finish_op();
+    });
+
+  m_in_flight_op_tracker.start_op();
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Cancel a pending status-refresh timer event.  If cancel_event() fails the
+// callback is already executing and will clear m_update_status_task itself.
+template <typename I>
+void ImageReplayer<I>::cancel_update_mirror_image_replay_status() {
+  std::unique_lock timer_locker{m_threads->timer_lock};
+  if (m_update_status_task != nullptr) {
+    dout(10) << dendl;
+
+    if (m_threads->timer->cancel_event(m_update_status_task)) {
+      m_update_status_task = nullptr;
+    }
+  }
+}
+
+// Queue an asynchronous mirror-image status update.  Unless 'force' is set,
+// updates are suppressed while a shut down is in progress.  The matching
+// finish_op() is issued by set_mirror_image_status_update().
+template <typename I>
+void ImageReplayer<I>::update_mirror_image_status(
+    bool force, const OptionalState &opt_state) {
+  dout(15) << "force=" << force << ", "
+           << "state=" << opt_state << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    if (!force && !is_stopped_() && !is_running_()) {
+      dout(15) << "shut down in-progress: ignoring update" << dendl;
+      return;
+    }
+  }
+
+  m_in_flight_op_tracker.start_op();
+  auto ctx = new LambdaContext(
+    [this, force, opt_state](int r) {
+      set_mirror_image_status_update(force, opt_state);
+    });
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Translate the replayer's internal state into a cls::rbd mirror image
+// status and publish it to the local (and, if present, remote) status
+// updaters.  Balances the start_op() taken by update_mirror_image_status():
+// every exit path either calls finish_op() or defers to a continuation that
+// re-enters this function.
+template <typename I>
+void ImageReplayer<I>::set_mirror_image_status_update(
+    bool force, const OptionalState &opt_state) {
+  dout(15) << "force=" << force << ", "
+           << "state=" << opt_state << dendl;
+
+  reregister_admin_socket_hook();
+
+  State state;
+  std::string state_desc;
+  int last_r;
+  bool stopping_replay;
+
+  // initialized to an engaged-false optional (i.e. "no status state")
+  auto mirror_image_status_state = boost::make_optional(
+    false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+  image_replayer::BootstrapRequest<I>* bootstrap_request = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    state = m_state;
+    state_desc = m_state_desc;
+    mirror_image_status_state = m_mirror_image_status_state;
+    last_r = m_last_r;
+    stopping_replay = (m_replayer != nullptr);
+
+    if (m_bootstrap_request != nullptr) {
+      // ref the request so is_syncing() can be queried outside the lock
+      bootstrap_request = m_bootstrap_request;
+      bootstrap_request->get();
+    }
+  }
+
+  bool syncing = false;
+  if (bootstrap_request != nullptr) {
+    syncing = bootstrap_request->is_syncing();
+    bootstrap_request->put();
+    bootstrap_request = nullptr;
+  }
+
+  // an explicitly requested state (e.g. STOPPED from shut_down) wins
+  if (opt_state) {
+    state = *opt_state;
+  }
+
+  cls::rbd::MirrorImageSiteStatus status;
+  status.up = true;
+  switch (state) {
+  case STATE_STARTING:
+    if (syncing) {
+      status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING;
+      status.description = state_desc.empty() ? "syncing" : state_desc;
+      mirror_image_status_state = status.state;
+    } else {
+      status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY;
+      status.description = "starting replay";
+    }
+    break;
+  case STATE_REPLAYING:
+    status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_REPLAYING;
+    {
+      std::string desc;
+      // if the replay description is not immediately available, retry this
+      // whole update once the replayer reports it is ready
+      auto on_req_finish = new LambdaContext(
+        [this, force](int r) {
+          dout(15) << "replay status ready: r=" << r << dendl;
+          if (r >= 0) {
+            set_mirror_image_status_update(force, boost::none);
+          } else if (r == -EAGAIN) {
+            m_in_flight_op_tracker.finish_op();
+          }
+        });
+
+      ceph_assert(m_replayer != nullptr);
+      if (!m_replayer->get_replay_status(&desc, on_req_finish)) {
+        dout(15) << "waiting for replay status" << dendl;
+        return;
+      }
+
+      status.description = "replaying, " + desc;
+      mirror_image_status_state = boost::make_optional(
+        false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+    }
+    break;
+  case STATE_STOPPING:
+    if (stopping_replay) {
+      status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY;
+      status.description = state_desc.empty() ? "stopping replay" : state_desc;
+      break;
+    }
+    // FALLTHROUGH
+  case STATE_STOPPED:
+    if (last_r == -EREMOTEIO) {
+      status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN;
+      status.description = state_desc;
+      mirror_image_status_state = status.state;
+    } else if (last_r < 0 && last_r != -ECANCELED) {
+      status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR;
+      status.description = state_desc;
+      mirror_image_status_state = status.state;
+    } else {
+      status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPED;
+      status.description = state_desc.empty() ? "stopped" : state_desc;
+      mirror_image_status_state = boost::none;
+    }
+    break;
+  default:
+    ceph_assert(!"invalid state");
+  }
+
+  {
+    std::lock_guard locker{m_lock};
+    m_mirror_image_status_state = mirror_image_status_state;
+  }
+
+  // prevent the status from ping-ponging when failed replays are restarted
+  if (mirror_image_status_state &&
+      *mirror_image_status_state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR) {
+    status.state = *mirror_image_status_state;
+  }
+
+  dout(15) << "status=" << status << dendl;
+  m_local_status_updater->set_mirror_image_status(m_global_image_id, status,
+                                                  force);
+  if (m_remote_image_peer.mirror_status_updater != nullptr) {
+    m_remote_image_peer.mirror_status_updater->set_mirror_image_status(
+      m_global_image_id, status, force);
+  }
+
+  m_in_flight_op_tracker.finish_op();
+}
+
+// Drive the shut-down sequence.  Waits for in-flight operations to drain
+// (re-queuing itself), then builds a chain of contexts in reverse order:
+// replayer shut down -> replayer destroy -> state-builder close -> final
+// status update + handle_shut_down().
+template <typename I>
+void ImageReplayer<I>::shut_down(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_state == STATE_STOPPING);
+  }
+
+  if (!m_in_flight_op_tracker.empty()) {
+    dout(15) << "waiting for in-flight operations to complete" << dendl;
+    m_in_flight_op_tracker.wait_for_ops(new LambdaContext([this, r](int) {
+        shut_down(r);
+      }));
+    return;
+  }
+
+  // chain the shut down sequence (reverse order)
+  Context *ctx = new LambdaContext(
+    [this, r](int _r) {
+      update_mirror_image_status(true, STATE_STOPPED);
+      handle_shut_down(r);
+    });
+
+  // destruct the state builder
+  if (m_state_builder != nullptr) {
+    ctx = new LambdaContext([this, ctx](int r) {
+      m_state_builder->close(ctx);
+    });
+  }
+
+  // close the replayer
+  if (m_replayer != nullptr) {
+    ctx = new LambdaContext([this, ctx](int r) {
+      m_replayer->destroy();
+      m_replayer = nullptr;
+      ctx->complete(0);
+    });
+    ctx = new LambdaContext([this, ctx](int r) {
+      m_replayer->shut_down(ctx);
+    });
+  }
+
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Final shut-down continuation.  Re-entrant: each asynchronous step (trash
+// move, op-tracker drain, status removal) re-invokes this function until all
+// cleanup is done, then the start/stop callbacks are completed.
+template <typename I>
+void ImageReplayer<I>::handle_shut_down(int r) {
+  bool resync_requested = false;
+  bool delete_requested = false;
+  bool unregister_asok_hook = false;
+  {
+    std::lock_guard locker{m_lock};
+
+    if (m_delete_requested && m_state_builder != nullptr &&
+        !m_state_builder->local_image_id.empty()) {
+      ceph_assert(m_state_builder->remote_image_id.empty());
+      dout(0) << "remote image no longer exists: scheduling deletion" << dendl;
+      unregister_asok_hook = true;
+      std::swap(delete_requested, m_delete_requested);
+      m_delete_in_progress = true;
+    }
+
+    std::swap(resync_requested, m_resync_requested);
+    // both local and remote images gone: this replayer is finished for good
+    if (!delete_requested && !resync_requested && m_last_r == -ENOENT &&
+        ((m_state_builder == nullptr) ||
+         (m_state_builder->local_image_id.empty() &&
+          m_state_builder->remote_image_id.empty()))) {
+      dout(0) << "mirror image no longer exists" << dendl;
+      unregister_asok_hook = true;
+      m_finished = true;
+    }
+  }
+
+  if (unregister_asok_hook) {
+    unregister_admin_socket_hook();
+  }
+
+  if (delete_requested || resync_requested) {
+    dout(5) << "moving image to trash" << dendl;
+    auto ctx = new LambdaContext([this, r](int) {
+      handle_shut_down(r);
+    });
+    ImageDeleter<I>::trash_move(m_local_io_ctx, m_global_image_id,
+                                resync_requested, m_threads->work_queue, ctx);
+    return;
+  }
+
+  if (!m_in_flight_op_tracker.empty()) {
+    dout(15) << "waiting for in-flight operations to complete" << dendl;
+    m_in_flight_op_tracker.wait_for_ops(new LambdaContext([this, r](int) {
+      handle_shut_down(r);
+    }));
+    return;
+  }
+
+  if (!m_status_removed) {
+    auto ctx = new LambdaContext([this, r](int) {
+      m_status_removed = true;
+      handle_shut_down(r);
+    });
+    remove_image_status(m_delete_in_progress, ctx);
+    return;
+  }
+
+  if (m_state_builder != nullptr) {
+    m_state_builder->destroy();
+    m_state_builder = nullptr;
+  }
+
+  dout(10) << "stop complete" << dendl;
+  Context *on_start = nullptr;
+  Contexts on_stop_contexts;
+  {
+    std::lock_guard locker{m_lock};
+    std::swap(on_start, m_on_start_finish);
+    on_stop_contexts = std::move(m_on_stop_contexts);
+    m_stop_requested = false;
+    ceph_assert(m_state == STATE_STOPPING);
+    m_state = STATE_STOPPED;
+  }
+
+  if (on_start != nullptr) {
+    dout(10) << "on start finish complete, r=" << r << dendl;
+    on_start->complete(r);
+    // the start callback consumed the error; stop callbacks see success
+    r = 0;
+  }
+  for (auto ctx : on_stop_contexts) {
+    dout(10) << "on stop finish " << ctx << " complete, r=" << r << dendl;
+    ctx->complete(r);
+  }
+}
+
+// Listener callback from the replayer: detect local image renames, handle a
+// requested resync or interrupted replay, otherwise refresh the status.
+template <typename I>
+void ImageReplayer<I>::handle_replayer_notification() {
+  dout(10) << dendl;
+
+  std::unique_lock locker{m_lock};
+  if (m_state != STATE_REPLAYING) {
+    // might be attempting to shut down
+    return;
+  }
+
+  {
+    // detect a rename of the local image
+    ceph_assert(m_state_builder != nullptr &&
+                m_state_builder->local_image_ctx != nullptr);
+    std::shared_lock image_locker{m_state_builder->local_image_ctx->image_lock};
+    if (m_local_image_name != m_state_builder->local_image_ctx->name) {
+      // will re-register with new name after next status update
+      dout(10) << "image renamed" << dendl;
+      m_local_image_name = m_state_builder->local_image_ctx->name;
+    }
+  }
+
+  // replayer cannot be shut down while notification is in-flight
+  ceph_assert(m_replayer != nullptr);
+  locker.unlock();
+
+  if (m_replayer->is_resync_requested()) {
+    dout(10) << "resync requested" << dendl;
+    // NOTE(review): m_resync_requested is written here after m_lock was
+    // released -- looks intentional given the in-flight-notification
+    // guarantee above, but confirm against the locking conventions.
+    m_resync_requested = true;
+    on_stop_journal_replay(0, "resync requested");
+    return;
+  }
+
+  if (!m_replayer->is_replaying()) {
+    auto error_code = m_replayer->get_error_code();
+    auto error_description = m_replayer->get_error_description();
+    dout(10) << "replay interrupted: "
+             << "r=" << error_code << ", "
+             << "error=" << error_description << dendl;
+    on_stop_journal_replay(error_code, error_description);
+    return;
+  }
+
+  update_mirror_image_status(false, {});
+}
+
+// Map a lifecycle state to its human-readable display name.
+template <typename I>
+std::string ImageReplayer<I>::to_string(const State state) {
+  switch (state) {
+  case ImageReplayer<I>::STATE_STARTING:
+    return "Starting";
+  case ImageReplayer<I>::STATE_REPLAYING:
+    return "Replaying";
+  case ImageReplayer<I>::STATE_STOPPING:
+    return "Stopping";
+  case ImageReplayer<I>::STATE_STOPPED:
+    return "Stopped";
+  default:
+    // unrecognized states (e.g. STATE_UNKNOWN) render with their raw value
+    return "Unknown(" + stringify(state) + ")";
+  }
+}
+
+// Install the per-image admin-socket hook (idempotent).  On registration
+// failure the partially constructed hook is deleted outside the lock.
+template <typename I>
+void ImageReplayer<I>::register_admin_socket_hook() {
+  ImageReplayerAdminSocketHook<I> *asok_hook;
+  {
+    std::lock_guard locker{m_lock};
+    if (m_asok_hook != nullptr) {
+      return;
+    }
+
+    dout(15) << "registered asok hook: " << m_image_spec << dendl;
+    asok_hook = new ImageReplayerAdminSocketHook<I>(
+      g_ceph_context, m_image_spec, this);
+    int r = asok_hook->register_commands();
+    if (r == 0) {
+      m_asok_hook = asok_hook;
+      return;
+    }
+    derr << "error registering admin socket commands" << dendl;
+  }
+  delete asok_hook;
+}
+
+// Remove and destroy the admin-socket hook, if one is installed.
+template <typename I>
+void ImageReplayer<I>::unregister_admin_socket_hook() {
+  dout(15) << dendl;
+
+  // Detach the hook under the lock but delete it outside, so m_lock is not
+  // held across the destructor.
+  AdminSocketHook *hook_to_delete = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    std::swap(hook_to_delete, m_asok_hook);
+  }
+  delete hook_to_delete;
+}
+
+// Refresh the admin-socket hook when the image spec (pool/image name) has
+// changed; no-op when the spec is unchanged or the replayer is stopping.
+template <typename I>
+void ImageReplayer<I>::reregister_admin_socket_hook() {
+  std::unique_lock locker{m_lock};
+  if (m_state == STATE_STARTING && m_bootstrap_request != nullptr) {
+    m_local_image_name = m_bootstrap_request->get_local_image_name();
+  }
+
+  auto image_spec = image_replayer::util::compute_image_spec(
+    m_local_io_ctx, m_local_image_name);
+  if (m_asok_hook != nullptr && m_image_spec == image_spec) {
+    return;
+  }
+
+  dout(15) << "old_image_spec=" << m_image_spec << ", "
+           << "new_image_spec=" << image_spec << dendl;
+  m_image_spec = image_spec;
+
+  if (m_state == STATE_STOPPING || m_state == STATE_STOPPED) {
+    // no need to re-register if stopping
+    return;
+  }
+  // drop the lock: un/register helpers take m_lock themselves
+  locker.unlock();
+
+  unregister_admin_socket_hook();
+  register_admin_socket_hook();
+}
+
+// Remove the locally tracked mirror image status (force = hard remove,
+// otherwise refresh-remove), then chain to the remote-side removal.
+template <typename I>
+void ImageReplayer<I>::remove_image_status(bool force, Context *on_finish)
+{
+  auto ctx = new LambdaContext([this, force, on_finish](int) {
+    remove_image_status_remote(force, on_finish);
+  });
+
+  if (m_local_status_updater->exists(m_global_image_id)) {
+    dout(15) << "removing local mirror image status" << dendl;
+    if (force) {
+      m_local_status_updater->remove_mirror_image_status(
+        m_global_image_id, true, ctx);
+    } else {
+      m_local_status_updater->remove_refresh_mirror_image_status(
+        m_global_image_id, ctx);
+    }
+    return;
+  }
+
+  // nothing registered locally -- continue straight to the remote side
+  ctx->complete(0);
+}
+
+// Remove the remotely tracked mirror image status (force = hard remove,
+// otherwise refresh-remove); completes on_finish immediately when there is
+// nothing to remove.
+template <typename I>
+void ImageReplayer<I>::remove_image_status_remote(bool force, Context *on_finish)
+{
+  auto* updater = m_remote_image_peer.mirror_status_updater;
+  if (updater != nullptr && updater->exists(m_global_image_id)) {
+    dout(15) << "removing remote mirror image status" << dendl;
+    if (force) {
+      updater->remove_mirror_image_status(m_global_image_id, true, on_finish);
+    } else {
+      updater->remove_refresh_mirror_image_status(m_global_image_id,
+                                                  on_finish);
+    }
+    return;
+  }
+
+  if (on_finish) {
+    on_finish->complete(0);
+  }
+}
+
+// Stream format: "ImageReplayer: <ptr> [<pool id>/<global image id>]"
+template <typename I>
+std::ostream &operator<<(std::ostream &os, const ImageReplayer<I> &replayer)
+{
+  return os << "ImageReplayer: " << &replayer << " ["
+            << replayer.get_local_pool_id() << "/"
+            << replayer.get_global_image_id() << "]";
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageReplayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageReplayer.h b/src/tools/rbd_mirror/ImageReplayer.h
new file mode 100644
index 000000000..432fdf225
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageReplayer.h
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "ProgressContext.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_replayer/Types.h"
+#include <boost/optional.hpp>
+#include <string>
+
+class AdminSocketHook;
+
+namespace journal { struct CacheManagerHandler; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct InstanceWatcher;
+template <typename> struct MirrorStatusUpdater;
+struct PoolMetaCache;
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+class Replayer;
+template <typename> class BootstrapRequest;
+template <typename> class StateBuilder;
+
+} // namespace image_replayer
+
+/**
+ * Replays changes from a remote cluster for a single image.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageReplayer {
+public:
+  // Factory helper; instances are heap-allocated and released via destroy().
+  static ImageReplayer *create(
+    librados::IoCtx &local_io_ctx, const std::string &local_mirror_uuid,
+    const std::string &global_image_id, Threads<ImageCtxT> *threads,
+    InstanceWatcher<ImageCtxT> *instance_watcher,
+    MirrorStatusUpdater<ImageCtxT>* local_status_updater,
+    journal::CacheManagerHandler *cache_manager_handler,
+    PoolMetaCache* pool_meta_cache) {
+    return new ImageReplayer(local_io_ctx, local_mirror_uuid, global_image_id,
+                             threads, instance_watcher, local_status_updater,
+                             cache_manager_handler, pool_meta_cache);
+  }
+  void destroy() {
+    delete this;
+  }
+
+  ImageReplayer(librados::IoCtx &local_io_ctx,
+                const std::string &local_mirror_uuid,
+                const std::string &global_image_id,
+                Threads<ImageCtxT> *threads,
+                InstanceWatcher<ImageCtxT> *instance_watcher,
+                MirrorStatusUpdater<ImageCtxT>* local_status_updater,
+                journal::CacheManagerHandler *cache_manager_handler,
+                PoolMetaCache* pool_meta_cache);
+  virtual ~ImageReplayer();
+  ImageReplayer(const ImageReplayer&) = delete;
+  ImageReplayer& operator=(const ImageReplayer&) = delete;
+
+  // thread-safe state predicates (lock-taking wrappers of the _-suffixed
+  // private helpers below)
+  bool is_stopped() { std::lock_guard l{m_lock}; return is_stopped_(); }
+  bool is_running() { std::lock_guard l{m_lock}; return is_running_(); }
+  bool is_replaying() { std::lock_guard l{m_lock}; return is_replaying_(); }
+
+  std::string get_name() { std::lock_guard l{m_lock}; return m_image_spec; };
+  void set_state_description(int r, const std::string &desc);
+
+  // TODO temporary until policy handles release of image replayers
+  inline bool is_finished() const {
+    std::lock_guard locker{m_lock};
+    return m_finished;
+  }
+  inline void set_finished(bool finished) {
+    std::lock_guard locker{m_lock};
+    m_finished = finished;
+  }
+
+  inline bool is_blocklisted() const {
+    std::lock_guard locker{m_lock};
+    return (m_last_r == -EBLOCKLISTED);
+  }
+
+  image_replayer::HealthState get_health_state() const;
+
+  void add_peer(const Peer<ImageCtxT>& peer);
+
+  inline int64_t get_local_pool_id() const {
+    return m_local_io_ctx.get_id();
+  }
+  inline const std::string& get_global_image_id() const {
+    return m_global_image_id;
+  }
+
+  // lifecycle operations; callbacks complete asynchronously on the work
+  // queue once the corresponding state transition finishes
+  void start(Context *on_finish, bool manual = false, bool restart = false);
+  void stop(Context *on_finish, bool manual = false, bool restart = false);
+  void restart(Context *on_finish = nullptr);
+  void flush();
+
+  void print_status(Formatter *f);
+
+protected:
+  /**
+   * @verbatim
+   *                   (error)
+   * <uninitialized> <------------------------------------ FAIL
+   *    |                                                   ^
+   *    v                                                   *
+   * <starting>                                             *
+   *    |                                                   *
+   *    v                                           (error) *
+   * BOOTSTRAP_IMAGE  * * * * * * * * * * * * * * * * * * * *
+   *    |                                                   *
+   *    v                                           (error) *
+   * START_REPLAY * * * * * * * * * * * * * * * * * * * * * *
+   *    |
+   *    v
+   * REPLAYING
+   *    |
+   *    v
+   * JOURNAL_REPLAY_SHUT_DOWN
+   *    |
+   *    v
+   * LOCAL_IMAGE_CLOSE
+   *    |
+   *    v
+   * <stopped>
+   *
+   * @endverbatim
+   */
+
+  void on_start_fail(int r, const std::string &desc);
+  bool on_start_interrupted();
+  bool on_start_interrupted(ceph::mutex& lock);
+
+  void on_stop_journal_replay(int r = 0, const std::string &desc = "");
+
+  bool on_replay_interrupted();
+
+private:
+  typedef std::set<Peer<ImageCtxT>> Peers;
+  typedef std::list<Context *> Contexts;
+
+  enum State {
+    STATE_UNKNOWN,
+    STATE_STARTING,
+    STATE_REPLAYING,
+    STATE_STOPPING,
+    STATE_STOPPED,
+  };
+
+  struct ReplayerListener;
+
+  typedef boost::optional<State> OptionalState;
+  typedef boost::optional<cls::rbd::MirrorImageStatusState>
+      OptionalMirrorImageStatusState;
+
+  // forwards bootstrap progress descriptions into status updates
+  class BootstrapProgressContext : public ProgressContext {
+  public:
+    BootstrapProgressContext(ImageReplayer<ImageCtxT> *replayer) :
+      replayer(replayer) {
+    }
+
+    void update_progress(const std::string &description,
+                         bool flush = true) override;
+
+  private:
+    ImageReplayer<ImageCtxT> *replayer;
+  };
+
+  librados::IoCtx &m_local_io_ctx;
+  std::string m_local_mirror_uuid;
+  std::string m_global_image_id;
+  Threads<ImageCtxT> *m_threads;
+  InstanceWatcher<ImageCtxT> *m_instance_watcher;
+  MirrorStatusUpdater<ImageCtxT>* m_local_status_updater;
+  journal::CacheManagerHandler *m_cache_manager_handler;
+  PoolMetaCache* m_pool_meta_cache;
+
+  Peers m_peers;
+  Peer<ImageCtxT> m_remote_image_peer;
+
+  std::string m_local_image_name;
+  std::string m_image_spec;
+
+  // guards all mutable state below unless noted otherwise
+  mutable ceph::mutex m_lock;
+  State m_state = STATE_STOPPED;
+  std::string m_state_desc;
+
+  // engaged-false optional means "no status state recorded yet"
+  OptionalMirrorImageStatusState m_mirror_image_status_state =
+    boost::make_optional(false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+  int m_last_r = 0;
+
+  BootstrapProgressContext m_progress_cxt;
+
+  bool m_finished = false;
+  bool m_delete_in_progress = false;
+  bool m_delete_requested = false;
+  bool m_resync_requested = false;
+  bool m_restart_requested = false;
+
+  bool m_status_removed = false;
+
+  image_replayer::StateBuilder<ImageCtxT>* m_state_builder = nullptr;
+  image_replayer::Replayer* m_replayer = nullptr;
+  ReplayerListener* m_replayer_listener = nullptr;
+
+  // completion contexts claimed (swapped out) exactly once on stop complete
+  Context *m_on_start_finish = nullptr;
+  Contexts m_on_stop_contexts;
+  bool m_stop_requested = false;
+  bool m_manual_stop = false;
+
+  AdminSocketHook *m_asok_hook = nullptr;
+
+  image_replayer::BootstrapRequest<ImageCtxT> *m_bootstrap_request = nullptr;
+
+  // delays shut down until queued status updates / timer callbacks drain
+  AsyncOpTracker m_in_flight_op_tracker;
+
+  // pending periodic status-refresh timer event (guarded by timer_lock)
+  Context* m_update_status_task = nullptr;
+
+  static std::string to_string(const State state);
+
+  bool is_stopped_() const {
+    return m_state == STATE_STOPPED;
+  }
+  bool is_running_() const {
+    return !is_stopped_() && m_state != STATE_STOPPING && !m_stop_requested;
+  }
+  bool is_replaying_() const {
+    return (m_state == STATE_REPLAYING);
+  }
+
+  void schedule_update_mirror_image_replay_status();
+  void handle_update_mirror_image_replay_status(int r);
+  void cancel_update_mirror_image_replay_status();
+
+  void update_mirror_image_status(bool force, const OptionalState &state);
+  void set_mirror_image_status_update(bool force, const OptionalState &state);
+
+  void shut_down(int r);
+  void handle_shut_down(int r);
+
+  void bootstrap();
+  void handle_bootstrap(int r);
+
+  void start_replay();
+  void handle_start_replay(int r);
+
+  void handle_replayer_notification();
+
+  void register_admin_socket_hook();
+  void unregister_admin_socket_hook();
+  void reregister_admin_socket_hook();
+  void remove_image_status(bool force, Context *on_finish);
+  void remove_image_status_remote(bool force, Context *on_finish);
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageReplayer<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
diff --git a/src/tools/rbd_mirror/ImageSync.cc b/src/tools/rbd_mirror/ImageSync.cc
new file mode 100644
index 000000000..43d0c6663
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageSync.cc
@@ -0,0 +1,469 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ImageSync.h"
+#include "InstanceWatcher.h"
+#include "ProgressContext.h"
+#include "common/debug.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+#include "librbd/DeepCopyRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/deep_copy/Handler.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_sync/SyncPointCreateRequest.h"
+#include "tools/rbd_mirror/image_sync/SyncPointPruneRequest.h"
+#include "tools/rbd_mirror/image_sync/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageSync: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+
+using namespace image_sync;
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+// Adapter that receives deep-copy progress callbacks from librbd and
+// forwards them to the owning ImageSync.  Returning 0 lets the copy proceed.
+template <typename I>
+class ImageSync<I>::ImageCopyProgressHandler
+ : public librbd::deep_copy::NoOpHandler {
+public:
+ ImageCopyProgressHandler(ImageSync *image_sync) : image_sync(image_sync) {
+ }
+
+ int update_progress(uint64_t object_no, uint64_t object_count) override {
+ image_sync->handle_copy_image_update_progress(object_no, object_count);
+ return 0;
+ }
+
+ // Back-pointer to the owner; not owned (owner outlives this handler).
+ ImageSync *image_sync;
+};
+
+// Construct an image-sync request.  All pointers are borrowed from the
+// caller; the sync-point update interval is read once from the local
+// image's configuration ("rbd_mirror_sync_point_update_age").
+template <typename I>
+ImageSync<I>::ImageSync(
+ Threads<I>* threads,
+ I *local_image_ctx,
+ I *remote_image_ctx,
+ const std::string &local_mirror_uuid,
+ image_sync::SyncPointHandler* sync_point_handler,
+ InstanceWatcher<I> *instance_watcher,
+ ProgressContext *progress_ctx,
+ Context *on_finish)
+ : CancelableRequest("rbd::mirror::ImageSync", local_image_ctx->cct,
+ on_finish),
+ m_threads(threads),
+ m_local_image_ctx(local_image_ctx),
+ m_remote_image_ctx(remote_image_ctx),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_sync_point_handler(sync_point_handler),
+ m_instance_watcher(instance_watcher),
+ m_progress_ctx(progress_ctx),
+ m_lock(ceph::make_mutex(unique_lock_name("ImageSync::m_lock", this))),
+ m_update_sync_point_interval(
+ m_local_image_ctx->cct->_conf.template get_val<double>(
+ "rbd_mirror_sync_point_update_age")) {
+}
+
+// Destructor sanity checks: all in-flight state must have been torn down
+// (copy request released, progress handler deleted, timer event canceled).
+template <typename I>
+ImageSync<I>::~ImageSync() {
+ ceph_assert(m_image_copy_request == nullptr);
+ ceph_assert(m_image_copy_prog_handler == nullptr);
+ ceph_assert(m_update_sync_ctx == nullptr);
+}
+
+// Entry point of the state machine: start by requesting a sync slot from
+// the instance watcher (see diagram in ImageSync.h).
+template <typename I>
+void ImageSync<I>::send() {
+ send_notify_sync_request();
+}
+
+// Cancel the sync.  If the watcher still holds a queued (not yet granted)
+// sync request, canceling it is sufficient; otherwise cancel the in-flight
+// deep-copy request, whose completion path observes m_canceled.
+template <typename I>
+void ImageSync<I>::cancel() {
+ std::lock_guard locker{m_lock};
+
+ dout(10) << dendl;
+
+ m_canceled = true;
+
+ if (m_instance_watcher->cancel_sync_request(m_local_image_ctx->id)) {
+ return;
+ }
+
+ if (m_image_copy_request != nullptr) {
+ m_image_copy_request->cancel();
+ }
+}
+
+// Ask the instance watcher for permission to run a sync (throttling).
+// Bails out early with -ECANCELED if cancel() already ran.
+template <typename I>
+void ImageSync<I>::send_notify_sync_request() {
+ update_progress("NOTIFY_SYNC_REQUEST");
+
+ dout(10) << dendl;
+
+ m_lock.lock();
+ if (m_canceled) {
+ m_lock.unlock();
+ CancelableRequest::finish(-ECANCELED);
+ return;
+ }
+
+ // async callback so the watcher cannot invoke us re-entrantly while
+ // m_lock is held
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this));
+ m_instance_watcher->notify_sync_request(m_local_image_ctx->id, ctx);
+ m_lock.unlock();
+}
+
+// Sync slot granted (r == 0) or refused.  A grant that races with cancel()
+// is converted to -ECANCELED.  Note: early finish here goes through the
+// base class, not ImageSync::finish, so no sync-complete notification is
+// sent for a slot that was never used.
+template <typename I>
+void ImageSync<I>::handle_notify_sync_request(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ m_lock.lock();
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+ m_lock.unlock();
+
+ if (r < 0) {
+ CancelableRequest::finish(r);
+ return;
+ }
+
+ send_prune_catch_up_sync_point();
+}
+
+// Prune stale sync points left over from an earlier (restarted) sync.
+// Skipped entirely when no sync points exist yet.
+template <typename I>
+void ImageSync<I>::send_prune_catch_up_sync_point() {
+ update_progress("PRUNE_CATCH_UP_SYNC_POINT");
+
+ if (m_sync_point_handler->get_sync_points().empty()) {
+ send_create_sync_point();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ // prune will remove sync points with missing snapshots and
+ // ensure we have a maximum of one sync point (in case we
+ // restarted)
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_prune_catch_up_sync_point>(this);
+ SyncPointPruneRequest<I> *request = SyncPointPruneRequest<I>::create(
+ m_remote_image_ctx, false, m_sync_point_handler, ctx);
+ request->send();
+}
+
+// Completion of the catch-up prune; any failure aborts the whole sync.
+template <typename I>
+void ImageSync<I>::handle_prune_catch_up_sync_point(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to prune catch-up sync point: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_create_sync_point();
+}
+
+// Create a fresh sync point (remote snapshot) unless a usable one already
+// survived the prune, in which case we resume copying against it.
+template <typename I>
+void ImageSync<I>::send_create_sync_point() {
+ update_progress("CREATE_SYNC_POINT");
+
+ // TODO: when support for disconnecting laggy clients is added,
+ // re-connect and create catch-up sync point
+ if (!m_sync_point_handler->get_sync_points().empty()) {
+ send_copy_image();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_create_sync_point>(this);
+ SyncPointCreateRequest<I> *request = SyncPointCreateRequest<I>::create(
+ m_remote_image_ctx, m_local_mirror_uuid, m_sync_point_handler, ctx);
+ request->send();
+}
+
+// Completion of sync-point creation; any failure aborts the sync.
+template <typename I>
+void ImageSync<I>::handle_create_sync_point(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to create sync point: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_copy_image();
+}
+
+// Kick off the deep copy from the remote image to the local image.
+// Resolves the front sync point's snapshot names to snap ids under the
+// remote image lock, then creates a ref-counted DeepCopyRequest.  The
+// extra get() keeps the request alive so cancel() / handle_copy_image()
+// can safely reference it concurrently.
+template <typename I>
+void ImageSync<I>::send_copy_image() {
+ librados::snap_t snap_id_start = 0;
+ librados::snap_t snap_id_end;
+ librbd::deep_copy::ObjectNumber object_number;
+ int r = 0;
+
+ // work on private copies so the handler's state can change underneath us
+ m_snap_seqs_copy = m_sync_point_handler->get_snap_seqs();
+ m_sync_points_copy = m_sync_point_handler->get_sync_points();
+ ceph_assert(!m_sync_points_copy.empty());
+ auto &sync_point = m_sync_points_copy.front();
+
+ {
+ std::shared_lock image_locker{m_remote_image_ctx->image_lock};
+ snap_id_end = m_remote_image_ctx->get_snap_id(
+ cls::rbd::UserSnapshotNamespace(), sync_point.snap_name);
+ if (snap_id_end == CEPH_NOSNAP) {
+ derr << ": failed to locate snapshot: " << sync_point.snap_name << dendl;
+ r = -ENOENT;
+ } else if (!sync_point.from_snap_name.empty()) {
+ snap_id_start = m_remote_image_ctx->get_snap_id(
+ cls::rbd::UserSnapshotNamespace(), sync_point.from_snap_name);
+ if (snap_id_start == CEPH_NOSNAP) {
+ derr << ": failed to locate from snapshot: "
+ << sync_point.from_snap_name << dendl;
+ r = -ENOENT;
+ }
+ }
+ object_number = sync_point.object_number;
+ }
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ m_lock.lock();
+ if (m_canceled) {
+ m_lock.unlock();
+ finish(-ECANCELED);
+ return;
+ }
+
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_copy_image>(this);
+ m_image_copy_prog_handler = new ImageCopyProgressHandler(this);
+ m_image_copy_request = librbd::DeepCopyRequest<I>::create(
+ m_remote_image_ctx, m_local_image_ctx, snap_id_start, snap_id_end,
+ 0, false, object_number, m_threads->work_queue, &m_snap_seqs_copy,
+ m_image_copy_prog_handler, ctx);
+ m_image_copy_request->get();
+ m_lock.unlock();
+
+ update_progress("COPY_IMAGE");
+
+ m_image_copy_request->send();
+}
+
+// Deep copy finished (or was canceled).  Tears down the copy request and
+// progress handler, cancels any pending sync-point-update timer, and — if
+// a sync-point update is still in flight — defers completion to
+// handle_update_sync_point() by stashing r in m_ret_val.
+template <typename I>
+void ImageSync<I>::handle_copy_image(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ {
+ // timer lock first, then m_lock (same order as the timer callback path)
+ std::scoped_lock locker{m_threads->timer_lock, m_lock};
+ m_image_copy_request->put();
+ m_image_copy_request = nullptr;
+ delete m_image_copy_prog_handler;
+ m_image_copy_prog_handler = nullptr;
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+
+ if (m_update_sync_ctx != nullptr) {
+ m_threads->timer->cancel_event(m_update_sync_ctx);
+ m_update_sync_ctx = nullptr;
+ }
+
+ if (m_updating_sync_point) {
+ // handle_update_sync_point() will pick up m_ret_val and finish
+ m_ret_val = r;
+ return;
+ }
+ }
+
+ if (r == -ECANCELED) {
+ dout(10) << ": image copy canceled" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << ": failed to copy image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_flush_sync_point();
+}
+
+// Progress callback from ImageCopyProgressHandler.  Records the current
+// object position and lazily starts the periodic sync-point update cycle.
+// NOTE(review): computes 100 * object_no / object_count — assumes
+// object_count > 0; confirm DeepCopyRequest never reports a zero count.
+template <typename I>
+void ImageSync<I>::handle_copy_image_update_progress(uint64_t object_no,
+ uint64_t object_count) {
+ int percent = 100 * object_no / object_count;
+ update_progress("COPY_IMAGE " + stringify(percent) + "%");
+
+ std::lock_guard locker{m_lock};
+ m_image_copy_object_no = object_no;
+ m_image_copy_object_count = object_count;
+
+ if (m_update_sync_ctx == nullptr && !m_updating_sync_point) {
+ send_update_sync_point();
+ }
+}
+
+// Persist copy progress into the front sync point (caller holds m_lock).
+// Skips the write when canceled or when the recorded object number has
+// not advanced since the previous update.
+template <typename I>
+void ImageSync<I>::send_update_sync_point() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ m_update_sync_ctx = nullptr;
+
+ if (m_canceled) {
+ return;
+ }
+
+ ceph_assert(!m_sync_points_copy.empty());
+ auto sync_point = &m_sync_points_copy.front();
+
+ if (sync_point->object_number &&
+ (m_image_copy_object_no - 1) == sync_point->object_number.get()) {
+ // update sync point did not progress since last sync
+ return;
+ }
+
+ m_updating_sync_point = true;
+
+ if (m_image_copy_object_no > 0) {
+ // record the last fully-copied object (object_no is the next one)
+ sync_point->object_number = m_image_copy_object_no - 1;
+ }
+
+ auto ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_update_sync_point>(this);
+ m_sync_point_handler->update_sync_points(m_snap_seqs_copy,
+ m_sync_points_copy, false, ctx);
+}
+
+// Sync-point update persisted.  While the copy is still running, schedule
+// the next update after m_update_sync_point_interval; once the copy has
+// completed (m_image_copy_request == nullptr), fall through to flushing
+// the final sync point (handle_copy_image deferred to us via m_ret_val).
+template <typename I>
+void ImageSync<I>::handle_update_sync_point(int r) {
+ CephContext *cct = m_local_image_ctx->cct;
+ ldout(cct, 20) << ": r=" << r << dendl;
+
+ {
+ std::scoped_lock locker{m_threads->timer_lock, m_lock};
+ m_updating_sync_point = false;
+
+ if (m_image_copy_request != nullptr) {
+ m_update_sync_ctx = new LambdaContext(
+ [this](int r) {
+ std::lock_guard locker{m_lock};
+ this->send_update_sync_point();
+ });
+ m_threads->timer->add_event_after(
+ m_update_sync_point_interval, m_update_sync_ctx);
+ return;
+ }
+ }
+
+ send_flush_sync_point();
+}
+
+// Write the final copy position into the sync point.  m_ret_val carries a
+// deferred error from handle_copy_image (set while an update was in
+// flight) and short-circuits to finish().
+template <typename I>
+void ImageSync<I>::send_flush_sync_point() {
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ return;
+ }
+
+ update_progress("FLUSH_SYNC_POINT");
+
+ ceph_assert(!m_sync_points_copy.empty());
+ auto sync_point = &m_sync_points_copy.front();
+
+ if (m_image_copy_object_no > 0) {
+ sync_point->object_number = m_image_copy_object_no - 1;
+ } else {
+ // no progress callbacks fired; clear any stale position
+ sync_point->object_number = boost::none;
+ }
+
+ auto ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_flush_sync_point>(this);
+ m_sync_point_handler->update_sync_points(m_snap_seqs_copy,
+ m_sync_points_copy, false, ctx);
+}
+
+// Final sync-point flush persisted; failures abort, success moves on to
+// pruning the now-consumed sync points.
+template <typename I>
+void ImageSync<I>::handle_flush_sync_point(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_prune_sync_points();
+}
+
+// Remove the consumed sync point(s) now that the copy is complete
+// ("prune all" flavor: second argument true, vs. false in the catch-up
+// prune at the start of the state machine).
+template <typename I>
+void ImageSync<I>::send_prune_sync_points() {
+ dout(10) << dendl;
+
+ update_progress("PRUNE_SYNC_POINTS");
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_prune_sync_points>(this);
+ SyncPointPruneRequest<I> *request = SyncPointPruneRequest<I>::create(
+ m_remote_image_ctx, true, m_sync_point_handler, ctx);
+ request->send();
+}
+
+// Prune done.  If sync points remain (e.g. a second one was queued while
+// we were copying), loop back and copy again; otherwise the sync is done.
+template <typename I>
+void ImageSync<I>::handle_prune_sync_points(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to prune sync point: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_sync_point_handler->get_sync_points().empty()) {
+ send_copy_image();
+ return;
+ }
+
+ finish(0);
+}
+
+// Report a state-machine transition to the optional progress context,
+// namespaced under "IMAGE_SYNC/".
+template <typename I>
+void ImageSync<I>::update_progress(const std::string &description) {
+ dout(20) << ": " << description << dendl;
+
+ if (m_progress_ctx) {
+ m_progress_ctx->update_progress("IMAGE_SYNC/" + description);
+ }
+}
+
+// Release the sync slot held at the instance watcher, then complete the
+// request.  Early exits before the slot was granted bypass this override
+// and call CancelableRequest::finish directly.
+template <typename I>
+void ImageSync<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_instance_watcher->notify_sync_complete(m_local_image_ctx->id);
+ CancelableRequest::finish(r);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageSync<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageSync.h b/src/tools/rbd_mirror/ImageSync.h
new file mode 100644
index 000000000..b3389ce18
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageSync.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_H
+#define RBD_MIRROR_IMAGE_SYNC_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Types.h"
+#include "common/ceph_mutex.h"
+#include "tools/rbd_mirror/CancelableRequest.h"
+#include "tools/rbd_mirror/image_sync/Types.h"
+
+class Context;
+namespace journal { class Journaler; }
+namespace librbd { template <typename> class DeepCopyRequest; }
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext;
+template <typename> class InstanceWatcher;
+template <typename> class Threads;
+
+namespace image_sync { struct SyncPointHandler; }
+
+// Cancelable state machine that copies a remote image to the local image
+// via librbd deep copy, persisting resumable progress in "sync points"
+// (remote snapshots) managed through image_sync::SyncPointHandler.
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageSync : public CancelableRequest {
+public:
+ static ImageSync* create(
+ Threads<ImageCtxT>* threads,
+ ImageCtxT *local_image_ctx,
+ ImageCtxT *remote_image_ctx,
+ const std::string &local_mirror_uuid,
+ image_sync::SyncPointHandler* sync_point_handler,
+ InstanceWatcher<ImageCtxT> *instance_watcher,
+ ProgressContext *progress_ctx,
+ Context *on_finish) {
+ return new ImageSync(threads, local_image_ctx, remote_image_ctx,
+ local_mirror_uuid, sync_point_handler,
+ instance_watcher, progress_ctx, on_finish);
+ }
+
+ ImageSync(
+ Threads<ImageCtxT>* threads,
+ ImageCtxT *local_image_ctx,
+ ImageCtxT *remote_image_ctx,
+ const std::string &local_mirror_uuid,
+ image_sync::SyncPointHandler* sync_point_handler,
+ InstanceWatcher<ImageCtxT> *instance_watcher,
+ ProgressContext *progress_ctx,
+ Context *on_finish);
+ ~ImageSync() override;
+
+ void send() override;
+ void cancel() override;
+
+protected:
+ void finish(int r) override;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * NOTIFY_SYNC_REQUEST
+ * |
+ * v
+ * PRUNE_CATCH_UP_SYNC_POINT
+ * |
+ * v
+ * CREATE_SYNC_POINT (skip if already exists and
+ * | not disconnected)
+ * v
+ * COPY_IMAGE . . . . . . . . . . . . . .
+ * | .
+ * v .
+ * FLUSH_SYNC_POINT .
+ * | . (image sync canceled)
+ * v .
+ * PRUNE_SYNC_POINTS .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ class ImageCopyProgressHandler;
+
+ // borrowed collaborators (not owned)
+ Threads<ImageCtxT>* m_threads;
+ ImageCtxT *m_local_image_ctx;
+ ImageCtxT *m_remote_image_ctx;
+ std::string m_local_mirror_uuid;
+ image_sync::SyncPointHandler* m_sync_point_handler;
+ InstanceWatcher<ImageCtxT> *m_instance_watcher;
+ ProgressContext *m_progress_ctx;
+
+ // guards cancellation state and the copy-request / timer members below
+ ceph::mutex m_lock;
+ bool m_canceled = false;
+
+ librbd::DeepCopyRequest<ImageCtxT> *m_image_copy_request = nullptr;
+ ImageCopyProgressHandler *m_image_copy_prog_handler = nullptr;
+
+ // periodic persistence of copy progress into the front sync point
+ bool m_updating_sync_point = false;
+ Context *m_update_sync_ctx = nullptr;
+ double m_update_sync_point_interval;
+ uint64_t m_image_copy_object_no = 0;
+ uint64_t m_image_copy_object_count = 0;
+
+ // private working copies of the handler's state
+ librbd::SnapSeqs m_snap_seqs_copy;
+ image_sync::SyncPoints m_sync_points_copy;
+
+ // deferred error from the copy path, delivered via the update callback
+ int m_ret_val = 0;
+
+ void send_notify_sync_request();
+ void handle_notify_sync_request(int r);
+
+ void send_prune_catch_up_sync_point();
+ void handle_prune_catch_up_sync_point(int r);
+
+ void send_create_sync_point();
+ void handle_create_sync_point(int r);
+
+ // NOTE(review): declared but no definition appears in ImageSync.cc in
+ // this patch — possibly dead declarations; verify before relying on them
+ void send_update_max_object_count();
+ void handle_update_max_object_count(int r);
+
+ void send_copy_image();
+ void handle_copy_image(int r);
+ void handle_copy_image_update_progress(uint64_t object_no,
+ uint64_t object_count);
+ void send_update_sync_point();
+ void handle_update_sync_point(int r);
+
+ void send_flush_sync_point();
+ void handle_flush_sync_point(int r);
+
+ void send_prune_sync_points();
+ void handle_prune_sync_points(int r);
+
+ void update_progress(const std::string &description);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageSync<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_H
diff --git a/src/tools/rbd_mirror/InstanceReplayer.cc b/src/tools/rbd_mirror/InstanceReplayer.cc
new file mode 100644
index 000000000..e625bf365
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceReplayer.cc
@@ -0,0 +1,543 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/stringify.h"
+#include "common/Cond.h"
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "ImageReplayer.h"
+#include "InstanceReplayer.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::InstanceReplayer: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+const std::string SERVICE_DAEMON_ASSIGNED_COUNT_KEY("image_assigned_count");
+const std::string SERVICE_DAEMON_WARNING_COUNT_KEY("image_warning_count");
+const std::string SERVICE_DAEMON_ERROR_COUNT_KEY("image_error_count");
+
+} // anonymous namespace
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+// Construct the per-pool replayer manager.  All pointer parameters are
+// borrowed; the lock name embeds the pool id for debugging.
+template <typename I>
+InstanceReplayer<I>::InstanceReplayer(
+ librados::IoCtx &local_io_ctx, const std::string &local_mirror_uuid,
+ Threads<I> *threads, ServiceDaemon<I>* service_daemon,
+ MirrorStatusUpdater<I>* local_status_updater,
+ journal::CacheManagerHandler *cache_manager_handler,
+ PoolMetaCache* pool_meta_cache)
+ : m_local_io_ctx(local_io_ctx), m_local_mirror_uuid(local_mirror_uuid),
+ m_threads(threads), m_service_daemon(service_daemon),
+ m_local_status_updater(local_status_updater),
+ m_cache_manager_handler(cache_manager_handler),
+ m_pool_meta_cache(pool_meta_cache),
+ m_lock(ceph::make_mutex("rbd::mirror::InstanceReplayer " +
+ stringify(local_io_ctx.get_id()))) {
+}
+
+// Destructor sanity checks: shut_down() must have completed first.
+template <typename I>
+InstanceReplayer<I>::~InstanceReplayer() {
+ ceph_assert(m_image_state_check_task == nullptr);
+ ceph_assert(m_async_op_tracker.empty());
+ ceph_assert(m_image_replayers.empty());
+}
+
+// True once any image replayer reported a blocklisted local client.
+template <typename I>
+bool InstanceReplayer<I>::is_blocklisted() const {
+ std::lock_guard locker{m_lock};
+ return m_blocklisted;
+}
+
+// Synchronous init wrapper around the async overload below.
+template <typename I>
+int InstanceReplayer<I>::init() {
+ C_SaferCond init_ctx;
+ init(&init_ctx);
+ return init_ctx.wait();
+}
+
+// Async init: schedule the periodic image-state-check task and complete
+// on_finish from the work queue (always with 0 — init cannot fail here).
+template <typename I>
+void InstanceReplayer<I>::init(Context *on_finish) {
+ dout(10) << dendl;
+
+ Context *ctx = new LambdaContext(
+ [this, on_finish] (int r) {
+ {
+ std::lock_guard timer_locker{m_threads->timer_lock};
+ schedule_image_state_check_task();
+ }
+ on_finish->complete(0);
+ });
+
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+// Synchronous shutdown wrapper; asserts the async path reported success.
+template <typename I>
+void InstanceReplayer<I>::shut_down() {
+ C_SaferCond shut_down_ctx;
+ shut_down(&shut_down_ctx);
+ int r = shut_down_ctx.wait();
+ ceph_assert(r == 0);
+}
+
+// Async shutdown: cancel the state-check timer, wait for tracked ops,
+// then stop and destroy all image replayers (continues in wait_for_ops /
+// stop_image_replayers).  Only one shutdown may be in flight.
+template <typename I>
+void InstanceReplayer<I>::shut_down(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_shut_down == nullptr);
+ m_on_shut_down = on_finish;
+
+ Context *ctx = new LambdaContext(
+ [this] (int r) {
+ cancel_image_state_check_task();
+ wait_for_ops();
+ });
+
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+// Register a remote peer; inserting a duplicate peer is a fatal error.
+template <typename I>
+void InstanceReplayer<I>::add_peer(const Peer<I>& peer) {
+ dout(10) << "peer=" << peer << dendl;
+
+ std::lock_guard locker{m_lock};
+ auto result = m_peers.insert(peer).second;
+ ceph_assert(result);
+}
+
+// Stop and destroy every image replayer; on_finish fires once all have
+// been torn down (C_Gather barrier).
+template <typename I>
+void InstanceReplayer<I>::release_all(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ C_Gather *gather_ctx = new C_Gather(g_ceph_context, on_finish);
+ for (auto it = m_image_replayers.begin(); it != m_image_replayers.end();
+ it = m_image_replayers.erase(it)) {
+ auto image_replayer = it->second;
+ auto ctx = gather_ctx->new_sub();
+ ctx = new LambdaContext(
+ [image_replayer, ctx] (int r) {
+ image_replayer->destroy();
+ ctx->complete(0);
+ });
+ stop_image_replayer(image_replayer, ctx);
+ }
+ gather_ctx->activate();
+}
+
+// Take ownership of a mirrored image: create (and start) a new
+// ImageReplayer, or restart the existing one on a duplicate acquire.
+// on_finish is completed immediately from the work queue — it does not
+// wait for the replayer to start.
+template <typename I>
+void InstanceReplayer<I>::acquire_image(InstanceWatcher<I> *instance_watcher,
+ const std::string &global_image_id,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_shut_down == nullptr);
+
+ auto it = m_image_replayers.find(global_image_id);
+ if (it == m_image_replayers.end()) {
+ auto image_replayer = ImageReplayer<I>::create(
+ m_local_io_ctx, m_local_mirror_uuid, global_image_id,
+ m_threads, instance_watcher, m_local_status_updater,
+ m_cache_manager_handler, m_pool_meta_cache);
+
+ dout(10) << global_image_id << ": creating replayer " << image_replayer
+ << dendl;
+
+ it = m_image_replayers.insert(std::make_pair(global_image_id,
+ image_replayer)).first;
+
+ // TODO only a single peer is currently supported
+ ceph_assert(m_peers.size() == 1);
+ auto peer = *m_peers.begin();
+ image_replayer->add_peer(peer);
+ start_image_replayer(image_replayer);
+ } else {
+ // A duplicate acquire notification implies (1) connection hiccup or
+ // (2) new leader election. For the second case, restart the replayer to
+ // detect if the image has been deleted while the leader was offline
+ auto& image_replayer = it->second;
+ image_replayer->set_finished(false);
+ image_replayer->restart(new C_TrackedOp(m_async_op_tracker, nullptr));
+ }
+
+ m_threads->work_queue->queue(on_finish, 0);
+}
+
+// Release ownership: stop the replayer (if known) and destroy it, then
+// complete on_finish.  Unknown ids are logged and ignored.
+template <typename I>
+void InstanceReplayer<I>::release_image(const std::string &global_image_id,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_on_shut_down == nullptr);
+
+ auto it = m_image_replayers.find(global_image_id);
+ if (it == m_image_replayers.end()) {
+ dout(5) << global_image_id << ": not found" << dendl;
+ m_threads->work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ auto image_replayer = it->second;
+ m_image_replayers.erase(it);
+
+ on_finish = new LambdaContext(
+ [image_replayer, on_finish] (int r) {
+ image_replayer->destroy();
+ on_finish->complete(0);
+ });
+ stop_image_replayer(image_replayer, on_finish);
+}
+
+// Peer-side image removal notification: restart the replayer so it can
+// re-evaluate the peer image and propagate a deletion if required.
+template <typename I>
+void InstanceReplayer<I>::remove_peer_image(const std::string &global_image_id,
+ const std::string &peer_mirror_uuid,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "peer_mirror_uuid=" << peer_mirror_uuid << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_on_shut_down == nullptr);
+
+ auto it = m_image_replayers.find(global_image_id);
+ if (it != m_image_replayers.end()) {
+ // TODO only a single peer is currently supported, therefore
+ // we can just interrupt the current image replayer and
+ // it will eventually detect that the peer image is missing and
+ // determine if a delete propagation is required.
+ auto image_replayer = it->second;
+ image_replayer->restart(new C_TrackedOp(m_async_op_tracker, nullptr));
+ }
+ m_threads->work_queue->queue(on_finish, 0);
+}
+
+// Dump per-image replayer status into the supplied Formatter (admin
+// socket support).
+template <typename I>
+void InstanceReplayer<I>::print_status(Formatter *f) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ f->open_array_section("image_replayers");
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->print_status(f);
+ }
+ f->close_section();
+}
+
+// Manual start: clear the manual-stop flag and start every replayer; the
+// gather's completion only releases an op-tracker reference.
+template <typename I>
+void InstanceReplayer<I>::start()
+{
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_manual_stop = false;
+
+ auto cct = static_cast<CephContext *>(m_local_io_ctx.cct());
+ auto gather_ctx = new C_Gather(
+ cct, new C_TrackedOp(m_async_op_tracker, nullptr));
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->start(gather_ctx->new_sub(), true);
+ }
+
+ gather_ctx->activate();
+}
+
+// Fire-and-forget manual stop.
+template <typename I>
+void InstanceReplayer<I>::stop()
+{
+ stop(nullptr);
+}
+
+// Manual stop of all replayers.  With a callback, completion additionally
+// waits for all tracked async ops to drain; without one, the tracker ref
+// simply pins this object until the stops finish.
+template <typename I>
+void InstanceReplayer<I>::stop(Context *on_finish)
+{
+ dout(10) << dendl;
+
+ if (on_finish == nullptr) {
+ on_finish = new C_TrackedOp(m_async_op_tracker, on_finish);
+ } else {
+ on_finish = new LambdaContext(
+ [this, on_finish] (int r) {
+ m_async_op_tracker.wait_for_ops(on_finish);
+ });
+ }
+
+ auto cct = static_cast<CephContext *>(m_local_io_ctx.cct());
+ auto gather_ctx = new C_Gather(cct, on_finish);
+ {
+ std::lock_guard locker{m_lock};
+
+ m_manual_stop = true;
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->stop(gather_ctx->new_sub(), true);
+ }
+ }
+
+ gather_ctx->activate();
+}
+
+// Manual restart: clear the manual-stop flag and restart every replayer.
+template <typename I>
+void InstanceReplayer<I>::restart()
+{
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_manual_stop = false;
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->restart(new C_TrackedOp(m_async_op_tracker, nullptr));
+ }
+}
+
+// Flush all image replayers (synchronous per-replayer call).
+template <typename I>
+void InstanceReplayer<I>::flush()
+{
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->flush();
+ }
+}
+
+// Start one replayer if it is eligible (caller holds m_lock): skip if it
+// is already running, mark the pool blocklisted if the replayer saw a
+// blocklist, reap finished replayers, and honor a manual stop.
+template <typename I>
+void InstanceReplayer<I>::start_image_replayer(
+ ImageReplayer<I> *image_replayer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ std::string global_image_id = image_replayer->get_global_image_id();
+ if (!image_replayer->is_stopped()) {
+ return;
+ } else if (image_replayer->is_blocklisted()) {
+ derr << "global_image_id=" << global_image_id << ": blocklisted detected "
+ << "during image replay" << dendl;
+ m_blocklisted = true;
+ return;
+ } else if (image_replayer->is_finished()) {
+ // TODO temporary until policy integrated
+ dout(5) << "removing image replayer for global_image_id="
+ << global_image_id << dendl;
+ m_image_replayers.erase(image_replayer->get_global_image_id());
+ image_replayer->destroy();
+ return;
+ } else if (m_manual_stop) {
+ return;
+ }
+
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+ image_replayer->start(new C_TrackedOp(m_async_op_tracker, nullptr), false);
+}
+
+// Defer start_image_replayers() to the work queue, holding an op-tracker
+// reference across the hop (released in start_image_replayers).
+template <typename I>
+void InstanceReplayer<I>::queue_start_image_replayers() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ InstanceReplayer, &InstanceReplayer<I>::start_image_replayers>(this);
+ m_async_op_tracker.start_op();
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+// Periodic pass over all replayers: (re)start eligible ones, tally
+// health-state counts, and publish them as service-daemon namespace
+// attributes.  Iteration copies the iterator before advancing because
+// start_image_replayer may erase the current entry.
+template <typename I>
+void InstanceReplayer<I>::start_image_replayers(int r) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+ if (m_on_shut_down != nullptr) {
+ m_async_op_tracker.finish_op();
+ return;
+ }
+
+ uint64_t image_count = 0;
+ uint64_t warning_count = 0;
+ uint64_t error_count = 0;
+ for (auto it = m_image_replayers.begin();
+ it != m_image_replayers.end();) {
+ auto current_it(it);
+ ++it;
+
+ ++image_count;
+ auto health_state = current_it->second->get_health_state();
+ if (health_state == image_replayer::HEALTH_STATE_WARNING) {
+ ++warning_count;
+ } else if (health_state == image_replayer::HEALTH_STATE_ERROR) {
+ ++error_count;
+ }
+
+ start_image_replayer(current_it->second);
+ }
+
+ m_service_daemon->add_or_update_namespace_attribute(
+ m_local_io_ctx.get_id(), m_local_io_ctx.get_namespace(),
+ SERVICE_DAEMON_ASSIGNED_COUNT_KEY, image_count);
+ m_service_daemon->add_or_update_namespace_attribute(
+ m_local_io_ctx.get_id(), m_local_io_ctx.get_namespace(),
+ SERVICE_DAEMON_WARNING_COUNT_KEY, warning_count);
+ m_service_daemon->add_or_update_namespace_attribute(
+ m_local_io_ctx.get_id(), m_local_io_ctx.get_namespace(),
+ SERVICE_DAEMON_ERROR_COUNT_KEY, error_count);
+
+ m_async_op_tracker.finish_op();
+}
+
+// Drive one replayer to the stopped state, then complete on_finish.
+// Already stopped: complete immediately.  Running: issue a stop and
+// retry from the completion callback.  In a transient state (starting /
+// stopping): poll again after a 1-second timer delay.
+template <typename I>
+void InstanceReplayer<I>::stop_image_replayer(ImageReplayer<I> *image_replayer,
+ Context *on_finish) {
+ dout(10) << image_replayer << " global_image_id="
+ << image_replayer->get_global_image_id() << ", on_finish="
+ << on_finish << dendl;
+
+ if (image_replayer->is_stopped()) {
+ m_threads->work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ m_async_op_tracker.start_op();
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, new LambdaContext(
+ [this, image_replayer, on_finish] (int r) {
+ // re-entrant retry until is_stopped() holds
+ stop_image_replayer(image_replayer, on_finish);
+ m_async_op_tracker.finish_op();
+ }));
+
+ if (image_replayer->is_running()) {
+ image_replayer->stop(ctx, false);
+ } else {
+ int after = 1;
+ dout(10) << "scheduling image replayer " << image_replayer << " stop after "
+ << after << " sec (task " << ctx << ")" << dendl;
+ ctx = new LambdaContext(
+ [this, after, ctx] (int r) {
+ std::lock_guard timer_locker{m_threads->timer_lock};
+ m_threads->timer->add_event_after(after, ctx);
+ });
+ m_threads->work_queue->queue(ctx, 0);
+ }
+}
+
+// Shutdown step 1: wait for every tracked async op to drain.
+template <typename I>
+void InstanceReplayer<I>::wait_for_ops() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ InstanceReplayer, &InstanceReplayer<I>::handle_wait_for_ops>(this);
+
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+// Shutdown step 2: ops drained; proceed to stopping the replayers.
+template <typename I>
+void InstanceReplayer<I>::handle_wait_for_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ std::lock_guard locker{m_lock};
+ stop_image_replayers();
+}
+
+// Shutdown step 3: stop all replayers in parallel behind a C_Gather
+// (caller holds m_lock).
+template <typename I>
+void InstanceReplayer<I>::stop_image_replayers() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<InstanceReplayer<I>,
+ &InstanceReplayer<I>::handle_stop_image_replayers>(this));
+
+ C_Gather *gather_ctx = new C_Gather(g_ceph_context, ctx);
+ for (auto &it : m_image_replayers) {
+ stop_image_replayer(it.second, gather_ctx->new_sub());
+ }
+ gather_ctx->activate();
+}
+
+// Shutdown step 4: all replayers stopped — destroy them, clear the map,
+// and complete the saved shutdown callback outside the lock.
+template <typename I>
+void InstanceReplayer<I>::handle_stop_image_replayers(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ Context *on_finish = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+
+ for (auto &it : m_image_replayers) {
+ ceph_assert(it.second->is_stopped());
+ it.second->destroy();
+ }
+ m_image_replayers.clear();
+
+ ceph_assert(m_on_shut_down != nullptr);
+ std::swap(on_finish, m_on_shut_down);
+ }
+ on_finish->complete(r);
+}
+
+// Cancel the pending image-state-check timer event, if any.  The assert
+// holds because the task can only fire with timer_lock held, and we hold
+// it here — the event is either pending (cancelable) or already cleared.
+template <typename I>
+void InstanceReplayer<I>::cancel_image_state_check_task() {
+ std::lock_guard timer_locker{m_threads->timer_lock};
+
+ if (m_image_state_check_task == nullptr) {
+ return;
+ }
+
+ dout(10) << m_image_state_check_task << dendl;
+ bool canceled = m_threads->timer->cancel_event(m_image_state_check_task);
+ ceph_assert(canceled);
+ m_image_state_check_task = nullptr;
+}
+
+// Arm the periodic self-rescheduling timer that re-runs
+// start_image_replayers ("rbd_mirror_image_state_check_interval" secs).
+// Caller must hold timer_lock and no task may already be armed.
+// NOTE(review): the conf value is uint64_t but stored into an int —
+// benign for sane intervals, but a narrowing conversion nonetheless.
+template <typename I>
+void InstanceReplayer<I>::schedule_image_state_check_task() {
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ ceph_assert(m_image_state_check_task == nullptr);
+
+ m_image_state_check_task = new LambdaContext(
+ [this](int r) {
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ m_image_state_check_task = nullptr;
+ schedule_image_state_check_task();
+ queue_start_image_replayers();
+ });
+
+ auto cct = static_cast<CephContext *>(m_local_io_ctx.cct());
+ int after = cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_image_state_check_interval");
+
+ dout(10) << "scheduling image state check after " << after << " sec (task "
+ << m_image_state_check_task << ")" << dendl;
+ m_threads->timer->add_event_after(after, m_image_state_check_task);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::InstanceReplayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/InstanceReplayer.h b/src/tools/rbd_mirror/InstanceReplayer.h
new file mode 100644
index 000000000..7a5c79723
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceReplayer.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_INSTANCE_REPLAYER_H
+#define RBD_MIRROR_INSTANCE_REPLAYER_H
+
+#include <map>
+#include <sstream>
+
+#include "common/AsyncOpTracker.h"
+#include "common/Formatter.h"
+#include "common/ceph_mutex.h"
+#include "tools/rbd_mirror/Types.h"
+
+namespace journal { struct CacheManagerHandler; }
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ImageReplayer;
+template <typename> class InstanceWatcher;
+template <typename> class MirrorStatusUpdater;
+struct PoolMetaCache;
+template <typename> class ServiceDaemon;
+template <typename> struct Threads;
+
+// Manages the set of ImageReplayer instances owned by this rbd-mirror
+// daemon instance: acquiring/releasing images, starting/stopping/restarting
+// their replayers, and periodically re-checking image state via a timer task.
+template <typename ImageCtxT = librbd::ImageCtx>
+class InstanceReplayer {
+public:
+ // Factory helper; pair with destroy() (the class is heap-managed).
+ static InstanceReplayer* create(
+ librados::IoCtx &local_io_ctx, const std::string &local_mirror_uuid,
+ Threads<ImageCtxT> *threads, ServiceDaemon<ImageCtxT> *service_daemon,
+ MirrorStatusUpdater<ImageCtxT>* local_status_updater,
+ journal::CacheManagerHandler *cache_manager_handler,
+ PoolMetaCache* pool_meta_cache) {
+ return new InstanceReplayer(local_io_ctx, local_mirror_uuid, threads,
+ service_daemon, local_status_updater,
+ cache_manager_handler, pool_meta_cache);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ InstanceReplayer(librados::IoCtx &local_io_ctx,
+ const std::string &local_mirror_uuid,
+ Threads<ImageCtxT> *threads,
+ ServiceDaemon<ImageCtxT> *service_daemon,
+ MirrorStatusUpdater<ImageCtxT>* local_status_updater,
+ journal::CacheManagerHandler *cache_manager_handler,
+ PoolMetaCache* pool_meta_cache);
+ ~InstanceReplayer();
+
+ bool is_blocklisted() const;
+
+ // Synchronous wrappers around the async init/shut_down overloads below.
+ int init();
+ void shut_down();
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+ void add_peer(const Peer<ImageCtxT>& peer);
+
+ // Image ownership transfer, driven by the InstanceWatcher.
+ void acquire_image(InstanceWatcher<ImageCtxT> *instance_watcher,
+ const std::string &global_image_id, Context *on_finish);
+ void release_image(const std::string &global_image_id, Context *on_finish);
+ void remove_peer_image(const std::string &global_image_id,
+ const std::string &peer_mirror_uuid,
+ Context *on_finish);
+
+ void release_all(Context *on_finish);
+
+ void print_status(Formatter *f);
+ void start();
+ void stop();
+ void restart();
+ void flush();
+
+ void stop(Context *on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <uninitialized> <-------------------\
+ * | (init) | (repeat for each
+ * v STOP_IMAGE_REPLAYER ---\ image replayer)
+ * SCHEDULE_IMAGE_STATE_CHECK_TASK ^ ^ |
+ * | | | |
+ * v (shut_down) | \---------/
+ * <initialized> -----------------> WAIT_FOR_OPS
+ *
+ * @endverbatim
+ */
+
+ typedef std::set<Peer<ImageCtxT>> Peers;
+
+ librados::IoCtx &m_local_io_ctx;
+ std::string m_local_mirror_uuid;
+ Threads<ImageCtxT> *m_threads;
+ ServiceDaemon<ImageCtxT> *m_service_daemon;
+ MirrorStatusUpdater<ImageCtxT>* m_local_status_updater;
+ journal::CacheManagerHandler *m_cache_manager_handler;
+ PoolMetaCache* m_pool_meta_cache;
+
+ // Guards all mutable state below.
+ mutable ceph::mutex m_lock;
+ AsyncOpTracker m_async_op_tracker;
+ // global image id -> replayer; entries destroyed in shut-down path.
+ std::map<std::string, ImageReplayer<ImageCtxT> *> m_image_replayers;
+ Peers m_peers;
+ // Pending timer event; nullptr when no check is scheduled.
+ Context *m_image_state_check_task = nullptr;
+ Context *m_on_shut_down = nullptr;
+ bool m_manual_stop = false;
+ bool m_blocklisted = false;
+
+ void wait_for_ops();
+ void handle_wait_for_ops(int r);
+
+ void start_image_replayer(ImageReplayer<ImageCtxT> *image_replayer);
+ void queue_start_image_replayers();
+ void start_image_replayers(int r);
+
+ void stop_image_replayer(ImageReplayer<ImageCtxT> *image_replayer,
+ Context *on_finish);
+
+ void stop_image_replayers();
+ void handle_stop_image_replayers(int r);
+
+ void schedule_image_state_check_task();
+ void cancel_image_state_check_task();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::InstanceReplayer<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_INSTANCE_REPLAYER_H
diff --git a/src/tools/rbd_mirror/InstanceWatcher.cc b/src/tools/rbd_mirror/InstanceWatcher.cc
new file mode 100644
index 000000000..7b531064d
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceWatcher.cc
@@ -0,0 +1,1290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "InstanceWatcher.h"
+#include "include/stringify.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ManagedLock.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "InstanceReplayer.h"
+#include "Throttler.h"
+#include "common/Cond.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::InstanceWatcher: "
+
+namespace rbd {
+namespace mirror {
+
+using namespace instance_watcher;
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::unique_lock_name;
+
+namespace {
+
+// Rados read completion for get_instances(): decodes the
+// mirror_instances_list reply into *instance_ids and chains to on_finish.
+// A missing leader object (-ENOENT) is treated as an empty instance list.
+struct C_GetInstances : public Context {
+ std::vector<std::string> *instance_ids;
+ Context *on_finish;
+ bufferlist out_bl;
+
+ C_GetInstances(std::vector<std::string> *instance_ids, Context *on_finish)
+ : instance_ids(instance_ids), on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_GetInstances: " << this << " " << __func__ << ": r=" << r
+ << dendl;
+
+ if (r == 0) {
+ auto it = out_bl.cbegin();
+ r = librbd::cls_client::mirror_instances_list_finish(&it, instance_ids);
+ } else if (r == -ENOENT) {
+ r = 0;
+ }
+ on_finish->complete(r);
+ }
+};
+
+// Removes a (dead) instance's on-disk state: owns a throw-away
+// InstanceWatcher by value, drives its remove() flow, and uses itself as
+// the completion for that flow before chaining to on_finish.
+template <typename I>
+struct C_RemoveInstanceRequest : public Context {
+ InstanceWatcher<I> instance_watcher;
+ Context *on_finish;
+
+ C_RemoveInstanceRequest(librados::IoCtx &io_ctx,
+ librbd::AsioEngine& asio_engine,
+ const std::string &instance_id, Context *on_finish)
+ : instance_watcher(io_ctx, asio_engine, nullptr, nullptr, instance_id),
+ on_finish(on_finish) {
+ }
+
+ void send() {
+ dout(10) << "C_RemoveInstanceRequest: " << this << " " << __func__ << dendl;
+
+ instance_watcher.remove(this);
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_RemoveInstanceRequest: " << this << " " << __func__ << ": r="
+ << r << dendl;
+ // remove() is expected to always report success.
+ ceph_assert(r == 0);
+
+ on_finish->complete(r);
+ }
+};
+
+} // anonymous namespace
+
+// In-flight notification to another rbd-mirror instance, or to the current
+// leader when the supplied instance_id is empty (send_to_leader).  The
+// request registers itself in m_notify_ops / m_notify_op_tracker on
+// construction, retries on timeout and on leader change, and deregisters
+// and self-deletes in finish().
+template <typename I>
+struct InstanceWatcher<I>::C_NotifyInstanceRequest : public Context {
+ InstanceWatcher<I> *instance_watcher;
+ std::string instance_id;
+ uint64_t request_id;
+ bufferlist bl;
+ Context *on_finish;
+ bool send_to_leader;
+ std::unique_ptr<librbd::watcher::Notifier> notifier;
+ librbd::watcher::NotifyResponse response;
+ bool canceling = false;
+
+ C_NotifyInstanceRequest(InstanceWatcher<I> *instance_watcher,
+ const std::string &instance_id, uint64_t request_id,
+ bufferlist &&bl, Context *on_finish)
+ : instance_watcher(instance_watcher), instance_id(instance_id),
+ request_id(request_id), bl(bl), on_finish(on_finish),
+ send_to_leader(instance_id.empty()) {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": instance_watcher=" << instance_watcher << ", instance_id="
+ << instance_id << ", request_id=" << request_id << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(instance_watcher->m_lock));
+
+ // Direct-addressed request: the notifier targets the peer's instance
+ // object.  Leader-addressed requests build their notifier lazily in
+ // send(), once the leader is known.
+ if (!send_to_leader) {
+ ceph_assert((!instance_id.empty()));
+ notifier.reset(new librbd::watcher::Notifier(
+ instance_watcher->m_work_queue,
+ instance_watcher->m_ioctx,
+ RBD_MIRROR_INSTANCE_PREFIX + instance_id));
+ }
+
+ instance_watcher->m_notify_op_tracker.start_op();
+ auto result = instance_watcher->m_notify_ops.insert(
+ std::make_pair(instance_id, this)).second;
+ ceph_assert(result);
+ }
+
+ // Requires instance_watcher->m_lock.  May be invoked multiple times
+ // (initial send, unsuspend, retry after timeout / leader change).
+ void send() {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(instance_watcher->m_lock));
+
+ if (canceling) {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": canceling" << dendl;
+ instance_watcher->m_work_queue->queue(this, -ECANCELED);
+ return;
+ }
+
+ if (send_to_leader) {
+ // No known leader yet: park the request until a leader appears.
+ if (instance_watcher->m_leader_instance_id.empty()) {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": suspending" << dendl;
+ instance_watcher->suspend_notify_request(this);
+ return;
+ }
+
+ // Leader changed since last send: re-key this op in m_notify_ops and
+ // rebuild the notifier for the new leader's instance object.
+ if (instance_watcher->m_leader_instance_id != instance_id) {
+ auto count = instance_watcher->m_notify_ops.erase(
+ std::make_pair(instance_id, this));
+ ceph_assert(count > 0);
+
+ instance_id = instance_watcher->m_leader_instance_id;
+
+ auto result = instance_watcher->m_notify_ops.insert(
+ std::make_pair(instance_id, this)).second;
+ ceph_assert(result);
+
+ notifier.reset(new librbd::watcher::Notifier(
+ instance_watcher->m_work_queue,
+ instance_watcher->m_ioctx,
+ RBD_MIRROR_INSTANCE_PREFIX + instance_id));
+ }
+ }
+
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": sending to " << instance_id << dendl;
+ notifier->notify(bl, &response, this);
+ }
+
+ // Requires instance_watcher->m_lock.  Marks the request canceled and, if
+ // it was suspended, re-sends it so it can complete with -ECANCELED.
+ void cancel() {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(instance_watcher->m_lock));
+
+ canceling = true;
+ instance_watcher->unsuspend_notify_request(this);
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << ": r="
+ << r << dendl;
+
+ if (r == 0 || r == -ETIMEDOUT) {
+ // Scan the acks for one addressed to us with our request id; its
+ // ret_val becomes the request result.
+ bool found = false;
+ for (auto &it : response.acks) {
+ auto &bl = it.second;
+ if (it.second.length() == 0) {
+ dout(5) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": no payload in ack, ignoring" << dendl;
+ continue;
+ }
+ try {
+ auto iter = bl.cbegin();
+ NotifyAckPayload ack;
+ decode(ack, iter);
+ if (ack.instance_id != instance_watcher->get_instance_id()) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": ack instance_id (" << ack.instance_id << ") "
+ << "does not match, ignoring" << dendl;
+ continue;
+ }
+ if (ack.request_id != request_id) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": ack request_id (" << ack.request_id << ") "
+ << "does not match, ignoring" << dendl;
+ continue;
+ }
+ r = ack.ret_val;
+ found = true;
+ break;
+ } catch (const buffer::error &err) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": failed to decode ack: " << err.what() << dendl;
+ continue;
+ }
+ }
+
+ if (!found) {
+ if (r == -ETIMEDOUT) {
+ // No matching ack and the notify timed out: retry (early return,
+ // keeping this object alive).
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": resending after timeout" << dendl;
+ std::lock_guard locker{instance_watcher->m_lock};
+ send();
+ return;
+ } else {
+ r = -EINVAL;
+ }
+ } else {
+ if (r == -ESTALE && send_to_leader) {
+ // Leader changed underneath us: retry against the new leader.
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": resending due to leader change" << dendl;
+ std::lock_guard locker{instance_watcher->m_lock};
+ send();
+ return;
+ }
+ }
+ }
+
+ on_finish->complete(r);
+
+ {
+ std::lock_guard locker{instance_watcher->m_lock};
+ auto result = instance_watcher->m_notify_ops.erase(
+ std::make_pair(instance_id, this));
+ ceph_assert(result > 0);
+ instance_watcher->m_notify_op_tracker.finish_op();
+ }
+
+ delete this;
+ }
+
+ // Bypass Context::complete()'s automatic delete: finish() may re-send and
+ // must control this object's lifetime itself.
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+// Tracks one image-sync slot request.  finish() is invoked twice: first
+// while on_start is still set (sync-start grant/denial, routed to
+// handle_notify_sync_request, which clears on_start), then on sync
+// completion (routed to handle_notify_sync_complete, after which the
+// object self-deletes).
+template <typename I>
+struct InstanceWatcher<I>::C_SyncRequest : public Context {
+ InstanceWatcher<I> *instance_watcher;
+ std::string sync_id;
+ Context *on_start;
+ Context *on_complete = nullptr;
+ C_NotifyInstanceRequest *req = nullptr;
+
+ C_SyncRequest(InstanceWatcher<I> *instance_watcher,
+ const std::string &sync_id, Context *on_start)
+ : instance_watcher(instance_watcher), sync_id(sync_id),
+ on_start(on_start) {
+ dout(10) << "C_SyncRequest: " << this << " " << __func__ << ": sync_id="
+ << sync_id << dendl;
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_SyncRequest: " << this << " " << __func__ << ": r="
+ << r << dendl;
+
+ if (on_start != nullptr) {
+ instance_watcher->handle_notify_sync_request(this, r);
+ } else {
+ instance_watcher->handle_notify_sync_complete(this, r);
+ delete this;
+ }
+ }
+
+ // called twice
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::InstanceWatcher: " \
+ << this << " " << __func__ << ": "
+// Asynchronously list all registered mirror instance ids from the
+// RBD_MIRROR_LEADER object; results are decoded by C_GetInstances.
+template <typename I>
+void InstanceWatcher<I>::get_instances(librados::IoCtx &io_ctx,
+ std::vector<std::string> *instance_ids,
+ Context *on_finish) {
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_instances_list_start(&op);
+ C_GetInstances *ctx = new C_GetInstances(instance_ids, on_finish);
+ librados::AioCompletion *aio_comp = create_rados_callback(ctx);
+
+ int r = io_ctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op, &ctx->out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+// Clean up the on-disk state of a (dead) instance; the request object owns
+// its own temporary InstanceWatcher and self-destructs via on_finish.
+template <typename I>
+void InstanceWatcher<I>::remove_instance(librados::IoCtx &io_ctx,
+ librbd::AsioEngine& asio_engine,
+ const std::string &instance_id,
+ Context *on_finish) {
+ auto req = new C_RemoveInstanceRequest<I>(io_ctx, asio_engine, instance_id,
+ on_finish);
+ req->send();
+}
+
+// Factory: the instance id is derived from the rados client's global id.
+template <typename I>
+InstanceWatcher<I> *InstanceWatcher<I>::create(
+ librados::IoCtx &io_ctx, librbd::AsioEngine& asio_engine,
+ InstanceReplayer<I> *instance_replayer,
+ Throttler<I> *image_sync_throttler) {
+ return new InstanceWatcher<I>(io_ctx, asio_engine, instance_replayer,
+ image_sync_throttler,
+ stringify(io_ctx.get_instance_id()));
+}
+
+// The watched object is "<RBD_MIRROR_INSTANCE_PREFIX><instance_id>"; the
+// exclusive ManagedLock on it is what marks this instance as alive (its
+// expiry uses rbd_blocklist_expire_seconds).
+template <typename I>
+InstanceWatcher<I>::InstanceWatcher(librados::IoCtx &io_ctx,
+ librbd::AsioEngine& asio_engine,
+ InstanceReplayer<I> *instance_replayer,
+ Throttler<I> *image_sync_throttler,
+ const std::string &instance_id)
+ : Watcher(io_ctx, asio_engine.get_work_queue(),
+ RBD_MIRROR_INSTANCE_PREFIX + instance_id),
+ m_instance_replayer(instance_replayer),
+ m_image_sync_throttler(image_sync_throttler), m_instance_id(instance_id),
+ m_lock(ceph::make_mutex(
+ unique_lock_name("rbd::mirror::InstanceWatcher::m_lock", this))),
+ m_instance_lock(librbd::ManagedLock<I>::create(
+ m_ioctx, asio_engine, m_oid, this, librbd::managed_lock::EXCLUSIVE, true,
+ m_cct->_conf.get_val<uint64_t>("rbd_blocklist_expire_seconds"))) {
+}
+
+// All pending work must have drained before destruction.
+template <typename I>
+InstanceWatcher<I>::~InstanceWatcher() {
+ ceph_assert(m_requests.empty());
+ ceph_assert(m_notify_ops.empty());
+ ceph_assert(m_notify_op_tracker.empty());
+ ceph_assert(m_suspended_ops.empty());
+ ceph_assert(m_inflight_sync_reqs.empty());
+ m_instance_lock->destroy();
+}
+
+// Synchronous wrapper around the async init() overload.
+template <typename I>
+int InstanceWatcher<I>::init() {
+ C_SaferCond init_ctx;
+ init(&init_ctx);
+ return init_ctx.wait();
+}
+
+// Async init: kicks off the chain register_instance -> create object ->
+// register watch -> acquire instance lock; on_finish fires at the end.
+template <typename I>
+void InstanceWatcher<I>::init(Context *on_finish) {
+ dout(10) << "instance_id=" << m_instance_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ register_instance();
+}
+
+// Synchronous wrapper around the async shut_down() overload.
+template <typename I>
+void InstanceWatcher<I>::shut_down() {
+ C_SaferCond shut_down_ctx;
+ shut_down(&shut_down_ctx);
+ int r = shut_down_ctx.wait();
+ ceph_assert(r == 0);
+}
+
+// Async shut-down: reverses init via release lock -> unregister watch ->
+// remove object -> unregister instance -> drain notify ops.
+template <typename I>
+void InstanceWatcher<I>::shut_down(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ release_lock();
+}
+
+// Remove another (dead) instance's state: break its instance lock, then
+// tear down its object/registration (see get_instance_locker chain).
+template <typename I>
+void InstanceWatcher<I>::remove(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ get_instance_locker();
+}
+
+// Tell a peer instance to acquire the given image; on_notify_ack fires with
+// the peer's acknowledgement result.
+template <typename I>
+void InstanceWatcher<I>::notify_image_acquire(
+ const std::string &instance_id, const std::string &global_image_id,
+ Context *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", global_image_id="
+ << global_image_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+
+ uint64_t request_id = ++m_request_seq;
+ bufferlist bl;
+ encode(NotifyMessage{ImageAcquirePayload{request_id, global_image_id}}, bl);
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), on_notify_ack);
+ req->send();
+}
+
+// Tell a peer instance to release the given image.
+template <typename I>
+void InstanceWatcher<I>::notify_image_release(
+ const std::string &instance_id, const std::string &global_image_id,
+ Context *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", global_image_id="
+ << global_image_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+
+ uint64_t request_id = ++m_request_seq;
+ bufferlist bl;
+ encode(NotifyMessage{ImageReleasePayload{request_id, global_image_id}}, bl);
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), on_notify_ack);
+ req->send();
+}
+
+// Tell a peer instance that the image was removed on the given peer.
+template <typename I>
+void InstanceWatcher<I>::notify_peer_image_removed(
+ const std::string &instance_id, const std::string &global_image_id,
+ const std::string &peer_mirror_uuid, Context *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", "
+ << "global_image_id=" << global_image_id << ", "
+ << "peer_mirror_uuid=" << peer_mirror_uuid << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_on_finish == nullptr);
+
+ uint64_t request_id = ++m_request_seq;
+ bufferlist bl;
+ encode(NotifyMessage{PeerImageRemovedPayload{request_id, global_image_id,
+ peer_mirror_uuid}}, bl);
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), on_notify_ack);
+ req->send();
+}
+
+// Ask the leader (empty instance_id -> leader-addressed request) for an
+// image-sync throttling slot; on_sync_start fires once granted or canceled.
+template <typename I>
+void InstanceWatcher<I>::notify_sync_request(const std::string &sync_id,
+ Context *on_sync_start) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_inflight_sync_reqs.count(sync_id) == 0);
+
+ uint64_t request_id = ++m_request_seq;
+
+ bufferlist bl;
+ encode(NotifyMessage{SyncRequestPayload{request_id, sync_id}}, bl);
+
+ auto sync_ctx = new C_SyncRequest(this, sync_id, on_sync_start);
+ sync_ctx->req = new C_NotifyInstanceRequest(this, "", request_id,
+ std::move(bl), sync_ctx);
+
+ m_inflight_sync_reqs[sync_id] = sync_ctx;
+ sync_ctx->req->send();
+}
+
+// Cancel a not-yet-granted sync slot request.  Returns false when there is
+// no such request or it has already been granted (on_start consumed).
+template <typename I>
+bool InstanceWatcher<I>::cancel_sync_request(const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ auto it = m_inflight_sync_reqs.find(sync_id);
+ if (it == m_inflight_sync_reqs.end()) {
+ return false;
+ }
+
+ auto sync_ctx = it->second;
+
+ if (sync_ctx->on_start == nullptr) {
+ return false;
+ }
+
+ ceph_assert(sync_ctx->req != nullptr);
+ sync_ctx->req->cancel();
+ return true;
+}
+
+// Grant a sync slot to the requesting instance.  When the grant cannot be
+// delivered (and we are still leader), the throttler slot is returned.
+template <typename I>
+void InstanceWatcher<I>::notify_sync_start(const std::string &instance_id,
+ const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ uint64_t request_id = ++m_request_seq;
+
+ bufferlist bl;
+ encode(NotifyMessage{SyncStartPayload{request_id, sync_id}}, bl);
+
+ auto ctx = new LambdaContext(
+ [this, sync_id] (int r) {
+ dout(10) << "finish: sync_id=" << sync_id << ", r=" << r << dendl;
+ std::lock_guard locker{m_lock};
+ // -ESTALE means the requester saw a leader change; otherwise free
+ // the throttler slot if we still hold leadership.
+ if (r != -ESTALE && is_leader()) {
+ m_image_sync_throttler->finish_op(m_ioctx.get_namespace(), sync_id);
+ }
+ });
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), ctx);
+ req->send();
+}
+
+// Public entry point: takes m_lock and delegates to the locked overload.
+template <typename I>
+void InstanceWatcher<I>::notify_sync_complete(const std::string &sync_id) {
+ std::lock_guard locker{m_lock};
+ notify_sync_complete(m_lock, sync_id);
+}
+
+// Locked overload (the unused mutex parameter is a "lock is held" tag).
+// Detaches the sync request from the in-flight map and queues its second
+// completion (C_SyncRequest::finish with on_start already cleared).
+template <typename I>
+void InstanceWatcher<I>::notify_sync_complete(const ceph::mutex&,
+ const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto it = m_inflight_sync_reqs.find(sync_id);
+ ceph_assert(it != m_inflight_sync_reqs.end());
+
+ auto sync_ctx = it->second;
+ ceph_assert(sync_ctx->req == nullptr);
+
+ m_inflight_sync_reqs.erase(it);
+ m_work_queue->queue(sync_ctx, 0);
+}
+
+// First completion of a C_SyncRequest: the leader replied (or the request
+// was canceled).  Consumes on_start; on cancellation the request is
+// immediately marked complete as well.
+template <typename I>
+void InstanceWatcher<I>::handle_notify_sync_request(C_SyncRequest *sync_ctx,
+ int r) {
+ dout(10) << "sync_id=" << sync_ctx->sync_id << ", r=" << r << dendl;
+
+ Context *on_start = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(sync_ctx->req != nullptr);
+ ceph_assert(sync_ctx->on_start != nullptr);
+
+ if (sync_ctx->req->canceling) {
+ r = -ECANCELED;
+ }
+
+ std::swap(sync_ctx->on_start, on_start);
+ sync_ctx->req = nullptr;
+
+ if (r == -ECANCELED) {
+ notify_sync_complete(m_lock, sync_ctx->sync_id);
+ }
+ }
+
+ // Any non-cancel result is reported as success to the sync starter.
+ on_start->complete(r == -ECANCELED ? r : 0);
+}
+
+// Second completion of a C_SyncRequest: the sync finished locally.
+template <typename I>
+void InstanceWatcher<I>::handle_notify_sync_complete(C_SyncRequest *sync_ctx,
+ int r) {
+ dout(10) << "sync_id=" << sync_ctx->sync_id << ", r=" << r << dendl;
+
+ if (sync_ctx->on_complete != nullptr) {
+ sync_ctx->on_complete->complete(r);
+ }
+}
+
+// This instance became leader: leader-addressed requests can now be sent
+// (to ourselves), so release any suspended ones.
+template <typename I>
+void InstanceWatcher<I>::handle_acquire_leader() {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_leader_instance_id = m_instance_id;
+ unsuspend_notify_requests();
+}
+
+// This instance lost leadership: fail all pending sync throttler ops with
+// -ESTALE so requesters retry against the new leader.
+template <typename I>
+void InstanceWatcher<I>::handle_release_leader() {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_leader_instance_id.clear();
+
+ m_image_sync_throttler->drain(m_ioctx.get_namespace(), -ESTALE);
+}
+
+// Another instance became leader; wake suspended leader-addressed requests
+// once a leader is actually known.
+template <typename I>
+void InstanceWatcher<I>::handle_update_leader(
+ const std::string &leader_instance_id) {
+ dout(10) << "leader_instance_id=" << leader_instance_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_leader_instance_id = leader_instance_id;
+
+ if (!m_leader_instance_id.empty()) {
+ unsuspend_notify_requests();
+ }
+}
+
+// Cancel all direct (non-leader-addressed) notify ops targeting the given
+// instance, e.g. because it died.
+template <typename I>
+void InstanceWatcher<I>::cancel_notify_requests(
+ const std::string &instance_id) {
+ dout(10) << "instance_id=" << instance_id << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ for (auto op : m_notify_ops) {
+ if (op.first == instance_id && !op.second->send_to_leader) {
+ op.second->cancel();
+ }
+ }
+}
+
+// Init step 1: add this instance id to the RBD_MIRROR_LEADER object's
+// instance list.
+template <typename I>
+void InstanceWatcher<I>::register_instance() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_instances_add(&op, m_instance_id);
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_register_instance>(this);
+
+ int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+// On success continue to object creation; on failure complete init with
+// the error (there is nothing to roll back yet).
+template <typename I>
+void InstanceWatcher<I>::handle_register_instance(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+
+ if (r == 0) {
+ create_instance_object();
+ return;
+ }
+
+ derr << "error registering instance: " << cpp_strerror(r) << dendl;
+
+ std::swap(on_finish, m_on_finish);
+ }
+ on_finish->complete(r);
+}
+
+
+// Init step 2: create this instance's watch object (exclusive create).
+template <typename I>
+void InstanceWatcher<I>::create_instance_object() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ librados::ObjectWriteOperation op;
+ op.create(true);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>,
+ &InstanceWatcher<I>::handle_create_instance_object>(this);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+// On failure roll back by unregistering the instance; otherwise continue
+// to watch registration.
+template <typename I>
+void InstanceWatcher<I>::handle_create_instance_object(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error creating " << m_oid << " object: " << cpp_strerror(r)
+ << dendl;
+
+ m_ret_val = r;
+ unregister_instance();
+ return;
+ }
+
+ register_watch();
+}
+
+// Init step 3: register the rados watch on the instance object.
+template <typename I>
+void InstanceWatcher<I>::register_watch() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_register_watch>(this));
+
+ librbd::Watcher::register_watch(ctx);
+}
+
+// On failure roll back (remove object, which chains the rest of the
+// teardown); otherwise continue to lock acquisition.
+template <typename I>
+void InstanceWatcher<I>::handle_register_watch(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error registering instance watcher for " << m_oid << " object: "
+ << cpp_strerror(r) << dendl;
+
+ m_ret_val = r;
+ remove_instance_object();
+ return;
+ }
+
+ acquire_lock();
+}
+
+// Init step 4 (final): take the exclusive instance lock that marks this
+// instance as alive.
+template <typename I>
+void InstanceWatcher<I>::acquire_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_acquire_lock>(this));
+
+ m_instance_lock->acquire_lock(ctx);
+}
+
+// On success complete init; on failure roll back via watch unregistration.
+template <typename I>
+void InstanceWatcher<I>::handle_acquire_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+
+ derr << "error acquiring instance lock: " << cpp_strerror(r) << dendl;
+
+ m_ret_val = r;
+ unregister_watch();
+ return;
+ }
+
+ // Complete the saved init callback outside m_lock.
+ std::swap(on_finish, m_on_finish);
+ }
+
+ on_finish->complete(r);
+}
+
+// Shut-down step 1: shut down the instance ManagedLock (releases it).
+template <typename I>
+void InstanceWatcher<I>::release_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_release_lock>(this));
+
+ m_instance_lock->shut_down(ctx);
+}
+
+// Errors are logged but do not stop the teardown chain.
+template <typename I>
+void InstanceWatcher<I>::handle_release_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error releasing instance lock: " << cpp_strerror(r) << dendl;
+ }
+
+ unregister_watch();
+}
+
+// Shut-down step 2: unregister the rados watch.
+template <typename I>
+void InstanceWatcher<I>::unregister_watch() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_unregister_watch>(this));
+
+ librbd::Watcher::unregister_watch(ctx);
+}
+
+// Errors are logged but teardown continues with object removal.
+template <typename I>
+void InstanceWatcher<I>::handle_unregister_watch(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error unregistering instance watcher for " << m_oid << " object: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+ remove_instance_object();
+}
+
+// Shut-down step 3: remove the instance watch object.
+template <typename I>
+void InstanceWatcher<I>::remove_instance_object() {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>,
+ &InstanceWatcher<I>::handle_remove_instance_object>(this);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+// -ENOENT (already gone) is fine; other errors are logged but teardown
+// continues with instance unregistration.
+template <typename I>
+void InstanceWatcher<I>::handle_remove_instance_object(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ if (r < 0) {
+ derr << "error removing " << m_oid << " object: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+ unregister_instance();
+}
+
+// Shut-down step 4: remove this instance id from the leader object's list.
+template <typename I>
+void InstanceWatcher<I>::unregister_instance() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_instances_remove(&op, m_instance_id);
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_unregister_instance>(this);
+
+ int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+// Errors are logged; teardown finishes by draining the notify ops.
+template <typename I>
+void InstanceWatcher<I>::handle_unregister_instance(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error unregistering instance: " << cpp_strerror(r) << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+ wait_for_notify_ops();
+}
+
+// Shut-down step 5 (final): cancel every outstanding notify op and wait for
+// the op tracker to drain.
+template <typename I>
+void InstanceWatcher<I>::wait_for_notify_ops() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ for (auto op : m_notify_ops) {
+ op.second->cancel();
+ }
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_wait_for_notify_ops>(this));
+
+ m_notify_op_tracker.wait_for_ops(ctx);
+}
+
+// Completes the saved shut-down/remove callback with the first recorded
+// error (m_ret_val), outside m_lock.
+template <typename I>
+void InstanceWatcher<I>::handle_wait_for_notify_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ Context *on_finish = nullptr;
+ {
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_notify_ops.empty());
+
+ std::swap(on_finish, m_on_finish);
+ r = m_ret_val;
+ }
+ on_finish->complete(r);
+}
+
+// remove() step 1: look up the current holder of the instance lock.
+template <typename I>
+void InstanceWatcher<I>::get_instance_locker() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_get_instance_locker>(this));
+
+ m_instance_lock->get_locker(&m_instance_locker, ctx);
+}
+
+// If no locker is found (-ENOENT is silent), skip straight to object
+// removal; otherwise break the stale lock first.
+template <typename I>
+void InstanceWatcher<I>::handle_get_instance_locker(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ derr << "error retrieving instance locker: " << cpp_strerror(r) << dendl;
+ }
+ remove_instance_object();
+ return;
+ }
+
+ break_instance_lock();
+}
+
+// remove() step 2: forcibly break the dead instance's lock.
+template <typename I>
+void InstanceWatcher<I>::break_instance_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_break_instance_lock>(this));
+
+ m_instance_lock->break_lock(m_instance_locker, true, ctx);
+}
+
+// Success or failure, teardown proceeds with object removal; only
+// unexpected errors (not -ENOENT) are logged.
+template <typename I>
+void InstanceWatcher<I>::handle_break_instance_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ derr << "error breaking instance lock: " << cpp_strerror(r) << dendl;
+ }
+ remove_instance_object();
+ return;
+ }
+
+ remove_instance_object();
+}
+
+// Park a leader-addressed request until a leader becomes known.
+// Requires m_lock.
+template <typename I>
+void InstanceWatcher<I>::suspend_notify_request(C_NotifyInstanceRequest *req) {
+ dout(10) << req << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto result = m_suspended_ops.insert(req).second;
+ ceph_assert(result);
+}
+
+// Resume a single suspended request (re-sends it).  Returns false if the
+// request was not suspended.  Requires m_lock.
+template <typename I>
+bool InstanceWatcher<I>::unsuspend_notify_request(
+ C_NotifyInstanceRequest *req) {
+ dout(10) << req << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto result = m_suspended_ops.erase(req);
+ if (result == 0) {
+ return false;
+ }
+
+ req->send();
+ return true;
+}
+
+// Resume every suspended request.  The set is swapped out first so that
+// send() may re-suspend without invalidating the iteration.
+template <typename I>
+void InstanceWatcher<I>::unsuspend_notify_requests() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ std::set<C_NotifyInstanceRequest *> suspended_ops;
+ std::swap(m_suspended_ops, suspended_ops);
+
+ for (auto op : suspended_ops) {
+ op->send();
+ }
+}
+
+// Register an incoming notify request keyed by (instance_id, request_id) and
+// return the context the handler should complete when done, or nullptr if
+// this is a duplicate delivery of an in-progress request.  For a duplicate
+// only the stored ack context is refreshed (the old one is discarded) so the
+// payload is not handled twice; the eventual complete_request() will ack the
+// newest notification.
+template <typename I>
+Context *InstanceWatcher<I>::prepare_request(const std::string &instance_id,
+                                             uint64_t request_id,
+                                             C_NotifyAck *on_notify_ack) {
+  dout(10) << "instance_id=" << instance_id << ", request_id=" << request_id
+           << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  Context *ctx = nullptr;
+  Request request(instance_id, request_id);
+  auto it = m_requests.find(request);
+
+  if (it != m_requests.end()) {
+    dout(10) << "duplicate for in-progress request" << dendl;
+    delete it->on_notify_ack;
+    m_requests.erase(it);
+  } else {
+    // async so complete_request() runs outside the notify handler's context
+    ctx = create_async_context_callback(
+      m_work_queue, new LambdaContext(
+        [this, instance_id, request_id] (int r) {
+          complete_request(instance_id, request_id, r);
+        }));
+  }
+
+  request.on_notify_ack = on_notify_ack;
+  m_requests.insert(request);
+  return ctx;
+}
+
+// Finish a request registered by prepare_request(): remove the tracking
+// entry, then ack the original notification with the handler's result
+// encoded in the payload.  The ack is completed outside m_lock.
+template <typename I>
+void InstanceWatcher<I>::complete_request(const std::string &instance_id,
+                                          uint64_t request_id, int r) {
+  dout(10) << "instance_id=" << instance_id << ", request_id=" << request_id
+           << dendl;
+
+  C_NotifyAck *on_notify_ack;
+  {
+    std::lock_guard locker{m_lock};
+    Request request(instance_id, request_id);
+    auto it = m_requests.find(request);
+    ceph_assert(it != m_requests.end());
+    on_notify_ack = it->on_notify_ack;
+    m_requests.erase(it);
+  }
+
+  // r is returned to the notifier via the ack payload, not the ack status
+  encode(NotifyAckPayload(instance_id, request_id, r), on_notify_ack->out);
+  on_notify_ack->complete(0);
+}
+
+// Watch/notify entry point: decode the NotifyMessage and dispatch its payload
+// to the matching handle_payload() overload via the visitor.  A message that
+// fails to decode is acked immediately (with an empty payload) so the
+// notifier is not left waiting.
+template <typename I>
+void InstanceWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+                                       uint64_t notifier_id, bufferlist &bl) {
+  dout(10) << "notify_id=" << notify_id << ", handle=" << handle << ", "
+           << "notifier_id=" << notifier_id << dendl;
+
+  auto ctx = new C_NotifyAck(this, notify_id, handle);
+
+  NotifyMessage notify_message;
+  try {
+    auto iter = bl.cbegin();
+    decode(notify_message, iter);
+  } catch (const buffer::error &err) {
+    derr << "error decoding image notification: " << err.what() << dendl;
+    ctx->complete(0);
+    return;
+  }
+
+  // the notifier's rados instance id doubles as its instance id
+  apply_visitor(HandlePayloadVisitor(this, stringify(notifier_id), ctx),
+                notify_message.payload);
+}
+
+// Forward an image-acquire request to the instance replayer on the work
+// queue; m_notify_op_tracker keeps shut down from racing the queued op.
+template <typename I>
+void InstanceWatcher<I>::handle_image_acquire(
+    const std::string &global_image_id, Context *on_finish) {
+  dout(10) << "global_image_id=" << global_image_id << dendl;
+
+  auto ctx = new LambdaContext(
+    [this, global_image_id, on_finish] (int r) {
+      m_instance_replayer->acquire_image(this, global_image_id, on_finish);
+      m_notify_op_tracker.finish_op();
+    });
+
+  m_notify_op_tracker.start_op();
+  m_work_queue->queue(ctx, 0);
+}
+
+// Forward an image-release request to the instance replayer on the work
+// queue; tracked by m_notify_op_tracker like the acquire path.
+template <typename I>
+void InstanceWatcher<I>::handle_image_release(
+    const std::string &global_image_id, Context *on_finish) {
+  dout(10) << "global_image_id=" << global_image_id << dendl;
+
+  auto ctx = new LambdaContext(
+    [this, global_image_id, on_finish] (int r) {
+      m_instance_replayer->release_image(global_image_id, on_finish);
+      m_notify_op_tracker.finish_op();
+    });
+
+  m_notify_op_tracker.start_op();
+  m_work_queue->queue(ctx, 0);
+}
+
+// Forward a peer-image-removed notification to the instance replayer on the
+// work queue; tracked by m_notify_op_tracker like the acquire/release paths.
+template <typename I>
+void InstanceWatcher<I>::handle_peer_image_removed(
+    const std::string &global_image_id, const std::string &peer_mirror_uuid,
+    Context *on_finish) {
+  dout(10) << "global_image_id=" << global_image_id << ", "
+           << "peer_mirror_uuid=" << peer_mirror_uuid << dendl;
+
+  auto ctx = new LambdaContext(
+    [this, peer_mirror_uuid, global_image_id, on_finish] (int r) {
+      m_instance_replayer->remove_peer_image(global_image_id,
+                                             peer_mirror_uuid, on_finish);
+      m_notify_op_tracker.finish_op();
+    });
+
+  m_notify_op_tracker.start_op();
+  m_work_queue->queue(ctx, 0);
+}
+
+// Handle a sync-slot request from a remote instance.  Only the leader grants
+// sync slots: non-leaders answer -ESTALE so the requester retries against
+// the current leader.  Otherwise the request is queued on the image sync
+// throttler; when a slot is granted the requester is notified via
+// notify_sync_start().  -ENOENT from the throttler is mapped to success
+// (presumably a canceled/obsolete throttle op -- confirm against Throttler).
+template <typename I>
+void InstanceWatcher<I>::handle_sync_request(const std::string &instance_id,
+                                             const std::string &sync_id,
+                                             Context *on_finish) {
+  dout(10) << "instance_id=" << instance_id << ", sync_id=" << sync_id << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (!is_leader()) {
+    dout(10) << "sync request for non-leader" << dendl;
+    m_work_queue->queue(on_finish, -ESTALE);
+    return;
+  }
+
+  Context *on_start = create_async_context_callback(
+    m_work_queue, new LambdaContext(
+      [this, instance_id, sync_id, on_finish] (int r) {
+        dout(10) << "handle_sync_request: finish: instance_id=" << instance_id
+                 << ", sync_id=" << sync_id << ", r=" << r << dendl;
+        if (r == 0) {
+          notify_sync_start(instance_id, sync_id);
+        }
+        if (r == -ENOENT) {
+          r = 0;
+        }
+        on_finish->complete(r);
+      }));
+  m_image_sync_throttler->start_op(m_ioctx.get_namespace(), sync_id, on_start);
+}
+
+// Handle the leader's "sync granted" notification: attach on_finish as the
+// completion of the locally tracked in-flight sync request.  An unknown
+// sync_id is acked with success (request already finished/canceled); a
+// duplicate grant completes the previously stored context with -ESTALE and
+// keeps the newest one.
+template <typename I>
+void InstanceWatcher<I>::handle_sync_start(const std::string &instance_id,
+                                           const std::string &sync_id,
+                                           Context *on_finish) {
+  dout(10) << "instance_id=" << instance_id << ", sync_id=" << sync_id << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  auto it = m_inflight_sync_reqs.find(sync_id);
+  if (it == m_inflight_sync_reqs.end()) {
+    dout(5) << "not found" << dendl;
+    m_work_queue->queue(on_finish, 0);
+    return;
+  }
+
+  auto sync_ctx = it->second;
+
+  if (sync_ctx->on_complete != nullptr) {
+    dout(5) << "duplicate request" << dendl;
+    m_work_queue->queue(sync_ctx->on_complete, -ESTALE);
+  }
+
+  sync_ctx->on_complete = on_finish;
+}
+
+// Payload dispatcher for image-acquire notifications.  prepare_request()
+// returns nullptr for a duplicate delivery, in which case the payload is
+// intentionally not handled again.
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+                                        const ImageAcquirePayload &payload,
+                                        C_NotifyAck *on_notify_ack) {
+  dout(10) << "image_acquire: instance_id=" << instance_id << ", "
+           << "request_id=" << payload.request_id << dendl;
+
+  auto on_finish = prepare_request(instance_id, payload.request_id,
+                                   on_notify_ack);
+  if (on_finish != nullptr) {
+    handle_image_acquire(payload.global_image_id, on_finish);
+  }
+}
+
+// Payload dispatcher for image-release notifications; duplicate deliveries
+// (nullptr from prepare_request()) are ignored.
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+                                        const ImageReleasePayload &payload,
+                                        C_NotifyAck *on_notify_ack) {
+  dout(10) << "image_release: instance_id=" << instance_id << ", "
+           << "request_id=" << payload.request_id << dendl;
+
+  auto on_finish = prepare_request(instance_id, payload.request_id,
+                                   on_notify_ack);
+  if (on_finish != nullptr) {
+    handle_image_release(payload.global_image_id, on_finish);
+  }
+}
+
+// Payload dispatcher for peer-image-removed notifications; duplicate
+// deliveries (nullptr from prepare_request()) are ignored.
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+                                        const PeerImageRemovedPayload &payload,
+                                        C_NotifyAck *on_notify_ack) {
+  dout(10) << "remove_peer_image: instance_id=" << instance_id << ", "
+           << "request_id=" << payload.request_id << dendl;
+
+  auto on_finish = prepare_request(instance_id, payload.request_id,
+                                   on_notify_ack);
+  if (on_finish != nullptr) {
+    handle_peer_image_removed(payload.global_image_id, payload.peer_mirror_uuid,
+                              on_finish);
+  }
+}
+
+// Payload dispatcher for sync-request notifications; duplicate deliveries
+// (nullptr from prepare_request()) are ignored.
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+                                        const SyncRequestPayload &payload,
+                                        C_NotifyAck *on_notify_ack) {
+  dout(10) << "sync_request: instance_id=" << instance_id << ", "
+           << "request_id=" << payload.request_id << dendl;
+
+  auto on_finish = prepare_request(instance_id, payload.request_id,
+                                   on_notify_ack);
+  if (on_finish == nullptr) {
+    return;
+  }
+
+  handle_sync_request(instance_id, payload.sync_id, on_finish);
+}
+
+// Payload dispatcher for sync-start notifications; duplicate deliveries
+// (nullptr from prepare_request()) are ignored.
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+                                        const SyncStartPayload &payload,
+                                        C_NotifyAck *on_notify_ack) {
+  dout(10) << "sync_start: instance_id=" << instance_id << ", "
+           << "request_id=" << payload.request_id << dendl;
+
+  auto on_finish = prepare_request(instance_id, payload.request_id,
+                                   on_notify_ack);
+  if (on_finish == nullptr) {
+    return;
+  }
+
+  handle_sync_start(instance_id, payload.sync_id, on_finish);
+}
+
+// Fallback dispatcher for payload types this (possibly older) daemon does
+// not understand: ack immediately so the notifier does not block.
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+                                        const UnknownPayload &payload,
+                                        C_NotifyAck *on_notify_ack) {
+  dout(5) << "unknown: instance_id=" << instance_id << dendl;
+
+  on_notify_ack->complete(0);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::InstanceWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/InstanceWatcher.h b/src/tools/rbd_mirror/InstanceWatcher.h
new file mode 100644
index 000000000..08e40b40b
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceWatcher.h
@@ -0,0 +1,269 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_INSTANCE_WATCHER_H
+#define CEPH_RBD_MIRROR_INSTANCE_WATCHER_H
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/AsyncOpTracker.h"
+#include "librbd/Watcher.h"
+#include "librbd/managed_lock/Types.h"
+#include "tools/rbd_mirror/instance_watcher/Types.h"
+
+namespace librbd {
+
+class AsioEngine;
+class ImageCtx;
+template <typename> class ManagedLock;
+
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class InstanceReplayer;
+template <typename> class Throttler;
+template <typename> struct Threads;
+
+// Per-daemon watcher on this rbd-mirror instance's own RADOS "instance"
+// object.  It receives image acquire/release, peer-image-removed and
+// sync request/start notifications from peers (dispatched to the matching
+// handle_payload() overload) and sends the same notifications to other
+// instances.  The lifecycle state machine is documented in the diagram in
+// the private section below.
+template <typename ImageCtxT = librbd::ImageCtx>
+class InstanceWatcher : protected librbd::Watcher {
+  using librbd::Watcher::unregister_watch; // Silence overloaded virtual warning
+public:
+  static void get_instances(librados::IoCtx &io_ctx,
+                            std::vector<std::string> *instance_ids,
+                            Context *on_finish);
+  static void remove_instance(librados::IoCtx &io_ctx,
+                              librbd::AsioEngine& asio_engine,
+                              const std::string &instance_id,
+                              Context *on_finish);
+
+  static InstanceWatcher *create(
+    librados::IoCtx &io_ctx, librbd::AsioEngine& asio_engine,
+    InstanceReplayer<ImageCtxT> *instance_replayer,
+    Throttler<ImageCtxT> *image_sync_throttler);
+  void destroy() {
+    delete this;
+  }
+
+  InstanceWatcher(librados::IoCtx &io_ctx, librbd::AsioEngine& asio_engine,
+                  InstanceReplayer<ImageCtxT> *instance_replayer,
+                  Throttler<ImageCtxT> *image_sync_throttler,
+                  const std::string &instance_id);
+  ~InstanceWatcher() override;
+
+  inline std::string &get_instance_id() {
+    return m_instance_id;
+  }
+
+  // synchronous wrappers around the async init/shut_down below
+  int init();
+  void shut_down();
+
+  void init(Context *on_finish);
+  void shut_down(Context *on_finish);
+  void remove(Context *on_finish);
+
+  // outbound notifications to a peer instance's watcher
+  void notify_image_acquire(const std::string &instance_id,
+                            const std::string &global_image_id,
+                            Context *on_notify_ack);
+  void notify_image_release(const std::string &instance_id,
+                            const std::string &global_image_id,
+                            Context *on_notify_ack);
+  void notify_peer_image_removed(const std::string &instance_id,
+                                 const std::string &global_image_id,
+                                 const std::string &peer_mirror_uuid,
+                                 Context *on_notify_ack);
+
+  void notify_sync_request(const std::string &sync_id, Context *on_sync_start);
+  bool cancel_sync_request(const std::string &sync_id);
+  void notify_sync_complete(const std::string &sync_id);
+
+  void cancel_notify_requests(const std::string &instance_id);
+
+  // leader state transitions forwarded from the leader watcher
+  void handle_acquire_leader();
+  void handle_release_leader();
+  void handle_update_leader(const std::string &leader_instance_id);
+
+private:
+  /**
+   * @verbatim
+   *
+   *       BREAK_INSTANCE_LOCK -------\
+   *          ^                       |
+   *          |               (error) |
+   *       GET_INSTANCE_LOCKER  * * *>|
+   *          ^ (remove)              |
+   *          |                       |
+   * <uninitialized> <----------------+---- WAIT_FOR_NOTIFY_OPS
+   *    | (init)         ^            |            ^
+   *    v        (error) *            |            |
+   * REGISTER_INSTANCE * *|* *> UNREGISTER_INSTANCE
+   *    |                *            |            ^
+   *    v        (error) *            v            |
+   * CREATE_INSTANCE_OBJECT * * * * * *> REMOVE_INSTANCE_OBJECT
+   *    |                *                         ^
+   *    v        (error) *                         |
+   * REGISTER_WATCH * * * * * * * * * *> UNREGISTER_WATCH
+   *    |                *                         ^
+   *    v        (error) *                         |
+   * ACQUIRE_LOCK  * * * * * * * * * * *  RELEASE_LOCK
+   *    |                                          ^
+   *    v         (shut_down)                      |
+   * <watching> -------------------------------/
+   *
+   * @endverbatim
+   */
+
+  struct C_NotifyInstanceRequest;
+  struct C_SyncRequest;
+
+  typedef std::pair<std::string, std::string> Id;
+
+  // boost::static_visitor that routes a decoded payload variant to the
+  // matching handle_payload() overload
+  struct HandlePayloadVisitor : public boost::static_visitor<void> {
+    InstanceWatcher *instance_watcher;
+    std::string instance_id;
+    C_NotifyAck *on_notify_ack;
+
+    HandlePayloadVisitor(InstanceWatcher *instance_watcher,
+                         const std::string &instance_id,
+                         C_NotifyAck *on_notify_ack)
+      : instance_watcher(instance_watcher), instance_id(instance_id),
+        on_notify_ack(on_notify_ack) {
+    }
+
+    template <typename Payload>
+    inline void operator()(const Payload &payload) const {
+      instance_watcher->handle_payload(instance_id, payload, on_notify_ack);
+    }
+  };
+
+  // tracking record for an in-progress inbound request, ordered by
+  // (instance_id, request_id) for duplicate detection
+  struct Request {
+    std::string instance_id;
+    uint64_t request_id;
+    C_NotifyAck *on_notify_ack = nullptr;
+
+    Request(const std::string &instance_id, uint64_t request_id)
+      : instance_id(instance_id), request_id(request_id) {
+    }
+
+    inline bool operator<(const Request &rhs) const {
+      return instance_id < rhs.instance_id ||
+        (instance_id == rhs.instance_id && request_id < rhs.request_id);
+    }
+  };
+
+  Threads<ImageCtxT> *m_threads;
+  InstanceReplayer<ImageCtxT> *m_instance_replayer;
+  Throttler<ImageCtxT> *m_image_sync_throttler;
+  std::string m_instance_id;
+
+  mutable ceph::mutex m_lock;
+  librbd::ManagedLock<ImageCtxT> *m_instance_lock;
+  Context *m_on_finish = nullptr;
+  int m_ret_val = 0;
+  std::string m_leader_instance_id;
+  librbd::managed_lock::Locker m_instance_locker;
+  std::set<std::pair<std::string, C_NotifyInstanceRequest *>> m_notify_ops;
+  AsyncOpTracker m_notify_op_tracker;
+  uint64_t m_request_seq = 0;
+  std::set<Request> m_requests;
+  std::set<C_NotifyInstanceRequest *> m_suspended_ops;
+  std::map<std::string, C_SyncRequest *> m_inflight_sync_reqs;
+
+  inline bool is_leader() const {
+    return m_leader_instance_id == m_instance_id;
+  }
+
+  void register_instance();
+  void handle_register_instance(int r);
+
+  void create_instance_object();
+  void handle_create_instance_object(int r);
+
+  void register_watch();
+  void handle_register_watch(int r);
+
+  void acquire_lock();
+  void handle_acquire_lock(int r);
+
+  void release_lock();
+  void handle_release_lock(int r);
+
+  void unregister_watch();
+  void handle_unregister_watch(int r);
+
+  void remove_instance_object();
+  void handle_remove_instance_object(int r);
+
+  void unregister_instance();
+  void handle_unregister_instance(int r);
+
+  void wait_for_notify_ops();
+  void handle_wait_for_notify_ops(int r);
+
+  void get_instance_locker();
+  void handle_get_instance_locker(int r);
+
+  void break_instance_lock();
+  void handle_break_instance_lock(int r);
+
+  void suspend_notify_request(C_NotifyInstanceRequest *req);
+  bool unsuspend_notify_request(C_NotifyInstanceRequest *req);
+  void unsuspend_notify_requests();
+
+  void notify_sync_complete(const ceph::mutex& lock, const std::string &sync_id);
+  void handle_notify_sync_request(C_SyncRequest *sync_ctx, int r);
+  void handle_notify_sync_complete(C_SyncRequest *sync_ctx, int r);
+
+  void notify_sync_start(const std::string &instance_id,
+                         const std::string &sync_id);
+
+  Context *prepare_request(const std::string &instance_id, uint64_t request_id,
+                           C_NotifyAck *on_notify_ack);
+  void complete_request(const std::string &instance_id, uint64_t request_id,
+                        int r);
+
+  void handle_notify(uint64_t notify_id, uint64_t handle,
+                     uint64_t notifier_id, bufferlist &bl) override;
+
+  void handle_image_acquire(const std::string &global_image_id,
+                            Context *on_finish);
+  void handle_image_release(const std::string &global_image_id,
+                            Context *on_finish);
+  void handle_peer_image_removed(const std::string &global_image_id,
+                                 const std::string &peer_mirror_uuid,
+                                 Context *on_finish);
+
+  void handle_sync_request(const std::string &instance_id,
+                           const std::string &sync_id, Context *on_finish);
+  void handle_sync_start(const std::string &instance_id,
+                         const std::string &sync_id, Context *on_finish);
+
+  void handle_payload(const std::string &instance_id,
+                      const instance_watcher::ImageAcquirePayload &payload,
+                      C_NotifyAck *on_notify_ack);
+  void handle_payload(const std::string &instance_id,
+                      const instance_watcher::ImageReleasePayload &payload,
+                      C_NotifyAck *on_notify_ack);
+  void handle_payload(const std::string &instance_id,
+                      const instance_watcher::PeerImageRemovedPayload &payload,
+                      C_NotifyAck *on_notify_ack);
+  void handle_payload(const std::string &instance_id,
+                      const instance_watcher::SyncRequestPayload &payload,
+                      C_NotifyAck *on_notify_ack);
+  void handle_payload(const std::string &instance_id,
+                      const instance_watcher::SyncStartPayload &payload,
+                      C_NotifyAck *on_notify_ack);
+  void handle_payload(const std::string &instance_id,
+                      const instance_watcher::UnknownPayload &payload,
+                      C_NotifyAck *on_notify_ack);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_INSTANCE_WATCHER_H
diff --git a/src/tools/rbd_mirror/Instances.cc b/src/tools/rbd_mirror/Instances.cc
new file mode 100644
index 000000000..ca291bb5f
--- /dev/null
+++ b/src/tools/rbd_mirror/Instances.cc
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/stringify.h"
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "InstanceWatcher.h"
+#include "Instances.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::Instances: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+// Track the set of live rbd-mirror instances for one pool; the mutex name
+// embeds the pool name to disambiguate lockdep reports across pools.
+template <typename I>
+Instances<I>::Instances(Threads<I> *threads, librados::IoCtx &ioctx,
+                        const std::string& instance_id,
+                        instances::Listener& listener) :
+  m_threads(threads), m_ioctx(ioctx), m_instance_id(instance_id),
+  m_listener(listener), m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+  m_lock(ceph::make_mutex("rbd::mirror::Instances " + ioctx.get_pool_name())) {
+}
+
+// No explicit cleanup: shut_down() is expected to have quiesced all ops.
+template <typename I>
+Instances<I>::~Instances() {
+}
+
+// Async init: stash the completion and fetch the currently registered
+// instance ids; on_finish fires from handle_get_instances().
+template <typename I>
+void Instances<I>::init(Context *on_finish) {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_on_finish == nullptr);
+  m_on_finish = on_finish;
+  get_instances();
+}
+
+// Async shut down: a non-null m_on_finish also serves as the "shutting
+// down" flag checked by acked()/handle_acked()/schedule_remove_task().
+// The timer cancel + op drain runs on the work queue because it needs the
+// timer lock, which must be taken before m_lock.
+template <typename I>
+void Instances<I>::shut_down(Context *on_finish) {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_on_finish == nullptr);
+  m_on_finish = on_finish;
+
+  Context *ctx = new LambdaContext(
+    [this](int r) {
+      std::scoped_lock locker{m_threads->timer_lock, m_lock};
+      cancel_remove_task();
+      wait_for_ops();
+    });
+
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Allow "instance added" callbacks to flow to the listener and immediately
+// replay any additions that were observed while the listener was blocked
+// (instances still in ADDING state).
+template <typename I>
+void Instances<I>::unblock_listener() {
+  dout(5) << dendl;
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_listener_blocked);
+  m_listener_blocked = false;
+
+  InstanceIds added_instance_ids;
+  for (auto& pair : m_instances) {
+    if (pair.second.state == INSTANCE_STATE_ADDING) {
+      added_instance_ids.push_back(pair.first);
+    }
+  }
+
+  if (!added_instance_ids.empty()) {
+    m_threads->work_queue->queue(
+      new C_NotifyInstancesAdded(this, added_instance_ids), 0);
+  }
+}
+
+// Record a heartbeat ack from the given instances.  Processing is deferred
+// to the work queue (C_HandleAcked -> handle_acked()) since that path needs
+// the timer lock; ignored once shut down has started.
+template <typename I>
+void Instances<I>::acked(const InstanceIds& instance_ids) {
+  dout(10) << "instance_ids=" << instance_ids << dendl;
+
+  std::lock_guard locker{m_lock};
+  if (m_on_finish != nullptr) {
+    dout(5) << "received on shut down, ignoring" << dendl;
+    return;
+  }
+
+  Context *ctx = new C_HandleAcked(this, instance_ids);
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Apply a batch of heartbeat acks: refresh (or create, in ADDING state)
+// each instance's acked_time, reschedule the stale-instance removal timer,
+// and, unless the listener is blocked, notify it of newly seen instances.
+template <typename I>
+void Instances<I>::handle_acked(const InstanceIds& instance_ids) {
+  dout(5) << "instance_ids=" << instance_ids << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  if (m_on_finish != nullptr) {
+    dout(5) << "handled on shut down, ignoring" << dendl;
+    return;
+  }
+
+  InstanceIds added_instance_ids;
+  auto time = clock_t::now();
+  for (auto& instance_id : instance_ids) {
+    auto &instance = m_instances.insert(
+      std::make_pair(instance_id, Instance{})).first->second;
+    instance.acked_time = time;
+    if (instance.state == INSTANCE_STATE_ADDING) {
+      added_instance_ids.push_back(instance_id);
+    }
+  }
+
+  schedule_remove_task(time);
+  if (!m_listener_blocked && !added_instance_ids.empty()) {
+    m_threads->work_queue->queue(
+      new C_NotifyInstancesAdded(this, added_instance_ids), 0);
+  }
+}
+
+// Deliver "added" events to the listener for instances still in ADDING
+// state.  m_lock is dropped around the listener callback (it may call back
+// into this class); afterwards the state is re-checked before flipping to
+// IDLE, since the set may have changed while unlocked.
+template <typename I>
+void Instances<I>::notify_instances_added(const InstanceIds& instance_ids) {
+  std::unique_lock locker{m_lock};
+  InstanceIds added_instance_ids;
+  for (auto& instance_id : instance_ids) {
+    auto it = m_instances.find(instance_id);
+    if (it != m_instances.end() && it->second.state == INSTANCE_STATE_ADDING) {
+      added_instance_ids.push_back(instance_id);
+    }
+  }
+
+  if (added_instance_ids.empty()) {
+    return;
+  }
+
+  dout(5) << "instance_ids=" << added_instance_ids << dendl;
+  locker.unlock();
+  m_listener.handle_added(added_instance_ids);
+  locker.lock();
+
+  for (auto& instance_id : added_instance_ids) {
+    auto it = m_instances.find(instance_id);
+    if (it != m_instances.end() && it->second.state == INSTANCE_STATE_ADDING) {
+      it->second.state = INSTANCE_STATE_IDLE;
+    }
+  }
+}
+
+// Deliver "removed" events to the listener (outside m_lock), then drop the
+// instances from the local map.
+template <typename I>
+void Instances<I>::notify_instances_removed(const InstanceIds& instance_ids) {
+  dout(5) << "instance_ids=" << instance_ids << dendl;
+  m_listener.handle_removed(instance_ids);
+
+  std::lock_guard locker{m_lock};
+  for (auto& instance_id : instance_ids) {
+    m_instances.erase(instance_id);
+  }
+}
+
+// Append the ids of all currently tracked instances to *instance_ids.
+template <typename I>
+void Instances<I>::list(std::vector<std::string> *instance_ids) {
+  dout(20) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  for (auto it : m_instances) {
+    instance_ids->push_back(it.first);
+  }
+}
+
+
+// Kick off the async fetch of registered instance ids into m_instance_ids;
+// completion lands in handle_get_instances().  Caller holds m_lock.
+template <typename I>
+void Instances<I>::get_instances() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_context_callback<
+    Instances, &Instances<I>::handle_get_instances>(this);
+
+  InstanceWatcher<I>::get_instances(m_ioctx, &m_instance_ids, ctx);
+}
+
+// Completion of the instance-id fetch started by init(): on success, seed
+// the tracked set by treating the fetched ids as freshly acked; always fire
+// the init completion with the fetch result.
+template <typename I>
+void Instances<I>::handle_get_instances(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    std::swap(on_finish, m_on_finish);
+  }
+
+  if (r < 0) {
+    derr << "error retrieving instances: " << cpp_strerror(r) << dendl;
+  } else {
+    // m_on_finish is already cleared, so handle_acked() will not treat this
+    // as a shut-down-in-progress call
+    handle_acked(m_instance_ids);
+  }
+  on_finish->complete(r);
+}
+
+// Shut-down step: wait for all in-flight async ops (notifications, removes)
+// to drain before completing; caller holds m_lock.
+template <typename I>
+void Instances<I>::wait_for_ops() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_async_context_callback(
+    m_threads->work_queue, create_context_callback<
+      Instances, &Instances<I>::handle_wait_for_ops>(this));
+
+  m_async_op_tracker.wait_for_ops(ctx);
+}
+
+// Final shut-down step: all ops drained, fire the shut_down() completion.
+template <typename I>
+void Instances<I>::handle_wait_for_ops(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  ceph_assert(r == 0);
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    std::swap(on_finish, m_on_finish);
+  }
+  on_finish->complete(r);
+}
+
+// Timer callback body: mark every instance (other than ourselves) whose last
+// ack is at or before 'time' as REMOVING, then blocklist/remove them all via
+// a gather context.  The timer only fires when at least one such instance
+// exists, hence the non-empty assertion.  Caller holds m_lock.
+template <typename I>
+void Instances<I>::remove_instances(const Instances<I>::clock_t::time_point& time) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  InstanceIds instance_ids;
+  for (auto& instance_pair : m_instances) {
+    if (instance_pair.first == m_instance_id) {
+      continue;
+    }
+    auto& instance = instance_pair.second;
+    if (instance.state != INSTANCE_STATE_REMOVING &&
+        instance.acked_time <= time) {
+      instance.state = INSTANCE_STATE_REMOVING;
+      instance_ids.push_back(instance_pair.first);
+    }
+  }
+  ceph_assert(!instance_ids.empty());
+
+  dout(10) << "instance_ids=" << instance_ids << dendl;
+  Context* ctx = new LambdaContext([this, instance_ids](int r) {
+      handle_remove_instances(r, instance_ids);
+    });
+  ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+  auto gather_ctx = new C_Gather(m_cct, ctx);
+  for (auto& instance_id : instance_ids) {
+    InstanceWatcher<I>::remove_instance(m_ioctx, *m_threads->asio_engine,
+                                        instance_id, gather_ctx->new_sub());
+  }
+
+  // balanced by finish_op() in handle_remove_instances()
+  m_async_op_tracker.start_op();
+  gather_ctx->activate();
+}
+
+// Completion of a batch removal: notify the listener (async, now that the
+// instances have been blocklisted) and re-arm the timer for the next batch.
+template <typename I>
+void Instances<I>::handle_remove_instances(
+    int r, const InstanceIds& instance_ids) {
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+
+  dout(10) << "r=" << r << ", instance_ids=" << instance_ids << dendl;
+  ceph_assert(r == 0);
+
+  // fire removed notification now that instances have been blocklisted
+  m_threads->work_queue->queue(
+    new C_NotifyInstancesRemoved(this, instance_ids), 0);
+
+  // reschedule the timer for the next batch
+  schedule_remove_task(clock_t::now());
+  m_async_op_tracker.finish_op();
+}
+
+// Cancel the pending stale-instance removal timer, if armed.  Requires both
+// the timer lock and m_lock; the cancel must succeed because the timer task
+// itself clears m_timer_task under the same locks.
+template <typename I>
+void Instances<I>::cancel_remove_task() {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  if (m_timer_task == nullptr) {
+    return;
+  }
+
+  dout(10) << dendl;
+
+  bool canceled = m_threads->timer->cancel_event(m_timer_task);
+  ceph_assert(canceled);
+  m_timer_task = nullptr;
+}
+
+// (Re)arm the timer that removes stale instances.  The grace period is
+// heartbeat_interval * (1 + max_missed_heartbeats + max_acquire_attempts),
+// i.e. an instance is considered dead only after the leader-election
+// machinery would have given up on it.  The timer fires at the oldest
+// not-yet-removing peer's acked_time plus that grace period; no-op while
+// shutting down or when every peer is already being removed.
+template <typename I>
+void Instances<I>::schedule_remove_task(const Instances<I>::clock_t::time_point& time) {
+  cancel_remove_task();
+  if (m_on_finish != nullptr) {
+    dout(10) << "received on shut down, ignoring" << dendl;
+    return;
+  }
+
+  int after = m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_heartbeat_interval") *
+    (1 + m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_missed_heartbeats") +
+     m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_acquire_attempts_before_break"));
+
+  bool schedule = false;
+  auto oldest_time = time;
+  for (auto& instance : m_instances) {
+    if (instance.first == m_instance_id) {
+      continue;
+    }
+    if (instance.second.state == INSTANCE_STATE_REMOVING) {
+      // removal is already in-flight
+      continue;
+    }
+
+    oldest_time = std::min(oldest_time, instance.second.acked_time);
+    schedule = true;
+  }
+
+  if (!schedule) {
+    return;
+  }
+
+  dout(10) << dendl;
+
+  // schedule a time to fire when the oldest instance should be removed
+  m_timer_task = new LambdaContext(
+    [this, oldest_time](int r) {
+      ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+      std::lock_guard locker{m_lock};
+      m_timer_task = nullptr;
+
+      remove_instances(oldest_time);
+    });
+
+  oldest_time += ceph::make_timespan(after);
+  m_threads->timer->add_event_at(oldest_time, m_timer_task);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::Instances<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/Instances.h b/src/tools/rbd_mirror/Instances.h
new file mode 100644
index 000000000..e6e104b73
--- /dev/null
+++ b/src/tools/rbd_mirror/Instances.h
@@ -0,0 +1,168 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_INSTANCES_H
+#define CEPH_RBD_MIRROR_INSTANCES_H
+
+#include <map>
+#include <vector>
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "librbd/Watcher.h"
+#include "tools/rbd_mirror/instances/Types.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+// Tracks the set of live rbd-mirror instances in a pool from their heartbeat
+// acks, notifies a listener of additions/removals, and garbage-collects
+// instances that stop acking (see the state diagram below).
+template <typename ImageCtxT = librbd::ImageCtx>
+class Instances {
+public:
+  typedef std::vector<std::string> InstanceIds;
+
+  static Instances *create(Threads<ImageCtxT> *threads,
+                           librados::IoCtx &ioctx,
+                           const std::string& instance_id,
+                           instances::Listener& listener) {
+    return new Instances(threads, ioctx, instance_id, listener);
+  }
+  void destroy() {
+    delete this;
+  }
+
+  Instances(Threads<ImageCtxT> *threads, librados::IoCtx &ioctx,
+            const std::string& instance_id, instances::Listener& listener);
+  virtual ~Instances();
+
+  void init(Context *on_finish);
+  void shut_down(Context *on_finish);
+
+  // start delivering "added" callbacks (they are queued until this is called)
+  void unblock_listener();
+
+  // record heartbeat acks from the given instances
+  void acked(const InstanceIds& instance_ids);
+
+  void list(std::vector<std::string> *instance_ids);
+
+private:
+  /**
+   * @verbatim
+   *
+   * <uninitialized> <---------------------\
+   *    | (init)           ^               |
+   *    v        (error)   *               |
+   * GET_INSTANCES * * * * *     WAIT_FOR_OPS
+   *    |                                  ^
+   *    v          (shut_down)             |
+   * <initialized> ------------------------/
+   *      .
+   *      . (remove_instance)
+   *      v
+   *   REMOVE_INSTANCE
+   *
+   * @endverbatim
+   */
+
+  enum InstanceState {
+    INSTANCE_STATE_ADDING,
+    INSTANCE_STATE_IDLE,
+    INSTANCE_STATE_REMOVING
+  };
+
+  using clock_t = ceph::real_clock;
+  // per-instance record: last heartbeat ack time + lifecycle state
+  struct Instance {
+    clock_t::time_point acked_time{};
+    InstanceState state = INSTANCE_STATE_ADDING;
+  };
+
+  // base for async callbacks that must be covered by m_async_op_tracker
+  struct C_NotifyBase : public Context {
+    Instances *instances;
+    InstanceIds instance_ids;
+
+    C_NotifyBase(Instances *instances, const InstanceIds& instance_ids)
+      : instances(instances), instance_ids(instance_ids) {
+      instances->m_async_op_tracker.start_op();
+    }
+
+    void finish(int r) override {
+      execute();
+      instances->m_async_op_tracker.finish_op();
+    }
+
+    virtual void execute() = 0;
+  };
+
+  struct C_HandleAcked : public C_NotifyBase {
+    C_HandleAcked(Instances *instances, const InstanceIds& instance_ids)
+      : C_NotifyBase(instances, instance_ids) {
+    }
+
+    void execute() override {
+      this->instances->handle_acked(this->instance_ids);
+    }
+  };
+
+  struct C_NotifyInstancesAdded : public C_NotifyBase {
+    C_NotifyInstancesAdded(Instances *instances,
+                           const InstanceIds& instance_ids)
+      : C_NotifyBase(instances, instance_ids) {
+    }
+
+    void execute() override {
+      this->instances->notify_instances_added(this->instance_ids);
+    }
+  };
+
+  struct C_NotifyInstancesRemoved : public C_NotifyBase {
+    C_NotifyInstancesRemoved(Instances *instances,
+                             const InstanceIds& instance_ids)
+      : C_NotifyBase(instances, instance_ids) {
+    }
+
+    void execute() override {
+      this->instances->notify_instances_removed(this->instance_ids);
+    }
+  };
+
+  Threads<ImageCtxT> *m_threads;
+  librados::IoCtx &m_ioctx;
+  std::string m_instance_id;
+  instances::Listener& m_listener;
+  CephContext *m_cct;
+
+  ceph::mutex m_lock;
+  InstanceIds m_instance_ids;
+  std::map<std::string, Instance> m_instances;
+  // non-null also means "init or shut down in progress"
+  Context *m_on_finish = nullptr;
+  AsyncOpTracker m_async_op_tracker;
+
+  // pending stale-instance removal timer event (guarded by timer lock+m_lock)
+  Context *m_timer_task = nullptr;
+
+  bool m_listener_blocked = true;
+
+  void handle_acked(const InstanceIds& instance_ids);
+  void notify_instances_added(const InstanceIds& instance_ids);
+  void notify_instances_removed(const InstanceIds& instance_ids);
+
+  void get_instances();
+  void handle_get_instances(int r);
+
+  void wait_for_ops();
+  void handle_wait_for_ops(int r);
+
+  void remove_instances(const clock_t::time_point& time);
+  void handle_remove_instances(int r, const InstanceIds& instance_ids);
+
+  void cancel_remove_task();
+  void schedule_remove_task(const clock_t::time_point& time);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_INSTANCES_H
diff --git a/src/tools/rbd_mirror/LeaderWatcher.cc b/src/tools/rbd_mirror/LeaderWatcher.cc
new file mode 100644
index 000000000..8f12af14c
--- /dev/null
+++ b/src/tools/rbd_mirror/LeaderWatcher.cc
@@ -0,0 +1,1069 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LeaderWatcher.h"
+#include "common/Cond.h"
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "include/stringify.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/watcher/Types.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::LeaderWatcher: " \
+ << this << " " << __func__ << ": "
+namespace rbd {
+namespace mirror {
+
+using namespace leader_watcher;
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+// Construct the watcher on the RBD_MIRROR_LEADER object.  Note the Watcher
+// base subobject is initialized first, so its m_ioctx/m_oid/m_cct members
+// are already valid when the m_leader_lock initializer below uses them.
+template <typename I>
+LeaderWatcher<I>::LeaderWatcher(Threads<I> *threads, librados::IoCtx &io_ctx,
+                                leader_watcher::Listener *listener)
+  : Watcher(io_ctx, threads->work_queue, RBD_MIRROR_LEADER),
+    m_threads(threads), m_listener(listener), m_instances_listener(this),
+    m_lock(ceph::make_mutex("rbd::mirror::LeaderWatcher " +
+                            io_ctx.get_pool_name())),
+    m_notifier_id(librados::Rados(io_ctx).get_instance_id()),
+    m_instance_id(stringify(m_notifier_id)),
+    m_leader_lock(new LeaderLock(m_ioctx, *m_threads->asio_engine, m_oid, this,
+                                 true, m_cct->_conf.get_val<uint64_t>(
+                                   "rbd_blocklist_expire_seconds"))) {
+}
+
+// Destructor: asserts that shut_down() already tore down the Instances
+// object and canceled the pending timer task before destruction.
+template <typename I>
+LeaderWatcher<I>::~LeaderWatcher() {
+  ceph_assert(m_instances == nullptr);
+  ceph_assert(m_timer_task == nullptr);
+
+  delete m_leader_lock;
+}
+
+// Returns this daemon's instance id (the stringified rados instance id
+// captured at construction time); immutable, so no locking needed.
+template <typename I>
+std::string LeaderWatcher<I>::get_instance_id() {
+  return m_instance_id;
+}
+
+// Synchronous wrapper: runs the async init() and blocks for its result.
+template <typename I>
+int LeaderWatcher<I>::init() {
+  C_SaferCond init_ctx;
+  init(&init_ctx);
+  return init_ctx.wait();
+}
+
+// Begin async initialization: stash the completion in m_on_finish (must be
+// free) and start by ensuring the leader object exists.
+template <typename I>
+void LeaderWatcher<I>::init(Context *on_finish) {
+  dout(10) << "notifier_id=" << m_notifier_id << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  ceph_assert(m_on_finish == nullptr);
+  m_on_finish = on_finish;
+
+  create_leader_object();
+}
+
+// Create the leader object if absent; op.create(false) (non-exclusive)
+// makes this idempotent when the object already exists.
+template <typename I>
+void LeaderWatcher<I>::create_leader_object() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  librados::ObjectWriteOperation op;
+  op.create(false);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    LeaderWatcher<I>, &LeaderWatcher<I>::handle_create_leader_object>(this);
+  int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion of leader-object creation: on success continue to watch
+// registration; on error swap out m_on_finish and complete it with the
+// error *outside* m_lock.
+template <typename I>
+void LeaderWatcher<I>::handle_create_leader_object(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+
+    if (r == 0) {
+      register_watch();
+      return;
+    }
+
+    derr << "error creating " << m_oid << " object: " << cpp_strerror(r)
+         << dendl;
+
+    std::swap(on_finish, m_on_finish);
+  }
+  on_finish->complete(r);
+}
+
+// Register the rados watch on the leader object; completion is bounced
+// through the work queue to avoid deep callback chains.
+template <typename I>
+void LeaderWatcher<I>::register_watch() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_register_watch>(this));
+
+  librbd::Watcher::register_watch(ctx);
+}
+
+// Watch registration completion: on success schedule an immediate attempt
+// to acquire the leader lock (delay factor 0); either way the stored init
+// completion fires with r.  Locks are taken timer_lock then m_lock.
+template <typename I>
+void LeaderWatcher<I>::handle_register_watch(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard timer_locker(m_threads->timer_lock);
+    std::lock_guard locker{m_lock};
+
+    if (r < 0) {
+      derr << "error registering leader watcher for " << m_oid << " object: "
+           << cpp_strerror(r) << dendl;
+    } else {
+      schedule_acquire_leader_lock(0);
+    }
+
+    ceph_assert(m_on_finish != nullptr);
+    std::swap(on_finish, m_on_finish);
+  }
+
+  on_finish->complete(r);
+}
+
+// Synchronous wrapper: shutdown is expected to always succeed (asserted).
+template <typename I>
+void LeaderWatcher<I>::shut_down() {
+  C_SaferCond shut_down_ctx;
+  shut_down(&shut_down_ctx);
+  int r = shut_down_ctx.wait();
+  ceph_assert(r == 0);
+}
+
+// Begin async shutdown: record the completion in m_on_shut_down_finish
+// (also acts as the "shutting down" flag for schedule_timer_task), cancel
+// any pending timer task, then tear down the leader lock.
+template <typename I>
+void LeaderWatcher<I>::shut_down(Context *on_finish) {
+  dout(10) << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+
+  ceph_assert(m_on_shut_down_finish == nullptr);
+  m_on_shut_down_finish = on_finish;
+  cancel_timer_task();
+  shut_down_leader_lock();
+}
+
+// Shut down the managed leader lock; continuation runs via the work queue.
+template <typename I>
+void LeaderWatcher<I>::shut_down_leader_lock() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_shut_down_leader_lock>(this));
+
+  m_leader_lock->shut_down(ctx);
+}
+
+// Leader-lock shutdown completion: errors are logged but shutdown proceeds
+// to unregistering the watch regardless.
+template <typename I>
+void LeaderWatcher<I>::handle_shut_down_leader_lock(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (r < 0) {
+    derr << "error shutting down leader lock: " << cpp_strerror(r) << dendl;
+  }
+
+  unregister_watch();
+}
+
+// Unregister the rados watch on the leader object.
+template <typename I>
+void LeaderWatcher<I>::unregister_watch() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_unregister_watch>(this));
+
+  librbd::Watcher::unregister_watch(ctx);
+}
+
+// Watch unregistration completion: errors are logged only; shutdown always
+// proceeds to draining outstanding timer tasks.
+template <typename I>
+void LeaderWatcher<I>::handle_unregister_watch(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "error unregistering leader watcher for " << m_oid << " object: "
+         << cpp_strerror(r) << dendl;
+  }
+  wait_for_tasks();
+}
+
+// Drain in-flight timer work: schedule a zero-delay task with
+// shutting_down=true so it bypasses the m_on_shut_down_finish guard in
+// schedule_timer_task().
+template <typename I>
+void LeaderWatcher<I>::wait_for_tasks() {
+  dout(10) << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  schedule_timer_task("wait for tasks", 0, false,
+                      &LeaderWatcher<I>::handle_wait_for_tasks, true);
+}
+
+// Final shutdown step: release the timer op slot, then complete
+// m_on_shut_down_finish from the work queue so neither timer_lock nor
+// m_lock is held when the caller's completion runs.
+template <typename I>
+void LeaderWatcher<I>::handle_wait_for_tasks() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(m_on_shut_down_finish != nullptr);
+
+  ceph_assert(!m_timer_op_tracker.empty());
+  m_timer_op_tracker.finish_op();
+
+  auto ctx = new LambdaContext([this](int r) {
+      Context *on_finish;
+      {
+        // ensure lock isn't held when completing shut down
+        std::lock_guard locker{m_lock};
+        ceph_assert(m_on_shut_down_finish != nullptr);
+        on_finish = m_on_shut_down_finish;
+      }
+      on_finish->complete(0);
+    });
+  m_work_queue->queue(ctx, 0);
+}
+
+// True once handle_rewatch_complete() observed -EBLOCKLISTED.
+template <typename I>
+bool LeaderWatcher<I>::is_blocklisted() const {
+  std::lock_guard locker{m_lock};
+  return m_blocklisted;
+}
+
+// Public locking wrapper around the lock-held is_leader() overload.
+template <typename I>
+bool LeaderWatcher<I>::is_leader() const {
+  std::lock_guard locker{m_lock};
+  return is_leader(m_lock);
+}
+
+// Lock-held variant: the mutex parameter only documents that the caller
+// holds m_lock (asserted); the query is answered by the managed lock state.
+template <typename I>
+bool LeaderWatcher<I>::is_leader(ceph::mutex &lock) const {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  bool leader = m_leader_lock->is_leader();
+  dout(10) << leader << dendl;
+  return leader;
+}
+
+// Public locking wrapper around the lock-held is_releasing_leader().
+template <typename I>
+bool LeaderWatcher<I>::is_releasing_leader() const {
+  std::lock_guard locker{m_lock};
+  return is_releasing_leader(m_lock);
+}
+
+// Lock-held variant; mutex parameter documents that m_lock is held.
+template <typename I>
+bool LeaderWatcher<I>::is_releasing_leader(ceph::mutex &lock) const {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  bool releasing = m_leader_lock->is_releasing_leader();
+  dout(10) << releasing << dendl;
+  return releasing;
+}
+
+// Resolve the current leader's instance id: ourselves while we hold (or
+// are releasing) the lock, otherwise the cached remote locker's entity
+// gid.  Returns false when no leader is known.
+template <typename I>
+bool LeaderWatcher<I>::get_leader_instance_id(std::string *instance_id) const {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (is_leader(m_lock) || is_releasing_leader(m_lock)) {
+    *instance_id = m_instance_id;
+    return true;
+  }
+
+  if (!m_locker.cookie.empty()) {
+    *instance_id = stringify(m_locker.entity.num());
+    return true;
+  }
+
+  return false;
+}
+
+// Voluntarily give up leadership; no-op when not currently the leader.
+template <typename I>
+void LeaderWatcher<I>::release_leader() {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+  if (!is_leader(m_lock)) {
+    return;
+  }
+
+  release_leader_lock();
+}
+
+// Copy the known instance ids into *instance_ids; yields an empty list
+// when the Instances object is not initialized (i.e. not leader).
+template <typename I>
+void LeaderWatcher<I>::list_instances(std::vector<std::string> *instance_ids) {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  instance_ids->clear();
+  if (m_instances != nullptr) {
+    m_instances->list(instance_ids);
+  }
+}
+
+// Cancel any pending timer event; cancellation must succeed (asserted)
+// because timer_lock is held, so the event cannot be mid-dispatch.
+template <typename I>
+void LeaderWatcher<I>::cancel_timer_task() {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  if (m_timer_task == nullptr) {
+    return;
+  }
+
+  dout(10) << m_timer_task << dendl;
+  bool canceled = m_threads->timer->cancel_event(m_timer_task);
+  ceph_assert(canceled);
+  m_timer_task = nullptr;
+}
+
+// Schedule (replacing any pending) timer task.  Tasks are serialized via
+// m_timer_op_tracker: if the previous task is still in flight, the new one
+// is parked in m_timer_gate (latest request wins) and dispatched once the
+// tracker drains.  New tasks are refused during shutdown unless
+// shutting_down is set.  Delay = delay_factor * heartbeat interval.
+template <typename I>
+void LeaderWatcher<I>::schedule_timer_task(const std::string &name,
+                                           int delay_factor, bool leader,
+                                           TimerCallback timer_callback,
+                                           bool shutting_down) {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  if (!shutting_down && m_on_shut_down_finish != nullptr) {
+    return;
+  }
+
+  cancel_timer_task();
+
+  m_timer_task = new LambdaContext(
+    [this, leader, timer_callback](int r) {
+      ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+      m_timer_task = nullptr;
+
+      if (m_timer_op_tracker.empty()) {
+        std::lock_guard locker{m_lock};
+        execute_timer_task(leader, timer_callback);
+        return;
+      }
+
+      // old timer task is still running -- do not start next
+      // task until the previous task completes
+      if (m_timer_gate == nullptr) {
+        m_timer_gate = new C_TimerGate(this);
+        m_timer_op_tracker.wait_for_ops(m_timer_gate);
+      }
+      m_timer_gate->leader = leader;
+      m_timer_gate->timer_callback = timer_callback;
+    });
+
+  int after = delay_factor * m_cct->_conf.get_val<uint64_t>(
+    "rbd_mirror_leader_heartbeat_interval");
+
+  dout(10) << "scheduling " << name << " after " << after << " sec (task "
+           << m_timer_task << ")" << dendl;
+  m_threads->timer->add_event_after(after, m_timer_task);
+}
+
+// Run a timer task: it is dropped if our leadership state changed since it
+// was scheduled (the 'leader' flag records the expected state).  Otherwise
+// an op slot is claimed; the callback is responsible for finish_op().
+template <typename I>
+void LeaderWatcher<I>::execute_timer_task(bool leader,
+                                          TimerCallback timer_callback) {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(m_timer_op_tracker.empty());
+
+  if (is_leader(m_lock) != leader) {
+    return;
+  }
+
+  m_timer_op_tracker.start_op();
+  (this->*timer_callback)();
+}
+
+// LeaderLock post-acquire hook: on success start the leader bring-up
+// sequence (init instances -> notify listener -> notify lock_acquired);
+// on_finish is parked in m_on_finish until that chain completes.
+template <typename I>
+void LeaderWatcher<I>::handle_post_acquire_leader_lock(int r,
+                                                       Context *on_finish) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -EAGAIN) {
+      dout(10) << "already locked" << dendl;
+    } else {
+      derr << "error acquiring leader lock: " << cpp_strerror(r) << dendl;
+    }
+    on_finish->complete(r);
+    return;
+  }
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_on_finish == nullptr);
+  m_on_finish = on_finish;
+  m_ret_val = 0;
+
+  init_instances();
+}
+
+// LeaderLock pre-release hook: notify the listener (pre_release path) and
+// shut down instances before the lock is actually released.
+template <typename I>
+void LeaderWatcher<I>::handle_pre_release_leader_lock(Context *on_finish) {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_on_finish == nullptr);
+  m_on_finish = on_finish;
+  m_ret_val = 0;
+
+  notify_listener();
+}
+
+// LeaderLock post-release hook: broadcast lock_released to peers; the
+// stored completion fires once the notification round-trip finishes.
+template <typename I>
+void LeaderWatcher<I>::handle_post_release_leader_lock(int r,
+                                                       Context *on_finish) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    on_finish->complete(r);
+    return;
+  }
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_on_finish == nullptr);
+  m_on_finish = on_finish;
+
+  notify_lock_released();
+}
+
+// Forcibly break a dead leader's lock (runs as a timer op).  If we have no
+// cached locker info yet, refresh it first via get_locker().
+template <typename I>
+void LeaderWatcher<I>::break_leader_lock() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  if (m_locker.cookie.empty()) {
+    get_locker();
+    return;
+  }
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_break_leader_lock>(this));
+
+  m_leader_lock->break_lock(m_locker, true, ctx);
+}
+
+// Break-lock completion: -ENOENT (lock already gone) counts as success;
+// on success reset locker state and immediately try to acquire (still
+// holding the timer op slot); on failure retry acquisition later.
+template <typename I>
+void LeaderWatcher<I>::handle_break_leader_lock(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  if (m_leader_lock->is_shutdown()) {
+    dout(10) << "canceling due to shutdown" << dendl;
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  if (r < 0 && r != -ENOENT) {
+    derr << "error breaking leader lock: " << cpp_strerror(r) << dendl;
+    schedule_acquire_leader_lock(1);
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  m_locker = {};
+  m_acquire_attempts = 0;
+  acquire_leader_lock();
+}
+
+// Schedule a locker refresh; reset_leader also clears the cached locker
+// and the failed-acquire counter.
+template <typename I>
+void LeaderWatcher<I>::schedule_get_locker(bool reset_leader,
+                                           uint32_t delay_factor) {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  if (reset_leader) {
+    m_locker = {};
+    m_acquire_attempts = 0;
+  }
+
+  schedule_timer_task("get locker", delay_factor, false,
+                      &LeaderWatcher<I>::get_locker, false);
+}
+
+// Query who currently owns the leader lock (runs as a timer op); result is
+// delivered to handle_get_locker() through the C_GetLocker adaptor.
+template <typename I>
+void LeaderWatcher<I>::get_locker() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  C_GetLocker *get_locker_ctx = new C_GetLocker(this);
+  Context *ctx = create_async_context_callback(m_work_queue, get_locker_ctx);
+
+  m_leader_lock->get_locker(&get_locker_ctx->locker, ctx);
+}
+
+// Locker query completion.  Decides the next step of the secondary state
+// machine: acquire immediately if the lock is free (-ENOENT), break the
+// lock after too many failed acquire attempts against the same owner,
+// otherwise retry later.  A changed owner resets the attempt counter and
+// triggers an update_leader_handler() notification on the work queue; the
+// timer op slot is held until that notification completes.
+template <typename I>
+void LeaderWatcher<I>::handle_get_locker(int r,
+                                         librbd::managed_lock::Locker& locker) {
+  dout(10) << "r=" << r << dendl;
+
+  std::scoped_lock l{m_threads->timer_lock, m_lock};
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  if (m_leader_lock->is_shutdown()) {
+    dout(10) << "canceling due to shutdown" << dendl;
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  if (is_leader(m_lock)) {
+    m_locker = {};
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  if (r == -ENOENT) {
+    m_locker = {};
+    m_acquire_attempts = 0;
+    acquire_leader_lock();
+    return;
+  } else if (r < 0) {
+    derr << "error retrieving leader locker: " << cpp_strerror(r) << dendl;
+    schedule_get_locker(true, 1);
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  bool notify_listener = false;
+  if (m_locker != locker) {
+    m_locker = locker;
+    notify_listener = true;
+    if (m_acquire_attempts > 1) {
+      dout(10) << "new lock owner detected -- resetting heartbeat counter"
+               << dendl;
+      m_acquire_attempts = 0;
+    }
+  }
+
+  if (m_acquire_attempts >= m_cct->_conf.get_val<uint64_t>(
+        "rbd_mirror_leader_max_acquire_attempts_before_break")) {
+    dout(0) << "breaking leader lock after " << m_acquire_attempts << " "
+            << "failed attempts to acquire" << dendl;
+    break_leader_lock();
+    return;
+  }
+
+  schedule_acquire_leader_lock(1);
+
+  if (!notify_listener) {
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  auto ctx = new LambdaContext(
+    [this](int r) {
+      std::string instance_id;
+      if (get_leader_instance_id(&instance_id)) {
+        m_listener->update_leader_handler(instance_id);
+      }
+      std::scoped_lock locker{m_threads->timer_lock, m_lock};
+      m_timer_op_tracker.finish_op();
+    });
+  m_work_queue->queue(ctx, 0);
+}
+
+// Schedule an acquire attempt after delay_factor * max_missed_heartbeats
+// heartbeat intervals (the multiplication by the heartbeat interval itself
+// happens inside schedule_timer_task()).
+template <typename I>
+void LeaderWatcher<I>::schedule_acquire_leader_lock(uint32_t delay_factor) {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  schedule_timer_task("acquire leader lock",
+                      delay_factor *
+                        m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_missed_heartbeats"),
+                      false, &LeaderWatcher<I>::acquire_leader_lock, false);
+}
+
+// Attempt to take the leader lock (runs as a timer op); each attempt bumps
+// m_acquire_attempts, which feeds the break-lock threshold.
+template <typename I>
+void LeaderWatcher<I>::acquire_leader_lock() {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  ++m_acquire_attempts;
+  dout(10) << "acquire_attempts=" << m_acquire_attempts << dendl;
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_acquire_leader_lock>(this));
+  m_leader_lock->try_acquire_lock(ctx);
+}
+
+// Acquire completion (still holding the timer op slot).  Failure falls
+// back to querying the current locker; success starts heartbeating unless
+// an earlier listener-notify error (m_ret_val) forces a release.
+template <typename I>
+void LeaderWatcher<I>::handle_acquire_leader_lock(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  if (m_leader_lock->is_shutdown()) {
+    dout(10) << "canceling due to shutdown" << dendl;
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  if (r < 0) {
+    if (r == -EAGAIN) {
+      dout(10) << "already locked" << dendl;
+    } else {
+      derr << "error acquiring lock: " << cpp_strerror(r) << dendl;
+    }
+
+    get_locker();
+    return;
+  }
+
+  m_locker = {};
+  m_acquire_attempts = 0;
+
+  if (m_ret_val) {
+    dout(5) << "releasing due to error on notify" << dendl;
+    release_leader_lock();
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  notify_heartbeat();
+}
+
+// Release the leader lock; the LeaderLock pre/post release hooks drive the
+// listener notification and peer broadcast.
+template <typename I>
+void LeaderWatcher<I>::release_leader_lock() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_release_leader_lock>(this));
+
+  m_leader_lock->release_lock(ctx);
+}
+
+// Release completion: on success go back to contending for leadership.
+// NOTE(review): on error this returns without rescheduling an acquire
+// attempt -- presumably recovery then relies on incoming peer
+// notifications/rewatch; confirm this is intended.
+template <typename I>
+void LeaderWatcher<I>::handle_release_leader_lock(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+
+  if (r < 0) {
+    derr << "error releasing lock: " << cpp_strerror(r) << dendl;
+    return;
+  }
+
+  schedule_acquire_leader_lock(1);
+}
+
+// Leader bring-up: create and initialize the Instances tracker.
+template <typename I>
+void LeaderWatcher<I>::init_instances() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(m_instances == nullptr);
+
+  m_instances = Instances<I>::create(m_threads, m_ioctx, m_instance_id,
+                                     m_instances_listener);
+
+  Context *ctx = create_context_callback<
+    LeaderWatcher<I>, &LeaderWatcher<I>::handle_init_instances>(this);
+
+  m_instances->init(ctx);
+}
+
+// Instances init completion: on error destroy the tracker and complete the
+// stored callback with the error (outside m_lock); on success continue the
+// bring-up by notifying the listener.
+template <typename I>
+void LeaderWatcher<I>::handle_init_instances(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  if (r < 0) {
+    std::lock_guard locker{m_lock};
+    derr << "error initializing instances: " << cpp_strerror(r) << dendl;
+    m_instances->destroy();
+    m_instances = nullptr;
+
+    ceph_assert(m_on_finish != nullptr);
+    std::swap(m_on_finish, on_finish);
+  } else {
+    std::lock_guard locker{m_lock};
+    notify_listener();
+    return;
+  }
+
+  on_finish->complete(r);
+}
+
+// Leader tear-down: asynchronously shut down the Instances tracker.
+template <typename I>
+void LeaderWatcher<I>::shut_down_instances() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(m_instances != nullptr);
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<LeaderWatcher<I>,
+    &LeaderWatcher<I>::handle_shut_down_instances>(this));
+
+  m_instances->shut_down(ctx);
+}
+
+// Instances shutdown completion (must succeed, asserted): destroy the
+// tracker and complete the pending callback outside m_lock.
+template <typename I>
+void LeaderWatcher<I>::handle_shut_down_instances(int r) {
+  dout(10) << "r=" << r << dendl;
+  ceph_assert(r == 0);
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+
+    m_instances->destroy();
+    m_instances = nullptr;
+
+    ceph_assert(m_on_finish != nullptr);
+    std::swap(m_on_finish, on_finish);
+  }
+  on_finish->complete(r);
+}
+
+// Invoke the external listener on the work queue: post_acquire_handler()
+// when we are (becoming) leader, pre_release_handler() otherwise.  The
+// inner ctx is handed to the listener so its completion resumes this state
+// machine in handle_notify_listener().
+template <typename I>
+void LeaderWatcher<I>::notify_listener() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_async_context_callback(
+    m_work_queue, create_context_callback<
+      LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_listener>(this));
+
+  if (is_leader(m_lock)) {
+    ctx = new LambdaContext(
+      [this, ctx](int r) {
+        m_listener->post_acquire_handler(ctx);
+      });
+  } else {
+    ctx = new LambdaContext(
+      [this, ctx](int r) {
+        m_listener->pre_release_handler(ctx);
+      });
+  }
+  m_work_queue->queue(ctx, 0);
+}
+
+// Listener callback completion: an error is latched in m_ret_val (acted on
+// later).  Leaders proceed to broadcast lock_acquired; non-leaders (the
+// pre-release path) proceed to shutting down instances.
+template <typename I>
+void LeaderWatcher<I>::handle_notify_listener(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (r < 0) {
+    derr << "error notifying listener: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+  }
+
+  if (is_leader(m_lock)) {
+    notify_lock_acquired();
+  } else {
+    shut_down_instances();
+  }
+}
+
+// Broadcast a LockAcquiredPayload to all peers watching the leader object.
+template <typename I>
+void LeaderWatcher<I>::notify_lock_acquired() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_context_callback<
+    LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_lock_acquired>(this);
+
+  bufferlist bl;
+  encode(NotifyMessage{LockAcquiredPayload{}}, bl);
+
+  send_notify(bl, nullptr, ctx);
+}
+
+// lock_acquired broadcast completion: -ETIMEDOUT from peers is tolerated.
+// If the whole bring-up chain succeeded (m_ret_val == 0) the Instances
+// listener is unblocked so add/remove events start flowing; the stored
+// callback is always completed with 0 (errors are handled via m_ret_val in
+// handle_acquire_leader_lock()).
+template <typename I>
+void LeaderWatcher<I>::handle_notify_lock_acquired(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    if (r < 0 && r != -ETIMEDOUT) {
+      derr << "error notifying leader lock acquired: " << cpp_strerror(r)
+           << dendl;
+      m_ret_val = r;
+    }
+
+    ceph_assert(m_on_finish != nullptr);
+    std::swap(m_on_finish, on_finish);
+
+    if (m_ret_val == 0) {
+      // listener should be ready for instance add/remove events now
+      m_instances->unblock_listener();
+    }
+  }
+  on_finish->complete(0);
+}
+
+// Broadcast a LockReleasedPayload to all peers watching the leader object.
+template <typename I>
+void LeaderWatcher<I>::notify_lock_released() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  Context *ctx = create_context_callback<
+    LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_lock_released>(this);
+
+  bufferlist bl;
+  encode(NotifyMessage{LockReleasedPayload{}}, bl);
+
+  send_notify(bl, nullptr, ctx);
+}
+
+// lock_released broadcast completion: -ETIMEDOUT tolerated (logged only);
+// completes the stored post-release callback outside m_lock.
+template <typename I>
+void LeaderWatcher<I>::handle_notify_lock_released(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  Context *on_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    if (r < 0 && r != -ETIMEDOUT) {
+      derr << "error notifying leader lock released: " << cpp_strerror(r)
+           << dendl;
+    }
+
+    ceph_assert(m_on_finish != nullptr);
+    std::swap(m_on_finish, on_finish);
+  }
+  on_finish->complete(r);
+}
+
+// Send a heartbeat to peers (runs as a timer op); canceled if leadership
+// was lost meanwhile.  Acks are collected into m_heartbeat_response so the
+// completion can learn which instances are alive.
+template <typename I>
+void LeaderWatcher<I>::notify_heartbeat() {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  if (!is_leader(m_lock)) {
+    dout(5) << "not leader, canceling" << dendl;
+    m_timer_op_tracker.finish_op();
+    return;
+  }
+
+  Context *ctx = create_context_callback<
+    LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_heartbeat>(this);
+
+  bufferlist bl;
+  encode(NotifyMessage{HeartbeatPayload{}}, bl);
+
+  m_heartbeat_response.acks.clear();
+  send_notify(bl, &m_heartbeat_response, ctx);
+}
+
+// Heartbeat completion: persistent notify errors (other than -ETIMEDOUT)
+// cause us to step down; otherwise the gids of acking peers are fed to the
+// Instances tracker as alive, and the next heartbeat is scheduled.
+template <typename I>
+void LeaderWatcher<I>::handle_notify_heartbeat(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  ceph_assert(!m_timer_op_tracker.empty());
+
+  m_timer_op_tracker.finish_op();
+  if (m_leader_lock->is_shutdown()) {
+    dout(10) << "canceling due to shutdown" << dendl;
+    return;
+  } else if (!is_leader(m_lock)) {
+    return;
+  }
+
+  if (r < 0 && r != -ETIMEDOUT) {
+    derr << "error notifying heartbeat: " << cpp_strerror(r)
+         << ",  releasing leader" << dendl;
+    release_leader_lock();
+    return;
+  }
+
+  dout(10) << m_heartbeat_response.acks.size() << " acks received, "
+           << m_heartbeat_response.timeouts.size() << " timed out" << dendl;
+
+  std::vector<std::string> instance_ids;
+  for (auto &it: m_heartbeat_response.acks) {
+    uint64_t notifier_id = it.first.gid;
+    instance_ids.push_back(stringify(notifier_id));
+  }
+  if (!instance_ids.empty()) {
+    m_instances->acked(instance_ids);
+  }
+
+  schedule_timer_task("heartbeat", 1, true,
+                      &LeaderWatcher<I>::notify_heartbeat, false);
+}
+
+// Incoming heartbeat from the leader: ignored if we are leader ourselves;
+// if we know the locker, the heartbeat proves the leader is alive, so the
+// acquire counter is reset and the next acquire attempt is pushed out.
+template <typename I>
+void LeaderWatcher<I>::handle_heartbeat(Context *on_notify_ack) {
+  dout(10) << dendl;
+
+  {
+    std::scoped_lock locker{m_threads->timer_lock, m_lock};
+    if (is_leader(m_lock)) {
+      dout(5) << "got another leader heartbeat, ignoring" << dendl;
+    } else if (!m_locker.cookie.empty()) {
+      cancel_timer_task();
+      m_acquire_attempts = 0;
+      schedule_acquire_leader_lock(1);
+    }
+  }
+
+  on_notify_ack->complete(0);
+}
+
+// Incoming lock_acquired from a peer: a new leader exists, so refresh the
+// locker immediately (delay 0) with our cached state reset.
+template <typename I>
+void LeaderWatcher<I>::handle_lock_acquired(Context *on_notify_ack) {
+  dout(10) << dendl;
+
+  {
+    std::scoped_lock locker{m_threads->timer_lock, m_lock};
+    if (is_leader(m_lock)) {
+      dout(5) << "got another leader lock_acquired, ignoring" << dendl;
+    } else {
+      cancel_timer_task();
+      schedule_get_locker(true, 0);
+    }
+  }
+
+  on_notify_ack->complete(0);
+}
+
+// Incoming lock_released from a peer: the leadership is up for grabs, so
+// refresh the locker immediately (delay 0) with our cached state reset.
+template <typename I>
+void LeaderWatcher<I>::handle_lock_released(Context *on_notify_ack) {
+  dout(10) << dendl;
+
+  {
+    std::scoped_lock locker{m_threads->timer_lock, m_lock};
+    if (is_leader(m_lock)) {
+      dout(5) << "got another leader lock_released, ignoring" << dendl;
+    } else {
+      cancel_timer_task();
+      schedule_get_locker(true, 0);
+    }
+  }
+
+  on_notify_ack->complete(0);
+}
+
+// Rados watch notification entry point: our own notifications are acked
+// and ignored; otherwise the payload is decoded (decode failures are acked
+// too, to avoid stalling the notifier) and dispatched to the matching
+// handle_payload() overload via the visitor.
+template <typename I>
+void LeaderWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+                                     uint64_t notifier_id, bufferlist &bl) {
+  dout(10) << "notify_id=" << notify_id << ", handle=" << handle << ", "
+           << "notifier_id=" << notifier_id << dendl;
+
+  Context *ctx = new C_NotifyAck(this, notify_id, handle);
+
+  if (notifier_id == m_notifier_id) {
+    dout(10) << "our own notification, ignoring" << dendl;
+    ctx->complete(0);
+    return;
+  }
+
+  NotifyMessage notify_message;
+  try {
+    auto iter = bl.cbegin();
+    decode(notify_message, iter);
+  } catch (const buffer::error &err) {
+    derr << "error decoding image notification: " << err.what() << dendl;
+    ctx->complete(0);
+    return;
+  }
+
+  apply_visitor(HandlePayloadVisitor(this, ctx), notify_message.payload);
+}
+
+// Watch re-established (or lost): blocklisting latches m_blocklisted and
+// stops; otherwise the leader lock is reacquired to revalidate ownership.
+template <typename I>
+void LeaderWatcher<I>::handle_rewatch_complete(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  if (r == -EBLOCKLISTED) {
+    dout(1) << "blocklisted detected" << dendl;
+    m_blocklisted = true;
+    return;
+  }
+
+  m_leader_lock->reacquire_lock(nullptr);
+}
+
+// Visitor dispatch: heartbeat payload.
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const HeartbeatPayload &payload,
+                                      Context *on_notify_ack) {
+  dout(10) << "heartbeat" << dendl;
+
+  handle_heartbeat(on_notify_ack);
+}
+
+// Visitor dispatch: lock_acquired payload.
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const LockAcquiredPayload &payload,
+                                      Context *on_notify_ack) {
+  dout(10) << "lock_acquired" << dendl;
+
+  handle_lock_acquired(on_notify_ack);
+}
+
+// Visitor dispatch: lock_released payload.
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const LockReleasedPayload &payload,
+                                      Context *on_notify_ack) {
+  dout(10) << "lock_released" << dendl;
+
+  handle_lock_released(on_notify_ack);
+}
+
+// Visitor dispatch: unrecognized payload (newer peer version) -- ack and
+// ignore for forward compatibility.
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const UnknownPayload &payload,
+                                      Context *on_notify_ack) {
+  dout(10) << "unknown" << dendl;
+
+  on_notify_ack->complete(0);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::LeaderWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/LeaderWatcher.h b/src/tools/rbd_mirror/LeaderWatcher.h
new file mode 100644
index 000000000..58f23148f
--- /dev/null
+++ b/src/tools/rbd_mirror/LeaderWatcher.h
@@ -0,0 +1,313 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_LEADER_WATCHER_H
+#define CEPH_RBD_MIRROR_LEADER_WATCHER_H
+
+#include <list>
+#include <memory>
+#include <string>
+
+#include "common/AsyncOpTracker.h"
+#include "librbd/ManagedLock.h"
+#include "librbd/Watcher.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/watcher/Types.h"
+#include "Instances.h"
+#include "tools/rbd_mirror/instances/Types.h"
+#include "tools/rbd_mirror/leader_watcher/Types.h"
+
+namespace librbd {
+class ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class LeaderWatcher : protected librbd::Watcher {
+ using librbd::Watcher::unregister_watch; // Silence overloaded virtual warning
+public:
+ static LeaderWatcher* create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &io_ctx,
+ leader_watcher::Listener *listener) {
+ return new LeaderWatcher(threads, io_ctx, listener);
+ }
+
+ LeaderWatcher(Threads<ImageCtxT> *threads, librados::IoCtx &io_ctx,
+ leader_watcher::Listener *listener);
+ ~LeaderWatcher() override;
+
+ int init();
+ void shut_down();
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+ bool is_blocklisted() const;
+ bool is_leader() const;
+ bool is_releasing_leader() const;
+ bool get_leader_instance_id(std::string *instance_id) const;
+ void release_leader();
+ void list_instances(std::vector<std::string> *instance_ids);
+
+ std::string get_instance_id();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <uninitialized> <------------------------------ WAIT_FOR_TASKS
+ * | (init) ^ ^
+ * v * |
+ * CREATE_OBJECT * * * * * (error) UNREGISTER_WATCH
+ * | * ^
+ * v * |
+ * REGISTER_WATCH * * * * * SHUT_DOWN_LEADER_LOCK
+ * | ^
+ * | (no leader heartbeat and acquire failed) |
+ * | BREAK_LOCK <-------------------------------------\ |
+ * | | (no leader heartbeat) | | (shut down)
+ * | | /----------------------------------------\ | |
+ * | | | (lock_released received) | |
+ * | | | /-------------------------------------\ | |
+ * | | | | (lock_acquired or | | |
+ * | | | | heartbeat received) | | |
+ * | | | | (ENOENT) /-----------\ | | |
+ * | | | | * * * * * * * * * * | | | | |
+ * v v v v v (error) * v | | | |
+ * ACQUIRE_LEADER_LOCK * * * * *> GET_LOCKER ---> <secondary>
+ * | * ^
+ * ....|...................*.................... .....|.....................
+ * . v * . . | post_release .
+ * .INIT_INSTANCES * * * * * . .NOTIFY_LOCK_RELEASED .
+ * . | . .....^.....................
+ * . v . |
+ * .NOTIFY_LISTENER . RELEASE_LEADER_LOCK
+ * . | . ^
+ * . v . .....|.....................
+ * .NOTIFY_LOCK_ACQUIRED . . | .
+ * . | post_acquire . .SHUT_DOWN_INSTANCES .
+ * ....|........................................ . ^ .
+ * v . | .
+ * <leader> -----------------------------------> .NOTIFY_LISTENER .
+ * (shut_down, release_leader, . pre_release .
+ * notify error) ...........................
+ * @endverbatim
+ */
+
+ struct InstancesListener : public instances::Listener {
+ LeaderWatcher* leader_watcher;
+
+ InstancesListener(LeaderWatcher* leader_watcher)
+ : leader_watcher(leader_watcher) {
+ }
+
+ void handle_added(const InstanceIds& instance_ids) override {
+ leader_watcher->m_listener->handle_instances_added(instance_ids);
+ }
+
+ void handle_removed(const InstanceIds& instance_ids) override {
+ leader_watcher->m_listener->handle_instances_removed(instance_ids);
+ }
+ };
+
+ class LeaderLock : public librbd::ManagedLock<ImageCtxT> {
+ public:
+ typedef librbd::ManagedLock<ImageCtxT> Parent;
+
+ LeaderLock(librados::IoCtx& ioctx, librbd::AsioEngine& asio_engine,
+ const std::string& oid, LeaderWatcher *watcher,
+ bool blocklist_on_break_lock,
+ uint32_t blocklist_expire_seconds)
+ : Parent(ioctx, asio_engine, oid, watcher,
+ librbd::managed_lock::EXCLUSIVE, blocklist_on_break_lock,
+ blocklist_expire_seconds),
+ watcher(watcher) {
+ }
+
+ bool is_leader() const {
+ std::lock_guard locker{Parent::m_lock};
+ return Parent::is_state_post_acquiring() || Parent::is_state_locked();
+ }
+
+ bool is_releasing_leader() const {
+ std::lock_guard locker{Parent::m_lock};
+ return Parent::is_state_pre_releasing();
+ }
+
+ protected:
+ void post_acquire_lock_handler(int r, Context *on_finish) {
+ if (r == 0) {
+ // lock is owned at this point
+ std::lock_guard locker{Parent::m_lock};
+ Parent::set_state_post_acquiring();
+ }
+ watcher->handle_post_acquire_leader_lock(r, on_finish);
+ }
+ void pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) {
+ watcher->handle_pre_release_leader_lock(on_finish);
+ }
+ void post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) {
+ watcher->handle_post_release_leader_lock(r, on_finish);
+ }
+ private:
+ LeaderWatcher *watcher;
+ };
+
  // boost::variant visitor that dispatches a decoded notification payload
  // to the matching LeaderWatcher::handle_payload() overload.
  struct HandlePayloadVisitor : public boost::static_visitor<void> {
    LeaderWatcher *leader_watcher;
    Context *on_notify_ack;  // acked once the payload has been handled

    HandlePayloadVisitor(LeaderWatcher *leader_watcher, Context *on_notify_ack)
      : leader_watcher(leader_watcher), on_notify_ack(on_notify_ack) {
    }

    template <typename Payload>
    inline void operator()(const Payload &payload) const {
      leader_watcher->handle_payload(payload, on_notify_ack);
    }
  };
+
  // Completion for an async get-locker request; 'locker' is filled in by
  // the request and forwarded to the watcher together with the result.
  struct C_GetLocker : public Context {
    LeaderWatcher *leader_watcher;
    librbd::managed_lock::Locker locker;  // out-param populated by the request

    C_GetLocker(LeaderWatcher *leader_watcher)
      : leader_watcher(leader_watcher) {
    }

    void finish(int r) override {
      leader_watcher->handle_get_locker(r, locker);
    }
  };
+
+ typedef void (LeaderWatcher<ImageCtxT>::*TimerCallback)();
+
  // Timer completion that clears the watcher's pending timer gate and
  // then executes the scheduled member callback with the expected
  // leadership state.
  struct C_TimerGate : public Context {
    LeaderWatcher *leader_watcher;

    bool leader = false;                     // leadership state expected when firing
    TimerCallback timer_callback = nullptr;  // member-function callback to run

    C_TimerGate(LeaderWatcher *leader_watcher)
      : leader_watcher(leader_watcher) {
    }

    void finish(int r) override {
      leader_watcher->m_timer_gate = nullptr;
      leader_watcher->execute_timer_task(leader, timer_callback);
    }
  };
+
+ Threads<ImageCtxT> *m_threads;
+ leader_watcher::Listener *m_listener;
+
+ InstancesListener m_instances_listener;
+ mutable ceph::mutex m_lock;
+ uint64_t m_notifier_id;
+ std::string m_instance_id;
+ LeaderLock *m_leader_lock;
+ Context *m_on_finish = nullptr;
+ Context *m_on_shut_down_finish = nullptr;
+ uint64_t m_acquire_attempts = 0;
+ int m_ret_val = 0;
+ Instances<ImageCtxT> *m_instances = nullptr;
+ librbd::managed_lock::Locker m_locker;
+
+ bool m_blocklisted = false;
+
+ AsyncOpTracker m_timer_op_tracker;
+ Context *m_timer_task = nullptr;
+ C_TimerGate *m_timer_gate = nullptr;
+
+ librbd::watcher::NotifyResponse m_heartbeat_response;
+
+ bool is_leader(ceph::mutex &m_lock) const;
+ bool is_releasing_leader(ceph::mutex &m_lock) const;
+
+ void cancel_timer_task();
+ void schedule_timer_task(const std::string &name,
+ int delay_factor, bool leader,
+ TimerCallback callback, bool shutting_down);
+ void execute_timer_task(bool leader, TimerCallback timer_callback);
+
+ void create_leader_object();
+ void handle_create_leader_object(int r);
+
+ void register_watch();
+ void handle_register_watch(int r);
+
+ void shut_down_leader_lock();
+ void handle_shut_down_leader_lock(int r);
+
+ void unregister_watch();
+ void handle_unregister_watch(int r);
+
+ void wait_for_tasks();
+ void handle_wait_for_tasks();
+
+ void break_leader_lock();
+ void handle_break_leader_lock(int r);
+
+ void schedule_get_locker(bool reset_leader, uint32_t delay_factor);
+ void get_locker();
+ void handle_get_locker(int r, librbd::managed_lock::Locker& locker);
+
+ void schedule_acquire_leader_lock(uint32_t delay_factor);
+ void acquire_leader_lock();
+ void handle_acquire_leader_lock(int r);
+
+ void release_leader_lock();
+ void handle_release_leader_lock(int r);
+
+ void init_instances();
+ void handle_init_instances(int r);
+
+ void shut_down_instances();
+ void handle_shut_down_instances(int r);
+
+ void notify_listener();
+ void handle_notify_listener(int r);
+
+ void notify_lock_acquired();
+ void handle_notify_lock_acquired(int r);
+
+ void notify_lock_released();
+ void handle_notify_lock_released(int r);
+
+ void notify_heartbeat();
+ void handle_notify_heartbeat(int r);
+
+ void handle_post_acquire_leader_lock(int r, Context *on_finish);
+ void handle_pre_release_leader_lock(Context *on_finish);
+ void handle_post_release_leader_lock(int r, Context *on_finish);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+
+ void handle_rewatch_complete(int r) override;
+
+ void handle_heartbeat(Context *on_ack);
+ void handle_lock_acquired(Context *on_ack);
+ void handle_lock_released(Context *on_ack);
+
+ void handle_payload(const leader_watcher::HeartbeatPayload &payload,
+ Context *on_notify_ack);
+ void handle_payload(const leader_watcher::LockAcquiredPayload &payload,
+ Context *on_notify_ack);
+ void handle_payload(const leader_watcher::LockReleasedPayload &payload,
+ Context *on_notify_ack);
+ void handle_payload(const leader_watcher::UnknownPayload &payload,
+ Context *on_notify_ack);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_LEADER_WATCHER_H
diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc
new file mode 100644
index 000000000..f02cfe65d
--- /dev/null
+++ b/src/tools/rbd_mirror/Mirror.cc
@@ -0,0 +1,748 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <signal.h>
+
+#include <boost/range/adaptor/map.hpp>
+
+#include "common/Formatter.h"
+#include "common/PriorityCache.h"
+#include "common/admin_socket.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Types.h"
+#include "librbd/ImageCtx.h"
+#include "perfglue/heap_profiler.h"
+#include "Mirror.h"
+#include "PoolMetaCache.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using librados::Rados;
+using librados::IoCtx;
+using librbd::mirror_peer_t;
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
// Base interface for "rbd mirror ..." admin socket commands.
class MirrorAdminSocketCommand {
public:
  virtual ~MirrorAdminSocketCommand() {}
  // Execute the command; output (if any) is written via the formatter.
  virtual int call(Formatter *f) = 0;
};
+
+class StatusCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StatusCommand(Mirror *mirror) : mirror(mirror) {}
+
+ int call(Formatter *f) override {
+ mirror->print_status(f);
+ return 0;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class StartCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StartCommand(Mirror *mirror) : mirror(mirror) {}
+
+ int call(Formatter *f) override {
+ mirror->start();
+ return 0;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class StopCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StopCommand(Mirror *mirror) : mirror(mirror) {}
+
+ int call(Formatter *f) override {
+ mirror->stop();
+ return 0;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class RestartCommand : public MirrorAdminSocketCommand {
+public:
+ explicit RestartCommand(Mirror *mirror) : mirror(mirror) {}
+
+ int call(Formatter *f) override {
+ mirror->restart();
+ return 0;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class FlushCommand : public MirrorAdminSocketCommand {
+public:
+ explicit FlushCommand(Mirror *mirror) : mirror(mirror) {}
+
+ int call(Formatter *f) override {
+ mirror->flush();
+ return 0;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class LeaderReleaseCommand : public MirrorAdminSocketCommand {
+public:
+ explicit LeaderReleaseCommand(Mirror *mirror) : mirror(mirror) {}
+
+ int call(Formatter *f) override {
+ mirror->release_leader();
+ return 0;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PriCache: " << this << " " \
+ << m_name << " " << __func__ << ": "
+
// Per-journal cache entry participating in global PriorityCache
// balancing.  Each entry requests a "base" chunk (up to min_size) at one
// priority and an "extra" chunk (up to max_size - min_size) at another;
// prioritize() gradually promotes the base chunk toward PRI0.
struct PriCache : public PriorityCache::PriCache {
  std::string m_name;
  int64_t m_base_cache_max_size;   // guaranteed portion (min_size)
  int64_t m_extra_cache_max_size;  // best-effort portion (max_size - min_size)

  PriorityCache::Priority m_base_cache_pri = PriorityCache::Priority::PRI10;
  PriorityCache::Priority m_extra_cache_pri = PriorityCache::Priority::PRI10;
  int64_t m_base_cache_bytes = 0;   // bytes currently granted to base chunk
  int64_t m_extra_cache_bytes = 0;  // bytes currently granted to extra chunk
  int64_t m_committed_bytes = 0;    // last committed (page-rounded) size
  double m_cache_ratio = 0;

  PriCache(const std::string &name, uint64_t min_size, uint64_t max_size)
    : m_name(name), m_base_cache_max_size(min_size),
      m_extra_cache_max_size(max_size - min_size) {
    ceph_assert(max_size >= min_size);
  }

  // Promote the base chunk one priority level (smaller enum value is
  // higher priority); saturates at PRI0.
  void prioritize() {
    if (m_base_cache_pri == PriorityCache::Priority::PRI0) {
      return;
    }
    auto pri = static_cast<uint8_t>(m_base_cache_pri);
    m_base_cache_pri = static_cast<PriorityCache::Priority>(--pri);

    dout(30) << m_base_cache_pri << dendl;
  }

  // Bytes this cache requests at priority 'pri' (base and/or extra
  // chunks, if they are assigned that priority).
  int64_t request_cache_bytes(PriorityCache::Priority pri,
                              uint64_t total_cache) const override {
    int64_t cache_bytes = 0;

    if (pri == m_base_cache_pri) {
      cache_bytes += m_base_cache_max_size;
    }
    if (pri == m_extra_cache_pri) {
      cache_bytes += m_extra_cache_max_size;
    }

    dout(30) << cache_bytes << dendl;

    return cache_bytes;
  }

  // Bytes currently granted at priority 'pri'.
  int64_t get_cache_bytes(PriorityCache::Priority pri) const override {
    int64_t cache_bytes = 0;

    if (pri == m_base_cache_pri) {
      cache_bytes += m_base_cache_bytes;
    }
    if (pri == m_extra_cache_pri) {
      cache_bytes += m_extra_cache_bytes;
    }

    dout(30) << "pri=" << pri << " " << cache_bytes << dendl;

    return cache_bytes;
  }

  // Total bytes currently granted (base + extra).
  int64_t get_cache_bytes() const override {
    auto cache_bytes = m_base_cache_bytes + m_extra_cache_bytes;

    dout(30) << m_base_cache_bytes << "+" << m_extra_cache_bytes << "="
             << cache_bytes << dendl;

    return cache_bytes;
  }

  // Set the granted bytes at 'pri': fill the base chunk first (capped at
  // its max), any remainder goes to the extra chunk.
  void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) override {
    ceph_assert(bytes >= 0);
    ceph_assert(pri == m_base_cache_pri || pri == m_extra_cache_pri ||
                bytes == 0);

    dout(30) << "pri=" << pri << " " << bytes << dendl;

    if (pri == m_base_cache_pri) {
      m_base_cache_bytes = std::min(m_base_cache_max_size, bytes);
      bytes -= std::min(m_base_cache_bytes, bytes);
    }

    if (pri == m_extra_cache_pri) {
      m_extra_cache_bytes = bytes;
    }
  }

  // Add granted bytes at 'pri': top up the base chunk to its max, then
  // spill the rest into the extra chunk.
  void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) override {
    ceph_assert(bytes >= 0);
    ceph_assert(pri == m_base_cache_pri || pri == m_extra_cache_pri);

    dout(30) << "pri=" << pri << " " << bytes << dendl;

    if (pri == m_base_cache_pri) {
      ceph_assert(m_base_cache_bytes <= m_base_cache_max_size);

      auto chunk = std::min(m_base_cache_max_size - m_base_cache_bytes, bytes);
      m_base_cache_bytes += chunk;
      bytes -= chunk;
    }

    if (pri == m_extra_cache_pri) {
      m_extra_cache_bytes += bytes;
    }
  }

  // Commit the current allotment, rounded up to a 4 KiB boundary.
  int64_t commit_cache_size(uint64_t total_cache) override {
    m_committed_bytes = p2roundup<int64_t>(get_cache_bytes(), 4096);

    dout(30) << m_committed_bytes << dendl;

    return m_committed_bytes;
  }

  int64_t get_committed_size() const override {
    dout(30) << m_committed_bytes << dendl;

    return m_committed_bytes;
  }

  double get_cache_ratio() const override {
    dout(30) << m_cache_ratio << dendl;

    return m_cache_ratio;
  }

  void set_cache_ratio(double ratio) override {
    dout(30) << m_cache_ratio << dendl;

    m_cache_ratio = ratio;
  }

  std::string get_cache_name() const override {
    return m_name;
  }
};
+
+} // anonymous namespace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::Mirror: " << this << " " \
+ << __func__ << ": "
+
// Registers the "rbd mirror ..." admin socket commands and dispatches
// invocations to the matching MirrorAdminSocketCommand instance.
class MirrorAdminSocketHook : public AdminSocketHook {
public:
  MirrorAdminSocketHook(CephContext *cct, Mirror *mirror) :
    admin_socket(cct->get_admin_socket()) {
    std::string command;
    int r;

    // A command object is only created when registration succeeds, so the
    // map contains exactly the registered commands.
    command = "rbd mirror status";
    r = admin_socket->register_command(command, this,
                                       "get status for rbd mirror");
    if (r == 0) {
      commands[command] = new StatusCommand(mirror);
    }

    command = "rbd mirror start";
    r = admin_socket->register_command(command, this,
                                       "start rbd mirror");
    if (r == 0) {
      commands[command] = new StartCommand(mirror);
    }

    command = "rbd mirror stop";
    r = admin_socket->register_command(command, this,
                                       "stop rbd mirror");
    if (r == 0) {
      commands[command] = new StopCommand(mirror);
    }

    command = "rbd mirror restart";
    r = admin_socket->register_command(command, this,
                                       "restart rbd mirror");
    if (r == 0) {
      commands[command] = new RestartCommand(mirror);
    }

    command = "rbd mirror flush";
    r = admin_socket->register_command(command, this,
                                       "flush rbd mirror");
    if (r == 0) {
      commands[command] = new FlushCommand(mirror);
    }

    command = "rbd mirror leader release";
    r = admin_socket->register_command(command, this,
                                       "release rbd mirror leader");
    if (r == 0) {
      commands[command] = new LeaderReleaseCommand(mirror);
    }
  }

  ~MirrorAdminSocketHook() override {
    (void)admin_socket->unregister_commands(this);
    for (Commands::const_iterator i = commands.begin(); i != commands.end();
         ++i) {
      delete i->second;
    }
  }

  // Dispatch an admin socket invocation; the command is expected to have
  // been registered by this hook (asserted below).
  int call(std::string_view command, const cmdmap_t& cmdmap,
           Formatter *f,
           std::ostream& errss,
           bufferlist& out) override {
    Commands::const_iterator i = commands.find(command);
    ceph_assert(i != commands.end());
    return i->second->call(f);
  }

private:
  // std::less<> enables heterogeneous lookup with std::string_view keys.
  typedef std::map<std::string, MirrorAdminSocketCommand*, std::less<>> Commands;

  AdminSocket *admin_socket;
  Commands commands;
};
+
// Arbitrates journal cache memory between registered consumers using a
// PriorityCache::Manager.  When memory autotuning is disabled, each
// consumer is simply granted its requested maximum.
class CacheManagerHandler : public journal::CacheManagerHandler {
public:
  CacheManagerHandler(CephContext *cct)
    : m_cct(cct) {

    // Autotune disabled: leave m_cache_manager null; register_cache()
    // will grant max_size immediately.
    if (!m_cct->_conf.get_val<bool>("rbd_mirror_memory_autotune")) {
      return;
    }

    uint64_t base = m_cct->_conf.get_val<Option::size_t>(
        "rbd_mirror_memory_base");
    double fragmentation = m_cct->_conf.get_val<double>(
        "rbd_mirror_memory_expected_fragmentation");
    uint64_t target = m_cct->_conf.get_val<Option::size_t>(
        "rbd_mirror_memory_target");
    uint64_t min = m_cct->_conf.get_val<Option::size_t>(
        "rbd_mirror_memory_cache_min");
    uint64_t max = min;

    // When setting the maximum amount of memory to use for cache, first
    // assume some base amount of memory for the daemon and then fudge in
    // some overhead for fragmentation that scales with cache usage.
    uint64_t ltarget = (1.0 - fragmentation) * target;
    if (ltarget > base + min) {
      max = ltarget - base;
    }

    m_next_balance = ceph_clock_now();
    m_next_resize = ceph_clock_now();

    m_cache_manager = std::make_unique<PriorityCache::Manager>(
        m_cct, min, max, target, false);
  }

  ~CacheManagerHandler() {
    std::lock_guard locker{m_lock};

    // All consumers must have unregistered before destruction.
    ceph_assert(m_caches.empty());
  }

  // Register a cache consumer; with autotuning enabled its allotment is
  // managed by the PriorityCache::Manager and rebalanced via 'handler'.
  void register_cache(const std::string &cache_name,
                      uint64_t min_size, uint64_t max_size,
                      journal::CacheRebalanceHandler* handler) override {
    if (!m_cache_manager) {
      // No autotuning: hand out the full requested size up front.
      handler->handle_cache_rebalanced(max_size);
      return;
    }

    dout(20) << cache_name << " min_size=" << min_size << " max_size="
             << max_size << " handler=" << handler << dendl;

    std::lock_guard locker{m_lock};

    auto p = m_caches.insert(
        {cache_name, {cache_name, min_size, max_size, handler}});
    ceph_assert(p.second == true);

    m_cache_manager->insert(cache_name, p.first->second.pri_cache, false);
    // Force a rebalance on the next run_cache_manager() pass.
    m_next_balance = ceph_clock_now();
  }

  // Unregister a previously registered cache consumer.
  void unregister_cache(const std::string &cache_name) override {
    if (!m_cache_manager) {
      return;
    }

    dout(20) << cache_name << dendl;

    std::lock_guard locker{m_lock};

    auto it = m_caches.find(cache_name);
    ceph_assert(it != m_caches.end());

    m_cache_manager->erase(cache_name);
    m_caches.erase(it);
    m_next_balance = ceph_clock_now();
  }

  // Periodic driver (called from Mirror::run): rebalances allotments and
  // tunes process memory when the respective intervals have elapsed.
  void run_cache_manager() {
    if (!m_cache_manager) {
      return;
    }

    std::lock_guard locker{m_lock};

    // Before we trim, check and see if it's time to rebalance/resize.
    auto autotune_interval = m_cct->_conf.get_val<double>(
        "rbd_mirror_memory_cache_autotune_interval");
    auto resize_interval = m_cct->_conf.get_val<double>(
        "rbd_mirror_memory_cache_resize_interval");

    utime_t now = ceph_clock_now();

    if (autotune_interval > 0 && m_next_balance <= now) {
      dout(20) << "balance" << dendl;
      m_cache_manager->balance();

      // Push the new allotments to each consumer and promote priorities.
      for (auto &it : m_caches) {
        auto pri_cache = static_cast<PriCache *>(it.second.pri_cache.get());
        auto new_cache_bytes = pri_cache->get_cache_bytes();
        it.second.handler->handle_cache_rebalanced(new_cache_bytes);
        pri_cache->prioritize();
      }

      m_next_balance = ceph_clock_now();
      m_next_balance += autotune_interval;
    }

    if (resize_interval > 0 && m_next_resize < now) {
      if (ceph_using_tcmalloc()) {
        dout(20) << "tune memory" << dendl;
        m_cache_manager->tune_memory();
      }

      m_next_resize = ceph_clock_now();
      m_next_resize += resize_interval;
    }
  }

private:
  struct Cache {
    std::shared_ptr<PriorityCache::PriCache> pri_cache;
    journal::CacheRebalanceHandler *handler;

    // NOTE(review): 'name' is taken by value; a const reference would
    // avoid a copy.
    Cache(const std::string name, uint64_t min_size, uint64_t max_size,
          journal::CacheRebalanceHandler *handler)
      : pri_cache(new PriCache(name, min_size, max_size)), handler(handler) {
    }
  };

  CephContext *m_cct;

  mutable ceph::mutex m_lock =
    ceph::make_mutex("rbd::mirror::CacheManagerHandler");
  std::unique_ptr<PriorityCache::Manager> m_cache_manager;  // null if autotune off
  std::map<std::string, Cache> m_caches;

  utime_t m_next_balance;  // next time to rebalance allotments
  utime_t m_next_resize;   // next time to tune process memory
};
+
// Construct the daemon state; no cluster connection is made here (see
// init()).  The admin socket hook is registered immediately.
Mirror::Mirror(CephContext *cct, const std::vector<const char*> &args) :
  m_cct(cct),
  m_args(args),
  m_local(new librados::Rados()),
  m_cache_manager_handler(new CacheManagerHandler(cct)),
  m_pool_meta_cache(new PoolMetaCache(cct)),
  m_asok_hook(new MirrorAdminSocketHook(cct, this)) {
}
+
// Tear down the admin socket hook; smart-pointer members clean up the
// remaining owned state.
Mirror::~Mirror()
{
  delete m_asok_hook;
}
+
// Signal dispatcher: SIGHUP reopens log files; SIGINT/SIGTERM request a
// clean shutdown of the main loop.  Any other signal aborts.
void Mirror::handle_signal(int signum)
{
  dout(20) << signum << dendl;

  std::lock_guard l{m_lock};

  switch (signum) {
  case SIGHUP:
    for (auto &it : m_pool_replayers) {
      it.second->reopen_logs();
    }
    g_ceph_context->reopen_logs();
    break;

  case SIGINT:
  case SIGTERM:
    // Wake run() so it can exit its wait loop promptly.
    m_stopping = true;
    m_cond.notify_all();
    break;

  default:
    ceph_abort_msgf("unexpected signal %d", signum);
  }
}
+
// Connect to the local cluster, register the service daemon, and start
// watching the local cluster for pool/peer configuration changes.
// Returns 0 on success or a negative errno.
int Mirror::init()
{
  int r = m_local->init_with_context(m_cct);
  if (r < 0) {
    derr << "could not initialize rados handle" << dendl;
    return r;
  }

  r = m_local->connect();
  if (r < 0) {
    derr << "error connecting to local cluster" << dendl;
    return r;
  }

  // Thread pools are shared process-wide via a CephContext singleton.
  m_threads = &(m_cct->lookup_or_create_singleton_object<
    Threads<librbd::ImageCtx>>("rbd_mirror::threads", false, m_local));
  m_service_daemon.reset(new ServiceDaemon<>(m_cct, m_local, m_threads));

  r = m_service_daemon->init();
  if (r < 0) {
    derr << "error registering service daemon: " << cpp_strerror(r) << dendl;
    return r;
  }

  m_local_cluster_watcher.reset(new ClusterWatcher(m_local, m_lock,
                                                   m_service_daemon.get()));
  return r;
}
+
// Main loop: wake roughly once per second; periodically refresh the
// pool/peer configuration, reconcile pool replayers, and drive the cache
// manager, until handle_signal() sets m_stopping.
void Mirror::run()
{
  dout(20) << "enter" << dendl;

  utime_t next_refresh_pools = ceph_clock_now();

  while (!m_stopping) {
    utime_t now = ceph_clock_now();
    bool refresh_pools = next_refresh_pools <= now;
    if (refresh_pools) {
      // Refresh outside m_lock; ClusterWatcher performs cluster I/O.
      m_local_cluster_watcher->refresh_pools();
      next_refresh_pools = ceph_clock_now();
      next_refresh_pools += m_cct->_conf.get_val<uint64_t>(
          "rbd_mirror_pool_replayers_refresh_interval");
    }
    std::unique_lock l{m_lock};
    // Skip reconciliation while manually stopped via the admin socket.
    if (!m_manual_stop) {
      if (refresh_pools) {
        update_pool_replayers(m_local_cluster_watcher->get_pool_peers(),
                              m_local_cluster_watcher->get_site_name());
      }
      m_cache_manager_handler->run_cache_manager();
    }
    m_cond.wait_for(l, 1s);
  }

  // stop all pool replayers in parallel
  std::lock_guard locker{m_lock};
  for (auto &pool_replayer : m_pool_replayers) {
    pool_replayer.second->stop(false);
  }
  dout(20) << "return" << dendl;
}
+
// Admin socket "rbd mirror status": dump the status of every pool
// replayer.  No-op once shutdown has started.
void Mirror::print_status(Formatter *f)
{
  dout(20) << "enter" << dendl;

  std::lock_guard l{m_lock};

  if (m_stopping) {
    return;
  }

  f->open_object_section("mirror_status");
  f->open_array_section("pool_replayers");
  for (auto &pool_replayer : m_pool_replayers) {
    pool_replayer.second->print_status(f);
  }
  f->close_section();
  f->close_section();
}
+
+void Mirror::start()
+{
+ dout(20) << "enter" << dendl;
+ std::lock_guard l{m_lock};
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = false;
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->start();
+ }
+}
+
+void Mirror::stop()
+{
+ dout(20) << "enter" << dendl;
+ std::lock_guard l{m_lock};
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = true;
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->stop(true);
+ }
+}
+
+void Mirror::restart()
+{
+ dout(20) << "enter" << dendl;
+ std::lock_guard l{m_lock};
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = false;
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->restart();
+ }
+}
+
+void Mirror::flush()
+{
+ dout(20) << "enter" << dendl;
+ std::lock_guard l{m_lock};
+
+ if (m_stopping || m_manual_stop) {
+ return;
+ }
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->flush();
+ }
+}
+
+void Mirror::release_leader()
+{
+ dout(20) << "enter" << dendl;
+ std::lock_guard l{m_lock};
+
+ if (m_stopping) {
+ return;
+ }
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->release_leader();
+ }
+}
+
// Reconcile the running pool replayers with the freshly discovered
// pool/peer configuration: tear down replayers whose peer vanished,
// restart unhealthy or stale ones, and create replayers for new peers.
// Caller must hold m_lock.
void Mirror::update_pool_replayers(const PoolPeers &pool_peers,
                                   const std::string& site_name)
{
  dout(20) << "enter" << dendl;
  ceph_assert(ceph_mutex_is_locked(m_lock));

  // remove stale pool replayers before creating new pool replayers
  for (auto it = m_pool_replayers.begin(); it != m_pool_replayers.end();) {
    auto &peer = it->first.second;
    auto pool_peer_it = pool_peers.find(it->first.first);
    if (pool_peer_it == pool_peers.end() ||
        pool_peer_it->second.find(peer) == pool_peer_it->second.end()) {
      dout(20) << "removing pool replayer for " << peer << dendl;
      // TODO: make async
      it->second->shut_down();
      it = m_pool_replayers.erase(it);
    } else {
      ++it;
    }
  }

  for (auto &kv : pool_peers) {
    for (auto &peer : kv.second) {
      PoolPeer pool_peer(kv.first, peer);

      auto pool_replayers_it = m_pool_replayers.find(pool_peer);
      if (pool_replayers_it != m_pool_replayers.end()) {
        auto& pool_replayer = pool_replayers_it->second;
        // An existing replayer is bounced (shut down + re-init) if the
        // site name changed, it was blocklisted, or it stopped running.
        if (!m_site_name.empty() && !site_name.empty() &&
            m_site_name != site_name) {
          dout(0) << "restarting pool replayer for " << peer << " due to "
                  << "updated site name" << dendl;
          // TODO: make async
          pool_replayer->shut_down();
          pool_replayer->init(site_name);
        } else if (pool_replayer->is_blocklisted()) {
          derr << "restarting blocklisted pool replayer for " << peer << dendl;
          // TODO: make async
          pool_replayer->shut_down();
          pool_replayer->init(site_name);
        } else if (!pool_replayer->is_running()) {
          derr << "restarting failed pool replayer for " << peer << dendl;
          // TODO: make async
          pool_replayer->shut_down();
          pool_replayer->init(site_name);
        }
      } else {
        dout(20) << "starting pool replayer for " << peer << dendl;
        unique_ptr<PoolReplayer<>> pool_replayer(
            new PoolReplayer<>(m_threads, m_service_daemon.get(),
                               m_cache_manager_handler.get(),
                               m_pool_meta_cache.get(), kv.first, peer,
                               m_args));

        // TODO: make async
        pool_replayer->init(site_name);
        m_pool_replayers.emplace(pool_peer, std::move(pool_replayer));
      }
    }

    // TODO currently only support a single peer
  }

  m_site_name = site_name;
}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/Mirror.h b/src/tools/rbd_mirror/Mirror.h
new file mode 100644
index 000000000..f92a63b68
--- /dev/null
+++ b/src/tools/rbd_mirror/Mirror.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_H
+#define CEPH_RBD_MIRROR_H
+
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "include/rados/librados.hpp"
+#include "include/utime.h"
+#include "ClusterWatcher.h"
+#include "PoolReplayer.h"
+#include "tools/rbd_mirror/Types.h"
+
+#include <set>
+#include <map>
+#include <memory>
+#include <atomic>
+
+namespace journal { class CacheManagerHandler; }
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct ServiceDaemon;
+template <typename> struct Threads;
+class CacheManagerHandler;
+class MirrorAdminSocketHook;
+class PoolMetaCache;
+
+/**
+ * Contains the main loop and overall state for rbd-mirror.
+ *
+ * Sets up mirroring, and coordinates between noticing config
+ * changes and applying them.
+ */
+class Mirror {
+public:
+ Mirror(CephContext *cct, const std::vector<const char*> &args);
+ Mirror(const Mirror&) = delete;
+ Mirror& operator=(const Mirror&) = delete;
+ ~Mirror();
+
+ int init();
+ void run();
+ void handle_signal(int signum);
+
+ void print_status(Formatter *f);
+ void start();
+ void stop();
+ void restart();
+ void flush();
+ void release_leader();
+
+private:
+ typedef ClusterWatcher::PoolPeers PoolPeers;
+ typedef std::pair<int64_t, PeerSpec> PoolPeer;
+
+ void update_pool_replayers(const PoolPeers &pool_peers,
+ const std::string& site_name);
+
+ void create_cache_manager();
+ void run_cache_manager(utime_t *next_run_interval);
+
+ CephContext *m_cct;
+ std::vector<const char*> m_args;
+ Threads<librbd::ImageCtx> *m_threads = nullptr;
+ ceph::mutex m_lock = ceph::make_mutex("rbd::mirror::Mirror");
+ ceph::condition_variable m_cond;
+ RadosRef m_local;
+ std::unique_ptr<ServiceDaemon<librbd::ImageCtx>> m_service_daemon;
+
+ // monitor local cluster for config changes in peers
+ std::unique_ptr<ClusterWatcher> m_local_cluster_watcher;
+ std::unique_ptr<CacheManagerHandler> m_cache_manager_handler;
+ std::unique_ptr<PoolMetaCache> m_pool_meta_cache;
+ std::map<PoolPeer, std::unique_ptr<PoolReplayer<>>> m_pool_replayers;
+ std::atomic<bool> m_stopping = { false };
+ bool m_manual_stop = false;
+ MirrorAdminSocketHook *m_asok_hook;
+ std::string m_site_name;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_H
diff --git a/src/tools/rbd_mirror/MirrorStatusUpdater.cc b/src/tools/rbd_mirror/MirrorStatusUpdater.cc
new file mode 100644
index 000000000..257cb1df2
--- /dev/null
+++ b/src/tools/rbd_mirror/MirrorStatusUpdater.cc
@@ -0,0 +1,397 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/MirrorStatusUpdater.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "tools/rbd_mirror/MirrorStatusWatcher.h"
+#include "tools/rbd_mirror/Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::MirrorStatusUpdater " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+static const double UPDATE_INTERVAL_SECONDS = 30;
+static const uint32_t MAX_UPDATES_PER_OP = 100;
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
// Construct an updater that batches mirror image status updates for the
// given pool (io_ctx) on behalf of the given local mirror peer uuid.
template <typename I>
MirrorStatusUpdater<I>::MirrorStatusUpdater(
    librados::IoCtx& io_ctx, Threads<I> *threads,
    const std::string& local_mirror_uuid)
  : m_io_ctx(io_ctx), m_threads(threads),
    m_local_mirror_uuid(local_mirror_uuid),
    m_lock(ceph::make_mutex("rbd::mirror::MirrorStatusUpdater " +
                            stringify(m_io_ctx.get_id()))) {
  dout(10) << "local_mirror_uuid=" << local_mirror_uuid << ", "
           << "pool_id=" << m_io_ctx.get_id() << dendl;
}
+
// Destructor; shut_down() must have completed (asserted below).
template <typename I>
MirrorStatusUpdater<I>::~MirrorStatusUpdater() {
  ceph_assert(!m_initialized);
  delete m_mirror_status_watcher;
}
+
// Start the periodic update timer and initialize the status watcher;
// on_finish is completed when initialization finishes.
template <typename I>
void MirrorStatusUpdater<I>::init(Context* on_finish) {
  dout(10) << dendl;

  ceph_assert(!m_initialized);
  m_initialized = true;

  {
    std::lock_guard timer_locker{m_threads->timer_lock};
    schedule_timer_task();
  }

  init_mirror_status_watcher(on_finish);
}
+
// Create and asynchronously initialize the mirror status watcher.
template <typename I>
void MirrorStatusUpdater<I>::init_mirror_status_watcher(Context* on_finish) {
  dout(10) << dendl;

  auto ctx = new LambdaContext([this, on_finish](int r) {
      handle_init_mirror_status_watcher(r, on_finish);
    });
  m_mirror_status_watcher = MirrorStatusWatcher<I>::create(
    m_io_ctx, m_threads->work_queue);
  m_mirror_status_watcher->init(ctx);
}
+
// Completion for watcher init.  On failure, tear the updater back down
// while preserving the original error code for on_finish.
template <typename I>
void MirrorStatusUpdater<I>::handle_init_mirror_status_watcher(
    int r, Context* on_finish) {
  dout(10) << "r=" << r << dendl;

  if (r < 0) {
    derr << "failed to init mirror status watcher: " << cpp_strerror(r)
         << dendl;

    delete m_mirror_status_watcher;
    m_mirror_status_watcher = nullptr;

    // Wrap on_finish so shut_down's own result is discarded in favor of
    // the original init error.
    on_finish = new LambdaContext([r, on_finish](int) {
        on_finish->complete(r);
      });
    shut_down(on_finish);
    return;
  }

  m_threads->work_queue->queue(on_finish, 0);
}
+
// Cancel the periodic timer, mark the updater uninitialized, and shut
// down the status watcher; on_finish completes when done.
template <typename I>
void MirrorStatusUpdater<I>::shut_down(Context* on_finish) {
  dout(10) << dendl;

  {
    std::lock_guard timer_locker{m_threads->timer_lock};
    ceph_assert(m_timer_task != nullptr);
    m_threads->timer->cancel_event(m_timer_task);
  }

  {
    std::unique_lock locker(m_lock);
    ceph_assert(m_initialized);
    m_initialized = false;
  }

  shut_down_mirror_status_watcher(on_finish);
}
+
// Asynchronously shut down the status watcher (skipped if it was never
// created, e.g. after a failed init).
template <typename I>
void MirrorStatusUpdater<I>::shut_down_mirror_status_watcher(
    Context* on_finish) {
  if (m_mirror_status_watcher == nullptr) {
    finalize_shutdown(0, on_finish);
    return;
  }

  dout(10) << dendl;

  auto ctx = new LambdaContext([this, on_finish](int r) {
      handle_shut_down_mirror_status_watcher(r, on_finish);
    });
  m_mirror_status_watcher->shut_down(ctx);
}
+
// Completion for watcher shut-down; errors are logged but shutdown
// proceeds regardless.
template <typename I>
void MirrorStatusUpdater<I>::handle_shut_down_mirror_status_watcher(
    int r, Context* on_finish) {
  dout(10) << "r=" << r << dendl;

  if (r < 0) {
    derr << "failed to shut down mirror status watcher: " << cpp_strerror(r)
         << dendl;
  }

  finalize_shutdown(r, on_finish);
}
+
// Final step of shutdown: if a status update is still in progress,
// defer completion until it drains (preserving any error in r);
// otherwise complete on_finish immediately via the work queue.
template <typename I>
void MirrorStatusUpdater<I>::finalize_shutdown(int r, Context* on_finish) {
  dout(10) << dendl;

  {
    std::unique_lock locker(m_lock);
    if (m_update_in_progress) {
      if (r < 0) {
        // Preserve the shutdown error regardless of the update's result.
        on_finish = new LambdaContext([r, on_finish](int) {
            on_finish->complete(r);
          });
      }

      m_update_on_finish_ctxs.push_back(on_finish);
      return;
    }
  }

  m_threads->work_queue->queue(on_finish, r);
}
+
+template <typename I>
+bool MirrorStatusUpdater<I>::exists(const std::string& global_image_id) {
+ dout(15) << "global_image_id=" << global_image_id << dendl;
+
+ std::unique_lock locker(m_lock);
+ return (m_global_image_status.count(global_image_id) > 0);
+}
+
// Record (or overwrite) the local status for a global image id; if
// immediate_update is set, schedule a flush now instead of waiting for
// the periodic timer.
template <typename I>
void MirrorStatusUpdater<I>::set_mirror_image_status(
    const std::string& global_image_id,
    const cls::rbd::MirrorImageSiteStatus& mirror_image_site_status,
    bool immediate_update) {
  dout(15) << "global_image_id=" << global_image_id << ", "
           << "mirror_image_site_status=" << mirror_image_site_status << dendl;

  std::unique_lock locker(m_lock);

  m_global_image_status[global_image_id] = mirror_image_site_status;
  if (immediate_update) {
    m_update_global_image_ids.insert(global_image_id);
    queue_update_task(std::move(locker));
  }
}
+
// Drop the locally cached status for a global image id without queueing
// a removal update (the on-disk status is left to be refreshed).
// on_finish is completed once any in-flight update no longer references
// the id.
template <typename I>
void MirrorStatusUpdater<I>::remove_refresh_mirror_image_status(
    const std::string& global_image_id,
    Context* on_finish) {
  if (try_remove_mirror_image_status(global_image_id, false, false,
                                     on_finish)) {
    m_threads->work_queue->queue(on_finish, 0);
  }
}
+
// Remove the status for a global image id and queue the removal for the
// next update pass (immediately, if requested).  on_finish completes
// once the removal is safe to report.
template <typename I>
void MirrorStatusUpdater<I>::remove_mirror_image_status(
    const std::string& global_image_id, bool immediate_update,
    Context* on_finish) {
  if (try_remove_mirror_image_status(global_image_id, true, immediate_update,
                                     on_finish)) {
    m_threads->work_queue->queue(on_finish, 0);
  }
}
+
// Attempt to remove the cached status for a global image id.  Returns
// true if the removal completed synchronously (caller then completes
// on_finish).  If an update referencing the id is scheduled or in
// flight, the removal is retried from the update's completion callback
// and false is returned.
template <typename I>
bool MirrorStatusUpdater<I>::try_remove_mirror_image_status(
    const std::string& global_image_id, bool queue_update,
    bool immediate_update, Context* on_finish) {
  dout(15) << "global_image_id=" << global_image_id << ", "
           << "queue_update=" << queue_update << ", "
           << "immediate_update=" << immediate_update << dendl;

  std::unique_lock locker(m_lock);
  if ((m_update_in_flight &&
       m_updating_global_image_ids.count(global_image_id) > 0) ||
      ((m_update_in_progress || m_update_requested) &&
       m_update_global_image_ids.count(global_image_id) > 0)) {
    // if update is scheduled/in-progress, wait for it to complete
    on_finish = new LambdaContext(
      [this, global_image_id, queue_update, immediate_update,
       on_finish](int r) {
        if (try_remove_mirror_image_status(global_image_id, queue_update,
                                           immediate_update, on_finish)) {
          on_finish->complete(0);
        }
      });
    m_update_on_finish_ctxs.push_back(on_finish);
    return false;
  }

  m_global_image_status.erase(global_image_id);
  if (queue_update) {
    // An id queued with no cached status entry is treated as a removal
    // by the update pass.
    m_update_global_image_ids.insert(global_image_id);
    if (immediate_update) {
      queue_update_task(std::move(locker));
    }
  }

  return true;
}
+
+template <typename I>
+void MirrorStatusUpdater<I>::schedule_timer_task() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ ceph_assert(m_timer_task == nullptr);
+ m_timer_task = create_context_callback<
+ MirrorStatusUpdater<I>,
+ &MirrorStatusUpdater<I>::handle_timer_task>(this);
+ m_threads->timer->add_event_after(UPDATE_INTERVAL_SECONDS, m_timer_task);
+}
+
+template <typename I>
+void MirrorStatusUpdater<I>::handle_timer_task(int r) {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+ ceph_assert(m_timer_task != nullptr);
+ m_timer_task = nullptr;
+ schedule_timer_task();
+
+ std::unique_lock locker(m_lock);
+ for (auto& pair : m_global_image_status) {
+ m_update_global_image_ids.insert(pair.first);
+ }
+
+ queue_update_task(std::move(locker));
+}
+
+template <typename I>
+void MirrorStatusUpdater<I>::queue_update_task(
+ std::unique_lock<ceph::mutex>&& locker) {
+ if (!m_initialized) {
+ return;
+ }
+
+ if (m_update_in_progress) {
+ if (m_update_in_flight) {
+ dout(10) << "deferring update due to in-flight ops" << dendl;
+ m_update_requested = true;
+ }
+ return;
+ }
+
+ m_update_in_progress = true;
+ ceph_assert(!m_update_in_flight);
+ ceph_assert(!m_update_requested);
+ locker.unlock();
+
+ dout(10) << dendl;
+ auto ctx = create_context_callback<
+ MirrorStatusUpdater<I>,
+ &MirrorStatusUpdater<I>::update_task>(this);
+ m_threads->work_queue->queue(ctx);
+}
+
+template <typename I>
+void MirrorStatusUpdater<I>::update_task(int r) {
+ dout(10) << dendl;
+
+ std::unique_lock locker(m_lock);
+ ceph_assert(m_update_in_progress);
+ ceph_assert(!m_update_in_flight);
+ m_update_in_flight = true;
+
+ std::swap(m_updating_global_image_ids, m_update_global_image_ids);
+ auto updating_global_image_ids = m_updating_global_image_ids;
+ auto global_image_status = m_global_image_status;
+ locker.unlock();
+
+ Context* ctx = create_context_callback<
+ MirrorStatusUpdater<I>,
+ &MirrorStatusUpdater<I>::handle_update_task>(this);
+ if (updating_global_image_ids.empty()) {
+ ctx->complete(0);
+ return;
+ }
+
+ auto gather = new C_Gather(g_ceph_context, ctx);
+
+ auto it = updating_global_image_ids.begin();
+ while (it != updating_global_image_ids.end()) {
+ librados::ObjectWriteOperation op;
+ uint32_t op_count = 0;
+
+ while (it != updating_global_image_ids.end() &&
+ op_count < MAX_UPDATES_PER_OP) {
+ auto& global_image_id = *it;
+ ++it;
+
+ auto status_it = global_image_status.find(global_image_id);
+ if (status_it == global_image_status.end()) {
+ librbd::cls_client::mirror_image_status_remove(&op, global_image_id);
+ ++op_count;
+ continue;
+ }
+
+ status_it->second.mirror_uuid = m_local_mirror_uuid;
+ librbd::cls_client::mirror_image_status_set(&op, global_image_id,
+ status_it->second);
+ ++op_count;
+ }
+
+ auto aio_comp = create_rados_callback(gather->new_sub());
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ }
+
+ gather->activate();
+}
+
+template <typename I>
+void MirrorStatusUpdater<I>::handle_update_task(int r) {
+ dout(10) << dendl;
+ if (r < 0) {
+ derr << "failed to update mirror image statuses: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ std::unique_lock locker(m_lock);
+
+ Contexts on_finish_ctxs;
+ std::swap(on_finish_ctxs, m_update_on_finish_ctxs);
+
+ ceph_assert(m_update_in_progress);
+ m_update_in_progress = false;
+
+ ceph_assert(m_update_in_flight);
+ m_update_in_flight = false;
+
+ m_updating_global_image_ids.clear();
+
+ if (m_update_requested) {
+ m_update_requested = false;
+ queue_update_task(std::move(locker));
+ } else {
+ locker.unlock();
+ }
+
+ for (auto on_finish : on_finish_ctxs) {
+ on_finish->complete(0);
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::MirrorStatusUpdater<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/MirrorStatusUpdater.h b/src/tools/rbd_mirror/MirrorStatusUpdater.h
new file mode 100644
index 000000000..783b818fc
--- /dev/null
+++ b/src/tools/rbd_mirror/MirrorStatusUpdater.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_MIRROR_STATUS_UPDATER_H
+#define CEPH_RBD_MIRROR_MIRROR_STATUS_UPDATER_H
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+
+struct Context;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct MirrorStatusWatcher;
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MirrorStatusUpdater {
+public:
+
+ static MirrorStatusUpdater* create(librados::IoCtx& io_ctx,
+ Threads<ImageCtxT> *threads,
+ const std::string& local_mirror_uuid) {
+ return new MirrorStatusUpdater(io_ctx, threads, local_mirror_uuid);
+ }
+
+ MirrorStatusUpdater(librados::IoCtx& io_ctx, Threads<ImageCtxT> *threads,
+ const std::string& local_mirror_uuid);
+ ~MirrorStatusUpdater();
+
+ void init(Context* on_finish);
+ void shut_down(Context* on_finish);
+
+ bool exists(const std::string& global_image_id);
+ void set_mirror_image_status(
+ const std::string& global_image_id,
+ const cls::rbd::MirrorImageSiteStatus& mirror_image_site_status,
+ bool immediate_update);
+ void remove_mirror_image_status(const std::string& global_image_id,
+ bool immediate_update, Context* on_finish);
+ void remove_refresh_mirror_image_status(const std::string& global_image_id,
+ Context* on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <uninitialized> <----------------------\
+ * | (init) ^ (error) |
+ * v * |
+ * INIT_STATUS_WATCHER * * * * * |
+ * | |
+ * | SHUT_DOWN_STATUS_WATCHER
+ * | ^
+ * | |
+ * | (shutdown) |
+ * <initialized> -------------------------/
+ *
+ * @endverbatim
+ */
+ typedef std::list<Context*> Contexts;
+ typedef std::set<std::string> GlobalImageIds;
+ typedef std::map<std::string, cls::rbd::MirrorImageSiteStatus>
+ GlobalImageStatus;
+
+ librados::IoCtx m_io_ctx;
+ Threads<ImageCtxT>* m_threads;
+ std::string m_local_mirror_uuid;
+
+ Context* m_timer_task = nullptr;
+
+ ceph::mutex m_lock;
+
+ bool m_initialized = false;
+
+ MirrorStatusWatcher<ImageCtxT>* m_mirror_status_watcher = nullptr;
+
+ GlobalImageIds m_update_global_image_ids;
+ GlobalImageStatus m_global_image_status;
+
+ bool m_update_in_progress = false;
+ bool m_update_in_flight = false;
+ bool m_update_requested = false;
+ Contexts m_update_on_finish_ctxs;
+ GlobalImageIds m_updating_global_image_ids;
+
+ bool try_remove_mirror_image_status(const std::string& global_image_id,
+ bool queue_update, bool immediate_update,
+ Context* on_finish);
+
+ void init_mirror_status_watcher(Context* on_finish);
+ void handle_init_mirror_status_watcher(int r, Context* on_finish);
+
+ void shut_down_mirror_status_watcher(Context* on_finish);
+ void handle_shut_down_mirror_status_watcher(int r, Context* on_finish);
+ void finalize_shutdown(int r, Context* on_finish);
+
+ void schedule_timer_task();
+ void handle_timer_task(int r);
+
+ void queue_update_task(std::unique_lock<ceph::mutex>&& locker);
+ void update_task(int r);
+ void handle_update_task(int r);
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::MirrorStatusUpdater<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_MIRROR_STATUS_UPDATER_H
diff --git a/src/tools/rbd_mirror/MirrorStatusWatcher.cc b/src/tools/rbd_mirror/MirrorStatusWatcher.cc
new file mode 100644
index 000000000..3e1564c5b
--- /dev/null
+++ b/src/tools/rbd_mirror/MirrorStatusWatcher.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MirrorStatusWatcher.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::MirrorStatusWatcher: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+MirrorStatusWatcher<I>::MirrorStatusWatcher(librados::IoCtx &io_ctx,
+ librbd::asio::ContextWQ *work_queue)
+ : Watcher(io_ctx, work_queue, RBD_MIRRORING) {
+}
+
+template <typename I>
+MirrorStatusWatcher<I>::~MirrorStatusWatcher() {
+}
+
+template <typename I>
+void MirrorStatusWatcher<I>::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ on_finish = new LambdaContext(
+ [this, on_finish] (int r) {
+ if (r < 0) {
+ derr << "error removing down statuses: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+ register_watch(on_finish);
+ });
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_status_remove_down(&op);
+ librados::AioCompletion *aio_comp = create_rados_callback(on_finish);
+
+ int r = m_ioctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void MirrorStatusWatcher<I>::shut_down(Context *on_finish) {
+ dout(20) << dendl;
+
+ unregister_watch(on_finish);
+}
+
+template <typename I>
+void MirrorStatusWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id,
+ bufferlist &bl) {
+ dout(20) << dendl;
+
+ bufferlist out;
+ acknowledge_notify(notify_id, handle, out);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::MirrorStatusWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/MirrorStatusWatcher.h b/src/tools/rbd_mirror/MirrorStatusWatcher.h
new file mode 100644
index 000000000..3335e9e63
--- /dev/null
+++ b/src/tools/rbd_mirror/MirrorStatusWatcher.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
+#define CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
+
+#include "librbd/Watcher.h"
+
+namespace librbd {
+class ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MirrorStatusWatcher : protected librbd::Watcher {
+public:
+ static MirrorStatusWatcher *create(librados::IoCtx &io_ctx,
+ librbd::asio::ContextWQ *work_queue) {
+ return new MirrorStatusWatcher(io_ctx, work_queue);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ MirrorStatusWatcher(librados::IoCtx &io_ctx,
+ librbd::asio::ContextWQ *work_queue);
+ ~MirrorStatusWatcher() override;
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+protected:
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
diff --git a/src/tools/rbd_mirror/NamespaceReplayer.cc b/src/tools/rbd_mirror/NamespaceReplayer.cc
new file mode 100644
index 000000000..d305d8472
--- /dev/null
+++ b/src/tools/rbd_mirror/NamespaceReplayer.cc
@@ -0,0 +1,862 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "NamespaceReplayer.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Mirror.h"
+#include "librbd/asio/ContextWQ.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::NamespaceReplayer: " \
+ << this << " " << __func__ << ": "
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+namespace rbd {
+namespace mirror {
+
+using ::operator<<;
+
+namespace {
+
+const std::string SERVICE_DAEMON_LOCAL_COUNT_KEY("image_local_count");
+const std::string SERVICE_DAEMON_REMOTE_COUNT_KEY("image_remote_count");
+
+} // anonymous namespace
+
+template <typename I>
+NamespaceReplayer<I>::NamespaceReplayer(
+ const std::string &name,
+ librados::IoCtx &local_io_ctx, librados::IoCtx &remote_io_ctx,
+ const std::string &local_mirror_uuid,
+ const std::string& local_mirror_peer_uuid,
+ const RemotePoolMeta& remote_pool_meta,
+ Threads<I> *threads,
+ Throttler<I> *image_sync_throttler,
+ Throttler<I> *image_deletion_throttler,
+ ServiceDaemon<I> *service_daemon,
+ journal::CacheManagerHandler *cache_manager_handler,
+ PoolMetaCache* pool_meta_cache) :
+ m_namespace_name(name),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_local_mirror_peer_uuid(local_mirror_peer_uuid),
+ m_remote_pool_meta(remote_pool_meta),
+ m_threads(threads), m_image_sync_throttler(image_sync_throttler),
+ m_image_deletion_throttler(image_deletion_throttler),
+ m_service_daemon(service_daemon),
+ m_cache_manager_handler(cache_manager_handler),
+ m_pool_meta_cache(pool_meta_cache),
+ m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+ "rbd::mirror::NamespaceReplayer " + name, this))),
+ m_local_pool_watcher_listener(this, true),
+ m_remote_pool_watcher_listener(this, false),
+ m_image_map_listener(this) {
+ dout(10) << name << dendl;
+
+ m_local_io_ctx.dup(local_io_ctx);
+ m_local_io_ctx.set_namespace(name);
+ m_remote_io_ctx.dup(remote_io_ctx);
+ m_remote_io_ctx.set_namespace(name);
+}
+
+template <typename I>
+bool NamespaceReplayer<I>::is_blocklisted() const {
+ std::lock_guard locker{m_lock};
+ return m_instance_replayer->is_blocklisted() ||
+ (m_local_pool_watcher &&
+ m_local_pool_watcher->is_blocklisted()) ||
+ (m_remote_pool_watcher &&
+ m_remote_pool_watcher->is_blocklisted());
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+
+ init_local_status_updater();
+}
+
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+
+ if (!m_image_map) {
+ stop_instance_replayer();
+ return;
+ }
+ }
+
+ auto ctx = new LambdaContext(
+ [this] (int r) {
+ std::lock_guard locker{m_lock};
+ stop_instance_replayer();
+ });
+ handle_release_leader(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::print_status(Formatter *f)
+{
+ dout(20) << dendl;
+
+ ceph_assert(f);
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_replayer->print_status(f);
+
+ if (m_image_deleter) {
+ f->open_object_section("image_deleter");
+ m_image_deleter->print_status(f);
+ f->close_section();
+ }
+}
+
+template <typename I>
+void NamespaceReplayer<I>::start()
+{
+ dout(20) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_replayer->start();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::stop()
+{
+ dout(20) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_replayer->stop();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::restart()
+{
+ dout(20) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_replayer->restart();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::flush()
+{
+ dout(20) << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_replayer->flush();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) {
+ std::lock_guard locker{m_lock};
+
+ if (!m_image_map) {
+ dout(20) << "not leader" << dendl;
+ return;
+ }
+
+ dout(10) << "mirror_uuid=" << mirror_uuid << ", "
+ << "added_count=" << added_image_ids.size() << ", "
+ << "removed_count=" << removed_image_ids.size() << dendl;
+
+ m_service_daemon->add_or_update_namespace_attribute(
+ m_local_io_ctx.get_id(), m_local_io_ctx.get_namespace(),
+ SERVICE_DAEMON_LOCAL_COUNT_KEY, m_local_pool_watcher->get_image_count());
+ if (m_remote_pool_watcher) {
+ m_service_daemon->add_or_update_namespace_attribute(
+ m_local_io_ctx.get_id(), m_local_io_ctx.get_namespace(),
+ SERVICE_DAEMON_REMOTE_COUNT_KEY,
+ m_remote_pool_watcher->get_image_count());
+ }
+
+ std::set<std::string> added_global_image_ids;
+ for (auto& image_id : added_image_ids) {
+ added_global_image_ids.insert(image_id.global_id);
+ }
+
+ std::set<std::string> removed_global_image_ids;
+ for (auto& image_id : removed_image_ids) {
+ removed_global_image_ids.insert(image_id.global_id);
+ }
+
+ m_image_map->update_images(mirror_uuid,
+ std::move(added_global_image_ids),
+ std::move(removed_global_image_ids));
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_acquire_leader(Context *on_finish) {
+ dout(10) << dendl;
+
+ m_instance_watcher->handle_acquire_leader();
+
+ init_image_map(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_release_leader(Context *on_finish) {
+ dout(10) << dendl;
+
+ m_instance_watcher->handle_release_leader();
+ shut_down_image_deleter(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_update_leader(
+ const std::string &leader_instance_id) {
+ dout(10) << "leader_instance_id=" << leader_instance_id << dendl;
+
+ m_instance_watcher->handle_update_leader(leader_instance_id);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_instances_added(
+ const std::vector<std::string> &instance_ids) {
+ dout(10) << "instance_ids=" << instance_ids << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (!m_image_map) {
+ return;
+ }
+
+ m_image_map->update_instances_added(instance_ids);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_instances_removed(
+ const std::vector<std::string> &instance_ids) {
+ dout(10) << "instance_ids=" << instance_ids << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (!m_image_map) {
+ return;
+ }
+
+ m_image_map->update_instances_removed(instance_ids);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_local_status_updater() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_local_status_updater);
+
+ m_local_status_updater.reset(MirrorStatusUpdater<I>::create(
+ m_local_io_ctx, m_threads, ""));
+ auto ctx = create_context_callback<
+ NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_init_local_status_updater>(this);
+
+ m_local_status_updater->init(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_local_status_updater(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error initializing local mirror status updater: "
+ << cpp_strerror(r) << dendl;
+
+ m_local_status_updater.reset();
+ ceph_assert(m_on_finish != nullptr);
+ m_threads->work_queue->queue(m_on_finish, r);
+ m_on_finish = nullptr;
+ return;
+ }
+
+ init_remote_status_updater();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_remote_status_updater() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_remote_status_updater);
+
+ m_remote_status_updater.reset(MirrorStatusUpdater<I>::create(
+ m_remote_io_ctx, m_threads, m_local_mirror_uuid));
+ auto ctx = create_context_callback<
+ NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_init_remote_status_updater>(this);
+ m_remote_status_updater->init(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_remote_status_updater(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error initializing remote mirror status updater: "
+ << cpp_strerror(r) << dendl;
+
+ m_remote_status_updater.reset();
+ m_ret_val = r;
+ shut_down_local_status_updater();
+ return;
+ }
+
+ init_instance_replayer();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_instance_replayer() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_instance_replayer);
+
+ m_instance_replayer.reset(InstanceReplayer<I>::create(
+ m_local_io_ctx, m_local_mirror_uuid, m_threads, m_service_daemon,
+ m_local_status_updater.get(), m_cache_manager_handler,
+ m_pool_meta_cache));
+ auto ctx = create_context_callback<NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_init_instance_replayer>(this);
+
+ m_instance_replayer->init(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_instance_replayer(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error initializing instance replayer: " << cpp_strerror(r)
+ << dendl;
+
+ m_instance_replayer.reset();
+ m_ret_val = r;
+ shut_down_remote_status_updater();
+ return;
+ }
+
+ m_instance_replayer->add_peer({m_local_mirror_peer_uuid, m_remote_io_ctx,
+ m_remote_pool_meta,
+ m_remote_status_updater.get()});
+
+ init_instance_watcher();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_instance_watcher() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(!m_instance_watcher);
+
+ m_instance_watcher.reset(InstanceWatcher<I>::create(
+ m_local_io_ctx, *m_threads->asio_engine, m_instance_replayer.get(),
+ m_image_sync_throttler));
+ auto ctx = create_context_callback<NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_init_instance_watcher>(this);
+
+ m_instance_watcher->init(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_instance_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::lock_guard locker{m_lock};
+
+ if (r < 0) {
+ derr << "error initializing instance watcher: " << cpp_strerror(r)
+ << dendl;
+
+ m_instance_watcher.reset();
+ m_ret_val = r;
+ shut_down_instance_replayer();
+ return;
+ }
+
+ ceph_assert(m_on_finish != nullptr);
+ m_threads->work_queue->queue(m_on_finish);
+ m_on_finish = nullptr;
+}
+
+template <typename I>
+void NamespaceReplayer<I>::stop_instance_replayer() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_stop_instance_replayer>(this));
+
+ m_instance_replayer->stop(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_stop_instance_replayer(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error stopping instance replayer: " << cpp_strerror(r) << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+
+ shut_down_instance_watcher();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_instance_watcher() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_instance_watcher);
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_shut_down_instance_watcher>(this));
+
+ m_instance_watcher->shut_down(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_instance_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error shutting instance watcher down: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_watcher.reset();
+
+ shut_down_instance_replayer();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_instance_replayer() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_instance_replayer);
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_shut_down_instance_replayer>(this));
+
+ m_instance_replayer->shut_down(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_instance_replayer(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error shutting instance replayer down: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+
+ m_instance_replayer.reset();
+
+ shut_down_remote_status_updater();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_remote_status_updater() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_remote_status_updater);
+
+ auto ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_shut_down_remote_status_updater>(this));
+ m_remote_status_updater->shut_down(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_remote_status_updater(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error shutting remote mirror status updater down: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+ m_remote_status_updater.reset();
+
+ shut_down_local_status_updater();
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_local_status_updater() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ ceph_assert(m_local_status_updater);
+
+ auto ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ NamespaceReplayer<I>,
+ &NamespaceReplayer<I>::handle_shut_down_local_status_updater>(this));
+
+ m_local_status_updater->shut_down(ctx);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_local_status_updater(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error shutting local mirror status updater down: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+
+ m_local_status_updater.reset();
+
+ ceph_assert(!m_image_map);
+ ceph_assert(!m_image_deleter);
+ ceph_assert(!m_local_pool_watcher);
+ ceph_assert(!m_remote_pool_watcher);
+ ceph_assert(!m_instance_watcher);
+ ceph_assert(!m_instance_replayer);
+
+ ceph_assert(m_on_finish != nullptr);
+ m_threads->work_queue->queue(m_on_finish, m_ret_val);
+ m_on_finish = nullptr;
+ m_ret_val = 0;
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_image_map(Context *on_finish) {
+ dout(10) << dendl;
+
+ auto image_map = ImageMap<I>::create(m_local_io_ctx, m_threads,
+ m_instance_watcher->get_instance_id(),
+ m_image_map_listener);
+
+ auto ctx = new LambdaContext(
+ [this, image_map, on_finish](int r) {
+ handle_init_image_map(r, image_map, on_finish);
+ });
+ image_map->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_image_map(int r, ImageMap<I> *image_map,
+ Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to init image map: " << cpp_strerror(r) << dendl;
+ on_finish = new LambdaContext([image_map, on_finish, r](int) {
+ delete image_map;
+ on_finish->complete(r);
+ });
+ image_map->shut_down(on_finish);
+ return;
+ }
+
+ ceph_assert(!m_image_map);
+ m_image_map.reset(image_map);
+
+ init_local_pool_watcher(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_local_pool_watcher(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(!m_local_pool_watcher);
+ m_local_pool_watcher.reset(PoolWatcher<I>::create(
+ m_threads, m_local_io_ctx, m_local_mirror_uuid,
+ m_local_pool_watcher_listener));
+
+ // ensure the initial set of local images is up-to-date
+ // after acquiring the leader role
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_init_local_pool_watcher(r, on_finish);
+ });
+ m_local_pool_watcher->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_local_pool_watcher(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to retrieve local images: " << cpp_strerror(r) << dendl;
+ on_finish = new LambdaContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_pool_watchers(on_finish);
+ return;
+ }
+
+ init_remote_pool_watcher(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_remote_pool_watcher(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(!m_remote_pool_watcher);
+ m_remote_pool_watcher.reset(PoolWatcher<I>::create(
+ m_threads, m_remote_io_ctx, m_remote_pool_meta.mirror_uuid,
+ m_remote_pool_watcher_listener));
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_init_remote_pool_watcher(r, on_finish);
+ });
+ m_remote_pool_watcher->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_remote_pool_watcher(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -ENOENT) {
+ // Technically nothing to do since the other side doesn't
+ // have mirroring enabled. Eventually the remote pool watcher will
+ // detect images (if mirroring is enabled), so no point propagating
+ // an error which would just busy-spin the state machines.
+ dout(0) << "remote peer does not have mirroring configured" << dendl;
+ } else if (r < 0) {
+ derr << "failed to retrieve remote images: " << cpp_strerror(r) << dendl;
+ on_finish = new LambdaContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_pool_watchers(on_finish);
+ return;
+ }
+
+ init_image_deleter(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::init_image_deleter(Context *on_finish) {
+ dout(10) << dendl;
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(!m_image_deleter);
+
+ on_finish = new LambdaContext([this, on_finish](int r) {
+ handle_init_image_deleter(r, on_finish);
+ });
+ m_image_deleter.reset(ImageDeleter<I>::create(m_local_io_ctx, m_threads,
+ m_image_deletion_throttler,
+ m_service_daemon));
+ m_image_deleter->init(create_async_context_callback(
+ m_threads->work_queue, on_finish));
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_init_image_deleter(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to init image deleter: " << cpp_strerror(r) << dendl;
+ on_finish = new LambdaContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_image_deleter(on_finish);
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_image_deleter(Context* on_finish) {
+ dout(10) << dendl;
+ {
+ std::lock_guard locker{m_lock};
+ if (m_image_deleter) {
+ Context *ctx = new LambdaContext([this, on_finish](int r) {
+ handle_shut_down_image_deleter(r, on_finish);
+ });
+ ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+ m_image_deleter->shut_down(ctx);
+ return;
+ }
+ }
+ shut_down_pool_watchers(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_image_deleter(
+ int r, Context* on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_image_deleter);
+ m_image_deleter.reset();
+ }
+
+ shut_down_pool_watchers(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_pool_watchers(Context *on_finish) {
+ dout(10) << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ if (m_local_pool_watcher) {
+ Context *ctx = new LambdaContext([this, on_finish](int r) {
+ handle_shut_down_pool_watchers(r, on_finish);
+ });
+ ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+ auto gather_ctx = new C_Gather(g_ceph_context, ctx);
+ m_local_pool_watcher->shut_down(gather_ctx->new_sub());
+ if (m_remote_pool_watcher) {
+ m_remote_pool_watcher->shut_down(gather_ctx->new_sub());
+ }
+ gather_ctx->activate();
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_pool_watchers(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_local_pool_watcher);
+ m_local_pool_watcher.reset();
+
+ if (m_remote_pool_watcher) {
+ m_remote_pool_watcher.reset();
+ }
+ }
+ shut_down_image_map(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::shut_down_image_map(Context *on_finish) {
+ dout(5) << dendl;
+
+ std::lock_guard locker{m_lock};
+ if (m_image_map) {
+ on_finish = new LambdaContext(
+ [this, on_finish](int r) {
+ handle_shut_down_image_map(r, on_finish);
+ });
+ m_image_map->shut_down(create_async_context_callback(
+ m_threads->work_queue, on_finish));
+ return;
+ }
+
+ m_threads->work_queue->queue(on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_shut_down_image_map(int r, Context *on_finish) {
+ dout(5) << "r=" << r << dendl;
+ if (r < 0 && r != -EBLOCKLISTED) {
+ derr << "failed to shut down image map: " << cpp_strerror(r) << dendl;
+ }
+
+ std::lock_guard locker{m_lock};
+ ceph_assert(m_image_map);
+ m_image_map.reset();
+
+ m_instance_replayer->release_all(create_async_context_callback(
+ m_threads->work_queue, on_finish));
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_acquire_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) {
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ m_instance_watcher->notify_image_acquire(instance_id, global_image_id,
+ on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_release_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) {
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ m_instance_watcher->notify_image_release(instance_id, global_image_id,
+ on_finish);
+}
+
+template <typename I>
+void NamespaceReplayer<I>::handle_remove_image(const std::string &mirror_uuid,
+ const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) {
+ ceph_assert(!mirror_uuid.empty());
+ dout(5) << "mirror_uuid=" << mirror_uuid << ", "
+ << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ m_instance_watcher->notify_peer_image_removed(instance_id, global_image_id,
+ mirror_uuid, on_finish);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::NamespaceReplayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/NamespaceReplayer.h b/src/tools/rbd_mirror/NamespaceReplayer.h
new file mode 100644
index 000000000..e304b8253
--- /dev/null
+++ b/src/tools/rbd_mirror/NamespaceReplayer.h
@@ -0,0 +1,308 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_NAMESPACE_REPLAYER_H
+#define CEPH_RBD_MIRROR_NAMESPACE_REPLAYER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "include/rados/librados.hpp"
+
+#include "tools/rbd_mirror/ImageDeleter.h"
+#include "tools/rbd_mirror/ImageMap.h"
+#include "tools/rbd_mirror/InstanceReplayer.h"
+#include "tools/rbd_mirror/InstanceWatcher.h"
+#include "tools/rbd_mirror/MirrorStatusUpdater.h"
+#include "tools/rbd_mirror/PoolWatcher.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_map/Types.h"
+#include "tools/rbd_mirror/pool_watcher/Types.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+class AdminSocketHook;
+
+namespace journal { struct CacheManagerHandler; }
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+struct PoolMetaCache;
+template <typename> class ServiceDaemon;
+template <typename> class Throttler;
+template <typename> struct Threads;
+
+/**
+ * Controls mirroring for a single namespace within a pool (the pool's
+ * default namespace is represented by an empty name).  One instance is
+ * created per mirroring-enabled namespace by the owning PoolReplayer.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class NamespaceReplayer {
+public:
+  // Factory helper; the caller owns the returned raw pointer.
+  static NamespaceReplayer *create(
+      const std::string &name,
+      librados::IoCtx &local_ioctx,
+      librados::IoCtx &remote_ioctx,
+      const std::string &local_mirror_uuid,
+      const std::string &local_mirror_peer_uuid,
+      const RemotePoolMeta& remote_pool_meta,
+      Threads<ImageCtxT> *threads,
+      Throttler<ImageCtxT> *image_sync_throttler,
+      Throttler<ImageCtxT> *image_deletion_throttler,
+      ServiceDaemon<ImageCtxT> *service_daemon,
+      journal::CacheManagerHandler *cache_manager_handler,
+      PoolMetaCache* pool_meta_cache) {
+    return new NamespaceReplayer(name, local_ioctx, remote_ioctx,
+                                 local_mirror_uuid, local_mirror_peer_uuid,
+                                 remote_pool_meta, threads,
+                                 image_sync_throttler, image_deletion_throttler,
+                                 service_daemon, cache_manager_handler,
+                                 pool_meta_cache);
+  }
+
+  NamespaceReplayer(const std::string &name,
+                    librados::IoCtx &local_ioctx,
+                    librados::IoCtx &remote_ioctx,
+                    const std::string &local_mirror_uuid,
+                    const std::string& local_mirror_peer_uuid,
+                    const RemotePoolMeta& remote_pool_meta,
+                    Threads<ImageCtxT> *threads,
+                    Throttler<ImageCtxT> *image_sync_throttler,
+                    Throttler<ImageCtxT> *image_deletion_throttler,
+                    ServiceDaemon<ImageCtxT> *service_daemon,
+                    journal::CacheManagerHandler *cache_manager_handler,
+                    PoolMetaCache* pool_meta_cache);
+  // Non-copyable: owns watchers, replayers and a mutex.
+  NamespaceReplayer(const NamespaceReplayer&) = delete;
+  NamespaceReplayer& operator=(const NamespaceReplayer&) = delete;
+
+  // Whether this replayer has observed that its client was blocklisted
+  // (see NamespaceReplayer.cc for the exact conditions).
+  bool is_blocklisted() const;
+
+  // Asynchronously drive the state machine below; on_finish fires when the
+  // transition completes.
+  void init(Context *on_finish);
+  void shut_down(Context *on_finish);
+
+  // Leader/instance lifecycle notifications, forwarded by the owning
+  // PoolReplayer's leader-election machinery.
+  void handle_acquire_leader(Context *on_finish);
+  void handle_release_leader(Context *on_finish);
+  void handle_update_leader(const std::string &leader_instance_id);
+  void handle_instances_added(const std::vector<std::string> &instance_ids);
+  void handle_instances_removed(const std::vector<std::string> &instance_ids);
+
+  // Admin-socket style controls, delegated from the pool replayer.
+  void print_status(Formatter *f);
+  void start();
+  void stop();
+  void restart();
+  void flush();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <uninitialized> <------------------------------------\
+   * | (init) ^ (error) |
+   * v * |
+   * INIT_LOCAL_STATUS_UPDATER * * * * * * * * > SHUT_DOWN_LOCAL_STATUS_UPDATER
+   * | * (error) ^
+   * v * |
+   * INIT_REMOTE_STATUS_UPDATER * * * * * * * > SHUT_DOWN_REMOTE_STATUS_UPDATER
+   * | * (error) ^
+   * v * |
+   * INIT_INSTANCE_REPLAYER * * * * * * * * * > SHUT_DOWN_INSTANCE_REPLAYER
+   * | * ^
+   * v * |
+   * INIT_INSTANCE_WATCHER * * * * * * * * * * SHUT_DOWN_INSTANCE_WATCHER
+   * | (error) ^
+   * | |
+   * v STOP_INSTANCE_REPLAYER
+   * | ^
+   * | (shut down) |
+   * | /----------------------------------------------/
+   * v |
+   * <follower> <---------------------------\
+   * . |
+   * . |
+   * v (leader acquired) |
+   * INIT_IMAGE_MAP |
+   * | |
+   * v |
+   * INIT_LOCAL_POOL_WATCHER SHUT_DOWN_IMAGE_MAP
+   * | ^
+   * v |
+   * INIT_REMOTE_POOL_WATCHER SHUT_DOWN_POOL_WATCHERS
+   * | ^
+   * v |
+   * INIT_IMAGE_DELETER SHUT_DOWN_IMAGE_DELETER
+   * | ^
+   * v .
+   * <leader> <-----------\ .
+   * . | .
+   * . (image update) | .
+   * . . > NOTIFY_INSTANCE_WATCHER .
+   * . .
+   * . (leader lost / shut down) .
+   * . . . . . . . . . . . . . . . . . . .
+   *
+   * @endverbatim
+   */
+
+  // Adapts pool-watcher updates into handle_update() calls; a local-pool
+  // listener reports with an empty mirror uuid.
+  struct PoolWatcherListener : public pool_watcher::Listener {
+    NamespaceReplayer *namespace_replayer;
+    bool local;
+
+    PoolWatcherListener(NamespaceReplayer *namespace_replayer, bool local)
+      : namespace_replayer(namespace_replayer), local(local) {
+    }
+
+    void handle_update(const std::string &mirror_uuid,
+                       ImageIds &&added_image_ids,
+                       ImageIds &&removed_image_ids) override {
+      namespace_replayer->handle_update((local ? "" : mirror_uuid),
+                                        std::move(added_image_ids),
+                                        std::move(removed_image_ids));
+    }
+  };
+
+  // Adapts ImageMap callbacks into the corresponding handle_* methods.
+  struct ImageMapListener : public image_map::Listener {
+    NamespaceReplayer *namespace_replayer;
+
+    ImageMapListener(NamespaceReplayer *namespace_replayer)
+      : namespace_replayer(namespace_replayer) {
+    }
+
+    void acquire_image(const std::string &global_image_id,
+                       const std::string &instance_id,
+                       Context* on_finish) override {
+      namespace_replayer->handle_acquire_image(global_image_id, instance_id,
+                                               on_finish);
+    }
+
+    void release_image(const std::string &global_image_id,
+                       const std::string &instance_id,
+                       Context* on_finish) override {
+      namespace_replayer->handle_release_image(global_image_id, instance_id,
+                                               on_finish);
+    }
+
+    void remove_image(const std::string &mirror_uuid,
+                      const std::string &global_image_id,
+                      const std::string &instance_id,
+                      Context* on_finish) override {
+      namespace_replayer->handle_remove_image(mirror_uuid, global_image_id,
+                                              instance_id, on_finish);
+    }
+  };
+
+  void handle_update(const std::string &mirror_uuid,
+                     ImageIds &&added_image_ids,
+                     ImageIds &&removed_image_ids);
+
+  // NOTE(review): this declaration mirrors PoolReplayer::init_rados and no
+  // definition appears in NamespaceReplayer.cc within this patch -- it looks
+  // like dead code copied from PoolReplayer; confirm and remove if unused.
+  int init_rados(const std::string &cluster_name,
+                 const std::string &client_name,
+                 const std::string &mon_host,
+                 const std::string &key,
+                 const std::string &description, RadosRef *rados_ref,
+                 bool strip_cluster_overrides);
+
+  // Pairwise init/handle_* steps of the state machine diagrammed above.
+  void init_local_status_updater();
+  void handle_init_local_status_updater(int r);
+
+  void init_remote_status_updater();
+  void handle_init_remote_status_updater(int r);
+
+  void init_instance_replayer();
+  void handle_init_instance_replayer(int r);
+
+  void init_instance_watcher();
+  void handle_init_instance_watcher(int r);
+
+  void stop_instance_replayer();
+  void handle_stop_instance_replayer(int r);
+
+  void shut_down_instance_watcher();
+  void handle_shut_down_instance_watcher(int r);
+
+  void shut_down_instance_replayer();
+  void handle_shut_down_instance_replayer(int r);
+
+  void shut_down_remote_status_updater();
+  void handle_shut_down_remote_status_updater(int r);
+
+  void shut_down_local_status_updater();
+  void handle_shut_down_local_status_updater(int r);
+
+  // Leader-only components (image map, pool watchers, image deleter).
+  void init_image_map(Context *on_finish);
+  void handle_init_image_map(int r, ImageMap<ImageCtxT> *image_map,
+                             Context *on_finish);
+
+  void init_local_pool_watcher(Context *on_finish);
+  void handle_init_local_pool_watcher(int r, Context *on_finish);
+
+  void init_remote_pool_watcher(Context *on_finish);
+  void handle_init_remote_pool_watcher(int r, Context *on_finish);
+
+  void init_image_deleter(Context* on_finish);
+  void handle_init_image_deleter(int r, Context* on_finish);
+
+  void shut_down_image_deleter(Context* on_finish);
+  void handle_shut_down_image_deleter(int r, Context* on_finish);
+
+  void shut_down_pool_watchers(Context *on_finish);
+  void handle_shut_down_pool_watchers(int r, Context *on_finish);
+
+  void shut_down_image_map(Context *on_finish);
+  void handle_shut_down_image_map(int r, Context *on_finish);
+
+  // ImageMapListener targets.
+  void handle_acquire_image(const std::string &global_image_id,
+                            const std::string &instance_id,
+                            Context* on_finish);
+  void handle_release_image(const std::string &global_image_id,
+                            const std::string &instance_id,
+                            Context* on_finish);
+  void handle_remove_image(const std::string &mirror_uuid,
+                           const std::string &global_image_id,
+                           const std::string &instance_id,
+                           Context* on_finish);
+
+  // Construction-time configuration (empty m_namespace_name == default ns).
+  std::string m_namespace_name;
+  librados::IoCtx m_local_io_ctx;
+  librados::IoCtx m_remote_io_ctx;
+  std::string m_local_mirror_uuid;
+  std::string m_local_mirror_peer_uuid;
+  RemotePoolMeta m_remote_pool_meta;
+  Threads<ImageCtxT> *m_threads;
+  Throttler<ImageCtxT> *m_image_sync_throttler;
+  Throttler<ImageCtxT> *m_image_deletion_throttler;
+  ServiceDaemon<ImageCtxT> *m_service_daemon;
+  journal::CacheManagerHandler *m_cache_manager_handler;
+  PoolMetaCache* m_pool_meta_cache;
+
+  // Guards the mutable state below.
+  mutable ceph::mutex m_lock;
+
+  int m_ret_val = 0;
+  Context *m_on_finish = nullptr;
+
+  std::unique_ptr<MirrorStatusUpdater<ImageCtxT>> m_local_status_updater;
+  std::unique_ptr<MirrorStatusUpdater<ImageCtxT>> m_remote_status_updater;
+
+  PoolWatcherListener m_local_pool_watcher_listener;
+  std::unique_ptr<PoolWatcher<ImageCtxT>> m_local_pool_watcher;
+
+  PoolWatcherListener m_remote_pool_watcher_listener;
+  std::unique_ptr<PoolWatcher<ImageCtxT>> m_remote_pool_watcher;
+
+  std::unique_ptr<InstanceReplayer<ImageCtxT>> m_instance_replayer;
+  std::unique_ptr<ImageDeleter<ImageCtxT>> m_image_deleter;
+
+  ImageMapListener m_image_map_listener;
+  std::unique_ptr<ImageMap<ImageCtxT>> m_image_map;
+
+  std::unique_ptr<InstanceWatcher<ImageCtxT>> m_instance_watcher;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::NamespaceReplayer<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_NAMESPACE_REPLAYER_H
diff --git a/src/tools/rbd_mirror/PoolMetaCache.cc b/src/tools/rbd_mirror/PoolMetaCache.cc
new file mode 100644
index 000000000..261802a55
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolMetaCache.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/dout.h"
+#include "tools/rbd_mirror/PoolMetaCache.h"
+#include <shared_mutex>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PoolMetaCache: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+// Look up cached metadata for a local pool.  Fills *local_pool_meta and
+// returns 0 on success, or -ENOENT if the pool has not been cached.
+int PoolMetaCache::get_local_pool_meta(
+    int64_t pool_id,
+    LocalPoolMeta* local_pool_meta) const {
+  dout(15) << "pool_id=" << pool_id << dendl;
+
+  // shared (reader) lock: lookups may proceed concurrently
+  std::shared_lock locker{m_lock};
+  auto it = m_local_pool_metas.find(pool_id);
+  if (it == m_local_pool_metas.end()) {
+    return -ENOENT;
+  }
+
+  *local_pool_meta = it->second;
+  return 0;
+}
+
+// Insert or overwrite the cached metadata for a local pool.
+void PoolMetaCache::set_local_pool_meta(
+    int64_t pool_id,
+    const LocalPoolMeta& local_pool_meta) {
+  dout(15) << "pool_id=" << pool_id << ", "
+           << "local_pool_meta=" << local_pool_meta << dendl;
+
+  // exclusive (writer) lock for mutation
+  std::unique_lock locker(m_lock);
+  m_local_pool_metas[pool_id] = local_pool_meta;
+}
+
+// Drop the cached metadata for a local pool (no-op if absent).
+void PoolMetaCache::remove_local_pool_meta(int64_t pool_id) {
+  dout(15) << "pool_id=" << pool_id << dendl;
+
+  std::unique_lock locker(m_lock);
+  m_local_pool_metas.erase(pool_id);
+}
+
+// Look up cached metadata for a remote pool.  Fills *remote_pool_meta and
+// returns 0 on success, or -ENOENT if the pool has not been cached.
+int PoolMetaCache::get_remote_pool_meta(
+    int64_t pool_id,
+    RemotePoolMeta* remote_pool_meta) const {
+  dout(15) << "pool_id=" << pool_id << dendl;
+
+  // shared (reader) lock: lookups may proceed concurrently
+  std::shared_lock locker{m_lock};
+  auto it = m_remote_pool_metas.find(pool_id);
+  if (it == m_remote_pool_metas.end()) {
+    return -ENOENT;
+  }
+
+  *remote_pool_meta = it->second;
+  return 0;
+}
+
+// Insert or overwrite the cached metadata for a remote pool.
+void PoolMetaCache::set_remote_pool_meta(
+    int64_t pool_id,
+    const RemotePoolMeta& remote_pool_meta) {
+  dout(15) << "pool_id=" << pool_id << ", "
+           << "remote_pool_meta=" << remote_pool_meta << dendl;
+
+  // exclusive (writer) lock for mutation
+  std::unique_lock locker(m_lock);
+  m_remote_pool_metas[pool_id] = remote_pool_meta;
+}
+
+// Drop the cached metadata for a remote pool (no-op if absent).
+void PoolMetaCache::remove_remote_pool_meta(int64_t pool_id) {
+  dout(15) << "pool_id=" << pool_id << dendl;
+
+  std::unique_lock locker(m_lock);
+  m_remote_pool_metas.erase(pool_id);
+}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/PoolMetaCache.h b/src/tools/rbd_mirror/PoolMetaCache.h
new file mode 100644
index 000000000..f0440120f
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolMetaCache.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_META_CACHE_H
+#define CEPH_RBD_MIRROR_POOL_META_CACHE_H
+
+#include "include/int_types.h"
+#include "common/ceph_mutex.h"
+#include "tools/rbd_mirror/Types.h"
+#include <map>
+
+namespace rbd {
+namespace mirror {
+
+// Thread-safe cache of per-pool mirroring metadata (local mirror uuid and
+// remote pool info), keyed by pool id.  Reads take a shared lock; writes
+// take an exclusive lock.  Populated/torn down by PoolReplayer.
+class PoolMetaCache {
+public:
+  PoolMetaCache(CephContext* cct)
+    : m_cct(cct) {
+  }
+  // Non-copyable: owns a mutex.
+  PoolMetaCache(const PoolMetaCache&) = delete;
+  PoolMetaCache& operator=(const PoolMetaCache&) = delete;
+
+  // Local pool entries: get returns -ENOENT when not cached.
+  int get_local_pool_meta(int64_t pool_id,
+                          LocalPoolMeta* local_pool_meta) const;
+  void set_local_pool_meta(int64_t pool_id,
+                           const LocalPoolMeta& local_pool_meta);
+  void remove_local_pool_meta(int64_t pool_id);
+
+  // Remote pool entries: get returns -ENOENT when not cached.
+  int get_remote_pool_meta(int64_t pool_id,
+                           RemotePoolMeta* remote_pool_meta) const;
+  void set_remote_pool_meta(int64_t pool_id,
+                            const RemotePoolMeta& remote_pool_meta);
+  void remove_remote_pool_meta(int64_t pool_id);
+
+private:
+  CephContext* m_cct;
+
+  // Reader/writer lock guarding both maps below.
+  mutable ceph::shared_mutex m_lock =
+    ceph::make_shared_mutex("rbd::mirror::PoolMetaCache::m_lock");
+  std::map<int64_t, LocalPoolMeta> m_local_pool_metas;
+  std::map<int64_t, RemotePoolMeta> m_remote_pool_metas;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_POOL_META_CACHE_H
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
new file mode 100644
index 000000000..de0d60241
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolReplayer.cc
@@ -0,0 +1,1109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PoolReplayer.h"
+#include "common/Cond.h"
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/code_environment.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "global/global_context.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Namespace.h"
+#include "PoolMetaCache.h"
+#include "RemotePoolPoller.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PoolReplayer: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using ::operator<<;
+
+namespace {
+
+// Attribute keys reported to the service daemon for this pool replayer.
+const std::string SERVICE_DAEMON_INSTANCE_ID_KEY("instance_id");
+const std::string SERVICE_DAEMON_LEADER_KEY("leader");
+
+// Cluster-specific config keys that must not leak from the local cluster's
+// configuration into a remote peer connection (see init_rados, which saves
+// and restores these when strip_cluster_overrides is set).
+const std::vector<std::string> UNIQUE_PEER_CONFIG_KEYS {
+  {"monmap", "mon_host", "mon_dns_srv_name", "key", "keyfile", "keyring"}};
+
+// Base class for admin-socket commands operating on a PoolReplayer; each
+// subclass implements call() to perform one command.
+template <typename I>
+class PoolReplayerAdminSocketCommand {
+public:
+  PoolReplayerAdminSocketCommand(PoolReplayer<I> *pool_replayer)
+    : pool_replayer(pool_replayer) {
+  }
+  virtual ~PoolReplayerAdminSocketCommand() {}
+  virtual int call(Formatter *f) = 0;
+protected:
+  PoolReplayer<I> *pool_replayer;  // not owned
+};
+
+// "rbd mirror status <name>" asok command: dump replayer status via f.
+template <typename I>
+class StatusCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit StatusCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->pool_replayer->print_status(f);
+    return 0;
+  }
+};
+
+// "rbd mirror start <name>" asok command.
+template <typename I>
+class StartCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit StartCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->pool_replayer->start();
+    return 0;
+  }
+};
+
+// "rbd mirror stop <name>" asok command (true == manually requested stop).
+template <typename I>
+class StopCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit StopCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->pool_replayer->stop(true);
+    return 0;
+  }
+};
+
+// "rbd mirror restart <name>" asok command.
+template <typename I>
+class RestartCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit RestartCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->pool_replayer->restart();
+    return 0;
+  }
+};
+
+// "rbd mirror flush <name>" asok command.
+template <typename I>
+class FlushCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit FlushCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->pool_replayer->flush();
+    return 0;
+  }
+};
+
+// "rbd mirror leader release <name>" asok command: give up pool leadership.
+template <typename I>
+class LeaderReleaseCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+  explicit LeaderReleaseCommand(PoolReplayer<I> *pool_replayer)
+    : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    this->pool_replayer->release_leader();
+    return 0;
+  }
+};
+
+// Registers the per-pool-replayer admin-socket commands (status, start,
+// stop, restart, flush, leader release) and dispatches incoming calls to
+// the matching PoolReplayerAdminSocketCommand instance.  Registration
+// failures are tolerated: a command is only added to the dispatch map when
+// register_command() returned 0.
+template <typename I>
+class PoolReplayerAdminSocketHook : public AdminSocketHook {
+public:
+  PoolReplayerAdminSocketHook(CephContext *cct, const std::string &name,
+                              PoolReplayer<I> *pool_replayer)
+    : admin_socket(cct->get_admin_socket()) {
+    std::string command;
+    int r;
+
+    command = "rbd mirror status " + name;
+    r = admin_socket->register_command(command, this,
+                                       "get status for rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new StatusCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror start " + name;
+    r = admin_socket->register_command(command, this,
+                                       "start rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new StartCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror stop " + name;
+    r = admin_socket->register_command(command, this,
+                                       "stop rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new StopCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror restart " + name;
+    r = admin_socket->register_command(command, this,
+                                       "restart rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new RestartCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror flush " + name;
+    r = admin_socket->register_command(command, this,
+                                       "flush rbd mirror " + name);
+    if (r == 0) {
+      commands[command] = new FlushCommand<I>(pool_replayer);
+    }
+
+    command = "rbd mirror leader release " + name;
+    r = admin_socket->register_command(command, this,
+                                       "release rbd mirror leader " + name);
+    if (r == 0) {
+      commands[command] = new LeaderReleaseCommand<I>(pool_replayer);
+    }
+  }
+
+  ~PoolReplayerAdminSocketHook() override {
+    // unregister everything registered against this hook, then free the
+    // command objects we own
+    (void)admin_socket->unregister_commands(this);
+    for (auto i = commands.begin(); i != commands.end(); ++i) {
+      delete i->second;
+    }
+  }
+
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+           Formatter *f,
+           std::ostream& ss,
+           bufferlist& out) override {
+    auto i = commands.find(command);
+    // only registered commands can reach this hook
+    ceph_assert(i != commands.end());
+    return i->second->call(f);
+  }
+
+private:
+  // std::less<> enables heterogeneous lookup by string_view in call()
+  typedef std::map<std::string, PoolReplayerAdminSocketCommand<I>*,
+                   std::less<>> Commands;
+
+  AdminSocket *admin_socket;  // not owned
+  Commands commands;          // owned command objects, freed in dtor
+};
+
+} // anonymous namespace
+
+// Forwards remote pool metadata updates from the RemotePoolPoller to the
+// owning PoolReplayer.
+template <typename I>
+struct PoolReplayer<I>::RemotePoolPollerListener
+  : public remote_pool_poller::Listener {
+
+  PoolReplayer<I>* m_pool_replayer;  // not owned
+
+  RemotePoolPollerListener(PoolReplayer<I>* pool_replayer)
+    : m_pool_replayer(pool_replayer) {
+  }
+
+  void handle_updated(const RemotePoolMeta& remote_pool_meta) override {
+    m_pool_replayer->handle_remote_pool_meta_updated(remote_pool_meta);
+  }
+};
+
+// Constructor: stores the injected collaborators; no I/O is performed here
+// (connections are established later in init()).
+template <typename I>
+PoolReplayer<I>::PoolReplayer(
+    Threads<I> *threads, ServiceDaemon<I> *service_daemon,
+    journal::CacheManagerHandler *cache_manager_handler,
+    PoolMetaCache* pool_meta_cache, int64_t local_pool_id,
+    const PeerSpec &peer, const std::vector<const char*> &args) :
+  m_threads(threads),
+  m_service_daemon(service_daemon),
+  m_cache_manager_handler(cache_manager_handler),
+  m_pool_meta_cache(pool_meta_cache),
+  m_local_pool_id(local_pool_id),
+  m_peer(peer),
+  m_args(args),
+  m_lock(ceph::make_mutex("rbd::mirror::PoolReplayer " + stringify(peer))),
+  m_pool_replayer_thread(this),
+  m_leader_listener(this) {
+}
+
+// Destructor: performs a full synchronous shut_down(); the asok hook must
+// already have been destroyed by the replayer thread (see run()).
+template <typename I>
+PoolReplayer<I>::~PoolReplayer()
+{
+  shut_down();
+
+  ceph_assert(m_asok_hook == nullptr);
+}
+
+// Whether any component of this pool replayer was blocklisted (flag is set
+// by the run() loop).
+template <typename I>
+bool PoolReplayer<I>::is_blocklisted() const {
+  std::lock_guard locker{m_lock};
+  return m_blocklisted;
+}
+
+// Whether this daemon instance currently holds pool leadership.
+template <typename I>
+bool PoolReplayer<I>::is_leader() const {
+  std::lock_guard locker{m_lock};
+  return m_leader_watcher && m_leader_watcher->is_leader();
+}
+
+// Whether the replayer thread is active and a stop has not been requested.
+// NOTE(review): reads m_pool_replayer_thread/m_stopping without m_lock --
+// presumably tolerated as an advisory check; confirm against callers.
+template <typename I>
+bool PoolReplayer<I>::is_running() const {
+  return m_pool_replayer_thread.is_started() && !m_stopping;
+}
+
+// Synchronously bootstrap the pool replayer: connect to the local and
+// remote clusters, open the pool I/O contexts, resolve the local mirror
+// uuid, start the remote pool poller, the default-namespace replayer and
+// the leader watcher, and finally launch the replayer thread.  On any
+// failure a service-daemon callout is raised (where applicable) and the
+// function returns early, leaving the replayer partially initialized;
+// shut_down() handles that state.
+template <typename I>
+void PoolReplayer<I>::init(const std::string& site_name) {
+  std::lock_guard locker{m_lock};
+
+  ceph_assert(!m_pool_replayer_thread.is_started());
+
+  // reset state
+  m_stopping = false;
+  m_blocklisted = false;
+  m_site_name = site_name;
+
+  dout(10) << "replaying for " << m_peer << dendl;
+  // local cluster connection reuses the daemon's own config/name
+  int r = init_rados(g_ceph_context->_conf->cluster,
+                     g_ceph_context->_conf->name.to_str(),
+                     "", "", "local cluster", &m_local_rados, false);
+  if (r < 0) {
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to connect to local cluster");
+    return;
+  }
+
+  // remote peer connection strips cluster-specific overrides (last arg)
+  r = init_rados(m_peer.cluster_name, m_peer.client_name,
+                 m_peer.mon_host, m_peer.key,
+                 std::string("remote peer ") + stringify(m_peer),
+                 &m_remote_rados, true);
+  if (r < 0) {
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to connect to remote cluster");
+    return;
+  }
+
+  r = m_local_rados->ioctx_create2(m_local_pool_id, m_local_io_ctx);
+  if (r < 0) {
+    derr << "error accessing local pool " << m_local_pool_id << ": "
+         << cpp_strerror(r) << dendl;
+    return;
+  }
+
+  auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+  librbd::api::Config<I>::apply_pool_overrides(m_local_io_ctx, &cct->_conf);
+
+  r = librbd::cls_client::mirror_uuid_get(&m_local_io_ctx,
+                                          &m_local_mirror_uuid);
+  if (r < 0) {
+    derr << "failed to retrieve local mirror uuid from pool "
+         << m_local_io_ctx.get_pool_name() << ": " << cpp_strerror(r) << dendl;
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to query local mirror uuid");
+    return;
+  }
+
+  // the remote pool is looked up by the local pool's *name*
+  r = m_remote_rados->ioctx_create(m_local_io_ctx.get_pool_name().c_str(),
+                                   m_remote_io_ctx);
+  if (r < 0) {
+    derr << "error accessing remote pool " << m_local_io_ctx.get_pool_name()
+         << ": " << cpp_strerror(r) << dendl;
+    // WARNING (not ERROR): remote pool may simply not exist yet
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_WARNING,
+      "unable to access remote pool");
+    return;
+  }
+
+  dout(10) << "connected to " << m_peer << dendl;
+
+  // throttlers shared by all namespace replayers of this pool
+  m_image_sync_throttler.reset(
+    Throttler<I>::create(cct, "rbd_mirror_concurrent_image_syncs"));
+
+  m_image_deletion_throttler.reset(
+    Throttler<I>::create(cct, "rbd_mirror_concurrent_image_deletions"));
+
+  m_remote_pool_poller_listener.reset(new RemotePoolPollerListener(this));
+  m_remote_pool_poller.reset(RemotePoolPoller<I>::create(
+    m_threads, m_remote_io_ctx, m_site_name, m_local_mirror_uuid,
+    *m_remote_pool_poller_listener));
+
+  // block until the first remote pool metadata poll completes
+  C_SaferCond on_pool_poller_init;
+  m_remote_pool_poller->init(&on_pool_poller_init);
+  r = on_pool_poller_init.wait();
+  if (r < 0) {
+    derr << "failed to initialize remote pool poller: " << cpp_strerror(r)
+         << dendl;
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to initialize remote pool poller");
+    m_remote_pool_poller.reset();
+    return;
+  }
+  ceph_assert(!m_remote_pool_meta.mirror_uuid.empty());
+  // publish pool metadata for consumers resolving pools by id
+  m_pool_meta_cache->set_remote_pool_meta(
+    m_remote_io_ctx.get_id(), m_remote_pool_meta);
+  m_pool_meta_cache->set_local_pool_meta(
+    m_local_io_ctx.get_id(), {m_local_mirror_uuid});
+
+  // empty name == the pool's default namespace
+  m_default_namespace_replayer.reset(NamespaceReplayer<I>::create(
+    "", m_local_io_ctx, m_remote_io_ctx, m_local_mirror_uuid, m_peer.uuid,
+    m_remote_pool_meta, m_threads, m_image_sync_throttler.get(),
+    m_image_deletion_throttler.get(), m_service_daemon,
+    m_cache_manager_handler, m_pool_meta_cache));
+
+  C_SaferCond on_init;
+  m_default_namespace_replayer->init(&on_init);
+  r = on_init.wait();
+  if (r < 0) {
+    derr << "error initializing default namespace replayer: " << cpp_strerror(r)
+         << dendl;
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to initialize default namespace replayer");
+    m_default_namespace_replayer.reset();
+    return;
+  }
+
+  m_leader_watcher.reset(LeaderWatcher<I>::create(m_threads, m_local_io_ctx,
+                                                  &m_leader_listener));
+  r = m_leader_watcher->init();
+  if (r < 0) {
+    derr << "error initializing leader watcher: " << cpp_strerror(r) << dendl;
+    m_callout_id = m_service_daemon->add_or_update_callout(
+      m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+      "unable to initialize leader messenger object");
+    m_leader_watcher.reset();
+    return;
+  }
+
+  // init succeeded: clear any callout raised by a previous attempt
+  if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+    m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+    m_callout_id = service_daemon::CALLOUT_ID_NONE;
+  }
+
+  m_service_daemon->add_or_update_attribute(
+    m_local_io_ctx.get_id(), SERVICE_DAEMON_INSTANCE_ID_KEY,
+    stringify(m_local_io_ctx.get_instance_id()));
+
+  m_pool_replayer_thread.create("pool replayer");
+}
+
+// Synchronously tear down everything init() created, in reverse order:
+// stop the replayer thread, the leader watcher, the default namespace
+// replayer and the remote pool poller (also removing this pool's entries
+// from the pool-meta cache), then drop the throttlers and the rados
+// handles.  Each step tolerates the component never having been created,
+// so this is safe after a partial init().
+template <typename I>
+void PoolReplayer<I>::shut_down() {
+  {
+    std::lock_guard l{m_lock};
+    m_stopping = true;
+    m_cond.notify_all();  // wake run() out of its refresh-interval wait
+  }
+  if (m_pool_replayer_thread.is_started()) {
+    m_pool_replayer_thread.join();
+  }
+
+  if (m_leader_watcher) {
+    m_leader_watcher->shut_down();
+  }
+  m_leader_watcher.reset();
+
+  if (m_default_namespace_replayer) {
+    C_SaferCond on_shut_down;
+    m_default_namespace_replayer->shut_down(&on_shut_down);
+    on_shut_down.wait();
+  }
+  m_default_namespace_replayer.reset();
+
+  if (m_remote_pool_poller) {
+    C_SaferCond ctx;
+    m_remote_pool_poller->shut_down(&ctx);
+    ctx.wait();
+
+    // undo the pool-meta cache entries published in init()
+    m_pool_meta_cache->remove_remote_pool_meta(m_remote_io_ctx.get_id());
+    m_pool_meta_cache->remove_local_pool_meta(m_local_io_ctx.get_id());
+  }
+  m_remote_pool_poller.reset();
+  m_remote_pool_poller_listener.reset();
+
+  m_image_sync_throttler.reset();
+  m_image_deletion_throttler.reset();
+
+  m_local_rados.reset();
+  m_remote_rados.reset();
+}
+
+// Build a connected librados::Rados handle for either the local cluster or
+// a remote peer.  Config is layered in order: conf files, environment,
+// daemon command-line args (m_args), then explicit mon_host/key overrides.
+// With strip_cluster_overrides set, peer-unique keys (UNIQUE_PEER_CONFIG_KEYS)
+// are restored to their pre-override values so local-cluster settings do
+// not leak into the peer connection.  Returns 0 on success or a negative
+// errno; on failure the CephContext reference is released.
+template <typename I>
+int PoolReplayer<I>::init_rados(const std::string &cluster_name,
+                                const std::string &client_name,
+                                const std::string &mon_host,
+                                const std::string &key,
+                                const std::string &description,
+                                RadosRef *rados_ref,
+                                bool strip_cluster_overrides) {
+  // NOTE: manually bootstrap a CephContext here instead of via
+  // the librados API to avoid mixing global singletons between
+  // the librados shared library and the daemon
+  // TODO: eliminate intermingling of global singletons within Ceph APIs
+  CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
+  if (client_name.empty() || !iparams.name.from_str(client_name)) {
+    derr << "error initializing cluster handle for " << description << dendl;
+    return -EINVAL;
+  }
+
+  CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY,
+                                    CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+  cct->_conf->cluster = cluster_name;
+
+  // librados::Rados::conf_read_file
+  int r = cct->_conf.parse_config_files(nullptr, nullptr, 0);
+  if (r < 0 && r != -ENOENT) {
+    // do not treat this as fatal, it might still be able to connect
+    derr << "could not read ceph conf for " << description << ": "
+         << cpp_strerror(r) << dendl;
+  }
+
+  // preserve cluster-specific config settings before applying environment/cli
+  // overrides
+  std::map<std::string, std::string> config_values;
+  if (strip_cluster_overrides) {
+    // remote peer connections shouldn't apply cluster-specific
+    // configuration settings
+    for (auto& key : UNIQUE_PEER_CONFIG_KEYS) {
+      config_values[key] = cct->_conf.get_val<std::string>(key);
+    }
+  }
+
+  cct->_conf.parse_env(cct->get_module_type());
+
+  // librados::Rados::conf_parse_env
+  std::vector<const char*> args;
+  r = cct->_conf.parse_argv(args);
+  if (r < 0) {
+    derr << "could not parse environment for " << description << ":"
+         << cpp_strerror(r) << dendl;
+    cct->put();
+    return r;
+  }
+  cct->_conf.parse_env(cct->get_module_type());
+
+  if (!m_args.empty()) {
+    // librados::Rados::conf_parse_argv
+    args = m_args;
+    r = cct->_conf.parse_argv(args);
+    if (r < 0) {
+      derr << "could not parse command line args for " << description << ": "
+           << cpp_strerror(r) << dendl;
+      cct->put();
+      return r;
+    }
+  }
+
+  if (strip_cluster_overrides) {
+    // remote peer connections shouldn't apply cluster-specific
+    // configuration settings
+    for (auto& pair : config_values) {
+      auto value = cct->_conf.get_val<std::string>(pair.first);
+      if (pair.second != value) {
+        dout(0) << "reverting global config option override: "
+                << pair.first << ": " << value << " -> " << pair.second
+                << dendl;
+        cct->_conf.set_val_or_die(pair.first, pair.second);
+      }
+    }
+  }
+
+  // give this connection its own admin socket if the daemon has one enabled
+  if (!g_ceph_context->_conf->admin_socket.empty()) {
+    cct->_conf.set_val_or_die("admin_socket",
+                              "$run_dir/$name.$pid.$cluster.$cctid.asok");
+  }
+
+  if (!mon_host.empty()) {
+    r = cct->_conf.set_val("mon_host", mon_host);
+    if (r < 0) {
+      derr << "failed to set mon_host config for " << description << ": "
+           << cpp_strerror(r) << dendl;
+      cct->put();
+      return r;
+    }
+  }
+
+  if (!key.empty()) {
+    r = cct->_conf.set_val("key", key);
+    if (r < 0) {
+      derr << "failed to set key config for " << description << ": "
+           << cpp_strerror(r) << dendl;
+      cct->put();
+      return r;
+    }
+  }
+
+  // disable unnecessary librbd cache
+  cct->_conf.set_val_or_die("rbd_cache", "false");
+  cct->_conf.apply_changes(nullptr);
+  cct->_conf.complain_about_parse_error(cct);
+
+  rados_ref->reset(new librados::Rados());
+
+  // the Rados handle takes its own ref on cct; release ours afterwards
+  r = (*rados_ref)->init_with_context(cct);
+  ceph_assert(r == 0);
+  cct->put();
+
+  r = (*rados_ref)->connect();
+  if (r < 0) {
+    derr << "error connecting to " << description << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Replayer thread body: periodically refreshes the set of namespace
+// replayers, (re)registers the admin-socket hook when the pool/peer name
+// changes, and watches for blocklisting; on stop or blocklist it shuts
+// down all namespace replayers and tears down the asok hook.
+template <typename I>
+void PoolReplayer<I>::run() {
+  dout(20) << dendl;
+
+  while (true) {
+    // (re)create the asok hook if the pool/peer-derived name changed
+    std::string asok_hook_name = m_local_io_ctx.get_pool_name() + " " +
+                                 m_peer.cluster_name;
+    if (m_asok_hook_name != asok_hook_name || m_asok_hook == nullptr) {
+      m_asok_hook_name = asok_hook_name;
+      delete m_asok_hook;
+
+      m_asok_hook = new PoolReplayerAdminSocketHook<I>(g_ceph_context,
+                                                       m_asok_hook_name, this);
+    }
+
+    with_namespace_replayers([this]() { update_namespace_replayers(); });
+
+    std::unique_lock locker{m_lock};
+
+    // a blocklisted component forces this replayer to stop; the daemon is
+    // expected to notice via is_blocklisted() and restart
+    if (m_leader_watcher->is_blocklisted() ||
+        m_default_namespace_replayer->is_blocklisted()) {
+      m_blocklisted = true;
+      m_stopping = true;
+    }
+
+    for (auto &it : m_namespace_replayers) {
+      if (it.second->is_blocklisted()) {
+        m_blocklisted = true;
+        m_stopping = true;
+        break;
+      }
+    }
+
+    if (m_stopping) {
+      break;
+    }
+
+    // sleep until the next refresh (or until shut_down() notifies m_cond)
+    auto seconds = g_ceph_context->_conf.get_val<uint64_t>(
+      "rbd_mirror_pool_replayers_refresh_interval");
+    m_cond.wait_for(locker, ceph::make_timespan(seconds));
+  }
+
+  // shut down namespace replayers (m_stopping makes the update remove all)
+  with_namespace_replayers([this]() { update_namespace_replayers(); });
+
+  delete m_asok_hook;
+  m_asok_hook = nullptr;
+}
+
+// Reconcile the running NamespaceReplayers with the set of namespaces that
+// currently have mirroring enabled: shut down replayers for namespaces that
+// disappeared and create/init replayers for new ones.  While stopping, the
+// mirroring set is treated as empty so every replayer is shut down.  Called
+// with m_lock held; the lock is dropped while waiting for async completions.
+template <typename I>
+void PoolReplayer<I>::update_namespace_replayers() {
+  dout(20) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  std::set<std::string> mirroring_namespaces;
+  if (!m_stopping) {
+    int r = list_mirroring_namespaces(&mirroring_namespaces);
+    if (r < 0) {
+      return;
+    }
+  }
+
+  auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+  C_SaferCond cond;
+  auto gather_ctx = new C_Gather(cct, &cond);
+  // shut down replayers whose namespace no longer mirrors; surviving entries
+  // are removed from mirroring_namespaces so only the new ones remain below
+  for (auto it = m_namespace_replayers.begin();
+       it != m_namespace_replayers.end(); ) {
+    auto iter = mirroring_namespaces.find(it->first);
+    if (iter == mirroring_namespaces.end()) {
+      auto namespace_replayer = it->second;
+      auto on_shut_down = new LambdaContext(
+        [namespace_replayer, ctx=gather_ctx->new_sub()](int r) {
+          delete namespace_replayer;
+          ctx->complete(r);
+        });
+      m_service_daemon->remove_namespace(m_local_pool_id, it->first);
+      namespace_replayer->shut_down(on_shut_down);
+      it = m_namespace_replayers.erase(it);
+    } else {
+      mirroring_namespaces.erase(iter);
+      it++;
+    }
+  }
+
+  // create and initialize replayers for the newly mirroring namespaces;
+  // capturing mirroring_namespaces by reference is safe because we block on
+  // cond below until the gather completes
+  for (auto &name : mirroring_namespaces) {
+    auto namespace_replayer = NamespaceReplayer<I>::create(
+        name, m_local_io_ctx, m_remote_io_ctx, m_local_mirror_uuid, m_peer.uuid,
+        m_remote_pool_meta, m_threads, m_image_sync_throttler.get(),
+        m_image_deletion_throttler.get(), m_service_daemon,
+        m_cache_manager_handler, m_pool_meta_cache);
+    auto on_init = new LambdaContext(
+        [this, namespace_replayer, name, &mirroring_namespaces,
+         ctx=gather_ctx->new_sub()](int r) {
+          std::lock_guard locker{m_lock};
+          if (r < 0) {
+            derr << "failed to initialize namespace replayer for namespace "
+                 << name << ": " << cpp_strerror(r) << dendl;
+            delete namespace_replayer;
+            mirroring_namespaces.erase(name);
+          } else {
+            m_namespace_replayers[name] = namespace_replayer;
+            m_service_daemon->add_namespace(m_local_pool_id, name);
+          }
+          ctx->complete(r);
+        });
+    namespace_replayer->init(on_init);
+  }
+
+  gather_ctx->activate();
+
+  // wait for all shut downs / inits; the lock is released while waiting
+  m_lock.unlock();
+  cond.wait();
+  m_lock.lock();
+
+  if (m_leader) {
+    // as leader, the new replayers must also acquire per-namespace leadership
+    C_SaferCond acquire_cond;
+    auto acquire_gather_ctx = new C_Gather(cct, &acquire_cond);
+
+    for (auto &name : mirroring_namespaces) {
+      namespace_replayer_acquire_leader(name, acquire_gather_ctx->new_sub());
+    }
+    acquire_gather_ctx->activate();
+
+    m_lock.unlock();
+    acquire_cond.wait();
+    m_lock.lock();
+
+    std::vector<std::string> instance_ids;
+    m_leader_watcher->list_instances(&instance_ids);
+
+    for (auto &name : mirroring_namespaces) {
+      auto it = m_namespace_replayers.find(name);
+      if (it == m_namespace_replayers.end()) {
+        // acquire leader for this namespace replayer failed
+        continue;
+      }
+      it->second->handle_instances_added(instance_ids);
+    }
+  } else {
+    // follower: just propagate the current leader to the new replayers
+    std::string leader_instance_id;
+    if (m_leader_watcher->get_leader_instance_id(&leader_instance_id)) {
+      for (auto &name : mirroring_namespaces) {
+        m_namespace_replayers[name]->handle_update_leader(leader_instance_id);
+      }
+    }
+  }
+}
+
+// Collect the names of all namespaces in the local pool that have mirroring
+// enabled (or whose replayer is already running, to allow error recovery).
+// Returns 0 on success or a negative error code if the namespace listing
+// itself failed.  Called with m_lock held.
+template <typename I>
+int PoolReplayer<I>::list_mirroring_namespaces(
+    std::set<std::string> *namespaces) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  std::vector<std::string> names;
+
+  int r = librbd::api::Namespace<I>::list(m_local_io_ctx, &names);
+  if (r < 0) {
+    derr << "failed to list namespaces: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  for (auto &name : names) {
+    cls::rbd::MirrorMode mirror_mode = cls::rbd::MIRROR_MODE_DISABLED;
+    // query the mirror mode of the namespace being iterated: the previous
+    // code passed m_local_io_ctx directly, which always inspected the pool's
+    // default namespace instead of 'name'
+    librados::IoCtx ns_io_ctx;
+    ns_io_ctx.dup(m_local_io_ctx);
+    ns_io_ctx.set_namespace(name);
+
+    int r = librbd::cls_client::mirror_mode_get(&ns_io_ctx, &mirror_mode);
+    if (r < 0 && r != -ENOENT) {
+      derr << "failed to get namespace mirror mode: " << cpp_strerror(r)
+           << dendl;
+      // on transient errors keep namespaces that already have a replayer so
+      // they are not torn down spuriously
+      if (m_namespace_replayers.count(name) == 0) {
+        continue;
+      }
+    } else if (mirror_mode == cls::rbd::MIRROR_MODE_DISABLED) {
+      dout(10) << "mirroring is disabled for namespace " << name << dendl;
+      continue;
+    }
+
+    namespaces->insert(name);
+  }
+
+  return 0;
+}
+
+// Re-open the log files of both cluster connections (e.g. after SIGHUP /
+// log rotation).
+template <typename I>
+void PoolReplayer<I>::reopen_logs()
+{
+  std::lock_guard locker{m_lock};
+
+  auto reopen = [](RadosRef& rados) {
+    if (rados) {
+      reinterpret_cast<CephContext *>(rados->cct())->reopen_logs();
+    }
+  };
+  reopen(m_local_rados);
+  reopen(m_remote_rados);
+}
+
+// Ask the named namespace replayer to acquire per-namespace leadership.  On
+// failure the replayer is shut down and removed from the map so that the
+// next update_namespace_replayers() pass can re-create it and retry.
+// Called with m_lock held; on_finish fires from an async context.
+template <typename I>
+void PoolReplayer<I>::namespace_replayer_acquire_leader(const std::string &name,
+                                                        Context *on_finish) {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+
+  auto it = m_namespace_replayers.find(name);
+  ceph_assert(it != m_namespace_replayers.end());
+
+  // wrap the caller's context so a failure cleans up the namespace replayer
+  // before completing
+  on_finish = new LambdaContext(
+      [this, name, on_finish](int r) {
+        if (r < 0) {
+          derr << "failed to handle acquire leader for namespace: "
+               << name << ": " << cpp_strerror(r) << dendl;
+
+          // remove the namespace replayer -- update_namespace_replayers will
+          // retry to create it and acquire leader.
+
+          std::lock_guard locker{m_lock};
+
+          auto namespace_replayer = m_namespace_replayers[name];
+          m_namespace_replayers.erase(name);
+          auto on_shut_down = new LambdaContext(
+              [namespace_replayer, on_finish](int r) {
+                delete namespace_replayer;
+                on_finish->complete(r);
+              });
+          m_service_daemon->remove_namespace(m_local_pool_id, name);
+          namespace_replayer->shut_down(on_shut_down);
+          return;
+        }
+        on_finish->complete(0);
+      });
+
+  it->second->handle_acquire_leader(on_finish);
+}
+
+// Dump the pool replayer's current status (state, leader information,
+// throttlers and per-namespace replayer status) to the supplied formatter.
+// Invoked from the admin socket hook.
+template <typename I>
+void PoolReplayer<I>::print_status(Formatter *f) {
+  dout(20) << dendl;
+
+  // use ceph_assert for consistency with the rest of this file (plain
+  // assert compiles away in NDEBUG builds)
+  ceph_assert(f);
+
+  std::lock_guard l{m_lock};
+
+  f->open_object_section("pool_replayer_status");
+  f->dump_stream("peer") << m_peer;
+  if (m_local_io_ctx.is_valid()) {
+    f->dump_string("pool", m_local_io_ctx.get_pool_name());
+    f->dump_stream("instance_id") << m_local_io_ctx.get_instance_id();
+  }
+
+  std::string state("running");
+  if (m_manual_stop) {
+    state = "stopped (manual)";
+  } else if (m_stopping) {
+    state = "stopped";
+  } else if (!is_running()) {
+    state = "error";
+  }
+  f->dump_string("state", state);
+
+  if (m_leader_watcher) {
+    std::string leader_instance_id;
+    m_leader_watcher->get_leader_instance_id(&leader_instance_id);
+    f->dump_string("leader_instance_id", leader_instance_id);
+
+    bool leader = m_leader_watcher->is_leader();
+    f->dump_bool("leader", leader);
+    if (leader) {
+      std::vector<std::string> instance_ids;
+      m_leader_watcher->list_instances(&instance_ids);
+      f->open_array_section("instances");
+      // iterate by const reference to avoid copying each instance id string
+      for (const auto& instance_id : instance_ids) {
+        f->dump_string("instance_id", instance_id);
+      }
+      f->close_section(); // instances
+    }
+  }
+
+  if (m_local_rados) {
+    auto cct = reinterpret_cast<CephContext *>(m_local_rados->cct());
+    f->dump_string("local_cluster_admin_socket",
+                   cct->_conf.get_val<std::string>("admin_socket"));
+  }
+  if (m_remote_rados) {
+    auto cct = reinterpret_cast<CephContext *>(m_remote_rados->cct());
+    f->dump_string("remote_cluster_admin_socket",
+                   cct->_conf.get_val<std::string>("admin_socket"));
+  }
+
+  if (m_image_sync_throttler) {
+    f->open_object_section("sync_throttler");
+    m_image_sync_throttler->print_status(f);
+    f->close_section(); // sync_throttler
+  }
+
+  if (m_image_deletion_throttler) {
+    f->open_object_section("deletion_throttler");
+    m_image_deletion_throttler->print_status(f);
+    f->close_section(); // deletion_throttler
+  }
+
+  if (m_default_namespace_replayer) {
+    m_default_namespace_replayer->print_status(f);
+  }
+
+  f->open_array_section("namespaces");
+  for (auto &it : m_namespace_replayers) {
+    f->open_object_section("namespace");
+    f->dump_string("name", it.first);
+    it.second->print_status(f);
+    f->close_section(); // namespace
+  }
+  f->close_section(); // namespaces
+
+  f->close_section(); // pool_replayer_status
+}
+
+// Resume replaying after a manual stop; a no-op once shut down has begun.
+template <typename I>
+void PoolReplayer<I>::start() {
+  dout(20) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (m_stopping) {
+    // shutting down -- ignore the request
+    return;
+  }
+
+  m_manual_stop = false;
+
+  if (m_default_namespace_replayer != nullptr) {
+    m_default_namespace_replayer->start();
+  }
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->start();
+  }
+}
+
+// Stop replaying.  A non-manual stop is permanent: it flags the run() loop
+// to exit and wakes it.  A manual stop only pauses the replayers and can be
+// undone via start().
+template <typename I>
+void PoolReplayer<I>::stop(bool manual) {
+  dout(20) << "enter: manual=" << manual << dendl;
+
+  std::lock_guard locker{m_lock};
+  if (!manual) {
+    // permanent stop: wake up run() so it can exit
+    m_stopping = true;
+    m_cond.notify_all();
+    return;
+  }
+  if (m_stopping) {
+    // already shutting down -- nothing to pause
+    return;
+  }
+
+  m_manual_stop = true;
+
+  if (m_default_namespace_replayer != nullptr) {
+    m_default_namespace_replayer->stop();
+  }
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->stop();
+  }
+}
+
+// Restart the default and all per-namespace replayers; ignored once shut
+// down has begun.
+template <typename I>
+void PoolReplayer<I>::restart() {
+  dout(20) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (m_stopping) {
+    return;
+  }
+
+  if (m_default_namespace_replayer != nullptr) {
+    m_default_namespace_replayer->restart();
+  }
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->restart();
+  }
+}
+
+// Flush the default and all per-namespace replayers; skipped while stopped
+// (either permanently or manually).
+template <typename I>
+void PoolReplayer<I>::flush() {
+  dout(20) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (m_stopping || m_manual_stop) {
+    return;
+  }
+
+  if (m_default_namespace_replayer != nullptr) {
+    m_default_namespace_replayer->flush();
+  }
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->flush();
+  }
+}
+
+// Voluntarily give up pool leadership (admin socket command).
+template <typename I>
+void PoolReplayer<I>::release_leader() {
+  dout(20) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  if (!m_stopping && m_leader_watcher) {
+    m_leader_watcher->release_leader();
+  }
+}
+
+// LeaderWatcher callback: this instance just became the pool leader.  Marks
+// the service daemon attribute, then asks the default and every namespace
+// replayer to acquire leadership; on_finish completes after all respond.
+template <typename I>
+void PoolReplayer<I>::handle_post_acquire_leader(Context *on_finish) {
+  dout(20) << dendl;
+
+  with_namespace_replayers(
+      [this](Context *on_finish) {
+        dout(10) << "handle_post_acquire_leader" << dendl;
+
+        ceph_assert(ceph_mutex_is_locked(m_lock));
+
+        m_service_daemon->add_or_update_attribute(m_local_pool_id,
+                                                  SERVICE_DAEMON_LEADER_KEY,
+                                                  true);
+        // m_leader is only flipped once every replayer acquired leadership
+        auto ctx = new LambdaContext(
+            [this, on_finish](int r) {
+              if (r == 0) {
+                std::lock_guard locker{m_lock};
+                m_leader = true;
+              }
+              on_finish->complete(r);
+            });
+
+        auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+        auto gather_ctx = new C_Gather(cct, ctx);
+
+        m_default_namespace_replayer->handle_acquire_leader(
+            gather_ctx->new_sub());
+
+        for (auto &it : m_namespace_replayers) {
+          namespace_replayer_acquire_leader(it.first, gather_ctx->new_sub());
+        }
+
+        gather_ctx->activate();
+      }, on_finish);
+}
+
+// LeaderWatcher callback: this instance is about to lose pool leadership.
+// Clears the leader flag/attribute and asks the default and every namespace
+// replayer to release leadership; on_finish completes after all respond.
+template <typename I>
+void PoolReplayer<I>::handle_pre_release_leader(Context *on_finish) {
+  dout(20) << dendl;
+
+  with_namespace_replayers(
+      [this](Context *on_finish) {
+        dout(10) << "handle_pre_release_leader" << dendl;
+
+        ceph_assert(ceph_mutex_is_locked(m_lock));
+
+        m_leader = false;
+        m_service_daemon->remove_attribute(m_local_pool_id,
+                                           SERVICE_DAEMON_LEADER_KEY);
+
+        auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+        auto gather_ctx = new C_Gather(cct, on_finish);
+
+        m_default_namespace_replayer->handle_release_leader(
+            gather_ctx->new_sub());
+
+        for (auto &it : m_namespace_replayers) {
+          it.second->handle_release_leader(gather_ctx->new_sub());
+        }
+
+        gather_ctx->activate();
+      }, on_finish);
+}
+
+// LeaderWatcher callback: the pool leader changed; propagate the new leader
+// instance id to the default and all namespace replayers.
+template <typename I>
+void PoolReplayer<I>::handle_update_leader(
+    const std::string &leader_instance_id) {
+  dout(10) << "leader_instance_id=" << leader_instance_id << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  m_default_namespace_replayer->handle_update_leader(leader_instance_id);
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->handle_update_leader(leader_instance_id);
+  }
+}
+
+// LeaderWatcher callback: new rbd-mirror instances joined; forwarded to the
+// replayers only while this instance is still the leader.
+template <typename I>
+void PoolReplayer<I>::handle_instances_added(
+    const std::vector<std::string> &instance_ids) {
+  dout(5) << "instance_ids=" << instance_ids << dendl;
+
+  std::lock_guard locker{m_lock};
+  if (!m_leader_watcher->is_leader()) {
+    return;
+  }
+
+  m_default_namespace_replayer->handle_instances_added(instance_ids);
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->handle_instances_added(instance_ids);
+  }
+}
+
+// LeaderWatcher callback: rbd-mirror instances departed; forwarded to the
+// replayers only while this instance is still the leader.
+template <typename I>
+void PoolReplayer<I>::handle_instances_removed(
+    const std::vector<std::string> &instance_ids) {
+  dout(5) << "instance_ids=" << instance_ids << dendl;
+
+  std::lock_guard locker{m_lock};
+  if (!m_leader_watcher->is_leader()) {
+    return;
+  }
+
+  m_default_namespace_replayer->handle_instances_removed(instance_ids);
+  for (auto &[name, namespace_replayer] : m_namespace_replayers) {
+    namespace_replayer->handle_instances_removed(instance_ids);
+  }
+}
+
+// RemotePoolPoller callback: the remote pool's metadata changed.  Before the
+// default namespace replayer exists the new value is simply recorded; after
+// initialization the change cannot be applied, so the replayer shuts down.
+template <typename I>
+void PoolReplayer<I>::handle_remote_pool_meta_updated(
+    const RemotePoolMeta& remote_pool_meta) {
+  dout(5) << "remote_pool_meta=" << remote_pool_meta << dendl;
+
+  if (m_default_namespace_replayer != nullptr) {
+    derr << "remote pool metadata updated unexpectedly" << dendl;
+    std::unique_lock locker{m_lock};
+    m_stopping = true;
+    m_cond.notify_all();
+    return;
+  }
+
+  m_remote_pool_meta = remote_pool_meta;
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::PoolReplayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/PoolReplayer.h b/src/tools/rbd_mirror/PoolReplayer.h
new file mode 100644
index 000000000..e0fd75377
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolReplayer.h
@@ -0,0 +1,288 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_REPLAYER_H
+#define CEPH_RBD_MIRROR_POOL_REPLAYER_H
+
+#include "common/Cond.h"
+#include "common/ceph_mutex.h"
+#include "include/rados/librados.hpp"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+
+#include "tools/rbd_mirror/LeaderWatcher.h"
+#include "tools/rbd_mirror/NamespaceReplayer.h"
+#include "tools/rbd_mirror/Throttler.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/leader_watcher/Types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+class AdminSocketHook;
+
+namespace journal { struct CacheManagerHandler; }
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class RemotePoolPoller;
+namespace remote_pool_poller { struct Listener; }
+
+struct PoolMetaCache;
+template <typename> class ServiceDaemon;
+template <typename> struct Threads;
+
+
+/**
+ * Controls mirroring for a single remote cluster.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolReplayer {
+public:
+  PoolReplayer(Threads<ImageCtxT> *threads,
+               ServiceDaemon<ImageCtxT> *service_daemon,
+	       journal::CacheManagerHandler *cache_manager_handler,
+               PoolMetaCache* pool_meta_cache,
+	       int64_t local_pool_id, const PeerSpec &peer,
+	       const std::vector<const char*> &args);
+  ~PoolReplayer();
+  PoolReplayer(const PoolReplayer&) = delete;
+  PoolReplayer& operator=(const PoolReplayer&) = delete;
+
+  bool is_blocklisted() const;
+  bool is_leader() const;
+  bool is_running() const;
+
+  void init(const std::string& site_name);
+  void shut_down();
+
+  // pool replayer thread body (executed by m_pool_replayer_thread)
+  void run();
+
+  // admin socket hook entry points
+  void print_status(Formatter *f);
+  void start();
+  void stop(bool manual);
+  void restart();
+  void flush();
+  void release_leader();
+  void reopen_logs();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   *  INIT
+   *    |
+   *    v
+   * <follower> <---------------------\
+   *    .                             |
+   *    . (leader acquired)           |
+   *    v                             |
+   * NOTIFY_NAMESPACE_WATCHERS    NOTIFY_NAMESPACE_WATCHERS
+   *    |                             ^
+   *    v                             .
+   * <leader>                         .
+   *    .                             .
+   *    . (leader lost / shut down)   .
+   *    . . . . . . . . . . . . . . . .
+   *
+   * @endverbatim
+   */
+
+  struct RemotePoolPollerListener;
+
+  int init_rados(const std::string &cluster_name,
+                 const std::string &client_name,
+                 const std::string &mon_host,
+                 const std::string &key,
+                 const std::string &description, RadosRef *rados_ref,
+                 bool strip_cluster_overrides);
+
+  void update_namespace_replayers();
+  int list_mirroring_namespaces(std::set<std::string> *namespaces);
+
+  void namespace_replayer_acquire_leader(const std::string &name,
+                                         Context *on_finish);
+
+  void handle_post_acquire_leader(Context *on_finish);
+  void handle_pre_release_leader(Context *on_finish);
+
+  void handle_update_leader(const std::string &leader_instance_id);
+
+  void handle_instances_added(const std::vector<std::string> &instance_ids);
+  void handle_instances_removed(const std::vector<std::string> &instance_ids);
+
+  // sync version, executed in the caller thread
+  // Serializes access to m_namespace_replayers: if another caller currently
+  // holds the logical "namespace replayers" lock, blocks until it is
+  // released, then runs the callback with m_lock held.
+  template <typename L>
+  void with_namespace_replayers(L &&callback) {
+    std::lock_guard locker{m_lock};
+
+    if (m_namespace_replayers_locked) {
+      ceph_assert(m_on_namespace_replayers_unlocked == nullptr);
+      C_SaferCond cond;
+      m_on_namespace_replayers_unlocked = &cond;
+      m_lock.unlock();
+      cond.wait();
+      m_lock.lock();
+    } else {
+      m_namespace_replayers_locked = true;
+    }
+
+    ceph_assert(m_namespace_replayers_locked);
+    callback();  // may temporary release the lock
+    ceph_assert(m_namespace_replayers_locked);
+
+    if (m_on_namespace_replayers_unlocked == nullptr) {
+      m_namespace_replayers_locked = false;
+      return;
+    }
+
+    // hand the logical lock directly over to the waiter
+    m_threads->work_queue->queue(m_on_namespace_replayers_unlocked);
+    m_on_namespace_replayers_unlocked = nullptr;
+  }
+
+  // async version
+  // Same serialization as above, but the callback signals completion via
+  // on_finish instead of returning; the logical lock is released (or handed
+  // over to a waiter) when on_finish fires.
+  template <typename L>
+  void with_namespace_replayers(L &&callback, Context *on_finish) {
+    std::lock_guard locker{m_lock};
+
+    on_finish = librbd::util::create_async_context_callback(
+        m_threads->work_queue, new LambdaContext(
+            [this, on_finish](int r) {
+              {
+                std::lock_guard locker{m_lock};
+                ceph_assert(m_namespace_replayers_locked);
+
+                m_namespace_replayers_locked = false;
+
+                if (m_on_namespace_replayers_unlocked != nullptr) {
+                  m_namespace_replayers_locked = true;
+                  m_threads->work_queue->queue(m_on_namespace_replayers_unlocked);
+                  m_on_namespace_replayers_unlocked = nullptr;
+                }
+              }
+              on_finish->complete(r);
+            }));
+
+    auto on_lock = new LambdaContext(
+        [this, callback, on_finish](int) {
+          std::lock_guard locker{m_lock};
+          ceph_assert(m_namespace_replayers_locked);
+
+          callback(on_finish);
+        });
+
+    if (m_namespace_replayers_locked) {
+      ceph_assert(m_on_namespace_replayers_unlocked == nullptr);
+      m_on_namespace_replayers_unlocked = on_lock;
+      return;
+    }
+
+    m_namespace_replayers_locked = true;
+    m_threads->work_queue->queue(on_lock);
+  }
+
+  void handle_remote_pool_meta_updated(const RemotePoolMeta& remote_pool_meta);
+
+  Threads<ImageCtxT> *m_threads;
+  ServiceDaemon<ImageCtxT> *m_service_daemon;
+  journal::CacheManagerHandler *m_cache_manager_handler;
+  PoolMetaCache* m_pool_meta_cache;
+  int64_t m_local_pool_id = -1;
+  PeerSpec m_peer;
+  std::vector<const char*> m_args;
+
+  mutable ceph::mutex m_lock;
+  ceph::condition_variable m_cond;
+  std::string m_site_name;
+  bool m_stopping = false;
+  bool m_manual_stop = false;
+  bool m_blocklisted = false;
+
+  RadosRef m_local_rados;
+  RadosRef m_remote_rados;
+
+  librados::IoCtx m_local_io_ctx;
+  librados::IoCtx m_remote_io_ctx;
+
+  std::string m_local_mirror_uuid;
+
+  RemotePoolMeta m_remote_pool_meta;
+  std::unique_ptr<remote_pool_poller::Listener> m_remote_pool_poller_listener;
+  std::unique_ptr<RemotePoolPoller<ImageCtxT>> m_remote_pool_poller;
+
+  // replayer for the pool's default namespace plus one per extra namespace
+  std::unique_ptr<NamespaceReplayer<ImageCtxT>> m_default_namespace_replayer;
+  std::map<std::string, NamespaceReplayer<ImageCtxT> *> m_namespace_replayers;
+
+  std::string m_asok_hook_name;
+  AdminSocketHook *m_asok_hook = nullptr;
+
+  service_daemon::CalloutId m_callout_id = service_daemon::CALLOUT_ID_NONE;
+
+  // logical lock state guarding m_namespace_replayers (see
+  // with_namespace_replayers above)
+  bool m_leader = false;
+  bool m_namespace_replayers_locked = false;
+  Context *m_on_namespace_replayers_unlocked = nullptr;
+
+  // dedicated thread that executes run()
+  class PoolReplayerThread : public Thread {
+    PoolReplayer *m_pool_replayer;
+  public:
+    PoolReplayerThread(PoolReplayer *pool_replayer)
+      : m_pool_replayer(pool_replayer) {
+    }
+    void *entry() override {
+      m_pool_replayer->run();
+      return 0;
+    }
+  } m_pool_replayer_thread;
+
+  // forwards LeaderWatcher events to the owning pool replayer
+  class LeaderListener : public leader_watcher::Listener {
+  public:
+    LeaderListener(PoolReplayer *pool_replayer)
+      : m_pool_replayer(pool_replayer) {
+    }
+
+  protected:
+    void post_acquire_handler(Context *on_finish) override {
+      m_pool_replayer->handle_post_acquire_leader(on_finish);
+    }
+
+    void pre_release_handler(Context *on_finish) override {
+      m_pool_replayer->handle_pre_release_leader(on_finish);
+    }
+
+    void update_leader_handler(
+        const std::string &leader_instance_id) override {
+      m_pool_replayer->handle_update_leader(leader_instance_id);
+    }
+
+    void handle_instances_added(const InstanceIds& instance_ids) override {
+      m_pool_replayer->handle_instances_added(instance_ids);
+    }
+
+    void handle_instances_removed(const InstanceIds& instance_ids) override {
+      m_pool_replayer->handle_instances_removed(instance_ids);
+    }
+
+  private:
+    PoolReplayer *m_pool_replayer;
+  } m_leader_listener;
+
+  std::unique_ptr<LeaderWatcher<ImageCtxT>> m_leader_watcher;
+  std::unique_ptr<Throttler<ImageCtxT>> m_image_sync_throttler;
+  std::unique_ptr<Throttler<ImageCtxT>> m_image_deletion_throttler;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::PoolReplayer<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_POOL_REPLAYER_H
diff --git a/src/tools/rbd_mirror/PoolWatcher.cc b/src/tools/rbd_mirror/PoolWatcher.cc
new file mode 100644
index 000000000..bec931cf3
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolWatcher.cc
@@ -0,0 +1,473 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/PoolWatcher.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Mirror.h"
+#include "librbd/asio/ContextWQ.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PoolWatcher: " << this << " " \
+ << __func__ << ": "
+
+using std::list;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace rbd {
+namespace mirror {
+
+// Watches the pool's mirroring directory object and forwards mode/image
+// update notifications (and watch re-registration results) to the owning
+// PoolWatcher.
+template <typename I>
+class PoolWatcher<I>::MirroringWatcher : public librbd::MirroringWatcher<I> {
+public:
+  using ContextWQ = typename std::decay<
+    typename std::remove_pointer<
+      decltype(Threads<I>::work_queue)>::type>::type;
+
+  MirroringWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue,
+                   PoolWatcher *pool_watcher)
+    : librbd::MirroringWatcher<I>(io_ctx, work_queue),
+      m_pool_watcher(pool_watcher) {
+  }
+
+  void handle_rewatch_complete(int r) override {
+    m_pool_watcher->handle_rewatch_complete(r);
+  }
+
+  void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) override {
+    // invalidate all image state and refresh the pool contents
+    m_pool_watcher->schedule_refresh_images(5);
+  }
+
+  void handle_image_updated(cls::rbd::MirrorImageState state,
+                            const std::string &image_id,
+                            const std::string &global_image_id) override {
+    // any state other than ENABLED is treated as mirroring disabled
+    bool enabled = (state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED);
+    m_pool_watcher->handle_image_updated(image_id, global_image_id,
+                                         enabled);
+  }
+
+private:
+  PoolWatcher *m_pool_watcher;
+};
+
+// Construct a watcher for the given pool.  The MirroringWatcher helper is
+// allocated eagerly but its watch is not registered until init().
+template <typename I>
+PoolWatcher<I>::PoolWatcher(Threads<I> *threads,
+                            librados::IoCtx &io_ctx,
+                            const std::string& mirror_uuid,
+                            pool_watcher::Listener &listener)
+  : m_threads(threads),
+    m_io_ctx(io_ctx),
+    m_mirror_uuid(mirror_uuid),
+    m_listener(listener),
+    m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+	"rbd::mirror::PoolWatcher", this))) {
+  m_mirroring_watcher = new MirroringWatcher(m_io_ctx,
+                                             m_threads->work_queue, this);
+}
+
+// Frees the MirroringWatcher allocated in the constructor.
+// NOTE(review): assumes shut_down() has already completed -- confirm callers
+template <typename I>
+PoolWatcher<I>::~PoolWatcher() {
+  delete m_mirroring_watcher;
+}
+
+// Report whether the client was detected as blocklisted.
+template <typename I>
+bool PoolWatcher<I>::is_blocklisted() const {
+  // snapshot the flag under the lock
+  std::lock_guard locker{m_lock};
+  bool blocklisted = m_blocklisted;
+  return blocklisted;
+}
+
+// Begin watching the pool: record the optional completion context and kick
+// off watch registration followed by the initial image-list refresh.
+template <typename I>
+void PoolWatcher<I>::init(Context *on_finish) {
+  dout(5) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    m_on_init_finish = on_finish;
+
+    ceph_assert(!m_refresh_in_progress);
+    m_refresh_in_progress = true;
+  }
+
+  // start async updates for mirror image directory
+  register_watcher();
+}
+
+// Stop watching: cancel any pending refresh timer, unregister the watch and
+// complete on_finish once all in-flight async operations have drained.
+template <typename I>
+void PoolWatcher<I>::shut_down(Context *on_finish) {
+  dout(5) << dendl;
+
+  {
+    std::scoped_lock locker{m_threads->timer_lock, m_lock};
+
+    ceph_assert(!m_shutting_down);
+    m_shutting_down = true;
+    if (m_timer_ctx != nullptr) {
+      m_threads->timer->cancel_event(m_timer_ctx);
+      m_timer_ctx = nullptr;
+    }
+  }
+
+  // in-progress unregister tracked as async op
+  unregister_watcher();
+
+  m_async_op_tracker.wait_for_ops(on_finish);
+}
+
+// (Re-)register the mirroring directory watch if it is not registered, then
+// refresh the image list.  Expects a refresh to already be flagged as in
+// progress.
+template <typename I>
+void PoolWatcher<I>::register_watcher() {
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_image_ids_invalid);
+    ceph_assert(m_refresh_in_progress);
+  }
+
+  // if the watch registration is in-flight, let the watcher
+  // handle the transition -- only (re-)register if it's not registered
+  if (!m_mirroring_watcher->is_unregistered()) {
+    refresh_images();
+    return;
+  }
+
+  // first time registering or the watch failed
+  dout(5) << dendl;
+  m_async_op_tracker.start_op();
+
+  Context *ctx = create_context_callback<
+    PoolWatcher, &PoolWatcher<I>::handle_register_watcher>(this);
+  m_mirroring_watcher->register_watch(ctx);
+}
+
+// Completion of the watch registration: on success refresh the image list;
+// on blocklisting complete init with the error; on ENOENT (no mirroring
+// directory yet) or other errors schedule a retry.
+template <typename I>
+void PoolWatcher<I>::handle_register_watcher(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_image_ids_invalid);
+    ceph_assert(m_refresh_in_progress);
+    if (r < 0) {
+      m_refresh_in_progress = false;
+    }
+  }
+
+  Context *on_init_finish = nullptr;
+  if (r >= 0) {
+    refresh_images();
+  } else if (r == -EBLOCKLISTED) {
+    dout(0) << "detected client is blocklisted" << dendl;
+
+    std::lock_guard locker{m_lock};
+    m_blocklisted = true;
+    std::swap(on_init_finish, m_on_init_finish);
+  } else if (r == -ENOENT) {
+    dout(5) << "mirroring directory does not exist" << dendl;
+    {
+      std::lock_guard locker{m_lock};
+      std::swap(on_init_finish, m_on_init_finish);
+    }
+
+    // the directory may be created later -- poll for it
+    schedule_refresh_images(30);
+  } else {
+    derr << "unexpected error registering mirroring directory watch: "
+         << cpp_strerror(r) << dendl;
+    schedule_refresh_images(10);
+  }
+
+  m_async_op_tracker.finish_op();
+  if (on_init_finish != nullptr) {
+    on_init_finish->complete(r);
+  }
+}
+
+// Asynchronously unregister the mirroring directory watch; failures are
+// logged but otherwise ignored (best-effort during shut down).
+template <typename I>
+void PoolWatcher<I>::unregister_watcher() {
+  dout(5) << dendl;
+
+  m_async_op_tracker.start_op();
+  Context *ctx = new LambdaContext([this](int r) {
+      dout(5) << "unregister_watcher: r=" << r << dendl;
+      if (r < 0) {
+        derr << "error unregistering watcher for "
+             << m_mirroring_watcher->get_oid() << " object: " << cpp_strerror(r)
+             << dendl;
+      }
+      m_async_op_tracker.finish_op();
+    });
+
+  m_mirroring_watcher->unregister_watch(ctx);
+}
+
+// Start a full refresh of the mirrored image list into m_refresh_image_ids;
+// completion is delivered to handle_refresh_images().
+template <typename I>
+void PoolWatcher<I>::refresh_images() {
+  dout(5) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_image_ids_invalid);
+    ceph_assert(m_refresh_in_progress);
+
+    // clear all pending notification events since we need to perform
+    // a full image list refresh
+    m_pending_added_image_ids.clear();
+    m_pending_removed_image_ids.clear();
+  }
+
+  m_async_op_tracker.start_op();
+  m_refresh_image_ids.clear();
+  Context *ctx = create_context_callback<
+    PoolWatcher, &PoolWatcher<I>::handle_refresh_images>(this);
+  auto req = pool_watcher::RefreshImagesRequest<I>::create(m_io_ctx,
+                                                           &m_refresh_image_ids,
+                                                           ctx);
+  req->send();
+}
+
+// Completion of the image-list refresh: publish the new pending set to the
+// listener, or handle deferral/blocklisting/retry as appropriate.
+template <typename I>
+void PoolWatcher<I>::handle_refresh_images(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  bool deferred_refresh = false;
+  bool retry_refresh = false;
+  Context *on_init_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_image_ids_invalid);
+    ceph_assert(m_refresh_in_progress);
+    m_refresh_in_progress = false;
+
+    if (r == -ENOENT) {
+      // missing directory simply means no mirrored images
+      dout(5) << "mirroring directory not found" << dendl;
+      r = 0;
+      m_refresh_image_ids.clear();
+    }
+
+    if (m_deferred_refresh) {
+      // need to refresh -- skip the notification
+      deferred_refresh = true;
+    } else if (r >= 0) {
+      m_pending_image_ids = std::move(m_refresh_image_ids);
+      m_image_ids_invalid = false;
+      std::swap(on_init_finish, m_on_init_finish);
+
+      schedule_listener();
+    } else if (r == -EBLOCKLISTED) {
+      dout(0) << "detected client is blocklisted during image refresh" << dendl;
+
+      m_blocklisted = true;
+      std::swap(on_init_finish, m_on_init_finish);
+    } else {
+      retry_refresh = true;
+    }
+  }
+
+  if (deferred_refresh) {
+    dout(5) << "scheduling deferred refresh" << dendl;
+    schedule_refresh_images(0);
+  } else if (retry_refresh) {
+    derr << "failed to retrieve mirroring directory: " << cpp_strerror(r)
+         << dendl;
+    schedule_refresh_images(10);
+  }
+
+  m_async_op_tracker.finish_op();
+  if (on_init_finish != nullptr) {
+    on_init_finish->complete(r);
+  }
+}
+
+// Arm a timer to re-run the full image refresh after 'interval' seconds.
+// If a refresh is already in flight, mark it for a follow-up refresh instead;
+// no-op while shutting down or if a timer is already pending.
+template <typename I>
+void PoolWatcher<I>::schedule_refresh_images(double interval) {
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  if (m_shutting_down || m_refresh_in_progress || m_timer_ctx != nullptr) {
+    if (m_refresh_in_progress && !m_deferred_refresh) {
+      dout(5) << "deferring refresh until in-flight refresh completes" << dendl;
+      m_deferred_refresh = true;
+    }
+    return;
+  }
+
+  // image list is stale until the refresh completes
+  m_image_ids_invalid = true;
+  m_timer_ctx = m_threads->timer->add_event_after(
+    interval,
+    new LambdaContext([this](int r) {
+	process_refresh_images();
+      }));
+}
+
+// Watch re-registration completed (possibly after a connection loss): flag
+// blocklisting, otherwise schedule an image refresh since notifications may
+// have been missed while the watch was down.
+template <typename I>
+void PoolWatcher<I>::handle_rewatch_complete(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  if (r == -EBLOCKLISTED) {
+    dout(0) << "detected client is blocklisted" << dendl;
+
+    std::lock_guard locker{m_lock};
+    m_blocklisted = true;
+    return;
+  }
+
+  if (r == -ENOENT) {
+    dout(5) << "mirroring directory deleted" << dendl;
+  } else if (r < 0) {
+    derr << "unexpected error re-registering mirroring directory watch: "
+         << cpp_strerror(r) << dendl;
+  }
+
+  schedule_refresh_images(5);
+}
+
+// Mirroring state of a single image changed: queue it as pending-added or
+// pending-removed and notify the listener.
+template <typename I>
+void PoolWatcher<I>::handle_image_updated(const std::string &id,
+                                          const std::string &global_image_id,
+                                          bool enabled) {
+  dout(10) << "image_id=" << id << ", "
+           << "global_image_id=" << global_image_id << ", "
+           << "enabled=" << enabled << dendl;
+
+  std::lock_guard locker{m_lock};
+  ImageId image_id(global_image_id, id);
+
+  // drop any stale pending state for this image before re-queueing it
+  m_pending_added_image_ids.erase(image_id);
+  m_pending_removed_image_ids.erase(image_id);
+
+  auto& pending = enabled ? m_pending_added_image_ids
+                          : m_pending_removed_image_ids;
+  pending.insert(image_id);
+  schedule_listener();
+}
+
+// Timer callback: mark a refresh as in progress and re-enter the register /
+// refresh state machine from the work queue (outside the timer lock).
+template <typename I>
+void PoolWatcher<I>::process_refresh_images() {
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(m_timer_ctx != nullptr);
+  m_timer_ctx = nullptr;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(!m_refresh_in_progress);
+    m_refresh_in_progress = true;
+    m_deferred_refresh = false;
+  }
+
+  // execute outside of the timer's lock
+  m_async_op_tracker.start_op();
+  Context *ctx = new LambdaContext([this](int r) {
+      register_watcher();
+      m_async_op_tracker.finish_op();
+    });
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Queue an async listener notification unless one is already in flight (or
+// we are shutting down / refreshing); m_pending_updates records that there
+// is work for the next notification pass.  Called with m_lock held.
+template <typename I>
+void PoolWatcher<I>::schedule_listener() {
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  m_pending_updates = true;
+  if (m_shutting_down || m_image_ids_invalid || m_notify_listener_in_progress) {
+    return;
+  }
+
+  dout(20) << dendl;
+
+  m_async_op_tracker.start_op();
+  Context *ctx = new LambdaContext([this](int r) {
+      notify_listener();
+      m_async_op_tracker.finish_op();
+    });
+
+  m_notify_listener_in_progress = true;
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Merge the pending add/remove notifications into the authoritative image
+// set, compute the delta against the previous set and deliver it to the
+// listener; reschedules itself if more updates arrived while notifying.
+template <typename I>
+void PoolWatcher<I>::notify_listener() {
+  dout(10) << dendl;
+
+  std::string mirror_uuid;
+  ImageIds added_image_ids;
+  ImageIds removed_image_ids;
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_notify_listener_in_progress);
+  }
+
+  // NOTE(review): removed_image_ids is always empty at this point, so this
+  // block appears to be dead code retained from an older flow -- confirm
+  if (!removed_image_ids.empty()) {
+    m_listener.handle_update(mirror_uuid, {}, std::move(removed_image_ids));
+    removed_image_ids.clear();
+  }
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_notify_listener_in_progress);
+
+    // if the watch failed while we didn't own the lock, we are going
+    // to need to perform a full refresh
+    if (m_image_ids_invalid) {
+      m_notify_listener_in_progress = false;
+      return;
+    }
+
+    // merge add/remove notifications into pending set (a given image
+    // can only be in one set or another)
+    for (auto &image_id : m_pending_removed_image_ids) {
+      dout(20) << "image_id=" << image_id << dendl;
+      m_pending_image_ids.erase(image_id);
+    }
+
+    // erase-then-insert so an updated entry replaces any equivalent one
+    for (auto &image_id : m_pending_added_image_ids) {
+      dout(20) << "image_id=" << image_id << dendl;
+      m_pending_image_ids.erase(image_id);
+      m_pending_image_ids.insert(image_id);
+    }
+    m_pending_added_image_ids.clear();
+
+    // compute added/removed images
+    for (auto &image_id : m_image_ids) {
+      auto it = m_pending_image_ids.find(image_id);
+      if (it == m_pending_image_ids.end() || it->id != image_id.id) {
+        removed_image_ids.insert(image_id);
+      }
+    }
+    for (auto &image_id : m_pending_image_ids) {
+      auto it = m_image_ids.find(image_id);
+      if (it == m_image_ids.end() || it->id != image_id.id) {
+        added_image_ids.insert(image_id);
+      }
+    }
+
+    m_pending_updates = false;
+    m_image_ids = m_pending_image_ids;
+  }
+
+  m_listener.handle_update(m_mirror_uuid, std::move(added_image_ids),
+                           std::move(removed_image_ids));
+
+  {
+    std::lock_guard locker{m_lock};
+    m_notify_listener_in_progress = false;
+    if (m_pending_updates) {
+      // more notifications arrived while we were notifying -- go again
+      schedule_listener();
+    }
+  }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::PoolWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/PoolWatcher.h b/src/tools/rbd_mirror/PoolWatcher.h
new file mode 100644
index 000000000..2905de15f
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolWatcher.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_H
+#define CEPH_RBD_MIRROR_POOL_WATCHER_H
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include <boost/functional/hash.hpp>
+#include <boost/optional.hpp>
+#include "include/ceph_assert.h"
+#include "tools/rbd_mirror/pool_watcher/Types.h"
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+/**
+ * Keeps track of images that have mirroring enabled within all
+ * pools.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolWatcher {
+public:
+ static PoolWatcher* create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &io_ctx,
+ const std::string& mirror_uuid,
+ pool_watcher::Listener &listener) {
+ return new PoolWatcher(threads, io_ctx, mirror_uuid, listener);
+ }
+
+ PoolWatcher(Threads<ImageCtxT> *threads,
+ librados::IoCtx &io_ctx,
+ const std::string& mirror_uuid,
+ pool_watcher::Listener &listener);
+ ~PoolWatcher();
+ PoolWatcher(const PoolWatcher&) = delete;
+ PoolWatcher& operator=(const PoolWatcher&) = delete;
+
+ bool is_blocklisted() const;
+
+ void init(Context *on_finish = nullptr);
+ void shut_down(Context *on_finish);
+
+ inline uint64_t get_image_count() const {
+ std::lock_guard locker{m_lock};
+ return m_image_ids.size();
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT
+ * |
+ * v
+ * REGISTER_WATCHER
+ * |
+ * |/--------------------------------\
+ * | |
+ * v |
+ * REFRESH_IMAGES |
+ * | |
+ * |/----------------------------\ |
+ * | | |
+ * v | |
+ * NOTIFY_LISTENER | |
+ * | | |
+ * v | |
+ * IDLE ---\ | |
+ * | | | |
+ * | |\---> IMAGE_UPDATED | |
+ * | | | | |
+ * | | v | |
+ * | | GET_IMAGE_NAME --/ |
+ * | | |
+ * | \----> WATCH_ERROR ---------/
+ * v
+ * SHUT_DOWN
+ * |
+ * v
+ * UNREGISTER_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ class MirroringWatcher;
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx m_io_ctx;
+ std::string m_mirror_uuid;
+ pool_watcher::Listener &m_listener;
+
+ ImageIds m_refresh_image_ids;
+ bufferlist m_out_bl;
+
+ mutable ceph::mutex m_lock;
+
+ Context *m_on_init_finish = nullptr;
+
+ ImageIds m_image_ids;
+
+ bool m_pending_updates = false;
+ bool m_notify_listener_in_progress = false;
+ ImageIds m_pending_image_ids;
+ ImageIds m_pending_added_image_ids;
+ ImageIds m_pending_removed_image_ids;
+
+ MirroringWatcher *m_mirroring_watcher;
+
+ Context *m_timer_ctx = nullptr;
+
+ AsyncOpTracker m_async_op_tracker;
+ bool m_blocklisted = false;
+ bool m_shutting_down = false;
+ bool m_image_ids_invalid = true;
+ bool m_refresh_in_progress = false;
+ bool m_deferred_refresh = false;
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+ void unregister_watcher();
+
+ void refresh_images();
+ void handle_refresh_images(int r);
+
+ void schedule_refresh_images(double interval);
+ void process_refresh_images();
+
+ void handle_rewatch_complete(int r);
+ void handle_image_updated(const std::string &image_id,
+ const std::string &global_image_id,
+ bool enabled);
+
+ void schedule_listener();
+ void notify_listener();
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::PoolWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_POOL_WATCHER_H
diff --git a/src/tools/rbd_mirror/ProgressContext.h b/src/tools/rbd_mirror/ProgressContext.h
new file mode 100644
index 000000000..e4430ee6a
--- /dev/null
+++ b/src/tools/rbd_mirror/ProgressContext.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_PROGRESS_CONTEXT_H
+#define RBD_MIRROR_PROGRESS_CONTEXT_H
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext
+{
+public:
+ virtual ~ProgressContext() {}
+ virtual void update_progress(const std::string &description,
+ bool flush = true) = 0;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_PROGRESS_CONTEXT_H
diff --git a/src/tools/rbd_mirror/RemotePoolPoller.cc b/src/tools/rbd_mirror/RemotePoolPoller.cc
new file mode 100644
index 000000000..8bfb35d4a
--- /dev/null
+++ b/src/tools/rbd_mirror/RemotePoolPoller.cc
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "RemotePoolPoller.h"
+#include "include/ceph_assert.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::RemotePoolPoller: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+static const double POLL_INTERVAL_SECONDS = 30;
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+RemotePoolPoller<I>::~RemotePoolPoller() {
+ ceph_assert(m_timer_task == nullptr);
+}
+
+template <typename I>
+void RemotePoolPoller<I>::init(Context* on_finish) {
+ dout(10) << dendl;
+
+ ceph_assert(m_state == STATE_INITIALIZING);
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+
+ get_mirror_uuid();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::shut_down(Context* on_finish) {
+ dout(10) << dendl;
+
+ std::unique_lock locker(m_threads->timer_lock);
+ ceph_assert(m_state == STATE_POLLING);
+ m_state = STATE_SHUTTING_DOWN;
+
+ if (m_timer_task == nullptr) {
+ // currently executing a poll
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ return;
+ }
+
+ m_threads->timer->cancel_event(m_timer_task);
+ m_timer_task = nullptr;
+ m_threads->work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+void RemotePoolPoller<I>::get_mirror_uuid() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_uuid_get_start(&op);
+
+ auto aio_comp = create_rados_callback<
+ RemotePoolPoller<I>, &RemotePoolPoller<I>::handle_get_mirror_uuid>(this);
+ m_out_bl.clear();
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::handle_get_mirror_uuid(int r) {
+ dout(10) << "r=" << r << dendl;
+ std::string remote_mirror_uuid;
+ if (r >= 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_uuid_get_finish(&it, &remote_mirror_uuid);
+ if (r >= 0 && remote_mirror_uuid.empty()) {
+ r = -ENOENT;
+ }
+ }
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(5) << "remote mirror uuid missing" << dendl;
+ } else {
+ derr << "failed to retrieve remote mirror uuid: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ m_remote_pool_meta.mirror_uuid = "";
+ }
+
+ // if we have the mirror uuid, we will poll until shut down
+ if (m_state == STATE_INITIALIZING) {
+ if (r < 0) {
+ schedule_task(r);
+ return;
+ }
+
+ m_state = STATE_POLLING;
+ }
+
+ dout(10) << "remote_mirror_uuid=" << remote_mirror_uuid << dendl;
+ if (m_remote_pool_meta.mirror_uuid != remote_mirror_uuid) {
+ m_remote_pool_meta.mirror_uuid = remote_mirror_uuid;
+ m_updated = true;
+ }
+
+ mirror_peer_ping();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::mirror_peer_ping() {
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_peer_ping(&op, m_site_name, m_local_mirror_uuid);
+
+ auto aio_comp = create_rados_callback<
+ RemotePoolPoller<I>, &RemotePoolPoller<I>::handle_mirror_peer_ping>(this);
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::handle_mirror_peer_ping(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -EOPNOTSUPP) {
+ // older OSD that doesn't support snaphot-based mirroring, so no need
+ // to query remote peers
+ dout(10) << "remote peer does not support snapshot-based mirroring"
+ << dendl;
+ notify_listener();
+ return;
+ } else if (r < 0) {
+ // we can still see if we can perform a peer list and find outselves
+ derr << "failed to ping remote mirror peer: " << cpp_strerror(r) << dendl;
+ }
+
+ mirror_peer_list();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::mirror_peer_list() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_peer_list_start(&op);
+
+ auto aio_comp = create_rados_callback<
+ RemotePoolPoller<I>, &RemotePoolPoller<I>::handle_mirror_peer_list>(this);
+ m_out_bl.clear();
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::handle_mirror_peer_list(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::vector<cls::rbd::MirrorPeer> peers;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_peer_list_finish(&iter, &peers);
+ }
+
+ if (r < 0) {
+ derr << "failed to retrieve mirror peers: " << cpp_strerror(r) << dendl;
+ }
+
+ cls::rbd::MirrorPeer* matched_peer = nullptr;
+ for (auto& peer : peers) {
+ if (peer.mirror_peer_direction == cls::rbd::MIRROR_PEER_DIRECTION_RX) {
+ continue;
+ }
+
+ if (peer.mirror_uuid == m_local_mirror_uuid) {
+ matched_peer = &peer;
+ break;
+ } else if (peer.site_name == m_site_name) {
+ // keep searching in case we hit an exact match by fsid
+ matched_peer = &peer;
+ }
+ }
+
+ // older OSDs don't support peer ping so we might fail to find a match,
+ // which will prevent snapshot mirroring from functioning
+ std::string remote_mirror_peer_uuid;
+ if (matched_peer != nullptr) {
+ remote_mirror_peer_uuid = matched_peer->uuid;
+ }
+
+ dout(10) << "remote_mirror_peer_uuid=" << remote_mirror_peer_uuid << dendl;
+ if (m_remote_pool_meta.mirror_peer_uuid != remote_mirror_peer_uuid) {
+ m_remote_pool_meta.mirror_peer_uuid = remote_mirror_peer_uuid;
+ m_updated = true;
+ }
+
+ notify_listener();
+}
+
+template <typename I>
+void RemotePoolPoller<I>::notify_listener() {
+ bool updated = false;
+ std::swap(updated, m_updated);
+ if (updated) {
+ dout(10) << dendl;
+ m_listener.handle_updated(m_remote_pool_meta);
+ }
+
+ schedule_task(0);
+}
+
+template <typename I>
+void RemotePoolPoller<I>::schedule_task(int r) {
+ std::unique_lock locker{m_threads->timer_lock};
+
+ if (m_state == STATE_POLLING) {
+ dout(10) << dendl;
+
+ ceph_assert(m_timer_task == nullptr);
+ m_timer_task = new LambdaContext([this](int) {
+ handle_task();
+ });
+
+ m_threads->timer->add_event_after(POLL_INTERVAL_SECONDS, m_timer_task);
+ }
+
+ // finish init or shut down callback
+ if (m_on_finish != nullptr) {
+ locker.unlock();
+ Context* on_finish = nullptr;
+ std::swap(on_finish, m_on_finish);
+ on_finish->complete(m_state == STATE_SHUTTING_DOWN ? 0 : r);
+ }
+}
+
+template <typename I>
+void RemotePoolPoller<I>::handle_task() {
+ dout(10) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_threads->timer_lock));
+ m_timer_task = nullptr;
+
+ auto ctx = new LambdaContext([this](int) {
+ get_mirror_uuid();
+ });
+ m_threads->work_queue->queue(ctx);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::RemotePoolPoller<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/RemotePoolPoller.h b/src/tools/rbd_mirror/RemotePoolPoller.h
new file mode 100644
index 000000000..19d803ca1
--- /dev/null
+++ b/src/tools/rbd_mirror/RemotePoolPoller.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_REMOTE_POOL_POLLER_H
+#define CEPH_RBD_MIRROR_REMOTE_POOL_POLLER_H
+
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace remote_pool_poller {
+
+struct Listener {
+ virtual ~Listener() {}
+
+ virtual void handle_updated(const RemotePoolMeta& remote_pool_meta) = 0;
+};
+
+}; // namespace remote_pool_poller
+
+template <typename ImageCtxT>
+class RemotePoolPoller {
+public:
+ static RemotePoolPoller* create(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& remote_io_ctx,
+ const std::string& site_name,
+ const std::string& local_mirror_uuid,
+ remote_pool_poller::Listener& listener) {
+ return new RemotePoolPoller(threads, remote_io_ctx, site_name,
+ local_mirror_uuid, listener);
+ }
+
+ RemotePoolPoller(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& remote_io_ctx,
+ const std::string& site_name,
+ const std::string& local_mirror_uuid,
+ remote_pool_poller::Listener& listener)
+ : m_threads(threads),
+ m_remote_io_ctx(remote_io_ctx),
+ m_site_name(site_name),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_listener(listener) {
+ }
+ ~RemotePoolPoller();
+
+ void init(Context* on_finish);
+ void shut_down(Context* on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * |/----------------------------\
+ * | |
+ * v |
+ * MIRROR_UUID_GET |
+ * | |
+ * v |
+ * MIRROR_PEER_PING |
+ * | |
+ * v |
+ * MIRROR_PEER_LIST |
+ * | |
+ * v |
+ * MIRROR_UUID_GET |
+ * | |
+ * v (skip if no changes) |
+ * NOTIFY_LISTENER |
+ * | |
+ * | (repeat periodically) |
+ * |\----------------------------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ enum State {
+ STATE_INITIALIZING,
+ STATE_POLLING,
+ STATE_SHUTTING_DOWN
+ };
+
+ Threads<ImageCtxT>* m_threads;
+ librados::IoCtx& m_remote_io_ctx;
+ std::string m_site_name;
+ std::string m_local_mirror_uuid;
+ remote_pool_poller::Listener& m_listener;
+
+ bufferlist m_out_bl;
+
+ RemotePoolMeta m_remote_pool_meta;
+ bool m_updated = false;
+
+ State m_state = STATE_INITIALIZING;
+ Context* m_timer_task = nullptr;
+ Context* m_on_finish = nullptr;
+
+ void get_mirror_uuid();
+ void handle_get_mirror_uuid(int r);
+
+ void mirror_peer_ping();
+ void handle_mirror_peer_ping(int r);
+
+ void mirror_peer_list();
+ void handle_mirror_peer_list(int r);
+
+ void notify_listener();
+
+ void schedule_task(int r);
+ void handle_task();
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::RemotePoolPoller<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_REMOTE_POOL_POLLER_H
diff --git a/src/tools/rbd_mirror/ServiceDaemon.cc b/src/tools/rbd_mirror/ServiceDaemon.cc
new file mode 100644
index 000000000..f3cabcc87
--- /dev/null
+++ b/src/tools/rbd_mirror/ServiceDaemon.cc
@@ -0,0 +1,327 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/ServiceDaemon.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Timer.h"
+#include "tools/rbd_mirror/Threads.h"
+#include <sstream>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ServiceDaemon: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+const std::string RBD_MIRROR_AUTH_ID_PREFIX("rbd-mirror.");
+
+struct AttributeDumpVisitor : public boost::static_visitor<void> {
+ ceph::Formatter *f;
+ const std::string& name;
+
+ AttributeDumpVisitor(ceph::Formatter *f, const std::string& name)
+ : f(f), name(name) {
+ }
+
+ void operator()(bool val) const {
+ f->dump_bool(name.c_str(), val);
+ }
+ void operator()(uint64_t val) const {
+ f->dump_unsigned(name.c_str(), val);
+ }
+ void operator()(const std::string& val) const {
+ f->dump_string(name.c_str(), val);
+ }
+};
+
+} // anonymous namespace
+
+using namespace service_daemon;
+
+template <typename I>
+ServiceDaemon<I>::ServiceDaemon(CephContext *cct, RadosRef rados,
+ Threads<I>* threads)
+ : m_cct(cct), m_rados(rados), m_threads(threads) {
+ dout(20) << dendl;
+}
+
+template <typename I>
+ServiceDaemon<I>::~ServiceDaemon() {
+ dout(20) << dendl;
+ std::lock_guard timer_locker{m_threads->timer_lock};
+ if (m_timer_ctx != nullptr) {
+ m_threads->timer->cancel_event(m_timer_ctx);
+ update_status();
+ }
+}
+
+template <typename I>
+int ServiceDaemon<I>::init() {
+ dout(20) << dendl;
+
+ std::string id = m_cct->_conf->name.get_id();
+ if (id.find(RBD_MIRROR_AUTH_ID_PREFIX) == 0) {
+ id = id.substr(RBD_MIRROR_AUTH_ID_PREFIX.size());
+ }
+
+ std::string instance_id = stringify(m_rados->get_instance_id());
+ std::map<std::string, std::string> service_metadata = {
+ {"id", id}, {"instance_id", instance_id}};
+ int r = m_rados->service_daemon_register("rbd-mirror", instance_id,
+ service_metadata);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void ServiceDaemon<I>::add_pool(int64_t pool_id, const std::string& pool_name) {
+ dout(20) << "pool_id=" << pool_id << ", pool_name=" << pool_name << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ m_pools.insert({pool_id, {pool_name}});
+ }
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_pool(int64_t pool_id) {
+ dout(20) << "pool_id=" << pool_id << dendl;
+ {
+ std::lock_guard locker{m_lock};
+ m_pools.erase(pool_id);
+ }
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::add_namespace(int64_t pool_id,
+ const std::string& namespace_name) {
+ dout(20) << "pool_id=" << pool_id << ", namespace=" << namespace_name
+ << dendl;
+
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.ns_attributes[namespace_name];
+
+ // don't schedule update status as the namespace attributes are empty yet
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_namespace(int64_t pool_id,
+ const std::string& namespace_name) {
+ dout(20) << "pool_id=" << pool_id << ", namespace=" << namespace_name
+ << dendl;
+ {
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.ns_attributes.erase(namespace_name);
+ }
+ schedule_update_status();
+}
+
+template <typename I>
+uint64_t ServiceDaemon<I>::add_or_update_callout(int64_t pool_id,
+ uint64_t callout_id,
+ CalloutLevel callout_level,
+ const std::string& text) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "callout_id=" << callout_id << ", "
+ << "callout_level=" << callout_level << ", "
+ << "text=" << text << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return CALLOUT_ID_NONE;
+ }
+
+ if (callout_id == CALLOUT_ID_NONE) {
+ callout_id = ++m_callout_id;
+ }
+ pool_it->second.callouts[callout_id] = {callout_level, text};
+ }
+
+ schedule_update_status();
+ return callout_id;
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_callout(int64_t pool_id, uint64_t callout_id) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "callout_id=" << callout_id << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.callouts.erase(callout_id);
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::add_or_update_attribute(int64_t pool_id,
+ const std::string& key,
+ const AttributeValue& value) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "key=" << key << ", "
+ << "value=" << value << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.attributes[key] = value;
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::add_or_update_namespace_attribute(
+ int64_t pool_id, const std::string& namespace_name, const std::string& key,
+ const AttributeValue& value) {
+ if (namespace_name.empty()) {
+ add_or_update_attribute(pool_id, key, value);
+ return;
+ }
+
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "namespace=" << namespace_name << ", "
+ << "key=" << key << ", "
+ << "value=" << value << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+
+ auto ns_it = pool_it->second.ns_attributes.find(namespace_name);
+ if (ns_it == pool_it->second.ns_attributes.end()) {
+ return;
+ }
+
+ ns_it->second[key] = value;
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_attribute(int64_t pool_id,
+ const std::string& key) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "key=" << key << dendl;
+
+ {
+ std::lock_guard locker{m_lock};
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.attributes.erase(key);
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::schedule_update_status() {
+ std::lock_guard timer_locker{m_threads->timer_lock};
+ if (m_timer_ctx != nullptr) {
+ return;
+ }
+
+ m_timer_ctx = new LambdaContext([this](int) {
+ m_timer_ctx = nullptr;
+ update_status();
+ });
+ m_threads->timer->add_event_after(1, m_timer_ctx);
+}
+
+template <typename I>
+void ServiceDaemon<I>::update_status() {
+ dout(20) << dendl;
+ ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+
+ ceph::JSONFormatter f;
+ {
+ std::lock_guard locker{m_lock};
+ f.open_object_section("pools");
+ for (auto& pool_pair : m_pools) {
+ f.open_object_section(stringify(pool_pair.first).c_str());
+ f.dump_string("name", pool_pair.second.name);
+ f.open_object_section("callouts");
+ for (auto& callout : pool_pair.second.callouts) {
+ f.open_object_section(stringify(callout.first).c_str());
+ f.dump_string("level", stringify(callout.second.level).c_str());
+ f.dump_string("text", callout.second.text.c_str());
+ f.close_section();
+ }
+ f.close_section(); // callouts
+
+ for (auto& attribute : pool_pair.second.attributes) {
+ AttributeDumpVisitor attribute_dump_visitor(&f, attribute.first);
+ boost::apply_visitor(attribute_dump_visitor, attribute.second);
+ }
+
+ if (!pool_pair.second.ns_attributes.empty()) {
+ f.open_object_section("namespaces");
+ for (auto& [ns, attributes] : pool_pair.second.ns_attributes) {
+ f.open_object_section(ns.c_str());
+ for (auto& [key, value] : attributes) {
+ AttributeDumpVisitor attribute_dump_visitor(&f, key);
+ boost::apply_visitor(attribute_dump_visitor, value);
+ }
+ f.close_section(); // namespace
+ }
+ f.close_section(); // namespaces
+ }
+ f.close_section(); // pool
+ }
+ f.close_section(); // pools
+ }
+
+ std::stringstream ss;
+ f.flush(ss);
+
+ int r = m_rados->service_daemon_update_status({{"json", ss.str()}});
+ if (r < 0) {
+ derr << "failed to update service daemon status: " << cpp_strerror(r)
+ << dendl;
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ServiceDaemon.h b/src/tools/rbd_mirror/ServiceDaemon.h
new file mode 100644
index 000000000..8b1e0f584
--- /dev/null
+++ b/src/tools/rbd_mirror/ServiceDaemon.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_H
+#define CEPH_RBD_MIRROR_SERVICE_DAEMON_H
+
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <map>
+#include <string>
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ServiceDaemon {
+public:
+ ServiceDaemon(CephContext *cct, RadosRef rados, Threads<ImageCtxT>* threads);
+ ~ServiceDaemon();
+
+ int init();
+
+ void add_pool(int64_t pool_id, const std::string& pool_name);
+ void remove_pool(int64_t pool_id);
+
+ void add_namespace(int64_t pool_id, const std::string& namespace_name);
+ void remove_namespace(int64_t pool_id, const std::string& namespace_name);
+
+ uint64_t add_or_update_callout(int64_t pool_id, uint64_t callout_id,
+ service_daemon::CalloutLevel callout_level,
+ const std::string& text);
+ void remove_callout(int64_t pool_id, uint64_t callout_id);
+
+ void add_or_update_attribute(int64_t pool_id, const std::string& key,
+ const service_daemon::AttributeValue& value);
+ void add_or_update_namespace_attribute(
+ int64_t pool_id, const std::string& namespace_name,
+ const std::string& key, const service_daemon::AttributeValue& value);
+ void remove_attribute(int64_t pool_id, const std::string& key);
+
+private:
+ struct Callout {
+ service_daemon::CalloutLevel level;
+ std::string text;
+
+ Callout() : level(service_daemon::CALLOUT_LEVEL_INFO) {
+ }
+ Callout(service_daemon::CalloutLevel level, const std::string& text)
+ : level(level), text(text) {
+ }
+ };
+ typedef std::map<uint64_t, Callout> Callouts;
+ typedef std::map<std::string, service_daemon::AttributeValue> Attributes;
+ typedef std::map<std::string, Attributes> NamespaceAttributes;
+
+ struct Pool {
+ std::string name;
+ Callouts callouts;
+ Attributes attributes;
+ NamespaceAttributes ns_attributes;
+
+ Pool(const std::string& name) : name(name) {
+ }
+ };
+
+ typedef std::map<int64_t, Pool> Pools;
+
+ CephContext *m_cct;
+ RadosRef m_rados;
+ Threads<ImageCtxT>* m_threads;
+
+ ceph::mutex m_lock = ceph::make_mutex("rbd::mirror::ServiceDaemon");
+ Pools m_pools;
+ uint64_t m_callout_id = service_daemon::CALLOUT_ID_NONE;
+
+ Context* m_timer_ctx = nullptr;
+
+ void schedule_update_status();
+ void update_status();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_H
diff --git a/src/tools/rbd_mirror/Threads.cc b/src/tools/rbd_mirror/Threads.cc
new file mode 100644
index 000000000..b0c762641
--- /dev/null
+++ b/src/tools/rbd_mirror/Threads.cc
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/Threads.h"
+#include "common/Timer.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/asio/ContextWQ.h"
+
+namespace rbd {
+namespace mirror {
+
+template <typename I>
+Threads<I>::Threads(std::shared_ptr<librados::Rados>& rados) {
+ auto cct = static_cast<CephContext*>(rados->cct());
+ asio_engine = new librbd::AsioEngine(rados);
+ work_queue = asio_engine->get_work_queue();
+
+ timer = new SafeTimer(cct, timer_lock, true);
+ timer->init();
+}
+
+template <typename I>
+Threads<I>::~Threads() {
+ {
+ std::lock_guard timer_locker{timer_lock};
+ timer->shutdown();
+ }
+ delete timer;
+
+ work_queue->drain();
+ delete asio_engine;
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::Threads<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/Threads.h b/src/tools/rbd_mirror/Threads.h
new file mode 100644
index 000000000..35c0b0f1c
--- /dev/null
+++ b/src/tools/rbd_mirror/Threads.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_THREADS_H
+#define CEPH_RBD_MIRROR_THREADS_H
+
+#include "include/common_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include <memory>
+
+class ThreadPool;
+
+namespace librbd {
+struct AsioEngine;
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Threads {
+public:
+ librbd::AsioEngine* asio_engine = nullptr;
+ librbd::asio::ContextWQ* work_queue = nullptr;
+
+ SafeTimer *timer = nullptr;
+ ceph::mutex timer_lock = ceph::make_mutex("Threads::timer_lock");
+
+ explicit Threads(std::shared_ptr<librados::Rados>& rados);
+ Threads(const Threads&) = delete;
+ Threads& operator=(const Threads&) = delete;
+
+ ~Threads();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::Threads<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_THREADS_H
diff --git a/src/tools/rbd_mirror/Throttler.cc b/src/tools/rbd_mirror/Throttler.cc
new file mode 100644
index 000000000..b20298963
--- /dev/null
+++ b/src/tools/rbd_mirror/Throttler.cc
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Throttler.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::Throttler:: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+template <typename I>
+// Construct a throttler whose concurrency limit is read from the named config
+// option; registers itself as a config observer so runtime changes to that
+// option take effect via handle_conf_change().
+Throttler<I>::Throttler(CephContext *cct, const std::string &config_key)
+  : m_cct(cct), m_config_key(config_key),
+    m_config_keys{m_config_key.c_str(), nullptr},
+    m_lock(ceph::make_mutex(
+      librbd::util::unique_lock_name("rbd::mirror::Throttler", this))),
+    m_max_concurrent_ops(cct->_conf.get_val<uint64_t>(m_config_key)) {
+  dout(20) << m_config_key << "=" << m_max_concurrent_ops << dendl;
+  m_cct->_conf.add_observer(this);
+}
+
+template <typename I>
+// Unregister from config notifications; callers must have finished/cancelled
+// every op first (asserted below).
+Throttler<I>::~Throttler() {
+  m_cct->_conf.remove_observer(this);
+
+  std::lock_guard locker{m_lock};
+  ceph_assert(m_inflight_ops.empty());
+  ceph_assert(m_queue.empty());
+}
+
+template <typename I>
+// Request a slot for op (ns, id_).  If a slot is free (or the limit is 0 ==
+// unlimited) on_start completes immediately with 0; otherwise on_start is
+// queued.  A duplicate request for an already-queued op replaces the stored
+// callback and completes the old one with -ENOENT.
+void Throttler<I>::start_op(const std::string &ns,
+                            const std::string &id_,
+                            Context *on_start) {
+  Id id{ns, id_};
+
+  dout(20) << "id=" << id << dendl;
+
+  int r = 0;
+  {
+    std::lock_guard locker{m_lock};
+
+    if (m_inflight_ops.count(id) > 0) {
+      dout(20) << "duplicate for already started op " << id << dendl;
+    } else if (m_queued_ops.count(id) > 0) {
+      dout(20) << "duplicate for already queued op " << id << dendl;
+      // swap: stash the new callback, complete the previously queued one
+      std::swap(m_queued_ops[id], on_start);
+      r = -ENOENT;
+    } else if (m_max_concurrent_ops == 0 ||
+               m_inflight_ops.size() < m_max_concurrent_ops) {
+      ceph_assert(m_queue.empty());
+      m_inflight_ops.insert(id);
+      dout(20) << "ready to start op for " << id << " ["
+               << m_inflight_ops.size() << "/" << m_max_concurrent_ops << "]"
+               << dendl;
+    } else {
+      m_queue.push_back(id);
+      // swap leaves on_start == nullptr so it is not completed below
+      std::swap(m_queued_ops[id], on_start);
+      dout(20) << "op for " << id << " has been queued" << dendl;
+    }
+  }
+
+  // complete outside the lock to avoid re-entrant lock acquisition
+  if (on_start != nullptr) {
+    on_start->complete(r);
+  }
+}
+
+template <typename I>
+// Cancel a *queued* op: its stored callback completes with -ECANCELED and
+// true is returned.  In-flight ops are not affected (returns false).
+bool Throttler<I>::cancel_op(const std::string &ns,
+                             const std::string &id_) {
+  Id id{ns, id_};
+
+  dout(20) << "id=" << id << dendl;
+
+  Context *on_start = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    auto it = m_queued_ops.find(id);
+    if (it != m_queued_ops.end()) {
+      dout(20) << "canceled queued op for " << id << dendl;
+      m_queue.remove(id);
+      on_start = it->second;
+      m_queued_ops.erase(it);
+    }
+  }
+
+  if (on_start == nullptr) {
+    return false;
+  }
+
+  // complete outside the lock
+  on_start->complete(-ECANCELED);
+  return true;
+}
+
+template <typename I>
+// Mark op (ns, id_) finished.  If it was still only queued, cancel_op()
+// handles it; otherwise remove it from the in-flight set and promote the
+// next queued op (if any) by completing its callback with 0.
+void Throttler<I>::finish_op(const std::string &ns,
+                             const std::string &id_) {
+  Id id{ns, id_};
+
+  dout(20) << "id=" << id << dendl;
+
+  if (cancel_op(ns, id_)) {
+    return;
+  }
+
+  Context *on_start = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+
+    m_inflight_ops.erase(id);
+
+    // NOTE(review): no explicit m_max_concurrent_ops == 0 branch here;
+    // relies on the invariant that the queue is always empty when the
+    // limit is 0 (start_op never queues and set_max_concurrent_ops(0)
+    // drains the queue) — confirm if that invariant changes.
+    if (m_inflight_ops.size() < m_max_concurrent_ops && !m_queue.empty()) {
+      auto id = m_queue.front();  // shadows outer 'id' intentionally
+      auto it = m_queued_ops.find(id);
+      ceph_assert(it != m_queued_ops.end());
+      m_inflight_ops.insert(id);
+      dout(20) << "ready to start op for " << id << " ["
+               << m_inflight_ops.size() << "/" << m_max_concurrent_ops << "]"
+               << dendl;
+      on_start = it->second;
+      m_queued_ops.erase(it);
+      m_queue.pop_front();
+    }
+  }
+
+  if (on_start != nullptr) {
+    on_start->complete(0);
+  }
+}
+
+template <typename I>
+// Remove every queued and in-flight op belonging to namespace 'ns'.
+// Queued callbacks are completed with 'r'; in-flight ops are simply
+// forgotten (their owners are expected to be shutting down).
+void Throttler<I>::drain(const std::string &ns, int r) {
+  dout(20) << "ns=" << ns << dendl;
+
+  std::map<Id, Context *> queued_ops;
+  {
+    std::lock_guard locker{m_lock};
+    for (auto it = m_queued_ops.begin(); it != m_queued_ops.end(); ) {
+      if (it->first.first == ns) {
+        queued_ops[it->first] = it->second;
+        m_queue.remove(it->first);
+        it = m_queued_ops.erase(it);
+      } else {
+        it++;
+      }
+    }
+    for (auto it = m_inflight_ops.begin(); it != m_inflight_ops.end(); ) {
+      if (it->first == ns) {
+        dout(20) << "inflight_op " << *it << dendl;
+        it = m_inflight_ops.erase(it);
+      } else {
+        it++;
+      }
+    }
+  }
+
+  // complete outside the lock
+  for (auto &it : queued_ops) {
+    dout(20) << "queued_op " << it.first << dendl;
+    it.second->complete(r);
+  }
+}
+
+template <typename I>
+// Update the concurrency limit (0 == unlimited) and start as many queued ops
+// as the new limit allows, completing their callbacks with 0.
+void Throttler<I>::set_max_concurrent_ops(uint32_t max) {
+  dout(20) << "max=" << max << dendl;
+
+  std::list<Context *> ops;
+  {
+    std::lock_guard locker{m_lock};
+    m_max_concurrent_ops = max;
+
+    // Start waiting ops in the case of available free slots
+    while ((m_max_concurrent_ops == 0 ||
+            m_inflight_ops.size() < m_max_concurrent_ops) &&
+           !m_queue.empty()) {
+      auto id = m_queue.front();
+      m_inflight_ops.insert(id);
+      dout(20) << "ready to start op for " << id << " ["
+               << m_inflight_ops.size() << "/" << m_max_concurrent_ops << "]"
+               << dendl;
+      auto it = m_queued_ops.find(id);
+      ceph_assert(it != m_queued_ops.end());
+      ops.push_back(it->second);
+      m_queued_ops.erase(it);
+      m_queue.pop_front();
+    }
+  }
+
+  // complete outside the lock
+  for (const auto& ctx : ops) {
+    ctx->complete(0);
+  }
+}
+
+template <typename I>
+// Dump limit/running/waiting counts into the supplied Formatter (admin
+// socket status output).
+void Throttler<I>::print_status(ceph::Formatter *f) {
+  dout(20) << dendl;
+
+  std::lock_guard locker{m_lock};
+
+  f->dump_int("max_parallel_requests", m_max_concurrent_ops);
+  f->dump_int("running_requests", m_inflight_ops.size());
+  f->dump_int("waiting_requests", m_queue.size());
+}
+
+template <typename I>
+// Config-observer hook: the single tracked key plus a null terminator.
+const char** Throttler<I>::get_tracked_conf_keys() const {
+  return m_config_keys;
+}
+
+template <typename I>
+// Config-observer hook: re-read the limit when our tracked option changes.
+// NOTE(review): unqualified set/string rely on a using-declaration pulled in
+// via the included headers — confirm before moving this code.
+void Throttler<I>::handle_conf_change(const ConfigProxy& conf,
+                                      const set<string> &changed) {
+  if (changed.count(m_config_key)) {
+    set_max_concurrent_ops(conf.get_val<uint64_t>(m_config_key));
+  }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::Throttler<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/Throttler.h b/src/tools/rbd_mirror/Throttler.h
new file mode 100644
index 000000000..32080238a
--- /dev/null
+++ b/src/tools/rbd_mirror/Throttler.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_THROTTLER_H
+#define RBD_MIRROR_THROTTLER_H
+
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "common/ceph_mutex.h"
+#include "common/config_obs.h"
+#include "include/common_fwd.h"
+
+class Context;
+
+namespace ceph { class Formatter; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+// Limits the number of concurrently running ops per configured maximum,
+// queueing overflow.  Ops are identified by a (namespace, id) pair; the
+// limit is read from a config option and tracked via md_config_obs_t.
+template <typename ImageCtxT = librbd::ImageCtx>
+class Throttler : public md_config_obs_t {
+public:
+  static Throttler *create(
+      CephContext *cct,
+      const std::string &config_key) {
+    return new Throttler(cct, config_key);
+  }
+  void destroy() {
+    delete this;
+  }
+
+  Throttler(CephContext *cct,
+            const std::string &config_key);
+  ~Throttler() override;
+
+  void set_max_concurrent_ops(uint32_t max);
+  void start_op(const std::string &ns, const std::string &id,
+                Context *on_start);
+  bool cancel_op(const std::string &ns, const std::string &id);
+  void finish_op(const std::string &ns, const std::string &id);
+  void drain(const std::string &ns, int r);
+
+  void print_status(ceph::Formatter *f);
+
+private:
+  // op key: (pool namespace, op id)
+  typedef std::pair<std::string, std::string> Id;
+
+  CephContext *m_cct;
+  const std::string m_config_key;
+  // {m_config_key.c_str(), nullptr} — returned by get_tracked_conf_keys()
+  mutable const char* m_config_keys[2];
+
+  ceph::mutex m_lock;
+  uint32_t m_max_concurrent_ops;  // 0 == unlimited
+  std::list<Id> m_queue;          // FIFO of waiting ops
+  std::map<Id, Context *> m_queued_ops;  // waiting op -> start callback
+  std::set<Id> m_inflight_ops;
+
+  const char **get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::Throttler<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_THROTTLER_H
diff --git a/src/tools/rbd_mirror/Types.cc b/src/tools/rbd_mirror/Types.cc
new file mode 100644
index 000000000..cd71c73b1
--- /dev/null
+++ b/src/tools/rbd_mirror/Types.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/Types.h"
+
+namespace rbd {
+namespace mirror {
+
+// Debug/log formatting for ImageId.
+std::ostream &operator<<(std::ostream &os, const ImageId &image_id) {
+  return os << "global id=" << image_id.global_id << ", "
+            << "id=" << image_id.id;
+}
+
+// Debug/log formatting for LocalPoolMeta.
+std::ostream& operator<<(std::ostream& lhs,
+                         const LocalPoolMeta& rhs) {
+  return lhs << "mirror_uuid=" << rhs.mirror_uuid;
+}
+
+// Debug/log formatting for RemotePoolMeta.
+std::ostream& operator<<(std::ostream& lhs,
+                         const RemotePoolMeta& rhs) {
+  return lhs << "mirror_uuid=" << rhs.mirror_uuid << ", "
+                "mirror_peer_uuid=" << rhs.mirror_peer_uuid;
+}
+
+// Debug/log formatting for PeerSpec (mon_host/key deliberately omitted —
+// key is a secret).
+std::ostream& operator<<(std::ostream& lhs, const PeerSpec &peer) {
+  return lhs << "uuid: " << peer.uuid
+	     << " cluster: " << peer.cluster_name
+	     << " client: " << peer.client_name;
+}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/Types.h b/src/tools/rbd_mirror/Types.h
new file mode 100644
index 000000000..7b2a3b5ce
--- /dev/null
+++ b/src/tools/rbd_mirror/Types.h
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_TYPES_H
+#define CEPH_RBD_MIRROR_TYPES_H
+
+#include <iostream>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct MirrorStatusUpdater;
+
+// Performance counters
+enum {
+  // journal-based mirroring replay counters
+  l_rbd_mirror_journal_first = 27000,
+  l_rbd_mirror_replay,
+  l_rbd_mirror_replay_bytes,
+  l_rbd_mirror_replay_latency,
+  l_rbd_mirror_journal_last,
+  // snapshot-based mirroring replay counters
+  l_rbd_mirror_snapshot_first,
+  l_rbd_mirror_snapshot_replay_snapshots,
+  l_rbd_mirror_snapshot_replay_snapshots_time,
+  l_rbd_mirror_snapshot_replay_bytes,
+  l_rbd_mirror_snapshot_last,
+};
+
+typedef std::shared_ptr<librados::Rados> RadosRef;
+typedef std::shared_ptr<librados::IoCtx> IoCtxRef;
+typedef std::shared_ptr<librbd::Image> ImageRef;
+
+// Pairs a mirroring global image id with the pool-local image id.  Equality
+// compares both fields, but ordering uses only global_id — sets keyed on
+// ImageId treat entries with the same global_id as equivalent.
+struct ImageId {
+  std::string global_id;
+  std::string id;
+
+  explicit ImageId(const std::string &global_id) : global_id(global_id) {
+  }
+  ImageId(const std::string &global_id, const std::string &id)
+    : global_id(global_id), id(id) {
+  }
+
+  inline bool operator==(const ImageId &rhs) const {
+    return (global_id == rhs.global_id && id == rhs.id);
+  }
+  inline bool operator<(const ImageId &rhs) const {
+    return global_id < rhs.global_id;
+  }
+};
+
+std::ostream &operator<<(std::ostream &, const ImageId &image_id);
+
+typedef std::set<ImageId> ImageIds;
+
+// Mirroring metadata for the local pool (its mirror uuid).
+struct LocalPoolMeta {
+  LocalPoolMeta() {}
+  LocalPoolMeta(const std::string& mirror_uuid)
+    : mirror_uuid(mirror_uuid) {
+  }
+
+  std::string mirror_uuid;
+};
+
+std::ostream& operator<<(std::ostream& lhs,
+ const LocalPoolMeta& local_pool_meta);
+
+// Mirroring metadata for a remote pool: its mirror uuid plus the peer uuid
+// used when updating mirror status on the remote cluster.
+struct RemotePoolMeta {
+  RemotePoolMeta() {}
+  RemotePoolMeta(const std::string& mirror_uuid,
+                 const std::string& mirror_peer_uuid)
+    : mirror_uuid(mirror_uuid),
+      mirror_peer_uuid(mirror_peer_uuid) {
+  }
+
+  std::string mirror_uuid;
+  std::string mirror_peer_uuid;
+};
+
+std::ostream& operator<<(std::ostream& lhs,
+ const RemotePoolMeta& remote_pool_meta);
+
+// Connection state for one remote peer.  'uuid' uniquely identifies the peer
+// and provides the ordering used when peers live in sorted containers.
+template <typename I>
+struct Peer {
+  std::string uuid;
+  mutable librados::IoCtx io_ctx;
+  RemotePoolMeta remote_pool_meta;
+  MirrorStatusUpdater<I>* mirror_status_updater = nullptr;
+
+  Peer() {
+  }
+  Peer(const std::string& uuid,
+       librados::IoCtx& io_ctx,
+       const RemotePoolMeta& remote_pool_meta,
+       MirrorStatusUpdater<I>* mirror_status_updater)
+    : uuid(uuid),  // bug fix: 'uuid' was never initialized from the
+                   // parameter, leaving the member empty and breaking
+                   // the operator< ordering below
+      io_ctx(io_ctx),
+      remote_pool_meta(remote_pool_meta),
+      mirror_status_updater(mirror_status_updater) {
+  }
+
+  // order peers by their unique id
+  inline bool operator<(const Peer &rhs) const {
+    return uuid < rhs.uuid;
+  }
+};
+
+template <typename I>
+// Debug/log formatting for Peer; delegates to RemotePoolMeta's formatter.
+std::ostream& operator<<(std::ostream& lhs, const Peer<I>& peer) {
+  return lhs << peer.remote_pool_meta;
+}
+
+// User-supplied specification of a peer cluster (uuid, cluster/site name,
+// client name, plus optional mon_host/key overrides).  Provides equality and
+// a strict-weak ordering over all five fields for use in sorted containers.
+struct PeerSpec {
+  PeerSpec() = default;
+  PeerSpec(const std::string &uuid, const std::string &cluster_name,
+           const std::string &client_name)
+    : uuid(uuid), cluster_name(cluster_name), client_name(client_name)
+  {
+  }
+  PeerSpec(const librbd::mirror_peer_site_t &peer) :
+    uuid(peer.uuid),
+    cluster_name(peer.site_name),
+    client_name(peer.client_name)
+  {
+  }
+
+  std::string uuid;
+  std::string cluster_name;
+  std::string client_name;
+
+  /// optional config properties
+  std::string mon_host;
+  std::string key;
+
+  bool operator==(const PeerSpec& rhs) const {
+    return (uuid == rhs.uuid &&
+            cluster_name == rhs.cluster_name &&
+            client_name == rhs.client_name &&
+            mon_host == rhs.mon_host &&
+            key == rhs.key);
+  }
+  bool operator<(const PeerSpec& rhs) const {
+    if (uuid != rhs.uuid) {
+      return uuid < rhs.uuid;
+    } else if (cluster_name != rhs.cluster_name) {
+      return cluster_name < rhs.cluster_name;
+    } else if (client_name != rhs.client_name) {
+      return client_name < rhs.client_name;
+    } else if (mon_host != rhs.mon_host) {
+      // bug fix: was 'mon_host < rhs.mon_host', which violated strict weak
+      // ordering (a<b and b<a could both hold once the tie fell through to
+      // the key comparison) — undefined behavior in std::set/std::map
+      return mon_host < rhs.mon_host;
+    } else {
+      return key < rhs.key;
+    }
+  }
+};
+
+std::ostream& operator<<(std::ostream& lhs, const PeerSpec &peer);
+
+} // namespace mirror
+} // namespace rbd
+
+
+#endif // CEPH_RBD_MIRROR_TYPES_H
diff --git a/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc
new file mode 100644
index 000000000..19a98804c
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Policy.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::SnapshotPurgeRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+// Entry point: kick off the open -> unprotect/remove loop -> close state
+// machine (see header diagram).
+void SnapshotPurgeRequest<I>::send() {
+  open_image();
+}
+
+template <typename I>
+// Open the image by id, masking the non-primary read-only flag (purging runs
+// on non-primary images) and installing a journal policy that skips journal
+// replay.
+void SnapshotPurgeRequest<I>::open_image() {
+  dout(10) << dendl;
+  m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, false);
+
+  // ensure non-primary images can be modified
+  m_image_ctx->read_only_mask &= ~librbd::IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+
+  {
+    std::unique_lock image_locker{m_image_ctx->image_lock};
+    m_image_ctx->set_journal_policy(new JournalPolicy());
+  }
+
+  Context *ctx = create_context_callback<
+    SnapshotPurgeRequest<I>, &SnapshotPurgeRequest<I>::handle_open_image>(
+      this);
+  m_image_ctx->state->open(librbd::OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template <typename I>
+// Completion of open(): on failure finish with the error (image ctx already
+// destroyed by the state machine), otherwise take the exclusive lock.
+void SnapshotPurgeRequest<I>::handle_open_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to open image '" << m_image_id << "': " << cpp_strerror(r)
+         << dendl;
+    m_image_ctx = nullptr;
+
+    finish(r);
+    return;
+  }
+
+  acquire_lock();
+}
+
+template <typename I>
+// Acquire the exclusive lock if the feature is enabled; otherwise proceed
+// straight to snapshot unprotection.
+void SnapshotPurgeRequest<I>::acquire_lock() {
+  dout(10) << dendl;
+
+  m_image_ctx->owner_lock.lock_shared();
+  if (m_image_ctx->exclusive_lock == nullptr) {
+    m_image_ctx->owner_lock.unlock_shared();
+
+    start_snap_unprotect();
+    return;
+  }
+
+  m_image_ctx->exclusive_lock->acquire_lock(create_context_callback<
+    SnapshotPurgeRequest<I>, &SnapshotPurgeRequest<I>::handle_acquire_lock>(
+      this));
+  m_image_ctx->owner_lock.unlock_shared();
+}
+
+template <typename I>
+// Completion of exclusive-lock acquisition; on error close the image,
+// preserving the error in m_ret_val.
+void SnapshotPurgeRequest<I>::handle_acquire_lock(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to acquire exclusive lock: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  start_snap_unprotect();
+}
+
+template <typename I>
+// Snapshot the current snap-id list under the image lock, then begin the
+// per-snapshot unprotect/remove loop (processed back-to-front).
+void SnapshotPurgeRequest<I>::start_snap_unprotect() {
+  dout(10) << dendl;
+
+  {
+    std::shared_lock image_locker{m_image_ctx->image_lock};
+    m_snaps = m_image_ctx->snaps;
+  }
+  snap_unprotect();
+}
+
+template <typename I>
+// Process the last snapshot in m_snaps: look up its namespace/name and
+// protection status; if unprotected skip straight to removal, otherwise
+// issue an unprotect op guarded by a lock-op context.
+void SnapshotPurgeRequest<I>::snap_unprotect() {
+  if (m_snaps.empty()) {
+    // all snapshots purged
+    close_image();
+    return;
+  }
+
+  librados::snap_t snap_id = m_snaps.back();
+  m_image_ctx->image_lock.lock_shared();
+  int r = m_image_ctx->get_snap_namespace(snap_id, &m_snap_namespace);
+  if (r < 0) {
+    m_image_ctx->image_lock.unlock_shared();
+
+    derr << "failed to get snap namespace: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  r = m_image_ctx->get_snap_name(snap_id, &m_snap_name);
+  if (r < 0) {
+    m_image_ctx->image_lock.unlock_shared();
+
+    derr << "failed to get snap name: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  bool is_protected;
+  r = m_image_ctx->is_snap_protected(snap_id, &is_protected);
+  if (r < 0) {
+    m_image_ctx->image_lock.unlock_shared();
+
+    derr << "failed to get snap protection status: " << cpp_strerror(r)
+         << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+  m_image_ctx->image_lock.unlock_shared();
+
+  if (!is_protected) {
+    // no unprotect needed — remove directly
+    snap_remove();
+    return;
+  }
+
+  dout(10) << "snap_id=" << snap_id << ", "
+           << "snap_namespace=" << m_snap_namespace << ", "
+           << "snap_name=" << m_snap_name << dendl;
+
+  auto finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    derr << "lost exclusive lock" << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  // finish_op_ctx must complete after the unprotect op to release the
+  // exclusive-lock op tracking
+  auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+      handle_snap_unprotect(r);
+      finish_op_ctx->complete(0);
+    });
+  std::shared_lock owner_locker{m_image_ctx->owner_lock};
+  m_image_ctx->operations->execute_snap_unprotect(
+    m_snap_namespace, m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+// Completion of snap unprotect.  -EBUSY (snapshot has clones) aborts the
+// purge; on success, patch the cached protection status so removal does not
+// require a refresh, then remove the snapshot.
+void SnapshotPurgeRequest<I>::handle_snap_unprotect(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -EBUSY) {
+    dout(10) << "snapshot in-use" << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  } else if (r < 0) {
+    derr << "failed to unprotect snapshot: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  {
+    // avoid the need to refresh to delete the newly unprotected snapshot
+    std::shared_lock image_locker{m_image_ctx->image_lock};
+    librados::snap_t snap_id = m_snaps.back();
+    auto snap_info_it = m_image_ctx->snap_info.find(snap_id);
+    if (snap_info_it != m_image_ctx->snap_info.end()) {
+      snap_info_it->second.protection_status =
+        RBD_PROTECTION_STATUS_UNPROTECTED;
+    }
+  }
+
+  snap_remove();
+}
+
+template <typename I>
+// Remove the current (last) snapshot, guarded by a lock-op context like
+// snap_unprotect().
+void SnapshotPurgeRequest<I>::snap_remove() {
+  librados::snap_t snap_id = m_snaps.back();
+  dout(10) << "snap_id=" << snap_id << ", "
+           << "snap_namespace=" << m_snap_namespace << ", "
+           << "snap_name=" << m_snap_name << dendl;
+
+  int r;
+  auto finish_op_ctx = start_lock_op(&r);
+  if (finish_op_ctx == nullptr) {
+    derr << "lost exclusive lock" << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  auto ctx = new LambdaContext([this, finish_op_ctx](int r) {
+      handle_snap_remove(r);
+      finish_op_ctx->complete(0);
+    });
+  std::shared_lock owner_locker{m_image_ctx->owner_lock};
+  m_image_ctx->operations->execute_snap_remove(
+    m_snap_namespace, m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+// Completion of snap removal: pop the processed snapshot and loop back to
+// snap_unprotect() for the next one; -EBUSY or other errors abort the purge.
+void SnapshotPurgeRequest<I>::handle_snap_remove(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -EBUSY) {
+    dout(10) << "snapshot in-use" << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  } else if (r < 0) {
+    derr << "failed to remove snapshot: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  m_snaps.pop_back();
+  snap_unprotect();
+}
+
+template <typename I>
+// Close the image (always reached, on success or failure paths).
+void SnapshotPurgeRequest<I>::close_image() {
+  dout(10) << dendl;
+
+  m_image_ctx->state->close(create_context_callback<
+    SnapshotPurgeRequest<I>,
+    &SnapshotPurgeRequest<I>::handle_close_image>(this));
+}
+
+template <typename I>
+// Completion of close; any earlier error stored in m_ret_val takes
+// precedence inside finish().
+void SnapshotPurgeRequest<I>::handle_close_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  m_image_ctx = nullptr;
+
+  if (r < 0) {
+    derr << "failed to close: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+  finish(0);
+}
+
+template <typename I>
+// Complete the user callback (first recorded error wins) and self-destruct.
+void SnapshotPurgeRequest<I>::finish(int r) {
+  if (m_ret_val < 0) {
+    r = m_ret_val;
+  }
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+template <typename I>
+// Begin an exclusive-lock-tracked op; returns a completion context (a no-op
+// if the lock feature is disabled) or nullptr with *r set if the lock was
+// lost.
+Context *SnapshotPurgeRequest<I>::start_lock_op(int* r) {
+  std::shared_lock owner_locker{m_image_ctx->owner_lock};
+  if (m_image_ctx->exclusive_lock == nullptr) {
+    return new LambdaContext([](int r) {});
+  }
+  return m_image_ctx->exclusive_lock->start_op(r);
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::SnapshotPurgeRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h
new file mode 100644
index 000000000..70cae8518
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+#include <vector>
+
+class Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+// Async state machine that removes every snapshot of an image (unprotecting
+// first where needed) prior to image deletion.  Self-deleting: finish()
+// completes on_finish and destroys the request.
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotPurgeRequest {
+public:
+  static SnapshotPurgeRequest* create(librados::IoCtx &io_ctx,
+                                      const std::string &image_id,
+                                      Context *on_finish) {
+    return new SnapshotPurgeRequest(io_ctx, image_id, on_finish);
+  }
+
+  SnapshotPurgeRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+                       Context *on_finish)
+    : m_io_ctx(io_ctx), m_image_id(image_id), m_on_finish(on_finish) {
+  }
+
+  void send();
+
+private:
+  /*
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * OPEN_IMAGE
+   *    |
+   *    v
+   * ACQUIRE_LOCK
+   *    |
+   *    | (repeat for each snapshot)
+   *    |/------------------------\
+   *    |                         |
+   *    v (skip if not needed)    |
+   * SNAP_UNPROTECT               |
+   *    |                         |
+   *    v (skip if not needed)    |
+   * SNAP_REMOVE -----------------/
+   *    |
+   *    v
+   * CLOSE_IMAGE
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_io_ctx;
+  std::string m_image_id;
+  Context *m_on_finish;
+
+  ImageCtxT *m_image_ctx = nullptr;
+  int m_ret_val = 0;  // first error encountered; reported by finish()
+
+  // snapshots to purge, processed back-to-front
+  std::vector<librados::snap_t> m_snaps;
+  cls::rbd::SnapshotNamespace m_snap_namespace;
+  std::string m_snap_name;
+
+  void open_image();
+  void handle_open_image(int r);
+
+  void acquire_lock();
+  void handle_acquire_lock(int r);
+
+  void start_snap_unprotect();
+  void snap_unprotect();
+  void handle_snap_unprotect(int r);
+
+  void snap_remove();
+  void handle_snap_remove(int r);
+
+  void close_image();
+  void handle_close_image(int r);
+
+  void finish(int r);
+
+  Context *start_lock_op(int* r);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::SnapshotPurgeRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H
+
diff --git a/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc
new file mode 100644
index 000000000..e53923ef3
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc
@@ -0,0 +1,419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/TrashWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/ResetRequest.h"
+#include "librbd/mirror/ImageRemoveRequest.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "librbd/trash/MoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashMoveRequest: " \
+ << this << " " << __func__ << ": "
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+// Entry point: begin the move-to-trash state machine (see header diagram).
+void TrashMoveRequest<I>::send() {
+  get_mirror_image_id();
+}
+
+template <typename I>
+// Resolve the pool-local image id from the mirroring global image id via the
+// RBD_MIRRORING object.
+void TrashMoveRequest<I>::get_mirror_image_id() {
+  dout(10) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
+
+  auto aio_comp = create_rados_callback<
+    TrashMoveRequest<I>,
+    &TrashMoveRequest<I>::handle_get_mirror_image_id>(this);
+  m_out_bl.clear();
+  int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+// Decode the image-id lookup; -ENOENT means the image isn't mirrored and the
+// request finishes early.
+void TrashMoveRequest<I>::handle_get_mirror_image_id(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = librbd::cls_client::mirror_image_get_image_id_finish(&bl_it,
+                                                             &m_image_id);
+  }
+  if (r == -ENOENT) {
+    dout(10) << "image " << m_global_image_id << " is not mirrored" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "error retrieving local id for image " << m_global_image_id << ": "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  get_mirror_info();
+}
+
+template <typename I>
+// Fetch the image's mirror record, promotion state and primary mirror uuid.
+void TrashMoveRequest<I>::get_mirror_info() {
+  dout(10) << dendl;
+
+  auto ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_get_mirror_info>(this);
+  auto req = librbd::mirror::GetInfoRequest<I>::create(
+    m_io_ctx, m_op_work_queue, m_image_id, &m_mirror_image, &m_promotion_state,
+    &m_primary_mirror_uuid, ctx);
+  req->send();
+}
+
+template <typename I>
+// Reject deletion of primary images (-EPERM) and of orphaned images unless a
+// resync was requested; otherwise proceed to disable mirroring.
+void TrashMoveRequest<I>::handle_get_mirror_info(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    dout(5) << "image " << m_global_image_id << " is not mirrored" << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "error retrieving image primary info for image "
+         << m_global_image_id << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_promotion_state == librbd::mirror::PROMOTION_STATE_PRIMARY) {
+    dout(10) << "image " << m_global_image_id << " is local primary" << dendl;
+    finish(-EPERM);
+    return;
+  } else if (m_promotion_state == librbd::mirror::PROMOTION_STATE_ORPHAN &&
+             !m_resync) {
+    dout(10) << "image " << m_global_image_id << " is orphaned" << dendl;
+    finish(-EPERM);
+    return;
+  }
+
+  disable_mirror_image();
+}
+
+template <typename I>
+// Flag the mirror record as DISABLING so no replayer picks the image up
+// while it is being moved to the trash.
+void TrashMoveRequest<I>::disable_mirror_image() {
+  dout(10) << dendl;
+
+  m_mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
+
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::mirror_image_set(&op, m_image_id, m_mirror_image);
+
+  auto aio_comp = create_rados_callback<
+    TrashMoveRequest<I>,
+    &TrashMoveRequest<I>::handle_disable_mirror_image>(this);
+  int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+template <typename I>
+// Completion of the DISABLING update; -EEXIST/-EINVAL indicate the
+// global image id was reused by another image, aborting the deletion.
+void TrashMoveRequest<I>::handle_disable_mirror_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    dout(10) << "local image is not mirrored, aborting deletion." << dendl;
+    finish(r);
+    return;
+  } else if (r == -EEXIST || r == -EINVAL) {
+    derr << "cannot disable mirroring for image " << m_global_image_id
+         << ": global_image_id has changed/reused: "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "cannot disable mirroring for image " << m_global_image_id
+         << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  open_image();
+}
+
+template <typename I>
+// Open the image with the non-primary read-only flag masked and a journal
+// policy that prevents journal open.
+void TrashMoveRequest<I>::open_image() {
+  dout(10) << dendl;
+
+  m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, false);
+
+  // ensure non-primary images can be modified
+  m_image_ctx->read_only_mask &= ~librbd::IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+
+  {
+    // don't attempt to open the journal
+    std::unique_lock image_locker{m_image_ctx->image_lock};
+    m_image_ctx->set_journal_policy(new JournalPolicy());
+  }
+
+  Context *ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_open_image>(this);
+  m_image_ctx->state->open(librbd::OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template <typename I>
+// Completion of open(): missing image -> clean up the orphaned mirror
+// record; v1 images cannot be trashed (-EINVAL); otherwise reset journal.
+void TrashMoveRequest<I>::handle_open_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    dout(5) << "mirror image does not exist, removing orphaned metadata" << dendl;
+    m_image_ctx = nullptr;
+    remove_mirror_image();
+    return;
+  }
+
+  if (r < 0) {
+    derr << "failed to open image: " << cpp_strerror(r) << dendl;
+    m_image_ctx = nullptr;
+    finish(r);
+    return;
+  }
+
+  if (m_image_ctx->old_format) {
+    derr << "cannot move v1 image to trash" << dendl;
+    m_ret_val = -EINVAL;
+    close_image();
+    return;
+  }
+
+  reset_journal();
+}
+
+template <typename I>
+// For journal-based mirroring, reset the journal so that if the image is
+// ever recovered from the trash any peers will detect a split-brain;
+// snapshot-based mirroring has no journal and skips straight to the lock.
+void TrashMoveRequest<I>::reset_journal() {
+  if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+    // snapshot-based mirroring doesn't require journal feature
+    acquire_lock();
+    return;
+  }
+
+  dout(10) << dendl;
+
+  // TODO use Journal thread pool for journal ops until converted to ASIO
+  ContextWQ* context_wq;
+  librbd::Journal<>::get_work_queue(
+    reinterpret_cast<CephContext*>(m_io_ctx.cct()), &context_wq);
+
+  // ensure that if the image is recovered any peers will split-brain
+  auto ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_reset_journal>(this);
+  auto req = librbd::journal::ResetRequest<I>::create(
+    m_io_ctx, m_image_id, librbd::Journal<>::IMAGE_CLIENT_ID,
+    librbd::Journal<>::LOCAL_MIRROR_UUID, context_wq, ctx);
+  req->send();
+}
+
+template <typename I>
+// Completion of journal reset; a missing journal (-ENOENT) is fine.
+void TrashMoveRequest<I>::handle_reset_journal(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    derr << "failed to reset journal: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  acquire_lock();
+}
+
+template <typename I>
+// Take the exclusive lock (blocking further lock requests) before the trash
+// move.  Journal-based mirroring requires the lock feature; snapshot-based
+// mirroring can proceed without it.
+void TrashMoveRequest<I>::acquire_lock() {
+  m_image_ctx->owner_lock.lock_shared();
+  if (m_image_ctx->exclusive_lock == nullptr) {
+    m_image_ctx->owner_lock.unlock_shared();
+
+    if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+      // snapshot-based mirroring doesn't require exclusive-lock
+      trash_move();
+    } else {
+      derr << "exclusive lock feature not enabled" << dendl;
+      m_ret_val = -EINVAL;
+      close_image();
+    }
+    return;
+  }
+
+  dout(10) << dendl;
+
+  Context *ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_acquire_lock>(this);
+  m_image_ctx->exclusive_lock->block_requests(0);
+  m_image_ctx->exclusive_lock->acquire_lock(ctx);
+  m_image_ctx->owner_lock.unlock_shared();
+}
+
+template <typename I>
+// Completion of exclusive-lock acquisition.
+void TrashMoveRequest<I>::handle_acquire_lock(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to acquire exclusive lock: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  trash_move();
+}
+
+template <typename I>
+// Move the image into the trash with a deferment window taken from the
+// "rbd_mirroring_delete_delay" config option.
+void TrashMoveRequest<I>::trash_move() {
+  dout(10) << dendl;
+
+  utime_t delete_time{ceph_clock_now()};
+  utime_t deferment_end_time{delete_time};
+  deferment_end_time +=
+    m_image_ctx->config.template get_val<uint64_t>("rbd_mirroring_delete_delay");
+
+  m_trash_image_spec = {
+    cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING, m_image_ctx->name, delete_time,
+    deferment_end_time};
+
+  Context *ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_trash_move>(this);
+  auto req = librbd::trash::MoveRequest<I>::create(
+    m_io_ctx, m_image_id, m_trash_image_spec, ctx);
+  req->send();
+}
+
+template <typename I>
+// Completion of the trash move; record success so the trash-added
+// notification is sent after the image closes.
+void TrashMoveRequest<I>::handle_trash_move(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to move image to trash: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_image();
+    return;
+  }
+
+  m_moved_to_trash = true;
+  remove_mirror_image();
+}
+
+template <typename I>
+// Delete the image's mirror record now that it lives in the trash.
+void TrashMoveRequest<I>::remove_mirror_image() {
+  dout(10) << dendl;
+
+  auto ctx = create_context_callback<
+    TrashMoveRequest<I>,
+    &TrashMoveRequest<I>::handle_remove_mirror_image>(this);
+  auto req = librbd::mirror::ImageRemoveRequest<I>::create(
+    m_io_ctx, m_global_image_id, m_image_id, ctx);
+  req->send();
+}
+
+template <typename I>
+// Completion of mirror-record removal; -ENOENT is benign, other errors are
+// recorded but the image is still closed.
+void TrashMoveRequest<I>::handle_remove_mirror_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    dout(10) << "local image is not mirrored" << dendl;
+  } else if (r < 0) {
+    derr << "failed to remove mirror image state for " << m_global_image_id
+         << ": " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+  }
+
+  close_image();
+}
+
+template <typename I>
+// Close the image; a null m_image_ctx (image never opened) short-circuits.
+void TrashMoveRequest<I>::close_image() {
+  dout(10) << dendl;
+
+  if (m_image_ctx == nullptr) {
+    handle_close_image(0);
+    return;
+  }
+  Context *ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_close_image>(this);
+  m_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+// Completion of close: only notify trash watchers when the move actually
+// happened; close errors are logged but do not fail the request.
+void TrashMoveRequest<I>::handle_close_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  m_image_ctx = nullptr;
+
+  if (r < 0) {
+    derr << "failed to close image: " << cpp_strerror(r) << dendl;
+  }
+
+  // don't send notification if we failed
+  if (!m_moved_to_trash) {
+    finish(0);
+    return;
+  }
+
+  notify_trash_add();
+}
+
+template <typename I>
+// Broadcast the trash-added event so trash watchers learn of the new entry.
+void TrashMoveRequest<I>::notify_trash_add() {
+  dout(10) << dendl;
+
+  Context *ctx = create_context_callback<
+    TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_notify_trash_add>(this);
+  librbd::TrashWatcher<I>::notify_image_added(m_io_ctx, m_image_id,
+                                              m_trash_image_spec, ctx);
+}
+
+template <typename I>
+// Completion of the trash notification; best-effort, errors only logged.
+void TrashMoveRequest<I>::handle_notify_trash_add(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to notify trash watchers: " << cpp_strerror(r) << dendl;
+  }
+
+  finish(0);
+}
+
+template <typename I>
+// Complete the user callback (first recorded error wins) and self-destruct.
+void TrashMoveRequest<I>::finish(int r) {
+  if (m_ret_val < 0) {
+    r = m_ret_val;
+  }
+
+  dout(10) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::TrashMoveRequest<librbd::ImageCtx>;
+
diff --git a/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h
new file mode 100644
index 000000000..5b3f02519
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+struct Context;
+namespace librbd {
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+// Asynchronous state machine that disables mirroring on a local image and
+// moves it to the RBD trash so the image deleter can remove it later.
+// The request is self-deleting: it frees itself after completing on_finish.
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashMoveRequest {
+public:
+  // Factory helper mirroring the constructor signature.
+  static TrashMoveRequest* create(librados::IoCtx& io_ctx,
+                                  const std::string& global_image_id,
+                                  bool resync,
+                                  librbd::asio::ContextWQ* op_work_queue,
+                                  Context* on_finish) {
+    return new TrashMoveRequest(io_ctx, global_image_id, resync, op_work_queue,
+                                on_finish);
+  }
+
+  TrashMoveRequest(librados::IoCtx& io_ctx, const std::string& global_image_id,
+                   bool resync, librbd::asio::ContextWQ* op_work_queue,
+                   Context* on_finish)
+    : m_io_ctx(io_ctx), m_global_image_id(global_image_id), m_resync(resync),
+      m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+  }
+
+  void send();
+
+private:
+  /*
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_MIRROR_IMAGE_ID
+   *    |
+   *    v
+   * GET_MIRROR_INFO
+   *    |
+   *    v
+   * DISABLE_MIRROR_IMAGE
+   *    |
+   *    v
+   * OPEN_IMAGE
+   *    |
+   *    v (skip if not needed)
+   * RESET_JOURNAL
+   *    |
+   *    v (skip if not needed)
+   * ACQUIRE_LOCK
+   *    |
+   *    v
+   * TRASH_MOVE
+   *    |
+   *    v
+   * REMOVE_MIRROR_IMAGE
+   *    |
+   *    v
+   * CLOSE_IMAGE
+   *    |
+   *    v
+   * NOTIFY_TRASH_ADD
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_io_ctx;
+  std::string m_global_image_id;
+  bool m_resync;
+  librbd::asio::ContextWQ *m_op_work_queue;
+  Context *m_on_finish;
+
+  ceph::bufferlist m_out_bl;
+  std::string m_image_id;
+  cls::rbd::MirrorImage m_mirror_image;
+  librbd::mirror::PromotionState m_promotion_state;
+  std::string m_primary_mirror_uuid;
+  cls::rbd::TrashImageSpec m_trash_image_spec;
+  // Local image context; non-null only between OPEN_IMAGE and CLOSE_IMAGE.
+  ImageCtxT *m_image_ctx = nullptr;
+  // First error encountered; overrides the final step's result in finish().
+  int m_ret_val = 0;
+  // Set once TRASH_MOVE succeeds; gates the NOTIFY_TRASH_ADD step.
+  bool m_moved_to_trash = false;
+
+  void get_mirror_image_id();
+  void handle_get_mirror_image_id(int r);
+
+  void get_mirror_info();
+  void handle_get_mirror_info(int r);
+
+  void disable_mirror_image();
+  void handle_disable_mirror_image(int r);
+
+  void open_image();
+  void handle_open_image(int r);
+
+  void reset_journal();
+  void handle_reset_journal(int r);
+
+  void acquire_lock();
+  void handle_acquire_lock(int r);
+
+  void trash_move();
+  void handle_trash_move(int r);
+
+  void remove_mirror_image();
+  void handle_remove_mirror_image(int r);
+
+  void close_image();
+  void handle_close_image(int r);
+
+  void notify_trash_add();
+  void handle_notify_trash_add(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::TrashMoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc
new file mode 100644
index 000000000..4d7c1c9df
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h"
+#include "include/ceph_assert.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/TrashWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/trash/RemoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashRemoveRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+// Entry point: default the caller-visible error policy to RETRY, then look
+// up the image's trash record.
+template <typename I>
+void TrashRemoveRequest<I>::send() {
+  *m_error_result = ERROR_RESULT_RETRY;
+
+  get_trash_image_spec();
+}
+
+// Asynchronously read the image's TrashImageSpec from the RBD_TRASH object.
+template <typename I>
+void TrashRemoveRequest<I>::get_trash_image_spec() {
+  dout(10) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::trash_get_start(&op, m_image_id);
+
+  auto aio_comp = create_rados_callback<
+    TrashRemoveRequest<I>,
+    &TrashRemoveRequest<I>::handle_get_trash_image_spec>(this);
+  m_out_bl.clear();
+  int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Validate the trash record: only images placed in the trash by mirroring,
+// in NORMAL or REMOVING state, are eligible for removal here.
+template <typename I>
+void TrashRemoveRequest<I>::handle_get_trash_image_spec(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = librbd::cls_client::trash_get_finish(&bl_it, &m_trash_image_spec);
+  }
+
+  // missing record or non-mirroring source: nothing for us to do
+  if (r == -ENOENT || (r >= 0 && m_trash_image_spec.source !=
+                         cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING)) {
+    dout(10) << "image id " << m_image_id << " not in mirroring trash" << dendl;
+    finish(0);
+    return;
+  } else if (r < 0) {
+    derr << "error getting image id " << m_image_id << " info from trash: "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // e.g. a restore is in progress -- ask the caller to retry immediately
+  if (m_trash_image_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL &&
+      m_trash_image_spec.state != cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+    dout(10) << "image " << m_image_id << " is not in an expected trash state: "
+             << m_trash_image_spec.state << dendl;
+    *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY;
+    finish(-EBUSY);
+    return;
+  }
+
+  set_trash_state();
+}
+
+// Transition the trash record from NORMAL to REMOVING; skipped when a prior
+// (interrupted) attempt already left it in the REMOVING state.
+template <typename I>
+void TrashRemoveRequest<I>::set_trash_state() {
+  if (m_trash_image_spec.state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+    get_snap_context();
+    return;
+  }
+
+  dout(10) << dendl;
+
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::trash_state_set(&op, m_image_id,
+                                      cls::rbd::TRASH_IMAGE_STATE_REMOVING,
+                                      cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+
+  auto aio_comp = create_rados_callback<
+    TrashRemoveRequest<I>,
+    &TrashRemoveRequest<I>::handle_set_trash_state>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// -ENOENT means the record vanished underneath us (nothing to remove);
+// -EOPNOTSUPP is tolerated for older OSDs that lack trash_state_set.
+template <typename I>
+void TrashRemoveRequest<I>::handle_set_trash_state(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -ENOENT) {
+    dout(10) << "image id " << m_image_id << " not in mirroring trash" << dendl;
+    finish(0);
+    return;
+  } else if (r < 0 && r != -EOPNOTSUPP) {
+    derr << "error setting trash image state for image id " << m_image_id
+         << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  get_snap_context();
+}
+
+// Read the image header's snapshot context to learn whether any snapshots
+// must be purged before the image can be removed.
+template <typename I>
+void TrashRemoveRequest<I>::get_snap_context() {
+  dout(10) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::get_snapcontext_start(&op);
+
+  std::string header_oid = librbd::util::header_name(m_image_id);
+
+  auto aio_comp = create_rados_callback<
+    TrashRemoveRequest<I>,
+    &TrashRemoveRequest<I>::handle_get_snap_context>(this);
+  m_out_bl.clear();
+  int r = m_io_ctx.aio_operate(header_oid, aio_comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// -ENOENT (missing header) is treated as "no snapshots"; any other error
+// aborts the request.
+template <typename I>
+void TrashRemoveRequest<I>::handle_get_snap_context(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  ::SnapContext snapc;
+  if (r == 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = librbd::cls_client::get_snapcontext_finish(&bl_it, &snapc);
+  }
+  if (r < 0 && r != -ENOENT) {
+    derr << "error retrieving snapshot context for image "
+         << m_image_id << ": " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  m_has_snapshots = (!snapc.empty());
+  purge_snapshots();
+}
+
+// Delete all snapshots via SnapshotPurgeRequest; skipped when the snapshot
+// context was empty.
+template <typename I>
+void TrashRemoveRequest<I>::purge_snapshots() {
+  if (!m_has_snapshots) {
+    remove_image();
+    return;
+  }
+
+  dout(10) << dendl;
+  auto ctx = create_context_callback<
+    TrashRemoveRequest<I>,
+    &TrashRemoveRequest<I>::handle_purge_snapshots>(this);
+  auto req = SnapshotPurgeRequest<I>::create(m_io_ctx, m_image_id, ctx);
+  req->send();
+}
+
+// -EBUSY indicates snapshots are still in use (e.g. protected clones);
+// report RETRY_IMMEDIATELY so the deleter re-queues the image promptly.
+template <typename I>
+void TrashRemoveRequest<I>::handle_purge_snapshots(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -EBUSY) {
+    dout(10) << "snapshots still in-use" << dendl;
+    *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY;
+    finish(r);
+    return;
+  } else if (r < 0) {
+    derr << "failed to purge image snapshots: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  remove_image();
+}
+
+// Remove the image's data and header objects via librbd's trash
+// RemoveRequest (force=true since mirroring owns this trash entry).
+template <typename I>
+void TrashRemoveRequest<I>::remove_image() {
+  dout(10) << dendl;
+
+  auto ctx = create_context_callback<
+    TrashRemoveRequest<I>,
+    &TrashRemoveRequest<I>::handle_remove_image>(this);
+  auto req = librbd::trash::RemoveRequest<I>::create(
+    m_io_ctx, m_image_id, m_op_work_queue, true, m_progress_ctx,
+    ctx);
+  req->send();
+}
+
+// -ENOTEMPTY (clone v2 child still references a snapshot) is mapped to
+// -EBUSY with an immediate-retry policy; -ENOENT is tolerated since the
+// image may already be gone.
+template <typename I>
+void TrashRemoveRequest<I>::handle_remove_image(int r) {
+  dout(10) << "r=" << r << dendl;
+  if (r == -ENOTEMPTY) {
+    // image must have clone v2 snapshot still associated to child
+    dout(10) << "snapshots still in-use" << dendl;
+    *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY;
+    finish(-EBUSY);
+    return;
+  }
+
+  if (r < 0 && r != -ENOENT) {
+    // NOTE(review): m_image_id is printed twice here -- the first operand
+    // was presumably meant to be the image name; confirm against upstream.
+    derr << "error removing image " << m_image_id << " "
+         << "(" << m_image_id << ") from local pool: "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  notify_trash_removed();
+}
+
+// Tell trash watchers the entry is gone so their caches stay in sync.
+template <typename I>
+void TrashRemoveRequest<I>::notify_trash_removed() {
+  dout(10) << dendl;
+
+  Context *ctx = create_context_callback<
+    TrashRemoveRequest<I>,
+    &TrashRemoveRequest<I>::handle_notify_trash_removed>(this);
+  librbd::TrashWatcher<I>::notify_image_removed(m_io_ctx, m_image_id, ctx);
+}
+
+// Notification failures are best-effort: log and complete with success.
+template <typename I>
+void TrashRemoveRequest<I>::handle_notify_trash_removed(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to notify trash watchers: " << cpp_strerror(r) << dendl;
+  }
+
+  finish(0);
+}
+
+// Complete the caller's context and destroy this self-managed request.
+template <typename I>
+void TrashRemoveRequest<I>::finish(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::TrashRemoveRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h
new file mode 100644
index 000000000..b99736b33
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/internal.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+#include <string>
+#include <vector>
+
+class Context;
+class ContextWQ;
+namespace librbd {
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+// Asynchronous state machine that permanently removes a mirrored image from
+// the RBD trash: it validates the trash record, purges snapshots and deletes
+// the image objects.  Self-deleting after completing on_finish.
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashRemoveRequest {
+public:
+  // Factory helper mirroring the constructor signature.
+  static TrashRemoveRequest* create(librados::IoCtx &io_ctx,
+                                    const std::string &image_id,
+                                    ErrorResult *error_result,
+                                    librbd::asio::ContextWQ *op_work_queue,
+                                    Context *on_finish) {
+    return new TrashRemoveRequest(io_ctx, image_id, error_result, op_work_queue,
+                                  on_finish);
+  }
+
+  TrashRemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+                     ErrorResult *error_result,
+                     librbd::asio::ContextWQ *op_work_queue,
+                     Context *on_finish)
+    : m_io_ctx(io_ctx), m_image_id(image_id), m_error_result(error_result),
+      m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+  }
+
+  void send();
+
+private:
+  /*
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_TRASH_IMAGE_SPEC
+   *    |
+   *    v
+   * SET_TRASH_STATE
+   *    |
+   *    v
+   * GET_SNAP_CONTEXT
+   *    |
+   *    v
+   * PURGE_SNAPSHOTS
+   *    |
+   *    v
+   * TRASH_REMOVE
+   *    |
+   *    v
+   * NOTIFY_TRASH_REMOVE
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_io_ctx;
+  std::string m_image_id;
+  // Out-param telling the image deleter how to handle a failure (retry
+  // policy); written before the state machine starts and on errors.
+  ErrorResult *m_error_result;
+  librbd::asio::ContextWQ *m_op_work_queue;
+  Context *m_on_finish;
+
+  ceph::bufferlist m_out_bl;
+  cls::rbd::TrashImageSpec m_trash_image_spec;
+  bool m_has_snapshots = false;
+  librbd::NoOpProgressContext m_progress_ctx;
+
+  void get_trash_image_spec();
+  void handle_get_trash_image_spec(int r);
+
+  void set_trash_state();
+  void handle_set_trash_state(int r);
+
+  void get_snap_context();
+  void handle_get_snap_context(int r);
+
+  void purge_snapshots();
+  void handle_purge_snapshots(int r);
+
+  void remove_image();
+  void handle_remove_image(int r);
+
+  void notify_trash_removed();
+  void handle_notify_trash_removed(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::TrashRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc b/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc
new file mode 100644
index 000000000..552d77e0e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/TrashWatcher.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashWatcher: " \
+ << this << " " << __func__ << ": "
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+namespace {
+
+const size_t MAX_RETURN = 1024;
+
+} // anonymous namespace
+
+// Construct the watcher on the pool's trash directory; the per-instance
+// mutex gets a unique name to aid lock-dependency debugging.
+template <typename I>
+TrashWatcher<I>::TrashWatcher(librados::IoCtx &io_ctx, Threads<I> *threads,
+                              TrashListener& trash_listener)
+  : librbd::TrashWatcher<I>(io_ctx, threads->work_queue),
+    m_io_ctx(io_ctx), m_threads(threads), m_trash_listener(trash_listener),
+    m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+      "rbd::mirror::image_deleter::TrashWatcher", this))) {
+}
+
+// Begin initialization: ensure the trash directory object exists, register
+// the watch and perform the initial listing.  on_finish fires once the
+// first full listing completes (or fails terminally).
+template <typename I>
+void TrashWatcher<I>::init(Context *on_finish) {
+  dout(5) << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    m_on_init_finish = on_finish;
+
+    ceph_assert(!m_trash_list_in_progress);
+    m_trash_list_in_progress = true;
+  }
+
+  create_trash();
+}
+
+// Stop the watcher: cancel any pending re-list timer, wait for in-flight
+// async ops, then unregister the watch.
+template <typename I>
+void TrashWatcher<I>::shut_down(Context *on_finish) {
+  dout(5) << dendl;
+
+  {
+    std::scoped_lock locker{m_threads->timer_lock, m_lock};
+
+    ceph_assert(!m_shutting_down);
+    m_shutting_down = true;
+    if (m_timer_ctx != nullptr) {
+      m_threads->timer->cancel_event(m_timer_ctx);
+      m_timer_ctx = nullptr;
+    }
+  }
+
+  auto ctx = new LambdaContext([this, on_finish](int r) {
+      unregister_watcher(on_finish);
+    });
+  m_async_op_tracker.wait_for_ops(ctx);
+}
+
+// Watch notification: a new image appeared in the trash directory.
+template <typename I>
+void TrashWatcher<I>::handle_image_added(const std::string &image_id,
+                                         const cls::rbd::TrashImageSpec& spec) {
+  dout(10) << "image_id=" << image_id << dendl;
+
+  std::lock_guard locker{m_lock};
+  add_image(image_id, spec);
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_image_removed(const std::string &image_id) {
+  // ignore removals -- the image deleter will ignore -ENOENTs
+}
+
+// Watch re-registration result: schedule a full re-list to catch any
+// notifications missed while the watch was down; blocklisting is terminal.
+template <typename I>
+void TrashWatcher<I>::handle_rewatch_complete(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  if (r == -EBLOCKLISTED) {
+    dout(0) << "detected client is blocklisted" << dendl;
+    return;
+  } else if (r == -ENOENT) {
+    dout(5) << "trash directory deleted" << dendl;
+  } else if (r < 0) {
+    derr << "unexpected error re-registering trash directory watch: "
+         << cpp_strerror(r) << dendl;
+  }
+  schedule_trash_list(30);
+}
+
+// Idempotently create the RBD_TRASH directory object (create(false) treats
+// -EEXIST as success) so the watch has an object to attach to.
+template <typename I>
+void TrashWatcher<I>::create_trash() {
+  dout(20) << dendl;
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_trash_list_in_progress);
+  }
+
+  librados::ObjectWriteOperation op;
+  op.create(false);
+
+  m_async_op_tracker.start_op();
+  auto aio_comp = create_rados_callback<
+    TrashWatcher<I>, &TrashWatcher<I>::handle_create_trash>(this);
+  int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Blocklisting / pool deletion terminate initialization; transient errors
+// schedule a retry; otherwise proceed to watch registration.
+template <typename I>
+void TrashWatcher<I>::handle_create_trash(int r) {
+  dout(20) << "r=" << r << dendl;
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_trash_list_in_progress);
+  }
+
+  Context* on_init_finish = nullptr;
+  if (r == -EBLOCKLISTED || r == -ENOENT) {
+    if (r == -EBLOCKLISTED) {
+      dout(0) << "detected client is blocklisted" << dendl;
+    } else {
+      dout(0) << "detected pool no longer exists" << dendl;
+    }
+
+    std::lock_guard locker{m_lock};
+    std::swap(on_init_finish, m_on_init_finish);
+    m_trash_list_in_progress = false;
+  } else if (r < 0 && r != -EEXIST) {
+    derr << "failed to create trash object: " << cpp_strerror(r) << dendl;
+    {
+      std::lock_guard locker{m_lock};
+      m_trash_list_in_progress = false;
+    }
+
+    schedule_trash_list(30);
+  } else {
+    register_watcher();
+  }
+
+  m_async_op_tracker.finish_op();
+  if (on_init_finish != nullptr) {
+    on_init_finish->complete(r);
+  }
+}
+
+// (Re-)register the watch on the trash directory; if a watch is already
+// established (or being re-established by the base class) go straight to
+// the initial listing.
+template <typename I>
+void TrashWatcher<I>::register_watcher() {
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_trash_list_in_progress);
+  }
+
+  // if the watch registration is in-flight, let the watcher
+  // handle the transition -- only (re-)register if it's not registered
+  if (!this->is_unregistered()) {
+    trash_list(true);
+    return;
+  }
+
+  // first time registering or the watch failed
+  dout(5) << dendl;
+  m_async_op_tracker.start_op();
+
+  Context *ctx = create_context_callback<
+    TrashWatcher, &TrashWatcher<I>::handle_register_watcher>(this);
+  this->register_watch(ctx);
+}
+
+// On success begin listing; blocklisting completes init with the error;
+// other failures schedule a retry of the whole create/register sequence.
+template <typename I>
+void TrashWatcher<I>::handle_register_watcher(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_trash_list_in_progress);
+    if (r < 0) {
+      m_trash_list_in_progress = false;
+    }
+  }
+
+  Context *on_init_finish = nullptr;
+  if (r >= 0) {
+    trash_list(true);
+  } else if (r == -EBLOCKLISTED) {
+    dout(0) << "detected client is blocklisted" << dendl;
+
+    std::lock_guard locker{m_lock};
+    std::swap(on_init_finish, m_on_init_finish);
+  } else {
+    derr << "unexpected error registering trash directory watch: "
+         << cpp_strerror(r) << dendl;
+    schedule_trash_list(10);
+  }
+
+  m_async_op_tracker.finish_op();
+  if (on_init_finish != nullptr) {
+    on_init_finish->complete(r);
+  }
+}
+
+// Tear down the watch during shut_down().
+template <typename I>
+void TrashWatcher<I>::unregister_watcher(Context* on_finish) {
+  dout(5) << dendl;
+
+  m_async_op_tracker.start_op();
+  Context *ctx = new LambdaContext([this, on_finish](int r) {
+      handle_unregister_watcher(r, on_finish);
+    });
+  this->unregister_watch(ctx);
+}
+
+// Unregister errors are logged only; shutdown always completes with 0.
+template <typename I>
+void TrashWatcher<I>::handle_unregister_watcher(int r, Context* on_finish) {
+  dout(5) << "unregister_watcher: r=" << r << dendl;
+  if (r < 0) {
+    derr << "error unregistering watcher for trash directory: "
+         << cpp_strerror(r) << dendl;
+  }
+  m_async_op_tracker.finish_op();
+  on_finish->complete(0);
+}
+
+// List the trash directory in pages of MAX_RETURN entries, continuing from
+// m_last_image_id.  initial_request resets the cursor and pins an async op
+// for the duration of the paged listing.
+template <typename I>
+void TrashWatcher<I>::trash_list(bool initial_request) {
+  if (initial_request) {
+    m_async_op_tracker.start_op();
+    m_last_image_id = "";
+  }
+
+  dout(5) << "last_image_id=" << m_last_image_id << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_trash_list_in_progress);
+  }
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::trash_list_start(&op, m_last_image_id, MAX_RETURN);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    TrashWatcher<I>, &TrashWatcher<I>::handle_trash_list>(this);
+  m_out_bl.clear();
+  int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Process one page of trash entries: forward each to add_image(), request
+// the next page while a full page was returned, and finish initialization
+// (or schedule a retry) on the final/failed page.
+template <typename I>
+void TrashWatcher<I>::handle_trash_list(int r) {
+  dout(5) << "r=" << r << dendl;
+
+  std::map<std::string, cls::rbd::TrashImageSpec> images;
+  if (r >= 0) {
+    auto bl_it = m_out_bl.cbegin();
+    r = librbd::cls_client::trash_list_finish(&bl_it, &images);
+  }
+
+  Context *on_init_finish = nullptr;
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_trash_list_in_progress);
+    if (r >= 0) {
+      for (auto& image : images) {
+        add_image(image.first, image.second);
+      }
+    } else if (r == -ENOENT) {
+      // trash directory missing is equivalent to an empty listing
+      r = 0;
+    }
+
+    if (r == -EBLOCKLISTED) {
+      dout(0) << "detected client is blocklisted during trash refresh" << dendl;
+      m_trash_list_in_progress = false;
+      std::swap(on_init_finish, m_on_init_finish);
+    } else if (r >= 0 && images.size() < MAX_RETURN) {
+      m_trash_list_in_progress = false;
+      std::swap(on_init_finish, m_on_init_finish);
+    } else if (r < 0) {
+      m_trash_list_in_progress = false;
+    }
+  }
+
+  if (r >= 0 && images.size() == MAX_RETURN) {
+    m_last_image_id = images.rbegin()->first;
+    trash_list(false);
+    return;
+  } else if (r < 0 && r != -EBLOCKLISTED) {
+    derr << "failed to retrieve trash directory: " << cpp_strerror(r) << dendl;
+    schedule_trash_list(10);
+  }
+
+  m_async_op_tracker.finish_op();
+  if (on_init_finish != nullptr) {
+    on_init_finish->complete(r);
+  }
+}
+
+// Schedule a deferred full re-list.  No-op while shutting down, while a
+// listing is running (it is merely flagged as deferred) or when a timer is
+// already pending.
+template <typename I>
+void TrashWatcher<I>::schedule_trash_list(double interval) {
+  std::scoped_lock locker{m_threads->timer_lock, m_lock};
+  if (m_shutting_down || m_trash_list_in_progress || m_timer_ctx != nullptr) {
+    if (m_trash_list_in_progress && !m_deferred_trash_list) {
+      dout(5) << "deferring refresh until in-flight refresh completes" << dendl;
+      m_deferred_trash_list = true;
+    }
+    return;
+  }
+
+  dout(5) << dendl;
+  m_timer_ctx = m_threads->timer->add_event_after(
+    interval,
+    new LambdaContext([this](int r) {
+        process_trash_list();
+      }));
+}
+
+// Timer callback (runs under timer_lock): mark a listing as in-progress and
+// re-run the create/register/list sequence off the timer thread.
+template <typename I>
+void TrashWatcher<I>::process_trash_list() {
+  dout(5) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked(m_threads->timer_lock));
+  ceph_assert(m_timer_ctx != nullptr);
+  m_timer_ctx = nullptr;
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(!m_trash_list_in_progress);
+    m_trash_list_in_progress = true;
+  }
+
+  // execute outside of the timer's lock
+  m_async_op_tracker.start_op();
+  Context *ctx = new LambdaContext([this](int r) {
+      create_trash();
+      m_async_op_tracker.finish_op();
+    });
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+// Forward a mirroring-sourced trash entry to the listener on the work
+// queue; entries from other sources (user, migration, ...) are ignored.
+// Caller must hold m_lock.
+template <typename I>
+void TrashWatcher<I>::add_image(const std::string& image_id,
+                                const cls::rbd::TrashImageSpec& spec) {
+  if (spec.source != cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING) {
+    return;
+  }
+
+  ceph_assert(ceph_mutex_is_locked(m_lock));
+  auto& deferment_end_time = spec.deferment_end_time;
+  dout(10) << "image_id=" << image_id << ", "
+           << "deferment_end_time=" << deferment_end_time << dendl;
+
+  m_async_op_tracker.start_op();
+  auto ctx = new LambdaContext([this, image_id, deferment_end_time](int r) {
+      m_trash_listener.handle_trash_image(image_id,
+                                          deferment_end_time.to_real_time());
+      m_async_op_tracker.finish_op();
+    });
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::TrashWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_deleter/TrashWatcher.h b/src/tools/rbd_mirror/image_deleter/TrashWatcher.h
new file mode 100644
index 000000000..e818a102c
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashWatcher.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_WATCHER_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_WATCHER_H
+
+#include "include/rados/librados.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "librbd/TrashWatcher.h"
+#include <set>
+#include <string>
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace image_deleter {
+
+struct TrashListener;
+
+// Watches the pool's RBD trash directory and forwards every trash entry
+// created by mirroring to the supplied TrashListener, combining watch
+// notifications with periodic paged re-listing for resilience.
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashWatcher : public librbd::TrashWatcher<ImageCtxT> {
+public:
+  // Factory helper; the caller owns the returned instance.
+  static TrashWatcher* create(librados::IoCtx &io_ctx,
+                              Threads<ImageCtxT> *threads,
+                              TrashListener& trash_listener) {
+    return new TrashWatcher(io_ctx, threads, trash_listener);
+  }
+
+  TrashWatcher(librados::IoCtx &io_ctx, Threads<ImageCtxT> *threads,
+               TrashListener& trash_listener);
+  TrashWatcher(const TrashWatcher&) = delete;
+  TrashWatcher& operator=(const TrashWatcher&) = delete;
+
+  // Create the trash object, register the watch and run the initial
+  // listing; on_finish completes when the first listing is done.
+  void init(Context *on_finish);
+  // Cancel timers, drain in-flight ops and unregister the watch.
+  void shut_down(Context *on_finish);
+
+protected:
+  void handle_image_added(const std::string &image_id,
+                          const cls::rbd::TrashImageSpec& spec) override;
+
+  void handle_image_removed(const std::string &image_id) override;
+
+  void handle_rewatch_complete(int r) override;
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   *  INIT
+   *    |
+   *    v
+   *  CREATE_TRASH
+   *    |
+   *    v
+   *  REGISTER_WATCHER
+   *    |
+   *    |/--------------------------------\
+   *    |                                 |
+   *    |/---------\                      |
+   *    |          |                      |
+   *    v          | (more images)        |
+   *  TRASH_LIST ---/                     |
+   *    |                                 |
+   *    |/----------------------------\   |
+   *    |                             |   |
+   *    v                             |   |
+   *  <idle> --\                      |   |
+   *    |      |                      |   |
+   *    |      |\---> IMAGE_ADDED -----/  |
+   *    |      |                          |
+   *    |      \----> WATCH_ERROR ---------/
+   *    v
+   *  SHUT_DOWN
+   *    |
+   *    v
+   *  UNREGISTER_WATCHER
+   *    |
+   *    v
+   *  <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx m_io_ctx;
+  Threads<ImageCtxT> *m_threads;
+  TrashListener& m_trash_listener;
+
+  // Pagination cursor for the trash listing.
+  std::string m_last_image_id;
+  bufferlist m_out_bl;
+
+  mutable ceph::mutex m_lock;
+
+  Context *m_on_init_finish = nullptr;
+  Context *m_timer_ctx = nullptr;
+
+  AsyncOpTracker m_async_op_tracker;
+  bool m_trash_list_in_progress = false;
+  bool m_deferred_trash_list = false;
+  bool m_shutting_down = false;
+
+  void register_watcher();
+  void handle_register_watcher(int r);
+
+  void create_trash();
+  void handle_create_trash(int r);
+
+  void unregister_watcher(Context* on_finish);
+  void handle_unregister_watcher(int r, Context* on_finish);
+
+  void trash_list(bool initial_request);
+  void handle_trash_list(int r);
+
+  void schedule_trash_list(double interval);
+  void process_trash_list();
+
+  void add_image(const std::string& image_id,
+                 const cls::rbd::TrashImageSpec& spec);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::TrashWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_WATCHER_H
diff --git a/src/tools/rbd_mirror/image_deleter/Types.h b/src/tools/rbd_mirror/image_deleter/Types.h
new file mode 100644
index 000000000..1c70b7e14
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/Types.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H
+
+#include "include/Context.h"
+#include "librbd/journal/Policy.h"
+#include <string>
+
+struct utime_t;
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+// Retry policy a deletion request reports back to the image deleter on
+// failure: give up, retry on the normal schedule, or retry right away.
+enum ErrorResult {
+  ERROR_RESULT_COMPLETE,
+  ERROR_RESULT_RETRY,
+  ERROR_RESULT_RETRY_IMMEDIATELY
+};
+
+// Callback interface through which the TrashWatcher reports trash entries
+// that are eligible for deferred deletion.
+struct TrashListener {
+  TrashListener() {
+  }
+  TrashListener(const TrashListener&) = delete;
+  TrashListener& operator=(const TrashListener&) = delete;
+
+  virtual ~TrashListener() {
+  }
+
+  // Invoked for each mirroring-sourced trash image; deferment_end_time is
+  // the earliest point at which the image may be permanently removed.
+  virtual void handle_trash_image(const std::string& image_id,
+                                  const ceph::real_clock::time_point& deferment_end_time) = 0;
+
+};
+
+// Journal policy that disables journaling entirely while the deleter has
+// an image open, so opening/closing doomed images leaves no journal state.
+struct JournalPolicy : public librbd::journal::Policy {
+  bool append_disabled() const override {
+    return true;
+  }
+  bool journal_disabled() const override {
+    return true;
+  }
+
+  // No tag allocation needed -- complete immediately with success.
+  void allocate_tag_on_lock(Context *on_finish) override {
+    on_finish->complete(0);
+  }
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H
diff --git a/src/tools/rbd_mirror/image_map/LoadRequest.cc b/src/tools/rbd_mirror/image_map/LoadRequest.cc
new file mode 100644
index 000000000..46564a160
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/LoadRequest.cc
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "librbd/Utils.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+#include "UpdateRequest.h"
+#include "LoadRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::LoadRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+static const uint32_t MAX_RETURN = 1024;
+
+using librbd::util::create_rados_callback;
+using librbd::util::create_context_callback;
+
+// Construct a request that loads the persisted image map into
+// *image_mapping, pruning entries for images no longer mirrored.
+template<typename I>
+LoadRequest<I>::LoadRequest(librados::IoCtx &ioctx,
+                            std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping,
+                            Context *on_finish)
+  : m_ioctx(ioctx),
+    m_image_mapping(image_mapping),
+    m_on_finish(on_finish) {
+}
+
+// Entry point: start paging through the stored image map.
+template<typename I>
+void LoadRequest<I>::send() {
+  dout(20) << dendl;
+
+  image_map_list();
+}
+
+// Fetch one page (MAX_RETURN entries) of the persisted image map from the
+// RBD_MIRROR_LEADER object, continuing after m_start_after.
+template<typename I>
+void LoadRequest<I>::image_map_list() {
+  dout(20) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::mirror_image_map_list_start(&op, m_start_after, MAX_RETURN);
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    LoadRequest, &LoadRequest::handle_image_map_list>(this);
+
+  m_out_bl.clear();
+  int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Accumulate the page into the caller's map; keep paging while full pages
+// are returned, then move on to listing currently mirrored images.
+template<typename I>
+void LoadRequest<I>::handle_image_map_list(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  std::map<std::string, cls::rbd::MirrorImageMap> image_mapping;
+  if (r == 0) {
+    auto it = m_out_bl.cbegin();
+    r = librbd::cls_client::mirror_image_map_list_finish(&it, &image_mapping);
+  }
+
+  if (r < 0) {
+    derr << ": failed to get image map: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  m_image_mapping->insert(image_mapping.begin(), image_mapping.end());
+
+  if (image_mapping.size() == MAX_RETURN) {
+    m_start_after = image_mapping.rbegin()->first;
+    image_map_list();
+    return;
+  }
+
+  mirror_image_list();
+}
+
+// Page through the RBD_MIRRORING directory to collect the set of global
+// image ids that are currently mirrored.  NOTE: m_start_after is reused
+// from the previous phase; it was left at a value below MAX_RETURN pages'
+// end there, so listing starts from that cursor -- presumably intended to
+// restart from "" (upstream behavior); verify against callers.
+template<typename I>
+void LoadRequest<I>::mirror_image_list() {
+  dout(20) << dendl;
+
+  librados::ObjectReadOperation op;
+  librbd::cls_client::mirror_image_list_start(&op, m_start_after, MAX_RETURN);
+
+  m_out_bl.clear();
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    LoadRequest<I>,
+    &LoadRequest<I>::handle_mirror_image_list>(this);
+  int r = m_ioctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Record each mirrored image's global id; -ENOENT (no mirroring object)
+// is treated as an empty listing.
+template<typename I>
+void LoadRequest<I>::handle_mirror_image_list(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  std::map<std::string, std::string> ids;
+  if (r == 0) {
+    auto it = m_out_bl.cbegin();
+    r = librbd::cls_client::mirror_image_list_finish(&it, &ids);
+  }
+
+  if (r < 0 && r != -ENOENT) {
+    derr << "failed to list mirrored images: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  for (auto &id : ids) {
+    m_global_image_ids.emplace(id.second);
+  }
+
+  if (ids.size() == MAX_RETURN) {
+    m_start_after = ids.rbegin()->first;
+    mirror_image_list();
+    return;
+  }
+
+  cleanup_image_map();
+}
+
+// Drop map entries whose global image id is no longer mirrored and persist
+// the removals via an UpdateRequest; finishes immediately when the map is
+// already consistent.
+template<typename I>
+void LoadRequest<I>::cleanup_image_map() {
+  dout(20) << dendl;
+
+  std::set<std::string> map_removals;
+
+  auto it = m_image_mapping->begin();
+  while (it != m_image_mapping->end()) {
+    if (m_global_image_ids.count(it->first) > 0) {
+      ++it;
+      continue;
+    }
+    map_removals.emplace(it->first);
+    it = m_image_mapping->erase(it);
+  }
+
+  if (map_removals.size() == 0) {
+    finish(0);
+    return;
+  }
+
+  auto ctx = create_context_callback<
+    LoadRequest<I>,
+    &LoadRequest<I>::finish>(this);
+  image_map::UpdateRequest<I> *req = image_map::UpdateRequest<I>::create(
+    m_ioctx, {}, std::move(map_removals), ctx);
+  req->send();
+}
+
+// Complete the caller's context and destroy this self-managed request.
+template<typename I>
+void LoadRequest<I>::finish(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_map::LoadRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_map/LoadRequest.h b/src/tools/rbd_mirror/image_map/LoadRequest.h
new file mode 100644
index 000000000..9b1be9685
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/LoadRequest.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+
+class Context;
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+template<typename ImageCtxT = librbd::ImageCtx>
+class LoadRequest {
+public:
+ static LoadRequest *create(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping,
+ Context *on_finish) {
+ return new LoadRequest(ioctx, image_mapping, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . . . . . . .
+ * v v . MAX_RETURN
+ * IMAGE_MAP_LIST. . . . . . .
+ * |
+ * v
+ * MIRROR_IMAGE_LIST
+ * |
+ * v
+ * CLEANUP_IMAGE_MAP
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ LoadRequest(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping,
+ Context *on_finish);
+
+ librados::IoCtx &m_ioctx;
+ std::map<std::string, cls::rbd::MirrorImageMap> *m_image_mapping;
+ Context *m_on_finish;
+
+ std::set<std::string> m_global_image_ids;
+
+ bufferlist m_out_bl;
+ std::string m_start_after;
+
+ void image_map_list();
+ void handle_image_map_list(int r);
+
+ void mirror_image_list();
+ void handle_mirror_image_list(int r);
+
+ void cleanup_image_map();
+
+ void finish(int r);
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_map/Policy.cc b/src/tools/rbd_mirror/image_map/Policy.cc
new file mode 100644
index 000000000..62fbd12dc
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Policy.cc
@@ -0,0 +1,407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "librbd/Utils.h"
+#include "Policy.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::Policy: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+namespace {
+
+bool is_instance_action(ActionType action_type) {
+ switch (action_type) {
+ case ACTION_TYPE_ACQUIRE:
+ case ACTION_TYPE_RELEASE:
+ return true;
+ case ACTION_TYPE_NONE:
+ case ACTION_TYPE_MAP_UPDATE:
+ case ACTION_TYPE_MAP_REMOVE:
+ break;
+ }
+ return false;
+}
+
+} // anonymous namespace
+
+using ::operator<<;
+using librbd::util::unique_lock_name;
+
+Policy::Policy(librados::IoCtx &ioctx)
+ : m_ioctx(ioctx),
+ m_map_lock(ceph::make_shared_mutex(
+ unique_lock_name("rbd::mirror::image_map::Policy::m_map_lock", this))) {
+
+ // map should at least have once instance
+ std::string instance_id = stringify(ioctx.get_instance_id());
+ m_map.emplace(instance_id, std::set<std::string>{});
+}
+
+void Policy::init(
+ const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping) {
+ dout(20) << dendl;
+
+ std::unique_lock map_lock{m_map_lock};
+ for (auto& it : image_mapping) {
+ ceph_assert(!it.second.instance_id.empty());
+ auto map_result = m_map[it.second.instance_id].emplace(it.first);
+ ceph_assert(map_result.second);
+
+ auto image_state_result = m_image_states.emplace(
+ it.first, ImageState{it.second.instance_id, it.second.mapped_time});
+ ceph_assert(image_state_result.second);
+
+ // ensure we (re)send image acquire actions to the instance
+ auto& image_state = image_state_result.first->second;
+ auto start_action = set_state(&image_state,
+ StateTransition::STATE_INITIALIZING, false);
+ ceph_assert(start_action);
+ }
+}
+
+LookupInfo Policy::lookup(const std::string &global_image_id) {
+ dout(20) << "global_image_id=" << global_image_id << dendl;
+
+ std::shared_lock map_lock{m_map_lock};
+ LookupInfo info;
+
+ auto it = m_image_states.find(global_image_id);
+ if (it != m_image_states.end()) {
+ info.instance_id = it->second.instance_id;
+ info.mapped_time = it->second.mapped_time;
+ }
+ return info;
+}
+
+bool Policy::add_image(const std::string &global_image_id) {
+ dout(5) << "global_image_id=" << global_image_id << dendl;
+
+ std::unique_lock map_lock{m_map_lock};
+ auto image_state_result = m_image_states.emplace(global_image_id,
+ ImageState{});
+ auto& image_state = image_state_result.first->second;
+ if (image_state.state == StateTransition::STATE_INITIALIZING) {
+ // avoid duplicate acquire notifications upon leader startup
+ return false;
+ }
+
+ return set_state(&image_state, StateTransition::STATE_ASSOCIATING, false);
+}
+
+bool Policy::remove_image(const std::string &global_image_id) {
+ dout(5) << "global_image_id=" << global_image_id << dendl;
+
+ std::unique_lock map_lock{m_map_lock};
+ auto it = m_image_states.find(global_image_id);
+ if (it == m_image_states.end()) {
+ return false;
+ }
+
+ auto& image_state = it->second;
+ return set_state(&image_state, StateTransition::STATE_DISSOCIATING, false);
+}
+
+void Policy::add_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids) {
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+
+ std::unique_lock map_lock{m_map_lock};
+ for (auto& instance : instance_ids) {
+ ceph_assert(!instance.empty());
+ m_map.emplace(instance, std::set<std::string>{});
+ }
+
+ // post-failover, remove any dead instances and re-shuffle their images
+ if (m_initial_update) {
+ dout(5) << "initial instance update" << dendl;
+ m_initial_update = false;
+
+ std::set<std::string> alive_instances(instance_ids.begin(),
+ instance_ids.end());
+ InstanceIds dead_instances;
+ for (auto& map_pair : m_map) {
+ if (alive_instances.find(map_pair.first) == alive_instances.end()) {
+ dead_instances.push_back(map_pair.first);
+ }
+ }
+
+ if (!dead_instances.empty()) {
+ remove_instances(m_map_lock, dead_instances, global_image_ids);
+ }
+ }
+
+ GlobalImageIds shuffle_global_image_ids;
+ do_shuffle_add_instances(m_map, m_image_states.size(), &shuffle_global_image_ids);
+ dout(5) << "shuffling global_image_ids=[" << shuffle_global_image_ids
+ << "]" << dendl;
+ for (auto& global_image_id : shuffle_global_image_ids) {
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ if (set_state(&image_state, StateTransition::STATE_SHUFFLING, false)) {
+ global_image_ids->emplace(global_image_id);
+ }
+ }
+}
+
+void Policy::remove_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids) {
+ std::unique_lock map_lock{m_map_lock};
+ remove_instances(m_map_lock, instance_ids, global_image_ids);
+}
+
+void Policy::remove_instances(const ceph::shared_mutex& lock,
+ const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids) {
+ ceph_assert(ceph_mutex_is_wlocked(m_map_lock));
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+
+ for (auto& instance_id : instance_ids) {
+ auto map_it = m_map.find(instance_id);
+ if (map_it == m_map.end()) {
+ continue;
+ }
+
+ auto& instance_global_image_ids = map_it->second;
+ if (instance_global_image_ids.empty()) {
+ m_map.erase(map_it);
+ continue;
+ }
+
+ m_dead_instances.insert(instance_id);
+ dout(5) << "force shuffling: instance_id=" << instance_id << ", "
+ << "global_image_ids=[" << instance_global_image_ids << "]"<< dendl;
+ for (auto& global_image_id : instance_global_image_ids) {
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ if (is_state_scheduled(image_state,
+ StateTransition::STATE_DISSOCIATING)) {
+ // don't shuffle images that no longer exist
+ continue;
+ }
+
+ if (set_state(&image_state, StateTransition::STATE_SHUFFLING, true)) {
+ global_image_ids->emplace(global_image_id);
+ }
+ }
+ }
+}
+
+ActionType Policy::start_action(const std::string &global_image_id) {
+ std::unique_lock map_lock{m_map_lock};
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ auto& transition = image_state.transition;
+ ceph_assert(transition.action_type != ACTION_TYPE_NONE);
+
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "state=" << image_state.state << ", "
+ << "action_type=" << transition.action_type << dendl;
+ if (transition.start_policy_action) {
+ execute_policy_action(global_image_id, &image_state,
+ *transition.start_policy_action);
+ transition.start_policy_action = boost::none;
+ }
+ return transition.action_type;
+}
+
+bool Policy::finish_action(const std::string &global_image_id, int r) {
+ std::unique_lock map_lock{m_map_lock};
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ auto& transition = image_state.transition;
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "state=" << image_state.state << ", "
+ << "action_type=" << transition.action_type << ", "
+ << "r=" << r << dendl;
+
+ // retry on failure unless it's an RPC message to an instance that is dead
+ if (r < 0 &&
+ (!is_instance_action(image_state.transition.action_type) ||
+ image_state.instance_id == UNMAPPED_INSTANCE_ID ||
+ m_dead_instances.find(image_state.instance_id) ==
+ m_dead_instances.end())) {
+ return true;
+ }
+
+ auto finish_policy_action = transition.finish_policy_action;
+ StateTransition::transit(image_state.state, &image_state.transition);
+ if (transition.finish_state) {
+ // in-progress state machine complete
+ ceph_assert(StateTransition::is_idle(*transition.finish_state));
+ image_state.state = *transition.finish_state;
+ image_state.transition = {};
+ }
+
+ if (StateTransition::is_idle(image_state.state) && image_state.next_state) {
+ // advance to pending state machine
+ bool start_action = set_state(&image_state, *image_state.next_state, false);
+ ceph_assert(start_action);
+ }
+
+ // image state may get purged in execute_policy_action()
+ bool pending_action = image_state.transition.action_type != ACTION_TYPE_NONE;
+ if (finish_policy_action) {
+ execute_policy_action(global_image_id, &image_state, *finish_policy_action);
+ }
+
+ return pending_action;
+}
+
+void Policy::execute_policy_action(
+ const std::string& global_image_id, ImageState* image_state,
+ StateTransition::PolicyAction policy_action) {
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "policy_action=" << policy_action << dendl;
+
+ switch (policy_action) {
+ case StateTransition::POLICY_ACTION_MAP:
+ map(global_image_id, image_state);
+ break;
+ case StateTransition::POLICY_ACTION_UNMAP:
+ unmap(global_image_id, image_state);
+ break;
+ case StateTransition::POLICY_ACTION_REMOVE:
+ if (image_state->state == StateTransition::STATE_UNASSOCIATED) {
+ ceph_assert(image_state->instance_id == UNMAPPED_INSTANCE_ID);
+ ceph_assert(!image_state->next_state);
+ m_image_states.erase(global_image_id);
+ }
+ break;
+ }
+}
+
+void Policy::map(const std::string& global_image_id, ImageState* image_state) {
+ ceph_assert(ceph_mutex_is_wlocked(m_map_lock));
+
+ std::string instance_id = image_state->instance_id;
+ if (instance_id != UNMAPPED_INSTANCE_ID && !is_dead_instance(instance_id)) {
+ return;
+ }
+ if (is_dead_instance(instance_id)) {
+ unmap(global_image_id, image_state);
+ }
+
+ instance_id = do_map(m_map, global_image_id);
+ ceph_assert(!instance_id.empty());
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ image_state->instance_id = instance_id;
+ image_state->mapped_time = ceph_clock_now();
+
+ auto ins = m_map[instance_id].emplace(global_image_id);
+ ceph_assert(ins.second);
+}
+
+void Policy::unmap(const std::string &global_image_id,
+ ImageState* image_state) {
+ ceph_assert(ceph_mutex_is_wlocked(m_map_lock));
+
+ std::string instance_id = image_state->instance_id;
+ if (instance_id == UNMAPPED_INSTANCE_ID) {
+ return;
+ }
+
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ ceph_assert(!instance_id.empty());
+ m_map[instance_id].erase(global_image_id);
+ image_state->instance_id = UNMAPPED_INSTANCE_ID;
+ image_state->mapped_time = {};
+
+ if (is_dead_instance(instance_id) && m_map[instance_id].empty()) {
+ dout(5) << "removing dead instance_id=" << instance_id << dendl;
+ m_map.erase(instance_id);
+ m_dead_instances.erase(instance_id);
+ }
+}
+
+bool Policy::is_image_shuffling(const std::string &global_image_id) {
+ ceph_assert(ceph_mutex_is_locked(m_map_lock));
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+ auto& image_state = it->second;
+
+ // avoid attempting to re-shuffle a pending shuffle
+ auto result = is_state_scheduled(image_state,
+ StateTransition::STATE_SHUFFLING);
+ dout(20) << "global_image_id=" << global_image_id << ", "
+ << "result=" << result << dendl;
+ return result;
+}
+
+bool Policy::can_shuffle_image(const std::string &global_image_id) {
+ ceph_assert(ceph_mutex_is_locked(m_map_lock));
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ int migration_throttle = cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_image_policy_migration_throttle");
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+ auto& image_state = it->second;
+
+ utime_t last_shuffled_time = image_state.mapped_time;
+
+ // idle images that haven't been recently remapped can shuffle
+ utime_t now = ceph_clock_now();
+ auto result = (StateTransition::is_idle(image_state.state) &&
+ ((migration_throttle <= 0) ||
+ (now - last_shuffled_time >= migration_throttle)));
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "migration_throttle=" << migration_throttle << ", "
+ << "last_shuffled_time=" << last_shuffled_time << ", "
+ << "result=" << result << dendl;
+ return result;
+}
+
+bool Policy::set_state(ImageState* image_state, StateTransition::State state,
+ bool ignore_current_state) {
+ if (!ignore_current_state && image_state->state == state) {
+ image_state->next_state = boost::none;
+ return false;
+ } else if (StateTransition::is_idle(image_state->state)) {
+ image_state->state = state;
+ image_state->next_state = boost::none;
+
+ StateTransition::transit(image_state->state, &image_state->transition);
+ ceph_assert(image_state->transition.action_type != ACTION_TYPE_NONE);
+ ceph_assert(!image_state->transition.finish_state);
+ return true;
+ }
+
+ image_state->next_state = state;
+ return false;
+}
+
+bool Policy::is_state_scheduled(const ImageState& image_state,
+ StateTransition::State state) const {
+ return (image_state.state == state ||
+ (image_state.next_state && *image_state.next_state == state));
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/Policy.h b/src/tools/rbd_mirror/image_map/Policy.h
new file mode 100644
index 000000000..0617bb9ee
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Policy.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H
+
+#include <map>
+#include <tuple>
+#include <boost/optional.hpp>
+
+#include "common/RWLock.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/image_map/StateTransition.h"
+#include "tools/rbd_mirror/image_map/Types.h"
+
+class Context;
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+class Policy {
+public:
+ Policy(librados::IoCtx &ioctx);
+
+ virtual ~Policy() {
+ }
+
+ // init -- called during initialization
+ void init(
+ const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping);
+
+ // lookup an image from the map
+ LookupInfo lookup(const std::string &global_image_id);
+
+ // add, remove
+ bool add_image(const std::string &global_image_id);
+ bool remove_image(const std::string &global_image_id);
+
+ // shuffle images when instances are added/removed
+ void add_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids);
+ void remove_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids);
+
+ ActionType start_action(const std::string &global_image_id);
+ bool finish_action(const std::string &global_image_id, int r);
+
+protected:
+ typedef std::map<std::string, std::set<std::string> > InstanceToImageMap;
+
+ bool is_dead_instance(const std::string instance_id) {
+ ceph_assert(ceph_mutex_is_locked(m_map_lock));
+ return m_dead_instances.find(instance_id) != m_dead_instances.end();
+ }
+
+ bool is_image_shuffling(const std::string &global_image_id);
+ bool can_shuffle_image(const std::string &global_image_id);
+
+ // map an image (global image id) to an instance
+ virtual std::string do_map(const InstanceToImageMap& map,
+ const std::string &global_image_id) = 0;
+
+ // shuffle images when instances are added/removed
+ virtual void do_shuffle_add_instances(
+ const InstanceToImageMap& map, size_t image_count,
+ std::set<std::string> *remap_global_image_ids) = 0;
+
+private:
+ struct ImageState {
+ std::string instance_id = UNMAPPED_INSTANCE_ID;
+ utime_t mapped_time;
+
+ ImageState() {}
+ ImageState(const std::string& instance_id, const utime_t& mapped_time)
+ : instance_id(instance_id), mapped_time(mapped_time) {
+ }
+
+ // active state and action
+ StateTransition::State state = StateTransition::STATE_UNASSOCIATED;
+ StateTransition::Transition transition;
+
+ // next scheduled state
+ boost::optional<StateTransition::State> next_state = boost::none;
+ };
+
+ typedef std::map<std::string, ImageState> ImageStates;
+
+ librados::IoCtx &m_ioctx;
+
+ ceph::shared_mutex m_map_lock; // protects m_map
+ InstanceToImageMap m_map; // instance_id -> global_id map
+
+ ImageStates m_image_states;
+ std::set<std::string> m_dead_instances;
+
+ bool m_initial_update = true;
+
+ void remove_instances(const ceph::shared_mutex& lock,
+ const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids);
+
+ bool set_state(ImageState* image_state, StateTransition::State state,
+ bool ignore_current_state);
+
+ void execute_policy_action(const std::string& global_image_id,
+ ImageState* image_state,
+ StateTransition::PolicyAction policy_action);
+
+ void map(const std::string& global_image_id, ImageState* image_state);
+ void unmap(const std::string &global_image_id, ImageState* image_state);
+
+ bool is_state_scheduled(const ImageState& image_state,
+ StateTransition::State state) const;
+
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H
diff --git a/src/tools/rbd_mirror/image_map/SimplePolicy.cc b/src/tools/rbd_mirror/image_map/SimplePolicy.cc
new file mode 100644
index 000000000..f26805819
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/SimplePolicy.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "SimplePolicy.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::SimplePolicy: " << this \
+ << " " << __func__ << ": "
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+SimplePolicy::SimplePolicy(librados::IoCtx &ioctx)
+ : Policy(ioctx) {
+}
+
+size_t SimplePolicy::calc_images_per_instance(const InstanceToImageMap& map,
+ size_t image_count) {
+ size_t nr_instances = 0;
+ for (auto const &it : map) {
+ if (!Policy::is_dead_instance(it.first)) {
+ ++nr_instances;
+ }
+ }
+ ceph_assert(nr_instances > 0);
+
+ size_t images_per_instance = image_count / nr_instances;
+ if (images_per_instance == 0) {
+ ++images_per_instance;
+ }
+
+ return images_per_instance;
+}
+
+void SimplePolicy::do_shuffle_add_instances(
+ const InstanceToImageMap& map, size_t image_count,
+ std::set<std::string> *remap_global_image_ids) {
+ uint64_t images_per_instance = calc_images_per_instance(map, image_count);
+ dout(5) << "images per instance=" << images_per_instance << dendl;
+
+ for (auto const &instance : map) {
+ if (instance.second.size() <= images_per_instance) {
+ continue;
+ }
+
+ auto it = instance.second.begin();
+ uint64_t cut_off = instance.second.size() - images_per_instance;
+
+ while (it != instance.second.end() && cut_off > 0) {
+ if (Policy::is_image_shuffling(*it)) {
+ --cut_off;
+ } else if (Policy::can_shuffle_image(*it)) {
+ --cut_off;
+ remap_global_image_ids->emplace(*it);
+ }
+
+ ++it;
+ }
+ }
+}
+
+std::string SimplePolicy::do_map(const InstanceToImageMap& map,
+ const std::string &global_image_id) {
+ auto min_it = map.end();
+ for (auto it = map.begin(); it != map.end(); ++it) {
+ ceph_assert(it->second.find(global_image_id) == it->second.end());
+ if (Policy::is_dead_instance(it->first)) {
+ continue;
+ } else if (min_it == map.end()) {
+ min_it = it;
+ } else if (it->second.size() < min_it->second.size()) {
+ min_it = it;
+ }
+ }
+
+ ceph_assert(min_it != map.end());
+ dout(20) << "global_image_id=" << global_image_id << " maps to instance_id="
+ << min_it->first << dendl;
+ return min_it->first;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/SimplePolicy.h b/src/tools/rbd_mirror/image_map/SimplePolicy.h
new file mode 100644
index 000000000..ad2071b2c
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/SimplePolicy.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H
+
+#include "Policy.h"
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+class SimplePolicy : public Policy {
+public:
+ static SimplePolicy *create(librados::IoCtx &ioctx) {
+ return new SimplePolicy(ioctx);
+ }
+
+protected:
+ SimplePolicy(librados::IoCtx &ioctx);
+
+ std::string do_map(const InstanceToImageMap& map,
+ const std::string &global_image_id) override;
+
+ void do_shuffle_add_instances(
+ const InstanceToImageMap& map, size_t image_count,
+ std::set<std::string> *remap_global_image_ids) override;
+
+private:
+ size_t calc_images_per_instance(const InstanceToImageMap& map,
+ size_t image_count);
+
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H
diff --git a/src/tools/rbd_mirror/image_map/StateTransition.cc b/src/tools/rbd_mirror/image_map/StateTransition.cc
new file mode 100644
index 000000000..ec5f07ff9
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/StateTransition.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <ostream>
+#include "include/ceph_assert.h"
+#include "StateTransition.h"
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+std::ostream &operator<<(std::ostream &os,
+ const StateTransition::State &state) {
+ switch(state) {
+ case StateTransition::STATE_INITIALIZING:
+ os << "INITIALIZING";
+ break;
+ case StateTransition::STATE_ASSOCIATING:
+ os << "ASSOCIATING";
+ break;
+ case StateTransition::STATE_ASSOCIATED:
+ os << "ASSOCIATED";
+ break;
+ case StateTransition::STATE_SHUFFLING:
+ os << "SHUFFLING";
+ break;
+ case StateTransition::STATE_DISSOCIATING:
+ os << "DISSOCIATING";
+ break;
+ case StateTransition::STATE_UNASSOCIATED:
+ os << "UNASSOCIATED";
+ break;
+ }
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const StateTransition::PolicyAction &policy_action) {
+ switch(policy_action) {
+ case StateTransition::POLICY_ACTION_MAP:
+ os << "MAP";
+ break;
+ case StateTransition::POLICY_ACTION_UNMAP:
+ os << "UNMAP";
+ break;
+ case StateTransition::POLICY_ACTION_REMOVE:
+ os << "REMOVE";
+ break;
+ }
+ return os;
+}
+
+const StateTransition::TransitionTable StateTransition::s_transition_table {
+ // state current_action Transition
+ // ---------------------------------------------------------------------------
+ {{STATE_INITIALIZING, ACTION_TYPE_NONE}, {ACTION_TYPE_ACQUIRE, {}, {},
+ {}}},
+ {{STATE_INITIALIZING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {},
+ {STATE_ASSOCIATED}}},
+
+ {{STATE_ASSOCIATING, ACTION_TYPE_NONE}, {ACTION_TYPE_MAP_UPDATE,
+ {POLICY_ACTION_MAP}, {}, {}}},
+ {{STATE_ASSOCIATING, ACTION_TYPE_MAP_UPDATE}, {ACTION_TYPE_ACQUIRE, {}, {},
+ {}}},
+ {{STATE_ASSOCIATING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {},
+ {STATE_ASSOCIATED}}},
+
+ {{STATE_DISSOCIATING, ACTION_TYPE_NONE}, {ACTION_TYPE_RELEASE, {},
+ {POLICY_ACTION_UNMAP}, {}}},
+ {{STATE_DISSOCIATING, ACTION_TYPE_RELEASE}, {ACTION_TYPE_MAP_REMOVE, {},
+ {POLICY_ACTION_REMOVE}, {}}},
+ {{STATE_DISSOCIATING, ACTION_TYPE_MAP_REMOVE}, {ACTION_TYPE_NONE, {},
+ {}, {STATE_UNASSOCIATED}}},
+
+ {{STATE_SHUFFLING, ACTION_TYPE_NONE}, {ACTION_TYPE_RELEASE, {},
+ {POLICY_ACTION_UNMAP}, {}}},
+ {{STATE_SHUFFLING, ACTION_TYPE_RELEASE}, {ACTION_TYPE_MAP_UPDATE,
+ {POLICY_ACTION_MAP}, {}, {}}},
+ {{STATE_SHUFFLING, ACTION_TYPE_MAP_UPDATE}, {ACTION_TYPE_ACQUIRE, {}, {},
+ {}}},
+ {{STATE_SHUFFLING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {},
+ {STATE_ASSOCIATED}}}
+};
+
+void StateTransition::transit(State state, Transition* transition) {
+ auto it = s_transition_table.find({state, transition->action_type});
+ ceph_assert(it != s_transition_table.end());
+
+ *transition = it->second;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/StateTransition.h b/src/tools/rbd_mirror/image_map/StateTransition.h
new file mode 100644
index 000000000..02a5ce4e9
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/StateTransition.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H
+
+#include "tools/rbd_mirror/image_map/Types.h"
+#include <boost/optional.hpp>
+#include <map>
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+class StateTransition {
+public:
+ enum State {
+ STATE_UNASSOCIATED,
+ STATE_INITIALIZING,
+ STATE_ASSOCIATING,
+ STATE_ASSOCIATED,
+ STATE_SHUFFLING,
+ STATE_DISSOCIATING
+ };
+
+ enum PolicyAction {
+ POLICY_ACTION_MAP,
+ POLICY_ACTION_UNMAP,
+ POLICY_ACTION_REMOVE
+ };
+
+ struct Transition {
+ // image map action
+ ActionType action_type = ACTION_TYPE_NONE;
+
+ // policy internal action
+ boost::optional<PolicyAction> start_policy_action;
+ boost::optional<PolicyAction> finish_policy_action;
+
+ // state machine complete
+ boost::optional<State> finish_state;
+
+ Transition() {
+ }
+ Transition(ActionType action_type,
+ const boost::optional<PolicyAction>& start_policy_action,
+ const boost::optional<PolicyAction>& finish_policy_action,
+ const boost::optional<State>& finish_state)
+ : action_type(action_type), start_policy_action(start_policy_action),
+ finish_policy_action(finish_policy_action), finish_state(finish_state) {
+ }
+ };
+
+ static bool is_idle(State state) {
+ return (state == STATE_UNASSOCIATED || state == STATE_ASSOCIATED);
+ }
+
+ static void transit(State state, Transition* transition);
+
+private:
+ typedef std::pair<State, ActionType> TransitionKey;
+ typedef std::map<TransitionKey, Transition> TransitionTable;
+
+ // image transition table
+ static const TransitionTable s_transition_table;
+};
+
+std::ostream &operator<<(std::ostream &os, const StateTransition::State &state);
+std::ostream &operator<<(std::ostream &os,
+ const StateTransition::PolicyAction &policy_action);
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H
diff --git a/src/tools/rbd_mirror/image_map/Types.cc b/src/tools/rbd_mirror/image_map/Types.cc
new file mode 100644
index 000000000..47de9c3cf
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Types.cc
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include <iostream>
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+const std::string UNMAPPED_INSTANCE_ID("");
+
+namespace {
+
+template <typename E>
+class GetTypeVisitor : public boost::static_visitor<E> {
+public:
+ template <typename T>
+ inline E operator()(const T&) const {
+ return T::TYPE;
+ }
+};
+
+class EncodeVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) {
+ }
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(T::TYPE), m_bl);
+ t.encode(m_bl);
+ }
+private:
+ bufferlist &m_bl;
+};
+
+class DecodeVisitor : public boost::static_visitor<void> {
+public:
+ DecodeVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {
+ }
+
+ template <typename T>
+ inline void operator()(T& t) const {
+ t.decode(m_version, m_iter);
+ }
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpVisitor(Formatter *formatter, const std::string &key)
+ : m_formatter(formatter), m_key(key) {}
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ auto type = T::TYPE;
+ m_formatter->dump_string(m_key.c_str(), stringify(type));
+ t.dump(m_formatter);
+ }
+private:
+ ceph::Formatter *m_formatter;
+ std::string m_key;
+};
+
+} // anonymous namespace
+
+PolicyMetaType PolicyData::get_policy_meta_type() const {
+ return boost::apply_visitor(GetTypeVisitor<PolicyMetaType>(), policy_meta);
+}
+
+void PolicyData::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(EncodeVisitor(bl), policy_meta);
+ ENCODE_FINISH(bl);
+}
+
+void PolicyData::decode(bufferlist::const_iterator& it) {
+ DECODE_START(1, it);
+
+ uint32_t policy_meta_type;
+ decode(policy_meta_type, it);
+
+ switch (policy_meta_type) {
+ case POLICY_META_TYPE_NONE:
+ policy_meta = PolicyMetaNone();
+ break;
+ default:
+ policy_meta = PolicyMetaUnknown();
+ break;
+ }
+
+ boost::apply_visitor(DecodeVisitor(struct_v, it), policy_meta);
+ DECODE_FINISH(it);
+}
+
+void PolicyData::dump(Formatter *f) const {
+ boost::apply_visitor(DumpVisitor(f, "policy_meta_type"), policy_meta);
+}
+
+void PolicyData::generate_test_instances(std::list<PolicyData *> &o) {
+ o.push_back(new PolicyData(PolicyMetaNone()));
+}
+
+std::ostream &operator<<(std::ostream &os, const ActionType& action_type) {
+ switch (action_type) {
+ case ACTION_TYPE_NONE:
+ os << "NONE";
+ break;
+ case ACTION_TYPE_MAP_UPDATE:
+ os << "MAP_UPDATE";
+ break;
+ case ACTION_TYPE_MAP_REMOVE:
+ os << "MAP_REMOVE";
+ break;
+ case ACTION_TYPE_ACQUIRE:
+ os << "ACQUIRE";
+ break;
+ case ACTION_TYPE_RELEASE:
+ os << "RELEASE";
+ break;
+ default:
+ os << "UNKNOWN (" << static_cast<uint32_t>(action_type) << ")";
+ break;
+ }
+ return os;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/Types.h b/src/tools/rbd_mirror/image_map/Types.h
new file mode 100644
index 000000000..5a97430f3
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Types.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H
+
+#include <iosfwd>
+#include <map>
+#include <set>
+#include <string>
+#include <boost/variant.hpp>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "tools/rbd_mirror/Types.h"
+
+struct Context;
+
+namespace ceph {
+class Formatter;
+}
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+extern const std::string UNMAPPED_INSTANCE_ID;
+
+// Callback interface implemented by the image-map consumer; invoked when
+// an image must be acquired by, released from, or removed on an instance.
+struct Listener {
+  virtual ~Listener() {
+  }
+
+  virtual void acquire_image(const std::string &global_image_id,
+                             const std::string &instance_id,
+                             Context* on_finish) = 0;
+  virtual void release_image(const std::string &global_image_id,
+                             const std::string &instance_id,
+                             Context* on_finish) = 0;
+  virtual void remove_image(const std::string &mirror_uuid,
+                            const std::string &global_image_id,
+                            const std::string &instance_id,
+                            Context* on_finish) = 0;
+};
+
+// Result of an image-map lookup: owning instance id (UNMAPPED_INSTANCE_ID
+// when not mapped) and the time the mapping was established.
+struct LookupInfo {
+  std::string instance_id = UNMAPPED_INSTANCE_ID;
+  utime_t mapped_time;
+};
+
+// Kinds of scheduled actions against the image map.
+enum ActionType {
+  ACTION_TYPE_NONE,
+  ACTION_TYPE_MAP_UPDATE,
+  ACTION_TYPE_MAP_REMOVE,
+  ACTION_TYPE_ACQUIRE,
+  ACTION_TYPE_RELEASE
+};
+
+typedef std::vector<std::string> InstanceIds;
+typedef std::set<std::string> GlobalImageIds;
+typedef std::map<std::string, ActionType> ImageActionTypes;
+
+// On-disk type tag for policy metadata (see PolicyData below).
+enum PolicyMetaType {
+  POLICY_META_TYPE_NONE = 0,
+};
+
+// Empty policy metadata payload -- nothing encoded beyond the tag.
+struct PolicyMetaNone {
+  static const PolicyMetaType TYPE = POLICY_META_TYPE_NONE;
+
+  PolicyMetaNone() {
+  }
+
+  void encode(bufferlist& bl) const {
+  }
+
+  void decode(__u8 version, bufferlist::const_iterator& it) {
+  }
+
+  void dump(Formatter *f) const {
+  }
+};
+
+// Placeholder for a tag this build does not understand; decoding is a
+// no-op and encoding aborts (an unknown payload must never be re-written).
+struct PolicyMetaUnknown {
+  static const PolicyMetaType TYPE = static_cast<PolicyMetaType>(-1);
+
+  PolicyMetaUnknown() {
+  }
+
+  void encode(bufferlist& bl) const {
+    ceph_abort();
+  }
+
+  void decode(__u8 version, bufferlist::const_iterator& it) {
+  }
+
+  void dump(Formatter *f) const {
+  }
+};
+
+typedef boost::variant<PolicyMetaNone,
+                       PolicyMetaUnknown> PolicyMeta;
+
+// Versioned wrapper that (de)serializes whichever PolicyMeta alternative
+// is currently held; implementations live in Types.cc.
+struct PolicyData {
+  PolicyData()
+    : policy_meta(PolicyMetaUnknown()) {
+  }
+  PolicyData(const PolicyMeta &policy_meta)
+    : policy_meta(policy_meta) {
+  }
+
+  PolicyMeta policy_meta;
+
+  PolicyMetaType get_policy_meta_type() const;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<PolicyData *> &o);
+};
+
+WRITE_CLASS_ENCODER(PolicyData);
+
+std::ostream &operator<<(std::ostream &os, const ActionType &action_type);
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H
diff --git a/src/tools/rbd_mirror/image_map/UpdateRequest.cc b/src/tools/rbd_mirror/image_map/UpdateRequest.cc
new file mode 100644
index 000000000..799c5670f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/UpdateRequest.cc
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "librbd/Utils.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+#include "UpdateRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::UpdateRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+using librbd::util::create_rados_callback;
+
+static const uint32_t MAX_UPDATE = 256;
+
+// Ctor: takes ownership of the pending update/removal collections.
+// NOTE: the parameters are rvalue references, so the members must be
+// move-constructed -- copying here would silently defeat the && interface.
+template <typename I>
+UpdateRequest<I>::UpdateRequest(librados::IoCtx &ioctx,
+    std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping,
+    std::set<std::string> &&remove_global_image_ids, Context *on_finish)
+  : m_ioctx(ioctx),
+    m_update_mapping(std::move(update_mapping)),
+    m_remove_global_image_ids(std::move(remove_global_image_ids)),
+    m_on_finish(on_finish) {
+}
+
+// Entry point: begin applying the (possibly batched) image-map updates.
+template <typename I>
+void UpdateRequest<I>::send() {
+  dout(20) << dendl;
+
+  update_image_map();
+}
+
+// Issue up to MAX_UPDATE combined update/remove ops in a single write
+// against the rbd_mirror_leader object; re-entered from the completion
+// callback until both pending containers are drained.
+template <typename I>
+void UpdateRequest<I>::update_image_map() {
+  dout(20) << dendl;
+
+  if (m_update_mapping.empty() && m_remove_global_image_ids.empty()) {
+    finish(0);
+    return;
+  }
+
+  uint32_t nr_updates = 0;
+  librados::ObjectWriteOperation op;
+
+  // batch map updates first, then removals, consuming entries as we go
+  auto it1 = m_update_mapping.begin();
+  while (it1 != m_update_mapping.end() && nr_updates++ < MAX_UPDATE) {
+    librbd::cls_client::mirror_image_map_update(&op, it1->first, it1->second);
+    it1 = m_update_mapping.erase(it1);
+  }
+
+  auto it2 = m_remove_global_image_ids.begin();
+  while (it2 != m_remove_global_image_ids.end() && nr_updates++ < MAX_UPDATE) {
+    librbd::cls_client::mirror_image_map_remove(&op, *it2);
+    it2 = m_remove_global_image_ids.erase(it2);
+  }
+
+  librados::AioCompletion *aio_comp = create_rados_callback<
+    UpdateRequest, &UpdateRequest::handle_update_image_map>(this);
+  int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion for one batch: abort on error, otherwise keep draining.
+template <typename I>
+void UpdateRequest<I>::handle_update_image_map(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    derr << ": failed to update image map: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  update_image_map();
+}
+
+// Complete the caller's context and self-destruct (heap-allocated request).
+template <typename I>
+void UpdateRequest<I>::finish(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_map::UpdateRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_map/UpdateRequest.h b/src/tools/rbd_mirror/image_map/UpdateRequest.h
new file mode 100644
index 000000000..841cc6f9b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/UpdateRequest.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+
+class Context;
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+// One-shot request that applies batched image-map updates and removals to
+// the rbd_mirror_leader object; deletes itself when finished.
+template<typename ImageCtxT = librbd::ImageCtx>
+class UpdateRequest {
+public:
+  // accepts an image map to update and a collection of
+  // global image ids to purge.
+  static UpdateRequest *create(librados::IoCtx &ioctx,
+      std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping,
+      std::set<std::string> &&remove_global_image_ids, Context *on_finish) {
+    return new UpdateRequest(ioctx, std::move(update_mapping), std::move(remove_global_image_ids),
+                             on_finish);
+  }
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |   . . . . . . . .
+   *    v   v             . MAX_UPDATE
+   * UPDATE_IMAGE_MAP . . .
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+  UpdateRequest(librados::IoCtx &ioctx,
+      std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping,
+      std::set<std::string> &&remove_global_image_ids, Context *on_finish);
+
+  librados::IoCtx &m_ioctx;
+  // pending work; drained in batches of MAX_UPDATE by update_image_map()
+  std::map<std::string, cls::rbd::MirrorImageMap> m_update_mapping;
+  std::set<std::string> m_remove_global_image_ids;
+  Context *m_on_finish;
+
+  void update_image_map();
+  void handle_update_image_map(int r);
+
+  void finish(int r);
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
new file mode 100644
index 000000000..bda5b5f9b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
@@ -0,0 +1,485 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "BootstrapRequest.h"
+#include "CreateImageRequest.h"
+#include "OpenImageRequest.h"
+#include "OpenLocalImageRequest.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include "tools/rbd_mirror/ImageSync.h"
+#include "tools/rbd_mirror/ProgressContext.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+#include "tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "BootstrapRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+// Ctor: stores all collaborator pointers/refs; work starts in send().
+template <typename I>
+BootstrapRequest<I>::BootstrapRequest(
+    Threads<I>* threads,
+    librados::IoCtx& local_io_ctx,
+    librados::IoCtx& remote_io_ctx,
+    InstanceWatcher<I>* instance_watcher,
+    const std::string& global_image_id,
+    const std::string& local_mirror_uuid,
+    const RemotePoolMeta& remote_pool_meta,
+    ::journal::CacheManagerHandler* cache_manager_handler,
+    PoolMetaCache* pool_meta_cache,
+    ProgressContext* progress_ctx,
+    StateBuilder<I>** state_builder,
+    bool* do_resync,
+    Context* on_finish)
+  : CancelableRequest("rbd::mirror::image_replayer::BootstrapRequest",
+                      reinterpret_cast<CephContext*>(local_io_ctx.cct()),
+                      on_finish),
+    m_threads(threads),
+    m_local_io_ctx(local_io_ctx),
+    m_remote_io_ctx(remote_io_ctx),
+    m_instance_watcher(instance_watcher),
+    m_global_image_id(global_image_id),
+    m_local_mirror_uuid(local_mirror_uuid),
+    m_remote_pool_meta(remote_pool_meta),
+    m_cache_manager_handler(cache_manager_handler),
+    m_pool_meta_cache(pool_meta_cache),
+    m_progress_ctx(progress_ctx),
+    m_state_builder(state_builder),
+    m_do_resync(do_resync),
+    m_lock(ceph::make_mutex(unique_lock_name("BootstrapRequest::m_lock",
+                                             this))) {
+  dout(10) << dendl;
+}
+
+// True while an ImageSync is in flight (guarded by m_lock).
+template <typename I>
+bool BootstrapRequest<I>::is_syncing() const {
+  std::lock_guard locker{m_lock};
+  return (m_image_sync != nullptr);
+}
+
+// Kick off the state machine; the resync flag starts cleared.
+template <typename I>
+void BootstrapRequest<I>::send() {
+  *m_do_resync = false;
+
+  prepare_local_image();
+}
+
+// Mark the request canceled and propagate to an active image sync, if any.
+template <typename I>
+void BootstrapRequest<I>::cancel() {
+  dout(10) << dendl;
+
+  std::lock_guard locker{m_lock};
+  m_canceled = true;
+
+  if (m_image_sync != nullptr) {
+    m_image_sync->cancel();
+  }
+}
+
+// Thread-safe accessor for the (possibly renamed) local image name.
+template <typename I>
+std::string BootstrapRequest<I>::get_local_image_name() const {
+  std::unique_lock locker{m_lock};
+  return m_local_image_name;
+}
+
+// Resolve any existing local image for the global image id, populating
+// *m_state_builder; the local name defaults to the global image id until
+// the prepare request reports the real name.
+template <typename I>
+void BootstrapRequest<I>::prepare_local_image() {
+  dout(10) << dendl;
+  update_progress("PREPARE_LOCAL_IMAGE");
+
+  {
+    std::unique_lock locker{m_lock};
+    m_local_image_name = m_global_image_id;
+  }
+
+  ceph_assert(*m_state_builder == nullptr);
+  auto ctx = create_context_callback<
+    BootstrapRequest, &BootstrapRequest<I>::handle_prepare_local_image>(this);
+  auto req = image_replayer::PrepareLocalImageRequest<I>::create(
+    m_local_io_ctx, m_global_image_id, &m_prepare_local_image_name,
+    m_state_builder, m_threads->work_queue, ctx);
+  req->send();
+}
+
+// -ENOENT (no local image yet) is tolerated; any other error aborts.
+template <typename I>
+void BootstrapRequest<I>::handle_prepare_local_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  ceph_assert(r < 0 || *m_state_builder != nullptr);
+  if (r == -ENOENT) {
+    dout(10) << "local image does not exist" << dendl;
+  } else if (r < 0) {
+    derr << "error preparing local image for replay: " << cpp_strerror(r)
+         << dendl;
+    finish(r);
+    return;
+  }
+
+  // image replayer will detect the name change (if any) at next
+  // status update
+  if (r >= 0 && !m_prepare_local_image_name.empty()) {
+    std::unique_lock locker{m_lock};
+    m_local_image_name = m_prepare_local_image_name;
+  }
+
+  prepare_remote_image();
+}
+
+// Resolve the remote image and its mirror metadata into the state builder.
+template <typename I>
+void BootstrapRequest<I>::prepare_remote_image() {
+  dout(10) << dendl;
+  update_progress("PREPARE_REMOTE_IMAGE");
+
+  Context *ctx = create_context_callback<
+    BootstrapRequest, &BootstrapRequest<I>::handle_prepare_remote_image>(this);
+  auto req = image_replayer::PrepareRemoteImageRequest<I>::create(
+    m_threads, m_local_io_ctx, m_remote_io_ctx, m_global_image_id,
+    m_local_mirror_uuid, m_remote_pool_meta, m_cache_manager_handler,
+    m_state_builder, ctx);
+  req->send();
+}
+
+// Decide whether replay can proceed. Error codes used to signal state:
+//   -ENOMSG:    local image is primary (nothing to replay)
+//   -ENOLINK:   remote image missing and local image absent or linked to it
+//   -ENOENT:    remote image missing (other combinations)
+//   -EREMOTEIO: remote image not primary and local missing/unlinked
+template <typename I>
+void BootstrapRequest<I>::handle_prepare_remote_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  auto state_builder = *m_state_builder;
+  ceph_assert(state_builder == nullptr ||
+              !state_builder->remote_mirror_uuid.empty());
+
+  if (state_builder != nullptr && state_builder->is_local_primary()) {
+    dout(5) << "local image is primary" << dendl;
+    finish(-ENOMSG);
+    return;
+  } else if (r == -ENOENT || state_builder == nullptr) {
+    // log line is assembled piecewise and terminated with *_dout below
+    dout(10) << "remote image does not exist";
+    if (state_builder != nullptr) {
+      *_dout << ": "
+             << "local_image_id=" << state_builder->local_image_id << ", "
+             << "remote_image_id=" << state_builder->remote_image_id << ", "
+             << "is_linked=" << state_builder->is_linked();
+    }
+    *_dout << dendl;
+
+    // TODO need to support multiple remote images
+    if (state_builder != nullptr &&
+        state_builder->remote_image_id.empty() &&
+        (state_builder->local_image_id.empty() ||
+         state_builder->is_linked())) {
+      // neither image exists, or the local image exists, is non-primary
+      // and linked to the missing remote image
+      finish(-ENOLINK);
+    } else {
+      finish(-ENOENT);
+    }
+    return;
+  } else if (r < 0) {
+    derr << "error preparing remote image for replay: " << cpp_strerror(r)
+         << dendl;
+    finish(r);
+    return;
+  }
+
+  if (!state_builder->is_remote_primary()) {
+    ceph_assert(!state_builder->remote_image_id.empty());
+    if (state_builder->local_image_id.empty()) {
+      dout(10) << "local image does not exist and remote image is not primary"
+               << dendl;
+      finish(-EREMOTEIO);
+      return;
+    } else if (!state_builder->is_linked()) {
+      dout(10) << "local image is unlinked and remote image is not primary"
+               << dendl;
+      finish(-EREMOTEIO);
+      return;
+    }
+    // if the local image is linked to the remote image, we ignore that
+    // the remote image is not primary so that we can replay demotion
+  }
+
+  open_remote_image();
+}
+
+// Open the remote image context that will serve as the replay source.
+template <typename I>
+void BootstrapRequest<I>::open_remote_image() {
+  ceph_assert(*m_state_builder != nullptr);
+  auto remote_image_id = (*m_state_builder)->remote_image_id;
+  dout(15) << "remote_image_id=" << remote_image_id << dendl;
+
+  update_progress("OPEN_REMOTE_IMAGE");
+
+  auto ctx = create_context_callback<
+    BootstrapRequest<I>,
+    &BootstrapRequest<I>::handle_open_remote_image>(this);
+  ceph_assert(*m_state_builder != nullptr);
+  OpenImageRequest<I> *request = OpenImageRequest<I>::create(
+    m_remote_io_ctx, &(*m_state_builder)->remote_image_ctx, remote_image_id,
+    false, ctx);
+  request->send();
+}
+
+// On success: create the local image if none exists, otherwise open it.
+template <typename I>
+void BootstrapRequest<I>::handle_open_remote_image(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  ceph_assert(*m_state_builder != nullptr);
+  if (r < 0) {
+    derr << "failed to open remote image: " << cpp_strerror(r) << dendl;
+    ceph_assert((*m_state_builder)->remote_image_ctx == nullptr);
+    finish(r);
+    return;
+  }
+
+  if ((*m_state_builder)->local_image_id.empty()) {
+    create_local_image();
+    return;
+  }
+
+  open_local_image();
+}
+
+// Open the known local image for replay.
+template <typename I>
+void BootstrapRequest<I>::open_local_image() {
+  ceph_assert(*m_state_builder != nullptr);
+  auto local_image_id = (*m_state_builder)->local_image_id;
+
+  dout(15) << "local_image_id=" << local_image_id << dendl;
+
+  update_progress("OPEN_LOCAL_IMAGE");
+
+  Context *ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_local_image>(
+      this);
+  OpenLocalImageRequest<I> *request = OpenLocalImageRequest<I>::create(
+    m_local_io_ctx, &(*m_state_builder)->local_image_ctx, local_image_id,
+    m_threads->work_queue, ctx);
+  request->send();
+}
+
+// -ENOENT: image vanished, recreate it; -EREMOTEIO: local image turned
+// primary, skip replay; other errors abort via close_remote_image().
+template <typename I>
+void BootstrapRequest<I>::handle_open_local_image(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  ceph_assert(*m_state_builder != nullptr);
+  auto local_image_ctx = (*m_state_builder)->local_image_ctx;
+  ceph_assert((r >= 0 && local_image_ctx != nullptr) ||
+              (r < 0 && local_image_ctx == nullptr));
+
+  if (r == -ENOENT) {
+    dout(10) << "local image missing" << dendl;
+    create_local_image();
+    return;
+  } else if (r == -EREMOTEIO) {
+    dout(10) << "local image is primary -- skipping image replay" << dendl;
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  } else if (r < 0) {
+    derr << "failed to open local image: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  }
+
+  prepare_replay();
+}
+
+// Delegate replay preparation to the (journal/snapshot) state builder;
+// it reports whether a resync or a full image sync is required.
+template <typename I>
+void BootstrapRequest<I>::prepare_replay() {
+  dout(10) << dendl;
+  update_progress("PREPARE_REPLAY");
+
+  ceph_assert(*m_state_builder != nullptr);
+  auto ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_prepare_replay>(this);
+  auto request = (*m_state_builder)->create_prepare_replay_request(
+    m_local_mirror_uuid, m_progress_ctx, m_do_resync, &m_syncing, ctx);
+  request->send();
+}
+
+// Route on the prepare-replay outcome: error, resync requested,
+// disconnected client, pending sync, or straight to cleanup.
+template <typename I>
+void BootstrapRequest<I>::handle_prepare_replay(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to prepare local replay: " << cpp_strerror(r) << dendl;
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  } else if (*m_do_resync) {
+    dout(10) << "local image resync requested" << dendl;
+    close_remote_image();
+    return;
+  } else if ((*m_state_builder)->is_disconnected()) {
+    dout(10) << "client flagged disconnected -- skipping bootstrap" << dendl;
+    // The caller is expected to detect disconnect initializing remote journal.
+    m_ret_val = 0;
+    close_remote_image();
+    return;
+  } else if (m_syncing) {
+    dout(10) << "local image still syncing to remote image" << dendl;
+    image_sync();
+    return;
+  }
+
+  close_remote_image();
+}
+
+// Create the local copy of the image via the state builder.
+template <typename I>
+void BootstrapRequest<I>::create_local_image() {
+  dout(10) << dendl;
+  update_progress("CREATE_LOCAL_IMAGE");
+
+  ceph_assert(*m_state_builder != nullptr);
+  auto ctx = create_context_callback<
+    BootstrapRequest<I>,
+    &BootstrapRequest<I>::handle_create_local_image>(this);
+  auto request = (*m_state_builder)->create_local_image_request(
+    m_threads, m_local_io_ctx, m_global_image_id, m_pool_meta_cache,
+    m_progress_ctx, ctx);
+  request->send();
+}
+
+// On success re-enter open_local_image(); on failure record the error and
+// tear down the remote image (-ENOENT means the parent image is missing).
+template <typename I>
+void BootstrapRequest<I>::handle_create_local_image(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    if (r == -ENOENT) {
+      dout(10) << "parent image does not exist" << dendl;
+    } else {
+      derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+    }
+    m_ret_val = r;
+    close_remote_image();
+    return;
+  }
+
+  open_local_image();
+}
+
+// Start a full image sync unless the request was already canceled; the
+// sync object is ref-counted so cancel() can reach it concurrently.
+template <typename I>
+void BootstrapRequest<I>::image_sync() {
+  std::unique_lock locker{m_lock};
+  if (m_canceled) {
+    locker.unlock();
+
+    m_ret_val = -ECANCELED;
+    dout(10) << "request canceled" << dendl;
+    close_remote_image();
+    return;
+  }
+
+  dout(15) << dendl;
+  ceph_assert(m_image_sync == nullptr);
+
+  auto state_builder = *m_state_builder;
+  auto sync_point_handler = state_builder->create_sync_point_handler();
+
+  Context *ctx = create_context_callback<
+    BootstrapRequest<I>, &BootstrapRequest<I>::handle_image_sync>(this);
+  m_image_sync = ImageSync<I>::create(
+    m_threads, state_builder->local_image_ctx, state_builder->remote_image_ctx,
+    m_local_mirror_uuid, sync_point_handler, m_instance_watcher,
+    m_progress_ctx, ctx);
+  m_image_sync->get();
+  locker.unlock();
+
+  update_progress("IMAGE_SYNC");
+  m_image_sync->send();
+}
+
+// Drop the sync reference under lock, record any error (cancellation is
+// logged but not treated as fatal error text), then clean up.
+template <typename I>
+void BootstrapRequest<I>::handle_image_sync(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  {
+    std::lock_guard locker{m_lock};
+    m_image_sync->put();
+    m_image_sync = nullptr;
+
+    (*m_state_builder)->destroy_sync_point_handler();
+  }
+
+  if (r < 0) {
+    if (r == -ECANCELED) {
+      dout(10) << "request canceled" << dendl;
+    } else {
+      derr << "failed to sync remote image: " << cpp_strerror(r) << dendl;
+    }
+    m_ret_val = r;
+  }
+
+  close_remote_image();
+}
+
+// Close the remote image unless the chosen replay mode still needs it,
+// then finish with the previously recorded status.
+template <typename I>
+void BootstrapRequest<I>::close_remote_image() {
+  if ((*m_state_builder)->replay_requires_remote_image()) {
+    finish(m_ret_val);
+    return;
+  }
+
+  dout(15) << dendl;
+
+  update_progress("CLOSE_REMOTE_IMAGE");
+
+  auto ctx = create_context_callback<
+    BootstrapRequest<I>,
+    &BootstrapRequest<I>::handle_close_remote_image>(this);
+  ceph_assert(*m_state_builder != nullptr);
+  (*m_state_builder)->close_remote_image(ctx);
+}
+
+// Close errors are only logged; the original m_ret_val is reported.
+template <typename I>
+void BootstrapRequest<I>::handle_close_remote_image(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "error encountered closing remote image: " << cpp_strerror(r)
+         << dendl;
+  }
+
+  finish(m_ret_val);
+}
+
+// Log and forward a state-machine progress description to the caller.
+template <typename I>
+void BootstrapRequest<I>::update_progress(const std::string &description) {
+  dout(15) << description << dendl;
+
+  if (m_progress_ctx) {
+    m_progress_ctx->update_progress(description);
+  }
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::BootstrapRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
new file mode 100644
index 000000000..f5bb8dd8a
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+#include "tools/rbd_mirror/CancelableRequest.h"
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+class Context;
+
+namespace journal { class CacheManagerHandler; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext;
+
+template <typename> class ImageSync;
+template <typename> class InstanceWatcher;
+struct PoolMetaCache;
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+template <typename> class StateBuilder;
+
+// Bootstraps an image replayer: resolves local/remote image state into a
+// StateBuilder, creates/opens the local image, optionally performs a full
+// image sync, and reports whether a resync is required via *do_resync.
+template <typename ImageCtxT = librbd::ImageCtx>
+class BootstrapRequest : public CancelableRequest {
+public:
+  typedef rbd::mirror::ProgressContext ProgressContext;
+
+  static BootstrapRequest* create(
+      Threads<ImageCtxT>* threads,
+      librados::IoCtx& local_io_ctx,
+      librados::IoCtx& remote_io_ctx,
+      InstanceWatcher<ImageCtxT>* instance_watcher,
+      const std::string& global_image_id,
+      const std::string& local_mirror_uuid,
+      const RemotePoolMeta& remote_pool_meta,
+      ::journal::CacheManagerHandler* cache_manager_handler,
+      PoolMetaCache* pool_meta_cache,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>** state_builder,
+      bool* do_resync,
+      Context* on_finish) {
+    return new BootstrapRequest(
+      threads, local_io_ctx, remote_io_ctx, instance_watcher, global_image_id,
+      local_mirror_uuid, remote_pool_meta, cache_manager_handler,
+      pool_meta_cache, progress_ctx, state_builder, do_resync, on_finish);
+  }
+
+  BootstrapRequest(
+      Threads<ImageCtxT>* threads,
+      librados::IoCtx& local_io_ctx,
+      librados::IoCtx& remote_io_ctx,
+      InstanceWatcher<ImageCtxT>* instance_watcher,
+      const std::string& global_image_id,
+      const std::string& local_mirror_uuid,
+      const RemotePoolMeta& remote_pool_meta,
+      ::journal::CacheManagerHandler* cache_manager_handler,
+      PoolMetaCache* pool_meta_cache,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>** state_builder,
+      bool* do_resync,
+      Context* on_finish);
+
+  bool is_syncing() const;
+
+  void send() override;
+  void cancel() override;
+
+  std::string get_local_image_name() const;
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v                                            (error)
+   * PREPARE_LOCAL_IMAGE  * * * * * * * * * * * * * * * * * *
+   *    |                                                   *
+   *    v                                           (error) *
+   * PREPARE_REMOTE_IMAGE * * * * * * * * * * * * * * * * * *
+   *    |                                                   *
+   *    v                                           (error) *
+   * OPEN_REMOTE_IMAGE  * * * * * * * * * * * * * * * * * * *
+   *    |                                                   *
+   *    |                                                   *
+   *    \----> CREATE_LOCAL_IMAGE * * * * * * * * * * * *   *
+   *    |       |                 ^ *                   *   *
+   *    |       |                 . *                   *   *
+   *    |       v                 . * (image DNE)       *   *
+   *    \----> OPEN_LOCAL_IMAGE * * * * * * * * * * * * *   *
+   *            |                                       *   *
+   *            |                                       *   *
+   *            v                                       *   *
+   *           PREPARE_REPLAY * * * * * * * * * * * * * *   *
+   *            |                                       *   *
+   *            |                                       *   *
+   *            v (skip if not needed)                  *   *
+   *           IMAGE_SYNC * * * * * * * * * * * * * * * *   *
+   *            |                                       *   *
+   *            |                                       *   *
+   *  /---------/                                       *   *
+   *  |                                                 *   *
+   *  v                                                 *   *
+   * CLOSE_REMOTE_IMAGE < * * * * * * * * * * * * * * * *   *
+   *    |                                                   *
+   *    v                                                   *
+   * <finish> < * * * * * * * * * * * * * * * * * * * * * * *
+   *
+   * @endverbatim
+   */
+  Threads<ImageCtxT>* m_threads;
+  librados::IoCtx &m_local_io_ctx;
+  librados::IoCtx &m_remote_io_ctx;
+  InstanceWatcher<ImageCtxT> *m_instance_watcher;
+  std::string m_global_image_id;
+  std::string m_local_mirror_uuid;
+  RemotePoolMeta m_remote_pool_meta;
+  ::journal::CacheManagerHandler *m_cache_manager_handler;
+  PoolMetaCache* m_pool_meta_cache;
+  ProgressContext *m_progress_ctx;
+  // out-params owned by the caller
+  StateBuilder<ImageCtxT>** m_state_builder;
+  bool *m_do_resync;
+
+  // protects m_canceled, m_image_sync and m_local_image_name
+  mutable ceph::mutex m_lock;
+  bool m_canceled = false;
+
+  // first error recorded while the teardown path still has to run
+  int m_ret_val = 0;
+
+  std::string m_local_image_name;
+  std::string m_prepare_local_image_name;
+
+  bool m_syncing = false;
+  ImageSync<ImageCtxT> *m_image_sync = nullptr;
+
+  void prepare_local_image();
+  void handle_prepare_local_image(int r);
+
+  void prepare_remote_image();
+  void handle_prepare_remote_image(int r);
+
+  void open_remote_image();
+  void handle_open_remote_image(int r);
+
+  void open_local_image();
+  void handle_open_local_image(int r);
+
+  void create_local_image();
+  void handle_create_local_image(int r);
+
+  void prepare_replay();
+  void handle_prepare_replay(int r);
+
+  void image_sync();
+  void handle_image_sync(int r);
+
+  void close_remote_image();
+  void handle_close_remote_image(int r);
+
+  void update_progress(const std::string &description);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::BootstrapRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
new file mode 100644
index 000000000..872c8baa9
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CloseImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::CloseImageRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+// Ctor: remembers the caller's image-context pointer so it can be cleared
+// once the image is closed.
+template <typename I>
+CloseImageRequest<I>::CloseImageRequest(I **image_ctx, Context *on_finish)
+  : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void CloseImageRequest<I>::send() {
+  close_image();
+}
+
+// Ask the image state machine to close; completion routed to the handler.
+template <typename I>
+void CloseImageRequest<I>::close_image() {
+  dout(20) << dendl;
+
+  Context *ctx = create_context_callback<
+    CloseImageRequest<I>, &CloseImageRequest<I>::handle_close_image>(this);
+  (*m_image_ctx)->state->close(ctx);
+}
+
+// Close completion: failures are only logged (best-effort close) -- the
+// caller's pointer is cleared and success (0) is always reported before
+// the request self-destructs.
+template <typename I>
+void CloseImageRequest<I>::handle_close_image(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    derr << ": error encountered while closing image: " << cpp_strerror(r)
+         << dendl;
+  }
+
+  *m_image_ctx = nullptr;
+
+  m_on_finish->complete(0);
+  delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::CloseImageRequest<librbd::ImageCtx>;
+
diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
new file mode 100644
index 000000000..02481369d
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class Context;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+// One-shot request that closes an image context and nulls the caller's
+// pointer; completes with 0 even if the close fails (errors are logged).
+template <typename ImageCtxT = librbd::ImageCtx>
+class CloseImageRequest {
+public:
+  static CloseImageRequest* create(ImageCtxT **image_ctx, Context *on_finish) {
+    return new CloseImageRequest(image_ctx, on_finish);
+  }
+
+  CloseImageRequest(ImageCtxT **image_ctx, Context *on_finish);
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * CLOSE_IMAGE
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+  ImageCtxT **m_image_ctx;
+  Context *m_on_finish;
+
+  void close_image();
+  void handle_close_image(int r);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::CloseImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc
new file mode 100644
index 000000000..641bb03e8
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc
@@ -0,0 +1,451 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CreateImageRequest.h"
+#include "CloseImageRequest.h"
+#include "OpenImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "tools/rbd_mirror/PoolMetaCache.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "tools/rbd_mirror/image_sync/Utils.h"
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::CreateImageRequest: " \
+ << this << " " << __func__ << ": "
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename I>
+CreateImageRequest<I>::CreateImageRequest(
+ Threads<I>* threads,
+ librados::IoCtx &local_io_ctx,
+ const std::string &global_image_id,
+ const std::string &remote_mirror_uuid,
+ const std::string &local_image_name,
+ const std::string &local_image_id,
+ I *remote_image_ctx,
+ PoolMetaCache* pool_meta_cache,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ Context *on_finish)
+ : m_threads(threads), m_local_io_ctx(local_io_ctx),
+ m_global_image_id(global_image_id),
+ m_remote_mirror_uuid(remote_mirror_uuid),
+ m_local_image_name(local_image_name), m_local_image_id(local_image_id),
+ m_remote_image_ctx(remote_image_ctx),
+ m_pool_meta_cache(pool_meta_cache),
+ m_mirror_image_mode(mirror_image_mode), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void CreateImageRequest<I>::send() {
+ int r = validate_parent();
+ if (r < 0) {
+ error(r);
+ return;
+ }
+
+ if (m_remote_parent_spec.pool_id == -1) {
+ create_image();
+ } else {
+ get_parent_global_image_id();
+ }
+}
+
+template <typename I>
+void CreateImageRequest<I>::create_image() {
+ dout(10) << dendl;
+
+ using klass = CreateImageRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_create_image>(this);
+
+ std::shared_lock image_locker{m_remote_image_ctx->image_lock};
+
+ auto& config{
+ reinterpret_cast<CephContext*>(m_local_io_ctx.cct())->_conf};
+
+ librbd::ImageOptions image_options;
+ populate_image_options(&image_options);
+
+ auto req = librbd::image::CreateRequest<I>::create(
+ config, m_local_io_ctx, m_local_image_name, m_local_image_id,
+ m_remote_image_ctx->size, image_options, 0U, m_mirror_image_mode,
+ m_global_image_id, m_remote_mirror_uuid, m_remote_image_ctx->op_work_queue,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_create_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -EBADF) {
+ dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void CreateImageRequest<I>::get_parent_global_image_id() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_start(&op,
+ m_remote_parent_spec.image_id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_get_parent_global_image_id>(this);
+ m_out_bl.clear();
+ int r = m_remote_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_get_parent_global_image_id(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == 0) {
+ cls::rbd::MirrorImage mirror_image;
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image);
+ if (r == 0) {
+ m_parent_global_image_id = mirror_image.global_image_id;
+ dout(15) << "parent_global_image_id=" << m_parent_global_image_id
+ << dendl;
+ }
+ }
+
+ if (r == -ENOENT) {
+ dout(10) << "parent image " << m_remote_parent_spec.image_id
+ << " not mirrored" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to retrieve global image id for parent image "
+ << m_remote_parent_spec.image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_local_parent_image_id();
+}
+
+template <typename I>
+void CreateImageRequest<I>::get_local_parent_image_id() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_image_id_start(
+ &op, m_parent_global_image_id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_get_local_parent_image_id>(this);
+ m_out_bl.clear();
+ int r = m_local_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_get_local_parent_image_id(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_image_id_finish(
+ &iter, &m_local_parent_spec.image_id);
+ }
+
+ if (r == -ENOENT) {
+ dout(10) << "parent image " << m_parent_global_image_id << " not "
+ << "registered locally" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to retrieve local image id for parent image "
+ << m_parent_global_image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ open_remote_parent_image();
+}
+
+template <typename I>
+void CreateImageRequest<I>::open_remote_parent_image() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_open_remote_parent_image>(this);
+ OpenImageRequest<I> *request = OpenImageRequest<I>::create(
+ m_remote_parent_io_ctx, &m_remote_parent_image_ctx,
+ m_remote_parent_spec.image_id, true, ctx);
+ request->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_open_remote_parent_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to open remote parent image " << m_parent_pool_name << "/"
+ << m_remote_parent_spec.image_id << dendl;
+ finish(r);
+ return;
+ }
+
+ clone_image();
+}
+
+template <typename I>
+void CreateImageRequest<I>::clone_image() {
+ dout(10) << dendl;
+
+ LocalPoolMeta local_parent_pool_meta;
+ int r = m_pool_meta_cache->get_local_pool_meta(
+ m_local_parent_io_ctx.get_id(), &local_parent_pool_meta);
+ if (r < 0) {
+ derr << "failed to retrieve local parent mirror uuid for pool "
+ << m_local_parent_io_ctx.get_id() << dendl;
+ m_ret_val = r;
+ close_remote_parent_image();
+ return;
+ }
+
+ // ensure no image sync snapshots for the local cluster exist in the
+ // remote image
+ bool found_parent_snap = false;
+ bool found_image_sync_snap = false;
+ std::string snap_name;
+ cls::rbd::SnapshotNamespace snap_namespace;
+ {
+ auto snap_prefix = image_sync::util::get_snapshot_name_prefix(
+ local_parent_pool_meta.mirror_uuid);
+
+ std::shared_lock remote_image_locker(m_remote_parent_image_ctx->image_lock);
+ for (auto snap_info : m_remote_parent_image_ctx->snap_info) {
+ if (snap_info.first == m_remote_parent_spec.snap_id) {
+ found_parent_snap = true;
+ snap_name = snap_info.second.name;
+ snap_namespace = snap_info.second.snap_namespace;
+ } else if (boost::starts_with(snap_info.second.name, snap_prefix)) {
+ found_image_sync_snap = true;
+ }
+ }
+ }
+
+ if (!found_parent_snap) {
+ dout(15) << "remote parent image snapshot not found" << dendl;
+ m_ret_val = -ENOENT;
+ close_remote_parent_image();
+ return;
+ } else if (found_image_sync_snap) {
+ dout(15) << "parent image not synced to local cluster" << dendl;
+ m_ret_val = -ENOENT;
+ close_remote_parent_image();
+ return;
+ }
+
+ librbd::ImageOptions opts;
+ populate_image_options(&opts);
+
+ auto& config{
+ reinterpret_cast<CephContext*>(m_local_io_ctx.cct())->_conf};
+
+ using klass = CreateImageRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_clone_image>(this);
+
+ librbd::image::CloneRequest<I> *req = librbd::image::CloneRequest<I>::create(
+ config, m_local_parent_io_ctx, m_local_parent_spec.image_id, snap_name,
+ snap_namespace, CEPH_NOSNAP, m_local_io_ctx, m_local_image_name,
+ m_local_image_id, opts, m_mirror_image_mode, m_global_image_id,
+ m_remote_mirror_uuid, m_remote_image_ctx->op_work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_clone_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -EBADF) {
+ dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+ m_ret_val = r;
+ } else if (r < 0) {
+ derr << "failed to clone image " << m_parent_pool_name << "/"
+ << m_remote_parent_spec.image_id << " to "
+ << m_local_image_name << dendl;
+ m_ret_val = r;
+ }
+
+ close_remote_parent_image();
+}
+
+template <typename I>
+void CreateImageRequest<I>::close_remote_parent_image() {
+ dout(10) << dendl;
+ Context *ctx = create_context_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_close_remote_parent_image>(this);
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ &m_remote_parent_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_close_remote_parent_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "error encountered closing remote parent image: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ finish(m_ret_val);
+}
+
+template <typename I>
+void CreateImageRequest<I>::error(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_threads->work_queue->queue(create_context_callback<
+ CreateImageRequest<I>, &CreateImageRequest<I>::finish>(this), r);
+}
+
+template <typename I>
+void CreateImageRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+template <typename I>
+int CreateImageRequest<I>::validate_parent() {
+ std::shared_lock owner_locker{m_remote_image_ctx->owner_lock};
+ std::shared_lock image_locker{m_remote_image_ctx->image_lock};
+
+ m_remote_parent_spec = m_remote_image_ctx->parent_md.spec;
+
+ // scan all remote snapshots for a linked parent
+ for (auto &snap_info_pair : m_remote_image_ctx->snap_info) {
+ auto &parent_spec = snap_info_pair.second.parent.spec;
+ if (parent_spec.pool_id == -1) {
+ continue;
+ } else if (m_remote_parent_spec.pool_id == -1) {
+ m_remote_parent_spec = parent_spec;
+ continue;
+ }
+
+ if (m_remote_parent_spec != parent_spec) {
+ derr << "remote image parent spec mismatch" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ if (m_remote_parent_spec.pool_id == -1) {
+ return 0;
+ }
+
+ // map remote parent pool to local parent pool
+ int r = librbd::util::create_ioctx(
+ m_remote_image_ctx->md_ctx, "remote parent pool",
+ m_remote_parent_spec.pool_id, m_remote_parent_spec.pool_namespace,
+ &m_remote_parent_io_ctx);
+ if (r < 0) {
+ derr << "failed to open remote parent pool " << m_remote_parent_spec.pool_id
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ m_parent_pool_name = m_remote_parent_io_ctx.get_pool_name();
+
+ librados::Rados local_rados(m_local_io_ctx);
+ r = local_rados.ioctx_create(m_parent_pool_name.c_str(),
+ m_local_parent_io_ctx);
+ if (r < 0) {
+ derr << "failed to open local parent pool " << m_parent_pool_name << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ m_local_parent_io_ctx.set_namespace(m_remote_parent_io_ctx.get_namespace());
+
+ return 0;
+}
+
+template <typename I>
+void CreateImageRequest<I>::populate_image_options(
+ librbd::ImageOptions* image_options) {
+ image_options->set(RBD_IMAGE_OPTION_FEATURES,
+ m_remote_image_ctx->features);
+ image_options->set(RBD_IMAGE_OPTION_ORDER, m_remote_image_ctx->order);
+ image_options->set(RBD_IMAGE_OPTION_STRIPE_UNIT,
+ m_remote_image_ctx->stripe_unit);
+ image_options->set(RBD_IMAGE_OPTION_STRIPE_COUNT,
+ m_remote_image_ctx->stripe_count);
+
+ // Determine the data pool for the local image as follows:
+ // 1. If the local pool has a default data pool, use it.
+ // 2. If the remote image has a data pool different from its metadata pool and
+ // a pool with the same name exists locally, use it.
+ // 3. Don't set the data pool explicitly.
+ std::string data_pool;
+ librados::Rados local_rados(m_local_io_ctx);
+ auto default_data_pool = g_ceph_context->_conf.get_val<std::string>("rbd_default_data_pool");
+ auto remote_md_pool = m_remote_image_ctx->md_ctx.get_pool_name();
+ auto remote_data_pool = m_remote_image_ctx->data_ctx.get_pool_name();
+
+ if (default_data_pool != "") {
+ data_pool = default_data_pool;
+ } else if (remote_data_pool != remote_md_pool) {
+ if (local_rados.pool_lookup(remote_data_pool.c_str()) >= 0) {
+ data_pool = remote_data_pool;
+ }
+ }
+
+ if (data_pool != "") {
+ image_options->set(RBD_IMAGE_OPTION_DATA_POOL, data_pool);
+ }
+
+ if (m_remote_parent_spec.pool_id != -1) {
+ uint64_t clone_format = 1;
+ if (m_remote_image_ctx->test_op_features(
+ RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+ clone_format = 2;
+ }
+ image_options->set(RBD_IMAGE_OPTION_CLONE_FORMAT, clone_format);
+ }
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::CreateImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h
new file mode 100644
index 000000000..2ff7794e8
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Types.h"
+#include <string>
+
+class Context;
+namespace librbd { class ImageCtx; }
+namespace librbd { class ImageOptions; }
+
+namespace rbd {
+namespace mirror {
+
+class PoolMetaCache;
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CreateImageRequest {
+public:
+ static CreateImageRequest *create(
+ Threads<ImageCtxT> *threads,
+ librados::IoCtx &local_io_ctx,
+ const std::string &global_image_id,
+ const std::string &remote_mirror_uuid,
+ const std::string &local_image_name,
+ const std::string &local_image_id,
+ ImageCtxT *remote_image_ctx,
+ PoolMetaCache* pool_meta_cache,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ Context *on_finish) {
+ return new CreateImageRequest(threads, local_io_ctx, global_image_id,
+ remote_mirror_uuid, local_image_name,
+ local_image_id, remote_image_ctx,
+ pool_meta_cache, mirror_image_mode,
+ on_finish);
+ }
+
+ CreateImageRequest(
+ Threads<ImageCtxT> *threads, librados::IoCtx &local_io_ctx,
+ const std::string &global_image_id,
+ const std::string &remote_mirror_uuid,
+ const std::string &local_image_name,
+ const std::string &local_image_id,
+ ImageCtxT *remote_image_ctx,
+ PoolMetaCache* pool_meta_cache,
+ cls::rbd::MirrorImageMode mirror_image_mode,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * | (non-clone) *
+ * |\------------> CREATE_IMAGE ---------------------\ * (error)
+ * | | *
+ * | (clone) | *
+ * \-------------> GET_PARENT_GLOBAL_IMAGE_ID * * * | * * * *
+ * | | * *
+ * v | *
+ * GET_LOCAL_PARENT_IMAGE_ID * * * * | * * * *
+ * | | * *
+ * v | *
+ * OPEN_REMOTE_PARENT * * * * * * * | * * * *
+ * | | * *
+ * v | *
+ * CLONE_IMAGE | *
+ * | | *
+ * v | *
+ * CLOSE_REMOTE_PARENT | *
+ * | v *
+ * \------------------------> <finish> < * *
+ * @endverbatim
+ */
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx &m_local_io_ctx;
+ std::string m_global_image_id;
+ std::string m_remote_mirror_uuid;
+ std::string m_local_image_name;
+ std::string m_local_image_id;
+ ImageCtxT *m_remote_image_ctx;
+ PoolMetaCache* m_pool_meta_cache;
+ cls::rbd::MirrorImageMode m_mirror_image_mode;
+ Context *m_on_finish;
+
+ librados::IoCtx m_remote_parent_io_ctx;
+ ImageCtxT *m_remote_parent_image_ctx = nullptr;
+ cls::rbd::ParentImageSpec m_remote_parent_spec;
+
+ librados::IoCtx m_local_parent_io_ctx;
+ cls::rbd::ParentImageSpec m_local_parent_spec;
+
+ bufferlist m_out_bl;
+ std::string m_parent_global_image_id;
+ std::string m_parent_pool_name;
+ int m_ret_val = 0;
+
+ void create_image();
+ void handle_create_image(int r);
+
+ void get_parent_global_image_id();
+ void handle_get_parent_global_image_id(int r);
+
+ void get_local_parent_image_id();
+ void handle_get_local_parent_image_id(int r);
+
+ void open_remote_parent_image();
+ void handle_open_remote_parent_image(int r);
+
+ void clone_image();
+ void handle_clone_image(int r);
+
+ void close_remote_parent_image();
+ void handle_close_remote_parent_image(int r);
+
+ void error(int r);
+ void finish(int r);
+
+ int validate_parent();
+
+ void populate_image_options(librbd::ImageOptions* image_options);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::CreateImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc
new file mode 100644
index 000000000..74e975373
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "GetMirrorImageIdRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::send() {
+ dout(20) << dendl;
+ get_image_id();
+}
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::get_image_id() {
+ dout(20) << dendl;
+
+ // attempt to cross-reference an image id by the global image id
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ GetMirrorImageIdRequest<I>,
+ &GetMirrorImageIdRequest<I>::handle_get_image_id>(
+ this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::handle_get_image_id(int r) {
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_image_id_finish(
+ &iter, m_image_id);
+ }
+
+ dout(20) << "r=" << r << ", "
+ << "image_id=" << *m_image_id << dendl;
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << "global image " << m_global_image_id << " not registered"
+ << dendl;
+ } else {
+ derr << "failed to retrieve image id: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::finish(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::GetMirrorImageIdRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h
new file mode 100644
index 000000000..b26645138
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+
+namespace librbd { struct ImageCtx; }
+
+struct Context;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetMirrorImageIdRequest {
+public:
+ static GetMirrorImageIdRequest *create(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *image_id,
+ Context *on_finish) {
+ return new GetMirrorImageIdRequest(io_ctx, global_image_id, image_id,
+ on_finish);
+ }
+
+ GetMirrorImageIdRequest(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *image_id,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_global_image_id(global_image_id),
+ m_image_id(image_id), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_IMAGE_ID
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_global_image_id;
+ std::string *m_image_id;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ void get_image_id();
+ void handle_get_image_id(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::GetMirrorImageIdRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc
new file mode 100644
index 000000000..e6ab382be
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "OpenImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenImageRequest: " \
+ << this << " " << __func__ << " "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+OpenImageRequest<I>::OpenImageRequest(librados::IoCtx &io_ctx, I **image_ctx,
+ const std::string &image_id,
+ bool read_only, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(image_id),
+ m_read_only(read_only), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void OpenImageRequest<I>::send() {
+ send_open_image();
+}
+
+template <typename I>
+void OpenImageRequest<I>::send_open_image() {
+ dout(20) << dendl;
+
+ *m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, m_read_only);
+
+ if (!m_read_only) {
+ // ensure non-primary images can be modified
+ (*m_image_ctx)->read_only_mask = ~librbd::IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+ }
+
+ Context *ctx = create_context_callback<
+ OpenImageRequest<I>, &OpenImageRequest<I>::handle_open_image>(
+ this);
+ (*m_image_ctx)->state->open(0, ctx);
+}
+
+template <typename I>
+void OpenImageRequest<I>::handle_open_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to open image '" << m_image_id << "': "
+ << cpp_strerror(r) << dendl;
+ *m_image_ctx = nullptr;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void OpenImageRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::OpenImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h
new file mode 100644
index 000000000..01ab31171
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class Context;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class OpenImageRequest {
+public:
+ static OpenImageRequest* create(librados::IoCtx &io_ctx,
+ ImageCtxT **image_ctx,
+ const std::string &image_id,
+ bool read_only, Context *on_finish) {
+ return new OpenImageRequest(io_ctx, image_ctx, image_id, read_only,
+ on_finish);
+ }
+
+ OpenImageRequest(librados::IoCtx &io_ctx, ImageCtxT **image_ctx,
+ const std::string &image_id, bool read_only,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ librados::IoCtx &m_io_ctx;
+ ImageCtxT **m_image_ctx;
+ std::string m_image_id;
+ bool m_read_only;
+ Context *m_on_finish;
+
+ void send_open_image();
+ void handle_open_image(int r);
+
+ void send_close_image(int r);
+ void handle_close_image(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::OpenImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
new file mode 100644
index 000000000..7f8d9608e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
@@ -0,0 +1,292 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "CloseImageRequest.h"
+#include "OpenLocalImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/journal/Policy.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenLocalImageRequest: " \
+ << this << " " << __func__ << " "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+namespace {
+
+template <typename I>
+struct MirrorExclusiveLockPolicy : public librbd::exclusive_lock::Policy {
+ I *image_ctx;
+
+ MirrorExclusiveLockPolicy(I *image_ctx) : image_ctx(image_ctx) {
+ }
+
+ bool may_auto_request_lock() override {
+ return false;
+ }
+
+ int lock_requested(bool force) override {
+ int r = -EROFS;
+ {
+ std::shared_lock owner_locker{image_ctx->owner_lock};
+ std::shared_lock image_locker{image_ctx->image_lock};
+ if (image_ctx->journal == nullptr || image_ctx->journal->is_tag_owner()) {
+ r = 0;
+ }
+ }
+
+ if (r == 0) {
+ // if the local image journal has been closed or if it was (force)
+ // promoted allow the lock to be released to another client
+ image_ctx->exclusive_lock->release_lock(nullptr);
+ }
+ return r;
+ }
+
+ bool accept_blocked_request(
+ librbd::exclusive_lock::OperationRequestType request_type) override {
+ switch (request_type) {
+ case librbd::exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE:
+ case librbd::exclusive_lock::OPERATION_REQUEST_TYPE_FORCE_PROMOTION:
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+
+struct MirrorJournalPolicy : public librbd::journal::Policy {
+ librbd::asio::ContextWQ *work_queue;
+
+ MirrorJournalPolicy(librbd::asio::ContextWQ *work_queue)
+ : work_queue(work_queue) {
+ }
+
+ bool append_disabled() const override {
+ // avoid recording any events to the local journal
+ return true;
+ }
+ bool journal_disabled() const override {
+ return false;
+ }
+
+ void allocate_tag_on_lock(Context *on_finish) override {
+ // rbd-mirror will manually create tags by copying them from the peer
+ work_queue->queue(on_finish, 0);
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+OpenLocalImageRequest<I>::OpenLocalImageRequest(
+ librados::IoCtx &local_io_ctx,
+ I **local_image_ctx,
+ const std::string &local_image_id,
+ librbd::asio::ContextWQ *work_queue,
+ Context *on_finish)
+ : m_local_io_ctx(local_io_ctx), m_local_image_ctx(local_image_ctx),
+ m_local_image_id(local_image_id), m_work_queue(work_queue),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send() {
+ send_open_image();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_open_image() {
+ dout(20) << dendl;
+
+ *m_local_image_ctx = I::create("", m_local_image_id, nullptr,
+ m_local_io_ctx, false);
+
+ // ensure non-primary images can be modified
+ (*m_local_image_ctx)->read_only_mask =
+ ~librbd::IMAGE_READ_ONLY_FLAG_NON_PRIMARY;
+
+ {
+ std::scoped_lock locker{(*m_local_image_ctx)->owner_lock,
+ (*m_local_image_ctx)->image_lock};
+ (*m_local_image_ctx)->set_exclusive_lock_policy(
+ new MirrorExclusiveLockPolicy<I>(*m_local_image_ctx));
+ (*m_local_image_ctx)->set_journal_policy(
+ new MirrorJournalPolicy(m_work_queue));
+ }
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_open_image>(
+ this);
+ (*m_local_image_ctx)->state->open(0, ctx);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_open_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << ": local image does not exist" << dendl;
+ } else {
+ derr << ": failed to open image '" << m_local_image_id << "': "
+ << cpp_strerror(r) << dendl;
+ }
+ *m_local_image_ctx = nullptr;
+ finish(r);
+ return;
+ }
+
+ send_get_mirror_info();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_get_mirror_info() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>,
+ &OpenLocalImageRequest<I>::handle_get_mirror_info>(
+ this);
+ auto request = librbd::mirror::GetInfoRequest<I>::create(
+ **m_local_image_ctx, &m_mirror_image, &m_promotion_state,
+ &m_primary_mirror_uuid, ctx);
+ request->send();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_get_mirror_info(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(5) << ": local image is not mirrored" << dendl;
+ send_close_image(r);
+ return;
+ } else if (r < 0) {
+ derr << ": error querying local image primary status: " << cpp_strerror(r)
+ << dendl;
+ send_close_image(r);
+ return;
+ }
+
+ if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) {
+ dout(5) << ": local image mirroring is being disabled" << dendl;
+ send_close_image(-ENOENT);
+ return;
+ }
+
+ // if the local image owns the tag -- don't steal the lock since
+ // we aren't going to mirror peer data into this image anyway
+ if (m_promotion_state == librbd::mirror::PROMOTION_STATE_PRIMARY) {
+ dout(10) << ": local image is primary -- skipping image replay" << dendl;
+ send_close_image(-EREMOTEIO);
+ return;
+ }
+
+ send_lock_image();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_lock_image() {
+ std::shared_lock owner_locker{(*m_local_image_ctx)->owner_lock};
+ if ((*m_local_image_ctx)->exclusive_lock == nullptr) {
+ owner_locker.unlock();
+ if (m_mirror_image.mode == cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT) {
+ finish(0);
+ } else {
+ derr << ": image does not support exclusive lock" << dendl;
+ send_close_image(-EINVAL);
+ }
+ return;
+ }
+
+ dout(20) << dendl;
+
+ // disallow any proxied maintenance operations before grabbing lock
+ (*m_local_image_ctx)->exclusive_lock->block_requests(-EROFS);
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_lock_image>(
+ this);
+
+ (*m_local_image_ctx)->exclusive_lock->acquire_lock(ctx);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_lock_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to lock image '" << m_local_image_id << "': "
+ << cpp_strerror(r) << dendl;
+ send_close_image(r);
+ return;
+ }
+
+ {
+ std::shared_lock owner_locker{(*m_local_image_ctx)->owner_lock};
+ if ((*m_local_image_ctx)->exclusive_lock == nullptr ||
+ !(*m_local_image_ctx)->exclusive_lock->is_lock_owner()) {
+ derr << ": image is not locked" << dendl;
+ send_close_image(-EBUSY);
+ return;
+ }
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_close_image(int r) {
+ dout(20) << dendl;
+
+ if (m_ret_val == 0 && r < 0) {
+ m_ret_val = r;
+ }
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_close_image>(
+ this);
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ m_local_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_close_image(int r) {
+ dout(20) << dendl;
+
+ ceph_assert(r == 0);
+ finish(m_ret_val);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::OpenLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
new file mode 100644
index 000000000..9a642bc39
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+class Context;
+namespace librbd {
+class ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class OpenLocalImageRequest {
+public:
+ static OpenLocalImageRequest* create(librados::IoCtx &local_io_ctx,
+ ImageCtxT **local_image_ctx,
+ const std::string &local_image_id,
+ librbd::asio::ContextWQ *work_queue,
+ Context *on_finish) {
+ return new OpenLocalImageRequest(local_io_ctx, local_image_ctx,
+ local_image_id, work_queue, on_finish);
+ }
+
+ OpenLocalImageRequest(librados::IoCtx &local_io_ctx,
+ ImageCtxT **local_image_ctx,
+ const std::string &local_image_id,
+ librbd::asio::ContextWQ *work_queue,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE * * * * * * * *
+ * | *
+ * v *
+ * GET_MIRROR_INFO * * * * *
+ * | *
+ * v (skip if primary) v
+ * LOCK_IMAGE * * * > CLOSE_IMAGE
+ * | |
+ * v |
+ * <finish> <---------------/
+ *
+ * @endverbatim
+ */
+ librados::IoCtx &m_local_io_ctx;
+ ImageCtxT **m_local_image_ctx;
+ std::string m_local_image_id;
+ librbd::asio::ContextWQ *m_work_queue;
+ Context *m_on_finish;
+
+ cls::rbd::MirrorImage m_mirror_image;
+ librbd::mirror::PromotionState m_promotion_state =
+ librbd::mirror::PROMOTION_STATE_NON_PRIMARY;
+ std::string m_primary_mirror_uuid;
+ int m_ret_val = 0;
+
+ void send_open_image();
+ void handle_open_image(int r);
+
+ void send_get_mirror_info();
+ void handle_get_mirror_info(int r);
+
+ void send_lock_image();
+ void handle_lock_image(int r);
+
+ void send_close_image(int r);
+ void handle_close_image(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::OpenLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
new file mode 100644
index 000000000..b1fef7254
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "tools/rbd_mirror/ImageDeleter.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "PrepareLocalImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void PrepareLocalImageRequest<I>::send() {
+ dout(10) << dendl;
+ get_local_image_id();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_id() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_local_image_id>(this);
+ auto req = GetMirrorImageIdRequest<I>::create(m_io_ctx, m_global_image_id,
+ &m_local_image_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_id(int r) {
+ dout(10) << "r=" << r << ", "
+ << "local_image_id=" << m_local_image_id << dendl;
+
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ get_local_image_name();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_name() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::dir_get_name_start(&op, m_local_image_id);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_local_image_name>(this);
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_name(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::dir_get_name_finish(&it, m_local_image_name);
+ }
+
+ if (r == -ENOENT) {
+    // proceed anyway: we should have a mirror image record if we got this far
+ dout(10) << "image does not exist for local image id " << m_local_image_id
+ << dendl;
+ *m_local_image_name = "";
+ } else if (r < 0) {
+ derr << "failed to retrieve image name: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_mirror_info();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_mirror_info() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_mirror_info>(this);
+ auto req = librbd::mirror::GetInfoRequest<I>::create(
+ m_io_ctx, m_work_queue, m_local_image_id, &m_mirror_image,
+ &m_promotion_state, &m_primary_mirror_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_mirror_info(int r) {
+  dout(10) << ": r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to retrieve local mirror image info: " << cpp_strerror(r)
+         << dendl;
+    finish(r);
+    return;
+  }
+
+  if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_CREATING) {
+    dout(5) << "local image is still in creating state, issuing a removal"
+            << dendl;
+    move_to_trash();
+    return;
+  } else if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) {
+    dout(5) << "local image mirroring is in disabling state" << dendl;
+    finish(-ERESTART);
+    return;
+  }
+
+  switch (m_mirror_image.mode) {
+  case cls::rbd::MIRROR_IMAGE_MODE_JOURNAL:
+    // journal-based local image exists
+    {
+      auto state_builder = journal::StateBuilder<I>::create(m_global_image_id);
+      state_builder->local_primary_mirror_uuid = m_primary_mirror_uuid;
+      *m_state_builder = state_builder;
+    }
+    break;
+  case cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT:
+    // snapshot-based local image exists
+    *m_state_builder = snapshot::StateBuilder<I>::create(m_global_image_id);
+    break;
+  default:
+    derr << "unsupported mirror image mode " << m_mirror_image.mode << " "
+         << "for image " << m_global_image_id << dendl;
+    finish(-EOPNOTSUPP);
+    return;  // finish() deleted 'this'; falling through would deref freed memory
+  }
+
+  dout(10) << "local_image_id=" << m_local_image_id << ", "
+           << "local_promotion_state=" << m_promotion_state << ", "
+           << "local_primary_mirror_uuid=" << m_primary_mirror_uuid << dendl;
+  (*m_state_builder)->local_image_id = m_local_image_id;
+  (*m_state_builder)->local_promotion_state = m_promotion_state;
+  finish(0);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::move_to_trash() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_move_to_trash>(this);
+ ImageDeleter<I>::trash_move(m_io_ctx, m_global_image_id,
+ false, m_work_queue, ctx);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_move_to_trash(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ finish(-ENOENT);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
new file mode 100644
index 000000000..6372169ff
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+#include <string>
+
+struct Context;
+
+namespace librbd {
+struct ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename> class StateBuilder;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PrepareLocalImageRequest {
+public:
+  static PrepareLocalImageRequest *create(
+      librados::IoCtx &io_ctx,
+      const std::string &global_image_id,
+      std::string *local_image_name,
+      StateBuilder<ImageCtxT>** state_builder,
+      librbd::asio::ContextWQ *work_queue,
+      Context *on_finish) {
+    return new PrepareLocalImageRequest(io_ctx, global_image_id,
+                                        local_image_name, state_builder,
+                                        work_queue, on_finish);
+  }
+
+  PrepareLocalImageRequest(
+      librados::IoCtx &io_ctx,
+      const std::string &global_image_id,
+      std::string *local_image_name,
+      StateBuilder<ImageCtxT>** state_builder,
+      librbd::asio::ContextWQ *work_queue,
+      Context *on_finish)
+    : m_io_ctx(io_ctx), m_global_image_id(global_image_id),
+      m_local_image_name(local_image_name), m_state_builder(state_builder),
+      m_work_queue(work_queue), m_on_finish(on_finish) {
+  }
+
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * GET_LOCAL_IMAGE_ID
+   *    |
+   *    v
+   * GET_LOCAL_IMAGE_NAME
+   *    |
+   *    v
+   * GET_MIRROR_INFO
+   *    |
+   *    | (if the image mirror state is CREATING)
+   *    v
+   * TRASH_MOVE
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  librados::IoCtx &m_io_ctx;
+  std::string m_global_image_id;
+  std::string *m_local_image_name;
+  StateBuilder<ImageCtxT>** m_state_builder;
+  librbd::asio::ContextWQ *m_work_queue;
+  Context *m_on_finish;
+
+  bufferlist m_out_bl;
+  std::string m_local_image_id;
+  cls::rbd::MirrorImage m_mirror_image;
+  librbd::mirror::PromotionState m_promotion_state = librbd::mirror::PROMOTION_STATE_UNKNOWN;  // explicit init, matches PrepareRemoteImageRequest
+  std::string m_primary_mirror_uuid;
+
+  void get_local_image_id();
+  void handle_get_local_image_id(int r);
+
+  void get_local_image_name();
+  void handle_get_local_image_name(int r);
+
+  void get_mirror_info();
+  void handle_get_mirror_info(int r);
+
+  void move_to_trash();
+  void handle_move_to_trash(int r);
+
+  void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc
new file mode 100644
index 000000000..45a44a300
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc
@@ -0,0 +1,283 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/mirror/GetInfoRequest.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "PrepareRemoteImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::send() {
+ if (*m_state_builder != nullptr) {
+ (*m_state_builder)->remote_mirror_uuid = m_remote_pool_meta.mirror_uuid;
+ auto state_builder = dynamic_cast<snapshot::StateBuilder<I>*>(*m_state_builder);
+ if (state_builder) {
+ state_builder->remote_mirror_peer_uuid = m_remote_pool_meta.mirror_peer_uuid;
+ }
+ }
+
+ get_remote_image_id();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::get_remote_image_id() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_get_remote_image_id>(this);
+ auto req = GetMirrorImageIdRequest<I>::create(m_remote_io_ctx,
+ m_global_image_id,
+ &m_remote_image_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_get_remote_image_id(int r) {
+ dout(10) << "r=" << r << ", "
+ << "remote_image_id=" << m_remote_image_id << dendl;
+
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ get_mirror_info();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::get_mirror_info() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_get_mirror_info>(this);
+ auto req = librbd::mirror::GetInfoRequest<I>::create(
+ m_remote_io_ctx, m_threads->work_queue, m_remote_image_id,
+ &m_mirror_image, &m_promotion_state, &m_primary_mirror_uuid,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_get_mirror_info(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(10) << "image " << m_global_image_id << " not mirrored" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to retrieve mirror image details for image "
+ << m_global_image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ auto state_builder = *m_state_builder;
+ if (state_builder != nullptr &&
+ state_builder->get_mirror_image_mode() != m_mirror_image.mode) {
+ derr << "local and remote mirror image using different mirroring modes "
+ << "for image " << m_global_image_id << ": split-brain" << dendl;
+ finish(-EEXIST);
+ return;
+ } else if (m_mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) {
+ dout(5) << "remote image mirroring is being disabled" << dendl;
+ finish(-ENOENT);
+ return;
+ }
+
+ switch (m_mirror_image.mode) {
+ case cls::rbd::MIRROR_IMAGE_MODE_JOURNAL:
+ get_client();
+ break;
+ case cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT:
+ finalize_snapshot_state_builder();
+ finish(0);
+ break;
+ default:
+ derr << "unsupported mirror image mode " << m_mirror_image.mode << " "
+ << "for image " << m_global_image_id << dendl;
+ finish(-EOPNOTSUPP);
+ break;
+ }
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::get_client() {
+ dout(10) << dendl;
+
+ auto cct = static_cast<CephContext *>(m_local_io_ctx.cct());
+ ::journal::Settings journal_settings;
+ journal_settings.commit_interval = cct->_conf.get_val<double>(
+ "rbd_mirror_journal_commit_age");
+
+ // TODO use Journal thread pool for journal ops until converted to ASIO
+ ContextWQ* context_wq;
+ librbd::Journal<>::get_work_queue(cct, &context_wq);
+
+ ceph_assert(m_remote_journaler == nullptr);
+ m_remote_journaler = new Journaler(context_wq, m_threads->timer,
+ &m_threads->timer_lock, m_remote_io_ctx,
+ m_remote_image_id, m_local_mirror_uuid,
+ journal_settings, m_cache_manager_handler);
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_get_client>(this));
+ m_remote_journaler->get_client(m_local_mirror_uuid, &m_client, ctx);
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_get_client(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ MirrorPeerClientMeta client_meta;
+ if (r == -ENOENT) {
+ dout(10) << "client not registered" << dendl;
+ register_client();
+ } else if (r < 0) {
+ derr << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ finish(r);
+ } else if (!util::decode_client_meta(m_client, &client_meta)) {
+ // require operator intervention since the data is corrupt
+ finish(-EBADMSG);
+ } else {
+ // skip registration if it already exists
+ finalize_journal_state_builder(m_client.state, client_meta);
+ finish(0);
+ }
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::register_client() {
+ dout(10) << dendl;
+
+ auto state_builder = *m_state_builder;
+ librbd::journal::MirrorPeerClientMeta client_meta{
+ (state_builder == nullptr ? "" : state_builder->local_image_id)};
+ client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ librbd::journal::ClientData client_data{client_meta};
+ bufferlist client_data_bl;
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_register_client>(this));
+ m_remote_journaler->register_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_register_client(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to register with remote journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ auto state_builder = *m_state_builder;
+ librbd::journal::MirrorPeerClientMeta client_meta{
+ (state_builder == nullptr ? "" : state_builder->local_image_id)};
+ client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+ finalize_journal_state_builder(cls::journal::CLIENT_STATE_CONNECTED,
+ client_meta);
+ finish(0);
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::finalize_journal_state_builder(
+ cls::journal::ClientState client_state,
+ const MirrorPeerClientMeta& client_meta) {
+ journal::StateBuilder<I>* state_builder = nullptr;
+ if (*m_state_builder != nullptr) {
+ // already verified that it's a matching builder in
+ // 'handle_get_mirror_info'
+ state_builder = dynamic_cast<journal::StateBuilder<I>*>(*m_state_builder);
+ ceph_assert(state_builder != nullptr);
+ } else {
+ state_builder = journal::StateBuilder<I>::create(m_global_image_id);
+ *m_state_builder = state_builder;
+ }
+
+ state_builder->remote_mirror_uuid = m_remote_pool_meta.mirror_uuid;
+ state_builder->remote_image_id = m_remote_image_id;
+ state_builder->remote_promotion_state = m_promotion_state;
+ state_builder->remote_journaler = m_remote_journaler;
+ state_builder->remote_client_state = client_state;
+ state_builder->remote_client_meta = client_meta;
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::finalize_snapshot_state_builder() {
+ snapshot::StateBuilder<I>* state_builder = nullptr;
+ if (*m_state_builder != nullptr) {
+ state_builder = dynamic_cast<snapshot::StateBuilder<I>*>(*m_state_builder);
+ ceph_assert(state_builder != nullptr);
+ } else {
+ state_builder = snapshot::StateBuilder<I>::create(m_global_image_id);
+ *m_state_builder = state_builder;
+ }
+
+ dout(10) << "remote_mirror_uuid=" << m_remote_pool_meta.mirror_uuid << ", "
+ << "remote_mirror_peer_uuid="
+ << m_remote_pool_meta.mirror_peer_uuid << ", "
+ << "remote_image_id=" << m_remote_image_id << ", "
+ << "remote_promotion_state=" << m_promotion_state << dendl;
+ state_builder->remote_mirror_uuid = m_remote_pool_meta.mirror_uuid;
+ state_builder->remote_mirror_peer_uuid = m_remote_pool_meta.mirror_peer_uuid;
+ state_builder->remote_image_id = m_remote_image_id;
+ state_builder->remote_promotion_state = m_promotion_state;
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ delete m_remote_journaler;
+ m_remote_journaler = nullptr;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::PrepareRemoteImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h
new file mode 100644
index 000000000..483cfc001
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "librbd/mirror/Types.h"
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+namespace journal { class Journaler; }
+namespace journal { struct CacheManagerHandler; }
+namespace librbd { struct ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+struct Context;
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+template <typename> class StateBuilder;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PrepareRemoteImageRequest {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
+
+ static PrepareRemoteImageRequest *create(
+ Threads<ImageCtxT> *threads,
+ librados::IoCtx &local_io_ctx,
+ librados::IoCtx &remote_io_ctx,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const RemotePoolMeta& remote_pool_meta,
+ ::journal::CacheManagerHandler *cache_manager_handler,
+ StateBuilder<ImageCtxT>** state_builder,
+ Context *on_finish) {
+ return new PrepareRemoteImageRequest(threads, local_io_ctx, remote_io_ctx,
+ global_image_id, local_mirror_uuid,
+ remote_pool_meta,
+ cache_manager_handler, state_builder,
+ on_finish);
+ }
+
+ PrepareRemoteImageRequest(
+ Threads<ImageCtxT> *threads,
+ librados::IoCtx &local_io_ctx,
+ librados::IoCtx &remote_io_ctx,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const RemotePoolMeta& remote_pool_meta,
+ ::journal::CacheManagerHandler *cache_manager_handler,
+ StateBuilder<ImageCtxT>** state_builder,
+ Context *on_finish)
+ : m_threads(threads),
+ m_local_io_ctx(local_io_ctx),
+ m_remote_io_ctx(remote_io_ctx),
+ m_global_image_id(global_image_id),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_remote_pool_meta(remote_pool_meta),
+ m_cache_manager_handler(cache_manager_handler),
+ m_state_builder(state_builder),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_REMOTE_IMAGE_ID
+ * |
+ * v
+ * GET_REMOTE_MIRROR_INFO
+ * |
+ * | (journal)
+ * \-----------> GET_CLIENT
+ * | |
+ * | v (skip if not needed)
+ * | REGISTER_CLIENT
+ * | |
+ * | |
+ * |/----------------/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx &m_local_io_ctx;
+ librados::IoCtx &m_remote_io_ctx;
+ std::string m_global_image_id;
+ std::string m_local_mirror_uuid;
+ RemotePoolMeta m_remote_pool_meta;
+ ::journal::CacheManagerHandler *m_cache_manager_handler;
+ StateBuilder<ImageCtxT>** m_state_builder;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ std::string m_remote_image_id;
+ cls::rbd::MirrorImage m_mirror_image;
+ librbd::mirror::PromotionState m_promotion_state =
+ librbd::mirror::PROMOTION_STATE_UNKNOWN;
+ std::string m_primary_mirror_uuid;
+
+ // journal-based mirroring
+ Journaler *m_remote_journaler = nullptr;
+ cls::journal::Client m_client;
+
+ void get_remote_image_id();
+ void handle_get_remote_image_id(int r);
+
+ void get_mirror_info();
+ void handle_get_mirror_info(int r);
+
+ void get_client();
+ void handle_get_client(int r);
+
+ void register_client();
+ void handle_register_client(int r);
+
+ void finalize_journal_state_builder(cls::journal::ClientState client_state,
+ const MirrorPeerClientMeta& client_meta);
+ void finalize_snapshot_state_builder();
+
+ void finish(int r);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::PrepareRemoteImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/Replayer.h b/src/tools/rbd_mirror/image_replayer/Replayer.h
new file mode 100644
index 000000000..f3bfa4da0
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Replayer.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_REPLAYER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_REPLAYER_H
+
+#include <string>
+
+struct Context;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+struct Replayer {
+ virtual ~Replayer() {}
+
+ virtual void destroy() = 0;
+
+ virtual void init(Context* on_finish) = 0;
+ virtual void shut_down(Context* on_finish) = 0;
+
+ virtual void flush(Context* on_finish) = 0;
+
+ virtual bool get_replay_status(std::string* description,
+ Context* on_finish) = 0;
+
+ virtual bool is_replaying() const = 0;
+ virtual bool is_resync_requested() const = 0;
+
+ virtual int get_error_code() const = 0;
+ virtual std::string get_error_description() const = 0;
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_REPLAYER_H
diff --git a/src/tools/rbd_mirror/image_replayer/ReplayerListener.h b/src/tools/rbd_mirror/image_replayer/ReplayerListener.h
new file mode 100644
index 000000000..f17f401b1
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/ReplayerListener.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_REPLAYER_LISTENER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_REPLAYER_LISTENER_H
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+struct ReplayerListener {
+ virtual ~ReplayerListener() {}
+
+ virtual void handle_notification() = 0;
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_REPLAYER_LISTENER_H
diff --git a/src/tools/rbd_mirror/image_replayer/StateBuilder.cc b/src/tools/rbd_mirror/image_replayer/StateBuilder.cc
new file mode 100644
index 000000000..55fb3509d
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/StateBuilder.cc
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "StateBuilder.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
+#include "tools/rbd_mirror/image_sync/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "StateBuilder: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename I>
+StateBuilder<I>::StateBuilder(const std::string& global_image_id)
+ : global_image_id(global_image_id) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+}
+
+template <typename I>
+StateBuilder<I>::~StateBuilder() {
+ ceph_assert(local_image_ctx == nullptr);
+ ceph_assert(remote_image_ctx == nullptr);
+ ceph_assert(m_sync_point_handler == nullptr);
+}
+
+template <typename I>
+bool StateBuilder<I>::is_local_primary() const {
+ if (local_promotion_state == librbd::mirror::PROMOTION_STATE_PRIMARY) {
+ ceph_assert(!local_image_id.empty());
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+bool StateBuilder<I>::is_remote_primary() const {
+ if (remote_promotion_state == librbd::mirror::PROMOTION_STATE_PRIMARY) {
+ ceph_assert(!remote_image_id.empty());
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+bool StateBuilder<I>::is_linked() const {
+ if (local_promotion_state == librbd::mirror::PROMOTION_STATE_NON_PRIMARY) {
+ ceph_assert(!local_image_id.empty());
+ return is_linked_impl();
+ }
+ return false;
+}
+
+template <typename I>
+void StateBuilder<I>::close_local_image(Context* on_finish) {
+ if (local_image_ctx == nullptr) {
+ on_finish->complete(0);
+ return;
+ }
+
+ dout(10) << dendl;
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_close_local_image(r, on_finish);
+ });
+ auto request = image_replayer::CloseImageRequest<I>::create(
+ &local_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void StateBuilder<I>::handle_close_local_image(int r, Context* on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(local_image_ctx == nullptr);
+ if (r < 0) {
+ derr << "failed to close local image for image " << global_image_id << ": "
+ << cpp_strerror(r) << dendl;
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void StateBuilder<I>::close_remote_image(Context* on_finish) {
+ if (remote_image_ctx == nullptr) {
+ on_finish->complete(0);
+ return;
+ }
+
+ dout(10) << dendl;
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_close_remote_image(r, on_finish);
+ });
+ auto request = image_replayer::CloseImageRequest<I>::create(
+ &remote_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void StateBuilder<I>::handle_close_remote_image(int r, Context* on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(remote_image_ctx == nullptr);
+ if (r < 0) {
+ derr << "failed to close remote image for image " << global_image_id << ": "
+ << cpp_strerror(r) << dendl;
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void StateBuilder<I>::destroy_sync_point_handler() {
+ if (m_sync_point_handler == nullptr) {
+ return;
+ }
+
+ dout(15) << dendl;
+ m_sync_point_handler->destroy();
+ m_sync_point_handler = nullptr;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::StateBuilder<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/StateBuilder.h b/src/tools/rbd_mirror/image_replayer/StateBuilder.h
new file mode 100644
index 000000000..51cf8668c
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/StateBuilder.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_STATE_BUILDER_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_STATE_BUILDER_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/Types.h"
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+struct BaseRequest;
+template <typename> class InstanceWatcher;
+struct PoolMetaCache;
+struct ProgressContext;
+template <typename> class Threads;
+
+namespace image_sync { struct SyncPointHandler; }
+
+namespace image_replayer {
+
+struct Replayer;
+struct ReplayerListener;
+
+template <typename ImageCtxT>
+class StateBuilder {
+public:
+ StateBuilder(const StateBuilder&) = delete;
+ StateBuilder& operator=(const StateBuilder&) = delete;
+
+ virtual ~StateBuilder();
+
+ virtual void destroy() {
+ delete this;
+ }
+
+ virtual void close(Context* on_finish) = 0;
+
+ virtual bool is_disconnected() const = 0;
+
+ bool is_local_primary() const;
+ bool is_remote_primary() const;
+ bool is_linked() const;
+
+ virtual cls::rbd::MirrorImageMode get_mirror_image_mode() const = 0;
+
+ virtual image_sync::SyncPointHandler* create_sync_point_handler() = 0;
+ void destroy_sync_point_handler();
+
+ virtual bool replay_requires_remote_image() const = 0;
+
+ void close_remote_image(Context* on_finish);
+
+ virtual BaseRequest* create_local_image_request(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ Context* on_finish) = 0;
+
+ virtual BaseRequest* create_prepare_replay_request(
+ const std::string& local_mirror_uuid,
+ ProgressContext* progress_ctx,
+ bool* resync_requested,
+ bool* syncing,
+ Context* on_finish) = 0;
+
+ virtual Replayer* create_replayer(
+ Threads<ImageCtxT>* threads,
+ InstanceWatcher<ImageCtxT>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ ReplayerListener* replayer_listener) = 0;
+
+ std::string global_image_id;
+
+ std::string local_image_id;
+ librbd::mirror::PromotionState local_promotion_state =
+ librbd::mirror::PROMOTION_STATE_UNKNOWN;
+ ImageCtxT* local_image_ctx = nullptr;
+
+ std::string remote_mirror_uuid;
+ std::string remote_image_id;
+ librbd::mirror::PromotionState remote_promotion_state =
+ librbd::mirror::PROMOTION_STATE_UNKNOWN;
+ ImageCtxT* remote_image_ctx = nullptr;
+
+protected:
+ image_sync::SyncPointHandler* m_sync_point_handler = nullptr;
+
+ StateBuilder(const std::string& global_image_id);
+
+ void close_local_image(Context* on_finish);
+
+private:
+ virtual bool is_linked_impl() const = 0;
+
+ void handle_close_local_image(int r, Context* on_finish);
+ void handle_close_remote_image(int r, Context* on_finish);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::StateBuilder<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_STATE_BUILDER_H
diff --git a/src/tools/rbd_mirror/image_replayer/TimeRollingMean.cc b/src/tools/rbd_mirror/image_replayer/TimeRollingMean.cc
new file mode 100644
index 000000000..5d9c9aca1
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/TimeRollingMean.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/TimeRollingMean.h"
+#include "common/Clock.h"
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+void TimeRollingMean::operator()(uint32_t value) {
+ auto time = ceph_clock_now();
+ if (m_last_time.is_zero()) {
+ m_last_time = time;
+ } else if (m_last_time.sec() < time.sec()) {
+ auto sec = m_last_time.sec();
+ while (sec++ < time.sec()) {
+ m_rolling_mean(m_sum);
+ m_sum = 0;
+ }
+
+ m_last_time = time;
+ }
+
+ m_sum += value;
+}
+
+double TimeRollingMean::get_average() const {
+ return boost::accumulators::rolling_mean(m_rolling_mean);
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_replayer/TimeRollingMean.h b/src/tools/rbd_mirror/image_replayer/TimeRollingMean.h
new file mode 100644
index 000000000..139ef893f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/TimeRollingMean.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_TIME_ROLLING_MEAN_H
+#define RBD_MIRROR_IMAGE_REPLAYER_TIME_ROLLING_MEAN_H
+
+#include "include/utime.h"
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/rolling_mean.hpp>
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+class TimeRollingMean {
+public:
+
+ void operator()(uint32_t value);
+
+ double get_average() const;
+
+private:
+ typedef boost::accumulators::accumulator_set<
+ uint64_t, boost::accumulators::stats<
+ boost::accumulators::tag::rolling_mean>> RollingMean;
+
+ utime_t m_last_time;
+ uint64_t m_sum = 0;
+
+ RollingMean m_rolling_mean{
+ boost::accumulators::tag::rolling_window::window_size = 30};
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_TIME_ROLLING_MEAN_H
diff --git a/src/tools/rbd_mirror/image_replayer/Types.h b/src/tools/rbd_mirror/image_replayer/Types.h
new file mode 100644
index 000000000..6ab988a76
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Types.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+enum HealthState {
+ HEALTH_STATE_OK,
+ HEALTH_STATE_WARNING,
+ HEALTH_STATE_ERROR
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
diff --git a/src/tools/rbd_mirror/image_replayer/Utils.cc b/src/tools/rbd_mirror/image_replayer/Utils.cc
new file mode 100644
index 000000000..55162a4e4
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Utils.cc
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "include/rados/librados.hpp"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::util::" \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace util {
+
+std::string compute_image_spec(librados::IoCtx& io_ctx,
+ const std::string& image_name) {
+ std::string name = io_ctx.get_namespace();
+ if (!name.empty()) {
+ name += "/";
+ }
+
+ return io_ctx.get_pool_name() + "/" + name + image_name;
+}
+
+bool decode_client_meta(const cls::journal::Client& client,
+ librbd::journal::MirrorPeerClientMeta* client_meta) {
+ dout(15) << dendl;
+
+ librbd::journal::ClientData client_data;
+ auto it = client.data.cbegin();
+ try {
+ decode(client_data, it);
+ } catch (const buffer::error &err) {
+ derr << "failed to decode client meta data: " << err.what() << dendl;
+ return false;
+ }
+
+ auto local_client_meta = boost::get<librbd::journal::MirrorPeerClientMeta>(
+ &client_data.client_meta);
+ if (local_client_meta == nullptr) {
+ derr << "unknown peer registration" << dendl;
+ return false;
+ }
+
+ *client_meta = *local_client_meta;
+ dout(15) << "client found: client_meta=" << *client_meta << dendl;
+ return true;
+}
+
+} // namespace util
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
diff --git a/src/tools/rbd_mirror/image_replayer/Utils.h b/src/tools/rbd_mirror/image_replayer/Utils.h
new file mode 100644
index 000000000..6c5352cd1
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Utils.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_UTILS_H
+#define RBD_MIRROR_IMAGE_REPLAYER_UTILS_H
+
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+
+namespace cls { namespace journal { struct Client; } }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace util {
+
+std::string compute_image_spec(librados::IoCtx& io_ctx,
+ const std::string& image_name);
+
+bool decode_client_meta(const cls::journal::Client& client,
+ librbd::journal::MirrorPeerClientMeta* client_meta);
+
+} // namespace util
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_UTILS_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.cc
new file mode 100644
index 000000000..087cf4f5f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.cc
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CreateLocalImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/PoolMetaCache.h"
+#include "tools/rbd_mirror/ProgressContext.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/CreateImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "CreateLocalImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+void CreateLocalImageRequest<I>::send() {
+ unregister_client();
+}
+
+template <typename I>
+void CreateLocalImageRequest<I>::unregister_client() {
+ dout(10) << dendl;
+ update_progress("UNREGISTER_CLIENT");
+
+ auto ctx = create_context_callback<
+ CreateLocalImageRequest<I>,
+ &CreateLocalImageRequest<I>::handle_unregister_client>(this);
+ m_state_builder->remote_journaler->unregister_client(ctx);
+}
+
+template <typename I>
+void CreateLocalImageRequest<I>::handle_unregister_client(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to unregister with remote journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ m_state_builder->local_image_id = "";
+ m_state_builder->remote_client_meta = {};
+ register_client();
+}
+
+template <typename I>
+void CreateLocalImageRequest<I>::register_client() {
+ ceph_assert(m_state_builder->local_image_id.empty());
+ m_state_builder->local_image_id =
+ librbd::util::generate_image_id<I>(m_local_io_ctx);
+ dout(10) << "local_image_id=" << m_state_builder->local_image_id << dendl;
+ update_progress("REGISTER_CLIENT");
+
+ librbd::journal::MirrorPeerClientMeta client_meta{
+ m_state_builder->local_image_id};
+ client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
+
+ librbd::journal::ClientData client_data{client_meta};
+ bufferlist client_data_bl;
+ encode(client_data, client_data_bl);
+
+ auto ctx = create_context_callback<
+ CreateLocalImageRequest<I>,
+ &CreateLocalImageRequest<I>::handle_register_client>(this);
+ m_state_builder->remote_journaler->register_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void CreateLocalImageRequest<I>::handle_register_client(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to register with remote journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ m_state_builder->remote_client_state = cls::journal::CLIENT_STATE_CONNECTED;
+ m_state_builder->remote_client_meta = {m_state_builder->local_image_id};
+ m_state_builder->remote_client_meta.state =
+ librbd::journal::MIRROR_PEER_STATE_SYNCING;
+
+ create_local_image();
+}
+
+template <typename I>
+void CreateLocalImageRequest<I>::create_local_image() {
+ dout(10) << "local_image_id=" << m_state_builder->local_image_id << dendl;
+ update_progress("CREATE_LOCAL_IMAGE");
+
+ m_remote_image_ctx->image_lock.lock_shared();
+ std::string image_name = m_remote_image_ctx->name;
+ m_remote_image_ctx->image_lock.unlock_shared();
+
+ auto ctx = create_context_callback<
+ CreateLocalImageRequest<I>,
+ &CreateLocalImageRequest<I>::handle_create_local_image>(this);
+ auto request = CreateImageRequest<I>::create(
+ m_threads, m_local_io_ctx, m_global_image_id,
+ m_state_builder->remote_mirror_uuid, image_name,
+ m_state_builder->local_image_id, m_remote_image_ctx,
+ m_pool_meta_cache, cls::rbd::MIRROR_IMAGE_MODE_JOURNAL, ctx);
+ request->send();
+}
+template <typename I>
+void CreateLocalImageRequest<I>::handle_create_local_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -EBADF) {
+ dout(5) << "image id " << m_state_builder->local_image_id << " "
+ << "already in-use" << dendl;
+ unregister_client();
+ return;
+ } else if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << "parent image does not exist" << dendl;
+ } else {
+ derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void CreateLocalImageRequest<I>::update_progress(
+ const std::string& description) {
+ dout(15) << description << dendl;
+ if (m_progress_ctx != nullptr) {
+ m_progress_ctx->update_progress(description);
+ }
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::CreateLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.h
new file mode 100644
index 000000000..fc776ecc3
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.h
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_CREATE_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_CREATE_LOCAL_IMAGE_REQUEST_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include <string>
+
+struct Context;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+class PoolMetaCache;
+class ProgressContext;
+template <typename> struct Threads;
+
+namespace image_replayer {
+namespace journal {
+
+template <typename> class StateBuilder;
+
+template <typename ImageCtxT>
+class CreateLocalImageRequest : public BaseRequest {
+public:
+ typedef rbd::mirror::ProgressContext ProgressContext;
+
+ static CreateLocalImageRequest* create(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& local_io_ctx,
+ ImageCtxT* remote_image_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ StateBuilder<ImageCtxT>* state_builder,
+ Context* on_finish) {
+ return new CreateLocalImageRequest(threads, local_io_ctx, remote_image_ctx,
+ global_image_id, pool_meta_cache,
+ progress_ctx, state_builder, on_finish);
+ }
+
+ CreateLocalImageRequest(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& local_io_ctx,
+ ImageCtxT* remote_image_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ StateBuilder<ImageCtxT>* state_builder,
+ Context* on_finish)
+ : BaseRequest(on_finish),
+ m_threads(threads),
+ m_local_io_ctx(local_io_ctx),
+ m_remote_image_ctx(remote_image_ctx),
+ m_global_image_id(global_image_id),
+ m_pool_meta_cache(pool_meta_cache),
+ m_progress_ctx(progress_ctx),
+ m_state_builder(state_builder) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UNREGISTER_CLIENT < * * * * * * * *
+ * | *
+ * v *
+ * REGISTER_CLIENT *
+ * | *
+ * v (id exists) *
+ * CREATE_LOCAL_IMAGE * * * * * * * * *
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ Threads<ImageCtxT>* m_threads;
+ librados::IoCtx& m_local_io_ctx;
+ ImageCtxT* m_remote_image_ctx;
+ std::string m_global_image_id;
+ PoolMetaCache* m_pool_meta_cache;
+ ProgressContext* m_progress_ctx;
+ StateBuilder<ImageCtxT>* m_state_builder;
+
+ void unregister_client();
+ void handle_unregister_client(int r);
+
+ void register_client();
+ void handle_register_client(int r);
+
+ void create_local_image();
+ void handle_create_local_image(int r);
+
+ void update_progress(const std::string& description);
+
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::CreateLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_CREATE_LOCAL_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.cc b/src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.cc
new file mode 100644
index 000000000..f5d49048e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.cc
@@ -0,0 +1,206 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "EventPreprocessor.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/Types.h"
+#include <boost/variant.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "EventPreprocessor: " << this << " " << __func__ \
+ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+EventPreprocessor<I>::EventPreprocessor(I &local_image_ctx,
+ Journaler &remote_journaler,
+ const std::string &local_mirror_uuid,
+ MirrorPeerClientMeta *client_meta,
+ librbd::asio::ContextWQ *work_queue)
+ : m_local_image_ctx(local_image_ctx), m_remote_journaler(remote_journaler),
+ m_local_mirror_uuid(local_mirror_uuid), m_client_meta(client_meta),
+ m_work_queue(work_queue) {
+}
+
+template <typename I>
+EventPreprocessor<I>::~EventPreprocessor() {
+ ceph_assert(!m_in_progress);
+}
+
+template <typename I>
+bool EventPreprocessor<I>::is_required(const EventEntry &event_entry) {
+ SnapSeqs snap_seqs(m_client_meta->snap_seqs);
+ return (prune_snap_map(&snap_seqs) ||
+ event_entry.get_event_type() ==
+ librbd::journal::EVENT_TYPE_SNAP_RENAME);
+}
+
+template <typename I>
+void EventPreprocessor<I>::preprocess(EventEntry *event_entry,
+ Context *on_finish) {
+ ceph_assert(!m_in_progress);
+ m_in_progress = true;
+ m_event_entry = event_entry;
+ m_on_finish = on_finish;
+
+ refresh_image();
+}
+
+template <typename I>
+void EventPreprocessor<I>::refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ EventPreprocessor<I>, &EventPreprocessor<I>::handle_refresh_image>(this);
+ m_local_image_ctx.state->refresh(ctx);
+}
+
+template <typename I>
+void EventPreprocessor<I>::handle_refresh_image(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error encountered during image refresh: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ preprocess_event();
+}
+
+template <typename I>
+void EventPreprocessor<I>::preprocess_event() {
+ dout(20) << dendl;
+
+ m_snap_seqs = m_client_meta->snap_seqs;
+ m_snap_seqs_updated = prune_snap_map(&m_snap_seqs);
+
+ int r = boost::apply_visitor(PreprocessEventVisitor(this),
+ m_event_entry->event);
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ update_client();
+}
+
+template <typename I>
+int EventPreprocessor<I>::preprocess_snap_rename(
+ librbd::journal::SnapRenameEvent &event) {
+ dout(20) << "remote_snap_id=" << event.snap_id << ", "
+ << "src_snap_name=" << event.src_snap_name << ", "
+ << "dest_snap_name=" << event.dst_snap_name << dendl;
+
+ auto snap_seq_it = m_snap_seqs.find(event.snap_id);
+ if (snap_seq_it != m_snap_seqs.end()) {
+ dout(20) << "remapping remote snap id " << snap_seq_it->first << " "
+ << "to local snap id " << snap_seq_it->second << dendl;
+ event.snap_id = snap_seq_it->second;
+ return 0;
+ }
+
+ auto snap_id_it = m_local_image_ctx.snap_ids.find({cls::rbd::UserSnapshotNamespace(),
+ event.src_snap_name});
+ if (snap_id_it == m_local_image_ctx.snap_ids.end()) {
+ dout(20) << "cannot map remote snapshot '" << event.src_snap_name << "' "
+ << "to local snapshot" << dendl;
+ event.snap_id = CEPH_NOSNAP;
+ return -ENOENT;
+ }
+
+ dout(20) << "mapping remote snap id " << event.snap_id << " "
+ << "to local snap id " << snap_id_it->second << dendl;
+ m_snap_seqs_updated = true;
+ m_snap_seqs[event.snap_id] = snap_id_it->second;
+ event.snap_id = snap_id_it->second;
+ return 0;
+}
+
+template <typename I>
+void EventPreprocessor<I>::update_client() {
+ if (!m_snap_seqs_updated) {
+ finish(0);
+ return;
+ }
+
+ dout(20) << dendl;
+ librbd::journal::MirrorPeerClientMeta client_meta(*m_client_meta);
+ client_meta.snap_seqs = m_snap_seqs;
+
+ librbd::journal::ClientData client_data(client_meta);
+ bufferlist data_bl;
+ encode(client_data, data_bl);
+
+ Context *ctx = create_context_callback<
+ EventPreprocessor<I>, &EventPreprocessor<I>::handle_update_client>(
+ this);
+ m_remote_journaler.update_client(data_bl, ctx);
+}
+
+template <typename I>
+void EventPreprocessor<I>::handle_update_client(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update mirror peer journal client: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_client_meta->snap_seqs = m_snap_seqs;
+ finish(0);
+}
+
+template <typename I>
+bool EventPreprocessor<I>::prune_snap_map(SnapSeqs *snap_seqs) {
+ bool pruned = false;
+
+ std::shared_lock image_locker{m_local_image_ctx.image_lock};
+ for (auto it = snap_seqs->begin(); it != snap_seqs->end(); ) {
+ auto current_it(it++);
+ if (m_local_image_ctx.snap_info.count(current_it->second) == 0) {
+ snap_seqs->erase(current_it);
+ pruned = true;
+ }
+ }
+ return pruned;
+}
+
+template <typename I>
+void EventPreprocessor<I>::finish(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ Context *on_finish = m_on_finish;
+ m_on_finish = nullptr;
+ m_event_entry = nullptr;
+ m_in_progress = false;
+ m_snap_seqs_updated = false;
+ m_work_queue->queue(on_finish, r);
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::EventPreprocessor<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.h b/src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.h
new file mode 100644
index 000000000..12f70eb93
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H
+#define RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H
+
+#include "include/int_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <map>
+#include <string>
+#include <boost/variant/static_visitor.hpp>
+
+struct Context;
+namespace journal { class Journaler; }
+namespace librbd {
+class ImageCtx;
+namespace asio { struct ContextWQ; }
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class EventPreprocessor {
+public:
+ using Journaler = typename librbd::journal::TypeTraits<ImageCtxT>::Journaler;
+ using EventEntry = librbd::journal::EventEntry;
+ using MirrorPeerClientMeta = librbd::journal::MirrorPeerClientMeta;
+
+ static EventPreprocessor *create(ImageCtxT &local_image_ctx,
+ Journaler &remote_journaler,
+ const std::string &local_mirror_uuid,
+ MirrorPeerClientMeta *client_meta,
+ librbd::asio::ContextWQ *work_queue) {
+ return new EventPreprocessor(local_image_ctx, remote_journaler,
+ local_mirror_uuid, client_meta, work_queue);
+ }
+
+ static void destroy(EventPreprocessor* processor) {
+ delete processor;
+ }
+
+ EventPreprocessor(ImageCtxT &local_image_ctx, Journaler &remote_journaler,
+ const std::string &local_mirror_uuid,
+ MirrorPeerClientMeta *client_meta,
+ librbd::asio::ContextWQ *work_queue);
+ ~EventPreprocessor();
+
+ bool is_required(const EventEntry &event_entry);
+ void preprocess(EventEntry *event_entry, Context *on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (skip if not required)
+ * REFRESH_IMAGE
+ * |
+ * v (skip if not required)
+ * PREPROCESS_EVENT
+ * |
+ * v (skip if not required)
+ * UPDATE_CLIENT
+ *
+ * @endverbatim
+ */
+
+ typedef std::map<uint64_t, uint64_t> SnapSeqs;
+
+ class PreprocessEventVisitor : public boost::static_visitor<int> {
+ public:
+ EventPreprocessor *event_preprocessor;
+
+ PreprocessEventVisitor(EventPreprocessor *event_preprocessor)
+ : event_preprocessor(event_preprocessor) {
+ }
+
+ template <typename T>
+ inline int operator()(T&) const {
+ return 0;
+ }
+ inline int operator()(librbd::journal::SnapRenameEvent &event) const {
+ return event_preprocessor->preprocess_snap_rename(event);
+ }
+ };
+
+ ImageCtxT &m_local_image_ctx;
+ Journaler &m_remote_journaler;
+ std::string m_local_mirror_uuid;
+ MirrorPeerClientMeta *m_client_meta;
+ librbd::asio::ContextWQ *m_work_queue;
+
+ bool m_in_progress = false;
+ EventEntry *m_event_entry = nullptr;
+ Context *m_on_finish = nullptr;
+
+ SnapSeqs m_snap_seqs;
+ bool m_snap_seqs_updated = false;
+
+ bool prune_snap_map(SnapSeqs *snap_seqs);
+
+ void refresh_image();
+ void handle_refresh_image(int r);
+
+ void preprocess_event();
+ int preprocess_snap_rename(librbd::journal::SnapRenameEvent &event);
+
+ void update_client();
+ void handle_update_client(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::EventPreprocessor<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.cc b/src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.cc
new file mode 100644
index 000000000..c8a96a4ad
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrepareReplayRequest.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/ProgressContext.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "PrepareReplayRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+using librbd::util::create_context_callback;
+
+// Entry point: determine whether the local image can resume journal-based
+// replay from the remote image. Results are reported through the output
+// pointers: *m_resync_requested (a resync was flagged locally) and
+// *m_syncing (initial sync from the remote has not completed yet).
+// Finishes with -EEXIST on split-brain, -EINVAL if journaling is disabled.
+template <typename I>
+void PrepareReplayRequest<I>::send() {
+  *m_resync_requested = false;
+  *m_syncing = false;
+
+  if (m_state_builder->local_image_id !=
+      m_state_builder->remote_client_meta.image_id) {
+    // somehow our local image has a different image id than the image id
+    // registered in the remote image
+    derr << "split-brain detected: local_image_id="
+         << m_state_builder->local_image_id << ", "
+         << "registered local_image_id="
+         << m_state_builder->remote_client_meta.image_id << dendl;
+    finish(-EEXIST);
+    return;
+  }
+
+  // inspect the local journal state while holding the image lock
+  std::shared_lock image_locker(m_state_builder->local_image_ctx->image_lock);
+  if (m_state_builder->local_image_ctx->journal == nullptr) {
+    image_locker.unlock();
+
+    derr << "local image does not support journaling" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  int r = m_state_builder->local_image_ctx->journal->is_resync_requested(
+    m_resync_requested);
+  if (r < 0) {
+    image_locker.unlock();
+
+    derr << "failed to check if a resync was requested" << dendl;
+    finish(r);
+    return;
+  }
+
+  // snapshot the most recent local tag for the tag-chain analysis below
+  m_local_tag_tid = m_state_builder->local_image_ctx->journal->get_tag_tid();
+  m_local_tag_data = m_state_builder->local_image_ctx->journal->get_tag_data();
+  dout(10) << "local tag=" << m_local_tag_tid << ", "
+           << "local tag data=" << m_local_tag_data << dendl;
+  image_locker.unlock();
+
+  if (*m_resync_requested) {
+    // resync takes precedence; caller will handle it
+    finish(0);
+    return;
+  } else if (m_state_builder->remote_client_meta.state ==
+               librbd::journal::MIRROR_PEER_STATE_SYNCING &&
+             m_local_tag_data.mirror_uuid ==
+               m_state_builder->remote_mirror_uuid) {
+    // if the initial sync hasn't completed, we cannot replay
+    *m_syncing = true;
+    finish(0);
+    return;
+  }
+
+  update_client_state();
+}
+
+// Transition the remote peer's registered client state from SYNCING to
+// REPLAYING when the local image is no longer tied to the remote (i.e. a
+// stale SYNCING flag left over from a force-promote/demote cycle).
+// Skips straight to get_remote_tag_class() when no update is needed.
+template <typename I>
+void PrepareReplayRequest<I>::update_client_state() {
+  if (m_state_builder->remote_client_meta.state !=
+        librbd::journal::MIRROR_PEER_STATE_SYNCING ||
+      m_local_tag_data.mirror_uuid == m_state_builder->remote_mirror_uuid) {
+    get_remote_tag_class();
+    return;
+  }
+
+  // our local image is not primary, is flagged as syncing on the remote side,
+  // but is no longer tied to the remote -- this implies we were forced
+  // promoted and then demoted at some point
+  dout(15) << dendl;
+  update_progress("UPDATE_CLIENT_STATE");
+
+  auto client_meta = m_state_builder->remote_client_meta;
+  client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+  // encode the updated client metadata for the remote journaler
+  librbd::journal::ClientData client_data(client_meta);
+  bufferlist data_bl;
+  encode(client_data, data_bl);
+
+  auto ctx = create_context_callback<
+    PrepareReplayRequest<I>,
+    &PrepareReplayRequest<I>::handle_update_client_state>(this);
+  m_state_builder->remote_journaler->update_client(data_bl, ctx);
+}
+
+// Completion for update_client(): on success, mirror the REPLAYING state
+// into the cached remote client metadata and continue the state machine.
+template <typename I>
+void PrepareReplayRequest<I>::handle_update_client_state(int r) {
+  dout(15) << "r=" << r << dendl;
+  if (r < 0) {
+    derr << "failed to update client: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  m_state_builder->remote_client_meta.state =
+    librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+  get_remote_tag_class();
+}
+
+// Fetch the remote image's own client registration (IMAGE_CLIENT_ID) so we
+// can learn the journal tag class used by the remote image's tag chain.
+template <typename I>
+void PrepareReplayRequest<I>::get_remote_tag_class() {
+  dout(10) << dendl;
+  update_progress("GET_REMOTE_TAG_CLASS");
+
+  auto ctx = create_context_callback<
+    PrepareReplayRequest<I>,
+    &PrepareReplayRequest<I>::handle_get_remote_tag_class>(this);
+  m_state_builder->remote_journaler->get_client(
+    librbd::Journal<>::IMAGE_CLIENT_ID, &m_client, ctx);
+}
+
+// Completion for get_client(): decode the remote client registration and
+// extract the journal tag class. Fails with -EBADMSG on decode errors and
+// -EINVAL if the registration is not an ImageClientMeta.
+template <typename I>
+void PrepareReplayRequest<I>::handle_get_remote_tag_class(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to retrieve remote client: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // decode the registration payload stored in the remote journal
+  librbd::journal::ClientData client_data;
+  auto it = m_client.data.cbegin();
+  try {
+    decode(client_data, it);
+  } catch (const buffer::error &err) {
+    derr << "failed to decode remote client meta data: " << err.what()
+         << dendl;
+    finish(-EBADMSG);
+    return;
+  }
+
+  // the registration must be an image client (vs a mirror peer client)
+  librbd::journal::ImageClientMeta *client_meta =
+    boost::get<librbd::journal::ImageClientMeta>(&client_data.client_meta);
+  if (client_meta == nullptr) {
+    derr << "unknown remote client registration" << dendl;
+    finish(-EINVAL);
+    return;
+  }
+
+  m_remote_tag_class = client_meta->tag_class;
+  dout(10) << "remote tag class=" << m_remote_tag_class << dendl;
+
+  get_remote_tags();
+}
+
+// Fetch all remote journal tags for the discovered tag class; the tag chain
+// is examined in handle_get_remote_tags() to link local and remote history.
+template <typename I>
+void PrepareReplayRequest<I>::get_remote_tags() {
+  dout(10) << dendl;
+  update_progress("GET_REMOTE_TAGS");
+
+  auto ctx = create_context_callback<
+    PrepareReplayRequest<I>,
+    &PrepareReplayRequest<I>::handle_get_remote_tags>(this);
+  m_state_builder->remote_journaler->get_tags(m_remote_tag_class,
+                                              &m_remote_tags, ctx);
+}
+
+// Completion for get_tags(): walk the remote tag chain and attempt to link
+// the local image's most recent tag to it. Replay is permitted when either
+// (a) the local tag already references the remote mirror uuid (clean replay
+// state) or (b) a matching demotion/promotion tag pair is found (the image
+// was demoted and re-promoted on the remote). Otherwise the histories have
+// diverged and we finish with -EEXIST (split-brain).
+template <typename I>
+void PrepareReplayRequest<I>::handle_get_remote_tags(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to retrieve remote tags: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // At this point, the local image was existing, non-primary, and replaying;
+  // and the remote image is primary. Attempt to link the local image's most
+  // recent tag to the remote image's tag chain.
+  bool remote_tag_data_valid = false;
+  librbd::journal::TagData remote_tag_data;
+  // boost::make_optional(false, ...) yields an *empty* optional of the right
+  // type -- i.e. "no demotion tag seen yet"
+  boost::optional<uint64_t> remote_orphan_tag_tid =
+    boost::make_optional<uint64_t>(false, 0U);
+  bool reconnect_orphan = false;
+
+  // decode the remote tags
+  for (auto &remote_tag : m_remote_tags) {
+    if (m_local_tag_data.predecessor.commit_valid &&
+        m_local_tag_data.predecessor.mirror_uuid ==
+          m_state_builder->remote_mirror_uuid &&
+        m_local_tag_data.predecessor.tag_tid > remote_tag.tid) {
+      dout(10) << "skipping processed predecessor remote tag "
+               << remote_tag.tid << dendl;
+      continue;
+    }
+
+    try {
+      auto it = remote_tag.data.cbegin();
+      decode(remote_tag_data, it);
+      remote_tag_data_valid = true;
+    } catch (const buffer::error &err) {
+      derr << "failed to decode remote tag " << remote_tag.tid << ": "
+           << err.what() << dendl;
+      finish(-EBADMSG);
+      return;
+    }
+
+    dout(10) << "decoded remote tag " << remote_tag.tid << ": "
+             << remote_tag_data << dendl;
+
+    if (!m_local_tag_data.predecessor.commit_valid) {
+      // newly synced local image (no predecessor) replays from the first tag
+      if (remote_tag_data.mirror_uuid != librbd::Journal<>::LOCAL_MIRROR_UUID) {
+        dout(10) << "skipping non-primary remote tag" << dendl;
+        continue;
+      }
+
+      dout(10) << "using initial primary remote tag" << dendl;
+      break;
+    }
+
+    if (m_local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+      // demotion last available local epoch
+
+      if (remote_tag_data.mirror_uuid == m_local_tag_data.mirror_uuid &&
+          remote_tag_data.predecessor.commit_valid &&
+          remote_tag_data.predecessor.tag_tid ==
+            m_local_tag_data.predecessor.tag_tid) {
+        // demotion matches remote epoch
+
+        if (remote_tag_data.predecessor.mirror_uuid == m_local_mirror_uuid &&
+            m_local_tag_data.predecessor.mirror_uuid ==
+              librbd::Journal<>::LOCAL_MIRROR_UUID) {
+          // local demoted and remote has matching event
+          dout(10) << "found matching local demotion tag" << dendl;
+          remote_orphan_tag_tid = remote_tag.tid;
+          continue;
+        }
+
+        if (m_local_tag_data.predecessor.mirror_uuid ==
+              m_state_builder->remote_mirror_uuid &&
+            remote_tag_data.predecessor.mirror_uuid ==
+              librbd::Journal<>::LOCAL_MIRROR_UUID) {
+          // remote demoted and local has matching event
+          dout(10) << "found matching remote demotion tag" << dendl;
+          remote_orphan_tag_tid = remote_tag.tid;
+          continue;
+        }
+      }
+
+      if (remote_tag_data.mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID &&
+          remote_tag_data.predecessor.mirror_uuid ==
+            librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+          remote_tag_data.predecessor.commit_valid && remote_orphan_tag_tid &&
+          remote_tag_data.predecessor.tag_tid == *remote_orphan_tag_tid) {
+        // remote promotion tag chained to remote/local demotion tag
+        dout(10) << "found chained remote promotion tag" << dendl;
+        reconnect_orphan = true;
+        break;
+      }
+
+      // promotion must follow demotion
+      remote_orphan_tag_tid = boost::none;
+    }
+  }
+
+  if (remote_tag_data_valid &&
+      m_local_tag_data.mirror_uuid == m_state_builder->remote_mirror_uuid) {
+    dout(10) << "local image is in clean replay state" << dendl;
+  } else if (reconnect_orphan) {
+    dout(10) << "remote image was demoted/promoted" << dendl;
+  } else {
+    derr << "split-brain detected -- skipping image replay" << dendl;
+    finish(-EEXIST);
+    return;
+  }
+
+  finish(0);
+}
+
+// Report a human-readable state-machine step to the optional progress
+// context (used for "rbd mirror image status"-style progress reporting).
+template <typename I>
+void PrepareReplayRequest<I>::update_progress(const std::string &description) {
+  dout(10) << description << dendl;
+
+  if (m_progress_ctx != nullptr) {
+    m_progress_ctx->update_progress(description);
+  }
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::PrepareReplayRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.h b/src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.h
new file mode 100644
index 000000000..2b6fb659b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_PREPARE_REPLAY_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_PREPARE_REPLAY_REQUEST_H
+
+#include "include/int_types.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/mirror/Types.h"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include <list>
+#include <string>
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext;
+
+namespace image_replayer {
+namespace journal {
+
+template <typename> class StateBuilder;
+
+// Async request that validates whether journal-based replay can resume for
+// a local image: checks for split-brain, pending resync requests, and an
+// incomplete initial sync, and links the local tag chain to the remote's.
+// Results are reported through the resync_requested/syncing out-pointers
+// before on_finish is completed (see BaseRequest).
+template <typename ImageCtxT>
+class PrepareReplayRequest : public BaseRequest {
+public:
+  static PrepareReplayRequest* create(
+      const std::string& local_mirror_uuid,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>* state_builder,
+      bool* resync_requested,
+      bool* syncing,
+      Context* on_finish) {
+    return new PrepareReplayRequest(
+      local_mirror_uuid, progress_ctx, state_builder, resync_requested,
+      syncing, on_finish);
+  }
+
+  PrepareReplayRequest(
+      const std::string& local_mirror_uuid,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>* state_builder,
+      bool* resync_requested,
+      bool* syncing,
+      Context* on_finish)
+    : BaseRequest(on_finish),
+      m_local_mirror_uuid(local_mirror_uuid),
+      m_progress_ctx(progress_ctx),
+      m_state_builder(state_builder),
+      m_resync_requested(resync_requested),
+      m_syncing(syncing) {
+  }
+
+  void send() override;
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * UPDATE_CLIENT_STATE
+   *    |
+   *    v
+   * GET_REMOTE_TAG_CLASS
+   *    |
+   *    v
+   * GET_REMOTE_TAGS
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+  typedef std::list<cls::journal::Tag> Tags;
+
+  std::string m_local_mirror_uuid;
+  ProgressContext* m_progress_ctx;
+  StateBuilder<ImageCtxT>* m_state_builder;
+  // out-params owned by the caller; written before completion
+  bool* m_resync_requested;
+  bool* m_syncing;
+
+  // most recent local journal tag, captured in send()
+  uint64_t m_local_tag_tid = 0;
+  librbd::journal::TagData m_local_tag_data;
+
+  // remote journal tag class and the tags fetched for it
+  uint64_t m_remote_tag_class = 0;
+  Tags m_remote_tags;
+  cls::journal::Client m_client;
+
+  void update_client_state();
+  void handle_update_client_state(int r);
+
+  void get_remote_tag_class();
+  void handle_get_remote_tag_class(int r);
+
+  void get_remote_tags();
+  void handle_get_remote_tags(int r);
+
+  void update_progress(const std::string& description);
+
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::PrepareReplayRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_PREPARE_REPLAY_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.cc b/src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.cc
new file mode 100644
index 000000000..eb99d5add
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.cc
@@ -0,0 +1,284 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReplayStatusFormatter.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "json_spirit/json_spirit.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "ReplayStatusFormatter: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+using librbd::util::unique_lock_name;
+
+namespace {
+
+// Round |value| to two decimal places and return its absolute value.
+// NOTE(review): 'abs' is unqualified -- it relies on <cmath> injecting the
+// floating-point overload into the global namespace; if only <cstdlib>'s
+// int abs() were visible this would truncate. Confirm std::abs/std::fabs
+// is the overload actually selected in this TU.
+double round_to_two_places(double value) {
+  return abs(round(value * 100) / 100);
+}
+
+// Serialize an ObjectPosition to a JSON object. A default-constructed
+// (unset) position yields an empty object.
+json_spirit::mObject to_json_object(
+    const cls::journal::ObjectPosition& position) {
+  json_spirit::mObject object;
+  if (position != cls::journal::ObjectPosition{}) {
+    object["object_number"] = position.object_number;
+    object["tag_tid"] = position.tag_tid;
+    object["entry_tid"] = position.entry_tid;
+  }
+  return object;
+}
+
+} // anonymous namespace
+
+// Construct a formatter bound to the remote journaler and the local
+// mirror's client uuid; m_lock guards the in-flight update context.
+template <typename I>
+ReplayStatusFormatter<I>::ReplayStatusFormatter(Journaler *journaler,
+                                                const std::string &mirror_uuid)
+  : m_journaler(journaler),
+    m_mirror_uuid(mirror_uuid),
+    m_lock(ceph::make_mutex(unique_lock_name("ReplayStatusFormatter::m_lock", this))) {
+}
+
+// Record one replayed journal entry: feed its byte count and a single
+// entry tick into the rolling bytes/entries-per-second trackers.
+template <typename I>
+void ReplayStatusFormatter<I>::handle_entry_processed(uint32_t bytes) {
+  dout(20) << dendl;
+
+  m_bytes_per_second(bytes);
+  m_entries_per_second(1);
+}
+
+// Produce a JSON status description. Returns true and completes on_finish
+// synchronously (with -EEXIST) when the description could be computed from
+// cached tag data; returns false when a tag fetch was issued (on_finish
+// completes later) or when a previous request is still in flight (on_finish
+// completes with -EAGAIN).
+// NOTE(review): the -EEXIST completion appears to signal "description was
+// produced synchronously" to the caller -- confirm against caller contract.
+template <typename I>
+bool ReplayStatusFormatter<I>::get_or_send_update(std::string *description,
+						  Context *on_finish) {
+  dout(20) << dendl;
+
+  // only one update may be pending at a time; m_on_finish marks in-flight
+  bool in_progress = false;
+  {
+    std::lock_guard locker{m_lock};
+    if (m_on_finish) {
+      in_progress = true;
+    } else {
+      m_on_finish = on_finish;
+    }
+  }
+
+  if (in_progress) {
+    dout(10) << "previous request is still in progress, ignoring" << dendl;
+    on_finish->complete(-EAGAIN);
+    return false;
+  }
+
+  m_master_position = cls::journal::ObjectPosition();
+  m_mirror_position = cls::journal::ObjectPosition();
+
+  // look up the cached commit positions for the primary (master) image
+  // client and for our own mirror client
+  cls::journal::Client master_client, mirror_client;
+  int r;
+
+  r = m_journaler->get_cached_client(librbd::Journal<>::IMAGE_CLIENT_ID,
+                                     &master_client);
+  if (r < 0) {
+    derr << "error retrieving registered master client: "
+	 << cpp_strerror(r) << dendl;
+  } else {
+    r = m_journaler->get_cached_client(m_mirror_uuid, &mirror_client);
+    if (r < 0) {
+      derr << "error retrieving registered mirror client: "
+	   << cpp_strerror(r) << dendl;
+    }
+  }
+
+  if (!master_client.commit_position.object_positions.empty()) {
+    m_master_position =
+      *(master_client.commit_position.object_positions.begin());
+  }
+
+  if (!mirror_client.commit_position.object_positions.empty()) {
+    m_mirror_position =
+      *(mirror_client.commit_position.object_positions.begin());
+  }
+
+  if (!calculate_behind_master_or_send_update()) {
+    // a tag fetch was issued; completion happens asynchronously
+    dout(20) << "need to update tag cache" << dendl;
+    return false;
+  }
+
+  format(description);
+
+  {
+    std::lock_guard locker{m_lock};
+    ceph_assert(m_on_finish == on_finish);
+    m_on_finish = nullptr;
+  }
+
+  on_finish->complete(-EEXIST);
+  return true;
+}
+
+// Compute m_entries_behind_master by walking the master's tag chain back to
+// the mirror's tag via the tag cache. Returns true when the value could be
+// computed from cached tags (also prunes cache entries at or below the
+// mirror position); returns false when a missing tag triggered an async
+// fetch via send_update_tag_cache().
+template <typename I>
+bool ReplayStatusFormatter<I>::calculate_behind_master_or_send_update() {
+  dout(20) << "m_master_position=" << m_master_position
+	   << ", m_mirror_position=" << m_mirror_position << dendl;
+
+  m_entries_behind_master = 0;
+
+  if (m_master_position == cls::journal::ObjectPosition() ||
+      m_master_position.tag_tid < m_mirror_position.tag_tid) {
+    // no master position, or mirror is ahead -- nothing to count
+    return true;
+  }
+
+  cls::journal::ObjectPosition master = m_master_position;
+  uint64_t mirror_tag_tid = m_mirror_position.tag_tid;
+
+  // walk predecessor links from the master tag down to the mirror tag,
+  // summing entry counts per tag
+  while (master.tag_tid > mirror_tag_tid) {
+    auto tag_it = m_tag_cache.find(master.tag_tid);
+    if (tag_it == m_tag_cache.end()) {
+      send_update_tag_cache(master.tag_tid, mirror_tag_tid);
+      return false;
+    }
+    librbd::journal::TagData &tag_data = tag_it->second;
+    m_entries_behind_master += master.entry_tid;
+    master = {0, tag_data.predecessor.tag_tid, tag_data.predecessor.entry_tid};
+  }
+  if (master.tag_tid == mirror_tag_tid &&
+      master.entry_tid > m_mirror_position.entry_tid) {
+    m_entries_behind_master += master.entry_tid - m_mirror_position.entry_tid;
+  }
+
+  dout(20) << "clearing tags not needed any more (below mirror position)"
+	   << dendl;
+
+  uint64_t tag_tid = mirror_tag_tid;
+  size_t old_size = m_tag_cache.size();
+  while (tag_tid != 0) {
+    auto tag_it = m_tag_cache.find(tag_tid);
+    if (tag_it == m_tag_cache.end()) {
+      break;
+    }
+    librbd::journal::TagData &tag_data = tag_it->second;
+
+    dout(20) << "erasing tag " << tag_data << "for tag_tid " << tag_tid
+	     << dendl;
+
+    tag_tid = tag_data.predecessor.tag_tid;
+    m_tag_cache.erase(tag_it);
+  }
+
+  dout(20) << old_size - m_tag_cache.size() << " entries cleared" << dendl;
+
+  return true;
+}
+
+// Fetch tag |master_tag_tid| from the remote journaler into m_tag so its
+// TagData can be cached. When the walk reached the mirror tag (or the tag
+// is already cached), complete the pending update context instead.
+template <typename I>
+void ReplayStatusFormatter<I>::send_update_tag_cache(uint64_t master_tag_tid,
+						     uint64_t mirror_tag_tid) {
+  if (master_tag_tid <= mirror_tag_tid ||
+      m_tag_cache.find(master_tag_tid) != m_tag_cache.end()) {
+    // done walking the chain -- hand the saved context back to the caller
+    Context *on_finish = nullptr;
+    {
+      std::lock_guard locker{m_lock};
+      std::swap(m_on_finish, on_finish);
+    }
+
+    ceph_assert(on_finish);
+    on_finish->complete(0);
+    return;
+  }
+
+  dout(20) << "master_tag_tid=" << master_tag_tid << ", mirror_tag_tid="
+	   << mirror_tag_tid << dendl;
+
+  auto ctx = new LambdaContext(
+    [this, master_tag_tid, mirror_tag_tid](int r) {
+      handle_update_tag_cache(master_tag_tid, mirror_tag_tid, r);
+    });
+  m_journaler->get_tag(master_tag_tid, &m_tag, ctx);
+}
+
+// Completion for get_tag(): decode the fetched tag, cache its TagData, and
+// recurse down the predecessor chain. Fetch/decode failures are logged but
+// not fatal -- a default TagData (predecessor tag_tid 0) terminates the
+// walk on the next send_update_tag_cache() call.
+template <typename I>
+void ReplayStatusFormatter<I>::handle_update_tag_cache(uint64_t master_tag_tid,
+						       uint64_t mirror_tag_tid,
+						       int r) {
+  librbd::journal::TagData tag_data;
+
+  if (r < 0) {
+    derr << "error retrieving tag " << master_tag_tid << ": " << cpp_strerror(r)
+	 << dendl;
+  } else {
+    dout(20) << "retrieved tag " << master_tag_tid << ": " << m_tag << dendl;
+
+    auto it = m_tag.data.cbegin();
+    try {
+      decode(tag_data, it);
+    } catch (const buffer::error &err) {
+      derr << "error decoding tag " << master_tag_tid << ": " << err.what()
+	   << dendl;
+    }
+  }
+
+  if (tag_data.predecessor.mirror_uuid !=
+        librbd::Journal<>::LOCAL_MIRROR_UUID &&
+      tag_data.predecessor.mirror_uuid !=
+        librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+    // stop the walk at a remote non-primary epoch boundary
+    dout(20) << "hit remote image non-primary epoch" << dendl;
+    tag_data.predecessor = {};
+  }
+
+  dout(20) << "decoded tag " << master_tag_tid << ": " << tag_data << dendl;
+
+  m_tag_cache[master_tag_tid] = tag_data;
+  send_update_tag_cache(tag_data.predecessor.tag_tid, mirror_tag_tid);
+}
+
+// Render the current replay status (positions, entries behind primary,
+// rolling rates, estimated time to sync) as a compact JSON string into
+// *description.
+template <typename I>
+void ReplayStatusFormatter<I>::format(std::string *description) {
+  dout(20) << "m_master_position=" << m_master_position
+	   << ", m_mirror_position=" << m_mirror_position
+	   << ", m_entries_behind_master=" << m_entries_behind_master << dendl;
+
+  json_spirit::mObject root_obj;
+  root_obj["primary_position"] = to_json_object(m_master_position);
+  root_obj["non_primary_position"] = to_json_object(m_mirror_position);
+  root_obj["entries_behind_primary"] = (
+    m_entries_behind_master > 0 ? m_entries_behind_master : 0);
+
+  // feed a zero sample so idle periods decay the rolling averages
+  m_bytes_per_second(0);
+  root_obj["bytes_per_second"] = round_to_two_places(
+    m_bytes_per_second.get_average());
+
+  m_entries_per_second(0);
+  auto entries_per_second = m_entries_per_second.get_average();
+  root_obj["entries_per_second"] = round_to_two_places(entries_per_second);
+
+  if (m_entries_behind_master > 0 && entries_per_second > 0) {
+    // NOTE(review): the double result is converted to uint64_t *before* the
+    // clamp below; a double larger than uint64 max makes that conversion
+    // undefined, so the clamp may never fire -- confirm intended behavior.
+    std::uint64_t seconds_until_synced = round_to_two_places(
+      m_entries_behind_master / entries_per_second);
+    if (seconds_until_synced >= std::numeric_limits<uint64_t>::max()) {
+      seconds_until_synced = std::numeric_limits<uint64_t>::max();
+    }
+
+    root_obj["seconds_until_synced"] = seconds_until_synced;
+  }
+
+  *description = json_spirit::write(
+    root_obj, json_spirit::remove_trailing_zeros);
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::ReplayStatusFormatter<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.h b/src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.h
new file mode 100644
index 000000000..5dbbfe10d
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.h
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H
+
+#include "include/Context.h"
+#include "common/ceph_mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "tools/rbd_mirror/image_replayer/TimeRollingMean.h"
+
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+// Formats journal replay progress (positions, backlog, throughput) for a
+// mirrored image into a JSON status description, caching remote tag data
+// to compute how far the mirror lags behind the primary.
+template <typename ImageCtxT = librbd::ImageCtx>
+class ReplayStatusFormatter {
+public:
+  typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler;
+
+  static ReplayStatusFormatter* create(Journaler *journaler,
+                                       const std::string &mirror_uuid) {
+    return new ReplayStatusFormatter(journaler, mirror_uuid);
+  }
+
+  static void destroy(ReplayStatusFormatter* formatter) {
+    delete formatter;
+  }
+
+  ReplayStatusFormatter(Journaler *journaler, const std::string &mirror_uuid);
+
+  // record one replayed entry's byte count for the rolling rates
+  void handle_entry_processed(uint32_t bytes);
+
+  // produce a JSON description; returns true if computed synchronously
+  bool get_or_send_update(std::string *description, Context *on_finish);
+
+private:
+  Journaler *m_journaler;
+  std::string m_mirror_uuid;
+  // guards m_on_finish (the single in-flight update context)
+  ceph::mutex m_lock;
+  Context *m_on_finish = nullptr;
+  cls::journal::ObjectPosition m_master_position;
+  cls::journal::ObjectPosition m_mirror_position;
+  int64_t m_entries_behind_master = 0;
+  cls::journal::Tag m_tag;
+  // tag_tid -> decoded tag data, used to walk the predecessor chain
+  std::map<uint64_t, librbd::journal::TagData> m_tag_cache;
+
+  TimeRollingMean m_bytes_per_second;
+  TimeRollingMean m_entries_per_second;
+
+  bool calculate_behind_master_or_send_update();
+  void send_update_tag_cache(uint64_t master_tag_tid, uint64_t mirror_tag_tid);
+  void handle_update_tag_cache(uint64_t master_tag_tid, uint64_t mirror_tag_tid,
+                               int r);
+  void format(std::string *description);
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::ReplayStatusFormatter<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/Replayer.cc b/src/tools/rbd_mirror/image_replayer/journal/Replayer.cc
new file mode 100644
index 000000000..3ce9104d2
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/Replayer.cc
@@ -0,0 +1,1303 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Replayer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/journal/Replay.h"
+#include "journal/Journaler.h"
+#include "journal/JournalMetadataListener.h"
+#include "journal/ReplayHandler.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/ReplayerListener.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "tools/rbd_mirror/image_replayer/journal/EventPreprocessor.h"
+#include "tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.h"
+#include "tools/rbd_mirror/image_replayer/journal/StateBuilder.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "Replayer: " << this << " " << __func__ << ": "
+
+extern PerfCounters *g_journal_perf_counters;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+namespace {
+
+// Compute how many seconds to delay replay of an event so that it is not
+// applied before event_time + mirroring_replay_delay. Returns 0 when no
+// delay is configured or the deadline has already passed.
+uint32_t calculate_replay_delay(const utime_t &event_time,
+                                int mirroring_replay_delay) {
+  if (mirroring_replay_delay <= 0) {
+    return 0;
+  }
+
+  utime_t now = ceph_clock_now();
+  if (event_time + mirroring_replay_delay <= now) {
+    return 0;
+  }
+
+  // ensure it is rounded up when converting to integer
+  return (event_time + mirroring_replay_delay - now) + 1;
+}
+
+} // anonymous namespace
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+// Completion context fired when a replayed journal entry has been safely
+// committed locally; forwards the entry, its size, and timing back to the
+// replayer for commit-position/perf bookkeeping.
+template <typename I>
+struct Replayer<I>::C_ReplayCommitted : public Context {
+  Replayer* replayer;
+  ReplayEntry replay_entry;
+  uint64_t replay_bytes;
+  utime_t replay_start_time;
+
+  C_ReplayCommitted(Replayer* replayer, ReplayEntry &&replay_entry,
+                    uint64_t replay_bytes, const utime_t &replay_start_time)
+    : replayer(replayer), replay_entry(std::move(replay_entry)),
+      replay_bytes(replay_bytes), replay_start_time(replay_start_time) {
+  }
+
+  void finish(int r) override {
+    replayer->handle_process_entry_safe(replay_entry, replay_bytes,
+                                        replay_start_time, r);
+  }
+};
+
+// Listener for remote journal metadata changes; bounces the notification
+// onto the work queue (wrapped in the in-flight op tracker) so the
+// replayer can re-check for disconnect events off the journaler thread.
+template <typename I>
+struct Replayer<I>::RemoteJournalerListener
+  : public ::journal::JournalMetadataListener {
+  Replayer* replayer;
+
+  RemoteJournalerListener(Replayer* replayer) : replayer(replayer) {}
+
+  void handle_update(::journal::JournalMetadata*) override {
+    auto ctx = new C_TrackedOp(
+      replayer->m_in_flight_op_tracker,
+      new LambdaContext([this](int r) {
+        replayer->handle_remote_journal_metadata_updated();
+      }));
+    replayer->m_threads->work_queue->queue(ctx, 0);
+  }
+};
+
+// Replay handler for the remote journaler: forwards "entries available"
+// and "replay complete" events to the replayer, translating error codes
+// into human-readable descriptions.
+template <typename I>
+struct Replayer<I>::RemoteReplayHandler : public ::journal::ReplayHandler {
+  Replayer* replayer;
+
+  RemoteReplayHandler(Replayer* replayer) : replayer(replayer) {}
+  ~RemoteReplayHandler() override {};
+
+  void handle_entries_available() override {
+    replayer->handle_replay_ready();
+  }
+
+  void handle_complete(int r) override {
+    std::string error;
+    if (r == -ENOMEM) {
+      error = "not enough memory in autotune cache";
+    } else if (r < 0) {
+      error = "replay completed with error: " + cpp_strerror(r);
+    }
+    replayer->handle_replay_complete(r, error);
+  }
+};
+
+// Listener on the local image's journal: stops replay when the journal
+// closes or the image is force-promoted, and triggers a resync when one
+// is requested.
+template <typename I>
+struct Replayer<I>::LocalJournalListener
+  : public librbd::journal::Listener {
+  Replayer* replayer;
+
+  LocalJournalListener(Replayer* replayer) : replayer(replayer) {
+  }
+
+  void handle_close() override {
+    replayer->handle_replay_complete(0, "");
+  }
+
+  void handle_promoted() override {
+    replayer->handle_replay_complete(0, "force promoted");
+  }
+
+  void handle_resync() override {
+    replayer->handle_resync_image();
+  }
+};
+
+// Construct a journal-based replayer; all heavy setup is deferred to
+// init(). The listener receives replay lifecycle notifications.
+template <typename I>
+Replayer<I>::Replayer(
+    Threads<I>* threads,
+    const std::string& local_mirror_uuid,
+    StateBuilder<I>* state_builder,
+    ReplayerListener* replayer_listener)
+  : m_threads(threads),
+    m_local_mirror_uuid(local_mirror_uuid),
+    m_state_builder(state_builder),
+    m_replayer_listener(replayer_listener),
+    m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+      "rbd::mirror::image_replayer::journal::Replayer", this))) {
+  dout(10) << dendl;
+}
+
+// Destructor: unregisters perf counters and asserts that shut_down()
+// already released every listener, handler, helper, and pending task.
+template <typename I>
+Replayer<I>::~Replayer() {
+  dout(10) << dendl;
+
+  {
+    std::unique_lock locker{m_lock};
+    unregister_perf_counters();
+  }
+
+  // shut_down() must have torn everything down before destruction
+  ceph_assert(m_remote_listener == nullptr);
+  ceph_assert(m_local_journal_listener == nullptr);
+  ceph_assert(m_local_journal_replay == nullptr);
+  ceph_assert(m_remote_replay_handler == nullptr);
+  ceph_assert(m_event_preprocessor == nullptr);
+  ceph_assert(m_replay_status_formatter == nullptr);
+  ceph_assert(m_delayed_preprocess_task == nullptr);
+  ceph_assert(m_flush_local_replay_task == nullptr);
+  ceph_assert(m_state_builder->local_image_ctx == nullptr);
+}
+
+// Begin async initialization: capture the image spec for status/perf
+// reporting, register perf counters, then initialize the remote journaler.
+// on_finish completes when the init state machine finishes (or fails).
+template <typename I>
+void Replayer<I>::init(Context* on_finish) {
+  dout(10) << dendl;
+
+  {
+    auto local_image_ctx = m_state_builder->local_image_ctx;
+    std::shared_lock image_locker{local_image_ctx->image_lock};
+    m_image_spec = util::compute_image_spec(local_image_ctx->md_ctx,
+                                            local_image_ctx->name);
+  }
+
+  {
+    std::unique_lock locker{m_lock};
+    register_perf_counters();
+  }
+
+  ceph_assert(m_on_init_shutdown == nullptr);
+  m_on_init_shutdown = on_finish;
+
+  init_remote_journaler();
+}
+
+// Begin async shutdown: cancel any scheduled preprocess/flush tasks and
+// wait for in-flight work to drain. If init is still running, the saved
+// context lets the tail of the init state machine complete the shutdown.
+template <typename I>
+void Replayer<I>::shut_down(Context* on_finish) {
+  dout(10) << dendl;
+
+  std::unique_lock locker{m_lock};
+  ceph_assert(m_on_init_shutdown == nullptr);
+  m_on_init_shutdown = on_finish;
+
+  if (m_state == STATE_INIT) {
+    // raced with the last piece of the init state machine
+    return;
+  } else if (m_state == STATE_REPLAYING) {
+    m_state = STATE_COMPLETE;
+  }
+
+  // if shutting down due to an error notification, we don't
+  // need to propagate the same error again
+  m_error_code = 0;
+  m_error_description = "";
+
+  cancel_delayed_preprocess_task();
+  cancel_flush_local_replay_task();
+  wait_for_flush();
+}
+
+// Flush the local journal replay, tracking the operation so shutdown
+// waits for it; on_finish completes when the flush is done.
+template <typename I>
+void Replayer<I>::flush(Context* on_finish) {
+  dout(10) << dendl;
+
+  flush_local_replay(new C_TrackedOp(m_in_flight_op_tracker, on_finish));
+}
+
+// Fetch a JSON replay-status description via the status formatter.
+// Completes on_finish with -EAGAIN if replay is not currently running;
+// otherwise delegates to ReplayStatusFormatter::get_or_send_update() and
+// returns its synchronous/asynchronous indicator.
+template <typename I>
+bool Replayer<I>::get_replay_status(std::string* description,
+                                    Context* on_finish) {
+  dout(10) << dendl;
+
+  std::unique_lock locker{m_lock};
+  if (m_replay_status_formatter == nullptr) {
+    derr << "replay not running" << dendl;
+    locker.unlock();
+
+    on_finish->complete(-EAGAIN);
+    return false;
+  }
+
+  on_finish = new C_TrackedOp(m_in_flight_op_tracker, on_finish);
+  return m_replay_status_formatter->get_or_send_update(description,
+                                                       on_finish);
+}
+
+// First step of the init state machine: asynchronously initialize the
+// remote journaler.
+template <typename I>
+void Replayer<I>::init_remote_journaler() {
+  dout(10) << dendl;
+
+  Context *ctx = create_context_callback<
+    Replayer, &Replayer<I>::handle_init_remote_journaler>(this);
+  m_state_builder->remote_journaler->init(ctx);
+}
+
+// Completion for remote journaler init: register the metadata listener
+// (to detect disconnects), look up our mirror client registration, and
+// validate its state before starting external replay. Any failure marks
+// replay complete and closes the local image.
+template <typename I>
+void Replayer<I>::handle_init_remote_journaler(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::unique_lock locker{m_lock};
+  if (r < 0) {
+    derr << "failed to initialize remote journal: " << cpp_strerror(r) << dendl;
+    handle_replay_complete(locker, r, "error initializing remote journal");
+    close_local_image();
+    return;
+  }
+
+  // listen for metadata updates to check for disconnect events
+  ceph_assert(m_remote_listener == nullptr);
+  m_remote_listener = new RemoteJournalerListener(this);
+  m_state_builder->remote_journaler->add_listener(m_remote_listener);
+
+  cls::journal::Client remote_client;
+  r = m_state_builder->remote_journaler->get_cached_client(m_local_mirror_uuid,
+                                                           &remote_client);
+  if (r < 0) {
+    derr << "error retrieving remote journal client: " << cpp_strerror(r)
+         << dendl;
+    handle_replay_complete(locker, r, "error retrieving remote journal client");
+    close_local_image();
+    return;
+  }
+
+  // confirm the registration is usable and check for a pending resync
+  std::string error;
+  r = validate_remote_client_state(remote_client,
+                                   &m_state_builder->remote_client_meta,
+                                   &m_resync_requested, &error);
+  if (r < 0) {
+    handle_replay_complete(locker, r, error);
+    close_local_image();
+    return;
+  }
+
+  start_external_replay(locker);
+}
+
+template <typename I>
+void Replayer<I>::start_external_replay(std::unique_lock<ceph::mutex>& locker) {
+ dout(10) << dendl;
+
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+ std::shared_lock local_image_locker{local_image_ctx->image_lock};
+
+ ceph_assert(m_local_journal == nullptr);
+ m_local_journal = local_image_ctx->journal;
+ if (m_local_journal == nullptr) {
+ local_image_locker.unlock();
+
+ derr << "local image journal closed" << dendl;
+ handle_replay_complete(locker, -EINVAL, "error accessing local journal");
+ close_local_image();
+ return;
+ }
+
+ // safe to hold pointer to journal after external playback starts
+ Context *start_ctx = create_context_callback<
+ Replayer, &Replayer<I>::handle_start_external_replay>(this);
+ m_local_journal->start_external_replay(&m_local_journal_replay, start_ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_start_external_replay(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (r < 0) {
+ ceph_assert(m_local_journal_replay == nullptr);
+ derr << "error starting external replay on local image "
+ << m_state_builder->local_image_ctx->id << ": "
+ << cpp_strerror(r) << dendl;
+
+ handle_replay_complete(locker, r, "error starting replay on local image");
+ close_local_image();
+ return;
+ }
+
+ if (!notify_init_complete(locker)) {
+ return;
+ }
+
+ m_state = STATE_REPLAYING;
+
+ // check for resync/promotion state after adding listener
+ if (!add_local_journal_listener(locker)) {
+ return;
+ }
+
+ // start remote journal replay
+ m_event_preprocessor = EventPreprocessor<I>::create(
+ *m_state_builder->local_image_ctx, *m_state_builder->remote_journaler,
+ m_local_mirror_uuid, &m_state_builder->remote_client_meta,
+ m_threads->work_queue);
+ m_replay_status_formatter = ReplayStatusFormatter<I>::create(
+ m_state_builder->remote_journaler, m_local_mirror_uuid);
+
+ auto cct = static_cast<CephContext *>(m_state_builder->local_image_ctx->cct);
+ double poll_seconds = cct->_conf.get_val<double>(
+ "rbd_mirror_journal_poll_age");
+ m_remote_replay_handler = new RemoteReplayHandler(this);
+ m_state_builder->remote_journaler->start_live_replay(m_remote_replay_handler,
+ poll_seconds);
+
+ notify_status_updated();
+}
+
+template <typename I>
+bool Replayer<I>::add_local_journal_listener(
+ std::unique_lock<ceph::mutex>& locker) {
+ dout(10) << dendl;
+
+ // listen for promotion and resync requests against local journal
+ ceph_assert(m_local_journal_listener == nullptr);
+ m_local_journal_listener = new LocalJournalListener(this);
+ m_local_journal->add_listener(m_local_journal_listener);
+
+ // verify that the local image wasn't force-promoted and that a resync hasn't
+ // been requested now that we are listening for events
+ if (m_local_journal->is_tag_owner()) {
+ dout(10) << "local image force-promoted" << dendl;
+ handle_replay_complete(locker, 0, "force promoted");
+ return false;
+ }
+
+ bool resync_requested = false;
+ int r = m_local_journal->is_resync_requested(&resync_requested);
+ if (r < 0) {
+ dout(10) << "failed to determine resync state: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(locker, r, "error parsing resync state");
+ return false;
+ } else if (resync_requested) {
+ dout(10) << "local image resync requested" << dendl;
+ handle_replay_complete(locker, 0, "resync requested");
+ return false;
+ }
+
+ return true;
+}
+
+// Fire the pending init completion callback. The lock must be dropped while
+// invoking the callback (it may re-enter the Replayer); returns false if a
+// shut-down request raced in while the lock was released, in which case the
+// shut-down sequence has already been started via close_local_image().
+template <typename I>
+bool Replayer<I>::notify_init_complete(std::unique_lock<ceph::mutex>& locker) {
+  dout(10) << dendl;
+
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+  ceph_assert(m_state == STATE_INIT);
+
+  // notify that init has completed
+  Context *on_finish = nullptr;
+  std::swap(m_on_init_shutdown, on_finish);
+
+  // drop the lock while completing the callback to avoid lock cycles
+  locker.unlock();
+  on_finish->complete(0);
+  locker.lock();
+
+  if (m_on_init_shutdown != nullptr) {
+    // shut down requested after we notified init complete but before we
+    // grabbed the lock
+    close_local_image();
+    return false;
+  }
+
+  return true;
+}
+
+// First stage of shut-down: wait for any in-progress replay_flush() cycles
+// to drain before tearing down the local journal replay state machine.
+// Expects m_lock to be held; completion continues asynchronously in
+// handle_wait_for_flush().
+template <typename I>
+void Replayer<I>::wait_for_flush() {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+  // ensure that we don't have two concurrent local journal replay shut downs
+  dout(10) << dendl;
+  auto ctx = create_async_context_callback(
+    m_threads->work_queue, create_context_callback<
+      Replayer<I>, &Replayer<I>::handle_wait_for_flush>(this));
+  m_flush_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_wait_for_flush(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ shut_down_local_journal_replay();
+}
+
+template <typename I>
+void Replayer<I>::shut_down_local_journal_replay() {
+ std::unique_lock locker{m_lock};
+
+ if (m_local_journal_replay == nullptr) {
+ wait_for_event_replay();
+ return;
+ }
+
+ // It's required to stop the local journal replay state machine prior to
+ // waiting for the events to complete. This is to ensure that IO is properly
+ // flushed (it might be batched), wait for any running ops to complete, and
+ // to cancel any ops waiting for their associated OnFinish events.
+ dout(10) << dendl;
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_shut_down_local_journal_replay>(this);
+ m_local_journal_replay->shut_down(true, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_shut_down_local_journal_replay(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (r < 0) {
+ derr << "error shutting down journal replay: " << cpp_strerror(r) << dendl;
+ handle_replay_error(r, "failed to shut down local journal replay");
+ }
+
+ wait_for_event_replay();
+}
+
+template <typename I>
+void Replayer<I>::wait_for_event_replay() {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ dout(10) << dendl;
+ auto ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_wait_for_event_replay>(this));
+ m_event_replay_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_wait_for_event_replay(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::unique_lock locker{m_lock};
+ close_local_image();
+}
+
+// Tear down all local-image replay state (listener, external replay,
+// event preprocessor, journal reference) and then asynchronously close the
+// local image. Expects m_lock to be held. If the image is already closed,
+// skips straight to stopping remote journaler replay.
+template <typename I>
+void Replayer<I>::close_local_image() {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+  if (m_state_builder->local_image_ctx == nullptr) {
+    // image already closed (or never opened) -- continue the shut-down chain
+    stop_remote_journaler_replay();
+    return;
+  }
+
+  dout(10) << dendl;
+  if (m_local_journal_listener != nullptr) {
+    // blocks if listener notification is in-progress
+    m_local_journal->remove_listener(m_local_journal_listener);
+    delete m_local_journal_listener;
+    m_local_journal_listener = nullptr;
+  }
+
+  if (m_local_journal_replay != nullptr) {
+    // the journal owns the replay handle; stopping invalidates our pointer
+    m_local_journal->stop_external_replay();
+    m_local_journal_replay = nullptr;
+  }
+
+  if (m_event_preprocessor != nullptr) {
+    image_replayer::journal::EventPreprocessor<I>::destroy(
+      m_event_preprocessor);
+    m_event_preprocessor = nullptr;
+  }
+
+  m_local_journal.reset();
+
+  // NOTE: it's important to ensure that the local image is fully
+  // closed before attempting to close the remote journal in
+  // case the remote cluster is unreachable
+  ceph_assert(m_state_builder->local_image_ctx != nullptr);
+  auto ctx = create_context_callback<
+    Replayer<I>, &Replayer<I>::handle_close_local_image>(this);
+  auto request = image_replayer::CloseImageRequest<I>::create(
+    &m_state_builder->local_image_ctx, ctx);
+  request->send();
+}
+
+
+// Completion handler for CloseImageRequest: record any close failure (first
+// error wins via handle_replay_error) and continue the shut-down chain by
+// stopping remote journaler replay.
+template <typename I>
+void Replayer<I>::handle_close_local_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  std::unique_lock locker{m_lock};
+  if (r < 0) {
+    derr << "error closing local image: " << cpp_strerror(r) << dendl;
+    handle_replay_error(r, "failed to close local image");
+  }
+
+  // CloseImageRequest resets the image ctx pointer upon completion
+  ceph_assert(m_state_builder->local_image_ctx == nullptr);
+  stop_remote_journaler_replay();
+}
+
+// Stop live replay of the remote journal, if it was ever started, then
+// proceed to draining in-flight ops. Expects m_lock to be held.
+template <typename I>
+void Replayer<I>::stop_remote_journaler_replay() {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+  // nothing to stop if the journaler was never opened or replay never began
+  if (m_state_builder->remote_journaler == nullptr ||
+      m_remote_replay_handler == nullptr) {
+    wait_for_in_flight_ops();
+    return;
+  }
+
+  dout(10) << dendl;
+  auto ctx = create_async_context_callback(
+    m_threads->work_queue, create_context_callback<
+      Replayer<I>, &Replayer<I>::handle_stop_remote_journaler_replay>(this));
+  m_state_builder->remote_journaler->stop_replay(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_stop_remote_journaler_replay(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (r < 0) {
+ derr << "failed to stop remote journaler replay : " << cpp_strerror(r)
+ << dendl;
+ handle_replay_error(r, "failed to stop remote journaler replay");
+ }
+
+ delete m_remote_replay_handler;
+ m_remote_replay_handler = nullptr;
+
+ wait_for_in_flight_ops();
+}
+
+template <typename I>
+void Replayer<I>::wait_for_in_flight_ops() {
+ dout(10) << dendl;
+ if (m_remote_listener != nullptr) {
+ m_state_builder->remote_journaler->remove_listener(m_remote_listener);
+ delete m_remote_listener;
+ m_remote_listener = nullptr;
+ }
+
+ auto ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_wait_for_in_flight_ops>(this));
+ m_in_flight_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_wait_for_in_flight_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ReplayStatusFormatter<I>::destroy(m_replay_status_formatter);
+ m_replay_status_formatter = nullptr;
+
+ Context* on_init_shutdown = nullptr;
+ {
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_on_init_shutdown != nullptr);
+ std::swap(m_on_init_shutdown, on_init_shutdown);
+ m_state = STATE_COMPLETE;
+ }
+ on_init_shutdown->complete(m_error_code);
+}
+
+template <typename I>
+void Replayer<I>::handle_remote_journal_metadata_updated() {
+ dout(20) << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (m_state != STATE_REPLAYING) {
+ return;
+ }
+
+ cls::journal::Client remote_client;
+ int r = m_state_builder->remote_journaler->get_cached_client(
+ m_local_mirror_uuid, &remote_client);
+ if (r < 0) {
+ derr << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ librbd::journal::MirrorPeerClientMeta remote_client_meta;
+ std::string error;
+ r = validate_remote_client_state(remote_client, &remote_client_meta,
+ &m_resync_requested, &error);
+ if (r < 0) {
+ dout(0) << "client flagged disconnected, stopping image replay" << dendl;
+ handle_replay_complete(locker, r, error);
+ }
+}
+
+template <typename I>
+void Replayer<I>::schedule_flush_local_replay_task() {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ std::unique_lock timer_locker{m_threads->timer_lock};
+ if (m_state != STATE_REPLAYING || m_flush_local_replay_task != nullptr) {
+ return;
+ }
+
+ dout(15) << dendl;
+ m_flush_local_replay_task = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_flush_local_replay_task>(this));
+ m_threads->timer->add_event_after(30, m_flush_local_replay_task);
+}
+
+template <typename I>
+void Replayer<I>::cancel_flush_local_replay_task() {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ std::unique_lock timer_locker{m_threads->timer_lock};
+ if (m_flush_local_replay_task != nullptr) {
+ dout(10) << dendl;
+ m_threads->timer->cancel_event(m_flush_local_replay_task);
+ m_flush_local_replay_task = nullptr;
+ }
+}
+
+template <typename I>
+void Replayer<I>::handle_flush_local_replay_task(int) {
+ dout(15) << dendl;
+
+ m_in_flight_op_tracker.start_op();
+ auto on_finish = new LambdaContext([this](int) {
+ std::unique_lock locker{m_lock};
+
+ {
+ std::unique_lock timer_locker{m_threads->timer_lock};
+ m_flush_local_replay_task = nullptr;
+ }
+
+ notify_status_updated();
+ m_in_flight_op_tracker.finish_op();
+ });
+ flush_local_replay(on_finish);
+}
+
+template <typename I>
+void Replayer<I>::flush_local_replay(Context* on_flush) {
+ std::unique_lock locker{m_lock};
+ if (m_state != STATE_REPLAYING) {
+ locker.unlock();
+ on_flush->complete(0);
+ return;
+ } else if (m_local_journal_replay == nullptr) {
+ // raced w/ a tag creation stop/start, which implies that
+ // the replay is flushed
+ locker.unlock();
+ flush_commit_position(on_flush);
+ return;
+ }
+
+ dout(15) << dendl;
+ auto ctx = new LambdaContext(
+ [this, on_flush](int r) {
+ handle_flush_local_replay(on_flush, r);
+ });
+ m_local_journal_replay->flush(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_flush_local_replay(Context* on_flush, int r) {
+ dout(15) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "error flushing local replay: " << cpp_strerror(r) << dendl;
+ on_flush->complete(r);
+ return;
+ }
+
+ flush_commit_position(on_flush);
+}
+
+template <typename I>
+void Replayer<I>::flush_commit_position(Context* on_flush) {
+ std::unique_lock locker{m_lock};
+ if (m_state != STATE_REPLAYING) {
+ locker.unlock();
+ on_flush->complete(0);
+ return;
+ }
+
+ dout(15) << dendl;
+ auto ctx = new LambdaContext(
+ [this, on_flush](int r) {
+ handle_flush_commit_position(on_flush, r);
+ });
+ m_state_builder->remote_journaler->flush_commit_position(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_flush_commit_position(Context* on_flush, int r) {
+ dout(15) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "error flushing remote journal commit position: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ on_flush->complete(r);
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_error(int r, const std::string &error) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (m_error_code == 0) {
+ m_error_code = r;
+ m_error_description = error;
+ }
+}
+
+template <typename I>
+bool Replayer<I>::is_replay_complete() const {
+ std::unique_lock locker{m_lock};
+ return is_replay_complete(locker);
+}
+
+template <typename I>
+bool Replayer<I>::is_replay_complete(
+ const std::unique_lock<ceph::mutex>&) const {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ return (m_state == STATE_COMPLETE);
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_complete(int r, const std::string &error) {
+ std::unique_lock locker{m_lock};
+ handle_replay_complete(locker, r, error);
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_complete(
+ const std::unique_lock<ceph::mutex>&, int r, const std::string &error) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ dout(10) << "r=" << r << ", error=" << error << dendl;
+ if (r < 0) {
+ derr << "replay encountered an error: " << cpp_strerror(r) << dendl;
+ handle_replay_error(r, error);
+ }
+
+ if (m_state != STATE_REPLAYING) {
+ return;
+ }
+
+ m_state = STATE_COMPLETE;
+ notify_status_updated();
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_ready() {
+ std::unique_lock locker{m_lock};
+ handle_replay_ready(locker);
+}
+
+// Attempt to pop and process the next ready entry from the remote journal.
+// Expects m_lock held via `locker`; the lock is released once the entry is
+// tracked by m_event_replay_tracker. If the entry belongs to a new tag
+// epoch, the replay is flushed and a local tag allocated first.
+template <typename I>
+void Replayer<I>::handle_replay_ready(
+    std::unique_lock<ceph::mutex>& locker) {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+  dout(20) << dendl;
+  if (is_replay_complete(locker)) {
+    return;
+  }
+
+  if (!m_state_builder->remote_journaler->try_pop_front(&m_replay_entry,
+                                                        &m_replay_tag_tid)) {
+    dout(20) << "no entries ready for replay" << dendl;
+    return;
+  }
+
+  // can safely drop lock once the entry is tracked
+  m_event_replay_tracker.start_op();
+  locker.unlock();
+
+  // fixed log formatting: separator was missing between the two fields
+  dout(20) << "entry tid=" << m_replay_entry.get_commit_tid() << ", "
+           << "tag_tid=" << m_replay_tag_tid << dendl;
+  if (!m_replay_tag_valid || m_replay_tag.tid != m_replay_tag_tid) {
+    // must allocate a new local journal tag prior to processing
+    replay_flush();
+    return;
+  }
+
+  preprocess_entry();
+}
+
+template <typename I>
+void Replayer<I>::replay_flush() {
+ dout(10) << dendl;
+ m_flush_tracker.start_op();
+
+ // shut down the replay to flush all IO and ops and create a new
+ // replayer to handle the new tag epoch
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_replay_flush_shut_down>(this);
+ ceph_assert(m_local_journal_replay != nullptr);
+ m_local_journal_replay->shut_down(false, ctx);
+}
+
+// Stage two of replay_flush(): the old external replay has been shut down,
+// so discard the old listener/replay handles and restart external replay
+// against the (possibly re-opened) local journal. Any failure is forwarded
+// to handle_replay_flush() with the lock dropped.
+template <typename I>
+void Replayer<I>::handle_replay_flush_shut_down(int r) {
+  std::unique_lock locker{m_lock};
+  dout(10) << "r=" << r << dendl;
+
+  ceph_assert(m_local_journal != nullptr);
+  ceph_assert(m_local_journal_listener != nullptr);
+
+  // blocks if listener notification is in-progress
+  m_local_journal->remove_listener(m_local_journal_listener);
+  delete m_local_journal_listener;
+  m_local_journal_listener = nullptr;
+
+  m_local_journal->stop_external_replay();
+  m_local_journal_replay = nullptr;
+  m_local_journal.reset();
+
+  if (r < 0) {
+    locker.unlock();
+
+    handle_replay_flush(r);
+    return;
+  }
+
+  // journal might have been closed now that we stopped external replay
+  auto local_image_ctx = m_state_builder->local_image_ctx;
+  std::shared_lock local_image_locker{local_image_ctx->image_lock};
+  m_local_journal = local_image_ctx->journal;
+  if (m_local_journal == nullptr) {
+    local_image_locker.unlock();
+    locker.unlock();
+
+    derr << "local image journal closed" << dendl;
+    handle_replay_flush(-EINVAL);
+    return;
+  }
+
+  // restart external replay for the new tag epoch
+  auto ctx = create_context_callback<
+    Replayer<I>, &Replayer<I>::handle_replay_flush>(this);
+  m_local_journal->start_external_replay(&m_local_journal_replay, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_flush(int r) {
+ std::unique_lock locker{m_lock};
+ dout(10) << "r=" << r << dendl;
+ m_flush_tracker.finish_op();
+
+ if (r < 0) {
+ derr << "replay flush encountered an error: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(locker, r, "replay flush encountered an error");
+ m_event_replay_tracker.finish_op();
+ return;
+ } else if (is_replay_complete(locker)) {
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ // check for resync/promotion state after adding listener
+ if (!add_local_journal_listener(locker)) {
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+ locker.unlock();
+
+ get_remote_tag();
+}
+
+template <typename I>
+void Replayer<I>::get_remote_tag() {
+ dout(15) << "tag_tid: " << m_replay_tag_tid << dendl;
+
+ Context *ctx = create_context_callback<
+ Replayer, &Replayer<I>::handle_get_remote_tag>(this);
+ m_state_builder->remote_journaler->get_tag(m_replay_tag_tid, &m_replay_tag,
+ ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_get_remote_tag(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r == 0) {
+ try {
+ auto it = m_replay_tag.data.cbegin();
+ decode(m_replay_tag_data, it);
+ } catch (const buffer::error &err) {
+ r = -EBADMSG;
+ }
+ }
+
+ if (r < 0) {
+ derr << "failed to retrieve remote tag " << m_replay_tag_tid << ": "
+ << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to retrieve remote tag");
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ m_replay_tag_valid = true;
+ dout(15) << "decoded remote tag " << m_replay_tag_tid << ": "
+ << m_replay_tag_data << dendl;
+
+ allocate_local_tag();
+}
+
+template <typename I>
+void Replayer<I>::allocate_local_tag() {
+ dout(15) << dendl;
+
+ std::string mirror_uuid = m_replay_tag_data.mirror_uuid;
+ if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ mirror_uuid = m_state_builder->remote_mirror_uuid;
+ } else if (mirror_uuid == m_local_mirror_uuid) {
+ mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
+ } else if (mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+ // handle possible edge condition where daemon can failover and
+ // the local image has already been promoted/demoted
+ auto local_tag_data = m_local_journal->get_tag_data();
+ if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+ (local_tag_data.predecessor.commit_valid &&
+ local_tag_data.predecessor.mirror_uuid ==
+ librbd::Journal<>::LOCAL_MIRROR_UUID)) {
+ dout(15) << "skipping stale demotion event" << dendl;
+ handle_process_entry_safe(m_replay_entry, m_replay_bytes,
+ m_replay_start_time, 0);
+ handle_replay_ready();
+ return;
+ } else {
+ dout(5) << "encountered image demotion: stopping" << dendl;
+ handle_replay_complete(0, "");
+ }
+ }
+
+ librbd::journal::TagPredecessor predecessor(m_replay_tag_data.predecessor);
+ if (predecessor.mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ predecessor.mirror_uuid = m_state_builder->remote_mirror_uuid;
+ } else if (predecessor.mirror_uuid == m_local_mirror_uuid) {
+ predecessor.mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
+ }
+
+ dout(15) << "mirror_uuid=" << mirror_uuid << ", "
+ << "predecessor=" << predecessor << ", "
+ << "replay_tag_tid=" << m_replay_tag_tid << dendl;
+ Context *ctx = create_context_callback<
+ Replayer, &Replayer<I>::handle_allocate_local_tag>(this);
+ m_local_journal->allocate_tag(mirror_uuid, predecessor, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_allocate_local_tag(int r) {
+ dout(15) << "r=" << r << ", "
+ << "tag_tid=" << m_local_journal->get_tag_tid() << dendl;
+ if (r < 0) {
+ derr << "failed to allocate journal tag: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to allocate journal tag");
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ preprocess_entry();
+}
+
+template <typename I>
+void Replayer<I>::preprocess_entry() {
+ dout(20) << "preprocessing entry tid=" << m_replay_entry.get_commit_tid()
+ << dendl;
+
+ bufferlist data = m_replay_entry.get_data();
+ auto it = data.cbegin();
+ int r = m_local_journal_replay->decode(&it, &m_event_entry);
+ if (r < 0) {
+ derr << "failed to decode journal event" << dendl;
+ handle_replay_complete(r, "failed to decode journal event");
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ m_replay_bytes = data.length();
+ uint32_t delay = calculate_replay_delay(
+ m_event_entry.timestamp,
+ m_state_builder->local_image_ctx->mirroring_replay_delay);
+ if (delay == 0) {
+ handle_preprocess_entry_ready(0);
+ return;
+ }
+
+ std::unique_lock locker{m_lock};
+ if (is_replay_complete(locker)) {
+ // don't schedule a delayed replay task if a shut-down is in-progress
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ dout(20) << "delaying replay by " << delay << " sec" << dendl;
+ std::unique_lock timer_locker{m_threads->timer_lock};
+ ceph_assert(m_delayed_preprocess_task == nullptr);
+ m_delayed_preprocess_task = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_delayed_preprocess_task>(this);
+ m_threads->timer->add_event_after(delay, m_delayed_preprocess_task);
+}
+
+template <typename I>
+void Replayer<I>::handle_delayed_preprocess_task(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_threads->timer_lock));
+ m_delayed_preprocess_task = nullptr;
+
+ m_threads->work_queue->queue(create_context_callback<
+ Replayer, &Replayer<I>::handle_preprocess_entry_ready>(this), 0);
+}
+
+template <typename I>
+void Replayer<I>::handle_preprocess_entry_ready(int r) {
+ dout(20) << "r=" << r << dendl;
+ ceph_assert(r == 0);
+
+ m_replay_start_time = ceph_clock_now();
+ if (!m_event_preprocessor->is_required(m_event_entry)) {
+ process_entry();
+ return;
+ }
+
+ Context *ctx = create_context_callback<
+ Replayer, &Replayer<I>::handle_preprocess_entry_safe>(this);
+ m_event_preprocessor->preprocess(&m_event_entry, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_preprocess_entry_safe(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -ECANCELED) {
+ handle_replay_complete(0, "lost exclusive lock");
+ } else {
+ derr << "failed to preprocess journal event" << dendl;
+ handle_replay_complete(r, "failed to preprocess journal event");
+ }
+
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ process_entry();
+}
+
+template <typename I>
+void Replayer<I>::process_entry() {
+ dout(20) << "processing entry tid=" << m_replay_entry.get_commit_tid()
+ << dendl;
+
+ Context *on_ready = create_context_callback<
+ Replayer, &Replayer<I>::handle_process_entry_ready>(this);
+ Context *on_commit = new C_ReplayCommitted(this, std::move(m_replay_entry),
+ m_replay_bytes,
+ m_replay_start_time);
+
+ m_local_journal_replay->process(m_event_entry, on_ready, on_commit);
+}
+
+template <typename I>
+void Replayer<I>::handle_process_entry_ready(int r) {
+ std::unique_lock locker{m_lock};
+
+ dout(20) << dendl;
+ ceph_assert(r == 0);
+
+ bool update_status = false;
+ {
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+ std::shared_lock image_locker{local_image_ctx->image_lock};
+ auto image_spec = util::compute_image_spec(local_image_ctx->md_ctx,
+ local_image_ctx->name);
+ if (m_image_spec != image_spec) {
+ m_image_spec = image_spec;
+ update_status = true;
+ }
+ }
+
+ m_replay_status_formatter->handle_entry_processed(m_replay_bytes);
+
+ if (update_status) {
+ unregister_perf_counters();
+ register_perf_counters();
+ notify_status_updated();
+ }
+
+ // attempt to process the next event
+ handle_replay_ready(locker);
+}
+
+// Invoked once a replayed event is safely committed locally: acknowledge the
+// commit back to the remote journaler (or record the error), bump perf
+// counters, and release the event-replay tracker op from the work queue so
+// the release happens outside the commit callback's context.
+template <typename I>
+void Replayer<I>::handle_process_entry_safe(
+    const ReplayEntry &replay_entry, uint64_t replay_bytes,
+    const utime_t &replay_start_time, int r) {
+  dout(20) << "commit_tid=" << replay_entry.get_commit_tid() << ", r=" << r
+           << dendl;
+
+  if (r < 0) {
+    derr << "failed to commit journal event: " << cpp_strerror(r) << dendl;
+    handle_replay_complete(r, "failed to commit journal event");
+  } else {
+    // advance the remote journal commit position for this entry
+    ceph_assert(m_state_builder->remote_journaler != nullptr);
+    m_state_builder->remote_journaler->committed(replay_entry);
+  }
+
+  auto latency = ceph_clock_now() - replay_start_time;
+  if (g_journal_perf_counters) {
+    g_journal_perf_counters->inc(l_rbd_mirror_replay);
+    g_journal_perf_counters->inc(l_rbd_mirror_replay_bytes, replay_bytes);
+    g_journal_perf_counters->tinc(l_rbd_mirror_replay_latency, latency);
+  }
+
+  // defer the per-image counters and tracker release to the work queue
+  auto ctx = new LambdaContext(
+    [this, replay_bytes, latency](int r) {
+      std::unique_lock locker{m_lock};
+      schedule_flush_local_replay_task();
+
+      if (m_perf_counters) {
+        m_perf_counters->inc(l_rbd_mirror_replay);
+        m_perf_counters->inc(l_rbd_mirror_replay_bytes, replay_bytes);
+        m_perf_counters->tinc(l_rbd_mirror_replay_latency, latency);
+      }
+
+      m_event_replay_tracker.finish_op();
+    });
+  m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void Replayer<I>::handle_resync_image() {
+ dout(10) << dendl;
+
+ std::unique_lock locker{m_lock};
+ m_resync_requested = true;
+ handle_replay_complete(locker, 0, "resync requested");
+}
+
+template <typename I>
+void Replayer<I>::notify_status_updated() {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ dout(10) << dendl;
+
+ auto ctx = new C_TrackedOp(m_in_flight_op_tracker, new LambdaContext(
+ [this](int) {
+ m_replayer_listener->handle_notification();
+ }));
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+// Cancel a pending mirroring-replay-delay timer task, if one is scheduled.
+// If the timer event was successfully cancelled, the entry it was holding
+// must release its event-replay tracker op here since the task callback
+// will never run. Expects m_lock to be held.
+template <typename I>
+void Replayer<I>::cancel_delayed_preprocess_task() {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+  bool canceled_delayed_preprocess_task = false;
+  {
+    std::unique_lock timer_locker{m_threads->timer_lock};
+    if (m_delayed_preprocess_task != nullptr) {
+      dout(10) << dendl;
+      canceled_delayed_preprocess_task = m_threads->timer->cancel_event(
+        m_delayed_preprocess_task);
+      // the task cannot have fired: both it and we hold timer_lock
+      ceph_assert(canceled_delayed_preprocess_task);
+      m_delayed_preprocess_task = nullptr;
+    }
+  }
+
+  if (canceled_delayed_preprocess_task) {
+    // wake up sleeping replay
+    m_event_replay_tracker.finish_op();
+  }
+}
+
+// Decode and validate the remote journal's registered client for this
+// mirror peer. Returns -EBADMSG if the client metadata cannot be decoded
+// (requires operator intervention), -ENOTCONN if the client is flagged
+// disconnected (setting *resync_requested when the config opts into
+// automatic resync), or 0 if the client state is acceptable. Expects
+// m_lock to be held.
+template <typename I>
+int Replayer<I>::validate_remote_client_state(
+    const cls::journal::Client& remote_client,
+    librbd::journal::MirrorPeerClientMeta* remote_client_meta,
+    bool* resync_requested, std::string* error) {
+  ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+  if (!util::decode_client_meta(remote_client, remote_client_meta)) {
+    // require operator intervention since the data is corrupt
+    *error = "error retrieving remote journal client";
+    return -EBADMSG;
+  }
+
+  auto local_image_ctx = m_state_builder->local_image_ctx;
+  dout(5) << "image_id=" << local_image_ctx->id << ", "
+          << "remote_client_meta.image_id="
+          << remote_client_meta->image_id << ", "
+          << "remote_client.state=" << remote_client.state << dendl;
+  if (remote_client_meta->image_id == local_image_ctx->id &&
+      remote_client.state != cls::journal::CLIENT_STATE_CONNECTED) {
+    dout(5) << "client flagged disconnected, stopping image replay" << dendl;
+    if (local_image_ctx->config.template get_val<bool>(
+          "rbd_mirroring_resync_after_disconnect")) {
+      dout(10) << "disconnected: automatic resync" << dendl;
+      *resync_requested = true;
+      *error = "disconnected: automatic resync";
+      return -ENOTCONN;
+    } else {
+      dout(10) << "disconnected" << dendl;
+      *error = "disconnected";
+      return -ENOTCONN;
+    }
+  }
+
+  return 0;
+}
+
+template <typename I>
+void Replayer<I>::register_perf_counters() {
+ dout(5) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(m_perf_counters == nullptr);
+
+ auto cct = static_cast<CephContext *>(m_state_builder->local_image_ctx->cct);
+ auto prio = cct->_conf.get_val<int64_t>("rbd_mirror_image_perf_stats_prio");
+ PerfCountersBuilder plb(g_ceph_context, "rbd_mirror_image_" + m_image_spec,
+ l_rbd_mirror_journal_first, l_rbd_mirror_journal_last);
+ plb.add_u64_counter(l_rbd_mirror_replay, "replay", "Replays", "r", prio);
+ plb.add_u64_counter(l_rbd_mirror_replay_bytes, "replay_bytes",
+ "Replayed data", "rb", prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_rbd_mirror_replay_latency, "replay_latency",
+ "Replay latency", "rl", prio);
+ m_perf_counters = plb.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(m_perf_counters);
+}
+
+template <typename I>
+void Replayer<I>::unregister_perf_counters() {
+ dout(5) << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ PerfCounters *perf_counters = nullptr;
+ std::swap(perf_counters, m_perf_counters);
+
+ if (perf_counters != nullptr) {
+ g_ceph_context->get_perfcounters_collection()->remove(perf_counters);
+ delete perf_counters;
+ }
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::Replayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/Replayer.h b/src/tools/rbd_mirror/image_replayer/journal/Replayer.h
new file mode 100644
index 000000000..6b1f36d9c
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/Replayer.h
@@ -0,0 +1,323 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_REPLAYER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_REPLAYER_H
+
+#include "tools/rbd_mirror/image_replayer/Replayer.h"
+#include "include/utime.h"
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_mutex.h"
+#include "common/RefCountedObj.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/ReplayEntry.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <string>
+#include <type_traits>
+
+namespace journal { class Journaler; }
+namespace librbd {
+
+struct ImageCtx;
+namespace journal { template <typename I> class Replay; }
+
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+struct ReplayerListener;
+
+namespace journal {
+
+template <typename> class EventPreprocessor;
+template <typename> class ReplayStatusFormatter;
+template <typename> class StateBuilder;
+
+template <typename ImageCtxT>
+class Replayer : public image_replayer::Replayer {
+public:
+ typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ static Replayer* create(
+ Threads<ImageCtxT>* threads,
+ const std::string& local_mirror_uuid,
+ StateBuilder<ImageCtxT>* state_builder,
+ ReplayerListener* replayer_listener) {
+ return new Replayer(threads, local_mirror_uuid, state_builder,
+ replayer_listener);
+ }
+
+ Replayer(
+ Threads<ImageCtxT>* threads,
+ const std::string& local_mirror_uuid,
+ StateBuilder<ImageCtxT>* state_builder,
+ ReplayerListener* replayer_listener);
+ ~Replayer();
+
+ void destroy() override {
+ delete this;
+ }
+
+ void init(Context* on_finish) override;
+ void shut_down(Context* on_finish) override;
+
+ void flush(Context* on_finish) override;
+
+ bool get_replay_status(std::string* description, Context* on_finish) override;
+
+ bool is_replaying() const override {
+ std::unique_lock locker{m_lock};
+ return (m_state == STATE_REPLAYING);
+ }
+
+ bool is_resync_requested() const override {
+ std::unique_lock locker(m_lock);
+ return m_resync_requested;
+ }
+
+ int get_error_code() const override {
+ std::unique_lock locker(m_lock);
+ return m_error_code;
+ }
+
+ std::string get_error_description() const override {
+ std::unique_lock locker(m_lock);
+ return m_error_description;
+ }
+
+ std::string get_image_spec() const {
+ std::unique_lock locker(m_lock);
+ return m_image_spec;
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <init>
+ * |
+ * v (error)
+ * INIT_REMOTE_JOURNALER * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v (error) *
+ * START_EXTERNAL_REPLAY * * * * * * * * * * * * * * * * * * *
+ * | *
+ * | /--------------------------------------------\ *
+ * | | | *
+ * v v (asok flush) | *
+ * REPLAYING -------------> LOCAL_REPLAY_FLUSH | *
+ * | \ | | *
+ * | | v | *
+ * | | FLUSH_COMMIT_POSITION | *
+ * | | | | *
+ * | | \--------------------/| *
+ * | | | *
+ * | | (entries available) | *
+ * | \-----------> REPLAY_READY | *
+ * | | | *
+ * | | (skip if not | *
+ * | v needed) (error) *
+ * | REPLAY_FLUSH * * * * * * * * * *
+ * | | | * *
+ * | | (skip if not | * *
+ * | v needed) (error) * *
+ * | GET_REMOTE_TAG * * * * * * * * *
+ * | | | * *
+ * | | (skip if not | * *
+ * | v needed) (error) * *
+ * | ALLOCATE_LOCAL_TAG * * * * * * *
+ * | | | * *
+ * | v (error) * *
+ * | PREPROCESS_ENTRY * * * * * * * *
+ * | | | * *
+ * | v (error) * *
+ * | PROCESS_ENTRY * * * * * * * * * *
+ * | | | * *
+ * | \---------------------/ * *
+ * v (shutdown) * *
+ * REPLAY_COMPLETE < * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * WAIT_FOR_FLUSH *
+ * | *
+ * v *
+ * SHUT_DOWN_LOCAL_JOURNAL_REPLAY *
+ * | *
+ * v *
+ * WAIT_FOR_REPLAY *
+ * | *
+ * v *
+ * CLOSE_LOCAL_IMAGE < * * * * * * * * * * * * * * * * * * * *
+ * |
+ * v (skip if not started)
+ * STOP_REMOTE_JOURNALER_REPLAY
+ * |
+ * v
+ * WAIT_FOR_IN_FLIGHT_OPS
+ * |
+ * v
+ * <shutdown>
+ *
+ * @endverbatim
+ */
+
+ typedef typename librbd::journal::TypeTraits<ImageCtxT>::ReplayEntry ReplayEntry;
+
+ enum State {
+ STATE_INIT,
+ STATE_REPLAYING,
+ STATE_COMPLETE
+ };
+
+ struct C_ReplayCommitted;
+ struct RemoteJournalerListener;
+ struct RemoteReplayHandler;
+ struct LocalJournalListener;
+
+ Threads<ImageCtxT>* m_threads;
+ std::string m_local_mirror_uuid;
+ StateBuilder<ImageCtxT>* m_state_builder;
+ ReplayerListener* m_replayer_listener;
+
+ mutable ceph::mutex m_lock;
+
+ std::string m_image_spec;
+ Context* m_on_init_shutdown = nullptr;
+
+ State m_state = STATE_INIT;
+ int m_error_code = 0;
+ std::string m_error_description;
+ bool m_resync_requested = false;
+
+ ceph::ref_t<typename std::remove_pointer<decltype(ImageCtxT::journal)>::type>
+ m_local_journal;
+ RemoteJournalerListener* m_remote_listener = nullptr;
+
+ librbd::journal::Replay<ImageCtxT>* m_local_journal_replay = nullptr;
+ EventPreprocessor<ImageCtxT>* m_event_preprocessor = nullptr;
+ ReplayStatusFormatter<ImageCtxT>* m_replay_status_formatter = nullptr;
+ RemoteReplayHandler* m_remote_replay_handler = nullptr;
+ LocalJournalListener* m_local_journal_listener = nullptr;
+
+ PerfCounters *m_perf_counters = nullptr;
+
+ ReplayEntry m_replay_entry;
+ uint64_t m_replay_bytes = 0;
+ utime_t m_replay_start_time;
+ bool m_replay_tag_valid = false;
+ uint64_t m_replay_tag_tid = 0;
+ cls::journal::Tag m_replay_tag;
+ librbd::journal::TagData m_replay_tag_data;
+ librbd::journal::EventEntry m_event_entry;
+
+ AsyncOpTracker m_flush_tracker;
+
+ AsyncOpTracker m_event_replay_tracker;
+ Context *m_delayed_preprocess_task = nullptr;
+
+ AsyncOpTracker m_in_flight_op_tracker;
+ Context *m_flush_local_replay_task = nullptr;
+
+ void handle_remote_journal_metadata_updated();
+
+ void schedule_flush_local_replay_task();
+ void cancel_flush_local_replay_task();
+ void handle_flush_local_replay_task(int r);
+
+ void flush_local_replay(Context* on_flush);
+ void handle_flush_local_replay(Context* on_flush, int r);
+
+ void flush_commit_position(Context* on_flush);
+ void handle_flush_commit_position(Context* on_flush, int r);
+
+ void init_remote_journaler();
+ void handle_init_remote_journaler(int r);
+
+ void start_external_replay(std::unique_lock<ceph::mutex>& locker);
+ void handle_start_external_replay(int r);
+
+ bool add_local_journal_listener(std::unique_lock<ceph::mutex>& locker);
+
+ bool notify_init_complete(std::unique_lock<ceph::mutex>& locker);
+
+ void wait_for_flush();
+ void handle_wait_for_flush(int r);
+
+ void shut_down_local_journal_replay();
+ void handle_shut_down_local_journal_replay(int r);
+
+ void wait_for_event_replay();
+ void handle_wait_for_event_replay(int r);
+
+ void close_local_image();
+ void handle_close_local_image(int r);
+
+ void stop_remote_journaler_replay();
+ void handle_stop_remote_journaler_replay(int r);
+
+ void wait_for_in_flight_ops();
+ void handle_wait_for_in_flight_ops(int r);
+
+ void replay_flush();
+ void handle_replay_flush_shut_down(int r);
+ void handle_replay_flush(int r);
+
+ void get_remote_tag();
+ void handle_get_remote_tag(int r);
+
+ void allocate_local_tag();
+ void handle_allocate_local_tag(int r);
+
+ void handle_replay_error(int r, const std::string &error);
+
+ bool is_replay_complete() const;
+ bool is_replay_complete(const std::unique_lock<ceph::mutex>& locker) const;
+
+ void handle_replay_complete(int r, const std::string &error_desc);
+ void handle_replay_complete(const std::unique_lock<ceph::mutex>&,
+ int r, const std::string &error_desc);
+ void handle_replay_ready();
+ void handle_replay_ready(std::unique_lock<ceph::mutex>& locker);
+
+ void preprocess_entry();
+ void handle_delayed_preprocess_task(int r);
+ void handle_preprocess_entry_ready(int r);
+ void handle_preprocess_entry_safe(int r);
+
+ void process_entry();
+ void handle_process_entry_ready(int r);
+ void handle_process_entry_safe(const ReplayEntry& replay_entry,
+                                 uint64_t replay_bytes,
+ const utime_t &replay_start_time, int r);
+
+ void handle_resync_image();
+
+ void notify_status_updated();
+
+ void cancel_delayed_preprocess_task();
+
+ int validate_remote_client_state(
+ const cls::journal::Client& remote_client,
+ librbd::journal::MirrorPeerClientMeta* remote_client_meta,
+ bool* resync_requested, std::string* error);
+
+ void register_perf_counters();
+ void unregister_perf_counters();
+
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::Replayer<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_REPLAYER_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/StateBuilder.cc b/src/tools/rbd_mirror/image_replayer/journal/StateBuilder.cc
new file mode 100644
index 000000000..5f1fb0e2f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/StateBuilder.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "StateBuilder.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.h"
+#include "tools/rbd_mirror/image_replayer/journal/Replayer.h"
+#include "tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "StateBuilder: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+template <typename I>
+StateBuilder<I>::StateBuilder(const std::string& global_image_id)
+ : image_replayer::StateBuilder<I>(global_image_id) {
+}
+
+template <typename I>
+StateBuilder<I>::~StateBuilder() {
+ ceph_assert(remote_journaler == nullptr);
+}
+
+template <typename I>
+void StateBuilder<I>::close(Context* on_finish) {
+ dout(10) << dendl;
+
+ // close the remote journaler after closing the local image
+ // in case we have lost contact w/ the remote cluster and
+ // will block
+ on_finish = new LambdaContext([this, on_finish](int) {
+ shut_down_remote_journaler(on_finish);
+ });
+ on_finish = new LambdaContext([this, on_finish](int) {
+ this->close_local_image(on_finish);
+ });
+ this->close_remote_image(on_finish);
+}
+
+template <typename I>
+bool StateBuilder<I>::is_disconnected() const {
+ return (remote_client_state == cls::journal::CLIENT_STATE_DISCONNECTED);
+}
+
+template <typename I>
+bool StateBuilder<I>::is_linked_impl() const {
+ ceph_assert(!this->remote_mirror_uuid.empty());
+ return (local_primary_mirror_uuid == this->remote_mirror_uuid);
+}
+
+template <typename I>
+cls::rbd::MirrorImageMode StateBuilder<I>::get_mirror_image_mode() const {
+ return cls::rbd::MIRROR_IMAGE_MODE_JOURNAL;
+}
+
+template <typename I>
+image_sync::SyncPointHandler* StateBuilder<I>::create_sync_point_handler() {
+ dout(10) << dendl;
+
+ this->m_sync_point_handler = SyncPointHandler<I>::create(this);
+ return this->m_sync_point_handler;
+}
+
+template <typename I>
+BaseRequest* StateBuilder<I>::create_local_image_request(
+ Threads<I>* threads,
+ librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ Context* on_finish) {
+ return CreateLocalImageRequest<I>::create(
+ threads, local_io_ctx, this->remote_image_ctx, this->global_image_id,
+ pool_meta_cache, progress_ctx, this, on_finish);
+}
+
+template <typename I>
+BaseRequest* StateBuilder<I>::create_prepare_replay_request(
+ const std::string& local_mirror_uuid,
+ ProgressContext* progress_ctx,
+ bool* resync_requested,
+ bool* syncing,
+ Context* on_finish) {
+ return PrepareReplayRequest<I>::create(
+ local_mirror_uuid, progress_ctx, this, resync_requested, syncing,
+ on_finish);
+}
+
+template <typename I>
+image_replayer::Replayer* StateBuilder<I>::create_replayer(
+ Threads<I>* threads,
+ InstanceWatcher<I>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ ReplayerListener* replayer_listener) {
+ return Replayer<I>::create(
+ threads, local_mirror_uuid, this, replayer_listener);
+}
+
+template <typename I>
+void StateBuilder<I>::shut_down_remote_journaler(Context* on_finish) {
+ if (remote_journaler == nullptr) {
+ on_finish->complete(0);
+ return;
+ }
+
+ dout(10) << dendl;
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_shut_down_remote_journaler(r, on_finish);
+ });
+ remote_journaler->shut_down(ctx);
+}
+
+template <typename I>
+void StateBuilder<I>::handle_shut_down_remote_journaler(int r,
+ Context* on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to shut down remote journaler: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ delete remote_journaler;
+ remote_journaler = nullptr;
+ on_finish->complete(r);
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::StateBuilder<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/StateBuilder.h b/src/tools/rbd_mirror/image_replayer/journal/StateBuilder.h
new file mode 100644
index 000000000..790d1390b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/StateBuilder.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_STATE_BUILDER_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_STATE_BUILDER_H
+
+#include "tools/rbd_mirror/image_replayer/StateBuilder.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <string>
+
+struct Context;
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+template <typename> class SyncPointHandler;
+
+template <typename ImageCtxT>
+class StateBuilder : public image_replayer::StateBuilder<ImageCtxT> {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+
+ static StateBuilder* create(const std::string& global_image_id) {
+ return new StateBuilder(global_image_id);
+ }
+
+ StateBuilder(const std::string& global_image_id);
+ ~StateBuilder() override;
+
+ void close(Context* on_finish) override;
+
+ bool is_disconnected() const override;
+
+ cls::rbd::MirrorImageMode get_mirror_image_mode() const override;
+
+ image_sync::SyncPointHandler* create_sync_point_handler() override;
+
+ bool replay_requires_remote_image() const override {
+ return false;
+ }
+
+ BaseRequest* create_local_image_request(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ Context* on_finish) override;
+
+ BaseRequest* create_prepare_replay_request(
+ const std::string& local_mirror_uuid,
+ ProgressContext* progress_ctx,
+ bool* resync_requested,
+ bool* syncing,
+ Context* on_finish) override;
+
+ image_replayer::Replayer* create_replayer(
+ Threads<ImageCtxT>* threads,
+ InstanceWatcher<ImageCtxT>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ ReplayerListener* replayer_listener) override;
+
+ std::string local_primary_mirror_uuid;
+
+ Journaler* remote_journaler = nullptr;
+ cls::journal::ClientState remote_client_state =
+ cls::journal::CLIENT_STATE_CONNECTED;
+ librbd::journal::MirrorPeerClientMeta remote_client_meta;
+
+ SyncPointHandler<ImageCtxT>* sync_point_handler = nullptr;
+
+private:
+ bool is_linked_impl() const override;
+
+ void shut_down_remote_journaler(Context* on_finish);
+ void handle_shut_down_remote_journaler(int r, Context* on_finish);
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::StateBuilder<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_STATE_BUILDER_H
diff --git a/src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.cc b/src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.cc
new file mode 100644
index 000000000..66d13e555
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.cc
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPointHandler.h"
+#include "StateBuilder.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::journal::" \
+ << "SyncPointHandler: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+template <typename I>
+SyncPointHandler<I>::SyncPointHandler(StateBuilder<I>* state_builder)
+ : m_state_builder(state_builder),
+ m_client_meta_copy(state_builder->remote_client_meta) {
+}
+
+template <typename I>
+typename SyncPointHandler<I>::SyncPoints
+SyncPointHandler<I>::get_sync_points() const {
+ SyncPoints sync_points;
+ for (auto& sync_point : m_client_meta_copy.sync_points) {
+ sync_points.emplace_back(
+ sync_point.snap_namespace,
+ sync_point.snap_name,
+ sync_point.from_snap_name,
+ sync_point.object_number);
+ }
+ return sync_points;
+}
+
+template <typename I>
+librbd::SnapSeqs SyncPointHandler<I>::get_snap_seqs() const {
+ return m_client_meta_copy.snap_seqs;
+}
+
+template <typename I>
+void SyncPointHandler<I>::update_sync_points(
+ const librbd::SnapSeqs& snap_seqs, const SyncPoints& sync_points,
+ bool sync_complete, Context* on_finish) {
+ dout(10) << dendl;
+
+ if (sync_complete && sync_points.empty()) {
+ m_client_meta_copy.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+ }
+
+ m_client_meta_copy.snap_seqs = snap_seqs;
+ m_client_meta_copy.sync_points.clear();
+ for (auto& sync_point : sync_points) {
+ m_client_meta_copy.sync_points.emplace_back(
+ sync_point.snap_namespace,
+ sync_point.snap_name,
+ sync_point.from_snap_name,
+ sync_point.object_number);
+
+ if (sync_point.object_number) {
+ m_client_meta_copy.sync_object_count = std::max(
+ m_client_meta_copy.sync_object_count, *sync_point.object_number + 1);
+ }
+ }
+
+ dout(20) << "client_meta=" << m_client_meta_copy << dendl;
+ bufferlist client_data_bl;
+ librbd::journal::ClientData client_data{m_client_meta_copy};
+ encode(client_data, client_data_bl);
+
+ auto ctx = new LambdaContext([this, on_finish](int r) {
+ handle_update_sync_points(r, on_finish);
+ });
+ m_state_builder->remote_journaler->update_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void SyncPointHandler<I>::handle_update_sync_points(int r, Context* on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r >= 0) {
+ m_state_builder->remote_client_meta.snap_seqs =
+ m_client_meta_copy.snap_seqs;
+ m_state_builder->remote_client_meta.sync_points =
+ m_client_meta_copy.sync_points;
+ } else {
+ derr << "failed to update remote journal client meta for image "
+ << m_state_builder->global_image_id << ": " << cpp_strerror(r)
+ << dendl;
+ }
+
+ on_finish->complete(r);
+}
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::journal::SyncPointHandler<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h b/src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h
new file mode 100644
index 000000000..b4f492c19
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_SYNC_POINT_HANDLER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_SYNC_POINT_HANDLER_H
+
+#include "tools/rbd_mirror/image_sync/Types.h"
+#include "librbd/journal/Types.h"
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace journal {
+
+template <typename> class StateBuilder;
+
+template <typename ImageCtxT>
+class SyncPointHandler : public image_sync::SyncPointHandler {
+public:
+ using SyncPoint = image_sync::SyncPoint;
+ using SyncPoints = image_sync::SyncPoints;
+
+ static SyncPointHandler* create(StateBuilder<ImageCtxT>* state_builder) {
+ return new SyncPointHandler(state_builder);
+ }
+ SyncPointHandler(StateBuilder<ImageCtxT>* state_builder);
+
+ SyncPoints get_sync_points() const override;
+ librbd::SnapSeqs get_snap_seqs() const override;
+
+ void update_sync_points(const librbd::SnapSeqs& snap_seqs,
+ const SyncPoints& sync_points,
+ bool sync_complete,
+ Context* on_finish) override;
+
+private:
+ StateBuilder<ImageCtxT>* m_state_builder;
+
+ librbd::journal::MirrorPeerClientMeta m_client_meta_copy;
+
+ void handle_update_sync_points(int r, Context* on_finish);
+
+};
+
+} // namespace journal
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::journal::SyncPointHandler<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_JOURNAL_SYNC_POINT_HANDLER_H
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.cc b/src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.cc
new file mode 100644
index 000000000..2ed321738
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.cc
@@ -0,0 +1,658 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ApplyImageStateRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/image/GetMetadataRequest.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/Utils.h"
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::snapshot::" \
+ << "ApplyImageStateRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+ApplyImageStateRequest<I>::ApplyImageStateRequest(
+ const std::string& local_mirror_uuid,
+ const std::string& remote_mirror_uuid,
+ I* local_image_ctx,
+ I* remote_image_ctx,
+ librbd::mirror::snapshot::ImageState image_state,
+ Context* on_finish)
+ : m_local_mirror_uuid(local_mirror_uuid),
+ m_remote_mirror_uuid(remote_mirror_uuid),
+ m_local_image_ctx(local_image_ctx),
+ m_remote_image_ctx(remote_image_ctx),
+ m_image_state(image_state),
+ m_on_finish(on_finish) {
+ dout(15) << "image_state=" << m_image_state << dendl;
+
+ std::shared_lock image_locker{m_local_image_ctx->image_lock};
+ m_features = m_local_image_ctx->features & ~RBD_FEATURES_IMPLICIT_ENABLE;
+ compute_local_to_remote_snap_ids();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::send() {
+ rename_image();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::rename_image() {
+ std::shared_lock owner_locker{m_local_image_ctx->owner_lock};
+ std::shared_lock image_locker{m_local_image_ctx->image_lock};
+ if (m_local_image_ctx->name == m_image_state.name) {
+ image_locker.unlock();
+ owner_locker.unlock();
+
+ update_features();
+ return;
+ }
+ image_locker.unlock();
+
+ dout(15) << "local_image_name=" << m_local_image_ctx->name << ", "
+ << "remote_image_name=" << m_image_state.name << dendl;
+
+ auto ctx = create_context_callback<
+ ApplyImageStateRequest<I>,
+ &ApplyImageStateRequest<I>::handle_rename_image>(this);
+ m_local_image_ctx->operations->execute_rename(m_image_state.name, ctx);
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::handle_rename_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to rename image to '" << m_image_state.name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ update_features();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::update_features() {
+ uint64_t feature_updates = 0UL;
+ bool enabled = false;
+
+ auto image_state_features =
+ m_image_state.features & ~RBD_FEATURES_IMPLICIT_ENABLE;
+ feature_updates = (m_features & ~image_state_features);
+ if (feature_updates == 0UL) {
+ feature_updates = (image_state_features & ~m_features);
+ enabled = (feature_updates != 0UL);
+ }
+
+ if (feature_updates == 0UL) {
+ get_image_meta();
+ return;
+ }
+
+ dout(15) << "image_features=" << m_features << ", "
+ << "state_features=" << image_state_features << ", "
+ << "feature_updates=" << feature_updates << ", "
+ << "enabled=" << enabled << dendl;
+
+ if (enabled) {
+ m_features |= feature_updates;
+ } else {
+ m_features &= ~feature_updates;
+ }
+
+ std::shared_lock owner_lock{m_local_image_ctx->owner_lock};
+ auto ctx = create_context_callback<
+ ApplyImageStateRequest<I>,
+ &ApplyImageStateRequest<I>::handle_update_features>(this);
+ m_local_image_ctx->operations->execute_update_features(
+ feature_updates, enabled, ctx, 0U);
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::handle_update_features(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update image features: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ update_features();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::get_image_meta() {
+ dout(15) << dendl;
+
+ auto ctx = create_context_callback<
+ ApplyImageStateRequest<I>,
+ &ApplyImageStateRequest<I>::handle_get_image_meta>(this);
+ auto req = librbd::image::GetMetadataRequest<I>::create(
+ m_local_image_ctx->md_ctx, m_local_image_ctx->header_oid, true, "", "", 0U,
+ &m_metadata, ctx);
+ req->send();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::handle_get_image_meta(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to fetch local image metadata: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ update_image_meta();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::update_image_meta() {
+ std::set<std::string> keys_to_remove;
+ for (const auto& [key, value] : m_metadata) {
+ if (m_image_state.metadata.count(key) == 0) {
+ dout(15) << "removing image-meta key '" << key << "'" << dendl;
+ keys_to_remove.insert(key);
+ }
+ }
+
+ std::map<std::string, bufferlist> metadata_to_update;
+ for (const auto& [key, value] : m_image_state.metadata) {
+ auto it = m_metadata.find(key);
+ if (it == m_metadata.end() || !it->second.contents_equal(value)) {
+ dout(15) << "updating image-meta key '" << key << "'" << dendl;
+ metadata_to_update.insert({key, value});
+ }
+ }
+
+ if (keys_to_remove.empty() && metadata_to_update.empty()) {
+ unprotect_snapshot();
+ return;
+ }
+
+ dout(15) << dendl;
+
+ librados::ObjectWriteOperation op;
+ for (const auto& key : keys_to_remove) {
+ librbd::cls_client::metadata_remove(&op, key);
+ }
+ if (!metadata_to_update.empty()) {
+ librbd::cls_client::metadata_set(&op, metadata_to_update);
+ }
+
+ auto aio_comp = create_rados_callback<
+ ApplyImageStateRequest<I>,
+ &ApplyImageStateRequest<I>::handle_update_image_meta>(this);
+ int r = m_local_image_ctx->md_ctx.aio_operate(m_local_image_ctx->header_oid, aio_comp,
+ &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::handle_update_image_meta(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update image metadata: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_metadata.clear();
+
+ m_prev_snap_id = CEPH_NOSNAP;
+ unprotect_snapshot();
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::unprotect_snapshot() {
+ std::shared_lock image_locker{m_local_image_ctx->image_lock};
+
+ auto snap_it = m_local_image_ctx->snap_info.begin();
+ if (m_prev_snap_id != CEPH_NOSNAP) {
+ snap_it = m_local_image_ctx->snap_info.upper_bound(m_prev_snap_id);
+ }
+
+ for (; snap_it != m_local_image_ctx->snap_info.end(); ++snap_it) {
+ auto snap_id = snap_it->first;
+ const auto& snap_info = snap_it->second;
+
+ auto user_ns = boost::get<cls::rbd::UserSnapshotNamespace>(
+ &snap_info.snap_namespace);
+ if (user_ns == nullptr) {
+ dout(20) << "snapshot " << snap_id << " is not a user snapshot" << dendl;
+ continue;
+ }
+
+ if (snap_info.protection_status == RBD_PROTECTION_STATUS_UNPROTECTED) {
+ dout(20) << "snapshot " << snap_id << " is already unprotected" << dendl;
+ continue;
+ }
+
+ auto snap_id_map_it = m_local_to_remote_snap_ids.find(snap_id);
+ if (snap_id_map_it == m_local_to_remote_snap_ids.end()) {
+ dout(15) << "snapshot " << snap_id << " does not exist in remote image"
+ << dendl;
+ break;
+ }
+
+ auto remote_snap_id = snap_id_map_it->second;
+ auto snap_state_it = m_image_state.snapshots.find(remote_snap_id);
+ if (snap_state_it == m_image_state.snapshots.end()) {
+ dout(15) << "snapshot " << snap_id << " does not exist in remote image "
+ << "state" << dendl;
+ break;
+ }
+
+ const auto& snap_state = snap_state_it->second;
+ if (snap_state.protection_status == RBD_PROTECTION_STATUS_UNPROTECTED) {
+ dout(15) << "snapshot " << snap_id << " is unprotected in remote image"
+ << dendl;
+ break;
+ }
+ }
+
+ if (snap_it == m_local_image_ctx->snap_info.end()) {
+ image_locker.unlock();
+
+ // no local snapshots to unprotect
+ m_prev_snap_id = CEPH_NOSNAP;
+ remove_snapshot();
+ return;
+ }
+
+ m_prev_snap_id = snap_it->first;
+ m_snap_name = snap_it->second.name;
+ image_locker.unlock();
+
+ dout(15) << "snap_name=" << m_snap_name << ", "
+ << "snap_id=" << m_prev_snap_id << dendl;
+
+ std::shared_lock owner_locker{m_local_image_ctx->owner_lock};
+ auto ctx = create_context_callback<
+ ApplyImageStateRequest<I>,
+ &ApplyImageStateRequest<I>::handle_unprotect_snapshot>(this);
+ m_local_image_ctx->operations->execute_snap_unprotect(
+ cls::rbd::UserSnapshotNamespace{}, m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void ApplyImageStateRequest<I>::handle_unprotect_snapshot(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to unprotect snapshot " << m_snap_name << ": "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ unprotect_snapshot();
+}
+
+// Scan local user snapshots (resuming after m_prev_snap_id) and remove the
+// first one that no longer has a counterpart in the remote image or in the
+// remote image-state. When the scan is exhausted, advances the state machine
+// to protect_snapshot().
+template <typename I>
+void ApplyImageStateRequest<I>::remove_snapshot() {
+  std::shared_lock image_locker{m_local_image_ctx->image_lock};
+
+  // resume the scan immediately after the snapshot handled last iteration
+  auto snap_it = m_local_image_ctx->snap_info.begin();
+  if (m_prev_snap_id != CEPH_NOSNAP) {
+    snap_it = m_local_image_ctx->snap_info.upper_bound(m_prev_snap_id);
+  }
+
+  for (; snap_it != m_local_image_ctx->snap_info.end(); ++snap_it) {
+    auto snap_id = snap_it->first;
+    const auto& snap_info = snap_it->second;
+
+    // only user snapshots are candidates for mirror-driven removal
+    auto user_ns = boost::get<cls::rbd::UserSnapshotNamespace>(
+      &snap_info.snap_namespace);
+    if (user_ns == nullptr) {
+      dout(20) << "snapshot " << snap_id << " is not a user snapshot" << dendl;
+      continue;
+    }
+
+    // break (i.e. select for removal) when the snapshot has no remote
+    // counterpart -- it was deleted on the remote side
+    auto snap_id_map_it = m_local_to_remote_snap_ids.find(snap_id);
+    if (snap_id_map_it == m_local_to_remote_snap_ids.end()) {
+      dout(15) << "snapshot " << snap_id << " does not exist in remote image"
+               << dendl;
+      break;
+    }
+
+    // likewise remove when the remote image-state has no record of it
+    auto remote_snap_id = snap_id_map_it->second;
+    auto snap_state_it = m_image_state.snapshots.find(remote_snap_id);
+    if (snap_state_it == m_image_state.snapshots.end()) {
+      dout(15) << "snapshot " << snap_id << " does not exist in remote image "
+               << "state" << dendl;
+      break;
+    }
+  }
+
+  if (snap_it == m_local_image_ctx->snap_info.end()) {
+    image_locker.unlock();
+
+    // no local snapshots to remove
+    m_prev_snap_id = CEPH_NOSNAP;
+    protect_snapshot();
+    return;
+  }
+
+  m_prev_snap_id = snap_it->first;
+  m_snap_name = snap_it->second.name;
+  image_locker.unlock();
+
+  dout(15) << "snap_name=" << m_snap_name << ", "
+           << "snap_id=" << m_prev_snap_id << dendl;
+
+  std::shared_lock owner_locker{m_local_image_ctx->owner_lock};
+  auto ctx = create_context_callback<
+    ApplyImageStateRequest<I>,
+    &ApplyImageStateRequest<I>::handle_remove_snapshot>(this);
+  m_local_image_ctx->operations->execute_snap_remove(
+    cls::rbd::UserSnapshotNamespace{}, m_snap_name.c_str(), ctx);
+}
+
+// Completion handler for execute_snap_remove(). On success, re-enters
+// remove_snapshot() to continue the scan; on error, fails the request.
+template <typename I>
+void ApplyImageStateRequest<I>::handle_remove_snapshot(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to remove snapshot " << m_snap_name << ": "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // loop back to look for further snapshots that need removal
+  remove_snapshot();
+}
+
+// Scan local user snapshots (resuming after m_prev_snap_id) and protect the
+// first one that is protected in the remote image-state but not locally.
+// When the scan is exhausted, advances the state machine to
+// rename_snapshot().
+template <typename I>
+void ApplyImageStateRequest<I>::protect_snapshot() {
+  std::shared_lock image_locker{m_local_image_ctx->image_lock};
+
+  // resume the scan immediately after the snapshot handled last iteration
+  auto snap_it = m_local_image_ctx->snap_info.begin();
+  if (m_prev_snap_id != CEPH_NOSNAP) {
+    snap_it = m_local_image_ctx->snap_info.upper_bound(m_prev_snap_id);
+  }
+
+  for (; snap_it != m_local_image_ctx->snap_info.end(); ++snap_it) {
+    auto snap_id = snap_it->first;
+    const auto& snap_info = snap_it->second;
+
+    // only user snapshots carry mirrored protection state
+    auto user_ns = boost::get<cls::rbd::UserSnapshotNamespace>(
+      &snap_info.snap_namespace);
+    if (user_ns == nullptr) {
+      dout(20) << "snapshot " << snap_id << " is not a user snapshot" << dendl;
+      continue;
+    }
+
+    if (snap_info.protection_status == RBD_PROTECTION_STATUS_PROTECTED) {
+      dout(20) << "snapshot " << snap_id << " is already protected" << dendl;
+      continue;
+    }
+
+    // unlike remove_snapshot(), missing remote counterparts are skipped
+    // here -- removal is handled by the preceding state
+    auto snap_id_map_it = m_local_to_remote_snap_ids.find(snap_id);
+    if (snap_id_map_it == m_local_to_remote_snap_ids.end()) {
+      dout(15) << "snapshot " << snap_id << " does not exist in remote image"
+               << dendl;
+      continue;
+    }
+
+    auto remote_snap_id = snap_id_map_it->second;
+    auto snap_state_it = m_image_state.snapshots.find(remote_snap_id);
+    if (snap_state_it == m_image_state.snapshots.end()) {
+      dout(15) << "snapshot " << snap_id << " does not exist in remote image "
+               << "state" << dendl;
+      continue;
+    }
+
+    // break (i.e. select for protection) when the remote side is protected
+    const auto& snap_state = snap_state_it->second;
+    if (snap_state.protection_status == RBD_PROTECTION_STATUS_PROTECTED) {
+      dout(15) << "snapshot " << snap_id << " is protected in remote image"
+               << dendl;
+      break;
+    }
+  }
+
+  if (snap_it == m_local_image_ctx->snap_info.end()) {
+    image_locker.unlock();
+
+    // no local snapshots to protect
+    m_prev_snap_id = CEPH_NOSNAP;
+    rename_snapshot();
+    return;
+  }
+
+  m_prev_snap_id = snap_it->first;
+  m_snap_name = snap_it->second.name;
+  image_locker.unlock();
+
+  dout(15) << "snap_name=" << m_snap_name << ", "
+           << "snap_id=" << m_prev_snap_id << dendl;
+
+  std::shared_lock owner_locker{m_local_image_ctx->owner_lock};
+  auto ctx = create_context_callback<
+    ApplyImageStateRequest<I>,
+    &ApplyImageStateRequest<I>::handle_protect_snapshot>(this);
+  m_local_image_ctx->operations->execute_snap_protect(
+    cls::rbd::UserSnapshotNamespace{}, m_snap_name.c_str(), ctx);
+}
+
+// Completion handler for execute_snap_protect(). On success, re-enters
+// protect_snapshot() to continue the scan; on error, fails the request.
+template <typename I>
+void ApplyImageStateRequest<I>::handle_protect_snapshot(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to protect snapshot " << m_snap_name << ": "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // loop back to look for further snapshots that need protecting
+  protect_snapshot();
+}
+
+// Scan local user snapshots (resuming after m_prev_snap_id) and rename the
+// first one whose local name differs from the name recorded in the remote
+// image-state. When the scan is exhausted, advances the state machine to
+// set_snapshot_limit().
+template <typename I>
+void ApplyImageStateRequest<I>::rename_snapshot() {
+  std::shared_lock image_locker{m_local_image_ctx->image_lock};
+
+  // resume the scan immediately after the snapshot handled last iteration
+  auto snap_it = m_local_image_ctx->snap_info.begin();
+  if (m_prev_snap_id != CEPH_NOSNAP) {
+    snap_it = m_local_image_ctx->snap_info.upper_bound(m_prev_snap_id);
+  }
+
+  for (; snap_it != m_local_image_ctx->snap_info.end(); ++snap_it) {
+    auto snap_id = snap_it->first;
+    const auto& snap_info = snap_it->second;
+
+    // only user snapshots are subject to mirrored renames
+    auto user_ns = boost::get<cls::rbd::UserSnapshotNamespace>(
+      &snap_info.snap_namespace);
+    if (user_ns == nullptr) {
+      dout(20) << "snapshot " << snap_id << " is not a user snapshot" << dendl;
+      continue;
+    }
+
+    auto snap_id_map_it = m_local_to_remote_snap_ids.find(snap_id);
+    if (snap_id_map_it == m_local_to_remote_snap_ids.end()) {
+      dout(15) << "snapshot " << snap_id << " does not exist in remote image"
+               << dendl;
+      continue;
+    }
+
+    auto remote_snap_id = snap_id_map_it->second;
+    auto snap_state_it = m_image_state.snapshots.find(remote_snap_id);
+    if (snap_state_it == m_image_state.snapshots.end()) {
+      dout(15) << "snapshot " << snap_id << " does not exist in remote image "
+               << "state" << dendl;
+      continue;
+    }
+
+    // break (i.e. select for rename) on a name mismatch; the remote name
+    // becomes the rename target
+    const auto& snap_state = snap_state_it->second;
+    if (snap_info.name != snap_state.name) {
+      dout(15) << "snapshot " << snap_id << " has been renamed from '"
+               << snap_info.name << "' to '" << snap_state.name << "'"
+               << dendl;
+      m_snap_name = snap_state.name;
+      break;
+    }
+  }
+
+  if (snap_it == m_local_image_ctx->snap_info.end()) {
+    image_locker.unlock();
+
+    // no local snapshots to rename
+    m_prev_snap_id = CEPH_NOSNAP;
+    set_snapshot_limit();
+    return;
+  }
+
+  m_prev_snap_id = snap_it->first;
+  image_locker.unlock();
+
+  dout(15) << "snap_name=" << m_snap_name << ", "
+           << "snap_id=" << m_prev_snap_id << dendl;
+
+  std::shared_lock owner_locker{m_local_image_ctx->owner_lock};
+  auto ctx = create_context_callback<
+    ApplyImageStateRequest<I>,
+    &ApplyImageStateRequest<I>::handle_rename_snapshot>(this);
+  m_local_image_ctx->operations->execute_snap_rename(
+    m_prev_snap_id, m_snap_name.c_str(), ctx);
+}
+
+// Completion handler for execute_snap_rename(). On success, re-enters
+// rename_snapshot() to continue the scan; on error, fails the request.
+template <typename I>
+void ApplyImageStateRequest<I>::handle_rename_snapshot(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    // fixed copy/paste from the protect handler: this is the rename path
+    derr << "failed to rename snapshot " << m_snap_name << ": "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // loop back to look for further snapshots that need renaming
+  rename_snapshot();
+}
+
+// Apply the remote image-state's snapshot limit to the local image. This is
+// the final step of the state machine before <finish>.
+template <typename I>
+void ApplyImageStateRequest<I>::set_snapshot_limit() {
+  dout(15) << "snap_limit=" << m_image_state.snap_limit << dendl;
+
+  // no need to even check the current limit -- just set it
+  std::shared_lock owner_locker{m_local_image_ctx->owner_lock};
+  auto ctx = create_context_callback<
+    ApplyImageStateRequest<I>,
+    &ApplyImageStateRequest<I>::handle_set_snapshot_limit>(this);
+  m_local_image_ctx->operations->execute_snap_set_limit(
+    m_image_state.snap_limit, ctx);
+}
+
+// Completion handler for execute_snap_set_limit(); completes the request
+// with the operation's result (success or failure) either way.
+template <typename I>
+void ApplyImageStateRequest<I>::handle_set_snapshot_limit(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to update snapshot limit: " << cpp_strerror(r)
+         << dendl;
+  }
+
+  finish(r);
+}
+
+// Complete the user callback with the final result and destroy this
+// request object (it is heap-allocated via create()).
+template <typename I>
+void ApplyImageStateRequest<I>::finish(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  m_on_finish->complete(r);
+  delete this;
+}
+
+// Map a local snapshot id to the corresponding remote snapshot id, first via
+// the local non-primary mirror snapshots and, failing that, via the remote
+// image's non-primary mirror snapshots that point back at this image.
+// Returns CEPH_NOSNAP when no mapping exists. Both image locks must be held.
+template <typename I>
+uint64_t ApplyImageStateRequest<I>::compute_remote_snap_id(
+    uint64_t local_snap_id) {
+  ceph_assert(ceph_mutex_is_locked(m_local_image_ctx->image_lock));
+  ceph_assert(ceph_mutex_is_locked(m_remote_image_ctx->image_lock));
+
+  // Search our local non-primary snapshots for a mapping to the remote
+  // snapshot. The non-primary mirror snapshot with the mappings will always
+  // come at or after the snapshot we are searching against
+  auto remote_snap_id = util::compute_remote_snap_id(
+    m_local_image_ctx->image_lock, m_local_image_ctx->snap_info,
+    local_snap_id, m_remote_mirror_uuid);
+  if (remote_snap_id != CEPH_NOSNAP) {
+    return remote_snap_id;
+  }
+
+  // if we failed to find a match to a remote snapshot in our local non-primary
+  // snapshots, check the remote image for non-primary snapshot mappings back
+  // to our snapshot
+  for (auto snap_it = m_remote_image_ctx->snap_info.begin();
+       snap_it != m_remote_image_ctx->snap_info.end(); ++snap_it) {
+    auto snap_id = snap_it->first;
+    auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+      &snap_it->second.snap_namespace);
+    if (mirror_ns == nullptr || !mirror_ns->is_non_primary()) {
+      continue;
+    }
+
+    if (mirror_ns->primary_mirror_uuid != m_local_mirror_uuid) {
+      dout(20) << "remote snapshot " << snap_id << " not tied to local"
+               << dendl;
+      continue;
+    } else if (mirror_ns->primary_snap_id == local_snap_id) {
+      // direct mapping: the remote mirror snapshot was created from ours
+      dout(15) << "local snapshot " << local_snap_id << " maps to "
+               << "remote snapshot " << snap_id << dendl;
+      return snap_id;
+    }
+
+    // fall back to the local-id -> remote-id sequence table embedded in the
+    // remote mirror snapshot namespace
+    const auto& snap_seqs = mirror_ns->snap_seqs;
+    for (auto [local_snap_id_seq, remote_snap_id_seq] : snap_seqs) {
+      if (local_snap_id_seq == local_snap_id) {
+        dout(15) << "local snapshot " << local_snap_id << " maps to "
+                 << "remote snapshot " << remote_snap_id_seq << dendl;
+        return remote_snap_id_seq;
+      }
+    }
+  }
+
+  return CEPH_NOSNAP;
+}
+
+// Populate m_local_to_remote_snap_ids for every local snapshot (values may
+// be CEPH_NOSNAP when no remote counterpart exists). Caller must hold the
+// local image lock; the remote image lock is taken here.
+template <typename I>
+void ApplyImageStateRequest<I>::compute_local_to_remote_snap_ids() {
+  ceph_assert(ceph_mutex_is_locked(m_local_image_ctx->image_lock));
+  std::shared_lock remote_image_locker{m_remote_image_ctx->image_lock};
+
+  for (const auto& [snap_id, snap_info] : m_local_image_ctx->snap_info) {
+    m_local_to_remote_snap_ids[snap_id] = compute_remote_snap_id(snap_id);
+  }
+
+  dout(15) << "local_to_remote_snap_ids=" << m_local_to_remote_snap_ids
+           << dendl;
+}
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::snapshot::ApplyImageStateRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.h b/src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.h
new file mode 100644
index 000000000..0e2d09ddf
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_APPLY_IMAGE_STATE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_APPLY_IMAGE_STATE_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "librbd/mirror/snapshot/Types.h"
+#include <map>
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+template <typename> class EventPreprocessor;
+template <typename> class ReplayStatusFormatter;
+template <typename> class StateBuilder;
+
+// Async state machine that reconciles a local mirrored image with the
+// image-state recorded by the remote (primary) image: renames the image,
+// updates features and image-meta, then unprotects/removes/protects/renames
+// user snapshots and applies the snapshot limit.
+template <typename ImageCtxT>
+class ApplyImageStateRequest {
+public:
+  // Allocate a request on the heap; it deletes itself in finish().
+  static ApplyImageStateRequest* create(
+      const std::string& local_mirror_uuid,
+      const std::string& remote_mirror_uuid,
+      ImageCtxT* local_image_ctx,
+      ImageCtxT* remote_image_ctx,
+      librbd::mirror::snapshot::ImageState image_state,
+      Context* on_finish) {
+    return new ApplyImageStateRequest(local_mirror_uuid, remote_mirror_uuid,
+                                      local_image_ctx, remote_image_ctx,
+                                      image_state, on_finish);
+  }
+
+  ApplyImageStateRequest(
+      const std::string& local_mirror_uuid,
+      const std::string& remote_mirror_uuid,
+      ImageCtxT* local_image_ctx,
+      ImageCtxT* remote_image_ctx,
+      librbd::mirror::snapshot::ImageState image_state,
+      Context* on_finish);
+
+  // Start the state machine; on_finish->complete(r) is invoked when done.
+  void send();
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * RENAME_IMAGE
+   *    |
+   *    |     /---------\
+   *    |     |         |
+   *    v     v         |
+   * UPDATE_FEATURES ---/
+   *    |
+   *    v
+   * GET_IMAGE_META
+   *    |
+   *    |     /---------\
+   *    |     |         |
+   *    v     v         |
+   * UPDATE_IMAGE_META -/
+   *    |
+   *    |     /---------\
+   *    |     |         |
+   *    v     v         |
+   * UNPROTECT_SNAPSHOT |
+   *    |               |
+   *    v               |
+   * REMOVE_SNAPSHOT    |
+   *    |               |
+   *    v               |
+   * PROTECT_SNAPSHOT   |
+   *    |               |
+   *    v               |
+   * RENAME_SNAPSHOT ---/
+   *    |
+   *    v
+   * SET_SNAPSHOT_LIMIT
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  std::string m_local_mirror_uuid;
+  std::string m_remote_mirror_uuid;
+  ImageCtxT* m_local_image_ctx;
+  ImageCtxT* m_remote_image_ctx;
+  // desired state captured from the remote (primary) image
+  librbd::mirror::snapshot::ImageState m_image_state;
+  Context* m_on_finish;
+
+  // local snap id -> remote snap id (CEPH_NOSNAP when unmapped)
+  std::map<uint64_t, uint64_t> m_local_to_remote_snap_ids;
+
+  uint64_t m_features = 0;
+
+  // current local image-meta key/value pairs (GET_IMAGE_META result)
+  std::map<std::string, bufferlist> m_metadata;
+
+  // resume cursor for the snapshot scan loops; CEPH_NOSNAP restarts a scan
+  uint64_t m_prev_snap_id = 0;
+  std::string m_snap_name;
+
+  void rename_image();
+  void handle_rename_image(int r);
+
+  void update_features();
+  void handle_update_features(int r);
+
+  void get_image_meta();
+  void handle_get_image_meta(int r);
+
+  void update_image_meta();
+  void handle_update_image_meta(int r);
+
+  void unprotect_snapshot();
+  void handle_unprotect_snapshot(int r);
+
+  void remove_snapshot();
+  void handle_remove_snapshot(int r);
+
+  void protect_snapshot();
+  void handle_protect_snapshot(int r);
+
+  void rename_snapshot();
+  void handle_rename_snapshot(int r);
+
+  void set_snapshot_limit();
+  void handle_set_snapshot_limit(int r);
+
+  void finish(int r);
+
+  uint64_t compute_remote_snap_id(uint64_t snap_id);
+  void compute_local_to_remote_snap_ids();
+};
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::snapshot::ApplyImageStateRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_APPLY_IMAGE_STATE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.cc
new file mode 100644
index 000000000..c923395c9
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.cc
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CreateLocalImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/ProgressContext.h"
+#include "tools/rbd_mirror/image_replayer/CreateImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::snapshot::" \
+ << "CreateLocalImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+// Entry point: start by clearing any stale mirror-image record before
+// (re-)registering and creating the local image.
+template <typename I>
+void CreateLocalImageRequest<I>::send() {
+  disable_mirror_image();
+}
+
+// If a local image id already exists, flip its mirror record to 'disabling'
+// so the stale record can be removed; otherwise skip straight to
+// add_mirror_image().
+template <typename I>
+void CreateLocalImageRequest<I>::disable_mirror_image() {
+  if (m_state_builder->local_image_id.empty()) {
+    add_mirror_image();
+    return;
+  }
+
+  dout(10) << dendl;
+  update_progress("DISABLE_MIRROR_IMAGE");
+
+  // need to send 'disabling' since the cls methods will fail if we aren't
+  // in that state
+  cls::rbd::MirrorImage mirror_image{
+    cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT, m_global_image_id,
+    cls::rbd::MIRROR_IMAGE_STATE_DISABLING};
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::mirror_image_set(&op, m_state_builder->local_image_id,
+                                       mirror_image);
+
+  auto aio_comp = create_rados_callback<
+    CreateLocalImageRequest<I>,
+    &CreateLocalImageRequest<I>::handle_disable_mirror_image>(this);
+  int r = m_local_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for the 'disabling' mirror-image record update;
+// proceeds to removing the stale record.
+template <typename I>
+void CreateLocalImageRequest<I>::handle_disable_mirror_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to disable mirror image " << m_global_image_id << ": "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  remove_mirror_image();
+}
+
+// Remove the stale mirror-image record for the old local image id from the
+// RBD_MIRRORING object.
+template <typename I>
+void CreateLocalImageRequest<I>::remove_mirror_image() {
+  dout(10) << dendl;
+  update_progress("REMOVE_MIRROR_IMAGE");
+
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::mirror_image_remove(&op, m_state_builder->local_image_id);
+
+  auto aio_comp = create_rados_callback<
+    CreateLocalImageRequest<I>,
+    &CreateLocalImageRequest<I>::handle_remove_mirror_image>(this);
+  int r = m_local_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for the stale record removal; clears the cached local
+// image id so add_mirror_image() can generate a fresh one.
+template <typename I>
+void CreateLocalImageRequest<I>::handle_remove_mirror_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to remove mirror image " << m_global_image_id << ": "
+         << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  m_state_builder->local_image_id = "";
+  add_mirror_image();
+}
+
+// Generate a new local image id and register a 'creating' mirror-image
+// record for it so a crash mid-create leaves a detectable partial state.
+template <typename I>
+void CreateLocalImageRequest<I>::add_mirror_image() {
+  ceph_assert(m_state_builder->local_image_id.empty());
+  m_state_builder->local_image_id =
+    librbd::util::generate_image_id<I>(m_local_io_ctx);
+
+  dout(10) << "local_image_id=" << m_state_builder->local_image_id << dendl;
+  update_progress("ADD_MIRROR_IMAGE");
+
+  // use 'creating' to track a partially constructed image. it will
+  // be switched to 'enabled' once the image is fully created
+  cls::rbd::MirrorImage mirror_image{
+    cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT, m_global_image_id,
+    cls::rbd::MIRROR_IMAGE_STATE_CREATING};
+  librados::ObjectWriteOperation op;
+  librbd::cls_client::mirror_image_set(&op, m_state_builder->local_image_id,
+                                       mirror_image);
+
+  auto aio_comp = create_rados_callback<
+    CreateLocalImageRequest<I>,
+    &CreateLocalImageRequest<I>::handle_add_mirror_image>(this);
+  int r = m_local_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+  ceph_assert(r == 0);
+  aio_comp->release();
+}
+
+// Completion handler for registering the 'creating' mirror-image record;
+// proceeds to creating the local image itself.
+template <typename I>
+void CreateLocalImageRequest<I>::handle_add_mirror_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r < 0) {
+    derr << "failed to register mirror image " << m_global_image_id << ": "
+         << cpp_strerror(r) << dendl;
+    // unqualified call for consistency with the other completion handlers
+    finish(r);
+    return;
+  }
+
+  create_local_image();
+}
+
+// Create the local image, cloning the remote image's name and parentage via
+// CreateImageRequest.
+template <typename I>
+void CreateLocalImageRequest<I>::create_local_image() {
+  dout(10) << "local_image_id=" << m_state_builder->local_image_id << dendl;
+  update_progress("CREATE_LOCAL_IMAGE");
+
+  // copy the remote image name under its image lock
+  m_remote_image_ctx->image_lock.lock_shared();
+  std::string image_name = m_remote_image_ctx->name;
+  m_remote_image_ctx->image_lock.unlock_shared();
+
+  auto ctx = create_context_callback<
+    CreateLocalImageRequest<I>,
+    &CreateLocalImageRequest<I>::handle_create_local_image>(this);
+  auto request = CreateImageRequest<I>::create(
+    m_threads, m_local_io_ctx, m_global_image_id,
+    m_state_builder->remote_mirror_uuid, image_name,
+    m_state_builder->local_image_id, m_remote_image_ctx,
+    m_pool_meta_cache, cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT, ctx);
+  request->send();
+}
+// Completion handler for the image creation. -EBADF means the generated
+// image id collided with an existing image, so restart the whole sequence
+// from disable_mirror_image() to pick a new id.
+template <typename I>
+void CreateLocalImageRequest<I>::handle_create_local_image(int r) {
+  dout(10) << "r=" << r << dendl;
+
+  if (r == -EBADF) {
+    dout(5) << "image id " << m_state_builder->local_image_id << " "
+            << "already in-use" << dendl;
+    disable_mirror_image();
+    return;
+  } else if (r < 0) {
+    if (r == -ENOENT) {
+      dout(10) << "parent image does not exist" << dendl;
+    } else {
+      derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+    }
+    finish(r);
+    return;
+  }
+
+  finish(0);
+}
+
+// Report a progress milestone to the optional ProgressContext (no-op when
+// no progress context was supplied).
+template <typename I>
+void CreateLocalImageRequest<I>::update_progress(
+    const std::string& description) {
+  dout(15) << description << dendl;
+  if (m_progress_ctx != nullptr) {
+    m_progress_ctx->update_progress(description);
+  }
+}
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::snapshot::CreateLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.h
new file mode 100644
index 000000000..3345154b4
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_CREATE_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_CREATE_LOCAL_IMAGE_REQUEST_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include <string>
+
+struct Context;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+class PoolMetaCache;
+class ProgressContext;
+template <typename> struct Threads;
+
+namespace image_replayer {
+namespace snapshot {
+
+template <typename> class StateBuilder;
+
+// Async request that (re-)creates the local copy of a snapshot-mirrored
+// image: clears any stale mirror-image record, registers a 'creating'
+// record under a freshly generated image id, then creates the image,
+// retrying with a new id on collision (-EBADF).
+template <typename ImageCtxT>
+class CreateLocalImageRequest : public BaseRequest {
+public:
+  typedef rbd::mirror::ProgressContext ProgressContext;
+
+  static CreateLocalImageRequest* create(
+      Threads<ImageCtxT>* threads,
+      librados::IoCtx& local_io_ctx,
+      ImageCtxT* remote_image_ctx,
+      const std::string& global_image_id,
+      PoolMetaCache* pool_meta_cache,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>* state_builder,
+      Context* on_finish) {
+    return new CreateLocalImageRequest(threads, local_io_ctx, remote_image_ctx,
+                                       global_image_id, pool_meta_cache,
+                                       progress_ctx, state_builder, on_finish);
+  }
+
+  CreateLocalImageRequest(
+      Threads<ImageCtxT>* threads,
+      librados::IoCtx& local_io_ctx,
+      ImageCtxT* remote_image_ctx,
+      const std::string& global_image_id,
+      PoolMetaCache* pool_meta_cache,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>* state_builder,
+      Context* on_finish)
+    : BaseRequest(on_finish),
+      m_threads(threads),
+      m_local_io_ctx(local_io_ctx),
+      m_remote_image_ctx(remote_image_ctx),
+      m_global_image_id(global_image_id),
+      m_pool_meta_cache(pool_meta_cache),
+      m_progress_ctx(progress_ctx),
+      m_state_builder(state_builder) {
+  }
+
+  // added 'override' for consistency with the sibling BaseRequest
+  // subclasses (e.g. PrepareReplayRequest::send())
+  void send() override;
+
+private:
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * DISABLE_MIRROR_IMAGE < * * * * * *
+   *    |                             *
+   *    v                             *
+   * REMOVE_MIRROR_IMAGE              *
+   *    |                             *
+   *    v                             *
+   * ADD_MIRROR_IMAGE                 *
+   *    |                             *
+   *    v          (id exists)        *
+   * CREATE_LOCAL_IMAGE * * * * * * * *
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  Threads<ImageCtxT>* m_threads;
+  librados::IoCtx& m_local_io_ctx;
+  ImageCtxT* m_remote_image_ctx;
+  std::string m_global_image_id;
+  PoolMetaCache* m_pool_meta_cache;
+  ProgressContext* m_progress_ctx;
+  StateBuilder<ImageCtxT>* m_state_builder;
+
+  void disable_mirror_image();
+  void handle_disable_mirror_image(int r);
+
+  void remove_mirror_image();
+  void handle_remove_mirror_image(int r);
+
+  void add_mirror_image();
+  void handle_add_mirror_image(int r);
+
+  void create_local_image();
+  void handle_create_local_image(int r);
+
+  void update_progress(const std::string& description);
+
+};
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::snapshot::CreateLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_CREATE_LOCAL_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.cc b/src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.cc
new file mode 100644
index 000000000..575eb8534
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrepareReplayRequest.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/mirror/snapshot/ImageMeta.h"
+#include "tools/rbd_mirror/ProgressContext.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::snapshot::" \
+ << "PrepareReplayRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+using librbd::util::create_context_callback;
+
+// Entry point: reset the output flags, then load the local image-meta to
+// determine whether a resync was requested.
+// NOTE(review): *m_syncing is never set to true anywhere in this request;
+// the header carries a TODO for the state machine -- confirm intended.
+template <typename I>
+void PrepareReplayRequest<I>::send() {
+  *m_resync_requested = false;
+  *m_syncing = false;
+
+  load_local_image_meta();
+}
+
+// Create the local ImageMeta helper on the state builder and load it
+// asynchronously.
+template <typename I>
+void PrepareReplayRequest<I>::load_local_image_meta() {
+  dout(15) << dendl;
+
+  ceph_assert(m_state_builder->local_image_meta == nullptr);
+  m_state_builder->local_image_meta =
+    librbd::mirror::snapshot::ImageMeta<I>::create(
+      m_state_builder->local_image_ctx, m_local_mirror_uuid);
+
+  auto ctx = create_context_callback<
+    PrepareReplayRequest<I>,
+    &PrepareReplayRequest<I>::handle_load_local_image_meta>(this);
+  m_state_builder->local_image_meta->load(ctx);
+}
+
+// Completion handler for the image-meta load; publishes the resync flag.
+template <typename I>
+void PrepareReplayRequest<I>::handle_load_local_image_meta(int r) {
+  dout(15) << "r=" << r << dendl;
+
+  if (r < 0 && r != -ENOENT) {
+    derr << "failed to load local image-meta: " << cpp_strerror(r) << dendl;
+    finish(r);
+    return;
+  }
+
+  // -ENOENT falls through: resync_requested is still read from the (unloaded)
+  // ImageMeta object -- presumably it defaults to false; verify in ImageMeta
+  *m_resync_requested = m_state_builder->local_image_meta->resync_requested;
+  finish(0);
+}
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::snapshot::PrepareReplayRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.h b/src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.h
new file mode 100644
index 000000000..4e9246acd
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// include guard renamed from ..._JOURNAL_... (copy/paste from the journal
+// replayer): the old name collides with the journal PrepareReplayRequest.h
+// and would suppress one of the two headers when both are included
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_PREPARE_REPLAY_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_PREPARE_REPLAY_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/mirror/Types.h"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include <list>
+#include <string>
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext;
+
+namespace image_replayer {
+namespace snapshot {
+
+template <typename> class StateBuilder;
+
+// Async request run before snapshot-based replay starts: loads the local
+// mirror image-meta and reports whether a resync was requested.
+template <typename ImageCtxT>
+class PrepareReplayRequest : public BaseRequest {
+public:
+  static PrepareReplayRequest* create(
+      const std::string& local_mirror_uuid,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>* state_builder,
+      bool* resync_requested,
+      bool* syncing,
+      Context* on_finish) {
+    return new PrepareReplayRequest(
+      local_mirror_uuid, progress_ctx, state_builder, resync_requested,
+      syncing, on_finish);
+  }
+
+  PrepareReplayRequest(
+      const std::string& local_mirror_uuid,
+      ProgressContext* progress_ctx,
+      StateBuilder<ImageCtxT>* state_builder,
+      bool* resync_requested,
+      bool* syncing,
+      Context* on_finish)
+    : BaseRequest(on_finish),
+      m_local_mirror_uuid(local_mirror_uuid),
+      m_progress_ctx(progress_ctx),
+      m_state_builder(state_builder),
+      m_resync_requested(resync_requested),
+      m_syncing(syncing) {
+  }
+
+  void send() override;
+
+private:
+  // TODO
+  /**
+   * @verbatim
+   *
+   * <start>
+   *    |
+   *    v
+   * LOAD_LOCAL_IMAGE_META
+   *    |
+   *    v
+   * <finish>
+   *
+   * @endverbatim
+   */
+
+  std::string m_local_mirror_uuid;
+  ProgressContext* m_progress_ctx;
+  StateBuilder<ImageCtxT>* m_state_builder;
+  // out-param: true when the local image-meta requests a resync
+  bool* m_resync_requested;
+  // out-param: currently always set false by send() (see TODO above)
+  bool* m_syncing;
+
+  void load_local_image_meta();
+  void handle_load_local_image_meta(int r);
+
+};
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::snapshot::PrepareReplayRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_PREPARE_REPLAY_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc
new file mode 100644
index 000000000..4a44a57bc
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc
@@ -0,0 +1,1586 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Replayer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "common/Timer.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "json_spirit/json_spirit.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/asio/ContextWQ.h"
+#include "librbd/deep_copy/Handler.h"
+#include "librbd/deep_copy/ImageCopyRequest.h"
+#include "librbd/deep_copy/SnapshotCopyRequest.h"
+#include "librbd/mirror/ImageStateUpdateRequest.h"
+#include "librbd/mirror/snapshot/CreateNonPrimaryRequest.h"
+#include "librbd/mirror/snapshot/GetImageStateRequest.h"
+#include "librbd/mirror/snapshot/ImageMeta.h"
+#include "librbd/mirror/snapshot/UnlinkPeerRequest.h"
+#include "tools/rbd_mirror/InstanceWatcher.h"
+#include "tools/rbd_mirror/PoolMetaCache.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/ReplayerListener.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/Utils.h"
+#include <set>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::snapshot::" \
+ << "Replayer: " << this << " " << __func__ << ": "
+
+extern PerfCounters *g_snapshot_perf_counters;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+namespace {
+
+double round_to_two_places(double value) {
+ return abs(round(value * 100) / 100);
+}
+
+template<typename I>
+std::pair<uint64_t, librbd::SnapInfo*> get_newest_mirror_snapshot(
+ I* image_ctx) {
+ for (auto snap_info_it = image_ctx->snap_info.rbegin();
+ snap_info_it != image_ctx->snap_info.rend(); ++snap_info_it) {
+ const auto& snap_ns = snap_info_it->second.snap_namespace;
+ auto mirror_ns = boost::get<
+ cls::rbd::MirrorSnapshotNamespace>(&snap_ns);
+ if (mirror_ns == nullptr || !mirror_ns->complete) {
+ continue;
+ }
+
+ return {snap_info_it->first, &snap_info_it->second};
+ }
+
+ return {CEPH_NOSNAP, nullptr};
+}
+
+} // anonymous namespace
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+struct Replayer<I>::C_UpdateWatchCtx : public librbd::UpdateWatchCtx {
+ Replayer<I>* replayer;
+
+ C_UpdateWatchCtx(Replayer<I>* replayer) : replayer(replayer) {
+ }
+
+ void handle_notify() override {
+ replayer->handle_image_update_notify();
+ }
+};
+
+template <typename I>
+struct Replayer<I>::DeepCopyHandler : public librbd::deep_copy::Handler {
+ Replayer *replayer;
+
+ DeepCopyHandler(Replayer* replayer) : replayer(replayer) {
+ }
+
+ void handle_read(uint64_t bytes_read) override {
+ replayer->handle_copy_image_read(bytes_read);
+ }
+
+ int update_progress(uint64_t object_number, uint64_t object_count) override {
+ replayer->handle_copy_image_progress(object_number, object_count);
+ return 0;
+ }
+};
+
+template <typename I>
+Replayer<I>::Replayer(
+ Threads<I>* threads,
+ InstanceWatcher<I>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ StateBuilder<I>* state_builder,
+ ReplayerListener* replayer_listener)
+ : m_threads(threads),
+ m_instance_watcher(instance_watcher),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_pool_meta_cache(pool_meta_cache),
+ m_state_builder(state_builder),
+ m_replayer_listener(replayer_listener),
+ m_lock(ceph::make_mutex(librbd::util::unique_lock_name(
+ "rbd::mirror::image_replayer::snapshot::Replayer", this))) {
+ dout(10) << dendl;
+}
+
+template <typename I>
+Replayer<I>::~Replayer() {
+ dout(10) << dendl;
+
+ {
+ std::unique_lock locker{m_lock};
+ unregister_perf_counters();
+ }
+
+ ceph_assert(m_state == STATE_COMPLETE);
+ ceph_assert(m_update_watch_ctx == nullptr);
+ ceph_assert(m_deep_copy_handler == nullptr);
+}
+
+template <typename I>
+void Replayer<I>::init(Context* on_finish) {
+ dout(10) << dendl;
+
+ ceph_assert(m_state == STATE_INIT);
+
+ RemotePoolMeta remote_pool_meta;
+ int r = m_pool_meta_cache->get_remote_pool_meta(
+ m_state_builder->remote_image_ctx->md_ctx.get_id(), &remote_pool_meta);
+ if (r < 0 || remote_pool_meta.mirror_peer_uuid.empty()) {
+ derr << "failed to retrieve mirror peer uuid from remote pool" << dendl;
+ m_state = STATE_COMPLETE;
+ m_threads->work_queue->queue(on_finish, r);
+ return;
+ }
+
+ m_remote_mirror_peer_uuid = remote_pool_meta.mirror_peer_uuid;
+ dout(10) << "remote_mirror_peer_uuid=" << m_remote_mirror_peer_uuid << dendl;
+
+ {
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+ std::shared_lock image_locker{local_image_ctx->image_lock};
+ m_image_spec = image_replayer::util::compute_image_spec(
+ local_image_ctx->md_ctx, local_image_ctx->name);
+ }
+
+ {
+ std::unique_lock locker{m_lock};
+ register_perf_counters();
+ }
+
+ ceph_assert(m_on_init_shutdown == nullptr);
+ m_on_init_shutdown = on_finish;
+
+ register_local_update_watcher();
+}
+
+template <typename I>
+void Replayer<I>::shut_down(Context* on_finish) {
+ dout(10) << dendl;
+
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_on_init_shutdown == nullptr);
+ m_on_init_shutdown = on_finish;
+ m_error_code = 0;
+ m_error_description = "";
+
+ ceph_assert(m_state != STATE_INIT);
+ auto state = STATE_COMPLETE;
+ std::swap(m_state, state);
+
+ if (state == STATE_REPLAYING) {
+ // if a sync request was pending, request a cancelation
+ m_instance_watcher->cancel_sync_request(
+ m_state_builder->local_image_ctx->id);
+
+ // TODO interrupt snapshot copy and image copy state machines even if remote
+ // cluster is unreachable
+ dout(10) << "shut down pending on completion of snapshot replay" << dendl;
+ return;
+ }
+ locker.unlock();
+
+ unregister_remote_update_watcher();
+}
+
+template <typename I>
+void Replayer<I>::flush(Context* on_finish) {
+ dout(10) << dendl;
+
+ // TODO
+ m_threads->work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+bool Replayer<I>::get_replay_status(std::string* description,
+ Context* on_finish) {
+ dout(10) << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (m_state != STATE_REPLAYING && m_state != STATE_IDLE) {
+ locker.unlock();
+
+ derr << "replay not running" << dendl;
+ on_finish->complete(-EAGAIN);
+ return false;
+ }
+
+ std::shared_lock local_image_locker{
+ m_state_builder->local_image_ctx->image_lock};
+ auto [local_snap_id, local_snap_info] = get_newest_mirror_snapshot(
+ m_state_builder->local_image_ctx);
+
+ std::shared_lock remote_image_locker{
+ m_state_builder->remote_image_ctx->image_lock};
+ auto [remote_snap_id, remote_snap_info] = get_newest_mirror_snapshot(
+ m_state_builder->remote_image_ctx);
+
+ if (remote_snap_info == nullptr) {
+ remote_image_locker.unlock();
+ local_image_locker.unlock();
+ locker.unlock();
+
+ derr << "remote image does not contain mirror snapshots" << dendl;
+ on_finish->complete(-EAGAIN);
+ return false;
+ }
+
+ std::string replay_state = "idle";
+ if (m_remote_snap_id_end != CEPH_NOSNAP) {
+ replay_state = "syncing";
+ }
+
+ json_spirit::mObject root_obj;
+ root_obj["replay_state"] = replay_state;
+ root_obj["remote_snapshot_timestamp"] = remote_snap_info->timestamp.sec();
+
+ auto matching_remote_snap_id = util::compute_remote_snap_id(
+ m_state_builder->local_image_ctx->image_lock,
+ m_state_builder->local_image_ctx->snap_info,
+ local_snap_id, m_state_builder->remote_mirror_uuid);
+ auto matching_remote_snap_it =
+ m_state_builder->remote_image_ctx->snap_info.find(matching_remote_snap_id);
+ if (matching_remote_snap_id != CEPH_NOSNAP &&
+ matching_remote_snap_it !=
+ m_state_builder->remote_image_ctx->snap_info.end()) {
+ // use the timestamp from the matching remote image since
+ // the local snapshot would just be the time the snapshot was
+ // synced and not the consistency point in time.
+ root_obj["local_snapshot_timestamp"] =
+ matching_remote_snap_it->second.timestamp.sec();
+ }
+
+ matching_remote_snap_it = m_state_builder->remote_image_ctx->snap_info.find(
+ m_remote_snap_id_end);
+ if (m_remote_snap_id_end != CEPH_NOSNAP &&
+ matching_remote_snap_it !=
+ m_state_builder->remote_image_ctx->snap_info.end()) {
+ root_obj["syncing_snapshot_timestamp"] = remote_snap_info->timestamp.sec();
+ root_obj["syncing_percent"] = static_cast<uint64_t>(
+ 100 * m_local_mirror_snap_ns.last_copied_object_number /
+ static_cast<float>(std::max<uint64_t>(1U, m_local_object_count)));
+ }
+
+ m_bytes_per_second(0);
+ auto bytes_per_second = m_bytes_per_second.get_average();
+ root_obj["bytes_per_second"] = round_to_two_places(bytes_per_second);
+
+ auto bytes_per_snapshot = boost::accumulators::rolling_mean(
+ m_bytes_per_snapshot);
+ root_obj["bytes_per_snapshot"] = round_to_two_places(bytes_per_snapshot);
+
+ auto pending_bytes = bytes_per_snapshot * m_pending_snapshots;
+ if (bytes_per_second > 0 && m_pending_snapshots > 0) {
+ std::uint64_t seconds_until_synced = round_to_two_places(
+ pending_bytes / bytes_per_second);
+ if (seconds_until_synced >= std::numeric_limits<uint64_t>::max()) {
+ seconds_until_synced = std::numeric_limits<uint64_t>::max();
+ }
+
+ root_obj["seconds_until_synced"] = seconds_until_synced;
+ }
+
+ *description = json_spirit::write(
+ root_obj, json_spirit::remove_trailing_zeros);
+
+ local_image_locker.unlock();
+ remote_image_locker.unlock();
+ locker.unlock();
+ on_finish->complete(-EEXIST);
+ return true;
+}
+
+template <typename I>
+void Replayer<I>::load_local_image_meta() {
+ dout(10) << dendl;
+
+ {
+ // reset state in case new snapshot is added while we are scanning
+ std::unique_lock locker{m_lock};
+ m_image_updated = false;
+ }
+
+ bool update_status = false;
+ {
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+ std::shared_lock image_locker{local_image_ctx->image_lock};
+ auto image_spec = image_replayer::util::compute_image_spec(
+ local_image_ctx->md_ctx, local_image_ctx->name);
+ if (m_image_spec != image_spec) {
+ m_image_spec = image_spec;
+ update_status = true;
+ }
+ }
+ if (update_status) {
+ std::unique_lock locker{m_lock};
+ unregister_perf_counters();
+ register_perf_counters();
+ notify_status_updated();
+ }
+
+ ceph_assert(m_state_builder->local_image_meta != nullptr);
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_load_local_image_meta>(this);
+ m_state_builder->local_image_meta->load(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_load_local_image_meta(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to load local image-meta: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to load local image-meta");
+ return;
+ }
+
+ if (r >= 0 && m_state_builder->local_image_meta->resync_requested) {
+ m_resync_requested = true;
+
+ dout(10) << "local image resync requested" << dendl;
+ handle_replay_complete(0, "resync requested");
+ return;
+ }
+
+ refresh_local_image();
+}
+
+template <typename I>
+void Replayer<I>::refresh_local_image() {
+ if (!m_state_builder->local_image_ctx->state->is_refresh_required()) {
+ refresh_remote_image();
+ return;
+ }
+
+ dout(10) << dendl;
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_refresh_local_image>(this);
+ m_state_builder->local_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_refresh_local_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to refresh local image: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to refresh local image");
+ return;
+ }
+
+ refresh_remote_image();
+}
+
+template <typename I>
+void Replayer<I>::refresh_remote_image() {
+ if (!m_state_builder->remote_image_ctx->state->is_refresh_required()) {
+ std::unique_lock locker{m_lock};
+ scan_local_mirror_snapshots(&locker);
+ return;
+ }
+
+ dout(10) << dendl;
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_refresh_remote_image>(this);
+ m_state_builder->remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_refresh_remote_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to refresh remote image: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to refresh remote image");
+ return;
+ }
+
+ std::unique_lock locker{m_lock};
+ scan_local_mirror_snapshots(&locker);
+}
+
+template <typename I>
+void Replayer<I>::scan_local_mirror_snapshots(
+ std::unique_lock<ceph::mutex>* locker) {
+ if (is_replay_interrupted(locker)) {
+ return;
+ }
+
+ dout(10) << dendl;
+
+ m_local_snap_id_start = 0;
+ m_local_snap_id_end = CEPH_NOSNAP;
+ m_local_mirror_snap_ns = {};
+ m_local_object_count = 0;
+
+ m_remote_snap_id_start = 0;
+ m_remote_snap_id_end = CEPH_NOSNAP;
+ m_remote_mirror_snap_ns = {};
+
+ std::set<uint64_t> prune_snap_ids;
+
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+ std::shared_lock image_locker{local_image_ctx->image_lock};
+ for (auto snap_info_it = local_image_ctx->snap_info.begin();
+ snap_info_it != local_image_ctx->snap_info.end(); ++snap_info_it) {
+ const auto& snap_ns = snap_info_it->second.snap_namespace;
+ auto mirror_ns = boost::get<
+ cls::rbd::MirrorSnapshotNamespace>(&snap_ns);
+ if (mirror_ns == nullptr) {
+ continue;
+ }
+
+ dout(15) << "local mirror snapshot: id=" << snap_info_it->first << ", "
+ << "mirror_ns=" << *mirror_ns << dendl;
+ m_local_mirror_snap_ns = *mirror_ns;
+
+ auto local_snap_id = snap_info_it->first;
+ if (mirror_ns->is_non_primary()) {
+ if (mirror_ns->complete) {
+ // if remote has new snapshots, we would sync from here
+ m_local_snap_id_start = local_snap_id;
+ ceph_assert(m_local_snap_id_end == CEPH_NOSNAP);
+
+ if (mirror_ns->mirror_peer_uuids.empty()) {
+ // no other peer will attempt to sync to this snapshot so store as
+ // a candidate for removal
+ prune_snap_ids.insert(local_snap_id);
+ }
+ } else if (mirror_ns->last_copied_object_number == 0 &&
+ m_local_snap_id_start > 0) {
+ // snapshot might be missing image state, object-map, etc, so just
+ // delete and re-create it if we haven't started copying data
+ // objects. Also only prune this snapshot since we will need the
+ // previous mirror snapshot for syncing. Special case exception for
+ // the first non-primary snapshot since we know its snapshot is
+ // well-formed because otherwise the mirror-image-state would have
+ // forced an image deletion.
+ prune_snap_ids.clear();
+ prune_snap_ids.insert(local_snap_id);
+ break;
+ } else {
+ // start snap will be last complete mirror snapshot or initial
+ // image revision
+ m_local_snap_id_end = local_snap_id;
+ break;
+ }
+ } else if (mirror_ns->is_primary()) {
+ if (mirror_ns->complete) {
+ m_local_snap_id_start = local_snap_id;
+ ceph_assert(m_local_snap_id_end == CEPH_NOSNAP);
+ } else {
+ derr << "incomplete local primary snapshot" << dendl;
+ handle_replay_complete(locker, -EINVAL,
+ "incomplete local primary snapshot");
+ return;
+ }
+ } else {
+ derr << "unknown local mirror snapshot state" << dendl;
+ handle_replay_complete(locker, -EINVAL,
+ "invalid local mirror snapshot state");
+ return;
+ }
+ }
+ image_locker.unlock();
+
+ if (m_local_snap_id_start > 0) {
+ // remove candidate that is required for delta snapshot sync
+ prune_snap_ids.erase(m_local_snap_id_start);
+ }
+ if (!prune_snap_ids.empty()) {
+ locker->unlock();
+
+ auto prune_snap_id = *prune_snap_ids.begin();
+ dout(5) << "pruning unused non-primary snapshot " << prune_snap_id << dendl;
+ prune_non_primary_snapshot(prune_snap_id);
+ return;
+ }
+
+ if (m_local_snap_id_start > 0 || m_local_snap_id_end != CEPH_NOSNAP) {
+ if (m_local_mirror_snap_ns.is_non_primary() &&
+ m_local_mirror_snap_ns.primary_mirror_uuid !=
+ m_state_builder->remote_mirror_uuid) {
+ // TODO support multiple peers
+ derr << "local image linked to unknown peer: "
+ << m_local_mirror_snap_ns.primary_mirror_uuid << dendl;
+ handle_replay_complete(locker, -EEXIST,
+ "local image linked to unknown peer");
+ return;
+ } else if (m_local_mirror_snap_ns.state ==
+ cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY) {
+ dout(5) << "local image promoted" << dendl;
+ handle_replay_complete(locker, 0, "force promoted");
+ return;
+ }
+
+ dout(10) << "found local mirror snapshot: "
+ << "local_snap_id_start=" << m_local_snap_id_start << ", "
+ << "local_snap_id_end=" << m_local_snap_id_end << ", "
+ << "local_snap_ns=" << m_local_mirror_snap_ns << dendl;
+ if (!m_local_mirror_snap_ns.is_primary() &&
+ m_local_mirror_snap_ns.complete) {
+ // our remote sync should start after this completed snapshot
+ m_remote_snap_id_start = m_local_mirror_snap_ns.primary_snap_id;
+ }
+ }
+
+ // we don't have any mirror snapshots or only completed non-primary
+ // mirror snapshots
+ scan_remote_mirror_snapshots(locker);
+}
+
+template <typename I>
+void Replayer<I>::scan_remote_mirror_snapshots(
+ std::unique_lock<ceph::mutex>* locker) {
+ dout(10) << dendl;
+
+ m_pending_snapshots = 0;
+
+ std::set<uint64_t> unlink_snap_ids;
+ bool split_brain = false;
+ bool remote_demoted = false;
+ auto remote_image_ctx = m_state_builder->remote_image_ctx;
+ std::shared_lock image_locker{remote_image_ctx->image_lock};
+ for (auto snap_info_it = remote_image_ctx->snap_info.begin();
+ snap_info_it != remote_image_ctx->snap_info.end(); ++snap_info_it) {
+ const auto& snap_ns = snap_info_it->second.snap_namespace;
+ auto mirror_ns = boost::get<
+ cls::rbd::MirrorSnapshotNamespace>(&snap_ns);
+ if (mirror_ns == nullptr) {
+ continue;
+ }
+
+ dout(15) << "remote mirror snapshot: id=" << snap_info_it->first << ", "
+ << "mirror_ns=" << *mirror_ns << dendl;
+ remote_demoted = mirror_ns->is_demoted();
+ if (!mirror_ns->is_primary() && !mirror_ns->is_non_primary()) {
+ derr << "unknown remote mirror snapshot state" << dendl;
+ handle_replay_complete(locker, -EINVAL,
+ "invalid remote mirror snapshot state");
+ return;
+ } else if (mirror_ns->mirror_peer_uuids.count(m_remote_mirror_peer_uuid) ==
+ 0) {
+ dout(15) << "skipping remote snapshot due to missing mirror peer"
+ << dendl;
+ continue;
+ }
+
+ auto remote_snap_id = snap_info_it->first;
+ if (m_local_snap_id_start > 0 || m_local_snap_id_end != CEPH_NOSNAP) {
+ // we have a local mirror snapshot
+ if (m_local_mirror_snap_ns.is_non_primary()) {
+ // previously validated that it was linked to remote
+ ceph_assert(m_local_mirror_snap_ns.primary_mirror_uuid ==
+ m_state_builder->remote_mirror_uuid);
+
+ if (m_remote_snap_id_end == CEPH_NOSNAP) {
+ // haven't found the end snap so treat this as a candidate for unlink
+ unlink_snap_ids.insert(remote_snap_id);
+ }
+ if (m_local_mirror_snap_ns.complete &&
+ m_local_mirror_snap_ns.primary_snap_id >= remote_snap_id) {
+ // skip past completed remote snapshot
+ m_remote_snap_id_start = remote_snap_id;
+ m_remote_mirror_snap_ns = *mirror_ns;
+ dout(15) << "skipping synced remote snapshot " << remote_snap_id
+ << dendl;
+ continue;
+ } else if (!m_local_mirror_snap_ns.complete &&
+ m_local_mirror_snap_ns.primary_snap_id > remote_snap_id) {
+ // skip until we get to the in-progress remote snapshot
+ dout(15) << "skipping synced remote snapshot " << remote_snap_id
+ << " while search for in-progress sync" << dendl;
+ m_remote_snap_id_start = remote_snap_id;
+ m_remote_mirror_snap_ns = *mirror_ns;
+ continue;
+ }
+ } else if (m_local_mirror_snap_ns.state ==
+ cls::rbd::MIRROR_SNAPSHOT_STATE_PRIMARY_DEMOTED) {
+ // find the matching demotion snapshot in remote image
+ ceph_assert(m_local_snap_id_start > 0);
+ if (mirror_ns->state ==
+ cls::rbd::MIRROR_SNAPSHOT_STATE_NON_PRIMARY_DEMOTED &&
+ mirror_ns->primary_mirror_uuid == m_local_mirror_uuid &&
+ mirror_ns->primary_snap_id == m_local_snap_id_start) {
+ dout(10) << "located matching demotion snapshot: "
+ << "remote_snap_id=" << remote_snap_id << ", "
+ << "local_snap_id=" << m_local_snap_id_start << dendl;
+ m_remote_snap_id_start = remote_snap_id;
+ split_brain = false;
+ continue;
+ } else if (m_remote_snap_id_start == 0) {
+ // still looking for our matching demotion snapshot
+ dout(15) << "skipping remote snapshot " << remote_snap_id << " "
+ << "while searching for demotion" << dendl;
+ split_brain = true;
+ continue;
+ }
+ } else {
+ // should not have been able to reach this
+ ceph_assert(false);
+ }
+ } else if (!mirror_ns->is_primary()) {
+ dout(15) << "skipping non-primary remote snapshot" << dendl;
+ continue;
+ }
+
+ // found candidate snapshot to sync
+ ++m_pending_snapshots;
+ if (m_remote_snap_id_end != CEPH_NOSNAP) {
+ continue;
+ }
+
+ // first primary snapshot where were are listed as a peer
+ m_remote_snap_id_end = remote_snap_id;
+ m_remote_mirror_snap_ns = *mirror_ns;
+ }
+
+ if (m_remote_snap_id_start != 0 &&
+ remote_image_ctx->snap_info.count(m_remote_snap_id_start) == 0) {
+ // the remote start snapshot was deleted out from under us
+ derr << "failed to locate remote start snapshot: "
+ << "snap_id=" << m_remote_snap_id_start << dendl;
+ split_brain = true;
+ }
+
+ image_locker.unlock();
+
+ if (!split_brain) {
+ unlink_snap_ids.erase(m_remote_snap_id_start);
+ unlink_snap_ids.erase(m_remote_snap_id_end);
+ if (!unlink_snap_ids.empty()) {
+ locker->unlock();
+
+ // retry the unlinking process for a remote snapshot that we do not
+ // need anymore
+ auto remote_snap_id = *unlink_snap_ids.begin();
+ dout(10) << "unlinking from remote snapshot " << remote_snap_id << dendl;
+ unlink_peer(remote_snap_id);
+ return;
+ }
+
+ if (m_remote_snap_id_end != CEPH_NOSNAP) {
+ dout(10) << "found remote mirror snapshot: "
+ << "remote_snap_id_start=" << m_remote_snap_id_start << ", "
+ << "remote_snap_id_end=" << m_remote_snap_id_end << ", "
+ << "remote_snap_ns=" << m_remote_mirror_snap_ns << dendl;
+ if (m_remote_mirror_snap_ns.complete) {
+ locker->unlock();
+
+ if (m_local_snap_id_end != CEPH_NOSNAP &&
+ !m_local_mirror_snap_ns.complete) {
+ // attempt to resume image-sync
+ dout(10) << "local image contains in-progress mirror snapshot"
+ << dendl;
+ get_local_image_state();
+ } else {
+ copy_snapshots();
+ }
+ return;
+ } else {
+ // might have raced with the creation of a remote mirror snapshot
+ // so we will need to refresh and rescan once it completes
+ dout(15) << "remote mirror snapshot not complete" << dendl;
+ }
+ }
+ }
+
+ if (m_image_updated) {
+ // received update notification while scanning image, restart ...
+ m_image_updated = false;
+ locker->unlock();
+
+ dout(10) << "restarting snapshot scan due to remote update notification"
+ << dendl;
+ load_local_image_meta();
+ return;
+ }
+
+ if (is_replay_interrupted(locker)) {
+ return;
+ } else if (split_brain) {
+ derr << "split-brain detected: failed to find matching non-primary "
+ << "snapshot in remote image: "
+ << "local_snap_id_start=" << m_local_snap_id_start << ", "
+ << "local_snap_ns=" << m_local_mirror_snap_ns << dendl;
+ handle_replay_complete(locker, -EEXIST, "split-brain");
+ return;
+ } else if (remote_demoted) {
+ dout(10) << "remote image demoted" << dendl;
+ handle_replay_complete(locker, -EREMOTEIO, "remote image demoted");
+ return;
+ }
+
+ dout(10) << "all remote snapshots synced: idling waiting for new snapshot"
+ << dendl;
+ ceph_assert(m_state == STATE_REPLAYING);
+ m_state = STATE_IDLE;
+
+ notify_status_updated();
+}
+
+template <typename I>
+void Replayer<I>::prune_non_primary_snapshot(uint64_t snap_id) {
+ dout(10) << "snap_id=" << snap_id << dendl;
+
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+ bool snap_valid = false;
+ cls::rbd::SnapshotNamespace snap_namespace;
+ std::string snap_name;
+
+ {
+ std::shared_lock image_locker{local_image_ctx->image_lock};
+ auto snap_info = local_image_ctx->get_snap_info(snap_id);
+ if (snap_info != nullptr) {
+ snap_valid = true;
+ snap_namespace = snap_info->snap_namespace;
+ snap_name = snap_info->name;
+
+ ceph_assert(boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &snap_namespace) != nullptr);
+ }
+ }
+
+ if (!snap_valid) {
+ load_local_image_meta();
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_prune_non_primary_snapshot>(this);
+ local_image_ctx->operations->snap_remove(snap_namespace, snap_name, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_prune_non_primary_snapshot(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to prune non-primary snapshot: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to prune non-primary snapshot");
+ return;
+ }
+
+ if (is_replay_interrupted()) {
+ return;
+ }
+
+ load_local_image_meta();
+}
+
+template <typename I>
+void Replayer<I>::copy_snapshots() {
+ dout(10) << "remote_snap_id_start=" << m_remote_snap_id_start << ", "
+ << "remote_snap_id_end=" << m_remote_snap_id_end << ", "
+ << "local_snap_id_start=" << m_local_snap_id_start << dendl;
+
+ ceph_assert(m_remote_snap_id_start != CEPH_NOSNAP);
+ ceph_assert(m_remote_snap_id_end > 0 &&
+ m_remote_snap_id_end != CEPH_NOSNAP);
+ ceph_assert(m_local_snap_id_start != CEPH_NOSNAP);
+
+ m_local_mirror_snap_ns = {};
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_copy_snapshots>(this);
+ auto req = librbd::deep_copy::SnapshotCopyRequest<I>::create(
+ m_state_builder->remote_image_ctx, m_state_builder->local_image_ctx,
+ m_remote_snap_id_start, m_remote_snap_id_end, m_local_snap_id_start,
+ false, m_threads->work_queue, &m_local_mirror_snap_ns.snap_seqs,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_copy_snapshots(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to copy snapshots from remote to local image: "
+ << cpp_strerror(r) << dendl;
+ handle_replay_complete(
+ r, "failed to copy snapshots from remote to local image");
+ return;
+ }
+
+ dout(10) << "remote_snap_id_start=" << m_remote_snap_id_start << ", "
+ << "remote_snap_id_end=" << m_remote_snap_id_end << ", "
+ << "local_snap_id_start=" << m_local_snap_id_start << ", "
+ << "snap_seqs=" << m_local_mirror_snap_ns.snap_seqs << dendl;
+ get_remote_image_state();
+}
+
+template <typename I>
+void Replayer<I>::get_remote_image_state() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_get_remote_image_state>(this);
+ auto req = librbd::mirror::snapshot::GetImageStateRequest<I>::create(
+ m_state_builder->remote_image_ctx, m_remote_snap_id_end,
+ &m_image_state, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_get_remote_image_state(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to retrieve remote snapshot image state: "
+ << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to retrieve remote snapshot image state");
+ return;
+ }
+
+ create_non_primary_snapshot();
+}
+
+template <typename I>
+void Replayer<I>::get_local_image_state() {
+ dout(10) << dendl;
+
+ ceph_assert(m_local_snap_id_end != CEPH_NOSNAP);
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_get_local_image_state>(this);
+ auto req = librbd::mirror::snapshot::GetImageStateRequest<I>::create(
+ m_state_builder->local_image_ctx, m_local_snap_id_end,
+ &m_image_state, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_get_local_image_state(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to retrieve local snapshot image state: "
+ << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to retrieve local snapshot image state");
+ return;
+ }
+
+ request_sync();
+}
+
+template <typename I>
+void Replayer<I>::create_non_primary_snapshot() {
+ auto local_image_ctx = m_state_builder->local_image_ctx;
+
+ if (m_local_snap_id_start > 0) {
+ std::shared_lock local_image_locker{local_image_ctx->image_lock};
+
+ auto local_snap_info_it = local_image_ctx->snap_info.find(
+ m_local_snap_id_start);
+ if (local_snap_info_it == local_image_ctx->snap_info.end()) {
+ local_image_locker.unlock();
+
+ derr << "failed to locate local snapshot " << m_local_snap_id_start
+ << dendl;
+ handle_replay_complete(-ENOENT, "failed to locate local start snapshot");
+ return;
+ }
+
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &local_snap_info_it->second.snap_namespace);
+ ceph_assert(mirror_ns != nullptr);
+
+ auto remote_image_ctx = m_state_builder->remote_image_ctx;
+ std::shared_lock remote_image_locker{remote_image_ctx->image_lock};
+
+ // (re)build a full mapping from remote to local snap ids for all user
+ // snapshots to support applying image state in the future
+ for (auto& [remote_snap_id, remote_snap_info] :
+ remote_image_ctx->snap_info) {
+ if (remote_snap_id >= m_remote_snap_id_end) {
+ break;
+ }
+
+ // we can ignore all non-user snapshots since image state only includes
+ // user snapshots
+ if (boost::get<cls::rbd::UserSnapshotNamespace>(
+ &remote_snap_info.snap_namespace) == nullptr) {
+ continue;
+ }
+
+ uint64_t local_snap_id = CEPH_NOSNAP;
+ if (mirror_ns->is_demoted() && !m_remote_mirror_snap_ns.is_demoted()) {
+ // if we are creating a non-primary snapshot following a demotion,
+ // re-build the full snapshot sequence since we don't have a valid
+ // snapshot mapping
+ auto local_snap_id_it = local_image_ctx->snap_ids.find(
+ {remote_snap_info.snap_namespace, remote_snap_info.name});
+ if (local_snap_id_it != local_image_ctx->snap_ids.end()) {
+ local_snap_id = local_snap_id_it->second;
+ }
+ } else {
+ auto snap_seq_it = mirror_ns->snap_seqs.find(remote_snap_id);
+ if (snap_seq_it != mirror_ns->snap_seqs.end()) {
+ local_snap_id = snap_seq_it->second;
+ }
+ }
+
+ if (m_local_mirror_snap_ns.snap_seqs.count(remote_snap_id) == 0 &&
+ local_snap_id != CEPH_NOSNAP) {
+ dout(15) << "mapping remote snapshot " << remote_snap_id << " to "
+ << "local snapshot " << local_snap_id << dendl;
+ m_local_mirror_snap_ns.snap_seqs[remote_snap_id] = local_snap_id;
+ }
+ }
+ }
+
+ dout(10) << "demoted=" << m_remote_mirror_snap_ns.is_demoted() << ", "
+ << "primary_mirror_uuid="
+ << m_state_builder->remote_mirror_uuid << ", "
+ << "primary_snap_id=" << m_remote_snap_id_end << ", "
+ << "snap_seqs=" << m_local_mirror_snap_ns.snap_seqs << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_create_non_primary_snapshot>(this);
+ auto req = librbd::mirror::snapshot::CreateNonPrimaryRequest<I>::create(
+ local_image_ctx, m_remote_mirror_snap_ns.is_demoted(),
+ m_state_builder->remote_mirror_uuid, m_remote_snap_id_end,
+ m_local_mirror_snap_ns.snap_seqs, m_image_state, &m_local_snap_id_end, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_create_non_primary_snapshot(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to create local mirror snapshot: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to create local mirror snapshot");
+ return;
+ }
+
+ dout(15) << "local_snap_id_end=" << m_local_snap_id_end << dendl;
+
+ update_mirror_image_state();
+}
+
+template <typename I>
+void Replayer<I>::update_mirror_image_state() {
+ if (m_local_snap_id_start > 0) {
+ request_sync();
+ return;
+ }
+
+ // a newly created non-primary image has a local mirror state of CREATING
+ // until this point so that we could avoid preserving the image until
+ // the first non-primary snapshot linked the two images together.
+ dout(10) << dendl;
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_update_mirror_image_state>(this);
+ auto req = librbd::mirror::ImageStateUpdateRequest<I>::create(
+ m_state_builder->local_image_ctx->md_ctx,
+ m_state_builder->local_image_ctx->id,
+ cls::rbd::MIRROR_IMAGE_STATE_ENABLED, {}, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_update_mirror_image_state(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update local mirror image state: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to update local mirror image state");
+ return;
+ }
+
+ request_sync();
+}
+
+template <typename I>
+void Replayer<I>::request_sync() {
+ if (m_remote_mirror_snap_ns.clean_since_snap_id == m_remote_snap_id_start) {
+ dout(10) << "skipping unnecessary image copy: "
+ << "remote_snap_id_start=" << m_remote_snap_id_start << ", "
+ << "remote_mirror_snap_ns=" << m_remote_mirror_snap_ns << dendl;
+ apply_image_state();
+ return;
+ }
+
+ dout(10) << dendl;
+ std::unique_lock locker{m_lock};
+ if (is_replay_interrupted(&locker)) {
+ return;
+ }
+
+ auto ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_request_sync>(this));
+ m_instance_watcher->notify_sync_request(m_state_builder->local_image_ctx->id,
+ ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_request_sync(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (is_replay_interrupted(&locker)) {
+ return;
+ } else if (r == -ECANCELED) {
+ dout(5) << "image-sync canceled" << dendl;
+ handle_replay_complete(&locker, r, "image-sync canceled");
+ return;
+ } else if (r < 0) {
+ derr << "failed to request image-sync: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(&locker, r, "failed to request image-sync");
+ return;
+ }
+
+ m_sync_in_progress = true;
+ locker.unlock();
+
+ copy_image();
+}
+
+template <typename I>
+void Replayer<I>::copy_image() {
+ dout(10) << "remote_snap_id_start=" << m_remote_snap_id_start << ", "
+ << "remote_snap_id_end=" << m_remote_snap_id_end << ", "
+ << "local_snap_id_start=" << m_local_snap_id_start << ", "
+ << "last_copied_object_number="
+ << m_local_mirror_snap_ns.last_copied_object_number << ", "
+ << "snap_seqs=" << m_local_mirror_snap_ns.snap_seqs << dendl;
+
+ m_snapshot_bytes = 0;
+ m_snapshot_replay_start = ceph_clock_now();
+ m_deep_copy_handler = new DeepCopyHandler(this);
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_copy_image>(this);
+ auto req = librbd::deep_copy::ImageCopyRequest<I>::create(
+ m_state_builder->remote_image_ctx, m_state_builder->local_image_ctx,
+ m_remote_snap_id_start, m_remote_snap_id_end, m_local_snap_id_start, false,
+ (m_local_mirror_snap_ns.last_copied_object_number > 0 ?
+ librbd::deep_copy::ObjectNumber{
+ m_local_mirror_snap_ns.last_copied_object_number} :
+ librbd::deep_copy::ObjectNumber{}),
+ m_local_mirror_snap_ns.snap_seqs, m_deep_copy_handler, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_copy_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ delete m_deep_copy_handler;
+ m_deep_copy_handler = nullptr;
+
+ if (r < 0) {
+ derr << "failed to copy remote image to local image: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to copy remote image");
+ return;
+ }
+
+ {
+ std::unique_lock locker{m_lock};
+ m_bytes_per_snapshot(m_snapshot_bytes);
+ auto time = ceph_clock_now() - m_snapshot_replay_start;
+ if (g_snapshot_perf_counters) {
+ g_snapshot_perf_counters->inc(l_rbd_mirror_snapshot_replay_bytes,
+ m_snapshot_bytes);
+ g_snapshot_perf_counters->inc(l_rbd_mirror_snapshot_replay_snapshots);
+ g_snapshot_perf_counters->tinc(
+ l_rbd_mirror_snapshot_replay_snapshots_time, time);
+ }
+ if (m_perf_counters) {
+ m_perf_counters->inc(l_rbd_mirror_snapshot_replay_bytes, m_snapshot_bytes);
+ m_perf_counters->inc(l_rbd_mirror_snapshot_replay_snapshots);
+ m_perf_counters->tinc(l_rbd_mirror_snapshot_replay_snapshots_time, time);
+ }
+ m_snapshot_bytes = 0;
+ }
+
+ apply_image_state();
+}
+
+template <typename I>
+void Replayer<I>::handle_copy_image_progress(uint64_t object_number,
+ uint64_t object_count) {
+ dout(10) << "object_number=" << object_number << ", "
+ << "object_count=" << object_count << dendl;
+
+ std::unique_lock locker{m_lock};
+ m_local_mirror_snap_ns.last_copied_object_number = std::min(
+ object_number, object_count);
+ m_local_object_count = object_count;
+
+ update_non_primary_snapshot(false);
+}
+
+template <typename I>
+void Replayer<I>::handle_copy_image_read(uint64_t bytes_read) {
+ dout(20) << "bytes_read=" << bytes_read << dendl;
+
+ std::unique_lock locker{m_lock};
+ m_bytes_per_second(bytes_read);
+ m_snapshot_bytes += bytes_read;
+}
+
+template <typename I>
+void Replayer<I>::apply_image_state() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_apply_image_state>(this);
+ auto req = ApplyImageStateRequest<I>::create(
+ m_local_mirror_uuid,
+ m_state_builder->remote_mirror_uuid,
+ m_state_builder->local_image_ctx,
+ m_state_builder->remote_image_ctx,
+ m_image_state, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_apply_image_state(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to apply remote image state to local image: "
+ << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to apply remote image state");
+ return;
+ }
+
+ std::unique_lock locker{m_lock};
+ update_non_primary_snapshot(true);
+}
+
+template <typename I>
+void Replayer<I>::update_non_primary_snapshot(bool complete) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ if (!complete) {
+ // disallow two in-flight updates if this isn't the completion of the sync
+ if (m_updating_sync_point) {
+ return;
+ }
+ m_updating_sync_point = true;
+ } else {
+ m_local_mirror_snap_ns.complete = true;
+ }
+
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_snapshot_set_copy_progress(
+ &op, m_local_snap_id_end, m_local_mirror_snap_ns.complete,
+ m_local_mirror_snap_ns.last_copied_object_number);
+
+ auto ctx = new C_TrackedOp(
+ m_in_flight_op_tracker, new LambdaContext([this, complete](int r) {
+ handle_update_non_primary_snapshot(complete, r);
+ }));
+ auto aio_comp = create_rados_callback(ctx);
+ int r = m_state_builder->local_image_ctx->md_ctx.aio_operate(
+ m_state_builder->local_image_ctx->header_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void Replayer<I>::handle_update_non_primary_snapshot(bool complete, int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update local snapshot progress: " << cpp_strerror(r)
+ << dendl;
+ if (complete) {
+ // only fail if this was the final update
+ handle_replay_complete(r, "failed to update local snapshot progress");
+ return;
+ }
+ }
+
+ if (!complete) {
+ // periodic sync-point update -- do not advance state machine
+ std::unique_lock locker{m_lock};
+
+ ceph_assert(m_updating_sync_point);
+ m_updating_sync_point = false;
+ return;
+ }
+
+ notify_image_update();
+}
+
+template <typename I>
+void Replayer<I>::notify_image_update() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_notify_image_update>(this);
+ m_state_builder->local_image_ctx->notify_update(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_notify_image_update(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to notify local image update: " << cpp_strerror(r) << dendl;
+ }
+
+ unlink_peer(m_remote_snap_id_start);
+}
+
+template <typename I>
+void Replayer<I>::unlink_peer(uint64_t remote_snap_id) {
+ if (remote_snap_id == 0) {
+ finish_sync();
+ return;
+ }
+
+ // local snapshot fully synced -- we no longer depend on the sync
+ // start snapshot in the remote image
+ dout(10) << "remote_snap_id=" << remote_snap_id << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_unlink_peer>(this);
+ auto req = librbd::mirror::snapshot::UnlinkPeerRequest<I>::create(
+ m_state_builder->remote_image_ctx, remote_snap_id,
+ m_remote_mirror_peer_uuid, ctx);
+ req->send();
+}
+
+template <typename I>
+void Replayer<I>::handle_unlink_peer(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to unlink local peer from remote image: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to unlink local peer from remote image");
+ return;
+ }
+
+ finish_sync();
+}
+
+template <typename I>
+void Replayer<I>::finish_sync() {
+ dout(10) << dendl;
+
+ {
+ std::unique_lock locker{m_lock};
+ notify_status_updated();
+
+ if (m_sync_in_progress) {
+ m_sync_in_progress = false;
+ m_instance_watcher->notify_sync_complete(
+ m_state_builder->local_image_ctx->id);
+ }
+ }
+
+ if (is_replay_interrupted()) {
+ return;
+ }
+
+ load_local_image_meta();
+}
+
+template <typename I>
+void Replayer<I>::register_local_update_watcher() {
+ dout(10) << dendl;
+
+ m_update_watch_ctx = new C_UpdateWatchCtx(this);
+
+ int r = m_state_builder->local_image_ctx->state->register_update_watcher(
+ m_update_watch_ctx, &m_local_update_watcher_handle);
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_register_local_update_watcher>(this);
+ m_threads->work_queue->queue(ctx, r);
+}
+
+template <typename I>
+void Replayer<I>::handle_register_local_update_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to register local update watcher: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to register local image update watcher");
+ m_state = STATE_COMPLETE;
+
+ delete m_update_watch_ctx;
+ m_update_watch_ctx = nullptr;
+
+ Context* on_init = nullptr;
+ std::swap(on_init, m_on_init_shutdown);
+ on_init->complete(r);
+ return;
+ }
+
+ register_remote_update_watcher();
+}
+
+template <typename I>
+void Replayer<I>::register_remote_update_watcher() {
+ dout(10) << dendl;
+
+ int r = m_state_builder->remote_image_ctx->state->register_update_watcher(
+ m_update_watch_ctx, &m_remote_update_watcher_handle);
+ auto ctx = create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_register_remote_update_watcher>(this);
+ m_threads->work_queue->queue(ctx, r);
+}
+
+template <typename I>
+void Replayer<I>::handle_register_remote_update_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to register remote update watcher: " << cpp_strerror(r)
+ << dendl;
+ handle_replay_complete(r, "failed to register remote image update watcher");
+ m_state = STATE_COMPLETE;
+
+ unregister_local_update_watcher();
+ return;
+ }
+
+ m_state = STATE_REPLAYING;
+
+ Context* on_init = nullptr;
+ std::swap(on_init, m_on_init_shutdown);
+ on_init->complete(0);
+
+ // delay initial snapshot scan until after we have alerted
+ // image replayer that we have initialized in case an error
+ // occurs
+ {
+ std::unique_lock locker{m_lock};
+ notify_status_updated();
+ }
+
+ load_local_image_meta();
+}
+
+template <typename I>
+void Replayer<I>::unregister_remote_update_watcher() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>,
+ &Replayer<I>::handle_unregister_remote_update_watcher>(this);
+ m_state_builder->remote_image_ctx->state->unregister_update_watcher(
+ m_remote_update_watcher_handle, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_unregister_remote_update_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to unregister remote update watcher: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ unregister_local_update_watcher();
+}
+
+template <typename I>
+void Replayer<I>::unregister_local_update_watcher() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ Replayer<I>,
+ &Replayer<I>::handle_unregister_local_update_watcher>(this);
+ m_state_builder->local_image_ctx->state->unregister_update_watcher(
+ m_local_update_watcher_handle, ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_unregister_local_update_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to unregister local update watcher: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ delete m_update_watch_ctx;
+ m_update_watch_ctx = nullptr;
+
+ wait_for_in_flight_ops();
+}
+
+template <typename I>
+void Replayer<I>::wait_for_in_flight_ops() {
+ dout(10) << dendl;
+
+ auto ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ Replayer<I>, &Replayer<I>::handle_wait_for_in_flight_ops>(this));
+ m_in_flight_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Replayer<I>::handle_wait_for_in_flight_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context* on_shutdown = nullptr;
+ {
+ std::unique_lock locker{m_lock};
+ ceph_assert(m_on_init_shutdown != nullptr);
+ std::swap(on_shutdown, m_on_init_shutdown);
+ }
+ on_shutdown->complete(m_error_code);
+}
+
+template <typename I>
+void Replayer<I>::handle_image_update_notify() {
+ dout(10) << dendl;
+
+ std::unique_lock locker{m_lock};
+ if (m_state == STATE_REPLAYING) {
+ dout(15) << "flagging snapshot rescan required" << dendl;
+ m_image_updated = true;
+ } else if (m_state == STATE_IDLE) {
+ m_state = STATE_REPLAYING;
+ locker.unlock();
+
+ dout(15) << "restarting idle replayer" << dendl;
+ load_local_image_meta();
+ }
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_complete(int r,
+ const std::string& description) {
+ std::unique_lock locker{m_lock};
+ handle_replay_complete(&locker, r, description);
+}
+
+template <typename I>
+void Replayer<I>::handle_replay_complete(std::unique_lock<ceph::mutex>* locker,
+ int r,
+ const std::string& description) {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ if (m_sync_in_progress) {
+ m_sync_in_progress = false;
+ m_instance_watcher->notify_sync_complete(
+ m_state_builder->local_image_ctx->id);
+ }
+
+ // don't set error code and description if resuming a pending
+ // shutdown
+ if (is_replay_interrupted(locker)) {
+ return;
+ }
+
+ if (m_error_code == 0) {
+ m_error_code = r;
+ m_error_description = description;
+ }
+
+ if (m_state != STATE_REPLAYING && m_state != STATE_IDLE) {
+ return;
+ }
+
+ m_state = STATE_COMPLETE;
+ notify_status_updated();
+}
+
+template <typename I>
+void Replayer<I>::notify_status_updated() {
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ dout(10) << dendl;
+ auto ctx = new C_TrackedOp(m_in_flight_op_tracker, new LambdaContext(
+ [this](int) {
+ m_replayer_listener->handle_notification();
+ }));
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+bool Replayer<I>::is_replay_interrupted() {
+ std::unique_lock locker{m_lock};
+ return is_replay_interrupted(&locker);
+}
+
+template <typename I>
+bool Replayer<I>::is_replay_interrupted(std::unique_lock<ceph::mutex>* locker) {
+ if (m_state == STATE_COMPLETE) {
+ locker->unlock();
+
+ dout(10) << "resuming pending shut down" << dendl;
+ unregister_remote_update_watcher();
+ return true;
+ }
+ return false;
+}
+
+template <typename I>
+void Replayer<I>::register_perf_counters() {
+ dout(5) << dendl;
+
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+ ceph_assert(m_perf_counters == nullptr);
+
+ auto cct = static_cast<CephContext *>(m_state_builder->local_image_ctx->cct);
+ auto prio = cct->_conf.get_val<int64_t>("rbd_mirror_image_perf_stats_prio");
+ PerfCountersBuilder plb(g_ceph_context,
+ "rbd_mirror_snapshot_image_" + m_image_spec,
+ l_rbd_mirror_snapshot_first,
+ l_rbd_mirror_snapshot_last);
+ plb.add_u64_counter(l_rbd_mirror_snapshot_replay_snapshots,
+ "snapshots", "Snapshots", "r", prio);
+ plb.add_time_avg(l_rbd_mirror_snapshot_replay_snapshots_time,
+ "snapshots_time", "Snapshots time", "rl", prio);
+ plb.add_u64_counter(l_rbd_mirror_snapshot_replay_bytes, "replay_bytes",
+ "Replayed data", "rb", prio, unit_t(UNIT_BYTES));
+ m_perf_counters = plb.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(m_perf_counters);
+}
+
+template <typename I>
+void Replayer<I>::unregister_perf_counters() {
+ dout(5) << dendl;
+ ceph_assert(ceph_mutex_is_locked_by_me(m_lock));
+
+ PerfCounters *perf_counters = nullptr;
+ std::swap(perf_counters, m_perf_counters);
+
+ if (perf_counters != nullptr) {
+ g_ceph_context->get_perfcounters_collection()->remove(perf_counters);
+ delete perf_counters;
+ }
+}
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::snapshot::Replayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.h b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.h
new file mode 100644
index 000000000..e3c4c2089
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/Replayer.h
@@ -0,0 +1,346 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_REPLAYER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_REPLAYER_H
+
+#include "tools/rbd_mirror/image_replayer/Replayer.h"
+#include "common/ceph_mutex.h"
+#include "common/AsyncOpTracker.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/mirror/snapshot/Types.h"
+#include "tools/rbd_mirror/image_replayer/TimeRollingMean.h"
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/rolling_mean.hpp>
+#include <string>
+#include <type_traits>
+
+namespace librbd {
+
+struct ImageCtx;
+namespace snapshot { template <typename I> class Replay; }
+
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct InstanceWatcher;
+class PoolMetaCache;
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+struct ReplayerListener;
+
+namespace snapshot {
+
+template <typename> class EventPreprocessor;
+template <typename> class ReplayStatusFormatter;
+template <typename> class StateBuilder;
+
+template <typename ImageCtxT>
+class Replayer : public image_replayer::Replayer {
+public:
+ static Replayer* create(
+ Threads<ImageCtxT>* threads,
+ InstanceWatcher<ImageCtxT>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ StateBuilder<ImageCtxT>* state_builder,
+ ReplayerListener* replayer_listener) {
+ return new Replayer(threads, instance_watcher, local_mirror_uuid,
+ pool_meta_cache, state_builder, replayer_listener);
+ }
+
+ Replayer(
+ Threads<ImageCtxT>* threads,
+ InstanceWatcher<ImageCtxT>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ StateBuilder<ImageCtxT>* state_builder,
+ ReplayerListener* replayer_listener);
+ ~Replayer();
+
+ void destroy() override {
+ delete this;
+ }
+
+ void init(Context* on_finish) override;
+ void shut_down(Context* on_finish) override;
+
+ void flush(Context* on_finish) override;
+
+ bool get_replay_status(std::string* description, Context* on_finish) override;
+
+ bool is_replaying() const override {
+ std::unique_lock locker{m_lock};
+ return (m_state == STATE_REPLAYING || m_state == STATE_IDLE);
+ }
+
+ bool is_resync_requested() const override {
+ std::unique_lock locker{m_lock};
+ return m_resync_requested;
+ }
+
+ int get_error_code() const override {
+ std::unique_lock locker(m_lock);
+ return m_error_code;
+ }
+
+ std::string get_error_description() const override {
+ std::unique_lock locker(m_lock);
+ return m_error_description;
+ }
+
+ std::string get_image_spec() const {
+ std::unique_lock locker(m_lock);
+ return m_image_spec;
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <init>
+ * |
+ * v
+ * REGISTER_LOCAL_UPDATE_WATCHER
+ * |
+ * v
+ * REGISTER_REMOTE_UPDATE_WATCHER
+ * |
+ * v
+ * LOAD_LOCAL_IMAGE_META <----------------------------\
+ * | |
+ * v (skip if not needed) |
+ * REFRESH_LOCAL_IMAGE |
+ * | |
+ * v (skip if not needed) |
+ * REFRESH_REMOTE_IMAGE |
+ * | |
+ * | (unused non-primary snapshot) |
+ * |\--------------> PRUNE_NON_PRIMARY_SNAPSHOT---/|
+ * | |
+ * | (interrupted sync) |
+ * |\--------------> GET_LOCAL_IMAGE_STATE ------\ |
+ * | | |
+ * | (new snapshot) | |
+ * |\--------------> COPY_SNAPSHOTS | |
+ * | | | |
+ * | v | |
+ * | GET_REMOTE_IMAGE_STATE | |
+ * | | | |
+ * | v | |
+ * | CREATE_NON_PRIMARY_SNAPSHOT | |
+ * | | | |
+ * | v (skip if not needed)| |
+ * | UPDATE_MIRROR_IMAGE_STATE | |
+ * | | | |
+ * | |/--------------------/ |
+ * | | |
+ * | v |
+ * | REQUEST_SYNC |
+ * | | |
+ * | v |
+ * | COPY_IMAGE |
+ * | | |
+ * | v |
+ * | APPLY_IMAGE_STATE |
+ * | | |
+ * | v |
+ * | UPDATE_NON_PRIMARY_SNAPSHOT |
+ * | | |
+ * | v |
+ * | NOTIFY_IMAGE_UPDATE |
+ * | | |
+ * | (interrupted unlink) v |
+ * |\--------------> UNLINK_PEER |
+ * | | |
+ * | v |
+ * | NOTIFY_LISTENER |
+ * | | |
+ * | \----------------------/|
+ * | |
+ * | (remote demoted) |
+ * \---------------> NOTIFY_LISTENER |
+ * | | |
+ * |/--------------------/ |
+ * | |
+ * | (update notification) |
+ * <idle> --------------------------------------------/
+ * |
+ * v
+ * <shut down>
+ * |
+ * v
+ * UNREGISTER_REMOTE_UPDATE_WATCHER
+ * |
+ * v
+ * UNREGISTER_LOCAL_UPDATE_WATCHER
+ * |
+ * v
+ * WAIT_FOR_IN_FLIGHT_OPS
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ enum State {
+ STATE_INIT,
+ STATE_REPLAYING,
+ STATE_IDLE,
+ STATE_COMPLETE
+ };
+
+ struct C_UpdateWatchCtx;
+ struct DeepCopyHandler;
+
+ Threads<ImageCtxT>* m_threads;
+ InstanceWatcher<ImageCtxT>* m_instance_watcher;
+ std::string m_local_mirror_uuid;
+ PoolMetaCache* m_pool_meta_cache;
+ StateBuilder<ImageCtxT>* m_state_builder;
+ ReplayerListener* m_replayer_listener;
+
+ mutable ceph::mutex m_lock;
+
+ State m_state = STATE_INIT;
+
+ std::string m_image_spec;
+ Context* m_on_init_shutdown = nullptr;
+
+ bool m_resync_requested = false;
+ int m_error_code = 0;
+ std::string m_error_description;
+
+ C_UpdateWatchCtx* m_update_watch_ctx = nullptr;
+ uint64_t m_local_update_watcher_handle = 0;
+ uint64_t m_remote_update_watcher_handle = 0;
+ bool m_image_updated = false;
+
+ AsyncOpTracker m_in_flight_op_tracker;
+
+ uint64_t m_local_snap_id_start = 0;
+ uint64_t m_local_snap_id_end = CEPH_NOSNAP;
+ cls::rbd::MirrorSnapshotNamespace m_local_mirror_snap_ns;
+ uint64_t m_local_object_count = 0;
+
+ std::string m_remote_mirror_peer_uuid;
+ uint64_t m_remote_snap_id_start = 0;
+ uint64_t m_remote_snap_id_end = CEPH_NOSNAP;
+ cls::rbd::MirrorSnapshotNamespace m_remote_mirror_snap_ns;
+
+ librbd::mirror::snapshot::ImageState m_image_state;
+ DeepCopyHandler* m_deep_copy_handler = nullptr;
+
+ TimeRollingMean m_bytes_per_second;
+
+ uint64_t m_snapshot_bytes = 0;
+ boost::accumulators::accumulator_set<
+ uint64_t, boost::accumulators::stats<
+ boost::accumulators::tag::rolling_mean>> m_bytes_per_snapshot{
+ boost::accumulators::tag::rolling_window::window_size = 2};
+ utime_t m_snapshot_replay_start;
+
+ uint32_t m_pending_snapshots = 0;
+
+ bool m_remote_image_updated = false;
+ bool m_updating_sync_point = false;
+ bool m_sync_in_progress = false;
+
+ PerfCounters *m_perf_counters = nullptr;
+
+ void load_local_image_meta();
+ void handle_load_local_image_meta(int r);
+
+ void refresh_local_image();
+ void handle_refresh_local_image(int r);
+
+ void refresh_remote_image();
+ void handle_refresh_remote_image(int r);
+
+ void scan_local_mirror_snapshots(std::unique_lock<ceph::mutex>* locker);
+ void scan_remote_mirror_snapshots(std::unique_lock<ceph::mutex>* locker);
+
+ void prune_non_primary_snapshot(uint64_t snap_id);
+ void handle_prune_non_primary_snapshot(int r);
+
+ void copy_snapshots();
+ void handle_copy_snapshots(int r);
+
+ void get_remote_image_state();
+ void handle_get_remote_image_state(int r);
+
+ void get_local_image_state();
+ void handle_get_local_image_state(int r);
+
+ void create_non_primary_snapshot();
+ void handle_create_non_primary_snapshot(int r);
+
+ void update_mirror_image_state();
+ void handle_update_mirror_image_state(int r);
+
+ void request_sync();
+ void handle_request_sync(int r);
+
+ void copy_image();
+ void handle_copy_image(int r);
+ void handle_copy_image_progress(uint64_t object_number,
+ uint64_t object_count);
+ void handle_copy_image_read(uint64_t bytes_read);
+
+ void apply_image_state();
+ void handle_apply_image_state(int r);
+
+ void update_non_primary_snapshot(bool complete);
+ void handle_update_non_primary_snapshot(bool complete, int r);
+
+ void notify_image_update();
+ void handle_notify_image_update(int r);
+
+ void unlink_peer(uint64_t remote_snap_id);
+ void handle_unlink_peer(int r);
+
+ void finish_sync();
+
+ void register_local_update_watcher();
+ void handle_register_local_update_watcher(int r);
+
+ void register_remote_update_watcher();
+ void handle_register_remote_update_watcher(int r);
+
+ void unregister_remote_update_watcher();
+ void handle_unregister_remote_update_watcher(int r);
+
+ void unregister_local_update_watcher();
+ void handle_unregister_local_update_watcher(int r);
+
+ void wait_for_in_flight_ops();
+ void handle_wait_for_in_flight_ops(int r);
+
+ void handle_image_update_notify();
+
+ void handle_replay_complete(int r, const std::string& description);
+ void handle_replay_complete(std::unique_lock<ceph::mutex>* locker,
+ int r, const std::string& description);
+ void notify_status_updated();
+
+ bool is_replay_interrupted();
+ bool is_replay_interrupted(std::unique_lock<ceph::mutex>* lock);
+
+ void register_perf_counters();
+ void unregister_perf_counters();
+};
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::snapshot::Replayer<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_REPLAYER_H
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.cc b/src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.cc
new file mode 100644
index 000000000..ca3e6918b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.cc
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "StateBuilder.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/mirror/snapshot/ImageMeta.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.h"
+#include "tools/rbd_mirror/image_replayer/snapshot/Replayer.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::snapshot::" \
+ << "StateBuilder: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+template <typename I>
+StateBuilder<I>::StateBuilder(const std::string& global_image_id)
+ : image_replayer::StateBuilder<I>(global_image_id) {
+}
+
+template <typename I>
+StateBuilder<I>::~StateBuilder() {
+ ceph_assert(local_image_meta == nullptr);
+}
+
+template <typename I>
+void StateBuilder<I>::close(Context* on_finish) {
+ dout(10) << dendl;
+
+ delete local_image_meta;
+ local_image_meta = nullptr;
+
+ // close the remote image after closing the local
+ // image in case the remote cluster is unreachable and
+ // we cannot close it.
+ on_finish = new LambdaContext([this, on_finish](int) {
+ this->close_remote_image(on_finish);
+ });
+ this->close_local_image(on_finish);
+}
+
+template <typename I>
+bool StateBuilder<I>::is_disconnected() const {
+ return false;
+}
+
+template <typename I>
+bool StateBuilder<I>::is_linked_impl() const {
+ // the remote has to have us registered as a peer
+ return !remote_mirror_peer_uuid.empty();
+}
+
+template <typename I>
+cls::rbd::MirrorImageMode StateBuilder<I>::get_mirror_image_mode() const {
+ return cls::rbd::MIRROR_IMAGE_MODE_SNAPSHOT;
+}
+
+template <typename I>
+image_sync::SyncPointHandler* StateBuilder<I>::create_sync_point_handler() {
+ dout(10) << dendl;
+
+ // TODO
+ ceph_assert(false);
+ return nullptr;
+}
+
+template <typename I>
+BaseRequest* StateBuilder<I>::create_local_image_request(
+ Threads<I>* threads,
+ librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ Context* on_finish) {
+ return CreateLocalImageRequest<I>::create(
+ threads, local_io_ctx, this->remote_image_ctx, global_image_id,
+ pool_meta_cache, progress_ctx, this, on_finish);
+}
+
+template <typename I>
+BaseRequest* StateBuilder<I>::create_prepare_replay_request(
+ const std::string& local_mirror_uuid,
+ ProgressContext* progress_ctx,
+ bool* resync_requested,
+ bool* syncing,
+ Context* on_finish) {
+ return PrepareReplayRequest<I>::create(
+ local_mirror_uuid, progress_ctx, this, resync_requested, syncing,
+ on_finish);
+}
+
+template <typename I>
+image_replayer::Replayer* StateBuilder<I>::create_replayer(
+ Threads<I>* threads,
+ InstanceWatcher<I>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ ReplayerListener* replayer_listener) {
+ return Replayer<I>::create(
+ threads, instance_watcher, local_mirror_uuid, pool_meta_cache, this,
+ replayer_listener);
+}
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::snapshot::StateBuilder<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h b/src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h
new file mode 100644
index 000000000..a4ab82982
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_STATE_BUILDER_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_STATE_BUILDER_H
+
+#include "tools/rbd_mirror/image_replayer/StateBuilder.h"
+#include <string>
+
+struct Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace mirror {
+namespace snapshot {
+
+template <typename> class ImageMeta;
+
+} // namespace snapshot
+} // namespace mirror
+} // namespace librbd
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+
+template <typename> class SyncPointHandler;
+
+template <typename ImageCtxT>
+class StateBuilder : public image_replayer::StateBuilder<ImageCtxT> {
+public:
+ static StateBuilder* create(const std::string& global_image_id) {
+ return new StateBuilder(global_image_id);
+ }
+
+ StateBuilder(const std::string& global_image_id);
+ ~StateBuilder() override;
+
+ void close(Context* on_finish) override;
+
+ bool is_disconnected() const override;
+
+ cls::rbd::MirrorImageMode get_mirror_image_mode() const override;
+
+ image_sync::SyncPointHandler* create_sync_point_handler() override;
+
+ bool replay_requires_remote_image() const override {
+ return true;
+ }
+
+ BaseRequest* create_local_image_request(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ PoolMetaCache* pool_meta_cache,
+ ProgressContext* progress_ctx,
+ Context* on_finish) override;
+
+ BaseRequest* create_prepare_replay_request(
+ const std::string& local_mirror_uuid,
+ ProgressContext* progress_ctx,
+ bool* resync_requested,
+ bool* syncing,
+ Context* on_finish) override;
+
+ image_replayer::Replayer* create_replayer(
+ Threads<ImageCtxT>* threads,
+ InstanceWatcher<ImageCtxT>* instance_watcher,
+ const std::string& local_mirror_uuid,
+ PoolMetaCache* pool_meta_cache,
+ ReplayerListener* replayer_listener) override;
+
+ SyncPointHandler<ImageCtxT>* sync_point_handler = nullptr;
+
+ std::string remote_mirror_peer_uuid;
+
+ librbd::mirror::snapshot::ImageMeta<ImageCtxT>* local_image_meta = nullptr;
+
+private:
+ bool is_linked_impl() const override;
+};
+
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::snapshot::StateBuilder<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_STATE_BUILDER_H
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/Utils.cc b/src/tools/rbd_mirror/image_replayer/snapshot/Utils.cc
new file mode 100644
index 000000000..7c20410cb
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/Utils.cc
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Utils.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::snapshot::util::" \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+namespace util {
+
+uint64_t compute_remote_snap_id(
+ const ceph::shared_mutex& local_image_lock,
+ const std::map<librados::snap_t, librbd::SnapInfo>& local_snap_infos,
+ uint64_t local_snap_id, const std::string& remote_mirror_uuid) {
+ ceph_assert(ceph_mutex_is_locked(local_image_lock));
+
+ // Search our local non-primary snapshots for a mapping to the remote
+ // snapshot. The non-primary mirror snapshot with the mappings will always
+ // come at or after the snapshot we are searching against
+ for (auto snap_it = local_snap_infos.lower_bound(local_snap_id);
+ snap_it != local_snap_infos.end(); ++snap_it) {
+ auto mirror_ns = boost::get<cls::rbd::MirrorSnapshotNamespace>(
+ &snap_it->second.snap_namespace);
+ if (mirror_ns == nullptr || !mirror_ns->is_non_primary()) {
+ continue;
+ }
+
+ if (mirror_ns->primary_mirror_uuid != remote_mirror_uuid) {
+ dout(20) << "local snapshot " << snap_it->first << " not tied to remote"
+ << dendl;
+ continue;
+ } else if (local_snap_id == snap_it->first) {
+ dout(15) << "local snapshot " << local_snap_id << " maps to "
+ << "remote snapshot " << mirror_ns->primary_snap_id << dendl;
+ return mirror_ns->primary_snap_id;
+ }
+
+ const auto& snap_seqs = mirror_ns->snap_seqs;
+ for (auto [remote_snap_id_seq, local_snap_id_seq] : snap_seqs) {
+ if (local_snap_id_seq == local_snap_id) {
+ dout(15) << "local snapshot " << local_snap_id << " maps to "
+ << "remote snapshot " << remote_snap_id_seq << dendl;
+ return remote_snap_id_seq;
+ }
+ }
+ }
+
+ return CEPH_NOSNAP;
+}
+
+} // namespace util
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_replayer/snapshot/Utils.h b/src/tools/rbd_mirror/image_replayer/snapshot/Utils.h
new file mode 100644
index 000000000..8efc58685
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/snapshot/Utils.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_UTILS_H
+#define RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_UTILS_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "librbd/Types.h"
+#include <map>
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace snapshot {
+namespace util {
+
+uint64_t compute_remote_snap_id(
+ const ceph::shared_mutex& local_image_lock,
+ const std::map<librados::snap_t, librbd::SnapInfo>& local_snap_infos,
+ uint64_t local_snap_id, const std::string& remote_mirror_uuid);
+
+} // namespace util
+} // namespace snapshot
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_SNAPSHOT_UTILS_H
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
new file mode 100644
index 000000000..1bd5d77f0
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPointCreateRequest.h"
+#include "include/uuid.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/image_sync/Types.h"
+#include "tools/rbd_mirror/image_sync/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_sync::SyncPointCreateRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+SyncPointCreateRequest<I>::SyncPointCreateRequest(
+ I *remote_image_ctx,
+ const std::string &local_mirror_uuid,
+ SyncPointHandler* sync_point_handler,
+ Context *on_finish)
+ : m_remote_image_ctx(remote_image_ctx),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_sync_point_handler(sync_point_handler),
+ m_on_finish(on_finish) {
+ m_sync_points_copy = m_sync_point_handler->get_sync_points();
+ ceph_assert(m_sync_points_copy.size() < 2);
+
+ // initialize the updated client meta with the new sync point
+ m_sync_points_copy.emplace_back();
+ if (m_sync_points_copy.size() > 1) {
+ m_sync_points_copy.back().from_snap_name =
+ m_sync_points_copy.front().snap_name;
+ }
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send() {
+ send_update_sync_points();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_update_sync_points() {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+
+ auto& sync_point = m_sync_points_copy.back();
+ sync_point.snap_name = util::get_snapshot_name_prefix(m_local_mirror_uuid) +
+ uuid_gen.to_string();
+
+ auto ctx = create_context_callback<
+ SyncPointCreateRequest<I>,
+ &SyncPointCreateRequest<I>::handle_update_sync_points>(this);
+ m_sync_point_handler->update_sync_points(
+ m_sync_point_handler->get_snap_seqs(), m_sync_points_copy, false, ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_update_sync_points(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_refresh_image();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_refresh_image>(
+ this);
+ m_remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": remote image refresh failed: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_create_snap();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_create_snap() {
+ dout(20) << dendl;
+
+ auto& sync_point = m_sync_points_copy.back();
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_create_snap>(
+ this);
+ m_remote_image_ctx->operations->snap_create(
+ cls::rbd::UserSnapshotNamespace(), sync_point.snap_name.c_str(),
+ librbd::SNAP_CREATE_FLAG_SKIP_NOTIFY_QUIESCE, m_prog_ctx, ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_create_snap(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r == -EEXIST) {
+ send_update_sync_points();
+ return;
+ } else if (r < 0) {
+ derr << ": failed to create snapshot: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_final_refresh_image();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_final_refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>,
+ &SyncPointCreateRequest<I>::handle_final_refresh_image>(this);
+ m_remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_final_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to refresh image for snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_sync::SyncPointCreateRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
new file mode 100644
index 000000000..9b52b8374
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
+#define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
+
+#include "librbd/internal.h"
+#include "Types.h"
+#include <string>
+
+class Context;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SyncPointCreateRequest {
+public:
+ static SyncPointCreateRequest* create(
+ ImageCtxT *remote_image_ctx,
+ const std::string &local_mirror_uuid,
+ SyncPointHandler* sync_point_handler,
+ Context *on_finish) {
+ return new SyncPointCreateRequest(remote_image_ctx, local_mirror_uuid,
+ sync_point_handler, on_finish);
+ }
+
+ SyncPointCreateRequest(
+ ImageCtxT *remote_image_ctx,
+ const std::string &local_mirror_uuid,
+ SyncPointHandler* sync_point_handler,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UPDATE_SYNC_POINTS < . .
+ * | .
+ * v .
+ * REFRESH_IMAGE .
+ * | . (repeat on EEXIST)
+ * v .
+ * CREATE_SNAP . . . . . .
+ * |
+ * v
+ * REFRESH_IMAGE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_remote_image_ctx;
+ std::string m_local_mirror_uuid;
+ SyncPointHandler* m_sync_point_handler;
+ Context *m_on_finish;
+
+ SyncPoints m_sync_points_copy;
+ librbd::NoOpProgressContext m_prog_ctx;
+
+ void send_update_sync_points();
+ void handle_update_sync_points(int r);
+
+ void send_refresh_image();
+ void handle_refresh_image(int r);
+
+ void send_create_snap();
+ void handle_create_snap(int r);
+
+ void send_final_refresh_image();
+ void handle_final_refresh_image(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_sync::SyncPointCreateRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
new file mode 100644
index 000000000..d1cd32b39
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
@@ -0,0 +1,213 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPointPruneRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include <set>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_sync::SyncPointPruneRequest: " \
+ << this << " " << __func__
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+SyncPointPruneRequest<I>::SyncPointPruneRequest(
+ I *remote_image_ctx,
+ bool sync_complete,
+ SyncPointHandler* sync_point_handler,
+ Context *on_finish)
+ : m_remote_image_ctx(remote_image_ctx),
+ m_sync_complete(sync_complete),
+ m_sync_point_handler(sync_point_handler),
+ m_on_finish(on_finish) {
+ m_sync_points_copy = m_sync_point_handler->get_sync_points();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send() {
+ if (m_sync_points_copy.empty()) {
+ send_remove_snap();
+ return;
+ }
+
+ if (m_sync_complete) {
+ // if sync is complete, we can remove the master sync point
+ auto it = m_sync_points_copy.begin();
+ auto& sync_point = *it;
+
+ ++it;
+ if (it == m_sync_points_copy.end() ||
+ it->from_snap_name != sync_point.snap_name) {
+ m_snap_names.push_back(sync_point.snap_name);
+ }
+
+ if (!sync_point.from_snap_name.empty()) {
+ m_snap_names.push_back(sync_point.from_snap_name);
+ }
+ } else {
+ // if we have more than one sync point or invalid sync points,
+ // trim them off
+ std::shared_lock image_locker{m_remote_image_ctx->image_lock};
+ std::set<std::string> snap_names;
+ for (auto it = m_sync_points_copy.rbegin();
+ it != m_sync_points_copy.rend(); ++it) {
+ auto& sync_point = *it;
+ if (&sync_point == &m_sync_points_copy.front()) {
+ if (m_remote_image_ctx->get_snap_id(
+ cls::rbd::UserSnapshotNamespace(), sync_point.snap_name) ==
+ CEPH_NOSNAP) {
+ derr << ": failed to locate sync point snapshot: "
+ << sync_point.snap_name << dendl;
+ } else if (!sync_point.from_snap_name.empty()) {
+ derr << ": unexpected from_snap_name in primary sync point: "
+ << sync_point.from_snap_name << dendl;
+ } else {
+ // first sync point is OK -- keep it
+ break;
+ }
+ m_invalid_master_sync_point = true;
+ }
+
+ if (snap_names.count(sync_point.snap_name) == 0) {
+ snap_names.insert(sync_point.snap_name);
+ m_snap_names.push_back(sync_point.snap_name);
+ }
+
+ auto& front_sync_point = m_sync_points_copy.front();
+ if (!sync_point.from_snap_name.empty() &&
+ snap_names.count(sync_point.from_snap_name) == 0 &&
+ sync_point.from_snap_name != front_sync_point.snap_name) {
+ snap_names.insert(sync_point.from_snap_name);
+ m_snap_names.push_back(sync_point.from_snap_name);
+ }
+ }
+ }
+
+ send_remove_snap();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send_remove_snap() {
+ if (m_snap_names.empty()) {
+ send_refresh_image();
+ return;
+ }
+
+ const std::string &snap_name = m_snap_names.front();
+
+ dout(20) << ": snap_name=" << snap_name << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_remove_snap>(
+ this);
+ m_remote_image_ctx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(),
+ snap_name.c_str(),
+ ctx);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::handle_remove_snap(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ ceph_assert(!m_snap_names.empty());
+ std::string snap_name = m_snap_names.front();
+ m_snap_names.pop_front();
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ derr << ": failed to remove snapshot '" << snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_remove_snap();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send_refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_refresh_image>(
+ this);
+ m_remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::handle_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": remote image refresh failed: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_update_sync_points();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send_update_sync_points() {
+ dout(20) << dendl;
+
+ if (m_sync_complete) {
+ m_sync_points_copy.pop_front();
+ } else {
+ while (m_sync_points_copy.size() > 1) {
+ m_sync_points_copy.pop_back();
+ }
+ if (m_invalid_master_sync_point) {
+ // all subsequent sync points would have been pruned
+ m_sync_points_copy.clear();
+ }
+ }
+
+ auto ctx = create_context_callback<
+ SyncPointPruneRequest<I>,
+ &SyncPointPruneRequest<I>::handle_update_sync_points>(this);
+ m_sync_point_handler->update_sync_points(
+ m_sync_point_handler->get_snap_seqs(), m_sync_points_copy,
+ m_sync_complete, ctx);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::handle_update_sync_points(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_sync::SyncPointPruneRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
new file mode 100644
index 000000000..08bf840b1
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
+#define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
+
+#include "tools/rbd_mirror/image_sync/Types.h"
+#include <list>
+#include <string>
+
+class Context;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SyncPointPruneRequest {
+public:
+ static SyncPointPruneRequest* create(
+ ImageCtxT *remote_image_ctx,
+ bool sync_complete,
+ SyncPointHandler* sync_point_handler,
+ Context *on_finish) {
+ return new SyncPointPruneRequest(remote_image_ctx, sync_complete,
+ sync_point_handler, on_finish);
+ }
+
+ SyncPointPruneRequest(
+ ImageCtxT *remote_image_ctx,
+ bool sync_complete,
+ SyncPointHandler* sync_point_handler,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | . . . . .
+ * | . .
+ * v v . (repeat if from snap
+ * REMOVE_SNAP . . . unused by other sync)
+ * |
+ * v
+ * REFRESH_IMAGE
+ * |
+ * v
+ * UPDATE_CLIENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_remote_image_ctx;
+ bool m_sync_complete;
+ SyncPointHandler* m_sync_point_handler;
+ Context *m_on_finish;
+
+ SyncPoints m_sync_points_copy;
+ std::list<std::string> m_snap_names;
+
+ bool m_invalid_master_sync_point = false;
+
+ void send_remove_snap();
+ void handle_remove_snap(int r);
+
+ void send_refresh_image();
+ void handle_refresh_image(int r);
+
+ void send_update_sync_points();
+ void handle_update_sync_points(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_sync::SyncPointPruneRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_sync/Types.h b/src/tools/rbd_mirror/image_sync/Types.h
new file mode 100644
index 000000000..d748dc93e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/Types.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_TYPES_H
+#define RBD_MIRROR_IMAGE_SYNC_TYPES_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Types.h"
+#include <list>
+#include <string>
+#include <boost/optional.hpp>
+
+struct Context;
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+struct SyncPoint {
+ typedef boost::optional<uint64_t> ObjectNumber;
+
+ SyncPoint() {
+ }
+ SyncPoint(const cls::rbd::SnapshotNamespace& snap_namespace,
+ const std::string& snap_name,
+ const std::string& from_snap_name,
+ const ObjectNumber& object_number)
+ : snap_namespace(snap_namespace), snap_name(snap_name),
+ from_snap_name(from_snap_name), object_number(object_number) {
+ }
+
+ cls::rbd::SnapshotNamespace snap_namespace =
+ {cls::rbd::UserSnapshotNamespace{}};
+ std::string snap_name;
+ std::string from_snap_name;
+ ObjectNumber object_number = boost::none;
+
+ bool operator==(const SyncPoint& rhs) const {
+ return (snap_namespace == rhs.snap_namespace &&
+ snap_name == rhs.snap_name &&
+ from_snap_name == rhs.from_snap_name &&
+ object_number == rhs.object_number);
+ }
+};
+
+typedef std::list<SyncPoint> SyncPoints;
+
+struct SyncPointHandler {
+public:
+ SyncPointHandler(const SyncPointHandler&) = delete;
+ SyncPointHandler& operator=(const SyncPointHandler&) = delete;
+
+ virtual ~SyncPointHandler() {}
+ virtual void destroy() {
+ delete this;
+ }
+
+ virtual SyncPoints get_sync_points() const = 0;
+ virtual librbd::SnapSeqs get_snap_seqs() const = 0;
+
+ virtual void update_sync_points(const librbd::SnapSeqs& snap_seq,
+ const SyncPoints& sync_points,
+ bool sync_complete,
+ Context* on_finish) = 0;
+
+protected:
+ SyncPointHandler() {}
+};
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_SYNC_TYPES_H
diff --git a/src/tools/rbd_mirror/image_sync/Utils.cc b/src/tools/rbd_mirror/image_sync/Utils.cc
new file mode 100644
index 000000000..6a3eae72d
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/Utils.cc
@@ -0,0 +1,24 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Utils.h"
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+namespace util {
+
+namespace {
+
+static const std::string SNAP_NAME_PREFIX(".rbd-mirror");
+
+} // anonymous namespace
+
+std::string get_snapshot_name_prefix(const std::string& local_mirror_uuid) {
+ return SNAP_NAME_PREFIX + "." + local_mirror_uuid + ".";
+}
+
+} // namespace util
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_sync/Utils.h b/src/tools/rbd_mirror/image_sync/Utils.h
new file mode 100644
index 000000000..139699daa
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/Utils.h
@@ -0,0 +1,16 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string>
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+namespace util {
+
+std::string get_snapshot_name_prefix(const std::string& local_mirror_uuid);
+
+} // namespace util
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/instance_watcher/Types.cc b/src/tools/rbd_mirror/instance_watcher/Types.cc
new file mode 100644
index 000000000..0e9922733
--- /dev/null
+++ b/src/tools/rbd_mirror/instance_watcher/Types.cc
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+namespace rbd {
+namespace mirror {
+namespace instance_watcher {
+
+namespace {
+
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl);
+ payload.encode(m_bl);
+ }
+
+private:
+ bufferlist &m_bl;
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {}
+
+ template <typename Payload>
+ inline void operator()(Payload &payload) const {
+ payload.decode(m_version, m_iter);
+ }
+
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void PayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(request_id, bl);
+}
+
+void PayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(request_id, iter);
+}
+
+void PayloadBase::dump(Formatter *f) const {
+ f->dump_unsigned("request_id", request_id);
+}
+
+void ImagePayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ PayloadBase::encode(bl);
+ encode(global_image_id, bl);
+}
+
+void ImagePayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ PayloadBase::decode(version, iter);
+ decode(global_image_id, iter);
+}
+
+void ImagePayloadBase::dump(Formatter *f) const {
+ PayloadBase::dump(f);
+ f->dump_string("global_image_id", global_image_id);
+}
+
+void PeerImageRemovedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ PayloadBase::encode(bl);
+ encode(global_image_id, bl);
+ encode(peer_mirror_uuid, bl);
+}
+
+void PeerImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ PayloadBase::decode(version, iter);
+ decode(global_image_id, iter);
+ decode(peer_mirror_uuid, iter);
+}
+
+void PeerImageRemovedPayload::dump(Formatter *f) const {
+ PayloadBase::dump(f);
+ f->dump_string("global_image_id", global_image_id);
+ f->dump_string("peer_mirror_uuid", peer_mirror_uuid);
+}
+
+void SyncPayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ PayloadBase::encode(bl);
+ encode(sync_id, bl);
+}
+
+void SyncPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ PayloadBase::decode(version, iter);
+ decode(sync_id, iter);
+}
+
+void SyncPayloadBase::dump(Formatter *f) const {
+ PayloadBase::dump(f);
+ f->dump_string("sync_id", sync_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(2, 2, bl);
+ boost::apply_visitor(EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(2, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_IMAGE_ACQUIRE:
+ payload = ImageAcquirePayload();
+ break;
+ case NOTIFY_OP_IMAGE_RELEASE:
+ payload = ImageReleasePayload();
+ break;
+ case NOTIFY_OP_PEER_IMAGE_REMOVED:
+ payload = PeerImageRemovedPayload();
+ break;
+ case NOTIFY_OP_SYNC_REQUEST:
+ payload = SyncRequestPayload();
+ break;
+ case NOTIFY_OP_SYNC_START:
+ payload = SyncStartPayload();
+ break;
+ default:
+ payload = UnknownPayload();
+ break;
+ }
+
+ apply_visitor(DecodePayloadVisitor(struct_v, iter), payload);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(ImageAcquirePayload()));
+ o.push_back(new NotifyMessage(ImageAcquirePayload(1, "gid")));
+
+ o.push_back(new NotifyMessage(ImageReleasePayload()));
+ o.push_back(new NotifyMessage(ImageReleasePayload(1, "gid")));
+
+ o.push_back(new NotifyMessage(PeerImageRemovedPayload()));
+ o.push_back(new NotifyMessage(PeerImageRemovedPayload(1, "gid", "uuid")));
+
+ o.push_back(new NotifyMessage(SyncRequestPayload()));
+ o.push_back(new NotifyMessage(SyncRequestPayload(1, "sync_id")));
+
+ o.push_back(new NotifyMessage(SyncStartPayload()));
+ o.push_back(new NotifyMessage(SyncStartPayload(1, "sync_id")));
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+ switch (op) {
+ case NOTIFY_OP_IMAGE_ACQUIRE:
+ out << "ImageAcquire";
+ break;
+ case NOTIFY_OP_IMAGE_RELEASE:
+ out << "ImageRelease";
+ break;
+ case NOTIFY_OP_PEER_IMAGE_REMOVED:
+ out << "PeerImageRemoved";
+ break;
+ case NOTIFY_OP_SYNC_REQUEST:
+ out << "SyncRequest";
+ break;
+ case NOTIFY_OP_SYNC_START:
+ out << "SyncStart";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+void NotifyAckPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(instance_id, bl);
+ encode(request_id, bl);
+ encode(ret_val, bl);
+}
+
+void NotifyAckPayload::decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(instance_id, iter);
+ decode(request_id, iter);
+ decode(ret_val, iter);
+}
+
+void NotifyAckPayload::dump(Formatter *f) const {
+  f->dump_string("instance_id", instance_id);
+  f->dump_unsigned("request_id", request_id);
+  f->dump_int("ret_val", ret_val);  // was mis-keyed "request_id", duplicating the field above
+}
+
+} // namespace instance_watcher
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/instance_watcher/Types.h b/src/tools/rbd_mirror/instance_watcher/Types.h
new file mode 100644
index 000000000..b0b7b7791
--- /dev/null
+++ b/src/tools/rbd_mirror/instance_watcher/Types.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_INSTANCE_WATCHER_TYPES_H
+#define RBD_MIRROR_INSTANCE_WATCHER_TYPES_H
+
+#include <string>
+#include <set>
+#include <boost/variant.hpp>
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/int_types.h"
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace mirror {
+namespace instance_watcher {
+
+enum NotifyOp {
+ NOTIFY_OP_IMAGE_ACQUIRE = 0,
+ NOTIFY_OP_IMAGE_RELEASE = 1,
+ NOTIFY_OP_PEER_IMAGE_REMOVED = 2,
+ NOTIFY_OP_SYNC_REQUEST = 3,
+ NOTIFY_OP_SYNC_START = 4
+};
+
+struct PayloadBase {
+ uint64_t request_id;
+
+ PayloadBase() : request_id(0) {
+ }
+
+ PayloadBase(uint64_t request_id) : request_id(request_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImagePayloadBase : public PayloadBase {
+ std::string global_image_id;
+
+ ImagePayloadBase() : PayloadBase() {
+ }
+
+ ImagePayloadBase(uint64_t request_id, const std::string &global_image_id)
+ : PayloadBase(request_id), global_image_id(global_image_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImageAcquirePayload : public ImagePayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ACQUIRE;
+
+ ImageAcquirePayload() {
+ }
+ ImageAcquirePayload(uint64_t request_id, const std::string &global_image_id)
+ : ImagePayloadBase(request_id, global_image_id) {
+ }
+};
+
+struct ImageReleasePayload : public ImagePayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_RELEASE;
+
+ ImageReleasePayload() {
+ }
+ ImageReleasePayload(uint64_t request_id, const std::string &global_image_id)
+ : ImagePayloadBase(request_id, global_image_id) {
+ }
+};
+
+struct PeerImageRemovedPayload : public PayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_PEER_IMAGE_REMOVED;
+
+ std::string global_image_id;
+ std::string peer_mirror_uuid;
+
+ PeerImageRemovedPayload() {
+ }
+ PeerImageRemovedPayload(uint64_t request_id,
+ const std::string& global_image_id,
+ const std::string& peer_mirror_uuid)
+ : PayloadBase(request_id),
+ global_image_id(global_image_id), peer_mirror_uuid(peer_mirror_uuid) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SyncPayloadBase : public PayloadBase {
+ std::string sync_id;
+
+ SyncPayloadBase() : PayloadBase() {
+ }
+
+ SyncPayloadBase(uint64_t request_id, const std::string &sync_id)
+ : PayloadBase(request_id), sync_id(sync_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SyncRequestPayload : public SyncPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SYNC_REQUEST;
+
+ SyncRequestPayload() : SyncPayloadBase() {
+ }
+
+ SyncRequestPayload(uint64_t request_id, const std::string &sync_id)
+ : SyncPayloadBase(request_id, sync_id) {
+ }
+};
+
+struct SyncStartPayload : public SyncPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SYNC_START;
+
+ SyncStartPayload() : SyncPayloadBase() {
+ }
+
+ SyncStartPayload(uint64_t request_id, const std::string &sync_id)
+ : SyncPayloadBase(request_id, sync_id) {
+ }
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ImageAcquirePayload,
+ ImageReleasePayload,
+ PeerImageRemovedPayload,
+ SyncRequestPayload,
+ SyncStartPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+struct NotifyAckPayload {
+ std::string instance_id;
+ uint64_t request_id;
+ int ret_val;
+
+ NotifyAckPayload() : request_id(0), ret_val(0) {
+ }
+
+ NotifyAckPayload(const std::string &instance_id, uint64_t request_id,
+ int ret_val)
+ : instance_id(instance_id), request_id(request_id), ret_val(ret_val) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+WRITE_CLASS_ENCODER(NotifyAckPayload);
+
+} // namespace instance_watcher
+} // namespace mirror
+} // namespace rbd
+
+using rbd::mirror::instance_watcher::encode;
+using rbd::mirror::instance_watcher::decode;
+
+#endif // RBD_MIRROR_INSTANCE_WATCHER_TYPES_H
diff --git a/src/tools/rbd_mirror/instances/Types.h b/src/tools/rbd_mirror/instances/Types.h
new file mode 100644
index 000000000..8b0a68fc3
--- /dev/null
+++ b/src/tools/rbd_mirror/instances/Types.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_INSTANCES_TYPES_H
+#define CEPH_RBD_MIRROR_INSTANCES_TYPES_H
+
+#include <string>
+#include <vector>
+
+namespace rbd {
+namespace mirror {
+namespace instances {
+
+struct Listener {
+ typedef std::vector<std::string> InstanceIds;
+
+ virtual ~Listener() {
+ }
+
+ virtual void handle_added(const InstanceIds& instance_ids) = 0;
+ virtual void handle_removed(const InstanceIds& instance_ids) = 0;
+};
+
+} // namespace instances
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_INSTANCES_TYPES_H
diff --git a/src/tools/rbd_mirror/leader_watcher/Types.cc b/src/tools/rbd_mirror/leader_watcher/Types.cc
new file mode 100644
index 000000000..d2fb7908f
--- /dev/null
+++ b/src/tools/rbd_mirror/leader_watcher/Types.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+namespace rbd {
+namespace mirror {
+namespace leader_watcher {
+
+namespace {
+
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl);
+ payload.encode(m_bl);
+ }
+
+private:
+ bufferlist &m_bl;
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {}
+
+ template <typename Payload>
+ inline void operator()(Payload &payload) const {
+ payload.decode(m_version, m_iter);
+ }
+
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void HeartbeatPayload::encode(bufferlist &bl) const {
+}
+
+void HeartbeatPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void HeartbeatPayload::dump(Formatter *f) const {
+}
+
+void LockAcquiredPayload::encode(bufferlist &bl) const {
+}
+
+void LockAcquiredPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void LockAcquiredPayload::dump(Formatter *f) const {
+}
+
+void LockReleasedPayload::encode(bufferlist &bl) const {
+}
+
+void LockReleasedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void LockReleasedPayload::dump(Formatter *f) const {
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_HEARTBEAT:
+ payload = HeartbeatPayload();
+ break;
+ case NOTIFY_OP_LOCK_ACQUIRED:
+ payload = LockAcquiredPayload();
+ break;
+ case NOTIFY_OP_LOCK_RELEASED:
+ payload = LockReleasedPayload();
+ break;
+ default:
+ payload = UnknownPayload();
+ break;
+ }
+
+ apply_visitor(DecodePayloadVisitor(struct_v, iter), payload);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(HeartbeatPayload()));
+ o.push_back(new NotifyMessage(LockAcquiredPayload()));
+ o.push_back(new NotifyMessage(LockReleasedPayload()));
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+ switch (op) {
+ case NOTIFY_OP_HEARTBEAT:
+ out << "Heartbeat";
+ break;
+ case NOTIFY_OP_LOCK_ACQUIRED:
+ out << "LockAcquired";
+ break;
+ case NOTIFY_OP_LOCK_RELEASED:
+ out << "LockReleased";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+} // namespace leader_watcher
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/leader_watcher/Types.h b/src/tools/rbd_mirror/leader_watcher/Types.h
new file mode 100644
index 000000000..1278e54b7
--- /dev/null
+++ b/src/tools/rbd_mirror/leader_watcher/Types.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_LEADER_WATCHER_TYPES_H
+#define RBD_MIRROR_LEADER_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include <string>
+#include <vector>
+#include <boost/variant.hpp>
+
+struct Context;
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace mirror {
+namespace leader_watcher {
+
+struct Listener {
+ typedef std::vector<std::string> InstanceIds;
+
+ virtual ~Listener() {
+ }
+
+ virtual void post_acquire_handler(Context *on_finish) = 0;
+ virtual void pre_release_handler(Context *on_finish) = 0;
+
+ virtual void update_leader_handler(
+ const std::string &leader_instance_id) = 0;
+
+ virtual void handle_instances_added(const InstanceIds& instance_ids) = 0;
+ virtual void handle_instances_removed(const InstanceIds& instance_ids) = 0;
+};
+
+enum NotifyOp {
+ NOTIFY_OP_HEARTBEAT = 0,
+ NOTIFY_OP_LOCK_ACQUIRED = 1,
+ NOTIFY_OP_LOCK_RELEASED = 2,
+};
+
+struct HeartbeatPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_HEARTBEAT;
+
+ HeartbeatPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct LockAcquiredPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_LOCK_ACQUIRED;
+
+ LockAcquiredPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct LockReleasedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_LOCK_RELEASED;
+
+ LockReleasedPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<HeartbeatPayload,
+ LockAcquiredPayload,
+ LockReleasedPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace leader_watcher
+} // namespace mirror
+} // namespace rbd
+
+using rbd::mirror::leader_watcher::encode;
+using rbd::mirror::leader_watcher::decode;
+
+#endif // RBD_MIRROR_LEADER_WATCHER_TYPES_H
diff --git a/src/tools/rbd_mirror/main.cc b/src/tools/rbd_mirror/main.cc
new file mode 100644
index 000000000..74c97272e
--- /dev/null
+++ b/src/tools/rbd_mirror/main.cc
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "Mirror.h"
+#include "Types.h"
+
+#include <vector>
+
+rbd::mirror::Mirror *mirror = nullptr;
+PerfCounters *g_journal_perf_counters = nullptr;
+PerfCounters *g_snapshot_perf_counters = nullptr;
+
+void usage() {
+ std::cout << "usage: rbd-mirror [options...]" << std::endl;
+ std::cout << "options:\n";
+ std::cout << " -m monaddress[:port] connect to specified monitor\n";
+ std::cout << " --keyring=<path> path to keyring for local cluster\n";
+ std::cout << " --log-file=<logfile> file to log debug output\n";
+ std::cout << " --debug-rbd-mirror=<log-level>/<memory-level> set rbd-mirror debug level\n";
+ generic_server_usage();
+}
+
+static void handle_signal(int signum)
+{
+ if (mirror)
+ mirror->handle_signal(signum);
+}
+
+int main(int argc, const char **argv)
+{
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+			 CODE_ENVIRONMENT_DAEMON,
+			 CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+  if (g_conf()->daemonize) {
+    global_init_daemonize(g_ceph_context);
+  }
+
+  common_init_finish(g_ceph_context);
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGHUP, handle_signal);
+  register_async_signal_handler_oneshot(SIGINT, handle_signal);
+  register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+  std::vector<const char*> cmd_args;
+  argv_to_vec(argc, argv, cmd_args);
+
+  // disable unnecessary librbd cache
+  g_ceph_context->_conf.set_val_or_die("rbd_cache", "false");
+
+  auto prio =
+      g_ceph_context->_conf.get_val<int64_t>("rbd_mirror_perf_stats_prio");
+  {
+    PerfCountersBuilder plb(g_ceph_context, "rbd_mirror",
+                            rbd::mirror::l_rbd_mirror_journal_first,
+                            rbd::mirror::l_rbd_mirror_journal_last);
+    plb.add_u64_counter(rbd::mirror::l_rbd_mirror_replay, "replay", "Replays",
+                        "r", prio);
+    plb.add_u64_counter(rbd::mirror::l_rbd_mirror_replay_bytes, "replay_bytes",
+                        "Replayed data", "rb", prio, unit_t(UNIT_BYTES));
+    plb.add_time_avg(rbd::mirror::l_rbd_mirror_replay_latency, "replay_latency",
+                     "Replay latency", "rl", prio);
+    g_journal_perf_counters = plb.create_perf_counters();
+  }
+  {
+    PerfCountersBuilder plb(g_ceph_context, "rbd_mirror_snapshot",
+                            rbd::mirror::l_rbd_mirror_snapshot_first,
+                            rbd::mirror::l_rbd_mirror_snapshot_last);
+    plb.add_u64_counter(rbd::mirror::l_rbd_mirror_snapshot_replay_snapshots,
+                        "snapshots", "Snapshots", "r", prio);
+    plb.add_time_avg(rbd::mirror::l_rbd_mirror_snapshot_replay_snapshots_time,
+                     "snapshots_time", "Snapshots time", "rl", prio);
+    plb.add_u64_counter(rbd::mirror::l_rbd_mirror_snapshot_replay_bytes,
+                        "replay_bytes", "Replayed data", "rb", prio,
+                        unit_t(UNIT_BYTES));
+    g_snapshot_perf_counters = plb.create_perf_counters();
+  }
+  g_ceph_context->get_perfcounters_collection()->add(g_journal_perf_counters);
+  g_ceph_context->get_perfcounters_collection()->add(g_snapshot_perf_counters);
+
+  mirror = new rbd::mirror::Mirror(g_ceph_context, cmd_args);
+  int r = mirror->init();
+  if (r < 0) {
+    std::cerr << "failed to initialize: " << cpp_strerror(r) << std::endl;
+    goto cleanup;
+  }
+
+  mirror->run();
+
+ cleanup:
+  unregister_async_signal_handler(SIGHUP, handle_signal);
+  unregister_async_signal_handler(SIGINT, handle_signal);
+  unregister_async_signal_handler(SIGTERM, handle_signal);
+  shutdown_async_signal_handler();
+
+  g_ceph_context->get_perfcounters_collection()->remove(g_journal_perf_counters);
+  g_ceph_context->get_perfcounters_collection()->remove(g_snapshot_perf_counters);
+
+  delete mirror;
+  delete g_journal_perf_counters;
+  delete g_snapshot_perf_counters;
+
+  // exit status was inverted: a negative init result must map to failure
+  return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc
new file mode 100644
index 000000000..a1d9c1b54
--- /dev/null
+++ b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+#include <map>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::pool_watcher::RefreshImagesRequest " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace pool_watcher {
+
+static const uint32_t MAX_RETURN = 1024;
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void RefreshImagesRequest<I>::send() {
+ m_image_ids->clear();
+ mirror_image_list();
+}
+
+template <typename I>
+void RefreshImagesRequest<I>::mirror_image_list() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_list_start(&op, m_start_after, MAX_RETURN);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ RefreshImagesRequest<I>,
+ &RefreshImagesRequest<I>::handle_mirror_image_list>(this);
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RefreshImagesRequest<I>::handle_mirror_image_list(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::map<std::string, std::string> ids;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_list_finish(&it, &ids);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to list mirrored images: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ // store as global -> local image ids
+ for (auto &id : ids) {
+ m_image_ids->emplace(id.second, id.first);
+ }
+
+ if (ids.size() == MAX_RETURN) {
+ m_start_after = ids.rbegin()->first;
+ mirror_image_list();
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void RefreshImagesRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace pool_watcher
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::pool_watcher::RefreshImagesRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h
new file mode 100644
index 000000000..8bfeabe29
--- /dev/null
+++ b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H
+#define CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+struct Context;
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace pool_watcher {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class RefreshImagesRequest {
+public:
+ static RefreshImagesRequest *create(librados::IoCtx &remote_io_ctx,
+ ImageIds *image_ids, Context *on_finish) {
+ return new RefreshImagesRequest(remote_io_ctx, image_ids, on_finish);
+ }
+
+ RefreshImagesRequest(librados::IoCtx &remote_io_ctx, ImageIds *image_ids,
+ Context *on_finish)
+ : m_remote_io_ctx(remote_io_ctx), m_image_ids(image_ids),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | /-------------\
+ * | | |
+ * v v | (more images)
+ * MIRROR_IMAGE_LIST ---/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_remote_io_ctx;
+ ImageIds *m_image_ids;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ std::string m_start_after;
+
+ void mirror_image_list();
+ void handle_mirror_image_list(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace pool_watcher
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::pool_watcher::RefreshImagesRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H
diff --git a/src/tools/rbd_mirror/pool_watcher/Types.h b/src/tools/rbd_mirror/pool_watcher/Types.h
new file mode 100644
index 000000000..52dfc342d
--- /dev/null
+++ b/src/tools/rbd_mirror/pool_watcher/Types.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H
+#define CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H
+
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+namespace rbd {
+namespace mirror {
+namespace pool_watcher {
+
+struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) = 0;
+};
+
+} // namespace pool_watcher
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H
diff --git a/src/tools/rbd_mirror/service_daemon/Types.cc b/src/tools/rbd_mirror/service_daemon/Types.cc
new file mode 100644
index 000000000..7dc6537c5
--- /dev/null
+++ b/src/tools/rbd_mirror/service_daemon/Types.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <iostream>
+
+namespace rbd {
+namespace mirror {
+namespace service_daemon {
+
+std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level) {
+ switch (callout_level) {
+ case CALLOUT_LEVEL_INFO:
+ os << "info";
+ break;
+ case CALLOUT_LEVEL_WARNING:
+ os << "warning";
+ break;
+ case CALLOUT_LEVEL_ERROR:
+ os << "error";
+ break;
+ }
+ return os;
+}
+
+} // namespace service_daemon
+} // namespace mirror
+} // namespace rbd
+
diff --git a/src/tools/rbd_mirror/service_daemon/Types.h b/src/tools/rbd_mirror/service_daemon/Types.h
new file mode 100644
index 000000000..3aab72016
--- /dev/null
+++ b/src/tools/rbd_mirror/service_daemon/Types.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
+#define CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
+
+#include "include/int_types.h"
+#include <iosfwd>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace rbd {
+namespace mirror {
+namespace service_daemon {
+
+typedef uint64_t CalloutId;
+const uint64_t CALLOUT_ID_NONE {0};
+
+enum CalloutLevel {
+ CALLOUT_LEVEL_INFO,
+ CALLOUT_LEVEL_WARNING,
+ CALLOUT_LEVEL_ERROR
+};
+
+std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level);
+
+typedef boost::variant<bool, uint64_t, std::string> AttributeValue;
+
+} // namespace service_daemon
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
diff --git a/src/tools/rbd_nbd/CMakeLists.txt b/src/tools/rbd_nbd/CMakeLists.txt
new file mode 100644
index 000000000..da758f514
--- /dev/null
+++ b/src/tools/rbd_nbd/CMakeLists.txt
@@ -0,0 +1,4 @@
+find_package(nl REQUIRED genl)
+add_executable(rbd-nbd rbd-nbd.cc)
+target_link_libraries(rbd-nbd librbd librados global nl::genl)
+install(TARGETS rbd-nbd DESTINATION bin)
diff --git a/src/tools/rbd_nbd/nbd-netlink.h b/src/tools/rbd_nbd/nbd-netlink.h
new file mode 100644
index 000000000..2d0b90964
--- /dev/null
+++ b/src/tools/rbd_nbd/nbd-netlink.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2017 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef _UAPILINUX_NBD_NETLINK_H
+#define _UAPILINUX_NBD_NETLINK_H
+
+#define NBD_GENL_FAMILY_NAME "nbd"
+#define NBD_GENL_VERSION 0x1
+#define NBD_GENL_MCAST_GROUP_NAME "nbd_mc_group"
+
+/* Configuration policy attributes, used for CONNECT */
+enum {
+ NBD_ATTR_UNSPEC,
+ NBD_ATTR_INDEX,
+ NBD_ATTR_SIZE_BYTES,
+ NBD_ATTR_BLOCK_SIZE_BYTES,
+ NBD_ATTR_TIMEOUT,
+ NBD_ATTR_SERVER_FLAGS,
+ NBD_ATTR_CLIENT_FLAGS,
+ NBD_ATTR_SOCKETS,
+ NBD_ATTR_DEAD_CONN_TIMEOUT,
+ NBD_ATTR_DEVICE_LIST,
+ NBD_ATTR_BACKEND_IDENTIFIER,
+ __NBD_ATTR_MAX,
+};
+#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)
+
+/*
+ * This is the format for multiple devices with NBD_ATTR_DEVICE_LIST
+ *
+ * [NBD_ATTR_DEVICE_LIST]
+ * [NBD_DEVICE_ITEM]
+ * [NBD_DEVICE_INDEX]
+ * [NBD_DEVICE_CONNECTED]
+ */
+enum {
+ NBD_DEVICE_ITEM_UNSPEC,
+ NBD_DEVICE_ITEM,
+ __NBD_DEVICE_ITEM_MAX,
+};
+#define NBD_DEVICE_ITEM_MAX (__NBD_DEVICE_ITEM_MAX - 1)
+
+enum {
+ NBD_DEVICE_UNSPEC,
+ NBD_DEVICE_INDEX,
+ NBD_DEVICE_CONNECTED,
+ __NBD_DEVICE_MAX,
+};
+#define NBD_DEVICE_ATTR_MAX (__NBD_DEVICE_MAX - 1)
+
+/*
+ * This is the format for multiple sockets with NBD_ATTR_SOCKETS
+ *
+ * [NBD_ATTR_SOCKETS]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ */
+enum {
+ NBD_SOCK_ITEM_UNSPEC,
+ NBD_SOCK_ITEM,
+ __NBD_SOCK_ITEM_MAX,
+};
+#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1)
+
+enum {
+ NBD_SOCK_UNSPEC,
+ NBD_SOCK_FD,
+ __NBD_SOCK_MAX,
+};
+#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1)
+
+enum {
+ NBD_CMD_UNSPEC,
+ NBD_CMD_CONNECT,
+ NBD_CMD_DISCONNECT,
+ NBD_CMD_RECONFIGURE,
+ NBD_CMD_LINK_DEAD,
+ NBD_CMD_STATUS,
+ __NBD_CMD_MAX,
+};
+#define NBD_CMD_MAX (__NBD_CMD_MAX - 1)
+
+#endif /* _UAPILINUX_NBD_NETLINK_H */
diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc
new file mode 100644
index 000000000..eb9e858f4
--- /dev/null
+++ b/src/tools/rbd_nbd/rbd-nbd.cc
@@ -0,0 +1,2304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * rbd-nbd - RBD in userspace
+ *
+ * Copyright (C) 2015 - 2016 Kylin Corporation
+ *
+ * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com>
+ * Li Wang <li.wang@kylin-cloud.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+*/
+
+#include "acconfig.h"
+#include "include/int_types.h"
+#include "include/scope_guard.h"
+
+#include <libgen.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/nbd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+
+#include "nbd-netlink.h"
+#include <libnl3/netlink/genl/genl.h>
+#include <libnl3/netlink/genl/ctrl.h>
+#include <libnl3/netlink/genl/mngt.h>
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#endif
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <regex>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "common/Formatter.h"
+#include "common/Preforker.h"
+#include "common/SubProcess.h"
+#include "common/TextTable.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/event_socket.h"
+#include "common/module.h"
+#include "common/safe_io.h"
+#include "common/version.h"
+
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "include/stringify.h"
+#include "include/xlist.h"
+
+#include "mon/MonClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd-nbd: "
+
+enum Command {
+ None,
+ Map,
+ Unmap,
+ Attach,
+ Detach,
+ List
+};
+
+struct Config {
+ int nbds_max = 0;
+ int max_part = 255;
+ int io_timeout = -1;
+ int reattach_timeout = 30;
+
+ bool exclusive = false;
+ bool quiesce = false;
+ bool readonly = false;
+ bool set_max_part = false;
+ bool try_netlink = false;
+ bool show_cookie = false;
+
+ std::string poolname;
+ std::string nsname;
+ std::string imgname;
+ std::string snapname;
+ std::string devpath;
+ std::string quiesce_hook = CMAKE_INSTALL_LIBEXECDIR "/rbd-nbd/rbd-nbd_quiesce";
+
+ std::string format;
+ bool pretty_format = false;
+
+ std::optional<librbd::encryption_format_t> encryption_format;
+ std::optional<std::string> encryption_passphrase_file;
+
+ Command command = None;
+ int pid = 0;
+ std::string cookie;
+
+ std::string image_spec() const {
+ std::string spec = poolname + "/";
+
+ if (!nsname.empty()) {
+ spec += "/" + nsname;
+ }
+ spec += imgname;
+
+ if (!snapname.empty()) {
+ spec += "@" + snapname;
+ }
+
+ return spec;
+ }
+};
+
+static void usage()
+{
+ std::cout << "Usage: rbd-nbd [options] map <image-or-snap-spec> Map image to nbd device\n"
+ << " detach <device|image-or-snap-spec> Detach image from nbd device\n"
+ << " [options] attach <image-or-snap-spec> Attach image to nbd device\n"
+ << " unmap <device|image-or-snap-spec> Unmap nbd device\n"
+ << " [options] list-mapped List mapped nbd devices\n"
+ << "Map and attach options:\n"
+ << " --device <device path> Specify nbd device path (/dev/nbd{num})\n"
+ << " --encryption-format Image encryption format\n"
+ << " (possible values: luks1, luks2)\n"
+ << " --encryption-passphrase-file Path of file containing passphrase for unlocking image encryption\n"
+ << " --exclusive Forbid writes by other clients\n"
+ << " --io-timeout <sec> Set nbd IO timeout\n"
+ << " --max_part <limit> Override for module param max_part\n"
+ << " --nbds_max <limit> Override for module param nbds_max\n"
+ << " --quiesce Use quiesce callbacks\n"
+ << " --quiesce-hook <path> Specify quiesce hook path\n"
+ << " (default: " << Config().quiesce_hook << ")\n"
+ << " --read-only Map read-only\n"
+ << " --reattach-timeout <sec> Set nbd re-attach timeout\n"
+ << " (default: " << Config().reattach_timeout << ")\n"
+ << " --try-netlink Use the nbd netlink interface\n"
+ << " --show-cookie Show device cookie\n"
+ << " --cookie Specify device cookie\n"
+ << "\n"
+ << "List options:\n"
+ << " --format plain|json|xml Output format (default: plain)\n"
+ << " --pretty-format Pretty formatting (json and xml)\n"
+ << std::endl;
+ generic_server_usage();
+}
+
+static int nbd = -1;
+static int nbd_index = -1;
+static EventSocket terminate_event_sock;
+
+#define RBD_NBD_BLKSIZE 512UL
+
+#define HELP_INFO 1
+#define VERSION_INFO 2
+
+#ifdef CEPH_BIG_ENDIAN
+#define ntohll(a) (a)
+#elif defined(CEPH_LITTLE_ENDIAN)
+#define ntohll(a) swab(a)
+#else
+#error "Could not determine endianess"
+#endif
+#define htonll(a) ntohll(a)
+
+static int parse_args(vector<const char*>& args, std::ostream *err_msg,
+ Config *cfg);
+static int netlink_disconnect(int index);
+static int netlink_resize(int nbd_index, uint64_t size);
+
+static int run_quiesce_hook(const std::string &quiesce_hook,
+ const std::string &devpath,
+ const std::string &command);
+
+static std::string get_cookie(const std::string &devpath);
+
+class NBDServer
+{
+public:
+ uint64_t quiesce_watch_handle = 0;
+
+private:
+ int fd;
+ librbd::Image &image;
+ Config *cfg;
+
+public:
+ NBDServer(int fd, librbd::Image& image, Config *cfg)
+ : fd(fd)
+ , image(image)
+ , cfg(cfg)
+ , reader_thread(*this, &NBDServer::reader_entry)
+ , writer_thread(*this, &NBDServer::writer_entry)
+ , quiesce_thread(*this, &NBDServer::quiesce_entry)
+ {
+ std::vector<librbd::config_option_t> options;
+ image.config_list(&options);
+ for (auto &option : options) {
+ if ((option.name == std::string("rbd_cache") ||
+ option.name == std::string("rbd_cache_writethrough_until_flush")) &&
+ option.value == "false") {
+ allow_internal_flush = true;
+ break;
+ }
+ }
+ }
+
+ Config *get_cfg() const {
+ return cfg;
+ }
+
+private:
+ int terminate_event_fd = -1;
+ ceph::mutex disconnect_lock =
+ ceph::make_mutex("NBDServer::DisconnectLocker");
+ ceph::condition_variable disconnect_cond;
+ std::atomic<bool> terminated = { false };
+ std::atomic<bool> allow_internal_flush = { false };
+
+ struct IOContext
+ {
+ xlist<IOContext*>::item item;
+ NBDServer *server = nullptr;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ bufferlist data;
+ int command = 0;
+
+ IOContext()
+ : item(this)
+ {}
+ };
+
+ friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx);
+
+ ceph::mutex lock = ceph::make_mutex("NBDServer::Locker");
+ ceph::condition_variable cond;
+ xlist<IOContext*> io_pending;
+ xlist<IOContext*> io_finished;
+
+ void io_start(IOContext *ctx)
+ {
+ std::lock_guard l{lock};
+ io_pending.push_back(&ctx->item);
+ }
+
+ void io_finish(IOContext *ctx)
+ {
+ std::lock_guard l{lock};
+ ceph_assert(ctx->item.is_on_list());
+ ctx->item.remove_myself();
+ io_finished.push_back(&ctx->item);
+ cond.notify_all();
+ }
+
+ IOContext *wait_io_finish()
+ {
+ std::unique_lock l{lock};
+ cond.wait(l, [this] {
+ return !io_finished.empty() ||
+ (io_pending.empty() && terminated);
+ });
+
+ if (io_finished.empty())
+ return NULL;
+
+ IOContext *ret = io_finished.front();
+ io_finished.pop_front();
+
+ return ret;
+ }
+
+ void wait_clean()
+ {
+ std::unique_lock l{lock};
+ cond.wait(l, [this] { return io_pending.empty(); });
+
+ while(!io_finished.empty()) {
+ std::unique_ptr<IOContext> free_ctx(io_finished.front());
+ io_finished.pop_front();
+ }
+ }
+
+ void assert_clean()
+ {
+ std::unique_lock l{lock};
+
+ ceph_assert(!reader_thread.is_started());
+ ceph_assert(!writer_thread.is_started());
+ ceph_assert(io_pending.empty());
+ ceph_assert(io_finished.empty());
+ }
+
+ static void aio_callback(librbd::completion_t cb, void *arg)
+ {
+ librbd::RBD::AioCompletion *aio_completion =
+ reinterpret_cast<librbd::RBD::AioCompletion*>(cb);
+
+ IOContext *ctx = reinterpret_cast<IOContext *>(arg);
+ int ret = aio_completion->get_return_value();
+
+ dout(20) << __func__ << ": " << *ctx << dendl;
+
+ if (ret == -EINVAL) {
+ // if shrinking an image, a pagecache writeback might reference
+ // extents outside of the range of the new image extents
+ dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl;
+ ctx->data.clear();
+ ret = 0;
+ }
+
+ if (ret < 0) {
+ ctx->reply.error = htonl(-ret);
+ } else if ((ctx->command == NBD_CMD_READ) &&
+ ret < static_cast<int>(ctx->request.len)) {
+ int pad_byte_count = static_cast<int> (ctx->request.len) - ret;
+ ctx->data.append_zero(pad_byte_count);
+ dout(20) << __func__ << ": " << *ctx << ": Pad byte count: "
+ << pad_byte_count << dendl;
+ ctx->reply.error = htonl(0);
+ } else {
+ ctx->reply.error = htonl(0);
+ }
+ ctx->server->io_finish(ctx);
+
+ aio_completion->release();
+ }
+
  // Reader thread: pull nbd requests off the kernel socket, decode them
  // and dispatch the matching librbd AIO operation. Terminates on a
  // disconnect request, a terminate event, or a socket/protocol error.
  void reader_entry()
  {
    // poll both the nbd socket and the terminate eventfd so shutdown
    // can interrupt a blocking read
    struct pollfd poll_fds[2];
    memset(poll_fds, 0, sizeof(struct pollfd) * 2);
    poll_fds[0].fd = fd;
    poll_fds[0].events = POLLIN;
    poll_fds[1].fd = terminate_event_fd;
    poll_fds[1].events = POLLIN;

    while (true) {
      std::unique_ptr<IOContext> ctx(new IOContext());
      ctx->server = this;

      dout(20) << __func__ << ": waiting for nbd request" << dendl;

      int r = poll(poll_fds, 2, -1);
      if (r == -1) {
        if (errno == EINTR) {
          continue;
        }
        r = -errno;
        derr << "failed to poll nbd: " << cpp_strerror(r) << dendl;
        goto error;
      }

      if ((poll_fds[1].revents & POLLIN) != 0) {
        dout(0) << __func__ << ": terminate received" << dendl;
        goto signal;
      }

      if ((poll_fds[0].revents & POLLIN) == 0) {
        dout(20) << __func__ << ": nothing to read" << dendl;
        continue;
      }

      r = safe_read_exact(fd, &ctx->request, sizeof(struct nbd_request));
      if (r < 0) {
        derr << "failed to read nbd request header: " << cpp_strerror(r)
             << dendl;
        goto error;
      }

      if (ctx->request.magic != htonl(NBD_REQUEST_MAGIC)) {
        derr << "invalid nbd request header" << dendl;
        goto signal;
      }

      // request fields arrive in network byte order
      ctx->request.from = ntohll(ctx->request.from);
      ctx->request.type = ntohl(ctx->request.type);
      ctx->request.len = ntohl(ctx->request.len);

      ctx->reply.magic = htonl(NBD_REPLY_MAGIC);
      memcpy(ctx->reply.handle, ctx->request.handle, sizeof(ctx->reply.handle));

      // low 16 bits hold the command; upper bits carry flags
      ctx->command = ctx->request.type & 0x0000ffff;

      dout(20) << *ctx << ": start" << dendl;

      switch (ctx->command)
      {
        case NBD_CMD_DISC:
          // NBD_DO_IT will return when pipe is closed
          dout(0) << "disconnect request received" << dendl;
          goto signal;
        case NBD_CMD_WRITE:
          // writes carry a payload of request.len bytes right after
          // the header; read it before dispatching
          bufferptr ptr(ctx->request.len);
	  r = safe_read_exact(fd, ptr.c_str(), ctx->request.len);
          if (r < 0) {
            derr << *ctx << ": failed to read nbd request data: "
                 << cpp_strerror(r) << dendl;
            goto error;
          }
          ctx->data.push_back(ptr);
          break;
      }

      // hand ownership to the AIO completion; io_start tracks it as pending
      IOContext *pctx = ctx.release();
      io_start(pctx);
      librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(pctx, aio_callback);
      switch (pctx->command)
      {
        case NBD_CMD_WRITE:
          image.aio_write(pctx->request.from, pctx->request.len, pctx->data, c);
          break;
        case NBD_CMD_READ:
          image.aio_read(pctx->request.from, pctx->request.len, pctx->data, c);
          break;
        case NBD_CMD_FLUSH:
          image.aio_flush(c);
          allow_internal_flush = true;
          break;
        case NBD_CMD_TRIM:
          image.aio_discard(pctx->request.from, pctx->request.len, c);
          break;
        default:
	  derr << *pctx << ": invalid request command" << dendl;
          c->release();
          goto signal;
      }
    }
error:
    // on a hard error, force the kernel device down (netlink first,
    // ioctl as fallback) so the kernel side also unwinds
    {
      int r = netlink_disconnect(nbd_index);
      if (r == 1) {
        ioctl(nbd, NBD_DISCONNECT);
      }
    }
signal:
    std::lock_guard l{lock};
    terminated = true;
    cond.notify_all();

    std::lock_guard disconnect_l{disconnect_lock};
    disconnect_cond.notify_all();

    dout(20) << __func__ << ": terminated" << dendl;
  }
+
+ void writer_entry()
+ {
+ while (true) {
+ dout(20) << __func__ << ": waiting for io request" << dendl;
+ std::unique_ptr<IOContext> ctx(wait_io_finish());
+ if (!ctx) {
+ dout(20) << __func__ << ": no io requests, terminating" << dendl;
+ goto done;
+ }
+
+ dout(20) << __func__ << ": got: " << *ctx << dendl;
+
+ int r = safe_write(fd, &ctx->reply, sizeof(struct nbd_reply));
+ if (r < 0) {
+ derr << *ctx << ": failed to write reply header: " << cpp_strerror(r)
+ << dendl;
+ goto error;
+ }
+ if (ctx->command == NBD_CMD_READ && ctx->reply.error == htonl(0)) {
+ r = ctx->data.write_fd(fd);
+ if (r < 0) {
+ derr << *ctx << ": failed to write replay data: " << cpp_strerror(r)
+ << dendl;
+ goto error;
+ }
+ }
+ dout(20) << *ctx << ": finish" << dendl;
+ }
+ error:
+ wait_clean();
+ done:
+ ::shutdown(fd, SHUT_RDWR);
+
+ dout(20) << __func__ << ": terminated" << dendl;
+ }
+
  // Park the quiesce thread until a quiesce request arrives or the
  // server terminates. Returns false when woken by termination.
  bool wait_quiesce() {
    dout(20) << __func__ << dendl;

    std::unique_lock locker{lock};
    cond.wait(locker, [this] { return quiesce || terminated; });

    if (terminated) {
      return false;
    }

    dout(20) << __func__ << ": got quiesce request" << dendl;
    return true;
  }
+
  // Block (with the caller's lock) until the unquiesce notification
  // clears the flag, or the server terminates.
  void wait_unquiesce(std::unique_lock<ceph::mutex> &locker) {
    dout(20) << __func__ << dendl;

    cond.wait(locker, [this] { return !quiesce || terminated; });

    dout(20) << __func__ << ": got unquiesce request" << dendl;
  }
+
  // Flush the image as part of handling a quiesce request. Skipped
  // until the guest has issued at least one flush (allow_internal_flush)
  // and, when the exclusive-lock feature is on, unless we currently own
  // the lock — presumably to avoid an internal flush acquiring the lock
  // away from another client (NOTE(review): confirm intent).
  void wait_inflight_io() {
    if (!allow_internal_flush) {
        return;
    }

    uint64_t features = 0;
    image.features(&features);
    if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
      bool is_owner = false;
      image.is_exclusive_lock_owner(&is_owner);
      if (!is_owner) {
        return;
      }
    }

    dout(20) << __func__ << dendl;

    int r = image.flush();
    if (r < 0) {
      derr << "flush failed: " << cpp_strerror(r) << dendl;
    }
  }
+
  // Quiesce thread: for each quiesce notification, run the external
  // quiesce hook, flush in-flight IO, ack the request to librbd, then
  // wait for unquiesce and run the hook again in "unquiesce" mode.
  void quiesce_entry()
  {
    ceph_assert(cfg->quiesce);

    while (wait_quiesce()) {

      int r = run_quiesce_hook(cfg->quiesce_hook, cfg->devpath, "quiesce");

      wait_inflight_io();

      {
        std::unique_lock locker{lock};
        ceph_assert(quiesce == true);

        // report the hook's result; a failure aborts this quiesce cycle
        image.quiesce_complete(quiesce_watch_handle, r);

        if (r < 0) {
          // failed quiesce: clear the flag ourselves and wait for the
          // next request rather than waiting for an unquiesce
          quiesce = false;
          continue;
        }

        wait_unquiesce(locker);
      }

      // hook errors on unquiesce are logged inside run_quiesce_hook
      run_quiesce_hook(cfg->quiesce_hook, cfg->devpath, "unquiesce");
    }

    dout(20) << __func__ << ": terminated" << dendl;
  }
+
  // Small adapter binding a Thread to one of NBDServer's *_entry()
  // member functions.
  class ThreadHelper : public Thread
  {
  public:
    typedef void (NBDServer::*entry_func)();
  private:
    NBDServer &server;
    entry_func func;
  public:
    ThreadHelper(NBDServer &_server, entry_func _func)
      :server(_server)
      ,func(_func)
    {}
  protected:
    void* entry() override
    {
      (server.*func)();
      return NULL;
    }
  } reader_thread, writer_thread, quiesce_thread;

  // started: set once start() has launched the worker threads
  bool started = false;
  // quiesce: request flag handed between notify_quiesce()/quiesce_entry()
  bool quiesce = false;
+
+public:
  // Launch the worker threads (reader, writer and — when configured —
  // quiesce) and set up the eventfd used to interrupt the reader's
  // poll() on shutdown. Idempotent: a second call is a no-op.
  void start()
  {
    if (!started) {
      dout(10) << __func__ << ": starting" << dendl;

      started = true;

      terminate_event_fd = eventfd(0, EFD_NONBLOCK);
      // NOTE(review): '> 0' also rejects a legitimate fd 0; '>= 0'
      // would be the strict errno check — confirm intent
      ceph_assert(terminate_event_fd > 0);
      int r = terminate_event_sock.init(terminate_event_fd,
                                        EVENT_SOCKET_TYPE_EVENTFD);
      ceph_assert(r >= 0);

      reader_thread.create("rbd_reader");
      writer_thread.create("rbd_writer");
      if (cfg->quiesce) {
        quiesce_thread.create("rbd_quiesce");
      }
    }
  }
+
  // Block the mapping process until the reader thread signals
  // disconnect_cond (used instead of NBD_DO_IT on the netlink path).
  // NOTE(review): the wait has no predicate, so a spurious wakeup or a
  // notify that fires before we start waiting would be missed/early —
  // relies on the reader thread's notify-after-terminate ordering.
  void wait_for_disconnect()
  {
    if (!started)
      return;

    std::unique_lock l{disconnect_lock};
    disconnect_cond.wait(l);
  }
+
+ void notify_quiesce() {
+ dout(10) << __func__ << dendl;
+
+ ceph_assert(cfg->quiesce);
+
+ std::unique_lock locker{lock};
+ ceph_assert(quiesce == false);
+ quiesce = true;
+ cond.notify_all();
+ }
+
+ void notify_unquiesce() {
+ dout(10) << __func__ << dendl;
+
+ ceph_assert(cfg->quiesce);
+
+ std::unique_lock locker{lock};
+ ceph_assert(quiesce == true);
+ quiesce = false;
+ cond.notify_all();
+ }
+
  // Tear down: wake the reader via the terminate eventfd, join all
  // worker threads, verify the queues drained, and release the eventfd.
  ~NBDServer()
  {
    if (started) {
      dout(10) << __func__ << ": terminating" << dendl;

      terminate_event_sock.notify();

      reader_thread.join();
      writer_thread.join();
      if (cfg->quiesce) {
        quiesce_thread.join();
      }

      assert_clean();

      close(terminate_event_fd);
      started = false;
    }
  }
+};
+
+std::ostream &operator<<(std::ostream &os, const NBDServer::IOContext &ctx) {
+
+ os << "[" << std::hex << ntohll(*((uint64_t *)ctx.request.handle));
+
+ switch (ctx.command)
+ {
+ case NBD_CMD_WRITE:
+ os << " WRITE ";
+ break;
+ case NBD_CMD_READ:
+ os << " READ ";
+ break;
+ case NBD_CMD_FLUSH:
+ os << " FLUSH ";
+ break;
+ case NBD_CMD_TRIM:
+ os << " TRIM ";
+ break;
+ case NBD_CMD_DISC:
+ os << " DISC ";
+ break;
+ default:
+ os << " UNKNOWN(" << ctx.command << ") ";
+ break;
+ }
+
+ os << ctx.request.from << "~" << ctx.request.len << " "
+ << std::dec << ntohl(ctx.reply.error) << "]";
+
+ return os;
+}
+
// Bridges librbd quiesce watch callbacks to the NBDServer's
// notify_quiesce()/notify_unquiesce() entry points.
class NBDQuiesceWatchCtx : public librbd::QuiesceWatchCtx
{
public:
  NBDQuiesceWatchCtx(NBDServer *server) : server(server) {
  }

  void handle_quiesce() override {
    server->notify_quiesce();
  }

  void handle_unquiesce() override {
    server->notify_unquiesce();
  }

private:
  NBDServer *server;
};
+
// Reacts to rbd image update notifications: when the image size changed,
// invalidate the page cache, propagate the new size to the kernel nbd
// device (netlink or ioctl), rescan partitions and drop the rbd cache.
class NBDWatchCtx : public librbd::UpdateWatchCtx
{
private:
  int fd;               // open fd on the /dev/nbdX device
  int nbd_index;        // kernel nbd device index
  bool use_netlink;     // resize via netlink instead of ioctl
  librados::IoCtx &io_ctx;
  librbd::Image &image;
  unsigned long size;   // last size pushed to the kernel
public:
  NBDWatchCtx(int _fd,
              int _nbd_index,
              bool _use_netlink,
              librados::IoCtx &_io_ctx,
              librbd::Image &_image,
              unsigned long _size)
    : fd(_fd)
    , nbd_index(_nbd_index)
    , use_netlink(_use_netlink)
    , io_ctx(_io_ctx)
    , image(_image)
    , size(_size)
  { }

  ~NBDWatchCtx() override {}

  void handle_notify() override
  {
    librbd::image_info_t info;
    // stat failures are silently ignored; we just skip this notify
    if (image.stat(info, sizeof(info)) == 0) {
      unsigned long new_size = info.size;
      int ret;

      if (new_size != size) {
        dout(5) << "resize detected" << dendl;
        // flush/invalidate the page cache before changing the size
        if (ioctl(fd, BLKFLSBUF, NULL) < 0)
          derr << "invalidate page cache failed: " << cpp_strerror(errno)
               << dendl;
        if (use_netlink) {
          // netlink_resize logs its own error on failure
          ret = netlink_resize(nbd_index, new_size);
        } else {
          ret = ioctl(fd, NBD_SET_SIZE, new_size);
          if (ret < 0)
            derr << "resize failed: " << cpp_strerror(errno) << dendl;
        }

        // only remember the new size if the kernel accepted it, so a
        // later notify retries the resize
        if (!ret)
          size = new_size;

        if (ioctl(fd, BLKRRPART, NULL) < 0) {
          derr << "rescan of partition table failed: " << cpp_strerror(errno)
               << dendl;
        }
        if (image.invalidate_cache() < 0)
          derr << "invalidate rbd cache failed" << dendl;
      }
    }
  }
};
+
// Iterates over /sys/block/nbd* devices and reconstructs, for each one
// mapped by an rbd-nbd process, the Config it was started with (by
// re-parsing that process's /proc/<pid>/cmdline).
class NBDListIterator {
public:
  // Fill *cfg for the next mapped nbd device; returns false when there
  // are no more /sys/block/nbdN entries.
  bool get(Config *cfg) {
    while (true) {
      std::string nbd_path = "/sys/block/nbd" + stringify(m_index);
      if(access(nbd_path.c_str(), F_OK) != 0) {
        return false;
      }

      *cfg = Config();
      cfg->devpath = "/dev/nbd" + stringify(m_index++);

      int pid;
      std::ifstream ifs;
      ifs.open(nbd_path + "/pid", std::ifstream::in);
      if (!ifs.is_open()) {
        // no owning process recorded -> device not mapped; skip it
        continue;
      }
      ifs >> pid;
      ifs.close();

      // If the rbd-nbd is re-attached the pid may store garbage
      // here. We are sure this is the case when it is negative or
      // zero. Then we just try to find the attached process scanning
      // /proc fs. If it is positive we check the process with this
      // pid first and if it is not rbd-nbd fallback to searching the
      // attached process.
      do {
        if (pid <= 0) {
          pid = find_attached(cfg->devpath);
          if (pid <= 0) {
            break;
          }
        }

        if (get_mapped_info(pid, cfg) >= 0) {
          return true;
        }
        // recorded pid was wrong; force the /proc scan on the next pass
        pid = -1;
      } while (true);
    }
  }

private:
  int m_index = 0;                          // next nbd index to probe
  std::map<int, Config> m_mapped_info_cache; // per-pid parsed configs

  // Parse /proc/<pid> to verify the process is rbd-nbd and recover the
  // Config it was launched with; results are cached per pid. Returns 0
  // on success, negative on mismatch/parse failure.
  int get_mapped_info(int pid, Config *cfg) {
    ceph_assert(!cfg->devpath.empty());

    auto it = m_mapped_info_cache.find(pid);
    if (it != m_mapped_info_cache.end()) {
      if (it->second.devpath != cfg->devpath) {
        return -EINVAL;
      }
      *cfg = it->second;
      return 0;
    }

    // negative-cache placeholder; replaced below on success
    m_mapped_info_cache[pid] = {};

    int r;
    std::string path = "/proc/" + stringify(pid) + "/comm";
    std::ifstream ifs;
    std::string comm;
    ifs.open(path.c_str(), std::ifstream::in);
    if (!ifs.is_open())
      return -1;
    ifs >> comm;
    if (comm != "rbd-nbd") {
      return -EINVAL;
    }
    ifs.close();

    path = "/proc/" + stringify(pid) + "/cmdline";
    std::string cmdline;
    std::vector<const char*> args;

    ifs.open(path.c_str(), std::ifstream::in);
    if (!ifs.is_open())
      return -1;
    ifs >> cmdline;

    if (cmdline.empty()) {
      return -EINVAL;
    }

    // cmdline args are NUL-separated; split in place
    for (unsigned i = 0; i < cmdline.size(); i++) {
      char *arg = &cmdline[i];
      if (i == 0) {
        if (strcmp(basename(arg) , "rbd-nbd") != 0) {
          return -EINVAL;
        }
      } else {
        args.push_back(arg);
      }

      while (cmdline[i] != '\0') {
        i++;
      }
    }

    std::ostringstream err_msg;
    Config c;
    r = parse_args(args, &err_msg, &c);
    if (r < 0) {
      return r;
    }

    if (c.command != Map && c.command != Attach) {
      return -ENOENT;
    }

    c.pid = pid;
    m_mapped_info_cache.erase(pid);
    if (!c.devpath.empty()) {
      m_mapped_info_cache[pid] = c;
      if (c.devpath != cfg->devpath) {
        return -ENOENT;
      }
    } else {
      // device was auto-allocated at map time; adopt the one we probed
      c.devpath = cfg->devpath;
    }

    c.cookie = get_cookie(cfg->devpath);
    *cfg = c;
    return 0;
  }

  // Scan /proc for an rbd-nbd process attached to devpath; returns its
  // pid or -1 when none is found.
  int find_attached(const std::string &devpath) {
    for (auto &entry : fs::directory_iterator("/proc")) {
      if (!fs::is_directory(entry.status())) {
        continue;
      }

      int pid;
      try {
        pid = boost::lexical_cast<uint64_t>(entry.path().filename().c_str());
      } catch (boost::bad_lexical_cast&) {
        // not a numeric directory -> not a process entry
        continue;
      }

      Config cfg;
      cfg.devpath = devpath;
      if (get_mapped_info(pid, &cfg) >=0 && cfg.command == Attach) {
        return cfg.pid;
      }
    }

    return -1;
  }
};
+
// Read the cookie recorded for an nbd device from sysfs
// (/sys/block/nbdX/backend). Returns an empty string when the
// attribute is missing or unreadable.
static std::string get_cookie(const std::string &devpath)
{
  const std::string sysfs_path =
    "/sys/block/" + devpath.substr(sizeof("/dev/") - 1) + "/backend";

  std::string cookie;
  std::ifstream in;
  in.open(sysfs_path, std::ifstream::in);
  if (in.is_open()) {
    std::getline(in, cookie);
    in.close();
  }
  return cookie;
}
+
// Load the nbd kernel module, passing through any nbds_max/max_part
// options from the config. A no-op (with a warning if options were
// given) when the module is already loaded.
static int load_module(Config *cfg)
{
  ostringstream param;
  int ret;

  if (cfg->nbds_max)
    param << "nbds_max=" << cfg->nbds_max;

  // NOTE(review): when nbds_max is 0 the parameter string starts with a
  // leading space — presumably harmless to modprobe; confirm
  if (cfg->max_part)
    param << " max_part=" << cfg->max_part;

  if (!access("/sys/module/nbd", F_OK)) {
    if (cfg->nbds_max || cfg->set_max_part)
      cerr << "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded"
           << std::endl;
    return 0;
  }

  ret = module_load("nbd", param.str().c_str());
  if (ret < 0)
    cerr << "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-ret)
         << std::endl;

  return ret;
}
+
+static int check_device_size(int nbd_index, unsigned long expected_size)
+{
+ // There are bugs with some older kernel versions that result in an
+ // overflow for large image sizes. This check is to ensure we are
+ // not affected.
+
+ unsigned long size = 0;
+ std::string path = "/sys/block/nbd" + stringify(nbd_index) + "/size";
+ std::ifstream ifs;
+ ifs.open(path.c_str(), std::ifstream::in);
+ if (!ifs.is_open()) {
+ cerr << "rbd-nbd: failed to open " << path << std::endl;
+ return -EINVAL;
+ }
+ ifs >> size;
+ size *= RBD_NBD_BLKSIZE;
+
+ if (size == 0) {
+ // Newer kernel versions will report real size only after nbd
+ // connect. Assume this is the case and return success.
+ return 0;
+ }
+
+ if (size != expected_size) {
+ cerr << "rbd-nbd: kernel reported invalid device size (" << size
+ << ", expected " << expected_size << ")" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
// Extract the numeric index from an nbd device path ("/dev/nbd<num>").
// Returns the index on success, -EINVAL on a match failure, or sscanf's
// negative EOF result on an input failure.
static int parse_nbd_index(const std::string& devpath)
{
  int index;

  int matched = sscanf(devpath.c_str(), "/dev/nbd%d", &index);
  if (matched > 0) {
    return index;
  }

  // sscanf yields 0 on an early matching failure and a negative value
  // (EOF) on an input failure; normalize only the former to -EINVAL.
  cerr << "rbd-nbd: invalid device path: " << devpath
       << " (expected /dev/nbd{num})" << std::endl;
  return matched == 0 ? -EINVAL : matched;
}
+
+static int try_ioctl_setup(Config *cfg, int fd, uint64_t size,
+ uint64_t blksize, uint64_t flags)
+{
+ int index = 0, r;
+
+ if (cfg->devpath.empty()) {
+ char dev[64];
+ const char *path = "/sys/module/nbd/parameters/nbds_max";
+ int nbds_max = -1;
+ if (access(path, F_OK) == 0) {
+ std::ifstream ifs;
+ ifs.open(path, std::ifstream::in);
+ if (ifs.is_open()) {
+ ifs >> nbds_max;
+ ifs.close();
+ }
+ }
+
+ while (true) {
+ snprintf(dev, sizeof(dev), "/dev/nbd%d", index);
+
+ nbd = open(dev, O_RDWR);
+ if (nbd < 0) {
+ if (nbd == -EPERM && nbds_max != -1 && index < (nbds_max-1)) {
+ ++index;
+ continue;
+ }
+ r = nbd;
+ cerr << "rbd-nbd: failed to find unused device" << std::endl;
+ goto done;
+ }
+
+ r = ioctl(nbd, NBD_SET_SOCK, fd);
+ if (r < 0) {
+ close(nbd);
+ ++index;
+ continue;
+ }
+
+ cfg->devpath = dev;
+ break;
+ }
+ } else {
+ r = parse_nbd_index(cfg->devpath);
+ if (r < 0)
+ goto done;
+ index = r;
+
+ nbd = open(cfg->devpath.c_str(), O_RDWR);
+ if (nbd < 0) {
+ r = nbd;
+ cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
+ goto done;
+ }
+
+ r = ioctl(nbd, NBD_SET_SOCK, fd);
+ if (r < 0) {
+ r = -errno;
+ cerr << "rbd-nbd: the device " << cfg->devpath << " is busy" << std::endl;
+ close(nbd);
+ goto done;
+ }
+ }
+
+ r = ioctl(nbd, NBD_SET_BLKSIZE, blksize);
+ if (r < 0) {
+ r = -errno;
+ cerr << "rbd-nbd: NBD_SET_BLKSIZE failed" << std::endl;
+ goto close_nbd;
+ }
+
+ r = ioctl(nbd, NBD_SET_SIZE, size);
+ if (r < 0) {
+ cerr << "rbd-nbd: NBD_SET_SIZE failed" << std::endl;
+ r = -errno;
+ goto close_nbd;
+ }
+
+ ioctl(nbd, NBD_SET_FLAGS, flags);
+
+ if (cfg->io_timeout >= 0) {
+ r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)cfg->io_timeout);
+ if (r < 0) {
+ r = -errno;
+ cerr << "rbd-nbd: failed to set IO timeout: " << cpp_strerror(r)
+ << std::endl;
+ goto close_nbd;
+ }
+ }
+
+ dout(10) << "ioctl setup complete for " << cfg->devpath << dendl;
+ nbd_index = index;
+ return 0;
+
+close_nbd:
+ if (r < 0) {
+ ioctl(nbd, NBD_CLEAR_SOCK);
+ cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) << std::endl;
+ }
+ close(nbd);
+done:
+ return r;
+}
+
+static void netlink_cleanup(struct nl_sock *sock)
+{
+ if (!sock)
+ return;
+
+ nl_close(sock);
+ nl_socket_free(sock);
+}
+
// Allocate a generic-netlink socket and resolve the "nbd" family id
// into *id. Returns NULL (after cleanup) when netlink is unavailable
// or the kernel lacks the nbd netlink interface.
static struct nl_sock *netlink_init(int *id)
{
  struct nl_sock *sock;
  int ret;

  sock = nl_socket_alloc();
  if (!sock) {
    cerr << "rbd-nbd: Could not allocate netlink socket." << std::endl;
    return NULL;
  }

  ret = genl_connect(sock);
  if (ret < 0) {
    cerr << "rbd-nbd: Could not connect netlink socket. Error " << ret
         << std::endl;
    goto free_sock;
  }

  *id = genl_ctrl_resolve(sock, "nbd");
  if (*id < 0)
    // nbd netlink interface not supported.
    goto close_sock;

  return sock;

close_sock:
  nl_close(sock);
free_sock:
  nl_socket_free(sock);
  return NULL;
}
+
// Disconnect an nbd device via the netlink NBD_CMD_DISCONNECT command.
// Returns 0 on success, 1 when netlink is unsupported (caller should
// fall back to ioctl), or -EIO on failure.
static int netlink_disconnect(int index)
{
  struct nl_sock *sock;
  struct nl_msg *msg;
  int ret, nl_id;

  sock = netlink_init(&nl_id);
  if (!sock)
    // Try ioctl
    return 1;

  nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL);

  msg = nlmsg_alloc();
  if (!msg) {
    cerr << "rbd-nbd: Could not allocate netlink message." << std::endl;
    goto free_sock;
  }

  if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0,
                   NBD_CMD_DISCONNECT, 0)) {
    cerr << "rbd-nbd: Could not setup message." << std::endl;
    goto nla_put_failure;
  }

  // NB: NLA_PUT_U32 expands to a hidden "goto nla_put_failure" on error
  NLA_PUT_U32(msg, NBD_ATTR_INDEX, index);

  // nl_send_sync consumes msg regardless of outcome, so no free here
  ret = nl_send_sync(sock, msg);
  netlink_cleanup(sock);
  if (ret < 0) {
    cerr << "rbd-nbd: netlink disconnect failed: " << nl_geterror(-ret)
         << std::endl;
    return -EIO;
  }

  return 0;

nla_put_failure:
  nlmsg_free(msg);
free_sock:
  netlink_cleanup(sock);
  return -EIO;
}
+
+static int netlink_disconnect_by_path(const std::string& devpath)
+{
+ int index;
+
+ index = parse_nbd_index(devpath);
+ if (index < 0)
+ return index;
+
+ return netlink_disconnect(index);
+}
+
// Push a new device size to the kernel via netlink NBD_CMD_RECONFIGURE.
// Returns 0 on success, 1 when netlink is unsupported, -EIO on failure.
static int netlink_resize(int nbd_index, uint64_t size)
{
  struct nl_sock *sock;
  struct nl_msg *msg;
  int nl_id, ret;

  sock = netlink_init(&nl_id);
  if (!sock) {
    cerr << "rbd-nbd: Netlink interface not supported." << std::endl;
    return 1;
  }

  nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL);

  msg = nlmsg_alloc();
  if (!msg) {
    cerr << "rbd-nbd: Could not allocate netlink message." << std::endl;
    goto free_sock;
  }

  if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0,
                   NBD_CMD_RECONFIGURE, 0)) {
    cerr << "rbd-nbd: Could not setup message." << std::endl;
    goto free_msg;
  }

  // NB: these macros expand to a hidden "goto nla_put_failure" on error
  NLA_PUT_U32(msg, NBD_ATTR_INDEX, nbd_index);
  NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size);

  // nl_send_sync consumes msg, so the failure path only frees the sock
  ret = nl_send_sync(sock, msg);
  if (ret < 0) {
    cerr << "rbd-nbd: netlink resize failed: " << nl_geterror(ret) << std::endl;
    goto free_sock;
  }

  netlink_cleanup(sock);
  dout(10) << "netlink resize complete for nbd" << nbd_index << dendl;
  return 0;

nla_put_failure:
free_msg:
  nlmsg_free(msg);
free_sock:
  netlink_cleanup(sock);
  return -EIO;
}
+
// Netlink reply callback for NBD_CMD_CONNECT: extract the device index
// the kernel allocated, record it in the global nbd_index, and derive
// cfg->devpath from it.
static int netlink_connect_cb(struct nl_msg *msg, void *arg)
{
  struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlmsg_hdr(msg));
  Config *cfg = (Config *)arg;
  struct nlattr *msg_attr[NBD_ATTR_MAX + 1];
  uint32_t index;
  int ret;

  ret = nla_parse(msg_attr, NBD_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
                  genlmsg_attrlen(gnlh, 0), NULL);
  if (ret) {
    cerr << "rbd-nbd: Unsupported netlink reply" << std::endl;
    return -NLE_MSGTYPE_NOSUPPORT;
  }

  if (!msg_attr[NBD_ATTR_INDEX]) {
    cerr << "rbd-nbd: netlink connect reply missing device index." << std::endl;
    return -NLE_MSGTYPE_NOSUPPORT;
  }

  index = nla_get_u32(msg_attr[NBD_ATTR_INDEX]);
  cfg->devpath = "/dev/nbd" + stringify(index);
  nbd_index = index;

  return NL_OK;
}
+
// Build and send the netlink NBD_CMD_CONNECT (or, when reconnecting,
// NBD_CMD_RECONFIGURE) message: device index (if a path was given),
// timeouts, size, block size, server flags, optional cookie and our
// socket fd. Returns 0 on success, negative on failure; the caller
// owns and cleans up `sock`.
static int netlink_connect(Config *cfg, struct nl_sock *sock, int nl_id, int fd,
                           uint64_t size, uint64_t flags, bool reconnect)
{
  struct nlattr *sock_attr;
  struct nlattr *sock_opt;
  struct nl_msg *msg;
  int ret;

  if (reconnect) {
    dout(10) << "netlink try reconnect for " << cfg->devpath << dendl;

    nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL);
  } else {
    // on a fresh connect the kernel picks the device; the callback
    // records the allocated index into cfg/nbd_index
    nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, netlink_connect_cb,
                        cfg);
  }

  msg = nlmsg_alloc();
  if (!msg) {
    cerr << "rbd-nbd: Could not allocate netlink message." << std::endl;
    return -ENOMEM;
  }

  if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0,
                   reconnect ? NBD_CMD_RECONFIGURE : NBD_CMD_CONNECT, 0)) {
    cerr << "rbd-nbd: Could not setup message." << std::endl;
    goto free_msg;
  }

  if (!cfg->devpath.empty()) {
    ret = parse_nbd_index(cfg->devpath);
    // NOTE(review): a parse failure here returns -EIO (via free_msg),
    // not the parse error itself — confirm intent
    if (ret < 0)
      goto free_msg;

    NLA_PUT_U32(msg, NBD_ATTR_INDEX, ret);
    if (reconnect) {
      nbd_index = ret;
    }
  }

  if (cfg->io_timeout >= 0)
    NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, cfg->io_timeout);

  // NB: the NLA_PUT_* macros expand to "goto nla_put_failure" on error
  NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size);
  NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, RBD_NBD_BLKSIZE);
  NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, flags);
  NLA_PUT_U64(msg, NBD_ATTR_DEAD_CONN_TIMEOUT, cfg->reattach_timeout);
  if (!cfg->cookie.empty())
    NLA_PUT_STRING(msg, NBD_ATTR_BACKEND_IDENTIFIER, cfg->cookie.c_str());

  // nested NBD_ATTR_SOCKETS -> NBD_SOCK_ITEM -> NBD_SOCK_FD
  sock_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS);
  if (!sock_attr) {
    cerr << "rbd-nbd: Could not init sockets in netlink message." << std::endl;
    goto free_msg;
  }

  sock_opt = nla_nest_start(msg, NBD_SOCK_ITEM);
  if (!sock_opt) {
    cerr << "rbd-nbd: Could not init sock in netlink message." << std::endl;
    goto free_msg;
  }

  NLA_PUT_U32(msg, NBD_SOCK_FD, fd);
  nla_nest_end(msg, sock_opt);
  nla_nest_end(msg, sock_attr);

  // nl_send_sync consumes msg regardless of outcome
  ret = nl_send_sync(sock, msg);
  if (ret < 0) {
    cerr << "rbd-nbd: netlink connect failed: " << nl_geterror(ret)
         << std::endl;
    return -EIO;
  }

  dout(10) << "netlink connect complete for " << cfg->devpath << dendl;
  return 0;

nla_put_failure:
free_msg:
  nlmsg_free(msg);
  return -EIO;
}
+
+static int try_netlink_setup(Config *cfg, int fd, uint64_t size, uint64_t flags,
+ bool reconnect)
+{
+ struct nl_sock *sock;
+ int nl_id, ret;
+
+ sock = netlink_init(&nl_id);
+ if (!sock) {
+ cerr << "rbd-nbd: Netlink interface not supported. Using ioctl interface."
+ << std::endl;
+ return 1;
+ }
+
+ dout(10) << "netlink interface supported." << dendl;
+
+ ret = netlink_connect(cfg, sock, nl_id, fd, size, flags, reconnect);
+ netlink_cleanup(sock);
+
+ if (ret != 0)
+ return ret;
+
+ nbd = open(cfg->devpath.c_str(), O_RDWR);
+ if (nbd < 0) {
+ cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
+ return nbd;
+ }
+
+ return 0;
+}
+
// Run the user-supplied quiesce hook as "<hook> <devpath> <command>"
// (command is "quiesce" or "unquiesce"), capturing its stderr for the
// log. Returns 0 on success or a negative error.
static int run_quiesce_hook(const std::string &quiesce_hook,
                            const std::string &devpath,
                            const std::string &command) {
  dout(10) << __func__ << ": " << quiesce_hook << " " << devpath << " "
           << command << dendl;

  SubProcess hook(quiesce_hook.c_str(), SubProcess::CLOSE, SubProcess::PIPE,
                  SubProcess::PIPE);
  hook.add_cmd_args(devpath.c_str(), command.c_str(), NULL);
  bufferlist err;
  int r = hook.spawn();
  if (r < 0) {
    err.append("subprocess spawn failed");
  } else {
    // capture up to 16K of the hook's stderr for diagnostics
    err.read_fd(hook.get_stderr(), 16384);
    r = hook.join();
    if (r > 0) {
      // join() reports a positive exit status; normalize to negative
      r = -r;
    }
  }
  if (r < 0) {
    derr << __func__ << ": " << quiesce_hook << " " << devpath << " "
         << command << " failed: " << err.to_str() << dendl;
  } else {
    dout(10) << " succeeded: " << err.to_str() << dendl;
  }

  return r;
}
+
// SIGINT/SIGTERM handler (runs on the async signal handler thread):
// kick the terminate eventfd so the reader thread's poll() wakes up
// and begins shutdown.
static void handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;

  dout(20) << __func__ << ": " << "notifying terminate" << dendl;

  ceph_assert(terminate_event_sock.is_valid());
  terminate_event_sock.notify();
}
+
// Construct and start the NBDServer for this mapping and install the
// signal handlers that trigger graceful termination. Caller owns the
// returned server (deleted in do_map's cleanup path).
static NBDServer *start_server(int fd, librbd::Image& image, Config *cfg)
{
  NBDServer *server;

  server = new NBDServer(fd, image, cfg);
  server->start();

  init_async_signal_handler();
  register_async_signal_handler(SIGHUP, sighup_handler);
  register_async_signal_handler_oneshot(SIGINT, handle_signal);
  register_async_signal_handler_oneshot(SIGTERM, handle_signal);

  return server;
}
+
// Daemonize (if configured) and block until the device disconnects:
// on the netlink path by waiting on the server's disconnect condition,
// on the ioctl path inside the kernel via NBD_DO_IT. Unregisters the
// signal handlers on the way out.
static void run_server(Preforker& forker, NBDServer *server, bool netlink_used)
{
  if (g_conf()->daemonize) {
    global_init_postfork_finish(g_ceph_context);
    forker.daemonize();
  }

  if (netlink_used)
    server->wait_for_disconnect();
  else
    ioctl(nbd, NBD_DO_IT);

  unregister_async_signal_handler(SIGHUP, sighup_handler);
  unregister_async_signal_handler(SIGINT, handle_signal);
  unregister_async_signal_handler(SIGTERM, handle_signal);
  shutdown_async_signal_handler();
}
+
+// Eventually it should be removed when pidfd_open is widely supported.
+
+static int wait_for_terminate_legacy(int pid, int timeout)
+{
+ for (int i = 0; ; i++) {
+ if (kill(pid, 0) == -1) {
+ if (errno == ESRCH) {
+ return 0;
+ }
+ int r = -errno;
+ cerr << "rbd-nbd: kill(" << pid << ", 0) failed: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (i >= timeout * 2) {
+ break;
+ }
+ usleep(500000);
+ }
+
+ cerr << "rbd-nbd: waiting for process exit timed out" << std::endl;
+ return -ETIMEDOUT;
+}
+
+// Eventually it should be replaced with glibc' pidfd_open
+// when it is widely available.
+
#ifdef __NR_pidfd_open
// Raw-syscall wrapper for pidfd_open(2); glibc did not yet expose it.
static int pidfd_open(pid_t pid, unsigned int flags)
{
  return syscall(__NR_pidfd_open, pid, flags);
}
#else
// Fallback when the syscall number is unknown at build time: report
// ENOSYS so callers switch to the legacy polling path.
static int pidfd_open(pid_t pid, unsigned int flags)
{
  errno = ENOSYS;
  return -1;
}
#endif
+
// Wait up to `timeout` seconds for process `pid` to exit, using
// pidfd_open + poll (the pidfd becomes readable on exit). Falls back
// to the legacy kill(2)-polling loop on ENOSYS. Returns 0 on exit,
// -ETIMEDOUT, or another negative error.
static int wait_for_terminate(int pid, int timeout)
{
  int fd = pidfd_open(pid, 0);
  if (fd == -1) {
    if (errno == ENOSYS) {
      return wait_for_terminate_legacy(pid, timeout);
    }
    if (errno == ESRCH) {
      // already gone
      return 0;
    }
    int r = -errno;
    cerr << "rbd-nbd: pidfd_open(" << pid << ") failed: "
         << cpp_strerror(r) << std::endl;
    return r;
  }

  struct pollfd poll_fds[1];
  memset(poll_fds, 0, sizeof(struct pollfd));
  poll_fds[0].fd = fd;
  poll_fds[0].events = POLLIN;

  int r = poll(poll_fds, 1, timeout * 1000);
  if (r == -1) {
    r = -errno;
    cerr << "rbd-nbd: failed to poll rbd-nbd process: " << cpp_strerror(r)
         << std::endl;
    goto done;
  } else {
    r = 0;
  }

  if ((poll_fds[0].revents & POLLIN) == 0) {
    // poll returned without the pidfd becoming readable -> timeout
    cerr << "rbd-nbd: waiting for process exit timed out" << std::endl;
    r = -ETIMEDOUT;
  }

done:
  close(fd);

  return r;
}
+
// Top-level "map" command: fork (for daemonization), connect to the
// cluster, open the image (optionally snapshot/encryption), set up the
// kernel nbd device (netlink preferred, ioctl fallback), run the IO
// server until disconnect, then unwind everything via the goto-chained
// cleanup labels at the bottom. Exits the process via forker.exit().
static int do_map(int argc, const char *argv[], Config *cfg, bool reconnect)
{
  int r;

  librados::Rados rados;
  librbd::RBD rbd;
  librados::IoCtx io_ctx;
  librbd::Image image;

  int read_only = 0;
  unsigned long flags;
  unsigned long size;
  unsigned long blksize = RBD_NBD_BLKSIZE;
  bool use_netlink;

  // socketpair: fd[0] goes to the kernel, fd[1] to our NBDServer
  int fd[2];

  librbd::image_info_t info;

  Preforker forker;
  NBDServer *server;

  vector<const char*> args;
  argv_to_vec(argc, argv, args);
  if (args.empty()) {
    cerr << argv[0] << ": -h or --help for usage" << std::endl;
    exit(1);
  }
  if (ceph_argparse_need_usage(args)) {
    usage();
    exit(0);
  }

  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
                         CODE_ENVIRONMENT_DAEMON,
                         CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
  g_ceph_context->_conf.set_val_or_die("pid_file", "");

  // fork early so the parent can report map success/failure to the CLI
  if (global_init_prefork(g_ceph_context) >= 0) {
    std::string err;
    r = forker.prefork(err);
    if (r < 0) {
      cerr << err << std::endl;
      return r;
    }
    if (forker.is_parent()) {
      if (forker.parent_wait(err) != 0) {
        return -ENXIO;
      }
      return 0;
    }
    global_init_postfork_start(g_ceph_context);
  }

  common_init_finish(g_ceph_context);
  global_init_chdir(g_ceph_context);

  if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1) {
    r = -errno;
    goto close_ret;
  }

  r = rados.init_with_context(g_ceph_context);
  if (r < 0)
    goto close_fd;

  r = rados.connect();
  if (r < 0)
    goto close_fd;

  r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx);
  if (r < 0)
    goto close_fd;

  io_ctx.set_namespace(cfg->nsname);

  r = rbd.open(io_ctx, image, cfg->imgname.c_str());
  if (r < 0)
    goto close_fd;

  if (cfg->exclusive) {
    r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
    if (r < 0) {
      cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r)
           << std::endl;
      goto close_fd;
    }
  }

  if (!cfg->snapname.empty()) {
    r = image.snap_set(cfg->snapname.c_str());
    if (r < 0)
      goto close_fd;
  }

  if (cfg->encryption_format.has_value()) {
    if (!cfg->encryption_passphrase_file.has_value()) {
      r = -EINVAL;
      cerr << "rbd-nbd: missing encryption-passphrase-file" << std::endl;
      goto close_fd;
    }
    std::ifstream file(cfg->encryption_passphrase_file.value().c_str());
    if (file.fail()) {
      r = -errno;
      std::cerr << "rbd-nbd: unable to open passphrase file:"
                << cpp_strerror(errno) << std::endl;
      goto close_fd;
    }
    std::string passphrase((std::istreambuf_iterator<char>(file)),
                           (std::istreambuf_iterator<char>()));
    // scrub the passphrase from memory when it goes out of scope
    auto sg = make_scope_guard([&] {
      ceph_memzero_s(&passphrase[0], passphrase.size(), passphrase.size()); });
    file.close();
    // strip a single trailing newline, as editors commonly append one
    if (!passphrase.empty() && passphrase[passphrase.length() - 1] == '\n') {
      passphrase.erase(passphrase.length() - 1);
    }

    switch (cfg->encryption_format.value()) {
      case RBD_ENCRYPTION_FORMAT_LUKS1: {
        librbd::encryption_luks1_format_options_t opts = {};
        opts.passphrase = passphrase;
        r = image.encryption_load(
                RBD_ENCRYPTION_FORMAT_LUKS1, &opts, sizeof(opts));
        break;
      }
      case RBD_ENCRYPTION_FORMAT_LUKS2: {
        librbd::encryption_luks2_format_options_t opts = {};
        opts.passphrase = passphrase;
        r = image.encryption_load(
                RBD_ENCRYPTION_FORMAT_LUKS2, &opts, sizeof(opts));
        // LUKS2 uses 4K sectors; tell the kernel accordingly
        blksize = 4096;
        break;
      }
      default:
        r = -ENOTSUP;
        cerr << "rbd-nbd: unsupported encryption format" << std::endl;
        goto close_fd;
    }

    if (r != 0) {
      cerr << "rbd-nbd: failed to load encryption: " << cpp_strerror(r)
           << std::endl;
      goto close_fd;
    }
  }

  r = image.stat(info, sizeof(info));
  if (r < 0)
    goto close_fd;

  flags = NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_TRIM | NBD_FLAG_HAS_FLAGS;
  if (!cfg->snapname.empty() || cfg->readonly) {
    flags |= NBD_FLAG_READ_ONLY;
    read_only = 1;
  }

  if (info.size > ULONG_MAX) {
    r = -EFBIG;
    cerr << "rbd-nbd: image is too large (" << byte_u_t(info.size)
         << ", max is " << byte_u_t(ULONG_MAX) << ")" << std::endl;
    goto close_fd;
  }

  size = info.size;

  r = load_module(cfg);
  if (r < 0)
    goto close_fd;

  server = start_server(fd[1], image, cfg);

  // netlink is preferred; reattach is only possible via netlink
  use_netlink = cfg->try_netlink || reconnect;
  if (use_netlink) {
    // generate when the cookie is not supplied at CLI
    if (!reconnect && cfg->cookie.empty()) {
      uuid_d uuid_gen;
      uuid_gen.generate_random();
      cfg->cookie = uuid_gen.to_string();
    }
    r = try_netlink_setup(cfg, fd[0], size, flags, reconnect);
    if (r < 0) {
      goto free_server;
    } else if (r == 1) {
      // kernel lacks nbd netlink support; fall back to ioctl
      use_netlink = false;
    }
  }

  if (!use_netlink) {
    r = try_ioctl_setup(cfg, fd[0], size, blksize, flags);
    if (r < 0)
      goto free_server;
  }

  r = check_device_size(nbd_index, size);
  if (r < 0)
    goto close_nbd;

  r = ioctl(nbd, BLKROSET, (unsigned long) &read_only);
  if (r < 0) {
    r = -errno;
    goto close_nbd;
  }

  {
    // watches live on the stack for the duration of the mapping
    NBDQuiesceWatchCtx quiesce_watch_ctx(server);
    if (cfg->quiesce) {
      r = image.quiesce_watch(&quiesce_watch_ctx,
                              &server->quiesce_watch_handle);
      if (r < 0) {
        goto close_nbd;
      }
    }

    uint64_t handle;

    NBDWatchCtx watch_ctx(nbd, nbd_index, use_netlink, io_ctx, image,
                          info.size);
    r = image.update_watch(&watch_ctx, &handle);
    if (r < 0)
      goto close_nbd;

    std::string cookie;
    if (use_netlink) {
      cookie = get_cookie(cfg->devpath);
      ceph_assert(cookie == cfg->cookie || cookie.empty());
    }
    if (cfg->show_cookie && !cookie.empty()) {
      cout << cfg->devpath << " " << cookie << std::endl;
    } else {
      cout << cfg->devpath << std::endl;
    }

    // blocks until disconnect
    run_server(forker, server, use_netlink);

    if (cfg->quiesce) {
      r = image.quiesce_unwatch(server->quiesce_watch_handle);
      ceph_assert(r == 0);
    }

    r = image.update_unwatch(handle);
    ceph_assert(r == 0);
  }

close_nbd:
  if (r < 0) {
    if (use_netlink) {
      netlink_disconnect(nbd_index);
    } else {
      ioctl(nbd, NBD_CLEAR_SOCK);
      cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r)
           << std::endl;
    }
  }
  close(nbd);
free_server:
  delete server;
close_fd:
  close(fd[0]);
  close(fd[1]);
close_ret:
  image.close();
  io_ctx.close();
  rados.shutdown();

  forker.exit(r < 0 ? EXIT_FAILURE : 0);
  // Unreachable;
  return r;
}
+
+// Ask the rbd-nbd daemon owning this mapping to exit, then wait for it.
+// Sends SIGTERM to the recorded pid and blocks (up to the configured
+// reattach timeout) until the process has actually terminated.
+static int do_detach(Config *cfg)
+{
+  if (kill(cfg->pid, SIGTERM) == -1) {
+    int err = -errno;
+    cerr << "rbd-nbd: failed to terminate " << cfg->pid << ": "
+         << cpp_strerror(err) << std::endl;
+    return err;
+  }
+
+  // Signal delivered; the result is whatever the wait reports.
+  return wait_for_terminate(cfg->pid, cfg->reattach_timeout);
+}
+
+// Disconnect the nbd device for *cfg and, when the owning daemon pid is
+// known, wait for that daemon to exit.  Returns 0 on success or a negative
+// errno-style value.
+static int do_unmap(Config *cfg)
+{
+  /*
+   * The netlink disconnect call supports devices setup with netlink or ioctl,
+   * so we always try that first.
+   */
+  int r = netlink_disconnect_by_path(cfg->devpath);
+  if (r < 0) {
+    return r;
+  }
+
+  // r == 1 means netlink was unavailable; fall back to the legacy ioctl path.
+  if (r == 1) {
+    int nbd = open(cfg->devpath.c_str(), O_RDWR);
+    if (nbd < 0) {
+      cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
+      return nbd;
+    }
+
+    r = ioctl(nbd, NBD_DISCONNECT);
+    if (r < 0) {
+      cerr << "rbd-nbd: the device is not used" << std::endl;
+    }
+
+    close(nbd);
+
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  // Propagate the wait result so a daemon that fails to exit within the
+  // reattach timeout is reported (previously this status was computed but
+  // dropped by an unconditional "return 0").
+  if (cfg->pid > 0) {
+    r = wait_for_terminate(cfg->pid, cfg->reattach_timeout);
+  }
+
+  return r;
+}
+
+// Split an image spec of the form "pool/ns/image@snap" (pool, namespace and
+// snapshot parts optional) into the matching Config fields.  Returns 0 on
+// success or -EINVAL when the spec does not match; err_msg is accepted for
+// signature symmetry with the other parsers but unused here.
+static int parse_imgpath(const std::string &imgpath, Config *cfg,
+                         std::ostream *err_msg) {
+  std::regex spec_pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$");
+  std::smatch groups;
+  if (!std::regex_match(imgpath, groups, spec_pattern)) {
+    std::cerr << "rbd-nbd: invalid spec '" << imgpath << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  // Optional leading components only overwrite the config when present.
+  if (groups[1].matched) {
+    cfg->poolname = groups[1];
+  }
+  if (groups[2].matched) {
+    cfg->nsname = groups[2];
+  }
+
+  cfg->imgname = groups[3];
+
+  if (groups[4].matched) {
+    cfg->snapname = groups[4];
+  }
+
+  return 0;
+}
+
+// List every rbd-nbd mapping found on this host, either as a plain-text
+// table (default / "plain") or as json/xml when requested via --format.
+// Returns 0 on success or -EINVAL for an unknown format string.
+static int do_list_mapped_devices(const std::string &format, bool pretty_format)
+{
+  bool should_print = false;
+  std::unique_ptr<ceph::Formatter> f;
+  TextTable tbl;
+
+  // Select the output backend; empty/"plain" keeps f == nullptr and takes
+  // the TextTable branch below.
+  if (format == "json") {
+    f.reset(new JSONFormatter(pretty_format));
+  } else if (format == "xml") {
+    f.reset(new XMLFormatter(pretty_format));
+  } else if (!format.empty() && format != "plain") {
+    std::cerr << "rbd-nbd: invalid output format: " << format << std::endl;
+    return -EINVAL;
+  }
+
+  if (f) {
+    f->open_array_section("devices");
+  } else {
+    tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("cookie", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  // Walk all live mappings; each iteration fills cfg with one mapping.
+  Config cfg;
+  NBDListIterator it;
+  while (it.get(&cfg)) {
+    if (f) {
+      f->open_object_section("device");
+      f->dump_int("id", cfg.pid);
+      f->dump_string("pool", cfg.poolname);
+      f->dump_string("namespace", cfg.nsname);
+      f->dump_string("image", cfg.imgname);
+      f->dump_string("snap", cfg.snapname);
+      f->dump_string("device", cfg.devpath);
+      f->dump_string("cookie", cfg.cookie);
+      f->close_section();
+    } else {
+      should_print = true;
+      // "-" marks "no snapshot" in the plain-text table.
+      if (cfg.snapname.empty()) {
+        cfg.snapname = "-";
+      }
+      tbl << cfg.pid << cfg.poolname << cfg.nsname << cfg.imgname
+          << cfg.snapname << cfg.devpath << cfg.cookie << TextTable::endrow;
+    }
+  }
+
+  if (f) {
+    f->close_section(); // devices
+    f->flush(std::cout);
+  }
+  // The plain table is printed only when at least one device was seen.
+  if (should_print) {
+    std::cout << tbl;
+  }
+  return 0;
+}
+
+// Scan the mapped-device list for an entry whose image spec matches *cfg
+// (and, when cfg->devpath is set, whose device path matches too).  Entries
+// owned by skip_pid are ignored.  On a hit, *cfg is overwritten with the
+// matching entry and true is returned; otherwise false.
+static bool find_mapped_dev_by_spec(Config *cfg, int skip_pid=-1) {
+  NBDListIterator it;
+  Config candidate;
+  while (it.get(&candidate)) {
+    if (candidate.pid == skip_pid) {
+      continue;
+    }
+    bool same_image = candidate.poolname == cfg->poolname &&
+                      candidate.nsname == cfg->nsname &&
+                      candidate.imgname == cfg->imgname &&
+                      candidate.snapname == cfg->snapname;
+    bool same_dev = cfg->devpath.empty() || candidate.devpath == cfg->devpath;
+    if (same_image && same_dev) {
+      *cfg = candidate;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Find the live rbd-nbd process attached to cfg->devpath.  On success *cfg
+// is replaced with that mapping's configuration and true is returned.
+// Declared bool (was int) for consistency with find_mapped_dev_by_spec:
+// the function only ever produced true/false and every caller uses it in a
+// boolean context, so this is backward-compatible.
+static bool find_proc_by_dev(Config *cfg) {
+  Config c;
+  NBDListIterator it;
+  while (it.get(&c)) {
+    if (c.devpath == cfg->devpath) {
+      *cfg = c;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Parse rbd-nbd's command line.  Ceph "early" arguments and config files
+// are consumed first, then the rbd-nbd option flags, then the command word
+// (map/unmap/attach/detach/list-mapped) and its positional parameters.
+// Results land in *cfg; human-readable diagnostics go to *err_msg.
+// Returns 0 on success, HELP_INFO or VERSION_INFO for -h/-v, -EINVAL on
+// any parse error.
+static int parse_args(vector<const char*>& args, std::ostream *err_msg,
+                      Config *cfg) {
+  std::string conf_file_list;
+  std::string cluster;
+  CephInitParameters iparams = ceph_argparse_early_args(
+    args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
+
+  ConfigProxy config{false};
+  config->name = iparams.name;
+  config->cluster = cluster;
+
+  if (!conf_file_list.empty()) {
+    config.parse_config_files(conf_file_list.c_str(), nullptr, 0);
+  } else {
+    config.parse_config_files(nullptr, nullptr, 0);
+  }
+  config.parse_env(CEPH_ENTITY_TYPE_CLIENT);
+  config.parse_argv(args);
+  // Default pool from ceph config; an explicit pool in the image spec
+  // (parse_imgpath) overrides it later.
+  cfg->poolname = config.get_val<std::string>("rbd_default_pool");
+
+  std::vector<const char*>::iterator i;
+  std::ostringstream err;
+  std::string arg_value;
+
+  // Option pass: recognized flags are consumed in place; anything else is
+  // skipped (++i) and left for the positional pass below.
+  for (i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+      return HELP_INFO;
+    } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) {
+      return VERSION_INFO;
+    } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &cfg->io_timeout, err,
+                                     "--io-timeout", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-nbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->io_timeout < 0) {
+        *err_msg << "rbd-nbd: Invalid argument for io-timeout!";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, &cfg->nbds_max, err, "--nbds_max", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-nbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->nbds_max < 0) {
+        *err_msg << "rbd-nbd: Invalid argument for nbds_max!";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, &cfg->max_part, err, "--max_part", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-nbd: " << err.str();
+        return -EINVAL;
+      }
+      if ((cfg->max_part < 0) || (cfg->max_part > 255)) {
+        *err_msg << "rbd-nbd: Invalid argument for max_part(0~255)!";
+        return -EINVAL;
+      }
+      cfg->set_max_part = true;
+    } else if (ceph_argparse_flag(args, i, "--quiesce", (char *)NULL)) {
+      cfg->quiesce = true;
+    } else if (ceph_argparse_witharg(args, i, &cfg->quiesce_hook,
+                                     "--quiesce-hook", (char *)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+      cfg->readonly = true;
+    } else if (ceph_argparse_witharg(args, i, &cfg->reattach_timeout, err,
+                                     "--reattach-timeout", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-nbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->reattach_timeout < 0) {
+        *err_msg << "rbd-nbd: Invalid argument for reattach-timeout!";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) {
+      cfg->exclusive = true;
+    } else if (ceph_argparse_witharg(args, i, &cfg->io_timeout, err,
+                                     "--timeout", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-nbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->io_timeout < 0) {
+        *err_msg << "rbd-nbd: Invalid argument for timeout!";
+        return -EINVAL;
+      }
+      // Deprecation note is a warning, not an error: parsing continues and
+      // rbd_nbd() prints err_msg even on success.
+      *err_msg << "rbd-nbd: --timeout is deprecated (use --io-timeout)";
+    } else if (ceph_argparse_witharg(args, i, &cfg->format, err, "--format",
+                                     (char *)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) {
+      cfg->pretty_format = true;
+    } else if (ceph_argparse_flag(args, i, "--try-netlink", (char *)NULL)) {
+      cfg->try_netlink = true;
+    } else if (ceph_argparse_flag(args, i, "--show-cookie", (char *)NULL)) {
+      cfg->show_cookie = true;
+    } else if (ceph_argparse_witharg(args, i, &cfg->cookie, "--cookie", (char *)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &arg_value,
+                                     "--encryption-format", (char *)NULL)) {
+      if (arg_value == "luks1") {
+        cfg->encryption_format =
+            std::make_optional(RBD_ENCRYPTION_FORMAT_LUKS1);
+      } else if (arg_value == "luks2") {
+        cfg->encryption_format =
+            std::make_optional(RBD_ENCRYPTION_FORMAT_LUKS2);
+      } else {
+        *err_msg << "rbd-nbd: Invalid encryption format";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, &arg_value,
+                                     "--encryption-passphrase-file",
+                                     (char *)NULL)) {
+      cfg->encryption_passphrase_file = std::make_optional(arg_value);
+    } else {
+      // Not an option we recognize: leave for the positional pass.
+      ++i;
+    }
+  }
+
+  // The first remaining token selects the command.
+  Command cmd = None;
+  if (args.begin() != args.end()) {
+    if (strcmp(*args.begin(), "map") == 0) {
+      cmd = Map;
+    } else if (strcmp(*args.begin(), "unmap") == 0) {
+      cmd = Unmap;
+    } else if (strcmp(*args.begin(), "attach") == 0) {
+      cmd = Attach;
+    } else if (strcmp(*args.begin(), "detach") == 0) {
+      cmd = Detach;
+    } else if (strcmp(*args.begin(), "list-mapped") == 0) {
+      cmd = List;
+    } else {
+      *err_msg << "rbd-nbd: unknown command: " << *args.begin();
+      return -EINVAL;
+    }
+    args.erase(args.begin());
+  }
+
+  if (cmd == None) {
+    *err_msg << "rbd-nbd: must specify command";
+    return -EINVAL;
+  }
+
+  // Per-command validation of the positional arguments.  Note the Attach
+  // case deliberately falls through into Map's image-spec handling.
+  std::string cookie;
+  switch (cmd) {
+    case Attach:
+      if (cfg->devpath.empty()) {
+        *err_msg << "rbd-nbd: must specify device to attach";
+        return -EINVAL;
+      }
+      // Allowing attach without --cookie option for kernel without
+      // NBD_ATTR_BACKEND_IDENTIFIER support for compatibility
+      cookie = get_cookie(cfg->devpath);
+      if (!cookie.empty()) {
+        if (cfg->cookie.empty()) {
+          *err_msg << "rbd-nbd: must specify cookie to attach";
+          return -EINVAL;
+        } else if (cookie != cfg->cookie) {
+          *err_msg << "rbd-nbd: cookie mismatch";
+          return -EINVAL;
+        }
+      } else if (!cfg->cookie.empty()) {
+        *err_msg << "rbd-nbd: kernel does not have cookie support";
+        return -EINVAL;
+      }
+      [[fallthrough]];
+    case Map:
+      if (args.begin() == args.end()) {
+        *err_msg << "rbd-nbd: must specify image-or-snap-spec";
+        return -EINVAL;
+      }
+      if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) {
+        return -EINVAL;
+      }
+      args.erase(args.begin());
+      break;
+    case Detach:
+    case Unmap:
+      if (args.begin() == args.end()) {
+        *err_msg << "rbd-nbd: must specify nbd device or image-or-snap-spec";
+        return -EINVAL;
+      }
+      // A leading /dev/ means the mapping is addressed by device node
+      // rather than by image spec.
+      if (boost::starts_with(*args.begin(), "/dev/")) {
+        cfg->devpath = *args.begin();
+      } else {
+        if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) {
+          return -EINVAL;
+        }
+      }
+      args.erase(args.begin());
+      break;
+    default:
+      //shut up gcc;
+      break;
+  }
+
+  if (args.begin() != args.end()) {
+    *err_msg << "rbd-nbd: unknown args: " << *args.begin();
+    return -EINVAL;
+  }
+
+  cfg->command = cmd;
+  return 0;
+}
+
+// Top-level dispatcher: parse the command line, then execute the requested
+// command (map/attach/detach/unmap/list-mapped).  Returns 0 on success or
+// a negative errno-style value.
+static int rbd_nbd(int argc, const char *argv[])
+{
+  int r;
+  Config cfg;
+  vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+
+  std::ostringstream err_msg;
+  r = parse_args(args, &err_msg, &cfg);
+  if (r == HELP_INFO) {
+    usage();
+    return 0;
+  } else if (r == VERSION_INFO) {
+    std::cout << pretty_version_to_str() << std::endl;
+    return 0;
+  } else if (r < 0) {
+    cerr << err_msg.str() << std::endl;
+    return r;
+  }
+
+  // Non-fatal parse diagnostics (e.g. the --timeout deprecation warning)
+  // are still printed on the success path.
+  if (!err_msg.str().empty()) {
+    cerr << err_msg.str() << std::endl;
+  }
+
+  switch (cfg.command) {
+    case Attach:
+      ceph_assert(!cfg.devpath.empty());
+      // Refuse to attach when another live process (not us) already owns a
+      // mapping with the same spec; then fall through to Map.
+      if (find_mapped_dev_by_spec(&cfg, getpid())) {
+        cerr << "rbd-nbd: " << cfg.devpath << " has process " << cfg.pid
+             << " connected" << std::endl;
+        return -EBUSY;
+      }
+      [[fallthrough]];
+    case Map:
+      if (cfg.imgname.empty()) {
+        cerr << "rbd-nbd: image name was not specified" << std::endl;
+        return -EINVAL;
+      }
+
+      r = do_map(argc, argv, &cfg, cfg.command == Attach);
+      if (r < 0)
+        return -EINVAL;
+      break;
+    case Detach:
+      // Resolve the target mapping from either the image spec or the
+      // device path; detach requires a live owning process.
+      if (cfg.devpath.empty()) {
+        if (!find_mapped_dev_by_spec(&cfg)) {
+          cerr << "rbd-nbd: " << cfg.image_spec() << " is not mapped"
+               << std::endl;
+          return -ENOENT;
+        }
+      } else if (!find_proc_by_dev(&cfg)) {
+        cerr << "rbd-nbd: no process attached to " << cfg.devpath << " found"
+             << std::endl;
+        return -ENOENT;
+      }
+      r = do_detach(&cfg);
+      if (r < 0)
+        return -EINVAL;
+      break;
+    case Unmap:
+      if (cfg.devpath.empty()) {
+        if (!find_mapped_dev_by_spec(&cfg)) {
+          cerr << "rbd-nbd: " << cfg.image_spec() << " is not mapped"
+               << std::endl;
+          return -ENOENT;
+        }
+      } else if (!find_proc_by_dev(&cfg)) {
+        // still try to send disconnect to the device
+      }
+      r = do_unmap(&cfg);
+      if (r < 0)
+        return -EINVAL;
+      break;
+    case List:
+      r = do_list_mapped_devices(cfg.format, cfg.pretty_format);
+      if (r < 0)
+        return -EINVAL;
+      break;
+    default:
+      usage();
+      break;
+  }
+
+  return 0;
+}
+
+// Entry point: delegate to rbd_nbd() and collapse any negative status into
+// EXIT_FAILURE for the shell.
+int main(int argc, const char *argv[])
+{
+  return rbd_nbd(argc, argv) < 0 ? EXIT_FAILURE : 0;
+}
diff --git a/src/tools/rbd_nbd/rbd-nbd_quiesce b/src/tools/rbd_nbd/rbd-nbd_quiesce
new file mode 100755
index 000000000..a62a12b15
--- /dev/null
+++ b/src/tools/rbd_nbd/rbd-nbd_quiesce
@@ -0,0 +1,31 @@
+#!/bin/sh
+# rbd-nbd quiesce hook: freeze/thaw every filesystem mounted from <dev>.
+echo "$0 $*" >&2
+
+if [ $# -lt 2 ]; then
+    echo "usage: $0 <dev> <cmd>" >&2
+    exit 1
+fi
+
+dev=$1
+cmd=$2
+
+export PATH=/usr/sbin:/usr/bin:/sbin:/bin
+# IFS= and -r keep mount points containing blanks or backslashes intact
+findmnt -S "${dev}" -fno TARGET |
+while IFS= read -r mnt; do
+    case "${cmd}" in
+    quiesce)
+        echo "freezing ${mnt}" >&2
+        fsfreeze -f "${mnt}"
+        ;;
+    unquiesce)
+        echo "unfreezing ${mnt}" >&2
+        fsfreeze -u "${mnt}"
+        ;;
+    *)
+        echo "unknown command ${cmd}" >&2
+        exit 1
+        ;;
+    esac
+done
diff --git a/src/tools/rbd_recover_tool/FAQ b/src/tools/rbd_recover_tool/FAQ
new file mode 100644
index 000000000..1655e8530
--- /dev/null
+++ b/src/tools/rbd_recover_tool/FAQ
@@ -0,0 +1,16 @@
+# author: min chen(minchen@ubuntukylin.com) 2014 2015
+
+1. error "get_image_metadata_v2: no meta_header_seq input"
+cause:
+ database is old, refresh database
+solution:
+ ./rbd-recover-tool database
+
+2. Error initializing leveldb: IO error: lock /var/lib/ceph/osd/ceph-0/current/omap/LOCK: Resource temporarily unavailable
+ ERROR: error flushing journal /var/lib/ceph/osd/ceph-0/journal for object store /var/lib/ceph/osd/ceph-0: (1) Operation not permitted
+cause:
+ when ./rbd-recover-tool database is interrupted, the command has already been sent to each osd node, and a process is still reading leveldb, which keeps it LOCKED.
+ if ./rbd-recover-tool database is run again, all commands are sent to the osd nodes again while the previous process still holds the leveldb lock, so all new commands
+ fail.
+solution:
+ wait until all previous command finished.
diff --git a/src/tools/rbd_recover_tool/README b/src/tools/rbd_recover_tool/README
new file mode 100644
index 000000000..d289c11ca
--- /dev/null
+++ b/src/tools/rbd_recover_tool/README
@@ -0,0 +1,97 @@
+# author: Min chen(minchen@ubuntukylin.com) 2014 2015
+
+------------- ceph rbd recover tool -------------
+
+ ceph rbd recover tool is used for recovering ceph rbd image, when all ceph services are killed.
+it is based on ceph-0.80.x (Firefly and newer)
+currently, ceph services (ceph-mon, ceph-osd) are sometimes unavailable because of bugs or something else
+, especially on large scale ceph cluster, so that the ceph cluster can not supply service
+and rbd images can not be accessed. In this case, a tool to recover rbd image is necessary.
+ ceph rbd recover tool is just used for this, it can collect all objects of an image from distributed
+osd nodes with the latest pg epoch, and splice objects by offset to a complete image. To make sure
+object data is complete, this tool does flush osd journal on each osd node before recovering.
+ but, there are some limitations:
+-need ssh service and unobstructed network
+-osd data must be accessed on local disk
+-clone image is not supported, while snapshot is supported
+-only supports replicated pools
+
+before you run this tool, you should make sure that:
+1). all processes (ceph-osd, ceph-mon, ceph-mds) are shutdown
+2). ssh daemon is running & network is ok (ssh to each node without password)
+3). ceph-kvstore-tool is installed(for ubuntu: apt-get install ceph-test)
+4). osd disk is not crashed and data can be accessed on local filesystem
+
+-architecture:
+
+ +---- osd.0
+ |
+admin_node -----------+---- osd.1
+ |
+ +---- osd.2
+ |
+ ......
+
+-files:
+admin_node: {rbd-recover-tool common_h epoch_h metadata_h database_h}
+osd: {osd_job common_h epoch_h metadata_h} #/var/rbd_tool/osd_job
+in this architecture, admin_node acts as client, osds act as server.
+so, they run different files:
+on admin_node run: rbd-recover-tool <action> [<parameters>]
+on osd node run: ./osd_job <function> <parameters>
+admin_node will copy files: osd_job, common_h, epoch_h, metadata_h to remote osd node
+
+
+-config file
+before you run this tool, make sure write config files first
+osd_host_path: osd hostnames and osd data path #user input
+ osdhost0 /var/lib/ceph/osd/ceph-0
+ osdhost1 /var/lib/ceph/osd/ceph-1
+ ......
+mon_host: all mon node hostname #user input
+ monhost0
+ monhost1
+ ......
+mds_host: all mds node hostname #user input
+ mdshost0
+ mdshost1
+ ......
+then, init_env_admin function will create file: osd_host
+osd_host: all osd node hostname #generated by admin_job, user ignore it
+ osdhost0
+ osdhost1
+ ......
+
+
+-usage:
+rbd-recover-tool <operation>
+<operation> :
+database #generating offline database: hobject path, node hostname, pg_epoch and image metadata
+list #list all images from offline database
+lookup <pool_id>/<image_name>[@[<snap_name>]] #lookup image metadata in offline database
+recover <pool_id>/<image_name>[@[<snap_name>]] [/path/to/store/image] #recover image data according to image metadata
+
+-steps:
+1. stop all ceph services: ceph-mon, ceph-osd, ceph-mds
+2. setup config files: osd_host_path, mon_host, mds_host
+3. rbd-recover-tool database # wait a long time
+4. rbd-recover-tool list
+5. rbd-recover-tool recover <pool_id>/<image_name>[@[<snap_name>]] [/path/to/store/image]
+
+
+-debug & error check
+if admin_node operation is failed, you can check it on osd node
+cd /var/rbd_tool/osd_job
+./osd_job <operation>
+<operation> :
+do_image_id <image_id_hobject> #get image id of image format v2
+do_image_id <image_header_hobject> #get image id of image format v1
+do_image_metadata_v1 <image_header_hobject> #get image metadata of image format v1, maybe pg epoch is not latest
+do_image_metadata_v2 <image_header_hobject> #get image metadata of image format v2, maybe pg epoch is not latest
+do_image_list #get all images on this osd(image head hobject)
+do_pg_epoch #get all pg epoch and store it in /var/rbd_tool/single_node/node_pg_epoch
+do_omap_list #list all omap headers and omap entries on this osd
+
+
+-FAQ
+file FAQ lists some common confusing cases while testing
diff --git a/src/tools/rbd_recover_tool/TODO b/src/tools/rbd_recover_tool/TODO
new file mode 100644
index 000000000..c36d4c947
--- /dev/null
+++ b/src/tools/rbd_recover_tool/TODO
@@ -0,0 +1,2 @@
+
+1. support clone image
diff --git a/src/tools/rbd_recover_tool/common_h b/src/tools/rbd_recover_tool/common_h
new file mode 100644
index 000000000..f2df662ad
--- /dev/null
+++ b/src/tools/rbd_recover_tool/common_h
@@ -0,0 +1,412 @@
+#!/usr/bin/env bash
+# file: common_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+# admin node init path
+rbd_image=/var/rbd_tool/rbd_image
+database=$rbd_image/database
+image_coll_v1=$rbd_image/image_coll_v1
+image_coll_v2=$rbd_image/image_coll_v2
+pg_coll=$rbd_image/pg_coll
+images=$rbd_image/images
+images_meta=$rbd_image/images_meta
+default_backup_dir=/var/rbd_tool/default_backup_dir
+
+# admin node: image snap & nosnap
+nosnap= #$rbd_image/<image_name>/nosnap
+snap= #rbd_image/<image_name>/<snap_name>
+
+# osd node init path
+job_path=/var/rbd_tool/osd_job
+single_node=/var/rbd_tool/single_node
+
+# osd node vars
+osd_env= #single_node/$cluster$id/osd_env
+osd_data= #/var/lib/ceph/osd/$cluster-$id
+omap_path= #$osd_data/current/omap
+image_list_v1= #single_node/$cluster-$id/image_list_v1
+image_list_v2= #single_node/$cluster-$id/image_list_v2
+image_v1= #$single_node/$cluster-$id/image_v1
+image_v2= #$single_node/$cluster-$id/image_v2
+pgid_list= #$single_node/$cluster-$id/pgid_list
+node_pg_epoch= #$single_node/$cluster-$id/node_pg_epoch
+omap_list= #$single_node/$cluster-$id/omap_list
+
+# admin node config file
+osd_host_path=$my_dir/config/osd_host_path
+osd_host_mapping= #$pwd_path/config/osd_host_mapping # host --> host_remote: by init_env_admin()
+osd_host=$my_dir/config/osd_host #generated by function init_env_admin()
+mon_host=$my_dir/config/mon_host
+mds_host=$my_dir/config/mds_host
+
+# ssh option
+ssh_option="-o ConnectTimeout=1"
+
+# gen md5sum
+# Print the md5 hex digest of "$1" (used to derive per-osd work dir names).
+function gen_md5()
+{
+  # Quote the argument so paths with spaces or glob chars hash as-is;
+  # printf '%s\n' matches the trailing newline the old echo produced.
+  printf '%s\n' "$1" | md5sum | awk '{print $1}'
+}
+
+# on each osd node
+# check ceph environment: ssh, ceph-kvstore-tool, osd_data_path
+# Probe host $1: ssh must be reachable, ceph-kvstore-tool installed, and the
+# osd data directory $2 present.  Exits the whole script on the first failed
+# probe.  Each probe is a cheap command run over ssh.
+function check_ceph_env()
+{
+  local func="check_ceph_env"
+  if [ $# -lt 2 ];then
+    echo "$func: parameters: <node> <data_path>"
+    exit
+  fi
+  local node=$1
+  local data_path=$2
+  local res=
+  local cmd=
+
+  # </dev/null stops ssh from consuming the caller's stdin (the config file
+  # being read by the enclosing while-loop)
+  trap 'echo [$node]: ssh failed; exit' INT HUP
+  ssh -o ConnectTimeout=1 $node "echo -n" </dev/null
+  res=$?
+  if [ $res -ne 0 ];then
+    echo "[$node]: ssh failed"
+    exit
+  fi
+
+  cmd=ceph-kvstore-tool
+  trap 'echo [$node]: $cmd failed; exit' INT HUP
+  ssh -o ConnectTimeout=1 $node "$cmd &>/dev/null;" </dev/null
+  res=$?
+  # ceph-kvstore-tool will return 1 with no parameters input
+  if [ $res -ne 1 ];then
+    echo "[$node]: $cmd not installed"
+    exit
+  fi
+
+  trap 'echo [$node]: stat $data_path failed; exit' INT HUP
+  ssh -o ConnectTimeout=1 $node "stat $data_path &>/dev/null;" </dev/null
+  res=$?
+  if [ $res -ne 0 ];then
+    echo "[$node]: $data_path not exists"
+    exit
+  fi
+}
+
+# osd node context : osd_data_path
+function init_env_osd()
+{
+ local func="init_env_osd"
+ if [ "$1"x = ""x ];then
+ echo "$func: no osd_data_path input"
+ exit
+ fi
+ osd_data=$1
+ omap_path=$osd_data/current/omap
+
+ if [ ! -e $single_node ];then
+ mkdir -p $single_node
+ fi
+
+ local osd_id=`gen_md5 $osd_data`
+ local osd_dir=$single_node/$osd_id
+
+ if [ ! -e $osd_dir ];then
+ mkdir -p $osd_dir
+ fi
+
+ image_list_v1=$osd_dir/image_list_v1
+ image_list_v2=$osd_dir/image_list_v2
+ image_v1=$osd_dir/image_v1
+ image_v2=$osd_dir/image_v2
+ pgid_list=$osd_dir/pgid_list
+ node_pg_epoch=$osd_dir/node_pg_epoch
+ omap_list=$osd_dir/omap_list
+}
+
+# admin node process file: osd_host_path
+# Validate the admin-side config files and (re)generate the derived files
+# osd_host (hostnames only) and osd_host_mapping (config name -> remote
+# `hostname` output) from osd_host_path.  Exits on any malformed line or
+# unreachable host.
+function init_env_admin()
+{
+  local func="init_env_admin"
+  local pwd_path=`pwd`
+  osd_host_mapping=$pwd_path/config/osd_host_mapping
+  if [ ! -s $osd_host_path ];then
+    echo "$func: config/osd_host_path not exists or empty"
+    exit
+  fi
+  if [ ! -e $rbd_image ];then
+    mkdir -p $rbd_image
+  fi
+  if [ ! -e $images ];then
+    mkdir -p $images
+  fi
+
+  if [ ! -s $mon_host ];then
+    echo "$func: config/mon_host not exists or empty"
+    exit
+  fi
+  if [ ! -e $mds_host ];then
+    echo "$func: config/mds_host not exists"
+    exit
+  fi
+
+  # we just judge if osd_host is needed to be updated
+  # (-nt: skip regeneration while osd_host is newer than osd_host_path)
+  if [ -s $osd_host ] && [ $osd_host -nt $osd_host_path ];then
+    return
+  fi
+  echo "$func: create osd_host ..."
+  # create (truncate) files: osd_host and osd_host_mapping
+  >$osd_host
+  >$osd_host_mapping
+  local lines=0
+  local lineno=0
+  while read line
+  do
+    lineno=$(($lineno + 1))
+    if [ "$line"x = ""x ];then
+      continue;
+    fi
+    local node=`echo $line|awk '{print $1}'`
+    if [ "$node"x = ""x ];then
+      echo "$func: osd_host_path : line $lineno: osd hostname not input"
+      rm -rf $osd_host $osd_host_mapping
+      exit
+    fi
+    local data_path=`echo $line|awk '{print $2}'`
+    if [ "$data_path"x = ""x ];then
+      echo "$func: osd_host_path : line $lineno: osd data_path not input"
+      rm -rf $osd_host $osd_host_mapping
+      exit
+    fi
+    lines=$(($lines + 1))
+    # in case : there are several hostnames on the same node
+    # just need output of `hostname`
+    local hostname_alias=
+    hostname_alias=`ssh $ssh_option $node "hostname" 2>/dev/null </dev/null`
+    if [ "$hostname_alias"x = ""x ];then
+      echo "$func: osd_host_path: line $lineno: $node: get remote hostname alias failed"
+      rm -rf $osd_host $osd_host_mapping
+      exit
+    fi
+    echo "$node $hostname_alias" >>$osd_host_mapping
+    echo $node >> $osd_host
+    # check ceph env on remote osd
+    check_ceph_env $node $data_path
+  done < $osd_host_path
+
+  if [ $lines = 0 ];then
+    echo "$func: no osd host path valid"
+    exit
+  fi
+}
+
+# Populate $osd_host (one hostname per line) from $osd_host_path, unless it
+# already has content.  Each line must carry a hostname and a data path;
+# a malformed line aborts the script.
+function admin_parse_osd()
+{
+  local func="admin_parse_osd"
+  if [ -s $osd_host ];then
+    return
+  fi
+  # create (truncate) file: osd_host
+  >$osd_host
+  local valid=0
+  local lineno=0
+  while read line
+  do
+    lineno=$(($lineno + 1))
+    [ "$line"x = ""x ] && continue
+    local node=`echo $line|awk '{print $1}'`
+    if [ "$node"x = ""x ];then
+      echo "$func: osd_host_path : line $lineno: osd_host not input"
+      exit
+    fi
+    local data_path=`echo $line|awk '{print $2}'`
+    if [ "$data_path"x = ""x ];then
+      echo "$func: osd_host_path : line $lineno: osd_data not input"
+      exit
+    fi
+    valid=$(($valid + 1))
+    echo $node >> $osd_host
+  done < $osd_host_path
+}
+
+# for osd node
+# Dump every key in this osd's omap leveldb into $omap_list, one per line.
+# Requires init_env_osd() to have set $omap_path and $omap_list first.
+function get_omap_list()
+{
+  ceph-kvstore-tool $omap_path list > $omap_list
+}
+
+# Escape "_" as "\u" in $1 (rbd omap key encoding).  The sed "gp" flag
+# prints a substituted line a second time, hence "head -n 1" to dedupe.
+function convert_underline()
+{
+  if [ "$1"x = ""x ];then
+    return
+  fi
+
+  echo $1|sed -e 's/_/\\u/gp'|head -n 1
+}
+
+# Escape each backslash once (\ -> \\), for one level of shell re-parsing.
+# Same gp|head idiom as convert_underline to drop sed's duplicate print.
+function dump_backslash()
+{
+  echo $*|sed -e 's/\\/\\\\/gp'|head -n 1
+}
+
+# Escape each backslash twice (\ -> \\\\), for strings that pass through
+# two levels of shell/ssh re-parsing.
+function dump_dump_backslash()
+{
+  echo $*|sed -e 's/\\/\\\\\\\\/gp'|head -n 1
+}
+
+# Encode $1 with the omap key escaping: "_" -> "\u", "." -> "%e", then
+# "%" -> "%p".  NOTE(review): the "%" rule runs last, so it also rewrites
+# the "%e" just produced into "%pe" — confirm this matches the on-disk key
+# escaping before relying on it.
+function char_convert()
+{
+  if [ "$1"x = ""x ];then
+    return
+  fi
+
+  echo $1|sed -e 's/_/\\u/gp' -e 's/\./%e/gp' -e 's/%/%p/gp'|head -n 1
+}
+
+function check_osd_process()
+{
+ local func="check_osd_process"
+ local host=$1
+ if [ "$1"x = ""x ];then
+ exit
+ fi
+ local cmds="ps aux|grep ceph-osd|grep -v grep"
+ local ret=/tmp/ret.$$$$
+ ssh $ssh_option $host $cmds |tee $ret
+ if [ -s $ret ];then
+ echo "$func: [$host] ceph-osd process is not killed"
+ exit
+ fi
+ rm -f $ret
+}
+
+# Constant leveldb key prefix under which hobject map headers are stored.
+function get_map_header_prefix()
+{
+  printf '%s\n' "_HOBJTOSEQ_"
+}
+
+# Find the omap map-header entry matching keyword $1 in $omap_list and print
+# the part after the first ":" (the encoded key).  Exits silently on a miss.
+function get_map_header_key()
+{
+  local func="get_map_header_key"
+  if [ "$1"x = ""x ];then
+    #echo $func': no keyword input'
+    exit
+  fi
+  local keyword=$1
+  # quote the pattern and end option parsing ("--") so keywords containing
+  # spaces or a leading "-" cannot be mis-parsed by grep
+  local res=`grep -- "$keyword" $omap_list`
+  if [ "$res"x = ""x ];then
+    #echo "$func: map_header_key = $keyword not exists"
+    exit
+  fi
+  # NB: $res deliberately unquoted — multiple matches collapse onto one
+  # line and only field 2 of the ":"-split is printed
+  echo $res|awk -F ":" '{print $2}'
+}
+
+# Print (in decimal) the omap header sequence number stored under
+# <prefix=$1, key=$2>.  The value is a little-endian uint64 inside the
+# ceph-kvstore-tool hex dump; the awk loop below reverses its 8 bytes into
+# big-endian hex, which is then converted via $((16#...)).
+function get_header_seq()
+{
+  local func="get_header_seq"
+  if [ "$1"x == ""x ];then
+    #echo "$func: no prefix input"
+    exit;
+  elif [ "$2"x == ""x ];then
+    #echo "$func: no key input"
+    exit;
+  fi
+  local prefix=$1;
+  local key=$2;
+  # temp file for the raw kvstore dump
+  local res=/tmp/header_seq.$$$$
+
+  ceph-kvstore-tool $omap_path get $prefix $key 2>/dev/null 1>$res
+  if [ $? != 0 ]; then
+    #echo "$func: <$prefix , $key> not exists" ;
+    exit;
+  fi
+
+  # ceph-kvstore-tool get result like this:
+  # 02 01 7e 00 00 00 12 44 00 00 00 00 00 00 00 00
+  # get header seq bytes:
+  # 12 44 00 00 00 00 00 00
+  # -> 00 00 00 00 00 00 44 12
+  # echo $((16#0000000000004412)) -> 17426 == header_seq
+  # (i=7: the seq starts at the 7th hex column of the dump line)
+  local seq=`cat $res |head -n 2|tail -n 1| \
+  awk '
+  BEGIN {
+    FS=":"
+    seq="";
+    i=7;
+  } {
+    split($2, arr, " ")
+    # header_seq uint64 : 8 bytes
+    for (x=7; x>=0; --x) {
+      seq=seq""arr[i+x];
+    }
+  }
+  END {
+    print seq
+  }'`
+  if [ "$seq"x = ""x ];then
+    #echo "$func: get <$prefix , $key> failed"
+    exit;
+  fi
+  rm -f $res
+  echo $((16#$seq))
+}
+
+# get header info key/value
+# Fetch <prefix=$1, key=$2> from the osd's omap leveldb and decode the value
+# as either a string ($3 = "string") or a little-endian integer ($3 = "int"),
+# printing the decoded result on stdout.  Exits silently on any failure.
+function get_header_kv()
+{
+  local func="get_header_kv"
+  if [ "$1"x = ""x ];then
+    #echo "$func: no prefix input"
+    exit
+  elif [ "$2"x = ""x ];then
+    #echo "$func: no key input"
+    exit
+  elif [ "$3"x != "string"x ] && [ "$3"x != "int"x ];then
+    #echo "$func: no valid type input, use type (string|int)"
+    exit
+  fi
+
+  local prefix=$1
+  local key=$2
+  local types=$3
+  local res=/tmp/kv.$$$$
+
+  ceph-kvstore-tool $omap_path get $prefix $key 2>/dev/null 1>$res
+  if [ $? != 0 ];then
+    #echo "$func: <$prefix , $key> not exists"
+    exit
+  fi
+
+  if [ "$types"x = "string"x ];then
+    # take the ascii column of the hexdump and strip its first 4 chars
+    # (presumably a length header — confirm against kvstore-tool output)
+    local value=`cat $res |tail -n +2|head -n -1|awk -F ": " '{printf $3}'|sed -n 's/^\.\{4\}//p'`
+    echo $value
+  elif [ "$types"x = "int"x ];then
+    # reverse the little-endian byte order, then convert from hex below
+    local value=`cat $res |tail -n +2|head -n -1| \
+    awk '
+    BEGIN{
+      FS=":"
+    } {
+      split($2, arr, " ");
+      len=length(arr)
+      for (i=len; i>0; --i) {
+        printf arr[i];
+      }
+    }'`
+    echo $((16#$value))
+  fi
+  rm -f $res
+}
diff --git a/src/tools/rbd_recover_tool/config/mds_host b/src/tools/rbd_recover_tool/config/mds_host
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/tools/rbd_recover_tool/config/mds_host
diff --git a/src/tools/rbd_recover_tool/config/mon_host b/src/tools/rbd_recover_tool/config/mon_host
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/tools/rbd_recover_tool/config/mon_host
diff --git a/src/tools/rbd_recover_tool/config/osd_host_path b/src/tools/rbd_recover_tool/config/osd_host_path
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/tools/rbd_recover_tool/config/osd_host_path
diff --git a/src/tools/rbd_recover_tool/database_h b/src/tools/rbd_recover_tool/database_h
new file mode 100644
index 000000000..4ff20425a
--- /dev/null
+++ b/src/tools/rbd_recover_tool/database_h
@@ -0,0 +1,1134 @@
+#!/usr/bin/env bash
+# file: database_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+
+db_image_prefix=
+db_image_size=
+db_order=
+db_snap_id=
+db_snap_image_size=
+found=0
+
+#init osd_data and get all objects path
+# Scans every osd listed in $osd_host_path (one "<host> <data_path>" pair per
+# line) in parallel over ssh, writing the full file listing of each osd's
+# current/ directory to $database/<host>. Previous results in $database,
+# $images and $raw are discarded first.
+function gen_database()
+{
+ local func="gen_database"
+ rm -rf $database/*
+ rm -rf $images
+ rm -rf $raw
+ mkdir -p $database
+ local host=
+ local data_path=
+
+ trap 'echo $func failed; exit;' INT HUP
+ # one background ssh job per osd host; 'wait' below is the barrier
+ while read line
+ do
+ {
+ host=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ # skip malformed/blank config lines
+ if [ "$host"x = ""x ] || [ "$data_path"x = ""x ];then
+ continue
+ fi
+ local cmds="find $data_path/current -type f"
+ ssh $ssh_option $host $cmds > $database/$host
+ } &
+ done < $osd_host_path
+ wait
+ echo "$func: finish"
+}
+
+# collect hobjects from database
+# and choose the object whose epoch is latest
+# then, sort the objects by their offsets in image
+function gather_hobject_common()
+{
+ func="gather_hobject_common"
+
+ trap 'echo $func failed; exit;' INT HUP
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <pool_id> <image_prefix> [<snap_id>]"
+ exit
+ fi
+
+ local pool_id=$1
+ local image_prefix=$2
+ pool_id=$(($pool_id))
+ local hex_pool_id=`printf "%x" $pool_id`
+ # NOSNAP = uint64(-2)
+ local snap_id=`printf "%u" -2`
+ local hex_snap_id="head"
+ local psuffix=
+ local fsuffix="_head"
+ if [ $# = 3 ];then
+ snap_id=$(($3))
+ hex_snap_id=`printf "%x" $snap_id`
+ psuffix="_"$snap_id
+ fsuffix="_"$snap_id
+ fi
+ local underline_image_prefix=`convert_underline $image_prefix`
+ local dump_image_prefix=`dump_backslash $underline_image_prefix`
+ local ddump_image_prefix=`dump_dump_backslash $underline_image_prefix`
+ local images_raw_dir=$rbd_image/raw
+ local image_hobjects_dir=$images/pool_$pool_id/$image_prefix
+ # $images/raw/$image_prefix"_head"
+ local image_hobjects_raw=$images_raw_dir/$image_prefix"$fsuffix"
+ # $images/$image_prefix/$image_prefix"_head"
+ local image_hobjects_stable=$image_hobjects_dir/$image_prefix"$fsuffix"
+
+ if [ ! -e $images_raw_dir ];then
+ mkdir -p $images_raw_dir
+ fi
+ if [ ! -e $image_hobjects_dir ];then
+ local image_metadata=$images_meta/$image_name_in
+ mkdir -p $image_hobjects_dir
+ fi
+
+ pushd $database >/dev/null
+ local pattern="\.[0-9a-f]+__"$hex_snap_id"_[0-9A-F]{8}__"$hex_pool_id
+ >$image_hobjects_raw
+ grep -r -E $dump_image_prefix""$pattern * >$image_hobjects_raw
+ if [ ! -s $image_hobjects_raw ];then
+ echo "$func: image snap [ $image_prefix"$psuffix" ] is empty"
+ return 1 #no data available
+ fi
+ popd >/dev/null
+
+ local offset_dir_temp=$images_raw_dir/$image_prefix"$fsuffix""_dir_temp"
+ rm -rf $offset_dir_temp
+ mkdir -p $offset_dir_temp
+
+ echo "gather hobjects from database: snapid=$snap_id ..."
+
+ # format: ceph2:/var/lib/ceph/osd/ceph-1/current/2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2
+ local tmp_image=$offset_dir_temp/tmpimage.$$$$
+ >$tmp_image
+ cat $image_hobjects_raw |
+ awk -F ':' '
+ BEGIN {
+ pg_coll="'$pg_coll'"
+ tmp_image="'$tmp_image'"
+ osd_host_mapping="'$osd_host_mapping'"
+ snapid="'$snap_id'"
+ }{
+ # $2 = /var/lib/ceph/osd/ceph-1/current/2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2
+
+ split($2, arr1, "/current/"); # {/var/lib/ceph/osd/ceph-1/, 2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2}
+ split(arr1[2], arr2, "/"); # {2.d3_head, rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2}
+ split(arr2[1], arr3, "_head"); # {2.d3,}
+
+ hobject=$2;
+ data_path=arr1[1];
+ gsub(/\\u/, "\\\\\\\\u", hobject); # dump backslash to delay escape (\ -> \\)
+ "awk \"\\$1 == \\\""$1"\\\" {print \\$2}\" "osd_host_mapping" | head -n 1" | getline node
+ pgid = arr3[1];
+
+ len=length(arr2);
+ offset_hobject=arr2[len] # rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2
+ split(offset_hobject, offarr1, "."); # {rb, 0, 1293, 6b8b4567, 000000000002__head_FB425CD3__2}
+ len1=length(offarr1)
+ offset_p=offarr1[len1] # 000000000002__head_FB425CD3__2
+ split(offset_p, offarr2, "__"); # {000000000002, head_FB425CD3, 2}
+ offset=offarr2[1]; # 000000000002
+
+ system("echo -n \""node" "pgid" "hobject" "offset" "snapid" \" >>"tmp_image);
+ #system("echo -n \""node" "pgid" "hobject" "offset" "snapid" \"");
+ #print node" "pgid" "hobject" "offset" "snapid
+
+ # find pg_epoch from pg_coll database
+ system("awk \"\\$1 == \\\""node"\\\" && \\$2 == \\\""pgid"\\\" && \\$4 == \\\""data_path"\\\" {print \\$3}\" "pg_coll" >>"tmp_image);
+ #system("awk \"\\$1 == \\\""node"\\\" && \\$2 == \\\""pgid"\\\" && \\$4 == \\\""data_path"\\\" {print \\$3}\" "pg_coll);
+ }'
+
+ local sort_image=$offset_dir_temp/sortimage.$$$$
+ >$sort_image
+ sort -t ' ' -k 4.1,4 -k 6.1nr -k 1.1,1 $tmp_image >$sort_image
+ sort -t ' ' -k 4.1,4 -u $sort_image > $image_hobjects_stable
+
+ #rm -rf $offset_dir_temp
+ return 0
+}
+
+function gather_hobject_nosnap()
+{
+ gather_hobject_common $1 $2
+}
+
+function gather_hobject_snap()
+{
+ gather_hobject_common $1 $2 $3
+}
+
+# select the max pg_epoch item of the same $field
+# if no same $field, choose the first
+# format : "node $field pg_epoch"
+function choose_epoch()
+{
+ cat $1|sort -t ' ' -k 3.1,3nr -k 2.1,2n |head -n 1;
+}
+
+# lookup image info , after scatter_node_jobs & gather_node_infos
+function lookup_image()
+{
+ local func="lookup_image"
+ if [ $# -lt 2 ];then
+ echo "$func: parameters error <pool_id> <image_name> [<snap_name>]"
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_name=$3
+ pool_id=$((pool_id))
+ echo -e "$func: pool_id = $pool_id\timage_name = $image_name\tsnap_name = $snap_name"
+ if [ $pool_id -lt 0 ];then
+ echo "$func: pool_id must great than zero"
+ exit
+ fi
+ local hex_pool_id=`printf "%x" $pool_id`
+ input_image $image_name
+ local node=
+ local item=/tmp/item.$$$$
+ local img_name=`dump_backslash $image_name`
+
+ local image_format=0
+ local image_id_hobject=
+ local image_header_hobject=
+ local result=/tmp/tmp_result.$$$$
+ local res1=/tmp/tmp_res1.$$$$
+ local res2=/tmp/tmp_res2.$$$$
+ local data_path=
+
+ # image format v1
+ {
+ cat $image_coll_v1|grep -E "/$img_name\.rbd__head_[0-9A-F]{8}__$hex_pool_id" >$res1
+ if [ -s $res1 ];then
+ echo -n "$func: rbd_header_hobject = "
+ choose_epoch $res1| tee $item
+ #choose_epoch $res1 > $item
+
+ if [ -e $item ];then
+ node=`cat $item|awk '{print $1}'`
+ image_header_hobject=`cat $item|awk '{print $2}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: v1 node is NULL"
+ exit
+ fi
+ if [ "$image_header_hobject"x = ""x ];then
+ echo "$func: v1 image_header_hobject is NULL"
+ exit
+ fi
+ rm -f $item
+ fi
+
+ image_format=1
+ echo -e "image_name:\t$image_name_in"
+ echo -e "image_format:\t$image_format"
+ data_path=`echo $image_header_hobject|awk -F "/current" '{print $1}'`
+
+ >$result
+ cmds="bash $job_path/osd_job do_image_metadata_v1 $data_path `dump_backslash $image_header_hobject` $snap_name"
+ ssh $ssh_option $node $cmds | tee $result
+ fi
+ }
+
+ # image format v2
+ {
+ cat $image_coll_v2|grep -E "/rbd\\\\uid\."$img_name"__head_[0-9A-F]{8}__$hex_pool_id" >$res2
+ if [ -s $res2 ];then
+ echo -n "$func: rbd_id_hobject = "
+ choose_epoch $res2 | tee $item
+ #choose_epoch $res2 > $item
+
+ if [ -e $item ];then
+ node=`cat $item|awk '{print $1}'`
+ image_id_hobject=`cat $item|awk '{print $2}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: v2 node is NULL(to get image_id_hobject)"
+ exit
+ fi
+ if [ "$image_id_hobject"x = ""x ];then
+ echo "$func: v2 image_id_hobject is NULL"
+ exit
+ fi
+ rm -f $item
+ fi
+
+ check_osd_process $node
+ image_format=2
+
+ local tid=/tmp/image_id.$$$$
+ data_path=`echo $image_id_hobject|awk -F "/current" '{print $1}'`
+ >$tid
+ cmds="bash $job_path/osd_job do_image_id $data_path `dump_backslash $image_id_hobject`"
+ ssh $ssh_option $node $cmds > $tid
+
+ local image_id=`cat $tid`
+ rm -f $tid
+
+ #get image_header_hobject
+ pushd $database >/dev/null
+ local pattern="header\."$image_id"__head_[0-9A-F]{8}__$hex_pool_id"
+ local tcoll=/tmp/tmp_image_head_coll.$$$$
+
+ # hostname(by command hostname) in $pg_coll maybe different from hostname in tcoll(input by user)
+ # t_host: hostname read from config file ($tcoll)
+ # t_host_remote: $(hostname) on osd node ($pg_coll)
+ grep -r -E $pattern * >$tcoll
+ popd >/dev/null
+
+ local t_host=(`cat $tcoll|awk -F ":" '{print $1}'`)
+ local t_pgid=(`cat $tcoll|awk -F ":" '{print $2}'|sed -n 's/.*\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head\/.*/\1/p'`)
+ local t_hobject=(`cat $tcoll|awk -F ":" '{print $2}'`)
+ local t_data_path=(`cat $tcoll|awk -F ":" '{split($2, arr, "/current/"); print arr[1];}'`)
+ rm -f $tcoll
+ declare -a t_host_remote
+
+ #if there is no failed pg migration, number of t_host is replica num
+ #replica num : 3, 4, 5 ...
+ local t_hostname=/tmp/t_hostname.$$$$
+ for ((i=0; i<${#t_host[*]}; i++))
+ do
+ ssh $ssh_option ${t_host[$i]} "hostname" >$t_hostname
+ if [ $? != 0 ];then
+ echo "$func: ${t_host[$i]} get host_remote failed"
+ exit
+ fi
+ t_host_remote[$i]=`cat $t_hostname`
+ done
+ rm -f $t_hostname
+
+ local t_item=/tmp/tmp_item.$$$$
+ local tmp_item=/tmp/tmp_tmp_item.$$$$
+
+ >$tmp_item
+ for ((i=0; i<${#t_host_remote[*]}; i++ ))
+ do
+ local node=${t_host_remote[$i]}
+ local pgid=${t_pgid[$i]}
+ awk '$1 == "'"$node"'" && $2 == "'"$pgid"'" {print}' $pg_coll >>$tmp_item
+ done
+
+ # t_item: <remote_hostname> <pgid> <epoch> <data_path>
+ sort -u $tmp_item >$t_item
+ rm -f $tmp_item
+
+ local entry=`choose_epoch $t_item` #t_host_remote
+ rm -f $t_item
+
+ node=`echo $entry|awk '{print $1}'`
+ data_path=`echo $entry|awk '{print $4}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: v2 node is NULL (to get image_header_hobject)"
+ exit
+ fi
+
+ for ((i=0; i<${#t_host_remote[*]}; i++))
+ do
+ if [ "${t_host_remote[$i]}"x = "$node"x ] && [ "${t_data_path[$i]}"x = "$data_path"x ];then
+ image_header_hobject=${t_hobject[$i]}
+ break
+ fi
+ done
+
+ if [ "$image_id_hobject"x = ""x ];then
+ echo "$func: v2 image_header_hobject is NULL"
+ exit
+ fi
+
+ check_osd_process $node
+
+ echo "$func: rbd_header_hobject = $node $image_header_hobject"
+ echo -e "image_name:\t$image_name_in"
+ echo -e "image_format:\t$image_format"
+
+ #data_path=`echo $image_header_hobject|awk -F "/current" '{print $1}'`
+ >$result
+ cmds="bash $job_path/osd_job do_image_metadata_v2 $data_path $image_id `dump_backslash $image_header_hobject` $snap_name"
+ ssh $ssh_option $node $cmds | tee $result
+ fi
+ }
+
+ if [ ! -s $result ];then
+ echo "$func: $image_name_in not exists"
+ exit
+ fi
+
+ # to assign value to global variable
+ db_image_prefix=`cat $result|awk '/^(object_prefix|block_name):/{print $2}'`
+ if [ "$db_image_prefix"x = ""x ];then
+ echo "$func: image_prefix is NULL"
+ exit
+ fi
+
+ db_image_size=`cat $result|awk '/^image_size:/{print $2}'`
+ db_order=`cat $result|awk '/^order:/{print $2}'`
+ if [ "$snap_name"x != ""x ];then
+ db_snap_id=`cat $result|awk '/^snapshot:/{print $2}'`
+ if [ "$db_snap_id"x = ""x ];then
+ echo "$func: $image_name_in@$snap_name NOT EXISTS"
+ exit
+ fi
+ db_snap_image_size=`cat $result|awk '/^snapshot:/{print $4}'`
+ else
+ #save snaplist
+ local image_snaplist=$images/pool_$pool_id/$image_name_in/@snaplist
+ local image_dir=$images/pool_$pool_id/$image_name_in
+ if [ ! -e $image_dir ];then
+ mkdir -p $image_dir
+ fi
+ cat $result|awk '/^snapshot:/{print $2" "$3" "$4}' >$image_snaplist
+ fi
+ found=1
+ rm -f $result
+}
+
+# Print every image found in the gathered collections, one per line as
+# "<pool_id>/<image_name>", for both rbd format v1 ($image_coll_v1, objects
+# named <name>.rbd) and format v2 ($image_coll_v2, objects named
+# rbd\uid.<name>). The trailing sed turns the \u escape back into '_'.
+function list_images()
+{
+ echo "=============== format =============="
+ echo "format: <pool_id>/<image_name>"
+ echo "================ v1: ================"
+ #sed -n 's/\(.*\)\/\(.*\)\.rbd__\(.*\)/\2/p' $image_coll_v1|sort -u|sed -e 's/\\u/_/g'
+ # capture (name, hex pool id) from the v1 header object path, then render
+ # the pool id in decimal via awk strtonum
+ sed -n 's/.*\/\(.*\)\.rbd__head_[0-9A-F]\{8\}__\([0-9a-f]\+\).*/\2 \1/p' $image_coll_v1|sort -u|awk '{print strtonum("0x"$1)"/"$2;}'|sed -e 's/\\u/_/g'
+ echo "================ v2: ================"
+ #sed -n 's/\(.*\)\/rbd\\uid.\(.*\)__\(head.*\)/\2/p' $image_coll_v2|sort -u|sed 's/\\u/_/g'
+ sed -n 's/.*\/rbd\\uid.\(.*\)__head_[0-9A-F]\{8\}__\([0-9a-f]\+\).*/\2 \1/p' $image_coll_v2|sort -u|awk '{print strtonum("0x"$1)"/"$2}'|sed 's/\\u/_/g'
+}
+
+# lookup image metadata
+# and
+# collect hobjects of image with the latest pg epoch
+# discover_image_nosnap <pool_id> <image_name>
+# Resolves image metadata (sets db_image_prefix via lookup_image), gathers
+# the head hobjects, and renames the result from the internal
+# <image_prefix> directory to the user-facing <image_name_in> directory.
+function discover_image_nosnap()
+{
+ local func="discover_image_nosnap"
+ echo "$func ..."
+ local pool_id=$1
+ local image_name=$2
+ pool_id=$(($pool_id))
+ lookup_image $pool_id $image_name # assign $image_prefix
+ gather_hobject_nosnap $pool_id $db_image_prefix
+ if [ $? -ne 0 ];then
+ # no objects found for this image; nothing to recover
+ exit
+ fi
+ local image_hobjects_stable_nosnap=$images/pool_$pool_id/$db_image_prefix/$db_image_prefix"_head"
+ local image_hobjects_dir=$images/pool_$pool_id/$image_name_in
+ if [ ! -e $image_hobjects_dir ];then
+ mkdir -p $image_hobjects_dir
+ fi
+ # mv image_prefix to image_name
+ mv $image_hobjects_stable_nosnap $image_hobjects_dir/$image_name_in
+ rm -rf $images/pool_$pool_id/$db_image_prefix
+}
+
+# get the offset snapid object
+# if there is no object, choose the smallest snapid which is greater than current snapid
+function get_object_clone()
+{
+ local func="get_object_clone"
+ if [ $# -lt 4 ];then
+ exit
+ fi
+
+ local object_offset_string=$1
+ local snapid=$2
+ local snaplist_path=$3
+ local snapset_output_dir=$4
+
+ # snapid in desc
+ local snap_coll_arr=(`
+ cat $snaplist_path|awk '{ if ($1 >= '"$snapid"') print "'"$snapset_output_dir"'/@"$1}'`)
+
+ local hex_snapid=`printf "%x" $snapid`
+ pushd $snapset_output_dir >/dev/null
+ # get object with the smallest snapid greater than current snapid
+ awk '$4 == "'"$object_offset_string"'" && $5 >= '$snapid' {print}' `echo ${snap_coll_arr[@]}` |tail -n 1
+ popd >/dev/null
+}
+
+# gather hobject for each snapid
+function gen_snapset_hobject()
+{
+ local func="gen_image_snapset"
+ echo "$func ..."
+ if [ $# -lt 4 ];then
+ echo "$func: parameters: <pool_id> <image_prefix> <snaplist_path> <snapset_output_dir>"
+ exit
+ fi
+ local pool_id=$1
+ local image_prefix=$2
+ local snaplist_path=$3
+ local snapset_output_dir=$4
+ pool_id=$(($pool_id))
+ OIFS=$IFS
+ IFS=$'\n'
+ local snaparr=(`cat $snaplist_path`)
+ # gather hobject for each snapshot
+ trap 'echo $func failed; exit;' INT HUP
+ for line in ${snaparr[@]}
+ do
+ OOIFS=$IFS
+ IFS=$' '
+ local field=(`echo $line`)
+ local snapid=${field[0]}
+ local image_hobjects_stable_snap=$images/pool_$pool_id/$image_prefix/$image_prefix"_"$snapid
+ local image_snap=$snapset_output_dir/@$snapid
+ gather_hobject_snap $pool_id $image_prefix $snapid
+ local res=$?
+ if [ $res -ne 0 ];then
+ touch $image_snap
+ else
+ mv $image_hobjects_stable_snap $image_snap
+ fi
+ IFS=$OOIFS
+ done
+ IFS=$OIFS
+}
+
+# lookup image metadata and get snapid hobjects
+# discover_image_snap <pool_id> <image_name> <snap_name>
+# Resolves the snapshot id (db_snap_id via lookup_image), builds the
+# per-snapshot object sets once, then for every offset in the recovered
+# head image picks the object clone serving this snapshot. Produces
+# <image>@<snapid> (clone entries) and <image>@<snapid>@head (matching
+# head entries). Requires the head image to be recovered first.
+function discover_image_snap()
+{
+ local func="discover_image_snap"
+ echo "$func ..."
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <pool_id> <image_name> [<snap_name>]"
+ exit
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_name=$3
+ pool_id=$(($pool_id))
+ #mkdir -p $images/$image_prefix
+ lookup_image $pool_id $image_name $snap_name # input image_name and snap_name to lookup metadata and snap_id
+ if [ "$db_snap_id"x = ""x ];then
+ echo "$func: lookup image failed to gen snapid"
+ exit
+ fi
+ local image_hobjects_dir_prefix=$images/pool_$pool_id/$db_image_prefix
+ local image_nosnap=$images/pool_$pool_id/$image_name_in
+ #check if image nosnap recovered
+ if [ ! -s $image_nosnap ];then
+ echo "$func: please recover image nosnap before recover with snap"
+ rm -rf $image_hobjects_dir_prefix
+ exit
+ fi
+ local image_hobject_dir=$images/pool_$pool_id/$image_name_in
+ local image_snap_hobject=$image_hobject_dir/$image_name_in@$db_snap_id
+ local image_snap_hobject_head=$image_hobject_dir/$image_name_in@$db_snap_id@head
+ local image_snaplist=$image_hobject_dir/@snaplist
+ local image_snapset_dir=$image_hobject_dir/@snapset_dir
+ local image_head=$image_hobject_dir/$image_name_in
+ if [ ! -e $image_hobject_dir ];then
+ mkdir -p $image_hobject_dir
+ fi
+ # only gen snapset one time
+ if [ ! -e $image_snapset_dir ];then
+ mkdir -p $image_snapset_dir
+ gen_snapset_hobject $pool_id $db_image_prefix $image_snaplist $image_snapset_dir
+
+ fi
+
+ echo "$func: will get object clone ..."
+ >$image_snap_hobject
+ >$image_snap_hobject_head
+
+ trap 'echo $func failed; exit;' INT HUP
+ # get each offset 's snapid hobject
+ while read line
+ do
+ #echo $line
+ OOIFS=$IFS
+ IFS=$' '
+ local field=(`echo $line`)
+ local offset_string=${field[3]}
+ IFS=$OOIFS
+ local entry=`get_object_clone $offset_string $db_snap_id $image_snaplist $image_snapset_dir`
+ if [ "$entry"x != ""x ];then
+ echo $entry >> $image_snap_hobject
+ echo `dump_backslash $line` >> $image_snap_hobject_head
+ fi
+ done < $image_head
+ rm -rf $image_hobjects_dir_prefix
+}
+
+# after discover_image_nosnap
+# collect objects from osds one by one in sequence
+# copy_image_nosnap_single_thread <pool_id> <image_hobjects> <backup_dir>
+# Streams every object listed in <image_hobjects> from its osd over ssh and
+# dd's it into a sparse image file at its offset. Uses a @LOCK file to keep
+# concurrent runs off the same image and @CURRENT to record the state.
+# Relies on globals set by lookup_image: found, db_image_size, db_order.
+function copy_image_nosnap_single_thread()
+{
+ local func="copy_image_nosnap_single_thread"
+ echo "$func ..."
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <pool_id> <image_hobjects> <backup_dir>"
+ exit
+ fi
+ local pool_id=$1
+ local image_hobjects=$2
+ local backup_dir=$3
+ pool_id=$(($pool_id))
+
+ # make sure lookup_image first
+ if [ $found = 0 ];then
+ echo "$func: image not found, maybe forget to discover_image"
+ exit
+ fi
+ if [ ! -e $backup_dir ];then
+ mkdir -p $backup_dir
+ fi
+
+ local image_dir=$backup_dir/pool_$pool_id/$image_name_in
+ local image_file=$image_dir/$image_name_in
+ local CURRENT=$image_dir/@CURRENT
+ local LOCK=$image_dir/@LOCK
+ if [ ! -e $image_dir ];then
+ mkdir -p $image_dir
+ fi
+ # NOTE(review): existence-check-then-touch is not an atomic lock; fine for
+ # interactive use, racy if two runs start simultaneously — confirm acceptable.
+ if [ -e $LOCK ];then
+ echo "$func: $LOCK is locked by other process"
+ exit
+ else
+ touch $LOCK
+ fi
+
+ # sparse file of the full image size; objects are dd'ed in at their offsets
+ >$image_file
+ truncate -s $db_image_size $image_file
+ echo "head">$CURRENT
+
+ local count=$(($db_image_size >> $db_order))
+ local start=`cat $image_hobjects|head -n 1|awk '{print $4}'`
+ local end=`cat $image_hobjects|tail -n 1|awk '{print $4}'`
+ local entry_count=`cat $image_hobjects|wc -l`
+
+ # width of the hex offset field, used to render expected range
+ local char_bits=$((`echo $start|wc -c` -1 ))
+ local format="%0"$char_bits"x"
+
+ local expect_start=`printf $format 0`
+ local expect_end=`printf $format $(($count -1 ))`
+
+ echo -e "object_count\t$entry_count"
+ echo -e "expect\t\t[$expect_start ~ $expect_end] count:$count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ local icount=0
+ local istart=
+ local iend=
+ local percent=
+
+ trap 'echo $func failed; exit;' INT HUP
+ # object size in bytes (1 << order); also the dd block size
+ local unit=$((1<<$db_order))
+ while read line
+ do
+ {
+ icount=$(($icount+1))
+ node=`echo $line|awk '{print $1}'`
+ hobject=`echo $line|awk '{print $3}'`
+ offset=`echo $line|awk '{print $4}'`
+ off=$((16#$offset))
+ if [ $icount = 1 ];then
+ istart=$offset
+ fi
+ hobject=`dump_backslash $hobject`
+ iend=$offset
+ sshcmd="cat $hobject"
+ # </dev/null keeps ssh from eating the loop's stdin
+ ssh $ssh_option $node $sshcmd < /dev/null | dd of=$image_file bs=$unit seek=$off conv=notrunc 2>/dev/null
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # backport most recent cursor
+ fi
+ }
+ done < $image_hobjects
+
+ echo
+ echo -n "size: "
+ ls -lh $image_file|awk '{print $5"\t"$9}'
+ echo -n "du: "
+ du -h $image_file
+ #unlock
+ rm -f $LOCK
+}
+
+
+# ssh copy snap_object & head_object from osd to admin node
+# copy all snapshot objects
+# and
+# all head objects which have the same offset as snapshot objects
+function collect_image_snap_objects()
+{
+ local func="collect_image_snap_objects"
+ #$1=backup_dir, $2=snap_name, $3=snap_hobjects, $4=head_hobjects
+ if [ $# -lt 6 ];then
+ echo "$func: parameters: <pool_id> <image_name> <snap_id> <snap_hobjects> <head_hobjects> <backup_dir>"
+ exit
+ fi
+
+ local pool_id=$1
+ local image_name=$2
+ local snap_id=$3
+ local snap_hobjects=$4 #snap hobjects info
+ local head_hobjects=$5 #head hobjects info
+ local backup_dir=$6
+ pool_id=$(($pool_id))
+
+ local head_dir=$backup_dir/pool_$pool_id/$image_name/@head
+ local snap_dir=$backup_dir/pool_$pool_id/$image_name/@$snap_id
+ local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT
+
+ if [ ! -e $head_dir ];then
+ mkdir -p $head_dir
+ fi
+ if [ ! -e $snap_dir ];then
+ mkdir -p $snap_dir
+ fi
+
+ local snap_node= #osd node
+ local snap_hobject= #hobject path with snapid on osd
+ local snap_offset=
+ local snap_filename=
+
+ local head_node=
+ local head_hobject=
+ local head_offset=
+ local head_filename=
+
+ # ignore if there is no object in snapshot(empty )
+ if [ ! -s $snap_hobjects ];then
+ echo "$func: $snap_hobjects is empty"
+ return 0
+ fi
+ local start=`head -n 1 $snap_hobjects|awk '{print $4}'`
+ local end=`tail -n 1 $snap_hobjects|awk '{print $4}'`
+ local entry_count=`cat $snap_hobjects|wc -l`
+ if [ $((16#$first_offset)) -gt $((16#$last_offset)) ];then
+ echo "$func: $snap_hobjects not sorted"
+ return 1
+ fi
+
+ # just assert if ignored empty snapshot
+ if [ "$start"x = ""x ] || [ "$end"x = ""x ];then
+ return 1
+ fi
+
+ # speed up copy snapshot
+ # lookup the corresponding head hobject of snap hobject
+ # use command: grep <offset> <head hobjects>
+ #
+ # eg.
+ # head hobjects: (32 objects, snapid = uint64(-2) = 18446744073709551614)
+ # ceph1 29.4d /var/lib/ceph/osd/ceph-0/current/29.4d_head/rb.0.1c414.6b8b4567.000000000000__head_EC2C1C4D__1d 000000000000 18446744073709551614 869
+ # ceph1 29.8c /var/lib/ceph/osd/ceph-0/current/29.8c_head/rb.0.1c414.6b8b4567.000000000001__head_0F439A8C__1d 000000000001 18446744073709551614 867
+ # ceph1 29.6a /var/lib/ceph/osd/ceph-0/current/29.6a_head/rb.0.1c414.6b8b4567.000000000002__head_FC55706A__1d 000000000002 18446744073709551614 869
+ # ceph1 29.8b /var/lib/ceph/osd/ceph-0/current/29.8b_head/rb.0.1c414.6b8b4567.000000000003__head_20A6328B__1d 000000000003 18446744073709551614 869
+ # ceph2 29.75 /var/lib/ceph/osd/ceph-1/current/29.75_head/rb.0.1c414.6b8b4567.000000000004__head_AC5ADB75__1d 000000000004 18446744073709551614 867
+ # ceph2 29.23 /var/lib/ceph/osd/ceph-1/current/29.23_head/rb.0.1c414.6b8b4567.000000000005__head_1FDEA823__1d 000000000005 18446744073709551614 867
+ # ......
+ # ceph1 29.34 /var/lib/ceph/osd/ceph-0/current/29.34_head/rb.0.1c414.6b8b4567.00000000001f__head_52373734__1d 00000000001f 18446744073709551614 869
+ #
+ # snap hobjects: (3 objects, snapid >= 29)
+ # ceph1 29.8c /var/lib/ceph/osd/ceph-0/current/29.8c_head/rb.0.1c414.6b8b4567.000000000001__1f_0F439A8C__1d 000000000001 31 867
+ # ceph1 29.6a /var/lib/ceph/osd/ceph-0/current/29.6a_head/rb.0.1c414.6b8b4567.000000000002__1e_FC55706A__1d 000000000002 30 869
+ # ceph1 29.8b /var/lib/ceph/osd/ceph-0/current/29.8b_head/rb.0.1c414.6b8b4567.000000000003__1d_20A6328B__1d 000000000003 29 869
+ #
+ # so find out offset in head hobjects line number:
+ # snap hobjects: 000000000001 ---> head hobjects: 2 (n1)
+ # snap hobjects: 000000000003 ---> head hobjects: 4 (n2)
+ #
+ # finally , grep range from the whole file [1 ~ N] shranked to part of file [n1 ~ n2]
+ # the worst case : [n1 ~ n2] = [1 ~ N], means no shranking
+
+ # get the line number of the start offset in head hobjects
+ local n1=`grep -n $start $head_hobjects|head -n 1|cut -d ":" -f 1`
+ # get the line number of the end offset in head hobjects
+ local n2=`grep -n $end $head_hobjects|head -n 1|cut -d ":" -f 1`
+
+ local icount=0
+ local istart=
+ local iend=
+ local percent=
+
+ OIFS=$IFS
+ IFS=$'\n'
+
+ #assume file:snap_hobjects is not very large, and can be loaded into memory
+ local snap_arr=(`cat $snap_hobjects`)
+ local snap_tmp=/tmp/snaptmp.$$$$
+
+ # snap_tmp:
+ # consists of snap hobject or head hobject
+ # select lineno range: [n1 ~ n2]
+ head -n $n2 $head_hobjects|tail -n $(($n2-$n1+1)) >$snap_tmp
+
+ echo "copy image snap/head objects from osd ..."
+ echo -e "object_count\t$entry_count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ trap 'echo $func failed; exit;' INT HUP
+ for line in ${snap_arr[*]}
+ do
+ icount=$(($icount+1))
+
+ OOIFS=$IFS
+ IFS=$' '
+
+ local arr=(`echo $line`)
+ snap_node=${arr[0]}
+ snap_hobject=${arr[2]}
+ snap_offset=${arr[3]}
+ snap_filename=$snap_dir/$snap_offset
+
+ if [ $icount = 1 ];then
+ istart=$snap_offset
+ fi
+ iend=$snap_offset
+
+ #lookup corresponding head hobject of snap hobject
+ local res=`grep $snap_offset $snap_tmp|head -n 1`
+ if [ "$res"x = ""x ];then
+ echo "$func: image object[ $snap_offset ] missing"
+ exit
+ fi
+
+ local arr2=(`echo $res`)
+ head_node=${arr2[0]}
+ head_hobject=${arr2[2]}
+ head_offset=${arr2[3]}
+ head_filename=$head_dir/$head_offset
+
+ # just copy object(snap/head) if it does not exist
+ if [ ! -e $snap_filename ];then
+ ssh $ssh_option $snap_node "cat $snap_hobject" > $snap_filename
+ fi
+ if [ ! -e $head_filename ];then
+ ssh $ssh_option $head_node "cat $head_hobject" > $head_filename
+ fi
+ IFS=$OOIFS
+
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # backport most recent cursor
+ fi
+ done
+ echo
+ IFS=$OIFS
+ rm -f $snap_tmp
+ return 0
+}
+
+# copy all snap objects and corresponding head objects from osds
+# in single process
+function copy_image_snap_single_thread()
+{
+ local func="copy_image_snap_single_thread"
+ if [ $# -lt 6 ];then
+ echo "$func: parameters: <pool_id> <image_name> <snap_id> <snap_hobjects> <head_hobjects> <backup_dir>"
+ exit
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_id=$3
+ local snap_hobjects=$4
+ local head_hobjects=$5
+ local backup_dir=$6
+ pool_id=$(($pool_id))
+
+ local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT
+ local LOCK=$backup_dir/pool_$pool_id/$image_name/@LOCK
+ #lock
+ if [ -e $LOCK ];then
+ echo "$func: $LOCK is locked by other process"
+ exit
+ else
+ touch $LOCK
+ fi
+ collect_image_snap_objects $pool_id $image_name $snap_id $snap_hobjects $head_hobjects $backup_dir
+ #unlock
+ rm -f $LOCK
+}
+
+# after all snap objects and necessary head objects are copied,
+# just pick appropriate head objects and snap objects and write them to image
+# in order to rollback image to snapshot
+#
+# init: image is created by copy_image_nosnap_single_thread firstly
+#
+# all output include 3 parts:
+# <image> <head objects> <snap objects>
+#
+# head objects1 --- snap1 objects
+# head objects2 --- snap2 objects
+# image head objects3 --- snap3 objects
+# ......
+# head objectsN --- snapN objects
+#
+# how to rollback:
+# firstly rollback to head, secondly write <snapX objects>
+# head = <image> + <head objects>
+# snap1 = <image> + <head objects> + <snap1 objects>
+# snap2 = <image> + <head objects> + <snap2 objects>
+# snap3 = <image> + <head objects> + <snap3 objects>
+# ......
+# snapN = <image> + <head objects> + <snapN objects>
+#
+# improve rollback:
+# there is intersection of head objects and snapX objects, if snapX objects are not empty
+# and need to deduplicate the intersection.
+# deduplicate steps:
+# - get difference set of head objects and snapX objects
+# - write the difference set objects to image
+# - write the snapX objects to image
+function rollback_image_snap()
+{
+ local func="rollback_image_snap"
+
+ echo "$func ..."
+
+ trap 'echo $func failed; exit;' INT HUP
+ if [ $# -lt 6 ];then
+ echo "$func: parameters <pool_id> <image_name> <snap_id> <snap_object_dir> <backup_dir> <image_unit>"
+ exit
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_id=$3
+ local snap_object_dir=$4
+ local backup_dir=$5
+ local image_unit=$6
+
+ local need_diff_set=0
+
+ local image_path=$backup_dir/pool_$pool_id/$image_name/$image_name
+ local head_object_dir=$backup_dir/pool_$pool_id/$image_name/@head
+ local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT
+ local LOCK=$backup_dir/pool_$pool_id/$image_name/@LOCK
+ if [ -e $LOCK ];then
+ echo "$func: $LOCK is locked by other process"
+ exit
+ else
+ touch $LOCK
+ fi
+ if [ $snap_id -ne -2 ];then
+ echo $snap_id > $CURRENT
+ else
+ echo "head" > $CURRENT
+ fi
+
+ if [ ! -e $snap_object_dir ];then
+ return 0
+ fi
+
+ if [ "$snap_object_dir"x != "$head_object_dir"x ];then
+ echo "$func: need to compute diff_set of head"
+ need_diff_set=1
+ else
+ echo "$func: NO diff_set"
+ need_diff_set=0
+ fi
+
+ local entry_count=0
+ local start=
+ local end=
+ local offset=
+ local icount=0
+ local istart=
+ local iend=
+ local percent=
+
+ local snap_objects=
+ local head_objects=
+ local diff_set=
+
+ snap_objects=(`ls $snap_object_dir`)
+
+ # if need to compute difference set of head_objects and snap_objects
+ if [ $need_diff_set -ne 0 ];then
+ head_objects=(`ls $head_object_dir`)
+
+ #get the difference set: ( head_objects - snap_objects )
+ diff_set=(`
+ sort -m <(echo ${head_objects[@]}|xargs -n 1 echo) <(echo ${snap_objects[@]}|xargs -n 1 echo) \
+ <(echo ${snap_objects[@]}|xargs -n 1 echo) |uniq -u`)
+
+ # copy diff_set of head object to image
+ pushd $head_object_dir >/dev/null
+
+ echo "$func: copy diff_set head objects ..."
+ entry_count=${#diff_set[@]}
+ start=${diff_set[0]}
+ end=
+ if [ $entry_count -gt 0 ];then
+ end=${diff_set[$(($entry_count - 1))]}
+ fi
+ offset=
+ icount=0
+ istart=
+ iend=
+ percent=
+
+ echo -e "object_count\t$entry_count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ for object in ${diff_set[@]}
+ do
+ icount=$(($icount+1))
+ if [ $icount = 1 ];then
+ istart=$object
+ fi
+ iend=$object
+
+ local offset=$((16#$object))
+ dd if=$object of=$image_path bs=$image_unit seek=$offset conv=notrunc 2>/dev/null
+
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # backport most recent cursor
+ fi
+ done
+ if [ $entry_count -gt 0 ];then
+ echo
+ fi
+ popd >/dev/null
+
+ if [ $snap_id -ne -2 ];then
+ echo -e "$image_name already rollback diff_set: (head - snap)"
+ fi
+ fi
+
+ # copy snap object to image
+ pushd $snap_object_dir >/dev/null
+
+ if [ $need_diff_set -ne 0 ];then
+ echo "$func: copy snap objects ..."
+ else
+ echo "$func: copy head objects ..."
+ fi
+ entry_count=${#snap_objects[@]}
+ start=${snap_objects[0]}
+ end=
+ if [ $entry_count -gt 0 ];then
+ end=${snap_objects[$(($entry_count - 1))]}
+ fi
+ offset=
+ icount=0
+ istart=
+ iend=
+ percent=
+
+ echo -e "object_count\t$entry_count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ for object in ${snap_objects[@]}
+ do
+ icount=$(($icount+1))
+ if [ $icount = 1 ];then
+ istart=$object
+ fi
+ iend=$object
+
+ local offset=$((16#$object))
+ dd if=$object of=$image_path bs=$image_unit seek=$offset conv=notrunc 2>/dev/null
+
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # backport most recent cursor
+ fi
+ done
+ if [ $entry_count -gt 0 ];then
+ echo
+ fi
+ popd >/dev/null
+
+ rm -f $LOCK
+ if [ $snap_id -ne -2 ];then
+ echo "$image_name rollback to snapid: $snap_id"
+ else
+ echo "$image_name rollback to head"
+ fi
+}
+
+function recover_image()
+{
+ local func="recover_image"
+ echo "$func ..."
+
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <pool_id> <image_name> <snap_name> [<backup_dir>]"
+ exit
+ fi
+
+ local pool_id=$1
+ local img_name=$2
+ local snap_name=$3
+ local backup_dir=$4
+ pool_id=$(($pool_id))
+ if [ "$snap_name"x = "@"x ];then
+ snap_name=
+ fi
+ if [ "$backup_dir"x = ""x ];then
+ backup_dir=$default_backup_dir
+ fi
+
+ #recover image with nosnap
+ if [ "$snap_name"x = ""x ];then
+ discover_image_nosnap $pool_id $img_name #input image_name
+ local image_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in
+ copy_image_nosnap_single_thread $pool_id $image_hobjects $backup_dir
+
+ #recover image with snap
+ else
+
+ # check if recovered head already
+ local img_hobjects_path=$images/pool_$pool_id/$img_name/$img_name
+ local img_file_path=$backup_dir/pool_$pool_id/$img_name/$img_name
+ if [ ! -e $img_hobjects_path ] || [ ! -e $img_file_path ];then
+ echo "$func: $img_name@$snap_name : can not rollback to snapshot, please recover image head first"
+ exit
+ fi
+
+ # rollback to head
+ if [ "$snap_name"x = "@@"x ];then
+ local head_dir=$backup_dir/pool_$pool_id/$img_name/@head
+ if [ -e $head_dir ];then
+ local unit=`pushd $head_dir >/dev/null; ls|head -n 1|xargs -n 1 stat|awk '/Size:/{print $2}'`
+ # rollback to head
+ rollback_image_snap $pool_id $img_name -2 $backup_dir/$img_name/@head $backup_dir $unit
+ echo "$image_name_in head : $backup_dir/$img_name/$img_name"
+ else
+ echo "$func: no need to rollback to head"
+ fi
+ return 0
+ fi
+
+ # rollback to snap
+ discover_image_snap $pool_id $img_name $snap_name # get image meta & get snapid object
+ local snap_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in@$db_snap_id
+ local head_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in@$db_snap_id@head
+ local snap_object_dir=$backup_dir/pool_$pool_id/$image_name_in/@$db_snap_id
+ local image_path=$backup_dir/pool_$pool_id/$image_name_in/$image_name_in
+ local image_unit=$((1<<$db_order))
+ copy_image_snap_single_thread $pool_id $image_name_in $db_snap_id $snap_hobjects $head_hobjects $backup_dir
+ rollback_image_snap $pool_id $image_name_in $db_snap_id $snap_object_dir $backup_dir $image_unit
+ echo "$image_name_in@$snap_name : $image_path"
+ fi
+}
diff --git a/src/tools/rbd_recover_tool/epoch_h b/src/tools/rbd_recover_tool/epoch_h
new file mode 100644
index 000000000..e268eafa7
--- /dev/null
+++ b/src/tools/rbd_recover_tool/epoch_h
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+# file: epoch_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+. $my_dir/common_h
+
+#pgid_list=$single_node/$cluster-$id/pgid_list
+# Enumerate every PG on this OSD: each "<pgid>_head" directory under the
+# filestore "current/" dir is one PG collection.  Writes sorted
+# "<pgid> <osd-root>" pairs to $pgid_list (path defined in common_h).
+function get_pgid_list()
+{
+ find $osd_data/current/ -type d -name "*_head"|\
+ sed -n 's/\(.*\)\/current\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head/\2 \1/p'|\
+ sort -t ' ' -k 1.1,1h -k 2.1,2 > $pgid_list;
+}
+
+# Extract the "<pool>.<seed>" pgid from a hobject path containing a
+# ".../<pgid>_head/..." component; prints nothing if the path does not match.
+# Note: assigns the global $hobject_path as a side effect.
+function get_pgid()
+{
+ hobject_path=$1
+ echo $hobject_path| sed -n 's/\(.*\)\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head\(.*\)/\2/p'
+}
+
+# Out-parameter of get_infos_seq.
+infos_seq=
+# Resolve the leveldb header sequence number of the ":infos." map header
+# using the common_h helpers; exits on any lookup failure.
+function get_infos_seq()
+{
+ local func="get_infos_seq"
+
+ local keyword=":infos."
+ local infos_key=`get_map_header_key $keyword`
+
+ if [ "$infos_key"x = ""x ];then
+ echo "$func: keyword not input or infos_key not exists"
+ exit
+ fi
+ local prefix=`get_map_header_prefix`
+ local key=$infos_key
+
+ infos_seq=`get_header_seq $prefix $key`
+ if [ "$infos_seq"x = ""x ];then
+ echo "$func: infos_seq not exists"
+ exit
+ fi
+}
+
+# Out-parameter of get_pg_epoch.
+pg_epoch=
+# Look up the epoch of pg $1, trying the firefly on-disk layout first and
+# falling back to the hammer layout; exits if neither yields an epoch.
+function get_pg_epoch()
+{
+ local func="get_pg_epoch"
+ if [ "$1"x = ""x ];then
+ echo "$func: no pgid input"
+ exit
+ fi
+
+ get_pg_epoch_firefly "$1"
+ if [ "$pg_epoch"x != ""x ]; then
+ # echo "Epoch for $1: $pg_epoch (firefly)"
+ return
+ fi
+
+ get_pg_epoch_hammer "$1"
+ if [ "$pg_epoch"x != ""x ]; then
+ # echo "Epoch for $1: $pg_epoch (hammer)"
+ return
+ fi
+
+ echo "$func: Couldn't find epoch for $1"
+ exit
+}
+
+# Firefly layout: the epoch lives under key "<pgid>_epoch" in the header
+# keyspace "_USER_<infos_seq>_USER_".  Sets $pg_epoch (empty on miss).
+function get_pg_epoch_firefly()
+{
+ local func="get_pg_epoch_firefly"
+ if [ "$1"x = ""x ];then
+ echo "$func: no pgid input"
+ exit
+ fi
+ local pgid=$1
+ local key=$pgid"_epoch"
+
+ #get_infos_seq;
+ # infos_seq default to 1
+ # NOTE(review): the dynamic lookup above is disabled and the sequence is
+ # assumed to be 1 on firefly — confirm this holds for the target cluster.
+ infos_seq=1
+ local infos_seq=`printf "%016d" $infos_seq`
+ local prefix="_USER_"$infos_seq"_USER_"
+
+ pg_epoch=`get_header_kv $prefix $key int`
+}
+
+# Hammer layout: the map-header key encodes the pool id (%x) and the hash
+# seed (%08X); its header sequence then holds key "_epoch".
+# Sets $pg_epoch (empty on miss).
+function get_pg_epoch_hammer()
+{
+ local func="get_pg_epoch_hammer"
+ if [ "$1"x = ""x ];then
+ echo "$func: no pgid input"
+ exit
+ fi
+ local pgid="$1"
+ local hkey_prefix="$(get_map_header_prefix)"
+ local hkey="$(printf '...head.%x.%08X' "$(echo "$pgid"|cut -d'.' -f1)" "$((0x$(echo "$pgid"|cut -d'.' -f2)))")"
+
+ local infos_seq="$(get_header_seq "$hkey_prefix" "$hkey")"
+ local infos_seq=`printf "%016d" $infos_seq`
+ local prefix="_USER_"$infos_seq"_USER_"
+ local key="_epoch"
+
+ pg_epoch=`get_header_kv $prefix $key int`
+}
diff --git a/src/tools/rbd_recover_tool/metadata_h b/src/tools/rbd_recover_tool/metadata_h
new file mode 100644
index 000000000..b736ceea7
--- /dev/null
+++ b/src/tools/rbd_recover_tool/metadata_h
@@ -0,0 +1,368 @@
+#!/usr/bin/env bash
+# file: metadata_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+. $my_dir/common_h
+. $my_dir/epoch_h
+
+# Keep the original name in $image_name_in (for output) and the
+# underscore-escaped ("_" -> "\u") form in $image_name (for matching image
+# hobjects in the database).
+image_name_in=
+image_name=
+# Validate and store the image name given on the command line: the raw name
+# goes to $image_name_in, the underscore-escaped form to $image_name.
+function input_image()
+{
+ local func="input_image"
+ if [ "$1"x = ""x ];then
+ echo "$func: no image name input"
+ exit
+ fi
+
+ image_name_in=$1
+ # "_" -> "\u"
+ image_name=`convert_underline $image_name_in`
+}
+
+#======================================== distinguish v1 or v2 ===================================
+#image_list_v1=$single_node/$cluster-$id/image_list_v1
+#image_list_v2=$single_node/$cluster-$id/image_list_v2
+# Build the per-format object inventories: v1 images have a "<name>.rbd"
+# header object, v2 images have an "rbd\uid.<name>" id object.
+function get_image_list()
+{
+ find $osd_data/current/ -type f|grep ".rbd__" >$image_list_v1
+ find $osd_data/current/ -type f|grep "rbd\\\\uid." >$image_list_v2
+}
+
+# Print 1 if hobject $1 is in the v1 list, otherwise print 2 if it is NOT
+# in the v2 id-object list.
+# NOTE(review): the second test looks inverted, but v2 *header* hobjects
+# never appear in image_list_v2 (that list holds "rbd\uid." id objects), so
+# any non-v1 header path falls through to "2" — confirm before changing.
+function get_image_format_by_hobject()
+{
+ local func="get_image_format"
+ if [ "$1"x = ""x ];then
+ exit
+ fi
+ local res1=`cat $image_list_v1|grep $1`
+ if [ "$res1"x != ""x ];then
+ echo 1
+ exit
+ fi
+
+ local res2=`cat $image_list_v2|grep $1`
+ if [ "$res2"x = ""x ];then
+ echo 2
+ exit
+ fi
+}
+
+#======================================== image format v1 ========================================
+# <image_name>.rbd include 3 parts:
+# header + snap_count*snapshot + snap_count*snap_name
+#
+# struct rbd_obj_header_ondisk {
+# 40 char text[40];
+# 24 char block_name[RBD_MAX_BLOCK_NAME_SIZE];
+# 4 char signature[4];
+# 8 char version[8];
+# struct {
+# 1 __u8 order;
+# 1 __u8 crypt_type;
+# 1 __u8 comp_type;
+# 1 __u8 unused;
+# } __attribute__((packed)) options;
+# 8 ceph_le64 image_size;//hexdump -C s=80 n=8
+# 8 ceph_le64 snap_seq; //hexdump -C s=88 n=8
+# 4 ceph_le32 snap_count;//hexdump -C s=96 n=4
+# 4 ceph_le32 reserved;
+# 8 ceph_le64 snap_names_len;//hexdump -C s=104 n=8
+# struct rbd_obj_snap_ondisk snaps[0];
+# } __attribute__((packed));
+#
+# sizeof(rbd_obj_header_ondisk): 112
+#
+# struct rbd_obj_snap_ondisk {
+# 8 ceph_le64 id; //hexdump -C s=112+i*16 n=8 , i=[0, snap_count)
+# 8 ceph_le64 image_size;//hexdump -C s=112+i*16+8 n=8, i=[0, snap_count)
+# } __attribute__((packed));
+# sizeof(rbd_obj_snap_ondisk): 16
+#
+# get snap_names from <image_name>.rbd
+# hexdump -e '10/1 "%_c"' -s $((112 + $snap_count*16)) -n $snap_names_len <image_name>.rbd
+# then split snap_names into array
+
+# Decode and print the metadata of a format-1 image directly from its
+# "<image_name>.rbd" header object (layout documented above: a fixed
+# rbd_obj_header_ondisk followed by snap_count rbd_obj_snap_ondisk records
+# and the NUL-separated snapshot names).
+#   $1 - path to the head object file
+#   $2 - optional snapshot name; if given, only that snapshot line is printed
+function get_image_metadata_v1()
+{
+ local func="get_image_metadata_v1"
+ if [ "$1"x = ""x ];then
+ echo "$func: no image head object input"
+ exit
+ fi
+ local snap_name=
+ if [ "$2"x != ""x ];then
+ snap_name=$2
+ fi
+
+ if [ ! -e $1 ];then
+ echo "$func: $1 not exists"
+ exit
+ fi
+ local hobject_path=$1
+ d_hobject_path=`dump_backslash $1`
+ local image_format=`get_image_format_by_hobject $d_hobject_path`
+ if [ $image_format != 1 ];then
+ echo "$func: image_format must be 1"
+ exit
+ fi
+
+ if [ ! -e $hobject_path ];then
+ echo "$func: $hobject_path not exists"
+ exit
+ fi
+
+ # decode rbd_obj_header_ondisk of <image_name>.rbd
+ # on-disk integers are little-endian: hexdump -C prints the bytes in file
+ # order (fields 2..9 of the first line), so each awk loop walks them in
+ # reverse to rebuild the big-endian hex string before converting.
+ local block_name=`hexdump -e '10/1 "%c"' -s 40 -n 24 $hobject_path`
+ local order=`hexdump -e '10/4 "%u"' -s 76 -n 1 $hobject_path`
+ local image_size=`hexdump -C -s 80 -n 8 $hobject_path|head -n 1|awk '{for (i=9; i>1; i--) {printf $i}}'`
+ image_size=$((16#$image_size))
+ local snap_seq=`hexdump -C -s 88 -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'`
+ local snap_count=`hexdump -C -s 96 -n 4 $hobject_path|head -n 1|
+ awk '{num=""; for(i=5; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'`
+ local snap_names_len=`hexdump -C -s 104 -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'`
+
+ echo -e "block_name:\t$block_name"
+ echo -e "order:\t\t$order"
+ echo -e "image_size:\t$image_size"
+ echo -e "snap_seq:\t$snap_seq"
+
+ # decode N rbd_obj_snap_ondisk of <image_name>.rbd
+ declare -a snap_ids
+ declare -a snap_names
+ declare -a snap_image_sizes
+ local size_header=112 #sizeof(rbd_obj_header_ondisk)
+ local size_snap=16 #sizeof(rbd_obj_snap_ondisk)
+ local offset=0
+ local id_off=0
+ local size_off=0
+ for ((i=0; i<$snap_count; i++))
+ do
+ offset=$(($size_header + $i * $size_snap))
+ id_off=$offset
+ size_off=$(($offset + 8))
+ snap_ids[$i]=`hexdump -C -s $id_off -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){num=num""$i;} print strtonum("0x"num);}'`
+ snap_image_sizes[$i]=`hexdump -C -s $size_off -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){num=num""$i;} print strtonum("0x"num);}'`
+ done
+ # snapshot names follow the snap records, separated by NUL bytes, which
+ # "hexdump %_c" renders as "\0" — hence the heavily escaped awk separator.
+ offset=$(($size_header + $snap_count * $size_snap))
+ snap_names=(`hexdump -e '10/1 "%_c"' -s $offset -n $snap_names_len $hobject_path|
+ awk -F "\\\\\\\\\\\\\\\\0" '{for(i=1; i<=NF; i++) {print $i" "} }'`);
+
+ echo -e "\t\tID\tNAME\t\tSIZE"
+ for ((i=0; i<$snap_count; i++))
+ do
+ if [ "$snap_name"x = ""x ];then
+ echo -n -e "snapshot:\t"
+ echo -e "${snap_ids[$i]}\t${snap_names[$i]}\t\t${snap_image_sizes[$i]}"
+ continue
+ fi
+ if [ "$snap_name"x = "${snap_names[$i]}"x ];then
+ echo -n -e "snapshot:\t"
+ echo -e "${snap_ids[$i]}\t${snap_names[$i]}\t\t${snap_image_sizes[$i]}"
+ return
+ fi
+ done
+}
+
+#======================================== end image format v1 ========================================
+
+#======================================== image format v2 ========================================
+
+# map_header, header_seq, header, key/value
+# eg.
+# map_header _HOBJTOSEQ_:rbd%uheader%e139a6b8b4567...head.2.68E826B6
+# meta_header_seq 17426
+# header: _USER_0000000000017426_USER_:object_prefix
+# _USER_0000000000017426_USER_:order
+# _USER_0000000000017426_USER_:size
+# _USER_0000000000017426_USER_:snap_seq
+# key/value ceph-kvstore-tool /storepath get _USER_0000000000017426_USER_ (object_prefix|order|size|snap_seq)
+
+# decode image id from image_id_hobject
+# Decode the image id from a v2 "rbd\uid.<name>" id object: the file holds a
+# length-prefixed string (u32 length followed by the id characters).
+# Prints the id on stdout; exits silently if the object is missing.
+function get_image_id()
+{
+ local func="get_image_id"
+ if [ "$1"x = ""x ];then
+ exit;
+ fi
+ local image_id_hobject=$1 #from admin node's database
+
+ if [ ! -e $image_id_hobject ];then
+ #echo "$func: $image_id_hobject not exists"
+ exit;
+ fi
+
+ # get len of string
+ local n=`hexdump -e '10/4 "%u"' -s 0 -n 4 $image_id_hobject`
+ # get string
+ hexdump -e '10/1 "%c"' -s 4 -n $n $image_id_hobject
+}
+
+#find image_id omap entry in omaplist
+# Out-parameters of get_map_header.
+map_header_prefix=
+map_header_key=
+# Locate the omap map-header entry "header%e<image_id>" for a v2 image and
+# publish its prefix/key pair; exits if the image id is absent from the list.
+function get_map_header()
+{
+ local func="get_map_header"
+ local image_id=$1
+ if [ "$image_id"x = ""x ];then
+ echo "$func: no image_id input"
+ exit;
+ fi
+ map_header_prefix=`get_map_header_prefix`
+ local keyword="header%e"$image_id
+ map_header_key=`get_map_header_key $keyword`
+ if [ "$map_header_key"x = ""x ];then
+ echo "$func: map_header_key is NULL(not in omaplist)"
+ exit
+ fi
+}
+
+#get meta header seq from map_header
+# Out-parameter of get_meta_header_seq.
+meta_header_seq=
+# Resolve the header sequence number for prefix $1 / key $2.
+function get_meta_header_seq()
+{
+ local func="get_meta_header_seq"
+ if [ "$1"x == ""x ];then
+ echo "$func: no prefix input"
+ exit;
+ elif [ "$2"x == ""x ];then
+ echo "$func: no key input"
+ exit;
+ fi
+ local prefix=$1;
+ local key=$2;
+ meta_header_seq=`get_header_seq $prefix $key`
+}
+
+# get image metadata : object_prefix, order, image_size, snap_seq
+# Out-parameters of get_image_metadata_v2.
+object_prefix=
+order=
+image_size=
+snap_seq=
+# Read the v2 image metadata stored under "_USER_<seq>_USER_" keys and print
+# it, then list the snapshots.
+#   $1 - meta header sequence number
+#   $2 - optional snap name, forwarded to list_snaps_v2 as a filter
+function get_image_metadata_v2()
+{
+ local func="get_image_metadata_v2"
+ if [ "$1"x = ""x ];then
+ echo "$func: no meta_header_seq input"
+ exit;
+ fi
+ local meta_header_seq=`printf "%016d" $1`
+ #echo "$func: meta_header_seq = "$meta_header_seq
+ local ghobject_key="_USER_"$meta_header_seq"_USER_"
+ local prefix=$ghobject_key
+
+ object_prefix=`get_header_kv $prefix object_prefix string`
+ #object_prefix="rbd_data.$image_id"
+ order=`get_header_kv $prefix order int`
+ image_size=`get_header_kv $prefix size int`
+ snap_seq=`get_header_kv $prefix snap_seq int`
+
+ echo -e "object_prefix:\t$object_prefix"
+ echo -e "order:\t\t$order"
+ echo -e "image_size:\t$image_size"
+ echo -e "snap_seq:\t$snap_seq"
+
+ # list snapshot
+ list_snaps_v2 $1 $2
+}
+
+# struct cls_rbd_snap {
+# snapid_t id;
+# string name;
+# uint64_t image_size;
+# uint64_t features;
+# uint8_t protection_status;
+# cls_rbd_parent parent;
+# }
+# decode cls_rbd_snap
+# 1 u8 struct_v
+# 1 u8 struct_compat
+# 4 u32 struct_len
+# 8 u64 snapid_t id //s=6 n=8
+# 4 u32 len of name //s=14 n=4
+# len char name //s=18 n=len
+# 8 u64 image_size
+# 8 u64 features
+# ......
+#
+# List the snapshots of a v2 image by decoding each "snapshot_<id>" omap
+# value as a cls_rbd_snap (byte layout documented above).  The value is
+# fetched with ceph-kvstore-tool as space-separated hex bytes, so fields are
+# reassembled byte-by-byte; multi-byte integers are little-endian and are
+# therefore read back-to-front.
+#   $1 - meta header sequence number
+#   $2 - optional snap name; if given, only the matching snapshot is printed
+function list_snaps_v2()
+{
+ local func="list_snaps_v2"
+ if [ "$1"x = ""x ];then
+ exit
+ fi
+ local sname=
+ if [ $# -eq 2 ];then
+ sname=$2
+ fi
+ local meta_header_seq=`printf "%016d" $1`
+ local prefix="_USER_"$meta_header_seq"_USER_"
+ local keys=(`awk -F ":" '/snapshot_/ && $1 == "'"$prefix"'" {if ($2 == "") exit; split($2, arr, "_");
+ print arr[2];}' $omap_list|sort -r`)
+ echo -e "\t\tID\tNAME\t\tSIZE"
+ for key in ${keys[@]}
+ do
+ key="snapshot_$key"
+ local arr=(`ceph-kvstore-tool $omap_path get $prefix $key|awk -F ":" '{print $2}'`);
+ # get snap_name
+ # bytes 14-17 hold the u32 name length (little-endian)
+ tmp=
+ for ((i=17; i>13; i--))
+ do
+ tmp="$tmp${arr[$i]}"
+ done
+ local len=$((16#$tmp))
+ local snap_name=
+ for ((i=18; i<$((18+$len)); i++))
+ do
+ # convert ascii to char
+ local char=`echo -e "\x${arr[$i]}"`
+ snap_name="$snap_name$char"
+ done
+ # get snap_id (little endian)
+ local tmp=
+ for ((i=13; i>5; i--))
+ do
+ tmp="$tmp${arr[$i]}"
+ done
+ local snap_id=$((16#$tmp))
+ # get image_size of current snap (little endian)
+ tmp=
+ for ((i=$((25+$len)); i>$((17+$len)); i--))
+ do
+ tmp="$tmp${arr[$i]}"
+ done
+ local image_size=$((16#$tmp))
+ if [ "$sname"x = ""x ];then
+ echo -e "snapshot:\t$snap_id\t$snap_name\t\t$image_size"
+ continue
+ fi
+ if [ "$sname"x = "$snap_name"x ];then
+ echo -e "snapshot:\t$snap_id\t$snap_name\t\t$image_size"
+ return
+ fi
+ done
+}
+
+#======================================== end image format v2 ========================================
diff --git a/src/tools/rbd_recover_tool/osd_job b/src/tools/rbd_recover_tool/osd_job
new file mode 100755
index 000000000..b4b80be8a
--- /dev/null
+++ b/src/tools/rbd_recover_tool/osd_job
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+# file: osd_job
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+
+# Refuse to operate while any ceph-osd process (other than a flush-journal
+# invocation) is still running on this host.
+# NOTE(review): ps|grep is fragile — pgrep -f would be more robust; confirm
+# availability on target hosts before changing.
+function check_ceph_osd()
+{
+ local func="check_ceph_osd"
+ local host=`hostname`
+ # if ceph-osd service is still running, except flush-journal
+ if [ "`ps aux|grep ceph-osd|grep -v flush-journal|grep -v grep`"x != ""x ];then
+ echo "[$host]: $func: ceph-osd is running..., stop it"
+ exit
+ fi
+}
+
+# Print this node's cached pg-epoch table, if it was generated.
+function cat_pg_epoch()
+{
+ local func="cat_pg_epoch"
+ init_env_osd $1
+ if [ -e $node_pg_epoch ];then
+ cat $node_pg_epoch
+ fi
+}
+
+# Print this node's cached format-1 image list, if it was generated.
+function cat_image_v1()
+{
+ local func="cat_image_v1"
+ init_env_osd $1
+ if [ -e $image_v1 ];then
+ cat $image_v1
+ fi
+}
+
+# Print this node's cached format-2 image list, if it was generated.
+function cat_image_v2()
+{
+ local func="cat_image_v2"
+ init_env_osd $1
+ if [ -e $image_v2 ];then
+ cat $image_v2
+ fi
+}
+
+# Replay this OSD's journal into the object store so the on-disk state is
+# current before we scrape it.  $1 is the osd data path.
+function flush_osd_journal()
+{
+ local func="flush_osd_journal"
+ init_env_osd $1
+ local osd_data_path=$osd_data
+ local osd_journal_path=$osd_data/journal
+ local whoami_path=$osd_data/whoami
+ local host=`hostname`
+ if [ ! -e $whoami_path ];then
+ echo "[$host]: $func: $whoami_path not exists"
+ exit
+ fi
+ local whoami=`cat $whoami_path`
+ echo "[$host]: $func ..."
+ ceph-osd -i $whoami --osd-data $osd_data_path --osd-journal $osd_journal_path --flush-journal >/dev/null
+ if [ $? -ne 0 ];then
+ echo "[$host]: $func: flush osd journal failed"
+ exit
+ fi
+ }
+
+# Dump this OSD's omap keyspace (delegates to common_h's get_omap_list).
+function do_omap_list()
+{
+ local func="do_omap_list"
+ init_env_osd $1
+ local host=`hostname`
+ echo "[$host]: $func ..."
+ get_omap_list
+}
+
+# get all pgs epoch
+# For every PG on this OSD, record "<node> <pgid> <epoch> <data-path>" into
+# $node_pg_epoch for later collection by the admin node.
+function do_pg_epoch()
+{
+ local func="do_pg_epoch"
+ init_env_osd $1
+ local node=`hostname`
+ get_pgid_list
+ >$node_pg_epoch
+ local pgid=
+ local data_path=
+ local host=`hostname`
+ echo "[$host]: $func ..."
+ while read line
+ do
+ {
+ pgid=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ get_pg_epoch $pgid
+ echo -e "$node $pgid $pg_epoch $data_path" >>$node_pg_epoch
+ }
+ done < $pgid_list
+}
+
+# List every image header object on this OSD node with its pg epoch; the
+# recorded epoch may not be the latest — the admin node reconciles replicas.
+function do_image_list()
+{
+ local func="do_image_list"
+ init_env_osd $1
+ get_image_list
+ local node=`hostname`
+ >$image_v1
+ >$image_v2
+ local host=`hostname`
+ echo "[$host]: $func ..."
+ for line in `cat $image_list_v1`
+ do
+ pgid=`get_pgid $line`
+ get_pg_epoch $pgid
+ echo "$node $line $pg_epoch" >> $image_v1
+ done
+ for line in `cat $image_list_v2`
+ do
+ pgid=`get_pgid $line`
+ get_pg_epoch $pgid
+ echo "$node $line $pg_epoch" >> $image_v2
+ done
+}
+
+# Decode and print the v2 image id from id object $2 ($1 is the osd data path).
+function do_image_id()
+{
+ local func="do_image_id"
+ init_env_osd $1
+ get_image_id $2
+}
+
+# Print format-1 image metadata from head object $2, optionally filtered to
+# snapshot $3 ($1 is the osd data path).
+function do_image_metadata_v1()
+{
+ local func="do_image_metadata_v1"
+ init_env_osd $1
+ local image_header_hobject=$2
+ local snap_name=$3
+ get_image_metadata_v1 $image_header_hobject $snap_name
+}
+
+# Print format-2 image metadata: resolve the omap map header for image id
+# $2, then its header sequence, then dump the metadata (optionally filtered
+# to snapshot $4).  $3 (header hobject) is currently unused here.
+function do_image_metadata_v2()
+{
+ local func="do_image_metadata_v2"
+ init_env_osd $1
+ local image_id=$2
+ local image_header_hobject=$3
+ local snap_name=$4
+ get_map_header $image_id
+ get_meta_header_seq $map_header_prefix $map_header_key
+ get_image_metadata_v2 $meta_header_seq $snap_name
+}
+
+# Entry point: refuse to run while ceph-osd is up, then treat the first CLI
+# word as the function to call and the rest as its arguments.
+# NOTE(review): "$@" would preserve argument quoting better than $* —
+# confirm callers never pass arguments containing spaces.
+check_ceph_osd
+$*
diff --git a/src/tools/rbd_recover_tool/rbd-recover-tool b/src/tools/rbd_recover_tool/rbd-recover-tool
new file mode 100755
index 000000000..b7a258650
--- /dev/null
+++ b/src/tools/rbd_recover_tool/rbd-recover-tool
@@ -0,0 +1,327 @@
+#!/usr/bin/env bash
+# file: rbd-recover-tool
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+# rbd-recover-tool is an offline recovery tool for rbd images in replicated
+# pools, for use when the ceph cluster is stopped.
+# It is a simple disaster recovery policy, intended for urgent conditions only.
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+. $my_dir/database_h
+
+#scp files from admin node to osd node
+file1=common_h
+file2=metadata_h
+file3=epoch_h
+file4=osd_job
+
+#------------ admin node's action -------------
+
+# Copy one file ($1) into $job_path on every host listed in $osd_host.
+# Transfers run in parallel, one background job per host, and we wait for
+# all of them so the caller does not return with copies still in flight.
+function scp_file()
+{
+ local func="scp_file"
+ file=$1
+ if [ "$1"x = ""x ];then
+ echo "$func: no file input"
+ exit
+ fi
+ for host in `cat $osd_host`
+ do
+ {
+ echo "$func: $host"
+ scp $ssh_option $file $host:$job_path 1>/dev/null
+ } &
+ done
+ # previously missing: without this barrier the script could exit and
+ # orphan the transfers before every host received the file (the sibling
+ # scp_files already waits).
+ wait
+ echo "$func: finish"
+}
+
+# Copy the four helper scripts (common_h, metadata_h, epoch_h, osd_job) to
+# $job_path on every osd host in parallel, then wait for all transfers.
+function scp_files()
+{
+ local func="scp_files"
+ for host in `cat $osd_host`
+ do
+ {
+ echo "$func: $host"
+ scp $ssh_option $file1 $host:$job_path
+ scp $ssh_option $file2 $host:$job_path
+ scp $ssh_option $file3 $host:$job_path
+ scp $ssh_option $file4 $host:$job_path
+ } &
+ done
+ wait
+ echo "$func: finish"
+}
+
+# Fan out to every "<host> <data-path>" pair in $osd_host_path: push the
+# helper scripts, then remotely flush the osd journal and generate the omap
+# list, pg-epoch table and image lists.  One background job per host; wait
+# for all before returning.
+function scatter_node_jobs()
+{
+ local func="scatter_node_jobs"
+ local host=
+ local data_path=
+ echo "$func: flush osd journal & generate infos: omap, pg, image metadata ..."
+
+ trap 'echo $func failed; exit' INT HUP
+ while read line
+ do
+ {
+ host=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ check_osd_process $host
+
+ cmd="mkdir -p $job_path"
+ ssh $ssh_option $host $cmd
+ scp $ssh_option $file1 $host:$job_path >/dev/null
+ scp $ssh_option $file2 $host:$job_path >/dev/null
+ scp $ssh_option $file3 $host:$job_path >/dev/null
+ scp $ssh_option $file4 $host:$job_path >/dev/null
+
+ cmd="bash $job_path/osd_job flush_osd_journal $data_path;"
+ # fixed: do_omap_list previously lacked the "bash" prefix every other
+ # remote command uses, so it relied on the copied file's exec bit.
+ cmd="$cmd bash $job_path/osd_job do_omap_list $data_path;"
+ cmd="$cmd bash $job_path/osd_job do_pg_epoch $data_path;"
+ cmd="$cmd bash $job_path/osd_job do_image_list $data_path;"
+
+ ssh $ssh_option $host $cmd </dev/null
+ } &
+ done < $osd_host_path
+ wait
+ echo "$func: finish"
+}
+
+# Pull the per-node tables generated by scatter_node_jobs back to the admin
+# node, concatenating them into $pg_coll, $image_coll_v1 and $image_coll_v2.
+# One background job per host; wait for all before returning.
+function gather_node_infos()
+{
+ local func="gather_node_infos"
+ echo "$func ..."
+ >$pg_coll
+ >$image_coll_v1
+ >$image_coll_v2
+ trap 'echo $func failed; exit' INT HUP
+ while read line
+ do
+ {
+ host=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ echo "$func: $host"
+ check_osd_process $host
+
+ #pg epoch
+ cmd1="bash $job_path/osd_job cat_pg_epoch $data_path"
+ ssh $ssh_option $host $cmd1 >> $pg_coll
+ #image v1
+ cmd2="bash $job_path/osd_job cat_image_v1 $data_path"
+ ssh $ssh_option $host $cmd2 >> $image_coll_v1
+ #image v2
+ cmd3="bash $job_path/osd_job cat_image_v2 $data_path"
+ ssh $ssh_option $host $cmd3 >> $image_coll_v2
+ } &
+ done < $osd_host_path
+ wait
+ echo "$func: finish"
+}
+
+# Run the full scatter (generate) + gather (collect) cycle after validating
+# that the osd and mon host config files are present and non-empty.
+function scatter_gather()
+{
+ local func="scatter_gather"
+ if [ ! -s $osd_host ];then
+ echo "$func: no osd_host input"
+ exit
+ fi
+ if [ ! -s $mon_host ];then
+ echo "$func: no mon_host input"
+ exit
+ fi
+ scatter_node_jobs
+ gather_node_infos
+}
+
+
+#------------- operations --------------
+
+# Subcommand "database": collect cluster state and build the object database
+# (gen_database comes from database_h).
+function database()
+{
+ scatter_gather
+ gen_database
+}
+
+# Subcommand "list": list all replicated-pool images (database_h helper).
+function list()
+{
+ list_images
+}
+
+# Subcommand "lookup": show metadata for <pool_id> <image> [<snap>].
+function lookup()
+{
+ lookup_image $1 $2 $3
+}
+
+# Subcommand "recover": recover <pool_id> <image> [<snap>] [<dest dir>].
+function recover()
+{
+ recover_image $1 $2 $3 $4
+}
+
+#------------- helper -------------
+
+# Print the command-line help text for all subcommands.
+function usage()
+{
+ local cmd_name="rbd-recover-tool"
+ echo
+ echo "$cmd_name is used to recover rbd image of replicated pool,
+ when all ceph services are stopped"
+ echo "Usage:"
+ echo "$cmd_name database
+ gather pg info, object info, image metadata,
+ and epoch info from all osd nodes,
+ this will consume a long time, just be patient,
+ especially when scale up to 1000+ osds"
+ echo "$cmd_name list
+ list all rbd images of all replicated pools,
+ before to lookup & recover"
+ echo "$cmd_name lookup <pool_id>/<image_name>[@[<snap_name>]]
+ show image metadata: image format, rbd id, size, order, snapseq
+ In addition, for image with snapshots,
+ this will list all snapshot information"
+ echo "$cmd_name recover <pool_id>/<image_name>[@[<snap_name>]] [</path/to/store/image>]
+ all snapshots share one image head, to economize disk space
+ so there is only one snapshot at any time,
+ image is saved at </path/to/store/image>/pool_<pool_id>/image_name/image_name
+ cat <path/to/store/image>/pool_<pool_id>/image_name/@CURRENT,
+ will show snapid
+ recover to raw image/nosnap/head: <image_name>
+ rollback to image head: <image_name>@
+ rollback to image snap: <image_name>@<snap_name>
+ recover steps:
+ 1. recover image nosnap (only one time)
+ 2. rollback to image snap"
+}
+
+# Normalize a user-supplied path: prints "<dirname>/<basename>" (collapsing
+# a trailing slash), or prints nothing when no argument is given or the path
+# contains an empty component ("//") — callers treat empty output as invalid.
+function get_path()
+{
+ local func="get_path"
+ if [ $# -lt 1 ];then
+ return
+ fi
+ if [[ $1 =~ // ]];then
+ return # "/path//to" is invalid
+ fi
+ local parent=`dirname $1`
+ local name=`basename $1`
+ if [ "$parent"x = "/"x ];then
+ # fixed: use echo -n in both branches for consistency (the first branch
+ # previously appended a newline; harmless under $(...), but uneven)
+ echo -n "$parent$name"
+ else
+ echo -n "$parent/$name"
+ fi
+}
+
+# Top-level command dispatcher: parses the subcommand and its
+# "<pool_id>/<image_name>[@[<snap_name>]]" argument, prepares the admin
+# environment and delegates to the matching operation wrapper.
+# snap_name conventions for recover: "@" = plain head recovery,
+# "@@" = rollback to head (user wrote "image@" with empty snap name).
+function admin_cmd()
+{
+ local func="admin_cmd"
+ if [ $# -lt 1 ];then
+ usage
+ exit
+ fi
+ if [ "$1"x = "-h"x ] || [ "$1"x = "--help"x ];then
+ usage
+ exit
+ fi
+
+ if [ "$1"x = "database"x ];then
+ if [ $# -gt 1 ];then
+ usage
+ exit
+ fi
+ # remove osd_host to refresh osd_host and osd_host_mapping
+ rm -f $osd_host
+ init_env_admin
+ database
+ elif [ "$1"x = "list"x ];then
+ if [ $# -gt 1 ];then
+ usage
+ exit
+ fi
+ init_env_admin
+ list
+ elif [ "$1"x = "lookup"x ];then
+ if [ $# -gt 2 ];then
+ usage
+ exit
+ fi
+ local pool_id=-1
+ local image_name=
+ local snap_name=
+ if [[ $2 =~ ^([^@/]+)/([^@/]+)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ elif [[ $2 =~ ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ snap_name="${BASH_REMATCH[3]}"
+ else
+ echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]"
+ exit
+ fi
+ init_env_admin
+ lookup $pool_id $image_name $snap_name
+ elif [ "$1"x = "recover"x ];then
+ if [ $# -lt 2 ] || [ $# -gt 3 ];then
+ usage
+ exit
+ fi
+ local pool_id=-1
+ local image_name=
+ local snap_name=@
+ local image_dir=
+ if [[ $2 =~ ^([^@/]+)/([^@/]+)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ elif [[ $2 =~ ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ snap_name="${BASH_REMATCH[3]}"
+ if [ "$snap_name"x = ""x ];then
+ snap_name=@@
+ fi
+ else
+ echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]"
+ exit
+ fi
+ if [ $# = 3 ];then
+ image_dir=`get_path $3`
+ # fixed: was [ "image_dir"x = ""x ] — a literal string that can never
+ # be empty, so an invalid destination path was silently accepted
+ if [ "$image_dir"x = ""x ];then
+ echo "$3 invalid"
+ exit
+ fi
+ fi
+ init_env_admin
+ recover $pool_id $image_name $snap_name $image_dir
+ elif [ "$1"x = "scp_files"x ];then
+ if [ $# -gt 1 ];then
+ exit
+ fi
+ admin_parse_osd
+ scp_files
+ elif [ "$1"x = "scp_file"x ];then
+ if [ $# -gt 2 ];then
+ exit
+ fi
+ admin_parse_osd
+ scp_file $2
+ else
+ echo "$func: $1: command not found"
+ fi
+}
+
+admin_cmd $*
diff --git a/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh b/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh
new file mode 100755
index 000000000..876b47b90
--- /dev/null
+++ b/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh
@@ -0,0 +1,542 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+# unit test case for rbd-recover-tool
+
+#prepare:
+# - write config files: config/osd_host, config/mon_host, config/storage_path, config/mds_host if exist mds
+#step 1. rbd export all images as you need
+#step 2. stop all ceph services
+#step 3. use rbd-recover-tool to recover all images
+#step 4. compare md5sum of recover image with that of export image who has the same image name
+
+ssh_opt="-o ConnectTimeout=1"
+my_dir=$(dirname "$0")
+tool_dir=$my_dir
+
+#storage_path=$my_dir/config/storage_path
+mon_host=$my_dir/config/mon_host
+osd_host=$my_dir/config/osd_host
+mds_host=$my_dir/config/mds_host
+
+test_dir= # `cat $storage_path`
+export_dir= #$test_dir/export
+recover_dir= #$test_dir/recover
+image_names= #$test_dir/image_names
+online_images= #$test_dir/online_images, all images on ceph rbd pool
+gen_db= #$test_dir/gen_db, label database if exist
+pool=rbd
+pool_id=2
+
+# Resolve the numeric id of $pool into the global $pool_id.
+# Fixes vs. original: $func was referenced but never defined; the failure
+# check ran after a pipeline, so $? reflected awk instead of the ceph
+# command; the predictable /tmp temp file is gone (parse in memory instead).
+function get_pool_id()
+{
+ local func="get_pool_id"
+ local stats
+ stats=`ceph osd pool stats $pool`
+ if [ $? -ne 0 ];then
+ echo "$func: get pool id failed: pool = $pool"
+ exit
+ fi
+ pool_id=`echo "$stats"|head -n 1|awk '{print $4}'`
+ echo "$func: pool_id = $pool_id"
+}
+
+# Validate the config files, set up the working directory tree under the
+# storage root given as $1, and wipe any previous export/recover results.
+function init()
+{
+ local func="init"
+ if [ $# -eq 0 ];then
+ echo "$func: must input <path> to storage images, enough disk space is good"
+ exit
+ fi
+ if [ ! -s $osd_host ];then
+ echo "$func: config/osd_host not exists or empty"
+ exit
+ fi
+ if [ ! -s $mon_host ];then
+ echo "$func: config/mon_host not exists or empty"
+ exit
+ fi
+ if [ ! -e $mds_host ];then
+ echo "$func: config/mds_host not exists"
+ exit
+ fi
+ test_dir=$1
+ export_dir=$test_dir/export
+ recover_dir=$test_dir/recover
+ image_names=$test_dir/image_names
+ online_images=$test_dir/online_images
+ gen_db=$test_dir/gen_db
+
+ # `ceph -s` doubles as a liveness probe: if the cluster is down, the INT
+ # trap reports it and the test aborts.
+ trap 'echo "ceph cluster is stopped ..."; exit;' INT
+ ceph -s >/dev/null
+ get_pool_id
+
+ mkdir -p $test_dir
+ mkdir -p $export_dir
+ mkdir -p $recover_dir
+ rm -rf $export_dir/*
+ rm -rf $recover_dir/*
+}
+
+# Build the recovery database once per test run; $gen_db containing "1"
+# marks it as already generated.
+function do_gen_database()
+{
+ local func="do_gen_database"
+ if [ -s $gen_db ] && [ `cat $gen_db` = 1 ];then
+ echo "$func: database already existed"
+ exit
+ fi
+ bash $tool_dir/rbd-recover-tool database
+ echo 1 >$gen_db
+}
+
+#check if all ceph processes are stopped
+function check_ceph_service()
+{
+ local func="check_ceph_service"
+ local res=`cat $osd_host $mon_host $mds_host|sort -u|tr -d [:blank:]|xargs -n 1 -I @ ssh $ssh_opt @ "ps aux|grep -E \"(ceph-osd|ceph-mon|ceph-mds)\"|grep -v grep"`
+ if [ "$res"x != ""x ];then
+ echo "$func: NOT all ceph services are stopped"
+ return 1
+ exit
+ fi
+ echo "$func: all ceph services are stopped"
+ return 0
+}
+
+# Kill ceph-osd on every osd host in parallel, then ceph-mon and ceph-mds on
+# their hosts, leaving the cluster fully stopped for offline recovery.
+function stop_ceph()
+{
+ local func="stop_ceph"
+ #cat osd_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-osd"
+ while read osd
+ do
+ {
+ osd=`echo $osd|tr -d [:blank:]`
+ if [ "$osd"x = ""x ];then
+ continue
+ fi
+ #ssh $ssh_opt $osd "killall ceph-osd ceph-mon ceph-mds" </dev/null
+ ssh $ssh_opt $osd "killall ceph-osd" </dev/null
+ } &
+ done < $osd_host
+ wait
+ echo "waiting kill all osd ..."
+ sleep 1
+ #cat $mon_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mon ceph-osd ceph-mds"
+ cat $mon_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mon"
+ #cat $mds_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mds ceph-mon ceph-osd"
+ cat $mds_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mds"
+}
+
+# Create (or reuse) test image $1 with size $2 (MB) and format $3.  If the
+# image already exists, its snapshots are purged (unprotecting them first
+# for v2) and it is resized to $2 instead of being recreated.
+function create_image()
+{
+ local func="create_image"
+ if [ ${#} -lt 3 ];then
+ echo "create_image: parameters: <image_name> <size> <image_format>"
+ exit
+ fi
+ local image_name=$1
+ local size=$2
+ local image_format=$3
+ if [ $image_format -lt 1 ] || [ $image_format -gt 2 ];then
+ echo "$func: image_format must be 1 or 2"
+ exit
+ fi
+ local res=`rbd list|grep -E "^$1$"`
+ echo "$func $image_name ..."
+ if [ "$res"x = ""x ];then
+ rbd -p $pool create $image_name --size $size --image_format $image_format
+ else
+ if [ $image_format -eq 2 ];then
+ rbd snap ls $image_name|tail -n +2|awk '{print $2}'|xargs -n 1 -I % rbd snap unprotect $image_name@%
+ fi
+ rbd snap purge $image_name
+ #rbd rm $image_name
+ rbd -p $pool resize --allow-shrink --size $size $image_name
+ fi
+}
+
+# Create test image $1 (format $2, size $3 MB, default 24), fill it with
+# random data through rbd-fuse, export it with "rbd export" and record the
+# export's md5 as the reference checksum for the head (no-snapshot) state.
+function export_image()
+{
+ local func="export_image"
+
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <image_name> <image_format> [<image_size>]"
+ exit
+ fi
+
+ local image_name=$1
+ local format=$(($2))
+ local size=$(($3)) #MB
+
+ if [ $format -ne 1 ] && [ $format -ne 2 ];then
+ echo "$func: image format must be 1 or 2"
+ exit
+ fi
+
+ if [ $size -eq 0 ];then
+ size=24 #MB
+ echo "$func: size = $size"
+ fi
+ local mnt=/rbdfuse
+
+ # mount rbd-fuse once; reuse an existing mount on reruns
+ mount |grep "rbd-fuse on /rbdfuse" &>/dev/null
+ if [ $? -ne 0 ];then
+ rbd-fuse $mnt
+ fi
+
+ create_image $image_name $size $format
+
+ dd conv=notrunc if=/dev/urandom of=$mnt/$image_name bs=4M count=$(($size/4))
+
+ local export_image_dir=$export_dir/pool_$pool_id/$image_name
+ mkdir -p $export_image_dir
+ local export_md5_nosnap=$export_image_dir/@md5_nosnap
+ >$export_md5_nosnap
+
+ local export_image_path=$export_image_dir/$image_name
+ rm -f $export_image_path
+
+ rbd export $pool/$image_name $export_image_path
+ md5sum $export_image_path |awk '{print $1}' >$export_md5_nosnap
+}
+
+# Recover the head (no-snapshot) content of image $1 with rbd-recover-tool
+# and record its md5 for comparison against the export reference.
+# Fixes vs. original: $func was copy-pasted as "recover_snapshots",
+# mislabelling error output; the unused "local snapshot=" is removed.
+function recover_image()
+{
+ local func="recover_image"
+ if [ $# -lt 1 ];then
+ echo "$func: parameters: <image_name>"
+ exit
+ fi
+
+ local image_name=$1
+ #pool_id=29
+
+ local recover_image_dir=$recover_dir/pool_$pool_id/$image_name
+ mkdir -p $recover_image_dir
+ local recover_md5_nosnap=$recover_image_dir/@md5_nosnap
+ >$recover_md5_nosnap
+
+ bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name $recover_dir
+ md5sum $recover_image_dir/$image_name|awk '{print $1}' >$recover_md5_nosnap
+}
+
+# Write <count> MB of random data at offset <seek> MB into <ofile>, then
+# take snapshot <snap> of the image, export it and append its md5 to
+# <export_image_dir>/@md5. Does nothing if the snapshot already exists.
+# NOTE(review): uses $image_name from the caller's scope (bash dynamic
+# scoping) and global $pool; also sets the caller-visible $snapshot
+# variable as a side effect.
+function make_snapshot()
+{
+ local func="make_snapshot"
+ if [ $# -lt 5 ];then
+ echo "$func: parameters: <ofile> <seek> <count> <snap> <export_image_dir>"
+ exit
+ fi
+ local ofile=$1
+ local seek=$(($2))
+ local count=$(($3))
+ local snap=$4
+ local export_image_dir=$5
+
+ if [ $seek -lt 0 ];then
+ echo "$func: seek can not be minus"
+ exit
+ fi
+
+ if [ $count -lt 1 ];then
+ echo "$func: count must great than zero"
+ exit
+ fi
+
+ echo "[$snap] $func ..."
+ echo "$1 $2 $3 $4"
+ # skip if the snapshot already exists
+ rbd snap ls $image_name|grep $snap;
+
+ local res=$?
+ if [ $res -eq 0 ];then
+ return $res
+ fi
+
+ # modify a window of the image, then snapshot the result
+ dd conv=notrunc if=/dev/urandom of=$ofile bs=1M count=$count seek=$seek 2>/dev/null
+ snapshot=$image_name@$snap
+ rbd snap create $snapshot
+ rm -f $export_image_dir/$snapshot
+ # export the image head right after snapshotting; the file is named
+ # after the snapshot since the contents are identical at this point
+ rbd export $pool/$image_name $export_image_dir/$snapshot
+ pushd $export_image_dir >/dev/null
+ md5sum $snapshot >> @md5
+ popd >/dev/null
+}
+
+# Recover an image head plus snapshots snap1..snap9 with rbd-recover-tool.
+# After each snapshot recovery the md5 of the recovered image file is
+# appended to @md5 as "<md5> <image>@<snap>" so it can be compared
+# line-by-line with the exported checksums.
+# Globals read: pool_id, recover_dir, tool_dir.
+function recover_snapshots()
+{
+ local func="recover_snapshots"
+ if [ $# -lt 1 ];then
+ echo "$func: parameters: <image_name>"
+ exit
+ fi
+
+ local image_name=$1
+ #pool_id=29
+
+ local recover_image_dir=$recover_dir/pool_$pool_id/$image_name
+ mkdir -p $recover_image_dir
+ local recover_md5=$recover_image_dir/@md5
+ >$recover_md5
+ local snapshot=
+
+
+ # recover head
+ bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name $recover_dir
+
+ # recover snapshots
+ for((i=1; i<10; i++))
+ do
+ snapshot=snap$i
+ bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name@$snapshot $recover_dir
+ pushd $recover_image_dir >/dev/null
+ local chksum=`md5sum $image_name|awk '{print $1}'`
+ echo "$chksum $image_name@$snapshot" >>@md5
+ popd >/dev/null
+ done
+}
+
+# Export an image together with 9 snapshots (test step-1 helper for the
+# snapshot case). Creates the image, then repeatedly writes random data
+# and snapshots via make_snapshot, collecting one md5 line per snapshot
+# in $export_dir/pool_$pool_id/<image_name>/@md5.
+# Globals read: pool_id, export_dir.
+# Arguments: $1 image name, $2 image format (1|2), $3 optional size in MB.
+function export_snapshots()
+{
+ local func="export_snapshots"
+
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <image_name> <image_format> [<image_size>]"
+ exit
+ fi
+
+ local image_name=$1
+ local format=$(($2))
+ # $(($3)) evaluates to 0 when $3 is omitted; default applied below
+ local size=$(($3)) #MB
+
+ if [ $format -ne 1 ] && [ $format -ne 2 ];then
+ echo "$func: image format must be 1 or 2"
+ exit
+ fi
+
+ if [ $size -eq 0 ];then
+ size=24 #MB
+ echo "$func: size = $size"
+ fi
+ local mnt=/rbdfuse
+
+ # mount rbd-fuse at /rbdfuse unless it is already mounted
+ mount |grep "rbd-fuse on /rbdfuse" &>/dev/null
+ if [ $? -ne 0 ];then
+ rbd-fuse $mnt
+ fi
+
+ create_image $image_name $size $format
+
+ local export_image_dir=$export_dir/pool_$pool_id/$image_name
+ mkdir -p $export_image_dir
+ local export_md5=$export_image_dir/@md5
+ >$export_md5
+
+ # create 9 snapshots
+ # image = {object0, object1, object2, object3, object4, object5, ...}
+ #
+ # snap1 : init/write all objects
+ # snap2 : write object0
+ # snap3 : write object1
+ # snap4 : write object2
+ # snap5 : write object3
+ # snap6 : write object4
+ # snap7 : write object5
+ # snap8 : write object0
+ # snap9 : write object3
+
+ make_snapshot $mnt/$image_name 0 $size snap1 $export_image_dir
+ make_snapshot $mnt/$image_name 0 1 snap2 $export_image_dir
+ make_snapshot $mnt/$image_name 4 1 snap3 $export_image_dir
+ make_snapshot $mnt/$image_name 8 1 snap4 $export_image_dir
+ make_snapshot $mnt/$image_name 12 1 snap5 $export_image_dir
+ make_snapshot $mnt/$image_name 16 1 snap6 $export_image_dir
+ make_snapshot $mnt/$image_name 20 1 snap7 $export_image_dir
+ make_snapshot $mnt/$image_name 1 1 snap8 $export_image_dir
+ make_snapshot $mnt/$image_name 13 1 snap9 $export_image_dir
+}
+
+# Compare the exported vs recovered md5 for a nosnap image and print a
+# PASSED/FAILED verdict. The recovered checksum only counts as PASSED
+# when the exported one is non-empty.
+function check_recover_nosnap()
+{
+ local func="check_recover_nosnap"
+ if [ $# -lt 3 ];then
+ # NOTE(review): prints usage but does not exit, so execution falls
+ # through with empty arguments — confirm whether that is intended.
+ echo "$func: parameters: <export_md5_file> <recover_md5_file> <image_name>"
+ fi
+ local export_md5=$1
+ local recover_md5=$2
+ local image_name=$3
+
+ local ifpassed="FAILED"
+
+ echo "================ < $image_name nosnap > ================"
+
+ local export_md5sum=`cat $export_md5`
+ local recover_md5sum=`cat $recover_md5`
+
+ if [ "$export_md5sum"x != ""x ] && [ "$export_md5sum"x = "$recover_md5sum"x ];then
+ ifpassed="PASSED"
+ fi
+ echo "export: $export_md5sum"
+ echo "recover: $recover_md5sum $ifpassed"
+}
+
+# Compare the per-snapshot md5 lines of the exported @md5 file against
+# the recovered @md5 file (9 snapshots), printing PASSED/FAILED per line.
+# Each @md5 line has the form "<md5sum> <image>@<snap>"; only field 1
+# (the checksum) is compared.
+function check_recover_snapshots()
+{
+ local func="check_recover_snapshots"
+ if [ $# -lt 3 ];then
+ # NOTE(review): prints usage but does not exit (same as
+ # check_recover_nosnap) — confirm whether that is intended.
+ echo "$func: parameters: <export_md5_file> <recover_md5_file> <image_name>"
+ fi
+ local export_md5=$1
+ local recover_md5=$2
+ local image_name=$3
+
+ local ifpassed="FAILED"
+
+ echo "================ < $image_name snapshots > ================"
+
+ # split the md5 files into one array entry per line
+ OIFS=$IFS
+ IFS=$'\n'
+ local export_md5s=(`cat $export_md5`)
+ local recover_md5s=(`cat $recover_md5`)
+ for((i=0; i<9; i++))
+ do
+ OOIFS=$IFS
+ IFS=$' '
+ local x=$(($i+1))
+ snapshot=snap$x
+
+ local export_arr=(`echo ${export_md5s[$i]}`)
+ local recover_arr=(`echo ${recover_md5s[$i]}`)
+ echo "export: ${export_md5s[$i]}"
+ # NOTE(review): compares field [1] (the name), not field [0] (the
+ # checksum), and ifpassed is never reset inside the loop — once one
+ # iteration sets PASSED, later mismatches still print PASSED.
+ if [ "${export_arr[1]}"x != ""x ] && [ "${export_arr[1]}"x = "${recover_arr[1]}"x ];then
+ ifpassed="PASSED"
+ fi
+ echo "recover: ${recover_md5s[$i]} $ifpassed"
+ IFS=$OOIFS
+ done
+ IFS=$OIFS
+}
+
+# step 1: export image, snapshot
+# Export a format-1 and a format-2 image (no snapshots) as test fixtures.
+function do_export_nosnap()
+{
+ export_image image_v1_nosnap 1
+ export_image image_v2_nosnap 2
+}
+
+# Export a format-1 and a format-2 image, each with 9 snapshots.
+function do_export_snap()
+{
+ export_snapshots image_v1_snap 1
+ export_snapshots image_v2_snap 2
+}
+
+# step 2: stop ceph cluster and gen database
+# Stop the ceph cluster (retrying until check_ceph_service reports it is
+# down), then rebuild the rbd-recover-tool database via do_gen_database.
+# Writes 0 to $gen_db first — presumably a "database not generated yet"
+# flag consumed by do_gen_database; verify against the tool script.
+function stop_cluster_gen_database()
+{
+ trap 'echo stop ceph cluster failed; exit;' INT HUP
+ stop_ceph
+ sleep 2
+ check_ceph_service
+ local res=$?
+ # keep stopping until no ceph service is left running
+ while [ $res -ne 0 ]
+ do
+ stop_ceph
+ sleep 2
+ check_ceph_service
+ res=$?
+ done
+
+ echo 0 >$gen_db
+ do_gen_database
+}
+
+# step 3: recover image,snapshot
+# Recover the nosnap test images from the generated database.
+function do_recover_nosnap()
+{
+ recover_image image_v1_nosnap
+ recover_image image_v2_nosnap
+}
+
+# Recover the snapshot test images (head + snap1..snap9).
+function do_recover_snap()
+{
+ recover_snapshots image_v1_snap
+ recover_snapshots image_v2_snap
+}
+
+# step 4: check md5sum pair<export_md5sum, recover_md5sum>
+# Compare export vs recovery checksums for the nosnap images.
+function do_check_recover_nosnap()
+{
+ local image1=image_v1_nosnap
+ local image2=image_v2_nosnap
+
+ local export_md5_1=$export_dir/pool_$pool_id/$image1/@md5_nosnap
+ local export_md5_2=$export_dir/pool_$pool_id/$image2/@md5_nosnap
+ local recover_md5_1=$recover_dir/pool_$pool_id/$image1/@md5_nosnap
+ local recover_md5_2=$recover_dir/pool_$pool_id/$image2/@md5_nosnap
+
+ check_recover_nosnap $export_md5_1 $recover_md5_1 $image1
+ check_recover_nosnap $export_md5_2 $recover_md5_2 $image2
+}
+
+# Compare export vs recovery checksums for the snapshot images.
+function do_check_recover_snap()
+{
+ local image1=image_v1_snap
+ local image2=image_v2_snap
+
+ local export_md5_1=$export_dir/pool_$pool_id/$image1/@md5
+ local export_md5_2=$export_dir/pool_$pool_id/$image2/@md5
+ local recover_md5_1=$recover_dir/pool_$pool_id/$image1/@md5
+ local recover_md5_2=$recover_dir/pool_$pool_id/$image2/@md5
+
+ check_recover_snapshots $export_md5_1 $recover_md5_1 $image1
+ check_recover_snapshots $export_md5_2 $recover_md5_2 $image2
+}
+
+# End-to-end scenario: nosnap images only.
+function test_case_1()
+{
+ do_export_nosnap
+ stop_cluster_gen_database
+ do_recover_nosnap
+ do_check_recover_nosnap
+}
+
+# End-to-end scenario: snapshot images only.
+function test_case_2()
+{
+ do_export_snap
+ stop_cluster_gen_database
+ do_recover_snap
+ do_check_recover_snap
+}
+
+# End-to-end scenario: both nosnap and snapshot images in one run.
+function test_case_3()
+{
+ do_export_nosnap
+ do_export_snap
+
+ stop_cluster_gen_database
+
+ do_recover_nosnap
+ do_recover_snap
+
+ do_check_recover_nosnap
+ do_check_recover_snap
+}
+
+
+# Entry point: initialize (init is defined earlier in this script) and
+# run the combined scenario.
+init $*
+test_case_3
diff --git a/src/tools/rbd_wnbd/CMakeLists.txt b/src/tools/rbd_wnbd/CMakeLists.txt
new file mode 100644
index 000000000..38f463961
--- /dev/null
+++ b/src/tools/rbd_wnbd/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_executable(rbd-wnbd wnbd_handler.cc rbd_wnbd.cc)
+set_target_properties(
+ rbd-wnbd PROPERTIES COMPILE_FLAGS
+ "-fpermissive -I${WNBD_INCLUDE_DIRS}")
+target_link_libraries(
+ rbd-wnbd setupapi rpcrt4
+ ${WNBD_LIBRARIES}
+ ${Boost_FILESYSTEM_LIBRARY}
+ librbd librados global)
+install(TARGETS rbd-wnbd DESTINATION bin)
diff --git a/src/tools/rbd_wnbd/rbd_wnbd.cc b/src/tools/rbd_wnbd/rbd_wnbd.cc
new file mode 100644
index 000000000..d2bb9d8e9
--- /dev/null
+++ b/src/tools/rbd_wnbd/rbd_wnbd.cc
@@ -0,0 +1,1690 @@
+/*
+ * rbd-wnbd - RBD in userspace
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+*/
+
+#include "include/int_types.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <boost/locale/encoding_utf.hpp>
+
+#include "wnbd_handler.h"
+#include "rbd_wnbd.h"
+
+#include <fstream>
+#include <memory>
+#include <regex>
+
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/version.h"
+#include "common/win32/service.h"
+#include "common/admin_socket_client.h"
+
+#include "global/global_init.h"
+
+#include "include/uuid.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+
+#include <shellapi.h>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd-wnbd: "
+
+using boost::locale::conv::utf_to_utf;
+
+// UTF-8 <-> UTF-16 conversion helpers for Win32 wide-string APIs.
+std::wstring to_wstring(const std::string& str)
+{
+ return utf_to_utf<wchar_t>(str.c_str(), str.c_str() + str.size());
+}
+
+std::string to_string(const std::wstring& str)
+{
+ return utf_to_utf<char>(str.c_str(), str.c_str() + str.size());
+}
+
+// Returns true while the process is still alive: a zero-timeout wait on
+// a process handle returns WAIT_TIMEOUT iff the process has not exited.
+// NOTE(review): OpenProcess may return NULL (access denied / recycled
+// pid); the wait then fails and such pids read as "not running" —
+// confirm that is acceptable.
+bool is_process_running(DWORD pid)
+{
+ HANDLE process = OpenProcess(SYNCHRONIZE, FALSE, pid);
+ DWORD ret = WaitForSingleObject(process, 0);
+ CloseHandle(process);
+ return ret == WAIT_TIMEOUT;
+}
+
+// Fetch the current WNBD connection list from the driver, growing the
+// buffer until it is large enough. On success the caller owns *conn_list
+// and must free() it; on failure the temporary buffer is released and a
+// Win32-style error code is returned.
+DWORD WNBDActiveDiskIterator::fetch_list(
+ PWNBD_CONNECTION_LIST* conn_list)
+{
+ DWORD curr_buff_sz = 0;
+ DWORD buff_sz = 0;
+ DWORD err = 0;
+ PWNBD_CONNECTION_LIST tmp_list = NULL;
+
+ // We're using a loop because other connections may show up by the time
+ // we retry.
+ do {
+ if (tmp_list)
+ free(tmp_list);
+
+ if (buff_sz) {
+ tmp_list = (PWNBD_CONNECTION_LIST) calloc(1, buff_sz);
+ if (!tmp_list) {
+ derr << "Could not allocate " << buff_sz << " bytes." << dendl;
+ err = ERROR_NOT_ENOUGH_MEMORY;
+ break;
+ }
+ }
+
+ curr_buff_sz = buff_sz;
+ // If the buffer is too small, the return value is 0 and "BufferSize"
+ // will contain the required size. This is counterintuitive, but
+ // Windows drivers can't return a buffer as well as a non-zero status.
+ err = WnbdList(tmp_list, &buff_sz);
+ if (err)
+ break;
+ } while (curr_buff_sz < buff_sz);
+
+ if (err) {
+ if (tmp_list)
+ free(tmp_list);
+ } else {
+ *conn_list = tmp_list;
+ }
+ return err;
+}
+
+// Snapshot the driver's connection list up-front; iteration then walks
+// the cached list. A fetch failure is surfaced through get_error().
+WNBDActiveDiskIterator::WNBDActiveDiskIterator()
+{
+ DWORD status = WNBDActiveDiskIterator::fetch_list(&conn_list);
+ if (status) {
+ error = EINVAL;
+ }
+}
+
+// Release the connection list allocated by fetch_list().
+WNBDActiveDiskIterator::~WNBDActiveDiskIterator()
+{
+ if (conn_list) {
+ free(conn_list);
+ conn_list = NULL;
+ }
+}
+
+// Advance to the next rbd-wnbd owned connection and fill *cfg from the
+// registry plus the driver's runtime info (disk number, serial, pid).
+// Returns false when the list is exhausted or the registry lookup fails
+// (in which case get_error() is set). Disks owned by other WNBD clients
+// are skipped via tail recursion.
+bool WNBDActiveDiskIterator::get(Config *cfg)
+{
+ index += 1;
+ *cfg = Config();
+
+ if (!conn_list || index >= (int)conn_list->Count) {
+ return false;
+ }
+
+ auto conn_info = conn_list->Connections[index];
+ auto conn_props = conn_info.Properties;
+
+ // Only yield disks mapped by rbd-wnbd itself.
+ if (strncmp(conn_props.Owner, RBD_WNBD_OWNER_NAME, WNBD_MAX_OWNER_LENGTH)) {
+ dout(10) << "Ignoring disk: " << conn_props.InstanceName
+ << ". Owner: " << conn_props.Owner << dendl;
+ return this->get(cfg);
+ }
+
+ error = load_mapping_config_from_registry(conn_props.InstanceName, cfg);
+ if (error) {
+ derr << "Could not load registry disk info for: "
+ << conn_props.InstanceName << ". Error: " << error << dendl;
+ return false;
+ }
+
+ cfg->disk_number = conn_info.DiskNumber;
+ cfg->serial_number = std::string(conn_props.SerialNumber);
+ cfg->pid = conn_props.Pid;
+ // A mapping counts as active only if the disk showed up and the
+ // owning daemon process is still alive.
+ cfg->active = cfg->disk_number > 0 && is_process_running(conn_props.Pid);
+ cfg->wnbd_mapped = true;
+
+ return true;
+}
+
+// Open the service registry key and count its subkeys (one per mapping).
+// A missing key is not an error (no mappings registered yet); any other
+// failure is reported through get_error().
+RegistryDiskIterator::RegistryDiskIterator()
+{
+ reg_key = new RegistryKey(g_ceph_context, HKEY_LOCAL_MACHINE,
+ SERVICE_REG_KEY, false);
+ if (!reg_key->hKey) {
+ if (!reg_key->missingKey)
+ error = EINVAL;
+ return;
+ }
+
+ if (RegQueryInfoKey(reg_key->hKey, NULL, NULL, NULL, &subkey_count,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL)) {
+ derr << "Could not query registry key: " << SERVICE_REG_KEY << dendl;
+ error = EINVAL;
+ return;
+ }
+}
+
+// Enumerate the next registry-persisted mapping and load it into *cfg.
+// Returns false at the end of enumeration or on error (get_error() set).
+bool RegistryDiskIterator::get(Config *cfg)
+{
+ index += 1;
+ *cfg = Config();
+
+ if (!reg_key->hKey || !subkey_count) {
+ return false;
+ }
+
+ char subkey_name[MAX_PATH] = {0};
+ DWORD subkey_name_sz = MAX_PATH;
+ int err = RegEnumKeyEx(
+ reg_key->hKey, index, subkey_name, &subkey_name_sz,
+ NULL, NULL, NULL, NULL);
+ if (err == ERROR_NO_MORE_ITEMS) {
+ return false;
+ } else if (err) {
+ derr << "Could not enumerate registry. Error: " << err << dendl;
+ error = EINVAL;
+ return false;
+ }
+
+ // Each subkey name is a device path; its values hold the mapping config.
+ if (load_mapping_config_from_registry(subkey_name, cfg)) {
+ error = EINVAL;
+ return false;
+ };
+
+ return true;
+}
+
+// Iterate over all RBD mappings, getting info from the registry and the driver.
+// Active (driver-known) mappings are yielded first; registry-only entries
+// follow, skipping devpaths already seen so each mapping appears once.
+bool WNBDDiskIterator::get(Config *cfg)
+{
+ *cfg = Config();
+
+ bool found_active = active_iterator.get(cfg);
+ if (found_active) {
+ // Remember the devpath so the registry pass can skip it below.
+ active_devices.insert(cfg->devpath);
+ return true;
+ }
+
+ error = active_iterator.get_error();
+ if (error) {
+ dout(5) << ": WNBD iterator error: " << error << dendl;
+ return false;
+ }
+
+ while(registry_iterator.get(cfg)) {
+ if (active_devices.find(cfg->devpath) != active_devices.end()) {
+ // Skip active devices that were already yielded.
+ continue;
+ }
+ return true;
+ }
+
+ error = registry_iterator.get_error();
+ if (error) {
+ dout(5) << ": Registry iterator error: " << error << dendl;
+ }
+ return false;
+}
+
+// Resolve the full path of the current executable (used to respawn
+// rbd-wnbd without trusting arbitrary command strings). Returns 0 on
+// success, -EINVAL on failure.
+int get_exe_path(std::string& path) {
+ char buffer[MAX_PATH];
+ DWORD err = 0;
+
+ int ret = GetModuleFileNameA(NULL, buffer, MAX_PATH);
+ // ret == MAX_PATH indicates truncation, treated as failure too.
+ if (!ret || ret == MAX_PATH) {
+ err = GetLastError();
+ derr << "Could not retrieve executable path. "
+ << "Error: " << win32_strerror(err) << dendl;
+ return -EINVAL;
+ }
+
+ path = buffer;
+ return 0;
+}
+
+// Re-quote and join this process's CLI arguments (argv[1..]) into a
+// single string, e.g. for persisting the mapping command line.
+std::string get_cli_args() {
+ std::ostringstream cmdline;
+ for (int i=1; i<__argc; i++) {
+ if (i > 1)
+ cmdline << " ";
+ cmdline << std::quoted(__argv[i]);
+ }
+ return cmdline.str();
+}
+
+// Forward a "map" command line to the running ceph service over its
+// named pipe and wait for the reply. Returns the service's status code,
+// or -EINVAL when the pipe call itself fails (e.g. service not running).
+int send_map_request(std::string arguments) {
+ dout(15) << __func__ << ": command arguments: " << arguments << dendl;
+
+ BYTE request_buff[SERVICE_PIPE_BUFFSZ] = { 0 };
+ ServiceRequest* request = (ServiceRequest*) request_buff;
+ request->command = Connect;
+ // Copy at most the space left after the fixed header; longer argument
+ // strings are silently truncated.
+ arguments.copy(
+ (char*)request->arguments,
+ SERVICE_PIPE_BUFFSZ - FIELD_OFFSET(ServiceRequest, arguments));
+ ServiceReply reply = { 0 };
+
+ DWORD bytes_read = 0;
+ // CallNamedPipe connects, writes the request, reads one reply and
+ // disconnects in a single call.
+ BOOL success = CallNamedPipe(
+ SERVICE_PIPE_NAME,
+ request_buff,
+ SERVICE_PIPE_BUFFSZ,
+ &reply,
+ sizeof(reply),
+ &bytes_read,
+ DEFAULT_MAP_TIMEOUT_MS);
+ if (!success) {
+ DWORD err = GetLastError();
+ derr << "Could not send device map request. "
+ << "Make sure that the ceph service is running. "
+ << "Error: " << win32_strerror(err) << dendl;
+ return -EINVAL;
+ }
+ if (reply.status) {
+ derr << "The ceph service failed to map the image. Error: "
+ << reply.status << dendl;
+ }
+
+ return reply.status;
+}
+
+// Spawn a subprocess using the specified "rbd-wnbd" command
+// arguments. A pipe is passed to the child process,
+// which will allow it to communicate the mapping status
+// (NOTE(review): function name misspells "subprocess"; callers use this
+// spelling, so it is left as-is.)
+// Returns 0 once the child signals readiness over the pipe; negative
+// errno-style codes on failure. Cleanup uses goto labels, so statement
+// order below matters.
+int map_device_using_suprocess(std::string arguments, int timeout_ms)
+{
+ STARTUPINFO si;
+ PROCESS_INFORMATION pi;
+ char ch;
+ DWORD err = 0, status = 0;
+ int exit_code = 0;
+ std::ostringstream command_line;
+ std::string exe_path;
+ // Windows async IO context
+ OVERLAPPED connect_o, read_o;
+ HANDLE connect_event = NULL, read_event = NULL;
+ // Used for waiting on multiple events that are going to be initialized later.
+ HANDLE wait_events[2] = { INVALID_HANDLE_VALUE, INVALID_HANDLE_VALUE};
+ DWORD bytes_read = 0;
+ // We may get a command line containing an old pipe handle when
+ // recreating mappings, so we'll have to replace it.
+ std::regex pipe_pattern("([\'\"]?--pipe-name[\'\"]? +[\'\"]?[^ ]+[\'\"]?)");
+
+ uuid_d uuid;
+ uuid.generate_random();
+ std::ostringstream pipe_name;
+ pipe_name << "\\\\.\\pipe\\rbd-wnbd-" << uuid;
+
+ // Create an unique named pipe to communicate with the child. */
+ HANDLE pipe_handle = CreateNamedPipe(
+ pipe_name.str().c_str(),
+ PIPE_ACCESS_INBOUND | FILE_FLAG_FIRST_PIPE_INSTANCE |
+ FILE_FLAG_OVERLAPPED,
+ PIPE_WAIT,
+ 1, // Only accept one instance
+ SERVICE_PIPE_BUFFSZ,
+ SERVICE_PIPE_BUFFSZ,
+ SERVICE_PIPE_TIMEOUT_MS,
+ NULL);
+ if (pipe_handle == INVALID_HANDLE_VALUE) {
+ err = GetLastError();
+ derr << "CreateNamedPipe failed: " << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto finally;
+ }
+ // Manual-reset events backing the two overlapped operations below.
+ connect_event = CreateEvent(0, TRUE, FALSE, NULL);
+ read_event = CreateEvent(0, TRUE, FALSE, NULL);
+ if (!connect_event || !read_event) {
+ err = GetLastError();
+ derr << "CreateEvent failed: " << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto finally;
+ }
+ connect_o.hEvent = connect_event;
+ read_o.hEvent = read_event;
+
+ // Start accepting the child's connection asynchronously; with an
+ // overlapped pipe, ERROR_IO_PENDING is the expected outcome.
+ status = ConnectNamedPipe(pipe_handle, &connect_o);
+ err = GetLastError();
+ if (status || err != ERROR_IO_PENDING) {
+ if (status)
+ err = status;
+ derr << "ConnectNamedPipe failed: " << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto finally;
+ }
+ err = 0;
+
+ dout(5) << __func__ << ": command arguments: " << arguments << dendl;
+
+ // We'll avoid running arbitrary commands, instead using the executable
+ // path of this process (expected to be the full rbd-wnbd.exe path).
+ err = get_exe_path(exe_path);
+ if (err) {
+ exit_code = -EINVAL;
+ goto finally;
+ }
+ // Strip any stale --pipe-name from the stored args and append ours.
+ command_line << std::quoted(exe_path)
+ << " " << std::regex_replace(arguments, pipe_pattern, "")
+ << " --pipe-name " << pipe_name.str();
+
+ dout(5) << __func__ << ": command line: " << command_line.str() << dendl;
+
+ GetStartupInfo(&si);
+ // Create a detached child
+ if (!CreateProcess(NULL, (char*)command_line.str().c_str(),
+ NULL, NULL, FALSE, DETACHED_PROCESS,
+ NULL, NULL, &si, &pi)) {
+ err = GetLastError();
+ derr << "CreateProcess failed: " << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto finally;
+ }
+
+ // Wait for either the pipe connection or a premature child exit.
+ wait_events[0] = connect_event;
+ wait_events[1] = pi.hProcess;
+ status = WaitForMultipleObjects(2, wait_events, FALSE, timeout_ms);
+ switch(status) {
+ case WAIT_OBJECT_0:
+ if (!GetOverlappedResult(pipe_handle, &connect_o, &bytes_read, TRUE)) {
+ err = GetLastError();
+ // NOTE(review): "Couln't" typo in the log message below; left
+ // untouched since runtime strings are not changed here.
+ derr << "Couln't establish a connection with the child process. "
+ << "Error: " << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto clean_process;
+ }
+ // We have an incoming connection.
+ break;
+ case WAIT_OBJECT_0 + 1:
+ // The process has exited prematurely.
+ goto clean_process;
+ case WAIT_TIMEOUT:
+ derr << "Timed out waiting for child process connection." << dendl;
+ goto clean_process;
+ default:
+ derr << "Failed waiting for child process. Status: " << status << dendl;
+ goto clean_process;
+ }
+ // Block and wait for child to say it is ready.
+ dout(5) << __func__ << ": waiting for child notification." << dendl;
+ if (!ReadFile(pipe_handle, &ch, 1, NULL, &read_o)) {
+ err = GetLastError();
+ if (err != ERROR_IO_PENDING) {
+ derr << "Receiving child process reply failed with: "
+ << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto clean_process;
+ }
+ }
+ wait_events[0] = read_event;
+ wait_events[1] = pi.hProcess;
+ // The RBD daemon is expected to write back right after opening the
+ // pipe. We'll use the same timeout value for now.
+ status = WaitForMultipleObjects(2, wait_events, FALSE, timeout_ms);
+ switch(status) {
+ case WAIT_OBJECT_0:
+ if (!GetOverlappedResult(pipe_handle, &read_o, &bytes_read, TRUE)) {
+ err = GetLastError();
+ derr << "Receiving child process reply failed with: "
+ << win32_strerror(err) << dendl;
+ exit_code = -ECHILD;
+ goto clean_process;
+ }
+ break;
+ case WAIT_OBJECT_0 + 1:
+ // The process has exited prematurely.
+ goto clean_process;
+ case WAIT_TIMEOUT:
+ derr << "Timed out waiting for child process message." << dendl;
+ goto clean_process;
+ default:
+ derr << "Failed waiting for child process. Status: " << status << dendl;
+ goto clean_process;
+ }
+
+ dout(5) << __func__ << ": received child notification." << dendl;
+ goto finally;
+
+ clean_process:
+ // If the child already died, surface its exit code; otherwise it is
+ // unresponsive and gets terminated.
+ if (!is_process_running(pi.dwProcessId)) {
+ GetExitCodeProcess(pi.hProcess, (PDWORD)&exit_code);
+ derr << "Daemon failed with: " << cpp_strerror(exit_code) << dendl;
+ } else {
+ // The process closed the pipe without notifying us or exiting.
+ // This is quite unlikely, but we'll terminate the process.
+ dout(5) << "Terminating unresponsive process." << dendl;
+ TerminateProcess(pi.hProcess, 1);
+ exit_code = -EINVAL;
+ }
+
+ finally:
+ if (exit_code)
+ derr << "Could not start RBD daemon." << dendl;
+ if (pipe_handle)
+ CloseHandle(pipe_handle);
+ if (connect_event)
+ CloseHandle(connect_event);
+ if (read_event)
+ CloseHandle(read_event);
+ return exit_code;
+}
+
+// Console control handler (Ctrl+C / Ctrl+Break / close): trigger a clean
+// shutdown of the active WNBD handler, if any. `shutdown_lock` and
+// `handler` are globals declared elsewhere in this file.
+BOOL WINAPI console_handler_routine(DWORD dwCtrlType)
+{
+ dout(5) << "Received control signal: " << dwCtrlType
+ << ". Exiting." << dendl;
+
+ std::unique_lock l{shutdown_lock};
+ if (handler)
+ handler->shutdown();
+
+ return true;
+}
+
+// Persist a mapping's configuration under
+// HKLM\<SERVICE_REG_KEY>\<devpath> so the service can recreate it after
+// a restart or crash. Returns 0 on success, -EINVAL on any failure.
+int save_config_to_registry(Config* cfg)
+{
+ std::string strKey{ SERVICE_REG_KEY };
+ strKey.append("\\");
+ strKey.append(cfg->devpath);
+ auto reg_key = RegistryKey(
+ g_ceph_context, HKEY_LOCAL_MACHINE, strKey.c_str(), true);
+ if (!reg_key.hKey) {
+ return -EINVAL;
+ }
+
+ int ret_val = 0;
+ // Registry writes are immediately available to other processes.
+ // Still, we'll do a flush to ensure that the mapping can be
+ // recreated after a system crash.
+ if (reg_key.set("pid", getpid()) ||
+ reg_key.set("devpath", cfg->devpath) ||
+ reg_key.set("poolname", cfg->poolname) ||
+ reg_key.set("nsname", cfg->nsname) ||
+ reg_key.set("imgname", cfg->imgname) ||
+ reg_key.set("snapname", cfg->snapname) ||
+ reg_key.set("command_line", get_cli_args()) ||
+ reg_key.set("persistent", cfg->persistent) ||
+ reg_key.set("admin_sock_path", g_conf()->admin_socket) ||
+ reg_key.flush()) {
+ ret_val = -EINVAL;
+ }
+
+ return ret_val;
+}
+
+// Delete the registry subkey for the given mapping.
+int remove_config_from_registry(Config* cfg)
+{
+ std::string strKey{ SERVICE_REG_KEY };
+ strKey.append("\\");
+ strKey.append(cfg->devpath);
+ return RegistryKey::remove(
+ g_ceph_context, HKEY_LOCAL_MACHINE, strKey.c_str());
+}
+
+// Load a mapping's configuration from its registry subkey into *cfg.
+// Returns 0 on success, -ENOENT when the key does not exist, -EINVAL on
+// any other failure. Return values of the individual get() calls are
+// ignored; missing values leave cfg fields at their defaults.
+int load_mapping_config_from_registry(string devpath, Config* cfg)
+{
+ std::string strKey{ SERVICE_REG_KEY };
+ strKey.append("\\");
+ strKey.append(devpath);
+ auto reg_key = RegistryKey(
+ g_ceph_context, HKEY_LOCAL_MACHINE, strKey.c_str(), false);
+ if (!reg_key.hKey) {
+ if (reg_key.missingKey)
+ return -ENOENT;
+ else
+ return -EINVAL;
+ }
+
+ reg_key.get("devpath", cfg->devpath);
+ reg_key.get("poolname", cfg->poolname);
+ reg_key.get("nsname", cfg->nsname);
+ reg_key.get("imgname", cfg->imgname);
+ reg_key.get("snapname", cfg->snapname);
+ reg_key.get("command_line", cfg->command_line);
+ reg_key.get("persistent", cfg->persistent);
+ reg_key.get("admin_sock_path", cfg->admin_sock_path);
+
+ return 0;
+}
+
+// Recreate all persisted mappings after a service restart, fanning the
+// per-image remaps out over a thread pool. Non-persistent registry
+// entries are cleaned up instead of remapped. Each remap gets the
+// remaining share of `total_timeout`, capped at `image_map_timeout`
+// (both in seconds). Returns 0 or the last error encountered.
+int restart_registered_mappings(
+ int worker_count,
+ int total_timeout,
+ int image_map_timeout)
+{
+ Config cfg;
+ WNBDDiskIterator iterator;
+ // NOTE(review): `err` and `r` are captured by reference and written
+ // from multiple pool workers without synchronization — confirm
+ // whether torn updates are acceptable here (worst case: a lost
+ // error code).
+ int err = 0, r;
+
+ // max(x, x*1000) keeps already-huge values from overflowing into
+ // something smaller; normal inputs are converted seconds -> ms.
+ int total_timeout_ms = max(total_timeout, total_timeout * 1000);
+ int image_map_timeout_ms = max(image_map_timeout, image_map_timeout * 1000);
+
+ LARGE_INTEGER start_t, counter_freq;
+ QueryPerformanceFrequency(&counter_freq);
+ QueryPerformanceCounter(&start_t);
+
+ boost::asio::thread_pool pool(worker_count);
+ while (iterator.get(&cfg)) {
+ if (cfg.command_line.empty()) {
+ derr << "Could not recreate mapping, missing command line: "
+ << cfg.devpath << dendl;
+ err = -EINVAL;
+ continue;
+ }
+ if (cfg.wnbd_mapped) {
+ dout(5) << __func__ << ": device already mapped: "
+ << cfg.devpath << dendl;
+ continue;
+ }
+ if (!cfg.persistent) {
+ dout(5) << __func__ << ": cleaning up non-persistent mapping: "
+ << cfg.devpath << dendl;
+ r = remove_config_from_registry(&cfg);
+ if (r) {
+ derr << __func__ << ": could not clean up non-persistent mapping: "
+ << cfg.devpath << dendl;
+ }
+ continue;
+ }
+
+ // cfg is captured by value: the loop reuses the local one.
+ boost::asio::post(pool,
+ [&, cfg]() mutable
+ {
+ // Compute elapsed wall time to derive this task's time budget.
+ LARGE_INTEGER curr_t, elapsed_ms;
+ QueryPerformanceCounter(&curr_t);
+ elapsed_ms.QuadPart = curr_t.QuadPart - start_t.QuadPart;
+ elapsed_ms.QuadPart *= 1000;
+ elapsed_ms.QuadPart /= counter_freq.QuadPart;
+
+ int time_left_ms = max(
+ 0,
+ total_timeout_ms - (int)elapsed_ms.QuadPart);
+ time_left_ms = min(image_map_timeout_ms, time_left_ms);
+ if (!time_left_ms) {
+ err = -ETIMEDOUT;
+ return;
+ }
+
+ dout(5) << "Remapping: " << cfg.devpath
+ << ". Timeout: " << time_left_ms << " ms." << dendl;
+
+ // We'll try to map all devices and return a non-zero value
+ // if any of them fails.
+ r = map_device_using_suprocess(cfg.command_line, time_left_ms);
+ if (r) {
+ err = r;
+ derr << "Could not create mapping: "
+ << cfg.devpath << ". Error: " << r << dendl;
+ } else {
+ dout(5) << "Successfully remapped: " << cfg.devpath << dendl;
+ }
+ });
+ }
+ pool.join();
+
+ r = iterator.get_error();
+ if (r) {
+ derr << "Could not fetch all mappings. Error: " << r << dendl;
+ err = r;
+ }
+
+ return err;
+}
+
+// Disconnect every active rbd-wnbd mapping, in parallel. The soft
+// disconnect timeout is cumulative across all mappings; once it is used
+// up, remaining disconnects are forced hard. When `unregister` is set,
+// the registry entry is removed as well. Returns 0 or the last error.
+int disconnect_all_mappings(
+ bool unregister,
+ bool hard_disconnect,
+ int soft_disconnect_timeout,
+ int worker_count)
+{
+ // Although not generally recommended, soft_disconnect_timeout can be 0,
+ // which means infinite timeout.
+ ceph_assert(soft_disconnect_timeout >= 0);
+ ceph_assert(worker_count > 0);
+ int64_t timeout_ms = soft_disconnect_timeout * 1000;
+
+ Config cfg;
+ WNBDActiveDiskIterator iterator;
+ // NOTE(review): `err` and `r` are shared across pool workers without
+ // synchronization (same pattern as restart_registered_mappings).
+ int err = 0, r;
+
+ boost::asio::thread_pool pool(worker_count);
+ LARGE_INTEGER start_t, counter_freq;
+ QueryPerformanceFrequency(&counter_freq);
+ QueryPerformanceCounter(&start_t);
+ while (iterator.get(&cfg)) {
+ boost::asio::post(pool,
+ [&, cfg]() mutable
+ {
+ // Derive the remaining soft-disconnect budget from elapsed time.
+ LARGE_INTEGER curr_t, elapsed_ms;
+ QueryPerformanceCounter(&curr_t);
+ elapsed_ms.QuadPart = curr_t.QuadPart - start_t.QuadPart;
+ elapsed_ms.QuadPart *= 1000;
+ elapsed_ms.QuadPart /= counter_freq.QuadPart;
+
+ int64_t time_left_ms = max((int64_t)0, timeout_ms - elapsed_ms.QuadPart);
+
+ // Out of budget -> force a hard disconnect.
+ cfg.hard_disconnect = hard_disconnect || !time_left_ms;
+ cfg.hard_disconnect_fallback = true;
+ cfg.soft_disconnect_timeout = time_left_ms / 1000;
+
+ dout(5) << "Removing mapping: " << cfg.devpath
+ << ". Timeout: " << cfg.soft_disconnect_timeout
+ << "s. Hard disconnect: " << cfg.hard_disconnect
+ << dendl;
+
+ r = do_unmap(&cfg, unregister);
+ if (r) {
+ err = r;
+ derr << "Could not remove mapping: " << cfg.devpath
+ << ". Error: " << r << dendl;
+ } else {
+ dout(5) << "Successfully removed mapping: " << cfg.devpath << dendl;
+ }
+ });
+ }
+ pool.join();
+
+ r = iterator.get_error();
+ if (r) {
+ derr << "Could not fetch all mappings. Error: " << r << dendl;
+ err = r;
+ }
+
+ return err;
+}
+
+class RBDService : public ServiceBase {
+ private:
+ bool hard_disconnect;
+ int soft_disconnect_timeout;
+ int thread_count;
+ int service_start_timeout;
+ int image_map_timeout;
+ bool remap_failure_fatal;
+
+ public:
+ RBDService(bool _hard_disconnect,
+ int _soft_disconnect_timeout,
+ int _thread_count,
+ int _service_start_timeout,
+ int _image_map_timeout,
+ bool _remap_failure_fatal)
+ : ServiceBase(g_ceph_context)
+ , hard_disconnect(_hard_disconnect)
+ , soft_disconnect_timeout(_soft_disconnect_timeout)
+ , thread_count(_thread_count)
+ , service_start_timeout(_service_start_timeout)
+ , image_map_timeout(_image_map_timeout)
+ , remap_failure_fatal(_remap_failure_fatal)
+ {
+ }
+
+ static int execute_command(ServiceRequest* request)
+ {
+ switch(request->command) {
+ case Connect:
+ dout(5) << "Received device connect request. Command line: "
+ << (char*)request->arguments << dendl;
+ // TODO: use the configured service map timeout.
+ // TODO: add ceph.conf options.
+ return map_device_using_suprocess(
+ (char*)request->arguments, DEFAULT_MAP_TIMEOUT_MS);
+ default:
+ dout(5) << "Received unsupported command: "
+ << request->command << dendl;
+ return -ENOSYS;
+ }
+ }
+
+ static DWORD handle_connection(HANDLE pipe_handle)
+ {
+ PBYTE message[SERVICE_PIPE_BUFFSZ] = { 0 };
+ DWORD bytes_read = 0, bytes_written = 0;
+ DWORD err = 0;
+ DWORD reply_sz = 0;
+ ServiceReply reply = { 0 };
+
+ dout(20) << __func__ << ": Receiving message." << dendl;
+ BOOL success = ReadFile(
+ pipe_handle, message, SERVICE_PIPE_BUFFSZ,
+ &bytes_read, NULL);
+ if (!success || !bytes_read) {
+ err = GetLastError();
+ derr << "Could not read service command: "
+ << win32_strerror(err) << dendl;
+ goto exit;
+ }
+
+ dout(20) << __func__ << ": Executing command." << dendl;
+ reply.status = execute_command((ServiceRequest*) message);
+ reply_sz = sizeof(reply);
+
+ dout(20) << __func__ << ": Sending reply. Status: "
+ << reply.status << dendl;
+ success = WriteFile(
+ pipe_handle, &reply, reply_sz, &bytes_written, NULL);
+ if (!success || reply_sz != bytes_written) {
+ err = GetLastError();
+ derr << "Could not send service command result: "
+ << win32_strerror(err) << dendl;
+ }
+
+exit:
+ dout(20) << __func__ << ": Cleaning up connection." << dendl;
+ FlushFileBuffers(pipe_handle);
+ DisconnectNamedPipe(pipe_handle);
+ CloseHandle(pipe_handle);
+
+ return err;
+ }
+
+ // We have to support Windows server 2016. Unix sockets only work on
+ // WS 2019, so we can't use the Ceph admin socket abstraction.
+ // Getting the Ceph admin sockets to work with Windows named pipes
+ // would require quite a few changes.
+ static DWORD accept_pipe_connection() {
+ DWORD err = 0;
+ // We're currently using default ACLs, which grant full control to the
+ // LocalSystem account and administrator as well as the owner.
+ dout(20) << __func__ << ": opening new pipe instance" << dendl;
+ HANDLE pipe_handle = CreateNamedPipe(
+ SERVICE_PIPE_NAME,
+ PIPE_ACCESS_DUPLEX,
+ PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_WAIT,
+ PIPE_UNLIMITED_INSTANCES,
+ SERVICE_PIPE_BUFFSZ,
+ SERVICE_PIPE_BUFFSZ,
+ SERVICE_PIPE_TIMEOUT_MS,
+ NULL);
+ if (pipe_handle == INVALID_HANDLE_VALUE) {
+ err = GetLastError();
+ derr << "CreatePipe failed: " << win32_strerror(err) << dendl;
+ return -EINVAL;
+ }
+
+ dout(20) << __func__ << ": waiting for connections." << dendl;
+ BOOL connected = ConnectNamedPipe(pipe_handle, NULL);
+ if (!connected) {
+ err = GetLastError();
+ if (err != ERROR_PIPE_CONNECTED) {
+ derr << "Pipe connection failed: " << win32_strerror(err) << dendl;
+
+ CloseHandle(pipe_handle);
+ return err;
+ }
+ }
+
+ dout(20) << __func__ << ": Connection received." << dendl;
+ // We'll handle the connection in a separate thread and at the same time
+ // accept a new connection.
+ HANDLE handler_thread = CreateThread(
+ NULL, 0, (LPTHREAD_START_ROUTINE) handle_connection, pipe_handle, 0, 0);
+ if (!handler_thread) {
+ err = GetLastError();
+ derr << "Could not start pipe connection handler thread: "
+ << win32_strerror(err) << dendl;
+ CloseHandle(pipe_handle);
+ } else {
+ CloseHandle(handler_thread);
+ }
+
+ return err;
+ }
+
+ static int pipe_server_loop(LPVOID arg)
+ {
+ dout(5) << "Accepting admin pipe connections." << dendl;
+ while (1) {
+ // This call will block until a connection is received, which will
+ // then be handled in a separate thread. The function returns, allowing
+ // us to accept another simultaneous connection.
+ accept_pipe_connection();
+ }
+ return 0;
+ }
+
+ int create_pipe_server() {
+ HANDLE handler_thread = CreateThread(
+ NULL, 0, (LPTHREAD_START_ROUTINE) pipe_server_loop, NULL, 0, 0);
+ DWORD err = 0;
+
+ if (!handler_thread) {
+ err = GetLastError();
+ derr << "Could not start pipe server: " << win32_strerror(err) << dendl;
+ } else {
+ CloseHandle(handler_thread);
+ }
+
+ return err;
+ }
+
+  // Invoked when the service starts: bring back persistent mappings and
+  // then start listening for new map/unmap requests on the admin pipe.
+  int run_hook() override {
+    // Restart registered mappings before accepting new ones.
+    int r = restart_registered_mappings(
+      thread_count, service_start_timeout, image_map_timeout);
+    if (r) {
+      if (remap_failure_fatal) {
+        // Abort service startup; the caller is expected to clean up the
+        // mappings that did come up.
+        derr << "Couldn't remap all images. Cleaning up." << dendl;
+        return r;
+      } else {
+        dout(0) << "Ignoring image remap failure." << dendl;
+      }
+    }
+
+    return create_pipe_server();
+  }
+
+  // Invoked when the service is requested to stop.
+  // Disconnects every mapping (unregister=false keeps the registry records
+  // so that persistent mappings can be restored on the next start).
+  int stop_hook() override {
+    return disconnect_all_mappings(
+      false, hard_disconnect, soft_disconnect_timeout, thread_count);
+  }
+  // Invoked when the system is shutting down; same behavior as a regular
+  // service stop.
+  int shutdown_hook() override {
+    return stop_hook();
+  }
+};
+
+// Print the command line help, followed by the generic server options.
+// Fixed typos in the user-facing text: "disconnectd" -> "disconnected",
+// "Cummulative" -> "Cumulative".
+static void usage()
+{
+  const char* usage_str =R"(
+Usage: rbd-wnbd [options] map <image-or-snap-spec> Map an image to wnbd device
+ [options] unmap <device|image-or-snap-spec> Unmap wnbd device
+ [options] list List mapped wnbd devices
+ [options] show <image-or-snap-spec> Show mapped wnbd device
+ stats <image-or-snap-spec> Show IO counters
+ [options] service Windows service entrypoint,
+ handling device lifecycle
+
+Map options:
+ --device <device path> Optional mapping unique identifier
+ --exclusive Forbid writes by other clients
+ --read-only Map read-only
+ --non-persistent Do not recreate the mapping when the Ceph service
+ restarts. By default, mappings are persistent
+ --io-req-workers The number of workers that dispatch IO requests.
+ Default: 4
+ --io-reply-workers The number of workers that dispatch IO replies.
+ Default: 4
+
+Unmap options:
+ --hard-disconnect Skip attempting a soft disconnect
+ --no-hard-disconnect-fallback Immediately return an error if the soft
+ disconnect fails instead of attempting a hard
+ disconnect as fallback
+ --soft-disconnect-timeout Soft disconnect timeout in seconds. The soft
+ disconnect operation uses PnP to notify the
+ Windows storage stack that the device is going to
+ be disconnected. Storage drivers can block this
+ operation if there are pending operations,
+ unflushed caches or open handles. Default: 15
+
+Service options:
+ --hard-disconnect Skip attempting a soft disconnect
+ --soft-disconnect-timeout Cumulative soft disconnect timeout in seconds,
+ used when disconnecting existing mappings. A hard
+ disconnect will be issued when hitting the timeout
+ --service-thread-count The number of workers used when mapping or
+ unmapping images. Default: 8
+ --start-timeout The service start timeout in seconds. Default: 120
+ --map-timeout Individual image map timeout in seconds. Default: 20
+ --remap-failure-fatal If set, the service will stop when failing to remap
+ an image at start time, unmapping images that have
+ been mapped so far.
+
+Show|List options:
+ --format plain|json|xml Output format (default: plain)
+ --pretty-format Pretty formatting (json and xml)
+
+Common options:
+ --wnbd-log-level libwnbd.dll log level
+
+)";
+
+  std::cout << usage_str;
+  generic_server_usage();
+}
+
+
+// Subcommand selected by parse_args(); also consulted by do_global_init()
+// to pick the appropriate init flags.
+static Command cmd = None;
+
+// Derive a device identifier from the image spec when none was supplied.
+// Windows doesn't allow us to request specific disk paths when mapping an
+// image, so this path is only used by rbd-wnbd and wnbd as an identifier,
+// of the form [pool/][namespace/]image[@snap].
+// Returns 0 on success, -EINVAL when no image name is available either.
+int construct_devpath_if_missing(Config* cfg)
+{
+  if (!cfg->devpath.empty()) {
+    // The user already picked an identifier; keep it.
+    return 0;
+  }
+
+  if (cfg->imgname.empty()) {
+    derr << "Missing image name." << dendl;
+    return -EINVAL;
+  }
+
+  std::string path;
+  if (!cfg->poolname.empty()) {
+    path.append(cfg->poolname).append("/");
+  }
+  if (!cfg->nsname.empty()) {
+    path.append(cfg->nsname).append("/");
+  }
+  path.append(cfg->imgname);
+  if (!cfg->snapname.empty()) {
+    path.append("@").append(cfg->snapname);
+  }
+
+  cfg->devpath = path;
+  return 0;
+}
+
+// Perform Ceph global initialization, choosing the code environment and
+// init flags based on the selected subcommand ("cmd" global).
+// Returns the CephContext keeping the global state alive.
+boost::intrusive_ptr<CephContext> do_global_init(
+  int argc, const char *argv[], Config *cfg)
+{
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+
+  code_environment_t code_env;
+  int flags;
+
+  switch(cmd) {
+    case Connect:
+      // Mappings run in the background, daemon-style.
+      code_env = CODE_ENVIRONMENT_DAEMON;
+      flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS;
+      break;
+    case Service:
+      code_env = CODE_ENVIRONMENT_DAEMON;
+      flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS |
+              CINIT_FLAG_NO_MON_CONFIG |
+              CINIT_FLAG_NO_DAEMON_ACTIONS;
+      break;
+    default:
+      // list/show/stats/unmap are short-lived utility invocations.
+      code_env = CODE_ENVIRONMENT_UTILITY;
+      flags = CINIT_FLAG_NO_MON_CONFIG;
+      break;
+  }
+
+  global_pre_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, code_env, flags);
+  // Avoid cluttering the console when spawning a mapping that will run
+  // in the background.
+  if (g_conf()->daemonize && cfg->parent_pipe.empty()) {
+    flags |= CINIT_FLAG_NO_DAEMON_ACTIONS;
+  }
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         code_env, flags, FALSE);
+
+  // There's no fork on Windows, we should be safe calling this anytime.
+  common_init_finish(g_ceph_context);
+  global_init_chdir(g_ceph_context);
+
+  return cct;
+}
+
+// Map an RBD image as a WNBD disk and block until the mapping terminates.
+// When daemonizing without a parent pipe, the work is delegated to a
+// subprocess via send_map_request() and this call returns immediately.
+// Returns 0 on success, a negative errno otherwise.
+// Fix: the parent-pipe CreateFile failure branch previously logged
+// win32_strerror(err) with "err" still 0; it now fetches GetLastError().
+static int do_map(Config *cfg)
+{
+  int r;
+
+  librados::Rados rados;
+  librbd::RBD rbd;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  librbd::image_info_t info;
+  HANDLE parent_pipe_handle = INVALID_HANDLE_VALUE;
+  int err = 0;
+
+  if (g_conf()->daemonize && cfg->parent_pipe.empty()) {
+    // Spawn a subprocess that performs the actual mapping and reports
+    // back through a pipe.
+    return send_map_request(get_cli_args());
+  }
+
+  dout(0) << "Mapping RBD image: " << cfg->devpath << dendl;
+
+  r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "rbd-wnbd: couldn't initialize rados: " << cpp_strerror(r)
+         << dendl;
+    goto close_ret;
+  }
+
+  r = rados.connect();
+  if (r < 0) {
+    derr << "rbd-wnbd: couldn't connect to rados: " << cpp_strerror(r)
+         << dendl;
+    goto close_ret;
+  }
+
+  r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx);
+  if (r < 0) {
+    derr << "rbd-wnbd: couldn't create IO context: " << cpp_strerror(r)
+         << dendl;
+    goto close_ret;
+  }
+
+  io_ctx.set_namespace(cfg->nsname);
+
+  r = rbd.open(io_ctx, image, cfg->imgname.c_str());
+  if (r < 0) {
+    derr << "rbd-wnbd: couldn't open rbd image: " << cpp_strerror(r)
+         << dendl;
+    goto close_ret;
+  }
+
+  if (cfg->exclusive) {
+    r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
+    if (r < 0) {
+      derr << "rbd-wnbd: failed to acquire exclusive lock: " << cpp_strerror(r)
+           << dendl;
+      goto close_ret;
+    }
+  }
+
+  if (!cfg->snapname.empty()) {
+    r = image.snap_set(cfg->snapname.c_str());
+    if (r < 0) {
+      derr << "rbd-wnbd: couldn't use snapshot: " << cpp_strerror(r)
+           << dendl;
+      goto close_ret;
+    }
+  }
+
+  r = image.stat(info, sizeof(info));
+  if (r < 0)
+    goto close_ret;
+
+  // NOTE(review): info.size appears to be an unsigned 64 bit value, in
+  // which case this comparison can never be true - confirm the intended
+  // limit.
+  if (info.size > _UI64_MAX) {
+    r = -EFBIG;
+    derr << "rbd-wnbd: image is too large (" << byte_u_t(info.size)
+         << ", max is " << byte_u_t(_UI64_MAX) << ")" << dendl;
+    goto close_ret;
+  }
+
+  // We're storing mapping details in the registry even for non-persistent
+  // mappings. This allows us to easily retrieve mapping details such
+  // as the rbd pool or admin socket path.
+  // We're cleaning up the registry entry when the non-persistent mapping
+  // gets disconnected or when the ceph service restarts.
+  r = save_config_to_registry(cfg);
+  if (r < 0)
+    goto close_ret;
+
+  handler = new WnbdHandler(image, cfg->devpath,
+                            info.size / RBD_WNBD_BLKSIZE,
+                            RBD_WNBD_BLKSIZE,
+                            !cfg->snapname.empty() || cfg->readonly,
+                            g_conf().get_val<bool>("rbd_cache"),
+                            cfg->io_req_workers,
+                            cfg->io_reply_workers);
+  r = handler->start();
+  if (r) {
+    r = r == ERROR_ALREADY_EXISTS ? -EEXIST : -EINVAL;
+    goto close_ret;
+  }
+
+  // We're informing the parent processes that the initialization
+  // was successful.
+  if (!cfg->parent_pipe.empty()) {
+    parent_pipe_handle = CreateFile(
+      cfg->parent_pipe.c_str(), GENERIC_WRITE, 0, NULL,
+      OPEN_EXISTING, 0, NULL);
+    if (parent_pipe_handle == INVALID_HANDLE_VALUE) {
+      // Fetch the actual failure reason; "err" was previously left at 0
+      // here, so the logged message always showed a success status.
+      err = GetLastError();
+      derr << "Could not open parent pipe: " << win32_strerror(err) << dendl;
+    } else if (!WriteFile(parent_pipe_handle, "a", 1, NULL, NULL)) {
+      // TODO: consider exiting in this case. The parent didn't wait for us,
+      // maybe it was killed after a timeout.
+      err = GetLastError();
+      derr << "Failed to communicate with the parent: "
+           << win32_strerror(err) << dendl;
+    } else {
+      dout(5) << __func__ << ": submitted parent notification." << dendl;
+    }
+
+    if (parent_pipe_handle != INVALID_HANDLE_VALUE)
+      CloseHandle(parent_pipe_handle);
+
+    global_init_postfork_finish(g_ceph_context);
+  }
+
+  // Block until the mapping is torn down.
+  handler->wait();
+  handler->shutdown();
+
+  // The registry record shouldn't be removed for (already) running mappings.
+  if (!cfg->persistent) {
+    dout(5) << __func__ << ": cleaning up non-persistent mapping: "
+            << cfg->devpath << dendl;
+    r = remove_config_from_registry(cfg);
+    if (r) {
+      derr << __func__ << ": could not clean up non-persistent mapping: "
+           << cfg->devpath << dendl;
+    }
+  }
+
+close_ret:
+  // Serialize teardown against the shutdown path (e.g. console handler).
+  std::unique_lock l{shutdown_lock};
+
+  image.close();
+  io_ctx.close();
+  rados.shutdown();
+  if (handler) {
+    delete handler;
+    handler = nullptr;
+  }
+
+  return r;
+}
+
+// Disconnect a WNBD mapping. When "unregister" is set, the registry record
+// is dropped as well, so the mapping won't be recreated at service restart.
+// A missing device (ERROR_FILE_NOT_FOUND) is not treated as a failure.
+// Returns 0 on success, -EINVAL otherwise.
+static int do_unmap(Config *cfg, bool unregister)
+{
+  WNBD_REMOVE_OPTIONS remove_options = {0};
+  remove_options.Flags.HardRemove = cfg->hard_disconnect;
+  remove_options.Flags.HardRemoveFallback = cfg->hard_disconnect_fallback;
+  remove_options.SoftRemoveTimeoutMs = cfg->soft_disconnect_timeout * 1000;
+  remove_options.SoftRemoveRetryIntervalMs = SOFT_REMOVE_RETRY_INTERVAL * 1000;
+
+  int remove_status = WnbdRemoveEx(cfg->devpath.c_str(), &remove_options);
+  if (remove_status && remove_status != ERROR_FILE_NOT_FOUND) {
+    return -EINVAL;
+  }
+
+  if (!unregister) {
+    return 0;
+  }
+
+  int reg_status = remove_config_from_registry(cfg);
+  if (reg_status) {
+    derr << "rbd-wnbd: failed to unregister device: "
+         << cfg->devpath << ". Error: " << reg_status << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Split an image spec of the form [pool/[namespace/]]image[@snap] into the
+// corresponding Config fields. Components that are absent from the spec
+// leave the matching Config field untouched (except the mandatory image
+// name). Returns 0 on success, -EINVAL on a malformed spec.
+static int parse_imgpath(const std::string &imgpath, Config *cfg,
+                         std::ostream *err_msg)
+{
+  std::regex spec_pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$");
+  std::smatch groups;
+  if (!std::regex_match(imgpath, groups, spec_pattern)) {
+    derr << "rbd-wnbd: invalid spec '" << imgpath << "'" << dendl;
+    return -EINVAL;
+  }
+
+  // Group 1: pool, group 2: namespace, group 3: image, group 4: snapshot.
+  if (groups[1].matched)
+    cfg->poolname = groups[1];
+  if (groups[2].matched)
+    cfg->nsname = groups[2];
+  cfg->imgname = groups[3];
+  if (groups[4].matched)
+    cfg->snapname = groups[4];
+
+  return 0;
+}
+
+// List all known WNBD mappings (active and registered-but-inactive) either
+// as a plain-text table or through a JSON/XML formatter.
+// Returns 0 on success, -EINVAL on an invalid format, or the negated
+// iterator error.
+static int do_list_mapped_devices(const std::string &format, bool pretty_format)
+{
+  std::unique_ptr<ceph::Formatter> f;
+  TextTable tbl;
+
+  if (format == "json") {
+    f.reset(new JSONFormatter(pretty_format));
+  } else if (format == "xml") {
+    f.reset(new XMLFormatter(pretty_format));
+  } else if (!format.empty() && format != "plain") {
+    derr << "rbd-wnbd: invalid output format: " << format << dendl;
+    return -EINVAL;
+  }
+
+  if (f) {
+    f->open_array_section("devices");
+  } else {
+    // Plain output: define the table columns up front.
+    tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("disk_number", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("status", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  Config cfg;
+  WNBDDiskIterator wnbd_disk_iterator;
+
+  // The iterator merges driver info with registry records.
+  while (wnbd_disk_iterator.get(&cfg)) {
+    const char* status = cfg.active ?
+      WNBD_STATUS_ACTIVE : WNBD_STATUS_INACTIVE;
+
+    if (f) {
+      f->open_object_section("device");
+      // 0 means "unknown" for pid/disk_number; report it as -1.
+      f->dump_int("id", cfg.pid ? cfg.pid : -1);
+      f->dump_string("device", cfg.devpath);
+      f->dump_string("pool", cfg.poolname);
+      f->dump_string("namespace", cfg.nsname);
+      f->dump_string("image", cfg.imgname);
+      f->dump_string("snap", cfg.snapname);
+      f->dump_int("disk_number", cfg.disk_number ? cfg.disk_number : -1);
+      f->dump_string("status", status);
+      f->close_section();
+    } else {
+      if (cfg.snapname.empty()) {
+        cfg.snapname = "-";
+      }
+      tbl << (cfg.pid ? cfg.pid : -1) << cfg.poolname << cfg.nsname
+          << cfg.imgname << cfg.snapname << cfg.devpath
+          << cfg.disk_number << status << TextTable::endrow;
+    }
+  }
+  int error = wnbd_disk_iterator.get_error();
+  if (error) {
+    derr << "Could not get disk list: " << error << dendl;
+    return -error;
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    std::cout << tbl;
+  }
+
+  return 0;
+}
+
+// Show detailed information about one mapping, combining the registry
+// record with live driver info (WnbdShow). A disconnected device that still
+// has a persistent registry record is shown as "inactive".
+// Plain format is upgraded to pretty-printed JSON.
+static int do_show_mapped_device(std::string format, bool pretty_format,
+                                 std::string devpath)
+{
+  std::unique_ptr<ceph::Formatter> f;
+  TextTable tbl;
+
+  if (format.empty() || format == "plain") {
+    format = "json";
+    pretty_format = true;
+  }
+  if (format == "json") {
+    f.reset(new JSONFormatter(pretty_format));
+  } else if (format == "xml") {
+    f.reset(new XMLFormatter(pretty_format));
+  } else {
+    derr << "rbd-wnbd: invalid output format: " << format << dendl;
+    return -EINVAL;
+  }
+
+  Config cfg;
+  // NOTE(review): the registry error is returned as-is; confirm that it is
+  // already a negative errno at this point.
+  int error = load_mapping_config_from_registry(devpath, &cfg);
+  if (error) {
+    derr << "Could not load registry disk info for: "
+         << devpath << ". Error: " << error << dendl;
+    return error;
+  }
+
+  WNBD_CONNECTION_INFO conn_info = { 0 };
+  // If the device is currently disconnected but there is a persistent
+  // mapping record, we'll show that.
+  DWORD ret = WnbdShow(devpath.c_str(), &conn_info);
+  if (ret && ret != ERROR_FILE_NOT_FOUND) {
+    return -EINVAL;
+  }
+
+  auto conn_props = conn_info.Properties;
+  // Active means the device exists AND its owning process is still alive.
+  cfg.active = conn_info.DiskNumber > 0 && is_process_running(conn_props.Pid);
+  f->open_object_section("device");
+  f->dump_int("id", conn_props.Pid ? conn_props.Pid : -1);
+  f->dump_string("device", cfg.devpath);
+  f->dump_string("pool", cfg.poolname);
+  f->dump_string("namespace", cfg.nsname);
+  f->dump_string("image", cfg.imgname);
+  f->dump_string("snap", cfg.snapname);
+  f->dump_int("persistent", cfg.persistent);
+  f->dump_int("disk_number", conn_info.DiskNumber ? conn_info.DiskNumber : -1);
+  f->dump_string("status", cfg.active ? WNBD_STATUS_ACTIVE : WNBD_STATUS_INACTIVE);
+  f->dump_string("pnp_device_id", to_string(conn_info.PNPDeviceID));
+  f->dump_int("readonly", conn_props.Flags.ReadOnly);
+  f->dump_int("block_size", conn_props.BlockSize);
+  f->dump_int("block_count", conn_props.BlockCount);
+  f->dump_int("flush_enabled", conn_props.Flags.FlushSupported);
+  f->close_section();
+  f->flush(std::cout);
+
+  return 0;
+}
+
+// Query IO counters for one mapping by asking the owning process over its
+// admin socket ("wnbd stats" command). The raw admin socket reply is
+// printed to stdout.
+// Returns 0 on success, -EINVAL on an admin socket error, -ENOENT if the
+// device is not found, or the negated iterator error.
+static int do_stats(std::string search_devpath)
+{
+  Config cfg;
+  WNBDDiskIterator wnbd_disk_iterator;
+
+  while (wnbd_disk_iterator.get(&cfg)) {
+    if (cfg.devpath != search_devpath)
+      continue;
+
+    AdminSocketClient client = AdminSocketClient(cfg.admin_sock_path);
+    std::string output;
+    std::string result = client.do_request("{\"prefix\":\"wnbd stats\"}",
+                                           &output);
+    if (!result.empty()) {
+      std::cerr << "Admin socket error: " << result << std::endl;
+      return -EINVAL;
+    }
+
+    std::cout << output << std::endl;
+    return 0;
+  }
+  // Either the device wasn't found or the iterator itself failed.
+  int error = wnbd_disk_iterator.get_error();
+  if (!error) {
+    error = ENOENT;
+  }
+
+  derr << "Could not find the specified disk." << dendl;
+  return -error;
+}
+
+// Parse the command line: Ceph early/config arguments first, then the
+// rbd-wnbd specific options, then the subcommand and its positional
+// argument. On success, *command and *cfg are filled in.
+// Returns 0 on success, HELP_INFO/VERSION_INFO for -h/-v, or -EINVAL with
+// a message written to *err_msg.
+static int parse_args(std::vector<const char*>& args,
+                      std::ostream *err_msg,
+                      Command *command, Config *cfg)
+{
+  std::string conf_file_list;
+  std::string cluster;
+  CephInitParameters iparams = ceph_argparse_early_args(
+    args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
+
+  // Use a local config proxy so we can read defaults (e.g. the default
+  // pool) before global init runs.
+  ConfigProxy config{false};
+  config->name = iparams.name;
+  config->cluster = cluster;
+
+  if (!conf_file_list.empty()) {
+    config.parse_config_files(conf_file_list.c_str(), nullptr, 0);
+  } else {
+    config.parse_config_files(nullptr, nullptr, 0);
+  }
+  config.parse_env(CEPH_ENTITY_TYPE_CLIENT);
+  config.parse_argv(args);
+  cfg->poolname = config.get_val<std::string>("rbd_default_pool");
+
+  std::vector<const char*>::iterator i;
+  std::ostringstream err;
+
+  // TODO: consider using boost::program_options like Device.cc does.
+  // This should simplify argument parsing. Also, some arguments must be tied
+  // to specific commands, for example the disconnect timeout. Luckily,
+  // this is enforced by the "rbd device" wrapper.
+  for (i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+      return HELP_INFO;
+    } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) {
+      return VERSION_INFO;
+    } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &cfg->format, err, "--format",
+                                     (char *)NULL)) {
+    } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+      cfg->readonly = true;
+    } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) {
+      cfg->exclusive = true;
+    } else if (ceph_argparse_flag(args, i, "--non-persistent", (char *)NULL)) {
+      cfg->persistent = false;
+    } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) {
+      cfg->pretty_format = true;
+    } else if (ceph_argparse_flag(args, i, "--remap-failure-fatal", (char *)NULL)) {
+      cfg->remap_failure_fatal = true;
+    } else if (ceph_argparse_witharg(args, i, &cfg->parent_pipe, err,
+                                     "--pipe-name", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, (int*)&cfg->wnbd_log_level,
+                                     err, "--wnbd-log-level", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->wnbd_log_level < 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for wnbd-log-level";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, (int*)&cfg->io_req_workers,
+                                     err, "--io-req-workers", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->io_req_workers <= 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for io-req-workers";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, (int*)&cfg->io_reply_workers,
+                                     err, "--io-reply-workers", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->io_reply_workers <= 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for io-reply-workers";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i, (int*)&cfg->service_thread_count,
+                                     err, "--service-thread-count", (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->service_thread_count <= 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for service-thread-count";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_flag(args, i, "--hard-disconnect", (char *)NULL)) {
+      cfg->hard_disconnect = true;
+    } else if (ceph_argparse_flag(args, i,
+                                  "--no-hard-disconnect-fallback", (char *)NULL)) {
+      cfg->hard_disconnect_fallback = false;
+    } else if (ceph_argparse_witharg(args, i,
+                                     (int*)&cfg->soft_disconnect_timeout,
+                                     err, "--soft-disconnect-timeout",
+                                     (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      // Unlike the other numeric options, 0 is allowed here (immediate
+      // hard disconnect).
+      if (cfg->soft_disconnect_timeout < 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for soft-disconnect-timeout";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i,
+                                     (int*)&cfg->service_start_timeout,
+                                     err, "--start-timeout",
+                                     (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->service_start_timeout <= 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for start-timeout";
+        return -EINVAL;
+      }
+    } else if (ceph_argparse_witharg(args, i,
+                                     (int*)&cfg->image_map_timeout,
+                                     err, "--map-timeout",
+                                     (char *)NULL)) {
+      if (!err.str().empty()) {
+        *err_msg << "rbd-wnbd: " << err.str();
+        return -EINVAL;
+      }
+      if (cfg->image_map_timeout <= 0) {
+        *err_msg << "rbd-wnbd: Invalid argument for map-timeout";
+        return -EINVAL;
+      }
+    } else {
+      // Unrecognized token: leave it in place, it may be the subcommand
+      // or its positional argument, handled below.
+      ++i;
+    }
+  }
+
+  // First remaining token is the subcommand. This local shadows the
+  // global "cmd" on purpose; the result is returned via *command.
+  Command cmd = None;
+  if (args.begin() != args.end()) {
+    if (strcmp(*args.begin(), "map") == 0) {
+      cmd = Connect;
+    } else if (strcmp(*args.begin(), "unmap") == 0) {
+      cmd = Disconnect;
+    } else if (strcmp(*args.begin(), "list") == 0) {
+      cmd = List;
+    } else if (strcmp(*args.begin(), "show") == 0) {
+      cmd = Show;
+    } else if (strcmp(*args.begin(), "service") == 0) {
+      cmd = Service;
+    } else if (strcmp(*args.begin(), "stats") == 0) {
+      cmd = Stats;
+    } else if (strcmp(*args.begin(), "help") == 0) {
+      return HELP_INFO;
+    } else {
+      *err_msg << "rbd-wnbd: unknown command: " << *args.begin();
+      return -EINVAL;
+    }
+    args.erase(args.begin());
+  }
+
+  if (cmd == None) {
+    *err_msg << "rbd-wnbd: must specify command";
+    return -EINVAL;
+  }
+
+  // Commands that take an image/device spec consume one more argument.
+  switch (cmd) {
+    case Connect:
+    case Disconnect:
+    case Show:
+    case Stats:
+      if (args.begin() == args.end()) {
+        *err_msg << "rbd-wnbd: must specify wnbd device or image-or-snap-spec";
+        return -EINVAL;
+      }
+      if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) {
+        return -EINVAL;
+      }
+      args.erase(args.begin());
+      break;
+    default:
+      //shut up gcc;
+      break;
+  }
+
+  // Anything left over is an error.
+  if (args.begin() != args.end()) {
+    *err_msg << "rbd-wnbd: unknown args: " << *args.begin();
+    return -EINVAL;
+  }
+
+  *command = cmd;
+  return 0;
+}
+
+// Main driver: parse arguments, initialize Ceph globals and dispatch to
+// the selected subcommand. Returns 0 on success, a negative errno on error.
+// Fix: the Show case used to "return r" (which is 0 after a successful
+// parse_args) when construct_devpath_if_missing() failed, silently
+// reporting success; it now returns -EINVAL like the other commands.
+static int rbd_wnbd(int argc, const char *argv[])
+{
+  int r;
+  Config cfg;
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+
+  // Avoid using dout before calling "do_global_init"
+  if (args.empty()) {
+    std::cout << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+
+  std::ostringstream err_msg;
+  r = parse_args(args, &err_msg, &cmd, &cfg);
+  if (r == HELP_INFO) {
+    usage();
+    return 0;
+  } else if (r == VERSION_INFO) {
+    std::cout << pretty_version_to_str() << std::endl;
+    return 0;
+  } else if (r < 0) {
+    std::cout << err_msg.str() << std::endl;
+    return r;
+  }
+
+  // The returned context must stay alive for the rest of the run.
+  auto cct = do_global_init(argc, argv, &cfg);
+
+  WnbdSetLogger(WnbdHandler::LogMessage);
+  WnbdSetLogLevel(cfg.wnbd_log_level);
+
+  switch (cmd) {
+    case Connect:
+      if (construct_devpath_if_missing(&cfg)) {
+        return -EINVAL;
+      }
+      r = do_map(&cfg);
+      if (r < 0)
+        return r;
+      break;
+    case Disconnect:
+      if (construct_devpath_if_missing(&cfg)) {
+        return -EINVAL;
+      }
+      r = do_unmap(&cfg, true);
+      if (r < 0)
+        return r;
+      break;
+    case List:
+      r = do_list_mapped_devices(cfg.format, cfg.pretty_format);
+      if (r < 0)
+        return r;
+      break;
+    case Show:
+      if (construct_devpath_if_missing(&cfg)) {
+        // Previously returned "r" (0 at this point), masking the failure.
+        return -EINVAL;
+      }
+      r = do_show_mapped_device(cfg.format, cfg.pretty_format, cfg.devpath);
+      if (r < 0)
+        return r;
+      break;
+    case Service:
+      {
+        RBDService service(cfg.hard_disconnect, cfg.soft_disconnect_timeout,
+                           cfg.service_thread_count,
+                           cfg.service_start_timeout,
+                           cfg.image_map_timeout,
+                           cfg.remap_failure_fatal);
+        // This call will block until the service stops.
+        r = RBDService::initialize(&service);
+        if (r < 0)
+          return r;
+        break;
+      }
+    case Stats:
+      if (construct_devpath_if_missing(&cfg)) {
+        return -EINVAL;
+      }
+      return do_stats(cfg.devpath);
+    default:
+      usage();
+      break;
+  }
+
+  return 0;
+}
+
+// Process entry point: install the console control handler, disable the
+// Windows Error Reporting dialog, then delegate to rbd_wnbd().
+// A negative rbd_wnbd() result is propagated; success maps to 0.
+int main(int argc, const char *argv[])
+{
+  SetConsoleCtrlHandler(console_handler_routine, true);
+  // Avoid the Windows Error Reporting dialog.
+  SetErrorMode(GetErrorMode() | SEM_NOGPFAULTERRORBOX);
+
+  int ret = rbd_wnbd(argc, argv);
+  return ret < 0 ? ret : 0;
+}
diff --git a/src/tools/rbd_wnbd/rbd_wnbd.h b/src/tools/rbd_wnbd/rbd_wnbd.h
new file mode 100644
index 000000000..d17eb792b
--- /dev/null
+++ b/src/tools/rbd_wnbd/rbd_wnbd.h
@@ -0,0 +1,192 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef RBD_WNBD_H
+#define RBD_WNBD_H
+
+#include <string.h>
+#include <iostream>
+#include <vector>
+
+#include "include/compat.h"
+#include "common/win32/registry.h"
+
+#include "wnbd_handler.h"
+
+// Registry key under which mapping records are stored.
+#define SERVICE_REG_KEY "SYSTEM\\CurrentControlSet\\Services\\rbd-wnbd"
+// Named pipe through which the rbd-wnbd service receives requests.
+#define SERVICE_PIPE_NAME "\\\\.\\pipe\\rbd-wnbd"
+#define SERVICE_PIPE_TIMEOUT_MS 5000
+#define SERVICE_PIPE_BUFFSZ 4096
+
+#define DEFAULT_MAP_TIMEOUT_MS 30000
+
+// Logical block size exposed for mapped images.
+#define RBD_WNBD_BLKSIZE 512UL
+
+// Service timeouts, expressed in seconds.
+#define DEFAULT_SERVICE_START_TIMEOUT 120
+#define DEFAULT_IMAGE_MAP_TIMEOUT 20
+
+// Sentinel (positive) return codes used by parse_args().
+#define HELP_INFO 1
+#define VERSION_INFO 2
+
+#define WNBD_STATUS_ACTIVE "active"
+#define WNBD_STATUS_INACTIVE "inactive"
+
+#define DEFAULT_SERVICE_THREAD_COUNT 8
+
+// Handler of the image currently mapped by this process (one per process).
+static WnbdHandler* handler = nullptr;
+// Serializes teardown in do_map()'s cleanup path against shutdown requests.
+ceph::mutex shutdown_lock = ceph::make_mutex("RbdWnbd::ShutdownLock");
+
+// Per-mapping configuration, populated from the command line and/or the
+// Windows registry record of an existing mapping.
+struct Config {
+  bool exclusive = false;   // acquire the RBD exclusive lock when mapping
+  bool readonly = false;    // map the image read-only
+
+  // Named pipe used to notify the parent process that mapping succeeded.
+  std::string parent_pipe;
+
+  // Image spec components ([pool/[namespace/]]image[@snap]).
+  std::string poolname;
+  std::string nsname;
+  std::string imgname;
+  std::string snapname;
+  // Mapping identifier (derived from the spec when not user supplied).
+  std::string devpath;
+
+  // Output format for list/show ("plain", "json" or "xml").
+  std::string format;
+  bool pretty_format = false;
+
+  // Disconnect behavior (timeouts in seconds).
+  bool hard_disconnect = false;
+  int soft_disconnect_timeout = DEFAULT_SOFT_REMOVE_TIMEOUT;
+  bool hard_disconnect_fallback = true;
+
+  // Service behavior (timeouts in seconds).
+  int service_start_timeout = DEFAULT_SERVICE_START_TIMEOUT;
+  int image_map_timeout = DEFAULT_IMAGE_MAP_TIMEOUT;
+  bool remap_failure_fatal = false;
+
+  // TODO: consider moving those fields to a separate structure. Those
+  // provide connection information without actually being configurable.
+  // The disk number is provided by Windows.
+  int disk_number = -1;
+  int pid = 0;
+  std::string serial_number;
+  bool active = false;
+  bool wnbd_mapped = false;
+  std::string command_line;
+  std::string admin_sock_path;
+
+  WnbdLogLevel wnbd_log_level = WnbdLogLevelInfo;
+  int io_req_workers = DEFAULT_IO_WORKER_COUNT;
+  int io_reply_workers = DEFAULT_IO_WORKER_COUNT;
+  int service_thread_count = DEFAULT_SERVICE_THREAD_COUNT;
+
+  // register the mapping, recreating it when the Ceph service starts.
+  bool persistent = true;
+};
+
+// Top-level rbd-wnbd subcommands.
+enum Command {
+  None,
+  Connect,     // "map"
+  Disconnect,  // "unmap"
+  List,
+  Show,
+  Service,
+  Stats
+};
+
+// Wire format of a request sent over the service pipe. "arguments" is a
+// variable-length trailing buffer (the [1] looks like the pre-C99
+// flexible-array idiom; the actual size is presumably set by the sender).
+typedef struct {
+  Command command;
+  BYTE arguments[1];
+} ServiceRequest;
+
+// Wire format of the reply sent back over the service pipe.
+typedef struct {
+  int status;
+} ServiceReply;
+
+bool is_process_running(DWORD pid);
+void unmap_at_exit();
+
+// Disconnect all known mappings, using up to "worker_count" threads.
+int disconnect_all_mappings(
+  bool unregister,
+  bool hard_disconnect,
+  int soft_disconnect_timeout,
+  int worker_count);
+// Recreate the mappings registered in the Windows registry.
+int restart_registered_mappings(
+  int worker_count, int total_timeout, int image_map_timeout);
+// NOTE(review): "suprocess" looks like a typo for "subprocess"; renaming
+// would touch every caller, so it is only flagged here.
+int map_device_using_suprocess(std::string command_line);
+
+int construct_devpath_if_missing(Config* cfg);
+// Registry persistence of mapping records.
+int save_config_to_registry(Config* cfg);
+int remove_config_from_registry(Config* cfg);
+int load_mapping_config_from_registry(std::string devpath, Config* cfg);
+
+BOOL WINAPI console_handler_routine(DWORD dwCtrlType);
+
+static int parse_args(std::vector<const char*>& args,
+                      std::ostream *err_msg,
+                      Command *command, Config *cfg);
+static int do_unmap(Config *cfg, bool unregister);
+
+
+// Common base for the disk iterators below: repeated get() calls yield one
+// mapping at a time; when get() returns false, get_error() tells whether
+// iteration ended cleanly (0) or failed.
+class BaseIterator {
+  public:
+    virtual ~BaseIterator() {};
+    // Fill *cfg with the next mapping; returns false when exhausted or on
+    // error.
+    virtual bool get(Config *cfg) = 0;
+
+    int get_error() {
+      return error;
+    }
+  protected:
+    int error = 0;
+    int index = -1;  // position within the underlying collection
+};
+
+// Iterate over mapped devices, retrieving info from the driver.
+class WNBDActiveDiskIterator : public BaseIterator {
+ public:
+ WNBDActiveDiskIterator();
+ ~WNBDActiveDiskIterator();
+
+ bool get(Config *cfg);
+
+ private:
+ PWNBD_CONNECTION_LIST conn_list = NULL;
+
+ static DWORD fetch_list(PWNBD_CONNECTION_LIST* conn_list);
+};
+
+
+// Iterate over the Windows registry key, retrieving registered mappings.
+class RegistryDiskIterator : public BaseIterator {
+ public:
+ RegistryDiskIterator();
+ ~RegistryDiskIterator() {
+ delete reg_key;
+ }
+
+ bool get(Config *cfg);
+ private:
+ DWORD subkey_count = 0;
+ char subkey_name[MAX_PATH];
+
+ RegistryKey* reg_key = NULL;
+};
+
+// Iterate over all RBD mappings, getting info from the registry and driver.
+class WNBDDiskIterator : public BaseIterator {
+ public:
+ bool get(Config *cfg);
+
+ private:
+ // We'll keep track of the active devices.
+ std::set<std::string> active_devices;
+
+ WNBDActiveDiskIterator active_iterator;
+ RegistryDiskIterator registry_iterator;
+};
+
+#endif // RBD_WNBD_H
diff --git a/src/tools/rbd_wnbd/wnbd_handler.cc b/src/tools/rbd_wnbd/wnbd_handler.cc
new file mode 100644
index 000000000..ecfa47240
--- /dev/null
+++ b/src/tools/rbd_wnbd/wnbd_handler.cc
@@ -0,0 +1,430 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+
+#include "wnbd_handler.h"
+
+#define _NTSCSI_USER_MODE_
+#include <rpc.h>
+#include <ddk/scsi.h>
+
+#include <boost/thread/tss.hpp>
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/SubProcess.h"
+#include "common/Formatter.h"
+
+#include "global/global_context.h"
+
+// Tear down the handler. Requests disk removal via shutdown(), drains the
+// reply thread pool and releases the WNBD disk handle. Only acts if
+// start() previously succeeded (started && wnbd_disk).
+WnbdHandler::~WnbdHandler()
+{
+  if (started && wnbd_disk) {
+    dout(10) << __func__ << ": terminating" << dendl;
+
+    shutdown();
+    // Wait for all in-flight IO replies before freeing resources.
+    reply_tpool->join();
+
+    WnbdClose(wnbd_disk);
+
+    started = false;
+
+    delete reply_tpool;
+    delete admin_hook;
+  }
+}
+
+// Block until the WNBD dispatcher stops, which normally happens when the
+// driver sends the "Disconnect" event. Returns 0 on success or the WNBD
+// error code reported by WnbdWaitDispatcher().
+int WnbdHandler::wait()
+{
+  int err = 0;
+  if (started && wnbd_disk) {
+    dout(10) << __func__ << ": waiting" << dendl;
+
+    err = WnbdWaitDispatcher(wnbd_disk);
+    if (err) {
+      derr << __func__ << " failed waiting for dispatcher to stop: "
+           << err << dendl;
+    }
+  }
+
+  return err;
+}
+
+// Admin socket entry point: dispatch the "wnbd stats" command to the
+// owning handler's dump_stats(). Any other command yields -ENOSYS.
+int WnbdAdminHook::call (std::string_view command, const cmdmap_t& cmdmap,
+     Formatter *f,
+     std::ostream& errss,
+     bufferlist& out) {
+  if (command == "wnbd stats") {
+    return m_handler->dump_stats(f);
+  }
+  return -ENOSYS;
+  }
+
+// Fetch WNBD userspace counters for this disk and emit them as a single
+// "stats" object through the formatter. Returns 0 on success, -EINVAL if
+// the formatter is null or WnbdGetUserspaceStats() fails.
+int WnbdHandler::dump_stats(Formatter *f)
+{
+  if (!f) {
+    return -EINVAL;
+  }
+
+  WNBD_USR_STATS stats = { 0 };
+  DWORD err = WnbdGetUserspaceStats(wnbd_disk, &stats);
+  if (err) {
+    derr << "Failed to retrieve WNBD userspace stats. Error: " << err << dendl;
+    return -EINVAL;
+  }
+
+  f->open_object_section("stats");
+  f->dump_int("TotalReceivedRequests", stats.TotalReceivedRequests);
+  f->dump_int("TotalSubmittedRequests", stats.TotalSubmittedRequests);
+  f->dump_int("TotalReceivedReplies", stats.TotalReceivedReplies);
+  f->dump_int("UnsubmittedRequests", stats.UnsubmittedRequests);
+  f->dump_int("PendingSubmittedRequests", stats.PendingSubmittedRequests);
+  f->dump_int("PendingReplies", stats.PendingReplies);
+  f->dump_int("ReadErrors", stats.ReadErrors);
+  f->dump_int("WriteErrors", stats.WriteErrors);
+  f->dump_int("FlushErrors", stats.FlushErrors);
+  f->dump_int("UnmapErrors", stats.UnmapErrors);
+  f->dump_int("InvalidRequests", stats.InvalidRequests);
+  f->dump_int("TotalRWRequests", stats.TotalRWRequests);
+  f->dump_int("TotalReadBlocks", stats.TotalReadBlocks);
+  f->dump_int("TotalWrittenBlocks", stats.TotalWrittenBlocks);
+
+  f->close_section();
+  return 0;
+}
+
+// Initiate a graceful disconnect. Idempotent: guarded by shutdown_lock and
+// the `terminated` flag, so concurrent or repeated calls are safe.
+void WnbdHandler::shutdown()
+{
+  std::unique_lock l{shutdown_lock};
+  if (!terminated && wnbd_disk) {
+    // We're requesting the disk to be removed but continue serving IO
+    // requests until the driver sends us the "Disconnect" event.
+    // TODO: expose PWNBD_REMOVE_OPTIONS, we're using the defaults ATM.
+    WnbdRemove(wnbd_disk, NULL);
+    wait();
+    terminated = true;
+  }
+}
+
+// librbd AIO completion callback. Translates the librbd return value into
+// the IOContext status (err_code / SCSI sense data), pads short reads with
+// zeros, then posts the reply onto the dedicated reply thread pool.
+// Ownership of `ctx` passes to send_io_response() via the posted lambda.
+void WnbdHandler::aio_callback(librbd::completion_t cb, void *arg)
+{
+  librbd::RBD::AioCompletion *aio_completion =
+    reinterpret_cast<librbd::RBD::AioCompletion*>(cb);
+
+  WnbdHandler::IOContext* ctx = static_cast<WnbdHandler::IOContext*>(arg);
+  int ret = aio_completion->get_return_value();
+
+  dout(20) << __func__ << ": " << *ctx << dendl;
+
+  if (ret == -EINVAL) {
+    // if shrinking an image, a pagecache writeback might reference
+    // extents outside of the range of the new image extents
+    dout(0) << __func__ << ": masking IO out-of-bounds error" << *ctx << dendl;
+    ctx->data.clear();
+    ret = 0;
+  }
+
+  if (ret < 0) {
+    ctx->err_code = -ret;
+    // TODO: check the actual error.
+    ctx->set_sense(SCSI_SENSE_MEDIUM_ERROR,
+                   SCSI_ADSENSE_UNRECOVERED_ERROR);
+  } else if ((ctx->req_type == WnbdReqTypeRead) &&
+              ret < static_cast<int>(ctx->req_size)) {
+    // Short read: zero-fill up to the requested size so the initiator
+    // always receives exactly req_size bytes.
+    int pad_byte_count = static_cast<int> (ctx->req_size) - ret;
+    ctx->data.append_zero(pad_byte_count);
+    dout(20) << __func__ << ": " << *ctx << ": Pad byte count: "
+             << pad_byte_count << dendl;
+    ctx->err_code = 0;
+  } else {
+    ctx->err_code = 0;
+  }
+
+  boost::asio::post(
+    *ctx->handler->reply_tpool,
+    [&, ctx]()
+    {
+      ctx->handler->send_io_response(ctx);
+    });
+
+  aio_completion->release();
+}
+
+// Send the reply for a completed IO back to the WNBD driver. Takes
+// ownership of `ctx` (freed via unique_ptr on all return paths). Runs on
+// the reply thread pool, which lets us cache one OVERLAPPED structure per
+// thread instead of allocating one per reply.
+void WnbdHandler::send_io_response(WnbdHandler::IOContext *ctx) {
+  std::unique_ptr<WnbdHandler::IOContext> pctx{ctx};
+  ceph_assert(WNBD_DEFAULT_MAX_TRANSFER_LENGTH >= pctx->data.length());
+
+  WNBD_IO_RESPONSE wnbd_rsp = {0};
+  wnbd_rsp.RequestHandle = pctx->req_handle;
+  wnbd_rsp.RequestType = pctx->req_type;
+  wnbd_rsp.Status = pctx->wnbd_status;
+  int err = 0;
+
+  // Use TLS to store an overlapped structure so that we avoid
+  // recreating one each time we send a reply.
+  static boost::thread_specific_ptr<OVERLAPPED> overlapped_tls(
+    // Cleanup routine
+    [](LPOVERLAPPED p_overlapped)
+    {
+      if (p_overlapped->hEvent) {
+        CloseHandle(p_overlapped->hEvent);
+      }
+      delete p_overlapped;
+    });
+
+  LPOVERLAPPED overlapped = overlapped_tls.get();
+  if (!overlapped)
+  {
+    // First reply on this thread: lazily create the OVERLAPPED + event.
+    overlapped = new OVERLAPPED{0};
+    HANDLE overlapped_evt = CreateEventA(0, TRUE, TRUE, NULL);
+    if (!overlapped_evt) {
+      err = GetLastError();
+      derr << "Could not create event. Error: " << err << dendl;
+      return;
+    }
+
+    overlapped->hEvent = overlapped_evt;
+    overlapped_tls.reset(overlapped);
+  }
+
+  if (!ResetEvent(overlapped->hEvent)) {
+    err = GetLastError();
+    derr << "Could not reset event. Error: " << err << dendl;
+    return;
+  }
+
+  err = WnbdSendResponseEx(
+    pctx->handler->wnbd_disk,
+    &wnbd_rsp,
+    pctx->data.c_str(),
+    pctx->data.length(),
+    overlapped);
+  if (err == ERROR_IO_PENDING) {
+    DWORD returned_bytes = 0;
+    err = 0;
+    // We've got ERROR_IO_PENDING, which means that the operation is in
+    // progress. We'll use GetOverlappedResult to wait for it to complete
+    // and then retrieve the result.
+    if (!GetOverlappedResult(pctx->handler->wnbd_disk, overlapped,
+                             &returned_bytes, TRUE)) {
+      err = GetLastError();
+      derr << "Could not send response. Request id: " << wnbd_rsp.RequestHandle
+           << ". Error: " << err << dendl;
+    }
+  }
+}
+
+// Record SCSI sense data (sense key + additional sense code) on this
+// request's WNBD status; this overload also attaches the information field.
+void WnbdHandler::IOContext::set_sense(uint8_t sense_key, uint8_t asc, uint64_t info)
+{
+  WnbdSetSenseEx(&wnbd_status, sense_key, asc, info);
+}
+
+// Same as above, without an information field.
+void WnbdHandler::IOContext::set_sense(uint8_t sense_key, uint8_t asc)
+{
+  WnbdSetSense(&wnbd_status, sense_key, asc);
+}
+
+// WNBD READ entry point. Builds an IOContext for the block range and
+// submits an async librbd read; completion is handled by aio_callback(),
+// which also frees the context. `Buffer` is unused here: read data is
+// returned through the IOContext's bufferlist in the reply path.
+void WnbdHandler::Read(
+  PWNBD_DISK Disk,
+  UINT64 RequestHandle,
+  PVOID Buffer,
+  UINT64 BlockAddress,
+  UINT32 BlockCount,
+  BOOLEAN ForceUnitAccess)
+{
+  WnbdHandler* handler = nullptr;
+  ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler));
+
+  WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext();
+  ctx->handler = handler;
+  ctx->req_handle = RequestHandle;
+  ctx->req_type = WnbdReqTypeRead;
+  ctx->req_size = BlockCount * handler->block_size;
+  ctx->req_from = BlockAddress * handler->block_size;
+  ceph_assert(ctx->req_size <= WNBD_DEFAULT_MAX_TRANSFER_LENGTH);
+
+  int op_flags = 0;
+  if (ForceUnitAccess) {
+    op_flags |= LIBRADOS_OP_FLAG_FADVISE_FUA;
+  }
+
+  dout(20) << *ctx << ": start" << dendl;
+
+  librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback);
+  handler->image.aio_read2(ctx->req_from, ctx->req_size, ctx->data, c, op_flags);
+
+  dout(20) << *ctx << ": submitted" << dendl;
+}
+
+// WNBD WRITE entry point. Wraps the driver-provided buffer into a
+// bufferptr (no copy) and submits an async librbd write; completion is
+// handled by aio_callback(), which also frees the context.
+void WnbdHandler::Write(
+  PWNBD_DISK Disk,
+  UINT64 RequestHandle,
+  PVOID Buffer,
+  UINT64 BlockAddress,
+  UINT32 BlockCount,
+  BOOLEAN ForceUnitAccess)
+{
+  WnbdHandler* handler = nullptr;
+  ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler));
+
+  WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext();
+  ctx->handler = handler;
+  ctx->req_handle = RequestHandle;
+  ctx->req_type = WnbdReqTypeWrite;
+  ctx->req_size = BlockCount * handler->block_size;
+  ctx->req_from = BlockAddress * handler->block_size;
+
+  // NOTE(review): the bufferptr references `Buffer` without copying;
+  // assumes WNBD keeps the buffer valid until the response is sent.
+  bufferptr ptr((char*)Buffer, ctx->req_size);
+  ctx->data.push_back(ptr);
+
+  int op_flags = 0;
+  if (ForceUnitAccess) {
+    op_flags |= LIBRADOS_OP_FLAG_FADVISE_FUA;
+  }
+
+  dout(20) << *ctx << ": start" << dendl;
+
+  librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback);
+  handler->image.aio_write2(ctx->req_from, ctx->req_size, ctx->data, c, op_flags);
+
+  dout(20) << *ctx << ": submitted" << dendl;
+}
+
+// WNBD FLUSH entry point. Issues an image-wide aio_flush(); the block
+// range is recorded on the context for logging only (aio_flush flushes
+// everything regardless).
+void WnbdHandler::Flush(
+  PWNBD_DISK Disk,
+  UINT64 RequestHandle,
+  UINT64 BlockAddress,
+  UINT32 BlockCount)
+{
+  WnbdHandler* handler = nullptr;
+  ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler));
+
+  WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext();
+  ctx->handler = handler;
+  ctx->req_handle = RequestHandle;
+  ctx->req_type = WnbdReqTypeFlush;
+  ctx->req_size = BlockCount * handler->block_size;
+  ctx->req_from = BlockAddress * handler->block_size;
+
+  dout(20) << *ctx << ": start" << dendl;
+
+  librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback);
+  handler->image.aio_flush(c);
+
+  dout(20) << *ctx << ": submitted" << dendl;
+}
+
+// WNBD UNMAP (trim/discard) entry point. Only a single descriptor is
+// supported (MaxUnmapDescCount is set to 1 in start()), hence the
+// Count == 1 assertion. Maps the descriptor onto an aio_discard().
+void WnbdHandler::Unmap(
+  PWNBD_DISK Disk,
+  UINT64 RequestHandle,
+  PWNBD_UNMAP_DESCRIPTOR Descriptors,
+  UINT32 Count)
+{
+  WnbdHandler* handler = nullptr;
+  ceph_assert(!WnbdGetUserContext(Disk, (PVOID*)&handler));
+  ceph_assert(1 == Count);
+
+  WnbdHandler::IOContext* ctx = new WnbdHandler::IOContext();
+  ctx->handler = handler;
+  ctx->req_handle = RequestHandle;
+  ctx->req_type = WnbdReqTypeUnmap;
+  ctx->req_size = Descriptors[0].BlockCount * handler->block_size;
+  ctx->req_from = Descriptors[0].BlockAddress * handler->block_size;
+
+  dout(20) << *ctx << ": start" << dendl;
+
+  librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(ctx, aio_callback);
+  handler->image.aio_discard(ctx->req_from, ctx->req_size, c);
+
+  dout(20) << *ctx << ": submitted" << dendl;
+}
+
+// Forward libwnbd log messages into the Ceph log. FileName/Line are
+// accepted for the libwnbd callback signature but not printed here.
+void WnbdHandler::LogMessage(
+    WnbdLogLevel LogLevel,
+    const char* Message,
+    const char* FileName,
+    UINT32 Line,
+    const char* FunctionName)
+{
+  // We're already passing the log level to WNBD, so we'll use the highest
+  // log level here.
+  dout(0) << "libwnbd.dll!" << FunctionName << " "
+          << WnbdLogLevelToStr(LogLevel) << " " << Message << dendl;
+}
+
+
+// Create the WNBD disk from this handler's properties and start the IO
+// dispatcher with io_req_workers threads. Returns 0 on success or a WNBD
+// error code. On success `started` is set so the destructor cleans up.
+int WnbdHandler::start()
+{
+  int err = 0;
+  WNBD_PROPERTIES wnbd_props = {0};
+
+  instance_name.copy(wnbd_props.InstanceName, sizeof(wnbd_props.InstanceName));
+  ceph_assert(strlen(RBD_WNBD_OWNER_NAME) < WNBD_MAX_OWNER_LENGTH);
+  strncpy(wnbd_props.Owner, RBD_WNBD_OWNER_NAME, WNBD_MAX_OWNER_LENGTH);
+
+  wnbd_props.BlockCount = block_count;
+  wnbd_props.BlockSize = block_size;
+  wnbd_props.MaxUnmapDescCount = 1;
+
+  wnbd_props.Flags.ReadOnly = readonly;
+  wnbd_props.Flags.UnmapSupported = 1;
+  // FUA/flush only make sense when the rbd cache is enabled; without a
+  // cache, writes go straight through.
+  if (rbd_cache_enabled) {
+    wnbd_props.Flags.FUASupported = 1;
+    wnbd_props.Flags.FlushSupported = 1;
+  }
+
+  err = WnbdCreate(&wnbd_props, &RbdWnbdInterface, this, &wnbd_disk);
+  if (err)
+    goto exit;
+
+  started = true;
+
+  err = WnbdStartDispatcher(wnbd_disk, io_req_workers);
+  if (err) {
+    derr << "Could not start WNBD dispatcher. Error: " << err << dendl;
+  }
+
+exit:
+  return err;
+}
+
+// Debug formatter for an IO context:
+//   [<handle-hex> <TYPE> <offset>~<length> <err>]
+std::ostream &operator<<(std::ostream &os, const WnbdHandler::IOContext &ctx) {
+
+  os << "[" << std::hex << ctx.req_handle;
+
+  switch (ctx.req_type)
+  {
+  case WnbdReqTypeRead:
+    os << " READ ";
+    break;
+  case WnbdReqTypeWrite:
+    os << " WRITE ";
+    break;
+  case WnbdReqTypeFlush:
+    os << " FLUSH ";
+    break;
+  case WnbdReqTypeUnmap:
+    os << " TRIM ";
+    break;
+  default:
+    os << " UNKNOWN(" << ctx.req_type << ") ";
+    break;
+  }
+
+  // NOTE(review): err_code is assigned in host byte order (aio_callback
+  // sets it to -ret), so the ntohl() here looks like a leftover from the
+  // rbd-nbd formatter and may byte-swap the printed value — confirm.
+  os << ctx.req_from << "~" << ctx.req_size << " "
+     << std::dec << ntohl(ctx.err_code) << "]";
+
+  return os;
+}
diff --git a/src/tools/rbd_wnbd/wnbd_handler.h b/src/tools/rbd_wnbd/wnbd_handler.h
new file mode 100644
index 000000000..9a8217745
--- /dev/null
+++ b/src/tools/rbd_wnbd/wnbd_handler.h
@@ -0,0 +1,186 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WNBD_HANDLER_H
+#define WNBD_HANDLER_H
+
+#include <wnbd.h>
+
+#include "common/admin_socket.h"
+#include "common/ceph_context.h"
+#include "common/Thread.h"
+
+#include "include/rbd/librbd.hpp"
+#include "include/xlist.h"
+
+#include "global/global_context.h"
+
+// TODO: make this configurable.
+#define RBD_WNBD_MAX_TRANSFER 2 * 1024 * 1024
+#define SOFT_REMOVE_RETRY_INTERVAL 2
+#define DEFAULT_SOFT_REMOVE_TIMEOUT 15
+#define DEFAULT_IO_WORKER_COUNT 4
+
+// Not defined by mingw.
+#ifndef SCSI_ADSENSE_UNRECOVERED_ERROR
+#define SCSI_ADSENSE_UNRECOVERED_ERROR 0x11
+#endif
+
+// The following will be assigned to the "Owner" field of the WNBD
+// parameters, which can be used to determine the application managing
+// a disk. We'll ignore other disks.
+#define RBD_WNBD_OWNER_NAME "ceph-rbd-wnbd"
+
+class WnbdHandler;
+
+// Admin socket hook exposing the "wnbd stats" command for one handler.
+// Registers on construction, unregisters on destruction.
+class WnbdAdminHook : public AdminSocketHook {
+  WnbdHandler *m_handler;  // owning handler; not owned by the hook
+
+public:
+  explicit WnbdAdminHook(WnbdHandler *handler) :
+    m_handler(handler) {
+    g_ceph_context->get_admin_socket()->register_command(
+      "wnbd stats", this, "get WNBD stats");
+  }
+  ~WnbdAdminHook() override {
+    g_ceph_context->get_admin_socket()->unregister_commands(this);
+  }
+
+  // Dispatches "wnbd stats" to WnbdHandler::dump_stats().
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+     Formatter *f, std::ostream& errss, bufferlist& out) override;
+};
+
+
+// Bridges WNBD disk IO callbacks (Read/Write/Flush/Unmap) to asynchronous
+// librbd operations on a single image. Replies are sent from a dedicated
+// thread pool so per-thread OVERLAPPED structures can be cached.
+class WnbdHandler
+{
+private:
+  librbd::Image &image;          // the mapped rbd image (not owned)
+  std::string instance_name;     // WNBD instance name for this mapping
+  uint64_t block_count;          // disk size in blocks
+  uint32_t block_size;           // block size in bytes
+  bool readonly;                 // export the disk read-only
+  bool rbd_cache_enabled;        // enables FUA/flush support flags
+  uint32_t io_req_workers;       // WNBD dispatcher thread count
+  uint32_t io_reply_workers;     // reply thread pool size
+  WnbdAdminHook* admin_hook;     // "wnbd stats" admin socket hook (owned)
+  boost::asio::thread_pool* reply_tpool;  // owned; joined in dtor
+
+public:
+  WnbdHandler(librbd::Image& _image, std::string _instance_name,
+              uint64_t _block_count, uint32_t _block_size,
+              bool _readonly, bool _rbd_cache_enabled,
+              uint32_t _io_req_workers,
+              uint32_t _io_reply_workers)
+    : image(_image)
+    , instance_name(_instance_name)
+    , block_count(_block_count)
+    , block_size(_block_size)
+    , readonly(_readonly)
+    , rbd_cache_enabled(_rbd_cache_enabled)
+    , io_req_workers(_io_req_workers)
+    , io_reply_workers(_io_reply_workers)
+  {
+    admin_hook = new WnbdAdminHook(this);
+    // Instead of relying on librbd's own thread pool, we're going to use a
+    // separate one. This allows us to make assumptions on the threads that
+    // are going to send the IO replies and thus be able to cache Windows
+    // OVERLAPPED structures.
+    reply_tpool = new boost::asio::thread_pool(_io_reply_workers);
+  }
+
+  // Create the WNBD disk and start its dispatcher.
+  int start();
+  // Wait for the handler to stop, which normally happens when the driver
+  // passes the "Disconnect" request.
+  int wait();
+  // Request a graceful disconnect (idempotent, thread-safe).
+  void shutdown();
+
+  // Dump WNBD userspace counters through the formatter.
+  int dump_stats(Formatter *f);
+
+  ~WnbdHandler();
+
+  // Forwards libwnbd log messages into the Ceph log.
+  static VOID LogMessage(
+    WnbdLogLevel LogLevel,
+    const char* Message,
+    const char* FileName,
+    UINT32 Line,
+    const char* FunctionName);
+
+private:
+  ceph::mutex shutdown_lock = ceph::make_mutex("WnbdHandler::DisconnectLocker");
+  bool started = false;     // set once WnbdCreate() succeeds
+  bool terminated = false;  // set once shutdown() completed
+  WNBD_DISK* wnbd_disk = nullptr;
+
+  // Per-request state carried from the WNBD callback through the librbd
+  // completion to the reply path. Allocated per request; freed by
+  // send_io_response().
+  struct IOContext
+  {
+    xlist<IOContext*>::item item;
+    WnbdHandler *handler = nullptr;
+    WNBD_STATUS wnbd_status = {0};
+    WnbdRequestType req_type = WnbdReqTypeUnknown;
+    uint64_t req_handle = 0;
+    uint32_t err_code = 0;
+    size_t req_size;    // request length in bytes
+    uint64_t req_from;  // request offset in bytes
+    bufferlist data;    // read payload / write source
+
+    IOContext()
+      : item(this)
+    {}
+
+    // Attach SCSI sense data to the reply status.
+    void set_sense(uint8_t sense_key, uint8_t asc, uint64_t info);
+    void set_sense(uint8_t sense_key, uint8_t asc);
+  };
+
+  friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx);
+
+  // Sends the reply for a completed request; takes ownership of ctx.
+  void send_io_response(IOContext *ctx);
+
+  // librbd AIO completion callback; posts the reply to reply_tpool.
+  static void aio_callback(librbd::completion_t cb, void *arg);
+
+  // WNBD IO entry points
+  static void Read(
+    PWNBD_DISK Disk,
+    UINT64 RequestHandle,
+    PVOID Buffer,
+    UINT64 BlockAddress,
+    UINT32 BlockCount,
+    BOOLEAN ForceUnitAccess);
+  static void Write(
+    PWNBD_DISK Disk,
+    UINT64 RequestHandle,
+    PVOID Buffer,
+    UINT64 BlockAddress,
+    UINT32 BlockCount,
+    BOOLEAN ForceUnitAccess);
+  static void Flush(
+    PWNBD_DISK Disk,
+    UINT64 RequestHandle,
+    UINT64 BlockAddress,
+    UINT32 BlockCount);
+  static void Unmap(
+    PWNBD_DISK Disk,
+    UINT64 RequestHandle,
+    PWNBD_UNMAP_DESCRIPTOR Descriptors,
+    UINT32 Count);
+
+  // Callback table handed to WnbdCreate().
+  static constexpr WNBD_INTERFACE RbdWnbdInterface =
+  {
+    Read,
+    Write,
+    Flush,
+    Unmap,
+  };
+};
+
+std::ostream &operator<<(std::ostream &os, const WnbdHandler::IOContext &ctx);
+
+#endif // WNBD_HANDLER_H
diff --git a/src/tools/rebuild_mondb.cc b/src/tools/rebuild_mondb.cc
new file mode 100644
index 000000000..8e3d5b458
--- /dev/null
+++ b/src/tools/rebuild_mondb.cc
@@ -0,0 +1,351 @@
+#include "auth/cephx/CephxKeyServer.h"
+#include "common/errno.h"
+#include "mon/AuthMonitor.h"
+#include "mon/MonitorDBStore.h"
+#include "os/ObjectStore.h"
+#include "osd/OSD.h"
+
+static int update_auth(const string& keyring_path,
+ const OSDSuperblock& sb,
+ MonitorDBStore& ms);
+static int update_monitor(const OSDSuperblock& sb, MonitorDBStore& ms);
+static int update_osdmap(ObjectStore& fs,
+ OSDSuperblock& sb,
+ MonitorDBStore& ms);
+
+// Rebuild a monitor store at `store_path` from an OSD's local state:
+// auth entries (from `keyring`), the cluster uuid and the osdmaps held by
+// the OSD. Returns 0 on success or a negative errno; the store is always
+// closed before returning.
+int update_mon_db(ObjectStore& fs, OSDSuperblock& sb,
+                  const string& keyring,
+                  const string& store_path)
+{
+  MonitorDBStore ms(store_path);
+  int r = ms.create_and_open(cerr);
+  if (r < 0) {
+    cerr << "unable to open mon store: " << store_path << std::endl;
+    return r;
+  }
+  if ((r = update_auth(keyring, sb, ms)) < 0) {
+    goto out;
+  }
+  if ((r = update_osdmap(fs, sb, ms)) < 0) {
+    goto out;
+  }
+  if ((r = update_monitor(sb, ms)) < 0) {
+    goto out;
+  }
+ out:
+  ms.close();
+  return r;
+}
+
+// Append one cephx auth incremental to the mon store's "auth" paxos
+// stream: writes auth/<epoch>, bumps auth/last_committed, and seeds
+// auth/first_committed if this is the first entry.
+static void add_auth(KeyServerData::Incremental& auth_inc,
+                     MonitorDBStore& ms)
+{
+  AuthMonitor::Incremental inc;
+  inc.inc_type = AuthMonitor::AUTH_DATA;
+  encode(auth_inc, inc.auth_data);
+  inc.auth_type = CEPH_AUTH_CEPHX;
+
+  bufferlist bl;
+  // Version byte preceding the AuthMonitor::Incremental payload.
+  __u8 v = 1;
+  encode(v, bl);
+  inc.encode(bl, CEPH_FEATURES_ALL);
+
+  const string prefix("auth");
+  auto last_committed = ms.get(prefix, "last_committed") + 1;
+  auto t = make_shared<MonitorDBStore::Transaction>();
+  t->put(prefix, last_committed, bl);
+  t->put(prefix, "last_committed", last_committed);
+  auto first_committed = ms.get(prefix, "first_committed");
+  if (!first_committed) {
+    t->put(prefix, "first_committed", last_committed);
+  }
+  ms.apply_transaction(t);
+}
+
+// Build an AUTH_INC_ADD incremental for this OSD's entity ("osd.<id>")
+// from the on-disk keyring. A missing or empty keyring is ignored
+// (returns 0 with *auth_inc left partially filled); decode or lookup
+// failures return a negative errno.
+static int get_auth_inc(const string& keyring_path,
+                        const OSDSuperblock& sb,
+                        KeyServerData::Incremental* auth_inc)
+{
+  auth_inc->op = KeyServerData::AUTH_INC_ADD;
+
+  // get the name
+  EntityName entity;
+  // assuming the entity name of OSD is "osd.<osd_id>"
+  entity.set(CEPH_ENTITY_TYPE_OSD, std::to_string(sb.whoami));
+  auth_inc->name = entity;
+
+  // read keyring from disk
+  KeyRing keyring;
+  {
+    bufferlist bl;
+    string error;
+    int r = bl.read_file(keyring_path.c_str(), &error);
+    if (r < 0) {
+      if (r == -ENOENT) {
+        cout << "ignoring keyring (" << keyring_path << ")"
+             << ": " << error << std::endl;
+        return 0;
+      } else {
+        cerr << "unable to read keyring (" << keyring_path << ")"
+             << ": " << error << std::endl;
+        return r;
+      }
+    } else if (bl.length() == 0) {
+      cout << "ignoring empty keyring: " << keyring_path << std::endl;
+      return 0;
+    }
+    auto bp = bl.cbegin();
+    try {
+      decode(keyring, bp);
+    } catch (const buffer::error& e) {
+      cerr << "error decoding keyring: " << keyring_path << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  // get the key
+  EntityAuth new_inc;
+  if (!keyring.get_auth(auth_inc->name, new_inc)) {
+    cerr << "key for " << auth_inc->name << " not found in keyring: "
+         << keyring_path << std::endl;
+    return -EINVAL;
+  }
+  auth_inc->auth.key = new_inc.key;
+
+  // get the caps
+  map<string,bufferlist> caps;
+  if (new_inc.caps.empty()) {
+    // fallback to default caps for an OSD
+    //   osd 'allow *' mon 'allow rwx'
+    // as suggested by document.
+    encode(string("allow *"), caps["osd"]);
+    encode(string("allow rwx"), caps["mon"]);
+  } else {
+    caps = new_inc.caps;
+  }
+  auth_inc->auth.caps = caps;
+  return 0;
+}
+
+// rebuild
+// - auth/${epoch}
+// - auth/first_committed
+// - auth/last_committed
+// Rebuild the auth paxos state for this OSD:
+//  - auth/${epoch}
+//  - auth/first_committed
+//  - auth/last_committed
+// Builds the incremental from the keyring and commits it to the store.
+static int update_auth(const string& keyring_path,
+                       const OSDSuperblock& sb,
+                       MonitorDBStore& ms)
+{
+  // stolen from AuthMonitor::prepare_command(), where prefix is "auth add"
+  KeyServerData::Incremental auth_inc;
+  int r;
+  if ((r = get_auth_inc(keyring_path, sb, &auth_inc))) {
+    return r;
+  }
+  add_auth(auth_inc, ms);
+  return 0;
+}
+
+// stolen from Monitor::check_fsid()
+static int check_fsid(const uuid_d& fsid, MonitorDBStore& ms)
+{
+ bufferlist bl;
+ int r = ms.get("monitor", "cluster_uuid", bl);
+ if (r == -ENOENT)
+ return r;
+ string uuid(bl.c_str(), bl.length());
+ auto end = uuid.find_first_of('\n');
+ if (end != uuid.npos) {
+ uuid.resize(end);
+ }
+ uuid_d existing;
+ if (!existing.parse(uuid.c_str())) {
+ cerr << "error: unable to parse uuid" << std::endl;
+ return -EINVAL;
+ }
+ if (fsid != existing) {
+ cerr << "error: cluster_uuid " << existing << " != " << fsid << std::endl;
+ return -EEXIST;
+ }
+ return 0;
+}
+
+// rebuild
+// - monitor/cluster_uuid
+// Rebuild
+//  - monitor/cluster_uuid
+// Writes the cluster fsid if the store has none; leaves an existing,
+// matching uuid alone and fails on mismatch or parse error.
+int update_monitor(const OSDSuperblock& sb, MonitorDBStore& ms)
+{
+  switch (check_fsid(sb.cluster_fsid, ms)) {
+  case -ENOENT:
+    break;               // not present yet; write it below
+  case -EINVAL:
+    return -EINVAL;
+  case -EEXIST:
+    return -EEXIST;
+  case 0:
+    return 0;            // already present and matching
+  default:
+    ceph_abort();
+  }
+  string uuid = stringify(sb.cluster_fsid) + "\n";
+  bufferlist bl;
+  bl.append(uuid);
+  auto t = make_shared<MonitorDBStore::Transaction>();
+  t->put("monitor", "cluster_uuid", bl);
+  ms.apply_transaction(t);
+  return 0;
+}
+
+// rebuild
+// - osdmap/${epoch}
+// - osdmap/full_${epoch}
+// - osdmap/full_latest
+// - osdmap/first_committed
+// - osdmap/last_committed
+// Rebuild
+//  - osdmap/${epoch}
+//  - osdmap/full_${epoch}
+//  - osdmap/full_latest
+//  - osdmap/first_committed
+//  - osdmap/last_committed
+// Trims map epochs older than the OSD's oldest_map, then copies the
+// incremental and full osdmaps held by the OSD's meta collection into the
+// mon store, verifying CRCs along the way. Commits in batches of
+// TRANSACTION_SIZE epochs to bound transaction size.
+int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms)
+{
+  const string prefix("osdmap");
+  const string first_committed_name("first_committed");
+  const string last_committed_name("last_committed");
+  epoch_t first_committed = ms.get(prefix, first_committed_name);
+  epoch_t last_committed = ms.get(prefix, last_committed_name);
+  auto t = make_shared<MonitorDBStore::Transaction>();
+
+  // trim stale maps
+  unsigned ntrimmed = 0;
+  // osdmap starts at 1. if we have a "0" first_committed, then there is nothing
+  // to trim. and "1 osdmaps trimmed" in the output message is misleading. so
+  // let's make it an exception.
+  for (auto e = first_committed; first_committed && e < sb.oldest_map; e++) {
+    t->erase(prefix, e);
+    t->erase(prefix, ms.combine_strings("full", e));
+    ntrimmed++;
+  }
+  // make sure we have a non-zero first_committed. OSDMonitor relies on this.
+  // because PaxosService::put_last_committed() set it to last_committed, if it
+  // is zero. which breaks OSDMonitor::update_from_paxos(), in which we believe
+  // that latest_full should always be greater than last_committed.
+  if (first_committed == 0 && sb.oldest_map < sb.newest_map) {
+    first_committed = 1;
+  } else if (ntrimmed) {
+    first_committed += ntrimmed;
+  }
+  if (first_committed) {
+    t->put(prefix, first_committed_name, first_committed);
+    ms.apply_transaction(t);
+    t = make_shared<MonitorDBStore::Transaction>();
+  }
+
+  unsigned nadded = 0;
+
+  auto ch = fs.open_collection(coll_t::meta());
+  OSDMap osdmap;
+  // Walk each epoch the OSD has that the store does not.
+  for (auto e = std::max(last_committed+1, sb.oldest_map);
+       e <= sb.newest_map; e++) {
+    bool have_crc = false;
+    uint32_t crc = -1;
+    uint64_t features = 0;
+    // add inc maps
+    // Immediately-invoked lambda so error paths can `return` an errno
+    // without aborting the surrounding loop.
+    auto add_inc_result = [&] {
+      const auto oid = OSD::get_inc_osdmap_pobject_name(e);
+      bufferlist bl;
+      int nread = fs.read(ch, oid, 0, 0, bl);
+      if (nread <= 0) {
+        cout << "missing " << oid << std::endl;
+        return -ENOENT;
+      }
+      t->put(prefix, e, bl);
+
+      OSDMap::Incremental inc;
+      auto p = bl.cbegin();
+      inc.decode(p);
+      features = inc.encode_features | CEPH_FEATURE_RESERVED;
+      if (osdmap.get_epoch() && e > 1) {
+        if (osdmap.apply_incremental(inc)) {
+          cerr << "bad fsid: "
+               << osdmap.get_fsid() << " != " << inc.fsid << std::endl;
+          return -EINVAL;
+        }
+        have_crc = inc.have_crc;
+        if (inc.have_crc) {
+          crc = inc.full_crc;
+          bufferlist fbl;
+          osdmap.encode(fbl, features);
+          if (osdmap.get_crc() != inc.full_crc) {
+            cerr << "mismatched inc crc: "
+                 << osdmap.get_crc() << " != " << inc.full_crc << std::endl;
+            return -EINVAL;
+          }
+          // inc.decode() verifies `inc_crc`, so it's been taken care of.
+        }
+      }
+      return 0;
+    }();
+    switch (add_inc_result) {
+    case -ENOENT:
+      // no worries, we always have full map
+      break;
+    case -EINVAL:
+      return -EINVAL;
+    case 0:
+      break;
+    default:
+      assert(0);
+    }
+    // add full maps
+    {
+      const auto oid = OSD::get_osdmap_pobject_name(e);
+      bufferlist bl;
+      int nread = fs.read(ch, oid, 0, 0, bl);
+      if (nread <= 0) {
+        cerr << "missing " << oid << std::endl;
+        return -EINVAL;
+      }
+      t->put(prefix, ms.combine_strings("full", e), bl);
+
+      auto p = bl.cbegin();
+      osdmap.decode(p);
+      if (osdmap.have_crc()) {
+        if (have_crc && osdmap.get_crc() != crc) {
+          cerr << "mismatched full/inc crc: "
+               << osdmap.get_crc() << " != " << crc << std::endl;
+          return -EINVAL;
+        }
+        // Re-encode and compare to catch feature-dependent encode drift.
+        uint32_t saved_crc = osdmap.get_crc();
+        bufferlist fbl;
+        osdmap.encode(fbl, features);
+        if (osdmap.get_crc() != saved_crc) {
+          cerr << "mismatched full crc: "
+               << saved_crc << " != " << osdmap.get_crc() << std::endl;
+          return -EINVAL;
+        }
+      }
+    }
+    nadded++;
+
+    // last_committed
+    t->put(prefix, last_committed_name, e);
+    // full last
+    t->put(prefix, ms.combine_strings("full", "latest"), e);
+
+    // this number comes from the default value of osd_target_transaction_size,
+    // so we won't OOM or stuff too many maps in a single transaction if OSD is
+    // keeping a large series of osdmap
+    static constexpr unsigned TRANSACTION_SIZE = 30;
+    if (t->size() >= TRANSACTION_SIZE) {
+      ms.apply_transaction(t);
+      t = make_shared<MonitorDBStore::Transaction>();
+    }
+  }
+  if (!t->empty()) {
+    ms.apply_transaction(t);
+  }
+  t.reset();
+
+  string osd_name("osd.");
+  osd_name += std::to_string(sb.whoami);
+  cout << std::left << setw(8)
+       << osd_name << ": "
+       << ntrimmed << " osdmaps trimmed, "
+       << nadded << " osdmaps added." << std::endl;
+  return 0;
+}
+
diff --git a/src/tools/rebuild_mondb.h b/src/tools/rebuild_mondb.h
new file mode 100644
index 000000000..8a2317d8c
--- /dev/null
+++ b/src/tools/rebuild_mondb.h
@@ -0,0 +1,9 @@
+#pragma once
+#include <string>
+
+class ObjectStore;
+class OSDSuperblock;
+
+int update_mon_db(ObjectStore& fs, OSDSuperblock& sb,
+ const std::string& keyring_path,
+ const std::string& store_path);
diff --git a/src/tools/rgw/parse-cr-dump.py b/src/tools/rgw/parse-cr-dump.py
new file mode 100755
index 000000000..539929b11
--- /dev/null
+++ b/src/tools/rgw/parse-cr-dump.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python
+from __future__ import print_function
+from collections import Counter
+import argparse
+import json
+import re
+import sys
+
+def gen_mgrs(args, cr_dump):
+    """Yield coroutine managers from the dump.
+
+    If args.manager is set, yield only the manager at that index;
+    otherwise yield every entry of cr_dump['coroutine_managers'].
+    """
+    mgrs = cr_dump['coroutine_managers']
+    if args.manager is not None:
+        yield mgrs[args.manager]
+    else:
+        for mgr in mgrs:
+            yield mgr
+
+def gen_stacks(args, cr_dump):
+    """Yield each coroutine stack across all selected managers' run contexts."""
+    for mgr in gen_mgrs(args, cr_dump):
+        for ctx in mgr['run_contexts']:
+            for stack in ctx['entries']:
+                yield stack
+
+def gen_ops(args, cr_dump):
+    """Yield (stack, op) pairs for every op in every selected stack."""
+    for stack in gen_stacks(args, cr_dump):
+        for op in stack['ops']:
+            yield stack, op
+
+def op_status(op):
+    """Return the op's status string, or '(none)' when absent."""
+    # "status": {"status": "...", "timestamp": "..."}
+    return op.get('status', {}).get('status', '(none)')
+
+def do_crs(args, cr_dump):
+    """Print a sorted count of coroutines, grouped by type or by status.
+
+    Grouping is chosen by args.group ('status' groups by op status,
+    anything else groups by coroutine type). args.filter (regex) is
+    always matched against the coroutine type. Ordering defaults to
+    descending; args.order == 'asc' reverses it, and args.limit caps the
+    number of rows printed. Returns 0.
+    """
+    counter = Counter()
+
+    if args.group == 'status':
+        print('Count:\tStatus:')
+        for _, op in gen_ops(args, cr_dump):
+            if args.filter and not re.search(args.filter, op['type']):
+                continue
+            counter[op_status(op)] += 1
+    else:
+        print('Count:\tCoroutine:')
+        for _, op in gen_ops(args, cr_dump):
+            name = op['type']
+            if args.filter and not re.search(args.filter, name):
+                continue
+            counter[name] += 1
+
+    crs = counter.most_common();
+
+    if args.order == 'asc':
+        crs.reverse()
+    if args.limit:
+        crs = crs[:args.limit]
+
+    for op in crs:
+        print('%d\t%s' % (op[1], op[0]))
+    print('Total:', sum(counter.values()))
+    return 0
+
+def match_ops(name, ops):
+    """Return True if the regex `name` matches any op type in `ops`."""
+    for op in ops:
+        if re.search(name, op):
+            return True
+    return False
+
+def do_stacks(args, cr_dump):
+    """Print each coroutine stack id with its comma-joined op types.
+
+    args.filter (regex) keeps only stacks containing a matching op;
+    args.limit stops after that many stacks (printing '...'). Returns 0.
+    """
+    print('Stack:\t\tCoroutines:')
+    count = 0
+    for stack in gen_stacks(args, cr_dump):
+        stack_id = stack['stack']
+        ops = [op['type'] for op in stack['ops']]
+        if args.filter and not match_ops(args.filter, ops):
+            continue
+        if args.limit and count == args.limit:
+            print('...')
+            break
+        print('%s\t%s' % (stack_id, ', '.join(ops)))
+        count += 1
+    print('Total:', count)
+    return 0
+
+def traverse_spawned_stacks(args, stack, depth, stacks, callback):
+    """Depth-first walk over a stack and the stacks its ops spawned.
+
+    Calls callback(stack, op, depth) for each op; a False return aborts
+    the whole traversal (propagated as the function's return value).
+    `stacks` maps stack id -> stack for resolving 'spawned' references;
+    unknown ids are skipped. args.filter only applies at depth 0.
+    """
+    for op in stack['ops']:
+        # only filter ops in base stack
+        if depth == 0 and args.filter and not re.search(args.filter, op['type']):
+            continue
+        if not callback(stack, op, depth):
+            return False
+        for spawned in op.get('spawned', []):
+            s = stacks.get(spawned)
+            if not s:
+                continue
+            if not traverse_spawned_stacks(args, s, depth + 1, stacks, callback):
+                return False
+    return True
+
+def do_stack(args, cr_dump):
+    """Print the ops of args.stack and its spawned descendants, indented
+    by depth. Returns 1 if the stack id is not found, else 0. args.limit
+    caps the number of ops printed (then prints '...').
+    """
+    # build a lookup table of stacks by id
+    stacks = {s['stack']: s for s in gen_stacks(args, cr_dump)}
+
+    stack = stacks.get(args.stack)
+    if not stack:
+        print('Stack %s not found' % args.stack, file=sys.stderr)
+        return 1
+
+    do_stack.count = 0 # for use in closure
+    def print_stack_op(stack, op, depth):
+        # Returning False stops traverse_spawned_stacks() early.
+        indent = ' ' * depth * 4
+        if args.limit and do_stack.count == args.limit:
+            print('%s...' % indent)
+            return False # stop traversal
+        do_stack.count += 1
+        print('%s[%s] %s: %s' % (indent, stack['stack'], op['type'], op_status(op)))
+        return True
+
+    traverse_spawned_stacks(args, stack, 0, stacks, print_stack_op)
+    return 0
+
+def do_spawned(args, cr_dump):
+    """Find and print the op that spawned stack args.stack.
+
+    Returns 0 on the first match, 1 (with a stderr message) if no op
+    lists the stack in its 'spawned' ids.
+    """
+    for stack, op in gen_ops(args, cr_dump):
+        if args.stack in op.get('spawned', []):
+            print('Stack %s spawned by [%s] %s' % (args.stack, stack['stack'], op['type']))
+            return 0
+    print('Stack %s not spawned' % args.stack, file=sys.stderr)
+    return 1
+
+def main():
+    # Common options apply to all subcommands; each subcommand binds its
+    # handler via set_defaults(func=...) and is invoked with the parsed
+    # args plus the JSON-decoded dump. Returns the handler's exit code.
+    parser = argparse.ArgumentParser(description='Parse and inspect the output of the "cr dump" admin socket command.')
+    parser.add_argument('--filename', type=argparse.FileType(), default=sys.stdin, help='Input filename (or stdin if empty)')
+    parser.add_argument('--filter', type=str, help='Filter by coroutine type (regex syntax is supported)')
+    parser.add_argument('--limit', type=int)
+    parser.add_argument('--manager', type=int, help='Index into coroutine_managers[]')
+
+    subparsers = parser.add_subparsers()
+
+    crs_parser = subparsers.add_parser('crs', help='Produce a sorted list of coroutines')
+    crs_parser.add_argument('--group', type=str, choices=['type', 'status'])
+    crs_parser.add_argument('--order', type=str, choices=['desc', 'asc'])
+    crs_parser.set_defaults(func=do_crs)
+
+    stacks_parser = subparsers.add_parser('stacks', help='Produce a list of coroutine stacks and their ops')
+    stacks_parser.set_defaults(func=do_stacks)
+
+    stack_parser = subparsers.add_parser('stack', help='Inspect a given coroutine stack')
+    stack_parser.add_argument('stack', type=str)
+    stack_parser.set_defaults(func=do_stack)
+
+    spawned_parser = subparsers.add_parser('spawned', help='Find the op that spawned the given stack')
+    spawned_parser.add_argument('stack', type=str)
+    spawned_parser.set_defaults(func=do_spawned)
+
+    args = parser.parse_args()
+    return args.func(args, json.load(args.filename))
+
+if __name__ == "__main__":
+ result = main()
+ sys.exit(result)
diff --git a/src/tools/scratchtool.c b/src/tools/scratchtool.c
new file mode 100644
index 000000000..70aeb6d89
--- /dev/null
+++ b/src/tools/scratchtool.c
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/*
+ * Set xattr `key` on object `oid` to `val`, storing the trailing NUL as
+ * part of the value (so readers can treat it as a C string).
+ * Returns 0 on success, 1 on failure (the error is printed).
+ */
+static int do_rados_setxattr(rados_ioctx_t io_ctx, const char *oid,
+                             const char *key, const char *val)
+{
+  /* +1 so the stored value includes the terminating NUL byte */
+  int ret = rados_setxattr(io_ctx, oid, key, val, strlen(val) + 1);
+  if (ret < 0) {
+    printf("rados_setxattr failed with error %d\n", ret);
+    return 1;
+  }
+  printf("rados_setxattr %s=%s\n", key, val);
+  return 0;
+}
+
+/*
+ * Read xattr `key` from object `oid` and compare it against `expected`
+ * (which was stored with its trailing NUL by do_rados_setxattr).
+ * Returns 0 if the value matches, 1 on error or mismatch.
+ */
+static int do_rados_getxattr(rados_ioctx_t io_ctx, const char *oid,
+                             const char *key, const char *expected)
+{
+  /* C99 VLA sized to hold the expected value plus its NUL */
+  size_t blen = strlen(expected) + 1;
+  char buf[blen];
+  memset(buf, 0, sizeof(buf));
+  int r = rados_getxattr(io_ctx, oid, key, buf, blen);
+  if (r < 0) {
+    printf("rados_getxattr(%s) failed with error %d\n", key, r);
+    return 1;
+  }
+  if (strcmp(buf, expected) != 0) {
+    printf("rados_getxattr(%s) got wrong result! "
+           "expected: '%s'. got '%s'\n", key, expected, buf);
+    return 1;
+  }
+  printf("rados_getxattr %s=%s\n", key, buf);
+  return 0;
+}
+
+/*
+ * Iterate every xattr on `oid` and verify that the NULL-terminated
+ * exkeys/exvals arrays are all present with exactly the expected values
+ * (each value stored with its trailing NUL).
+ * Returns 0 on success, 1 on any mismatch or iteration error.
+ */
+static int do_rados_getxattrs(rados_ioctx_t io_ctx, const char *oid,
+                              const char **exkeys, const char **exvals)
+{
+  rados_xattrs_iter_t iter;
+  int nval = 0, i, nfound = 0, r = 0, ret = 1;
+
+  /* count the expected attributes (exvals is NULL-terminated) */
+  for (i = 0; exvals[i]; ++i) {
+    ++nval;
+  }
+  r = rados_getxattrs(io_ctx, oid, &iter);
+  if (r) {
+    printf("rados_getxattrs(%s) failed with error %d\n", oid, r);
+    return 1;
+  }
+  while (1) {
+    size_t len;
+    const char *key, *val;
+    r = rados_getxattrs_next(iter, &key, &val, &len);
+    if (r) {
+      printf("rados_getxattrs(%s): rados_getxattrs_next "
+             "returned error %d\n", oid, r);
+      goto out_err;
+    }
+    /* a NULL key signals the end of iteration */
+    if (!key)
+      break;
+    for (i = 0; i < nval; ++i) {
+      if (strcmp(exkeys[i], key))
+        continue;
+      /* key matched: the value must match byte-for-byte, NUL included */
+      if ((len == strlen(exvals[i]) + 1) && (val != NULL) && (!strcmp(exvals[i], val))) {
+        nfound++;
+        break;
+      }
+      printf("rados_getxattrs(%s): got key %s, but the "
+             "value was %s rather than %s.\n",
+             oid, key, val, exvals[i]);
+      goto out_err;
+    }
+  }
+  if (nfound != nval) {
+    printf("rados_getxattrs(%s): only found %d extended attributes. "
+           "Expected %d\n", oid, nfound, nval);
+    goto out_err;
+  }
+  ret = 0;
+  printf("rados_getxattrs(%s)\n", oid);
+
+out_err:
+  /* release the iterator on both the success and failure paths */
+  rados_getxattrs_end(iter);
+  return ret;
+}
+
+/*
+ * End-to-end smoke test of the librados C API against a live cluster:
+ * configuration get/set, connect, pool create/list/stat, snapshots,
+ * synchronous and asynchronous I/O, xattrs, class exec, object listing,
+ * and pool deletion.  Returns 0 on success, 1 on error.
+ */
+static int testrados(void)
+{
+  char tmp[32];
+  int i, r, safe;
+  int ret = 1; //set 1 as error case
+  rados_t cl;
+  const char *oid = "foo_object";
+  const char *exkeys[] = { "a", "b", "c", NULL };
+  const char *exvals[] = { "1", "2", "3", NULL };
+
+  if (rados_create(&cl, NULL) < 0) {
+    printf("error initializing\n");
+    return 1;
+  }
+
+  if (rados_conf_read_file(cl, NULL)) {
+    printf("error reading configuration file\n");
+    goto out_err;
+  }
+
+  // Try to set a configuration option that doesn't exist.
+  // This should fail.
+  if (!rados_conf_set(cl, "config option that doesn't exist",
+                      "some random value")) {
+    printf("error: succeeded in setting nonexistent config option\n");
+    goto out_err;
+  }
+
+  if (rados_conf_get(cl, "log to stderr", tmp, sizeof(tmp))) {
+    printf("error: failed to read log_to_stderr from config\n");
+    goto out_err;
+  }
+
+  // Can we change it?
+  if (rados_conf_set(cl, "log to stderr", "true")) {
+    printf("error: error setting log_to_stderr\n");
+    goto out_err;
+  }
+  if (rados_conf_get(cl, "log to stderr", tmp, sizeof(tmp))) {
+    printf("error: failed to read log_to_stderr from config\n");
+    goto out_err;
+  }
+  if (strcmp(tmp, "true")) {
+    printf("error: new setting for log_to_stderr failed to take effect.\n");
+    goto out_err;
+  }
+
+  if (rados_connect(cl)) {
+    printf("error connecting\n");
+    goto out_err;
+  }
+  /* a second connect on the same handle must be rejected */
+  if (rados_connect(cl) == 0) {
+    printf("second connect attempt didn't return an error\n");
+    goto out_err;
+  }
+
+  /* create an io_ctx */
+  r = rados_pool_create(cl, "foo");
+  printf("rados_pool_create = %d\n", r);
+
+  rados_ioctx_t io_ctx;
+  r = rados_ioctx_create(cl, "foo", &io_ctx);
+  if (r < 0) {
+    printf("error creating ioctx\n");
+    goto out_err;
+  }
+  printf("rados_ioctx_create = %d, io_ctx = %p\n", r, io_ctx);
+
+  /* list all pools */
+  {
+    /* first call with a NULL buffer returns the required buffer size */
+    int buf_sz = rados_pool_list(cl, NULL, 0);
+    printf("need buffer size of %d\n", buf_sz);
+    /* NOTE(review): buf_sz is used as a VLA length below; if the first
+     * rados_pool_list call returned an error (< 0) this is undefined
+     * behavior -- confirm whether that can happen here. */
+    char buf[buf_sz];
+    int r = rados_pool_list(cl, buf, buf_sz);
+    if (r != buf_sz) {
+      printf("buffer size mismatch: got %d the first time, but %d "
+             "the second.\n", buf_sz, r);
+      goto out_err_cleanup;
+    }
+    /* pool names come back as consecutive NUL-terminated strings,
+     * terminated by an empty string */
+    const char *b = buf;
+    printf("begin pools.\n");
+    while (1) {
+      if (b[0] == '\0')
+        break;
+      printf(" pool: '%s'\n", b);
+      b += strlen(b) + 1;
+    };
+    printf("end pools.\n");
+  }
+
+
+  /* stat */
+  struct rados_pool_stat_t st;
+  r = rados_ioctx_pool_stat(io_ctx, &st);
+  printf("rados_ioctx_pool_stat = %d, %lld KB, %lld objects\n", r, (long long)st.num_kb, (long long)st.num_objects);
+
+  /* snapshots */
+  r = rados_ioctx_snap_create(io_ctx, "snap1");
+  printf("rados_ioctx_snap_create snap1 = %d\n", r);
+  rados_snap_t snaps[10];
+  r = rados_ioctx_snap_list(io_ctx, snaps, 10);
+  for (i=0; i<r; i++) {
+    char name[100];
+    rados_ioctx_snap_get_name(io_ctx, snaps[i], name, sizeof(name));
+    printf("rados_ioctx_snap_list got snap %lld %s\n", (long long)snaps[i], name);
+  }
+  rados_snap_t snapid;
+  r = rados_ioctx_snap_lookup(io_ctx, "snap1", &snapid);
+  printf("rados_ioctx_snap_lookup snap1 got %lld, result %d\n", (long long)snapid, r);
+  r = rados_ioctx_snap_remove(io_ctx, "snap1");
+  printf("rados_ioctx_snap_remove snap1 = %d\n", r);
+
+  /* sync io */
+  time_t tm;
+  char buf[128], buf2[128];
+  time(&tm);
+  snprintf(buf, 128, "%s", ctime(&tm));
+  r = rados_write(io_ctx, oid, buf, strlen(buf) + 1, 0);
+  printf("rados_write = %d\n", r);
+  r = rados_read(io_ctx, oid, buf2, sizeof(buf2), 0);
+  printf("rados_read = %d\n", r);
+  if (memcmp(buf, buf2, r))
+    printf("*** content mismatch ***\n");
+
+  /* attrs */
+  if (do_rados_setxattr(io_ctx, oid, "b", "2"))
+    goto out_err_cleanup;
+  if (do_rados_setxattr(io_ctx, oid, "a", "1"))
+    goto out_err_cleanup;
+  if (do_rados_setxattr(io_ctx, oid, "c", "3"))
+    goto out_err_cleanup;
+  if (do_rados_getxattr(io_ctx, oid, "a", "1"))
+    goto out_err_cleanup;
+  if (do_rados_getxattr(io_ctx, oid, "b", "2"))
+    goto out_err_cleanup;
+  if (do_rados_getxattr(io_ctx, oid, "c", "3"))
+    goto out_err_cleanup;
+  if (do_rados_getxattrs(io_ctx, oid, exkeys, exvals))
+    goto out_err_cleanup;
+
+  uint64_t size;
+  time_t mtime;
+  r = rados_stat(io_ctx, oid, &size, &mtime);
+  printf("rados_stat size = %lld mtime = %d = %d\n", (long long)size, (int)mtime, r);
+  r = rados_stat(io_ctx, "does_not_exist", NULL, NULL);
+  printf("rados_stat(does_not_exist) = %d\n", r);
+
+  /* exec */
+  rados_exec(io_ctx, oid, "crypto", "md5", buf, strlen(buf) + 1, buf, 128);
+  printf("exec result=%s\n", buf);
+  r = rados_read(io_ctx, oid, buf2, 128, 0);
+  printf("read result=%s\n", buf2);
+  printf("size=%d\n", r);
+
+  /* aio */
+  rados_completion_t a, b;
+  rados_aio_create_completion2(NULL, NULL, &a);
+  rados_aio_create_completion2(NULL, NULL, &b);
+  rados_aio_write(io_ctx, "a", a, buf, 100, 0);
+  /* presumably this path-like object name exercises handling of unusual
+   * characters in object names -- TODO confirm intent */
+  rados_aio_write(io_ctx, "../b/bb_bb_bb\\foo\\bar", b, buf, 100, 0);
+  rados_aio_wait_for_complete(a);
+  printf("a safe\n");
+  rados_aio_wait_for_complete(b);
+  printf("b safe\n");
+  rados_aio_release(a);
+  rados_aio_release(b);
+
+  /* test flush */
+  printf("testing aio flush\n");
+  rados_completion_t c;
+  rados_aio_create_completion2(NULL, NULL, &c);
+  rados_aio_write(io_ctx, "c", c, buf, 100, 0);
+  safe = rados_aio_is_safe(c);
+  /* NOTE(review): these two messages say "a" but the completion being
+   * tested here is 'c' */
+  printf("a should not yet be safe and ... %s\n", safe ? "is":"is not");
+  assert(!safe);
+  rados_aio_flush(io_ctx);
+  safe = rados_aio_is_safe(c);
+  printf("a should be safe and ... %s\n", safe ? "is":"is not");
+  assert(safe);
+  rados_aio_release(c);
+
+  rados_read(io_ctx, "../b/bb_bb_bb\\foo\\bar", buf2, 128, 0);
+
+  /* list objects */
+  rados_list_ctx_t h;
+  r = rados_nobjects_list_open(io_ctx, &h);
+  printf("rados_nobjects_list_open = %d, h = %p\n", r, h);
+  const char *poolname;
+  while (rados_nobjects_list_next2(h, &poolname, NULL, NULL, NULL, NULL, NULL) == 0)
+    printf("rados_nobjects_list_next2 got object '%s'\n", poolname);
+  rados_nobjects_list_close(h);
+
+  /* stat */
+  r = rados_ioctx_pool_stat(io_ctx, &st);
+  printf("rados_stat_pool = %d, %lld KB, %lld objects\n", r, (long long)st.num_kb, (long long)st.num_objects);
+
+  ret = 0;
+
+out_err_cleanup:
+  /* delete a pool */
+  rados_ioctx_destroy(io_ctx);
+
+  r = rados_pool_delete(cl, "foo");
+  printf("rados_delete_pool = %d\n", r);
+
+out_err:
+  rados_shutdown(cl);
+  return ret;
+}
+
+/* Entry point: the whole program is the testrados() smoke test. */
+int main(int argc, const char **argv)
+{
+  (void)argc;
+  (void)argv;
+  return testrados();
+}
diff --git a/src/tools/scratchtoolpp.cc b/src/tools/scratchtoolpp.cc
new file mode 100644
index 000000000..26a35bebc
--- /dev/null
+++ b/src/tools/scratchtoolpp.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+
+using namespace librados;
+
+#include <iostream>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+// Render `len` bytes of `buf` as lowercase hex digits into `str`.
+// NOTE(review): `str` must provide at least 2*len + 1 bytes (hex output
+// plus the terminating NUL written by the final sprintf).
+void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+  str[0] = '\0';
+  for (int i = 0; i < len; i++) {
+    sprintf(&str[i*2], "%02x", (int)buf[i]);
+  }
+}
+
+// Watch callback used by the watch/notify test in main(): simply logs
+// the opcode and version of each notification it receives.
+class C_Watch : public WatchCtx {
+public:
+  C_Watch() {}
+  void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) override {
+    cout << "C_Watch::notify() opcode=" << (int)opcode << " ver=" << ver << std::endl;
+  }
+};
+
+// Pause until the user presses enter (or stdin reaches EOF).
+void testradospp_milestone(void)
+{
+  cout << "*** press enter to continue ***" << std::endl;
+  // Swallow input up to and including the newline; stop early on EOF.
+  for (int ch = getchar(); ch != EOF && ch != '\n'; ch = getchar()) {
+    // discard
+  }
+}
+
+/*
+ * End-to-end smoke test of the librados C++ API against a live cluster:
+ * configuration, connect, pool create, object I/O, watch/notify,
+ * assert_version, class exec, compound operations, cmpxattr, object
+ * iteration, and pool deletion.  Interactive: testradospp_milestone()
+ * pauses between phases so the tester can inspect cluster state.
+ */
+int main(int argc, const char **argv)
+{
+  Rados rados;
+  if (rados.init(NULL) < 0) {
+    cerr << "couldn't initialize rados!" << std::endl;
+    exit(1);
+  }
+
+  if (rados.conf_read_file(NULL)) {
+    cerr << "couldn't read configuration file." << std::endl;
+    exit(1);
+  }
+  rados.conf_parse_argv(argc, argv);
+
+  // Setting a nonexistent option must fail.
+  if (!rados.conf_set("config option that doesn't exist",
+                      "some random value")) {
+    printf("error: succeeded in setting nonexistent config option\n");
+    exit(1);
+  }
+  if (rados.conf_set("log to stderr", "true")) {
+    printf("error: error setting log_to_stderr\n");
+    exit(1);
+  }
+  std::string tmp;
+  if (rados.conf_get("log to stderr", tmp)) {
+    printf("error: failed to read log_to_stderr from config\n");
+    exit(1);
+  }
+  if (tmp != "true") {
+    printf("error: new setting for log_to_stderr failed to take effect.\n");
+    exit(1);
+  }
+
+  if (rados.connect()) {
+    printf("error connecting\n");
+    exit(1);
+  }
+
+  cout << "rados_initialize completed" << std::endl;
+  testradospp_milestone();
+
+  time_t tm;
+  bufferlist bl, bl2, blf;
+  char buf[128];
+
+  // Seed the write payload with the current timestamp string.
+  time(&tm);
+  snprintf(buf, 128, "%s", ctime(&tm));
+  bl.append(buf, strlen(buf));
+  blf.append(buf, 16);
+
+  const char *oid = "bar";
+
+  int r = rados.pool_create("foo");
+  cout << "pool_create result = " << r << std::endl;
+
+  IoCtx io_ctx;
+  r = rados.ioctx_create("foo", io_ctx);
+  cout << "ioctx_create result = " << r << std::endl;
+
+  r = io_ctx.write(oid, bl, bl.length(), 0);
+  uint64_t objver = io_ctx.get_last_version();
+  ceph_assert(objver > 0);
+  cout << "io_ctx.write returned " << r << " last_ver=" << objver << std::endl;
+
+  uint64_t stat_size;
+  time_t stat_mtime;
+  r = io_ctx.stat(oid, &stat_size, &stat_mtime);
+  cout << "io_ctx.stat returned " << r << " size = " << stat_size << " mtime = " << stat_mtime << std::endl;
+
+  // NOTE(review): this stats `oid` (which exists) yet the message claims
+  // "does_not_exist"; scratchtool.c really stats a missing object at the
+  // equivalent point -- likely a copy/paste slip, confirm.
+  r = io_ctx.stat(oid, NULL, NULL);
+  cout << "io_ctx.stat(does_not_exist) = " << r << std::endl;
+
+  // watch/notify (v1 API; the deprecation pragmas at the top of this file
+  // suppress the warnings it triggers)
+  uint64_t handle;
+  C_Watch wc;
+  r = io_ctx.watch(oid, objver, &handle, &wc);
+  cout << "io_ctx.watch returned " << r << std::endl;
+
+  testradospp_milestone();
+  io_ctx.set_notify_timeout(7);
+  bufferlist notify_bl;
+  r = io_ctx.notify(oid, objver, notify_bl);
+  cout << "io_ctx.notify returned " << r << std::endl;
+  testradospp_milestone();
+
+  r = io_ctx.notify(oid, objver, notify_bl);
+  cout << "io_ctx.notify returned " << r << std::endl;
+  testradospp_milestone();
+
+  r = io_ctx.unwatch(oid, handle);
+  cout << "io_ctx.unwatch returned " << r << std::endl;
+  testradospp_milestone();
+
+  // Notify again after unwatch: no watcher registered any more.
+  r = io_ctx.notify(oid, objver, notify_bl);
+  cout << "io_ctx.notify returned " << r << std::endl;
+  testradospp_milestone();
+  io_ctx.set_assert_version(objver);
+
+  r = io_ctx.write(oid, bl, bl.length() - 1, 0);
+  cout << "io_ctx.write returned " << r << std::endl;
+
+  r = io_ctx.write(oid, bl, bl.length() - 2, 0);
+  cout << "io_ctx.write returned " << r << std::endl;
+  r = io_ctx.write(oid, bl, bl.length() - 3, 0);
+  cout << "rados.write returned " << r << std::endl;
+  r = io_ctx.append(oid, bl, bl.length());
+  cout << "rados.write returned " << r << std::endl;
+  r = io_ctx.write_full(oid, blf);
+  cout << "rados.write_full returned " << r << std::endl;
+  r = io_ctx.read(oid, bl, bl.length(), 0);
+  cout << "rados.read returned " << r << std::endl;
+  r = io_ctx.trunc(oid, 8);
+  cout << "rados.trunc returned " << r << std::endl;
+  r = io_ctx.read(oid, bl, bl.length(), 0);
+  cout << "rados.read returned " << r << std::endl;
+  r = io_ctx.exec(oid, "crypto", "md5", bl, bl2);
+  cout << "exec returned " << r << " buf size=" << bl2.length() << std::endl;
+  const unsigned char *md5 = (const unsigned char *)bl2.c_str();
+  char md5_str[bl2.length()*2 + 1];
+  buf_to_hex(md5, bl2.length(), md5_str);
+  cout << "md5 result=" << md5_str << std::endl;
+
+  // test assert_version
+  // (the zero-length reads below appear to serve only to refresh the
+  // object version via get_last_version -- confirm)
+  r = io_ctx.read(oid, bl, 0, 1);
+  ceph_assert(r >= 0);
+  uint64_t v = io_ctx.get_last_version();
+  cout << oid << " version is " << v << std::endl;
+  ceph_assert(v > 0);
+  io_ctx.set_assert_version(v);
+  r = io_ctx.read(oid, bl, 0, 1);
+  ceph_assert(r >= 0);
+  io_ctx.set_assert_version(v - 1);
+  r = io_ctx.read(oid, bl, 0, 1);
+  ceph_assert(r == -ERANGE);
+  io_ctx.set_assert_version(v + 1);
+  r = io_ctx.read(oid, bl, 0, 1);
+  ceph_assert(r == -EOVERFLOW);
+
+  r = io_ctx.exec(oid, "crypto", "sha1", bl, bl2);
+  cout << "exec returned " << r << std::endl;
+  const unsigned char *sha1 = (const unsigned char *)bl2.c_str();
+  char sha1_str[bl2.length()*2 + 1];
+  buf_to_hex(sha1, bl2.length(), sha1_str);
+  cout << "sha1 result=" << sha1_str << std::endl;
+
+  r = io_ctx.exec(oid, "acl", "set", bl, bl2);
+  cout << "exec (set) returned " << r << std::endl;
+  r = io_ctx.exec(oid, "acl", "get", bl, bl2);
+  cout << "exec (get) returned " << r << std::endl;
+  if (bl2.length() > 0) {
+    cout << "attr=" << bl2.c_str() << std::endl;
+  }
+
+  int size = io_ctx.read(oid, bl2, 128, 0);
+  if (size <= 0) {
+    cout << "failed to read oid " << oid << "." << std::endl;
+    exit(1);
+  }
+  if (size > 4096) {
+    cout << "read too many bytes from oid " << oid << "." << std::endl;
+    exit(1);
+  }
+  // Copy into a NUL-terminated buffer so it prints as a C string.
+  char rbuf[size + 1];
+  memcpy(rbuf, bl2.c_str(), size);
+  rbuf[size] = '\0';
+  cout << "read result='" << rbuf << "'" << std::endl;
+  cout << "size=" << size << std::endl;
+
+  const char *oid2 = "jjj10.rbd";
+  r = io_ctx.exec(oid2, "rbd", "snap_list", bl, bl2);
+  cout << "snap_list result=" << r << std::endl;
+  r = io_ctx.exec(oid2, "rbd", "snap_add", bl, bl2);
+  cout << "snap_add result=" << r << std::endl;
+
+  if (r > 0) {
+    char *s = bl2.c_str();
+    for (int i=0; i<r; i++, s += strlen(s) + 1)
+      cout << s << std::endl;
+  }
+
+  cout << "compound operation..." << std::endl;
+  ObjectWriteOperation o;
+  o.write(0, bl);
+  o.setxattr("foo", bl2);
+  r = io_ctx.operate(oid, &o);
+  cout << "operate result=" << r << std::endl;
+
+  cout << "cmpxattr" << std::endl;
+  bufferlist val;
+  val.append("foo");
+  r = io_ctx.setxattr(oid, "foo", val);
+  ceph_assert(r >= 0);
+  {
+    // Equal comparison must succeed.
+    ObjectReadOperation o;
+    o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val);
+    r = io_ctx.operate(oid, &o, &bl2);
+    cout << " got " << r << " wanted >= 0" << std::endl;
+    ceph_assert(r >= 0);
+  }
+  val.append("...");
+  {
+    // Mismatched value must cancel the op.
+    ObjectReadOperation o;
+    o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val);
+    r = io_ctx.operate(oid, &o, &bl2);
+    cout << " got " << r << " wanted " << -ECANCELED << " (-ECANCELED)" << std::endl;
+    ceph_assert(r == -ECANCELED);
+  }
+
+  io_ctx.locator_set_key(string());
+
+  cout << "iterating over objects..." << std::endl;
+  int num_objs = 0;
+  for (NObjectIterator iter = io_ctx.nobjects_begin();
+       iter != io_ctx.nobjects_end(); ++iter) {
+    num_objs++;
+    cout << "'" << *iter << "'" << std::endl;
+  }
+  cout << "iterated over " << num_objs << " objects." << std::endl;
+  map<string, bufferlist> attrset;
+  io_ctx.getxattrs(oid, attrset);
+
+  map<string, bufferlist>::iterator it;
+  for (it = attrset.begin(); it != attrset.end(); ++it) {
+    cout << "xattr: " << it->first << std::endl;
+  }
+
+  r = io_ctx.remove(oid);
+  cout << "remove result=" << r << std::endl;
+
+  r = rados.pool_delete("foo");
+  cout << "pool_delete result=" << r << std::endl;
+
+  rados.shutdown();
+
+  return 0;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
diff --git a/src/tools/setup-virtualenv.sh b/src/tools/setup-virtualenv.sh
new file mode 100755
index 000000000..34aee158a
--- /dev/null
+++ b/src/tools/setup-virtualenv.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2016 <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+SCRIPTNAME="$(basename $0)"
+if [ `uname` == FreeBSD ]; then
+ GETOPT="/usr/local/bin/getopt"
+else
+ GETOPT=getopt
+fi
+
+function usage {
+ echo
+ echo "$SCRIPTNAME - automate setup of Python virtual environment"
+ echo " (for use in building Ceph)"
+ echo
+ echo "Usage:"
+ echo " $SCRIPTNAME [--python=PYTHON_BINARY] TARGET_DIRECTORY"
+ echo
+ echo " TARGET_DIRECTORY will be created if it doesn't exist,"
+ echo " and completely destroyed and re-created if it does!"
+ echo
+ exit 1
+}
+
+# Normalize the command line with getopt(1) so long options work, then
+# re-inject the normalized result as the positional parameters.
+TEMP=$($GETOPT --options "h" --long "help,python:" --name "$SCRIPTNAME" -- "$@")
+test $? != 0 && usage
+eval set -- "$TEMP"
+
+# Interpreter used to create the virtualenv; overridable via --python.
+PYTHON=python3
+while true ; do
+    case "$1" in
+        -h|--help) usage ;; # does not return
+        --python) PYTHON="$2" ; shift ; shift ;;
+        --) shift ; break ;;
+        *) echo "Internal error" ; exit 1 ;;
+    esac
+done
+
+# Verify the chosen interpreter actually runs before destroying anything.
+if ! "$PYTHON" -VV; then
+    echo "$SCRIPTNAME: unable to locate a valid PYTHON_BINARY"
+    usage
+fi
+
+DIR=$1
+if [ -z "$DIR" ] ; then
+    echo "$SCRIPTNAME: need a directory path, but none was provided"
+    usage
+fi
+# Recreate the target directory from scratch.  Quote "$DIR" so paths with
+# spaces work, and use -- so a path beginning with '-' is not parsed as an
+# option (this is a destructive rm -fr; the guard above ensures non-empty).
+rm -fr -- "$DIR"
+mkdir -p -- "$DIR"
+"$PYTHON" -m venv "$DIR"
+. "$DIR/bin/activate"
+
+if pip --help | grep -q disable-pip-version-check; then
+ DISABLE_PIP_VERSION_CHECK=--disable-pip-version-check
+else
+ DISABLE_PIP_VERSION_CHECK=
+fi
+
+# older versions of pip will not install wrap_console scripts
+# when using wheel packages
+pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install --upgrade 'pip >= 6.1'
+
+if pip --help | grep -q disable-pip-version-check; then
+ DISABLE_PIP_VERSION_CHECK=--disable-pip-version-check
+else
+ DISABLE_PIP_VERSION_CHECK=
+fi
+
+if test -d wheelhouse ; then
+ export NO_INDEX=--no-index
+fi
+
+pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --find-links=file://$(pwd)/wheelhouse 'tox >=2.9.1'
+
+require_files=$(ls *requirements*.txt 2>/dev/null) || true
+constraint_files=$(ls *constraints*.txt 2>/dev/null) || true
+require=$(echo -n "$require_files" | sed -e 's/^/-r /')
+constraint=$(echo -n "$constraint_files" | sed -e 's/^/-c /')
+md5=wheelhouse/md5
+if test "$require"; then
+ if ! test -f $md5 || ! md5sum -c wheelhouse/md5 > /dev/null; then
+ NO_INDEX=''
+ fi
+ pip --exists-action i $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX \
+ --find-links=file://$(pwd)/wheelhouse $require $constraint
+fi