author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
commit    483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree      e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/tools
parent    Initial commit. (diff)
Adding upstream version 14.2.21. (upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/tools')
-rw-r--r--  src/tools/CMakeLists.txt | 129
-rw-r--r--  src/tools/RadosDump.cc | 166
-rw-r--r--  src/tools/RadosDump.h | 409
-rw-r--r--  src/tools/ceph-client-debug.cc | 190
-rw-r--r--  src/tools/ceph-dencoder/CMakeLists.txt | 68
-rw-r--r--  src/tools/ceph-dencoder/ceph_dencoder.cc | 480
-rw-r--r--  src/tools/ceph-dencoder/ceph_time.h | 68
-rw-r--r--  src/tools/ceph-dencoder/sstring.h | 40
-rw-r--r--  src/tools/ceph-dencoder/types.h | 880
-rw-r--r--  src/tools/ceph-diff-sorted.cc | 173
-rw-r--r--  src/tools/ceph-lazy/bash_completion.d/ceph-lazy | 27
-rwxr-xr-x  src/tools/ceph-lazy/ceph-lazy | 709
-rwxr-xr-x  src/tools/ceph-monstore-update-crush.sh | 174
-rw-r--r--  src/tools/ceph_authtool.cc | 316
-rw-r--r--  src/tools/ceph_conf.cc | 258
-rw-r--r--  src/tools/ceph_dedup_tool.cc | 834
-rw-r--r--  src/tools/ceph_kvstore_tool.cc | 356
-rw-r--r--  src/tools/ceph_monstore_tool.cc | 1297
-rw-r--r--  src/tools/ceph_objectstore_tool.cc | 4249
-rw-r--r--  src/tools/ceph_objectstore_tool.h | 44
-rw-r--r--  src/tools/ceph_osdomap_tool.cc | 211
-rw-r--r--  src/tools/cephfs/CMakeLists.txt | 49
-rw-r--r--  src/tools/cephfs/DataScan.cc | 2188
-rw-r--r--  src/tools/cephfs/DataScan.h | 341
-rw-r--r--  src/tools/cephfs/Dumper.cc | 431
-rw-r--r--  src/tools/cephfs/Dumper.h | 45
-rw-r--r--  src/tools/cephfs/EventOutput.cc | 153
-rw-r--r--  src/tools/cephfs/EventOutput.h | 42
-rw-r--r--  src/tools/cephfs/JournalFilter.cc | 315
-rw-r--r--  src/tools/cephfs/JournalFilter.h | 73
-rw-r--r--  src/tools/cephfs/JournalScanner.cc | 438
-rw-r--r--  src/tools/cephfs/JournalScanner.h | 133
-rw-r--r--  src/tools/cephfs/JournalTool.cc | 1256
-rw-r--r--  src/tools/cephfs/JournalTool.h | 101
-rw-r--r--  src/tools/cephfs/MDSUtility.cc | 162
-rw-r--r--  src/tools/cephfs/MDSUtility.h | 59
-rw-r--r--  src/tools/cephfs/PgFiles.cc | 194
-rw-r--r--  src/tools/cephfs/PgFiles.h | 51
-rw-r--r--  src/tools/cephfs/Resetter.cc | 224
-rw-r--r--  src/tools/cephfs/Resetter.h | 50
-rw-r--r--  src/tools/cephfs/RoleSelector.cc | 59
-rw-r--r--  src/tools/cephfs/RoleSelector.h | 36
-rw-r--r--  src/tools/cephfs/TableTool.cc | 417
-rw-r--r--  src/tools/cephfs/TableTool.h | 40
-rw-r--r--  src/tools/cephfs/cephfs-data-scan.cc | 47
-rw-r--r--  src/tools/cephfs/cephfs-journal-tool.cc | 58
-rw-r--r--  src/tools/cephfs/cephfs-shell | 1295
-rw-r--r--  src/tools/cephfs/cephfs-table-tool.cc | 47
-rw-r--r--  src/tools/cephfs/setup.py | 27
-rw-r--r--  src/tools/crushtool.cc | 1304
-rwxr-xr-x  src/tools/histogram_dump.py | 104
-rw-r--r--  src/tools/kvstore_tool.cc | 316
-rw-r--r--  src/tools/kvstore_tool.h | 80
-rw-r--r--  src/tools/monmaptool.cc | 473
-rw-r--r--  src/tools/osdmaptool.cc | 799
-rw-r--r--  src/tools/psim.cc | 117
-rw-r--r--  src/tools/rados/PoolDump.cc | 169
-rw-r--r--  src/tools/rados/PoolDump.h | 29
-rw-r--r--  src/tools/rados/RadosImport.cc | 399
-rw-r--r--  src/tools/rados/RadosImport.h | 45
-rw-r--r--  src/tools/rados/rados.cc | 4135
-rw-r--r--  src/tools/radosacl.cc | 186
-rw-r--r--  src/tools/rbd/ArgumentTypes.cc | 515
-rw-r--r--  src/tools/rbd/ArgumentTypes.h | 218
-rw-r--r--  src/tools/rbd/CMakeLists.txt | 65
-rw-r--r--  src/tools/rbd/IndentStream.cc | 59
-rw-r--r--  src/tools/rbd/IndentStream.h | 60
-rw-r--r--  src/tools/rbd/MirrorDaemonServiceInfo.cc | 174
-rw-r--r--  src/tools/rbd/MirrorDaemonServiceInfo.h | 34
-rw-r--r--  src/tools/rbd/OptionPrinter.cc | 110
-rw-r--r--  src/tools/rbd/OptionPrinter.h | 40
-rw-r--r--  src/tools/rbd/Shell.cc | 432
-rw-r--r--  src/tools/rbd/Shell.h | 76
-rw-r--r--  src/tools/rbd/Utils.cc | 907
-rw-r--r--  src/tools/rbd/Utils.h | 204
-rw-r--r--  src/tools/rbd/action/Bench.cc | 539
-rw-r--r--  src/tools/rbd/action/Children.cc | 166
-rw-r--r--  src/tools/rbd/action/Clone.cc | 99
-rw-r--r--  src/tools/rbd/action/Config.cc | 890
-rw-r--r--  src/tools/rbd/action/Copy.cc | 195
-rw-r--r--  src/tools/rbd/action/Create.cc | 264
-rw-r--r--  src/tools/rbd/action/Device.cc | 185
-rw-r--r--  src/tools/rbd/action/Diff.cc | 143
-rw-r--r--  src/tools/rbd/action/DiskUsage.cc | 341
-rw-r--r--  src/tools/rbd/action/Export.cc | 651
-rw-r--r--  src/tools/rbd/action/Feature.cc | 116
-rw-r--r--  src/tools/rbd/action/Flatten.cc | 74
-rw-r--r--  src/tools/rbd/action/Ggate.cc | 193
-rw-r--r--  src/tools/rbd/action/Group.cc | 904
-rw-r--r--  src/tools/rbd/action/ImageMeta.cc | 345
-rw-r--r--  src/tools/rbd/action/Import.cc | 1037
-rw-r--r--  src/tools/rbd/action/Info.cc | 459
-rw-r--r--  src/tools/rbd/action/Journal.cc | 1254
-rw-r--r--  src/tools/rbd/action/Kernel.cc | 561
-rw-r--r--  src/tools/rbd/action/List.cc | 340
-rw-r--r--  src/tools/rbd/action/Lock.cc | 279
-rw-r--r--  src/tools/rbd/action/MergeDiff.cc | 454
-rw-r--r--  src/tools/rbd/action/Migration.cc | 338
-rw-r--r--  src/tools/rbd/action/MirrorImage.cc | 360
-rw-r--r--  src/tools/rbd/action/MirrorPool.cc | 1537
-rw-r--r--  src/tools/rbd/action/Namespace.cc | 191
-rw-r--r--  src/tools/rbd/action/Nbd.cc | 286
-rw-r--r--  src/tools/rbd/action/ObjectMap.cc | 131
-rw-r--r--  src/tools/rbd/action/Perf.cc | 699
-rw-r--r--  src/tools/rbd/action/Pool.cc | 162
-rw-r--r--  src/tools/rbd/action/Remove.cc | 161
-rw-r--r--  src/tools/rbd/action/Rename.cc | 94
-rw-r--r--  src/tools/rbd/action/Resize.cc | 106
-rw-r--r--  src/tools/rbd/action/Snap.cc | 889
-rw-r--r--  src/tools/rbd/action/Sparsify.cc | 82
-rw-r--r--  src/tools/rbd/action/Status.cc | 214
-rw-r--r--  src/tools/rbd/action/Trash.cc | 525
-rw-r--r--  src/tools/rbd/action/Watch.cc | 149
-rw-r--r--  src/tools/rbd/rbd.cc | 10
-rw-r--r--  src/tools/rbd_ggate/CMakeLists.txt | 9
-rw-r--r--  src/tools/rbd_ggate/Driver.cc | 165
-rw-r--r--  src/tools/rbd_ggate/Driver.h | 50
-rw-r--r--  src/tools/rbd_ggate/Request.h | 55
-rw-r--r--  src/tools/rbd_ggate/Server.cc | 270
-rw-r--r--  src/tools/rbd_ggate/Server.h | 88
-rw-r--r--  src/tools/rbd_ggate/Watcher.cc | 48
-rw-r--r--  src/tools/rbd_ggate/Watcher.h | 34
-rw-r--r--  src/tools/rbd_ggate/debug.cc | 55
-rw-r--r--  src/tools/rbd_ggate/debug.h | 17
-rw-r--r--  src/tools/rbd_ggate/ggate_drv.c | 379
-rw-r--r--  src/tools/rbd_ggate/ggate_drv.h | 64
-rw-r--r--  src/tools/rbd_ggate/main.cc | 521
-rw-r--r--  src/tools/rbd_mirror/BaseRequest.h | 43
-rw-r--r--  src/tools/rbd_mirror/CMakeLists.txt | 69
-rw-r--r--  src/tools/rbd_mirror/ClusterWatcher.cc | 223
-rw-r--r--  src/tools/rbd_mirror/ClusterWatcher.h | 69
-rw-r--r--  src/tools/rbd_mirror/ImageDeleter.cc | 549
-rw-r--r--  src/tools/rbd_mirror/ImageDeleter.h | 180
-rw-r--r--  src/tools/rbd_mirror/ImageMap.cc | 601
-rw-r--r--  src/tools/rbd_mirror/ImageMap.h | 175
-rw-r--r--  src/tools/rbd_mirror/ImageReplayer.cc | 1896
-rw-r--r--  src/tools/rbd_mirror/ImageReplayer.h | 438
-rw-r--r--  src/tools/rbd_mirror/ImageSync.cc | 481
-rw-r--r--  src/tools/rbd_mirror/ImageSync.h | 160
-rw-r--r--  src/tools/rbd_mirror/ImageSyncThrottler.cc | 227
-rw-r--r--  src/tools/rbd_mirror/ImageSyncThrottler.h | 65
-rw-r--r--  src/tools/rbd_mirror/InstanceReplayer.cc | 510
-rw-r--r--  src/tools/rbd_mirror/InstanceReplayer.h | 123
-rw-r--r--  src/tools/rbd_mirror/InstanceWatcher.cc | 1299
-rw-r--r--  src/tools/rbd_mirror/InstanceWatcher.h | 264
-rw-r--r--  src/tools/rbd_mirror/Instances.cc | 359
-rw-r--r--  src/tools/rbd_mirror/Instances.h | 167
-rw-r--r--  src/tools/rbd_mirror/LeaderWatcher.cc | 1145
-rw-r--r--  src/tools/rbd_mirror/LeaderWatcher.h | 320
-rw-r--r--  src/tools/rbd_mirror/Mirror.cc | 448
-rw-r--r--  src/tools/rbd_mirror/Mirror.h | 77
-rw-r--r--  src/tools/rbd_mirror/MirrorStatusWatcher.cc | 74
-rw-r--r--  src/tools/rbd_mirror/MirrorStatusWatcher.h | 39
-rw-r--r--  src/tools/rbd_mirror/PoolReplayer.cc | 1133
-rw-r--r--  src/tools/rbd_mirror/PoolReplayer.h | 303
-rw-r--r--  src/tools/rbd_mirror/PoolWatcher.cc | 553
-rw-r--r--  src/tools/rbd_mirror/PoolWatcher.h | 166
-rw-r--r--  src/tools/rbd_mirror/ProgressContext.h | 21
-rw-r--r--  src/tools/rbd_mirror/ServiceDaemon.cc | 251
-rw-r--r--  src/tools/rbd_mirror/ServiceDaemon.h | 86
-rw-r--r--  src/tools/rbd_mirror/Threads.cc | 45
-rw-r--r--  src/tools/rbd_mirror/Threads.h | 39
-rw-r--r--  src/tools/rbd_mirror/Types.cc | 21
-rw-r--r--  src/tools/rbd_mirror/Types.h | 123
-rw-r--r--  src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc | 290
-rw-r--r--  src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h | 104
-rw-r--r--  src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc | 384
-rw-r--r--  src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h | 136
-rw-r--r--  src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc | 265
-rw-r--r--  src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h | 113
-rw-r--r--  src/tools/rbd_mirror/image_deleter/TrashWatcher.cc | 384
-rw-r--r--  src/tools/rbd_mirror/image_deleter/TrashWatcher.h | 139
-rw-r--r--  src/tools/rbd_mirror/image_deleter/Types.h | 54
-rw-r--r--  src/tools/rbd_mirror/image_map/LoadRequest.cc | 98
-rw-r--r--  src/tools/rbd_mirror/image_map/LoadRequest.h | 64
-rw-r--r--  src/tools/rbd_mirror/image_map/Policy.cc | 406
-rw-r--r--  src/tools/rbd_mirror/image_map/Policy.h | 122
-rw-r--r--  src/tools/rbd_mirror/image_map/SimplePolicy.cc | 89
-rw-r--r--  src/tools/rbd_mirror/image_map/SimplePolicy.h | 39
-rw-r--r--  src/tools/rbd_mirror/image_map/StateTransition.cc | 94
-rw-r--r--  src/tools/rbd_mirror/image_map/StateTransition.h | 76
-rw-r--r--  src/tools/rbd_mirror/image_map/Types.cc | 138
-rw-r--r--  src/tools/rbd_mirror/image_map/Types.h | 130
-rw-r--r--  src/tools/rbd_mirror/image_map/UpdateRequest.cc | 100
-rw-r--r--  src/tools/rbd_mirror/image_map/UpdateRequest.h | 65
-rw-r--r--  src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc | 785
-rw-r--r--  src/tools/rbd_mirror/image_replayer/BootstrapRequest.h | 230
-rw-r--r--  src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc | 64
-rw-r--r--  src/tools/rbd_mirror/image_replayer/CloseImageRequest.h | 56
-rw-r--r--  src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc | 506
-rw-r--r--  src/tools/rbd_mirror/image_replayer/CreateImageRequest.h | 154
-rw-r--r--  src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc | 204
-rw-r--r--  src/tools/rbd_mirror/image_replayer/EventPreprocessor.h | 122
-rw-r--r--  src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc | 85
-rw-r--r--  src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h | 75
-rw-r--r--  src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc | 125
-rw-r--r--  src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h | 67
-rw-r--r--  src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc | 75
-rw-r--r--  src/tools/rbd_mirror/image_replayer/OpenImageRequest.h | 71
-rw-r--r--  src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc | 271
-rw-r--r--  src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h | 90
-rw-r--r--  src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc | 180
-rw-r--r--  src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h | 102
-rw-r--r--  src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc | 195
-rw-r--r--  src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h | 141
-rw-r--r--  src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc | 246
-rw-r--r--  src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h | 60
-rw-r--r--  src/tools/rbd_mirror/image_replayer/Types.h | 21
-rw-r--r--  src/tools/rbd_mirror/image_replayer/Utils.cc | 50
-rw-r--r--  src/tools/rbd_mirror/image_replayer/Utils.h | 23
-rw-r--r--  src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc | 182
-rw-r--r--  src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h | 96
-rw-r--r--  src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc | 220
-rw-r--r--  src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h | 96
-rw-r--r--  src/tools/rbd_mirror/instance_watcher/Types.cc | 245
-rw-r--r--  src/tools/rbd_mirror/instance_watcher/Types.h | 197
-rw-r--r--  src/tools/rbd_mirror/instances/Types.h | 28
-rw-r--r--  src/tools/rbd_mirror/leader_watcher/Types.cc | 161
-rw-r--r--  src/tools/rbd_mirror/leader_watcher/Types.h | 117
-rw-r--r--  src/tools/rbd_mirror/main.cc | 104
-rw-r--r--  src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc | 89
-rw-r--r--  src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h | 73
-rw-r--r--  src/tools/rbd_mirror/pool_watcher/Types.h | 27
-rw-r--r--  src/tools/rbd_mirror/service_daemon/Types.cc | 29
-rw-r--r--  src/tools/rbd_mirror/service_daemon/Types.h | 33
-rw-r--r--  src/tools/rbd_nbd/CMakeLists.txt | 4
-rw-r--r--  src/tools/rbd_nbd/nbd-netlink.h | 70
-rw-r--r--  src/tools/rbd_nbd/rbd-nbd.cc | 1615
-rw-r--r--  src/tools/rbd_recover_tool/FAQ | 16
-rw-r--r--  src/tools/rbd_recover_tool/README | 97
-rw-r--r--  src/tools/rbd_recover_tool/TODO | 2
-rw-r--r--  src/tools/rbd_recover_tool/common_h | 412
-rw-r--r--  src/tools/rbd_recover_tool/config/mds_host | 0
-rw-r--r--  src/tools/rbd_recover_tool/config/mon_host | 0
-rw-r--r--  src/tools/rbd_recover_tool/config/osd_host_path | 0
-rw-r--r--  src/tools/rbd_recover_tool/database_h | 1134
-rw-r--r--  src/tools/rbd_recover_tool/epoch_h | 119
-rw-r--r--  src/tools/rbd_recover_tool/metadata_h | 368
-rwxr-xr-x  src/tools/rbd_recover_tool/osd_job | 170
-rwxr-xr-x  src/tools/rbd_recover_tool/rbd-recover-tool | 327
-rwxr-xr-x  src/tools/rbd_recover_tool/test_rbd_recover_tool.sh | 542
-rw-r--r--  src/tools/rebuild_mondb.cc | 351
-rw-r--r--  src/tools/rebuild_mondb.h | 9
-rwxr-xr-x  src/tools/rgw/parse-cr-dump.py | 168
-rw-r--r--  src/tools/scratchtool.c | 319
-rw-r--r--  src/tools/scratchtoolpp.cc | 293
-rwxr-xr-x  src/tools/setup-virtualenv.sh | 89
247 files changed, 75469 insertions, 0 deletions
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
new file mode 100644
index 00000000..fc8539ff
--- /dev/null
+++ b/src/tools/CMakeLists.txt
@@ -0,0 +1,129 @@
+set(rados_srcs
+ rados/rados.cc
+ RadosDump.cc
+ rados/RadosImport.cc
+ rados/PoolDump.cc
+ ${PROJECT_SOURCE_DIR}/src/common/util.cc
+ ${PROJECT_SOURCE_DIR}/src/common/obj_bencher.cc
+ ${PROJECT_SOURCE_DIR}/src/osd/ECUtil.cc)
+add_executable(rados ${rados_srcs})
+
+target_link_libraries(rados librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+if(WITH_LIBRADOSSTRIPER)
+ target_link_libraries(rados radosstriper)
+else()
+ target_link_libraries(rados cls_lock_client)
+endif()
+install(TARGETS rados DESTINATION bin)
+
+if(WITH_TESTS)
+add_executable(ceph_scratchtool scratchtool.c)
+target_link_libraries(ceph_scratchtool librados global)
+install(TARGETS ceph_scratchtool DESTINATION bin)
+
+add_executable(ceph_scratchtoolpp scratchtoolpp.cc)
+target_link_libraries(ceph_scratchtoolpp librados global)
+install(TARGETS ceph_scratchtoolpp DESTINATION bin)
+
+add_executable(ceph_radosacl radosacl.cc)
+target_link_libraries(ceph_radosacl librados global)
+install(TARGETS ceph_radosacl DESTINATION bin)
+
+install(PROGRAMS
+ ceph-monstore-update-crush.sh
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/ceph)
+endif(WITH_TESTS)
+
+add_executable(ceph-osdomap-tool ceph_osdomap_tool.cc)
+target_link_libraries(ceph-osdomap-tool os global Boost::program_options)
+install(TARGETS ceph-osdomap-tool DESTINATION bin)
+
+add_executable(ceph-monstore-tool
+ ceph_monstore_tool.cc
+ ../mgr/mgr_commands.cc)
+target_link_libraries(ceph-monstore-tool os global Boost::program_options)
+install(TARGETS ceph-monstore-tool DESTINATION bin)
+
+add_executable(ceph-objectstore-tool
+ ceph_objectstore_tool.cc
+ rebuild_mondb.cc
+ RadosDump.cc)
+target_link_libraries(ceph-objectstore-tool osd os global Boost::program_options ${CMAKE_DL_LIBS})
+if(WITH_FUSE)
+ target_link_libraries(ceph-objectstore-tool FUSE::FUSE)
+endif(WITH_FUSE)
+install(TARGETS ceph-objectstore-tool DESTINATION bin)
+
+if(WITH_LIBCEPHFS)
+if(WITH_TESTS)
+ add_executable(ceph-client-debug ceph-client-debug.cc)
+ target_link_libraries(ceph-client-debug cephfs global client)
+ install(TARGETS ceph-client-debug DESTINATION bin)
+endif(WITH_TESTS)
+endif(WITH_LIBCEPHFS)
+
+add_executable(ceph-kvstore-tool
+ kvstore_tool.cc
+ ceph_kvstore_tool.cc)
+target_link_libraries(ceph-kvstore-tool os global)
+install(TARGETS ceph-kvstore-tool DESTINATION bin)
+
+set(ceph_conf_srcs ceph_conf.cc)
+add_executable(ceph-conf ${ceph_conf_srcs})
+target_link_libraries(ceph-conf global)
+install(TARGETS ceph-conf DESTINATION bin)
+
+set(crushtool_srcs crushtool.cc)
+add_executable(crushtool ${crushtool_srcs})
+target_link_libraries(crushtool global)
+install(TARGETS crushtool DESTINATION bin)
+
+set(monmaptool_srcs monmaptool.cc)
+add_executable(monmaptool ${monmaptool_srcs})
+target_link_libraries(monmaptool global)
+install(TARGETS monmaptool DESTINATION bin)
+
+set(osdomaptool_srcs osdmaptool.cc)
+add_executable(osdmaptool ${osdomaptool_srcs})
+target_link_libraries(osdmaptool global)
+install(TARGETS osdmaptool DESTINATION bin)
+
+set(ceph-diff-sorted_srcs ceph-diff-sorted.cc)
+add_executable(ceph-diff-sorted ${ceph-diff-sorted_srcs})
+install(TARGETS ceph-diff-sorted DESTINATION bin)
+
+if(WITH_TESTS)
+set(ceph_psim_srcs psim.cc)
+add_executable(ceph_psim ${ceph_psim_srcs})
+target_link_libraries(ceph_psim global)
+install(TARGETS ceph_psim DESTINATION bin)
+endif(WITH_TESTS)
+
+set(ceph_authtool_srcs ceph_authtool.cc)
+add_executable(ceph-authtool ${ceph_authtool_srcs})
+target_link_libraries(ceph-authtool global ${EXTRALIBS} ${CRYPTO_LIBS})
+install(TARGETS ceph-authtool DESTINATION bin)
+
+if(WITH_TESTS)
+set(cephdeduptool_srcs ceph_dedup_tool.cc)
+add_executable(cephdeduptool ${cephdeduptool_srcs})
+target_link_libraries(cephdeduptool librados global cls_cas_client)
+install(TARGETS cephdeduptool DESTINATION bin)
+endif(WITH_TESTS)
+
+if(WITH_CEPHFS)
+ add_subdirectory(cephfs)
+endif(WITH_CEPHFS)
+
+if(WITH_RBD)
+ add_subdirectory(rbd)
+ add_subdirectory(rbd_mirror)
+ if(LINUX)
+ add_subdirectory(rbd_nbd)
+ endif()
+ if(FREEBSD)
+ add_subdirectory(rbd_ggate)
+ endif()
+endif(WITH_RBD)
+
+add_subdirectory(ceph-dencoder)
diff --git a/src/tools/RadosDump.cc b/src/tools/RadosDump.cc
new file mode 100644
index 00000000..420cd9fc
--- /dev/null
+++ b/src/tools/RadosDump.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "RadosDump.h"
+
+int RadosDump::read_super()
+{
+ bufferlist ebl;
+ auto ebliter = ebl.cbegin();
+ ssize_t bytes;
+
+ bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH);
+ if ((size_t)bytes != super_header::FIXED_LENGTH) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ sh.decode(ebliter);
+
+ return 0;
+}
+
+
+int RadosDump::get_header(header *h)
+{
+ assert (h != NULL);
+
+ bufferlist ebl;
+ auto ebliter = ebl.cbegin();
+ ssize_t bytes;
+
+ bytes = ebl.read_fd(file_fd, sh.header_size);
+ if ((size_t)bytes != sh.header_size) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ h->decode(ebliter);
+
+ return 0;
+}
+
+int RadosDump::get_footer(footer *f)
+{
+ ceph_assert(f != NULL);
+
+ bufferlist ebl;
+ auto ebliter = ebl.cbegin();
+ ssize_t bytes;
+
+ bytes = ebl.read_fd(file_fd, sh.footer_size);
+ if ((size_t)bytes != sh.footer_size) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ f->decode(ebliter);
+
+ if (f->magic != endmagic) {
+ cerr << "Bad footer magic" << std::endl;
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+int RadosDump::read_section(sectiontype_t *type, bufferlist *bl)
+{
+ header hdr;
+ ssize_t bytes;
+
+ int ret = get_header(&hdr);
+ if (ret)
+ return ret;
+
+ *type = hdr.type;
+
+ bl->clear();
+ bytes = bl->read_fd(file_fd, hdr.size);
+ if (bytes != hdr.size) {
+ cerr << "Unexpected EOF" << std::endl;
+ return -EFAULT;
+ }
+
+ if (hdr.size > 0) {
+ footer ft;
+ ret = get_footer(&ft);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+
+int RadosDump::skip_object(bufferlist &bl)
+{
+ bufferlist ebl;
+ bool done = false;
+ while(!done) {
+ sectiontype_t type;
+ int ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown object section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_DATA:
+ case TYPE_ATTRS:
+ case TYPE_OMAP_HDR:
+ case TYPE_OMAP:
+#ifdef DIAGNOSTIC
+ cerr << "Skip type " << (int)type << std::endl;
+#endif
+ break;
+ case TYPE_OBJECT_END:
+ done = true;
+ break;
+ default:
+ cerr << "Can't skip unknown type: " << type << std::endl;
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+//Write super_header with its fixed 16 byte length
+void RadosDump::write_super()
+{
+ if (dry_run) {
+ return;
+ }
+
+ bufferlist superbl;
+ super_header sh;
+ footer ft;
+
+ header hdr(TYPE_NONE, 0);
+ hdr.encode(superbl);
+
+ sh.magic = super_header::super_magic;
+ sh.version = super_header::super_ver;
+ sh.header_size = superbl.length();
+ superbl.clear();
+ ft.encode(superbl);
+ sh.footer_size = superbl.length();
+ superbl.clear();
+
+ sh.encode(superbl);
+ ceph_assert(super_header::FIXED_LENGTH == superbl.length());
+ superbl.write_fd(file_fd);
+}
diff --git a/src/tools/RadosDump.h b/src/tools/RadosDump.h
new file mode 100644
index 00000000..83f02e69
--- /dev/null
+++ b/src/tools/RadosDump.h
@@ -0,0 +1,409 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef RADOS_DUMP_H_
+#define RADOS_DUMP_H_
+
+#include <stdint.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+
+#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
+
+typedef uint8_t sectiontype_t;
+typedef uint32_t mymagic_t;
+typedef int64_t mysize_t;
+
+enum {
+ TYPE_NONE = 0,
+ TYPE_PG_BEGIN,
+ TYPE_PG_END,
+ TYPE_OBJECT_BEGIN,
+ TYPE_OBJECT_END,
+ TYPE_DATA,
+ TYPE_ATTRS,
+ TYPE_OMAP_HDR,
+ TYPE_OMAP,
+ TYPE_PG_METADATA,
+ TYPE_POOL_BEGIN,
+ TYPE_POOL_END,
+ END_OF_TYPES, //Keep at the end
+};
+
+const uint16_t shortmagic = 0xffce; //goes into stream as "ceff"
+//endmagic goes into stream as "ceff ffec"
+const mymagic_t endmagic = (0xecff << 16) | shortmagic;
+
+//The first FIXED_LENGTH bytes are a fixed
+//portion of the export output. This includes the overall
+//version number, and size of header and footer.
+//THIS STRUCTURE CAN ONLY BE APPENDED TO. If it needs to expand,
+//the version can be bumped and then anything
+//can be added to the export format.
+struct super_header {
+ static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
+ // ver = 1, Initial version
+ // ver = 2, Add OSDSuperblock to pg_begin
+ static const uint32_t super_ver = 2;
+ static const uint32_t FIXED_LENGTH = 16;
+ uint32_t magic;
+ uint32_t version;
+ uint32_t header_size;
+ uint32_t footer_size;
+
+ super_header() : magic(0), version(0), header_size(0), footer_size(0) { }
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(magic, bl);
+ encode(version, bl);
+ encode(header_size, bl);
+ encode(footer_size, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(magic, bl);
+ decode(version, bl);
+ decode(header_size, bl);
+ decode(footer_size, bl);
+ }
+};
+
+struct header {
+ sectiontype_t type;
+ mysize_t size;
+ header(sectiontype_t type, mysize_t size) :
+ type(type), size(size) { }
+ header(): type(0), size(0) { }
+
+ void encode(bufferlist& bl) const {
+ uint32_t debug_type = (type << 24) | (type << 16) | shortmagic;
+ ENCODE_START(1, 1, bl);
+ encode(debug_type, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ uint32_t debug_type;
+ DECODE_START(1, bl);
+ decode(debug_type, bl);
+ type = debug_type >> 24;
+ decode(size, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct footer {
+ mymagic_t magic;
+ footer() : magic(endmagic) { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(magic, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(magic, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct pg_begin {
+ spg_t pgid;
+ OSDSuperblock superblock;
+
+ pg_begin(spg_t pg, const OSDSuperblock& sb):
+ pgid(pg), superblock(sb) { }
+ pg_begin() { }
+
+ void encode(bufferlist& bl) const {
+ // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+ // shard will be NO_SHARD for a replicated pool. This means
+ // that we allow the decode by struct_v 2.
+ ENCODE_START(3, 2, bl);
+ encode(pgid.pgid, bl);
+ encode(superblock, bl);
+ encode(pgid.shard, bl);
+ ENCODE_FINISH(bl);
+ }
+ // NOTE: New super_ver prevents decode from ver 1
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(pgid.pgid, bl);
+ if (struct_v > 1) {
+ decode(superblock, bl);
+ }
+ if (struct_v > 2) {
+ decode(pgid.shard, bl);
+ } else {
+ pgid.shard = shard_id_t::NO_SHARD;
+ }
+ DECODE_FINISH(bl);
+ }
+};
+
+struct object_begin {
+ ghobject_t hoid;
+
+ // Duplicate what is in the OI_ATTR so we have it at the start
+ // of object processing.
+ object_info_t oi;
+
+ explicit object_begin(const ghobject_t &hoid): hoid(hoid) { }
+ object_begin() { }
+
+ // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+ // generation will be NO_GEN, shard_id will be NO_SHARD for a replicated
+ // pool. This means we will allow the decode by struct_v 1.
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(hoid.hobj, bl);
+ encode(hoid.generation, bl);
+ encode(hoid.shard_id, bl);
+ encode(oi, bl, -1); /* FIXME: we always encode with full features */
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(hoid.hobj, bl);
+ if (struct_v > 1) {
+ decode(hoid.generation, bl);
+ decode(hoid.shard_id, bl);
+ } else {
+ hoid.generation = ghobject_t::NO_GEN;
+ hoid.shard_id = shard_id_t::NO_SHARD;
+ }
+ if (struct_v > 2) {
+ decode(oi, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+};
+
+struct data_section {
+ uint64_t offset;
+ uint64_t len;
+ bufferlist databl;
+ data_section(uint64_t offset, uint64_t len, bufferlist bl):
+ offset(offset), len(len), databl(bl) { }
+ data_section(): offset(0), len(0) { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(offset, bl);
+ encode(len, bl);
+ encode(databl, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(offset, bl);
+ decode(len, bl);
+ decode(databl, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct attr_section {
+ map<string,bufferlist> data;
+ explicit attr_section(const map<string,bufferlist> &data) : data(data) { }
+ explicit attr_section(map<string, bufferptr> &data_)
+ {
+ for (std::map<std::string, bufferptr>::iterator i = data_.begin();
+ i != data_.end(); ++i) {
+ bufferlist bl;
+ bl.push_back(i->second);
+ data[i->first] = bl;
+ }
+ }
+
+ attr_section() { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(data, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct omap_hdr_section {
+ bufferlist hdr;
+ explicit omap_hdr_section(bufferlist hdr) : hdr(hdr) { }
+ omap_hdr_section() { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(hdr, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(hdr, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct omap_section {
+ map<string, bufferlist> omap;
+ explicit omap_section(const map<string, bufferlist> &omap) :
+ omap(omap) { }
+ omap_section() { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(omap, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(omap, bl);
+ DECODE_FINISH(bl);
+ }
+};
+
+struct metadata_section {
+ // struct_ver is the on-disk version of original pg
+ __u8 struct_ver; // for reference
+ epoch_t map_epoch;
+ pg_info_t info;
+ pg_log_t log;
+ PastIntervals past_intervals;
+ OSDMap osdmap;
+ bufferlist osdmap_bl; // Used in lieu of encoding osdmap due to crc checking
+ map<eversion_t, hobject_t> divergent_priors;
+ pg_missing_t missing;
+
+ metadata_section(
+ __u8 struct_ver,
+ epoch_t map_epoch,
+ const pg_info_t &info,
+ const pg_log_t &log,
+ const PastIntervals &past_intervals,
+ const pg_missing_t &missing)
+ : struct_ver(struct_ver),
+ map_epoch(map_epoch),
+ info(info),
+ log(log),
+ past_intervals(past_intervals),
+ missing(missing) {}
+ metadata_section()
+ : struct_ver(0),
+ map_epoch(0) { }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(6, 6, bl);
+ encode(struct_ver, bl);
+ encode(map_epoch, bl);
+ encode(info, bl);
+ encode(log, bl);
+ encode(past_intervals, bl);
+ // Equivalent to osdmap.encode(bl, features); but
+ // preserving exact layout for CRC checking.
+ bl.append(osdmap_bl);
+ encode(divergent_priors, bl);
+ encode(missing, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(6, bl);
+ decode(struct_ver, bl);
+ decode(map_epoch, bl);
+ decode(info, bl);
+ decode(log, bl);
+ if (struct_v >= 6) {
+ decode(past_intervals, bl);
+ } else if (struct_v > 1) {
+ cout << "NOTICE: Older export with classic past_intervals" << std::endl;
+ } else {
+ cout << "NOTICE: Older export without past_intervals" << std::endl;
+ }
+ if (struct_v > 2) {
+ osdmap.decode(bl);
+ } else {
+ cout << "WARNING: Older export without OSDMap information" << std::endl;
+ }
+ if (struct_v > 3) {
+ decode(divergent_priors, bl);
+ }
+ if (struct_v > 4) {
+ decode(missing, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+};
+
+/**
+ * Superclass for classes that will need to handle a serialized RADOS
+ * dump. Requires that the serialized dump be opened with a known FD.
+ */
+class RadosDump
+{
+ protected:
+ int file_fd;
+ super_header sh;
+ bool dry_run;
+
+ public:
+ RadosDump(int file_fd_, bool dry_run_)
+ : file_fd(file_fd_), dry_run(dry_run_)
+ {}
+
+ int read_super();
+ int get_header(header *h);
+ int get_footer(footer *f);
+ int read_section(sectiontype_t *type, bufferlist *bl);
+ int skip_object(bufferlist &bl);
+ void write_super();
+
+ // Define this in .h because it's templated
+ template <typename T>
+ int write_section(sectiontype_t type, const T& obj, int fd) {
+ if (dry_run)
+ return 0;
+ bufferlist blhdr, bl, blftr;
+ obj.encode(bl);
+ header hdr(type, bl.length());
+ hdr.encode(blhdr);
+ footer ft;
+ ft.encode(blftr);
+
+ int ret = blhdr.write_fd(fd);
+ if (ret) return ret;
+ ret = bl.write_fd(fd);
+ if (ret) return ret;
+ ret = blftr.write_fd(fd);
+ return ret;
+ }
+
+ int write_simple(sectiontype_t type, int fd)
+ {
+ if (dry_run)
+ return 0;
+ bufferlist hbl;
+
+ header hdr(type, 0);
+ hdr.encode(hbl);
+ return hbl.write_fd(fd);
+ }
+};
+
+#endif
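
Note: the super_header/header/footer framing declared above is the container format shared by the export/import tools in this commit (ceph-objectstore-tool, rados import/export): a fixed 16-byte super_header, followed by sections, each framed as a header (type, size), the encoded payload, and a magic-carrying footer. The following is a minimal reader sketch built only on the RadosDump members declared above; the walk_export helper name and the end-of-stream check are illustrative assumptions, not part of this patch.

  // Sketch only: enumerate the sections of an export file produced with
  // write_super()/write_section().
  #include <fcntl.h>
  #include <cerrno>
  #include <iostream>
  #include "RadosDump.h"

  int walk_export(const char *path)        // hypothetical helper
  {
    int fd = ::open(path, O_RDONLY);
    if (fd < 0)
      return -errno;

    RadosDump dump(fd, true /* dry_run */);
    int r = dump.read_super();             // validates the fixed 16-byte super_header
    if (r == 0) {
      sectiontype_t type;
      bufferlist bl;
      // read_section() returns each section's type and payload; the footer
      // magic is verified internally by get_footer().
      while ((r = dump.read_section(&type, &bl)) == 0) {
        std::cout << "section type " << (unsigned)type << ", "
                  << bl.length() << " bytes" << std::endl;
        if (type == TYPE_PG_END || type == TYPE_POOL_END)
          break;                           // assumed end of the stream
      }
    }
    ::close(fd);
    return r;
  }

The real consumers in this commit (ceph_objectstore_tool.cc, rados/RadosImport.cc) take the same payload bufferlist and decode it into pg_begin, object_begin, data_section, attr_section, and the other structs above according to the reported type.
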
diff --git a/src/tools/ceph-client-debug.cc b/src/tools/ceph-client-debug.cc
new file mode 100644
index 00000000..7a43c9c2
--- /dev/null
+++ b/src/tools/ceph-client-debug.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "client/Inode.h"
+#include "client/Dentry.h"
+#include "client/Dir.h"
+#include "include/cephfs/libcephfs.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_client
+
+void usage()
+{
+ std::cout << "Usage: ceph-client-debug [options] <inode number>" << std::endl;
+ generic_client_usage();
+}
+
+
+/**
+ * Given an inode, look up the path from the Client cache: assumes
+ * client cache is fully populated.
+ */
+void traverse_dentries(Inode *ino, std::vector<Dentry*> &parts)
+{
+ if (ino->dentries.empty()) {
+ return;
+ }
+
+ Dentry* dn = *(ino->dentries.begin());
+ parts.push_back(dn);
+ traverse_dentries(dn->dir->parent_inode, parts);
+}
+
+
+/**
+ * Given an inode, send lookup requests to the MDS for
+ * all its ancestors, such that the full trace will be
+ * populated in client cache.
+ */
+int lookup_trace(ceph_mount_info *client, inodeno_t const ino)
+{
+ Inode *inode;
+ int r = ceph_ll_lookup_inode(client, ino, &inode);
+ if (r != 0) {
+ return r;
+ } else {
+ if (!inode->dentries.empty()) {
+ Dentry *dn = *(inode->dentries.begin());
+ ceph_assert(dn->dir);
+ ceph_assert(dn->dir->parent_inode);
+ r = lookup_trace(client, dn->dir->parent_inode->ino);
+ if (r) {
+ return r;
+ }
+ } else {
+ // We reached the root of the tree
+ ceph_assert(inode->ino == CEPH_INO_ROOT);
+ }
+ }
+
+ return r;
+}
+
+
+int main(int argc, const char **argv)
+{
+ // Argument handling
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS|
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+ common_init_finish(g_ceph_context);
+
+ // Expect exactly one positional argument (inode number)
+ if (args.size() != 1) {
+ cerr << "missing position argument (inode number)" << std::endl;
+ exit(1);
+ }
+ char const *inode_str = args[0];
+ inodeno_t inode = strtoll(inode_str, NULL, 0);
+ if (inode <= 0) {
+ derr << "Invalid inode: " << inode_str << dendl;
+ return -1;
+ }
+
+ // Initialize filesystem client
+ struct ceph_mount_info *client;
+ int r = ceph_create_with_context(&client, g_ceph_context);
+ if (r) {
+ derr << "Error initializing libcephfs: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_mount(client, "/");
+ if (r) {
+ derr << "Error mounting: " << cpp_strerror(r) << dendl;
+ ceph_shutdown(client);
+ return r;
+ }
+
+
+ // Populate client cache with inode of interest & ancestors
+ r = lookup_trace(client, inode);
+ if (r) {
+ derr << "Error looking up inode " << std::hex << inode << std::dec <<
+ ": " << cpp_strerror(r) << dendl;
+ return -1;
+ }
+
+ // Retrieve inode of interest
+ struct vinodeno_t vinode;
+ vinode.ino = inode;
+ vinode.snapid = CEPH_NOSNAP;
+ Inode *ino = ceph_ll_get_inode(client, vinode);
+
+ // Retrieve dentry trace
+ std::vector<Dentry*> path;
+ traverse_dentries(ino, path);
+
+ // Print inode and path as a JSON object
+ JSONFormatter jf(true);
+ jf.open_object_section("client_debug");
+ {
+ jf.open_object_section("inode");
+ {
+ ino->dump(&jf);
+ }
+ jf.close_section(); // inode
+ jf.open_array_section("path");
+ {
+ for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
+ jf.open_object_section("dentry");
+ {
+ (*p)->dump(&jf);
+ }
+ jf.close_section(); // dentry
+ }
+ }
+ jf.close_section(); // path
+ }
+ jf.close_section(); // client_debug
+ jf.flush(std::cout);
+ std::cout << std::endl;
+
+ // Release Inode references
+ ceph_ll_forget(client, ino, 1);
+ for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
+ ceph_ll_forget(client, (*p)->inode.get(), 1);
+ }
+ ino = NULL;
+ path.clear();
+
+ // Shut down
+ r = ceph_unmount(client);
+ if (r) {
+ derr << "Error mounting: " << cpp_strerror(r) << dendl;
+ }
+ ceph_shutdown(client);
+
+ return r;
+}
diff --git a/src/tools/ceph-dencoder/CMakeLists.txt b/src/tools/ceph-dencoder/CMakeLists.txt
new file mode 100644
index 00000000..15604d09
--- /dev/null
+++ b/src/tools/ceph-dencoder/CMakeLists.txt
@@ -0,0 +1,68 @@
+## dencoder
+set_source_files_properties(
+ ceph_dencoder.cc
+ APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h)
+
+if(HAS_VTA)
+ set_source_files_properties(ceph_dencoder.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+
+set(dencoder_srcs
+ ceph_dencoder.cc
+ $<TARGET_OBJECTS:common_texttable_obj>)
+if(WITH_RADOSGW)
+ list(APPEND dencoder_srcs
+ ${CMAKE_SOURCE_DIR}/src/rgw/rgw_dencoder.cc)
+endif()
+
+add_executable(ceph-dencoder ${dencoder_srcs})
+
+if(WITH_RADOSGW)
+ list(APPEND DENCODER_EXTRALIBS
+ rgw_a
+ cls_rgw_client)
+ if(WITH_RADOSGW_AMQP_ENDPOINT)
+ list(APPEND DENCODER_EXTRALIBS
+ rabbitmq)
+ endif()
+ if(WITH_RADOSGW_KAFKA_ENDPOINT)
+ list(APPEND DENCODER_EXTRALIBS
+ rdkafka)
+ endif()
+endif()
+
+if(WITH_RBD)
+ list(APPEND DENCODER_EXTRALIBS
+ cls_rbd_client
+ rbd_mirror_types
+ rbd_types
+ rbd_replay_types)
+ if(WITH_KRBD)
+ list(APPEND DENCODER_EXTRALIBS
+ krbd)
+ endif()
+endif()
+
+if(WITH_CEPHFS)
+ list(APPEND DENCODER_EXTRALIBS
+ mds)
+endif()
+
+target_link_libraries(ceph-dencoder
+ global
+ os
+ osd
+ mon
+ journal
+ ${DENCODER_EXTRALIBS}
+ cls_lock_client
+ cls_refcount_client
+ cls_log_client
+ cls_version_client
+ cls_user_client
+ cls_journal_client
+ cls_timeindex_client
+ ${EXTRALIBS}
+ ${CMAKE_DL_LIBS})
+install(TARGETS ceph-dencoder DESTINATION bin)
diff --git a/src/tools/ceph-dencoder/ceph_dencoder.cc b/src/tools/ceph-dencoder/ceph_dencoder.cc
new file mode 100644
index 00000000..1f201ea7
--- /dev/null
+++ b/src/tools/ceph-dencoder/ceph_dencoder.cc
@@ -0,0 +1,480 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <errno.h>
+#include "include/types.h"
+#include "ceph_ver.h"
+#include "include/encoding.h"
+#include "include/ceph_features.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "msg/Message.h"
+#include "include/ceph_assert.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "tools/ceph-dencoder/types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#define MB(m) ((m) * 1024 * 1024)
+
+void usage(ostream &out)
+{
+ out << "usage: ceph-dencoder [commands ...]" << std::endl;
+ out << "\n";
+ out << " version print version string (to stdout)\n";
+ out << "\n";
+ out << " import <encfile> read encoded data from encfile\n";
+ out << " export <outfile> write encoded data to outfile\n";
+ out << "\n";
+ out << " set_features <num> set feature bits used for encoding\n";
+ out << " get_features print feature bits (int) to stdout\n";
+ out << "\n";
+ out << " list_types list supported types\n";
+ out << " type <classname> select in-memory type\n";
+ out << " skip <num> skip <num> leading bytes before decoding\n";
+ out << " decode decode into in-memory object\n";
+ out << " encode encode in-memory object\n";
+ out << " dump_json dump in-memory object as json (to stdout)\n";
+ out << " hexdump print encoded data in hex\n";
+ out << "\n";
+ out << " copy copy object (via operator=)\n";
+ out << " copy_ctor copy object (via copy ctor)\n";
+ out << "\n";
+ out << " count_tests print number of generated test objects (to stdout)\n";
+ out << " select_test <n> select generated test object as in-memory object\n";
+ out << " is_deterministic exit w/ success if type encodes deterministically\n";
+}
+struct Dencoder {
+ virtual ~Dencoder() {}
+ virtual string decode(bufferlist bl, uint64_t seek) = 0;
+ virtual void encode(bufferlist& out, uint64_t features) = 0;
+ virtual void dump(ceph::Formatter *f) = 0;
+ virtual void copy() {
+ cerr << "copy operator= not supported" << std::endl;
+ }
+ virtual void copy_ctor() {
+ cerr << "copy ctor not supported" << std::endl;
+ }
+ virtual void generate() = 0;
+ virtual int num_generated() = 0;
+ virtual string select_generated(unsigned n) = 0;
+ virtual bool is_deterministic() = 0;
+ //virtual void print(ostream& out) = 0;
+};
+
+template<class T>
+class DencoderBase : public Dencoder {
+protected:
+ T* m_object;
+ list<T*> m_list;
+ bool stray_okay;
+ bool nondeterministic;
+
+public:
+ DencoderBase(bool stray_okay, bool nondeterministic)
+ : m_object(new T),
+ stray_okay(stray_okay),
+ nondeterministic(nondeterministic) {}
+ ~DencoderBase() override {
+ delete m_object;
+ }
+
+ string decode(bufferlist bl, uint64_t seek) override {
+ auto p = bl.cbegin();
+ p.seek(seek);
+ try {
+ using ceph::decode;
+ decode(*m_object, p);
+ }
+ catch (buffer::error& e) {
+ return e.what();
+ }
+ if (!stray_okay && !p.end()) {
+ ostringstream ss;
+ ss << "stray data at end of buffer, offset " << p.get_off();
+ return ss.str();
+ }
+ return string();
+ }
+
+ void encode(bufferlist& out, uint64_t features) override = 0;
+
+ void dump(ceph::Formatter *f) override {
+ m_object->dump(f);
+ }
+ void generate() override {
+ T::generate_test_instances(m_list);
+ }
+ int num_generated() override {
+ return m_list.size();
+ }
+ string select_generated(unsigned i) override {
+ // allow 0- or 1-based (by wrapping)
+ if (i == 0)
+ i = m_list.size();
+ if ((i == 0) || (i > m_list.size()))
+ return "invalid id for generated object";
+ m_object = *(std::next(m_list.begin(), i-1));
+ return string();
+ }
+
+ bool is_deterministic() override {
+ return !nondeterministic;
+ }
+};
+
+template<class T>
+class DencoderImplNoFeatureNoCopy : public DencoderBase<T> {
+public:
+ DencoderImplNoFeatureNoCopy(bool stray_ok, bool nondeterministic)
+ : DencoderBase<T>(stray_ok, nondeterministic) {}
+ void encode(bufferlist& out, uint64_t features) override {
+ out.clear();
+ using ceph::encode;
+ encode(*this->m_object, out);
+ }
+};
+
+template<class T>
+class DencoderImplNoFeature : public DencoderImplNoFeatureNoCopy<T> {
+public:
+ DencoderImplNoFeature(bool stray_ok, bool nondeterministic)
+ : DencoderImplNoFeatureNoCopy<T>(stray_ok, nondeterministic) {}
+ void copy() override {
+ T *n = new T;
+ *n = *this->m_object;
+ delete this->m_object;
+ this->m_object = n;
+ }
+ void copy_ctor() override {
+ T *n = new T(*this->m_object);
+ delete this->m_object;
+ this->m_object = n;
+ }
+};
+
+template<class T>
+class DencoderImplFeaturefulNoCopy : public DencoderBase<T> {
+public:
+ DencoderImplFeaturefulNoCopy(bool stray_ok, bool nondeterministic)
+ : DencoderBase<T>(stray_ok, nondeterministic) {}
+ void encode(bufferlist& out, uint64_t features) override {
+ out.clear();
+ using ceph::encode;
+ encode(*(this->m_object), out, features);
+ }
+};
+
+template<class T>
+class DencoderImplFeatureful : public DencoderImplFeaturefulNoCopy<T> {
+public:
+ DencoderImplFeatureful(bool stray_ok, bool nondeterministic)
+ : DencoderImplFeaturefulNoCopy<T>(stray_ok, nondeterministic) {}
+ void copy() override {
+ T *n = new T;
+ *n = *this->m_object;
+ delete this->m_object;
+ this->m_object = n;
+ }
+ void copy_ctor() override {
+ T *n = new T(*this->m_object);
+ delete this->m_object;
+ this->m_object = n;
+ }
+};
+
+template<class T>
+class MessageDencoderImpl : public Dencoder {
+ typename T::ref m_object;
+ list<typename T::ref> m_list;
+
+public:
+ MessageDencoderImpl() : m_object(T::create()) {}
+ ~MessageDencoderImpl() override {}
+
+ string decode(bufferlist bl, uint64_t seek) override {
+ auto p = bl.cbegin();
+ p.seek(seek);
+ try {
+ Message::ref n(decode_message(g_ceph_context, 0, p), false);
+ if (!n)
+ throw std::runtime_error("failed to decode");
+ if (n->get_type() != m_object->get_type()) {
+ stringstream ss;
+ ss << "decoded type " << n->get_type() << " instead of expected " << m_object->get_type();
+ throw std::runtime_error(ss.str());
+ }
+ m_object = boost::static_pointer_cast<typename T::ref::element_type, std::remove_reference<decltype(n)>::type::element_type>(n);
+ }
+ catch (buffer::error& e) {
+ return e.what();
+ }
+ if (!p.end()) {
+ ostringstream ss;
+ ss << "stray data at end of buffer, offset " << p.get_off();
+ return ss.str();
+ }
+ return string();
+ }
+
+ void encode(bufferlist& out, uint64_t features) override {
+ out.clear();
+ encode_message(m_object.get(), features, out);
+ }
+
+ void dump(ceph::Formatter *f) override {
+ m_object->dump(f);
+ }
+ void generate() override {
+ //T::generate_test_instances(m_list);
+ }
+ int num_generated() override {
+ return m_list.size();
+ }
+ string select_generated(unsigned i) override {
+ // allow 0- or 1-based (by wrapping)
+ if (i == 0)
+ i = m_list.size();
+ if ((i == 0) || (i > m_list.size()))
+ return "invalid id for generated object";
+ m_object = *(std::next(m_list.begin(), i-1));
+ return string();
+ }
+ bool is_deterministic() override {
+ return true;
+ }
+
+ //void print(ostream& out) {
+ //out << m_object << std::endl;
+ //}
+};
+
+
+
+int main(int argc, const char **argv)
+{
+ // dencoders
+ map<string,Dencoder*> dencoders;
+
+#define T_STR(x) #x
+#define T_STRINGIFY(x) T_STR(x)
+#define TYPE(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false, false);
+#define TYPE_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(true, false);
+#define TYPE_NONDETERMINISTIC(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeature<t>(false, true);
+#define TYPE_FEATUREFUL(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false, false);
+#define TYPE_FEATUREFUL_STRAYDATA(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(true, false);
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeatureful<t>(false, true);
+#define TYPE_FEATUREFUL_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplFeaturefulNoCopy<t>(false, false);
+#define TYPE_NOCOPY(t) dencoders[T_STRINGIFY(t)] = new DencoderImplNoFeatureNoCopy<t>(false, false);
+#define MESSAGE(t) dencoders[T_STRINGIFY(t)] = new MessageDencoderImpl<t>;
+#include "tools/ceph-dencoder/types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef T_STR
+#undef T_STRINGIFY
+
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ env_to_vec(args);
+
+ Dencoder *den = NULL;
+ uint64_t features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ bufferlist encbl;
+ uint64_t skip = 0;
+
+ if (args.empty()) {
+ cerr << "-h for help" << std::endl;
+ exit(1);
+ }
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ++i) {
+ string err;
+
+ if (*i == string("help") || *i == string("-h") || *i == string("--help")) {
+ usage(cout);
+ exit(0);
+ } else if (*i == string("version")) {
+ cout << CEPH_GIT_NICE_VER << std::endl;
+ } else if (*i == string("list_types")) {
+ for (map<string,Dencoder*>::iterator p = dencoders.begin();
+ p != dencoders.end();
+ ++p)
+ cout << p->first << std::endl;
+ exit(0);
+ } else if (*i == string("type")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting type" << std::endl;
+ exit(1);
+ }
+ string cname = *i;
+ if (!dencoders.count(cname)) {
+ cerr << "class '" << cname << "' unknown" << std::endl;
+ exit(1);
+ }
+ den = dencoders[cname];
+ den->generate();
+ } else if (*i == string("skip")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting byte count" << std::endl;
+ exit(1);
+ }
+ skip = atoi(*i);
+ } else if (*i == string("get_features")) {
+ cout << CEPH_FEATURES_SUPPORTED_DEFAULT << std::endl;
+ exit(0);
+ } else if (*i == string("set_features")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting features" << std::endl;
+ exit(1);
+ }
+ features = atoll(*i);
+ } else if (*i == string("encode")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ den->encode(encbl, features | CEPH_FEATURE_RESERVED); // hack for OSDMap
+ } else if (*i == string("decode")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ err = den->decode(encbl, skip);
+ } else if (*i == string("copy_ctor")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ den->copy_ctor();
+ } else if (*i == string("copy")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ den->copy();
+ } else if (*i == string("dump_json")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ JSONFormatter jf(true);
+ jf.open_object_section("object");
+ den->dump(&jf);
+ jf.close_section();
+ jf.flush(cout);
+ cout << std::endl;
+
+ } else if (*i == string("hexdump")) {
+ encbl.hexdump(cout);
+ } else if (*i == string("import")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting filename" << std::endl;
+ exit(1);
+ }
+ int r;
+ if (*i == string("-")) {
+ *i = "stdin";
+ // Read up to 1mb if stdin specified
+ r = encbl.read_fd(STDIN_FILENO, MB(1));
+ } else {
+ r = encbl.read_file(*i, &err);
+ }
+ if (r < 0) {
+ cerr << "error reading " << *i << ": " << err << std::endl;
+ exit(1);
+ }
+
+ } else if (*i == string("export")) {
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting filename" << std::endl;
+ exit(1);
+ }
+ int fd = ::open(*i, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (fd < 0) {
+ cerr << "error opening " << *i << " for write: " << cpp_strerror(errno) << std::endl;
+ exit(1);
+ }
+ int r = encbl.write_fd(fd);
+ if (r < 0) {
+ cerr << "error writing " << *i << ": " << cpp_strerror(errno) << std::endl;
+ exit(1);
+ }
+ ::close(fd);
+
+ } else if (*i == string("count_tests")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ cout << den->num_generated() << std::endl;
+ } else if (*i == string("select_test")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ ++i;
+ if (i == args.end()) {
+ cerr << "expecting instance number" << std::endl;
+ exit(1);
+ }
+ int n = atoi(*i);
+ err = den->select_generated(n);
+ } else if (*i == string("is_deterministic")) {
+ if (!den) {
+ cerr << "must first select type with 'type <name>'" << std::endl;
+ exit(1);
+ }
+ if (den->is_deterministic())
+ exit(0);
+ else
+ exit(1);
+ } else {
+ cerr << "unknown option '" << *i << "'" << std::endl;
+ exit(1);
+ }
+ if (err.length()) {
+ cerr << "error: " << err << std::endl;
+ exit(1);
+ }
+ }
+ return 0;
+}
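
Note: each TYPE()/TYPE_FEATUREFUL()/MESSAGE() line in tools/ceph-dencoder/types.h expands, through the macro block at the top of main(), into one entry in the dencoders registry mapping a type name to a Dencoder implementation. Below is a rough sketch of what a single registration plus an encode/decode round trip amounts to; the round_trip helper and the choice of pow2_hist_t (one of the types listed in types.h) are illustrative assumptions, and the tool itself drives the same calls from its command loop.

  // Sketch only: the expansion of TYPE(pow2_hist_t) and a round trip that
  // mirrors the "type ... select_test 1 encode decode" command sequence.
  #include "common/histogram.h"            // pow2_hist_t

  void round_trip(map<string, Dencoder*>& dencoders, uint64_t features)
  {
    // equivalent of the TYPE(pow2_hist_t) line in types.h:
    dencoders["pow2_hist_t"] =
      new DencoderImplNoFeature<pow2_hist_t>(false /*stray ok*/, false /*nondeterministic*/);

    Dencoder *den = dencoders["pow2_hist_t"];
    den->generate();                       // build the generate_test_instances() objects
    string err = den->select_generated(1); // pick the first test instance
    if (err.empty()) {
      bufferlist bl;
      den->encode(bl, features);           // like the "encode" command
      err = den->decode(bl, 0);            // like "decode" with no skip
    }
    if (!err.empty())
      cerr << "round trip failed: " << err << std::endl;
  }
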
diff --git a/src/tools/ceph-dencoder/ceph_time.h b/src/tools/ceph-dencoder/ceph_time.h
new file mode 100644
index 00000000..c27cb574
--- /dev/null
+++ b/src/tools/ceph-dencoder/ceph_time.h
@@ -0,0 +1,68 @@
+#ifndef TEST_CEPH_TIME_H
+#define TEST_CEPH_TIME_H
+
+#include <list>
+
+#include "include/encoding.h"
+#include "common/ceph_time.h"
+#include "common/Formatter.h"
+
+// wrapper for ceph::real_time that implements the dencoder interface
+template <typename Clock>
+class time_point_wrapper {
+ using time_point = typename Clock::time_point;
+ time_point t;
+ public:
+ time_point_wrapper() = default;
+ explicit time_point_wrapper(const time_point& t) : t(t) {}
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(t, bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ decode(t, p);
+ }
+ void dump(Formatter* f) {
+ auto epoch_time = Clock::to_time_t(t);
+ f->dump_string("time", std::ctime(&epoch_time));
+ }
+ static void generate_test_instances(std::list<time_point_wrapper*>& ls) {
+ constexpr time_t t{455500800}; // Ghostbusters release date
+ ls.push_back(new time_point_wrapper(Clock::from_time_t(t)));
+ }
+};
+
+using real_time_wrapper = time_point_wrapper<ceph::real_clock>;
+WRITE_CLASS_ENCODER(real_time_wrapper)
+
+using coarse_real_time_wrapper = time_point_wrapper<ceph::coarse_real_clock>;
+WRITE_CLASS_ENCODER(coarse_real_time_wrapper)
+
+// wrapper for ceph::timespan that implements the dencoder interface
+class timespan_wrapper {
+ ceph::timespan d;
+ public:
+ timespan_wrapper() = default;
+ explicit timespan_wrapper(const ceph::timespan& d) : d(d) {}
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(d, bl);
+ }
+ void decode(bufferlist::const_iterator &p) {
+ using ceph::decode;
+ decode(d, p);
+ }
+ void dump(Formatter* f) {
+ f->dump_int("timespan", d.count());
+ }
+ static void generate_test_instances(std::list<timespan_wrapper*>& ls) {
+ constexpr std::chrono::seconds d{7377}; // marathon world record (2:02:57)
+ ls.push_back(new timespan_wrapper(d));
+ }
+};
+WRITE_CLASS_ENCODER(timespan_wrapper)
+
+#endif
diff --git a/src/tools/ceph-dencoder/sstring.h b/src/tools/ceph-dencoder/sstring.h
new file mode 100644
index 00000000..c2493c10
--- /dev/null
+++ b/src/tools/ceph-dencoder/sstring.h
@@ -0,0 +1,40 @@
+#ifndef TEST_SSTRING_H
+#define TEST_SSTRING_H
+
+#include "common/sstring.hh"
+
+// wrapper for sstring that implements the dencoder interface
+class sstring_wrapper {
+ using sstring16 = basic_sstring<char, uint32_t, 16>;
+ sstring16 s1;
+ using sstring24 = basic_sstring<unsigned char, uint16_t, 24>;
+ sstring24 s2;
+ public:
+ sstring_wrapper() = default;
+ sstring_wrapper(sstring16&& s1, sstring24&& s2)
+ : s1(std::move(s1)), s2(std::move(s2))
+ {}
+
+ DENC(sstring_wrapper, w, p) {
+ DENC_START(1, 1, p);
+ denc(w.s1, p);
+ denc(w.s2, p);
+ DENC_FINISH(p);
+ }
+ void dump(Formatter* f) {
+ f->dump_string("s1", s1.c_str());
+ f->dump_string("s2", reinterpret_cast<const char*>(s2.c_str()));
+ }
+ static void generate_test_instances(std::list<sstring_wrapper*>& ls) {
+ ls.push_back(new sstring_wrapper());
+ // initialize sstrings that fit in internal storage
+ constexpr auto cstr6 = "abcdef";
+ ls.push_back(new sstring_wrapper(sstring16{cstr6}, sstring24{cstr6}));
+ // initialize sstrings that overflow into external storage
+ constexpr auto cstr26 = "abcdefghijklmnopqrstuvwxyz";
+ ls.push_back(new sstring_wrapper(sstring16{cstr26}, sstring24{cstr26}));
+ }
+};
+WRITE_CLASS_DENC(sstring_wrapper)
+
+#endif
diff --git a/src/tools/ceph-dencoder/types.h b/src/tools/ceph-dencoder/types.h
new file mode 100644
index 00000000..6cfd6f16
--- /dev/null
+++ b/src/tools/ceph-dencoder/types.h
@@ -0,0 +1,880 @@
+#include "acconfig.h"
+
+#include "ceph_time.h"
+TYPE(real_time_wrapper)
+TYPE(coarse_real_time_wrapper)
+TYPE(timespan_wrapper)
+
+#include "sstring.h"
+TYPE(sstring_wrapper)
+
+#include "include/CompatSet.h"
+TYPE(CompatSet)
+
+#include "include/filepath.h"
+TYPE(filepath)
+
+#include "include/fs_types.h"
+TYPE_FEATUREFUL(file_layout_t)
+
+#include "include/util.h"
+TYPE(ceph_data_stats)
+
+#include "common/bit_vector.hpp"
+TYPE(BitVector<2>)
+
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+TYPE(compressible_bloom_filter)
+
+#include "common/DecayCounter.h"
+TYPE(DecayCounter)
+
+#include "common/histogram.h"
+TYPE(pow2_hist_t)
+
+#include "common/hobject.h"
+TYPE(hobject_t)
+TYPE(ghobject_t)
+
+#include "common/LogEntry.h"
+TYPE_FEATUREFUL(LogEntry)
+TYPE_FEATUREFUL(LogSummary)
+
+#include "common/SloppyCRCMap.h"
+TYPE(SloppyCRCMap)
+
+#include "common/snap_types.h"
+TYPE(SnapContext)
+TYPE(SnapRealmInfo)
+
+#include "msg/msg_types.h"
+TYPE(entity_name_t)
+TYPE_FEATUREFUL(entity_addr_t)
+TYPE_FEATUREFUL(entity_addrvec_t)
+TYPE_FEATUREFUL(entity_inst_t)
+
+#include "crush/CrushWrapper.h"
+TYPE_FEATUREFUL_NOCOPY(CrushWrapper)
+
+#include "osd/OSDMap.h"
+TYPE(osd_info_t)
+TYPE(osd_xinfo_t)
+TYPE_FEATUREFUL_NOCOPY(OSDMap)
+TYPE_FEATUREFUL_STRAYDATA(OSDMap::Incremental)
+
+#include "osd/osd_types.h"
+TYPE(osd_reqid_t)
+TYPE(object_locator_t)
+TYPE(request_redirect_t)
+TYPE(pg_t)
+TYPE(coll_t)
+TYPE_FEATUREFUL(objectstore_perf_stat_t)
+TYPE_FEATUREFUL(osd_stat_t)
+TYPE(OSDSuperblock)
+TYPE_FEATUREFUL(pool_snap_info_t)
+TYPE_FEATUREFUL(pg_pool_t)
+TYPE(object_stat_sum_t)
+TYPE(object_stat_collection_t)
+TYPE(pg_stat_t)
+TYPE_FEATUREFUL(pool_stat_t)
+TYPE(pg_hit_set_info_t)
+TYPE(pg_hit_set_history_t)
+TYPE(pg_history_t)
+TYPE(pg_info_t)
+TYPE(PastIntervals)
+TYPE_FEATUREFUL(pg_query_t)
+TYPE(ObjectModDesc)
+TYPE(pg_log_entry_t)
+TYPE(pg_log_dup_t)
+TYPE(pg_log_t)
+TYPE_FEATUREFUL(pg_missing_item)
+TYPE(pg_missing_t)
+TYPE(pg_nls_response_t)
+TYPE(pg_ls_response_t)
+TYPE(object_copy_cursor_t)
+TYPE_FEATUREFUL(object_copy_data_t)
+TYPE(pg_create_t)
+TYPE(OSDSuperblock)
+TYPE(SnapSet)
+TYPE_FEATUREFUL(watch_info_t)
+TYPE(object_manifest_t)
+TYPE_FEATUREFUL(object_info_t)
+TYPE(SnapSet)
+TYPE_FEATUREFUL(ObjectRecoveryInfo)
+TYPE(ObjectRecoveryProgress)
+TYPE(PushReplyOp)
+TYPE_FEATUREFUL(PullOp)
+TYPE_FEATUREFUL(PushOp)
+TYPE(ScrubMap::object)
+TYPE(ScrubMap)
+TYPE_FEATUREFUL(obj_list_watch_response_t)
+TYPE(clone_info)
+TYPE(obj_list_snap_response_t)
+TYPE(pool_pg_num_history_t)
+
+#include "osd/ECUtil.h"
+// TYPE(stripe_info_t) non-standard encoding/decoding functions
+TYPE(ECUtil::HashInfo)
+
+#include "osd/ECMsgTypes.h"
+TYPE_NOCOPY(ECSubWrite)
+TYPE(ECSubWriteReply)
+TYPE_FEATUREFUL(ECSubRead)
+TYPE(ECSubReadReply)
+
+#include "osd/HitSet.h"
+TYPE_NONDETERMINISTIC(ExplicitHashHitSet)
+TYPE_NONDETERMINISTIC(ExplicitObjectHitSet)
+TYPE(BloomHitSet)
+TYPE_NONDETERMINISTIC(HitSet) // because some subclasses are non-deterministic
+TYPE(HitSet::Params)
+
+#include "os/ObjectStore.h"
+TYPE(ObjectStore::Transaction)
+
+#include "os/filestore/SequencerPosition.h"
+TYPE(SequencerPosition)
+
+#ifdef WITH_BLUESTORE
+#include "os/bluestore/bluestore_types.h"
+TYPE(bluestore_bdev_label_t)
+TYPE(bluestore_cnode_t)
+TYPE(bluestore_compression_header_t)
+TYPE(bluestore_extent_ref_map_t)
+TYPE(bluestore_pextent_t)
+TYPE(bluestore_blob_use_tracker_t)
+// TODO: bluestore_blob_t repurposes the "feature" param of encode() for its
+// struct_v. At a higher level, BlueStore::ExtentMap encodes the extents using
+// a different interface than the normal one; see
+// BlueStore::ExtentMap::encode_some(). Maybe we can test it using another
+// approach.
+// TYPE_FEATUREFUL(bluestore_blob_t)
+// TYPE(bluestore_shared_blob_t) there is no encode here
+TYPE(bluestore_onode_t)
+TYPE(bluestore_deferred_op_t)
+TYPE(bluestore_deferred_transaction_t)
+// TYPE(bluestore_compression_header_t) there is no encode here
+
+#include "os/bluestore/bluefs_types.h"
+TYPE(bluefs_extent_t)
+TYPE(bluefs_fnode_t)
+TYPE(bluefs_super_t)
+TYPE(bluefs_transaction_t)
+#endif
+
+#include "mon/AuthMonitor.h"
+TYPE_FEATUREFUL(AuthMonitor::Incremental)
+
+#include "mon/PGMap.h"
+TYPE_FEATUREFUL_NONDETERMINISTIC(PGMapDigest)
+TYPE_FEATUREFUL_NONDETERMINISTIC(PGMap)
+
+#include "mon/MonitorDBStore.h"
+TYPE(MonitorDBStore::Transaction)
+TYPE(MonitorDBStore::Op)
+
+#include "mon/MonMap.h"
+TYPE_FEATUREFUL(MonMap)
+
+#include "mon/MonCap.h"
+TYPE(MonCap)
+
+#include "mon/MgrMap.h"
+TYPE_FEATUREFUL(MgrMap)
+
+#include "mon/mon_types.h"
+TYPE(LevelDBStoreStats)
+TYPE(ScrubResult)
+
+#include "mon/CreatingPGs.h"
+TYPE(creating_pgs_t)
+
+#include "mgr/ServiceMap.h"
+TYPE_FEATUREFUL(ServiceMap)
+TYPE_FEATUREFUL(ServiceMap::Service)
+TYPE_FEATUREFUL(ServiceMap::Daemon)
+
+#include "os/filestore/DBObjectMap.h"
+TYPE(DBObjectMap::_Header)
+TYPE(DBObjectMap::State)
+
+#include "os/filestore/FileStore.h"
+TYPE(FSSuperblock)
+
+#include "os/kstore/kstore_types.h"
+TYPE(kstore_cnode_t)
+TYPE(kstore_onode_t)
+
+#ifdef WITH_CEPHFS
+#include "mds/JournalPointer.h"
+TYPE(JournalPointer)
+
+#include "osdc/Journaler.h"
+TYPE(Journaler::Header)
+
+#include "mds/snap.h"
+TYPE(SnapInfo)
+TYPE(snaplink_t)
+TYPE(sr_t)
+
+#include "mds/mdstypes.h"
+TYPE(frag_info_t)
+TYPE(nest_info_t)
+TYPE(quota_info_t)
+TYPE(client_writeable_range_t)
+TYPE_FEATUREFUL(inode_t<std::allocator>)
+TYPE_FEATUREFUL(old_inode_t<std::allocator>)
+TYPE(fnode_t)
+TYPE(old_rstat_t)
+TYPE_FEATUREFUL(session_info_t)
+TYPE(string_snap_t)
+TYPE(MDSCacheObjectInfo)
+TYPE(mds_table_pending_t)
+TYPE(cap_reconnect_t)
+TYPE(inode_load_vec_t)
+TYPE(dirfrag_load_vec_t)
+TYPE(mds_load_t)
+TYPE(MDSCacheObjectInfo)
+TYPE(inode_backtrace_t)
+TYPE(inode_backpointer_t)
+
+#include "mds/CInode.h"
+TYPE_FEATUREFUL(InodeStore)
+TYPE_FEATUREFUL(InodeStoreBare)
+
+#include "mds/MDSMap.h"
+TYPE_FEATUREFUL(MDSMap)
+TYPE_FEATUREFUL(MDSMap::mds_info_t)
+
+#include "mds/FSMap.h"
+//TYPE_FEATUREFUL(Filesystem)
+TYPE_FEATUREFUL(FSMap)
+
+#include "mds/Capability.h"
+TYPE_NOCOPY(Capability)
+
+#include "mds/inode_backtrace.h"
+TYPE(inode_backpointer_t)
+TYPE(inode_backtrace_t)
+
+#include "mds/InoTable.h"
+TYPE(InoTable)
+
+#include "mds/SnapServer.h"
+TYPE_STRAYDATA(SnapServer)
+
+#include "mds/events/ECommitted.h"
+TYPE_FEATUREFUL_NOCOPY(ECommitted)
+
+#include "mds/events/EExport.h"
+TYPE_FEATUREFUL_NOCOPY(EExport)
+
+#include "mds/events/EFragment.h"
+TYPE_FEATUREFUL_NOCOPY(EFragment)
+
+#include "mds/events/EImportFinish.h"
+TYPE_FEATUREFUL_NOCOPY(EImportFinish)
+
+#include "mds/events/EImportStart.h"
+TYPE_FEATUREFUL_NOCOPY(EImportStart)
+
+#include "mds/events/EMetaBlob.h"
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob::fullbit)
+TYPE(EMetaBlob::remotebit)
+TYPE(EMetaBlob::nullbit)
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob::dirlump)
+TYPE_FEATUREFUL_NOCOPY(EMetaBlob)
+
+#include "mds/events/EOpen.h"
+TYPE_FEATUREFUL_NOCOPY(EOpen)
+
+#include "mds/events/EResetJournal.h"
+TYPE_FEATUREFUL_NOCOPY(EResetJournal)
+
+#include "mds/events/ESession.h"
+TYPE_FEATUREFUL_NOCOPY(ESession)
+
+#include "mds/events/ESessions.h"
+TYPE_FEATUREFUL_NOCOPY(ESessions)
+
+#include "mds/events/ESlaveUpdate.h"
+TYPE(link_rollback)
+TYPE(rmdir_rollback)
+TYPE(rename_rollback::drec)
+TYPE(rename_rollback)
+TYPE_FEATUREFUL_NOCOPY(ESlaveUpdate)
+
+#include "mds/events/ESubtreeMap.h"
+TYPE_FEATUREFUL_NOCOPY(ESubtreeMap)
+
+#include "mds/events/ETableClient.h"
+TYPE_FEATUREFUL_NOCOPY(ETableClient)
+
+#include "mds/events/ETableServer.h"
+TYPE_FEATUREFUL_NOCOPY(ETableServer)
+
+#include "mds/events/EUpdate.h"
+TYPE_FEATUREFUL_NOCOPY(EUpdate)
+#endif // WITH_CEPHFS
+
+#ifdef WITH_RBD
+#include "librbd/journal/Types.h"
+TYPE(librbd::journal::EventEntry)
+TYPE(librbd::journal::ClientData)
+TYPE(librbd::journal::TagData)
+#include "librbd/mirroring_watcher/Types.h"
+TYPE(librbd::mirroring_watcher::NotifyMessage)
+#include "librbd/trash_watcher/Types.h"
+TYPE(librbd::trash_watcher::NotifyMessage)
+#include "librbd/WatchNotifyTypes.h"
+TYPE(librbd::watch_notify::NotifyMessage)
+TYPE(librbd::watch_notify::ResponseMessage)
+
+#include "rbd_replay/ActionTypes.h"
+TYPE(rbd_replay::action::Dependency)
+TYPE(rbd_replay::action::ActionEntry)
+
+#include "tools/rbd_mirror/image_map/Types.h"
+TYPE(rbd::mirror::image_map::PolicyData)
+#endif
+
+#ifdef WITH_RADOSGW
+
+#include "rgw/rgw_rados.h"
+TYPE(RGWOLHInfo)
+TYPE(RGWObjManifestPart)
+TYPE(RGWObjManifest)
+
+#include "rgw/rgw_zone.h"
+TYPE(RGWZoneParams)
+TYPE(RGWZone)
+TYPE(RGWZoneGroup)
+TYPE(RGWRealm)
+TYPE(RGWPeriod)
+
+#include "rgw/rgw_acl.h"
+TYPE(ACLPermission)
+TYPE(ACLGranteeType)
+TYPE(ACLGrant)
+TYPE(RGWAccessControlList)
+TYPE(ACLOwner)
+TYPE(RGWAccessControlPolicy)
+
+#include "rgw/rgw_cache.h"
+TYPE(ObjectMetaInfo)
+TYPE(ObjectCacheInfo)
+TYPE(RGWCacheNotifyInfo)
+
+#include "rgw/rgw_lc.h"
+TYPE(RGWLifecycleConfiguration)
+
+#include "cls/rgw/cls_rgw_types.h"
+TYPE(rgw_bucket_pending_info)
+TYPE(rgw_bucket_dir_entry_meta)
+TYPE(rgw_bucket_entry_ver)
+TYPE(rgw_bucket_dir_entry)
+TYPE(rgw_bucket_category_stats)
+TYPE(rgw_bucket_dir_header)
+TYPE(rgw_bucket_dir)
+TYPE(rgw_bucket_entry_ver)
+TYPE(cls_rgw_obj_key)
+TYPE(rgw_bucket_olh_log_entry)
+TYPE(rgw_usage_log_entry)
+
+#include "cls/rgw/cls_rgw_ops.h"
+TYPE(rgw_cls_obj_prepare_op)
+TYPE(rgw_cls_obj_complete_op)
+TYPE(rgw_cls_list_op)
+TYPE(rgw_cls_list_ret)
+TYPE(cls_rgw_gc_defer_entry_op)
+TYPE(cls_rgw_gc_list_op)
+TYPE(cls_rgw_gc_list_ret)
+TYPE(cls_rgw_gc_obj_info)
+TYPE(cls_rgw_gc_remove_op)
+TYPE(cls_rgw_gc_set_entry_op)
+TYPE(cls_rgw_obj)
+TYPE(cls_rgw_obj_chain)
+TYPE(rgw_cls_tag_timeout_op)
+TYPE(cls_rgw_bi_log_list_op)
+TYPE(cls_rgw_bi_log_trim_op)
+TYPE(cls_rgw_bi_log_list_ret)
+TYPE(rgw_cls_link_olh_op)
+TYPE(rgw_cls_unlink_instance_op)
+TYPE(rgw_cls_read_olh_log_op)
+TYPE(rgw_cls_read_olh_log_ret)
+TYPE(rgw_cls_trim_olh_log_op)
+TYPE(rgw_cls_bucket_clear_olh_op)
+TYPE(rgw_cls_check_index_ret)
+TYPE(cls_rgw_reshard_add_op)
+TYPE(cls_rgw_reshard_list_op)
+TYPE(cls_rgw_reshard_list_ret)
+TYPE(cls_rgw_reshard_get_op)
+TYPE(cls_rgw_reshard_get_ret)
+TYPE(cls_rgw_reshard_remove_op)
+TYPE(cls_rgw_set_bucket_resharding_op)
+TYPE(cls_rgw_clear_bucket_resharding_op)
+TYPE(cls_rgw_lc_obj_head)
+
+#include "cls/rgw/cls_rgw_client.h"
+TYPE(rgw_bi_log_entry)
+TYPE(cls_rgw_reshard_entry)
+TYPE(cls_rgw_bucket_instance_entry)
+
+#include "cls/user/cls_user_types.h"
+TYPE(cls_user_bucket)
+TYPE(cls_user_bucket_entry)
+TYPE(cls_user_stats)
+TYPE(cls_user_header)
+
+#include "cls/user/cls_user_ops.h"
+TYPE(cls_user_set_buckets_op)
+TYPE(cls_user_remove_bucket_op)
+TYPE(cls_user_list_buckets_op)
+TYPE(cls_user_list_buckets_ret)
+TYPE(cls_user_get_header_op)
+TYPE(cls_user_get_header_ret)
+TYPE(cls_user_complete_stats_sync_op)
+
+#include "cls/journal/cls_journal_types.h"
+TYPE(cls::journal::ObjectPosition)
+TYPE(cls::journal::ObjectSetPosition)
+TYPE(cls::journal::Client)
+TYPE(cls::journal::Tag)
+
+#include "rgw/rgw_common.h"
+TYPE(RGWAccessKey)
+TYPE(RGWSubUser)
+TYPE(RGWUserInfo)
+TYPE(rgw_bucket)
+TYPE(RGWBucketInfo)
+TYPE(RGWBucketEnt)
+TYPE(RGWUploadPartInfo)
+TYPE(rgw_obj)
+
+#include "rgw/rgw_log.h"
+TYPE(rgw_log_entry)
+
+#include "rgw/rgw_meta_sync_status.h"
+TYPE(rgw_meta_sync_info)
+TYPE(rgw_meta_sync_marker)
+TYPE(rgw_meta_sync_status)
+
+#include "rgw/rgw_data_sync.h"
+TYPE(rgw_data_sync_info)
+TYPE(rgw_data_sync_marker)
+TYPE(rgw_data_sync_status)
+
+#endif
+
+#ifdef WITH_RBD
+#include "cls/rbd/cls_rbd.h"
+TYPE_FEATUREFUL(cls_rbd_parent)
+TYPE_FEATUREFUL(cls_rbd_snap)
+
+#include "cls/rbd/cls_rbd_types.h"
+TYPE(cls::rbd::ParentImageSpec)
+TYPE(cls::rbd::ChildImageSpec)
+TYPE(cls::rbd::MigrationSpec)
+TYPE(cls::rbd::MirrorPeer)
+TYPE(cls::rbd::MirrorImage)
+TYPE(cls::rbd::MirrorImageMap)
+TYPE(cls::rbd::MirrorImageStatus)
+TYPE(cls::rbd::GroupImageSpec)
+TYPE(cls::rbd::GroupImageStatus)
+TYPE(cls::rbd::GroupSnapshot)
+TYPE(cls::rbd::GroupSpec)
+TYPE(cls::rbd::ImageSnapshotSpec)
+TYPE(cls::rbd::SnapshotInfo)
+TYPE(cls::rbd::SnapshotNamespace)
+#endif
+
+#include "cls/lock/cls_lock_types.h"
+TYPE(rados::cls::lock::locker_id_t)
+TYPE_FEATUREFUL(rados::cls::lock::locker_info_t)
+TYPE_FEATUREFUL(rados::cls::lock::lock_info_t)
+
+#include "cls/lock/cls_lock_ops.h"
+TYPE(cls_lock_lock_op)
+TYPE(cls_lock_unlock_op)
+TYPE(cls_lock_break_op)
+TYPE(cls_lock_get_info_op)
+TYPE_FEATUREFUL(cls_lock_get_info_reply)
+TYPE(cls_lock_list_locks_reply)
+TYPE(cls_lock_assert_op)
+TYPE(cls_lock_set_cookie_op)
+
+#include "cls/refcount/cls_refcount_ops.h"
+TYPE(cls_refcount_get_op)
+TYPE(cls_refcount_put_op)
+TYPE(cls_refcount_set_op)
+TYPE(cls_refcount_read_op)
+TYPE(cls_refcount_read_ret)
+TYPE(obj_refcount)
+
+#include "journal/Entry.h"
+TYPE(journal::Entry)
+
+// --- messages ---
+#include "messages/MAuth.h"
+MESSAGE(MAuth)
+
+#include "messages/MAuthReply.h"
+MESSAGE(MAuthReply)
+
+#include "messages/MCacheExpire.h"
+MESSAGE(MCacheExpire)
+
+#include "messages/MClientCapRelease.h"
+MESSAGE(MClientCapRelease)
+
+#include "messages/MClientCaps.h"
+MESSAGE(MClientCaps)
+
+#include "messages/MClientLease.h"
+MESSAGE(MClientLease)
+
+#include "messages/MClientReconnect.h"
+MESSAGE(MClientReconnect)
+
+#include "messages/MClientReply.h"
+MESSAGE(MClientReply)
+
+#include "messages/MClientRequest.h"
+MESSAGE(MClientRequest)
+
+#include "messages/MClientRequestForward.h"
+MESSAGE(MClientRequestForward)
+
+#include "messages/MClientQuota.h"
+MESSAGE(MClientQuota)
+
+#include "messages/MClientSession.h"
+MESSAGE(MClientSession)
+
+#include "messages/MClientSnap.h"
+MESSAGE(MClientSnap)
+
+#include "messages/MCommand.h"
+MESSAGE(MCommand)
+
+#include "messages/MCommandReply.h"
+MESSAGE(MCommandReply)
+
+#include "messages/MConfig.h"
+MESSAGE(MConfig)
+
+#include "messages/MDataPing.h"
+MESSAGE(MDataPing)
+
+#include "messages/MDentryLink.h"
+MESSAGE(MDentryLink)
+
+#include "messages/MDentryUnlink.h"
+MESSAGE(MDentryUnlink)
+
+#include "messages/MDirUpdate.h"
+MESSAGE(MDirUpdate)
+
+#include "messages/MDiscover.h"
+MESSAGE(MDiscover)
+
+#include "messages/MDiscoverReply.h"
+MESSAGE(MDiscoverReply)
+
+#include "messages/MExportCaps.h"
+MESSAGE(MExportCaps)
+
+#include "messages/MExportCapsAck.h"
+MESSAGE(MExportCapsAck)
+
+#include "messages/MExportDir.h"
+MESSAGE(MExportDir)
+
+#include "messages/MExportDirAck.h"
+MESSAGE(MExportDirAck)
+
+#include "messages/MExportDirCancel.h"
+MESSAGE(MExportDirCancel)
+
+#include "messages/MExportDirDiscover.h"
+MESSAGE(MExportDirDiscover)
+
+#include "messages/MExportDirDiscoverAck.h"
+MESSAGE(MExportDirDiscoverAck)
+
+#include "messages/MExportDirFinish.h"
+MESSAGE(MExportDirFinish)
+
+#include "messages/MExportDirNotify.h"
+MESSAGE(MExportDirNotify)
+
+#include "messages/MExportDirNotifyAck.h"
+MESSAGE(MExportDirNotifyAck)
+
+#include "messages/MExportDirPrep.h"
+MESSAGE(MExportDirPrep)
+
+#include "messages/MExportDirPrepAck.h"
+MESSAGE(MExportDirPrepAck)
+
+#include "messages/MForward.h"
+MESSAGE(MForward)
+
+#include "messages/MFSMap.h"
+MESSAGE(MFSMap)
+
+#include "messages/MFSMapUser.h"
+MESSAGE(MFSMapUser)
+
+#include "messages/MGatherCaps.h"
+MESSAGE(MGatherCaps)
+
+#include "messages/MGenericMessage.h"
+MESSAGE(MGenericMessage)
+
+#include "messages/MGetConfig.h"
+MESSAGE(MGetConfig)
+
+#include "messages/MGetPoolStats.h"
+MESSAGE(MGetPoolStats)
+
+#include "messages/MGetPoolStatsReply.h"
+MESSAGE(MGetPoolStatsReply)
+
+#include "messages/MHeartbeat.h"
+MESSAGE(MHeartbeat)
+
+#include "messages/MInodeFileCaps.h"
+MESSAGE(MInodeFileCaps)
+
+#include "messages/MLock.h"
+MESSAGE(MLock)
+
+#include "messages/MLog.h"
+MESSAGE(MLog)
+
+#include "messages/MLogAck.h"
+MESSAGE(MLogAck)
+
+#include "messages/MMDSOpenIno.h"
+MESSAGE(MMDSOpenIno)
+
+#include "messages/MMDSOpenInoReply.h"
+MESSAGE(MMDSOpenInoReply)
+
+#include "messages/MMDSBeacon.h"
+MESSAGE(MMDSBeacon)
+
+#include "messages/MMDSCacheRejoin.h"
+MESSAGE(MMDSCacheRejoin)
+
+#include "messages/MMDSFindIno.h"
+MESSAGE(MMDSFindIno)
+
+#include "messages/MMDSFindInoReply.h"
+MESSAGE(MMDSFindInoReply)
+
+#include "messages/MMDSFragmentNotify.h"
+MESSAGE(MMDSFragmentNotify)
+
+#include "messages/MMDSLoadTargets.h"
+MESSAGE(MMDSLoadTargets)
+
+#include "messages/MMDSMap.h"
+MESSAGE(MMDSMap)
+
+#include "messages/MMgrReport.h"
+MESSAGE(MMgrReport)
+
+#include "messages/MMDSResolve.h"
+MESSAGE(MMDSResolve)
+
+#include "messages/MMDSResolveAck.h"
+MESSAGE(MMDSResolveAck)
+
+#include "messages/MMDSSlaveRequest.h"
+MESSAGE(MMDSSlaveRequest)
+
+#include "messages/MMDSSnapUpdate.h"
+MESSAGE(MMDSSnapUpdate)
+
+#include "messages/MMDSTableRequest.h"
+MESSAGE(MMDSTableRequest)
+
+#include "messages/MMgrClose.h"
+MESSAGE(MMgrClose)
+
+#include "messages/MMgrConfigure.h"
+MESSAGE(MMgrConfigure)
+
+#include "messages/MMgrDigest.h"
+MESSAGE(MMgrDigest)
+
+#include "messages/MMgrMap.h"
+MESSAGE(MMgrMap)
+
+#include "messages/MMgrOpen.h"
+MESSAGE(MMgrOpen)
+
+#include "messages/MMonCommand.h"
+MESSAGE(MMonCommand)
+
+#include "messages/MMonCommandAck.h"
+MESSAGE(MMonCommandAck)
+
+#include "messages/MMonElection.h"
+MESSAGE(MMonElection)
+
+#include "messages/MMonGetMap.h"
+MESSAGE(MMonGetMap)
+
+#include "messages/MMonGetVersion.h"
+MESSAGE(MMonGetVersion)
+
+#include "messages/MMonGetVersionReply.h"
+MESSAGE(MMonGetVersionReply)
+
+#include "messages/MMonGlobalID.h"
+MESSAGE(MMonGlobalID)
+
+#include "messages/MMonJoin.h"
+MESSAGE(MMonJoin)
+
+#include "messages/MMonMap.h"
+MESSAGE(MMonMap)
+
+#include "messages/MMonMetadata.h"
+MESSAGE(MMonMetadata)
+
+#include "messages/MMonPaxos.h"
+MESSAGE(MMonPaxos)
+
+#include "messages/MMonProbe.h"
+MESSAGE(MMonProbe)
+
+#include "messages/MMonScrub.h"
+MESSAGE(MMonScrub)
+
+#include "messages/MMonSync.h"
+MESSAGE(MMonSync)
+
+#include "messages/MMonSubscribe.h"
+MESSAGE(MMonSubscribe)
+
+#include "messages/MMonSubscribeAck.h"
+MESSAGE(MMonSubscribeAck)
+
+#include "messages/MNop.h"
+MESSAGE(MNop)
+
+#include "messages/MOSDAlive.h"
+MESSAGE(MOSDAlive)
+
+#include "messages/MOSDBoot.h"
+MESSAGE(MOSDBoot)
+
+#include "messages/MOSDFailure.h"
+MESSAGE(MOSDFailure)
+
+#include "messages/MOSDMap.h"
+MESSAGE(MOSDMap)
+
+#include "messages/MOSDOp.h"
+MESSAGE(MOSDOp)
+
+#include "messages/MOSDOpReply.h"
+MESSAGE(MOSDOpReply)
+
+#include "messages/MOSDPGBackfill.h"
+MESSAGE(MOSDPGBackfill)
+
+#include "messages/MOSDPGCreate.h"
+MESSAGE(MOSDPGCreate)
+
+#include "messages/MOSDPGCreate2.h"
+MESSAGE(MOSDPGCreate2)
+
+#include "messages/MOSDPGInfo.h"
+MESSAGE(MOSDPGInfo)
+
+#include "messages/MOSDPGLog.h"
+MESSAGE(MOSDPGLog)
+
+#include "messages/MOSDPGNotify.h"
+MESSAGE(MOSDPGNotify)
+
+#include "messages/MOSDPGQuery.h"
+MESSAGE(MOSDPGQuery)
+
+#include "messages/MOSDPGRemove.h"
+MESSAGE(MOSDPGRemove)
+
+#include "messages/MOSDPGRecoveryDelete.h"
+MESSAGE(MOSDPGRecoveryDelete)
+
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+MESSAGE(MOSDPGRecoveryDeleteReply)
+
+#include "messages/MOSDPGScan.h"
+MESSAGE(MOSDPGScan)
+
+#include "messages/MOSDPGTemp.h"
+MESSAGE(MOSDPGTemp)
+
+#include "messages/MOSDPGTrim.h"
+MESSAGE(MOSDPGTrim)
+
+#include "messages/MOSDPing.h"
+MESSAGE(MOSDPing)
+
+#include "messages/MOSDRepScrub.h"
+MESSAGE(MOSDRepScrub)
+
+#include "messages/MOSDScrub.h"
+MESSAGE(MOSDScrub)
+
+#include "messages/MOSDScrub2.h"
+MESSAGE(MOSDScrub2)
+
+#include "messages/MOSDForceRecovery.h"
+MESSAGE(MOSDForceRecovery)
+
+#include "messages/MPGStats.h"
+MESSAGE(MPGStats)
+
+#include "messages/MPGStatsAck.h"
+MESSAGE(MPGStatsAck)
+
+#include "messages/MPing.h"
+MESSAGE(MPing)
+
+#include "messages/MPoolOp.h"
+MESSAGE(MPoolOp)
+
+#include "messages/MPoolOpReply.h"
+MESSAGE(MPoolOpReply)
+
+#include "messages/MRemoveSnaps.h"
+MESSAGE(MRemoveSnaps)
+
+#include "messages/MRoute.h"
+MESSAGE(MRoute)
+
+#include "messages/MServiceMap.h"
+MESSAGE(MServiceMap)
+
+#include "messages/MStatfs.h"
+MESSAGE(MStatfs)
+
+#include "messages/MStatfsReply.h"
+MESSAGE(MStatfsReply)
+
+#include "messages/MTimeCheck.h"
+MESSAGE(MTimeCheck)
+
+#include "messages/MTimeCheck2.h"
+MESSAGE(MTimeCheck2)
+
+#include "messages/MWatchNotify.h"
+MESSAGE(MWatchNotify)
diff --git a/src/tools/ceph-diff-sorted.cc b/src/tools/ceph-diff-sorted.cc
new file mode 100644
index 00000000..f8e4c28e
--- /dev/null
+++ b/src/tools/ceph-diff-sorted.cc
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * diffsorted -- a utility to compute a line-by-line diff on two
+ * sorted input files
+ *
+ * Copyright © 2019 Red Hat
+ *
+ * Author: J. Eric Ivancich
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.
+ */
+
+
+/*
+ * SUMMARY
+ *
+ * The `diffsorted` utility does a line-by-line diff on two sorted text
+ * files, indicating lines that are in one file but not the other
+ * using diff-style notation (although line numbers are not shown).
+ *
+ * USAGE
+ *
+ * rgw-diff-sorted file1.txt file2.txt
+ *
+ * NOTES
+ *
+ * Each file should have its lines in sorted order and should have no
+ * empty lines.
+ *
+ * A potential input file can be sorted using the `sort` utility provided
+ * that LANG=C is set, to ensure byte-lexical order. For example:
+ *
+ * LANG=C sort unsorted.txt >sorted.txt
+ *
+ * or:
+ *
+ * export LANG=C
+ * sort unsorted.txt >sorted.txt
+ *
+ * EXIT STATUS
+ *
+ * 0 : files same
+ * 1 : files different
+ * 2 : usage problem (e.g., wrong number of command-line arguments)
+ * 3 : problem opening input file
+ * 4 : bad file content (e.g., unsorted order or empty lines)
+ */
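+
+/*
+ * EXAMPLE (illustrative only; file names and contents are hypothetical)
+ *
+ * Given a sorted a.txt containing "apple", "banana", "cherry" and a sorted
+ * b.txt containing "apple", "cherry", "date", the output would be:
+ *
+ *   < banana
+ *   > date
+ *
+ * and the exit status would be 1, since the files differ.
+ */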
+
+
+#include <iostream>
+#include <fstream>
+
+
+struct FileOfLines {
+ const char* filename;
+ std::ifstream input;
+ std::string this_line, prev_line;
+ bool next_eof;
+ bool is_eof;
+
+ FileOfLines(const char* _filename) :
+ filename(_filename),
+ input(filename),
+ next_eof(false),
+ is_eof(false)
+ { }
+
+ void dump(const std::string& prefix) {
+ do {
+ std::cout << prefix << this_line << std::endl;
+ advance();
+ } while (!eof());
+ }
+
+ bool eof() const {
+ return is_eof;
+ }
+
+ bool good() const {
+ return input.good();
+ }
+
+ void advance() {
+ if (next_eof) {
+ is_eof = true;
+ return;
+ }
+
+ prev_line = this_line;
+ std::getline(input, this_line);
+ if (this_line.empty()) {
+ if (!input.eof()) {
+ std::cerr << "Error: " << filename << " has an empty line." <<
+ std::endl;
+ exit(4);
+ }
+ is_eof = true;
+ return;
+ } else if (input.eof()) {
+ next_eof = true;
+ }
+
+ if (this_line < prev_line) {
+ std::cerr << "Error: " << filename << " is not in sorted order; \"" <<
+ this_line << "\" follows \"" << prev_line << "\"." << std::endl;
+ exit(4);
+ }
+ }
+
+ const std::string line() const {
+ return this_line;
+ }
+};
+
+int main(int argc, const char* argv[]) {
+ if (argc != 3) {
+ std::cerr << "Usage: " << argv[0] << " <file1> <file2>" << std::endl;
+ exit(2);
+ }
+
+ FileOfLines input1(argv[1]);
+ if (!input1.good()) {
+ std::cerr << "Error opening " << argv[1] <<
+ "." << std::endl;
+ exit(3);
+ }
+
+ FileOfLines input2(argv[2]);
+ if (!input2.good()) {
+ std::cerr << "Error opening " << argv[2] <<
+ "." << std::endl;
+ exit(3);
+ }
+
+ bool files_same = true;
+
+ input1.advance();
+ input2.advance();
+
+ while (!input1.eof() && !input2.eof()) {
+ if (input1.line() == input2.line()) {
+ input1.advance();
+ input2.advance();
+ } else if (input1.line() < input2.line()) {
+ files_same = false;
+ std::cout << "< " << input1.line() << std::endl;
+ input1.advance();
+ } else {
+ files_same = false;
+ std::cout << "> " << input2.line() << std::endl;
+ input2.advance();
+ }
+ }
+
+ if (!input1.eof()) {
+ files_same = false;
+ input1.dump("< ");
+ } else if (!input2.eof()) {
+ files_same = false;
+ input2.dump("> ");
+ }
+
+ if (files_same) {
+ exit(0);
+ } else {
+ exit(1);
+ }
+}
diff --git a/src/tools/ceph-lazy/bash_completion.d/ceph-lazy b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy
new file mode 100644
index 00000000..4429def4
--- /dev/null
+++ b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy
@@ -0,0 +1,27 @@
+_ceph-lazy()
+{
+ local cur prev all_opts commands
+ COMPREPLY=()
+ cur="${COMP_WORDS[COMP_CWORD]}"
+ prev="${COMP_WORDS[COMP_CWORD-1]}"
+
+ commands="host-get-osd host-get-nodes host-osd-usage host-all-usage pg-get-host pg-most-write pg-less-write pg-most-write-kb pg-less-write-kb pg-most-read pg-less-read pg-most-read-kb pg-less-read-kb pg-empty rbd-prefix rbd-count rbd-host rbd-osd rbd-size rbd-all-size osd-most-used osd-less-used osd-get-ppg osd-get-pg object-get-host"
+
+ all_opts="$commands -d -h"
+
+
+
+# If the first option is -d, keep completing without -d & -h
+ if [[ ${prev} == "-d" && ${#COMP_WORDS[@]} -eq 3 ]] ; then
+ COMPREPLY=( $(compgen -W "${commands}" -- ${cur}) )
+ return 0
+# Do completion for first args
+ elif [[ ${#COMP_WORDS[@]} -eq 2 ]]; then
+ COMPREPLY=( $(compgen -W "${all_opts}" -- ${cur}) )
+ return 0
+# Else do nothing
+ else
+ return 0
+ fi
+}
+complete -F _ceph-lazy ceph-lazy
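+
+# Usage note (a sketch; the install path below is an assumption, not something
+# this file mandates): source the script once to enable completion in the
+# current shell, e.g.
+#   source /etc/bash_completion.d/ceph-lazy
+# after which "ceph-lazy pg-<TAB>" completes the pg-* commands listed above.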
diff --git a/src/tools/ceph-lazy/ceph-lazy b/src/tools/ceph-lazy/ceph-lazy
new file mode 100755
index 00000000..39a33192
--- /dev/null
+++ b/src/tools/ceph-lazy/ceph-lazy
@@ -0,0 +1,709 @@
+#!/usr/bin/env bash
+#
+# ceph-lazy : Be efficient, be lazy !
+#
+# Author: Gregory Charot <gcharot@redhat.com>
+#
+# This is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+
+# Specify extra options for ceph, like the username/keyring/etc. Can also be done with the CEPH_ARGS global variable
+#CEPH_OPT="-n client.username"
+VERSION="1.1.2"
+
+#
+# Print info message to stderr
+#
+
+function echoinfo() {
+ printf "INFO: %s\n" "$*" >&2;
+}
+
+
+#
+# Print error message to stderr
+#
+
+function echoerr() {
+ printf "ERROR: %s\n" "$*" >&2;
+}
+
+
+function help() {
+ >&2 echo "Usage : ceph-lazy [-d | -h] [command] [parameters]
+
+Ceph complex querying tool - Version $VERSION
+
+OPTIONS
+========
+ -d Activate debug mode
+ -h Print help
+
+COMMANDS
+=========
+
+ Host
+ -----
+ host-get-osd hostname List all OSD IDs attached to a particular node.
+ host-get-nodes List all storage nodes.
+ host-osd-usage hostname Show total OSD space usage of a particular node (-d for details).
+ host-all-usage Show total OSD space usage of each node (-d for details).
+
+ Placement groups
+ -----------------
+ pg-get-host pgid Find PG storage hosts (first is primary)
+ pg-most-write Find the most written PG (number of operations)
+ pg-less-write Find the least written PG (number of operations)
+ pg-most-write-kb Find the most written PG (data written)
+ pg-less-write-kb Find the least written PG (data written)
+ pg-most-read Find the most read PG (number of operations)
+ pg-less-read Find the least read PG (number of operations)
+ pg-most-read-kb Find the most read PG (data read)
+ pg-less-read-kb Find the least read PG (data read)
+ pg-empty Find empty PGs (no stored objects)
+
+ RBD
+ ----
+ rbd-prefix pool_name image_name Return RBD image prefix
+ rbd-count pool_name image_name Count number of objects in a RBD image
+ rbd-host pool_name image_name Find RBD primary storage hosts
+ rbd-osd pool_name image_name Find RBD primary OSDs
+ rbd-size pool_name image_name Print RBD image real size
+ rbd-all-size pool_name Print all RBD images size (Top first)
+
+ OSD
+ ----
+ osd-most-used Show the most used OSD (capacity)
+ osd-less-used Show the least used OSD (capacity)
+ osd-get-ppg osd_id Show all primary PGs hosted on an OSD
+ osd-get-pg osd_id Show all PGs hosted on an OSD
+
+ Objects
+ --------
+ object-get-host pool_name object_id Find object storage hosts (first is primary)
+ "
+
+}
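+
+# Usage sketch (illustrative; host, pool and image names are placeholders):
+#   ceph-lazy host-get-osd node01       # list OSD IDs attached to node01
+#   ceph-lazy -d rbd-size rbd myimage   # real size of rbd/myimage, with debug output
+#   ceph-lazy pg-empty                  # list PGs holding no objects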
+
+#
+# Check dependencies
+#
+function check_requirements()
+{
+
+ # List of command dependencies
+ local bin_dep="ceph rados rbd osdmaptool jq"
+
+ for cmd in $bin_dep; do
+ [ $DEBUG -eq 1 ] && echoinfo "Checking for $cmd..."
+ $cmd --version >/dev/null 2>&1 || { echoerr "$cmd cannot be found... Aborting."; return 1; }
+ done
+
+ CEPH="ceph $CEPH_OPT"
+
+ [ $DEBUG -eq 1 ] && echoinfo "Checking Ceph connectivity & basic permissions..."
+
+ if ! $CEPH -s &> /dev/null; then
+ echoerr "Cannot connect to cluster, please check your username & permissions"
+ echoerr "Command $CEPH -s failed"
+ return 1
+ fi
+
+ JQ="jq -M --raw-output"
+}
+
+#
+# Print the host that hosts a specific PG
+#
+function find_host_from_pg() {
+
+ if [ $# -eq 1 ]; then
+ local PGID=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "PG $PGID has been found at (first is primary) : "
+
+ for osd in $($CEPH pg $PGID query | $JQ -cr .up[]); do
+ echo -n "OSD:osd.$osd | Host:"
+ $CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host
+ done
+}
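+
+# Example output of find_host_from_pg (OSD ids and host names are illustrative):
+#   OSD:osd.3 | Host:node01
+#   OSD:osd.7 | Host:node02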
+
+
+#
+# Print the host that hosts a specific object
+#
+function find_host_from_object() {
+
+ if [ $# -eq 2 ]; then
+ local pool=$1
+ local objid=$2
+ else
+ echoerr "This command requires two arguments"
+ help
+ exit 1
+ fi
+
+ local pgid=$($CEPH osd map $pool $objid --format json 2> /dev/null | $JQ -cr .pgid)
+
+ [ $DEBUG -eq 1 ] && echoinfo $objid found into PG $pgid
+
+ while read host; do
+ echo "PG:$pgid | $host"
+ done < <(find_host_from_pg $pgid)
+}
+
+
+#
+# Print all primary pgs hosted by an OSD
+#
+function find_prim_pg_from_osd() {
+
+ if [ $# -eq 1 ]; then
+ local posd=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Looking for primary PGs belonging to OSD $posd"
+ $CEPH pg dump pgs --format json 2>/dev/null | $JQ --argjson posd $posd '.[] | select(.acting_primary==$posd).pgid'
+}
+
+
+#
+# Print all pgs (primary & secondary) hosted by an OSD
+#
+function find_all_pg_from_osd() {
+
+ if [ $# -eq 1 ]; then
+ local osd=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Looking for all PGs mapped to OSD $osd"
+ $CEPH pg dump pgs --format json 2> /dev/null | $JQ -M --argjson osd $osd '.[] | select(.up[]==$osd).pgid'
+}
+
+
+#
+# Check if a given image exists
+#
+function check_rbd_exists(){
+
+ pool=$1
+ rbd=$2
+
+ if ! rbd info -p $pool $rbd &> /dev/null; then
+ echoerr "Unable to find image $pool/$rbd"
+ exit 1
+ fi
+}
+
+
+#
+# Return RBD prefix from image name
+#
+function get_rbd_prefix() {
+
+ if [ $# -eq 2 ]; then
+ local pool=$1
+ local rbd=$2
+ else
+ echoerr "This command requires two arguments"
+ help
+ exit 1
+ fi
+
+ check_rbd_exists $pool $rbd
+
+ local prefix=$(rbd --image $rbd -p $pool info --format json 2> /dev/null | jq --raw-output .block_name_prefix)
+ if [ -z $prefix ]; then
+ echoerr "Unable to find RBD Prefix for image $pool/$rbd"
+ exit 1
+ else
+ echo $prefix
+ fi
+
+}
+
+
+#
+# Count the number of objects in an RBD image
+#
+function count_rbd_object() {
+
+ if [ $# -eq 2 ]; then
+ local pool=$1
+ local rbd=$2
+ else
+ echoerr "This command requires two arguments"
+ help
+ exit 1
+ fi
+
+ check_rbd_exists $pool $rbd
+
+ local rbd_prefix=$(get_rbd_prefix $pool $rbd)
+
+ [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now couning objects..."
+
+ local nb_obj=$(rados -p $pool ls | grep $rbd_prefix | wc -l)
+
+ [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has $nb_obj objects"
+ echo $nb_obj
+}
+
+
+#
+# Find primary storage host for a given RBD image
+#
+function find_prim_host_from_rbd() {
+
+ if [ $# -eq 2 ]; then
+ local pool=$1
+ local rbd=$2
+ else
+ echoerr "This command requires two arguments"
+ help
+ exit 1
+ fi
+
+ check_rbd_exists $pool $rbd
+
+ local osd="null"
+ local osdmap_t=$(mktemp)
+ local osdtree_t=$(mktemp)
+ # Get RBD image prefix
+ local rbd_prefix=$(get_rbd_prefix $pool $rbd)
+# Exit if we received an empty prefix
+ [ -z $rbd_prefix ] && exit 1
+
+# Get pool ID from pool name
+ local pool_id=$(ceph osd lspools -f json | $JQ -M --arg pool $pool '.[]|select(.poolname==$pool).poolnum')
+
+ [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary host..."
+
+ [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t"
+ if ! $CEPH osd getmap > $osdmap_t 2> /dev/null; then
+ echoerr "Failed to retrieve OSD map"
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD tree to $osdtree_t"
+
+ if ! $CEPH osd tree --format json > $osdtree_t; then
+ echoerr "Failed to retrieve OSD tree"
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Looking for hosts..."
+
+# For each object in the RBD image
+ for obj in $(rados -p $pool ls | grep $rbd_prefix);
+ do
+# Map object to osd. osdmaptool does not support json output, so use a dirty sed.
+ osd=$(osdmaptool --test-map-object $obj --pool $pool_id $osdmap_t 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool)
+# Map osd to host
+ $JQ --argjson osd $osd '.nodes[] | select(.type=="host") | select(.children[] == $osd).name' $osdtree_t
+ done | sort -u
+
+# Cleaning files
+ rm -f $osdtree_t $osdmap_t
+}
+
+
+#
+# Find primary OSDs for a given RBD image
+#
+function find_prim_osd_from_rbd() {
+
+ if [ $# -eq 2 ]; then
+ local pool=$1
+ local rbd=$2
+ else
+ echoerr "This command requires two arguments"
+ help
+ exit 1
+ fi
+
+ check_rbd_exists $pool $rbd
+
+ local osd="null"
+ local osdmap_t=$(mktemp)
+ local osdtree_t=$(mktemp)
+ # Get RBD image prefix
+ local rbd_prefix=$(get_rbd_prefix $pool $rbd)
+
+# Exit if we received an empty prefix
+ [ -z $rbd_prefix ] && exit 1
+
+ [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary OSDs..."
+
+ [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t"
+ if ! $CEPH osd getmap > $osdmap_t; then
+ echoerr "Failed to retrieve OSD map"
+ exit 1
+ fi
+
+# For each object in the RBD image
+ for obj in $(rados -p $pool ls | grep $rbd_prefix);
+ do
+# Map object to osd. osdmaptool does not support json output, so use a dirty sed.
+ osd=$(osdmaptool --test-map-object $obj $osdmap_t 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool)
+ echo "osd.${osd}"
+ done | sort -u
+
+# Cleaning files
+ rm -f $osdmap_t
+}
+
+
+#
+# Print RBD image real size - Source http://ceph.com/planet/real-size-of-a-ceph-rbd-image/
+#
+
+function print_rbd_real_size {
+
+ if [ $# -eq 2 ]; then
+ local pool=$1
+ local rbd=$2
+ else
+ echoerr "This command requires two arguments"
+ help
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Checking if RBD image exists..."
+
+ check_rbd_exists $pool $rbd
+
+ rbd diff $pool/$rbd | awk '{ SUM += $2 } END { print SUM/1024/1024 " MB" }'
+
+}
+
+
+#
+# Print all RBD image real sizes - Top first
+#
+
+function list_all_rbd_real_size {
+
+ if [ $# -eq 1 ]; then
+ local pool=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Looking for RBD images in pool $pool"
+
+ while read rbd; do
+ [ $DEBUG -eq 1 ] && echoinfo "Inspecting image $rbd"
+ rbd diff $pool/$rbd | awk -v rbd="$rbd" '{ SUM += $2 } END { print SUM/1024/1024 " MB - " rbd }'
+ done < <(rbd -p $pool ls) | sort -rV
+}
+
+
+#
+# Print OSDs belonging to a particular storage host
+#
+
+function list_osd_from_host() {
+
+ if [ $# -eq 1 ]; then
+ local host=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ $CEPH osd tree --format json-pretty 2> /dev/null | $JQ --arg host $host '.nodes[] | select(.type=="host") | select(.name == $host).children[]' | sort -V
+
+}
+
+
+#
+# List all OSD nodes
+#
+
+function list_all_nodes() {
+
+
+ $CEPH osd tree --format json | $JQ -M --raw-output '.nodes[] | select(.type=="host") | .name' | sort -V
+
+}
+
+
+#
+# Print Total OSD usage of a particular storage host
+#
+
+function show_host_osd_usage() {
+
+ if [ $# -eq 1 ]; then
+ local host=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ local pgmap_t=$(mktemp)
+
+ local osd_used_kb=0
+ local total_used_kb=0
+
+ local total_available_kb=0
+ local osd_available_kb=0
+
+ local total_size_kb=0
+ local osd_size_kb=0
+ local nb_osd=0
+
+ [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..."
+ if ! $CEPH pg dump osds --format json 2>/dev/null > $pgmap_t; then
+ echoerr "Failed to retrieve PG map"
+ exit 1
+ fi
+
+ [ $DEBUG -eq 1 ] && echoinfo "Looking for all OSDs on host $host..."
+
+ for osd in $(list_osd_from_host $host); do
+
+ osd_used_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_used' $pgmap_t)
+ osd_available_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_avail' $pgmap_t)
+ osd_size_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb' $pgmap_t)
+
+ [ $DEBUG -eq 1 ] && echoinfo "OSD:$osd | Size:$(echo "scale=1;$osd_size_kb/1024/1024" | bc -l)GB | Used:$(echo "scale=1;$osd_used_kb /1024/1024" | bc -l)GB | Available:$(echo "scale=1;$osd_available_kb/1024/1024" | bc -l)GB"
+
+ let "total_used_kb=total_used_kb+osd_used_kb"
+ let "total_available_kb=total_available_kb+osd_available_kb"
+ let "total_size_kb=total_size_kb+osd_size_kb"
+ let "nb_osd++"
+
+ done
+
+ echo "Host:$host | OSDs:$nb_osd | Total_Size:$(echo "scale=1;$total_size_kb/1024/1024" | bc -l)GB | Total_Used:$(echo "scale=1;$total_used_kb /1024/1024" | bc -l)GB | Total_Available:$(echo "scale=1;$total_available_kb/1024/1024" | bc -l)GB"
+
+ rm -f $pgmap_t
+}
+
+
+#
+# Print Total OSD usage of all nodes
+#
+
+function list_all_nodes_osd_usage() {
+
+
+ for host in $(list_all_nodes); do
+
+ [ $DEBUG -eq 1 ] && echoinfo "Looking at node $host..."
+
+ show_host_osd_usage $host
+ done
+
+}
+
+
+#
+# Find most used (space) OSD
+#
+
+function find_most_used_osd() {
+
+ local osd=$($CEPH pg dump osds --format json 2> /dev/null| $JQ 'max_by(.kb_used) | .osd')
+ local host=$($CEPH osd find $osd 2> /dev/null | $JQ .crush_location.host)
+
+ echo "OSD:osd.${osd} | host:$host"
+}
+
+
+#
+# Find less used (space) OSD
+#
+
+function find_less_used_osd() {
+
+ local osd=$($CEPH pg dump osds --format json 2> /dev/null| $JQ 'min_by(.kb_used) | .osd')
+ local host=$($CEPH osd find $osd 2> /dev/null | $JQ .crush_location.host)
+
+ echo "OSD:osd.${osd} | host:$host"
+}
+
+
+#
+# Query PG stats
+#
+
+function pg_stat_query() {
+
+ if [ $# -eq 1 ]; then
+ local query_type=$1
+ else
+ echoerr "This command requires one argument"
+ help
+ exit 1
+ fi
+
+ local pgmap_t=$(mktemp)
+
+ [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..."
+ if ! $CEPH pg dump pgs --format json 2>/dev/null > $pgmap_t; then
+ echoerr "Failed to retrieve PG map"
+ exit 1
+ fi
+
+ local pgid=$($JQ --arg query_type $query_type "$query_type" $pgmap_t)
+ [ $DEBUG -eq 1 ] && echoinfo "Found PGID $pgid"
+
+ local osd=$($JQ --arg pgid $pgid '.[] | select(.pgid == $pgid).acting_primary' $pgmap_t)
+ [ $DEBUG -eq 1 ] && echoinfo "Found OSD $osd"
+
+ local host=$($CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host)
+ [ $DEBUG -eq 1 ] && echoinfo "Found host $host"
+
+ echo "PG:$pgid | OSD:osd.$osd | Host:$host"
+
+ rm -f $pgmap_t
+}
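+
+# pg_stat_query expects a jq expression that selects a pgid from the PG dump;
+# for example, the pg-most-write command below calls:
+#   pg_stat_query "max_by(.stat_sum.num_write).pgid"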
+
+
+#
+# Find empty pgs (no object stored)
+#
+
+function find_empty_pg() {
+
+ $CEPH pg dump pgs --format json 2>/dev/null | $JQ '.[] | select(.stat_sum.num_objects == 0).pgid'
+
+}
+
+
+#
+# MAIN
+#
+
+
+# Print help if no argument is given
+if [ $# -eq 0 ]; then
+ help
+ exit 1
+fi
+
+# Activate debug mode if -d is specified as first parameter
+if [ "$1" = "-d" ]; then
+ echoinfo "Debug mode activated"
+ DEBUG=1
+ shift
+else
+ DEBUG=0
+fi
+
+
+# Check if all requirements are met
+check_requirements || exit 1
+
+
+# Call proper function
+case $1 in
+ "-h")
+ help
+ exit 0
+ ;;
+ "host-get-osd")
+ list_osd_from_host $2
+ ;;
+ "host-get-nodes")
+ list_all_nodes
+ ;;
+ "host-osd-usage")
+ show_host_osd_usage $2
+ ;;
+ "host-all-usage")
+ list_all_nodes_osd_usage
+ ;;
+ "pg-get-host")
+ find_host_from_pg $2
+ ;;
+ "pg-most-write")
+ pg_stat_query "max_by(.stat_sum.num_write).pgid"
+ ;;
+ "pg-less-write")
+ pg_stat_query "min_by(.stat_sum.num_write).pgid"
+ ;;
+ "pg-most-write-kb")
+ pg_stat_query "max_by(.stat_sum.num_write_kb).pgid"
+ ;;
+ "pg-less-write-kb")
+ pg_stat_query "min_by(.stat_sum.num_write_kb).pgid"
+ ;;
+ "pg-most-read")
+ pg_stat_query "max_by(.stat_sum.num_read).pgid"
+ ;;
+ "pg-less-read")
+ pg_stat_query "min_by(.stat_sum.num_read).pgid"
+ ;;
+ "pg-most-read-kb")
+ pg_stat_query "max_by(.stat_sum.num_read_kb).pgid"
+ ;;
+ "pg-less-read-kb")
+ pg_stat_query "min_by(.stat_sum.num_read_kb).pgid"
+ ;;
+ "rbd-prefix")
+ get_rbd_prefix $2 $3
+ ;;
+ "rbd-count")
+ count_rbd_object $2 $3
+ ;;
+ "rbd-host")
+ find_prim_host_from_rbd $2 $3
+ ;;
+ "rbd-osd")
+ find_prim_osd_from_rbd $2 $3
+ ;;
+ "rbd-size")
+ print_rbd_real_size $2 $3
+ ;;
+ "rbd-all-size")
+ list_all_rbd_real_size $2
+ ;;
+ "osd-most-used")
+ find_most_used_osd
+ ;;
+ "osd-less-used")
+ find_less_used_osd
+ ;;
+ "osd-get-ppg")
+ find_prim_pg_from_osd $2
+ ;;
+ "osd-get-pg")
+ find_all_pg_from_osd $2
+ ;;
+ "pg-empty")
+ find_empty_pg
+ ;;
+ "object-get-host")
+ find_host_from_object $2 $3
+ ;;
+ *)
+ echoerr "Unknown command : $1"
+ help
+ exit 1
+ ;;
+esac
+
diff --git a/src/tools/ceph-monstore-update-crush.sh b/src/tools/ceph-monstore-update-crush.sh
new file mode 100755
index 00000000..5adfacdc
--- /dev/null
+++ b/src/tools/ceph-monstore-update-crush.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2015 Red Hat <contact@redhat.com>
+#
+# Author: Kefu Chai <kchai@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+verbose=
+
+test -d ../src && export PATH=$PATH:.
+
+if ! which jq ; then
+ echo "Missing jq binary!"
+ exit 1
+fi
+
+if [ `uname` = FreeBSD ]; then
+ GETOPT=/usr/local/bin/getopt
+else
+ GETOPT=getopt
+fi
+
+function osdmap_get() {
+ local store_path=$1
+ local query=$2
+ local epoch=${3:+-v $3}
+ local osdmap=`mktemp`
+
+ $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \
+ $epoch -o $osdmap > /dev/null || return
+
+ echo $($CEPH_BIN/osdmaptool --dump json $osdmap 2> /dev/null | \
+ jq "$query")
+
+ rm -f $osdmap
+}
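+
+# osdmap_get pulls a single field out of an osdmap stored in the mon store,
+# optionally at a given epoch. Illustrative calls, mirroring main() below
+# (the store path is a placeholder and CEPH_BIN must point at the ceph tools):
+#   last_epoch=$(osdmap_get /var/lib/ceph/mon/ceph-a ".epoch")
+#   max_osd=$(osdmap_get /var/lib/ceph/mon/ceph-a ".max_osd" $last_epoch)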
+
+function test_crush() {
+ local store_path=$1
+ local epoch=$2
+ local max_osd=$3
+ local crush=$4
+ local osdmap=`mktemp`
+
+ $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \
+ -v $epoch -o $osdmap > /dev/null
+ $CEPH_BIN/osdmaptool --export-crush $crush $osdmap &> /dev/null
+
+ if $CEPH_BIN/crushtool --test --check $max_osd -i $crush > /dev/null; then
+ good=true
+ else
+ good=false
+ fi
+ rm -f $osdmap
+ $good || return 1
+}
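+
+# test_crush exports the crush map of the given osdmap epoch into the supplied
+# file and checks it with crushtool; e.g. (epoch and paths are illustrative):
+#   test_crush /var/lib/ceph/mon/ceph-a 42 $max_osd /tmp/crush.bin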
+
+function die() {
+ local retval=$?
+ echo "$@" >&2
+ exit $retval
+}
+
+function usage() {
+ [ $# -gt 0 ] && echo -e "\n$@"
+ cat <<EOF
+
+Usage: $0 [options ...] <mon-store>
+
+Search backward for the latest known-good epoch in the mon store. Rewrite the osdmap
+epochs after it with the crush map from that epoch if asked to do so. By
+default, print out the crush map of the good epoch.
+
+ [-h|--help] display this message
+ [--out] write the found crush map to given file (default: stdout)
+ [--rewrite] rewrite the monitor storage with the found crush map
+ [--verbose] be more chatty
+EOF
+ [ $# -gt 0 ] && exit 1
+ exit 0
+}
+
+function main() {
+ local temp
+ temp=$($GETOPT -o h --long verbose,help,mon-store:,out:,rewrite -n $0 -- "$@") || return 1
+
+ eval set -- "$temp"
+ local rewrite
+ while [ "$1" != "--" ]; do
+ case "$1" in
+ --verbose)
+ verbose=true
+ # set -xe
+ # PS4='${FUNCNAME[0]}: $LINENO: '
+ shift;;
+ -h|--help)
+ usage
+ return 0;;
+ --out)
+ output=$2
+ shift 2;;
+ --osdmap-epoch)
+ osdmap_epoch=$2
+ shift 2;;
+ --rewrite)
+ rewrite=true
+ shift;;
+ *)
+ usage "unexpected argument $1"
+ shift;;
+ esac
+ done
+ shift
+
+ local store_path="$1"
+ test $store_path || usage "I need the path to mon-store."
+
+ # try accessing the store; if it fails, likely means a mon is running
+ local last_osdmap_epoch
+ local max_osd
+ last_osdmap_epoch=$(osdmap_get $store_path ".epoch") || \
+ die "error accessing mon store at $store_path"
+ # get the max_osd # in last osdmap epoch, crushtool will use it to check
+ # the crush maps in previous osdmaps
+ max_osd=$(osdmap_get $store_path ".max_osd" $last_osdmap_epoch)
+
+ local good_crush
+ local good_epoch
+ test $verbose && echo "the latest osdmap epoch is $last_osdmap_epoch"
+ for epoch in `seq $last_osdmap_epoch -1 1`; do
+ local crush_path=`mktemp`
+ test $verbose && echo "checking crush map #$epoch"
+ if test_crush $store_path $epoch $max_osd $crush_path; then
+ test $verbose && echo "crush map version #$epoch works with osdmap epoch #$osdmap_epoch"
+ good_epoch=$epoch
+ good_crush=$crush_path
+ break
+ fi
+ rm -f $crush_path
+ done
+
+ if test $good_epoch; then
+ echo "good crush map found at epoch $epoch/$last_osdmap_epoch"
+ else
+ echo "Unable to find a crush map for osdmap version #$osdmap_epoch." 2>&1
+ return 1
+ fi
+
+ if test $good_epoch -eq $last_osdmap_epoch; then
+ echo "and mon store has no faulty crush maps."
+ elif test $output; then
+ $CEPH_BIN/crushtool --decompile $good_crush --outfn $output
+ elif test $rewrite; then
+ $CEPH_BIN/ceph-monstore-tool $store_path rewrite-crush -- \
+ --crush $good_crush \
+ --good-epoch $good_epoch
+ else
+ echo
+ $CEPH_BIN/crushtool --decompile $good_crush
+ fi
+ rm -f $good_crush
+}
+
+main "$@"
diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc
new file mode 100644
index 00000000..f5a78c52
--- /dev/null
+++ b/src/tools/ceph_authtool.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ConfUtils.h"
+#include "common/ceph_argparse.h"
+#include "common/config_proxy.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+
+#include "auth/Crypto.h"
+#include "auth/Auth.h"
+#include "auth/KeyRing.h"
+
+void usage()
+{
+ cout << "usage: ceph-authtool keyringfile [OPTIONS]...\n"
+ << "where the options are:\n"
+ << " -l, --list will list all keys and capabilities present in\n"
+ << " the keyring\n"
+ << " -p, --print-key will print an encoded key for the specified\n"
+ << " entityname. This is suitable for the\n"
+ << " 'mount -o secret=..' argument\n"
+ << " -C, --create-keyring will create a new keyring, overwriting any\n"
+ << " existing keyringfile\n"
+ << " -g, --gen-key will generate a new secret key for the\n"
+ << " specified entityname\n"
+ << " --gen-print-key will generate a new secret key without set it\n"
+ << " to the keyringfile, prints the secret to stdout\n"
+ << " --import-keyring FILE will import the content of a given keyring\n"
+ << " into the keyringfile\n"
+ << " -n NAME, --name NAME specify entityname to operate on\n"
+ << " -a BASE64, --add-key BASE64 will add an encoded key to the keyring\n"
+ << " --cap SUBSYSTEM CAPABILITY will set the capability for given subsystem\n"
+ << " --caps CAPSFILE will set all of capabilities associated with a\n"
+ << " given key, for all subsystems\n"
+ << " --mode MODE will set the desired file mode to the keyring\n"
+ << " e.g: '0644', defaults to '0600'"
+ << std::endl;
+ exit(1);
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ std::string add_key;
+ std::string caps_fn;
+ std::string import_keyring;
+ map<string,bufferlist> caps;
+ std::string fn;
+
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+ bool gen_key = false;
+ bool gen_print_key = false;
+ bool list = false;
+ bool print_key = false;
+ bool create_keyring = false;
+ int mode = 0600; // keyring file mode
+ std::vector<const char*>::iterator i;
+
+ /* Handle options unique to ceph-authtool
+ * -n NAME, --name NAME is handled by global_init
+ * */
+ for (i = args.begin(); i != args.end(); ) {
+ std::string val;
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_flag(args, i, "-g", "--gen-key", (char*)NULL)) {
+ gen_key = true;
+ } else if (ceph_argparse_flag(args, i, "--gen-print-key", (char*)NULL)) {
+ gen_print_key = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "-a", "--add-key", (char*)NULL)) {
+ if (val.empty()) {
+ cerr << "Option --add-key requires an argument" << std::endl;
+ exit(1);
+ }
+ add_key = val;
+ } else if (ceph_argparse_flag(args, i, "-l", "--list", (char*)NULL)) {
+ list = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
+ caps_fn = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--cap", (char*)NULL)) {
+ std::string my_key = val;
+ if (i == args.end()) {
+ cerr << "must give two arguments to --cap: key and val." << std::endl;
+ exit(1);
+ }
+ std::string my_val = *i;
+ ++i;
+ encode(my_val, caps[my_key]);
+ } else if (ceph_argparse_flag(args, i, "-p", "--print-key", (char*)NULL)) {
+ print_key = true;
+ } else if (ceph_argparse_flag(args, i, "-C", "--create-keyring", (char*)NULL)) {
+ create_keyring = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--import-keyring", (char*)NULL)) {
+ import_keyring = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) {
+ std::string err;
+ mode = strict_strtoll(val.c_str(), 8, &err);
+ if (!err.empty()) {
+ cerr << "Option --mode requires an argument" << std::endl;
+ exit(1);
+ }
+ } else if (fn.empty()) {
+ fn = *i++;
+ } else {
+ cerr << argv[0] << ": unexpected '" << *i << "'" << std::endl;
+ usage();
+ }
+ }
+
+ if (fn.empty() && !gen_print_key) {
+ cerr << argv[0] << ": must specify filename" << std::endl;
+ usage();
+ }
+ if (!(gen_key ||
+ gen_print_key ||
+ !add_key.empty() ||
+ list ||
+ !caps_fn.empty() ||
+ !caps.empty() ||
+ print_key ||
+ create_keyring ||
+ !import_keyring.empty())) {
+ cerr << "no command specified" << std::endl;
+ usage();
+ }
+ if (gen_key && (!add_key.empty())) {
+ cerr << "can't both gen-key and add-key" << std::endl;
+ usage();
+ }
+
+ common_init_finish(g_ceph_context);
+ EntityName ename(g_conf()->name);
+
+ // Enforce the use of gen-key or add-key when creating to avoid ending up
+ // with an "empty" key (key = AAAAAAAAAAAAAAAA)
+ if (create_keyring && !gen_key && add_key.empty() && !caps.empty()) {
+ cerr << "must specify either gen-key or add-key when creating" << std::endl;
+ usage();
+ }
+
+ if (gen_print_key) {
+ CryptoKey key;
+ key.create(g_ceph_context, CEPH_CRYPTO_AES);
+ cout << key << std::endl;
+ return 0;
+ }
+
+ // keyring --------
+ bool modified = false;
+ bool added_entity = false;
+ KeyRing keyring;
+
+ bufferlist bl;
+ int r = 0;
+ if (create_keyring) {
+ cout << "creating " << fn << std::endl;
+ modified = true;
+ } else {
+ std::string err;
+ r = bl.read_file(fn.c_str(), &err);
+ if (r >= 0) {
+ try {
+ auto iter = bl.cbegin();
+ decode(keyring, iter);
+ } catch (const buffer::error &err) {
+ cerr << "error reading file " << fn << std::endl;
+ exit(1);
+ }
+ } else {
+ cerr << "can't open " << fn << ": " << err << std::endl;
+ exit(1);
+ }
+ }
+
+ // Validate that "name" actually has an existing key in this keyring if we
+ // have not given gen-key or add-key options
+ if (!gen_key && add_key.empty() && !caps.empty()) {
+ CryptoKey key;
+ if (!keyring.get_secret(ename, key)) {
+ cerr << "can't find existing key for " << ename
+ << " and neither gen-key nor add-key specified" << std::endl;
+ exit(1);
+ }
+ }
+
+ // write commands
+ if (!import_keyring.empty()) {
+ KeyRing other;
+ bufferlist obl;
+ std::string err;
+ int r = obl.read_file(import_keyring.c_str(), &err);
+ if (r >= 0) {
+ try {
+ auto iter = obl.cbegin();
+ decode(other, iter);
+ } catch (const buffer::error &err) {
+ cerr << "error reading file " << import_keyring << std::endl;
+ exit(1);
+ }
+
+ cout << "importing contents of " << import_keyring << " into " << fn << std::endl;
+ //other.print(cout);
+ keyring.import(g_ceph_context, other);
+ modified = true;
+ } else {
+ cerr << "can't open " << import_keyring << ": " << err << std::endl;
+ exit(1);
+ }
+ }
+ if (gen_key) {
+ EntityAuth eauth;
+ eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+ keyring.add(ename, eauth);
+ modified = true;
+ }
+ if (!add_key.empty()) {
+ EntityAuth eauth;
+ try {
+ eauth.key.decode_base64(add_key);
+ } catch (const buffer::error &err) {
+ cerr << "can't decode key '" << add_key << "'" << std::endl;
+ exit(1);
+ }
+ keyring.add(ename, eauth);
+ modified = true;
+ cout << "added entity " << ename << " " << eauth << std::endl;
+ added_entity = true;
+ }
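+  // An illustrative caps file for --caps (a sketch; the capability strings
+  // are examples only). One value per subsystem is read from the "global"
+  // section, matching key_names[] below:
+  //
+  //   [global]
+  //   mon = "allow r"
+  //   osd = "allow rwx pool=foo"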
+ if (!caps_fn.empty()) {
+ ConfFile cf;
+ std::deque<std::string> parse_errors;
+ if (cf.parse_file(caps_fn, &parse_errors, &cerr) != 0) {
+ cerr << "could not parse caps file " << caps_fn << std::endl;
+ exit(1);
+ }
+ complain_about_parse_errors(g_ceph_context, &parse_errors);
+ map<string, bufferlist> caps;
+ const char *key_names[] = { "mon", "osd", "mds", "mgr", NULL };
+ for (int i=0; key_names[i]; i++) {
+ std::string val;
+ if (cf.read("global", key_names[i], val) == 0) {
+ bufferlist bl;
+ encode(val, bl);
+ string s(key_names[i]);
+ caps[s] = bl;
+ }
+ }
+ keyring.set_caps(ename, caps);
+ modified = true;
+ }
+ if (!caps.empty()) {
+ keyring.set_caps(ename, caps);
+ modified = true;
+ }
+ if (added_entity && caps.size() > 0) {
+ cout << "added " << caps.size() << " caps to entity " << ename << std::endl;
+ }
+
+ // read commands
+ if (list) {
+ try {
+ keyring.print(cout);
+ } catch (ceph::buffer::end_of_buffer &eob) {
+ cout << "Exception (end_of_buffer) in print(), exit." << std::endl;
+ exit(1);
+ }
+ }
+ if (print_key) {
+ CryptoKey key;
+ if (keyring.get_secret(ename, key)) {
+ cout << key << std::endl;
+ } else {
+ cerr << "entity " << ename << " not found" << std::endl;
+ exit(1);
+ }
+ }
+
+ // write result?
+ if (modified) {
+ bufferlist bl;
+ keyring.encode_plaintext(bl);
+ r = bl.write_file(fn.c_str(), mode);
+ if (r < 0) {
+ cerr << "could not write " << fn << std::endl;
+ exit(1);
+ }
+ //cout << "wrote " << bl.length() << " bytes to " << fn << std::endl;
+ }
+ return 0;
+}
diff --git a/src/tools/ceph_conf.cc b/src/tools/ceph_conf.cc
new file mode 100644
index 00000000..48511e5c
--- /dev/null
+++ b/src/tools/ceph_conf.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iomanip>
+#include <string>
+
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "mon/AuthMonitor.h"
+#include "common/Formatter.h"
+
+using std::deque;
+using std::string;
+
+static void usage(std::ostream& out)
+{
+ // TODO: add generic_usage once cerr/derr issues are resolved
+ out << R"(Ceph configuration query tool
+
+USAGE
+ceph-conf <flags> <action>
+
+ACTIONS
+ -L|--list-all-sections List all sections
+ -l|--list-sections <prefix> List sections with the given prefix
+ --filter-key <key> Filter section list to only include sections
+ with given key defined.
+ --filter-key-value <key>=<val> Filter section list to only include sections
+ with given key/value pair.
+ --lookup <key> Print a configuration setting to stdout.
+ Returns 0 (success) if the configuration setting is
+ found; 1 otherwise.
+ -r|--resolve-search search for the first file that exists and
+ can be opened in the resulting
+ comma-delimited search list.
+ -D|--dump-all dump all variables.
+
+FLAGS
+ --name name Set type.id
+ [-s <section>] Add to list of sections to search
+ [--format plain|json|json-pretty]
+ dump variables in plain text, json or pretty
+ json
+
+If there is no action given, the action will default to --lookup.
+
+EXAMPLES
+$ ceph-conf --name mon.0 -c /etc/ceph/ceph.conf 'mon addr'
+Find out what the value of 'mon addr' is for monitor 0.
+
+$ ceph-conf -l mon
+List sections beginning with 'mon'.
+
+RETURN CODE
+Return code will be 0 on success; error code otherwise.
+)";
+}
+
+static int list_sections(const std::string &prefix,
+ const std::list<string>& filter_key,
+ const std::map<string,string>& filter_key_value)
+{
+ std::vector <std::string> sections;
+ int ret = g_conf().get_all_sections(sections);
+ if (ret)
+ return 2;
+ for (std::vector<std::string>::const_iterator p = sections.begin();
+ p != sections.end(); ++p) {
+ if (strncmp(prefix.c_str(), p->c_str(), prefix.size()))
+ continue;
+
+ std::vector<std::string> sec;
+ sec.push_back(*p);
+
+ int r = 0;
+ for (std::list<string>::const_iterator q = filter_key.begin(); q != filter_key.end(); ++q) {
+ string v;
+ r = g_conf().get_val_from_conf_file(sec, q->c_str(), v, false);
+ if (r < 0)
+ break;
+ }
+ if (r < 0)
+ continue;
+
+ for (std::map<string,string>::const_iterator q = filter_key_value.begin();
+ q != filter_key_value.end();
+ ++q) {
+ string v;
+ r = g_conf().get_val_from_conf_file(sec, q->first.c_str(), v, false);
+ if (r < 0 || v != q->second) {
+ r = -1;
+ break;
+ }
+ }
+ if (r < 0)
+ continue;
+
+ cout << *p << std::endl;
+ }
+ return 0;
+}
+
+static int lookup(const std::deque<std::string> &sections,
+ const std::string &key, bool resolve_search)
+{
+ std::vector <std::string> my_sections;
+ for (deque<string>::const_iterator s = sections.begin(); s != sections.end(); ++s) {
+ my_sections.push_back(*s);
+ }
+ g_conf().get_my_sections(my_sections);
+ std::string val;
+ int ret = g_conf().get_val_from_conf_file(my_sections, key.c_str(), val, true);
+ if (ret == -ENOENT)
+ return 1;
+ else if (ret == 0) {
+ if (resolve_search) {
+ string result;
+ ret = ceph_resolve_file_search(val, result);
+ if (!ret)
+ puts(result.c_str());
+ }
+ else {
+ puts(val.c_str());
+ }
+ return 0;
+ }
+ else {
+ cerr << "error looking up '" << key << "': error " << ret << std::endl;
+ return 2;
+ }
+}
+
+static int dump_all(const string& format)
+{
+ if (format == "" || format == "plain") {
+ g_conf().show_config(std::cout);
+ return 0;
+ } else {
+ unique_ptr<Formatter> f(Formatter::create(format));
+ if (f) {
+ f->open_object_section("ceph-conf");
+ g_conf().show_config(f.get());
+ f->close_section();
+ f->flush(std::cout);
+ return 0;
+ }
+ cerr << "format '" << format << "' not recognized." << std::endl;
+ usage(cerr);
+ return 1;
+ }
+}
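+
+// Example invocation handled by dump_all() (flags as documented in usage()):
+//   ceph-conf -D --format json-pretty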
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ deque<std::string> sections;
+ bool resolve_search = false;
+ std::string action;
+ std::string lookup_key;
+ std::string section_list_prefix;
+ std::list<string> filter_key;
+ std::map<string,string> filter_key_value;
+ std::string dump_format;
+
+ argv_to_vec(argc, argv, args);
+ auto orig_args = args;
+ auto cct = [&args] {
+ std::map<std::string,std::string> defaults = {{"log_to_file", "false"}};
+ return global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_NO_DAEMON_ACTIONS |
+ CINIT_FLAG_NO_MON_CONFIG);
+ }();
+
+  // do not common_init_finish(); do not start threads; do not do any of the
+  // wonky things the daemon whose conf we are examining would do (like initialize
+ // the admin socket).
+ //common_init_finish(g_ceph_context);
+
+ std::string val;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "-s", "--section", (char*)NULL)) {
+ sections.push_back(val);
+ } else if (ceph_argparse_flag(args, i, "-r", "--resolve_search", (char*)NULL)) {
+ resolve_search = true;
+ } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+ action = "help";
+ } else if (ceph_argparse_witharg(args, i, &val, "--lookup", (char*)NULL)) {
+ action = "lookup";
+ lookup_key = val;
+ } else if (ceph_argparse_flag(args, i, "-L", "--list_all_sections", (char*)NULL)) {
+ action = "list-sections";
+ section_list_prefix = "";
+ } else if (ceph_argparse_witharg(args, i, &val, "-l", "--list_sections", (char*)NULL)) {
+ action = "list-sections";
+ section_list_prefix = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--filter_key", (char*)NULL)) {
+ filter_key.push_back(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--filter_key_value", (char*)NULL)) {
+ size_t pos = val.find_first_of('=');
+ if (pos == string::npos) {
+ cerr << "expecting argument like 'key=value' for --filter-key-value (not '" << val << "')" << std::endl;
+ usage(cerr);
+ return EXIT_FAILURE;
+ }
+ string key(val, 0, pos);
+ string value(val, pos+1);
+ filter_key_value[key] = value;
+ } else if (ceph_argparse_flag(args, i, "-D", "--dump_all", (char*)NULL)) {
+ action = "dumpall";
+ } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) {
+ dump_format = val;
+ } else {
+ if (((action == "lookup") || (action == "")) && (lookup_key.empty())) {
+ action = "lookup";
+ lookup_key = *i++;
+ } else {
+ cerr << "unable to parse option: '" << *i << "'" << std::endl;
+ cerr << "args:";
+ for (auto arg : orig_args) {
+ cerr << " " << quoted(arg);
+ }
+ cerr << std::endl;
+ usage(cerr);
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ cct->_log->flush();
+ if (action == "help") {
+ usage(cout);
+ return EXIT_SUCCESS;
+ } else if (action == "list-sections") {
+ return list_sections(section_list_prefix, filter_key, filter_key_value);
+ } else if (action == "lookup") {
+ return lookup(sections, lookup_key, resolve_search);
+ } else if (action == "dumpall") {
+ return dump_all(dump_format);
+ } else {
+ cerr << "You must give an action, such as --lookup or --list-all-sections." << std::endl;
+ cerr << "Pass --help for more help." << std::endl;
+ return EXIT_FAILURE;
+ }
+}
diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc
new file mode 100644
index 00000000..1713cde4
--- /dev/null
+++ b/src/tools/ceph_dedup_tool.cc
@@ -0,0 +1,834 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Myoungwon Oh <ohmyoungwon@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/types.h"
+
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rados/rados_types.hpp"
+
+#include "acconfig.h"
+
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/obj_bencher.h"
+
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <time.h>
+#include <sstream>
+#include <errno.h>
+#include <dirent.h>
+#include <stdexcept>
+#include <climits>
+#include <locale>
+#include <memory>
+
+#include "tools/RadosDump.h"
+#include "cls/cas/cls_cas_client.h"
+#include "include/stringify.h"
+#include "global/signal_handler.h"
+
+using namespace librados;
+unsigned default_op_size = 1 << 22;
+unsigned default_max_thread = 2;
+int32_t default_report_period = 2;
+map< string, pair <uint64_t, uint64_t> > chunk_statistics; // < key, <count, chunk_size> >
+Mutex glock("chunk_statistics::Locker");
+
+void usage()
+{
+ cout << " usage: [--op <estimate|chunk_scrub|add_chunk_ref|get_chunk_ref>] [--pool <pool_name> ] " << std::endl;
+ cout << " --object <object_name> " << std::endl;
+ cout << " --chunk-size <size> chunk-size (byte) " << std::endl;
+ cout << " --chunk-algorithm <fixed> " << std::endl;
+ cout << " --fingerprint-algorithm <sha1> " << std::endl;
+ cout << " --chunk-pool <pool name> " << std::endl;
+ cout << " --max-thread <threads> " << std::endl;
+  cout << "   --report-period <seconds> " << std::endl;
+ exit(1);
+}
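+// Example invocation (illustrative only; the binary name, pool name and
+// sizes are assumptions, not taken from this file):
+//
+//   ceph-dedup-tool --op estimate --pool rbd \
+//     --chunk-algorithm fixed --fingerprint-algorithm sha1 \
+//     --chunk-size 65536 --max-thread 4 --report-period 10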
+
+[[noreturn]] static void usage_exit()
+{
+ usage();
+ exit(1);
+}
+
+template <typename I, typename T>
+static int rados_sistrtoll(I &i, T *val) {
+ std::string err;
+ *val = strict_iecstrtoll(i->second.c_str(), &err);
+ if (err != "") {
+ cerr << "Invalid value for " << i->first << ": " << err << std::endl;
+ return -EINVAL;
+ } else {
+ return 0;
+ }
+}
+
+class EstimateDedupRatio;
+class ChunkScrub;
+class EstimateThread : public Thread
+{
+ IoCtx io_ctx;
+ int n;
+ int m;
+ ObjectCursor begin;
+ ObjectCursor end;
+ Mutex m_lock;
+ Cond m_cond;
+ int32_t timeout;
+ bool m_stop = false;
+ uint64_t total_bytes = 0;
+ uint64_t examined_objects = 0;
+ uint64_t total_objects = 0;
+#define COND_WAIT_INTERVAL 10
+
+public:
+ EstimateThread(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end, int32_t timeout):
+ io_ctx(io_ctx), n(n), m(m), begin(begin), end(end), m_lock("EstimateThread::Locker"), timeout(timeout)
+ {}
+ void signal(int signum) {
+ Mutex::Locker l(m_lock);
+ m_stop = true;
+ m_cond.Signal();
+ }
+ virtual void print_status(Formatter *f, ostream &out) = 0;
+ uint64_t count_objects(IoCtx &ioctx, ObjectCursor &begin, ObjectCursor &end);
+ uint64_t get_examined_objects() { return examined_objects; }
+ uint64_t get_total_bytes() { return total_bytes; }
+ uint64_t get_total_objects() { return total_objects; }
+ friend class EstimateDedupRatio;
+ friend class ChunkScrub;
+};
+
+class EstimateDedupRatio : public EstimateThread
+{
+ string chunk_algo;
+ string fp_algo;
+ uint64_t chunk_size;
+ map< string, pair <uint64_t, uint64_t> > local_chunk_statistics; // < key, <count, chunk_size> >
+
+public:
+ EstimateDedupRatio(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end,
+ string chunk_algo, string fp_algo, uint64_t chunk_size, int32_t timeout):
+ EstimateThread(io_ctx, n, m, begin, end, timeout), chunk_algo(chunk_algo), fp_algo(fp_algo),
+ chunk_size(chunk_size) { }
+
+ void* entry() {
+ count_objects(io_ctx, begin, end);
+ estimate_dedup_ratio();
+ return NULL;
+ }
+ void estimate_dedup_ratio();
+ void print_status(Formatter *f, ostream &out);
+ map< string, pair <uint64_t, uint64_t> > &get_chunk_statistics() { return local_chunk_statistics; }
+ uint64_t fixed_chunk(string oid, uint64_t offset);
+};
+
+class ChunkScrub: public EstimateThread
+{
+ IoCtx chunk_io_ctx;
+ int fixed_objects = 0;
+
+public:
+ ChunkScrub(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end,
+ IoCtx& chunk_io_ctx, int32_t timeout):
+ EstimateThread(io_ctx, n, m, begin, end, timeout), chunk_io_ctx(chunk_io_ctx)
+ { }
+ void* entry() {
+ count_objects(chunk_io_ctx, begin, end);
+ chunk_scrub_common();
+ return NULL;
+ }
+ void chunk_scrub_common();
+ int get_fixed_objects() { return fixed_objects; }
+ void print_status(Formatter *f, ostream &out);
+};
+
+vector<std::unique_ptr<EstimateThread>> estimate_threads;
+
+uint64_t EstimateThread::count_objects(IoCtx &ioctx, ObjectCursor &begin, ObjectCursor &end)
+{
+ ObjectCursor shard_start;
+ ObjectCursor shard_end;
+ uint64_t count = 0;
+
+ ioctx.object_list_slice(
+ begin,
+ end,
+ n,
+ m,
+ &shard_start,
+ &shard_end);
+
+ ObjectCursor c(shard_start);
+ while(c < shard_end)
+ {
+ std::vector<ObjectItem> result;
+ int r = ioctx.object_list(c, shard_end, 12, {}, &result, &c);
+ if (r < 0 ) {
+ cerr << "error object_list : " << cpp_strerror(r) << std::endl;
+ return 0;
+ }
+ count += result.size();
+ total_objects += result.size();
+ }
+ return count;
+}
+
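+// Merge the per-thread chunk statistics and print the estimated dedup ratio,
+// computed as 100 - (deduped size / total size) * 100; e.g. (illustrative
+// numbers) 40 MiB of unique chunks out of 100 MiB examined prints a 60 %
+// dedup ratio.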
+static void print_dedup_estimate(bool debug = false)
+{
+ uint64_t total_size = 0;
+ uint64_t dedup_size = 0;
+ uint64_t examined_objects = 0;
+ uint64_t total_objects = 0;
+ EstimateDedupRatio *ratio = NULL;
+ for (auto &et : estimate_threads) {
+ Mutex::Locker l(glock);
+ ratio = dynamic_cast<EstimateDedupRatio*>(et.get());
+ assert(ratio);
+ for (auto p : ratio->get_chunk_statistics()) {
+ auto c = chunk_statistics.find(p.first);
+ if (c != chunk_statistics.end()) {
+ c->second.first += p.second.first;
+ } else {
+ chunk_statistics.insert(p);
+ }
+ }
+ }
+
+ if (debug) {
+ for (auto p : chunk_statistics) {
+ cout << " -- " << std::endl;
+ cout << " key: " << p.first << std::endl;
+ cout << " count: " << p.second.first << std::endl;
+ cout << " chunk_size: " << p.second.second << std::endl;
+ dedup_size += p.second.second;
+ cout << " -- " << std::endl;
+ }
+ } else {
+ for (auto p : chunk_statistics) {
+ dedup_size += p.second.second;
+ }
+
+ }
+
+ for (auto &et : estimate_threads) {
+ total_size += et->get_total_bytes();
+ examined_objects += et->get_examined_objects();
+ total_objects += et->get_total_objects();
+ }
+
+ cout << " result: " << total_size << " | " << dedup_size << " (total size | deduped size) " << std::endl;
+ cout << " Dedup ratio: " << (100 - (double)(dedup_size)/total_size*100) << " % " << std::endl;
+ cout << " Examined objects: " << examined_objects << std::endl;
+ cout << " Total objects: " << total_objects << std::endl;
+}
+
+static void handle_signal(int signum)
+{
+ Mutex::Locker l(glock);
+ for (auto &p : estimate_threads) {
+ p->signal(signum);
+ }
+}
+
+void EstimateDedupRatio::print_status(Formatter *f, ostream &out)
+{
+ if (f) {
+ f->open_array_section("estimate_dedup_ratio");
+ f->dump_string("PID", stringify(get_pid()));
+ for (auto p : local_chunk_statistics) {
+ f->open_object_section("fingerprint object");
+      f->dump_string("fingerprint", p.first);
+ f->dump_string("count", stringify(p.second.first));
+ f->dump_string("chunk_size", stringify(p.second.second));
+ }
+ f->close_section();
+ f->open_object_section("Status");
+ f->dump_string("Total bytes", stringify(total_bytes));
+    f->dump_string("Examined objects", stringify(examined_objects));
+ f->close_section();
+ f->flush(out);
+ cout << std::endl;
+ }
+}
+
+void EstimateDedupRatio::estimate_dedup_ratio()
+{
+ ObjectCursor shard_start;
+ ObjectCursor shard_end;
+ utime_t cur_time = ceph_clock_now();
+
+ io_ctx.object_list_slice(
+ begin,
+ end,
+ n,
+ m,
+ &shard_start,
+ &shard_end);
+
+ ObjectCursor c(shard_start);
+ while(c < shard_end)
+ {
+ std::vector<ObjectItem> result;
+ int r = io_ctx.object_list(c, shard_end, 12, {}, &result, &c);
+ if (r < 0 ){
+ cerr << "error object_list : " << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ for (const auto & i : result) {
+ const auto &oid = i.oid;
+ uint64_t offset = 0;
+ while (true) {
+ Mutex::Locker l(m_lock);
+ if (m_stop) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ print_status(formatter, cout);
+ delete formatter;
+ return;
+ }
+
+ uint64_t next_offset;
+ if (chunk_algo == "fixed") {
+ next_offset = fixed_chunk(oid, offset);
+ } else {
+ // CDC ..
+          ceph_assert(0 == "unsupported chunk algorithm");
+ }
+
+ if (!next_offset) {
+ break;
+ }
+ offset += next_offset;
+ m_cond.WaitInterval(m_lock,utime_t(0, COND_WAIT_INTERVAL));
+ if (cur_time + utime_t(timeout, 0) < ceph_clock_now()) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ print_status(formatter, cout);
+ delete formatter;
+ cur_time = ceph_clock_now();
+ }
+ }
+ examined_objects++;
+ }
+ }
+}
+
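+// A sketch of the fixed chunking step below: read up to op_size bytes of
+// 'oid' at 'offset', cut the data into chunk_size pieces, fingerprint each
+// piece with SHA-1 and bump its count in local_chunk_statistics. Returns the
+// number of bytes consumed so the caller can advance 'offset', or 0 after
+// the final short read to signal the end of the object.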
+uint64_t EstimateDedupRatio::fixed_chunk(string oid, uint64_t offset)
+{
+ unsigned op_size = default_op_size;
+ int ret;
+ bufferlist outdata;
+ ret = io_ctx.read(oid, outdata, op_size, offset);
+ if (ret <= 0) {
+ return 0;
+ }
+
+ if (fp_algo == "sha1") {
+ uint64_t c_offset = 0;
+ while (c_offset < outdata.length()) {
+ bufferlist chunk;
+ if (outdata.length() - c_offset > chunk_size) {
+ bufferptr bptr(chunk_size);
+ chunk.push_back(std::move(bptr));
+        chunk.copy_in(0, chunk_size, outdata.c_str() + c_offset);
+ } else {
+ bufferptr bptr(outdata.length() - c_offset);
+ chunk.push_back(std::move(bptr));
+        chunk.copy_in(0, outdata.length() - c_offset, outdata.c_str() + c_offset);
+ }
+ sha1_digest_t sha1_val = chunk.sha1();
+ string fp = sha1_val.to_str();
+ auto p = local_chunk_statistics.find(fp);
+ if (p != local_chunk_statistics.end()) {
+ uint64_t count = p->second.first;
+ count++;
+ local_chunk_statistics[fp] = make_pair(count, chunk.length());
+ } else {
+ local_chunk_statistics[fp] = make_pair(1, chunk.length());
+ }
+ total_bytes += chunk.length();
+ c_offset = c_offset + chunk_size;
+ }
+ } else {
+    ceph_assert(0 == "unsupported fingerprint algorithm");
+ }
+
+ if (outdata.length() < op_size) {
+ return 0;
+ }
+ return outdata.length();
+}
+
+void ChunkScrub::chunk_scrub_common()
+{
+ ObjectCursor shard_start;
+ ObjectCursor shard_end;
+ int ret;
+ utime_t cur_time = ceph_clock_now();
+
+ chunk_io_ctx.object_list_slice(
+ begin,
+ end,
+ n,
+ m,
+ &shard_start,
+ &shard_end);
+
+ ObjectCursor c(shard_start);
+ while(c < shard_end)
+ {
+ std::vector<ObjectItem> result;
+ int r = chunk_io_ctx.object_list(c, shard_end, 12, {}, &result, &c);
+ if (r < 0 ){
+ cerr << "error object_list : " << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ for (const auto & i : result) {
+ Mutex::Locker l(m_lock);
+ if (m_stop) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ print_status(formatter, cout);
+ delete formatter;
+ return;
+ }
+ auto oid = i.oid;
+ set<hobject_t> refs;
+ set<hobject_t> real_refs;
+ ret = cls_chunk_refcount_read(chunk_io_ctx, oid, &refs);
+ if (ret < 0) {
+ continue;
+ }
+
+ for (auto pp : refs) {
+ ret = cls_chunk_has_chunk(io_ctx, pp.oid.name, oid);
+ if (ret != -ENOENT) {
+ real_refs.insert(pp);
+ }
+ }
+
+ if (refs.size() != real_refs.size()) {
+ ObjectWriteOperation op;
+ cls_chunk_refcount_set(op, real_refs);
+ ret = chunk_io_ctx.operate(oid, &op);
+ if (ret < 0) {
+ continue;
+ }
+ fixed_objects++;
+ }
+ examined_objects++;
+ m_cond.WaitInterval(m_lock,utime_t(0, COND_WAIT_INTERVAL));
+ if (cur_time + utime_t(timeout, 0) < ceph_clock_now()) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ print_status(formatter, cout);
+ delete formatter;
+ cur_time = ceph_clock_now();
+ }
+ }
+ }
+}
+
+void ChunkScrub::print_status(Formatter *f, ostream &out)
+{
+ if (f) {
+ f->open_array_section("chunk_scrub");
+ f->dump_string("PID", stringify(get_pid()));
+ f->open_object_section("Status");
+ f->dump_string("Total object", stringify(total_objects));
+ f->dump_string("Examined objectes", stringify(examined_objects));
+ f->dump_string("Fixed objectes", stringify(fixed_objects));
+ f->close_section();
+ f->flush(out);
+ cout << std::endl;
+ }
+}
+
+int estimate_dedup_ratio(const std::map < std::string, std::string > &opts,
+ std::vector<const char*> &nargs)
+{
+ Rados rados;
+ IoCtx io_ctx;
+ std::string chunk_algo;
+ string fp_algo;
+ string pool_name;
+ uint64_t chunk_size = 0;
+ unsigned max_thread = default_max_thread;
+ uint32_t report_period = default_report_period;
+ int ret;
+ std::map<std::string, std::string>::const_iterator i;
+ bool debug = false;
+ ObjectCursor begin;
+ ObjectCursor end;
+
+ i = opts.find("pool");
+ if (i != opts.end()) {
+ pool_name = i->second.c_str();
+ }
+ i = opts.find("chunk-algorithm");
+ if (i != opts.end()) {
+ chunk_algo = i->second.c_str();
+ if (chunk_algo != "fixed") {
+ usage_exit();
+ }
+ } else {
+ usage_exit();
+ }
+
+ i = opts.find("fingerprint-algorithm");
+ if (i != opts.end()) {
+ fp_algo = i->second.c_str();
+ if (fp_algo != "sha1") {
+ usage_exit();
+ }
+ } else {
+ usage_exit();
+ }
+
+ i = opts.find("chunk-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &chunk_size)) {
+ return -EINVAL;
+ }
+ } else {
+ usage_exit();
+ }
+
+ i = opts.find("max-thread");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_thread)) {
+ return -EINVAL;
+ }
+ }
+
+ i = opts.find("report-period");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &report_period)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("debug");
+ if (i != opts.end()) {
+ debug = true;
+ }
+
+ i = opts.find("pgid");
+ boost::optional<pg_t> pgid(i != opts.end(), pg_t());
+
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ ret = -1;
+ goto out;
+ }
+ if (pool_name.empty()) {
+    cerr << "a pool name must be specified with --pool" << std::endl;
+ usage_exit();
+ }
+ ret = rados.ioctx_create(pool_name.c_str(), io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ glock.Lock();
+ begin = io_ctx.object_list_begin();
+ end = io_ctx.object_list_end();
+ for (unsigned i = 0; i < max_thread; i++) {
+ std::unique_ptr<EstimateThread> ptr (new EstimateDedupRatio(io_ctx, i, max_thread, begin, end,
+ chunk_algo, fp_algo, chunk_size,
+ report_period));
+ ptr->create("estimate_thread");
+ estimate_threads.push_back(move(ptr));
+ }
+ glock.Unlock();
+
+ for (auto &p : estimate_threads) {
+ p->join();
+ }
+
+ print_dedup_estimate(debug);
+
+ out:
+ return (ret < 0) ? 1 : 0;
+}
+
+static void print_chunk_scrub()
+{
+ uint64_t total_objects = 0;
+ uint64_t examined_objects = 0;
+ int fixed_objects = 0;
+
+ for (auto &et : estimate_threads) {
+ total_objects += et->get_total_objects();
+ examined_objects += et->get_examined_objects();
+ ChunkScrub *ptr = static_cast<ChunkScrub*>(et.get());
+ fixed_objects += ptr->get_fixed_objects();
+ }
+
+  cout << " Total objects : " << total_objects << std::endl;
+  cout << " Examined objects : " << examined_objects << std::endl;
+  cout << " Fixed objects : " << fixed_objects << std::endl;
+}
+
+int chunk_scrub_common(const std::map < std::string, std::string > &opts,
+ std::vector<const char*> &nargs)
+{
+ Rados rados;
+ IoCtx io_ctx, chunk_io_ctx;
+ std::string object_name, target_object_name;
+ string pool_name, chunk_pool_name, op_name;
+ int ret;
+ unsigned max_thread = default_max_thread;
+ std::map<std::string, std::string>::const_iterator i;
+ uint32_t report_period = default_report_period;
+ ObjectCursor begin;
+ ObjectCursor end;
+
+ i = opts.find("pool");
+ if (i != opts.end()) {
+ pool_name = i->second.c_str();
+ } else {
+ usage_exit();
+ }
+ i = opts.find("op_name");
+ if (i != opts.end()) {
+ op_name= i->second.c_str();
+ } else {
+ usage_exit();
+ }
+
+ i = opts.find("chunk-pool");
+ if (i != opts.end()) {
+ chunk_pool_name = i->second.c_str();
+ } else {
+ usage_exit();
+ }
+ i = opts.find("max-thread");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_thread)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("report-period");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &report_period)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("pgid");
+ boost::optional<pg_t> pgid(i != opts.end(), pg_t());
+
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ ret = -1;
+ goto out;
+ }
+ if (pool_name.empty()) {
+    cerr << "a pool name must be specified with --pool" << std::endl;
+ usage_exit();
+ }
+ ret = rados.ioctx_create(pool_name.c_str(), io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = rados.ioctx_create(chunk_pool_name.c_str(), chunk_io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << chunk_pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ if (op_name == "add_chunk_ref") {
+ string target_object_name;
+ i = opts.find("object");
+ if (i != opts.end()) {
+ object_name = i->second.c_str();
+ } else {
+ usage_exit();
+ }
+ i = opts.find("target-ref");
+ if (i != opts.end()) {
+ target_object_name = i->second.c_str();
+ } else {
+ usage_exit();
+ }
+
+ set<hobject_t> refs;
+ ret = cls_chunk_refcount_read(chunk_io_ctx, object_name, &refs);
+ if (ret < 0) {
+      cerr << " cls_chunk_refcount_read failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ for (auto p : refs) {
+ cout << " " << p.oid.name << " ";
+ }
+
+ uint32_t hash;
+ ret = chunk_io_ctx.get_object_hash_position2(object_name, &hash);
+ if (ret < 0) {
+ return ret;
+ }
+ hobject_t oid(sobject_t(target_object_name, CEPH_NOSNAP), "", hash, -1, "");
+ refs.insert(oid);
+
+ ObjectWriteOperation op;
+ cls_chunk_refcount_set(op, refs);
+ ret = chunk_io_ctx.operate(object_name, &op);
+ if (ret < 0) {
+      cerr << " operate failed: " << cpp_strerror(ret) << std::endl;
+ }
+
+ return ret;
+
+ } else if (op_name == "get_chunk_ref") {
+ i = opts.find("object");
+ if (i != opts.end()) {
+ object_name = i->second.c_str();
+ } else {
+ usage_exit();
+ }
+ set<hobject_t> refs;
+ cout << " refs: " << std::endl;
+ ret = cls_chunk_refcount_read(chunk_io_ctx, object_name, &refs);
+ for (auto p : refs) {
+ cout << " " << p.oid.name << " ";
+ }
+ cout << std::endl;
+ return ret;
+ }
+
+ glock.Lock();
+ begin = io_ctx.object_list_begin();
+ end = io_ctx.object_list_end();
+ for (unsigned i = 0; i < max_thread; i++) {
+ std::unique_ptr<EstimateThread> ptr (new ChunkScrub(io_ctx, i, max_thread, begin, end, chunk_io_ctx,
+ report_period));
+ ptr->create("estimate_thread");
+ estimate_threads.push_back(move(ptr));
+ }
+ glock.Unlock();
+
+ for (auto &p : estimate_threads) {
+ p->join();
+ }
+
+ print_chunk_scrub();
+
+out:
+ return (ret < 0) ? 1 : 0;
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ std::string fn;
+ string op_name;
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+ init_async_signal_handler();
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+ std::map < std::string, std::string > opts;
+ std::string val;
+ std::vector<const char*>::iterator i;
+ for (i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "--op", (char*)NULL)) {
+ opts["op_name"] = val;
+ op_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--pool", (char*)NULL)) {
+ opts["pool"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--object", (char*)NULL)) {
+ opts["object"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--chunk-algorithm", (char*)NULL)) {
+ opts["chunk-algorithm"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--chunk-size", (char*)NULL)) {
+ opts["chunk-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--fingerprint-algorithm", (char*)NULL)) {
+ opts["fingerprint-algorithm"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--chunk-pool", (char*)NULL)) {
+ opts["chunk-pool"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-ref", (char*)NULL)) {
+ opts["target-ref"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-thread", (char*)NULL)) {
+ opts["max-thread"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--report-period", (char*)NULL)) {
+ opts["report-period"] = val;
+ } else if (ceph_argparse_flag(args, i, "--debug", (char*)NULL)) {
+ opts["debug"] = "true";
+ } else {
+      if ((*i)[0] == '-')
+ usage_exit();
+ ++i;
+ }
+ }
+
+ if (op_name == "estimate") {
+ return estimate_dedup_ratio(opts, args);
+ } else if (op_name == "chunk_scrub") {
+ return chunk_scrub_common(opts, args);
+ } else if (op_name == "add_chunk_ref") {
+ return chunk_scrub_common(opts, args);
+ } else if (op_name == "get_chunk_ref") {
+ return chunk_scrub_common(opts, args);
+ } else {
+ usage();
+ exit(0);
+ }
+
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ return 0;
+}
diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc
new file mode 100644
index 00000000..4a4f5214
--- /dev/null
+++ b/src/tools/ceph_kvstore_tool.cc
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <map>
+#include <set>
+#include <string>
+#include <fstream>
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/url_escape.h"
+
+#include "global/global_context.h"
+#include "global/global_init.h"
+
+#include "kvstore_tool.h"
+
+void usage(const char *pname)
+{
+ std::cout << "Usage: " << pname << " <leveldb|rocksdb|bluestore-kv> <store path> command [args...]\n"
+ << "\n"
+ << "Commands:\n"
+ << " list [prefix]\n"
+ << " list-crc [prefix]\n"
+ << " dump [prefix]\n"
+ << " exists <prefix> [key]\n"
+ << " get <prefix> <key> [out <file>]\n"
+ << " crc <prefix> <key>\n"
+ << " get-size [<prefix> <key>]\n"
+ << " set <prefix> <key> [ver <N>|in <file>]\n"
+ << " rm <prefix> <key>\n"
+ << " rm-prefix <prefix>\n"
+ << " store-copy <path> [num-keys-per-tx] [leveldb|rocksdb|...] \n"
+ << " store-crc <path>\n"
+ << " compact\n"
+ << " compact-prefix <prefix>\n"
+ << " compact-range <prefix> <start> <end>\n"
+ << " destructive-repair (use only as last resort! may corrupt healthy data)\n"
+ << " stats\n"
+ << std::endl;
+}
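+// Example invocations (illustrative; the store path is an assumption):
+//   ceph-kvstore-tool rocksdb /var/lib/ceph/mon/ceph-a/store.db list osdmap
+//   ceph-kvstore-tool rocksdb /var/lib/ceph/mon/ceph-a/store.db compact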
+
+int main(int argc, const char *argv[])
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage(argv[0]);
+ exit(0);
+ }
+
+ map<string,string> defaults = {
+ { "debug_rocksdb", "2" }
+ };
+
+ auto cct = global_init(
+ &defaults, args,
+ CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ ceph_assert((int)args.size() < argc);
+ for(size_t i=0; i<args.size(); i++)
+ argv[i+1] = args[i];
+ argc = args.size() + 1;
+
+ if (args.size() < 3) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ string type(args[0]);
+ string path(args[1]);
+ string cmd(args[2]);
+
+ if (type != "leveldb" &&
+ type != "rocksdb" &&
+ type != "bluestore-kv") {
+
+ std::cerr << "Unrecognized type: " << args[0] << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool need_open_db = (cmd != "destructive-repair");
+ bool need_stats = (cmd == "stats");
+ StoreTool st(type, path, need_open_db, need_stats);
+
+ if (cmd == "destructive-repair") {
+ int ret = st.destructive_repair();
+ if (!ret) {
+ std::cout << "destructive-repair completed without reporting an error"
+ << std::endl;
+ } else {
+ std::cout << "destructive-repair failed with " << cpp_strerror(ret)
+ << std::endl;
+ }
+ return ret;
+ } else if (cmd == "list" || cmd == "list-crc") {
+ string prefix;
+ if (argc > 4)
+ prefix = url_unescape(argv[4]);
+
+ bool do_crc = (cmd == "list-crc");
+ st.list(prefix, do_crc, false);
+
+ } else if (cmd == "dump") {
+ string prefix;
+ if (argc > 4)
+ prefix = url_unescape(argv[4]);
+ st.list(prefix, false, true);
+
+ } else if (cmd == "exists") {
+ string key;
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ if (argc > 5)
+ key = url_unescape(argv[5]);
+
+ bool ret = st.exists(prefix, key);
+ std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") "
+ << (ret ? "exists" : "does not exist")
+ << std::endl;
+ return (ret ? 0 : 1);
+
+ } else if (cmd == "get") {
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ")";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << std::endl;
+
+ if (argc >= 7) {
+ string subcmd(argv[6]);
+ if (subcmd != "out") {
+ std::cerr << "unrecognized subcmd '" << subcmd << "'"
+ << std::endl;
+ return 1;
+ }
+ if (argc < 8) {
+ std::cerr << "output path not specified" << std::endl;
+ return 1;
+ }
+ string out(argv[7]);
+
+ if (out.empty()) {
+ std::cerr << "unspecified out file" << std::endl;
+ return 1;
+ }
+
+ int err = bl.write_file(argv[7], 0644);
+ if (err < 0) {
+ std::cerr << "error writing value to '" << out << "': "
+ << cpp_strerror(err) << std::endl;
+ return 1;
+ }
+ } else {
+ ostringstream os;
+ bl.hexdump(os);
+ std::cout << os.str() << std::endl;
+ }
+
+ } else if (cmd == "crc") {
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") ";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << " crc " << bl.crc32c(0) << std::endl;
+
+ } else if (cmd == "get-size") {
+ std::cout << "estimated store size: " << st.get_size() << std::endl;
+
+ if (argc < 5)
+ return 0;
+
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ if (!exists) {
+ std::cerr << "(" << url_escape(prefix) << "," << url_escape(key)
+ << ") does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << "(" << url_escape(prefix) << "," << url_escape(key)
+ << ") size " << byte_u_t(bl.length()) << std::endl;
+
+ } else if (cmd == "set") {
+ if (argc < 8) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+ string subcmd(argv[6]);
+
+ bufferlist val;
+ string errstr;
+ if (subcmd == "ver") {
+ version_t v = (version_t) strict_strtoll(argv[7], 10, &errstr);
+ if (!errstr.empty()) {
+ std::cerr << "error reading version: " << errstr << std::endl;
+ return 1;
+ }
+ encode(v, val);
+ } else if (subcmd == "in") {
+ int ret = val.read_file(argv[7], &errstr);
+ if (ret < 0 || !errstr.empty()) {
+ std::cerr << "error reading file: " << errstr << std::endl;
+ return 1;
+ }
+ } else {
+ std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool ret = st.set(prefix, key, val);
+ if (!ret) {
+ std::cerr << "error setting ("
+ << url_escape(prefix) << "," << url_escape(key) << ")" << std::endl;
+ return 1;
+ }
+ } else if (cmd == "rm") {
+ if (argc < 6) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string key(url_unescape(argv[5]));
+
+ bool ret = st.rm(prefix, key);
+ if (!ret) {
+ std::cerr << "error removing ("
+ << url_escape(prefix) << "," << url_escape(key) << ")"
+ << std::endl;
+ return 1;
+ }
+ } else if (cmd == "rm-prefix") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+
+ bool ret = st.rm_prefix(prefix);
+ if (!ret) {
+ std::cerr << "error removing prefix ("
+ << url_escape(prefix) << ")"
+ << std::endl;
+ return 1;
+ }
+ } else if (cmd == "store-copy") {
+ int num_keys_per_tx = 128; // magic number that just feels right.
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ } else if (argc > 5) {
+ string err;
+ num_keys_per_tx = strict_strtol(argv[5], 10, &err);
+ if (!err.empty()) {
+ std::cerr << "invalid num_keys_per_tx: " << err << std::endl;
+ return 1;
+ }
+ }
+ string other_store_type = argv[1];
+ if (argc > 6) {
+ other_store_type = argv[6];
+ }
+
+ int ret = st.copy_store_to(argv[1], argv[4], num_keys_per_tx, other_store_type);
+ if (ret < 0) {
+ std::cerr << "error copying store to path '" << argv[4]
+ << "': " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ } else if (cmd == "store-crc") {
+    if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ std::ofstream fs(argv[4]);
+ uint32_t crc = st.traverse(string(), true, false, &fs);
+ std::cout << "store at '" << argv[4] << "' crc " << crc << std::endl;
+
+ } else if (cmd == "compact") {
+ st.compact();
+ } else if (cmd == "compact-prefix") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ st.compact_prefix(prefix);
+ } else if (cmd == "compact-range") {
+ if (argc < 7) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(url_unescape(argv[4]));
+ string start(url_unescape(argv[5]));
+ string end(url_unescape(argv[6]));
+ st.compact_range(prefix, start, end);
+ } else if (cmd == "stats") {
+ st.print_stats();
+ } else {
+ std::cerr << "Unrecognized command: " << cmd << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc
new file mode 100644
index 00000000..9ff08f32
--- /dev/null
+++ b/src/tools/ceph_monstore_tool.cc
@@ -0,0 +1,1297 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/scope_exit.hpp>
+
+#include <stdlib.h>
+#include <string>
+
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "auth/KeyRing.h"
+#include "auth/cephx/CephxKeyServer.h"
+#include "global/global_init.h"
+#include "include/stringify.h"
+#include "mgr/mgr_commands.h"
+#include "mon/AuthMonitor.h"
+#include "mon/MonitorDBStore.h"
+#include "mon/Paxos.h"
+#include "mon/MonMap.h"
+#include "mds/FSMap.h"
+#include "mon/MgrMap.h"
+#include "osd/OSDMap.h"
+#include "crush/CrushCompiler.h"
+#include "mon/CreatingPGs.h"
+
+namespace po = boost::program_options;
+
+class TraceIter {
+ int fd;
+ unsigned idx;
+ MonitorDBStore::TransactionRef t;
+public:
+ explicit TraceIter(string fname) : fd(-1), idx(-1) {
+ fd = ::open(fname.c_str(), O_RDONLY);
+ t.reset(new MonitorDBStore::Transaction);
+ }
+ bool valid() {
+ return fd != -1;
+ }
+ MonitorDBStore::TransactionRef cur() {
+ ceph_assert(valid());
+ return t;
+ }
+ unsigned num() { return idx; }
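+  // Each record in the trace file is framed as a 6-byte header -- u8 ver,
+  // u8 ver2, u32 len -- followed by len bytes holding an encoded
+  // MonitorDBStore::Transaction; next() reads exactly one such record.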
+ void next() {
+ ++idx;
+ bufferlist bl;
+ int r = bl.read_fd(fd, 6);
+ if (r < 0) {
+ std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
+ << std::endl;
+ ::close(fd);
+ fd = -1;
+ return;
+ } else if ((unsigned)r < 6) {
+ std::cerr << "short read" << std::endl;
+ ::close(fd);
+ fd = -1;
+ return;
+ }
+ auto bliter = bl.cbegin();
+ uint8_t ver, ver2;
+ decode(ver, bliter);
+ decode(ver2, bliter);
+ uint32_t len;
+ decode(len, bliter);
+ r = bl.read_fd(fd, len);
+ if (r < 0) {
+ std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd"
+ << std::endl;
+ ::close(fd);
+ fd = -1;
+ return;
+ } else if ((unsigned)r < len) {
+ std::cerr << "short read" << std::endl;
+ ::close(fd);
+ fd = -1;
+ return;
+ }
+ bliter = bl.cbegin();
+ t.reset(new MonitorDBStore::Transaction);
+ t->decode(bliter);
+ }
+ void init() {
+ next();
+ }
+ ~TraceIter() {
+ if (fd != -1) {
+ ::close(fd);
+ fd = -1;
+ }
+ }
+};
+
+
+int parse_cmd_args(
+ po::options_description *desc, /// < visible options description
+ po::options_description *hidden_desc, /// < hidden options description
+ po::positional_options_description *positional, /// < positional args
+ vector<string> &cmd_args, /// < arguments to be parsed
+ po::variables_map *vm /// > post-parsing variable map
+ )
+{
+ // desc_all will aggregate all visible and hidden options for parsing.
+ //
+ // From boost's program_options point of view, there is absolutely no
+ // distinction between 'desc' and 'hidden_desc'. This is a distinction
+ // that is only useful to us: 'desc' is whatever we are willing to show
+ // on 'usage()', whereas 'hidden_desc' refers to parameters we wish to
+ // take advantage of but do not wish to show on 'usage()'.
+ //
+ // For example, consider that program_options matches positional arguments
+  // (specified via 'positional') against the parameters defined on a
+ // given 'po::options_description' class. This is performed below,
+ // supplying both the description and the positional arguments to the
+ // parser. However, we do not want the parameters that are mapped to
+ // positional arguments to be shown on usage, as that makes for ugly and
+ // confusing usage messages. Therefore we dissociate the options'
+ // description that is to be used as an aid to the user from those options
+ // that are nothing but useful for internal purposes (i.e., mapping options
+ // to positional arguments). We still need to aggregate them before parsing
+ // and that's what 'desc_all' is all about.
+ //
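+  // A hypothetical illustration: to accept trailing positional arguments
+  // without listing the catch-all parameter in usage(), a caller could
+  // register it only in the hidden description and map the positionals
+  // onto it:
+  //
+  //   po::options_description visible, hidden;
+  //   hidden.add_options()
+  //     ("args", po::value<vector<string>>()->multitoken(), "trailing args");
+  //   po::positional_options_description pos;
+  //   pos.add("args", -1);
+  //   po::variables_map vm;
+  //   parse_cmd_args(&visible, &hidden, &pos, cmd_args, &vm);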
+
+ ceph_assert(desc != NULL);
+
+ po::options_description desc_all;
+ desc_all.add(*desc);
+ if (hidden_desc != NULL)
+ desc_all.add(*hidden_desc);
+
+ try {
+ po::command_line_parser parser = po::command_line_parser(cmd_args).
+ options(desc_all);
+
+ if (positional) {
+ parser = parser.positional(*positional);
+ }
+
+ po::parsed_options parsed = parser.run();
+ po::store(parsed, *vm);
+ po::notify(*vm);
+ } catch (po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+
+/**
+ * usage: ceph-monstore-tool <store-path> <command> [options]
+ *
+ * commands:
+ *
+ * store-copy < --out arg >
+ * dump-keys
+ * compact
+ * getmonmap < --out arg [ --version arg ] >
+ * getosdmap < --out arg [ --version arg ] >
+ * dump-paxos <--dump-start VER> <--dump-end VER>
+ * dump-trace < --trace-file arg >
+ * replay-trace
+ * random-gen
+ * rewrite-crush
+ *
+ * wanted syntax:
+ *
+ * ceph-monstore-tool PATH CMD [options]
+ *
+ * ceph-monstore-tool PATH store-copy <PATH2 | -o PATH2>
+ * ceph-monstore-tool PATH dump-keys
+ * ceph-monstore-tool PATH compact
+ * ceph-monstore-tool PATH get monmap [VER]
+ * ceph-monstore-tool PATH get osdmap [VER]
+ * ceph-monstore-tool PATH dump-paxos STARTVER ENDVER
+ *
+ *
+ */
+void usage(const char *n, po::options_description &d)
+{
+ std::cerr <<
+ "usage: " << n << " <store-path> <cmd> [args|options]\n"
+ << "\n"
+ << "Commands:\n"
+ << " store-copy PATH copies store to PATH\n"
+ << " compact compacts the store\n"
+ << " get monmap [-- options] get monmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get osdmap [-- options] get osdmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get mdsmap [-- options] get mdsmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get mgr [-- options] get mgr map (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " get crushmap [-- options] get crushmap (version VER if specified)\n"
+ << " (default: last committed)\n"
+ << " show-versions [-- options] show the first&last committed version of map\n"
+ << " (show-versions -- --help for more info)\n"
+ << " dump-keys dumps store keys to FILE\n"
+ << " (default: stdout)\n"
+ << " dump-paxos [-- options] dump paxos transactions\n"
+ << " (dump-paxos -- --help for more info)\n"
+ << " dump-trace FILE [-- options] dump contents of trace file FILE\n"
+ << " (dump-trace -- --help for more info)\n"
+ << " replay-trace FILE [-- options] replay trace from FILE\n"
+ << " (replay-trace -- --help for more info)\n"
+ << " random-gen [-- options] add randomly generated ops to the store\n"
+ << " (random-gen -- --help for more info)\n"
+ << " rewrite-crush [-- options] add a rewrite commit to the store\n"
+ << " (rewrite-crush -- --help for more info)\n"
+ << " rebuild rebuild store\n"
+ << " (rebuild -- --help for more info)\n"
+ << std::endl;
+ std::cerr << d << std::endl;
+ std::cerr
+ << "\nPlease Note:\n"
+ << "* Ceph-specific options should be in the format --option-name=VAL\n"
+ << " (specifically, do not forget the '='!!)\n"
+ << "* Command-specific options need to be passed after a '--'\n"
+ << " e.g., 'get monmap -- --version 10 --out /tmp/foo'"
+ << std::endl;
+}
+
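+// Re-encode both the full and the incremental osdmap of version 'ver' with
+// the supplied crush map and queue the results in transaction 't'. With
+// copy=true the current full map is duplicated as a brand-new epoch instead
+// of rewriting an existing one.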
+int update_osdmap(MonitorDBStore& store, version_t ver, bool copy,
+ std::shared_ptr<CrushWrapper> crush,
+ MonitorDBStore::Transaction* t) {
+ const string prefix("osdmap");
+
+ // full
+ bufferlist bl;
+ int r = 0;
+ r = store.get(prefix, store.combine_strings("full", ver), bl);
+ if (r) {
+ std::cerr << "Error getting full map: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ osdmap.crush = crush;
+ if (copy) {
+ osdmap.inc_epoch();
+    f->dump_string("Total objects", stringify(total_objects));
+    f->dump_string("Examined objects", stringify(examined_objects));
+    f->dump_string("Fixed objects", stringify(fixed_objects));
+ osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+ t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl);
+
+ // incremental
+ OSDMap::Incremental inc;
+ if (copy) {
+ inc.epoch = osdmap.get_epoch();
+ inc.fsid = osdmap.get_fsid();
+ } else {
+ bl.clear();
+ r = store.get(prefix, ver, bl);
+ if (r) {
+ std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ OSDMap::Incremental inc(bl);
+ if (inc.crush.length()) {
+ inc.crush.clear();
+ crush->encode(inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+ if (inc.fullmap.length()) {
+ OSDMap fullmap;
+ fullmap.decode(inc.fullmap);
+ fullmap.crush = crush;
+ inc.fullmap.clear();
+ fullmap.encode(inc.fullmap);
+ }
+ }
+ ceph_assert(osdmap.have_crc());
+ inc.full_crc = osdmap.get_crc();
+ bl.clear();
+ // be consistent with OSDMonitor::update_from_paxos()
+ inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED);
+ t->put(prefix, inc.epoch, bl);
+ return 0;
+}
+
+int rewrite_transaction(MonitorDBStore& store, int version,
+ const string& crush_file,
+ MonitorDBStore::Transaction* t) {
+ const string prefix("osdmap");
+
+ // calc the known-good epoch
+ version_t last_committed = store.get(prefix, "last_committed");
+ version_t good_version = 0;
+ if (version <= 0) {
+ if (last_committed >= (unsigned)-version) {
+ good_version = last_committed + version;
+ } else {
+ std::cerr << "osdmap-version is less than: -" << last_committed << std::endl;
+ return EINVAL;
+ }
+ } else {
+ good_version = version;
+ }
+ if (good_version >= last_committed) {
+ std::cout << "good epoch is greater or equal to the last committed one: "
+ << good_version << " >= " << last_committed << std::endl;
+ return 0;
+ }
+
+ // load/extract the crush map
+ int r = 0;
+ std::shared_ptr<CrushWrapper> crush(new CrushWrapper);
+ if (crush_file.empty()) {
+ bufferlist bl;
+ r = store.get(prefix, store.combine_strings("full", good_version), bl);
+ if (r) {
+ std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ crush = osdmap.crush;
+ } else {
+ string err;
+ bufferlist bl;
+ r = bl.read_file(crush_file.c_str(), &err);
+ if (r) {
+ std::cerr << err << ": " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ auto p = bl.cbegin();
+ crush->decode(p);
+ }
+
+ // prepare a transaction to rewrite the epochs
+ // (good_version, last_committed]
+ // with the good crush map.
+ // XXX: may need to break this into several paxos versions?
+ ceph_assert(good_version < last_committed);
+ for (version_t v = good_version + 1; v <= last_committed; v++) {
+ cout << "rewriting epoch #" << v << "/" << last_committed << std::endl;
+ r = update_osdmap(store, v, false, crush, t);
+ if (r)
+ return r;
+ }
+
+ // add a new osdmap epoch to store, so monitors will update their current osdmap
+ // in addition to the ones stored in epochs.
+ //
+ // This is needed due to the way the monitor updates from paxos and the
+ // facilities we are leveraging to push this update to the rest of the
+ // quorum.
+ //
+ // In a nutshell, we are generating a good version of the osdmap, with a
+ // proper crush, and building a transaction that will replace the bad
+ // osdmaps with good osdmaps. But this transaction needs to be applied on
+ // all nodes, so that the monitors will have good osdmaps to share with
+ // clients. We thus leverage Paxos, specifically the recovery mechanism, by
+ // creating a pending value that will be committed once the monitors form an
+ // initial quorum after being brought back to life.
+ //
+ // However, the way the monitor works has the paxos services, including the
+  // OSDMonitor, updating their state from disk *before* the recovery phase
+  // begins (so they have an up-to-date state in memory). This means the
+ // OSDMonitor will see the old, broken map, before the new paxos version is
+ // applied to disk, and the old version is cached. Even though we have the
+ // good map now, and we share the good map with clients, we will still be
+ // working on the old broken map. Instead of mucking around the monitor to
+ // make this work, we instead opt for adding the same osdmap but with a
+ // newer version, so that the OSDMonitor picks up on it when it updates from
+ // paxos after the proposal has been committed. This is not elegant, but
+ // avoids further unpleasantness that would arise from kludging around the
+  // current behavior. It also has the added benefit of making sure the clients
+ // get an updated version of the map (because last_committed+1 >
+ // last_committed) :)
+ //
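+  // For example (illustrative numbers): with good_version=7 and
+  // last_committed=10, epochs 8..10 were rewritten above and the call below
+  // adds the same good map again as epoch 11, which becomes the new
+  // last_committed.
+  //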
+ cout << "adding a new epoch #" << last_committed+1 << std::endl;
+ r = update_osdmap(store, last_committed++, true, crush, t);
+ if (r)
+ return r;
+ t->put(prefix, store.combine_strings("full", "latest"), last_committed);
+ t->put(prefix, "last_committed", last_committed);
+ return 0;
+}
+
+/**
+ * create a new paxos version which carries a proposal to rewrite all epochs
+ * of incremental and full map of "osdmap" after a faulty crush map is injected.
+ * so the leader will trigger a recovery and propagate this fix to its peons,
+ * after the proposal is accepted, and the transaction in it is applied. all
+ * monitors will rewrite the bad crush map with the good one, and have a new
+ * osdmap epoch with the good crush map in it.
+ */
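+// Example invocation (illustrative; the store path and crush file are
+// assumptions):
+//   ceph-monstore-tool /var/lib/ceph/mon/ceph-a rewrite-crush -- \
+//     --crush /tmp/crushmap.good --good-epoch -2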
+int rewrite_crush(const char* progname,
+ vector<string>& subcmds,
+ MonitorDBStore& store) {
+ po::options_description op_desc("Allowed 'rewrite-crush' options");
+ int version = -1;
+ string crush_file;
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("crush", po::value<string>(&crush_file),
+ ("path to the crush map file "
+ "(default: will instead extract it from the known-good osdmap)"))
+ ("good-epoch", po::value<int>(&version),
+ "known-good epoch of osdmap, if a negative number '-N' is given, the "
+ "$last_committed-N is used instead (default: -1). "
+     "Please note, -1 is not necessarily a good epoch, because there is a "
+     "good chance that more epochs slipped into the monstore after the one "
+     "where the crush map was first injected.")
+ ;
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm);
+ if (r) {
+ return -r;
+ }
+ if (op_vm.count("help")) {
+ usage(progname, op_desc);
+ return 0;
+ }
+
+ MonitorDBStore::Transaction rewrite_txn;
+ r = rewrite_transaction(store, version, crush_file, &rewrite_txn);
+ if (r) {
+ return r;
+ }
+
+ // store the transaction into store as a proposal
+ const string prefix("paxos");
+ version_t pending_v = store.get(prefix, "last_committed") + 1;
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ bufferlist bl;
+ rewrite_txn.encode(bl);
+ cout << "adding pending commit " << pending_v
+ << " " << bl.length() << " bytes" << std::endl;
+ t->put(prefix, pending_v, bl);
+ t->put(prefix, "pending_v", pending_v);
+ // a large enough yet unique proposal number will probably do the trick
+ version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1;
+ t->put(prefix, "pending_pn", pending_pn);
+ store.apply_transaction(t);
+ return 0;
+}
+
+static int update_auth(MonitorDBStore& st, const string& keyring_path)
+{
+ // import all keyrings stored in the keyring file
+ KeyRing keyring;
+ int r = keyring.load(g_ceph_context, keyring_path);
+ if (r < 0) {
+ cerr << "unable to load admin keyring: " << keyring_path << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ __u8 v = 1;
+ encode(v, bl);
+
+ for (const auto& k : keyring.get_keys()) {
+ KeyServerData::Incremental auth_inc;
+ auth_inc.name = k.first;
+ auth_inc.auth = k.second;
+ if (auth_inc.auth.caps.empty()) {
+ cerr << "no caps granted to: " << auth_inc.name << std::endl;
+ return -EINVAL;
+ }
+ auth_inc.op = KeyServerData::AUTH_INC_ADD;
+
+ AuthMonitor::Incremental inc;
+ inc.inc_type = AuthMonitor::AUTH_DATA;
+ encode(auth_inc, inc.auth_data);
+ inc.auth_type = CEPH_AUTH_CEPHX;
+
+ inc.encode(bl, CEPH_FEATURES_ALL);
+ }
+
+ const string prefix("auth");
+ auto last_committed = st.get(prefix, "last_committed") + 1;
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, last_committed, bl);
+ t->put(prefix, "last_committed", last_committed);
+ auto first_committed = st.get(prefix, "first_committed");
+ if (!first_committed) {
+ t->put(prefix, "first_committed", last_committed);
+ }
+ st.apply_transaction(t);
+ return 0;
+}
+
+static int update_mkfs(MonitorDBStore& st,
+ const string& monmap_path,
+ const vector<string>& mon_ids)
+{
+ MonMap monmap;
+ if (!monmap_path.empty()) {
+ cout << __func__ << " pulling initial monmap from " << monmap_path << std::endl;
+ bufferlist bl;
+ string err;
+ int r = bl.read_file(monmap_path.c_str(), &err);
+ if (r < 0) {
+ cerr << "failed to read monmap from " << monmap_path << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ monmap.decode(bl);
+ } else {
+ cout << __func__ << " generating seed initial monmap" << std::endl;
+ int r = monmap.build_initial(g_ceph_context, true, cerr);
+ if (r) {
+ cerr << "no initial monitors" << std::endl;
+ return -EINVAL;
+ }
+ vector<string> new_names;
+ if (!mon_ids.empty()) {
+ if (mon_ids.size() != monmap.size()) {
+ cerr << "Please pass the same number of <mon-ids> to name the hosts "
+ << "listed in 'mon_host'. "
+ << mon_ids.size() << " mon-id(s) specified, "
+ << "while you have " << monmap.size() << " mon hosts." << std::endl;
+ return -EINVAL;
+ }
+ new_names = mon_ids;
+ } else {
+ for (unsigned rank = 0; rank < monmap.size(); rank++) {
+ string new_name{"a"};
+ new_name[0] += rank;
+ new_names.push_back(std::move(new_name));
+ }
+ }
+ for (unsigned rank = 0; rank < monmap.size(); rank++) {
+ auto name = monmap.get_name(rank);
+ if (name.compare(0, 7, "noname-") == 0) {
+ monmap.rename(name, new_names[rank]);
+ }
+ }
+ }
+ monmap.print(cout);
+ bufferlist bl;
+ monmap.encode(bl, CEPH_FEATURES_ALL);
+ monmap.set_epoch(0);
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put("mkfs", "monmap", bl);
+ st.apply_transaction(t);
+ return 0;
+}
+
+static int update_monitor(MonitorDBStore& st)
+{
+ const string prefix("monitor");
+ // a stripped-down Monitor::mkfs()
+ bufferlist bl;
+ bl.append(CEPH_MON_ONDISK_MAGIC "\n");
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, "magic", bl);
+ st.apply_transaction(t);
+ return 0;
+}
+
+// rebuild
+// - creating_pgs
+static int update_creating_pgs(MonitorDBStore& st)
+{
+ bufferlist bl;
+ auto last_osdmap_epoch = st.get("osdmap", "last_committed");
+ int r = st.get("osdmap", st.combine_strings("full", last_osdmap_epoch), bl);
+ if (r < 0) {
+    cerr << "unable to load osdmap e" << last_osdmap_epoch << std::endl;
+ return r;
+ }
+
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ creating_pgs_t creating;
+ for (auto& i : osdmap.get_pools()) {
+ creating.created_pools.insert(i.first);
+ }
+ creating.last_scan_epoch = last_osdmap_epoch;
+
+ bufferlist newbl;
+ ::encode(creating, newbl);
+
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put("osd_pg_creating", "creating", newbl);
+ st.apply_transaction(t);
+ return 0;
+}
+
+// rebuild
+// - mgr
+// - mgr_command_desc
+static int update_mgrmap(MonitorDBStore& st)
+{
+ auto t = make_shared<MonitorDBStore::Transaction>();
+
+ {
+ MgrMap map;
+ // mgr expects epoch > 1
+ map.epoch++;
+ auto initial_modules =
+ get_str_vec(g_ceph_context->_conf.get_val<string>("mgr_initial_modules"));
+ copy(begin(initial_modules),
+ end(initial_modules),
+ inserter(map.modules, end(map.modules)));
+ bufferlist bl;
+ map.encode(bl, CEPH_FEATURES_ALL);
+ t->put("mgr", map.epoch, bl);
+ t->put("mgr", "last_committed", map.epoch);
+ }
+ {
+ auto mgr_command_descs = mgr_commands;
+ for (auto& c : mgr_command_descs) {
+ c.set_flag(MonCommand::FLAG_MGR);
+ }
+ bufferlist bl;
+ encode(mgr_command_descs, bl);
+ t->put("mgr_command_descs", "", bl);
+ }
+ return st.apply_transaction(t);
+}
+
+static int update_paxos(MonitorDBStore& st)
+{
+ // build a pending paxos proposal from all non-permanent k/v pairs. once the
+  // proposal is committed, it will get applied. on the sync provider side, it
+ // will be a no-op, but on its peers, the paxos commit will help to build up
+ // the necessary epochs.
+ bufferlist pending_proposal;
+ {
+ MonitorDBStore::Transaction t;
+ vector<string> prefixes = {"auth", "osdmap",
+                               "mgr", "mgr_command_descs"};
+ for (const auto& prefix : prefixes) {
+ for (auto i = st.get_iterator(prefix); i->valid(); i->next()) {
+ auto key = i->raw_key();
+ auto val = i->value();
+ t.put(key.first, key.second, val);
+ }
+ }
+ t.encode(pending_proposal);
+ }
+ const string prefix("paxos");
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, "first_committed", 0);
+ t->put(prefix, "last_committed", 0);
+ auto pending_v = 1;
+ t->put(prefix, pending_v, pending_proposal);
+ t->put(prefix, "pending_v", pending_v);
+ t->put(prefix, "pending_pn", 400);
+ st.apply_transaction(t);
+ return 0;
+}
+
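+// Rebuild a usable mon store from the data already at hand: import the admin
+// keyring (if given), regenerate creating_pgs and the mgr maps, wrap the
+// result into a pending paxos proposal, and finally write the mkfs monmap
+// and the "monitor" magic so a monitor can start from this store.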
+int rebuild_monstore(const char* progname,
+ vector<string>& subcmds,
+ MonitorDBStore& st)
+{
+ po::options_description op_desc("Allowed 'rebuild' options");
+ string keyring_path;
+ string monmap_path;
+ vector<string> mon_ids;
+ op_desc.add_options()
+ ("keyring", po::value<string>(&keyring_path),
+ "path to the client.admin key")
+ ("monmap", po::value<string>(&monmap_path),
+ "path to the initial monmap")
+ ("mon-ids", po::value<vector<string>>(&mon_ids)->multitoken(),
+ "mon ids, use 'a', 'b', ... if not specified");
+ po::positional_options_description pos_desc;
+ pos_desc.add("mon-ids", -1);
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, nullptr, &pos_desc, subcmds, &op_vm);
+ if (r) {
+ return -r;
+ }
+ if (op_vm.count("help")) {
+ usage(progname, op_desc);
+ return 0;
+ }
+ if (!keyring_path.empty())
+ update_auth(st, keyring_path);
+ if ((r = update_creating_pgs(st))) {
+ return r;
+ }
+ if ((r = update_mgrmap(st))) {
+ return r;
+ }
+ if ((r = update_paxos(st))) {
+ return r;
+ }
+ if ((r = update_mkfs(st, monmap_path, mon_ids))) {
+ return r;
+ }
+ if ((r = update_monitor(st))) {
+ return r;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int err = 0;
+ po::options_description desc("Allowed options");
+ string store_path, cmd;
+ vector<string> subcmds;
+ desc.add_options()
+ ("help,h", "produce help message")
+ ;
+
+ /* Dear Future Developer:
+ *
+ * for further improvement, should you need to pass specific options to
+ * a command (e.g., get osdmap VER --hex), you can expand the current
+ * format by creating additional 'po::option_description' and passing
+ * 'subcmds' to 'po::command_line_parser', much like what is currently
+ * done by default. However, beware: in order to differentiate a
+ * command-specific option from the generic/global options, you will need
+ * to pass '--' in the command line (so that the first parser, the one
+ * below, assumes it has reached the end of all options); e.g.,
+ * 'get osdmap VER -- --hex'. Not pretty; far from intuitive; it was as
+ * far as I got with this library. Improvements on this format will be
+   * left as an exercise for the reader. -Joao
+ */
+ po::options_description positional_desc("Positional argument options");
+ positional_desc.add_options()
+ ("store-path", po::value<string>(&store_path),
+ "path to monitor's store")
+ ("command", po::value<string>(&cmd),
+ "Command")
+ ("subcmd", po::value<vector<string> >(&subcmds),
+ "Command arguments/Sub-Commands")
+ ;
+ po::positional_options_description positional;
+ positional.add("store-path", 1);
+ positional.add("command", 1);
+ positional.add("subcmd", -1);
+
+ po::options_description all_desc("All options");
+ all_desc.add(desc).add(positional_desc);
+
+ vector<string> ceph_option_strings;
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).
+ options(all_desc).
+ positional(positional).
+ allow_unregistered().run();
+
+ po::store(
+ parsed,
+ vm);
+ po::notify(vm);
+
+  // Specifying po::include_positional would cause our positional arguments
+  // to be collected as well (thus becoming part of ceph_option_strings and
+  // eventually being passed on to global_init() below).
+  // Instead we specify po::exclude_positional, which has the upside of
+  // completely avoiding this, but the downside of having to specify ceph
+  // options as --VAR=VAL (note the '='); otherwise we would capture the
+  // positional 'VAL' as belonging to us, and it would never be collected.
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::exclude_positional);
+
+ } catch(po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
+ }
+
+ // parse command structure before calling global_init() and friends.
+
+ if (vm.empty() || vm.count("help") ||
+ store_path.empty() || cmd.empty() ||
+ *cmd.begin() == '-') {
+ usage(argv[0], desc);
+ return 1;
+ }
+
+ vector<const char *> ceph_options;
+ ceph_options.reserve(ceph_option_strings.size());
+ for (vector<string>::iterator i = ceph_option_strings.begin();
+ i != ceph_option_strings.end();
+ ++i) {
+ ceph_options.push_back(i->c_str());
+ }
+
+ auto cct = global_init(
+ NULL, ceph_options, CEPH_ENTITY_TYPE_MON,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+ common_init_finish(g_ceph_context);
+ cct->_conf.apply_changes(nullptr);
+
+  // open the monitor store at the given path; this is what the commands
+  // below read from and, for some of them, write to.
+ MonitorDBStore st(store_path);
+ if (store_path.size()) {
+ stringstream ss;
+ int r = st.open(ss);
+ if (r < 0) {
+ std::cerr << ss.str() << std::endl;
+ return EINVAL;
+ }
+ }
+
+ if (cmd == "dump-keys") {
+ KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
+ while (iter->valid()) {
+ pair<string,string> key(iter->raw_key());
+ cout << key.first << " / " << key.second << std::endl;
+ iter->next();
+ }
+ } else if (cmd == "compact") {
+ st.compact();
+ } else if (cmd == "get") {
+ unsigned v = 0;
+ string outpath;
+ bool readable = false;
+ string map_type;
+ // visible options for this command
+ po::options_description op_desc("Allowed 'get' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("out,o", po::value<string>(&outpath),
+ "output file (default: stdout)")
+ ("version,v", po::value<unsigned>(&v),
+ "map version to obtain")
+ ("readable,r", po::value<bool>(&readable)->default_value(false),
+ "print the map information in human readable format")
+ ;
+ // this is going to be a positional argument; we don't want to show
+ // it as an option during --help, but we do want to have it captured
+ // when parsing.
+ po::options_description hidden_op_desc("Hidden 'get' options");
+ hidden_op_desc.add_options()
+ ("map-type", po::value<string>(&map_type),
+ "map-type")
+ ;
+ po::positional_options_description op_positional;
+ op_positional.add("map-type", 1);
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help") || map_type.empty()) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (v == 0) {
+ if (map_type == "crushmap") {
+ v = st.get("osdmap", "last_committed");
+ } else {
+ v = st.get(map_type, "last_committed");
+ }
+ }
+
+ int fd = STDOUT_FILENO;
+ if (!outpath.empty()){
+ fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
+ if (fd < 0) {
+ std::cerr << "error opening output file: "
+ << cpp_strerror(errno) << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+ }
+
+ BOOST_SCOPE_EXIT((&r) (&fd) (&outpath)) {
+ ::close(fd);
+ if (r < 0 && fd != STDOUT_FILENO) {
+ ::remove(outpath.c_str());
+ }
+ } BOOST_SCOPE_EXIT_END
+
+ bufferlist bl;
+ r = 0;
+ if (map_type == "osdmap") {
+ r = st.get(map_type, st.combine_strings("full", v), bl);
+ } else if (map_type == "crushmap") {
+ bufferlist tmp;
+ r = st.get("osdmap", st.combine_strings("full", v), tmp);
+ if (r >= 0) {
+ OSDMap osdmap;
+ osdmap.decode(tmp);
+ osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+ } else {
+ r = st.get(map_type, v, bl);
+ }
+ if (r < 0) {
+ std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ if (readable) {
+ stringstream ss;
+ bufferlist out;
+ try {
+ if (map_type == "monmap") {
+ MonMap monmap;
+ monmap.decode(bl);
+ monmap.print(ss);
+ } else if (map_type == "osdmap") {
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ osdmap.print(ss);
+ } else if (map_type == "mdsmap") {
+ FSMap fs_map;
+ fs_map.decode(bl);
+ fs_map.print(ss);
+ } else if (map_type == "mgr") {
+ MgrMap mgr_map;
+ auto p = bl.cbegin();
+ mgr_map.decode(p);
+ JSONFormatter f;
+ f.dump_object("mgrmap", mgr_map);
+ f.flush(ss);
+ } else if (map_type == "crushmap") {
+ CrushWrapper cw;
+ auto it = bl.cbegin();
+ cw.decode(it);
+ CrushCompiler cc(cw, std::cerr, 0);
+ cc.decompile(ss);
+      } else {
+        std::cerr << "This type of readable map does not exist: " << map_type
+                  << std::endl << "You can only specify [osdmap|monmap|mdsmap"
+                  "|crushmap|mgr]" << std::endl;
+      }
+ } catch (const buffer::error &err) {
+ std::cerr << "Could not decode for human readable output (you may still"
+ " use non-readable mode). Detail: " << err << std::endl;
+ }
+
+ out.append(ss);
+ out.write_fd(fd);
+ } else {
+ bl.write_fd(fd);
+ }
+
+ if (!outpath.empty()) {
+ std::cout << "wrote " << map_type
+ << " version " << v << " to " << outpath
+ << std::endl;
+ }
+ } else if (cmd == "show-versions") {
+    string map_type; // map type: osdmap, monmap, ...
+ // visible options for this command
+ po::options_description op_desc("Allowed 'show-versions' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("map-type", po::value<string>(&map_type), "map_type");
+
+ po::positional_options_description op_positional;
+ op_positional.add("map-type", 1);
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, NULL, &op_positional,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help") || map_type.empty()) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ unsigned int v_first = 0;
+ unsigned int v_last = 0;
+ v_first = st.get(map_type, "first_committed");
+ v_last = st.get(map_type, "last_committed");
+
+ std::cout << "first committed:\t" << v_first << "\n"
+ << "last committed:\t" << v_last << std::endl;
+ } else if (cmd == "dump-paxos") {
+ unsigned dstart = 0;
+ unsigned dstop = ~0;
+ po::options_description op_desc("Allowed 'dump-paxos' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("start,s", po::value<unsigned>(&dstart),
+ "starting version (default: 0)")
+ ("end,e", po::value<unsigned>(&dstop),
+ "finish version (default: ~0)")
+ ;
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, NULL, NULL,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (dstart > dstop) {
+ std::cerr << "error: 'start' version (value: " << dstart << ") "
+ << " is greater than 'end' version (value: " << dstop << ")"
+ << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ version_t v = dstart;
+ for (; v <= dstop; ++v) {
+ bufferlist bl;
+ st.get("paxos", v, bl);
+ if (bl.length() == 0)
+ break;
+ cout << "\n--- " << v << " ---" << std::endl;
+ auto tx(std::make_shared<MonitorDBStore::Transaction>());
+ Paxos::decode_append_transaction(tx, bl);
+ JSONFormatter f(true);
+ tx->dump(&f);
+ f.flush(cout);
+ }
+
+ std::cout << "dumped " << v << " paxos versions" << std::endl;
+
+ } else if (cmd == "dump-trace") {
+ unsigned dstart = 0;
+ unsigned dstop = ~0;
+ string outpath;
+
+ // visible options for this command
+ po::options_description op_desc("Allowed 'dump-trace' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("start,s", po::value<unsigned>(&dstart),
+ "starting version (default: 0)")
+ ("end,e", po::value<unsigned>(&dstop),
+ "finish version (default: ~0)")
+ ;
+ // this is going to be a positional argument; we don't want to show
+ // it as an option during --help, but we do want to have it captured
+ // when parsing.
+ po::options_description hidden_op_desc("Hidden 'dump-trace' options");
+ hidden_op_desc.add_options()
+ ("out,o", po::value<string>(&outpath),
+ "file to write the dump to")
+ ;
+ po::positional_options_description op_positional;
+ op_positional.add("out", 1);
+
+ po::variables_map op_vm;
+ int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional,
+ subcmds, &op_vm);
+ if (r < 0) {
+ err = -r;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (outpath.empty()) {
+ usage(argv[0], op_desc);
+ err = EINVAL;
+ goto done;
+ }
+
+ if (dstart > dstop) {
+ std::cerr << "error: 'start' version (value: " << dstart << ") "
+ << " is greater than 'stop' version (value: " << dstop << ")"
+ << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ TraceIter iter(outpath.c_str());
+ iter.init();
+ while (true) {
+ if (!iter.valid())
+ break;
+ if (iter.num() >= dstop) {
+ break;
+ }
+ if (iter.num() >= dstart) {
+ JSONFormatter f(true);
+ iter.cur()->dump(&f, false);
+ f.flush(std::cout);
+ std::cout << std::endl;
+ }
+ iter.next();
+ }
+ std::cerr << "Read up to transaction " << iter.num() << std::endl;
+ } else if (cmd == "replay-trace") {
+ string inpath;
+ unsigned num_replays = 1;
+ // visible options for this command
+ po::options_description op_desc("Allowed 'replay-trace' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("num-replays,n", po::value<unsigned>(&num_replays),
+ "finish version (default: 1)")
+ ;
+ // this is going to be a positional argument; we don't want to show
+ // it as an option during --help, but we do want to have it captured
+ // when parsing.
+ po::options_description hidden_op_desc("Hidden 'replay-trace' options");
+ hidden_op_desc.add_options()
+ ("in,i", po::value<string>(&inpath),
+ "file to write the dump to")
+ ;
+ po::positional_options_description op_positional;
+ op_positional.add("in", 1);
+
+ // op_desc_all will aggregate all visible and hidden options for parsing.
+ // when we call 'usage()' we just pass 'op_desc', as that's the description
+ // holding the visible options.
+ po::options_description op_desc_all;
+ op_desc_all.add(op_desc).add(hidden_op_desc);
+
+ po::variables_map op_vm;
+ try {
+ po::parsed_options op_parsed = po::command_line_parser(subcmds).
+ options(op_desc_all).positional(op_positional).run();
+ po::store(op_parsed, op_vm);
+ po::notify(op_vm);
+ } catch (po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ if (inpath.empty()) {
+ usage(argv[0], op_desc);
+ err = EINVAL;
+ goto done;
+ }
+
+ unsigned num = 0;
+ for (unsigned i = 0; i < num_replays; ++i) {
+ TraceIter iter(inpath.c_str());
+ iter.init();
+ while (true) {
+ if (!iter.valid())
+ break;
+ std::cerr << "Replaying trans num " << num << std::endl;
+ st.apply_transaction(iter.cur());
+ iter.next();
+ ++num;
+ }
+ std::cerr << "Read up to transaction " << iter.num() << std::endl;
+ }
+ } else if (cmd == "random-gen") {
+ unsigned tsize = 200;
+ unsigned tvalsize = 1024;
+ unsigned ntrans = 100;
+ po::options_description op_desc("Allowed 'random-gen' options");
+ op_desc.add_options()
+ ("help,h", "produce this help message")
+ ("num-keys,k", po::value<unsigned>(&tsize),
+ "keys to write in each transaction (default: 200)")
+ ("size,s", po::value<unsigned>(&tvalsize),
+ "size (in bytes) of the value to write in each key (default: 1024)")
+ ("ntrans,n", po::value<unsigned>(&ntrans),
+ "number of transactions to run (default: 100)")
+ ;
+
+ po::variables_map op_vm;
+ try {
+ po::parsed_options op_parsed = po::command_line_parser(subcmds).
+ options(op_desc).run();
+ po::store(op_parsed, op_vm);
+ po::notify(op_vm);
+ } catch (po::error &e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ err = EINVAL;
+ goto done;
+ }
+
+ if (op_vm.count("help")) {
+ usage(argv[0], op_desc);
+ err = 0;
+ goto done;
+ }
+
+ unsigned num = 0;
+ for (unsigned i = 0; i < ntrans; ++i) {
+ std::cerr << "Applying trans " << i << std::endl;
+ auto t(std::make_shared<MonitorDBStore::Transaction>());
+ string prefix;
+ prefix.push_back((i%26)+'a');
+ for (unsigned j = 0; j < tsize; ++j) {
+ stringstream os;
+ os << num;
+ bufferlist bl;
+ for (unsigned k = 0; k < tvalsize; ++k) bl.append(rand());
+ t->put(prefix, os.str(), bl);
+ ++num;
+ }
+ t->compact_prefix(prefix);
+ st.apply_transaction(t);
+ }
+ } else if (cmd == "store-copy") {
+ if (subcmds.size() < 1 || subcmds[0].empty()) {
+ usage(argv[0], desc);
+ err = EINVAL;
+ goto done;
+ }
+
+ string out_path = subcmds[0];
+
+ MonitorDBStore out_store(out_path);
+ {
+ stringstream ss;
+ int r = out_store.create_and_open(ss);
+ if (r < 0) {
+ std::cerr << ss.str() << std::endl;
+ goto done;
+ }
+ }
+
+
+ KeyValueDB::WholeSpaceIterator it = st.get_iterator();
+ uint64_t total_keys = 0;
+ uint64_t total_size = 0;
+ uint64_t total_tx = 0;
+
+ do {
+ uint64_t num_keys = 0;
+
+ auto tx(std::make_shared<MonitorDBStore::Transaction>());
+
+      while (it->valid() && num_keys < 128) {
+        pair<string,string> k = it->raw_key();
+        bufferlist v = it->value();
+        tx->put(k.first, k.second, v);
+
+        num_keys ++;
+        total_size += v.length();
+
+        it->next();
+      }
+
+      total_keys += num_keys;
+
+      if (!tx->empty()) {
+        out_store.apply_transaction(tx);
+        total_tx ++;
+      }
+
+ std::cout << "copied " << total_keys << " keys so far ("
+ << stringify(byte_u_t(total_size)) << ")" << std::endl;
+
+ } while (it->valid());
+ out_store.close();
+ std::cout << "summary: copied " << total_keys << " keys, using "
+ << total_tx << " transactions, totalling "
+ << stringify(byte_u_t(total_size)) << std::endl;
+ std::cout << "from '" << store_path << "' to '" << out_path << "'"
+ << std::endl;
+ } else if (cmd == "rewrite-crush") {
+ err = rewrite_crush(argv[0], subcmds, st);
+ } else if (cmd == "rebuild") {
+ err = rebuild_monstore(argv[0], subcmds, st);
+ } else {
+ std::cerr << "Unrecognized command: " << cmd << std::endl;
+ usage(argv[0], desc);
+ goto done;
+ }
+
+ done:
+ st.close();
+ return err;
+}
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
new file mode 100644
index 00000000..9ae5750c
--- /dev/null
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -0,0 +1,4249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/optional.hpp>
+
+#include <stdlib.h>
+
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+
+#include "global/global_init.h"
+
+#include "os/ObjectStore.h"
+#include "os/filestore/FileJournal.h"
+#include "os/filestore/FileStore.h"
+#ifdef HAVE_LIBFUSE
+#include "os/FuseStore.h"
+#endif
+
+#include "osd/PGLog.h"
+#include "osd/OSD.h"
+#include "osd/PG.h"
+#include "osd/ECUtil.h"
+
+#include "json_spirit/json_spirit_value.h"
+#include "json_spirit/json_spirit_reader.h"
+
+#include "rebuild_mondb.h"
+#include "ceph_objectstore_tool.h"
+#include "include/compat.h"
+#include "include/util.h"
+
+namespace po = boost::program_options;
+
+#ifdef INTERNAL_TEST
+CompatSet get_test_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+#ifdef INTERNAL_TEST2
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+#endif
+
+const ssize_t max_read = 1024 * 1024;
+const int fd_none = INT_MIN;
+bool outistty;
+bool dry_run;
+
+struct action_on_object_t {
+ virtual ~action_on_object_t() {}
+ virtual void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) = 0;
+};
+
+int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
+{
+ auto ch = store->open_collection(coll);
+ unsigned LIST_AT_A_TIME = 100;
+ ghobject_t next;
+ while (!next.is_max()) {
+ vector<ghobject_t> list;
+ int r = store->collection_list(ch,
+ next,
+ ghobject_t::get_max(),
+ LIST_AT_A_TIME,
+ &list,
+ &next);
+ if (r < 0) {
+ cerr << "Error listing collection: " << coll << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ for (vector<ghobject_t>::iterator obj = list.begin();
+ obj != list.end();
+ ++obj) {
+ if (obj->is_pgmeta())
+ continue;
+ object_info_t oi;
+ if (coll != coll_t::meta()) {
+ bufferlist attr;
+ r = store->getattr(ch, *obj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
+ << cpp_strerror(r) << std::endl;
+ } else {
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding attr on : " << make_pair(coll, *obj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ }
+ }
+ action.call(store, coll, *obj, oi);
+ }
+ }
+ return 0;
+}
+
+int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug)
+{
+ spg_t pgid;
+ // Scan collections in case this is an ec pool but no shard specified
+ unsigned scanned = 0;
+ int r = 0;
+ vector<coll_t> colls_to_check;
+ vector<coll_t> candidates;
+ r = store->list_collections(candidates);
+ if (r < 0) {
+ cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ pgid.parse(pgidstr.c_str());
+ for (vector<coll_t>::iterator i = candidates.begin();
+ i != candidates.end();
+ ++i) {
+ spg_t cand_pgid;
+ if (!i->is_pg(&cand_pgid))
+ continue;
+
+ // If an exact match or treat no shard as any shard
+ if (cand_pgid == pgid ||
+ (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) {
+ colls_to_check.push_back(*i);
+ }
+ }
+
+ if (debug)
+ cerr << colls_to_check.size() << " pgs to scan" << std::endl;
+ for (vector<coll_t>::iterator i = colls_to_check.begin();
+ i != colls_to_check.end();
+ ++i, ++scanned) {
+ if (debug)
+ cerr << "Scanning " << *i << ", " << scanned << "/"
+ << colls_to_check.size() << " completed" << std::endl;
+ r = _action_on_all_objects_in_pg(store, *i, action, debug);
+ if (r < 0)
+ break;
+ }
+ return r;
+}
+
+int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug)
+{
+ int r = _action_on_all_objects_in_pg(store, coll, action, debug);
+ return r;
+}
+
+int _action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug)
+{
+ unsigned scanned = 0;
+ int r = 0;
+ vector<coll_t> colls_to_check;
+ vector<coll_t> candidates;
+ r = store->list_collections(candidates);
+ if (r < 0) {
+ cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ for (vector<coll_t>::iterator i = candidates.begin();
+ i != candidates.end();
+ ++i) {
+ if (i->is_pg()) {
+ colls_to_check.push_back(*i);
+ }
+ }
+
+ if (debug)
+ cerr << colls_to_check.size() << " pgs to scan" << std::endl;
+ for (vector<coll_t>::iterator i = colls_to_check.begin();
+ i != colls_to_check.end();
+ ++i, ++scanned) {
+ if (debug)
+ cerr << "Scanning " << *i << ", " << scanned << "/"
+ << colls_to_check.size() << " completed" << std::endl;
+ r = _action_on_all_objects_in_pg(store, *i, action, debug);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+int action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug)
+{
+ int r = _action_on_all_objects(store, action, debug);
+ return r;
+}
+
+struct pgid_object_list {
+ list<pair<coll_t, ghobject_t> > _objects;
+
+ void insert(coll_t coll, ghobject_t &ghobj) {
+ _objects.push_back(make_pair(coll, ghobj));
+ }
+
+ void dump(Formatter *f, bool human_readable) const {
+ if (!human_readable)
+ f->open_array_section("pgid_objects");
+ for (list<pair<coll_t, ghobject_t> >::const_iterator i = _objects.begin();
+ i != _objects.end();
+ ++i) {
+ f->open_array_section("pgid_object");
+ spg_t pgid;
+ bool is_pg = i->first.is_pg(&pgid);
+ if (is_pg)
+ f->dump_string("pgid", stringify(pgid));
+ if (!is_pg || !human_readable)
+ f->dump_string("coll", i->first.to_str());
+ f->open_object_section("ghobject");
+ i->second.dump(f);
+ f->close_section();
+ f->close_section();
+ if (human_readable) {
+ f->flush(cout);
+ cout << std::endl;
+ }
+ }
+ if (!human_readable) {
+ f->close_section();
+ f->flush(cout);
+ cout << std::endl;
+ }
+ }
+};
+
+struct lookup_ghobject : public action_on_object_t {
+ pgid_object_list _objects;
+ const string _name;
+ const boost::optional<std::string> _namespace;
+ bool _need_snapset;
+
+ lookup_ghobject(const string& name, const boost::optional<std::string>& nspace, bool need_snapset = false) : _name(name),
+ _namespace(nspace), _need_snapset(need_snapset) { }
+
+ void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) override {
+ if (_need_snapset && !ghobj.hobj.has_snapset())
+ return;
+ if ((_name.length() == 0 || ghobj.hobj.oid.name == _name) &&
+ (!_namespace || ghobj.hobj.nspace == _namespace))
+ _objects.insert(coll, ghobj);
+ return;
+ }
+
+ int size() const {
+ return _objects._objects.size();
+ }
+
+ pair<coll_t, ghobject_t> pop() {
+ pair<coll_t, ghobject_t> front = _objects._objects.front();
+ _objects._objects.pop_front();
+ return front;
+ }
+
+ void dump(Formatter *f, bool human_readable) const {
+ _objects.dump(f, human_readable);
+ }
+};
+
+int file_fd = fd_none;
+bool debug;
+bool force = false;
+super_header sh;
+
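+// read the entire contents of fd into bl, in chunks of at most max_read bytes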
+static int get_fd_data(int fd, bufferlist &bl)
+{
+ uint64_t total = 0;
+ do {
+ ssize_t bytes = bl.read_fd(fd, max_read);
+ if (bytes < 0) {
+ cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+ return bytes;
+ }
+
+ if (bytes == 0)
+ break;
+
+ total += bytes;
+ } while(true);
+
+ ceph_assert(bl.length() == total);
+ return 0;
+}
+
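+// load the pg log and missing set for pgid from its pgmeta object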
+int get_log(ObjectStore *fs, __u8 struct_ver,
+ spg_t pgid, const pg_info_t &info,
+ PGLog::IndexedLog &log, pg_missing_t &missing)
+{
+ try {
+ auto ch = fs->open_collection(coll_t(pgid));
+ if (!ch) {
+ return -ENOENT;
+ }
+ ostringstream oss;
+ ceph_assert(struct_ver > 0);
+ PGLog::read_log_and_missing(
+ fs, ch,
+ pgid.make_pgmeta_oid(),
+ info, log, missing,
+ oss,
+ g_ceph_context->_conf->osd_ignore_stale_divergent_priors);
+ if (debug && oss.str().size())
+ cerr << oss.str() << std::endl;
+ }
+ catch (const buffer::error &e) {
+ cerr << "read_log_and_missing threw exception error " << e.what() << std::endl;
+ return -EFAULT;
+ }
+ return 0;
+}
+
+void dump_log(Formatter *formatter, ostream &out, pg_log_t &log,
+ pg_missing_t &missing)
+{
+ formatter->open_object_section("op_log");
+ formatter->open_object_section("pg_log_t");
+ log.dump(formatter);
+ formatter->close_section();
+ formatter->flush(out);
+ formatter->open_object_section("pg_missing_t");
+ missing.dump(formatter);
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(out);
+}
+
+//Based on part of OSD::load_pgs()
+int finish_remove_pgs(ObjectStore *store)
+{
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ if (r < 0) {
+ cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ for (vector<coll_t>::iterator it = ls.begin();
+ it != ls.end();
+ ++it) {
+ spg_t pgid;
+
+ if (it->is_temp(&pgid) ||
+ (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
+ cout << "finish_remove_pgs " << *it << " removing " << pgid << std::endl;
+ OSD::recursive_remove_collection(g_ceph_context, store, pgid, *it);
+ continue;
+ }
+
+ //cout << "finish_remove_pgs ignoring unrecognized " << *it << std::endl;
+ }
+ return 0;
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
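+// flag a pg for deletion by setting the '_remove' key in its pgmeta omap;
+// the actual removal is performed later by finish_remove_pgs()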
+int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t)
+{
+ pg_info_t info(pgid);
+ coll_t coll(pgid);
+ ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
+
+ epoch_t map_epoch = 0;
+ int r = PG::peek_map_epoch(fs, pgid, &map_epoch);
+ if (r < 0)
+ cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl;
+ PastIntervals past_intervals;
+ __u8 struct_v;
+ r = PG::read_info(fs, pgid, coll, info, past_intervals, struct_v);
+ if (r < 0) {
+ cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ ceph_assert(struct_v >= 8);
+ // new omap key
+ cout << "setting '_remove' omap key" << std::endl;
+ map<string,bufferlist> values;
+ encode((char)1, values["_remove"]);
+ t->omap_setkeys(coll, pgmeta_oid, values);
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
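+// run func() (which is expected to queue txn) and block until the
+// transaction's on_complete callback has fired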
+template<typename Func>
+void wait_until_done(ObjectStore::Transaction* txn, Func&& func)
+{
+ bool finished = false;
+ std::condition_variable cond;
+ std::mutex m;
+ txn->register_on_complete(make_lambda_context([&]() {
+ std::unique_lock lock{m};
+ finished = true;
+ cond.notify_one();
+ }));
+ std::move(func)();
+ std::unique_lock lock{m};
+ cond.wait(lock, [&] {return finished;});
+}
+
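+// mark the given pg for removal and reap it; with --dry-run nothing is
+// actually removed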
+int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid)
+{
+ if (!dry_run)
+ finish_remove_pgs(store);
+ if (!store->collection_exists(coll_t(r_pgid)))
+ return -ENOENT;
+
+ cout << " marking collection for removal" << std::endl;
+ if (dry_run)
+ return 0;
+ ObjectStore::Transaction rmt;
+ int r = mark_pg_for_removal(store, r_pgid, &rmt);
+ if (r < 0) {
+ return r;
+ }
+ ObjectStore::CollectionHandle ch = store->open_collection(coll_t(r_pgid));
+ store->queue_transaction(ch, std::move(rmt));
+ finish_remove_pgs(store);
+ return r;
+}
+
+int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
+ PastIntervals &past_intervals)
+{
+ //Empty for this
+ coll_t coll(info.pgid);
+ ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
+ map<string,bufferlist> km;
+ pg_info_t last_written_info;
+ int ret = PG::_prepare_write_info(
+ g_ceph_context,
+ &km, epoch,
+ info,
+ last_written_info,
+ past_intervals,
+ true, true, false);
+ if (ret) cerr << "Failed to write info" << std::endl;
+ t.omap_setkeys(coll, pgmeta_oid, km);
+ return ret;
+}
+
+typedef map<eversion_t, hobject_t> divergent_priors_t;
+
+int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
+ pg_log_t &log, PastIntervals &past_intervals,
+ divergent_priors_t &divergent,
+ pg_missing_t &missing)
+{
+ cout << __func__ << " epoch " << epoch << " info " << info << std::endl;
+ int ret = write_info(t, epoch, info, past_intervals);
+ if (ret)
+ return ret;
+ coll_t coll(info.pgid);
+ map<string,bufferlist> km;
+
+ if (!divergent.empty()) {
+ ceph_assert(missing.get_items().empty());
+ PGLog::write_log_and_missing_wo_missing(
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
+ } else {
+ pg_missing_tracker_t tmissing(missing);
+ bool rebuilt_missing_set_with_deletes = missing.may_include_deletes;
+ PGLog::write_log_and_missing(
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true,
+ &rebuilt_missing_set_with_deletes);
+ }
+ t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
+ return 0;
+}
+
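+// trim the pg log stored in the pgmeta omap down to osd_max_pg_log_entries
+// entries, removing at most osd_pg_log_trim_max keys per transaction, then
+// update the pg info's log_tail and compact the store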
+int do_trim_pg_log(ObjectStore *store, const coll_t &coll,
+ pg_info_t &info, const spg_t &pgid,
+ epoch_t map_epoch,
+ PastIntervals &past_intervals)
+{
+ ghobject_t oid = pgid.make_pgmeta_oid();
+ struct stat st;
+ auto ch = store->open_collection(coll);
+ int r = store->stat(ch, oid, &st);
+ ceph_assert(r == 0);
+ ceph_assert(st.st_size == 0);
+
+ cerr << "Log bounds are: " << "(" << info.log_tail << ","
+ << info.last_update << "]" << std::endl;
+
+ uint64_t max_entries = g_ceph_context->_conf->osd_max_pg_log_entries;
+ if (info.last_update.version - info.log_tail.version <= max_entries) {
+ cerr << "Log not larger than osd_max_pg_log_entries " << max_entries << std::endl;
+ return 0;
+ }
+
+ ceph_assert(info.last_update.version > max_entries);
+ version_t trim_to = info.last_update.version - max_entries;
+ size_t trim_at_once = g_ceph_context->_conf->osd_pg_log_trim_max;
+ eversion_t new_tail;
+ bool done = false;
+
+ while (!done) {
+ // gather keys so we can delete them in a batch without
+ // affecting the iterator
+ set<string> keys_to_trim;
+ {
+ ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid);
+ if (!p)
+ break;
+ for (p->seek_to_first(); p->valid(); p->next()) {
+ if (p->key()[0] == '_')
+ continue;
+ if (p->key() == "can_rollback_to")
+ continue;
+ if (p->key() == "divergent_priors")
+ continue;
+ if (p->key() == "rollback_info_trimmed_to")
+ continue;
+ if (p->key() == "may_include_deletes_in_missing")
+ continue;
+ if (p->key().substr(0, 7) == string("missing"))
+ continue;
+ if (p->key().substr(0, 4) == string("dup_"))
+ continue;
+
+ bufferlist bl = p->value();
+ auto bp = bl.cbegin();
+ pg_log_entry_t e;
+ try {
+ e.decode_with_checksum(bp);
+ } catch (const buffer::error &e) {
+ cerr << "Error reading pg log entry: " << e << std::endl;
+ }
+ if (debug) {
+ cerr << "read entry " << e << std::endl;
+ }
+ if (e.version.version > trim_to) {
+ done = true;
+ break;
+ }
+ keys_to_trim.insert(p->key());
+ new_tail = e.version;
+ if (keys_to_trim.size() >= trim_at_once)
+ break;
+ }
+
+ if (!p->valid())
+ done = true;
+ } // deconstruct ObjectMapIterator
+
+ // delete the keys
+ if (!dry_run && !keys_to_trim.empty()) {
+ cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl;
+ ObjectStore::Transaction t;
+ t.omap_rmkeys(coll, oid, keys_to_trim);
+ store->queue_transaction(ch, std::move(t));
+ ch->flush();
+ }
+ }
+
+ // update pg info with new tail
+ if (!dry_run && new_tail != eversion_t()) {
+ info.log_tail = new_tail;
+ ObjectStore::Transaction t;
+ int ret = write_info(t, map_epoch, info, past_intervals);
+ if (ret)
+ return ret;
+ store->queue_transaction(ch, std::move(t));
+ ch->flush();
+ }
+
+ // compact the db since we just removed a bunch of data
+ cerr << "Finished trimming, now compacting..." << std::endl;
+ if (!dry_run)
+ store->compact();
+ return 0;
+}
+
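+// copy up to OMAP_BATCH_SIZE key/value pairs from iter into oset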
+const int OMAP_BATCH_SIZE = 25;
+void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist> &oset)
+{
+ oset.clear();
+ for (int count = OMAP_BATCH_SIZE; count && iter->valid(); --count, iter->next()) {
+ oset.insert(pair<string, bufferlist>(iter->key(), iter->value()));
+ }
+}
+
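+// write a single object to the export stream: object_begin, data in
+// max_read-sized chunks, attrs, omap header, omap entries, object_end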
+int ObjectStoreTool::export_file(ObjectStore *store, coll_t cid, ghobject_t &obj)
+{
+ struct stat st;
+ mysize_t total;
+ footer ft;
+
+ auto ch = store->open_collection(cid);
+ int ret = store->stat(ch, obj, &st);
+ if (ret < 0)
+ return ret;
+
+ cerr << "Read " << obj << std::endl;
+
+ total = st.st_size;
+ if (debug)
+ cerr << "size=" << total << std::endl;
+
+ object_begin objb(obj);
+
+ {
+ bufferptr bp;
+ bufferlist bl;
+ ret = store->getattr(ch, obj, OI_ATTR, bp);
+ if (ret < 0) {
+ cerr << "getattr failure object_info " << ret << std::endl;
+ return ret;
+ }
+ bl.push_back(bp);
+ decode(objb.oi, bl);
+ if (debug)
+ cerr << "object_info: " << objb.oi << std::endl;
+ }
+
+ // NOTE: we include whiteouts, lost, etc.
+
+ ret = write_section(TYPE_OBJECT_BEGIN, objb, file_fd);
+ if (ret < 0)
+ return ret;
+
+ uint64_t offset = 0;
+ bufferlist rawdatabl;
+ while(total > 0) {
+ rawdatabl.clear();
+ mysize_t len = max_read;
+ if (len > total)
+ len = total;
+
+ ret = store->read(ch, obj, offset, len, rawdatabl);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -EINVAL;
+
+ data_section dblock(offset, len, rawdatabl);
+ if (debug)
+ cerr << "data section offset=" << offset << " len=" << len << std::endl;
+
+ total -= ret;
+ offset += ret;
+
+ ret = write_section(TYPE_DATA, dblock, file_fd);
+ if (ret) return ret;
+ }
+
+ //Handle attrs for this object
+ map<string,bufferptr> aset;
+ ret = store->getattrs(ch, obj, aset);
+ if (ret) return ret;
+ attr_section as(aset);
+ ret = write_section(TYPE_ATTRS, as, file_fd);
+ if (ret)
+ return ret;
+
+ if (debug) {
+ cerr << "attrs size " << aset.size() << std::endl;
+ }
+
+ //Handle omap information
+ bufferlist hdrbuf;
+ ret = store->omap_get_header(ch, obj, &hdrbuf, true);
+ if (ret < 0) {
+ cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+
+ omap_hdr_section ohs(hdrbuf);
+ ret = write_section(TYPE_OMAP_HDR, ohs, file_fd);
+ if (ret)
+ return ret;
+
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, obj);
+ if (!iter) {
+ ret = -ENOENT;
+ cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ iter->seek_to_first();
+ int mapcount = 0;
+ map<string, bufferlist> out;
+ while(iter->valid()) {
+ get_omap_batch(iter, out);
+
+ if (out.empty()) break;
+
+ mapcount += out.size();
+ omap_section oms(out);
+ ret = write_section(TYPE_OMAP, oms, file_fd);
+ if (ret)
+ return ret;
+ }
+ if (debug)
+ cerr << "omap map size " << mapcount << std::endl;
+
+ ret = write_simple(TYPE_OBJECT_END, file_fd);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
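+// export every object in the collection, skipping pgmeta objects, temporary
+// objects and objects that carry a generation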
+int ObjectStoreTool::export_files(ObjectStore *store, coll_t coll)
+{
+ ghobject_t next;
+ auto ch = store->open_collection(coll);
+ while (!next.is_max()) {
+ vector<ghobject_t> objects;
+ int r = store->collection_list(ch, next, ghobject_t::get_max(), 300,
+ &objects, &next);
+ if (r < 0)
+ return r;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ ceph_assert(!i->hobj.is_meta());
+ if (i->is_pgmeta() || i->hobj.is_temp() || !i->is_no_gen()) {
+ continue;
+ }
+ r = export_file(store, coll, *i);
+ if (r < 0)
+ return r;
+ }
+ }
+ return 0;
+}
+
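+// store an incremental osdmap under epoch e (or under the epoch encoded in
+// bl when e is 0); unless --force is given, the epoch must match and the
+// corresponding object must already exist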
+int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) {
+ OSDMap::Incremental inc;
+ auto it = bl.cbegin();
+ inc.decode(it);
+ if (e == 0) {
+ e = inc.epoch;
+ } else if (e != inc.epoch) {
+ cerr << "incremental.epoch mismatch: "
+ << inc.epoch << " != " << e << std::endl;
+ if (force) {
+ cerr << "But will continue anyway." << std::endl;
+ } else {
+ return -EINVAL;
+ }
+ }
+ auto ch = store->open_collection(coll_t::meta());
+ const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e);
+ if (!store->exists(ch, inc_oid)) {
+ cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl;
+ if (!force) {
+ return -ENOENT;
+ }
+ cout << "Creating a new epoch." << std::endl;
+ }
+ if (dry_run)
+ return 0;
+ ObjectStore::Transaction t;
+ t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl);
+ t.truncate(coll_t::meta(), inc_oid, bl.length());
+ store->queue_transaction(ch, std::move(t));
+ return 0;
+}
+
+int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl)
+{
+ auto ch = store->open_collection(coll_t::meta());
+ if (store->read(ch,
+ OSD::get_inc_osdmap_pobject_name(e),
+ 0, 0, bl) < 0) {
+ return -ENOENT;
+ }
+ return 0;
+}
+
+int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) {
+ OSDMap osdmap;
+ osdmap.decode(bl);
+ if (e == 0) {
+ e = osdmap.get_epoch();
+ } else if (e != osdmap.get_epoch()) {
+ cerr << "osdmap.epoch mismatch: "
+ << e << " != " << osdmap.get_epoch() << std::endl;
+ if (force) {
+ cerr << "But will continue anyway." << std::endl;
+ } else {
+ return -EINVAL;
+ }
+ }
+ auto ch = store->open_collection(coll_t::meta());
+ const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e);
+ if (!store->exists(ch, full_oid)) {
+ cerr << "osdmap (" << full_oid << ") does not exist." << std::endl;
+ if (!force) {
+ return -ENOENT;
+ }
+ cout << "Creating a new epoch." << std::endl;
+ }
+ if (dry_run)
+ return 0;
+ ObjectStore::Transaction t;
+ t.write(coll_t::meta(), full_oid, 0, bl.length(), bl);
+ t.truncate(coll_t::meta(), full_oid, bl.length());
+ store->queue_transaction(ch, std::move(t));
+ return 0;
+}
+
+int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl)
+{
+ ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta());
+ bool found = store->read(
+ ch, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
+ if (!found) {
+ cerr << "Can't find OSDMap for pg epoch " << e << std::endl;
+ return -ENOENT;
+ }
+ osdmap.decode(bl);
+ if (debug)
+ cerr << osdmap << std::endl;
+ return 0;
+}
+
+int get_pg_num_history(ObjectStore *store, pool_pg_num_history_t *h)
+{
+ ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta());
+ bufferlist bl;
+ auto pghist = OSD::make_pg_num_history_oid();
+ int r = store->read(ch, pghist, 0, 0, bl, 0);
+ if (r >= 0 && bl.length() > 0) {
+ auto p = bl.cbegin();
+ decode(*h, p);
+ }
+ cout << __func__ << " pg_num_history " << *h << std::endl;
+ return 0;
+}
+
+int add_osdmap(ObjectStore *store, metadata_section &ms)
+{
+ return get_osdmap(store, ms.map_epoch, ms.osdmap, ms.osdmap_bl);
+}
+
+int ObjectStoreTool::do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
+ pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+ const OSDSuperblock& superblock,
+ PastIntervals &past_intervals)
+{
+ PGLog::IndexedLog log;
+ pg_missing_t missing;
+
+ cerr << "Exporting " << pgid << " info " << info << std::endl;
+
+ int ret = get_log(fs, struct_ver, pgid, info, log, missing);
+  if (ret < 0)
+ return ret;
+
+ if (debug) {
+ Formatter *formatter = Formatter::create("json-pretty");
+ ceph_assert(formatter);
+ dump_log(formatter, cerr, log, missing);
+ delete formatter;
+ }
+ write_super();
+
+ pg_begin pgb(pgid, superblock);
+  // Special case: if this is a replicated pg, don't require the importing OSD to have the shard feature
+ if (pgid.is_no_shard()) {
+ pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ }
+ ret = write_section(TYPE_PG_BEGIN, pgb, file_fd);
+ if (ret)
+ return ret;
+
+ // The metadata_section is now before files, so import can detect
+ // errors and abort without wasting time.
+ metadata_section ms(
+ struct_ver,
+ map_epoch,
+ info,
+ log,
+ past_intervals,
+ missing);
+ ret = add_osdmap(fs, ms);
+ if (ret)
+ return ret;
+ ret = write_section(TYPE_PG_METADATA, ms, file_fd);
+ if (ret)
+ return ret;
+
+ ret = export_files(fs, coll);
+ if (ret) {
+ cerr << "export_files error " << ret << std::endl;
+ return ret;
+ }
+
+ ret = write_simple(TYPE_PG_END, file_fd);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int dump_data(Formatter *formatter, bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ data_section ds;
+ ds.decode(ebliter);
+
+ formatter->open_object_section("data_block");
+ formatter->dump_unsigned("offset", ds.offset);
+ formatter->dump_unsigned("len", ds.len);
+ // XXX: Add option to dump data like od -cx ?
+ formatter->close_section();
+ formatter->flush(cout);
+ return 0;
+}
+
+int get_data(ObjectStore *store, coll_t coll, ghobject_t hoid,
+ ObjectStore::Transaction *t, bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ data_section ds;
+ ds.decode(ebliter);
+
+ if (debug)
+ cerr << "\tdata: offset " << ds.offset << " len " << ds.len << std::endl;
+ t->write(coll, hoid, ds.offset, ds.len, ds.databl);
+ return 0;
+}
+
+int dump_attrs(
+ Formatter *formatter, ghobject_t hoid,
+ bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ attr_section as;
+ as.decode(ebliter);
+
+ // This could have been handled in the caller if we didn't need to
+ // support exports that didn't include object_info_t in object_begin.
+ if (hoid.generation == ghobject_t::NO_GEN &&
+ hoid.hobj.is_head()) {
+ map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR);
+ if (mi != as.data.end()) {
+ SnapSet snapset;
+ auto p = mi->second.cbegin();
+ snapset.decode(p);
+ formatter->open_object_section("snapset");
+ snapset.dump(formatter);
+ formatter->close_section();
+ } else {
+ formatter->open_object_section("snapset");
+ formatter->dump_string("error", "missing SS_ATTR");
+ formatter->close_section();
+ }
+ }
+
+ formatter->open_object_section("attrs");
+ formatter->open_array_section("user");
+ for (auto kv : as.data) {
+ // Skip system attributes
+ if (('_' != kv.first.at(0)) || kv.first.size() == 1)
+ continue;
+ formatter->open_object_section("user_attr");
+ formatter->dump_string("name", kv.first.substr(1));
+ bool b64;
+ formatter->dump_string("value", cleanbin(kv.second, b64));
+ formatter->dump_bool("Base64", b64);
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->open_array_section("system");
+ for (auto kv : as.data) {
+ // Skip user attributes
+ if (('_' == kv.first.at(0)) && kv.first.size() != 1)
+ continue;
+ formatter->open_object_section("sys_attr");
+ formatter->dump_string("name", kv.first);
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+
+ return 0;
+}
+
+int get_attrs(
+ ObjectStore *store, coll_t coll, ghobject_t hoid,
+ ObjectStore::Transaction *t, bufferlist &bl,
+ OSDriver &driver, SnapMapper &snap_mapper)
+{
+ auto ebliter = bl.cbegin();
+ attr_section as;
+ as.decode(ebliter);
+
+ auto ch = store->open_collection(coll);
+ if (debug)
+ cerr << "\tattrs: len " << as.data.size() << std::endl;
+ t->setattrs(coll, hoid, as.data);
+
+ // This could have been handled in the caller if we didn't need to
+ // support exports that didn't include object_info_t in object_begin.
+ if (hoid.generation == ghobject_t::NO_GEN &&
+ hoid.hobj.is_head()) {
+ map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR);
+ if (mi != as.data.end()) {
+ SnapSet snapset;
+ auto p = mi->second.cbegin();
+ snapset.decode(p);
+ cout << "snapset " << snapset << std::endl;
+ for (auto& p : snapset.clone_snaps) {
+ ghobject_t clone = hoid;
+ clone.hobj.snap = p.first;
+ set<snapid_t> snaps(p.second.begin(), p.second.end());
+ if (!store->exists(ch, clone)) {
+ // no clone, skip. this is probably a cache pool. this works
+ // because we use a separate transaction per object and clones
+ // come before head in the archive.
+ if (debug)
+ cerr << "\tskipping missing " << clone << " (snaps "
+ << snaps << ")" << std::endl;
+ continue;
+ }
+ if (debug)
+ cerr << "\tsetting " << clone.hobj << " snaps " << snaps
+ << std::endl;
+ OSDriver::OSTransaction _t(driver.get_transaction(t));
+ ceph_assert(!snaps.empty());
+ snap_mapper.add_oid(clone.hobj, snaps, &_t);
+ }
+ } else {
+ cerr << "missing SS_ATTR on " << hoid << std::endl;
+ }
+ }
+ return 0;
+}
+
+int dump_omap_hdr(Formatter *formatter, bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ omap_hdr_section oh;
+ oh.decode(ebliter);
+
+ formatter->open_object_section("omap_header");
+ formatter->dump_string("value", string(oh.hdr.c_str(), oh.hdr.length()));
+ formatter->close_section();
+ formatter->flush(cout);
+ return 0;
+}
+
+int get_omap_hdr(ObjectStore *store, coll_t coll, ghobject_t hoid,
+ ObjectStore::Transaction *t, bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ omap_hdr_section oh;
+ oh.decode(ebliter);
+
+ if (debug)
+ cerr << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length())
+ << std::endl;
+ t->omap_setheader(coll, hoid, oh.hdr);
+ return 0;
+}
+
+int dump_omap(Formatter *formatter, bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ omap_section os;
+ os.decode(ebliter);
+
+ formatter->open_object_section("omaps");
+ formatter->dump_unsigned("count", os.omap.size());
+ formatter->open_array_section("data");
+ for (auto o : os.omap) {
+ formatter->open_object_section("omap");
+ formatter->dump_string("name", o.first);
+ bool b64;
+ formatter->dump_string("value", cleanbin(o.second, b64));
+ formatter->dump_bool("Base64", b64);
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+ return 0;
+}
+
+int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid,
+ ObjectStore::Transaction *t, bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ omap_section os;
+ os.decode(ebliter);
+
+ if (debug)
+ cerr << "\tomap: size " << os.omap.size() << std::endl;
+ t->omap_setkeys(coll, hoid, os.omap);
+ return 0;
+}
+
+int ObjectStoreTool::dump_object(Formatter *formatter,
+ bufferlist &bl)
+{
+ auto ebliter = bl.cbegin();
+ object_begin ob;
+ ob.decode(ebliter);
+
+ if (ob.hoid.hobj.is_temp()) {
+ cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
+ return -EFAULT;
+ }
+
+ formatter->open_object_section("object");
+ formatter->open_object_section("oid");
+ ob.hoid.dump(formatter);
+ formatter->close_section();
+ formatter->open_object_section("object_info");
+ ob.oi.dump(formatter);
+ formatter->close_section();
+
+ bufferlist ebl;
+ bool done = false;
+ while(!done) {
+ sectiontype_t type;
+ int ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+ //cout << "\t\tsection size " << ebl.length() << std::endl;
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown object section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_DATA:
+ if (dry_run) break;
+ ret = dump_data(formatter, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_ATTRS:
+ if (dry_run) break;
+ ret = dump_attrs(formatter, ob.hoid, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_OMAP_HDR:
+ if (dry_run) break;
+ ret = dump_omap_hdr(formatter, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_OMAP:
+ if (dry_run) break;
+ ret = dump_omap(formatter, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_OBJECT_END:
+ done = true;
+ break;
+ default:
+ cerr << "Unknown section type " << type << std::endl;
+ return -EFAULT;
+ }
+ }
+ formatter->close_section();
+ return 0;
+}
+
+int ObjectStoreTool::get_object(ObjectStore *store,
+ OSDriver& driver,
+ SnapMapper& mapper,
+ coll_t coll,
+ bufferlist &bl, OSDMap &origmap,
+ bool *skipped_objects)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ auto ebliter = bl.cbegin();
+ object_begin ob;
+ ob.decode(ebliter);
+
+ if (ob.hoid.hobj.is_temp()) {
+ cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl;
+ return -EFAULT;
+ }
+ ceph_assert(g_ceph_context);
+
+ auto ch = store->open_collection(coll);
+ if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) {
+ object_t oid = ob.hoid.hobj.oid;
+ object_locator_t loc(ob.hoid.hobj);
+ pg_t raw_pgid = origmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = origmap.raw_pg_to_pg(raw_pgid);
+
+ spg_t coll_pgid;
+ if (coll.is_pg(&coll_pgid) == false) {
+ cerr << "INTERNAL ERROR: Bad collection during import" << std::endl;
+ return -EFAULT;
+ }
+ if (coll_pgid.shard != ob.hoid.shard_id) {
+ cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard
+ << " but object shard is " << ob.hoid.shard_id << std::endl;
+ return -EFAULT;
+ }
+
+ if (coll_pgid.pgid != pgid) {
+ cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl;
+ *skipped_objects = true;
+ skip_object(bl);
+ return 0;
+ }
+ }
+
+ if (!dry_run)
+ t->touch(coll, ob.hoid);
+
+ cout << "Write " << ob.hoid << std::endl;
+
+ bufferlist ebl;
+ bool done = false;
+ while(!done) {
+ sectiontype_t type;
+ int ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+ //cout << "\t\tsection size " << ebl.length() << std::endl;
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown object section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_DATA:
+ if (dry_run) break;
+ ret = get_data(store, coll, ob.hoid, t, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_ATTRS:
+ if (dry_run) break;
+ ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper);
+ if (ret) return ret;
+ break;
+ case TYPE_OMAP_HDR:
+ if (dry_run) break;
+ ret = get_omap_hdr(store, coll, ob.hoid, t, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_OMAP:
+ if (dry_run) break;
+ ret = get_omap(store, coll, ob.hoid, t, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_OBJECT_END:
+ done = true;
+ break;
+ default:
+ cerr << "Unknown section type " << type << std::endl;
+ return -EFAULT;
+ }
+ }
+ if (!dry_run) {
+ wait_until_done(t, [&] {
+ store->queue_transaction(ch, std::move(*t));
+ ch->flush();
+ });
+ }
+ return 0;
+}
+
+int dump_pg_metadata(Formatter *formatter, bufferlist &bl, metadata_section &ms)
+{
+ auto ebliter = bl.cbegin();
+ ms.decode(ebliter);
+
+ formatter->open_object_section("metadata_section");
+
+ formatter->dump_unsigned("pg_disk_version", (int)ms.struct_ver);
+ formatter->dump_unsigned("map_epoch", ms.map_epoch);
+
+ formatter->open_object_section("OSDMap");
+ ms.osdmap.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+
+ formatter->open_object_section("info");
+ ms.info.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+
+ formatter->open_object_section("log");
+ ms.log.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+
+ formatter->open_object_section("pg_missing_t");
+ ms.missing.dump(formatter);
+ formatter->close_section();
+
+ // XXX: ms.past_intervals?
+
+ formatter->close_section();
+ formatter->flush(cout);
+
+ if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) {
+ cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl;
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms,
+ const OSDSuperblock& sb, spg_t pgid)
+{
+ auto ebliter = bl.cbegin();
+ ms.decode(ebliter);
+ spg_t old_pgid = ms.info.pgid;
+ ms.info.pgid = pgid;
+
+ if (debug) {
+ cout << "export pgid " << old_pgid << std::endl;
+ cout << "struct_v " << (int)ms.struct_ver << std::endl;
+ cout << "map epoch " << ms.map_epoch << std::endl;
+
+#ifdef DIAGNOSTIC
+ Formatter *formatter = new JSONFormatter(true);
+ formatter->open_object_section("stuff");
+
+ formatter->open_object_section("importing OSDMap");
+ ms.osdmap.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+
+ cout << "osd current epoch " << sb.current_epoch << std::endl;
+
+ formatter->open_object_section("info");
+ ms.info.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+
+ formatter->open_object_section("log");
+ ms.log.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+#endif
+ }
+
+ if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) {
+ cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl;
+ return -EFAULT;
+ }
+
+ if (ms.map_epoch > sb.current_epoch) {
+ cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl;
+ cerr << "The OSD you are using is older than the exported PG" << std::endl;
+ cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl;
+ return -EINVAL;
+ }
+
+ // Old exports didn't include OSDMap
+ if (ms.osdmap.get_epoch() == 0) {
+ cerr << "WARNING: No OSDMap in old export, this is an ancient export."
+ " Not supported." << std::endl;
+ return -EINVAL;
+ }
+
+ if (ms.osdmap.get_epoch() < sb.oldest_map) {
+ cerr << "PG export's map " << ms.osdmap.get_epoch()
+ << " is older than OSD's oldest_map " << sb.oldest_map << std::endl;
+ if (!force) {
+ cerr << " pass --force to proceed anyway (with incomplete PastIntervals)"
+ << std::endl;
+ return -EINVAL;
+ }
+ }
+ if (debug) {
+ cerr << "Import pgid " << ms.info.pgid << std::endl;
+ cerr << "Previous past_intervals " << ms.past_intervals << std::endl;
+ cerr << "history.same_interval_since "
+ << ms.info.history.same_interval_since << std::endl;
+ }
+
+ return 0;
+}
+
+// out: divergent priors from "in" that apply to import_pgid under curmap
+// reject: divergent priors from "in" that were rejected (temporary objects and
+//         entries that map to a different pg under curmap)
+void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap,
+ const string &hit_set_namespace, const divergent_priors_t &in,
+ divergent_priors_t &out, divergent_priors_t &reject)
+{
+ out.clear();
+ reject.clear();
+
+ for (divergent_priors_t::const_iterator i = in.begin();
+ i != in.end(); ++i) {
+
+ // Reject divergent priors for temporary objects
+ if (i->second.is_temp()) {
+ reject.insert(*i);
+ continue;
+ }
+
+ if (i->second.nspace != hit_set_namespace) {
+ object_t oid = i->second.oid;
+ object_locator_t loc(i->second);
+ pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
+
+ if (import_pgid.pgid == pgid) {
+ out.insert(*i);
+ } else {
+ reject.insert(*i);
+ }
+ } else {
+ out.insert(*i);
+ }
+ }
+}
+
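+// print the contents of a pg export using the given formatter: the pg_begin
+// header, the metadata section and every object section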
+int ObjectStoreTool::dump_export(Formatter *formatter)
+{
+ bufferlist ebl;
+ pg_info_t info;
+ PGLog::IndexedLog log;
+ //bool skipped_objects = false;
+
+ int ret = read_super();
+ if (ret)
+ return ret;
+
+ if (sh.magic != super_header::super_magic) {
+ cerr << "Invalid magic number" << std::endl;
+ return -EFAULT;
+ }
+
+ if (sh.version > super_header::super_ver) {
+ cerr << "Can't handle export format version=" << sh.version << std::endl;
+ return -EINVAL;
+ }
+
+ formatter->open_object_section("Export");
+
+ //First section must be TYPE_PG_BEGIN
+ sectiontype_t type;
+ ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+ if (type == TYPE_POOL_BEGIN) {
+ cerr << "Dump of pool exports not supported" << std::endl;
+ return -EINVAL;
+ } else if (type != TYPE_PG_BEGIN) {
+ cerr << "Invalid first section type " << std::to_string(type) << std::endl;
+ return -EFAULT;
+ }
+
+ auto ebliter = ebl.cbegin();
+ pg_begin pgb;
+ pgb.decode(ebliter);
+ spg_t pgid = pgb.pgid;
+
+ formatter->dump_string("pgid", stringify(pgid));
+ formatter->dump_string("cluster_fsid", stringify(pgb.superblock.cluster_fsid));
+ formatter->dump_string("features", stringify(pgb.superblock.compat_features));
+
+ bool done = false;
+ bool found_metadata = false;
+ metadata_section ms;
+ bool objects_started = false;
+ while(!done) {
+ ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ if (debug) {
+ cerr << "dump_export: Section type " << std::to_string(type) << std::endl;
+ }
+ if (type >= END_OF_TYPES) {
+ cerr << "Skipping unknown section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_OBJECT_BEGIN:
+ if (!objects_started) {
+ formatter->open_array_section("objects");
+ objects_started = true;
+ }
+ ret = dump_object(formatter, ebl);
+ if (ret) return ret;
+ break;
+ case TYPE_PG_METADATA:
+ if (objects_started)
+ cerr << "WARNING: metadata_section out of order" << std::endl;
+ ret = dump_pg_metadata(formatter, ebl, ms);
+ if (ret) return ret;
+ found_metadata = true;
+ break;
+ case TYPE_PG_END:
+ if (objects_started) {
+ formatter->close_section();
+ }
+ done = true;
+ break;
+ default:
+ cerr << "Unknown section type " << std::to_string(type) << std::endl;
+ return -EFAULT;
+ }
+ }
+
+ if (!found_metadata) {
+ cerr << "Missing metadata section" << std::endl;
+ return -EFAULT;
+ }
+
+ formatter->close_section();
+ formatter->flush(cout);
+
+ return 0;
+}
+
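+// Import a PG from an export stream: validate the export header and compat
+// features, make sure the pool still exists and the PG does not collide with
+// an existing collection or a pending split/merge, create the collection
+// (flagged for removal until the import completes), then replay the object
+// and metadata sections and write the filtered log/missing/divergent_priors.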
+int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
+ bool force, std::string pgidstr)
+{
+ bufferlist ebl;
+ pg_info_t info;
+ PGLog::IndexedLog log;
+ bool skipped_objects = false;
+
+ if (!dry_run)
+ finish_remove_pgs(store);
+
+ int ret = read_super();
+ if (ret)
+ return ret;
+
+ if (sh.magic != super_header::super_magic) {
+ cerr << "Invalid magic number" << std::endl;
+ return -EFAULT;
+ }
+
+ if (sh.version > super_header::super_ver) {
+ cerr << "Can't handle export format version=" << sh.version << std::endl;
+ return -EINVAL;
+ }
+
+ //First section must be TYPE_PG_BEGIN
+ sectiontype_t type;
+ ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+ if (type == TYPE_POOL_BEGIN) {
+ cerr << "Pool exports cannot be imported into a PG" << std::endl;
+ return -EINVAL;
+ } else if (type != TYPE_PG_BEGIN) {
+ cerr << "Invalid first section type " << std::to_string(type) << std::endl;
+ return -EFAULT;
+ }
+
+ auto ebliter = ebl.cbegin();
+ pg_begin pgb;
+ pgb.decode(ebliter);
+ spg_t pgid = pgb.pgid;
+
+ if (pgidstr.length()) {
+ spg_t user_pgid;
+
+ bool ok = user_pgid.parse(pgidstr.c_str());
+ // This succeeded in main() already
+ ceph_assert(ok);
+ if (pgid != user_pgid) {
+ cerr << "specified pgid " << user_pgid
+ << " does not match actual pgid " << pgid << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (!pgb.superblock.cluster_fsid.is_zero()
+ && pgb.superblock.cluster_fsid != sb.cluster_fsid) {
+ cerr << "Export came from different cluster with fsid "
+ << pgb.superblock.cluster_fsid << std::endl;
+ return -EINVAL;
+ }
+
+ if (debug) {
+ cerr << "Exported features: " << pgb.superblock.compat_features << std::endl;
+ }
+
+  // Special case: Old export has SHARDS incompat feature on replicated pg, remove it
+ if (pgid.is_no_shard())
+ pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+
+ if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+ CompatSet unsupported = sb.compat_features.unsupported(pgb.superblock.compat_features);
+
+ cerr << "Export has incompatible features set " << unsupported << std::endl;
+
+ // Let them import if they specify the --force option
+ if (!force)
+ return 11; // Positive return means exit status
+ }
+
+ // we need the latest OSDMap to check for collisions
+ OSDMap curmap;
+ bufferlist bl;
+ ret = get_osdmap(store, sb.current_epoch, curmap, bl);
+ if (ret) {
+ cerr << "Can't find latest local OSDMap " << sb.current_epoch << std::endl;
+ return ret;
+ }
+ if (!curmap.have_pg_pool(pgid.pgid.m_pool)) {
+ cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl;
+ // Special exit code for this error, used by test code
+ return 10; // Positive return means exit status
+ }
+
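+  // Gather the local history of pg_num changes so the metadata handler below
+  // can detect split/merge collisions before creating the collection.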
+ pool_pg_num_history_t pg_num_history;
+ get_pg_num_history(store, &pg_num_history);
+
+ ghobject_t pgmeta_oid = pgid.make_pgmeta_oid();
+
+ // Check for PG already present.
+ coll_t coll(pgid);
+ if (store->collection_exists(coll)) {
+ cerr << "pgid " << pgid << " already exists" << std::endl;
+ return -EEXIST;
+ }
+
+ ObjectStore::CollectionHandle ch;
+
+ OSDriver driver(
+ store,
+ coll_t(),
+ OSD::make_snapmapper_oid());
+ SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pgid.shard);
+
+ cout << "Importing pgid " << pgid;
+ cout << std::endl;
+
+ bool done = false;
+ bool found_metadata = false;
+ metadata_section ms;
+ while(!done) {
+ ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ if (debug) {
+ cout << __func__ << ": Section type " << std::to_string(type) << std::endl;
+ }
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_OBJECT_BEGIN:
+ ceph_assert(found_metadata);
+ ret = get_object(store, driver, mapper, coll, ebl, ms.osdmap,
+ &skipped_objects);
+ if (ret) return ret;
+ break;
+ case TYPE_PG_METADATA:
+ ret = get_pg_metadata(store, ebl, ms, sb, pgid);
+ if (ret) return ret;
+ found_metadata = true;
+
+ if (pgid != ms.info.pgid) {
+ cerr << "specified pgid " << pgid << " does not match import file pgid "
+ << ms.info.pgid << std::endl;
+ return -EINVAL;
+ }
+
+ // make sure there are no conflicting splits or merges
+ if (ms.osdmap.have_pg_pool(pgid.pgid.pool())) {
+ auto p = pg_num_history.pg_nums.find(pgid.pgid.m_pool);
+ if (p != pg_num_history.pg_nums.end() &&
+ !p->second.empty()) {
+ unsigned start_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool());
+ unsigned pg_num = start_pg_num;
+ for (auto q = p->second.lower_bound(ms.map_epoch);
+ q != p->second.end();
+ ++q) {
+ unsigned new_pg_num = q->second;
+ cout << "pool " << pgid.pgid.pool() << " pg_num " << pg_num
+ << " -> " << new_pg_num << std::endl;
+
+ // check for merge target
+ spg_t target;
+ if (pgid.is_merge_source(pg_num, new_pg_num, &target)) {
+		// FIXME: this check assumes the OSD's PG is at the OSD's
+		// map epoch; it could be, say, at *our* epoch, pre-merge.
+ coll_t coll(target);
+ if (store->collection_exists(coll)) {
+ cerr << "pgid " << pgid << " merges to target " << target
+ << " which already exists" << std::endl;
+ return 12;
+ }
+ }
+
+ // check for split children
+ set<spg_t> children;
+ if (pgid.is_split(start_pg_num, new_pg_num, &children)) {
+ cerr << " children are " << children << std::endl;
+ for (auto child : children) {
+ coll_t coll(child);
+ if (store->collection_exists(coll)) {
+		  cerr << "pgid " << pgid << " splits to " << children
+		       << " and child " << child << " already exists" << std::endl;
+ return 12;
+ }
+ }
+ }
+ pg_num = new_pg_num;
+ }
+ }
+ } else {
+	cout << "pool " << pgid.pgid.pool() << " doesn't exist, not checking"
+	     << " for splits or merges" << std::endl;
+ }
+
+ if (!dry_run) {
+ ObjectStore::Transaction t;
+ ch = store->create_new_collection(coll);
+ PG::_create(
+ t, pgid,
+ pgid.get_split_bits(ms.osdmap.get_pg_pool(pgid.pool())->get_pg_num()));
+ PG::_init(t, pgid, NULL);
+
+ // mark this coll for removal until we're done
+ map<string,bufferlist> values;
+ encode((char)1, values["_remove"]);
+ t.omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
+
+ store->queue_transaction(ch, std::move(t));
+ }
+
+ break;
+ case TYPE_PG_END:
+ ceph_assert(found_metadata);
+ done = true;
+ break;
+ default:
+ cerr << "Unknown section type " << std::to_string(type) << std::endl;
+ return -EFAULT;
+ }
+ }
+
+ if (!found_metadata) {
+ cerr << "Missing metadata section" << std::endl;
+ return -EFAULT;
+ }
+
+ ObjectStore::Transaction t;
+ if (!dry_run) {
+ pg_log_t newlog, reject;
+ pg_log_t::filter_log(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace,
+ ms.log, newlog, reject);
+ if (debug) {
+ for (list<pg_log_entry_t>::iterator i = newlog.log.begin();
+ i != newlog.log.end(); ++i)
+ cerr << "Keeping log entry " << *i << std::endl;
+ for (list<pg_log_entry_t>::iterator i = reject.log.begin();
+ i != reject.log.end(); ++i)
+ cerr << "Skipping log entry " << *i << std::endl;
+ }
+
+ divergent_priors_t newdp, rejectdp;
+ filter_divergent_priors(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace,
+ ms.divergent_priors, newdp, rejectdp);
+ ms.divergent_priors = newdp;
+ if (debug) {
+ for (divergent_priors_t::iterator i = newdp.begin();
+ i != newdp.end(); ++i)
+ cerr << "Keeping divergent_prior " << *i << std::endl;
+ for (divergent_priors_t::iterator i = rejectdp.begin();
+ i != rejectdp.end(); ++i)
+ cerr << "Skipping divergent_prior " << *i << std::endl;
+ }
+
+ ms.missing.filter_objects([&](const hobject_t &obj) {
+ if (obj.nspace == g_ceph_context->_conf->osd_hit_set_namespace)
+ return false;
+ ceph_assert(!obj.is_temp());
+ object_t oid = obj.oid;
+ object_locator_t loc(obj);
+ pg_t raw_pgid = ms.osdmap.object_locator_to_pg(oid, loc);
+ pg_t _pgid = ms.osdmap.raw_pg_to_pg(raw_pgid);
+
+ return pgid.pgid != _pgid;
+ });
+
+
+ if (debug) {
+ pg_missing_t missing;
+ Formatter *formatter = Formatter::create("json-pretty");
+ dump_log(formatter, cerr, newlog, ms.missing);
+ delete formatter;
+ }
+
+    // Just like a split, invalidate stats since the object count has changed
+ if (skipped_objects)
+ ms.info.stats.stats_invalid = true;
+
+ ret = write_pg(
+ t,
+ ms.map_epoch,
+ ms.info,
+ newlog,
+ ms.past_intervals,
+ ms.divergent_priors,
+ ms.missing);
+ if (ret) return ret;
+ }
+
+ // done, clear removal flag
+ if (debug)
+ cerr << "done, clearing removal flag" << std::endl;
+
+ if (!dry_run) {
+ set<string> remove;
+ remove.insert("_remove");
+ t.omap_rmkeys(coll, pgid.make_pgmeta_oid(), remove);
+ wait_until_done(&t, [&] {
+ store->queue_transaction(ch, std::move(t));
+ // make sure we flush onreadable items before mapper/driver are destroyed.
+ ch->flush();
+ });
+ }
+ return 0;
+}
+
+int do_list(ObjectStore *store, string pgidstr, string object, boost::optional<std::string> nspace,
+ Formatter *formatter, bool debug, bool human_readable, bool head)
+{
+ int r;
+ lookup_ghobject lookup(object, nspace, head);
+ if (pgidstr.length() > 0) {
+ r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug);
+ } else {
+ r = action_on_all_objects(store, lookup, debug);
+ }
+ if (r)
+ return r;
+ lookup.dump(formatter, human_readable);
+ formatter->flush(cout);
+ return 0;
+}
+
+int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable)
+{
+ int r;
+ boost::optional<std::string> nspace; // Not specified
+ lookup_ghobject lookup(object, nspace);
+ r = action_on_all_objects_in_exact_pg(store, coll_t::meta(), lookup, debug);
+ if (r)
+ return r;
+ lookup.dump(formatter, human_readable);
+ formatter->flush(cout);
+ return 0;
+}
+
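+// Selects what remove_object() actually removes: the object and its
+// SnapMapper entry (BOTH), only the SnapMapper entry (SNAPMAP), or only the
+// object while leaving the mapping behind (NOSNAPMAP) -- the latter two are
+// testing aids exposed via --rmtype.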
+enum rmtype {
+ BOTH,
+ SNAPMAP,
+ NOSNAPMAP
+};
+
+int remove_object(coll_t coll, ghobject_t &ghobj,
+ SnapMapper &mapper,
+ MapCacher::Transaction<std::string, bufferlist> *_t,
+ ObjectStore::Transaction *t,
+ enum rmtype type)
+{
+ if (type == BOTH || type == SNAPMAP) {
+ int r = mapper.remove_oid(ghobj.hobj, _t);
+ if (r < 0 && r != -ENOENT) {
+ cerr << "remove_oid returned " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (type == BOTH || type == NOSNAPMAP) {
+ t->remove(coll, ghobj);
+ }
+ return 0;
+}
+
+int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent);
+
+int do_remove_object(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, bool all, bool force, enum rmtype type)
+{
+ auto ch = store->open_collection(coll);
+ spg_t pg;
+ coll.is_pg_prefix(&pg);
+ OSDriver driver(
+ store,
+ coll_t(),
+ OSD::make_snapmapper_oid());
+ SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pg.shard);
+ struct stat st;
+
+ int r = store->stat(ch, ghobj, &st);
+ if (r < 0) {
+ cerr << "remove: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ SnapSet ss;
+ if (ghobj.hobj.has_snapset()) {
+ r = get_snapset(store, coll, ghobj, ss, false);
+ if (r < 0) {
+ cerr << "Can't get snapset error " << cpp_strerror(r) << std::endl;
+      // If --force and the snapset is bad, let them remove the head
+ if (!(force && !all))
+ return r;
+ }
+ if (!ss.snaps.empty() && !all) {
+ if (force) {
+ cout << "WARNING: only removing "
+ << (ghobj.hobj.is_head() ? "head" : "snapdir")
+ << " with snapshots present" << std::endl;
+ ss.snaps.clear();
+ } else {
+ cerr << "Snapshots are present, use removeall to delete everything" << std::endl;
+ return -EINVAL;
+ }
+ }
+ }
+
+ ObjectStore::Transaction t;
+ OSDriver::OSTransaction _t(driver.get_transaction(&t));
+
+ ghobject_t snapobj = ghobj;
+ for (vector<snapid_t>::iterator i = ss.snaps.begin() ;
+ i != ss.snaps.end() ; ++i) {
+ snapobj.hobj.snap = *i;
+ cout << "remove " << snapobj << std::endl;
+ if (!dry_run) {
+ r = remove_object(coll, snapobj, mapper, &_t, &t, type);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ cout << "remove " << ghobj << std::endl;
+
+ if (!dry_run) {
+ r = remove_object(coll, ghobj, mapper, &_t, &t, type);
+ if (r < 0)
+ return r;
+ }
+
+ if (!dry_run) {
+ wait_until_done(&t, [&] {
+ store->queue_transaction(ch, std::move(t));
+ ch->flush();
+ });
+ }
+ return 0;
+}
+
+int do_list_attrs(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+{
+ auto ch = store->open_collection(coll);
+ map<string,bufferptr> aset;
+ int r = store->getattrs(ch, ghobj, aset);
+ if (r < 0) {
+ cerr << "getattrs: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ for (map<string,bufferptr>::iterator i = aset.begin();i != aset.end(); ++i) {
+ string key(i->first);
+ if (outistty)
+ key = cleanbin(key);
+ cout << key << std::endl;
+ }
+ return 0;
+}
+
+int do_list_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+{
+ auto ch = store->open_collection(coll);
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, ghobj);
+ if (!iter) {
+ cerr << "omap_get_iterator: " << cpp_strerror(ENOENT) << std::endl;
+ return -ENOENT;
+ }
+ iter->seek_to_first();
+ map<string, bufferlist> oset;
+ while(iter->valid()) {
+ get_omap_batch(iter, oset);
+
+ for (map<string,bufferlist>::iterator i = oset.begin();i != oset.end(); ++i) {
+ string key(i->first);
+ if (outistty)
+ key = cleanbin(key);
+ cout << key << std::endl;
+ }
+ }
+ return 0;
+}
+
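+// Stream an object's data to fd, reading it from the store in chunks of at
+// most max_read bytes.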
+int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd)
+{
+ auto ch = store->open_collection(coll);
+ struct stat st;
+ mysize_t total;
+
+ int ret = store->stat(ch, ghobj, &st);
+ if (ret < 0) {
+ cerr << "get-bytes: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+
+ total = st.st_size;
+ if (debug)
+ cerr << "size=" << total << std::endl;
+
+ uint64_t offset = 0;
+ bufferlist rawdatabl;
+ while(total > 0) {
+ rawdatabl.clear();
+ mysize_t len = max_read;
+ if (len > total)
+ len = total;
+
+ ret = store->read(ch, ghobj, offset, len, rawdatabl);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -EINVAL;
+
+ if (debug)
+ cerr << "data section offset=" << offset << " len=" << len << std::endl;
+
+ total -= ret;
+ offset += ret;
+
+ ret = write(fd, rawdatabl.c_str(), ret);
+ if (ret == -1) {
+ perror("write");
+ return -errno;
+ }
+ }
+
+ return 0;
+}
+
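+// Replace an object's data with the contents of fd: touch and truncate to
+// zero, then append chunks read from fd, all in a single transaction.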
+int do_set_bytes(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, int fd)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (debug)
+ cerr << "Write " << ghobj << std::endl;
+
+ if (!dry_run) {
+ t->touch(coll, ghobj);
+ t->truncate(coll, ghobj, 0);
+ }
+
+ uint64_t offset = 0;
+ bufferlist rawdatabl;
+ do {
+ rawdatabl.clear();
+ ssize_t bytes = rawdatabl.read_fd(fd, max_read);
+ if (bytes < 0) {
+ cerr << "read_fd error " << cpp_strerror(bytes) << std::endl;
+ return bytes;
+ }
+
+ if (bytes == 0)
+ break;
+
+ if (debug)
+ cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl;
+ if (!dry_run)
+ t->write(coll, ghobj, offset, bytes, rawdatabl);
+
+ offset += bytes;
+    // XXX: Should we queue_transaction() every once in a while for very large files?
+ } while(true);
+
+ auto ch = store->open_collection(coll);
+ if (!dry_run)
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
+{
+ auto ch = store->open_collection(coll);
+ bufferptr bp;
+
+ int r = store->getattr(ch, ghobj, key.c_str(), bp);
+ if (r < 0) {
+ cerr << "getattr: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ string value(bp.c_str(), bp.length());
+ if (outistty) {
+ value = cleanbin(value);
+ value.push_back('\n');
+ }
+ cout << value;
+
+ return 0;
+}
+
+int do_set_attr(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, string key, int fd)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ bufferlist bl;
+
+ if (debug)
+ cerr << "Setattr " << ghobj << std::endl;
+
+ int ret = get_fd_data(fd, bl);
+ if (ret < 0)
+ return ret;
+
+ if (dry_run)
+ return 0;
+
+ t->touch(coll, ghobj);
+
+ t->setattr(coll, ghobj, key, bl);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_rm_attr(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, string key)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (debug)
+ cerr << "Rmattr " << ghobj << std::endl;
+
+ if (dry_run)
+ return 0;
+
+ t->rmattr(coll, ghobj, key);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key)
+{
+ auto ch = store->open_collection(coll);
+ set<string> keys;
+ map<string, bufferlist> out;
+
+ keys.insert(key);
+
+ int r = store->omap_get_values(ch, ghobj, keys, &out);
+ if (r < 0) {
+ cerr << "omap_get_values: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (out.empty()) {
+ cerr << "Key not found" << std::endl;
+ return -ENOENT;
+ }
+
+ ceph_assert(out.size() == 1);
+
+ bufferlist bl = out.begin()->second;
+ string value(bl.c_str(), bl.length());
+ if (outistty) {
+ value = cleanbin(value);
+ value.push_back('\n');
+ }
+ cout << value;
+
+ return 0;
+}
+
+int do_set_omap(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, string key, int fd)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ map<string, bufferlist> attrset;
+ bufferlist valbl;
+
+ if (debug)
+ cerr << "Set_omap " << ghobj << std::endl;
+
+ int ret = get_fd_data(fd, valbl);
+ if (ret < 0)
+ return ret;
+
+ attrset.insert(pair<string, bufferlist>(key, valbl));
+
+ if (dry_run)
+ return 0;
+
+ t->touch(coll, ghobj);
+
+ t->omap_setkeys(coll, ghobj, attrset);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_rm_omap(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, string key)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ set<string> keys;
+
+ keys.insert(key);
+
+ if (debug)
+ cerr << "Rm_omap " << ghobj << std::endl;
+
+ if (dry_run)
+ return 0;
+
+ t->omap_rmkeys(coll, ghobj, keys);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj)
+{
+ auto ch = store->open_collection(coll);
+ bufferlist hdrbl;
+
+ int r = store->omap_get_header(ch, ghobj, &hdrbl, true);
+ if (r < 0) {
+ cerr << "omap_get_header: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ string header(hdrbl.c_str(), hdrbl.length());
+ if (outistty) {
+ header = cleanbin(header);
+ header.push_back('\n');
+ }
+ cout << header;
+
+ return 0;
+}
+
+int do_set_omaphdr(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, int fd)
+{
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+ bufferlist hdrbl;
+
+ if (debug)
+ cerr << "Omap_setheader " << ghobj << std::endl;
+
+ int ret = get_fd_data(fd, hdrbl);
+ if (ret)
+ return ret;
+
+ if (dry_run)
+ return 0;
+
+ t->touch(coll, ghobj);
+
+ t->omap_setheader(coll, ghobj, hdrbl);
+
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(*t));
+ return 0;
+}
+
+struct do_fix_lost : public action_on_object_t {
+ void call(ObjectStore *store, coll_t coll,
+ ghobject_t &ghobj, object_info_t &oi) override {
+ if (oi.is_lost()) {
+ cout << coll << "/" << ghobj << " is lost";
+ if (!dry_run)
+ cout << ", fixing";
+ cout << std::endl;
+ if (dry_run)
+ return;
+ oi.clear_flag(object_info_t::FLAG_LOST);
+ bufferlist bl;
+ encode(oi, bl, -1); /* fixme: using full features */
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, OI_ATTR, bl);
+ auto ch = store->open_collection(coll);
+ store->queue_transaction(ch, std::move(t));
+ }
+ return;
+ }
+};
+
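+// Fetch and decode the object's SnapSet from its SS_ATTR xattr; when silent
+// is set, getattr failures are not reported to stderr.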
+int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false)
+{
+ auto ch = store->open_collection(coll);
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, SS_ATTR, attr);
+ if (r < 0) {
+ if (!silent)
+ cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ auto bp = attr.cbegin();
+ try {
+ decode(ss, bp);
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
+{
+ auto ch = store->open_collection(coll);
+ int r = 0;
+ formatter->open_object_section("obj");
+ formatter->open_object_section("id");
+ ghobj.dump(formatter);
+ formatter->close_section();
+
+ bufferlist attr;
+ int gr = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (gr < 0) {
+ r = gr;
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ } else {
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ formatter->open_object_section("info");
+ oi.dump(formatter);
+ formatter->close_section();
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ }
+ struct stat st;
+ int sr = store->stat(ch, ghobj, &st, true);
+ if (sr < 0) {
+ r = sr;
+ cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ } else {
+ formatter->open_object_section("stat");
+ formatter->dump_int("size", st.st_size);
+ formatter->dump_int("blksize", st.st_blksize);
+ formatter->dump_int("blocks", st.st_blocks);
+ formatter->dump_int("nlink", st.st_nlink);
+ formatter->close_section();
+ }
+
+ if (ghobj.hobj.has_snapset()) {
+ SnapSet ss;
+ int snr = get_snapset(store, coll, ghobj, ss);
+ if (snr < 0) {
+ r = snr;
+ } else {
+ formatter->open_object_section("SnapSet");
+ ss.dump(formatter);
+ formatter->close_section();
+ }
+ }
+ bufferlist hattr;
+ gr = store->getattr(ch, ghobj, ECUtil::get_hinfo_key(), hattr);
+ if (gr == 0) {
+ ECUtil::HashInfo hinfo;
+ auto hp = hattr.cbegin();
+ try {
+ decode(hinfo, hp);
+ formatter->open_object_section("hinfo");
+ hinfo.dump(formatter);
+ formatter->close_section();
+ } catch (...) {
+ r = -EINVAL;
+ cerr << "Error decoding hinfo on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ return r;
+}
+
+int corrupt_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
+{
+ auto ch = store->open_collection(coll);
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+    cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (!dry_run) {
+ attr.clear();
+ oi.alloc_hint_flags += 0xff;
+ ObjectStore::Transaction t;
+ encode(oi, attr, -1); /* fixme: using full features */
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ auto ch = store->open_collection(coll);
+ r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
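+// Set an object's size in the store (truncate) and in its object_info, and
+// for clones also in the head/snapdir SnapSet's clone_size; with corrupt=true
+// the on-disk data is left alone so the recorded sizes intentionally disagree.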
+int set_size(
+ ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter,
+ bool corrupt)
+{
+ auto ch = store->open_collection(coll);
+ if (ghobj.hobj.is_snapdir()) {
+ cerr << "Can't set the size of a snapdir" << std::endl;
+ return -EINVAL;
+ }
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+    cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ struct stat st;
+ r = store->stat(ch, ghobj, &st, true);
+ if (r < 0) {
+ cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ }
+ ghobject_t head(ghobj);
+ SnapSet ss;
+ bool found_head = true;
+ map<snapid_t, uint64_t>::iterator csi;
+ bool is_snap = ghobj.hobj.is_snap();
+ if (is_snap) {
+ head.hobj = head.hobj.get_head();
+ r = get_snapset(store, coll, head, ss, true);
+ if (r < 0 && r != -ENOENT) {
+ // Requested get_snapset() silent, so if not -ENOENT show error
+ cerr << "Error getting snapset on : " << make_pair(coll, head) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (r == -ENOENT) {
+ head.hobj = head.hobj.get_snapdir();
+ r = get_snapset(store, coll, head, ss);
+ if (r < 0)
+ return r;
+ found_head = false;
+ } else {
+ found_head = true;
+ }
+ csi = ss.clone_size.find(ghobj.hobj.snap);
+ if (csi == ss.clone_size.end()) {
+ cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl;
+ return -EINVAL;
+ }
+ }
+ if ((uint64_t)st.st_size == setsize && oi.size == setsize
+ && (!is_snap || csi->second == setsize)) {
+ cout << "Size of object is already " << setsize << std::endl;
+ return 0;
+ }
+ cout << "Setting size to " << setsize << ", stat size " << st.st_size
+ << ", obj info size " << oi.size;
+ if (is_snap) {
+ cout << ", " << (found_head ? "head" : "snapdir")
+ << " clone_size " << csi->second;
+ csi->second = setsize;
+ }
+ cout << std::endl;
+ if (!dry_run) {
+ attr.clear();
+ oi.size = setsize;
+ ObjectStore::Transaction t;
+    // When corrupting, skip the truncate so only the object info is modified
+ if (!corrupt && (uint64_t)st.st_size != setsize) {
+ t.truncate(coll, ghobj, setsize);
+ // Changing objectstore size will invalidate data_digest, so clear it.
+ oi.clear_data_digest();
+ }
+ encode(oi, attr, -1); /* fixme: using full features */
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ if (is_snap) {
+ bufferlist snapattr;
+ snapattr.clear();
+ encode(ss, snapattr);
+ t.setattr(coll, head, SS_ATTR, snapattr);
+ }
+ auto ch = store->open_collection(coll);
+ r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+int clear_data_digest(ObjectStore *store, coll_t coll, ghobject_t &ghobj) {
+ auto ch = store->open_collection(coll);
+ bufferlist attr;
+ int r = store->getattr(ch, ghobj, OI_ATTR, attr);
+ if (r < 0) {
+ cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ object_info_t oi;
+ auto bp = attr.cbegin();
+ try {
+ decode(oi, bp);
+ } catch (...) {
+ r = -EINVAL;
+    cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (!dry_run) {
+ attr.clear();
+ oi.clear_data_digest();
+ encode(oi, attr, -1); /* fixme: using full features */
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, OI_ATTR, attr);
+ auto ch = store->open_collection(coll);
+ r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
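+// Deliberately damage parts of an object's SnapSet for testing; which fields
+// are cleared or skewed depends on arg, as described below.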
+int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj,
+ string arg)
+{
+ SnapSet ss;
+ int ret = get_snapset(store, coll, ghobj, ss);
+ if (ret < 0)
+ return ret;
+
+ // Use "corrupt" to clear entire SnapSet
+ // Use "seq" to just corrupt SnapSet.seq
+ if (arg == "corrupt" || arg == "seq")
+ ss.seq = 0;
+ // Use "snaps" to just clear SnapSet.snaps
+ if (arg == "corrupt" || arg == "snaps")
+ ss.snaps.clear();
+  // By default just clear clones, clone_overlap and clone_size
+ if (arg == "corrupt")
+ arg = "";
+ if (arg == "" || arg == "clones")
+ ss.clones.clear();
+ if (arg == "" || arg == "clone_overlap")
+ ss.clone_overlap.clear();
+ if (arg == "" || arg == "clone_size")
+ ss.clone_size.clear();
+ // Break all clone sizes by adding 1
+ if (arg == "size") {
+ for (map<snapid_t, uint64_t>::iterator i = ss.clone_size.begin();
+ i != ss.clone_size.end(); ++i)
+ ++(i->second);
+ }
+
+ if (!dry_run) {
+ bufferlist bl;
+ encode(ss, bl);
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, SS_ATTR, bl);
+ auto ch = store->open_collection(coll);
+ int r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+vector<snapid_t>::iterator find(vector<snapid_t> &v, snapid_t clid)
+{
+ return std::find(v.begin(), v.end(), clid);
+}
+
+map<snapid_t, interval_set<uint64_t> >::iterator
+find(map<snapid_t, interval_set<uint64_t> > &m, snapid_t clid)
+{
+ return m.find(clid);
+}
+
+map<snapid_t, uint64_t>::iterator find(map<snapid_t, uint64_t> &m,
+ snapid_t clid)
+{
+ return m.find(clid);
+}
+
+template<class T>
+int remove_from(T &mv, string name, snapid_t cloneid, bool force)
+{
+ typename T::iterator i = find(mv, cloneid);
+ if (i != mv.end()) {
+ mv.erase(i);
+ } else {
+ cerr << "Clone " << cloneid << " doesn't exist in " << name;
+ if (force) {
+ cerr << " (ignored)" << std::endl;
+ return 0;
+ }
+ cerr << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
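+// Remove a clone's metadata from the SnapSet (clones, clone_overlap,
+// clone_size), first merging its overlap into the next older clone; the clone
+// object's data is left in place.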
+int remove_clone(
+ ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force)
+{
+ // XXX: Don't allow this if in a cache tier or former cache tier
+ // bool allow_incomplete_clones() const {
+ // return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+
+ SnapSet snapset;
+ int ret = get_snapset(store, coll, ghobj, snapset);
+ if (ret < 0)
+ return ret;
+
+ // Derived from trim_object()
+ // ...from snapset
+ vector<snapid_t>::iterator p;
+ for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p)
+ if (*p == cloneid)
+ break;
+ if (p == snapset.clones.end()) {
+    cerr << "Clone " << cloneid << " not present" << std::endl;
+ return -ENOENT;
+ }
+ if (p != snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = ghobj.hobj;
+ prev_coid.snap = *n;
+ //bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+ //if (adjust_prev_bytes)
+ // ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
+
+ snapset.clone_overlap[*n].intersection_of(
+ snapset.clone_overlap[*p]);
+
+ //if (adjust_prev_bytes)
+ // ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
+ }
+
+ ret = remove_from(snapset.clones, "clones", cloneid, force);
+ if (ret) return ret;
+ ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force);
+ if (ret) return ret;
+ ret = remove_from(snapset.clone_size, "clone_size", cloneid, force);
+ if (ret) return ret;
+
+ if (dry_run)
+ return 0;
+
+ bufferlist bl;
+ encode(snapset, bl);
+ ObjectStore::Transaction t;
+ t.setattr(coll, ghobj, SS_ATTR, bl);
+ auto ch = store->open_collection(coll);
+ int r = store->queue_transaction(ch, std::move(t));
+ if (r < 0) {
+ cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ cout << "Removal of clone " << cloneid << " complete" << std::endl;
+  cout << "Use pg repair after the OSD is restarted to correct stat information" << std::endl;
+ return 0;
+}
+
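+// Copy every collection and object (data, xattrs, omap header and keys) from
+// src to dst, along with the keyring and basic OSD metadata; dst must be
+// empty and have the same fsid as src.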
+int dup(string srcpath, ObjectStore *src, string dstpath, ObjectStore *dst)
+{
+ cout << "dup from " << src->get_type() << ": " << srcpath << "\n"
+ << " to " << dst->get_type() << ": " << dstpath
+ << std::endl;
+ int num, i;
+ vector<coll_t> collections;
+ int r;
+
+ r = src->mount();
+ if (r < 0) {
+ cerr << "failed to mount src: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ r = dst->mount();
+ if (r < 0) {
+ cerr << "failed to mount dst: " << cpp_strerror(r) << std::endl;
+ goto out_src;
+ }
+
+ if (src->get_fsid() != dst->get_fsid()) {
+ cerr << "src fsid " << src->get_fsid() << " != dest " << dst->get_fsid()
+ << std::endl;
+ goto out;
+ }
+ cout << "fsid " << src->get_fsid() << std::endl;
+
+ // make sure dst is empty
+ r = dst->list_collections(collections);
+ if (r < 0) {
+ cerr << "error listing collections on dst: " << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+ if (!collections.empty()) {
+ cerr << "destination store is not empty" << std::endl;
+ goto out;
+ }
+
+ r = src->list_collections(collections);
+ if (r < 0) {
+ cerr << "error listing collections on src: " << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+
+ num = collections.size();
+ cout << num << " collections" << std::endl;
+ i = 1;
+ for (auto cid : collections) {
+ cout << i++ << "/" << num << " " << cid << std::endl;
+ auto ch = src->open_collection(cid);
+ auto dch = dst->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ int bits = src->collection_bits(ch);
+ if (bits < 0) {
+ if (src->get_type() == "filestore" && cid.is_meta()) {
+ bits = 0;
+ } else {
+ cerr << "cannot get bit count for collection " << cid << ": "
+ << cpp_strerror(bits) << std::endl;
+ goto out;
+ }
+ }
+ t.create_collection(cid, bits);
+ dst->queue_transaction(dch, std::move(t));
+ }
+
+ ghobject_t pos;
+ uint64_t n = 0;
+ uint64_t bytes = 0, keys = 0;
+ while (true) {
+ vector<ghobject_t> ls;
+ r = src->collection_list(ch, pos, ghobject_t::get_max(), 1000, &ls, &pos);
+ if (r < 0) {
+ cerr << "collection_list on " << cid << " from " << pos << " got: "
+ << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+ if (ls.empty()) {
+ break;
+ }
+
+ for (auto& oid : ls) {
+ //cout << " " << cid << " " << oid << std::endl;
+ if (n % 100 == 0) {
+ cout << " " << std::setw(16) << n << " objects, "
+ << std::setw(16) << bytes << " bytes, "
+ << std::setw(16) << keys << " keys"
+ << std::setw(1) << "\r" << std::flush;
+ }
+ n++;
+
+ ObjectStore::Transaction t;
+ t.touch(cid, oid);
+
+ map<string,bufferptr> attrs;
+ src->getattrs(ch, oid, attrs);
+ if (!attrs.empty()) {
+ t.setattrs(cid, oid, attrs);
+ }
+
+ bufferlist bl;
+ src->read(ch, oid, 0, 0, bl);
+ if (bl.length()) {
+ t.write(cid, oid, 0, bl.length(), bl);
+ bytes += bl.length();
+ }
+
+ bufferlist header;
+ map<string,bufferlist> omap;
+ src->omap_get(ch, oid, &header, &omap);
+ if (header.length()) {
+ t.omap_setheader(cid, oid, header);
+ ++keys;
+ }
+ if (!omap.empty()) {
+ keys += omap.size();
+ t.omap_setkeys(cid, oid, omap);
+ }
+
+ dst->queue_transaction(dch, std::move(t));
+ }
+ }
+ cout << " " << std::setw(16) << n << " objects, "
+ << std::setw(16) << bytes << " bytes, "
+ << std::setw(16) << keys << " keys"
+ << std::setw(1) << std::endl;
+ }
+
+ // keyring
+ cout << "keyring" << std::endl;
+ {
+ bufferlist bl;
+ string s = srcpath + "/keyring";
+ string err;
+ r = bl.read_file(s.c_str(), &err);
+ if (r < 0) {
+ cerr << "failed to copy " << s << ": " << err << std::endl;
+ } else {
+ string d = dstpath + "/keyring";
+ bl.write_file(d.c_str(), 0600);
+ }
+ }
+
+ // osd metadata
+ cout << "duping osd metadata" << std::endl;
+ {
+ for (auto k : {"magic", "whoami", "ceph_fsid", "fsid"}) {
+ string val;
+ src->read_meta(k, &val);
+ dst->write_meta(k, val);
+ }
+ }
+
+ dst->write_meta("ready", "ready");
+
+ cout << "done." << std::endl;
+ r = 0;
+ out:
+ dst->umount();
+ out_src:
+ src->umount();
+ return r;
+}
+
+void usage(po::options_description &desc)
+{
+ cerr << std::endl;
+ cerr << desc << std::endl;
+ cerr << std::endl;
+ cerr << "Positional syntax:" << std::endl;
+ cerr << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> (get|set)-bytes [file]" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-(attr|omap) <key> [file]" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> (get|rm)-(attr|omap) <key>" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> get-omaphdr" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-omaphdr [file]" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> remove|removeall" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> dump" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> set-size" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> clear-data-digest" << std::endl;
+ cerr << "ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>" << std::endl;
+ cerr << std::endl;
+ cerr << "<object> can be a JSON object description as displayed" << std::endl;
+ cerr << "by --op list." << std::endl;
+ cerr << "<object> can be an object name which will be looked up in all" << std::endl;
+ cerr << "the OSD's PGs." << std::endl;
+ cerr << "<object> can be the empty string ('') which with a provided pgid " << std::endl;
+ cerr << "specifies the pgmeta object" << std::endl;
+ cerr << std::endl;
+  cerr << "The optional [file] argument reads from stdin or writes to stdout" << std::endl;
+  cerr << "if not specified or if '-' is specified." << std::endl;
+}
+
+bool ends_with(const string& check, const string& ending)
+{
+ return check.size() >= ending.size() && check.rfind(ending) == (check.size() - ending.size());
+}
+
+// Based on FileStore::dump_journal(); sets up just enough to dump the journal
+int mydump_journal(Formatter *f, string journalpath, bool m_journal_dio)
+{
+ int r;
+
+ if (!journalpath.length())
+ return -EINVAL;
+
+ FileJournal *journal = new FileJournal(g_ceph_context, uuid_d(), NULL, NULL,
+ journalpath.c_str(), m_journal_dio);
+ r = journal->_fdump(*f, false);
+ delete journal;
+ return r;
+}
+
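+// FileStore-specific: apply the configured directory layout settings to the
+// collections of the selected pool or PG; other backends return success
+// without doing anything.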
+int apply_layout_settings(ObjectStore *os, const OSDSuperblock &superblock,
+ const string &pool_name, const spg_t &pgid, bool dry_run,
+ int target_level)
+{
+ int r = 0;
+
+ FileStore *fs = dynamic_cast<FileStore*>(os);
+ if (!fs) {
+ cerr << "Nothing to do for non-filestore backend" << std::endl;
+ return 0; // making this return success makes testing easier
+ }
+
+ OSDMap curmap;
+ bufferlist bl;
+ r = get_osdmap(os, superblock.current_epoch, curmap, bl);
+ if (r) {
+ cerr << "Can't find local OSDMap: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ int64_t poolid = -1;
+ if (pool_name.length()) {
+ poolid = curmap.lookup_pg_pool_name(pool_name);
+ if (poolid < 0) {
+ cerr << "Couldn't find pool " << pool_name << ": " << cpp_strerror(poolid)
+ << std::endl;
+ return poolid;
+ }
+ }
+
+ vector<coll_t> collections, filtered_colls;
+ r = os->list_collections(collections);
+ if (r < 0) {
+ cerr << "Error listing collections: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ for (auto const &coll : collections) {
+ spg_t coll_pgid;
+ if (coll.is_pg(&coll_pgid) &&
+ ((poolid >= 0 && coll_pgid.pool() == (uint64_t)poolid) ||
+ coll_pgid == pgid)) {
+ filtered_colls.push_back(coll);
+ }
+ }
+
+ size_t done = 0, total = filtered_colls.size();
+ for (auto const &coll : filtered_colls) {
+ if (dry_run) {
+ cerr << "Would apply layout settings to " << coll << std::endl;
+ } else {
+ cerr << "Finished " << done << "/" << total << " collections" << "\r";
+ r = fs->apply_layout_settings(coll, target_level);
+ if (r < 0) {
+ cerr << "Error applying layout settings to " << coll << std::endl;
+ return r;
+ }
+ }
+ ++done;
+ }
+
+ cerr << "Finished " << total << "/" << total << " collections" << "\r" << std::endl;
+ return r;
+}
+
+int main(int argc, char **argv)
+{
+ string dpath, jpath, pgidstr, op, file, mountpoint, mon_store_path, object;
+ string target_data_path, fsid;
+ string objcmd, arg1, arg2, type, format, argnspace, pool, rmtypestr;
+ boost::optional<std::string> nspace;
+ spg_t pgid;
+ unsigned epoch = 0;
+ ghobject_t ghobj;
+ bool human_readable;
+ Formatter *formatter;
+ bool head;
+
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("help", "produce help message")
+ ("type", po::value<string>(&type),
+ "Arg is one of [bluestore (default), filestore, memstore]")
+ ("data-path", po::value<string>(&dpath),
+ "path to object store, mandatory")
+ ("journal-path", po::value<string>(&jpath),
+ "path to journal, use if tool can't find it")
+ ("pgid", po::value<string>(&pgidstr),
+ "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, and mandatory for apply-layout-settings if --pool is not specified")
+ ("pool", po::value<string>(&pool),
+ "Pool name, mandatory for apply-layout-settings if --pgid is not specified")
+ ("op", po::value<string>(&op),
+ "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, statfs]")
+ ("epoch", po::value<unsigned>(&epoch),
+ "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
+ ("file", po::value<string>(&file),
+ "path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap")
+ ("mon-store-path", po::value<string>(&mon_store_path),
+ "path of monstore to update-mon-db")
+ ("fsid", po::value<string>(&fsid),
+ "fsid for new store created by mkfs")
+ ("target-data-path", po::value<string>(&target_data_path),
+ "path of target object store (for --op dup)")
+ ("mountpoint", po::value<string>(&mountpoint),
+ "fuse mountpoint")
+ ("format", po::value<string>(&format)->default_value("json-pretty"),
+ "Output format which may be json, json-pretty, xml, xml-pretty")
+ ("debug", "Enable diagnostic output to stderr")
+ ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE")
+ ("skip-journal-replay", "Disable journal replay")
+ ("skip-mount-omap", "Disable mounting of omap")
+ ("head", "Find head/snapdir when searching for objects by name")
+ ("dry-run", "Don't modify the objectstore")
+ ("namespace", po::value<string>(&argnspace), "Specify namespace when searching for objects")
+ ("rmtype", po::value<string>(&rmtypestr), "Specify corrupting object removal 'snapmap' or 'nosnapmap' - TESTING USE ONLY")
+ ;
+
+ po::options_description positional("Positional options");
+ positional.add_options()
+ ("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json")
+ ("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]")
+ ("arg1", po::value<string>(&arg1), "arg1 based on cmd")
+ ("arg2", po::value<string>(&arg2), "arg2 based on cmd")
+ ;
+
+ po::options_description all;
+ all.add(desc).add(positional);
+
+ po::positional_options_description pd;
+ pd.add("object", 1).add("objcmd", 1).add("arg1", 1).add("arg2", 1);
+
+ vector<string> ceph_option_strings;
+
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).options(all).allow_unregistered().positional(pd).run();
+ po::store( parsed, vm);
+ po::notify(vm);
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+ } catch(po::error &e) {
+ std::cerr << e.what() << std::endl;
+ return 1;
+ }
+
+ if (vm.count("help")) {
+ usage(desc);
+ return 1;
+ }
+
+ // Compatibility with previous option name
+ if (op == "dump-import")
+ op = "dump-export";
+
+ debug = (vm.count("debug") > 0);
+
+ force = (vm.count("force") > 0);
+
+ if (vm.count("namespace"))
+ nspace = argnspace;
+
+ dry_run = (vm.count("dry-run") > 0);
+
+ osflagbits_t flags = 0;
+ if (dry_run || vm.count("skip-journal-replay"))
+ flags |= SKIP_JOURNAL_REPLAY;
+ if (vm.count("skip-mount-omap"))
+ flags |= SKIP_MOUNT_OMAP;
+ if (op == "update-mon-db")
+ flags |= SKIP_JOURNAL_REPLAY;
+
+ head = (vm.count("head") > 0);
+
+ // infer osd id so we can authenticate
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/whoami", dpath.c_str());
+ int fd = ::open(fn, O_RDONLY);
+ if (fd >= 0) {
+ bufferlist bl;
+ bl.read_fd(fd, 64);
+ string s(bl.c_str(), bl.length());
+ int whoami = atoi(s.c_str());
+ vector<string> tmp;
+ // identify ourselves as this osd so we can auth and fetch our configs
+ tmp.push_back("-n");
+ tmp.push_back(string("osd.") + stringify(whoami));
+ // populate osd_data so that the default keyring location works
+ tmp.push_back("--osd-data");
+ tmp.push_back(dpath);
+ tmp.insert(tmp.end(), ceph_option_strings.begin(),
+ ceph_option_strings.end());
+ tmp.swap(ceph_option_strings);
+ }
+
+ vector<const char *> ceph_options;
+ ceph_options.reserve(ceph_options.size() + ceph_option_strings.size());
+ for (vector<string>::iterator i = ceph_option_strings.begin();
+ i != ceph_option_strings.end();
+ ++i) {
+ ceph_options.push_back(i->c_str());
+ }
+
+ snprintf(fn, sizeof(fn), "%s/type", dpath.c_str());
+ fd = ::open(fn, O_RDONLY);
+ if (fd >= 0) {
+ bufferlist bl;
+ bl.read_fd(fd, 64);
+ if (bl.length()) {
+ string dp_type = string(bl.c_str(), bl.length() - 1); // drop \n
+ if (vm.count("type") && dp_type != "" && type != dp_type)
+ cerr << "WARNING: Ignoring type \"" << type << "\" - found data-path type \""
+ << dp_type << "\"" << std::endl;
+ type = dp_type;
+ //cout << "object store type is " << type << std::endl;
+ }
+ ::close(fd);
+ }
+
+ if (!vm.count("type") && type == "") {
+ type = "bluestore";
+ }
+ if (!vm.count("data-path") &&
+ op != "dump-export" &&
+ !(op == "dump-journal" && type == "filestore")) {
+ cerr << "Must provide --data-path" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (type == "filestore" && !vm.count("journal-path")) {
+ jpath = dpath + "/journal";
+ }
+ if (!vm.count("op") && !vm.count("object")) {
+ cerr << "Must provide --op or object command..." << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op != "list" && op != "apply-layout-settings" &&
+ vm.count("op") && vm.count("object")) {
+ cerr << "Can't specify both --op and object command syntax" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op == "apply-layout-settings" && !(vm.count("pool") ^ vm.count("pgid"))) {
+ cerr << "apply-layout-settings requires either --pool or --pgid"
+ << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op != "list" && op != "apply-layout-settings" && vm.count("object") && !vm.count("objcmd")) {
+ cerr << "Invalid syntax, missing command" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ if (op == "fuse" && mountpoint.length() == 0) {
+ cerr << "Missing fuse mountpoint" << std::endl;
+ usage(desc);
+ return 1;
+ }
+ outistty = isatty(STDOUT_FILENO);
+
+ file_fd = fd_none;
+ if ((op == "export" || op == "export-remove" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) {
+ if (!vm.count("file") || file == "-") {
+ if (outistty) {
+ cerr << "stdout is a tty and no --file filename specified" << std::endl;
+ return 1;
+ }
+ file_fd = STDOUT_FILENO;
+ } else {
+ file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
+ }
+ } else if (op == "import" || op == "dump-export" || op == "set-osdmap" || op == "set-inc-osdmap") {
+ if (!vm.count("file") || file == "-") {
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no --file filename specified" << std::endl;
+ return 1;
+ }
+ file_fd = STDIN_FILENO;
+ } else {
+ file_fd = open(file.c_str(), O_RDONLY);
+ }
+ }
+
+ ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run);
+
+ if (vm.count("file") && file_fd == fd_none && !dry_run) {
+ cerr << "--file option only applies to import, dump-export, export, export-remove, "
+ << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl;
+ return 1;
+ }
+
+ if (file_fd != fd_none && file_fd < 0) {
+ string err = string("file: ") + file;
+ perror(err.c_str());
+ return 1;
+ }
+
+ auto cct = global_init(
+ NULL, ceph_options,
+ CEPH_ENTITY_TYPE_OSD,
+ CODE_ENVIRONMENT_UTILITY_NODOUT,
+ 0);
+ common_init_finish(g_ceph_context);
+ if (debug) {
+ g_conf().set_val_or_die("log_to_stderr", "true");
+ g_conf().set_val_or_die("err_to_stderr", "true");
+ }
+ g_conf().apply_changes(nullptr);
+
+ // Special list handling. Treating pretty_format as human readable,
+ // with one object per line and not an enclosing array.
+ human_readable = ends_with(format, "-pretty");
+ if ((op == "list" || op == "meta-list") && human_readable) {
+ // Remove -pretty from end of format which we know is there
+ format = format.substr(0, format.size() - strlen("-pretty"));
+ }
+
+ formatter = Formatter::create(format);
+ if (formatter == NULL) {
+ cerr << "unrecognized format: " << format << std::endl;
+ return 1;
+ }
+
+ // Special handling for filestore journal, so we can dump it without mounting
+ if (op == "dump-journal" && type == "filestore") {
+ int ret = mydump_journal(formatter, jpath, g_conf()->journal_dio);
+ if (ret < 0) {
+ cerr << "journal-path: " << jpath << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ formatter->flush(cout);
+ return 0;
+ }
+
+ if (op == "dump-export") {
+ int ret = tool.dump_export(formatter);
+ if (ret < 0) {
+ cerr << "dump-export: "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ return 0;
+ }
+
+ //Verify that data-path really exists
+ struct stat st;
+ if (::stat(dpath.c_str(), &st) == -1) {
+ string err = string("data-path: ") + dpath;
+ perror(err.c_str());
+ return 1;
+ }
+
+ if (pgidstr.length() && !pgid.parse(pgidstr.c_str())) {
+ cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl;
+ return 1;
+ }
+
+ //Verify that the journal-path really exists
+ if (type == "filestore") {
+ if (::stat(jpath.c_str(), &st) == -1) {
+ string err = string("journal-path: ") + jpath;
+ perror(err.c_str());
+ return 1;
+ }
+ if (S_ISDIR(st.st_mode)) {
+ cerr << "journal-path: " << jpath << ": "
+ << cpp_strerror(EISDIR) << std::endl;
+ return 1;
+ }
+ }
+
+ ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags);
+ if (fs == NULL) {
+ cerr << "Unable to create store of type " << type << std::endl;
+ return 1;
+ }
+
+ if (op == "fsck" || op == "fsck-deep") {
+ int r = fs->fsck(op == "fsck-deep");
+ if (r < 0) {
+ cerr << "fsck failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ if (r > 0) {
+ cerr << "fsck found " << r << " errors" << std::endl;
+ return 1;
+ }
+ cout << "fsck found no errors" << std::endl;
+ return 0;
+ }
+ if (op == "repair" || op == "repair-deep") {
+ int r = fs->repair(op == "repair-deep");
+ if (r < 0) {
+ cerr << "repair failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ if (r > 0) {
+ cerr << "repair found " << r << " errors" << std::endl;
+ return 1;
+ }
+ cout << "repair found no errors" << std::endl;
+ return 0;
+ }
+ if (op == "mkfs") {
+ if (fsid.length()) {
+ uuid_d f;
+ bool r = f.parse(fsid.c_str());
+ if (!r) {
+ cerr << "failed to parse uuid '" << fsid << "'" << std::endl;
+ return 1;
+ }
+ fs->set_fsid(f);
+ }
+ int r = fs->mkfs();
+ if (r < 0) {
+ cerr << "mkfs failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ return 0;
+ }
+ if (op == "dup") {
+ string target_type;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/type", target_data_path.c_str());
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ cerr << "Unable to open " << target_data_path << "/type" << std::endl;
+ exit(1);
+ }
+ bufferlist bl;
+ bl.read_fd(fd, 64);
+ if (bl.length()) {
+ target_type = string(bl.c_str(), bl.length() - 1); // drop \n
+ }
+ ::close(fd);
+ ObjectStore *targetfs = ObjectStore::create(
+ g_ceph_context, target_type,
+ target_data_path, "", 0);
+ if (targetfs == NULL) {
+ cerr << "Unable to open store of type " << target_type << std::endl;
+ return 1;
+ }
+ int r = dup(dpath, fs, target_data_path, targetfs);
+ if (r < 0) {
+ cerr << "dup failed: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ return 0;
+ }
+
+ int ret = fs->mount();
+ if (ret < 0) {
+ if (ret == -EBUSY) {
+ cerr << "OSD has the store locked" << std::endl;
+ } else {
+ cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl;
+ }
+ return 1;
+ }
+
+ if (op == "fuse") {
+#ifdef HAVE_LIBFUSE
+ FuseStore fuse(fs, mountpoint);
+ cout << "mounting fuse at " << mountpoint << " ..." << std::endl;
+ int r = fuse.main();
+ if (r < 0) {
+ cerr << "failed to mount fuse: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+#else
+ cerr << "fuse support not enabled" << std::endl;
+#endif
+ return 0;
+ }
+
+ vector<coll_t> ls;
+ vector<coll_t>::iterator it;
+ CompatSet supported;
+
+#ifdef INTERNAL_TEST
+ supported = get_test_compat_set();
+#else
+ supported = OSD::get_osd_compat_set();
+#endif
+
+ bufferlist bl;
+ OSDSuperblock superblock;
+ auto ch = fs->open_collection(coll_t::meta());
+ bufferlist::const_iterator p;
+ ret = fs->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
+ if (ret < 0) {
+ cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ p = bl.cbegin();
+ decode(superblock, p);
+
+ if (debug) {
+ cerr << "Cluster fsid=" << superblock.cluster_fsid << std::endl;
+ }
+
+ if (debug) {
+ cerr << "Supported features: " << supported << std::endl;
+ cerr << "On-disk features: " << superblock.compat_features << std::endl;
+ }
+ if (supported.compare(superblock.compat_features) == -1) {
+ CompatSet unsupported = supported.unsupported(superblock.compat_features);
+ cerr << "On-disk OSD incompatible features set "
+ << unsupported << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (op == "apply-layout-settings") {
+ int target_level = 0;
+ // Single positional argument with apply-layout-settings
+ // for target_level.
+ if (vm.count("object") && isdigit(object[0])) {
+ target_level = atoi(object.c_str());
+ // This requires --arg1 to be specified since
+ // this is the third positional argument and normally
+ // used with object operations.
+ } else if (vm.count("arg1") && isdigit(arg1[0])) {
+ target_level = atoi(arg1.c_str());
+ }
+ ret = apply_layout_settings(fs, superblock, pool, pgid, dry_run, target_level);
+ goto out;
+ }
+
+ if (op != "list" && vm.count("object")) {
+ // Special case: Create pgmeta_oid if empty string specified
+ // This can't conflict with any actual object names.
+ if (object == "") {
+ ghobj = pgid.make_pgmeta_oid();
+ } else {
+ json_spirit::Value v;
+ try {
+ if (!json_spirit::read(object, v) ||
+ (v.type() != json_spirit::array_type && v.type() != json_spirit::obj_type)) {
+ // Special: Need head/snapdir so set even if user didn't specify
+ if (vm.count("objcmd") && (objcmd == "remove-clone-metadata"))
+ head = true;
+ lookup_ghobject lookup(object, nspace, head);
+ if (pgidstr.length())
+ ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), lookup, debug);
+ else
+ ret = action_on_all_objects(fs, lookup, debug);
+ if (ret) {
+ throw std::runtime_error("Internal error");
+ } else {
+ if (lookup.size() != 1) {
+ stringstream ss;
+ if (lookup.size() == 0)
+ ss << "No object id '" << object << "' found or invalid JSON specified";
+ else
+ ss << "Found " << lookup.size() << " objects with id '" << object
+ << "', please use a JSON spec from --op list instead";
+ throw std::runtime_error(ss.str());
+ }
+ pair<coll_t, ghobject_t> found = lookup.pop();
+ pgidstr = found.first.to_str();
+ pgid.parse(pgidstr.c_str());
+ ghobj = found.second;
+ }
+ } else {
+ stringstream ss;
+ if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) {
+ ss << "Without --pgid the object '" << object
+ << "' must be a JSON array";
+ throw std::runtime_error(ss.str());
+ }
+ if (v.type() == json_spirit::array_type) {
+ json_spirit::Array array = v.get_array();
+ if (array.size() != 2) {
+ ss << "Object '" << object
+ << "' must be a JSON array with 2 elements";
+ throw std::runtime_error(ss.str());
+ }
+ vector<json_spirit::Value>::iterator i = array.begin();
+ ceph_assert(i != array.end());
+ if (i->type() != json_spirit::str_type) {
+ ss << "Object '" << object
+ << "' must be a JSON array with the first element a string";
+ throw std::runtime_error(ss.str());
+ }
+ string object_pgidstr = i->get_str();
+ if (object_pgidstr != "meta") {
+ spg_t object_pgid;
+ object_pgid.parse(object_pgidstr.c_str());
+ if (pgidstr.length() > 0) {
+ if (object_pgid != pgid) {
+ ss << "object '" << object
+ << "' has a pgid different from the --pgid="
+ << pgidstr << " option";
+ throw std::runtime_error(ss.str());
+ }
+ } else {
+ pgidstr = object_pgidstr;
+ pgid = object_pgid;
+ }
+ } else {
+ pgidstr = object_pgidstr;
+ }
+ ++i;
+ v = *i;
+ }
+ try {
+ ghobj.decode(v);
+ } catch (std::runtime_error& e) {
+ ss << "Decode object JSON error: " << e.what();
+ throw std::runtime_error(ss.str());
+ }
+ if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
+ cerr << "Object pool and pgid pool don't match" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ } catch (std::runtime_error& e) {
+ cerr << e.what() << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ }
+
+ // The ops which require --pgid option are checked here and
+ // mentioned in the usage for --pgid.
+ if ((op == "info" || op == "log" || op == "remove" || op == "export"
+ || op == "export-remove" || op == "mark-complete"
+ || op == "reset-last-complete"
+ || op == "trim-pg-log") &&
+ pgidstr.length() == 0) {
+ cerr << "Must provide pgid" << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+
+ if (op == "import") {
+
+ try {
+ ret = tool.do_import(fs, superblock, force, pgidstr);
+ }
+ catch (const buffer::error &e) {
+ cerr << "do_import threw exception error " << e.what() << std::endl;
+ ret = -EFAULT;
+ }
+ if (ret == -EFAULT) {
+ cerr << "Corrupt input for import" << std::endl;
+ }
+ if (ret == 0)
+ cout << "Import successful" << std::endl;
+ goto out;
+ } else if (op == "dump-journal-mount") {
+    // Undocumented feature to dump the journal with the fs mounted.
+    // This doesn't support the format option; it uses
+    // ObjectStore::dump_journal() and mounts the store so that replay runs.
+ ret = fs->dump_journal(cout);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl;
+ } else {
+ cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl;
+ }
+ }
+ goto out;
+ } else if (op == "get-osdmap") {
+ bufferlist bl;
+ OSDMap osdmap;
+ if (epoch == 0) {
+ epoch = superblock.current_epoch;
+ }
+ ret = get_osdmap(fs, epoch, osdmap, bl);
+ if (ret) {
+ cerr << "Failed to get osdmap#" << epoch << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = bl.write_fd(file_fd);
+ if (ret) {
+ cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+ } else {
+ cout << "osdmap#" << epoch << " exported." << std::endl;
+ }
+ goto out;
+ } else if (op == "set-osdmap") {
+ bufferlist bl;
+ ret = get_fd_data(file_fd, bl);
+ if (ret < 0) {
+ cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl;
+ } else {
+ ret = set_osdmap(fs, epoch, bl, force);
+ }
+ goto out;
+ } else if (op == "get-inc-osdmap") {
+ bufferlist bl;
+ if (epoch == 0) {
+ epoch = superblock.current_epoch;
+ }
+ ret = get_inc_osdmap(fs, epoch, bl);
+ if (ret < 0) {
+ cerr << "Failed to get incremental osdmap# " << epoch << ": "
+ << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = bl.write_fd(file_fd);
+ if (ret) {
+ cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl;
+ } else {
+ cout << "inc-osdmap#" << epoch << " exported." << std::endl;
+ }
+ goto out;
+ } else if (op == "set-inc-osdmap") {
+ bufferlist bl;
+ ret = get_fd_data(file_fd, bl);
+ if (ret < 0) {
+ cerr << "Failed to read incremental osdmap " << cpp_strerror(ret) << std::endl;
+ goto out;
+ } else {
+ ret = set_inc_osdmap(fs, epoch, bl, force);
+ }
+ goto out;
+ } else if (op == "update-mon-db") {
+ if (!vm.count("mon-store-path")) {
+ cerr << "Please specify the path to monitor db to update" << std::endl;
+ ret = -EINVAL;
+ } else {
+ ret = update_mon_db(*fs, superblock, dpath + "/keyring", mon_store_path);
+ }
+ goto out;
+ }
+
+ if (op == "remove") {
+ if (!force && !dry_run) {
+ cerr << "Please use export-remove or you must use --force option" << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = initiate_new_remove_pg(fs, pgid);
+ if (ret < 0) {
+ cerr << "PG '" << pgid << "' not found" << std::endl;
+ goto out;
+ }
+ cout << "Remove successful" << std::endl;
+ goto out;
+ }
+
+ if (op == "fix-lost") {
+ boost::scoped_ptr<action_on_object_t> action;
+ action.reset(new do_fix_lost());
+ if (pgidstr.length())
+ ret = action_on_all_objects_in_exact_pg(fs, coll_t(pgid), *action, debug);
+ else
+ ret = action_on_all_objects(fs, *action, debug);
+ goto out;
+ }
+
+ if (op == "list") {
+ ret = do_list(fs, pgidstr, object, nspace, formatter, debug,
+ human_readable, head);
+ if (ret < 0) {
+ cerr << "do_list failed: " << cpp_strerror(ret) << std::endl;
+ }
+ goto out;
+ }
+
+ if (op == "dump-super") {
+ formatter->open_object_section("superblock");
+ superblock.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ goto out;
+ }
+
+ if (op == "statfs") {
+ store_statfs_t statsbuf;
+ ret = fs->statfs(&statsbuf);
+ if (ret < 0) {
+ cerr << "error from statfs: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ formatter->open_object_section("statfs");
+ statsbuf.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ goto out;
+ }
+
+ if (op == "meta-list") {
+ ret = do_meta(fs, object, formatter, debug, human_readable);
+ if (ret < 0) {
+ cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl;
+ }
+ goto out;
+ }
+
+ ret = fs->list_collections(ls);
+ if (ret < 0) {
+ cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ if (debug && op == "list-pgs")
+ cout << "Performing list-pgs operation" << std::endl;
+
+ // Find pg
+ for (it = ls.begin(); it != ls.end(); ++it) {
+ spg_t tmppgid;
+
+ if (pgidstr == "meta") {
+ if (it->to_str() == "meta")
+ break;
+ else
+ continue;
+ }
+
+ if (!it->is_pg(&tmppgid)) {
+ continue;
+ }
+
+ if (it->is_temp(&tmppgid)) {
+ continue;
+ }
+
+ if (op != "list-pgs" && tmppgid != pgid) {
+ continue;
+ }
+
+ if (op != "list-pgs") {
+ //Found!
+ break;
+ }
+
+ cout << tmppgid << std::endl;
+ }
+
+ if (op == "list-pgs") {
+ ret = 0;
+ goto out;
+ }
+
+ // If not an object command nor any of the ops handled below, then output this usage
+ // before complaining about a bad pgid
+ if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log") {
+ cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
+ "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, statfs)"
+ << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ epoch_t map_epoch;
+  // The following code for export, info, and log requires omap or !skip-mount-omap
+ if (it != ls.end()) {
+
+ coll_t coll = *it;
+
+ if (vm.count("objcmd")) {
+ ret = 0;
+ if (objcmd == "remove" || objcmd == "removeall") {
+ bool all = (objcmd == "removeall");
+ enum rmtype type = BOTH;
+ if (rmtypestr == "nosnapmap")
+ type = NOSNAPMAP;
+ else if (rmtypestr == "snapmap")
+ type = SNAPMAP;
+ ret = do_remove_object(fs, coll, ghobj, all, force, type);
+ goto out;
+ } else if (objcmd == "list-attrs") {
+ ret = do_list_attrs(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "list-omap") {
+ ret = do_list_omap(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "get-bytes" || objcmd == "set-bytes") {
+ if (objcmd == "get-bytes") {
+ int fd;
+ if (vm.count("arg1") == 0 || arg1 == "-") {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = open(arg1.c_str(), O_WRONLY|O_TRUNC|O_CREAT|O_EXCL|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_get_bytes(fs, coll, ghobj, fd);
+ if (fd != STDOUT_FILENO)
+ close(fd);
+ } else {
+ int fd;
+ if (vm.count("arg1") == 0 || arg1 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_bytes(fs, coll, ghobj, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ }
+ goto out;
+ } else if (objcmd == "get-attr") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_attr(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "set-attr") {
+        if (vm.count("arg1") == 0) {
+          usage(desc);
+          ret = 1;
+          goto out;
+        }
+
+ int fd;
+ if (vm.count("arg2") == 0 || arg2 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_attr(fs, coll, ghobj, arg1, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ goto out;
+ } else if (objcmd == "rm-attr") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_rm_attr(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "get-omap") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_omap(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "set-omap") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ int fd;
+ if (vm.count("arg2") == 0 || arg2 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_omap(fs, coll, ghobj, arg1, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ goto out;
+ } else if (objcmd == "rm-omap") {
+ if (vm.count("arg1") == 0) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_rm_omap(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "get-omaphdr") {
+ if (vm.count("arg1")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = do_get_omaphdr(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "set-omaphdr") {
+ // Extra arg
+ if (vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ int fd;
+ if (vm.count("arg1") == 0 || arg1 == "-") {
+ // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it.
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin is a tty and no file specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666);
+ if (fd == -1) {
+ cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl;
+ ret = 1;
+ goto out;
+ }
+ }
+ ret = do_set_omaphdr(fs, coll, ghobj, fd);
+ if (fd != STDIN_FILENO)
+ close(fd);
+ goto out;
+ } else if (objcmd == "dump") {
+ // There should not be any other arguments
+ if (vm.count("arg1") || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = print_obj_info(fs, coll, ghobj, formatter);
+ goto out;
+ } else if (objcmd == "corrupt-info") { // Undocumented testing feature
+ // There should not be any other arguments
+ if (vm.count("arg1") || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ ret = corrupt_info(fs, coll, ghobj, formatter);
+ goto out;
+ } else if (objcmd == "set-size" || objcmd == "corrupt-size") {
+ // Undocumented testing feature
+ bool corrupt = (objcmd == "corrupt-size");
+ // Extra arg
+ if (vm.count("arg1") == 0 || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+ cerr << "Invalid size '" << arg1 << "' specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ uint64_t size = atoll(arg1.c_str());
+ ret = set_size(fs, coll, ghobj, size, formatter, corrupt);
+ goto out;
+ } else if (objcmd == "clear-data-digest") {
+ ret = clear_data_digest(fs, coll, ghobj);
+ goto out;
+ } else if (objcmd == "clear-snapset") {
+ // UNDOCUMENTED: For testing zap SnapSet
+ // IGNORE extra args since not in usage anyway
+ if (!ghobj.hobj.has_snapset()) {
+ cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ ret = clear_snapset(fs, coll, ghobj, arg1);
+ goto out;
+ } else if (objcmd == "remove-clone-metadata") {
+ // Extra arg
+ if (vm.count("arg1") == 0 || vm.count("arg2")) {
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+ if (!ghobj.hobj.has_snapset()) {
+ cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+ cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl;
+ ret = 1;
+ goto out;
+ }
+ snapid_t cloneid = atoi(arg1.c_str());
+ ret = remove_clone(fs, coll, ghobj, cloneid, force);
+ goto out;
+ }
+ cerr << "Unknown object command '" << objcmd << "'" << std::endl;
+ usage(desc);
+ ret = 1;
+ goto out;
+ }
+
+ map_epoch = 0;
+ ret = PG::peek_map_epoch(fs, pgid, &map_epoch);
+ if (ret < 0)
+ cerr << "peek_map_epoch reports error" << std::endl;
+ if (debug)
+ cerr << "map_epoch " << map_epoch << std::endl;
+
+ pg_info_t info(pgid);
+ PastIntervals past_intervals;
+ __u8 struct_ver;
+ ret = PG::read_info(fs, pgid, coll, info, past_intervals, struct_ver);
+ if (ret < 0) {
+ cerr << "read_info error " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ if (struct_ver < PG::get_compat_struct_v()) {
+ cerr << "PG is too old to upgrade, use older Ceph version" << std::endl;
+ ret = -EFAULT;
+ goto out;
+ }
+ if (debug)
+ cerr << "struct_v " << (int)struct_ver << std::endl;
+
+ if (op == "export" || op == "export-remove") {
+ ret = tool.do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock, past_intervals);
+ if (ret == 0) {
+ cerr << "Export successful" << std::endl;
+ if (op == "export-remove") {
+ ret = initiate_new_remove_pg(fs, pgid);
+ // Export succeeded, so pgid is there
+ ceph_assert(ret == 0);
+ cerr << "Remove successful" << std::endl;
+ }
+ }
+ } else if (op == "info") {
+ formatter->open_object_section("info");
+ info.dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ } else if (op == "log") {
+ PGLog::IndexedLog log;
+ pg_missing_t missing;
+ ret = get_log(fs, struct_ver, pgid, info, log, missing);
+ if (ret < 0)
+ goto out;
+
+ dump_log(formatter, cout, log, missing);
+ } else if (op == "mark-complete") {
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (struct_ver < PG::get_compat_struct_v()) {
+ cerr << "Can't mark-complete, version mismatch " << (int)struct_ver
+ << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)"
+ << std::endl;
+ ret = 1;
+ goto out;
+ }
+
+ cout << "Marking complete " << std::endl;
+
+ info.last_update = eversion_t(superblock.current_epoch, info.last_update.version + 1);
+ info.last_backfill = hobject_t::get_max();
+ info.last_epoch_started = superblock.current_epoch;
+ info.history.last_epoch_started = superblock.current_epoch;
+ info.history.last_epoch_clean = superblock.current_epoch;
+ past_intervals.clear();
+
+ if (!dry_run) {
+ ret = write_info(*t, map_epoch, info, past_intervals);
+ if (ret != 0)
+ goto out;
+ auto ch = fs->open_collection(coll_t(pgid));
+ fs->queue_transaction(ch, std::move(*t));
+ }
+ cout << "Marking complete succeeded" << std::endl;
+ } else if (op == "trim-pg-log") {
+ ret = do_trim_pg_log(fs, coll, info, pgid,
+ map_epoch, past_intervals);
+ if (ret < 0) {
+ cerr << "Error trimming pg log: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ cout << "Finished trimming pg log" << std::endl;
+ goto out;
+ } else if (op == "reset-last-complete") {
+ if (!force) {
+ std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost "
+ << "certain to lead to permanent data loss unless you know exactly "
+ << "what you are doing. Pass --force to proceed anyway."
+ << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ ObjectStore::Transaction tran;
+ ObjectStore::Transaction *t = &tran;
+
+ if (struct_ver < PG::get_compat_struct_v()) {
+ cerr << "Can't reset-last-complete, version mismatch " << (int)struct_ver
+ << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)"
+ << std::endl;
+ ret = 1;
+ goto out;
+ }
+
+ cout << "Reseting last_complete " << std::endl;
+
+ info.last_complete = info.last_update;
+
+ if (!dry_run) {
+ ret = write_info(*t, map_epoch, info, past_intervals);
+ if (ret != 0)
+ goto out;
+      auto ch = fs->open_collection(coll_t(pgid));
+      fs->queue_transaction(ch, std::move(*t));
+ }
+ cout << "Reseting last_complete succeeded" << std::endl;
+
+ } else {
+ ceph_assert(!"Should have already checked for valid --op");
+ }
+ } else {
+ cerr << "PG '" << pgid << "' not found" << std::endl;
+ ret = -ENOENT;
+ }
+
+out:
+ int r = fs->umount();
+ if (r < 0) {
+ cerr << "umount failed: " << cpp_strerror(r) << std::endl;
+ // If no previous error, then use umount() error
+ if (ret == 0)
+ ret = r;
+ }
+
+ if (dry_run) {
+ // Export output can go to stdout, so put this message on stderr
+ if (op == "export")
+ cerr << "dry-run: Nothing changed" << std::endl;
+ else
+ cout << "dry-run: Nothing changed" << std::endl;
+ }
+
+ if (ret < 0)
+ ret = 1;
+ return ret;
+}
diff --git a/src/tools/ceph_objectstore_tool.h b/src/tools/ceph_objectstore_tool.h
new file mode 100644
index 00000000..aafe886b
--- /dev/null
+++ b/src/tools/ceph_objectstore_tool.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECTSTORE_TOOL_H_
+#define CEPH_OBJECTSTORE_TOOL_H_
+
+#include "RadosDump.h"
+
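+// Driver for import/export of PG contents through a file descriptor. Based on
+// how ceph_objectstore_tool.cc uses it, do_import() backs "--op import" and
+// do_export() backs "--op export"/"--op export-remove"; dump_export()
+// presumably serves "--op dump-export" on a previously exported file.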
+class ObjectStoreTool : public RadosDump
+{
+ public:
+ ObjectStoreTool(int file_fd, bool dry_run)
+ : RadosDump(file_fd, dry_run)
+ {}
+
+ int dump_export(Formatter *formatter);
+ int do_import(ObjectStore *store, OSDSuperblock& sb, bool force,
+ std::string pgidstr);
+ int do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
+ pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+ const OSDSuperblock& superblock,
+ PastIntervals &past_intervals);
+ int dump_object(Formatter *formatter,
+ bufferlist &bl);
+ int get_object(
+ ObjectStore *store, OSDriver& driver, SnapMapper& mapper, coll_t coll,
+ bufferlist &bl, OSDMap &curmap, bool *skipped_objects);
+ int export_file(
+ ObjectStore *store, coll_t cid, ghobject_t &obj);
+ int export_files(ObjectStore *store, coll_t coll);
+};
+
+#endif // CEPH_OBJECTSTORE_TOOL_H_
diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc
new file mode 100644
index 00000000..8e15851d
--- /dev/null
+++ b/src/tools/ceph_osdomap_tool.cc
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <stdlib.h>
+#include <string>
+
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "os/filestore/DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+namespace po = boost::program_options;
+
+int main(int argc, char **argv) {
+ po::options_description desc("Allowed options");
+ string store_path, cmd, oid, backend;
+ bool debug = false;
+ desc.add_options()
+ ("help", "produce help message")
+ ("omap-path", po::value<string>(&store_path),
+ "path to omap directory, mandatory (current/omap usually)")
+ ("paranoid", "use paranoid checking")
+ ("debug", "Additional debug output from DBObjectMap")
+ ("oid", po::value<string>(&oid), "Restrict to this object id when dumping objects")
+ ("command", po::value<string>(&cmd),
+ "command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check, dump-headers, repair, compact], mandatory")
+ ("backend", po::value<string>(&backend),
+ "DB backend (default rocksdb)")
+ ;
+ po::positional_options_description p;
+ p.add("command", 1);
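+  // Illustrative invocation (paths hypothetical), with "command" also accepted
+  // positionally:
+  //   ceph-osdomap-tool --omap-path current/omap --command dump-objects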
+
+ vector<string> ceph_option_strings;
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).options(desc).positional(p).allow_unregistered().run();
+ po::store(
+ parsed,
+ vm);
+ po::notify(vm);
+
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+ } catch(po::error &e) {
+ std::cerr << e.what() << std::endl;
+ return 1;
+ }
+
+ vector<const char *> ceph_options;
+ ceph_options.reserve(ceph_option_strings.size());
+ for (vector<string>::iterator i = ceph_option_strings.begin();
+ i != ceph_option_strings.end();
+ ++i) {
+ ceph_options.push_back(i->c_str());
+ }
+
+ if (vm.count("debug")) debug = true;
+
+ if (vm.count("help")) {
+ std::cerr << desc << std::endl;
+ return 1;
+ }
+
+ auto cct = global_init(
+ NULL, ceph_options, CEPH_ENTITY_TYPE_OSD,
+ CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+ common_init_finish(g_ceph_context);
+ cct->_conf.apply_changes(nullptr);
+ if (debug) {
+ g_conf().set_val_or_die("log_to_stderr", "true");
+ g_conf().set_val_or_die("err_to_stderr", "true");
+ }
+ g_conf().apply_changes(nullptr);
+
+ if (vm.count("omap-path") == 0) {
+ std::cerr << "Required argument --omap-path" << std::endl;
+ return 1;
+ }
+
+ if (vm.count("command") == 0) {
+ std::cerr << "Required argument --command" << std::endl;
+ return 1;
+ }
+
+ if (vm.count("backend") == 0) {
+ backend = "rocksdb";
+ }
+
+ KeyValueDB* store(KeyValueDB::create(g_ceph_context, backend, store_path));
+ if (store == NULL) {
+ std::cerr << "Invalid backend '" << backend << "' specified" << std::endl;
+ return 1;
+ }
+ /*if (vm.count("paranoid")) {
+ std::cerr << "Enabling paranoid checks" << std::endl;
+ store->options.paranoid_checks = true;
+ }*/
+ DBObjectMap omap(cct.get(), store);
+ stringstream out;
+ int r = store->open(out);
+ if (r < 0) {
+ std::cerr << "Store open got: " << cpp_strerror(r) << std::endl;
+ std::cerr << "Output: " << out.str() << std::endl;
+ return r;
+ }
+ // We don't call omap.init() here because it will repair
+ // the DBObjectMap which we might want to examine for diagnostic
+ // reasons. Instead use --command repair.
+
+ omap.get_state();
+ std::cout << "Version: " << (int)omap.state.v << std::endl;
+ std::cout << "Seq: " << omap.state.seq << std::endl;
+ std::cout << "legacy: " << (omap.state.legacy ? "true" : "false") << std::endl;
+
+ if (cmd == "dump-raw-keys") {
+ KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator();
+ for (i->seek_to_first(); i->valid(); i->next()) {
+ std::cout << i->raw_key() << std::endl;
+ }
+ return 0;
+ } else if (cmd == "dump-raw-key-vals") {
+ KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator();
+ for (i->seek_to_first(); i->valid(); i->next()) {
+ std::cout << i->raw_key() << std::endl;
+ i->value().hexdump(std::cout);
+ }
+ return 0;
+ } else if (cmd == "dump-objects") {
+ vector<ghobject_t> objects;
+ r = omap.list_objects(&objects);
+ if (r < 0) {
+ std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (vm.count("oid") != 0 && i->hobj.oid.name != oid)
+ continue;
+ std::cout << *i << std::endl;
+ }
+ return 0;
+ } else if (cmd == "dump-objects-with-keys") {
+ vector<ghobject_t> objects;
+ r = omap.list_objects(&objects);
+ if (r < 0) {
+ std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (vm.count("oid") != 0 && i->hobj.oid.name != oid)
+ continue;
+ std::cout << "Object: " << *i << std::endl;
+ ObjectMap::ObjectMapIterator j = omap.get_iterator(ghobject_t(i->hobj));
+ for (j->seek_to_first(); j->valid(); j->next()) {
+ std::cout << j->key() << std::endl;
+ j->value().hexdump(std::cout);
+ }
+ }
+ return 0;
+ } else if (cmd == "check" || cmd == "repair") {
+ ostringstream ss;
+ bool repair = (cmd == "repair");
+ r = omap.check(ss, repair, true);
+ if (r) {
+ std::cerr << ss.str() << std::endl;
+ if (r > 0) {
+ std::cerr << "check got " << r << " error(s)" << std::endl;
+ return 1;
+ }
+ }
+ std::cout << (repair ? "repair" : "check") << " succeeded" << std::endl;
+ return 0;
+ } else if (cmd == "dump-headers") {
+ vector<DBObjectMap::_Header> headers;
+ r = omap.list_object_headers(&headers);
+ if (r < 0) {
+ std::cerr << "list_object_headers got: " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ for (auto i : headers)
+ std::cout << i << std::endl;
+ return 0;
+ } else if (cmd == "resetv2") {
+ omap.state.v = 2;
+ omap.state.legacy = false;
+ omap.set_state();
+ } else if (cmd == "compact") {
+ omap.compact();
+ return 0;
+ } else {
+ std::cerr << "Did not recognize command " << cmd << std::endl;
+ return 1;
+ }
+}
diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt
new file mode 100644
index 00000000..2cca8dc0
--- /dev/null
+++ b/src/tools/cephfs/CMakeLists.txt
@@ -0,0 +1,49 @@
+set(cephfs_journal_tool_srcs
+ cephfs-journal-tool.cc
+ JournalTool.cc
+ JournalFilter.cc
+ JournalScanner.cc
+ EventOutput.cc
+ Dumper.cc
+ Resetter.cc
+ RoleSelector.cc
+ MDSUtility.cc)
+add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs})
+target_link_libraries(cephfs-journal-tool librados mds osdc global
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+set(cephfs_table_tool_srcs
+ cephfs-table-tool.cc
+ TableTool.cc
+ RoleSelector.cc
+ MDSUtility.cc)
+add_executable(cephfs-table-tool ${cephfs_table_tool_srcs})
+target_link_libraries(cephfs-table-tool librados mds osdc global
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+set(cephfs_data_scan_srcs
+ cephfs-data-scan.cc
+ DataScan.cc
+ RoleSelector.cc
+ PgFiles.cc
+ MDSUtility.cc)
+add_executable(cephfs-data-scan ${cephfs_data_scan_srcs})
+target_link_libraries(cephfs-data-scan librados cephfs mds osdc global
+ cls_cephfs_client
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+install(TARGETS
+ cephfs-journal-tool
+ cephfs-table-tool
+ cephfs-data-scan
+ DESTINATION bin)
+
+option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF)
+if(WITH_CEPHFS_SHELL)
+ if(NOT WITH_PYTHON3)
+ message(SEND_ERROR "Please enable WITH_PYTHON3 for cephfs-shell")
+ endif()
+ set(PYTHON_VERSION 3)
+ include(Distutils)
+ distutils_install_module(cephfs-shell)
+endif()
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
new file mode 100644
index 00000000..8fb670ad
--- /dev/null
+++ b/src/tools/cephfs/DataScan.cc
@@ -0,0 +1,2188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include <fstream>
+#include "include/util.h"
+
+#include "mds/CInode.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+
+#include "PgFiles.h"
+#include "DataScan.h"
+#include "include/compat.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "datascan." << __func__ << ": "
+
+void DataScan::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-data-scan init [--force-init]\n"
+ << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
+ << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
+ << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
+ << " cephfs-data-scan scan_links\n"
+ << "\n"
+ << " --force-corrupt: overrite apparently corrupt structures\n"
+ << " --force-init: write root inodes even if they exist\n"
+ << " --force-pool: use data pool even if it is not in FSMap\n"
+ << " --worker_m: Maximum number of workers\n"
+ << " --worker_n: Worker number, range 0-(worker_m-1)\n"
+ << "\n"
+ << " cephfs-data-scan scan_frags [--force-corrupt]\n"
+ << " cephfs-data-scan cleanup <data pool name>\n"
+ << std::endl;
+
+ generic_client_usage();
+}
+
+bool DataScan::parse_kwarg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i,
+ int *r)
+{
+ if (i + 1 == args.end()) {
+ return false;
+ }
+
+ const std::string arg(*i);
+ const std::string val(*(i + 1));
+
+ if (arg == std::string("--output-dir")) {
+ if (driver != NULL) {
+ derr << "Unexpected --output-dir: output already selected!" << dendl;
+ *r = -EINVAL;
+ return false;
+ }
+ dout(4) << "Using local file output to '" << val << "'" << dendl;
+ driver = new LocalFileDriver(val, data_io);
+ return true;
+ } else if (arg == std::string("--worker_n")) {
+ std::string err;
+ n = strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ std::cerr << "Invalid worker number '" << val << "'" << std::endl;
+ *r = -EINVAL;
+ return false;
+ }
+ return true;
+ } else if (arg == std::string("--worker_m")) {
+ std::string err;
+ m = strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ std::cerr << "Invalid worker count '" << val << "'" << std::endl;
+ *r = -EINVAL;
+ return false;
+ }
+ return true;
+ } else if (arg == std::string("--filter-tag")) {
+ filter_tag = val;
+ dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
+ return true;
+ } else if (arg == std::string("--filesystem")) {
+ std::shared_ptr<const Filesystem> fs;
+ *r = fsmap->parse_filesystem(val, &fs);
+ if (*r != 0) {
+ std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
+ return false;
+ }
+ fscid = fs->fscid;
+ return true;
+ } else if (arg == std::string("--alternate-pool")) {
+ metadata_pool_name = val;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool DataScan::parse_arg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i)
+{
+ const std::string arg(*i);
+ if (arg == "--force-pool") {
+ force_pool = true;
+ return true;
+ } else if (arg == "--force-corrupt") {
+ force_corrupt = true;
+ return true;
+ } else if (arg == "--force-init") {
+ force_init = true;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+int DataScan::main(const std::vector<const char*> &args)
+{
+ // Parse args
+ // ==========
+ if (args.size() < 1) {
+ cerr << "missing position argument" << std::endl;
+ return -EINVAL;
+ }
+
+ // Common RADOS init: open metadata pool
+ // =====================================
+ librados::Rados rados;
+ int r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable" << dendl;
+ return r;
+ }
+
+ std::string const &command = args[0];
+ std::string data_pool_name;
+
+ std::string pg_files_path;
+ std::set<pg_t> pg_files_pgs;
+
+ // Consume any known --key val or --flag arguments
+ for (std::vector<const char *>::const_iterator i = args.begin() + 1;
+ i != args.end(); ++i) {
+ if (parse_kwarg(args, i, &r)) {
+ // Skip the kwarg value field
+ ++i;
+ continue;
+ } else if (r) {
+ return r;
+ }
+
+ if (parse_arg(args, i)) {
+ continue;
+ }
+
+ // Trailing positional argument
+ if (i + 1 == args.end() &&
+ (command == "scan_inodes"
+ || command == "scan_extents"
+ || command == "cleanup")) {
+ data_pool_name = *i;
+ continue;
+ }
+
+ if (command == "pg_files") {
+ if (i == args.begin() + 1) {
+ pg_files_path = *i;
+ continue;
+ } else {
+ pg_t pg;
+ bool parsed = pg.parse(*i);
+ if (!parsed) {
+ std::cerr << "Invalid PG '" << *i << "'" << std::endl;
+ return -EINVAL;
+ } else {
+ pg_files_pgs.insert(pg);
+ continue;
+ }
+ }
+
+ }
+
+ // Fall through: unhandled
+ std::cerr << "Unknown argument '" << *i << "'" << std::endl;
+ return -EINVAL;
+ }
+
+  // If the caller didn't specify a filesystem, pick
+  // one automatically if only one exists
+ if (fscid == FS_CLUSTER_ID_NONE) {
+ if (fsmap->filesystem_count() == 1) {
+ fscid = fsmap->get_filesystem()->fscid;
+ } else {
+ std::cerr << "Specify a filesystem with --filesystem" << std::endl;
+ return -EINVAL;
+ }
+ }
+ auto fs = fsmap->get_filesystem(fscid);
+ ceph_assert(fs != nullptr);
+
+ // Default to output to metadata pool
+ if (driver == NULL) {
+ driver = new MetadataDriver();
+ driver->set_force_corrupt(force_corrupt);
+ driver->set_force_init(force_init);
+ dout(4) << "Using metadata pool output" << dendl;
+ }
+
+ dout(4) << "connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ r = driver->init(rados, metadata_pool_name, fsmap, fscid);
+ if (r < 0) {
+ return r;
+ }
+
+ if (command == "pg_files") {
+ auto pge = PgFiles(objecter, pg_files_pgs);
+ pge.init();
+ return pge.scan_path(pg_files_path);
+ }
+
+ // Initialize data_io for those commands that need it
+ if (command == "scan_inodes" ||
+ command == "scan_extents" ||
+ command == "cleanup") {
+ if (data_pool_name.empty()) {
+ std::cerr << "Data pool not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ data_pool_id = rados.pool_lookup(data_pool_name.c_str());
+ if (data_pool_id < 0) {
+ std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
+ return -ENOENT;
+ } else {
+ dout(4) << "data pool '" << data_pool_name
+ << "' has ID " << data_pool_id << dendl;
+ }
+
+ if (!fs->mds_map.is_data_pool(data_pool_id)) {
+ std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
+ "CephFS data pool!" << std::endl;
+ if (!force_pool) {
+ std::cerr << "Use --force-pool to continue" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
+ r = rados.ioctx_create(data_pool_name.c_str(), data_io);
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ // Initialize metadata_io from MDSMap for scan_frags
+ if (command == "scan_frags" || command == "scan_links") {
+ const auto fs = fsmap->get_filesystem(fscid);
+ if (fs == nullptr) {
+ std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
+ return -ENOENT;
+ }
+ int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+ dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+ int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+ if (r < 0) {
+ std::cerr << "Pool " << metadata_pool_id
+ << " identified in MDS map not found in RADOS!" << std::endl;
+ return r;
+ }
+
+ r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+ if (r != 0) {
+ return r;
+ }
+
+ data_pools = fs->mds_map.get_data_pools();
+ }
+
+ // Finally, dispatch command
+ if (command == "scan_inodes") {
+ return scan_inodes();
+ } else if (command == "scan_extents") {
+ return scan_extents();
+ } else if (command == "scan_frags") {
+ return scan_frags();
+ } else if (command == "scan_links") {
+ return scan_links();
+ } else if (command == "cleanup") {
+ return cleanup();
+ } else if (command == "init") {
+ return driver->init_roots(fs->mds_map.get_first_data_pool());
+ } else {
+ std::cerr << "Unknown command '" << command << "'" << std::endl;
+ return -EINVAL;
+ }
+}
+
+int MetadataDriver::inject_unlinked_inode(
+ inodeno_t inono, int mode, int64_t data_pool_id)
+{
+ const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
+
+ // Skip if exists
+ bool already_exists = false;
+ int r = root_exists(inono, &already_exists);
+ if (r) {
+ return r;
+ }
+ if (already_exists && !force_init) {
+ std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
+ " exists, skipping create. Use --force-init to overwrite"
+ " the existing object." << std::endl;
+ return 0;
+ }
+
+ // Compose
+ InodeStore inode;
+ inode.inode.ino = inono;
+ inode.inode.version = 1;
+ inode.inode.xattr_version = 1;
+ inode.inode.mode = 0500 | mode;
+ // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
+ // (we won't actually give the *correct* dirstat here though)
+ inode.inode.dirstat.nfiles = 1;
+
+ inode.inode.ctime =
+ inode.inode.mtime = ceph_clock_now();
+ inode.inode.nlink = 1;
+ inode.inode.truncate_size = -1ull;
+ inode.inode.truncate_seq = 1;
+ inode.inode.uid = g_conf()->mds_root_ino_uid;
+ inode.inode.gid = g_conf()->mds_root_ino_gid;
+
+ // Force layout to default: should we let users override this so that
+ // they don't have to mount the filesystem to correct it?
+ inode.inode.layout = file_layout_t::get_default();
+ inode.inode.layout.pool_id = data_pool_id;
+ inode.inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+ // Assume that we will get our stats wrong, and that we may
+ // be ignoring dirfrags that exist
+ inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
+
+ if (inono == MDS_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
+ sr_t srnode;
+ srnode.seq = 1;
+ encode(srnode, inode.snap_blob);
+ }
+
+ // Serialize
+ bufferlist inode_bl;
+ encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
+ inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ // Write
+ r = metadata_io.write_full(oid.name, inode_bl);
+ if (r != 0) {
+ derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+}
+
+int MetadataDriver::root_exists(inodeno_t ino, bool *result)
+{
+ object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+ uint64_t size;
+ time_t mtime;
+ int r = metadata_io.stat(oid.name, &size, &mtime);
+ if (r == -ENOENT) {
+ *result = false;
+ return 0;
+ } else if (r < 0) {
+ return r;
+ }
+
+ *result = true;
+ return 0;
+}
+
+int MetadataDriver::init_roots(int64_t data_pool_id)
+{
+ int r = 0;
+ r = inject_unlinked_inode(MDS_INO_ROOT, S_IFDIR|0755, data_pool_id);
+ if (r != 0) {
+ return r;
+ }
+ r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
+ if (r != 0) {
+ return r;
+ }
+ bool created = false;
+ r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
+ if (r != 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::check_roots(bool *result)
+{
+ int r;
+ r = root_exists(MDS_INO_ROOT, result);
+ if (r != 0) {
+ return r;
+ }
+ if (!*result) {
+ return 0;
+ }
+
+ r = root_exists(MDS_INO_MDSDIR(0), result);
+ if (r != 0) {
+ return r;
+ }
+ if (!*result) {
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * Stages:
+ *
+ * SERIAL init
+ * 0. Create root inodes if don't exist
+ * PARALLEL scan_extents
+ * 1. Size and mtime recovery: scan ALL objects, and update 0th
+ * objects with max size and max mtime seen.
+ * PARALLEL scan_inodes
+ * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
+ * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
+ * or rstats at this stage. Inodes without backtraces go into
+ * lost+found
+ * TODO: SERIAL "recover stats"
+ * 3. Dirfrag statistics: depth first traverse into metadata tree,
+ * rebuilding dir sizes.
+ * TODO PARALLEL "clean up"
+ * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
+ * anything onto them) and remove any of the xattrs that we
+ * used for accumulating.
+ */
+
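+/*
+ * Illustrative run order (commands as listed in usage(); the 4-way worker
+ * split is hypothetical):
+ *   cephfs-data-scan init
+ *   cephfs-data-scan scan_extents --worker_n 0 --worker_m 4 <data pool>  # repeat for workers 1..3
+ *   cephfs-data-scan scan_inodes  --worker_n 0 --worker_m 4 <data pool>  # repeat for workers 1..3
+ *   cephfs-data-scan scan_links
+ */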
+
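+// Split a data-pool object name of the form "<inode hex>.<index hex>" (the
+// "%llx.%08llx" naming used further below) into its inode number and object
+// index, e.g. a hypothetical "10000000000.00000001" -> ino 0x10000000000, obj_id 0x1.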
+int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
+{
+ if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
+ return -EINVAL;
+ }
+
+ std::string err;
+ std::string inode_str = oid.substr(0, oid.find("."));
+ *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+
+ std::string pos_string = oid.substr(oid.find(".") + 1);
+ *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+int DataScan::scan_extents()
+{
+ return forall_objects(data_io, false, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ // Read size
+ uint64_t size;
+ time_t mtime;
+ int r = data_io.stat(oid, &size, &mtime);
+ dout(10) << "handling object " << obj_name_ino
+ << "." << obj_name_offset << dendl;
+ if (r != 0) {
+ dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
+ return r;
+ }
+
+ // I need to keep track of
+ // * The highest object ID seen
+ // * The size of the highest object ID seen
+ // * The largest object seen
+ //
+ // Given those things, I can later infer the object chunking
+ // size, the offset of the last object (chunk size * highest ID seen)
+ // and the actual size (offset of last object + size of highest ID seen)
+ //
+ // This logic doesn't take account of striping.
+ r = ClsCephFSClient::accumulate_inode_metadata(
+ data_io,
+ obj_name_ino,
+ obj_name_offset,
+ size,
+ mtime);
+ if (r < 0) {
+ derr << "Failed to accumulate metadata data from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+ });
+}
+
+int DataScan::probe_filter(librados::IoCtx &ioctx)
+{
+ bufferlist filter_bl;
+ ClsCephFSClient::build_tag_filter("test", &filter_bl);
+ librados::ObjectCursor range_i;
+ librados::ObjectCursor range_end;
+
+ std::vector<librados::ObjectItem> tmp_result;
+ librados::ObjectCursor tmp_next;
+ int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
+ 1, filter_bl, &tmp_result, &tmp_next);
+
+ return r >= 0;
+}
+
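+// List this worker's slice (range n of m, from --worker_n/--worker_m) of the
+// objects in `ioctx`, optionally skipping objects already tagged with
+// filter_tag, and call handler(oid, inode, index) for each parseable name.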
+int DataScan::forall_objects(
+ librados::IoCtx &ioctx,
+ bool untagged_only,
+ std::function<int(std::string, uint64_t, uint64_t)> handler
+ )
+{
+ librados::ObjectCursor range_i;
+ librados::ObjectCursor range_end;
+ ioctx.object_list_slice(
+ ioctx.object_list_begin(),
+ ioctx.object_list_end(),
+ n,
+ m,
+ &range_i,
+ &range_end);
+
+
+ bufferlist filter_bl;
+
+ bool legacy_filtering = false;
+ if (untagged_only) {
+ // probe to deal with older OSDs that don't support
+ // the cephfs pgls filtering mode
+ legacy_filtering = !probe_filter(ioctx);
+ if (!legacy_filtering) {
+ ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
+ }
+ }
+
+ int r = 0;
+ while(range_i < range_end) {
+ std::vector<librados::ObjectItem> result;
+ int r = ioctx.object_list(range_i, range_end, 1,
+ filter_bl, &result, &range_i);
+ if (r < 0) {
+ derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto &i : result) {
+ const std::string &oid = i.oid;
+ uint64_t obj_name_ino = 0;
+ uint64_t obj_name_offset = 0;
+ r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+ if (r != 0) {
+ dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+ continue;
+ }
+
+ if (untagged_only && legacy_filtering) {
+ dout(20) << "Applying filter to " << oid << dendl;
+
+ // We are only interested in 0th objects during this phase: we touched
+ // the other objects during scan_extents
+ if (obj_name_offset != 0) {
+ dout(20) << "Non-zeroth object" << dendl;
+ continue;
+ }
+
+ bufferlist scrub_tag_bl;
+ int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
+ if (r >= 0) {
+ std::string read_tag;
+ auto q = scrub_tag_bl.cbegin();
+ try {
+ decode(read_tag, q);
+ if (read_tag == filter_tag) {
+ dout(20) << "skipping " << oid << " because it has the filter_tag"
+ << dendl;
+ continue;
+ }
+ } catch (const buffer::error &err) {
+ }
+ dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
+ } else {
+ dout(20) << "no tag read (" << r << ")" << dendl;
+ }
+
+ } else if (untagged_only) {
+ ceph_assert(obj_name_offset == 0);
+ dout(20) << "OSD matched oid " << oid << dendl;
+ }
+
+ int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
+ if (r == 0 && this_oid_r < 0) {
+ r = this_oid_r;
+ }
+ }
+ }
+
+ return r;
+}
+
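+// Worked example of the size inference below (numbers hypothetical): for an
+// unstriped file whose highest object index is 3, whose largest object is
+// 4 MiB (a power of two, so taken as the chunk size) and whose object #3 holds
+// 1 MiB, the recovered size is 3 * 4 MiB + 1 MiB = 13 MiB; striped layouts
+// instead stat the trailing partial objects to bound the size of the last
+// stripe set.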
+int DataScan::scan_inodes()
+{
+ bool roots_present;
+ int r = driver->check_roots(&roots_present);
+ if (r != 0) {
+ derr << "Unexpected error checking roots: '"
+ << cpp_strerror(r) << "'" << dendl;
+ return r;
+ }
+
+ if (!roots_present) {
+ std::cerr << "Some or all system inodes are absent. Run 'init' from "
+ "one node before running 'scan_inodes'" << std::endl;
+ return -EIO;
+ }
+
+ return forall_objects(data_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+
+ dout(10) << "handling object "
+ << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
+ << dendl;
+
+ AccumulateResult accum_res;
+ inode_backtrace_t backtrace;
+ file_layout_t loaded_layout = file_layout_t::get_default();
+ r = ClsCephFSClient::fetch_inode_accumulate_result(
+ data_io, oid, &backtrace, &loaded_layout, &accum_res);
+
+ if (r == -EINVAL) {
+ dout(4) << "Accumulated metadata missing from '"
+ << oid << ", did you run scan_extents?" << dendl;
+ return r;
+ } else if (r < 0) {
+ dout(4) << "Unexpected error loading accumulated metadata from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ // FIXME: this creates situation where if a client has a corrupt
+ // backtrace/layout, we will fail to inject it. We should (optionally)
+ // proceed if the backtrace/layout is corrupt but we have valid
+ // accumulated metadata.
+ return r;
+ }
+
+ const time_t file_mtime = accum_res.max_mtime;
+ uint64_t file_size = 0;
+ bool have_backtrace = !(backtrace.ancestors.empty());
+
+ // This is the layout we will use for injection, populated either
+ // from loaded_layout or from best guesses
+ file_layout_t guessed_layout;
+ guessed_layout.pool_id = data_pool_id;
+
+ // Calculate file_size, guess the layout
+ if (accum_res.ceiling_obj_index > 0) {
+ uint32_t chunk_size = file_layout_t::get_default().object_size;
+ // When there are multiple objects, the largest object probably
+ // indicates the chunk size. But not necessarily, because files
+ // can be sparse. Only make this assumption if size seen
+ // is a power of two, as chunk sizes typically are.
+ if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
+ chunk_size = accum_res.max_obj_size;
+ }
+
+ if (loaded_layout.pool_id == -1) {
+ // If no stashed layout was found, guess it
+ guessed_layout.object_size = chunk_size;
+ guessed_layout.stripe_unit = chunk_size;
+ guessed_layout.stripe_count = 1;
+ } else if (!loaded_layout.is_valid() ||
+ loaded_layout.object_size < accum_res.max_obj_size) {
+ // If the max size seen exceeds what the stashed layout claims, then
+ // disbelieve it. Guess instead. Same for invalid layouts on disk.
+ dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
+ << std::dec << ", ignoring in favour of best guess" << dendl;
+ guessed_layout.object_size = chunk_size;
+ guessed_layout.stripe_unit = chunk_size;
+ guessed_layout.stripe_count = 1;
+ } else {
+ // We have a stashed layout that we can't disprove, so apply it
+ guessed_layout = loaded_layout;
+ dout(20) << "loaded layout from xattr:"
+ << " os: " << guessed_layout.object_size
+ << " sc: " << guessed_layout.stripe_count
+ << " su: " << guessed_layout.stripe_unit
+ << dendl;
+ // User might have transplanted files from a pool with a different
+ // ID, so whatever the loaded_layout says, we'll force the injected
+ // layout to point to the pool we really read from
+ guessed_layout.pool_id = data_pool_id;
+ }
+
+ if (guessed_layout.stripe_count == 1) {
+ // Unstriped file: simple chunking
+ file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
+ + accum_res.ceiling_obj_size;
+ } else {
+ // Striped file: need to examine the last stripe_count objects
+ // in the file to determine the size.
+
+ // How many complete (i.e. not last stripe) objects?
+ uint64_t complete_objs = 0;
+ if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
+ complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
+ } else {
+ complete_objs = 0;
+ }
+
+        // How many potentially-short (i.e. last stripe set) objects?
+ uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
+
+ dout(10) << "calculating striped size from complete objs: "
+ << complete_objs << ", partial objs: " << partial_objs
+ << dendl;
+
+ // Maximum amount of data that may be in the incomplete objects
+ uint64_t incomplete_size = 0;
+
+ // For each short object, calculate the max file size within it
+ // and accumulate the maximum
+ for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
+ char buf[60];
+ snprintf(buf, sizeof(buf), "%llx.%08llx",
+ (long long unsigned)obj_name_ino, (long long unsigned)i);
+
+ uint64_t osize(0);
+ time_t omtime(0);
+ r = data_io.stat(std::string(buf), &osize, &omtime);
+ if (r == 0) {
+ if (osize > 0) {
+ // Upper bound within this object
+ uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
+ * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
+ + (i % guessed_layout.stripe_count)
+ * guessed_layout.stripe_unit + (osize - 1)
+ % guessed_layout.stripe_unit + 1;
+ incomplete_size = std::max(incomplete_size, upper_size);
+ }
+ } else if (r == -ENOENT) {
+ // Absent object, treat as size 0 and ignore.
+ } else {
+ // Unexpected error, carry r to outer scope for handling.
+ break;
+ }
+ }
+ if (r != 0 && r != -ENOENT) {
+ derr << "Unexpected error checking size of ino 0x" << std::hex
+ << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ file_size = complete_objs * guessed_layout.object_size
+ + incomplete_size;
+ }
+ } else {
+ file_size = accum_res.ceiling_obj_size;
+ if (loaded_layout.pool_id < 0
+ || loaded_layout.object_size < accum_res.max_obj_size) {
+ // No layout loaded, or inconsistent layout, use default
+ guessed_layout = file_layout_t::get_default();
+ guessed_layout.pool_id = data_pool_id;
+ } else {
+ guessed_layout = loaded_layout;
+ }
+ }
+
+    // Sanity-check the backtrace ino against the object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ InodeStore dentry;
+ build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
+
+ // Inject inode to the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
+
+int DataScan::cleanup()
+{
+  // We are only interested in the zeroth objects
+ //
+ return forall_objects(data_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+ r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
+ if (r < 0) {
+ dout(4) << "Error deleting accumulated metadata from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ });
+}
+
+bool DataScan::valid_ino(inodeno_t ino) const
+{
+ return (ino >= inodeno_t((1ull << 40)))
+ || (MDS_INO_IS_STRAY(ino))
+ || (MDS_INO_IS_MDSDIR(ino))
+ || ino == MDS_INO_ROOT
+ || ino == MDS_INO_CEPH;
+}
+
+int DataScan::scan_links()
+{
+ MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
+ if (!metadata_driver) {
+ derr << "Unexpected --output-dir option for scan_links" << dendl;
+ return -EINVAL;
+ }
+
+ interval_set<uint64_t> used_inos;
+ map<inodeno_t, int> remote_links;
+ map<snapid_t, SnapInfo> snaps;
+ snapid_t last_snap = 1;
+ snapid_t snaprealm_v2_since = 2;
+
+ struct link_info_t {
+ inodeno_t dirino;
+ frag_t frag;
+ string name;
+ version_t version;
+ int nlink;
+ bool is_dir;
+ map<snapid_t, SnapInfo> snaps;
+ link_info_t() : version(0), nlink(0), is_dir(false) {}
+ link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::mempool_inode& i) :
+ dirino(di), frag(df), name(n),
+ version(i.version), nlink(i.nlink), is_dir(S_IFDIR & i.mode) {}
+ dirfrag_t dirfrag() const {
+ return dirfrag_t(dirino, frag);
+ }
+ };
+ map<inodeno_t, list<link_info_t> > dup_primaries;
+ map<inodeno_t, link_info_t> bad_nlink_inos;
+ map<inodeno_t, link_info_t> injected_inos;
+
+ map<dirfrag_t, set<string> > to_remove;
+
+ enum {
+ SCAN_INOS = 1,
+ CHECK_LINK,
+ };
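+  // Two passes over every dirfrag object in the metadata pool: SCAN_INOS
+  // records which inode numbers are used by primary dentries and counts remote
+  // links per inode; CHECK_LINK then flags duplicate primaries, bad nlink
+  // counts and remote dentries that point at nonexistent inodes.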
+
+ for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
+ const librados::NObjectIterator it_end = metadata_io.nobjects_end();
+ for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
+ const std::string oid = it->get_oid();
+
+ uint64_t dir_ino = 0;
+ uint64_t frag_id = 0;
+ int r = parse_oid(oid, &dir_ino, &frag_id);
+ if (r == -EINVAL) {
+ dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
+ continue;
+ } else {
+        // parse_oid can only return 0 or -EINVAL
+ ceph_assert(r == 0);
+ }
+
+ if (!valid_ino(dir_ino)) {
+ dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
+ continue;
+ }
+
+ std::map<std::string, bufferlist> items;
+ r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
+ if (r < 0) {
+ derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& p : items) {
+ auto q = p.second.cbegin();
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p.first, dname, last);
+
+ if (last != CEPH_NOSNAP) {
+ if (last > last_snap)
+ last_snap = last;
+ continue;
+ }
+
+ try {
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ if (dnfirst <= CEPH_MAXSNAP) {
+ if (dnfirst - 1 > last_snap)
+ last_snap = dnfirst - 1;
+ }
+ char dentry_type;
+ decode(dentry_type, q);
+ if (dentry_type == 'I') {
+ InodeStore inode;
+ inode.decode_bare(q);
+ inodeno_t ino = inode.inode.ino;
+
+ if (step == SCAN_INOS) {
+ if (used_inos.contains(ino, 1)) {
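+              // operator[] default-constructs the entry; this just marks the
+              // ino as having duplicate primary dentries for the CHECK_LINK pass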
+ dup_primaries[ino].size();
+ } else {
+ used_inos.insert(ino);
+ }
+ } else if (step == CHECK_LINK) {
+ sr_t srnode;
+ if (inode.snap_blob.length()) {
+ auto p = inode.snap_blob.cbegin();
+ decode(srnode, p);
+ for (auto it = srnode.snaps.begin();
+ it != srnode.snaps.end(); ) {
+ if (it->second.ino != ino ||
+ it->second.snapid != it->first) {
+ srnode.snaps.erase(it++);
+ } else {
+ ++it;
+ }
+ }
+ if (!srnode.past_parents.empty()) {
+ snapid_t last = srnode.past_parents.rbegin()->first;
+ if (last + 1 > snaprealm_v2_since)
+ snaprealm_v2_since = last + 1;
+ }
+ }
+ if (!inode.old_inodes.empty()) {
+ if (inode.old_inodes.rbegin()->first > last_snap)
+ last_snap = inode.old_inodes.rbegin()->first;
+ }
+ auto q = dup_primaries.find(ino);
+ if (q != dup_primaries.end()) {
+ q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
+ q->second.back().snaps.swap(srnode.snaps);
+ } else {
+ int nlink = 0;
+ auto r = remote_links.find(ino);
+ if (r != remote_links.end())
+ nlink = r->second;
+ if (!MDS_INO_IS_STRAY(dir_ino))
+ nlink++;
+ if (inode.inode.nlink != nlink) {
+ derr << "Bad nlink on " << ino << " expected " << nlink
+ << " has " << inode.inode.nlink << dendl;
+ bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+ bad_nlink_inos[ino].nlink = nlink;
+ }
+ snaps.insert(make_move_iterator(begin(srnode.snaps)),
+ make_move_iterator(end(srnode.snaps)));
+ }
+ if (dnfirst == CEPH_NOSNAP)
+ injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+ }
+ } else if (dentry_type == 'L') {
+ inodeno_t ino;
+ unsigned char d_type;
+ decode(ino, q);
+ decode(d_type, q);
+
+ if (step == SCAN_INOS) {
+ remote_links[ino]++;
+ } else if (step == CHECK_LINK) {
+ if (!used_inos.contains(ino, 1)) {
+ derr << "Bad remote link dentry 0x" << std::hex << dir_ino
+ << std::dec << "/" << dname
+ << ", ino " << ino << " not found" << dendl;
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+ to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
+ }
+ }
+ } else {
+ derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+ } catch (const buffer::error &err) {
+ derr << "Error decoding dentry 0x" << std::hex << dir_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+ }
+ }
+ }
+
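+ // Work out the highest inode number seen for each MDS rank. Inode numbers
+ // are allocated to ranks in 2^40-sized ranges (rank r owns
+ // [(r+1)<<40, (r+2)<<40), everything below 1<<40 being system inodes), so
+ // the owning rank of an ino is ((ino >> 40) - 1). When a run of used inos
+ // crosses a rank boundary, the lower rank's maximum is pinned to the top of
+ // its range.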
+ map<unsigned, uint64_t> max_ino_map;
+ {
+ auto prev_max_ino = (uint64_t)1 << 40;
+ for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
+ auto cur_max = p.get_start() + p.get_len() - 1;
+ if (cur_max < prev_max_ino)
+ continue; // system inodes
+
+ if ((prev_max_ino >> 40) != (cur_max >> 40)) {
+ unsigned rank = (prev_max_ino >> 40) - 1;
+ max_ino_map[rank] = prev_max_ino;
+ } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
+ unsigned rank = (p.get_start() >> 40) - 1;
+ max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
+ }
+ prev_max_ino = cur_max;
+ }
+ unsigned rank = (prev_max_ino >> 40) - 1;
+ max_ino_map[rank] = prev_max_ino;
+ }
+
+ used_inos.clear();
+
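+ // Resolve duplicate primary dentries: when an inode was found as the
+ // primary of more than one dentry, keep only the newest version (preferring
+ // a non-stray dentry on ties), queue the losers for removal, then re-check
+ // the survivor's nlink against the remote link count gathered earlier.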
+ for (auto& p : dup_primaries) {
+ link_info_t newest;
+ for (auto& q : p.second) {
+ if (q.version > newest.version) {
+ newest = q;
+ } else if (q.version == newest.version &&
+ !MDS_INO_IS_STRAY(q.dirino) &&
+ MDS_INO_IS_STRAY(newest.dirino)) {
+ newest = q;
+ }
+ }
+
+ for (auto& q : p.second) {
+ // in the middle of dir fragmentation?
+ if (newest.dirino == q.dirino && newest.name == q.name) {
+ snaps.insert(make_move_iterator(begin(q.snaps)),
+ make_move_iterator(end(q.snaps)));
+ continue;
+ }
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
+ dn_key.encode(key);
+ to_remove[q.dirfrag()].insert(key);
+ derr << "Remove duplicated ino 0x" << p.first << " from "
+ << q.dirfrag() << "/" << q.name << dendl;
+ }
+
+ int nlink = 0;
+ auto q = remote_links.find(p.first);
+ if (q != remote_links.end())
+ nlink = q->second;
+ if (!MDS_INO_IS_STRAY(newest.dirino))
+ nlink++;
+
+ if (nlink != newest.nlink) {
+ derr << "Bad nlink on " << p.first << " expected " << nlink
+ << " has " << newest.nlink << dendl;
+ bad_nlink_inos[p.first] = newest;
+ bad_nlink_inos[p.first].nlink = nlink;
+ }
+ }
+ dup_primaries.clear();
+ remote_links.clear();
+
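+ // The rebuilt snap table needs to cover every snapid that may still be in
+ // use: advance last_snap past each data pool's snap_seq (presumably so ids
+ // already consumed by pool snapshots are never re-issued) and past the
+ // newest snapid collected from dentries above.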
+ {
+ objecter->with_osdmap([&](const OSDMap& o) {
+ for (auto p : data_pools) {
+ const pg_pool_t *pi = o.get_pg_pool(p);
+ if (!pi)
+ continue;
+ if (pi->snap_seq > last_snap)
+ last_snap = pi->snap_seq;
+ }
+ });
+
+ if (!snaps.empty()) {
+ if (snaps.rbegin()->first > last_snap)
+ last_snap = snaps.rbegin()->first;
+ }
+ }
+
+ for (auto& p : to_remove) {
+ object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
+
+ int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
+ if (r != 0) {
+ derr << "Error removing duplicated dentries from " << p.first << dendl;
+ return r;
+ }
+ }
+ to_remove.clear();
+
+ for (auto &p : bad_nlink_inos) {
+ InodeStore inode;
+ snapid_t first;
+ int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+ if (r < 0) {
+ derr << "Unexpected error reading dentry "
+ << p.second.dirfrag() << "/" << p.second.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (inode.inode.ino != p.first || inode.inode.version != p.second.version)
+ continue;
+
+ inode.inode.nlink = p.second.nlink;
+ r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+ if (r < 0)
+ return r;
+ }
+
+ for (auto &p : injected_inos) {
+ InodeStore inode;
+ snapid_t first;
+ int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+ if (r < 0) {
+ derr << "Unexpected error reading dentry "
+ << p.second.dirfrag() << "/" << p.second.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (first != CEPH_NOSNAP)
+ continue;
+
+ first = last_snap + 1;
+ r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+ if (r < 0)
+ return r;
+ }
+
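+ // Feed the recovered per-rank maxima back into each rank's InoTable: load
+ // the table (resetting it if unreadable) and force it to consume up to the
+ // highest observed ino, so future allocations cannot collide with inodes we
+ // have just recovered.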
+ for (auto& p : max_ino_map) {
+ InoTable inotable(nullptr);
+ inotable.set_rank(p.first);
+ bool dirty = false;
+ int r = metadata_driver->load_table(&inotable);
+ if (r < 0) {
+ inotable.reset_state();
+ dirty = true;
+ }
+ if (inotable.force_consume_to(p.second))
+ dirty = true;
+ if (dirty) {
+ r = metadata_driver->save_table(&inotable);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ {
+ SnapServer snaptable;
+ snaptable.set_rank(0);
+ bool dirty = false;
+ int r = metadata_driver->load_table(&snaptable);
+ if (r < 0) {
+ snaptable.reset_state();
+ dirty = true;
+ }
+ if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
+ dirty = true;
+ if (dirty) {
+ r = metadata_driver->save_table(&snaptable);
+ if (r < 0)
+ return r;
+ }
+ }
+ return 0;
+}
+
+int DataScan::scan_frags()
+{
+ bool roots_present;
+ int r = driver->check_roots(&roots_present);
+ if (r != 0) {
+ derr << "Unexpected error checking roots: '"
+ << cpp_strerror(r) << "'" << dendl;
+ return r;
+ }
+
+ if (!roots_present) {
+ std::cerr << "Some or all system inodes are absent. Run 'init' from "
+ "one node before running 'scan_inodes'" << std::endl;
+ return -EIO;
+ }
+
+ return forall_objects(metadata_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+ r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+ if (r != 0) {
+ dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+ return r;
+ }
+
+ if (obj_name_ino < (1ULL << 40)) {
+ // FIXME: we're skipping stray dirs here: if they're
+ // orphaned then we should be resetting them some other
+ // way
+ dout(10) << "Skipping system ino " << obj_name_ino << dendl;
+ return 0;
+ }
+
+ AccumulateResult accum_res;
+ inode_backtrace_t backtrace;
+
+ // Default to inherit layout (i.e. no explicit layout on dir) which is
+ // expressed as a zeroed layout struct (see inode_t::has_layout)
+ file_layout_t loaded_layout;
+
+ int parent_r = 0;
+ bufferlist parent_bl;
+ int layout_r = 0;
+ bufferlist layout_bl;
+ bufferlist op_bl;
+
+ librados::ObjectReadOperation op;
+ op.getxattr("parent", &parent_bl, &parent_r);
+ op.getxattr("layout", &layout_bl, &layout_r);
+ r = metadata_io.operate(oid, &op, &op_bl);
+ if (r != 0 && r != -ENODATA) {
+ derr << "Unexpected error reading backtrace: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (parent_r != -ENODATA) {
+ try {
+ auto q = parent_bl.cbegin();
+ backtrace.decode(q);
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
+ if (!force_corrupt) {
+ return -EINVAL;
+ } else {
+ // Treat backtrace as absent: we'll inject into lost+found
+ backtrace = inode_backtrace_t();
+ }
+ }
+ }
+
+ if (layout_r != -ENODATA) {
+ try {
+ auto q = layout_bl.cbegin();
+ decode(loaded_layout, q);
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl;
+ if (!force_corrupt) {
+ return -EINVAL;
+ }
+ }
+ }
+
+ bool have_backtrace = !(backtrace.ancestors.empty());
+
+ // Sanity checking backtrace ino against object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ uint64_t fnode_version = 0;
+ fnode_t fnode;
+ r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
+ if (r == -EINVAL) {
+ derr << "Corrupt fnode on " << oid << dendl;
+ if (force_corrupt) {
+ fnode.fragstat.mtime = 0;
+ fnode.fragstat.nfiles = 1;
+ fnode.fragstat.nsubdirs = 0;
+ fnode.accounted_fragstat = fnode.fragstat;
+ } else {
+ return r;
+ }
+ }
+
+ InodeStore dentry;
+ build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
+ loaded_layout, &dentry);
+
+ // Inject inode to the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
+
+int MetadataTool::read_fnode(
+ inodeno_t ino, frag_t frag, fnode_t *fnode,
+ uint64_t *last_version)
+{
+ ceph_assert(fnode != NULL);
+
+ object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
+ bufferlist fnode_bl;
+ int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
+ *last_version = metadata_io.get_last_version();
+ if (r < 0) {
+ return r;
+ }
+
+ auto old_fnode_iter = fnode_bl.cbegin();
+ try {
+ (*fnode).decode(old_fnode_iter);
+ } catch (const buffer::error &err) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
+ const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
+{
+ ceph_assert(inode != NULL);
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+
+ std::set<std::string> keys;
+ keys.insert(key);
+ std::map<std::string, bufferlist> vals;
+ object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
+ int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
+ dout(20) << "oid=" << frag_oid.name
+ << " dname=" << dname
+ << " frag=" << frag
+ << ", r=" << r << dendl;
+ if (r < 0) {
+ return r;
+ }
+
+ if (vals.find(key) == vals.end()) {
+ dout(20) << key << " not found in result" << dendl;
+ return -ENOENT;
+ }
+
+ try {
+ auto q = vals[key].cbegin();
+ snapid_t first;
+ decode(first, q);
+ char dentry_type;
+ decode(dentry_type, q);
+ if (dentry_type == 'I') {
+ inode->decode_bare(q);
+ } else {
+ dout(20) << "dentry type '" << dentry_type << "': cannot "
+ "read an inode out of that" << dendl;
+ return -EINVAL;
+ }
+ if (dnfirst)
+ *dnfirst = first;
+ } catch (const buffer::error &err) {
+ dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::load_table(MDSTable *table)
+{
+ object_t table_oid = table->get_object_name();
+
+ bufferlist table_bl;
+ int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
+ if (r < 0) {
+ derr << "unable to read mds table '" << table_oid.name << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ try {
+ version_t table_ver;
+ auto p = table_bl.cbegin();
+ decode(table_ver, p);
+ table->decode_state(p);
+ table->force_replay_version(table_ver);
+ } catch (const buffer::error &err) {
+ derr << "unable to decode mds table '" << table_oid.name << "': "
+ << err.what() << dendl;
+ return -EIO;
+ }
+ return 0;
+}
+
+int MetadataDriver::save_table(MDSTable *table)
+{
+ object_t table_oid = table->get_object_name();
+
+ bufferlist table_bl;
+ encode(table->get_version(), table_bl);
+ table->encode_state(table_bl);
+ int r = metadata_io.write_full(table_oid.name, table_bl);
+ if (r != 0) {
+ derr << "error updating mds table " << table_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int MetadataDriver::inject_lost_and_found(
+ inodeno_t ino, const InodeStore &dentry)
+{
+ // Create lost+found if it doesn't exist
+ bool created = false;
+ int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
+ if (r < 0) {
+ return r;
+ }
+ InodeStore lf_ino;
+ r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+
+ // To have a directory not specify a layout, give it zeros (see
+ // inode_t::has_layout)
+ file_layout_t inherit_layout;
+
+ // Construct LF inode
+ frag_info_t fragstat;
+ fragstat.nfiles = 1;
+ build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
+
+ // Inject link to LF inode in the root dir
+ r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
+ if (r < 0) {
+ return r;
+ }
+ } else {
+ if (!(lf_ino.inode.mode & S_IFDIR)) {
+ derr << "lost+found exists but is not a directory!" << dendl;
+ // In this case we error out, and the user should do something about
+ // this problem.
+ return -EINVAL;
+ }
+ }
+
+ r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
+ if (r < 0) {
+ return r;
+ }
+
+ InodeStore recovered_ino;
+
+
+ const std::string dname = lost_found_dname(ino);
+
+ // Write dentry into lost+found dirfrag
+ return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry);
+}
+
+
+int MetadataDriver::get_frag_of(
+ inodeno_t dirino,
+ const std::string &target_dname,
+ frag_t *result_ft)
+{
+ object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
+
+ dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
+
+ // Find and load fragtree if existing dirfrag
+ // ==========================================
+ bool have_backtrace = false;
+ bufferlist parent_bl;
+ int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
+ if (r == -ENODATA) {
+ dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
+ } else if (r < 0) {
+ dout(4) << "Unexpected error on '" << root_frag_oid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Deserialize backtrace
+ inode_backtrace_t backtrace;
+ if (parent_bl.length()) {
+ try {
+ auto q = parent_bl.cbegin();
+ backtrace.decode(q);
+ have_backtrace = true;
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl;
+ }
+ }
+
+ if (!(have_backtrace && backtrace.ancestors.size())) {
+ // Can't work out fragtree without a backtrace
+ dout(4) << "No backtrace on '" << root_frag_oid
+ << "': cannot determine fragtree" << dendl;
+ return -ENOENT;
+ }
+
+ // The parentage of dirino
+ const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
+
+ // The inode of dirino's parent
+ const inodeno_t parent_ino = bp.dirino;
+
+ // The dname of dirino in its parent.
+ const std::string &parent_dname = bp.dname;
+
+ dout(20) << "got backtrace parent " << parent_ino << "/"
+ << parent_dname << dendl;
+
+ // The primary dentry for dirino
+ InodeStore existing_dentry;
+
+ // See if we can find ourselves in dirfrag zero of the parent: this
+ // is a fast path that avoids needing to go further up the tree
+ // if the parent isn't fragmented (worst case we would have to
+ // go all the way to the root)
+ r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
+ if (r >= 0) {
+ // Great, fast path: return the fragtree from here
+ if (existing_dentry.inode.ino != dirino) {
+ dout(4) << "Unexpected inode in dentry! 0x" << std::hex
+ << existing_dentry.inode.ino
+ << " vs expected 0x" << dirino << std::dec << dendl;
+ return -ENOENT;
+ }
+ dout(20) << "fast path, fragtree is "
+ << existing_dentry.dirfragtree << dendl;
+ *result_ft = existing_dentry.pick_dirfrag(target_dname);
+ dout(20) << "frag is " << *result_ft << dendl;
+ return 0;
+ } else if (r == -ENOENT) {
+ // Dentry not present in 0th frag, must read parent's fragtree
+ frag_t parent_frag;
+ r = get_frag_of(parent_ino, parent_dname, &parent_frag);
+ if (r == 0) {
+ // We have the parent fragtree, so try again to load our dentry
+ r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
+ if (r >= 0) {
+ // Got it!
+ *result_ft = existing_dentry.pick_dirfrag(target_dname);
+ dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
+ return 0;
+ } else {
+ if (r == -EINVAL || r == -ENOENT) {
+ return -ENOENT; // dentry missing or corrupt, so frag is missing
+ } else {
+ return r;
+ }
+ }
+ } else {
+ // Couldn't resolve parent fragtree, so can't find ours.
+ return r;
+ }
+ } else if (r == -EINVAL) {
+ // Unreadable dentry, can't know the fragtree.
+ return -ENOENT;
+ } else {
+ // Unexpected error, raise it
+ return r;
+ }
+}
+
+
+int MetadataDriver::inject_with_backtrace(
+ const inode_backtrace_t &backtrace, const InodeStore &dentry)
+{
+
+ // On dirfrags
+ // ===========
+ // In order to insert something into a directory, we first (ideally)
+ // need to know the fragtree for the directory. Sometimes we can't
+ // get that, in which case we just go ahead and insert it into
+ // fragment zero for a good chance of that being the right thing
+ // anyway (most moderate-sized dirs aren't fragmented!)
+
+ // On ancestry
+ // ===========
+ // My immediate ancestry should be correct, so if we can find that
+ // directory's dirfrag then go inject it there. This works well
+ // in the case that this inode's dentry was somehow lost and we
+ // are recreating it, because the rest of the hierarchy
+ // will probably still exist.
+ //
+ // It's more of a "better than nothing" approach when rebuilding
+ // a whole tree, as backtraces will in general not be up to date
+ // beyond the first parent, if anything in the trace was ever
+ // moved after the file was created.
+
+ // On inode numbers
+ // ================
+ // The backtrace tells us inodes for each of the parents. If we are
+ // creating those parent dirfrags, then there is a risk that somehow
+ // the inode indicated here was also used for data (not a dirfrag) at
+ // some stage. That would be a zany situation, and we don't check
+ // for it here, because to do so would require extra IOs for everything
+ // we inject, and anyway wouldn't guarantee that the inode number
+ // wasn't in use in some dentry elsewhere in the metadata tree that
+ // just happened not to have any data objects.
+
+ // On multiple workers touching the same traces
+ // ============================================
+ // When creating linkage for a directory, *only* create it if we are
+ // also creating the object. That way, we might not manage to get the
+ // *right* linkage for a directory, but at least we won't multiply link
+ // it. We assume that if a root dirfrag exists for a directory, then
+ // it is linked somewhere (i.e. that the metadata pool is not already
+ // inconsistent).
+ //
+ // Making sure *that* is true is someone else's job! Probably someone
+ // who is not going to run in parallel, so that they can self-consistently
+ // look at versions and move things around as they go.
+ // Note this isn't 100% safe: if we die immediately after creating the
+ // dirfrag object, the next run will fail to create linkage for the dirfrag
+ // object and leave it orphaned.
+
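+ // Illustrative example (hypothetical path and inos): for a file at /a/b/f
+ // the backtrace ancestors run child-to-root, e.g.
+ // ancestors[0] = (dirino of /a/b, "f")
+ // ancestors[1] = (dirino of /a, "b")
+ // ancestors[2] = (CEPH_INO_ROOT, "a")
+ // so the loop below first links the file itself into /a/b, then walks
+ // upwards creating synthetic directory dentries until it reaches a dirfrag
+ // that already existed.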
+ inodeno_t ino = backtrace.ino;
+ dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
+ for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
+ i != backtrace.ancestors.end(); ++i) {
+ const inode_backpointer_t &backptr = *i;
+ dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
+ << "/" << backptr.dname << dendl;
+
+ // Examine root dirfrag for parent
+ const inodeno_t parent_ino = backptr.dirino;
+ const std::string dname = backptr.dname;
+
+ frag_t fragment;
+ int r = get_frag_of(parent_ino, dname, &fragment);
+ if (r == -ENOENT) {
+ // Don't know fragment, fall back to assuming root
+ dout(20) << "don't know fragment for 0x" << std::hex <<
+ parent_ino << std::dec << "/" << dname << ", will insert to root"
+ << dendl;
+ }
+
+ // Find or create dirfrag
+ // ======================
+ bool created_dirfrag;
+ r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
+ if (r < 0) {
+ return r;
+ }
+
+ // Check if dentry already exists
+ // ==============================
+ InodeStore existing_dentry;
+ r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
+ bool write_dentry = false;
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+ // Missing or corrupt dentry
+ write_dentry = true;
+ } else if (r < 0) {
+ derr << "Unexpected error reading dentry 0x" << std::hex
+ << parent_ino << std::dec << "/"
+ << dname << ": " << cpp_strerror(r) << dendl;
+ break;
+ } else {
+ // Dentry already present, does it link to me?
+ if (existing_dentry.inode.ino == ino) {
+ dout(20) << "Dentry 0x" << std::hex
+ << parent_ino << std::dec << "/"
+ << dname << " already exists and points to me" << dendl;
+ } else {
+ derr << "Dentry 0x" << std::hex
+ << parent_ino << std::dec << "/"
+ << dname << " already exists but points to 0x"
+ << std::hex << existing_dentry.inode.ino << std::dec << dendl;
+ // Fall back to lost+found!
+ return inject_lost_and_found(backtrace.ino, dentry);
+ }
+ }
+
+ // Inject linkage
+ // ==============
+
+ if (write_dentry) {
+ if (i == backtrace.ancestors.begin()) {
+ // This is the linkage for the file of interest
+ dout(10) << "Linking inode 0x" << std::hex << ino
+ << " at 0x" << parent_ino << "/" << dname << std::dec
+ << " with size=" << dentry.inode.size << " bytes" << dendl;
+
+ r = inject_linkage(parent_ino, dname, fragment, dentry);
+ } else {
+ // This is the linkage for an ancestor directory
+ InodeStore ancestor_dentry;
+ ancestor_dentry.inode.mode = 0755 | S_IFDIR;
+
+ // Set nfiles to something non-zero, to fool any other code
+ // that tries to ignore 'empty' directories. This won't be
+ // accurate, but it should avoid functional issues.
+
+ ancestor_dentry.inode.dirstat.nfiles = 1;
+ ancestor_dentry.inode.dir_layout.dl_dir_hash =
+ g_conf()->mds_default_dir_hash;
+
+ ancestor_dentry.inode.nlink = 1;
+ ancestor_dentry.inode.ino = ino;
+ ancestor_dentry.inode.uid = g_conf()->mds_root_ino_uid;
+ ancestor_dentry.inode.gid = g_conf()->mds_root_ino_gid;
+ ancestor_dentry.inode.version = 1;
+ ancestor_dentry.inode.backtrace_version = 1;
+ r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
+ }
+
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (!created_dirfrag) {
+ // If the parent dirfrag already existed, then stop traversing the
+ // backtrace: assume that the other ancestors already exist too. This
+ // is an assumption rather than a truth, but it's a convenient way
+ // to avoid the risk of creating multiply-linked directories while
+ // injecting data. If there are in fact missing ancestors, this
+ // should be fixed up using a separate tool scanning the metadata
+ // pool.
+ break;
+ } else {
+ // Proceed up the backtrace, creating parents
+ ino = parent_ino;
+ }
+ }
+
+ return 0;
+}
+
+int MetadataDriver::find_or_create_dirfrag(
+ inodeno_t ino,
+ frag_t fragment,
+ bool *created)
+{
+ ceph_assert(created != NULL);
+
+ fnode_t existing_fnode;
+ *created = false;
+
+ uint64_t read_version = 0;
+ int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
+ dout(10) << "read_version = " << read_version << dendl;
+
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+
+ // Missing or corrupt fnode, create afresh
+ bufferlist fnode_bl;
+ fnode_t blank_fnode;
+ blank_fnode.version = 1;
+ // mark it as non-empty
+ blank_fnode.fragstat.nfiles = 1;
+ blank_fnode.accounted_fragstat = blank_fnode.fragstat;
+ blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
+ blank_fnode.encode(fnode_bl);
+
+
+ librados::ObjectWriteOperation op;
+
+ if (read_version) {
+ ceph_assert(r == -EINVAL);
+ // Case A: We must assert that the version isn't changed since we saw the object
+ // was unreadable, to avoid the possibility of two data-scan processes
+ // both creating the frag.
+ op.assert_version(read_version);
+ } else {
+ ceph_assert(r == -ENOENT);
+ // Case B: The object didn't exist in read_fnode, so while creating it we must
+ // use an exclusive create to correctly populate *created with
+ // whether we created it ourselves or someone beat us to it.
+ op.create(true);
+ }
+
+ object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
+ op.omap_set_header(fnode_bl);
+ r = metadata_io.operate(frag_oid.name, &op);
+ if (r == -EOVERFLOW || r == -EEXIST) {
+ // Someone else wrote it first: -EOVERFLOW from Case A's assert_version, -EEXIST from Case B's exclusive create
+ dout(10) << "Dirfrag creation race: 0x" << std::hex
+ << ino << " " << fragment << std::dec << dendl;
+ *created = false;
+ return 0;
+ } else if (r < 0) {
+ // We were unable to create or write it, error out
+ derr << "Failed to create dirfrag 0x" << std::hex
+ << ino << std::dec << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ // Success: the dirfrag object now exists with a valid fnode header
+ dout(10) << "Created dirfrag: 0x" << std::hex
+ << ino << std::dec << dendl;
+ *created = true;
+ }
+ } else if (r < 0) {
+ derr << "Unexpected error reading dirfrag 0x" << std::hex
+ << ino << std::dec << " : " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(20) << "Dirfrag already exists: 0x" << std::hex
+ << ino << " " << fragment << std::dec << dendl;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::inject_linkage(
+ inodeno_t dir_ino, const std::string &dname,
+ const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
+{
+ object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+
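+ // Dentry omap format written below: the key comes from dentry_key_t::encode()
+ // (the dname plus a snapid suffix, "_head" for CEPH_NOSNAP); the value is the
+ // dentry's 'first' snapid, the type byte 'I' (primary dentry), then the
+ // bare-encoded InodeStore.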
+ bufferlist dentry_bl;
+ encode(dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ // Write out
+ std::map<std::string, bufferlist> vals;
+ vals[key] = dentry_bl;
+ int r = metadata_io.omap_set(frag_oid.name, vals);
+ if (r != 0) {
+ derr << "Error writing dentry 0x" << std::hex
+ << dir_ino << std::dec << "/"
+ << dname << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(20) << "Injected dentry 0x" << std::hex
+ << dir_ino << "/" << dname << " pointing to 0x"
+ << inode.inode.ino << std::dec << dendl;
+ return 0;
+ }
+}
+
+
+int MetadataDriver::init(
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
+{
+ if (metadata_pool_name.empty()) {
+ auto fs = fsmap->get_filesystem(fscid);
+ ceph_assert(fs != nullptr);
+ int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+ dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+ int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+ if (r < 0) {
+ derr << "Pool " << metadata_pool_id
+ << " identified in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+ dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
+ } else {
+ dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
+ }
+ return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+}
+
+int LocalFileDriver::init(
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
+{
+ return 0;
+}
+
+int LocalFileDriver::inject_data(
+ const std::string &file_path,
+ uint64_t size,
+ uint32_t chunk_size,
+ inodeno_t ino)
+{
+ // Scrape the file contents out of the data pool and into the
+ // local filesystem
+ std::fstream f;
+ f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
+
+ for (uint64_t offset = 0; offset < size; offset += chunk_size) {
+ bufferlist bl;
+
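+ // Data objects are named "<ino, hex>.<chunk index, 8 hex digits>", e.g.
+ // chunk 2 of inode 0x10000000001 would be object "10000000001.00000002".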
+ char buf[32];
+ snprintf(buf, sizeof(buf),
+ "%llx.%08llx",
+ (unsigned long long)ino,
+ (unsigned long long)(offset / chunk_size));
+ std::string oid(buf);
+
+ int r = data_io.read(oid, bl, chunk_size, 0);
+
+ if (r <= 0 && r != -ENOENT) {
+ derr << "error reading data object '" << oid << "': "
+ << cpp_strerror(r) << dendl;
+ f.close();
+ return r;
+ } else if (r >= 0) {
+
+ f.seekp(offset);
+ bl.write_stream(f);
+ }
+ }
+ f.close();
+
+ return 0;
+}
+
+
+int LocalFileDriver::inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry)
+{
+ std::string path_builder = path;
+
+ // Iterate through backtrace creating directory parents
+ std::vector<inode_backpointer_t>::const_reverse_iterator i;
+ for (i = bt.ancestors.rbegin();
+ i != bt.ancestors.rend(); ++i) {
+
+ const inode_backpointer_t &backptr = *i;
+ path_builder += "/";
+ path_builder += backptr.dname;
+
+ // Last entry is the filename itself
+ bool is_file = (i + 1 == bt.ancestors.rend());
+ if (is_file) {
+ // FIXME: inject_data won't cope with interesting (i.e. striped)
+ // layouts (need a librados-compatible Filer to read these)
+ inject_data(path_builder, dentry.inode.size,
+ dentry.inode.layout.object_size, bt.ino);
+ } else {
+ int r = mkdir(path_builder.c_str(), 0755);
+ if (r != 0 && r != -EPERM) {
+ derr << "error creating directory: '" << path_builder << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int LocalFileDriver::inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry)
+{
+ std::string lf_path = path + "/lost+found";
+ int r = mkdir(lf_path.c_str(), 0755);
+ if (r != 0 && r != -EPERM) {
+ derr << "error creating directory: '" << lf_path << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::string file_path = lf_path + "/" + lost_found_dname(ino);
+ return inject_data(file_path, dentry.inode.size,
+ dentry.inode.layout.object_size, ino);
+}
+
+int LocalFileDriver::init_roots(int64_t data_pool_id)
+{
+ // Ensure that the path exists and is a directory
+ bool exists;
+ int r = check_roots(&exists);
+ if (r != 0) {
+ return r;
+ }
+
+ if (exists) {
+ return 0;
+ } else {
+ return ::mkdir(path.c_str(), 0755);
+ }
+}
+
+int LocalFileDriver::check_roots(bool *result)
+{
+ // Check if the path exists and is a directory
+ DIR *d = ::opendir(path.c_str());
+ if (d == NULL) {
+ *result = false;
+ } else {
+ int r = closedir(d);
+ if (r != 0) {
+ // Weird, but maybe possible with e.g. stale FD on NFS mount?
+ *result = false;
+ } else {
+ *result = true;
+ }
+ }
+
+ return 0;
+}
+
+void MetadataTool::build_file_dentry(
+ inodeno_t ino, uint64_t file_size, time_t file_mtime,
+ const file_layout_t &layout, InodeStore *out)
+{
+ ceph_assert(out != NULL);
+
+ out->inode.mode = 0500 | S_IFREG;
+ out->inode.size = file_size;
+ out->inode.max_size_ever = file_size;
+ out->inode.mtime.tv.tv_sec = file_mtime;
+ out->inode.atime.tv.tv_sec = file_mtime;
+ out->inode.ctime.tv.tv_sec = file_mtime;
+
+ out->inode.layout = layout;
+
+ out->inode.truncate_seq = 1;
+ out->inode.truncate_size = -1ull;
+
+ out->inode.inline_data.version = CEPH_INLINE_NONE;
+
+ out->inode.nlink = 1;
+ out->inode.ino = ino;
+ out->inode.version = 1;
+ out->inode.backtrace_version = 1;
+ out->inode.uid = g_conf()->mds_root_ino_uid;
+ out->inode.gid = g_conf()->mds_root_ino_gid;
+}
+
+void MetadataTool::build_dir_dentry(
+ inodeno_t ino, const frag_info_t &fragstat,
+ const file_layout_t &layout, InodeStore *out)
+{
+ ceph_assert(out != NULL);
+
+ out->inode.mode = 0755 | S_IFDIR;
+ out->inode.dirstat = fragstat;
+ out->inode.mtime.tv.tv_sec = fragstat.mtime;
+ out->inode.atime.tv.tv_sec = fragstat.mtime;
+ out->inode.ctime.tv.tv_sec = fragstat.mtime;
+
+ out->inode.layout = layout;
+ out->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+ out->inode.truncate_seq = 1;
+ out->inode.truncate_size = -1ull;
+
+ out->inode.inline_data.version = CEPH_INLINE_NONE;
+
+ out->inode.nlink = 1;
+ out->inode.ino = ino;
+ out->inode.version = 1;
+ out->inode.backtrace_version = 1;
+ out->inode.uid = g_conf()->mds_root_ino_uid;
+ out->inode.gid = g_conf()->mds_root_ino_gid;
+}
+
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
new file mode 100644
index 00000000..5c87fe2b
--- /dev/null
+++ b/src/tools/cephfs/DataScan.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDSUtility.h"
+#include "include/rados/librados.hpp"
+
+class InodeStore;
+class MDSTable;
+
+class RecoveryDriver {
+ protected:
+ // If true, overwrite structures that generate decoding errors.
+ bool force_corrupt;
+
+ // If true, overwrite root objects during init_roots even if they
+ // exist
+ bool force_init;
+
+ public:
+ virtual int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) = 0;
+
+ void set_force_corrupt(const bool val)
+ {
+ force_corrupt = val;
+ }
+
+ void set_force_init(const bool val)
+ {
+ force_init = val;
+ }
+
+
+ /**
+ * Inject an inode + dentry parents into the metadata pool,
+ * based on a backtrace recovered from the data pool
+ */
+ virtual int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) = 0;
+
+ /**
+ * Inject an inode + dentry into the lost+found directory,
+ * when all we know about a file is its inode.
+ */
+ virtual int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) = 0;
+
+ /**
+ * Create any missing roots (i.e. mydir, strays, root inode)
+ */
+ virtual int init_roots(
+ int64_t data_pool_id) = 0;
+
+ /**
+ * Pre-injection check that all the roots are present in
+ * the metadata pool. Used to avoid parallel workers interfering
+ * with one another, by cueing the user to go run 'init' on a
+ * single node before running a parallel scan.
+ *
+ * @param result: set to true if roots are present, else set to false
+ * @returns 0 on no unexpected errors, else error code. Missing objects
+ * are not considered an unexpected error: check *result for
+ * this case.
+ */
+ virtual int check_roots(bool *result) = 0;
+
+ /**
+ * Helper to compose dnames for links to lost+found
+ * inodes.
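+ * (e.g. ino 0x10000000123 becomes the dname "10000000123")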
+ */
+ std::string lost_found_dname(inodeno_t ino)
+ {
+ char s[20];
+ snprintf(s, sizeof(s), "%llx", (unsigned long long)ino);
+ return std::string(s);
+ }
+
+ RecoveryDriver()
+ : force_corrupt(false),
+ force_init(false)
+ {}
+
+ virtual ~RecoveryDriver() {}
+};
+
+class LocalFileDriver : public RecoveryDriver
+{
+ protected:
+ const std::string path;
+ librados::IoCtx &data_io;
+
+ int inject_data(
+ const std::string &file_path,
+ uint64_t size,
+ uint32_t chunk_size,
+ inodeno_t ino);
+ public:
+
+ LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_)
+ : RecoveryDriver(), path(path_), data_io(data_io_)
+ {}
+
+ // Implement RecoveryDriver interface
+ int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) override;
+
+ int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) override;
+
+ int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) override;
+
+ int init_roots(int64_t data_pool_id) override;
+
+ int check_roots(bool *result) override;
+};
+
+/**
+ * A class that knows how to work with objects in a CephFS
+ * metadata pool.
+ */
+class MetadataTool
+{
+ protected:
+
+ librados::IoCtx metadata_io;
+
+ /**
+ * Construct a synthetic InodeStore for a normal file
+ */
+ void build_file_dentry(
+ inodeno_t ino, uint64_t file_size, time_t file_mtime,
+ const file_layout_t &layout,
+ InodeStore *out);
+
+ /**
+ * Construct a synthetic InodeStore for a directory
+ */
+ void build_dir_dentry(
+ inodeno_t ino,
+ const frag_info_t &fragstat,
+ const file_layout_t &layout,
+ InodeStore *out);
+
+ /**
+ * Try and read an fnode from a dirfrag
+ */
+ int read_fnode(inodeno_t ino, frag_t frag,
+ fnode_t *fnode, uint64_t *read_version);
+
+ /**
+ * Try and read a dentry from a dirfrag
+ */
+ int read_dentry(inodeno_t parent_ino, frag_t frag,
+ const std::string &dname, InodeStore *inode, snapid_t *dnfirst=nullptr);
+};
+
+/**
+ * A class that knows how to manipulate CephFS metadata pools
+ */
+class MetadataDriver : public RecoveryDriver, public MetadataTool
+{
+ protected:
+ /**
+ * Create a .inode object, i.e. root or mydir
+ */
+ int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id);
+
+ /**
+ * Check for existence of .inode objects, before
+ * trying to go ahead and inject metadata.
+ */
+ int root_exists(inodeno_t ino, bool *result);
+ int find_or_create_dirfrag(
+ inodeno_t ino,
+ frag_t fragment,
+ bool *created);
+
+
+ /**
+ * Work out which fragment of a directory should contain a named
+ * dentry, recursing up the trace as necessary to retrieve
+ * fragtrees.
+ */
+ int get_frag_of(
+ inodeno_t dirino,
+ const std::string &dname,
+ frag_t *result_ft);
+
+ public:
+
+ // Implement RecoveryDriver interface
+ int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) override;
+
+ int inject_linkage(
+ inodeno_t dir_ino, const std::string &dname,
+ const frag_t fragment, const InodeStore &inode, snapid_t dnfirst=CEPH_NOSNAP);
+
+ int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) override;
+
+ int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) override;
+
+ int init_roots(int64_t data_pool_id) override;
+
+ int check_roots(bool *result) override;
+
+ int load_table(MDSTable *table);
+ int save_table(MDSTable *table);
+};
+
+class DataScan : public MDSUtility, public MetadataTool
+{
+ protected:
+ RecoveryDriver *driver;
+ fs_cluster_id_t fscid;
+
+ string metadata_pool_name;
+ std::vector<int64_t> data_pools;
+
+ // IoCtx for data pool (where we scrape file backtraces from)
+ librados::IoCtx data_io;
+ // Remember the data pool ID for use in layouts
+ int64_t data_pool_id;
+
+ uint32_t n;
+ uint32_t m;
+
+ /**
+ * Scan data pool for backtraces, and inject inodes to metadata pool
+ */
+ int scan_inodes();
+
+ /**
+ * Scan data pool for file sizes and mtimes
+ */
+ int scan_extents();
+
+ /**
+ * Scan metadata pool for 0th dirfrags to link orphaned
+ * directory inodes.
+ */
+ int scan_frags();
+
+ /**
+ * Cleanup xattrs from data pool
+ */
+ int cleanup();
+
+ /**
+ * Check if an inode number is in the permitted ranges
+ */
+ bool valid_ino(inodeno_t ino) const;
+
+
+ int scan_links();
+
+ // Accept pools which are not in the FSMap
+ bool force_pool;
+ // Respond to decode errors by overwriting
+ bool force_corrupt;
+ // Overwrite root objects even if they exist
+ bool force_init;
+ // Only scan inodes without this scrub tag
+ string filter_tag;
+
+ /**
+ * @param r set to error on valid key with invalid value
+ * @return true if argument consumed, else false
+ */
+ bool parse_kwarg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i,
+ int *r);
+
+ /**
+ * @return true if argument consumed, else false
+ */
+ bool parse_arg(
+ const std::vector<const char*> &arg,
+ std::vector<const char *>::const_iterator &i);
+
+ int probe_filter(librados::IoCtx &ioctx);
+
+ /**
+ * Apply a function to all objects in an ioctx's pool, optionally
+ * restricted to only those objects with a 00000000 offset and
+ * no tag matching DataScan::filter_tag.
+ */
+ int forall_objects(
+ librados::IoCtx &ioctx,
+ bool untagged_only,
+ std::function<int(std::string, uint64_t, uint64_t)> handler);
+
+ public:
+ static void usage();
+ int main(const std::vector<const char *> &args);
+
+ DataScan()
+ : driver(NULL), fscid(FS_CLUSTER_ID_NONE),
+ data_pool_id(-1), n(0), m(1),
+ force_pool(false), force_corrupt(false),
+ force_init(false)
+ {
+ }
+
+ ~DataScan() override
+ {
+ delete driver;
+ }
+};
+
diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc
new file mode 100644
index 00000000..6b758497
--- /dev/null
+++ b/src/tools/cephfs/Dumper.cc
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
+#endif
+
+#include "include/compat.h"
+#include "include/fs_types.h"
+#include "common/entity_name.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/JournalPointer.h"
+#include "osdc/Journaler.h"
+#include "mon/MonClient.h"
+
+#include "Dumper.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+#define HEADER_LEN 4096
+
+int Dumper::init(mds_role_t role_, const std::string &type)
+{
+ role = role_;
+
+ int r = MDSUtility::init();
+ if (r < 0) {
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ if (type == "mdlog") {
+ JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+ int jp_load_result = jp.load(objecter);
+ if (jp_load_result != 0) {
+ std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl;
+ return jp_load_result;
+ } else {
+ ino = jp.front;
+ }
+ } else if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + role.rank;
+ } else {
+ ceph_abort(); // should not get here
+ }
+ return 0;
+}
+
+
+int Dumper::recover_journal(Journaler *journaler)
+{
+ C_SaferCond cond;
+ lock.Lock();
+ journaler->recover(&cond);
+ lock.Unlock();
+ const int r = cond.wait();
+
+ if (r < 0) { // Error
+ derr << "error on recovery: " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(10) << "completed journal recovery" << dendl;
+ return 0;
+ }
+}
+
+
+int Dumper::dump(const char *dump_file)
+{
+ int r = 0;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+ &finisher);
+ r = recover_journal(&journaler);
+ if (r) {
+ return r;
+ }
+ uint64_t start = journaler.get_read_pos();
+ uint64_t end = journaler.get_write_pos();
+ uint64_t len = end-start;
+
+ Filer filer(objecter, &finisher);
+
+ cout << "journal is " << start << "~" << len << std::endl;
+
+ int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (fd >= 0) {
+ // include an informative header
+ uuid_d fsid = monc->get_fsid();
+ char fsid_str[40];
+ fsid.print(fsid_str);
+ char buf[HEADER_LEN];
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\
+ length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\
+ trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\
+ object_size %lu (0x%lx)\n fsid %s\n%c",
+ role.rank,
+ (unsigned long long)start, (unsigned long long)start,
+ (unsigned long long)len, (unsigned long long)len,
+ (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos,
+ (unsigned long long)journaler.last_committed.stream_format,
+ (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos,
+ (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit,
+ (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count,
+ (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size,
+ fsid_str,
+ 4);
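+ // The header is a fixed HEADER_LEN (4 KiB) region: zero-padded, human
+ // readable, and terminated with an EOT (0x04) byte. The journal data is then
+ // written at its original journal offsets further into the file, which is
+ // what makes the dump a sparse file.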
+ r = safe_write(fd, buf, sizeof(buf));
+ if (r) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ // write the data
+ off64_t seeked = ::lseek64(fd, start, SEEK_SET);
+ if (seeked == (off64_t)-1) {
+ r = errno;
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl;
+ ::close(fd);
+ return r;
+ }
+
+
+ // Read and write 32MB chunks. Slower than it could be because we're not
+ // streaming, but that's okay because this is just a debug/disaster tool.
+ const uint32_t chunk_size = 32 * 1024 * 1024;
+
+ for (uint64_t pos = start; pos < start + len; pos += chunk_size) {
+ bufferlist bl;
+ dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl;
+
+ const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos);
+
+ C_SaferCond cond;
+ lock.Lock();
+ filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP,
+ pos, read_size, &bl, 0, &cond);
+ lock.Unlock();
+ r = cond.wait();
+ if (r < 0) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") reading "
+ "journal at offset 0x" << std::hex << pos << std::dec << dendl;
+ ::close(fd);
+ return r;
+ }
+ dout(10) << "Got 0x" << std::hex << bl.length() << std::dec
+ << " bytes" << dendl;
+
+ r = bl.write_fd(fd);
+ if (r) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl;
+ ::close(fd);
+ return r;
+ }
+ }
+
+ r = ::close(fd);
+ if (r) {
+ r = errno;
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl;
+ return r;
+ }
+
+ cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n"
+ << "NOTE: this is a _sparse_ file; you can\n"
+ << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n"
+ << " to efficiently compress it while preserving sparseness." << std::endl;
+ return 0;
+ } else {
+ int err = errno;
+ derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl;
+ return err;
+ }
+}
+
+int Dumper::undump(const char *dump_file, bool force)
+{
+ cout << "undump " << dump_file << std::endl;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ int r = 0;
+ // try to get layout info from the cluster
+ Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+ &finisher);
+ int recovered = recover_journal(&journaler);
+ if (recovered != 0) {
+ derr << "recover_journal failed, trying to get header from dump file" << dendl;
+ }
+
+ int fd = ::open(dump_file, O_RDONLY);
+ if (fd < 0) {
+ r = errno;
+ derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Ceph mds0 journal dump
+ // start offset 232401996 (0xdda2c4c)
+ // length 1097504 (0x10bf20)
+
+ char buf[HEADER_LEN];
+ r = safe_read(fd, buf, sizeof(buf));
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+
+ long long unsigned start, len, write_pos, format, trimmed_pos;
+ long unsigned stripe_unit, stripe_count, object_size;
+ sscanf(strstr(buf, "start offset"), "start offset %llu", &start);
+ sscanf(strstr(buf, "length"), "length %llu", &len);
+ sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos);
+ sscanf(strstr(buf, "format"), "format %llu", &format);
+
+ if (!force) {
+ // need to check if the fsid matches the online cluster fsid
+ if (strstr(buf, "fsid")) {
+ uuid_d fsid;
+ char fsid_str[40];
+ sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str);
+ r = fsid.parse(fsid_str);
+ if (!r) {
+ derr << "Invalid fsid" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ if (fsid != monc->get_fsid()) {
+ derr << "Imported journal fsid does not match online cluster fsid" << dendl;
+ derr << "Use --force to skip fsid check" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+ } else {
+ derr << "Invalid header, no fsid embedded" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+ }
+
+ if (recovered == 0) {
+ stripe_unit = journaler.last_committed.layout.stripe_unit;
+ stripe_count = journaler.last_committed.layout.stripe_count;
+ object_size = journaler.last_committed.layout.object_size;
+ } else {
+ // try to get layout from dump file header; if that fails, fall back to the default layout
+ if (strstr(buf, "stripe_unit")) {
+ sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit);
+ } else {
+ stripe_unit = file_layout_t::get_default().stripe_unit;
+ }
+ if (strstr(buf, "stripe_count")) {
+ sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count);
+ } else {
+ stripe_count = file_layout_t::get_default().stripe_count;
+ }
+ if (strstr(buf, "object_size")) {
+ sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size);
+ } else {
+ object_size = file_layout_t::get_default().object_size;
+ }
+ }
+
+ if (strstr(buf, "trimmed_pos")) {
+ sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos);
+ } else {
+ // Old format dump, any untrimmed objects before expire_pos will
+ // be discarded as trash.
+ trimmed_pos = start - (start % object_size);
+ }
+
+ if (trimmed_pos > start) {
+ derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos
+ << " > expire 0x" << start << std::dec << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ if (start > write_pos) {
+ derr << std::hex << "Invalid header (expire 0x" << start
+ << " > write 0x" << write_pos << std::dec << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ cout << "start " << start <<
+ " len " << len <<
+ " write_pos " << write_pos <<
+ " format " << format <<
+ " trimmed_pos " << trimmed_pos <<
+ " stripe_unit " << stripe_unit <<
+ " stripe_count " << stripe_count <<
+ " object_size " << object_size << std::endl;
+
+ Journaler::Header h;
+ h.trimmed_pos = trimmed_pos;
+ h.expire_pos = start;
+ h.write_pos = write_pos;
+ h.stream_format = format;
+ h.magic = CEPH_FS_ONDISK_MAGIC;
+
+ h.layout.stripe_unit = stripe_unit;
+ h.layout.stripe_count = stripe_count;
+ h.layout.object_size = object_size;
+ h.layout.pool_id = fs->mds_map.get_metadata_pool();
+
+ bufferlist hbl;
+ encode(h, hbl);
+
+ object_t oid = file_object_t(ino, 0);
+ object_locator_t oloc(fs->mds_map.get_metadata_pool());
+ SnapContext snapc;
+
+ cout << "writing header " << oid << std::endl;
+ C_SaferCond header_cond;
+ lock.Lock();
+ objecter->write_full(oid, oloc, snapc, hbl,
+ ceph::real_clock::now(), 0,
+ &header_cond);
+ lock.Unlock();
+
+ r = header_cond.wait();
+ if (r != 0) {
+ derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ Filer filer(objecter, &finisher);
+
+ /* Erase any objects at the end of the region to which we shall write
+ * the new log data. This is to avoid leaving trailing junk after
+ * the newly written data. Any junk more than one object ahead
+ * will be taken care of during normal operation by Journaler's
+ * prezeroing behaviour */
+ {
+ uint32_t const object_size = h.layout.object_size;
+ ceph_assert(object_size > 0);
+ uint64_t last_obj = h.write_pos / object_size;
+ uint64_t purge_count = 2;
+ /* When the length is zero, the last_obj should be zeroed
+ * from the offset determined by the new write_pos instead of being purged.
+ */
+ if (!len) {
+ purge_count = 1;
+ ++last_obj;
+ }
+ C_SaferCond purge_cond;
+ cout << "Purging " << purge_count << " objects from " << last_obj << std::endl;
+ lock.Lock();
+ filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count,
+ ceph::real_clock::now(), 0, &purge_cond);
+ lock.Unlock();
+ purge_cond.wait();
+ }
+ /* When the length is zero, zero the last object
+ * from the offset determined by the new write_pos.
+ */
+ if (!len) {
+ uint64_t offset_in_obj = h.write_pos % h.layout.object_size;
+ uint64_t len = h.layout.object_size - offset_in_obj;
+ C_SaferCond zero_cond;
+ cout << "Zeroing " << len << " bytes in the last object." << std::endl;
+
+ lock.Lock();
+ filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond);
+ lock.Unlock();
+ zero_cond.wait();
+ }
+
+ // Stream from `fd` to `filer`
+ uint64_t pos = start;
+ uint64_t left = len;
+ while (left > 0) {
+ // Read
+ bufferlist j;
+ lseek64(fd, pos, SEEK_SET);
+ uint64_t l = std::min<uint64_t>(left, 1024*1024);
+ j.read_fd(fd, l);
+
+ // Write
+ cout << " writing " << pos << "~" << l << std::endl;
+ C_SaferCond write_cond;
+ lock.Lock();
+ filer.write(ino, &h.layout, snapc, pos, l, j,
+ ceph::real_clock::now(), 0, &write_cond);
+ lock.Unlock();
+
+ r = write_cond.wait();
+ if (r != 0) {
+ derr << "Failed to write journal data: " << cpp_strerror(r) << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ // Advance
+ pos += l;
+ left -= l;
+ }
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ cout << "done." << std::endl;
+ return 0;
+}
+
diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h
new file mode 100644
index 00000000..758f3cde
--- /dev/null
+++ b/src/tools/cephfs/Dumper.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef JOURNAL_DUMPER_H_
+#define JOURNAL_DUMPER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you dump out an mds journal for troubleshooting or whatever.
+ *
+ * It was built to work with cmds, so some of the design choices are somewhat arbitrary.
+ * To use, create a Dumper, call init(), and then call dump() with the name
+ * of the file to dump to.
+ */
+
+class Dumper : public MDSUtility {
+private:
+ mds_role_t role;
+ inodeno_t ino;
+
+public:
+ Dumper() : ino(-1)
+ {}
+
+ int init(mds_role_t role_, const std::string &type);
+ int recover_journal(Journaler *journaler);
+ int dump(const char *dumpfile);
+ int undump(const char *dumpfile, bool force);
+};
+
+#endif /* JOURNAL_DUMPER_H_ */
diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc
new file mode 100644
index 00000000..8cb235a8
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.cc
@@ -0,0 +1,153 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include <iostream>
+#include <fstream>
+
+#include "common/errno.h"
+#include "mds/mdstypes.h"
+#include "mds/events/EUpdate.h"
+#include "mds/LogEvent.h"
+#include "JournalScanner.h"
+
+#include "EventOutput.h"
+
+
+int EventOutput::binary() const
+{
+ // Binary output, files
+ int r = ::mkdir(path.c_str(), 0755);
+ if (r != 0) {
+ r = -errno;
+ if (r != -EEXIST) {
+ std::cerr << "Error creating output directory: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ bufferlist bin;
+ std::stringstream filename;
+ if (auto& le = i->second.log_event; le) {
+ le->encode(bin, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ filename << "0x" << std::hex << i->first << std::dec << "_" << le->get_type_str() << ".bin";
+ } else if (auto& pi = i->second.pi; pi) {
+ pi->encode(bin);
+ filename << "0x" << std::hex << i->first << std::dec << "_" << pi->get_type_str() << ".bin";
+ }
+
+ std::string const file_path = path + std::string("/") + filename.str();
+ std::ofstream bin_file(file_path.c_str(), std::ofstream::out | std::ofstream::binary);
+ bin.write_stream(bin_file);
+ bin_file.close();
+ if (bin_file.fail()) {
+ return -EIO;
+ }
+ }
+ std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl;
+
+ return 0;
+}
+
+int EventOutput::json() const
+{
+ JSONFormatter jf(true);
+ std::ofstream out_file(path.c_str(), std::ofstream::out);
+ jf.open_array_section("journal");
+ {
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ if (auto& le = i->second.log_event; le) {
+ jf.open_object_section("log_event");
+ le->dump(&jf);
+ jf.close_section(); // log_event
+ } else if (auto& pi = i->second.pi; pi) {
+ jf.open_object_section("purge_action");
+ pi->dump(&jf);
+ jf.close_section();
+ }
+ }
+ }
+ jf.close_section(); // journal
+ jf.flush(out_file);
+ out_file.close();
+
+ if (out_file.fail()) {
+ return -EIO;
+ } else {
+ std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl;
+ return 0;
+ }
+}
+
+void EventOutput::list() const
+{
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ if (auto& le = i->second.log_event; le) {
+ std::vector<std::string> ev_paths;
+ EMetaBlob const *emb = le->get_metablob();
+ if (emb) {
+ emb->get_paths(ev_paths);
+ }
+
+ std::string detail;
+ if (le->get_type() == EVENT_UPDATE) {
+ auto& eu = reinterpret_cast<EUpdate&>(*le);
+ detail = eu.type;
+ }
+
+ std::cout << le->get_stamp() << " 0x"
+ << std::hex << i->first << std::dec << " "
+ << le->get_type_str() << ": "
+ << " (" << detail << ")" << std::endl;
+ for (std::vector<std::string>::iterator i = ev_paths.begin(); i != ev_paths.end(); ++i) {
+ std::cout << " " << *i << std::endl;
+ }
+ } else if (auto& pi = i->second.pi; pi) {
+ std::cout << pi->stamp << " 0x"
+ << std::hex << i->first << std::dec << " "
+ << pi->get_type_str() << std::endl;
+ }
+ }
+}
+
+void EventOutput::summary() const
+{
+ std::map<std::string, int> type_count;
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ std::string type;
+ if (auto& le = i->second.log_event; le)
+ type = le->get_type_str();
+ else if (auto& pi = i->second.pi; pi)
+ type = pi->get_type_str();
+ if (type_count.count(type) == 0) {
+ type_count[type] = 0;
+ }
+ type_count[type] += 1;
+ }
+
+ std::cout << "Events by type:" << std::endl;
+ for (std::map<std::string, int>::iterator i = type_count.begin(); i != type_count.end(); ++i) {
+ std::cout << " " << i->first << ": " << i->second << std::endl;
+ }
+
+ std::cout << "Errors: " << scan.errors.size() << std::endl;
+ if (!scan.errors.empty()) {
+ for (JournalScanner::ErrorMap::const_iterator i = scan.errors.begin();
+ i != scan.errors.end(); ++i) {
+ std::cout << " 0x" << std::hex << i->first << std::dec
+ << ": " << i->second.r << " "
+ << i->second.description << std::endl;
+ }
+ }
+}
diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h
new file mode 100644
index 00000000..65d96840
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.h
@@ -0,0 +1,42 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef EVENT_OUTPUT_H
+#define EVENT_OUTPUT_H
+
+#include <string>
+
+class JournalScanner;
+
+/**
+ * Different output formats for the results of a journal scan
+ */
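+//
+// Usage sketch (illustrative only; assumes 'scanner' is a JournalScanner on
+// which scan() has already been run):
+//
+//   EventOutput out(scanner, "dump");
+//   int r = out.json();    // or out.binary(); out.summary()/out.list()
+//                          // print to stdout rather than to 'dump'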
+class EventOutput
+{
+ private:
+ JournalScanner const &scan;
+ std::string const path;
+
+ public:
+ EventOutput(JournalScanner const &scan_, std::string const &path_)
+ : scan(scan_), path(path_) {}
+
+ void summary() const;
+ void list() const;
+ int json() const;
+ int binary() const;
+};
+
+#endif // EVENT_OUTPUT_H
+
diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc
new file mode 100644
index 00000000..266d7fcc
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.cc
@@ -0,0 +1,315 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "JournalFilter.h"
+
+#include "common/ceph_argparse.h"
+
+#include "mds/events/ESession.h"
+#include "mds/events/EUpdate.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+const string JournalFilter::range_separator("..");
+
+bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const
+{
+ /* Filtering by journal offset range */
+ if (pos < range_start || pos >= range_end) {
+ return false;
+ }
+
+ if (purge_action != PurgeItem::NONE) {
+ if (pi.action != purge_action)
+ return false;
+ }
+
+ if (inode) {
+ if (inode != pi.ino)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Return whether a LogEvent is to be included or excluded.
+ *
+ * The filter parameters are applied on an AND basis: if any
+ * condition is not met, the event is excluded. Try to do
+ * the fastest checks first.
+ */
+bool JournalFilter::apply(uint64_t pos, LogEvent &le) const
+{
+ /* Filtering by journal offset range */
+ if (pos < range_start || pos >= range_end) {
+ return false;
+ }
+
+ /* Filtering by event type */
+ if (event_type != 0) {
+ if (le.get_type() != event_type) {
+ return false;
+ }
+ }
+
+ /* Filtering by client */
+ if (client_name.num()) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ if (metablob->get_client_name() != client_name) {
+ return false;
+ }
+ } else if (le.get_type() == EVENT_SESSION) {
+ ESession *es = reinterpret_cast<ESession*>(&le);
+ if (es->get_client_inst().name != client_name) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ /* Filtering by inode */
+ if (inode) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ std::set<inodeno_t> inodes;
+ metablob->get_inodes(inodes);
+ bool match_any = false;
+ for (std::set<inodeno_t>::iterator i = inodes.begin(); i != inodes.end(); ++i) {
+ if (*i == inode) {
+ match_any = true;
+ break;
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ /* Filtering by frag and dentry */
+ if (!frag_dentry.empty() || frag.ino) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ std::map<dirfrag_t, std::set<std::string> > dentries;
+ metablob->get_dentries(dentries);
+
+ if (frag.ino) {
+ bool match_any = false;
+ for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin();
+ i != dentries.end(); ++i) {
+ if (i->first == frag) {
+ match_any = true;
+ break;
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ }
+
+ if (!frag_dentry.empty()) {
+ bool match_any = false;
+ for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin();
+ i != dentries.end() && !match_any; ++i) {
+ std::set<std::string> const &names = i->second;
+ for (std::set<std::string>::iterator j = names.begin();
+ j != names.end() && !match_any; ++j) {
+ if (*j == frag_dentry) {
+ match_any = true;
+ }
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ }
+
+ } else {
+ return false;
+ }
+ }
+
+ /* Filtering by file path */
+ if (!path_expr.empty()) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ std::vector<std::string> paths;
+ metablob->get_paths(paths);
+ bool match_any = false;
+ for (std::vector<std::string>::iterator p = paths.begin(); p != paths.end(); ++p) {
+ if ((*p).find(path_expr) != std::string::npos) {
+ match_any = true;
+ break;
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+int JournalFilter::parse_args(
+ std::vector<const char*> &argv,
+ std::vector<const char*>::iterator &arg)
+{
+ while(arg != argv.end()) {
+ std::string arg_str;
+ if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) {
+ size_t sep_loc = arg_str.find(JournalFilter::range_separator);
+ if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) {
+ derr << "Invalid range '" << arg_str << "'" << dendl;
+ return -EINVAL;
+ }
+
+ // We have a lower bound
+ if (sep_loc > 0) {
+ std::string range_start_str = arg_str.substr(0, sep_loc);
+ std::string parse_err;
+ range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+ }
+
+ if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) {
+ std::string range_end_str = arg_str.substr(sep_loc + range_separator.size());
+ std::string parse_err;
+ range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid upper bound '" << range_end_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+ }
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl;
+ return -EINVAL;
+ }
+ dout(4) << "Filtering by path '" << arg_str << "'" << dendl;
+ path_expr = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) {
+ dout(4) << "Filtering by inode '" << arg_str << "'" << dendl;
+ std::string parse_err;
+ inode = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) {
+ try {
+ if (!type.compare("mdlog")) {
+ event_type = LogEvent::str_to_type(arg_str);
+ } else if (!type.compare("purge_queue")) {
+ purge_action = PurgeItem::str_to_type(arg_str);
+ }
+ } catch (const std::out_of_range&) {
+ derr << "Invalid event type '" << arg_str << "'" << dendl;
+ return -EINVAL;
+ }
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." << dendl;
+ return -EINVAL;
+ }
+ std::string const frag_sep = ".";
+ size_t sep_loc = arg_str.find(frag_sep);
+ std::string inode_str;
+ std::string frag_str;
+ if (sep_loc != std::string::npos) {
+ inode_str = arg_str.substr(0, sep_loc);
+ frag_str = arg_str.substr(sep_loc + 1);
+ } else {
+ inode_str = arg_str;
+ frag_str = "0";
+ }
+
+ std::string parse_err;
+ inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ frag = dirfrag_t(frag_ino, frag_t(frag_enc));
+ dout(4) << "dirfrag filter: '" << frag << "'" << dendl;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl;
+ return -EINVAL;
+ }
+ frag_dentry = arg_str;
+ dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl;
+ return -EINVAL;
+ }
+
+ std::string parse_err;
+ int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid client number " << arg_str << dendl;
+ return -EINVAL;
+ }
+ client_name = entity_name_t::CLIENT(client_num);
+ } else {
+ // We're done with args the filter understands
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * If the filter params are only range, then return
+ * true and set start & end. Else return false.
+ *
+ * Use this to discover if the user has requested a contiguous range
+ * rather than any per-event filtering.
+ */
+bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const
+{
+ if (!path_expr.empty()
+ || inode != 0
+ || event_type != 0
+ || frag.ino != 0
+ || client_name.num() != 0
+ || (range_start == 0 && range_end == (uint64_t)(-1))) {
+ return false;
+ } else {
+ start = range_start;
+ end = range_end;
+ return true;
+ }
+}
diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h
new file mode 100644
index 00000000..f7a2db61
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.h
@@ -0,0 +1,73 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#ifndef JOURNAL_FILTER_H
+#define JOURNAL_FILTER_H
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/PurgeQueue.h"
+
+/**
+ * A set of conditions for narrowing down a search through the journal
+ */
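+//
+// Usage sketch (illustrative only; 'argv' and 'arg' mirror the argument
+// vector handling in JournalTool):
+//
+//   JournalFilter filter("mdlog");
+//   if (filter.parse_args(argv, arg) == 0) {
+//     // ... later, for each scanned entry at offset 'pos':
+//     //   if (filter.apply(pos, *log_event)) { /* keep it */ }
+//   }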
+class JournalFilter
+{
+ private:
+
+ /* Filtering by journal offset range */
+ uint64_t range_start;
+ uint64_t range_end;
+ static const std::string range_separator;
+
+ /* Filtering by file (sub) path */
+ std::string path_expr;
+
+ /* Filtering by inode */
+ inodeno_t inode;
+
+ /* Filtering by type */
+ LogEvent::EventType event_type;
+
+ std::string type;
+
+ /* Filtering by PurgeItem::Action */
+ PurgeItem::Action purge_action;
+
+ /* Filtering by dirfrag */
+ dirfrag_t frag;
+ std::string frag_dentry; //< optional, filter dentry name within fragment
+
+ /* Filtering by metablob client name */
+ entity_name_t client_name;
+
+ public:
+ JournalFilter(std::string t) :
+ range_start(0),
+ range_end(-1),
+ inode(0),
+ event_type(0),
+ type(t),
+ purge_action(PurgeItem::NONE) {}
+
+ bool get_range(uint64_t &start, uint64_t &end) const;
+ bool apply(uint64_t pos, LogEvent &le) const;
+ bool apply(uint64_t pos, PurgeItem &pi) const;
+ int parse_args(
+ std::vector<const char*> &argv,
+ std::vector<const char*>::iterator &arg);
+};
+
+#endif // JOURNAL_FILTER_H
+
diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc
new file mode 100644
index 00000000..ea9d6ddf
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.cc
@@ -0,0 +1,438 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include "include/rados/librados.hpp"
+#include "mds/JournalPointer.h"
+
+#include "mds/events/ESubtreeMap.h"
+#include "mds/PurgeQueue.h"
+
+#include "JournalScanner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+/**
+ * Read journal header, followed by sequential scan through journal space.
+ *
+ * Return 0 on success, else error code. Note that success has the special meaning
+ * that we were able to apply our checks; it does *not* mean that the journal is
+ * healthy.
+ */
+int JournalScanner::scan(bool const full)
+{
+ int r = 0;
+
+ r = set_journal_ino();
+ if (r < 0) {
+ return r;
+ }
+
+ if (!is_mdlog || pointer_present) {
+ r = scan_header();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (full && header_present) {
+ r = scan_events();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
+int JournalScanner::set_journal_ino()
+{
+ int r = 0;
+ if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + rank;
+ }
+ else if (type == "mdlog"){
+ r = scan_pointer();
+ is_mdlog = true;
+ }
+ else {
+ ceph_abort(); // should not get here
+ }
+ return r;
+}
+
+int JournalScanner::scan_pointer()
+{
+ // Issue read
+ std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0);
+ bufferlist pointer_bl;
+ int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0);
+ if (r == -ENOENT) {
+ // 'Successfully' discovered the pointer is missing.
+ derr << "Pointer " << pointer_oid << " is absent" << dendl;
+ return 0;
+ } else if (r < 0) {
+ // Error preventing us interrogating pointer
+ derr << "Pointer " << pointer_oid << " is unreadable" << dendl;
+ return r;
+ } else {
+ dout(4) << "Pointer " << pointer_oid << " is readable" << dendl;
+ pointer_present = true;
+
+ JournalPointer jp;
+ try {
+ auto q = pointer_bl.cbegin();
+ jp.decode(q);
+ } catch(buffer::error &e) {
+ derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl;
+ return 0;
+ }
+
+ pointer_valid = true;
+ ino = jp.front;
+ return 0;
+ }
+}
+
+
+int JournalScanner::scan_header()
+{
+ int r;
+
+ bufferlist header_bl;
+ std::string header_name = obj_name(0);
+ dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl;
+ r = io.read(header_name, header_bl, INT_MAX, 0);
+ if (r < 0) {
+ derr << "Header " << header_name << " is unreadable" << dendl;
+ return 0; // "Successfully" found an error
+ } else {
+ header_present = true;
+ }
+
+ auto header_bl_i = header_bl.cbegin();
+ header = new Journaler::Header();
+ try
+ {
+ header->decode(header_bl_i);
+ }
+ catch (buffer::error &e)
+ {
+ derr << "Header is corrupt (" << e.what() << ")" << dendl;
+ delete header;
+ header = NULL;
+ return 0; // "Successfully" found an error
+ }
+
+ if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) {
+ derr << "Header is corrupt (bad magic)" << dendl;
+ return 0; // "Successfully" found an error
+ }
+ if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) {
+ derr << "Header is invalid (inconsistent offsets)" << dendl;
+ return 0; // "Successfully" found an error
+ }
+ header_valid = true;
+
+ return 0;
+}
+
+
+int JournalScanner::scan_events()
+{
+ uint64_t object_size = g_conf()->mds_log_segment_size;
+ if (object_size == 0) {
+ // Default layout object size
+ object_size = file_layout_t::get_default().object_size;
+ }
+
+ uint64_t read_offset = header->expire_pos;
+ dout(10) << std::hex << "Header 0x"
+ << header->trimmed_pos << " 0x"
+ << header->expire_pos << " 0x"
+ << header->write_pos << std::dec << dendl;
+ dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl;
+
+ // TODO also check for extraneous objects before the trimmed pos or after the write pos,
+ // which would indicate a bogus header.
+
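+  // The loop below walks object-by-object from the object containing
+  // expire_pos up to the one containing write_pos.  Object data is appended
+  // to read_buf and entries are consumed from its front; missing objects and
+  // undecodable entries are recorded (objects_missing / ranges_invalid) and
+  // scanning re-synchronises on the JournalStream sentinel.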
+ bufferlist read_buf;
+ bool gap = false;
+ uint64_t gap_start = -1;
+ for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) {
+ uint64_t offset_in_obj = 0;
+ if (obj_offset * object_size < header->expire_pos) {
+ // Skip up to expire_pos from start of the object
+ // (happens for the first object we read)
+ offset_in_obj = header->expire_pos - obj_offset * object_size;
+ }
+
+ // Read this journal segment
+ bufferlist this_object;
+ std::string const oid = obj_name(obj_offset);
+ int r = io.read(oid, this_object, INT_MAX, offset_in_obj);
+
+ // Handle absent journal segments
+ if (r < 0) {
+ if (obj_offset > (header->write_pos / object_size)) {
+ dout(4) << "Reached end of journal objects" << dendl;
+ break;
+ } else {
+ derr << "Missing object " << oid << dendl;
+ }
+
+ objects_missing.push_back(obj_offset);
+ if (!gap) {
+ gap_start = read_offset;
+ gap = true;
+ }
+ if (read_buf.length() > 0) {
+ read_offset += read_buf.length();
+ read_buf.clear();
+ }
+ read_offset += object_size - offset_in_obj;
+ continue;
+ } else {
+ dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec
+ << " bytes from " << oid << " gap=" << gap << dendl;
+ objects_valid.push_back(oid);
+ this_object.copy(0, this_object.length(), read_buf);
+ }
+
+ if (gap) {
+ // No valid data at the current read offset, scan forward until we find something valid looking
+ // or have to drop out to load another object.
+ dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset
+ << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl;
+
+ do {
+ auto p = read_buf.cbegin();
+ uint64_t candidate_sentinel;
+ decode(candidate_sentinel, p);
+
+ dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl;
+
+ if (candidate_sentinel == JournalStream::sentinel) {
+ dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl;
+ ranges_invalid.push_back(Range(gap_start, read_offset));
+ gap = false;
+ break;
+ } else {
+ // No sentinel, discard this byte
+ read_buf.splice(0, 1);
+ read_offset += 1;
+ }
+ } while (read_buf.length() >= sizeof(JournalStream::sentinel));
+ dout(4) << "read_buf size is " << read_buf.length() << dendl;
+ }
+ {
+ dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl;
+ while(true) {
+ // TODO: detect and handle legacy format journals: can do many things
+ // on them but on read errors have to give up instead of searching
+ // for sentinels.
+ JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT);
+ bool readable = false;
+ try {
+ uint64_t need;
+ readable = journal_stream.readable(read_buf, &need);
+ } catch (buffer::error &e) {
+ readable = false;
+ dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl;
+ gap = true;
+ gap_start = read_offset;
+ read_buf.splice(0, 1);
+ read_offset += 1;
+ break;
+ }
+
+ if (!readable) {
+ // Out of data, continue to read next object
+ break;
+ }
+
+ bufferlist le_bl; //< Serialized LogEvent blob
+ dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl;
+ // This cannot fail to decode because we pre-checked that a serialized entry
+ // blob would be readable.
+ uint64_t start_ptr = 0;
+ uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr);
+ dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl;
+ if (start_ptr != read_offset) {
+ derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x"
+ << read_offset << std::dec << dendl;
+ gap = true;
+ gap_start = read_offset;
+ // FIXME: given that entry was invalid, should we be skipping over it?
+ // maybe push bytes back onto start of read_buf and just advance one byte
+ // to start scanning instead. e.g. if a bogus size value is found it can
+ // cause us to consume and thus skip a bunch of following valid events.
+ read_offset += consumed;
+ break;
+ }
+ bool valid_entry = true;
+ if (is_mdlog) {
+ auto le = LogEvent::decode_event(le_bl.cbegin());
+
+ if (le) {
+ dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+
+ if (le->get_type() == EVENT_SUBTREEMAP
+ || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+ auto&& sle = dynamic_cast<ESubtreeMap&>(*le);
+ if (sle.expire_pos > read_offset) {
+ errors.insert(std::make_pair(
+ read_offset, EventError(
+ -ERANGE,
+ "ESubtreeMap has expire_pos ahead of its own position")));
+ }
+ }
+
+ if (filter.apply(read_offset, *le)) {
+ events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed));
+ }
+ } else {
+ valid_entry = false;
+ }
+ } else if (type == "purge_queue"){
+ auto pi = std::make_unique<PurgeItem>();
+ try {
+ auto q = le_bl.cbegin();
+ pi->decode(q);
+ if (filter.apply(read_offset, *pi)) {
+ events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed));
+ }
+ } catch (const buffer::error &err) {
+ valid_entry = false;
+ }
+ } else {
+ ceph_abort(); // should not get here
+ }
+ if (!valid_entry) {
+ dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+ gap = true;
+ gap_start = read_offset;
+ read_offset += consumed;
+ break;
+ } else {
+ events_valid.push_back(read_offset);
+ read_offset += consumed;
+ }
+ }
+ }
+ }
+
+ if (gap) {
+ // Ended on a gap, assume it ran to end
+ ranges_invalid.push_back(Range(gap_start, -1));
+ }
+
+ dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl;
+ dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl;
+ dout(4) << "Found " << events_valid.size() << " valid events" << dendl;
+ dout(4) << "Selected " << events.size() << " events events for processing" << dendl;
+
+ return 0;
+}
+
+
+JournalScanner::~JournalScanner()
+{
+ if (header) {
+ delete header;
+ header = NULL;
+ }
+ dout(4) << events.size() << " events" << dendl;
+ events.clear();
+}
+
+
+/**
+ * Whether the journal data looks valid and replayable
+ */
+bool JournalScanner::is_healthy() const
+{
+ return ((!is_mdlog || (pointer_present && pointer_valid))
+ && header_present && header_valid
+ && ranges_invalid.empty()
+ && objects_missing.empty());
+}
+
+
+/**
+ * Whether the journal data can be read from RADOS
+ */
+bool JournalScanner::is_readable() const
+{
+ return (header_present && header_valid && objects_missing.empty());
+}
+
+
+/**
+ * Calculate the object name for a given offset
+ */
+std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%llx.%08llx",
+ (unsigned long long)(ino),
+ (unsigned long long)offset);
+ return std::string(name);
+}
+
+
+std::string JournalScanner::obj_name(uint64_t offset) const
+{
+ return obj_name(ino, offset);
+}
+
+
+/*
+ * Write a human readable summary of the journal health
+ */
+void JournalScanner::report(std::ostream &out) const
+{
+ out << "Overall journal integrity: " << (is_healthy() ? "OK" : "DAMAGED") << std::endl;
+
+ if (is_mdlog) {
+ if (!pointer_present) {
+ out << "Pointer not found" << std::endl;
+ } else if (!pointer_valid) {
+ out << "Pointer could not be decoded" << std::endl;
+ }
+ }
+ if (!header_present) {
+ out << "Header not found" << std::endl;
+ } else if (!header_valid) {
+ out << "Header could not be decoded" << std::endl;
+ }
+
+ if (objects_missing.size()) {
+ out << "Objects missing:" << std::endl;
+ for (std::vector<uint64_t>::const_iterator om = objects_missing.begin();
+ om != objects_missing.end(); ++om) {
+ out << " 0x" << std::hex << *om << std::dec << std::endl;
+ }
+ }
+
+ if (ranges_invalid.size()) {
+ out << "Corrupt regions:" << std::endl;
+ for (std::vector<Range>::const_iterator r = ranges_invalid.begin();
+ r != ranges_invalid.end(); ++r) {
+ out << " 0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl;
+ }
+ }
+}
+
diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h
new file mode 100644
index 00000000..9197b559
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.h
@@ -0,0 +1,133 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef JOURNAL_SCANNER_H
+#define JOURNAL_SCANNER_H
+
+#include "include/rados/librados_fwd.hpp"
+
+// For Journaler::Header, can't forward-declare nested classes
+#include <osdc/Journaler.h>
+
+#include "JournalFilter.h"
+
+/**
+ * A simple sequential reader for metadata journals. Unlike
+ * the MDS Journaler class, this is written to detect, record,
+ * and read past corruptions and missing objects. It is also
+ * less efficient but more plainly written.
+ */
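+//
+// Usage sketch (illustrative only; 'ioctx' is an assumed librados::IoCtx
+// opened on the metadata pool):
+//
+//   JournalScanner scanner(ioctx, 0, "mdlog");  // rank 0, default filter
+//   if (scanner.scan() == 0) {
+//     scanner.report(std::cout);
+//     // scanner.events maps journal offsets to decoded entries
+//   }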
+class JournalScanner
+{
+ private:
+ librados::IoCtx &io;
+
+ // Input constraints
+ const int rank;
+ std::string type;
+ JournalFilter const filter;
+
+ void gap_advance();
+
+ public:
+ JournalScanner(
+ librados::IoCtx &io_,
+ int rank_,
+ const std::string &type_,
+ JournalFilter const &filter_) :
+ io(io_),
+ rank(rank_),
+ type(type_),
+ filter(filter_),
+ is_mdlog(false),
+ pointer_present(false),
+ pointer_valid(false),
+ header_present(false),
+ header_valid(false),
+ header(NULL) {};
+
+ JournalScanner(
+ librados::IoCtx &io_,
+ int rank_,
+ const std::string &type_) :
+ io(io_),
+ rank(rank_),
+ type(type_),
+ filter(type_),
+ is_mdlog(false),
+ pointer_present(false),
+ pointer_valid(false),
+ header_present(false),
+ header_valid(false),
+ header(NULL) {};
+
+ ~JournalScanner();
+
+ int set_journal_ino();
+ int scan(bool const full=true);
+ int scan_pointer();
+ int scan_header();
+ int scan_events();
+ void report(std::ostream &out) const;
+
+ std::string obj_name(uint64_t offset) const;
+ std::string obj_name(inodeno_t ino, uint64_t offset) const;
+
+ // The results of the scan
+  inodeno_t ino; // Journal ino, determined by the rank and journal type
+ struct EventRecord {
+ EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {}
+ EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {}
+ std::unique_ptr<LogEvent> log_event;
+ std::unique_ptr<PurgeItem> pi;
+ uint32_t raw_size = 0; //< Size from start offset including all encoding overhead
+ };
+
+ class EventError {
+ public:
+ int r;
+ std::string description;
+ EventError(int r_, const std::string &desc_)
+ : r(r_), description(desc_) {}
+ };
+
+ typedef std::map<uint64_t, EventRecord> EventMap;
+ typedef std::map<uint64_t, EventError> ErrorMap;
+ typedef std::pair<uint64_t, uint64_t> Range;
+ bool is_mdlog;
+ bool pointer_present; //mdlog specific
+ bool pointer_valid; //mdlog specific
+ bool header_present;
+ bool header_valid;
+ Journaler::Header *header;
+
+ bool is_healthy() const;
+ bool is_readable() const;
+ std::vector<std::string> objects_valid;
+ std::vector<uint64_t> objects_missing;
+ std::vector<Range> ranges_invalid;
+ std::vector<uint64_t> events_valid;
+ EventMap events;
+
+ // For events present in ::events (i.e. scanned successfully),
+ // any subsequent errors handling them (e.g. replaying)
+ ErrorMap errors;
+
+
+ private:
+ // Forbid copy construction because I have ptr members
+ JournalScanner(const JournalScanner &rhs);
+};
+
+#endif // JOURNAL_SCANNER_H
+
diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc
new file mode 100644
index 00000000..f6d7c411
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.cc
@@ -0,0 +1,1256 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+
+#include <sstream>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "JournalScanner.h"
+#include "EventOutput.h"
+#include "Dumper.h"
+#include "Resetter.h"
+
+#include "JournalTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+
+
+void JournalTool::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-journal-tool [options] journal <command>\n"
+ << " <command>:\n"
+ << " inspect\n"
+ << " import <path> [--force]\n"
+ << " export <path>\n"
+ << " reset [--force]\n"
+ << " cephfs-journal-tool [options] header <get|set> <field> <value>\n"
+ << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
+ << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
+ << " <selector>:\n"
+ << " --range=<start>..<end>\n"
+ << " --path=<substring>\n"
+ << " --inode=<integer>\n"
+ << " --type=<UPDATE|OPEN|SESSION...><\n"
+ << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
+ << " --client=<session id integer>\n"
+ << " <effect>: [get|recover_dentries|splice]\n"
+ << " <output>: [summary|list|binary|json] [--path <path>]\n"
+ << "\n"
+ << "General options:\n"
+ << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n"
+ << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n"
+ << " this journal is used to queue for purge operation,\n"
+ << " default is mdlog, and only mdlog support event mode)\n"
+ << "\n"
+ << "Special options\n"
+ << " --alternate-pool <name> Alternative metadata pool to target\n"
+ << " when using recover_dentries.\n";
+
+ generic_client_usage();
+}
+
+
+/**
+ * Handle arguments and hand off to journal/header/event mode
+ */
+int JournalTool::main(std::vector<const char*> &argv)
+{
+ int r;
+
+ dout(10) << "JournalTool::main " << dendl;
+ // Common arg parsing
+ // ==================
+ if (argv.empty()) {
+ cerr << "missing positional argument" << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+
+ std::string rank_str;
+ if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+ derr << "missing mandatory \"--rank\" argument" << dendl;
+ return -EINVAL;
+ }
+
+ if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) {
+ // Default is mdlog
+ type = "mdlog";
+ }
+
+ r = validate_type(type);
+ if (r != 0) {
+ derr << "journal type is not correct." << dendl;
+ return r;
+ }
+
+ r = role_selector.parse(*fsmap, rank_str, false);
+ if (r != 0) {
+ derr << "Couldn't determine MDS rank." << dendl;
+ return r;
+ }
+
+ std::string mode;
+ if (arg == argv.end()) {
+ derr << "Missing mode [journal|header|event]" << dendl;
+ return -EINVAL;
+ }
+ mode = std::string(*arg);
+ arg = argv.erase(arg);
+
+ // RADOS init
+ // ==========
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ ceph_assert(fs != nullptr);
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: creating IoCtx.." << dendl;
+ r = rados.ioctx_create(pool_name.c_str(), input);
+ ceph_assert(r == 0);
+ output.dup(input);
+
+ // Execution
+ // =========
+ // journal and header are general journal mode
+ // event mode is only specific for mdlog
+ auto roles = role_selector.get_roles();
+ if (roles.size() > 1) {
+ const std::string &command = argv[0];
+ bool allowed = can_execute_for_all_ranks(mode, command);
+ if (!allowed) {
+ derr << "operation not allowed for all ranks" << dendl;
+ return -EINVAL;
+ }
+
+ all_ranks = true;
+ }
+ for (auto role : roles) {
+ rank = role.rank;
+ std::vector<const char *> rank_argv(argv);
+ dout(4) << "Executing for rank " << rank << dendl;
+ if (mode == std::string("journal")) {
+ r = main_journal(rank_argv);
+ } else if (mode == std::string("header")) {
+ r = main_header(rank_argv);
+ } else if (mode == std::string("event")) {
+ r = main_event(rank_argv);
+ } else {
+ cerr << "Bad command '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+int JournalTool::validate_type(const std::string &type)
+{
+ if (type == "mdlog" || type == "purge_queue") {
+ return 0;
+ }
+ return -1;
+}
+
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+ if (!all_ranks) {
+ return prefix;
+ }
+
+ return prefix + "." + std::to_string(rank);
+}
+
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command) {
+ if (mode == "journal" && command == "import") {
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * Handle arguments for 'journal' mode
+ *
+ * This is for operations that act on the journal as a whole.
+ */
+int JournalTool::main_journal(std::vector<const char*> &argv)
+{
+ if (argv.empty()) {
+ derr << "Missing journal command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::string command = argv[0];
+ if (command == "inspect") {
+ return journal_inspect();
+ } else if (command == "export" || command == "import") {
+ bool force = false;
+ if (argv.size() >= 2) {
+ std::string const path = argv[1];
+ if (argv.size() == 3) {
+ if (std::string(argv[2]) == "--force") {
+ force = true;
+ } else {
+          std::cerr << "Unknown argument " << argv[2] << std::endl;
+ return -EINVAL;
+ }
+ }
+ return journal_export(path, command == "import", force);
+ } else {
+ derr << "Missing path" << dendl;
+ return -EINVAL;
+ }
+ } else if (command == "reset") {
+ bool force = false;
+ if (argv.size() == 2) {
+ if (std::string(argv[1]) == "--force") {
+ force = true;
+ } else {
+ std::cerr << "Unknown argument " << argv[1] << std::endl;
+ return -EINVAL;
+ }
+ } else if (argv.size() > 2) {
+ std::cerr << "Too many arguments!" << std::endl;
+ return -EINVAL;
+ }
+ return journal_reset(force);
+ } else {
+ derr << "Bad journal command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+}
+
+
+/**
+ * Parse arguments and execute for 'header' mode
+ *
+ * This is for operations that act on the header only.
+ */
+int JournalTool::main_header(std::vector<const char*> &argv)
+{
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ int r = js.scan(false);
+ if (r < 0) {
+ std::cerr << "Unable to scan journal" << std::endl;
+ return r;
+ }
+
+ if (!js.header_present) {
+ std::cerr << "Header object not found!" << std::endl;
+ return -ENOENT;
+ } else if (!js.header_valid && js.header == NULL) {
+ // Can't do a read or a single-field write without a copy of the original
+ derr << "Header could not be read!" << dendl;
+ return -ENOENT;
+ } else {
+ ceph_assert(js.header != NULL);
+ }
+
+ if (argv.empty()) {
+ derr << "Missing header command, must be [get|set]" << dendl;
+ return -EINVAL;
+ }
+ std::vector<const char *>::iterator arg = argv.begin();
+ std::string const command = *arg;
+ arg = argv.erase(arg);
+
+ if (command == std::string("get")) {
+ // Write JSON journal dump to stdout
+ JSONFormatter jf(true);
+ js.header->dump(&jf);
+ jf.flush(std::cout);
+ std::cout << std::endl;
+ } else if (command == std::string("set")) {
+ // Need two more args <key> <val>
+ if (argv.size() != 2) {
+ derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
+ return -EINVAL;
+ }
+
+ std::string const field_name = *arg;
+ arg = argv.erase(arg);
+
+ std::string const value_str = *arg;
+ arg = argv.erase(arg);
+ ceph_assert(argv.empty());
+
+ std::string parse_err;
+ uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t *field = NULL;
+ if (field_name == "trimmed_pos") {
+ field = &(js.header->trimmed_pos);
+ } else if (field_name == "expire_pos") {
+ field = &(js.header->expire_pos);
+ } else if (field_name == "write_pos") {
+ field = &(js.header->write_pos);
+ } else if (field_name == "pool_id") {
+ field = (uint64_t*)(&(js.header->layout.pool_id));
+ } else {
+ derr << "Invalid field '" << field_name << "'" << dendl;
+ return -EINVAL;
+ }
+
+ std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
+ *field = new_val;
+
+ dout(4) << "Writing object..." << dendl;
+ bufferlist header_bl;
+ encode(*(js.header), header_bl);
+ output.write_full(js.obj_name(0), header_bl);
+ dout(4) << "Write complete." << dendl;
+ std::cout << "Successfully updated header." << std::endl;
+ } else {
+ derr << "Bad header command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Parse arguments and execute for 'event' mode
+ *
+ * This is for operations that act on LogEvents within the log
+ */
+int JournalTool::main_event(std::vector<const char*> &argv)
+{
+ int r;
+
+ if (argv.empty()) {
+ derr << "Missing event command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+ bool dry_run = false;
+
+ std::string command = *(arg++);
+ if (command != "get" && command != "splice" && command != "recover_dentries") {
+ derr << "Unknown argument '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ if (command == "recover_dentries") {
+ if (type != "mdlog") {
+ derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl;
+ return -EINVAL;
+ } else {
+ if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
+ dry_run = true;
+ }
+ }
+ }
+
+ if (arg == argv.end()) {
+ derr << "Incomplete command line" << dendl;
+ return -EINVAL;
+ }
+
+ // Parse filter options
+ // ====================
+ JournalFilter filter(type);
+ r = filter.parse_args(argv, arg);
+ if (r) {
+ return r;
+ }
+
+ // Parse output options
+ // ====================
+ if (arg == argv.end()) {
+ cerr << "Missing output command" << std::endl;
+ return -EINVAL;
+ }
+ std::string output_style = *(arg++);
+ if (output_style != "binary" && output_style != "json" &&
+ output_style != "summary" && output_style != "list") {
+ cerr << "Unknown argument: '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string output_path = "dump";
+ while(arg != argv.end()) {
+ std::string arg_str;
+ if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+ output_path = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
+ nullptr)) {
+ dout(1) << "Using alternate pool " << arg_str << dendl;
+ int r = rados.ioctx_create(arg_str.c_str(), output);
+ ceph_assert(r == 0);
+ other_pool = true;
+ } else {
+ cerr << "Unknown argument: '" << *arg << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ const std::string dump_path = gen_dump_file_path(output_path);
+
+ // Execute command
+ // ===============
+ JournalScanner js(input, rank, type, filter);
+ if (command == "get") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+ } else if (command == "recover_dentries") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ /**
+ * Iterate over log entries, attempting to scavenge from each one
+ */
+ std::set<inodeno_t> consumed_inos;
+ for (JournalScanner::EventMap::iterator i = js.events.begin();
+ i != js.events.end(); ++i) {
+ auto& le = i->second.log_event;
+ EMetaBlob const *mb = le->get_metablob();
+ if (mb) {
+ int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
+ if (scav_r) {
+ dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
+ if (r == 0) {
+ r = scav_r;
+ }
+ // Our goal is to read all we can, so don't stop on errors, but
+ // do record them for possible later output
+ js.errors.insert(std::make_pair(i->first,
+ JournalScanner::EventError(scav_r, cpp_strerror(r))));
+ }
+ }
+ }
+
+ /**
+ * Update InoTable to reflect any inode numbers consumed during scavenge
+ */
+ dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
+ if (consumed_inos.size() && !dry_run) {
+ int consume_r = consume_inos(consumed_inos);
+ if (consume_r) {
+ dout(1) << "Error updating InoTable for " << consumed_inos.size()
+ << " consume inos: " << cpp_strerror(consume_r) << dendl;
+ if (r == 0) {
+ r = consume_r;
+ }
+ }
+ }
+
+ // Remove consumed dentries from lost+found.
+ if (other_pool && !dry_run) {
+ std::set<std::string> found;
+
+ for (auto i : consumed_inos) {
+ char s[20];
+
+ snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
+ dout(20) << "removing " << s << dendl;
+ found.insert(std::string(s));
+ }
+
+ object_t frag_oid;
+ frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
+ frag_t(), "");
+ output.omap_rm_keys(frag_oid.name, found);
+ }
+ } else if (command == "splice") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ uint64_t start, end;
+ if (filter.get_range(start, end)) {
+ // Special case for range filter: erase a numeric range in the log
+ uint64_t range = end - start;
+ int r = erase_region(js, start, range);
+ if (r) {
+ derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else {
+ // General case: erase a collection of individual entries in the log
+ for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
+ dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
+
+ int r = erase_region(js, i->first, i->second.raw_size);
+ if (r) {
+ derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+
+ } else {
+ cerr << "Unknown argument '" << command << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // Generate output
+ // ===============
+ EventOutput output(js, dump_path);
+ int output_result = 0;
+ if (output_style == "binary") {
+ output_result = output.binary();
+ } else if (output_style == "json") {
+ output_result = output.json();
+ } else if (output_style == "summary") {
+ output.summary();
+ } else if (output_style == "list") {
+ output.list();
+ } else {
+ std::cerr << "Bad output command '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (output_result != 0) {
+ std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
+ }
+
+ return output_result;
+}
+
+/**
+ * Provide the user with information about the condition of the journal,
+ * especially indicating what range of log events is available and where
+ * any gaps or corruptions in the journal are.
+ */
+int JournalTool::journal_inspect()
+{
+ int r;
+
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ r = js.scan();
+ if (r) {
+ std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
+ return r;
+ }
+
+ js.report(std::cout);
+
+ return 0;
+}
+
+
+/**
+ * Attempt to export a binary dump of the journal.
+ *
+ * This is allowed to fail if the header is malformed or some objects
+ * are inaccessible, in which case the user would have to fall
+ * back to manually listing RADOS objects and extracting them, which
+ * they can do with the ``rados`` CLI.
+ */
+int JournalTool::journal_export(std::string const &path, bool import, bool force)
+{
+ int r = 0;
+ JournalScanner js(input, rank, type);
+
+ if (!import) {
+ /*
+ * If doing an export, first check that the header is valid and
+ * no objects are missing before trying to dump
+ */
+ r = js.scan();
+ if (r < 0) {
+ derr << "Unable to scan journal, assuming badly damaged" << dendl;
+ return r;
+ }
+ if (!js.is_readable()) {
+ derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
+ return -EIO;
+ }
+ }
+
+ /*
+ * Assuming we can cleanly read the journal data, dump it out to a file
+ */
+ {
+ Dumper dumper;
+ r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type);
+ if (r < 0) {
+ derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (import) {
+ r = dumper.undump(path.c_str(), force);
+ } else {
+ const std::string ex_path = gen_dump_file_path(path);
+ r = dumper.dump(ex_path.c_str());
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Truncate journal and insert EResetJournal
+ */
+int JournalTool::journal_reset(bool hard)
+{
+ int r = 0;
+ Resetter resetter;
+ r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard);
+ if (r < 0) {
+ derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (hard) {
+ r = resetter.reset_hard();
+ } else {
+ r = resetter.reset();
+ }
+
+ return r;
+}
+
+
+/**
+ * Selective offline replay which only reads out dentries and writes
+ * them to the backing store iff their version is > what is currently
+ * in the backing store.
+ *
+ * In order to write dentries to the backing store, we may create the
+ * required enclosing dirfrag objects.
+ *
+ * Test this by running scavenge on an unflushed journal, then nuking
+ * it offline, then starting an MDS and seeing that the dentries are
+ * visible.
+ *
+ * @param metablob an EMetaBlob retrieved from the journal
+ * @param dry_run if true, do no writes to RADOS
+ * @param consumed_inos output, populated with any inos inserted
+ * @returns 0 on success, else negative error code
+ */
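+// Dirfrag omap layout assumed by the code below (keys like "foobar_head"):
+//   value = dnfirst, then 'I' followed by a bare InodeStore for a primary
+//   dentry, or 'L' followed by (ino, d_type) for a remote/hard link.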
+int JournalTool::recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos)
+{
+ ceph_assert(consumed_inos != NULL);
+
+ int r = 0;
+
+ // Replay fullbits (dentry+inode)
+ for (const auto& frag : metablob.lump_order) {
+ EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
+ lump._decode_bits();
+ object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
+
+ dout(4) << "inspecting lump " << frag_oid.name << dendl;
+
+
+ // We will record old fnode version for use in hard link handling
+ // If we don't read an old fnode, take version as zero and write in
+ // all hardlinks we find.
+ version_t old_fnode_version = 0;
+
+ // Update fnode in omap header of dirfrag object
+ bool write_fnode = false;
+ bufferlist old_fnode_bl;
+ r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
+ if (r == -ENOENT) {
+ // Creating dirfrag from scratch
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ write_fnode = true;
+ // Note: creating the dirfrag *without* a backtrace, relying on
+ // MDS to regenerate backtraces on read or in FSCK
+ } else if (r == 0) {
+ // Conditionally update existing omap header
+ fnode_t old_fnode;
+ auto old_fnode_iter = old_fnode_bl.cbegin();
+ try {
+ old_fnode.decode(old_fnode_iter);
+ dout(4) << "frag " << frag_oid.name << " fnode old v" <<
+ old_fnode.version << " vs new v" << lump.fnode.version << dendl;
+ old_fnode_version = old_fnode.version;
+ write_fnode = old_fnode_version < lump.fnode.version;
+ } catch (const buffer::error &err) {
+ dout(1) << "frag " << frag_oid.name
+ << " is corrupt, overwriting" << dendl;
+ write_fnode = true;
+ }
+ } else {
+ // Unexpected error
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if ((other_pool || write_fnode) && !dry_run) {
+ dout(4) << "writing fnode to omap header" << dendl;
+ bufferlist fnode_bl;
+ lump.fnode.encode(fnode_bl);
+ if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
+ r = output.omap_set_header(frag_oid.name, fnode_bl);
+ }
+ if (r != 0) {
+ derr << "Failed to write fnode for frag object "
+ << frag_oid.name << dendl;
+ return r;
+ }
+ }
+
+ std::set<std::string> read_keys;
+
+ // Compose list of potentially-existing dentries we would like to fetch
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for (const auto& nb : lump.get_dnull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ // Perform bulk read of existing dentries
+ std::map<std::string, bufferlist> read_vals;
+ r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ if (r == -ENOENT && other_pool) {
+ r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ }
+ if (r != 0) {
+ derr << "unexpected error reading fragment object "
+ << frag_oid.name << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Compose list of dentries we will write back
+ std::map<std::string, bufferlist> write_vals;
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L') {
+ // leave write_dentry false, we have no version to
+ // compare with in a hardlink, so it's not safe to
+ // squash over it with what's in this fullbit
+ dout(10) << "Existing remote inode in slot to be (maybe) written "
+ << "by a full inode from the journal dn '" << fb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+                   << " vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode.version;
+ } else if (dentry_type == 'I') {
+ // Read out inode version to compare with backing store
+ InodeStore inode;
+ inode.decode_bare(q);
+ dout(4) << "decoded embedded inode version "
+ << inode.inode.version << " vs fullbit version "
+ << fb.inode.version << dendl;
+ if (inode.inode.version < fb.inode.version) {
+ write_dentry = true;
+ }
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing I dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(fb.dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ encode_fullbit_as_inode(fb, true, &dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(fb.inode.ino);
+ }
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+                   << " vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode.version;
+ } else if (dentry_type == 'I') {
+ dout(10) << "Existing full inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+                   << " vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode.version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing L dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(rb.dnfirst, dentry_bl);
+ encode('L', dentry_bl);
+ encode(rb.ino, dentry_bl);
+ encode(rb.d_type, dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(rb.ino);
+ }
+ }
+
+ std::set<std::string> null_vals;
+ for (const auto& nb : lump.get_dnull()) {
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
+ << dendl;
+
+ auto it = read_vals.find(key);
+ if (it != read_vals.end()) {
+ dout(4) << "dentry exists, will remove" << dendl;
+
+ auto q = it->second.cbegin();
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ bool remove_dentry = false;
+ if (dentry_type == 'L') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+          << " vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode.version;
+ } else if (dentry_type == 'I') {
+ dout(10) << "Existing full inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode.version
+          << " vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode.version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, will remove" << dendl;
+ remove_dentry = true;
+ }
+
+ if (remove_dentry)
+ null_vals.insert(key);
+ }
+ }
+
+ // Write back any new/changed dentries
+ if (!write_vals.empty()) {
+ r = output.omap_set(frag_oid.name, write_vals);
+ if (r != 0) {
+ derr << "error writing dentries to " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // remove any null dentries
+ if (!null_vals.empty()) {
+ r = output.omap_rm_keys(frag_oid.name, null_vals);
+ if (r != 0) {
+ derr << "error removing dentries from " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ /* Now that we've looked at the dirlumps, we finally pay attention to
+ * the roots (i.e. inodes without ancestry). This is necessary in order
+ * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
+ * important because clients use them to infer completeness
+ * of directories
+ */
+ for (const auto& fb : metablob.roots) {
+ inodeno_t ino = fb.inode.ino;
+ dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
+
+ object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+ dout(4) << "object id " << root_oid.name << dendl;
+
+ bool write_root_ino = false;
+ bufferlist old_root_ino_bl;
+ r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
+ if (r == -ENOENT) {
+ dout(4) << "root does not exist, will create" << dendl;
+ write_root_ino = true;
+ } else if (r >= 0) {
+ r = 0;
+ InodeStore old_inode;
+ dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
+ << ")" << dendl;
+ auto inode_bl_iter = old_root_ino_bl.cbegin();
+ std::string magic;
+ decode(magic, inode_bl_iter);
+ if (magic == CEPH_FS_ONDISK_MAGIC) {
+ dout(4) << "magic ok" << dendl;
+ old_inode.decode(inode_bl_iter);
+
+ if (old_inode.inode.version < fb.inode.version) {
+ write_root_ino = true;
+ }
+ } else {
+ dout(4) << "magic bad: '" << magic << "'" << dendl;
+ write_root_ino = true;
+ }
+ } else {
+ derr << "error reading root inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (write_root_ino && !dry_run) {
+ dout(4) << "writing root ino " << root_oid.name
+ << " version " << fb.inode.version << dendl;
+
+ // Compose: root ino format is magic,InodeStore(bare=false)
+ bufferlist new_root_ino_bl;
+ encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
+ encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
+
+ // Write to RADOS
+ r = output.write_full(root_oid.name, new_root_ino_bl);
+ if (r != 0) {
+ derr << "error writing inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Erase a region of the log by overwriting it with ENoOp
+ *
+ */
+int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
+{
+ // To erase this region, we use our preamble, the encoding overhead
+ // of an ENoOp, and our trailing start ptr. Calculate how much padding
+ // is needed inside the ENoOp to make up the difference.
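+  // Worked example (illustrative; the preamble/postamble breakdown is an
+  // assumption here, cf. the FIXME below): if the RESILIENT stream frames an
+  // entry as a uint64_t sentinel + uint32_t length before the payload and a
+  // uint64_t start_ptr after it, then for length = 4096 and a hypothetical
+  // 20-byte empty entry the padding works out to
+  //   4096 - 20 - sizeof(uint32_t) - 2 * sizeof(uint64_t) = 4056 bytes.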
+ bufferlist tmp;
+ if (type == "mdlog") {
+ ENoOp enoop(0);
+ enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.encode(tmp);
+ }
+
+ dout(4) << "erase_region " << pos << " len=" << length << dendl;
+
+ // FIXME: get the preamble/postamble length via JournalStream
+ int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
+ dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
+
+ if (padding < 0) {
+    derr << "Erase region too short (" << length << " bytes)" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist entry;
+ if (type == "mdlog") {
+ // Serialize an ENoOp with the correct amount of padding
+ ENoOp enoop(padding);
+ enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.pad_size = padding;
+ pi.encode(entry);
+ }
+ JournalStream stream(JOURNAL_FORMAT_RESILIENT);
+ // Serialize region of log stream
+ bufferlist log_data;
+ stream.write(entry, &log_data, pos);
+
+ dout(4) << "erase_region data length " << log_data.length() << dendl;
+ ceph_assert(log_data.length() == length);
+
+ // Write log stream region to RADOS
+ // FIXME: get object size somewhere common to scan_events
+ uint32_t object_size = g_conf()->mds_log_segment_size;
+ if (object_size == 0) {
+ // Default layout object size
+ object_size = file_layout_t::get_default().object_size;
+ }
+
+ uint64_t write_offset = pos;
+ uint64_t obj_offset = (pos / object_size);
+ int r = 0;
+ while(log_data.length()) {
+ std::string const oid = js.obj_name(obj_offset);
+ uint32_t offset_in_obj = write_offset % object_size;
+ uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
+
+ r = output.write(oid, log_data, write_len, offset_in_obj);
+ if (r < 0) {
+ return r;
+ } else {
+ dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
+ r = 0;
+ }
+
+ log_data.splice(0, write_len);
+ write_offset += write_len;
+ obj_offset++;
+ }
+
+ return r;
+}
+
+/**
+ * Given an EMetaBlob::fullbit containing an inode, write out
+ * the encoded inode in the format used by InodeStore (i.e. the
+ * backing store format)
+ *
+ * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
+ * on an offline InodeStore instance. It's way simpler, because we are just
+ * uncritically hauling the data between structs.
+ *
+ * @param fb a fullbit extracted from a journal entry
+ * @param bare if true, leave out [EN|DE]CODE_START decoration
+ * @param out_bl output, write serialized inode to this bufferlist
+ */
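+// Illustrative call sites (a sketch of how recover_dentries() above uses
+// this helper; the buffers are the ones already declared there):
+//
+//   // 'I' dentry value in a dirfrag's omap: dnfirst, 'I', bare InodeStore
+//   encode(fb.dnfirst, dentry_bl);
+//   encode('I', dentry_bl);
+//   encode_fullbit_as_inode(fb, true, &dentry_bl);
+//
+//   // root inode object: magic string, then the decorated InodeStore
+//   encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
+//   encode_fullbit_as_inode(fb, false, &new_root_ino_bl);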
+void JournalTool::encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl)
+{
+ ceph_assert(out_bl != NULL);
+
+ // Compose InodeStore
+ InodeStore new_inode;
+ new_inode.inode = fb.inode;
+ new_inode.xattrs = fb.xattrs;
+ new_inode.dirfragtree = fb.dirfragtree;
+ new_inode.snap_blob = fb.snapbl;
+ new_inode.symlink = fb.symlink;
+ new_inode.old_inodes = fb.old_inodes;
+
+ // Serialize InodeStore
+ if (bare) {
+ new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else {
+ new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+}
+
+/**
+ * Given a list of inode numbers known to be in use by
+ * inodes in the backing store, ensure that none of these
+ * numbers are listed as free in the InoTables in the
+ * backing store.
+ *
+ * Used after injecting inodes into the backing store, to
+ * ensure that the same inode numbers are not subsequently
+ * used for new files during ordinary operation.
+ *
+ * @param inos list of inode numbers to be removed from
+ * free lists in InoTables
+ * @returns 0 on success, else negative error code
+ */
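+// Expected call pattern (a sketch; the event-replay driver that actually
+// wires these together is not part of this hunk):
+//   std::set<inodeno_t> consumed;
+//   recover_dentries(metablob, dry_run, &consumed);
+//   consume_inos(consumed);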
+int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
+{
+ int r = 0;
+
+ // InoTable is a per-MDS structure, so iterate over assigned ranks
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ std::set<mds_rank_t> in_ranks;
+ fs->mds_map.get_mds_set(in_ranks);
+
+ for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
+ rank_i != in_ranks.end(); ++rank_i)
+ {
+ // Compose object name
+ std::ostringstream oss;
+ oss << "mds" << *rank_i << "_inotable";
+ object_t inotable_oid = object_t(oss.str());
+
+ // Read object
+ bufferlist inotable_bl;
+ int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
+ if (read_r < 0) {
+ // Things are really bad if we can't read inotable. Beyond our powers.
+ derr << "unable to read inotable '" << inotable_oid.name << "': "
+ << cpp_strerror(read_r) << dendl;
+ r = r ? r : read_r;
+ continue;
+ }
+
+ // Deserialize InoTable
+ version_t inotable_ver;
+ auto q = inotable_bl.cbegin();
+ decode(inotable_ver, q);
+ InoTable ino_table(NULL);
+ ino_table.decode(q);
+
+ // Update InoTable in memory
+ bool inotable_modified = false;
+ for (std::set<inodeno_t>::iterator i = inos.begin();
+ i != inos.end(); ++i)
+ {
+ const inodeno_t ino = *i;
+ if (ino_table.force_consume(ino)) {
+ dout(4) << "Used ino 0x" << std::hex << ino << std::dec
+ << " requires inotable update" << dendl;
+ inotable_modified = true;
+ }
+ }
+
+ // Serialize and write InoTable
+ if (inotable_modified) {
+ inotable_ver += 1;
+ dout(4) << "writing modified inotable version " << inotable_ver << dendl;
+ bufferlist inotable_new_bl;
+ encode(inotable_ver, inotable_new_bl);
+ ino_table.encode_state(inotable_new_bl);
+ int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
+ if (write_r != 0) {
+ derr << "error writing modified inotable " << inotable_oid.name
+ << ": " << cpp_strerror(write_r) << dendl;
+        r = r ? r : write_r;
+ continue;
+ }
+ }
+ }
+
+ return r;
+}
+
diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h
new file mode 100644
index 00000000..8d610a86
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+
+#include "JournalFilter.h"
+
+class JournalScanner;
+
+
+/**
+ * Command line tool for investigating and repairing filesystems
+ * with damaged metadata logs
+ */
+class JournalTool : public MDSUtility
+{
+ private:
+ MDSRoleSelector role_selector;
+ // Bit hacky, use this `rank` member to control behaviour of the
+ // various main_ functions.
+ mds_rank_t rank;
+ // when set, generate per rank dump file path
+ bool all_ranks = false;
+
+ std::string type;
+
+ // Entry points
+ int main_journal(std::vector<const char*> &argv);
+ int main_header(std::vector<const char*> &argv);
+ int main_event(std::vector<const char*> &argv);
+
+ // Shared functionality
+ int recover_journal();
+
+ // Journal operations
+ int journal_inspect();
+ int journal_export(std::string const &path, bool import, bool force);
+ int journal_reset(bool hard);
+
+ // Header operations
+ int header_set();
+
+ // I/O handles
+ librados::Rados rados;
+ librados::IoCtx input;
+ librados::IoCtx output;
+
+ bool other_pool;
+
+ // Metadata backing store manipulation
+ int read_lost_found(std::set<std::string> &lost);
+ int recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos);
+
+ // Splicing
+ int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length);
+
+ // Backing store helpers
+ void encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl);
+ int consume_inos(const std::set<inodeno_t> &inos);
+
+ //validate type
+ int validate_type(const std::string &type);
+
+ // generate output file path for dump/export
+ std::string gen_dump_file_path(const std::string &prefix);
+
+ // check if an operation (mode, command) is safe to be
+ // executed on all ranks.
+ bool can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command);
+ public:
+ static void usage();
+ JournalTool() :
+ rank(0), other_pool(false) {}
+ int main(std::vector<const char*> &argv);
+};
+
diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc
new file mode 100644
index 00000000..b5a3219c
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.cc
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "mon/MonClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+MDSUtility::MDSUtility() :
+ Dispatcher(g_ceph_context),
+ objecter(NULL),
+ lock("MDSUtility::lock"),
+ finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"),
+ waiting_for_mds_map(NULL),
+ inited(false)
+{
+ monc = new MonClient(g_ceph_context);
+ messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
+ fsmap = new FSMap();
+ objecter = new Objecter(g_ceph_context, messenger, monc, NULL, 0, 0);
+}
+
+
+MDSUtility::~MDSUtility()
+{
+ if (inited) {
+ shutdown();
+ }
+ delete objecter;
+ delete monc;
+ delete messenger;
+ delete fsmap;
+ ceph_assert(waiting_for_mds_map == NULL);
+}
+
+
+int MDSUtility::init()
+{
+ // Initialize Messenger
+ messenger->start();
+
+ objecter->set_client_incarnation(0);
+ objecter->init();
+
+ // Connect dispatchers before starting objecter
+ messenger->add_dispatcher_tail(objecter);
+ messenger->add_dispatcher_tail(this);
+
+ // Initialize MonClient
+ if (monc->build_initial_monmap() < 0) {
+ objecter->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ return -1;
+ }
+
+ monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS);
+ monc->set_messenger(messenger);
+ monc->init();
+ int r = monc->authenticate();
+ if (r < 0) {
+ derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl;
+ monc->shutdown();
+ objecter->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ return r;
+ }
+
+ client_t whoami = monc->get_global_id();
+ messenger->set_myname(entity_name_t::CLIENT(whoami.v));
+
+ // Start Objecter and wait for OSD map
+ objecter->start();
+ objecter->wait_for_osd_map();
+
+ // Prepare to receive MDS map and request it
+ Mutex init_lock("MDSUtility:init");
+ Cond cond;
+ bool done = false;
+ ceph_assert(!fsmap->get_epoch());
+ lock.Lock();
+ waiting_for_mds_map = new C_SafeCond(&init_lock, &cond, &done, NULL);
+ lock.Unlock();
+ monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME);
+ monc->renew_subs();
+
+ // Wait for MDS map
+ dout(4) << "waiting for MDS map..." << dendl;
+ init_lock.Lock();
+ while (!done)
+ cond.Wait(init_lock);
+ init_lock.Unlock();
+ dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl;
+
+ finisher.start();
+
+ inited = true;
+ return 0;
+}
+
+
+void MDSUtility::shutdown()
+{
+ finisher.stop();
+
+ lock.Lock();
+ objecter->shutdown();
+ lock.Unlock();
+ monc->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+}
+
+
+bool MDSUtility::ms_dispatch(Message *m)
+{
+ Mutex::Locker locker(lock);
+ switch (m->get_type()) {
+ case CEPH_MSG_FS_MAP:
+ handle_fs_map((MFSMap*)m);
+ break;
+ case CEPH_MSG_OSD_MAP:
+ break;
+ default:
+ return false;
+ }
+ m->put();
+ return true;
+}
+
+
+void MDSUtility::handle_fs_map(MFSMap* m)
+{
+ *fsmap = m->get_fsmap();
+ if (waiting_for_mds_map) {
+ waiting_for_mds_map->complete(0);
+ waiting_for_mds_map = NULL;
+ }
+}
+
+
+bool MDSUtility::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
+{
+ if (dest_type == CEPH_ENTITY_TYPE_MON)
+ return true;
+
+ *authorizer = monc->build_authorizer(dest_type);
+ return *authorizer != NULL;
+}
diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h
new file mode 100644
index 00000000..e75a7192
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MDS_UTILITY_H_
+#define MDS_UTILITY_H_
+
+#include "osdc/Objecter.h"
+#include "mds/FSMap.h"
+#include "messages/MFSMap.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "auth/Auth.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+
+/// MDS Utility
+/**
+ * This class is the parent for MDS utilities, i.e. classes that
+ * need to access the objects belonging to the MDS without actually
+ * acting as an MDS daemon themselves.
+ */
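+// Rough lifecycle for subclasses (sketch only; "MyTool" is a placeholder,
+// see JournalTool/TableTool in this series for real users):
+//   MyTool tool;               // MyTool : public MDSUtility
+//   int r = tool.init();       // connect to mons, fetch the FSMap
+//   if (r == 0) {
+//     // ... use objecter / fsmap ...
+//   }                          // shutdown() runs from ~MDSUtility if inited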
+class MDSUtility : public Dispatcher {
+protected:
+ Objecter *objecter;
+ FSMap *fsmap;
+ Messenger *messenger;
+ MonClient *monc;
+
+ Mutex lock;
+ Finisher finisher;
+
+ Context *waiting_for_mds_map;
+
+ bool inited;
+public:
+ MDSUtility();
+ ~MDSUtility() override;
+
+ void handle_fs_map(MFSMap* m);
+ bool ms_dispatch(Message *m) override;
+ bool ms_handle_reset(Connection *con) override { return false; }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override { return false; }
+ bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer) override;
+ int init();
+ void shutdown();
+};
+
+#endif /* MDS_UTILITY_H_ */
diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc
new file mode 100644
index 00000000..2abca722
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.cc
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include "PgFiles.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "pgeffects." << __func__ << ": "
+
+int PgFiles::init()
+{
+ int r = ceph_create_with_context(&cmount, g_ceph_context);
+ if (r != 0) {
+ return r;
+ }
+
+ return ceph_init(cmount);
+}
+
+PgFiles::PgFiles(Objecter *o, const std::set<pg_t> &pgs_)
+ : objecter(o), pgs(pgs_)
+{
+ for (const auto &i : pgs) {
+ pools.insert(i.m_pool);
+ }
+}
+
+PgFiles::~PgFiles()
+{
+ ceph_release(cmount);
+}
+
+void PgFiles::hit_dir(std::string const &path)
+{
+ dout(10) << "entering " << path << dendl;
+
+ ceph_dir_result *dr = nullptr;
+ int r = ceph_opendir(cmount, path.c_str(), &dr);
+ if (r != 0) {
+ derr << "Failed to open path: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ struct dirent de;
+ while((r = ceph_readdir_r(cmount, dr, &de)) != 0) {
+ if (r < 0) {
+ derr << "Error reading path " << path << ": " << cpp_strerror(r)
+ << dendl;
+ ceph_closedir(cmount, dr); // best effort, ignore r
+ return;
+ }
+
+ if (std::string(de.d_name) == "." || std::string(de.d_name) == "..") {
+ continue;
+ }
+
+ struct ceph_statx stx;
+ std::string de_path = (path + std::string("/") + de.d_name);
+ r = ceph_statx(cmount, de_path.c_str(), &stx,
+ CEPH_STATX_INO|CEPH_STATX_SIZE, 0);
+ if (r != 0) {
+ derr << "Failed to stat path " << de_path << ": "
+ << cpp_strerror(r) << dendl;
+ // Don't hold up the whole process for one bad inode
+ continue;
+ }
+
+ if (S_ISREG(stx.stx_mode)) {
+ hit_file(de_path, stx);
+ } else if (S_ISDIR(stx.stx_mode)) {
+ hit_dir(de_path);
+ } else {
+ dout(20) << "Skipping non reg/dir file: " << de_path << dendl;
+ }
+ }
+
+ r = ceph_closedir(cmount, dr);
+ if (r != 0) {
+ derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl;
+ return;
+ }
+}
+
+void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx)
+{
+ ceph_assert(S_ISREG(stx.stx_mode));
+
+ dout(20) << "Hitting file '" << path << "'" << dendl;
+
+ int l_stripe_unit = 0;
+ int l_stripe_count = 0;
+ int l_object_size = 0;
+ int l_pool_id = 0;
+ int r = ceph_get_path_layout(cmount, path.c_str(), &l_stripe_unit,
+ &l_stripe_count, &l_object_size,
+ &l_pool_id);
+ if (r != 0) {
+ derr << "Error reading layout on " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+
+ struct file_layout_t layout;
+ layout.stripe_unit = l_stripe_unit;
+ layout.stripe_count = l_stripe_count;
+ layout.object_size = l_object_size;
+ layout.pool_id = l_pool_id;
+
+ // Avoid calculating PG if the layout targeted a completely different pool
+ if (pools.count(layout.pool_id) == 0) {
+ dout(20) << "Fast check missed: pool " << layout.pool_id << " not in "
+ "target set" << dendl;
+ return;
+ }
+
+ auto num_objects = Striper::get_num_objects(layout, stx.stx_size);
+
+ for (uint64_t i = 0; i < num_objects; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)stx.stx_ino,
+ (long long unsigned int)i);
+ dout(20) << " object " << std::string(buf) << dendl;
+
+ pg_t target;
+ object_t oid;
+ object_locator_t loc;
+ loc.pool = layout.pool_id;
+ loc.key = std::string(buf);
+
+ unsigned pg_num_mask = 0;
+ unsigned pg_num = 0;
+
+ int r = 0;
+ objecter->with_osdmap([&r, oid, loc, &target, &pg_num_mask, &pg_num]
+ (const OSDMap &osd_map) {
+ r = osd_map.object_locator_to_pg(oid, loc, target);
+ if (r == 0) {
+ auto pool = osd_map.get_pg_pool(loc.pool);
+ pg_num_mask = pool->get_pg_num_mask();
+ pg_num = pool->get_pg_num();
+ }
+ });
+ if (r != 0) {
+ // Can happen if layout pointed to pool not in osdmap, for example
+ continue;
+ }
+
+ target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask);
+
+ dout(20) << " target " << target << dendl;
+
+ if (pgs.count(target)) {
+ std::cout << path << std::endl;
+ return;
+ }
+ }
+
+}
+
+int PgFiles::scan_path(std::string const &path)
+{
+ int r = ceph_mount(cmount, "/");
+ if (r != 0) {
+ derr << "Failed to mount: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ hit_dir(path);
+
+ r = ceph_unmount(cmount);
+ if (r != 0) {
+ derr << "Failed to unmount: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+}
+
diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h
new file mode 100644
index 00000000..1ba4b3d2
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PG_EFFECTS_H_
+#define PG_EFFECTS_H_
+
+#include "include/cephfs/libcephfs.h"
+#include "osd/osd_types.h"
+#include <set>
+#include "osdc/Objecter.h"
+
+/**
+ * This utility scans the files (via an online MDS) and works out
+ * which ones rely on named PGs. For use when someone has
+ * some bad/damaged PGs and wants to see which files might be
+ * affected.
+ */
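+// Hedged usage sketch (the CLI driver lives elsewhere; variable names are
+// placeholders):
+//   std::set<pg_t> pgs = ...;           // the damaged PGs of interest
+//   PgFiles pg_files(objecter, pgs);
+//   if (pg_files.init() == 0) {
+//     pg_files.scan_path("/some/dir");  // prints paths touching those PGs
+//   }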
+class PgFiles
+{
+private:
+ Objecter *objecter;
+ struct ceph_mount_info *cmount = nullptr;
+
+ std::set<pg_t> pgs;
+ std::set<uint64_t> pools;
+
+ void hit_file(std::string const &path, const struct ceph_statx &stx);
+ void hit_dir(std::string const &path);
+
+
+public:
+ PgFiles(Objecter *o, const std::set<pg_t> &pgs_);
+ ~PgFiles();
+
+ int init();
+ int scan_path(std::string const &path);
+};
+
+#endif
+
diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc
new file mode 100644
index 00000000..8ab134f8
--- /dev/null
+++ b/src/tools/cephfs/Resetter.cc
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <memory>
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/JournalPointer.h"
+
+#include "mds/mdstypes.h"
+#include "mds/MDCache.h"
+#include "mon/MonClient.h"
+#include "mds/events/EResetJournal.h"
+
+#include "Resetter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+int Resetter::init(mds_role_t role_, const std::string &type, bool hard)
+{
+ role = role_;
+ int r = MDSUtility::init();
+ if (r < 0) {
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(nullptr != fs);
+
+ is_mdlog = false;
+ if (type == "mdlog") {
+ JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+ int rt = 0;
+ if (hard) {
+ jp.front = role.rank + MDS_INO_LOG_OFFSET;
+ jp.back = 0;
+ rt = jp.save(objecter);
+ if (rt != 0) {
+ derr << "Error writing journal pointer: " << cpp_strerror(rt) << dendl;
+ return rt;
+ }
+ ino = jp.front; // only need to reset ino for mdlog
+ } else {
+ rt = jp.load(objecter);
+ if (rt != 0) {
+ std::cerr << "Error loading journal: " << cpp_strerror(rt) <<
+ ", pass --force to forcibly reset this journal" << std::endl;
+ return rt;
+ } else {
+ ino = jp.front;
+ }
+ }
+ is_mdlog = true;
+ } else if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + role.rank;
+ } else {
+ ceph_abort(); // should not get here
+ }
+ return 0;
+}
+
+int Resetter::reset()
+{
+ Mutex mylock("Resetter::reset::lock");
+ Cond cond;
+ bool done;
+ int r;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ Journaler journaler("resetter", ino,
+ fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC,
+ objecter, 0, 0, &finisher);
+
+ lock.Lock();
+ journaler.recover(new C_SafeCond(&mylock, &cond, &done, &r));
+ lock.Unlock();
+
+ mylock.Lock();
+ while (!done)
+ cond.Wait(mylock);
+ mylock.Unlock();
+
+ if (r != 0) {
+ if (r == -ENOENT) {
+ cerr << "journal does not exist on-disk. Did you set a bad rank?"
+ << std::endl;
+ std::cerr << "Error loading journal: " << cpp_strerror(r) <<
+ ", pass --force to forcibly reset this journal" << std::endl;
+ return r;
+ } else {
+      cerr << "got error " << r << " from Journaler, failing" << std::endl;
+ return r;
+ }
+ }
+
+ lock.Lock();
+ uint64_t old_start = journaler.get_read_pos();
+ uint64_t old_end = journaler.get_write_pos();
+ uint64_t old_len = old_end - old_start;
+ cout << "old journal was " << old_start << "~" << old_len << std::endl;
+
+ uint64_t new_start = round_up_to(old_end+1, journaler.get_layout_period());
+ cout << "new journal start will be " << new_start
+ << " (" << (new_start - old_end) << " bytes past old end)" << std::endl;
+
+ journaler.set_read_pos(new_start);
+ journaler.set_write_pos(new_start);
+ journaler.set_expire_pos(new_start);
+ journaler.set_trimmed_pos(new_start);
+ journaler.set_writeable();
+
+ cout << "writing journal head" << std::endl;
+ journaler.write_head(new C_SafeCond(&mylock, &cond, &done, &r));
+ lock.Unlock();
+
+ mylock.Lock();
+ while (!done)
+ cond.Wait(mylock);
+ mylock.Unlock();
+
+ Mutex::Locker l(lock);
+ if (r != 0) {
+ return r;
+ }
+
+ if (is_mdlog) {
+    r = _write_reset_event(&journaler); // the reset event is specific to the mdlog journal
+ if (r != 0) {
+ return r;
+ }
+ }
+ cout << "done" << std::endl;
+
+ return 0;
+}
+
+int Resetter::reset_hard()
+{
+ auto fs = fsmap->get_filesystem(role.fscid);
+
+ Journaler journaler("resetter", ino,
+ fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC,
+ objecter, 0, 0, &finisher);
+ journaler.set_writeable();
+
+ file_layout_t default_log_layout = MDCache::gen_default_log_layout(
+ fsmap->get_filesystem(role.fscid)->mds_map);
+ journaler.create(&default_log_layout, g_conf()->mds_journal_format);
+
+ C_SaferCond cond;
+ {
+ Mutex::Locker l(lock);
+ journaler.write_head(&cond);
+ }
+
+ int r = cond.wait();
+ if (r != 0) {
+ derr << "Error writing journal header: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+  if (is_mdlog) // the reset event is specific to the mdlog journal
+ {
+ Mutex::Locker l(lock);
+ r = _write_reset_event(&journaler);
+ if (r != 0) {
+ derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (is_mdlog) {
+ dout(4) << "Successfully wrote new journal pointer and header for rank "
+ << role << dendl;
+ } else {
+ dout(4) << "Successfully wrote header for rank " << role << dendl;
+ }
+ return 0;
+}
+
+int Resetter::_write_reset_event(Journaler *journaler)
+{
+ ceph_assert(journaler != NULL);
+
+ auto le = std::make_unique<EResetJournal>();
+
+ bufferlist bl;
+ le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ cout << "writing EResetJournal entry" << std::endl;
+ journaler->append_entry(bl);
+
+ int ret;
+ {
+ C_SaferCond cond;
+ journaler->flush(&cond);
+ ret = cond.wait();
+ if (ret < 0)
+ return ret;
+ }
+ {
+ // wait until all journal prezero ops are done
+ C_SaferCond cond;
+ journaler->wait_for_prezero(&cond);
+ cond.wait();
+ }
+
+ return ret;
+}
+
diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h
new file mode 100644
index 00000000..6998e459
--- /dev/null
+++ b/src/tools/cephfs/Resetter.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef JOURNAL_RESETTER_H_
+#define JOURNAL_RESETTER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you reset an MDS journal for troubleshooting or
+ * disaster recovery.
+ *
+ * To use, create a Resetter, call init() with the role and journal type,
+ * and then call reset() (or reset_hard() when no usable journal exists).
+ */
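+// Sketch of the expected call order (illustrative; the cephfs-journal-tool
+// wrapper that drives this is not shown here):
+//   Resetter resetter;
+//   resetter.init(mds_role_t(fscid, rank), "mdlog", /*hard=*/false);
+//   resetter.reset();          // or reset_hard() after init(..., true)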
+class Resetter : public MDSUtility {
+private:
+ mds_role_t role;
+ inodeno_t ino;
+ bool is_mdlog;
+
+protected:
+ int _write_reset_event(Journaler *journaler);
+
+public:
+ Resetter() {}
+ ~Resetter() {}
+
+ int init(mds_role_t role_, const std::string &type, bool hard);
+ /**
+ * For use when no journal header/pointer was present: write one
+ * out from scratch.
+ */
+ int reset_hard();
+ int reset();
+};
+
+#endif /* JOURNAL_RESETTER_H_ */
diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc
new file mode 100644
index 00000000..e2d53b86
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.cc
@@ -0,0 +1,59 @@
+
+#include "RoleSelector.h"
+
+int MDSRoleSelector::parse_rank(
+ const FSMap &fsmap,
+ std::string const &str)
+{
+ if (str == "all" || str == "*") {
+ std::set<mds_rank_t> in;
+ const MDSMap &mds_map = fsmap.get_filesystem(fscid)->mds_map;
+ mds_map.get_mds_set(in);
+
+ for (auto rank : in) {
+ roles.push_back(mds_role_t(fscid, rank));
+ }
+
+ return 0;
+ } else {
+ std::string rank_err;
+ mds_rank_t rank = strict_strtol(str.c_str(), 10, &rank_err);
+ if (!rank_err.empty()) {
+ return -EINVAL;
+ }
+ if (fsmap.get_filesystem(fscid)->mds_map.is_dne(rank)) {
+ return -ENOENT;
+ }
+ roles.push_back(mds_role_t(fscid, rank));
+ return 0;
+ }
+}
+
+int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank)
+{
+ auto colon_pos = str.find(":");
+ if (colon_pos == std::string::npos) {
+ // An unqualified rank. Only valid if there is only one
+ // namespace.
+ if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) {
+ fscid = fsmap.get_filesystem()->fscid;
+ return parse_rank(fsmap, str);
+ } else {
+ return -EINVAL;
+ }
+ } else if (colon_pos == 0 || colon_pos == str.size() - 1) {
+ return -EINVAL;
+ } else {
+ const std::string ns_str = str.substr(0, colon_pos);
+ const std::string rank_str = str.substr(colon_pos + 1);
+ std::shared_ptr<const Filesystem> fs_ptr;
+ int r = fsmap.parse_filesystem(ns_str, &fs_ptr);
+ if (r != 0) {
+ return r;
+ }
+ fscid = fs_ptr->fscid;
+ return parse_rank(fsmap, rank_str);
+ }
+}
+
diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h
new file mode 100644
index 00000000..9090b720
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.h
@@ -0,0 +1,36 @@
+
+#ifndef ROLE_SELECTOR_H_
+#define ROLE_SELECTOR_H_
+
+#include <string>
+#include <vector>
+#include "mds/mdstypes.h"
+#include "mds/FSMap.h"
+
+/**
+ * Helper for letting the user act on a single rank within a filesystem
+ * (namespace), or on all of its ranks at once.
+ */
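+// Accepted selector strings, per parse()/parse_rank() in RoleSelector.cc
+// (example values are illustrative): "all" or "*" selects every active rank,
+// a bare rank such as "0" is allowed when only one filesystem exists, and
+// "<filesystem>:<rank>" qualifies the rank explicitly, e.g.
+//   MDSRoleSelector sel;
+//   sel.parse(fsmap, "cephfs:0");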
+class MDSRoleSelector
+{
+ public:
+ const std::vector<mds_role_t> &get_roles() const {return roles;}
+ int parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank=true);
+ MDSRoleSelector()
+ : fscid(FS_CLUSTER_ID_NONE)
+ {}
+ fs_cluster_id_t get_ns() const
+ {
+ return fscid;
+ }
+ protected:
+ int parse_rank(
+ const FSMap &fsmap,
+ std::string const &str);
+ std::vector<mds_role_t> roles;
+ fs_cluster_id_t fscid;
+};
+
+#endif // ROLE_SELECTOR_H_
+
diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc
new file mode 100644
index 00000000..e779b4b6
--- /dev/null
+++ b/src/tools/cephfs/TableTool.cc
@@ -0,0 +1,417 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "mds/SessionMap.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+
+#include "TableTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+void TableTool::usage()
+{
+ std::cout << "Usage: \n"
+    << "  cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>\n"
+ << " cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>"
+ << std::endl;
+
+ generic_client_usage();
+}
+
+
+/**
+ * For a function that takes an MDS role as an argument and
+ * returns an error code, execute it on the roles specified
+ * by `role_selector`.
+ */
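+// Call pattern (as used by TableTool::main() further down): the callback gets
+// each selected role plus a Formatter positioned inside that rank's "data"
+// section, e.g.
+//   apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+//     return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);
+//   }, &jf);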
+int TableTool::apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f)
+{
+ ceph_assert(f != NULL);
+
+ int r = 0;
+
+ f->open_object_section("ranks");
+
+ for (auto role : role_selector.get_roles()) {
+ std::ostringstream rank_str;
+ rank_str << role.rank;
+ f->open_object_section(rank_str.str().c_str());
+
+ f->open_object_section("data");
+ int rank_r = fptr(role, f);
+ f->close_section();
+ r = r ? r : rank_r;
+
+ f->dump_int("result", rank_r);
+ f->close_section();
+
+
+ }
+
+ f->close_section();
+
+ return r;
+}
+
+
+/**
+ * This class wraps an MDS table class (SessionMap, SnapServer, InoTable)
+ * with offline load/store code such that we can do offline dumps and resets
+ * on those tables.
+ */
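+// Usage sketch (mirrors TableTool::main() below): the inode table is an
+// MDSTable, hence the trailing 'true' for the leading version field:
+//   TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);
+//   TableHandler<InoTable>(rank, "inotable", true).reset(&io);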
+template <typename A>
+class TableHandler
+{
+protected:
+ // The RADOS object ID for the table
+ std::string object_name;
+
+ // The role in question (may be NONE)
+ mds_role_t role;
+
+ // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+ bool mds_table;
+
+public:
+ TableHandler(mds_role_t r, std::string const &name, bool mds_table_)
+ : role(r), mds_table(mds_table_)
+ {
+ // Compose object name of the table we will dump
+ std::ostringstream oss;
+ oss << "mds";
+ if (!role.is_none()) {
+ oss << role.rank;
+ }
+ oss << "_" << name;
+ object_name = oss.str();
+ }
+
+ int load_and_dump(librados::IoCtx *io, Formatter *f)
+ {
+ ceph_assert(io != NULL);
+ ceph_assert(f != NULL);
+
+ // Attempt read
+ bufferlist table_bl;
+ int read_r = io->read(object_name, table_bl, 0, 0);
+ if (read_r >= 0) {
+ auto q = table_bl.cbegin();
+ try {
+ if (mds_table) {
+ version_t version;
+ decode(version, q);
+ f->dump_int("version", version);
+ }
+ A table_inst;
+ table_inst.set_rank(role.rank);
+ table_inst.decode(q);
+ table_inst.dump(f);
+
+ return 0;
+ } catch (buffer::error &e) {
+ derr << "table " << object_name << " is corrupt" << dendl;
+ return -EIO;
+ }
+ } else {
+ derr << "error reading table object " << object_name
+ << ": " << cpp_strerror(read_r) << dendl;
+ return read_r;
+ }
+ }
+
+ int reset(librados::IoCtx *io)
+ {
+ A table_inst;
+ // Compose new (blank) table
+ table_inst.set_rank(role.rank);
+ table_inst.reset_state();
+ // Write the table out
+ return write(table_inst, io);
+ }
+
+protected:
+
+ int write(const A &table_inst, librados::IoCtx *io)
+ {
+ bufferlist new_bl;
+ if (mds_table) {
+ version_t version = 1;
+ encode(version, new_bl);
+ }
+ table_inst.encode_state(new_bl);
+
+ // Write out new table
+ int r = io->write_full(object_name, new_bl);
+ if (r != 0) {
+ derr << "error writing table object " << object_name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+ }
+};
+
+template <typename A>
+class TableHandlerOmap
+{
+private:
+ // The RADOS object ID for the table
+ std::string object_name;
+
+ // The role (rank may be NONE)
+ mds_role_t role;
+
+ // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+ bool mds_table;
+
+public:
+ TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_)
+ : role(r), mds_table(mds_table_)
+ {
+ // Compose object name of the table we will dump
+ std::ostringstream oss;
+ oss << "mds";
+ if (!role.is_none()) {
+ oss << role.rank;
+ }
+ oss << "_" << name;
+ object_name = oss.str();
+ }
+
+ int load_and_dump(librados::IoCtx *io, Formatter *f)
+ {
+ ceph_assert(io != NULL);
+ ceph_assert(f != NULL);
+
+ // Read in the header
+ bufferlist header_bl;
+ int r = io->omap_get_header(object_name, &header_bl);
+ if (r != 0) {
+ derr << "error reading header on '" << object_name << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Decode the header
+ A table_inst;
+ table_inst.set_rank(role.rank);
+ try {
+ table_inst.decode_header(header_bl);
+ } catch (buffer::error &e) {
+ derr << "table " << object_name << " is corrupt" << dendl;
+ return -EIO;
+ }
+
+ // Read and decode OMAP values in chunks
+ std::string last_key = "";
+ while(true) {
+ std::map<std::string, bufferlist> values;
+ int r = io->omap_get_vals(object_name, last_key,
+ g_conf()->mds_sessionmap_keys_per_op, &values);
+
+ if (r != 0) {
+ derr << "error reading values: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (values.empty()) {
+ break;
+ }
+
+ try {
+ table_inst.decode_values(values);
+ } catch (buffer::error &e) {
+ derr << "table " << object_name << " is corrupt" << dendl;
+ return -EIO;
+ }
+ last_key = values.rbegin()->first;
+ }
+
+ table_inst.dump(f);
+
+ return 0;
+ }
+
+ int reset(librados::IoCtx *io)
+ {
+ A table_inst;
+ table_inst.set_rank(role.rank);
+ table_inst.reset_state();
+ bufferlist header_bl;
+ table_inst.encode_header(&header_bl);
+
+ // Compose a transaction to clear and write header
+ librados::ObjectWriteOperation op;
+ op.omap_clear();
+ op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
+ op.omap_set_header(header_bl);
+
+ return io->operate(object_name, &op);
+ }
+};
+
+class InoTableHandler : public TableHandler<InoTable>
+{
+ public:
+ explicit InoTableHandler(mds_role_t r)
+ : TableHandler(r, "inotable", true)
+ {}
+
+ int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f)
+ {
+ InoTable inst;
+ inst.set_rank(role.rank);
+ inst.reset_state();
+
+ int r = 0;
+ if (inst.force_consume_to(max)) {
+ r = write(inst, io);
+ }
+
+ f->dump_int("version", inst.get_version());
+ inst.dump(f);
+
+ return r;
+ }
+};
+
+
+int TableTool::main(std::vector<const char*> &argv)
+{
+ int r;
+
+ dout(10) << __func__ << dendl;
+
+ // RADOS init
+ // ==========
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+    derr << "RADOS unavailable, cannot access MDS table objects" << dendl;
+ return r;
+ }
+
+ dout(4) << "connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Require at least 3 args <rank> <mode> <arg> [args...]
+ if (argv.size() < 3) {
+    cerr << "missing required arguments (expected at least: <rank> <mode> <arg>)" << std::endl;
+ return -EINVAL;
+ }
+
+ const std::string role_str = std::string(argv[0]);
+ const std::string mode = std::string(argv[1]);
+ const std::string table = std::string(argv[2]);
+
+ r = role_selector.parse(*fsmap, role_str);
+ if (r < 0) {
+    derr << "Bad rank selection: '" << role_str << "'" << dendl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ ceph_assert(fs != nullptr);
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ dout(4) << "resolving pool " << pool_id << dendl;
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!"
+ << dendl;
+ return r;
+ }
+
+ dout(4) << "creating IoCtx.." << dendl;
+ r = rados.ioctx_create(pool_name.c_str(), io);
+ if (r != 0) {
+ return r;
+ }
+
+ JSONFormatter jf(true);
+ if (mode == "reset") {
+ const std::string table = std::string(argv[2]);
+ if (table == "session") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
+ }, &jf);
+ } else if (table == "inode") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandler<InoTable>(rank, "inotable", true).reset(&io);
+ }, &jf);
+ } else if (table == "snap") {
+ r = TableHandler<SnapServer>(mds_role_t(), "snaptable", true).reset(&io);
+ jf.open_object_section("reset_snap_status");
+ jf.dump_int("result", r);
+ jf.close_section();
+ } else {
+ cerr << "Invalid table '" << table << "'" << std::endl;
+ return -EINVAL;
+ }
+ } else if (mode == "show") {
+ const std::string table = std::string(argv[2]);
+ if (table == "session") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
+ }, &jf);
+ } else if (table == "inode") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+        return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);
+ }, &jf);
+ } else if (table == "snap") {
+ jf.open_object_section("show_snap_table");
+ {
+ r = TableHandler<SnapServer>(
+ mds_role_t(), "snaptable", true).load_and_dump(&io, &jf);
+ jf.dump_int("result", r);
+ }
+ jf.close_section();
+ } else {
+ cerr << "Invalid table '" << table << "'" << std::endl;
+ return -EINVAL;
+ }
+ } else if (mode == "take_inos") {
+ const std::string ino_str = std::string(argv[2]);
+ std::string ino_err;
+ inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err);
+ if (!ino_err.empty()) {
+ derr << "Bad ino '" << ino_str << "'" << dendl;
+ return -EINVAL;
+ }
+ r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int {
+ return InoTableHandler(rank).take_inos(&io, ino, f);
+ }, &jf);
+ } else {
+ cerr << "Invalid mode '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // Subcommand should have written to formatter, flush it
+ jf.flush(std::cout);
+ std::cout << std::endl;
+ return r;
+}
+
diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h
new file mode 100644
index 00000000..bf9b95c1
--- /dev/null
+++ b/src/tools/cephfs/TableTool.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+
+#include "include/rados/librados.hpp"
+
+/**
+ * Command line tool for debugging the backing store of
+ * MDSTable instances.
+ */
+class TableTool : public MDSUtility
+{
+ private:
+ MDSRoleSelector role_selector;
+
+ // I/O handles
+ librados::Rados rados;
+ librados::IoCtx io;
+
+ int apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f);
+
+ public:
+ static void usage();
+ int main(std::vector<const char*> &argv);
+
+};
+
diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc
new file mode 100644
index 00000000..e6efff66
--- /dev/null
+++ b/src/tools/cephfs/cephfs-data-scan.cc
@@ -0,0 +1,47 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "DataScan.h"
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ DataScan::usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ DataScan data_scan;
+
+ // Connect to mon cluster, download MDS map etc
+ int rc = data_scan.init();
+ if (rc != 0) {
+ std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+
+ // Finally, execute the user's commands
+ rc = data_scan.main(args);
+ if (rc != 0) {
+ std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+ }
+
+
+ return rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc
new file mode 100644
index 00000000..290cb305
--- /dev/null
+++ b/src/tools/cephfs/cephfs-journal-tool.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "JournalTool.h"
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ JournalTool::usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ JournalTool jt;
+
+ // Connect to mon cluster, download MDS map etc
+ int rc = jt.init();
+ if (rc != 0) {
+ std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+
+ // Finally, execute the user's commands
+ rc = jt.main(args);
+ if (rc != 0) {
+ std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+ }
+
+ return rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-shell b/src/tools/cephfs/cephfs-shell
new file mode 100644
index 00000000..5db84b56
--- /dev/null
+++ b/src/tools/cephfs/cephfs-shell
@@ -0,0 +1,1295 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+import os.path
+import sys
+from cmd2 import Cmd
+import cephfs as libcephfs
+import shutil
+import traceback
+import colorama
+import fnmatch
+import math
+import re
+import shlex
+
+if sys.version_info.major < 3:
+ raise RuntimeError("cephfs-shell is only compatible with python3")
+
+try:
+ from cmd2 import with_argparser
+except ImportError:
+ def with_argparser(argparser):
+ import functools
+
+ def argparser_decorator(func):
+ @functools.wraps(func)
+ def wrapper(thiz, cmdline):
+ if isinstance(cmdline, list):
+ arglist = cmdline
+ else:
+                    # cmdline is a plain string here, split it into tokens
+ arglist = shlex.split(cmdline, posix=False)
+ # in case user quotes the command args
+ arglist = [arg.strip('\'""') for arg in arglist]
+ try:
+ args = argparser.parse_args(arglist)
+ except SystemExit:
+ # argparse exits at seeing bad arguments
+ return
+ else:
+ return func(thiz, args)
+ argparser.prog = func.__name__[3:]
+ if argparser.description is None and func.__doc__:
+ argparser.description = func.__doc__
+
+ return wrapper
+
+ return argparser_decorator
+
+
+cephfs = None
+shell = None
+
+
+def poutput(s, end='\n'):
+ shell.poutput(s, end=end)
+
+
+def setup_cephfs(config_file):
+ """
+    Mount a CephFS filesystem using the supplied configuration file.
+ """
+ global cephfs
+ cephfs = libcephfs.LibCephFS(conffile=config_file)
+ cephfs.mount()
+
+
+def mode_notation(mode):
+    """
+    Convert a numeric st_mode into an ls-style permission string,
+    e.g. 0o40755 -> 'drwxr-xr-x'.
+    """
+ permission_bits = {'0': '---',
+ '1': '--x',
+ '2': '-w-',
+ '3': '-wx',
+ '4': 'r--',
+ '5': 'r-x',
+ '6': 'rw-',
+ '7': 'rwx'}
+ mode = str(oct(mode))
+ notation = '-'
+ if mode[2] == '4':
+ notation = 'd'
+ for i in mode[-3:]:
+ notation += permission_bits[i]
+ return notation
+
+
+def get_chunks(file_size):
+ chunk_start = 0
+ chunk_size = 0x20000 # 131072 bytes, default max ssl buffer size
+ while chunk_start + chunk_size < file_size:
+ yield(chunk_start, chunk_size)
+ chunk_start += chunk_size
+ final_chunk_size = file_size - chunk_start
+ yield(chunk_start, final_chunk_size)
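+# Worked example: with the 0x20000-byte chunk size above, a 300000-byte file
+# yields (0, 131072), (131072, 131072) and finally (262144, 37856).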
+
+
+def to_bytes(string):
+ return bytes(string, encoding='utf-8')
+
+def ls(path, opts=''):
+ # opts tries to be like /bin/ls opts
+ almost_all = 'A' in opts
+ try:
+ with cephfs.opendir(path) as d:
+ while True:
+ dent = cephfs.readdir(d)
+ if dent is None:
+ return
+ elif almost_all and dent.d_name in (b'.', b'..'):
+ continue
+ yield dent
+ except cephfs.ObjectNotFound:
+ return []
+
+def glob(path, pattern):
+ paths = []
+ parent_dir = os.path.dirname(path)
+ if parent_dir == b'':
+ parent_dir = b'/'
+ if path == b'/' or is_dir_exists(os.path.basename(path), parent_dir):
+ for i in ls(path, opts='A'):
+ if fnmatch.fnmatch(i.d_name, pattern):
+ paths.append(os.path.join(path, i.d_name))
+ return paths
+
+
+def locate_file(name, case_sensitive=True):
+ dir_list = sorted(set(dirwalk(cephfs.getcwd())))
+ if not case_sensitive:
+ return [dname for dname in dir_list if name.lower() in dname.lower()]
+ else:
+ return [dname for dname in dir_list if name in dname]
+
+
+def get_all_possible_paths(pattern):
+ complete_pattern = pattern[:]
+ paths = []
+ is_rel_path = not os.path.isabs(pattern)
+ if is_rel_path:
+ dir_ = cephfs.getcwd()
+ else:
+ dir_ = b'/'
+ pattern = pattern[1:]
+ patterns = pattern.split(b'/')
+ paths.extend(glob(dir_, patterns[0]))
+ patterns.pop(0)
+ for pattern in patterns:
+ for path in paths:
+ paths.extend(glob(path, pattern))
+ return [path for path in paths if fnmatch.fnmatch(path,
+ os.path.join(cephfs.getcwd(), complete_pattern))]
+
+
+suffixes = ['B', 'K', 'M', 'G', 'T', 'P']
+
+
+def humansize(nbytes):
+ i = 0
+ while nbytes >= 1024 and i < len(suffixes)-1:
+ nbytes /= 1024.
+ i += 1
+ nbytes = math.ceil(nbytes)
+ f = ('%d' % nbytes).rstrip('.')
+ return '%s%s' % (f, suffixes[i])
+
+
+def print_long(path, is_dir, human_readable):
+ info = cephfs.stat(path)
+ pretty = os.path.basename(path.decode('utf-8'))
+ if is_dir:
+ pretty = colorama.Style.BRIGHT + colorama.Fore.CYAN + pretty + '/' + colorama.Style.RESET_ALL
+ if human_readable:
+ poutput('{}\t{:10s} {} {} {} {}'.format(
+ mode_notation(info.st_mode),
+ humansize(info.st_size), info.st_uid,
+ info.st_gid, info.st_mtime, pretty, sep='\t'))
+ else:
+ poutput('{} {:12d} {} {} {} {}'.format(
+ mode_notation(info.st_mode), info.st_size, info.st_uid,
+ info.st_gid, info.st_mtime, pretty, sep='\t'))
+
+
+def word_len(word):
+ """
+ Returns the word length, minus any color codes.
+ """
+ if word[0] == '\x1b':
+ return len(word) - 9
+ return len(word)
+
+
+def is_dir_exists(path, dir_=b''):
+ path_to_stat = os.path.join(dir_, path)
+ try:
+ return ((cephfs.stat(path_to_stat).st_mode & 0o0040000) != 0)
+ except libcephfs.Error:
+ return False
+
+
+def is_file_exists(path, dir_=b''):
+ try:
+        # if it's not a directory, treat it as a file
+ return ((cephfs.stat(os.path.join(dir_, path)).st_mode & 0o0040000) == 0)
+ except libcephfs.Error:
+ return False
+
+
+def print_list(words, termwidth=79):
+ if not words:
+ return
+ words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in words]
+ width = max([word_len(word) for word in words]) + 2
+ nwords = len(words)
+ ncols = max(1, (termwidth + 1) // (width + 1))
+ nrows = (nwords + ncols - 1) // ncols
+ for row in range(nrows):
+ for i in range(row, nwords, nrows):
+ word = words[i]
+ print_width = width
+ if word[0] == '\x1b':
+ print_width = print_width + 10
+
+ poutput('%-*s' % (print_width, words[i]),
+ end='\n' if i + nrows >= nwords else '')
+
+
+def copy_from_local(local_path, remote_path):
+ stdin = -1
+ file_ = None
+ fd = None
+ convert_to_bytes = False
+ if local_path == b'-':
+ file_ = sys.stdin
+ convert_to_bytes = True
+ else:
+ try:
+ file_ = open(local_path, 'rb')
+ except PermissionError:
+ perror('error: no permission to read local file {}'.format(
+ local_path.decode('utf-8')), end='\n', apply_style=True)
+ return
+ stdin = 1
+ try:
+ fd = cephfs.open(remote_path, 'w', 0o666)
+ except libcephfs.Error:
+ perror('error: no permission to write remote file {}'.format(
+ remote_path.decode('utf-8')), end='\n', apply_style=True)
+ return
+ progress = 0
+ while True:
+ data = file_.read(65536)
+ if not data or len(data) == 0:
+ break
+ if convert_to_bytes:
+ data = to_bytes(data)
+ wrote = cephfs.write(fd, data, progress)
+ if wrote < 0:
+ break
+ progress += wrote
+ cephfs.close(fd)
+ if stdin > 0:
+ file_.close()
+ poutput('')
+
+
+def copy_to_local(remote_path, local_path):
+ fd = None
+ if local_path != b'-':
+ local_dir = os.path.dirname(local_path)
+ dir_list = remote_path.rsplit(b'/', 1)
+ if not os.path.exists(local_dir):
+ os.makedirs(local_dir)
+ if len(dir_list) > 2 and dir_list[1] == b'':
+ return
+ fd = open(local_path, 'wb+')
+ file_ = cephfs.open(remote_path, 'r')
+ file_size = cephfs.stat(remote_path).st_size
+ if file_size <= 0:
+ return
+ progress = 0
+ for chunk_start, chunk_size in get_chunks(file_size):
+ file_chunk = cephfs.read(file_, chunk_start, chunk_size)
+ progress += len(file_chunk)
+ if fd:
+ fd.write(file_chunk)
+ else:
+ poutput(file_chunk.decode('utf-8'))
+ cephfs.close(file_)
+ if fd:
+ fd.close()
+
+
+def dirwalk(path):
+ """
+ walk a directory tree, using a generator
+ """
+ path = os.path.normpath(path)
+ for item in ls(path, opts='A'):
+ fullpath = os.path.join(path, item.d_name)
+ src_path = fullpath.rsplit(b'/', 1)[0]
+
+ yield os.path.normpath(fullpath)
+ if is_dir_exists(item.d_name, src_path):
+ for x in dirwalk(fullpath):
+ yield x
+
+
+class CephFSShell(Cmd):
+
+ def __init__(self):
+ super().__init__(use_ipython=False)
+ self.working_dir = cephfs.getcwd().decode('utf-8')
+ self.set_prompt()
+ self.interactive = False
+ self.umask = '2'
+
+ def default(self, line):
+ self.poutput('Unrecognized command')
+
+ def set_prompt(self):
+ self.prompt = ('\033[01;33mCephFS:~' + colorama.Fore.LIGHTCYAN_EX
+ + self.working_dir + colorama.Style.RESET_ALL
+ + '\033[01;33m>>>\033[00m ')
+
+ def create_argparser(self, command):
+ try:
+ argparse_args = getattr(self, 'argparse_' + command)
+ except AttributeError:
+ return None
+ doc_lines = getattr(
+ self, 'do_' + command).__doc__.expandtabs().splitlines()
+        if '' in doc_lines:
+ blank_idx = doc_lines.index('')
+ usage = doc_lines[:blank_idx]
+ description = doc_lines[blank_idx + 1:]
+ else:
+ usage = doc_lines
+ description = []
+ parser = argparse.ArgumentParser(
+ prog=command,
+ usage='\n'.join(usage),
+ description='\n'.join(description),
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ for args, kwargs in argparse_args:
+ parser.add_argument(*args, **kwargs)
+ return parser
+
+ def complete_filenames(self, text, line, begidx, endidx):
+ if not text:
+ completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir())
+ for x in ls(b".", opts='A')]
+ else:
+ if text.count('/') > 0:
+ completions = [text.rsplit('/', 1)[0] + '/'
+ + x.d_name.decode('utf-8') + '/'
+ * int(x.is_dir()) for x in ls('/'
+ + text.rsplit('/', 1)[0], opts='A')
+ if x.d_name.decode('utf-8').startswith(
+ text.rsplit('/', 1)[1])]
+ else:
+ completions = [x.d_name.decode('utf-8') + '/'
+ * int(x.is_dir()) for x in ls(b".", opts='A')
+ if x.d_name.decode('utf-8').startswith(text)]
+ if len(completions) == 1 and completions[0][-1] == '/':
+ dir_, file_ = completions[0].rsplit('/', 1)
+ completions.extend([dir_ + '/' + x.d_name.decode('utf-8')
+ + '/' * int(x.is_dir()) for x in
+ ls('/' + dir_, opts='A')
+ if x.d_name.decode('utf-8').startswith(file_)])
+ return self.delimiter_complete(text, line, begidx, endidx, completions, '/')
+ return completions
+
+ def onecmd(self, line):
+ """
+ Global error catcher
+ """
+ try:
+ res = Cmd.onecmd(self, line)
+ if self.interactive:
+ self.set_prompt()
+ return res
+ except ConnectionError as e:
+ self.poutput('***', e)
+ except KeyboardInterrupt:
+ self.poutput('Command aborted')
+ except Exception as e:
+ self.poutput(e)
+ traceback.print_exc(file=sys.stdout)
+
+ class path_to_bytes(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ if isinstance(values, str):
+ values = to_bytes(values)
+ if isinstance(values, list):
+ values = list(map(to_bytes, values))
+ setattr(namespace, self.dest, values)
+
+ def complete_mkdir(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ class ModeAction(argparse.Action):
+ def __init__(self, option_strings, dest, nargs=None, **kwargs):
+ if nargs is not None and nargs != '?':
+                raise ValueError("more than one mode is not allowed")
+ super().__init__(option_strings, dest, **kwargs)
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ o_mode = 0
+ res = None
+ try:
+ o_mode = int(values, base=8)
+ except ValueError:
+ res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', values)
+ if res is None:
+ parser.error("invalid mode: %s\n"
+ "mode must be a numeric octal literal\n"
+ "or ((u?g?o?)|(a?))(=)(r?w?x?)" %
+ values)
+ else:
+ # we are supporting only assignment of mode and not + or -
+ # as is generally available with the chmod command
+ # eg.
+ # >>> res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=')
+ # >>> res.groups()
+ # ('go', 'go', None, '=', '')
+ val = res.groups()
+
+ if val[3] != '=':
+ parser.error("need assignment operator between user "
+ "and mode specifiers")
+ if val[4] == '':
+ parser.error("invalid mode: %s\n"
+ "mode must be combination of: r | w | x" %
+ values)
+ users = ''
+ if val[2] is None:
+ users = val[1]
+ else:
+ users = val[2]
+
+ t_mode = 0
+ if users == 'a':
+ users = 'ugo'
+
+ if 'r' in val[4]:
+ t_mode |= 4
+ if 'w' in val[4]:
+ t_mode |= 2
+ if 'x' in val[4]:
+ t_mode |= 1
+
+ if 'u' in users:
+ o_mode |= (t_mode << 6)
+ if 'g' in users:
+ o_mode |= (t_mode << 3)
+ if 'o' in users:
+ o_mode |= t_mode
+
+ if o_mode < 0:
+ parser.error("invalid mode: %s\n"
+ "mode cannot be negative" % values)
+ if o_mode > 0o777:
+ parser.error("invalid mode: %s\n"
+ "mode cannot be greater than octal 0777" % values)
+
+ setattr(namespace, self.dest, str(oct(o_mode)))
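+            # Hedged illustration (not part of the original tool): for a
+            # symbolic value such as 'go=rw', t_mode above becomes 6 (r|w)
+            # and the shifts yield o_mode == 0o66, so namespace.mode is
+            # stored as the string '0o66'.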
+
+ mkdir_parser = argparse.ArgumentParser(
+ description='Create the directory(ies), if they do not already exist.')
+ mkdir_parser.add_argument('dirs', type=str,
+ action=path_to_bytes,
+ metavar='DIR_NAME',
+                              help='Name of the new directory.',
+ nargs='+')
+ mkdir_parser.add_argument('-m', '--mode', type=str,
+ action=ModeAction,
+ help='Sets the access mode for the new directory.')
+ mkdir_parser.add_argument('-p', '--parent', action='store_true',
+ help='Create parent directories as necessary. \
+When this option is specified, no error is reported if a directory already \
+exists.')
+
+ @with_argparser(mkdir_parser)
+ def do_mkdir(self, args):
+ """
+ Create directory.
+ """
+ for path in args.dirs:
+ if args.mode:
+ permission = int(args.mode, 8)
+ else:
+ permission = 0o777
+ if args.parent:
+ cephfs.mkdirs(path, permission)
+ else:
+ try:
+ cephfs.mkdir(path, permission)
+ except libcephfs.Error:
+ self.poutput("directory missing in the path; "
+ "you may want to pass the -p argument")
+ return
+
+ def complete_put(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ index_dict = {1: self.path_complete}
+ return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+ put_parser = argparse.ArgumentParser(
+ description='Copy a file/directory to Ceph File System from Local File System.')
+ put_parser.add_argument('local_path', type=str, action=path_to_bytes,
+ help='Path of the file in the local system')
+ put_parser.add_argument('remote_path', type=str, action=path_to_bytes,
+ help='Path of the file in the remote system.',
+ nargs='?', default='.')
+ put_parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrites the destination if it already exists.')
+
+ @with_argparser(put_parser)
+ def do_put(self, args):
+ """
+ Copy a file to Ceph File System from Local Directory.
+ """
+ root_src_dir = args.local_path
+ root_dst_dir = args.remote_path
+ if args.local_path == b'.' or args.local_path == b'./':
+ root_src_dir = os.getcwdb()
+ elif len(args.local_path.rsplit(b'/', 1)) < 2:
+ root_src_dir = os.path.join(os.getcwdb(), args.local_path)
+ else:
+ p = args.local_path.split(b'/')
+ if p[0] == b'.':
+ root_src_dir = os.getcwdb()
+ p.pop(0)
+ while len(p) > 0:
+ root_src_dir += b'/' + p.pop(0)
+
+ if root_dst_dir == b'.':
+ if args.local_path != b'-':
+ root_dst_dir = root_src_dir.rsplit(b'/', 1)[1]
+ if root_dst_dir == b'':
+ root_dst_dir = root_src_dir.rsplit(b'/', 1)[0]
+ a = root_dst_dir.rsplit(b'/', 1)
+ if len(a) > 1:
+ root_dst_dir = a[1]
+ else:
+ root_dst_dir = a[0]
+ else:
+ self.poutput("error: no filename specified for destination")
+ return
+
+        if not root_dst_dir.endswith(b'/'):
+ root_dst_dir += b'/'
+
+ if args.local_path == b'-' or os.path.isfile(root_src_dir):
+ if not args.force:
+ if os.path.isfile(root_src_dir):
+ dst_file = root_dst_dir
+ if is_file_exists(dst_file):
+ self.perror('{}: file exists! use --force to overwrite'.format(
+ dst_file.decode('utf-8')), end='\n',
+ apply_style=True)
+ return
+ if args.local_path == b'-':
+ root_src_dir = b'-'
+ copy_from_local(root_src_dir, root_dst_dir)
+ else:
+ for src_dir, dirs, files in os.walk(root_src_dir):
+ if isinstance(src_dir, str):
+ src_dir = to_bytes(src_dir)
+ dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1)
+ dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd()
+ + dst_dir)
+ if args.force and dst_dir != b'/' and not is_dir_exists(
+ dst_dir[:-1]) and not locate_file(dst_dir):
+ try:
+ cephfs.mkdirs(dst_dir, 0o777)
+ except libcephfs.Error:
+ pass
+ if (not args.force) and dst_dir != b'/' and not is_dir_exists(
+ dst_dir) and not os.path.isfile(root_src_dir):
+ try:
+ cephfs.mkdirs(dst_dir, 0o777)
+ except libcephfs.Error:
+ pass
+
+ for dir_ in dirs:
+ dir_name = os.path.join(dst_dir, dir_)
+ if not is_dir_exists(dir_name):
+ try:
+ cephfs.mkdirs(dir_name, 0o777)
+ except libcephfs.Error:
+ pass
+
+ for file_ in files:
+ src_file = os.path.join(src_dir, file_)
+ dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_)
+ if (not args.force) and is_file_exists(dst_file):
+ return
+ copy_from_local(src_file, os.path.join(cephfs.getcwd(),
+ dst_file))
+
+ def complete_get(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ get_parser = argparse.ArgumentParser(
+        description='Copy a file from Ceph File System to Local Directory.')
+ get_parser.add_argument('remote_path', type=str, action=path_to_bytes,
+ help='Path of the file in the remote system')
+ get_parser.add_argument('local_path', type=str, action=path_to_bytes,
+ help='Path of the file in the local system',
+ nargs='?', default='.')
+ get_parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrites the destination if it already exists.')
+
+ @with_argparser(get_parser)
+ def do_get(self, args):
+ """
+ Copy a file/directory from Ceph File System to Local Directory.
+ """
+ root_src_dir = args.remote_path
+ root_dst_dir = args.local_path
+ fname = root_src_dir.rsplit(b'/', 1)
+ if args.local_path == b'.':
+ root_dst_dir = os.getcwdb()
+ if args.remote_path == b'.':
+ root_src_dir = cephfs.getcwd()
+ if args.local_path == b'-':
+ if args.remote_path == b'.' or args.remote_path == b'./':
+ self.perror('error: no remote file name specified', end='\n',
+ apply_style=True)
+ return
+ copy_to_local(root_src_dir, b'-')
+ elif is_file_exists(args.remote_path):
+ copy_to_local(root_src_dir,
+ root_dst_dir + b'/' + root_src_dir)
+ elif b'/'in root_src_dir and is_file_exists(fname[1], fname[0]):
+ copy_to_local(root_src_dir, root_dst_dir)
+ else:
+ files = list(reversed(sorted(dirwalk(root_src_dir))))
+ if len(files) == 0:
+ try:
+ os.makedirs(root_dst_dir + b'/' + root_src_dir)
+ except OSError:
+ if args.force:
+ pass
+ else:
+ self.perror('{}: already exists! use --force to overwrite'.format(
+ root_src_dir.decode('utf-8')), end='\n',
+ apply_style=True)
+ return
+
+ for file_ in files:
+ dst_dirpath, dst_file = file_.rsplit(b'/', 1)
+ if dst_dirpath in files:
+ files.remove(dst_dirpath)
+ dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file)
+ dst_path = os.path.normpath(dst_path)
+ if is_dir_exists(file_):
+ try:
+ os.makedirs(dst_path)
+ except OSError:
+ pass
+ else:
+ if not args.force:
+ try:
+ os.stat(dst_path)
+ self.perror('{}: file already exists! use --force to override'.format(
+ file_.decode('utf-8')), end='\n',
+ apply_style=True)
+ return
+ except OSError:
+ copy_to_local(file_, dst_path)
+ else:
+ copy_to_local(file_, dst_path)
+
+ return 0
+
+ def complete_ls(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ ls_parser = argparse.ArgumentParser(
+        description='List files and directories in the Ceph File System.')
+ ls_parser.add_argument('-l', '--long', action='store_true',
+ help='Detailed list of items in the directory.')
+ ls_parser.add_argument('-r', '--reverse', action='store_true',
+ help='Reverse order of listing items in the directory.')
+    ls_parser.add_argument('-H', action='store_true', help='Human readable sizes.')
+ ls_parser.add_argument('-a', '--all', action='store_true',
+                           help='Do not ignore entries starting with .')
+    ls_parser.add_argument('-S', action='store_true', help='Sort by file size.')
+ ls_parser.add_argument('paths', help='Name of Directories',
+ action=path_to_bytes, nargs='*', default=['.'])
+
+ @with_argparser(ls_parser)
+ def do_ls(self, args):
+ """
+ List all the files and directories in the current working directory
+ """
+ paths = args.paths
+ for path in paths:
+ values = []
+ items = []
+ if path.count(b'*') > 0:
+ all_items = get_all_possible_paths(path)
+ if len(all_items) == 0:
+ continue
+ path = all_items[0].rsplit(b'/', 1)[0]
+ if path == b'':
+ path = b'/'
+ dirs = []
+ for i in all_items:
+ for item in ls(path):
+ d_name = item.d_name
+ if os.path.basename(i) == d_name:
+ if item.is_dir():
+ dirs.append(os.path.join(path, d_name))
+ else:
+ items.append(item)
+ if dirs:
+ paths.extend(dirs)
+ else:
+ self.poutput(path.decode('utf-8'), end=':\n')
+ items = sorted(items, key=lambda item: item.d_name)
+ else:
+ if path != b'' and path != cephfs.getcwd() and len(paths) > 1:
+ self.poutput(path.decode('utf-8'), end=':\n')
+ items = sorted(ls(path),
+ key=lambda item: item.d_name)
+ if not args.all:
+ items = [i for i in items if not i.d_name.startswith(b'.')]
+
+ if args.S:
+ items = sorted(items, key=lambda item: cephfs.stat(
+ path + b'/' + item.d_name).st_size)
+
+ if args.reverse:
+ items = reversed(items)
+ for item in items:
+ filepath = item.d_name
+ is_dir = item.is_dir()
+
+ if args.long and args.H:
+ print_long(cephfs.getcwd()
+ + path
+ + b'/'
+ + filepath,
+ is_dir, True)
+ elif args.long:
+ print_long(cephfs.getcwd()
+ + path
+ + b'/'
+ + filepath,
+ is_dir, False)
+ elif is_dir:
+ values.append(colorama.Style.BRIGHT
+ + colorama.Fore.CYAN
+ + filepath.decode('utf-8')
+ + '/'
+ + colorama.Style.RESET_ALL)
+ else:
+ values.append(filepath)
+ if not args.long:
+ print_list(values, shutil.get_terminal_size().columns)
+ if path != paths[-1]:
+ self.poutput('')
+
+ def complete_rmdir(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ rmdir_parser = argparse.ArgumentParser(description='Remove Directory.')
+ rmdir_parser.add_argument('paths', help='Directory Path.', nargs='+',
+ action=path_to_bytes)
+ rmdir_parser.add_argument('-p', '--parent', action='store_true',
+ help='Remove parent directories as necessary. \
+When this option is specified, no error is reported if a directory has any \
+sub-directories or files.')
+
+ @with_argparser(rmdir_parser)
+ def do_rmdir(self, args):
+ """
+ Remove a specific Directory
+ """
+ is_pattern = False
+ paths = args.paths
+ for path in paths:
+ if path.count(b'*') > 0:
+ is_pattern = True
+ all_items = get_all_possible_paths(path)
+ if len(all_items) > 0:
+ path = all_items[0].rsplit(b'/', 1)[0]
+ if path == b'':
+ path = b'/'
+ dirs = []
+ for i in all_items:
+ for item in ls(path):
+ d_name = item.d_name
+ if os.path.basename(i) == d_name:
+ if item.is_dir():
+ dirs.append(os.path.join(path, d_name))
+ paths.extend(dirs)
+ continue
+ else:
+ is_pattern = False
+ path = os.path.normpath(os.path.join(cephfs.getcwd(), path))
+ if args.parent:
+ files = reversed(sorted(set(dirwalk(path))))
+ for filepath in files:
+ filepath = os.path.normpath(filepath)
+ if filepath[1:] != path:
+ try:
+ cephfs.rmdir(filepath)
+ except libcephfs.Error:
+ cephfs.unlink(filepath)
+ if not is_pattern and path != os.path.normpath(b''):
+ try:
+ cephfs.rmdir(path)
+ except libcephfs.Error:
+ self.perror('error: no such directory {} exists'.format(
+ path.decode('utf-8')), end='\n',
+ apply_style=True)
+
+ def complete_rm(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ rm_parser = argparse.ArgumentParser(description='Remove File.')
+ rm_parser.add_argument('paths', help='File Path.', nargs='+',
+ action=path_to_bytes)
+
+ @with_argparser(rm_parser)
+ def do_rm(self, args):
+ """
+ Remove a specific file
+ """
+ file_paths = args.paths
+ for path in file_paths:
+ if path.count(b'*') > 0:
+ file_paths.extend([i for i in get_all_possible_paths(
+ path) if is_file_exists(i)])
+ else:
+ try:
+ cephfs.unlink(path)
+ except libcephfs.Error:
+ self.perror('{}: no such file'.format(path.decode('utf-8')),
+ end='\n', apply_style=True)
+
+ def complete_mv(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ mv_parser = argparse.ArgumentParser(description='Move File.')
+ mv_parser.add_argument('src_path', type=str, action=path_to_bytes,
+ help='Source File Path.')
+ mv_parser.add_argument('dest_path', type=str, action=path_to_bytes,
+ help='Destination File Path.')
+
+ @with_argparser(mv_parser)
+ def do_mv(self, args):
+ """
+ Rename a file or Move a file from source path to the destination
+ """
+ try:
+ cephfs.rename(args.src_path, args.dest_path)
+ except libcephfs.Error:
+ self.poutput("error: need a file name to move to")
+
+ def complete_cd(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ cd_parser = argparse.ArgumentParser(description='Change working directory')
+ cd_parser.add_argument('path', type=str, help='Name of the directory.',
+ action=path_to_bytes, nargs='?', default='/')
+
+ @with_argparser(cd_parser)
+ def do_cd(self, args):
+ """
+ Change working directory
+ """
+ try:
+ cephfs.chdir(args.path)
+ self.working_dir = cephfs.getcwd().decode('utf-8')
+ self.set_prompt()
+ except libcephfs.Error:
+ self.perror('{}: no such directory'.format(args.path.decode('utf-8')),
+ end='\n', apply_style=True)
+
+ def do_cwd(self, arglist):
+ """
+ Get current working directory.
+ """
+ self.poutput(cephfs.getcwd().decode('utf-8'))
+
+ def complete_chmod(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+    chmod_parser = argparse.ArgumentParser(description='Change permission of a file.')
+ chmod_parser.add_argument('mode', type=str, action=ModeAction, help='Mode')
+ chmod_parser.add_argument('paths', type=str, action=path_to_bytes,
+ help='Name of the file', nargs='+')
+
+ @with_argparser(chmod_parser)
+ def do_chmod(self, args):
+ """
+ Change permission of a file
+ """
+ for path in args.paths:
+ mode = int(args.mode, base=8)
+ try:
+ cephfs.chmod(path, mode)
+ except libcephfs.Error:
+ self.perror('{}: no such file or directory'.format(
+ path.decode('utf-8')), end='\n', apply_style=True)
+
+ def complete_cat(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+    cat_parser = argparse.ArgumentParser(description='Print contents of a file.')
+ cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes,
+ nargs='+')
+
+ @with_argparser(cat_parser)
+ def do_cat(self, args):
+ """
+ Print contents of a file
+ """
+ for path in args.paths:
+ if is_file_exists(path):
+ copy_to_local(path, b'-')
+ else:
+ self.perror('{}: no such file'.format(path.decode('utf-8')),
+ end='\n', apply_style=True)
+
+ umask_parser = argparse.ArgumentParser(description='Set umask value.')
+ umask_parser.add_argument('mode', help='Mode', type=str, action=ModeAction,
+ nargs='?', default='')
+
+ @with_argparser(umask_parser)
+ def do_umask(self, args):
+ """
+ Set Umask value.
+ """
+ if args.mode == '':
+ self.poutput(self.umask.zfill(4))
+ else:
+ mode = int(args.mode, 8)
+ self.umask = str(oct(cephfs.umask(mode))[2:])
+
+ def complete_write(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ write_parser = argparse.ArgumentParser(description='Writes data into a file')
+ write_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of File')
+
+ @with_argparser(write_parser)
+ def do_write(self, args):
+ """
+ Write data into a file.
+ """
+
+ copy_from_local(b'-', args.path)
+
+ def complete_lcd(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ index_dict = {1: self.path_complete}
+ return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+    lcd_parser = argparse.ArgumentParser(description='Change the current local directory.')
+ lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path')
+
+ @with_argparser(lcd_parser)
+ def do_lcd(self, args):
+ """
+ Moves into the given local directory
+ """
+ try:
+ os.chdir(os.path.expanduser(args.path))
+ except OSError as e:
+ self.perror("Cannot change to {}: {}".format(e.filename,
+ e.strerror), False)
+
+ def complete_lls(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ index_dict = {1: self.path_complete}
+ return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+ lls_parser = argparse.ArgumentParser(
+ description='List files in local system.')
+ lls_parser.add_argument('paths', help='Paths', action=path_to_bytes,
+ nargs='*')
+
+ @with_argparser(lls_parser)
+ def do_lls(self, args):
+ """
+ Lists all files and folders in the current local directory
+ """
+ if not args.paths:
+ print_list(os.listdir(os.getcwdb()))
+ else:
+ for path in args.paths:
+ try:
+ items = os.listdir(path)
+ self.poutput("{}:".format(path.decode('utf-8')))
+ print_list(items)
+ except OSError as e:
+ self.perror("'{}': {}".format(e.filename, e.strerror), False)
+        # Arguments to the with_argparser decorator function are sticky.
+        # The items in args.paths do not get overwritten in subsequent calls.
+        # The arguments remain in args.paths after the function exits and we
+        # need to clean it up to ensure the next call works as expected.
+ args.paths.clear()
+
+ def do_lpwd(self, arglist):
+ """
+ Prints the absolute path of the current local directory
+ """
+ self.poutput(os.getcwd())
+
+ def do_df(self, arglist):
+ """
+ Display the amount of available disk space for file systems
+ """
+ for index, i in enumerate(ls(b".", opts='A')):
+ if index == 0:
+ self.poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format(
+ "1K-blocks", "Used", "Available", "Use%", "Stored on"))
+ if not is_dir_exists(i.d_name):
+ statfs = cephfs.statfs(i.d_name)
+ stat = cephfs.stat(i.d_name)
+ block_size = statfs['f_blocks']*statfs['f_bsize'] // 1024
+ available = block_size - stat.st_size
+ use = 0
+ if block_size > 0:
+ use = (stat.st_size*100 // block_size)
+ self.poutput('{:25d}\t{:5d}\t{:10d}\t{:5s} {}'.format(
+ statfs['f_fsid'], stat.st_size, available,
+ str(int(use)) + '%', i.d_name.decode('utf-8')))
+
+ locate_parser = argparse.ArgumentParser(
+ description='Find file within file system')
+ locate_parser.add_argument('name', help='name', type=str,
+ action=path_to_bytes)
+ locate_parser.add_argument('-c', '--count', action='store_true',
+ help='Count list of items located.')
+ locate_parser.add_argument(
+ '-i', '--ignorecase', action='store_true', help='Ignore case')
+
+ @with_argparser(locate_parser)
+ def do_locate(self, args):
+ """
+ Find a file within the File System
+ """
+ if args.name.count(b'*') == 1:
+            if args.name.startswith(b'*'):
+                args.name += b'/'
+            elif args.name.endswith(b'*'):
+                args.name = b'/' + args.name
+ args.name = args.name.replace(b'*', b'')
+ if args.ignorecase:
+ locations = locate_file(args.name, False)
+ else:
+ locations = locate_file(args.name)
+ if args.count:
+ self.poutput(len(locations))
+ else:
+ self.poutput((b'\n'.join(locations)).decode('utf-8'))
+
+ def complete_du(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ du_parser = argparse.ArgumentParser(
+ description='Disk Usage of a Directory')
+ du_parser.add_argument('dirs', type=str, action=path_to_bytes,
+                           help='Name of the directory.', nargs='*',
+                           default=['.'])
+ du_parser.add_argument('-r', action='store_true',
+ help='Recursive Disk usage of all directories.')
+
+ @with_argparser(du_parser)
+ def do_du(self, args):
+ """
+ Disk Usage of a Directory
+ """
+ if args.dirs == b'':
+ args.dirs = cephfs.getcwd()
+ for dir_ in args.dirs:
+ if args.r:
+ for i in reversed(sorted(set(dirwalk(dir_)))):
+ i = os.path.normpath(i)
+ try:
+ xattr = cephfs.getxattr(i, 'ceph.dir.rbytes')
+ self.poutput('{:10s} {}'.format(
+ humansize(int(xattr.decode('utf-8'))), '.'
+ + i.decode('utf-8')))
+ except libcephfs.Error:
+ continue
+ else:
+ dir_ = os.path.normpath(dir_)
+ self.poutput('{:10s} {}'.format(humansize(int(cephfs.getxattr(
+ dir_, 'ceph.dir.rbytes').decode('utf-8'))), '.'
+ + dir_.decode('utf-8')))
+
+ quota_parser = argparse.ArgumentParser(
+ description='Quota management for a Directory')
+ quota_parser.add_argument('op', choices=['get', 'set'],
+ help='Quota operation type.')
+ quota_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the directory.')
+ quota_parser.add_argument('--max_bytes', type=int, default=-1, nargs='?',
+ help='Max cumulative size of the data under '
+ 'this directory.')
+ quota_parser.add_argument('--max_files', type=int, default=-1, nargs='?',
+                              help='Maximum number of files allowed under '
+                                   'this directory tree.')
+
+ @with_argparser(quota_parser)
+ def do_quota(self, args):
+ """
+ Quota management.
+ """
+ if not is_dir_exists(args.path):
+ self.perror('error: no such directory {}'.format(args.path.decode('utf-8')),
+ end='\n', apply_style=True)
+ return
+
+ if args.op == 'set':
+ if (args.max_bytes == -1) and (args.max_files == -1):
+ self.poutput('please specify either --max_bytes or '
+ '--max_files or both')
+ return
+
+ if args.max_bytes >= 0:
+ max_bytes = to_bytes(str(args.max_bytes))
+ try:
+ cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
+ max_bytes, len(max_bytes),
+ os.XATTR_CREATE)
+ self.poutput('max_bytes set to %d' % args.max_bytes)
+ except libcephfs.Error:
+ cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
+ max_bytes, len(max_bytes),
+ os.XATTR_REPLACE)
+ self.poutput('max_bytes reset to %d' % args.max_bytes)
+
+ if args.max_files >= 0:
+ max_files = to_bytes(str(args.max_files))
+ try:
+ cephfs.setxattr(args.path, 'ceph.quota.max_files',
+ max_files, len(max_files),
+ os.XATTR_CREATE)
+ self.poutput('max_files set to %d' % args.max_files)
+ except libcephfs.Error:
+ cephfs.setxattr(args.path, 'ceph.quota.max_files',
+ max_files, len(max_files),
+ os.XATTR_REPLACE)
+ self.poutput('max_files reset to %d' % args.max_files)
+ elif args.op == 'get':
+ max_bytes = '0'
+ max_files = '0'
+ try:
+ max_bytes = cephfs.getxattr(args.path,
+ 'ceph.quota.max_bytes')
+ self.poutput('max_bytes: %s' % max_bytes)
+ except libcephfs.Error:
+ self.poutput('max_bytes is not set')
+ pass
+
+ try:
+ max_files = cephfs.getxattr(args.path,
+ 'ceph.quota.max_files')
+ self.poutput('max_files: %s' % max_files)
+ except libcephfs.Error:
+ self.poutput('max_files is not set')
+ pass
+
+ def do_help(self, line):
+ """
+ Get details about a command.
+ Usage: help <cmd> - for a specific command
+ help all - for all the commands
+ """
+ if line == 'all':
+ for k in dir(self):
+ if k.startswith('do_'):
+ self.poutput('-'*80)
+ super().do_help(k[3:])
+ return
+ parser = self.create_argparser(line)
+ if parser:
+ parser.print_help()
+ else:
+ super().do_help(line)
+
+ def complete_stat(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ stat_parser = argparse.ArgumentParser(
+ description='Display file or file system status')
+ stat_parser.add_argument('paths', type=str, help='file paths',
+ action=path_to_bytes, nargs='+')
+
+ @with_argparser(stat_parser)
+ def do_stat(self, args):
+ """
+ Display file or file system status
+ """
+ for path in args.paths:
+ try:
+ stat = cephfs.stat(path)
+ atime = stat.st_atime.isoformat(' ')
+ mtime = stat.st_mtime.isoformat(' ')
+                ctime = stat.st_ctime.isoformat(' ')
+
+ self.poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n\
+Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: {:o}/{}\tUid: {:d}\tGid: {:d}\n\
+Access: {}\nModify: {}\nChange: {}".format(path.decode('utf-8'), stat.st_size,
+ stat.st_blocks, stat.st_blksize, stat.st_dev,
+ stat.st_ino, stat.st_nlink, stat.st_mode,
+ mode_notation(stat.st_mode), stat.st_uid,
+ stat.st_gid, atime, mtime, ctime))
+ except libcephfs.Error:
+ self.perror('{}: no such file or directory'.format(path.decode('utf-8')),
+ end='\n', apply_style=True)
+
+
+if __name__ == '__main__':
+ config_file = ''
+ exe = sys.argv[0]
+    main_parser = argparse.ArgumentParser(
+        description='Interactive shell for Ceph File System.')
+ main_parser.add_argument('-c', '--config', action='store',
+ help='Configuration file_path', type=str)
+ main_parser.add_argument(
+ '-b', '--batch', action='store', help='Batch File path.', type=str)
+ main_parser.add_argument('-t', '--test', action='store',
+ help='Test against transcript(s) in FILE',
+ nargs='+')
+ main_parser.add_argument('commands', nargs='*',
+                             help='Comma-delimited commands.', default=[])
+ args = main_parser.parse_args()
+ if args.config:
+ config_file = args.config
+ if args.batch:
+ args.commands = ['load ' + args.batch, ',quit']
+ if args.test:
+ args.commands.extend(['-t,'] + [arg+',' for arg in args.test])
+ sys.argv.clear()
+ sys.argv.append(exe)
+ sys.argv.extend([i.strip() for i in ' '.join(args.commands).split(',')])
+ setup_cephfs(config_file)
+ shell = CephFSShell()
+ shell.cmdloop()
diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc
new file mode 100644
index 00000000..47b475dd
--- /dev/null
+++ b/src/tools/cephfs/cephfs-table-tool.cc
@@ -0,0 +1,47 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "TableTool.h"
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ TableTool::usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ TableTool tt;
+
+ // Connect to mon cluster, download MDS map etc
+ int rc = tt.init();
+ if (rc != 0) {
+ std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+
+ // Finally, execute the user's commands
+ rc = tt.main(args);
+ if (rc != 0) {
+ std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+ }
+
+ return rc;
+}
+
+
diff --git a/src/tools/cephfs/setup.py b/src/tools/cephfs/setup.py
new file mode 100644
index 00000000..8cf7f28f
--- /dev/null
+++ b/src/tools/cephfs/setup.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+__version__ = '0.0.1'
+
+setup(
+ name='cephfs-shell',
+ version=__version__,
+ description='Interactive shell for Ceph file system',
+ keywords='cephfs, shell',
+ scripts=['cephfs-shell'],
+ install_requires=[
+ 'cephfs',
+ 'cmd2',
+ 'colorama',
+ ],
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Environment :: Console',
+ 'Intended Audience :: System Administrators',
+ 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+ 'Operating System :: POSIX :: Linux',
+ 'Programming Language :: Python :: 3'
+ ],
+ license='LGPLv2+',
+)
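+
+# Hedged usage note (not part of the original file): with this setup.py the
+# shell can be installed for development with, for example,
+#   pip install --user -e src/tools/cephfs
+# which puts the 'cephfs-shell' script listed above on PATH.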
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
new file mode 100644
index 00000000..07b8b79a
--- /dev/null
+++ b/src/tools/crushtool.cc
@@ -0,0 +1,1304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <fstream>
+#include <type_traits>
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/Formatter.h"
+
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "osd/OSDMap.h"
+#include "crush/CrushWrapper.h"
+#include "crush/CrushCompiler.h"
+#include "crush/CrushTester.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_crush
+
+
+const char *infn = "stdin";
+
+static int get_fd_data(int fd, bufferlist &bl)
+{
+
+ uint64_t total = 0;
+ do {
+ ssize_t bytes = bl.read_fd(fd, 1024*1024);
+ if (bytes < 0) {
+ cerr << "read_fd error " << cpp_strerror(-bytes) << "\n";
+ return -1;
+ }
+
+ if (bytes == 0)
+ break;
+
+ total += bytes;
+ } while(true);
+
+ ceph_assert(bl.length() == total);
+ return 0;
+}
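+
+// Hedged usage note (not part of the original source): get_fd_data() backs the
+// "read the crush map from stdin" path in main(), e.g.
+//   crushtool -d - < compiled.crushmap
+// where infn == "-" makes main() call get_fd_data(STDIN_FILENO, bl).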
+
+////////////////////////////////////////////////////////////////////////////
+
+void data_analysis_usage()
+{
+cout << "data output from testing routine ...\n";
+cout << " absolute_weights\n";
+cout << " the decimal weight of each OSD\n";
+cout << " data layout: ROW MAJOR\n";
+cout << " OSD id (int), weight (int)\n";
+cout << " batch_device_expected_utilization_all\n";
+cout << " the expected number of objects each OSD should receive per placement batch\n";
+cout << " which may be a decimal value\n";
+cout << " data layout: COLUMN MAJOR\n";
+cout << " round (int), objects expected on OSD 0...OSD n (float)\n";
+cout << " batch_device_utilization_all\n";
+cout << " the number of objects stored on each OSD during each placement round\n";
+cout << " data layout: COLUMN MAJOR\n";
+cout << " round (int), objects stored on OSD 0...OSD n (int)\n";
+cout << " device_utilization_all\n";
+cout << " the number of objects stored on each OSD at the end of placements\n";
+cout << " data_layout: ROW MAJOR\n";
+cout << " OSD id (int), objects stored (int), objects expected (float)\n";
+cout << " device_utilization\n";
+cout << " the number of objects stored on each OSD marked 'up' at the end of placements\n";
+cout << " data_layout: ROW MAJOR\n";
+cout << " OSD id (int), objects stored (int), objects expected (float)\n";
+cout << " placement_information\n";
+cout << " the map of input -> OSD\n";
+cout << " data_layout: ROW MAJOR\n";
+cout << " input (int), OSD's mapped (int)\n";
+cout << " proportional_weights_all\n";
+cout << " the proportional weight of each OSD specified in the CRUSH map\n";
+cout << " data_layout: ROW MAJOR\n";
+cout << " OSD id (int), proportional weight (float)\n";
+cout << " proportional_weights\n";
+cout << " the proportional weight of each 'up' OSD specified in the CRUSH map\n";
+cout << " data_layout: ROW MAJOR\n";
+cout << " OSD id (int), proportional weight (float)\n";
+}
+
+void usage()
+{
+ cout << "usage: crushtool ...\n";
+ cout << "\n";
+ cout << "Display, modify and test a crush map\n";
+ cout << "\n";
+ cout << "There are five stages, running one after the other:\n";
+ cout << "\n";
+ cout << " - input/build\n";
+ cout << " - tunables adjustments\n";
+ cout << " - modifications\n";
+ cout << " - display/test\n";
+ cout << " - output\n";
+ cout << "\n";
+ cout << "Options that are not specific to a stage.\n";
+ cout << "\n";
+ cout << " [--infn|-i infile]\n";
+ cout << " read the crush map from infile\n";
+ cout << "\n";
+ cout << "Options for the input/build stage\n";
+ cout << "\n";
+ cout << " --decompile|-d map decompile a crush map to source\n";
+ cout << " [--outfn|-o outfile]\n";
+  cout << "                         specify output for (de)compilation\n";
+ cout << " --compile|-c map.txt compile a map from source\n";
+ cout << " --enable-unsafe-tunables\n";
+ cout << " compile with unsafe tunables\n";
+ cout << " --build --num_osds N layer1 ...\n";
+ cout << " build a new map, where each 'layer' is\n";
+ cout << " 'name (uniform|straw2|straw|list|tree) size'\n";
+ cout << "\n";
+ cout << "Options for the tunables adjustments stage\n";
+ cout << "\n";
+ cout << " --set-choose-local-tries N\n";
+ cout << " set choose local retries before re-descent\n";
+ cout << " --set-choose-local-fallback-tries N\n";
+ cout << " set choose local retries using fallback\n";
+ cout << " permutation before re-descent\n";
+ cout << " --set-choose-total-tries N\n";
+ cout << " set choose total descent attempts\n";
+ cout << " --set-chooseleaf-descend-once <0|1>\n";
+ cout << " set chooseleaf to (not) retry the recursive descent\n";
+ cout << " --set-chooseleaf-vary-r <0|1>\n";
+ cout << " set chooseleaf to (not) vary r based on parent\n";
+ cout << " --set-chooseleaf-stable <0|1>\n";
+ cout << " set chooseleaf firstn to (not) return stable results\n";
+ cout << "\n";
+ cout << "Options for the modifications stage\n";
+ cout << "\n";
+ cout << " -i mapfn --add-item id weight name [--loc type name ...]\n";
+ cout << " insert an item into the hierarchy at the\n";
+ cout << " given location\n";
+ cout << " -i mapfn --update-item id weight name [--loc type name ...]\n";
+ cout << " insert or move an item into the hierarchy at the\n";
+ cout << " given location\n";
+ cout << " -i mapfn --remove-item name\n"
+ << " remove the given item\n";
+ cout << " -i mapfn --reweight-item name weight\n";
+ cout << " reweight a given item (and adjust ancestor\n"
+ << " weights as needed)\n";
+ cout << " -i mapfn --add-bucket name type [--loc type name ...]\n"
+ << " insert a bucket into the hierarchy at the given\n"
+ << " location\n";
+ cout << " -i mapfn --move name --loc type name ...\n"
+ << " move the given item to specified location\n";
+ cout << " -i mapfn --reweight recalculate all bucket weights\n";
+ cout << " -i mapfn --rebuild-class-roots\n";
+ cout << " rebuild the per-class shadow trees (normally a no-op)\n";
+ cout << " -i mapfn --create-simple-rule name root type mode\n"
+ << " create crush rule <name> to start from <root>,\n"
+ << " replicate across buckets of type <type>, using\n"
+ << " a choose mode of <firstn|indep>\n";
+ cout << " -i mapfn --create-replicated-rule name root type\n"
+ << " create crush rule <name> to start from <root>,\n"
+ << " replicate across buckets of type <type>\n";
+ cout << " --device-class <class>\n";
+ cout << " use device class <class> for new rule\n";
+ cout << " -i mapfn --remove-rule name\n"
+ << " remove the specified crush rule\n";
+ cout << "\n";
+ cout << "Options for the display/test stage\n";
+ cout << "\n";
+ cout << " -f --format the format of --dump, defaults to json-pretty\n";
+ cout << " can be one of json, json-pretty, xml, xml-pretty,\n";
+ cout << " table, table-kv, html, html-pretty\n";
+ cout << " --dump dump the crush map\n";
+ cout << " --tree print map summary as a tree\n";
+ cout << " --check [max_id] check if any item is referencing an unknown name/type\n";
+ cout << " -i mapfn --show-location id\n";
+ cout << " show location for given device id\n";
+ cout << " -i mapfn --test test a range of inputs on the map\n";
+ cout << " [--min-x x] [--max-x x] [--x x]\n";
+ cout << " [--min-rule r] [--max-rule r] [--rule r] [--ruleset rs]\n";
+ cout << " [--num-rep n]\n";
+ cout << " [--pool-id n] specifies pool id\n";
+ cout << " [--batches b] split the CRUSH mapping into b > 1 rounds\n";
+ cout << " [--weight|-w devno weight]\n";
+ cout << " where weight is 0 to 1.0\n";
+ cout << " [--simulate] simulate placements using a random\n";
+ cout << " number generator in place of the CRUSH\n";
+ cout << " algorithm\n";
+ cout << " --show-utilization show OSD usage\n";
+ cout << " --show-utilization-all\n";
+ cout << " include zero weight items\n";
+ cout << " --show-statistics show chi squared statistics\n";
+ cout << " --show-mappings show mappings\n";
+ cout << " --show-bad-mappings show bad mappings\n";
+ cout << " --show-choose-tries show choose tries histogram\n";
+ cout << " --output-name name\n";
+ cout << " prepend the data file(s) generated during the\n";
+ cout << " testing routine with name\n";
+ cout << " --output-csv\n";
+ cout << " export select data generated during testing routine\n";
+ cout << " to CSV files for off-line post-processing\n";
+ cout << " use --help-output for more information\n";
+ cout << " --reclassify transform legacy CRUSH map buckets and rules\n";
+ cout << " by adding classes\n";
+ cout << " --reclassify-bucket <bucket-match> <class> <default-parent>\n";
+ cout << " --reclassify-root <bucket-name> <class>\n";
+ cout << " --set-subtree-class <bucket-name> <class>\n";
+ cout << " set class for all items beneath bucket-name\n";
+ cout << " --compare <otherfile> compare two maps using --test parameters\n";
+ cout << "\n";
+ cout << "Options for the output stage\n";
+ cout << "\n";
+ cout << " [--outfn|-o outfile]\n";
+ cout << " specify output for modified crush map\n";
+ cout << "\n";
+}
+
+struct bucket_types_t {
+ const char *name;
+ int type;
+} bucket_types[] = {
+ { "uniform", CRUSH_BUCKET_UNIFORM },
+ { "list", CRUSH_BUCKET_LIST },
+ { "straw", CRUSH_BUCKET_STRAW },
+ { "straw2", CRUSH_BUCKET_STRAW2 },
+ { "tree", CRUSH_BUCKET_TREE },
+ { 0, 0 },
+};
+
+struct layer_t {
+ const char *name;
+ const char *buckettype;
+ int size;
+};
+
+template<typename... Args>
+bool argparse_withargs(std::vector<const char*> &args,
+ std::vector<const char*>::iterator& i,
+ std::ostream& oss,
+ const char* opt,
+ Args*... opts)
+{
+ if (!ceph_argparse_flag(args, i, opt, nullptr)) {
+ return false;
+ }
+ auto parse = [&](auto& opt) {
+ if (i == args.end()) {
+ oss << "expecting additional argument to " << opt;
+ return false;
+ }
+ using opt_t = std::remove_pointer_t<decay_t<decltype(opt)>>;
+ string err;
+ if constexpr (std::is_same_v<opt_t, string>) {
+ opt->assign(*i);
+ } else if constexpr (is_same_v<opt_t, int>) {
+ *opt = strict_strtol(*i, 10, &err);
+ } else if constexpr (is_same_v<opt_t, float>) {
+ *opt = strict_strtof(*i, &err);
+ }
+ i = args.erase(i);
+ if (err.empty())
+ return true;
+ else {
+ oss << err;
+ return false;
+ }
+ };
+ (... && parse(opts));
+ return true;
+}
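+
+// Hedged sketch (not in the original source): argparse_withargs() consumes one
+// following argument per output pointer, so a call like
+//   std::string name, type;
+//   argparse_withargs(args, i, err, "--add-bucket", &name, &type);
+// fills 'name' and 'type' from the two values after "--add-bucket".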
+
+int do_add_bucket(CephContext* cct,
+ const char* me,
+ CrushWrapper& crush,
+ const string& add_name,
+ const string& add_type,
+ const map<string,string>& add_loc) {
+ int bucketno;
+ if (crush.name_exists(add_name)) {
+ cerr << me << " bucket '" << add_name << "' already exists" << std::endl;
+ return -EEXIST;
+ }
+ int type = crush.get_type_id(add_type);
+ if (type <= 0) {
+ cerr << me << " bad bucket type: " << add_type << std::endl;
+ return -EINVAL;
+ }
+ if (int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT, type, 0, nullptr, nullptr, &bucketno);
+ r < 0) {
+ cerr << me << " unable to add bucket: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (int r = crush.set_item_name(bucketno, add_name); r < 0) {
+ cerr << me << " bad bucket name: " << add_name << std::endl;
+ return r;
+ }
+ if (!add_loc.empty()) {
+ if (!crush.check_item_loc(cct, bucketno, add_loc, (int*)nullptr)) {
+ if (int r = crush.move_bucket(cct, bucketno, add_loc); r < 0) {
+ cerr << me << " error moving bucket '" << add_name << "' to " << add_loc << std::endl;
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+// return 1 for no change, 0 for successful change, negative on error
+int do_move_item(CephContext* cct,
+ const char *me,
+ CrushWrapper& crush,
+ const string& name,
+ const map<string,string>& loc)
+{
+ if (!crush.name_exists(name)) {
+ cerr << me << " item '" << name << "' does not exist" << std::endl;
+ return -ENOENT;
+ }
+ int id = crush.get_item_id(name);
+ if (loc.empty()) {
+ cerr << me << " expecting additional --loc argument to --move" << std::endl;
+ return -EINVAL;
+ }
+ if (crush.check_item_loc(cct, id, loc, (int*)nullptr)) {
+ // it's already there
+ cerr << me << " item '" << name << "' already at " << loc << std::endl;
+ return 1;
+ }
+ if (id >= 0) {
+ switch (int r = crush.create_or_move_item(cct, id, 0, name, loc)) {
+ case 0:
+ return 1;
+ case 1:
+ return 0;
+ default:
+ return r;
+ }
+ } else {
+ return crush.move_bucket(cct, id, loc);
+ }
+}
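+
+// Hedged caller sketch (not in the original source): given the return codes
+// documented above, a caller would typically do something like
+//   int r = do_move_item(cct, me, crush, move_name, add_loc);
+//   if (r == 0)
+//     modified = true;          // map changed; write it out later
+//   else if (r < 0)
+//     return EXIT_FAILURE;      // hard error (r == 1 means already in place)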
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ const char *me = argv[0];
+ std::string infn, srcfn, outfn, add_name, add_type, remove_name, reweight_name;
+ std::string move_name;
+ bool compile = false;
+ bool decompile = false;
+ bool check = false;
+ int max_id = -1;
+ bool test = false;
+ bool display = false;
+ bool tree = false;
+ string dump_format = "json-pretty";
+ bool dump = false;
+ int full_location = -1;
+ bool write_to_file = false;
+ int verbose = 0;
+ bool unsafe_tunables = false;
+
+ bool rebuild_class_roots = false;
+
+ bool reweight = false;
+ int add_item = -1;
+ bool add_bucket = false;
+ bool update_item = false;
+ bool move_item = false;
+ bool add_rule = false;
+ std::string rule_name, rule_root, rule_type, rule_mode, rule_device_class;
+ bool del_rule = false;
+ float add_weight = 0;
+ map<string,string> add_loc;
+ float reweight_weight = 0;
+
+ bool adjust = false;
+
+ int build = 0;
+  int num_osds = 0;
+ vector<layer_t> layers;
+
+ int choose_local_tries = -1;
+ int choose_local_fallback_tries = -1;
+ int choose_total_tries = -1;
+ int chooseleaf_descend_once = -1;
+ int chooseleaf_vary_r = -1;
+ int chooseleaf_stable = -1;
+ int straw_calc_version = -1;
+ int allowed_bucket_algs = -1;
+
+ bool reclassify = false;
+ map<string,pair<string,string>> reclassify_bucket; // %suffix or prefix% -> class, default_root
+ map<string,string> reclassify_root; // bucket -> class
+ map<string,string> set_subtree_class; // bucket -> class
+
+ string compare;
+
+ CrushWrapper crush;
+
+ CrushTester tester(crush, cout);
+
+ // we use -c, don't confuse the generic arg parsing
+ // only parse arguments from CEPH_ARGS, if in the environment
+ vector<const char *> empty_args;
+ auto cct = global_init(NULL, empty_args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ // crushtool times out occasionally when quits. so do not
+ // release the g_ceph_context.
+ cct->get();
+ common_init_finish(g_ceph_context);
+
+ int x;
+ float y;
+ long long z;
+
+ std::string val;
+ std::ostringstream err;
+ int tmp;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "-d", "--decompile", (char*)NULL)) {
+ infn = val;
+ decompile = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "-i", "--infn", (char*)NULL)) {
+ infn = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-o", "--outfn", (char*)NULL)) {
+ outfn = val;
+ } else if (ceph_argparse_flag(args, i, "-v", "--verbose", (char*)NULL)) {
+ verbose += 1;
+ } else if (ceph_argparse_witharg(args, i, &val, "--compare", (char*)NULL)) {
+ compare = val;
+ } else if (ceph_argparse_flag(args, i, "--reclassify", (char*)NULL)) {
+ reclassify = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-bucket",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ string c = *i;
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reclassify_bucket[val] = make_pair(c, *i);
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-root",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reclassify_root[val] = *i;
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--set-subtree-class",
+ (char*)NULL)) {
+ if (i == args.end()) {
+ cerr << "expecting additional argument" << std::endl;
+ return EXIT_FAILURE;
+ }
+ set_subtree_class[val] = *i;
+ i = args.erase(i);
+ } else if (ceph_argparse_flag(args, i, "--tree", (char*)NULL)) {
+ tree = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) {
+ dump_format = val;
+ } else if (ceph_argparse_flag(args, i, "--dump", (char*)NULL)) {
+ dump = true;
+ } else if (ceph_argparse_flag(args, i, "--show_utilization", (char*)NULL)) {
+ display = true;
+ tester.set_output_utilization(true);
+ } else if (ceph_argparse_flag(args, i, "--show_utilization_all", (char*)NULL)) {
+ display = true;
+ tester.set_output_utilization_all(true);
+ } else if (ceph_argparse_flag(args, i, "--show_statistics", (char*)NULL)) {
+ display = true;
+ tester.set_output_statistics(true);
+ } else if (ceph_argparse_flag(args, i, "--show_mappings", (char*)NULL)) {
+ display = true;
+ tester.set_output_mappings(true);
+ } else if (ceph_argparse_flag(args, i, "--show_bad_mappings", (char*)NULL)) {
+ display = true;
+ tester.set_output_bad_mappings(true);
+ } else if (ceph_argparse_flag(args, i, "--show_choose_tries", (char*)NULL)) {
+ display = true;
+ tester.set_output_choose_tries(true);
+ } else if (ceph_argparse_witharg(args, i, &val, "-c", "--compile", (char*)NULL)) {
+ srcfn = val;
+ compile = true;
+ } else if (ceph_argparse_witharg(args, i, &max_id, err, "--check", (char*)NULL)) {
+ check = true;
+ } else if (ceph_argparse_flag(args, i, "-t", "--test", (char*)NULL)) {
+ test = true;
+ } else if (ceph_argparse_witharg(args, i, &full_location, err, "--show-location", (char*)NULL)) {
+ } else if (ceph_argparse_flag(args, i, "-s", "--simulate", (char*)NULL)) {
+ tester.set_random_placement();
+ } else if (ceph_argparse_flag(args, i, "--enable-unsafe-tunables", (char*)NULL)) {
+ unsafe_tunables = true;
+ } else if (ceph_argparse_witharg(args, i, &choose_local_tries, err,
+ "--set_choose_local_tries", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &choose_local_fallback_tries, err,
+ "--set_choose_local_fallback_tries", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &choose_total_tries, err,
+ "--set_choose_total_tries", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &chooseleaf_descend_once, err,
+ "--set_chooseleaf_descend_once", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &chooseleaf_vary_r, err,
+ "--set_chooseleaf_vary_r", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &chooseleaf_stable, err,
+ "--set_chooseleaf_stable", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &straw_calc_version, err,
+ "--set_straw_calc_version", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_witharg(args, i, &allowed_bucket_algs, err,
+ "--set_allowed_bucket_algs", (char*)NULL)) {
+ adjust = true;
+ } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
+ reweight = true;
+ } else if (ceph_argparse_flag(args, i, "--rebuild-class-roots", (char*)NULL)) {
+ rebuild_class_roots = true;
+ } else if (ceph_argparse_witharg(args, i, &add_item, err, "--add_item", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --add-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_weight = atof(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --add-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_name.assign(*i);
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &add_item, err, "--update_item", (char*)NULL)) {
+ update_item = true;
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --update-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_weight = atof(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --update-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_name.assign(*i);
+ i = args.erase(i);
+ } else if (argparse_withargs(args, i, err, "--add-bucket",
+ &add_name, &add_type)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ add_bucket = true;
+ } else if (argparse_withargs(args, i, err, "--move",
+ &move_name)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ move_item = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--create-simple-rule", (char*)NULL)) {
+ rule_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-simple-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_root.assign(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-simple-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_type.assign(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-simple-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_mode.assign(*i);
+ i = args.erase(i);
+
+ cout << "--create-simple-rule:"
+ << " name=" << rule_name
+ << " root=" << rule_root
+ << " type=" << rule_type
+ << " mode=" << rule_mode
+ << std::endl;
+ add_rule = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--create-replicated-rule", (char*)NULL)) {
+ rule_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-replicated-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_root.assign(*i);
+ i = args.erase(i);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --create-replicated-rule" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ rule_type.assign(*i);
+ i = args.erase(i);
+ rule_mode = "firstn";
+
+ cout << "--create-replicated-rule:"
+ << " name=" << rule_name
+ << " root=" << rule_root
+ << " type=" << rule_type
+ << std::endl;
+ add_rule = true;
+
+ } else if (ceph_argparse_witharg(args, i, &val, "--device-class", (char*)NULL)) {
+ rule_device_class.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--remove-rule", (char*)NULL)) {
+ rule_name.assign(val);
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ del_rule = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--loc", (char*)NULL)) {
+ std::string type(val);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --loc" << std::endl;
+ return EXIT_FAILURE;
+ }
+ std::string name(*i);
+ i = args.erase(i);
+ add_loc[type] = name;
+ } else if (ceph_argparse_flag(args, i, "--output-csv", (char*)NULL)) {
+ write_to_file = true;
+ tester.set_output_data_file(true);
+ tester.set_output_csv(true);
+ } else if (ceph_argparse_flag(args, i, "--help-output", (char*)NULL)) {
+ data_analysis_usage();
+ return EXIT_SUCCESS;
+ } else if (ceph_argparse_witharg(args, i, &val, "--output-name", (char*)NULL)) {
+ std::string name(val);
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --output-name" << std::endl;
+ return EXIT_FAILURE;
+ }
+ else {
+ tester.set_output_data_file_name(name + "-");
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--remove_item", (char*)NULL)) {
+ remove_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--reweight_item", (char*)NULL)) {
+ reweight_name = val;
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --reweight-item" << std::endl;
+ return EXIT_FAILURE;
+ }
+ reweight_weight = atof(*i);
+ i = args.erase(i);
+ } else if (ceph_argparse_flag(args, i, "--build", (char*)NULL)) {
+ build = true;
+ } else if (ceph_argparse_witharg(args, i, &num_osds, err, "--num_osds", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--num_rep", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_num_rep(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--max_x", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_max_x(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--min_x", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_min_x(x);
+ } else if (ceph_argparse_witharg(args, i, &z, err, "--pool_id", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_pool_id(z);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--x", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_x(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--max_rule", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_max_rule(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--min_rule", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_min_rule(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--rule", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_rule(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--ruleset", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_ruleset(x);
+ } else if (ceph_argparse_witharg(args, i, &x, err, "--batches", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_batches(x);
+ } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-ratio", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_device_down_ratio(y);
+ } else if (ceph_argparse_witharg(args, i, &y, err, "--mark-down-bucket-ratio", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ tester.set_bucket_down_ratio(y);
+ } else if (ceph_argparse_witharg(args, i, &tmp, err, "--weight", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ int dev = tmp;
+ if (i == args.end()) {
+ cerr << "expecting additional argument to --weight" << std::endl;
+ return EXIT_FAILURE;
+ }
+ float f = atof(*i);
+ i = args.erase(i);
+ tester.set_device_weight(dev, f);
+ }
+ else {
+ ++i;
+ }
+ }
+
+ if (test && !check && !display && !write_to_file && compare.empty()) {
+ cerr << "WARNING: no output selected; use --output-csv or --show-X" << std::endl;
+ }
+
+ if (decompile + compile + build > 1) {
+ cerr << "cannot specify more than one of compile, decompile, and build" << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (!check && !compile && !decompile && !build && !test && !reweight && !adjust && !tree && !dump &&
+ add_item < 0 && !add_bucket && !move_item && !add_rule && !del_rule && full_location < 0 &&
+ !reclassify && !rebuild_class_roots &&
+ compare.empty() &&
+ remove_name.empty() && reweight_name.empty()) {
+ cerr << "no action specified; -h for help" << std::endl;
+ return EXIT_FAILURE;
+ }
+ if ((!build) && (!args.empty())) {
+ cerr << "unrecognized arguments: " << args << std::endl;
+ return EXIT_FAILURE;
+ }
+ else {
+ if ((args.size() % 3) != 0U) {
+ cerr << "remaining args: " << args << std::endl;
+ cerr << "layers must be specified with 3-tuples of (name, buckettype, size)"
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ for (size_t j = 0; j < args.size(); j += 3) {
+ layer_t l;
+ l.name = args[j];
+ l.buckettype = args[j+1];
+ l.size = atoi(args[j+2]);
+ layers.push_back(l);
+ }
+ }
+
+ /*
+ if (outfn) cout << "outfn " << outfn << std::endl;
+ if (cinfn) cout << "cinfn " << cinfn << std::endl;
+ if (dinfn) cout << "dinfn " << dinfn << std::endl;
+ */
+
+ bool modified = false;
+
+ // input ----
+
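+  // read an existing crush map from <infn>; '-' reads the map from stdin (which must not be a tty)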
+ if (!infn.empty()) {
+ bufferlist bl;
+ std::string error;
+
+ int r = 0;
+ if (infn == "-") {
+ if (isatty(STDIN_FILENO)) {
+ cerr << "stdin must not be from a tty" << std::endl;
+ return EXIT_FAILURE;
+ }
+ r = get_fd_data(STDIN_FILENO, bl);
+ if (r < 0) {
+ cerr << "error reading data from STDIN" << std::endl;
+ return EXIT_FAILURE;
+ }
+ } else {
+ r = bl.read_file(infn.c_str(), &error);
+ if (r < 0) {
+ cerr << me << ": error reading '" << infn << "': "
+ << error << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+ auto p = bl.cbegin();
+ try {
+ crush.decode(p);
+ } catch(...) {
+ cerr << me << ": unable to decode " << infn << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (compile) {
+ crush.create();
+
+ // read the file
+ ifstream in(srcfn.c_str());
+ if (!in.is_open()) {
+ cerr << "input file " << srcfn << " not found" << std::endl;
+ return -ENOENT;
+ }
+
+ CrushCompiler cc(crush, cerr, verbose);
+ if (unsafe_tunables)
+ cc.enable_unsafe_tunables();
+ int r = cc.compile(in, srcfn.c_str());
+ if (r < 0)
+ return EXIT_FAILURE;
+
+ modified = true;
+ }
+
+ if (build) {
+ if (layers.empty()) {
+ cerr << me << ": must specify at least one layer" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ crush.create();
+
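+    // start from the leaves: every osd gets weight 1.0 (0x10000 in 16.16 fixed point) and the name "osd.N"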
+ vector<int> lower_items;
+ vector<int> lower_weights;
+
+ crush.set_max_devices(num_osds);
+ for (int i=0; i<num_osds; i++) {
+ lower_items.push_back(i);
+ lower_weights.push_back(0x10000);
+ crush.set_item_name(i, "osd." + stringify(i));
+ }
+
+ crush.set_type_name(0, "osd");
+ int type = 1;
+ for (vector<layer_t>::iterator p = layers.begin(); p != layers.end(); ++p, type++) {
+ layer_t &l = *p;
+
+ dout(2) << "layer " << type
+ << " " << l.name
+ << " bucket type " << l.buckettype
+ << " " << l.size
+ << dendl;
+
+ crush.set_type_name(type, l.name);
+
+ int buckettype = -1;
+ for (int i = 0; bucket_types[i].name; i++)
+ if (l.buckettype && strcmp(l.buckettype, bucket_types[i].name) == 0) {
+ buckettype = bucket_types[i].type;
+ break;
+ }
+ if (buckettype < 0) {
+ cerr << "unknown bucket type '" << l.buckettype << "'" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ // build items
+ vector<int> cur_items;
+ vector<int> cur_weights;
+ unsigned lower_pos = 0; // lower pos
+
+ dout(2) << "lower_items " << lower_items << dendl;
+ dout(2) << "lower_weights " << lower_weights << dendl;
+
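+      // group the previous layer's items into buckets of this layer's size (a size of 0 puts everything into a single bucket)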
+ int i = 0;
+ while (1) {
+ if (lower_pos == lower_items.size())
+ break;
+
+ int items[num_osds];
+ int weights[num_osds];
+
+ int weight = 0;
+ int j;
+ for (j=0; j<l.size || l.size==0; j++) {
+ if (lower_pos == lower_items.size())
+ break;
+ items[j] = lower_items[lower_pos];
+ weights[j] = lower_weights[lower_pos];
+ weight += weights[j];
+ lower_pos++;
+ dout(2) << " item " << items[j] << " weight " << weights[j] << dendl;
+ }
+
+ int id;
+ int r = crush.add_bucket(0, buckettype, CRUSH_HASH_DEFAULT, type, j, items, weights, &id);
+ if (r < 0) {
+ cerr << " Couldn't add bucket: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ char format[20];
+ format[sizeof(format)-1] = '\0';
+ if (l.size)
+ snprintf(format, sizeof(format)-1, "%s%%d", l.name);
+ else
+ strncpy(format, l.name, sizeof(format)-1);
+ char name[20];
+ snprintf(name, sizeof(name), format, i);
+ crush.set_item_name(id, name);
+
+ dout(2) << " in bucket " << id << " '" << name << "' size " << j << " weight " << weight << dendl;
+
+ cur_items.push_back(id);
+ cur_weights.push_back(weight);
+ i++;
+ }
+
+ lower_items.swap(cur_items);
+ lower_weights.swap(cur_weights);
+ }
+
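+    // the root is the last layer's bucket: its bare name when the layer size is 0, otherwise the first numbered bucket "<name>0"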
+ string root = layers.back().size == 0 ? layers.back().name :
+ string(layers.back().name) + "0";
+
+ {
+ set<int> roots;
+ crush.find_roots(&roots);
+ if (roots.size() > 1) {
+ cerr << "The crush rulesets will use the root " << root << "\n"
+ << "and ignore the others.\n"
+ << "There are " << roots.size() << " roots, they can be\n"
+ << "grouped into a single root by appending something like:\n"
+ << " root straw 0\n"
+ << std::endl;
+ }
+ }
+
+ if (OSDMap::build_simple_crush_rules(g_ceph_context, crush, root, &cerr))
+ return EXIT_FAILURE;
+
+ modified = true;
+ }
+
+ // mutate ----
+
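+  // apply any tunable overrides given on the command line; each one marks the map as modified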
+ if (choose_local_tries >= 0) {
+ crush.set_choose_local_tries(choose_local_tries);
+ modified = true;
+ }
+ if (choose_local_fallback_tries >= 0) {
+ crush.set_choose_local_fallback_tries(choose_local_fallback_tries);
+ modified = true;
+ }
+ if (choose_total_tries >= 0) {
+ crush.set_choose_total_tries(choose_total_tries);
+ modified = true;
+ }
+ if (chooseleaf_descend_once >= 0) {
+ crush.set_chooseleaf_descend_once(chooseleaf_descend_once);
+ modified = true;
+ }
+ if (chooseleaf_vary_r >= 0) {
+ crush.set_chooseleaf_vary_r(chooseleaf_vary_r);
+ modified = true;
+ }
+ if (chooseleaf_stable >= 0) {
+ crush.set_chooseleaf_stable(chooseleaf_stable);
+ modified = true;
+ }
+ if (straw_calc_version >= 0) {
+ crush.set_straw_calc_version(straw_calc_version);
+ modified = true;
+ }
+ if (allowed_bucket_algs >= 0) {
+ crush.set_allowed_bucket_algs(allowed_bucket_algs);
+ modified = true;
+ }
+
+ if (!reweight_name.empty()) {
+ cout << me << " reweighting item " << reweight_name << " to " << reweight_weight << std::endl;
+ int r;
+ if (!crush.name_exists(reweight_name)) {
+ cerr << " name " << reweight_name << " dne" << std::endl;
+ r = -ENOENT;
+ } else {
+ int item = crush.get_item_id(reweight_name);
+ r = crush.adjust_item_weightf(g_ceph_context, item, reweight_weight);
+ }
+ if (r >= 0)
+ modified = true;
+ else {
+ cerr << me << " " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (!remove_name.empty()) {
+ cout << me << " removing item " << remove_name << std::endl;
+ int r;
+ if (!crush.name_exists(remove_name)) {
+ cerr << " name " << remove_name << " dne" << std::endl;
+ r = -ENOENT;
+ } else {
+ int remove_item = crush.get_item_id(remove_name);
+ r = crush.remove_item(g_ceph_context, remove_item, false);
+ }
+ if (r == 0)
+ modified = true;
+ else {
+ cerr << me << " " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (add_item >= 0) {
+ int r;
+ if (update_item) {
+ r = crush.update_item(g_ceph_context, add_item, add_weight, add_name.c_str(), add_loc);
+ } else {
+ r = crush.insert_item(g_ceph_context, add_item, add_weight, add_name.c_str(), add_loc);
+ }
+ if (r >= 0) {
+ modified = true;
+ } else {
+ cerr << me << " " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ if (add_bucket) {
+ if (int r = do_add_bucket(cct.get(), me, crush, add_name, add_type, add_loc); !r) {
+ modified = true;
+ } else {
+ return r;
+ }
+ }
+
+ if (move_item) {
+ if (int r = do_move_item(cct.get(), me, crush, move_name, add_loc); !r) {
+ modified = true;
+ } else {
+ return r;
+ }
+ }
+ if (add_rule) {
+ if (crush.rule_exists(rule_name)) {
+ cerr << "rule " << rule_name << " already exists" << std::endl;
+ return EXIT_FAILURE;
+ }
+ int r = crush.add_simple_rule(rule_name, rule_root, rule_type,
+ rule_device_class,
+ rule_mode, pg_pool_t::TYPE_REPLICATED, &err);
+ if (r < 0) {
+ cerr << err.str() << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ if (del_rule) {
+ if (!crush.rule_exists(rule_name)) {
+ cerr << "rule " << rule_name << " does not exist" << std::endl;
+ return 0;
+ }
+ int ruleno = crush.get_rule_id(rule_name);
+ ceph_assert(ruleno >= 0);
+ int r = crush.remove_rule(ruleno);
+ if (r < 0) {
+ cerr << "fail to remove rule " << rule_name << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ if (reweight) {
+ crush.reweight(g_ceph_context);
+ modified = true;
+ }
+ if (rebuild_class_roots) {
+ int r = crush.rebuild_roots_with_classes(g_ceph_context);
+ if (r < 0) {
+ cerr << "failed to rebuidl roots with classes" << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ for (auto& i : set_subtree_class) {
+ crush.set_subtree_class(i.first, i.second);
+ modified = true;
+ }
+ if (reclassify) {
+ int r = crush.reclassify(
+ g_ceph_context,
+ cout,
+ reclassify_root,
+ reclassify_bucket);
+ if (r < 0) {
+ cerr << "failed to reclassify map" << std::endl;
+ return EXIT_FAILURE;
+ }
+ modified = true;
+ }
+
+ // display ---
+ if (full_location >= 0) {
+ map<string, string> loc = crush.get_full_location(full_location);
+ for (map<string,string>::iterator p = loc.begin();
+ p != loc.end();
+ ++p) {
+ cout << p->first << "\t" << p->second << std::endl;
+ }
+ }
+
+ if (tree) {
+ crush.dump_tree(&cout, NULL, {}, true);
+ }
+
+ if (dump) {
+ boost::scoped_ptr<Formatter> f(Formatter::create(dump_format, "json-pretty", "json-pretty"));
+ f->open_object_section("crush_map");
+ crush.dump(f.get());
+ f->close_section();
+ f->flush(cout);
+ cout << "\n";
+ }
+
+ if (decompile) {
+ CrushCompiler cc(crush, cerr, verbose);
+ if (!outfn.empty()) {
+ ofstream o;
+ o.open(outfn.c_str(), ios::out | ios::binary | ios::trunc);
+ if (!o.is_open()) {
+ cerr << me << ": error writing '" << outfn << "'" << std::endl;
+ return EXIT_FAILURE;
+ }
+ cc.decompile(o);
+ o.close();
+ } else {
+ cc.decompile(cout);
+ }
+ }
+
+ if (check) {
+ tester.check_overlapped_rules();
+ if (max_id >= 0) {
+ if (!tester.check_name_maps(max_id)) {
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ if (test) {
+ if (tester.get_output_utilization_all() ||
+ tester.get_output_utilization())
+ tester.set_output_statistics(true);
+
+ int r = tester.test();
+ if (r < 0)
+ return EXIT_FAILURE;
+ }
+
+ if (compare.size()) {
+ CrushWrapper crush2;
+ bufferlist in;
+ string error;
+ int r = in.read_file(compare.c_str(), &error);
+ if (r < 0) {
+ cerr << me << ": error reading '" << compare << "': "
+ << error << std::endl;
+ return EXIT_FAILURE;
+ }
+ auto p = in.cbegin();
+ try {
+ crush2.decode(p);
+ } catch(...) {
+ cerr << me << ": unable to decode " << compare << std::endl;
+ return EXIT_FAILURE;
+ }
+ r = tester.compare(crush2);
+ if (r < 0)
+ return EXIT_FAILURE;
+ }
+
+ // output ---
+ if (modified) {
+ crush.finalize();
+
+ if (outfn.empty()) {
+ cout << me << " successfully built or modified map. Use '-o <file>' to write it out." << std::endl;
+ } else {
+ bufferlist bl;
+ crush.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ int r = bl.write_file(outfn.c_str());
+ if (r < 0) {
+ cerr << me << ": error writing '" << outfn << "': " << cpp_strerror(r) << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (verbose)
+ cout << "wrote crush map to " << outfn << std::endl;
+ }
+ }
+
+ return 0;
+}
+/*
+ * Local Variables:
+ * compile-command: "cd .. ; make crushtool && test/run-cli-tests"
+ * End:
+ */
diff --git a/src/tools/histogram_dump.py b/src/tools/histogram_dump.py
new file mode 100755
index 00000000..bafc24b0
--- /dev/null
+++ b/src/tools/histogram_dump.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# coding: utf-8
+#
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2017 OVH
+#
+# This is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License version 2, as published by the Free Software
+# Foundation. See file COPYING.
+#
+
+import json
+import subprocess
+import time
+import os
+import argparse
+
+
+def shorten(val):
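+    """Abbreviate a count with a k/M/G/T suffix so it fits a 4-character column."""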
+ if isinstance(val, str):
+ return val
+ for u in ((3, ''), (6, 'k'), (9, 'M'), (12, 'G'), (15, 'T')):
+ if val < 10**u[0]:
+ return "{}{}".format(int(val / (10 ** (u[0]-3))), u[1])
+ return val
+
+
+def print_histogram(asok, logger, counter, last):
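+    """Dump the perf histogram via the admin socket and return the new sample plus the rendered delta against the previous one."""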
+
+ try:
+ out = subprocess.check_output(
+ "ceph --admin-daemon {} perf histogram dump".format(asok),
+ shell=True)
+ j = json.loads(out.decode('utf-8'))
+ except Exception as e:
+ return (last,
+ "Couldn't connect to admin socket, result: \n{}".format(e))
+
+    current = j[logger][counter]['values']
+    axes = j[logger][counter]['axes']
+ content = ""
+
+ content += "{}:\n".format(axes[1]['name'])
+ for r in axes[1]['ranges']:
+ content += "{0: >4} ".format(
+ shorten(r['min']) if 'min' in r else '')
+ content += "\n"
+ for r in axes[1]['ranges']:
+ content += "{0: >4} ".format(
+ shorten(r['max']) if 'max' in r else '')
+ content += "\n"
+
+ content += ("{0: >"+str(len(axes[1]['ranges'])*5+14)+"}:\n").format(
+ axes[0]['name'])
+
+    for i in range(len(current)):
+        for col in range(len(current[i])):
+            try:
+                diff = current[i][col] - last[i][col]
+            except IndexError:
+                diff = '-'
+            content += "{0: >4} ".format(shorten(diff))
+
+ r = axes[0]['ranges'][i]
+ content += "{0: >6} : {1}\n".format(
+ shorten(r['min']) if 'min' in r else '',
+ shorten(r['max']) if 'max' in r else '')
+ return (current, content)
+
+
+def loop_print(asok, logger, counter):
+ last = []
+ while True:
+
+ last, content = print_histogram(asok, logger, counter, last)
+ print("{}{}".format("\n"*100, content))
+ time.sleep(1)
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Continuously display ceph performance histogram')
+ parser.add_argument(
+ '--asok',
+ type=str,
+ default='/var/run/ceph/*.asok',
+ help='Path to asok file, can use wildcards')
+ parser.add_argument(
+ '--logger',
+ type=str,
+ default='osd')
+ parser.add_argument(
+ '--counter',
+ type=str,
+ default='op_w_latency_in_bytes_histogram')
+ args = parser.parse_args()
+
+ loop_print(args.asok, args.logger, args.counter)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/tools/kvstore_tool.cc b/src/tools/kvstore_tool.cc
new file mode 100644
index 00000000..ed33b29c
--- /dev/null
+++ b/src/tools/kvstore_tool.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "kvstore_tool.h"
+
+#include <iostream>
+
+#include "common/errno.h"
+#include "common/url_escape.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+
+StoreTool::StoreTool(const string& type,
+ const string& path,
+ bool need_open_db,
+ bool need_stats)
+ : store_path(path)
+{
+
+ if (need_stats) {
+ g_conf()->rocksdb_perf = true;
+ g_conf()->rocksdb_collect_compaction_stats = true;
+ }
+
+ if (type == "bluestore-kv") {
+#ifdef WITH_BLUESTORE
+ if (load_bluestore(path, need_open_db) != 0)
+ exit(1);
+#else
+ cerr << "bluestore not compiled in" << std::endl;
+ exit(1);
+#endif
+ } else {
+ auto db_ptr = KeyValueDB::create(g_ceph_context, type, path);
+ if (need_open_db) {
+ if (int r = db_ptr->open(std::cerr); r < 0) {
+ cerr << "failed to open type " << type << " path " << path << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
+ db.reset(db_ptr);
+ }
+ }
+}
+
+int StoreTool::load_bluestore(const string& path, bool need_open_db)
+{
+ auto bluestore = new BlueStore(g_ceph_context, path);
+ KeyValueDB *db_ptr;
+ int r = bluestore->start_kv_only(&db_ptr, need_open_db);
+ if (r < 0) {
+ return -EINVAL;
+ }
+ db = decltype(db){db_ptr, Deleter(bluestore)};
+ return 0;
+}
+
+uint32_t StoreTool::traverse(const string& prefix,
+ const bool do_crc,
+ const bool do_value_dump,
+ ostream *out)
+{
+ KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+
+ if (prefix.empty())
+ iter->seek_to_first();
+ else
+ iter->seek_to_first(prefix);
+
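+  // when do_crc is set, accumulate a running crc over every key/value visited and also print a per-entry crc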
+ uint32_t crc = -1;
+
+ while (iter->valid()) {
+ pair<string,string> rk = iter->raw_key();
+ if (!prefix.empty() && (rk.first != prefix))
+ break;
+
+ if (out)
+ *out << url_escape(rk.first) << "\t" << url_escape(rk.second);
+ if (do_crc) {
+ bufferlist bl;
+ bl.append(rk.first);
+ bl.append(rk.second);
+ bl.append(iter->value());
+
+ crc = bl.crc32c(crc);
+ if (out) {
+ *out << "\t" << bl.crc32c(0);
+ }
+ }
+ if (out)
+ *out << std::endl;
+ if (out && do_value_dump) {
+ bufferptr bp = iter->value_as_ptr();
+ bufferlist value;
+ value.append(bp);
+ ostringstream os;
+ value.hexdump(os);
+ std::cout << os.str() << std::endl;
+ }
+ iter->next();
+ }
+
+ return crc;
+}
+
+void StoreTool::list(const string& prefix, const bool do_crc,
+ const bool do_value_dump)
+{
+  traverse(prefix, do_crc, do_value_dump, &std::cout);
+}
+
+bool StoreTool::exists(const string& prefix)
+{
+ ceph_assert(!prefix.empty());
+ KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+ iter->seek_to_first(prefix);
+ return (iter->valid() && (iter->raw_key().first == prefix));
+}
+
+bool StoreTool::exists(const string& prefix, const string& key)
+{
+ ceph_assert(!prefix.empty());
+
+ if (key.empty()) {
+ return exists(prefix);
+ }
+ bool exists = false;
+ get(prefix, key, exists);
+ return exists;
+}
+
+bufferlist StoreTool::get(const string& prefix,
+ const string& key,
+ bool& exists)
+{
+ ceph_assert(!prefix.empty() && !key.empty());
+
+ map<string,bufferlist> result;
+ std::set<std::string> keys;
+ keys.insert(key);
+ db->get(prefix, keys, &result);
+
+ if (result.count(key) > 0) {
+ exists = true;
+ return result[key];
+ } else {
+ exists = false;
+ return bufferlist();
+ }
+}
+
+uint64_t StoreTool::get_size()
+{
+ map<string,uint64_t> extras;
+ uint64_t s = db->get_estimated_size(extras);
+ for (auto& [name, size] : extras) {
+ std::cout << name << " - " << size << std::endl;
+ }
+ std::cout << "total: " << s << std::endl;
+ return s;
+}
+
+bool StoreTool::set(const string &prefix, const string &key, bufferlist &val)
+{
+ ceph_assert(!prefix.empty());
+ ceph_assert(!key.empty());
+ ceph_assert(val.length() > 0);
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->set(prefix, key, val);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+}
+
+bool StoreTool::rm(const string& prefix, const string& key)
+{
+ ceph_assert(!prefix.empty());
+ ceph_assert(!key.empty());
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->rmkey(prefix, key);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+}
+
+bool StoreTool::rm_prefix(const string& prefix)
+{
+ ceph_assert(!prefix.empty());
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->rmkeys_by_prefix(prefix);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+}
+
+void StoreTool::print_summary(const uint64_t total_keys, const uint64_t total_size,
+ const uint64_t total_txs, const string& store_path,
+ const string& other_path, const int duration) const
+{
+ std::cout << "summary:" << std::endl;
+ std::cout << " copied " << total_keys << " keys" << std::endl;
+ std::cout << " used " << total_txs << " transactions" << std::endl;
+ std::cout << " total size " << byte_u_t(total_size) << std::endl;
+ std::cout << " from '" << store_path << "' to '" << other_path << "'"
+ << std::endl;
+ std::cout << " duration " << duration << " seconds" << std::endl;
+}
+
+int StoreTool::print_stats() const
+{
+ ostringstream ostr;
+ Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty");
+ int ret = -1;
+ if (g_conf()->rocksdb_perf) {
+ db->get_statistics(f);
+ ostr << "db_statistics ";
+ f->flush(ostr);
+ ret = 0;
+ } else {
+ ostr << "db_statistics not enabled";
+ f->flush(ostr);
+ }
+ std::cout << ostr.str() << std::endl;
+ delete f;
+ return ret;
+}
+
+int StoreTool::copy_store_to(const string& type, const string& other_path,
+ const int num_keys_per_tx,
+ const string& other_type)
+{
+ if (num_keys_per_tx <= 0) {
+ std::cerr << "must specify a number of keys/tx > 0" << std::endl;
+ return -EINVAL;
+ }
+
+ // open or create a leveldb store at @p other_path
+ boost::scoped_ptr<KeyValueDB> other;
+ KeyValueDB *other_ptr = KeyValueDB::create(g_ceph_context,
+ other_type,
+ other_path);
+ if (int err = other_ptr->create_and_open(std::cerr); err < 0) {
+ return err;
+ }
+ other.reset(other_ptr);
+
+ KeyValueDB::WholeSpaceIterator it = db->get_wholespace_iterator();
+ it->seek_to_first();
+ uint64_t total_keys = 0;
+ uint64_t total_size = 0;
+ uint64_t total_txs = 0;
+
+ auto duration = [start=coarse_mono_clock::now()] {
+ const auto now = coarse_mono_clock::now();
+ auto seconds = std::chrono::duration<double>(now - start);
+ return seconds.count();
+ };
+
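+  // copy key/value pairs in batches of num_keys_per_tx per transaction, syncing each batch and reporting progress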
+ do {
+ int num_keys = 0;
+
+ KeyValueDB::Transaction tx = other->get_transaction();
+
+ while (it->valid() && num_keys < num_keys_per_tx) {
+ auto [prefix, key] = it->raw_key();
+ bufferlist v = it->value();
+ tx->set(prefix, key, v);
+
+ num_keys++;
+ total_size += v.length();
+
+ it->next();
+ }
+
+ total_txs++;
+ total_keys += num_keys;
+
+ if (num_keys > 0)
+ other->submit_transaction_sync(tx);
+
+ std::cout << "ts = " << duration() << "s, copied " << total_keys
+ << " keys so far (" << byte_u_t(total_size) << ")"
+ << std::endl;
+
+ } while (it->valid());
+
+ print_summary(total_keys, total_size, total_txs, store_path, other_path,
+ duration());
+
+ return 0;
+}
+
+void StoreTool::compact()
+{
+ db->compact();
+}
+
+void StoreTool::compact_prefix(const string& prefix)
+{
+ db->compact_prefix(prefix);
+}
+
+void StoreTool::compact_range(const string& prefix,
+ const string& start,
+ const string& end)
+{
+ db->compact_range(prefix, start, end);
+}
+
+int StoreTool::destructive_repair()
+{
+ return db->repair(std::cout);
+}
diff --git a/src/tools/kvstore_tool.h b/src/tools/kvstore_tool.h
new file mode 100644
index 00000000..d8c89661
--- /dev/null
+++ b/src/tools/kvstore_tool.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "acconfig.h"
+#include "include/buffer_fwd.h"
+#ifdef WITH_BLUESTORE
+#include "os/bluestore/BlueStore.h"
+#endif
+
+class KeyValueDB;
+
+class StoreTool
+{
+#ifdef WITH_BLUESTORE
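+  // when the db was opened through BlueStore, tearing it down means unmounting and deleting the BlueStore instance rather than deleting the KeyValueDB directly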
+ struct Deleter {
+ BlueStore *bluestore;
+ Deleter()
+ : bluestore(nullptr) {}
+ Deleter(BlueStore *store)
+ : bluestore(store) {}
+ void operator()(KeyValueDB *db) {
+ if (bluestore) {
+ bluestore->umount();
+ delete bluestore;
+ } else {
+ delete db;
+ }
+ }
+ };
+ std::unique_ptr<KeyValueDB, Deleter> db;
+#else
+ std::unique_ptr<KeyValueDB> db;
+#endif
+
+ const std::string store_path;
+
+public:
+ StoreTool(const std::string& type,
+ const std::string& path,
+ bool need_open_db = true,
+ bool need_stats = false);
+ int load_bluestore(const std::string& path, bool need_open_db);
+ uint32_t traverse(const std::string& prefix,
+ const bool do_crc,
+ const bool do_value_dump,
+ ostream *out);
+ void list(const std::string& prefix,
+ const bool do_crc,
+ const bool do_value_dump);
+ bool exists(const std::string& prefix);
+ bool exists(const std::string& prefix, const std::string& key);
+ ceph::bufferlist get(const std::string& prefix,
+ const std::string& key,
+ bool& exists);
+ uint64_t get_size();
+ bool set(const std::string& prefix,
+ const std::string& key,
+ ceph::bufferlist& val);
+ bool rm(const std::string& prefix, const std::string& key);
+ bool rm_prefix(const std::string& prefix);
+ void print_summary(const uint64_t total_keys, const uint64_t total_size,
+ const uint64_t total_txs, const std::string& store_path,
+ const std::string& other_path, const int duration) const;
+ int copy_store_to(const std::string& type, const std::string& other_path,
+ const int num_keys_per_tx, const std::string& other_type);
+ void compact();
+ void compact_prefix(const std::string& prefix);
+ void compact_range(const std::string& prefix,
+ const std::string& start,
+ const std::string& end);
+ int destructive_repair();
+
+ int print_stats() const;
+};
diff --git a/src/tools/monmaptool.cc b/src/tools/monmaptool.cc
new file mode 100644
index 00000000..ef819a3a
--- /dev/null
+++ b/src/tools/monmaptool.cc
@@ -0,0 +1,473 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <string>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+#include "include/str_list.h"
+#include "mon/MonMap.h"
+
+
+void usage()
+{
+ cout << "usage: monmaptool [--print] [--create [--clobber] [--fsid uuid]]\n"
+ << " [--enable-all-features]\n"
+ << " [--generate] [--set-initial-members]\n"
+ << " [--add name 1.2.3.4:567] [--rm name]\n"
+ << " [--feature-list [plain|parseable]]\n"
+ << " [--feature-set <value> [--optional|--persistent]]\n"
+ << " [--feature-unset <value> [--optional|--persistent]]\n"
+ << " [--set-min-mon-release <release-major-number>]\n"
+ << " <mapfilename>"
+ << std::endl;
+}
+
+void helpful_exit()
+{
+ cerr << "monmaptool -h for usage" << std::endl;
+ exit(1);
+}
+
+struct feature_op_t {
+ enum type_t {
+ PERSISTENT,
+ OPTIONAL,
+ PLAIN,
+ PARSEABLE,
+ NONE
+ };
+
+ enum op_t {
+ OP_SET,
+ OP_UNSET,
+ OP_LIST
+ };
+
+ op_t op;
+ type_t type;
+ mon_feature_t feature;
+
+ feature_op_t() : op(OP_LIST), type(NONE) { }
+ // default to 'persistent' feature if not specified
+ feature_op_t(op_t o) : op(o), type(PERSISTENT) { }
+ feature_op_t(op_t o, type_t t) : op(o), type(t) { }
+ feature_op_t(op_t o, type_t t, mon_feature_t &f) :
+    op(o), type(t), feature(f) { }
+
+ void set_optional() {
+ type = OPTIONAL;
+ }
+ void set_persistent() {
+ type = PERSISTENT;
+ }
+ bool parse_value(string &s, ostream *errout = NULL) {
+
+ feature = ceph::features::mon::get_feature_by_name(s);
+ if (feature != ceph::features::mon::FEATURE_NONE) {
+ return true;
+ }
+
+ // try parsing as numerical value
+ uint64_t feature_val;
+ string interr;
+ feature_val = strict_strtoll(s.c_str(), 10, &interr);
+ if (!interr.empty()) {
+ if (errout) {
+ *errout << "unknown features name '" << s
+ << "' or unable to parse value: " << interr << std::endl;
+ }
+ return false;
+ }
+ feature = mon_feature_t(feature_val);
+ return true;
+ }
+};
+
+void features_list(feature_op_t &f, MonMap &m)
+{
+ if (f.type == feature_op_t::type_t::PLAIN) {
+
+ cout << "MONMAP FEATURES:" << std::endl;
+ cout << " persistent: ";
+ m.persistent_features.print_with_value(cout);
+ cout << std::endl;
+ cout << " optional: ";
+ m.optional_features.print_with_value(cout);
+ cout << std::endl;
+ cout << " required: ";
+ m.get_required_features().print_with_value(cout);
+ cout << std::endl;
+
+ cout << std::endl;
+ cout << "AVAILABLE FEATURES:" << std::endl;
+ cout << " supported: ";
+ ceph::features::mon::get_supported().print_with_value(cout);
+ cout << std::endl;
+ cout << " persistent: ";
+ ceph::features::mon::get_persistent().print_with_value(cout);
+ cout << std::endl;
+ } else if (f.type == feature_op_t::type_t::PARSEABLE) {
+
+ cout << "monmap:persistent:";
+ m.persistent_features.print_with_value(cout);
+ cout << std::endl;
+ cout << "monmap:optional:";
+ m.optional_features.print_with_value(cout);
+ cout << std::endl;
+ cout << "monmap:required:";
+ m.get_required_features().print_with_value(cout);
+ cout << std::endl;
+ cout << "available:supported:";
+ ceph::features::mon::get_supported().print_with_value(cout);
+ cout << std::endl;
+ cout << "available:persistent:";
+ ceph::features::mon::get_persistent().print_with_value(cout);
+ cout << std::endl;
+ }
+}
+
+bool handle_features(list<feature_op_t>& lst, MonMap &m)
+{
+ if (lst.empty())
+ return false;
+
+ bool modified = false;
+
+ for (auto &f : lst) {
+ if (f.op == feature_op_t::op_t::OP_LIST) {
+ features_list(f, m);
+ } else if (f.op == feature_op_t::op_t::OP_SET ||
+ f.op == feature_op_t::op_t::OP_UNSET) {
+
+ modified = true;
+
+ mon_feature_t &target =
+ ( f.type == feature_op_t::type_t::OPTIONAL ?
+ m.optional_features : m.persistent_features );
+
+ if (f.op == feature_op_t::op_t::OP_SET) {
+ target.set_feature(f.feature);
+ } else {
+ target.unset_feature(f.feature);
+ }
+ } else {
+ cerr << "unknown feature operation type '" << f.op << "'" << std::endl;
+ }
+ }
+ return modified;
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ const char *me = argv[0];
+
+ std::string fn;
+ bool print = false;
+ bool create = false;
+ bool enable_all_features = false;
+ bool clobber = false;
+ bool modified = false;
+ bool show_features = false;
+ bool generate = false;
+ bool filter = false;
+ int min_mon_release = -1;
+ map<string,entity_addr_t> add;
+ map<string,entity_addrvec_t> addv;
+ list<string> rm;
+ list<feature_op_t> features;
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+ std::string val;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) {
+ print = true;
+ } else if (ceph_argparse_flag(args, i, "--create", (char*)NULL)) {
+ create = true;
+ } else if (ceph_argparse_flag(args, i, "--enable-all-features", (char*)NULL)) {
+ enable_all_features = true;
+ } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) {
+ clobber = true;
+ } else if (ceph_argparse_flag(args, i, "--generate", (char*)NULL)) {
+ generate = true;
+ } else if (ceph_argparse_flag(args, i, "--set-initial-members", (char*)NULL)) {
+ filter = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--set-min-mon-release",
+ (char*)NULL)) {
+ min_mon_release = atoi(val.c_str());
+ } else if (ceph_argparse_flag(args, i, "--add", (char*)NULL)) {
+ string name = *i;
+ i = args.erase(i);
+ if (i == args.end())
+ helpful_exit();
+ entity_addr_t addr;
+ if (!addr.parse(*i)) {
+ cerr << me << ": invalid ip:port '" << *i << "'" << std::endl;
+ return -1;
+ }
+ add[name] = addr;
+ modified = true;
+ i = args.erase(i);
+ } else if (ceph_argparse_flag(args, i, "--addv", (char*)NULL)) {
+ string name = *i;
+ i = args.erase(i);
+ if (i == args.end())
+ helpful_exit();
+ entity_addrvec_t addrs;
+ if (!addrs.parse(*i)) {
+ cerr << me << ": invalid ip:port '" << *i << "'" << std::endl;
+ return -1;
+ }
+ addv[name] = addrs;
+ modified = true;
+ i = args.erase(i);
+ } else if (ceph_argparse_witharg(args, i, &val, "--rm", (char*)NULL)) {
+ rm.push_back(val);
+ modified = true;
+ } else if (ceph_argparse_flag(args, i, "--feature-list", (char*)NULL)) {
+ string format = *i;
+ if (format == "plain" || format == "parseable") {
+ i = args.erase(i);
+ } else {
+ format = "plain";
+ }
+
+ feature_op_t f(feature_op_t::op_t::OP_LIST,
+ feature_op_t::type_t::PLAIN);
+
+ if (format == "parseable") {
+ f.type = feature_op_t::type_t::PARSEABLE;
+ } else if (format != "plain") {
+ cerr << "invalid format type for list: '" << val << "'" << std::endl;
+ helpful_exit();
+ }
+
+ features.push_back(f);
+ show_features = true;
+ } else if (ceph_argparse_witharg(args, i, &val,
+ "--feature-set", (char*)NULL)) {
+ // parse value
+ feature_op_t f(feature_op_t::op_t::OP_SET);
+ if (!f.parse_value(val, &cerr)) {
+ helpful_exit();
+ }
+ features.push_back(f);
+
+ } else if (ceph_argparse_witharg(args, i, &val,
+ "--feature-unset", (char*)NULL)) {
+ // parse value
+ feature_op_t f(feature_op_t::op_t::OP_UNSET);
+ if (!f.parse_value(val, &cerr)) {
+ helpful_exit();
+ }
+ features.push_back(f);
+ } else if (ceph_argparse_flag(args, i, "--optional", (char*)NULL)) {
+ if (features.empty()) {
+ helpful_exit();
+ }
+ features.back().set_optional();
+ } else if (ceph_argparse_flag(args, i, "--persistent", (char*)NULL)) {
+ if (features.empty()) {
+ helpful_exit();
+ }
+ features.back().set_persistent();
+ } else {
+ ++i;
+ }
+ }
+ if (args.empty()) {
+ cerr << me << ": must specify monmap filename" << std::endl;
+ helpful_exit();
+ }
+ else if (args.size() > 1) {
+ cerr << me << ": too many arguments" << std::endl;
+ helpful_exit();
+ }
+ fn = args[0];
+
+ MonMap monmap;
+
+ cout << me << ": monmap file " << fn << std::endl;
+
+ int r = 0;
+ if (!(create && clobber)) {
+ try {
+ r = monmap.read(fn.c_str());
+ } catch (...) {
+ cerr << me << ": unable to read monmap file" << std::endl;
+ return -1;
+ }
+ }
+
+ if (!create && r < 0) {
+ cerr << me << ": couldn't open " << fn << ": " << cpp_strerror(r) << std::endl;
+ return -1;
+ }
+ else if (create && !clobber && r == 0) {
+ cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl;
+ return -1;
+ }
+
+ if (create) {
+ monmap.epoch = 0;
+ monmap.created = ceph_clock_now();
+ monmap.last_changed = monmap.created;
+ srand(getpid() + time(0));
+ if (g_conf().get_val<uuid_d>("fsid").is_zero()) {
+ monmap.generate_fsid();
+ cout << me << ": generated fsid " << monmap.fsid << std::endl;
+ }
+ modified = true;
+ }
+ if (enable_all_features) {
+ // populate persistent features, too
+ monmap.persistent_features = ceph::features::mon::get_persistent();
+ modified = true;
+ }
+
+ if (generate) {
+ int r = monmap.build_initial(g_ceph_context, true, cerr);
+ if (r < 0)
+ return r;
+ }
+
+ if (min_mon_release >= 0) {
+ monmap.min_mon_release = min_mon_release;
+ cout << "setting min_mon_release = " << min_mon_release << std::endl;
+ modified = true;
+ }
+
+ if (filter) {
+ // apply initial members
+ list<string> initial_members;
+ get_str_list(g_conf()->mon_initial_members, initial_members);
+ if (!initial_members.empty()) {
+ cout << "initial_members " << initial_members << ", filtering seed monmap" << std::endl;
+ set<entity_addrvec_t> removed;
+ monmap.set_initial_members(g_ceph_context, initial_members,
+ string(), entity_addrvec_t(),
+ &removed);
+ cout << "removed " << removed << std::endl;
+ }
+ modified = true;
+ }
+
+ if (!g_conf().get_val<uuid_d>("fsid").is_zero()) {
+ monmap.fsid = g_conf().get_val<uuid_d>("fsid");
+ cout << me << ": set fsid to " << monmap.fsid << std::endl;
+ modified = true;
+ }
+
+ for (auto& p : add) {
+ entity_addr_t addr = p.second;
+ entity_addrvec_t addrs;
+ if (monmap.contains(p.first)) {
+ cerr << me << ": map already contains mon." << p.first << std::endl;
+ helpful_exit();
+ }
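+    // no explicit port: on nautilus+ maps advertise both the msgr2 and legacy ports, otherwise just the legacy port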
+ if (addr.get_port() == 0) {
+ if (monmap.persistent_features.contains_all(
+ ceph::features::mon::FEATURE_NAUTILUS)) {
+ addr.set_type(entity_addr_t::TYPE_MSGR2);
+ addr.set_port(CEPH_MON_PORT_IANA);
+ addrs.v.push_back(addr);
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ addrs.v.push_back(addr);
+ } else {
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ addrs.v.push_back(addr);
+ }
+ } else if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addrs.v.push_back(addr);
+ } else {
+ if (monmap.persistent_features.contains_all(
+ ceph::features::mon::FEATURE_NAUTILUS)) {
+ addr.set_type(entity_addr_t::TYPE_MSGR2);
+ }
+ addrs.v.push_back(addr);
+ }
+ if (monmap.contains(addrs)) {
+ cerr << me << ": map already contains " << addrs << std::endl;
+ helpful_exit();
+ }
+ monmap.add(p.first, addrs);
+ }
+ for (auto& p : addv) {
+ if (monmap.contains(p.first)) {
+ cerr << me << ": map already contains mon." << p.first << std::endl;
+ helpful_exit();
+ }
+ if (monmap.contains(p.second)) {
+ cerr << me << ": map already contains " << p.second << std::endl;
+ helpful_exit();
+ }
+ monmap.add(p.first, p.second);
+ }
+ for (auto& p : rm) {
+ cout << me << ": removing " << p << std::endl;
+ if (!monmap.contains(p)) {
+ cerr << me << ": map does not contain " << p << std::endl;
+ helpful_exit();
+ }
+ monmap.remove(p);
+ }
+
+ if (handle_features(features, monmap)) {
+ modified = true;
+ }
+
+ if (!print && !modified && !show_features) {
+ cerr << "no action specified" << std::endl;
+ helpful_exit();
+ }
+
+ if (print)
+ monmap.print(cout);
+
+ if (modified) {
+ // write it out
+ cout << me << ": writing epoch " << monmap.epoch
+ << " to " << fn
+ << " (" << monmap.size() << " monitors)"
+ << std::endl;
+ int r = monmap.write(fn.c_str());
+ if (r < 0) {
+ cerr << "monmaptool: error writing to '" << fn << "': " << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ }
+
+
+ return 0;
+}
diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc
new file mode 100644
index 00000000..887086e5
--- /dev/null
+++ b/src/tools/osdmaptool.cc
@@ -0,0 +1,799 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+#include <sys/stat.h>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "mon/health_check.h"
+#include <time.h>
+#include <algorithm>
+
+#include "global/global_init.h"
+#include "osd/OSDMap.h"
+
+
+void usage()
+{
+ cout << " usage: [--print] <mapfilename>" << std::endl;
+ cout << " --create-from-conf creates an osd map with default configurations" << std::endl;
+ cout << " --createsimple <numosd> [--clobber] [--pg-bits <bitsperosd>] [--pgp-bits <bits>] creates a relatively generic OSD map with <numosd> devices" << std::endl;
+ cout << " --pgp-bits <bits> pgp_num map attribute will be shifted by <bits>" << std::endl;
+ cout << " --pg-bits <bits> pg_num map attribute will be shifted by <bits>" << std::endl;
+ cout << " --clobber allows osdmaptool to overwrite <mapfilename> if it already exists" << std::endl;
+ cout << " --export-crush <file> write osdmap's crush map to <file>" << std::endl;
+ cout << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl;
+ cout << " --health dump health checks" << std::endl;
+ cout << " --test-map-pgs [--pool <poolid>] [--pg_num <pg_num>] [--range-first <first> --range-last <last>] map all pgs" << std::endl;
+ cout << " --test-map-pgs-dump [--pool <poolid>] [--range-first <first> --range-last <last>] map all pgs" << std::endl;
+ cout << " --test-map-pgs-dump-all [--pool <poolid>] [--range-first <first> --range-last <last>] map all pgs to osds" << std::endl;
+ cout << " --mark-up-in mark osds up and in (but do not persist)" << std::endl;
+ cout << " --mark-out <osdid> mark an osd as out (but do not persist)" << std::endl;
+ cout << " --with-default-pool include default pool when creating map" << std::endl;
+ cout << " --clear-temp clear pg_temp and primary_temp" << std::endl;
+ cout << " --clean-temps clean pg_temps" << std::endl;
+ cout << " --test-random do random placements" << std::endl;
+ cout << " --test-map-pg <pgid> map a pgid to osds" << std::endl;
+ cout << " --test-map-object <objectname> [--pool <poolid>] map an object to osds"
+ << std::endl;
+ cout << " --upmap-cleanup <file> clean up pg_upmap[_items] entries, writing" << std::endl;
+ cout << " commands to <file> [default: - for stdout]" << std::endl;
+ cout << " --upmap <file> calculate pg upmap entries to balance pg layout" << std::endl;
+ cout << " writing commands to <file> [default: - for stdout]" << std::endl;
+ cout << " --upmap-max <max-count> set max upmap entries to calculate [default: 10]" << std::endl;
+ cout << " --upmap-deviation <max-deviation>" << std::endl;
+ cout << " max deviation from target [default: 5]" << std::endl;
+ cout << " --upmap-pool <poolname> restrict upmap balancing to 1 or more pools" << std::endl;
+ cout << " --upmap-save write modified OSDMap with upmap changes" << std::endl;
+ cout << " --upmap-active Act like an active balancer, keep applying changes until balanced" << std::endl;
+ cout << " --dump <format> displays the map in plain text when <format> is 'plain', 'json' if specified format is not supported" << std::endl;
+ cout << " --tree displays a tree of the map" << std::endl;
+ cout << " --test-crush [--range-first <first> --range-last <last>] map pgs to acting osds" << std::endl;
+ exit(1);
+}
+
+void print_inc_upmaps(const OSDMap::Incremental& pending_inc, int fd)
+{
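+  // emit the 'ceph osd ...pg-upmap...' commands equivalent to the pending incremental changes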
+ ostringstream ss;
+ for (auto& i : pending_inc.old_pg_upmap) {
+ ss << "ceph osd rm-pg-upmap " << i << std::endl;
+ }
+ for (auto& i : pending_inc.new_pg_upmap) {
+ ss << "ceph osd pg-upmap " << i.first;
+ for (auto osd : i.second) {
+ ss << " " << osd;
+ }
+ ss << std::endl;
+ }
+ for (auto& i : pending_inc.old_pg_upmap_items) {
+ ss << "ceph osd rm-pg-upmap-items " << i << std::endl;
+ }
+ for (auto& i : pending_inc.new_pg_upmap_items) {
+ ss << "ceph osd pg-upmap-items " << i.first;
+ for (auto p : i.second) {
+ ss << " " << p.first << " " << p.second;
+ }
+ ss << std::endl;
+ }
+ string s = ss.str();
+ int r = safe_write(fd, s.c_str(), s.size());
+ if (r < 0) {
+ cerr << "error writing output: " << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+ common_init_finish(g_ceph_context);
+
+ const char *me = argv[0];
+
+ std::string fn;
+ bool print = false;
+ boost::scoped_ptr<Formatter> print_formatter;
+ bool tree = false;
+ boost::scoped_ptr<Formatter> tree_formatter;
+ bool createsimple = false;
+ bool createpool = false;
+ bool create_from_conf = false;
+ int num_osd = 0;
+ int pg_bits = 6;
+ int pgp_bits = 6;
+ bool clobber = false;
+ bool modified = false;
+ std::string export_crush, import_crush, test_map_pg, test_map_object;
+ bool test_crush = false;
+ int range_first = -1;
+ int range_last = -1;
+ int pool = -1;
+ bool mark_up_in = false;
+ int marked_out = -1;
+ bool clear_temp = false;
+ bool clean_temps = false;
+ bool test_map_pgs = false;
+ bool test_map_pgs_dump = false;
+ bool test_random = false;
+ bool upmap_cleanup = false;
+ bool upmap = false;
+ bool upmap_save = false;
+ bool health = false;
+ std::string upmap_file = "-";
+ int upmap_max = 10;
+ int upmap_deviation = 5;
+ bool upmap_active = false;
+ std::set<std::string> upmap_pools;
+ int64_t pg_num = -1;
+ bool test_map_pgs_dump_all = false;
+
+ std::string val;
+ std::ostringstream err;
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_flag(args, i, "-p", "--print", (char*)NULL)) {
+ print = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--dump", (char*)NULL)) {
+ print = true;
+ if (!val.empty() && val != "plain") {
+ print_formatter.reset(Formatter::create(val, "", "json"));
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--tree", (char*)NULL)) {
+ tree = true;
+ if (!val.empty() && val != "plain") {
+ tree_formatter.reset(Formatter::create(val, "", "json"));
+ }
+ } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--osd-pg-bits", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--osd-pgp-bits", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap-cleanup", (char*)NULL)) {
+ upmap_cleanup = true;
+ } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap-save", (char*)NULL)) {
+ upmap_save = true;
+ } else if (ceph_argparse_witharg(args, i, &upmap_file, "--upmap", (char*)NULL)) {
+ upmap_cleanup = true;
+ upmap = true;
+ } else if (ceph_argparse_witharg(args, i, &upmap_max, err, "--upmap-max", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &upmap_deviation, err, "--upmap-deviation", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &val, "--upmap-pool", (char*)NULL)) {
+ upmap_pools.insert(val);
+ } else if (ceph_argparse_witharg(args, i, &num_osd, err, "--createsimple", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ createsimple = true;
+ } else if (ceph_argparse_flag(args, i, "--upmap-active", (char*)NULL)) {
+ upmap_active = true;
+ } else if (ceph_argparse_flag(args, i, "--health", (char*)NULL)) {
+ health = true;
+ } else if (ceph_argparse_flag(args, i, "--with-default-pool", (char*)NULL)) {
+ createpool = true;
+ } else if (ceph_argparse_flag(args, i, "--create-from-conf", (char*)NULL)) {
+ create_from_conf = true;
+ } else if (ceph_argparse_flag(args, i, "--mark-up-in", (char*)NULL)) {
+ mark_up_in = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--mark-out", (char*)NULL)) {
+ marked_out = std::stoi(val);
+ } else if (ceph_argparse_flag(args, i, "--clear-temp", (char*)NULL)) {
+ clear_temp = true;
+ } else if (ceph_argparse_flag(args, i, "--clean-temps", (char*)NULL)) {
+ clean_temps = true;
+ } else if (ceph_argparse_flag(args, i, "--test-map-pgs", (char*)NULL)) {
+ test_map_pgs = true;
+ } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump", (char*)NULL)) {
+ test_map_pgs_dump = true;
+ } else if (ceph_argparse_flag(args, i, "--test-map-pgs-dump-all", (char*)NULL)) {
+ test_map_pgs_dump_all = true;
+ } else if (ceph_argparse_flag(args, i, "--test-random", (char*)NULL)) {
+ test_random = true;
+ } else if (ceph_argparse_flag(args, i, "--clobber", (char*)NULL)) {
+ clobber = true;
+ } else if (ceph_argparse_witharg(args, i, &pg_bits, err, "--pg_bits", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &pgp_bits, err, "--pgp_bits", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--export_crush", (char*)NULL)) {
+ export_crush = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--import_crush", (char*)NULL)) {
+ import_crush = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--test_map_pg", (char*)NULL)) {
+ test_map_pg = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--test_map_object", (char*)NULL)) {
+ test_map_object = val;
+ } else if (ceph_argparse_flag(args, i, "--test_crush", (char*)NULL)) {
+ test_crush = true;
+ } else if (ceph_argparse_witharg(args, i, &val, err, "--pg_num", (char*)NULL)) {
+ string interr;
+ pg_num = strict_strtoll(val.c_str(), 10, &interr);
+ if (interr.length() > 0) {
+ cerr << "error parsing integer value " << interr << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else if (ceph_argparse_witharg(args, i, &range_first, err, "--range_first", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &range_last, err, "--range_last", (char*)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &pool, err, "--pool", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ ++i;
+ }
+ }
+ if (args.empty()) {
+ cerr << me << ": must specify osdmap filename" << std::endl;
+ usage();
+ }
+ else if (args.size() > 1) {
+ cerr << me << ": too many arguments" << std::endl;
+ usage();
+ }
+ if (upmap_deviation < 1) {
+ cerr << me << ": upmap-deviation must be >= 1" << std::endl;
+ usage();
+ }
+ fn = args[0];
+
+ if (range_first >= 0 && range_last >= 0) {
+ set<OSDMap*> maps;
+ OSDMap *prev = NULL;
+ for (int i=range_first; i <= range_last; i++) {
+ ostringstream f;
+ f << fn << "/" << i;
+ bufferlist bl;
+ string error, s = f.str();
+ int r = bl.read_file(s.c_str(), &error);
+ if (r < 0) {
+ cerr << "unable to read " << s << ": " << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
+ cout << s << " got " << bl.length() << " bytes" << std::endl;
+ OSDMap *o = new OSDMap;
+ o->decode(bl);
+ maps.insert(o);
+ if (prev)
+ OSDMap::dedup(prev, o);
+ prev = o;
+ }
+ exit(0);
+ }
+
+ OSDMap osdmap;
+ bufferlist bl;
+
+ cerr << me << ": osdmap file '" << fn << "'" << std::endl;
+
+ int r = 0;
+ struct stat st;
+ if (!createsimple && !create_from_conf && !clobber) {
+ std::string error;
+ r = bl.read_file(fn.c_str(), &error);
+ if (r == 0) {
+ try {
+ osdmap.decode(bl);
+ }
+ catch (const buffer::error &e) {
+ cerr << me << ": error decoding osdmap '" << fn << "'" << std::endl;
+ return -1;
+ }
+ }
+ else {
+ cerr << me << ": couldn't open " << fn << ": " << error << std::endl;
+ return -1;
+ }
+ }
+ else if ((createsimple || create_from_conf) && !clobber && ::stat(fn.c_str(), &st) == 0) {
+ cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl;
+ return -1;
+ }
+
+ if (createsimple || create_from_conf) {
+ if (createsimple) {
+ if (num_osd < 1) {
+ cerr << me << ": osd count must be > 0" << std::endl;
+ exit(1);
+ }
+ } else {
+ num_osd = -1;
+ }
+ uuid_d fsid;
+ if (createpool) {
+ osdmap.build_simple_with_pool(
+ g_ceph_context, 0, fsid, num_osd, pg_bits, pgp_bits);
+ } else {
+ osdmap.build_simple(g_ceph_context, 0, fsid, num_osd);
+ }
+ modified = true;
+ }
+
+ if (mark_up_in) {
+ cout << "marking all OSDs up and in" << std::endl;
+ int n = osdmap.get_max_osd();
+ for (int i=0; i<n; i++) {
+ osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP);
+ osdmap.set_weight(i, CEPH_OSD_IN);
+ osdmap.crush->adjust_item_weightf(g_ceph_context, i, 1.0);
+ }
+ }
+
+ if (marked_out >=0 && marked_out < osdmap.get_max_osd()) {
+ cout << "marking OSD@" << marked_out << " as out" << std::endl;
+ int id = marked_out;
+ osdmap.set_state(id, osdmap.get_state(id) | CEPH_OSD_UP);
+ osdmap.set_weight(id, CEPH_OSD_OUT);
+ osdmap.crush->adjust_item_weightf(g_ceph_context, id, 1.0);
+ }
+
+ if (clear_temp) {
+ cout << "clearing pg/primary temp" << std::endl;
+ osdmap.clear_temp();
+ }
+ if (clean_temps) {
+ cout << "cleaning pg temps" << std::endl;
+ OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
+ OSDMap tmpmap;
+ tmpmap.deepish_copy_from(osdmap);
+ tmpmap.apply_incremental(pending_inc);
+ OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
+ }
+ int upmap_fd = STDOUT_FILENO;
+ if (upmap || upmap_cleanup) {
+ if (upmap_file != "-") {
+ upmap_fd = ::open(upmap_file.c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0644);
+ if (upmap_fd < 0) {
+ cerr << "error opening " << upmap_file << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(1);
+ }
+ cout << "writing upmap command output to: " << upmap_file << std::endl;
+ }
+ }
+ if (upmap_cleanup) {
+ cout << "checking for upmap cleanups" << std::endl;
+ OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
+ pending_inc.fsid = osdmap.get_fsid();
+ int r = osdmap.clean_pg_upmaps(g_ceph_context, &pending_inc);
+ if (r > 0) {
+ print_inc_upmaps(pending_inc, upmap_fd);
+ r = osdmap.apply_incremental(pending_inc);
+ ceph_assert(r == 0);
+ }
+ }
+ if (upmap) {
+ cout << "upmap, max-count " << upmap_max
+ << ", max deviation " << upmap_deviation
+ << std::endl;
+ vector<int64_t> pools;
+ set<int64_t> upmap_pool_nums;
+ for (auto& s : upmap_pools) {
+ int64_t p = osdmap.lookup_pg_pool_name(s);
+ if (p < 0) {
+ cerr << " pool " << s << " does not exist" << std::endl;
+ exit(1);
+ }
+ pools.push_back(p);
+ upmap_pool_nums.insert(p);
+ }
+ if (!pools.empty()) {
+ cout << " limiting to pools " << upmap_pools << " (" << pools << ")"
+ << std::endl;
+ } else {
+ mempool::osdmap::map<int64_t,pg_pool_t> opools = osdmap.get_pools();
+ for (auto& i : opools) {
+ pools.push_back(i.first);
+ }
+ }
+ if (pools.empty()) {
+ cout << "No pools available" << std::endl;
+ goto skip_upmap;
+ }
+ int rounds = 0;
+ struct timespec round_start;
+ int r = clock_gettime(CLOCK_MONOTONIC, &round_start);
+ assert(r == 0);
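+    // each round shuffles the pool order and asks calc_pg_upmaps for up to upmap_max changes;
+    // with --upmap-active we keep iterating until no further optimization is found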
+ do {
+ std::random_device rd;
+ std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()});
+ cout << "pools ";
+ for (auto& i: pools)
+ cout << osdmap.get_pool_name(i) << " ";
+ cout << std::endl;
+ OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
+ pending_inc.fsid = osdmap.get_fsid();
+ int total_did = 0;
+ int left = upmap_max;
+ struct timespec begin, end;
+ r = clock_gettime(CLOCK_MONOTONIC, &begin);
+ assert(r == 0);
+ for (auto& i: pools) {
+ set<int64_t> one_pool;
+ one_pool.insert(i);
+ int did = osdmap.calc_pg_upmaps(
+ g_ceph_context, upmap_deviation,
+ left, one_pool,
+ &pending_inc);
+ total_did += did;
+ left -= did;
+ if (left <= 0)
+ break;
+ }
+ r = clock_gettime(CLOCK_MONOTONIC, &end);
+ assert(r == 0);
+ cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl;
+ float elapsed_time = (end.tv_sec - begin.tv_sec) + 1.0e-9*(end.tv_nsec - begin.tv_nsec);
+ if (upmap_active)
+ cout << "Time elapsed " << elapsed_time << " secs" << std::endl;
+ if (total_did > 0) {
+ print_inc_upmaps(pending_inc, upmap_fd);
+ if (upmap_save || upmap_active) {
+ int r = osdmap.apply_incremental(pending_inc);
+ ceph_assert(r == 0);
+ if (upmap_save)
+ modified = true;
+ }
+ } else {
+ cout << "Unable to find further optimization, "
+ << "or distribution is already perfect"
+ << std::endl;
+ if (upmap_active) {
+ map<int,set<pg_t>> pgs_by_osd;
+ for (auto& i : osdmap.get_pools()) {
+ if (!upmap_pool_nums.empty() && !upmap_pool_nums.count(i.first))
+ continue;
+ for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+ pg_t pg(ps, i.first);
+ vector<int> up;
+ osdmap.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
+ //ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
+ for (auto osd : up) {
+ if (osd != CRUSH_ITEM_NONE)
+ pgs_by_osd[osd].insert(pg);
+ }
+ }
+ }
+ for (auto& i : pgs_by_osd)
+ cout << "osd." << i.first << " pgs " << i.second.size() << std::endl;
+ float elapsed_time = (end.tv_sec - round_start.tv_sec) + 1.0e-9*(end.tv_nsec - round_start.tv_nsec);
+ cout << "Total time elapsed " << elapsed_time << " secs, " << rounds << " rounds" << std::endl;
+ }
+ break;
+ }
+ ++rounds;
+ } while(upmap_active);
+ }
+skip_upmap:
+ if (upmap_file != "-") {
+ ::close(upmap_fd);
+ }
+
+ if (!import_crush.empty()) {
+ bufferlist cbl;
+ std::string error;
+ r = cbl.read_file(import_crush.c_str(), &error);
+ if (r) {
+ cerr << me << ": error reading crush map from " << import_crush
+ << ": " << error << std::endl;
+ exit(1);
+ }
+
+ // validate
+ CrushWrapper cw;
+ auto p = cbl.cbegin();
+ cw.decode(p);
+
+ if (cw.get_max_devices() > osdmap.get_max_osd()) {
+ cerr << me << ": crushmap max_devices " << cw.get_max_devices()
+ << " > osdmap max_osd " << osdmap.get_max_osd() << std::endl;
+ exit(1);
+ }
+
+ // apply
+ OSDMap::Incremental inc;
+ inc.fsid = osdmap.get_fsid();
+ inc.epoch = osdmap.get_epoch()+1;
+ inc.crush = cbl;
+ osdmap.apply_incremental(inc);
+ cout << me << ": imported " << cbl.length() << " byte crush map from " << import_crush << std::endl;
+ modified = true;
+ }
+
+ if (!export_crush.empty()) {
+ bufferlist cbl;
+ osdmap.crush->encode(cbl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ r = cbl.write_file(export_crush.c_str());
+ if (r < 0) {
+ cerr << me << ": error writing crush map to " << import_crush << std::endl;
+ exit(1);
+ }
+ cout << me << ": exported crush map to " << export_crush << std::endl;
+ }
+
+ if (!test_map_object.empty()) {
+ object_t oid(test_map_object);
+ if (pool == -1) {
+ cout << me << ": assuming pool 1 (use --pool to override)" << std::endl;
+ pool = 1;
+ }
+ if (!osdmap.have_pg_pool(pool)) {
+ cerr << "There is no pool " << pool << std::endl;
+ exit(1);
+ }
+ object_locator_t loc(pool);
+ pg_t raw_pgid = osdmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = osdmap.raw_pg_to_pg(raw_pgid);
+
+ vector<int> acting;
+ osdmap.pg_to_acting_osds(pgid, acting);
+ cout << " object '" << oid
+ << "' -> " << pgid
+ << " -> " << acting
+ << std::endl;
+ }
+ if (!test_map_pg.empty()) {
+ pg_t pgid;
+ if (!pgid.parse(test_map_pg.c_str())) {
+      cerr << me << ": failed to parse pg '" << test_map_pg << "'" << std::endl;
+ usage();
+ }
+ cout << " parsed '" << test_map_pg << "' -> " << pgid << std::endl;
+
+ vector<int> raw, up, acting;
+ int raw_primary, up_primary, acting_primary;
+ osdmap.pg_to_raw_osds(pgid, &raw, &raw_primary);
+ osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ cout << pgid << " raw (" << raw << ", p" << raw_primary
+ << ") up (" << up << ", p" << up_primary
+ << ") acting (" << acting << ", p" << acting_primary << ")"
+ << std::endl;
+ }
+ if (test_map_pgs || test_map_pgs_dump || test_map_pgs_dump_all) {
+ if (pool != -1 && !osdmap.have_pg_pool(pool)) {
+ cerr << "There is no pool " << pool << std::endl;
+ exit(1);
+ }
+ int n = osdmap.get_max_osd();
+ vector<int> count(n, 0);
+ vector<int> first_count(n, 0);
+ vector<int> primary_count(n, 0);
+ vector<int> size(30, 0);
+ int max_size = 0;
+ if (test_random)
+ srand(getpid());
+ auto& pools = osdmap.get_pools();
+ for (auto p = pools.begin(); p != pools.end(); ++p) {
+ if (pool != -1 && p->first != pool)
+ continue;
+ if (pg_num > 0)
+ p->second.set_pg_num(pg_num);
+
+ cout << "pool " << p->first
+ << " pg_num " << p->second.get_pg_num() << std::endl;
+ for (unsigned i = 0; i < p->second.get_pg_num(); ++i) {
+ pg_t pgid = pg_t(i, p->first);
+
+ vector<int> osds, raw, up, acting;
+ int primary, calced_primary, up_primary, acting_primary;
+ if (test_random) {
+ osds.resize(p->second.size);
+ for (unsigned i=0; i<osds.size(); ++i) {
+ osds[i] = rand() % osdmap.get_max_osd();
+ }
+ primary = osds[0];
+ } else if (test_map_pgs_dump_all) {
+ osdmap.pg_to_raw_osds(pgid, &raw, &calced_primary);
+ osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ osds = acting;
+ primary = acting_primary;
+ } else {
+ osdmap.pg_to_acting_osds(pgid, &osds, &primary);
+ }
+ size[osds.size()]++;
+ if ((unsigned)max_size < osds.size())
+ max_size = osds.size();
+
+ if (test_map_pgs_dump) {
+ cout << pgid << "\t" << osds << "\t" << primary << std::endl;
+ } else if (test_map_pgs_dump_all) {
+ cout << pgid << " raw (" << raw << ", p" << calced_primary
+ << ") up (" << up << ", p" << up_primary
+ << ") acting (" << acting << ", p" << acting_primary << ")"
+ << std::endl;
+ }
+
+ for (unsigned i=0; i<osds.size(); i++) {
+ //cout << " rep " << i << " on " << osds[i] << std::endl;
+ count[osds[i]]++;
+ }
+ if (osds.size())
+ first_count[osds[0]]++;
+ if (primary >= 0)
+ primary_count[primary]++;
+ }
+ }
+
+ uint64_t total = 0;
+ int in = 0;
+ int min_osd = -1;
+ int max_osd = -1;
+ cout << "#osd\tcount\tfirst\tprimary\tc wt\twt\n";
+ for (int i=0; i<n; i++) {
+ if (!osdmap.is_in(i))
+ continue;
+ if (osdmap.crush->get_item_weight(i) <= 0)
+ continue;
+ in++;
+ cout << "osd." << i
+ << "\t" << count[i]
+ << "\t" << first_count[i]
+ << "\t" << primary_count[i]
+ << "\t" << osdmap.crush->get_item_weightf(i)
+ << "\t" << osdmap.get_weightf(i)
+ << std::endl;
+ total += count[i];
+ if (count[i] &&
+ (min_osd < 0 ||
+ count[i] < count[min_osd]))
+ min_osd = i;
+ if (count[i] &&
+ (max_osd < 0 ||
+ count[i] > count[max_osd]))
+ max_osd = i;
+
+ }
+ uint64_t avg = in ? (total / in) : 0;
+ double dev = 0;
+ for (int i=0; i<n; i++) {
+ if (!osdmap.is_in(i))
+ continue;
+ if (osdmap.crush->get_item_weight(i) <= 0)
+ continue;
+ dev += (avg - count[i]) * (avg - count[i]);
+ }
+ dev /= in;
+ dev = sqrt(dev);
+
+ //double edev = sqrt(pgavg) * (double)avg / pgavg;
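+    // Expected stddev if placement were uniformly random: each of the 'total'
+    // mappings lands on a given OSD with probability 1/in, so the per-OSD
+    // count is binomial with stddev sqrt(total * (1/in) * (1 - 1/in)).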
+ double edev = sqrt((double)total / (double)in * (1.0 - (1.0 / (double)in)));
+ cout << " in " << in << std::endl;
+ cout << " avg " << avg
+ << " stddev " << dev
+ << " (" << (dev/avg) << "x)"
+ << " (expected " << edev << " " << (edev/avg) << "x))"
+ << std::endl;
+
+ if (min_osd >= 0)
+ cout << " min osd." << min_osd << " " << count[min_osd] << std::endl;
+ if (max_osd >= 0)
+ cout << " max osd." << max_osd << " " << count[max_osd] << std::endl;
+
+ for (int i=0; i<=max_size; i++) {
+ if (size[i])
+ cout << "size " << i << "\t" << size[i] << std::endl;
+ }
+ }
+ if (test_crush) {
+ int pass = 0;
+ while (1) {
+ cout << "pass " << ++pass << std::endl;
+
+ ceph::unordered_map<pg_t,vector<int> > m;
+ for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
+ p != osdmap.get_pools().end();
+ ++p) {
+ const pg_pool_t *pool = osdmap.get_pg_pool(p->first);
+ for (ps_t ps = 0; ps < pool->get_pg_num(); ps++) {
+ pg_t pgid(ps, p->first);
+ for (int i=0; i<100; i++) {
+ cout << pgid << " attempt " << i << std::endl;
+
+ vector<int> r;
+ osdmap.pg_to_acting_osds(pgid, r);
+ //cout << pgid << " " << r << std::endl;
+ if (m.count(pgid)) {
+ if (m[pgid] != r) {
+ cout << pgid << " had " << m[pgid] << " now " << r << std::endl;
+ ceph_abort();
+ }
+ } else
+ m[pgid] = r;
+ }
+ }
+ }
+ }
+ }
+
+ if (!print && !health && !tree && !modified &&
+ export_crush.empty() && import_crush.empty() &&
+ test_map_pg.empty() && test_map_object.empty() &&
+ !test_map_pgs && !test_map_pgs_dump && !test_map_pgs_dump_all &&
+ !upmap && !upmap_cleanup) {
+ cerr << me << ": no action specified?" << std::endl;
+ usage();
+ }
+
+ if (modified)
+ osdmap.inc_epoch();
+
+ if (health) {
+ health_check_map_t checks;
+ osdmap.check_health(cct.get(), &checks);
+ JSONFormatter jf(true);
+ jf.dump_object("checks", checks);
+ jf.flush(cout);
+ }
+ if (print) {
+ if (print_formatter) {
+ print_formatter->open_object_section("osdmap");
+ osdmap.dump(print_formatter.get());
+ print_formatter->close_section();
+ print_formatter->flush(cout);
+ } else {
+ osdmap.print(cout);
+ }
+ }
+
+ if (tree) {
+ if (tree_formatter) {
+ tree_formatter->open_object_section("tree");
+ osdmap.print_tree(tree_formatter.get(), NULL);
+ tree_formatter->close_section();
+ tree_formatter->flush(cout);
+ cout << std::endl;
+ } else {
+ osdmap.print_tree(NULL, &cout);
+ }
+ }
+ if (modified) {
+ bl.clear();
+ osdmap.encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT | CEPH_FEATURE_RESERVED);
+
+ // write it out
+ cout << me << ": writing epoch " << osdmap.get_epoch()
+ << " to " << fn
+ << std::endl;
+ int r = bl.write_file(fn.c_str());
+ if (r) {
+ cerr << "osdmaptool: error writing to '" << fn << "': "
+ << cpp_strerror(r) << std::endl;
+ return 1;
+ }
+ }
+
+
+ return 0;
+}
diff --git a/src/tools/psim.cc b/src/tools/psim.cc
new file mode 100644
index 00000000..90e6fb95
--- /dev/null
+++ b/src/tools/psim.cc
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd/OSDMap.h"
+#include "include/buffer.h"
+
+int main(int argc, char **argv)
+{
+ /*
+ * you need to create a suitable osdmap first. e.g., for 40 osds,
+ * $ ./osdmaptool --createsimple 40 --clobber .ceph_osdmap
+ */
+ bufferlist bl;
+ std::string error;
+ if (bl.read_file(".ceph_osdmap", &error)) {
+ cout << argv[0] << ": error reading .ceph_osdmap: " << error << std::endl;
+ return 1;
+ }
+ OSDMap osdmap;
+
+ try {
+ osdmap.decode(bl);
+ } catch (ceph::buffer::end_of_buffer &eob) {
+ cout << "Exception (end_of_buffer) in decode(), exit." << std::endl;
+ exit(1);
+ }
+
+ //osdmap.set_primary_affinity(0, 0x8000);
+ //osdmap.set_primary_affinity(3, 0);
+
+ int n = osdmap.get_max_osd();
+ int count[n];
+ int first_count[n];
+ int primary_count[n];
+ int size[4];
+
+ memset(count, 0, sizeof(count));
+ memset(first_count, 0, sizeof(first_count));
+ memset(primary_count, 0, sizeof(primary_count));
+ memset(size, 0, sizeof(size));
+
+ for (int i=0; i<n; i++) {
+ osdmap.set_state(i, osdmap.get_state(i) | CEPH_OSD_UP);
+ //if (i<12)
+ osdmap.set_weight(i, CEPH_OSD_IN);
+ }
+
+ //pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(0);
+ //p->type = pg_pool_t::TYPE_ERASURE;
+
+  for (int ns = 0; ns < 10; ns++) { // namespaces (avoid shadowing the osd count 'n' above)
+    char nspace[20];
+    snprintf(nspace, sizeof(nspace), "n%d", ns);
+ for (int f = 0; f < 5000; f++) { // files
+ for (int b = 0; b < 4; b++) { // blocks
+ char foo[20];
+ snprintf(foo, sizeof(foo), "%d.%d", f, b);
+ object_t oid(foo);
+ ceph_object_layout l = osdmap.make_object_layout(oid, 0, nspace);
+ vector<int> osds;
+ pg_t pgid = pg_t(l.ol_pgid);
+ //pgid.u.ps = f * 4 + b;
+ int primary;
+ osdmap.pg_to_acting_osds(pgid, &osds, &primary);
+ size[osds.size()]++;
+#if 0
+ if (0) {
+ hash<object_t> H;
+ int x = H(oid);
+ x = ceph_stable_mod(x, 1023, 1023);
+ int s = crush_hash32(x) % 15;
+ //cout << "ceph_psim: x = " << x << " s = " << s << std::endl;
+ //osds[0] = s;
+ }
+#endif
+ //osds[0] = crush_hash32(f) % n;
+ //cout << "oid " << oid << " pgid " << pgid << " on " << osds << std::endl;
+ for (unsigned i=0; i<osds.size(); i++) {
+ //cout << " rep " << i << " on " << osds[i] << std::endl;
+ count[osds[i]]++;
+ }
+ if (osds.size())
+ first_count[osds[0]]++;
+ if (primary >= 0)
+ primary_count[primary]++;
+ }
+ }
+ }
+
+ uint64_t avg = 0;
+ for (int i=0; i<n; i++) {
+ cout << "osd." << i << "\t" << count[i]
+ << "\t" << first_count[i]
+ << "\t" << primary_count[i]
+ << std::endl;
+ avg += count[i];
+ }
+ avg /= n;
+ double dev = 0;
+ for (int i=0; i<n; i++)
+ dev += (avg - count[i]) * (avg - count[i]);
+ dev /= n;
+ dev = sqrt(dev);
+
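+  // pgavg is the average number of PGs per OSD; edev is the expected object
+  // count deviation: ~sqrt(pgavg) PGs of deviation times avg/pgavg objects
+  // per PG.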
+ double pgavg = (double)osdmap.get_pg_pool(0)->get_pg_num() / (double)n;
+ double edev = sqrt(pgavg) * (double)avg / pgavg;
+ cout << " avg " << avg
+ << " stddev " << dev
+ << " (expected " << edev << ")"
+ << " (indep object placement would be " << sqrt(avg) << ")" << std::endl;
+
+ for (int i=0; i<4; i++) {
+ cout << "size" << i << "\t" << size[i] << std::endl;
+ }
+
+ return 0;
+}
diff --git a/src/tools/rados/PoolDump.cc b/src/tools/rados/PoolDump.cc
new file mode 100644
index 00000000..9bfafa10
--- /dev/null
+++ b/src/tools/rados/PoolDump.cc
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+
+#include "PoolDump.h"
+
+using namespace librados;
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rados
+
+/**
+ * Export RADOS objects from a live cluster
+ * to a serialized format via a file descriptor.
+ *
+ * @returns 0 on success, else error code
+ */
+int PoolDump::dump(IoCtx *io_ctx)
+{
+ ceph_assert(io_ctx != NULL);
+
+ int r = 0;
+ write_super();
+
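+  // The export stream is framed as: super header, TYPE_POOL_BEGIN, then per
+  // object an OBJECT_BEGIN section followed by DATA/ATTRS/OMAP_HDR/OMAP
+  // sections and OBJECT_END, and finally TYPE_POOL_END.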
+ r = write_simple(TYPE_POOL_BEGIN, file_fd);
+ if (r != 0) {
+ return r;
+ }
+
+ io_ctx->set_namespace(all_nspaces);
+ librados::NObjectIterator i = io_ctx->nobjects_begin();
+
+ librados::NObjectIterator i_end = io_ctx->nobjects_end();
+ for (; i != i_end; ++i) {
+ const std::string oid = i->get_oid();
+ dout(10) << "OID '" << oid << "'" << dendl;
+
+ // Compose OBJECT_BEGIN
+ // ====================
+ object_begin obj_begin;
+ obj_begin.hoid.hobj.oid = i->get_oid();
+ obj_begin.hoid.hobj.nspace = i->get_nspace();
+ obj_begin.hoid.hobj.set_key(i->get_locator());
+
+ // Only output head, RadosImport only wants that
+ obj_begin.hoid.hobj.snap = CEPH_NOSNAP;
+
+ // Skip setting object_begin.oi, RadosImport doesn't care
+
+ r = write_section(TYPE_OBJECT_BEGIN, obj_begin, file_fd);
+ if (r != 0) {
+ return r;
+ }
+
+ // Compose TYPE_DATA chunks
+ // ========================
+ const uint32_t op_size = 4096 * 1024;
+ uint64_t offset = 0;
+ io_ctx->set_namespace(i->get_nspace());
+ io_ctx->locator_set_key(i->get_locator());
+ while (true) {
+ bufferlist outdata;
+ r = io_ctx->read(oid, outdata, op_size, offset);
+ if (r <= 0) {
+ // Error or no data
+ break;
+ }
+
+ r = write_section(TYPE_DATA,
+ data_section(offset, outdata.length(), outdata), file_fd);
+ if (r != 0) {
+ // Output stream error
+ return r;
+ }
+
+ if (outdata.length() < op_size) {
+ // No more data
+ break;
+ }
+ offset += outdata.length();
+ }
+
+ // Compose TYPE_ATTRS chunk
+ // ========================
+ std::map<std::string, bufferlist> raw_xattrs;
+ std::map<std::string, bufferlist> xattrs;
+ r = io_ctx->getxattrs(oid, raw_xattrs);
+ if (r < 0) {
+ cerr << "error getting xattr set " << oid << ": " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ // Prepend "_" to mimic how user keys are represented in a pg export
+ for (std::map<std::string, bufferlist>::iterator i = raw_xattrs.begin();
+ i != raw_xattrs.end(); ++i) {
+ std::pair< std::string, bufferlist> item(std::string("_") + std::string(i->first.c_str()), i->second);
+ xattrs.insert(item);
+ }
+ r = write_section(TYPE_ATTRS, attr_section(xattrs), file_fd);
+ if (r != 0) {
+ return r;
+ }
+
+ // Compose TYPE_OMAP_HDR section
+ // =============================
+ bufferlist omap_header;
+ r = io_ctx->omap_get_header(oid, &omap_header);
+ if (r < 0) {
+ cerr << "error getting omap header " << oid
+ << ": " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ r = write_section(TYPE_OMAP_HDR, omap_hdr_section(omap_header), file_fd);
+ if (r != 0) {
+ return r;
+ }
+
+ // Compose TYPE_OMAP
+ int MAX_READ = 512;
+ string last_read = "";
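+    // Page through the omap in batches of MAX_READ keys, resuming each batch
+    // after the last key seen, until a short batch signals the end.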
+ do {
+ map<string, bufferlist> values;
+ r = io_ctx->omap_get_vals(oid, last_read, MAX_READ, &values);
+ if (r < 0) {
+ cerr << "error getting omap keys " << oid << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (values.size()) {
+ last_read = values.rbegin()->first;
+ } else {
+ break;
+ }
+
+ r = write_section(TYPE_OMAP, omap_section(values), file_fd);
+ if (r != 0) {
+ return r;
+ }
+ r = values.size();
+ } while (r == MAX_READ);
+
+ // Close object
+ // =============
+ r = write_simple(TYPE_OBJECT_END, file_fd);
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ r = write_simple(TYPE_POOL_END, file_fd);
+#if defined(__linux__)
+ if (file_fd != STDOUT_FILENO)
+ posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ return r;
+}
diff --git a/src/tools/rados/PoolDump.h b/src/tools/rados/PoolDump.h
new file mode 100644
index 00000000..33abd886
--- /dev/null
+++ b/src/tools/rados/PoolDump.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef POOL_DUMP_H_
+#define POOL_DUMP_H_
+
+#include "include/rados/librados_fwd.hpp"
+#include "tools/RadosDump.h"
+
+class PoolDump : public RadosDump
+{
+ public:
+ explicit PoolDump(int file_fd_) : RadosDump(file_fd_, false) {}
+ int dump(librados::IoCtx *io_ctx);
+};
+
+#endif // POOL_DUMP_H_
diff --git a/src/tools/rados/RadosImport.cc b/src/tools/rados/RadosImport.cc
new file mode 100644
index 00000000..0a901b70
--- /dev/null
+++ b/src/tools/rados/RadosImport.cc
@@ -0,0 +1,399 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/errno.h"
+
+#include "osd/PGLog.h"
+#include "RadosImport.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rados
+
+int RadosImport::import(std::string pool, bool no_overwrite)
+{
+ librados::IoCtx ioctx;
+ librados::Rados cluster;
+
+ char *id = getenv("CEPH_CLIENT_ID");
+ if (id) cerr << "Client id is: " << id << std::endl;
+ int ret = cluster.init(id);
+ if (ret) {
+ cerr << "Error " << ret << " in cluster.init" << std::endl;
+ return ret;
+ }
+ ret = cluster.conf_read_file(NULL);
+ if (ret) {
+ cerr << "Error " << ret << " in cluster.conf_read_file" << std::endl;
+ return ret;
+ }
+ ret = cluster.conf_parse_env(NULL);
+ if (ret) {
+    cerr << "Error " << ret << " in cluster.conf_parse_env" << std::endl;
+ return ret;
+ }
+ ret = cluster.connect();
+ if (ret) {
+ cerr << "Error " << ret << " in cluster.connect" << std::endl;
+ return ret;
+ }
+
+ ret = cluster.ioctx_create(pool.c_str(), ioctx);
+ if (ret < 0) {
+ cerr << "ioctx_create " << pool << " failed with " << ret << std::endl;
+ return ret;
+ }
+
+ return import(ioctx, no_overwrite);
+}
+
+int RadosImport::import(librados::IoCtx &io_ctx, bool no_overwrite)
+{
+ bufferlist ebl;
+ pg_info_t info;
+ PGLog::IndexedLog log;
+
+ int ret = read_super();
+ if (ret)
+ return ret;
+
+ if (sh.magic != super_header::super_magic) {
+ cerr << "Invalid magic number: 0x"
+ << std::hex << sh.magic << " vs. 0x" << super_header::super_magic
+ << std::dec << std::endl;
+ return -EFAULT;
+ }
+
+ if (sh.version > super_header::super_ver) {
+ cerr << "Can't handle export format version=" << sh.version << std::endl;
+ return -EINVAL;
+ }
+
+  // First section must be TYPE_POOL_BEGIN or TYPE_PG_BEGIN
+ sectiontype_t type;
+ ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ bool pool_mode = false;
+ if (type == TYPE_POOL_BEGIN) {
+ pool_mode = true;
+ cout << "Importing pool" << std::endl;
+ } else if (type == TYPE_PG_BEGIN) {
+ auto ebliter = ebl.cbegin();
+ pg_begin pgb;
+ pgb.decode(ebliter);
+    spg_t pgid = pgb.pgid;
+ if (!pgid.is_no_shard()) {
+ cerr << "Importing Erasure Coded shard is not supported" << std::endl;
+ return -EOPNOTSUPP;
+ }
+ dout(10) << "Exported features: " << pgb.superblock.compat_features << dendl;
+ cout << "Importing from pgid " << pgid << std::endl;
+ } else {
+ cerr << "Invalid initial section code " << type << std::endl;
+ return -EFAULT;
+ }
+
+ // XXX: How to check export features?
+#if 0
+ if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+ cerr << "Export has incompatible features set "
+ << pgb.superblock.compat_features << std::endl;
+ return -EINVAL;
+ }
+#endif
+
+#if defined(__linux__)
+ if (file_fd != STDIN_FILENO)
+ posix_fadvise(file_fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+
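+  // Stream sections into the pool one object at a time; TYPE_PG_METADATA is
+  // ignored because importing through librados does not restore PG-level state.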
+ bool done = false;
+ bool found_metadata = false;
+ while(!done) {
+ ret = read_section(&type, &ebl);
+ if (ret)
+ return ret;
+
+ //cout << "do_import: Section type " << hex << type << dec << std::endl;
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_OBJECT_BEGIN:
+ ret = get_object_rados(io_ctx, ebl, no_overwrite);
+ if (ret) {
+ cerr << "Error inserting object: " << ret << std::endl;
+ return ret;
+ }
+ break;
+ case TYPE_PG_METADATA:
+ dout(10) << "Don't care about the old metadata" << dendl;
+ found_metadata = true;
+ break;
+ case TYPE_PG_END:
+ done = true;
+ break;
+ case TYPE_POOL_END:
+ done = true;
+ break;
+ default:
+ return -EFAULT;
+ }
+ }
+
+ if (!(pool_mode || found_metadata)) {
+ cerr << "Missing metadata section!" << std::endl;
+ }
+
+#if defined(__linux__)
+ if (file_fd != STDIN_FILENO)
+ posix_fadvise(file_fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ return 0;
+}
+
+int RadosImport::get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite)
+{
+ auto ebliter = bl.cbegin();
+ object_begin ob;
+ ob.decode(ebliter);
+ map<string,bufferlist>::iterator i;
+ bufferlist abl;
+ bool skipping;
+
+ data_section ds;
+ attr_section as;
+ omap_hdr_section oh;
+ omap_section os;
+
+ ceph_assert(g_ceph_context);
+ if (ob.hoid.hobj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) {
+ cout << "Skipping internal object " << ob.hoid << std::endl;
+ skip_object(bl);
+ return 0;
+ }
+
+ if (!ob.hoid.hobj.is_head()) {
+ cout << "Skipping non-head for " << ob.hoid << std::endl;
+ skip_object(bl);
+ return 0;
+ }
+
+ ioctx.set_namespace(ob.hoid.hobj.get_namespace());
+ ioctx.locator_set_key(ob.hoid.hobj.get_key());
+
+ string msg("Write");
+ skipping = false;
+ if (dry_run) {
+ uint64_t psize;
+ time_t pmtime;
+ int ret = ioctx.stat(ob.hoid.hobj.oid.name, &psize, &pmtime);
+ if (ret == 0) {
+ if (no_overwrite)
+ // Could set skipping, but dry-run doesn't change anything either
+ msg = "Skipping existing";
+ else
+ msg = "***Overwrite***";
+ }
+ } else {
+ int ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+ if (ret && ret != -EEXIST) {
+ cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ if (ret == -EEXIST) {
+ if (no_overwrite) {
+ msg = "Skipping existing";
+ skipping = true;
+ } else {
+ msg = "***Overwrite***";
+ ret = ioctx.remove(ob.hoid.hobj.oid.name);
+ if (ret < 0) {
+ cerr << "remove failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ ret = ioctx.create(ob.hoid.hobj.oid.name, true);
+ // If object re-appeared after removal, let's just skip it
+ if (ret == -EEXIST) {
+ skipping = true;
+ msg = "Skipping in-use object";
+ ret = 0;
+ }
+ if (ret < 0) {
+ cerr << "create failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ }
+ }
+ }
+
+ cout << msg << " " << ob.hoid << std::endl;
+
+ bool need_align = false;
+ uint64_t alignment = 0;
+ if (align) {
+ need_align = true;
+ alignment = align;
+ } else {
+ int ret = ioctx.pool_requires_alignment2(&need_align);
+ if (ret < 0) {
+ cerr << "pool_requires_alignment2 failed: " << cpp_strerror(ret)
+ << std::endl;
+ return ret;
+ }
+
+ if (need_align) {
+ ret = ioctx.pool_required_alignment2(&alignment);
+ if (ret < 0) {
+ cerr << "pool_required_alignment2 failed: " << cpp_strerror(ret)
+ << std::endl;
+ return ret;
+ }
+ ceph_assert(alignment != 0);
+ }
+ }
+
+ if (need_align) {
+ dout(10) << "alignment = " << alignment << dendl;
+ }
+
+ bufferlist ebl, databl;
+ uint64_t in_offset = 0, out_offset = 0;
+ bool done = false;
+ while(!done) {
+ sectiontype_t type;
+ int ret = read_section(&type, &ebl);
+ if (ret) {
+ cerr << "Error reading section: " << ret << std::endl;
+ return ret;
+ }
+
+ ebliter = ebl.cbegin();
+ //cout << "\tdo_object: Section type " << hex << type << dec << std::endl;
+ //cout << "\t\tsection size " << ebl.length() << std::endl;
+ if (type >= END_OF_TYPES) {
+ cout << "Skipping unknown object section type" << std::endl;
+ continue;
+ }
+ switch(type) {
+ case TYPE_DATA:
+ ds.decode(ebliter);
+ dout(10) << "\tdata: offset " << ds.offset << " len " << ds.len << dendl;
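+      // For pools that require alignment, buffer the data and only flush
+      // writes in whole multiples of 'alignment'; any unaligned tail is
+      // carried over and written out at TYPE_OBJECT_END.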
+ if (need_align) {
+ if (ds.offset != in_offset) {
+ cerr << "Discontiguous object data in export" << std::endl;
+ return -EFAULT;
+ }
+ ceph_assert(ds.databl.length() == ds.len);
+ databl.claim_append(ds.databl);
+ in_offset += ds.len;
+ if (databl.length() >= alignment) {
+ uint64_t rndlen = uint64_t(databl.length() / alignment) * alignment;
+ dout(10) << "write offset=" << out_offset << " len=" << rndlen << dendl;
+ if (!dry_run && !skipping) {
+ ret = ioctx.write(ob.hoid.hobj.oid.name, databl, rndlen, out_offset);
+ if (ret) {
+ cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ }
+ out_offset += rndlen;
+ bufferlist n;
+ if (databl.length() > rndlen) {
+ ceph_assert(databl.length() - rndlen < alignment);
+ n.substr_of(databl, rndlen, databl.length() - rndlen);
+ }
+ databl = n;
+ }
+ break;
+ }
+ if (!dry_run && !skipping) {
+ ret = ioctx.write(ob.hoid.hobj.oid.name, ds.databl, ds.len, ds.offset);
+ if (ret) {
+ cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ }
+ break;
+ case TYPE_ATTRS:
+ as.decode(ebliter);
+
+ dout(10) << "\tattrs: len " << as.data.size() << dendl;
+ if (dry_run || skipping)
+ break;
+ for (std::map<string,bufferlist>::iterator i = as.data.begin();
+ i != as.data.end(); ++i) {
+ // The user xattrs that we want all begin with "_" with length > 1.
+ // Drop key "_" and all attributes that do not start with '_'
+ if (i->first == "_" || i->first[0] != '_')
+ continue;
+ ret = ioctx.setxattr(ob.hoid.hobj.oid.name, i->first.substr(1).c_str(), i->second);
+ if (ret) {
+ cerr << "setxattr failed: " << cpp_strerror(ret) << std::endl;
+ if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+ }
+ break;
+ case TYPE_OMAP_HDR:
+ oh.decode(ebliter);
+
+ dout(10) << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length())
+ << dendl;
+ if (dry_run || skipping)
+ break;
+ ret = ioctx.omap_set_header(ob.hoid.hobj.oid.name, oh.hdr);
+ if (ret) {
+ cerr << "omap_set_header failed: " << cpp_strerror(ret) << std::endl;
+ if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+ break;
+ case TYPE_OMAP:
+ os.decode(ebliter);
+
+ dout(10) << "\tomap: size " << os.omap.size() << dendl;
+ if (dry_run || skipping)
+ break;
+ ret = ioctx.omap_set(ob.hoid.hobj.oid.name, os.omap);
+ if (ret) {
+ cerr << "omap_set failed: " << cpp_strerror(ret) << std::endl;
+ if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+ break;
+ case TYPE_OBJECT_END:
+ done = true;
+ if (need_align && databl.length() > 0) {
+ ceph_assert(databl.length() < alignment);
+ dout(10) << "END write offset=" << out_offset << " len=" << databl.length() << dendl;
+ if (dry_run || skipping)
+ break;
+ ret = ioctx.write(ob.hoid.hobj.oid.name, databl, databl.length(), out_offset);
+ if (ret) {
+ cerr << "write failed: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ }
+ break;
+ default:
+ cerr << "Unexpected section type " << type << std::endl;
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
diff --git a/src/tools/rados/RadosImport.h b/src/tools/rados/RadosImport.h
new file mode 100644
index 00000000..3a516630
--- /dev/null
+++ b/src/tools/rados/RadosImport.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef RADOS_IMPORT_H_
+#define RADOS_IMPORT_H_
+
+#include <string>
+
+#include "include/rados/librados.hpp"
+#include "include/buffer_fwd.h"
+
+#include "tools/RadosDump.h"
+
+/**
+ * Specialization of RadosDump that adds
+ * methods for importing objects from a stream
+ * to a live cluster.
+ */
+class RadosImport : public RadosDump
+{
+ protected:
+ uint64_t align;
+ int get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool no_overwrite);
+
+ public:
+ RadosImport(int file_fd_, uint64_t align_, bool dry_run_)
+ : RadosDump(file_fd_, dry_run_), align(align_)
+ {}
+
+ int import(std::string pool, bool no_overwrite);
+ int import(librados::IoCtx &io_ctx, bool no_overwrite);
+};
+
+#endif // RADOS_IMPORT_H_
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
new file mode 100644
index 00000000..280a51dd
--- /dev/null
+++ b/src/tools/rados/rados.cc
@@ -0,0 +1,4135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+
+#include "include/rados/buffer.h"
+#include "include/rados/librados.hpp"
+#include "include/rados/rados_types.hpp"
+
+#include "acconfig.h"
+#ifdef WITH_LIBRADOSSTRIPER
+ #include "include/radosstriper/libradosstriper.hpp"
+ using namespace libradosstriper;
+#endif
+
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/obj_bencher.h"
+#include "common/TextTable.h"
+#include "include/stringify.h"
+#include "mds/inode_backtrace.h"
+#include "include/random.h"
+#include <iostream>
+#include <fstream>
+
+#include <stdlib.h>
+#include <time.h>
+#include <sstream>
+#include <errno.h>
+#include <dirent.h>
+#include <stdexcept>
+#include <climits>
+#include <locale>
+#include <memory>
+#include <optional>
+
+#include "cls/lock/cls_lock_client.h"
+#include "include/compat.h"
+#include "include/util.h"
+#include "common/hobject.h"
+
+#include "PoolDump.h"
+#include "RadosImport.h"
+
+#include "osd/ECUtil.h"
+
+using namespace librados;
+using ceph::util::generate_random_number;
+
+// two steps seem to be necessary to do this right
+#define STR(x) _STR(x)
+#define _STR(x) #x
+
+void usage(ostream& out)
+{
+ out << \
+"usage: rados [options] [commands]\n"
+"POOL COMMANDS\n"
+" lspools list pools\n"
+" cppool <pool-name> <dest-pool> copy content of a pool\n"
+" purge <pool-name> --yes-i-really-really-mean-it\n"
+" remove all objects from pool <pool-name> without removing it\n"
+" df show per-pool and total usage\n"
+" ls list objects in pool\n\n"
+"\n"
+"POOL SNAP COMMANDS\n"
+" lssnap list snaps\n"
+" mksnap <snap-name> create snap <snap-name>\n"
+" rmsnap <snap-name> remove snap <snap-name>\n"
+"\n"
+"OBJECT COMMANDS\n"
+" get <obj-name> <outfile> fetch object\n"
+" put <obj-name> <infile> [--offset offset]\n"
+" write object with start offset (default:0)\n"
+" append <obj-name> <infile> append object\n"
+" truncate <obj-name> length truncate object\n"
+" create <obj-name> create object\n"
+"   rm <obj-name> ... [--force-full]  remove object(s); with --force-full, remove even if the cluster is full\n"
+" cp <obj-name> [target-obj] copy object\n"
+" listxattr <obj-name>\n"
+" getxattr <obj-name> attr\n"
+" setxattr <obj-name> attr val\n"
+" rmxattr <obj-name> attr\n"
+" stat <obj-name> stat the named object\n"
+" stat2 <obj-name> stat2 the named object (with high precision time)\n"
+" touch <obj-name> [timestamp] change the named object modification time\n"
+" mapext <obj-name>\n"
+" rollback <obj-name> <snap-name> roll back object to snap <snap-name>\n"
+"\n"
+" listsnaps <obj-name> list the snapshots of this object\n"
+" bench <seconds> write|seq|rand [-t concurrent_operations] [--no-cleanup] [--run-name run_name] [--no-hints] [--reuse-bench]\n"
+" default is 16 concurrent IOs and 4 MB ops\n"
+" default is to clean up after write benchmark\n"
+" default run-name is 'benchmark_last_metadata'\n"
+" cleanup [--run-name run_name] [--prefix prefix]\n"
+" clean up a previous benchmark operation\n"
+" default run-name is 'benchmark_last_metadata'\n"
+" load-gen [options] generate load on the cluster\n"
+" listomapkeys <obj-name> list the keys in the object map\n"
+" listomapvals <obj-name> list the keys and vals in the object map \n"
+" getomapval <obj-name> <key> [file] show the value for the specified key\n"
+" in the object's object map\n"
+" setomapval <obj-name> <key> <val>\n"
+" rmomapkey <obj-name> <key>\n"
+" clearomap <obj-name> [obj-name2 obj-name3...] clear all the omap keys for the specified objects\n"
+" getomapheader <obj-name> [file]\n"
+" setomapheader <obj-name> <val>\n"
+" watch <obj-name> add watcher on this object\n"
+" notify <obj-name> <message> notify watcher of this object with message\n"
+" listwatchers <obj-name> list the watchers of this object\n"
+" set-alloc-hint <obj-name> <expected-object-size> <expected-write-size>\n"
+" set allocation hint for an object\n"
+" set-redirect <object A> --target-pool <caspool> <target object A> [--with-reference]\n"
+" set redirect target\n"
+"   set-chunk <object A> <offset> <length> --target-pool <caspool> <target object A> <target-offset> [--with-reference]\n"
+" convert an object to chunked object\n"
+" tier-promote <obj-name> promote the object to the base tier\n"
+" unset-manifest <obj-name> unset redirect or chunked object\n"
+"\n"
+"IMPORT AND EXPORT\n"
+" export [filename]\n"
+" Serialize pool contents to a file or standard out.\n"
+" import [--dry-run] [--no-overwrite] < filename | - >\n"
+" Load pool contents from a file or standard in\n"
+"\n"
+"ADVISORY LOCKS\n"
+" lock list <obj-name>\n"
+" List all advisory locks on an object\n"
+" lock get <obj-name> <lock-name>\n"
+" Try to acquire a lock\n"
+" lock break <obj-name> <lock-name> <locker-name>\n"
+" Try to break a lock acquired by another client\n"
+" lock info <obj-name> <lock-name>\n"
+" Show lock information\n"
+" options:\n"
+" --lock-tag Lock tag, all locks operation should use\n"
+" the same tag\n"
+" --lock-cookie Locker cookie\n"
+" --lock-description Description of lock\n"
+" --lock-duration Lock duration (in seconds)\n"
+" --lock-type Lock type (shared, exclusive)\n"
+"\n"
+"SCRUB AND REPAIR:\n"
+" list-inconsistent-pg <pool> list inconsistent PGs in given pool\n"
+" list-inconsistent-obj <pgid> list inconsistent objects in given PG\n"
+" list-inconsistent-snapset <pgid> list inconsistent snapsets in the given PG\n"
+"\n"
+"CACHE POOLS: (for testing/development only)\n"
+" cache-flush <obj-name> flush cache pool object (blocking)\n"
+" cache-try-flush <obj-name> flush cache pool object (non-blocking)\n"
+" cache-evict <obj-name> evict cache pool object\n"
+" cache-flush-evict-all flush+evict all objects\n"
+" cache-try-flush-evict-all try-flush+evict all objects\n"
+"\n"
+"GLOBAL OPTIONS:\n"
+" --object_locator object_locator\n"
+" set object_locator for operation\n"
+" -p pool\n"
+" --pool=pool\n"
+" select given pool by name\n"
+" --target-pool=pool\n"
+" select target pool by name\n"
+" --pgid PG id\n"
+" select given PG id\n"
+" -f [--format plain|json|json-pretty]\n"
+"   --format=[plain|json|json-pretty]\n"
+" -b op_size\n"
+" set the block size for put/get ops and for write benchmarking\n"
+" -O object_size\n"
+" set the object size for put/get ops and for write benchmarking\n"
+" --max-objects\n"
+" set the max number of objects for write benchmarking\n"
+" --obj-name-file file\n"
+" use the content of the specified file in place of <obj-name>\n"
+" -s name\n"
+" --snap name\n"
+" select given snap name for (read) IO\n"
+" --create\n"
+" create the pool or directory that was specified\n"
+" -N namespace\n"
+" --namespace=namespace\n"
+" specify the namespace to use for the object\n"
+" --all\n"
+" Use with ls to list objects in all namespaces\n"
+" Put in CEPH_ARGS environment variable to make this the default\n"
+" --default\n"
+" Use with ls to list objects in default namespace\n"
+" Takes precedence over --all in case --all is in environment\n"
+" --target-locator\n"
+" Use with cp to specify the locator of the new object\n"
+" --target-nspace\n"
+" Use with cp to specify the namespace of the new object\n"
+#ifdef WITH_LIBRADOSSTRIPER
+" --striper\n"
+" Use radostriper interface rather than pure rados\n"
+" Available for stat, get, put, truncate, rm, ls and \n"
+" all xattr related operations\n"
+#endif
+"\n"
+"BENCH OPTIONS:\n"
+" -t N\n"
+" --concurrent-ios=N\n"
+" Set number of concurrent I/O operations\n"
+" --show-time\n"
+" prefix output with date/time\n"
+" --no-verify\n"
+" do not verify contents of read objects\n"
+" --write-object\n"
+" write contents to the objects\n"
+" --write-omap\n"
+" write contents to the omap\n"
+" --write-xattr\n"
+" write contents to the extended attributes\n"
+"\n"
+"LOAD GEN OPTIONS:\n"
+" --num-objects total number of objects\n"
+" --min-object-size min object size\n"
+" --max-object-size max object size\n"
+" --min-op-len min io size of operations\n"
+" --max-op-len max io size of operations\n"
+" --max-ops max number of operations\n"
+" --max-backlog max backlog size\n"
+" --read-percent percent of operations that are read\n"
+" --target-throughput target throughput (in bytes)\n"
+" --run-length total time (in seconds)\n"
+"  --offset-align               at what boundary to align random op offsets\n"
+"\n"
+"CACHE POOLS OPTIONS:\n"
+" --with-clones include clones when doing flush or evict\n"
+"OMAP OPTIONS:\n"
+" --omap-key-file file read the omap key from a file\n";
+}
+
+namespace detail {
+
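+// Thin wrappers that dispatch each object operation either to the plain
+// librados IoCtx or, when --striper is in effect, to the libradosstriper
+// interface.  Striped object listings carry a 17-character suffix on the
+// name, which get_oid()/dump_name() strip for display.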
+#ifdef WITH_LIBRADOSSTRIPER
+RadosStriper& striper()
+{
+ static RadosStriper s;
+ return s;
+}
+#endif
+
+int read([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& out_data, const unsigned op_size, const uint64_t offset, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().read(oid, &out_data, op_size, offset);
+#endif
+
+ return io_ctx.read(oid, out_data, op_size, offset);
+}
+
+int write([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& indata, const uint64_t count, const uint64_t offset, [[maybe_unused]] const bool use_striper)
+{
+ #ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().write(oid, indata, count, offset);
+#endif
+
+ return io_ctx.write(oid, indata, count, offset);
+}
+
+int write_full([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, bufferlist& indata, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().write_full(oid, indata);
+#endif
+
+ return io_ctx.write_full(oid, indata);
+}
+
+int trunc([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const uint64_t offset, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().trunc(oid, offset);
+#endif
+
+ return io_ctx.trunc(oid, offset);
+}
+
+int append([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, buffer::list& indata, const uint64_t count, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().append(oid, indata, count);
+#endif
+
+ return io_ctx.append(oid, indata, count);
+}
+
+int setxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, buffer::list& bl, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().setxattr(oid, attr_name.c_str(), bl);
+#endif
+
+ return io_ctx.setxattr(oid, attr_name.c_str(), bl);
+}
+
+int getxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, buffer::list& bl, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().getxattr(oid, attr_name.c_str(), bl);
+#endif
+
+ return io_ctx.getxattr(oid, attr_name.c_str(), bl);
+}
+
+int rmxattr([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const std::string& attr_name, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().rmxattr(oid, attr_name.c_str());
+#endif
+
+ return io_ctx.rmxattr(oid, attr_name.c_str());
+}
+
+int getxattrs([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, std::map<std::string, buffer::list>& attrset, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().getxattrs(oid, attrset);
+#endif
+
+ return io_ctx.getxattrs(oid, attrset);
+}
+
+int remove([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, const int flags, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().remove(oid, flags);
+#endif
+
+ return io_ctx.remove(oid, flags);
+}
+
+int remove([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().remove(oid);
+#endif
+
+ return io_ctx.remove(oid);
+}
+
+std::string get_oid(librados::NObjectIterator& i, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return i->get_oid().substr(0, i->get_oid().length()-17);
+#endif
+
+ return i->get_oid();
+}
+
+int stat([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, uint64_t& size, time_t& mtime, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().stat(oid, &size, &mtime);
+#endif
+
+ return io_ctx.stat(oid, &size, &mtime);
+}
+
+int stat2([[maybe_unused]] IoCtx& io_ctx, const std::string& oid, uint64_t& size, timespec& mtime, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper)
+ return striper().stat2(oid, &size, &mtime);
+#endif
+
+ return io_ctx.stat2(oid, &size, &mtime);
+}
+
+void dump_name(Formatter *formatter, const librados::NObjectIterator& i, [[maybe_unused]] const bool use_striper)
+{
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper) {
+ formatter->dump_string("name", i->get_oid().substr(0, i->get_oid().length()-17));
+ return;
+ }
+#endif
+
+ formatter->dump_string("name", i->get_oid());
+}
+
+} // namespace detail
+
+unsigned default_op_size = 1 << 22;
+
+[[noreturn]] static void usage_exit()
+{
+ usage(cerr);
+ exit(1);
+}
+
+
+template <typename I, typename T>
+static int rados_sistrtoll(I &i, T *val) {
+ std::string err;
+ *val = strict_iecstrtoll(i->second.c_str(), &err);
+ if (err != "") {
+ cerr << "Invalid value for " << i->first << ": " << err << std::endl;
+ return -EINVAL;
+ } else {
+ return 0;
+ }
+}
+
+
+static int dump_data(std::string const &filename, bufferlist const &data)
+{
+ int fd;
+ if (filename == "-") {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = TEMP_FAILURE_RETRY(::open(filename.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0644));
+ if (fd < 0) {
+ int err = errno;
+ cerr << "failed to open file: " << cpp_strerror(err) << std::endl;
+ return -err;
+ }
+ }
+
+ int r = data.write_fd(fd);
+
+ if (fd != 1) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+
+ return r;
+}
+
+
+static int do_get(IoCtx& io_ctx, const std::string& oid, const char *outfile, unsigned op_size, [[maybe_unused]] const bool use_striper)
+{
+ int fd;
+ if (strcmp(outfile, "-") == 0) {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = TEMP_FAILURE_RETRY(::open(outfile, O_WRONLY|O_CREAT|O_TRUNC, 0644));
+ if (fd < 0) {
+ int err = errno;
+ cerr << "failed to open file: " << cpp_strerror(err) << std::endl;
+ return -err;
+ }
+ }
+
+ uint64_t offset = 0;
+ int ret;
+ while (true) {
+ bufferlist outdata;
+
+ ret = detail::read(io_ctx, oid, outdata, op_size, offset, use_striper);
+ if (ret <= 0) {
+ goto out;
+ }
+ ret = outdata.write_fd(fd);
+ if (ret < 0) {
+ cerr << "error writing to file: " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ if (outdata.length() < op_size)
+ break;
+ offset += outdata.length();
+ }
+ ret = 0;
+
+ out:
+ if (fd != 1)
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+}
+
+static int do_copy(IoCtx& io_ctx, const char *objname,
+ IoCtx& target_ctx, const char *target_obj)
+{
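+  // Copy the object server-side with copy_from, hinting sequential access and
+  // that neither source nor destination data needs to stay in cache.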
+ __le32 src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ __le32 dest_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ ObjectWriteOperation op;
+ op.copy_from(objname, io_ctx, 0, src_fadvise_flags);
+ op.set_op_flags2(dest_fadvise_flags);
+
+ return target_ctx.operate(target_obj, &op);
+}
+
+static int do_copy_pool(Rados& rados, const char *src_pool, const char *target_pool)
+{
+ IoCtx src_ctx, target_ctx;
+ int ret = rados.ioctx_create(src_pool, src_ctx);
+ if (ret < 0) {
+ cerr << "cannot open source pool: " << src_pool << std::endl;
+ return ret;
+ }
+ ret = rados.ioctx_create(target_pool, target_ctx);
+ if (ret < 0) {
+ cerr << "cannot open target pool: " << target_pool << std::endl;
+ return ret;
+ }
+ src_ctx.set_namespace(all_nspaces);
+ librados::NObjectIterator i = src_ctx.nobjects_begin();
+ librados::NObjectIterator i_end = src_ctx.nobjects_end();
+ for (; i != i_end; ++i) {
+ string nspace = i->get_nspace();
+ string oid = i->get_oid();
+ string locator = i->get_locator();
+
+ string target_name = (nspace.size() ? nspace + "/" : "") + oid;
+ string src_name = target_name;
+ if (locator.size())
+ src_name += "(@" + locator + ")";
+ cout << src_pool << ":" << src_name << " => "
+ << target_pool << ":" << target_name << std::endl;
+
+ src_ctx.locator_set_key(locator);
+ src_ctx.set_namespace(nspace);
+ target_ctx.set_namespace(nspace);
+ ret = do_copy(src_ctx, oid.c_str(), target_ctx, oid.c_str());
+ if (ret < 0) {
+      cerr << "error copying object: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int do_put(IoCtx& io_ctx,
+ const std::string& oid, const char *infile, int op_size,
+ uint64_t obj_offset,
+ const bool use_striper)
+{
+ bool stdio = (strcmp(infile, "-") == 0);
+ int ret = 0;
+ int fd = STDIN_FILENO;
+ if (!stdio)
+ fd = open(infile, O_RDONLY);
+ if (fd < 0) {
+ cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl;
+ return 1;
+ }
+ int count = op_size;
+ uint64_t offset = obj_offset;
+ while (count != 0) {
+ bufferlist indata;
+ count = indata.read_fd(fd, op_size);
+ if (count < 0) {
+ ret = -errno;
+ cerr << "error reading input file " << infile << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+
+ if (count == 0) {
+      if (offset == obj_offset) { // input was empty: create the object; if obj_offset > 0, leave a hole at the start
+ ret = detail::write_full(io_ctx, oid, indata, use_striper); // indata is empty
+
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (offset) {
+          ret = detail::trunc(io_ctx, oid, offset, use_striper); // the object must exist before it can be truncated
+
+ if (ret < 0) {
+ goto out;
+ }
+ }
+ }
+ continue;
+ }
+
+ if (0 == offset)
+ ret = detail::write_full(io_ctx, oid, indata, use_striper);
+ else
+ ret = detail::write(io_ctx, oid, indata, count, offset, use_striper);
+
+ if (ret < 0) {
+ goto out;
+ }
+ offset += count;
+ }
+ ret = 0;
+ out:
+  if (fd != STDIN_FILENO)  // input fd; don't close stdin when reading from "-"
+ VOID_TEMP_FAILURE_RETRY(close(fd));
+ return ret;
+}
+
+static int do_append(IoCtx& io_ctx,
+ const std::string& oid, const char *infile, int op_size,
+ const bool use_striper)
+{
+ bool stdio = (strcmp(infile, "-") == 0);
+ int ret = 0;
+ int fd = STDIN_FILENO;
+ if (!stdio)
+ fd = open(infile, O_RDONLY);
+ if (fd < 0) {
+ cerr << "error reading input file " << infile << ": " << cpp_strerror(errno) << std::endl;
+ return 1;
+ }
+ int count = op_size;
+ while (count != 0) {
+ bufferlist indata;
+ count = indata.read_fd(fd, op_size);
+ if (count < 0) {
+ ret = -errno;
+ cerr << "error reading input file " << infile << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = detail::append(io_ctx, oid, indata, count, use_striper);
+
+ if (ret < 0) {
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+  if (fd != STDIN_FILENO)  // input fd; don't close stdin when reading from "-"
+ VOID_TEMP_FAILURE_RETRY(close(fd));
+ return ret;
+}
+
+class RadosWatchCtx : public librados::WatchCtx2 {
+ IoCtx& ioctx;
+ string name;
+public:
+ RadosWatchCtx(IoCtx& io, const char *imgname) : ioctx(io), name(imgname) {}
+ ~RadosWatchCtx() override {}
+ void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) override {
+ cout << "NOTIFY"
+ << " cookie " << cookie
+ << " notify_id " << notify_id
+ << " from " << notifier_id
+ << std::endl;
+ bl.hexdump(cout);
+ ioctx.notify_ack(name, notify_id, cookie, bl);
+ }
+ void handle_error(uint64_t cookie, int err) override {
+ cout << "ERROR"
+ << " cookie " << cookie
+ << " err " << cpp_strerror(err)
+ << std::endl;
+ }
+};
+
+static const char alphanum_table[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+
+void gen_rand_alphanumeric(char *dest, int size) /* size should be the required string size + 1 */
+{
+ const int max = sizeof(alphanum_table) - 2;
+
+ int i;
+ for (i=0; i<size - 1; i++) {
+ int pos = generate_random_number(0, max);
+ dest[i] = alphanum_table[pos];
+ }
+ dest[i] = '\0';
+}
+
+struct obj_info {
+ string name;
+ size_t len;
+};
+
+class LoadGen {
+ size_t total_sent;
+ size_t total_completed;
+
+ IoCtx io_ctx;
+ Rados *rados;
+
+ map<int, obj_info> objs;
+
+ utime_t start_time;
+
+ bool going_down;
+
+public:
+ int read_percent;
+ int num_objs;
+ size_t min_obj_len;
+ size_t max_obj_len;
+ size_t min_op_len;
+ size_t max_op_len;
+ size_t max_ops;
+ size_t max_backlog;
+ size_t target_throughput;
+ size_t offset_align = 0;
+ int run_length;
+
+ enum {
+ OP_READ,
+ OP_WRITE,
+ };
+
+ struct LoadGenOp {
+ int id;
+ int type;
+ string oid;
+ size_t off;
+ size_t len;
+ bufferlist bl;
+ LoadGen *lg;
+ librados::AioCompletion *completion;
+
+ LoadGenOp() : id(0), type(0), off(0), len(0), lg(NULL), completion(NULL) {}
+ explicit LoadGenOp(LoadGen *_lg) : id(0), type(0), off(0), len(0), lg(_lg), completion(NULL) {}
+ };
+
+ int max_op;
+
+ map<int, LoadGenOp *> pending_ops;
+
+ void gen_op(LoadGenOp *op);
+ uint64_t gen_next_op();
+ void run_op(LoadGenOp *op);
+
+ uint64_t cur_sent_rate() {
+ return total_sent / time_passed();
+ }
+
+ uint64_t cur_completed_rate() {
+ return total_completed / time_passed();
+ }
+
+ uint64_t total_expected() {
+ return target_throughput * time_passed();
+ }
+
+ float time_passed() {
+ utime_t now = ceph_clock_now();
+ now -= start_time;
+ uint64_t ns = now.nsec();
+ float total = (float) ns / 1000000000.0;
+ total += now.sec();
+ return total;
+ }
+
+ Mutex lock;
+ Cond cond;
+
+ explicit LoadGen(Rados *_rados) : rados(_rados), going_down(false), lock("LoadGen") {
+ read_percent = 80;
+ min_obj_len = 1024;
+ max_obj_len = 5ull * 1024ull * 1024ull * 1024ull;
+ min_op_len = 1024;
+ target_throughput = 5 * 1024 * 1024; // B/sec
+ max_op_len = 2 * 1024 * 1024;
+ max_ops = 16;
+ max_backlog = target_throughput * 2;
+ run_length = 60;
+
+ total_sent = 0;
+ total_completed = 0;
+ num_objs = 200;
+ max_op = 0;
+ }
+ int bootstrap(const char *pool);
+ int run();
+ void cleanup();
+
+ void io_cb(completion_t c, LoadGenOp *op) {
+ Mutex::Locker l(lock);
+
+ total_completed += op->len;
+
+ double rate = (double)cur_completed_rate() / (1024 * 1024);
+ std::streamsize original_precision = cout.precision();
+ cout.precision(3);
+ cout << "op " << op->id << " completed, throughput=" << rate << "MB/sec" << std::endl;
+ cout.precision(original_precision);
+
+ map<int, LoadGenOp *>::iterator iter = pending_ops.find(op->id);
+ if (iter != pending_ops.end())
+ pending_ops.erase(iter);
+
+ if (!going_down)
+ op->completion->release();
+
+ delete op;
+
+ cond.Signal();
+ }
+};
+
+static void _load_gen_cb(completion_t c, void *param)
+{
+ LoadGen::LoadGenOp *op = (LoadGen::LoadGenOp *)param;
+ op->lg->io_cb(c, op);
+}
+
+int LoadGen::bootstrap(const char *pool)
+{
+ char buf[128];
+ int i;
+
+ if (!pool) {
+ cerr << "ERROR: pool name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ int ret = rados->ioctx_create(pool, io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool " << pool << ": " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+
+ int buf_len = 1;
+ bufferptr p = buffer::create(buf_len);
+ bufferlist bl;
+ memset(p.c_str(), 0, buf_len);
+ bl.push_back(p);
+
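+  // Create num_objs objects at their target sizes by writing a single byte at
+  // the end of each (leaving them sparse), keeping at most max_ops writes in
+  // flight.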
+ list<librados::AioCompletion *> completions;
+ for (i = 0; i < num_objs; i++) {
+ obj_info info;
+ gen_rand_alphanumeric(buf, 16);
+ info.name = "obj-";
+ info.name.append(buf);
+ info.len = generate_random_number(min_obj_len, max_obj_len);
+
+ // throttle...
+ while (completions.size() > max_ops) {
+ AioCompletion *c = completions.front();
+ c->wait_for_complete();
+ ret = c->get_return_value();
+ c->release();
+ completions.pop_front();
+ if (ret < 0) {
+ cerr << "aio_write failed" << std::endl;
+ return ret;
+ }
+ }
+
+ librados::AioCompletion *c = rados->aio_create_completion(NULL, NULL, NULL);
+ completions.push_back(c);
+ // generate object
+ ret = io_ctx.aio_write(info.name, c, bl, buf_len, info.len - buf_len);
+ if (ret < 0) {
+ cerr << "couldn't write obj: " << info.name << " ret=" << ret << std::endl;
+ return ret;
+ }
+ objs[i] = info;
+ }
+
+ list<librados::AioCompletion *>::iterator iter;
+ for (iter = completions.begin(); iter != completions.end(); ++iter) {
+ AioCompletion *c = *iter;
+ c->wait_for_complete();
+ ret = c->get_return_value();
+ c->release();
+ if (ret < 0) { // yes, we leak.
+ cerr << "aio_write failed" << std::endl;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+void LoadGen::run_op(LoadGenOp *op)
+{
+ op->completion = rados->aio_create_completion(op, _load_gen_cb, NULL);
+
+ switch (op->type) {
+ case OP_READ:
+ io_ctx.aio_read(op->oid, op->completion, &op->bl, op->len, op->off);
+ break;
+ case OP_WRITE:
+ bufferptr p = buffer::create(op->len);
+ memset(p.c_str(), 0, op->len);
+ op->bl.push_back(p);
+
+ io_ctx.aio_write(op->oid, op->completion, op->bl, op->len, op->off);
+ break;
+ }
+
+ total_sent += op->len;
+}
+
+void LoadGen::gen_op(LoadGenOp *op)
+{
+ int i = generate_random_number<int>(0, objs.size() - 1);
+ obj_info& info = objs[i];
+ op->oid = info.name;
+
+ size_t len = generate_random_number(min_op_len, max_op_len);
+ if (len > info.len)
+ len = info.len;
+ size_t off = generate_random_number<size_t>(0, info.len);
+
+ if (off + len > info.len)
+ off = info.len - len;
+
+ if (offset_align)
+ off = p2align(off, offset_align);
+
+ op->off = off;
+ op->len = len;
+
+ i = generate_random_number(1, 100);
+ if (i > read_percent)
+ op->type = OP_WRITE;
+ else
+ op->type = OP_READ;
+
+ cout << (op->type == OP_READ ? "READ" : "WRITE") << " : oid=" << op->oid << " off=" << op->off << " len=" << op->len << std::endl;
+}
+
+uint64_t LoadGen::gen_next_op()
+{
+ lock.Lock();
+
+ LoadGenOp *op = new LoadGenOp(this);
+ gen_op(op);
+ op->id = max_op++;
+ pending_ops[op->id] = op;
+
+ lock.Unlock();
+
+ run_op(op);
+
+ return op->len;
+}
+
+int LoadGen::run()
+{
+ start_time = ceph_clock_now();
+ utime_t end_time = start_time;
+ end_time += run_length;
+ utime_t stamp_time = start_time;
+ uint32_t total_sec = 0;
+
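+  // Main loop: wake up roughly once a second, print completed throughput and
+  // the outstanding backlog, and issue new ops until the target throughput is
+  // met or the backlog / concurrency limits are hit.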
+ while (1) {
+ lock.Lock();
+ utime_t one_second(1, 0);
+ cond.WaitInterval(lock, one_second);
+ lock.Unlock();
+ utime_t now = ceph_clock_now();
+
+ if (now > end_time)
+ break;
+
+ uint64_t expected = total_expected();
+ lock.Lock();
+ uint64_t sent = total_sent;
+ uint64_t completed = total_completed;
+ lock.Unlock();
+
+ if (now - stamp_time >= utime_t(1, 0)) {
+ double rate = (double)cur_completed_rate() / (1024 * 1024);
+ ++total_sec;
+ std::streamsize original_precision = cout.precision();
+ cout.precision(3);
+ cout << setw(5) << total_sec << ": throughput=" << rate << "MB/sec" << " pending data=" << sent - completed << std::endl;
+ cout.precision(original_precision);
+ stamp_time = now;
+ }
+
+ while (sent < expected &&
+ sent - completed < max_backlog &&
+ pending_ops.size() < max_ops) {
+ sent += gen_next_op();
+ }
+ }
+
+ // get a reference to all pending requests
+ vector<librados::AioCompletion *> completions;
+ lock.Lock();
+ going_down = true;
+ map<int, LoadGenOp *>::iterator iter;
+ for (iter = pending_ops.begin(); iter != pending_ops.end(); ++iter) {
+ LoadGenOp *op = iter->second;
+ completions.push_back(op->completion);
+ }
+ lock.Unlock();
+
+ cout << "waiting for all operations to complete" << std::endl;
+
+ // now wait on all the pending requests
+ for (vector<librados::AioCompletion *>::iterator citer = completions.begin(); citer != completions.end(); ++citer) {
+ librados::AioCompletion *c = *citer;
+ c->wait_for_complete();
+ c->release();
+ }
+
+ return 0;
+}
+
+void LoadGen::cleanup()
+{
+ cout << "cleaning up objects" << std::endl;
+ map<int, obj_info>::iterator iter;
+ for (iter = objs.begin(); iter != objs.end(); ++iter) {
+ obj_info& info = iter->second;
+ int ret = io_ctx.remove(info.name);
+ if (ret < 0)
+ cerr << "couldn't remove obj: " << info.name << " ret=" << ret << std::endl;
+ }
+}
+
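+// Bit flags selecting where 'rados bench' writes its payload; they may be
+// OR'd together to hit object data, omap entries and xattrs in a single op.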
+enum OpWriteDest {
+ OP_WRITE_DEST_OBJ = 2 << 0,
+ OP_WRITE_DEST_OMAP = 2 << 1,
+ OP_WRITE_DEST_XATTR = 2 << 2,
+};
+
+class RadosBencher : public ObjBencher {
+ librados::AioCompletion **completions;
+ librados::Rados& rados;
+ librados::IoCtx& io_ctx;
+ librados::NObjectIterator oi;
+ bool iterator_valid;
+ OpWriteDest write_destination;
+
+protected:
+ int completions_init(int concurrentios) override {
+ completions = new librados::AioCompletion *[concurrentios];
+ return 0;
+ }
+ void completions_done() override {
+ delete[] completions;
+ completions = NULL;
+ }
+ int create_completion(int slot, void (*cb)(void *, void*), void *arg) override {
+ completions[slot] = rados.aio_create_completion((void *) arg, 0, cb);
+
+ if (!completions[slot])
+ return -EINVAL;
+
+ return 0;
+ }
+ void release_completion(int slot) override {
+ completions[slot]->release();
+ completions[slot] = 0;
+ }
+
+ int aio_read(const std::string& oid, int slot, bufferlist *pbl, size_t len,
+ size_t offset) override {
+ return io_ctx.aio_read(oid, completions[slot], pbl, len, offset);
+ }
+
+ int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len,
+ size_t offset) override {
+ librados::ObjectWriteOperation op;
+
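+    // Build a compound write op: depending on write_destination the payload
+    // goes to the object itself (with allocation hints when enabled), to an
+    // omap key derived from the offset, and/or to an xattr.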
+ if (write_destination & OP_WRITE_DEST_OBJ) {
+ if (data.hints)
+ op.set_alloc_hint2(data.object_size, data.op_size,
+ ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
+ ALLOC_HINT_FLAG_SEQUENTIAL_READ |
+ ALLOC_HINT_FLAG_APPEND_ONLY |
+ ALLOC_HINT_FLAG_IMMUTABLE);
+ op.write(offset, bl);
+ }
+
+ if (write_destination & OP_WRITE_DEST_OMAP) {
+ std::map<std::string, librados::bufferlist> omap;
+ omap[string("bench-omap-key-") + stringify(offset)] = bl;
+ op.omap_set(omap);
+ }
+
+ if (write_destination & OP_WRITE_DEST_XATTR) {
+ char key[80];
+ snprintf(key, sizeof(key), "bench-xattr-key-%d", (int)offset);
+ op.setxattr(key, bl);
+ }
+
+ return io_ctx.aio_operate(oid, completions[slot], &op);
+ }
+
+ int aio_remove(const std::string& oid, int slot) override {
+ return io_ctx.aio_remove(oid, completions[slot]);
+ }
+
+ int sync_read(const std::string& oid, bufferlist& bl, size_t len) override {
+ return io_ctx.read(oid, bl, len, 0);
+ }
+ int sync_write(const std::string& oid, bufferlist& bl, size_t len) override {
+ return io_ctx.write_full(oid, bl);
+ }
+
+ int sync_remove(const std::string& oid) override {
+ return io_ctx.remove(oid);
+ }
+
+ bool completion_is_done(int slot) override {
+ return completions[slot]->is_safe();
+ }
+
+ int completion_wait(int slot) override {
+ return completions[slot]->wait_for_safe_and_cb();
+ }
+ int completion_ret(int slot) override {
+ return completions[slot]->get_return_value();
+ }
+
+ bool get_objects(std::list<Object>* objects, int num) override {
+ int count = 0;
+
+ if (!iterator_valid) {
+ oi = io_ctx.nobjects_begin();
+ iterator_valid = true;
+ }
+
+ librados::NObjectIterator ei = io_ctx.nobjects_end();
+
+ if (oi == ei) {
+ iterator_valid = false;
+ return false;
+ }
+
+ objects->clear();
+ for ( ; oi != ei && count < num; ++oi) {
+ Object obj(oi->get_oid(), oi->get_nspace());
+ objects->push_back(obj);
+ ++count;
+ }
+
+ return true;
+ }
+
+ void set_namespace( const std::string& ns) override {
+ io_ctx.set_namespace(ns);
+ }
+
+public:
+ RadosBencher(CephContext *cct_, librados::Rados& _r, librados::IoCtx& _i)
+ : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false), write_destination(OP_WRITE_DEST_OBJ) {}
+ ~RadosBencher() override { }
+
+ void set_write_destination(OpWriteDest dest) {
+ write_destination = dest;
+ }
+};
+
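+// Handle the "rados lock" subcommands (list, info, get, break) on a single
+// object via the cls_lock client helpers.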
+static int do_lock_cmd(std::vector<const char*> &nargs,
+ const std::map < std::string, std::string > &opts,
+ IoCtx *ioctx,
+ Formatter *formatter)
+{
+ if (nargs.size() < 3)
+ usage_exit();
+
+ string cmd(nargs[1]);
+ string oid(nargs[2]);
+
+ string lock_tag;
+ string lock_cookie;
+ string lock_description;
+ int lock_duration = 0;
+ ClsLockType lock_type = LOCK_EXCLUSIVE;
+
+ map<string, string>::const_iterator i;
+ i = opts.find("lock-tag");
+ if (i != opts.end()) {
+ lock_tag = i->second;
+ }
+ i = opts.find("lock-cookie");
+ if (i != opts.end()) {
+ lock_cookie = i->second;
+ }
+ i = opts.find("lock-description");
+ if (i != opts.end()) {
+ lock_description = i->second;
+ }
+ i = opts.find("lock-duration");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &lock_duration)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("lock-type");
+ if (i != opts.end()) {
+ const string& type_str = i->second;
+ if (type_str.compare("exclusive") == 0) {
+ lock_type = LOCK_EXCLUSIVE;
+ } else if (type_str.compare("shared") == 0) {
+ lock_type = LOCK_SHARED;
+ } else {
+ cerr << "unknown lock type was specified, aborting" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (cmd.compare("list") == 0) {
+ list<string> locks;
+ int ret = rados::cls::lock::list_locks(ioctx, oid, &locks);
+ if (ret < 0) {
+ cerr << "ERROR: rados_list_locks(): " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+
+ formatter->open_object_section("object");
+ formatter->dump_string("objname", oid);
+ formatter->open_array_section("locks");
+ list<string>::iterator iter;
+ for (iter = locks.begin(); iter != locks.end(); ++iter) {
+ formatter->open_object_section("lock");
+ formatter->dump_string("name", *iter);
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+ return 0;
+ }
+
+ if (nargs.size() < 4)
+ usage_exit();
+
+ string lock_name(nargs[3]);
+
+ if (cmd.compare("info") == 0) {
+ map<rados::cls::lock::locker_id_t, rados::cls::lock::locker_info_t> lockers;
+ ClsLockType type = LOCK_NONE;
+ string tag;
+ int ret = rados::cls::lock::get_lock_info(ioctx, oid, lock_name, &lockers, &type, &tag);
+ if (ret < 0) {
+ cerr << "ERROR: rados_lock_get_lock_info(): " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+
+ formatter->open_object_section("lock");
+ formatter->dump_string("name", lock_name);
+ formatter->dump_string("type", cls_lock_type_str(type));
+ formatter->dump_string("tag", tag);
+ formatter->open_array_section("lockers");
+ map<rados::cls::lock::locker_id_t, rados::cls::lock::locker_info_t>::iterator iter;
+ for (iter = lockers.begin(); iter != lockers.end(); ++iter) {
+ const rados::cls::lock::locker_id_t& id = iter->first;
+ const rados::cls::lock::locker_info_t& info = iter->second;
+ formatter->open_object_section("locker");
+ formatter->dump_stream("name") << id.locker;
+ formatter->dump_string("cookie", id.cookie);
+ formatter->dump_string("description", info.description);
+ formatter->dump_stream("expiration") << info.expiration;
+ formatter->dump_stream("addr") << info.addr.get_legacy_str();
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+
+ return ret;
+ } else if (cmd.compare("get") == 0) {
+ rados::cls::lock::Lock l(lock_name);
+ l.set_cookie(lock_cookie);
+ l.set_tag(lock_tag);
+ l.set_duration(utime_t(lock_duration, 0));
+ l.set_description(lock_description);
+ int ret;
+ switch (lock_type) {
+ case LOCK_SHARED:
+ ret = l.lock_shared(ioctx, oid);
+ break;
+ default:
+ ret = l.lock_exclusive(ioctx, oid);
+ }
+ if (ret < 0) {
+ cerr << "ERROR: failed locking: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+
+ return ret;
+ }
+
+ if (nargs.size() < 5)
+ usage_exit();
+
+ if (cmd.compare("break") == 0) {
+ string locker(nargs[4]);
+ rados::cls::lock::Lock l(lock_name);
+ l.set_cookie(lock_cookie);
+ l.set_tag(lock_tag);
+ entity_name_t name;
+ if (!name.parse(locker)) {
+ cerr << "ERROR: failed to parse locker name (" << locker << ")" << std::endl;
+ return -EINVAL;
+ }
+ int ret = l.break_lock(ioctx, oid, name);
+ if (ret < 0) {
+ cerr << "ERROR: failed breaking lock: " << cpp_strerror(ret) << std::endl;
+ return ret;
+ }
+ } else {
+ usage_exit();
+ }
+
+ return 0;
+}
+
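+// The do_cache_* helpers below issue cache-tier flush/evict operations
+// synchronously, bypassing the cache/overlay logic with the
+// OPERATION_IGNORE_CACHE and OPERATION_IGNORE_OVERLAY flags.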
+static int do_cache_flush(IoCtx& io_ctx, string oid)
+{
+ ObjectReadOperation op;
+ op.cache_flush();
+ librados::AioCompletion *completion =
+ librados::Rados::aio_create_completion();
+ io_ctx.aio_operate(oid.c_str(), completion, &op,
+ librados::OPERATION_IGNORE_CACHE |
+ librados::OPERATION_IGNORE_OVERLAY,
+ NULL);
+ completion->wait_for_safe();
+ int r = completion->get_return_value();
+ completion->release();
+ return r;
+}
+
+static int do_cache_try_flush(IoCtx& io_ctx, string oid)
+{
+ ObjectReadOperation op;
+ op.cache_try_flush();
+ librados::AioCompletion *completion =
+ librados::Rados::aio_create_completion();
+ io_ctx.aio_operate(oid.c_str(), completion, &op,
+ librados::OPERATION_IGNORE_CACHE |
+ librados::OPERATION_IGNORE_OVERLAY |
+ librados::OPERATION_SKIPRWLOCKS,
+ NULL);
+ completion->wait_for_safe();
+ int r = completion->get_return_value();
+ completion->release();
+ return r;
+}
+
+static int do_cache_evict(IoCtx& io_ctx, string oid)
+{
+ ObjectReadOperation op;
+ op.cache_evict();
+ librados::AioCompletion *completion =
+ librados::Rados::aio_create_completion();
+ io_ctx.aio_operate(oid.c_str(), completion, &op,
+ librados::OPERATION_IGNORE_CACHE |
+ librados::OPERATION_IGNORE_OVERLAY |
+ librados::OPERATION_SKIPRWLOCKS,
+ NULL);
+ completion->wait_for_safe();
+ int r = completion->get_return_value();
+ completion->release();
+ return r;
+}
+
+static int do_cache_flush_evict_all(IoCtx& io_ctx, bool blocking)
+{
+ int errors = 0;
+ io_ctx.set_namespace(all_nspaces);
+ try {
+ librados::NObjectIterator i = io_ctx.nobjects_begin();
+ librados::NObjectIterator i_end = io_ctx.nobjects_end();
+ for (; i != i_end; ++i) {
+ int r;
+ cout << i->get_nspace() << "\t" << i->get_oid() << "\t" << i->get_locator() << std::endl;
+ if (i->get_locator().size()) {
+ io_ctx.locator_set_key(i->get_locator());
+ } else {
+ io_ctx.locator_set_key(string());
+ }
+ io_ctx.set_namespace(i->get_nspace());
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ r = io_ctx.list_snaps(i->get_oid(), &ls);
+ if (r < 0) {
+ cerr << "error listing snap shots " << i->get_nspace() << "/" << i->get_oid() << ": "
+ << cpp_strerror(r) << std::endl;
+ ++errors;
+ continue;
+ }
+ std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ // no snapshots
+ if (ci == ls.clones.end()) {
+ io_ctx.snap_set_read(CEPH_NOSNAP);
+ if (blocking)
+ r = do_cache_flush(io_ctx, i->get_oid());
+ else
+ r = do_cache_try_flush(io_ctx, i->get_oid());
+ if (r < 0) {
+ cerr << "failed to flush " << i->get_nspace() << "/" << i->get_oid() << ": "
+ << cpp_strerror(r) << std::endl;
+ ++errors;
+ continue;
+ }
+ r = do_cache_evict(io_ctx, i->get_oid());
+ if (r < 0) {
+ cerr << "failed to evict " << i->get_nspace() << "/" << i->get_oid() << ": "
+ << cpp_strerror(r) << std::endl;
+ ++errors;
+ continue;
+ }
+ } else {
+ // has snapshots
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ io_ctx.snap_set_read(ci->cloneid);
+ if (blocking)
+ r = do_cache_flush(io_ctx, i->get_oid());
+ else
+ r = do_cache_try_flush(io_ctx, i->get_oid());
+ if (r < 0) {
+ cerr << "failed to flush " << i->get_nspace() << "/" << i->get_oid() << ": "
+ << cpp_strerror(r) << std::endl;
+ ++errors;
+ break;
+ }
+ r = do_cache_evict(io_ctx, i->get_oid());
+ if (r < 0) {
+ cerr << "failed to evict " << i->get_nspace() << "/" << i->get_oid() << ": "
+ << cpp_strerror(r) << std::endl;
+ ++errors;
+ break;
+ }
+ }
+ }
+ }
+ }
+ catch (const std::exception& e) {
+ cerr << e.what() << std::endl;
+ return -1;
+ }
+ return errors ? -1 : 0;
+}
+
+static int do_get_inconsistent_pg_cmd(const std::vector<const char*> &nargs,
+ Rados& rados,
+ Formatter& formatter)
+{
+ if (nargs.size() < 2) {
+ usage_exit();
+ }
+ int64_t pool_id = rados.pool_lookup(nargs[1]);
+ if (pool_id < 0) {
+ cerr << "pool \"" << nargs[1] << "\" not found" << std::endl;
+ return (int)pool_id;
+ }
+ std::vector<PlacementGroup> pgs;
+ int ret = rados.get_inconsistent_pgs(pool_id, &pgs);
+ if (ret) {
+ return ret;
+ }
+ formatter.open_array_section("pgs");
+ for (auto& pg : pgs) {
+ formatter.dump_stream("pg") << pg;
+ }
+ formatter.close_section();
+ formatter.flush(cout);
+ cout << std::endl;
+ return 0;
+}
+
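+// Emit every per-shard scrub error flag set on 'err' as a string entry in
+// a formatter array section called 'name'.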
+static void dump_errors(const err_t &err, Formatter &f, const char *name)
+{
+ f.open_array_section(name);
+ if (err.has_shard_missing())
+ f.dump_string("error", "missing");
+ if (err.has_stat_error())
+ f.dump_string("error", "stat_error");
+ if (err.has_read_error())
+ f.dump_string("error", "read_error");
+ if (err.has_data_digest_mismatch_info())
+ f.dump_string("error", "data_digest_mismatch_info");
+ if (err.has_omap_digest_mismatch_info())
+ f.dump_string("error", "omap_digest_mismatch_info");
+ if (err.has_size_mismatch_info())
+ f.dump_string("error", "size_mismatch_info");
+ if (err.has_ec_hash_error())
+ f.dump_string("error", "ec_hash_error");
+ if (err.has_ec_size_error())
+ f.dump_string("error", "ec_size_error");
+ if (err.has_info_missing())
+ f.dump_string("error", "info_missing");
+ if (err.has_info_corrupted())
+ f.dump_string("error", "info_corrupted");
+ if (err.has_obj_size_info_mismatch())
+ f.dump_string("error", "obj_size_info_mismatch");
+ if (err.has_snapset_missing())
+ f.dump_string("error", "snapset_missing");
+ if (err.has_snapset_corrupted())
+ f.dump_string("error", "snapset_corrupted");
+ if (err.has_hinfo_missing())
+ f.dump_string("error", "hinfo_missing");
+ if (err.has_hinfo_corrupted())
+ f.dump_string("error", "hinfo_corrupted");
+ f.close_section();
+}
+
+static void dump_shard(const shard_info_t& shard,
+ const inconsistent_obj_t& inc,
+ Formatter &f)
+{
+ dump_errors(shard, f, "errors");
+
+ if (shard.has_shard_missing())
+ return;
+
+ if (!shard.has_stat_error())
+ f.dump_unsigned("size", shard.size);
+ if (shard.omap_digest_present) {
+ f.dump_format("omap_digest", "0x%08x", shard.omap_digest);
+ }
+ if (shard.data_digest_present) {
+ f.dump_format("data_digest", "0x%08x", shard.data_digest);
+ }
+
+ if ((inc.union_shards.has_info_missing()
+ || inc.union_shards.has_info_corrupted()
+ || inc.has_object_info_inconsistency()
+ || shard.has_obj_size_info_mismatch()) &&
+ !shard.has_info_missing()) {
+ map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(OI_ATTR);
+ ceph_assert(k != shard.attrs.end()); // Can't be missing
+ if (!shard.has_info_corrupted()) {
+ object_info_t oi;
+ bufferlist bl;
+ auto bliter = k->second.cbegin();
+ decode(oi, bliter); // Can't be corrupted
+ f.open_object_section("object_info");
+ oi.dump(&f);
+ f.close_section();
+ } else {
+ bool b64;
+ f.dump_string("object_info", cleanbin(k->second, b64));
+ }
+ }
+ if ((inc.union_shards.has_snapset_missing()
+ || inc.union_shards.has_snapset_corrupted()
+ || inc.has_snapset_inconsistency()) &&
+ !shard.has_snapset_missing()) {
+ map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(SS_ATTR);
+ ceph_assert(k != shard.attrs.end()); // Can't be missing
+ if (!shard.has_snapset_corrupted()) {
+ SnapSet ss;
+ bufferlist bl;
+ auto bliter = k->second.cbegin();
+ decode(ss, bliter); // Can't be corrupted
+ f.open_object_section("snapset");
+ ss.dump(&f);
+ f.close_section();
+ } else {
+ bool b64;
+ f.dump_string("snapset", cleanbin(k->second, b64));
+ }
+ }
+ if ((inc.union_shards.has_hinfo_missing()
+ || inc.union_shards.has_hinfo_corrupted()
+ || inc.has_hinfo_inconsistency()) &&
+ !shard.has_hinfo_missing()) {
+ map<std::string, ceph::bufferlist>::iterator k = (const_cast<shard_info_t&>(shard)).attrs.find(ECUtil::get_hinfo_key());
+ ceph_assert(k != shard.attrs.end()); // Can't be missing
+ if (!shard.has_hinfo_corrupted()) {
+ ECUtil::HashInfo hi;
+ bufferlist bl;
+ auto bliter = k->second.cbegin();
+ decode(hi, bliter); // Can't be corrupted
+ f.open_object_section("hashinfo");
+ hi.dump(&f);
+ f.close_section();
+ } else {
+ bool b64;
+ f.dump_string("hashinfo", cleanbin(k->second, b64));
+ }
+ }
+ if (inc.has_attr_name_mismatch() || inc.has_attr_value_mismatch()) {
+ f.open_array_section("attrs");
+ for (auto kv : shard.attrs) {
+ // System attribute handled above
+ if (kv.first == OI_ATTR || kv.first[0] != '_')
+ continue;
+ f.open_object_section("attr");
+ // Skip leading underscore since only giving user attrs
+ f.dump_string("name", kv.first.substr(1));
+ bool b64;
+ f.dump_string("value", cleanbin(kv.second, b64));
+ f.dump_bool("Base64", b64);
+ f.close_section();
+ }
+ f.close_section();
+ }
+}
+
+static void dump_obj_errors(const obj_err_t &err, Formatter &f)
+{
+ f.open_array_section("errors");
+ if (err.has_object_info_inconsistency())
+ f.dump_string("error", "object_info_inconsistency");
+ if (err.has_data_digest_mismatch())
+ f.dump_string("error", "data_digest_mismatch");
+ if (err.has_omap_digest_mismatch())
+ f.dump_string("error", "omap_digest_mismatch");
+ if (err.has_size_mismatch())
+ f.dump_string("error", "size_mismatch");
+ if (err.has_attr_value_mismatch())
+ f.dump_string("error", "attr_value_mismatch");
+ if (err.has_attr_name_mismatch())
+ f.dump_string("error", "attr_name_mismatch");
+ if (err.has_snapset_inconsistency())
+ f.dump_string("error", "snapset_inconsistency");
+ if (err.has_hinfo_inconsistency())
+ f.dump_string("error", "hinfo_inconsistency");
+ if (err.has_size_too_large())
+ f.dump_string("error", "size_too_large");
+ f.close_section();
+}
+
+static void dump_object_id(const object_id_t& object,
+ Formatter &f)
+{
+ f.dump_string("name", object.name);
+ f.dump_string("nspace", object.nspace);
+ f.dump_string("locator", object.locator);
+ switch (object.snap) {
+ case CEPH_NOSNAP:
+ f.dump_string("snap", "head");
+ break;
+ case CEPH_SNAPDIR:
+ f.dump_string("snap", "snapdir");
+ break;
+ default:
+ f.dump_unsigned("snap", object.snap);
+ break;
+ }
+}
+
+static void dump_inconsistent(const inconsistent_obj_t& inc,
+ Formatter &f)
+{
+ f.open_object_section("object");
+ dump_object_id(inc.object, f);
+ f.dump_unsigned("version", inc.version);
+ f.close_section();
+
+ dump_obj_errors(inc, f);
+ dump_errors(inc.union_shards, f, "union_shard_errors");
+ for (const auto& shard_info : inc.shards) {
+ shard_info_t shard = const_cast<shard_info_t&>(shard_info.second);
+ if (shard.selected_oi) {
+ object_info_t oi;
+ bufferlist bl;
+ auto k = shard.attrs.find(OI_ATTR);
+ ceph_assert(k != shard.attrs.end()); // Can't be missing
+ auto bliter = k->second.cbegin();
+ decode(oi, bliter); // Can't be corrupted
+ f.open_object_section("selected_object_info");
+ oi.dump(&f);
+ f.close_section();
+ break;
+ }
+ }
+ f.open_array_section("shards");
+ for (const auto& shard_info : inc.shards) {
+ f.open_object_section("shard");
+ auto& osd_shard = shard_info.first;
+ f.dump_int("osd", osd_shard.osd);
+ f.dump_bool("primary", shard_info.second.primary);
+ auto shard = osd_shard.shard;
+ if (shard != shard_id_t::NO_SHARD)
+ f.dump_unsigned("shard", shard);
+ dump_shard(shard_info.second, inc, f);
+ f.close_section();
+ }
+ f.close_section();
+}
+
+static void dump_inconsistent(const inconsistent_snapset_t& inc,
+ Formatter &f)
+{
+ dump_object_id(inc.object, f);
+
+ if (inc.ss_bl.length()) {
+ SnapSet ss;
+ bufferlist bl = inc.ss_bl;
+ auto bliter = bl.cbegin();
+ decode(ss, bliter); // Can't be corrupted
+ f.open_object_section("snapset");
+ ss.dump(&f);
+ f.close_section();
+ }
+ f.open_array_section("errors");
+ if (inc.snapset_missing())
+ f.dump_string("error", "snapset_missing");
+ if (inc.snapset_corrupted())
+ f.dump_string("error", "snapset_corrupted");
+ if (inc.info_missing())
+ f.dump_string("error", "info_missing");
+ if (inc.info_corrupted())
+ f.dump_string("error", "info_corrupted");
+ if (inc.snapset_error())
+ f.dump_string("error", "snapset_error");
+ if (inc.headless())
+ f.dump_string("error", "headless");
+ if (inc.size_mismatch())
+ f.dump_string("error", "size_mismatch");
+ if (inc.extra_clones())
+ f.dump_string("error", "extra_clones");
+ if (inc.clone_missing())
+ f.dump_string("error", "clone_missing");
+ f.close_section();
+
+ if (inc.extra_clones()) {
+ f.open_array_section("extra clones");
+ for (auto snap : inc.clones) {
+ f.dump_unsigned("snap", snap);
+ }
+ f.close_section();
+ }
+
+ if (inc.clone_missing()) {
+ f.open_array_section("missing");
+ for (auto snap : inc.missing) {
+ f.dump_unsigned("snap", snap);
+ }
+ f.close_section();
+ }
+}
+
+// dispatch the call by type
+static int do_get_inconsistent(Rados& rados,
+ const PlacementGroup& pg,
+ const librados::object_id_t &start,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_obj_t>* objs,
+ uint32_t* interval)
+{
+ return rados.get_inconsistent_objects(pg, start, max_return, c,
+ objs, interval);
+}
+
+static int do_get_inconsistent(Rados& rados,
+ const PlacementGroup& pg,
+ const librados::object_id_t &start,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_snapset_t>* snapsets,
+ uint32_t* interval)
+{
+ return rados.get_inconsistent_snapsets(pg, start, max_return, c,
+ snapsets, interval);
+}
+
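+// Page through the inconsistent objects/snapsets of a PG in batches of
+// max_item_num, asserting that the scrub interval stays the same across
+// batches (the OSD returns -EAGAIN if it changes).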
+template <typename T>
+static int do_get_inconsistent_cmd(const std::vector<const char*> &nargs,
+ Rados& rados,
+ Formatter& formatter)
+{
+ if (nargs.size() < 2) {
+ usage_exit();
+ }
+ PlacementGroup pg;
+ int ret = 0;
+ ret = pg.parse(nargs[1]);
+ if (!ret) {
+ cerr << "bad pg: " << nargs[1] << std::endl;
+ return -EINVAL;
+ }
+ uint32_t interval = 0, first_interval = 0;
+ const unsigned max_item_num = 32;
+ bool opened = false;
+ for (librados::object_id_t start;;) {
+ std::vector<T> items;
+ auto completion = librados::Rados::aio_create_completion();
+ ret = do_get_inconsistent(rados, pg, start, max_item_num, completion,
+ &items, &interval);
+ completion->wait_for_safe();
+ ret = completion->get_return_value();
+ completion->release();
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ cerr << "interval#" << interval << " expired." << std::endl;
+ else if (ret == -ENOENT)
+ cerr << "No scrub information available for pg " << pg << std::endl;
+ break;
+ }
+ // It must be the same interval every time. EAGAIN would
+ // occur if interval changes.
+ ceph_assert(start.name.empty() || first_interval == interval);
+ if (start.name.empty()) {
+ first_interval = interval;
+ formatter.open_object_section("info");
+ formatter.dump_int("epoch", interval);
+ formatter.open_array_section("inconsistents");
+ opened = true;
+ }
+ for (auto& inc : items) {
+ formatter.open_object_section("inconsistent");
+ dump_inconsistent(inc, formatter);
+ formatter.close_section();
+ }
+ if (items.size() < max_item_num) {
+ formatter.close_section();
+ break;
+ }
+ if (!items.empty()) {
+ start = items.back().object;
+ }
+ items.clear();
+ }
+ if (opened) {
+ formatter.close_section();
+ formatter.flush(cout);
+ }
+ return ret;
+}
+
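+// For console output: replace keys/names containing non-printable bytes
+// with a "(binary key)" placeholder.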
+static std::string prettify(const std::string& s)
+{
+ if (std::find_if_not(s.begin(), s.end(),
+ (int (*)(int))isprint) != s.end()) {
+ return "(binary key)";
+ } else {
+ return s;
+ }
+}
+
+/**********************************************
+
+**********************************************/
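+// rados_tool_common: parse the common options, connect to the cluster,
+// open the target pool/pg io context, and dispatch the requested
+// subcommand.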
+static int rados_tool_common(const std::map < std::string, std::string > &opts,
+ std::vector<const char*> &nargs)
+{
+ int ret;
+ bool create_pool = false;
+ const char *pool_name = NULL;
+ const char *target_pool_name = NULL;
+ string oloc, target_oloc, nspace, target_nspace;
+ int concurrent_ios = 16;
+ unsigned op_size = default_op_size;
+ unsigned object_size = 0;
+ unsigned max_objects = 0;
+ uint64_t obj_offset = 0;
+ bool block_size_specified = false;
+ int bench_write_dest = 0;
+ bool cleanup = true;
+ bool hints = true; // for rados bench
+ bool reuse_bench = false;
+ bool no_verify = false;
+ bool use_striper = false;
+ bool with_clones = false;
+ const char *snapname = NULL;
+ snap_t snapid = CEPH_NOSNAP;
+ std::map<std::string, std::string>::const_iterator i;
+
+ uint64_t offset_align = 0;
+ uint64_t min_obj_len = 0;
+ uint64_t max_obj_len = 0;
+ uint64_t min_op_len = 0;
+ uint64_t max_op_len = 0;
+ uint64_t max_ops = 0;
+ uint64_t max_backlog = 0;
+ uint64_t target_throughput = 0;
+ int64_t read_percent = -1;
+ uint64_t num_objs = 0;
+ int run_length = 0;
+
+ bool show_time = false;
+ bool wildcard = false;
+
+ std::string run_name;
+ std::string prefix;
+ bool forcefull = false;
+ unique_ptr<Formatter> formatter = nullptr;
+ bool pretty_format = false;
+ const char *output = NULL;
+ std::optional<std::string> omap_key;
+ std::optional<std::string> obj_name;
+ bool with_reference = false;
+
+ Rados rados;
+ IoCtx io_ctx;
+
+ i = opts.find("create");
+ if (i != opts.end()) {
+ create_pool = true;
+ }
+ i = opts.find("pool");
+ if (i != opts.end()) {
+ pool_name = i->second.c_str();
+ }
+ i = opts.find("target_pool");
+ if (i != opts.end()) {
+ target_pool_name = i->second.c_str();
+ }
+ i = opts.find("object_locator");
+ if (i != opts.end()) {
+ oloc = i->second;
+ }
+ i = opts.find("target_locator");
+ if (i != opts.end()) {
+ target_oloc = i->second;
+ }
+ i = opts.find("target_nspace");
+ if (i != opts.end()) {
+ target_nspace = i->second;
+ }
+ i = opts.find("concurrent-ios");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &concurrent_ios)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("run-name");
+ if (i != opts.end()) {
+ run_name = i->second;
+ }
+
+ i = opts.find("force-full");
+ if (i != opts.end()) {
+ forcefull = true;
+ }
+ i = opts.find("prefix");
+ if (i != opts.end()) {
+ prefix = i->second;
+ }
+ i = opts.find("block-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &op_size)) {
+ return -EINVAL;
+ }
+ block_size_specified = true;
+ }
+ i = opts.find("object-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &object_size)) {
+ return -EINVAL;
+ }
+ block_size_specified = true;
+ }
+ i = opts.find("max-objects");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_objects)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("offset");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &obj_offset)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("snap");
+ if (i != opts.end()) {
+ snapname = i->second.c_str();
+ }
+ i = opts.find("snapid");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &snapid)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("min-object-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &min_obj_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-object-size");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_obj_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("min-op-len");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &min_op_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-op-len");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_op_len)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-ops");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_ops)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("max-backlog");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &max_backlog)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("target-throughput");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &target_throughput)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("read-percent");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &read_percent)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("num-objects");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &num_objs)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("run-length");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &run_length)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("show-time");
+ if (i != opts.end()) {
+ show_time = true;
+ }
+ i = opts.find("no-cleanup");
+ if (i != opts.end()) {
+ cleanup = false;
+ }
+ i = opts.find("no-hints");
+ if (i != opts.end()) {
+ hints = false;
+ }
+ i = opts.find("reuse-bench");
+ if (i != opts.end()) {
+ reuse_bench = true;
+ }
+ i = opts.find("pretty-format");
+ if (i != opts.end()) {
+ pretty_format = true;
+ }
+ i = opts.find("format");
+ if (i != opts.end()) {
+ const char *format = i->second.c_str();
+ formatter.reset(Formatter::create(format));
+ if (!formatter) {
+ cerr << "unrecognized format: " << format << std::endl;
+ return -EINVAL;
+ }
+ }
+ i = opts.find("namespace");
+ if (i != opts.end()) {
+ nspace = i->second;
+ }
+ i = opts.find("no-verify");
+ if (i != opts.end()) {
+ no_verify = true;
+ }
+ i = opts.find("output");
+ if (i != opts.end()) {
+ output = i->second.c_str();
+ }
+ i = opts.find("write-dest-obj");
+ if (i != opts.end()) {
+ bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OBJ);
+ }
+ i = opts.find("write-dest-omap");
+ if (i != opts.end()) {
+ bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OMAP);
+ }
+ i = opts.find("write-dest-xattr");
+ if (i != opts.end()) {
+ bench_write_dest |= static_cast<int>(OP_WRITE_DEST_XATTR);
+ }
+ i = opts.find("with-clones");
+ if (i != opts.end()) {
+ with_clones = true;
+ }
+ i = opts.find("omap-key-file");
+ if (i != opts.end()) {
+ string err;
+ bufferlist indata;
+ ret = indata.read_file(i->second.c_str(), &err);
+ if (ret < 0) {
+ cerr << err << std::endl;
+ return 1;
+ }
+ omap_key = std::string(indata.c_str(), indata.length());
+ }
+ i = opts.find("obj-name-file");
+ if (i != opts.end()) {
+ string err;
+ bufferlist indata;
+ ret = indata.read_file(i->second.c_str(), &err);
+ if (ret < 0) {
+ cerr << err << std::endl;
+ return 1;
+ }
+ obj_name = std::string(indata.c_str(), indata.length());
+ }
+ i = opts.find("offset_align");
+ if (i != opts.end()) {
+ if (rados_sistrtoll(i, &offset_align)) {
+ return -EINVAL;
+ }
+ }
+ i = opts.find("with-reference");
+ if (i != opts.end()) {
+ with_reference = true;
+ }
+
+ // open rados
+ ret = rados.init_with_context(g_ceph_context);
+ if (ret < 0) {
+ cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ ret = rados.connect();
+ if (ret) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ if (create_pool && !pool_name) {
+ cerr << "--create-pool requested but pool_name was not specified!" << std::endl;
+ usage(cerr);
+ return 1;
+ }
+
+ if (create_pool) {
+ ret = rados.pool_create(pool_name);
+ if (ret < 0) {
+ cerr << "error creating pool " << pool_name << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+
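+ // If --pgid was given, parse it and verify that it refers to the same
+ // pool as --pool (when both are specified).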
+ i = opts.find("pgid");
+ boost::optional<pg_t> pgid(i != opts.end(), pg_t());
+ if (pgid && (!pgid->parse(i->second.c_str()) || (pool_name && rados.pool_lookup(pool_name) != pgid->pool()))) {
+ cerr << "invalid pgid" << std::endl;
+ return 1;
+ }
+
+ // open io context.
+ if (pool_name || pgid) {
+ ret = pool_name ? rados.ioctx_create(pool_name, io_ctx) : rados.ioctx_create2(pgid->pool(), io_ctx);
+ if (ret < 0) {
+ cerr << "error opening pool "
+ << (pool_name ? pool_name : std::string("with id ") + std::to_string(pgid->pool())) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ // align op_size
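+ // Pools that require alignment (typically erasure-coded pools) only
+ // accept writes aligned to their stripe width, so round op_size up to
+ // the next multiple of the required alignment.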
+ {
+ bool requires;
+ ret = io_ctx.pool_requires_alignment2(&requires);
+ if (ret < 0) {
+ cerr << "error checking pool alignment requirement"
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ if (requires) {
+ uint64_t align = 0;
+ ret = io_ctx.pool_required_alignment2(&align);
+ if (ret < 0) {
+ cerr << "error getting pool alignment"
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ const uint64_t prev_op_size = op_size;
+ op_size = uint64_t((op_size + align - 1) / align) * align;
+ // Warn: if user specified and it was rounded
+ if (prev_op_size != default_op_size && prev_op_size != op_size)
+ cerr << "INFO: op_size has been rounded to " << op_size << std::endl;
+ }
+ }
+
+#ifdef WITH_LIBRADOSSTRIPER
+ // create striper interface
+ if (opts.find("striper") != opts.end()) {
+ // Note that this call does a tricky thing by reaching into a "singleton". We count
+ // on this happening only once:
+ ret = RadosStriper::striper_create(io_ctx, &detail::striper());
+ if (0 != ret) {
+ cerr << "error opening pool " << pool_name << " with striper interface: "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ use_striper = true;
+ }
+#endif // WITH_LIBRADOSSTRIPER
+ }
+
+ // snapname?
+ if (snapname) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --snap" << std::endl;
+ return 1;
+ }
+ ret = io_ctx.snap_lookup(snapname, &snapid);
+ if (ret < 0) {
+ cerr << "error looking up snap '" << snapname << "': " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ if (oloc.size()) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --object_locator" << std::endl;
+ return 1;
+ }
+ io_ctx.locator_set_key(oloc);
+ }
+ // Use namespace from command line if specified
+ if (opts.find("namespace") != opts.end()) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --namespace" << std::endl;
+ return 1;
+ }
+ io_ctx.set_namespace(nspace);
+ // Use wildcard if --all specified and --default NOT specified
+ } else if (opts.find("all") != opts.end() && opts.find("default") == opts.end()) {
+ // Only the ls should ever set namespace to special value
+ wildcard = true;
+ }
+ if (snapid != CEPH_NOSNAP) {
+ if (!pool_name) {
+ cerr << "pool name must be specified with --snapid" << std::endl;
+ return 1;
+ }
+ string name;
+ ret = io_ctx.snap_get_name(snapid, &name);
+ if (ret < 0) {
+ cerr << "snapid " << snapid << " doesn't exist in pool "
+ << io_ctx.get_pool_name() << std::endl;
+ return 1;
+ }
+ io_ctx.snap_set_read(snapid);
+ cout << "selected snap " << snapid << " '" << name << "'" << std::endl;
+ }
+
+ ceph_assert(!nargs.empty());
+
+ // list pools?
+ if (strcmp(nargs[0], "lspools") == 0) {
+ list<string> vec;
+ ret = rados.pool_list(vec);
+ if (ret < 0) {
+ cerr << "error listing pools: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (list<string>::iterator i = vec.begin(); i != vec.end(); ++i)
+ cout << *i << std::endl;
+ }
+ else if (strcmp(nargs[0], "df") == 0) {
+ // pools
+ list<string> vec;
+
+ if (!pool_name) {
+ ret = rados.pool_list(vec);
+ if (ret < 0) {
+ cerr << "error listing pools: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else {
+ vec.push_back(pool_name);
+ }
+
+ map<string,librados::pool_stat_t> stats;
+ ret = rados.get_pool_stats(vec, stats);
+ if (ret < 0) {
+ cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ TextTable tab;
+
+ if (!formatter) {
+ tab.define_column("POOL_NAME", TextTable::LEFT, TextTable::LEFT);
+ tab.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("OBJECTS", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("CLONES", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("COPIES", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("MISSING_ON_PRIMARY", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("UNFOUND", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("DEGRADED", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("RD_OPS", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("RD", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("WR_OPS", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("WR", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("USED COMPR", TextTable::RIGHT, TextTable::RIGHT);
+ tab.define_column("UNDER COMPR", TextTable::RIGHT, TextTable::RIGHT);
+ } else {
+ formatter->open_object_section("stats");
+ formatter->open_array_section("pools");
+ }
+ for (map<string,librados::pool_stat_t>::iterator i = stats.begin();
+ i != stats.end();
+ ++i) {
+ const char *pool_name = i->first.c_str();
+ librados::pool_stat_t& s = i->second;
+ if (!formatter) {
+ tab << pool_name
+ << byte_u_t(s.num_bytes)
+ << s.num_objects
+ << s.num_object_clones
+ << s.num_object_copies
+ << s.num_objects_missing_on_primary
+ << s.num_objects_unfound
+ << s.num_objects_degraded
+ << s.num_rd
+ << byte_u_t(s.num_rd_kb << 10)
+ << s.num_wr
+ << byte_u_t(s.num_wr_kb << 10)
+ << byte_u_t(s.compressed_bytes_alloc)
+ << byte_u_t(s.compressed_bytes_orig)
+ << TextTable::endrow;
+ } else {
+ formatter->open_object_section("pool");
+ int64_t pool_id = rados.pool_lookup(pool_name);
+ formatter->dump_string("name", pool_name);
+ if (pool_id >= 0)
+ formatter->dump_int("id", pool_id);
+ else
+ cerr << "ERROR: lookup_pg_pool_name for name=" << pool_name
+ << " returned " << pool_id << std::endl;
+ formatter->dump_int("size_bytes",s.num_bytes);
+ formatter->dump_int("size_kb", s.num_kb);
+ formatter->dump_int("num_objects", s.num_objects);
+ formatter->dump_int("num_object_clones", s.num_object_clones);
+ formatter->dump_int("num_object_copies", s.num_object_copies);
+ formatter->dump_int("num_objects_missing_on_primary", s.num_objects_missing_on_primary);
+ formatter->dump_int("num_objects_unfound", s.num_objects_unfound);
+ formatter->dump_int("num_objects_degraded", s.num_objects_degraded);
+ formatter->dump_int("read_ops", s.num_rd);
+ formatter->dump_int("read_bytes", s.num_rd_kb * 1024ull);
+ formatter->dump_int("write_ops", s.num_wr);
+ formatter->dump_int("write_bytes", s.num_wr_kb * 1024ull);
+ formatter->dump_int("compress_bytes_used", s.compressed_bytes_alloc);
+ formatter->dump_int("compress_under_bytes", s.compressed_bytes_orig);
+ formatter->close_section();
+ }
+ }
+
+ if (!formatter) {
+ cout << tab;
+ }
+
+ // total
+ cluster_stat_t tstats;
+ ret = rados.cluster_stat(tstats);
+ if (ret < 0) {
+ cerr << "error getting total cluster usage: " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ if (!formatter) {
+ cout << std::endl;
+ cout << "total_objects " << tstats.num_objects
+ << std::endl;
+ cout << "total_used " << byte_u_t(tstats.kb_used << 10)
+ << std::endl;
+ cout << "total_avail " << byte_u_t(tstats.kb_avail << 10)
+ << std::endl;
+ cout << "total_space " << byte_u_t(tstats.kb << 10)
+ << std::endl;
+ } else {
+ formatter->close_section();
+ formatter->dump_int("total_objects", tstats.num_objects);
+ formatter->dump_int("total_used", tstats.kb_used);
+ formatter->dump_int("total_avail", tstats.kb_avail);
+ formatter->dump_int("total_space", tstats.kb);
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+ }
+
+ else if (strcmp(nargs[0], "ls") == 0) {
+ if (!pool_name && !pgid) {
+ cerr << "either pool name or pg id needs to be specified" << std::endl;
+ return 1;
+ }
+
+ if (wildcard) {
+ io_ctx.set_namespace(all_nspaces);
+ }
+ bool use_stdout = (!output && (nargs.size() < 2 || (strcmp(nargs[1], "-") == 0)));
+ if (!use_stdout && !output) {
+ cerr << "Please use --output to specify the output file name" << std::endl;
+ return 1;
+ }
+
+ ostream *outstream;
+ if (use_stdout) {
+ outstream = &cout;
+ } else {
+ outstream = new ofstream(output);
+ }
+
+ {
+ if (formatter) {
+ formatter->open_array_section("objects");
+ }
+ try {
+ librados::NObjectIterator i = pgid ? io_ctx.nobjects_begin(pgid->ps()) : io_ctx.nobjects_begin();
+ const librados::NObjectIterator i_end = io_ctx.nobjects_end();
+ for (; i != i_end; ++i) {
+#ifdef WITH_LIBRADOSSTRIPER
+ if (use_striper) {
+ // in case of --striper option, we only list striped
+ // objects, so we only display the first object of
+ // each, without its suffix '.000...000'
+ size_t l = i->get_oid().length();
+ if (l <= 17 ||
+ (0 != i->get_oid().compare(l-17, 17,".0000000000000000"))) {
+ continue;
+ }
+ }
+#endif // WITH_LIBRADOSSTRIPER
+ if (pgid) {
+ uint32_t ps;
+ if (io_ctx.get_object_pg_hash_position2(i->get_oid(), &ps) || pgid->ps() != ps) {
+ break;
+ }
+ }
+ if (!formatter) {
+ // Only include namespace in output when wildcard specified
+ if (wildcard) {
+ *outstream << i->get_nspace() << "\t";
+ }
+ *outstream << detail::get_oid(i, use_striper);
+ if (i->get_locator().size()) {
+ *outstream << "\t" << i->get_locator();
+ }
+ *outstream << std::endl;
+ } else {
+ formatter->open_object_section("object");
+ formatter->dump_string("namespace", i->get_nspace());
+
+ detail::dump_name(formatter.get(), i, use_striper);
+
+ if (i->get_locator().size()) {
+ formatter->dump_string("locator", i->get_locator());
+ }
+ formatter->close_section(); //object
+
+ constexpr int TARGET_BYTES_PER_FLUSH = 4096;
+ if (formatter->get_len() >= TARGET_BYTES_PER_FLUSH) {
+ formatter->flush(*outstream);
+ }
+ }
+ }
+ }
+ catch (const std::exception& e) {
+ cerr << e.what() << std::endl;
+ return 1;
+ }
+ }
+ if (formatter) {
+ formatter->close_section(); //objects
+ formatter->flush(*outstream);
+ if (pretty_format) {
+ *outstream << std::endl;
+ }
+ formatter->flush(*outstream);
+ }
+ if (!use_stdout) {
+ delete outstream;
+ }
+ }
+ else if (strcmp(nargs[0], "mapext") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ std::map<uint64_t,uint64_t> m;
+ ret = io_ctx.mapext(*obj_name, 0, -1, m);
+ if (ret < 0) {
+ cerr << "mapext error on " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ std::map<uint64_t,uint64_t>::iterator iter;
+ for (iter = m.begin(); iter != m.end(); ++iter) {
+ cout << hex << iter->first << "\t" << iter->second << dec << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "stat") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ uint64_t size;
+ time_t mtime;
+
+ ret = detail::stat(io_ctx, *obj_name, size, mtime, use_striper);
+
+ if (ret < 0) {
+ cerr << " error stat-ing " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ utime_t t(mtime, 0);
+ cout << pool_name << "/" << prettify(*obj_name)
+ << " mtime " << t << ", size " << size << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "stat2") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ uint64_t size;
+ struct timespec mtime;
+
+ ret = detail::stat2(io_ctx, *obj_name, size, mtime, use_striper);
+
+ if (ret < 0) {
+ cerr << " error stat-ing " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ utime_t t(mtime);
+ cout << pool_name << "/" << prettify(*obj_name)
+ << " mtime " << t << ", size " << size << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "touch") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ time_t timestamp = time(NULL);
+ if (nargs.size() > (obj_name ? 1 : 2)) {
+ char* endptr = NULL;
+ timestamp = static_cast<time_t>(strtoll(nargs[obj_name ? 1 : 2], &endptr, 10));
+ if (*endptr) {
+ cerr << "Invalid value for timestamp: '" << nargs[obj_name ? 1 : 2] << "'" << std::endl;
+ ret = -EINVAL;
+ return 1;
+ }
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectWriteOperation op;
+ op.create(false);
+ op.mtime(&timestamp);
+ ret = io_ctx.operate(*obj_name, &op);
+ if (ret < 0) {
+ cerr << " error touch-ing " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "get") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ const char* out_filename;
+ if (obj_name) {
+ out_filename = nargs[1];
+ } else {
+ obj_name = nargs[1];
+ out_filename = nargs[2];
+ }
+ ret = do_get(io_ctx, *obj_name, out_filename, op_size, use_striper);
+ if (ret < 0) {
+ cerr << "error getting " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "put") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ const char* in_filename;
+ if (obj_name) {
+ in_filename = nargs[1];
+ } else {
+ obj_name = nargs[1];
+ in_filename = nargs[2];
+ }
+ ret = do_put(io_ctx, *obj_name, in_filename, op_size, obj_offset, use_striper);
+ if (ret < 0) {
+ cerr << "error putting " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "append") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ const char* in_filename;
+ if (obj_name) {
+ in_filename = nargs[1];
+ } else {
+ obj_name = nargs[1];
+ in_filename = nargs[2];
+ }
+ ret = do_append(io_ctx, *obj_name, in_filename, op_size, use_striper);
+ if (ret < 0) {
+ cerr << "error appending " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "truncate") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+
+ char* endptr = NULL;
+ long size;
+ const char *size_arg;
+ if (!obj_name) {
+ obj_name = nargs[1];
+ size_arg = nargs[2];
+ } else {
+ size_arg = nargs[1];
+ }
+ size = strtoll(size_arg, &endptr, 10);
+ if (*endptr) {
+ cerr << "Invalid value for size: '" << size_arg << "'" << std::endl;
+ ret = -EINVAL;
+ return 1;
+ }
+ if (size < 0) {
+ cerr << "error, cannot truncate to negative value" << std::endl;
+ usage(cerr);
+ return 1;
+ }
+
+ ret = detail::trunc(io_ctx, *obj_name, size, use_striper);
+
+ if (ret < 0) {
+ cerr << "error truncating oid "
+ << prettify(*obj_name) << " to " << size << ": "
+ << cpp_strerror(ret) << std::endl;
+ } else {
+ ret = 0;
+ }
+ }
+ else if (strcmp(nargs[0], "setxattr") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3) ||
+ nargs.size() > (obj_name ? 3 : 4)) {
+ usage(cerr);
+ return 1;
+ }
+ string attr_name(nargs[obj_name ? 1 : 2]);
+ bufferlist bl;
+ if (nargs.size() == (obj_name ? 3 : 4)) {
+ string attr_val(nargs[obj_name ? 2 : 3]);
+ bl.append(attr_val.c_str(), attr_val.length());
+ } else {
+ do {
+ ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin
+ if (ret < 0)
+ return 1;
+ } while (ret > 0);
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+
+ ret = detail::setxattr(io_ctx, *obj_name, attr_name, bl, use_striper);
+
+ if (ret < 0) {
+ cerr << "error setting xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+ }
+ else if (strcmp(nargs[0], "getxattr") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+ string attr_name(nargs[obj_name ? 1 : 2]);
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ bufferlist bl;
+ ret = detail::getxattr(io_ctx, *obj_name, attr_name, bl, use_striper);
+
+ if (ret < 0) {
+ cerr << "error getting xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+ string s(bl.c_str(), bl.length());
+ cout << s;
+ } else if (strcmp(nargs[0], "rmxattr") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+
+ string attr_name(nargs[obj_name ? 1 : 2]);
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ret = detail::rmxattr(io_ctx, *obj_name, attr_name, use_striper);
+
+ if (ret < 0) {
+ cerr << "error removing xattr " << pool_name << "/" << prettify(*obj_name) << "/" << attr_name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "listxattr") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ bufferlist bl;
+ map<std::string, bufferlist> attrset;
+
+ ret = detail::getxattrs(io_ctx, *obj_name, attrset, use_striper);
+
+ if (ret < 0) {
+ cerr << "error getting xattr set " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ for (map<std::string, bufferlist>::iterator iter = attrset.begin();
+ iter != attrset.end(); ++iter) {
+ cout << iter->first << std::endl;
+ }
+ } else if (strcmp(nargs[0], "getomapheader") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ string outfile;
+ if (nargs.size() >= (obj_name ? 2 : 3)) {
+ outfile = nargs[obj_name ? 1 : 2];
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ bufferlist header;
+ ret = io_ctx.omap_get_header(*obj_name, &header);
+ if (ret < 0) {
+ cerr << "error getting omap header " << pool_name << "/" << prettify(*obj_name)
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ if (!outfile.empty()) {
+ cerr << "Writing to " << outfile << std::endl;
+ dump_data(outfile, header);
+ } else {
+ cout << "header (" << header.length() << " bytes) :\n";
+ header.hexdump(cout);
+ cout << std::endl;
+ }
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "setomapheader") == 0) {
+ if (!pool_name || nargs.size() < (obj_name ? 2 : 3)) {
+ usage(cerr);
+ return 1;
+ }
+
+ bufferlist bl;
+ if (!obj_name) {
+ obj_name = nargs[1];
+ bl.append(nargs[2]); // val
+ } else {
+ bl.append(nargs[1]); // val
+ }
+ ret = io_ctx.omap_set_header(*obj_name, bl);
+ if (ret < 0) {
+ cerr << "error setting omap value " << pool_name << "/" << prettify(*obj_name)
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "setomapval") == 0) {
+ uint32_t min_args = (omap_key ? 2 : 3);
+ if (!pool_name || nargs.size() < min_args || nargs.size() > min_args + 1) {
+ usage(cerr);
+ return 1;
+ }
+
+ string oid(nargs[1]);
+ if (!omap_key) {
+ omap_key = nargs[2];
+ }
+
+ bufferlist bl;
+ if (nargs.size() > min_args) {
+ string val(nargs[min_args]);
+ bl.append(val);
+ } else {
+ do {
+ ret = bl.read_fd(STDIN_FILENO, 1024); // from stdin
+ if (ret < 0) {
+ return 1;
+ }
+ } while (ret > 0);
+ }
+
+ map<string, bufferlist> values;
+ values[*omap_key] = bl;
+
+ ret = io_ctx.omap_set(oid, values);
+ if (ret < 0) {
+ cerr << "error setting omap value " << pool_name << "/" << oid << "/"
+ << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "getomapval") == 0) {
+ uint32_t min_args = (omap_key ? (obj_name ? 1 : 2)
+ : (obj_name ? 2 : 3));
+ if (!pool_name || nargs.size() < min_args || nargs.size() > min_args + 1) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (!omap_key) {
+ omap_key = nargs[obj_name ? 1 : 2];
+ }
+
+ set<string> keys;
+ keys.insert(*omap_key);
+
+ std::string outfile;
+ if (nargs.size() > min_args) {
+ outfile = nargs[min_args];
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+
+ map<string, bufferlist> values;
+ ret = io_ctx.omap_get_vals_by_keys(*obj_name, keys, &values);
+ if (ret < 0) {
+ cerr << "error getting omap value " << pool_name << "/" << prettify(*obj_name) << "/"
+ << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+
+ if (values.size() && values.begin()->first == *omap_key) {
+ if (!outfile.empty()) {
+ cerr << "Writing to " << outfile << std::endl;
+ dump_data(outfile, values.begin()->second);
+ } else {
+ cout << "value (" << values.begin()->second.length() << " bytes) :\n";
+ values.begin()->second.hexdump(cout);
+ cout << std::endl;
+ }
+ ret = 0;
+ } else {
+ cout << "No such key: " << pool_name << "/" << prettify(*obj_name) << "/"
+ << prettify(*omap_key) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "rmomapkey") == 0) {
+ uint32_t num_args = (omap_key ? (obj_name ? 1 : 2)
+ : (obj_name ? 2 : 3));
+ if (!pool_name || nargs.size() != num_args) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (!omap_key) {
+ omap_key = nargs[obj_name ? 1 : 2];
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ set<string> keys;
+ keys.insert(*omap_key);
+
+ ret = io_ctx.omap_rm_keys(*obj_name, keys);
+ if (ret < 0) {
+ cerr << "error removing omap key " << pool_name << "/" << prettify(*obj_name) << "/"
+ << prettify(*omap_key) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ } else {
+ ret = 0;
+ }
+ } else if (strcmp(nargs[0], "clearomap") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ // strip nargs[0] which is "clearomap"
+ std::vector<std::string> oids(std::next(std::begin(nargs)),
+ std::end(nargs));
+ if (obj_name) {
+ oids.push_back(*obj_name);
+ }
+
+ for (const auto& oid : oids) {
+ ret = io_ctx.omap_clear(oid);
+ if (ret < 0) {
+ cerr << "error clearing omap keys " << pool_name << "/" << prettify(*obj_name) << "/"
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ ret = 0;
+ } else if (strcmp(nargs[0], "listomapvals") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ string last_read = "";
+ int MAX_READ = 512;
+ do {
+ map<string, bufferlist> values;
+ ret = io_ctx.omap_get_vals(*obj_name, last_read, MAX_READ, &values);
+ if (ret < 0) {
+ cerr << "error getting omap keys " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ ret = values.size();
+ for (map<string, bufferlist>::const_iterator it = values.begin();
+ it != values.end(); ++it) {
+ last_read = it->first;
+ // dump key in hex if it contains nonprintable characters
+ if (std::count_if(it->first.begin(), it->first.end(),
+ (int (*)(int))isprint) < (int)it->first.length()) {
+ cout << "key (" << it->first.length() << " bytes):\n";
+ bufferlist keybl;
+ keybl.append(it->first);
+ keybl.hexdump(cout);
+ } else {
+ cout << it->first;
+ }
+ cout << std::endl;
+ cout << "value (" << it->second.length() << " bytes) :\n";
+ it->second.hexdump(cout);
+ cout << std::endl;
+ }
+ } while (ret == MAX_READ);
+ ret = 0;
+ }
+ else if (strcmp(nargs[0], "cp") == 0) {
+ // XXX: binary names aren't supported for this operation
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (nargs.size() < 2 || nargs.size() > 3) {
+ usage(cerr);
+ return 1;
+ }
+
+ const char *target = target_pool_name;
+ if (!target)
+ target = pool_name;
+
+ const char *target_obj;
+ if (nargs.size() < 3) {
+ if (strcmp(target, pool_name) == 0) {
+ cerr << "cannot copy object into itself" << std::endl;
+ return 1;
+ }
+ target_obj = nargs[1];
+ } else {
+ target_obj = nargs[2];
+ }
+
+ // open io context.
+ IoCtx target_ctx;
+ ret = rados.ioctx_create(target, target_ctx);
+ if (ret < 0) {
+ cerr << "error opening target pool " << target << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ if (target_oloc.size()) {
+ target_ctx.locator_set_key(target_oloc);
+ }
+ if (target_nspace.size()) {
+ target_ctx.set_namespace(target_nspace);
+ }
+
+ ret = do_copy(io_ctx, nargs[1], target_ctx, target_obj);
+ if (ret < 0) {
+ cerr << "error copying " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "rm") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ // strip nargs[0] which is "rm"
+ std::vector<std::string> oids(std::next(std::begin(nargs)),
+ std::end(nargs));
+ if (obj_name) {
+ oids.push_back(*obj_name);
+ }
+ for (const auto& oid : oids) {
+ if (forcefull) {
+ ret = detail::remove(io_ctx, oid, (CEPH_OSD_FLAG_FULL_FORCE |
+ CEPH_OSD_FLAG_FULL_TRY), use_striper);
+ } else {
+ ret = detail::remove(io_ctx, oid, use_striper);
+ }
+
+ if (ret < 0) {
+ string name = (nspace.size() ? nspace + "/" : "" ) + prettify(oid);
+ cerr << "error removing " << pool_name << ">" << name << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ }
+ else if (strcmp(nargs[0], "create") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ret = io_ctx.create(*obj_name, true);
+ if (ret < 0) {
+ cerr << "error creating " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ else if (strcmp(nargs[0], "cppool") == 0) {
+ bool force = nargs.size() == 4 && !strcmp(nargs[3], "--yes-i-really-mean-it");
+ if (nargs.size() != 3 && !(nargs.size() == 4 && force)) {
+ usage(cerr);
+ return 1;
+ }
+ const char *src_pool = nargs[1];
+ const char *target_pool = nargs[2];
+
+ if (strcmp(src_pool, target_pool) == 0) {
+ cerr << "cannot copy pool into itself" << std::endl;
+ return 1;
+ }
+
+ cerr << "WARNING: pool copy does not preserve user_version, which some "
+ << " apps may rely on." << std::endl;
+
+ if (rados.get_pool_is_selfmanaged_snaps_mode(src_pool)) {
+ cerr << "WARNING: pool " << src_pool << " has selfmanaged snaps, which are not preserved\n"
+ << " by the cppool operation. This will break any snapshot user."
+ << std::endl;
+ if (!force) {
+ cerr << " If you insist on making a broken copy, you can pass\n"
+ << " --yes-i-really-mean-it to proceed anyway."
+ << std::endl;
+ exit(1);
+ }
+ }
+
+ ret = do_copy_pool(rados, src_pool, target_pool);
+ if (ret < 0) {
+ cerr << "error copying pool " << src_pool << " => " << target_pool << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "successfully copied pool " << nargs[1] << std::endl;
+ }
+ else if (strcmp(nargs[0], "purge") == 0) {
+ if (nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+ if (nargs.size() < 3 ||
+ strcmp(nargs[2], "--yes-i-really-really-mean-it") != 0) {
+ cerr << "WARNING:\n"
+ << " This will PERMANENTLY DESTROY all objects from a pool with no way back.\n"
+ << " To confirm, follow pool with --yes-i-really-really-mean-it" << std::endl;
+ return 1;
+ }
+ ret = rados.ioctx_create(nargs[1], io_ctx);
+ if (ret < 0) {
+ cerr << "error pool " << nargs[1] << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ io_ctx.set_namespace(all_nspaces);
+ io_ctx.set_osdmap_full_try();
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
+ ret = bencher.clean_up_slow("", concurrent_ios);
+ if (ret >= 0) {
+ cout << "successfully purged pool " << nargs[1] << std::endl;
+ } else { //error
+ cerr << "pool " << nargs[1] << " could not be purged" << std::endl;
+ cerr << "Check your monitor configuration - `mon allow pool delete` is set to false by default,"
+ << " change it to true to allow deletion of pools" << std::endl;
+ }
+ }
+ else if (strcmp(nargs[0], "lssnap") == 0) {
+ if (!pool_name || nargs.size() != 1) {
+ usage(cerr);
+ return 1;
+ }
+
+ vector<snap_t> snaps;
+ io_ctx.snap_list(&snaps);
+ for (vector<snap_t>::iterator i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ string s;
+ time_t t;
+ if (io_ctx.snap_get_name(*i, &s) < 0)
+ continue;
+ if (io_ctx.snap_get_stamp(*i, &t) < 0)
+ continue;
+ struct tm bdt;
+ localtime_r(&t, &bdt);
+ cout << *i << "\t" << s << "\t";
+
+ std::ios_base::fmtflags original_flags = cout.flags();
+ cout.setf(std::ios::right);
+ cout.fill('0');
+ cout << std::setw(4) << (bdt.tm_year+1900)
+ << '.' << std::setw(2) << (bdt.tm_mon+1)
+ << '.' << std::setw(2) << bdt.tm_mday
+ << ' '
+ << std::setw(2) << bdt.tm_hour
+ << ':' << std::setw(2) << bdt.tm_min
+ << ':' << std::setw(2) << bdt.tm_sec
+ << std::endl;
+ cout.flags(original_flags);
+ }
+ cout << snaps.size() << " snaps" << std::endl;
+ }
+
+ else if (strcmp(nargs[0], "mksnap") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (rados.get_pool_is_selfmanaged_snaps_mode(pool_name)) {
+ cerr << "can't create snapshot: pool " << pool_name
+ << " is in selfmanaged snaps mode" << std::endl;
+ return 1;
+ }
+
+ ret = io_ctx.snap_create(nargs[1]);
+ if (ret < 0) {
+ cerr << "error creating pool " << pool_name << " snapshot " << nargs[1]
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "created pool " << pool_name << " snap " << nargs[1] << std::endl;
+ }
+
+ else if (strcmp(nargs[0], "rmsnap") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ ret = io_ctx.snap_remove(nargs[1]);
+ if (ret < 0) {
+ cerr << "error removing pool " << pool_name << " snapshot " << nargs[1]
+ << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "removed pool " << pool_name << " snap " << nargs[1] << std::endl;
+ }
+
+ else if (strcmp(nargs[0], "rollback") == 0) {
+ if (!pool_name || nargs.size() < 3) {
+ usage(cerr);
+ return 1;
+ }
+
+ ret = io_ctx.snap_rollback(nargs[1], nargs[2]);
+ if (ret < 0) {
+      cerr << "error rolling back pool " << pool_name << " to snapshot " << nargs[2]
+           << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ cout << "rolled back pool " << pool_name
+ << " to snapshot " << nargs[2] << std::endl;
+ }
+ else if (strcmp(nargs[0], "bench") == 0) {
+ if (!pool_name || nargs.size() < 3) {
+ usage(cerr);
+ return 1;
+ }
+ char* endptr = NULL;
+ int seconds = strtol(nargs[1], &endptr, 10);
+ if (*endptr) {
+ cerr << "Invalid value for seconds: '" << nargs[1] << "'" << std::endl;
+ return 1;
+ }
+ int operation = 0;
+ if (strcmp(nargs[2], "write") == 0)
+ operation = OP_WRITE;
+ else if (strcmp(nargs[2], "seq") == 0)
+ operation = OP_SEQ_READ;
+ else if (strcmp(nargs[2], "rand") == 0)
+ operation = OP_RAND_READ;
+ else {
+ usage(cerr);
+ return 1;
+ }
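+    // the -b/--block-size and --write-object/--write-omap/--write-xattr flags only
+    // apply to the 'write' test; for writes without an explicit destination,
+    // default to writing object data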
+ if (operation != OP_WRITE) {
+ if (block_size_specified) {
+        cerr << "-b|--block-size option can be used only with 'write' bench test"
+ << std::endl;
+ return 1;
+ }
+ if (bench_write_dest != 0) {
+ cerr << "--write-object, --write-omap and --write-xattr options can "
+ "only be used with the 'write' bench test"
+ << std::endl;
+ return 1;
+ }
+ }
+ else if (bench_write_dest == 0) {
+ bench_write_dest = OP_WRITE_DEST_OBJ;
+ }
+
+ if (!formatter && output) {
+ cerr << "-o|--output option can only be used with '--format' option"
+ << std::endl;
+ return 1;
+ }
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
+ bencher.set_show_time(show_time);
+ bencher.set_write_destination(static_cast<OpWriteDest>(bench_write_dest));
+
+ ostream *outstream = NULL;
+ if (formatter) {
+ bencher.set_formatter(formatter.get());
+ if (output)
+ outstream = new ofstream(output);
+ else
+ outstream = &cout;
+ bencher.set_outstream(*outstream);
+ }
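+    // if no object size was given, use the op size; otherwise cap the op size
+    // at the object size so a single op never exceeds an object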
+ if (!object_size)
+ object_size = op_size;
+ else if (object_size < op_size)
+ op_size = object_size;
+ cout << "hints = " << (int)hints << std::endl;
+ ret = bencher.aio_bench(operation, seconds,
+ concurrent_ios, op_size, object_size,
+ max_objects, cleanup, hints, run_name, reuse_bench, no_verify);
+ if (ret != 0)
+ cerr << "error during benchmark: " << cpp_strerror(ret) << std::endl;
+ if (formatter && output)
+ delete outstream;
+ }
+ else if (strcmp(nargs[0], "cleanup") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+ if (wildcard)
+ io_ctx.set_namespace(all_nspaces);
+ RadosBencher bencher(g_ceph_context, rados, io_ctx);
+ ret = bencher.clean_up(prefix, concurrent_ios, run_name);
+ if (ret != 0)
+ cerr << "error during cleanup: " << cpp_strerror(ret) << std::endl;
+ }
+ else if (strcmp(nargs[0], "watch") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+ string oid(nargs[1]);
+ RadosWatchCtx ctx(io_ctx, oid.c_str());
+ uint64_t cookie;
+ ret = io_ctx.watch2(oid, &cookie, &ctx);
+ if (ret != 0)
+ cerr << "error calling watch: " << cpp_strerror(ret) << std::endl;
+ else {
+ cout << "press enter to exit..." << std::endl;
+ getchar();
+ io_ctx.unwatch2(cookie);
+ rados.watch_flush();
+ }
+ }
+ else if (strcmp(nargs[0], "notify") == 0) {
+ if (!pool_name || nargs.size() < 3) {
+ usage(cerr);
+ return 1;
+ }
+ string oid(nargs[1]);
+ string msg(nargs[2]);
+ bufferlist bl, replybl;
+ encode(msg, bl);
+ ret = io_ctx.notify2(oid, bl, 10000, &replybl);
+ if (ret != 0)
+ cerr << "error calling notify: " << cpp_strerror(ret) << std::endl;
+ if (replybl.length()) {
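+      // the notify reply payload is an encoded map of (client gid, cookie) -> ack
+      // payload, followed by the set of watchers that timed out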
+ map<pair<uint64_t,uint64_t>,bufferlist> rm;
+ set<pair<uint64_t,uint64_t> > missed;
+ auto p = replybl.cbegin();
+ decode(rm, p);
+ decode(missed, p);
+ for (map<pair<uint64_t,uint64_t>,bufferlist>::iterator p = rm.begin();
+ p != rm.end();
+ ++p) {
+ cout << "reply client." << p->first.first
+ << " cookie " << p->first.second
+ << " : " << p->second.length() << " bytes" << std::endl;
+ if (p->second.length())
+ p->second.hexdump(cout);
+ }
+      for (set<pair<uint64_t,uint64_t> >::iterator p = missed.begin();
+ p != missed.end(); ++p) {
+ cout << "timeout client." << p->first
+ << " cookie " << p->second << std::endl;
+ }
+ }
+ } else if (strcmp(nargs[0], "set-alloc-hint") == 0) {
+ // cmd, [oid, ] obj_size, write_size
+ if (!pool_name || nargs.size() < (obj_name ? 3 : 4)) {
+ usage(cerr);
+ return 1;
+ }
+ string err;
+ uint64_t expected_object_size = strict_strtoll(nargs[obj_name ? 1 : 2], 10, &err);
+ if (!err.empty()) {
+ cerr << "couldn't parse expected_object_size: " << err << std::endl;
+ usage(cerr);
+ return 1;
+ }
+ uint64_t expected_write_size = strict_strtoll(nargs[obj_name ? 2 : 3], 10, &err);
+ if (!err.empty()) {
+ cerr << "couldn't parse expected_write_size: " << err << std::endl;
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ret = io_ctx.set_alloc_hint(*obj_name, expected_object_size, expected_write_size);
+ if (ret < 0) {
+ cerr << "error setting alloc-hint " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "load-gen") == 0) {
+ if (!pool_name) {
+ cerr << "error: must specify pool" << std::endl;
+ usage(cerr);
+ return 1;
+ }
+ LoadGen lg(&rados);
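+    // only override LoadGen defaults for parameters given on the command line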
+ if (min_obj_len)
+ lg.min_obj_len = min_obj_len;
+ if (max_obj_len)
+ lg.max_obj_len = max_obj_len;
+ if (min_op_len)
+ lg.min_op_len = min_op_len;
+ if (max_op_len)
+ lg.max_op_len = max_op_len;
+ if (max_ops)
+ lg.max_ops = max_ops;
+ if (max_backlog)
+ lg.max_backlog = max_backlog;
+ if (target_throughput)
+ lg.target_throughput = target_throughput;
+ if (read_percent >= 0)
+ lg.read_percent = read_percent;
+ if (num_objs)
+ lg.num_objs = num_objs;
+ if (run_length)
+ lg.run_length = run_length;
+ if (offset_align)
+ lg.offset_align = offset_align;
+
+ cout << "run length " << run_length << " seconds" << std::endl;
+ cout << "preparing " << lg.num_objs << " objects" << std::endl;
+ ret = lg.bootstrap(pool_name);
+ if (ret < 0) {
+ cerr << "load-gen bootstrap failed" << std::endl;
+ return 1;
+ }
+ cout << "load-gen will run " << lg.run_length << " seconds" << std::endl;
+ lg.run();
+ lg.cleanup();
+ } else if (strcmp(nargs[0], "listomapkeys") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ set<string> out_keys;
+ ret = io_ctx.omap_get_keys(*obj_name, "", LONG_MAX, &out_keys);
+ if (ret < 0) {
+ cerr << "error getting omap key set " << pool_name << "/"
+ << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ for (set<string>::iterator iter = out_keys.begin();
+ iter != out_keys.end(); ++iter) {
+ cout << *iter << std::endl;
+ }
+ } else if (strcmp(nargs[0], "lock") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_lock_cmd(nargs, opts, &io_ctx, formatter.get());
+ } else if (strcmp(nargs[0], "listwatchers") == 0) {
+ if (!pool_name || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ string oid(nargs[1]);
+ std::list<obj_watch_t> lw;
+
+ ret = io_ctx.list_watchers(oid, &lw);
+ if (ret < 0) {
+ cerr << "error listing watchers " << pool_name << "/" << oid << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+
+ for (std::list<obj_watch_t>::iterator i = lw.begin(); i != lw.end(); ++i) {
+ cout << "watcher=" << i->addr << " client." << i->watcher_id << " cookie=" << i->cookie << std::endl;
+ }
+ } else if (strcmp(nargs[0], "listsnaps") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+      cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ else
+ ret = 0;
+
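+    // build a snap id -> snap name map so formatted/pretty output can label
+    // clones with snapshot names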
+ map<snap_t,string> snamemap;
+ if (formatter || pretty_format) {
+ vector<snap_t> snaps;
+ io_ctx.snap_list(&snaps);
+ for (vector<snap_t>::iterator i = snaps.begin();
+ i != snaps.end(); ++i) {
+ string s;
+ if (io_ctx.snap_get_name(*i, &s) < 0)
+ continue;
+ snamemap.insert(pair<snap_t,string>(*i, s));
+ }
+ }
+
+ if (formatter) {
+ formatter->open_object_section("object");
+ formatter->dump_string("name", *obj_name);
+ formatter->open_array_section("clones");
+ } else {
+ cout << prettify(*obj_name) << ":" << std::endl;
+ cout << "cloneid snaps size overlap" << std::endl;
+ }
+
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+
+ if (formatter) formatter->open_object_section("clone");
+
+ if (ci->cloneid == librados::SNAP_HEAD) {
+ if (formatter)
+ formatter->dump_string("id", "head");
+ else
+ cout << "head";
+ } else {
+ if (formatter)
+ formatter->dump_unsigned("id", ci->cloneid);
+ else
+ cout << ci->cloneid;
+ }
+
+ if (formatter)
+ formatter->open_array_section("snapshots");
+ else
+ cout << "\t";
+
+ if (!formatter && ci->snaps.empty()) {
+ cout << "-";
+ }
+ for (std::vector<snap_t>::const_iterator snapindex = ci->snaps.begin();
+ snapindex != ci->snaps.end(); ++snapindex) {
+
+ map<snap_t,string>::iterator si;
+
+ if (formatter || pretty_format) si = snamemap.find(*snapindex);
+
+ if (formatter) {
+ formatter->open_object_section("snapshot");
+ formatter->dump_unsigned("id", *snapindex);
+ if (si != snamemap.end())
+ formatter->dump_string("name", si->second);
+ formatter->close_section(); //snapshot
+ } else {
+ if (snapindex != ci->snaps.begin()) cout << ",";
+ if (!pretty_format || (si == snamemap.end()))
+ cout << *snapindex;
+ else
+ cout << si->second << "(" << *snapindex << ")";
+ }
+ }
+
+ if (formatter) {
+ formatter->close_section(); //Snapshots
+ formatter->dump_unsigned("size", ci->size);
+ } else {
+ cout << "\t" << ci->size;
+ }
+
+ if (ci->cloneid != librados::SNAP_HEAD) {
+ if (formatter)
+ formatter->open_array_section("overlaps");
+ else
+ cout << "\t[";
+
+ for (std::vector< std::pair<uint64_t,uint64_t> >::iterator ovi = ci->overlap.begin();
+ ovi != ci->overlap.end(); ++ovi) {
+ if (formatter) {
+ formatter->open_object_section("section");
+ formatter->dump_unsigned("start", ovi->first);
+ formatter->dump_unsigned("length", ovi->second);
+ formatter->close_section(); //section
+ } else {
+ if (ovi != ci->overlap.begin()) cout << ",";
+ cout << ovi->first << "~" << ovi->second;
+ }
+ }
+ if (formatter)
+ formatter->close_section(); //overlaps
+ else
+ cout << "]" << std::endl;
+ }
+ if (formatter) formatter->close_section(); //clone
+ }
+ if (formatter) {
+ formatter->close_section(); //clones
+ formatter->close_section(); //object
+ formatter->flush(cout);
+ } else {
+ cout << std::endl;
+ }
+ } else if (strcmp(nargs[0], "list-inconsistent-pg") == 0) {
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_get_inconsistent_pg_cmd(nargs, rados, *formatter);
+ } else if (strcmp(nargs[0], "list-inconsistent-obj") == 0) {
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_get_inconsistent_cmd<inconsistent_obj_t>(nargs, rados, *formatter);
+ } else if (strcmp(nargs[0], "list-inconsistent-snapset") == 0) {
+ if (!formatter) {
+ formatter = std::make_unique<JSONFormatter>(pretty_format);
+ }
+ ret = do_get_inconsistent_cmd<inconsistent_snapset_t>(nargs, rados, *formatter);
+ } else if (strcmp(nargs[0], "cache-flush") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ if (with_clones) {
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ if (snapid != CEPH_NOSNAP && ci->cloneid > snapid)
+ break;
+ io_ctx.snap_set_read(ci->cloneid);
+ ret = do_cache_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else {
+ ret = do_cache_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+ cerr << "error from cache-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else if (strcmp(nargs[0], "cache-try-flush") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ if (with_clones) {
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ if (snapid != CEPH_NOSNAP && ci->cloneid > snapid)
+ break;
+ io_ctx.snap_set_read(ci->cloneid);
+ ret = do_cache_try_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+          cerr << "error from cache-try-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else {
+ ret = do_cache_try_flush(io_ctx, *obj_name);
+ if (ret < 0) {
+        cerr << "error from cache-try-flush " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else if (strcmp(nargs[0], "cache-evict") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ if (with_clones) {
+ snap_set_t ls;
+ io_ctx.snap_set_read(LIBRADOS_SNAP_DIR);
+ ret = io_ctx.list_snaps(*obj_name, &ls);
+ if (ret < 0) {
+ cerr << "error listing snapshots " << pool_name << "/" << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ for (std::vector<clone_info_t>::iterator ci = ls.clones.begin();
+ ci != ls.clones.end(); ++ci) {
+ if (snapid != CEPH_NOSNAP && ci->cloneid > snapid)
+ break;
+ io_ctx.snap_set_read(ci->cloneid);
+ ret = do_cache_evict(io_ctx, *obj_name);
+ if (ret < 0) {
+          cerr << "error from cache-evict " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else {
+ ret = do_cache_evict(io_ctx, *obj_name);
+ if (ret < 0) {
+        cerr << "error from cache-evict " << prettify(*obj_name) << ": "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ }
+ } else if (strcmp(nargs[0], "cache-flush-evict-all") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+ ret = do_cache_flush_evict_all(io_ctx, true);
+ if (ret < 0) {
+ cerr << "cache-flush-evict-all finished with errors" << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "cache-try-flush-evict-all") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+ ret = do_cache_flush_evict_all(io_ctx, false);
+ if (ret < 0) {
+ cerr << "cache-try-flush-evict-all finished with errors" << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "set-redirect") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ const char *target = target_pool_name;
+ if (!target)
+ target = pool_name;
+
+ const char *target_obj;
+ if (nargs.size() < 3) {
+ if (strcmp(target, pool_name) == 0) {
+ cerr << "cannot copy object into itself" << std::endl;
+ return 1;
+ }
+ target_obj = nargs[1];
+ } else {
+ target_obj = nargs[2];
+ }
+
+ IoCtx target_ctx;
+ ret = rados.ioctx_create(target, target_ctx);
+ if (target_oloc.size()) {
+ target_ctx.locator_set_key(target_oloc);
+ }
+ if (target_nspace.size()) {
+ target_ctx.set_namespace(target_nspace);
+ }
+
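+    // set_redirect turns the source object into a manifest that redirects I/O to
+    // the target object; WITH_REFERENCE additionally keeps a reference on the target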
+ ObjectWriteOperation op;
+ if (with_reference) {
+ op.set_redirect(target_obj, target_ctx, 0, CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+ } else {
+ op.set_redirect(target_obj, target_ctx, 0);
+ }
+ ret = io_ctx.operate(nargs[1], &op);
+ if (ret < 0) {
+ cerr << "error set-redirect " << pool_name << "/" << nargs[1] << " => " << target << "/" << target_obj << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "set-chunk") == 0) {
+ if (!pool_name) {
+ usage(cerr);
+ return 1;
+ }
+
+ const char *target = target_pool_name;
+ if (!target)
+ target = pool_name;
+
+ uint64_t offset;
+ uint64_t length;
+ uint64_t tgt_offset;
+ string tgt_oid;
+ if (nargs.size() < 6) {
+ usage(cerr);
+ return 1;
+ } else {
+ char* endptr = NULL;
+ offset = strtoull(nargs[2], &endptr, 10);
+ if (*endptr) {
+        cerr << "Invalid value for offset: '" << nargs[2] << "'" << std::endl;
+ return 1;
+ }
+ length = strtoull(nargs[3], &endptr, 10);
+ if (*endptr) {
+        cerr << "Invalid value for length: '" << nargs[3] << "'" << std::endl;
+ return 1;
+ }
+ tgt_oid = string(nargs[4]);
+ tgt_offset = strtoull(nargs[5], &endptr, 10);
+ if (*endptr) {
+        cerr << "Invalid value for target offset: '" << nargs[5] << "'" << std::endl;
+ return 1;
+ }
+ }
+
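+    // set_chunk maps the byte range [offset, offset+length) of the source object
+    // onto tgt_oid at tgt_offset, making the source a chunked manifest object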
+ IoCtx target_ctx;
+ ret = rados.ioctx_create(target, target_ctx);
+ ObjectWriteOperation op;
+ if (with_reference) {
+ op.set_chunk(offset, length, target_ctx, tgt_oid, tgt_offset, CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+ } else {
+ op.set_chunk(offset, length, target_ctx, tgt_oid, tgt_offset);
+ }
+ ret = io_ctx.operate(nargs[1], &op);
+ if (ret < 0) {
+      cerr << "error set-chunk " << pool_name << "/" << nargs[1] << " offset " << offset
+           << " length " << length << " target_pool " << target
+           << " tgt_offset " << tgt_offset << ": " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "tier-promote") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectWriteOperation op;
+ op.tier_promote();
+ ret = io_ctx.operate(*obj_name, &op);
+ if (ret < 0) {
+ cerr << "error tier-promote " << pool_name << "/" << prettify(*obj_name) << " : "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "unset-manifest") == 0) {
+ if (!pool_name || (nargs.size() < 2 && !obj_name)) {
+ usage(cerr);
+ return 1;
+ }
+ if (!obj_name) {
+ obj_name = nargs[1];
+ }
+ ObjectWriteOperation op;
+ op.unset_manifest();
+ ret = io_ctx.operate(*obj_name, &op);
+ if (ret < 0) {
+ cerr << "error unset-manifest " << pool_name << "/" << prettify(*obj_name) << " : "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "export") == 0) {
+ // export [filename]
+ if (!pool_name || nargs.size() > 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ int file_fd;
+ if (nargs.size() < 2 || std::string(nargs[1]) == "-") {
+ file_fd = STDOUT_FILENO;
+ } else {
+ file_fd = open(nargs[1], O_WRONLY|O_CREAT|O_TRUNC, 0666);
+ if (file_fd < 0) {
+ cerr << "Error opening '" << nargs[1] << "': "
+ << cpp_strerror(file_fd) << std::endl;
+ return 1;
+ }
+ }
+
+ ret = PoolDump(file_fd).dump(&io_ctx);
+
+    if (file_fd != STDOUT_FILENO) {
+ VOID_TEMP_FAILURE_RETRY(::close(file_fd));
+ }
+
+ if (ret < 0) {
+ cerr << "error from export: "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else if (strcmp(nargs[0], "import") == 0) {
+ // import [--no-overwrite] [--dry-run] <filename | - >
+ if (!pool_name || nargs.size() > 4 || nargs.size() < 2) {
+ usage(cerr);
+ return 1;
+ }
+
+ // Last arg is the filename
+ std::string const filename = nargs[nargs.size() - 1];
+
+ // All other args may be flags
+ bool dry_run = false;
+ bool no_overwrite = false;
+ for (unsigned i = 1; i < nargs.size() - 1; ++i) {
+ std::string arg(nargs[i]);
+
+ if (arg == std::string("--no-overwrite")) {
+ no_overwrite = true;
+ } else if (arg == std::string("--dry-run")) {
+ dry_run = true;
+ } else {
+ std::cerr << "Invalid argument '" << arg << "'" << std::endl;
+ return 1;
+ }
+ }
+
+ int file_fd;
+ if (filename == "-") {
+ file_fd = STDIN_FILENO;
+ } else {
+ file_fd = open(filename.c_str(), O_RDONLY);
+ if (file_fd < 0) {
+ cerr << "Error opening '" << filename << "': "
+ << cpp_strerror(file_fd) << std::endl;
+ return 1;
+ }
+ }
+
+ ret = RadosImport(file_fd, 0, dry_run).import(io_ctx, no_overwrite);
+
+ if (file_fd != STDIN_FILENO) {
+ VOID_TEMP_FAILURE_RETRY(::close(file_fd));
+ }
+
+ if (ret < 0) {
+ cerr << "error from import: "
+ << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+ } else {
+ cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl;
+ ret = -EINVAL;
+ }
+
+ if (ret < 0)
+ cerr << "error " << (-ret) << ": " << cpp_strerror(ret) << std::endl;
+
+ return (ret < 0) ? 1 : 0;
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage(cout);
+ exit(0);
+ }
+
+ std::map < std::string, std::string > opts;
+ std::string val;
+
+ // Necessary to support usage of -f for formatting,
+ // since global_init will remove the -f using ceph
+ // argparse procedures.
+ for (auto j = args.begin(); j != args.end(); ++j) {
+ if (strcmp(*j, "--") == 0) {
+ break;
+ } else if ((j+1) == args.end()) {
+ // This can't be a formatting call (no format arg)
+ break;
+ } else if (strcmp(*j, "-f") == 0) {
+ val = *(j+1);
+ unique_ptr<Formatter> formatter(Formatter::create(val.c_str()));
+
+ if (formatter) {
+ j = args.erase(j);
+ opts["format"] = val;
+
+ j = args.erase(j);
+ break;
+ }
+ }
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ std::vector<const char*>::iterator i;
+ for (i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_flag(args, i, "--force-full", (char*)NULL)) {
+ opts["force-full"] = "true";
+ } else if (ceph_argparse_flag(args, i, "-d", "--delete-after", (char*)NULL)) {
+ opts["delete-after"] = "true";
+ } else if (ceph_argparse_flag(args, i, "-C", "--create", "--create-pool",
+ (char*)NULL)) {
+ opts["create"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--pretty-format", (char*)NULL)) {
+ opts["pretty-format"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--show-time", (char*)NULL)) {
+ opts["show-time"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--no-cleanup", (char*)NULL)) {
+ opts["no-cleanup"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--no-hints", (char*)NULL)) {
+ opts["no-hints"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--reuse-bench", (char*)NULL)) {
+ opts["reuse-bench"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--no-verify", (char*)NULL)) {
+ opts["no-verify"] = "true";
+ } else if (ceph_argparse_witharg(args, i, &val, "--run-name", (char*)NULL)) {
+ opts["run-name"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
+ opts["prefix"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
+ opts["pool"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-pool", (char*)NULL)) {
+ opts["target_pool"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--object-locator" , (char *)NULL)) {
+ opts["object_locator"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-locator" , (char *)NULL)) {
+ opts["target_locator"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-nspace" , (char *)NULL)) {
+ opts["target_nspace"] = val;
+#ifdef WITH_LIBRADOSSTRIPER
+ } else if (ceph_argparse_flag(args, i, "--striper" , (char *)NULL)) {
+ opts["striper"] = "true";
+#endif
+ } else if (ceph_argparse_witharg(args, i, &val, "-t", "--concurrent-ios", (char*)NULL)) {
+ opts["concurrent-ios"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--block-size", (char*)NULL)) {
+ opts["block-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-b", (char*)NULL)) {
+ opts["block-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--object-size", (char*)NULL)) {
+ opts["object-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) {
+ opts["max-objects"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--offset", (char*)NULL)) {
+ opts["offset"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-O", (char*)NULL)) {
+ opts["object-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-s", "--snap", (char*)NULL)) {
+ opts["snap"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-S", "--snapid", (char*)NULL)) {
+ opts["snapid"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--min-object-size", (char*)NULL)) {
+ opts["min-object-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-object-size", (char*)NULL)) {
+ opts["max-object-size"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--min-op-len", (char*)NULL)) {
+ opts["min-op-len"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-op-len", (char*)NULL)) {
+ opts["max-op-len"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-ops", (char*)NULL)) {
+ opts["max-ops"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-backlog", (char*)NULL)) {
+ opts["max-backlog"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--target-throughput", (char*)NULL)) {
+ opts["target-throughput"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--offset-align", (char*)NULL)) {
+ opts["offset_align"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--read-percent", (char*)NULL)) {
+ opts["read-percent"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--num-objects", (char*)NULL)) {
+ opts["num-objects"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--run-length", (char*)NULL)) {
+ opts["run-length"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--workers", (char*)NULL)) {
+ opts["workers"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) {
+ opts["format"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--lock-tag", (char*)NULL)) {
+ opts["lock-tag"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--lock-cookie", (char*)NULL)) {
+ opts["lock-cookie"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--lock-description", (char*)NULL)) {
+ opts["lock-description"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--lock-duration", (char*)NULL)) {
+ opts["lock-duration"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--lock-type", (char*)NULL)) {
+ opts["lock-type"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-N", "--namespace", (char*)NULL)) {
+ opts["namespace"] = val;
+ } else if (ceph_argparse_flag(args, i, "--all", (char*)NULL)) {
+ opts["all"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--default", (char*)NULL)) {
+ opts["default"] = "true";
+ } else if (ceph_argparse_witharg(args, i, &val, "-o", "--output", (char*)NULL)) {
+ opts["output"] = val;
+ } else if (ceph_argparse_flag(args, i, "--write-omap", (char*)NULL)) {
+ opts["write-dest-omap"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--write-object", (char*)NULL)) {
+ opts["write-dest-obj"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--write-xattr", (char*)NULL)) {
+ opts["write-dest-xattr"] = "true";
+ } else if (ceph_argparse_flag(args, i, "--with-clones", (char*)NULL)) {
+ opts["with-clones"] = "true";
+ } else if (ceph_argparse_witharg(args, i, &val, "--omap-key-file", (char*)NULL)) {
+ opts["omap-key-file"] = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--obj-name-file", (char*)NULL)) {
+ opts["obj-name-file"] = val;
+ } else if (ceph_argparse_flag(args, i, "--with-reference", (char*)NULL)) {
+ opts["with-reference"] = "true";
+ } else if (ceph_argparse_witharg(args, i, &val, "--pgid", (char*)NULL)) {
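+    // purge every namespace in the pool; allow ops even if the cluster is flagged
+    // full, then delete all objects via the bench cleanup path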
+ opts["pgid"] = val;
+ } else {
+      if ((*i)[0] == '-')
+ usage_exit();
+ ++i;
+ }
+ }
+
+ if (args.empty()) {
+ cerr << "rados: you must give an action. Try --help" << std::endl;
+ return 1;
+ }
+
+ return rados_tool_common(opts, args);
+}
diff --git a/src/tools/radosacl.cc b/src/tools/radosacl.cc
new file mode 100644
index 00000000..3b071705
--- /dev/null
+++ b/src/tools/radosacl.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+
+using namespace librados;
+
+void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+ str[0] = '\0';
+ for (int i = 0; i < len; i++) {
+ sprintf(&str[i*2], "%02x", (int)buf[i]);
+ }
+}
+
+
+#define ID_SIZE 8
+
+#define ACL_RD 0x1
+#define ACL_WR 0x2
+
+struct ACLID {
+ char id[ID_SIZE + 1];
+
+ void encode(bufferlist& bl) const {
+ bl.append((const char *)id, ID_SIZE);
+ }
+ void decode(bufferlist::const_iterator& iter) {
+ iter.copy(ID_SIZE, (char *)id);
+ }
+};
+WRITE_CLASS_ENCODER(ACLID)
+
+typedef __u32 ACLFlags;
+
+
+inline bool operator<(const ACLID& l, const ACLID& r)
+{
+ return (memcmp(&l, &r, ID_SIZE) < 0);
+}
+
+struct ACLPair {
+ ACLID id;
+ ACLFlags flags;
+};
+
+class ObjectACLs {
+ map<ACLID, ACLFlags> acls_map;
+
+public:
+
+ void encode(bufferlist& bl) const {
+ using ceph::encode;
+ encode(acls_map, bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ using ceph::decode;
+ decode(acls_map, bl);
+ }
+
+ int read_acl(ACLID& id, ACLFlags *flags);
+ void set_acl(ACLID& id, ACLFlags flags);
+};
+WRITE_CLASS_ENCODER(ObjectACLs)
+
+int ObjectACLs::read_acl(ACLID& id, ACLFlags *flags)
+{
+ if (!flags)
+ return -EINVAL;
+
+ map<ACLID, ACLFlags>::iterator iter = acls_map.find(id);
+
+ if (iter == acls_map.end())
+ return -ENOENT;
+
+ *flags = iter->second;
+
+ return 0;
+}
+
+void ObjectACLs::set_acl(ACLID& id, ACLFlags flags)
+{
+ acls_map[id] = flags;
+}
+
+
+
+class ACLEntity
+{
+ string name;
+ map<ACLID, ACLEntity> groups;
+};
+
+typedef map<ACLID, ACLEntity> tACLIDEntityMap;
+
+static map<ACLID, ACLEntity> users;
+static map<ACLID, ACLEntity> groups;
+
+void get_user(ACLID& aclid, ACLEntity *entity)
+{
+ //users.find(aclid);
+}
+
+
+
+
+
+int main(int argc, const char **argv)
+{
+ Rados rados;
+ if (rados.init(NULL) < 0) {
+ cerr << "couldn't initialize rados!" << std::endl;
+ exit(1);
+ }
+ if (rados.conf_read_file(NULL)) {
+ cerr << "couldn't read Ceph configuration file!" << std::endl;
+ exit(1);
+ }
+ if (rados.connect() < 0) {
+ cerr << "couldn't connect to cluster!" << std::endl;
+ exit(1);
+ }
+
+ time_t tm;
+ bufferlist bl, bl2;
+ char buf[128];
+
+ time(&tm);
+ snprintf(buf, 128, "%s", ctime(&tm));
+ bl.append(buf, strlen(buf));
+
+ const char *oid = "bar";
+
+ IoCtx io_ctx;
+ int r = rados.ioctx_create("data", io_ctx);
+ cout << "open io_ctx result = " << r << " pool = " << io_ctx.get_pool_name() << std::endl;
+
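+  // demo flow: read the object's ACL map via the 'acl' object class (get), grant
+  // read access to a test id, then write the updated map back (set)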
+ ACLID id;
+
+ snprintf(id.id, sizeof(id.id), "%.8x", 0x1234);
+ cout << "id=" << id.id << std::endl;
+
+ r = io_ctx.exec(oid, "acl", "get", bl, bl2);
+ cout << "exec(acl get) returned " << r
+ << " len=" << bl2.length() << std::endl;
+ ObjectACLs oa;
+ if (r >= 0) {
+ auto iter = bl2.cbegin();
+ oa.decode(iter);
+ }
+
+ oa.set_acl(id, ACL_RD);
+ bl.clear();
+ oa.encode(bl);
+ r = io_ctx.exec(oid, "acl", "set", bl, bl2);
+ cout << "exec(acl set) returned " << r
+ << " len=" << bl2.length() << std::endl;
+
+ const unsigned char *md5 = (const unsigned char *)bl2.c_str();
+ char md5_str[bl2.length()*2 + 1];
+ buf_to_hex(md5, bl2.length(), md5_str);
+ cout << "md5 result=" << md5_str << std::endl;
+
+ int size = io_ctx.read(oid, bl2, 128, 0);
+ cout << "read result=" << bl2.c_str() << std::endl;
+ cout << "size=" << size << std::endl;
+
+ return 0;
+}
+
diff --git a/src/tools/rbd/ArgumentTypes.cc b/src/tools/rbd/ArgumentTypes.cc
new file mode 100644
index 00000000..ae5f9fd7
--- /dev/null
+++ b/src/tools/rbd/ArgumentTypes.cc
@@ -0,0 +1,515 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd/features.h"
+#include "common/config_proxy.h"
+#include "common/strtol.h"
+#include "common/Formatter.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/tokenizer.hpp>
+
+namespace rbd {
+namespace argument_types {
+
+namespace po = boost::program_options;
+
+const std::map<uint64_t, std::string> ImageFeatures::FEATURE_MAPPING = {
+ {RBD_FEATURE_LAYERING, RBD_FEATURE_NAME_LAYERING},
+ {RBD_FEATURE_STRIPINGV2, RBD_FEATURE_NAME_STRIPINGV2},
+ {RBD_FEATURE_EXCLUSIVE_LOCK, RBD_FEATURE_NAME_EXCLUSIVE_LOCK},
+ {RBD_FEATURE_OBJECT_MAP, RBD_FEATURE_NAME_OBJECT_MAP},
+ {RBD_FEATURE_FAST_DIFF, RBD_FEATURE_NAME_FAST_DIFF},
+ {RBD_FEATURE_DEEP_FLATTEN, RBD_FEATURE_NAME_DEEP_FLATTEN},
+ {RBD_FEATURE_JOURNALING, RBD_FEATURE_NAME_JOURNALING},
+ {RBD_FEATURE_DATA_POOL, RBD_FEATURE_NAME_DATA_POOL},
+ {RBD_FEATURE_OPERATIONS, RBD_FEATURE_NAME_OPERATIONS},
+ {RBD_FEATURE_MIGRATING, RBD_FEATURE_NAME_MIGRATING},
+};
+
+Format::Formatter Format::create_formatter(bool pretty) const {
+ if (value == "json") {
+ return Formatter(new JSONFormatter(pretty));
+ } else if (value == "xml") {
+ return Formatter(new XMLFormatter(pretty));
+ }
+ return Formatter();
+}
+
+std::string get_name_prefix(ArgumentModifier modifier) {
+ switch (modifier) {
+ case ARGUMENT_MODIFIER_SOURCE:
+ return SOURCE_PREFIX;
+ case ARGUMENT_MODIFIER_DEST:
+ return DEST_PREFIX;
+ default:
+ return "";
+ }
+}
+
+std::string get_description_prefix(ArgumentModifier modifier) {
+ switch (modifier) {
+ case ARGUMENT_MODIFIER_SOURCE:
+ return "source ";
+ case ARGUMENT_MODIFIER_DEST:
+ return "destination ";
+ default:
+ return "";
+ }
+}
+
+void add_pool_option(po::options_description *opt,
+ ArgumentModifier modifier,
+ const std::string &desc_suffix) {
+ std::string name = POOL_NAME + ",p";
+ std::string description = "pool name";
+ switch (modifier) {
+ case ARGUMENT_MODIFIER_NONE:
+ break;
+ case ARGUMENT_MODIFIER_SOURCE:
+ description = "source " + description;
+ break;
+ case ARGUMENT_MODIFIER_DEST:
+ name = DEST_POOL_NAME;
+ description = "destination " + description;
+ break;
+ }
+ description += desc_suffix;
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_namespace_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier) {
+ std::string name = NAMESPACE_NAME;
+ std::string description = "namespace name";
+ switch (modifier) {
+ case ARGUMENT_MODIFIER_NONE:
+ break;
+ case ARGUMENT_MODIFIER_SOURCE:
+ description = "source " + description;
+ break;
+ case ARGUMENT_MODIFIER_DEST:
+ name = DEST_NAMESPACE_NAME;
+ description = "destination " + description;
+ break;
+ }
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_image_option(po::options_description *opt,
+ ArgumentModifier modifier,
+ const std::string &desc_suffix) {
+ std::string name = IMAGE_NAME;
+ std::string description = "image name";
+ switch (modifier) {
+ case ARGUMENT_MODIFIER_NONE:
+ break;
+ case ARGUMENT_MODIFIER_SOURCE:
+ description = "source " + description;
+ break;
+ case ARGUMENT_MODIFIER_DEST:
+ name = DEST_IMAGE_NAME;
+ description = "destination " + description;
+ break;
+ }
+ description += desc_suffix;
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_image_id_option(po::options_description *opt,
+ const std::string &desc_suffix) {
+ std::string name = IMAGE_ID;
+ std::string description = "image id";
+ description += desc_suffix;
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_snap_option(po::options_description *opt,
+ ArgumentModifier modifier) {
+
+ std::string name = SNAPSHOT_NAME;
+ std::string description = "snapshot name";
+ switch (modifier) {
+ case ARGUMENT_MODIFIER_NONE:
+ break;
+ case ARGUMENT_MODIFIER_DEST:
+ name = DEST_SNAPSHOT_NAME;
+ description = "destination " + description;
+ break;
+ case ARGUMENT_MODIFIER_SOURCE:
+ description = "source " + description;
+ break;
+ }
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_snap_id_option(po::options_description *opt) {
+ opt->add_options()
+ (SNAPSHOT_ID.c_str(), po::value<uint64_t>(), "snapshot id");
+}
+
+void add_pool_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ bool namespaces_supported) {
+ opt->add_options()
+ ((POOL_NAME + ",p").c_str(), po::value<std::string>(), "pool name");
+ if (namespaces_supported) {
+ add_namespace_option(opt, ARGUMENT_MODIFIER_NONE);
+ pos->add_options()
+ ("pool-spec", "pool specification\n"
+       "(example: <pool-name>[/<namespace>])");
+ } else {
+ pos->add_options()
+ ("pool-name", "pool name");
+ }
+}
+
+void add_image_spec_options(po::options_description *pos,
+ po::options_description *opt,
+ ArgumentModifier modifier) {
+ pos->add_options()
+ ((get_name_prefix(modifier) + IMAGE_SPEC).c_str(),
+ (get_description_prefix(modifier) + "image specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<image-name>)").c_str());
+ add_pool_option(opt, modifier);
+ add_namespace_option(opt, modifier);
+ add_image_option(opt, modifier);
+}
+
+void add_snap_spec_options(po::options_description *pos,
+ po::options_description *opt,
+ ArgumentModifier modifier) {
+ pos->add_options()
+ ((get_name_prefix(modifier) + SNAPSHOT_SPEC).c_str(),
+ (get_description_prefix(modifier) + "snapshot specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<image-name>@<snapshot-name>)").c_str());
+ add_pool_option(opt, modifier);
+ add_namespace_option(opt, modifier);
+ add_image_option(opt, modifier);
+ add_snap_option(opt, modifier);
+}
+
+void add_image_or_snap_spec_options(po::options_description *pos,
+ po::options_description *opt,
+ ArgumentModifier modifier) {
+ pos->add_options()
+ ((get_name_prefix(modifier) + IMAGE_OR_SNAPSHOT_SPEC).c_str(),
+ (get_description_prefix(modifier) + "image or snapshot specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<image-name>[@<snap-name>])").c_str());
+ add_pool_option(opt, modifier);
+ add_namespace_option(opt, modifier);
+ add_image_option(opt, modifier);
+ add_snap_option(opt, modifier);
+}
+
+void add_create_image_options(po::options_description *opt,
+ bool include_format) {
+ // TODO get default image format from conf
+ if (include_format) {
+ opt->add_options()
+ (IMAGE_FORMAT.c_str(), po::value<ImageFormat>(),
+ "image format [1 (deprecated) or 2]")
+ (IMAGE_NEW_FORMAT.c_str(),
+ po::value<ImageNewFormat>()->zero_tokens(),
+ "use image format 2\n(deprecated)");
+ }
+
+ opt->add_options()
+ (IMAGE_ORDER.c_str(), po::value<ImageOrder>(),
+ "object order [12 <= order <= 25]")
+ (IMAGE_OBJECT_SIZE.c_str(), po::value<ImageObjectSize>(),
+ "object size in B/K/M [4K <= object size <= 32M]")
+ (IMAGE_FEATURES.c_str(), po::value<ImageFeatures>()->composing(),
+ ("image features\n" + get_short_features_help(true)).c_str())
+ (IMAGE_SHARED.c_str(), po::bool_switch(), "shared image")
+ (IMAGE_STRIPE_UNIT.c_str(), po::value<ImageObjectSize>(), "stripe unit in B/K/M")
+ (IMAGE_STRIPE_COUNT.c_str(), po::value<uint64_t>(), "stripe count")
+ (IMAGE_DATA_POOL.c_str(), po::value<std::string>(), "data pool");
+
+ add_create_journal_options(opt);
+}
+
+void add_create_journal_options(po::options_description *opt) {
+ opt->add_options()
+ (JOURNAL_SPLAY_WIDTH.c_str(), po::value<uint64_t>(),
+ "number of active journal objects")
+ (JOURNAL_OBJECT_SIZE.c_str(), po::value<JournalObjectSize>(),
+ "size of journal objects [4K <= size <= 64M]")
+ (JOURNAL_POOL.c_str(), po::value<std::string>(),
+ "pool for journal objects");
+}
+
+void add_size_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ ((IMAGE_SIZE + ",s").c_str(), po::value<ImageSize>()->required(),
+ "image size (in M/G/T) [default: M]");
+}
+
+void add_sparse_size_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ (IMAGE_SPARSE_SIZE.c_str(), po::value<ImageObjectSize>(),
+ "sparse size in B/K/M [default: 4K]");
+}
+
+void add_path_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ const std::string &description) {
+ pos->add_options()
+ (PATH_NAME.c_str(), po::value<std::string>(), description.c_str());
+ opt->add_options()
+ (PATH.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_limit_option(po::options_description *opt) {
+ std::string description = "maximum allowed snapshot count";
+
+ opt->add_options()
+ (LIMIT.c_str(), po::value<uint64_t>(), description.c_str());
+}
+
+void add_no_progress_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ (NO_PROGRESS.c_str(), po::bool_switch(), "disable progress output");
+}
+
+void add_format_options(boost::program_options::options_description *opt) {
+ opt->add_options()
+ (FORMAT.c_str(), po::value<Format>(), "output format (plain, json, or xml) [default: plain]")
+ (PRETTY_FORMAT.c_str(), po::bool_switch(),
+ "pretty formatting (json and xml)");
+}
+
+void add_verbose_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ (VERBOSE.c_str(), po::bool_switch(), "be verbose");
+}
+
+void add_no_error_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ (NO_ERROR.c_str(), po::bool_switch(), "continue after error");
+}
+
+void add_export_format_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ ("export-format", po::value<ExportFormat>(), "format of image file");
+}
+
+void add_flatten_option(boost::program_options::options_description *opt) {
+ opt->add_options()
+ (IMAGE_FLATTEN.c_str(), po::bool_switch(),
+ "fill clone with parent data (make it independent)");
+}
+
+std::string get_short_features_help(bool append_suffix) {
+ std::ostringstream oss;
+ bool first_feature = true;
+ oss << "[";
+ for (auto &pair : ImageFeatures::FEATURE_MAPPING) {
+ if ((pair.first & RBD_FEATURES_IMPLICIT_ENABLE) != 0ULL) {
+ // hide implicitly enabled features from list
+ continue;
+ } else if (!append_suffix && (pair.first & RBD_FEATURES_MUTABLE) == 0ULL) {
+ // hide non-mutable features for the 'rbd feature XYZ' command
+ continue;
+ }
+
+ if (!first_feature) {
+ oss << ", ";
+ }
+ first_feature = false;
+
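+    // suffix legend (see get_long_features_help): '+' enabled by default,
+    // '*' can be enabled/disabled on existing images, '-' disable-only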
+ std::string suffix;
+ if (append_suffix) {
+ if ((pair.first & rbd::utils::get_rbd_default_features(g_ceph_context)) != 0) {
+ suffix += "+";
+ }
+ if ((pair.first & RBD_FEATURES_MUTABLE) != 0) {
+ suffix += "*";
+ } else if ((pair.first & RBD_FEATURES_DISABLE_ONLY) != 0) {
+ suffix += "-";
+ }
+ if (!suffix.empty()) {
+ suffix = "(" + suffix + ")";
+ }
+ }
+ oss << pair.second << suffix;
+ }
+ oss << "]";
+ return oss.str();
+}
+
+std::string get_long_features_help() {
+ std::ostringstream oss;
+ oss << "Image Features:" << std::endl
+ << " (*) supports enabling/disabling on existing images" << std::endl
+ << " (-) supports disabling-only on existing images" << std::endl
+ << " (+) enabled by default for new images if features not specified"
+ << std::endl;
+ return oss.str();
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageSize *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+
+  // NOTE: this block can be removed once every application using this CLI
+  // passes sizes with an explicit B/K/M/G/T/P/E suffix
+  if (isdigit(*s.rbegin())) {
+    size = size << 20; // a bare number defaults to MiB; convert to bytes
+  }
+ v = boost::any(size);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageOrder *target_type, int dummy) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ try {
+ uint64_t order = boost::lexical_cast<uint64_t>(s);
+ if (order >= 12 && order <= 25) {
+ v = boost::any(order);
+ return;
+ }
+ } catch (const boost::bad_lexical_cast &) {
+ }
+ throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageObjectSize *target_type, int dummy) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t objectsize = strict_iecstrtoll(s.c_str(), &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(objectsize);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageFormat *target_type, int dummy) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ try {
+ uint32_t format = boost::lexical_cast<uint32_t>(s);
+ if (format == 1 || format == 2) {
+ v = boost::any(format);
+ return;
+ }
+ } catch (const boost::bad_lexical_cast &) {
+ }
+ throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageNewFormat *target_type, int dummy) {
+ std::cout << "rbd: --new-format is deprecated, use --image-format"
+ << std::endl;
+ v = boost::any(true);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageFeatures *target_type, int) {
+ if (v.empty()) {
+ v = boost::any(static_cast<uint64_t>(0));
+ }
+
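+  // --image-feature may be given multiple times and each value may be a
+  // comma-separated list; all matching feature bits are OR'd together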
+ uint64_t &features = boost::any_cast<uint64_t &>(v);
+ for (auto &value : values) {
+ boost::char_separator<char> sep(",");
+ boost::tokenizer<boost::char_separator<char> > tok(value, sep);
+ for (auto &token : tok) {
+ bool matched = false;
+ for (auto &it : ImageFeatures::FEATURE_MAPPING) {
+ if (token == it.second) {
+ features |= it.first;
+ matched = true;
+ break;
+ }
+ }
+
+ if (!matched) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ }
+ }
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Format *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ if (s == "plain" || s == "json" || s == "xml") {
+ v = boost::any(Format(s));
+ } else {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ JournalObjectSize *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
+ if (parse_error.empty() && (size >= (1 << 12)) && (size <= (1 << 26))) {
+ v = boost::any(size);
+ return;
+ }
+ throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ExportFormat *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t format = strict_iecstrtoll(s.c_str(), &parse_error);
+ if (!parse_error.empty() || (format != 1 && format != 2)) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+
+ v = boost::any(format);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Secret *target_type, int) {
+ std::cerr << "rbd: --secret is deprecated, use --keyfile" << std::endl;
+
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ g_conf().set_val_or_die("keyfile", s.c_str());
+ v = boost::any(s);
+}
+
+} // namespace argument_types
+} // namespace rbd
diff --git a/src/tools/rbd/ArgumentTypes.h b/src/tools/rbd/ArgumentTypes.h
new file mode 100644
index 00000000..23bb02b9
--- /dev/null
+++ b/src/tools/rbd/ArgumentTypes.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_ARGUMENT_TYPES_H
+#define CEPH_RBD_ARGUMENT_TYPES_H
+
+#include "include/int_types.h"
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/any.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace argument_types {
+
+enum ArgumentModifier {
+ ARGUMENT_MODIFIER_NONE,
+ ARGUMENT_MODIFIER_SOURCE,
+ ARGUMENT_MODIFIER_DEST
+};
+
+enum SpecFormat {
+ SPEC_FORMAT_IMAGE,
+ SPEC_FORMAT_SNAPSHOT,
+ SPEC_FORMAT_IMAGE_OR_SNAPSHOT
+};
+
+static const std::string SOURCE_PREFIX("source-");
+static const std::string DEST_PREFIX("dest-");
+
+// positional arguments
+static const std::string POSITIONAL_COMMAND_SPEC("positional-command-spec");
+static const std::string POSITIONAL_ARGUMENTS("positional-arguments");
+static const std::string IMAGE_SPEC("image-spec");
+static const std::string SNAPSHOT_SPEC("snap-spec");
+static const std::string IMAGE_OR_SNAPSHOT_SPEC("image-or-snap-spec");
+static const std::string PATH_NAME("path-name");
+static const std::string IMAGE_ID("image-id");
+
+// optional arguments
+static const std::string CONFIG_PATH("conf");
+static const std::string POOL_NAME("pool");
+static const std::string DEST_POOL_NAME("dest-pool");
+static const std::string NAMESPACE_NAME("namespace");
+static const std::string DEST_NAMESPACE_NAME("dest-namespace");
+static const std::string IMAGE_NAME("image");
+static const std::string DEST_IMAGE_NAME("dest");
+static const std::string SNAPSHOT_NAME("snap");
+static const std::string SNAPSHOT_ID("snap-id");
+static const std::string DEST_SNAPSHOT_NAME("dest-snap");
+static const std::string PATH("path");
+static const std::string FROM_SNAPSHOT_NAME("from-snap");
+static const std::string WHOLE_OBJECT("whole-object");
+
+static const std::string IMAGE_FORMAT("image-format");
+static const std::string IMAGE_NEW_FORMAT("new-format");
+static const std::string IMAGE_ORDER("order");
+static const std::string IMAGE_OBJECT_SIZE("object-size");
+static const std::string IMAGE_FEATURES("image-feature");
+static const std::string IMAGE_SHARED("image-shared");
+static const std::string IMAGE_SIZE("size");
+static const std::string IMAGE_STRIPE_UNIT("stripe-unit");
+static const std::string IMAGE_STRIPE_COUNT("stripe-count");
+static const std::string IMAGE_DATA_POOL("data-pool");
+static const std::string IMAGE_SPARSE_SIZE("sparse-size");
+static const std::string IMAGE_THICK_PROVISION("thick-provision");
+static const std::string IMAGE_FLATTEN("flatten");
+
+static const std::string JOURNAL_OBJECT_SIZE("journal-object-size");
+static const std::string JOURNAL_SPLAY_WIDTH("journal-splay-width");
+static const std::string JOURNAL_POOL("journal-pool");
+
+static const std::string NO_PROGRESS("no-progress");
+static const std::string FORMAT("format");
+static const std::string PRETTY_FORMAT("pretty-format");
+static const std::string VERBOSE("verbose");
+static const std::string NO_ERROR("no-error");
+
+static const std::string LIMIT("limit");
+
+static const std::set<std::string> SWITCH_ARGUMENTS = {
+ WHOLE_OBJECT, NO_PROGRESS, PRETTY_FORMAT, VERBOSE, NO_ERROR};
+
+struct ImageSize {};
+struct ImageOrder {};
+struct ImageObjectSize {};
+struct ImageFormat {};
+struct ImageNewFormat {};
+
+struct ImageFeatures {
+ static const std::map<uint64_t, std::string> FEATURE_MAPPING;
+
+ uint64_t features;
+};
+
+template <typename T>
+struct TypedValue {
+ T value;
+ TypedValue(const T& t) : value(t) {}
+};
+
+struct Format : public TypedValue<std::string> {
+ typedef boost::shared_ptr<ceph::Formatter> Formatter;
+
+ Format(const std::string &format) : TypedValue<std::string>(format) {}
+
+ Formatter create_formatter(bool pretty) const;
+};
+
+struct JournalObjectSize {};
+
+struct ExportFormat {};
+
+struct Secret {};
+
+void add_export_format_option(boost::program_options::options_description *opt);
+
+std::string get_name_prefix(ArgumentModifier modifier);
+std::string get_description_prefix(ArgumentModifier modifier);
+
+void add_all_option(boost::program_options::options_description *opt,
+ std::string description);
+
+void add_pool_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier,
+ const std::string &desc_suffix = "");
+void add_namespace_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_image_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier,
+ const std::string &desc_suffix = "");
+
+void add_image_id_option(boost::program_options::options_description *opt,
+ const std::string &desc_suffix = "");
+
+void add_snap_option(boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+void add_snap_id_option(boost::program_options::options_description *opt);
+
+void add_pool_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ bool namespaces_supported);
+
+void add_image_spec_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_snap_spec_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_image_or_snap_spec_options(
+ boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ ArgumentModifier modifier);
+
+void add_create_image_options(boost::program_options::options_description *opt,
+ bool include_format);
+
+void add_create_journal_options(
+ boost::program_options::options_description *opt);
+
+void add_size_option(boost::program_options::options_description *opt);
+
+void add_sparse_size_option(boost::program_options::options_description *opt);
+
+void add_path_options(boost::program_options::options_description *pos,
+ boost::program_options::options_description *opt,
+ const std::string &description);
+
+void add_limit_option(boost::program_options::options_description *opt);
+
+void add_no_progress_option(boost::program_options::options_description *opt);
+
+void add_format_options(boost::program_options::options_description *opt);
+
+void add_verbose_option(boost::program_options::options_description *opt);
+
+void add_no_error_option(boost::program_options::options_description *opt);
+
+void add_flatten_option(boost::program_options::options_description *opt);
+
+std::string get_short_features_help(bool append_suffix);
+std::string get_long_features_help();
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ExportFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageOrder *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageObjectSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageNewFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ ImageFeatures *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Format *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ JournalObjectSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Secret *target_type, int);
+
+
+std::ostream &operator<<(std::ostream &os, const ImageFeatures &features);
+
+} // namespace argument_types
+} // namespace rbd
+
+#endif // CEPH_RBD_ARGUMENT_TYPES_H
diff --git a/src/tools/rbd/CMakeLists.txt b/src/tools/rbd/CMakeLists.txt
new file mode 100644
index 00000000..0e38a033
--- /dev/null
+++ b/src/tools/rbd/CMakeLists.txt
@@ -0,0 +1,65 @@
+set(CURSES_NEED_NCURSES TRUE)
+find_package(Curses REQUIRED)
+
+set(rbd_srcs
+ rbd.cc
+ ArgumentTypes.cc
+ IndentStream.cc
+ MirrorDaemonServiceInfo.cc
+ OptionPrinter.cc
+ Shell.cc
+ Utils.cc
+ action/Bench.cc
+ action/Children.cc
+ action/Clone.cc
+ action/Config.cc
+ action/Copy.cc
+ action/Create.cc
+ action/Device.cc
+ action/Diff.cc
+ action/DiskUsage.cc
+ action/Export.cc
+ action/Feature.cc
+ action/Flatten.cc
+ action/Ggate.cc
+ action/Group.cc
+ action/ImageMeta.cc
+ action/Import.cc
+ action/Info.cc
+ action/Journal.cc
+ action/Kernel.cc
+ action/List.cc
+ action/Lock.cc
+ action/MergeDiff.cc
+ action/Migration.cc
+ action/MirrorPool.cc
+ action/MirrorImage.cc
+ action/Namespace.cc
+ action/Nbd.cc
+ action/ObjectMap.cc
+ action/Perf.cc
+ action/Pool.cc
+ action/Remove.cc
+ action/Rename.cc
+ action/Resize.cc
+ action/Snap.cc
+ action/Sparsify.cc
+ action/Status.cc
+ action/Trash.cc
+ action/Watch.cc)
+
+add_executable(rbd ${rbd_srcs}
+ $<TARGET_OBJECTS:common_texttable_obj>)
+set_target_properties(rbd PROPERTIES OUTPUT_NAME rbd)
+target_link_libraries(rbd librbd librados
+ cls_journal_client cls_rbd_client
+ rbd_types
+ journal
+ ceph-common global ${CURSES_LIBRARIES}
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+if(WITH_KRBD)
+ target_link_libraries(rbd
+ krbd)
+endif()
+
+install(TARGETS rbd DESTINATION bin)
diff --git a/src/tools/rbd/IndentStream.cc b/src/tools/rbd/IndentStream.cc
new file mode 100644
index 00000000..83591a8c
--- /dev/null
+++ b/src/tools/rbd/IndentStream.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/IndentStream.h"
+
+namespace rbd {
+
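+// std::streambuf overflow hook: buffers characters, converts tabs to spaces, and
+// word-wraps long lines at the configured delimiter before forwarding them to the
+// wrapped streambuf with the indent prefix applied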
+int IndentBuffer::overflow (int c) {
+ if (traits_type::eq_int_type(traits_type::eof(), c)) {
+ return traits_type::not_eof(c);
+ }
+
+ int r;
+ switch (c) {
+ case '\n':
+ m_buffer += c;
+ flush_line();
+ r = m_streambuf->sputn(m_buffer.c_str(), m_buffer.size());
+ m_buffer.clear();
+ return r;
+ case '\t':
+ // convert tab to single space and fall-through
+ c = ' ';
+ default:
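+    // wrap once the indent plus the buffered text reaches the line length,
+    // preferring to break at the last delimiter (falling back to a space)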
+ if (m_indent + m_buffer.size() >= m_line_length) {
+ size_t word_offset = m_buffer.find_last_of(m_delim);
+ bool space_delim = (m_delim == " ");
+ if (word_offset == std::string::npos && !space_delim) {
+ word_offset = m_buffer.find_last_of(" ");
+ }
+
+ if (word_offset != std::string::npos) {
+ flush_line();
+ m_streambuf->sputn(m_buffer.c_str(), word_offset);
+ m_buffer = std::string(m_buffer,
+ word_offset + (space_delim ? 1 : 0));
+ } else {
+ flush_line();
+ m_streambuf->sputn(m_buffer.c_str(), m_buffer.size());
+ m_buffer.clear();
+ }
+ m_streambuf->sputc('\n');
+ }
+ m_buffer += c;
+ return c;
+ }
+}
+
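+// start a wrapped line: drop to a fresh line if the caller's initial offset already
+// reaches the indent column, then pad up to the indent with the prefix string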
+void IndentBuffer::flush_line() {
+ if (m_initial_offset >= m_indent) {
+ m_initial_offset = 0;
+ m_streambuf->sputc('\n');
+ }
+
+ m_streambuf->sputn(m_indent_prefix.c_str(), m_indent - m_initial_offset);
+ m_initial_offset = 0;
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/IndentStream.h b/src/tools/rbd/IndentStream.h
new file mode 100644
index 00000000..85ccc85b
--- /dev/null
+++ b/src/tools/rbd/IndentStream.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_INDENT_STREAM_H
+#define CEPH_RBD_INDENT_STREAM_H
+
+#include "include/int_types.h"
+#include <iostream>
+#include <streambuf>
+#include <iomanip>
+
+namespace rbd {
+
+class IndentBuffer : public std::streambuf {
+public:
+ IndentBuffer(size_t indent, size_t initial_offset, size_t line_length,
+ std::streambuf *streambuf)
+ : m_indent(indent), m_initial_offset(initial_offset),
+ m_line_length(line_length), m_streambuf(streambuf),
+ m_delim(" "), m_indent_prefix(m_indent, ' ') {
+ }
+
+ void set_delimiter(const std::string &delim) {
+ m_delim = delim;
+ }
+
+protected:
+ int overflow (int c) override;
+
+private:
+ size_t m_indent;
+ size_t m_initial_offset;
+ size_t m_line_length;
+ std::streambuf *m_streambuf;
+
+ std::string m_delim;
+ std::string m_indent_prefix;
+ std::string m_buffer;
+
+ void flush_line();
+};
+
+class IndentStream : public std::ostream {
+public:
+ IndentStream(size_t indent, size_t initial_offset, size_t line_length,
+ std::ostream &os)
+ : std::ostream(&m_indent_buffer),
+ m_indent_buffer(indent, initial_offset, line_length, os.rdbuf()) {
+ }
+
+ void set_delimiter(const std::string &delim) {
+ m_indent_buffer.set_delimiter(delim);
+ }
+private:
+ IndentBuffer m_indent_buffer;
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_INDENT_STREAM_H
diff --git a/src/tools/rbd/MirrorDaemonServiceInfo.cc b/src/tools/rbd/MirrorDaemonServiceInfo.cc
new file mode 100644
index 00000000..4870c1b2
--- /dev/null
+++ b/src/tools/rbd/MirrorDaemonServiceInfo.cc
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "include/rados/librados.hpp"
+#include "include/stringify.h"
+#include "tools/rbd/MirrorDaemonServiceInfo.h"
+
+#include <boost/scope_exit.hpp>
+#include <iostream>
+
+namespace rbd {
+
+int MirrorDaemonServiceInfo::init() {
+
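+  // collect per-daemon metadata (keyed by service id) from the mgr service map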
+ std::string cmd = "{\"prefix\": \"service dump\"}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+ int r = librados::Rados(m_io_ctx).mgr_command(cmd, in_bl, &out_bl, nullptr);
+ if (r < 0) {
+ std::cerr << "rbd: failed to get service dump: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+ if (json_spirit::read(out_bl.to_str(), json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ if (json_obj.count("services")) {
+ auto &services = json_obj["services"].get_obj();
+ if (services.count("rbd-mirror")) {
+ auto &mirror_service = services["rbd-mirror"].get_obj();
+ if (mirror_service.count("daemons")) {
+ for (auto &it : mirror_service["daemons"].get_obj()) {
+ if (it.second.type() != json_spirit::obj_type ||
+ !it.second.get_obj().count("metadata")) {
+ continue;
+ }
+ auto &service_id = it.first;
+ auto &daemon_metadata = it.second.get_obj()["metadata"].get_obj();
+ for (auto &iter : daemon_metadata) {
+ if (iter.second.type() != json_spirit::str_type) {
+ continue;
+ }
+ m_daemons_metadata[service_id][iter.first] = iter.second.get_str();
+ }
+ }
+ }
+ }
+ }
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ std::cerr << "rbd: failed to parse service status" << std::endl;
+ return -EBADMSG;
+ }
+
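+  // map each rbd-mirror instance id reported for this pool back to its service id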
+ cmd = "{\"prefix\": \"service status\"}";
+
+ out_bl.clear();
+ r = librados::Rados(m_io_ctx).mgr_command(cmd, in_bl, &out_bl, nullptr);
+ if (r < 0) {
+ std::cerr << "rbd: failed to get service status: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ json_valid = false;
+ if (json_spirit::read(out_bl.to_str(), json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ if (json_obj.count("rbd-mirror")) {
+ auto &mirror_service = json_obj["rbd-mirror"].get_obj();
+ for (auto &it : mirror_service) {
+ auto &service_id = it.first;
+ auto &daemon = it.second.get_obj();
+ if (daemon.count("status") &&
+ daemon["status"].get_obj().count("json")) {
+ auto& status_json_str =
+ daemon["status"].get_obj()["json"].get_str();
+ json_spirit::mValue status_json_root;
+ if (json_spirit::read(status_json_str, status_json_root)) {
+ auto& status = status_json_root.get_obj();
+ auto iter = status.find(stringify(m_io_ctx.get_id()));
+ if (iter != status.end() &&
+ iter->second.get_obj().count("instance_id")) {
+ auto &instance_id =
+ iter->second.get_obj()["instance_id"].get_str();
+ m_instance_id_to_service_id[instance_id] = service_id;
+ }
+ }
+ }
+ }
+ }
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ std::cerr << "rbd: failed to parse service status" << std::endl;
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
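+// human-readable daemon description: the daemon's registered id (plus hostname when
+// known), falling back to the bare service id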
+std::string MirrorDaemonServiceInfo::get_description(
+ const std::string &instance_id) const {
+ if (!m_instance_id_to_service_id.count(instance_id)) {
+ return {};
+ }
+
+ auto service_id = m_instance_id_to_service_id.find(instance_id)->second;
+
+ auto it = m_daemons_metadata.find(service_id);
+ if (it == m_daemons_metadata.end()) {
+ return service_id;
+ }
+
+ auto &metadata = it->second;
+ auto iter = metadata.find("id");
+ std::string description = (iter != metadata.end()) ?
+ iter->second : service_id;
+ iter = metadata.find("hostname");
+ if (iter != metadata.end()) {
+ description += " on " + iter->second;
+ }
+
+ return description;
+}
+
+void MirrorDaemonServiceInfo::dump(
+ const std::string &instance_id,
+ argument_types::Format::Formatter formatter) const {
+ formatter->open_object_section("daemon_service");
+ BOOST_SCOPE_EXIT(formatter) {
+ formatter->close_section();
+ } BOOST_SCOPE_EXIT_END;
+
+ if (instance_id.empty() ||
+ !m_instance_id_to_service_id.count(instance_id)) {
+ return;
+ }
+
+ auto service_id = m_instance_id_to_service_id.find(instance_id)->second;
+ formatter->dump_string("service_id", service_id);
+ formatter->dump_string("instance_id", instance_id);
+
+ auto it = m_daemons_metadata.find(service_id);
+ if (it == m_daemons_metadata.end()) {
+ return;
+ }
+
+ auto &metadata = it->second;
+ auto iter = metadata.find("id");
+ if (iter != metadata.end()) {
+ formatter->dump_string("daemon_id", iter->second);
+ }
+ iter = metadata.find("hostname");
+ if (iter != metadata.end()) {
+ formatter->dump_string("hostname", iter->second);
+ }
+}
+
+} // namespace rbd
+
diff --git a/src/tools/rbd/MirrorDaemonServiceInfo.h b/src/tools/rbd/MirrorDaemonServiceInfo.h
new file mode 100644
index 00000000..7c3c3856
--- /dev/null
+++ b/src/tools/rbd/MirrorDaemonServiceInfo.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H
+#define CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H
+
+#include "include/rados/librados_fwd.hpp"
+#include "tools/rbd/ArgumentTypes.h"
+
+#include <string>
+#include <map>
+
+namespace rbd {
+
+class MirrorDaemonServiceInfo {
+public:
+ MirrorDaemonServiceInfo(librados::IoCtx &io_ctx) : m_io_ctx(io_ctx) {
+ }
+
+ int init();
+
+ std::string get_description(const std::string &instance_id) const;
+ void dump(const std::string &instance_id,
+ argument_types::Format::Formatter formatter) const;
+
+private:
+ librados::IoCtx &m_io_ctx;
+ std::map<std::string, std::string> m_instance_id_to_service_id;
+ std::map<std::string, std::map<std::string, std::string>> m_daemons_metadata;
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_DAEMON_SERVICE_INFO_H
diff --git a/src/tools/rbd/OptionPrinter.cc b/src/tools/rbd/OptionPrinter.cc
new file mode 100644
index 00000000..14affb0b
--- /dev/null
+++ b/src/tools/rbd/OptionPrinter.cc
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/OptionPrinter.h"
+#include "tools/rbd/IndentStream.h"
+
+namespace rbd {
+
+namespace po = boost::program_options;
+
+const std::string OptionPrinter::POSITIONAL_ARGUMENTS("Positional arguments");
+const std::string OptionPrinter::OPTIONAL_ARGUMENTS("Optional arguments");
+
+const size_t OptionPrinter::MAX_DESCRIPTION_OFFSET;
+
+OptionPrinter::OptionPrinter(const OptionsDescription &positional,
+ const OptionsDescription &optional)
+ : m_positional(positional), m_optional(optional) {
+}
+
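+// one-line usage summary: optional arguments (bracketed unless required) followed by
+// positional arguments, word-wrapped at LINE_WIDTH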
+void OptionPrinter::print_short(std::ostream &os, size_t initial_offset) {
+ size_t name_width = std::min(initial_offset, MAX_DESCRIPTION_OFFSET) + 1;
+
+ IndentStream indent_stream(name_width, initial_offset, LINE_WIDTH, os);
+ indent_stream.set_delimiter("[");
+ for (size_t i = 0; i < m_optional.options().size(); ++i) {
+ bool required = m_optional.options()[i]->semantic()->is_required();
+ if (!required) {
+ indent_stream << "[";
+ }
+ indent_stream << "--" << m_optional.options()[i]->long_name();
+ if (m_optional.options()[i]->semantic()->max_tokens() != 0) {
+ indent_stream << " <" << m_optional.options()[i]->long_name() << ">";
+ }
+ if (!required) {
+ indent_stream << "]";
+ }
+ indent_stream << " ";
+ }
+
+ if (m_optional.options().size() > 0 || m_positional.options().size() == 0) {
+ indent_stream << std::endl;
+ }
+
+ if (m_positional.options().size() > 0) {
+ indent_stream.set_delimiter(" ");
+ for (size_t i = 0; i < m_positional.options().size(); ++i) {
+ indent_stream << "<" << m_positional.options()[i]->long_name() << "> ";
+ if (m_positional.options()[i]->semantic()->max_tokens() > 1) {
+ indent_stream << "[<" << m_positional.options()[i]->long_name()
+ << "> ...]";
+ break;
+ }
+ }
+ indent_stream << std::endl;
+ }
+}
+
+void OptionPrinter::print_detailed(std::ostream &os) {
+ std::string indent_prefix(2, ' ');
+ size_t name_width = compute_name_width(indent_prefix.size());
+
+ if (m_positional.options().size() > 0) {
+ std::cout << POSITIONAL_ARGUMENTS << std::endl;
+ for (size_t i = 0; i < m_positional.options().size(); ++i) {
+ std::stringstream ss;
+ ss << indent_prefix << "<" << m_positional.options()[i]->long_name()
+ << ">";
+
+ std::cout << ss.str();
+ IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os);
+ indent_stream << m_positional.options()[i]->description() << std::endl;
+ }
+ std::cout << std::endl;
+ }
+
+ if (m_optional.options().size() > 0) {
+ std::cout << OPTIONAL_ARGUMENTS << std::endl;
+ for (size_t i = 0; i < m_optional.options().size(); ++i) {
+ std::stringstream ss;
+ ss << indent_prefix
+ << m_optional.options()[i]->format_name() << " "
+ << m_optional.options()[i]->format_parameter();
+
+ std::cout << ss.str();
+ IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os);
+ indent_stream << m_optional.options()[i]->description() << std::endl;
+ }
+ std::cout << std::endl;
+ }
+}
+
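+// column at which option descriptions start: wide enough for the longest name plus
+// parameter (and indent), at least MIN_NAME_WIDTH, capped at MAX_DESCRIPTION_OFFSET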
+size_t OptionPrinter::compute_name_width(size_t indent) {
+ size_t width = MIN_NAME_WIDTH;
+ std::vector<OptionsDescription> descs = {m_positional, m_optional};
+ for (size_t desc_idx = 0; desc_idx < descs.size(); ++desc_idx) {
+ const OptionsDescription &desc = descs[desc_idx];
+ for (size_t opt_idx = 0; opt_idx < desc.options().size(); ++opt_idx) {
+ size_t name_width = desc.options()[opt_idx]->format_name().size() +
+ desc.options()[opt_idx]->format_parameter().size()
+ + 1;
+ width = std::max(width, name_width);
+ }
+ }
+ width += indent;
+ width = std::min(width, MAX_DESCRIPTION_OFFSET) + 1;
+ return width;
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/OptionPrinter.h b/src/tools/rbd/OptionPrinter.h
new file mode 100644
index 00000000..e18a5f88
--- /dev/null
+++ b/src/tools/rbd/OptionPrinter.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_OPTION_PRINTER_H
+#define CEPH_RBD_OPTION_PRINTER_H
+
+#include "include/int_types.h"
+#include <string>
+#include <vector>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+
+class OptionPrinter {
+public:
+ typedef boost::program_options::options_description OptionsDescription;
+
+ static const std::string POSITIONAL_ARGUMENTS;
+ static const std::string OPTIONAL_ARGUMENTS;
+
+ static const size_t LINE_WIDTH = 80;
+ static const size_t MIN_NAME_WIDTH = 20;
+ static const size_t MAX_DESCRIPTION_OFFSET = LINE_WIDTH / 2;
+
+ OptionPrinter(const OptionsDescription &positional,
+ const OptionsDescription &optional);
+
+ void print_short(std::ostream &os, size_t initial_offset);
+ void print_detailed(std::ostream &os);
+
+private:
+ const OptionsDescription &m_positional;
+ const OptionsDescription &m_optional;
+
+ size_t compute_name_width(size_t indent);
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_OPTION_PRINTER_H
diff --git a/src/tools/rbd/Shell.cc b/src/tools/rbd/Shell.cc
new file mode 100644
index 00000000..9993c691
--- /dev/null
+++ b/src/tools/rbd/Shell.cc
@@ -0,0 +1,432 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/IndentStream.h"
+#include "tools/rbd/OptionPrinter.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "include/stringify.h"
+#include <algorithm>
+#include <iostream>
+#include <set>
+
+namespace rbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+static const std::string APP_NAME("rbd");
+static const std::string HELP_SPEC("help");
+static const std::string BASH_COMPLETION_SPEC("bash-completion");
+
+boost::intrusive_ptr<CephContext> global_init(
+ int argc, const char **argv, std::vector<std::string> *command_args,
+ std::vector<std::string> *global_init_args) {
+ std::vector<const char*> cmd_args;
+ argv_to_vec(argc, argv, cmd_args);
+ std::vector<const char*> args(cmd_args);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_MON_CONFIG);
+
+ *command_args = {args.begin(), args.end()};
+
+ // Scan command line arguments for ceph global init args (those are
+ // filtered out from args vector by global_init).
+
+ auto cursor = args.begin();
+ for (auto &arg : cmd_args) {
+ auto iter = cursor;
+ for (; iter != args.end(); iter++) {
+ if (*iter == arg) {
+ break;
+ }
+ }
+ if (iter == args.end()) {
+ // filtered out by global_init
+ global_init_args->push_back(arg);
+ } else {
+ cursor = ++iter;
+ }
+ }
+
+ return cct;
+}
+
+std::string format_command_spec(const Shell::CommandSpec &spec) {
+ return joinify<std::string>(spec.begin(), spec.end(), " ");
+}
+
+std::string format_alias_spec(const Shell::CommandSpec &spec,
+ const Shell::CommandSpec &alias_spec) {
+ auto spec_it = spec.begin();
+ auto alias_it = alias_spec.begin();
+ int level = 0;
+ while (spec_it != spec.end() && alias_it != alias_spec.end() &&
+ *spec_it == *alias_it) {
+ spec_it++;
+ alias_it++;
+ level++;
+ }
+ ceph_assert(spec_it != spec.end() && alias_it != alias_spec.end());
+
+ if (level < 2) {
+ return joinify<std::string>(alias_spec.begin(), alias_spec.end(), " ");
+ } else {
+ return "... " + joinify<std::string>(alias_it, alias_spec.end(), " ");
+ }
+}
+
+std::string format_command_name(const Shell::CommandSpec &spec,
+ const Shell::CommandSpec &alias_spec) {
+ std::string name = format_command_spec(spec);
+ if (!alias_spec.empty()) {
+ name += " (" + format_alias_spec(spec, alias_spec) + ")";
+ }
+ return name;
+}
+
+std::string format_option_suffix(
+ const boost::shared_ptr<po::option_description> &option) {
+ std::string suffix;
+ if (option->semantic()->max_tokens() != 0) {
+ if (option->description().find("path") != std::string::npos ||
+ option->description().find("file") != std::string::npos) {
+ suffix += " path";
+ } else if (option->description().find("host") != std::string::npos) {
+ suffix += " host";
+ } else {
+ suffix += " arg";
+ }
+ }
+ return suffix;
+}
+
+} // anonymous namespace
+
+std::vector<Shell::Action *>& Shell::get_actions() {
+ static std::vector<Action *> actions;
+
+ return actions;
+}
+
+std::set<std::string>& Shell::get_switch_arguments() {
+ static std::set<std::string> switch_arguments;
+
+ return switch_arguments;
+}
+
+int Shell::execute(int argc, const char **argv) {
+ std::vector<std::string> arguments;
+ std::vector<std::string> ceph_global_init_args;
+ auto cct = global_init(argc, argv, &arguments, &ceph_global_init_args);
+
+ std::vector<std::string> command_spec;
+ get_command_spec(arguments, &command_spec);
+ bool is_alias = true;
+
+ if (command_spec.empty() || command_spec == CommandSpec({"help"})) {
+ // list all available actions
+ print_help();
+ return 0;
+ } else if (command_spec[0] == HELP_SPEC) {
+ // list help for specific action
+ command_spec.erase(command_spec.begin());
+ Action *action = find_action(command_spec, NULL, &is_alias);
+ if (action == NULL) {
+ print_unknown_action(command_spec);
+ return EXIT_FAILURE;
+ } else {
+ print_action_help(action, is_alias);
+ return 0;
+ }
+ } else if (command_spec[0] == BASH_COMPLETION_SPEC) {
+ command_spec.erase(command_spec.begin());
+ print_bash_completion(command_spec);
+ return 0;
+ }
+
+ CommandSpec *matching_spec;
+ Action *action = find_action(command_spec, &matching_spec, &is_alias);
+ if (action == NULL) {
+ print_unknown_action(command_spec);
+ return EXIT_FAILURE;
+ }
+
+ po::variables_map vm;
+ try {
+ po::options_description positional_opts;
+ po::options_description command_opts;
+ (*action->get_arguments)(&positional_opts, &command_opts);
+
+ // dynamically allocate options for our command (e.g. snap list) and
+ // its associated positional arguments
+ po::options_description argument_opts;
+ argument_opts.add_options()
+ (at::POSITIONAL_COMMAND_SPEC.c_str(),
+ po::value<std::vector<std::string> >()->required(), "")
+ (at::POSITIONAL_ARGUMENTS.c_str(),
+ po::value<std::vector<std::string> >(), "");
+
+ po::positional_options_description positional_options;
+ positional_options.add(at::POSITIONAL_COMMAND_SPEC.c_str(),
+ matching_spec->size());
+ if (!positional_opts.options().empty()) {
+ int max_count = positional_opts.options().size();
+ if (positional_opts.options().back()->semantic()->max_tokens() > 1)
+ max_count = -1;
+ positional_options.add(at::POSITIONAL_ARGUMENTS.c_str(), max_count);
+ }
+
+ po::options_description group_opts;
+ group_opts.add(command_opts)
+ .add(argument_opts);
+
+ po::store(po::command_line_parser(arguments)
+ .style(po::command_line_style::default_style &
+ ~po::command_line_style::allow_guessing)
+ .options(group_opts)
+ .positional(positional_options)
+ .run(), vm);
+
+ if (vm[at::POSITIONAL_COMMAND_SPEC].as<std::vector<std::string> >() !=
+ *matching_spec) {
+ std::cerr << "rbd: failed to parse command" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ int r = (*action->execute)(vm, ceph_global_init_args);
+ if (r != 0) {
+ return std::abs(r);
+ }
+ } catch (po::required_option& e) {
+ std::cerr << "rbd: " << e.what() << std::endl;
+ return EXIT_FAILURE;
+ } catch (po::too_many_positional_options_error& e) {
+ std::cerr << "rbd: too many arguments" << std::endl;
+ return EXIT_FAILURE;
+ } catch (po::error& e) {
+ std::cerr << "rbd: " << e.what() << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ return 0;
+}
+
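+// extract the subcommand words from the raw arguments: '-h'/'--help' maps to the help
+// spec, everything after '--' is positional, and an option's value is skipped unless
+// the option is a known switch or uses the --opt=value form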
+void Shell::get_command_spec(const std::vector<std::string> &arguments,
+ std::vector<std::string> *command_spec) {
+ for (size_t i = 0; i < arguments.size(); ++i) {
+ std::string arg(arguments[i]);
+ if (arg == "-h" || arg == "--help") {
+ *command_spec = {HELP_SPEC};
+ return;
+ } else if (arg == "--") {
+ // all arguments after a double-dash are positional
+ if (i + 1 < arguments.size()) {
+ command_spec->insert(command_spec->end(),
+ arguments.data() + i + 1,
+ arguments.data() + arguments.size());
+ }
+ return;
+ } else if (arg[0] == '-') {
+ // if the option is not a switch, skip its value
+ if (arg.size() >= 2 &&
+ (arg[1] == '-' ||
+ get_switch_arguments().count(arg.substr(1, 1)) == 0) &&
+ (arg[1] != '-' ||
+ get_switch_arguments().count(arg.substr(2, std::string::npos)) == 0) &&
+ at::SWITCH_ARGUMENTS.count(arg.substr(2, std::string::npos)) == 0 &&
+ arg.find('=') == std::string::npos) {
+ ++i;
+ }
+ } else {
+ command_spec->push_back(arg);
+ }
+ }
+}
+
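+// locate the registered action whose command spec (or alias spec) matches the leading
+// words of the parsed command, reporting whether the alias form was used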
+Shell::Action *Shell::find_action(const CommandSpec &command_spec,
+ CommandSpec **matching_spec, bool *is_alias) {
+ for (size_t i = 0; i < get_actions().size(); ++i) {
+ Action *action = get_actions()[i];
+ if (action->command_spec.size() <= command_spec.size()) {
+ if (std::includes(action->command_spec.begin(),
+ action->command_spec.end(),
+ command_spec.begin(),
+ command_spec.begin() + action->command_spec.size())) {
+ if (matching_spec != NULL) {
+ *matching_spec = &action->command_spec;
+ }
+ *is_alias = false;
+ return action;
+ }
+ }
+ if (!action->alias_command_spec.empty() &&
+ action->alias_command_spec.size() <= command_spec.size()) {
+ if (std::includes(action->alias_command_spec.begin(),
+ action->alias_command_spec.end(),
+ command_spec.begin(),
+ command_spec.begin() +
+ action->alias_command_spec.size())) {
+ if (matching_spec != NULL) {
+ *matching_spec = &action->alias_command_spec;
+ }
+ *is_alias = true;
+ return action;
+ }
+ }
+ }
+ return NULL;
+}
+
+void Shell::get_global_options(po::options_description *opts) {
+ opts->add_options()
+ ((at::CONFIG_PATH + ",c").c_str(), po::value<std::string>(), "path to cluster configuration")
+ ("cluster", po::value<std::string>(), "cluster name")
+ ("id", po::value<std::string>(), "client id (without 'client.' prefix)")
+ ("user", po::value<std::string>(), "client id (without 'client.' prefix)")
+ ("name,n", po::value<std::string>(), "client name")
+ ("mon_host,m", po::value<std::string>(), "monitor host")
+ ("secret", po::value<at::Secret>(), "path to secret key (deprecated)")
+ ("keyfile,K", po::value<std::string>(), "path to secret key")
+ ("keyring,k", po::value<std::string>(), "path to keyring");
+}
+
+void Shell::print_help() {
+ std::cout << "usage: " << APP_NAME << " <command> ..."
+ << std::endl << std::endl
+ << "Command-line interface for managing Ceph RBD images."
+ << std::endl << std::endl;
+
+ std::vector<Action *> actions(get_actions());
+ std::sort(actions.begin(), actions.end(),
+ [](Action *lhs, Action *rhs) { return lhs->command_spec <
+ rhs->command_spec; });
+
+ std::cout << OptionPrinter::POSITIONAL_ARGUMENTS << ":" << std::endl
+ << " <command>" << std::endl;
+
+ // since the commands have spaces, we have to build our own formatter
+ std::string indent(4, ' ');
+ size_t name_width = OptionPrinter::MIN_NAME_WIDTH;
+ for (size_t i = 0; i < actions.size(); ++i) {
+ Action *action = actions[i];
+ std::string name = format_command_name(action->command_spec,
+ action->alias_command_spec);
+ name_width = std::max(name_width, name.size());
+ }
+ name_width += indent.size();
+ name_width = std::min(name_width, OptionPrinter::MAX_DESCRIPTION_OFFSET) + 1;
+
+ for (size_t i = 0; i < actions.size(); ++i) {
+ Action *action = actions[i];
+ if (!action->visible)
+ continue;
+ std::stringstream ss;
+ ss << indent
+ << format_command_name(action->command_spec, action->alias_command_spec);
+
+ std::cout << ss.str();
+ if (!action->description.empty()) {
+ IndentStream indent_stream(name_width, ss.str().size(),
+ OptionPrinter::LINE_WIDTH,
+ std::cout);
+ indent_stream << action->description << std::endl;
+ } else {
+ std::cout << std::endl;
+ }
+ }
+
+ po::options_description global_opts(OptionPrinter::OPTIONAL_ARGUMENTS);
+ get_global_options(&global_opts);
+ std::cout << std::endl << global_opts << std::endl
+ << "See '" << APP_NAME << " help <command>' for help on a specific "
+ << "command." << std::endl;
+}
+
+void Shell::print_action_help(Action *action, bool is_alias) {
+ std::stringstream ss;
+ ss << "usage: " << APP_NAME << " "
+ << format_command_spec(is_alias ? action->alias_command_spec : action->command_spec);
+ std::cout << ss.str();
+
+ po::options_description positional;
+ po::options_description options;
+ (*action->get_arguments)(&positional, &options);
+
+ OptionPrinter option_printer(positional, options);
+ option_printer.print_short(std::cout, ss.str().size());
+
+ if (!action->description.empty()) {
+ std::cout << std::endl << action->description << std::endl;
+ }
+
+ std::cout << std::endl;
+ option_printer.print_detailed(std::cout);
+
+ if (!action->help.empty()) {
+ std::cout << action->help << std::endl;
+ }
+}
+
+void Shell::print_unknown_action(const std::vector<std::string> &command_spec) {
+ std::cerr << "error: unknown option '"
+ << joinify<std::string>(command_spec.begin(),
+ command_spec.end(), " ") << "'"
+ << std::endl << std::endl;
+ print_help();
+}
+
+void Shell::print_bash_completion(const CommandSpec &command_spec) {
+
+ bool is_alias = true;
+
+ Action *action = find_action(command_spec, NULL, &is_alias);
+ po::options_description global_opts;
+ get_global_options(&global_opts);
+ print_bash_completion_options(global_opts);
+
+ if (action != nullptr) {
+ po::options_description positional_opts;
+ po::options_description command_opts;
+ (*action->get_arguments)(&positional_opts, &command_opts);
+ print_bash_completion_options(command_opts);
+ } else {
+ std::cout << "|help";
+ for (size_t i = 0; i < get_actions().size(); ++i) {
+ Action *action = get_actions()[i];
+ std::cout << "|"
+ << joinify<std::string>(action->command_spec.begin(),
+ action->command_spec.end(), " ");
+ if (!action->alias_command_spec.empty()) {
+ std::cout << "|"
+ << joinify<std::string>(action->alias_command_spec.begin(),
+ action->alias_command_spec.end(),
+ " ");
+ }
+ }
+ }
+ std::cout << "|" << std::endl;
+}
+
+void Shell::print_bash_completion_options(const po::options_description &ops) {
+ for (size_t i = 0; i < ops.options().size(); ++i) {
+ auto option = ops.options()[i];
+ std::string long_name(option->canonical_display_name(0));
+ std::string short_name(option->canonical_display_name(
+ po::command_line_style::allow_dash_for_short));
+
+ std::cout << "|--" << long_name << format_option_suffix(option);
+ if (long_name != short_name) {
+ std::cout << "|" << short_name << format_option_suffix(option);
+ }
+ }
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/Shell.h b/src/tools/rbd/Shell.h
new file mode 100644
index 00000000..fe3dee46
--- /dev/null
+++ b/src/tools/rbd/Shell.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_SHELL_H
+#define CEPH_RBD_SHELL_H
+
+#include "include/int_types.h"
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+
+class Shell {
+public:
+ typedef std::vector<std::string> CommandSpec;
+
+ struct Action {
+ typedef void (*GetArguments)(boost::program_options::options_description *,
+ boost::program_options::options_description *);
+ typedef int (*Execute)(const boost::program_options::variables_map &,
+ const std::vector<std::string> &);
+
+ CommandSpec command_spec;
+ CommandSpec alias_command_spec;
+ const std::string description;
+ const std::string help;
+ GetArguments get_arguments;
+ Execute execute;
+ bool visible;
+
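+    // the constructor registers this action with Shell::get_actions(), so statically
+    // constructed actions are discovered without explicit registration calls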
+ template <typename Args, typename Execute>
+ Action(const std::initializer_list<std::string> &command_spec,
+ const std::initializer_list<std::string> &alias_command_spec,
+ const std::string &description, const std::string &help,
+ Args args, Execute execute, bool visible = true)
+ : command_spec(command_spec), alias_command_spec(alias_command_spec),
+ description(description), help(help), get_arguments(args),
+ execute(execute), visible(visible) {
+ Shell::get_actions().push_back(this);
+ }
+
+ };
+
+ struct SwitchArguments {
+ SwitchArguments(const std::initializer_list<std::string> &arguments) {
+ Shell::get_switch_arguments().insert(arguments.begin(), arguments.end());
+ }
+ };
+
+ int execute(int argc, const char **argv);
+
+private:
+ static std::vector<Action *>& get_actions();
+ static std::set<std::string>& get_switch_arguments();
+
+ void get_command_spec(const std::vector<std::string> &arguments,
+ std::vector<std::string> *command_spec);
+ Action *find_action(const CommandSpec &command_spec,
+ CommandSpec **matching_spec, bool *is_alias);
+
+ void get_global_options(boost::program_options::options_description *opts);
+
+ void print_help();
+ void print_action_help(Action *action, bool is_alias);
+ void print_unknown_action(const CommandSpec &command_spec);
+
+ void print_bash_completion(const CommandSpec &command_spec);
+ void print_bash_completion_options(
+ const boost::program_options::options_description &ops);
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_SHELL_H
diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc
new file mode 100644
index 00000000..d4f50022
--- /dev/null
+++ b/src/tools/rbd/Utils.cc
@@ -0,0 +1,907 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Utils.h"
+#include "include/ceph_assert.h"
+#include "include/Context.h"
+#include "include/encoding.h"
+#include "common/common_init.h"
+#include "include/stringify.h"
+#include "include/rbd/features.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <regex>
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+namespace rbd {
+namespace utils {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int ProgressContext::update_progress(uint64_t offset, uint64_t total) {
+ if (progress) {
+ int pc = total ? (offset * 100ull / total) : 0;
+ if (pc != last_pc) {
+ cerr << "\r" << operation << ": "
+ << pc << "% complete...";
+ cerr.flush();
+ last_pc = pc;
+ }
+ }
+ return 0;
+}
+
+void ProgressContext::finish() {
+ if (progress) {
+ cerr << "\r" << operation << ": 100% complete...done." << std::endl;
+ }
+}
+
+void ProgressContext::fail() {
+ if (progress) {
+ cerr << "\r" << operation << ": " << last_pc << "% complete...failed."
+ << std::endl;
+ }
+}
+
+void aio_context_callback(librbd::completion_t completion, void *arg)
+{
+ librbd::RBD::AioCompletion *aio_completion =
+ reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
+ Context *context = reinterpret_cast<Context *>(arg);
+ context->complete(aio_completion->get_return_value());
+ aio_completion->release();
+}
+
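+// read a 32-bit length-prefixed (Ceph-encoded) string from fd, rejecting payloads
+// longer than max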
+int read_string(int fd, unsigned max, std::string *out) {
+ char buf[4];
+
+ int r = safe_read_exact(fd, buf, 4);
+ if (r < 0)
+ return r;
+
+ bufferlist bl;
+ bl.append(buf, 4);
+ auto p = bl.cbegin();
+ uint32_t len;
+ decode(len, p);
+ if (len > max)
+ return -EINVAL;
+
+ char sbuf[len];
+ r = safe_read_exact(fd, sbuf, len);
+ if (r < 0)
+ return r;
+ out->assign(sbuf, len);
+ return len;
+}
+
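+// split an image spec of the form [pool/[namespace/]]image[@snap] into its components;
+// spec_validation controls how strictly each name is checked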
+int extract_spec(const std::string &spec, std::string *pool_name,
+ std::string *namespace_name, std::string *name,
+ std::string *snap_name, SpecValidation spec_validation) {
+ if (!g_ceph_context->_conf.get_val<bool>("rbd_validate_names")) {
+ spec_validation = SPEC_VALIDATION_NONE;
+ }
+
+ std::regex pattern;
+ switch (spec_validation) {
+ case SPEC_VALIDATION_FULL:
+ // disallow "/" and "@" in all names
+ pattern = "^(?:([^/@]+)/(?:([^/@]+)/)?)?([^/@]+)(?:@([^/@]+))?$";
+ break;
+ case SPEC_VALIDATION_SNAP:
+ // disallow "/" and "@" in snap name
+ pattern = "^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$";
+ break;
+ case SPEC_VALIDATION_NONE:
+ // relaxed pattern assumes pool is before first "/",
+ // namespace is before second "/", and snap name is after first "@"
+ pattern = "^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@(.+))?$";
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+
+ std::smatch match;
+ if (!std::regex_match(spec, match, pattern)) {
+ std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (match[1].matched) {
+ if (pool_name != nullptr) {
+ *pool_name = match[1];
+ } else {
+ std::cerr << "rbd: pool name specified for a command that doesn't use it"
+ << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (match[2].matched) {
+ if (namespace_name != nullptr) {
+ *namespace_name = match[2];
+ } else {
+ std::cerr << "rbd: namespace name specified for a command that doesn't "
+ << "use it" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (name != nullptr) {
+ *name = match[3];
+ }
+
+ if (match[4].matched) {
+ if (snap_name != nullptr) {
+ *snap_name = match[4];
+ } else {
+ std::cerr << "rbd: snapshot name specified for a command that doesn't "
+ << "use it" << std::endl;
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+std::string get_positional_argument(const po::variables_map &vm, size_t index) {
+ if (vm.count(at::POSITIONAL_ARGUMENTS) == 0) {
+ return "";
+ }
+
+ const std::vector<std::string> &args =
+ boost::any_cast<std::vector<std::string> >(
+ vm[at::POSITIONAL_ARGUMENTS].value());
+ if (index < args.size()) {
+ return args[index];
+ }
+ return "";
+}
+
+std::string get_default_pool_name() {
+ return g_ceph_context->_conf.get_val<std::string>("rbd_default_pool");
+}
+
+int get_pool_and_namespace_names(
+ const boost::program_options::variables_map &vm,
+ bool default_empty_pool_name, bool validate_pool_name,
+ std::string* pool_name, std::string* namespace_name, size_t *arg_index) {
+ if (namespace_name != nullptr && vm.count(at::NAMESPACE_NAME)) {
+ *namespace_name = vm[at::NAMESPACE_NAME].as<std::string>();
+ }
+
+ if (vm.count(at::POOL_NAME)) {
+ *pool_name = vm[at::POOL_NAME].as<std::string>();
+ } else {
+ *pool_name = get_positional_argument(vm, *arg_index);
+ if (!pool_name->empty()) {
+ if (namespace_name != nullptr) {
+ auto slash_pos = pool_name->find_last_of('/');
+ if (slash_pos != std::string::npos) {
+ *namespace_name = pool_name->substr(slash_pos + 1);
+ }
+ *pool_name = pool_name->substr(0, slash_pos);
+ }
+ ++(*arg_index);
+ }
+ }
+
+ if (default_empty_pool_name && pool_name->empty()) {
+ *pool_name = get_default_pool_name();
+ }
+
+ if (!g_ceph_context->_conf.get_val<bool>("rbd_validate_names")) {
+ validate_pool_name = false;
+ }
+
+ if (validate_pool_name &&
+ pool_name->find_first_of("/@") != std::string::npos) {
+ std::cerr << "rbd: invalid pool '" << *pool_name << "'" << std::endl;
+ return -EINVAL;
+ } else if (namespace_name != nullptr &&
+ namespace_name->find_first_of("/@") != std::string::npos) {
+ std::cerr << "rbd: invalid namespace '" << *namespace_name << "'"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_pool_image_id(const po::variables_map &vm,
+ size_t *spec_arg_index,
+ std::string *pool_name,
+ std::string *namespace_name,
+ std::string *image_id) {
+
+ if (vm.count(at::POOL_NAME) && pool_name != nullptr) {
+ *pool_name = vm[at::POOL_NAME].as<std::string>();
+ }
+ if (vm.count(at::NAMESPACE_NAME) && namespace_name != nullptr) {
+ *namespace_name = vm[at::NAMESPACE_NAME].as<std::string>();
+ }
+ if (vm.count(at::IMAGE_ID) && image_id != nullptr) {
+ *image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ int r;
+ if (image_id != nullptr && spec_arg_index != nullptr && image_id->empty()) {
+ std::string spec = get_positional_argument(vm, (*spec_arg_index)++);
+ if (!spec.empty()) {
+ r = extract_spec(spec, pool_name, namespace_name, image_id, nullptr,
+ SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ if (pool_name != nullptr && pool_name->empty()) {
+ *pool_name = get_default_pool_name();
+ }
+
+ if (image_id != nullptr && image_id->empty()) {
+ std::cerr << "rbd: image id was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_pool_image_snapshot_names(const po::variables_map &vm,
+ at::ArgumentModifier mod,
+ size_t *spec_arg_index,
+ std::string *pool_name,
+ std::string *namespace_name,
+ std::string *image_name,
+ std::string *snap_name,
+ bool image_name_required,
+ SnapshotPresence snapshot_presence,
+ SpecValidation spec_validation) {
+ std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_POOL_NAME : at::POOL_NAME);
+ std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+ return get_pool_generic_snapshot_names(vm, mod, spec_arg_index, pool_key,
+ pool_name, namespace_name, image_key,
+ "image", image_name, snap_name,
+ image_name_required, snapshot_presence,
+ spec_validation);
+}
+
+int get_pool_generic_snapshot_names(const po::variables_map &vm,
+ at::ArgumentModifier mod,
+ size_t *spec_arg_index,
+ const std::string& pool_key,
+ std::string *pool_name,
+ std::string *namespace_name,
+ const std::string& generic_key,
+ const std::string& generic_key_desc,
+ std::string *generic_name,
+ std::string *snap_name,
+ bool generic_name_required,
+ SnapshotPresence snapshot_presence,
+ SpecValidation spec_validation) {
+ std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME);
+ std::string snap_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_SNAPSHOT_NAME : at::SNAPSHOT_NAME);
+
+ if (vm.count(pool_key) && pool_name != nullptr) {
+ *pool_name = vm[pool_key].as<std::string>();
+ }
+ if (vm.count(namespace_key) && namespace_name != nullptr) {
+ *namespace_name = vm[namespace_key].as<std::string>();
+ }
+ if (vm.count(generic_key) && generic_name != nullptr) {
+ *generic_name = vm[generic_key].as<std::string>();
+ }
+ if (vm.count(snap_key) && snap_name != nullptr) {
+ *snap_name = vm[snap_key].as<std::string>();
+ }
+
+ int r;
+ if ((generic_key == at::IMAGE_NAME || generic_key == at::DEST_IMAGE_NAME) &&
+ generic_name != nullptr && !generic_name->empty()) {
+ // despite the separate pool and snapshot name options,
+ // we can also specify them via the image option
+ std::string image_name_copy(*generic_name);
+ r = extract_spec(image_name_copy, pool_name, namespace_name, generic_name,
+ snap_name, spec_validation);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (generic_name != nullptr && spec_arg_index != nullptr &&
+ generic_name->empty()) {
+ std::string spec = get_positional_argument(vm, (*spec_arg_index)++);
+ if (!spec.empty()) {
+ r = extract_spec(spec, pool_name, namespace_name, generic_name, snap_name,
+ spec_validation);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ if (pool_name != nullptr && pool_name->empty()) {
+ *pool_name = get_default_pool_name();
+ }
+
+ if (generic_name != nullptr && generic_name_required &&
+ generic_name->empty()) {
+ std::string prefix = at::get_description_prefix(mod);
+ std::cerr << "rbd: "
+ << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+ << generic_key_desc << " name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ std::regex pattern("^[^@/]+?$");
+ if (spec_validation == SPEC_VALIDATION_FULL) {
+ // validate pool name while creating/renaming/copying/cloning/importing/etc
+ if ((pool_name != nullptr) && !std::regex_match (*pool_name, pattern)) {
+ std::cerr << "rbd: invalid pool name '" << *pool_name << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (namespace_name != nullptr && !namespace_name->empty() &&
+ !std::regex_match (*namespace_name, pattern)) {
+ std::cerr << "rbd: invalid namespace name '" << *namespace_name << "'"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ if (snap_name != nullptr) {
+ r = validate_snapshot_name(mod, *snap_name, snapshot_presence,
+ spec_validation);
+ if (r < 0) {
+ return r;
+ }
+ }
+ return 0;
+}
+
+int validate_snapshot_name(at::ArgumentModifier mod,
+ const std::string &snap_name,
+ SnapshotPresence snapshot_presence,
+ SpecValidation spec_validation) {
+ std::string prefix = at::get_description_prefix(mod);
+ switch (snapshot_presence) {
+ case SNAPSHOT_PRESENCE_PERMITTED:
+ break;
+ case SNAPSHOT_PRESENCE_NONE:
+ if (!snap_name.empty()) {
+ std::cerr << "rbd: "
+ << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+ << "snapshot name specified for a command that doesn't use it"
+ << std::endl;
+ return -EINVAL;
+ }
+ break;
+ case SNAPSHOT_PRESENCE_REQUIRED:
+ if (snap_name.empty()) {
+ std::cerr << "rbd: "
+ << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+ << "snapshot name was not specified" << std::endl;
+ return -EINVAL;
+ }
+ break;
+ }
+
+ if (spec_validation == SPEC_VALIDATION_SNAP) {
+ // disallow "/" and "@" in snap name
+ std::regex pattern("^[^@/]*?$");
+ if (!std::regex_match (snap_name, pattern)) {
+ std::cerr << "rbd: invalid snap name '" << snap_name << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+int get_image_options(const boost::program_options::variables_map &vm,
+ bool get_format, librbd::ImageOptions *opts) {
+ uint64_t order = 0, stripe_unit = 0, stripe_count = 0, object_size = 0;
+ uint64_t features = 0, features_clear = 0;
+ std::string data_pool;
+ bool order_specified = true;
+ bool features_specified = false;
+ bool features_clear_specified = false;
+ bool stripe_specified = false;
+
+ if (vm.count(at::IMAGE_ORDER)) {
+ order = vm[at::IMAGE_ORDER].as<uint64_t>();
+ std::cerr << "rbd: --order is deprecated, use --object-size"
+ << std::endl;
+ } else if (vm.count(at::IMAGE_OBJECT_SIZE)) {
+ object_size = vm[at::IMAGE_OBJECT_SIZE].as<uint64_t>();
+ order = std::round(std::log2(object_size));
+ } else {
+ order_specified = false;
+ }
+
+ if (vm.count(at::IMAGE_FEATURES)) {
+ features = vm[at::IMAGE_FEATURES].as<uint64_t>();
+ features_specified = true;
+ } else {
+ features = get_rbd_default_features(g_ceph_context);
+ }
+
+ if (vm.count(at::IMAGE_STRIPE_UNIT)) {
+ stripe_unit = vm[at::IMAGE_STRIPE_UNIT].as<uint64_t>();
+ stripe_specified = true;
+ }
+
+ if (vm.count(at::IMAGE_STRIPE_COUNT)) {
+ stripe_count = vm[at::IMAGE_STRIPE_COUNT].as<uint64_t>();
+ stripe_specified = true;
+ }
+
+ if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) {
+ if (features_specified) {
+ features &= ~RBD_FEATURES_SINGLE_CLIENT;
+ } else {
+ features_clear |= RBD_FEATURES_SINGLE_CLIENT;
+ features_clear_specified = true;
+ }
+ }
+
+ if (vm.count(at::IMAGE_DATA_POOL)) {
+ data_pool = vm[at::IMAGE_DATA_POOL].as<std::string>();
+ }
+
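+  // determine the image format: explicit features, non-default striping, or a separate
+  // data pool all require format 2, and format 1 is deprecated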
+ if (get_format) {
+ uint64_t format = 0;
+ bool format_specified = false;
+ if (vm.count(at::IMAGE_NEW_FORMAT)) {
+ format = 2;
+ format_specified = true;
+ } else if (vm.count(at::IMAGE_FORMAT)) {
+ format = vm[at::IMAGE_FORMAT].as<uint32_t>();
+ format_specified = true;
+ }
+ if (format == 1) {
+ std::cerr << "rbd: image format 1 is deprecated" << std::endl;
+ }
+
+ if (features_specified && features != 0) {
+ if (format_specified && format == 1) {
+ std::cerr << "rbd: features not allowed with format 1; "
+ << "use --image-format 2" << std::endl;
+ return -EINVAL;
+ } else {
+ format = 2;
+ format_specified = true;
+ }
+ }
+
+ if ((stripe_unit || stripe_count) &&
+ (stripe_unit != (1ull << order) && stripe_count != 1)) {
+ if (format_specified && format == 1) {
+ std::cerr << "rbd: non-default striping not allowed with format 1; "
+ << "use --image-format 2" << std::endl;
+ return -EINVAL;
+ } else {
+ format = 2;
+ format_specified = true;
+ }
+ }
+
+ if (!data_pool.empty()) {
+ if (format_specified && format == 1) {
+ std::cerr << "rbd: data pool not allowed with format 1; "
+ << "use --image-format 2" << std::endl;
+ return -EINVAL;
+ } else {
+ format = 2;
+ format_specified = true;
+ }
+ }
+
+ if (format_specified) {
+ int r = g_conf().set_val("rbd_default_format", stringify(format));
+ ceph_assert(r == 0);
+ opts->set(RBD_IMAGE_OPTION_FORMAT, format);
+ }
+ }
+
+ if (order_specified)
+ opts->set(RBD_IMAGE_OPTION_ORDER, order);
+ if (features_specified)
+ opts->set(RBD_IMAGE_OPTION_FEATURES, features);
+ if (features_clear_specified) {
+ opts->set(RBD_IMAGE_OPTION_FEATURES_CLEAR, features_clear);
+ }
+ if (stripe_specified) {
+ opts->set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+ opts->set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+ }
+ if (!data_pool.empty()) {
+ opts->set(RBD_IMAGE_OPTION_DATA_POOL, data_pool);
+ }
+ int r = get_journal_options(vm, opts);
+ if (r < 0) {
+ return r;
+ }
+
+ r = get_flatten_option(vm, opts);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int get_journal_options(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts) {
+
+ if (vm.count(at::JOURNAL_OBJECT_SIZE)) {
+ uint64_t size = vm[at::JOURNAL_OBJECT_SIZE].as<uint64_t>();
+ uint64_t order = 12;
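+    // round the requested journal object size up to the next power of two (minimum 2^12)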
+ while ((1ULL << order) < size) {
+ order++;
+ }
+ opts->set(RBD_IMAGE_OPTION_JOURNAL_ORDER, order);
+
+ int r = g_conf().set_val("rbd_journal_order", stringify(order));
+ ceph_assert(r == 0);
+ }
+ if (vm.count(at::JOURNAL_SPLAY_WIDTH)) {
+ opts->set(RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH,
+ vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>());
+
+ int r = g_conf().set_val("rbd_journal_splay_width",
+ stringify(
+ vm[at::JOURNAL_SPLAY_WIDTH].as<uint64_t>()));
+ ceph_assert(r == 0);
+ }
+ if (vm.count(at::JOURNAL_POOL)) {
+ opts->set(RBD_IMAGE_OPTION_JOURNAL_POOL,
+ vm[at::JOURNAL_POOL].as<std::string>());
+
+ int r = g_conf().set_val("rbd_journal_pool",
+ vm[at::JOURNAL_POOL].as<std::string>());
+ ceph_assert(r == 0);
+ }
+
+ return 0;
+}
+
+int get_flatten_option(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts) {
+ if (vm.count(at::IMAGE_FLATTEN) && vm[at::IMAGE_FLATTEN].as<bool>()) {
+ uint64_t flatten = 1;
+ opts->set(RBD_IMAGE_OPTION_FLATTEN, flatten);
+ }
+ return 0;
+}
+
+int get_image_size(const boost::program_options::variables_map &vm,
+ uint64_t *size) {
+ if (vm.count(at::IMAGE_SIZE) == 0) {
+ std::cerr << "rbd: must specify --size <M/G/T>" << std::endl;
+ return -EINVAL;
+ }
+
+ *size = vm[at::IMAGE_SIZE].as<uint64_t>();
+ return 0;
+}
+
+int get_path(const boost::program_options::variables_map &vm,
+ size_t *arg_index, std::string *path) {
+ if (vm.count(at::PATH)) {
+ *path = vm[at::PATH].as<std::string>();
+ } else {
+ *path = get_positional_argument(vm, *arg_index);
+ if (!path->empty()) {
+ ++(*arg_index);
+ }
+ }
+
+ if (path->empty()) {
+ std::cerr << "rbd: path was not specified" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int get_formatter(const po::variables_map &vm,
+ at::Format::Formatter *formatter) {
+ if (vm.count(at::FORMAT)) {
+ bool pretty = vm[at::PRETTY_FORMAT].as<bool>();
+ *formatter = vm[at::FORMAT].as<at::Format>().create_formatter(pretty);
+ if (*formatter == nullptr && pretty) {
+ std::cerr << "rbd: --pretty-format only works when --format "
+ << "is json or xml" << std::endl;
+ return -EINVAL;
+ } else if (*formatter != nullptr && !pretty) {
+ formatter->get()->enable_line_break();
+ }
+ } else if (vm[at::PRETTY_FORMAT].as<bool>()) {
+ std::cerr << "rbd: --pretty-format only works when --format "
+ << "is json or xml" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void init_context() {
+ g_conf().set_val_or_die("rbd_cache_writethrough_until_flush", "false");
+ g_conf().apply_changes(nullptr);
+ common_init_finish(g_ceph_context);
+}
+
+int init_rados(librados::Rados *rados) {
+ init_context();
+
+ int r = rados->init_with_context(g_ceph_context);
+ if (r < 0) {
+ std::cerr << "rbd: couldn't initialize rados!" << std::endl;
+ return r;
+ }
+
+ r = rados->connect();
+ if (r < 0) {
+ std::cerr << "rbd: couldn't connect to the cluster!" << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int init(const std::string &pool_name, const std::string& namespace_name,
+ librados::Rados *rados, librados::IoCtx *io_ctx) {
+ init_context();
+
+ int r = init_rados(rados);
+ if (r < 0) {
+ return r;
+ }
+
+ r = init_io_ctx(*rados, pool_name, namespace_name, io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+int init_io_ctx(librados::Rados &rados, const std::string &pool_name,
+ const std::string& namespace_name, librados::IoCtx *io_ctx) {
+ int r = rados.ioctx_create(pool_name.c_str(), *io_ctx);
+ if (r < 0) {
+ if (r == -ENOENT && pool_name == get_default_pool_name()) {
+ std::cerr << "rbd: error opening default pool "
+ << "'" << pool_name << "'" << std::endl
+ << "Ensure that the default pool has been created or specify "
+ << "an alternate pool name." << std::endl;
+ } else {
+ std::cerr << "rbd: error opening pool '" << pool_name << "': "
+ << cpp_strerror(r) << std::endl;
+ }
+ return r;
+ }
+
+ return set_namespace(namespace_name, io_ctx);
+}
+
+int set_namespace(const std::string& namespace_name, librados::IoCtx *io_ctx) {
+ if (!namespace_name.empty()) {
+ librbd::RBD rbd;
+ bool exists = false;
+ int r = rbd.namespace_exists(*io_ctx, namespace_name.c_str(), &exists);
+ if (r < 0) {
+ std::cerr << "rbd: error asserting namespace: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ if (!exists) {
+ std::cerr << "rbd: namespace '" << namespace_name << "' does not exist."
+ << std::endl;
+ return -ENOENT;
+ }
+ }
+ io_ctx->set_namespace(namespace_name);
+ return 0;
+}
+
+void disable_cache() {
+ g_conf().set_val_or_die("rbd_cache", "false");
+}
+
+int open_image(librados::IoCtx &io_ctx, const std::string &image_name,
+ bool read_only, librbd::Image *image) {
+ int r;
+ librbd::RBD rbd;
+ if (read_only) {
+ r = rbd.open_read_only(io_ctx, *image, image_name.c_str(), NULL);
+ } else {
+ r = rbd.open(io_ctx, *image, image_name.c_str());
+ }
+
+ if (r < 0) {
+ std::cerr << "rbd: error opening image " << image_name << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int open_image_by_id(librados::IoCtx &io_ctx, const std::string &image_id,
+ bool read_only, librbd::Image *image) {
+ int r;
+ librbd::RBD rbd;
+ if (read_only) {
+ r = rbd.open_by_id_read_only(io_ctx, *image, image_id.c_str(), NULL);
+ } else {
+ r = rbd.open_by_id(io_ctx, *image, image_id.c_str());
+ }
+
+ if (r < 0) {
+ std::cerr << "rbd: error opening image with id " << image_id << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int init_and_open_image(const std::string &pool_name,
+ const std::string &namespace_name,
+ const std::string &image_name,
+ const std::string &image_id,
+ const std::string &snap_name, bool read_only,
+ librados::Rados *rados, librados::IoCtx *io_ctx,
+ librbd::Image *image) {
+ int r = init(pool_name, namespace_name, rados, io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ if (image_id.empty()) {
+ r = open_image(*io_ctx, image_name, read_only, image);
+ } else {
+ r = open_image_by_id(*io_ctx, image_id, read_only, image);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ if (!snap_name.empty()) {
+ r = snap_set(*image, snap_name);
+ if (r < 0) {
+ return r;
+ }
+ }
+ return 0;
+}
+
+int snap_set(librbd::Image &image, const std::string &snap_name) {
+ int r = image.snap_set(snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "error setting snapshot context: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
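+// scan the buffer in sparse_size chunks starting at buffer_offset and report how many
+// bytes share the same zeroed/non-zeroed state, so callers can split writes into
+// sparse and data extents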
+void calc_sparse_extent(const bufferptr &bp,
+ size_t sparse_size,
+ size_t buffer_offset,
+ uint64_t buffer_length,
+ size_t *write_length,
+ bool *zeroed) {
+ if (sparse_size == 0) {
+ // sparse writes are disabled -- write the full extent
+ ceph_assert(buffer_offset == 0);
+ *write_length = buffer_length;
+ *zeroed = false;
+ return;
+ }
+
+ *write_length = 0;
+ size_t original_offset = buffer_offset;
+ while (buffer_offset < buffer_length) {
+ size_t extent_size = std::min<size_t>(
+ sparse_size, buffer_length - buffer_offset);
+
+ bufferptr extent(bp, buffer_offset, extent_size);
+
+ bool extent_is_zero = extent.is_zero();
+ if (original_offset == buffer_offset) {
+ *zeroed = extent_is_zero;
+ } else if (*zeroed != extent_is_zero) {
+ ceph_assert(*write_length > 0);
+ return;
+ }
+
+ buffer_offset += extent_size;
+ *write_length += extent_size;
+ }
+}
+
+std::string image_id(librbd::Image& image) {
+ std::string id;
+ int r = image.get_id(&id);
+ if (r < 0) {
+ return std::string();
+ }
+ return id;
+}
+
+std::string mirror_image_state(librbd::mirror_image_state_t state) {
+ switch (state) {
+ case RBD_MIRROR_IMAGE_DISABLING:
+ return "disabling";
+ case RBD_MIRROR_IMAGE_ENABLED:
+ return "enabled";
+ case RBD_MIRROR_IMAGE_DISABLED:
+ return "disabled";
+ default:
+ return "unknown";
+ }
+}
+
+std::string mirror_image_status_state(librbd::mirror_image_status_state_t state) {
+ switch (state) {
+ case MIRROR_IMAGE_STATUS_STATE_UNKNOWN:
+ return "unknown";
+ case MIRROR_IMAGE_STATUS_STATE_ERROR:
+ return "error";
+ case MIRROR_IMAGE_STATUS_STATE_SYNCING:
+ return "syncing";
+ case MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY:
+ return "starting_replay";
+ case MIRROR_IMAGE_STATUS_STATE_REPLAYING:
+ return "replaying";
+ case MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY:
+ return "stopping_replay";
+ case MIRROR_IMAGE_STATUS_STATE_STOPPED:
+ return "stopped";
+ default:
+ return "unknown (" + stringify(static_cast<uint32_t>(state)) + ")";
+ }
+}
+
+std::string mirror_image_status_state(librbd::mirror_image_status_t status) {
+ return (status.up ? "up+" : "down+") +
+ mirror_image_status_state(status.state);
+}
+
+std::string timestr(time_t t) {
+ struct tm tm;
+
+ localtime_r(&t, &tm);
+
+ char buf[32];
+ strftime(buf, sizeof(buf), "%F %T", &tm);
+
+ return buf;
+}
+
+uint64_t get_rbd_default_features(CephContext* cct) {
+ auto features = cct->_conf.get_val<std::string>("rbd_default_features");
+ return boost::lexical_cast<uint64_t>(features);
+}
+
+bool is_not_user_snap_namespace(librbd::Image* image,
+ const librbd::snap_info_t &snap_info)
+{
+ librbd::snap_namespace_type_t namespace_type;
+ int r = image->snap_get_namespace_type(snap_info.id, &namespace_type);
+ if (r < 0) {
+ return false;
+ }
+ return namespace_type != RBD_SNAP_NAMESPACE_TYPE_USER;
+}
+
+} // namespace utils
+} // namespace rbd
diff --git a/src/tools/rbd/Utils.h b/src/tools/rbd/Utils.h
new file mode 100644
index 00000000..81ea2c71
--- /dev/null
+++ b/src/tools/rbd/Utils.h
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_UTILS_H
+#define CEPH_RBD_UTILS_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "tools/rbd/ArgumentTypes.h"
+#include <string>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace utils {
+
+namespace detail {
+
+template <typename T, void(T::*MF)(int)>
+void aio_completion_callback(librbd::completion_t completion,
+ void *arg) {
+ librbd::RBD::AioCompletion *aio_completion =
+ reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
+
+ // complete the AIO callback in a separate thread context
+ T *t = reinterpret_cast<T *>(arg);
+ int r = aio_completion->get_return_value();
+ aio_completion->release();
+
+ (t->*MF)(r);
+}
+
+} // namespace detail
+
+static const std::string RBD_DIFF_BANNER ("rbd diff v1\n");
+static const size_t RBD_DEFAULT_SPARSE_SIZE = 4096;
+
+static const std::string RBD_IMAGE_BANNER_V2 ("rbd image v2\n");
+static const std::string RBD_IMAGE_DIFFS_BANNER_V2 ("rbd image diffs v2\n");
+static const std::string RBD_DIFF_BANNER_V2 ("rbd diff v2\n");
+
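+// single-character record tags used by the "rbd export"/"rbd export-diff"
+// stream formats (v1 and v2)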
+#define RBD_DIFF_FROM_SNAP 'f'
+#define RBD_DIFF_TO_SNAP 't'
+#define RBD_DIFF_IMAGE_SIZE 's'
+#define RBD_DIFF_WRITE 'w'
+#define RBD_DIFF_ZERO 'z'
+#define RBD_DIFF_END 'e'
+
+#define RBD_SNAP_PROTECTION_STATUS 'p'
+
+#define RBD_EXPORT_IMAGE_ORDER 'O'
+#define RBD_EXPORT_IMAGE_FEATURES 'T'
+#define RBD_EXPORT_IMAGE_STRIPE_UNIT 'U'
+#define RBD_EXPORT_IMAGE_STRIPE_COUNT 'C'
+#define RBD_EXPORT_IMAGE_META 'M'
+#define RBD_EXPORT_IMAGE_END 'E'
+
+enum SnapshotPresence {
+ SNAPSHOT_PRESENCE_NONE,
+ SNAPSHOT_PRESENCE_PERMITTED,
+ SNAPSHOT_PRESENCE_REQUIRED
+};
+
+enum SpecValidation {
+ SPEC_VALIDATION_FULL,
+ SPEC_VALIDATION_SNAP,
+ SPEC_VALIDATION_NONE
+};
+
+struct ProgressContext : public librbd::ProgressContext {
+ const char *operation;
+ bool progress;
+ int last_pc;
+
+ ProgressContext(const char *o, bool no_progress)
+ : operation(o), progress(!no_progress), last_pc(0) {
+ }
+
+ int update_progress(uint64_t offset, uint64_t total) override;
+ void finish();
+ void fail();
+};
+
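+// binds a member function (receiving the AIO return value) as the completion
+// callback of a librbd::RBD::AioCompletion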
+template <typename T, void(T::*MF)(int)>
+librbd::RBD::AioCompletion *create_aio_completion(T *t) {
+ return new librbd::RBD::AioCompletion(
+ t, &detail::aio_completion_callback<T, MF>);
+}
+
+void aio_context_callback(librbd::completion_t completion, void *arg);
+
+int read_string(int fd, unsigned max, std::string *out);
+
+int extract_spec(const std::string &spec, std::string *pool_name,
+ std::string *namespace_name, std::string *name,
+ std::string *snap_name, SpecValidation spec_validation);
+
+std::string get_positional_argument(
+ const boost::program_options::variables_map &vm, size_t index);
+
+std::string get_default_pool_name();
+int get_pool_and_namespace_names(
+ const boost::program_options::variables_map &vm,
+ bool default_empty_pool_name, bool validate_pool_name,
+ std::string* pool_name, std::string* namespace_name, size_t *arg_index);
+
+int get_pool_image_snapshot_names(
+ const boost::program_options::variables_map &vm,
+ argument_types::ArgumentModifier mod, size_t *spec_arg_index,
+ std::string *pool_name, std::string *namespace_name,
+ std::string *image_name, std::string *snap_name, bool image_name_required,
+ SnapshotPresence snapshot_presence, SpecValidation spec_validation);
+
+int get_pool_generic_snapshot_names(
+ const boost::program_options::variables_map &vm,
+ argument_types::ArgumentModifier mod, size_t *spec_arg_index,
+ const std::string& pool_key, std::string *pool_name,
+ std::string *namespace_name, const std::string& generic_key,
+ const std::string& generic_key_desc, std::string *generic_name,
+ std::string *snap_name, bool generic_name_required,
+ SnapshotPresence snapshot_presence, SpecValidation spec_validation);
+
+int get_pool_image_id(const boost::program_options::variables_map &vm,
+ size_t *spec_arg_index,
+ std::string *pool_name,
+ std::string *namespace_name,
+ std::string *image_id);
+
+int validate_snapshot_name(argument_types::ArgumentModifier mod,
+ const std::string &snap_name,
+ SnapshotPresence snapshot_presence,
+ SpecValidation spec_validation);
+
+int get_image_options(const boost::program_options::variables_map &vm,
+ bool get_format, librbd::ImageOptions* opts);
+
+int get_journal_options(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts);
+
+int get_flatten_option(const boost::program_options::variables_map &vm,
+ librbd::ImageOptions *opts);
+
+int get_image_size(const boost::program_options::variables_map &vm,
+ uint64_t *size);
+
+int get_path(const boost::program_options::variables_map &vm,
+ size_t *arg_index, std::string *path);
+
+int get_formatter(const boost::program_options::variables_map &vm,
+ argument_types::Format::Formatter *formatter);
+
+void init_context();
+
+int init_rados(librados::Rados *rados);
+
+int init(const std::string &pool_name, const std::string& namespace_name,
+ librados::Rados *rados, librados::IoCtx *io_ctx);
+int init_io_ctx(librados::Rados &rados, const std::string &pool_name,
+ const std::string& namespace_name, librados::IoCtx *io_ctx);
+int set_namespace(const std::string& namespace_name, librados::IoCtx *io_ctx);
+
+void disable_cache();
+
+int open_image(librados::IoCtx &io_ctx, const std::string &image_name,
+ bool read_only, librbd::Image *image);
+
+int open_image_by_id(librados::IoCtx &io_ctx, const std::string &image_id,
+ bool read_only, librbd::Image *image);
+
+int init_and_open_image(const std::string &pool_name,
+ const std::string &namespace_name,
+ const std::string &image_name,
+ const std::string &image_id,
+ const std::string &snap_name, bool read_only,
+ librados::Rados *rados, librados::IoCtx *io_ctx,
+ librbd::Image *image);
+
+int snap_set(librbd::Image &image, const std::string &snap_name);
+
+void calc_sparse_extent(const bufferptr &bp,
+ size_t sparse_size,
+ size_t buffer_offset,
+ uint64_t length,
+ size_t *write_length,
+ bool *zeroed);
+
+bool is_not_user_snap_namespace(librbd::Image* image,
+ const librbd::snap_info_t &snap_info);
+
+std::string image_id(librbd::Image& image);
+
+std::string mirror_image_state(librbd::mirror_image_state_t mirror_image_state);
+std::string mirror_image_status_state(librbd::mirror_image_status_state_t state);
+std::string mirror_image_status_state(librbd::mirror_image_status_t status);
+
+std::string timestr(time_t t);
+
+// duplicated here to avoid linking against the librbd_internal library
+uint64_t get_rbd_default_features(CephContext* cct);
+
+} // namespace utils
+} // namespace rbd
+
+#endif // CEPH_RBD_UTILS_H
diff --git a/src/tools/rbd/action/Bench.cc b/src/tools/rbd/action/Bench.cc
new file mode 100644
index 00000000..27843c7b
--- /dev/null
+++ b/src/tools/rbd/action/Bench.cc
@@ -0,0 +1,539 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "global/signal_handler.h"
+#include <atomic>
+#include <chrono>
+#include <iostream>
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/rolling_sum.hpp>
+#include <boost/program_options.hpp>
+
+using namespace std::chrono;
+
+static std::atomic<bool> terminating;
+static void handle_signal(int signum)
+{
+ ceph_assert(signum == SIGINT || signum == SIGTERM);
+ terminating = true;
+}
+
+namespace rbd {
+namespace action {
+namespace bench {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+enum io_type_t {
+ IO_TYPE_READ = 0,
+ IO_TYPE_WRITE,
+ IO_TYPE_RW,
+
+ IO_TYPE_NUM,
+};
+
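+// tag types that select the custom boost::program_options validate()
+// overloads defined below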
+struct IOType {};
+struct Size {};
+struct IOPattern {};
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Size *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ std::string parse_error;
+ uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(size);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ IOPattern *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ if (s == "rand") {
+ v = boost::any(true);
+ } else if (s == "seq") {
+ v = boost::any(false);
+ } else {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
+io_type_t get_io_type(string io_type_string) {
+ if (io_type_string == "read")
+ return IO_TYPE_READ;
+ else if (io_type_string == "write")
+ return IO_TYPE_WRITE;
+ else if (io_type_string == "readwrite" || io_type_string == "rw")
+ return IO_TYPE_RW;
+ else
+ return IO_TYPE_NUM;
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ IOType *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ io_type_t io_type = get_io_type(s);
+ if (io_type >= IO_TYPE_NUM)
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ else
+ v = boost::any(io_type);
+}
+
+} // anonymous namespace
+
+static void rbd_bencher_completion(void *c, void *pc);
+struct rbd_bencher;
+
+struct bencher_completer {
+ rbd_bencher *bencher;
+ bufferlist *bl;
+
+public:
+ bencher_completer(rbd_bencher *bencher, bufferlist *bl)
+ : bencher(bencher), bl(bl)
+ { }
+
+ ~bencher_completer()
+ {
+ if (bl)
+ delete bl;
+ }
+};
+
+struct rbd_bencher {
+ librbd::Image *image;
+ Mutex lock;
+ Cond cond;
+ int in_flight;
+ io_type_t io_type;
+ uint64_t io_size;
+ bufferlist write_bl;
+
+ explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size)
+ : image(i),
+ lock("rbd_bencher::lock"),
+ in_flight(0),
+ io_type(io_type),
+ io_size(io_size)
+ {
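+ // for write/readwrite benchmarks, pre-fill a reusable write buffer with a
+ // random byte pattern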
+ if (io_type == IO_TYPE_WRITE || io_type == IO_TYPE_RW) {
+ bufferptr bp(io_size);
+ memset(bp.c_str(), rand() & 0xff, io_size);
+ write_bl.push_back(bp);
+ }
+ }
+
+ void start_io(int max, uint64_t off, uint64_t len, int op_flags, bool read_flag)
+ {
+ {
+ Mutex::Locker l(lock);
+ in_flight++;
+ }
+
+ librbd::RBD::AioCompletion *c;
+ if (read_flag) {
+ bufferlist *read_bl = new bufferlist();
+ c = new librbd::RBD::AioCompletion((void *)(new bencher_completer(this, read_bl)),
+ rbd_bencher_completion);
+ image->aio_read2(off, len, *read_bl, c, op_flags);
+ } else {
+ c = new librbd::RBD::AioCompletion((void *)(new bencher_completer(this, NULL)),
+ rbd_bencher_completion);
+ image->aio_write2(off, len, write_bl, c, op_flags);
+ }
+ }
+
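+ // wait until at most 'max' IOs remain in flight (polling every 200ms);
+ // returns -EINTR once a termination signal has been received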
+ int wait_for(int max, bool interrupt_on_terminating) {
+ Mutex::Locker l(lock);
+ while (in_flight > max && !(terminating && interrupt_on_terminating)) {
+ utime_t dur;
+ dur.set_from_double(.2);
+ cond.WaitInterval(lock, dur);
+ }
+
+ return terminating ? -EINTR : 0;
+ }
+
+};
+
+void rbd_bencher_completion(void *vc, void *pc)
+{
+ librbd::RBD::AioCompletion *c = (librbd::RBD::AioCompletion *)vc;
+ bencher_completer *bc = static_cast<bencher_completer *>(pc);
+ rbd_bencher *b = bc->bencher;
+ //cout << "complete " << c << std::endl;
+ int ret = c->get_return_value();
+ if (b->io_type == IO_TYPE_WRITE && ret != 0) {
+ cout << "write error: " << cpp_strerror(ret) << std::endl;
+ exit(ret < 0 ? -ret : ret);
+ } else if (b->io_type == IO_TYPE_READ && (unsigned int)ret != b->io_size) {
+ cout << "read error: " << cpp_strerror(ret) << std::endl;
+ exit(ret < 0 ? -ret : ret);
+ }
+ b->lock.Lock();
+ b->in_flight--;
+ b->cond.Signal();
+ b->lock.Unlock();
+ c->release();
+ delete bc;
+}
+
+bool should_read(uint64_t read_proportion)
+{
+ uint64_t rand_num = rand() % 100;
+
+ if (rand_num < read_proportion)
+ return true;
+ else
+ return false;
+}
+
+int do_bench(librbd::Image& image, io_type_t io_type,
+ uint64_t io_size, uint64_t io_threads,
+ uint64_t io_bytes, bool random, uint64_t read_proportion)
+{
+ uint64_t size = 0;
+ image.size(&size);
+ if (io_size > size) {
+ std::cerr << "rbd: io-size " << byte_u_t(io_size) << " "
+ << "larger than image size " << byte_u_t(size) << std::endl;
+ return -EINVAL;
+ }
+
+ if (io_size > std::numeric_limits<uint32_t>::max()) {
+ std::cerr << "rbd: io-size should be less than 4G" << std::endl;
+ return -EINVAL;
+ }
+
+ int r = image.flush();
+ if (r < 0 && (r != -EROFS || io_type != IO_TYPE_READ)) {
+ std::cerr << "rbd: failed to flush: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ rbd_bencher b(&image, io_type, io_size);
+
+ std::cout << "bench "
+ << " type " << (io_type == IO_TYPE_READ ? "read" :
+ io_type == IO_TYPE_WRITE ? "write" : "readwrite")
+ << (io_type == IO_TYPE_RW ? " read:write=" +
+ to_string(read_proportion) + ":" + to_string(100 - read_proportion) : "")
+ << " io_size " << io_size
+ << " io_threads " << io_threads
+ << " bytes " << io_bytes
+ << " pattern " << (random ? "random" : "sequential")
+ << std::endl;
+
+ srand(time(NULL) % (unsigned long) -1);
+
+ coarse_mono_time start = coarse_mono_clock::now();
+ chrono::duration<double> last = chrono::duration<double>::zero();
+ unsigned ios = 0;
+
+ vector<uint64_t> thread_offset;
+ uint64_t i;
+ uint64_t start_pos;
+
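+ // number of io_size chunks each thread owns in the sequential pattern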
+ uint64_t unit_len = size/io_size/io_threads;
+ // pick a starting offset for each thread
+ for (i = 0; i < io_threads; i++) {
+ if (random) {
+ start_pos = (rand() % (size / io_size)) * io_size;
+ } else {
+ start_pos = unit_len * i * io_size;
+ }
+ thread_offset.push_back(start_pos);
+ }
+
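+ // rolling window over the last 5 one-second intervals, used for the
+ // periodic OPS and bytes/sec report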
+ const int WINDOW_SIZE = 5;
+ typedef boost::accumulators::accumulator_set<
+ double, boost::accumulators::stats<
+ boost::accumulators::tag::rolling_sum> > RollingSum;
+
+ RollingSum time_acc(
+ boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+ RollingSum ios_acc(
+ boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+ RollingSum off_acc(
+ boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+ uint64_t cur_ios = 0;
+ uint64_t cur_off = 0;
+
+ int op_flags;
+ if (random) {
+ op_flags = LIBRADOS_OP_FLAG_FADVISE_RANDOM;
+ } else {
+ op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+ }
+
+ printf(" SEC OPS OPS/SEC BYTES/SEC\n");
+ uint64_t off;
+ int read_ops = 0;
+ int write_ops = 0;
+
+ for (off = 0; off < io_bytes; ) {
+ // Issue I/O
+ i = 0;
+ int r = 0;
+ while (i < io_threads && off < io_bytes) {
+ bool read_flag = should_read(read_proportion);
+
+ r = b.wait_for(io_threads - 1, true);
+ if (r < 0) {
+ break;
+ }
+ b.start_io(io_threads, thread_offset[i], io_size, op_flags, read_flag);
+
+ ++i;
+ ++ios;
+ off += io_size;
+
+ ++cur_ios;
+ cur_off += io_size;
+
+ if (read_flag)
+ read_ops++;
+ else
+ write_ops++;
+ }
+
+ if (r < 0) {
+ break;
+ }
+
+ // Set the thread_offsets of next I/O
+ for (i = 0; i < io_threads; ++i) {
+ if (random) {
+ thread_offset[i] = (rand() % (size / io_size)) * io_size;
+ continue;
+ }
+ if (off < (io_size * unit_len * io_threads) ) {
+ thread_offset[i] += io_size;
+ } else {
+ // the per-thread regions are exhausted; move on to the leftover chunks
+ thread_offset[i] = off + (i * io_size);
+ }
+ if (thread_offset[i] + io_size > size)
+ thread_offset[i] = unit_len * i * io_size;
+ }
+
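+ // once the elapsed whole second changes, fold this interval into the
+ // rolling window and print a progress line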
+ coarse_mono_time now = coarse_mono_clock::now();
+ chrono::duration<double> elapsed = now - start;
+ if (last == chrono::duration<double>::zero()) {
+ last = elapsed;
+ } else if ((int)elapsed.count() != (int)last.count()) {
+ time_acc((elapsed - last).count());
+ ios_acc(static_cast<double>(cur_ios));
+ off_acc(static_cast<double>(cur_off));
+ cur_ios = 0;
+ cur_off = 0;
+
+ double time_sum = boost::accumulators::rolling_sum(time_acc);
+ printf("%5d %8d %8.2lf %8.2lf\n",
+ (int)elapsed.count(),
+ (int)(ios - io_threads),
+ boost::accumulators::rolling_sum(ios_acc) / time_sum,
+ boost::accumulators::rolling_sum(off_acc) / time_sum);
+ last = elapsed;
+ }
+ }
+ b.wait_for(0, false);
+
+ if (io_type != IO_TYPE_READ) {
+ r = image.flush();
+ if (r < 0) {
+ std::cerr << "rbd: failed to flush at the end: " << cpp_strerror(r)
+ << std::endl;
+ }
+ }
+
+ coarse_mono_time now = coarse_mono_clock::now();
+ chrono::duration<double> elapsed = now - start;
+
+ printf("elapsed: %5d ops: %8d ops/sec: %8.2lf bytes/sec: %8.2lf\n",
+ (int)elapsed.count(), ios, (double)ios / elapsed.count(),
+ (double)off / elapsed.count());
+
+ if (io_type == IO_TYPE_RW) {
+ printf("read_ops: %5d read_ops/sec: %8.2lf read_bytes/sec: %8.2lf\n",
+ read_ops, (double)read_ops / elapsed.count(),
+ (double)read_ops * io_size / elapsed.count());
+
+ printf("write_ops: %5d write_ops/sec: %8.2lf write_bytes/sec: %8.2lf\n",
+ write_ops, (double)write_ops / elapsed.count(),
+ (double)write_ops * io_size / elapsed.count());
+ }
+
+ return 0;
+}
+
+void add_bench_common_options(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+
+ options->add_options()
+ ("io-size", po::value<Size>(), "IO size (in B/K/M/G/T) [default: 4K]")
+ ("io-threads", po::value<uint32_t>(), "ios in flight [default: 16]")
+ ("io-total", po::value<Size>(), "total size for IO (in B/K/M/G/T) [default: 1G]")
+ ("io-pattern", po::value<IOPattern>(), "IO pattern (rand or seq) [default: seq]")
+ ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]");
+}
+
+void get_arguments_for_write(po::options_description *positional,
+ po::options_description *options) {
+ add_bench_common_options(positional, options);
+}
+
+void get_arguments_for_bench(po::options_description *positional,
+ po::options_description *options) {
+ add_bench_common_options(positional, options);
+
+ options->add_options()
+ ("io-type", po::value<IOType>()->required(), "IO type (read , write, or readwrite(rw))");
+}
+
+int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ utils::SnapshotPresence snap_presence = utils::SNAPSHOT_PRESENCE_NONE;
+ if (bench_io_type == IO_TYPE_READ)
+ snap_presence = utils::SNAPSHOT_PRESENCE_PERMITTED;
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, snap_presence, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t bench_io_size;
+ if (vm.count("io-size")) {
+ bench_io_size = vm["io-size"].as<uint64_t>();
+ } else {
+ bench_io_size = 4096;
+ }
+ if (bench_io_size == 0) {
+ std::cerr << "rbd: --io-size should be greater than zero." << std::endl;
+ return -EINVAL;
+ }
+
+ uint32_t bench_io_threads;
+ if (vm.count("io-threads")) {
+ bench_io_threads = vm["io-threads"].as<uint32_t>();
+ } else {
+ bench_io_threads = 16;
+ }
+ if (bench_io_threads == 0) {
+ std::cerr << "rbd: --io-threads should be greater than zero." << std::endl;
+ return -EINVAL;
+ }
+
+ uint64_t bench_bytes;
+ if (vm.count("io-total")) {
+ bench_bytes = vm["io-total"].as<uint64_t>();
+ } else {
+ bench_bytes = 1 << 30;
+ }
+
+ bool bench_random;
+ if (vm.count("io-pattern")) {
+ bench_random = vm["io-pattern"].as<bool>();
+ } else {
+ bench_random = false;
+ }
+
+ uint64_t bench_read_proportion;
+ if (bench_io_type == IO_TYPE_READ) {
+ bench_read_proportion = 100;
+ } else if (bench_io_type == IO_TYPE_WRITE) {
+ bench_read_proportion = 0;
+ } else {
+ if (vm.count("rw-mix-read")) {
+ bench_read_proportion = vm["rw-mix-read"].as<uint64_t>();
+ } else {
+ bench_read_proportion = 50;
+ }
+
+ if (bench_read_proportion > 100) {
+ std::cerr << "rbd: --rw-mix-read should not be larger than 100." << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, sighup_handler);
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+ r = do_bench(image, bench_io_type, bench_io_size, bench_io_threads,
+ bench_bytes, bench_random, bench_read_proportion);
+
+ unregister_async_signal_handler(SIGHUP, sighup_handler);
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ if (r < 0) {
+ std::cerr << "bench failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int execute_for_write(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::cerr << "rbd: bench-write is deprecated, use rbd bench --io-type write ..." << std::endl;
+ return bench_execute(vm, IO_TYPE_WRITE);
+}
+
+int execute_for_bench(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ io_type_t bench_io_type;
+ if (vm.count("io-type")) {
+ bench_io_type = vm["io-type"].as<io_type_t>();
+ } else {
+ std::cerr << "rbd: --io-type must be specified." << std::endl;
+ return -EINVAL;
+ }
+
+ return bench_execute(vm, bench_io_type);
+}
+
+Shell::Action action_write(
+ {"bench-write"}, {}, "Simple write benchmark. (Deprecated, please use `rbd bench --io-type write` instead.)",
+ "", &get_arguments_for_write, &execute_for_write, false);
+
+Shell::Action action_bench(
+ {"bench"}, {}, "Simple benchmark.", "", &get_arguments_for_bench, &execute_for_bench);
+
+} // namespace bench
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Children.cc b/src/tools/rbd/action/Children.cc
new file mode 100644
index 00000000..f459e92b
--- /dev/null
+++ b/src/tools/rbd/action/Children.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace children {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_list_children(librados::IoCtx &io_ctx, librbd::Image &image,
+ bool all_flag, bool descendants_flag, Formatter *f)
+{
+ std::vector<librbd::linked_image_spec_t> children;
+ librbd::RBD rbd;
+ int r;
+ if (descendants_flag) {
+ r = image.list_descendants(&children);
+ } else {
+ r = image.list_children3(&children);
+ }
+ if (r < 0)
+ return r;
+
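+ // children that live in the trash are only reported when --all is specified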
+ if (f)
+ f->open_array_section("children");
+
+ for (auto& child : children) {
+ bool trash = child.trash;
+ if (f) {
+ if (all_flag) {
+ f->open_object_section("child");
+ f->dump_string("pool", child.pool_name);
+ f->dump_string("pool_namespace", child.pool_namespace);
+ f->dump_string("image", child.image_name);
+ f->dump_string("id", child.image_id);
+ f->dump_bool("trash", child.trash);
+ f->close_section();
+ } else if (!trash) {
+ f->open_object_section("child");
+ f->dump_string("pool", child.pool_name);
+ f->dump_string("pool_namespace", child.pool_namespace);
+ f->dump_string("image", child.image_name);
+ f->close_section();
+ }
+ } else if (all_flag || !trash) {
+ if (child.pool_name.empty()) {
+ std::cout << "(child missing " << child.pool_id << "/";
+ } else {
+ std::cout << child.pool_name << "/";
+ }
+ if (!child.pool_namespace.empty()) {
+ std::cout << child.pool_namespace << "/";
+ }
+ if (child.image_name.empty()) {
+ std::cout << child.image_id << ")";
+ } else {
+ std::cout << child.image_name;
+ if (trash) {
+ std::cout << " (trash " << child.image_id << ")";
+ }
+ }
+ std::cout << std::endl;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ at::add_snap_id_option(options);
+ options->add_options()
+ ("all,a", po::bool_switch(), "list all children (include trash)");
+ options->add_options()
+ ("descendants", po::bool_switch(), "include all descendants");
+ at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ uint64_t snap_id = LIBRADOS_SNAP_HEAD;
+ if (vm.count(at::SNAPSHOT_ID)) {
+ snap_id = vm[at::SNAPSHOT_ID].as<uint64_t>();
+ }
+
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (snap_id != LIBRADOS_SNAP_HEAD && !snap_name.empty()) {
+ std::cerr << "rbd: trying to access snapshot using both name and id."
+ << std::endl;
+ return -EINVAL;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!snap_name.empty()) {
+ r = image.snap_set(snap_name.c_str());
+ } else if (snap_id != LIBRADOS_SNAP_HEAD) {
+ r = image.snap_set_by_id(snap_id);
+ }
+ if (r == -ENOENT) {
+ std::cerr << "rbd: snapshot does not exist." << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: error setting snapshot: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ r = do_list_children(io_ctx, image, vm["all"].as<bool>(),
+ vm["descendants"].as<bool>(), formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: listing children failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"children"}, {}, "Display children of an image or its snapshot.", "",
+ &get_arguments, &execute);
+
+} // namespace children
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Clone.cc b/src/tools/rbd/action/Clone.cc
new file mode 100644
index 00000000..6406c957
--- /dev/null
+++ b/src/tools/rbd/action/Clone.cc
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace clone {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_clone(librbd::RBD &rbd, librados::IoCtx &p_ioctx,
+ const char *p_name, const char *p_snapname,
+ librados::IoCtx &c_ioctx, const char *c_name,
+ librbd::ImageOptions& opts) {
+ return rbd.clone3(p_ioctx, p_name, p_snapname, c_ioctx, c_name, opts);
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_create_image_options(options, false);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dst_pool_name;
+ std::string dst_namespace_name;
+ std::string dst_image_name;
+ std::string dst_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
+ &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, false, &opts);
+ if (r < 0) {
+ return r;
+ }
+ opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2));
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx dst_io_ctx;
+ r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = do_clone(rbd, io_ctx, image_name.c_str(), snap_name.c_str(), dst_io_ctx,
+ dst_image_name.c_str(), opts);
+ if (r == -EXDEV) {
+ std::cerr << "rbd: clone v2 required for cross-namespace clones."
+ << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: clone error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"clone"}, {}, "Clone a snapshot into a CoW child image.",
+ at::get_long_features_help(), &get_arguments, &execute);
+
+} // namespace clone
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Config.cc b/src/tools/rbd/action/Config.cc
new file mode 100644
index 00000000..2868c7ad
--- /dev/null
+++ b/src/tools/rbd/action/Config.cc
@@ -0,0 +1,890 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/escape.h"
+#include "common/errno.h"
+#include "common/options.h"
+#include "global/global_context.h"
+#include "include/stringify.h"
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+
+#include <iostream>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace config {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
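+// pool- and image-level overrides are stored as metadata keys with this prefix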
+const std::string METADATA_CONF_PREFIX = "conf_";
+const uint32_t MAX_KEYS = 64;
+
+void add_config_entity_option(
+ boost::program_options::options_description *positional) {
+ positional->add_options()
+ ("config-entity", "config entity (global, client, client.<id>)");
+}
+
+void add_pool_option(boost::program_options::options_description *positional) {
+ positional->add_options()
+ ("pool-name", "pool name");
+}
+
+void add_key_option(po::options_description *positional) {
+ positional->add_options()
+ ("key", "config key");
+}
+
+int get_config_entity(const po::variables_map &vm, std::string *config_entity) {
+ *config_entity = utils::get_positional_argument(vm, 0);
+
+ if (*config_entity != "global" && *config_entity != "client" &&
+ !boost::starts_with(*config_entity, ("client."))) {
+ std::cerr << "rbd: invalid config entity: " << *config_entity
+ << " (must be global, client or client.<id>)" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_pool(const po::variables_map &vm, std::string *pool_name) {
+ *pool_name = utils::get_positional_argument(vm, 0);
+ if (pool_name->empty()) {
+ std::cerr << "rbd: pool name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_key(const po::variables_map &vm, size_t *arg_index,
+ std::string *key) {
+ *key = utils::get_positional_argument(vm, *arg_index);
+ if (key->empty()) {
+ std::cerr << "rbd: config key was not specified" << std::endl;
+ return -EINVAL;
+ } else {
+ ++(*arg_index);
+ }
+
+ if (!boost::starts_with(*key, "rbd_")) {
+ std::cerr << "rbd: not rbd option: " << *key << std::endl;
+ return -EINVAL;
+ }
+
+ std::string value;
+ int r = g_ceph_context->_conf.get_val(key->c_str(), &value);
+ if (r < 0) {
+ std::cerr << "rbd: invalid config key: " << *key << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const librbd::config_source_t& source) {
+ switch (source) {
+ case RBD_CONFIG_SOURCE_CONFIG:
+ os << "config";
+ break;
+ case RBD_CONFIG_SOURCE_POOL:
+ os << "pool";
+ break;
+ case RBD_CONFIG_SOURCE_IMAGE:
+ os << "image";
+ break;
+ default:
+ os << "unknown (" << static_cast<uint32_t>(source) << ")";
+ break;
+ }
+ return os;
+}
+
+int config_global_list(
+ librados::Rados &rados, const std::string &config_entity,
+ std::map<std::string, std::pair<std::string, std::string>> *options) {
+ bool client_id_config_entity =
+ boost::starts_with(config_entity, ("client."));
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config dump\", "
+ "\"format\": \"json\" "
+ "}";
+ bufferlist in_bl;
+ bufferlist out_bl;
+ std::string ss;
+ int r = rados.mon_command(cmd, in_bl, &out_bl, &ss);
+ if (r < 0) {
+ std::cerr << "rbd: error reading config: " << ss << std::endl;
+ return r;
+ }
+
+ json_spirit::mValue json_root;
+ if (!json_spirit::read(out_bl.to_str(), json_root)) {
+ std::cerr << "rbd: error parsing config dump" << std::endl;
+ return -EINVAL;
+ }
+
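+ // keep only rbd_* options relevant to the requested entity, letting the
+ // more specific section (client.<id> over client over global) win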
+ try {
+ auto &json_array = json_root.get_array();
+ for (auto& e : json_array) {
+ auto &json_obj = e.get_obj();
+ std::string section;
+ std::string name;
+ std::string value;
+
+ for (auto &pairs : json_obj) {
+ if (pairs.first == "section") {
+ section = pairs.second.get_str();
+ } else if (pairs.first == "name") {
+ name = pairs.second.get_str();
+ } else if (pairs.first == "value") {
+ value = pairs.second.get_str();
+ }
+ }
+
+ if (!boost::starts_with(name, "rbd_")) {
+ continue;
+ }
+ if (section != "global" && section != "client" &&
+ (!client_id_config_entity || section != config_entity)) {
+ continue;
+ }
+ if (config_entity == "global" && section != "global") {
+ continue;
+ }
+ auto it = options->find(name);
+ if (it == options->end()) {
+ (*options)[name] = {value, section};
+ continue;
+ }
+ if (section == "client") {
+ if (it->second.second == "global") {
+ it->second = {value, section};
+ }
+ } else if (client_id_config_entity) {
+ it->second = {value, section};
+ }
+ }
+ } catch (std::runtime_error &e) {
+ std::cerr << "rbd: error parsing config dump: " << e.what() << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+} // anonymous namespace
+
+void get_global_get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_config_entity_option(positional);
+ add_key_option(positional);
+}
+
+int execute_global_get(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string config_entity;
+ int r = get_config_entity(vm, &config_entity);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ size_t arg_index = 1;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ std::map<std::string, std::pair<std::string, std::string>> options;
+ r = config_global_list(rados, config_entity, &options);
+ if (r < 0) {
+ return r;
+ }
+
+ auto it = options.find(key);
+
+ if (it == options.end() || it->second.second != config_entity) {
+ std::cerr << "rbd: " << key << " is not set" << std::endl;
+ return -ENOENT;
+ }
+
+ std::cout << it->second.first << std::endl;
+ return 0;
+}
+
+void get_global_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_config_entity_option(positional);
+ add_key_option(positional);
+ positional->add_options()
+ ("value", "config value");
+}
+
+int execute_global_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string config_entity;
+ int r = get_config_entity(vm, &config_entity);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ size_t arg_index = 1;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string value = utils::get_positional_argument(vm, 2);
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config set\", "
+ "\"who\": \"" + stringify(json_stream_escaper(config_entity)) + "\", "
+ "\"name\": \"" + key + "\", "
+ "\"value\": \"" + stringify(json_stream_escaper(value)) + "\""
+ "}";
+ bufferlist in_bl;
+ std::string ss;
+ r = rados.mon_command(cmd, in_bl, nullptr, &ss);
+ if (r < 0) {
+ std::cerr << "rbd: error setting " << key << ": " << ss << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_global_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_config_entity_option(positional);
+ add_key_option(positional);
+}
+
+int execute_global_remove(
+ const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string config_entity;
+ int r = get_config_entity(vm, &config_entity);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ size_t arg_index = 1;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config rm\", "
+ "\"who\": \"" + stringify(json_stream_escaper(config_entity)) + "\", "
+ "\"name\": \"" + key + "\""
+ "}";
+ bufferlist in_bl;
+ std::string ss;
+ r = rados.mon_command(cmd, in_bl, nullptr, &ss);
+ if (r < 0) {
+ std::cerr << "rbd: error removing " << key << ": " << ss << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_global_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_config_entity_option(positional);
+ at::add_format_options(options);
+}
+
+int execute_global_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string config_entity;
+ int r = get_config_entity(vm, &config_entity);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter f;
+ r = utils::get_formatter(vm, &f);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ std::map<std::string, std::pair<std::string, std::string>> options;
+ r = config_global_list(rados, config_entity, &options);
+ if (r < 0) {
+ return r;
+ }
+
+ if (options.empty() && !f) {
+ return 0;
+ }
+
+ TextTable tbl;
+
+ if (f) {
+ f->open_array_section("config");
+ } else {
+ tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Section", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (const auto &it : options) {
+ if (f) {
+ f->open_object_section("option");
+ f->dump_string("name", it.first);
+ f->dump_string("value", it.second.first);
+ f->dump_string("section", it.second.second);
+ f->close_section();
+ } else {
+ tbl << it.first << it.second.first << it.second.second
+ << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << tbl;
+ }
+
+ return 0;
+}
+
+void get_pool_get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_pool_option(positional);
+ add_key_option(positional);
+}
+
+int execute_pool_get(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ int r = get_pool(vm, &pool_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ size_t arg_index = 1;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::string value;
+
+ r = rbd.pool_metadata_get(io_ctx, METADATA_CONF_PREFIX + key, &value);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ std::cerr << "rbd: " << key << " is not set" << std::endl;
+ } else {
+ std::cerr << "rbd: failed to get " << key << ": " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+
+ std::cout << value << std::endl;
+ return 0;
+}
+
+void get_pool_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_pool_option(positional);
+ add_key_option(positional);
+ positional->add_options()
+ ("value", "config value");
+}
+
+int execute_pool_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ int r = get_pool(vm, &pool_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ size_t arg_index = 1;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string value = utils::get_positional_argument(vm, 2);
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.pool_metadata_set(io_ctx, METADATA_CONF_PREFIX + key, value);
+ if (r < 0) {
+ std::cerr << "rbd: failed to set " << key << ": " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_pool_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_pool_option(positional);
+ add_key_option(positional);
+}
+
+int execute_pool_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ int r = get_pool(vm, &pool_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ size_t arg_index = 1;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.pool_metadata_remove(io_ctx, METADATA_CONF_PREFIX + key);
+ if (r < 0) {
+ std::cerr << "rbd: failed to remove " << key << ": " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_pool_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_pool_option(positional);
+ at::add_format_options(options);
+}
+
+int execute_pool_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ int r = get_pool(vm, &pool_name);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter f;
+ r = utils::get_formatter(vm, &f);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ TextTable tbl;
+ librbd::RBD rbd;
+ std::vector<librbd::config_option_t> options;
+
+ r = rbd.config_list(io_ctx, &options);
+ if (r < 0) {
+ std::cerr << "rbd: failed to list config: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (f) {
+ f->open_array_section("config");
+ } else {
+ tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Source", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (auto &option : options) {
+ if (f) {
+ f->open_object_section("option");
+ f->dump_string("name", option.name);
+ f->dump_string("value", option.value);
+ f->dump_stream("source") << option.source;
+ f->close_section();
+ } else {
+ std::ostringstream source;
+ source << option.source;
+ tbl << option.name << option.value << source.str() << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << tbl;
+ }
+
+ return 0;
+}
+
+void get_image_get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+}
+
+int execute_image_get(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string value;
+
+ r = image.metadata_get(METADATA_CONF_PREFIX + key, &value);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ std::cerr << "rbd: " << key << " is not set" << std::endl;
+ } else {
+ std::cerr << "rbd: failed to get " << key << ": " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+
+ std::cout << value << std::endl;
+ return 0;
+}
+
+void get_image_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+ positional->add_options()
+ ("value", "config value");
+}
+
+int execute_image_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string value = utils::get_positional_argument(vm, arg_index);
+ if (value.empty()) {
+ std::cerr << "rbd: image config value was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.metadata_set(METADATA_CONF_PREFIX + key, value);
+ if (r < 0) {
+ std::cerr << "rbd: failed to set " << key << ": " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_image_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+}
+
+int execute_image_remove(
+ const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.metadata_remove(METADATA_CONF_PREFIX + key);
+ if (r < 0) {
+ std::cerr << "rbd: failed to remove " << key << ": " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_image_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_image_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter f;
+ r = utils::get_formatter(vm, &f);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ TextTable tbl;
+ std::vector<librbd::config_option_t> options;
+
+ r = image.config_list(&options);
+ if (r < 0) {
+ std::cerr << "rbd: failed to list config: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (options.empty()) {
+ if (f == nullptr) {
+ std::cout << "There are no values" << std::endl;
+ }
+ return 0;
+ }
+
+ if (f) {
+ f->open_array_section("config");
+ } else {
+ tbl.define_column("Name", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Source", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (auto &option : options) {
+ if (f) {
+ f->open_object_section("option");
+ f->dump_string("name", option.name);
+ f->dump_string("value", option.value);
+ f->dump_stream("source") << option.source;
+ f->close_section();
+ } else {
+ std::ostringstream source;
+ source << option.source;
+ tbl << option.name << option.value << source.str() << TextTable::endrow;
+ }
+ }
+
+ if (f == nullptr) {
+ bool single = (options.size() == 1);
+ std::cout << "There " << (single ? "is" : "are") << " " << options.size()
+ << " " << (single ? "value" : "values") << ":" << std::endl;
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << tbl;
+ }
+
+ return 0;
+}
+
+Shell::Action action_global_get(
+ {"config", "global", "get"}, {},
+ "Get a global-level configuration override.", "",
+ &get_global_get_arguments, &execute_global_get);
+Shell::Action action_global_set(
+ {"config", "global", "set"}, {},
+ "Set a global-level configuration override.", "",
+ &get_global_set_arguments, &execute_global_set);
+Shell::Action action_global_remove(
+ {"config", "global", "remove"}, {"config", "global", "rm"},
+ "Remove a global-level configuration override.", "",
+ &get_global_remove_arguments, &execute_global_remove);
+Shell::Action action_global_list(
+ {"config", "global", "list"}, {"config", "global", "ls"},
+ "List global-level configuration overrides.", "",
+ &get_global_list_arguments, &execute_global_list);
+
+Shell::Action action_pool_get(
+ {"config", "pool", "get"}, {}, "Get a pool-level configuration override.", "",
+ &get_pool_get_arguments, &execute_pool_get);
+Shell::Action action_pool_set(
+ {"config", "pool", "set"}, {}, "Set a pool-level configuration override.", "",
+ &get_pool_set_arguments, &execute_pool_set);
+Shell::Action action_pool_remove(
+ {"config", "pool", "remove"}, {"config", "pool", "rm"},
+ "Remove a pool-level configuration override.", "",
+ &get_pool_remove_arguments, &execute_pool_remove);
+Shell::Action action_pool_list(
+ {"config", "pool", "list"}, {"config", "pool", "ls"},
+ "List pool-level configuration overrides.", "",
+ &get_pool_list_arguments, &execute_pool_list);
+
+Shell::Action action_image_get(
+ {"config", "image", "get"}, {}, "Get an image-level configuration override.",
+ "", &get_image_get_arguments, &execute_image_get);
+Shell::Action action_image_set(
+ {"config", "image", "set"}, {}, "Set an image-level configuration override.",
+ "", &get_image_set_arguments, &execute_image_set);
+Shell::Action action_image_remove(
+ {"config", "image", "remove"}, {"config", "image", "rm"},
+ "Remove an image-level configuration override.", "",
+ &get_image_remove_arguments, &execute_image_remove);
+Shell::Action action_image_list(
+ {"config", "image", "list"}, {"config", "image", "ls"},
+ "List image-level configuration overrides.", "",
+ &get_image_list_arguments, &execute_image_list);
+
+} // namespace config
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Copy.cc b/src/tools/rbd/action/Copy.cc
new file mode 100644
index 00000000..9a248437
--- /dev/null
+++ b/src/tools/rbd/action/Copy.cc
@@ -0,0 +1,195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace copy {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp,
+ const char *destname, librbd::ImageOptions& opts,
+ bool no_progress,
+ size_t sparse_size)
+{
+ utils::ProgressContext pc("Image copy", no_progress);
+ int r = src.copy_with_progress4(dest_pp, destname, opts, pc, sparse_size);
+ if (r < 0){
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_create_image_options(options, false);
+ at::add_sparse_size_option(options);
+ at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dst_pool_name;
+ std::string dst_namespace_name;
+ std::string dst_image_name;
+ std::string dst_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
+ &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, false, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx dst_io_ctx;
+ r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+ if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+ sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+ }
+ r = do_copy(image, dst_io_ctx, dst_image_name.c_str(), opts,
+ vm[at::NO_PROGRESS].as<bool>(), sparse_size);
+ if (r < 0) {
+ std::cerr << "rbd: copy failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"copy"}, {"cp"}, "Copy src image to dest.", at::get_long_features_help(),
+ &get_arguments, &execute);
+
+static int do_deep_copy(librbd::Image &src, librados::IoCtx& dest_pp,
+ const char *destname, librbd::ImageOptions& opts,
+ bool no_progress)
+{
+ utils::ProgressContext pc("Image deep copy", no_progress);
+ int r = src.deep_copy_with_progress(dest_pp, destname, opts, pc);
+ if (r < 0){
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments_deep(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_create_image_options(options, false);
+ at::add_flatten_option(options);
+ at::add_no_progress_option(options);
+}
+
+int execute_deep(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dst_pool_name;
+ std::string dst_namespace_name;
+ std::string dst_image_name;
+ std::string dst_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
+ &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, false, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx dst_io_ctx;
+ r = utils::init_io_ctx(rados, dst_pool_name, dst_namespace_name, &dst_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_deep_copy(image, dst_io_ctx, dst_image_name.c_str(), opts,
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: deep copy failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_deep(
+ {"deep", "copy"}, {"deep", "cp"}, "Deep copy src image to dest.",
+ at::get_long_features_help(), &get_arguments_deep, &execute_deep);
+
+} // namespace copy
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Create.cc b/src/tools/rbd/action/Create.cc
new file mode 100644
index 00000000..99efa0b5
--- /dev/null
+++ b/src/tools/rbd/action/Create.cc
@@ -0,0 +1,264 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include "common/Cond.h"
+#include "common/Mutex.h"
+
+namespace rbd {
+namespace action {
+namespace create {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+ const char *imgname, uint64_t size,
+ librbd::ImageOptions& opts) {
+ return rbd.create4(io_ctx, imgname, size, opts);
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_create_image_options(options, true);
+ options->add_options()
+ (at::IMAGE_THICK_PROVISION.c_str(), po::bool_switch(), "fully allocate storage and zero image");
+ at::add_size_option(options);
+ at::add_no_progress_option(options);
+}
+
+void thick_provision_writer_completion(rbd_completion_t, void *);
+
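+// Helper used by --thick-provision: zero-fills the image with asynchronous
+// writesame requests, chunk_size bytes at a time, keeping at most `concurr`
+// requests in flight.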
+struct thick_provision_writer {
+ librbd::Image *image;
+ Mutex lock;
+ Cond cond;
+ bufferlist bl;
+ uint64_t chunk_size;
+ const int block_size;
+ uint64_t concurr;
+ struct {
+ uint64_t in_flight;
+ int io_error;
+ } io_status;
+
+ // Constructor
+ explicit thick_provision_writer(librbd::Image *i, librbd::ImageOptions &o)
+ : image(i),
+ lock("thick_provision_writer::lock"),
+ block_size(512) // 512 Bytes
+ {
+    // If an error occurs here we abort, since a constructor cannot
+    // return an error value.
+ ceph_assert(g_ceph_context != nullptr);
+ bl.append_zero(block_size);
+
+ librbd::image_info_t info;
+ int r = image->stat(info, sizeof(info));
+ ceph_assert(r >= 0);
+ uint64_t order;
+ if (info.order == 0) {
+ order = g_conf().get_val<uint64_t>("rbd_default_order");
+ } else {
+ order = info.order;
+ }
+ chunk_size = (1ull << order);
+ if (image->get_stripe_unit() < chunk_size) {
+ chunk_size = image->get_stripe_unit();
+ }
+
+ concurr = g_conf().get_val<uint64_t>("rbd_concurrent_management_ops");
+ io_status.in_flight = 0;
+ io_status.io_error = 0;
+ }
+
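+  // Submit one async writesame for the chunk starting at write_offset.
+  // Returns -EINVAL if it would exceed the `concurr` in-flight limit;
+  // callers throttle submissions via wait_for().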
+ int start_io(uint64_t write_offset)
+ {
+ {
+ Mutex::Locker l(lock);
+ io_status.in_flight++;
+ if (io_status.in_flight > concurr) {
+ io_status.in_flight--;
+ return -EINVAL;
+ }
+ }
+
+ librbd::RBD::AioCompletion *c;
+ c = new librbd::RBD::AioCompletion(this, thick_provision_writer_completion);
+ int r;
+ r = image->aio_writesame(write_offset, chunk_size, bl, c, LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL);
+ if (r < 0) {
+ Mutex::Locker l(lock);
+ io_status.io_error = r;
+ }
+ return r;
+ }
+
+ int wait_for(uint64_t max) {
+ Mutex::Locker l(lock);
+ int r = io_status.io_error;
+
+ while (io_status.in_flight > max) {
+ utime_t dur;
+ dur.set_from_double(.2);
+ cond.WaitInterval(lock, dur);
+ }
+ return r;
+ }
+};
+
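+// AIO completion callback: record the first error seen, decrement the
+// in-flight counter and wake up wait_for().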
+void thick_provision_writer_completion(rbd_completion_t rc, void *pc) {
+ librbd::RBD::AioCompletion *ac = (librbd::RBD::AioCompletion *)rc;
+ thick_provision_writer *tc = static_cast<thick_provision_writer *>(pc);
+
+ int r = ac->get_return_value();
+ tc->lock.Lock();
+ if (r < 0 && tc->io_status.io_error >= 0) {
+ tc->io_status.io_error = r;
+ }
+ tc->io_status.in_flight--;
+ tc->cond.Signal();
+ tc->lock.Unlock();
+ ac->release();
+}
+
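+// Zero-fill the entire image, keeping up to `concurr` writesame requests in
+// flight, then flush once all of them have completed.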
+int write_data(librbd::Image &image, librbd::ImageOptions &opts,
+ bool no_progress) {
+ uint64_t image_size;
+ int r = 0;
+ utils::ProgressContext pc("Thick provisioning", no_progress);
+
+ if (image.size(&image_size) != 0) {
+ return -EINVAL;
+ }
+
+ thick_provision_writer tpw(&image, opts);
+ uint64_t off;
+ uint64_t i;
+ for (off = 0; off < image_size;) {
+ i = 0;
+ while (i < tpw.concurr && off < image_size) {
+ tpw.wait_for(tpw.concurr - 1);
+ r = tpw.start_io(off);
+ if (r != 0) {
+ goto err_writesame;
+ }
+ ++i;
+ off += tpw.chunk_size;
+      if (off > image_size) {
+ off = image_size;
+ }
+ pc.update_progress(off, image_size);
+ }
+ }
+
+ tpw.wait_for(0);
+ r = image.flush();
+ if (r < 0) {
+ std::cerr << "rbd: failed to flush at the end: " << cpp_strerror(r)
+ << std::endl;
+ goto err_writesame;
+ }
+ pc.finish();
+
+ return r;
+
+err_writesame:
+ tpw.wait_for(0);
+ pc.fail();
+
+ return r;
+}
+
+int thick_write(const std::string &image_name, librados::IoCtx &io_ctx,
+ librbd::ImageOptions &opts, bool no_progress) {
+ int r = 0;
+ librbd::Image image;
+
+ // To prevent writesame from discarding data, thick_write sets
+ // the rbd_discard_on_zeroed_write_same option to false.
+ ceph_assert(g_ceph_context != nullptr);
+ r = g_conf().set_val("rbd_discard_on_zeroed_write_same", "false");
+ ceph_assert(r == 0);
+ r = utils::open_image(io_ctx, image_name, false, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = write_data(image, opts, no_progress);
+
+ image.close();
+
+ return r;
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, true, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t size;
+ r = utils::get_image_size(vm, &size);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = do_create(rbd, io_ctx, image_name.c_str(), size, opts);
+ if (!namespace_name.empty() && r == -ENOENT) {
+ std::cerr << "rbd: namespace not found - it must be created with "
+ << "'rbd namespace create' before creating an image."
+ << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (vm.count(at::IMAGE_THICK_PROVISION) && vm[at::IMAGE_THICK_PROVISION].as<bool>()) {
+ r = thick_write(image_name, io_ctx, opts, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: image created but error encountered during thick provisioning: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"create"}, {}, "Create an empty image.", at::get_long_features_help(),
+ &get_arguments, &execute);
+
+} // namespace create
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Device.cc b/src/tools/rbd/action/Device.cc
new file mode 100644
index 00000000..3fdf2ef5
--- /dev/null
+++ b/src/tools/rbd/action/Device.cc
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+
+#include <boost/program_options.hpp>
+
+#include "include/ceph_assert.h"
+
+namespace rbd {
+namespace action {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
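+// Each device backend (FreeBSD ggate, the in-kernel krbd driver, rbd-nbd)
+// provides its own list/map/unmap handlers; the generic "device" commands
+// below dispatch to the backend selected via --device-type.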
+#define DECLARE_DEVICE_OPERATIONS(ns) \
+ namespace ns { \
+ int execute_list(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ int execute_map(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ int execute_unmap(const po::variables_map &vm, \
+ const std::vector<std::string> &ceph_global_args); \
+ }
+
+DECLARE_DEVICE_OPERATIONS(ggate);
+DECLARE_DEVICE_OPERATIONS(kernel);
+DECLARE_DEVICE_OPERATIONS(nbd);
+
+namespace device {
+
+namespace {
+
+struct DeviceOperations {
+ int (*execute_list)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+ int (*execute_map)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+ int (*execute_unmap)(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args);
+};
+
+const DeviceOperations ggate_operations = {
+ ggate::execute_list,
+ ggate::execute_map,
+ ggate::execute_unmap,
+};
+
+const DeviceOperations krbd_operations = {
+ kernel::execute_list,
+ kernel::execute_map,
+ kernel::execute_unmap,
+};
+
+const DeviceOperations nbd_operations = {
+ nbd::execute_list,
+ nbd::execute_map,
+ nbd::execute_unmap,
+};
+
+enum device_type_t {
+ DEVICE_TYPE_GGATE,
+ DEVICE_TYPE_KRBD,
+ DEVICE_TYPE_NBD,
+};
+
+struct DeviceType {};
+
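+// boost::program_options custom validator: translate the --device-type
+// string into a device_type_t value.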
+void validate(boost::any& v, const std::vector<std::string>& values,
+ DeviceType *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ if (s == "ggate") {
+ v = boost::any(DEVICE_TYPE_GGATE);
+ } else if (s == "krbd") {
+ v = boost::any(DEVICE_TYPE_KRBD);
+ } else if (s == "nbd") {
+ v = boost::any(DEVICE_TYPE_NBD);
+ } else {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
+void add_device_type_option(po::options_description *options) {
+ options->add_options()
+ ("device-type,t", po::value<DeviceType>(),
+ "device type [ggate, krbd (default), nbd]");
+}
+
+void add_device_specific_options(po::options_description *options) {
+ options->add_options()
+ ("options,o", po::value<std::vector<std::string>>(),
+ "device specific options");
+}
+
+device_type_t get_device_type(const po::variables_map &vm) {
+ if (vm.count("device-type")) {
+ return vm["device-type"].as<device_type_t>();
+ }
+ return DEVICE_TYPE_KRBD;
+}
+
+const DeviceOperations *get_device_operations(const po::variables_map &vm) {
+ switch (get_device_type(vm)) {
+ case DEVICE_TYPE_GGATE:
+ return &ggate_operations;
+ case DEVICE_TYPE_KRBD:
+ return &krbd_operations;
+ case DEVICE_TYPE_NBD:
+ return &nbd_operations;
+ default:
+ ceph_abort();
+ return nullptr;
+ }
+}
+
+} // anonymous namespace
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_list)(vm, ceph_global_init_args);
+}
+
+void get_map_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ ("read-only", po::bool_switch(), "map read-only")
+ ("exclusive", po::bool_switch(), "disable automatic exclusive lock transitions");
+ add_device_specific_options(options);
+}
+
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_map)(vm, ceph_global_init_args);
+}
+
+void get_unmap_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_device_type_option(options);
+ positional->add_options()
+ ("image-or-snap-or-device-spec",
+ "image, snapshot, or device specification\n"
+ "[<pool-name>/]<image-name>[@<snapshot-name>] or <device-path>");
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE);
+ add_device_specific_options(options);
+}
+
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return (*get_device_operations(vm)->execute_unmap)(vm, ceph_global_init_args);
+}
+
+Shell::SwitchArguments switched_arguments({"read-only", "exclusive"});
+Shell::Action action_list(
+ {"device", "list"}, {"showmapped"}, "List mapped rbd images.", "",
+ &get_list_arguments, &execute_list);
+// yet another alias for list command
+Shell::Action action_ls(
+ {"device", "ls"}, {}, "List mapped rbd images.", "",
+ &get_list_arguments, &execute_list, false);
+
+Shell::Action action_map(
+ {"device", "map"}, {"map"}, "Map an image to a block device.", "",
+ &get_map_arguments, &execute_map);
+
+Shell::Action action_unmap(
+ {"device", "unmap"}, {"unmap"}, "Unmap a rbd device.", "",
+ &get_unmap_arguments, &execute_unmap);
+
+} // namespace device
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Diff.cc b/src/tools/rbd/action/Diff.cc
new file mode 100644
index 00000000..3729469c
--- /dev/null
+++ b/src/tools/rbd/action/Diff.cc
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+struct output_method {
+ output_method() : f(NULL), t(NULL), empty(true) {}
+ Formatter *f;
+ TextTable *t;
+ bool empty;
+};
+
+static int diff_cb(uint64_t ofs, size_t len, int exists, void *arg)
+{
+ output_method *om = static_cast<output_method *>(arg);
+ om->empty = false;
+ if (om->f) {
+ om->f->open_object_section("extent");
+ om->f->dump_unsigned("offset", ofs);
+ om->f->dump_unsigned("length", len);
+ om->f->dump_string("exists", exists ? "true" : "false");
+ om->f->close_section();
+ } else {
+ ceph_assert(om->t);
+ *(om->t) << ofs << len << (exists ? "data" : "zero") << TextTable::endrow;
+ }
+ return 0;
+}
+
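+// Iterate over the extents that changed since fromsnapname (or since image
+// creation when null) and print them via the formatter or as a plain table.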
+static int do_diff(librbd::Image& image, const char *fromsnapname,
+ bool whole_object, Formatter *f)
+{
+ int r;
+ librbd::image_info_t info;
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ output_method om;
+ if (f) {
+ om.f = f;
+ f->open_array_section("extents");
+ } else {
+ om.t = new TextTable();
+ om.t->define_column("Offset", TextTable::LEFT, TextTable::LEFT);
+ om.t->define_column("Length", TextTable::LEFT, TextTable::LEFT);
+ om.t->define_column("Type", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+ diff_cb, &om);
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ if (!om.empty)
+ std::cout << *om.t;
+ delete om.t;
+ }
+ return r;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+ "snapshot starting point")
+ (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object");
+ at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string from_snap_name;
+ if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+ from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ bool diff_whole_object = vm[at::WHOLE_OBJECT].as<bool>();
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_diff(image, from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+ diff_whole_object, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: diff error: " << cpp_strerror(r) << std::endl;
+    return r;
+ }
+ return 0;
+}
+
+Shell::SwitchArguments switched_arguments({at::WHOLE_OBJECT});
+Shell::Action action(
+ {"diff"}, {},
+ "Print extents that differ since a previous snap, or image creation.", "",
+ &get_arguments, &execute);
+
+} // namespace diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/DiskUsage.cc b/src/tools/rbd/action/DiskUsage.cc
new file mode 100644
index 00000000..649f39a7
--- /dev/null
+++ b/src/tools/rbd/action/DiskUsage.cc
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <algorithm>
+#include <iostream>
+#include <boost/bind.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace disk_usage {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int disk_usage_callback(uint64_t offset, size_t len, int exists,
+ void *arg) {
+ uint64_t *used_size = reinterpret_cast<uint64_t *>(arg);
+ if (exists) {
+ (*used_size) += len;
+ }
+ return 0;
+}
+
+static int compute_image_disk_usage(const std::string& name,
+ const std::string& snap_name,
+ const std::string& from_snap_name,
+ librbd::Image &image, uint64_t size,
+ bool exact, TextTable& tbl, Formatter *f,
+ uint64_t *used_size) {
+ const char* from = NULL;
+ if (!from_snap_name.empty()) {
+ from = from_snap_name.c_str();
+ }
+
+ uint64_t flags;
+ int r = image.get_flags(&flags);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve image flags: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+ std::cerr << "warning: fast-diff map is invalid for " << name
+ << (snap_name.empty() ? "" : "@" + snap_name) << ". "
+ << "operation may be slow." << std::endl;
+ }
+
+ *used_size = 0;
+ r = image.diff_iterate2(from, 0, size, false, !exact,
+ &disk_usage_callback, used_size);
+ if (r < 0) {
+ std::cerr << "rbd: failed to iterate diffs: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("name", name);
+ if (!snap_name.empty()) {
+ f->dump_string("snapshot", snap_name);
+ }
+ f->dump_unsigned("provisioned_size", size);
+ f->dump_unsigned("used_size" , *used_size);
+ f->close_section();
+ } else {
+ std::string full_name = name;
+ if (!snap_name.empty()) {
+ full_name += "@" + snap_name;
+ }
+ tbl << full_name
+ << stringify(byte_u_t(size))
+ << stringify(byte_u_t(*used_size))
+ << TextTable::endrow;
+ }
+ return 0;
+}
+
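+// Compute provisioned and used sizes for every matching image and its user
+// snapshots by diffing each snapshot against the previous one; without a
+// valid fast-diff map the calculation can be slow.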
+static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
+ const char *imgname, const char *snapname,
+ const char *from_snapname, bool exact, Formatter *f) {
+ std::vector<librbd::image_spec_t> images;
+ int r = rbd.list2(io_ctx, &images);
+ if (r == -ENOENT) {
+ r = 0;
+ } else if (r < 0) {
+ return r;
+ }
+
+ TextTable tbl;
+ if (f) {
+ f->open_object_section("stats");
+ f->open_array_section("images");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("PROVISIONED", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
+ }
+
+ uint32_t count = 0;
+ uint64_t used_size = 0;
+ uint64_t total_prov = 0;
+ uint64_t total_used = 0;
+ uint64_t snap_id = CEPH_NOSNAP;
+ uint64_t from_id = CEPH_NOSNAP;
+ bool found = false;
+ for (auto& image_spec : images) {
+ if (imgname != NULL && image_spec.name != imgname) {
+ continue;
+ }
+ found = true;
+
+ librbd::Image image;
+ r = rbd.open_read_only(io_ctx, image, image_spec.name.c_str(), NULL);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ std::cerr << "rbd: error opening " << image_spec.name << ": "
+ << cpp_strerror(r) << std::endl;
+ }
+ continue;
+ }
+
+ uint64_t features;
+ r = image.features(&features);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r)
+ << std::endl;
+ goto out;
+ }
+ if ((features & RBD_FEATURE_FAST_DIFF) == 0) {
+ std::cerr << "warning: fast-diff map is not enabled for "
+ << image_spec.name << ". " << "operation may be slow."
+ << std::endl;
+ }
+
+ librbd::image_info_t info;
+ if (image.stat(info, sizeof(info)) < 0) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ std::vector<librbd::snap_info_t> snap_list;
+ r = image.snap_list(snap_list);
+ if (r < 0) {
+ std::cerr << "rbd: error opening " << image_spec.name << " snapshots: "
+ << cpp_strerror(r) << std::endl;
+ continue;
+ }
+
+    snap_list.erase(std::remove_if(snap_list.begin(),
+                                   snap_list.end(),
+                                   boost::bind(utils::is_not_user_snap_namespace, &image, _1)),
+                    snap_list.end());
+
+ bool found_from_snap = (from_snapname == nullptr);
+ bool found_snap = (snapname == nullptr);
+ bool found_from = (from_snapname == nullptr);
+ std::string last_snap_name;
+ std::sort(snap_list.begin(), snap_list.end(),
+ boost::bind(&librbd::snap_info_t::id, _1) <
+ boost::bind(&librbd::snap_info_t::id, _2));
+ if (!found_snap || !found_from) {
+ for (auto &snap_info : snap_list) {
+ if (!found_snap && snap_info.name == snapname) {
+ snap_id = snap_info.id;
+ found_snap = true;
+ }
+ if (!found_from && snap_info.name == from_snapname) {
+ from_id = snap_info.id;
+ found_from = true;
+ }
+ if (found_snap && found_from) {
+ break;
+ }
+ }
+ }
+ if ((snapname != nullptr && snap_id == CEPH_NOSNAP) ||
+ (from_snapname != nullptr && from_id == CEPH_NOSNAP)) {
+      std::cerr << "specified snapshot was not found." << std::endl;
+ return -ENOENT;
+ }
+ if (snap_id != CEPH_NOSNAP && from_id != CEPH_NOSNAP) {
+ if (from_id == snap_id) {
+        // from and to snapshots are identical, so there is no usage to report.
+ return 0;
+ }
+ if (from_id >= snap_id) {
+ return -EINVAL;
+ }
+ }
+
+ for (std::vector<librbd::snap_info_t>::const_iterator snap =
+ snap_list.begin(); snap != snap_list.end(); ++snap) {
+ librbd::Image snap_image;
+ r = rbd.open_read_only(io_ctx, snap_image, image_spec.name.c_str(),
+ snap->name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: error opening snapshot " << image_spec.name << "@"
+ << snap->name << ": " << cpp_strerror(r) << std::endl;
+ goto out;
+ }
+
+ if (imgname == nullptr || found_from_snap ||
+ (found_from_snap && snapname != nullptr && snap->name == snapname)) {
+ r = compute_image_disk_usage(image_spec.name, snap->name,
+ last_snap_name, snap_image, snap->size,
+ exact, tbl, f, &used_size);
+ if (r < 0) {
+ goto out;
+ }
+
+ if (snapname != NULL) {
+ total_prov += snap->size;
+ }
+ total_used += used_size;
+ ++count;
+ }
+
+ if (!found_from_snap && from_snapname != nullptr &&
+ snap->name == from_snapname) {
+ found_from_snap = true;
+ }
+ if (snapname != nullptr && snap->name == snapname) {
+ break;
+ }
+ last_snap_name = snap->name;
+ }
+
+ if (snapname == NULL) {
+ r = compute_image_disk_usage(image_spec.name, "", last_snap_name, image,
+ info.size, exact, tbl, f, &used_size);
+ if (r < 0) {
+ goto out;
+ }
+ total_prov += info.size;
+ total_used += used_size;
+ ++count;
+ }
+ }
+ if (imgname != nullptr && !found) {
+    std::cerr << "specified image " << imgname << " was not found." << std::endl;
+ return -ENOENT;
+ }
+
+out:
+ if (f) {
+ f->close_section();
+ if (imgname == NULL) {
+ f->dump_unsigned("total_provisioned_size", total_prov);
+ f->dump_unsigned("total_used_size", total_used);
+ }
+ f->close_section();
+ f->flush(std::cout);
+ } else if (!images.empty()) {
+ if (count > 1) {
+ tbl << "<TOTAL>"
+ << stringify(byte_u_t(total_prov))
+ << stringify(byte_u_t(total_used))
+ << TextTable::endrow;
+ }
+ std::cout << tbl;
+ }
+
+ return r < 0 ? r : 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+ options->add_options()
+ (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+ "snapshot starting point")
+ ("exact", po::bool_switch(), "compute exact disk usage (slow)");
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, vm.count(at::FROM_SNAPSHOT_NAME),
+ utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string from_snap_name;
+ if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+ from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ librbd::RBD rbd;
+ r = do_disk_usage(rbd, io_ctx,
+ image_name.empty() ? nullptr: image_name.c_str(),
+ snap_name.empty() ? nullptr : snap_name.c_str(),
+ from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+ vm["exact"].as<bool>(), formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: du failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"disk-usage"}, {"du"}, "Show disk usage stats for pool, image or snapshot.",
+ "", &get_arguments, &execute);
+
+} // namespace disk_usage
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Export.cc b/src/tools/rbd/action/Export.cc
new file mode 100644
index 00000000..b5b82f4c
--- /dev/null
+++ b/src/tools/rbd/action/Export.cc
@@ -0,0 +1,651 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "include/encoding.h"
+#include <iostream>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <boost/program_options.hpp>
+#include <boost/scope_exit.hpp>
+
+namespace rbd {
+namespace action {
+namespace export_full {
+
+struct ExportDiffContext {
+ librbd::Image *image;
+ int fd;
+ int export_format;
+ uint64_t totalsize;
+ utils::ProgressContext pc;
+ OrderedThrottle throttle;
+
+ ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops,
+ bool no_progress, int eformat) :
+ image(i), fd(f), export_format(eformat), totalsize(t), pc("Exporting image", no_progress),
+ throttle(max_ops, true) {
+ }
+};
+
+class C_ExportDiff : public Context {
+public:
+ C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length,
+ bool exists, int export_format)
+ : m_export_diff_context(edc), m_offset(offset), m_length(length),
+ m_exists(exists), m_export_format(export_format) {
+ }
+
+ int send() {
+ if (m_export_diff_context->throttle.pending_error()) {
+ return m_export_diff_context->throttle.wait_for_ret();
+ }
+
+ C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this);
+ if (m_exists) {
+ librbd::RBD::AioCompletion *aio_completion =
+ new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback);
+
+ int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ int r = m_export_diff_context->image->aio_read2(
+ m_offset, m_length, m_read_data, aio_completion, op_flags);
+ if (r < 0) {
+ aio_completion->release();
+ ctx->complete(r);
+ }
+ } else {
+ ctx->complete(0);
+ }
+ return 0;
+ }
+
+ static int export_diff_cb(uint64_t offset, size_t length, int exists,
+ void *arg) {
+ ExportDiffContext *edc = reinterpret_cast<ExportDiffContext *>(arg);
+
+ C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists, edc->export_format);
+ return context->send();
+ }
+
+protected:
+ void finish(int r) override {
+ if (r >= 0) {
+ if (m_exists) {
+ m_exists = !m_read_data.is_zero();
+ }
+ r = write_extent(m_export_diff_context, m_offset, m_length, m_exists, m_export_format);
+ if (r == 0 && m_exists) {
+ r = m_read_data.write_fd(m_export_diff_context->fd);
+ }
+ }
+ m_export_diff_context->throttle.end_op(r);
+ }
+
+private:
+ ExportDiffContext *m_export_diff_context;
+ uint64_t m_offset;
+ uint64_t m_length;
+ bool m_exists;
+ int m_export_format;
+ bufferlist m_read_data;
+
+ static int write_extent(ExportDiffContext *edc, uint64_t offset,
+ uint64_t length, bool exists, int export_format) {
+ // extent
+ bufferlist bl;
+ __u8 tag = exists ? RBD_DIFF_WRITE : RBD_DIFF_ZERO;
+ uint64_t len = 0;
+ encode(tag, bl);
+ if (export_format == 2) {
+ if (tag == RBD_DIFF_WRITE)
+ len = 8 + 8 + length;
+ else
+ len = 8 + 8;
+ encode(len, bl);
+ }
+ encode(offset, bl);
+ encode(length, bl);
+ int r = bl.write_fd(edc->fd);
+
+ edc->pc.update_progress(offset, edc->totalsize);
+ return r;
+ }
+};
+
+
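+// Write an rbd diff stream to fd: a version banner followed by tagged
+// records (from/to snapshot, optional protection status, image size, then
+// data/zero extent records) and a terminating end tag.  Format 2 prefixes
+// each record with its encoded length.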
+int do_export_diff_fd(librbd::Image& image, const char *fromsnapname,
+ const char *endsnapname, bool whole_object,
+ int fd, bool no_progress, int export_format)
+{
+ int r;
+ librbd::image_info_t info;
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ {
+ // header
+ bufferlist bl;
+ if (export_format == 1)
+ bl.append(utils::RBD_DIFF_BANNER);
+ else
+ bl.append(utils::RBD_DIFF_BANNER_V2);
+
+ __u8 tag;
+ uint64_t len = 0;
+ if (fromsnapname) {
+ tag = RBD_DIFF_FROM_SNAP;
+ encode(tag, bl);
+ std::string from(fromsnapname);
+ if (export_format == 2) {
+ len = from.length() + 4;
+ encode(len, bl);
+ }
+ encode(from, bl);
+ }
+
+ if (endsnapname) {
+ tag = RBD_DIFF_TO_SNAP;
+ encode(tag, bl);
+ std::string to(endsnapname);
+ if (export_format == 2) {
+ len = to.length() + 4;
+ encode(len, bl);
+ }
+ encode(to, bl);
+ }
+
+ if (endsnapname && export_format == 2) {
+ tag = RBD_SNAP_PROTECTION_STATUS;
+ encode(tag, bl);
+ bool is_protected = false;
+ r = image.snap_is_protected(endsnapname, &is_protected);
+ if (r < 0) {
+ return r;
+ }
+ len = 8;
+ encode(len, bl);
+ encode(is_protected, bl);
+ }
+
+ tag = RBD_DIFF_IMAGE_SIZE;
+ encode(tag, bl);
+ uint64_t endsize = info.size;
+ if (export_format == 2) {
+ len = 8;
+ encode(len, bl);
+ }
+ encode(endsize, bl);
+
+ r = bl.write_fd(fd);
+ if (r < 0) {
+ return r;
+ }
+ }
+ ExportDiffContext edc(&image, fd, info.size,
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ no_progress, export_format);
+ r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+ &C_ExportDiff::export_diff_cb, (void *)&edc);
+ if (r < 0) {
+ goto out;
+ }
+
+ r = edc.throttle.wait_for_ret();
+ if (r < 0) {
+ goto out;
+ }
+
+ {
+ __u8 tag = RBD_DIFF_END;
+ bufferlist bl;
+ encode(tag, bl);
+ r = bl.write_fd(fd);
+ }
+
+out:
+ if (r < 0)
+ edc.pc.fail();
+ else
+ edc.pc.finish();
+
+ return r;
+}
+
+int do_export_diff(librbd::Image& image, const char *fromsnapname,
+ const char *endsnapname, bool whole_object,
+ const char *path, bool no_progress)
+{
+ int r;
+ int fd;
+
+ if (strcmp(path, "-") == 0)
+ fd = STDOUT_FILENO;
+ else
+ fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+ if (fd < 0)
+ return -errno;
+
+ r = do_export_diff_fd(image, fromsnapname, endsnapname, whole_object, fd, no_progress, 1);
+
+ if (fd != 1)
+ close(fd);
+ if (r < 0 && fd != 1) {
+ remove(path);
+ }
+
+ return r;
+}
+
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments_diff(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_path_options(positional, options,
+ "export file (or '-' for stdout)");
+ options->add_options()
+ (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+ "snapshot starting point")
+ (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object");
+ at::add_no_progress_option(options);
+}
+
+int execute_diff(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string path;
+ r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string from_snap_name;
+ if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+ from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_export_diff(image,
+ from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+ snap_name.empty() ? nullptr : snap_name.c_str(),
+ vm[at::WHOLE_OBJECT].as<bool>(), path.c_str(),
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: export-diff error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::SwitchArguments switched_arguments({at::WHOLE_OBJECT});
+Shell::Action action_diff(
+ {"export-diff"}, {}, "Export incremental diff to file.", "",
+ &get_arguments_diff, &execute_diff);
+
+class C_Export : public Context
+{
+public:
+ C_Export(OrderedThrottle &ordered_throttle, librbd::Image &image,
+ uint64_t fd_offset, uint64_t offset, uint64_t length, int fd)
+ : m_throttle(ordered_throttle), m_image(image), m_dest_offset(fd_offset),
+ m_offset(offset), m_length(length), m_fd(fd)
+ {
+ }
+
+ void send()
+ {
+ auto ctx = m_throttle.start_op(this);
+ auto aio_completion = new librbd::RBD::AioCompletion(
+ ctx, &utils::aio_context_callback);
+ int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ int r = m_image.aio_read2(m_offset, m_length, m_bufferlist,
+ aio_completion, op_flags);
+ if (r < 0) {
+      std::cerr << "rbd: error requesting read from source image" << std::endl;
+ aio_completion->release();
+ m_throttle.end_op(r);
+ }
+ }
+
+ void finish(int r) override
+ {
+ BOOST_SCOPE_EXIT((&m_throttle) (&r))
+ {
+ m_throttle.end_op(r);
+ } BOOST_SCOPE_EXIT_END
+
+ if (r < 0) {
+      std::cerr << "rbd: error reading from source image at offset "
+                << m_offset << ": " << cpp_strerror(r) << std::endl;
+ return;
+ }
+
+ ceph_assert(m_bufferlist.length() == static_cast<size_t>(r));
+ if (m_fd != STDOUT_FILENO) {
+ if (m_bufferlist.is_zero()) {
+ return;
+ }
+
+ uint64_t chkret = lseek64(m_fd, m_dest_offset, SEEK_SET);
+ if (chkret != m_dest_offset) {
+        std::cerr << "rbd: error seeking destination image to offset "
+                  << m_dest_offset << std::endl;
+ r = -errno;
+ return;
+ }
+ }
+
+ r = m_bufferlist.write_fd(m_fd);
+ if (r < 0) {
+      std::cerr << "rbd: error writing to destination image at offset "
+                << m_dest_offset << std::endl;
+ }
+ }
+
+private:
+ OrderedThrottle &m_throttle;
+ librbd::Image &m_image;
+ bufferlist m_bufferlist;
+ uint64_t m_dest_offset;
+ uint64_t m_offset;
+ uint64_t m_length;
+ int m_fd;
+};
+
+const uint32_t MAX_KEYS = 64;
+
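+// Format 2 export: image properties (order, features, striping parameters,
+// image metadata) are written as tagged records, followed by one embedded
+// diff per snapshot and a final diff against the image head.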
+static int do_export_v2(librbd::Image& image, librbd::image_info_t &info, int fd,
+ uint64_t period, int max_concurrent_ops, utils::ProgressContext &pc)
+{
+ int r = 0;
+ // header
+ bufferlist bl;
+ bl.append(utils::RBD_IMAGE_BANNER_V2);
+
+ __u8 tag;
+ uint64_t length;
+ // encode order
+ tag = RBD_EXPORT_IMAGE_ORDER;
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(uint64_t(info.order), bl);
+
+ // encode features
+ tag = RBD_EXPORT_IMAGE_FEATURES;
+ uint64_t features;
+ image.features(&features);
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(features, bl);
+
+ // encode stripe_unit and stripe_count
+ tag = RBD_EXPORT_IMAGE_STRIPE_UNIT;
+ uint64_t stripe_unit;
+ stripe_unit = image.get_stripe_unit();
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(stripe_unit, bl);
+
+ tag = RBD_EXPORT_IMAGE_STRIPE_COUNT;
+ uint64_t stripe_count;
+ stripe_count = image.get_stripe_count();
+ length = 8;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(stripe_count, bl);
+
+  // retrieve the image metadata
+  std::map<std::string, std::string> imagemetas;
+ std::string last_key;
+ bool more_results = true;
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+ r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+ if (r < 0) {
+      std::cerr << "failed to retrieve image metadata: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ if (!pairs.empty()) {
+ last_key = pairs.rbegin()->first;
+
+ for (auto kv : pairs) {
+ std::string key = kv.first;
+ std::string val(kv.second.c_str(), kv.second.length());
+ imagemetas[key] = val;
+ }
+ }
+ more_results = (pairs.size() == MAX_KEYS);
+ }
+
+  // encode each image-metadata key/value pair
+  for (std::map<std::string, std::string>::iterator it = imagemetas.begin();
+       it != imagemetas.end(); ++it) {
+    std::string key = it->first;
+    std::string value = it->second;
+
+ tag = RBD_EXPORT_IMAGE_META;
+ length = key.length() + value.length() + 4 * 2;
+ encode(tag, bl);
+ encode(length, bl);
+ encode(key, bl);
+ encode(value, bl);
+ }
+
+ // encode end tag
+ tag = RBD_EXPORT_IMAGE_END;
+ encode(tag, bl);
+
+ // write bl to fd.
+ r = bl.write_fd(fd);
+ if (r < 0) {
+ return r;
+ }
+
+ // header for snapshots
+ bl.clear();
+ bl.append(utils::RBD_IMAGE_DIFFS_BANNER_V2);
+
+ std::vector<librbd::snap_info_t> snaps;
+ r = image.snap_list(snaps);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t diff_num = snaps.size() + 1;
+ encode(diff_num, bl);
+
+ r = bl.write_fd(fd);
+ if (r < 0) {
+ return r;
+ }
+
+ const char *last_snap = NULL;
+ for (size_t i = 0; i < snaps.size(); ++i) {
+ utils::snap_set(image, snaps[i].name.c_str());
+ r = do_export_diff_fd(image, last_snap, snaps[i].name.c_str(), false, fd, true, 2);
+ if (r < 0) {
+ return r;
+ }
+ pc.update_progress(i, snaps.size() + 1);
+ last_snap = snaps[i].name.c_str();
+ }
+ utils::snap_set(image, std::string(""));
+ r = do_export_diff_fd(image, last_snap, nullptr, false, fd, true, 2);
+ if (r < 0) {
+ return r;
+ }
+ pc.update_progress(snaps.size() + 1, snaps.size() + 1);
+ return r;
+}
+
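+// Format 1 export: stream the raw image data in stripe-period chunks through
+// an ordered throttle.  When writing to a regular file, all-zero chunks are
+// skipped to keep the output sparse and the file is truncated to the full
+// image size at the end.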
+static int do_export_v1(librbd::Image& image, librbd::image_info_t &info,
+ int fd, uint64_t period, int max_concurrent_ops,
+ utils::ProgressContext &pc)
+{
+ int r = 0;
+ size_t file_size = 0;
+ OrderedThrottle throttle(max_concurrent_ops, false);
+ for (uint64_t offset = 0; offset < info.size; offset += period) {
+ if (throttle.pending_error()) {
+ break;
+ }
+
+    uint64_t length = std::min(period, info.size - offset);
+ C_Export *ctx = new C_Export(throttle, image, file_size + offset, offset,
+ length, fd);
+ ctx->send();
+
+ pc.update_progress(offset, info.size);
+ }
+
+ file_size += info.size;
+ r = throttle.wait_for_ret();
+ if (fd != 1) {
+ if (r >= 0) {
+ r = ftruncate(fd, file_size);
+ if (r < 0)
+ return r;
+
+ uint64_t chkret = lseek64(fd, file_size, SEEK_SET);
+ if (chkret != file_size)
+        r = -errno;
+ }
+ }
+ return r;
+}
+
+static int do_export(librbd::Image& image, const char *path, bool no_progress,
+ int export_format)
+{
+ librbd::image_info_t info;
+ int64_t r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ int fd;
+ int max_concurrent_ops = g_conf().get_val<uint64_t>("rbd_concurrent_management_ops");
+ bool to_stdout = (strcmp(path, "-") == 0);
+ if (to_stdout) {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+ if (fd < 0) {
+ return -errno;
+ }
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+ }
+
+ utils::ProgressContext pc("Exporting image", no_progress);
+ uint64_t period = image.get_stripe_count() * (1ull << info.order);
+
+ if (export_format == 1)
+ r = do_export_v1(image, info, fd, period, max_concurrent_ops, pc);
+ else
+ r = do_export_v2(image, info, fd, period, max_concurrent_ops, pc);
+
+ if (r < 0)
+ pc.fail();
+ else
+ pc.finish();
+ if (!to_stdout)
+ close(fd);
+ return r;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_path_options(positional, options,
+ "export file (or '-' for stdout)");
+ at::add_no_progress_option(options);
+ at::add_export_format_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string path;
+ r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ int format = 1;
+ if (vm.count("export-format"))
+ format = vm["export-format"].as<uint64_t>();
+
+ r = do_export(image, path.c_str(), vm[at::NO_PROGRESS].as<bool>(), format);
+ if (r < 0) {
+ std::cerr << "rbd: export error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"export"}, {}, "Export image to file.", "", &get_arguments, &execute);
+
+} // namespace export_full
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Feature.cc b/src/tools/rbd/action/Feature.cc
new file mode 100644
index 00000000..13a7b6ea
--- /dev/null
+++ b/src/tools/rbd/action/Feature.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include <iostream>
+#include <map>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace feature {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options, bool enabled) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ positional->add_options()
+ ("features", po::value<at::ImageFeatures>()->multitoken(),
+ ("image features\n" + at::get_short_features_help(false)).c_str());
+ if (enabled) {
+ at::add_create_journal_options(options);
+ }
+}
+
+void get_arguments_disable(po::options_description *positional,
+ po::options_description *options) {
+ get_arguments(positional, options, false);
+}
+
+void get_arguments_enable(po::options_description *positional,
+ po::options_description *options) {
+ get_arguments(positional, options, true);
+}
+
+int execute(const po::variables_map &vm, bool enabled) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_journal_options(vm, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ std::vector<std::string> feature_names;
+ if (vm.count(at::POSITIONAL_ARGUMENTS)) {
+ const std::vector<std::string> &args =
+ vm[at::POSITIONAL_ARGUMENTS].as<std::vector<std::string> >();
+ feature_names.insert(feature_names.end(), args.begin() + arg_index,
+ args.end());
+ }
+
+ if (feature_names.empty()) {
+ std::cerr << "rbd: at least one feature name must be specified"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ boost::any features_any(static_cast<uint64_t>(0));
+ at::ImageFeatures image_features;
+ at::validate(features_any, feature_names, &image_features, 0);
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.update_features(boost::any_cast<uint64_t>(features_any), enabled);
+ if (r < 0) {
+ std::cerr << "rbd: failed to update image features: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int execute_disable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute(vm, false);
+}
+
+int execute_enable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute(vm, true);
+}
+
+Shell::Action action_disable(
+ {"feature", "disable"}, {}, "Disable the specified image feature.", "",
+ &get_arguments_disable, &execute_disable);
+Shell::Action action_enable(
+ {"feature", "enable"}, {}, "Enable the specified image feature.", "",
+ &get_arguments_enable, &execute_enable);
+
+} // namespace feature
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Flatten.cc b/src/tools/rbd/action/Flatten.cc
new file mode 100644
index 00000000..ec4e837a
--- /dev/null
+++ b/src/tools/rbd/action/Flatten.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace flatten {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_flatten(librbd::Image& image, bool no_progress)
+{
+ utils::ProgressContext pc("Image flatten", no_progress);
+ int r = image.flatten_with_progress(pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_flatten(image, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: flatten error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"flatten"}, {}, "Fill clone with parent data (make it independent).", "",
+ &get_arguments, &execute);
+
+} // namespace flatten
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Ggate.cc b/src/tools/rbd/action/Ggate.cc
new file mode 100644
index 00000000..61f77be2
--- /dev/null
+++ b/src/tools/rbd/action/Ggate.cc
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/param.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "include/stringify.h"
+#include "common/SubProcess.h"
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+#include <iostream>
+
+namespace rbd {
+namespace action {
+namespace ggate {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
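+// The ggate backend shells out to the external rbd-ggate helper; build its
+// command line from the ceph global arguments plus the action arguments.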
+static int call_ggate_cmd(const po::variables_map &vm,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &ceph_global_args) {
+ SubProcess process("rbd-ggate", SubProcess::KEEP, SubProcess::KEEP,
+ SubProcess::KEEP);
+
+ for (auto &arg : ceph_global_args) {
+ process.add_cmd_arg(arg.c_str());
+ }
+
+ for (auto &arg : args) {
+ process.add_cmd_arg(arg.c_str());
+ }
+
+ if (process.spawn()) {
+ std::cerr << "rbd: failed to run rbd-ggate: " << process.err() << std::endl;
+ return -EINVAL;
+ } else if (process.join()) {
+ std::cerr << "rbd: rbd-ggate failed with error: " << process.err()
+ << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_image_or_snap_spec(const po::variables_map &vm, std::string *spec) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string nspace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
+ &image_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ spec->append(pool_name);
+ spec->append("/");
+ if (!nspace_name.empty()) {
+ spec->append(nspace_name);
+ spec->append("/");
+ }
+ spec->append(image_name);
+ if (!snap_name.empty()) {
+ spec->append("@");
+ spec->append(snap_name);
+ }
+
+ return 0;
+}
+
+int parse_options(const std::vector<std::string> &options,
+ std::vector<std::string> *args) {
+ for (auto &opts : options) {
+ std::vector<std::string> args_;
+ boost::split(args_, opts, boost::is_any_of(","));
+ for (auto &o : args_) {
+ args->push_back("--" + o);
+ }
+ }
+
+ return 0;
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+ std::vector<std::string> args;
+
+ args.push_back("list");
+
+ if (vm.count("format")) {
+ args.push_back("--format");
+ args.push_back(vm["format"].as<at::Format>().value);
+ }
+ if (vm["pretty-format"].as<bool>()) {
+ args.push_back("--pretty-format");
+ }
+
+ return call_ggate_cmd(vm, args, ceph_global_init_args);
+}
+
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+ std::vector<std::string> args;
+
+ args.push_back("map");
+ std::string img;
+ int r = get_image_or_snap_spec(vm, &img);
+ if (r < 0) {
+ return r;
+ }
+ args.push_back(img);
+
+ if (vm["read-only"].as<bool>()) {
+ args.push_back("--read-only");
+ }
+
+ if (vm["exclusive"].as<bool>()) {
+ args.push_back("--exclusive");
+ }
+
+ if (vm.count("options")) {
+ r = parse_options(vm["options"].as<std::vector<std::string>>(), &args);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return call_ggate_cmd(vm, args, ceph_global_init_args);
+}
+
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if !defined(__FreeBSD__)
+ std::cerr << "rbd: ggate is only supported on FreeBSD" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+ std::string device_name = utils::get_positional_argument(vm, 0);
+ if (!boost::starts_with(device_name, "/dev/")) {
+ device_name.clear();
+ }
+
+ std::string image_name;
+ if (device_name.empty()) {
+ int r = get_image_or_snap_spec(vm, &image_name);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (device_name.empty() && image_name.empty()) {
+ std::cerr << "rbd: unmap requires either image name or device path"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<std::string> args;
+
+ args.push_back("unmap");
+ args.push_back(device_name.empty() ? image_name : device_name);
+
+ if (vm.count("options")) {
+ int r = parse_options(vm["options"].as<std::vector<std::string>>(), &args);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return call_ggate_cmd(vm, args, ceph_global_init_args);
+}
+
+} // namespace ggate
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Group.cc b/src/tools/rbd/action/Group.cc
new file mode 100644
index 00000000..8554ae3b
--- /dev/null
+++ b/src/tools/rbd/action/Group.cc
@@ -0,0 +1,904 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iostream>
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+
+namespace rbd {
+namespace action {
+namespace group {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static const std::string GROUP_SPEC("group-spec");
+static const std::string GROUP_SNAP_SPEC("group-snap-spec");
+
+static const std::string GROUP_NAME("group");
+static const std::string DEST_GROUP_NAME("dest-group");
+
+static const std::string GROUP_POOL_NAME("group-" + at::POOL_NAME);
+static const std::string IMAGE_POOL_NAME("image-" + at::POOL_NAME);
+
+void add_group_option(po::options_description *opt,
+ at::ArgumentModifier modifier) {
+ std::string name = GROUP_NAME;
+ std::string description = at::get_description_prefix(modifier) + "group name";
+ switch (modifier) {
+ case at::ARGUMENT_MODIFIER_NONE:
+ case at::ARGUMENT_MODIFIER_SOURCE:
+ break;
+ case at::ARGUMENT_MODIFIER_DEST:
+ name = DEST_GROUP_NAME;
+ break;
+ }
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_prefixed_pool_option(po::options_description *opt,
+ const std::string &prefix) {
+ std::string name = prefix + "-" + at::POOL_NAME;
+ std::string description = prefix + " pool name";
+
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_prefixed_namespace_option(po::options_description *opt,
+ const std::string &prefix) {
+ std::string name = prefix + "-" + at::NAMESPACE_NAME;
+ std::string description = prefix + " namespace name";
+
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
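+// Registers the pool/namespace/group options and the positional group spec,
+// switching to the snapshot variant (<group-name>@<snap-name>) when 'snap'
+// is set.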
+void add_group_spec_options(po::options_description *pos,
+ po::options_description *opt,
+ at::ArgumentModifier modifier,
+ bool snap) {
+ at::add_pool_option(opt, modifier);
+ at::add_namespace_option(opt, modifier);
+ add_group_option(opt, modifier);
+ if (!snap) {
+ pos->add_options()
+ ((get_name_prefix(modifier) + GROUP_SPEC).c_str(),
+ (get_description_prefix(modifier) + "group specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<group-name>)").c_str());
+ } else {
+ add_snap_option(opt, modifier);
+ pos->add_options()
+ ((get_name_prefix(modifier) + GROUP_SNAP_SPEC).c_str(),
+ (get_description_prefix(modifier) + "group specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<group-name>@<snap-name>)").c_str());
+ }
+}
+
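+// Handles `rbd group create`: parses the group spec and creates the group,
+// e.g. `rbd group create mypool/mygroup` (pool and group names are
+// illustrative).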
+int execute_create(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ librbd::RBD rbd;
+ r = rbd.group_create(io_ctx, group_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+ Formatter *f = formatter.get();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<std::string> names;
+ r = rbd.group_list(io_ctx, &names);
+ if (r < 0)
+ return r;
+
+ if (f)
+ f->open_array_section("groups");
+ for (auto i : names) {
+ if (f)
+ f->dump_string("name", i);
+ else
+ std::cout << i << std::endl;
+ }
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ librbd::RBD rbd;
+
+ r = rbd.group_remove(io_ctx, group_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: remove error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_rename(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dest_pool_name;
+ std::string dest_namespace_name;
+ std::string dest_group_name;
+
+ r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, at::DEST_POOL_NAME,
+ &dest_pool_name, &dest_namespace_name, DEST_GROUP_NAME, "group",
+ &dest_group_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (pool_name != dest_pool_name) {
+ std::cerr << "rbd: group rename across pools not supported" << std::endl
+ << "source pool: " << pool_name << ", dest pool: "
+ << dest_pool_name << std::endl;
+ return -EINVAL;
+ } else if (namespace_name != dest_namespace_name) {
+ std::cerr << "rbd: group rename across namespaces not supported"
+ << std::endl
+ << "source namespace: " << namespace_name << ", dest namespace: "
+ << dest_namespace_name << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_rename(io_ctx, group_name.c_str(),
+ dest_group_name.c_str());
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to rename group: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_add(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ // Parse group data.
+ std::string group_pool_name;
+ std::string group_namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME,
+ &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name,
+ nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string image_pool_name;
+ std::string image_namespace_name;
+ std::string image_name;
+
+ r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME,
+ &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image",
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (group_namespace_name != image_namespace_name) {
+ std::cerr << "rbd: group and image namespace must match." << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx cg_io_ctx;
+ r = utils::init(group_pool_name, group_namespace_name, &rados, &cg_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx image_io_ctx;
+ r = utils::init(image_pool_name, group_namespace_name, &rados, &image_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_image_add(cg_io_ctx, group_name.c_str(),
+ image_io_ctx, image_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: add image error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_remove_image(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+
+ std::string group_pool_name;
+ std::string group_namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME,
+ &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name,
+ nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string image_pool_name;
+ std::string image_namespace_name;
+ std::string image_name;
+ std::string image_id;
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME,
+ &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image",
+ &image_name, nullptr, image_id.empty(), utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (group_namespace_name != image_namespace_name) {
+ std::cerr << "rbd: group and image namespace must match." << std::endl;
+ return -EINVAL;
+ } else if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id. "
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx cg_io_ctx;
+ r = utils::init(group_pool_name, group_namespace_name, &rados, &cg_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx image_io_ctx;
+ r = utils::init(image_pool_name, group_namespace_name, &rados, &image_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ if (image_id.empty()) {
+ r = rbd.group_image_remove(cg_io_ctx, group_name.c_str(),
+ image_io_ctx, image_name.c_str());
+ } else {
+ r = rbd.group_image_remove_by_id(cg_io_ctx, group_name.c_str(),
+ image_io_ctx, image_id.c_str());
+ }
+ if (r < 0) {
+ std::cerr << "rbd: remove image error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_list_images(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+ Formatter *f = formatter.get();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<librbd::group_image_info_t> images;
+
+ r = rbd.group_image_list(io_ctx, group_name.c_str(), &images,
+ sizeof(librbd::group_image_info_t));
+
+ if (r == -ENOENT)
+ r = 0;
+
+ if (r < 0)
+ return r;
+
+ std::sort(images.begin(), images.end(),
+ [](const librbd::group_image_info_t &lhs,
+ const librbd::group_image_info_t &rhs) {
+ if (lhs.pool != rhs.pool) {
+ return lhs.pool < rhs.pool;
+ }
+ return lhs.name < rhs.name;
+ }
+ );
+
+ if (f)
+ f->open_array_section("images");
+
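+  // Resolve each image's pool id to a pool name through a per-image IoCtx,
+  // falling back to a placeholder when the pool cannot be opened.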
+ for (auto image : images) {
+ std::string image_name = image.name;
+ int state = image.state;
+ std::string state_string;
+ if (RBD_GROUP_IMAGE_STATE_INCOMPLETE == state) {
+ state_string = "incomplete";
+ }
+
+ std::string pool_name = "";
+
+ librados::Rados rados(io_ctx);
+ librados::IoCtx pool_io_ctx;
+ r = rados.ioctx_create2(image.pool, pool_io_ctx);
+ if (r < 0) {
+ pool_name = "<missing image pool " + stringify(image.pool) + ">";
+ } else {
+ pool_name = pool_io_ctx.get_pool_name();
+ }
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("image", image_name);
+ f->dump_string("pool", pool_name);
+ f->dump_string("namespace", io_ctx.get_namespace());
+ f->dump_int("state", state);
+ f->close_section();
+ } else {
+ std::cout << pool_name << "/";
+ if (!io_ctx.get_namespace().empty()) {
+ std::cout << io_ctx.get_namespace() << "/";
+ }
+ std::cout << image_name << " " << state_string << std::endl;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+int execute_group_snap_create(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx io_ctx;
+ librados::Rados rados;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_snap_create(io_ctx, group_name.c_str(), snap_name.c_str());
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_group_snap_remove(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx io_ctx;
+ librados::Rados rados;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_snap_remove(io_ctx, group_name.c_str(), snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: failed to remove group snapshot: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_group_snap_rename(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+ std::string source_snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &source_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dest_snap_name;
+ if (vm.count(at::DEST_SNAPSHOT_NAME)) {
+ dest_snap_name = vm[at::DEST_SNAPSHOT_NAME].as<std::string>();
+ }
+
+ if (dest_snap_name.empty()) {
+ dest_snap_name = utils::get_positional_argument(vm, arg_index++);
+ }
+
+ if (dest_snap_name.empty()) {
+ std::cerr << "rbd: destination snapshot name was not specified"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ r = utils::validate_snapshot_name(at::ARGUMENT_MODIFIER_DEST, dest_snap_name,
+ utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_SNAP);
+ if (r < 0) {
+ return r;
+ }
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.group_snap_rename(io_ctx, group_name.c_str(),
+ source_snap_name.c_str(), dest_snap_name.c_str());
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to rename group snapshot: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_group_snap_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string group_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+ Formatter *f = formatter.get();
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<librbd::group_snap_info_t> snaps;
+
+ r = rbd.group_snap_list(io_ctx, group_name.c_str(), &snaps,
+ sizeof(librbd::group_snap_info_t));
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ TextTable t;
+ if (f) {
+ f->open_array_section("group_snaps");
+ } else {
+ t.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ t.define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
+ }
+
+ for (auto i : snaps) {
+ std::string snap_name = i.name;
+ int state = i.state;
+ std::string state_string;
+ if (RBD_GROUP_SNAP_STATE_INCOMPLETE == state) {
+ state_string = "incomplete";
+ } else {
+ state_string = "ok";
+ }
+ if (r < 0) {
+ return r;
+ }
+ if (f) {
+ f->open_object_section("group_snap");
+ f->dump_string("snapshot", snap_name);
+ f->dump_string("state", state_string);
+ f->close_section();
+ } else {
+ t << snap_name << state_string << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (snaps.size()) {
+ std::cout << t;
+ }
+ return 0;
+}
+
+int execute_group_snap_rollback(const po::variables_map &vm,
+ const std::vector<std::string> &global_args) {
+ size_t arg_index = 0;
+
+ std::string group_name;
+ std::string namespace_name;
+ std::string pool_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_generic_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name,
+ &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx io_ctx;
+ librados::Rados rados;
+
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ utils::ProgressContext pc("Rolling back to group snapshot",
+ vm[at::NO_PROGRESS].as<bool>());
+ r = rbd.group_snap_rollback_with_progress(io_ctx, group_name.c_str(),
+ snap_name.c_str(), pc);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: rollback group to snapshot failed: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ pc.finish();
+ return 0;
+}
+
+void get_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+}
+
+void get_rename_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE,
+ false);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST,
+ false);
+}
+
+void get_add_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ (GROUP_SPEC.c_str(),
+ "group specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<group-name>)");
+
+ add_prefixed_pool_option(options, "group");
+ add_prefixed_namespace_option(options, "group");
+ add_group_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ positional->add_options()
+ (at::IMAGE_SPEC.c_str(),
+ "image specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<image-name>)");
+
+ add_prefixed_pool_option(options, "image");
+ add_prefixed_namespace_option(options, "image");
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE,
+ " unless overridden");
+}
+
+void get_remove_image_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ (GROUP_SPEC.c_str(),
+ "group specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<group-name>)");
+
+ add_prefixed_pool_option(options, "group");
+ add_prefixed_namespace_option(options, "group");
+ add_group_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ positional->add_options()
+ (at::IMAGE_SPEC.c_str(),
+ "image specification\n"
+ "(example: [<pool-name>/[<namespace>/]]<image-name>)");
+
+ add_prefixed_pool_option(options, "image");
+ add_prefixed_namespace_option(options, "image");
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE,
+ " unless overridden");
+ at::add_image_id_option(options);
+}
+
+void get_list_images_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_format_options(options);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_group_snap_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+}
+
+void get_group_snap_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+}
+
+void get_group_snap_rename_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+
+ positional->add_options()
+ (at::DEST_SNAPSHOT_NAME.c_str(),
+ "destination snapshot name\n(example: <snapshot-name>)");
+ at::add_snap_option(options, at::ARGUMENT_MODIFIER_DEST);
+}
+
+void get_group_snap_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_format_options(options);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ false);
+}
+
+void get_group_snap_rollback_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_no_progress_option(options);
+ add_group_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE,
+ true);
+}
+
+Shell::Action action_create(
+ {"group", "create"}, {}, "Create a group.",
+ "", &get_create_arguments, &execute_create);
+Shell::Action action_remove(
+ {"group", "remove"}, {"group", "rm"}, "Delete a group.",
+ "", &get_remove_arguments, &execute_remove);
+Shell::Action action_list(
+ {"group", "list"}, {"group", "ls"}, "List rbd groups.",
+ "", &get_list_arguments, &execute_list);
+Shell::Action action_rename(
+ {"group", "rename"}, {}, "Rename a group within pool.",
+ "", &get_rename_arguments, &execute_rename);
+Shell::Action action_add(
+ {"group", "image", "add"}, {}, "Add an image to a group.",
+ "", &get_add_arguments, &execute_add);
+Shell::Action action_remove_image(
+ {"group", "image", "remove"}, {"group", "image", "rm"},
+ "Remove an image from a group.", "",
+ &get_remove_image_arguments, &execute_remove_image);
+Shell::Action action_list_images(
+ {"group", "image", "list"}, {"group", "image", "ls"},
+ "List images in a group.", "",
+ &get_list_images_arguments, &execute_list_images);
+Shell::Action action_group_snap_create(
+ {"group", "snap", "create"}, {}, "Make a snapshot of a group.",
+ "", &get_group_snap_create_arguments, &execute_group_snap_create);
+Shell::Action action_group_snap_remove(
+ {"group", "snap", "remove"}, {"group", "snap", "rm"},
+ "Remove a snapshot from a group.",
+ "", &get_group_snap_remove_arguments, &execute_group_snap_remove);
+Shell::Action action_group_snap_rename(
+ {"group", "snap", "rename"}, {}, "Rename group's snapshot.",
+ "", &get_group_snap_rename_arguments, &execute_group_snap_rename);
+Shell::Action action_group_snap_list(
+ {"group", "snap", "list"}, {"group", "snap", "ls"},
+ "List snapshots of a group.",
+ "", &get_group_snap_list_arguments, &execute_group_snap_list);
+Shell::Action action_group_snap_rollback(
+ {"group", "snap", "rollback"}, {},
+ "Rollback group to snapshot.",
+ "", &get_group_snap_rollback_arguments, &execute_group_snap_rollback);
+
+} // namespace group
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ImageMeta.cc b/src/tools/rbd/action/ImageMeta.cc
new file mode 100644
index 00000000..20c4555d
--- /dev/null
+++ b/src/tools/rbd/action/ImageMeta.cc
@@ -0,0 +1,345 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace image_meta {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+void add_key_option(po::options_description *positional) {
+ positional->add_options()
+ ("key", "image meta key");
+}
+
+int get_key(const po::variables_map &vm, size_t *arg_index,
+ std::string *key) {
+ *key = utils::get_positional_argument(vm, *arg_index);
+ if (key->empty()) {
+ std::cerr << "rbd: metadata key was not specified" << std::endl;
+ return -EINVAL;
+ } else {
+ ++(*arg_index);
+ }
+ return 0;
+}
+
+const uint32_t MAX_KEYS = 64;
+
+} // anonymous namespace
+
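+// Lists image metadata in pages of MAX_KEYS entries, resuming each call from
+// the last key seen, and prints either a plain-text table or formatter output.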
+static int do_metadata_list(librbd::Image& image, Formatter *f)
+{
+ int r;
+ TextTable tbl;
+
+ size_t count = 0;
+ std::string last_key;
+ bool more_results = true;
+ while (more_results) {
+ std::map<std::string, bufferlist> pairs;
+ r = image.metadata_list(last_key, MAX_KEYS, &pairs);
+ if (r < 0) {
+ std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ more_results = (pairs.size() == MAX_KEYS);
+ if (!pairs.empty()) {
+ if (count == 0) {
+ if (f) {
+ f->open_object_section("metadatas");
+ } else {
+ tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+ }
+ }
+
+ last_key = pairs.rbegin()->first;
+ count += pairs.size();
+
+ for (auto kv : pairs) {
+ std::string val(kv.second.c_str(), kv.second.length());
+ if (f) {
+ f->dump_string(kv.first.c_str(), val.c_str());
+ } else {
+ tbl << kv.first << val << TextTable::endrow;
+ }
+ }
+ }
+ }
+
+ if (f == nullptr) {
+ bool single = (count == 1);
+ std::cout << "There " << (single ? "is" : "are") << " " << count << " "
+ << (single ? "metadatum" : "metadata") << " on this image"
+ << (count == 0 ? "." : ":") << std::endl;
+ }
+
+ if (count > 0) {
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << std::endl << tbl;
+ }
+ }
+ return 0;
+}
+
+static int do_metadata_set(librbd::Image& image, std::string &key,
+ std::string &value)
+{
+ int r = image.metadata_set(key, value);
+ if (r < 0) {
+ std::cerr << "failed to set metadata " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ }
+ return r;
+}
+
+static int do_metadata_remove(librbd::Image& image, std::string &key)
+{
+ int r = image.metadata_remove(key);
+ if (r == -ENOENT) {
+ std::cerr << "rbd: no existing metadata key " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ } else if(r < 0) {
+ std::cerr << "failed to remove metadata " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ }
+ return r;
+}
+
+static int do_metadata_get(librbd::Image& image, std::string &key)
+{
+ std::string s;
+ int r = image.metadata_get(key, &s);
+ if (r < 0) {
+ std::cerr << "failed to get metadata " << key << " of image : "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ std::cout << s << std::endl;
+ return r;
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_list(image, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: listing metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+}
+
+int execute_get(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_get(image, key);
+ if (r < 0) {
+ std::cerr << "rbd: getting metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+ positional->add_options()
+ ("value", "image meta value");
+}
+
+int execute_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string value = utils::get_positional_argument(vm, arg_index);
+ if (value.empty()) {
+ std::cerr << "rbd: metadata value was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_set(image, key, value);
+ if (r < 0) {
+ std::cerr << "rbd: setting metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_key_option(positional);
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string key;
+ r = get_key(vm, &arg_index, &key);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_metadata_remove(image, key);
+ if (r < 0) {
+ std::cerr << "rbd: removing metadata failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_list(
+ {"image-meta", "list"}, {"image-meta", "ls"}, "Image metadata list keys with values.", "",
+ &get_list_arguments, &execute_list);
+Shell::Action action_get(
+ {"image-meta", "get"}, {},
+ "Image metadata get the value associated with the key.", "",
+ &get_get_arguments, &execute_get);
+Shell::Action action_set(
+ {"image-meta", "set"}, {}, "Image metadata set key with value.", "",
+ &get_set_arguments, &execute_set);
+Shell::Action action_remove(
+ {"image-meta", "remove"}, {"image-meta", "rm"},
+ "Image metadata remove the key and value associated.", "",
+ &get_remove_arguments, &execute_remove);
+
+} // namespace image_meta
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Import.cc b/src/tools/rbd/action/Import.cc
new file mode 100644
index 00000000..7397d926
--- /dev/null
+++ b/src/tools/rbd/action/Import.cc
@@ -0,0 +1,1037 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "common/blkdev.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "include/compat.h"
+#include "include/encoding.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/scoped_ptr.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+
+namespace rbd {
+namespace action {
+namespace import {
+
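+// Shared state for one import-diff run: the destination image, the input fd,
+// a progress context and an ordered throttle bounding in-flight writes (a
+// single op when reading from stdin).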
+struct ImportDiffContext {
+ librbd::Image *image;
+ int fd;
+ size_t size;
+ utils::ProgressContext pc;
+ OrderedThrottle throttle;
+ uint64_t last_offset;
+
+ ImportDiffContext(librbd::Image *image, int fd, size_t size, bool no_progress)
+ : image(image), fd(fd), size(size), pc("Importing image diff", no_progress),
+ throttle((fd == STDIN_FILENO) ? 1 :
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ false),
+ last_offset(0) {
+ }
+
+ void update_size(size_t new_size)
+ {
+ if (fd == STDIN_FILENO) {
+ size = new_size;
+ }
+ }
+
+ void update_progress(uint64_t off)
+ {
+ if (size) {
+ pc.update_progress(off, size);
+ last_offset = off;
+ }
+ }
+
+ void update_progress()
+ {
+ uint64_t off = last_offset;
+ if (fd != STDIN_FILENO) {
+ off = lseek(fd, 0, SEEK_CUR);
+ }
+
+ update_progress(off);
+ }
+
+ void finish(int r)
+ {
+ if (r < 0) {
+ pc.fail();
+ } else {
+ pc.finish();
+ }
+ }
+};
+
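+// A single asynchronous write (or write-zeroes) extent from the diff stream,
+// completed through the ordered throttle so progress is reported in submit
+// order.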
+class C_ImportDiff : public Context {
+public:
+ C_ImportDiff(ImportDiffContext *idiffctx, bufferlist data, uint64_t offset,
+ uint64_t length, bool write_zeroes)
+ : m_idiffctx(idiffctx), m_data(data), m_offset(offset), m_length(length),
+ m_write_zeroes(write_zeroes) {
+ // use block offset (stdin) or import file position to report
+ // progress.
+ if (m_idiffctx->fd == STDIN_FILENO) {
+ m_prog_offset = offset;
+ } else {
+ m_prog_offset = lseek(m_idiffctx->fd, 0, SEEK_CUR);
+ }
+ }
+
+ int send()
+ {
+ if (m_idiffctx->throttle.pending_error()) {
+ return m_idiffctx->throttle.wait_for_ret();
+ }
+
+ C_OrderedThrottle *ctx = m_idiffctx->throttle.start_op(this);
+ librbd::RBD::AioCompletion *aio_completion =
+ new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback);
+
+ int r;
+ if (m_write_zeroes) {
+ r = m_idiffctx->image->aio_write_zeroes(m_offset, m_length,
+ aio_completion, 0U,
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ } else {
+ r = m_idiffctx->image->aio_write2(m_offset, m_length, m_data,
+ aio_completion,
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+ }
+
+ if (r < 0) {
+ aio_completion->release();
+ ctx->complete(r);
+ }
+
+ return r;
+ }
+
+ void finish(int r) override
+ {
+ m_idiffctx->update_progress(m_prog_offset);
+ m_idiffctx->throttle.end_op(r);
+ }
+
+private:
+ ImportDiffContext *m_idiffctx;
+ bufferlist m_data;
+ uint64_t m_offset;
+ uint64_t m_length;
+ bool m_write_zeroes;
+ uint64_t m_prog_offset;
+};
+
+static int do_image_snap_from(ImportDiffContext *idiffctx)
+{
+ int r;
+ string from;
+ r = utils::read_string(idiffctx->fd, 4096, &from); // 4k limit to make sure we don't get a garbage string
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode start snap name" << std::endl;
+ return r;
+ }
+
+ bool exists;
+ r = idiffctx->image->snap_exists2(from.c_str(), &exists);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query start snap state" << std::endl;
+ return r;
+ }
+
+ if (!exists) {
+ std::cerr << "start snapshot '" << from
+ << "' does not exist in the image, aborting" << std::endl;
+ return -EINVAL;
+ }
+
+ idiffctx->update_progress();
+ return 0;
+}
+
+static int do_image_snap_to(ImportDiffContext *idiffctx, std::string *tosnap)
+{
+ int r;
+ string to;
+ r = utils::read_string(idiffctx->fd, 4096, &to); // 4k limit to make sure we don't get a garbage string
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode end snap name" << std::endl;
+ return r;
+ }
+
+ bool exists;
+ r = idiffctx->image->snap_exists2(to.c_str(), &exists);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query end snap state" << std::endl;
+ return r;
+ }
+
+ if (exists) {
+ std::cerr << "end snapshot '" << to << "' already exists, aborting"
+ << std::endl;
+ return -EEXIST;
+ }
+
+ *tosnap = to;
+ idiffctx->update_progress();
+
+ return 0;
+}
+
+static int get_snap_protection_status(ImportDiffContext *idiffctx,
+ bool *is_protected)
+{
+ int r;
+ char buf[sizeof(__u8)];
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode snap protection status" << std::endl;
+ return r;
+ }
+
+ *is_protected = (buf[0] != 0);
+ idiffctx->update_progress();
+
+ return 0;
+}
+
+static int do_image_resize(ImportDiffContext *idiffctx)
+{
+ int r;
+ char buf[sizeof(uint64_t)];
+ uint64_t end_size;
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode image size" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+ decode(end_size, p);
+
+ uint64_t cur_size;
+ idiffctx->image->size(&cur_size);
+ if (cur_size != end_size) {
+ idiffctx->image->resize(end_size);
+ }
+
+ idiffctx->update_size(end_size);
+ idiffctx->update_progress();
+ return 0;
+}
+
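+// Decodes an <offset, length> record and either queues one write-zeroes for
+// the whole extent or splits the payload into sparse sub-extents before
+// queuing the writes.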
+static int do_image_io(ImportDiffContext *idiffctx, bool write_zeroes,
+ size_t sparse_size)
+{
+ int r;
+ char buf[16];
+ r = safe_read_exact(idiffctx->fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode IO length" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+
+ uint64_t image_offset, buffer_length;
+ decode(image_offset, p);
+ decode(buffer_length, p);
+
+ if (!write_zeroes) {
+ bufferptr bp = buffer::create(buffer_length);
+ r = safe_read_exact(idiffctx->fd, bp.c_str(), buffer_length);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode write data" << std::endl;
+ return r;
+ }
+
+ size_t buffer_offset = 0;
+ while (buffer_offset < buffer_length) {
+ size_t write_length = 0;
+ bool zeroed = false;
+ utils::calc_sparse_extent(bp, sparse_size, buffer_offset, buffer_length,
+ &write_length, &zeroed);
+ ceph_assert(write_length > 0);
+
+ bufferlist write_bl;
+ if (!zeroed) {
+ bufferptr write_ptr(bp, buffer_offset, write_length);
+ write_bl.push_back(write_ptr);
+ ceph_assert(write_bl.length() == write_length);
+ }
+
+ C_ImportDiff *ctx = new C_ImportDiff(idiffctx, write_bl,
+ image_offset + buffer_offset,
+ write_length, zeroed);
+ r = ctx->send();
+ if (r < 0) {
+ return r;
+ }
+
+ buffer_offset += write_length;
+ }
+ } else {
+ bufferlist data;
+ C_ImportDiff *ctx = new C_ImportDiff(idiffctx, data, image_offset,
+ buffer_length, true);
+ return ctx->send();
+ }
+ return r;
+}
+
+static int validate_banner(int fd, std::string banner)
+{
+ int r;
+ char buf[banner.size() + 1];
+ memset(buf, 0, sizeof(buf));
+ r = safe_read_exact(fd, buf, banner.size());
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode diff banner" << std::endl;
+ return r;
+ }
+
+ buf[banner.size()] = '\0';
+ if (strcmp(buf, banner.c_str())) {
+ std::cerr << "rbd: invalid or unexpected diff banner" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
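+// Skips an unrecognized tag's payload: read and discard when the input is
+// stdin (which cannot seek), otherwise lseek past it.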
+static int skip_tag(int fd, uint64_t length)
+{
+ int r;
+
+ if (fd == STDIN_FILENO) {
+ // read the appending data out to skip this tag.
+ char buf[4096];
+ uint64_t len = min<uint64_t>(length, sizeof(buf));
+ while (len > 0) {
+ r = safe_read_exact(fd, buf, len);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode skipped tag data" << std::endl;
+ return r;
+ }
+ length -= len;
+ len = min<uint64_t>(length, sizeof(buf));
+ }
+ } else {
+ // lseek to skip this tag
+ off64_t offs = lseek64(fd, length, SEEK_CUR);
+ if (offs < 0) {
+ return -errno;
+ }
+ }
+
+ return 0;
+}
+
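+// Reads the next tag byte; in the v2 format every tag other than the end
+// marker is followed by an encoded payload length.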
+static int read_tag(int fd, __u8 end_tag, int format, __u8 *tag, uint64_t *readlen)
+{
+ int r;
+ __u8 read_tag;
+
+ r = safe_read_exact(fd, &read_tag, sizeof(read_tag));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode tag" << std::endl;
+ return r;
+ }
+
+ *tag = read_tag;
+ if (read_tag != end_tag && format == 2) {
+ char buf[sizeof(uint64_t)];
+ r = safe_read_exact(fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode tag length" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+ decode(*readlen, p);
+ }
+
+ return 0;
+}
+
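+// Core import-diff loop: validates the banner, dispatches each tag (from/to
+// snapshot, protection status, resize, write, zero), waits for the queued
+// writes, then creates and optionally protects the end snapshot.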
+int do_import_diff_fd(librados::Rados &rados, librbd::Image &image, int fd,
+ bool no_progress, int format, size_t sparse_size)
+{
+ int r;
+
+ uint64_t size = 0;
+ bool from_stdin = (fd == STDIN_FILENO);
+ if (!from_stdin) {
+ struct stat stat_buf;
+ r = ::fstat(fd, &stat_buf);
+ if (r < 0) {
+ std::cerr << "rbd: failed to stat specified diff file" << std::endl;
+ return r;
+ }
+ size = (uint64_t)stat_buf.st_size;
+ }
+
+ r = validate_banner(fd, (format == 1 ? utils::RBD_DIFF_BANNER :
+ utils::RBD_DIFF_BANNER_V2));
+ if (r < 0) {
+ return r;
+ }
+
+ // begin image import
+ std::string tosnap;
+ bool is_protected = false;
+ ImportDiffContext idiffctx(&image, fd, size, no_progress);
+ while (r == 0) {
+ __u8 tag;
+ uint64_t length = 0;
+
+ r = read_tag(fd, RBD_DIFF_END, format, &tag, &length);
+ if (r < 0 || tag == RBD_DIFF_END) {
+ break;
+ }
+
+ if (tag == RBD_DIFF_FROM_SNAP) {
+ r = do_image_snap_from(&idiffctx);
+ } else if (tag == RBD_DIFF_TO_SNAP) {
+ r = do_image_snap_to(&idiffctx, &tosnap);
+ } else if (tag == RBD_SNAP_PROTECTION_STATUS) {
+ r = get_snap_protection_status(&idiffctx, &is_protected);
+ } else if (tag == RBD_DIFF_IMAGE_SIZE) {
+ r = do_image_resize(&idiffctx);
+ } else if (tag == RBD_DIFF_WRITE || tag == RBD_DIFF_ZERO) {
+ r = do_image_io(&idiffctx, (tag == RBD_DIFF_ZERO), sparse_size);
+ } else {
+ std::cerr << "unrecognized tag byte " << (int)tag << " in stream; skipping"
+ << std::endl;
+ r = skip_tag(fd, length);
+ }
+ }
+
+ int temp_r = idiffctx.throttle.wait_for_ret();
+ r = (r < 0) ? r : temp_r; // preserve original error
+ if (r == 0 && tosnap.length()) {
+ r = idiffctx.image->snap_create(tosnap.c_str());
+ if (r == 0 && is_protected) {
+ r = idiffctx.image->snap_protect(tosnap.c_str());
+ }
+ }
+
+ idiffctx.finish(r);
+ return r;
+}
+
+int do_import_diff(librados::Rados &rados, librbd::Image &image,
+ const char *path, bool no_progress, size_t sparse_size)
+{
+ int r;
+ int fd;
+
+ if (strcmp(path, "-") == 0) {
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(path, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << path << std::endl;
+ return r;
+ }
+ }
+ r = do_import_diff_fd(rados, image, fd, no_progress, 1, sparse_size);
+
+  if (fd != STDIN_FILENO)
+ close(fd);
+ return r;
+}
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments_diff(po::options_description *positional,
+ po::options_description *options) {
+ at::add_path_options(positional, options,
+ "import file (or '-' for stdin)");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_sparse_size_option(options);
+ at::add_no_progress_option(options);
+}
+
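+// Handles `rbd import-diff`, e.g. `rbd import-diff mydiff.bin mypool/myimage`
+// or `rbd import-diff - mypool/myimage` to read the diff from stdin (file and
+// image names are illustrative).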
+int execute_diff(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string path;
+ size_t arg_index = 0;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+ if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+ sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_import_diff(rados, image, path.c_str(),
+ vm[at::NO_PROGRESS].as<bool>(), sparse_size);
+ if (r == -EDOM) {
+ r = -EBADMSG;
+ }
+ if (r < 0) {
+ cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_diff(
+ {"import-diff"}, {}, "Import an incremental diff.", "", &get_arguments_diff,
+ &execute_diff);
+
+class C_Import : public Context {
+public:
+ C_Import(SimpleThrottle &simple_throttle, librbd::Image &image,
+ bufferlist &bl, uint64_t offset)
+ : m_throttle(simple_throttle), m_image(image),
+ m_aio_completion(
+ new librbd::RBD::AioCompletion(this, &utils::aio_context_callback)),
+ m_bufferlist(bl), m_offset(offset)
+ {
+ }
+
+ void send()
+ {
+ m_throttle.start_op();
+
+ int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
+ m_aio_completion, op_flags);
+ if (r < 0) {
+ std::cerr << "rbd: error requesting write to destination image"
+ << std::endl;
+ m_aio_completion->release();
+ m_throttle.end_op(r);
+ }
+ }
+
+ void finish(int r) override
+ {
+ if (r < 0) {
+ std::cerr << "rbd: error writing to destination image at offset "
+ << m_offset << ": " << cpp_strerror(r) << std::endl;
+ }
+ m_throttle.end_op(r);
+ }
+
+private:
+ SimpleThrottle &m_throttle;
+ librbd::Image &m_image;
+ librbd::RBD::AioCompletion *m_aio_completion;
+ bufferlist m_bufferlist;
+ uint64_t m_offset;
+};
+
+static int decode_and_set_image_option(int fd, uint64_t imageopt, librbd::ImageOptions& opts)
+{
+ int r;
+ char buf[sizeof(uint64_t)];
+
+ r = safe_read_exact(fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode image option" << std::endl;
+ return r;
+ }
+
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto it = bl.cbegin();
+
+ uint64_t val;
+ decode(val, it);
+
+ if (opts.get(imageopt, &val) != 0) {
+ opts.set(imageopt, val);
+ }
+
+ return 0;
+}
+
+static int do_import_metadata(int import_format, librbd::Image& image,
+ const std::map<std::string, std::string> &imagemetas)
+{
+ int r = 0;
+
+  // Nothing to import for the v1 format (its stream carries no image-meta).
+ if (import_format == 1) {
+ return 0;
+ }
+
+ for (std::map<std::string, std::string>::const_iterator it = imagemetas.begin();
+ it != imagemetas.end(); ++it) {
+ r = image.metadata_set(it->first, it->second);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+static int decode_imagemeta(int fd, uint64_t length, std::map<std::string, std::string>* imagemetas)
+{
+ int r;
+ string key;
+ string value;
+
+ r = utils::read_string(fd, length, &key);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode metadata key" << std::endl;
+ return r;
+ }
+
+ r = utils::read_string(fd, length, &value);
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode metadata value" << std::endl;
+ return r;
+ }
+
+ (*imagemetas)[key] = value;
+ return 0;
+}
+
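+// Parses the v2 export header: image options (order, features, striping) and
+// image-meta key/value pairs. v1 streams have no header.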
+static int do_import_header(int fd, int import_format, librbd::ImageOptions& opts,
+ std::map<std::string, std::string>* imagemetas)
+{
+  // A v1 export stream has no header.
+ if (import_format == 1) {
+ return 0;
+ }
+
+ int r;
+ r = validate_banner(fd, utils::RBD_IMAGE_BANNER_V2);
+ if (r < 0) {
+ return r;
+ }
+
+  // The v1 image format is deprecated, so default to creating a v2 image on import.
+ uint64_t image_format = 2;
+ if (opts.get(RBD_IMAGE_OPTION_FORMAT, &image_format) != 0) {
+ opts.set(RBD_IMAGE_OPTION_FORMAT, image_format);
+ }
+
+ while (r == 0) {
+ __u8 tag;
+ uint64_t length = 0;
+ r = read_tag(fd, RBD_EXPORT_IMAGE_END, image_format, &tag, &length);
+ if (r < 0 || tag == RBD_EXPORT_IMAGE_END) {
+ break;
+ }
+
+ if (tag == RBD_EXPORT_IMAGE_ORDER) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_ORDER, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_FEATURES) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_FEATURES, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_STRIPE_UNIT) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_UNIT, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_STRIPE_COUNT) {
+ r = decode_and_set_image_option(fd, RBD_IMAGE_OPTION_STRIPE_COUNT, opts);
+ } else if (tag == RBD_EXPORT_IMAGE_META) {
+ r = decode_imagemeta(fd, length, imagemetas);
+ } else {
+ std::cerr << "rbd: invalid tag in image properties zone: " << tag << "Skip it."
+ << std::endl;
+ r = skip_tag(fd, length);
+ }
+ }
+
+ return r;
+}
+
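+// v2 import: validates the diffs banner, reads the diff count and replays
+// each diff through do_import_diff_fd.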
+static int do_import_v2(librados::Rados &rados, int fd, librbd::Image &image,
+ uint64_t size, size_t imgblklen,
+ utils::ProgressContext &pc, size_t sparse_size)
+{
+ int r = 0;
+ r = validate_banner(fd, utils::RBD_IMAGE_DIFFS_BANNER_V2);
+ if (r < 0) {
+ return r;
+ }
+
+ char buf[sizeof(uint64_t)];
+ r = safe_read_exact(fd, buf, sizeof(buf));
+ if (r < 0) {
+ std::cerr << "rbd: failed to decode diff count" << std::endl;
+ return r;
+ }
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+ auto p = bl.cbegin();
+ uint64_t diff_num;
+ decode(diff_num, p);
+ for (size_t i = 0; i < diff_num; i++) {
+ r = do_import_diff_fd(rados, image, fd, true, 2, sparse_size);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ pc.update_progress(i + 1, diff_num);
+ }
+
+ return r;
+}
+
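+// v1 import: reads the source in order-sized blocks, skips all-zero extents
+// to keep the destination sparse, and (for stdin) grows the image by doubling
+// as data arrives before trimming it to the final size.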
+static int do_import_v1(int fd, librbd::Image &image, uint64_t size,
+ size_t imgblklen, utils::ProgressContext &pc,
+ size_t sparse_size)
+{
+ int r = 0;
+ size_t reqlen = imgblklen; // amount requested from read
+ ssize_t readlen; // amount received from one read
+ size_t blklen = 0; // amount accumulated from reads to fill blk
+ char *p = new char[imgblklen];
+ uint64_t image_pos = 0;
+ bool from_stdin = (fd == STDIN_FILENO);
+ boost::scoped_ptr<SimpleThrottle> throttle;
+
+ if (from_stdin) {
+ throttle.reset(new SimpleThrottle(1, false));
+ } else {
+ throttle.reset(new SimpleThrottle(
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"), false));
+ }
+
+ reqlen = min<uint64_t>(reqlen, size);
+ // loop body handles 0 return, as we may have a block to flush
+ while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) {
+ if (throttle->pending_error()) {
+ break;
+ }
+
+ blklen += readlen;
+ // if read was short, try again to fill the block before writing
+ if (readlen && ((size_t)readlen < reqlen)) {
+ reqlen -= readlen;
+ continue;
+ }
+ if (!from_stdin)
+ pc.update_progress(image_pos, size);
+
+ bufferptr blkptr(p, blklen);
+ // resize output image by binary expansion as we go for stdin
+ if (from_stdin && (image_pos + (size_t)blklen) > size) {
+ size *= 2;
+ r = image.resize(size);
+ if (r < 0) {
+ std::cerr << "rbd: can't resize image during import" << std::endl;
+ goto out;
+ }
+ }
+
+ // write as much as we got; perhaps less than imgblklen
+ // but skip writing zeros to create sparse images
+ size_t buffer_offset = 0;
+ while (buffer_offset < blklen) {
+ size_t write_length = 0;
+ bool zeroed = false;
+ utils::calc_sparse_extent(blkptr, sparse_size, buffer_offset, blklen,
+ &write_length, &zeroed);
+
+ if (!zeroed) {
+ bufferlist write_bl;
+ bufferptr write_ptr(blkptr, buffer_offset, write_length);
+ write_bl.push_back(write_ptr);
+ ceph_assert(write_bl.length() == write_length);
+
+ C_Import *ctx = new C_Import(*throttle, image, write_bl,
+ image_pos + buffer_offset);
+ ctx->send();
+ }
+
+ buffer_offset += write_length;
+ }
+
+ // done with whole block, whether written or not
+ image_pos += blklen;
+ if (!from_stdin && image_pos >= size)
+ break;
+ // if read had returned 0, we're at EOF and should quit
+ if (readlen == 0)
+ break;
+ blklen = 0;
+ reqlen = imgblklen;
+ }
+ r = throttle->wait_for_ret();
+ if (r < 0) {
+ goto out;
+ }
+
+ if (fd == STDIN_FILENO) {
+ r = image.resize(image_pos);
+ if (r < 0) {
+ std::cerr << "rbd: final image resize failed" << std::endl;
+ goto out;
+ }
+ }
+out:
+ delete[] p;
+ return r;
+}
+
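+// Top-level import: sizes the source, parses the optional v2 header, creates
+// and opens the destination image, applies image-meta, then copies the data
+// in v1 or v2 mode; the new image is removed again if the import fails.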
+static int do_import(librados::Rados &rados, librbd::RBD &rbd,
+ librados::IoCtx& io_ctx, const char *imgname,
+ const char *path, librbd::ImageOptions& opts,
+ bool no_progress, int import_format, size_t sparse_size)
+{
+ int fd, r;
+ struct stat stat_buf;
+ utils::ProgressContext pc("Importing image", no_progress);
+ std::map<std::string, std::string> imagemetas;
+
+ ceph_assert(imgname);
+
+ uint64_t order;
+ if (opts.get(RBD_IMAGE_OPTION_ORDER, &order) != 0) {
+ order = g_conf().get_val<uint64_t>("rbd_default_order");
+ }
+
+ // try to fill whole imgblklen blocks for sparsification
+ size_t imgblklen = 1 << order;
+ librbd::Image image;
+ uint64_t size = 0;
+
+ bool from_stdin = !strcmp(path, "-");
+ if (from_stdin) {
+ fd = STDIN_FILENO;
+ size = 1ULL << order;
+ } else {
+ if ((fd = open(path, O_RDONLY)) < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << path << std::endl;
+ goto done2;
+ }
+
+ if ((fstat(fd, &stat_buf)) < 0) {
+ r = -errno;
+ std::cerr << "rbd: stat error " << path << std::endl;
+ goto done;
+ }
+ if (S_ISDIR(stat_buf.st_mode)) {
+ r = -EISDIR;
+ std::cerr << "rbd: cannot import a directory" << std::endl;
+ goto done;
+ }
+ if (stat_buf.st_size)
+ size = (uint64_t)stat_buf.st_size;
+
+ if (!size) {
+ int64_t bdev_size = 0;
+ BlkDev blkdev(fd);
+ r = blkdev.get_size(&bdev_size);
+ if (r < 0) {
+ std::cerr << "rbd: unable to get size of file/block device"
+ << std::endl;
+ goto done;
+ }
+ ceph_assert(bdev_size >= 0);
+ size = (uint64_t) bdev_size;
+ }
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+ }
+
+ r = do_import_header(fd, import_format, opts, &imagemetas);
+ if (r < 0) {
+ std::cerr << "rbd: import header failed." << std::endl;
+ goto done;
+ }
+
+ r = rbd.create4(io_ctx, imgname, size, opts);
+ if (r < 0) {
+ std::cerr << "rbd: image creation failed" << std::endl;
+ goto done;
+ }
+
+ r = rbd.open(io_ctx, image, imgname);
+ if (r < 0) {
+ std::cerr << "rbd: failed to open image" << std::endl;
+ goto err;
+ }
+
+ r = do_import_metadata(import_format, image, imagemetas);
+ if (r < 0) {
+ std::cerr << "rbd: failed to import image-meta" << std::endl;
+ goto err;
+ }
+
+ if (import_format == 1) {
+ r = do_import_v1(fd, image, size, imgblklen, pc, sparse_size);
+ } else {
+ r = do_import_v2(rados, fd, image, size, imgblklen, pc, sparse_size);
+ }
+ if (r < 0) {
+ std::cerr << "rbd: failed to import image" << std::endl;
+ image.close();
+ goto err;
+ }
+
+ r = image.close();
+err:
+ if (r < 0)
+ rbd.remove(io_ctx, imgname);
+done:
+ if (r < 0)
+ pc.fail();
+ else
+ pc.finish();
+ if (!from_stdin)
+ close(fd);
+done2:
+ return r;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_path_options(positional, options,
+ "import file (or '-' for stdin)");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_create_image_options(options, true);
+ at::add_sparse_size_option(options);
+ at::add_no_progress_option(options);
+ at::add_export_format_option(options);
+
+ // TODO legacy rbd allowed import to accept both 'image'/'dest' and
+ // 'pool'/'dest-pool'
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, " (deprecated)");
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, " (deprecated)");
+}
+
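+// Handles `rbd import`, e.g. `rbd import ./image.raw mypool/myimage` (file
+// and image names are illustrative); when no destination image is given the
+// name defaults to the source file's basename.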
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string path;
+ size_t arg_index = 0;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ // odd check to support legacy / deprecated behavior of import
+ std::string deprecated_pool_name;
+ if (vm.count(at::POOL_NAME)) {
+ deprecated_pool_name = vm[at::POOL_NAME].as<std::string>();
+ std::cerr << "rbd: --pool is deprecated for import, use --dest-pool"
+ << std::endl;
+ }
+
+ std::string deprecated_image_name;
+ if (vm.count(at::IMAGE_NAME)) {
+ deprecated_image_name = vm[at::IMAGE_NAME].as<std::string>();
+ std::cerr << "rbd: --image is deprecated for import, use --dest"
+ << std::endl;
+ } else {
+ deprecated_image_name = path.substr(path.find_last_of("/") + 1);
+ }
+
+ std::string deprecated_snap_name;
+ r = utils::extract_spec(deprecated_image_name, &deprecated_pool_name,
+ nullptr, &deprecated_image_name,
+ &deprecated_snap_name, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+ if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+ sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+ }
+
+ std::string pool_name = deprecated_pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name = deprecated_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, false, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (image_name.empty()) {
+ image_name = deprecated_image_name;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, true, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ int format = 1;
+ if (vm.count("export-format"))
+ format = vm["export-format"].as<uint64_t>();
+
+ librbd::RBD rbd;
+ r = do_import(rados, rbd, io_ctx, image_name.c_str(), path.c_str(),
+ opts, vm[at::NO_PROGRESS].as<bool>(), format, sparse_size);
+ if (r < 0) {
+ std::cerr << "rbd: import failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+Shell::Action action(
+ {"import"}, {}, "Import image from file.", at::get_long_features_help(),
+ &get_arguments, &execute);
+
+} // namespace import
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Info.cc b/src/tools/rbd/action/Info.cc
new file mode 100644
index 00000000..5adacb92
--- /dev/null
+++ b/src/tools/rbd/action/Info.cc
@@ -0,0 +1,459 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#include "common/Clock.h"
+
+namespace rbd {
+namespace action {
+namespace info {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static void format_bitmask(Formatter *f, const std::string &name,
+ const std::map<uint64_t, std::string>& mapping,
+ uint64_t bitmask)
+{
+ int count = 0;
+ std::string group_name(name + "s");
+ if (f == NULL) {
+ std::cout << "\t" << group_name << ": ";
+ } else {
+ f->open_array_section(group_name.c_str());
+ }
+ for (std::map<uint64_t, std::string>::const_iterator it = mapping.begin();
+ it != mapping.end(); ++it) {
+ if ((it->first & bitmask) == 0) {
+ continue;
+ }
+
+ if (f == NULL) {
+ if (count++ > 0) {
+ std::cout << ", ";
+ }
+ std::cout << it->second;
+ } else {
+ f->dump_string(name.c_str(), it->second);
+ }
+ }
+ if (f == NULL) {
+ std::cout << std::endl;
+ } else {
+ f->close_section();
+ }
+}
+
+static void format_features(Formatter *f, uint64_t features)
+{
+ format_bitmask(f, "feature", at::ImageFeatures::FEATURE_MAPPING, features);
+}
+
+static void format_op_features(Formatter *f, uint64_t op_features)
+{
+ static std::map<uint64_t, std::string> mapping = {
+ {RBD_OPERATION_FEATURE_CLONE_PARENT, RBD_OPERATION_FEATURE_NAME_CLONE_PARENT},
+ {RBD_OPERATION_FEATURE_CLONE_CHILD, RBD_OPERATION_FEATURE_NAME_CLONE_CHILD},
+ {RBD_OPERATION_FEATURE_GROUP, RBD_OPERATION_FEATURE_NAME_GROUP},
+ {RBD_OPERATION_FEATURE_SNAP_TRASH, RBD_OPERATION_FEATURE_NAME_SNAP_TRASH}};
+ format_bitmask(f, "op_feature", mapping, op_features);
+}
+
+static void format_flags(Formatter *f, uint64_t flags)
+{
+ std::map<uint64_t, std::string> mapping = {
+ {RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid"},
+ {RBD_FLAG_FAST_DIFF_INVALID, "fast diff invalid"}};
+ format_bitmask(f, "flag", mapping, flags);
+}
+
+void format_timestamp(struct timespec timestamp, std::string &timestamp_str) {
+  if (timestamp.tv_sec != 0) {
+ time_t ts = timestamp.tv_sec;
+ timestamp_str = ctime(&ts);
+ timestamp_str = timestamp_str.substr(0, timestamp_str.length() - 1);
+ }
+}
+
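+// gather the image's metadata (size, features, parent, group, mirroring and
+// timestamps) and print it either as plain text or via the supplied Formatter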
+static int do_show_info(librados::IoCtx &io_ctx, librbd::Image& image,
+ const std::string &snapname, Formatter *f)
+{
+ librbd::image_info_t info;
+ uint8_t old_format;
+ uint64_t overlap, features, flags, snap_limit;
+ bool snap_protected = false;
+ librbd::mirror_image_info_t mirror_image;
+ std::vector<librbd::snap_info_t> snaps;
+ int r;
+
+ std::string imgname;
+ r = image.get_name(&imgname);
+ if (r < 0)
+ return r;
+
+ r = image.snap_list(snaps);
+ if (r < 0)
+ return r;
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ return r;
+
+ r = image.old_format(&old_format);
+ if (r < 0)
+ return r;
+
+ std::string imgid;
+ if (!old_format) {
+ r = image.get_id(&imgid);
+ if (r < 0)
+ return r;
+ }
+
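+  // if the image uses a separate data pool, resolve its name for display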
+ std::string data_pool;
+ if (!old_format) {
+ int64_t data_pool_id = image.get_data_pool_id();
+ if (data_pool_id != io_ctx.get_id()) {
+ librados::Rados rados(io_ctx);
+ librados::IoCtx data_io_ctx;
+ r = rados.ioctx_create2(data_pool_id, data_io_ctx);
+ if (r < 0) {
+ data_pool = "<missing data pool " + stringify(data_pool_id) + ">";
+ } else {
+ data_pool = data_io_ctx.get_pool_name();
+ }
+ }
+ }
+
+ r = image.overlap(&overlap);
+ if (r < 0)
+ return r;
+
+ r = image.features(&features);
+ if (r < 0)
+ return r;
+
+ uint64_t op_features;
+ r = image.get_op_features(&op_features);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.get_flags(&flags);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!snapname.empty()) {
+ r = image.snap_is_protected(snapname.c_str(), &snap_protected);
+ if (r < 0)
+ return r;
+ }
+
+ if (features & RBD_FEATURE_JOURNALING) {
+ r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image));
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ r = image.snap_get_limit(&snap_limit);
+ if (r < 0)
+ return r;
+
+ std::string prefix = image.get_block_name_prefix();
+
+ librbd::group_info_t group_info;
+ r = image.get_group(&group_info, sizeof(group_info));
+ if (r < 0) {
+ return r;
+ }
+
+ std::string group_string = "";
+ if (RBD_GROUP_INVALID_POOL != group_info.pool) {
+ std::string group_pool;
+ librados::Rados rados(io_ctx);
+ librados::IoCtx group_io_ctx;
+ r = rados.ioctx_create2(group_info.pool, group_io_ctx);
+ if (r < 0) {
+ group_pool = "<missing group pool " + stringify(group_info.pool) + ">";
+ } else {
+ group_pool = group_io_ctx.get_pool_name();
+ }
+
+ group_string = group_pool + "/";
+ if (!io_ctx.get_namespace().empty()) {
+ group_string += io_ctx.get_namespace() + "/";
+ }
+ group_string += group_info.name;
+ }
+
+ struct timespec create_timestamp;
+ image.get_create_timestamp(&create_timestamp);
+
+ std::string create_timestamp_str = "";
+ format_timestamp(create_timestamp, create_timestamp_str);
+
+ struct timespec access_timestamp;
+ image.get_access_timestamp(&access_timestamp);
+
+ std::string access_timestamp_str = "";
+ format_timestamp(access_timestamp, access_timestamp_str);
+
+ struct timespec modify_timestamp;
+ image.get_modify_timestamp(&modify_timestamp);
+
+ std::string modify_timestamp_str = "";
+ format_timestamp(modify_timestamp, modify_timestamp_str);
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("name", imgname);
+ f->dump_string("id", imgid);
+ f->dump_unsigned("size", info.size);
+ f->dump_unsigned("objects", info.num_objs);
+ f->dump_int("order", info.order);
+ f->dump_unsigned("object_size", info.obj_size);
+ f->dump_int("snapshot_count", snaps.size());
+ if (!data_pool.empty()) {
+ f->dump_string("data_pool", data_pool);
+ }
+ f->dump_string("block_name_prefix", prefix);
+ f->dump_int("format", (old_format ? 1 : 2));
+ } else {
+ std::cout << "rbd image '" << imgname << "':\n"
+ << "\tsize " << byte_u_t(info.size) << " in "
+ << info.num_objs << " objects"
+ << std::endl
+ << "\torder " << info.order
+ << " (" << byte_u_t(info.obj_size) << " objects)"
+ << std::endl
+ << "\tsnapshot_count: " << snaps.size()
+ << std::endl;
+ if (!imgid.empty()) {
+ std::cout << "\tid: " << imgid << std::endl;
+ }
+ if (!data_pool.empty()) {
+ std::cout << "\tdata_pool: " << data_pool << std::endl;
+ }
+ std::cout << "\tblock_name_prefix: " << prefix
+ << std::endl
+ << "\tformat: " << (old_format ? "1" : "2")
+ << std::endl;
+ }
+
+ if (!old_format) {
+ format_features(f, features);
+ format_op_features(f, op_features);
+ format_flags(f, flags);
+ }
+
+ if (!group_string.empty()) {
+ if (f) {
+ f->dump_string("group", group_string);
+ } else {
+ std::cout << "\tgroup: " << group_string
+ << std::endl;
+ }
+ }
+
+ if (!create_timestamp_str.empty()) {
+ if (f) {
+ f->dump_string("create_timestamp", create_timestamp_str);
+ } else {
+ std::cout << "\tcreate_timestamp: " << create_timestamp_str
+ << std::endl;
+ }
+ }
+
+ if (!access_timestamp_str.empty()) {
+ if (f) {
+ f->dump_string("access_timestamp", access_timestamp_str);
+ } else {
+ std::cout << "\taccess_timestamp: " << access_timestamp_str
+ << std::endl;
+ }
+ }
+
+ if (!modify_timestamp_str.empty()) {
+ if (f) {
+ f->dump_string("modify_timestamp", modify_timestamp_str);
+ } else {
+ std::cout << "\tmodify_timestamp: " << modify_timestamp_str
+ << std::endl;
+ }
+ }
+
+ // snapshot info, if present
+ if (!snapname.empty()) {
+ if (f) {
+ f->dump_string("protected", snap_protected ? "true" : "false");
+ } else {
+ std::cout << "\tprotected: " << (snap_protected ? "True" : "False")
+ << std::endl;
+ }
+ }
+
+ if (snap_limit < UINT64_MAX) {
+ if (f) {
+ f->dump_unsigned("snapshot_limit", snap_limit);
+ } else {
+ std::cout << "\tsnapshot_limit: " << snap_limit << std::endl;
+ }
+ }
+
+ // parent info, if present
+ librbd::linked_image_spec_t parent_image_spec;
+ librbd::snap_spec_t parent_snap_spec;
+ if ((image.get_parent(&parent_image_spec, &parent_snap_spec) == 0) &&
+ (parent_image_spec.image_name.length() > 0)) {
+ if (f) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image_spec.pool_name);
+ f->dump_string("pool_namespace", parent_image_spec.pool_namespace);
+ f->dump_string("image", parent_image_spec.image_name);
+ f->dump_string("id", parent_image_spec.image_id);
+ f->dump_string("snapshot", parent_snap_spec.name);
+ f->dump_bool("trash", parent_image_spec.trash);
+ f->dump_unsigned("overlap", overlap);
+ f->close_section();
+ } else {
+ std::cout << "\tparent: " << parent_image_spec.pool_name << "/";
+ if (!parent_image_spec.pool_namespace.empty()) {
+ std::cout << parent_image_spec.pool_namespace << "/";
+ }
+ std::cout << parent_image_spec.image_name << "@"
+ << parent_snap_spec.name;
+ if (parent_image_spec.trash) {
+ std::cout << " (trash " << parent_image_spec.image_id << ")";
+ }
+ std::cout << std::endl;
+ std::cout << "\toverlap: " << byte_u_t(overlap) << std::endl;
+ }
+ }
+
+ // striping info, if feature is set
+ if (features & RBD_FEATURE_STRIPINGV2) {
+ if (f) {
+ f->dump_unsigned("stripe_unit", image.get_stripe_unit());
+ f->dump_unsigned("stripe_count", image.get_stripe_count());
+ } else {
+ std::cout << "\tstripe unit: " << byte_u_t(image.get_stripe_unit())
+ << std::endl
+ << "\tstripe count: " << image.get_stripe_count() << std::endl;
+ }
+ }
+
+ if (features & RBD_FEATURE_JOURNALING) {
+ if (f) {
+ f->dump_string("journal", utils::image_id(image));
+ } else {
+ std::cout << "\tjournal: " << utils::image_id(image) << std::endl;
+ }
+ }
+
+ if (features & RBD_FEATURE_JOURNALING) {
+ if (f) {
+ f->open_object_section("mirroring");
+ f->dump_string("state",
+ utils::mirror_image_state(mirror_image.state));
+ if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+ f->dump_string("global_id", mirror_image.global_id);
+ f->dump_bool("primary", mirror_image.primary);
+ }
+ f->close_section();
+ } else {
+ std::cout << "\tmirroring state: "
+ << utils::mirror_image_state(mirror_image.state) << std::endl;
+ if (mirror_image.state != RBD_MIRROR_IMAGE_DISABLED) {
+ std::cout << "\tmirroring global id: " << mirror_image.global_id
+ << std::endl
+ << "\tmirroring primary: "
+ << (mirror_image.primary ? "true" : "false") <<std::endl;
+ }
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+ at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ std::string image_id;
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, image_id.empty(),
+ utils::SNAPSHOT_PRESENCE_PERMITTED, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id. "
+ << std::endl;
+ return -EINVAL;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name,
+ image_id, snap_name, true, &rados, &io_ctx,
+ &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_show_info(io_ctx, image, snap_name, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: info: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"info"}, {}, "Show information about image size, striping, etc.", "",
+ &get_arguments, &execute);
+
+} // namespace info
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Journal.cc b/src/tools/rbd/action/Journal.cc
new file mode 100644
index 00000000..d3a54f94
--- /dev/null
+++ b/src/tools/rbd/action/Journal.cc
@@ -0,0 +1,1254 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/Cond.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/stringify.h"
+#include <fstream>
+#include <sstream>
+#include <boost/program_options.hpp>
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/journal/cls_journal_client.h"
+
+#include "journal/Journaler.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include "journal/Settings.h"
+#include "librbd/journal/Types.h"
+
+namespace rbd {
+namespace action {
+namespace journal {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static const std::string JOURNAL_SPEC("journal-spec");
+static const std::string JOURNAL_NAME("journal");
+static const std::string DEST_JOURNAL_NAME("dest-journal");
+
+void add_journal_option(po::options_description *opt,
+ at::ArgumentModifier modifier) {
+ std::string name = JOURNAL_NAME;
+ std::string description = at::get_description_prefix(modifier) +
+ "journal name";
+ switch (modifier) {
+ case at::ARGUMENT_MODIFIER_NONE:
+ case at::ARGUMENT_MODIFIER_SOURCE:
+ break;
+ case at::ARGUMENT_MODIFIER_DEST:
+ name = DEST_JOURNAL_NAME;
+ break;
+ }
+
+ // TODO add validator
+ opt->add_options()
+ (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_journal_spec_options(po::options_description *pos,
+ po::options_description *opt,
+ at::ArgumentModifier modifier) {
+
+ pos->add_options()
+ ((get_name_prefix(modifier) + JOURNAL_SPEC).c_str(),
+ (get_description_prefix(modifier) + "journal specification\n" +
+ "(example: [<pool-name>/[<namespace>/]]<journal-name>)").c_str());
+ add_pool_option(opt, modifier);
+ add_namespace_option(opt, modifier);
+ add_image_option(opt, modifier);
+ add_journal_option(opt, modifier);
+}
+
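+// resolve the pool, namespace and journal names from the explicit options,
+// the journal/image specs, the positional spec argument or, as a last resort,
+// the journal id of the referenced image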
+int get_pool_journal_names(const po::variables_map &vm,
+ at::ArgumentModifier mod,
+ size_t *spec_arg_index,
+ std::string *pool_name,
+ std::string *namespace_name,
+ std::string *journal_name) {
+ std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_POOL_NAME : at::POOL_NAME);
+ std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME);
+ std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+ std::string journal_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+ DEST_JOURNAL_NAME : JOURNAL_NAME);
+
+ if (vm.count(pool_key) && pool_name != nullptr) {
+ *pool_name = vm[pool_key].as<std::string>();
+ }
+ if (vm.count(namespace_key) && namespace_name != nullptr) {
+ *namespace_name = vm[namespace_key].as<std::string>();
+ }
+ if (vm.count(journal_key) && journal_name != nullptr) {
+ *journal_name = vm[journal_key].as<std::string>();
+ }
+
+ std::string image_name;
+ if (vm.count(image_key)) {
+ image_name = vm[image_key].as<std::string>();
+ }
+
+ int r;
+ if (journal_name != nullptr && !journal_name->empty()) {
+    // despite the separate pool option, the pool and namespace can also be
+    // specified as part of the journal spec
+ std::string journal_name_copy(*journal_name);
+ r = extract_spec(journal_name_copy, pool_name, namespace_name, journal_name,
+ nullptr, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (!image_name.empty()) {
+    // despite the separate pool option, the pool and namespace can also be
+    // specified as part of the image spec
+ std::string image_name_copy(image_name);
+ r = extract_spec(image_name_copy, pool_name, namespace_name, &image_name,
+ nullptr, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (journal_name != nullptr && spec_arg_index != nullptr &&
+ journal_name->empty()) {
+ std::string spec = utils::get_positional_argument(vm, (*spec_arg_index)++);
+ if (!spec.empty()) {
+ r = extract_spec(spec, pool_name, namespace_name, journal_name, nullptr,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ if (pool_name != nullptr && pool_name->empty()) {
+ *pool_name = utils::get_default_pool_name();
+ }
+
+ if (pool_name != nullptr && namespace_name != nullptr &&
+ journal_name != nullptr && journal_name->empty() && !image_name.empty()) {
+ // Try to get journal name from image info.
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ int r = utils::init_and_open_image(*pool_name, *namespace_name, image_name,
+ "", "", true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ std::cerr << "rbd: failed to open image " << image_name
+ << " to get journal name: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ uint64_t features;
+ r = image.features(&features);
+ if (r < 0) {
+ return r;
+ }
+ if ((features & RBD_FEATURE_JOURNALING) == 0) {
+ std::cerr << "rbd: journaling is not enabled for image " << image_name
+ << std::endl;
+ return -EINVAL;
+ }
+ *journal_name = utils::image_id(image);
+ }
+
+ if (journal_name != nullptr && journal_name->empty()) {
+ std::string prefix = at::get_description_prefix(mod);
+ std::cerr << "rbd: "
+ << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+ << "journal was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
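+// fetch the journal's immutable metadata (order, splay width, object pool)
+// and print it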
+static int do_show_journal_info(librados::Rados& rados, librados::IoCtx& io_ctx,
+ const std::string& journal_id, Formatter *f)
+{
+ int r;
+ C_SaferCond cond;
+
+ std::string header_oid = ::journal::Journaler::header_oid(journal_id);
+ std::string object_oid_prefix = ::journal::Journaler::object_oid_prefix(
+ io_ctx.get_id(), journal_id);
+ uint8_t order;
+ uint8_t splay_width;
+ int64_t pool_id;
+
+ cls::journal::client::get_immutable_metadata(io_ctx, header_oid, &order,
+ &splay_width, &pool_id, &cond);
+ r = cond.wait();
+ if (r < 0) {
+ std::cerr << "failed to get journal metadata: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ std::string object_pool_name;
+ if (pool_id >= 0) {
+ r = rados.pool_reverse_lookup(pool_id, &object_pool_name);
+ if (r < 0) {
+ std::cerr << "error looking up pool name for pool_id=" << pool_id << ": "
+ << cpp_strerror(r) << std::endl;
+ }
+ }
+
+ if (f) {
+ f->open_object_section("journal");
+ f->dump_string("journal_id", journal_id);
+ f->dump_string("header_oid", header_oid);
+ f->dump_string("object_oid_prefix", object_oid_prefix);
+ f->dump_int("order", order);
+ f->dump_int("splay_width", splay_width);
+ if (!object_pool_name.empty()) {
+ f->dump_string("object_pool", object_pool_name);
+ }
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << "rbd journal '" << journal_id << "':" << std::endl;
+ std::cout << "\theader_oid: " << header_oid << std::endl;
+ std::cout << "\tobject_oid_prefix: " << object_oid_prefix << std::endl;
+ std::cout << "\torder: " << static_cast<int>(order) << " ("
+ << byte_u_t(1ull << order) << " objects)"<< std::endl;
+ std::cout << "\tsplay_width: " << static_cast<int>(splay_width) << std::endl;
+ if (!object_pool_name.empty()) {
+ std::cout << "\tobject_pool: " << object_pool_name << std::endl;
+ }
+ }
+ return 0;
+}
+
+static int do_show_journal_status(librados::IoCtx& io_ctx,
+ const std::string& journal_id, Formatter *f)
+{
+ int r;
+
+ C_SaferCond cond;
+ uint64_t minimum_set;
+ uint64_t active_set;
+ std::set<cls::journal::Client> registered_clients;
+ std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+ cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+ &active_set, &registered_clients,
+ &cond);
+ r = cond.wait();
+ if (r < 0) {
+ std::cerr << "warning: failed to get journal metadata" << std::endl;
+ return r;
+ }
+
+ if (f) {
+ f->open_object_section("status");
+ f->dump_unsigned("minimum_set", minimum_set);
+ f->dump_unsigned("active_set", active_set);
+ f->open_array_section("registered_clients");
+ for (std::set<cls::journal::Client>::iterator c =
+ registered_clients.begin(); c != registered_clients.end(); ++c) {
+ f->open_object_section("client");
+ c->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << "minimum_set: " << minimum_set << std::endl;
+ std::cout << "active_set: " << active_set << std::endl;
+ std::cout << "registered clients: " << std::endl;
+ for (std::set<cls::journal::Client>::iterator c =
+ registered_clients.begin(); c != registered_clients.end(); ++c) {
+ std::cout << "\t" << *c << std::endl;
+ }
+ }
+ return 0;
+}
+
+static int do_reset_journal(librados::IoCtx& io_ctx,
+ const std::string& journal_id)
+{
+ // disable/re-enable journaling to delete/re-create the journal
+ // to properly handle mirroring constraints
+ std::string image_name;
+ int r = librbd::cls_client::dir_get_name(&io_ctx, RBD_DIRECTORY, journal_id,
+ &image_name);
+ if (r < 0) {
+ std::cerr << "failed to locate journal's image: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ librbd::Image image;
+ r = utils::open_image(io_ctx, image_name, false, &image);
+ if (r < 0) {
+ std::cerr << "failed to open image: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ r = image.update_features(RBD_FEATURE_JOURNALING, false);
+ if (r < 0) {
+ std::cerr << "failed to disable image journaling: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ r = image.update_features(RBD_FEATURE_JOURNALING, true);
+ if (r < 0) {
+ std::cerr << "failed to re-enable image journaling: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+static int do_disconnect_journal_client(librados::IoCtx& io_ctx,
+ const std::string& journal_id,
+ const std::string& client_id)
+{
+ int r;
+
+ C_SaferCond cond;
+ uint64_t minimum_set;
+ uint64_t active_set;
+ std::set<cls::journal::Client> registered_clients;
+ std::string oid = ::journal::Journaler::header_oid(journal_id);
+
+ cls::journal::client::get_mutable_metadata(io_ctx, oid, &minimum_set,
+ &active_set, &registered_clients,
+ &cond);
+ r = cond.wait();
+ if (r < 0) {
+ std::cerr << "warning: failed to get journal metadata" << std::endl;
+ return r;
+ }
+
+ static const std::string IMAGE_CLIENT_ID("");
+
+ bool found = false;
+ for (auto &c : registered_clients) {
+ if (c.id == IMAGE_CLIENT_ID || (!client_id.empty() && client_id != c.id)) {
+ continue;
+ }
+ r = cls::journal::client::client_update_state(io_ctx, oid, c.id,
+ cls::journal::CLIENT_STATE_DISCONNECTED);
+ if (r < 0) {
+ std::cerr << "warning: failed to disconnect client " << c.id << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ std::cout << "client " << c.id << " disconnected" << std::endl;
+ found = true;
+ }
+
+ if (!found) {
+ if (!client_id.empty()) {
+ std::cerr << "warning: client " << client_id << " is not registered"
+ << std::endl;
+ } else {
+ std::cerr << "no registered clients to disconnect" << std::endl;
+ }
+ return -ENOENT;
+ }
+
+ bufferlist bl;
+ r = io_ctx.notify2(oid, bl, 5000, NULL);
+ if (r < 0) {
+ std::cerr << "warning: failed to notify state change:" << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+class Journaler : public ::journal::Journaler {
+public:
+ Journaler(librados::IoCtx& io_ctx, const std::string& journal_id,
+ const std::string &client_id) :
+ ::journal::Journaler(io_ctx, journal_id, client_id, {}) {
+ }
+
+ int init() {
+ int r;
+
+ // TODO register with librbd payload
+ r = register_client(bufferlist());
+ if (r < 0) {
+ std::cerr << "failed to register client: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ C_SaferCond cond;
+
+ ::journal::Journaler::init(&cond);
+ r = cond.wait();
+ if (r < 0) {
+ std::cerr << "failed to initialize journal: " << cpp_strerror(r)
+ << std::endl;
+ (void) unregister_client();
+ return r;
+ }
+
+ return 0;
+ }
+
+ int shut_down() {
+ int r = unregister_client();
+ if (r < 0) {
+ std::cerr << "rbd: failed to unregister journal client: "
+ << cpp_strerror(r) << std::endl;
+ }
+ ::journal::Journaler::shut_down();
+
+ return r;
+ }
+};
+
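+// base class that initializes a Journaler and replays its entries through
+// the ReplayHandler callbacks; subclasses implement process_entry()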
+class JournalPlayer {
+public:
+ JournalPlayer(librados::IoCtx& io_ctx, const std::string& journal_id,
+ const std::string &client_id) :
+ m_journaler(io_ctx, journal_id, client_id),
+ m_cond(),
+ m_r(0) {
+ }
+
+ virtual ~JournalPlayer() {}
+
+ virtual int exec() {
+ int r;
+
+ r = m_journaler.init();
+ if (r < 0) {
+ return r;
+ }
+
+ ReplayHandler replay_handler(this);
+
+ m_journaler.start_replay(&replay_handler);
+
+ r = m_cond.wait();
+ if (r < 0) {
+ std::cerr << "rbd: failed to process journal: " << cpp_strerror(r)
+ << std::endl;
+ if (m_r == 0) {
+ m_r = r;
+ }
+ }
+ return m_r;
+ }
+
+ int shut_down() {
+ return m_journaler.shut_down();
+ }
+
+protected:
+ struct ReplayHandler : public ::journal::ReplayHandler {
+ JournalPlayer *journal;
+ explicit ReplayHandler(JournalPlayer *_journal) : journal(_journal) {}
+
+ void get() override {}
+ void put() override {}
+
+ void handle_entries_available() override {
+ journal->handle_replay_ready();
+ }
+ void handle_complete(int r) override {
+ journal->handle_replay_complete(r);
+ }
+ };
+
+ void handle_replay_ready() {
+ int r = 0;
+ while (true) {
+ ::journal::ReplayEntry replay_entry;
+ uint64_t tag_id;
+ if (!m_journaler.try_pop_front(&replay_entry, &tag_id)) {
+ break;
+ }
+
+ r = process_entry(replay_entry, tag_id);
+ if (r < 0) {
+ break;
+ }
+ }
+ }
+
+ virtual int process_entry(::journal::ReplayEntry replay_entry,
+ uint64_t tag_id) = 0;
+
+ void handle_replay_complete(int r) {
+ if (m_r == 0 && r < 0) {
+ m_r = r;
+ }
+ m_journaler.stop_replay(&m_cond);
+ }
+
+ Journaler m_journaler;
+ C_SaferCond m_cond;
+ int m_r;
+};
+
+static int inspect_entry(bufferlist& data,
+ librbd::journal::EventEntry& event_entry,
+ bool verbose) {
+ try {
+ auto it = data.cbegin();
+ decode(event_entry, it);
+ } catch (const buffer::error &err) {
+ std::cerr << "failed to decode event entry: " << err.what() << std::endl;
+ return -EINVAL;
+ }
+ if (verbose) {
+ JSONFormatter f(true);
+ f.open_object_section("event_entry");
+ event_entry.dump(&f);
+ f.close_section();
+ f.flush(std::cout);
+ }
+ return 0;
+}
+
+class JournalInspector : public JournalPlayer {
+public:
+ JournalInspector(librados::IoCtx& io_ctx, const std::string& journal_id,
+ bool verbose) :
+ JournalPlayer(io_ctx, journal_id, "INSPECT"),
+ m_verbose(verbose),
+ m_s() {
+ }
+
+ int exec() override {
+ int r = JournalPlayer::exec();
+ m_s.print();
+ return r;
+ }
+
+private:
+ struct Stats {
+ Stats() : total(0), error(0) {}
+
+ void print() {
+ std::cout << "Summary:" << std::endl
+ << " " << total << " entries inspected, " << error << " errors"
+ << std::endl;
+ }
+
+ int total;
+ int error;
+ };
+
+ int process_entry(::journal::ReplayEntry replay_entry,
+ uint64_t tag_id) override {
+ m_s.total++;
+ if (m_verbose) {
+ std::cout << "Entry: tag_id=" << tag_id << ", commit_tid="
+ << replay_entry.get_commit_tid() << std::endl;
+ }
+ bufferlist data = replay_entry.get_data();
+ librbd::journal::EventEntry event_entry;
+ int r = inspect_entry(data, event_entry, m_verbose);
+ if (r < 0) {
+ m_r = r;
+ m_s.error++;
+ }
+ return 0;
+ }
+
+ bool m_verbose;
+ Stats m_s;
+};
+
+static int do_inspect_journal(librados::IoCtx& io_ctx,
+ const std::string& journal_id,
+ bool verbose) {
+ JournalInspector inspector(io_ctx, journal_id, verbose);
+ int r = inspector.exec();
+ if (r < 0) {
+ inspector.shut_down();
+ return r;
+ }
+
+ r = inspector.shut_down();
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
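+// JSON-serializable wrapper around a single journal entry, used by the
+// journal export/import commands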
+struct ExportEntry {
+ uint64_t tag_id;
+ uint64_t commit_tid;
+ int type;
+ bufferlist entry;
+
+ ExportEntry() : tag_id(0), commit_tid(0), type(0), entry() {}
+
+ ExportEntry(uint64_t tag_id, uint64_t commit_tid, int type,
+ const bufferlist& entry)
+ : tag_id(tag_id), commit_tid(commit_tid), type(type), entry(entry) {
+ }
+
+ void dump(Formatter *f) const {
+ ::encode_json("tag_id", tag_id, f);
+ ::encode_json("commit_tid", commit_tid, f);
+ ::encode_json("type", type, f);
+ ::encode_json("entry", entry, f);
+ }
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("tag_id", tag_id, obj);
+ JSONDecoder::decode_json("commit_tid", commit_tid, obj);
+ JSONDecoder::decode_json("type", type, obj);
+ JSONDecoder::decode_json("entry", entry, obj);
+ }
+};
+
+class JournalExporter : public JournalPlayer {
+public:
+ JournalExporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+ int fd, bool no_error, bool verbose) :
+ JournalPlayer(io_ctx, journal_id, "EXPORT"),
+ m_journal_id(journal_id),
+ m_fd(fd),
+ m_no_error(no_error),
+ m_verbose(verbose),
+ m_s() {
+ }
+
+ int exec() override {
+ std::string header("# journal_id: " + m_journal_id + "\n");
+ int r;
+ r = safe_write(m_fd, header.c_str(), header.size());
+ if (r < 0) {
+ std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ r = JournalPlayer::exec();
+ m_s.print();
+ return r;
+ }
+
+private:
+ struct Stats {
+ Stats() : total(0), error(0) {}
+
+ void print() {
+ std::cout << total << " entries processed, " << error << " errors"
+ << std::endl;
+ }
+
+ int total;
+ int error;
+ };
+
+ int process_entry(::journal::ReplayEntry replay_entry,
+ uint64_t tag_id) override {
+ m_s.total++;
+ int type = -1;
+ bufferlist entry = replay_entry.get_data();
+ librbd::journal::EventEntry event_entry;
+ int r = inspect_entry(entry, event_entry, m_verbose);
+ if (r < 0) {
+ m_s.error++;
+ m_r = r;
+ return m_no_error ? 0 : r;
+ } else {
+ type = event_entry.get_event_type();
+ }
+ ExportEntry export_entry(tag_id, replay_entry.get_commit_tid(), type,
+ entry);
+ JSONFormatter f;
+ ::encode_json("event_entry", export_entry, &f);
+ std::ostringstream oss;
+ f.flush(oss);
+ std::string objstr = oss.str();
+ std::string header = stringify(objstr.size()) + " ";
+ r = safe_write(m_fd, header.c_str(), header.size());
+ if (r == 0) {
+ r = safe_write(m_fd, objstr.c_str(), objstr.size());
+ }
+ if (r == 0) {
+ r = safe_write(m_fd, "\n", 1);
+ }
+ if (r < 0) {
+ std::cerr << "rbd: failed to write to export file: " << cpp_strerror(r)
+ << std::endl;
+ m_s.error++;
+ return r;
+ }
+ return 0;
+ }
+
+ std::string m_journal_id;
+ int m_fd;
+ bool m_no_error;
+ bool m_verbose;
+ Stats m_s;
+};
+
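+// write the journal's entries to a file (or stdout) in the
+// "<size> {json-encoded entry}" format consumed by 'journal import'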
+static int do_export_journal(librados::IoCtx& io_ctx,
+ const std::string& journal_id,
+ const std::string& path,
+ bool no_error, bool verbose) {
+ int r;
+ int fd;
+ bool to_stdout = path == "-";
+ if (to_stdout) {
+ fd = STDOUT_FILENO;
+ } else {
+ fd = open(path.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0644);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error creating " << path << std::endl;
+ return r;
+ }
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+ }
+
+ JournalExporter exporter(io_ctx, journal_id, fd, no_error, verbose);
+ r = exporter.exec();
+
+ if (!to_stdout) {
+ close(fd);
+ }
+
+ int shut_down_r = exporter.shut_down();
+ if (r == 0 && shut_down_r < 0) {
+ r = shut_down_r;
+ }
+
+ return r;
+}
+
+class JournalImporter {
+public:
+ JournalImporter(librados::IoCtx& io_ctx, const std::string& journal_id,
+ int fd, bool no_error, bool verbose) :
+ m_journaler(io_ctx, journal_id, "IMPORT"),
+ m_fd(fd),
+ m_no_error(no_error),
+ m_verbose(verbose) {
+ }
+
+ bool read_entry(bufferlist& bl, int& r) {
+ // Entries are stored in the file using the following format:
+ //
+ // # Optional comments
+ // NNN {json encoded entry}
+ // ...
+ //
+ // Where NNN is the encoded entry size.
+ bl.clear();
+ char buf[80];
+    // Skip line feeds and comments (lines starting with '#').
+ while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+ if (buf[0] == '\n') {
+ continue;
+ } else if (buf[0] == '#') {
+ while ((r = safe_read_exact(m_fd, buf, 1)) == 0) {
+ if (buf[0] == '\n') {
+ break;
+ }
+ }
+ } else {
+ break;
+ }
+ }
+ if (r < 0) {
+ if (r == -EDOM) {
+ r = 0;
+ }
+ return false;
+ }
+    // Read the entry size into buf.
+ if (!isdigit(buf[0])) {
+ r = -EINVAL;
+ std::cerr << "rbd: import data invalid format (digit expected)"
+ << std::endl;
+ return false;
+ }
+ for (size_t i = 1; i < sizeof(buf); i++) {
+ r = safe_read_exact(m_fd, buf + i, 1);
+ if (r < 0) {
+ std::cerr << "rbd: error reading import data" << std::endl;
+ return false;
+ }
+ if (!isdigit(buf[i])) {
+ if (buf[i] != ' ') {
+ r = -EINVAL;
+ std::cerr << "rbd: import data invalid format (space expected)"
+ << std::endl;
+ return false;
+ }
+ buf[i] = '\0';
+ break;
+ }
+ }
+ int entry_size = atoi(buf);
+ if (entry_size == 0) {
+ r = -EINVAL;
+ std::cerr << "rbd: import data invalid format (zero entry size)"
+ << std::endl;
+ return false;
+ }
+ ceph_assert(entry_size > 0);
+ // Read entry.
+ r = bl.read_fd(m_fd, entry_size);
+ if (r < 0) {
+ std::cerr << "rbd: error reading from stdin: " << cpp_strerror(r)
+ << std::endl;
+ return false;
+ }
+ if (r != entry_size) {
+ std::cerr << "rbd: error reading from stdin: truncated"
+ << std::endl;
+ r = -EINVAL;
+ return false;
+ }
+ r = 0;
+ return true;
+ }
+
+ int exec() {
+ int r = m_journaler.init();
+ if (r < 0) {
+ return r;
+ }
+ m_journaler.start_append(0);
+
+ int r1 = 0;
+ bufferlist bl;
+ int n = 0;
+ int error_count = 0;
+ while (read_entry(bl, r)) {
+ n++;
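+      // count each entry as an error until it is successfully appended below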
+ error_count++;
+ JSONParser p;
+ if (!p.parse(bl.c_str(), bl.length())) {
+ std::cerr << "rbd: error parsing input (entry " << n << ")"
+ << std::endl;
+ r = -EINVAL;
+ if (m_no_error) {
+ r1 = r;
+ continue;
+ } else {
+ break;
+ }
+ }
+ ExportEntry e;
+ try {
+ decode_json_obj(e, &p);
+ } catch (JSONDecoder::err& err) {
+ std::cerr << "rbd: error json decoding import data (entry " << n << "):"
+ << err.message << std::endl;
+ r = -EINVAL;
+ if (m_no_error) {
+ r1 = r;
+ continue;
+ } else {
+ break;
+ }
+ }
+ librbd::journal::EventEntry event_entry;
+ r = inspect_entry(e.entry, event_entry, m_verbose);
+ if (r < 0) {
+ std::cerr << "rbd: corrupted entry " << n << ": tag_tid=" << e.tag_id
+ << ", commit_tid=" << e.commit_tid << std::endl;
+ if (m_no_error) {
+ r1 = r;
+ continue;
+ } else {
+ break;
+ }
+ }
+ m_journaler.append(e.tag_id, e.entry);
+ error_count--;
+ }
+
+ std::cout << n << " entries processed, " << error_count << " errors" << std::endl;
+
+ std::cout << "Waiting for journal append to complete..." << std::endl;
+
+ C_SaferCond cond;
+ m_journaler.stop_append(&cond);
+ r = cond.wait();
+
+ if (r < 0) {
+ std::cerr << "failed to append journal: " << cpp_strerror(r) << std::endl;
+ }
+
+ if (r1 < 0 && r == 0) {
+ r = r1;
+ }
+ return r;
+ }
+
+ int shut_down() {
+ return m_journaler.shut_down();
+ }
+
+private:
+ Journaler m_journaler;
+ int m_fd;
+ bool m_no_error;
+ bool m_verbose;
+};
+
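+// read entries from a file (or stdin) and append them to the destination
+// journal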
+static int do_import_journal(librados::IoCtx& io_ctx,
+ const std::string& journal_id,
+ const std::string& path,
+ bool no_error, bool verbose) {
+ int r;
+
+ int fd;
+ bool from_stdin = path == "-";
+ if (from_stdin) {
+ fd = STDIN_FILENO;
+ } else {
+ if ((fd = open(path.c_str(), O_RDONLY)) < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << path << std::endl;
+ return r;
+ }
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+ }
+
+ JournalImporter importer(io_ctx, journal_id, fd, no_error, verbose);
+ r = importer.exec();
+
+ if (!from_stdin) {
+ close(fd);
+ }
+
+ int shut_down_r = importer.shut_down();
+ if (r == 0 && shut_down_r < 0) {
+ r = shut_down_r;
+ }
+
+ return r;
+}
+
+void get_info_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_info(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_show_journal_info(rados, io_ctx, journal_name, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: journal info: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+
+}
+
+void get_status_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_status(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_show_journal_status(io_ctx, journal_name, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: journal status: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_reset_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_reset(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_reset_journal(io_ctx, journal_name);
+ if (r < 0) {
+ std::cerr << "rbd: journal reset: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_client_disconnect_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ ("client-id", po::value<std::string>(),
+ "client ID (or leave unspecified to disconnect all)");
+}
+
+int execute_client_disconnect(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string client_id;
+ if (vm.count("client-id")) {
+ client_id = vm["client-id"].as<std::string>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_disconnect_journal_client(io_ctx, journal_name, client_id);
+ if (r < 0) {
+ std::cerr << "rbd: journal client disconnect: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_inspect_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_verbose_option(options);
+}
+
+int execute_inspect(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_NONE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_inspect_journal(io_ctx, journal_name, vm[at::VERBOSE].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: journal inspect: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_export_arguments(po::options_description *positional,
+ po::options_description *options) {
+ add_journal_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_path_options(positional, options,
+ "export file (or '-' for stdout)");
+ at::add_verbose_option(options);
+ at::add_no_error_option(options);
+}
+
+int execute_export(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ int r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string path;
+ r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_export_journal(io_ctx, journal_name, path, vm[at::NO_ERROR].as<bool>(),
+ vm[at::VERBOSE].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: journal export: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_import_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_path_options(positional, options,
+ "import file (or '-' for stdin)");
+ add_journal_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_verbose_option(options);
+ at::add_no_error_option(options);
+}
+
+int execute_import(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string path;
+ size_t arg_index = 0;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string pool_name;
+ std::string namespace_name;
+ std::string journal_name;
+ r = get_pool_journal_names(vm, at::ARGUMENT_MODIFIER_DEST, &arg_index,
+ &pool_name, &namespace_name, &journal_name);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_import_journal(io_ctx, journal_name, path, vm[at::NO_ERROR].as<bool>(),
+ vm[at::VERBOSE].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: journal import: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_info(
+ {"journal", "info"}, {}, "Show information about image journal.", "",
+ &get_info_arguments, &execute_info);
+
+Shell::Action action_status(
+ {"journal", "status"}, {}, "Show status of image journal.", "",
+ &get_status_arguments, &execute_status);
+
+Shell::Action action_reset(
+ {"journal", "reset"}, {}, "Reset image journal.", "",
+ &get_reset_arguments, &execute_reset);
+
+Shell::Action action_inspect(
+ {"journal", "inspect"}, {}, "Inspect image journal for structural errors.", "",
+ &get_inspect_arguments, &execute_inspect);
+
+Shell::Action action_export(
+ {"journal", "export"}, {}, "Export image journal.", "",
+ &get_export_arguments, &execute_export);
+
+Shell::Action action_import(
+ {"journal", "import"}, {}, "Import image journal.", "",
+ &get_import_arguments, &execute_import);
+
+Shell::Action action_disconnect(
+ {"journal", "client", "disconnect"}, {},
+ "Flag image journal client as disconnected.", "",
+ &get_client_disconnect_arguments, &execute_client_disconnect);
+
+} // namespace journal
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc
new file mode 100644
index 00000000..dc0938eb
--- /dev/null
+++ b/src/tools/rbd/action/Kernel.cc
@@ -0,0 +1,561 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/krbd.h"
+#include "include/stringify.h"
+#include "include/uuid.h"
+#include "common/config_proxy.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/strtol.h"
+#include "common/Formatter.h"
+#include "msg/msg_types.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/scope_exit.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace kernel {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+std::map<std::string, std::string> map_options; // used for both map and unmap
+
+} // anonymous namespace
+
+static std::string map_option_uuid_cb(const char *value_char)
+{
+ uuid_d u;
+ if (!u.parse(value_char))
+ return "";
+
+ return stringify(u);
+}
+
+static std::string map_option_ip_cb(const char *value_char)
+{
+ entity_addr_t a;
+ const char *endptr;
+ if (!a.parse(value_char, &endptr) ||
+ endptr != value_char + strlen(value_char)) {
+ return "";
+ }
+
+ return stringify(a.get_sockaddr());
+}
+
+static std::string map_option_int_cb(const char *value_char)
+{
+ std::string err;
+ int d = strict_strtol(value_char, 10, &err);
+ if (!err.empty() || d < 0)
+ return "";
+
+ return stringify(d);
+}
+
+static std::string map_option_ms_mode_cb(const char *value_char)
+{
+ if (!strcmp(value_char, "legacy") || !strcmp(value_char, "crc") ||
+ !strcmp(value_char, "secure") || !strcmp(value_char, "prefer-crc") ||
+ !strcmp(value_char, "prefer-secure")) {
+ return value_char;
+ }
+ return "";
+}
+
+static void put_map_option(const std::string &key, const std::string &val)
+{
+ map_options[key] = val;
+}
+
+static int put_map_option_value(const std::string &opt, const char *value_char,
+ std::string (*parse_cb)(const char *))
+{
+ if (!value_char || *value_char == '\0') {
+ std::cerr << "rbd: " << opt << " option requires a value" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string value = parse_cb(value_char);
+ if (value.empty()) {
+ std::cerr << "rbd: invalid " << opt << " value '" << value_char << "'"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ put_map_option(opt, opt + "=" + value);
+ return 0;
+}
+
+static int parse_map_options(const std::string &options_string)
+{
+ char *options = strdup(options_string.c_str());
+ BOOST_SCOPE_EXIT(options) {
+ free(options);
+ } BOOST_SCOPE_EXIT_END;
+
+ for (char *this_char = strtok(options, ", ");
+ this_char != NULL;
+ this_char = strtok(NULL, ",")) {
+ char *value_char;
+
+ if ((value_char = strchr(this_char, '=')) != NULL)
+ *value_char++ = '\0';
+
+ if (!strcmp(this_char, "fsid")) {
+ if (put_map_option_value("fsid", value_char, map_option_uuid_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "ip")) {
+ if (put_map_option_value("ip", value_char, map_option_ip_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "share") || !strcmp(this_char, "noshare")) {
+ put_map_option("share", this_char);
+ } else if (!strcmp(this_char, "crc") || !strcmp(this_char, "nocrc")) {
+ put_map_option("crc", this_char);
+ } else if (!strcmp(this_char, "cephx_require_signatures") ||
+ !strcmp(this_char, "nocephx_require_signatures")) {
+ put_map_option("cephx_require_signatures", this_char);
+ } else if (!strcmp(this_char, "tcp_nodelay") ||
+ !strcmp(this_char, "notcp_nodelay")) {
+ put_map_option("tcp_nodelay", this_char);
+ } else if (!strcmp(this_char, "cephx_sign_messages") ||
+ !strcmp(this_char, "nocephx_sign_messages")) {
+ put_map_option("cephx_sign_messages", this_char);
+ } else if (!strcmp(this_char, "mount_timeout")) {
+ if (put_map_option_value("mount_timeout", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "osd_request_timeout")) {
+ if (put_map_option_value("osd_request_timeout", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "lock_timeout")) {
+ if (put_map_option_value("lock_timeout", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "osdkeepalive")) {
+ if (put_map_option_value("osdkeepalive", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "osd_idle_ttl")) {
+ if (put_map_option_value("osd_idle_ttl", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "rw") || !strcmp(this_char, "ro")) {
+ put_map_option("rw", this_char);
+ } else if (!strcmp(this_char, "queue_depth")) {
+ if (put_map_option_value("queue_depth", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "lock_on_read")) {
+ put_map_option("lock_on_read", this_char);
+ } else if (!strcmp(this_char, "exclusive")) {
+ put_map_option("exclusive", this_char);
+ } else if (!strcmp(this_char, "notrim")) {
+ put_map_option("notrim", this_char);
+ } else if (!strcmp(this_char, "abort_on_full")) {
+ put_map_option("abort_on_full", this_char);
+ } else if (!strcmp(this_char, "alloc_size")) {
+ if (put_map_option_value("alloc_size", value_char, map_option_int_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "ms_mode")) {
+ if (put_map_option_value("ms_mode", value_char, map_option_ms_mode_cb))
+ return -EINVAL;
+ } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) {
+ put_map_option("udev", this_char);
+ } else {
+ std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int parse_unmap_options(const std::string &options_string)
+{
+ char *options = strdup(options_string.c_str());
+ BOOST_SCOPE_EXIT(options) {
+ free(options);
+ } BOOST_SCOPE_EXIT_END;
+
+ for (char *this_char = strtok(options, ", ");
+ this_char != NULL;
+ this_char = strtok(NULL, ",")) {
+ char *value_char;
+
+ if ((value_char = strchr(this_char, '=')) != NULL)
+ *value_char++ = '\0';
+
+ if (!strcmp(this_char, "force")) {
+ put_map_option("force", this_char);
+ } else if (!strcmp(this_char, "udev") || !strcmp(this_char, "noudev")) {
+ put_map_option("udev", this_char);
+ } else {
+ std::cerr << "rbd: unknown unmap option '" << this_char << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int do_kernel_list(Formatter *f) {
+#if defined(WITH_KRBD)
+ struct krbd_ctx *krbd;
+ int r;
+
+ r = krbd_create_from_context(g_ceph_context, 0, &krbd);
+ if (r < 0)
+ return r;
+
+ r = krbd_showmapped(krbd, f);
+
+ krbd_destroy(krbd);
+ return r;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+}
+
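+// compute which of the image's enabled features are not advertised by the
+// running kernel in /sys/bus/rbd/supported_features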
+static int get_unsupported_features(librbd::Image &image,
+ uint64_t *unsupported_features)
+{
+ char buf[20];
+ uint64_t features, supported_features;
+ int r;
+
+ r = safe_read_file("/sys/bus/rbd/", "supported_features", buf,
+ sizeof(buf) - 1);
+ if (r < 0)
+ return r;
+
+ buf[r] = '\0';
+ try {
+ supported_features = std::stoull(buf, nullptr, 16);
+ } catch (...) {
+ return -EINVAL;
+ }
+
+ r = image.features(&features);
+ if (r < 0)
+ return r;
+
+ *unsupported_features = features & ~supported_features;
+ return 0;
+}
+
+/*
+ * hint the user to check syslog for krbd-related messages and provide
+ * suggestions based on the errno returned by krbd_map(). note that even if
+ * some librbd calls fail, we still print the "try dmesg..." message to aid
+ * debugging.
+ */
+static void print_error_description(const char *poolname,
+ const char *nspace_name,
+ const char *imgname,
+ const char *snapname,
+ int maperrno)
+{
+ int r;
+ uint8_t oldformat;
+ librados::Rados rados;
+ librados::IoCtx ioctx;
+ librbd::Image image;
+
+ if (maperrno == -ENOENT)
+ goto done;
+
+ r = utils::init_and_open_image(poolname, nspace_name, imgname, "", snapname,
+ true, &rados, &ioctx, &image);
+ if (r < 0)
+ goto done;
+
+ r = image.old_format(&oldformat);
+ if (r < 0)
+ goto done;
+
+ /*
+ * kernel returns -ENXIO when mapping a V2 image due to unsupported feature
+ * set - so, hint about that too...
+ */
+ if (!oldformat && (maperrno == -ENXIO)) {
+ uint64_t unsupported_features;
+ bool need_terminate = true;
+
+ std::cout << "RBD image feature set mismatch. ";
+ r = get_unsupported_features(image, &unsupported_features);
+ if (r == 0 && (unsupported_features & ~RBD_FEATURES_ALL) == 0) {
+ uint64_t immutable = RBD_FEATURES_ALL & ~(RBD_FEATURES_MUTABLE |
+ RBD_FEATURES_DISABLE_ONLY);
+ if (unsupported_features & immutable) {
+ std::cout << "This image cannot be mapped because the following "
+ << "immutable features are unsupported by the kernel:";
+ unsupported_features &= immutable;
+ need_terminate = false;
+ } else {
+ std::cout << "You can disable features unsupported by the kernel "
+ << "with \"rbd feature disable ";
+ if (poolname != utils::get_default_pool_name() || *nspace_name) {
+ std::cout << poolname << "/";
+ }
+ if (*nspace_name) {
+ std::cout << nspace_name << "/";
+ }
+ std::cout << imgname;
+ }
+ } else {
+ std::cout << "Try disabling features unsupported by the kernel "
+ << "with \"rbd feature disable";
+ unsupported_features = 0;
+ }
+ for (auto it : at::ImageFeatures::FEATURE_MAPPING) {
+ if (it.first & unsupported_features) {
+ std::cout << " " << it.second;
+ }
+ }
+ if (need_terminate)
+ std::cout << "\"";
+ std::cout << "." << std::endl;
+ }
+
+ done:
+ std::cout << "In some cases useful info is found in syslog - try \"dmesg | tail\"." << std::endl;
+}
+
+static int do_kernel_map(const char *poolname, const char *nspace_name,
+ const char *imgname, const char *snapname)
+{
+#if defined(WITH_KRBD)
+ struct krbd_ctx *krbd;
+ std::ostringstream oss;
+ uint32_t flags = 0;
+ char *devnode;
+ int r;
+
+ for (auto it = map_options.begin(); it != map_options.end(); ) {
+ // for compatibility with < 3.7 kernels, assume that rw is on by
+ // default and omit it even if it was specified by the user
+ // (see ceph.git commit fb0f1986449b)
+ if (it->first == "rw" && it->second == "rw") {
+ it = map_options.erase(it);
+ } else if (it->first == "udev") {
+ if (it->second == "noudev") {
+ flags |= KRBD_CTX_F_NOUDEV;
+ }
+ it = map_options.erase(it);
+ } else {
+ if (it != map_options.begin())
+ oss << ",";
+ oss << it->second;
+ ++it;
+ }
+ }
+
+ r = krbd_create_from_context(g_ceph_context, flags, &krbd);
+ if (r < 0)
+ return r;
+
+ r = krbd_is_mapped(krbd, poolname, nspace_name, imgname, snapname, &devnode);
+ if (r < 0) {
+ std::cerr << "rbd: warning: can't get image map information: "
+ << cpp_strerror(r) << std::endl;
+ } else if (r > 0) {
+ std::cerr << "rbd: warning: image already mapped as " << devnode
+ << std::endl;
+ free(devnode);
+ }
+
+ r = krbd_map(krbd, poolname, nspace_name, imgname, snapname,
+ oss.str().c_str(), &devnode);
+ if (r < 0) {
+ print_error_description(poolname, nspace_name, imgname, snapname, r);
+ goto out;
+ }
+
+ std::cout << devnode << std::endl;
+
+ free(devnode);
+out:
+ krbd_destroy(krbd);
+ return r;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int do_kernel_unmap(const char *dev, const char *poolname,
+ const char *nspace_name, const char *imgname,
+ const char *snapname)
+{
+#if defined(WITH_KRBD)
+ struct krbd_ctx *krbd;
+ std::ostringstream oss;
+ uint32_t flags = 0;
+ int r;
+
+ for (auto it = map_options.begin(); it != map_options.end(); ) {
+ if (it->first == "udev") {
+ if (it->second == "noudev") {
+ flags |= KRBD_CTX_F_NOUDEV;
+ }
+ it = map_options.erase(it);
+ } else {
+ if (it != map_options.begin())
+ oss << ",";
+ oss << it->second;
+ ++it;
+ }
+ }
+
+ r = krbd_create_from_context(g_ceph_context, flags, &krbd);
+ if (r < 0)
+ return r;
+
+ if (dev)
+ r = krbd_unmap(krbd, dev, oss.str().c_str());
+ else
+ r = krbd_unmap_by_spec(krbd, poolname, nspace_name, imgname, snapname,
+ oss.str().c_str());
+
+ krbd_destroy(krbd);
+ return r;
+#else
+ std::cerr << "rbd: kernel device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ at::Format::Formatter formatter;
+ int r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::init_context();
+
+ r = do_kernel_list(formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: device list failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string nspace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ // parse default options first so they can be overwritten by cli options
+ r = parse_map_options(
+ g_conf().get_val<std::string>("rbd_default_map_options"));
+ if (r < 0) {
+ std::cerr << "rbd: couldn't parse default map options" << std::endl;
+ return r;
+ }
+
+ if (vm.count("options")) {
+ for (auto &options : vm["options"].as<std::vector<std::string>>()) {
+ r = parse_map_options(options);
+ if (r < 0) {
+ std::cerr << "rbd: couldn't parse map options" << std::endl;
+ return r;
+ }
+ }
+ }
+
+ // parse options common to all device types after parsing krbd-specific
+ // options so that common options win (in particular "-o rw --read-only"
+ // should result in read-only mapping)
+ if (vm["read-only"].as<bool>()) {
+ put_map_option("rw", "ro");
+ }
+ if (vm["exclusive"].as<bool>()) {
+ put_map_option("exclusive", "exclusive");
+ }
+
+ utils::init_context();
+
+ r = do_kernel_map(pool_name.c_str(), nspace_name.c_str(), image_name.c_str(),
+ snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: map failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string device_name = utils::get_positional_argument(vm, 0);
+ if (!boost::starts_with(device_name, "/dev/")) {
+ device_name.clear();
+ }
+
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string nspace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r;
+ if (device_name.empty()) {
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
+ &image_name, &snap_name, false, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (device_name.empty() && image_name.empty()) {
+ std::cerr << "rbd: unmap requires either image name or device path"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ if (vm.count("options")) {
+ for (auto &options : vm["options"].as<std::vector<std::string>>()) {
+ r = parse_unmap_options(options);
+ if (r < 0) {
+ std::cerr << "rbd: couldn't parse unmap options" << std::endl;
+ return r;
+ }
+ }
+ }
+
+ utils::init_context();
+
+ r = do_kernel_unmap(device_name.empty() ? nullptr : device_name.c_str(),
+ pool_name.c_str(), nspace_name.c_str(),
+ image_name.c_str(), snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: unmap failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+} // namespace kernel
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/List.cc b/src/tools/rbd/action/List.cc
new file mode 100644
index 00000000..e6025418
--- /dev/null
+++ b/src/tools/rbd/action/List.cc
@@ -0,0 +1,340 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "include/types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/bind.hpp>
+#include <boost/program_options.hpp>
+#include "global/global_context.h"
+
+namespace rbd {
+
+namespace action {
+namespace list {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+enum WorkerState {
+ STATE_IDLE = 0,
+ STATE_OPENED,
+ STATE_DONE
+};
+
+struct WorkerEntry {
+ librbd::Image img;
+ librbd::RBD::AioCompletion* completion;
+ WorkerState state;
+ string name;
+
+ WorkerEntry() {
+ state = STATE_IDLE;
+ completion = nullptr;
+ }
+};
+
+
+int list_process_image(librados::Rados* rados, WorkerEntry* w, bool lflag, Formatter *f, TextTable &tbl)
+{
+ int r = 0;
+ librbd::image_info_t info;
+ std::string parent;
+
+ // handle the second and subsequent trips through the loop
+ librbd::linked_image_spec_t parent_image_spec;
+ librbd::snap_spec_t parent_snap_spec;
+ r = w->img.get_parent(&parent_image_spec, &parent_snap_spec);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ bool has_parent = false;
+ if (r != -ENOENT) {
+ parent = parent_image_spec.pool_name + "/";
+ if (!parent_image_spec.pool_namespace.empty()) {
+ parent += parent_image_spec.pool_namespace + "/";
+ }
+ parent += parent_image_spec.image_name + "@" + parent_snap_spec.name;
+ has_parent = true;
+ }
+
+ if (w->img.stat(info, sizeof(info)) < 0) {
+ return -EINVAL;
+ }
+
+ uint8_t old_format;
+ w->img.old_format(&old_format);
+
+ std::list<librbd::locker_t> lockers;
+ bool exclusive;
+ r = w->img.list_lockers(&lockers, &exclusive, NULL);
+ if (r < 0)
+ return r;
+ std::string lockstr;
+ if (!lockers.empty()) {
+ lockstr = (exclusive) ? "excl" : "shr";
+ }
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("image", w->name);
+ f->dump_unsigned("size", info.size);
+ if (has_parent) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image_spec.pool_name);
+ f->dump_string("pool_namespace", parent_image_spec.pool_namespace);
+ f->dump_string("image", parent_image_spec.image_name);
+ f->dump_string("snapshot", parent_snap_spec.name);
+ f->close_section();
+ }
+ f->dump_int("format", old_format ? 1 : 2);
+ if (!lockers.empty())
+ f->dump_string("lock_type", exclusive ? "exclusive" : "shared");
+ f->close_section();
+ } else {
+ tbl << w->name
+ << stringify(byte_u_t(info.size))
+ << parent
+ << ((old_format) ? '1' : '2')
+ << "" // protect doesn't apply to images
+ << lockstr
+ << TextTable::endrow;
+ }
+
+ std::vector<librbd::snap_info_t> snaplist;
+ if (w->img.snap_list(snaplist) >= 0 && !snaplist.empty()) {
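+ // only snapshots in the user snapshot namespace are listed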
+ snaplist.erase(remove_if(snaplist.begin(),
+ snaplist.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &w->img, _1)),
+ snaplist.end());
+ for (std::vector<librbd::snap_info_t>::iterator s = snaplist.begin();
+ s != snaplist.end(); ++s) {
+ bool is_protected;
+ bool has_parent = false;
+ parent.clear();
+ w->img.snap_set(s->name.c_str());
+ r = w->img.snap_is_protected(s->name.c_str(), &is_protected);
+ if (r < 0)
+ return r;
+ if (w->img.get_parent(&parent_image_spec, &parent_snap_spec) >= 0) {
+ parent = parent_image_spec.pool_name + "/";
+ if (!parent_image_spec.pool_namespace.empty()) {
+ parent += parent_image_spec.pool_namespace + "/";
+ }
+ parent += parent_image_spec.image_name + "@" + parent_snap_spec.name;
+ has_parent = true;
+ }
+ if (f) {
+ f->open_object_section("snapshot");
+ f->dump_string("image", w->name);
+ f->dump_string("snapshot", s->name);
+ f->dump_unsigned("size", s->size);
+ if (has_parent) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image_spec.pool_name);
+ f->dump_string("pool_namespace", parent_image_spec.pool_namespace);
+ f->dump_string("image", parent_image_spec.image_name);
+ f->dump_string("snapshot", parent_snap_spec.name);
+ f->close_section();
+ }
+ f->dump_int("format", old_format ? 1 : 2);
+ f->dump_string("protected", is_protected ? "true" : "false");
+ f->close_section();
+ } else {
+ tbl << w->name + "@" + s->name
+ << stringify(byte_u_t(s->size))
+ << parent
+ << ((old_format) ? '1' : '2')
+ << (is_protected ? "yes" : "")
+ << "" // locks don't apply to snaps
+ << TextTable::endrow;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int do_list(const std::string &pool_name, const std::string& namespace_name,
+ bool lflag, int threads, Formatter *f) {
+ std::vector<WorkerEntry*> workers;
+ std::vector<librbd::image_spec_t> images;
+ librados::Rados rados;
+ librbd::RBD rbd;
+ librados::IoCtx ioctx;
+
+ if (threads < 1) {
+ threads = 1;
+ }
+ if (threads > 32) {
+ threads = 32;
+ }
+
+ int r = utils::init(pool_name, namespace_name, &rados, &ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ r = rbd.list2(ioctx, &images);
+ if (r < 0)
+ return r;
+
+ if (!lflag) {
+ if (f)
+ f->open_array_section("images");
+ for (auto& image : images) {
+ if (f)
+ f->dump_string("name", image.name);
+ else
+ std::cout << image.name << std::endl;
+ }
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+ return 0;
+ }
+
+ TextTable tbl;
+
+ if (f) {
+ f->open_array_section("images");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("FMT", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("PROT", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("LOCK", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (size_t left = 0; left < std::min<size_t>(threads, images.size());
+ left++) {
+ workers.push_back(new WorkerEntry());
+ }
+
+ auto i = images.begin();
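+ // Drive a small state machine per worker: issue asynchronous read-only
+ // opens, process each image once its open completes, then close it
+ // asynchronously; loop until the image list is exhausted and every
+ // worker is idle again.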
+ while (true) {
+ size_t workers_idle = 0;
+ for (auto comp : workers) {
+ switch (comp->state) {
+ case STATE_DONE:
+ comp->completion->wait_for_complete();
+ comp->state = STATE_IDLE;
+ comp->completion->release();
+ comp->completion = nullptr;
+ // we want it to fall through in this case
+ case STATE_IDLE:
+ if (i == images.end()) {
+ workers_idle++;
+ continue;
+ }
+ comp->name = i->name;
+ comp->completion = new librbd::RBD::AioCompletion(nullptr, nullptr);
+ r = rbd.aio_open_read_only(ioctx, comp->img, i->name.c_str(), nullptr,
+ comp->completion);
+ i++;
+ comp->state = STATE_OPENED;
+ break;
+ case STATE_OPENED:
+ comp->completion->wait_for_complete();
+ // image might disappear between rbd.list() and rbd.open(); ignore
+ // that, warn about other possible errors (EPERM, say, for opening
+ // an old-format image, because you need execute permission for the
+ // class method)
+ r = comp->completion->get_return_value();
+ comp->completion->release();
+ if (r < 0) {
+ std::cerr << "rbd: error opening " << comp->name << ": "
+ << cpp_strerror(r) << std::endl;
+
+ // in any event, continue to next image
+ comp->state = STATE_IDLE;
+ continue;
+ }
+ r = list_process_image(&rados, comp, lflag, f, tbl);
+ if (r < 0) {
+ std::cerr << "rbd: error processing image " << comp->name << ": "
+ << cpp_strerror(r) << std::endl;
+ }
+ comp->completion = new librbd::RBD::AioCompletion(nullptr, nullptr);
+ r = comp->img.aio_close(comp->completion);
+ comp->state = STATE_DONE;
+ break;
+ }
+ }
+ if (workers_idle == workers.size()) {
+ break;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (!images.empty()) {
+ std::cout << tbl;
+ }
+
+ rados.shutdown();
+
+ for (auto comp : workers) {
+ delete comp;
+ }
+
+ return r < 0 ? r : 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("long,l", po::bool_switch(), "long listing format");
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_list(pool_name, namespace_name, vm["long"].as<bool>(),
+ g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: listing images failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+Shell::SwitchArguments switched_arguments({"long", "l"});
+Shell::Action action(
+ {"list"}, {"ls"}, "List rbd images.", "", &get_arguments, &execute);
+
+} // namespace list
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Lock.cc b/src/tools/rbd/action/Lock.cc
new file mode 100644
index 00000000..754cb384
--- /dev/null
+++ b/src/tools/rbd/action/Lock.cc
@@ -0,0 +1,279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace lock {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+void add_id_option(po::options_description *positional) {
+ positional->add_options()
+ ("lock-id", "unique lock id");
+}
+
+int get_id(const po::variables_map &vm, size_t *arg_index,
+ std::string *id) {
+ *id = utils::get_positional_argument(vm, *arg_index);
+ if (id->empty()) {
+ std::cerr << "rbd: lock id was not specified" << std::endl;
+ return -EINVAL;
+ } else {
+ ++(*arg_index);
+ }
+ return 0;
+}
+
+} // anonymous namespace
+
+static int do_lock_list(librbd::Image& image, Formatter *f)
+{
+ std::list<librbd::locker_t> lockers;
+ bool exclusive;
+ std::string tag;
+ TextTable tbl;
+ int r;
+
+ r = image.list_lockers(&lockers, &exclusive, &tag);
+ if (r < 0)
+ return r;
+
+ if (f) {
+ f->open_array_section("locks");
+ } else {
+ tbl.define_column("Locker", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("Address", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ if (lockers.size()) {
+ bool one = (lockers.size() == 1);
+
+ if (!f) {
+ std::cout << "There " << (one ? "is " : "are ") << lockers.size()
+ << (exclusive ? " exclusive" : " shared")
+ << " lock" << (one ? "" : "s") << " on this image.\n";
+ if (!exclusive)
+ std::cout << "Lock tag: " << tag << "\n";
+ }
+
+ for (std::list<librbd::locker_t>::const_iterator it = lockers.begin();
+ it != lockers.end(); ++it) {
+ if (f) {
+ f->open_object_section("lock");
+ f->dump_string("id", it->cookie);
+ f->dump_string("locker", it->client);
+ f->dump_string("address", it->address);
+ f->close_section();
+ } else {
+ tbl << it->client << it->cookie << it->address << TextTable::endrow;
+ }
+ }
+ if (!f)
+ std::cout << tbl;
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+ return 0;
+}
+
+static int do_lock_add(librbd::Image& image, const char *cookie,
+ const char *tag)
+{
+ if (tag)
+ return image.lock_shared(cookie, tag);
+ else
+ return image.lock_exclusive(cookie);
+}
+
+static int do_lock_remove(librbd::Image& image, const char *client,
+ const char *cookie)
+{
+ return image.break_lock(client, cookie);
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_lock_list(image, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: listing locks failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_add_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_id_option(positional);
+ options->add_options()
+ ("shared", po::value<std::string>(), "shared lock tag");
+}
+
+int execute_add(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string lock_cookie;
+ r = get_id(vm, &arg_index, &lock_cookie);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string lock_tag;
+ if (vm.count("shared")) {
+ lock_tag = vm["shared"].as<std::string>();
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_lock_add(image, lock_cookie.c_str(),
+ lock_tag.empty() ? nullptr : lock_tag.c_str());
+ if (r < 0) {
+ if (r == -EBUSY || r == -EEXIST) {
+ if (!lock_tag.empty()) {
+ std::cerr << "rbd: lock is already held by someone else"
+ << " with a different tag" << std::endl;
+ } else {
+ std::cerr << "rbd: lock is already held by someone else" << std::endl;
+ }
+ } else {
+ std::cerr << "rbd: taking lock failed: " << cpp_strerror(r) << std::endl;
+ }
+ return r;
+ }
+ return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ add_id_option(positional);
+ positional->add_options()
+ ("locker", "locker client");
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string lock_cookie;
+ r = get_id(vm, &arg_index, &lock_cookie);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string lock_client = utils::get_positional_argument(vm, arg_index);
+ if (lock_client.empty()) {
+ std::cerr << "rbd: locker was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_lock_remove(image, lock_client.c_str(), lock_cookie.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_list(
+ {"lock", "list"}, {"lock", "ls"}, "Show locks held on an image.", "",
+ &get_list_arguments, &execute_list);
+Shell::Action action_add(
+ {"lock", "add"}, {}, "Take a lock on an image.", "",
+ &get_add_arguments, &execute_add);
+Shell::Action action_remove(
+ {"lock", "remove"}, {"lock", "rm"}, "Release a lock on an image.", "",
+ &get_remove_arguments, &execute_remove);
+
+} // namespace lock
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MergeDiff.cc b/src/tools/rbd/action/MergeDiff.cc
new file mode 100644
index 00000000..406b23b4
--- /dev/null
+++ b/src/tools/rbd/action/MergeDiff.cc
@@ -0,0 +1,454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#define _LARGEFILE64_SOURCE
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "include/compat.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/safe_io.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+
+namespace rbd {
+namespace action {
+namespace merge_diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int parse_diff_header(int fd, __u8 *tag, string *from, string *to, uint64_t *size)
+{
+ int r;
+
+ {//header
+ char buf[utils::RBD_DIFF_BANNER.size() + 1];
+ r = safe_read_exact(fd, buf, utils::RBD_DIFF_BANNER.size());
+ if (r < 0)
+ return r;
+
+ buf[utils::RBD_DIFF_BANNER.size()] = '\0';
+ if (strcmp(buf, utils::RBD_DIFF_BANNER.c_str())) {
+ std::cerr << "invalid banner '" << buf << "', expected '"
+ << utils::RBD_DIFF_BANNER << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ while (true) {
+ r = safe_read_exact(fd, tag, 1);
+ if (r < 0)
+ return r;
+
+ if (*tag == RBD_DIFF_FROM_SNAP) {
+ r = utils::read_string(fd, 4096, from); // 4k limit to make sure we don't get a garbage string
+ if (r < 0)
+ return r;
+ dout(2) << " from snap " << *from << dendl;
+ } else if (*tag == RBD_DIFF_TO_SNAP) {
+ r = utils::read_string(fd, 4096, to); // 4k limit to make sure we don't get a garbage string
+ if (r < 0)
+ return r;
+ dout(2) << " to snap " << *to << dendl;
+ } else if (*tag == RBD_DIFF_IMAGE_SIZE) {
+ char buf[8];
+ r = safe_read_exact(fd, buf, 8);
+ if (r < 0)
+ return r;
+
+ bufferlist bl;
+ bl.append(buf, 8);
+ auto p = bl.cbegin();
+ decode(*size, p);
+ } else {
+ break;
+ }
+ }
+
+ return 0;
+}
+
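+/*
+ * Read the header of the next diff record: *tag is filled in unless the
+ * caller already read it, and for WRITE/ZERO records *offset and *length
+ * are decoded. Unknown record types yield -ENOTSUP.
+ */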
+static int parse_diff_body(int fd, __u8 *tag, uint64_t *offset, uint64_t *length)
+{
+ int r;
+
+ if (!(*tag)) {
+ r = safe_read_exact(fd, tag, 1);
+ if (r < 0)
+ return r;
+ }
+
+ if (*tag == RBD_DIFF_END) {
+ *offset = 0;
+ *length = 0;
+ return 0;
+ }
+
+ if (*tag != RBD_DIFF_WRITE && *tag != RBD_DIFF_ZERO)
+ return -ENOTSUP;
+
+ char buf[16];
+ r = safe_read_exact(fd, buf, 16);
+ if (r < 0)
+ return r;
+
+ bufferlist bl;
+ bl.append(buf, 16);
+ auto p = bl.cbegin();
+ decode(*offset, p);
+ decode(*length, p);
+
+ if (!(*length))
+ return -ENOTSUP;
+
+ return 0;
+}
+
+/*
+ * fd: the diff file to read from
+ * pd: the diff file to be written into
+ */
+static int accept_diff_body(int fd, int pd, __u8 tag, uint64_t offset, uint64_t length)
+{
+ if (tag == RBD_DIFF_END)
+ return 0;
+
+ bufferlist bl;
+ encode(tag, bl);
+ encode(offset, bl);
+ encode(length, bl);
+ int r;
+ r = bl.write_fd(pd);
+ if (r < 0)
+ return r;
+
+ if (tag == RBD_DIFF_WRITE) {
+ bufferptr bp = buffer::create(length);
+ r = safe_read_exact(fd, bp.c_str(), length);
+ if (r < 0)
+ return r;
+ bufferlist data;
+ data.append(bp);
+ r = data.write_fd(pd);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/*
+ * Merge two diff files into a single file.
+ * Note: it does not do the merging work if either of the source diff
+ * files is striped, since that complicates the process and is rarely
+ * used.
+ */
+static int do_merge_diff(const char *first, const char *second,
+ const char *path, bool no_progress)
+{
+ utils::ProgressContext pc("Merging image diff", no_progress);
+ int fd = -1, sd = -1, pd = -1, r;
+
+ string f_from, f_to;
+ string s_from, s_to;
+ uint64_t f_size = 0;
+ uint64_t s_size = 0;
+ uint64_t pc_size;
+
+ __u8 f_tag = 0, s_tag = 0;
+ uint64_t f_off = 0, f_len = 0;
+ uint64_t s_off = 0, s_len = 0;
+ bool f_end = false, s_end = false;
+
+ bool first_stdin = !strcmp(first, "-");
+ if (first_stdin) {
+ fd = STDIN_FILENO;
+ } else {
+ fd = open(first, O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << first << std::endl;
+ goto done;
+ }
+ }
+
+ sd = open(second, O_RDONLY);
+ if (sd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << second << std::endl;
+ goto done;
+ }
+
+ if (strcmp(path, "-") == 0) {
+ pd = 1;
+ } else {
+ pd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+ if (pd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error create " << path << std::endl;
+ goto done;
+ }
+ }
+
+ // We only handle diffs of the form 'banner, [ftag], [ttag], stag, [wztag]*, etag',
+ // and the (offset, length) pairs in the wztag records must be in ascending order.
+ r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size);
+ if (r < 0) {
+ std::cerr << "rbd: failed to parse first diff header" << std::endl;
+ goto done;
+ }
+
+ r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size);
+ if (r < 0) {
+ std::cerr << "rbd: failed to parse second diff header" << std::endl;
+ goto done;
+ }
+
+ if (f_to != s_from) {
+ r = -EINVAL;
+ std::cerr << "The first TO snapshot must be equal with the second FROM "
+ << "snapshot, aborting" << std::endl;
+ goto done;
+ }
+
+ {
+ // header
+ bufferlist bl;
+ bl.append(utils::RBD_DIFF_BANNER);
+
+ __u8 tag;
+ if (f_from.size()) {
+ tag = RBD_DIFF_FROM_SNAP;
+ encode(tag, bl);
+ encode(f_from, bl);
+ }
+
+ if (s_to.size()) {
+ tag = RBD_DIFF_TO_SNAP;
+ encode(tag, bl);
+ encode(s_to, bl);
+ }
+
+ tag = RBD_DIFF_IMAGE_SIZE;
+ encode(tag, bl);
+ encode(s_size, bl);
+
+ r = bl.write_fd(pd);
+ if (r < 0) {
+ std::cerr << "rbd: failed to write merged diff header" << std::endl;
+ goto done;
+ }
+ }
+ if (f_size > s_size)
+ pc_size = f_size << 1;
+ else
+ pc_size = s_size << 1;
+
+ //data block
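+ // Walk both diffs in offset order: extents from the first diff that lie
+ // before the current second-diff extent are copied to the output, extents
+ // that overlap it are skipped (the second diff supersedes the first), and
+ // the second-diff extent itself is then written out.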
+ while (!f_end || !s_end) {
+ // progress through input
+ pc.update_progress(f_off + s_off, pc_size);
+
+ if (!f_end && !f_len) {
+ uint64_t last_off = f_off;
+
+ r = parse_diff_body(fd, &f_tag, &f_off, &f_len);
+ dout(2) << "first diff data chunk: tag=" << f_tag << ", "
+ << "off=" << f_off << ", "
+ << "len=" << f_len << dendl;
+ if (r < 0) {
+ std::cerr << "rbd: failed to read first diff data chunk header"
+ << std::endl;
+ goto done;
+ }
+
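+ // once the first diff ends, treat the remainder (up to the second
+ // diff's image size) as a zero extent so the loop below can keep
+ // advancing both diffs in lock step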
+ if (f_tag == RBD_DIFF_END) {
+ f_end = true;
+ f_tag = RBD_DIFF_ZERO;
+ f_off = f_size;
+ if (f_size < s_size)
+ f_len = s_size - f_size;
+ else
+ f_len = 0;
+ }
+
+ if (last_off > f_off) {
+ r = -ENOTSUP;
+ std::cerr << "rbd: out-of-order offset from first diff ("
+ << last_off << " > " << f_off << ")" << std::endl;
+ goto done;
+ }
+ }
+
+ if (!s_end && !s_len) {
+ uint64_t last_off = s_off;
+
+ r = parse_diff_body(sd, &s_tag, &s_off, &s_len);
+ dout(2) << "second diff data chunk: tag=" << s_tag << ", "
+ << "off=" << s_off << ", "
+ << "len=" << s_len << dendl;
+ if (r < 0) {
+ std::cerr << "rbd: failed to read second diff data chunk header"
+ << std::endl;
+ goto done;
+ }
+
+ if (s_tag == RBD_DIFF_END) {
+ s_end = true;
+ s_off = s_size;
+ if (s_size < f_size)
+ s_len = f_size - s_size;
+ else
+ s_len = 0;
+ }
+
+ if (last_off > s_off) {
+ r = -ENOTSUP;
+ std::cerr << "rbd: out-of-order offset from second diff ("
+ << last_off << " > " << s_off << ")" << std::endl;
+ goto done;
+ }
+ }
+
+ if (f_off < s_off && f_len) {
+ uint64_t delta = s_off - f_off;
+ if (delta > f_len)
+ delta = f_len;
+ r = accept_diff_body(fd, pd, f_tag, f_off, delta);
+ if (r < 0) {
+ std::cerr << "rbd: failed to merge diff chunk" << std::endl;
+ goto done;
+ }
+ f_off += delta;
+ f_len -= delta;
+
+ if (!f_len) {
+ f_tag = 0;
+ continue;
+ }
+ }
+ ceph_assert(f_off >= s_off);
+
+ if (f_off < s_off + s_len && f_len) {
+ uint64_t delta = s_off + s_len - f_off;
+ if (delta > f_len)
+ delta = f_len;
+ if (f_tag == RBD_DIFF_WRITE) {
+ if (first_stdin) {
+ bufferptr bp = buffer::create(delta);
+ r = safe_read_exact(fd, bp.c_str(), delta);
+ } else {
+ off64_t l = lseek64(fd, delta, SEEK_CUR);
+ r = l < 0 ? -errno : 0;
+ }
+ if (r < 0) {
+ std::cerr << "rbd: failed to skip first diff data" << std::endl;
+ goto done;
+ }
+ }
+ f_off += delta;
+ f_len -= delta;
+
+ if (!f_len) {
+ f_tag = 0;
+ continue;
+ }
+ }
+ ceph_assert(f_off >= s_off + s_len);
+ if (s_len) {
+ r = accept_diff_body(sd, pd, s_tag, s_off, s_len);
+ if (r < 0) {
+ std::cerr << "rbd: failed to merge diff chunk" << std::endl;
+ goto done;
+ }
+ s_off += s_len;
+ s_len = 0;
+ s_tag = 0;
+ } else {
+ ceph_assert(f_end && s_end);
+ }
+ continue;
+ }
+
+ {//tail
+ __u8 tag = RBD_DIFF_END;
+ bufferlist bl;
+ encode(tag, bl);
+ r = bl.write_fd(pd);
+ }
+
+done:
+ if (pd > 2)
+ close(pd);
+ if (sd > 2)
+ close(sd);
+ if (fd > 2)
+ close(fd);
+
+ if(r < 0) {
+ pc.fail();
+ if (pd > 2)
+ unlink(path);
+ } else
+ pc.finish();
+
+ return r;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ ("diff1-path", "path to first diff (or '-' for stdin)")
+ ("diff2-path", "path to second diff");
+ at::add_path_options(positional, options,
+ "path to merged diff (or '-' for stdout)");
+ at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string first_diff = utils::get_positional_argument(vm, 0);
+ if (first_diff.empty()) {
+ std::cerr << "rbd: first diff was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string second_diff = utils::get_positional_argument(vm, 1);
+ if (second_diff.empty()) {
+ std::cerr << "rbd: second diff was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string path;
+ size_t arg_index = 2;
+ int r = utils::get_path(vm, &arg_index, &path);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_merge_diff(first_diff.c_str(), second_diff.c_str(), path.c_str(),
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ cerr << "rbd: merge-diff error" << std::endl;
+ return -r;
+ }
+
+ return 0;
+}
+
+Shell::Action action(
+ {"merge-diff"}, {}, "Merge two diff exports together.", "",
+ &get_arguments, &execute);
+
+} // namespace merge_diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Migration.cc b/src/tools/rbd/action/Migration.cc
new file mode 100644
index 00000000..bb05e376
--- /dev/null
+++ b/src/tools/rbd/action/Migration.cc
@@ -0,0 +1,338 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace migration {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_prepare(librados::IoCtx& io_ctx, const std::string &image_name,
+ librados::IoCtx& dest_io_ctx,
+ const std::string &dest_image_name,
+ librbd::ImageOptions& opts) {
+ int r = librbd::RBD().migration_prepare(io_ctx, image_name.c_str(),
+ dest_io_ctx, dest_image_name.c_str(),
+ opts);
+ if (r < 0) {
+ std::cerr << "rbd: preparing migration failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+static int do_execute(librados::IoCtx& io_ctx, const std::string &image_name,
+ bool no_progress) {
+ utils::ProgressContext pc("Image migration", no_progress);
+ int r = librbd::RBD().migration_execute_with_progress(io_ctx,
+ image_name.c_str(), pc);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: migration failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+static int do_abort(librados::IoCtx& io_ctx, const std::string &image_name,
+ bool no_progress) {
+ utils::ProgressContext pc("Abort image migration", no_progress);
+ int r = librbd::RBD().migration_abort_with_progress(io_ctx,
+ image_name.c_str(), pc);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: aborting migration failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+static int do_commit(librados::IoCtx& io_ctx, const std::string &image_name,
+ bool force, bool no_progress) {
+ librbd::image_migration_status_t migration_status;
+ int r = librbd::RBD().migration_status(io_ctx, image_name.c_str(),
+ &migration_status,
+ sizeof(migration_status));
+ if (r < 0) {
+ std::cerr << "rbd: getting migration status failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ librados::IoCtx dst_io_ctx;
+ r = librados::Rados(io_ctx).ioctx_create2(migration_status.dest_pool_id, dst_io_ctx);
+ if (r < 0) {
+ std::cerr << "rbd: accessing source pool id="
+ << migration_status.dest_pool_id << " failed: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ r = utils::set_namespace(migration_status.dest_pool_namespace, &dst_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::Image image;
+ r = utils::open_image_by_id(dst_io_ctx, migration_status.dest_image_id,
+ true, &image);
+ if (r < 0) {
+ return r;
+ }
+
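+ // in-use, read-only descendants of the destination image will not detect
+ // the parent update performed by the commit, so refuse unless --force is
+ // given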
+ std::vector<librbd::linked_image_spec_t> children;
+ r = image.list_descendants(&children);
+ if (r < 0) {
+ std::cerr << "rbd: listing descendants failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ if (children.size() > 0) {
+ std::cerr << "rbd: the image has "
+ << (children.size() == 1 ? "a descendant" : "descendants") << ": "
+ << std::endl;
+ for (auto& child : children) {
+ std::cerr << " " << child.pool_name << "/";
+ if (!child.pool_namespace.empty()) {
+ std::cerr << child.pool_namespace << "/";
+ }
+ std::cerr << child.image_name;
+ if (child.trash) {
+ std::cerr << " (trash " << child.image_id << ")";
+ }
+ std::cerr << std::endl;
+ }
+ std::cerr << "Warning: in-use, read-only descendant images"
+ << " will not detect the parent update." << std::endl;
+ if (force) {
+ std::cerr << "Proceeding anyway due to force flag set." << std::endl;
+ } else {
+ std::cerr << "Ensure no descendant images are opened read-only"
+ << " and run again with force flag." << std::endl;
+ return -EBUSY;
+ }
+ }
+
+ utils::ProgressContext pc("Commit image migration", no_progress);
+ r = librbd::RBD().migration_commit_with_progress(io_ctx, image_name.c_str(),
+ pc);
+ if (r < 0) {
+ pc.fail();
+ std::cerr << "rbd: committing migration failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_prepare_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+ at::add_create_image_options(options, true);
+ at::add_flatten_option(options);
+}
+
+int execute_prepare(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ io_ctx.set_osdmap_full_try();
+
+ std::string dest_pool_name;
+ std::string dest_namespace_name;
+ std::string dest_image_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dest_pool_name,
+ &dest_namespace_name, &dest_image_name, nullptr, false,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::ImageOptions opts;
+ r = utils::get_image_options(vm, true, &opts);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::IoCtx dest_io_ctx;
+ if (!dest_pool_name.empty()) {
+ r = utils::init_io_ctx(rados, dest_pool_name, dest_namespace_name,
+ &dest_io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ r = do_prepare(io_ctx, image_name, dest_pool_name.empty() ? io_ctx :
+ dest_io_ctx, dest_image_name, opts);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+void get_execute_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute_execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ io_ctx.set_osdmap_full_try();
+
+ r = do_execute(io_ctx, image_name, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+void get_abort_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute_abort(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ io_ctx.set_osdmap_full_try();
+
+ r = do_abort(io_ctx, image_name, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+void get_commit_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+ options->add_options()
+ ("force", po::bool_switch(), "proceed even if the image has children");
+}
+
+int execute_commit(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+ io_ctx.set_osdmap_full_try();
+
+ r = do_commit(io_ctx, image_name, vm["force"].as<bool>(),
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+Shell::Action action_prepare(
+ {"migration", "prepare"}, {}, "Prepare image migration.",
+ at::get_long_features_help(), &get_prepare_arguments, &execute_prepare);
+
+Shell::Action action_execute(
+ {"migration", "execute"}, {}, "Execute image migration.", "",
+ &get_execute_arguments, &execute_execute);
+
+Shell::Action action_abort(
+ {"migration", "abort"}, {}, "Cancel interrupted image migration.", "",
+ &get_abort_arguments, &execute_abort);
+
+Shell::Action action_commit(
+ {"migration", "commit"}, {}, "Commit image migration.", "",
+ &get_commit_arguments, &execute_commit);
+
+} // namespace migration
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MirrorImage.cc b/src/tools/rbd/action/MirrorImage.cc
new file mode 100644
index 00000000..a250b694
--- /dev/null
+++ b/src/tools/rbd/action/MirrorImage.cc
@@ -0,0 +1,360 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/MirrorDaemonServiceInfo.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace mirror_image {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+int validate_mirroring_enabled(librbd::Image& image) {
+ librbd::mirror_image_info_t mirror_image;
+ int r = image.mirror_image_get_info(&mirror_image, sizeof(mirror_image));
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_image.state != RBD_MIRROR_IMAGE_ENABLED) {
+ std::cerr << "rbd: mirroring not enabled on the image" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+} // anonymous namespace
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+void get_arguments_disable(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("force", po::bool_switch(), "disable even if not primary");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_enable_disable(const po::variables_map &vm, bool enable,
+ bool force) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, "", image_name, "", "", false,
+ &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = enable ? image.mirror_image_enable() : image.mirror_image_disable(force);
+ if (r < 0) {
+ return r;
+ }
+
+ std::cout << (enable ? "Mirroring enabled" : "Mirroring disabled")
+ << std::endl;
+
+ return 0;
+}
+
+int execute_disable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute_enable_disable(vm, false, vm["force"].as<bool>());
+}
+
+int execute_enable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ return execute_enable_disable(vm, true, false);
+}
+
+void get_arguments_promote(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("force", po::bool_switch(), "promote even if not cleanly demoted by remote cluster");
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_promote(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ bool force = vm["force"].as<bool>();
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, "", image_name, "", "", false,
+ &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.mirror_image_promote(force);
+ if (r < 0) {
+ std::cerr << "rbd: error promoting image to primary" << std::endl;
+ return r;
+ }
+
+ std::cout << "Image promoted to primary" << std::endl;
+ return 0;
+}
+
+int execute_demote(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, "", image_name, "", "", false,
+ &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.mirror_image_demote();
+ if (r < 0) {
+ std::cerr << "rbd: error demoting image to non-primary" << std::endl;
+ return r;
+ }
+
+ std::cout << "Image demoted to non-primary" << std::endl;
+ return 0;
+}
+
+int execute_resync(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, "", image_name, "", "", false,
+ &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.mirror_image_resync();
+ if (r < 0) {
+ std::cerr << "rbd: error flagging image resync" << std::endl;
+ return r;
+ }
+
+ std::cout << "Flagged image for resync from primary" << std::endl;
+ return 0;
+}
+
+void get_status_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute_status(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ at::Format::Formatter formatter;
+ int r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string image_name;
+ std::string snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, nullptr,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, "", image_name, "", "", false,
+ &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(image);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::mirror_image_status_t status;
+ r = image.mirror_image_get_status(&status, sizeof(status));
+ if (r < 0) {
+ std::cerr << "rbd: failed to get status for image " << image_name << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ std::string instance_id;
+ MirrorDaemonServiceInfo daemon_service_info(io_ctx);
+
+ if (status.up) {
+ r = image.mirror_image_get_instance_id(&instance_id);
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: newer release of Ceph OSDs required to map image "
+ << "to rbd-mirror daemon instance" << std::endl;
+ // not fatal
+ } else if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to get service id for image "
+ << image_name << ": " << cpp_strerror(r) << std::endl;
+ // not fatal
+ } else if (!instance_id.empty()) {
+ daemon_service_info.init();
+ }
+ }
+
+ std::string state = utils::mirror_image_status_state(status);
+ std::string last_update = (
+ status.last_update == 0 ? "" : utils::timestr(status.last_update));
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("image");
+ formatter->dump_string("name", image_name);
+ formatter->dump_string("global_id", status.info.global_id);
+ formatter->dump_string("state", state);
+ formatter->dump_string("description", status.description);
+ daemon_service_info.dump(instance_id, formatter);
+ formatter->dump_string("last_update", last_update);
+ formatter->close_section(); // image
+ formatter->flush(std::cout);
+ } else {
+ std::cout << image_name << ":\n"
+ << " global_id: " << status.info.global_id << "\n"
+ << " state: " << state << "\n"
+ << " description: " << status.description << "\n";
+ if (!instance_id.empty()) {
+ std::cout << " service: " <<
+ daemon_service_info.get_description(instance_id) << "\n";
+ }
+ std::cout << " last_update: " << last_update << std::endl;
+ }
+
+ return 0;
+}
+
+Shell::Action action_enable(
+ {"mirror", "image", "enable"}, {},
+ "Enable RBD mirroring for an image.", "",
+ &get_arguments, &execute_enable);
+Shell::Action action_disable(
+ {"mirror", "image", "disable"}, {},
+ "Disable RBD mirroring for an image.", "",
+ &get_arguments_disable, &execute_disable);
+Shell::Action action_promote(
+ {"mirror", "image", "promote"}, {},
+ "Promote an image to primary for RBD mirroring.", "",
+ &get_arguments_promote, &execute_promote);
+Shell::Action action_demote(
+ {"mirror", "image", "demote"}, {},
+ "Demote an image to non-primary for RBD mirroring.", "",
+ &get_arguments, &execute_demote);
+Shell::Action action_resync(
+ {"mirror", "image", "resync"}, {},
+ "Force resync to primary image for RBD mirroring.", "",
+ &get_arguments, &execute_resync);
+Shell::Action action_status(
+ {"mirror", "image", "status"}, {},
+ "Show RBD mirroring status for an image.", "",
+ &get_status_arguments, &execute_status);
+
+} // namespace mirror_image
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MirrorPool.cc b/src/tools/rbd/action/MirrorPool.cc
new file mode 100644
index 00000000..ff7c3031
--- /dev/null
+++ b/src/tools/rbd/action/MirrorPool.cc
@@ -0,0 +1,1537 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/MirrorDaemonServiceInfo.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "include/rbd/librbd.hpp"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/Throttle.h"
+#include "global/global_context.h"
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <regex>
+#include <set>
+#include <boost/program_options.hpp>
+#include "include/ceph_assert.h"
+
+#include <atomic>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::action::MirrorPool: "
+
+namespace rbd {
+namespace action {
+namespace mirror_pool {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static const std::string ALL_NAME("all");
+static const std::string SITE_NAME("site-name");
+
+namespace {
+
+void add_site_name_optional(po::options_description *options) {
+ options->add_options()
+ (SITE_NAME.c_str(), po::value<std::string>(), "local site name");
+}
+
+int set_site_name(librados::Rados& rados, const std::string& site_name) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_site_name_set(rados, site_name);
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: cluster does not support site names" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to set site name" << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+struct MirrorPeerDirection {};
+
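+// boost::program_options custom validator: maps the textual mirror peer
+// direction ("rx-only" or "rx-tx") to the corresponding librbd enum value.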
+void validate(boost::any& v, const std::vector<std::string>& values,
+ MirrorPeerDirection *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+
+ if (s == "rx-only") {
+ v = boost::any(RBD_MIRROR_PEER_DIRECTION_RX);
+ } else if (s == "rx-tx") {
+ v = boost::any(RBD_MIRROR_PEER_DIRECTION_RX_TX);
+ } else {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
+int validate_mirroring_enabled(librados::IoCtx& io_ctx) {
+ librbd::RBD rbd;
+ rbd_mirror_mode_t mirror_mode;
+ int r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (mirror_mode == RBD_MIRROR_MODE_DISABLED) {
+ std::cerr << "rbd: mirroring not enabled on the pool" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int validate_uuid(const std::string &uuid) {
+ std::regex pattern("^[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}$",
+ std::regex::icase);
+ std::smatch match;
+ if (!std::regex_match(uuid, match, pattern)) {
+ std::cerr << "rbd: invalid uuid '" << uuid << "'" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int read_key_file(std::string path, std::string* key) {
+ std::ifstream key_file;
+ key_file.open(path);
+ if (key_file.fail()) {
+ std::cerr << "rbd: failed to open " << path << std::endl;
+ return -EINVAL;
+ }
+
+ std::getline(key_file, *key);
+ if (key_file.bad()) {
+ std::cerr << "rbd: failed to read key from " << path << std::endl;
+ return -EINVAL;
+ }
+
+ key_file.close();
+ return 0;
+}
+
+void add_uuid_option(po::options_description *positional) {
+ positional->add_options()
+ ("uuid", po::value<std::string>(), "peer uuid");
+}
+
+int get_uuid(const po::variables_map &vm, size_t arg_index,
+ std::string *uuid) {
+ *uuid = utils::get_positional_argument(vm, arg_index);
+ if (uuid->empty()) {
+ std::cerr << "rbd: must specify peer uuid" << std::endl;
+ return -EINVAL;
+ }
+ return validate_uuid(*uuid);
+}
+
+int get_remote_cluster_spec(const po::variables_map &vm,
+ const std::string &spec,
+ std::string *remote_client_name,
+ std::string *remote_cluster,
+ std::map<std::string, std::string>* attributes) {
+ if (vm.count("remote-client-name")) {
+ *remote_client_name = vm["remote-client-name"].as<std::string>();
+ }
+ if (vm.count("remote-cluster")) {
+ *remote_cluster = vm["remote-cluster"].as<std::string>();
+ }
+ if (vm.count("remote-mon-host")) {
+ (*attributes)["mon_host"] = vm["remote-mon-host"].as<std::string>();
+ }
+ if (vm.count("remote-key-file")) {
+ std::string key;
+ int r = read_key_file(vm["remote-key-file"].as<std::string>(), &key);
+ if (r < 0) {
+ return r;
+ }
+ (*attributes)["key"] = key;
+ }
+
+ if (!spec.empty()) {
+ std::regex pattern("^(?:(client\\.[^@]+)@)?([^/@]+)$");
+ std::smatch match;
+ if (!std::regex_match(spec, match, pattern)) {
+ std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl;
+ return -EINVAL;
+ }
+ if (match[1].matched) {
+ *remote_client_name = match[1];
+ }
+ *remote_cluster = match[2];
+ }
+
+ if (remote_cluster->empty()) {
+ std::cerr << "rbd: remote cluster was not specified" << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int set_peer_config_key(librados::IoCtx& io_ctx, const std::string& peer_uuid,
+ std::map<std::string, std::string>&& attributes) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_peer_set_attributes(io_ctx, peer_uuid, attributes);
+ if (r == -EPERM) {
+ std::cerr << "rbd: permission denied attempting to set peer "
+ << "config-key secrets in the monitor" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to update mirroring peer config: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+int get_peer_config_key(librados::IoCtx& io_ctx, const std::string& peer_uuid,
+ std::map<std::string, std::string>* attributes) {
+ librbd::RBD rbd;
+ int r = rbd.mirror_peer_get_attributes(io_ctx, peer_uuid, attributes);
+ if (r == -ENOENT) {
+ return r;
+ } else if (r == -EPERM) {
+ std::cerr << "rbd: permission denied attempting to access peer "
+ << "config-key secrets from the monitor" << std::endl;
+ return r;
+ } else if (r == -EINVAL) {
+ std::cerr << "rbd: corrupt mirroring peer config" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: error reading mirroring peer config: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
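+// Read-modify-write of a single peer attribute: the current attributes are
+// fetched first, an empty value erases the key, and a missing config entry is
+// created on demand.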
+int update_peer_config_key(librados::IoCtx& io_ctx,
+ const std::string& peer_uuid,
+ const std::string& key,
+ const std::string& value) {
+ std::map<std::string, std::string> attributes;
+ int r = get_peer_config_key(io_ctx, peer_uuid, &attributes);
+ if (r == -ENOENT) {
+ return set_peer_config_key(io_ctx, peer_uuid, {{key, value}});
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (value.empty()) {
+ attributes.erase(key);
+ } else {
+ attributes[key] = value;
+ }
+ return set_peer_config_key(io_ctx, peer_uuid, std::move(attributes));
+}
+
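+// Render the peer list either through the structured formatter (JSON/XML) or
+// as a plain-text table; the mon_host/key attributes are only looked up and
+// shown when config_key is set (i.e. the --all switch was given).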
+int format_mirror_peers(librados::IoCtx& io_ctx,
+ at::Format::Formatter formatter,
+ const std::vector<librbd::mirror_peer_t> &peers,
+ bool config_key) {
+ TextTable tbl;
+ if (formatter != nullptr) {
+ formatter->open_array_section("peers");
+ } else {
+ std::cout << "Peers: ";
+ if (peers.empty()) {
+ std::cout << "none" << std::endl;
+ } else {
+ tbl.define_column("", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("UUID", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("CLIENT", TextTable::LEFT, TextTable::LEFT);
+ if (config_key) {
+ tbl.define_column("MON_HOST", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("KEY", TextTable::LEFT, TextTable::LEFT);
+ }
+ }
+ }
+
+ for (auto &peer : peers) {
+ std::map<std::string, std::string> attributes;
+ if (config_key) {
+ int r = get_peer_config_key(io_ctx, peer.uuid, &attributes);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ }
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("peer");
+ formatter->dump_string("uuid", peer.uuid);
+ formatter->dump_string("cluster_name", peer.cluster_name);
+ formatter->dump_string("client_name", peer.client_name);
+ for (auto& pair : attributes) {
+ formatter->dump_string(pair.first.c_str(), pair.second);
+ }
+ formatter->close_section();
+ } else {
+ tbl << " "
+ << peer.uuid
+ << peer.cluster_name
+ << peer.client_name;
+ if (config_key) {
+ tbl << attributes["mon_host"]
+ << attributes["key"];
+ }
+ tbl << TextTable::endrow;
+ }
+ }
+
+ if (formatter != nullptr) {
+ formatter->close_section();
+ } else {
+ std::cout << std::endl << tbl;
+ }
+ return 0;
+}
+
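+// Base class for the per-image asynchronous requests issued across a pool.
+// Each request walks open -> get mirror info -> action -> close (see the state
+// diagram below), with concurrency bounded by the shared OrderedThrottle, and
+// deletes itself once its throttle slot is released.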
+class ImageRequestBase {
+public:
+ void send() {
+ dout(20) << this << " " << __func__ << ": image_name=" << m_image_name
+ << dendl;
+
+ auto ctx = new FunctionContext([this](int r) {
+ handle_finalize(r);
+ });
+
+ // will pause here until slots are available
+ m_finalize_ctx = m_throttle.start_op(ctx);
+
+ open_image();
+ }
+
+protected:
+ ImageRequestBase(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name)
+ : m_io_ctx(io_ctx), m_throttle(throttle), m_image_name(image_name) {
+ }
+ virtual ~ImageRequestBase() {
+ }
+
+ virtual bool skip_get_info() const {
+ return false;
+ }
+ virtual void get_info(librbd::Image &image, librbd::mirror_image_info_t *info,
+ librbd::RBD::AioCompletion *aio_comp) {
+ image.aio_mirror_image_get_info(info, sizeof(librbd::mirror_image_info_t),
+ aio_comp);
+ }
+
+ virtual bool skip_action(const librbd::mirror_image_info_t &info) const {
+ return false;
+ }
+ virtual void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) = 0;
+ virtual void handle_execute_action(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to " << get_action_type() << " image "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ m_ret_val = r;
+ }
+
+ close_image();
+ }
+
+ virtual void finalize_action() {
+ }
+ virtual std::string get_action_type() const = 0;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE
+ * |
+ * v
+ * GET_INFO
+ * |
+ * v
+ * EXECUTE_ACTION
+ * |
+ * v
+ * CLOSE_IMAGE
+ * |
+ * v
+ * FINALIZE_ACTION
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ OrderedThrottle &m_throttle;
+ const std::string m_image_name;
+
+ librbd::Image m_image;
+ Context *m_finalize_ctx = nullptr;
+
+ librbd::mirror_image_info_t m_mirror_image_info;
+
+ int m_ret_val = 0;
+
+ void open_image() {
+ dout(20) << this << " " << __func__ << dendl;
+
+ librbd::RBD rbd;
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_open_image>(this);
+ rbd.aio_open(m_io_ctx, m_image, m_image_name.c_str(), nullptr,
+ aio_completion);
+ }
+
+ void handle_open_image(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to open image "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ m_finalize_ctx->complete(r);
+ return;
+ }
+
+ get_info();
+ }
+
+ void get_info() {
+ if (skip_get_info()) {
+ execute_action();
+ return;
+ }
+ dout(20) << this << " " << __func__ << dendl;
+
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_get_info>(this);
+ get_info(m_image, &m_mirror_image_info, aio_completion);
+ }
+
+ void handle_get_info(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ close_image();
+ return;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror image info for "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ execute_action();
+ }
+
+ void execute_action() {
+ if (skip_action(m_mirror_image_info)) {
+ close_image();
+ return;
+ }
+ dout(20) << this << " " << __func__ << dendl;
+
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_execute_action>(this);
+ execute_action(m_image, aio_completion);
+ }
+
+ void close_image() {
+ dout(20) << this << " " << __func__ << dendl;
+
+ auto aio_completion = utils::create_aio_completion<
+ ImageRequestBase, &ImageRequestBase::handle_close_image>(this);
+ m_image.aio_close(aio_completion);
+ }
+
+ void handle_close_image(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r < 0) {
+ std::cerr << "rbd: failed to close image "
+ << m_image_name << ": " << cpp_strerror(r) << std::endl;
+ }
+
+ m_finalize_ctx->complete(r);
+ }
+
+ void handle_finalize(int r) {
+ dout(20) << this << " " << __func__ << ": r=" << r << dendl;
+
+ if (r == 0 && m_ret_val < 0) {
+ r = m_ret_val;
+ }
+ if (r >= 0) {
+ finalize_action();
+ }
+ m_throttle.end_op(r);
+ delete this;
+ }
+
+};
+
+class PromoteImageRequest : public ImageRequestBase {
+public:
+ PromoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name, std::atomic<unsigned> *counter,
+ bool force)
+ : ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter),
+ m_force(force) {
+ }
+
+protected:
+ bool skip_action(const librbd::mirror_image_info_t &info) const override {
+ return (info.state != RBD_MIRROR_IMAGE_ENABLED || info.primary);
+ }
+
+ void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) override {
+ image.aio_mirror_image_promote(m_force, aio_comp);
+ }
+
+ void handle_execute_action(int r) override {
+ if (r >= 0) {
+ (*m_counter)++;
+ }
+ ImageRequestBase::handle_execute_action(r);
+ }
+
+ std::string get_action_type() const override {
+ return "promote";
+ }
+
+private:
+ std::atomic<unsigned> *m_counter = nullptr;
+ bool m_force;
+};
+
+class DemoteImageRequest : public ImageRequestBase {
+public:
+ DemoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name, std::atomic<unsigned> *counter)
+ : ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter) {
+ }
+
+protected:
+ bool skip_action(const librbd::mirror_image_info_t &info) const override {
+ return (info.state != RBD_MIRROR_IMAGE_ENABLED || !info.primary);
+ }
+
+ void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) override {
+ image.aio_mirror_image_demote(aio_comp);
+ }
+ void handle_execute_action(int r) override {
+ if (r >= 0) {
+ (*m_counter)++;
+ }
+ ImageRequestBase::handle_execute_action(r);
+ }
+
+ std::string get_action_type() const override {
+ return "demote";
+ }
+
+private:
+ std::atomic<unsigned> *m_counter = nullptr;
+};
+
+class StatusImageRequest : public ImageRequestBase {
+public:
+ StatusImageRequest(
+ librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name,
+ const std::map<std::string, std::string> &instance_ids,
+ const MirrorDaemonServiceInfo &daemon_service_info,
+ at::Format::Formatter formatter)
+ : ImageRequestBase(io_ctx, throttle, image_name),
+ m_instance_ids(instance_ids), m_daemon_service_info(daemon_service_info),
+ m_formatter(formatter) {
+ }
+
+protected:
+ bool skip_get_info() const override {
+ return true;
+ }
+
+ void execute_action(librbd::Image &image,
+ librbd::RBD::AioCompletion *aio_comp) override {
+ image.get_id(&m_image_id);
+ image.aio_mirror_image_get_status(&m_mirror_image_status,
+ sizeof(m_mirror_image_status), aio_comp);
+ }
+
+ void finalize_action() override {
+ if (m_mirror_image_status.info.global_id.empty()) {
+ return;
+ }
+
+ std::string state = utils::mirror_image_status_state(m_mirror_image_status);
+ std::string instance_id = (m_mirror_image_status.up &&
+ m_instance_ids.count(m_image_id)) ?
+ m_instance_ids.find(m_image_id)->second : "";
+ std::string last_update = (
+ m_mirror_image_status.last_update == 0 ?
+ "" : utils::timestr(m_mirror_image_status.last_update));
+
+ if (m_formatter != nullptr) {
+ m_formatter->open_object_section("image");
+ m_formatter->dump_string("name", m_mirror_image_status.name);
+ m_formatter->dump_string("global_id",
+ m_mirror_image_status.info.global_id);
+ m_formatter->dump_string("state", state);
+ m_formatter->dump_string("description",
+ m_mirror_image_status.description);
+ m_daemon_service_info.dump(instance_id, m_formatter);
+ m_formatter->dump_string("last_update", last_update);
+ m_formatter->close_section(); // image
+ } else {
+ std::cout << "\n" << m_mirror_image_status.name << ":\n"
+ << " global_id: "
+ << m_mirror_image_status.info.global_id << "\n"
+ << " state: " << state << "\n"
+ << " description: "
+ << m_mirror_image_status.description << "\n";
+ if (!instance_id.empty()) {
+ std::cout << " service: "
+ << m_daemon_service_info.get_description(instance_id) << "\n";
+ }
+ std::cout << " last_update: " << last_update << std::endl;
+ }
+ }
+
+ std::string get_action_type() const override {
+ return "status";
+ }
+
+private:
+ const std::map<std::string, std::string> &m_instance_ids;
+ const MirrorDaemonServiceInfo &m_daemon_service_info;
+ at::Format::Formatter m_formatter;
+ std::string m_image_id;
+ librbd::mirror_image_status_t m_mirror_image_status;
+};
+
+template <typename RequestT>
+class ImageRequestAllocator {
+public:
+ template <class... Args>
+ RequestT *operator()(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
+ const std::string &image_name, Args&&... args) {
+ return new RequestT(io_ctx, throttle, image_name,
+ std::forward<Args>(args)...);
+ }
+};
+
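+// Fans out one RequestT per image returned by rbd.list2(); the number of
+// in-flight requests is capped by the rbd_concurrent_management_ops setting.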
+template <typename RequestT>
+class ImageRequestGenerator {
+public:
+ template <class... Args>
+ ImageRequestGenerator(librados::IoCtx &io_ctx, Args&&... args)
+ : m_io_ctx(io_ctx),
+ m_factory(std::bind(ImageRequestAllocator<RequestT>(),
+ std::ref(m_io_ctx), std::ref(m_throttle),
+ std::placeholders::_1, std::forward<Args>(args)...)),
+ m_throttle(g_conf().get_val<uint64_t>("rbd_concurrent_management_ops"),
+ true) {
+ }
+
+ int execute() {
+ // use the alphabetical list of image names for pool-level
+ // mirror image operations
+ librbd::RBD rbd;
+ int r = rbd.list2(m_io_ctx, &m_images);
+ if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to list images within pool" << std::endl;
+ return r;
+ }
+
+ for (auto &image : m_images) {
+ auto request = m_factory(image.name);
+ request->send();
+ }
+
+ return m_throttle.wait_for_ret();
+ }
+private:
+ typedef std::function<RequestT*(const std::string&)> Factory;
+
+ librados::IoCtx &m_io_ctx;
+ Factory m_factory;
+
+ OrderedThrottle m_throttle;
+
+ std::vector<librbd::image_spec_t> m_images;
+
+};
+
+} // anonymous namespace
+
+void get_peer_bootstrap_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+  add_site_name_optional(options);
+}
+
+int execute_peer_bootstrap_create(
+ const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ if (vm.count(SITE_NAME)) {
+ r = set_site_name(rados, vm[SITE_NAME].as<std::string>());
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ librbd::RBD rbd;
+ std::string token;
+ r = rbd.mirror_peer_bootstrap_create(io_ctx, &token);
+  if (r == -EEXIST) {
+    std::cerr << "rbd: mismatch with pre-existing RBD mirroring peer user caps"
+              << std::endl;
+    return r;
+  } else if (r < 0) {
+ std::cerr << "rbd: failed to create mirroring bootstrap token: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ std::cout << token << std::endl;
+ return 0;
+}
+
+void get_peer_bootstrap_import_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+  add_site_name_optional(options);
+ positional->add_options()
+ ("token-path", po::value<std::string>(),
+ "bootstrap token file (or '-' for stdin)");
+ options->add_options()
+ ("token-path", po::value<std::string>(),
+ "bootstrap token file (or '-' for stdin)")
+ ("direction", po::value<MirrorPeerDirection>(),
+ "mirroring direction (rx-only, rx-tx)\n"
+ "[default: rx-tx]");
+}
+
+int execute_peer_bootstrap_import(
+ const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string token_path;
+ if (vm.count("token-path")) {
+ token_path = vm["token-path"].as<std::string>();
+ } else {
+ token_path = utils::get_positional_argument(vm, arg_index++);
+ }
+
+ if (token_path.empty()) {
+ std::cerr << "rbd: token path was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ rbd_mirror_peer_direction_t mirror_peer_direction =
+ RBD_MIRROR_PEER_DIRECTION_RX_TX;
+ if (vm.count("direction")) {
+ mirror_peer_direction = vm["direction"].as<rbd_mirror_peer_direction_t>();
+ }
+
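+  // Read the bootstrap token from the given file, or from stdin when the path
+  // is '-'; tokens longer than 1023 bytes are truncated.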
+ int fd = STDIN_FILENO;
+ if (token_path != "-") {
+ fd = open(token_path.c_str(), O_RDONLY);
+ if (fd < 0) {
+ r = -errno;
+ std::cerr << "rbd: error opening " << token_path << std::endl;
+ return r;
+ }
+ }
+
+ char token[1024];
+ memset(token, 0, sizeof(token));
+ r = safe_read(fd, token, sizeof(token) - 1);
+ if (fd != STDIN_FILENO) {
+ VOID_TEMP_FAILURE_RETRY(close(fd));
+ }
+
+ if (r < 0) {
+ std::cerr << "rbd: error reading token file: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ if (vm.count(SITE_NAME)) {
+ r = set_site_name(rados, vm[SITE_NAME].as<std::string>());
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ librbd::RBD rbd;
+ r = rbd.mirror_peer_bootstrap_import(io_ctx, mirror_peer_direction, token);
+ if (r == -ENOSYS) {
+ std::cerr << "rbd: mirroring is not enabled on remote peer" << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to import peer bootstrap token" << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_peer_add_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ positional->add_options()
+ ("remote-cluster-spec", "remote cluster spec\n"
+ "(example: [<client name>@]<cluster name>)");
+ options->add_options()
+ ("remote-client-name", po::value<std::string>(), "remote client name")
+ ("remote-cluster", po::value<std::string>(), "remote cluster name")
+ ("remote-mon-host", po::value<std::string>(), "remote mon host(s)")
+ ("remote-key-file", po::value<std::string>(),
+ "path to file containing remote key");
+}
+
+int execute_peer_add(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string remote_client_name = g_ceph_context->_conf->name.to_str();
+ std::string remote_cluster;
+ std::map<std::string, std::string> attributes;
+ r = get_remote_cluster_spec(
+ vm, utils::get_positional_argument(vm, arg_index),
+ &remote_client_name, &remote_cluster, &attributes);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO: temporary restriction to prevent adding multiple peers
+ // until rbd-mirror daemon can properly handle the scenario
+ librbd::RBD rbd;
+ std::vector<librbd::mirror_peer_t> mirror_peers;
+ r = rbd.mirror_peer_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ std::cerr << "rbd: failed to list mirror peers" << std::endl;
+ return r;
+ }
+ if (!mirror_peers.empty()) {
+ std::cerr << "rbd: multiple peers are not currently supported" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string uuid;
+ r = rbd.mirror_peer_add(io_ctx, &uuid, remote_cluster, remote_client_name);
+ if (r < 0) {
+ std::cerr << "rbd: error adding mirror peer" << std::endl;
+ return r;
+ }
+
+ if (!attributes.empty()) {
+ r = set_peer_config_key(io_ctx, uuid, std::move(attributes));
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ std::cout << uuid << std::endl;
+ return 0;
+}
+
+void get_peer_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ add_uuid_option(positional);
+}
+
+int execute_peer_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string uuid;
+ r = get_uuid(vm, arg_index, &uuid);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.mirror_peer_remove(io_ctx, uuid);
+ if (r < 0) {
+ std::cerr << "rbd: error removing mirror peer" << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_peer_set_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ add_uuid_option(positional);
+ positional->add_options()
+ ("key", "peer parameter [client, cluster, mon-host, key-file]")
+ ("value", "new value for specified key");
+}
+
+int execute_peer_set(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string uuid;
+ r = get_uuid(vm, arg_index++, &uuid);
+ if (r < 0) {
+ return r;
+ }
+
+ std::set<std::string> valid_keys{{"client", "cluster", "mon-host",
+ "key-file"}};
+ std::string key = utils::get_positional_argument(vm, arg_index++);
+ if (valid_keys.find(key) == valid_keys.end()) {
+ std::cerr << "rbd: must specify ";
+ for (auto& valid_key : valid_keys) {
+ std::cerr << "'" << valid_key << "'";
+ if (&valid_key != &(*valid_keys.rbegin())) {
+ std::cerr << ", ";
+ }
+ }
+ std::cerr << " key." << std::endl;
+ return -EINVAL;
+ }
+
+ std::string value = utils::get_positional_argument(vm, arg_index++);
+  if (value.empty() && (key == "client" || key == "cluster")) {
+    std::cerr << "rbd: must specify new " << key << " value." << std::endl;
+    return -EINVAL;
+  } else if (key == "key-file") {
+ key = "key";
+ r = read_key_file(value, &value);
+ if (r < 0) {
+ return r;
+ }
+ } else if (key == "mon-host") {
+ key = "mon_host";
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ if (key == "client") {
+ r = rbd.mirror_peer_set_client(io_ctx, uuid.c_str(), value.c_str());
+ } else if (key == "cluster") {
+ r = rbd.mirror_peer_set_cluster(io_ctx, uuid.c_str(), value.c_str());
+ } else {
+ r = update_peer_config_key(io_ctx, uuid, key, value);
+ if (r == -ENOENT) {
+ std::cerr << "rbd: mirror peer " << uuid << " does not exist"
+ << std::endl;
+ }
+ }
+
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+void get_disable_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+}
+
+void get_enable_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ positional->add_options()
+ ("mode", "mirror mode [image or pool]");
+ add_site_name_optional(options);
+}
+
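+// Shared by "mirror pool enable" and "mirror pool disable": the current mode
+// is compared against the requested one and mirror_mode_set() is only invoked
+// when a change is actually required.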
+int execute_enable_disable(librados::IoCtx& io_ctx,
+ rbd_mirror_mode_t next_mirror_mode,
+ const std::string &mode, bool ignore_no_update) {
+ librbd::RBD rbd;
+ rbd_mirror_mode_t current_mirror_mode;
+ int r = rbd.mirror_mode_get(io_ctx, &current_mirror_mode);
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve mirror mode: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (current_mirror_mode == next_mirror_mode) {
+ if (!ignore_no_update) {
+ if (mode == "disabled") {
+ std::cout << "rbd: mirroring is already " << mode << std::endl;
+ } else {
+ std::cout << "rbd: mirroring is already configured for "
+ << mode << " mode" << std::endl;
+ }
+ }
+ return 0;
+ } else if (next_mirror_mode == RBD_MIRROR_MODE_IMAGE &&
+ current_mirror_mode == RBD_MIRROR_MODE_POOL) {
+ std::cout << "note: changing mirroring mode from pool to image"
+ << std::endl;
+ } else if (next_mirror_mode == RBD_MIRROR_MODE_POOL &&
+ current_mirror_mode == RBD_MIRROR_MODE_IMAGE) {
+ std::cout << "note: changing mirroring mode from image to pool"
+ << std::endl;
+ }
+
+ r = rbd.mirror_mode_set(io_ctx, next_mirror_mode);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+int execute_disable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ // TODO support namespaces
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ return execute_enable_disable(io_ctx, RBD_MIRROR_MODE_DISABLED, "disabled",
+ false);
+}
+
+int execute_enable(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ rbd_mirror_mode_t mirror_mode;
+ std::string mode = utils::get_positional_argument(vm, arg_index++);
+ if (mode == "image") {
+ mirror_mode = RBD_MIRROR_MODE_IMAGE;
+ } else if (mode == "pool") {
+ mirror_mode = RBD_MIRROR_MODE_POOL;
+ } else {
+ std::cerr << "rbd: must specify 'image' or 'pool' mode." << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+
+ // TODO support namespaces
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ bool updated = false;
+ if (vm.count(SITE_NAME)) {
+ librbd::RBD rbd;
+
+ auto site_name = vm[SITE_NAME].as<std::string>();
+ std::string original_site_name;
+ r = rbd.mirror_site_name_get(rados, &original_site_name);
+ updated = (r >= 0 && site_name != original_site_name);
+
+ r = set_site_name(rados, site_name);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return execute_enable_disable(io_ctx, mirror_mode, mode, updated);
+}
+
+void get_info_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ at::add_format_options(options);
+ options->add_options()
+ (ALL_NAME.c_str(), po::bool_switch(), "list all attributes");
+}
+
+int execute_info(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ rbd_mirror_mode_t mirror_mode;
+ r = rbd.mirror_mode_get(io_ctx, &mirror_mode);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string site_name;
+ r = rbd.mirror_site_name_get(rados, &site_name);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ return r;
+ }
+
+ std::vector<librbd::mirror_peer_t> mirror_peers;
+ r = rbd.mirror_peer_list(io_ctx, &mirror_peers);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string mirror_mode_desc;
+ switch (mirror_mode) {
+ case RBD_MIRROR_MODE_DISABLED:
+ mirror_mode_desc = "disabled";
+ break;
+ case RBD_MIRROR_MODE_IMAGE:
+ mirror_mode_desc = "image";
+ break;
+ case RBD_MIRROR_MODE_POOL:
+ mirror_mode_desc = "pool";
+ break;
+ default:
+ mirror_mode_desc = "unknown";
+ break;
+ }
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("mirror");
+ formatter->dump_string("mode", mirror_mode_desc);
+ } else {
+ std::cout << "Mode: " << mirror_mode_desc << std::endl;
+ }
+
+ if (mirror_mode != RBD_MIRROR_MODE_DISABLED) {
+ if (formatter != nullptr) {
+ formatter->dump_string("site_name", site_name);
+ } else {
+ std::cout << "Site Name: " << site_name << std::endl;
+ }
+
+ r = format_mirror_peers(io_ctx, formatter, mirror_peers,
+ vm[ALL_NAME].as<bool>());
+ if (r < 0) {
+ return r;
+ }
+ }
+ if (formatter != nullptr) {
+ formatter->close_section();
+ formatter->flush(std::cout);
+ }
+ return 0;
+}
+
+void get_status_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ at::add_format_options(options);
+ at::add_verbose_option(options);
+}
+
+int execute_status(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ bool verbose = vm[at::VERBOSE].as<bool>();
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+
+ std::map<librbd::mirror_image_status_state_t, int> states;
+ r = rbd.mirror_image_status_summary(io_ctx, &states);
+ if (r < 0) {
+ std::cerr << "rbd: failed to get status summary for mirrored images: "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("status");
+ }
+
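+  // Roll the per-state counts up into a single health value: any state other
+  // than replaying/stopped is at least WARNING, and any image in the error
+  // state makes the overall health ERROR.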
+ enum Health {Ok = 0, Warning = 1, Error = 2} health = Ok;
+ const char *names[] = {"OK", "WARNING", "ERROR"};
+ int total = 0;
+
+ for (auto &it : states) {
+ auto &state = it.first;
+ if (health < Warning &&
+ (state != MIRROR_IMAGE_STATUS_STATE_REPLAYING &&
+ state != MIRROR_IMAGE_STATUS_STATE_STOPPED)) {
+ health = Warning;
+ }
+ if (health < Error &&
+ state == MIRROR_IMAGE_STATUS_STATE_ERROR) {
+ health = Error;
+ }
+ total += it.second;
+ }
+
+ if (formatter != nullptr) {
+ formatter->open_object_section("summary");
+ formatter->dump_string("health", names[health]);
+ formatter->open_object_section("states");
+ for (auto &it : states) {
+ std::string state_name = utils::mirror_image_status_state(it.first);
+ formatter->dump_int(state_name.c_str(), it.second);
+ }
+ formatter->close_section(); // states
+ formatter->close_section(); // summary
+ } else {
+ std::cout << "health: " << names[health] << std::endl;
+ std::cout << "images: " << total << " total" << std::endl;
+ for (auto &it : states) {
+ std::cout << " " << it.second << " "
+ << utils::mirror_image_status_state(it.first) << std::endl;
+ }
+ }
+
+ int ret = 0;
+
+ if (verbose) {
+ if (formatter != nullptr) {
+ formatter->open_array_section("images");
+ }
+
+ std::map<std::string, std::string> instance_ids;
+ MirrorDaemonServiceInfo daemon_service_info(io_ctx);
+
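+    // Page through the image-id -> rbd-mirror instance-id map in batches of
+    // 1024, using the last image id of each batch as the cursor for the next
+    // call.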
+ std::string start_image_id;
+ while (true) {
+ std::map<std::string, std::string> ids;
+ r = rbd.mirror_image_instance_id_list(io_ctx, start_image_id, 1024, &ids);
+ if (r < 0) {
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: newer release of Ceph OSDs required to map image "
+ << "to rbd-mirror daemon instance" << std::endl;
+ } else {
+ std::cerr << "rbd: failed to get instance id list: "
+ << cpp_strerror(r) << std::endl;
+ }
+ // not fatal
+ break;
+ }
+ if (ids.empty()) {
+ break;
+ }
+ instance_ids.insert(ids.begin(), ids.end());
+ start_image_id = ids.rbegin()->first;
+ }
+
+ if (!instance_ids.empty()) {
+ daemon_service_info.init();
+ }
+
+ ImageRequestGenerator<StatusImageRequest> generator(
+ io_ctx, instance_ids, daemon_service_info, formatter);
+ ret = generator.execute();
+
+ if (formatter != nullptr) {
+ formatter->close_section(); // images
+ }
+ }
+
+ if (formatter != nullptr) {
+ formatter->close_section(); // status
+ formatter->flush(std::cout);
+ }
+
+ return ret;
+}
+
+void get_promote_arguments(po::options_description *positional,
+ po::options_description *options) {
+ options->add_options()
+ ("force", po::bool_switch(),
+ "promote even if not cleanly demoted by remote cluster");
+ at::add_pool_options(positional, options, false);
+}
+
+int execute_promote(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ std::atomic<unsigned> counter = { 0 };
+ ImageRequestGenerator<PromoteImageRequest> generator(io_ctx, &counter,
+ vm["force"].as<bool>());
+ r = generator.execute();
+
+ std::cout << "Promoted " << counter.load() << " mirrored images" << std::endl;
+ return r;
+}
+
+void get_demote_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+}
+
+int execute_demote(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ // TODO support namespaces
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ r = validate_mirroring_enabled(io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ std::atomic<unsigned> counter { 0 };
+ ImageRequestGenerator<DemoteImageRequest> generator(io_ctx, &counter);
+ r = generator.execute();
+
+ std::cout << "Demoted " << counter.load() << " mirrored images" << std::endl;
+ return r;
+}
+
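+// Shell command registrations. Illustrative invocations (pool and peer names
+// are placeholders):
+//   rbd mirror pool enable <pool> pool
+//   rbd mirror pool peer add <pool> client.remote@<remote-cluster>
+//   rbd mirror pool status <pool> --verbose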
+Shell::Action action_bootstrap_create(
+ {"mirror", "pool", "peer", "bootstrap", "create"}, {},
+ "Create a peer bootstrap token to import in a remote cluster", "",
+ &get_peer_bootstrap_create_arguments, &execute_peer_bootstrap_create);
+Shell::Action action_bootstrap_import(
+ {"mirror", "pool", "peer", "bootstrap", "import"}, {},
+ "Import a peer bootstrap token created from a remote cluster", "",
+ &get_peer_bootstrap_import_arguments, &execute_peer_bootstrap_import);
+
+Shell::Action action_add(
+ {"mirror", "pool", "peer", "add"}, {},
+ "Add a mirroring peer to a pool.", "",
+ &get_peer_add_arguments, &execute_peer_add);
+Shell::Action action_remove(
+ {"mirror", "pool", "peer", "remove"}, {},
+ "Remove a mirroring peer from a pool.", "",
+ &get_peer_remove_arguments, &execute_peer_remove);
+Shell::Action action_set(
+ {"mirror", "pool", "peer", "set"}, {},
+ "Update mirroring peer settings.", "",
+ &get_peer_set_arguments, &execute_peer_set);
+
+Shell::Action action_disable(
+ {"mirror", "pool", "disable"}, {},
+ "Disable RBD mirroring by default within a pool.", "",
+ &get_disable_arguments, &execute_disable);
+Shell::Action action_enable(
+ {"mirror", "pool", "enable"}, {},
+ "Enable RBD mirroring by default within a pool.", "",
+ &get_enable_arguments, &execute_enable);
+Shell::Action action_info(
+ {"mirror", "pool", "info"}, {},
+ "Show information about the pool mirroring configuration.", {},
+ &get_info_arguments, &execute_info);
+Shell::Action action_status(
+ {"mirror", "pool", "status"}, {},
+ "Show status for all mirrored images in the pool.", {},
+ &get_status_arguments, &execute_status);
+Shell::Action action_promote(
+ {"mirror", "pool", "promote"}, {},
+ "Promote all non-primary images in the pool.", {},
+ &get_promote_arguments, &execute_promote);
+Shell::Action action_demote(
+ {"mirror", "pool", "demote"}, {},
+ "Demote all primary images in the pool.", {},
+ &get_demote_arguments, &execute_demote);
+
+} // namespace mirror_pool
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Namespace.cc b/src/tools/rbd/action/Namespace.cc
new file mode 100644
index 00000000..746ab40c
--- /dev/null
+++ b/src/tools/rbd/action/Namespace.cc
@@ -0,0 +1,191 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <algorithm>
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace ns {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+}
+
+int execute_create(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ if (namespace_name.empty()) {
+ std::cerr << "rbd: namespace name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.namespace_create(io_ctx, namespace_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: failed to created namespace: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ if (namespace_name.empty()) {
+ std::cerr << "rbd: namespace name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.namespace_remove(io_ctx, namespace_name.c_str());
+ if (r == -EBUSY) {
+ std::cerr << "rbd: namespace contains images which must be deleted first."
+ << std::endl;
+ return r;
+ } else if (r == -ENOENT) {
+ std::cerr << "rbd: namespace does not exist." << std::endl;
+ return r;
+ } else if (r < 0) {
+ std::cerr << "rbd: failed to remove namespace: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, true, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ std::vector<std::string> names;
+ r = rbd.namespace_list(io_ctx, &names);
+ if (r < 0 && r != -ENOENT) {
+ std::cerr << "rbd: failed to list namespaces: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ std::sort(names.begin(), names.end());
+
+ TextTable tbl;
+ if (formatter) {
+ formatter->open_array_section("namespaces");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (auto& name : names) {
+ if (formatter) {
+ formatter->open_object_section("namespace");
+ formatter->dump_string("name", name);
+ formatter->close_section();
+ } else {
+ tbl << name << TextTable::endrow;
+ }
+ }
+
+ if (formatter) {
+ formatter->close_section();
+ formatter->flush(std::cout);
+ } else if (!names.empty()) {
+ std::cout << tbl;
+ }
+
+ return 0;
+}
+
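+// Illustrative invocations (pool and namespace names are placeholders):
+//   rbd namespace create <pool>/<namespace>
+//   rbd namespace ls <pool>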
+Shell::Action action_create(
+ {"namespace", "create"}, {},
+ "Create an RBD image namespace.", "",
+ &get_create_arguments, &execute_create);
+
+Shell::Action action_remove(
+ {"namespace", "remove"}, {"namespace", "rm"},
+ "Remove an RBD image namespace.", "",
+ &get_remove_arguments, &execute_remove);
+
+Shell::Action action_list(
+ {"namespace", "list"}, {"namespace", "ls"}, "List RBD image namespaces.", "",
+ &get_list_arguments, &execute_list);
+
+} // namespace ns
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Nbd.cc b/src/tools/rbd/action/Nbd.cc
new file mode 100644
index 00000000..5c55adea
--- /dev/null
+++ b/src/tools/rbd/action/Nbd.cc
@@ -0,0 +1,286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/SubProcess.h"
+#include <iostream>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace nbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int call_nbd_cmd(const po::variables_map &vm,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &ceph_global_init_args) {
+ char exe_path[PATH_MAX];
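+  // Derive the rbd-nbd executable path from /proc/self/exe (e.g. ".../rbd"
+  // becomes ".../rbd-nbd"); if the link cannot be read, fall back to the bare
+  // "rbd-nbd" name.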
+ ssize_t exe_path_bytes = readlink("/proc/self/exe", exe_path,
+ sizeof(exe_path) - 1);
+ if (exe_path_bytes < 0) {
+ strcpy(exe_path, "rbd-nbd");
+ } else {
+ if (snprintf(exe_path + exe_path_bytes,
+ sizeof(exe_path) - exe_path_bytes,
+ "-nbd") < 0) {
+ return -EOVERFLOW;
+ }
+ }
+
+ SubProcess process(exe_path, SubProcess::KEEP, SubProcess::KEEP, SubProcess::KEEP);
+
+ for (auto &arg : ceph_global_init_args) {
+ process.add_cmd_arg(arg.c_str());
+ }
+
+ for (auto &arg : args) {
+ process.add_cmd_arg(arg.c_str());
+ }
+
+ if (process.spawn()) {
+ std::cerr << "rbd: failed to run rbd-nbd: " << process.err() << std::endl;
+ return -EINVAL;
+ } else if (process.join()) {
+ std::cerr << "rbd: rbd-nbd failed with error: " << process.err() << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int get_image_or_snap_spec(const po::variables_map &vm, std::string *spec) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string nspace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &nspace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ spec->append(pool_name);
+ spec->append("/");
+ if (!nspace_name.empty()) {
+ spec->append(nspace_name);
+ spec->append("/");
+ }
+ spec->append(image_name);
+ if (!snap_name.empty()) {
+ spec->append("@");
+ spec->append(snap_name);
+ }
+
+ return 0;
+}
+
+int parse_options(const std::vector<std::string> &options,
+ std::vector<std::string> *args) {
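+  // Each --options argument may carry a comma-separated list (for example
+  // "nbds_max=16,timeout=60"); every entry is expanded into its own
+  // "--<entry>" argument for rbd-nbd.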
+ for (auto &opts : options) {
+ std::vector<std::string> args_;
+ boost::split(args_, opts, boost::is_any_of(","));
+ for (auto &o : args_) {
+ args->push_back("--" + o);
+ }
+ }
+
+ return 0;
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__)
+ std::cerr << "rbd: nbd device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+ std::vector<std::string> args;
+
+ args.push_back("list-mapped");
+
+ if (vm.count("format")) {
+ args.push_back("--format");
+ args.push_back(vm["format"].as<at::Format>().value);
+ }
+ if (vm["pretty-format"].as<bool>()) {
+ args.push_back("--pretty-format");
+ }
+
+ return call_nbd_cmd(vm, args, ceph_global_init_args);
+}
+
+int execute_map(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__)
+ std::cerr << "rbd: nbd device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+ std::vector<std::string> args;
+
+ args.push_back("map");
+ std::string img;
+ int r = get_image_or_snap_spec(vm, &img);
+ if (r < 0) {
+ return r;
+ }
+ args.push_back(img);
+
+ if (vm["read-only"].as<bool>()) {
+ args.push_back("--read-only");
+ }
+
+ if (vm["exclusive"].as<bool>()) {
+ args.push_back("--exclusive");
+ }
+
+ if (vm.count("options")) {
+ r = parse_options(vm["options"].as<std::vector<std::string>>(), &args);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return call_nbd_cmd(vm, args, ceph_global_init_args);
+}
+
+int execute_unmap(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+#if defined(__FreeBSD__)
+ std::cerr << "rbd: nbd device is not supported" << std::endl;
+ return -EOPNOTSUPP;
+#endif
+ std::string device_name = utils::get_positional_argument(vm, 0);
+ if (!boost::starts_with(device_name, "/dev/")) {
+ device_name.clear();
+ }
+
+ std::string image_name;
+ if (device_name.empty()) {
+ int r = get_image_or_snap_spec(vm, &image_name);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (device_name.empty() && image_name.empty()) {
+ std::cerr << "rbd: unmap requires either image name or device path"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<std::string> args;
+
+ args.push_back("unmap");
+ args.push_back(device_name.empty() ? image_name : device_name);
+
+ if (vm.count("options")) {
+ int r = parse_options(vm["options"].as<std::vector<std::string>>(), &args);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return call_nbd_cmd(vm, args, ceph_global_init_args);
+}
+
+void get_list_arguments_deprecated(po::options_description *positional,
+ po::options_description *options) {
+ at::add_format_options(options);
+}
+
+int execute_list_deprecated(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args) {
+ std::cerr << "rbd: 'nbd list' command is deprecated, "
+ << "use 'device list -t nbd' instead" << std::endl;
+ return execute_list(vm, ceph_global_args);
+}
+
+void get_map_arguments_deprecated(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ ("read-only", po::bool_switch(), "map read-only")
+ ("exclusive", po::bool_switch(), "forbid writes by other clients")
+ ("device", po::value<std::string>(), "specify nbd device")
+ ("nbds_max", po::value<std::string>(), "override module param nbds_max")
+ ("max_part", po::value<std::string>(), "override module param max_part")
+ ("timeout", po::value<std::string>(), "set nbd request timeout (seconds)");
+}
+
+int execute_map_deprecated(const po::variables_map &vm_deprecated,
+ const std::vector<std::string> &ceph_global_args) {
+ std::cerr << "rbd: 'nbd map' command is deprecated, "
+ << "use 'device map -t nbd' instead" << std::endl;
+
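+  // Translate the legacy flags (--device, --nbds_max, --max_part, --timeout)
+  // into the new-style "--options" values and re-dispatch to execute_map().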
+ po::options_description options;
+ options.add_options()
+ ("options,o", po::value<std::vector<std::string>>()
+ ->default_value(std::vector<std::string>(), ""), "");
+
+ po::variables_map vm = vm_deprecated;
+ po::store(po::command_line_parser({}).options(options).run(), vm);
+
+ std::vector<std::string> opts;
+ if (vm_deprecated.count("device")) {
+ opts.push_back("device=" + vm_deprecated["device"].as<std::string>());
+ }
+ if (vm_deprecated.count("nbds_max")) {
+ opts.push_back("nbds_max=" + vm_deprecated["nbds_max"].as<std::string>());
+ }
+ if (vm_deprecated.count("max_part")) {
+ opts.push_back("max_part=" + vm_deprecated["max_part"].as<std::string>());
+ }
+ if (vm_deprecated.count("timeout")) {
+ opts.push_back("timeout=" + vm_deprecated["timeout"].as<std::string>());
+ }
+
+ vm.at("options").value() = boost::any(opts);
+
+ return execute_map(vm, ceph_global_args);
+}
+
+void get_unmap_arguments_deprecated(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ ("image-or-snap-or-device-spec",
+ "image, snapshot, or device specification\n"
+ "[<pool-name>/]<image-name>[@<snapshot-name>] or <device-path>");
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_unmap_deprecated(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_args) {
+ std::cerr << "rbd: 'nbd unmap' command is deprecated, "
+ << "use 'device unmap -t nbd' instead" << std::endl;
+ return execute_unmap(vm, ceph_global_args);
+}
+
+Shell::SwitchArguments switched_arguments({"read-only", "exclusive"});
+
+Shell::Action action_show_deprecated(
+ {"nbd", "list"}, {"nbd", "ls"}, "List the nbd devices already used.", "",
+ &get_list_arguments_deprecated, &execute_list_deprecated, false);
+
+Shell::Action action_map_deprecated(
+ {"nbd", "map"}, {}, "Map image to a nbd device.", "",
+ &get_map_arguments_deprecated, &execute_map_deprecated, false);
+
+Shell::Action action_unmap_deprecated(
+ {"nbd", "unmap"}, {}, "Unmap a nbd device.", "",
+ &get_unmap_arguments_deprecated, &execute_unmap_deprecated, false);
+
+} // namespace nbd
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ObjectMap.cc b/src/tools/rbd/action/ObjectMap.cc
new file mode 100644
index 00000000..40ee2d47
--- /dev/null
+++ b/src/tools/rbd/action/ObjectMap.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace object_map {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_object_map_rebuild(librbd::Image &image, bool no_progress)
+{
+ utils::ProgressContext pc("Object Map Rebuild", no_progress);
+ int r = image.rebuild_object_map(pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_rebuild_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute_rebuild(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_object_map_rebuild(image, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: rebuilding object map failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+static int do_object_map_check(librbd::Image &image, bool no_progress)
+{
+ utils::ProgressContext pc("Object Map Check", no_progress);
+ int r = image.check_object_map(pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_check_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_or_snap_spec_options(positional, options,
+ at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute_check(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_PERMITTED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_object_map_check(image, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: checking object map failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
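+// Illustrative invocations (image and snapshot names are placeholders):
+//   rbd object-map rebuild <pool>/<image>
+//   rbd object-map check <pool>/<image>@<snap>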
+Shell::Action action_rebuild(
+ {"object-map", "rebuild"}, {}, "Rebuild an invalid object map.", "",
+ &get_rebuild_arguments, &execute_rebuild);
+Shell::Action action_check(
+ {"object-map", "check"}, {}, "Verify the object map is correct.", "",
+ &get_check_arguments, &execute_check);
+
+} // namespace object_map
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Perf.cc b/src/tools/rbd/action/Perf.cc
new file mode 100644
index 00000000..8f76e85f
--- /dev/null
+++ b/src/tools/rbd/action/Perf.cc
@@ -0,0 +1,699 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "global/global_context.h"
+#include <ncurses.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <iostream>
+#include <vector>
+#include <boost/algorithm/string.hpp>
+#include <boost/assign.hpp>
+#include <boost/bimap.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace perf {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+enum class StatDescriptor {
+ WRITE_OPS = 0,
+ READ_OPS,
+ WRITE_BYTES,
+ READ_BYTES,
+ WRITE_LATENCY,
+ READ_LATENCY
+};
+
+typedef boost::bimap<StatDescriptor, std::string> StatDescriptors;
+
+static const StatDescriptors STAT_DESCRIPTORS =
+ boost::assign::list_of<StatDescriptors::relation>
+ (StatDescriptor::WRITE_OPS, "write_ops")
+ (StatDescriptor::READ_OPS, "read_ops")
+ (StatDescriptor::WRITE_BYTES, "write_bytes")
+ (StatDescriptor::READ_BYTES, "read_bytes")
+ (StatDescriptor::WRITE_LATENCY, "write_latency")
+ (StatDescriptor::READ_LATENCY, "read_latency");
+
+std::ostream& operator<<(std::ostream& os, const StatDescriptor& val) {
+ auto it = STAT_DESCRIPTORS.left.find(val);
+ if (it == STAT_DESCRIPTORS.left.end()) {
+ os << "unknown (" << static_cast<int>(val) << ")";
+ } else {
+ os << it->second;
+ }
+ return os;
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ StatDescriptor *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ std::string s = po::validators::get_single_string(values);
+ boost::replace_all(s, "_", " ");
+ boost::replace_all(s, "-", "_");
+
+ auto it = STAT_DESCRIPTORS.right.find(s);
+ if (it == STAT_DESCRIPTORS.right.end()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(it->second);
+}
+
+struct ImageStat {
+ ImageStat(const std::string& pool_name, const std::string& pool_namespace,
+ const std::string& image_name)
+ : pool_name(pool_name), pool_namespace(pool_namespace),
+ image_name(image_name) {
+ stats.resize(STAT_DESCRIPTORS.size());
+ }
+
+ std::string pool_name;
+ std::string pool_namespace;
+ std::string image_name;
+ std::vector<double> stats;
+};
+
+typedef std::vector<ImageStat> ImageStats;
+
+typedef std::pair<std::string, std::string> SpecPair;
+
+std::string format_pool_spec(const std::string& pool,
+ const std::string& pool_namespace) {
+ std::string pool_spec{pool};
+ if (!pool_namespace.empty()) {
+ pool_spec += "/" + pool_namespace;
+ }
+ return pool_spec;
+}
+
+int query_iostats(librados::Rados& rados, const std::string& pool_spec,
+ StatDescriptor sort_by, ImageStats* image_stats,
+ std::ostream& err_os) {
+ auto sort_by_str = STAT_DESCRIPTORS.left.find(sort_by)->second;
+
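+  // issue the "rbd perf image stats" mgr command (served by the rbd_support
+  // module) and parse its JSON reply below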
+ std::string cmd = R"(
+ {
+ "prefix": "rbd perf image stats",
+ "pool_spec": ")" + pool_spec + R"(",
+ "sort_by": ")" + sort_by_str + R"(",
+ "format": "json"
+ }")";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+ std::string outs;
+ int r = rados.mgr_command(cmd, in_bl, &out_bl, &outs);
+ if (r == -EOPNOTSUPP) {
+ err_os << "rbd: 'rbd_support' mgr module is not enabled."
+ << std::endl << std::endl
+ << "Use 'ceph mgr module enable rbd_support' to enable."
+ << std::endl;
+ return r;
+ } else if (r < 0) {
+ err_os << "rbd: mgr command failed: " << cpp_strerror(r);
+ if (!outs.empty()) {
+ err_os << ": " << outs;
+ }
+ err_os << std::endl;
+ return r;
+ }
+
+ json_spirit::mValue json_root;
+ if (!json_spirit::read(out_bl.to_str(), json_root)) {
+ err_os << "rbd: error parsing perf stats" << std::endl;
+ return -EINVAL;
+ }
+
+ image_stats->clear();
+ try {
+ auto& root = json_root.get_obj();
+
+ // map JSON stat descriptor order to our internal order
+ std::map<uint32_t, uint32_t> json_to_internal_stats;
+ auto& json_stat_descriptors = root["stat_descriptors"].get_array();
+ for (size_t idx = 0; idx < json_stat_descriptors.size(); ++idx) {
+ auto it = STAT_DESCRIPTORS.right.find(
+ json_stat_descriptors[idx].get_str());
+ if (it == STAT_DESCRIPTORS.right.end()) {
+ continue;
+ }
+ json_to_internal_stats[idx] = static_cast<uint32_t>(it->second);
+ }
+
+ // cache a mapping from pool descriptors back to pool-specs
+ std::map<std::string, SpecPair> json_to_internal_pools;
+ auto& pool_descriptors = root["pool_descriptors"].get_obj();
+ for (auto& pool : pool_descriptors) {
+ auto& pool_spec = pool.second.get_str();
+ auto pos = pool_spec.rfind("/");
+
+ SpecPair pair{pool_spec.substr(0, pos), ""};
+ if (pos != std::string::npos) {
+ pair.second = pool_spec.substr(pos + 1);
+ }
+
+ json_to_internal_pools[pool.first] = pair;
+ }
+
+ auto& stats = root["stats"].get_array();
+ for (auto& stat : stats) {
+ auto& stat_obj = stat.get_obj();
+ if (!stat_obj.empty()) {
+ auto& image_spec = stat_obj.begin()->first;
+
+ auto pos = image_spec.find("/");
+ SpecPair pair{image_spec.substr(0, pos), ""};
+ if (pos != std::string::npos) {
+ pair.second = image_spec.substr(pos + 1);
+ }
+
+ const auto pool_it = json_to_internal_pools.find(pair.first);
+ if (pool_it == json_to_internal_pools.end()) {
+ continue;
+ }
+
+ image_stats->emplace_back(
+ pool_it->second.first, pool_it->second.second, pair.second);
+
+ auto& image_stat = image_stats->back();
+ auto& data = stat_obj.begin()->second.get_array();
+ for (auto& indexes : json_to_internal_stats) {
+ image_stat.stats[indexes.second] = data[indexes.first].get_real();
+ }
+ }
+ }
+ } catch (std::runtime_error &e) {
+ err_os << "rbd: error parsing perf stats: " << e.what() << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
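+// pretty-print a single counter: ops and bytes as per-second rates, latencies
+// (reported in nanoseconds) scaled to the most readable unit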
+void format_stat(StatDescriptor stat_descriptor, double stat,
+ std::ostream& os) {
+ switch (stat_descriptor) {
+ case StatDescriptor::WRITE_OPS:
+ case StatDescriptor::READ_OPS:
+ os << si_u_t(stat) << "/s";
+ break;
+ case StatDescriptor::WRITE_BYTES:
+ case StatDescriptor::READ_BYTES:
+ os << byte_u_t(stat) << "/s";
+ break;
+ case StatDescriptor::WRITE_LATENCY:
+ case StatDescriptor::READ_LATENCY:
+ os << std::fixed << std::setprecision(2);
+ if (stat >= 1000000000) {
+ os << (stat / 1000000000) << " s";
+ } else if (stat >= 1000000) {
+ os << (stat / 1000000) << " ms";
+ } else if (stat >= 1000) {
+ os << (stat / 1000) << " us";
+ } else {
+ os << stat << " ns";
+ }
+ break;
+ default:
+ ceph_assert(false);
+ break;
+ }
+}
+
+} // anonymous namespace
+
+namespace iostat {
+
+struct Iterations {};
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Iterations *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ auto& s = po::validators::get_single_string(values);
+
+ try {
+ auto iterations = boost::lexical_cast<uint32_t>(s);
+ if (iterations > 0) {
+ v = boost::any(iterations);
+ return;
+ }
+ } catch (const boost::bad_lexical_cast &) {
+ }
+ throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+void format(const ImageStats& image_stats, Formatter* f, bool global_search) {
+ TextTable tbl;
+ if (f) {
+ f->open_array_section("images");
+ } else {
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ for (auto& stat : STAT_DESCRIPTORS.left) {
+ std::string title;
+ switch (stat.first) {
+ case StatDescriptor::WRITE_OPS:
+ title = "WR ";
+ break;
+ case StatDescriptor::READ_OPS:
+ title = "RD ";
+ break;
+ case StatDescriptor::WRITE_BYTES:
+ title = "WR_BYTES ";
+ break;
+ case StatDescriptor::READ_BYTES:
+ title = "RD_BYTES ";
+ break;
+ case StatDescriptor::WRITE_LATENCY:
+ title = "WR_LAT ";
+ break;
+ case StatDescriptor::READ_LATENCY:
+ title = "RD_LAT ";
+ break;
+ default:
+ ceph_assert(false);
+ break;
+ }
+ tbl.define_column(title, TextTable::RIGHT, TextTable::RIGHT);
+ }
+ }
+
+ for (auto& image_stat : image_stats) {
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("pool", image_stat.pool_name);
+ f->dump_string("pool_namespace", image_stat.pool_namespace);
+ f->dump_string("image", image_stat.image_name);
+ for (auto& pair : STAT_DESCRIPTORS.left) {
+ f->dump_float(pair.second.c_str(),
+ image_stat.stats[static_cast<size_t>(pair.first)]);
+ }
+ f->close_section();
+ } else {
+ std::string name;
+ if (global_search) {
+ name += image_stat.pool_name + "/";
+ if (!image_stat.pool_namespace.empty()) {
+ name += image_stat.pool_namespace + "/";
+ }
+ }
+ name += image_stat.image_name;
+
+ tbl << name;
+ for (auto& pair : STAT_DESCRIPTORS.left) {
+ std::stringstream str;
+ format_stat(pair.first,
+ image_stat.stats[static_cast<size_t>(pair.first)], str);
+ str << ' ';
+ tbl << str.str();
+ }
+ tbl << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else {
+ std::cout << tbl << std::endl;
+ }
+}
+
+} // namespace iostat
+
+namespace iotop {
+
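+// ncurses-based viewer for "rbd perf image iotop": periodically re-queries the
+// mgr, '<'/'>' (or arrow keys) change the sort column, 'q' quits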
+class MainWindow {
+public:
+ MainWindow(librados::Rados& rados, const std::string& pool_spec)
+ : m_rados(rados), m_pool_spec(pool_spec) {
+ initscr();
+ curs_set(0);
+ cbreak();
+ noecho();
+ keypad(stdscr, TRUE);
+ nodelay(stdscr, TRUE);
+
+ init_columns();
+ }
+
+ int run() {
+ redraw();
+
+ int r = 0;
+ std::stringstream err_str;
+ while (true) {
+ r = query_iostats(m_rados, m_pool_spec, m_sort_by, &m_image_stats,
+ err_str);
+ if (r < 0) {
+ break;
+ }
+
+ redraw();
+ wait_for_key_or_delay();
+
+ int ch = getch();
+ if (ch == 'q' || ch == 'Q') {
+ break;
+ } else if (ch == '<' || ch == KEY_LEFT) {
+ auto it = STAT_DESCRIPTORS.left.find(m_sort_by);
+ if (it != STAT_DESCRIPTORS.left.begin()) {
+ m_sort_by = (--it)->first;
+ }
+ } else if (ch == '>' || ch == KEY_RIGHT) {
+ auto it = STAT_DESCRIPTORS.left.find(m_sort_by);
+ if (it != STAT_DESCRIPTORS.left.end() &&
+ ++it != STAT_DESCRIPTORS.left.end()) {
+ m_sort_by = it->first;
+ }
+ }
+ }
+
+ endwin();
+
+ if (r < 0) {
+ std::cerr << err_str.str() << std::endl;
+ }
+ return r;
+ }
+
+private:
+ static const size_t STAT_COLUMN_WIDTH = 12;
+
+ librados::Rados& m_rados;
+ std::string m_pool_spec;
+
+ ImageStats m_image_stats;
+ StatDescriptor m_sort_by = StatDescriptor::WRITE_OPS;
+
+ bool m_pending_win_opened = false;
+ WINDOW* m_pending_win = nullptr;
+
+ int m_height = 1;
+ int m_width = 1;
+
+ std::map<StatDescriptor, std::string> m_columns;
+
+ void init_columns() {
+ m_columns.clear();
+ for (auto& pair : STAT_DESCRIPTORS.left) {
+ std::string title;
+ switch (pair.first) {
+ case StatDescriptor::WRITE_OPS:
+ title = "WRITES OPS";
+ break;
+ case StatDescriptor::READ_OPS:
+ title = "READS OPS";
+ break;
+ case StatDescriptor::WRITE_BYTES:
+ title = "WRITE BYTES";
+ break;
+ case StatDescriptor::READ_BYTES:
+ title = "READ BYTES";
+ break;
+ case StatDescriptor::WRITE_LATENCY:
+ title = "WRITE LAT";
+ break;
+ case StatDescriptor::READ_LATENCY:
+ title = "READ LAT";
+ break;
+ default:
+ ceph_assert(false);
+ break;
+ }
+      m_columns[pair.first] = title;
+ }
+ }
+
+ void redraw() {
+ getmaxyx(stdscr, m_height, m_width);
+
+ redraw_main_window();
+ redraw_pending_window();
+
+ doupdate();
+ }
+
+ void redraw_main_window() {
+ werase(stdscr);
+ mvhline(0, 0, ' ' | A_REVERSE, m_width);
+
+ // print header for all metrics
+ int remaining_cols = m_width;
+ std::stringstream str;
+ for (auto& pair : m_columns) {
+ int attr = A_REVERSE;
+ std::string title;
+ if (pair.first == m_sort_by) {
+ title += '>';
+ attr |= A_BOLD;
+ } else {
+ title += ' ';
+ }
+ title += pair.second;
+
+ str.str("");
+ str << std::right << std::setfill(' ')
+ << std::setw(STAT_COLUMN_WIDTH)
+ << title << ' ';
+
+ attrset(attr);
+ addstr(str.str().c_str());
+ remaining_cols -= title.size();
+ }
+
+ attrset(A_REVERSE);
+ addstr("IMAGE");
+ attrset(A_NORMAL);
+
+ // print each image (one per line)
+ int row = 1;
+ int remaining_lines = m_height - 1;
+ for (auto& image_stat : m_image_stats) {
+ if (remaining_lines <= 0) {
+ break;
+ }
+ --remaining_lines;
+
+ move(row++, 0);
+ for (auto& pair : m_columns) {
+ str.str("");
+ format_stat(pair.first,
+ image_stat.stats[static_cast<size_t>(pair.first)], str);
+ auto value = str.str().substr(0, STAT_COLUMN_WIDTH);
+
+ str.str("");
+ str << std::right << std::setfill(' ')
+ << std::setw(STAT_COLUMN_WIDTH)
+ << value << ' ';
+ addstr(str.str().c_str());
+ }
+
+ std::string image;
+ if (m_pool_spec.empty()) {
+ image = format_pool_spec(image_stat.pool_name,
+ image_stat.pool_namespace) + "/";
+ }
+ image += image_stat.image_name;
+ addstr(image.substr(0, remaining_cols).c_str());
+ }
+
+ wnoutrefresh(stdscr);
+ }
+
+ void redraw_pending_window() {
+ // draw a "please by patient" window while waiting
+ const char* msg = "Waiting for initial stats";
+ int height = 5;
+    int width = strlen(msg) + 4;
+ int starty = (m_height - height) / 2;
+ int startx = (m_width - width) / 2;
+
+ if (m_image_stats.empty() && !m_pending_win_opened) {
+ m_pending_win_opened = true;
+ m_pending_win = newwin(height, width, starty, startx);
+ }
+
+ if (m_pending_win != nullptr) {
+ if (m_image_stats.empty()) {
+        box(m_pending_win, 0, 0);
+ mvwaddstr(m_pending_win, 2, 2, msg);
+ wnoutrefresh(m_pending_win);
+ } else {
+ delwin(m_pending_win);
+ m_pending_win = nullptr;
+ }
+ }
+ }
+
+ void wait_for_key_or_delay() {
+ fd_set fds;
+ FD_ZERO(&fds);
+ FD_SET(STDIN_FILENO, &fds);
+
+    // no point in refreshing faster than the stats period
+ struct timeval tval;
+ tval.tv_sec = std::min<uint32_t>(
+ 10, g_conf().get_val<int64_t>("mgr_stats_period"));
+ tval.tv_usec = 0;
+
+ select(STDIN_FILENO + 1, &fds, NULL, NULL, &tval);
+ }
+};
+
+} // namespace iotop
+
+
+void get_arguments_iostat(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ options->add_options()
+ ("iterations", po::value<iostat::Iterations>(),
+ "iterations of metric collection [> 0]")
+ ("sort-by", po::value<StatDescriptor>()->default_value(StatDescriptor::WRITE_OPS),
+ "sort-by IO metric "
+ "(write-ops, read-ops, write-bytes, read-bytes, write-latency, read-latency) "
+ "[default: write-ops]");
+ at::add_format_options(options);
+}
+
+int execute_iostat(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool;
+ std::string pool_namespace;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, false, false, &pool,
+ &pool_namespace, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ uint32_t iterations = 0;
+ if (vm.count("iterations")) {
+ iterations = vm["iterations"].as<uint32_t>();
+ }
+ auto sort_by = vm["sort-by"].as<StatDescriptor>();
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ auto f = formatter.get();
+ if (iterations > 1 && f != nullptr) {
+ std::cerr << "rbd: specifing iterations is not valid with formatted output"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ r = rados.wait_for_latest_osdmap();
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve OSD map" << std::endl;
+ return r;
+ }
+
+ std::string pool_spec = format_pool_spec(pool, pool_namespace);
+
+  // no point in refreshing faster than the stats period
+ auto delay = std::min<uint32_t>(10, g_conf().get_val<int64_t>("mgr_stats_period"));
+
+ ImageStats image_stats;
+ uint32_t count = 0;
+ bool printed_notice = false;
+ while (count++ < iterations || iterations == 0) {
+ r = query_iostats(rados, pool_spec, sort_by, &image_stats, std::cerr);
+ if (r < 0) {
+ return r;
+ }
+
+ if (count == 1 && image_stats.empty()) {
+ count = 0;
+ if (!printed_notice) {
+ std::cerr << "rbd: waiting for initial image stats"
+                  << std::endl << std::endl;
+ printed_notice = true;
+ }
+ } else {
+ iostat::format(image_stats, f, pool_spec.empty());
+ if (f != nullptr) {
+ break;
+ }
+ }
+
+ sleep(delay);
+ }
+
+ return 0;
+}
+
+void get_arguments_iotop(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+}
+
+int execute_iotop(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool;
+ std::string pool_namespace;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, false, false, &pool,
+ &pool_namespace, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ r = utils::init_rados(&rados);
+ if (r < 0) {
+ return r;
+ }
+
+ r = rados.wait_for_latest_osdmap();
+ if (r < 0) {
+ std::cerr << "rbd: failed to retrieve OSD map" << std::endl;
+ return r;
+ }
+
+ iotop::MainWindow mainWindow(rados, format_pool_spec(pool, pool_namespace));
+ r = mainWindow.run();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+Shell::Action stat_action(
+ {"perf", "image", "iostat"}, {}, "Display image IO statistics.", "",
+ &get_arguments_iostat, &execute_iostat);
+Shell::Action top_action(
+ {"perf", "image", "iotop"}, {}, "Display a top-like IO monitor.", "",
+ &get_arguments_iotop, &execute_iotop);
+
+} // namespace perf
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Pool.cc b/src/tools/rbd/action/Pool.cc
new file mode 100644
index 00000000..f1718eb1
--- /dev/null
+++ b/src/tools/rbd/action/Pool.cc
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace pool {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments_init(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, false);
+ options->add_options()
+ ("force", po::bool_switch(),
+ "force initialize pool for RBD use if registered by another application");
+}
+
+int execute_init(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ nullptr, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, "", &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.pool_init(io_ctx, vm["force"].as<bool>());
+ if (r == -EOPNOTSUPP) {
+ std::cerr << "rbd: luminous or later release required." << std::endl;
+ } else if (r == -EPERM) {
+ std::cerr << "rbd: pool already registered to a different application."
+ << std::endl;
+ } else if (r < 0) {
+ std::cerr << "rbd: error registered application: " << cpp_strerror(r)
+ << std::endl;
+ }
+
+  return r;
+}
+
+void get_arguments_stats(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ at::add_format_options(options);
+}
+
+int execute_stats(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ uint64_t image_count;
+ uint64_t provisioned_bytes;
+ uint64_t snap_count;
+ uint64_t trash_count;
+ uint64_t trash_provisioned_bytes;
+ uint64_t trash_snap_count;
+
+ librbd::PoolStats pool_stats;
+ pool_stats.add(RBD_POOL_STAT_OPTION_IMAGES, &image_count);
+ pool_stats.add(RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES,
+ &provisioned_bytes);
+ pool_stats.add(RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, &snap_count);
+ pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_IMAGES, &trash_count);
+ pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES,
+ &trash_provisioned_bytes);
+ pool_stats.add(RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS, &trash_snap_count);
+
+ r = rbd.pool_stats_get(io_ctx, &pool_stats);
+ if (r < 0) {
+ std::cerr << "rbd: failed to query pool stats: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ if (formatter) {
+ formatter->open_object_section("stats");
+ formatter->open_object_section("images");
+ formatter->dump_unsigned("count", image_count);
+ formatter->dump_unsigned("provisioned_bytes", provisioned_bytes);
+ formatter->dump_unsigned("snap_count", snap_count);
+ formatter->close_section();
+ formatter->open_object_section("trash");
+ formatter->dump_unsigned("count", trash_count);
+ formatter->dump_unsigned("provisioned_bytes", trash_provisioned_bytes);
+ formatter->dump_unsigned("snap_count", trash_snap_count);
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(std::cout);
+ } else {
+ std::cout << "Total Images: " << image_count;
+ if (trash_count > 0) {
+ std::cout << " (" << trash_count << " in trash)";
+ }
+ std::cout << std::endl;
+
+ std::cout << "Total Snapshots: " << snap_count;
+ if (trash_count > 0) {
+ std::cout << " (" << trash_snap_count << " in trash)";
+ }
+ std::cout << std::endl;
+
+ std::cout << "Provisioned Size: " << byte_u_t(provisioned_bytes);
+ if (trash_count > 0) {
+ std::cout << " (" << byte_u_t(trash_provisioned_bytes) << " in trash)";
+ }
+ std::cout << std::endl;
+ }
+
+ return 0;
+}
+
+Shell::Action init_action(
+ {"pool", "init"}, {}, "Initialize pool for use by RBD.", "",
+ &get_arguments_init, &execute_init);
+Shell::Action stat_action(
+ {"pool", "stats"}, {}, "Display pool statistics.",
+ "Note: legacy v1 images are not included in stats",
+ &get_arguments_stats, &execute_stats);
+
+} // namespace pool
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Remove.cc b/src/tools/rbd/action/Remove.cc
new file mode 100644
index 00000000..337d42be
--- /dev/null
+++ b/src/tools/rbd/action/Remove.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace remove {
+
+namespace {
+
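+// trash-namespace snapshots are removed automatically, so they should not be
+// counted as snapshots blocking image removal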
+bool is_auto_delete_snapshot(librbd::Image* image,
+ const librbd::snap_info_t &snap_info) {
+ librbd::snap_namespace_type_t namespace_type;
+ int r = image->snap_get_namespace_type(snap_info.id, &namespace_type);
+ if (r < 0) {
+ return false;
+ }
+
+ switch (namespace_type) {
+ case RBD_SNAP_NAMESPACE_TYPE_TRASH:
+ return true;
+ default:
+ return false;
+ }
+}
+
+} // anonymous namespace
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_delete(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+ const char *imgname, bool no_progress)
+{
+ utils::ProgressContext pc("Removing image", no_progress);
+ int r = rbd.remove_with_progress(io_ctx, imgname, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ io_ctx.set_osdmap_full_try();
+
+ librbd::RBD rbd;
+ r = do_delete(rbd, io_ctx, image_name.c_str(),
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ if (r == -ENOTEMPTY) {
+ librbd::Image image;
+ std::vector<librbd::snap_info_t> snaps;
+ int image_r = utils::open_image(io_ctx, image_name, true, &image);
+ if (image_r >= 0) {
+ image_r = image.snap_list(snaps);
+ }
+ if (image_r >= 0) {
+ snaps.erase(std::remove_if(snaps.begin(), snaps.end(),
+ [&image](const librbd::snap_info_t& snap) {
+ return is_auto_delete_snapshot(&image,
+ snap);
+ }),
+ snaps.end());
+ }
+
+ if (!snaps.empty()) {
+ std::cerr << "rbd: image has snapshots - these must be deleted"
+ << " with 'rbd snap purge' before the image can be removed."
+ << std::endl;
+ } else {
+ std::cerr << "rbd: image has snapshots with linked clones - these must "
+ << "be deleted or flattened before the image can be removed."
+ << std::endl;
+ }
+ } else if (r == -EBUSY) {
+ std::cerr << "rbd: error: image still has watchers"
+ << std::endl
+ << "This means the image is still open or the client using "
+ << "it crashed. Try again after closing/unmapping it or "
+ << "waiting 30s for the crashed client to timeout."
+ << std::endl;
+ } else if (r == -EMLINK) {
+ librbd::Image image;
+ int image_r = utils::open_image(io_ctx, image_name, true, &image);
+ librbd::group_info_t group_info;
+ if (image_r == 0) {
+ image_r = image.get_group(&group_info, sizeof(group_info));
+ }
+ if (image_r == 0) {
+ std::string pool_name = "";
+ librados::Rados rados(io_ctx);
+ librados::IoCtx pool_io_ctx;
+ image_r = rados.ioctx_create2(group_info.pool, pool_io_ctx);
+ if (image_r < 0) {
+ pool_name = "<missing group pool " + stringify(group_info.pool) + ">";
+ } else {
+ pool_name = pool_io_ctx.get_pool_name();
+ }
+ std::cerr << "rbd: error: image belongs to a group "
+ << pool_name << "/";
+ if (!io_ctx.get_namespace().empty()) {
+ std::cerr << io_ctx.get_namespace() << "/";
+ }
+ std::cerr << group_info.name;
+      } else {
+        std::cerr << "rbd: error: image belongs to a group";
+      }
+
+ std::cerr << std::endl
+ << "Remove the image from the group and try again."
+ << std::endl;
+ image.close();
+ } else {
+ std::cerr << "rbd: delete error: " << cpp_strerror(r) << std::endl;
+ }
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"remove"}, {"rm"}, "Delete an image.", "", &get_arguments, &execute);
+
+} // namespace remove
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Rename.cc b/src/tools/rbd/action/Rename.cc
new file mode 100644
index 00000000..b4954bcb
--- /dev/null
+++ b/src/tools/rbd/action/Rename.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace rename {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_rename(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+ const char *imgname, const char *destname)
+{
+ int r = rbd.rename(io_ctx, imgname, destname);
+ if (r < 0)
+ return r;
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string dst_image_name;
+ std::string dst_snap_name;
+ std::string dst_pool_name = pool_name;
+ std::string dst_namespace_name = namespace_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name,
+ &dst_namespace_name, &dst_image_name, &dst_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
+ if (pool_name != dst_pool_name) {
+ std::cerr << "rbd: mv/rename across pools not supported" << std::endl
+ << "source pool: " << pool_name << " dest pool: " << dst_pool_name
+ << std::endl;
+ return -EINVAL;
+ } else if (namespace_name != dst_namespace_name) {
+ std::cerr << "rbd: mv/rename across namespaces not supported" << std::endl
+ << "source namespace: " << namespace_name << " dest namespace: "
+ << dst_namespace_name << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::RBD rbd;
+ r = do_rename(rbd, io_ctx, image_name.c_str(), dst_image_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: rename error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"rename"}, {"mv"}, "Rename image within pool.", "", &get_arguments,
+ &execute);
+
+} // namespace rename
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Resize.cc b/src/tools/rbd/action/Resize.cc
new file mode 100644
index 00000000..60c16429
--- /dev/null
+++ b/src/tools/rbd/action/Resize.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace resize {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
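+// resize2() refuses to shrink the image unless allow_shrink is set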
+static int do_resize(librbd::Image& image, uint64_t size, bool allow_shrink, bool no_progress)
+{
+ utils::ProgressContext pc("Resizing image", no_progress);
+ int r = image.resize2(size, allow_shrink, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_size_option(options);
+ options->add_options()
+ ("allow-shrink", po::bool_switch(), "permit shrinking");
+ at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t size;
+ r = utils::get_image_size(vm, &size);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "",
+ snap_name, false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::image_info_t info;
+ r = image.stat(info, sizeof(info));
+ if (r < 0) {
+ std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (info.size == size) {
+ std::cerr << "rbd: new size is equal to original size " << std::endl;
+ return -EINVAL;
+ }
+
+ if (info.size > size && !vm["allow-shrink"].as<bool>()) {
+ r = -EINVAL;
+ } else {
+ r = do_resize(image, size, vm["allow-shrink"].as<bool>(), vm[at::NO_PROGRESS].as<bool>());
+ }
+
+ if (r < 0) {
+ if (r == -EINVAL && !vm["allow-shrink"].as<bool>()) {
+ std::cerr << "rbd: shrinking an image is only allowed with the "
+ << "--allow-shrink flag" << std::endl;
+ return r;
+ }
+ std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::SwitchArguments switched_arguments({"allow-shrink"});
+Shell::Action action(
+ {"resize"}, {}, "Resize (expand or shrink) image.", "", &get_arguments,
+ &execute);
+
+} // namespace resize
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Snap.cc b/src/tools/rbd/action/Snap.cc
new file mode 100644
index 00000000..70cf62da
--- /dev/null
+++ b/src/tools/rbd/action/Snap.cc
@@ -0,0 +1,889 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/bind.hpp>
+
+namespace rbd {
+namespace action {
+namespace snap {
+
+static const std::string ALL_NAME("all");
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
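+// list snapshots; when all_snaps is set, group/trash namespace snapshots are
+// included and an extra NAMESPACE column/section is shown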
+int do_list_snaps(librbd::Image& image, Formatter *f, bool all_snaps, librados::Rados& rados)
+{
+ std::vector<librbd::snap_info_t> snaps;
+ TextTable t;
+ int r;
+
+ r = image.snap_list(snaps);
+ if (r < 0) {
+ std::cerr << "rbd: unable to list snapshots" << std::endl;
+ return r;
+ }
+
+ if (!all_snaps) {
+ snaps.erase(remove_if(snaps.begin(),
+ snaps.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &image, _1)),
+ snaps.end());
+ }
+
+ if (f) {
+ f->open_array_section("snapshots");
+ } else {
+ t.define_column("SNAPID", TextTable::LEFT, TextTable::RIGHT);
+ t.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ t.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ t.define_column("PROTECTED", TextTable::LEFT, TextTable::LEFT);
+ t.define_column("TIMESTAMP", TextTable::LEFT, TextTable::RIGHT);
+ if (all_snaps) {
+ t.define_column("NAMESPACE", TextTable::LEFT, TextTable::LEFT);
+ }
+ }
+
+ std::list<std::pair<int64_t, std::string>> pool_list;
+ rados.pool_list2(pool_list);
+ std::map<int64_t, std::string> pool_map(pool_list.begin(), pool_list.end());
+
+ for (std::vector<librbd::snap_info_t>::iterator s = snaps.begin();
+ s != snaps.end(); ++s) {
+ struct timespec timestamp;
+ bool snap_protected = false;
+ image.snap_get_timestamp(s->id, &timestamp);
+ string tt_str = "";
+    if (timestamp.tv_sec != 0) {
+ time_t tt = timestamp.tv_sec;
+ tt_str = ctime(&tt);
+ tt_str = tt_str.substr(0, tt_str.length() - 1);
+ }
+
+ librbd::snap_namespace_type_t snap_namespace;
+ r = image.snap_get_namespace_type(s->id, &snap_namespace);
+ if (r < 0) {
+ std::cerr << "rbd: unable to retrieve snap namespace" << std::endl;
+ return r;
+ }
+
+ std::string snap_namespace_name = "Unknown";
+ switch (snap_namespace) {
+ case RBD_SNAP_NAMESPACE_TYPE_USER:
+ snap_namespace_name = "user";
+ break;
+ case RBD_SNAP_NAMESPACE_TYPE_GROUP:
+ snap_namespace_name = "group";
+ break;
+ case RBD_SNAP_NAMESPACE_TYPE_TRASH:
+ snap_namespace_name = "trash";
+ break;
+ }
+
+ int get_trash_res = -ENOENT;
+ std::string trash_original_name;
+ int get_group_res = -ENOENT;
+ librbd::snap_group_namespace_t group_snap;
+ if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_GROUP) {
+ get_group_res = image.snap_get_group_namespace(s->id, &group_snap,
+ sizeof(group_snap));
+ } else if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_TRASH) {
+ get_trash_res = image.snap_get_trash_namespace(
+ s->id, &trash_original_name);
+ }
+
+ std::string protected_str = "";
+ if (snap_namespace == RBD_SNAP_NAMESPACE_TYPE_USER) {
+ r = image.snap_is_protected(s->name.c_str(), &snap_protected);
+ if (r < 0) {
+ std::cerr << "rbd: unable to retrieve snap protection" << std::endl;
+ return r;
+ }
+ }
+
+ if (f) {
+ protected_str = snap_protected ? "true" : "false";
+ f->open_object_section("snapshot");
+ f->dump_unsigned("id", s->id);
+ f->dump_string("name", s->name);
+ f->dump_unsigned("size", s->size);
+ f->dump_string("protected", protected_str);
+ f->dump_string("timestamp", tt_str);
+ if (all_snaps) {
+ f->open_object_section("namespace");
+ f->dump_string("type", snap_namespace_name);
+ if (get_group_res == 0) {
+ std::string pool_name = pool_map[group_snap.group_pool];
+ f->dump_string("pool", pool_name);
+ f->dump_string("group", group_snap.group_name);
+ f->dump_string("group snap", group_snap.group_snap_name);
+ } else if (get_trash_res == 0) {
+ f->dump_string("original_name", trash_original_name);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ } else {
+ protected_str = snap_protected ? "yes" : "";
+ t << s->id << s->name << stringify(byte_u_t(s->size)) << protected_str << tt_str;
+
+ if (all_snaps) {
+ ostringstream oss;
+ oss << snap_namespace_name;
+
+ if (get_group_res == 0) {
+ std::string pool_name = pool_map[group_snap.group_pool];
+ oss << " (" << pool_name << "/"
+ << group_snap.group_name << "@"
+ << group_snap.group_snap_name << ")";
+ } else if (get_trash_res == 0) {
+ oss << " (" << trash_original_name << ")";
+ }
+
+ t << oss.str();
+ }
+ t << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (snaps.size()) {
+ std::cout << t;
+ }
+
+ return 0;
+}
+
+int do_add_snap(librbd::Image& image, const char *snapname)
+{
+ int r = image.snap_create(snapname);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int do_remove_snap(librbd::Image& image, const char *snapname, bool force,
+ bool no_progress)
+{
+  uint32_t flags = force ? RBD_SNAP_REMOVE_FORCE : 0;
+ int r = 0;
+ utils::ProgressContext pc("Removing snap", no_progress);
+
+ r = image.snap_remove2(snapname, flags, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+
+ pc.finish();
+ return 0;
+}
+
+int do_rollback_snap(librbd::Image& image, const char *snapname,
+ bool no_progress)
+{
+ utils::ProgressContext pc("Rolling back to snapshot", no_progress);
+ int r = image.snap_rollback_with_progress(snapname, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
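+// remove every unprotected user-namespace snapshot; protected snapshots are
+// collected and reported so the caller can unprotect them first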
+int do_purge_snaps(librbd::Image& image, bool no_progress)
+{
+ utils::ProgressContext pc("Removing all snapshots", no_progress);
+ std::vector<librbd::snap_info_t> snaps;
+ bool is_protected = false;
+ int r = image.snap_list(snaps);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ } else if (0 == snaps.size()) {
+ return 0;
+ } else {
+ list<std::string> protect;
+ snaps.erase(remove_if(snaps.begin(),
+ snaps.end(),
+ boost::bind(utils::is_not_user_snap_namespace, &image, _1)),
+ snaps.end());
+ for (auto it = snaps.begin(); it != snaps.end();) {
+ r = image.snap_is_protected(it->name.c_str(), &is_protected);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ } else if (is_protected == true) {
+ protect.push_back(it->name.c_str());
+        // erase() invalidates 'it'; continue from the returned iterator
+        it = snaps.erase(it);
+ } else {
+ ++it;
+ }
+ }
+
+ if (!protect.empty()) {
+ std::cout << "rbd: error removing snapshot(s) '" << protect << "', which "
+ << (1 == protect.size() ? "is" : "are")
+ << " protected - these must be unprotected with "
+ << "`rbd snap unprotect`."
+ << std::endl;
+ }
+ for (size_t i = 0; i < snaps.size(); ++i) {
+ r = image.snap_remove(snaps[i].name.c_str());
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.update_progress(i + 1, snaps.size() + protect.size());
+ }
+
+ if (!protect.empty()) {
+ pc.fail();
+ } else if (snaps.size() > 0) {
+ pc.finish();
+ }
+
+ return 0;
+ }
+}
+
+int do_protect_snap(librbd::Image& image, const char *snapname)
+{
+ int r = image.snap_protect(snapname);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int do_unprotect_snap(librbd::Image& image, const char *snapname)
+{
+ int r = image.snap_unprotect(snapname);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int do_set_limit(librbd::Image& image, uint64_t limit)
+{
+ return image.snap_set_limit(limit);
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+ at::add_format_options(options);
+
+ std::string name = ALL_NAME + ",a";
+
+ options->add_options()
+ (name.c_str(), po::bool_switch(), "list snapshots from all namespaces");
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ std::string image_id;
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, image_id.empty(),
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id. "
+ << std::endl;
+ return -EINVAL;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name,
+ image_id, "", true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ bool all_snaps = vm[ALL_NAME].as<bool>();
+ r = do_list_snaps(image, formatter.get(), all_snaps, rados);
+ if (r < 0) {
+ cerr << "rbd: failed to list snapshots: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_create_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_create(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_SNAP);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_add_snap(image, snap_name.c_str());
+ if (r < 0) {
+ cerr << "rbd: failed to create snapshot: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+ at::add_snap_id_option(options);
+ at::add_no_progress_option(options);
+
+ options->add_options()
+ ("force", po::bool_switch(), "flatten children and unprotect snapshot if needed.");
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ std::string image_id;
+ uint64_t snap_id = CEPH_NOSNAP;
+ bool force = vm["force"].as<bool>();
+ bool no_progress = vm[at::NO_PROGRESS].as<bool>();
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+ if (vm.count(at::SNAPSHOT_ID)) {
+ snap_id = vm[at::SNAPSHOT_ID].as<uint64_t>();
+ }
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, image_id.empty(),
+ (snap_id == CEPH_NOSNAP ? utils::SNAPSHOT_PRESENCE_REQUIRED :
+ utils::SNAPSHOT_PRESENCE_PERMITTED),
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id."
+ << std::endl;
+ return -EINVAL;
+ } else if (!snap_name.empty() && snap_id != CEPH_NOSNAP) {
+ std::cerr << "rbd: trying to access snapshot using both name and id."
+ << std::endl;
+ return -EINVAL;
+ } else if ((force || no_progress) && snap_id != CEPH_NOSNAP) {
+ std::cerr << "rbd: force and no-progress options not permitted when "
+ << "removing by id." << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ io_ctx.set_osdmap_full_try();
+ if (image_id.empty()) {
+ r = utils::open_image(io_ctx, image_name, false, &image);
+ } else {
+ r = utils::open_image_by_id(io_ctx, image_id, false, &image);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ if (!snap_name.empty()) {
+ r = do_remove_snap(image, snap_name.c_str(), force, no_progress);
+ } else {
+ r = image.snap_remove_by_id(snap_id);
+ }
+
+ if (r < 0) {
+ if (r == -EBUSY) {
+ std::cerr << "rbd: snapshot "
+ << (snap_name.empty() ? std::string("id ") + stringify(snap_id) :
+ std::string("'") + snap_name + "'")
+ << " is protected from removal." << std::endl;
+ } else {
+ std::cerr << "rbd: failed to remove snapshot: " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ return 0;
+}
+
+void get_purge_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+ at::add_no_progress_option(options);
+}
+
+int execute_purge(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ std::string image_id;
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, image_id.empty(),
+ utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id. "
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ io_ctx.set_osdmap_full_try();
+ if (image_id.empty()) {
+ r = utils::open_image(io_ctx, image_name, false, &image);
+ } else {
+ r = utils::open_image_by_id(io_ctx, image_id, false, &image);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_purge_snaps(image, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ if (r != -EBUSY) {
+ std::cerr << "rbd: removing snaps failed: " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ return 0;
+}
+
+void get_rollback_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+}
+
+int execute_rollback(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_rollback_snap(image, snap_name.c_str(),
+ vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: rollback failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_protect_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_protect(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ bool is_protected = false;
+ r = image.snap_is_protected(snap_name.c_str(), &is_protected);
+ if (r < 0) {
+ std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ } else if (is_protected) {
+ std::cerr << "rbd: snap is already protected" << std::endl;
+ return -EBUSY;
+ }
+
+ r = do_protect_snap(image, snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_unprotect_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+}
+
+int execute_unprotect(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ std::string image_id;
+
+ if (vm.count(at::IMAGE_ID)) {
+ image_id = vm[at::IMAGE_ID].as<std::string>();
+ }
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, image_id.empty(),
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!image_id.empty() && !image_name.empty()) {
+ std::cerr << "rbd: trying to access image using both name and id. "
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ io_ctx.set_osdmap_full_try();
+ if (image_id.empty()) {
+ r = utils::open_image(io_ctx, image_name, false, &image);
+ } else {
+ r = utils::open_image_by_id(io_ctx, image_id, false, &image);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ bool is_protected = false;
+ r = image.snap_is_protected(snap_name.c_str(), &is_protected);
+ if (r < 0) {
+ std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ } else if (!is_protected) {
+ std::cerr << "rbd: snap is already unprotected" << std::endl;
+ return -EINVAL;
+ }
+
+ r = do_unprotect_snap(image, snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_set_limit_arguments(po::options_description *pos,
+ po::options_description *opt) {
+ at::add_image_spec_options(pos, opt, at::ARGUMENT_MODIFIER_NONE);
+ at::add_limit_option(opt);
+}
+
+int execute_set_limit(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ uint64_t limit;
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ if (vm.count(at::LIMIT)) {
+ limit = vm[at::LIMIT].as<uint64_t>();
+ } else {
+ std::cerr << "rbd: must specify --limit <num>" << std::endl;
+ return -ERANGE;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_set_limit(image, limit);
+ if (r < 0) {
+ std::cerr << "rbd: setting snapshot limit failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_clear_limit_arguments(po::options_description *pos,
+ po::options_description *opt) {
+ at::add_image_spec_options(pos, opt, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_clear_limit(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_set_limit(image, UINT64_MAX);
+ if (r < 0) {
+ std::cerr << "rbd: clearing snapshot limit failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_rename_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+ at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+}
+
+int execute_rename(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string src_snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &src_snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+    return r;
+ }
+
+ std::string dest_pool_name(pool_name);
+ std::string dest_namespace_name(namespace_name);
+ std::string dest_image_name;
+ std::string dest_snap_name;
+ r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dest_pool_name,
+ &dest_namespace_name, &dest_image_name, &dest_snap_name, true,
+ utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_SNAP);
+ if (r < 0) {
+    return r;
+ }
+
+ if (pool_name != dest_pool_name) {
+ std::cerr << "rbd: source and destination pool must be the same"
+ << std::endl;
+ return -EINVAL;
+ } else if (namespace_name != dest_namespace_name) {
+ std::cerr << "rbd: source and destination namespace must be the same"
+ << std::endl;
+ return -EINVAL;
+ } else if (image_name != dest_image_name) {
+ std::cerr << "rbd: source and destination image name must be the same"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = image.snap_rename(src_snap_name.c_str(), dest_snap_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: renaming snap failed: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action_list(
+ {"snap", "list"}, {"snap", "ls"}, "Dump list of image snapshots.", "",
+ &get_list_arguments, &execute_list);
+Shell::Action action_create(
+ {"snap", "create"}, {"snap", "add"}, "Create a snapshot.", "",
+ &get_create_arguments, &execute_create);
+Shell::Action action_remove(
+ {"snap", "remove"}, {"snap", "rm"}, "Delete a snapshot.", "",
+ &get_remove_arguments, &execute_remove);
+Shell::Action action_purge(
+ {"snap", "purge"}, {}, "Delete all unprotected snapshots.", "",
+ &get_purge_arguments, &execute_purge);
+Shell::Action action_rollback(
+ {"snap", "rollback"}, {"snap", "revert"}, "Rollback image to snapshot.", "",
+ &get_rollback_arguments, &execute_rollback);
+Shell::Action action_protect(
+ {"snap", "protect"}, {}, "Prevent a snapshot from being deleted.", "",
+ &get_protect_arguments, &execute_protect);
+Shell::Action action_unprotect(
+ {"snap", "unprotect"}, {}, "Allow a snapshot to be deleted.", "",
+ &get_unprotect_arguments, &execute_unprotect);
+Shell::Action action_set_limit(
+ {"snap", "limit", "set"}, {}, "Limit the number of snapshots.", "",
+ &get_set_limit_arguments, &execute_set_limit);
+Shell::Action action_clear_limit(
+ {"snap", "limit", "clear"}, {}, "Remove snapshot limit.", "",
+ &get_clear_limit_arguments, &execute_clear_limit);
+Shell::Action action_rename(
+ {"snap", "rename"}, {}, "Rename a snapshot.", "",
+ &get_rename_arguments, &execute_rename);
+
+} // namespace snap
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Sparsify.cc b/src/tools/rbd/action/Sparsify.cc
new file mode 100644
index 00000000..a345f920
--- /dev/null
+++ b/src/tools/rbd/action/Sparsify.cc
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace sparsify {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
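+// reclaim space for zeroed image extents, detected at sparse_size granularity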
+static int do_sparsify(librbd::Image& image, size_t sparse_size,
+ bool no_progress)
+{
+ utils::ProgressContext pc("Image sparsify", no_progress);
+ int r = image.sparsify_with_progress(sparse_size, pc);
+ if (r < 0) {
+ pc.fail();
+ return r;
+ }
+ pc.finish();
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_no_progress_option(options);
+ at::add_sparse_size_option(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ false, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
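+  // fall back to the tool's built-in default sparse size unless --sparse-size was given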
+ size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
+ if (vm.count(at::IMAGE_SPARSE_SIZE)) {
+ sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
+ }
+
+ r = do_sparsify(image, sparse_size, vm[at::NO_PROGRESS].as<bool>());
+ if (r < 0) {
+ std::cerr << "rbd: sparsify error: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"sparsify"}, {},
+ "Reclaim space for zeroed image extents.", "",
+ &get_arguments, &execute);
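+// Example invocation (hypothetical image spec): rbd sparsify --sparse-size 4096 mypool/myimage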
+
+} // namespace sparsify
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Status.cc b/src/tools/rbd/action/Status.cc
new file mode 100644
index 00000000..0a599e7f
--- /dev/null
+++ b/src/tools/rbd/action/Status.cc
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace status {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_show_status(librados::IoCtx& io_ctx, const std::string &image_name,
+ librbd::Image &image, Formatter *f)
+{
+ int r;
+ std::list<librbd::image_watcher_t> watchers;
+
+ r = image.list_watchers(watchers);
+ if (r < 0)
+ return r;
+
+ uint64_t features;
+ r = image.features(&features);
+ if (r < 0) {
+ return r;
+ }
+
+ librbd::image_migration_status_t migration_status;
+ std::string source_pool_name;
+ std::string dest_pool_name;
+ std::string migration_state;
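+  // the (internal) migrating feature bit is only set while an image migration is in flight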
+ if ((features & RBD_FEATURE_MIGRATING) != 0) {
+ r = librbd::RBD().migration_status(io_ctx, image_name.c_str(),
+ &migration_status,
+ sizeof(migration_status));
+ if (r < 0) {
+ std::cerr << "rbd: getting migration status failed: " << cpp_strerror(r)
+ << std::endl;
+ // not fatal
+ } else {
+ librados::IoCtx src_io_ctx;
+ r = librados::Rados(io_ctx).ioctx_create2(migration_status.source_pool_id, src_io_ctx);
+ if (r < 0) {
+ source_pool_name = stringify(migration_status.source_pool_id);
+ } else {
+ source_pool_name = src_io_ctx.get_pool_name();
+ }
+
+ librados::IoCtx dst_io_ctx;
+ r = librados::Rados(io_ctx).ioctx_create2(migration_status.dest_pool_id, dst_io_ctx);
+ if (r < 0) {
+ dest_pool_name = stringify(migration_status.dest_pool_id);
+ } else {
+ dest_pool_name = dst_io_ctx.get_pool_name();
+ }
+
+ switch (migration_status.state) {
+ case RBD_IMAGE_MIGRATION_STATE_ERROR:
+ migration_state = "error";
+ break;
+ case RBD_IMAGE_MIGRATION_STATE_PREPARING:
+ migration_state = "preparing";
+ break;
+ case RBD_IMAGE_MIGRATION_STATE_PREPARED:
+ migration_state = "prepared";
+ break;
+ case RBD_IMAGE_MIGRATION_STATE_EXECUTING:
+ migration_state = "executing";
+ break;
+ case RBD_IMAGE_MIGRATION_STATE_EXECUTED:
+ migration_state = "executed";
+ break;
+ case RBD_IMAGE_MIGRATION_STATE_ABORTING:
+ migration_state = "aborting";
+ break;
+ default:
+ migration_state = "unknown";
+ }
+ }
+ }
+
+ if (f)
+ f->open_object_section("status");
+
+ if (f) {
+ f->open_array_section("watchers");
+ for (auto &watcher : watchers) {
+ f->open_object_section("watcher");
+ f->dump_string("address", watcher.addr);
+ f->dump_unsigned("client", watcher.id);
+ f->dump_unsigned("cookie", watcher.cookie);
+ f->close_section();
+ }
+ f->close_section(); // watchers
+ if (!migration_state.empty()) {
+ f->open_object_section("migration");
+ f->dump_string("source_pool_name", source_pool_name);
+ f->dump_string("source_pool_namespace",
+ migration_status.source_pool_namespace);
+ f->dump_string("source_image_name", migration_status.source_image_name);
+ f->dump_string("source_image_id", migration_status.source_image_id);
+ f->dump_string("dest_pool_name", dest_pool_name);
+ f->dump_string("dest_pool_namespace",
+ migration_status.dest_pool_namespace);
+ f->dump_string("dest_image_name", migration_status.dest_image_name);
+ f->dump_string("dest_image_id", migration_status.dest_image_id);
+ f->dump_string("state", migration_state);
+ f->dump_string("state_description", migration_status.state_description);
+ f->close_section(); // migration
+ }
+ } else {
+ if (watchers.size()) {
+ std::cout << "Watchers:" << std::endl;
+ for (auto &watcher : watchers) {
+ std::cout << "\twatcher=" << watcher.addr << " client." << watcher.id
+ << " cookie=" << watcher.cookie << std::endl;
+ }
+ } else {
+ std::cout << "Watchers: none" << std::endl;
+ }
+ if (!migration_state.empty()) {
+ if (!migration_status.source_pool_namespace.empty()) {
+ source_pool_name += ("/" + migration_status.source_pool_namespace);
+ }
+ if (!migration_status.dest_pool_namespace.empty()) {
+ dest_pool_name += ("/" + migration_status.dest_pool_namespace);
+ }
+
+ std::cout << "Migration:" << std::endl;
+ std::cout << "\tsource: " << source_pool_name << "/"
+ << migration_status.source_image_name;
+ if (!migration_status.source_image_id.empty()) {
+ std::cout << " (" << migration_status.source_image_id << ")";
+ }
+ std::cout << std::endl;
+ std::cout << "\tdestination: " << dest_pool_name << "/"
+ << migration_status.dest_image_name << " ("
+ << migration_status.dest_image_id << ")" << std::endl;
+ std::cout << "\tstate: " << migration_state;
+ if (!migration_status.state_description.empty()) {
+ std::cout << " (" << migration_status.state_description << ")";
+ }
+ std::cout << std::endl;
+ }
+ }
+
+ if (f) {
+ f->close_section(); // status
+ f->flush(std::cout);
+ }
+
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_show_status(io_ctx, image_name, image, formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: show status failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"status"}, {}, "Show the status of this image.", "", &get_arguments,
+ &execute);
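+// Example invocation (hypothetical image spec): rbd status --format json mypool/myimage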
+
+} // namespace status
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Trash.cc b/src/tools/rbd/action/Trash.cc
new file mode 100644
index 00000000..327b20ba
--- /dev/null
+++ b/src/tools/rbd/action/Trash.cc
@@ -0,0 +1,525 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "common/Clock.h"
+#include <iostream>
+#include <sstream>
+#include <boost/program_options.hpp>
+#include <boost/bind.hpp>
+
+namespace rbd {
+namespace action {
+namespace trash {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+// Optional arguments used only by this set of commands (rbd trash *)
+static const std::string EXPIRES_AT("expires-at");
+static const std::string EXPIRED_BEFORE("expired-before");
+static const std::string THRESHOLD("threshold");
+
+static bool is_not_trash_user(const librbd::trash_image_info_t &trash_info) {
+ return trash_info.source != RBD_TRASH_IMAGE_SOURCE_USER;
+}
+
+void get_move_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+ options->add_options()
+ (EXPIRES_AT.c_str(), po::value<std::string>()->default_value("now"),
+ "set the expiration time of an image so it can be purged when it is stale");
+}
+
+int execute_move(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utime_t now = ceph_clock_now();
+ utime_t exp_time = now;
+ std::string expires_at;
+ if (vm.find(EXPIRES_AT) != vm.end()) {
+ expires_at = vm[EXPIRES_AT].as<std::string>();
+ r = utime_t::invoke_date(expires_at, &exp_time);
+ if (r < 0) {
+ std::cerr << "rbd: error calling /bin/date: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ }
+
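+  // delay, in seconds from now, before the trashed image becomes eligible for purge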
+  time_t dt = (exp_time - now).sec();
+  if (dt < 0) {
+ std::cerr << "rbd: cannot use a date in the past as an expiration date"
+ << std::endl;
+ return -EINVAL;
+ }
+
+ librbd::RBD rbd;
+ r = rbd.trash_move(io_ctx, image_name.c_str(), dt);
+ if (r < 0) {
+ std::cerr << "rbd: deferred delete error: " << cpp_strerror(r)
+ << std::endl;
+ }
+
+ return r;
+}
+
+void get_remove_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ (at::IMAGE_ID.c_str(), "image id\n(example: [<pool-name>/[<namespace>/]]<image-id>)");
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+
+ at::add_no_progress_option(options);
+ options->add_options()
+ ("force", po::bool_switch(), "force remove of non-expired delayed images");
+}
+
+int execute_remove(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_id;
+ int r = utils::get_pool_image_id(vm, &arg_index, &pool_name, &namespace_name,
+ &image_id);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
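+  // allow the removal to proceed even if the cluster/pool is flagged full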
+ io_ctx.set_osdmap_full_try();
+ librbd::RBD rbd;
+
+ utils::ProgressContext pc("Removing image", vm[at::NO_PROGRESS].as<bool>());
+ r = rbd.trash_remove_with_progress(io_ctx, image_id.c_str(),
+ vm["force"].as<bool>(), pc);
+ if (r < 0) {
+ if (r == -ENOTEMPTY) {
+ std::cerr << "rbd: image has snapshots - these must be deleted"
+ << " with 'rbd snap purge' before the image can be removed."
+ << std::endl;
+ } else if (r == -EBUSY) {
+ std::cerr << "rbd: error: image still has watchers"
+ << std::endl
+ << "This means the image is still open or the client using "
+ << "it crashed. Try again after closing/unmapping it or "
+ << "waiting 30s for the crashed client to timeout."
+ << std::endl;
+ } else if (r == -EMLINK) {
+ std::cerr << std::endl
+ << "Remove the image from the group and try again."
+ << std::endl;
+ } else if (r == -EPERM) {
+ std::cerr << std::endl
+ << "Deferment time has not expired, please use --force if you "
+ << "really want to remove the image"
+ << std::endl;
+ } else {
+ std::cerr << "rbd: remove error: " << cpp_strerror(r) << std::endl;
+ }
+ pc.fail();
+ return r;
+ }
+
+ pc.finish();
+
+ return r;
+}
+
+std::string delete_status(time_t deferment_end_time) {
+ time_t now = time(nullptr);
+
+ std::string time_str = ctime(&deferment_end_time);
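+  // strip the trailing '\n' appended by ctime()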
+ time_str = time_str.substr(0, time_str.length() - 1);
+
+ std::stringstream ss;
+ if (now < deferment_end_time) {
+ ss << "protected until " << time_str;
+ } else {
+ ss << "expired at " << time_str;
+ }
+
+ return ss.str();
+}
+
+int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool long_flag,
+ bool all_flag, Formatter *f) {
+ std::vector<librbd::trash_image_info_t> trash_entries;
+ int r = rbd.trash_list(io_ctx, trash_entries);
+ if (r < 0) {
+ return r;
+ }
+
+ if (!all_flag) {
+ trash_entries.erase(remove_if(trash_entries.begin(),
+ trash_entries.end(),
+ boost::bind(is_not_trash_user, _1)),
+ trash_entries.end());
+ }
+
+ if (!long_flag) {
+ if (f) {
+ f->open_array_section("trash");
+ }
+ for (const auto& entry : trash_entries) {
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("id", entry.id);
+ f->dump_string("name", entry.name);
+ f->close_section();
+ } else {
+ std::cout << entry.id << " " << entry.name << std::endl;
+ }
+ }
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ }
+ return 0;
+ }
+
+ TextTable tbl;
+
+ if (f) {
+ f->open_array_section("trash");
+ } else {
+ tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("SOURCE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("DELETED_AT", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("STATUS", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ for (const auto& entry : trash_entries) {
+ librbd::Image im;
+
+ r = rbd.open_by_id_read_only(io_ctx, im, entry.id.c_str(), NULL);
+ // image might disappear between rbd.list() and rbd.open(); ignore
+ // that, warn about other possible errors (EPERM, say, for opening
+ // an old-format image, because you need execute permission for the
+ // class method)
+ if (r < 0) {
+ if (r != -ENOENT) {
+ std::cerr << "rbd: error opening " << entry.id << ": "
+ << cpp_strerror(r) << std::endl;
+ }
+ // in any event, continue to next image
+ continue;
+ }
+
+ std::string del_source;
+ switch (entry.source) {
+ case RBD_TRASH_IMAGE_SOURCE_USER:
+ del_source = "USER";
+ break;
+ case RBD_TRASH_IMAGE_SOURCE_MIRRORING:
+ del_source = "MIRRORING";
+ break;
+ case RBD_TRASH_IMAGE_SOURCE_MIGRATION:
+ del_source = "MIGRATION";
+ break;
+ case RBD_TRASH_IMAGE_SOURCE_REMOVING:
+ del_source = "REMOVING";
+ break;
+ }
+
+ std::string time_str = ctime(&entry.deletion_time);
+ time_str = time_str.substr(0, time_str.length() - 1);
+
+ bool has_parent = false;
+ std::string parent;
+ librbd::linked_image_spec_t parent_image;
+ librbd::snap_spec_t parent_snap;
+ r = im.get_parent(&parent_image, &parent_snap);
+ if (r == -ENOENT) {
+ r = 0;
+ } else if (r < 0) {
+ return r;
+ } else {
+ parent = parent_image.pool_name + "/";
+ if (!parent_image.pool_namespace.empty()) {
+ parent += parent_image.pool_namespace + "/";
+ }
+ parent += parent_image.image_name + "@" + parent_snap.name;
+ has_parent = true;
+ }
+
+ if (f) {
+ f->open_object_section("image");
+ f->dump_string("id", entry.id);
+ f->dump_string("name", entry.name);
+ f->dump_string("source", del_source);
+ f->dump_string("deleted_at", time_str);
+ f->dump_string("status",
+ delete_status(entry.deferment_end_time));
+ if (has_parent) {
+ f->open_object_section("parent");
+ f->dump_string("pool", parent_image.pool_name);
+ f->dump_string("pool_namespace", parent_image.pool_namespace);
+ f->dump_string("image", parent_image.image_name);
+ f->dump_string("snapshot", parent_snap.name);
+ f->close_section();
+ }
+ f->close_section();
+ } else {
+ tbl << entry.id
+ << entry.name
+ << del_source
+ << time_str
+ << delete_status(entry.deferment_end_time);
+ if (has_parent)
+ tbl << parent;
+ tbl << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section();
+ f->flush(std::cout);
+ } else if (!trash_entries.empty()) {
+ std::cout << tbl;
+ }
+
+ return r < 0 ? r : 0;
+}
+
+void get_list_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ options->add_options()
+ ("all,a", po::bool_switch(), "list images from all sources");
+ options->add_options()
+ ("long,l", po::bool_switch(), "long listing format");
+ at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ at::Format::Formatter formatter;
+ r = utils::get_formatter(vm, &formatter);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ librbd::RBD rbd;
+ r = do_list(rbd, io_ctx, vm["long"].as<bool>(), vm["all"].as<bool>(),
+ formatter.get());
+ if (r < 0) {
+ std::cerr << "rbd: trash list: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+void get_purge_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_pool_options(positional, options, true);
+ at::add_no_progress_option(options);
+
+ options->add_options()
+ (EXPIRED_BEFORE.c_str(), po::value<std::string>()->value_name("date"),
+ "purges images that expired before the given date");
+ options->add_options()
+ (THRESHOLD.c_str(), po::value<float>(),
+ "purges images until the current pool data usage is reduced to X%, "
+ "value range: 0.0-1.0");
+}
+
+int execute_purge(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ std::string pool_name;
+ std::string namespace_name;
+ size_t arg_index = 0;
+ int r = utils::get_pool_and_namespace_names(vm, true, false, &pool_name,
+ &namespace_name, &arg_index);
+ if (r < 0) {
+ return r;
+ }
+
+ utils::disable_cache();
+
+ librbd::RBD rbd;
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ io_ctx.set_osdmap_full_try();
+
+ float threshold = -1;
+ time_t expire_ts = 0;
+
+ if (vm.find(THRESHOLD) != vm.end()) {
+ threshold = vm[THRESHOLD].as<float>();
+ } else {
+ if (vm.find(EXPIRED_BEFORE) != vm.end()) {
+ utime_t new_time;
+ r = utime_t::invoke_date(vm[EXPIRED_BEFORE].as<std::string>(), &new_time);
+ if (r < 0) {
+ std::cerr << "rbd: error calling /bin/date: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+ expire_ts = new_time.sec();
+ }
+ }
+
+ utils::ProgressContext pc("Removing images", vm[at::NO_PROGRESS].as<bool>());
+ r = rbd.trash_purge_with_progress(io_ctx, expire_ts, threshold, pc);
+ if (r < 0) {
+ pc.fail();
+ } else {
+ pc.finish();
+ }
+
+  return r;
+}
+
+void get_restore_arguments(po::options_description *positional,
+ po::options_description *options) {
+ positional->add_options()
+ (at::IMAGE_ID.c_str(), "image id\n(example: [<pool-name>/]<image-id>)");
+ at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_namespace_option(options, at::ARGUMENT_MODIFIER_NONE);
+ at::add_image_id_option(options);
+ at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, "");
+}
+
+int execute_restore(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_id;
+ int r = utils::get_pool_image_id(vm, &arg_index, &pool_name, &namespace_name,
+ &image_id);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ r = utils::init(pool_name, namespace_name, &rados, &io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ std::string name;
+ if (vm.find(at::IMAGE_NAME) != vm.end()) {
+ name = vm[at::IMAGE_NAME].as<std::string>();
+ }
+
+ librbd::RBD rbd;
+ r = rbd.trash_restore(io_ctx, image_id.c_str(), name.c_str());
+ if (r < 0) {
+ if (r == -ENOENT) {
+ std::cerr << "rbd: error: image does not exist in trash"
+ << std::endl;
+ } else if (r == -EEXIST) {
+ std::cerr << "rbd: error: an image with the same name already exists, "
+                << "try again with a different name"
+ << std::endl;
+ } else {
+ std::cerr << "rbd: restore error: " << cpp_strerror(r) << std::endl;
+ }
+ return r;
+ }
+
+ return r;
+}
+
+
+Shell::Action action_move(
+ {"trash", "move"}, {"trash", "mv"}, "Move an image to the trash.", "",
+ &get_move_arguments, &execute_move);
+
+Shell::Action action_remove(
+ {"trash", "remove"}, {"trash", "rm"}, "Remove an image from trash.", "",
+ &get_remove_arguments, &execute_remove);
+
+Shell::Action action_purge(
+ {"trash", "purge"}, {}, "Remove all expired images from trash.", "",
+ &get_purge_arguments, &execute_purge);
+
+Shell::SwitchArguments switched_arguments({"long", "l"});
+Shell::Action action_list(
+ {"trash", "list"}, {"trash", "ls"}, "List trash images.", "",
+ &get_list_arguments, &execute_list);
+
+Shell::Action action_restore(
+ {"trash", "restore"}, {}, "Restore an image from trash.", "",
+ &get_restore_arguments, &execute_restore);
+
+} // namespace trash
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Watch.cc b/src/tools/rbd/action/Watch.cc
new file mode 100644
index 00000000..65f0f93d
--- /dev/null
+++ b/src/tools/rbd/action/Watch.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "librbd/WatchNotifyTypes.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace watch {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+class RbdWatchCtx : public librados::WatchCtx2 {
+public:
+ RbdWatchCtx(librados::IoCtx& io_ctx, const char *image_name,
+ const std::string &header_oid)
+ : m_io_ctx(io_ctx), m_image_name(image_name), m_header_oid(header_oid)
+ {
+ }
+
+ ~RbdWatchCtx() override {}
+
+ void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) override {
+ using namespace librbd::watch_notify;
+ NotifyMessage notify_message;
+ if (bl.length() == 0) {
+ notify_message = NotifyMessage(HeaderUpdatePayload());
+ } else {
+ try {
+ auto iter = bl.cbegin();
+ notify_message.decode(iter);
+ } catch (const buffer::error &err) {
+ std::cerr << "rbd: failed to decode image notification" << std::endl;
+ }
+ }
+
+ std::cout << m_image_name << " received notification: notify_id="
+ << notify_id << ", cookie=" << cookie << ", notifier_id="
+ << notifier_id << ", bl.length=" << bl.length() << ", notify_op="
+ << notify_message.get_notify_op() << std::endl;
+ bufferlist reply;
+ m_io_ctx.notify_ack(m_header_oid, notify_id, cookie, reply);
+ }
+
+ void handle_error(uint64_t cookie, int err) override {
+ std::cerr << m_image_name << " received error: cookie=" << cookie << ", "
+ << "err=" << cpp_strerror(err) << std::endl;
+ }
+private:
+ librados::IoCtx m_io_ctx;
+ const char *m_image_name;
+ std::string m_header_oid;
+};
+
+static int do_watch(librados::IoCtx& pp, librbd::Image &image,
+ const char *imgname)
+{
+ uint8_t old_format;
+ int r = image.old_format(&old_format);
+ if (r < 0) {
+ std::cerr << "failed to query format" << std::endl;
+ return r;
+ }
+
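+  // old-format images keep their header in "<name>.rbd"; new-format images use "rbd_header.<id>"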
+ std::string header_oid;
+ if (old_format != 0) {
+ header_oid = std::string(imgname) + RBD_SUFFIX;
+ } else {
+ std::string id;
+ r = image.get_id(&id);
+ if (r < 0) {
+ return r;
+ }
+
+ header_oid = RBD_HEADER_PREFIX + id;
+ }
+
+ uint64_t cookie;
+ RbdWatchCtx ctx(pp, imgname, header_oid);
+ r = pp.watch2(header_oid, &cookie, &ctx);
+ if (r < 0) {
+ std::cerr << "rbd: watch failed" << std::endl;
+ return r;
+ }
+
+ std::cout << "press enter to exit..." << std::endl;
+ getchar();
+
+ r = pp.unwatch2(cookie);
+ if (r < 0) {
+ std::cerr << "rbd: unwatch failed" << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+void get_arguments(po::options_description *positional,
+ po::options_description *options) {
+ at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute(const po::variables_map &vm,
+ const std::vector<std::string> &ceph_global_init_args) {
+ size_t arg_index = 0;
+ std::string pool_name;
+ std::string namespace_name;
+ std::string image_name;
+ std::string snap_name;
+ int r = utils::get_pool_image_snapshot_names(
+ vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &namespace_name,
+ &image_name, &snap_name, true, utils::SNAPSHOT_PRESENCE_NONE,
+ utils::SPEC_VALIDATION_NONE);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::Rados rados;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+ r = utils::init_and_open_image(pool_name, namespace_name, image_name, "", "",
+ true, &rados, &io_ctx, &image);
+ if (r < 0) {
+ return r;
+ }
+
+ r = do_watch(io_ctx, image, image_name.c_str());
+ if (r < 0) {
+ std::cerr << "rbd: watch failed: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ return 0;
+}
+
+Shell::Action action(
+ {"watch"}, {}, "Watch events on image.", "", &get_arguments, &execute);
+
+} // namespace watch
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/rbd.cc b/src/tools/rbd/rbd.cc
new file mode 100644
index 00000000..a8c59d57
--- /dev/null
+++ b/src/tools/rbd/rbd.cc
@@ -0,0 +1,10 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Shell.h"
+
+int main(int argc, const char **argv)
+{
+ rbd::Shell shell;
+ return shell.execute(argc, argv);
+}
diff --git a/src/tools/rbd_ggate/CMakeLists.txt b/src/tools/rbd_ggate/CMakeLists.txt
new file mode 100644
index 00000000..5c5572c4
--- /dev/null
+++ b/src/tools/rbd_ggate/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(rbd-ggate
+ Driver.cc
+ Server.cc
+ Watcher.cc
+ debug.cc
+ ggate_drv.c
+ main.cc)
+target_link_libraries(rbd-ggate geom librbd librados global)
+install(TARGETS rbd-ggate DESTINATION bin)
diff --git a/src/tools/rbd_ggate/Driver.cc b/src/tools/rbd_ggate/Driver.cc
new file mode 100644
index 00000000..752ef56f
--- /dev/null
+++ b/src/tools/rbd_ggate/Driver.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stdlib.h>
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "Driver.h"
+#include "Request.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate::Driver: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace ggate {
+
+int Driver::load() {
+
+ return ggate_drv_load();
+}
+
+int Driver::kill(const std::string &devname) {
+
+ int r = ggate_drv_kill(devname.c_str());
+
+ return r;
+}
+
+int Driver::list(std::map<std::string, DevInfo> *devices) {
+ size_t size = 1024;
+ ggate_drv_info *devs = nullptr;
+ int r;
+
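+  // grow the buffer and retry while the driver reports -ERANGE (the required entry count is returned in size)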
+ while (size <= 1024 * 1024) {
+ devs = static_cast<ggate_drv_info *>(
+ realloc(static_cast<void *>(devs), size * sizeof(*devs)));
+ r = ggate_drv_list(devs, &size);
+ if (r != -ERANGE) {
+ break;
+ }
+ }
+ if (r < 0) {
+ goto free;
+ }
+
+ devices->clear();
+ for (size_t i = 0; i < size; i++) {
+ auto &dev = devs[i];
+ (*devices)[dev.id] = {dev.name, dev.info};
+ }
+
+free:
+ free(devs);
+
+ return r;
+}
+
+Driver::Driver(const std::string &devname, size_t sectorsize, size_t mediasize,
+ bool readonly, const std::string &info)
+ : m_devname(devname), m_sectorsize(sectorsize), m_mediasize(mediasize),
+ m_readonly(readonly), m_info(info) {
+}
+
+int Driver::init() {
+ dout(20) << dendl;
+
+ char name[PATH_MAX];
+ size_t namelen;
+
+ if (m_devname.empty()) {
+ name[0] = '\0';
+ namelen = PATH_MAX;
+ } else {
+ namelen = m_devname.size();
+ if (namelen >= PATH_MAX) {
+ return -ENAMETOOLONG;
+ }
+ strncpy(name, m_devname.c_str(), namelen + 1);
+ }
+
+ int r = ggate_drv_create(name, namelen, m_sectorsize, m_mediasize, m_readonly,
+ m_info.c_str(), &m_drv);
+ if (r < 0) {
+ return r;
+ }
+
+ if (m_devname.empty()) {
+ m_devname = name;
+ }
+
+ return 0;
+}
+
+std::string Driver::get_devname() const {
+ dout(30) << m_devname << dendl;
+
+ return m_devname;
+}
+
+void Driver::shut_down() {
+ dout(20) << dendl;
+
+ ggate_drv_destroy(m_drv);
+}
+
+int Driver::resize(size_t newsize) {
+ dout(20) << "newsize=" << newsize << dendl;
+
+ int r = ggate_drv_resize(m_drv, newsize);
+ if (r < 0) {
+ return r;
+ }
+
+ m_mediasize = newsize;
+ return 0;
+}
+
+int Driver::recv(Request **req) {
+ dout(20) << dendl;
+
+ ggate_drv_req_t req_;
+
+ int r = ggate_drv_recv(m_drv, &req_);
+ if (r < 0) {
+ return r;
+ }
+
+ *req = new Request(req_);
+
+ dout(20) << "req=" << *req << dendl;
+
+ if (ggate_drv_req_cmd(req_) == GGATE_DRV_CMD_WRITE) {
+ bufferptr ptr(buffer::claim_malloc(
+ ggate_drv_req_length(req_),
+ static_cast<char *>(ggate_drv_req_release_buf(req_))));
+ (*req)->bl.push_back(ptr);
+ }
+
+ return 0;
+}
+
+int Driver::send(Request *req) {
+ dout(20) << "req=" << req << dendl;
+
+ if (ggate_drv_req_cmd(req->req) == GGATE_DRV_CMD_READ &&
+ ggate_drv_req_error(req->req) == 0) {
+ ceph_assert(req->bl.length() == ggate_drv_req_length(req->req));
+ // TODO: avoid copying?
+ req->bl.copy(0, ggate_drv_req_length(req->req),
+ static_cast<char *>(ggate_drv_req_buf(req->req)));
+ dout(20) << "copied resulting " << req->bl.length() << " bytes to "
+ << ggate_drv_req_buf(req->req) << dendl;
+ }
+
+ int r = ggate_drv_send(m_drv, req->req);
+
+ delete req;
+ return r;
+}
+
+} // namespace ggate
+} // namespace rbd
diff --git a/src/tools/rbd_ggate/Driver.h b/src/tools/rbd_ggate/Driver.h
new file mode 100644
index 00000000..50be72b9
--- /dev/null
+++ b/src/tools/rbd_ggate/Driver.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_DRIVER_H
+#define CEPH_RBD_GGATE_DRIVER_H
+
+#include <map>
+#include <string>
+
+#include "ggate_drv.h"
+
+namespace rbd {
+namespace ggate {
+
+struct Request;
+
+class Driver {
+public:
+ typedef std::pair<std::string, std::string> DevInfo;
+ static int load();
+ static int kill(const std::string &devname);
+ static int list(std::map<std::string, DevInfo> *devices);
+
+ Driver(const std::string &devname, size_t sectorsize, size_t mediasize,
+ bool readonly, const std::string &info);
+
+ int init();
+ void shut_down();
+
+ std::string get_devname() const;
+
+ int recv(Request **req);
+ int send(Request *req);
+
+ int resize(size_t newsize);
+
+private:
+ std::string m_devname;
+ size_t m_sectorsize;
+ size_t m_mediasize;
+ bool m_readonly;
+ std::string m_info;
+ ggate_drv_t m_drv = 0;
+};
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_DRIVER_H
+
diff --git a/src/tools/rbd_ggate/Request.h b/src/tools/rbd_ggate/Request.h
new file mode 100644
index 00000000..66f21985
--- /dev/null
+++ b/src/tools/rbd_ggate/Request.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_REQUEST_H
+#define CEPH_RBD_GGATE_REQUEST_H
+
+#include "ggate_drv.h"
+
+namespace rbd {
+namespace ggate {
+
+struct Request {
+ enum Command {
+ Unknown = 0,
+ Write = 1,
+ Read = 2,
+ Flush = 3,
+ Discard = 4,
+ };
+
+ ggate_drv_req_t req;
+ bufferlist bl;
+
+ Request(ggate_drv_req_t req) : req(req) {
+ }
+
+ uint64_t get_id() {
+ return ggate_drv_req_id(req);
+ }
+
+ Command get_cmd() {
+ return static_cast<Command>(ggate_drv_req_cmd(req));
+ }
+
+ size_t get_length() {
+ return ggate_drv_req_length(req);
+ }
+
+ uint64_t get_offset() {
+ return ggate_drv_req_offset(req);
+ }
+
+ uint64_t get_error() {
+ return ggate_drv_req_error(req);
+ }
+
+ void set_error(int error) {
+ ggate_drv_req_set_error(req, error);
+ }
+};
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_REQUEST_H
diff --git a/src/tools/rbd_ggate/Server.cc b/src/tools/rbd_ggate/Server.cc
new file mode 100644
index 00000000..3beeec3f
--- /dev/null
+++ b/src/tools/rbd_ggate/Server.cc
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "Driver.h"
+#include "Server.h"
+#include "Request.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate::Server: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace ggate {
+
+Server::Server(Driver *drv, librbd::Image& image)
+ : m_drv(drv), m_image(image), m_lock("rbd::ggate::Server::m_lock"),
+ m_reader_thread(this, &Server::reader_entry),
+ m_writer_thread(this, &Server::writer_entry) {
+}
+
+void Server::run() {
+ dout(10) << dendl;
+
+ int r = start();
+ ceph_assert(r == 0);
+
+ dout(20) << "entering run loop" << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ while (!m_stopping) {
+ m_cond.WaitInterval(m_lock, utime_t(1, 0));
+ }
+ }
+
+ dout(20) << "exiting run loop" << dendl;
+
+ stop();
+}
+
+int Server::start() {
+ dout(10) << dendl;
+
+ m_reader_thread.create("rbd_reader");
+ m_writer_thread.create("rbd_writer");
+ return 0;
+}
+
+void Server::stop() {
+ dout(10) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_stopping);
+ }
+
+ m_reader_thread.join();
+ m_writer_thread.join();
+
+ wait_clean();
+}
+
+void Server::io_start(IOContext *ctx) {
+ dout(20) << ctx << dendl;
+
+ Mutex::Locker locker(m_lock);
+ m_io_pending.push_back(&ctx->item);
+}
+
+void Server::io_finish(IOContext *ctx) {
+ dout(20) << ctx << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(ctx->item.is_on_list());
+
+ ctx->item.remove_myself();
+ m_io_finished.push_back(&ctx->item);
+ m_cond.Signal();
+}
+
+Server::IOContext *Server::wait_io_finish() {
+ dout(20) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ while (m_io_finished.empty() && !m_stopping) {
+ m_cond.Wait(m_lock);
+ }
+
+ if (m_io_finished.empty()) {
+ return nullptr;
+ }
+
+ IOContext *ret = m_io_finished.front();
+ m_io_finished.pop_front();
+
+ return ret;
+}
+
+void Server::wait_clean() {
+ dout(20) << dendl;
+
+ ceph_assert(!m_reader_thread.is_started());
+
+ Mutex::Locker locker(m_lock);
+
+ while (!m_io_pending.empty()) {
+ m_cond.Wait(m_lock);
+ }
+
+ while (!m_io_finished.empty()) {
+ std::unique_ptr<IOContext> free_ctx(m_io_finished.front());
+ m_io_finished.pop_front();
+ }
+}
+
+void Server::aio_callback(librbd::completion_t cb, void *arg) {
+ librbd::RBD::AioCompletion *aio_completion =
+ reinterpret_cast<librbd::RBD::AioCompletion*>(cb);
+
+ IOContext *ctx = reinterpret_cast<IOContext *>(arg);
+ int r = aio_completion->get_return_value();
+
+ ctx->server->handle_aio(ctx, r);
+ aio_completion->release();
+}
+
+void Server::handle_aio(IOContext *ctx, int r) {
+ dout(20) << ctx << ": r=" << r << dendl;
+
+ if (r == -EINVAL) {
+ // if shrinking an image, a pagecache writeback might reference
+ // extents outside of the range of the new image extents
+ dout(5) << "masking IO out-of-bounds error" << dendl;
+ ctx->req->bl.clear();
+ r = 0;
+ }
+
+ if (r < 0) {
+ ctx->req->set_error(-r);
+ } else if ((ctx->req->get_cmd() == Request::Read) &&
+ r != static_cast<int>(ctx->req->get_length())) {
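+    // short read from librbd: zero-fill the remainder so the full requested length is returned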
+ int pad_byte_count = static_cast<int> (ctx->req->get_length()) - r;
+ ctx->req->bl.append_zero(pad_byte_count);
+ dout(20) << ctx << ": pad byte count: " << pad_byte_count << dendl;
+ ctx->req->set_error(0);
+ } else {
+ ctx->req->set_error(0);
+ }
+ io_finish(ctx);
+}
+
+void Server::reader_entry() {
+ dout(20) << dendl;
+
+ while (!m_stopping) {
+ std::unique_ptr<IOContext> ctx(new IOContext(this));
+
+ dout(20) << "waiting for ggate request" << dendl;
+
+ int r = m_drv->recv(&ctx->req);
+ if (r < 0) {
+ if (r != -ECANCELED) {
+ derr << "recv: " << cpp_strerror(r) << dendl;
+ }
+ Mutex::Locker locker(m_lock);
+ m_stopping = true;
+ m_cond.Signal();
+ return;
+ }
+
+ IOContext *pctx = ctx.release();
+
+ dout(20) << pctx << ": start: " << *pctx << dendl;
+
+ io_start(pctx);
+ librbd::RBD::AioCompletion *c =
+ new librbd::RBD::AioCompletion(pctx, aio_callback);
+ switch (pctx->req->get_cmd())
+ {
+ case rbd::ggate::Request::Write:
+ m_image.aio_write(pctx->req->get_offset(), pctx->req->get_length(),
+ pctx->req->bl, c);
+ break;
+ case rbd::ggate::Request::Read:
+ m_image.aio_read(pctx->req->get_offset(), pctx->req->get_length(),
+ pctx->req->bl, c);
+ break;
+ case rbd::ggate::Request::Flush:
+ m_image.aio_flush(c);
+ break;
+ case rbd::ggate::Request::Discard:
+ m_image.aio_discard(pctx->req->get_offset(), pctx->req->get_length(), c);
+ break;
+ default:
+ derr << pctx << ": invalid request command: " << pctx->req->get_cmd()
+ << dendl;
+ c->release();
+ Mutex::Locker locker(m_lock);
+ m_stopping = true;
+ m_cond.Signal();
+ return;
+ }
+ }
+ dout(20) << "terminated" << dendl;
+}
+
+void Server::writer_entry() {
+ dout(20) << dendl;
+
+ while (!m_stopping) {
+ dout(20) << "waiting for io request" << dendl;
+
+ std::unique_ptr<IOContext> ctx(wait_io_finish());
+ if (!ctx) {
+ dout(20) << "no io requests, terminating" << dendl;
+ return;
+ }
+
+ dout(20) << ctx.get() << ": got: " << *ctx << dendl;
+
+ int r = m_drv->send(ctx->req);
+ if (r < 0) {
+ derr << ctx.get() << ": send: " << cpp_strerror(r) << dendl;
+ Mutex::Locker locker(m_lock);
+ m_stopping = true;
+ m_cond.Signal();
+ return;
+ }
+ dout(20) << ctx.get() << " finish" << dendl;
+ }
+ dout(20) << "terminated" << dendl;
+}
+
+std::ostream &operator<<(std::ostream &os, const Server::IOContext &ctx) {
+
+ os << "[" << ctx.req->get_id();
+
+ switch (ctx.req->get_cmd())
+ {
+ case rbd::ggate::Request::Write:
+ os << " Write ";
+ break;
+ case rbd::ggate::Request::Read:
+ os << " Read ";
+ break;
+ case rbd::ggate::Request::Flush:
+ os << " Flush ";
+ break;
+ case rbd::ggate::Request::Discard:
+ os << " Discard ";
+ break;
+ default:
+      os << " Unknown(" << ctx.req->get_cmd() << ") ";
+ break;
+ }
+
+ os << ctx.req->get_offset() << "~" << ctx.req->get_length() << " "
+ << ctx.req->get_error() << "]";
+
+ return os;
+}
+
+} // namespace ggate
+} // namespace rbd
+
diff --git a/src/tools/rbd_ggate/Server.h b/src/tools/rbd_ggate/Server.h
new file mode 100644
index 00000000..8ed4f512
--- /dev/null
+++ b/src/tools/rbd_ggate/Server.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_SERVER_H
+#define CEPH_RBD_GGATE_SERVER_H
+
+#include "include/rbd/librbd.hpp"
+#include "include/xlist.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+
+namespace rbd {
+namespace ggate {
+
+class Driver;
+struct Request;
+
+class Server {
+public:
+ Server(Driver *drv, librbd::Image& image);
+
+ void run();
+
+private:
+ struct IOContext {
+ xlist<IOContext*>::item item;
+ Server *server;
+ Request *req = nullptr;
+
+ IOContext(Server *server) : item(this), server(server) {
+ }
+ };
+
+ class ThreadHelper : public Thread {
+ public:
+ typedef void (Server::*entry_func)();
+
+ ThreadHelper(Server *server, entry_func func)
+ : server(server), func(func) {
+ }
+
+ protected:
+ virtual void* entry() {
+ (server->*func)();
+ return nullptr;
+ }
+
+ private:
+ Server *server;
+ entry_func func;
+ };
+
+ friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx);
+
+ Driver *m_drv;
+ librbd::Image &m_image;
+
+ mutable Mutex m_lock;
+ Cond m_cond;
+ bool m_stopping = false;
+ ThreadHelper m_reader_thread, m_writer_thread;
+ xlist<IOContext*> m_io_pending;
+ xlist<IOContext*> m_io_finished;
+
+ static void aio_callback(librbd::completion_t cb, void *arg);
+
+ int start();
+ void stop();
+
+ void reader_entry();
+ void writer_entry();
+
+ void io_start(IOContext *ctx);
+ void io_finish(IOContext *ctx);
+
+ IOContext *wait_io_finish();
+ void wait_clean();
+
+ void handle_aio(IOContext *ctx, int r);
+};
+
+std::ostream &operator<<(std::ostream &os, const Server::IOContext &ctx);
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_SERVER_H
diff --git a/src/tools/rbd_ggate/Watcher.cc b/src/tools/rbd_ggate/Watcher.cc
new file mode 100644
index 00000000..57b3f960
--- /dev/null
+++ b/src/tools/rbd_ggate/Watcher.cc
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "Driver.h"
+#include "Watcher.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate::Watcher: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace ggate {
+
+Watcher::Watcher(Driver *drv, librados::IoCtx &ioctx, librbd::Image &image,
+ size_t size)
+ : m_drv(drv), m_ioctx(ioctx), m_image(image), m_size(size) {
+}
+
+void Watcher::handle_notify() {
+ dout(20) << dendl;
+
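+  // on an image update notification, re-stat the image and propagate any size change to the ggate device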
+ librbd::image_info_t info;
+
+ if (m_image.stat(info, sizeof(info)) == 0) {
+ size_t new_size = info.size;
+
+ if (new_size != m_size) {
+ int r = m_drv->resize(new_size);
+ if (r < 0) {
+ derr << "resize failed: " << cpp_strerror(r) << dendl;
+ m_drv->shut_down();
+ }
+ r = m_image.invalidate_cache();
+ if (r < 0) {
+ derr << "invalidate rbd cache failed: " << cpp_strerror(r) << dendl;
+ m_drv->shut_down();
+ }
+ m_size = new_size;
+ }
+ }
+}
+
+} // namespace ggate
+} // namespace rbd
diff --git a/src/tools/rbd_ggate/Watcher.h b/src/tools/rbd_ggate/Watcher.h
new file mode 100644
index 00000000..8f524b43
--- /dev/null
+++ b/src/tools/rbd_ggate/Watcher.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_WATCHER_H
+#define CEPH_RBD_GGATE_WATCHER_H
+
+#include "include/rbd/librbd.hpp"
+
+namespace rbd {
+namespace ggate {
+
+class Driver;
+
+class Watcher : public librbd::UpdateWatchCtx
+{
+public:
+ Watcher(Driver *m_drv, librados::IoCtx &ioctx, librbd::Image &image,
+ size_t size);
+
+ void handle_notify() override;
+
+private:
+ Driver *m_drv;
+ librados::IoCtx &m_ioctx;
+ librbd::Image &m_image;
+ size_t m_size;
+};
+
+
+} // namespace ggate
+} // namespace rbd
+
+#endif // CEPH_RBD_GGATE_WATCHER_H
+
diff --git a/src/tools/rbd_ggate/debug.cc b/src/tools/rbd_ggate/debug.cc
new file mode 100644
index 00000000..b675ba5b
--- /dev/null
+++ b/src/tools/rbd_ggate/debug.cc
@@ -0,0 +1,55 @@
+#include "common/debug.h"
+#include "common/errno.h"
+#include "debug.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::ggate: "
+
+extern "C" void debugv(int level, const char *fmt, va_list ap) {
+ char *msg;
+ int saved_errno = errno;
+
+ if (g_ceph_context == nullptr) {
+ return;
+ }
+
+ vasprintf(&msg, fmt, ap);
+
+ dout(ceph::dout::need_dynamic(level)) << msg << dendl;
+
+ free(msg);
+ errno = saved_errno;
+}
+
+extern "C" void debug(int level, const char *fmt, ...) {
+ va_list ap;
+
+ va_start(ap, fmt);
+ debugv(level, fmt, ap);
+ va_end(ap);
+}
+
+extern "C" void errx(const char *fmt, ...) {
+ va_list ap;
+
+ va_start(ap, fmt);
+ debugv(-1, fmt, ap);
+ va_end(ap);
+}
+
+extern "C" void err(const char *fmt, ...) {
+ va_list ap;
+ char *msg;
+ int saved_errno = errno;
+
+ va_start(ap, fmt);
+ vasprintf(&msg, fmt, ap);
+ va_end(ap);
+ errno = saved_errno;
+
+ errx("%s: %s", msg, cpp_strerror(errno).c_str());
+
+ free(msg);
+}
diff --git a/src/tools/rbd_ggate/debug.h b/src/tools/rbd_ggate/debug.h
new file mode 100644
index 00000000..da9b46a3
--- /dev/null
+++ b/src/tools/rbd_ggate/debug.h
@@ -0,0 +1,17 @@
+#ifndef CEPH_RBD_GGATE_DEBUG_H
+#define CEPH_RBD_GGATE_DEBUG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void debug(int level, const char *fmt, ...) __printflike(2, 3);
+void debugv(int level, const char *fmt, va_list ap) __printflike(2, 0);
+void err(const char *fmt, ...) __printflike(1, 2);
+void errx(const char *fmt, ...) __printflike(1, 2);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CEPH_RBD_GGATE_DEBUG_H
diff --git a/src/tools/rbd_ggate/ggate_drv.c b/src/tools/rbd_ggate/ggate_drv.c
new file mode 100644
index 00000000..b1faccd2
--- /dev/null
+++ b/src/tools/rbd_ggate/ggate_drv.c
@@ -0,0 +1,379 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/linker.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <geom/gate/g_gate.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgeom.h>
+
+#include "debug.h"
+#include "ggate_drv.h"
+
+uint64_t ggate_drv_req_id(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ return ggio->gctl_seq;
+}
+
+int ggate_drv_req_cmd(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ switch (ggio->gctl_cmd) {
+ case BIO_WRITE:
+ return GGATE_DRV_CMD_WRITE;
+ case BIO_READ:
+ return GGATE_DRV_CMD_READ;
+ case BIO_FLUSH:
+ return GGATE_DRV_CMD_FLUSH;
+ case BIO_DELETE:
+ return GGATE_DRV_CMD_DISCARD;
+ default:
+ return GGATE_DRV_CMD_UNKNOWN;
+ }
+}
+
+uint64_t ggate_drv_req_offset(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ return ggio->gctl_offset;
+}
+
+size_t ggate_drv_req_length(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ return ggio->gctl_length;
+}
+
+void *ggate_drv_req_buf(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ return ggio->gctl_data;
+}
+
+int ggate_drv_req_error(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ return ggio->gctl_error;
+}
+
+void ggate_drv_req_set_error(ggate_drv_req_t req, int error) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ ggio->gctl_error = error;
+}
+
+void *ggate_drv_req_release_buf(ggate_drv_req_t req) {
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+
+ void *data = ggio->gctl_data;
+ ggio->gctl_data = NULL;
+
+ return data;
+}
+
+struct ggate_drv {
+ int fd;
+ int unit;
+};
+
+int ggate_drv_load() {
+ if (modfind("g_gate") != -1) {
+ /* Present in kernel. */
+ return 0;
+ }
+
+ if (kldload("geom_gate") == -1 || modfind("g_gate") == -1) {
+ if (errno != EEXIST) {
+ err("failed to load geom_gate module");
+ return -errno;
+ }
+ }
+ return 0;
+}
+
+int ggate_drv_create(char *name, size_t namelen, size_t sectorsize,
+ size_t mediasize, bool readonly, const char *info, ggate_drv_t *drv_) {
+ struct ggate_drv *drv;
+ struct g_gate_ctl_create ggiocreate;
+
+ debug(20, "%s: name=%s, sectorsize=%zd, mediasize=%zd, readonly=%d, info=%s",
+ __func__, name, sectorsize, mediasize, (int)readonly, info);
+
+ if (*name != '\0') {
+ if (namelen > sizeof(ggiocreate.gctl_name) - 1) {
+ return -ENAMETOOLONG;
+ }
+ }
+
+ /*
+ * We communicate with ggate via /dev/ggctl. Open it.
+ */
+ int fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+ if (fd == -1) {
+ err("failed to open /dev/" G_GATE_CTL_NAME);
+ return -errno;
+ }
+
+ drv = calloc(1, sizeof(*drv));
+ if (drv == NULL) {
+    errno = ENOMEM; /* fail_close path returns -errno */
+ goto fail_close;
+ }
+
+ /*
+ * Create provider.
+ */
+ memset(&ggiocreate, 0, sizeof(ggiocreate));
+ ggiocreate.gctl_version = G_GATE_VERSION;
+ ggiocreate.gctl_mediasize = mediasize;
+ ggiocreate.gctl_sectorsize = sectorsize;
+ ggiocreate.gctl_flags = readonly ? G_GATE_FLAG_READONLY : 0;
+ ggiocreate.gctl_maxcount = 0;
+ ggiocreate.gctl_timeout = 0;
+ if (*name != '\0') {
+ ggiocreate.gctl_unit = G_GATE_NAME_GIVEN;
+ strlcpy(ggiocreate.gctl_name, name, sizeof(ggiocreate.gctl_name));
+ } else {
+ ggiocreate.gctl_unit = G_GATE_UNIT_AUTO;
+ }
+ strlcpy(ggiocreate.gctl_info, info, sizeof(ggiocreate.gctl_info));
+ if (ioctl(fd, G_GATE_CMD_CREATE, &ggiocreate) == -1) {
+ err("failed to create " G_GATE_PROVIDER_NAME " device");
+ goto fail;
+ }
+
+ debug(20, "%s: created, unit: %d, name: %s", __func__, ggiocreate.gctl_unit,
+ ggiocreate.gctl_name);
+
+ drv->fd = fd;
+ drv->unit = ggiocreate.gctl_unit;
+ *drv_ = drv;
+
+ if (*name == '\0') {
+ snprintf(name, namelen, "%s%d", G_GATE_PROVIDER_NAME, drv->unit);
+ }
+
+ return 0;
+
+fail:
+ free(drv);
+fail_close:
+ close(fd);
+ return -errno;
+}
+
+void ggate_drv_destroy(ggate_drv_t drv_) {
+ struct ggate_drv *drv = (struct ggate_drv *)drv_;
+ struct g_gate_ctl_destroy ggiodestroy;
+
+ debug(20, "%s %p", __func__, drv);
+
+ memset(&ggiodestroy, 0, sizeof(ggiodestroy));
+ ggiodestroy.gctl_version = G_GATE_VERSION;
+ ggiodestroy.gctl_unit = drv->unit;
+ ggiodestroy.gctl_force = 1;
+
+ // Remember errno.
+ int rerrno = errno;
+
+ int r = ioctl(drv->fd, G_GATE_CMD_DESTROY, &ggiodestroy);
+ if (r == -1) {
+ err("failed to destroy /dev/%s%d device", G_GATE_PROVIDER_NAME,
+ drv->unit);
+ }
+ // Restore errno.
+ errno = rerrno;
+
+ free(drv);
+}
+
+int ggate_drv_resize(ggate_drv_t drv_, size_t newsize) {
+ struct ggate_drv *drv = (struct ggate_drv *)drv_;
+
+ debug(20, "%s %p: newsize=%zd", __func__, drv, newsize);
+
+ struct g_gate_ctl_modify ggiomodify;
+
+ memset(&ggiomodify, 0, sizeof(ggiomodify));
+ ggiomodify.gctl_version = G_GATE_VERSION;
+ ggiomodify.gctl_unit = drv->unit;
+ ggiomodify.gctl_modify = GG_MODIFY_MEDIASIZE;
+ ggiomodify.gctl_mediasize = newsize;
+
+ int r = ioctl(drv->fd, G_GATE_CMD_MODIFY, &ggiomodify);
+ if (r == -1) {
+ r = -errno;
+ err("failed to resize /dev/%s%d device", G_GATE_PROVIDER_NAME, drv->unit);
+ }
+ return r;
+}
+
+int ggate_drv_kill(const char *devname) {
+ debug(20, "%s %s", __func__, devname);
+
+ int fd = open("/dev/" G_GATE_CTL_NAME, O_RDWR);
+ if (fd == -1) {
+ err("failed to open /dev/" G_GATE_CTL_NAME);
+ return -errno;
+ }
+
+ struct g_gate_ctl_destroy ggiodestroy;
+ memset(&ggiodestroy, 0, sizeof(ggiodestroy));
+ ggiodestroy.gctl_version = G_GATE_VERSION;
+ ggiodestroy.gctl_unit = G_GATE_NAME_GIVEN;
+ ggiodestroy.gctl_force = 1;
+
+ strlcpy(ggiodestroy.gctl_name, devname, sizeof(ggiodestroy.gctl_name));
+
+ int r = ioctl(fd, G_GATE_CMD_DESTROY, &ggiodestroy);
+ if (r == -1) {
+ r = -errno;
+ err("failed to destroy %s device", devname);
+ }
+
+ close(fd);
+ return r;
+}
+
+int ggate_drv_recv(ggate_drv_t drv_, ggate_drv_req_t *req) {
+ struct ggate_drv *drv = (struct ggate_drv *)drv_;
+ struct g_gate_ctl_io *ggio;
+ int error, r;
+
+ debug(20, "%s", __func__);
+
+ ggio = calloc(1, sizeof(*ggio));
+ if (ggio == NULL) {
+ return -ENOMEM;
+ }
+
+ ggio->gctl_version = G_GATE_VERSION;
+ ggio->gctl_unit = drv->unit;
+ ggio->gctl_data = malloc(MAXPHYS);
+ ggio->gctl_length = MAXPHYS;
+
+ debug(20, "%s: waiting for request from kernel", __func__);
+  if (ioctl(drv->fd, G_GATE_CMD_START, ggio) == -1) {
+    err("%s: G_GATE_CMD_START failed", __func__);
+    r = -errno;
+    goto fail;
+  }
+
+ debug(20, "%s: got request from kernel: "
+ "unit=%d, seq=%ju, cmd=%u, offset=%ju, length=%ju, error=%d, data=%p",
+ __func__, ggio->gctl_unit, (uintmax_t)ggio->gctl_seq, ggio->gctl_cmd,
+ (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length,
+ ggio->gctl_error, ggio->gctl_data);
+
+ error = ggio->gctl_error;
+ switch (error) {
+ case 0:
+ break;
+ case ECANCELED:
+ debug(10, "%s: canceled: exit gracefully", __func__);
+ r = -error;
+ goto fail;
+ case ENOMEM:
+ /*
+ * Buffer too small? Impossible, we allocate MAXPHYS
+ * bytes - request can't be bigger than that.
+ */
+ /* FALLTHROUGH */
+ case ENXIO:
+ default:
+ errno = error;
+ err("%s: G_GATE_CMD_START failed", __func__);
+ r = -error;
+ goto fail;
+ }
+
+ *req = ggio;
+ return 0;
+
+fail:
+ free(ggio->gctl_data);
+ free(ggio);
+ return r;
+}
+
+int ggate_drv_send(ggate_drv_t drv_, ggate_drv_req_t req) {
+ struct ggate_drv *drv = (struct ggate_drv *)drv_;
+ struct g_gate_ctl_io *ggio = (struct g_gate_ctl_io *)req;
+ int r = 0;
+
+ debug(20, "%s: send request to kernel: "
+ "unit=%d, seq=%ju, cmd=%u, offset=%ju, length=%ju, error=%d, data=%p",
+ __func__, ggio->gctl_unit, (uintmax_t)ggio->gctl_seq, ggio->gctl_cmd,
+ (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length,
+ ggio->gctl_error, ggio->gctl_data);
+
+ if (ioctl(drv->fd, G_GATE_CMD_DONE, ggio) == -1) {
+ err("%s: G_GATE_CMD_DONE failed", __func__);
+ r = -errno;
+ }
+
+ free(ggio->gctl_data);
+ free(ggio);
+ return r;
+}
+
+static const char * get_conf(struct ggeom *gp, const char *name) {
+ struct gconfig *conf;
+
+ LIST_FOREACH(conf, &gp->lg_config, lg_config) {
+ if (strcmp(conf->lg_name, name) == 0)
+ return (conf->lg_val);
+ }
+ return "";
+}
+
+int ggate_drv_list(struct ggate_drv_info *info, size_t *size) {
+ struct gmesh mesh;
+ struct gclass *class;
+ struct ggeom *gp;
+ int r;
+ size_t max_size;
+
+ r = geom_gettree(&mesh);
+ if (r != 0) {
+ return -errno;
+ }
+
+ max_size = *size;
+ *size = 0;
+
+ LIST_FOREACH(class, &mesh.lg_class, lg_class) {
+ if (strcmp(class->lg_name, G_GATE_CLASS_NAME) == 0) {
+ LIST_FOREACH(gp, &class->lg_geom, lg_geom) {
+ (*size)++;
+ }
+ if (*size > max_size) {
+ r = -ERANGE;
+ goto done;
+ }
+ LIST_FOREACH(gp, &class->lg_geom, lg_geom) {
+ strlcpy(info->id, get_conf(gp, "unit"), sizeof(info->id));
+ strlcpy(info->name, gp->lg_name, sizeof(info->name));
+ strlcpy(info->info, get_conf(gp, "info"), sizeof(info->info));
+ info++;
+ }
+ }
+ }
+
+done:
+ geom_deletetree(&mesh);
+ return r;
+}
diff --git a/src/tools/rbd_ggate/ggate_drv.h b/src/tools/rbd_ggate/ggate_drv.h
new file mode 100644
index 00000000..a32f5113
--- /dev/null
+++ b/src/tools/rbd_ggate/ggate_drv.h
@@ -0,0 +1,64 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_GGATE_GGATE_DRV_H
+#define CEPH_RBD_GGATE_GGATE_DRV_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef void *ggate_drv_t;
+typedef void *ggate_drv_req_t;
+
+/*
+ * GGATE driver commands. They are mapped to GgateReq::Command.
+ */
+enum {
+ GGATE_DRV_CMD_UNKNOWN = 0,
+ GGATE_DRV_CMD_WRITE = 1,
+ GGATE_DRV_CMD_READ = 2,
+ GGATE_DRV_CMD_FLUSH = 3,
+ GGATE_DRV_CMD_DISCARD = 4,
+};
+
+struct ggate_drv_info {
+ char id[16];
+ char name[NAME_MAX];
+ char info[2048]; /* G_GATE_INFOSIZE */
+};
+
+uint64_t ggate_drv_req_id(ggate_drv_req_t req);
+int ggate_drv_req_cmd(ggate_drv_req_t req);
+void *ggate_drv_req_buf(ggate_drv_req_t req);
+size_t ggate_drv_req_length(ggate_drv_req_t req);
+uint64_t ggate_drv_req_offset(ggate_drv_req_t req);
+int ggate_drv_req_error(ggate_drv_req_t req);
+
+void ggate_drv_req_set_error(ggate_drv_req_t req, int error);
+void *ggate_drv_req_release_buf(ggate_drv_req_t req);
+
+int ggate_drv_load();
+
+int ggate_drv_create(char *name, size_t namelen, size_t sectorsize,
+ size_t mediasize, bool readonly, const char *info, ggate_drv_t *drv);
+void ggate_drv_destroy(ggate_drv_t drv);
+
+int ggate_drv_recv(ggate_drv_t drv, ggate_drv_req_t *req);
+int ggate_drv_send(ggate_drv_t drv, ggate_drv_req_t req);
+
+int ggate_drv_resize(ggate_drv_t drv, size_t newsize);
+
+int ggate_drv_kill(const char *devname);
+int ggate_drv_list(struct ggate_drv_info *info, size_t *size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CEPH_RBD_GGATE_GGATE_DRV_H
diff --git a/src/tools/rbd_ggate/main.cc b/src/tools/rbd_ggate/main.cc
new file mode 100644
index 00000000..5ed582fb
--- /dev/null
+++ b/src/tools/rbd_ggate/main.cc
@@ -0,0 +1,521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+
+#include <iostream>
+#include <memory>
+#include <boost/algorithm/string/predicate.hpp>
+#include <regex>
+
+#include "common/Formatter.h"
+#include "common/Preforker.h"
+#include "common/TextTable.h"
+#include "common/ceph_argparse.h"
+#include "common/config_proxy.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "include/stringify.h"
+
+#include "Driver.h"
+#include "Server.h"
+#include "Watcher.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd-ggate: " << __func__ << ": "
+
+static void usage() {
+ std::cout << "Usage: rbd-ggate [options] map <image-or-snap-spec> Map an image to ggate device\n"
+ << " unmap <device path> Unmap ggate device\n"
+ << " list List mapped ggate devices\n"
+ << "\n"
+ << "Map options:\n"
+ << " --device <device path> Specify ggate device path\n"
+ << " --read-only Map readonly\n"
+ << " --exclusive Forbid writes by other clients\n"
+ << "\n"
+ << "List options:\n"
+ << " --format plain|json|xml Output format (default: plain)\n"
+ << " --pretty-format Pretty formatting (json and xml)\n"
+ << std::endl;
+ generic_server_usage();
+}
+
+static std::string devpath, poolname, nsname, imgname, snapname;
+static bool readonly = false;
+static bool exclusive = false;
+
+static std::unique_ptr<rbd::ggate::Driver> drv;
+
+static void handle_signal(int signum)
+{
+ derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
+
+ ceph_assert(signum == SIGINT || signum == SIGTERM);
+ ceph_assert(drv);
+
+ drv->shut_down();
+}
+
+static int do_map(int argc, const char *argv[])
+{
+ int r;
+
+ librados::Rados rados;
+ librbd::RBD rbd;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+
+ librbd::image_info_t info;
+ std::string desc;
+
+ Preforker forker;
+
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+ g_ceph_context->_conf.set_val_or_die("pid_file", "");
+
+ if (global_init_prefork(g_ceph_context) >= 0) {
+ std::string err;
+ r = forker.prefork(err);
+ if (r < 0) {
+ std::cerr << err << std::endl;
+ return r;
+ }
+ if (forker.is_parent()) {
+ if (forker.parent_wait(err) != 0) {
+ return -ENXIO;
+ }
+ return 0;
+ }
+ global_init_postfork_start(g_ceph_context);
+ }
+
+ common_init_finish(g_ceph_context);
+ global_init_chdir(g_ceph_context);
+
+ if (poolname.empty()) {
+ poolname = g_ceph_context->_conf.get_val<std::string>("rbd_default_pool");
+ }
+
+ std::string devname = boost::starts_with(devpath, "/dev/") ?
+ devpath.substr(5) : devpath;
+ std::unique_ptr<rbd::ggate::Watcher> watcher;
+ uint64_t handle;
+
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ goto done;
+ }
+
+ r = rados.connect();
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to connect to cluster: " << cpp_strerror(r)
+ << std::endl;
+ goto done;
+ }
+
+ r = rados.ioctx_create(poolname.c_str(), io_ctx);
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to acces pool " << poolname << ": "
+ << cpp_strerror(r) << std::endl;
+ goto done;
+ }
+
+ io_ctx.set_namespace(nsname);
+
+ r = rbd.open(io_ctx, image, imgname.c_str());
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to open image " << imgname << ": "
+ << cpp_strerror(r) << std::endl;
+ goto done;
+ }
+
+ if (exclusive) {
+ r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to acquire exclusive lock: "
+ << cpp_strerror(r) << std::endl;
+ goto done;
+ }
+ }
+
+ desc = "RBD " + poolname + "/" + (nsname.empty() ? "" : nsname + "/") +
+ imgname;
+
+ if (!snapname.empty()) {
+ r = image.snap_set(snapname.c_str());
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to set snapshot " << snapname << ": "
+ << cpp_strerror(r) << std::endl;
+ goto done;
+ }
+ readonly = true;
+ desc += "@" + snapname;
+ }
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0) {
+ std::cerr << "rbd-ggate: image stat failed: " << cpp_strerror(r)
+ << std::endl;
+ goto done;
+ }
+
+ rbd::ggate::Driver::load();
+ drv.reset(new rbd::ggate::Driver(devname, 512, info.size, readonly, desc));
+ r = drv->init();
+ if (r < 0) {
+ r = -errno;
+ std::cerr << "rbd-ggate: failed to create ggate device: " << cpp_strerror(r)
+ << std::endl;
+ goto done;
+ }
+
+ watcher.reset(new rbd::ggate::Watcher(drv.get(), io_ctx, image, info.size));
+ r = image.update_watch(watcher.get(), &handle);
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to set watcher: " << cpp_strerror(r)
+ << std::endl;
+ drv->shut_down();
+ goto done;
+ }
+
+ std::cout << "/dev/" << drv->get_devname() << std::endl;
+
+ if (g_conf()->daemonize) {
+ global_init_postfork_finish(g_ceph_context);
+ forker.daemonize();
+ }
+
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, sighup_handler);
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+ rbd::ggate::Server(drv.get(), image).run();
+
+ unregister_async_signal_handler(SIGHUP, sighup_handler);
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ r = image.update_unwatch(handle);
+ ceph_assert(r == 0);
+
+done:
+ image.close();
+ io_ctx.close();
+ rados.shutdown();
+
+ if (r < 0) {
+ std::cerr << "rbd-ggate: failed to map: " << cpp_strerror(r) << std::endl;
+ }
+
+ forker.exit(r < 0 ? EXIT_FAILURE : 0);
+ // Unreachable;
+ return r;
+}
+
+static int do_unmap()
+{
+ std::string devname = boost::starts_with(devpath, "/dev/") ?
+ devpath.substr(5) : devpath;
+
+ int r = rbd::ggate::Driver::kill(devname);
+ if (r < 0) {
+ cerr << "rbd-ggate: failed to destroy " << devname << ": "
+ << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
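+// Parses an image spec of the form [<pool>/[<namespace>/]]<image>[@<snap>]
+// (e.g. "mypool/myns/myimage@mysnap").  Components missing from the spec
+// leave the corresponding output parameters untouched.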
+static int parse_imgpath(const std::string &imgpath, std::string *poolname,
+ std::string *nsname, std::string *imgname,
+ std::string *snapname) {
+ std::regex pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$");
+ std::smatch match;
+ if (!std::regex_match(imgpath, match, pattern)) {
+ std::cerr << "rbd-ggate: invalid spec '" << imgpath << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (match[1].matched) {
+ *poolname = match[1];
+ }
+
+ if (match[2].matched) {
+ *nsname = match[2];
+ }
+
+ *imgname = match[3];
+
+ if (match[4].matched) {
+ *snapname = match[4];
+ }
+
+ return 0;
+}
+
+static bool find_mapped_dev_by_spec(const std::string &spec,
+ std::string *devname) {
+ std::string poolname, nsname, imgname, snapname;
+ int r = parse_imgpath(spec, &poolname, &nsname, &imgname, &snapname);
+ if (r < 0) {
+ return false;
+ }
+ if (poolname.empty()) {
+    // We could use the rbd_default_pool config option to fill in the
+    // pool name, but that would require initializing the global
+    // context, so for now the user must specify the pool explicitly.
+    // Fortunately the preferred way to invoke rbd-ggate is via the
+    // rbd CLI, which takes care of setting the pool name.
+ return false;
+ }
+
+ std::map<std::string, rbd::ggate::Driver::DevInfo> devs;
+ r = rbd::ggate::Driver::list(&devs);
+ if (r < 0) {
+ return false;
+ }
+
+ for (auto &it : devs) {
+ auto &name = it.second.first;
+ auto &info = it.second.second;
+ if (!boost::starts_with(info, "RBD ")) {
+ continue;
+ }
+
+ std::string p, n, i, s;
+ parse_imgpath(info.substr(4), &p, &n, &i, &s);
+ if (p == poolname && n == nsname && i == imgname && s == snapname) {
+ *devname = name;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int do_list(const std::string &format, bool pretty_format)
+{
+ rbd::ggate::Driver::load();
+
+ std::map<std::string, rbd::ggate::Driver::DevInfo> devs;
+ int r = rbd::ggate::Driver::list(&devs);
+ if (r < 0) {
+    return r;
+ }
+
+ std::unique_ptr<ceph::Formatter> f;
+ TextTable tbl;
+
+ if (format == "json") {
+ f.reset(new JSONFormatter(pretty_format));
+ } else if (format == "xml") {
+ f.reset(new XMLFormatter(pretty_format));
+ } else if (!format.empty() && format != "plain") {
+ std::cerr << "rbd-ggate: invalid output format: " << format << std::endl;
+ return -EINVAL;
+ }
+
+ if (f) {
+ f->open_array_section("devices");
+ } else {
+ tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ int count = 0;
+
+ for (auto &it : devs) {
+ auto &id = it.first;
+ auto &name = it.second.first;
+ auto &info = it.second.second;
+ if (!boost::starts_with(info, "RBD ")) {
+ continue;
+ }
+
+ std::string poolname;
+ std::string nsname;
+ std::string imgname;
+ std::string snapname(f ? "" : "-");
+ parse_imgpath(info.substr(4), &poolname, &nsname, &imgname, &snapname);
+
+ if (f) {
+ f->open_object_section("device");
+ f->dump_string("id", id);
+ f->dump_string("pool", poolname);
+ f->dump_string("namespace", nsname);
+ f->dump_string("image", imgname);
+ f->dump_string("snap", snapname);
+ f->dump_string("device", "/dev/" + name);
+ f->close_section();
+ } else {
+ tbl << id << poolname << nsname << imgname << snapname << "/dev/" + name
+ << TextTable::endrow;
+ }
+ count++;
+ }
+
+ if (f) {
+ f->close_section(); // devices
+ f->flush(std::cout);
+ } else if (count > 0) {
+ std::cout << tbl;
+ }
+
+ return 0;
+}
+
+int main(int argc, const char *argv[]) {
+ int r;
+ enum {
+ None,
+ Connect,
+ Disconnect,
+ List
+ } cmd = None;
+
+ vector<const char*> args;
+
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+ // filter out ceph config options
+ ConfigProxy{false}.parse_argv(args);
+
+ std::string format;
+ bool pretty_format = false;
+ std::vector<const char*>::iterator i;
+
+ for (i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+ usage();
+ return 0;
+ } else if (ceph_argparse_witharg(args, i, &devpath, "--device",
+ (char *)NULL)) {
+ } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+ readonly = true;
+ } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) {
+ exclusive = true;
+ } else if (ceph_argparse_witharg(args, i, &format, "--format",
+ (char *)NULL)) {
+ } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) {
+ pretty_format = true;
+ } else {
+ ++i;
+ }
+ }
+
+ if (args.begin() != args.end()) {
+ if (strcmp(*args.begin(), "map") == 0) {
+ cmd = Connect;
+ } else if (strcmp(*args.begin(), "unmap") == 0) {
+ cmd = Disconnect;
+ } else if (strcmp(*args.begin(), "list") == 0) {
+ cmd = List;
+ } else {
+ cerr << "rbd-ggate: unknown command: " << *args.begin() << std::endl;
+ return EXIT_FAILURE;
+ }
+ args.erase(args.begin());
+ }
+
+ if (cmd == None) {
+ cerr << "rbd-ggate: must specify command" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ switch (cmd) {
+ case Connect:
+ if (args.begin() == args.end()) {
+ cerr << "rbd-ggate: must specify image-or-snap-spec" << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (parse_imgpath(*args.begin(), &poolname, &nsname, &imgname,
+ &snapname) < 0) {
+ return EXIT_FAILURE;
+ }
+ args.erase(args.begin());
+ break;
+ case Disconnect:
+ if (args.begin() == args.end()) {
+ std::cerr << "rbd-ggate: must specify ggate device or image-or-snap-spec"
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (boost::starts_with(*args.begin(), "/dev/") ||
+ !find_mapped_dev_by_spec(*args.begin(), &devpath)) {
+ devpath = *args.begin();
+ }
+ args.erase(args.begin());
+ break;
+ default:
+ break;
+ }
+
+ if (args.begin() != args.end()) {
+ cerr << "rbd-ggate: unknown args: " << *args.begin() << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ switch (cmd) {
+ case Connect:
+ if (imgname.empty()) {
+ cerr << "rbd-ggate: image name was not specified" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ r = do_map(argc, argv);
+ if (r < 0)
+ return EXIT_FAILURE;
+ break;
+ case Disconnect:
+ r = do_unmap();
+ if (r < 0)
+ return EXIT_FAILURE;
+ break;
+ case List:
+ r = do_list(format, pretty_format);
+ if (r < 0)
+ return EXIT_FAILURE;
+ break;
+ default:
+ usage();
+ return EXIT_FAILURE;
+ }
+
+ return 0;
+}
diff --git a/src/tools/rbd_mirror/BaseRequest.h b/src/tools/rbd_mirror/BaseRequest.h
new file mode 100644
index 00000000..5053eb83
--- /dev/null
+++ b/src/tools/rbd_mirror/BaseRequest.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_BASE_REQUEST_H
+#define CEPH_RBD_MIRROR_BASE_REQUEST_H
+
+#include "common/RefCountedObj.h"
+#include "include/Context.h"
+
+namespace rbd {
+namespace mirror {
+
+class BaseRequest : public RefCountedObject {
+public:
+ BaseRequest(const std::string& name, CephContext *cct, Context *on_finish)
+ : RefCountedObject(cct, 1), m_name(name), m_cct(cct),
+ m_on_finish(on_finish) {
+ }
+
+ virtual void send() = 0;
+ virtual void cancel() {}
+
+protected:
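+  // completes the on_finish context (if any) and drops the request's
+  // reference (put())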
+ virtual void finish(int r) {
+ if (m_cct) {
+ lsubdout(m_cct, rbd_mirror, 20) << m_name << "::finish: r=" << r << dendl;
+ }
+ if (m_on_finish) {
+ m_on_finish->complete(r);
+ }
+ put();
+ }
+
+private:
+ const std::string m_name;
+ CephContext *m_cct;
+ Context *m_on_finish;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_BASE_REQUEST_H
diff --git a/src/tools/rbd_mirror/CMakeLists.txt b/src/tools/rbd_mirror/CMakeLists.txt
new file mode 100644
index 00000000..30106a86
--- /dev/null
+++ b/src/tools/rbd_mirror/CMakeLists.txt
@@ -0,0 +1,69 @@
+add_library(rbd_mirror_types STATIC
+ image_map/Types.cc
+ instance_watcher/Types.cc
+ leader_watcher/Types.cc)
+
+set(rbd_mirror_internal
+ ClusterWatcher.cc
+ ImageDeleter.cc
+ ImageMap.cc
+ ImageReplayer.cc
+ ImageSync.cc
+ ImageSyncThrottler.cc
+ InstanceReplayer.cc
+ InstanceWatcher.cc
+ Instances.cc
+ LeaderWatcher.cc
+ Mirror.cc
+ MirrorStatusWatcher.cc
+ PoolReplayer.cc
+ PoolWatcher.cc
+ ServiceDaemon.cc
+ Threads.cc
+ Types.cc
+ image_deleter/SnapshotPurgeRequest.cc
+ image_deleter/TrashMoveRequest.cc
+ image_deleter/TrashRemoveRequest.cc
+ image_deleter/TrashWatcher.cc
+ image_map/LoadRequest.cc
+ image_map/Policy.cc
+ image_map/SimplePolicy.cc
+ image_map/StateTransition.cc
+ image_map/UpdateRequest.cc
+ image_replayer/BootstrapRequest.cc
+ image_replayer/CloseImageRequest.cc
+ image_replayer/CreateImageRequest.cc
+ image_replayer/EventPreprocessor.cc
+ image_replayer/GetMirrorImageIdRequest.cc
+ image_replayer/IsPrimaryRequest.cc
+ image_replayer/OpenImageRequest.cc
+ image_replayer/OpenLocalImageRequest.cc
+ image_replayer/PrepareLocalImageRequest.cc
+ image_replayer/PrepareRemoteImageRequest.cc
+ image_replayer/ReplayStatusFormatter.cc
+ image_replayer/Utils.cc
+ image_sync/SyncPointCreateRequest.cc
+ image_sync/SyncPointPruneRequest.cc
+ pool_watcher/RefreshImagesRequest.cc
+ service_daemon/Types.cc)
+
+add_library(rbd_mirror_internal STATIC
+ ${rbd_mirror_internal})
+
+add_executable(rbd-mirror
+ main.cc)
+target_link_libraries(rbd-mirror
+ rbd_mirror_internal
+ rbd_mirror_types
+ rbd_api
+ rbd_internal
+ rbd_types
+ journal
+ librados
+ osdc
+ cls_rbd_client
+ cls_lock_client
+ cls_journal_client
+ global
+ ${ALLOC_LIBS})
+install(TARGETS rbd-mirror DESTINATION bin)
diff --git a/src/tools/rbd_mirror/ClusterWatcher.cc b/src/tools/rbd_mirror/ClusterWatcher.cc
new file mode 100644
index 00000000..54329de6
--- /dev/null
+++ b/src/tools/rbd_mirror/ClusterWatcher.cc
@@ -0,0 +1,223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ClusterWatcher.h"
+#include "include/stringify.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/internal.h"
+#include "librbd/api/Mirror.h"
+#include "tools/rbd_mirror/ServiceDaemon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ClusterWatcher:" << this << " " \
+ << __func__ << ": "
+
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using librados::Rados;
+using librados::IoCtx;
+
+namespace rbd {
+namespace mirror {
+
+ClusterWatcher::ClusterWatcher(RadosRef cluster, Mutex &lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon)
+ : m_cluster(cluster), m_lock(lock), m_service_daemon(service_daemon)
+{
+}
+
+const ClusterWatcher::PoolPeers& ClusterWatcher::get_pool_peers() const
+{
+ ceph_assert(m_lock.is_locked());
+ return m_pool_peers;
+}
+
+void ClusterWatcher::refresh_pools()
+{
+ dout(20) << "enter" << dendl;
+
+ PoolPeers pool_peers;
+ read_pool_peers(&pool_peers);
+
+ Mutex::Locker l(m_lock);
+ m_pool_peers = pool_peers;
+ // TODO: perhaps use a workqueue instead, once we get notifications
+ // about config changes for existing pools
+}
+
+void ClusterWatcher::read_pool_peers(PoolPeers *pool_peers)
+{
+ int r = m_cluster->wait_for_latest_osdmap();
+ if (r < 0) {
+ derr << "error waiting for OSD map: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ list<pair<int64_t, string> > pools;
+ r = m_cluster->pool_list2(pools);
+ if (r < 0) {
+ derr << "error listing pools: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ std::set<int64_t> service_pool_ids;
+ for (auto& kv : pools) {
+ int64_t pool_id = kv.first;
+ auto& pool_name = kv.second;
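+// Completion context handed to the listener for acquire/release/remove
+// notifications: it pins the image map via start_async_op() and routes the
+// acknowledgement to handle_peer_ack() (acquire/release) or
+// handle_peer_ack_remove() (remove).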
+ int64_t base_tier;
+ r = m_cluster->pool_get_base_tier(pool_id, &base_tier);
+ if (r == -ENOENT) {
+ dout(10) << "pool " << pool_name << " no longer exists" << dendl;
+ continue;
+ } else if (r < 0) {
+ derr << "Error retrieving base tier for pool " << pool_name << dendl;
+ continue;
+ }
+ if (pool_id != base_tier) {
+ // pool is a cache; skip it
+ continue;
+ }
+
+ IoCtx ioctx;
+ r = m_cluster->ioctx_create2(pool_id, ioctx);
+ if (r == -ENOENT) {
+ dout(10) << "pool " << pool_id << " no longer exists" << dendl;
+ continue;
+ } else if (r < 0) {
+ derr << "Error accessing pool " << pool_name << cpp_strerror(r) << dendl;
+ continue;
+ }
+
+ cls::rbd::MirrorMode mirror_mode_internal;
+ r = librbd::cls_client::mirror_mode_get(&ioctx, &mirror_mode_internal);
+ if (r == 0 && mirror_mode_internal == cls::rbd::MIRROR_MODE_DISABLED) {
+ dout(10) << "mirroring is disabled for pool " << pool_name << dendl;
+ continue;
+ }
+
+ service_pool_ids.insert(pool_id);
+ if (m_service_pools.find(pool_id) == m_service_pools.end()) {
+ m_service_pools[pool_id] = {};
+ m_service_daemon->add_pool(pool_id, pool_name);
+ }
+
+ if (r == -EPERM) {
+ dout(10) << "access denied querying pool " << pool_name << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING, "access denied");
+ continue;
+ } else if (r < 0) {
+ derr << "could not tell whether mirroring was enabled for " << pool_name
+ << " : " << cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING, "mirroring mode query failed");
+ continue;
+ }
+
+ vector<librbd::mirror_peer_t> configs;
+ r = librbd::api::Mirror<>::peer_list(ioctx, &configs);
+ if (r < 0) {
+ derr << "error reading mirroring config for pool " << pool_name
+ << cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_ERROR, "mirroring peer list failed");
+ continue;
+ }
+
+ std::vector<PeerSpec> peers{configs.begin(), configs.end()};
+ for (auto& peer : peers) {
+ r = resolve_peer_config_keys(pool_id, pool_name, &peer);
+ if (r < 0) {
+ break;
+ }
+ }
+
+ if (m_service_pools[pool_id] != service_daemon::CALLOUT_ID_NONE) {
+ m_service_daemon->remove_callout(pool_id, m_service_pools[pool_id]);
+ m_service_pools[pool_id] = service_daemon::CALLOUT_ID_NONE;
+ }
+
+ pool_peers->emplace(pool_id, Peers{peers.begin(), peers.end()});
+ }
+
+ for (auto it = m_service_pools.begin(); it != m_service_pools.end(); ) {
+ auto current_it(it++);
+ if (service_pool_ids.find(current_it->first) == service_pool_ids.end()) {
+ m_service_daemon->remove_pool(current_it->first);
+ m_service_pools.erase(current_it->first);
+ }
+ }
+}
+
+int ClusterWatcher::resolve_peer_config_keys(int64_t pool_id,
+ const std::string& pool_name,
+ PeerSpec* peer) {
+ dout(10) << "retrieving config-key: pool_id=" << pool_id << ", "
+ << "pool_name=" << pool_name << ", "
+ << "peer_uuid=" << peer->uuid << dendl;
+
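+  // the config-key payload, if present, is expected to be a JSON object
+  // with optional "mon_host" and "key" members that are copied into the
+  // peer spec below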
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config-key get\", "
+ "\"key\": \"" RBD_MIRROR_PEER_CONFIG_KEY_PREFIX + stringify(pool_id) +
+ "/" + peer->uuid + "\""
+ "}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+ int r = m_cluster->mon_command(cmd, in_bl, &out_bl, nullptr);
+ if (r == -ENOENT || out_bl.length() == 0) {
+ return 0;
+ } else if (r < 0) {
+ derr << "error reading mirroring peer config for pool " << pool_name << ": "
+ << cpp_strerror(r) << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING,
+ "mirroring peer config-key query failed");
+ return r;
+ }
+
+ bool json_valid = false;
+ json_spirit::mValue json_root;
+  if (json_spirit::read(out_bl.to_str(), json_root)) {
+ try {
+ auto& json_obj = json_root.get_obj();
+ if (json_obj.count("mon_host")) {
+ peer->mon_host = json_obj["mon_host"].get_str();
+ }
+ if (json_obj.count("key")) {
+ peer->key = json_obj["key"].get_str();
+ }
+ json_valid = true;
+ } catch (std::runtime_error&) {
+ }
+ }
+
+ if (!json_valid) {
+ derr << "error parsing mirroring peer config for pool " << pool_name << ", "
+ << "peer " << peer->uuid << dendl;
+ m_service_pools[pool_id] = m_service_daemon->add_or_update_callout(
+ pool_id, m_service_pools[pool_id],
+ service_daemon::CALLOUT_LEVEL_WARNING,
+ "mirroring peer config-key decode failed");
+ }
+
+ return 0;
+}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/ClusterWatcher.h b/src/tools/rbd_mirror/ClusterWatcher.h
new file mode 100644
index 00000000..e8430b47
--- /dev/null
+++ b/src/tools/rbd_mirror/ClusterWatcher.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_CLUSTER_WATCHER_H
+#define CEPH_RBD_MIRROR_CLUSTER_WATCHER_H
+
+#include <map>
+#include <memory>
+#include <set>
+
+#include "common/ceph_context.h"
+#include "common/Mutex.h"
+#include "common/Timer.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <unordered_map>
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ServiceDaemon;
+
+/**
+ * Tracks mirroring configuration for pools in a single
+ * cluster.
+ */
+class ClusterWatcher {
+public:
+ struct PeerSpecCompare {
+ bool operator()(const PeerSpec& lhs, const PeerSpec& rhs) const {
+ return (lhs.uuid < rhs.uuid);
+ }
+ };
+ typedef std::set<PeerSpec, PeerSpecCompare> Peers;
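+  // pool id -> mirror peers configured for that pool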
+ typedef std::map<int64_t, Peers> PoolPeers;
+
+ ClusterWatcher(RadosRef cluster, Mutex &lock,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon);
+ ~ClusterWatcher() = default;
+ ClusterWatcher(const ClusterWatcher&) = delete;
+ ClusterWatcher& operator=(const ClusterWatcher&) = delete;
+
+ // Caller controls frequency of calls
+ void refresh_pools();
+ const PoolPeers& get_pool_peers() const;
+
+private:
+ typedef std::unordered_map<int64_t, service_daemon::CalloutId> ServicePools;
+
+ RadosRef m_cluster;
+ Mutex &m_lock;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
+
+ ServicePools m_service_pools;
+ PoolPeers m_pool_peers;
+
+ void read_pool_peers(PoolPeers *pool_peers);
+
+ int resolve_peer_config_keys(int64_t pool_id, const std::string& pool_name,
+ PeerSpec* peer);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_CLUSTER_WATCHER_H
diff --git a/src/tools/rbd_mirror/ImageDeleter.cc b/src/tools/rbd_mirror/ImageDeleter.cc
new file mode 100644
index 00000000..f4d928ca
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageDeleter.cc
@@ -0,0 +1,549 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "global/global_context.h"
+#include "librbd/internal.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Operations.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/Utils.h"
+#include "ImageDeleter.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/TrashWatcher.h"
+#include <map>
+#include <sstream>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+
+using std::string;
+using std::stringstream;
+using std::vector;
+using std::pair;
+using std::make_pair;
+
+using librados::IoCtx;
+using namespace librbd;
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+class ImageDeleterAdminSocketCommand {
+public:
+ virtual ~ImageDeleterAdminSocketCommand() {}
+ virtual bool call(Formatter *f, stringstream *ss) = 0;
+};
+
+template <typename I>
+class StatusCommand : public ImageDeleterAdminSocketCommand {
+public:
+ explicit StatusCommand(ImageDeleter<I> *image_del) : image_del(image_del) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ image_del->print_status(f, ss);
+ return true;
+ }
+
+private:
+ ImageDeleter<I> *image_del;
+};
+
+} // anonymous namespace
+
+template <typename I>
+class ImageDeleterAdminSocketHook : public AdminSocketHook {
+public:
+ ImageDeleterAdminSocketHook(CephContext *cct, const std::string& pool_name,
+ ImageDeleter<I> *image_del) :
+ admin_socket(cct->get_admin_socket()) {
+
+ std::string command;
+ int r;
+
+ command = "rbd mirror deletion status " + pool_name;
+ r = admin_socket->register_command(command, command, this,
+ "get status for image deleter");
+ if (r == 0) {
+ commands[command] = new StatusCommand<I>(image_del);
+ }
+
+ }
+
+ ~ImageDeleterAdminSocketHook() override {
+ for (Commands::const_iterator i = commands.begin(); i != commands.end();
+ ++i) {
+ (void)admin_socket->unregister_command(i->first);
+ delete i->second;
+ }
+ }
+
+ bool call(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, bufferlist& out) override {
+ Commands::const_iterator i = commands.find(command);
+ ceph_assert(i != commands.end());
+ Formatter *f = Formatter::create(format);
+ stringstream ss;
+ bool r = i->second->call(f, &ss);
+ delete f;
+ out.append(ss);
+ return r;
+ }
+
+private:
+ typedef std::map<std::string, ImageDeleterAdminSocketCommand*,
+ std::less<>> Commands;
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+template <typename I>
+ImageDeleter<I>::ImageDeleter(librados::IoCtx& local_io_ctx,
+ Threads<librbd::ImageCtx>* threads,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon)
+ : m_local_io_ctx(local_io_ctx), m_threads(threads),
+ m_service_daemon(service_daemon), m_trash_listener(this),
+ m_lock(librbd::util::unique_lock_name("rbd::mirror::ImageDeleter::m_lock",
+ this)) {
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageDeleter: " << " " \
+ << __func__ << ": "
+
+template <typename I>
+void ImageDeleter<I>::trash_move(librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id,
+ bool resync,
+ ContextWQ* work_queue, Context* on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "resync=" << resync << dendl;
+
+ auto req = rbd::mirror::image_deleter::TrashMoveRequest<>::create(
+ local_io_ctx, global_image_id, resync, work_queue, on_finish);
+ req->send();
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageDeleter: " << this << " " \
+ << __func__ << ": "
+
+template <typename I>
+void ImageDeleter<I>::init(Context* on_finish) {
+ dout(10) << dendl;
+
+ m_asok_hook = new ImageDeleterAdminSocketHook<I>(
+ g_ceph_context, m_local_io_ctx.get_pool_name(), this);
+
+ m_trash_watcher = image_deleter::TrashWatcher<I>::create(m_local_io_ctx,
+ m_threads,
+ m_trash_listener);
+ m_trash_watcher->init(on_finish);
+}
+
+template <typename I>
+void ImageDeleter<I>::shut_down(Context* on_finish) {
+ dout(10) << dendl;
+
+ delete m_asok_hook;
+ m_asok_hook = nullptr;
+
+ shut_down_trash_watcher(on_finish);
+}
+
+template <typename I>
+void ImageDeleter<I>::shut_down_trash_watcher(Context* on_finish) {
+ dout(10) << dendl;
+ ceph_assert(m_trash_watcher);
+ auto ctx = new FunctionContext([this, on_finish](int r) {
+ delete m_trash_watcher;
+ m_trash_watcher = nullptr;
+
+ wait_for_ops(on_finish);
+ });
+ m_trash_watcher->shut_down(ctx);
+}
+
+template <typename I>
+void ImageDeleter<I>::wait_for_ops(Context* on_finish) {
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ m_running = false;
+ cancel_retry_timer();
+ }
+
+ auto ctx = new FunctionContext([this, on_finish](int) {
+ cancel_all_deletions(on_finish);
+ });
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void ImageDeleter<I>::cancel_all_deletions(Context* on_finish) {
+ {
+ Mutex::Locker locker(m_lock);
+ // wake up any external state machines waiting on deletions
+ ceph_assert(m_in_flight_delete_queue.empty());
+ for (auto& queue : {&m_delete_queue, &m_retry_delete_queue}) {
+ for (auto& info : *queue) {
+ notify_on_delete(info->image_id, -ECANCELED);
+ }
+ queue->clear();
+ }
+ }
+ on_finish->complete(0);
+}
+
+template <typename I>
+void ImageDeleter<I>::wait_for_deletion(const std::string& image_id,
+ bool scheduled_only,
+ Context* on_finish) {
+ dout(5) << "image_id=" << image_id << dendl;
+
+ on_finish = new FunctionContext([this, on_finish](int r) {
+ m_threads->work_queue->queue(on_finish, r);
+ });
+
+ Mutex::Locker locker(m_lock);
+ auto del_info = find_delete_info(image_id);
+ if (!del_info && scheduled_only) {
+ // image not scheduled for deletion
+ on_finish->complete(0);
+ return;
+ }
+
+ notify_on_delete(image_id, -ESTALE);
+ m_on_delete_contexts[image_id] = on_finish;
+}
+
+template <typename I>
+void ImageDeleter<I>::complete_active_delete(DeleteInfoRef* delete_info,
+ int r) {
+ dout(20) << "info=" << *delete_info << ", r=" << r << dendl;
+ Mutex::Locker locker(m_lock);
+ notify_on_delete((*delete_info)->image_id, r);
+ delete_info->reset();
+}
+
+template <typename I>
+void ImageDeleter<I>::enqueue_failed_delete(DeleteInfoRef* delete_info,
+ int error_code,
+ double retry_delay) {
+ dout(20) << "info=" << *delete_info << ", r=" << error_code << dendl;
+ if (error_code == -EBLACKLISTED) {
+ Mutex::Locker locker(m_lock);
+ derr << "blacklisted while deleting local image" << dendl;
+ complete_active_delete(delete_info, error_code);
+ return;
+ }
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ auto& delete_info_ref = *delete_info;
+ notify_on_delete(delete_info_ref->image_id, error_code);
+ delete_info_ref->error_code = error_code;
+ ++delete_info_ref->retries;
+ delete_info_ref->retry_time = ceph_clock_now();
+ delete_info_ref->retry_time += retry_delay;
+ m_retry_delete_queue.push_back(delete_info_ref);
+
+ schedule_retry_timer();
+}
+
+template <typename I>
+typename ImageDeleter<I>::DeleteInfoRef
+ImageDeleter<I>::find_delete_info(const std::string &image_id) {
+ ceph_assert(m_lock.is_locked());
+ DeleteQueue delete_queues[] = {m_in_flight_delete_queue,
+ m_retry_delete_queue,
+ m_delete_queue};
+
+ DeleteInfo delete_info{image_id};
+ for (auto& queue : delete_queues) {
+ auto it = std::find_if(queue.begin(), queue.end(),
+ [&delete_info](const DeleteInfoRef& ref) {
+ return delete_info == *ref;
+ });
+ if (it != queue.end()) {
+ return *it;
+ }
+ }
+ return {};
+}
+
+template <typename I>
+void ImageDeleter<I>::print_status(Formatter *f, stringstream *ss) {
+ dout(20) << dendl;
+
+ if (f) {
+ f->open_object_section("image_deleter_status");
+ f->open_array_section("delete_images_queue");
+ }
+
+ Mutex::Locker l(m_lock);
+ for (const auto& image : m_delete_queue) {
+ image->print_status(f, ss);
+ }
+
+ if (f) {
+ f->close_section();
+ f->open_array_section("failed_deletes_queue");
+ }
+
+ for (const auto& image : m_retry_delete_queue) {
+ image->print_status(f, ss, true);
+ }
+
+ if (f) {
+ f->close_section();
+ f->close_section();
+ f->flush(*ss);
+ }
+}
+
+template <typename I>
+vector<string> ImageDeleter<I>::get_delete_queue_items() {
+ vector<string> items;
+
+ Mutex::Locker l(m_lock);
+ for (const auto& del_info : m_delete_queue) {
+ items.push_back(del_info->image_id);
+ }
+
+ return items;
+}
+
+template <typename I>
+vector<pair<string, int> > ImageDeleter<I>::get_failed_queue_items() {
+ vector<pair<string, int> > items;
+
+ Mutex::Locker l(m_lock);
+ for (const auto& del_info : m_retry_delete_queue) {
+ items.push_back(make_pair(del_info->image_id,
+ del_info->error_code));
+ }
+
+ return items;
+}
+
+template <typename I>
+void ImageDeleter<I>::remove_images() {
+ dout(10) << dendl;
+
+ auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+ uint64_t max_concurrent_deletions = cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_concurrent_image_deletions");
+
+ Mutex::Locker locker(m_lock);
+ while (true) {
+ if (!m_running || m_delete_queue.empty() ||
+ m_in_flight_delete_queue.size() >= max_concurrent_deletions) {
+ return;
+ }
+
+ DeleteInfoRef delete_info = m_delete_queue.front();
+ m_delete_queue.pop_front();
+
+ ceph_assert(delete_info);
+ remove_image(delete_info);
+ }
+}
+
+template <typename I>
+void ImageDeleter<I>::remove_image(DeleteInfoRef delete_info) {
+ dout(10) << "info=" << *delete_info << dendl;
+ ceph_assert(m_lock.is_locked());
+
+ m_in_flight_delete_queue.push_back(delete_info);
+ m_async_op_tracker.start_op();
+
+ auto ctx = new FunctionContext([this, delete_info](int r) {
+ handle_remove_image(delete_info, r);
+ m_async_op_tracker.finish_op();
+ });
+
+ auto req = image_deleter::TrashRemoveRequest<I>::create(
+ m_local_io_ctx, delete_info->image_id, &delete_info->error_result,
+ m_threads->work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageDeleter<I>::handle_remove_image(DeleteInfoRef delete_info,
+ int r) {
+ dout(10) << "info=" << *delete_info << ", r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_lock.is_locked());
+ auto it = std::find(m_in_flight_delete_queue.begin(),
+ m_in_flight_delete_queue.end(), delete_info);
+ ceph_assert(it != m_in_flight_delete_queue.end());
+ m_in_flight_delete_queue.erase(it);
+ }
+
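+  // the removal request indicates how to proceed on failure: complete the
+  // deletion with the error, retry promptly after the busy interval, or
+  // retry after the configured rbd_mirror_delete_retry_interval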
+ if (r < 0) {
+ if (delete_info->error_result == image_deleter::ERROR_RESULT_COMPLETE) {
+ complete_active_delete(&delete_info, r);
+ } else if (delete_info->error_result ==
+ image_deleter::ERROR_RESULT_RETRY_IMMEDIATELY) {
+ enqueue_failed_delete(&delete_info, r, m_busy_interval);
+ } else {
+ auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+ double failed_interval = cct->_conf.get_val<double>(
+ "rbd_mirror_delete_retry_interval");
+ enqueue_failed_delete(&delete_info, r, failed_interval);
+ }
+ } else {
+ complete_active_delete(&delete_info, 0);
+ }
+
+ // process the next queued image to delete
+ remove_images();
+}
+
+template <typename I>
+void ImageDeleter<I>::schedule_retry_timer() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ if (!m_running || m_timer_ctx != nullptr || m_retry_delete_queue.empty()) {
+ return;
+ }
+
+ dout(10) << dendl;
+ auto &delete_info = m_retry_delete_queue.front();
+ m_timer_ctx = new FunctionContext([this](int r) {
+ handle_retry_timer();
+ });
+ m_threads->timer->add_event_at(delete_info->retry_time, m_timer_ctx);
+}
+
+template <typename I>
+void ImageDeleter<I>::cancel_retry_timer() {
+ dout(10) << dendl;
+ ceph_assert(m_threads->timer_lock.is_locked());
+ if (m_timer_ctx != nullptr) {
+ bool canceled = m_threads->timer->cancel_event(m_timer_ctx);
+ m_timer_ctx = nullptr;
+ ceph_assert(canceled);
+ }
+}
+
+template <typename I>
+void ImageDeleter<I>::handle_retry_timer() {
+ dout(10) << dendl;
+ ceph_assert(m_threads->timer_lock.is_locked());
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_timer_ctx != nullptr);
+ m_timer_ctx = nullptr;
+
+ ceph_assert(m_running);
+ ceph_assert(!m_retry_delete_queue.empty());
+
+  // move all items that are ready to retry back to the main queue
+ utime_t now = ceph_clock_now();
+ while (!m_retry_delete_queue.empty()) {
+ auto &delete_info = m_retry_delete_queue.front();
+ if (delete_info->retry_time > now) {
+ break;
+ }
+
+ m_delete_queue.push_back(delete_info);
+ m_retry_delete_queue.pop_front();
+ }
+
+ // schedule wake up for any future retries
+ schedule_retry_timer();
+
+ // start (concurrent) removal of images
+ m_async_op_tracker.start_op();
+ auto ctx = new FunctionContext([this](int r) {
+ remove_images();
+ m_async_op_tracker.finish_op();
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void ImageDeleter<I>::handle_trash_image(const std::string& image_id,
+ const utime_t& deferment_end_time) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ auto del_info = find_delete_info(image_id);
+ if (del_info != nullptr) {
+ dout(20) << "image " << image_id << " "
+ << "was already scheduled for deletion" << dendl;
+ return;
+ }
+
+ dout(10) << "image_id=" << image_id << ", "
+ << "deferment_end_time=" << deferment_end_time << dendl;
+
+ del_info.reset(new DeleteInfo(image_id));
+ del_info->retry_time = deferment_end_time;
+ m_retry_delete_queue.push_back(del_info);
+
+ schedule_retry_timer();
+}
+
+template <typename I>
+void ImageDeleter<I>::notify_on_delete(const std::string& image_id,
+ int r) {
+ dout(10) << "image_id=" << image_id << ", r=" << r << dendl;
+ auto it = m_on_delete_contexts.find(image_id);
+ if (it == m_on_delete_contexts.end()) {
+ return;
+ }
+
+ it->second->complete(r);
+ m_on_delete_contexts.erase(it);
+}
+
+template <typename I>
+void ImageDeleter<I>::DeleteInfo::print_status(Formatter *f, stringstream *ss,
+ bool print_failure_info) {
+ if (f) {
+ f->open_object_section("delete_info");
+ f->dump_string("image_id", image_id);
+ if (print_failure_info) {
+ f->dump_string("error_code", cpp_strerror(error_code));
+ f->dump_int("retries", retries);
+ }
+ f->close_section();
+ f->flush(*ss);
+ } else {
+ *ss << *this;
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageDeleter<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageDeleter.h b/src/tools/rbd_mirror/ImageDeleter.h
new file mode 100644
index 00000000..8a17eb38
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageDeleter.h
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_H
+
+#include "include/utime.h"
+#include "common/AsyncOpTracker.h"
+#include "common/Mutex.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+#include <atomic>
+#include <deque>
+#include <iosfwd>
+#include <map>
+#include <memory>
+#include <vector>
+
+class AdminSocketHook;
+class Context;
+class ContextWQ;
+class SafeTimer;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ServiceDaemon;
+template <typename> class Threads;
+
+namespace image_deleter { template <typename> struct TrashWatcher; }
+
+/**
+ * Manage deletion of non-primary images.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageDeleter {
+public:
+ static ImageDeleter* create(librados::IoCtx& local_io_ctx,
+ Threads<librbd::ImageCtx>* threads,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon) {
+ return new ImageDeleter(local_io_ctx, threads, service_daemon);
+ }
+
+ ImageDeleter(librados::IoCtx& local_io_ctx,
+ Threads<librbd::ImageCtx>* threads,
+ ServiceDaemon<librbd::ImageCtx>* service_daemon);
+
+ ImageDeleter(const ImageDeleter&) = delete;
+ ImageDeleter& operator=(const ImageDeleter&) = delete;
+
+ static void trash_move(librados::IoCtx& local_io_ctx,
+ const std::string& global_image_id, bool resync,
+ ContextWQ* work_queue, Context* on_finish);
+
+ void init(Context* on_finish);
+ void shut_down(Context* on_finish);
+
+ void print_status(Formatter *f, std::stringstream *ss);
+
+ // for testing purposes
+ void wait_for_deletion(const std::string &image_id,
+ bool scheduled_only, Context* on_finish);
+
+ std::vector<std::string> get_delete_queue_items();
+ std::vector<std::pair<std::string, int> > get_failed_queue_items();
+
+ inline void set_busy_timer_interval(double interval) {
+ m_busy_interval = interval;
+ }
+
+private:
+ struct TrashListener : public image_deleter::TrashListener {
+ ImageDeleter *image_deleter;
+
+ TrashListener(ImageDeleter *image_deleter) : image_deleter(image_deleter) {
+ }
+
+ void handle_trash_image(const std::string& image_id,
+ const utime_t& deferment_end_time) override {
+ image_deleter->handle_trash_image(image_id, deferment_end_time);
+ }
+ };
+
+ struct DeleteInfo {
+ std::string image_id;
+
+ image_deleter::ErrorResult error_result = {};
+ int error_code = 0;
+ utime_t retry_time = {};
+ int retries = 0;
+
+ DeleteInfo(const std::string& image_id)
+ : image_id(image_id) {
+ }
+
+ inline bool operator==(const DeleteInfo& delete_info) const {
+ return (image_id == delete_info.image_id);
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, DeleteInfo& delete_info) {
+ os << "[image_id=" << delete_info.image_id << "]";
+ return os;
+ }
+
+ void print_status(Formatter *f, std::stringstream *ss,
+ bool print_failure_info=false);
+ };
+ typedef std::shared_ptr<DeleteInfo> DeleteInfoRef;
+ typedef std::deque<DeleteInfoRef> DeleteQueue;
+ typedef std::map<std::string, Context*> OnDeleteContexts;
+
+ librados::IoCtx& m_local_io_ctx;
+ Threads<librbd::ImageCtx>* m_threads;
+ ServiceDaemon<librbd::ImageCtx>* m_service_daemon;
+
+ image_deleter::TrashWatcher<ImageCtxT>* m_trash_watcher = nullptr;
+ TrashListener m_trash_listener;
+
+ std::atomic<unsigned> m_running { 1 };
+
+ double m_busy_interval = 1;
+
+ AsyncOpTracker m_async_op_tracker;
+
+ Mutex m_lock;
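+  // images scheduled for deletion, deletions awaiting a retry after a
+  // failure, and deletions currently in flight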
+ DeleteQueue m_delete_queue;
+ DeleteQueue m_retry_delete_queue;
+ DeleteQueue m_in_flight_delete_queue;
+
+ OnDeleteContexts m_on_delete_contexts;
+
+ AdminSocketHook *m_asok_hook = nullptr;
+
+ Context *m_timer_ctx = nullptr;
+
+ bool process_image_delete();
+
+ void complete_active_delete(DeleteInfoRef* delete_info, int r);
+ void enqueue_failed_delete(DeleteInfoRef* delete_info, int error_code,
+ double retry_delay);
+
+ DeleteInfoRef find_delete_info(const std::string &image_id);
+
+ void remove_images();
+ void remove_image(DeleteInfoRef delete_info);
+ void handle_remove_image(DeleteInfoRef delete_info, int r);
+
+ void schedule_retry_timer();
+ void cancel_retry_timer();
+ void handle_retry_timer();
+
+ void handle_trash_image(const std::string& image_id,
+ const utime_t& deferment_end_time);
+
+ void shut_down_trash_watcher(Context* on_finish);
+ void wait_for_ops(Context* on_finish);
+ void cancel_all_deletions(Context* on_finish);
+
+ void notify_on_delete(const std::string& image_id, int r);
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageDeleter<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_H
diff --git a/src/tools/rbd_mirror/ImageMap.cc b/src/tools/rbd_mirror/ImageMap.cc
new file mode 100644
index 00000000..58fa5e03
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageMap.cc
@@ -0,0 +1,601 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/Threads.h"
+
+#include "ImageMap.h"
+#include "image_map/LoadRequest.h"
+#include "image_map/SimplePolicy.h"
+#include "image_map/UpdateRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageMap: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using ::operator<<;
+using image_map::Policy;
+
+using librbd::util::unique_lock_name;
+using librbd::util::create_async_context_callback;
+
+template <typename I>
+struct ImageMap<I>::C_NotifyInstance : public Context {
+ ImageMap* image_map;
+ std::string global_image_id;
+ bool acquire_release;
+
+ C_NotifyInstance(ImageMap* image_map, const std::string& global_image_id,
+ bool acquire_release)
+ : image_map(image_map), global_image_id(global_image_id),
+ acquire_release(acquire_release) {
+ image_map->start_async_op();
+ }
+
+ void finish(int r) override {
+ if (acquire_release) {
+ image_map->handle_peer_ack(global_image_id, r);
+ } else {
+ image_map->handle_peer_ack_remove(global_image_id, r);
+ }
+ image_map->finish_async_op();
+ }
+};
+
+template <typename I>
+ImageMap<I>::ImageMap(librados::IoCtx &ioctx, Threads<I> *threads,
+ const std::string& instance_id,
+ image_map::Listener &listener)
+ : m_ioctx(ioctx), m_threads(threads), m_instance_id(instance_id),
+ m_listener(listener),
+ m_lock(unique_lock_name("rbd::mirror::ImageMap::m_lock", this)) {
+}
+
+template <typename I>
+ImageMap<I>::~ImageMap() {
+ ceph_assert(m_async_op_tracker.empty());
+ ceph_assert(m_timer_task == nullptr);
+ ceph_assert(m_rebalance_task == nullptr);
+}
+
+template <typename I>
+void ImageMap<I>::continue_action(const std::set<std::string> &global_image_ids,
+ int r) {
+ dout(20) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_shutting_down) {
+ return;
+ }
+
+ for (auto const &global_image_id : global_image_ids) {
+ bool schedule = m_policy->finish_action(global_image_id, r);
+ if (schedule) {
+ schedule_action(global_image_id);
+ }
+ }
+ }
+
+ schedule_update_task();
+}
+
+template <typename I>
+void ImageMap<I>::handle_update_request(
+ const Updates &updates,
+ const std::set<std::string> &remove_global_image_ids, int r) {
+ dout(20) << "r=" << r << dendl;
+
+ std::set<std::string> global_image_ids;
+
+ global_image_ids.insert(remove_global_image_ids.begin(),
+ remove_global_image_ids.end());
+ for (auto const &update : updates) {
+ global_image_ids.insert(update.global_image_id);
+ }
+
+ continue_action(global_image_ids, r);
+}
+
+template <typename I>
+void ImageMap<I>::update_image_mapping(Updates&& map_updates,
+ std::set<std::string>&& map_removals) {
+ if (map_updates.empty() && map_removals.empty()) {
+ return;
+ }
+
+ dout(5) << "updates=[" << map_updates << "], "
+ << "removes=[" << map_removals << "]" << dendl;
+
+ Context *on_finish = new FunctionContext(
+ [this, map_updates, map_removals](int r) {
+ handle_update_request(map_updates, map_removals, r);
+ finish_async_op();
+ });
+ on_finish = create_async_context_callback(m_threads->work_queue, on_finish);
+
+ // empty meta policy for now..
+ image_map::PolicyMetaNone policy_meta;
+
+ bufferlist bl;
+ encode(image_map::PolicyData(policy_meta), bl);
+
+ // prepare update map
+ std::map<std::string, cls::rbd::MirrorImageMap> update_mapping;
+ for (auto const &update : map_updates) {
+ update_mapping.emplace(
+ update.global_image_id, cls::rbd::MirrorImageMap(update.instance_id,
+ update.mapped_time, bl));
+ }
+
+ start_async_op();
+ image_map::UpdateRequest<I> *req = image_map::UpdateRequest<I>::create(
+ m_ioctx, std::move(update_mapping), std::move(map_removals), on_finish);
+ req->send();
+}
+
+template <typename I>
+void ImageMap<I>::process_updates() {
+ dout(20) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_timer_task == nullptr);
+
+ Updates map_updates;
+ std::set<std::string> map_removals;
+ Updates acquire_updates;
+ Updates release_updates;
+
+ // gather updates by advancing the state machine
+ m_lock.Lock();
+ for (auto const &global_image_id : m_global_image_ids) {
+ image_map::ActionType action_type =
+ m_policy->start_action(global_image_id);
+ image_map::LookupInfo info = m_policy->lookup(global_image_id);
+
+ dout(15) << "global_image_id=" << global_image_id << ", "
+ << "action=" << action_type << ", "
+ << "instance=" << info.instance_id << dendl;
+ switch (action_type) {
+ case image_map::ACTION_TYPE_NONE:
+ continue;
+ case image_map::ACTION_TYPE_MAP_UPDATE:
+ ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+ map_updates.emplace_back(global_image_id, info.instance_id,
+ info.mapped_time);
+ break;
+ case image_map::ACTION_TYPE_MAP_REMOVE:
+ map_removals.emplace(global_image_id);
+ break;
+ case image_map::ACTION_TYPE_ACQUIRE:
+ ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+ acquire_updates.emplace_back(global_image_id, info.instance_id);
+ break;
+ case image_map::ACTION_TYPE_RELEASE:
+ ceph_assert(info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+ release_updates.emplace_back(global_image_id, info.instance_id);
+ break;
+ }
+ }
+ m_global_image_ids.clear();
+ m_lock.Unlock();
+
+ // notify listener (acquire, release) and update on-disk map. note
+ // that its safe to process this outside m_lock as we still hold
+ // timer lock.
+ notify_listener_acquire_release_images(acquire_updates, release_updates);
+ update_image_mapping(std::move(map_updates), std::move(map_removals));
+}
+
+template <typename I>
+void ImageMap<I>::schedule_update_task() {
+ Mutex::Locker timer_lock(m_threads->timer_lock);
+ schedule_update_task(m_threads->timer_lock);
+}
+
+template <typename I>
+void ImageMap<I>::schedule_update_task(const Mutex &timer_lock) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+
+ schedule_rebalance_task();
+
+ if (m_timer_task != nullptr) {
+ return;
+ }
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_global_image_ids.empty()) {
+ return;
+ }
+ }
+
+ m_timer_task = new FunctionContext([this](int r) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ m_timer_task = nullptr;
+
+ process_updates();
+ });
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ double after = cct->_conf.get_val<double>("rbd_mirror_image_policy_update_throttle_interval");
+
+ dout(20) << "scheduling image check update (" << m_timer_task << ")"
+ << " after " << after << " second(s)" << dendl;
+ m_threads->timer->add_event_after(after, m_timer_task);
+}
+
+template <typename I>
+void ImageMap<I>::rebalance() {
+ ceph_assert(m_rebalance_task == nullptr);
+
+ {
+ Mutex::Locker locker(m_lock);
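+    // only rebalance when there are no in-flight operations and no
+    // actions already queued for dispatch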
+    if (m_async_op_tracker.empty() && m_global_image_ids.empty()) {
+ dout(20) << "starting rebalance" << dendl;
+
+ std::set<std::string> remap_global_image_ids;
+ m_policy->add_instances({}, &remap_global_image_ids);
+
+ for (auto const &global_image_id : remap_global_image_ids) {
+ schedule_action(global_image_id);
+ }
+ }
+ }
+
+ schedule_update_task(m_threads->timer_lock);
+}
+
+template <typename I>
+void ImageMap<I>::schedule_rebalance_task() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+
+ // fetch the updated value of idle timeout for (re)scheduling
+ double resched_after = cct->_conf.get_val<double>(
+ "rbd_mirror_image_policy_rebalance_timeout");
+ if (!resched_after) {
+ return;
+ }
+
+ // cancel existing rebalance task if any before scheduling
+ if (m_rebalance_task != nullptr) {
+ m_threads->timer->cancel_event(m_rebalance_task);
+ }
+
+ m_rebalance_task = new FunctionContext([this](int _) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ m_rebalance_task = nullptr;
+
+ rebalance();
+ });
+
+ dout(20) << "scheduling rebalance (" << m_rebalance_task << ")"
+ << " after " << resched_after << " second(s)" << dendl;
+ m_threads->timer->add_event_after(resched_after, m_rebalance_task);
+}
+
+template <typename I>
+void ImageMap<I>::schedule_action(const std::string &global_image_id) {
+ dout(20) << "global_image_id=" << global_image_id << dendl;
+ ceph_assert(m_lock.is_locked());
+
+ m_global_image_ids.emplace(global_image_id);
+}
+
+template <typename I>
+void ImageMap<I>::notify_listener_acquire_release_images(
+ const Updates &acquire, const Updates &release) {
+ if (acquire.empty() && release.empty()) {
+ return;
+ }
+
+ dout(5) << "acquire=[" << acquire << "], "
+ << "release=[" << release << "]" << dendl;
+
+ for (auto const &update : acquire) {
+ m_listener.acquire_image(
+ update.global_image_id, update.instance_id,
+ create_async_context_callback(
+ m_threads->work_queue,
+ new C_NotifyInstance(this, update.global_image_id, true)));
+ }
+
+ for (auto const &update : release) {
+ m_listener.release_image(
+ update.global_image_id, update.instance_id,
+ create_async_context_callback(
+ m_threads->work_queue,
+ new C_NotifyInstance(this, update.global_image_id, true)));
+ }
+}
+
+template <typename I>
+void ImageMap<I>::notify_listener_remove_images(const std::string &peer_uuid,
+ const Updates &remove) {
+ dout(5) << "peer_uuid=" << peer_uuid << ", "
+ << "remove=[" << remove << "]" << dendl;
+
+ for (auto const &update : remove) {
+ m_listener.remove_image(
+ peer_uuid, update.global_image_id, update.instance_id,
+ create_async_context_callback(
+ m_threads->work_queue,
+ new C_NotifyInstance(this, update.global_image_id, false)));
+ }
+}
+
+template <typename I>
+void ImageMap<I>::handle_load(const std::map<std::string,
+ cls::rbd::MirrorImageMap> &image_mapping) {
+ dout(20) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_policy->init(image_mapping);
+
+ for (auto& pair : image_mapping) {
+ schedule_action(pair.first);
+ }
+ }
+ schedule_update_task();
+}
+
+template <typename I>
+void ImageMap<I>::handle_peer_ack_remove(const std::string &global_image_id,
+ int r) {
+ Mutex::Locker locker(m_lock);
+ dout(5) << "global_image_id=" << global_image_id << dendl;
+
+ if (r < 0) {
+ derr << "failed to remove global_image_id=" << global_image_id << dendl;
+ }
+
+ auto peer_it = m_peer_map.find(global_image_id);
+ if (peer_it == m_peer_map.end()) {
+ return;
+ }
+
+ m_peer_map.erase(peer_it);
+}
+
+template <typename I>
+void ImageMap<I>::update_images_added(
+ const std::string &peer_uuid,
+ const std::set<std::string> &global_image_ids) {
+ dout(5) << "peer_uuid=" << peer_uuid << ", "
+ << "global_image_ids=[" << global_image_ids << "]" << dendl;
+ ceph_assert(m_lock.is_locked());
+
+ for (auto const &global_image_id : global_image_ids) {
+ auto result = m_peer_map[global_image_id].insert(peer_uuid);
+ if (result.second && m_peer_map[global_image_id].size() == 1) {
+ if (m_policy->add_image(global_image_id)) {
+ schedule_action(global_image_id);
+ }
+ }
+ }
+}
+
+template <typename I>
+void ImageMap<I>::update_images_removed(
+ const std::string &peer_uuid,
+ const std::set<std::string> &global_image_ids) {
+ dout(5) << "peer_uuid=" << peer_uuid << ", "
+ << "global_image_ids=[" << global_image_ids << "]" << dendl;
+ ceph_assert(m_lock.is_locked());
+
+ Updates to_remove;
+ for (auto const &global_image_id : global_image_ids) {
+ image_map::LookupInfo info = m_policy->lookup(global_image_id);
+ bool image_mapped = (info.instance_id != image_map::UNMAPPED_INSTANCE_ID);
+
+ bool image_removed = image_mapped;
+ bool peer_removed = false;
+ auto peer_it = m_peer_map.find(global_image_id);
+ if (peer_it != m_peer_map.end()) {
+ auto& peer_set = peer_it->second;
+ peer_removed = peer_set.erase(peer_uuid);
+ image_removed = peer_removed && peer_set.empty();
+ }
+
+ if (image_mapped && peer_removed && !peer_uuid.empty()) {
+ // peer image has been deleted
+ to_remove.emplace_back(global_image_id, info.instance_id);
+ }
+
+ if (image_mapped && image_removed) {
+ // local and peer images have been deleted
+ if (m_policy->remove_image(global_image_id)) {
+ schedule_action(global_image_id);
+ }
+ }
+ }
+
+ if (!to_remove.empty()) {
+ // removal notifications are sent immediately; this is safe
+ // even after scheduling actions for images since we still hold m_lock
+ notify_listener_remove_images(peer_uuid, to_remove);
+ }
+}
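The peer bookkeeping above amounts to reference counting peer UUIDs per global image id: the image is removed from the policy only once the last registered peer (including the local "" entry) has been erased, while a peer-only removal for a still-mapped image just triggers a remove notification. A minimal, self-contained sketch of that bookkeeping follows; the names are hypothetical and none of the locking or policy integration is included.

#include <iostream>
#include <map>
#include <set>
#include <string>

// Illustrative only: mirrors the m_peer_map bookkeeping, not the real class.
struct PeerTracker {
  std::map<std::string, std::set<std::string>> peers;  // global_image_id -> peer uuids

  // returns true when the first peer registers the image (image should be mapped)
  bool add(const std::string& global_image_id, const std::string& peer_uuid) {
    auto& peer_set = peers[global_image_id];
    return peer_set.insert(peer_uuid).second && peer_set.size() == 1;
  }

  // returns true when the last peer has been removed (image can be unmapped)
  bool remove(const std::string& global_image_id, const std::string& peer_uuid) {
    auto it = peers.find(global_image_id);
    if (it == peers.end()) {
      return false;
    }
    bool erased = it->second.erase(peer_uuid) > 0;
    if (erased && it->second.empty()) {
      peers.erase(it);
      return true;
    }
    return false;
  }
};

int main() {
  PeerTracker t;
  std::cout << t.add("img1", "") << "\n";          // 1: first (local) peer maps the image
  std::cout << t.add("img1", "peer-a") << "\n";    // 0: additional peers do not re-map it
  std::cout << t.remove("img1", "peer-a") << "\n"; // 0: image still referenced by ""
  std::cout << t.remove("img1", "") << "\n";       // 1: last peer gone -> unmap the image
}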
+
+template <typename I>
+void ImageMap<I>::update_instances_added(
+ const std::vector<std::string> &instance_ids) {
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_shutting_down) {
+ return;
+ }
+
+ std::vector<std::string> filtered_instance_ids;
+ filter_instance_ids(instance_ids, &filtered_instance_ids, false);
+ if (filtered_instance_ids.empty()) {
+ return;
+ }
+
+ dout(20) << "instance_ids=" << filtered_instance_ids << dendl;
+
+ std::set<std::string> remap_global_image_ids;
+ m_policy->add_instances(filtered_instance_ids, &remap_global_image_ids);
+
+ for (auto const &global_image_id : remap_global_image_ids) {
+ schedule_action(global_image_id);
+ }
+ }
+
+ schedule_update_task();
+}
+
+template <typename I>
+void ImageMap<I>::update_instances_removed(
+ const std::vector<std::string> &instance_ids) {
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_shutting_down) {
+ return;
+ }
+
+ std::vector<std::string> filtered_instance_ids;
+ filter_instance_ids(instance_ids, &filtered_instance_ids, true);
+ if (filtered_instance_ids.empty()) {
+ return;
+ }
+
+ dout(20) << "instance_ids=" << filtered_instance_ids << dendl;
+
+ std::set<std::string> remap_global_image_ids;
+ m_policy->remove_instances(filtered_instance_ids, &remap_global_image_ids);
+
+ for (auto const &global_image_id : remap_global_image_ids) {
+ schedule_action(global_image_id);
+ }
+ }
+
+ schedule_update_task();
+}
+
+template <typename I>
+void ImageMap<I>::update_images(const std::string &peer_uuid,
+ std::set<std::string> &&added_global_image_ids,
+ std::set<std::string> &&removed_global_image_ids) {
+ dout(5) << "peer_uuid=" << peer_uuid << ", " << "added_count="
+ << added_global_image_ids.size() << ", " << "removed_count="
+ << removed_global_image_ids.size() << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_shutting_down) {
+ return;
+ }
+
+ if (!removed_global_image_ids.empty()) {
+ update_images_removed(peer_uuid, removed_global_image_ids);
+ }
+ if (!added_global_image_ids.empty()) {
+ update_images_added(peer_uuid, added_global_image_ids);
+ }
+ }
+
+ schedule_update_task();
+}
+
+template <typename I>
+void ImageMap<I>::handle_peer_ack(const std::string &global_image_id, int r) {
+ dout(20) << "global_image_id=" << global_image_id << ", r=" << r
+ << dendl;
+
+ continue_action({global_image_id}, r);
+}
+
+template <typename I>
+void ImageMap<I>::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ std::string policy_type = cct->_conf.get_val<string>("rbd_mirror_image_policy_type");
+
+ if (policy_type == "none" || policy_type == "simple") {
+ m_policy.reset(image_map::SimplePolicy::create(m_ioctx));
+ } else {
+ ceph_abort(); // unknown policy type -- should be unreachable, but catch it.
+ }
+
+ dout(20) << "mapping policy=" << policy_type << dendl;
+
+ start_async_op();
+ C_LoadMap *ctx = new C_LoadMap(this, on_finish);
+ image_map::LoadRequest<I> *req = image_map::LoadRequest<I>::create(
+ m_ioctx, &ctx->image_mapping, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageMap<I>::shut_down(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ Mutex::Locker timer_lock(m_threads->timer_lock);
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_shutting_down);
+
+ m_shutting_down = true;
+ m_policy.reset();
+ }
+
+ if (m_timer_task != nullptr) {
+ m_threads->timer->cancel_event(m_timer_task);
+ m_timer_task = nullptr;
+ }
+ if (m_rebalance_task != nullptr) {
+ m_threads->timer->cancel_event(m_rebalance_task);
+ m_rebalance_task = nullptr;
+ }
+ }
+
+ wait_for_async_ops(on_finish);
+}
+
+template <typename I>
+void ImageMap<I>::filter_instance_ids(
+ const std::vector<std::string> &instance_ids,
+ std::vector<std::string> *filtered_instance_ids, bool removal) const {
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ std::string policy_type = cct->_conf.get_val<string>("rbd_mirror_image_policy_type");
+
+ if (policy_type != "none") {
+ *filtered_instance_ids = instance_ids;
+ return;
+ }
+
+ if (removal) {
+ // propagate removals for external instances
+ for (auto& instance_id : instance_ids) {
+ if (instance_id != m_instance_id) {
+ filtered_instance_ids->push_back(instance_id);
+ }
+ }
+ } else if (std::find(instance_ids.begin(), instance_ids.end(),
+ m_instance_id) != instance_ids.end()) {
+ // propagate addition only for local instance
+ filtered_instance_ids->push_back(m_instance_id);
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageMap<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/ImageMap.h b/src/tools/rbd_mirror/ImageMap.h
new file mode 100644
index 00000000..283f55db
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageMap.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_H
+
+#include <vector>
+
+#include "common/Mutex.h"
+#include "include/Context.h"
+#include "common/AsyncOpTracker.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+
+#include "image_map/Policy.h"
+#include "image_map/Types.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageMap {
+public:
+ static ImageMap *create(librados::IoCtx &ioctx, Threads<ImageCtxT> *threads,
+ const std::string& instance_id,
+ image_map::Listener &listener) {
+ return new ImageMap(ioctx, threads, instance_id, listener);
+ }
+
+ ~ImageMap();
+
+ // init (load) the instance map from disk
+ void init(Context *on_finish);
+
+ // shut down map operations
+ void shut_down(Context *on_finish);
+
+ // update (add/remove) images
+ void update_images(const std::string &peer_uuid,
+ std::set<std::string> &&added_global_image_ids,
+ std::set<std::string> &&removed_global_image_ids);
+
+ // add/remove instances
+ void update_instances_added(const std::vector<std::string> &instances);
+ void update_instances_removed(const std::vector<std::string> &instances);
+
+private:
+ struct C_NotifyInstance;
+
+ ImageMap(librados::IoCtx &ioctx, Threads<ImageCtxT> *threads,
+ const std::string& instance_id, image_map::Listener &listener);
+
+ struct Update {
+ std::string global_image_id;
+ std::string instance_id;
+ utime_t mapped_time;
+
+ Update(const std::string &global_image_id, const std::string &instance_id,
+ utime_t mapped_time)
+ : global_image_id(global_image_id),
+ instance_id(instance_id),
+ mapped_time(mapped_time) {
+ }
+ Update(const std::string &global_image_id, const std::string &instance_id)
+ : Update(global_image_id, instance_id, ceph_clock_now()) {
+ }
+
+ friend std::ostream& operator<<(std::ostream& os,
+ const Update& update) {
+ os << "{global_image_id=" << update.global_image_id << ", "
+ << "instance_id=" << update.instance_id << "}";
+ return os;
+ }
+
+ };
+ typedef std::list<Update> Updates;
+
+ // Lock ordering: m_threads->timer_lock, m_lock
+
+ librados::IoCtx &m_ioctx;
+ Threads<ImageCtxT> *m_threads;
+ std::string m_instance_id;
+ image_map::Listener &m_listener;
+
+ std::unique_ptr<image_map::Policy> m_policy; // our mapping policy
+
+ Context *m_timer_task = nullptr;
+ Mutex m_lock;
+ bool m_shutting_down = false;
+ AsyncOpTracker m_async_op_tracker;
+
+ // global_image_id -> registered peers ("" == local, remote otherwise)
+ std::map<std::string, std::set<std::string> > m_peer_map;
+
+ std::set<std::string> m_global_image_ids;
+
+ Context *m_rebalance_task = nullptr;
+
+ struct C_LoadMap : Context {
+ ImageMap *image_map;
+ Context *on_finish;
+
+ std::map<std::string, cls::rbd::MirrorImageMap> image_mapping;
+
+ C_LoadMap(ImageMap *image_map, Context *on_finish)
+ : image_map(image_map),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ if (r == 0) {
+ image_map->handle_load(image_mapping);
+ }
+
+ image_map->finish_async_op();
+ on_finish->complete(r);
+ }
+ };
+
+ // async op-tracker helper routines
+ void start_async_op() {
+ m_async_op_tracker.start_op();
+ }
+ void finish_async_op() {
+ m_async_op_tracker.finish_op();
+ }
+ void wait_for_async_ops(Context *on_finish) {
+ m_async_op_tracker.wait_for_ops(on_finish);
+ }
+
+ void handle_peer_ack(const std::string &global_image_id, int r);
+ void handle_peer_ack_remove(const std::string &global_image_id, int r);
+
+ void handle_load(const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping);
+ void handle_update_request(const Updates &updates,
+ const std::set<std::string> &remove_global_image_ids, int r);
+
+ // continue (retry or resume depending on state machine) processing
+ // current action.
+ void continue_action(const std::set<std::string> &global_image_ids, int r);
+
+ // schedule an image for update
+ void schedule_action(const std::string &global_image_id);
+
+ void schedule_update_task();
+ void schedule_update_task(const Mutex &timer_lock);
+ void process_updates();
+ void update_image_mapping(Updates&& map_updates,
+ std::set<std::string>&& map_removals);
+
+ void rebalance();
+ void schedule_rebalance_task();
+
+ void notify_listener_acquire_release_images(const Updates &acquire, const Updates &release);
+ void notify_listener_remove_images(const std::string &peer_uuid, const Updates &remove);
+
+ void update_images_added(const std::string &peer_uuid,
+ const std::set<std::string> &global_image_ids);
+ void update_images_removed(const std::string &peer_uuid,
+ const std::set<std::string> &global_image_ids);
+
+ void filter_instance_ids(const std::vector<std::string> &instance_ids,
+ std::vector<std::string> *filtered_instance_ids,
+ bool removal) const;
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_H
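The private helpers declared above lean on a single pattern: every in-flight operation bumps the AsyncOpTracker, and shut_down() completes only after the tracker drains. Below is a rough, self-contained sketch of that start_op/finish_op/wait_for_ops idea; it is a simplified stand-in using std::function, not Ceph's AsyncOpTracker or Context types.

#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>

// Simplified stand-in for the start_op/finish_op/wait_for_ops pattern.
class SimpleOpTracker {
 public:
  void start_op() {
    std::lock_guard<std::mutex> l(m_lock);
    ++m_in_flight;
  }

  void finish_op() {
    std::function<void()> waiter;
    {
      std::lock_guard<std::mutex> l(m_lock);
      assert(m_in_flight > 0);
      if (--m_in_flight == 0) {
        waiter.swap(m_waiter);
      }
    }
    if (waiter) {
      waiter();  // fire the shutdown continuation once all ops have drained
    }
  }

  void wait_for_ops(std::function<void()> on_finish) {
    {
      std::lock_guard<std::mutex> l(m_lock);
      if (m_in_flight > 0) {
        m_waiter = std::move(on_finish);  // defer until the last finish_op()
        return;
      }
    }
    on_finish();
  }

 private:
  std::mutex m_lock;
  uint64_t m_in_flight = 0;
  std::function<void()> m_waiter;
};

int main() {
  SimpleOpTracker tracker;
  tracker.start_op();
  tracker.wait_for_ops([] { std::cout << "all ops finished\n"; });
  tracker.finish_op();  // prints once the outstanding op completes
}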
diff --git a/src/tools/rbd_mirror/ImageReplayer.cc b/src/tools/rbd_mirror/ImageReplayer.cc
new file mode 100644
index 00000000..6c6ee2d5
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageReplayer.cc
@@ -0,0 +1,1896 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "global/global_context.h"
+#include "journal/Journaler.h"
+#include "journal/ReplayHandler.h"
+#include "journal/Settings.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Replay.h"
+#include "ImageDeleter.h"
+#include "ImageReplayer.h"
+#include "Threads.h"
+#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
+#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/EventPreprocessor.h"
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h"
+#include "tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::" << *this << " " \
+ << __func__ << ": "
+
+using std::map;
+using std::string;
+using std::unique_ptr;
+using std::shared_ptr;
+using std::vector;
+
+extern PerfCounters *g_perf_counters;
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using namespace rbd::mirror::image_replayer;
+
+template <typename I>
+std::ostream &operator<<(std::ostream &os,
+ const typename ImageReplayer<I>::State &state);
+
+namespace {
+
+template <typename I>
+struct ReplayHandler : public ::journal::ReplayHandler {
+ ImageReplayer<I> *replayer;
+ ReplayHandler(ImageReplayer<I> *replayer) : replayer(replayer) {}
+ void get() override {}
+ void put() override {}
+
+ void handle_entries_available() override {
+ replayer->handle_replay_ready();
+ }
+ void handle_complete(int r) override {
+ std::stringstream ss;
+ if (r < 0) {
+ ss << "replay completed with error: " << cpp_strerror(r);
+ }
+ replayer->handle_replay_complete(r, ss.str());
+ }
+};
+
+template <typename I>
+class ImageReplayerAdminSocketCommand {
+public:
+ ImageReplayerAdminSocketCommand(const std::string &desc,
+ ImageReplayer<I> *replayer)
+ : desc(desc), replayer(replayer) {
+ }
+ virtual ~ImageReplayerAdminSocketCommand() {}
+ virtual bool call(Formatter *f, stringstream *ss) = 0;
+
+ std::string desc;
+ ImageReplayer<I> *replayer;
+ bool registered = false;
+};
+
+template <typename I>
+class StatusCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+ explicit StatusCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->replayer->print_status(f, ss);
+ return true;
+ }
+};
+
+template <typename I>
+class StartCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+ explicit StartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->replayer->start(nullptr, true);
+ return true;
+ }
+};
+
+template <typename I>
+class StopCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+ explicit StopCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->replayer->stop(nullptr, true);
+ return true;
+ }
+};
+
+template <typename I>
+class RestartCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+ explicit RestartCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->replayer->restart();
+ return true;
+ }
+};
+
+template <typename I>
+class FlushCommand : public ImageReplayerAdminSocketCommand<I> {
+public:
+ explicit FlushCommand(const std::string &desc, ImageReplayer<I> *replayer)
+ : ImageReplayerAdminSocketCommand<I>(desc, replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->replayer->flush();
+ return true;
+ }
+};
+
+template <typename I>
+class ImageReplayerAdminSocketHook : public AdminSocketHook {
+public:
+ ImageReplayerAdminSocketHook(CephContext *cct, const std::string &name,
+ ImageReplayer<I> *replayer)
+ : admin_socket(cct->get_admin_socket()),
+ commands{{"rbd mirror flush " + name,
+ new FlushCommand<I>("flush rbd mirror " + name, replayer)},
+ {"rbd mirror restart " + name,
+ new RestartCommand<I>("restart rbd mirror " + name, replayer)},
+ {"rbd mirror start " + name,
+ new StartCommand<I>("start rbd mirror " + name, replayer)},
+ {"rbd mirror status " + name,
+ new StatusCommand<I>("get status for rbd mirror " + name, replayer)},
+ {"rbd mirror stop " + name,
+ new StopCommand<I>("stop rbd mirror " + name, replayer)}} {
+ }
+
+ int register_commands() {
+ for (auto &it : commands) {
+ int r = admin_socket->register_command(it.first, it.first, this,
+ it.second->desc);
+ if (r < 0) {
+ return r;
+ }
+ it.second->registered = true;
+ }
+ return 0;
+ }
+
+ ~ImageReplayerAdminSocketHook() override {
+ for (auto &it : commands) {
+ if (it.second->registered) {
+ admin_socket->unregister_command(it.first);
+ }
+ delete it.second;
+ }
+ commands.clear();
+ }
+
+ bool call(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, bufferlist& out) override {
+ auto i = commands.find(command);
+ ceph_assert(i != commands.end());
+ Formatter *f = Formatter::create(format);
+ stringstream ss;
+ bool r = i->second->call(f, &ss);
+ delete f;
+ out.append(ss);
+ return r;
+ }
+
+private:
+ typedef std::map<std::string, ImageReplayerAdminSocketCommand<I>*,
+ std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+uint32_t calculate_replay_delay(const utime_t &event_time,
+ int mirroring_replay_delay) {
+ if (mirroring_replay_delay <= 0) {
+ return 0;
+ }
+
+ utime_t now = ceph_clock_now();
+ if (event_time + mirroring_replay_delay <= now) {
+ return 0;
+ }
+
+ // ensure it is rounded up when converting to integer
+ return (event_time + mirroring_replay_delay - now) + 1;
+}
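calculate_replay_delay() returns the remaining whole seconds before an event may be replayed, rounding up so a fractional remainder is never reported as zero. The following is an equivalent, self-contained sketch using std::chrono instead of utime_t; it is an illustrative translation only.

#include <chrono>
#include <cstdint>
#include <iostream>

// Same rounding behavior as the utime_t version: 0 if the delay has already
// elapsed, otherwise the remaining time rounded up to whole seconds.
uint32_t replay_delay_seconds(std::chrono::system_clock::time_point event_time,
                              int mirroring_replay_delay,
                              std::chrono::system_clock::time_point now) {
  using namespace std::chrono;
  if (mirroring_replay_delay <= 0) {
    return 0;
  }
  auto deadline = event_time + seconds(mirroring_replay_delay);
  if (deadline <= now) {
    return 0;
  }
  return static_cast<uint32_t>(
      duration_cast<seconds>(deadline - now).count() + 1);  // round up
}

int main() {
  using namespace std::chrono;
  auto now = system_clock::now();
  // event 2.5s ago with a 10s replay delay -> 7.5s remaining -> reports 8
  std::cout << replay_delay_seconds(now - milliseconds(2500), 10, now) << "\n";
}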
+
+} // anonymous namespace
+
+template <typename I>
+void ImageReplayer<I>::BootstrapProgressContext::update_progress(
+ const std::string &description, bool flush)
+{
+ const std::string desc = "bootstrapping, " + description;
+ replayer->set_state_description(0, desc);
+ if (flush) {
+ replayer->update_mirror_image_status(false, boost::none);
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::RemoteJournalerListener::handle_update(
+ ::journal::JournalMetadata *) {
+ FunctionContext *ctx = new FunctionContext([this](int r) {
+ replayer->handle_remote_journal_metadata_updated();
+ });
+ replayer->m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+ImageReplayer<I>::ImageReplayer(Threads<I> *threads,
+ InstanceWatcher<I> *instance_watcher,
+ RadosRef local,
+ const std::string &local_mirror_uuid,
+ int64_t local_pool_id,
+ const std::string &global_image_id) :
+ m_threads(threads),
+ m_instance_watcher(instance_watcher),
+ m_local(local),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_local_pool_id(local_pool_id),
+ m_global_image_id(global_image_id), m_local_image_name(global_image_id),
+ m_lock("rbd::mirror::ImageReplayer " + stringify(local_pool_id) + " " +
+ global_image_id),
+ m_progress_cxt(this),
+ m_journal_listener(new JournalListener(this)),
+ m_remote_listener(this)
+{
+ // Register asok commands using a temporary "remote_pool_name/global_image_id"
+ // name. When the image name becomes known on start the asok commands will be
+ // re-registered using "remote_pool_name/remote_image_name" name.
+
+ std::string pool_name;
+ int r = m_local->pool_reverse_lookup(m_local_pool_id, &pool_name);
+ if (r < 0) {
+ derr << "error resolving local pool " << m_local_pool_id
+ << ": " << cpp_strerror(r) << dendl;
+ pool_name = stringify(m_local_pool_id);
+ }
+
+ m_name = pool_name + "/" + m_global_image_id;
+ register_admin_socket_hook();
+}
+
+template <typename I>
+ImageReplayer<I>::~ImageReplayer()
+{
+ unregister_admin_socket_hook();
+ ceph_assert(m_event_preprocessor == nullptr);
+ ceph_assert(m_replay_status_formatter == nullptr);
+ ceph_assert(m_local_image_ctx == nullptr);
+ ceph_assert(m_local_replay == nullptr);
+ ceph_assert(m_remote_journaler == nullptr);
+ ceph_assert(m_replay_handler == nullptr);
+ ceph_assert(m_on_start_finish == nullptr);
+ ceph_assert(m_on_stop_finish == nullptr);
+ ceph_assert(m_bootstrap_request == nullptr);
+ ceph_assert(m_in_flight_status_updates == 0);
+
+ delete m_journal_listener;
+}
+
+template <typename I>
+image_replayer::HealthState ImageReplayer<I>::get_health_state() const {
+ Mutex::Locker locker(m_lock);
+
+ if (!m_mirror_image_status_state) {
+ return image_replayer::HEALTH_STATE_OK;
+ } else if (*m_mirror_image_status_state ==
+ cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING ||
+ *m_mirror_image_status_state ==
+ cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN) {
+ return image_replayer::HEALTH_STATE_WARNING;
+ }
+ return image_replayer::HEALTH_STATE_ERROR;
+}
+
+template <typename I>
+void ImageReplayer<I>::add_peer(const std::string &peer_uuid,
+ librados::IoCtx &io_ctx) {
+ Mutex::Locker locker(m_lock);
+ auto it = m_peers.find({peer_uuid});
+ if (it == m_peers.end()) {
+ m_peers.insert({peer_uuid, io_ctx});
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::set_state_description(int r, const std::string &desc) {
+ dout(10) << r << " " << desc << dendl;
+
+ Mutex::Locker l(m_lock);
+ m_last_r = r;
+ m_state_desc = desc;
+}
+
+template <typename I>
+void ImageReplayer<I>::start(Context *on_finish, bool manual)
+{
+ dout(10) << "on_finish=" << on_finish << dendl;
+
+ int r = 0;
+ {
+ Mutex::Locker locker(m_lock);
+ if (!is_stopped_()) {
+ derr << "already running" << dendl;
+ r = -EINVAL;
+ } else if (m_manual_stop && !manual) {
+ dout(5) << "stopped manually, ignoring start without manual flag"
+ << dendl;
+ r = -EPERM;
+ } else {
+ m_state = STATE_STARTING;
+ m_last_r = 0;
+ m_state_desc.clear();
+ m_manual_stop = false;
+ m_delete_requested = false;
+
+ if (on_finish != nullptr) {
+ ceph_assert(m_on_start_finish == nullptr);
+ m_on_start_finish = on_finish;
+ }
+ ceph_assert(m_on_stop_finish == nullptr);
+ }
+ }
+
+ if (r < 0) {
+ if (on_finish) {
+ on_finish->complete(r);
+ }
+ return;
+ }
+
+ m_local_ioctx.reset(new librados::IoCtx{});
+ r = m_local->ioctx_create2(m_local_pool_id, *m_local_ioctx);
+ if (r < 0) {
+ m_local_ioctx.reset();
+
+ derr << "error opening ioctx for local pool " << m_local_pool_id
+ << ": " << cpp_strerror(r) << dendl;
+ on_start_fail(r, "error opening local pool");
+ return;
+ }
+
+ prepare_local_image();
+}
+
+template <typename I>
+void ImageReplayer<I>::prepare_local_image() {
+ dout(10) << dendl;
+
+ m_local_image_id = "";
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_prepare_local_image>(this);
+ auto req = PrepareLocalImageRequest<I>::create(
+ *m_local_ioctx, m_global_image_id, &m_local_image_id, &m_local_image_name,
+ &m_local_image_tag_owner, m_threads->work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_prepare_local_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(10) << "local image does not exist" << dendl;
+ } else if (r < 0) {
+ on_start_fail(r, "error preparing local image for replay");
+ return;
+ } else {
+ reregister_admin_socket_hook();
+ }
+
+ // local image doesn't exist or is non-primary
+ prepare_remote_image();
+}
+
+template <typename I>
+void ImageReplayer<I>::prepare_remote_image() {
+ dout(10) << dendl;
+ if (m_peers.empty()) {
+ // technically nothing to bootstrap, but it handles the status update
+ bootstrap();
+ return;
+ }
+
+ // TODO need to support multiple remote images
+ ceph_assert(!m_peers.empty());
+ m_remote_image = {*m_peers.begin()};
+
+ auto cct = static_cast<CephContext *>(m_local->cct());
+ journal::Settings journal_settings;
+ journal_settings.commit_interval = cct->_conf.get_val<double>(
+ "rbd_mirror_journal_commit_age");
+ journal_settings.max_fetch_bytes = cct->_conf.get_val<Option::size_t>(
+ "rbd_mirror_journal_max_fetch_bytes");
+
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_prepare_remote_image>(this);
+ auto req = PrepareRemoteImageRequest<I>::create(
+ m_threads, m_remote_image.io_ctx, m_global_image_id, m_local_mirror_uuid,
+ m_local_image_id, journal_settings, &m_remote_image.mirror_uuid,
+ &m_remote_image.image_id, &m_remote_journaler, &m_client_state,
+ &m_client_meta, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_prepare_remote_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r < 0 ? m_remote_journaler == nullptr : m_remote_journaler != nullptr);
+ if (r < 0 && !m_local_image_id.empty() &&
+ m_local_image_tag_owner == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ // local image is primary -- fall-through
+ } else if (r == -ENOENT) {
+ dout(10) << "remote image does not exist" << dendl;
+
+ // TODO need to support multiple remote images
+ if (m_remote_image.image_id.empty() && !m_local_image_id.empty() &&
+ m_local_image_tag_owner == m_remote_image.mirror_uuid) {
+ // local image exists and is non-primary and linked to the missing
+ // remote image
+
+ m_delete_requested = true;
+ on_start_fail(0, "remote image no longer exists");
+ } else {
+ on_start_fail(-ENOENT, "remote image does not exist");
+ }
+ return;
+ } else if (r < 0) {
+ on_start_fail(r, "error retrieving remote image id");
+ return;
+ }
+
+ bootstrap();
+}
+
+template <typename I>
+void ImageReplayer<I>::bootstrap() {
+ dout(10) << dendl;
+
+ if (!m_local_image_id.empty() &&
+ m_local_image_tag_owner == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ dout(5) << "local image is primary" << dendl;
+ on_start_fail(0, "local image is primary");
+ return;
+ } else if (m_peers.empty()) {
+ dout(5) << "no peer clusters" << dendl;
+ on_start_fail(-ENOENT, "no peer clusters");
+ return;
+ }
+
+ BootstrapRequest<I> *request = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ if (on_start_interrupted(m_lock)) {
+ return;
+ }
+
+ auto ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_bootstrap>(this);
+ request = BootstrapRequest<I>::create(
+ m_threads, *m_local_ioctx, m_remote_image.io_ctx, m_instance_watcher,
+ &m_local_image_ctx, m_local_image_id, m_remote_image.image_id,
+ m_global_image_id, m_local_mirror_uuid, m_remote_image.mirror_uuid,
+ m_remote_journaler, &m_client_state, &m_client_meta, ctx,
+ &m_resync_requested, &m_progress_cxt);
+ request->get();
+ m_bootstrap_request = request;
+ }
+
+ update_mirror_image_status(false, boost::none);
+ reschedule_update_status_task(10);
+
+ request->send();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_bootstrap(int r) {
+ dout(10) << "r=" << r << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ m_bootstrap_request->put();
+ m_bootstrap_request = nullptr;
+ if (m_local_image_ctx) {
+ m_local_image_id = m_local_image_ctx->id;
+ }
+ }
+
+ if (on_start_interrupted()) {
+ return;
+ } else if (r == -EREMOTEIO) {
+ m_local_image_tag_owner = "";
+ dout(5) << "remote image is non-primary" << dendl;
+ on_start_fail(-EREMOTEIO, "remote image is non-primary");
+ return;
+ } else if (r == -EEXIST) {
+ m_local_image_tag_owner = "";
+ on_start_fail(r, "split-brain detected");
+ return;
+ } else if (r < 0) {
+ on_start_fail(r, "error bootstrapping replay");
+ return;
+ } else if (m_resync_requested) {
+ on_start_fail(0, "resync requested");
+ return;
+ }
+
+ ceph_assert(m_local_journal == nullptr);
+ {
+ RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock);
+ if (m_local_image_ctx->journal != nullptr) {
+ m_local_journal = m_local_image_ctx->journal;
+ m_local_journal->add_listener(m_journal_listener);
+ }
+ }
+
+ if (m_local_journal == nullptr) {
+ on_start_fail(-EINVAL, "error accessing local journal");
+ return;
+ }
+
+ update_mirror_image_status(false, boost::none);
+ init_remote_journaler();
+}
+
+template <typename I>
+void ImageReplayer<I>::init_remote_journaler() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_init_remote_journaler>(this);
+ m_remote_journaler->init(ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_init_remote_journaler(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (on_start_interrupted()) {
+ return;
+ } else if (r < 0) {
+ derr << "failed to initialize remote journal: " << cpp_strerror(r) << dendl;
+ on_start_fail(r, "error initializing remote journal");
+ return;
+ }
+
+ m_remote_journaler->add_listener(&m_remote_listener);
+
+ cls::journal::Client client;
+ r = m_remote_journaler->get_cached_client(m_local_mirror_uuid, &client);
+ if (r < 0) {
+ derr << "error retrieving remote journal client: " << cpp_strerror(r)
+ << dendl;
+ on_start_fail(r, "error retrieving remote journal client");
+ return;
+ }
+
+ dout(5) << "image_id=" << m_local_image_id << ", "
+ << "client_meta.image_id=" << m_client_meta.image_id << ", "
+ << "client.state=" << client.state << dendl;
+ if (m_client_meta.image_id == m_local_image_id &&
+ client.state != cls::journal::CLIENT_STATE_CONNECTED) {
+ dout(5) << "client flagged disconnected, stopping image replay" << dendl;
+ if (m_local_image_ctx->config.template get_val<bool>("rbd_mirroring_resync_after_disconnect")) {
+ m_resync_requested = true;
+ on_start_fail(-ENOTCONN, "disconnected: automatic resync");
+ } else {
+ on_start_fail(-ENOTCONN, "disconnected");
+ }
+ return;
+ }
+
+ start_replay();
+}
+
+template <typename I>
+void ImageReplayer<I>::start_replay() {
+ dout(10) << dendl;
+
+ Context *start_ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_start_replay>(this);
+ m_local_journal->start_external_replay(&m_local_replay, start_ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_start_replay(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ ceph_assert(m_local_replay == nullptr);
+ derr << "error starting external replay on local image "
+ << m_local_image_id << ": " << cpp_strerror(r) << dendl;
+ on_start_fail(r, "error starting replay on local image");
+ return;
+ }
+
+ m_replay_status_formatter =
+ ReplayStatusFormatter<I>::create(m_remote_journaler, m_local_mirror_uuid);
+
+ Context *on_finish(nullptr);
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_STARTING);
+ m_state = STATE_REPLAYING;
+ std::swap(m_on_start_finish, on_finish);
+ }
+
+ m_event_preprocessor = EventPreprocessor<I>::create(
+ *m_local_image_ctx, *m_remote_journaler, m_local_mirror_uuid,
+ &m_client_meta, m_threads->work_queue);
+
+ update_mirror_image_status(true, boost::none);
+ reschedule_update_status_task(30);
+
+ if (on_replay_interrupted()) {
+ return;
+ }
+
+ {
+ CephContext *cct = static_cast<CephContext *>(m_local->cct());
+ double poll_seconds = cct->_conf.get_val<double>(
+ "rbd_mirror_journal_poll_age");
+
+ Mutex::Locker locker(m_lock);
+ m_replay_handler = new ReplayHandler<I>(this);
+ m_remote_journaler->start_live_replay(m_replay_handler, poll_seconds);
+
+ dout(10) << "m_remote_journaler=" << *m_remote_journaler << dendl;
+ }
+
+ dout(10) << "start succeeded" << dendl;
+ if (on_finish != nullptr) {
+ dout(10) << "on finish complete, r=" << r << dendl;
+ on_finish->complete(r);
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::on_start_fail(int r, const std::string &desc)
+{
+ dout(10) << "r=" << r << dendl;
+ Context *ctx = new FunctionContext([this, r, desc](int _r) {
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_STARTING);
+ m_state = STATE_STOPPING;
+ if (r < 0 && r != -ECANCELED && r != -EREMOTEIO && r != -ENOENT) {
+ derr << "start failed: " << cpp_strerror(r) << dendl;
+ } else {
+ dout(10) << "start canceled" << dendl;
+ }
+ }
+
+ set_state_description(r, desc);
+ if (m_local_ioctx) {
+ update_mirror_image_status(false, boost::none);
+ }
+ reschedule_update_status_task(-1);
+ shut_down(r);
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+bool ImageReplayer<I>::on_start_interrupted() {
+ Mutex::Locker locker(m_lock);
+ return on_start_interrupted(m_lock);
+}
+
+template <typename I>
+bool ImageReplayer<I>::on_start_interrupted(Mutex& lock) {
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_state == STATE_STARTING);
+ if (!m_stop_requested) {
+ return false;
+ }
+
+ on_start_fail(-ECANCELED, "");
+ return true;
+}
+
+template <typename I>
+void ImageReplayer<I>::stop(Context *on_finish, bool manual, int r,
+ const std::string& desc)
+{
+ dout(10) << "on_finish=" << on_finish << ", manual=" << manual
+ << ", desc=" << desc << dendl;
+
+ image_replayer::BootstrapRequest<I> *bootstrap_request = nullptr;
+ bool shut_down_replay = false;
+ bool running = true;
+ {
+ Mutex::Locker locker(m_lock);
+
+ if (!is_running_()) {
+ running = false;
+ } else {
+ if (!is_stopped_()) {
+ if (m_state == STATE_STARTING) {
+ dout(10) << "canceling start" << dendl;
+ if (m_bootstrap_request != nullptr) {
+ bootstrap_request = m_bootstrap_request;
+ bootstrap_request->get();
+ }
+ } else {
+ dout(10) << "interrupting replay" << dendl;
+ shut_down_replay = true;
+ }
+
+ ceph_assert(m_on_stop_finish == nullptr);
+ std::swap(m_on_stop_finish, on_finish);
+ m_stop_requested = true;
+ m_manual_stop = manual;
+ }
+ }
+ }
+
+ // avoid holding lock since bootstrap request will update status
+ if (bootstrap_request != nullptr) {
+ dout(10) << "canceling bootstrap" << dendl;
+ bootstrap_request->cancel();
+ bootstrap_request->put();
+ }
+
+ if (!running) {
+ dout(20) << "not running" << dendl;
+ if (on_finish) {
+ on_finish->complete(-EINVAL);
+ }
+ return;
+ }
+
+ if (shut_down_replay) {
+ on_stop_journal_replay(r, desc);
+ } else if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::on_stop_journal_replay(int r, const std::string &desc)
+{
+ dout(10) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_state != STATE_REPLAYING) {
+ // might be invoked multiple times while stopping
+ return;
+ }
+ m_stop_requested = true;
+ m_state = STATE_STOPPING;
+ }
+
+ set_state_description(r, desc);
+ update_mirror_image_status(true, boost::none);
+ reschedule_update_status_task(-1);
+ shut_down(0);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_replay_ready()
+{
+ dout(20) << dendl;
+ if (on_replay_interrupted()) {
+ return;
+ }
+
+ if (!m_remote_journaler->try_pop_front(&m_replay_entry, &m_replay_tag_tid)) {
+ return;
+ }
+
+ m_event_replay_tracker.start_op();
+
+ m_lock.Lock();
+ bool stopping = (m_state == STATE_STOPPING);
+ m_lock.Unlock();
+
+ if (stopping) {
+ dout(10) << "stopping event replay" << dendl;
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ if (m_replay_tag_valid && m_replay_tag.tid == m_replay_tag_tid) {
+ preprocess_entry();
+ return;
+ }
+
+ replay_flush();
+}
+
+template <typename I>
+void ImageReplayer<I>::restart(Context *on_finish)
+{
+ FunctionContext *ctx = new FunctionContext(
+ [this, on_finish](int r) {
+ if (r < 0) {
+ // Try start anyway.
+ }
+ start(on_finish, true);
+ });
+ stop(ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::flush()
+{
+ dout(10) << dendl;
+ C_SaferCond ctx;
+ flush_local_replay(&ctx);
+ ctx.wait();
+
+ update_mirror_image_status(false, boost::none);
+}
+
+template <typename I>
+void ImageReplayer<I>::flush_local_replay(Context* on_flush)
+{
+ m_lock.Lock();
+ if (m_state != STATE_REPLAYING) {
+ m_lock.Unlock();
+ on_flush->complete(0);
+ return;
+ }
+
+ dout(15) << dendl;
+ auto ctx = new FunctionContext(
+ [this, on_flush](int r) {
+ handle_flush_local_replay(on_flush, r);
+ });
+ m_local_replay->flush(ctx);
+ m_lock.Unlock();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_flush_local_replay(Context* on_flush, int r)
+{
+ dout(15) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "error flushing local replay: " << cpp_strerror(r) << dendl;
+ on_flush->complete(r);
+ return;
+ }
+
+ flush_commit_position(on_flush);
+}
+
+template <typename I>
+void ImageReplayer<I>::flush_commit_position(Context* on_flush)
+{
+ m_lock.Lock();
+ if (m_state != STATE_REPLAYING) {
+ m_lock.Unlock();
+ on_flush->complete(0);
+ return;
+ }
+
+ dout(15) << dendl;
+ auto ctx = new FunctionContext(
+ [this, on_flush](int r) {
+ handle_flush_commit_position(on_flush, r);
+ });
+ m_remote_journaler->flush_commit_position(ctx);
+ m_lock.Unlock();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_flush_commit_position(Context* on_flush, int r)
+{
+ dout(15) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "error flushing remote journal commit position: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ on_flush->complete(r);
+}
+
+template <typename I>
+bool ImageReplayer<I>::on_replay_interrupted()
+{
+ bool shut_down;
+ {
+ Mutex::Locker locker(m_lock);
+ shut_down = m_stop_requested;
+ }
+
+ if (shut_down) {
+ on_stop_journal_replay();
+ }
+ return shut_down;
+}
+
+template <typename I>
+void ImageReplayer<I>::print_status(Formatter *f, stringstream *ss)
+{
+ dout(10) << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ if (f) {
+ f->open_object_section("image_replayer");
+ f->dump_string("name", m_name);
+ f->dump_string("state", to_string(m_state));
+ f->close_section();
+ f->flush(*ss);
+ } else {
+ *ss << m_name << ": state: " << to_string(m_state);
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_replay_complete(int r, const std::string &error_desc)
+{
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "replay encountered an error: " << cpp_strerror(r) << dendl;
+ }
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_stop_requested = true;
+ }
+ on_stop_journal_replay(r, error_desc);
+}
+
+template <typename I>
+void ImageReplayer<I>::replay_flush() {
+ dout(10) << dendl;
+
+ bool interrupted = false;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_state != STATE_REPLAYING) {
+ dout(10) << "replay interrupted" << dendl;
+ interrupted = true;
+ } else {
+ m_state = STATE_REPLAY_FLUSHING;
+ }
+ }
+
+ if (interrupted) {
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ // shut down the replay to flush all IO and ops and create a new
+ // replayer to handle the new tag epoch
+ Context *ctx = create_context_callback<
+ ImageReplayer<I>, &ImageReplayer<I>::handle_replay_flush>(this);
+ ctx = new FunctionContext([this, ctx](int r) {
+ m_local_image_ctx->journal->stop_external_replay();
+ m_local_replay = nullptr;
+
+ if (r < 0) {
+ ctx->complete(r);
+ return;
+ }
+
+ m_local_journal->start_external_replay(&m_local_replay, ctx);
+ });
+ m_local_replay->shut_down(false, ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_replay_flush(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_REPLAY_FLUSHING);
+ m_state = STATE_REPLAYING;
+ }
+
+ if (r < 0) {
+ derr << "replay flush encountered an error: " << cpp_strerror(r) << dendl;
+ m_event_replay_tracker.finish_op();
+ handle_replay_complete(r, "replay flush encountered an error");
+ return;
+ } else if (on_replay_interrupted()) {
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ get_remote_tag();
+}
+
+template <typename I>
+void ImageReplayer<I>::get_remote_tag() {
+ dout(15) << "tag_tid: " << m_replay_tag_tid << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_get_remote_tag>(this);
+ m_remote_journaler->get_tag(m_replay_tag_tid, &m_replay_tag, ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_get_remote_tag(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r == 0) {
+ try {
+ auto it = m_replay_tag.data.cbegin();
+ decode(m_replay_tag_data, it);
+ } catch (const buffer::error &err) {
+ r = -EBADMSG;
+ }
+ }
+
+ if (r < 0) {
+ derr << "failed to retrieve remote tag " << m_replay_tag_tid << ": "
+ << cpp_strerror(r) << dendl;
+ m_event_replay_tracker.finish_op();
+ handle_replay_complete(r, "failed to retrieve remote tag");
+ return;
+ }
+
+ m_replay_tag_valid = true;
+ dout(15) << "decoded remote tag " << m_replay_tag_tid << ": "
+ << m_replay_tag_data << dendl;
+
+ allocate_local_tag();
+}
+
+template <typename I>
+void ImageReplayer<I>::allocate_local_tag() {
+ dout(15) << dendl;
+
+ std::string mirror_uuid = m_replay_tag_data.mirror_uuid;
+ if (mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ mirror_uuid = m_remote_image.mirror_uuid;
+ } else if (mirror_uuid == m_local_mirror_uuid) {
+ mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
+ } else if (mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+ // handle possible edge condition where the daemon can fail over and
+ // the local image has already been promoted/demoted
+ auto local_tag_data = m_local_journal->get_tag_data();
+ if (local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+ (local_tag_data.predecessor.commit_valid &&
+ local_tag_data.predecessor.mirror_uuid ==
+ librbd::Journal<>::LOCAL_MIRROR_UUID)) {
+ dout(15) << "skipping stale demotion event" << dendl;
+ handle_process_entry_safe(m_replay_entry, m_replay_start_time, 0);
+ handle_replay_ready();
+ return;
+ } else {
+ dout(5) << "encountered image demotion: stopping" << dendl;
+ Mutex::Locker locker(m_lock);
+ m_stop_requested = true;
+ }
+ }
+
+ librbd::journal::TagPredecessor predecessor(m_replay_tag_data.predecessor);
+ if (predecessor.mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ predecessor.mirror_uuid = m_remote_image.mirror_uuid;
+ } else if (predecessor.mirror_uuid == m_local_mirror_uuid) {
+ predecessor.mirror_uuid = librbd::Journal<>::LOCAL_MIRROR_UUID;
+ }
+
+ dout(15) << "mirror_uuid=" << mirror_uuid << ", "
+ << "predecessor=" << predecessor << ", "
+ << "replay_tag_tid=" << m_replay_tag_tid << dendl;
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_allocate_local_tag>(this);
+ m_local_journal->allocate_tag(mirror_uuid, predecessor, ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_allocate_local_tag(int r) {
+ dout(15) << "r=" << r << ", "
+ << "tag_tid=" << m_local_journal->get_tag_tid() << dendl;
+
+ if (r < 0) {
+ derr << "failed to allocate journal tag: " << cpp_strerror(r) << dendl;
+ m_event_replay_tracker.finish_op();
+ handle_replay_complete(r, "failed to allocate journal tag");
+ return;
+ }
+
+ preprocess_entry();
+}
+
+template <typename I>
+void ImageReplayer<I>::preprocess_entry() {
+ dout(20) << "preprocessing entry tid=" << m_replay_entry.get_commit_tid()
+ << dendl;
+
+ bufferlist data = m_replay_entry.get_data();
+ auto it = data.cbegin();
+ int r = m_local_replay->decode(&it, &m_event_entry);
+ if (r < 0) {
+ derr << "failed to decode journal event" << dendl;
+ m_event_replay_tracker.finish_op();
+ handle_replay_complete(r, "failed to decode journal event");
+ return;
+ }
+
+ uint32_t delay = calculate_replay_delay(
+ m_event_entry.timestamp, m_local_image_ctx->mirroring_replay_delay);
+ if (delay == 0) {
+ handle_preprocess_entry_ready(0);
+ return;
+ }
+
+ dout(20) << "delaying replay by " << delay << " sec" << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ ceph_assert(m_delayed_preprocess_task == nullptr);
+ m_delayed_preprocess_task = new FunctionContext(
+ [this](int r) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ m_delayed_preprocess_task = nullptr;
+ m_threads->work_queue->queue(
+ create_context_callback<ImageReplayer,
+ &ImageReplayer<I>::handle_preprocess_entry_ready>(this), 0);
+ });
+ m_threads->timer->add_event_after(delay, m_delayed_preprocess_task);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_preprocess_entry_ready(int r) {
+ dout(20) << "r=" << r << dendl;
+ ceph_assert(r == 0);
+
+ m_replay_start_time = ceph_clock_now();
+ if (!m_event_preprocessor->is_required(m_event_entry)) {
+ process_entry();
+ return;
+ }
+
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_preprocess_entry_safe>(this);
+ m_event_preprocessor->preprocess(&m_event_entry, ctx);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_preprocess_entry_safe(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_event_replay_tracker.finish_op();
+
+ if (r == -ECANCELED) {
+ handle_replay_complete(0, "lost exclusive lock");
+ } else {
+ derr << "failed to preprocess journal event" << dendl;
+ handle_replay_complete(r, "failed to preprocess journal event");
+ }
+ return;
+ }
+
+ process_entry();
+}
+
+template <typename I>
+void ImageReplayer<I>::process_entry() {
+ dout(20) << "processing entry tid=" << m_replay_entry.get_commit_tid()
+ << dendl;
+
+ // stop replaying events if stop has been requested
+ if (on_replay_interrupted()) {
+ m_event_replay_tracker.finish_op();
+ return;
+ }
+
+ Context *on_ready = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_process_entry_ready>(this);
+ Context *on_commit = new C_ReplayCommitted(this, std::move(m_replay_entry),
+ m_replay_start_time);
+
+ m_local_replay->process(m_event_entry, on_ready, on_commit);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_process_entry_ready(int r) {
+ dout(20) << dendl;
+ ceph_assert(r == 0);
+
+ bool update_status = false;
+ {
+ RWLock::RLocker snap_locker(m_local_image_ctx->snap_lock);
+ if (m_local_image_name != m_local_image_ctx->name) {
+ m_local_image_name = m_local_image_ctx->name;
+ update_status = true;
+ }
+ }
+
+ if (update_status) {
+ reschedule_update_status_task(0);
+ }
+
+ // attempt to process the next event
+ handle_replay_ready();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_process_entry_safe(const ReplayEntry &replay_entry,
+ const utime_t &replay_start_time,
+ int r) {
+ dout(20) << "commit_tid=" << replay_entry.get_commit_tid() << ", r=" << r
+ << dendl;
+
+ if (r < 0) {
+ derr << "failed to commit journal event: " << cpp_strerror(r) << dendl;
+ handle_replay_complete(r, "failed to commit journal event");
+ } else {
+ ceph_assert(m_remote_journaler != nullptr);
+ m_remote_journaler->committed(replay_entry);
+ }
+
+ auto bytes = replay_entry.get_data().length();
+ auto latency = ceph_clock_now() - replay_start_time;
+
+ if (g_perf_counters) {
+ g_perf_counters->inc(l_rbd_mirror_replay);
+ g_perf_counters->inc(l_rbd_mirror_replay_bytes, bytes);
+ g_perf_counters->tinc(l_rbd_mirror_replay_latency, latency);
+ }
+
+ auto ctx = new FunctionContext(
+ [this, bytes, latency](int r) {
+ Mutex::Locker locker(m_lock);
+ if (m_perf_counters) {
+ m_perf_counters->inc(l_rbd_mirror_replay);
+ m_perf_counters->inc(l_rbd_mirror_replay_bytes, bytes);
+ m_perf_counters->tinc(l_rbd_mirror_replay_latency, latency);
+ }
+ m_event_replay_tracker.finish_op();
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+bool ImageReplayer<I>::update_mirror_image_status(bool force,
+ const OptionalState &state) {
+ dout(15) << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ if (!start_mirror_image_status_update(force, false)) {
+ return false;
+ }
+ }
+
+ queue_mirror_image_status_update(state);
+ return true;
+}
+
+template <typename I>
+bool ImageReplayer<I>::start_mirror_image_status_update(bool force,
+ bool restarting) {
+ ceph_assert(m_lock.is_locked());
+
+ if (!force && !is_stopped_()) {
+ if (!is_running_()) {
+ dout(15) << "shut down in-progress: ignoring update" << dendl;
+ return false;
+ } else if (m_in_flight_status_updates > (restarting ? 1 : 0)) {
+ dout(15) << "already sending update" << dendl;
+ m_update_status_requested = true;
+ return false;
+ }
+ }
+
+ ++m_in_flight_status_updates;
+ dout(15) << "in-flight updates=" << m_in_flight_status_updates << dendl;
+ return true;
+}
+
+template <typename I>
+void ImageReplayer<I>::finish_mirror_image_status_update() {
+ reregister_admin_socket_hook();
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_in_flight_status_updates > 0);
+ if (--m_in_flight_status_updates > 0) {
+ dout(15) << "waiting on " << m_in_flight_status_updates << " in-flight "
+ << "updates" << dendl;
+ return;
+ }
+
+ std::swap(on_finish, m_on_update_status_finish);
+ }
+
+ dout(15) << dendl;
+ if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::queue_mirror_image_status_update(const OptionalState &state) {
+ dout(15) << dendl;
+
+ auto ctx = new FunctionContext(
+ [this, state](int r) {
+ send_mirror_status_update(state);
+ });
+
+ // ensure pending IO is flushed and the commit position is updated
+ // prior to updating the mirror status
+ ctx = new FunctionContext(
+ [this, ctx](int r) {
+ flush_local_replay(ctx);
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void ImageReplayer<I>::send_mirror_status_update(const OptionalState &opt_state) {
+ State state;
+ std::string state_desc;
+ int last_r;
+ bool stopping_replay;
+
+ OptionalMirrorImageStatusState mirror_image_status_state =
+ boost::make_optional(false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+ image_replayer::BootstrapRequest<I>* bootstrap_request = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ state = m_state;
+ state_desc = m_state_desc;
+ mirror_image_status_state = m_mirror_image_status_state;
+ last_r = m_last_r;
+ stopping_replay = (m_local_image_ctx != nullptr);
+
+ if (m_bootstrap_request != nullptr) {
+ bootstrap_request = m_bootstrap_request;
+ bootstrap_request->get();
+ }
+ }
+
+ bool syncing = false;
+ if (bootstrap_request != nullptr) {
+ syncing = bootstrap_request->is_syncing();
+ bootstrap_request->put();
+ bootstrap_request = nullptr;
+ }
+
+ if (opt_state) {
+ state = *opt_state;
+ }
+
+ cls::rbd::MirrorImageStatus status;
+ status.up = true;
+ switch (state) {
+ case STATE_STARTING:
+ if (syncing) {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_SYNCING;
+ status.description = state_desc.empty() ? "syncing" : state_desc;
+ mirror_image_status_state = status.state;
+ } else {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY;
+ status.description = "starting replay";
+ }
+ break;
+ case STATE_REPLAYING:
+ case STATE_REPLAY_FLUSHING:
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_REPLAYING;
+ {
+ Context *on_req_finish = new FunctionContext(
+ [this](int r) {
+ dout(15) << "replay status ready: r=" << r << dendl;
+ if (r >= 0) {
+ send_mirror_status_update(boost::none);
+ } else if (r == -EAGAIN) {
+ // decrement in-flight status update counter
+ handle_mirror_status_update(r);
+ }
+ });
+
+ std::string desc;
+ ceph_assert(m_replay_status_formatter != nullptr);
+ if (!m_replay_status_formatter->get_or_send_update(&desc,
+ on_req_finish)) {
+ dout(15) << "waiting for replay status" << dendl;
+ return;
+ }
+ status.description = "replaying, " + desc;
+ mirror_image_status_state = boost::make_optional(
+ false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+ }
+ break;
+ case STATE_STOPPING:
+ if (stopping_replay) {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY;
+ status.description = state_desc.empty() ? "stopping replay" : state_desc;
+ break;
+ }
+ // FALLTHROUGH
+ case STATE_STOPPED:
+ if (last_r == -EREMOTEIO) {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN;
+ status.description = state_desc;
+ mirror_image_status_state = status.state;
+ } else if (last_r < 0) {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR;
+ status.description = state_desc;
+ mirror_image_status_state = status.state;
+ } else {
+ status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPED;
+ status.description = state_desc.empty() ? "stopped" : state_desc;
+ mirror_image_status_state = boost::none;
+ }
+ break;
+ default:
+ ceph_assert(!"invalid state");
+ }
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_mirror_image_status_state = mirror_image_status_state;
+ }
+
+ // prevent the status from ping-ponging when failed replays are restarted
+ if (mirror_image_status_state &&
+ *mirror_image_status_state == cls::rbd::MIRROR_IMAGE_STATUS_STATE_ERROR) {
+ status.state = *mirror_image_status_state;
+ }
+
+ dout(15) << "status=" << status << dendl;
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_status_set(&op, m_global_image_id, status);
+
+ ceph_assert(m_local_ioctx);
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ ImageReplayer<I>, &ImageReplayer<I>::handle_mirror_status_update>(this);
+ int r = m_local_ioctx->aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_mirror_status_update(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ bool running = false;
+ bool started = false;
+ {
+ Mutex::Locker locker(m_lock);
+ bool update_status_requested = false;
+ std::swap(update_status_requested, m_update_status_requested);
+
+ running = is_running_();
+ if (running && update_status_requested) {
+ started = start_mirror_image_status_update(false, true);
+ }
+ }
+
+ // if a deferred update is available, send it -- otherwise reschedule
+ // the timer task
+ if (started) {
+ queue_mirror_image_status_update(boost::none);
+ } else if (running) {
+ reschedule_update_status_task(0);
+ }
+
+ // mark committed status update as no longer in-flight
+ finish_mirror_image_status_update();
+}
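The status-update helpers above implement a coalescing scheme: at most one mirror-status update is in flight, and requests arriving while one is outstanding only set a flag so that a single follow-up is sent when the current update completes. A minimal, self-contained sketch of that coalescing logic is shown here; the names are hypothetical and no Ceph types are used.

#include <iostream>
#include <mutex>

// Coalesces concurrent update requests into at most one in-flight + one pending.
class StatusUpdater {
 public:
  // returns true if the caller should actually send an update now
  bool request_update() {
    std::lock_guard<std::mutex> l(m_lock);
    if (m_in_flight > 0) {
      m_update_requested = true;  // coalesce: piggyback on the in-flight update
      return false;
    }
    ++m_in_flight;
    return true;
  }

  // returns true if a deferred update should be sent right after this one
  bool handle_update_complete() {
    std::lock_guard<std::mutex> l(m_lock);
    --m_in_flight;
    bool requested = m_update_requested;
    m_update_requested = false;
    if (requested) {
      ++m_in_flight;  // the follow-up update is now in flight
    }
    return requested;
  }

 private:
  std::mutex m_lock;
  int m_in_flight = 0;
  bool m_update_requested = false;
};

int main() {
  StatusUpdater u;
  std::cout << u.request_update() << "\n";          // 1: send now
  std::cout << u.request_update() << "\n";          // 0: deferred
  std::cout << u.request_update() << "\n";          // 0: still coalesced into one
  std::cout << u.handle_update_complete() << "\n";  // 1: send the single follow-up
  std::cout << u.handle_update_complete() << "\n";  // 0: nothing pending
}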
+
+template <typename I>
+void ImageReplayer<I>::reschedule_update_status_task(int new_interval) {
+ bool canceled_task = false;
+ {
+ Mutex::Locker locker(m_lock);
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+
+ if (m_update_status_task) {
+ dout(15) << "canceling existing status update task" << dendl;
+
+ canceled_task = m_threads->timer->cancel_event(m_update_status_task);
+ m_update_status_task = nullptr;
+ }
+
+ if (new_interval > 0) {
+ m_update_status_interval = new_interval;
+ }
+
+ if (new_interval >= 0 && is_running_() &&
+ start_mirror_image_status_update(true, false)) {
+ m_update_status_task = new FunctionContext(
+ [this](int r) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ m_update_status_task = nullptr;
+
+ queue_mirror_image_status_update(boost::none);
+ });
+ dout(15) << "scheduling status update task after "
+ << m_update_status_interval << " seconds" << dendl;
+ m_threads->timer->add_event_after(m_update_status_interval,
+ m_update_status_task);
+ }
+ }
+
+ if (canceled_task) {
+ // decrement in-flight status update counter for canceled task
+ finish_mirror_image_status_update();
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::shut_down(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ bool canceled_delayed_preprocess_task = false;
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ if (m_delayed_preprocess_task != nullptr) {
+ canceled_delayed_preprocess_task = m_threads->timer->cancel_event(
+ m_delayed_preprocess_task);
+ ceph_assert(canceled_delayed_preprocess_task);
+ m_delayed_preprocess_task = nullptr;
+ }
+ }
+ if (canceled_delayed_preprocess_task) {
+ // wake up sleeping replay
+ m_event_replay_tracker.finish_op();
+ }
+
+ reschedule_update_status_task(-1);
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_state == STATE_STOPPING);
+
+ // if status updates are in-flight, wait for them to complete
+ // before proceeding
+ if (m_in_flight_status_updates > 0) {
+ if (m_on_update_status_finish == nullptr) {
+ dout(15) << "waiting for in-flight status update" << dendl;
+ m_on_update_status_finish = new FunctionContext(
+ [this, r](int _r) {
+ shut_down(r);
+ });
+ }
+ return;
+ }
+ }
+
+ // NOTE: it's important to ensure that the local image is fully
+ // closed before attempting to close the remote journal in
+ // case the remote cluster is unreachable
+
+ // chain the shut down sequence (reverse order)
+ Context *ctx = new FunctionContext(
+ [this, r](int _r) {
+ if (m_local_ioctx) {
+ update_mirror_image_status(true, STATE_STOPPED);
+ }
+ handle_shut_down(r);
+ });
+
+ // close the remote journal
+ if (m_remote_journaler != nullptr) {
+ ctx = new FunctionContext([this, ctx](int r) {
+ delete m_remote_journaler;
+ m_remote_journaler = nullptr;
+ ctx->complete(0);
+ });
+ ctx = new FunctionContext([this, ctx](int r) {
+ m_remote_journaler->remove_listener(&m_remote_listener);
+ m_remote_journaler->shut_down(ctx);
+ });
+ }
+
+ // stop the replay of remote journal events
+ if (m_replay_handler != nullptr) {
+ ctx = new FunctionContext([this, ctx](int r) {
+ delete m_replay_handler;
+ m_replay_handler = nullptr;
+
+ m_event_replay_tracker.wait_for_ops(ctx);
+ });
+ ctx = new FunctionContext([this, ctx](int r) {
+ m_remote_journaler->stop_replay(ctx);
+ });
+ }
+
+ // close the local image (release exclusive lock)
+ if (m_local_image_ctx) {
+ ctx = new FunctionContext([this, ctx](int r) {
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ &m_local_image_ctx, ctx);
+ request->send();
+ });
+ }
+
+ // shut down event replay into the local image
+ if (m_local_journal != nullptr) {
+ ctx = new FunctionContext([this, ctx](int r) {
+ m_local_journal = nullptr;
+ ctx->complete(0);
+ });
+ if (m_local_replay != nullptr) {
+ ctx = new FunctionContext([this, ctx](int r) {
+ m_local_journal->stop_external_replay();
+ m_local_replay = nullptr;
+
+ EventPreprocessor<I>::destroy(m_event_preprocessor);
+ m_event_preprocessor = nullptr;
+ ctx->complete(0);
+ });
+ }
+ ctx = new FunctionContext([this, ctx](int r) {
+ // blocks if listener notification is in-progress
+ m_local_journal->remove_listener(m_journal_listener);
+ ctx->complete(0);
+ });
+ }
+
+ // wait for all local in-flight replay events to complete
+ ctx = new FunctionContext([this, ctx](int r) {
+ if (r < 0) {
+ derr << "error shutting down journal replay: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ m_event_replay_tracker.wait_for_ops(ctx);
+ });
+
+ // flush any local in-flight replay events
+ if (m_local_replay != nullptr) {
+ ctx = new FunctionContext([this, ctx](int r) {
+ m_local_replay->shut_down(true, ctx);
+ });
+ }
+
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_shut_down(int r) {
+ reschedule_update_status_task(-1);
+
+ bool resync_requested = false;
+ bool delete_requested = false;
+ bool unregister_asok_hook = false;
+ {
+ Mutex::Locker locker(m_lock);
+
+ // if status updates are in-flight, wait for them to complete
+ // before proceeding
+ if (m_in_flight_status_updates > 0) {
+ if (m_on_update_status_finish == nullptr) {
+ dout(15) << "waiting for in-flight status update" << dendl;
+ m_on_update_status_finish = new FunctionContext(
+ [this, r](int _r) {
+ handle_shut_down(r);
+ });
+ }
+ return;
+ }
+
+ if (m_delete_requested && !m_local_image_id.empty()) {
+ ceph_assert(m_remote_image.image_id.empty());
+ dout(0) << "remote image no longer exists: scheduling deletion" << dendl;
+ unregister_asok_hook = true;
+ std::swap(delete_requested, m_delete_requested);
+ }
+
+ std::swap(resync_requested, m_resync_requested);
+ if (delete_requested || resync_requested) {
+ m_local_image_id = "";
+ } else if (m_last_r == -ENOENT &&
+ m_local_image_id.empty() && m_remote_image.image_id.empty()) {
+ dout(0) << "mirror image no longer exists" << dendl;
+ unregister_asok_hook = true;
+ m_finished = true;
+ }
+ }
+
+ if (unregister_asok_hook) {
+ unregister_admin_socket_hook();
+ }
+
+ if (delete_requested || resync_requested) {
+ dout(5) << "moving image to trash" << dendl;
+ auto ctx = new FunctionContext([this, r](int) {
+ handle_shut_down(r);
+ });
+ ImageDeleter<I>::trash_move(*m_local_ioctx, m_global_image_id,
+ resync_requested, m_threads->work_queue, ctx);
+ return;
+ }
+
+ dout(10) << "stop complete" << dendl;
+ ReplayStatusFormatter<I>::destroy(m_replay_status_formatter);
+ m_replay_status_formatter = nullptr;
+
+ Context *on_start = nullptr;
+ Context *on_stop = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(on_start, m_on_start_finish);
+ std::swap(on_stop, m_on_stop_finish);
+ m_stop_requested = false;
+ ceph_assert(m_delayed_preprocess_task == nullptr);
+ ceph_assert(m_state == STATE_STOPPING);
+ m_state = STATE_STOPPED;
+ }
+
+ if (on_start != nullptr) {
+ dout(10) << "on start finish complete, r=" << r << dendl;
+ on_start->complete(r);
+ r = 0;
+ }
+ if (on_stop != nullptr) {
+ dout(10) << "on stop finish complete, r=" << r << dendl;
+ on_stop->complete(r);
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_remote_journal_metadata_updated() {
+ dout(20) << dendl;
+
+ cls::journal::Client client;
+ {
+ Mutex::Locker locker(m_lock);
+ if (!is_running_()) {
+ return;
+ }
+
+ int r = m_remote_journaler->get_cached_client(m_local_mirror_uuid, &client);
+ if (r < 0) {
+ derr << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ return;
+ }
+ }
+
+ if (client.state != cls::journal::CLIENT_STATE_CONNECTED) {
+ dout(0) << "client flagged disconnected, stopping image replay" << dendl;
+ stop(nullptr, false, -ENOTCONN, "disconnected");
+ }
+}
+
+template <typename I>
+std::string ImageReplayer<I>::to_string(const State state) {
+ switch (state) {
+ case ImageReplayer<I>::STATE_STARTING:
+ return "Starting";
+ case ImageReplayer<I>::STATE_REPLAYING:
+ return "Replaying";
+ case ImageReplayer<I>::STATE_REPLAY_FLUSHING:
+ return "ReplayFlushing";
+ case ImageReplayer<I>::STATE_STOPPING:
+ return "Stopping";
+ case ImageReplayer<I>::STATE_STOPPED:
+ return "Stopped";
+ default:
+ break;
+ }
+ return "Unknown(" + stringify(state) + ")";
+}
+
+template <typename I>
+void ImageReplayer<I>::resync_image(Context *on_finish) {
+ dout(10) << dendl;
+
+ m_resync_requested = true;
+ stop(on_finish);
+}
+
+template <typename I>
+void ImageReplayer<I>::register_admin_socket_hook() {
+ ImageReplayerAdminSocketHook<I> *asok_hook;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_asok_hook != nullptr) {
+ return;
+ }
+
+ ceph_assert(m_perf_counters == nullptr);
+
+ dout(15) << "registered asok hook: " << m_name << dendl;
+ asok_hook = new ImageReplayerAdminSocketHook<I>(g_ceph_context, m_name,
+ this);
+ int r = asok_hook->register_commands();
+ if (r == 0) {
+ m_asok_hook = asok_hook;
+
+ CephContext *cct = static_cast<CephContext *>(m_local->cct());
+ auto prio = cct->_conf.get_val<int64_t>("rbd_mirror_perf_stats_prio");
+ PerfCountersBuilder plb(g_ceph_context, "rbd_mirror_" + m_name,
+ l_rbd_mirror_first, l_rbd_mirror_last);
+ plb.add_u64_counter(l_rbd_mirror_replay, "replay", "Replays", "r", prio);
+ plb.add_u64_counter(l_rbd_mirror_replay_bytes, "replay_bytes",
+ "Replayed data", "rb", prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(l_rbd_mirror_replay_latency, "replay_latency",
+ "Replay latency", "rl", prio);
+ m_perf_counters = plb.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(m_perf_counters);
+
+ return;
+ }
+ derr << "error registering admin socket commands" << dendl;
+ }
+ delete asok_hook;
+}
+
+template <typename I>
+void ImageReplayer<I>::unregister_admin_socket_hook() {
+ dout(15) << dendl;
+
+ AdminSocketHook *asok_hook = nullptr;
+ PerfCounters *perf_counters = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(asok_hook, m_asok_hook);
+ std::swap(perf_counters, m_perf_counters);
+ }
+ delete asok_hook;
+ if (perf_counters != nullptr) {
+ g_ceph_context->get_perfcounters_collection()->remove(perf_counters);
+ delete perf_counters;
+ }
+}
+
+template <typename I>
+void ImageReplayer<I>::reregister_admin_socket_hook() {
+ {
+ Mutex::Locker locker(m_lock);
+ auto name = m_local_ioctx->get_pool_name() + "/" + m_local_image_name;
+ if (m_asok_hook != nullptr && m_name == name) {
+ return;
+ }
+ m_name = name;
+ }
+ unregister_admin_socket_hook();
+ register_admin_socket_hook();
+}
+
+template <typename I>
+std::ostream &operator<<(std::ostream &os, const ImageReplayer<I> &replayer)
+{
+ os << "ImageReplayer: " << &replayer << " [" << replayer.get_local_pool_id()
+ << "/" << replayer.get_global_image_id() << "]";
+ return os;
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageReplayer<librbd::ImageCtx>;
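The shut_down() chain above is built in reverse: each FunctionContext wraps the previously built context and completes it when its own step is done, so at run time the steps execute in the opposite order from which they appear in the source (the local replay is shut down and drained first, then the local image is closed, then the remote journaler is torn down, and only then is the final STOPPED status published). A minimal sketch of the same pattern, using only the Context/FunctionContext helpers already used in this file; the step comments and the work_queue variable are illustrative, not part of the upstream source:

  Context *ctx = new FunctionContext([](int r) {
    // step 3: runs last (e.g. the final status update)
  });
  ctx = new FunctionContext([ctx](int r) {
    // step 2: runs second, then hands off to step 3
    ctx->complete(0);
  });
  ctx = new FunctionContext([ctx](int r) {
    // step 1: wrapped last during construction, executed first
    ctx->complete(0);
  });
  work_queue->queue(ctx, 0);  // assumes a ContextWQ *work_queue is available

Building the chain back-to-front keeps each step's completion callback trivially available at the moment the next wrapper is constructed.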
diff --git a/src/tools/rbd_mirror/ImageReplayer.h b/src/tools/rbd_mirror/ImageReplayer.h
new file mode 100644
index 00000000..9af3e961
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageReplayer.h
@@ -0,0 +1,438 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/Mutex.h"
+#include "common/WorkQueue.h"
+#include "include/rados/librados.hpp"
+#include "cls/journal/cls_journal_types.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "journal/JournalMetadataListener.h"
+#include "journal/ReplayEntry.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "ProgressContext.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_replayer/Types.h"
+
+#include <boost/noncopyable.hpp>
+#include <boost/optional.hpp>
+
+#include <set>
+#include <map>
+#include <atomic>
+#include <string>
+#include <vector>
+
+class AdminSocketHook;
+class PerfCounters;
+
+namespace journal {
+
+class Journaler;
+class ReplayHandler;
+
+}
+
+namespace librbd {
+
+class ImageCtx;
+namespace journal { template <typename> class Replay; }
+
+}
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct InstanceWatcher;
+template <typename> struct Threads;
+
+namespace image_replayer { template <typename> class BootstrapRequest; }
+namespace image_replayer { template <typename> class EventPreprocessor; }
+namespace image_replayer { template <typename> class ReplayStatusFormatter; }
+
+/**
+ * Replays changes from a remote cluster for a single image.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageReplayer {
+public:
+ static ImageReplayer *create(
+ Threads<ImageCtxT> *threads, InstanceWatcher<ImageCtxT> *instance_watcher,
+ RadosRef local, const std::string &local_mirror_uuid, int64_t local_pool_id,
+ const std::string &global_image_id) {
+ return new ImageReplayer(threads, instance_watcher, local,
+ local_mirror_uuid, local_pool_id, global_image_id);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ ImageReplayer(Threads<ImageCtxT> *threads,
+ InstanceWatcher<ImageCtxT> *instance_watcher,
+ RadosRef local, const std::string &local_mirror_uuid,
+ int64_t local_pool_id, const std::string &global_image_id);
+ virtual ~ImageReplayer();
+ ImageReplayer(const ImageReplayer&) = delete;
+ ImageReplayer& operator=(const ImageReplayer&) = delete;
+
+ bool is_stopped() { Mutex::Locker l(m_lock); return is_stopped_(); }
+ bool is_running() { Mutex::Locker l(m_lock); return is_running_(); }
+ bool is_replaying() { Mutex::Locker l(m_lock); return is_replaying_(); }
+
+ std::string get_name() { Mutex::Locker l(m_lock); return m_name; }
+ void set_state_description(int r, const std::string &desc);
+
+ // TODO temporary until policy handles release of image replayers
+ inline bool is_finished() const {
+ Mutex::Locker locker(m_lock);
+ return m_finished;
+ }
+ inline void set_finished(bool finished) {
+ Mutex::Locker locker(m_lock);
+ m_finished = finished;
+ }
+
+ inline bool is_blacklisted() const {
+ Mutex::Locker locker(m_lock);
+ return (m_last_r == -EBLACKLISTED);
+ }
+
+ image_replayer::HealthState get_health_state() const;
+
+ void add_peer(const std::string &peer_uuid, librados::IoCtx &remote_io_ctx);
+
+ inline int64_t get_local_pool_id() const {
+ return m_local_pool_id;
+ }
+ inline const std::string& get_global_image_id() const {
+ return m_global_image_id;
+ }
+
+ void start(Context *on_finish = nullptr, bool manual = false);
+ void stop(Context *on_finish = nullptr, bool manual = false,
+ int r = 0, const std::string& desc = "");
+ void restart(Context *on_finish = nullptr);
+ void flush();
+
+ void resync_image(Context *on_finish=nullptr);
+
+ void print_status(Formatter *f, stringstream *ss);
+
+ virtual void handle_replay_ready();
+ virtual void handle_replay_complete(int r, const std::string &error_desc);
+
+protected:
+ /**
+ * @verbatim
+ * (error)
+ * <uninitialized> <------------------------------------ FAIL
+ * | ^
+ * v *
+ * <starting> *
+ * | *
+ * v (error) *
+ * PREPARE_LOCAL_IMAGE * * * * * * * * * * * * * * * * * *
+ * | *
+ * v (error) *
+ * PREPARE_REMOTE_IMAGE * * * * * * * * * * * * * * * * * *
+ * | *
+ * v (error) *
+ * BOOTSTRAP_IMAGE * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * v (error) *
+ * INIT_REMOTE_JOURNALER * * * * * * * * * * * * * * * * *
+ * | *
+ * v (error) *
+ * START_REPLAY * * * * * * * * * * * * * * * * * * * * * *
+ * |
+ * | /--------------------------------------------\
+ * | | |
+ * v v (asok flush) |
+ * REPLAYING -------------> LOCAL_REPLAY_FLUSH |
+ * | \ | |
+ * | | v |
+ * | | FLUSH_COMMIT_POSITION |
+ * | | | |
+ * | | \--------------------/|
+ * | | |
+ * | | (entries available) |
+ * | \-----------> REPLAY_READY |
+ * | | |
+ * | | (skip if not |
+ * | v needed) (error)
+ * | REPLAY_FLUSH * * * * * * * * *
+ * | | | *
+ * | | (skip if not | *
+ * | v needed) (error) *
+ * | GET_REMOTE_TAG * * * * * * * *
+ * | | | *
+ * | | (skip if not | *
+ * | v needed) (error) *
+ * | ALLOCATE_LOCAL_TAG * * * * * *
+ * | | | *
+ * | v (error) *
+ * | PREPROCESS_ENTRY * * * * * * *
+ * | | | *
+ * | v (error) *
+ * | PROCESS_ENTRY * * * * * * * * *
+ * | | | *
+ * | \---------------------/ *
+ * v *
+ * REPLAY_COMPLETE < * * * * * * * * * * * * * * * * * * *
+ * |
+ * v
+ * JOURNAL_REPLAY_SHUT_DOWN
+ * |
+ * v
+ * LOCAL_IMAGE_CLOSE
+ * |
+ * v
+ * <stopped>
+ *
+ * @endverbatim
+ */
+
+ virtual void on_start_fail(int r, const std::string &desc);
+ virtual bool on_start_interrupted();
+ virtual bool on_start_interrupted(Mutex& lock);
+
+ virtual void on_stop_journal_replay(int r = 0, const std::string &desc = "");
+
+ bool on_replay_interrupted();
+
+private:
+ typedef typename librbd::journal::TypeTraits<ImageCtxT>::ReplayEntry ReplayEntry;
+
+ enum State {
+ STATE_UNKNOWN,
+ STATE_STARTING,
+ STATE_REPLAYING,
+ STATE_REPLAY_FLUSHING,
+ STATE_STOPPING,
+ STATE_STOPPED,
+ };
+
+ struct RemoteImage {
+ std::string mirror_uuid;
+ std::string image_id;
+ librados::IoCtx io_ctx;
+
+ RemoteImage() {
+ }
+ RemoteImage(const Peer& peer) : io_ctx(peer.io_ctx) {
+ }
+ };
+
+ typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler;
+ typedef boost::optional<State> OptionalState;
+ typedef boost::optional<cls::rbd::MirrorImageStatusState>
+ OptionalMirrorImageStatusState;
+
+ struct JournalListener : public librbd::journal::Listener {
+ ImageReplayer *img_replayer;
+
+ JournalListener(ImageReplayer *img_replayer)
+ : img_replayer(img_replayer) {
+ }
+
+ void handle_close() override {
+ img_replayer->on_stop_journal_replay();
+ }
+
+ void handle_promoted() override {
+ img_replayer->on_stop_journal_replay(0, "force promoted");
+ }
+
+ void handle_resync() override {
+ img_replayer->resync_image();
+ }
+ };
+
+ class BootstrapProgressContext : public ProgressContext {
+ public:
+ BootstrapProgressContext(ImageReplayer<ImageCtxT> *replayer) :
+ replayer(replayer) {
+ }
+
+ void update_progress(const std::string &description,
+ bool flush = true) override;
+ private:
+ ImageReplayer<ImageCtxT> *replayer;
+ };
+
+ Threads<ImageCtxT> *m_threads;
+ InstanceWatcher<ImageCtxT> *m_instance_watcher;
+
+ Peers m_peers;
+ RemoteImage m_remote_image;
+
+ RadosRef m_local;
+ std::string m_local_mirror_uuid;
+ int64_t m_local_pool_id;
+ std::string m_local_image_id;
+ std::string m_global_image_id;
+ std::string m_local_image_name;
+ std::string m_name;
+
+ mutable Mutex m_lock;
+ State m_state = STATE_STOPPED;
+ std::string m_state_desc;
+
+ OptionalMirrorImageStatusState m_mirror_image_status_state =
+ boost::make_optional(false, cls::rbd::MIRROR_IMAGE_STATUS_STATE_UNKNOWN);
+ int m_last_r = 0;
+
+ BootstrapProgressContext m_progress_cxt;
+
+ bool m_finished = false;
+ bool m_delete_requested = false;
+ bool m_resync_requested = false;
+
+ image_replayer::EventPreprocessor<ImageCtxT> *m_event_preprocessor = nullptr;
+ image_replayer::ReplayStatusFormatter<ImageCtxT> *m_replay_status_formatter =
+ nullptr;
+ IoCtxRef m_local_ioctx;
+ ImageCtxT *m_local_image_ctx = nullptr;
+ std::string m_local_image_tag_owner;
+
+ decltype(ImageCtxT::journal) m_local_journal = nullptr;
+ librbd::journal::Replay<ImageCtxT> *m_local_replay = nullptr;
+ Journaler* m_remote_journaler = nullptr;
+ ::journal::ReplayHandler *m_replay_handler = nullptr;
+ librbd::journal::Listener *m_journal_listener;
+
+ Context *m_on_start_finish = nullptr;
+ Context *m_on_stop_finish = nullptr;
+ Context *m_update_status_task = nullptr;
+ int m_update_status_interval = 0;
+ librados::AioCompletion *m_update_status_comp = nullptr;
+ bool m_stop_requested = false;
+ bool m_manual_stop = false;
+
+ AdminSocketHook *m_asok_hook = nullptr;
+ PerfCounters *m_perf_counters = nullptr;
+
+ image_replayer::BootstrapRequest<ImageCtxT> *m_bootstrap_request = nullptr;
+
+ uint32_t m_in_flight_status_updates = 0;
+ bool m_update_status_requested = false;
+ Context *m_on_update_status_finish = nullptr;
+
+ cls::journal::ClientState m_client_state =
+ cls::journal::CLIENT_STATE_DISCONNECTED;
+ librbd::journal::MirrorPeerClientMeta m_client_meta;
+
+ ReplayEntry m_replay_entry;
+ utime_t m_replay_start_time;
+ bool m_replay_tag_valid = false;
+ uint64_t m_replay_tag_tid = 0;
+ cls::journal::Tag m_replay_tag;
+ librbd::journal::TagData m_replay_tag_data;
+ librbd::journal::EventEntry m_event_entry;
+ AsyncOpTracker m_event_replay_tracker;
+ Context *m_delayed_preprocess_task = nullptr;
+
+ struct RemoteJournalerListener : public ::journal::JournalMetadataListener {
+ ImageReplayer *replayer;
+
+ RemoteJournalerListener(ImageReplayer *replayer) : replayer(replayer) { }
+
+ void handle_update(::journal::JournalMetadata *) override;
+ } m_remote_listener;
+
+ struct C_ReplayCommitted : public Context {
+ ImageReplayer *replayer;
+ ReplayEntry replay_entry;
+ utime_t replay_start_time;
+
+ C_ReplayCommitted(ImageReplayer *replayer,
+ ReplayEntry &&replay_entry,
+ const utime_t &replay_start_time)
+ : replayer(replayer), replay_entry(std::move(replay_entry)),
+ replay_start_time(replay_start_time) {
+ }
+ void finish(int r) override {
+ replayer->handle_process_entry_safe(replay_entry, replay_start_time, r);
+ }
+ };
+
+ static std::string to_string(const State state);
+
+ bool is_stopped_() const {
+ return m_state == STATE_STOPPED;
+ }
+ bool is_running_() const {
+ return !is_stopped_() && m_state != STATE_STOPPING && !m_stop_requested;
+ }
+ bool is_replaying_() const {
+ return (m_state == STATE_REPLAYING ||
+ m_state == STATE_REPLAY_FLUSHING);
+ }
+
+ void flush_local_replay(Context* on_flush);
+ void handle_flush_local_replay(Context* on_flush, int r);
+
+ void flush_commit_position(Context* on_flush);
+ void handle_flush_commit_position(Context* on_flush, int r);
+
+ bool update_mirror_image_status(bool force, const OptionalState &state);
+ bool start_mirror_image_status_update(bool force, bool restarting);
+ void finish_mirror_image_status_update();
+ void queue_mirror_image_status_update(const OptionalState &state);
+ void send_mirror_status_update(const OptionalState &state);
+ void handle_mirror_status_update(int r);
+ void reschedule_update_status_task(int new_interval);
+
+ void shut_down(int r);
+ void handle_shut_down(int r);
+ void handle_remote_journal_metadata_updated();
+
+ void prepare_local_image();
+ void handle_prepare_local_image(int r);
+
+ void prepare_remote_image();
+ void handle_prepare_remote_image(int r);
+
+ void bootstrap();
+ void handle_bootstrap(int r);
+
+ void init_remote_journaler();
+ void handle_init_remote_journaler(int r);
+
+ void start_replay();
+ void handle_start_replay(int r);
+
+ void replay_flush();
+ void handle_replay_flush(int r);
+
+ void get_remote_tag();
+ void handle_get_remote_tag(int r);
+
+ void allocate_local_tag();
+ void handle_allocate_local_tag(int r);
+
+ void preprocess_entry();
+ void handle_preprocess_entry_ready(int r);
+ void handle_preprocess_entry_safe(int r);
+
+ void process_entry();
+ void handle_process_entry_ready(int r);
+ void handle_process_entry_safe(const ReplayEntry& replay_entry,
+ const utime_t &m_replay_start_time, int r);
+
+ void register_admin_socket_hook();
+ void unregister_admin_socket_hook();
+ void reregister_admin_socket_hook();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageReplayer<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
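To make the declarations above concrete, here is a minimal sketch of how a caller drives an ImageReplayer. The threads, instance_watcher, local, peer_uuid and remote_io_ctx objects are assumed to already exist; the snippet is illustrative only (InstanceReplayer, later in this series, performs essentially these calls):

  auto *replayer = rbd::mirror::ImageReplayer<>::create(
      threads, instance_watcher, local, local_mirror_uuid, local_pool_id,
      global_image_id);
  replayer->add_peer(peer_uuid, remote_io_ctx);  // only a single peer is supported
  replayer->start(nullptr, false);               // begin bootstrap + journal replay
  // ... later, when the image is released or the daemon shuts down ...
  C_SaferCond stop_ctx;
  replayer->stop(&stop_ctx, false);              // graceful, non-manual stop
  stop_ctx.wait();
  replayer->destroy();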
diff --git a/src/tools/rbd_mirror/ImageSync.cc b/src/tools/rbd_mirror/ImageSync.cc
new file mode 100644
index 00000000..929d75c2
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageSync.cc
@@ -0,0 +1,481 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ImageSync.h"
+#include "InstanceWatcher.h"
+#include "ProgressContext.h"
+#include "common/debug.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/DeepCopyRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/internal.h"
+#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/image_sync/SyncPointCreateRequest.h"
+#include "tools/rbd_mirror/image_sync/SyncPointPruneRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageSync: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+
+using namespace image_sync;
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::unique_lock_name;
+
+template <typename I>
+class ImageSync<I>::ImageCopyProgressContext : public librbd::ProgressContext {
+public:
+ ImageCopyProgressContext(ImageSync *image_sync) : image_sync(image_sync) {
+ }
+
+ int update_progress(uint64_t object_no, uint64_t object_count) override {
+ image_sync->handle_copy_image_update_progress(object_no, object_count);
+ return 0;
+ }
+
+ ImageSync *image_sync;
+};
+
+template <typename I>
+ImageSync<I>::ImageSync(I *local_image_ctx, I *remote_image_ctx,
+ SafeTimer *timer, Mutex *timer_lock,
+ const std::string &mirror_uuid, Journaler *journaler,
+ MirrorPeerClientMeta *client_meta,
+ ContextWQ *work_queue,
+ InstanceWatcher<I> *instance_watcher,
+ Context *on_finish, ProgressContext *progress_ctx)
+ : BaseRequest("rbd::mirror::ImageSync", local_image_ctx->cct, on_finish),
+ m_local_image_ctx(local_image_ctx), m_remote_image_ctx(remote_image_ctx),
+ m_timer(timer), m_timer_lock(timer_lock), m_mirror_uuid(mirror_uuid),
+ m_journaler(journaler), m_client_meta(client_meta),
+ m_work_queue(work_queue), m_instance_watcher(instance_watcher),
+ m_progress_ctx(progress_ctx),
+ m_lock(unique_lock_name("ImageSync::m_lock", this)),
+ m_update_sync_point_interval(m_local_image_ctx->cct->_conf.template get_val<double>(
+ "rbd_mirror_sync_point_update_age")), m_client_meta_copy(*client_meta) {
+}
+
+template <typename I>
+ImageSync<I>::~ImageSync() {
+ ceph_assert(m_image_copy_request == nullptr);
+ ceph_assert(m_image_copy_prog_ctx == nullptr);
+ ceph_assert(m_update_sync_ctx == nullptr);
+}
+
+template <typename I>
+void ImageSync<I>::send() {
+ send_notify_sync_request();
+}
+
+template <typename I>
+void ImageSync<I>::cancel() {
+ Mutex::Locker locker(m_lock);
+
+ dout(10) << dendl;
+
+ m_canceled = true;
+
+ if (m_instance_watcher->cancel_sync_request(m_local_image_ctx->id)) {
+ return;
+ }
+
+ if (m_image_copy_request != nullptr) {
+ m_image_copy_request->cancel();
+ }
+}
+
+template <typename I>
+void ImageSync<I>::send_notify_sync_request() {
+ update_progress("NOTIFY_SYNC_REQUEST");
+
+ dout(10) << dendl;
+
+ m_lock.Lock();
+ if (m_canceled) {
+ m_lock.Unlock();
+ BaseRequest::finish(-ECANCELED);
+ return;
+ }
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_notify_sync_request>(this));
+ m_instance_watcher->notify_sync_request(m_local_image_ctx->id, ctx);
+ m_lock.Unlock();
+}
+
+template <typename I>
+void ImageSync<I>::handle_notify_sync_request(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ m_lock.Lock();
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+ m_lock.Unlock();
+
+ if (r < 0) {
+ BaseRequest::finish(r);
+ return;
+ }
+
+ send_prune_catch_up_sync_point();
+}
+
+template <typename I>
+void ImageSync<I>::send_prune_catch_up_sync_point() {
+ update_progress("PRUNE_CATCH_UP_SYNC_POINT");
+
+ if (m_client_meta->sync_points.empty()) {
+ send_create_sync_point();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ // prune will remove sync points with missing snapshots and
+ // ensure we have a maximum of one sync point (in case we
+ // restarted)
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_prune_catch_up_sync_point>(this);
+ SyncPointPruneRequest<I> *request = SyncPointPruneRequest<I>::create(
+ m_remote_image_ctx, false, m_journaler, m_client_meta, ctx);
+ request->send();
+}
+
+template <typename I>
+void ImageSync<I>::handle_prune_catch_up_sync_point(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to prune catch-up sync point: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_create_sync_point();
+}
+
+template <typename I>
+void ImageSync<I>::send_create_sync_point() {
+ update_progress("CREATE_SYNC_POINT");
+
+ // TODO: when support for disconnecting laggy clients is added,
+ // re-connect and create catch-up sync point
+ if (m_client_meta->sync_points.size() > 0) {
+ send_copy_image();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_create_sync_point>(this);
+ SyncPointCreateRequest<I> *request = SyncPointCreateRequest<I>::create(
+ m_remote_image_ctx, m_mirror_uuid, m_journaler, m_client_meta, ctx);
+ request->send();
+}
+
+template <typename I>
+void ImageSync<I>::handle_create_sync_point(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to create sync point: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_copy_image();
+}
+
+template <typename I>
+void ImageSync<I>::send_copy_image() {
+ librados::snap_t snap_id_start = 0;
+ librados::snap_t snap_id_end;
+ librbd::deep_copy::ObjectNumber object_number;
+ int r = 0;
+ {
+ RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
+ ceph_assert(!m_client_meta->sync_points.empty());
+ auto &sync_point = m_client_meta->sync_points.front();
+ snap_id_end = m_remote_image_ctx->get_snap_id(
+ cls::rbd::UserSnapshotNamespace(), sync_point.snap_name);
+ if (snap_id_end == CEPH_NOSNAP) {
+ derr << ": failed to locate snapshot: " << sync_point.snap_name << dendl;
+ r = -ENOENT;
+ } else if (!sync_point.from_snap_name.empty()) {
+ snap_id_start = m_remote_image_ctx->get_snap_id(
+ cls::rbd::UserSnapshotNamespace(), sync_point.from_snap_name);
+ if (snap_id_start == CEPH_NOSNAP) {
+ derr << ": failed to locate from snapshot: "
+ << sync_point.from_snap_name << dendl;
+ r = -ENOENT;
+ }
+ }
+ object_number = sync_point.object_number;
+ }
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ m_lock.Lock();
+ if (m_canceled) {
+ m_lock.Unlock();
+ finish(-ECANCELED);
+ return;
+ }
+
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_copy_image>(this);
+ m_image_copy_prog_ctx = new ImageCopyProgressContext(this);
+ m_image_copy_request = librbd::DeepCopyRequest<I>::create(
+ m_remote_image_ctx, m_local_image_ctx, snap_id_start, snap_id_end,
+ 0, false, object_number, m_work_queue, &m_client_meta->snap_seqs,
+ m_image_copy_prog_ctx, ctx);
+ m_image_copy_request->get();
+ m_lock.Unlock();
+
+ update_progress("COPY_IMAGE");
+
+ m_image_copy_request->send();
+}
+
+template <typename I>
+void ImageSync<I>::handle_copy_image(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ {
+ Mutex::Locker timer_locker(*m_timer_lock);
+ Mutex::Locker locker(m_lock);
+ m_image_copy_request->put();
+ m_image_copy_request = nullptr;
+ delete m_image_copy_prog_ctx;
+ m_image_copy_prog_ctx = nullptr;
+ if (r == 0 && m_canceled) {
+ r = -ECANCELED;
+ }
+
+ if (m_update_sync_ctx != nullptr) {
+ m_timer->cancel_event(m_update_sync_ctx);
+ m_update_sync_ctx = nullptr;
+ }
+
+ if (m_updating_sync_point) {
+ m_ret_val = r;
+ return;
+ }
+ }
+
+ if (r == -ECANCELED) {
+ dout(10) << ": image copy canceled" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << ": failed to copy image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_flush_sync_point();
+}
+
+template <typename I>
+void ImageSync<I>::handle_copy_image_update_progress(uint64_t object_no,
+ uint64_t object_count) {
+ int percent = 100 * object_no / object_count;
+ update_progress("COPY_IMAGE " + stringify(percent) + "%");
+
+ Mutex::Locker locker(m_lock);
+ m_image_copy_object_no = object_no;
+ m_image_copy_object_count = object_count;
+
+ if (m_update_sync_ctx == nullptr && !m_updating_sync_point) {
+ send_update_sync_point();
+ }
+}
+
+template <typename I>
+void ImageSync<I>::send_update_sync_point() {
+ ceph_assert(m_lock.is_locked());
+
+ m_update_sync_ctx = nullptr;
+
+ if (m_canceled) {
+ return;
+ }
+
+ auto sync_point = &m_client_meta->sync_points.front();
+
+ if (m_client_meta->sync_object_count == m_image_copy_object_count &&
+ sync_point->object_number &&
+ (m_image_copy_object_no - 1) == sync_point->object_number.get()) {
+ // the sync has not progressed since the last recorded sync point -- skip
+ return;
+ }
+
+ m_updating_sync_point = true;
+
+ m_client_meta_copy = *m_client_meta;
+ m_client_meta->sync_object_count = m_image_copy_object_count;
+ if (m_image_copy_object_no > 0) {
+ sync_point->object_number = m_image_copy_object_no - 1;
+ }
+
+ CephContext *cct = m_local_image_ctx->cct;
+ ldout(cct, 20) << ": sync_point=" << *sync_point << dendl;
+
+ bufferlist client_data_bl;
+ librbd::journal::ClientData client_data(*m_client_meta);
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_update_sync_point>(
+ this);
+ m_journaler->update_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void ImageSync<I>::handle_update_sync_point(int r) {
+ CephContext *cct = m_local_image_ctx->cct;
+ ldout(cct, 20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ *m_client_meta = m_client_meta_copy;
+ lderr(cct) << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ {
+ Mutex::Locker timer_locker(*m_timer_lock);
+ Mutex::Locker locker(m_lock);
+ m_updating_sync_point = false;
+
+ if (m_image_copy_request != nullptr) {
+ m_update_sync_ctx = new FunctionContext(
+ [this](int r) {
+ Mutex::Locker locker(m_lock);
+ this->send_update_sync_point();
+ });
+ m_timer->add_event_after(m_update_sync_point_interval,
+ m_update_sync_ctx);
+ return;
+ }
+ }
+
+ send_flush_sync_point();
+}
+
+template <typename I>
+void ImageSync<I>::send_flush_sync_point() {
+ if (m_ret_val < 0) {
+ finish(m_ret_val);
+ return;
+ }
+
+ update_progress("FLUSH_SYNC_POINT");
+
+ m_client_meta_copy = *m_client_meta;
+ m_client_meta->sync_object_count = m_image_copy_object_count;
+ auto sync_point = &m_client_meta->sync_points.front();
+ if (m_image_copy_object_no > 0) {
+ sync_point->object_number = m_image_copy_object_no - 1;
+ } else {
+ sync_point->object_number = boost::none;
+ }
+
+ dout(10) << ": sync_point=" << *sync_point << dendl;
+
+ bufferlist client_data_bl;
+ librbd::journal::ClientData client_data(*m_client_meta);
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_flush_sync_point>(
+ this);
+ m_journaler->update_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void ImageSync<I>::handle_flush_sync_point(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ *m_client_meta = m_client_meta_copy;
+
+ derr << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ send_prune_sync_points();
+}
+
+template <typename I>
+void ImageSync<I>::send_prune_sync_points() {
+ dout(10) << dendl;
+
+ update_progress("PRUNE_SYNC_POINTS");
+
+ Context *ctx = create_context_callback<
+ ImageSync<I>, &ImageSync<I>::handle_prune_sync_points>(this);
+ SyncPointPruneRequest<I> *request = SyncPointPruneRequest<I>::create(
+ m_remote_image_ctx, true, m_journaler, m_client_meta, ctx);
+ request->send();
+}
+
+template <typename I>
+void ImageSync<I>::handle_prune_sync_points(int r) {
+ dout(10) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to prune sync point: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (!m_client_meta->sync_points.empty()) {
+ send_copy_image();
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void ImageSync<I>::update_progress(const std::string &description) {
+ dout(20) << ": " << description << dendl;
+
+ if (m_progress_ctx) {
+ m_progress_ctx->update_progress("IMAGE_SYNC/" + description);
+ }
+}
+
+template <typename I>
+void ImageSync<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_instance_watcher->notify_sync_complete(m_local_image_ctx->id);
+ BaseRequest::finish(r);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageSync<librbd::ImageCtx>;
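A compact, illustrative sketch of how ImageSync is driven; every argument is an assumed, already-constructed object matching the create() signature declared in the header that follows, and the snippet is not part of the upstream file:

  C_SaferCond on_finish;
  auto *sync = rbd::mirror::ImageSync<>::create(
      local_image_ctx, remote_image_ctx, timer, timer_lock, mirror_uuid,
      journaler, &client_meta, work_queue, instance_watcher, &on_finish,
      &progress_ctx);
  sync->send();       // NOTIFY_SYNC_REQUEST -> ... -> PRUNE_SYNC_POINTS
  // sync->cancel();  // may be invoked from another context to abort the sync
  int r = on_finish.wait();

Note that cancel() only requests cancellation: the request still completes through finish(), which reports -ECANCELED and releases the sync slot via the instance watcher.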
diff --git a/src/tools/rbd_mirror/ImageSync.h b/src/tools/rbd_mirror/ImageSync.h
new file mode 100644
index 00000000..9e00c129
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageSync.h
@@ -0,0 +1,160 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_H
+#define RBD_MIRROR_IMAGE_SYNC_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/journal/TypeTraits.h"
+#include "librbd/journal/Types.h"
+#include "common/Mutex.h"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include <map>
+#include <vector>
+
+class Context;
+class ContextWQ;
+namespace journal { class Journaler; }
+namespace librbd { class ProgressContext; }
+namespace librbd { template <typename> class DeepCopyRequest; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext;
+
+template <typename> class InstanceWatcher;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageSync : public BaseRequest {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
+
+ static ImageSync* create(ImageCtxT *local_image_ctx,
+ ImageCtxT *remote_image_ctx,
+ SafeTimer *timer, Mutex *timer_lock,
+ const std::string &mirror_uuid,
+ Journaler *journaler,
+ MirrorPeerClientMeta *client_meta,
+ ContextWQ *work_queue,
+ InstanceWatcher<ImageCtxT> *instance_watcher,
+ Context *on_finish,
+ ProgressContext *progress_ctx = nullptr) {
+ return new ImageSync(local_image_ctx, remote_image_ctx, timer, timer_lock,
+ mirror_uuid, journaler, client_meta, work_queue,
+ instance_watcher, on_finish, progress_ctx);
+ }
+
+ ImageSync(ImageCtxT *local_image_ctx, ImageCtxT *remote_image_ctx,
+ SafeTimer *timer, Mutex *timer_lock, const std::string &mirror_uuid,
+ Journaler *journaler, MirrorPeerClientMeta *client_meta,
+ ContextWQ *work_queue, InstanceWatcher<ImageCtxT> *instance_watcher,
+ Context *on_finish, ProgressContext *progress_ctx = nullptr);
+ ~ImageSync() override;
+
+ void send() override;
+ void cancel() override;
+
+protected:
+ void finish(int r) override;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * NOTIFY_SYNC_REQUEST
+ * |
+ * v
+ * PRUNE_CATCH_UP_SYNC_POINT
+ * |
+ * v
+ * CREATE_SYNC_POINT (skip if already exists and
+ * | not disconnected)
+ * v
+ * COPY_IMAGE . . . . . . . . . . . . . .
+ * | .
+ * v .
+ * FLUSH_SYNC_POINT .
+ * | . (image sync canceled)
+ * v .
+ * PRUNE_SYNC_POINTS .
+ * | .
+ * v .
+ * <finish> < . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ typedef std::vector<librados::snap_t> SnapIds;
+ typedef std::map<librados::snap_t, SnapIds> SnapMap;
+ class ImageCopyProgressContext;
+
+ ImageCtxT *m_local_image_ctx;
+ ImageCtxT *m_remote_image_ctx;
+ SafeTimer *m_timer;
+ Mutex *m_timer_lock;
+ std::string m_mirror_uuid;
+ Journaler *m_journaler;
+ MirrorPeerClientMeta *m_client_meta;
+ ContextWQ *m_work_queue;
+ InstanceWatcher<ImageCtxT> *m_instance_watcher;
+ ProgressContext *m_progress_ctx;
+
+ SnapMap m_snap_map;
+
+ Mutex m_lock;
+ bool m_canceled = false;
+
+ librbd::DeepCopyRequest<ImageCtxT> *m_image_copy_request = nullptr;
+ librbd::ProgressContext *m_image_copy_prog_ctx = nullptr;
+
+ bool m_updating_sync_point = false;
+ Context *m_update_sync_ctx = nullptr;
+ double m_update_sync_point_interval;
+ uint64_t m_image_copy_object_no = 0;
+ uint64_t m_image_copy_object_count = 0;
+ MirrorPeerClientMeta m_client_meta_copy;
+
+ int m_ret_val = 0;
+
+ void send_notify_sync_request();
+ void handle_notify_sync_request(int r);
+
+ void send_prune_catch_up_sync_point();
+ void handle_prune_catch_up_sync_point(int r);
+
+ void send_create_sync_point();
+ void handle_create_sync_point(int r);
+
+ void send_update_max_object_count();
+ void handle_update_max_object_count(int r);
+
+ void send_copy_image();
+ void handle_copy_image(int r);
+ void handle_copy_image_update_progress(uint64_t object_no,
+ uint64_t object_count);
+ void send_update_sync_point();
+ void handle_update_sync_point(int r);
+
+ void send_flush_sync_point();
+ void handle_flush_sync_point(int r);
+
+ void send_prune_sync_points();
+ void handle_prune_sync_points(int r);
+
+ void update_progress(const std::string &description);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageSync<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_H
diff --git a/src/tools/rbd_mirror/ImageSyncThrottler.cc b/src/tools/rbd_mirror/ImageSyncThrottler.cc
new file mode 100644
index 00000000..b395a012
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageSyncThrottler.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ImageSyncThrottler.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ImageSyncThrottler:: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+template <typename I>
+ImageSyncThrottler<I>::ImageSyncThrottler(CephContext *cct)
+ : m_cct(cct),
+ m_lock(librbd::util::unique_lock_name("rbd::mirror::ImageSyncThrottler",
+ this)),
+ m_max_concurrent_syncs(cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_concurrent_image_syncs")) {
+ dout(20) << "max_concurrent_syncs=" << m_max_concurrent_syncs << dendl;
+ m_cct->_conf.add_observer(this);
+}
+
+template <typename I>
+ImageSyncThrottler<I>::~ImageSyncThrottler() {
+ m_cct->_conf.remove_observer(this);
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_inflight_ops.empty());
+ ceph_assert(m_queue.empty());
+}
+
+template <typename I>
+void ImageSyncThrottler<I>::start_op(const std::string &id, Context *on_start) {
+ dout(20) << "id=" << id << dendl;
+
+ int r = 0;
+ {
+ Mutex::Locker locker(m_lock);
+
+ if (m_inflight_ops.count(id) > 0) {
+ dout(20) << "duplicate for already started op " << id << dendl;
+ } else if (m_queued_ops.count(id) > 0) {
+ dout(20) << "duplicate for already queued op " << id << dendl;
+ std::swap(m_queued_ops[id], on_start);
+ r = -ENOENT;
+ } else if (m_max_concurrent_syncs == 0 ||
+ m_inflight_ops.size() < m_max_concurrent_syncs) {
+ ceph_assert(m_queue.empty());
+ m_inflight_ops.insert(id);
+ dout(20) << "ready to start sync for " << id << " ["
+ << m_inflight_ops.size() << "/" << m_max_concurrent_syncs << "]"
+ << dendl;
+ } else {
+ m_queue.push_back(id);
+ std::swap(m_queued_ops[id], on_start);
+ dout(20) << "image sync for " << id << " has been queued" << dendl;
+ }
+ }
+
+ if (on_start != nullptr) {
+ on_start->complete(r);
+ }
+}
+
+template <typename I>
+bool ImageSyncThrottler<I>::cancel_op(const std::string &id) {
+ dout(20) << "id=" << id << dendl;
+
+ Context *on_start = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ auto it = m_queued_ops.find(id);
+ if (it != m_queued_ops.end()) {
+ dout(20) << "canceled queued sync for " << id << dendl;
+ m_queue.remove(id);
+ on_start = it->second;
+ m_queued_ops.erase(it);
+ }
+ }
+
+ if (on_start == nullptr) {
+ return false;
+ }
+
+ on_start->complete(-ECANCELED);
+ return true;
+}
+
+template <typename I>
+void ImageSyncThrottler<I>::finish_op(const std::string &id) {
+ dout(20) << "id=" << id << dendl;
+
+ if (cancel_op(id)) {
+ return;
+ }
+
+ Context *on_start = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ m_inflight_ops.erase(id);
+
+ if (m_inflight_ops.size() < m_max_concurrent_syncs && !m_queue.empty()) {
+ auto id = m_queue.front();
+ auto it = m_queued_ops.find(id);
+ ceph_assert(it != m_queued_ops.end());
+ m_inflight_ops.insert(id);
+ dout(20) << "ready to start sync for " << id << " ["
+ << m_inflight_ops.size() << "/" << m_max_concurrent_syncs << "]"
+ << dendl;
+ on_start = it->second;
+ m_queued_ops.erase(it);
+ m_queue.pop_front();
+ }
+ }
+
+ if (on_start != nullptr) {
+ on_start->complete(0);
+ }
+}
+
+template <typename I>
+void ImageSyncThrottler<I>::drain(int r) {
+ dout(20) << dendl;
+
+ std::map<std::string, Context *> queued_ops;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(m_queued_ops, queued_ops);
+ m_queue.clear();
+ m_inflight_ops.clear();
+ }
+
+ for (auto &it : queued_ops) {
+ it.second->complete(r);
+ }
+}
+
+template <typename I>
+void ImageSyncThrottler<I>::set_max_concurrent_syncs(uint32_t max) {
+ dout(20) << "max=" << max << dendl;
+
+ std::list<Context *> ops;
+ {
+ Mutex::Locker locker(m_lock);
+ m_max_concurrent_syncs = max;
+
+ // start queued ops while free slots are available
+ while ((m_max_concurrent_syncs == 0 ||
+ m_inflight_ops.size() < m_max_concurrent_syncs) &&
+ !m_queue.empty()) {
+ auto id = m_queue.front();
+ m_inflight_ops.insert(id);
+ dout(20) << "ready to start sync for " << id << " ["
+ << m_inflight_ops.size() << "/" << m_max_concurrent_syncs << "]"
+ << dendl;
+ auto it = m_queued_ops.find(id);
+ ceph_assert(it != m_queued_ops.end());
+ ops.push_back(it->second);
+ m_queued_ops.erase(it);
+ m_queue.pop_front();
+ }
+ }
+
+ for (const auto& ctx : ops) {
+ ctx->complete(0);
+ }
+}
+
+template <typename I>
+void ImageSyncThrottler<I>::print_status(Formatter *f, std::stringstream *ss) {
+ dout(20) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (f) {
+ f->dump_int("max_parallel_syncs", m_max_concurrent_syncs);
+ f->dump_int("running_syncs", m_inflight_ops.size());
+ f->dump_int("waiting_syncs", m_queue.size());
+ f->flush(*ss);
+ } else {
+ *ss << "[ ";
+ *ss << "max_parallel_syncs=" << m_max_concurrent_syncs << ", ";
+ *ss << "running_syncs=" << m_inflight_ops.size() << ", ";
+ *ss << "waiting_syncs=" << m_queue.size() << " ]";
+ }
+}
+
+template <typename I>
+const char** ImageSyncThrottler<I>::get_tracked_conf_keys() const {
+ static const char* KEYS[] = {
+ "rbd_mirror_concurrent_image_syncs",
+ NULL
+ };
+ return KEYS;
+}
+
+template <typename I>
+void ImageSyncThrottler<I>::handle_conf_change(const ConfigProxy& conf,
+ const set<string> &changed) {
+ if (changed.count("rbd_mirror_concurrent_image_syncs")) {
+ set_max_concurrent_syncs(conf.get_val<uint64_t>("rbd_mirror_concurrent_image_syncs"));
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ImageSyncThrottler<librbd::ImageCtx>;
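A short usage sketch for the throttler above; "image-id" stands in for a real local image id and the callbacks are illustrative, not part of the upstream file:

  auto *throttler = rbd::mirror::ImageSyncThrottler<>::create(g_ceph_context);
  throttler->start_op("image-id", new FunctionContext([](int r) {
    // r == 0: a slot was free (or became free), the image sync may start now
    // r == -ECANCELED: the queued request was canceled before it ran
  }));
  // ... once the sync completes or is abandoned ...
  throttler->finish_op("image-id");   // frees the slot and dequeues the next op
  throttler->drain(-ECANCELED);       // fail anything still queued (caller picks the error)
  throttler->destroy();

Setting rbd_mirror_concurrent_image_syncs to 0 disables the limit entirely, which is why start_op() treats m_max_concurrent_syncs == 0 as "always admit".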
diff --git a/src/tools/rbd_mirror/ImageSyncThrottler.h b/src/tools/rbd_mirror/ImageSyncThrottler.h
new file mode 100644
index 00000000..c0cda61e
--- /dev/null
+++ b/src/tools/rbd_mirror/ImageSyncThrottler.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_THROTTLER_H
+#define RBD_MIRROR_IMAGE_SYNC_THROTTLER_H
+
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "common/Mutex.h"
+#include "common/config_obs.h"
+
+class CephContext;
+class Context;
+
+namespace ceph { class Formatter; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ImageSyncThrottler : public md_config_obs_t {
+public:
+ static ImageSyncThrottler *create(CephContext *cct) {
+ return new ImageSyncThrottler(cct);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ ImageSyncThrottler(CephContext *cct);
+ ~ImageSyncThrottler() override;
+
+ void set_max_concurrent_syncs(uint32_t max);
+ void start_op(const std::string &id, Context *on_start);
+ bool cancel_op(const std::string &id);
+ void finish_op(const std::string &id);
+ void drain(int r);
+
+ void print_status(Formatter *f, std::stringstream *ss);
+
+private:
+ CephContext *m_cct;
+ Mutex m_lock;
+ uint32_t m_max_concurrent_syncs;
+ std::list<std::string> m_queue;
+ std::map<std::string, Context *> m_queued_ops;
+ std::set<std::string> m_inflight_ops;
+
+ const char **get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ImageSyncThrottler<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_THROTTLER_H
diff --git a/src/tools/rbd_mirror/InstanceReplayer.cc b/src/tools/rbd_mirror/InstanceReplayer.cc
new file mode 100644
index 00000000..c0086a48
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceReplayer.cc
@@ -0,0 +1,510 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/stringify.h"
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "ImageReplayer.h"
+#include "InstanceReplayer.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::InstanceReplayer: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+const std::string SERVICE_DAEMON_ASSIGNED_COUNT_KEY("image_assigned_count");
+const std::string SERVICE_DAEMON_WARNING_COUNT_KEY("image_warning_count");
+const std::string SERVICE_DAEMON_ERROR_COUNT_KEY("image_error_count");
+
+} // anonymous namespace
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+
+template <typename I>
+InstanceReplayer<I>::InstanceReplayer(
+ Threads<I> *threads, ServiceDaemon<I>* service_daemon,
+ RadosRef local_rados, const std::string &local_mirror_uuid,
+ int64_t local_pool_id)
+ : m_threads(threads), m_service_daemon(service_daemon),
+ m_local_rados(local_rados), m_local_mirror_uuid(local_mirror_uuid),
+ m_local_pool_id(local_pool_id),
+ m_lock("rbd::mirror::InstanceReplayer " + stringify(local_pool_id)) {
+}
+
+template <typename I>
+InstanceReplayer<I>::~InstanceReplayer() {
+ ceph_assert(m_image_state_check_task == nullptr);
+ ceph_assert(m_async_op_tracker.empty());
+ ceph_assert(m_image_replayers.empty());
+}
+
+template <typename I>
+bool InstanceReplayer<I>::is_blacklisted() const {
+ std::lock_guard locker{m_lock};
+ return m_blacklisted;
+}
+
+template <typename I>
+int InstanceReplayer<I>::init() {
+ C_SaferCond init_ctx;
+ init(&init_ctx);
+ return init_ctx.wait();
+}
+
+template <typename I>
+void InstanceReplayer<I>::init(Context *on_finish) {
+ dout(10) << dendl;
+
+ Context *ctx = new FunctionContext(
+ [this, on_finish] (int r) {
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ schedule_image_state_check_task();
+ }
+ on_finish->complete(0);
+ });
+
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void InstanceReplayer<I>::shut_down() {
+ C_SaferCond shut_down_ctx;
+ shut_down(&shut_down_ctx);
+ int r = shut_down_ctx.wait();
+ ceph_assert(r == 0);
+}
+
+template <typename I>
+void InstanceReplayer<I>::shut_down(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_shut_down == nullptr);
+ m_on_shut_down = on_finish;
+
+ Context *ctx = new FunctionContext(
+ [this] (int r) {
+ cancel_image_state_check_task();
+ wait_for_ops();
+ });
+
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void InstanceReplayer<I>::add_peer(std::string peer_uuid,
+ librados::IoCtx io_ctx) {
+ dout(10) << peer_uuid << dendl;
+
+ Mutex::Locker locker(m_lock);
+ auto result = m_peers.insert(Peer(peer_uuid, io_ctx)).second;
+ ceph_assert(result);
+}
+
+template <typename I>
+void InstanceReplayer<I>::release_all(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ C_Gather *gather_ctx = new C_Gather(g_ceph_context, on_finish);
+ for (auto it = m_image_replayers.begin(); it != m_image_replayers.end();
+ it = m_image_replayers.erase(it)) {
+ auto image_replayer = it->second;
+ auto ctx = gather_ctx->new_sub();
+ ctx = new FunctionContext(
+ [image_replayer, ctx] (int r) {
+ image_replayer->destroy();
+ ctx->complete(0);
+ });
+ stop_image_replayer(image_replayer, ctx);
+ }
+ gather_ctx->activate();
+}
+
+template <typename I>
+void InstanceReplayer<I>::acquire_image(InstanceWatcher<I> *instance_watcher,
+ const std::string &global_image_id,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_shut_down == nullptr);
+
+ auto it = m_image_replayers.find(global_image_id);
+ if (it == m_image_replayers.end()) {
+ auto image_replayer = ImageReplayer<I>::create(
+ m_threads, instance_watcher, m_local_rados,
+ m_local_mirror_uuid, m_local_pool_id, global_image_id);
+
+ dout(10) << global_image_id << ": creating replayer " << image_replayer
+ << dendl;
+
+ it = m_image_replayers.insert(std::make_pair(global_image_id,
+ image_replayer)).first;
+
+ // TODO only a single peer is currently supported
+ ceph_assert(m_peers.size() == 1);
+ auto peer = *m_peers.begin();
+ image_replayer->add_peer(peer.peer_uuid, peer.io_ctx);
+ start_image_replayer(image_replayer);
+ } else {
+ // A duplicate acquire notification implies (1) connection hiccup or
+ // (2) new leader election. For the second case, restart the replayer to
+ // detect if the image has been deleted while the leader was offline
+ auto& image_replayer = it->second;
+ image_replayer->set_finished(false);
+ image_replayer->restart();
+ }
+
+ m_threads->work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+void InstanceReplayer<I>::release_image(const std::string &global_image_id,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_shut_down == nullptr);
+
+ auto it = m_image_replayers.find(global_image_id);
+ if (it == m_image_replayers.end()) {
+ dout(5) << global_image_id << ": not found" << dendl;
+ m_threads->work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ auto image_replayer = it->second;
+ m_image_replayers.erase(it);
+
+ on_finish = new FunctionContext(
+ [image_replayer, on_finish] (int r) {
+ image_replayer->destroy();
+ on_finish->complete(0);
+ });
+ stop_image_replayer(image_replayer, on_finish);
+}
+
+template <typename I>
+void InstanceReplayer<I>::remove_peer_image(const std::string &global_image_id,
+ const std::string &peer_mirror_uuid,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "peer_mirror_uuid=" << peer_mirror_uuid << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_shut_down == nullptr);
+
+ auto it = m_image_replayers.find(global_image_id);
+ if (it != m_image_replayers.end()) {
+ // TODO only a single peer is currently supported, so we can simply
+ // restart the current image replayer; it will eventually detect that
+ // the peer image is missing and determine whether delete propagation
+ // is required.
+ auto image_replayer = it->second;
+ image_replayer->restart();
+ }
+ m_threads->work_queue->queue(on_finish, 0);
+}
+
+template <typename I>
+void InstanceReplayer<I>::print_status(Formatter *f, stringstream *ss) {
+ dout(10) << dendl;
+
+ if (!f) {
+ return;
+ }
+
+ Mutex::Locker locker(m_lock);
+
+ f->open_array_section("image_replayers");
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->print_status(f, ss);
+ }
+ f->close_section();
+}
+
+template <typename I>
+void InstanceReplayer<I>::start()
+{
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ m_manual_stop = false;
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->start(nullptr, true);
+ }
+}
+
+template <typename I>
+void InstanceReplayer<I>::stop()
+{
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ m_manual_stop = true;
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->stop(nullptr, true);
+ }
+}
+
+template <typename I>
+void InstanceReplayer<I>::restart()
+{
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ m_manual_stop = false;
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->restart();
+ }
+}
+
+template <typename I>
+void InstanceReplayer<I>::flush()
+{
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ for (auto &kv : m_image_replayers) {
+ auto &image_replayer = kv.second;
+ image_replayer->flush();
+ }
+}
+
+template <typename I>
+void InstanceReplayer<I>::start_image_replayer(
+ ImageReplayer<I> *image_replayer) {
+ ceph_assert(m_lock.is_locked());
+
+ std::string global_image_id = image_replayer->get_global_image_id();
+ if (!image_replayer->is_stopped()) {
+ return;
+ } else if (image_replayer->is_blacklisted()) {
+ derr << "global_image_id=" << global_image_id << ": blacklisted detected "
+ << "during image replay" << dendl;
+ m_blacklisted = true;
+ return;
+ } else if (image_replayer->is_finished()) {
+ // TODO temporary until policy integrated
+ dout(5) << "removing image replayer for global_image_id="
+ << global_image_id << dendl;
+ m_image_replayers.erase(image_replayer->get_global_image_id());
+ image_replayer->destroy();
+ return;
+ } else if (m_manual_stop) {
+ return;
+ }
+
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+ image_replayer->start(nullptr, false);
+}
+
+template <typename I>
+void InstanceReplayer<I>::queue_start_image_replayers() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ InstanceReplayer, &InstanceReplayer<I>::start_image_replayers>(this);
+ m_async_op_tracker.start_op();
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void InstanceReplayer<I>::start_image_replayers(int r) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ if (m_on_shut_down != nullptr) {
+ return;
+ }
+
+ uint64_t image_count = 0;
+ uint64_t warning_count = 0;
+ uint64_t error_count = 0;
+ for (auto it = m_image_replayers.begin();
+ it != m_image_replayers.end();) {
+ auto current_it(it);
+ ++it;
+
+ ++image_count;
+ auto health_state = current_it->second->get_health_state();
+ if (health_state == image_replayer::HEALTH_STATE_WARNING) {
+ ++warning_count;
+ } else if (health_state == image_replayer::HEALTH_STATE_ERROR) {
+ ++error_count;
+ }
+
+ start_image_replayer(current_it->second);
+ }
+
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_ASSIGNED_COUNT_KEY, image_count);
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_WARNING_COUNT_KEY, warning_count);
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_ERROR_COUNT_KEY, error_count);
+
+ m_async_op_tracker.finish_op();
+}
+
+template <typename I>
+void InstanceReplayer<I>::stop_image_replayer(ImageReplayer<I> *image_replayer,
+ Context *on_finish) {
+ dout(10) << image_replayer << " global_image_id="
+ << image_replayer->get_global_image_id() << ", on_finish="
+ << on_finish << dendl;
+
+ if (image_replayer->is_stopped()) {
+ m_threads->work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ m_async_op_tracker.start_op();
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, new FunctionContext(
+ [this, image_replayer, on_finish] (int r) {
+ stop_image_replayer(image_replayer, on_finish);
+ m_async_op_tracker.finish_op();
+ }));
+
+ if (image_replayer->is_running()) {
+ image_replayer->stop(ctx, false);
+ } else {
+ int after = 1;
+ dout(10) << "scheduling image replayer " << image_replayer << " stop after "
+ << after << " sec (task " << ctx << ")" << dendl;
+ ctx = new FunctionContext(
+ [this, after, ctx] (int r) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ m_threads->timer->add_event_after(after, ctx);
+ });
+ m_threads->work_queue->queue(ctx, 0);
+ }
+}
+
+template <typename I>
+void InstanceReplayer<I>::wait_for_ops() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ InstanceReplayer, &InstanceReplayer<I>::handle_wait_for_ops>(this);
+
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void InstanceReplayer<I>::handle_wait_for_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ Mutex::Locker locker(m_lock);
+ stop_image_replayers();
+}
+
+template <typename I>
+void InstanceReplayer<I>::stop_image_replayers() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<InstanceReplayer<I>,
+ &InstanceReplayer<I>::handle_stop_image_replayers>(this));
+
+ C_Gather *gather_ctx = new C_Gather(g_ceph_context, ctx);
+ for (auto &it : m_image_replayers) {
+ stop_image_replayer(it.second, gather_ctx->new_sub());
+ }
+ gather_ctx->activate();
+}
+
+template <typename I>
+void InstanceReplayer<I>::handle_stop_image_replayers(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ for (auto &it : m_image_replayers) {
+ ceph_assert(it.second->is_stopped());
+ it.second->destroy();
+ }
+ m_image_replayers.clear();
+
+ ceph_assert(m_on_shut_down != nullptr);
+ std::swap(on_finish, m_on_shut_down);
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void InstanceReplayer<I>::cancel_image_state_check_task() {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+
+ if (m_image_state_check_task == nullptr) {
+ return;
+ }
+
+ dout(10) << m_image_state_check_task << dendl;
+ bool canceled = m_threads->timer->cancel_event(m_image_state_check_task);
+ ceph_assert(canceled);
+ m_image_state_check_task = nullptr;
+}
+
+template <typename I>
+void InstanceReplayer<I>::schedule_image_state_check_task() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_image_state_check_task == nullptr);
+
+ m_image_state_check_task = new FunctionContext(
+ [this](int r) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ m_image_state_check_task = nullptr;
+ schedule_image_state_check_task();
+ queue_start_image_replayers();
+ });
+
+ auto cct = static_cast<CephContext *>(m_local_rados->cct());
+ int after = cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_image_state_check_interval");
+
+ dout(10) << "scheduling image state check after " << after << " sec (task "
+ << m_image_state_check_task << ")" << dendl;
+ m_threads->timer->add_event_after(after, m_image_state_check_task);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::InstanceReplayer<librbd::ImageCtx>;
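
The shutdown path above (wait_for_ops, stop_image_replayers, handle_stop_image_replayers) relies on a recurring rbd-mirror pattern: fan the per-image stop requests out through a C_Gather and run a single completion callback once every sub-context has fired, while any replayer that is neither running nor stopped is retried on a one-second timer. The fragment below is a minimal sketch of that gather pattern in isolation; Worker, stop_all and the callback wiring are illustrative assumptions, not code from this commit.

    // Sketch: fan out N asynchronous stops, complete one callback when all finish.
    #include <vector>
    #include "include/Context.h"        // Context, C_Gather
    #include "common/ceph_context.h"    // CephContext

    struct Worker {
      void stop(Context *on_finish);    // assumed async stop; completes with 0
    };

    void stop_all(CephContext *cct, std::vector<Worker*>& workers,
                  Context *on_all_stopped) {
      // C_Gather completes on_all_stopped only after every new_sub() completes.
      C_Gather *gather = new C_Gather(cct, on_all_stopped);
      for (auto w : workers) {
        w->stop(gather->new_sub());
      }
      gather->activate();
    }
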
diff --git a/src/tools/rbd_mirror/InstanceReplayer.h b/src/tools/rbd_mirror/InstanceReplayer.h
new file mode 100644
index 00000000..efbdde02
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceReplayer.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_INSTANCE_REPLAYER_H
+#define RBD_MIRROR_INSTANCE_REPLAYER_H
+
+#include <map>
+#include <sstream>
+
+#include "common/AsyncOpTracker.h"
+#include "common/Formatter.h"
+#include "common/Mutex.h"
+#include "tools/rbd_mirror/Types.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ImageReplayer;
+template <typename> class InstanceWatcher;
+template <typename> class ServiceDaemon;
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class InstanceReplayer {
+public:
+ static InstanceReplayer* create(
+ Threads<ImageCtxT> *threads,
+ ServiceDaemon<ImageCtxT>* service_daemon,
+ RadosRef local_rados, const std::string &local_mirror_uuid,
+ int64_t local_pool_id) {
+ return new InstanceReplayer(threads, service_daemon, local_rados,
+ local_mirror_uuid, local_pool_id);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ InstanceReplayer(Threads<ImageCtxT> *threads,
+ ServiceDaemon<ImageCtxT>* service_daemon,
+ RadosRef local_rados, const std::string &local_mirror_uuid,
+ int64_t local_pool_id);
+ ~InstanceReplayer();
+
+ bool is_blacklisted() const;
+
+ int init();
+ void shut_down();
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+ void add_peer(std::string peer_uuid, librados::IoCtx io_ctx);
+
+ void acquire_image(InstanceWatcher<ImageCtxT> *instance_watcher,
+ const std::string &global_image_id, Context *on_finish);
+ void release_image(const std::string &global_image_id, Context *on_finish);
+ void remove_peer_image(const std::string &global_image_id,
+ const std::string &peer_mirror_uuid,
+ Context *on_finish);
+
+ void release_all(Context *on_finish);
+
+ void print_status(Formatter *f, stringstream *ss);
+ void start();
+ void stop();
+ void restart();
+ void flush();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <uninitialized> <-------------------\
+ * | (init) | (repeat for each
+ * v STOP_IMAGE_REPLAYER ---\ image replayer)
+ * SCHEDULE_IMAGE_STATE_CHECK_TASK ^ ^ |
+ * | | | |
+ * v (shut_down) | \---------/
+ * <initialized> -----------------> WAIT_FOR_OPS
+ *
+ * @endverbatim
+ */
+
+ Threads<ImageCtxT> *m_threads;
+ ServiceDaemon<ImageCtxT>* m_service_daemon;
+ RadosRef m_local_rados;
+ std::string m_local_mirror_uuid;
+ int64_t m_local_pool_id;
+
+ mutable Mutex m_lock;
+ AsyncOpTracker m_async_op_tracker;
+ std::map<std::string, ImageReplayer<ImageCtxT> *> m_image_replayers;
+ Peers m_peers;
+ Context *m_image_state_check_task = nullptr;
+ Context *m_on_shut_down = nullptr;
+ bool m_manual_stop = false;
+ bool m_blacklisted = false;
+
+ void wait_for_ops();
+ void handle_wait_for_ops(int r);
+
+ void start_image_replayer(ImageReplayer<ImageCtxT> *image_replayer);
+ void queue_start_image_replayers();
+ void start_image_replayers(int r);
+
+ void stop_image_replayer(ImageReplayer<ImageCtxT> *image_replayer,
+ Context *on_finish);
+
+ void stop_image_replayers();
+ void handle_stop_image_replayers(int r);
+
+ void schedule_image_state_check_task();
+ void cancel_image_state_check_task();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::InstanceReplayer<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_INSTANCE_REPLAYER_H
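
The public interface above is what rbd-mirror's pool-level machinery drives for each local pool. A minimal sketch of the lifecycle, assuming threads, service_daemon, local_rados, instance_watcher, the mirror uuid/pool id and an io_ctx for the peer already exist (none of them are constructed here), might look like this:

    // Sketch only; identifiers other than the InstanceReplayer API are assumptions.
    auto *replayer = rbd::mirror::InstanceReplayer<librbd::ImageCtx>::create(
        threads, service_daemon, local_rados, local_mirror_uuid, local_pool_id);

    int r = replayer->init();              // synchronous wrapper over init(Context*)
    if (r == 0) {
      replayer->add_peer(peer_uuid, remote_io_ctx);

      C_SaferCond acquire_ctx;
      replayer->acquire_image(instance_watcher, global_image_id, &acquire_ctx);
      acquire_ctx.wait();

      replayer->shut_down();               // synchronous wrapper over shut_down(Context*)
    }
    replayer->destroy();
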
diff --git a/src/tools/rbd_mirror/InstanceWatcher.cc b/src/tools/rbd_mirror/InstanceWatcher.cc
new file mode 100644
index 00000000..d9e1ba23
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceWatcher.cc
@@ -0,0 +1,1299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "InstanceWatcher.h"
+#include "include/stringify.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ManagedLock.h"
+#include "librbd/Utils.h"
+#include "InstanceReplayer.h"
+#include "ImageSyncThrottler.h"
+#include "common/Cond.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::InstanceWatcher: "
+
+namespace rbd {
+namespace mirror {
+
+using namespace instance_watcher;
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::unique_lock_name;
+
+namespace {
+
+struct C_GetInstances : public Context {
+ std::vector<std::string> *instance_ids;
+ Context *on_finish;
+ bufferlist out_bl;
+
+ C_GetInstances(std::vector<std::string> *instance_ids, Context *on_finish)
+ : instance_ids(instance_ids), on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_GetInstances: " << this << " " << __func__ << ": r=" << r
+ << dendl;
+
+ if (r == 0) {
+ auto it = out_bl.cbegin();
+ r = librbd::cls_client::mirror_instances_list_finish(&it, instance_ids);
+ } else if (r == -ENOENT) {
+ r = 0;
+ }
+ on_finish->complete(r);
+ }
+};
+
+template <typename I>
+struct C_RemoveInstanceRequest : public Context {
+ InstanceWatcher<I> instance_watcher;
+ Context *on_finish;
+
+ C_RemoveInstanceRequest(librados::IoCtx &io_ctx, ContextWQ *work_queue,
+ const std::string &instance_id, Context *on_finish)
+ : instance_watcher(io_ctx, work_queue, nullptr, instance_id),
+ on_finish(on_finish) {
+ }
+
+ void send() {
+ dout(10) << "C_RemoveInstanceRequest: " << this << " " << __func__ << dendl;
+
+ instance_watcher.remove(this);
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_RemoveInstanceRequest: " << this << " " << __func__ << ": r="
+ << r << dendl;
+ ceph_assert(r == 0);
+
+ on_finish->complete(r);
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+struct InstanceWatcher<I>::C_NotifyInstanceRequest : public Context {
+ InstanceWatcher<I> *instance_watcher;
+ std::string instance_id;
+ uint64_t request_id;
+ bufferlist bl;
+ Context *on_finish;
+ bool send_to_leader;
+ std::unique_ptr<librbd::watcher::Notifier> notifier;
+ librbd::watcher::NotifyResponse response;
+ bool canceling = false;
+
+ C_NotifyInstanceRequest(InstanceWatcher<I> *instance_watcher,
+ const std::string &instance_id, uint64_t request_id,
+ bufferlist &&bl, Context *on_finish)
+ : instance_watcher(instance_watcher), instance_id(instance_id),
+ request_id(request_id), bl(bl), on_finish(on_finish),
+ send_to_leader(instance_id.empty()) {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": instance_watcher=" << instance_watcher << ", instance_id="
+ << instance_id << ", request_id=" << request_id << dendl;
+
+ ceph_assert(instance_watcher->m_lock.is_locked());
+
+ if (!send_to_leader) {
+ ceph_assert((!instance_id.empty()));
+ notifier.reset(new librbd::watcher::Notifier(
+ instance_watcher->m_work_queue,
+ instance_watcher->m_ioctx,
+ RBD_MIRROR_INSTANCE_PREFIX + instance_id));
+ }
+
+ instance_watcher->m_notify_op_tracker.start_op();
+ auto result = instance_watcher->m_notify_ops.insert(
+ std::make_pair(instance_id, this)).second;
+ ceph_assert(result);
+ }
+
+ void send() {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << dendl;
+
+ ceph_assert(instance_watcher->m_lock.is_locked());
+
+ if (canceling) {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": canceling" << dendl;
+ instance_watcher->m_work_queue->queue(this, -ECANCELED);
+ return;
+ }
+
+ if (send_to_leader) {
+ if (instance_watcher->m_leader_instance_id.empty()) {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": suspending" << dendl;
+ instance_watcher->suspend_notify_request(this);
+ return;
+ }
+
+ if (instance_watcher->m_leader_instance_id != instance_id) {
+ auto count = instance_watcher->m_notify_ops.erase(
+ std::make_pair(instance_id, this));
+ ceph_assert(count > 0);
+
+ instance_id = instance_watcher->m_leader_instance_id;
+
+ auto result = instance_watcher->m_notify_ops.insert(
+ std::make_pair(instance_id, this)).second;
+ ceph_assert(result);
+
+ notifier.reset(new librbd::watcher::Notifier(
+ instance_watcher->m_work_queue,
+ instance_watcher->m_ioctx,
+ RBD_MIRROR_INSTANCE_PREFIX + instance_id));
+ }
+ }
+
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": sending to " << instance_id << dendl;
+ notifier->notify(bl, &response, this);
+ }
+
+ void cancel() {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << dendl;
+
+ ceph_assert(instance_watcher->m_lock.is_locked());
+
+ canceling = true;
+ instance_watcher->unsuspend_notify_request(this);
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_NotifyInstanceRequest: " << this << " " << __func__ << ": r="
+ << r << dendl;
+
+ if (r == 0 || r == -ETIMEDOUT) {
+ bool found = false;
+ for (auto &it : response.acks) {
+ auto &bl = it.second;
+ if (it.second.length() == 0) {
+ dout(5) << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": no payload in ack, ignoring" << dendl;
+ continue;
+ }
+ try {
+ auto iter = bl.cbegin();
+ NotifyAckPayload ack;
+ decode(ack, iter);
+ if (ack.instance_id != instance_watcher->get_instance_id()) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": ack instance_id (" << ack.instance_id << ") "
+ << "does not match, ignoring" << dendl;
+ continue;
+ }
+ if (ack.request_id != request_id) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": ack request_id (" << ack.request_id << ") "
+ << "does not match, ignoring" << dendl;
+ continue;
+ }
+ r = ack.ret_val;
+ found = true;
+ break;
+ } catch (const buffer::error &err) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": failed to decode ack: " << err.what() << dendl;
+ continue;
+ }
+ }
+
+ if (!found) {
+ if (r == -ETIMEDOUT) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": resending after timeout" << dendl;
+ Mutex::Locker locker(instance_watcher->m_lock);
+ send();
+ return;
+ } else {
+ r = -EINVAL;
+ }
+ } else {
+ if (r == -ESTALE && send_to_leader) {
+ derr << "C_NotifyInstanceRequest: " << this << " " << __func__
+ << ": resending due to leader change" << dendl;
+ Mutex::Locker locker(instance_watcher->m_lock);
+ send();
+ return;
+ }
+ }
+ }
+
+ on_finish->complete(r);
+
+ {
+ Mutex::Locker locker(instance_watcher->m_lock);
+ auto result = instance_watcher->m_notify_ops.erase(
+ std::make_pair(instance_id, this));
+ ceph_assert(result > 0);
+ instance_watcher->m_notify_op_tracker.finish_op();
+ }
+
+ delete this;
+ }
+
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+template <typename I>
+struct InstanceWatcher<I>::C_SyncRequest : public Context {
+ InstanceWatcher<I> *instance_watcher;
+ std::string sync_id;
+ Context *on_start;
+ Context *on_complete = nullptr;
+ C_NotifyInstanceRequest *req = nullptr;
+
+ C_SyncRequest(InstanceWatcher<I> *instance_watcher,
+ const std::string &sync_id, Context *on_start)
+ : instance_watcher(instance_watcher), sync_id(sync_id),
+ on_start(on_start) {
+ dout(10) << "C_SyncRequest: " << this << " " << __func__ << ": sync_id="
+ << sync_id << dendl;
+ }
+
+ void finish(int r) override {
+ dout(10) << "C_SyncRequest: " << this << " " << __func__ << ": r="
+ << r << dendl;
+
+ if (on_start != nullptr) {
+ instance_watcher->handle_notify_sync_request(this, r);
+ } else {
+ instance_watcher->handle_notify_sync_complete(this, r);
+ delete this;
+ }
+ }
+
+ // called twice
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::InstanceWatcher: " \
+ << this << " " << __func__ << ": "
+template <typename I>
+void InstanceWatcher<I>::get_instances(librados::IoCtx &io_ctx,
+ std::vector<std::string> *instance_ids,
+ Context *on_finish) {
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_instances_list_start(&op);
+ C_GetInstances *ctx = new C_GetInstances(instance_ids, on_finish);
+ librados::AioCompletion *aio_comp = create_rados_callback(ctx);
+
+ int r = io_ctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op, &ctx->out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void InstanceWatcher<I>::remove_instance(librados::IoCtx &io_ctx,
+ ContextWQ *work_queue,
+ const std::string &instance_id,
+ Context *on_finish) {
+ auto req = new C_RemoveInstanceRequest<I>(io_ctx, work_queue, instance_id,
+ on_finish);
+ req->send();
+}
+
+template <typename I>
+InstanceWatcher<I> *InstanceWatcher<I>::create(
+ librados::IoCtx &io_ctx, ContextWQ *work_queue,
+ InstanceReplayer<I> *instance_replayer) {
+ return new InstanceWatcher<I>(io_ctx, work_queue, instance_replayer,
+ stringify(io_ctx.get_instance_id()));
+}
+
+template <typename I>
+InstanceWatcher<I>::InstanceWatcher(librados::IoCtx &io_ctx,
+ ContextWQ *work_queue,
+ InstanceReplayer<I> *instance_replayer,
+ const std::string &instance_id)
+ : Watcher(io_ctx, work_queue, RBD_MIRROR_INSTANCE_PREFIX + instance_id),
+ m_instance_replayer(instance_replayer), m_instance_id(instance_id),
+ m_lock(unique_lock_name("rbd::mirror::InstanceWatcher::m_lock", this)),
+ m_instance_lock(librbd::ManagedLock<I>::create(
+ m_ioctx, m_work_queue, m_oid, this, librbd::managed_lock::EXCLUSIVE, true,
+ m_cct->_conf.get_val<uint64_t>("rbd_blacklist_expire_seconds"))) {
+}
+
+template <typename I>
+InstanceWatcher<I>::~InstanceWatcher() {
+ ceph_assert(m_requests.empty());
+ ceph_assert(m_notify_ops.empty());
+ ceph_assert(m_notify_op_tracker.empty());
+ ceph_assert(m_suspended_ops.empty());
+ ceph_assert(m_inflight_sync_reqs.empty());
+ ceph_assert(m_image_sync_throttler == nullptr);
+ m_instance_lock->destroy();
+}
+
+template <typename I>
+int InstanceWatcher<I>::init() {
+ C_SaferCond init_ctx;
+ init(&init_ctx);
+ return init_ctx.wait();
+}
+
+template <typename I>
+void InstanceWatcher<I>::init(Context *on_finish) {
+ dout(10) << "instance_id=" << m_instance_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ register_instance();
+}
+
+template <typename I>
+void InstanceWatcher<I>::shut_down() {
+ C_SaferCond shut_down_ctx;
+ shut_down(&shut_down_ctx);
+ int r = shut_down_ctx.wait();
+ ceph_assert(r == 0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::shut_down(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ release_lock();
+}
+
+template <typename I>
+void InstanceWatcher<I>::remove(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ get_instance_locker();
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_image_acquire(
+ const std::string &instance_id, const std::string &global_image_id,
+ Context *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", global_image_id="
+ << global_image_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_finish == nullptr);
+
+ uint64_t request_id = ++m_request_seq;
+ bufferlist bl;
+ encode(NotifyMessage{ImageAcquirePayload{request_id, global_image_id}}, bl);
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), on_notify_ack);
+ req->send();
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_image_release(
+ const std::string &instance_id, const std::string &global_image_id,
+ Context *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", global_image_id="
+ << global_image_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_finish == nullptr);
+
+ uint64_t request_id = ++m_request_seq;
+ bufferlist bl;
+ encode(NotifyMessage{ImageReleasePayload{request_id, global_image_id}}, bl);
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), on_notify_ack);
+ req->send();
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_peer_image_removed(
+ const std::string &instance_id, const std::string &global_image_id,
+ const std::string &peer_mirror_uuid, Context *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", "
+ << "global_image_id=" << global_image_id << ", "
+ << "peer_mirror_uuid=" << peer_mirror_uuid << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == nullptr);
+
+ uint64_t request_id = ++m_request_seq;
+ bufferlist bl;
+ encode(NotifyMessage{PeerImageRemovedPayload{request_id, global_image_id,
+ peer_mirror_uuid}}, bl);
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), on_notify_ack);
+ req->send();
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_sync_request(const std::string &sync_id,
+ Context *on_sync_start) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_inflight_sync_reqs.count(sync_id) == 0);
+
+ uint64_t request_id = ++m_request_seq;
+
+ bufferlist bl;
+ encode(NotifyMessage{SyncRequestPayload{request_id, sync_id}}, bl);
+
+ auto sync_ctx = new C_SyncRequest(this, sync_id, on_sync_start);
+ sync_ctx->req = new C_NotifyInstanceRequest(this, "", request_id,
+ std::move(bl), sync_ctx);
+
+ m_inflight_sync_reqs[sync_id] = sync_ctx;
+ sync_ctx->req->send();
+}
+
+template <typename I>
+bool InstanceWatcher<I>::cancel_sync_request(const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ auto it = m_inflight_sync_reqs.find(sync_id);
+ if (it == m_inflight_sync_reqs.end()) {
+ return false;
+ }
+
+ auto sync_ctx = it->second;
+
+ if (sync_ctx->on_start == nullptr) {
+ return false;
+ }
+
+ ceph_assert(sync_ctx->req != nullptr);
+ sync_ctx->req->cancel();
+ return true;
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_sync_start(const std::string &instance_id,
+ const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ uint64_t request_id = ++m_request_seq;
+
+ bufferlist bl;
+ encode(NotifyMessage{SyncStartPayload{request_id, sync_id}}, bl);
+
+ auto ctx = new FunctionContext(
+ [this, sync_id] (int r) {
+ dout(10) << "finish: sync_id=" << sync_id << ", r=" << r << dendl;
+ Mutex::Locker locker(m_lock);
+ if (r != -ESTALE && m_image_sync_throttler != nullptr) {
+ m_image_sync_throttler->finish_op(sync_id);
+ }
+ });
+ auto req = new C_NotifyInstanceRequest(this, instance_id, request_id,
+ std::move(bl), ctx);
+ req->send();
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_sync_complete(const std::string &sync_id) {
+ Mutex::Locker locker(m_lock);
+ notify_sync_complete(m_lock, sync_id);
+}
+
+template <typename I>
+void InstanceWatcher<I>::notify_sync_complete(const Mutex&,
+ const std::string &sync_id) {
+ dout(10) << "sync_id=" << sync_id << dendl;
+ ceph_assert(m_lock.is_locked());
+
+ auto it = m_inflight_sync_reqs.find(sync_id);
+ ceph_assert(it != m_inflight_sync_reqs.end());
+
+ auto sync_ctx = it->second;
+ ceph_assert(sync_ctx->req == nullptr);
+
+ m_inflight_sync_reqs.erase(it);
+ m_work_queue->queue(sync_ctx, 0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_notify_sync_request(C_SyncRequest *sync_ctx,
+ int r) {
+ dout(10) << "sync_id=" << sync_ctx->sync_id << ", r=" << r << dendl;
+
+ Context *on_start = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(sync_ctx->req != nullptr);
+ ceph_assert(sync_ctx->on_start != nullptr);
+
+ if (sync_ctx->req->canceling) {
+ r = -ECANCELED;
+ }
+
+ std::swap(sync_ctx->on_start, on_start);
+ sync_ctx->req = nullptr;
+
+ if (r == -ECANCELED) {
+ notify_sync_complete(m_lock, sync_ctx->sync_id);
+ }
+ }
+
+ on_start->complete(r == -ECANCELED ? r : 0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_notify_sync_complete(C_SyncRequest *sync_ctx,
+ int r) {
+ dout(10) << "sync_id=" << sync_ctx->sync_id << ", r=" << r << dendl;
+
+ if (sync_ctx->on_complete != nullptr) {
+ sync_ctx->on_complete->complete(r);
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::print_sync_status(Formatter *f, stringstream *ss) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ if (m_image_sync_throttler != nullptr) {
+ m_image_sync_throttler->print_status(f, ss);
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_acquire_leader() {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_image_sync_throttler == nullptr);
+ m_image_sync_throttler = ImageSyncThrottler<I>::create(m_cct);
+
+ m_leader_instance_id = m_instance_id;
+ unsuspend_notify_requests();
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_release_leader() {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_image_sync_throttler != nullptr);
+
+ m_leader_instance_id.clear();
+
+ m_image_sync_throttler->drain(-ESTALE);
+ m_image_sync_throttler->destroy();
+ m_image_sync_throttler = nullptr;
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_update_leader(
+ const std::string &leader_instance_id) {
+ dout(10) << "leader_instance_id=" << leader_instance_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ m_leader_instance_id = leader_instance_id;
+
+ if (!m_leader_instance_id.empty()) {
+ unsuspend_notify_requests();
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::cancel_notify_requests(
+ const std::string &instance_id) {
+ dout(10) << "instance_id=" << instance_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ for (auto op : m_notify_ops) {
+ if (op.first == instance_id && !op.second->send_to_leader) {
+ op.second->cancel();
+ }
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::register_instance() {
+ ceph_assert(m_lock.is_locked());
+
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_instances_add(&op, m_instance_id);
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_register_instance>(this);
+
+ int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_register_instance(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ if (r == 0) {
+ create_instance_object();
+ return;
+ }
+
+ derr << "error registering instance: " << cpp_strerror(r) << dendl;
+
+ std::swap(on_finish, m_on_finish);
+ }
+ on_finish->complete(r);
+}
+
+
+template <typename I>
+void InstanceWatcher<I>::create_instance_object() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ librados::ObjectWriteOperation op;
+ op.create(true);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>,
+ &InstanceWatcher<I>::handle_create_instance_object>(this);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_create_instance_object(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error creating " << m_oid << " object: " << cpp_strerror(r)
+ << dendl;
+
+ m_ret_val = r;
+ unregister_instance();
+ return;
+ }
+
+ register_watch();
+}
+
+template <typename I>
+void InstanceWatcher<I>::register_watch() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_register_watch>(this));
+
+ librbd::Watcher::register_watch(ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_register_watch(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error registering instance watcher for " << m_oid << " object: "
+ << cpp_strerror(r) << dendl;
+
+ m_ret_val = r;
+ remove_instance_object();
+ return;
+ }
+
+ acquire_lock();
+}
+
+template <typename I>
+void InstanceWatcher<I>::acquire_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_acquire_lock>(this));
+
+ m_instance_lock->acquire_lock(ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_acquire_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+
+ derr << "error acquiring instance lock: " << cpp_strerror(r) << dendl;
+
+ m_ret_val = r;
+ unregister_watch();
+ return;
+ }
+
+ std::swap(on_finish, m_on_finish);
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void InstanceWatcher<I>::release_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_release_lock>(this));
+
+ m_instance_lock->shut_down(ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_release_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error releasing instance lock: " << cpp_strerror(r) << dendl;
+ }
+
+ unregister_watch();
+}
+
+template <typename I>
+void InstanceWatcher<I>::unregister_watch() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_unregister_watch>(this));
+
+ librbd::Watcher::unregister_watch(ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_unregister_watch(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error unregistering instance watcher for " << m_oid << " object: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ Mutex::Locker locker(m_lock);
+ remove_instance_object();
+}
+
+template <typename I>
+void InstanceWatcher<I>::remove_instance_object() {
+ ceph_assert(m_lock.is_locked());
+
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>,
+ &InstanceWatcher<I>::handle_remove_instance_object>(this);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_remove_instance_object(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ r = 0;
+ }
+
+ if (r < 0) {
+ derr << "error removing " << m_oid << " object: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ Mutex::Locker locker(m_lock);
+ unregister_instance();
+}
+
+template <typename I>
+void InstanceWatcher<I>::unregister_instance() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_instances_remove(&op, m_instance_id);
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_unregister_instance>(this);
+
+ int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_unregister_instance(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error unregistering instance: " << cpp_strerror(r) << dendl;
+ }
+
+ Mutex::Locker locker(m_lock);
+ wait_for_notify_ops();
+}
+
+template <typename I>
+void InstanceWatcher<I>::wait_for_notify_ops() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ for (auto op : m_notify_ops) {
+ op.second->cancel();
+ }
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_wait_for_notify_ops>(this));
+
+ m_notify_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_wait_for_notify_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_notify_ops.empty());
+
+ std::swap(on_finish, m_on_finish);
+ r = m_ret_val;
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void InstanceWatcher<I>::get_instance_locker() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_get_instance_locker>(this));
+
+ m_instance_lock->get_locker(&m_instance_locker, ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_get_instance_locker(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ derr << "error retrieving instance locker: " << cpp_strerror(r) << dendl;
+ }
+ remove_instance_object();
+ return;
+ }
+
+ break_instance_lock();
+}
+
+template <typename I>
+void InstanceWatcher<I>::break_instance_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ InstanceWatcher<I>, &InstanceWatcher<I>::handle_break_instance_lock>(this));
+
+ m_instance_lock->break_lock(m_instance_locker, true, ctx);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_break_instance_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ derr << "error breaking instance lock: " << cpp_strerror(r) << dendl;
+ }
+ remove_instance_object();
+ return;
+ }
+
+ remove_instance_object();
+}
+
+template <typename I>
+void InstanceWatcher<I>::suspend_notify_request(C_NotifyInstanceRequest *req) {
+ dout(10) << req << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ auto result = m_suspended_ops.insert(req).second;
+ ceph_assert(result);
+}
+
+template <typename I>
+bool InstanceWatcher<I>::unsuspend_notify_request(
+ C_NotifyInstanceRequest *req) {
+ dout(10) << req << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ auto result = m_suspended_ops.erase(req);
+ if (result == 0) {
+ return false;
+ }
+
+ req->send();
+ return true;
+}
+
+template <typename I>
+void InstanceWatcher<I>::unsuspend_notify_requests() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ std::set<C_NotifyInstanceRequest *> suspended_ops;
+ std::swap(m_suspended_ops, suspended_ops);
+
+ for (auto op : suspended_ops) {
+ op->send();
+ }
+}
+
+template <typename I>
+Context *InstanceWatcher<I>::prepare_request(const std::string &instance_id,
+ uint64_t request_id,
+ C_NotifyAck *on_notify_ack) {
+ dout(10) << "instance_id=" << instance_id << ", request_id=" << request_id
+ << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ Context *ctx = nullptr;
+ Request request(instance_id, request_id);
+ auto it = m_requests.find(request);
+
+ if (it != m_requests.end()) {
+ dout(10) << "duplicate for in-progress request" << dendl;
+ delete it->on_notify_ack;
+ m_requests.erase(it);
+ } else {
+ ctx = create_async_context_callback(
+ m_work_queue, new FunctionContext(
+ [this, instance_id, request_id] (int r) {
+ complete_request(instance_id, request_id, r);
+ }));
+ }
+
+ request.on_notify_ack = on_notify_ack;
+ m_requests.insert(request);
+ return ctx;
+}
+
+template <typename I>
+void InstanceWatcher<I>::complete_request(const std::string &instance_id,
+ uint64_t request_id, int r) {
+ dout(10) << "instance_id=" << instance_id << ", request_id=" << request_id
+ << dendl;
+
+ C_NotifyAck *on_notify_ack;
+ {
+ Mutex::Locker locker(m_lock);
+ Request request(instance_id, request_id);
+ auto it = m_requests.find(request);
+ ceph_assert(it != m_requests.end());
+ on_notify_ack = it->on_notify_ack;
+ m_requests.erase(it);
+ }
+
+ encode(NotifyAckPayload(instance_id, request_id, r), on_notify_ack->out);
+ on_notify_ack->complete(0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) {
+ dout(10) << "notify_id=" << notify_id << ", handle=" << handle << ", "
+ << "notifier_id=" << notifier_id << dendl;
+
+ auto ctx = new C_NotifyAck(this, notify_id, handle);
+
+ NotifyMessage notify_message;
+ try {
+ auto iter = bl.cbegin();
+ decode(notify_message, iter);
+ } catch (const buffer::error &err) {
+ derr << "error decoding image notification: " << err.what() << dendl;
+ ctx->complete(0);
+ return;
+ }
+
+ apply_visitor(HandlePayloadVisitor(this, stringify(notifier_id), ctx),
+ notify_message.payload);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_image_acquire(
+ const std::string &global_image_id, Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+
+ auto ctx = new FunctionContext(
+ [this, global_image_id, on_finish] (int r) {
+ m_instance_replayer->acquire_image(this, global_image_id, on_finish);
+ m_notify_op_tracker.finish_op();
+ });
+
+ m_notify_op_tracker.start_op();
+ m_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_image_release(
+ const std::string &global_image_id, Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << dendl;
+
+ auto ctx = new FunctionContext(
+ [this, global_image_id, on_finish] (int r) {
+ m_instance_replayer->release_image(global_image_id, on_finish);
+ m_notify_op_tracker.finish_op();
+ });
+
+ m_notify_op_tracker.start_op();
+ m_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_peer_image_removed(
+ const std::string &global_image_id, const std::string &peer_mirror_uuid,
+ Context *on_finish) {
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "peer_mirror_uuid=" << peer_mirror_uuid << dendl;
+
+ auto ctx = new FunctionContext(
+ [this, peer_mirror_uuid, global_image_id, on_finish] (int r) {
+ m_instance_replayer->remove_peer_image(global_image_id,
+ peer_mirror_uuid, on_finish);
+ m_notify_op_tracker.finish_op();
+ });
+
+ m_notify_op_tracker.start_op();
+ m_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_sync_request(const std::string &instance_id,
+ const std::string &sync_id,
+ Context *on_finish) {
+ dout(10) << "instance_id=" << instance_id << ", sync_id=" << sync_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (m_image_sync_throttler == nullptr) {
+ dout(10) << "sync request for non-leader" << dendl;
+ m_work_queue->queue(on_finish, -ESTALE);
+ return;
+ }
+
+ Context *on_start = create_async_context_callback(
+ m_work_queue, new FunctionContext(
+ [this, instance_id, sync_id, on_finish] (int r) {
+ dout(10) << "handle_sync_request: finish: instance_id=" << instance_id
+ << ", sync_id=" << sync_id << ", r=" << r << dendl;
+ if (r == 0) {
+ notify_sync_start(instance_id, sync_id);
+ }
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ on_finish->complete(r);
+ }));
+ m_image_sync_throttler->start_op(sync_id, on_start);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_sync_start(const std::string &instance_id,
+ const std::string &sync_id,
+ Context *on_finish) {
+ dout(10) << "instance_id=" << instance_id << ", sync_id=" << sync_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ auto it = m_inflight_sync_reqs.find(sync_id);
+ if (it == m_inflight_sync_reqs.end()) {
+ dout(5) << "not found" << dendl;
+ m_work_queue->queue(on_finish, 0);
+ return;
+ }
+
+ auto sync_ctx = it->second;
+
+ if (sync_ctx->on_complete != nullptr) {
+ dout(5) << "duplicate request" << dendl;
+ m_work_queue->queue(sync_ctx->on_complete, -ESTALE);
+ }
+
+ sync_ctx->on_complete = on_finish;
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+ const ImageAcquirePayload &payload,
+ C_NotifyAck *on_notify_ack) {
+ dout(10) << "image_acquire: instance_id=" << instance_id << ", "
+ << "request_id=" << payload.request_id << dendl;
+
+ auto on_finish = prepare_request(instance_id, payload.request_id,
+ on_notify_ack);
+ if (on_finish != nullptr) {
+ handle_image_acquire(payload.global_image_id, on_finish);
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+ const ImageReleasePayload &payload,
+ C_NotifyAck *on_notify_ack) {
+ dout(10) << "image_release: instance_id=" << instance_id << ", "
+ << "request_id=" << payload.request_id << dendl;
+
+ auto on_finish = prepare_request(instance_id, payload.request_id,
+ on_notify_ack);
+ if (on_finish != nullptr) {
+ handle_image_release(payload.global_image_id, on_finish);
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+ const PeerImageRemovedPayload &payload,
+ C_NotifyAck *on_notify_ack) {
+ dout(10) << "remove_peer_image: instance_id=" << instance_id << ", "
+ << "request_id=" << payload.request_id << dendl;
+
+ auto on_finish = prepare_request(instance_id, payload.request_id,
+ on_notify_ack);
+ if (on_finish != nullptr) {
+ handle_peer_image_removed(payload.global_image_id, payload.peer_mirror_uuid,
+ on_finish);
+ }
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+ const SyncRequestPayload &payload,
+ C_NotifyAck *on_notify_ack) {
+ dout(10) << "sync_request: instance_id=" << instance_id << ", "
+ << "request_id=" << payload.request_id << dendl;
+
+ auto on_finish = prepare_request(instance_id, payload.request_id,
+ on_notify_ack);
+ if (on_finish == nullptr) {
+ return;
+ }
+
+ handle_sync_request(instance_id, payload.sync_id, on_finish);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+ const SyncStartPayload &payload,
+ C_NotifyAck *on_notify_ack) {
+ dout(10) << "sync_start: instance_id=" << instance_id << ", "
+ << "request_id=" << payload.request_id << dendl;
+
+ auto on_finish = prepare_request(instance_id, payload.request_id,
+ on_notify_ack);
+ if (on_finish == nullptr) {
+ return;
+ }
+
+ handle_sync_start(instance_id, payload.sync_id, on_finish);
+}
+
+template <typename I>
+void InstanceWatcher<I>::handle_payload(const std::string &instance_id,
+ const UnknownPayload &payload,
+ C_NotifyAck *on_notify_ack) {
+ dout(5) << "unknown: instance_id=" << instance_id << dendl;
+
+ on_notify_ack->complete(0);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::InstanceWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/InstanceWatcher.h b/src/tools/rbd_mirror/InstanceWatcher.h
new file mode 100644
index 00000000..5ec1aef0
--- /dev/null
+++ b/src/tools/rbd_mirror/InstanceWatcher.h
@@ -0,0 +1,264 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_INSTANCE_WATCHER_H
+#define CEPH_RBD_MIRROR_INSTANCE_WATCHER_H
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/AsyncOpTracker.h"
+#include "librbd/Watcher.h"
+#include "librbd/managed_lock/Types.h"
+#include "tools/rbd_mirror/instance_watcher/Types.h"
+
+namespace librbd {
+
+class ImageCtx;
+template <typename> class ManagedLock;
+
+}
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ImageSyncThrottler;
+template <typename> class InstanceReplayer;
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class InstanceWatcher : protected librbd::Watcher {
+ using librbd::Watcher::unregister_watch; // Silence overloaded virtual warning
+public:
+ static void get_instances(librados::IoCtx &io_ctx,
+ std::vector<std::string> *instance_ids,
+ Context *on_finish);
+ static void remove_instance(librados::IoCtx &io_ctx,
+ ContextWQ *work_queue,
+ const std::string &instance_id,
+ Context *on_finish);
+
+ static InstanceWatcher *create(
+ librados::IoCtx &io_ctx, ContextWQ *work_queue,
+ InstanceReplayer<ImageCtxT> *instance_replayer);
+ void destroy() {
+ delete this;
+ }
+
+ InstanceWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue,
+ InstanceReplayer<ImageCtxT> *instance_replayer,
+ const std::string &instance_id);
+ ~InstanceWatcher() override;
+
+ inline std::string &get_instance_id() {
+ return m_instance_id;
+ }
+
+ int init();
+ void shut_down();
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+ void remove(Context *on_finish);
+
+ void notify_image_acquire(const std::string &instance_id,
+ const std::string &global_image_id,
+ Context *on_notify_ack);
+ void notify_image_release(const std::string &instance_id,
+ const std::string &global_image_id,
+ Context *on_notify_ack);
+ void notify_peer_image_removed(const std::string &instance_id,
+ const std::string &global_image_id,
+ const std::string &peer_mirror_uuid,
+ Context *on_notify_ack);
+
+ void notify_sync_request(const std::string &sync_id, Context *on_sync_start);
+ bool cancel_sync_request(const std::string &sync_id);
+ void notify_sync_complete(const std::string &sync_id);
+
+ void print_sync_status(Formatter *f, stringstream *ss);
+
+ void cancel_notify_requests(const std::string &instance_id);
+
+ void handle_acquire_leader();
+ void handle_release_leader();
+ void handle_update_leader(const std::string &leader_instance_id);
+
+private:
+ /**
+ * @verbatim
+ *
+ * BREAK_INSTANCE_LOCK -------\
+ * ^ |
+ * | (error) |
+ * GET_INSTANCE_LOCKER * * *>|
+ * ^ (remove) |
+ * | |
+ * <uninitialized> <----------------+---- WAIT_FOR_NOTIFY_OPS
+ * | (init) ^ | ^
+ * v (error) * | |
+ * REGISTER_INSTANCE * * * * * *|* *> UNREGISTER_INSTANCE
+ * | * | ^
+ * v (error) * v |
+ * CREATE_INSTANCE_OBJECT * * * * * *> REMOVE_INSTANCE_OBJECT
+ * | * ^
+ * v (error) * |
+ * REGISTER_WATCH * * * * * * * * * *> UNREGISTER_WATCH
+ * | * ^
+ * v (error) * |
+ * ACQUIRE_LOCK * * * * * * * * * * * RELEASE_LOCK
+ * | ^
+ * v (shut_down) |
+ * <watching> -------------------------------/
+ *
+ * @endverbatim
+ */
+
+ struct C_NotifyInstanceRequest;
+ struct C_SyncRequest;
+
+ typedef std::pair<std::string, std::string> Id;
+
+ struct HandlePayloadVisitor : public boost::static_visitor<void> {
+ InstanceWatcher *instance_watcher;
+ std::string instance_id;
+ C_NotifyAck *on_notify_ack;
+
+ HandlePayloadVisitor(InstanceWatcher *instance_watcher,
+ const std::string &instance_id,
+ C_NotifyAck *on_notify_ack)
+ : instance_watcher(instance_watcher), instance_id(instance_id),
+ on_notify_ack(on_notify_ack) {
+ }
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ instance_watcher->handle_payload(instance_id, payload, on_notify_ack);
+ }
+ };
+
+ struct Request {
+ std::string instance_id;
+ uint64_t request_id;
+ C_NotifyAck *on_notify_ack = nullptr;
+
+ Request(const std::string &instance_id, uint64_t request_id)
+ : instance_id(instance_id), request_id(request_id) {
+ }
+
+ inline bool operator<(const Request &rhs) const {
+ return instance_id < rhs.instance_id ||
+ (instance_id == rhs.instance_id && request_id < rhs.request_id);
+ }
+ };
+
+ Threads<ImageCtxT> *m_threads;
+ InstanceReplayer<ImageCtxT> *m_instance_replayer;
+ std::string m_instance_id;
+
+ mutable Mutex m_lock;
+ librbd::ManagedLock<ImageCtxT> *m_instance_lock;
+ Context *m_on_finish = nullptr;
+ int m_ret_val = 0;
+ std::string m_leader_instance_id;
+ librbd::managed_lock::Locker m_instance_locker;
+ std::set<std::pair<std::string, C_NotifyInstanceRequest *>> m_notify_ops;
+ AsyncOpTracker m_notify_op_tracker;
+ uint64_t m_request_seq = 0;
+ std::set<Request> m_requests;
+ std::set<C_NotifyInstanceRequest *> m_suspended_ops;
+ std::map<std::string, C_SyncRequest *> m_inflight_sync_reqs;
+ ImageSyncThrottler<ImageCtxT> *m_image_sync_throttler = nullptr;
+
+ void register_instance();
+ void handle_register_instance(int r);
+
+ void create_instance_object();
+ void handle_create_instance_object(int r);
+
+ void register_watch();
+ void handle_register_watch(int r);
+
+ void acquire_lock();
+ void handle_acquire_lock(int r);
+
+ void release_lock();
+ void handle_release_lock(int r);
+
+ void unregister_watch();
+ void handle_unregister_watch(int r);
+
+ void remove_instance_object();
+ void handle_remove_instance_object(int r);
+
+ void unregister_instance();
+ void handle_unregister_instance(int r);
+
+ void wait_for_notify_ops();
+ void handle_wait_for_notify_ops(int r);
+
+ void get_instance_locker();
+ void handle_get_instance_locker(int r);
+
+ void break_instance_lock();
+ void handle_break_instance_lock(int r);
+
+ void suspend_notify_request(C_NotifyInstanceRequest *req);
+ bool unsuspend_notify_request(C_NotifyInstanceRequest *req);
+ void unsuspend_notify_requests();
+
+ void notify_sync_complete(const Mutex& lock, const std::string &sync_id);
+ void handle_notify_sync_request(C_SyncRequest *sync_ctx, int r);
+ void handle_notify_sync_complete(C_SyncRequest *sync_ctx, int r);
+
+ void notify_sync_start(const std::string &instance_id,
+ const std::string &sync_id);
+
+ Context *prepare_request(const std::string &instance_id, uint64_t request_id,
+ C_NotifyAck *on_notify_ack);
+ void complete_request(const std::string &instance_id, uint64_t request_id,
+ int r);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+
+ void handle_image_acquire(const std::string &global_image_id,
+ Context *on_finish);
+ void handle_image_release(const std::string &global_image_id,
+ Context *on_finish);
+ void handle_peer_image_removed(const std::string &global_image_id,
+ const std::string &peer_mirror_uuid,
+ Context *on_finish);
+
+ void handle_sync_request(const std::string &instance_id,
+ const std::string &sync_id, Context *on_finish);
+ void handle_sync_start(const std::string &instance_id,
+ const std::string &sync_id, Context *on_finish);
+
+ void handle_payload(const std::string &instance_id,
+ const instance_watcher::ImageAcquirePayload &payload,
+ C_NotifyAck *on_notify_ack);
+ void handle_payload(const std::string &instance_id,
+ const instance_watcher::ImageReleasePayload &payload,
+ C_NotifyAck *on_notify_ack);
+ void handle_payload(const std::string &instance_id,
+ const instance_watcher::PeerImageRemovedPayload &payload,
+ C_NotifyAck *on_notify_ack);
+ void handle_payload(const std::string &instance_id,
+ const instance_watcher::SyncRequestPayload &payload,
+ C_NotifyAck *on_notify_ack);
+ void handle_payload(const std::string &instance_id,
+ const instance_watcher::SyncStartPayload &payload,
+ C_NotifyAck *on_notify_ack);
+ void handle_payload(const std::string &instance_id,
+ const instance_watcher::UnknownPayload &payload,
+ C_NotifyAck *on_notify_ack);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_INSTANCE_WATCHER_H
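
From the requester's side, the sync-throttling RPCs declared above compose as follows: notify_sync_request() routes a SyncRequestPayload to the current leader (an empty instance id means "send to leader"), the leader's ImageSyncThrottler grants a slot and acks, and the supplied on_sync_start context fires; notify_sync_complete() later releases the slot, and cancel_sync_request() withdraws a request that has not yet been granted. A hedged sketch, assuming instance_watcher and sync_id already exist:

    // Sketch only: requester-side throttled image sync.
    Context *on_sync_start = new FunctionContext(
      [instance_watcher, sync_id](int r) {
        if (r < 0) {
          // e.g. -ECANCELED if cancel_sync_request() raced with the grant
          return;
        }
        // ... perform the image sync ...
        instance_watcher->notify_sync_complete(sync_id);  // frees the leader's slot
      });

    instance_watcher->notify_sync_request(sync_id, on_sync_start);
    // A request that has not been granted yet can still be withdrawn:
    // instance_watcher->cancel_sync_request(sync_id);
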
diff --git a/src/tools/rbd_mirror/Instances.cc b/src/tools/rbd_mirror/Instances.cc
new file mode 100644
index 00000000..b7a6cf11
--- /dev/null
+++ b/src/tools/rbd_mirror/Instances.cc
@@ -0,0 +1,359 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/stringify.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/Utils.h"
+#include "InstanceWatcher.h"
+#include "Instances.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::Instances: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+Instances<I>::Instances(Threads<I> *threads, librados::IoCtx &ioctx,
+ const std::string& instance_id,
+ instances::Listener& listener) :
+ m_threads(threads), m_ioctx(ioctx), m_instance_id(instance_id),
+ m_listener(listener), m_cct(reinterpret_cast<CephContext *>(ioctx.cct())),
+ m_lock("rbd::mirror::Instances " + ioctx.get_pool_name()) {
+}
+
+template <typename I>
+Instances<I>::~Instances() {
+}
+
+template <typename I>
+void Instances<I>::init(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ get_instances();
+}
+
+template <typename I>
+void Instances<I>::shut_down(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+
+ Context *ctx = new FunctionContext(
+ [this](int r) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ cancel_remove_task();
+ wait_for_ops();
+ });
+
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void Instances<I>::unblock_listener() {
+ dout(5) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_listener_blocked);
+ m_listener_blocked = false;
+
+ InstanceIds added_instance_ids;
+ for (auto& pair : m_instances) {
+ if (pair.second.state == INSTANCE_STATE_ADDING) {
+ added_instance_ids.push_back(pair.first);
+ }
+ }
+
+ if (!added_instance_ids.empty()) {
+ m_threads->work_queue->queue(
+ new C_NotifyInstancesAdded(this, added_instance_ids), 0);
+ }
+}
+
+template <typename I>
+void Instances<I>::acked(const InstanceIds& instance_ids) {
+ dout(10) << "instance_ids=" << instance_ids << dendl;
+
+ Mutex::Locker locker(m_lock);
+ if (m_on_finish != nullptr) {
+ dout(5) << "received on shut down, ignoring" << dendl;
+ return;
+ }
+
+ Context *ctx = new C_HandleAcked(this, instance_ids);
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void Instances<I>::handle_acked(const InstanceIds& instance_ids) {
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ if (m_on_finish != nullptr) {
+ dout(5) << "handled on shut down, ignoring" << dendl;
+ return;
+ }
+
+ InstanceIds added_instance_ids;
+ auto time = ceph_clock_now();
+ for (auto& instance_id : instance_ids) {
+ auto &instance = m_instances.insert(
+ std::make_pair(instance_id, Instance{})).first->second;
+ instance.acked_time = time;
+ if (instance.state == INSTANCE_STATE_ADDING) {
+ added_instance_ids.push_back(instance_id);
+ }
+ }
+
+ schedule_remove_task(time);
+ if (!m_listener_blocked && !added_instance_ids.empty()) {
+ m_threads->work_queue->queue(
+ new C_NotifyInstancesAdded(this, added_instance_ids), 0);
+ }
+}
+
+template <typename I>
+void Instances<I>::notify_instances_added(const InstanceIds& instance_ids) {
+ Mutex::Locker locker(m_lock);
+ InstanceIds added_instance_ids;
+ for (auto& instance_id : instance_ids) {
+ auto it = m_instances.find(instance_id);
+ if (it != m_instances.end() && it->second.state == INSTANCE_STATE_ADDING) {
+ added_instance_ids.push_back(instance_id);
+ }
+ }
+
+ if (added_instance_ids.empty()) {
+ return;
+ }
+
+ dout(5) << "instance_ids=" << added_instance_ids << dendl;
+ m_lock.Unlock();
+ m_listener.handle_added(added_instance_ids);
+ m_lock.Lock();
+
+ for (auto& instance_id : added_instance_ids) {
+ auto it = m_instances.find(instance_id);
+ if (it != m_instances.end() && it->second.state == INSTANCE_STATE_ADDING) {
+ it->second.state = INSTANCE_STATE_IDLE;
+ }
+ }
+}
+
+template <typename I>
+void Instances<I>::notify_instances_removed(const InstanceIds& instance_ids) {
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+ m_listener.handle_removed(instance_ids);
+
+ Mutex::Locker locker(m_lock);
+ for (auto& instance_id : instance_ids) {
+ m_instances.erase(instance_id);
+ }
+}
+
+template <typename I>
+void Instances<I>::list(std::vector<std::string> *instance_ids) {
+ dout(20) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ for (auto it : m_instances) {
+ instance_ids->push_back(it.first);
+ }
+}
+
+
+template <typename I>
+void Instances<I>::get_instances() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_context_callback<
+ Instances, &Instances<I>::handle_get_instances>(this);
+
+ InstanceWatcher<I>::get_instances(m_ioctx, &m_instance_ids, ctx);
+}
+
+template <typename I>
+void Instances<I>::handle_get_instances(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(on_finish, m_on_finish);
+ }
+
+ if (r < 0) {
+ derr << "error retrieving instances: " << cpp_strerror(r) << dendl;
+ } else {
+ handle_acked(m_instance_ids);
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void Instances<I>::wait_for_ops() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ Instances, &Instances<I>::handle_wait_for_ops>(this));
+
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void Instances<I>::handle_wait_for_ops(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ceph_assert(r == 0);
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(on_finish, m_on_finish);
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void Instances<I>::remove_instances(const utime_t& time) {
+ ceph_assert(m_lock.is_locked());
+
+ InstanceIds instance_ids;
+ for (auto& instance_pair : m_instances) {
+ if (instance_pair.first == m_instance_id) {
+ continue;
+ }
+ auto& instance = instance_pair.second;
+ if (instance.state != INSTANCE_STATE_REMOVING &&
+ instance.acked_time <= time) {
+ instance.state = INSTANCE_STATE_REMOVING;
+ instance_ids.push_back(instance_pair.first);
+ }
+ }
+ ceph_assert(!instance_ids.empty());
+
+ dout(10) << "instance_ids=" << instance_ids << dendl;
+ Context* ctx = new FunctionContext([this, instance_ids](int r) {
+ handle_remove_instances(r, instance_ids);
+ });
+ ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+ auto gather_ctx = new C_Gather(m_cct, ctx);
+ for (auto& instance_id : instance_ids) {
+ InstanceWatcher<I>::remove_instance(m_ioctx, m_threads->work_queue,
+ instance_id, gather_ctx->new_sub());
+ }
+
+ m_async_op_tracker.start_op();
+ gather_ctx->activate();
+}
+
+template <typename I>
+void Instances<I>::handle_remove_instances(
+ int r, const InstanceIds& instance_ids) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ dout(10) << "r=" << r << ", instance_ids=" << instance_ids << dendl;
+ ceph_assert(r == 0);
+
+ // fire removed notification now that instances have been blacklisted
+ m_threads->work_queue->queue(
+ new C_NotifyInstancesRemoved(this, instance_ids), 0);
+
+ // reschedule the timer for the next batch
+ schedule_remove_task(ceph_clock_now());
+ m_async_op_tracker.finish_op();
+}
+
+template <typename I>
+void Instances<I>::cancel_remove_task() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+
+ if (m_timer_task == nullptr) {
+ return;
+ }
+
+ dout(10) << dendl;
+
+ bool canceled = m_threads->timer->cancel_event(m_timer_task);
+ ceph_assert(canceled);
+ m_timer_task = nullptr;
+}
+
+template <typename I>
+void Instances<I>::schedule_remove_task(const utime_t& time) {
+ cancel_remove_task();
+ if (m_on_finish != nullptr) {
+ dout(10) << "received on shut down, ignoring" << dendl;
+ return;
+ }
+
+ int after = m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_heartbeat_interval") *
+ (1 + m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_missed_heartbeats") +
+ m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_acquire_attempts_before_break"));
+
+ bool schedule = false;
+ utime_t oldest_time = time;
+ for (auto& instance : m_instances) {
+ if (instance.first == m_instance_id) {
+ continue;
+ }
+ if (instance.second.state == INSTANCE_STATE_REMOVING) {
+ // removal is already in-flight
+ continue;
+ }
+
+ oldest_time = std::min(oldest_time, instance.second.acked_time);
+ schedule = true;
+ }
+
+ if (!schedule) {
+ return;
+ }
+
+ dout(10) << dendl;
+
+ // schedule a time to fire when the oldest instance should be removed
+ m_timer_task = new FunctionContext(
+ [this, oldest_time](int r) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ Mutex::Locker locker(m_lock);
+ m_timer_task = nullptr;
+
+ remove_instances(oldest_time);
+ });
+
+ oldest_time += after;
+ m_threads->timer->add_event_at(oldest_time, m_timer_task);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::Instances<librbd::ImageCtx>;
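As a worked illustration of the delay computed in schedule_remove_task() above (a sketch only; the option defaults of 5 seconds, 2 missed heartbeats and 3 acquire attempts are assumptions, defined elsewhere in the tree rather than in this hunk):

  // illustration only -- the three default values below are assumptions
  constexpr uint64_t heartbeat_interval    = 5;  // rbd_mirror_leader_heartbeat_interval (seconds)
  constexpr uint64_t max_missed_heartbeats = 2;  // rbd_mirror_leader_max_missed_heartbeats
  constexpr uint64_t max_acquire_attempts  = 3;  // rbd_mirror_leader_max_acquire_attempts_before_break
  constexpr uint64_t removal_delay =
      heartbeat_interval * (1 + max_missed_heartbeats + max_acquire_attempts);
  static_assert(removal_delay == 30,
                "an instance is removed roughly 30s after its last ack");

In other words, a peer rbd-mirror daemon that stops acking leader heartbeats becomes eligible for removal once its acked_time is older than that window.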
diff --git a/src/tools/rbd_mirror/Instances.h b/src/tools/rbd_mirror/Instances.h
new file mode 100644
index 00000000..dbfb16df
--- /dev/null
+++ b/src/tools/rbd_mirror/Instances.h
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_INSTANCES_H
+#define CEPH_RBD_MIRROR_INSTANCES_H
+
+#include <map>
+#include <vector>
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/Mutex.h"
+#include "librbd/Watcher.h"
+#include "tools/rbd_mirror/instances/Types.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class Instances {
+public:
+ typedef std::vector<std::string> InstanceIds;
+
+ static Instances *create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &ioctx,
+ const std::string& instance_id,
+ instances::Listener& listener) {
+ return new Instances(threads, ioctx, instance_id, listener);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ Instances(Threads<ImageCtxT> *threads, librados::IoCtx &ioctx,
+ const std::string& instance_id, instances::Listener& listener);
+ virtual ~Instances();
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+ void unblock_listener();
+
+ void acked(const InstanceIds& instance_ids);
+
+ void list(std::vector<std::string> *instance_ids);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <uninitialized> <---------------------\
+ * | (init) ^ |
+ * v (error) * |
+ * GET_INSTANCES * * * * * WAIT_FOR_OPS
+ * | ^
+ * v (shut_down) |
+ * <initialized> ------------------------/
+ * .
+ * . (remove_instance)
+ * v
+ * REMOVE_INSTANCE
+ *
+ * @endverbatim
+ */
+
+ enum InstanceState {
+ INSTANCE_STATE_ADDING,
+ INSTANCE_STATE_IDLE,
+ INSTANCE_STATE_REMOVING
+ };
+
+ struct Instance {
+ utime_t acked_time{};
+ InstanceState state = INSTANCE_STATE_ADDING;
+ };
+
+ struct C_NotifyBase : public Context {
+ Instances *instances;
+ InstanceIds instance_ids;
+
+ C_NotifyBase(Instances *instances, const InstanceIds& instance_ids)
+ : instances(instances), instance_ids(instance_ids) {
+ instances->m_async_op_tracker.start_op();
+ }
+
+ void finish(int r) override {
+ execute();
+ instances->m_async_op_tracker.finish_op();
+ }
+
+ virtual void execute() = 0;
+ };
+
+ struct C_HandleAcked : public C_NotifyBase {
+ C_HandleAcked(Instances *instances, const InstanceIds& instance_ids)
+ : C_NotifyBase(instances, instance_ids) {
+ }
+
+ void execute() override {
+ this->instances->handle_acked(this->instance_ids);
+ }
+ };
+
+ struct C_NotifyInstancesAdded : public C_NotifyBase {
+ C_NotifyInstancesAdded(Instances *instances,
+ const InstanceIds& instance_ids)
+ : C_NotifyBase(instances, instance_ids) {
+ }
+
+ void execute() override {
+ this->instances->notify_instances_added(this->instance_ids);
+ }
+ };
+
+ struct C_NotifyInstancesRemoved : public C_NotifyBase {
+ C_NotifyInstancesRemoved(Instances *instances,
+ const InstanceIds& instance_ids)
+ : C_NotifyBase(instances, instance_ids) {
+ }
+
+ void execute() override {
+ this->instances->notify_instances_removed(this->instance_ids);
+ }
+ };
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx &m_ioctx;
+ std::string m_instance_id;
+ instances::Listener& m_listener;
+ CephContext *m_cct;
+
+ Mutex m_lock;
+ InstanceIds m_instance_ids;
+ std::map<std::string, Instance> m_instances;
+ Context *m_on_finish = nullptr;
+ AsyncOpTracker m_async_op_tracker;
+
+ Context *m_timer_task = nullptr;
+
+ bool m_listener_blocked = true;
+
+ void handle_acked(const InstanceIds& instance_ids);
+ void notify_instances_added(const InstanceIds& instance_ids);
+ void notify_instances_removed(const InstanceIds& instance_ids);
+
+ void get_instances();
+ void handle_get_instances(int r);
+
+ void wait_for_ops();
+ void handle_wait_for_ops(int r);
+
+ void remove_instances(const utime_t& time);
+ void handle_remove_instances(int r, const InstanceIds& instance_ids);
+
+ void cancel_remove_task();
+ void schedule_remove_task(const utime_t& time);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_INSTANCES_H
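A minimal sketch of how a caller might drive the Instances<> interface declared above, assuming a Threads<> object, an open librados::IoCtx and a concrete instances::Listener already exist (the demo() wrapper and the "4123" instance id are hypothetical):

  #include "common/Cond.h"
  #include "tools/rbd_mirror/Instances.h"

  void demo(rbd::mirror::Threads<librbd::ImageCtx> *threads,
            librados::IoCtx &io_ctx, const std::string &local_instance_id,
            rbd::mirror::instances::Listener &listener) {
    auto instances = rbd::mirror::Instances<>::create(threads, io_ctx,
                                                      local_instance_id,
                                                      listener);
    C_SaferCond init_ctx;
    instances->init(&init_ctx);      // fetches known instances via InstanceWatcher
    if (init_ctx.wait() == 0) {
      instances->acked({"4123"});    // record a heartbeat ack from a peer
      std::vector<std::string> ids;
      instances->list(&ids);         // snapshot of currently tracked instances
    }
    C_SaferCond shut_down_ctx;
    instances->shut_down(&shut_down_ctx);
    shut_down_ctx.wait();
    instances->destroy();
  }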
diff --git a/src/tools/rbd_mirror/LeaderWatcher.cc b/src/tools/rbd_mirror/LeaderWatcher.cc
new file mode 100644
index 00000000..0d4bde6f
--- /dev/null
+++ b/src/tools/rbd_mirror/LeaderWatcher.cc
@@ -0,0 +1,1145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "LeaderWatcher.h"
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "include/stringify.h"
+#include "librbd/Utils.h"
+#include "librbd/watcher/Types.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::LeaderWatcher: " \
+ << this << " " << __func__ << ": "
+namespace rbd {
+namespace mirror {
+
+using namespace leader_watcher;
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+LeaderWatcher<I>::LeaderWatcher(Threads<I> *threads, librados::IoCtx &io_ctx,
+ leader_watcher::Listener *listener)
+ : Watcher(io_ctx, threads->work_queue, RBD_MIRROR_LEADER),
+ m_threads(threads), m_listener(listener), m_instances_listener(this),
+ m_lock("rbd::mirror::LeaderWatcher " + io_ctx.get_pool_name()),
+ m_notifier_id(librados::Rados(io_ctx).get_instance_id()),
+ m_instance_id(stringify(m_notifier_id)),
+ m_leader_lock(new LeaderLock(m_ioctx, m_work_queue, m_oid, this, true,
+ m_cct->_conf.get_val<uint64_t>(
+ "rbd_blacklist_expire_seconds"))) {
+}
+
+template <typename I>
+LeaderWatcher<I>::~LeaderWatcher() {
+ ceph_assert(m_status_watcher == nullptr);
+ ceph_assert(m_instances == nullptr);
+ ceph_assert(m_timer_task == nullptr);
+
+ delete m_leader_lock;
+}
+
+template <typename I>
+std::string LeaderWatcher<I>::get_instance_id() {
+ return m_instance_id;
+}
+
+template <typename I>
+int LeaderWatcher<I>::init() {
+ C_SaferCond init_ctx;
+ init(&init_ctx);
+ return init_ctx.wait();
+}
+
+template <typename I>
+void LeaderWatcher<I>::init(Context *on_finish) {
+ dout(10) << "notifier_id=" << m_notifier_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+
+ create_leader_object();
+}
+
+template <typename I>
+void LeaderWatcher<I>::create_leader_object() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ librados::ObjectWriteOperation op;
+ op.create(false);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_create_leader_object>(this);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_create_leader_object(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ if (r == 0) {
+ register_watch();
+ return;
+ }
+
+ derr << "error creating " << m_oid << " object: " << cpp_strerror(r)
+ << dendl;
+
+ std::swap(on_finish, m_on_finish);
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void LeaderWatcher<I>::register_watch() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_register_watch>(this));
+
+ librbd::Watcher::register_watch(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_register_watch(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ if (r < 0) {
+ Mutex::Locker locker(m_lock);
+ derr << "error registering leader watcher for " << m_oid << " object: "
+ << cpp_strerror(r) << dendl;
+ ceph_assert(m_on_finish != nullptr);
+ std::swap(on_finish, m_on_finish);
+ } else {
+ Mutex::Locker locker(m_lock);
+ init_status_watcher();
+ return;
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void LeaderWatcher<I>::shut_down() {
+ C_SaferCond shut_down_ctx;
+ shut_down(&shut_down_ctx);
+ int r = shut_down_ctx.wait();
+ ceph_assert(r == 0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::shut_down(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(m_on_shut_down_finish == nullptr);
+ m_on_shut_down_finish = on_finish;
+ cancel_timer_task();
+ shut_down_leader_lock();
+}
+
+template <typename I>
+void LeaderWatcher<I>::shut_down_leader_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_shut_down_leader_lock>(this));
+
+ m_leader_lock->shut_down(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_shut_down_leader_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error shutting down leader lock: " << cpp_strerror(r) << dendl;
+ }
+
+ shut_down_status_watcher();
+}
+
+template <typename I>
+void LeaderWatcher<I>::unregister_watch() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_unregister_watch>(this));
+
+ librbd::Watcher::unregister_watch(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_unregister_watch(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error unregistering leader watcher for " << m_oid << " object: "
+ << cpp_strerror(r) << dendl;
+ }
+ wait_for_tasks();
+}
+
+template <typename I>
+void LeaderWatcher<I>::wait_for_tasks() {
+ dout(10) << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ schedule_timer_task("wait for tasks", 0, false,
+ &LeaderWatcher<I>::handle_wait_for_tasks, true);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_wait_for_tasks() {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_on_shut_down_finish != nullptr);
+
+ ceph_assert(!m_timer_op_tracker.empty());
+ m_timer_op_tracker.finish_op();
+
+ auto ctx = new FunctionContext([this](int r) {
+ Context *on_finish;
+ {
+ // ensure lock isn't held when completing shut down
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_shut_down_finish != nullptr);
+ on_finish = m_on_shut_down_finish;
+ }
+ on_finish->complete(0);
+ });
+ m_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+bool LeaderWatcher<I>::is_blacklisted() const {
+ Mutex::Locker locker(m_lock);
+ return m_blacklisted;
+}
+
+template <typename I>
+bool LeaderWatcher<I>::is_leader() const {
+ Mutex::Locker locker(m_lock);
+
+ return is_leader(m_lock);
+}
+
+template <typename I>
+bool LeaderWatcher<I>::is_leader(Mutex &lock) const {
+ ceph_assert(m_lock.is_locked());
+
+ bool leader = m_leader_lock->is_leader();
+ dout(10) << leader << dendl;
+ return leader;
+}
+
+template <typename I>
+bool LeaderWatcher<I>::is_releasing_leader() const {
+ Mutex::Locker locker(m_lock);
+
+ return is_releasing_leader(m_lock);
+}
+
+template <typename I>
+bool LeaderWatcher<I>::is_releasing_leader(Mutex &lock) const {
+ ceph_assert(m_lock.is_locked());
+
+ bool releasing = m_leader_lock->is_releasing_leader();
+ dout(10) << releasing << dendl;
+ return releasing;
+}
+
+template <typename I>
+bool LeaderWatcher<I>::get_leader_instance_id(std::string *instance_id) const {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (is_leader(m_lock) || is_releasing_leader(m_lock)) {
+ *instance_id = m_instance_id;
+ return true;
+ }
+
+ if (!m_locker.cookie.empty()) {
+ *instance_id = stringify(m_locker.entity.num());
+ return true;
+ }
+
+ return false;
+}
+
+template <typename I>
+void LeaderWatcher<I>::release_leader() {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ if (!is_leader(m_lock)) {
+ return;
+ }
+
+ release_leader_lock();
+}
+
+template <typename I>
+void LeaderWatcher<I>::list_instances(std::vector<std::string> *instance_ids) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ instance_ids->clear();
+ if (m_instances != nullptr) {
+ m_instances->list(instance_ids);
+ }
+}
+
+template <typename I>
+void LeaderWatcher<I>::cancel_timer_task() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+
+ if (m_timer_task == nullptr) {
+ return;
+ }
+
+ dout(10) << m_timer_task << dendl;
+ bool canceled = m_threads->timer->cancel_event(m_timer_task);
+ ceph_assert(canceled);
+ m_timer_task = nullptr;
+}
+
+template <typename I>
+void LeaderWatcher<I>::schedule_timer_task(const std::string &name,
+ int delay_factor, bool leader,
+ TimerCallback timer_callback,
+ bool shutting_down) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+
+ if (!shutting_down && m_on_shut_down_finish != nullptr) {
+ return;
+ }
+
+ cancel_timer_task();
+
+ m_timer_task = new FunctionContext(
+ [this, leader, timer_callback](int r) {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ m_timer_task = nullptr;
+
+ if (m_timer_op_tracker.empty()) {
+ Mutex::Locker locker(m_lock);
+ execute_timer_task(leader, timer_callback);
+ return;
+ }
+
+ // old timer task is still running -- do not start next
+ // task until the previous task completes
+ if (m_timer_gate == nullptr) {
+ m_timer_gate = new C_TimerGate(this);
+ m_timer_op_tracker.wait_for_ops(m_timer_gate);
+ }
+ m_timer_gate->leader = leader;
+ m_timer_gate->timer_callback = timer_callback;
+ });
+
+ int after = delay_factor * m_cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_leader_heartbeat_interval");
+
+ dout(10) << "scheduling " << name << " after " << after << " sec (task "
+ << m_timer_task << ")" << dendl;
+ m_threads->timer->add_event_after(after, m_timer_task);
+}
+
+template <typename I>
+void LeaderWatcher<I>::execute_timer_task(bool leader,
+ TimerCallback timer_callback) {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_timer_op_tracker.empty());
+
+ if (is_leader(m_lock) != leader) {
+ return;
+ }
+
+ m_timer_op_tracker.start_op();
+ (this->*timer_callback)();
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_post_acquire_leader_lock(int r,
+ Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -EAGAIN) {
+ dout(10) << "already locked" << dendl;
+ } else {
+ derr << "error acquiring leader lock: " << cpp_strerror(r) << dendl;
+ }
+ on_finish->complete(r);
+ return;
+ }
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ init_instances();
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_pre_release_leader_lock(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+ m_ret_val = 0;
+
+ notify_listener();
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_post_release_leader_lock(int r,
+ Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ on_finish->complete(r);
+ return;
+ }
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == nullptr);
+ m_on_finish = on_finish;
+
+ notify_lock_released();
+}
+
+template <typename I>
+void LeaderWatcher<I>::break_leader_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ if (m_locker.cookie.empty()) {
+ get_locker();
+ return;
+ }
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_break_leader_lock>(this));
+
+ m_leader_lock->break_lock(m_locker, true, ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_break_leader_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ if (m_leader_lock->is_shutdown()) {
+ dout(10) << "canceling due to shutdown" << dendl;
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "error breaking leader lock: " << cpp_strerror(r) << dendl;
+ schedule_acquire_leader_lock(1);
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ m_locker = {};
+ m_acquire_attempts = 0;
+ acquire_leader_lock();
+}
+
+template <typename I>
+void LeaderWatcher<I>::schedule_get_locker(bool reset_leader,
+ uint32_t delay_factor) {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+
+ if (reset_leader) {
+ m_locker = {};
+ m_acquire_attempts = 0;
+ }
+
+ schedule_timer_task("get locker", delay_factor, false,
+ &LeaderWatcher<I>::get_locker, false);
+}
+
+template <typename I>
+void LeaderWatcher<I>::get_locker() {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ C_GetLocker *get_locker_ctx = new C_GetLocker(this);
+ Context *ctx = create_async_context_callback(m_work_queue, get_locker_ctx);
+
+ m_leader_lock->get_locker(&get_locker_ctx->locker, ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_get_locker(int r,
+ librbd::managed_lock::Locker& locker) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker mutex_locker(m_lock);
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ if (m_leader_lock->is_shutdown()) {
+ dout(10) << "canceling due to shutdown" << dendl;
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ if (is_leader(m_lock)) {
+ m_locker = {};
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ if (r == -ENOENT) {
+ m_locker = {};
+ m_acquire_attempts = 0;
+ acquire_leader_lock();
+ return;
+ } else if (r < 0) {
+ derr << "error retrieving leader locker: " << cpp_strerror(r) << dendl;
+ schedule_get_locker(true, 1);
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ bool notify_listener = false;
+ if (m_locker != locker) {
+ m_locker = locker;
+ notify_listener = true;
+ if (m_acquire_attempts > 1) {
+ dout(10) << "new lock owner detected -- resetting heartbeat counter"
+ << dendl;
+ m_acquire_attempts = 0;
+ }
+ }
+
+ if (m_acquire_attempts >= m_cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_leader_max_acquire_attempts_before_break")) {
+ dout(0) << "breaking leader lock after " << m_acquire_attempts << " "
+ << "failed attempts to acquire" << dendl;
+ break_leader_lock();
+ return;
+ }
+
+ schedule_acquire_leader_lock(1);
+
+ if (!notify_listener) {
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ auto ctx = new FunctionContext(
+ [this](int r) {
+ std::string instance_id;
+ if (get_leader_instance_id(&instance_id)) {
+ m_listener->update_leader_handler(instance_id);
+ }
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ m_timer_op_tracker.finish_op();
+ });
+ m_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::schedule_acquire_leader_lock(uint32_t delay_factor) {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+
+ schedule_timer_task("acquire leader lock",
+ delay_factor *
+ m_cct->_conf.get_val<uint64_t>("rbd_mirror_leader_max_missed_heartbeats"),
+ false, &LeaderWatcher<I>::acquire_leader_lock, false);
+}
+
+template <typename I>
+void LeaderWatcher<I>::acquire_leader_lock() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ ++m_acquire_attempts;
+ dout(10) << "acquire_attempts=" << m_acquire_attempts << dendl;
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_acquire_leader_lock>(this));
+ m_leader_lock->try_acquire_lock(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_acquire_leader_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ if (m_leader_lock->is_shutdown()) {
+ dout(10) << "canceling due to shutdown" << dendl;
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ if (r < 0) {
+ if (r == -EAGAIN) {
+ dout(10) << "already locked" << dendl;
+ } else {
+ derr << "error acquiring lock: " << cpp_strerror(r) << dendl;
+ }
+
+ get_locker();
+ return;
+ }
+
+ m_locker = {};
+ m_acquire_attempts = 0;
+
+ if (m_ret_val) {
+ dout(5) << "releasing due to error on notify" << dendl;
+ release_leader_lock();
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ notify_heartbeat();
+}
+
+template <typename I>
+void LeaderWatcher<I>::release_leader_lock() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_release_leader_lock>(this));
+
+ m_leader_lock->release_lock(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_release_leader_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error releasing lock: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ schedule_acquire_leader_lock(1);
+}
+
+template <typename I>
+void LeaderWatcher<I>::init_status_watcher() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_status_watcher == nullptr);
+
+ m_status_watcher = MirrorStatusWatcher<I>::create(m_ioctx, m_work_queue);
+
+ Context *ctx = create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_init_status_watcher>(this);
+
+ m_status_watcher->init(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_init_status_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error initializing mirror status watcher: " << cpp_strerror(r)
+ << cpp_strerror(r) << dendl;
+ } else {
+ schedule_acquire_leader_lock(0);
+ }
+
+ ceph_assert(m_on_finish != nullptr);
+ std::swap(on_finish, m_on_finish);
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void LeaderWatcher<I>::shut_down_status_watcher() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_status_watcher != nullptr);
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<LeaderWatcher<I>,
+ &LeaderWatcher<I>::handle_shut_down_status_watcher>(this));
+
+ m_status_watcher->shut_down(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_shut_down_status_watcher(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+ m_status_watcher->destroy();
+ m_status_watcher = nullptr;
+
+ if (r < 0) {
+ derr << "error shutting mirror status watcher down: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ unregister_watch();
+}
+
+template <typename I>
+void LeaderWatcher<I>::init_instances() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_instances == nullptr);
+
+ m_instances = Instances<I>::create(m_threads, m_ioctx, m_instance_id,
+ m_instances_listener);
+
+ Context *ctx = create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_init_instances>(this);
+
+ m_instances->init(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_init_instances(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ if (r < 0) {
+ Mutex::Locker locker(m_lock);
+ derr << "error initializing instances: " << cpp_strerror(r) << dendl;
+ m_instances->destroy();
+ m_instances = nullptr;
+
+ ceph_assert(m_on_finish != nullptr);
+ std::swap(m_on_finish, on_finish);
+ } else {
+ Mutex::Locker locker(m_lock);
+ notify_listener();
+ return;
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
+void LeaderWatcher<I>::shut_down_instances() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(m_instances != nullptr);
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<LeaderWatcher<I>,
+ &LeaderWatcher<I>::handle_shut_down_instances>(this));
+
+ m_instances->shut_down(ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_shut_down_instances(int r) {
+ dout(10) << "r=" << r << dendl;
+ ceph_assert(r == 0);
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+
+ m_instances->destroy();
+ m_instances = nullptr;
+
+ ceph_assert(m_on_finish != nullptr);
+ std::swap(m_on_finish, on_finish);
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void LeaderWatcher<I>::notify_listener() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_async_context_callback(
+ m_work_queue, create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_listener>(this));
+
+ if (is_leader(m_lock)) {
+ ctx = new FunctionContext(
+ [this, ctx](int r) {
+ m_listener->post_acquire_handler(ctx);
+ });
+ } else {
+ ctx = new FunctionContext(
+ [this, ctx](int r) {
+ m_listener->pre_release_handler(ctx);
+ });
+ }
+ m_work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_notify_listener(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ if (r < 0) {
+ derr << "error notifying listener: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+
+ if (is_leader(m_lock)) {
+ notify_lock_acquired();
+ } else {
+ shut_down_instances();
+ }
+}
+
+template <typename I>
+void LeaderWatcher<I>::notify_lock_acquired() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_lock_acquired>(this);
+
+ bufferlist bl;
+ encode(NotifyMessage{LockAcquiredPayload{}}, bl);
+
+ send_notify(bl, nullptr, ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_notify_lock_acquired(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ if (r < 0 && r != -ETIMEDOUT) {
+ derr << "error notifying leader lock acquired: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ }
+
+ ceph_assert(m_on_finish != nullptr);
+ std::swap(m_on_finish, on_finish);
+
+ if (m_ret_val == 0) {
+ // listener should be ready for instance add/remove events now
+ m_instances->unblock_listener();
+ }
+ }
+ on_finish->complete(0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::notify_lock_released() {
+ dout(10) << dendl;
+
+ ceph_assert(m_lock.is_locked());
+
+ Context *ctx = create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_lock_released>(this);
+
+ bufferlist bl;
+ encode(NotifyMessage{LockReleasedPayload{}}, bl);
+
+ send_notify(bl, nullptr, ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_notify_lock_released(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ if (r < 0 && r != -ETIMEDOUT) {
+ derr << "error notifying leader lock released: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ ceph_assert(m_on_finish != nullptr);
+ std::swap(m_on_finish, on_finish);
+ }
+ on_finish->complete(r);
+}
+
+template <typename I>
+void LeaderWatcher<I>::notify_heartbeat() {
+ dout(10) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_lock.is_locked());
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ if (!is_leader(m_lock)) {
+ dout(5) << "not leader, canceling" << dendl;
+ m_timer_op_tracker.finish_op();
+ return;
+ }
+
+ Context *ctx = create_context_callback<
+ LeaderWatcher<I>, &LeaderWatcher<I>::handle_notify_heartbeat>(this);
+
+ bufferlist bl;
+ encode(NotifyMessage{HeartbeatPayload{}}, bl);
+
+ m_heartbeat_response.acks.clear();
+ send_notify(bl, &m_heartbeat_response, ctx);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_notify_heartbeat(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_timer_op_tracker.empty());
+
+ m_timer_op_tracker.finish_op();
+ if (m_leader_lock->is_shutdown()) {
+ dout(10) << "canceling due to shutdown" << dendl;
+ return;
+ } else if (!is_leader(m_lock)) {
+ return;
+ }
+
+ if (r < 0 && r != -ETIMEDOUT) {
+ derr << "error notifying heartbeat: " << cpp_strerror(r)
+ << ", releasing leader" << dendl;
+ release_leader_lock();
+ return;
+ }
+
+ dout(10) << m_heartbeat_response.acks.size() << " acks received, "
+ << m_heartbeat_response.timeouts.size() << " timed out" << dendl;
+
+ std::vector<std::string> instance_ids;
+ for (auto &it: m_heartbeat_response.acks) {
+ uint64_t notifier_id = it.first.gid;
+ instance_ids.push_back(stringify(notifier_id));
+ }
+ if (!instance_ids.empty()) {
+ m_instances->acked(instance_ids);
+ }
+
+ schedule_timer_task("heartbeat", 1, true,
+ &LeaderWatcher<I>::notify_heartbeat, false);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_heartbeat(Context *on_notify_ack) {
+ dout(10) << dendl;
+
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ if (is_leader(m_lock)) {
+ dout(5) << "got another leader heartbeat, ignoring" << dendl;
+ } else {
+ cancel_timer_task();
+ m_acquire_attempts = 0;
+ schedule_acquire_leader_lock(1);
+ }
+ }
+
+ on_notify_ack->complete(0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_lock_acquired(Context *on_notify_ack) {
+ dout(10) << dendl;
+
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ if (is_leader(m_lock)) {
+ dout(5) << "got another leader lock_acquired, ignoring" << dendl;
+ } else {
+ cancel_timer_task();
+ schedule_get_locker(true, 0);
+ }
+ }
+
+ on_notify_ack->complete(0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_lock_released(Context *on_notify_ack) {
+ dout(10) << dendl;
+
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ if (is_leader(m_lock)) {
+ dout(5) << "got another leader lock_released, ignoring" << dendl;
+ } else {
+ cancel_timer_task();
+ schedule_get_locker(true, 0);
+ }
+ }
+
+ on_notify_ack->complete(0);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) {
+ dout(10) << "notify_id=" << notify_id << ", handle=" << handle << ", "
+ << "notifier_id=" << notifier_id << dendl;
+
+ Context *ctx = new C_NotifyAck(this, notify_id, handle);
+
+ if (notifier_id == m_notifier_id) {
+ dout(10) << "our own notification, ignoring" << dendl;
+ ctx->complete(0);
+ return;
+ }
+
+ NotifyMessage notify_message;
+ try {
+ auto iter = bl.cbegin();
+ decode(notify_message, iter);
+ } catch (const buffer::error &err) {
+ derr << "error decoding image notification: " << err.what() << dendl;
+ ctx->complete(0);
+ return;
+ }
+
+ apply_visitor(HandlePayloadVisitor(this, ctx), notify_message.payload);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_rewatch_complete(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ if (r == -EBLACKLISTED) {
+ dout(1) << "blacklisted detected" << dendl;
+ m_blacklisted = true;
+ return;
+ }
+
+ m_leader_lock->reacquire_lock(nullptr);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const HeartbeatPayload &payload,
+ Context *on_notify_ack) {
+ dout(10) << "heartbeat" << dendl;
+
+ handle_heartbeat(on_notify_ack);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const LockAcquiredPayload &payload,
+ Context *on_notify_ack) {
+ dout(10) << "lock_acquired" << dendl;
+
+ handle_lock_acquired(on_notify_ack);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const LockReleasedPayload &payload,
+ Context *on_notify_ack) {
+ dout(10) << "lock_released" << dendl;
+
+ handle_lock_released(on_notify_ack);
+}
+
+template <typename I>
+void LeaderWatcher<I>::handle_payload(const UnknownPayload &payload,
+ Context *on_notify_ack) {
+ dout(10) << "unknown" << dendl;
+
+ on_notify_ack->complete(0);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::LeaderWatcher<librbd::ImageCtx>;
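The watcher above delegates policy decisions to a leader_watcher::Listener. A no-op listener sketch follows; the exact pure-virtual signatures live in tools/rbd_mirror/leader_watcher/Types.h, which is not part of this hunk, so the signatures below are inferred from the calls made above and should be treated as assumptions:

  #include <string>
  #include <vector>
  #include "include/Context.h"
  #include "tools/rbd_mirror/leader_watcher/Types.h"

  struct NoopListener : public rbd::mirror::leader_watcher::Listener {
    void post_acquire_handler(Context *on_finish) override {
      on_finish->complete(0);        // accept the leader role unconditionally
    }
    void pre_release_handler(Context *on_finish) override {
      on_finish->complete(0);        // nothing to tear down before release
    }
    void update_leader_handler(const std::string &leader_instance_id) override {
    }
    void handle_instances_added(const std::vector<std::string> &ids) override {
    }
    void handle_instances_removed(const std::vector<std::string> &ids) override {
    }
  };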
diff --git a/src/tools/rbd_mirror/LeaderWatcher.h b/src/tools/rbd_mirror/LeaderWatcher.h
new file mode 100644
index 00000000..01ee0565
--- /dev/null
+++ b/src/tools/rbd_mirror/LeaderWatcher.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_LEADER_WATCHER_H
+#define CEPH_RBD_MIRROR_LEADER_WATCHER_H
+
+#include <list>
+#include <memory>
+#include <string>
+
+#include "common/AsyncOpTracker.h"
+#include "librbd/ManagedLock.h"
+#include "librbd/Watcher.h"
+#include "librbd/managed_lock/Types.h"
+#include "librbd/watcher/Types.h"
+#include "Instances.h"
+#include "MirrorStatusWatcher.h"
+#include "tools/rbd_mirror/instances/Types.h"
+#include "tools/rbd_mirror/leader_watcher/Types.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class LeaderWatcher : protected librbd::Watcher {
+ using librbd::Watcher::unregister_watch; // Silence overloaded virtual warning
+public:
+ static LeaderWatcher* create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &io_ctx,
+ leader_watcher::Listener *listener) {
+ return new LeaderWatcher(threads, io_ctx, listener);
+ }
+
+ LeaderWatcher(Threads<ImageCtxT> *threads, librados::IoCtx &io_ctx,
+ leader_watcher::Listener *listener);
+ ~LeaderWatcher() override;
+
+ int init();
+ void shut_down();
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+ bool is_blacklisted() const;
+ bool is_leader() const;
+ bool is_releasing_leader() const;
+ bool get_leader_instance_id(std::string *instance_id) const;
+ void release_leader();
+ void list_instances(std::vector<std::string> *instance_ids);
+
+ std::string get_instance_id();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <uninitialized> <------------------------------ WAIT_FOR_TASKS
+ * | (init) ^ ^
+ * v * |
+ * CREATE_OBJECT * * * * * (error) UNREGISTER_WATCH
+ * | * ^
+ * v * |
+ * REGISTER_WATCH * * * * * SHUT_DOWN_STATUS_WATCHER
+ * | * ^
+ * v * |
+ * INIT_STATUS_WATCHER * * SHUT_DOWN_LEADER_LOCK
+ * | |
+ * | (no leader heartbeat and acquire failed) |
+ * | BREAK_LOCK <-------------------------------------\ |
+ * | | (no leader heartbeat) | | (shut down)
+ * | | /----------------------------------------\ | |
+ * | | | (lock_released received) | |
+ * | | | /-------------------------------------\ | |
+ * | | | | (lock_acquired or | | |
+ * | | | | heartbeat received) | | |
+ * | | | | (ENOENT) /-----------\ | | |
+ * | | | | * * * * * * * * * * | | | | |
+ * v v v v v (error) * v | | | |
+ * ACQUIRE_LEADER_LOCK * * * * *> GET_LOCKER ---> <secondary>
+ * | * ^
+ * ....|...................*.................... .....|.....................
+ * . v * . . | post_release .
+ * .INIT_INSTANCES * * * * * . .NOTIFY_LOCK_RELEASED .
+ * . | . .....^.....................
+ * . v . |
+ * .NOTIFY_LISTENER . RELEASE_LEADER_LOCK
+ * . | . ^
+ * . v . .....|.....................
+ * .NOTIFY_LOCK_ACQUIRED . . | .
+ * . | post_acquire . .SHUT_DOWN_INSTANCES .
+ * ....|........................................ . ^ .
+ * v . | .
+ * <leader> -----------------------------------> .NOTIFY_LISTENER .
+ * (shut_down, release_leader, . pre_release .
+ * notify error) ...........................
+ * @endverbatim
+ */
+
+ struct InstancesListener : public instances::Listener {
+ LeaderWatcher* leader_watcher;
+
+ InstancesListener(LeaderWatcher* leader_watcher)
+ : leader_watcher(leader_watcher) {
+ }
+
+ void handle_added(const InstanceIds& instance_ids) override {
+ leader_watcher->m_listener->handle_instances_added(instance_ids);
+ }
+
+ void handle_removed(const InstanceIds& instance_ids) override {
+ leader_watcher->m_listener->handle_instances_removed(instance_ids);
+ }
+ };
+
+ class LeaderLock : public librbd::ManagedLock<ImageCtxT> {
+ public:
+ typedef librbd::ManagedLock<ImageCtxT> Parent;
+
+ LeaderLock(librados::IoCtx& ioctx, ContextWQ *work_queue,
+ const std::string& oid, LeaderWatcher *watcher,
+ bool blacklist_on_break_lock,
+ uint32_t blacklist_expire_seconds)
+ : Parent(ioctx, work_queue, oid, watcher, librbd::managed_lock::EXCLUSIVE,
+ blacklist_on_break_lock, blacklist_expire_seconds),
+ watcher(watcher) {
+ }
+
+ bool is_leader() const {
+ Mutex::Locker locker(Parent::m_lock);
+ return Parent::is_state_post_acquiring() || Parent::is_state_locked();
+ }
+
+ bool is_releasing_leader() const {
+ Mutex::Locker locker(Parent::m_lock);
+ return Parent::is_state_pre_releasing();
+ }
+
+ protected:
+ void post_acquire_lock_handler(int r, Context *on_finish) {
+ if (r == 0) {
+ // lock is owned at this point
+ Mutex::Locker locker(Parent::m_lock);
+ Parent::set_state_post_acquiring();
+ }
+ watcher->handle_post_acquire_leader_lock(r, on_finish);
+ }
+ void pre_release_lock_handler(bool shutting_down,
+ Context *on_finish) {
+ watcher->handle_pre_release_leader_lock(on_finish);
+ }
+ void post_release_lock_handler(bool shutting_down, int r,
+ Context *on_finish) {
+ watcher->handle_post_release_leader_lock(r, on_finish);
+ }
+ private:
+ LeaderWatcher *watcher;
+ };
+
+ struct HandlePayloadVisitor : public boost::static_visitor<void> {
+ LeaderWatcher *leader_watcher;
+ Context *on_notify_ack;
+
+ HandlePayloadVisitor(LeaderWatcher *leader_watcher, Context *on_notify_ack)
+ : leader_watcher(leader_watcher), on_notify_ack(on_notify_ack) {
+ }
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ leader_watcher->handle_payload(payload, on_notify_ack);
+ }
+ };
+
+ struct C_GetLocker : public Context {
+ LeaderWatcher *leader_watcher;
+ librbd::managed_lock::Locker locker;
+
+ C_GetLocker(LeaderWatcher *leader_watcher)
+ : leader_watcher(leader_watcher) {
+ }
+
+ void finish(int r) override {
+ leader_watcher->handle_get_locker(r, locker);
+ }
+ };
+
+ typedef void (LeaderWatcher<ImageCtxT>::*TimerCallback)();
+
+ struct C_TimerGate : public Context {
+ LeaderWatcher *leader_watcher;
+
+ bool leader = false;
+ TimerCallback timer_callback = nullptr;
+
+ C_TimerGate(LeaderWatcher *leader_watcher)
+ : leader_watcher(leader_watcher) {
+ }
+
+ void finish(int r) override {
+ leader_watcher->m_timer_gate = nullptr;
+ leader_watcher->execute_timer_task(leader, timer_callback);
+ }
+ };
+
+ Threads<ImageCtxT> *m_threads;
+ leader_watcher::Listener *m_listener;
+
+ InstancesListener m_instances_listener;
+ mutable Mutex m_lock;
+ uint64_t m_notifier_id;
+ std::string m_instance_id;
+ LeaderLock *m_leader_lock;
+ Context *m_on_finish = nullptr;
+ Context *m_on_shut_down_finish = nullptr;
+ uint64_t m_acquire_attempts = 0;
+ int m_ret_val = 0;
+ MirrorStatusWatcher<ImageCtxT> *m_status_watcher = nullptr;
+ Instances<ImageCtxT> *m_instances = nullptr;
+ librbd::managed_lock::Locker m_locker;
+
+ bool m_blacklisted = false;
+
+ AsyncOpTracker m_timer_op_tracker;
+ Context *m_timer_task = nullptr;
+ C_TimerGate *m_timer_gate = nullptr;
+
+ librbd::watcher::NotifyResponse m_heartbeat_response;
+
+ bool is_leader(Mutex &m_lock) const;
+ bool is_releasing_leader(Mutex &m_lock) const;
+
+ void cancel_timer_task();
+ void schedule_timer_task(const std::string &name,
+ int delay_factor, bool leader,
+ TimerCallback callback, bool shutting_down);
+ void execute_timer_task(bool leader, TimerCallback timer_callback);
+
+ void create_leader_object();
+ void handle_create_leader_object(int r);
+
+ void register_watch();
+ void handle_register_watch(int r);
+
+ void shut_down_leader_lock();
+ void handle_shut_down_leader_lock(int r);
+
+ void unregister_watch();
+ void handle_unregister_watch(int r);
+
+ void wait_for_tasks();
+ void handle_wait_for_tasks();
+
+ void break_leader_lock();
+ void handle_break_leader_lock(int r);
+
+ void schedule_get_locker(bool reset_leader, uint32_t delay_factor);
+ void get_locker();
+ void handle_get_locker(int r, librbd::managed_lock::Locker& locker);
+
+ void schedule_acquire_leader_lock(uint32_t delay_factor);
+ void acquire_leader_lock();
+ void handle_acquire_leader_lock(int r);
+
+ void release_leader_lock();
+ void handle_release_leader_lock(int r);
+
+ void init_status_watcher();
+ void handle_init_status_watcher(int r);
+
+ void shut_down_status_watcher();
+ void handle_shut_down_status_watcher(int r);
+
+ void init_instances();
+ void handle_init_instances(int r);
+
+ void shut_down_instances();
+ void handle_shut_down_instances(int r);
+
+ void notify_listener();
+ void handle_notify_listener(int r);
+
+ void notify_lock_acquired();
+ void handle_notify_lock_acquired(int r);
+
+ void notify_lock_released();
+ void handle_notify_lock_released(int r);
+
+ void notify_heartbeat();
+ void handle_notify_heartbeat(int r);
+
+ void handle_post_acquire_leader_lock(int r, Context *on_finish);
+ void handle_pre_release_leader_lock(Context *on_finish);
+ void handle_post_release_leader_lock(int r, Context *on_finish);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+
+ void handle_rewatch_complete(int r) override;
+
+ void handle_heartbeat(Context *on_ack);
+ void handle_lock_acquired(Context *on_ack);
+ void handle_lock_released(Context *on_ack);
+
+ void handle_payload(const leader_watcher::HeartbeatPayload &payload,
+ Context *on_notify_ack);
+ void handle_payload(const leader_watcher::LockAcquiredPayload &payload,
+ Context *on_notify_ack);
+ void handle_payload(const leader_watcher::LockReleasedPayload &payload,
+ Context *on_notify_ack);
+ void handle_payload(const leader_watcher::UnknownPayload &payload,
+ Context *on_notify_ack);
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_LEADER_WATCHER_H
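A minimal usage sketch for the public interface above, assuming a Threads<> object and a listener implementation (such as the no-op sketch after LeaderWatcher.cc) are available; demo_leader() itself is hypothetical:

  #include "tools/rbd_mirror/LeaderWatcher.h"

  void demo_leader(rbd::mirror::Threads<librbd::ImageCtx> *threads,
                   librados::IoCtx &io_ctx,
                   rbd::mirror::leader_watcher::Listener *listener) {
    auto watcher = rbd::mirror::LeaderWatcher<>::create(threads, io_ctx,
                                                        listener);
    if (watcher->init() == 0) {          // synchronous wrapper shown above
      std::string leader_id;
      if (watcher->get_leader_instance_id(&leader_id)) {
        // our own id if we hold the lock, otherwise the current locker's id
      }
      if (watcher->is_leader()) {
        watcher->release_leader();       // hand leadership to another daemon
      }
      watcher->shut_down();              // also synchronous
    }
    delete watcher;
  }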
diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc
new file mode 100644
index 00000000..ef18a0b6
--- /dev/null
+++ b/src/tools/rbd_mirror/Mirror.cc
@@ -0,0 +1,448 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <signal.h>
+
+#include <boost/range/adaptor/map.hpp>
+
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "Mirror.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::Mirror: " << this << " " \
+ << __func__ << ": "
+
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using librados::Rados;
+using librados::IoCtx;
+using librbd::mirror_peer_t;
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+class MirrorAdminSocketCommand {
+public:
+ virtual ~MirrorAdminSocketCommand() {}
+ virtual bool call(Formatter *f, stringstream *ss) = 0;
+};
+
+class StatusCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StatusCommand(Mirror *mirror) : mirror(mirror) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ mirror->print_status(f, ss);
+ return true;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class StartCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StartCommand(Mirror *mirror) : mirror(mirror) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ mirror->start();
+ return true;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class StopCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StopCommand(Mirror *mirror) : mirror(mirror) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ mirror->stop();
+ return true;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class RestartCommand : public MirrorAdminSocketCommand {
+public:
+ explicit RestartCommand(Mirror *mirror) : mirror(mirror) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ mirror->restart();
+ return true;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class FlushCommand : public MirrorAdminSocketCommand {
+public:
+ explicit FlushCommand(Mirror *mirror) : mirror(mirror) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ mirror->flush();
+ return true;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+class LeaderReleaseCommand : public MirrorAdminSocketCommand {
+public:
+ explicit LeaderReleaseCommand(Mirror *mirror) : mirror(mirror) {}
+
+ bool call(Formatter *f, stringstream *ss) override {
+ mirror->release_leader();
+ return true;
+ }
+
+private:
+ Mirror *mirror;
+};
+
+} // anonymous namespace
+
+class MirrorAdminSocketHook : public AdminSocketHook {
+public:
+ MirrorAdminSocketHook(CephContext *cct, Mirror *mirror) :
+ admin_socket(cct->get_admin_socket()) {
+ std::string command;
+ int r;
+
+ command = "rbd mirror status";
+ r = admin_socket->register_command(command, command, this,
+ "get status for rbd mirror");
+ if (r == 0) {
+ commands[command] = new StatusCommand(mirror);
+ }
+
+ command = "rbd mirror start";
+ r = admin_socket->register_command(command, command, this,
+ "start rbd mirror");
+ if (r == 0) {
+ commands[command] = new StartCommand(mirror);
+ }
+
+ command = "rbd mirror stop";
+ r = admin_socket->register_command(command, command, this,
+ "stop rbd mirror");
+ if (r == 0) {
+ commands[command] = new StopCommand(mirror);
+ }
+
+ command = "rbd mirror restart";
+ r = admin_socket->register_command(command, command, this,
+ "restart rbd mirror");
+ if (r == 0) {
+ commands[command] = new RestartCommand(mirror);
+ }
+
+ command = "rbd mirror flush";
+ r = admin_socket->register_command(command, command, this,
+ "flush rbd mirror");
+ if (r == 0) {
+ commands[command] = new FlushCommand(mirror);
+ }
+
+ command = "rbd mirror leader release";
+ r = admin_socket->register_command(command, command, this,
+ "release rbd mirror leader");
+ if (r == 0) {
+ commands[command] = new LeaderReleaseCommand(mirror);
+ }
+ }
+
+ ~MirrorAdminSocketHook() override {
+ for (Commands::const_iterator i = commands.begin(); i != commands.end();
+ ++i) {
+ (void)admin_socket->unregister_command(i->first);
+ delete i->second;
+ }
+ }
+
+ bool call(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, bufferlist& out) override {
+ Commands::const_iterator i = commands.find(command);
+ ceph_assert(i != commands.end());
+ Formatter *f = Formatter::create(format);
+ stringstream ss;
+ bool r = i->second->call(f, &ss);
+ delete f;
+ out.append(ss);
+ return r;
+ }
+
+private:
+ typedef std::map<std::string, MirrorAdminSocketCommand*, std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+Mirror::Mirror(CephContext *cct, const std::vector<const char*> &args) :
+ m_cct(cct),
+ m_args(args),
+ m_lock("rbd::mirror::Mirror"),
+ m_local(new librados::Rados()),
+ m_asok_hook(new MirrorAdminSocketHook(cct, this))
+{
+ m_threads =
+ &(cct->lookup_or_create_singleton_object<Threads<librbd::ImageCtx>>(
+ "rbd_mirror::threads", false, cct));
+ m_service_daemon.reset(new ServiceDaemon<>(m_cct, m_local, m_threads));
+}
+
+Mirror::~Mirror()
+{
+ delete m_asok_hook;
+}
+
+void Mirror::handle_signal(int signum)
+{
+ dout(20) << signum << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ switch (signum) {
+ case SIGHUP:
+ for (auto &it : m_pool_replayers) {
+ it.second->reopen_logs();
+ }
+ g_ceph_context->reopen_logs();
+ break;
+
+ case SIGINT:
+ case SIGTERM:
+ m_stopping = true;
+ m_cond.Signal();
+ break;
+
+ default:
+ ceph_abort_msgf("unexpected signal %d", signum);
+ }
+}
+
+int Mirror::init()
+{
+ int r = m_local->init_with_context(m_cct);
+ if (r < 0) {
+ derr << "could not initialize rados handle" << dendl;
+ return r;
+ }
+
+ r = m_local->connect();
+ if (r < 0) {
+ derr << "error connecting to local cluster" << dendl;
+ return r;
+ }
+
+ r = m_service_daemon->init();
+ if (r < 0) {
+ derr << "error registering service daemon: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ m_local_cluster_watcher.reset(new ClusterWatcher(m_local, m_lock,
+ m_service_daemon.get()));
+ return r;
+}
+
+void Mirror::run()
+{
+ dout(20) << "enter" << dendl;
+ while (!m_stopping) {
+ m_local_cluster_watcher->refresh_pools();
+ Mutex::Locker l(m_lock);
+ if (!m_manual_stop) {
+ update_pool_replayers(m_local_cluster_watcher->get_pool_peers());
+ }
+ m_cond.WaitInterval(
+ m_lock,
+ utime_t(m_cct->_conf.get_val<uint64_t>("rbd_mirror_pool_replayers_refresh_interval"), 0));
+ }
+
+ // stop all pool replayers in parallel
+ Mutex::Locker locker(m_lock);
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->stop(false);
+ }
+ dout(20) << "return" << dendl;
+}
+
+void Mirror::print_status(Formatter *f, stringstream *ss)
+{
+ dout(20) << "enter" << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ if (f) {
+ f->open_object_section("mirror_status");
+ f->open_array_section("pool_replayers");
+ }
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->print_status(f, ss);
+ }
+
+ if (f) {
+ f->close_section();
+ f->close_section();
+ f->flush(*ss);
+ }
+}
+
+void Mirror::start()
+{
+ dout(20) << "enter" << dendl;
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = false;
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->start();
+ }
+}
+
+void Mirror::stop()
+{
+ dout(20) << "enter" << dendl;
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = true;
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->stop(true);
+ }
+}
+
+void Mirror::restart()
+{
+ dout(20) << "enter" << dendl;
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = false;
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->restart();
+ }
+}
+
+void Mirror::flush()
+{
+ dout(20) << "enter" << dendl;
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping || m_manual_stop) {
+ return;
+ }
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->flush();
+ }
+}
+
+void Mirror::release_leader()
+{
+ dout(20) << "enter" << dendl;
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ for (auto &pool_replayer : m_pool_replayers) {
+ pool_replayer.second->release_leader();
+ }
+}
+
+void Mirror::update_pool_replayers(const PoolPeers &pool_peers)
+{
+ dout(20) << "enter" << dendl;
+ ceph_assert(m_lock.is_locked());
+
+ // remove stale pool replayers before creating new pool replayers
+ for (auto it = m_pool_replayers.begin(); it != m_pool_replayers.end();) {
+ auto &peer = it->first.second;
+ auto pool_peer_it = pool_peers.find(it->first.first);
+ if (pool_peer_it == pool_peers.end() ||
+ pool_peer_it->second.find(peer) == pool_peer_it->second.end()) {
+ dout(20) << "removing pool replayer for " << peer << dendl;
+ // TODO: make async
+ it->second->shut_down();
+ it = m_pool_replayers.erase(it);
+ } else {
+ ++it;
+ }
+ }
+
+ for (auto &kv : pool_peers) {
+ for (auto &peer : kv.second) {
+ PoolPeer pool_peer(kv.first, peer);
+
+ auto pool_replayers_it = m_pool_replayers.find(pool_peer);
+ if (pool_replayers_it != m_pool_replayers.end()) {
+ auto& pool_replayer = pool_replayers_it->second;
+ if (pool_replayer->is_blacklisted()) {
+ derr << "restarting blacklisted pool replayer for " << peer << dendl;
+ // TODO: make async
+ pool_replayer->shut_down();
+ pool_replayer->init();
+ } else if (!pool_replayer->is_running()) {
+ derr << "restarting failed pool replayer for " << peer << dendl;
+ // TODO: make async
+ pool_replayer->shut_down();
+ pool_replayer->init();
+ }
+ } else {
+ dout(20) << "starting pool replayer for " << peer << dendl;
+ unique_ptr<PoolReplayer<>> pool_replayer(new PoolReplayer<>(
+ m_threads, m_service_daemon.get(), kv.first, peer, m_args));
+
+ // TODO: make async
+ pool_replayer->init();
+ m_pool_replayers.emplace(pool_peer, std::move(pool_replayer));
+ }
+ }
+
+ // TODO currently only support a single peer
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
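The class above is driven by the rbd-mirror daemon entry point (main.cc, not part of this hunk). The sketch below only illustrates the calling sequence implied by the public interface, assuming a CephContext has already been created through the usual global_init() path; run_mirror() and the global pointer are hypothetical names:

  #include <vector>
  #include "tools/rbd_mirror/Mirror.h"

  static rbd::mirror::Mirror *mirror = nullptr;

  static void handle_signal(int signum) {
    if (mirror != nullptr) {
      mirror->handle_signal(signum);   // SIGINT/SIGTERM stop the run() loop
    }
  }

  int run_mirror(CephContext *cct, const std::vector<const char *> &args) {
    mirror = new rbd::mirror::Mirror(cct, args);
    int r = mirror->init();
    if (r == 0) {
      // install handle_signal() for SIGHUP/SIGINT/SIGTERM before this point
      mirror->run();                   // blocks until a stop signal arrives
    }
    delete mirror;
    mirror = nullptr;
    return r;
  }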
diff --git a/src/tools/rbd_mirror/Mirror.h b/src/tools/rbd_mirror/Mirror.h
new file mode 100644
index 00000000..153c0bc5
--- /dev/null
+++ b/src/tools/rbd_mirror/Mirror.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_H
+#define CEPH_RBD_MIRROR_H
+
+#include "common/ceph_context.h"
+#include "common/Mutex.h"
+#include "include/rados/librados.hpp"
+#include "ClusterWatcher.h"
+#include "PoolReplayer.h"
+#include "tools/rbd_mirror/Types.h"
+
+#include <set>
+#include <map>
+#include <memory>
+#include <atomic>
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct ServiceDaemon;
+template <typename> struct Threads;
+class MirrorAdminSocketHook;
+
+/**
+ * Contains the main loop and overall state for rbd-mirror.
+ *
+ * Sets up mirroring, and coordinates between noticing config
+ * changes and applying them.
+ */
+class Mirror {
+public:
+ Mirror(CephContext *cct, const std::vector<const char*> &args);
+ Mirror(const Mirror&) = delete;
+ Mirror& operator=(const Mirror&) = delete;
+ ~Mirror();
+
+ int init();
+ void run();
+ void handle_signal(int signum);
+
+ void print_status(Formatter *f, stringstream *ss);
+ void start();
+ void stop();
+ void restart();
+ void flush();
+ void release_leader();
+
+private:
+ typedef ClusterWatcher::PoolPeers PoolPeers;
+ typedef std::pair<int64_t, PeerSpec> PoolPeer;
+
+ void update_pool_replayers(const PoolPeers &pool_peers);
+
+ CephContext *m_cct;
+ std::vector<const char*> m_args;
+ Threads<librbd::ImageCtx> *m_threads = nullptr;
+ Mutex m_lock;
+ Cond m_cond;
+ RadosRef m_local;
+ std::unique_ptr<ServiceDaemon<librbd::ImageCtx>> m_service_daemon;
+
+ // monitor local cluster for config changes in peers
+ std::unique_ptr<ClusterWatcher> m_local_cluster_watcher;
+ std::map<PoolPeer, std::unique_ptr<PoolReplayer<>>> m_pool_replayers;
+ std::atomic<bool> m_stopping = { false };
+ bool m_manual_stop = false;
+ MirrorAdminSocketHook *m_asok_hook;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_H
diff --git a/src/tools/rbd_mirror/MirrorStatusWatcher.cc b/src/tools/rbd_mirror/MirrorStatusWatcher.cc
new file mode 100644
index 00000000..b935bc5c
--- /dev/null
+++ b/src/tools/rbd_mirror/MirrorStatusWatcher.cc
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MirrorStatusWatcher.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::MirrorStatusWatcher: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+MirrorStatusWatcher<I>::MirrorStatusWatcher(librados::IoCtx &io_ctx,
+ ContextWQ *work_queue)
+ : Watcher(io_ctx, work_queue, RBD_MIRRORING) {
+}
+
+template <typename I>
+MirrorStatusWatcher<I>::~MirrorStatusWatcher() {
+}
+
+template <typename I>
+void MirrorStatusWatcher<I>::init(Context *on_finish) {
+ dout(20) << dendl;
+
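+  // Purge any stale "down" image status entries left behind by a
+  // previous rbd-mirror instance, then register the status watch once
+  // the removal completes.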
+ on_finish = new FunctionContext(
+ [this, on_finish] (int r) {
+ if (r < 0) {
+ derr << "error removing down statuses: " << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+ register_watch(on_finish);
+ });
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_status_remove_down(&op);
+ librados::AioCompletion *aio_comp = create_rados_callback(on_finish);
+
+ int r = m_ioctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void MirrorStatusWatcher<I>::shut_down(Context *on_finish) {
+ dout(20) << dendl;
+
+ unregister_watch(on_finish);
+}
+
+template <typename I>
+void MirrorStatusWatcher<I>::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id,
+ bufferlist &bl) {
+ dout(20) << dendl;
+
+ bufferlist out;
+ acknowledge_notify(notify_id, handle, out);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::MirrorStatusWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/MirrorStatusWatcher.h b/src/tools/rbd_mirror/MirrorStatusWatcher.h
new file mode 100644
index 00000000..155f8cc8
--- /dev/null
+++ b/src/tools/rbd_mirror/MirrorStatusWatcher.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
+#define CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
+
+#include "librbd/Watcher.h"
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class MirrorStatusWatcher : protected librbd::Watcher {
+public:
+ static MirrorStatusWatcher *create(librados::IoCtx &io_ctx,
+ ContextWQ *work_queue) {
+ return new MirrorStatusWatcher(io_ctx, work_queue);
+ }
+ void destroy() {
+ delete this;
+ }
+
+ MirrorStatusWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue);
+ ~MirrorStatusWatcher() override;
+
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+protected:
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist &bl) override;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_MIRROR_STATUS_WATCHER_H
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
new file mode 100644
index 00000000..35d32eb5
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolReplayer.cc
@@ -0,0 +1,1133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PoolReplayer.h"
+#include <boost/bind.hpp>
+#include "common/Formatter.h"
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/code_environment.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/stringify.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "global/global_context.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/Watcher.h"
+#include "librbd/api/Config.h"
+#include "librbd/api/Mirror.h"
+#include "ImageMap.h"
+#include "InstanceReplayer.h"
+#include "InstanceWatcher.h"
+#include "LeaderWatcher.h"
+#include "ServiceDaemon.h"
+#include "Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PoolReplayer: " \
+ << this << " " << __func__ << ": "
+
+using std::chrono::seconds;
+using std::map;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using librbd::cls_client::dir_get_name;
+using librbd::util::create_async_context_callback;
+
+namespace rbd {
+namespace mirror {
+
+using ::operator<<;
+
+namespace {
+
+const std::string SERVICE_DAEMON_INSTANCE_ID_KEY("instance_id");
+const std::string SERVICE_DAEMON_LEADER_KEY("leader");
+const std::string SERVICE_DAEMON_LOCAL_COUNT_KEY("image_local_count");
+const std::string SERVICE_DAEMON_REMOTE_COUNT_KEY("image_remote_count");
+
+const std::vector<std::string> UNIQUE_PEER_CONFIG_KEYS {
+ {"monmap", "mon_host", "mon_dns_srv_name", "key", "keyfile", "keyring"}};
+
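+// Per-pool admin socket commands: each PoolReplayer registers
+// "rbd mirror status/start/stop/restart/flush/leader release <pool> <peer
+// cluster>" commands through the hook below so a single pool replayer can
+// be inspected or controlled at runtime, e.g. (socket path illustrative):
+//   ceph daemon /var/run/ceph/ceph-client.rbd-mirror.a.asok \
+//     "rbd mirror status <pool name> <peer cluster name>"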
+template <typename I>
+class PoolReplayerAdminSocketCommand {
+public:
+ PoolReplayerAdminSocketCommand(PoolReplayer<I> *pool_replayer)
+ : pool_replayer(pool_replayer) {
+ }
+ virtual ~PoolReplayerAdminSocketCommand() {}
+ virtual bool call(Formatter *f, stringstream *ss) = 0;
+protected:
+ PoolReplayer<I> *pool_replayer;
+};
+
+template <typename I>
+class StatusCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+ explicit StatusCommand(PoolReplayer<I> *pool_replayer)
+ : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->pool_replayer->print_status(f, ss);
+ return true;
+ }
+};
+
+template <typename I>
+class StartCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+ explicit StartCommand(PoolReplayer<I> *pool_replayer)
+ : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->pool_replayer->start();
+ return true;
+ }
+};
+
+template <typename I>
+class StopCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+ explicit StopCommand(PoolReplayer<I> *pool_replayer)
+ : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->pool_replayer->stop(true);
+ return true;
+ }
+};
+
+template <typename I>
+class RestartCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+ explicit RestartCommand(PoolReplayer<I> *pool_replayer)
+ : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->pool_replayer->restart();
+ return true;
+ }
+};
+
+template <typename I>
+class FlushCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+ explicit FlushCommand(PoolReplayer<I> *pool_replayer)
+ : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->pool_replayer->flush();
+ return true;
+ }
+};
+
+template <typename I>
+class LeaderReleaseCommand : public PoolReplayerAdminSocketCommand<I> {
+public:
+ explicit LeaderReleaseCommand(PoolReplayer<I> *pool_replayer)
+ : PoolReplayerAdminSocketCommand<I>(pool_replayer) {
+ }
+
+ bool call(Formatter *f, stringstream *ss) override {
+ this->pool_replayer->release_leader();
+ return true;
+ }
+};
+
+template <typename I>
+class PoolReplayerAdminSocketHook : public AdminSocketHook {
+public:
+ PoolReplayerAdminSocketHook(CephContext *cct, const std::string &name,
+ PoolReplayer<I> *pool_replayer)
+ : admin_socket(cct->get_admin_socket()) {
+ std::string command;
+ int r;
+
+ command = "rbd mirror status " + name;
+ r = admin_socket->register_command(command, command, this,
+ "get status for rbd mirror " + name);
+ if (r == 0) {
+ commands[command] = new StatusCommand<I>(pool_replayer);
+ }
+
+ command = "rbd mirror start " + name;
+ r = admin_socket->register_command(command, command, this,
+ "start rbd mirror " + name);
+ if (r == 0) {
+ commands[command] = new StartCommand<I>(pool_replayer);
+ }
+
+ command = "rbd mirror stop " + name;
+ r = admin_socket->register_command(command, command, this,
+ "stop rbd mirror " + name);
+ if (r == 0) {
+ commands[command] = new StopCommand<I>(pool_replayer);
+ }
+
+ command = "rbd mirror restart " + name;
+ r = admin_socket->register_command(command, command, this,
+ "restart rbd mirror " + name);
+ if (r == 0) {
+ commands[command] = new RestartCommand<I>(pool_replayer);
+ }
+
+ command = "rbd mirror flush " + name;
+ r = admin_socket->register_command(command, command, this,
+ "flush rbd mirror " + name);
+ if (r == 0) {
+ commands[command] = new FlushCommand<I>(pool_replayer);
+ }
+
+ command = "rbd mirror leader release " + name;
+ r = admin_socket->register_command(command, command, this,
+ "release rbd mirror leader " + name);
+ if (r == 0) {
+ commands[command] = new LeaderReleaseCommand<I>(pool_replayer);
+ }
+ }
+
+ ~PoolReplayerAdminSocketHook() override {
+ for (auto i = commands.begin(); i != commands.end(); ++i) {
+ (void)admin_socket->unregister_command(i->first);
+ delete i->second;
+ }
+ }
+
+ bool call(std::string_view command, const cmdmap_t& cmdmap,
+ std::string_view format, bufferlist& out) override {
+ auto i = commands.find(command);
+ ceph_assert(i != commands.end());
+ Formatter *f = Formatter::create(format);
+ stringstream ss;
+ bool r = i->second->call(f, &ss);
+ delete f;
+ out.append(ss);
+ return r;
+ }
+
+private:
+ typedef std::map<std::string, PoolReplayerAdminSocketCommand<I>*,
+ std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+} // anonymous namespace
+
+template <typename I>
+PoolReplayer<I>::PoolReplayer(Threads<I> *threads,
+ ServiceDaemon<I>* service_daemon,
+ int64_t local_pool_id, const PeerSpec &peer,
+ const std::vector<const char*> &args) :
+ m_threads(threads),
+ m_service_daemon(service_daemon),
+ m_local_pool_id(local_pool_id),
+ m_peer(peer),
+ m_args(args),
+ m_lock(stringify("rbd::mirror::PoolReplayer ") + stringify(peer)),
+ m_local_pool_watcher_listener(this, true),
+ m_remote_pool_watcher_listener(this, false),
+ m_image_map_listener(this),
+ m_pool_replayer_thread(this),
+ m_leader_listener(this)
+{
+}
+
+template <typename I>
+PoolReplayer<I>::~PoolReplayer()
+{
+ delete m_asok_hook;
+ shut_down();
+}
+
+template <typename I>
+bool PoolReplayer<I>::is_blacklisted() const {
+ Mutex::Locker locker(m_lock);
+ return m_blacklisted;
+}
+
+template <typename I>
+bool PoolReplayer<I>::is_leader() const {
+ Mutex::Locker locker(m_lock);
+ return m_leader_watcher && m_leader_watcher->is_leader();
+}
+
+template <typename I>
+bool PoolReplayer<I>::is_running() const {
+ return m_pool_replayer_thread.is_started();
+}
+
+template <typename I>
+void PoolReplayer<I>::init()
+{
+ Mutex::Locker l(m_lock);
+
+ ceph_assert(!m_pool_replayer_thread.is_started());
+
+ // reset state
+ m_stopping = false;
+ m_blacklisted = false;
+
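+  // Bring-up order: connect to the local and remote clusters, open the
+  // pool I/O contexts, fetch the local mirror uuid, then create the
+  // instance replayer, instance watcher and leader watcher before
+  // starting the pool replayer thread.  Failures are reported to the
+  // service daemon as callouts instead of being treated as fatal.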
+ dout(10) << "replaying for " << m_peer << dendl;
+ int r = init_rados(g_ceph_context->_conf->cluster,
+ g_ceph_context->_conf->name.to_str(),
+ "", "", "local cluster", &m_local_rados, false);
+ if (r < 0) {
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to connect to local cluster");
+ return;
+ }
+
+ r = init_rados(m_peer.cluster_name, m_peer.client_name,
+ m_peer.mon_host, m_peer.key,
+ std::string("remote peer ") + stringify(m_peer),
+ &m_remote_rados, true);
+ if (r < 0) {
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to connect to remote cluster");
+ return;
+ }
+
+ r = m_local_rados->ioctx_create2(m_local_pool_id, m_local_io_ctx);
+ if (r < 0) {
+ derr << "error accessing local pool " << m_local_pool_id << ": "
+ << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ auto cct = reinterpret_cast<CephContext *>(m_local_io_ctx.cct());
+ librbd::api::Config<I>::apply_pool_overrides(m_local_io_ctx, &cct->_conf);
+
+ std::string local_mirror_uuid;
+ r = librbd::cls_client::mirror_uuid_get(&m_local_io_ctx,
+ &local_mirror_uuid);
+ if (r < 0) {
+ derr << "failed to retrieve local mirror uuid from pool "
+ << m_local_io_ctx.get_pool_name() << ": " << cpp_strerror(r) << dendl;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to query local mirror uuid");
+ return;
+ }
+
+ r = m_remote_rados->ioctx_create(m_local_io_ctx.get_pool_name().c_str(),
+ m_remote_io_ctx);
+ if (r < 0) {
+ derr << "error accessing remote pool " << m_local_io_ctx.get_pool_name()
+ << ": " << cpp_strerror(r) << dendl;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_WARNING,
+ "unable to access remote pool");
+ return;
+ }
+
+ dout(10) << "connected to " << m_peer << dendl;
+
+ m_instance_replayer.reset(InstanceReplayer<I>::create(
+ m_threads, m_service_daemon, m_local_rados, local_mirror_uuid,
+ m_local_pool_id));
+ m_instance_replayer->init();
+ m_instance_replayer->add_peer(m_peer.uuid, m_remote_io_ctx);
+
+ m_instance_watcher.reset(InstanceWatcher<I>::create(
+ m_local_io_ctx, m_threads->work_queue, m_instance_replayer.get()));
+ r = m_instance_watcher->init();
+ if (r < 0) {
+ derr << "error initializing instance watcher: " << cpp_strerror(r) << dendl;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to initialize instance messenger object");
+ return;
+ }
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_INSTANCE_ID_KEY,
+ m_instance_watcher->get_instance_id());
+
+ m_leader_watcher.reset(LeaderWatcher<I>::create(m_threads, m_local_io_ctx,
+ &m_leader_listener));
+ r = m_leader_watcher->init();
+ if (r < 0) {
+ derr << "error initializing leader watcher: " << cpp_strerror(r) << dendl;
+ m_callout_id = m_service_daemon->add_or_update_callout(
+ m_local_pool_id, m_callout_id, service_daemon::CALLOUT_LEVEL_ERROR,
+ "unable to initialize leader messenger object");
+ return;
+ }
+
+ if (m_callout_id != service_daemon::CALLOUT_ID_NONE) {
+ m_service_daemon->remove_callout(m_local_pool_id, m_callout_id);
+ m_callout_id = service_daemon::CALLOUT_ID_NONE;
+ }
+
+ m_pool_replayer_thread.create("pool replayer");
+}
+
+template <typename I>
+void PoolReplayer<I>::shut_down() {
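+  // Stop the run() thread first, then shut down the leader watcher,
+  // instance watcher and instance replayer in the reverse order of
+  // their creation in init().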
+ m_stopping = true;
+ {
+ Mutex::Locker l(m_lock);
+ m_cond.Signal();
+ }
+ if (m_pool_replayer_thread.is_started()) {
+ m_pool_replayer_thread.join();
+ }
+ if (m_leader_watcher) {
+ m_leader_watcher->shut_down();
+ }
+ if (m_instance_watcher) {
+ m_instance_watcher->shut_down();
+ }
+ if (m_instance_replayer) {
+ m_instance_replayer->shut_down();
+ }
+
+ m_leader_watcher.reset();
+ m_instance_watcher.reset();
+ m_instance_replayer.reset();
+
+ ceph_assert(!m_image_map);
+ ceph_assert(!m_image_deleter);
+ ceph_assert(!m_local_pool_watcher);
+ ceph_assert(!m_remote_pool_watcher);
+ m_local_rados.reset();
+ m_remote_rados.reset();
+}
+
+template <typename I>
+int PoolReplayer<I>::init_rados(const std::string &cluster_name,
+ const std::string &client_name,
+ const std::string &mon_host,
+ const std::string &key,
+ const std::string &description,
+ RadosRef *rados_ref,
+ bool strip_cluster_overrides) {
+ // NOTE: manually bootstrap a CephContext here instead of via
+ // the librados API to avoid mixing global singletons between
+ // the librados shared library and the daemon
+ // TODO: eliminate intermingling of global singletons within Ceph APIs
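+  //
+  // Config precedence below mirrors librados: parse the config file,
+  // then the environment, then the daemon's command-line arguments, and
+  // finally any explicit peer overrides (mon_host/key).  For remote
+  // peers the cluster-unique keys saved beforehand are restored so that
+  // local-cluster settings do not leak into the peer connection.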
+ CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
+ if (client_name.empty() || !iparams.name.from_str(client_name)) {
+ derr << "error initializing cluster handle for " << description << dendl;
+ return -EINVAL;
+ }
+
+ CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+ cct->_conf->cluster = cluster_name;
+
+ // librados::Rados::conf_read_file
+ int r = cct->_conf.parse_config_files(nullptr, nullptr, 0);
+ if (r < 0 && r != -ENOENT) {
+ // do not treat this as fatal, it might still be able to connect
+ derr << "could not read ceph conf for " << description << ": "
+ << cpp_strerror(r) << dendl;
+ }
+
+ // preserve cluster-specific config settings before applying environment/cli
+ // overrides
+ std::map<std::string, std::string> config_values;
+ if (strip_cluster_overrides) {
+ // remote peer connections shouldn't apply cluster-specific
+ // configuration settings
+ for (auto& key : UNIQUE_PEER_CONFIG_KEYS) {
+ config_values[key] = cct->_conf.get_val<std::string>(key);
+ }
+ }
+
+ cct->_conf.parse_env(cct->get_module_type());
+
+ // librados::Rados::conf_parse_env
+ std::vector<const char*> args;
+ r = cct->_conf.parse_argv(args);
+ if (r < 0) {
+ derr << "could not parse environment for " << description << ":"
+ << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ cct->_conf.parse_env(cct->get_module_type());
+
+ if (!m_args.empty()) {
+ // librados::Rados::conf_parse_argv
+ args = m_args;
+ r = cct->_conf.parse_argv(args);
+ if (r < 0) {
+ derr << "could not parse command line args for " << description << ": "
+ << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+
+ if (strip_cluster_overrides) {
+ // remote peer connections shouldn't apply cluster-specific
+ // configuration settings
+ for (auto& pair : config_values) {
+ auto value = cct->_conf.get_val<std::string>(pair.first);
+ if (pair.second != value) {
+ dout(0) << "reverting global config option override: "
+ << pair.first << ": " << value << " -> " << pair.second
+ << dendl;
+ cct->_conf.set_val_or_die(pair.first, pair.second);
+ }
+ }
+ }
+
+ if (!g_ceph_context->_conf->admin_socket.empty()) {
+ cct->_conf.set_val_or_die("admin_socket",
+ "$run_dir/$name.$pid.$cluster.$cctid.asok");
+ }
+
+ if (!mon_host.empty()) {
+ r = cct->_conf.set_val("mon_host", mon_host);
+ if (r < 0) {
+ derr << "failed to set mon_host config for " << description << ": "
+ << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+
+ if (!key.empty()) {
+ r = cct->_conf.set_val("key", key);
+ if (r < 0) {
+ derr << "failed to set key config for " << description << ": "
+ << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+
+ // disable unnecessary librbd cache
+ cct->_conf.set_val_or_die("rbd_cache", "false");
+ cct->_conf.apply_changes(nullptr);
+ cct->_conf.complain_about_parse_errors(cct);
+
+ rados_ref->reset(new librados::Rados());
+
+ r = (*rados_ref)->init_with_context(cct);
+ ceph_assert(r == 0);
+ cct->put();
+
+ r = (*rados_ref)->connect();
+ if (r < 0) {
+ derr << "error connecting to " << description << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void PoolReplayer<I>::run()
+{
+ dout(20) << "enter" << dendl;
+
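+  // Main thread loop: (re)create the per-pool admin socket hook whenever
+  // the pool/peer name changes, exit once any component reports being
+  // blacklisted or a stop is requested, and otherwise poll roughly once
+  // per second.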
+ while (!m_stopping) {
+ std::string asok_hook_name = m_local_io_ctx.get_pool_name() + " " +
+ m_peer.cluster_name;
+ if (m_asok_hook_name != asok_hook_name || m_asok_hook == nullptr) {
+ m_asok_hook_name = asok_hook_name;
+ delete m_asok_hook;
+
+ m_asok_hook = new PoolReplayerAdminSocketHook<I>(g_ceph_context,
+ m_asok_hook_name, this);
+ }
+
+ Mutex::Locker locker(m_lock);
+ if (m_leader_watcher->is_blacklisted() ||
+ m_instance_replayer->is_blacklisted() ||
+ (m_local_pool_watcher && m_local_pool_watcher->is_blacklisted()) ||
+ (m_remote_pool_watcher && m_remote_pool_watcher->is_blacklisted())) {
+ m_blacklisted = true;
+ m_stopping = true;
+ break;
+ }
+
+ if (!m_stopping) {
+ m_cond.WaitInterval(m_lock, utime_t(1, 0));
+ }
+ }
+
+ m_instance_replayer->stop();
+}
+
+template <typename I>
+void PoolReplayer<I>::reopen_logs()
+{
+ Mutex::Locker l(m_lock);
+
+ if (m_local_rados) {
+ reinterpret_cast<CephContext *>(m_local_rados->cct())->reopen_logs();
+ }
+ if (m_remote_rados) {
+ reinterpret_cast<CephContext *>(m_remote_rados->cct())->reopen_logs();
+ }
+}
+
+template <typename I>
+void PoolReplayer<I>::print_status(Formatter *f, stringstream *ss)
+{
+ dout(20) << "enter" << dendl;
+
+ if (!f) {
+ return;
+ }
+
+ Mutex::Locker l(m_lock);
+
+ f->open_object_section("pool_replayer_status");
+ f->dump_stream("peer") << m_peer;
+ if (m_local_io_ctx.is_valid()) {
+ f->dump_string("pool", m_local_io_ctx.get_pool_name());
+ f->dump_stream("instance_id") << m_instance_watcher->get_instance_id();
+ }
+
+ std::string state("running");
+ if (m_manual_stop) {
+ state = "stopped (manual)";
+ } else if (m_stopping) {
+ state = "stopped";
+ }
+ f->dump_string("state", state);
+
+ std::string leader_instance_id;
+ m_leader_watcher->get_leader_instance_id(&leader_instance_id);
+ f->dump_string("leader_instance_id", leader_instance_id);
+
+ bool leader = m_leader_watcher->is_leader();
+ f->dump_bool("leader", leader);
+ if (leader) {
+ std::vector<std::string> instance_ids;
+ m_leader_watcher->list_instances(&instance_ids);
+ f->open_array_section("instances");
+    for (const auto& instance_id : instance_ids) {
+ f->dump_string("instance_id", instance_id);
+ }
+ f->close_section();
+ }
+
+ f->dump_string("local_cluster_admin_socket",
+ reinterpret_cast<CephContext *>(m_local_io_ctx.cct())->_conf.
+ get_val<std::string>("admin_socket"));
+ f->dump_string("remote_cluster_admin_socket",
+ reinterpret_cast<CephContext *>(m_remote_io_ctx.cct())->_conf.
+ get_val<std::string>("admin_socket"));
+
+ f->open_object_section("sync_throttler");
+ m_instance_watcher->print_sync_status(f, ss);
+ f->close_section();
+
+ m_instance_replayer->print_status(f, ss);
+
+ if (m_image_deleter) {
+ f->open_object_section("image_deleter");
+ m_image_deleter->print_status(f, ss);
+ f->close_section();
+ }
+
+ f->close_section();
+ f->flush(*ss);
+}
+
+template <typename I>
+void PoolReplayer<I>::start()
+{
+ dout(20) << "enter" << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = false;
+
+ if (m_instance_replayer) {
+ m_instance_replayer->start();
+ }
+}
+
+template <typename I>
+void PoolReplayer<I>::stop(bool manual)
+{
+ dout(20) << "enter: manual=" << manual << dendl;
+
+ Mutex::Locker l(m_lock);
+ if (!manual) {
+ m_stopping = true;
+ m_cond.Signal();
+ return;
+ } else if (m_stopping) {
+ return;
+ }
+
+ m_manual_stop = true;
+
+ if (m_instance_replayer) {
+ m_instance_replayer->stop();
+ }
+}
+
+template <typename I>
+void PoolReplayer<I>::restart()
+{
+ dout(20) << "enter" << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping) {
+ return;
+ }
+
+ if (m_instance_replayer) {
+ m_instance_replayer->restart();
+ }
+}
+
+template <typename I>
+void PoolReplayer<I>::flush()
+{
+ dout(20) << "enter" << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping || m_manual_stop) {
+ return;
+ }
+
+ if (m_instance_replayer) {
+ m_instance_replayer->flush();
+ }
+}
+
+template <typename I>
+void PoolReplayer<I>::release_leader()
+{
+ dout(20) << "enter" << dendl;
+
+ Mutex::Locker l(m_lock);
+
+ if (m_stopping || !m_leader_watcher) {
+ return;
+ }
+
+ m_leader_watcher->release_leader();
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) {
+ if (m_stopping) {
+ return;
+ }
+
+ dout(10) << "mirror_uuid=" << mirror_uuid << ", "
+ << "added_count=" << added_image_ids.size() << ", "
+ << "removed_count=" << removed_image_ids.size() << dendl;
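+  // Only the leader processes pool watcher updates: refresh the service
+  // daemon image counts and feed the added/removed global image ids into
+  // the image map, which distributes them across the active instances.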
+ Mutex::Locker locker(m_lock);
+ if (!m_leader_watcher->is_leader()) {
+ return;
+ }
+
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_LOCAL_COUNT_KEY,
+ m_local_pool_watcher->get_image_count());
+ if (m_remote_pool_watcher) {
+ m_service_daemon->add_or_update_attribute(
+ m_local_pool_id, SERVICE_DAEMON_REMOTE_COUNT_KEY,
+ m_remote_pool_watcher->get_image_count());
+ }
+
+ std::set<std::string> added_global_image_ids;
+ for (auto& image_id : added_image_ids) {
+ added_global_image_ids.insert(image_id.global_id);
+ }
+
+ std::set<std::string> removed_global_image_ids;
+ for (auto& image_id : removed_image_ids) {
+ removed_global_image_ids.insert(image_id.global_id);
+ }
+
+ m_image_map->update_images(mirror_uuid,
+ std::move(added_global_image_ids),
+ std::move(removed_global_image_ids));
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_post_acquire_leader(Context *on_finish) {
+ dout(10) << dendl;
+
+ m_service_daemon->add_or_update_attribute(m_local_pool_id,
+ SERVICE_DAEMON_LEADER_KEY, true);
+ m_instance_watcher->handle_acquire_leader();
+ init_image_map(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_pre_release_leader(Context *on_finish) {
+ dout(10) << dendl;
+
+ m_service_daemon->remove_attribute(m_local_pool_id,
+ SERVICE_DAEMON_LEADER_KEY);
+ m_instance_watcher->handle_release_leader();
+ shut_down_image_deleter(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::init_image_map(Context *on_finish) {
+ dout(5) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_image_map);
+ m_image_map.reset(ImageMap<I>::create(m_local_io_ctx, m_threads,
+ m_instance_watcher->get_instance_id(),
+ m_image_map_listener));
+
+ auto ctx = new FunctionContext([this, on_finish](int r) {
+ handle_init_image_map(r, on_finish);
+ });
+ m_image_map->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_init_image_map(int r, Context *on_finish) {
+ dout(5) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to init image map: " << cpp_strerror(r) << dendl;
+ on_finish = new FunctionContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_image_map(on_finish);
+ return;
+ }
+
+ init_local_pool_watcher(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::init_local_pool_watcher(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_local_pool_watcher);
+ m_local_pool_watcher.reset(PoolWatcher<I>::create(
+ m_threads, m_local_io_ctx, m_local_pool_watcher_listener));
+
+ // ensure the initial set of local images is up-to-date
+ // after acquiring the leader role
+ auto ctx = new FunctionContext([this, on_finish](int r) {
+ handle_init_local_pool_watcher(r, on_finish);
+ });
+ m_local_pool_watcher->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_init_local_pool_watcher(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to retrieve local images: " << cpp_strerror(r) << dendl;
+ on_finish = new FunctionContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_pool_watchers(on_finish);
+ return;
+ }
+
+ init_remote_pool_watcher(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::init_remote_pool_watcher(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_remote_pool_watcher);
+ m_remote_pool_watcher.reset(PoolWatcher<I>::create(
+ m_threads, m_remote_io_ctx, m_remote_pool_watcher_listener));
+
+ auto ctx = new FunctionContext([this, on_finish](int r) {
+ handle_init_remote_pool_watcher(r, on_finish);
+ });
+ m_remote_pool_watcher->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_init_remote_pool_watcher(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -ENOENT) {
+ // Technically nothing to do since the other side doesn't
+ // have mirroring enabled. Eventually the remote pool watcher will
+ // detect images (if mirroring is enabled), so no point propagating
+ // an error which would just busy-spin the state machines.
+ dout(0) << "remote peer does not have mirroring configured" << dendl;
+ } else if (r < 0) {
+ derr << "failed to retrieve remote images: " << cpp_strerror(r) << dendl;
+ on_finish = new FunctionContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_pool_watchers(on_finish);
+ return;
+ }
+
+ init_image_deleter(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::init_image_deleter(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_image_deleter);
+
+ on_finish = new FunctionContext([this, on_finish](int r) {
+ handle_init_image_deleter(r, on_finish);
+ });
+ m_image_deleter.reset(ImageDeleter<I>::create(m_local_io_ctx, m_threads,
+ m_service_daemon));
+ m_image_deleter->init(create_async_context_callback(
+ m_threads->work_queue, on_finish));
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_init_image_deleter(int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to init image deleter: " << cpp_strerror(r) << dendl;
+ on_finish = new FunctionContext([on_finish, r](int) {
+ on_finish->complete(r);
+ });
+ shut_down_image_deleter(on_finish);
+ return;
+ }
+
+ on_finish->complete(0);
+
+ Mutex::Locker locker(m_lock);
+ m_cond.Signal();
+}
+
+template <typename I>
+void PoolReplayer<I>::shut_down_image_deleter(Context* on_finish) {
+ dout(10) << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_image_deleter) {
+ Context *ctx = new FunctionContext([this, on_finish](int r) {
+ handle_shut_down_image_deleter(r, on_finish);
+ });
+ ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+ m_image_deleter->shut_down(ctx);
+ return;
+ }
+ }
+ shut_down_pool_watchers(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_shut_down_image_deleter(
+ int r, Context* on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_deleter);
+ m_image_deleter.reset();
+ }
+
+ shut_down_pool_watchers(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::shut_down_pool_watchers(Context *on_finish) {
+ dout(10) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_local_pool_watcher) {
+ Context *ctx = new FunctionContext([this, on_finish](int r) {
+ handle_shut_down_pool_watchers(r, on_finish);
+ });
+ ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+ auto gather_ctx = new C_Gather(g_ceph_context, ctx);
+ m_local_pool_watcher->shut_down(gather_ctx->new_sub());
+ if (m_remote_pool_watcher) {
+ m_remote_pool_watcher->shut_down(gather_ctx->new_sub());
+ }
+ gather_ctx->activate();
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_shut_down_pool_watchers(
+ int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_local_pool_watcher);
+ m_local_pool_watcher.reset();
+
+ if (m_remote_pool_watcher) {
+ m_remote_pool_watcher.reset();
+ }
+ }
+ wait_for_update_ops(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::wait_for_update_ops(Context *on_finish) {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+
+ Context *ctx = new FunctionContext([this, on_finish](int r) {
+ handle_wait_for_update_ops(r, on_finish);
+ });
+ ctx = create_async_context_callback(m_threads->work_queue, ctx);
+
+ m_update_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_wait_for_update_ops(int r, Context *on_finish) {
+ dout(10) << "r=" << r << dendl;
+ ceph_assert(r == 0);
+
+ shut_down_image_map(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::shut_down_image_map(Context *on_finish) {
+ dout(5) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_image_map) {
+ on_finish = new FunctionContext([this, on_finish](int r) {
+ handle_shut_down_image_map(r, on_finish);
+ });
+ m_image_map->shut_down(create_async_context_callback(
+ m_threads->work_queue, on_finish));
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_shut_down_image_map(int r, Context *on_finish) {
+ dout(5) << "r=" << r << dendl;
+ if (r < 0 && r != -EBLACKLISTED) {
+ derr << "failed to shut down image map: " << cpp_strerror(r) << dendl;
+ }
+
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_map);
+ m_image_map.reset();
+
+ m_instance_replayer->release_all(on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_update_leader(
+ const std::string &leader_instance_id) {
+ dout(10) << "leader_instance_id=" << leader_instance_id << dendl;
+
+ m_instance_watcher->handle_update_leader(leader_instance_id);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_acquire_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) {
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ m_instance_watcher->notify_image_acquire(instance_id, global_image_id,
+ on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_release_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) {
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ m_instance_watcher->notify_image_release(instance_id, global_image_id,
+ on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_remove_image(const std::string &mirror_uuid,
+ const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) {
+ ceph_assert(!mirror_uuid.empty());
+ dout(5) << "mirror_uuid=" << mirror_uuid << ", "
+ << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ m_instance_watcher->notify_peer_image_removed(instance_id, global_image_id,
+ mirror_uuid, on_finish);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_instances_added(const InstanceIds &instance_ids) {
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+ Mutex::Locker locker(m_lock);
+ if (!m_leader_watcher->is_leader()) {
+ return;
+ }
+
+ ceph_assert(m_image_map);
+ m_image_map->update_instances_added(instance_ids);
+}
+
+template <typename I>
+void PoolReplayer<I>::handle_instances_removed(
+ const InstanceIds &instance_ids) {
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+ Mutex::Locker locker(m_lock);
+ if (!m_leader_watcher->is_leader()) {
+ return;
+ }
+
+ ceph_assert(m_image_map);
+ m_image_map->update_instances_removed(instance_ids);
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::PoolReplayer<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/PoolReplayer.h b/src/tools/rbd_mirror/PoolReplayer.h
new file mode 100644
index 00000000..43a4a0fc
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolReplayer.h
@@ -0,0 +1,303 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_REPLAYER_H
+#define CEPH_RBD_MIRROR_POOL_REPLAYER_H
+
+#include "common/AsyncOpTracker.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/WorkQueue.h"
+#include "include/rados/librados.hpp"
+
+#include "ClusterWatcher.h"
+#include "LeaderWatcher.h"
+#include "PoolWatcher.h"
+#include "ImageDeleter.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/image_map/Types.h"
+#include "tools/rbd_mirror/leader_watcher/Types.h"
+#include "tools/rbd_mirror/pool_watcher/Types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+
+#include <set>
+#include <map>
+#include <memory>
+#include <atomic>
+#include <string>
+#include <vector>
+
+class AdminSocketHook;
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> class ImageMap;
+template <typename> class InstanceReplayer;
+template <typename> class InstanceWatcher;
+template <typename> class ServiceDaemon;
+template <typename> struct Threads;
+
+/**
+ * Controls mirroring for a single local pool and its remote peer cluster.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolReplayer {
+public:
+ PoolReplayer(Threads<ImageCtxT> *threads,
+ ServiceDaemon<ImageCtxT>* service_daemon,
+ int64_t local_pool_id, const PeerSpec &peer,
+ const std::vector<const char*> &args);
+ ~PoolReplayer();
+ PoolReplayer(const PoolReplayer&) = delete;
+ PoolReplayer& operator=(const PoolReplayer&) = delete;
+
+ bool is_blacklisted() const;
+ bool is_leader() const;
+ bool is_running() const;
+
+ void init();
+ void shut_down();
+
+ void run();
+
+ void print_status(Formatter *f, stringstream *ss);
+ void start();
+ void stop(bool manual);
+ void restart();
+ void flush();
+ void release_leader();
+ void reopen_logs();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT
+ * |
+ * v
+ * <follower> <-------------------------\
+ * . |
+ * . |
+ * v (leader acquired) |
+ * INIT_IMAGE_MAP SHUT_DOWN_IMAGE_MAP
+ * | ^
+ * v |
+ * INIT_LOCAL_POOL_WATCHER WAIT_FOR_NOTIFICATIONS
+ * | ^
+ * v |
+ * INIT_REMOTE_POOL_WATCHER SHUT_DOWN_POOL_WATCHERS
+ * | ^
+ * v |
+ * INIT_IMAGE_DELETER SHUT_DOWN_IMAGE_DELETER
+ * | ^
+ * v .
+ * <leader> <-----------\ .
+ * . | .
+ * . (image update) | .
+ * . . > NOTIFY_INSTANCE_WATCHER .
+ * . .
+ * . (leader lost / shut down) .
+ * . . . . . . . . . . . . . . . . . .
+ *
+ * @endverbatim
+ */
+
+ typedef std::vector<std::string> InstanceIds;
+
+ struct PoolWatcherListener : public pool_watcher::Listener {
+ PoolReplayer *pool_replayer;
+ bool local;
+
+ PoolWatcherListener(PoolReplayer *pool_replayer, bool local)
+ : pool_replayer(pool_replayer), local(local) {
+ }
+
+ void handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) override {
+ pool_replayer->handle_update((local ? "" : mirror_uuid),
+ std::move(added_image_ids),
+ std::move(removed_image_ids));
+ }
+ };
+
+ struct ImageMapListener : public image_map::Listener {
+ PoolReplayer *pool_replayer;
+
+ ImageMapListener(PoolReplayer *pool_replayer)
+ : pool_replayer(pool_replayer) {
+ }
+
+ void acquire_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) override {
+ pool_replayer->handle_acquire_image(global_image_id, instance_id,
+ on_finish);
+ }
+
+ void release_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) override {
+ pool_replayer->handle_release_image(global_image_id, instance_id,
+ on_finish);
+ }
+
+ void remove_image(const std::string &mirror_uuid,
+ const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) override {
+ pool_replayer->handle_remove_image(mirror_uuid, global_image_id,
+ instance_id, on_finish);
+ }
+ };
+
+ void handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids);
+
+ int init_rados(const std::string &cluster_name,
+ const std::string &client_name,
+ const std::string &mon_host,
+ const std::string &key,
+ const std::string &description, RadosRef *rados_ref,
+ bool strip_cluster_overrides);
+
+ void handle_post_acquire_leader(Context *on_finish);
+ void handle_pre_release_leader(Context *on_finish);
+
+ void init_image_map(Context *on_finish);
+ void handle_init_image_map(int r, Context *on_finish);
+
+ void init_local_pool_watcher(Context *on_finish);
+ void handle_init_local_pool_watcher(int r, Context *on_finish);
+
+ void init_remote_pool_watcher(Context *on_finish);
+ void handle_init_remote_pool_watcher(int r, Context *on_finish);
+
+ void init_image_deleter(Context* on_finish);
+ void handle_init_image_deleter(int r, Context* on_finish);
+
+ void shut_down_image_deleter(Context* on_finish);
+ void handle_shut_down_image_deleter(int r, Context* on_finish);
+
+ void shut_down_pool_watchers(Context *on_finish);
+ void handle_shut_down_pool_watchers(int r, Context *on_finish);
+
+ void wait_for_update_ops(Context *on_finish);
+ void handle_wait_for_update_ops(int r, Context *on_finish);
+
+ void shut_down_image_map(Context *on_finish);
+ void handle_shut_down_image_map(int r, Context *on_finish);
+
+ void handle_update_leader(const std::string &leader_instance_id);
+
+ void handle_acquire_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish);
+ void handle_release_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish);
+ void handle_remove_image(const std::string &mirror_uuid,
+ const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish);
+
+ void handle_instances_added(const InstanceIds &instance_ids);
+ void handle_instances_removed(const InstanceIds &instance_ids);
+
+ Threads<ImageCtxT> *m_threads;
+ ServiceDaemon<ImageCtxT>* m_service_daemon;
+ int64_t m_local_pool_id = -1;
+ PeerSpec m_peer;
+ std::vector<const char*> m_args;
+
+ mutable Mutex m_lock;
+ Cond m_cond;
+ std::atomic<bool> m_stopping = { false };
+ bool m_manual_stop = false;
+ bool m_blacklisted = false;
+
+ RadosRef m_local_rados;
+ RadosRef m_remote_rados;
+
+ librados::IoCtx m_local_io_ctx;
+ librados::IoCtx m_remote_io_ctx;
+
+ PoolWatcherListener m_local_pool_watcher_listener;
+ std::unique_ptr<PoolWatcher<ImageCtxT>> m_local_pool_watcher;
+
+ PoolWatcherListener m_remote_pool_watcher_listener;
+ std::unique_ptr<PoolWatcher<ImageCtxT>> m_remote_pool_watcher;
+
+ std::unique_ptr<InstanceReplayer<ImageCtxT>> m_instance_replayer;
+ std::unique_ptr<ImageDeleter<ImageCtxT>> m_image_deleter;
+
+ ImageMapListener m_image_map_listener;
+ std::unique_ptr<ImageMap<ImageCtxT>> m_image_map;
+
+ std::string m_asok_hook_name;
+ AdminSocketHook *m_asok_hook = nullptr;
+
+ service_daemon::CalloutId m_callout_id = service_daemon::CALLOUT_ID_NONE;
+
+ class PoolReplayerThread : public Thread {
+ PoolReplayer *m_pool_replayer;
+ public:
+ PoolReplayerThread(PoolReplayer *pool_replayer)
+ : m_pool_replayer(pool_replayer) {
+ }
+ void *entry() override {
+ m_pool_replayer->run();
+ return 0;
+ }
+ } m_pool_replayer_thread;
+
+ class LeaderListener : public leader_watcher::Listener {
+ public:
+ LeaderListener(PoolReplayer *pool_replayer)
+ : m_pool_replayer(pool_replayer) {
+ }
+
+ protected:
+ void post_acquire_handler(Context *on_finish) override {
+ m_pool_replayer->handle_post_acquire_leader(on_finish);
+ }
+
+ void pre_release_handler(Context *on_finish) override {
+ m_pool_replayer->handle_pre_release_leader(on_finish);
+ }
+
+ void update_leader_handler(
+ const std::string &leader_instance_id) override {
+ m_pool_replayer->handle_update_leader(leader_instance_id);
+ }
+
+ void handle_instances_added(const InstanceIds& instance_ids) override {
+ m_pool_replayer->handle_instances_added(instance_ids);
+ }
+
+ void handle_instances_removed(const InstanceIds& instance_ids) override {
+ m_pool_replayer->handle_instances_removed(instance_ids);
+ }
+
+ private:
+ PoolReplayer *m_pool_replayer;
+ } m_leader_listener;
+
+ std::unique_ptr<LeaderWatcher<ImageCtxT>> m_leader_watcher;
+ std::unique_ptr<InstanceWatcher<ImageCtxT>> m_instance_watcher;
+ AsyncOpTracker m_update_op_tracker;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::PoolReplayer<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_POOL_REPLAYER_H
diff --git a/src/tools/rbd_mirror/PoolWatcher.cc b/src/tools/rbd_mirror/PoolWatcher.cc
new file mode 100644
index 00000000..81810ea1
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolWatcher.cc
@@ -0,0 +1,553 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/PoolWatcher.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+#include "librbd/MirroringWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/api/Image.h"
+#include "librbd/api/Mirror.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h"
+#include <boost/bind.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::PoolWatcher: " << this << " " \
+ << __func__ << ": "
+
+using std::list;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace rbd {
+namespace mirror {
+
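+// Thin adapter around librbd::MirroringWatcher that forwards rewatch,
+// mirror-mode and per-image mirroring notifications from the watched pool
+// back into the owning PoolWatcher.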
+template <typename I>
+class PoolWatcher<I>::MirroringWatcher : public librbd::MirroringWatcher<I> {
+public:
+ using ContextWQ = typename std::decay<
+ typename std::remove_pointer<
+ decltype(Threads<I>::work_queue)>::type>::type;
+
+ MirroringWatcher(librados::IoCtx &io_ctx, ContextWQ *work_queue,
+ PoolWatcher *pool_watcher)
+ : librbd::MirroringWatcher<I>(io_ctx, work_queue),
+ m_pool_watcher(pool_watcher) {
+ }
+
+ void handle_rewatch_complete(int r) override {
+ m_pool_watcher->handle_rewatch_complete(r);
+ }
+
+ void handle_mode_updated(cls::rbd::MirrorMode mirror_mode) override {
+ // invalidate all image state and refresh the pool contents
+ m_pool_watcher->schedule_refresh_images(5);
+ }
+
+ void handle_image_updated(cls::rbd::MirrorImageState state,
+ const std::string &remote_image_id,
+ const std::string &global_image_id) override {
+ bool enabled = (state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED);
+ m_pool_watcher->handle_image_updated(remote_image_id, global_image_id,
+ enabled);
+ }
+
+private:
+ PoolWatcher *m_pool_watcher;
+};
+
+template <typename I>
+PoolWatcher<I>::PoolWatcher(Threads<I> *threads, librados::IoCtx &remote_io_ctx,
+ pool_watcher::Listener &listener)
+ : m_threads(threads), m_remote_io_ctx(remote_io_ctx), m_listener(listener),
+ m_lock(librbd::util::unique_lock_name("rbd::mirror::PoolWatcher", this)) {
+ m_mirroring_watcher = new MirroringWatcher(m_remote_io_ctx,
+ m_threads->work_queue, this);
+}
+
+template <typename I>
+PoolWatcher<I>::~PoolWatcher() {
+ delete m_mirroring_watcher;
+}
+
+template <typename I>
+bool PoolWatcher<I>::is_blacklisted() const {
+ Mutex::Locker locker(m_lock);
+ return m_blacklisted;
+}
+
+template <typename I>
+void PoolWatcher<I>::init(Context *on_finish) {
+ dout(5) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_on_init_finish = on_finish;
+
+ ceph_assert(!m_refresh_in_progress);
+ m_refresh_in_progress = true;
+ }
+
+ // start async updates for mirror image directory
+ register_watcher();
+}
+
+template <typename I>
+void PoolWatcher<I>::shut_down(Context *on_finish) {
+ dout(5) << dendl;
+
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(!m_shutting_down);
+ m_shutting_down = true;
+ if (m_timer_ctx != nullptr) {
+ m_threads->timer->cancel_event(m_timer_ctx);
+ m_timer_ctx = nullptr;
+ }
+ }
+
+ // in-progress unregister tracked as async op
+ unregister_watcher();
+
+ m_async_op_tracker.wait_for_ops(on_finish);
+}
+
+template <typename I>
+void PoolWatcher<I>::register_watcher() {
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_ids_invalid);
+ ceph_assert(m_refresh_in_progress);
+ }
+
+ // if the watch registration is in-flight, let the watcher
+ // handle the transition -- only (re-)register if it's not registered
+ if (!m_mirroring_watcher->is_unregistered()) {
+ refresh_images();
+ return;
+ }
+
+ // first time registering or the watch failed
+ dout(5) << dendl;
+ m_async_op_tracker.start_op();
+
+ Context *ctx = create_context_callback<
+ PoolWatcher, &PoolWatcher<I>::handle_register_watcher>(this);
+ m_mirroring_watcher->register_watch(ctx);
+}
+
+template <typename I>
+void PoolWatcher<I>::handle_register_watcher(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_ids_invalid);
+ ceph_assert(m_refresh_in_progress);
+ if (r < 0) {
+ m_refresh_in_progress = false;
+ }
+ }
+
+ Context *on_init_finish = nullptr;
+ if (r >= 0) {
+ refresh_images();
+ } else if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted" << dendl;
+
+ Mutex::Locker locker(m_lock);
+ m_blacklisted = true;
+ std::swap(on_init_finish, m_on_init_finish);
+ } else if (r == -ENOENT) {
+ dout(5) << "mirroring directory does not exist" << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+
+ schedule_refresh_images(30);
+ } else {
+ derr << "unexpected error registering mirroring directory watch: "
+ << cpp_strerror(r) << dendl;
+ schedule_refresh_images(10);
+ }
+
+ m_async_op_tracker.finish_op();
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+}
+
+template <typename I>
+void PoolWatcher<I>::unregister_watcher() {
+ dout(5) << dendl;
+
+ m_async_op_tracker.start_op();
+ Context *ctx = new FunctionContext([this](int r) {
+ dout(5) << "unregister_watcher: r=" << r << dendl;
+ if (r < 0) {
+ derr << "error unregistering watcher for "
+ << m_mirroring_watcher->get_oid() << " object: " << cpp_strerror(r)
+ << dendl;
+ }
+ m_async_op_tracker.finish_op();
+ });
+
+ m_mirroring_watcher->unregister_watch(ctx);
+}
+
+template <typename I>
+void PoolWatcher<I>::refresh_images() {
+ dout(5) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_ids_invalid);
+ ceph_assert(m_refresh_in_progress);
+
+ // clear all pending notification events since we need to perform
+ // a full image list refresh
+ m_pending_added_image_ids.clear();
+ m_pending_removed_image_ids.clear();
+ }
+
+ m_async_op_tracker.start_op();
+ m_refresh_image_ids.clear();
+ Context *ctx = create_context_callback<
+ PoolWatcher, &PoolWatcher<I>::handle_refresh_images>(this);
+ auto req = pool_watcher::RefreshImagesRequest<I>::create(m_remote_io_ctx,
+ &m_refresh_image_ids,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void PoolWatcher<I>::handle_refresh_images(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ bool retry_refresh = false;
+ Context *on_init_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_ids_invalid);
+ ceph_assert(m_refresh_in_progress);
+
+ if (r >= 0) {
+ m_pending_image_ids = std::move(m_refresh_image_ids);
+ } else if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted during image refresh" << dendl;
+
+ m_blacklisted = true;
+ m_refresh_in_progress = false;
+ std::swap(on_init_finish, m_on_init_finish);
+ } else if (r == -ENOENT) {
+ dout(5) << "mirroring directory not found" << dendl;
+ m_pending_image_ids.clear();
+ r = 0;
+ } else {
+ m_refresh_in_progress = false;
+ retry_refresh = true;
+ }
+ }
+
+ if (retry_refresh) {
+ derr << "failed to retrieve mirroring directory: " << cpp_strerror(r)
+ << dendl;
+ schedule_refresh_images(10);
+ } else if (r >= 0) {
+ get_mirror_uuid();
+ return;
+ }
+
+ m_async_op_tracker.finish_op();
+ if (on_init_finish != nullptr) {
+ ceph_assert(r == -EBLACKLISTED);
+ on_init_finish->complete(r);
+ }
+}
+
+template <typename I>
+void PoolWatcher<I>::get_mirror_uuid() {
+ dout(5) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_uuid_get_start(&op);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PoolWatcher, &PoolWatcher<I>::handle_get_mirror_uuid>(this);
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PoolWatcher<I>::handle_get_mirror_uuid(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ bool deferred_refresh = false;
+ bool retry_refresh = false;
+ Context *on_init_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_image_ids_invalid);
+ ceph_assert(m_refresh_in_progress);
+ m_refresh_in_progress = false;
+
+ m_pending_mirror_uuid = "";
+ if (r >= 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_uuid_get_finish(
+ &it, &m_pending_mirror_uuid);
+ }
+ if (r >= 0 && m_pending_mirror_uuid.empty()) {
+ r = -ENOENT;
+ }
+
+ if (m_deferred_refresh) {
+ // need to refresh -- skip the notification
+ deferred_refresh = true;
+ } else if (r >= 0) {
+ dout(10) << "mirror_uuid=" << m_pending_mirror_uuid << dendl;
+ m_image_ids_invalid = false;
+ std::swap(on_init_finish, m_on_init_finish);
+ schedule_listener();
+ } else if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted during image refresh" << dendl;
+
+ m_blacklisted = true;
+ std::swap(on_init_finish, m_on_init_finish);
+ } else if (r == -ENOENT) {
+ dout(5) << "mirroring uuid not found" << dendl;
+ std::swap(on_init_finish, m_on_init_finish);
+ retry_refresh = true;
+ } else {
+ retry_refresh = true;
+ }
+ }
+
+ if (deferred_refresh) {
+ dout(5) << "scheduling deferred refresh" << dendl;
+ schedule_refresh_images(0);
+ } else if (retry_refresh) {
+ derr << "failed to retrieve mirror uuid: " << cpp_strerror(r)
+ << dendl;
+ schedule_refresh_images(10);
+ }
+
+ m_async_op_tracker.finish_op();
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+}
+
+template <typename I>
+void PoolWatcher<I>::schedule_refresh_images(double interval) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ if (m_shutting_down || m_refresh_in_progress || m_timer_ctx != nullptr) {
+ if (m_refresh_in_progress && !m_deferred_refresh) {
+ dout(5) << "deferring refresh until in-flight refresh completes" << dendl;
+ m_deferred_refresh = true;
+ }
+ return;
+ }
+
+ m_image_ids_invalid = true;
+ m_timer_ctx = m_threads->timer->add_event_after(
+ interval,
+ new FunctionContext([this](int r) {
+ process_refresh_images();
+ }));
+}
+
+template <typename I>
+void PoolWatcher<I>::handle_rewatch_complete(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted" << dendl;
+
+ Mutex::Locker locker(m_lock);
+ m_blacklisted = true;
+ return;
+ } else if (r == -ENOENT) {
+ dout(5) << "mirroring directory deleted" << dendl;
+ } else if (r < 0) {
+ derr << "unexpected error re-registering mirroring directory watch: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ schedule_refresh_images(5);
+}
+
+template <typename I>
+void PoolWatcher<I>::handle_image_updated(const std::string &remote_image_id,
+ const std::string &global_image_id,
+ bool enabled) {
+ dout(10) << "remote_image_id=" << remote_image_id << ", "
+ << "global_image_id=" << global_image_id << ", "
+ << "enabled=" << enabled << dendl;
+
+ Mutex::Locker locker(m_lock);
+ ImageId image_id(global_image_id, remote_image_id);
+ m_pending_added_image_ids.erase(image_id);
+ m_pending_removed_image_ids.erase(image_id);
+
+ if (enabled) {
+ m_pending_added_image_ids.insert(image_id);
+ schedule_listener();
+ } else {
+ m_pending_removed_image_ids.insert(image_id);
+ schedule_listener();
+ }
+}
+
+template <typename I>
+void PoolWatcher<I>::process_refresh_images() {
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_timer_ctx != nullptr);
+ m_timer_ctx = nullptr;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_refresh_in_progress);
+ m_refresh_in_progress = true;
+ m_deferred_refresh = false;
+ }
+
+ // execute outside of the timer's lock
+ m_async_op_tracker.start_op();
+ Context *ctx = new FunctionContext([this](int r) {
+ register_watcher();
+ m_async_op_tracker.finish_op();
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void PoolWatcher<I>::schedule_listener() {
+ ceph_assert(m_lock.is_locked());
+ m_pending_updates = true;
+ if (m_shutting_down || m_image_ids_invalid || m_notify_listener_in_progress) {
+ return;
+ }
+
+ dout(20) << dendl;
+
+ m_async_op_tracker.start_op();
+ Context *ctx = new FunctionContext([this](int r) {
+ notify_listener();
+ m_async_op_tracker.finish_op();
+ });
+
+ m_notify_listener_in_progress = true;
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void PoolWatcher<I>::notify_listener() {
+ dout(10) << dendl;
+
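+  // Notify in two steps: if the mirror uuid changed, first report every
+  // previously known image as removed under the old uuid, then diff the
+  // pending image set against the current one and report the resulting
+  // additions/removals under the new uuid.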
+ std::string mirror_uuid;
+ ImageIds added_image_ids;
+ ImageIds removed_image_ids;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_notify_listener_in_progress);
+
+ // if the mirror uuid is updated, treat it as the removal of all
+ // images in the pool
+ if (m_mirror_uuid != m_pending_mirror_uuid) {
+ if (!m_mirror_uuid.empty()) {
+ dout(0) << "mirror uuid updated:"
+ << "old=" << m_mirror_uuid << ", "
+ << "new=" << m_pending_mirror_uuid << dendl;
+ }
+
+ mirror_uuid = m_mirror_uuid;
+ removed_image_ids = std::move(m_image_ids);
+ m_image_ids.clear();
+ }
+ }
+
+ if (!removed_image_ids.empty()) {
+ m_listener.handle_update(mirror_uuid, {}, std::move(removed_image_ids));
+ removed_image_ids.clear();
+ }
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_notify_listener_in_progress);
+
+ // if the watch failed while we didn't own the lock, we are going
+ // to need to perform a full refresh
+ if (m_image_ids_invalid) {
+ m_notify_listener_in_progress = false;
+ return;
+ }
+
+ // merge add/remove notifications into pending set (a given image
+ // can only be in one set or another)
+ for (auto &image_id : m_pending_removed_image_ids) {
+ dout(20) << "image_id=" << image_id << dendl;
+ m_pending_image_ids.erase(image_id);
+ }
+
+ for (auto &image_id : m_pending_added_image_ids) {
+ dout(20) << "image_id=" << image_id << dendl;
+ m_pending_image_ids.erase(image_id);
+ m_pending_image_ids.insert(image_id);
+ }
+ m_pending_added_image_ids.clear();
+
+ // compute added/removed images
+ for (auto &image_id : m_image_ids) {
+ auto it = m_pending_image_ids.find(image_id);
+ if (it == m_pending_image_ids.end() || it->id != image_id.id) {
+ removed_image_ids.insert(image_id);
+ }
+ }
+ for (auto &image_id : m_pending_image_ids) {
+ auto it = m_image_ids.find(image_id);
+ if (it == m_image_ids.end() || it->id != image_id.id) {
+ added_image_ids.insert(image_id);
+ }
+ }
+
+ m_pending_updates = false;
+ m_image_ids = m_pending_image_ids;
+
+ m_mirror_uuid = m_pending_mirror_uuid;
+ mirror_uuid = m_mirror_uuid;
+ }
+
+ m_listener.handle_update(mirror_uuid, std::move(added_image_ids),
+ std::move(removed_image_ids));
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_notify_listener_in_progress = false;
+ if (m_pending_updates) {
+ schedule_listener();
+ }
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::PoolWatcher<librbd::ImageCtx>;
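
Note (not part of the patch): PoolWatcher drives a pool_watcher::Listener whose exact declaration lives in tools/rbd_mirror/pool_watcher/Types.h and is not included in this hunk. A minimal sketch of a listener, assuming Listener declares a virtual handle_update() compatible with the calls made in notify_listener() above:

  #include <iostream>
  #include "tools/rbd_mirror/Types.h"
  #include "tools/rbd_mirror/pool_watcher/Types.h"

  // Hypothetical listener that just counts the images reported per update.
  struct LoggingListener : public rbd::mirror::pool_watcher::Listener {
    void handle_update(const std::string &mirror_uuid,
                       rbd::mirror::ImageIds &&added_image_ids,
                       rbd::mirror::ImageIds &&removed_image_ids) override {
      std::cout << "mirror_uuid=" << mirror_uuid
                << " added=" << added_image_ids.size()
                << " removed=" << removed_image_ids.size() << std::endl;
    }
  };

A watcher would then be constructed via PoolWatcher<>::create(threads, remote_io_ctx, listener) and started with init(), matching the factory declared in PoolWatcher.h below.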
diff --git a/src/tools/rbd_mirror/PoolWatcher.h b/src/tools/rbd_mirror/PoolWatcher.h
new file mode 100644
index 00000000..1136a319
--- /dev/null
+++ b/src/tools/rbd_mirror/PoolWatcher.h
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_H
+#define CEPH_RBD_MIRROR_POOL_WATCHER_H
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+
+#include "common/AsyncOpTracker.h"
+#include "common/ceph_context.h"
+#include "common/Mutex.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include <boost/functional/hash.hpp>
+#include <boost/optional.hpp>
+#include "include/ceph_assert.h"
+#include "tools/rbd_mirror/pool_watcher/Types.h"
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+/**
+ * Keeps track of images that have mirroring enabled within the
+ * remote pool being watched.
+ */
+template <typename ImageCtxT = librbd::ImageCtx>
+class PoolWatcher {
+public:
+ static PoolWatcher* create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &remote_io_ctx,
+ pool_watcher::Listener &listener) {
+ return new PoolWatcher(threads, remote_io_ctx, listener);
+ }
+
+ PoolWatcher(Threads<ImageCtxT> *threads, librados::IoCtx &remote_io_ctx,
+ pool_watcher::Listener &listener);
+ ~PoolWatcher();
+ PoolWatcher(const PoolWatcher&) = delete;
+ PoolWatcher& operator=(const PoolWatcher&) = delete;
+
+ bool is_blacklisted() const;
+
+ void init(Context *on_finish = nullptr);
+ void shut_down(Context *on_finish);
+
+ inline uint64_t get_image_count() const {
+ Mutex::Locker locker(m_lock);
+ return m_image_ids.size();
+ }
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT
+ * |
+ * v
+ * REGISTER_WATCHER
+ * |
+ * |/--------------------------------\
+ * | |
+ * v |
+ * REFRESH_IMAGES |
+ * | |
+ * |/----------------------------\ |
+ * | | |
+ * v | |
+ * GET_MIRROR_UUID | |
+ * | | |
+ * v | |
+ * NOTIFY_LISTENER | |
+ * | | |
+ * v | |
+ * IDLE ---\ | |
+ * | | | |
+ * | |\---> IMAGE_UPDATED | |
+ * | | | | |
+ * | | v | |
+ * | | GET_IMAGE_NAME --/ |
+ * | | |
+ * | \----> WATCH_ERROR ---------/
+ * v
+ * SHUT_DOWN
+ * |
+ * v
+ * UNREGISTER_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ class MirroringWatcher;
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx m_remote_io_ctx;
+ pool_watcher::Listener &m_listener;
+
+ ImageIds m_refresh_image_ids;
+ bufferlist m_out_bl;
+
+ mutable Mutex m_lock;
+
+ Context *m_on_init_finish = nullptr;
+
+ ImageIds m_image_ids;
+ std::string m_mirror_uuid;
+
+ bool m_pending_updates = false;
+ bool m_notify_listener_in_progress = false;
+ ImageIds m_pending_image_ids;
+ ImageIds m_pending_added_image_ids;
+ ImageIds m_pending_removed_image_ids;
+
+ std::string m_pending_mirror_uuid;
+
+ MirroringWatcher *m_mirroring_watcher;
+
+ Context *m_timer_ctx = nullptr;
+
+ AsyncOpTracker m_async_op_tracker;
+ bool m_blacklisted = false;
+ bool m_shutting_down = false;
+ bool m_image_ids_invalid = true;
+ bool m_refresh_in_progress = false;
+ bool m_deferred_refresh = false;
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+ void unregister_watcher();
+
+ void refresh_images();
+ void handle_refresh_images(int r);
+
+ void schedule_refresh_images(double interval);
+ void process_refresh_images();
+
+ void get_mirror_uuid();
+ void handle_get_mirror_uuid(int r);
+
+ void handle_rewatch_complete(int r);
+ void handle_image_updated(const std::string &remote_image_id,
+ const std::string &global_image_id,
+ bool enabled);
+
+ void schedule_listener();
+ void notify_listener();
+
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::PoolWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_POOL_WATCHER_H
diff --git a/src/tools/rbd_mirror/ProgressContext.h b/src/tools/rbd_mirror/ProgressContext.h
new file mode 100644
index 00000000..e4430ee6
--- /dev/null
+++ b/src/tools/rbd_mirror/ProgressContext.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_PROGRESS_CONTEXT_H
+#define RBD_MIRROR_PROGRESS_CONTEXT_H
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext
+{
+public:
+ virtual ~ProgressContext() {}
+ virtual void update_progress(const std::string &description,
+ bool flush = true) = 0;
+};
+
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_PROGRESS_CONTEXT_H
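
Note (not part of the patch): a minimal sketch of a concrete progress sink implementing the interface above; the only contract is update_progress(), which receives a human-readable description and an optional flush hint.

  #include <iostream>
  #include <string>
  #include "tools/rbd_mirror/ProgressContext.h"

  // Writes each progress update to stderr; flush controls whether the line
  // is pushed out immediately.
  class StderrProgressContext : public rbd::mirror::ProgressContext {
  public:
    void update_progress(const std::string &description,
                         bool flush = true) override {
      std::cerr << description << "\n";
      if (flush) {
        std::cerr.flush();
      }
    }
  };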
diff --git a/src/tools/rbd_mirror/ServiceDaemon.cc b/src/tools/rbd_mirror/ServiceDaemon.cc
new file mode 100644
index 00000000..f3b549b8
--- /dev/null
+++ b/src/tools/rbd_mirror/ServiceDaemon.cc
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/ServiceDaemon.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Timer.h"
+#include "tools/rbd_mirror/Threads.h"
+#include <sstream>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::ServiceDaemon: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+
+namespace {
+
+const std::string RBD_MIRROR_AUTH_ID_PREFIX("rbd-mirror.");
+
+struct AttributeDumpVisitor : public boost::static_visitor<void> {
+ ceph::Formatter *f;
+ const std::string& name;
+
+ AttributeDumpVisitor(ceph::Formatter *f, const std::string& name)
+ : f(f), name(name) {
+ }
+
+ void operator()(bool val) const {
+ f->dump_bool(name.c_str(), val);
+ }
+ void operator()(uint64_t val) const {
+ f->dump_unsigned(name.c_str(), val);
+ }
+ void operator()(const std::string& val) const {
+ f->dump_string(name.c_str(), val);
+ }
+};
+
+} // anonymous namespace
+
+using namespace service_daemon;
+
+template <typename I>
+ServiceDaemon<I>::ServiceDaemon(CephContext *cct, RadosRef rados,
+ Threads<I>* threads)
+ : m_cct(cct), m_rados(rados), m_threads(threads),
+ m_lock("rbd::mirror::ServiceDaemon") {
+ dout(20) << dendl;
+}
+
+template <typename I>
+ServiceDaemon<I>::~ServiceDaemon() {
+ dout(20) << dendl;
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ if (m_timer_ctx != nullptr) {
+ m_threads->timer->cancel_event(m_timer_ctx);
+ update_status();
+ }
+}
+
+template <typename I>
+int ServiceDaemon<I>::init() {
+ dout(20) << dendl;
+
+ std::string id = m_cct->_conf->name.get_id();
+ if (id.find(RBD_MIRROR_AUTH_ID_PREFIX) == 0) {
+ id = id.substr(RBD_MIRROR_AUTH_ID_PREFIX.size());
+ }
+
+ std::string instance_id = stringify(m_rados->get_instance_id());
+ std::map<std::string, std::string> service_metadata = {
+ {"id", id}, {"instance_id", instance_id}};
+ int r = m_rados->service_daemon_register("rbd-mirror", instance_id,
+ service_metadata);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void ServiceDaemon<I>::add_pool(int64_t pool_id, const std::string& pool_name) {
+ dout(20) << "pool_id=" << pool_id << ", pool_name=" << pool_name << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_pools.insert({pool_id, {pool_name}});
+ }
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_pool(int64_t pool_id) {
+ dout(20) << "pool_id=" << pool_id << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ m_pools.erase(pool_id);
+ }
+ schedule_update_status();
+}
+
+template <typename I>
+uint64_t ServiceDaemon<I>::add_or_update_callout(int64_t pool_id,
+ uint64_t callout_id,
+ CalloutLevel callout_level,
+ const std::string& text) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "callout_id=" << callout_id << ", "
+ << "callout_level=" << callout_level << ", "
+ << "text=" << text << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return CALLOUT_ID_NONE;
+ }
+
+ if (callout_id == CALLOUT_ID_NONE) {
+ callout_id = ++m_callout_id;
+ }
+ pool_it->second.callouts[callout_id] = {callout_level, text};
+ }
+
+ schedule_update_status();
+ return callout_id;
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_callout(int64_t pool_id, uint64_t callout_id) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "callout_id=" << callout_id << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.callouts.erase(callout_id);
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::add_or_update_attribute(int64_t pool_id,
+ const std::string& key,
+ const AttributeValue& value) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "key=" << key << ", "
+ << "value=" << value << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.attributes[key] = value;
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::remove_attribute(int64_t pool_id,
+ const std::string& key) {
+ dout(20) << "pool_id=" << pool_id << ", "
+ << "key=" << key << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ auto pool_it = m_pools.find(pool_id);
+ if (pool_it == m_pools.end()) {
+ return;
+ }
+ pool_it->second.attributes.erase(key);
+ }
+
+ schedule_update_status();
+}
+
+template <typename I>
+void ServiceDaemon<I>::schedule_update_status() {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ if (m_timer_ctx != nullptr) {
+ return;
+ }
+
+ m_timer_ctx = new FunctionContext([this](int) {
+ m_timer_ctx = nullptr;
+ update_status();
+ });
+ m_threads->timer->add_event_after(1, m_timer_ctx);
+}
+
+template <typename I>
+void ServiceDaemon<I>::update_status() {
+ dout(20) << dendl;
+ ceph_assert(m_threads->timer_lock.is_locked());
+
+ ceph::JSONFormatter f;
+ {
+ Mutex::Locker locker(m_lock);
+ f.open_object_section("pools");
+ for (auto& pool_pair : m_pools) {
+ f.open_object_section(stringify(pool_pair.first).c_str());
+ f.dump_string("name", pool_pair.second.name);
+ f.open_object_section("callouts");
+ for (auto& callout : pool_pair.second.callouts) {
+ f.open_object_section(stringify(callout.first).c_str());
+ f.dump_string("level", stringify(callout.second.level).c_str());
+ f.dump_string("text", callout.second.text.c_str());
+ f.close_section();
+ }
+ f.close_section(); // callouts
+
+ for (auto& attribute : pool_pair.second.attributes) {
+ AttributeDumpVisitor attribute_dump_visitor(&f, attribute.first);
+ boost::apply_visitor(attribute_dump_visitor, attribute.second);
+ }
+ f.close_section(); // pool
+ }
+ f.close_section(); // pools
+ }
+
+ std::stringstream ss;
+ f.flush(ss);
+
+ int r = m_rados->service_daemon_update_status({{"json", ss.str()}});
+ if (r < 0) {
+ derr << "failed to update service daemon status: " << cpp_strerror(r)
+ << dendl;
+ }
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>;
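
Note (not part of the patch): update_status() above flattens the pool map into a JSON blob and publishes it under the "json" key via service_daemon_update_status(). Given one pool with one callout and a couple of hypothetical attributes, the payload has roughly this shape (the level string depends on CalloutLevel's stream operator; all values here are placeholders):

  {
    "pools": {
      "1": {
        "name": "rbd",
        "callouts": {
          "1": {
            "level": "warning",
            "text": "unable to connect to remote cluster"
          }
        },
        "instance_id": "4123",
        "leader": true
      }
    }
  }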
diff --git a/src/tools/rbd_mirror/ServiceDaemon.h b/src/tools/rbd_mirror/ServiceDaemon.h
new file mode 100644
index 00000000..1de7e20b
--- /dev/null
+++ b/src/tools/rbd_mirror/ServiceDaemon.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_H
+#define CEPH_RBD_MIRROR_SERVICE_DAEMON_H
+
+#include "common/Mutex.h"
+#include "tools/rbd_mirror/Types.h"
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <map>
+#include <string>
+
+struct CephContext;
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ServiceDaemon {
+public:
+ ServiceDaemon(CephContext *cct, RadosRef rados, Threads<ImageCtxT>* threads);
+ ~ServiceDaemon();
+
+ int init();
+
+ void add_pool(int64_t pool_id, const std::string& pool_name);
+ void remove_pool(int64_t pool_id);
+
+ uint64_t add_or_update_callout(int64_t pool_id, uint64_t callout_id,
+ service_daemon::CalloutLevel callout_level,
+ const std::string& text);
+ void remove_callout(int64_t pool_id, uint64_t callout_id);
+
+ void add_or_update_attribute(int64_t pool_id, const std::string& key,
+ const service_daemon::AttributeValue& value);
+ void remove_attribute(int64_t pool_id, const std::string& key);
+
+private:
+ struct Callout {
+ service_daemon::CalloutLevel level;
+ std::string text;
+
+ Callout() : level(service_daemon::CALLOUT_LEVEL_INFO) {
+ }
+ Callout(service_daemon::CalloutLevel level, const std::string& text)
+ : level(level), text(text) {
+ }
+ };
+ typedef std::map<uint64_t, Callout> Callouts;
+ typedef std::map<std::string, service_daemon::AttributeValue> Attributes;
+
+ struct Pool {
+ std::string name;
+ Callouts callouts;
+ Attributes attributes;
+
+ Pool(const std::string& name) : name(name) {
+ }
+ };
+
+ typedef std::map<int64_t, Pool> Pools;
+
+ CephContext *m_cct;
+ RadosRef m_rados;
+ Threads<ImageCtxT>* m_threads;
+
+ Mutex m_lock;
+ Pools m_pools;
+ uint64_t m_callout_id = service_daemon::CALLOUT_ID_NONE;
+
+ Context* m_timer_ctx = nullptr;
+
+ void schedule_update_status();
+ void update_status();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::ServiceDaemon<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_H
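
Note (not part of the patch): a hedged sketch of how a caller might use the API declared above to surface per-pool health. The pool name, attribute key/value and callout text are placeholders.

  #include <string>
  #include "tools/rbd_mirror/ServiceDaemon.h"
  #include "tools/rbd_mirror/service_daemon/Types.h"

  // `daemon` must already have been init()'d.
  void report_pool_issue(rbd::mirror::ServiceDaemon<> *daemon, int64_t pool_id) {
    daemon->add_pool(pool_id, "rbd");
    daemon->add_or_update_attribute(pool_id, "instance_id", std::string("4123"));

    // CALLOUT_ID_NONE asks the daemon to allocate a fresh callout id; keep the
    // returned id to update or clear the callout later.
    uint64_t callout_id = daemon->add_or_update_callout(
      pool_id, rbd::mirror::service_daemon::CALLOUT_ID_NONE,
      rbd::mirror::service_daemon::CALLOUT_LEVEL_INFO,
      "example: unable to connect to remote cluster");
    daemon->remove_callout(pool_id, callout_id);
  }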
diff --git a/src/tools/rbd_mirror/Threads.cc b/src/tools/rbd_mirror/Threads.cc
new file mode 100644
index 00000000..ca0a8b0f
--- /dev/null
+++ b/src/tools/rbd_mirror/Threads.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/Threads.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+
+namespace rbd {
+namespace mirror {
+
+template <typename I>
+Threads<I>::Threads(CephContext *cct) : timer_lock("Threads::timer_lock") {
+ thread_pool = new ThreadPool(cct, "Journaler::thread_pool", "tp_journal",
+ cct->_conf.get_val<uint64_t>("rbd_op_threads"),
+ "rbd_op_threads");
+ thread_pool->start();
+
+ work_queue = new ContextWQ("Journaler::work_queue",
+ cct->_conf.get_val<uint64_t>("rbd_op_thread_timeout"),
+ thread_pool);
+
+ timer = new SafeTimer(cct, timer_lock, true);
+ timer->init();
+}
+
+template <typename I>
+Threads<I>::~Threads() {
+ {
+ Mutex::Locker timer_locker(timer_lock);
+ timer->shutdown();
+ }
+ delete timer;
+
+ work_queue->drain();
+ delete work_queue;
+
+ thread_pool->stop();
+ delete thread_pool;
+}
+
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::Threads<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/Threads.h b/src/tools/rbd_mirror/Threads.h
new file mode 100644
index 00000000..f52e8837
--- /dev/null
+++ b/src/tools/rbd_mirror/Threads.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_THREADS_H
+#define CEPH_RBD_MIRROR_THREADS_H
+
+#include "common/Mutex.h"
+
+class CephContext;
+class ContextWQ;
+class SafeTimer;
+class ThreadPool;
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+struct Threads {
+ ThreadPool *thread_pool = nullptr;
+ ContextWQ *work_queue = nullptr;
+
+ SafeTimer *timer = nullptr;
+ Mutex timer_lock;
+
+ explicit Threads(CephContext *cct);
+ Threads(const Threads&) = delete;
+ Threads& operator=(const Threads&) = delete;
+
+ ~Threads();
+};
+
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::Threads<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_THREADS_H
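
Note (not part of the patch): the rest of rbd-mirror uses this struct in two ways, both visible elsewhere in this diff -- queueing deferred work on work_queue and arming the shared timer while holding timer_lock. A small sketch:

  #include "include/Context.h"
  #include "common/Timer.h"
  #include "common/WorkQueue.h"
  #include "tools/rbd_mirror/Threads.h"

  void example_usage(rbd::mirror::Threads<> *threads) {
    // deferred work runs on one of the rbd_op_threads worker threads
    threads->work_queue->queue(new FunctionContext([](int r) {
      // ... asynchronous work ...
    }), 0);

    // SafeTimer must only be manipulated while its lock is held
    Mutex::Locker timer_locker(threads->timer_lock);
    threads->timer->add_event_after(5, new FunctionContext([](int r) {
      // fires roughly five seconds later on the timer thread
    }));
  }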
diff --git a/src/tools/rbd_mirror/Types.cc b/src/tools/rbd_mirror/Types.cc
new file mode 100644
index 00000000..74fe318e
--- /dev/null
+++ b/src/tools/rbd_mirror/Types.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/Types.h"
+
+namespace rbd {
+namespace mirror {
+
+std::ostream &operator<<(std::ostream &os, const ImageId &image_id) {
+ return os << "global id=" << image_id.global_id << ", "
+ << "id=" << image_id.id;
+}
+
+std::ostream& operator<<(std::ostream& lhs, const PeerSpec &peer) {
+ return lhs << "uuid: " << peer.uuid
+ << " cluster: " << peer.cluster_name
+ << " client: " << peer.client_name;
+}
+
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/Types.h b/src/tools/rbd_mirror/Types.h
new file mode 100644
index 00000000..ed3b9d8a
--- /dev/null
+++ b/src/tools/rbd_mirror/Types.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_TYPES_H
+#define CEPH_RBD_MIRROR_TYPES_H
+
+#include <iostream>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+
+namespace rbd {
+namespace mirror {
+
+// Performance counters
+enum {
+ l_rbd_mirror_first = 27000,
+ l_rbd_mirror_replay,
+ l_rbd_mirror_replay_bytes,
+ l_rbd_mirror_replay_latency,
+ l_rbd_mirror_last,
+};
+
+typedef std::shared_ptr<librados::Rados> RadosRef;
+typedef std::shared_ptr<librados::IoCtx> IoCtxRef;
+typedef std::shared_ptr<librbd::Image> ImageRef;
+
+struct ImageId {
+ std::string global_id;
+ std::string id;
+
+ explicit ImageId(const std::string &global_id) : global_id(global_id) {
+ }
+ ImageId(const std::string &global_id, const std::string &id)
+ : global_id(global_id), id(id) {
+ }
+
+ inline bool operator==(const ImageId &rhs) const {
+ return (global_id == rhs.global_id && id == rhs.id);
+ }
+ inline bool operator<(const ImageId &rhs) const {
+ return global_id < rhs.global_id;
+ }
+};
+
+std::ostream &operator<<(std::ostream &, const ImageId &image_id);
+
+typedef std::set<ImageId> ImageIds;
+
+struct Peer {
+ std::string peer_uuid;
+ librados::IoCtx io_ctx;
+
+ Peer() {
+ }
+ Peer(const std::string &peer_uuid) : peer_uuid(peer_uuid) {
+ }
+ Peer(const std::string &peer_uuid, librados::IoCtx& io_ctx)
+ : peer_uuid(peer_uuid), io_ctx(io_ctx) {
+ }
+
+ inline bool operator<(const Peer &rhs) const {
+ return peer_uuid < rhs.peer_uuid;
+ }
+};
+
+typedef std::set<Peer> Peers;
+
+struct PeerSpec {
+ PeerSpec() = default;
+ PeerSpec(const std::string &uuid, const std::string &cluster_name,
+ const std::string &client_name)
+ : uuid(uuid), cluster_name(cluster_name), client_name(client_name)
+ {
+ }
+ PeerSpec(const librbd::mirror_peer_t &peer) :
+ uuid(peer.uuid),
+ cluster_name(peer.cluster_name),
+ client_name(peer.client_name)
+ {
+ }
+
+ std::string uuid;
+ std::string cluster_name;
+ std::string client_name;
+
+ /// optional config properties
+ std::string mon_host;
+ std::string key;
+
+ bool operator==(const PeerSpec& rhs) const {
+ return (uuid == rhs.uuid &&
+ cluster_name == rhs.cluster_name &&
+ client_name == rhs.client_name &&
+ mon_host == rhs.mon_host &&
+ key == rhs.key);
+ }
+ bool operator<(const PeerSpec& rhs) const {
+ if (uuid != rhs.uuid) {
+ return uuid < rhs.uuid;
+ } else if (cluster_name != rhs.cluster_name) {
+ return cluster_name < rhs.cluster_name;
+ } else if (client_name != rhs.client_name) {
+ return client_name < rhs.client_name;
+    } else if (mon_host != rhs.mon_host) {
+ return mon_host < rhs.mon_host;
+ } else {
+ return key < rhs.key;
+ }
+ }
+};
+
+std::ostream& operator<<(std::ostream& lhs, const PeerSpec &peer);
+
+} // namespace mirror
+} // namespace rbd
+
+
+#endif // CEPH_RBD_MIRROR_TYPES_H
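
Note (not part of the patch): ImageId orders by global_id alone, so an ImageIds set is effectively keyed by the global image id; this is what lets PoolWatcher<I>::notify_listener() replace a stale local id with an erase followed by an insert. A small self-contained illustration:

  #include <cassert>
  #include "tools/rbd_mirror/Types.h"

  void image_id_ordering_example() {
    rbd::mirror::ImageIds ids;
    ids.insert({"global-1", "local-a"});

    // lookup matches on global_id even though the local id differs ...
    auto it = ids.find(rbd::mirror::ImageId("global-1", "local-b"));
    assert(it != ids.end() && it->id == "local-a");

    // ... so swapping in the new local id requires erase + insert
    ids.erase(rbd::mirror::ImageId("global-1", "local-b"));
    ids.insert({"global-1", "local-b"});
  }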
diff --git a/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc
new file mode 100644
index 00000000..a0e9fd90
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Policy.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::SnapshotPurgeRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+void SnapshotPurgeRequest<I>::send() {
+ open_image();
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::open_image() {
+ dout(10) << dendl;
+ m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, false);
+
+ {
+ RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+ m_image_ctx->set_journal_policy(new JournalPolicy());
+ }
+
+ Context *ctx = create_context_callback<
+ SnapshotPurgeRequest<I>, &SnapshotPurgeRequest<I>::handle_open_image>(
+ this);
+ m_image_ctx->state->open(librbd::OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::handle_open_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to open image '" << m_image_id << "': " << cpp_strerror(r)
+ << dendl;
+ m_image_ctx->destroy();
+ m_image_ctx = nullptr;
+
+ finish(r);
+ return;
+ }
+
+ acquire_lock();
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::acquire_lock() {
+ dout(10) << dendl;
+
+ m_image_ctx->owner_lock.get_read();
+ if (m_image_ctx->exclusive_lock == nullptr) {
+ m_image_ctx->owner_lock.put_read();
+
+ derr << "exclusive lock not enabled" << dendl;
+ m_ret_val = -EINVAL;
+ close_image();
+ return;
+ }
+
+ m_image_ctx->exclusive_lock->acquire_lock(create_context_callback<
+ SnapshotPurgeRequest<I>, &SnapshotPurgeRequest<I>::handle_acquire_lock>(
+ this));
+ m_image_ctx->owner_lock.put_read();
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::handle_acquire_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to acquire exclusive lock: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ {
+ RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+ m_snaps = m_image_ctx->snaps;
+ }
+ snap_unprotect();
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::snap_unprotect() {
+ if (m_snaps.empty()) {
+ close_image();
+ return;
+ }
+
+ librados::snap_t snap_id = m_snaps.back();
+ m_image_ctx->snap_lock.get_read();
+ int r = m_image_ctx->get_snap_namespace(snap_id, &m_snap_namespace);
+ if (r < 0) {
+ m_image_ctx->snap_lock.put_read();
+
+ derr << "failed to get snap namespace: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ r = m_image_ctx->get_snap_name(snap_id, &m_snap_name);
+ if (r < 0) {
+ m_image_ctx->snap_lock.put_read();
+
+ derr << "failed to get snap name: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ bool is_protected;
+ r = m_image_ctx->is_snap_protected(snap_id, &is_protected);
+ if (r < 0) {
+ m_image_ctx->snap_lock.put_read();
+
+ derr << "failed to get snap protection status: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+ m_image_ctx->snap_lock.put_read();
+
+ if (!is_protected) {
+ snap_remove();
+ return;
+ }
+
+ dout(10) << "snap_id=" << snap_id << ", "
+ << "snap_namespace=" << m_snap_namespace << ", "
+ << "snap_name=" << m_snap_name << dendl;
+
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ derr << "lost exclusive lock" << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_snap_unprotect(r);
+ finish_op_ctx->complete(0);
+ });
+ RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+ m_image_ctx->operations->execute_snap_unprotect(
+ m_snap_namespace, m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::handle_snap_unprotect(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -EBUSY) {
+ dout(10) << "snapshot in-use" << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ } else if (r < 0) {
+ derr << "failed to unprotect snapshot: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ {
+ // avoid the need to refresh to delete the newly unprotected snapshot
+ RWLock::RLocker snap_locker(m_image_ctx->snap_lock);
+ librados::snap_t snap_id = m_snaps.back();
+ auto snap_info_it = m_image_ctx->snap_info.find(snap_id);
+ if (snap_info_it != m_image_ctx->snap_info.end()) {
+ snap_info_it->second.protection_status =
+ RBD_PROTECTION_STATUS_UNPROTECTED;
+ }
+ }
+
+ snap_remove();
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::snap_remove() {
+ librados::snap_t snap_id = m_snaps.back();
+ dout(10) << "snap_id=" << snap_id << ", "
+ << "snap_namespace=" << m_snap_namespace << ", "
+ << "snap_name=" << m_snap_name << dendl;
+
+ int r;
+ auto finish_op_ctx = start_lock_op(&r);
+ if (finish_op_ctx == nullptr) {
+ derr << "lost exclusive lock" << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ auto ctx = new FunctionContext([this, finish_op_ctx](int r) {
+ handle_snap_remove(r);
+ finish_op_ctx->complete(0);
+ });
+ RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+ m_image_ctx->operations->execute_snap_remove(
+ m_snap_namespace, m_snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::handle_snap_remove(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -EBUSY) {
+ dout(10) << "snapshot in-use" << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ } else if (r < 0) {
+ derr << "failed to remove snapshot: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ m_snaps.pop_back();
+ snap_unprotect();
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::close_image() {
+ dout(10) << dendl;
+
+ m_image_ctx->state->close(create_context_callback<
+ SnapshotPurgeRequest<I>,
+ &SnapshotPurgeRequest<I>::handle_close_image>(this));
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::handle_close_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_image_ctx->destroy();
+ m_image_ctx = nullptr;
+
+ if (r < 0) {
+ derr << "failed to close: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+ finish(0);
+}
+
+template <typename I>
+void SnapshotPurgeRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+template <typename I>
+Context *SnapshotPurgeRequest<I>::start_lock_op(int* r) {
+ RWLock::RLocker owner_locker(m_image_ctx->owner_lock);
+ return m_image_ctx->exclusive_lock->start_op(r);
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::SnapshotPurgeRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h
new file mode 100644
index 00000000..b8b635fe
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <string>
+#include <vector>
+
+class Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SnapshotPurgeRequest {
+public:
+ static SnapshotPurgeRequest* create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ Context *on_finish) {
+ return new SnapshotPurgeRequest(io_ctx, image_id, on_finish);
+ }
+
+ SnapshotPurgeRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE
+ * |
+ * v
+ * ACQUIRE_LOCK
+ * |
+ * | (repeat for each snapshot)
+ * |/------------------------\
+ * | |
+ * v (skip if not needed) |
+ * SNAP_UNPROTECT |
+ * | |
+ * v (skip if not needed) |
+ * SNAP_REMOVE -----------------/
+ * |
+ * v
+ * CLOSE_IMAGE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ Context *m_on_finish;
+
+ ImageCtxT *m_image_ctx = nullptr;
+ int m_ret_val = 0;
+
+ std::vector<librados::snap_t> m_snaps;
+ cls::rbd::SnapshotNamespace m_snap_namespace;
+ std::string m_snap_name;
+
+ void open_image();
+ void handle_open_image(int r);
+
+ void acquire_lock();
+ void handle_acquire_lock(int r);
+
+ void snap_unprotect();
+ void handle_snap_unprotect(int r);
+
+ void snap_remove();
+ void handle_snap_remove(int r);
+
+ void close_image();
+ void handle_close_image(int r);
+
+ void finish(int r);
+
+ Context *start_lock_op(int* r);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::SnapshotPurgeRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_SNAPSHOT_PURGE_REQUEST_H
+
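Note (not part of the patch): a hedged usage sketch. The request manages its own lifetime, deleting itself after completing the supplied Context, so a caller only needs create() and send():

  #include <string>
  #include "include/Context.h"
  #include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h"

  void purge_snapshots_example(librados::IoCtx &io_ctx,
                               const std::string &image_id) {
    Context *on_finish = new FunctionContext([](int r) {
      // r < 0 (e.g. -EBUSY while a snapshot is still in use) means the
      // purge did not complete
    });
    auto *req = rbd::mirror::image_deleter::SnapshotPurgeRequest<>::create(
      io_ctx, image_id, on_finish);
    req->send();
  }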
diff --git a/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc
new file mode 100644
index 00000000..92db22ca
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/TrashWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/ResetRequest.h"
+#include "librbd/trash/MoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashMoveRequest: " \
+ << this << " " << __func__ << ": "
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void TrashMoveRequest<I>::send() {
+ get_mirror_image_id();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::get_mirror_image_id() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
+
+ auto aio_comp = create_rados_callback<
+ TrashMoveRequest<I>,
+ &TrashMoveRequest<I>::handle_get_mirror_image_id>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_get_mirror_image_id(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto bl_it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_image_id_finish(&bl_it,
+ &m_image_id);
+ }
+ if (r == -ENOENT) {
+ dout(10) << "image " << m_global_image_id << " is not mirrored" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "error retrieving local id for image " << m_global_image_id << ": "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_tag_owner();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::get_tag_owner() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_get_tag_owner>(this);
+ librbd::Journal<I>::get_tag_owner(m_io_ctx, m_image_id, &m_mirror_uuid,
+ m_op_work_queue, ctx);
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_get_tag_owner(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "error retrieving image primary info for image "
+ << m_global_image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ } else if (r != -ENOENT) {
+ if (m_mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ dout(10) << "image " << m_global_image_id << " is local primary" << dendl;
+ finish(-EPERM);
+ return;
+ } else if (m_mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+ !m_resync) {
+ dout(10) << "image " << m_global_image_id << " is orphaned" << dendl;
+ finish(-EPERM);
+ return;
+ }
+ }
+
+ disable_mirror_image();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::disable_mirror_image() {
+ dout(10) << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+ mirror_image.global_image_id = m_global_image_id;
+ mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_set(&op, m_image_id, mirror_image);
+
+ auto aio_comp = create_rados_callback<
+ TrashMoveRequest<I>,
+ &TrashMoveRequest<I>::handle_disable_mirror_image>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_disable_mirror_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(10) << "local image is not mirrored, aborting deletion." << dendl;
+ finish(r);
+ return;
+ } else if (r == -EEXIST || r == -EINVAL) {
+ derr << "cannot disable mirroring for image " << m_global_image_id
+ << ": global_image_id has changed/reused: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "cannot disable mirroring for image " << m_global_image_id
+ << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ reset_journal();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::reset_journal() {
+ dout(10) << dendl;
+
+  // ensure that, if the image is ever recovered, any peers will detect a split-brain
+ auto ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_reset_journal>(this);
+ auto req = librbd::journal::ResetRequest<I>::create(
+ m_io_ctx, m_image_id, librbd::Journal<>::IMAGE_CLIENT_ID,
+ librbd::Journal<>::LOCAL_MIRROR_UUID, m_op_work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_reset_journal(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to reset journal: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ open_image();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::open_image() {
+ dout(10) << dendl;
+
+ m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, false);
+
+ {
+ // don't attempt to open the journal
+ RWLock::WLocker snap_locker(m_image_ctx->snap_lock);
+ m_image_ctx->set_journal_policy(new JournalPolicy());
+ }
+
+ Context *ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_open_image>(this);
+ m_image_ctx->state->open(librbd::OPEN_FLAG_SKIP_OPEN_PARENT, ctx);
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_open_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to open image: " << cpp_strerror(r) << dendl;
+ m_image_ctx->destroy();
+ m_image_ctx = nullptr;
+ finish(r);
+ return;
+ }
+
+ if (m_image_ctx->old_format) {
+ derr << "cannot move v1 image to trash" << dendl;
+ m_ret_val = -EINVAL;
+ close_image();
+ return;
+ }
+
+ acquire_lock();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::acquire_lock() {
+ m_image_ctx->owner_lock.get_read();
+ if (m_image_ctx->exclusive_lock == nullptr) {
+ derr << "exclusive lock feature not enabled" << dendl;
+ m_image_ctx->owner_lock.put_read();
+ m_ret_val = -EINVAL;
+ close_image();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_acquire_lock>(this);
+ m_image_ctx->exclusive_lock->block_requests(0);
+ m_image_ctx->exclusive_lock->acquire_lock(ctx);
+ m_image_ctx->owner_lock.put_read();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_acquire_lock(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to acquire exclusive lock: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ trash_move();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::trash_move() {
+ dout(10) << dendl;
+
+ utime_t delete_time{ceph_clock_now()};
+ utime_t deferment_end_time{delete_time};
+ deferment_end_time +=
+ m_image_ctx->config.template get_val<uint64_t>("rbd_mirroring_delete_delay");
+
+ m_trash_image_spec = {
+ cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING, m_image_ctx->name, delete_time,
+ deferment_end_time};
+
+ Context *ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_trash_move>(this);
+ auto req = librbd::trash::MoveRequest<I>::create(
+ m_io_ctx, m_image_id, m_trash_image_spec, ctx);
+ req->send();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_trash_move(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to move image to trash: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_image();
+ return;
+ }
+
+ m_moved_to_trash = true;
+ remove_mirror_image();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::remove_mirror_image() {
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::mirror_image_remove(&op, m_image_id);
+
+ auto aio_comp = create_rados_callback<
+ TrashMoveRequest<I>,
+ &TrashMoveRequest<I>::handle_remove_mirror_image>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_remove_mirror_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(10) << "local image is not mirrored" << dendl;
+ } else if (r < 0) {
+ derr << "failed to remove mirror image state for " << m_global_image_id
+ << ": " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+
+ close_image();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::close_image() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_close_image>(this);
+ m_image_ctx->state->close(ctx);
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_close_image(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_image_ctx->destroy();
+ m_image_ctx = nullptr;
+
+ if (r < 0) {
+ derr << "failed to close image: " << cpp_strerror(r) << dendl;
+ }
+
+ // don't send notification if we failed
+ if (!m_moved_to_trash) {
+ finish(0);
+ return;
+ }
+
+ notify_trash_add();
+}
+
+template <typename I>
+void TrashMoveRequest<I>::notify_trash_add() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ TrashMoveRequest<I>, &TrashMoveRequest<I>::handle_notify_trash_add>(this);
+ librbd::TrashWatcher<I>::notify_image_added(m_io_ctx, m_image_id,
+ m_trash_image_spec, ctx);
+}
+
+template <typename I>
+void TrashMoveRequest<I>::handle_notify_trash_add(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to notify trash watchers: " << cpp_strerror(r) << dendl;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void TrashMoveRequest<I>::finish(int r) {
+ if (m_ret_val < 0) {
+ r = m_ret_val;
+ }
+
+ dout(10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::TrashMoveRequest<librbd::ImageCtx>;
+
diff --git a/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h
new file mode 100644
index 00000000..07b7432e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+#include <boost/optional.hpp>
+#include <string>
+
+struct Context;
+class ContextWQ;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashMoveRequest {
+public:
+ static TrashMoveRequest* create(librados::IoCtx& io_ctx,
+ const std::string& global_image_id,
+ bool resync, ContextWQ* op_work_queue,
+ Context* on_finish) {
+ return new TrashMoveRequest(io_ctx, global_image_id, resync, op_work_queue,
+ on_finish);
+ }
+
+ TrashMoveRequest(librados::IoCtx& io_ctx, const std::string& global_image_id,
+ bool resync, ContextWQ* op_work_queue, Context* on_finish)
+ : m_io_ctx(io_ctx), m_global_image_id(global_image_id), m_resync(resync),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_IMAGE_ID
+ * |
+ * v
+ * GET_TAG_OWNER
+ * |
+ * v
+ * DISABLE_MIRROR_IMAGE
+ * |
+ * v
+ * RESET_JOURNAL
+ * |
+ * v
+ * OPEN_IMAGE
+ * |
+ * v
+ * ACQUIRE_LOCK
+ * |
+ * v
+ * TRASH_MOVE
+ * |
+ * v
+ * REMOVE_MIRROR_IMAGE
+ * |
+ * v
+ * CLOSE_IMAGE
+ * |
+ * v
+ * NOTIFY_TRASH_ADD
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_global_image_id;
+ bool m_resync;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ ceph::bufferlist m_out_bl;
+ std::string m_image_id;
+ std::string m_mirror_uuid;
+ cls::rbd::TrashImageSpec m_trash_image_spec;
+  ImageCtxT *m_image_ctx = nullptr;
+ int m_ret_val = 0;
+ bool m_moved_to_trash = false;
+
+ void get_mirror_image_id();
+ void handle_get_mirror_image_id(int r);
+
+ void get_tag_owner();
+ void handle_get_tag_owner(int r);
+
+ void disable_mirror_image();
+ void handle_disable_mirror_image(int r);
+
+ void reset_journal();
+ void handle_reset_journal(int r);
+
+ void open_image();
+ void handle_open_image(int r);
+
+ void acquire_lock();
+ void handle_acquire_lock(int r);
+
+ void trash_move();
+ void handle_trash_move(int r);
+
+ void remove_mirror_image();
+ void handle_remove_mirror_image(int r);
+
+ void close_image();
+ void handle_close_image(int r);
+
+ void notify_trash_add();
+ void handle_notify_trash_add(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::TrashMoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETE_TRASH_MOVE_REQUEST_H
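
Note (not part of the patch): a hedged usage sketch of the factory declared above. The image is addressed by its global image id; setting `resync` to true permits moving an image whose journal tag marks it orphaned:

  #include <string>
  #include "tools/rbd_mirror/image_deleter/TrashMoveRequest.h"

  void trash_move_example(librados::IoCtx &io_ctx,
                          const std::string &global_image_id,
                          ContextWQ *op_work_queue, Context *on_finish) {
    bool resync = false;
    auto *req = rbd::mirror::image_deleter::TrashMoveRequest<>::create(
      io_ctx, global_image_id, resync, op_work_queue, on_finish);
    req->send();  // completes on_finish and deletes itself when done
  }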
diff --git a/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc
new file mode 100644
index 00000000..e7c725dc
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h"
+#include "include/ceph_assert.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/TrashWatcher.h"
+#include "librbd/Utils.h"
+#include "librbd/trash/RemoveRequest.h"
+#include "tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashRemoveRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void TrashRemoveRequest<I>::send() {
+ *m_error_result = ERROR_RESULT_RETRY;
+
+ get_trash_image_spec();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::get_trash_image_spec() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::trash_get_start(&op, m_image_id);
+
+ auto aio_comp = create_rados_callback<
+ TrashRemoveRequest<I>,
+ &TrashRemoveRequest<I>::handle_get_trash_image_spec>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::handle_get_trash_image_spec(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto bl_it = m_out_bl.cbegin();
+ r = librbd::cls_client::trash_get_finish(&bl_it, &m_trash_image_spec);
+ }
+
+ if (r == -ENOENT || (r >= 0 && m_trash_image_spec.source !=
+ cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING)) {
+ dout(10) << "image id " << m_image_id << " not in mirroring trash" << dendl;
+ finish(0);
+ return;
+ } else if (r < 0) {
+ derr << "error getting image id " << m_image_id << " info from trash: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ if (m_trash_image_spec.state != cls::rbd::TRASH_IMAGE_STATE_NORMAL &&
+ m_trash_image_spec.state != cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ dout(10) << "image " << m_image_id << " is not in an expected trash state: "
+ << m_trash_image_spec.state << dendl;
+ *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY;
+ finish(-EBUSY);
+ return;
+ }
+
+ set_trash_state();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::set_trash_state() {
+ if (m_trash_image_spec.state == cls::rbd::TRASH_IMAGE_STATE_REMOVING) {
+ get_snap_context();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ librados::ObjectWriteOperation op;
+ librbd::cls_client::trash_state_set(&op, m_image_id,
+ cls::rbd::TRASH_IMAGE_STATE_REMOVING,
+ cls::rbd::TRASH_IMAGE_STATE_NORMAL);
+
+ auto aio_comp = create_rados_callback<
+ TrashRemoveRequest<I>,
+ &TrashRemoveRequest<I>::handle_set_trash_state>(this);
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::handle_set_trash_state(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(10) << "image id " << m_image_id << " not in mirroring trash" << dendl;
+ finish(0);
+ return;
+ } else if (r < 0 && r != -EOPNOTSUPP) {
+ derr << "error setting trash image state for image id " << m_image_id
+ << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_snap_context();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::get_snap_context() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::get_snapcontext_start(&op);
+
+ std::string header_oid = librbd::util::header_name(m_image_id);
+
+ auto aio_comp = create_rados_callback<
+ TrashRemoveRequest<I>,
+ &TrashRemoveRequest<I>::handle_get_snap_context>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(header_oid, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::handle_get_snap_context(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ ::SnapContext snapc;
+ if (r == 0) {
+ auto bl_it = m_out_bl.cbegin();
+ r = librbd::cls_client::get_snapcontext_finish(&bl_it, &snapc);
+ }
+ if (r < 0 && r != -ENOENT) {
+ derr << "error retrieving snapshot context for image "
+ << m_image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_has_snapshots = (!snapc.empty());
+ purge_snapshots();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::purge_snapshots() {
+ if (!m_has_snapshots) {
+ remove_image();
+ return;
+ }
+
+ dout(10) << dendl;
+ auto ctx = create_context_callback<
+ TrashRemoveRequest<I>,
+ &TrashRemoveRequest<I>::handle_purge_snapshots>(this);
+ auto req = SnapshotPurgeRequest<I>::create(m_io_ctx, m_image_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::handle_purge_snapshots(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == -EBUSY) {
+ dout(10) << "snapshots still in-use" << dendl;
+ *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to purge image snapshots: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ remove_image();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::remove_image() {
+ dout(10) << dendl;
+
+ auto ctx = create_context_callback<
+ TrashRemoveRequest<I>,
+ &TrashRemoveRequest<I>::handle_remove_image>(this);
+ auto req = librbd::trash::RemoveRequest<I>::create(
+ m_io_ctx, m_image_id, m_op_work_queue, true, m_progress_ctx,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::handle_remove_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -ENOTEMPTY) {
+    // image must still have a clone v2 snapshot associated with a child
+ dout(10) << "snapshots still in-use" << dendl;
+ *m_error_result = ERROR_RESULT_RETRY_IMMEDIATELY;
+ finish(-EBUSY);
+ return;
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "error removing image " << m_image_id << " "
+ << "(" << m_image_id << ") from local pool: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ notify_trash_removed();
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::notify_trash_removed() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ TrashRemoveRequest<I>,
+ &TrashRemoveRequest<I>::handle_notify_trash_removed>(this);
+ librbd::TrashWatcher<I>::notify_image_removed(m_io_ctx, m_image_id, ctx);
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::handle_notify_trash_removed(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to notify trash watchers: " << cpp_strerror(r) << dendl;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void TrashRemoveRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::TrashRemoveRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h
new file mode 100644
index 00000000..d2295e8e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/internal.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+#include <string>
+#include <vector>
+
+class Context;
+class ContextWQ;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashRemoveRequest {
+public:
+ static TrashRemoveRequest* create(librados::IoCtx &io_ctx,
+ const std::string &image_id,
+ ErrorResult *error_result,
+ ContextWQ *op_work_queue,
+ Context *on_finish) {
+ return new TrashRemoveRequest(io_ctx, image_id, error_result, op_work_queue,
+ on_finish);
+ }
+
+ TrashRemoveRequest(librados::IoCtx &io_ctx, const std::string &image_id,
+ ErrorResult *error_result, ContextWQ *op_work_queue,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_id(image_id), m_error_result(error_result),
+ m_op_work_queue(op_work_queue), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /*
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_TRASH_IMAGE_SPEC
+ * |
+ * v
+ * SET_TRASH_STATE
+ * |
+ * v
+ * GET_SNAP_CONTEXT
+ * |
+ * v
+ * PURGE_SNAPSHOTS
+ * |
+ * v
+ * TRASH_REMOVE
+ * |
+ * v
+ * NOTIFY_TRASH_REMOVE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_image_id;
+ ErrorResult *m_error_result;
+ ContextWQ *m_op_work_queue;
+ Context *m_on_finish;
+
+ ceph::bufferlist m_out_bl;
+ cls::rbd::TrashImageSpec m_trash_image_spec;
+ bool m_has_snapshots = false;
+ librbd::NoOpProgressContext m_progress_ctx;
+
+ void get_trash_image_spec();
+ void handle_get_trash_image_spec(int r);
+
+ void set_trash_state();
+ void handle_set_trash_state(int r);
+
+ void get_snap_context();
+ void handle_get_snap_context(int r);
+
+ void purge_snapshots();
+ void handle_purge_snapshots(int r);
+
+ void remove_image();
+ void handle_remove_image(int r);
+
+ void notify_trash_removed();
+ void handle_notify_trash_removed(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::TrashRemoveRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_REMOVE_REQUEST_H
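
Note (not part of the patch): a hedged usage sketch. ErrorResult is declared in image_deleter/Types.h (not part of this hunk); on failure the request records a retry hint there, e.g. ERROR_RESULT_RETRY_IMMEDIATELY for -EBUSY, before completing on_finish:

  #include <string>
  #include "tools/rbd_mirror/image_deleter/TrashRemoveRequest.h"
  #include "tools/rbd_mirror/image_deleter/Types.h"

  void trash_remove_example(librados::IoCtx &io_ctx,
                            const std::string &image_id,
                            rbd::mirror::image_deleter::ErrorResult *error_result,
                            ContextWQ *op_work_queue, Context *on_finish) {
    // error_result must outlive the asynchronous request
    auto *req = rbd::mirror::image_deleter::TrashRemoveRequest<>::create(
      io_ctx, image_id, error_result, op_work_queue, on_finish);
    req->send();
  }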
diff --git a/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc b/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc
new file mode 100644
index 00000000..8735dfb7
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashWatcher.cc
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_deleter/TrashWatcher.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_deleter/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_deleter::TrashWatcher: " \
+ << this << " " << __func__ << ": "
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+namespace {
+
+const size_t MAX_RETURN = 1024;
+
+} // anonymous namespace
+
+template <typename I>
+TrashWatcher<I>::TrashWatcher(librados::IoCtx &io_ctx, Threads<I> *threads,
+ TrashListener& trash_listener)
+ : librbd::TrashWatcher<I>(io_ctx, threads->work_queue),
+ m_io_ctx(io_ctx), m_threads(threads), m_trash_listener(trash_listener),
+ m_lock(librbd::util::unique_lock_name(
+ "rbd::mirror::image_deleter::TrashWatcher", this)) {
+}
+
+template <typename I>
+void TrashWatcher<I>::init(Context *on_finish) {
+ dout(5) << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_on_init_finish = on_finish;
+
+ ceph_assert(!m_trash_list_in_progress);
+ m_trash_list_in_progress = true;
+ }
+
+ create_trash();
+}
+
+template <typename I>
+void TrashWatcher<I>::shut_down(Context *on_finish) {
+ dout(5) << dendl;
+
+ {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+
+ ceph_assert(!m_shutting_down);
+ m_shutting_down = true;
+ if (m_timer_ctx != nullptr) {
+ m_threads->timer->cancel_event(m_timer_ctx);
+ m_timer_ctx = nullptr;
+ }
+ }
+
+ auto ctx = new FunctionContext([this, on_finish](int r) {
+ unregister_watcher(on_finish);
+ });
+ m_async_op_tracker.wait_for_ops(ctx);
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_image_added(const std::string &image_id,
+ const cls::rbd::TrashImageSpec& spec) {
+ dout(10) << "image_id=" << image_id << dendl;
+
+ Mutex::Locker locker(m_lock);
+ add_image(image_id, spec);
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_image_removed(const std::string &image_id) {
+ // ignore removals -- the image deleter will ignore -ENOENTs
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_rewatch_complete(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted" << dendl;
+ return;
+ } else if (r == -ENOENT) {
+ dout(5) << "trash directory deleted" << dendl;
+ } else if (r < 0) {
+ derr << "unexpected error re-registering trash directory watch: "
+ << cpp_strerror(r) << dendl;
+ }
+ schedule_trash_list(30);
+}
+
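+// ensure the pool's trash directory object exists (non-exclusive create) so a
+// watch can be registered on it even before any image has been trashed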
+template <typename I>
+void TrashWatcher<I>::create_trash() {
+ dout(20) << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_trash_list_in_progress);
+ }
+
+ librados::ObjectWriteOperation op;
+ op.create(false);
+
+ m_async_op_tracker.start_op();
+ auto aio_comp = create_rados_callback<
+ TrashWatcher<I>, &TrashWatcher<I>::handle_create_trash>(this);
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_create_trash(int r) {
+ dout(20) << "r=" << r << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_trash_list_in_progress);
+ }
+
+ Context* on_init_finish = nullptr;
+ if (r == -EBLACKLISTED || r == -ENOENT) {
+ if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted" << dendl;
+ } else {
+ dout(0) << "detected pool no longer exists" << dendl;
+ }
+
+ Mutex::Locker locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ m_trash_list_in_progress = false;
+ } else if (r < 0 && r != -EEXIST) {
+ derr << "failed to create trash object: " << cpp_strerror(r) << dendl;
+ {
+ Mutex::Locker locker(m_lock);
+ m_trash_list_in_progress = false;
+ }
+
+ schedule_trash_list(30);
+ } else {
+ register_watcher();
+ }
+
+ m_async_op_tracker.finish_op();
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+}
+
+template <typename I>
+void TrashWatcher<I>::register_watcher() {
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_trash_list_in_progress);
+ }
+
+ // if the watch registration is in-flight, let the watcher
+ // handle the transition -- only (re-)register if it's not registered
+ if (!this->is_unregistered()) {
+ trash_list(true);
+ return;
+ }
+
+ // first time registering or the watch failed
+ dout(5) << dendl;
+ m_async_op_tracker.start_op();
+
+ Context *ctx = create_context_callback<
+ TrashWatcher, &TrashWatcher<I>::handle_register_watcher>(this);
+ this->register_watch(ctx);
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_register_watcher(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_trash_list_in_progress);
+ if (r < 0) {
+ m_trash_list_in_progress = false;
+ }
+ }
+
+ Context *on_init_finish = nullptr;
+ if (r >= 0) {
+ trash_list(true);
+ } else if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted" << dendl;
+
+ Mutex::Locker locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ } else {
+ derr << "unexpected error registering trash directory watch: "
+ << cpp_strerror(r) << dendl;
+ schedule_trash_list(10);
+ }
+
+ m_async_op_tracker.finish_op();
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+}
+
+template <typename I>
+void TrashWatcher<I>::unregister_watcher(Context* on_finish) {
+ dout(5) << dendl;
+
+ m_async_op_tracker.start_op();
+ Context *ctx = new FunctionContext([this, on_finish](int r) {
+ handle_unregister_watcher(r, on_finish);
+ });
+ this->unregister_watch(ctx);
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_unregister_watcher(int r, Context* on_finish) {
+ dout(5) << "unregister_watcher: r=" << r << dendl;
+ if (r < 0) {
+ derr << "error unregistering watcher for trash directory: "
+ << cpp_strerror(r) << dendl;
+ }
+ m_async_op_tracker.finish_op();
+ on_finish->complete(0);
+}
+
+template <typename I>
+void TrashWatcher<I>::trash_list(bool initial_request) {
+ if (initial_request) {
+ m_async_op_tracker.start_op();
+ m_last_image_id = "";
+ }
+
+ dout(5) << "last_image_id=" << m_last_image_id << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_trash_list_in_progress);
+ }
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::trash_list_start(&op, m_last_image_id, MAX_RETURN);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ TrashWatcher<I>, &TrashWatcher<I>::handle_trash_list>(this);
+ m_out_bl.clear();
+ int r = m_io_ctx.aio_operate(RBD_TRASH, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void TrashWatcher<I>::handle_trash_list(int r) {
+ dout(5) << "r=" << r << dendl;
+
+ std::map<std::string, cls::rbd::TrashImageSpec> images;
+ if (r >= 0) {
+ auto bl_it = m_out_bl.cbegin();
+ r = librbd::cls_client::trash_list_finish(&bl_it, &images);
+ }
+
+ Context *on_init_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_trash_list_in_progress);
+ if (r >= 0) {
+ for (auto& image : images) {
+ add_image(image.first, image.second);
+ }
+ } else if (r == -ENOENT) {
+ r = 0;
+ }
+
+ if (r == -EBLACKLISTED) {
+ dout(0) << "detected client is blacklisted during trash refresh" << dendl;
+ m_trash_list_in_progress = false;
+ std::swap(on_init_finish, m_on_init_finish);
+ } else if (r >= 0 && images.size() < MAX_RETURN) {
+ m_trash_list_in_progress = false;
+ std::swap(on_init_finish, m_on_init_finish);
+ } else if (r < 0) {
+ m_trash_list_in_progress = false;
+ }
+ }
+
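+  // a full page means more trash entries may remain -- continue listing from
+  // the last image id seen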
+ if (r >= 0 && images.size() == MAX_RETURN) {
+ m_last_image_id = images.rbegin()->first;
+ trash_list(false);
+ return;
+ } else if (r < 0 && r != -EBLACKLISTED) {
+ derr << "failed to retrieve trash directory: " << cpp_strerror(r) << dendl;
+ schedule_trash_list(10);
+ }
+
+ m_async_op_tracker.finish_op();
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+}
+
+template <typename I>
+void TrashWatcher<I>::schedule_trash_list(double interval) {
+ Mutex::Locker timer_locker(m_threads->timer_lock);
+ Mutex::Locker locker(m_lock);
+ if (m_shutting_down || m_trash_list_in_progress || m_timer_ctx != nullptr) {
+ if (m_trash_list_in_progress && !m_deferred_trash_list) {
+ dout(5) << "deferring refresh until in-flight refresh completes" << dendl;
+ m_deferred_trash_list = true;
+ }
+ return;
+ }
+
+ dout(5) << dendl;
+ m_timer_ctx = m_threads->timer->add_event_after(
+ interval,
+ new FunctionContext([this](int r) {
+ process_trash_list();
+ }));
+}
+
+template <typename I>
+void TrashWatcher<I>::process_trash_list() {
+ dout(5) << dendl;
+
+ ceph_assert(m_threads->timer_lock.is_locked());
+ ceph_assert(m_timer_ctx != nullptr);
+ m_timer_ctx = nullptr;
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(!m_trash_list_in_progress);
+ m_trash_list_in_progress = true;
+ }
+
+ // execute outside of the timer's lock
+ m_async_op_tracker.start_op();
+ Context *ctx = new FunctionContext([this](int r) {
+ create_trash();
+ m_async_op_tracker.finish_op();
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+template <typename I>
+void TrashWatcher<I>::add_image(const std::string& image_id,
+ const cls::rbd::TrashImageSpec& spec) {
+ if (spec.source != cls::rbd::TRASH_IMAGE_SOURCE_MIRRORING) {
+ return;
+ }
+
+ ceph_assert(m_lock.is_locked());
+ auto& deferment_end_time = spec.deferment_end_time;
+ dout(10) << "image_id=" << image_id << ", "
+ << "deferment_end_time=" << deferment_end_time << dendl;
+
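+  // deliver the notification via the work queue so the listener callback runs
+  // outside of m_lock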
+ m_async_op_tracker.start_op();
+ auto ctx = new FunctionContext([this, image_id, deferment_end_time](int r) {
+ m_trash_listener.handle_trash_image(image_id, deferment_end_time);
+ m_async_op_tracker.finish_op();
+ });
+ m_threads->work_queue->queue(ctx, 0);
+}
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_deleter::TrashWatcher<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_deleter/TrashWatcher.h b/src/tools/rbd_mirror/image_deleter/TrashWatcher.h
new file mode 100644
index 00000000..b6f69833
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/TrashWatcher.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_WATCHER_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_WATCHER_H
+
+#include "include/rados/librados.hpp"
+#include "common/AsyncOpTracker.h"
+#include "common/Mutex.h"
+#include "librbd/TrashWatcher.h"
+#include <set>
+#include <string>
+
+struct Context;
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace image_deleter {
+
+struct TrashListener;
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class TrashWatcher : public librbd::TrashWatcher<ImageCtxT> {
+public:
+ static TrashWatcher* create(librados::IoCtx &io_ctx,
+ Threads<ImageCtxT> *threads,
+ TrashListener& trash_listener) {
+ return new TrashWatcher(io_ctx, threads, trash_listener);
+ }
+
+ TrashWatcher(librados::IoCtx &io_ctx, Threads<ImageCtxT> *threads,
+ TrashListener& trash_listener);
+ TrashWatcher(const TrashWatcher&) = delete;
+ TrashWatcher& operator=(const TrashWatcher&) = delete;
+
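+  // expected life cycle (sketch): init() registers the watch and kicks off the
+  // initial trash listing; shut_down() should be run to completion before the
+  // watcher is destroyed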
+ void init(Context *on_finish);
+ void shut_down(Context *on_finish);
+
+protected:
+ void handle_image_added(const std::string &image_id,
+ const cls::rbd::TrashImageSpec& spec) override;
+
+ void handle_image_removed(const std::string &image_id) override;
+
+ void handle_rewatch_complete(int r) override;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * INIT
+ * |
+ * v
+ * CREATE_TRASH
+ * |
+ * v
+ * REGISTER_WATCHER
+ * |
+ * |/--------------------------------\
+ * | |
+ * |/---------\ |
+ * | | |
+ * v | (more images) |
+ * TRASH_LIST ---/ |
+ * | |
+ * |/----------------------------\ |
+ * | | |
+ * v | |
+ * <idle> --\ | |
+ * | | | |
+ * | |\---> IMAGE_ADDED -----/ |
+ * | | |
+ * | \----> WATCH_ERROR ---------/
+ * v
+ * SHUT_DOWN
+ * |
+ * v
+ * UNREGISTER_WATCHER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx m_io_ctx;
+ Threads<ImageCtxT> *m_threads;
+ TrashListener& m_trash_listener;
+
+ std::string m_last_image_id;
+ bufferlist m_out_bl;
+
+ mutable Mutex m_lock;
+
+ Context *m_on_init_finish = nullptr;
+ Context *m_timer_ctx = nullptr;
+
+ AsyncOpTracker m_async_op_tracker;
+ bool m_trash_list_in_progress = false;
+ bool m_deferred_trash_list = false;
+ bool m_shutting_down = false;
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+
+ void create_trash();
+ void handle_create_trash(int r);
+
+ void unregister_watcher(Context* on_finish);
+ void handle_unregister_watcher(int r, Context* on_finish);
+
+ void trash_list(bool initial_request);
+ void handle_trash_list(int r);
+
+ void schedule_trash_list(double interval);
+ void process_trash_list();
+
+ void get_mirror_uuid();
+ void handle_get_mirror_uuid(int r);
+
+ void add_image(const std::string& image_id,
+ const cls::rbd::TrashImageSpec& spec);
+
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_deleter::TrashWatcher<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TRASH_WATCHER_H
diff --git a/src/tools/rbd_mirror/image_deleter/Types.h b/src/tools/rbd_mirror/image_deleter/Types.h
new file mode 100644
index 00000000..ac3bc64a
--- /dev/null
+++ b/src/tools/rbd_mirror/image_deleter/Types.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H
+
+#include "include/Context.h"
+#include "librbd/journal/Policy.h"
+#include <string>
+
+struct utime_t;
+
+namespace rbd {
+namespace mirror {
+namespace image_deleter {
+
+enum ErrorResult {
+ ERROR_RESULT_COMPLETE,
+ ERROR_RESULT_RETRY,
+ ERROR_RESULT_RETRY_IMMEDIATELY
+};
+
+struct TrashListener {
+ TrashListener() {
+ }
+ TrashListener(const TrashListener&) = delete;
+ TrashListener& operator=(const TrashListener&) = delete;
+
+ virtual ~TrashListener() {
+ }
+
+ virtual void handle_trash_image(const std::string& image_id,
+ const utime_t& deferment_end_time) = 0;
+
+};
+
+struct JournalPolicy : public librbd::journal::Policy {
+ bool append_disabled() const override {
+ return true;
+ }
+ bool journal_disabled() const override {
+ return true;
+ }
+
+ void allocate_tag_on_lock(Context *on_finish) override {
+ on_finish->complete(0);
+ }
+};
+
+} // namespace image_deleter
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_DELETER_TYPES_H
diff --git a/src/tools/rbd_mirror/image_map/LoadRequest.cc b/src/tools/rbd_mirror/image_map/LoadRequest.cc
new file mode 100644
index 00000000..7387b476
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/LoadRequest.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "librbd/Utils.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+#include "LoadRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::LoadRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+static const uint32_t MAX_RETURN = 1024;
+
+using librbd::util::create_rados_callback;
+
+template<typename I>
+LoadRequest<I>::LoadRequest(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping,
+ Context *on_finish)
+ : m_ioctx(ioctx),
+ m_image_mapping(image_mapping),
+ m_on_finish(on_finish) {
+}
+
+template<typename I>
+void LoadRequest<I>::send() {
+ dout(20) << dendl;
+
+ image_map_list();
+}
+
+template<typename I>
+void LoadRequest<I>::image_map_list() {
+ dout(20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_map_list_start(&op, m_start_after, MAX_RETURN);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ LoadRequest, &LoadRequest::handle_image_map_list>(this);
+
+ m_out_bl.clear();
+ int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template<typename I>
+void LoadRequest<I>::handle_image_map_list(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ std::map<std::string, cls::rbd::MirrorImageMap> image_mapping;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_map_list_finish(&it, &image_mapping);
+ }
+
+ if (r < 0) {
+ derr << ": failed to get image map: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_image_mapping->insert(image_mapping.begin(), image_mapping.end());
+
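+  // a full page implies more mappings may remain -- request the next chunk
+  // starting after the last key returned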
+ if (image_mapping.size() == MAX_RETURN) {
+ m_start_after = image_mapping.rbegin()->first;
+ image_map_list();
+ return;
+ }
+
+ finish(0);
+}
+
+template<typename I>
+void LoadRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_map::LoadRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_map/LoadRequest.h b/src/tools/rbd_mirror/image_map/LoadRequest.h
new file mode 100644
index 00000000..7657e110
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/LoadRequest.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+
+class Context;
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+template<typename ImageCtxT = librbd::ImageCtx>
+class LoadRequest {
+public:
+ static LoadRequest *create(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping,
+ Context *on_finish) {
+ return new LoadRequest(ioctx, image_mapping, on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . . . . . . .
+ * v v . MAX_RETURN
+ * IMAGE_MAP_LIST. . . . . . .
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ LoadRequest(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> *image_mapping,
+ Context *on_finish);
+
+ librados::IoCtx &m_ioctx;
+ std::map<std::string, cls::rbd::MirrorImageMap> *m_image_mapping;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ std::string m_start_after;
+
+ void image_map_list();
+ void handle_image_map_list(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_LOAD_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_map/Policy.cc b/src/tools/rbd_mirror/image_map/Policy.cc
new file mode 100644
index 00000000..6fababdd
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Policy.cc
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "librbd/Utils.h"
+#include "Policy.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::Policy: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+namespace {
+
+bool is_instance_action(ActionType action_type) {
+ switch (action_type) {
+ case ACTION_TYPE_ACQUIRE:
+ case ACTION_TYPE_RELEASE:
+ return true;
+ case ACTION_TYPE_NONE:
+ case ACTION_TYPE_MAP_UPDATE:
+ case ACTION_TYPE_MAP_REMOVE:
+ break;
+ }
+ return false;
+}
+
+} // anonymous namespace
+
+using ::operator<<;
+using librbd::util::unique_lock_name;
+
+Policy::Policy(librados::IoCtx &ioctx)
+ : m_ioctx(ioctx),
+ m_map_lock(unique_lock_name("rbd::mirror::image_map::Policy::m_map_lock",
+ this)) {
+
+  // map should have at least one instance
+ std::string instance_id = stringify(ioctx.get_instance_id());
+ m_map.emplace(instance_id, std::set<std::string>{});
+}
+
+void Policy::init(
+ const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping) {
+ dout(20) << dendl;
+
+ RWLock::WLocker map_lock(m_map_lock);
+ for (auto& it : image_mapping) {
+ ceph_assert(!it.second.instance_id.empty());
+ auto map_result = m_map[it.second.instance_id].emplace(it.first);
+ ceph_assert(map_result.second);
+
+ auto image_state_result = m_image_states.emplace(
+ it.first, ImageState{it.second.instance_id, it.second.mapped_time});
+ ceph_assert(image_state_result.second);
+
+ // ensure we (re)send image acquire actions to the instance
+ auto& image_state = image_state_result.first->second;
+ auto start_action = set_state(&image_state,
+ StateTransition::STATE_INITIALIZING, false);
+ ceph_assert(start_action);
+ }
+}
+
+LookupInfo Policy::lookup(const std::string &global_image_id) {
+ dout(20) << "global_image_id=" << global_image_id << dendl;
+
+ RWLock::RLocker map_lock(m_map_lock);
+ LookupInfo info;
+
+ auto it = m_image_states.find(global_image_id);
+ if (it != m_image_states.end()) {
+ info.instance_id = it->second.instance_id;
+ info.mapped_time = it->second.mapped_time;
+ }
+ return info;
+}
+
+bool Policy::add_image(const std::string &global_image_id) {
+ dout(5) << "global_image_id=" << global_image_id << dendl;
+
+ RWLock::WLocker map_lock(m_map_lock);
+ auto image_state_result = m_image_states.emplace(global_image_id,
+ ImageState{});
+ auto& image_state = image_state_result.first->second;
+ if (image_state.state == StateTransition::STATE_INITIALIZING) {
+ // avoid duplicate acquire notifications upon leader startup
+ return false;
+ }
+
+ return set_state(&image_state, StateTransition::STATE_ASSOCIATING, false);
+}
+
+bool Policy::remove_image(const std::string &global_image_id) {
+ dout(5) << "global_image_id=" << global_image_id << dendl;
+
+ RWLock::WLocker map_lock(m_map_lock);
+ auto it = m_image_states.find(global_image_id);
+ if (it == m_image_states.end()) {
+ return false;
+ }
+
+ auto& image_state = it->second;
+ return set_state(&image_state, StateTransition::STATE_DISSOCIATING, false);
+}
+
+void Policy::add_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids) {
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+
+ RWLock::WLocker map_lock(m_map_lock);
+ for (auto& instance : instance_ids) {
+ ceph_assert(!instance.empty());
+ m_map.emplace(instance, std::set<std::string>{});
+ }
+
+ // post-failover, remove any dead instances and re-shuffle their images
+ if (m_initial_update) {
+ dout(5) << "initial instance update" << dendl;
+ m_initial_update = false;
+
+ std::set<std::string> alive_instances(instance_ids.begin(),
+ instance_ids.end());
+ InstanceIds dead_instances;
+ for (auto& map_pair : m_map) {
+ if (alive_instances.find(map_pair.first) == alive_instances.end()) {
+ dead_instances.push_back(map_pair.first);
+ }
+ }
+
+ if (!dead_instances.empty()) {
+ remove_instances(m_map_lock, dead_instances, global_image_ids);
+ }
+ }
+
+ GlobalImageIds shuffle_global_image_ids;
+ do_shuffle_add_instances(m_map, m_image_states.size(), &shuffle_global_image_ids);
+ dout(5) << "shuffling global_image_ids=[" << shuffle_global_image_ids
+ << "]" << dendl;
+ for (auto& global_image_id : shuffle_global_image_ids) {
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ if (set_state(&image_state, StateTransition::STATE_SHUFFLING, false)) {
+ global_image_ids->emplace(global_image_id);
+ }
+ }
+}
+
+void Policy::remove_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids) {
+ RWLock::WLocker map_lock(m_map_lock);
+ remove_instances(m_map_lock, instance_ids, global_image_ids);
+}
+
+void Policy::remove_instances(const RWLock& lock,
+ const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids) {
+ ceph_assert(m_map_lock.is_wlocked());
+ dout(5) << "instance_ids=" << instance_ids << dendl;
+
+ for (auto& instance_id : instance_ids) {
+ auto map_it = m_map.find(instance_id);
+ if (map_it == m_map.end()) {
+ continue;
+ }
+
+ auto& instance_global_image_ids = map_it->second;
+ if (instance_global_image_ids.empty()) {
+ m_map.erase(map_it);
+ continue;
+ }
+
+ m_dead_instances.insert(instance_id);
+ dout(5) << "force shuffling: instance_id=" << instance_id << ", "
+ << "global_image_ids=[" << instance_global_image_ids << "]"<< dendl;
+ for (auto& global_image_id : instance_global_image_ids) {
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ if (is_state_scheduled(image_state,
+ StateTransition::STATE_DISSOCIATING)) {
+ // don't shuffle images that no longer exist
+ continue;
+ }
+
+ if (set_state(&image_state, StateTransition::STATE_SHUFFLING, true)) {
+ global_image_ids->emplace(global_image_id);
+ }
+ }
+ }
+}
+
+ActionType Policy::start_action(const std::string &global_image_id) {
+ RWLock::WLocker map_lock(m_map_lock);
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ auto& transition = image_state.transition;
+ ceph_assert(transition.action_type != ACTION_TYPE_NONE);
+
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "state=" << image_state.state << ", "
+ << "action_type=" << transition.action_type << dendl;
+ if (transition.start_policy_action) {
+ execute_policy_action(global_image_id, &image_state,
+ *transition.start_policy_action);
+ transition.start_policy_action = boost::none;
+ }
+ return transition.action_type;
+}
+
+bool Policy::finish_action(const std::string &global_image_id, int r) {
+ RWLock::WLocker map_lock(m_map_lock);
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+
+ auto& image_state = it->second;
+ auto& transition = image_state.transition;
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "state=" << image_state.state << ", "
+ << "action_type=" << transition.action_type << ", "
+ << "r=" << r << dendl;
+
+ // retry on failure unless it's an RPC message to an instance that is dead
+ if (r < 0 &&
+ (!is_instance_action(image_state.transition.action_type) ||
+ image_state.instance_id == UNMAPPED_INSTANCE_ID ||
+ m_dead_instances.find(image_state.instance_id) ==
+ m_dead_instances.end())) {
+ return true;
+ }
+
+ auto finish_policy_action = transition.finish_policy_action;
+ StateTransition::transit(image_state.state, &image_state.transition);
+ if (transition.finish_state) {
+ // in-progress state machine complete
+ ceph_assert(StateTransition::is_idle(*transition.finish_state));
+ image_state.state = *transition.finish_state;
+ image_state.transition = {};
+ }
+
+ if (StateTransition::is_idle(image_state.state) && image_state.next_state) {
+ // advance to pending state machine
+ bool start_action = set_state(&image_state, *image_state.next_state, false);
+ ceph_assert(start_action);
+ }
+
+ // image state may get purged in execute_policy_action()
+ bool pending_action = image_state.transition.action_type != ACTION_TYPE_NONE;
+ if (finish_policy_action) {
+ execute_policy_action(global_image_id, &image_state, *finish_policy_action);
+ }
+
+ return pending_action;
+}
+
+void Policy::execute_policy_action(
+ const std::string& global_image_id, ImageState* image_state,
+ StateTransition::PolicyAction policy_action) {
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "policy_action=" << policy_action << dendl;
+
+ switch (policy_action) {
+ case StateTransition::POLICY_ACTION_MAP:
+ map(global_image_id, image_state);
+ break;
+ case StateTransition::POLICY_ACTION_UNMAP:
+ unmap(global_image_id, image_state);
+ break;
+ case StateTransition::POLICY_ACTION_REMOVE:
+ if (image_state->state == StateTransition::STATE_UNASSOCIATED) {
+ ceph_assert(image_state->instance_id == UNMAPPED_INSTANCE_ID);
+ ceph_assert(!image_state->next_state);
+ m_image_states.erase(global_image_id);
+ }
+ break;
+ }
+}
+
+void Policy::map(const std::string& global_image_id, ImageState* image_state) {
+ ceph_assert(m_map_lock.is_wlocked());
+
+ std::string instance_id = image_state->instance_id;
+ if (instance_id != UNMAPPED_INSTANCE_ID && !is_dead_instance(instance_id)) {
+ return;
+ }
+ if (is_dead_instance(instance_id)) {
+ unmap(global_image_id, image_state);
+ }
+
+ instance_id = do_map(m_map, global_image_id);
+ ceph_assert(!instance_id.empty());
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ image_state->instance_id = instance_id;
+ image_state->mapped_time = ceph_clock_now();
+
+ auto ins = m_map[instance_id].emplace(global_image_id);
+ ceph_assert(ins.second);
+}
+
+void Policy::unmap(const std::string &global_image_id,
+ ImageState* image_state) {
+ ceph_assert(m_map_lock.is_wlocked());
+
+ std::string instance_id = image_state->instance_id;
+ if (instance_id == UNMAPPED_INSTANCE_ID) {
+ return;
+ }
+
+ dout(5) << "global_image_id=" << global_image_id << ", "
+ << "instance_id=" << instance_id << dendl;
+
+ ceph_assert(!instance_id.empty());
+ m_map[instance_id].erase(global_image_id);
+ image_state->instance_id = UNMAPPED_INSTANCE_ID;
+ image_state->mapped_time = {};
+
+ if (is_dead_instance(instance_id) && m_map[instance_id].empty()) {
+ dout(5) << "removing dead instance_id=" << instance_id << dendl;
+ m_map.erase(instance_id);
+ m_dead_instances.erase(instance_id);
+ }
+}
+
+bool Policy::is_image_shuffling(const std::string &global_image_id) {
+ ceph_assert(m_map_lock.is_locked());
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+ auto& image_state = it->second;
+
+ // avoid attempting to re-shuffle a pending shuffle
+ auto result = is_state_scheduled(image_state,
+ StateTransition::STATE_SHUFFLING);
+ dout(20) << "global_image_id=" << global_image_id << ", "
+ << "result=" << result << dendl;
+ return result;
+}
+
+bool Policy::can_shuffle_image(const std::string &global_image_id) {
+ ceph_assert(m_map_lock.is_locked());
+
+ CephContext *cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+ int migration_throttle = cct->_conf.get_val<uint64_t>(
+ "rbd_mirror_image_policy_migration_throttle");
+
+ auto it = m_image_states.find(global_image_id);
+ ceph_assert(it != m_image_states.end());
+ auto& image_state = it->second;
+
+ utime_t last_shuffled_time = image_state.mapped_time;
+
+ // idle images that haven't been recently remapped can shuffle
+ utime_t now = ceph_clock_now();
+ auto result = (StateTransition::is_idle(image_state.state) &&
+ ((migration_throttle <= 0) ||
+ (now - last_shuffled_time >= migration_throttle)));
+ dout(10) << "global_image_id=" << global_image_id << ", "
+ << "migration_throttle=" << migration_throttle << ", "
+ << "last_shuffled_time=" << last_shuffled_time << ", "
+ << "result=" << result << dendl;
+ return result;
+}
+
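+// returns true if the requested state machine was started immediately;
+// otherwise the request is either ignored (already in that state) or queued
+// as next_state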
+bool Policy::set_state(ImageState* image_state, StateTransition::State state,
+ bool ignore_current_state) {
+ if (!ignore_current_state && image_state->state == state) {
+ return false;
+ } else if (StateTransition::is_idle(image_state->state)) {
+ image_state->state = state;
+ image_state->next_state = boost::none;
+
+ StateTransition::transit(image_state->state, &image_state->transition);
+ ceph_assert(image_state->transition.action_type != ACTION_TYPE_NONE);
+ ceph_assert(!image_state->transition.finish_state);
+ return true;
+ }
+
+ image_state->next_state = state;
+ return false;
+}
+
+bool Policy::is_state_scheduled(const ImageState& image_state,
+ StateTransition::State state) const {
+ return (image_state.state == state ||
+ (image_state.next_state && *image_state.next_state == state));
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/Policy.h b/src/tools/rbd_mirror/image_map/Policy.h
new file mode 100644
index 00000000..590fdbfe
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Policy.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H
+
+#include <map>
+#include <tuple>
+#include <boost/optional.hpp>
+
+#include "common/RWLock.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/image_map/StateTransition.h"
+#include "tools/rbd_mirror/image_map/Types.h"
+
+class Context;
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+class Policy {
+public:
+ Policy(librados::IoCtx &ioctx);
+
+ virtual ~Policy() {
+ }
+
+  // seed the policy with the persisted image map at start-up
+ void init(
+ const std::map<std::string, cls::rbd::MirrorImageMap> &image_mapping);
+
+ // lookup an image from the map
+ LookupInfo lookup(const std::string &global_image_id);
+
+ // add, remove
+ bool add_image(const std::string &global_image_id);
+ bool remove_image(const std::string &global_image_id);
+
+ // shuffle images when instances are added/removed
+ void add_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids);
+ void remove_instances(const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids);
+
+ ActionType start_action(const std::string &global_image_id);
+ bool finish_action(const std::string &global_image_id, int r);
+
+protected:
+ typedef std::map<std::string, std::set<std::string> > InstanceToImageMap;
+
+ bool is_dead_instance(const std::string instance_id) {
+ ceph_assert(m_map_lock.is_locked());
+ return m_dead_instances.find(instance_id) != m_dead_instances.end();
+ }
+
+ bool is_image_shuffling(const std::string &global_image_id);
+ bool can_shuffle_image(const std::string &global_image_id);
+
+ // map an image (global image id) to an instance
+ virtual std::string do_map(const InstanceToImageMap& map,
+ const std::string &global_image_id) = 0;
+
+ // shuffle images when instances are added/removed
+ virtual void do_shuffle_add_instances(
+ const InstanceToImageMap& map, size_t image_count,
+ std::set<std::string> *remap_global_image_ids) = 0;
+
+private:
+ struct ImageState {
+ std::string instance_id = UNMAPPED_INSTANCE_ID;
+ utime_t mapped_time;
+
+ ImageState() {}
+ ImageState(const std::string& instance_id, const utime_t& mapped_time)
+ : instance_id(instance_id), mapped_time(mapped_time) {
+ }
+
+ // active state and action
+ StateTransition::State state = StateTransition::STATE_UNASSOCIATED;
+ StateTransition::Transition transition;
+
+ // next scheduled state
+ boost::optional<StateTransition::State> next_state = boost::none;
+ };
+
+ typedef std::map<std::string, ImageState> ImageStates;
+
+ librados::IoCtx &m_ioctx;
+
+ RWLock m_map_lock; // protects m_map
+ InstanceToImageMap m_map; // instance_id -> global_id map
+
+ ImageStates m_image_states;
+ std::set<std::string> m_dead_instances;
+
+ bool m_initial_update = true;
+
+ void remove_instances(const RWLock& lock, const InstanceIds &instance_ids,
+ GlobalImageIds* global_image_ids);
+
+ bool set_state(ImageState* image_state, StateTransition::State state,
+ bool ignore_current_state);
+
+ void execute_policy_action(const std::string& global_image_id,
+ ImageState* image_state,
+ StateTransition::PolicyAction policy_action);
+
+ void map(const std::string& global_image_id, ImageState* image_state);
+ void unmap(const std::string &global_image_id, ImageState* image_state);
+
+ bool is_state_scheduled(const ImageState& image_state,
+ StateTransition::State state) const;
+
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_POLICY_H
diff --git a/src/tools/rbd_mirror/image_map/SimplePolicy.cc b/src/tools/rbd_mirror/image_map/SimplePolicy.cc
new file mode 100644
index 00000000..f2680581
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/SimplePolicy.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "SimplePolicy.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::SimplePolicy: " << this \
+ << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+SimplePolicy::SimplePolicy(librados::IoCtx &ioctx)
+ : Policy(ioctx) {
+}
+
+size_t SimplePolicy::calc_images_per_instance(const InstanceToImageMap& map,
+ size_t image_count) {
+ size_t nr_instances = 0;
+ for (auto const &it : map) {
+ if (!Policy::is_dead_instance(it.first)) {
+ ++nr_instances;
+ }
+ }
+ ceph_assert(nr_instances > 0);
+
+ size_t images_per_instance = image_count / nr_instances;
+ if (images_per_instance == 0) {
+ ++images_per_instance;
+ }
+
+ return images_per_instance;
+}
+
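+// instances holding more than the per-instance quota shed their excess images;
+// images already mid-shuffle count against the excess but are not re-queued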
+void SimplePolicy::do_shuffle_add_instances(
+ const InstanceToImageMap& map, size_t image_count,
+ std::set<std::string> *remap_global_image_ids) {
+ uint64_t images_per_instance = calc_images_per_instance(map, image_count);
+ dout(5) << "images per instance=" << images_per_instance << dendl;
+
+ for (auto const &instance : map) {
+ if (instance.second.size() <= images_per_instance) {
+ continue;
+ }
+
+ auto it = instance.second.begin();
+ uint64_t cut_off = instance.second.size() - images_per_instance;
+
+ while (it != instance.second.end() && cut_off > 0) {
+ if (Policy::is_image_shuffling(*it)) {
+ --cut_off;
+ } else if (Policy::can_shuffle_image(*it)) {
+ --cut_off;
+ remap_global_image_ids->emplace(*it);
+ }
+
+ ++it;
+ }
+ }
+}
+
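+// map the image to the alive instance that currently holds the fewest images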
+std::string SimplePolicy::do_map(const InstanceToImageMap& map,
+ const std::string &global_image_id) {
+ auto min_it = map.end();
+ for (auto it = map.begin(); it != map.end(); ++it) {
+ ceph_assert(it->second.find(global_image_id) == it->second.end());
+ if (Policy::is_dead_instance(it->first)) {
+ continue;
+ } else if (min_it == map.end()) {
+ min_it = it;
+ } else if (it->second.size() < min_it->second.size()) {
+ min_it = it;
+ }
+ }
+
+ ceph_assert(min_it != map.end());
+ dout(20) << "global_image_id=" << global_image_id << " maps to instance_id="
+ << min_it->first << dendl;
+ return min_it->first;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/SimplePolicy.h b/src/tools/rbd_mirror/image_map/SimplePolicy.h
new file mode 100644
index 00000000..ad2071b2
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/SimplePolicy.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H
+
+#include "Policy.h"
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+class SimplePolicy : public Policy {
+public:
+ static SimplePolicy *create(librados::IoCtx &ioctx) {
+ return new SimplePolicy(ioctx);
+ }
+
+protected:
+ SimplePolicy(librados::IoCtx &ioctx);
+
+ std::string do_map(const InstanceToImageMap& map,
+ const std::string &global_image_id) override;
+
+ void do_shuffle_add_instances(
+ const InstanceToImageMap& map, size_t image_count,
+ std::set<std::string> *remap_global_image_ids) override;
+
+private:
+ size_t calc_images_per_instance(const InstanceToImageMap& map,
+ size_t image_count);
+
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_SIMPLE_POLICY_H
diff --git a/src/tools/rbd_mirror/image_map/StateTransition.cc b/src/tools/rbd_mirror/image_map/StateTransition.cc
new file mode 100644
index 00000000..ec5f07ff
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/StateTransition.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <ostream>
+#include "include/ceph_assert.h"
+#include "StateTransition.h"
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+std::ostream &operator<<(std::ostream &os,
+ const StateTransition::State &state) {
+ switch(state) {
+ case StateTransition::STATE_INITIALIZING:
+ os << "INITIALIZING";
+ break;
+ case StateTransition::STATE_ASSOCIATING:
+ os << "ASSOCIATING";
+ break;
+ case StateTransition::STATE_ASSOCIATED:
+ os << "ASSOCIATED";
+ break;
+ case StateTransition::STATE_SHUFFLING:
+ os << "SHUFFLING";
+ break;
+ case StateTransition::STATE_DISSOCIATING:
+ os << "DISSOCIATING";
+ break;
+ case StateTransition::STATE_UNASSOCIATED:
+ os << "UNASSOCIATED";
+ break;
+ }
+ return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const StateTransition::PolicyAction &policy_action) {
+ switch(policy_action) {
+ case StateTransition::POLICY_ACTION_MAP:
+ os << "MAP";
+ break;
+ case StateTransition::POLICY_ACTION_UNMAP:
+ os << "UNMAP";
+ break;
+ case StateTransition::POLICY_ACTION_REMOVE:
+ os << "REMOVE";
+ break;
+ }
+ return os;
+}
+
+const StateTransition::TransitionTable StateTransition::s_transition_table {
+ // state current_action Transition
+ // ---------------------------------------------------------------------------
+ {{STATE_INITIALIZING, ACTION_TYPE_NONE}, {ACTION_TYPE_ACQUIRE, {}, {},
+ {}}},
+ {{STATE_INITIALIZING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {},
+ {STATE_ASSOCIATED}}},
+
+ {{STATE_ASSOCIATING, ACTION_TYPE_NONE}, {ACTION_TYPE_MAP_UPDATE,
+ {POLICY_ACTION_MAP}, {}, {}}},
+ {{STATE_ASSOCIATING, ACTION_TYPE_MAP_UPDATE}, {ACTION_TYPE_ACQUIRE, {}, {},
+ {}}},
+ {{STATE_ASSOCIATING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {},
+ {STATE_ASSOCIATED}}},
+
+ {{STATE_DISSOCIATING, ACTION_TYPE_NONE}, {ACTION_TYPE_RELEASE, {},
+ {POLICY_ACTION_UNMAP}, {}}},
+ {{STATE_DISSOCIATING, ACTION_TYPE_RELEASE}, {ACTION_TYPE_MAP_REMOVE, {},
+ {POLICY_ACTION_REMOVE}, {}}},
+ {{STATE_DISSOCIATING, ACTION_TYPE_MAP_REMOVE}, {ACTION_TYPE_NONE, {},
+ {}, {STATE_UNASSOCIATED}}},
+
+ {{STATE_SHUFFLING, ACTION_TYPE_NONE}, {ACTION_TYPE_RELEASE, {},
+ {POLICY_ACTION_UNMAP}, {}}},
+ {{STATE_SHUFFLING, ACTION_TYPE_RELEASE}, {ACTION_TYPE_MAP_UPDATE,
+ {POLICY_ACTION_MAP}, {}, {}}},
+ {{STATE_SHUFFLING, ACTION_TYPE_MAP_UPDATE}, {ACTION_TYPE_ACQUIRE, {}, {},
+ {}}},
+ {{STATE_SHUFFLING, ACTION_TYPE_ACQUIRE}, {ACTION_TYPE_NONE, {}, {},
+ {STATE_ASSOCIATED}}}
+};
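+// example walk-through: an image entering STATE_ASSOCIATING begins at
+// ACTION_TYPE_NONE, transits to MAP_UPDATE (with POLICY_ACTION_MAP as its
+// start action), then to ACQUIRE, and finally back to NONE with finish_state
+// STATE_ASSOCIATED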
+
+void StateTransition::transit(State state, Transition* transition) {
+ auto it = s_transition_table.find({state, transition->action_type});
+ ceph_assert(it != s_transition_table.end());
+
+ *transition = it->second;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/StateTransition.h b/src/tools/rbd_mirror/image_map/StateTransition.h
new file mode 100644
index 00000000..02a5ce4e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/StateTransition.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H
+
+#include "tools/rbd_mirror/image_map/Types.h"
+#include <boost/optional.hpp>
+#include <map>
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+class StateTransition {
+public:
+ enum State {
+ STATE_UNASSOCIATED,
+ STATE_INITIALIZING,
+ STATE_ASSOCIATING,
+ STATE_ASSOCIATED,
+ STATE_SHUFFLING,
+ STATE_DISSOCIATING
+ };
+
+ enum PolicyAction {
+ POLICY_ACTION_MAP,
+ POLICY_ACTION_UNMAP,
+ POLICY_ACTION_REMOVE
+ };
+
+ struct Transition {
+ // image map action
+ ActionType action_type = ACTION_TYPE_NONE;
+
+ // policy internal action
+ boost::optional<PolicyAction> start_policy_action;
+ boost::optional<PolicyAction> finish_policy_action;
+
+ // state machine complete
+ boost::optional<State> finish_state;
+
+ Transition() {
+ }
+ Transition(ActionType action_type,
+ const boost::optional<PolicyAction>& start_policy_action,
+ const boost::optional<PolicyAction>& finish_policy_action,
+ const boost::optional<State>& finish_state)
+ : action_type(action_type), start_policy_action(start_policy_action),
+ finish_policy_action(finish_policy_action), finish_state(finish_state) {
+ }
+ };
+
+ static bool is_idle(State state) {
+ return (state == STATE_UNASSOCIATED || state == STATE_ASSOCIATED);
+ }
+
+ static void transit(State state, Transition* transition);
+
+private:
+ typedef std::pair<State, ActionType> TransitionKey;
+ typedef std::map<TransitionKey, Transition> TransitionTable;
+
+ // image transition table
+ static const TransitionTable s_transition_table;
+};
+
+std::ostream &operator<<(std::ostream &os, const StateTransition::State &state);
+std::ostream &operator<<(std::ostream &os,
+ const StateTransition::PolicyAction &policy_action);
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_STATE_TRANSITION_H
diff --git a/src/tools/rbd_mirror/image_map/Types.cc b/src/tools/rbd_mirror/image_map/Types.cc
new file mode 100644
index 00000000..47de9c3c
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Types.cc
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include <iostream>
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+const std::string UNMAPPED_INSTANCE_ID("");
+
+namespace {
+
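+// boost::variant visitors that dispatch encode/decode/dump calls to whichever
+// PolicyMeta alternative is currently active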
+template <typename E>
+class GetTypeVisitor : public boost::static_visitor<E> {
+public:
+ template <typename T>
+ inline E operator()(const T&) const {
+ return T::TYPE;
+ }
+};
+
+class EncodeVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodeVisitor(bufferlist &bl) : m_bl(bl) {
+ }
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(T::TYPE), m_bl);
+ t.encode(m_bl);
+ }
+private:
+ bufferlist &m_bl;
+};
+
+class DecodeVisitor : public boost::static_visitor<void> {
+public:
+ DecodeVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {
+ }
+
+ template <typename T>
+ inline void operator()(T& t) const {
+ t.decode(m_version, m_iter);
+ }
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpVisitor(Formatter *formatter, const std::string &key)
+ : m_formatter(formatter), m_key(key) {}
+
+ template <typename T>
+ inline void operator()(const T& t) const {
+ auto type = T::TYPE;
+ m_formatter->dump_string(m_key.c_str(), stringify(type));
+ t.dump(m_formatter);
+ }
+private:
+ ceph::Formatter *m_formatter;
+ std::string m_key;
+};
+
+} // anonymous namespace
+
+PolicyMetaType PolicyData::get_policy_meta_type() const {
+ return boost::apply_visitor(GetTypeVisitor<PolicyMetaType>(), policy_meta);
+}
+
+void PolicyData::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(EncodeVisitor(bl), policy_meta);
+ ENCODE_FINISH(bl);
+}
+
+void PolicyData::decode(bufferlist::const_iterator& it) {
+ DECODE_START(1, it);
+
+ uint32_t policy_meta_type;
+ decode(policy_meta_type, it);
+
+ switch (policy_meta_type) {
+ case POLICY_META_TYPE_NONE:
+ policy_meta = PolicyMetaNone();
+ break;
+ default:
+ policy_meta = PolicyMetaUnknown();
+ break;
+ }
+
+ boost::apply_visitor(DecodeVisitor(struct_v, it), policy_meta);
+ DECODE_FINISH(it);
+}
+
+void PolicyData::dump(Formatter *f) const {
+ boost::apply_visitor(DumpVisitor(f, "policy_meta_type"), policy_meta);
+}
+
+void PolicyData::generate_test_instances(std::list<PolicyData *> &o) {
+ o.push_back(new PolicyData(PolicyMetaNone()));
+}
+
+std::ostream &operator<<(std::ostream &os, const ActionType& action_type) {
+ switch (action_type) {
+ case ACTION_TYPE_NONE:
+ os << "NONE";
+ break;
+ case ACTION_TYPE_MAP_UPDATE:
+ os << "MAP_UPDATE";
+ break;
+ case ACTION_TYPE_MAP_REMOVE:
+ os << "MAP_REMOVE";
+ break;
+ case ACTION_TYPE_ACQUIRE:
+ os << "ACQUIRE";
+ break;
+ case ACTION_TYPE_RELEASE:
+ os << "RELEASE";
+ break;
+ default:
+ os << "UNKNOWN (" << static_cast<uint32_t>(action_type) << ")";
+ break;
+ }
+ return os;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/image_map/Types.h b/src/tools/rbd_mirror/image_map/Types.h
new file mode 100644
index 00000000..5a97430f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/Types.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H
+
+#include <iosfwd>
+#include <map>
+#include <set>
+#include <string>
+#include <boost/variant.hpp>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/utime.h"
+#include "tools/rbd_mirror/Types.h"
+
+struct Context;
+
+namespace ceph {
+class Formatter;
+}
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+extern const std::string UNMAPPED_INSTANCE_ID;
+
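+// callbacks used by the image map to ask a particular instance to acquire,
+// release, or remove a mirrored image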
+struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void acquire_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) = 0;
+ virtual void release_image(const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) = 0;
+ virtual void remove_image(const std::string &mirror_uuid,
+ const std::string &global_image_id,
+ const std::string &instance_id,
+ Context* on_finish) = 0;
+};
+
+struct LookupInfo {
+ std::string instance_id = UNMAPPED_INSTANCE_ID;
+ utime_t mapped_time;
+};
+
+enum ActionType {
+ ACTION_TYPE_NONE,
+ ACTION_TYPE_MAP_UPDATE,
+ ACTION_TYPE_MAP_REMOVE,
+ ACTION_TYPE_ACQUIRE,
+ ACTION_TYPE_RELEASE
+};
+
+typedef std::vector<std::string> InstanceIds;
+typedef std::set<std::string> GlobalImageIds;
+typedef std::map<std::string, ActionType> ImageActionTypes;
+
+enum PolicyMetaType {
+ POLICY_META_TYPE_NONE = 0,
+};
+
+struct PolicyMetaNone {
+ static const PolicyMetaType TYPE = POLICY_META_TYPE_NONE;
+
+ PolicyMetaNone() {
+ }
+
+ void encode(bufferlist& bl) const {
+ }
+
+ void decode(__u8 version, bufferlist::const_iterator& it) {
+ }
+
+ void dump(Formatter *f) const {
+ }
+};
+
+struct PolicyMetaUnknown {
+ static const PolicyMetaType TYPE = static_cast<PolicyMetaType>(-1);
+
+ PolicyMetaUnknown() {
+ }
+
+ void encode(bufferlist& bl) const {
+ ceph_abort();
+ }
+
+ void decode(__u8 version, bufferlist::const_iterator& it) {
+ }
+
+ void dump(Formatter *f) const {
+ }
+};
+
+typedef boost::variant<PolicyMetaNone,
+ PolicyMetaUnknown> PolicyMeta;
+
+struct PolicyData {
+ PolicyData()
+ : policy_meta(PolicyMetaUnknown()) {
+ }
+ PolicyData(const PolicyMeta &policy_meta)
+ : policy_meta(policy_meta) {
+ }
+
+ PolicyMeta policy_meta;
+
+ PolicyMetaType get_policy_meta_type() const;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<PolicyData *> &o);
+};
+
+WRITE_CLASS_ENCODER(PolicyData);
+
+std::ostream &operator<<(std::ostream &os, const ActionType &action_type);
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_TYPES_H
diff --git a/src/tools/rbd_mirror/image_map/UpdateRequest.cc b/src/tools/rbd_mirror/image_map/UpdateRequest.cc
new file mode 100644
index 00000000..799c5670
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/UpdateRequest.cc
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "librbd/Utils.h"
+#include "include/rbd_types.h"
+#include "cls/rbd/cls_rbd_client.h"
+
+#include "UpdateRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_map::UpdateRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+using librbd::util::create_rados_callback;
+
+static const uint32_t MAX_UPDATE = 256;
+
+template <typename I>
+UpdateRequest<I>::UpdateRequest(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping,
+ std::set<std::string> &&remove_global_image_ids, Context *on_finish)
+ : m_ioctx(ioctx),
+ m_update_mapping(update_mapping),
+ m_remove_global_image_ids(remove_global_image_ids),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void UpdateRequest<I>::send() {
+ dout(20) << dendl;
+
+ update_image_map();
+}
+
+template <typename I>
+void UpdateRequest<I>::update_image_map() {
+ dout(20) << dendl;
+
+ if (m_update_mapping.empty() && m_remove_global_image_ids.empty()) {
+ finish(0);
+ return;
+ }
+
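+  // pack up to MAX_UPDATE combined updates and removals into a single write
+  // op; any remaining entries are sent on the next pass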
+ uint32_t nr_updates = 0;
+ librados::ObjectWriteOperation op;
+
+ auto it1 = m_update_mapping.begin();
+ while (it1 != m_update_mapping.end() && nr_updates++ < MAX_UPDATE) {
+ librbd::cls_client::mirror_image_map_update(&op, it1->first, it1->second);
+ it1 = m_update_mapping.erase(it1);
+ }
+
+ auto it2 = m_remove_global_image_ids.begin();
+ while (it2 != m_remove_global_image_ids.end() && nr_updates++ < MAX_UPDATE) {
+ librbd::cls_client::mirror_image_map_remove(&op, *it2);
+ it2 = m_remove_global_image_ids.erase(it2);
+ }
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ UpdateRequest, &UpdateRequest::handle_update_image_map>(this);
+ int r = m_ioctx.aio_operate(RBD_MIRROR_LEADER, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void UpdateRequest<I>::handle_update_image_map(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to update image map: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ update_image_map();
+}
+
+template <typename I>
+void UpdateRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_map::UpdateRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_map/UpdateRequest.h b/src/tools/rbd_mirror/image_map/UpdateRequest.h
new file mode 100644
index 00000000..841cc6f9
--- /dev/null
+++ b/src/tools/rbd_mirror/image_map/UpdateRequest.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H
+#define CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H
+
+#include "cls/rbd/cls_rbd_types.h"
+#include "include/rados/librados.hpp"
+
+class Context;
+
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_map {
+
+template<typename ImageCtxT = librbd::ImageCtx>
+class UpdateRequest {
+public:
+  // accepts a set of image map updates and a collection of
+  // global image ids to purge.
+ static UpdateRequest *create(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping,
+ std::set<std::string> &&remove_global_image_ids, Context *on_finish) {
+ return new UpdateRequest(ioctx, std::move(update_mapping), std::move(remove_global_image_ids),
+ on_finish);
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * | . . . . . . . .
+ * v v . MAX_UPDATE
+ * UPDATE_IMAGE_MAP. . . . . . .
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ UpdateRequest(librados::IoCtx &ioctx,
+ std::map<std::string, cls::rbd::MirrorImageMap> &&update_mapping,
+ std::set<std::string> &&remove_global_image_ids, Context *on_finish);
+
+ librados::IoCtx &m_ioctx;
+ std::map<std::string, cls::rbd::MirrorImageMap> m_update_mapping;
+ std::set<std::string> m_remove_global_image_ids;
+ Context *m_on_finish;
+
+ void update_image_map();
+ void handle_update_image_map(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_map
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_MAP_UPDATE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
new file mode 100644
index 00000000..7ce21b4b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
@@ -0,0 +1,785 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "BootstrapRequest.h"
+#include "CloseImageRequest.h"
+#include "CreateImageRequest.h"
+#include "IsPrimaryRequest.h"
+#include "OpenImageRequest.h"
+#include "OpenLocalImageRequest.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/ProgressContext.h"
+#include "tools/rbd_mirror/ImageSync.h"
+#include "tools/rbd_mirror/Threads.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::BootstrapRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+using librbd::util::unique_lock_name;
+
+template <typename I>
+BootstrapRequest<I>::BootstrapRequest(
+ Threads<I>* threads,
+ librados::IoCtx &local_io_ctx,
+ librados::IoCtx &remote_io_ctx,
+ InstanceWatcher<I> *instance_watcher,
+ I **local_image_ctx,
+ const std::string &local_image_id,
+ const std::string &remote_image_id,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const std::string &remote_mirror_uuid,
+ Journaler *journaler,
+ cls::journal::ClientState *client_state,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish,
+ bool *do_resync,
+ rbd::mirror::ProgressContext *progress_ctx)
+ : BaseRequest("rbd::mirror::image_replayer::BootstrapRequest",
+ reinterpret_cast<CephContext*>(local_io_ctx.cct()), on_finish),
+ m_threads(threads), m_local_io_ctx(local_io_ctx),
+ m_remote_io_ctx(remote_io_ctx), m_instance_watcher(instance_watcher),
+ m_local_image_ctx(local_image_ctx), m_local_image_id(local_image_id),
+ m_remote_image_id(remote_image_id), m_global_image_id(global_image_id),
+ m_local_mirror_uuid(local_mirror_uuid),
+ m_remote_mirror_uuid(remote_mirror_uuid), m_journaler(journaler),
+ m_client_state(client_state), m_client_meta(client_meta),
+ m_progress_ctx(progress_ctx), m_do_resync(do_resync),
+ m_lock(unique_lock_name("BootstrapRequest::m_lock", this)) {
+ dout(10) << dendl;
+}
+
+template <typename I>
+BootstrapRequest<I>::~BootstrapRequest() {
+ ceph_assert(m_remote_image_ctx == nullptr);
+}
+
+template <typename I>
+bool BootstrapRequest<I>::is_syncing() const {
+ Mutex::Locker locker(m_lock);
+ return (m_image_sync != nullptr);
+}
+
+template <typename I>
+void BootstrapRequest<I>::send() {
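+  // assume no resync is needed until handle_open_local_image() checks the
+  // local image's resync-requested flag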
+ *m_do_resync = false;
+
+ get_remote_tag_class();
+}
+
+template <typename I>
+void BootstrapRequest<I>::cancel() {
+ dout(10) << dendl;
+
+ Mutex::Locker locker(m_lock);
+ m_canceled = true;
+
+ if (m_image_sync != nullptr) {
+ m_image_sync->cancel();
+ }
+}
+
+template <typename I>
+void BootstrapRequest<I>::get_remote_tag_class() {
+ dout(15) << dendl;
+
+ update_progress("GET_REMOTE_TAG_CLASS");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tag_class>(
+ this);
+ m_journaler->get_client(librbd::Journal<>::IMAGE_CLIENT_ID, &m_client, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_get_remote_tag_class(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to retrieve remote client: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ librbd::journal::ClientData client_data;
+ auto it = m_client.data.cbegin();
+ try {
+ decode(client_data, it);
+ } catch (const buffer::error &err) {
+ derr << "failed to decode remote client meta data: " << err.what()
+ << dendl;
+ finish(-EBADMSG);
+ return;
+ }
+
+ librbd::journal::ImageClientMeta *client_meta =
+ boost::get<librbd::journal::ImageClientMeta>(&client_data.client_meta);
+ if (client_meta == nullptr) {
+ derr << "unknown remote client registration" << dendl;
+ finish(-EINVAL);
+ return;
+ }
+
+ m_remote_tag_class = client_meta->tag_class;
+ dout(10) << "remote tag class=" << m_remote_tag_class << dendl;
+
+ open_remote_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::open_remote_image() {
+ dout(15) << "remote_image_id=" << m_remote_image_id << dendl;
+
+ update_progress("OPEN_REMOTE_IMAGE");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_remote_image>(
+ this);
+ OpenImageRequest<I> *request = OpenImageRequest<I>::create(
+ m_remote_io_ctx, &m_remote_image_ctx, m_remote_image_id, false,
+ ctx);
+ request->send();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_open_remote_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to open remote image: " << cpp_strerror(r) << dendl;
+ ceph_assert(m_remote_image_ctx == nullptr);
+ finish(r);
+ return;
+ }
+
+ is_primary();
+}
+
+template <typename I>
+void BootstrapRequest<I>::is_primary() {
+ dout(15) << dendl;
+
+ update_progress("OPEN_REMOTE_IMAGE");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_is_primary>(
+ this);
+ IsPrimaryRequest<I> *request = IsPrimaryRequest<I>::create(m_remote_image_ctx,
+ &m_primary, ctx);
+ request->send();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_is_primary(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(5) << "remote image is not mirrored" << dendl;
+ m_ret_val = -EREMOTEIO;
+ close_remote_image();
+ return;
+ } else if (r < 0) {
+ derr << "error querying remote image primary status: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ }
+
+ if (!m_primary) {
+ if (m_local_image_id.empty()) {
+ // no local image and remote isn't primary -- don't sync it
+ dout(5) << "remote image is not primary -- not syncing"
+ << dendl;
+ m_ret_val = -EREMOTEIO;
+ close_remote_image();
+ return;
+ } else if (m_client_meta->state !=
+ librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
+ // ensure we attempt to re-sync to remote if it's re-promoted
+ dout(5) << "remote image is not primary -- sync interrupted"
+ << dendl;
+ m_ret_val = -EREMOTEIO;
+ update_client_state();
+ return;
+ }
+ }
+
+ if (!m_client_meta->image_id.empty()) {
+ // have an image id -- use that to open the image since a deletion (resync)
+ // will leave the old image id registered in the peer
+ m_local_image_id = m_client_meta->image_id;
+ }
+
+ if (m_local_image_id.empty()) {
+ // prepare to create local image
+ update_client_image();
+ return;
+ }
+
+ open_local_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::update_client_state() {
+ dout(15) << dendl;
+ update_progress("UPDATE_CLIENT_STATE");
+
+ librbd::journal::MirrorPeerClientMeta client_meta(*m_client_meta);
+ client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ librbd::journal::ClientData client_data(client_meta);
+ bufferlist data_bl;
+ encode(client_data, data_bl);
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_update_client_state>(
+ this);
+ m_journaler->update_client(data_bl, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_update_client_state(int r) {
+ dout(15) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to update client: " << cpp_strerror(r) << dendl;
+ } else {
+ m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+ }
+
+ close_remote_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::open_local_image() {
+ dout(15) << "local_image_id=" << m_local_image_id << dendl;
+
+ update_progress("OPEN_LOCAL_IMAGE");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_open_local_image>(
+ this);
+ OpenLocalImageRequest<I> *request = OpenLocalImageRequest<I>::create(
+ m_local_io_ctx, m_local_image_ctx, m_local_image_id, m_threads->work_queue,
+ ctx);
+ request->send();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_open_local_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ ceph_assert(*m_local_image_ctx == nullptr);
+ dout(10) << "local image missing" << dendl;
+ unregister_client();
+ return;
+ } else if (r == -EREMOTEIO) {
+ ceph_assert(*m_local_image_ctx == nullptr);
+ dout(10) << "local image is primary -- skipping image replay" << dendl;
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ } else if (r < 0) {
+ ceph_assert(*m_local_image_ctx == nullptr);
+ derr << "failed to open local image: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ }
+
+ I *local_image_ctx = (*m_local_image_ctx);
+ {
+ local_image_ctx->snap_lock.get_read();
+ if (local_image_ctx->journal == nullptr) {
+ local_image_ctx->snap_lock.put_read();
+
+ derr << "local image does not support journaling" << dendl;
+ m_ret_val = -EINVAL;
+ close_local_image();
+ return;
+ }
+
+ r = (*m_local_image_ctx)->journal->is_resync_requested(m_do_resync);
+ if (r < 0) {
+ local_image_ctx->snap_lock.put_read();
+
+ derr << "failed to check if a resync was requested" << dendl;
+ m_ret_val = r;
+ close_local_image();
+ return;
+ }
+
+ m_local_tag_tid = local_image_ctx->journal->get_tag_tid();
+ m_local_tag_data = local_image_ctx->journal->get_tag_data();
+ dout(10) << "local tag=" << m_local_tag_tid << ", "
+ << "local tag data=" << m_local_tag_data << dendl;
+ local_image_ctx->snap_lock.put_read();
+ }
+
+ if (m_local_tag_data.mirror_uuid != m_remote_mirror_uuid && !m_primary) {
+ // if the local mirror is not linked to the (now) non-primary image,
+ // stop the replay. Otherwise, we ignore that the remote is non-primary
+ // so that we can replay the demotion
+ dout(5) << "remote image is not primary -- skipping image replay"
+ << dendl;
+ m_ret_val = -EREMOTEIO;
+ close_local_image();
+ return;
+ }
+
+ if (*m_do_resync) {
+ close_remote_image();
+ return;
+ }
+
+ if (*m_client_state == cls::journal::CLIENT_STATE_DISCONNECTED) {
+ dout(10) << "client flagged disconnected -- skipping bootstrap" << dendl;
+    // The caller is expected to detect the disconnect when initializing the
+    // remote journal.
+ m_ret_val = 0;
+ close_remote_image();
+ return;
+ }
+
+ get_remote_tags();
+}
+
+template <typename I>
+void BootstrapRequest<I>::unregister_client() {
+ dout(15) << dendl;
+ update_progress("UNREGISTER_CLIENT");
+
+ m_local_image_id = "";
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_unregister_client>(
+ this);
+ m_journaler->unregister_client(ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_unregister_client(int r) {
+ dout(15) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to unregister with remote journal: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ }
+
+ *m_client_meta = librbd::journal::MirrorPeerClientMeta("");
+ register_client();
+}
+
+template <typename I>
+void BootstrapRequest<I>::register_client() {
+ dout(15) << dendl;
+
+ update_progress("REGISTER_CLIENT");
+
+ ceph_assert(m_local_image_id.empty());
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ librbd::journal::ClientData client_data{mirror_peer_client_meta};
+ bufferlist client_data_bl;
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_register_client>(
+ this);
+ m_journaler->register_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_register_client(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to register with remote journal: " << cpp_strerror(r)
+ << dendl;
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ }
+
+ *m_client_state = cls::journal::CLIENT_STATE_CONNECTED;
+ *m_client_meta = librbd::journal::MirrorPeerClientMeta();
+ m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ is_primary();
+}
+
+template <typename I>
+void BootstrapRequest<I>::update_client_image() {
+  ceph_assert(m_local_image_id.empty());
+ m_local_image_id = librbd::util::generate_image_id<I>(m_local_io_ctx);
+
+ dout(15) << "local_image_id=" << m_local_image_id << dendl;
+ update_progress("UPDATE_CLIENT_IMAGE");
+
+ librbd::journal::MirrorPeerClientMeta client_meta{m_local_image_id};
+ client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
+
+ librbd::journal::ClientData client_data(client_meta);
+ bufferlist data_bl;
+ encode(client_data, data_bl);
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_update_client_image>(
+ this);
+ m_journaler->update_client(data_bl, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_update_client_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update client: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ }
+
+ if (m_canceled) {
+ dout(10) << "request canceled" << dendl;
+ m_ret_val = -ECANCELED;
+ close_remote_image();
+ return;
+ }
+
+ *m_client_meta = {m_local_image_id};
+ m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
+ create_local_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::create_local_image() {
+ dout(15) << "local_image_id=" << m_local_image_id << dendl;
+ update_progress("CREATE_LOCAL_IMAGE");
+
+ m_remote_image_ctx->snap_lock.get_read();
+ std::string image_name = m_remote_image_ctx->name;
+ m_remote_image_ctx->snap_lock.put_read();
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_create_local_image>(
+ this);
+ CreateImageRequest<I> *request = CreateImageRequest<I>::create(
+ m_threads, m_local_io_ctx, m_global_image_id, m_remote_mirror_uuid,
+ image_name, m_local_image_id, m_remote_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_create_local_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r == -EBADF) {
+ dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+ m_local_image_id = "";
+ update_client_image();
+ return;
+ } else if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << "parent image does not exist" << dendl;
+ } else {
+ derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+ }
+ m_ret_val = r;
+ close_remote_image();
+ return;
+ }
+
+ open_local_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::get_remote_tags() {
+ if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_SYNCING) {
+ // optimization -- no need to compare remote tags if we just created
+ // the image locally or sync was interrupted
+ image_sync();
+ return;
+ }
+
+ dout(15) << dendl;
+ update_progress("GET_REMOTE_TAGS");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_remote_tags>(this);
+ m_journaler->get_tags(m_remote_tag_class, &m_remote_tags, ctx);
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_get_remote_tags(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to retrieve remote tags: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ close_local_image();
+ return;
+ }
+
+ if (m_canceled) {
+ dout(10) << "request canceled" << dendl;
+ m_ret_val = -ECANCELED;
+ close_local_image();
+ return;
+ }
+
+  // At this point the local image already existed, is non-primary, and is
+  // replaying; the remote image is primary. Attempt to link the local image's
+  // most recent tag to the remote image's tag chain.
+ bool remote_tag_data_valid = false;
+ librbd::journal::TagData remote_tag_data;
+ boost::optional<uint64_t> remote_orphan_tag_tid =
+ boost::make_optional<uint64_t>(false, 0U);
+ bool reconnect_orphan = false;
+
+ // decode the remote tags
+ for (auto &remote_tag : m_remote_tags) {
+ if (m_local_tag_data.predecessor.commit_valid &&
+ m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
+ m_local_tag_data.predecessor.tag_tid > remote_tag.tid) {
+ dout(15) << "skipping processed predecessor remote tag "
+ << remote_tag.tid << dendl;
+ continue;
+ }
+
+ try {
+ auto it = remote_tag.data.cbegin();
+ decode(remote_tag_data, it);
+ remote_tag_data_valid = true;
+ } catch (const buffer::error &err) {
+ derr << "failed to decode remote tag " << remote_tag.tid << ": "
+ << err.what() << dendl;
+ m_ret_val = -EBADMSG;
+ close_local_image();
+ return;
+ }
+
+ dout(10) << "decoded remote tag " << remote_tag.tid << ": "
+ << remote_tag_data << dendl;
+
+ if (!m_local_tag_data.predecessor.commit_valid) {
+ // newly synced local image (no predecessor) replays from the first tag
+ if (remote_tag_data.mirror_uuid != librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ dout(15) << "skipping non-primary remote tag" << dendl;
+ continue;
+ }
+
+ dout(10) << "using initial primary remote tag" << dendl;
+ break;
+ }
+
+ if (m_local_tag_data.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+      // the local image was demoted -- this tag marks the last available
+      // local epoch
+
+ if (remote_tag_data.mirror_uuid == m_local_tag_data.mirror_uuid &&
+ remote_tag_data.predecessor.commit_valid &&
+ remote_tag_data.predecessor.tag_tid ==
+ m_local_tag_data.predecessor.tag_tid) {
+ // demotion matches remote epoch
+
+ if (remote_tag_data.predecessor.mirror_uuid == m_local_mirror_uuid &&
+ m_local_tag_data.predecessor.mirror_uuid ==
+ librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ // local demoted and remote has matching event
+ dout(15) << "found matching local demotion tag" << dendl;
+ remote_orphan_tag_tid = remote_tag.tid;
+ continue;
+ }
+
+ if (m_local_tag_data.predecessor.mirror_uuid == m_remote_mirror_uuid &&
+ remote_tag_data.predecessor.mirror_uuid ==
+ librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ // remote demoted and local has matching event
+ dout(15) << "found matching remote demotion tag" << dendl;
+ remote_orphan_tag_tid = remote_tag.tid;
+ continue;
+ }
+ }
+
+ if (remote_tag_data.mirror_uuid == librbd::Journal<>::LOCAL_MIRROR_UUID &&
+ remote_tag_data.predecessor.mirror_uuid == librbd::Journal<>::ORPHAN_MIRROR_UUID &&
+ remote_tag_data.predecessor.commit_valid && remote_orphan_tag_tid &&
+ remote_tag_data.predecessor.tag_tid == *remote_orphan_tag_tid) {
+ // remote promotion tag chained to remote/local demotion tag
+ dout(15) << "found chained remote promotion tag" << dendl;
+ reconnect_orphan = true;
+ break;
+ }
+
+ // promotion must follow demotion
+ remote_orphan_tag_tid = boost::none;
+ }
+ }
+
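+  // accept replay only if the local chain links cleanly to the remote chain
+  // or a matching demotion/promotion pair was found; anything else is treated
+  // as split-brain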
+ if (remote_tag_data_valid &&
+ m_local_tag_data.mirror_uuid == m_remote_mirror_uuid) {
+ dout(10) << "local image is in clean replay state" << dendl;
+ } else if (reconnect_orphan) {
+ dout(10) << "remote image was demoted/promoted" << dendl;
+ } else {
+ derr << "split-brain detected -- skipping image replay" << dendl;
+ m_ret_val = -EEXIST;
+ close_local_image();
+ return;
+ }
+
+ image_sync();
+}
+
+template <typename I>
+void BootstrapRequest<I>::image_sync() {
+ if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
+ // clean replay state -- no image sync required
+ close_remote_image();
+ return;
+ }
+
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_canceled) {
+ m_ret_val = -ECANCELED;
+ } else {
+ dout(15) << dendl;
+ ceph_assert(m_image_sync == nullptr);
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_image_sync>(this);
+ m_image_sync = ImageSync<I>::create(
+ *m_local_image_ctx, m_remote_image_ctx, m_threads->timer,
+ &m_threads->timer_lock, m_local_mirror_uuid, m_journaler,
+ m_client_meta, m_threads->work_queue, m_instance_watcher, ctx,
+ m_progress_ctx);
+
+ m_image_sync->get();
+
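+      // drop the lock while notifying the external progress callback, then
+      // re-acquire it before kicking off the sync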
+ m_lock.Unlock();
+ update_progress("IMAGE_SYNC");
+ m_lock.Lock();
+
+ m_image_sync->send();
+ return;
+ }
+ }
+
+ dout(10) << "request canceled" << dendl;
+ close_remote_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_image_sync(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ {
+ Mutex::Locker locker(m_lock);
+ m_image_sync->put();
+ m_image_sync = nullptr;
+
+ if (m_canceled) {
+ dout(10) << "request canceled" << dendl;
+ m_ret_val = -ECANCELED;
+ }
+
+ if (r < 0) {
+ derr << "failed to sync remote image: " << cpp_strerror(r) << dendl;
+ m_ret_val = r;
+ }
+ }
+
+ close_remote_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::close_local_image() {
+ dout(15) << dendl;
+
+ update_progress("CLOSE_LOCAL_IMAGE");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_close_local_image>(
+ this);
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ m_local_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_close_local_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error encountered closing local image: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ close_remote_image();
+}
+
+template <typename I>
+void BootstrapRequest<I>::close_remote_image() {
+ dout(15) << dendl;
+
+ update_progress("CLOSE_REMOTE_IMAGE");
+
+ Context *ctx = create_context_callback<
+ BootstrapRequest<I>, &BootstrapRequest<I>::handle_close_remote_image>(
+ this);
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ &m_remote_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void BootstrapRequest<I>::handle_close_remote_image(int r) {
+ dout(15) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error encountered closing remote image: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(m_ret_val);
+}
+
+template <typename I>
+void BootstrapRequest<I>::update_progress(const std::string &description) {
+ dout(15) << description << dendl;
+
+ if (m_progress_ctx) {
+ m_progress_ctx->update_progress(description);
+ }
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::BootstrapRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
new file mode 100644
index 00000000..ea9f8565
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "tools/rbd_mirror/BaseRequest.h"
+#include "tools/rbd_mirror/Types.h"
+#include <list>
+#include <string>
+
+class Context;
+class ContextWQ;
+class Mutex;
+class SafeTimer;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+
+class ProgressContext;
+
+template <typename> class ImageSync;
+template <typename> class InstanceWatcher;
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class BootstrapRequest : public BaseRequest {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
+ typedef rbd::mirror::ProgressContext ProgressContext;
+
+ static BootstrapRequest* create(
+ Threads<ImageCtxT>* threads,
+ librados::IoCtx &local_io_ctx,
+ librados::IoCtx &remote_io_ctx,
+ InstanceWatcher<ImageCtxT> *instance_watcher,
+ ImageCtxT **local_image_ctx,
+ const std::string &local_image_id,
+ const std::string &remote_image_id,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const std::string &remote_mirror_uuid,
+ Journaler *journaler,
+ cls::journal::ClientState *client_state,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish,
+ bool *do_resync,
+ ProgressContext *progress_ctx = nullptr) {
+ return new BootstrapRequest(threads, local_io_ctx, remote_io_ctx,
+ instance_watcher, local_image_ctx,
+ local_image_id, remote_image_id,
+ global_image_id, local_mirror_uuid,
+ remote_mirror_uuid, journaler, client_state,
+ client_meta, on_finish, do_resync,
+ progress_ctx);
+ }
+
+ BootstrapRequest(Threads<ImageCtxT>* threads,
+ librados::IoCtx &local_io_ctx,
+ librados::IoCtx &remote_io_ctx,
+ InstanceWatcher<ImageCtxT> *instance_watcher,
+ ImageCtxT **local_image_ctx,
+ const std::string &local_image_id,
+ const std::string &remote_image_id,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const std::string &remote_mirror_uuid, Journaler *journaler,
+ cls::journal::ClientState *client_state,
+ MirrorPeerClientMeta *client_meta, Context *on_finish,
+ bool *do_resync, ProgressContext *progress_ctx = nullptr);
+ ~BootstrapRequest() override;
+
+ bool is_syncing() const;
+
+ void send() override;
+ void cancel() override;
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_REMOTE_TAG_CLASS * * * * * * * * * * * * * * * * * *
+ * | * (error)
+ * v *
+ * OPEN_REMOTE_IMAGE * * * * * * * * * * * * * * * * * * *
+ * | *
+ * |/--------------------------------------------------*---\
+ * v * |
+ * IS_PRIMARY * * * * * * * * * * * * * * * * * * * * * * |
+ * | * * |
+ * | (remote image primary, no local image id) * * |
+ * \----> UPDATE_CLIENT_IMAGE * * * * * * * * * * * * |
+ * | | ^ * * |
+ * | | * (duplicate image id) * * |
+ * | v * * * |
+ * \----> CREATE_LOCAL_IMAGE * * * * * * * * * * * * * |
+ * | | * * |
+ * | v * * |
+ * | (remote image primary) * * |
+ * \----> OPEN_LOCAL_IMAGE * * * * * * * * * * * * * * |
+ * | | . * * |
+ * | | . (image doesn't exist) * * |
+ * | | . . > UNREGISTER_CLIENT * * * * * * * |
+ * | | | * * |
+ * | | v * * |
+ * | | REGISTER_CLIENT * * * * * * * * |
+ * | | | * * |
+ * | | \-----------------------*---*---/
+ * | | * *
+ * | v (skip if not needed) * *
+ * | GET_REMOTE_TAGS * * * * * * * * *
+ * | | * * *
+ * | v (skip if not needed) v * *
+ * | IMAGE_SYNC * * * > CLOSE_LOCAL_IMAGE * *
+ * | | | * *
+ * | \-----------------\ /-----/ * *
+ * | | * *
+ * | | * *
+ * | (skip if not needed) | * *
+ * \----> UPDATE_CLIENT_STATE *|* * * * * * * * * * *
+ * | | * *
+ * /-----------/----------------/ * *
+ * | * *
+ * v * *
+ * CLOSE_REMOTE_IMAGE < * * * * * * * * * * * * * * * * *
+ * | *
+ * v *
+ * <finish> < * * * * * * * * * * * * * * * * * * * * * * *
+ *
+ * @endverbatim
+ */
+ typedef std::list<cls::journal::Tag> Tags;
+
+ Threads<ImageCtxT>* m_threads;
+ librados::IoCtx &m_local_io_ctx;
+ librados::IoCtx &m_remote_io_ctx;
+ InstanceWatcher<ImageCtxT> *m_instance_watcher;
+ ImageCtxT **m_local_image_ctx;
+ std::string m_local_image_id;
+ std::string m_remote_image_id;
+ std::string m_global_image_id;
+ std::string m_local_mirror_uuid;
+ std::string m_remote_mirror_uuid;
+ Journaler *m_journaler;
+ cls::journal::ClientState *m_client_state;
+ MirrorPeerClientMeta *m_client_meta;
+ ProgressContext *m_progress_ctx;
+ bool *m_do_resync;
+
+ mutable Mutex m_lock;
+ bool m_canceled = false;
+
+ Tags m_remote_tags;
+ cls::journal::Client m_client;
+ uint64_t m_remote_tag_class = 0;
+ ImageCtxT *m_remote_image_ctx = nullptr;
+ bool m_primary = false;
+ int m_ret_val = 0;
+ ImageSync<ImageCtxT> *m_image_sync = nullptr;
+
+ uint64_t m_local_tag_tid = 0;
+ librbd::journal::TagData m_local_tag_data;
+
+ bufferlist m_out_bl;
+
+ void get_remote_tag_class();
+ void handle_get_remote_tag_class(int r);
+
+ void open_remote_image();
+ void handle_open_remote_image(int r);
+
+ void is_primary();
+ void handle_is_primary(int r);
+
+ void update_client_state();
+ void handle_update_client_state(int r);
+
+ void open_local_image();
+ void handle_open_local_image(int r);
+
+ void unregister_client();
+ void handle_unregister_client(int r);
+
+ void register_client();
+ void handle_register_client(int r);
+
+ void create_local_image();
+ void handle_create_local_image(int r);
+
+ void update_client_image();
+ void handle_update_client_image(int r);
+
+ void get_remote_tags();
+ void handle_get_remote_tags(int r);
+
+ void image_sync();
+ void handle_image_sync(int r);
+
+ void close_local_image();
+ void handle_close_local_image(int r);
+
+ void close_remote_image();
+ void handle_close_remote_image(int r);
+
+ void update_progress(const std::string &description);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::BootstrapRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_BOOTSTRAP_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
new file mode 100644
index 00000000..5b754823
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CloseImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::CloseImageRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+CloseImageRequest<I>::CloseImageRequest(I **image_ctx, Context *on_finish)
+ : m_image_ctx(image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void CloseImageRequest<I>::send() {
+ close_image();
+}
+
+template <typename I>
+void CloseImageRequest<I>::close_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ CloseImageRequest<I>, &CloseImageRequest<I>::handle_close_image>(this);
+ (*m_image_ctx)->state->close(ctx);
+}
+
+template <typename I>
+void CloseImageRequest<I>::handle_close_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": error encountered while closing image: " << cpp_strerror(r)
+ << dendl;
+ }
+
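+  // free the image context and clear the caller's pointer so it reflects the
+  // closed image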
+ delete *m_image_ctx;
+ *m_image_ctx = nullptr;
+
+ m_on_finish->complete(0);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::CloseImageRequest<librbd::ImageCtx>;
+
diff --git a/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
new file mode 100644
index 00000000..02481369
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CloseImageRequest.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class Context;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CloseImageRequest {
+public:
+ static CloseImageRequest* create(ImageCtxT **image_ctx, Context *on_finish) {
+ return new CloseImageRequest(image_ctx, on_finish);
+ }
+
+ CloseImageRequest(ImageCtxT **image_ctx, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * CLOSE_IMAGE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ ImageCtxT **m_image_ctx;
+ Context *m_on_finish;
+
+ void close_image();
+ void handle_close_image(int r);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::CloseImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_CLOSE_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc
new file mode 100644
index 00000000..8d8236b2
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc
@@ -0,0 +1,506 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "CreateImageRequest.h"
+#include "CloseImageRequest.h"
+#include "OpenImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "journal/Journaler.h"
+#include "journal/Settings.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/internal.h"
+#include "librbd/Utils.h"
+#include "librbd/image/CreateRequest.h"
+#include "librbd/image/CloneRequest.h"
+#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::CreateImageRequest: " \
+ << this << " " << __func__ << ": "
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename I>
+CreateImageRequest<I>::CreateImageRequest(Threads<I>* threads,
+ librados::IoCtx &local_io_ctx,
+ const std::string &global_image_id,
+ const std::string &remote_mirror_uuid,
+ const std::string &local_image_name,
+ const std::string &local_image_id,
+ I *remote_image_ctx,
+ Context *on_finish)
+ : m_threads(threads), m_local_io_ctx(local_io_ctx),
+ m_global_image_id(global_image_id),
+ m_remote_mirror_uuid(remote_mirror_uuid),
+ m_local_image_name(local_image_name), m_local_image_id(local_image_id),
+ m_remote_image_ctx(remote_image_ctx), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void CreateImageRequest<I>::send() {
+ int r = validate_parent();
+ if (r < 0) {
+ error(r);
+ return;
+ }
+
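+  // a valid parent spec means the remote image is a clone; otherwise create a
+  // plain (non-clone) local image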
+ if (m_remote_parent_spec.pool_id == -1) {
+ create_image();
+ } else {
+ get_local_parent_mirror_uuid();
+ }
+}
+
+template <typename I>
+void CreateImageRequest<I>::create_image() {
+ dout(10) << dendl;
+
+ using klass = CreateImageRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_create_image>(this);
+
+ RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
+
+ auto& config{
+ reinterpret_cast<CephContext*>(m_local_io_ctx.cct())->_conf};
+
+ librbd::ImageOptions image_options;
+ populate_image_options(&image_options);
+
+ auto req = librbd::image::CreateRequest<I>::create(
+ config, m_local_io_ctx, m_local_image_name, m_local_image_id,
+ m_remote_image_ctx->size, image_options, m_global_image_id,
+ m_remote_mirror_uuid, false, m_remote_image_ctx->op_work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_create_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -EBADF) {
+ dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to create local image: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void CreateImageRequest<I>::get_local_parent_mirror_uuid() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_uuid_get_start(&op);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_get_local_parent_mirror_uuid>(this);
+ m_out_bl.clear();
+ int r = m_local_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_get_local_parent_mirror_uuid(int r) {
+ if (r >= 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_uuid_get_finish(
+ &it, &m_local_parent_mirror_uuid);
+ if (r >= 0 && m_local_parent_mirror_uuid.empty()) {
+ r = -ENOENT;
+ }
+ }
+
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(5) << "local parent mirror uuid missing" << dendl;
+ } else {
+ derr << "failed to retrieve local parent mirror uuid: " << cpp_strerror(r)
+ << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ dout(15) << "local_parent_mirror_uuid=" << m_local_parent_mirror_uuid
+ << dendl;
+ get_remote_parent_client_state();
+}
+
+template <typename I>
+void CreateImageRequest<I>::get_remote_parent_client_state() {
+ dout(10) << dendl;
+
+ m_remote_journaler = new Journaler(m_threads->work_queue, m_threads->timer,
+ &m_threads->timer_lock,
+ m_remote_parent_io_ctx,
+ m_remote_parent_spec.image_id,
+ m_local_parent_mirror_uuid, {});
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_get_remote_parent_client_state>(this));
+ m_remote_journaler->get_client(m_local_parent_mirror_uuid, &m_client, ctx);
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_get_remote_parent_client_state(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ delete m_remote_journaler;
+ m_remote_journaler = nullptr;
+
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
+ if (r == -ENOENT) {
+ dout(15) << "client not registered to parent image" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to retrieve parent client: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ } else if (!util::decode_client_meta(m_client, &mirror_peer_client_meta)) {
+ // require operator intervention since the data is corrupt
+ derr << "failed to decode parent client: " << cpp_strerror(r) << dendl;
+ finish(-EBADMSG);
+ return;
+ } else if (mirror_peer_client_meta.state !=
+ librbd::journal::MIRROR_PEER_STATE_REPLAYING) {
+ // avoid possible race w/ incomplete parent image since the parent snapshot
+ // might be deleted if the sync restarts
+ dout(15) << "parent image still syncing" << dendl;
+ finish(-ENOENT);
+ return;
+ }
+
+ get_parent_global_image_id();
+}
+
+
+template <typename I>
+void CreateImageRequest<I>::get_parent_global_image_id() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_start(&op,
+ m_remote_parent_spec.image_id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_get_parent_global_image_id>(this);
+ m_out_bl.clear();
+ int r = m_remote_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_get_parent_global_image_id(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == 0) {
+ cls::rbd::MirrorImage mirror_image;
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image);
+ if (r == 0) {
+ m_parent_global_image_id = mirror_image.global_image_id;
+ dout(15) << "parent_global_image_id=" << m_parent_global_image_id
+ << dendl;
+ }
+ }
+
+ if (r == -ENOENT) {
+ dout(10) << "parent image " << m_remote_parent_spec.image_id
+ << " not mirrored" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to retrieve global image id for parent image "
+ << m_remote_parent_spec.image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ get_local_parent_image_id();
+}
+
+template <typename I>
+void CreateImageRequest<I>::get_local_parent_image_id() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_image_id_start(
+ &op, m_parent_global_image_id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_get_local_parent_image_id>(this);
+ m_out_bl.clear();
+ int r = m_local_parent_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_get_local_parent_image_id(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_image_id_finish(
+ &iter, &m_local_parent_spec.image_id);
+ }
+
+ if (r == -ENOENT) {
+ dout(10) << "parent image " << m_parent_global_image_id << " not "
+ << "registered locally" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to retrieve local image id for parent image "
+ << m_parent_global_image_id << ": " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ open_remote_parent_image();
+}
+
+template <typename I>
+void CreateImageRequest<I>::open_remote_parent_image() {
+ dout(10) << dendl;
+
+ Context *ctx = create_context_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_open_remote_parent_image>(this);
+ OpenImageRequest<I> *request = OpenImageRequest<I>::create(
+ m_remote_parent_io_ctx, &m_remote_parent_image_ctx,
+ m_remote_parent_spec.image_id, true, ctx);
+ request->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_open_remote_parent_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "failed to open remote parent image " << m_parent_pool_name << "/"
+ << m_remote_parent_spec.image_id << dendl;
+ finish(r);
+ return;
+ }
+
+ clone_image();
+}
+
+template <typename I>
+void CreateImageRequest<I>::clone_image() {
+ dout(10) << dendl;
+
+ std::string snap_name;
+ cls::rbd::SnapshotNamespace snap_namespace;
+ {
+ RWLock::RLocker remote_snap_locker(m_remote_parent_image_ctx->snap_lock);
+ auto it = m_remote_parent_image_ctx->snap_info.find(
+ m_remote_parent_spec.snap_id);
+ if (it != m_remote_parent_image_ctx->snap_info.end()) {
+ snap_name = it->second.name;
+ snap_namespace = it->second.snap_namespace;
+ }
+ }
+
+ librbd::ImageOptions opts;
+ populate_image_options(&opts);
+
+ auto& config{
+ reinterpret_cast<CephContext*>(m_local_io_ctx.cct())->_conf};
+
+ using klass = CreateImageRequest<I>;
+ Context *ctx = create_context_callback<
+ klass, &klass::handle_clone_image>(this);
+
+ librbd::image::CloneRequest<I> *req = librbd::image::CloneRequest<I>::create(
+ config, m_local_parent_io_ctx, m_local_parent_spec.image_id, snap_name,
+ CEPH_NOSNAP, m_local_io_ctx, m_local_image_name, m_local_image_id, opts,
+ m_global_image_id, m_remote_mirror_uuid, m_remote_image_ctx->op_work_queue,
+ ctx);
+ req->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_clone_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r == -EBADF) {
+ dout(5) << "image id " << m_local_image_id << " already in-use" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << "failed to clone image " << m_parent_pool_name << "/"
+ << m_remote_parent_spec.image_id << " to "
+ << m_local_image_name << dendl;
+ m_ret_val = r;
+ }
+
+ close_remote_parent_image();
+}
+
+template <typename I>
+void CreateImageRequest<I>::close_remote_parent_image() {
+ dout(10) << dendl;
+ Context *ctx = create_context_callback<
+ CreateImageRequest<I>,
+ &CreateImageRequest<I>::handle_close_remote_parent_image>(this);
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ &m_remote_parent_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void CreateImageRequest<I>::handle_close_remote_parent_image(int r) {
+ dout(10) << "r=" << r << dendl;
+ if (r < 0) {
+ derr << "error encountered closing remote parent image: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ finish(m_ret_val);
+}
+
+template <typename I>
+void CreateImageRequest<I>::error(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_threads->work_queue->queue(create_context_callback<
+ CreateImageRequest<I>, &CreateImageRequest<I>::finish>(this), r);
+}
+
+template <typename I>
+void CreateImageRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+template <typename I>
+int CreateImageRequest<I>::validate_parent() {
+ RWLock::RLocker owner_locker(m_remote_image_ctx->owner_lock);
+ RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
+
+ m_remote_parent_spec = m_remote_image_ctx->parent_md.spec;
+
+ // scan all remote snapshots for a linked parent
+ for (auto &snap_info_pair : m_remote_image_ctx->snap_info) {
+ auto &parent_spec = snap_info_pair.second.parent.spec;
+ if (parent_spec.pool_id == -1) {
+ continue;
+ } else if (m_remote_parent_spec.pool_id == -1) {
+ m_remote_parent_spec = parent_spec;
+ continue;
+ }
+
+ if (m_remote_parent_spec != parent_spec) {
+ derr << "remote image parent spec mismatch" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ if (m_remote_parent_spec.pool_id == -1) {
+ return 0;
+ }
+
+ // map remote parent pool to local parent pool
+ librados::Rados remote_rados(m_remote_image_ctx->md_ctx);
+ int r = remote_rados.ioctx_create2(m_remote_parent_spec.pool_id,
+ m_remote_parent_io_ctx);
+ if (r < 0) {
+ derr << "failed to open remote parent pool " << m_remote_parent_spec.pool_id
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ m_parent_pool_name = m_remote_parent_io_ctx.get_pool_name();
+
+ librados::Rados local_rados(m_local_io_ctx);
+ r = local_rados.ioctx_create(m_parent_pool_name.c_str(),
+ m_local_parent_io_ctx);
+ if (r < 0) {
+ derr << "failed to open local parent pool " << m_parent_pool_name << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+template <typename I>
+void CreateImageRequest<I>::populate_image_options(
+ librbd::ImageOptions* image_options) {
+ image_options->set(RBD_IMAGE_OPTION_FEATURES,
+ m_remote_image_ctx->features);
+ image_options->set(RBD_IMAGE_OPTION_ORDER, m_remote_image_ctx->order);
+ image_options->set(RBD_IMAGE_OPTION_STRIPE_UNIT,
+ m_remote_image_ctx->stripe_unit);
+ image_options->set(RBD_IMAGE_OPTION_STRIPE_COUNT,
+ m_remote_image_ctx->stripe_count);
+
+ // Determine the data pool for the local image as follows:
+ // 1. If the local pool has a default data pool, use it.
+ // 2. If the remote image has a data pool different from its metadata pool and
+ // a pool with the same name exists locally, use it.
+ // 3. Don't set the data pool explicitly.
+ std::string data_pool;
+ librados::Rados local_rados(m_local_io_ctx);
+ auto default_data_pool = g_ceph_context->_conf.get_val<std::string>("rbd_default_data_pool");
+ auto remote_md_pool = m_remote_image_ctx->md_ctx.get_pool_name();
+ auto remote_data_pool = m_remote_image_ctx->data_ctx.get_pool_name();
+
+ if (default_data_pool != "") {
+ data_pool = default_data_pool;
+ } else if (remote_data_pool != remote_md_pool) {
+ if (local_rados.pool_lookup(remote_data_pool.c_str()) >= 0) {
+ data_pool = remote_data_pool;
+ }
+ }
+
+ if (data_pool != "") {
+ image_options->set(RBD_IMAGE_OPTION_DATA_POOL, data_pool);
+ }
+
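+  // for cloned images, match the remote image's clone format: v2 if the
+  // remote advertises the clone-child op feature, otherwise v1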
+ if (m_remote_parent_spec.pool_id != -1) {
+ uint64_t clone_format = 1;
+ if (m_remote_image_ctx->test_op_features(
+ RBD_OPERATION_FEATURE_CLONE_CHILD)) {
+ clone_format = 2;
+ }
+ image_options->set(RBD_IMAGE_OPTION_CLONE_FORMAT, clone_format);
+ }
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::CreateImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h
new file mode 100644
index 00000000..0b20da52
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/CreateImageRequest.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+namespace librbd { class ImageOptions; }
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class CreateImageRequest {
+public:
+ static CreateImageRequest *create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &local_io_ctx,
+ const std::string &global_image_id,
+ const std::string &remote_mirror_uuid,
+ const std::string &local_image_name,
+ const std::string &local_image_id,
+ ImageCtxT *remote_image_ctx,
+ Context *on_finish) {
+ return new CreateImageRequest(threads, local_io_ctx, global_image_id,
+ remote_mirror_uuid, local_image_name,
+ local_image_id, remote_image_ctx, on_finish);
+ }
+
+ CreateImageRequest(Threads<ImageCtxT> *threads, librados::IoCtx &local_io_ctx,
+ const std::string &global_image_id,
+ const std::string &remote_mirror_uuid,
+ const std::string &local_image_name,
+ const std::string &local_image_id,
+ ImageCtxT *remote_image_ctx,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * | *
+ * | (non-clone) *
+ * |\------------> CREATE_IMAGE ---------------------\ * (error)
+ * | | *
+ * | (clone) | *
+ * \-------------> GET_LOCAL_PARENT_MIRROR_UUID * * | * * * *
+ * | | * *
+ * v | *
+ * GET_REMOTE_PARENT_CLIENT_STATE * | * * * *
+ * | | * *
+ * v | *
+ * GET_PARENT_GLOBAL_IMAGE_ID * * * | * * * *
+ * | | * *
+ * v | *
+ * GET_LOCAL_PARENT_IMAGE_ID * * * * | * * * *
+ * | | * *
+ * v | *
+ * OPEN_REMOTE_PARENT * * * * * * * | * * * *
+ * | | * *
+ * v | *
+ * CLONE_IMAGE | *
+ * | | *
+ * v | *
+ * CLOSE_REMOTE_PARENT | *
+ * | v *
+ * \------------------------> <finish> < * *
+ * @endverbatim
+ */
+
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx &m_local_io_ctx;
+ std::string m_global_image_id;
+ std::string m_remote_mirror_uuid;
+ std::string m_local_image_name;
+ std::string m_local_image_id;
+ ImageCtxT *m_remote_image_ctx;
+ Context *m_on_finish;
+
+ librados::IoCtx m_remote_parent_io_ctx;
+ std::string m_local_parent_mirror_uuid;
+ Journaler *m_remote_journaler = nullptr;
+ ImageCtxT *m_remote_parent_image_ctx = nullptr;
+ cls::rbd::ParentImageSpec m_remote_parent_spec;
+
+ librados::IoCtx m_local_parent_io_ctx;
+ cls::rbd::ParentImageSpec m_local_parent_spec;
+
+ bufferlist m_out_bl;
+ std::string m_parent_global_image_id;
+ std::string m_parent_pool_name;
+ cls::journal::Client m_client;
+ int m_ret_val = 0;
+
+ void create_image();
+ void handle_create_image(int r);
+
+ void get_local_parent_mirror_uuid();
+ void handle_get_local_parent_mirror_uuid(int r);
+
+ void get_remote_parent_client_state();
+ void handle_get_remote_parent_client_state(int r);
+
+ void get_parent_global_image_id();
+ void handle_get_parent_global_image_id(int r);
+
+ void get_local_parent_image_id();
+ void handle_get_local_parent_image_id(int r);
+
+ void open_remote_parent_image();
+ void handle_open_remote_parent_image(int r);
+
+ void clone_image();
+ void handle_clone_image(int r);
+
+ void close_remote_parent_image();
+ void handle_close_remote_parent_image(int r);
+
+ void error(int r);
+ void finish(int r);
+
+ int validate_parent();
+
+ void populate_image_options(librbd::ImageOptions* image_options);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::CreateImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_CREATE_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc
new file mode 100644
index 00000000..6314eb7d
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.cc
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "EventPreprocessor.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
+#include <boost/variant.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::EventPreprocessor: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+EventPreprocessor<I>::EventPreprocessor(I &local_image_ctx,
+ Journaler &remote_journaler,
+ const std::string &local_mirror_uuid,
+ MirrorPeerClientMeta *client_meta,
+ ContextWQ *work_queue)
+ : m_local_image_ctx(local_image_ctx), m_remote_journaler(remote_journaler),
+ m_local_mirror_uuid(local_mirror_uuid), m_client_meta(client_meta),
+ m_work_queue(work_queue) {
+}
+
+template <typename I>
+EventPreprocessor<I>::~EventPreprocessor() {
+ ceph_assert(!m_in_progress);
+}
+
+template <typename I>
+bool EventPreprocessor<I>::is_required(const EventEntry &event_entry) {
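+  // preprocessing is required if stale snapshot mappings need pruning or the
+  // event is a snapshot rename that needs remapping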
+ SnapSeqs snap_seqs(m_client_meta->snap_seqs);
+ return (prune_snap_map(&snap_seqs) ||
+ event_entry.get_event_type() ==
+ librbd::journal::EVENT_TYPE_SNAP_RENAME);
+}
+
+template <typename I>
+void EventPreprocessor<I>::preprocess(EventEntry *event_entry,
+ Context *on_finish) {
+ ceph_assert(!m_in_progress);
+ m_in_progress = true;
+ m_event_entry = event_entry;
+ m_on_finish = on_finish;
+
+ refresh_image();
+}
+
+template <typename I>
+void EventPreprocessor<I>::refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ EventPreprocessor<I>, &EventPreprocessor<I>::handle_refresh_image>(this);
+ m_local_image_ctx.state->refresh(ctx);
+}
+
+template <typename I>
+void EventPreprocessor<I>::handle_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "error encountered during image refresh: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ preprocess_event();
+}
+
+template <typename I>
+void EventPreprocessor<I>::preprocess_event() {
+ dout(20) << dendl;
+
+ m_snap_seqs = m_client_meta->snap_seqs;
+ m_snap_seqs_updated = prune_snap_map(&m_snap_seqs);
+
+ int r = boost::apply_visitor(PreprocessEventVisitor(this),
+ m_event_entry->event);
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ update_client();
+}
+
+template <typename I>
+int EventPreprocessor<I>::preprocess_snap_rename(
+ librbd::journal::SnapRenameEvent &event) {
+ dout(20) << ": "
+ << "remote_snap_id=" << event.snap_id << ", "
+ << "src_snap_name=" << event.src_snap_name << ", "
+ << "dest_snap_name=" << event.dst_snap_name << dendl;
+
+ auto snap_seq_it = m_snap_seqs.find(event.snap_id);
+ if (snap_seq_it != m_snap_seqs.end()) {
+ dout(20) << ": remapping remote snap id " << snap_seq_it->first << " "
+ << "to local snap id " << snap_seq_it->second << dendl;
+ event.snap_id = snap_seq_it->second;
+ return 0;
+ }
+
+ auto snap_id_it = m_local_image_ctx.snap_ids.find({cls::rbd::UserSnapshotNamespace(),
+ event.src_snap_name});
+ if (snap_id_it == m_local_image_ctx.snap_ids.end()) {
+ dout(20) << ": cannot map remote snapshot '" << event.src_snap_name << "' "
+ << "to local snapshot" << dendl;
+ event.snap_id = CEPH_NOSNAP;
+ return -ENOENT;
+ }
+
+ dout(20) << ": mapping remote snap id " << event.snap_id << " "
+ << "to local snap id " << snap_id_it->second << dendl;
+ m_snap_seqs_updated = true;
+ m_snap_seqs[event.snap_id] = snap_id_it->second;
+ event.snap_id = snap_id_it->second;
+ return 0;
+}
+
+template <typename I>
+void EventPreprocessor<I>::update_client() {
+ if (!m_snap_seqs_updated) {
+ finish(0);
+ return;
+ }
+
+ dout(20) << dendl;
+ librbd::journal::MirrorPeerClientMeta client_meta(*m_client_meta);
+ client_meta.snap_seqs = m_snap_seqs;
+
+ librbd::journal::ClientData client_data(client_meta);
+ bufferlist data_bl;
+ encode(client_data, data_bl);
+
+ Context *ctx = create_context_callback<
+ EventPreprocessor<I>, &EventPreprocessor<I>::handle_update_client>(
+ this);
+ m_remote_journaler.update_client(data_bl, ctx);
+}
+
+template <typename I>
+void EventPreprocessor<I>::handle_update_client(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to update mirror peer journal client: "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ m_client_meta->snap_seqs = m_snap_seqs;
+ finish(0);
+}
+
+template <typename I>
+bool EventPreprocessor<I>::prune_snap_map(SnapSeqs *snap_seqs) {
+ bool pruned = false;
+
+ RWLock::RLocker snap_locker(m_local_image_ctx.snap_lock);
+ for (auto it = snap_seqs->begin(); it != snap_seqs->end(); ) {
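+    // post-increment before a potential erase so the loop iterator stays valid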
+ auto current_it(it++);
+ if (m_local_image_ctx.snap_info.count(current_it->second) == 0) {
+ snap_seqs->erase(current_it);
+ pruned = true;
+ }
+ }
+ return pruned;
+}
+
+template <typename I>
+void EventPreprocessor<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_finish = m_on_finish;
+ m_on_finish = nullptr;
+ m_event_entry = nullptr;
+ m_in_progress = false;
+ m_snap_seqs_updated = false;
+ m_work_queue->queue(on_finish, r);
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::EventPreprocessor<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/EventPreprocessor.h b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.h
new file mode 100644
index 00000000..67aeea0b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/EventPreprocessor.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H
+#define RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H
+
+#include "include/int_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <map>
+#include <string>
+#include <boost/variant/static_visitor.hpp>
+
+struct Context;
+struct ContextWQ;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class EventPreprocessor {
+public:
+ using Journaler = typename librbd::journal::TypeTraits<ImageCtxT>::Journaler;
+ using EventEntry = librbd::journal::EventEntry;
+ using MirrorPeerClientMeta = librbd::journal::MirrorPeerClientMeta;
+
+ static EventPreprocessor *create(ImageCtxT &local_image_ctx,
+ Journaler &remote_journaler,
+ const std::string &local_mirror_uuid,
+ MirrorPeerClientMeta *client_meta,
+ ContextWQ *work_queue) {
+ return new EventPreprocessor(local_image_ctx, remote_journaler,
+ local_mirror_uuid, client_meta, work_queue);
+ }
+
+ static void destroy(EventPreprocessor* processor) {
+ delete processor;
+ }
+
+ EventPreprocessor(ImageCtxT &local_image_ctx, Journaler &remote_journaler,
+ const std::string &local_mirror_uuid,
+ MirrorPeerClientMeta *client_meta, ContextWQ *work_queue);
+ ~EventPreprocessor();
+
+ bool is_required(const EventEntry &event_entry);
+ void preprocess(EventEntry *event_entry, Context *on_finish);
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v (skip if not required)
+ * REFRESH_IMAGE
+ * |
+ * v (skip if not required)
+ * PREPROCESS_EVENT
+ * |
+ * v (skip if not required)
+ * UPDATE_CLIENT
+ *
+ * @endverbatim
+ */
+
+ typedef std::map<uint64_t, uint64_t> SnapSeqs;
+
+ class PreprocessEventVisitor : public boost::static_visitor<int> {
+ public:
+ EventPreprocessor *event_preprocessor;
+
+ PreprocessEventVisitor(EventPreprocessor *event_preprocessor)
+ : event_preprocessor(event_preprocessor) {
+ }
+
+ template <typename T>
+ inline int operator()(T&) const {
+ return 0;
+ }
+ inline int operator()(librbd::journal::SnapRenameEvent &event) const {
+ return event_preprocessor->preprocess_snap_rename(event);
+ }
+ };
+
+ ImageCtxT &m_local_image_ctx;
+ Journaler &m_remote_journaler;
+ std::string m_local_mirror_uuid;
+ MirrorPeerClientMeta *m_client_meta;
+ ContextWQ *m_work_queue;
+
+ bool m_in_progress = false;
+ EventEntry *m_event_entry = nullptr;
+ Context *m_on_finish = nullptr;
+
+ SnapSeqs m_snap_seqs;
+ bool m_snap_seqs_updated = false;
+
+ bool prune_snap_map(SnapSeqs *snap_seqs);
+
+ void refresh_image();
+ void handle_refresh_image(int r);
+
+ void preprocess_event();
+ int preprocess_snap_rename(librbd::journal::SnapRenameEvent &event);
+
+ void update_client();
+ void handle_update_client(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::EventPreprocessor<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_EVENT_PREPROCESSOR_H
diff --git a/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc
new file mode 100644
index 00000000..74e97537
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "GetMirrorImageIdRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::send() {
+ dout(20) << dendl;
+ get_image_id();
+}
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::get_image_id() {
+ dout(20) << dendl;
+
+  // attempt to cross-reference an image id by the global image id
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ GetMirrorImageIdRequest<I>,
+ &GetMirrorImageIdRequest<I>::handle_get_image_id>(
+ this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::handle_get_image_id(int r) {
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_image_id_finish(
+ &iter, m_image_id);
+ }
+
+ dout(20) << "r=" << r << ", "
+ << "image_id=" << *m_image_id << dendl;
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << "global image " << m_global_image_id << " not registered"
+ << dendl;
+ } else {
+ derr << "failed to retrieve image id: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void GetMirrorImageIdRequest<I>::finish(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::GetMirrorImageIdRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h
new file mode 100644
index 00000000..b2664513
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+
+namespace librbd { struct ImageCtx; }
+
+struct Context;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class GetMirrorImageIdRequest {
+public:
+ static GetMirrorImageIdRequest *create(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *image_id,
+ Context *on_finish) {
+ return new GetMirrorImageIdRequest(io_ctx, global_image_id, image_id,
+ on_finish);
+ }
+
+ GetMirrorImageIdRequest(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *image_id,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_global_image_id(global_image_id),
+ m_image_id(image_id), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_IMAGE_ID
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_global_image_id;
+ std::string *m_image_id;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ void get_image_id();
+ void handle_get_image_id(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::GetMirrorImageIdRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_GET_MIRROR_IMAGE_ID_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc
new file mode 100644
index 00000000..54636fdb
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.cc
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "IsPrimaryRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::IsPrimaryRequest: " \
+ << this << " " << __func__ << " "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+IsPrimaryRequest<I>::IsPrimaryRequest(I *image_ctx, bool *primary,
+ Context *on_finish)
+ : m_image_ctx(image_ctx), m_primary(primary), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void IsPrimaryRequest<I>::send() {
+ send_get_mirror_state();
+}
+
+template <typename I>
+void IsPrimaryRequest<I>::send_get_mirror_state() {
+ dout(20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_start(&op, m_image_ctx->id);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ IsPrimaryRequest<I>, &IsPrimaryRequest<I>::handle_get_mirror_state>(this);
+ int r = m_image_ctx->md_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op,
+ &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void IsPrimaryRequest<I>::handle_get_mirror_state(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image);
+ if (r == 0) {
+ if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_ENABLED) {
+ send_is_tag_owner();
+ return;
+ } else if (mirror_image.state == cls::rbd::MIRROR_IMAGE_STATE_DISABLING) {
+ dout(5) << ": image mirroring is being disabled" << dendl;
+ r = -ENOENT;
+ } else {
+ derr << ": image mirroring is disabled" << dendl;
+ r = -EINVAL;
+ }
+ } else {
+ derr << ": failed to decode image mirror state: " << cpp_strerror(r)
+ << dendl;
+ }
+ } else if (r == -ENOENT) {
+ dout(5) << ": image is not mirrored" << dendl;
+ } else {
+ derr << ": failed to retrieve image mirror state: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void IsPrimaryRequest<I>::send_is_tag_owner() {
+ // deduce the class type for the journal to support unit tests
+ using Journal = typename std::decay<
+ typename std::remove_pointer<decltype(std::declval<I>().journal)>
+ ::type>::type;
+
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ IsPrimaryRequest<I>, &IsPrimaryRequest<I>::handle_is_tag_owner>(this);
+
+ Journal::is_tag_owner(m_image_ctx, m_primary, ctx);
+}
+
+template <typename I>
+void IsPrimaryRequest<I>::handle_is_tag_owner(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to query remote image tag owner: " << cpp_strerror(r)
+ << dendl;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void IsPrimaryRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::IsPrimaryRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h
new file mode 100644
index 00000000..ddb332cb
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/IsPrimaryRequest.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_IS_PRIMARY_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_IS_PRIMARY_REQUEST_H
+
+#include "include/buffer.h"
+
+class Context;
+class ContextWQ;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class IsPrimaryRequest {
+public:
+ static IsPrimaryRequest* create(ImageCtxT *image_ctx, bool *primary,
+ Context *on_finish) {
+ return new IsPrimaryRequest(image_ctx, primary, on_finish);
+ }
+
+ IsPrimaryRequest(ImageCtxT *image_ctx, bool *primary, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_MIRROR_STATE * * * * *
+ * | *
+ * v *
+ * IS_TAG_OWNER * * * * * * * (error)
+ * | *
+ * v *
+ * <finish> < * * * * * * * *
+ *
+ * @endverbatim
+ */
+ ImageCtxT *m_image_ctx;
+ bool *m_primary;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ void send_get_mirror_state();
+ void handle_get_mirror_state(int r);
+
+ void send_is_tag_owner();
+ void handle_is_tag_owner(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::IsPrimaryRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_IS_PRIMARY_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc
new file mode 100644
index 00000000..7f55745e
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "OpenImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Utils.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenImageRequest: " \
+ << this << " " << __func__ << " "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+OpenImageRequest<I>::OpenImageRequest(librados::IoCtx &io_ctx, I **image_ctx,
+ const std::string &image_id,
+ bool read_only, Context *on_finish)
+ : m_io_ctx(io_ctx), m_image_ctx(image_ctx), m_image_id(image_id),
+ m_read_only(read_only), m_on_finish(on_finish) {
+}
+
+template <typename I>
+void OpenImageRequest<I>::send() {
+ send_open_image();
+}
+
+template <typename I>
+void OpenImageRequest<I>::send_open_image() {
+ dout(20) << dendl;
+
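+  // open the image by id (no name lookup) in the requested read-only mode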
+ *m_image_ctx = I::create("", m_image_id, nullptr, m_io_ctx, m_read_only);
+
+ Context *ctx = create_context_callback<
+ OpenImageRequest<I>, &OpenImageRequest<I>::handle_open_image>(
+ this);
+ (*m_image_ctx)->state->open(0, ctx);
+}
+
+template <typename I>
+void OpenImageRequest<I>::handle_open_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to open image '" << m_image_id << "': "
+ << cpp_strerror(r) << dendl;
+ (*m_image_ctx)->destroy();
+ *m_image_ctx = nullptr;
+ }
+
+ finish(r);
+}
+
+template <typename I>
+void OpenImageRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::OpenImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h
new file mode 100644
index 00000000..01ab3117
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenImageRequest.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class Context;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class OpenImageRequest {
+public:
+ static OpenImageRequest* create(librados::IoCtx &io_ctx,
+ ImageCtxT **image_ctx,
+ const std::string &image_id,
+ bool read_only, Context *on_finish) {
+ return new OpenImageRequest(io_ctx, image_ctx, image_id, read_only,
+ on_finish);
+ }
+
+ OpenImageRequest(librados::IoCtx &io_ctx, ImageCtxT **image_ctx,
+ const std::string &image_id, bool read_only,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+ librados::IoCtx &m_io_ctx;
+ ImageCtxT **m_image_ctx;
+ std::string m_image_id;
+ bool m_read_only;
+ Context *m_on_finish;
+
+ void send_open_image();
+ void handle_open_image(int r);
+
+ void send_close_image(int r);
+ void handle_close_image(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::OpenImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_OPEN_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
new file mode 100644
index 00000000..87b141ca
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "CloseImageRequest.h"
+#include "IsPrimaryRequest.h"
+#include "OpenLocalImageRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "librbd/ExclusiveLock.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "librbd/exclusive_lock/Policy.h"
+#include "librbd/journal/Policy.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::OpenLocalImageRequest: " \
+ << this << " " << __func__ << " "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+
+namespace {
+
+template <typename I>
+struct MirrorExclusiveLockPolicy : public librbd::exclusive_lock::Policy {
+ I *image_ctx;
+
+ MirrorExclusiveLockPolicy(I *image_ctx) : image_ctx(image_ctx) {
+ }
+
+ bool may_auto_request_lock() override {
+ return false;
+ }
+
+ int lock_requested(bool force) override {
+ int r = -EROFS;
+ {
+ RWLock::RLocker owner_locker(image_ctx->owner_lock);
+ RWLock::RLocker snap_locker(image_ctx->snap_lock);
+ if (image_ctx->journal == nullptr || image_ctx->journal->is_tag_owner()) {
+ r = 0;
+ }
+ }
+
+ if (r == 0) {
+      // if the local image journal has been closed or if it was (force)
+      // promoted, allow the lock to be released to another client
+ image_ctx->exclusive_lock->release_lock(nullptr);
+ }
+ return r;
+ }
+
+ bool accept_blocked_request(
+ librbd::exclusive_lock::OperationRequestType request_type) override {
+ if (request_type ==
+ librbd::exclusive_lock::OPERATION_REQUEST_TYPE_TRASH_SNAP_REMOVE) {
+ return true;
+ }
+ return false;
+ }
+};
+
+struct MirrorJournalPolicy : public librbd::journal::Policy {
+ ContextWQ *work_queue;
+
+ MirrorJournalPolicy(ContextWQ *work_queue) : work_queue(work_queue) {
+ }
+
+ bool append_disabled() const override {
+ // avoid recording any events to the local journal
+ return true;
+ }
+ bool journal_disabled() const override {
+ return false;
+ }
+
+ void allocate_tag_on_lock(Context *on_finish) override {
+ // rbd-mirror will manually create tags by copying them from the peer
+ work_queue->queue(on_finish, 0);
+ }
+};
+
+} // anonymous namespace
+
+template <typename I>
+OpenLocalImageRequest<I>::OpenLocalImageRequest(librados::IoCtx &local_io_ctx,
+ I **local_image_ctx,
+ const std::string &local_image_id,
+ ContextWQ *work_queue,
+ Context *on_finish)
+ : m_local_io_ctx(local_io_ctx), m_local_image_ctx(local_image_ctx),
+ m_local_image_id(local_image_id), m_work_queue(work_queue),
+ m_on_finish(on_finish) {
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send() {
+ send_open_image();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_open_image() {
+ dout(20) << dendl;
+
+ *m_local_image_ctx = I::create("", m_local_image_id, nullptr,
+ m_local_io_ctx, false);
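+
+  // install mirror-specific policies before opening the image: the journal
+  // policy prevents replayed events from being re-recorded locally and the
+  // exclusive lock policy blocks automatic lock requests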
+ {
+ RWLock::WLocker owner_locker((*m_local_image_ctx)->owner_lock);
+ RWLock::WLocker snap_locker((*m_local_image_ctx)->snap_lock);
+ (*m_local_image_ctx)->set_exclusive_lock_policy(
+ new MirrorExclusiveLockPolicy<I>(*m_local_image_ctx));
+ (*m_local_image_ctx)->set_journal_policy(
+ new MirrorJournalPolicy(m_work_queue));
+ }
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_open_image>(
+ this);
+ (*m_local_image_ctx)->state->open(0, ctx);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_open_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << ": local image does not exist" << dendl;
+ } else {
+ derr << ": failed to open image '" << m_local_image_id << "': "
+ << cpp_strerror(r) << dendl;
+ }
+ (*m_local_image_ctx)->destroy();
+ *m_local_image_ctx = nullptr;
+ finish(r);
+ return;
+ }
+
+ send_is_primary();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_is_primary() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_is_primary>(
+ this);
+ IsPrimaryRequest<I> *request = IsPrimaryRequest<I>::create(*m_local_image_ctx,
+ &m_primary, ctx);
+ request->send();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_is_primary(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(5) << ": local image is not mirrored" << dendl;
+ send_close_image(r);
+ return;
+ } else if (r < 0) {
+ derr << ": error querying local image primary status: " << cpp_strerror(r)
+ << dendl;
+ send_close_image(r);
+ return;
+ }
+
+ // if the local image owns the tag -- don't steal the lock since
+ // we aren't going to mirror peer data into this image anyway
+ if (m_primary) {
+ dout(10) << ": local image is primary -- skipping image replay" << dendl;
+ send_close_image(-EREMOTEIO);
+ return;
+ }
+
+ send_lock_image();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_lock_image() {
+ dout(20) << dendl;
+
+ RWLock::RLocker owner_locker((*m_local_image_ctx)->owner_lock);
+ if ((*m_local_image_ctx)->exclusive_lock == nullptr) {
+ derr << ": image does not support exclusive lock" << dendl;
+ send_close_image(-EINVAL);
+ return;
+ }
+
+ // disallow any proxied maintenance operations before grabbing lock
+ (*m_local_image_ctx)->exclusive_lock->block_requests(-EROFS);
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_lock_image>(
+ this);
+
+ (*m_local_image_ctx)->exclusive_lock->acquire_lock(ctx);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_lock_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to lock image '" << m_local_image_id << "': "
+ << cpp_strerror(r) << dendl;
+ send_close_image(r);
+ return;
+ }
+
+ {
+ RWLock::RLocker owner_locker((*m_local_image_ctx)->owner_lock);
+ if ((*m_local_image_ctx)->exclusive_lock == nullptr ||
+ !(*m_local_image_ctx)->exclusive_lock->is_lock_owner()) {
+ derr << ": image is not locked" << dendl;
+ send_close_image(-EBUSY);
+ return;
+ }
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::send_close_image(int r) {
+ dout(20) << dendl;
+
+ if (m_ret_val == 0 && r < 0) {
+ m_ret_val = r;
+ }
+
+ Context *ctx = create_context_callback<
+ OpenLocalImageRequest<I>, &OpenLocalImageRequest<I>::handle_close_image>(
+ this);
+ CloseImageRequest<I> *request = CloseImageRequest<I>::create(
+ m_local_image_ctx, ctx);
+ request->send();
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::handle_close_image(int r) {
+ dout(20) << dendl;
+
+ ceph_assert(r == 0);
+ finish(m_ret_val);
+}
+
+template <typename I>
+void OpenLocalImageRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::OpenLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
new file mode 100644
index 00000000..58de545f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "librbd/ImageCtx.h"
+#include <string>
+
+class Context;
+class ContextWQ;
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class OpenLocalImageRequest {
+public:
+ static OpenLocalImageRequest* create(librados::IoCtx &local_io_ctx,
+ ImageCtxT **local_image_ctx,
+ const std::string &local_image_id,
+ ContextWQ *work_queue,
+ Context *on_finish) {
+ return new OpenLocalImageRequest(local_io_ctx, local_image_ctx,
+ local_image_id, work_queue, on_finish);
+ }
+
+ OpenLocalImageRequest(librados::IoCtx &local_io_ctx,
+ ImageCtxT **local_image_ctx,
+ const std::string &local_image_id,
+ ContextWQ *m_work_queue,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * OPEN_IMAGE * * * * * * * *
+ * | *
+ * v *
+ * IS_PRIMARY * * * * * * * *
+ * | *
+ * v (skip if primary) v
+ * LOCK_IMAGE * * * > CLOSE_IMAGE
+ * | |
+ * v |
+ * <finish> <---------------/
+ *
+ * @endverbatim
+ */
+ librados::IoCtx &m_local_io_ctx;
+ ImageCtxT **m_local_image_ctx;
+ std::string m_local_image_id;
+ ContextWQ *m_work_queue;
+ Context *m_on_finish;
+
+ bool m_primary = false;
+ int m_ret_val = 0;
+
+ void send_open_image();
+ void handle_open_image(int r);
+
+ void send_is_primary();
+ void handle_is_primary(int r);
+
+ void send_lock_image();
+ void handle_lock_image(int r);
+
+ void send_close_image(int r);
+ void handle_close_image(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::OpenLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_OPEN_LOCAL_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
new file mode 100644
index 00000000..8e0ea837
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "PrepareLocalImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void PrepareLocalImageRequest<I>::send() {
+ dout(20) << dendl;
+ get_local_image_id();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_id() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_local_image_id>(this);
+ auto req = GetMirrorImageIdRequest<I>::create(m_io_ctx, m_global_image_id,
+ m_local_image_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_id(int r) {
+ dout(20) << "r=" << r << ", "
+ << "local_image_id=" << *m_local_image_id << dendl;
+
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ get_local_image_name();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_name() {
+ dout(20) << dendl;
+
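+  // look up the image name for the resolved image id in the pool's
+  // rbd_directory object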
+ librados::ObjectReadOperation op;
+ librbd::cls_client::dir_get_name_start(&op, *m_local_image_id);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_local_image_name>(this);
+ int r = m_io_ctx.aio_operate(RBD_DIRECTORY, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_name(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::dir_get_name_finish(&it, m_local_image_name);
+ }
+
+ if (r < 0) {
+ if (r != -ENOENT) {
+ derr << "failed to retrieve image name: " << cpp_strerror(r) << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ get_mirror_state();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_mirror_state() {
+ dout(20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_start(&op, *m_local_image_id);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_mirror_state>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_mirror_state(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image);
+ }
+
+ if (r < 0) {
+ derr << "failed to retrieve image mirror state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ // TODO save current mirror state to determine if we should
+ // delete a partially formed image
+ // (e.g. MIRROR_IMAGE_STATE_CREATING/DELETING)
+
+ get_tag_owner();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_tag_owner() {
+ // deduce the class type for the journal to support unit tests
+ using Journal = typename std::decay<
+ typename std::remove_pointer<decltype(std::declval<I>().journal)>
+ ::type>::type;
+
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_tag_owner>(this);
+ Journal::get_tag_owner(m_io_ctx, *m_local_image_id, m_tag_owner,
+ m_work_queue, ctx);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_tag_owner(int r) {
+ dout(20) << "r=" << r << ", "
+ << "tag_owner=" << *m_tag_owner << dendl;
+
+ if (r < 0) {
+ derr << "failed to retrieve journal tag owner: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::finish(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
new file mode 100644
index 00000000..3417dd96
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados_fwd.hpp"
+#include <string>
+
+namespace librbd { struct ImageCtx; }
+
+struct Context;
+struct ContextWQ;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PrepareLocalImageRequest {
+public:
+ static PrepareLocalImageRequest *create(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *local_image_id,
+ std::string *local_image_name,
+ std::string *tag_owner,
+ ContextWQ *work_queue,
+ Context *on_finish) {
+ return new PrepareLocalImageRequest(io_ctx, global_image_id, local_image_id,
+ local_image_name, tag_owner, work_queue,
+ on_finish);
+ }
+
+ PrepareLocalImageRequest(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *local_image_id,
+ std::string *local_image_name,
+ std::string *tag_owner,
+ ContextWQ *work_queue,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_global_image_id(global_image_id),
+ m_local_image_id(local_image_id), m_local_image_name(local_image_name),
+ m_tag_owner(tag_owner), m_work_queue(work_queue), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_LOCAL_IMAGE_ID
+ * |
+ * v
+ * GET_LOCAL_IMAGE_NAME
+ * |
+ * v
+ * GET_MIRROR_STATE
+ * |
+ * v
+ * GET_TAG_OWNER
+ *    |
+ *    v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_global_image_id;
+ std::string *m_local_image_id;
+ std::string *m_local_image_name;
+ std::string *m_tag_owner;
+ ContextWQ *m_work_queue;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ void get_local_image_id();
+ void handle_get_local_image_id(int r);
+
+ void get_local_image_name();
+ void handle_get_local_image_name(int r);
+
+ void get_mirror_state();
+ void handle_get_mirror_state(int r);
+
+ void get_tag_owner();
+ void handle_get_tag_owner(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc
new file mode 100644
index 00000000..00c141e0
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc
@@ -0,0 +1,195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Utils.h"
+#include "librbd/journal/Types.h"
+#include "tools/rbd_mirror/Threads.h"
+#include "tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h"
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "PrepareRemoteImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_async_context_callback;
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::send() {
+ get_remote_mirror_uuid();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::get_remote_mirror_uuid() {
+ dout(20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_uuid_get_start(&op);
+
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_get_remote_mirror_uuid>(this);
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_get_remote_mirror_uuid(int r) {
+ if (r >= 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_uuid_get_finish(&it, m_remote_mirror_uuid);
+ if (r >= 0 && m_remote_mirror_uuid->empty()) {
+ r = -ENOENT;
+ }
+ }
+
+ dout(20) << "r=" << r << dendl;
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(5) << "remote mirror uuid missing" << dendl;
+ } else {
+ derr << "failed to retrieve remote mirror uuid: " << cpp_strerror(r)
+ << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ get_remote_image_id();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::get_remote_image_id() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_get_remote_image_id>(this);
+ auto req = GetMirrorImageIdRequest<I>::create(m_remote_io_ctx,
+ m_global_image_id,
+ m_remote_image_id, ctx);
+ req->send();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_get_remote_image_id(int r) {
+ dout(20) << "r=" << r << ", "
+ << "remote_image_id=" << *m_remote_image_id << dendl;
+
+ if (r < 0) {
+ finish(r);
+ return;
+ }
+
+ get_client();
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::get_client() {
+ dout(20) << dendl;
+
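+  // construct the remote journaler and look up this peer's client
+  // registration within the remote image's journal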
+ ceph_assert(*m_remote_journaler == nullptr);
+ *m_remote_journaler = new Journaler(m_threads->work_queue, m_threads->timer,
+ &m_threads->timer_lock, m_remote_io_ctx,
+ *m_remote_image_id, m_local_mirror_uuid,
+ m_journal_settings);
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_get_client>(this));
+ (*m_remote_journaler)->get_client(m_local_mirror_uuid, &m_client, ctx);
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_get_client(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(10) << "client not registered" << dendl;
+ register_client();
+ } else if (r < 0) {
+ derr << "failed to retrieve client: " << cpp_strerror(r) << dendl;
+ finish(r);
+ } else if (!util::decode_client_meta(m_client, m_client_meta)) {
+ // require operator intervention since the data is corrupt
+ finish(-EBADMSG);
+ } else {
+ // skip registration if it already exists
+ *m_client_state = m_client.state;
+ finish(0);
+ }
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::register_client() {
+ dout(20) << dendl;
+
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
+ m_local_image_id};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ librbd::journal::ClientData client_data{mirror_peer_client_meta};
+ bufferlist client_data_bl;
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_async_context_callback(
+ m_threads->work_queue, create_context_callback<
+ PrepareRemoteImageRequest<I>,
+ &PrepareRemoteImageRequest<I>::handle_register_client>(this));
+ (*m_remote_journaler)->register_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::handle_register_client(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ derr << "failed to register with remote journal: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ *m_client_state = cls::journal::CLIENT_STATE_CONNECTED;
+ *m_client_meta = librbd::journal::MirrorPeerClientMeta(m_local_image_id);
+ m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+
+ finish(0);
+}
+
+template <typename I>
+void PrepareRemoteImageRequest<I>::finish(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ delete *m_remote_journaler;
+ *m_remote_journaler = nullptr;
+ }
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::PrepareRemoteImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h
new file mode 100644
index 00000000..100a066b
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H
+
+#include "include/buffer_fwd.h"
+#include "include/rados/librados_fwd.hpp"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/Settings.h"
+#include "librbd/journal/TypeTraits.h"
+#include <string>
+
+namespace journal { class Journaler; }
+namespace journal { class Settings; }
+namespace librbd { struct ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+struct Context;
+struct ContextWQ;
+
+namespace rbd {
+namespace mirror {
+
+template <typename> struct Threads;
+
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PrepareRemoteImageRequest {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
+
+ static PrepareRemoteImageRequest *create(Threads<ImageCtxT> *threads,
+ librados::IoCtx &remote_io_ctx,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const std::string &local_image_id,
+ const journal::Settings &settings,
+ std::string *remote_mirror_uuid,
+ std::string *remote_image_id,
+ Journaler **remote_journaler,
+ cls::journal::ClientState *client_state,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish) {
+ return new PrepareRemoteImageRequest(threads, remote_io_ctx,
+ global_image_id, local_mirror_uuid,
+ local_image_id, settings,
+ remote_mirror_uuid, remote_image_id,
+ remote_journaler, client_state,
+ client_meta, on_finish);
+ }
+
+ PrepareRemoteImageRequest(Threads<ImageCtxT> *threads,
+ librados::IoCtx &remote_io_ctx,
+ const std::string &global_image_id,
+ const std::string &local_mirror_uuid,
+ const std::string &local_image_id,
+ const journal::Settings &journal_settings,
+ std::string *remote_mirror_uuid,
+ std::string *remote_image_id,
+ Journaler **remote_journaler,
+ cls::journal::ClientState *client_state,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish)
+ : m_threads(threads), m_remote_io_ctx(remote_io_ctx),
+ m_global_image_id(global_image_id),
+ m_local_mirror_uuid(local_mirror_uuid), m_local_image_id(local_image_id),
+ m_journal_settings(journal_settings),
+ m_remote_mirror_uuid(remote_mirror_uuid),
+ m_remote_image_id(remote_image_id),
+ m_remote_journaler(remote_journaler), m_client_state(client_state),
+ m_client_meta(client_meta), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_REMOTE_MIRROR_UUID
+ * |
+ * v
+ * GET_REMOTE_IMAGE_ID
+ * |
+ * v
+ * GET_CLIENT
+ * |
+ * v (skip if not needed)
+ * REGISTER_CLIENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ Threads<ImageCtxT> *m_threads;
+ librados::IoCtx &m_remote_io_ctx;
+ std::string m_global_image_id;
+ std::string m_local_mirror_uuid;
+ std::string m_local_image_id;
+ journal::Settings m_journal_settings;
+ std::string *m_remote_mirror_uuid;
+ std::string *m_remote_image_id;
+ Journaler **m_remote_journaler;
+ cls::journal::ClientState *m_client_state;
+ MirrorPeerClientMeta *m_client_meta;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ cls::journal::Client m_client;
+
+ void get_remote_mirror_uuid();
+ void handle_get_remote_mirror_uuid(int r);
+
+ void get_remote_image_id();
+ void handle_get_remote_image_id(int r);
+
+ void get_client();
+ void handle_get_client(int r);
+
+ void register_client();
+ void handle_register_client(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::PrepareRemoteImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_REMOTE_IMAGE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc
new file mode 100644
index 00000000..f514d749
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ReplayStatusFormatter.h"
+#include "common/debug.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::ReplayStatusFormatter: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::unique_lock_name;
+
+template <typename I>
+ReplayStatusFormatter<I>::ReplayStatusFormatter(Journaler *journaler,
+ const std::string &mirror_uuid)
+ : m_journaler(journaler),
+ m_mirror_uuid(mirror_uuid),
+ m_lock(unique_lock_name("ReplayStatusFormatter::m_lock", this)) {
+}
+
+template <typename I>
+bool ReplayStatusFormatter<I>::get_or_send_update(std::string *description,
+ Context *on_finish) {
+ dout(20) << dendl;
+
+ bool in_progress = false;
+ {
+ Mutex::Locker locker(m_lock);
+ if (m_on_finish) {
+ in_progress = true;
+ } else {
+ m_on_finish = on_finish;
+ }
+ }
+
+ if (in_progress) {
+ dout(10) << "previous request is still in progress, ignoring" << dendl;
+ on_finish->complete(-EAGAIN);
+ return false;
+ }
+
+ m_master_position = cls::journal::ObjectPosition();
+ m_mirror_position = cls::journal::ObjectPosition();
+
+ cls::journal::Client master_client, mirror_client;
+ int r;
+
+ r = m_journaler->get_cached_client(librbd::Journal<>::IMAGE_CLIENT_ID,
+ &master_client);
+ if (r < 0) {
+ derr << "error retrieving registered master client: "
+ << cpp_strerror(r) << dendl;
+ } else {
+ r = m_journaler->get_cached_client(m_mirror_uuid, &mirror_client);
+ if (r < 0) {
+ derr << "error retrieving registered mirror client: "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+
+ if (!master_client.commit_position.object_positions.empty()) {
+ m_master_position =
+ *(master_client.commit_position.object_positions.begin());
+ }
+
+ if (!mirror_client.commit_position.object_positions.empty()) {
+ m_mirror_position =
+ *(mirror_client.commit_position.object_positions.begin());
+ }
+
+ if (!calculate_behind_master_or_send_update()) {
+ dout(20) << "need to update tag cache" << dendl;
+ return false;
+ }
+
+ format(description);
+
+ {
+ Mutex::Locker locker(m_lock);
+ ceph_assert(m_on_finish == on_finish);
+ m_on_finish = nullptr;
+ }
+
+ on_finish->complete(-EEXIST);
+ return true;
+}
+
+template <typename I>
+bool ReplayStatusFormatter<I>::calculate_behind_master_or_send_update() {
+ dout(20) << "m_master_position=" << m_master_position
+ << ", m_mirror_position=" << m_mirror_position << dendl;
+
+ m_entries_behind_master = 0;
+
+ if (m_master_position == cls::journal::ObjectPosition() ||
+ m_master_position.tag_tid < m_mirror_position.tag_tid) {
+ return true;
+ }
+
+ cls::journal::ObjectPosition master = m_master_position;
+ uint64_t mirror_tag_tid = m_mirror_position.tag_tid;
+
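+  // walk the master's tag history backwards via each tag's predecessor,
+  // accumulating entry counts, until reaching the mirror's current tag;
+  // a missing tag triggers an asynchronous tag cache update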
+ while (master.tag_tid > mirror_tag_tid) {
+ auto tag_it = m_tag_cache.find(master.tag_tid);
+ if (tag_it == m_tag_cache.end()) {
+ send_update_tag_cache(master.tag_tid, mirror_tag_tid);
+ return false;
+ }
+ librbd::journal::TagData &tag_data = tag_it->second;
+ m_entries_behind_master += master.entry_tid;
+ master = {0, tag_data.predecessor.tag_tid, tag_data.predecessor.entry_tid};
+ }
+ if (master.tag_tid == mirror_tag_tid &&
+ master.entry_tid > m_mirror_position.entry_tid) {
+ m_entries_behind_master += master.entry_tid - m_mirror_position.entry_tid;
+ }
+
+ dout(20) << "clearing tags not needed any more (below mirror position)"
+ << dendl;
+
+ uint64_t tag_tid = mirror_tag_tid;
+ size_t old_size = m_tag_cache.size();
+ while (tag_tid != 0) {
+ auto tag_it = m_tag_cache.find(tag_tid);
+ if (tag_it == m_tag_cache.end()) {
+ break;
+ }
+ librbd::journal::TagData &tag_data = tag_it->second;
+
+ dout(20) << "erasing tag " << tag_data << "for tag_tid " << tag_tid
+ << dendl;
+
+ tag_tid = tag_data.predecessor.tag_tid;
+ m_tag_cache.erase(tag_it);
+ }
+
+ dout(20) << old_size - m_tag_cache.size() << " entries cleared" << dendl;
+
+ return true;
+}
+
+template <typename I>
+void ReplayStatusFormatter<I>::send_update_tag_cache(uint64_t master_tag_tid,
+ uint64_t mirror_tag_tid) {
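+  // stop fetching once the walk reaches the mirror's tag or an already
+  // cached tag, and wake up the waiting status request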
+ if (master_tag_tid <= mirror_tag_tid ||
+ m_tag_cache.find(master_tag_tid) != m_tag_cache.end()) {
+ Context *on_finish = nullptr;
+ {
+ Mutex::Locker locker(m_lock);
+ std::swap(m_on_finish, on_finish);
+ }
+
+ ceph_assert(on_finish);
+ on_finish->complete(0);
+ return;
+ }
+
+ dout(20) << "master_tag_tid=" << master_tag_tid << ", mirror_tag_tid="
+ << mirror_tag_tid << dendl;
+
+ FunctionContext *ctx = new FunctionContext(
+ [this, master_tag_tid, mirror_tag_tid](int r) {
+ handle_update_tag_cache(master_tag_tid, mirror_tag_tid, r);
+ });
+ m_journaler->get_tag(master_tag_tid, &m_tag, ctx);
+}
+
+template <typename I>
+void ReplayStatusFormatter<I>::handle_update_tag_cache(uint64_t master_tag_tid,
+ uint64_t mirror_tag_tid,
+ int r) {
+ librbd::journal::TagData tag_data;
+
+ if (r < 0) {
+ derr << "error retrieving tag " << master_tag_tid << ": " << cpp_strerror(r)
+ << dendl;
+ } else {
+ dout(20) << "retrieved tag " << master_tag_tid << ": " << m_tag << dendl;
+
+ auto it = m_tag.data.cbegin();
+ try {
+ decode(tag_data, it);
+ } catch (const buffer::error &err) {
+ derr << "error decoding tag " << master_tag_tid << ": " << err.what()
+ << dendl;
+ }
+ }
+
+ if (tag_data.predecessor.mirror_uuid !=
+ librbd::Journal<>::LOCAL_MIRROR_UUID &&
+ tag_data.predecessor.mirror_uuid !=
+ librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+ dout(20) << "hit remote image non-primary epoch" << dendl;
+ tag_data.predecessor = {};
+ }
+
+ dout(20) << "decoded tag " << master_tag_tid << ": " << tag_data << dendl;
+
+ m_tag_cache[master_tag_tid] = tag_data;
+ send_update_tag_cache(tag_data.predecessor.tag_tid, mirror_tag_tid);
+}
+
+template <typename I>
+void ReplayStatusFormatter<I>::format(std::string *description) {
+
+ dout(20) << "m_master_position=" << m_master_position
+ << ", m_mirror_position=" << m_mirror_position
+ << ", m_entries_behind_master=" << m_entries_behind_master << dendl;
+
+ std::stringstream ss;
+ ss << "master_position=";
+ if (m_master_position == cls::journal::ObjectPosition()) {
+ ss << "[]";
+ } else {
+ ss << m_master_position;
+ }
+ ss << ", mirror_position=";
+ if (m_mirror_position == cls::journal::ObjectPosition()) {
+ ss << "[]";
+ } else {
+ ss << m_mirror_position;
+ }
+ ss << ", entries_behind_master="
+ << (m_entries_behind_master > 0 ? m_entries_behind_master : 0);
+
+ *description = ss.str();
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class
+rbd::mirror::image_replayer::ReplayStatusFormatter<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h
new file mode 100644
index 00000000..59940a65
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H
+#define RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H
+
+#include "include/Context.h"
+#include "common/Mutex.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ReplayStatusFormatter {
+public:
+ typedef typename librbd::journal::TypeTraits<ImageCtxT>::Journaler Journaler;
+
+ static ReplayStatusFormatter* create(Journaler *journaler,
+ const std::string &mirror_uuid) {
+ return new ReplayStatusFormatter(journaler, mirror_uuid);
+ }
+
+ static void destroy(ReplayStatusFormatter* formatter) {
+ delete formatter;
+ }
+
+ ReplayStatusFormatter(Journaler *journaler, const std::string &mirror_uuid);
+
+ bool get_or_send_update(std::string *description, Context *on_finish);
+
+private:
+ Journaler *m_journaler;
+ std::string m_mirror_uuid;
+ Mutex m_lock;
+ Context *m_on_finish = nullptr;
+ cls::journal::ObjectPosition m_master_position;
+ cls::journal::ObjectPosition m_mirror_position;
+ int m_entries_behind_master = 0;
+ cls::journal::Tag m_tag;
+ std::map<uint64_t, librbd::journal::TagData> m_tag_cache;
+
+ bool calculate_behind_master_or_send_update();
+ void send_update_tag_cache(uint64_t master_tag_tid, uint64_t mirror_tag_tid);
+ void handle_update_tag_cache(uint64_t master_tag_tid, uint64_t mirror_tag_tid,
+ int r);
+ void format(std::string *description);
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_REPLAY_STATUS_FORMATTER_H
diff --git a/src/tools/rbd_mirror/image_replayer/Types.h b/src/tools/rbd_mirror/image_replayer/Types.h
new file mode 100644
index 00000000..6ab988a7
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Types.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
+#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+enum HealthState {
+ HEALTH_STATE_OK,
+ HEALTH_STATE_WARNING,
+ HEALTH_STATE_ERROR
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_IMAGE_REPLAYER_TYPES_H
diff --git a/src/tools/rbd_mirror/image_replayer/Utils.cc b/src/tools/rbd_mirror/image_replayer/Utils.cc
new file mode 100644
index 00000000..eda0179f
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Utils.cc
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/Utils.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/journal/cls_journal_types.h"
+#include "librbd/journal/Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::util::" \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace util {
+
+bool decode_client_meta(const cls::journal::Client& client,
+ librbd::journal::MirrorPeerClientMeta* client_meta) {
+ dout(15) << dendl;
+
+ librbd::journal::ClientData client_data;
+ auto it = client.data.cbegin();
+ try {
+ decode(client_data, it);
+ } catch (const buffer::error &err) {
+ derr << "failed to decode client meta data: " << err.what() << dendl;
+ return false;
+ }
+
+ auto local_client_meta = boost::get<librbd::journal::MirrorPeerClientMeta>(
+ &client_data.client_meta);
+ if (local_client_meta == nullptr) {
+ derr << "unknown peer registration" << dendl;
+ return false;
+ }
+
+ *client_meta = *local_client_meta;
+ dout(15) << "client found: client_meta=" << *client_meta << dendl;
+ return true;
+}
+
+} // namespace util
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
diff --git a/src/tools/rbd_mirror/image_replayer/Utils.h b/src/tools/rbd_mirror/image_replayer/Utils.h
new file mode 100644
index 00000000..d42146d1
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/Utils.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_UTILS_H
+#define RBD_MIRROR_IMAGE_REPLAYER_UTILS_H
+
+namespace cls { namespace journal { struct Client; } }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+namespace util {
+
+bool decode_client_meta(const cls::journal::Client& client,
+ librbd::journal::MirrorPeerClientMeta* client_meta);
+
+} // namespace util
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_UTILS_H
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
new file mode 100644
index 00000000..ffe2eca9
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPointCreateRequest.h"
+#include "include/uuid.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_sync::SyncPointCreateRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+namespace {
+
+static const std::string SNAP_NAME_PREFIX(".rbd-mirror");
+
+} // anonymous namespace
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+SyncPointCreateRequest<I>::SyncPointCreateRequest(I *remote_image_ctx,
+ const std::string &mirror_uuid,
+ Journaler *journaler,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish)
+ : m_remote_image_ctx(remote_image_ctx), m_mirror_uuid(mirror_uuid),
+ m_journaler(journaler), m_client_meta(client_meta), m_on_finish(on_finish),
+ m_client_meta_copy(*client_meta) {
+ ceph_assert(m_client_meta->sync_points.size() < 2);
+
+ // initialize the updated client meta with the new sync point
+ m_client_meta_copy.sync_points.emplace_back();
+ if (m_client_meta_copy.sync_points.size() > 1) {
+ m_client_meta_copy.sync_points.back().from_snap_name =
+ m_client_meta_copy.sync_points.front().snap_name;
+ }
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send() {
+ send_update_client();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_update_client() {
+ uuid_d uuid_gen;
+ uuid_gen.generate_random();
+
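+  // build a unique snapshot name from the prefix, peer mirror uuid, and a random uuid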
+ MirrorPeerSyncPoint &sync_point = m_client_meta_copy.sync_points.back();
+ sync_point.snap_name = SNAP_NAME_PREFIX + "." + m_mirror_uuid + "." +
+ uuid_gen.to_string();
+
+ dout(20) << ": sync_point=" << sync_point << dendl;
+
+ bufferlist client_data_bl;
+ librbd::journal::ClientData client_data(m_client_meta_copy);
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_update_client>(
+ this);
+ m_journaler->update_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_update_client(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ // update provided meta structure to reflect reality
+ *m_client_meta = m_client_meta_copy;
+
+ send_refresh_image();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_refresh_image>(
+ this);
+ m_remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": remote image refresh failed: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_create_snap();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_create_snap() {
+ dout(20) << dendl;
+
+ MirrorPeerSyncPoint &sync_point = m_client_meta_copy.sync_points.back();
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>, &SyncPointCreateRequest<I>::handle_create_snap>(
+ this);
+ m_remote_image_ctx->operations->snap_create(
+ cls::rbd::UserSnapshotNamespace(), sync_point.snap_name.c_str(), ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_create_snap(int r) {
+ dout(20) << ": r=" << r << dendl;
+
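+  // the generated snapshot name already exists -- regenerate a new name and retry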
+ if (r == -EEXIST) {
+ send_update_client();
+ return;
+ } else if (r < 0) {
+ derr << ": failed to create snapshot: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_final_refresh_image();
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::send_final_refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointCreateRequest<I>,
+ &SyncPointCreateRequest<I>::handle_final_refresh_image>(this);
+ m_remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::handle_final_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to refresh image for snapshot: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void SyncPointCreateRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_sync::SyncPointCreateRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
new file mode 100644
index 00000000..45275ec4
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
+#define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
+
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <string>
+
+class Context;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SyncPointCreateRequest {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
+ typedef librbd::journal::MirrorPeerSyncPoint MirrorPeerSyncPoint;
+
+ static SyncPointCreateRequest* create(ImageCtxT *remote_image_ctx,
+ const std::string &mirror_uuid,
+ Journaler *journaler,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish) {
+ return new SyncPointCreateRequest(remote_image_ctx, mirror_uuid, journaler,
+ client_meta, on_finish);
+ }
+
+ SyncPointCreateRequest(ImageCtxT *remote_image_ctx,
+ const std::string &mirror_uuid, Journaler *journaler,
+ MirrorPeerClientMeta *client_meta, Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * UPDATE_CLIENT < . .
+ * | .
+ * v .
+ * REFRESH_IMAGE .
+ * | . (repeat on EEXIST)
+ * v .
+ * CREATE_SNAP . . . .
+ * |
+ * v
+ * REFRESH_IMAGE
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_remote_image_ctx;
+ std::string m_mirror_uuid;
+ Journaler *m_journaler;
+ MirrorPeerClientMeta *m_client_meta;
+ Context *m_on_finish;
+
+ MirrorPeerClientMeta m_client_meta_copy;
+
+ void send_update_client();
+ void handle_update_client(int r);
+
+ void send_refresh_image();
+ void handle_refresh_image(int r);
+
+ void send_create_snap();
+ void handle_create_snap(int r);
+
+ void send_final_refresh_image();
+ void handle_final_refresh_image(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_sync::SyncPointCreateRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_CREATE_REQUEST_H
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
new file mode 100644
index 00000000..2cfed5e6
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "SyncPointPruneRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "journal/Journaler.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageState.h"
+#include "librbd/Operations.h"
+#include "librbd/Utils.h"
+#include <set>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_sync::SyncPointPruneRequest: " \
+ << this << " " << __func__
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+using librbd::util::create_context_callback;
+
+template <typename I>
+SyncPointPruneRequest<I>::SyncPointPruneRequest(I *remote_image_ctx,
+ bool sync_complete,
+ Journaler *journaler,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish)
+ : m_remote_image_ctx(remote_image_ctx), m_sync_complete(sync_complete),
+ m_journaler(journaler), m_client_meta(client_meta), m_on_finish(on_finish),
+ m_client_meta_copy(*client_meta) {
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send() {
+ if (m_client_meta->sync_points.empty()) {
+ send_remove_snap();
+ return;
+ }
+
+ if (m_sync_complete) {
+ // if sync is complete, we can remove the master sync point
+ auto it = m_client_meta_copy.sync_points.begin();
+ MirrorPeerSyncPoint &sync_point = *it;
+
+ ++it;
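+    // only remove the snapshot if the next sync point doesn't use it as its base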
+ if (it == m_client_meta_copy.sync_points.end() ||
+ it->from_snap_name != sync_point.snap_name) {
+ m_snap_names.push_back(sync_point.snap_name);
+ }
+
+ if (!sync_point.from_snap_name.empty()) {
+ m_snap_names.push_back(sync_point.from_snap_name);
+ }
+ } else {
+ // if we have more than one sync point or invalid sync points,
+ // trim them off
+ RWLock::RLocker snap_locker(m_remote_image_ctx->snap_lock);
+ std::set<std::string> snap_names;
+ for (auto it = m_client_meta_copy.sync_points.rbegin();
+ it != m_client_meta_copy.sync_points.rend(); ++it) {
+ MirrorPeerSyncPoint &sync_point = *it;
+ if (&sync_point == &m_client_meta_copy.sync_points.front()) {
+ if (m_remote_image_ctx->get_snap_id(
+ cls::rbd::UserSnapshotNamespace(), sync_point.snap_name) ==
+ CEPH_NOSNAP) {
+ derr << ": failed to locate sync point snapshot: "
+ << sync_point.snap_name << dendl;
+ } else if (!sync_point.from_snap_name.empty()) {
+ derr << ": unexpected from_snap_name in primary sync point: "
+ << sync_point.from_snap_name << dendl;
+ } else {
+ // first sync point is OK -- keep it
+ break;
+ }
+ m_invalid_master_sync_point = true;
+ }
+
+ if (snap_names.count(sync_point.snap_name) == 0) {
+ snap_names.insert(sync_point.snap_name);
+ m_snap_names.push_back(sync_point.snap_name);
+ }
+
+ MirrorPeerSyncPoint &front_sync_point =
+ m_client_meta_copy.sync_points.front();
+ if (!sync_point.from_snap_name.empty() &&
+ snap_names.count(sync_point.from_snap_name) == 0 &&
+ sync_point.from_snap_name != front_sync_point.snap_name) {
+ snap_names.insert(sync_point.from_snap_name);
+ m_snap_names.push_back(sync_point.from_snap_name);
+ }
+ }
+ }
+
+ send_remove_snap();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send_remove_snap() {
+ if (m_snap_names.empty()) {
+ send_refresh_image();
+ return;
+ }
+
+ const std::string &snap_name = m_snap_names.front();
+
+ dout(20) << ": snap_name=" << snap_name << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_remove_snap>(
+ this);
+ m_remote_image_ctx->operations->snap_remove(cls::rbd::UserSnapshotNamespace(),
+ snap_name.c_str(),
+ ctx);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::handle_remove_snap(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ ceph_assert(!m_snap_names.empty());
+ std::string snap_name = m_snap_names.front();
+ m_snap_names.pop_front();
+
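+  // a missing snapshot is not an error -- it was already removed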
+ if (r == -ENOENT) {
+ r = 0;
+ }
+ if (r < 0) {
+ derr << ": failed to remove snapshot '" << snap_name << "': "
+ << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_remove_snap();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send_refresh_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_refresh_image>(
+ this);
+ m_remote_image_ctx->state->refresh(ctx);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::handle_refresh_image(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": remote image refresh failed: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ send_update_client();
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::send_update_client() {
+ dout(20) << dendl;
+
+ if (m_sync_complete) {
+ m_client_meta_copy.sync_points.pop_front();
+ if (m_client_meta_copy.sync_points.empty()) {
+ m_client_meta_copy.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+ }
+ } else {
+ while (m_client_meta_copy.sync_points.size() > 1) {
+ m_client_meta_copy.sync_points.pop_back();
+ }
+ if (m_invalid_master_sync_point) {
+ // all subsequent sync points would have been pruned
+ m_client_meta_copy.sync_points.clear();
+ }
+ }
+
+ bufferlist client_data_bl;
+ librbd::journal::ClientData client_data(m_client_meta_copy);
+ encode(client_data, client_data_bl);
+
+ Context *ctx = create_context_callback<
+ SyncPointPruneRequest<I>, &SyncPointPruneRequest<I>::handle_update_client>(
+ this);
+ m_journaler->update_client(client_data_bl, ctx);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::handle_update_client(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to update client data: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ // update provided meta structure to reflect reality
+ *m_client_meta = m_client_meta_copy;
+ finish(0);
+}
+
+template <typename I>
+void SyncPointPruneRequest<I>::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_sync::SyncPointPruneRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
new file mode 100644
index 00000000..65e13ef5
--- /dev/null
+++ b/src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
+#define RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
+
+#include "librbd/journal/Types.h"
+#include "librbd/journal/TypeTraits.h"
+#include <list>
+#include <string>
+
+class Context;
+namespace journal { class Journaler; }
+namespace librbd { class ImageCtx; }
+namespace librbd { namespace journal { struct MirrorPeerClientMeta; } }
+
+namespace rbd {
+namespace mirror {
+namespace image_sync {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class SyncPointPruneRequest {
+public:
+ typedef librbd::journal::TypeTraits<ImageCtxT> TypeTraits;
+ typedef typename TypeTraits::Journaler Journaler;
+ typedef librbd::journal::MirrorPeerClientMeta MirrorPeerClientMeta;
+ typedef librbd::journal::MirrorPeerSyncPoint MirrorPeerSyncPoint;
+
+ static SyncPointPruneRequest* create(ImageCtxT *remote_image_ctx,
+ bool sync_complete,
+ Journaler *journaler,
+ MirrorPeerClientMeta *client_meta,
+ Context *on_finish) {
+ return new SyncPointPruneRequest(remote_image_ctx, sync_complete, journaler,
+ client_meta, on_finish);
+ }
+
+ SyncPointPruneRequest(ImageCtxT *remote_image_ctx, bool sync_complete,
+ Journaler *journaler, MirrorPeerClientMeta *client_meta,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | . . . . .
+ * | . .
+ * v v . (repeat if from snap
+ * REMOVE_SNAP . . . unused by other sync)
+ * |
+ * v
+ * REFRESH_IMAGE
+ * |
+ * v
+ * UPDATE_CLIENT
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ ImageCtxT *m_remote_image_ctx;
+ bool m_sync_complete;
+ Journaler *m_journaler;
+ MirrorPeerClientMeta *m_client_meta;
+ Context *m_on_finish;
+
+ MirrorPeerClientMeta m_client_meta_copy;
+ std::list<std::string> m_snap_names;
+
+ bool m_invalid_master_sync_point = false;
+
+ void send_remove_snap();
+ void handle_remove_snap(int r);
+
+ void send_refresh_image();
+ void handle_refresh_image(int r);
+
+ void send_update_client();
+ void handle_update_client(int r);
+
+ void finish(int r);
+};
+
+} // namespace image_sync
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_sync::SyncPointPruneRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_SYNC_SYNC_POINT_PRUNE_REQUEST_H
diff --git a/src/tools/rbd_mirror/instance_watcher/Types.cc b/src/tools/rbd_mirror/instance_watcher/Types.cc
new file mode 100644
index 00000000..0e992273
--- /dev/null
+++ b/src/tools/rbd_mirror/instance_watcher/Types.cc
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+namespace rbd {
+namespace mirror {
+namespace instance_watcher {
+
+namespace {
+
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl);
+ payload.encode(m_bl);
+ }
+
+private:
+ bufferlist &m_bl;
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {}
+
+ template <typename Payload>
+ inline void operator()(Payload &payload) const {
+ payload.decode(m_version, m_iter);
+ }
+
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void PayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(request_id, bl);
+}
+
+void PayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(request_id, iter);
+}
+
+void PayloadBase::dump(Formatter *f) const {
+ f->dump_unsigned("request_id", request_id);
+}
+
+void ImagePayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ PayloadBase::encode(bl);
+ encode(global_image_id, bl);
+}
+
+void ImagePayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ PayloadBase::decode(version, iter);
+ decode(global_image_id, iter);
+}
+
+void ImagePayloadBase::dump(Formatter *f) const {
+ PayloadBase::dump(f);
+ f->dump_string("global_image_id", global_image_id);
+}
+
+void PeerImageRemovedPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ PayloadBase::encode(bl);
+ encode(global_image_id, bl);
+ encode(peer_mirror_uuid, bl);
+}
+
+void PeerImageRemovedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ PayloadBase::decode(version, iter);
+ decode(global_image_id, iter);
+ decode(peer_mirror_uuid, iter);
+}
+
+void PeerImageRemovedPayload::dump(Formatter *f) const {
+ PayloadBase::dump(f);
+ f->dump_string("global_image_id", global_image_id);
+ f->dump_string("peer_mirror_uuid", peer_mirror_uuid);
+}
+
+void SyncPayloadBase::encode(bufferlist &bl) const {
+ using ceph::encode;
+ PayloadBase::encode(bl);
+ encode(sync_id, bl);
+}
+
+void SyncPayloadBase::decode(__u8 version, bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ PayloadBase::decode(version, iter);
+ decode(sync_id, iter);
+}
+
+void SyncPayloadBase::dump(Formatter *f) const {
+ PayloadBase::dump(f);
+ f->dump_string("sync_id", sync_id);
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(2, 2, bl);
+ boost::apply_visitor(EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(2, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_IMAGE_ACQUIRE:
+ payload = ImageAcquirePayload();
+ break;
+ case NOTIFY_OP_IMAGE_RELEASE:
+ payload = ImageReleasePayload();
+ break;
+ case NOTIFY_OP_PEER_IMAGE_REMOVED:
+ payload = PeerImageRemovedPayload();
+ break;
+ case NOTIFY_OP_SYNC_REQUEST:
+ payload = SyncRequestPayload();
+ break;
+ case NOTIFY_OP_SYNC_START:
+ payload = SyncStartPayload();
+ break;
+ default:
+ payload = UnknownPayload();
+ break;
+ }
+
+ apply_visitor(DecodePayloadVisitor(struct_v, iter), payload);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(ImageAcquirePayload()));
+ o.push_back(new NotifyMessage(ImageAcquirePayload(1, "gid")));
+
+ o.push_back(new NotifyMessage(ImageReleasePayload()));
+ o.push_back(new NotifyMessage(ImageReleasePayload(1, "gid")));
+
+ o.push_back(new NotifyMessage(PeerImageRemovedPayload()));
+ o.push_back(new NotifyMessage(PeerImageRemovedPayload(1, "gid", "uuid")));
+
+ o.push_back(new NotifyMessage(SyncRequestPayload()));
+ o.push_back(new NotifyMessage(SyncRequestPayload(1, "sync_id")));
+
+ o.push_back(new NotifyMessage(SyncStartPayload()));
+ o.push_back(new NotifyMessage(SyncStartPayload(1, "sync_id")));
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+ switch (op) {
+ case NOTIFY_OP_IMAGE_ACQUIRE:
+ out << "ImageAcquire";
+ break;
+ case NOTIFY_OP_IMAGE_RELEASE:
+ out << "ImageRelease";
+ break;
+ case NOTIFY_OP_PEER_IMAGE_REMOVED:
+ out << "PeerImageRemoved";
+ break;
+ case NOTIFY_OP_SYNC_REQUEST:
+ out << "SyncRequest";
+ break;
+ case NOTIFY_OP_SYNC_START:
+ out << "SyncStart";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+void NotifyAckPayload::encode(bufferlist &bl) const {
+ using ceph::encode;
+ encode(instance_id, bl);
+ encode(request_id, bl);
+ encode(ret_val, bl);
+}
+
+void NotifyAckPayload::decode(bufferlist::const_iterator &iter) {
+ using ceph::decode;
+ decode(instance_id, iter);
+ decode(request_id, iter);
+ decode(ret_val, iter);
+}
+
+void NotifyAckPayload::dump(Formatter *f) const {
+ f->dump_string("instance_id", instance_id);
+ f->dump_unsigned("request_id", request_id);
+  f->dump_int("ret_val", ret_val);
+}
+
+} // namespace instance_watcher
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/instance_watcher/Types.h b/src/tools/rbd_mirror/instance_watcher/Types.h
new file mode 100644
index 00000000..b0b7b779
--- /dev/null
+++ b/src/tools/rbd_mirror/instance_watcher/Types.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_INSTANCE_WATCHER_TYPES_H
+#define RBD_MIRROR_INSTANCE_WATCHER_TYPES_H
+
+#include <string>
+#include <set>
+#include <boost/variant.hpp>
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "include/int_types.h"
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace mirror {
+namespace instance_watcher {
+
+enum NotifyOp {
+ NOTIFY_OP_IMAGE_ACQUIRE = 0,
+ NOTIFY_OP_IMAGE_RELEASE = 1,
+ NOTIFY_OP_PEER_IMAGE_REMOVED = 2,
+ NOTIFY_OP_SYNC_REQUEST = 3,
+ NOTIFY_OP_SYNC_START = 4
+};
+
+struct PayloadBase {
+ uint64_t request_id;
+
+ PayloadBase() : request_id(0) {
+ }
+
+ PayloadBase(uint64_t request_id) : request_id(request_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImagePayloadBase : public PayloadBase {
+ std::string global_image_id;
+
+ ImagePayloadBase() : PayloadBase() {
+ }
+
+ ImagePayloadBase(uint64_t request_id, const std::string &global_image_id)
+ : PayloadBase(request_id), global_image_id(global_image_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct ImageAcquirePayload : public ImagePayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_ACQUIRE;
+
+ ImageAcquirePayload() {
+ }
+ ImageAcquirePayload(uint64_t request_id, const std::string &global_image_id)
+ : ImagePayloadBase(request_id, global_image_id) {
+ }
+};
+
+struct ImageReleasePayload : public ImagePayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_IMAGE_RELEASE;
+
+ ImageReleasePayload() {
+ }
+ ImageReleasePayload(uint64_t request_id, const std::string &global_image_id)
+ : ImagePayloadBase(request_id, global_image_id) {
+ }
+};
+
+struct PeerImageRemovedPayload : public PayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_PEER_IMAGE_REMOVED;
+
+ std::string global_image_id;
+ std::string peer_mirror_uuid;
+
+ PeerImageRemovedPayload() {
+ }
+ PeerImageRemovedPayload(uint64_t request_id,
+ const std::string& global_image_id,
+ const std::string& peer_mirror_uuid)
+ : PayloadBase(request_id),
+ global_image_id(global_image_id), peer_mirror_uuid(peer_mirror_uuid) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SyncPayloadBase : public PayloadBase {
+ std::string sync_id;
+
+ SyncPayloadBase() : PayloadBase() {
+ }
+
+ SyncPayloadBase(uint64_t request_id, const std::string &sync_id)
+ : PayloadBase(request_id), sync_id(sync_id) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct SyncRequestPayload : public SyncPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SYNC_REQUEST;
+
+ SyncRequestPayload() : SyncPayloadBase() {
+ }
+
+ SyncRequestPayload(uint64_t request_id, const std::string &sync_id)
+ : SyncPayloadBase(request_id, sync_id) {
+ }
+};
+
+struct SyncStartPayload : public SyncPayloadBase {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_SYNC_START;
+
+ SyncStartPayload() : SyncPayloadBase() {
+ }
+
+ SyncStartPayload(uint64_t request_id, const std::string &sync_id)
+ : SyncPayloadBase(request_id, sync_id) {
+ }
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<ImageAcquirePayload,
+ ImageReleasePayload,
+ PeerImageRemovedPayload,
+ SyncRequestPayload,
+ SyncStartPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+struct NotifyAckPayload {
+ std::string instance_id;
+ uint64_t request_id;
+ int ret_val;
+
+ NotifyAckPayload() : request_id(0), ret_val(0) {
+ }
+
+ NotifyAckPayload(const std::string &instance_id, uint64_t request_id,
+ int ret_val)
+ : instance_id(instance_id), request_id(request_id), ret_val(ret_val) {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+};
+
+WRITE_CLASS_ENCODER(NotifyAckPayload);
+
+} // namespace instance_watcher
+} // namespace mirror
+} // namespace rbd
+
+using rbd::mirror::instance_watcher::encode;
+using rbd::mirror::instance_watcher::decode;
+
+#endif // RBD_MIRROR_INSTANCE_WATCHER_TYPES_H
diff --git a/src/tools/rbd_mirror/instances/Types.h b/src/tools/rbd_mirror/instances/Types.h
new file mode 100644
index 00000000..8b0a68fc
--- /dev/null
+++ b/src/tools/rbd_mirror/instances/Types.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_INSTANCES_TYPES_H
+#define CEPH_RBD_MIRROR_INSTANCES_TYPES_H
+
+#include <string>
+#include <vector>
+
+namespace rbd {
+namespace mirror {
+namespace instances {
+
+struct Listener {
+ typedef std::vector<std::string> InstanceIds;
+
+ virtual ~Listener() {
+ }
+
+ virtual void handle_added(const InstanceIds& instance_ids) = 0;
+ virtual void handle_removed(const InstanceIds& instance_ids) = 0;
+};
+
+} // namespace instances
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_INSTANCES_TYPES_H
diff --git a/src/tools/rbd_mirror/leader_watcher/Types.cc b/src/tools/rbd_mirror/leader_watcher/Types.cc
new file mode 100644
index 00000000..d2fb7908
--- /dev/null
+++ b/src/tools/rbd_mirror/leader_watcher/Types.cc
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+namespace rbd {
+namespace mirror {
+namespace leader_watcher {
+
+namespace {
+
+class EncodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ using ceph::encode;
+ encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl);
+ payload.encode(m_bl);
+ }
+
+private:
+ bufferlist &m_bl;
+};
+
+class DecodePayloadVisitor : public boost::static_visitor<void> {
+public:
+ DecodePayloadVisitor(__u8 version, bufferlist::const_iterator &iter)
+ : m_version(version), m_iter(iter) {}
+
+ template <typename Payload>
+ inline void operator()(Payload &payload) const {
+ payload.decode(m_version, m_iter);
+ }
+
+private:
+ __u8 m_version;
+ bufferlist::const_iterator &m_iter;
+};
+
+class DumpPayloadVisitor : public boost::static_visitor<void> {
+public:
+ explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+ template <typename Payload>
+ inline void operator()(const Payload &payload) const {
+ NotifyOp notify_op = Payload::NOTIFY_OP;
+ m_formatter->dump_string("notify_op", stringify(notify_op));
+ payload.dump(m_formatter);
+ }
+
+private:
+ ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void HeartbeatPayload::encode(bufferlist &bl) const {
+}
+
+void HeartbeatPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void HeartbeatPayload::dump(Formatter *f) const {
+}
+
+void LockAcquiredPayload::encode(bufferlist &bl) const {
+}
+
+void LockAcquiredPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void LockAcquiredPayload::dump(Formatter *f) const {
+}
+
+void LockReleasedPayload::encode(bufferlist &bl) const {
+}
+
+void LockReleasedPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void LockReleasedPayload::dump(Formatter *f) const {
+}
+
+void UnknownPayload::encode(bufferlist &bl) const {
+ ceph_abort();
+}
+
+void UnknownPayload::decode(__u8 version, bufferlist::const_iterator &iter) {
+}
+
+void UnknownPayload::dump(Formatter *f) const {
+}
+
+void NotifyMessage::encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ boost::apply_visitor(EncodePayloadVisitor(bl), payload);
+ ENCODE_FINISH(bl);
+}
+
+void NotifyMessage::decode(bufferlist::const_iterator& iter) {
+ DECODE_START(1, iter);
+
+ uint32_t notify_op;
+ decode(notify_op, iter);
+
+ // select the correct payload variant based upon the encoded op
+ switch (notify_op) {
+ case NOTIFY_OP_HEARTBEAT:
+ payload = HeartbeatPayload();
+ break;
+ case NOTIFY_OP_LOCK_ACQUIRED:
+ payload = LockAcquiredPayload();
+ break;
+ case NOTIFY_OP_LOCK_RELEASED:
+ payload = LockReleasedPayload();
+ break;
+ default:
+ payload = UnknownPayload();
+ break;
+ }
+
+ apply_visitor(DecodePayloadVisitor(struct_v, iter), payload);
+ DECODE_FINISH(iter);
+}
+
+void NotifyMessage::dump(Formatter *f) const {
+ apply_visitor(DumpPayloadVisitor(f), payload);
+}
+
+void NotifyMessage::generate_test_instances(std::list<NotifyMessage *> &o) {
+ o.push_back(new NotifyMessage(HeartbeatPayload()));
+ o.push_back(new NotifyMessage(LockAcquiredPayload()));
+ o.push_back(new NotifyMessage(LockReleasedPayload()));
+}
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op) {
+ switch (op) {
+ case NOTIFY_OP_HEARTBEAT:
+ out << "Heartbeat";
+ break;
+ case NOTIFY_OP_LOCK_ACQUIRED:
+ out << "LockAcquired";
+ break;
+ case NOTIFY_OP_LOCK_RELEASED:
+ out << "LockReleased";
+ break;
+ default:
+ out << "Unknown (" << static_cast<uint32_t>(op) << ")";
+ break;
+ }
+ return out;
+}
+
+} // namespace leader_watcher
+} // namespace mirror
+} // namespace rbd
diff --git a/src/tools/rbd_mirror/leader_watcher/Types.h b/src/tools/rbd_mirror/leader_watcher/Types.h
new file mode 100644
index 00000000..1278e54b
--- /dev/null
+++ b/src/tools/rbd_mirror/leader_watcher/Types.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_LEADER_WATCHER_TYPES_H
+#define RBD_MIRROR_LEADER_WATCHER_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include <string>
+#include <vector>
+#include <boost/variant.hpp>
+
+struct Context;
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace mirror {
+namespace leader_watcher {
+
+struct Listener {
+ typedef std::vector<std::string> InstanceIds;
+
+ virtual ~Listener() {
+ }
+
+ virtual void post_acquire_handler(Context *on_finish) = 0;
+ virtual void pre_release_handler(Context *on_finish) = 0;
+
+ virtual void update_leader_handler(
+ const std::string &leader_instance_id) = 0;
+
+ virtual void handle_instances_added(const InstanceIds& instance_ids) = 0;
+ virtual void handle_instances_removed(const InstanceIds& instance_ids) = 0;
+};
+
+enum NotifyOp {
+ NOTIFY_OP_HEARTBEAT = 0,
+ NOTIFY_OP_LOCK_ACQUIRED = 1,
+ NOTIFY_OP_LOCK_RELEASED = 2,
+};
+
+struct HeartbeatPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_HEARTBEAT;
+
+ HeartbeatPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct LockAcquiredPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_LOCK_ACQUIRED;
+
+ LockAcquiredPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct LockReleasedPayload {
+ static const NotifyOp NOTIFY_OP = NOTIFY_OP_LOCK_RELEASED;
+
+ LockReleasedPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+struct UnknownPayload {
+ static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
+ UnknownPayload() {
+ }
+
+ void encode(bufferlist &bl) const;
+ void decode(__u8 version, bufferlist::const_iterator &iter);
+ void dump(Formatter *f) const;
+};
+
+typedef boost::variant<HeartbeatPayload,
+ LockAcquiredPayload,
+ LockReleasedPayload,
+ UnknownPayload> Payload;
+
+struct NotifyMessage {
+ NotifyMessage(const Payload &payload = UnknownPayload()) : payload(payload) {
+ }
+
+ Payload payload;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& it);
+ void dump(Formatter *f) const;
+
+ static void generate_test_instances(std::list<NotifyMessage *> &o);
+};
+
+WRITE_CLASS_ENCODER(NotifyMessage);
+
+std::ostream &operator<<(std::ostream &out, const NotifyOp &op);
+
+} // namespace leader_watcher
+} // namespace mirror
+} // namespace rbd
+
+using rbd::mirror::leader_watcher::encode;
+using rbd::mirror::leader_watcher::decode;
+
+#endif // RBD_MIRROR_LEADER_WATCHER_TYPES_H
diff --git a/src/tools/rbd_mirror/main.cc b/src/tools/rbd_mirror/main.cc
new file mode 100644
index 00000000..ab350a01
--- /dev/null
+++ b/src/tools/rbd_mirror/main.cc
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "Mirror.h"
+#include "Types.h"
+
+#include <vector>
+
+rbd::mirror::Mirror *mirror = nullptr;
+PerfCounters *g_perf_counters = nullptr;
+
+void usage() {
+ std::cout << "usage: rbd-mirror [options...]" << std::endl;
+ std::cout << "options:\n";
+ std::cout << " -m monaddress[:port] connect to specified monitor\n";
+ std::cout << " --keyring=<path> path to keyring for local cluster\n";
+ std::cout << " --log-file=<logfile> file to log debug output\n";
+ std::cout << " --debug-rbd-mirror=<log-level>/<memory-level> set rbd-mirror debug level\n";
+ generic_server_usage();
+}
+
+static void handle_signal(int signum)
+{
+ if (mirror)
+ mirror->handle_signal(signum);
+}
+
+int main(int argc, const char **argv)
+{
+ std::vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+ if (g_conf()->daemonize) {
+ global_init_daemonize(g_ceph_context);
+ }
+
+ common_init_finish(g_ceph_context);
+
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, handle_signal);
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+ std::vector<const char*> cmd_args;
+ argv_to_vec(argc, argv, cmd_args);
+
+ // disable unnecessary librbd cache
+ g_ceph_context->_conf.set_val_or_die("rbd_cache", "false");
+
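+  // register rbd-mirror replay perf counters with the global collection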
+ auto prio =
+ g_ceph_context->_conf.get_val<int64_t>("rbd_mirror_perf_stats_prio");
+ PerfCountersBuilder plb(g_ceph_context, "rbd_mirror",
+ rbd::mirror::l_rbd_mirror_first,
+ rbd::mirror::l_rbd_mirror_last);
+ plb.add_u64_counter(rbd::mirror::l_rbd_mirror_replay, "replay", "Replays",
+ "r", prio);
+ plb.add_u64_counter(rbd::mirror::l_rbd_mirror_replay_bytes, "replay_bytes",
+ "Replayed data", "rb", prio, unit_t(UNIT_BYTES));
+ plb.add_time_avg(rbd::mirror::l_rbd_mirror_replay_latency, "replay_latency",
+ "Replay latency", "rl", prio);
+ g_perf_counters = plb.create_perf_counters();
+ g_ceph_context->get_perfcounters_collection()->add(g_perf_counters);
+
+ mirror = new rbd::mirror::Mirror(g_ceph_context, cmd_args);
+ int r = mirror->init();
+ if (r < 0) {
+ std::cerr << "failed to initialize: " << cpp_strerror(r) << std::endl;
+ goto cleanup;
+ }
+
+ mirror->run();
+
+ cleanup:
+ unregister_async_signal_handler(SIGHUP, handle_signal);
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ g_ceph_context->get_perfcounters_collection()->remove(g_perf_counters);
+
+ delete mirror;
+ delete g_perf_counters;
+
+  return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc
new file mode 100644
index 00000000..a1d9c1b5
--- /dev/null
+++ b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+#include <map>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::pool_watcher::RefreshImagesRequest: " \
+ << this << " " << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace pool_watcher {
+
+static const uint32_t MAX_RETURN = 1024;
+
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void RefreshImagesRequest<I>::send() {
+ m_image_ids->clear();
+ mirror_image_list();
+}
+
+template <typename I>
+void RefreshImagesRequest<I>::mirror_image_list() {
+ dout(10) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_list_start(&op, m_start_after, MAX_RETURN);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ RefreshImagesRequest<I>,
+ &RefreshImagesRequest<I>::handle_mirror_image_list>(this);
+ int r = m_remote_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void RefreshImagesRequest<I>::handle_mirror_image_list(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ std::map<std::string, std::string> ids;
+ if (r == 0) {
+ auto it = m_out_bl.cbegin();
+ r = librbd::cls_client::mirror_image_list_finish(&it, &ids);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ derr << "failed to list mirrored images: " << cpp_strerror(r) << dendl;
+ finish(r);
+ return;
+ }
+
+ // store as global -> local image ids
+ for (auto &id : ids) {
+ m_image_ids->emplace(id.second, id.first);
+ }
+
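+  // a full page was returned, so more images may remain; continue after the last key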
+ if (ids.size() == MAX_RETURN) {
+ m_start_after = ids.rbegin()->first;
+ mirror_image_list();
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void RefreshImagesRequest<I>::finish(int r) {
+ dout(10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace pool_watcher
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::pool_watcher::RefreshImagesRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h
new file mode 100644
index 00000000..8bfeabe2
--- /dev/null
+++ b/src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H
+#define CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H
+
+#include "include/buffer.h"
+#include "include/rados/librados.hpp"
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+struct Context;
+
+namespace librbd { struct ImageCtx; }
+
+namespace rbd {
+namespace mirror {
+namespace pool_watcher {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class RefreshImagesRequest {
+public:
+ static RefreshImagesRequest *create(librados::IoCtx &remote_io_ctx,
+ ImageIds *image_ids, Context *on_finish) {
+ return new RefreshImagesRequest(remote_io_ctx, image_ids, on_finish);
+ }
+
+ RefreshImagesRequest(librados::IoCtx &remote_io_ctx, ImageIds *image_ids,
+ Context *on_finish)
+ : m_remote_io_ctx(remote_io_ctx), m_image_ids(image_ids),
+ m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * | /-------------\
+ * | | |
+ * v v | (more images)
+ * MIRROR_IMAGE_LIST ---/
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_remote_io_ctx;
+ ImageIds *m_image_ids;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+ std::string m_start_after;
+
+ void mirror_image_list();
+ void handle_mirror_image_list(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace pool_watcher
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::pool_watcher::RefreshImagesRequest<librbd::ImageCtx>;
+
+#endif // CEPH_RBD_MIRROR_POOL_WATCHER_REFRESH_IMAGES_REQUEST_H
diff --git a/src/tools/rbd_mirror/pool_watcher/Types.h b/src/tools/rbd_mirror/pool_watcher/Types.h
new file mode 100644
index 00000000..52dfc342
--- /dev/null
+++ b/src/tools/rbd_mirror/pool_watcher/Types.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H
+#define CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H
+
+#include "tools/rbd_mirror/Types.h"
+#include <string>
+
+namespace rbd {
+namespace mirror {
+namespace pool_watcher {
+
+struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) = 0;
+};
+
+} // namespace pool_watcher
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_POOL_WATCHER_TYPES_H
diff --git a/src/tools/rbd_mirror/service_daemon/Types.cc b/src/tools/rbd_mirror/service_daemon/Types.cc
new file mode 100644
index 00000000..7dc6537c
--- /dev/null
+++ b/src/tools/rbd_mirror/service_daemon/Types.cc
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/service_daemon/Types.h"
+#include <iostream>
+
+namespace rbd {
+namespace mirror {
+namespace service_daemon {
+
+std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level) {
+ switch (callout_level) {
+ case CALLOUT_LEVEL_INFO:
+ os << "info";
+ break;
+ case CALLOUT_LEVEL_WARNING:
+ os << "warning";
+ break;
+ case CALLOUT_LEVEL_ERROR:
+ os << "error";
+ break;
+ }
+ return os;
+}
+
+} // namespace service_daemon
+} // namespace mirror
+} // namespace rbd
+
diff --git a/src/tools/rbd_mirror/service_daemon/Types.h b/src/tools/rbd_mirror/service_daemon/Types.h
new file mode 100644
index 00000000..3aab7201
--- /dev/null
+++ b/src/tools/rbd_mirror/service_daemon/Types.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
+#define CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
+
+#include "include/int_types.h"
+#include <iosfwd>
+#include <string>
+#include <boost/variant.hpp>
+
+namespace rbd {
+namespace mirror {
+namespace service_daemon {
+
+typedef uint64_t CalloutId;
+const uint64_t CALLOUT_ID_NONE {0};
+
+enum CalloutLevel {
+ CALLOUT_LEVEL_INFO,
+ CALLOUT_LEVEL_WARNING,
+ CALLOUT_LEVEL_ERROR
+};
+
+std::ostream& operator<<(std::ostream& os, const CalloutLevel& callout_level);
+
+typedef boost::variant<bool, uint64_t, std::string> AttributeValue;
+
+} // namespace service_daemon
+} // namespace mirror
+} // namespace rbd
+
+#endif // CEPH_RBD_MIRROR_SERVICE_DAEMON_TYPES_H
diff --git a/src/tools/rbd_nbd/CMakeLists.txt b/src/tools/rbd_nbd/CMakeLists.txt
new file mode 100644
index 00000000..5356fae4
--- /dev/null
+++ b/src/tools/rbd_nbd/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(rbd-nbd rbd-nbd.cc)
+target_include_directories(rbd-nbd PUBLIC ${GENL_INCLUDE_DIR})
+target_link_libraries(rbd-nbd librbd librados global ${GENL_LIBRARIES})
+install(TARGETS rbd-nbd DESTINATION bin)
diff --git a/src/tools/rbd_nbd/nbd-netlink.h b/src/tools/rbd_nbd/nbd-netlink.h
new file mode 100644
index 00000000..f932f96a
--- /dev/null
+++ b/src/tools/rbd_nbd/nbd-netlink.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2017 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef _UAPILINUX_NBD_NETLINK_H
+#define _UAPILINUX_NBD_NETLINK_H
+
+#define NBD_GENL_FAMILY_NAME "nbd"
+#define NBD_GENL_VERSION 0x1
+
+/* Configuration policy attributes, used for CONNECT */
+enum {
+ NBD_ATTR_UNSPEC,
+ NBD_ATTR_INDEX,
+ NBD_ATTR_SIZE_BYTES,
+ NBD_ATTR_BLOCK_SIZE_BYTES,
+ NBD_ATTR_TIMEOUT,
+ NBD_ATTR_SERVER_FLAGS,
+ NBD_ATTR_CLIENT_FLAGS,
+ NBD_ATTR_SOCKETS,
+ __NBD_ATTR_MAX,
+};
+#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)
+
+/*
+ * This is the format for multiple sockets with NBD_ATTR_SOCKETS
+ *
+ * [NBD_ATTR_SOCKETS]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ */
+enum {
+ NBD_SOCK_ITEM_UNSPEC,
+ NBD_SOCK_ITEM,
+ __NBD_SOCK_ITEM_MAX,
+};
+#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1)
+
+enum {
+ NBD_SOCK_UNSPEC,
+ NBD_SOCK_FD,
+ __NBD_SOCK_MAX,
+};
+#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1)
+
+enum {
+ NBD_CMD_UNSPEC,
+ NBD_CMD_CONNECT,
+ NBD_CMD_DISCONNECT,
+ NBD_CMD_RECONFIGURE,
+ __NBD_CMD_MAX,
+};
+#define NBD_CMD_MAX (__NBD_CMD_MAX - 1)
+
+#endif /* _UAPILINUX_NBD_NETLINK_H */
diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc
new file mode 100644
index 00000000..42dc92ad
--- /dev/null
+++ b/src/tools/rbd_nbd/rbd-nbd.cc
@@ -0,0 +1,1615 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * rbd-nbd - RBD in userspace
+ *
+ * Copyright (C) 2015 - 2016 Kylin Corporation
+ *
+ * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com>
+ * Li Wang <li.wang@kylin-cloud.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+*/
+
+#include "include/int_types.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/nbd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "nbd-netlink.h"
+#include <libnl3/netlink/genl/genl.h>
+#include <libnl3/netlink/genl/ctrl.h>
+#include <libnl3/netlink/genl/mngt.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <regex>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "common/Formatter.h"
+#include "common/Preforker.h"
+#include "common/TextTable.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/module.h"
+#include "common/safe_io.h"
+#include "common/version.h"
+
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "include/stringify.h"
+#include "include/xlist.h"
+
+#include "mon/MonClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd-nbd: "
+
+struct Config {
+ int nbds_max = 0;
+ int max_part = 255;
+ int timeout = -1;
+
+ bool exclusive = false;
+ bool readonly = false;
+ bool set_max_part = false;
+ bool try_netlink = false;
+
+ std::string poolname;
+ std::string nsname;
+ std::string imgname;
+ std::string snapname;
+ std::string devpath;
+
+ std::string format;
+ bool pretty_format = false;
+};
+
+static void usage()
+{
+ std::cout << "Usage: rbd-nbd [options] map <image-or-snap-spec> Map an image to nbd device\n"
+ << " unmap <device|image-or-snap-spec> Unmap nbd device\n"
+ << " [options] list-mapped List mapped nbd devices\n"
+ << "Map options:\n"
+ << " --device <device path> Specify nbd device path (/dev/nbd{num})\n"
+ << " --read-only Map read-only\n"
+ << " --nbds_max <limit> Override for module param nbds_max\n"
+ << " --max_part <limit> Override for module param max_part\n"
+ << " --exclusive Forbid writes by other clients\n"
+ << " --timeout <seconds> Set nbd request timeout\n"
+ << " --try-netlink Use the nbd netlink interface\n"
+ << "\n"
+ << "List options:\n"
+ << " --format plain|json|xml Output format (default: plain)\n"
+ << " --pretty-format Pretty formatting (json and xml)\n"
+ << std::endl;
+ generic_server_usage();
+}
+
+static int nbd = -1;
+static int nbd_index = -1;
+
+enum Command {
+ None,
+ Connect,
+ Disconnect,
+ List
+};
+
+static Command cmd = None;
+
+#define RBD_NBD_BLKSIZE 512UL
+
+#define HELP_INFO 1
+#define VERSION_INFO 2
+
+#ifdef CEPH_BIG_ENDIAN
+#define ntohll(a) (a)
+#elif defined(CEPH_LITTLE_ENDIAN)
+#define ntohll(a) swab(a)
+#else
+#error "Could not determine endianness"
+#endif
+#define htonll(a) ntohll(a)
+
+static int parse_args(vector<const char*>& args, std::ostream *err_msg,
+ Command *command, Config *cfg);
+static int netlink_resize(int nbd_index, uint64_t size);
+
+class NBDServer
+{
+private:
+ int fd;
+ librbd::Image &image;
+
+public:
+ NBDServer(int _fd, librbd::Image& _image)
+ : fd(_fd)
+ , image(_image)
+ , disconnect_lock("NBDServer::DisconnectLocker")
+ , lock("NBDServer::Locker")
+ , reader_thread(*this, &NBDServer::reader_entry)
+ , writer_thread(*this, &NBDServer::writer_entry)
+ , started(false)
+ {}
+
+private:
+ Mutex disconnect_lock;
+ Cond disconnect_cond;
+ std::atomic<bool> terminated = { false };
+
+ void shutdown()
+ {
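+    // only the first caller performs the socket shutdown and wake-up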
+ bool expected = false;
+ if (terminated.compare_exchange_strong(expected, true)) {
+ ::shutdown(fd, SHUT_RDWR);
+
+ Mutex::Locker l(lock);
+ cond.Signal();
+ }
+ }
+
+ struct IOContext
+ {
+ xlist<IOContext*>::item item;
+ NBDServer *server = nullptr;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ bufferlist data;
+ int command = 0;
+
+ IOContext()
+ : item(this)
+ {}
+ };
+
+ friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx);
+
+ Mutex lock;
+ Cond cond;
+ xlist<IOContext*> io_pending;
+ xlist<IOContext*> io_finished;
+
+ void io_start(IOContext *ctx)
+ {
+ Mutex::Locker l(lock);
+ io_pending.push_back(&ctx->item);
+ }
+
+ void io_finish(IOContext *ctx)
+ {
+ Mutex::Locker l(lock);
+ ceph_assert(ctx->item.is_on_list());
+ ctx->item.remove_myself();
+ io_finished.push_back(&ctx->item);
+ cond.Signal();
+ }
+
+ IOContext *wait_io_finish()
+ {
+ Mutex::Locker l(lock);
+ while(io_finished.empty() && !terminated)
+ cond.Wait(lock);
+
+ if (io_finished.empty())
+ return NULL;
+
+ IOContext *ret = io_finished.front();
+ io_finished.pop_front();
+
+ return ret;
+ }
+
+ void wait_clean()
+ {
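+    // wait for in-flight IO to drain, then free any completed contexts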
+ ceph_assert(!reader_thread.is_started());
+ Mutex::Locker l(lock);
+ while(!io_pending.empty())
+ cond.Wait(lock);
+
+ while(!io_finished.empty()) {
+ std::unique_ptr<IOContext> free_ctx(io_finished.front());
+ io_finished.pop_front();
+ }
+ }
+
+ static void aio_callback(librbd::completion_t cb, void *arg)
+ {
+ librbd::RBD::AioCompletion *aio_completion =
+ reinterpret_cast<librbd::RBD::AioCompletion*>(cb);
+
+ IOContext *ctx = reinterpret_cast<IOContext *>(arg);
+ int ret = aio_completion->get_return_value();
+
+ dout(20) << __func__ << ": " << *ctx << dendl;
+
+ if (ret == -EINVAL) {
+ // if shrinking an image, a pagecache writeback might reference
+ // extents outside of the range of the new image extents
+ dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl;
+ ctx->data.clear();
+ ret = 0;
+ }
+
+ if (ret < 0) {
+ ctx->reply.error = htonl(-ret);
+ } else if ((ctx->command == NBD_CMD_READ) &&
+ ret < static_cast<int>(ctx->request.len)) {
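+      // zero-pad short reads so the reply length matches the requested length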
+ int pad_byte_count = static_cast<int> (ctx->request.len) - ret;
+ ctx->data.append_zero(pad_byte_count);
+ dout(20) << __func__ << ": " << *ctx << ": Pad byte count: "
+ << pad_byte_count << dendl;
+ ctx->reply.error = htonl(0);
+ } else {
+ ctx->reply.error = htonl(0);
+ }
+ ctx->server->io_finish(ctx);
+
+ aio_completion->release();
+ }
+
+ void reader_entry()
+ {
+ while (!terminated) {
+ std::unique_ptr<IOContext> ctx(new IOContext());
+ ctx->server = this;
+
+ dout(20) << __func__ << ": waiting for nbd request" << dendl;
+
+ int r = safe_read_exact(fd, &ctx->request, sizeof(struct nbd_request));
+ if (r < 0) {
+ derr << "failed to read nbd request header: " << cpp_strerror(r)
+ << dendl;
+ goto signal;
+ }
+
+ if (ctx->request.magic != htonl(NBD_REQUEST_MAGIC)) {
+ derr << "invalid nbd request header" << dendl;
+ goto signal;
+ }
+
+ ctx->request.from = ntohll(ctx->request.from);
+ ctx->request.type = ntohl(ctx->request.type);
+ ctx->request.len = ntohl(ctx->request.len);
+
+ ctx->reply.magic = htonl(NBD_REPLY_MAGIC);
+ memcpy(ctx->reply.handle, ctx->request.handle, sizeof(ctx->reply.handle));
+
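+      // the low 16 bits of the request type carry the NBD command; the rest are flags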
+ ctx->command = ctx->request.type & 0x0000ffff;
+
+ dout(20) << *ctx << ": start" << dendl;
+
+ switch (ctx->command)
+ {
+ case NBD_CMD_DISC:
+ // NBD_DO_IT will return when pipe is closed
+ dout(0) << "disconnect request received" << dendl;
+ goto signal;
+ case NBD_CMD_WRITE:
+ bufferptr ptr(ctx->request.len);
+ r = safe_read_exact(fd, ptr.c_str(), ctx->request.len);
+ if (r < 0) {
+ derr << *ctx << ": failed to read nbd request data: "
+ << cpp_strerror(r) << dendl;
+ goto signal;
+ }
+ ctx->data.push_back(ptr);
+ break;
+ }
+
+ IOContext *pctx = ctx.release();
+ io_start(pctx);
+ librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(pctx, aio_callback);
+ switch (pctx->command)
+ {
+ case NBD_CMD_WRITE:
+ image.aio_write(pctx->request.from, pctx->request.len, pctx->data, c);
+ break;
+ case NBD_CMD_READ:
+ image.aio_read(pctx->request.from, pctx->request.len, pctx->data, c);
+ break;
+ case NBD_CMD_FLUSH:
+ image.aio_flush(c);
+ break;
+ case NBD_CMD_TRIM:
+ image.aio_discard(pctx->request.from, pctx->request.len, c);
+ break;
+ default:
+ derr << *pctx << ": invalid request command" << dendl;
+ c->release();
+ goto signal;
+ }
+ }
+ dout(20) << __func__ << ": terminated" << dendl;
+
+signal:
+ Mutex::Locker l(disconnect_lock);
+ disconnect_cond.Signal();
+ }
+
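+ // Writer thread: waits for completed IOContexts and sends the nbd_reply header
+ // (plus the data payload for successful reads) back over the socket.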
+ void writer_entry()
+ {
+ while (!terminated) {
+ dout(20) << __func__ << ": waiting for io request" << dendl;
+ std::unique_ptr<IOContext> ctx(wait_io_finish());
+ if (!ctx) {
+ dout(20) << __func__ << ": no io requests, terminating" << dendl;
+ return;
+ }
+
+ dout(20) << __func__ << ": got: " << *ctx << dendl;
+
+ int r = safe_write(fd, &ctx->reply, sizeof(struct nbd_reply));
+ if (r < 0) {
+ derr << *ctx << ": failed to write reply header: " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+ if (ctx->command == NBD_CMD_READ && ctx->reply.error == htonl(0)) {
+ r = ctx->data.write_fd(fd);
+ if (r < 0) {
+ derr << *ctx << ": failed to write replay data: " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+ }
+ dout(20) << *ctx << ": finish" << dendl;
+ }
+ dout(20) << __func__ << ": terminated" << dendl;
+ }
+
+ class ThreadHelper : public Thread
+ {
+ public:
+ typedef void (NBDServer::*entry_func)();
+ private:
+ NBDServer &server;
+ entry_func func;
+ public:
+ ThreadHelper(NBDServer &_server, entry_func _func)
+ :server(_server)
+ ,func(_func)
+ {}
+ protected:
+ void* entry() override
+ {
+ (server.*func)();
+ server.shutdown();
+ return NULL;
+ }
+ } reader_thread, writer_thread;
+
+ bool started;
+public:
+ void start()
+ {
+ if (!started) {
+ dout(10) << __func__ << ": starting" << dendl;
+
+ started = true;
+
+ reader_thread.create("rbd_reader");
+ writer_thread.create("rbd_writer");
+ }
+ }
+
+ void wait_for_disconnect()
+ {
+ if (!started)
+ return;
+
+ Mutex::Locker l(disconnect_lock);
+ disconnect_cond.Wait(disconnect_lock);
+ }
+
+ ~NBDServer()
+ {
+ if (started) {
+ dout(10) << __func__ << ": terminating" << dendl;
+
+ shutdown();
+
+ reader_thread.join();
+ writer_thread.join();
+
+ wait_clean();
+
+ started = false;
+ }
+ }
+};
+
+std::ostream &operator<<(std::ostream &os, const NBDServer::IOContext &ctx) {
+
+ os << "[" << std::hex << ntohll(*((uint64_t *)ctx.request.handle));
+
+ switch (ctx.command)
+ {
+ case NBD_CMD_WRITE:
+ os << " WRITE ";
+ break;
+ case NBD_CMD_READ:
+ os << " READ ";
+ break;
+ case NBD_CMD_FLUSH:
+ os << " FLUSH ";
+ break;
+ case NBD_CMD_TRIM:
+ os << " TRIM ";
+ break;
+ default:
+ os << " UNKNOWN(" << ctx.command << ") ";
+ break;
+ }
+
+ os << ctx.request.from << "~" << ctx.request.len << " "
+ << std::dec << ntohl(ctx.reply.error) << "]";
+
+ return os;
+}
+
+class NBDWatchCtx : public librbd::UpdateWatchCtx
+{
+private:
+ int fd;
+ int nbd_index;
+ bool use_netlink;
+ librados::IoCtx &io_ctx;
+ librbd::Image &image;
+ unsigned long size;
+public:
+ NBDWatchCtx(int _fd,
+ int _nbd_index,
+ bool _use_netlink,
+ librados::IoCtx &_io_ctx,
+ librbd::Image &_image,
+ unsigned long _size)
+ : fd(_fd)
+ , nbd_index(_nbd_index)
+ , use_netlink(_use_netlink)
+ , io_ctx(_io_ctx)
+ , image(_image)
+ , size(_size)
+ { }
+
+ ~NBDWatchCtx() override {}
+
+ void handle_notify() override
+ {
+ librbd::image_info_t info;
+ if (image.stat(info, sizeof(info)) == 0) {
+ unsigned long new_size = info.size;
+ int ret;
+
+ if (new_size != size) {
+ dout(5) << "resize detected" << dendl;
+ if (ioctl(fd, BLKFLSBUF, NULL) < 0)
+ derr << "invalidate page cache failed: " << cpp_strerror(errno)
+ << dendl;
+ if (use_netlink) {
+ ret = netlink_resize(nbd_index, new_size);
+ } else {
+ ret = ioctl(fd, NBD_SET_SIZE, new_size);
+ if (ret < 0)
+ derr << "resize failed: " << cpp_strerror(errno) << dendl;
+ }
+
+ if (!ret)
+ size = new_size;
+
+ if (ioctl(fd, BLKRRPART, NULL) < 0) {
+ derr << "rescan of partition table failed: " << cpp_strerror(errno)
+ << dendl;
+ }
+ if (image.invalidate_cache() < 0)
+ derr << "invalidate rbd cache failed" << dendl;
+ }
+ }
+ }
+};
+
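+// Iterates over the /sys/block/nbd* devices: for each one it reads the owning rbd-nbd
+// pid from sysfs and rebuilds that mapping's Config by re-parsing the process's
+// /proc/<pid>/cmdline.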
+class NBDListIterator {
+public:
+ bool get(int *pid, Config *cfg) {
+ while (true) {
+ std::string nbd_path = "/sys/block/nbd" + stringify(m_index);
+ if(access(nbd_path.c_str(), F_OK) != 0) {
+ return false;
+ }
+
+ *cfg = Config();
+ cfg->devpath = "/dev/nbd" + stringify(m_index++);
+
+ std::ifstream ifs;
+ ifs.open(nbd_path + "/pid", std::ifstream::in);
+ if (!ifs.is_open()) {
+ continue;
+ }
+ ifs >> *pid;
+
+ int r = get_mapped_info(*pid, cfg);
+ if (r < 0) {
+ continue;
+ }
+
+ return true;
+ }
+ }
+
+private:
+ int m_index = 0;
+
+ int get_mapped_info(int pid, Config *cfg) {
+ int r;
+ std::string path = "/proc/" + stringify(pid) + "/cmdline";
+ std::ifstream ifs;
+ std::string cmdline;
+ std::vector<const char*> args;
+
+ ifs.open(path.c_str(), std::ifstream::in);
+ if (!ifs.is_open())
+ return -1;
+ ifs >> cmdline;
+
+ for (unsigned i = 0; i < cmdline.size(); i++) {
+ const char *arg = &cmdline[i];
+ if (i == 0) {
+ if (strcmp(basename(arg) , "rbd-nbd") != 0) {
+ return -EINVAL;
+ }
+ } else {
+ args.push_back(arg);
+ }
+
+ while (cmdline[i] != '\0') {
+ i++;
+ }
+ }
+
+ std::ostringstream err_msg;
+ Command command;
+ r = parse_args(args, &err_msg, &command, cfg);
+ if (r < 0) {
+ return r;
+ }
+
+ if (command != Connect) {
+ return -ENOENT;
+ }
+
+ return 0;
+ }
+};
+
+static int load_module(Config *cfg)
+{
+ ostringstream param;
+ int ret;
+
+ if (cfg->nbds_max)
+ param << "nbds_max=" << cfg->nbds_max;
+
+ if (cfg->max_part)
+ param << " max_part=" << cfg->max_part;
+
+ if (!access("/sys/module/nbd", F_OK)) {
+ if (cfg->nbds_max || cfg->set_max_part)
+ cerr << "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded"
+ << std::endl;
+ return 0;
+ }
+
+ ret = module_load("nbd", param.str().c_str());
+ if (ret < 0)
+ cerr << "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-ret)
+ << std::endl;
+
+ return ret;
+}
+
+static int check_device_size(int nbd_index, unsigned long expected_size)
+{
+ // There are bugs with some older kernel versions that result in an
+ // overflow for large image sizes. This check is to ensure we are
+ // not affected.
+
+ unsigned long size = 0;
+ std::string path = "/sys/block/nbd" + stringify(nbd_index) + "/size";
+ std::ifstream ifs;
+ ifs.open(path.c_str(), std::ifstream::in);
+ if (!ifs.is_open()) {
+ cerr << "rbd-nbd: failed to open " << path << std::endl;
+ return -EINVAL;
+ }
+ ifs >> size;
+ size *= RBD_NBD_BLKSIZE;
+
+ if (size == 0) {
+ // Newer kernel versions will report real size only after nbd
+ // connect. Assume this is the case and return success.
+ return 0;
+ }
+
+ if (size != expected_size) {
+ cerr << "rbd-nbd: kernel reported invalid device size (" << size
+ << ", expected " << expected_size << ")" << std::endl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int parse_nbd_index(const std::string& devpath)
+{
+ int index, ret;
+
+ ret = sscanf(devpath.c_str(), "/dev/nbd%d", &index);
+ if (ret <= 0) {
+ // ret == 0 means sscanf matched nothing; convert that to a negative error code for the caller.
+ if (ret == 0)
+ ret = -EINVAL;
+ cerr << "rbd-nbd: invalid device path: " << devpath
+ << " (expected /dev/nbd{num})" << std::endl;
+ return ret;
+ }
+
+ return index;
+}
+
+static int try_ioctl_setup(Config *cfg, int fd, uint64_t size, uint64_t flags)
+{
+ int index = 0, r;
+
+ if (cfg->devpath.empty()) {
+ char dev[64];
+ const char *path = "/sys/module/nbd/parameters/nbds_max";
+ int nbds_max = -1;
+ if (access(path, F_OK) == 0) {
+ std::ifstream ifs;
+ ifs.open(path, std::ifstream::in);
+ if (ifs.is_open()) {
+ ifs >> nbds_max;
+ ifs.close();
+ }
+ }
+
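+ // Probe /dev/nbd0, /dev/nbd1, ... until a device can be opened and accepts
+ // NBD_SET_SOCK (i.e. is not already in use), or no candidate device is left.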
+ while (true) {
+ snprintf(dev, sizeof(dev), "/dev/nbd%d", index);
+
+ nbd = open(dev, O_RDWR);
+ if (nbd < 0) {
+ if (nbd == -EPERM && nbds_max != -1 && index < (nbds_max-1)) {
+ ++index;
+ continue;
+ }
+ r = nbd;
+ cerr << "rbd-nbd: failed to find unused device" << std::endl;
+ goto done;
+ }
+
+ r = ioctl(nbd, NBD_SET_SOCK, fd);
+ if (r < 0) {
+ close(nbd);
+ ++index;
+ continue;
+ }
+
+ cfg->devpath = dev;
+ break;
+ }
+ } else {
+ r = parse_nbd_index(cfg->devpath);
+ if (r < 0)
+ goto done;
+ index = r;
+
+ nbd = open(cfg->devpath.c_str(), O_RDWR);
+ if (nbd < 0) {
+ r = nbd;
+ cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
+ goto done;
+ }
+
+ r = ioctl(nbd, NBD_SET_SOCK, fd);
+ if (r < 0) {
+ r = -errno;
+ cerr << "rbd-nbd: the device " << cfg->devpath << " is busy" << std::endl;
+ close(nbd);
+ goto done;
+ }
+ }
+
+ r = ioctl(nbd, NBD_SET_BLKSIZE, RBD_NBD_BLKSIZE);
+ if (r < 0) {
+ r = -errno;
+ goto close_nbd;
+ }
+
+ r = ioctl(nbd, NBD_SET_SIZE, size);
+ if (r < 0) {
+ r = -errno;
+ goto close_nbd;
+ }
+
+ ioctl(nbd, NBD_SET_FLAGS, flags);
+
+ if (cfg->timeout >= 0) {
+ r = ioctl(nbd, NBD_SET_TIMEOUT, (unsigned long)cfg->timeout);
+ if (r < 0) {
+ r = -errno;
+ cerr << "rbd-nbd: failed to set timeout: " << cpp_strerror(r)
+ << std::endl;
+ goto close_nbd;
+ }
+ }
+
+ dout(10) << "ioctl setup complete for " << cfg->devpath << dendl;
+ nbd_index = index;
+ return 0;
+
+close_nbd:
+ if (r < 0) {
+ ioctl(nbd, NBD_CLEAR_SOCK);
+ cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) << std::endl;
+ }
+ close(nbd);
+done:
+ return r;
+}
+
+static void netlink_cleanup(struct nl_sock *sock)
+{
+ if (!sock)
+ return;
+
+ nl_close(sock);
+ nl_socket_free(sock);
+}
+
+static struct nl_sock *netlink_init(int *id)
+{
+ struct nl_sock *sock;
+ int ret;
+
+ sock = nl_socket_alloc();
+ if (!sock) {
+ cerr << "rbd-nbd: Could not allocate netlink socket." << std::endl;
+ return NULL;
+ }
+
+ ret = genl_connect(sock);
+ if (ret < 0) {
+ cerr << "rbd-nbd: Could not connect netlink socket. Error " << ret
+ << std::endl;
+ goto free_sock;
+ }
+
+ *id = genl_ctrl_resolve(sock, "nbd");
+ if (*id < 0)
+ // nbd netlink interface not supported.
+ goto close_sock;
+
+ return sock;
+
+close_sock:
+ nl_close(sock);
+free_sock:
+ nl_socket_free(sock);
+ return NULL;
+}
+
+static int netlink_disconnect(int index)
+{
+ struct nl_sock *sock;
+ struct nl_msg *msg;
+ int ret, nl_id;
+
+ sock = netlink_init(&nl_id);
+ if (!sock)
+ // Try ioctl
+ return 1;
+
+ nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL);
+
+ msg = nlmsg_alloc();
+ if (!msg) {
+ cerr << "rbd-nbd: Could not allocate netlink message." << std::endl;
+ goto free_sock;
+ }
+
+ if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0,
+ NBD_CMD_DISCONNECT, 0)) {
+ cerr << "rbd-nbd: Could not setup message." << std::endl;
+ goto nla_put_failure;
+ }
+
+ NLA_PUT_U32(msg, NBD_ATTR_INDEX, index);
+
+ ret = nl_send_sync(sock, msg);
+ netlink_cleanup(sock);
+ if (ret < 0) {
+ cerr << "rbd-nbd: netlink disconnect failed: " << nl_geterror(-ret)
+ << std::endl;
+ return -EIO;
+ }
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_free(msg);
+free_sock:
+ netlink_cleanup(sock);
+ return -EIO;
+}
+
+static int netlink_disconnect_by_path(const std::string& devpath)
+{
+ int index;
+
+ index = parse_nbd_index(devpath);
+ if (index < 0)
+ return index;
+
+ return netlink_disconnect(index);
+}
+
+static int netlink_resize(int nbd_index, uint64_t size)
+{
+ struct nl_sock *sock;
+ struct nl_msg *msg;
+ int nl_id, ret;
+
+ sock = netlink_init(&nl_id);
+ if (!sock) {
+ cerr << "rbd-nbd: Netlink interface not supported." << std::endl;
+ return 1;
+ }
+
+ nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, genl_handle_msg, NULL);
+
+ msg = nlmsg_alloc();
+ if (!msg) {
+ cerr << "rbd-nbd: Could not allocate netlink message." << std::endl;
+ goto free_sock;
+ }
+
+ if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0,
+ NBD_CMD_RECONFIGURE, 0)) {
+ cerr << "rbd-nbd: Could not setup message." << std::endl;
+ goto free_msg;
+ }
+
+ NLA_PUT_U32(msg, NBD_ATTR_INDEX, nbd_index);
+ NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size);
+
+ ret = nl_send_sync(sock, msg);
+ if (ret < 0) {
+ cerr << "rbd-nbd: netlink resize failed: " << nl_geterror(ret) << std::endl;
+ goto free_sock;
+ }
+
+ netlink_cleanup(sock);
+ dout(10) << "netlink resize complete for nbd" << nbd_index << dendl;
+ return 0;
+
+nla_put_failure:
+free_msg:
+ nlmsg_free(msg);
+free_sock:
+ netlink_cleanup(sock);
+ return -EIO;
+}
+
+static int netlink_connect_cb(struct nl_msg *msg, void *arg)
+{
+ struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlmsg_hdr(msg));
+ Config *cfg = (Config *)arg;
+ struct nlattr *msg_attr[NBD_ATTR_MAX + 1];
+ uint32_t index;
+ int ret;
+
+ ret = nla_parse(msg_attr, NBD_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
+ genlmsg_attrlen(gnlh, 0), NULL);
+ if (ret) {
+ cerr << "rbd-nbd: Unsupported netlink reply" << std::endl;
+ return -NLE_MSGTYPE_NOSUPPORT;
+ }
+
+ if (!msg_attr[NBD_ATTR_INDEX]) {
+ cerr << "rbd-nbd: netlink connect reply missing device index." << std::endl;
+ return -NLE_MSGTYPE_NOSUPPORT;
+ }
+
+ index = nla_get_u32(msg_attr[NBD_ATTR_INDEX]);
+ cfg->devpath = "/dev/nbd" + stringify(index);
+ nbd_index = index;
+
+ return NL_OK;
+}
+
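+// Builds and sends an NBD_CMD_CONNECT genetlink message carrying the requested device
+// index (if any), timeout, size, block size, server flags, and the socket fd nested
+// under NBD_ATTR_SOCKETS/NBD_SOCK_ITEM; the kernel's reply (the allocated device
+// index) is handled by netlink_connect_cb above.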
+static int netlink_connect(Config *cfg, struct nl_sock *sock, int nl_id, int fd,
+ uint64_t size, uint64_t flags)
+{
+ struct nlattr *sock_attr;
+ struct nlattr *sock_opt;
+ struct nl_msg *msg;
+ int ret;
+
+ nl_socket_modify_cb(sock, NL_CB_VALID, NL_CB_CUSTOM, netlink_connect_cb, cfg);
+
+ msg = nlmsg_alloc();
+ if (!msg) {
+ cerr << "rbd-nbd: Could not allocate netlink message." << std::endl;
+ return -ENOMEM;
+ }
+
+ if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl_id, 0, 0, NBD_CMD_CONNECT,
+ 0)) {
+ cerr << "rbd-nbd: Could not setup message." << std::endl;
+ goto free_msg;
+ }
+
+ if (!cfg->devpath.empty()) {
+ ret = parse_nbd_index(cfg->devpath);
+ if (ret < 0)
+ goto free_msg;
+
+ NLA_PUT_U32(msg, NBD_ATTR_INDEX, ret);
+ }
+
+ if (cfg->timeout >= 0)
+ NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, cfg->timeout);
+
+ NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, size);
+ NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, RBD_NBD_BLKSIZE);
+ NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, flags);
+
+ sock_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS);
+ if (!sock_attr) {
+ cerr << "rbd-nbd: Could not init sockets in netlink message." << std::endl;
+ goto free_msg;
+ }
+
+ sock_opt = nla_nest_start(msg, NBD_SOCK_ITEM);
+ if (!sock_opt) {
+ cerr << "rbd-nbd: Could not init sock in netlink message." << std::endl;
+ goto free_msg;
+ }
+
+ NLA_PUT_U32(msg, NBD_SOCK_FD, fd);
+ nla_nest_end(msg, sock_opt);
+ nla_nest_end(msg, sock_attr);
+
+ ret = nl_send_sync(sock, msg);
+ if (ret < 0) {
+ cerr << "rbd-nbd: netlink connect failed: " << nl_geterror(ret)
+ << std::endl;
+ return -EIO;
+ }
+
+ dout(10) << "netlink connect complete for " << cfg->devpath << dendl;
+ return 0;
+
+nla_put_failure:
+free_msg:
+ nlmsg_free(msg);
+ return -EIO;
+}
+
+static int try_netlink_setup(Config *cfg, int fd, uint64_t size, uint64_t flags)
+{
+ struct nl_sock *sock;
+ int nl_id, ret;
+
+ sock = netlink_init(&nl_id);
+ if (!sock) {
+ cerr << "rbd-nbd: Netlink interface not supported. Using ioctl interface."
+ << std::endl;
+ return 1;
+ }
+
+ dout(10) << "netlink interface supported." << dendl;
+
+ ret = netlink_connect(cfg, sock, nl_id, fd, size, flags);
+ netlink_cleanup(sock);
+
+ if (ret != 0)
+ return ret;
+
+ nbd = open(cfg->devpath.c_str(), O_RDWR);
+ if (nbd < 0) {
+ cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
+ return nbd;
+ }
+
+ return 0;
+}
+
+static void handle_signal(int signum)
+{
+ int ret;
+
+ ceph_assert(signum == SIGINT || signum == SIGTERM);
+ derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
+
+ if (nbd < 0 || nbd_index < 0) {
+ dout(20) << __func__ << ": " << "disconnect not needed." << dendl;
+ return;
+ }
+
+ dout(20) << __func__ << ": " << "sending NBD_DISCONNECT" << dendl;
+ ret = netlink_disconnect(nbd_index);
+ if (ret == 1)
+ ret = ioctl(nbd, NBD_DISCONNECT);
+
+ if (ret != 0) {
+ derr << "rbd-nbd: disconnect failed. Error: " << ret << dendl;
+ } else {
+ dout(20) << __func__ << ": " << "disconnected" << dendl;
+ }
+}
+
+static NBDServer *start_server(int fd, librbd::Image& image)
+{
+ NBDServer *server;
+
+ server = new NBDServer(fd, image);
+ server->start();
+
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, sighup_handler);
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+ return server;
+}
+
+static void run_server(Preforker& forker, NBDServer *server, bool netlink_used)
+{
+ if (g_conf()->daemonize) {
+ global_init_postfork_finish(g_ceph_context);
+ forker.daemonize();
+ }
+
+ if (netlink_used)
+ server->wait_for_disconnect();
+ else
+ ioctl(nbd, NBD_DO_IT);
+
+ unregister_async_signal_handler(SIGHUP, sighup_handler);
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+}
+
+static int do_map(int argc, const char *argv[], Config *cfg)
+{
+ int r;
+
+ librados::Rados rados;
+ librbd::RBD rbd;
+ librados::IoCtx io_ctx;
+ librbd::Image image;
+
+ int read_only = 0;
+ unsigned long flags;
+ unsigned long size;
+ bool use_netlink;
+
+ int fd[2];
+
+ librbd::image_info_t info;
+
+ Preforker forker;
+ NBDServer *server;
+
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+ g_ceph_context->_conf.set_val_or_die("pid_file", "");
+
+ if (global_init_prefork(g_ceph_context) >= 0) {
+ std::string err;
+ r = forker.prefork(err);
+ if (r < 0) {
+ cerr << err << std::endl;
+ return r;
+ }
+ if (forker.is_parent()) {
+ if (forker.parent_wait(err) != 0) {
+ return -ENXIO;
+ }
+ return 0;
+ }
+ global_init_postfork_start(g_ceph_context);
+ }
+
+ common_init_finish(g_ceph_context);
+ global_init_chdir(g_ceph_context);
+
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1) {
+ r = -errno;
+ goto close_ret;
+ }
+
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0)
+ goto close_fd;
+
+ r = rados.connect();
+ if (r < 0)
+ goto close_fd;
+
+ r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx);
+ if (r < 0)
+ goto close_fd;
+
+ io_ctx.set_namespace(cfg->nsname);
+
+ r = rbd.open(io_ctx, image, cfg->imgname.c_str());
+ if (r < 0)
+ goto close_fd;
+
+ if (cfg->exclusive) {
+ r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
+ if (r < 0) {
+ cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r)
+ << std::endl;
+ goto close_fd;
+ }
+ }
+
+ if (!cfg->snapname.empty()) {
+ r = image.snap_set(cfg->snapname.c_str());
+ if (r < 0)
+ goto close_fd;
+ }
+
+ r = image.stat(info, sizeof(info));
+ if (r < 0)
+ goto close_fd;
+
+ flags = NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_TRIM | NBD_FLAG_HAS_FLAGS;
+ if (!cfg->snapname.empty() || cfg->readonly) {
+ flags |= NBD_FLAG_READ_ONLY;
+ read_only = 1;
+ }
+
+ if (info.size > ULONG_MAX) {
+ r = -EFBIG;
+ cerr << "rbd-nbd: image is too large (" << byte_u_t(info.size)
+ << ", max is " << byte_u_t(ULONG_MAX) << ")" << std::endl;
+ goto close_fd;
+ }
+
+ size = info.size;
+
+ r = load_module(cfg);
+ if (r < 0)
+ goto close_fd;
+
+ server = start_server(fd[1], image);
+
+ use_netlink = cfg->try_netlink;
+ if (use_netlink) {
+ r = try_netlink_setup(cfg, fd[0], size, flags);
+ if (r < 0) {
+ goto free_server;
+ } else if (r == 1) {
+ use_netlink = false;
+ }
+ }
+
+ if (!use_netlink) {
+ r = try_ioctl_setup(cfg, fd[0], size, flags);
+ if (r < 0)
+ goto free_server;
+ }
+
+ r = check_device_size(nbd_index, size);
+ if (r < 0)
+ goto close_nbd;
+
+ r = ioctl(nbd, BLKROSET, (unsigned long) &read_only);
+ if (r < 0) {
+ r = -errno;
+ goto close_nbd;
+ }
+
+ {
+ uint64_t handle;
+
+ NBDWatchCtx watch_ctx(nbd, nbd_index, use_netlink, io_ctx, image,
+ info.size);
+ r = image.update_watch(&watch_ctx, &handle);
+ if (r < 0)
+ goto close_nbd;
+
+ cout << cfg->devpath << std::endl;
+
+ run_server(forker, server, use_netlink);
+
+ r = image.update_unwatch(handle);
+ ceph_assert(r == 0);
+ }
+
+close_nbd:
+ if (r < 0) {
+ if (use_netlink) {
+ netlink_disconnect(nbd_index);
+ } else {
+ ioctl(nbd, NBD_CLEAR_SOCK);
+ cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r)
+ << std::endl;
+ }
+ }
+ close(nbd);
+free_server:
+ delete server;
+close_fd:
+ close(fd[0]);
+ close(fd[1]);
+close_ret:
+ image.close();
+ io_ctx.close();
+ rados.shutdown();
+
+ forker.exit(r < 0 ? EXIT_FAILURE : 0);
+ // Unreachable;
+ return r;
+}
+
+static int do_unmap(Config *cfg)
+{
+ int r, nbd;
+
+ /*
+ * The netlink disconnect call supports devices setup with netlink or ioctl,
+ * so we always try that first.
+ */
+ r = netlink_disconnect_by_path(cfg->devpath);
+ if (r != 1)
+ return r;
+
+ nbd = open(cfg->devpath.c_str(), O_RDWR);
+ if (nbd < 0) {
+ cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
+ return nbd;
+ }
+
+ r = ioctl(nbd, NBD_DISCONNECT);
+ if (r < 0) {
+ cerr << "rbd-nbd: the device is not used" << std::endl;
+ }
+
+ close(nbd);
+ return r;
+}
+
+static int parse_imgpath(const std::string &imgpath, Config *cfg,
+ std::ostream *err_msg) {
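+ // Accepted specs: "image", "pool/image", "pool/namespace/image", each optionally
+ // followed by "@snap" (for example the hypothetical spec "rbd/ns1/img1@snap1");
+ // pool, namespace and snapshot are optional.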
+ std::regex pattern("^(?:([^/]+)/(?:([^/@]+)/)?)?([^@]+)(?:@([^/@]+))?$");
+ std::smatch match;
+ if (!std::regex_match(imgpath, match, pattern)) {
+ std::cerr << "rbd-nbd: invalid spec '" << imgpath << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (match[1].matched) {
+ cfg->poolname = match[1];
+ }
+
+ if (match[2].matched) {
+ cfg->nsname = match[2];
+ }
+
+ cfg->imgname = match[3];
+
+ if (match[4].matched)
+ cfg->snapname = match[4];
+
+ return 0;
+}
+
+static int do_list_mapped_devices(const std::string &format, bool pretty_format)
+{
+ bool should_print = false;
+ std::unique_ptr<ceph::Formatter> f;
+ TextTable tbl;
+
+ if (format == "json") {
+ f.reset(new JSONFormatter(pretty_format));
+ } else if (format == "xml") {
+ f.reset(new XMLFormatter(pretty_format));
+ } else if (!format.empty() && format != "plain") {
+ std::cerr << "rbd-nbd: invalid output format: " << format << std::endl;
+ return -EINVAL;
+ }
+
+ if (f) {
+ f->open_array_section("devices");
+ } else {
+ tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("namespace", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
+ }
+
+ int pid;
+ Config cfg;
+ NBDListIterator it;
+ while (it.get(&pid, &cfg)) {
+ if (f) {
+ f->open_object_section("device");
+ f->dump_int("id", pid);
+ f->dump_string("pool", cfg.poolname);
+ f->dump_string("namespace", cfg.nsname);
+ f->dump_string("image", cfg.imgname);
+ f->dump_string("snap", cfg.snapname);
+ f->dump_string("device", cfg.devpath);
+ f->close_section();
+ } else {
+ should_print = true;
+ if (cfg.snapname.empty()) {
+ cfg.snapname = "-";
+ }
+ tbl << pid << cfg.poolname << cfg.nsname << cfg.imgname << cfg.snapname
+ << cfg.devpath << TextTable::endrow;
+ }
+ }
+
+ if (f) {
+ f->close_section(); // devices
+ f->flush(std::cout);
+ }
+ if (should_print) {
+ std::cout << tbl;
+ }
+ return 0;
+}
+
+static bool find_mapped_dev_by_spec(Config *cfg) {
+ int pid;
+ Config c;
+ NBDListIterator it;
+ while (it.get(&pid, &c)) {
+ if (c.poolname == cfg->poolname && c.nsname == cfg->nsname &&
+ c.imgname == cfg->imgname && c.snapname == cfg->snapname) {
+ *cfg = c;
+ return true;
+ }
+ }
+ return false;
+}
+
+
+static int parse_args(vector<const char*>& args, std::ostream *err_msg,
+ Command *command, Config *cfg) {
+ std::string conf_file_list;
+ std::string cluster;
+ CephInitParameters iparams = ceph_argparse_early_args(
+ args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
+
+ ConfigProxy config{false};
+ config->name = iparams.name;
+ config->cluster = cluster;
+
+ if (!conf_file_list.empty()) {
+ config.parse_config_files(conf_file_list.c_str(), nullptr, 0);
+ } else {
+ config.parse_config_files(nullptr, nullptr, 0);
+ }
+ config.parse_env(CEPH_ENTITY_TYPE_CLIENT);
+ config.parse_argv(args);
+ cfg->poolname = config.get_val<std::string>("rbd_default_pool");
+
+ std::vector<const char*>::iterator i;
+ std::ostringstream err;
+
+ for (i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
+ return HELP_INFO;
+ } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) {
+ return VERSION_INFO;
+ } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) {
+ } else if (ceph_argparse_witharg(args, i, &cfg->nbds_max, err, "--nbds_max", (char *)NULL)) {
+ if (!err.str().empty()) {
+ *err_msg << "rbd-nbd: " << err.str();
+ return -EINVAL;
+ }
+ if (cfg->nbds_max < 0) {
+ *err_msg << "rbd-nbd: Invalid argument for nbds_max!";
+ return -EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &cfg->max_part, err, "--max_part", (char *)NULL)) {
+ if (!err.str().empty()) {
+ *err_msg << "rbd-nbd: " << err.str();
+ return -EINVAL;
+ }
+ if ((cfg->max_part < 0) || (cfg->max_part > 255)) {
+ *err_msg << "rbd-nbd: Invalid argument for max_part(0~255)!";
+ return -EINVAL;
+ }
+ cfg->set_max_part = true;
+ } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+ cfg->readonly = true;
+ } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) {
+ cfg->exclusive = true;
+ } else if (ceph_argparse_witharg(args, i, &cfg->timeout, err, "--timeout",
+ (char *)NULL)) {
+ if (!err.str().empty()) {
+ *err_msg << "rbd-nbd: " << err.str();
+ return -EINVAL;
+ }
+ if (cfg->timeout < 0) {
+ *err_msg << "rbd-nbd: Invalid argument for timeout!";
+ return -EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &cfg->format, err, "--format",
+ (char *)NULL)) {
+ } else if (ceph_argparse_flag(args, i, "--pretty-format", (char *)NULL)) {
+ cfg->pretty_format = true;
+ } else if (ceph_argparse_flag(args, i, "--try-netlink", (char *)NULL)) {
+ cfg->try_netlink = true;
+ } else {
+ ++i;
+ }
+ }
+
+ Command cmd = None;
+ if (args.begin() != args.end()) {
+ if (strcmp(*args.begin(), "map") == 0) {
+ cmd = Connect;
+ } else if (strcmp(*args.begin(), "unmap") == 0) {
+ cmd = Disconnect;
+ } else if (strcmp(*args.begin(), "list-mapped") == 0) {
+ cmd = List;
+ } else {
+ *err_msg << "rbd-nbd: unknown command: " << *args.begin();
+ return -EINVAL;
+ }
+ args.erase(args.begin());
+ }
+
+ if (cmd == None) {
+ *err_msg << "rbd-nbd: must specify command";
+ return -EINVAL;
+ }
+
+ switch (cmd) {
+ case Connect:
+ if (args.begin() == args.end()) {
+ *err_msg << "rbd-nbd: must specify image-or-snap-spec";
+ return -EINVAL;
+ }
+ if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) {
+ return -EINVAL;
+ }
+ args.erase(args.begin());
+ break;
+ case Disconnect:
+ if (args.begin() == args.end()) {
+ *err_msg << "rbd-nbd: must specify nbd device or image-or-snap-spec";
+ return -EINVAL;
+ }
+ if (boost::starts_with(*args.begin(), "/dev/")) {
+ cfg->devpath = *args.begin();
+ } else {
+ if (parse_imgpath(*args.begin(), cfg, err_msg) < 0) {
+ return -EINVAL;
+ }
+ if (!find_mapped_dev_by_spec(cfg)) {
+ *err_msg << "rbd-nbd: " << *args.begin() << " is not mapped";
+ return -ENOENT;
+ }
+ }
+ args.erase(args.begin());
+ break;
+ default:
+ //shut up gcc;
+ break;
+ }
+
+ if (args.begin() != args.end()) {
+ *err_msg << "rbd-nbd: unknown args: " << *args.begin();
+ return -EINVAL;
+ }
+
+ *command = cmd;
+ return 0;
+}
+
+static int rbd_nbd(int argc, const char *argv[])
+{
+ int r;
+ Config cfg;
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ std::ostringstream err_msg;
+ r = parse_args(args, &err_msg, &cmd, &cfg);
+ if (r == HELP_INFO) {
+ usage();
+ return 0;
+ } else if (r == VERSION_INFO) {
+ std::cout << pretty_version_to_str() << std::endl;
+ return 0;
+ } else if (r < 0) {
+ cerr << err_msg.str() << std::endl;
+ return r;
+ }
+
+ switch (cmd) {
+ case Connect:
+ if (cfg.imgname.empty()) {
+ cerr << "rbd-nbd: image name was not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ r = do_map(argc, argv, &cfg);
+ if (r < 0)
+ return -EINVAL;
+ break;
+ case Disconnect:
+ r = do_unmap(&cfg);
+ if (r < 0)
+ return -EINVAL;
+ break;
+ case List:
+ r = do_list_mapped_devices(cfg.format, cfg.pretty_format);
+ if (r < 0)
+ return -EINVAL;
+ break;
+ default:
+ usage();
+ break;
+ }
+
+ return 0;
+}
+
+int main(int argc, const char *argv[])
+{
+ int r = rbd_nbd(argc, argv);
+ if (r < 0) {
+ return EXIT_FAILURE;
+ }
+ return 0;
+}
diff --git a/src/tools/rbd_recover_tool/FAQ b/src/tools/rbd_recover_tool/FAQ
new file mode 100644
index 00000000..1655e853
--- /dev/null
+++ b/src/tools/rbd_recover_tool/FAQ
@@ -0,0 +1,16 @@
+# author: min chen(minchen@ubuntukylin.com) 2014 2015
+
+1. error "get_image_metadata_v2: no meta_header_seq input"
+cause:
+ the offline database is stale and needs to be refreshed
+solution:
+ ./rbd-recover-tool database
+
+2. Error initializing leveldb: IO error: lock /var/lib/ceph/osd/ceph-0/current/omap/LOCK: Resource temporarily unavailable
+ ERROR: error flushing journal /var/lib/ceph/osd/ceph-0/journal for object store /var/lib/ceph/osd/ceph-0: (1) Operation not permitted
+cause:
+ when ./rbd-recover-tool database is interrupted, the commands have already been sent to each osd node, and a process is still reading leveldb while holding its LOCK.
+ if ./rbd-recover-tool database is run again, all commands are sent to the osd nodes again, but the previous process still holds the leveldb lock, so all of the new commands
+ fail.
+solution:
+ wait until all previous commands have finished.
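+ (one optional sanity check before re-running, assuming the leveldb reader was started
+ by osd_job via ceph-kvstore-tool: on each osd node run `ps aux | grep ceph-kvstore-tool`
+ and make sure no such process is still running.)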
diff --git a/src/tools/rbd_recover_tool/README b/src/tools/rbd_recover_tool/README
new file mode 100644
index 00000000..d289c11c
--- /dev/null
+++ b/src/tools/rbd_recover_tool/README
@@ -0,0 +1,97 @@
+# author: Min chen(minchen@ubuntukylin.com) 2014 2015
+
+------------- ceph rbd recover tool -------------
+
+ ceph rbd recover tool is used to recover ceph rbd images when all ceph services are stopped.
+it is based on ceph-0.80.x (Firefly and newer).
+ occasionally the ceph services (ceph-mon, ceph-osd) become unavailable because of bugs or other
+problems, especially on a large ceph cluster, so the cluster can not provide service and rbd
+images can not be accessed. In this case, a tool to recover rbd images is necessary.
+ ceph rbd recover tool does exactly that: it collects all objects of an image from the distributed
+osd nodes, choosing the copy with the latest pg epoch, and splices the objects by offset into a
+complete image. To make sure object data is complete, the tool flushes the osd journal on each
+osd node before recovering.
+ however, there are some limitations:
+-needs ssh access and an unobstructed network
+-osd data must be accessible on a local disk
+-cloned images are not supported; snapshots are supported
+-only replicated pools are supported
+
+before you run this tool, make sure that:
+1). all ceph processes (ceph-osd, ceph-mon, ceph-mds) are shut down
+2). the ssh daemon is running and the network is ok (ssh to each node without a password)
+3). ceph-kvstore-tool is installed (on ubuntu: apt-get install ceph-test)
+4). the osd disk is not damaged and its data can be accessed on the local filesystem
+
+-architecture:
+
+ +---- osd.0
+ |
+admin_node -----------+---- osd.1
+ |
+ +---- osd.2
+ |
+ ......
+
+-files:
+admin_node: {rbd-recover-tool common_h epoch_h metadata_h database_h}
+osd: {osd_job common_h epoch_h metadata_h} #/var/rbd_tool/osd_job
+in this architecture, the admin_node acts as the client and the osds act as servers,
+so they run different files:
+on the admin_node run: rbd-recover-tool <action> [<parameters>]
+on each osd node run: ./osd_job <function> <parameters>
+the admin_node copies the files osd_job, common_h, epoch_h and metadata_h to each remote osd node
+
+
+-config file
+before you run this tool, write the config files first
+osd_host_path: osd hostnames and osd data path #user input
+ osdhost0 /var/lib/ceph/osd/ceph-0
+ osdhost1 /var/lib/ceph/osd/ceph-1
+ ......
+mon_host: all mon node hostname #user input
+ monhost0
+ monhost1
+ ......
+mds_host: all mds node hostname #user input
+ mdshost0
+ mdshost1
+ ......
+then the init_env_admin function will create the file osd_host
+osd_host: all osd node hostnames #generated by the admin job, users can ignore it
+ osdhost0
+ osdhost1
+ ......
+
+
+-usage:
+rbd-recover-tool <operation>
+<operation> :
+database #generate the offline database: hobject paths, node hostnames, pg_epoch and image metadata
+list #list all images found in the offline database
+lookup <pool_id>/<image_name>[@[<snap_name>]] #look up image metadata in the offline database
+recover <pool_id>/<image_name>[@[<snap_name>]] [/path/to/store/image] #recover image data according to image metadata
+
+-steps:
+1. stop all ceph services: ceph-mon, ceph-osd, ceph-mds
+2. setup config files: osd_host_path, mon_host, mds_host
+3. rbd-recover-tool database # this can take a long time
+4. rbd-recover-tool list
+5. rbd-recover-tool recover <pool_id>/<image_name>[@[<snap_name>]] [/path/to/store/image]
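+
+example session (the pool id 2, image name foo and output path below are made-up values
+for illustration only; substitute your own):
+ rbd-recover-tool database
+ rbd-recover-tool list # prints entries such as 2/foo
+ rbd-recover-tool recover 2/foo /tmp/recovered # stores the recovered image under /tmp/recovered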
+
+
+-debug & error check
+if an admin_node operation fails, you can check it on the osd node
+cd /var/rbd_tool/osd_job
+./osd_job <operation>
+<operation> :
+do_image_id <image_id_hobject> #get image id of image format v2
+do_image_id <image_header_hobject> #get image id of image format v1
+do_image_metadata_v1 <image_header_hobject> #get image metadata of image format v1; the pg epoch may not be the latest
+do_image_metadata_v2 <image_header_hobject> #get image metadata of image format v2; the pg epoch may not be the latest
+do_image_list #get all images on this osd (image head hobjects)
+do_pg_epoch #get all pg epochs and store them in /var/rbd_tool/single_node/node_pg_epoch
+do_omap_list #list all omap headers and omap entries on this osd
+
+
+-FAQ
+the FAQ file lists some common confusing cases encountered while testing
diff --git a/src/tools/rbd_recover_tool/TODO b/src/tools/rbd_recover_tool/TODO
new file mode 100644
index 00000000..c36d4c94
--- /dev/null
+++ b/src/tools/rbd_recover_tool/TODO
@@ -0,0 +1,2 @@
+
+1. support clone image
diff --git a/src/tools/rbd_recover_tool/common_h b/src/tools/rbd_recover_tool/common_h
new file mode 100644
index 00000000..f2df662a
--- /dev/null
+++ b/src/tools/rbd_recover_tool/common_h
@@ -0,0 +1,412 @@
+#!/usr/bin/env bash
+# file: common_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+# admin node init path
+rbd_image=/var/rbd_tool/rbd_image
+database=$rbd_image/database
+image_coll_v1=$rbd_image/image_coll_v1
+image_coll_v2=$rbd_image/image_coll_v2
+pg_coll=$rbd_image/pg_coll
+images=$rbd_image/images
+images_meta=$rbd_image/images_meta
+default_backup_dir=/var/rbd_tool/default_backup_dir
+
+# admin node: image snap & nosnap
+nosnap= #$rbd_image/<image_name>/nosnap
+snap= #rbd_image/<image_name>/<snap_name>
+
+# osd node init path
+job_path=/var/rbd_tool/osd_job
+single_node=/var/rbd_tool/single_node
+
+# osd node vars
+osd_env= #single_node/$cluster$id/osd_env
+osd_data= #/var/lib/ceph/osd/$cluster-$id
+omap_path= #$osd_data/current/omap
+image_list_v1= #single_node/$cluster-$id/image_list_v1
+image_list_v2= #single_node/$cluster-$id/image_list_v2
+image_v1= #$single_node/$cluster-$id/image_v1
+image_v2= #$single_node/$cluster-$id/image_v2
+pgid_list= #$single_node/$cluster-$id/pgid_list
+node_pg_epoch= #$single_node/$cluster-$id/node_pg_epoch
+omap_list= #$single_node/$cluster-$id/omap_list
+
+# admin node config file
+osd_host_path=$my_dir/config/osd_host_path
+osd_host_mapping= #$pwd_path/config/osd_host_mapping # host --> host_remote: by init_env_admin()
+osd_host=$my_dir/config/osd_host #generated by function init_env_admin()
+mon_host=$my_dir/config/mon_host
+mds_host=$my_dir/config/mds_host
+
+# ssh option
+ssh_option="-o ConnectTimeout=1"
+
+# gen md5sum
+function gen_md5()
+{
+ echo $1|md5sum|awk '{print $1}'
+}
+
+# on each osd node
+# check ceph environment: ssh, ceph-kvstore-tool, osd_data_path
+function check_ceph_env()
+{
+ local func="check_ceph_env"
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <node> <data_path>"
+ exit
+ fi
+ local node=$1
+ local data_path=$2
+ local res=
+ local cmd=
+
+ trap 'echo [$node]: ssh failed; exit' INT HUP
+ ssh -o ConnectTimeout=1 $node "echo -n" </dev/null
+ res=$?
+ if [ $res -ne 0 ];then
+ echo "[$node]: ssh failed"
+ exit
+ fi
+
+ cmd=ceph-kvstore-tool
+ trap 'echo [$node]: $cmd failed; exit' INT HUP
+ ssh -o ConnectTimeout=1 $node "$cmd &>/dev/null;" </dev/null
+ res=$?
+ # ceph-kvstore-tool will return 1 with no parameters input
+ if [ $res -ne 1 ];then
+ echo "[$node]: $cmd not installed"
+ exit
+ fi
+
+ trap 'echo [$node]: stat $data_path failed; exit' INT HUP
+ ssh -o ConnectTimeout=1 $node "stat $data_path &>/dev/null;" </dev/null
+ res=$?
+ if [ $res -ne 0 ];then
+ echo "[$node]: $data_path not exists"
+ exit
+ fi
+}
+
+# osd node context : osd_data_path
+function init_env_osd()
+{
+ local func="init_env_osd"
+ if [ "$1"x = ""x ];then
+ echo "$func: no osd_data_path input"
+ exit
+ fi
+ osd_data=$1
+ omap_path=$osd_data/current/omap
+
+ if [ ! -e $single_node ];then
+ mkdir -p $single_node
+ fi
+
+ local osd_id=`gen_md5 $osd_data`
+ local osd_dir=$single_node/$osd_id
+
+ if [ ! -e $osd_dir ];then
+ mkdir -p $osd_dir
+ fi
+
+ image_list_v1=$osd_dir/image_list_v1
+ image_list_v2=$osd_dir/image_list_v2
+ image_v1=$osd_dir/image_v1
+ image_v2=$osd_dir/image_v2
+ pgid_list=$osd_dir/pgid_list
+ node_pg_epoch=$osd_dir/node_pg_epoch
+ omap_list=$osd_dir/omap_list
+}
+
+# admin node process file: osd_host_path
+function init_env_admin()
+{
+ local func="init_env_admin"
+ local pwd_path=`pwd`
+ osd_host_mapping=$pwd_path/config/osd_host_mapping
+ if [ ! -s $osd_host_path ];then
+ echo "$func: config/osd_host_path not exists or empty"
+ exit
+ fi
+ if [ ! -e $rbd_image ];then
+ mkdir -p $rbd_image
+ fi
+ if [ ! -e $images ];then
+ mkdir -p $images
+ fi
+
+ if [ ! -s $mon_host ];then
+ echo "$func: config/mon_host not exists or empty"
+ exit
+ fi
+ if [ ! -e $mds_host ];then
+ echo "$func: config/mds_host not exists"
+ exit
+ fi
+
+ # just check whether osd_host needs to be updated
+ if [ -s $osd_host ] && [ $osd_host -nt $osd_host_path ];then
+ return
+ fi
+ echo "$func: create osd_host ..."
+ # create file: osd_host and osd_host_mapping
+ >$osd_host
+ >$osd_host_mapping
+ local lines=0
+ local lineno=0
+ while read line
+ do
+ lineno=$(($lineno + 1))
+ if [ "$line"x = ""x ];then
+ continue;
+ fi
+ local node=`echo $line|awk '{print $1}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: osd_host_path : line $lineno: osd hostname not input"
+ rm -rf $osd_host $osd_host_mapping
+ exit
+ fi
+ local data_path=`echo $line|awk '{print $2}'`
+ if [ "$data_path"x = ""x ];then
+ echo "$func: osd_host_path : line $lineno: osd data_path not input"
+ rm -rf $osd_host $osd_host_mapping
+ exit
+ fi
+ lines=$(($lines + 1))
+ # in case there are several hostnames for the same node,
+ # we just need the output of `hostname`
+ local hostname_alias=
+ hostname_alias=`ssh $ssh_option $node "hostname" 2>/dev/null </dev/null`
+ if [ "$hostname_alias"x = ""x ];then
+ echo "$func: osd_host_path: line $lineno: $node: get remote hostname alias failed"
+ rm -rf $osd_host $osd_host_mapping
+ exit
+ fi
+ echo "$node $hostname_alias" >>$osd_host_mapping
+ echo $node >> $osd_host
+ # check ceph env on remote osd
+ check_ceph_env $node $data_path
+ done < $osd_host_path
+
+ if [ $lines = 0 ];then
+ echo "$func: no osd host path valid"
+ exit
+ fi
+}
+
+function admin_parse_osd()
+{
+ local func="admin_parse_osd"
+ if [ -s $osd_host ];then
+ return
+ fi
+ # create file: osd_host
+ >$osd_host
+ local lines=0
+ local lineno=0
+ while read line
+ do
+ lineno=$(($lineno + 1))
+ if [ "$line"x = ""x ];then
+ continue;
+ fi
+ local node=`echo $line|awk '{print $1}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: osd_host_path : line $lineno: osd_host not input"
+ exit
+ fi
+ local data_path=`echo $line|awk '{print $2}'`
+ if [ "$data_path"x = ""x ];then
+ echo "$func: osd_host_path : line $lineno: osd_data not input"
+ exit
+ fi
+ lines=$(($lines + 1))
+ echo $node >> $osd_host
+ done < $osd_host_path
+}
+
+# for osd node
+function get_omap_list()
+{
+ ceph-kvstore-tool $omap_path list > $omap_list
+}
+
+function convert_underline()
+{
+ if [ "$1"x = ""x ];then
+ return
+ fi
+
+ echo $1|sed -e 's/_/\\u/gp'|head -n 1
+}
+
+function dump_backslash()
+{
+ echo $*|sed -e 's/\\/\\\\/gp'|head -n 1
+}
+
+function dump_dump_backslash()
+{
+ echo $*|sed -e 's/\\/\\\\\\\\/gp'|head -n 1
+}
+
+function char_convert()
+{
+ if [ "$1"x = ""x ];then
+ return
+ fi
+
+ echo $1|sed -e 's/_/\\u/gp' -e 's/\./%e/gp' -e 's/%/%p/gp'|head -n 1
+}
+
+function check_osd_process()
+{
+ local func="check_osd_process"
+ local host=$1
+ if [ "$1"x = ""x ];then
+ exit
+ fi
+ local cmds="ps aux|grep ceph-osd|grep -v grep"
+ local ret=/tmp/ret.$$$$
+ ssh $ssh_option $host $cmds |tee $ret
+ if [ -s $ret ];then
+ echo "$func: [$host] ceph-osd process is not killed"
+ exit
+ fi
+ rm -f $ret
+}
+
+function get_map_header_prefix()
+{
+ echo "_HOBJTOSEQ_"
+}
+
+function get_map_header_key()
+{
+ local func="get_map_header_key"
+ if [ "$1"x = ""x ];then
+ #echo $func': no keyword input'
+ exit
+ fi
+ local keyword=$1
+ local res=`cat $omap_list| grep $keyword`
+ if [ "$res"x = ""x ];then
+ #echo "$func: map_header_key = $keyword not exists"
+ exit
+ fi
+ echo $res|awk -F ":" '{print $2}'
+}
+
+function get_header_seq()
+{
+ local func="get_header_seq"
+ if [ "$1"x == ""x ];then
+ #echo "$func: no prefix input"
+ exit;
+ elif [ "$2"x == ""x ];then
+ #echo "$func: no key input"
+ exit;
+ fi
+ local prefix=$1;
+ local key=$2;
+ local res=/tmp/header_seq.$$$$
+
+ ceph-kvstore-tool $omap_path get $prefix $key 2>/dev/null 1>$res
+ if [ $? != 0 ]; then
+ #echo "$func: <$prefix , $key> not exists" ;
+ exit;
+ fi
+
+ # ceph-kvstore-tool get result like this:
+ # 02 01 7e 00 00 00 12 44 00 00 00 00 00 00 00 00
+ # get header seq bytes:
+ # 12 44 00 00 00 00 00 00
+ # -> 00 00 00 00 00 00 44 12
+ # echo $((16#0000000000004412)) -> 17426 == header_seq
+ local seq=`cat $res |head -n 2|tail -n 1| \
+ awk '
+ BEGIN {
+ FS=":"
+ seq="";
+ i=7;
+ } {
+ split($2, arr, " ")
+ # header_seq uint64 : 8 bytes
+ for (x=7; x>=0; --x) {
+ seq=seq""arr[i+x];
+ }
+ }
+ END {
+ print seq
+ }'`
+ if [ "$seq"x = ""x ];then
+ #echo "$func: get <$prefix , $key> failed"
+ exit;
+ fi
+ rm -f $res
+ echo $((16#$seq))
+}
+
+# get header info key/value
+function get_header_kv()
+{
+ local func="get_header_kv"
+ if [ "$1"x = ""x ];then
+ #echo "$func: no prefix input"
+ exit
+ elif [ "$2"x = ""x ];then
+ #echo "$func: no key input"
+ exit
+ elif [ "$3"x != "string"x ] && [ "$3"x != "int"x ];then
+ #echo "$func: no valid type input, use type (string|int)"
+ exit
+ fi
+
+ local prefix=$1
+ local key=$2
+ local types=$3
+ local res=/tmp/kv.$$$$
+
+ ceph-kvstore-tool $omap_path get $prefix $key 2>/dev/null 1>$res
+ if [ $? != 0 ];then
+ #echo "$func: <$prefix , $key> not exists"
+ exit
+ fi
+
+ if [ "$types"x = "string"x ];then
+ local value=`cat $res |tail -n +2|head -n -1|awk -F ": " '{printf $3}'|sed -n 's/^\.\{4\}//p'`
+ echo $value
+ elif [ "$types"x = "int"x ];then
+ local value=`cat $res |tail -n +2|head -n -1| \
+ awk '
+ BEGIN{
+ FS=":"
+ } {
+ split($2, arr, " ");
+ len=length(arr)
+ for (i=len; i>0; --i) {
+ printf arr[i];
+ }
+ }'`
+ echo $((16#$value))
+ fi
+ rm -f $res
+}
diff --git a/src/tools/rbd_recover_tool/config/mds_host b/src/tools/rbd_recover_tool/config/mds_host
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/tools/rbd_recover_tool/config/mds_host
diff --git a/src/tools/rbd_recover_tool/config/mon_host b/src/tools/rbd_recover_tool/config/mon_host
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/tools/rbd_recover_tool/config/mon_host
diff --git a/src/tools/rbd_recover_tool/config/osd_host_path b/src/tools/rbd_recover_tool/config/osd_host_path
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/tools/rbd_recover_tool/config/osd_host_path
diff --git a/src/tools/rbd_recover_tool/database_h b/src/tools/rbd_recover_tool/database_h
new file mode 100644
index 00000000..4ff20425
--- /dev/null
+++ b/src/tools/rbd_recover_tool/database_h
@@ -0,0 +1,1134 @@
+#!/usr/bin/env bash
+# file: database_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+
+db_image_prefix=
+db_image_size=
+db_order=
+db_snap_id=
+db_snap_image_size=
+found=0
+
+#init osd_data and collect the paths of all objects
+function gen_database()
+{
+ local func="gen_database"
+ rm -rf $database/*
+ rm -rf $images
+ rm -rf $raw
+ mkdir -p $database
+ local host=
+ local data_path=
+
+ trap 'echo $func failed; exit;' INT HUP
+ while read line
+ do
+ {
+ host=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ if [ "$host"x = ""x ] || [ "$data_path"x = ""x ];then
+ continue
+ fi
+ local cmds="find $data_path/current -type f"
+ ssh $ssh_option $host $cmds > $database/$host
+ } &
+ done < $osd_host_path
+ wait
+ echo "$func: finish"
+}
+
+# collect hobjects from database
+# and choose the object whose epoch is latest
+# then, sort the objects by their offsets in image
+function gather_hobject_common()
+{
+ func="gather_hobject_common"
+
+ trap 'echo $func failed; exit;' INT HUP
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <pool_id> <image_prefix> [<snap_id>]"
+ exit
+ fi
+
+ local pool_id=$1
+ local image_prefix=$2
+ pool_id=$(($pool_id))
+ local hex_pool_id=`printf "%x" $pool_id`
+ # NOSNAP = uint64(-2)
+ local snap_id=`printf "%u" -2`
+ local hex_snap_id="head"
+ local psuffix=
+ local fsuffix="_head"
+ if [ $# = 3 ];then
+ snap_id=$(($3))
+ hex_snap_id=`printf "%x" $snap_id`
+ psuffix="_"$snap_id
+ fsuffix="_"$snap_id
+ fi
+ local underline_image_prefix=`convert_underline $image_prefix`
+ local dump_image_prefix=`dump_backslash $underline_image_prefix`
+ local ddump_image_prefix=`dump_dump_backslash $underline_image_prefix`
+ local images_raw_dir=$rbd_image/raw
+ local image_hobjects_dir=$images/pool_$pool_id/$image_prefix
+ # $images/raw/$image_prefix"_head"
+ local image_hobjects_raw=$images_raw_dir/$image_prefix"$fsuffix"
+ # $images/$image_prefix/$image_prefix"_head"
+ local image_hobjects_stable=$image_hobjects_dir/$image_prefix"$fsuffix"
+
+ if [ ! -e $images_raw_dir ];then
+ mkdir -p $images_raw_dir
+ fi
+ if [ ! -e $image_hobjects_dir ];then
+ local image_metadata=$images_meta/$image_name_in
+ mkdir -p $image_hobjects_dir
+ fi
+
+ pushd $database >/dev/null
+ local pattern="\.[0-9a-f]+__"$hex_snap_id"_[0-9A-F]{8}__"$hex_pool_id
+ >$image_hobjects_raw
+ grep -r -E $dump_image_prefix""$pattern * >$image_hobjects_raw
+ if [ ! -s $image_hobjects_raw ];then
+ echo "$func: image snap [ $image_prefix"$psuffix" ] is empty"
+ return 1 #no data available
+ fi
+ popd >/dev/null
+
+ local offset_dir_temp=$images_raw_dir/$image_prefix"$fsuffix""_dir_temp"
+ rm -rf $offset_dir_temp
+ mkdir -p $offset_dir_temp
+
+ echo "gather hobjects from database: snapid=$snap_id ..."
+
+ # format: ceph2:/var/lib/ceph/osd/ceph-1/current/2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2
+ local tmp_image=$offset_dir_temp/tmpimage.$$$$
+ >$tmp_image
+ cat $image_hobjects_raw |
+ awk -F ':' '
+ BEGIN {
+ pg_coll="'$pg_coll'"
+ tmp_image="'$tmp_image'"
+ osd_host_mapping="'$osd_host_mapping'"
+ snapid="'$snap_id'"
+ }{
+ # $2 = /var/lib/ceph/osd/ceph-1/current/2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2
+
+ split($2, arr1, "/current/"); # {/var/lib/ceph/osd/ceph-1/, 2.d3_head/rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2}
+ split(arr1[2], arr2, "/"); # {2.d3_head, rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2}
+ split(arr2[1], arr3, "_head"); # {2.d3,}
+
+ hobject=$2;
+ data_path=arr1[1];
+ gsub(/\\u/, "\\\\\\\\u", hobject); # dump backslash to delay escape (\ -> \\)
+ "awk \"\\$1 == \\\""$1"\\\" {print \\$2}\" "osd_host_mapping" | head -n 1" | getline node
+ pgid = arr3[1];
+
+ len=length(arr2);
+ offset_hobject=arr2[len] # rb.0.1293.6b8b4567.000000000002__head_FB425CD3__2
+ split(offset_hobject, offarr1, "."); # {rb, 0, 1293, 6b8b4567, 000000000002__head_FB425CD3__2}
+ len1=length(offarr1)
+ offset_p=offarr1[len1] # 000000000002__head_FB425CD3__2
+ split(offset_p, offarr2, "__"); # {000000000002, head_FB425CD3, 2}
+ offset=offarr2[1]; # 000000000002
+
+ system("echo -n \""node" "pgid" "hobject" "offset" "snapid" \" >>"tmp_image);
+ #system("echo -n \""node" "pgid" "hobject" "offset" "snapid" \"");
+ #print node" "pgid" "hobject" "offset" "snapid
+
+ # find pg_epoch from pg_coll database
+ system("awk \"\\$1 == \\\""node"\\\" && \\$2 == \\\""pgid"\\\" && \\$4 == \\\""data_path"\\\" {print \\$3}\" "pg_coll" >>"tmp_image);
+ #system("awk \"\\$1 == \\\""node"\\\" && \\$2 == \\\""pgid"\\\" && \\$4 == \\\""data_path"\\\" {print \\$3}\" "pg_coll);
+ }'
+
+ local sort_image=$offset_dir_temp/sortimage.$$$$
+ >$sort_image
+ sort -t ' ' -k 4.1,4 -k 6.1nr -k 1.1,1 $tmp_image >$sort_image
+ sort -t ' ' -k 4.1,4 -u $sort_image > $image_hobjects_stable
+
+ #rm -rf $offset_dir_temp
+ return 0
+}
+
+function gather_hobject_nosnap()
+{
+ gather_hobject_common $1 $2
+}
+
+function gather_hobject_snap()
+{
+ gather_hobject_common $1 $2 $3
+}
+
+# select the max pg_epoch item of the same $field
+# if no same $field, choose the first
+# format : "node $field pg_epoch"
+function choose_epoch()
+{
+ cat $1|sort -t ' ' -k 3.1,3nr -k 2.1,2n |head -n 1;
+}
+
+# lookup image info , after scatter_node_jobs & gather_node_infos
+function lookup_image()
+{
+ local func="lookup_image"
+ if [ $# -lt 2 ];then
+ echo "$func: parameters error <pool_id> <image_name> [<snap_name>]"
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_name=$3
+ pool_id=$((pool_id))
+ echo -e "$func: pool_id = $pool_id\timage_name = $image_name\tsnap_name = $snap_name"
+ if [ $pool_id -lt 0 ];then
+ echo "$func: pool_id must great than zero"
+ exit
+ fi
+ local hex_pool_id=`printf "%x" $pool_id`
+ input_image $image_name
+ local node=
+ local item=/tmp/item.$$$$
+ local img_name=`dump_backslash $image_name`
+
+ local image_format=0
+ local image_id_hobject=
+ local image_header_hobject=
+ local result=/tmp/tmp_result.$$$$
+ local res1=/tmp/tmp_res1.$$$$
+ local res2=/tmp/tmp_res2.$$$$
+ local data_path=
+
+ # image format v1
+ {
+ cat $image_coll_v1|grep -E "/$img_name\.rbd__head_[0-9A-F]{8}__$hex_pool_id" >$res1
+ if [ -s $res1 ];then
+ echo -n "$func: rbd_header_hobject = "
+ choose_epoch $res1| tee $item
+ #choose_epoch $res1 > $item
+
+ if [ -e $item ];then
+ node=`cat $item|awk '{print $1}'`
+ image_header_hobject=`cat $item|awk '{print $2}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: v1 node is NULL"
+ exit
+ fi
+ if [ "$image_header_hobject"x = ""x ];then
+ echo "$func: v1 image_header_hobject is NULL"
+ exit
+ fi
+ rm -f $item
+ fi
+
+ image_format=1
+ echo -e "image_name:\t$image_name_in"
+ echo -e "image_format:\t$image_format"
+ data_path=`echo $image_header_hobject|awk -F "/current" '{print $1}'`
+
+ >$result
+ cmds="bash $job_path/osd_job do_image_metadata_v1 $data_path `dump_backslash $image_header_hobject` $snap_name"
+ ssh $ssh_option $node $cmds | tee $result
+ fi
+ }
+
+ # image format v2
+ {
+ cat $image_coll_v2|grep -E "/rbd\\\\uid\."$img_name"__head_[0-9A-F]{8}__$hex_pool_id" >$res2
+ if [ -s $res2 ];then
+ echo -n "$func: rbd_id_hobject = "
+ choose_epoch $res2 | tee $item
+ #choose_epoch $res2 > $item
+
+ if [ -e $item ];then
+ node=`cat $item|awk '{print $1}'`
+ image_id_hobject=`cat $item|awk '{print $2}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: v2 node is NULL(to get image_id_hobject)"
+ exit
+ fi
+ if [ "$image_id_hobject"x = ""x ];then
+ echo "$func: v2 image_id_hobject is NULL"
+ exit
+ fi
+ rm -f $item
+ fi
+
+ check_osd_process $node
+ image_format=2
+
+ local tid=/tmp/image_id.$$$$
+ data_path=`echo $image_id_hobject|awk -F "/current" '{print $1}'`
+ >$tid
+ cmds="bash $job_path/osd_job do_image_id $data_path `dump_backslash $image_id_hobject`"
+ ssh $ssh_option $node $cmds > $tid
+
+ local image_id=`cat $tid`
+ rm -f $tid
+
+ #get image_header_hobject
+ pushd $database >/dev/null
+ local pattern="header\."$image_id"__head_[0-9A-F]{8}__$hex_pool_id"
+ local tcoll=/tmp/tmp_image_head_coll.$$$$
+
+ # hostname(by command hostname) in $pg_coll maybe different from hostname in tcoll(input by user)
+ # t_host: hostname read from config file ($tcoll)
+ # t_host_remote: $(hostname) on osd node ($pg_coll)
+ grep -r -E $pattern * >$tcoll
+ popd >/dev/null
+
+ local t_host=(`cat $tcoll|awk -F ":" '{print $1}'`)
+ local t_pgid=(`cat $tcoll|awk -F ":" '{print $2}'|sed -n 's/.*\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head\/.*/\1/p'`)
+ local t_hobject=(`cat $tcoll|awk -F ":" '{print $2}'`)
+ local t_data_path=(`cat $tcoll|awk -F ":" '{split($2, arr, "/current/"); print arr[1];}'`)
+ rm -f $tcoll
+ declare -a t_host_remote
+
+ #if there is no failed pg migration, number of t_host is replica num
+ #replica num : 3, 4, 5 ...
+ local t_hostname=/tmp/t_hostname.$$$$
+ for ((i=0; i<${#t_host[*]}; i++))
+ do
+ ssh $ssh_option ${t_host[$i]} "hostname" >$t_hostname
+ if [ $? != 0 ];then
+ echo "$func: ${t_host[$i]} get host_remote failed"
+ exit
+ fi
+ t_host_remote[$i]=`cat $t_hostname`
+ done
+ rm -f $t_hostname
+
+ local t_item=/tmp/tmp_item.$$$$
+ local tmp_item=/tmp/tmp_tmp_item.$$$$
+
+ >$tmp_item
+ for ((i=0; i<${#t_host_remote[*]}; i++ ))
+ do
+ local node=${t_host_remote[$i]}
+ local pgid=${t_pgid[$i]}
+ awk '$1 == "'"$node"'" && $2 == "'"$pgid"'" {print}' $pg_coll >>$tmp_item
+ done
+
+ # t_item: <remote_hostname> <pgid> <epoch> <data_path>
+ sort -u $tmp_item >$t_item
+ rm -f $tmp_item
+
+ local entry=`choose_epoch $t_item` #t_host_remote
+ rm -f $t_item
+
+ node=`echo $entry|awk '{print $1}'`
+ data_path=`echo $entry|awk '{print $4}'`
+ if [ "$node"x = ""x ];then
+ echo "$func: v2 node is NULL (to get image_header_hobject)"
+ exit
+ fi
+
+ for ((i=0; i<${#t_host_remote[*]}; i++))
+ do
+ if [ "${t_host_remote[$i]}"x = "$node"x ] && [ "${t_data_path[$i]}"x = "$data_path"x ];then
+ image_header_hobject=${t_hobject[$i]}
+ break
+ fi
+ done
+
+ if [ "$image_id_hobject"x = ""x ];then
+ echo "$func: v2 image_header_hobject is NULL"
+ exit
+ fi
+
+ check_osd_process $node
+
+ echo "$func: rbd_header_hobject = $node $image_header_hobject"
+ echo -e "image_name:\t$image_name_in"
+ echo -e "image_format:\t$image_format"
+
+ #data_path=`echo $image_header_hobject|awk -F "/current" '{print $1}'`
+ >$result
+ cmds="bash $job_path/osd_job do_image_metadata_v2 $data_path $image_id `dump_backslash $image_header_hobject` $snap_name"
+ ssh $ssh_option $node $cmds | tee $result
+ fi
+ }
+
+ if [ ! -s $result ];then
+ echo "$func: $image_name_in not exists"
+ exit
+ fi
+
+ # to assign value to global variable
+ db_image_prefix=`cat $result|awk '/^(object_prefix|block_name):/{print $2}'`
+ if [ "$db_image_prefix"x = ""x ];then
+ echo "$func: image_prefix is NULL"
+ exit
+ fi
+
+ db_image_size=`cat $result|awk '/^image_size:/{print $2}'`
+ db_order=`cat $result|awk '/^order:/{print $2}'`
+ if [ "$snap_name"x != ""x ];then
+ db_snap_id=`cat $result|awk '/^snapshot:/{print $2}'`
+ if [ "$db_snap_id"x = ""x ];then
+ echo "$func: $image_name_in@$snap_name NOT EXISTS"
+ exit
+ fi
+ db_snap_image_size=`cat $result|awk '/^snapshot:/{print $4}'`
+ else
+ #save snaplist
+ local image_snaplist=$images/pool_$pool_id/$image_name_in/@snaplist
+ local image_dir=$images/pool_$pool_id/$image_name_in
+ if [ ! -e $image_dir ];then
+ mkdir -p $image_dir
+ fi
+ cat $result|awk '/^snapshot:/{print $2" "$3" "$4}' >$image_snaplist
+ fi
+ found=1
+ rm -f $result
+}
+
+function list_images()
+{
+ echo "=============== format =============="
+ echo "format: <pool_id>/<image_name>"
+ echo "================ v1: ================"
+ #sed -n 's/\(.*\)\/\(.*\)\.rbd__\(.*\)/\2/p' $image_coll_v1|sort -u|sed -e 's/\\u/_/g'
+ sed -n 's/.*\/\(.*\)\.rbd__head_[0-9A-F]\{8\}__\([0-9a-f]\+\).*/\2 \1/p' $image_coll_v1|sort -u|awk '{print strtonum("0x"$1)"/"$2;}'|sed -e 's/\\u/_/g'
+ echo "================ v2: ================"
+ #sed -n 's/\(.*\)\/rbd\\uid.\(.*\)__\(head.*\)/\2/p' $image_coll_v2|sort -u|sed 's/\\u/_/g'
+ sed -n 's/.*\/rbd\\uid.\(.*\)__head_[0-9A-F]\{8\}__\([0-9a-f]\+\).*/\2 \1/p' $image_coll_v2|sort -u|awk '{print strtonum("0x"$1)"/"$2}'|sed 's/\\u/_/g'
+}
+
+# lookup image metadata
+# and
+# collect hobjects of image with the latest pg epoch
+function discover_image_nosnap()
+{
+ local func="discover_image_nosnap"
+ echo "$func ..."
+ local pool_id=$1
+ local image_name=$2
+ pool_id=$(($pool_id))
+ lookup_image $pool_id $image_name # assigns $db_image_prefix
+ gather_hobject_nosnap $pool_id $db_image_prefix
+ if [ $? -ne 0 ];then
+ exit
+ fi
+ local image_hobjects_stable_nosnap=$images/pool_$pool_id/$db_image_prefix/$db_image_prefix"_head"
+ local image_hobjects_dir=$images/pool_$pool_id/$image_name_in
+ if [ ! -e $image_hobjects_dir ];then
+ mkdir -p $image_hobjects_dir
+ fi
+ # mv image_prefix to image_name
+ mv $image_hobjects_stable_nosnap $image_hobjects_dir/$image_name_in
+ rm -rf $images/pool_$pool_id/$db_image_prefix
+}
+
+# get the object clone for a given offset and snapid
+# if there is no exact match, choose the smallest snapid that is greater than or equal to the current snapid
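+# e.g. (illustrative, hypothetical values): with a snaplist holding ids "31 30 29"
+# (descending) and snapid=29, snap_coll_arr becomes (@31 @30 @29); scanning those
+# files in that order and keeping only the last match for the offset returns the
+# entry with the smallest snapid that is still >= 29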
+function get_object_clone()
+{
+ local func="get_object_clone"
+ if [ $# -lt 4 ];then
+ exit
+ fi
+
+ local object_offset_string=$1
+ local snapid=$2
+ local snaplist_path=$3
+ local snapset_output_dir=$4
+
+ # snaplist is sorted by snapid in descending order
+ local snap_coll_arr=(`
+ cat $snaplist_path|awk '{ if ($1 >= '"$snapid"') print "'"$snapset_output_dir"'/@"$1}'`)
+
+ local hex_snapid=`printf "%x" $snapid`
+ pushd $snapset_output_dir >/dev/null
+ # get object with the smallest snapid greater than current snapid
+ awk '$4 == "'"$object_offset_string"'" && $5 >= '$snapid' {print}' `echo ${snap_coll_arr[@]}` |tail -n 1
+ popd >/dev/null
+}
+
+# gather hobject for each snapid
+function gen_snapset_hobject()
+{
+ local func="gen_image_snapset"
+ echo "$func ..."
+ if [ $# -lt 4 ];then
+ echo "$func: parameters: <pool_id> <image_prefix> <snaplist_path> <snapset_output_dir>"
+ exit
+ fi
+ local pool_id=$1
+ local image_prefix=$2
+ local snaplist_path=$3
+ local snapset_output_dir=$4
+ pool_id=$(($pool_id))
+ OIFS=$IFS
+ IFS=$'\n'
+ local snaparr=(`cat $snaplist_path`)
+ # gather hobject for each snapshot
+ trap 'echo $func failed; exit;' INT HUP
+ for line in ${snaparr[@]}
+ do
+ OOIFS=$IFS
+ IFS=$' '
+ local field=(`echo $line`)
+ local snapid=${field[0]}
+ local image_hobjects_stable_snap=$images/pool_$pool_id/$image_prefix/$image_prefix"_"$snapid
+ local image_snap=$snapset_output_dir/@$snapid
+ gather_hobject_snap $pool_id $image_prefix $snapid
+ local res=$?
+ if [ $res -ne 0 ];then
+ touch $image_snap
+ else
+ mv $image_hobjects_stable_snap $image_snap
+ fi
+ IFS=$OOIFS
+ done
+ IFS=$OIFS
+}
+
+# lookup image metadata and get snapid hobjects
+function discover_image_snap()
+{
+ local func="discover_image_snap"
+ echo "$func ..."
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <pool_id> <image_name> [<snap_name>]"
+ exit
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_name=$3
+ pool_id=$(($pool_id))
+ #mkdir -p $images/$image_prefix
+ lookup_image $pool_id $image_name $snap_name # input image_name and snap_name to lookup metadata and snap_id
+ if [ "$db_snap_id"x = ""x ];then
+ echo "$func: lookup image failed to gen snapid"
+ exit
+ fi
+ local image_hobjects_dir_prefix=$images/pool_$pool_id/$db_image_prefix
+ local image_nosnap=$images/pool_$pool_id/$image_name_in
+ #check if image nosnap recovered
+ if [ ! -s $image_nosnap ];then
+ echo "$func: please recover image nosnap before recover with snap"
+ rm -rf $image_hobjects_dir_prefix
+ exit
+ fi
+ local image_hobject_dir=$images/pool_$pool_id/$image_name_in
+ local image_snap_hobject=$image_hobject_dir/$image_name_in@$db_snap_id
+ local image_snap_hobject_head=$image_hobject_dir/$image_name_in@$db_snap_id@head
+ local image_snaplist=$image_hobject_dir/@snaplist
+ local image_snapset_dir=$image_hobject_dir/@snapset_dir
+ local image_head=$image_hobject_dir/$image_name_in
+ if [ ! -e $image_hobject_dir ];then
+ mkdir -p $image_hobject_dir
+ fi
+ # only gen snapset one time
+ if [ ! -e $image_snapset_dir ];then
+ mkdir -p $image_snapset_dir
+ gen_snapset_hobject $pool_id $db_image_prefix $image_snaplist $image_snapset_dir
+
+ fi
+
+ echo "$func: will get object clone ..."
+ >$image_snap_hobject
+ >$image_snap_hobject_head
+
+ trap 'echo $func failed; exit;' INT HUP
+ # get each offset's snapid hobject
+ while read line
+ do
+ #echo $line
+ OOIFS=$IFS
+ IFS=$' '
+ local field=(`echo $line`)
+ local offset_string=${field[3]}
+ IFS=$OOIFS
+ local entry=`get_object_clone $offset_string $db_snap_id $image_snaplist $image_snapset_dir`
+ if [ "$entry"x != ""x ];then
+ echo $entry >> $image_snap_hobject
+ echo `dump_backslash $line` >> $image_snap_hobject_head
+ fi
+ done < $image_head
+ rm -rf $image_hobjects_dir_prefix
+}
+
+# after discover_image_nosnap
+# collect objects from osds one by one in sequence
+function copy_image_nosnap_single_thread()
+{
+ local func="copy_image_nosnap_single_thread"
+ echo "$func ..."
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <pool_id> <image_hobjects> <backup_dir>"
+ exit
+ fi
+ local pool_id=$1
+ local image_hobjects=$2
+ local backup_dir=$3
+ pool_id=$(($pool_id))
+
+ # make sure lookup_image first
+ if [ $found = 0 ];then
+ echo "$func: image not found, maybe forget to discover_image"
+ exit
+ fi
+ if [ ! -e $backup_dir ];then
+ mkdir -p $backup_dir
+ fi
+
+ local image_dir=$backup_dir/pool_$pool_id/$image_name_in
+ local image_file=$image_dir/$image_name_in
+ local CURRENT=$image_dir/@CURRENT
+ local LOCK=$image_dir/@LOCK
+ if [ ! -e $image_dir ];then
+ mkdir -p $image_dir
+ fi
+ if [ -e $LOCK ];then
+ echo "$func: $LOCK is locked by other process"
+ exit
+ else
+ touch $LOCK
+ fi
+
+ >$image_file
+ truncate -s $db_image_size $image_file
+ echo "head">$CURRENT
+
+ local count=$(($db_image_size >> $db_order))
+ local start=`cat $image_hobjects|head -n 1|awk '{print $4}'`
+ local end=`cat $image_hobjects|tail -n 1|awk '{print $4}'`
+ local entry_count=`cat $image_hobjects|wc -l`
+
+ local char_bits=$((`echo $start|wc -c` -1 ))
+ local format="%0"$char_bits"x"
+
+ local expect_start=`printf $format 0`
+ local expect_end=`printf $format $(($count -1 ))`
+
+ echo -e "object_count\t$entry_count"
+ echo -e "expect\t\t[$expect_start ~ $expect_end] count:$count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ local icount=0
+ local istart=
+ local iend=
+ local percent=
+
+ trap 'echo $func failed; exit;' INT HUP
+ local unit=$((1<<$db_order))
+ while read line
+ do
+ {
+ icount=$(($icount+1))
+ node=`echo $line|awk '{print $1}'`
+ hobject=`echo $line|awk '{print $3}'`
+ offset=`echo $line|awk '{print $4}'`
+ off=$((16#$offset))
+ if [ $icount = 1 ];then
+ istart=$offset
+ fi
+ hobject=`dump_backslash $hobject`
+ iend=$offset
+ sshcmd="cat $hobject"
+ ssh $ssh_option $node $sshcmd < /dev/null | dd of=$image_file bs=$unit seek=$off conv=notrunc 2>/dev/null
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # restore saved cursor position
+ fi
+ }
+ done < $image_hobjects
+
+ echo
+ echo -n "size: "
+ ls -lh $image_file|awk '{print $5"\t"$9}'
+ echo -n "du: "
+ du -h $image_file
+ #unlock
+ rm -f $LOCK
+}
+
+
+# ssh copy snap_object & head_object from osd to admin node
+# copy all snapshot objects
+# and
+# all head objects which have the same offset as snapshot objects
+function collect_image_snap_objects()
+{
+ local func="collect_image_snap_objects"
+ #$1=pool_id, $2=image_name, $3=snap_id, $4=snap_hobjects, $5=head_hobjects, $6=backup_dir
+ if [ $# -lt 6 ];then
+ echo "$func: parameters: <pool_id> <image_name> <snap_id> <snap_hobjects> <head_hobjects> <backup_dir>"
+ exit
+ fi
+
+ local pool_id=$1
+ local image_name=$2
+ local snap_id=$3
+ local snap_hobjects=$4 #snap hobjects info
+ local head_hobjects=$5 #head hobjects info
+ local backup_dir=$6
+ pool_id=$(($pool_id))
+
+ local head_dir=$backup_dir/pool_$pool_id/$image_name/@head
+ local snap_dir=$backup_dir/pool_$pool_id/$image_name/@$snap_id
+ local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT
+
+ if [ ! -e $head_dir ];then
+ mkdir -p $head_dir
+ fi
+ if [ ! -e $snap_dir ];then
+ mkdir -p $snap_dir
+ fi
+
+ local snap_node= #osd node
+ local snap_hobject= #hobject path with snapid on osd
+ local snap_offset=
+ local snap_filename=
+
+ local head_node=
+ local head_hobject=
+ local head_offset=
+ local head_filename=
+
+ # ignore if there is no object in the snapshot (empty)
+ if [ ! -s $snap_hobjects ];then
+ echo "$func: $snap_hobjects is empty"
+ return 0
+ fi
+ local start=`head -n 1 $snap_hobjects|awk '{print $4}'`
+ local end=`tail -n 1 $snap_hobjects|awk '{print $4}'`
+ local entry_count=`cat $snap_hobjects|wc -l`
+ if [ $((16#$start)) -gt $((16#$end)) ];then
+ echo "$func: $snap_hobjects not sorted"
+ return 1
+ fi
+
+ # just assert if ignored empty snapshot
+ if [ "$start"x = ""x ] || [ "$end"x = ""x ];then
+ return 1
+ fi
+
+ # speed up copy snapshot
+ # lookup the corresponding head hobject of snap hobject
+ # use command: grep <offset> <head hobjects>
+ #
+ # eg.
+ # head hobjects: (32 objects, snapid = uint64(-2) = 18446744073709551614)
+ # ceph1 29.4d /var/lib/ceph/osd/ceph-0/current/29.4d_head/rb.0.1c414.6b8b4567.000000000000__head_EC2C1C4D__1d 000000000000 18446744073709551614 869
+ # ceph1 29.8c /var/lib/ceph/osd/ceph-0/current/29.8c_head/rb.0.1c414.6b8b4567.000000000001__head_0F439A8C__1d 000000000001 18446744073709551614 867
+ # ceph1 29.6a /var/lib/ceph/osd/ceph-0/current/29.6a_head/rb.0.1c414.6b8b4567.000000000002__head_FC55706A__1d 000000000002 18446744073709551614 869
+ # ceph1 29.8b /var/lib/ceph/osd/ceph-0/current/29.8b_head/rb.0.1c414.6b8b4567.000000000003__head_20A6328B__1d 000000000003 18446744073709551614 869
+ # ceph2 29.75 /var/lib/ceph/osd/ceph-1/current/29.75_head/rb.0.1c414.6b8b4567.000000000004__head_AC5ADB75__1d 000000000004 18446744073709551614 867
+ # ceph2 29.23 /var/lib/ceph/osd/ceph-1/current/29.23_head/rb.0.1c414.6b8b4567.000000000005__head_1FDEA823__1d 000000000005 18446744073709551614 867
+ # ......
+ # ceph1 29.34 /var/lib/ceph/osd/ceph-0/current/29.34_head/rb.0.1c414.6b8b4567.00000000001f__head_52373734__1d 00000000001f 18446744073709551614 869
+ #
+ # snap hobjects: (3 objects, snapid >= 29)
+ # ceph1 29.8c /var/lib/ceph/osd/ceph-0/current/29.8c_head/rb.0.1c414.6b8b4567.000000000001__1f_0F439A8C__1d 000000000001 31 867
+ # ceph1 29.6a /var/lib/ceph/osd/ceph-0/current/29.6a_head/rb.0.1c414.6b8b4567.000000000002__1e_FC55706A__1d 000000000002 30 869
+ # ceph1 29.8b /var/lib/ceph/osd/ceph-0/current/29.8b_head/rb.0.1c414.6b8b4567.000000000003__1d_20A6328B__1d 000000000003 29 869
+ #
+ # so find out offset in head hobjects line number:
+ # snap hobjects: 000000000001 ---> head hobjects: 2 (n1)
+ # snap hobjects: 000000000003 ---> head hobjects: 4 (n2)
+ #
+ # finally, the grep range shrinks from the whole file [1 ~ N] to the slice [n1 ~ n2]
+ # the worst case: [n1 ~ n2] = [1 ~ N], i.e. no shrinking
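+ # e.g. (illustrative, hypothetical line numbers): if the start offset sits on line
+ # n1=2 and the end offset on line n2=4 of $head_hobjects, then
+ #   head -n 4 $head_hobjects | tail -n 3
+ # keeps exactly lines 2..4, and the per-object grep below only scans that slice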
+
+ # get the line number of the start offset in head hobjects
+ local n1=`grep -n $start $head_hobjects|head -n 1|cut -d ":" -f 1`
+ # get the line number of the end offset in head hobjects
+ local n2=`grep -n $end $head_hobjects|head -n 1|cut -d ":" -f 1`
+
+ local icount=0
+ local istart=
+ local iend=
+ local percent=
+
+ OIFS=$IFS
+ IFS=$'\n'
+
+ #assume $snap_hobjects is not very large and can be loaded into memory
+ local snap_arr=(`cat $snap_hobjects`)
+ local snap_tmp=/tmp/snaptmp.$$$$
+
+ # snap_tmp:
+ # consists of snap hobject or head hobject
+ # select lineno range: [n1 ~ n2]
+ head -n $n2 $head_hobjects|tail -n $(($n2-$n1+1)) >$snap_tmp
+
+ echo "copy image snap/head objects from osd ..."
+ echo -e "object_count\t$entry_count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ trap 'echo $func failed; exit;' INT HUP
+ for line in ${snap_arr[*]}
+ do
+ icount=$(($icount+1))
+
+ OOIFS=$IFS
+ IFS=$' '
+
+ local arr=(`echo $line`)
+ snap_node=${arr[0]}
+ snap_hobject=${arr[2]}
+ snap_offset=${arr[3]}
+ snap_filename=$snap_dir/$snap_offset
+
+ if [ $icount = 1 ];then
+ istart=$snap_offset
+ fi
+ iend=$snap_offset
+
+ #lookup corresponding head hobject of snap hobject
+ local res=`grep $snap_offset $snap_tmp|head -n 1`
+ if [ "$res"x = ""x ];then
+ echo "$func: image object[ $snap_offset ] missing"
+ exit
+ fi
+
+ local arr2=(`echo $res`)
+ head_node=${arr2[0]}
+ head_hobject=${arr2[2]}
+ head_offset=${arr2[3]}
+ head_filename=$head_dir/$head_offset
+
+ # just copy object(snap/head) if it does not exist
+ if [ ! -e $snap_filename ];then
+ ssh $ssh_option $snap_node "cat $snap_hobject" > $snap_filename
+ fi
+ if [ ! -e $head_filename ];then
+ ssh $ssh_option $head_node "cat $head_hobject" > $head_filename
+ fi
+ IFS=$OOIFS
+
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # restore saved cursor position
+ fi
+ done
+ echo
+ IFS=$OIFS
+ rm -f $snap_tmp
+ return 0
+}
+
+# copy all snap objects and corresponding head objects from osds
+# in single process
+function copy_image_snap_single_thread()
+{
+ local func="copy_image_snap_single_thread"
+ if [ $# -lt 6 ];then
+ echo "$func: parameters: <pool_id> <image_name> <snap_id> <snap_hobjects> <head_hobjects> <backup_dir>"
+ exit
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_id=$3
+ local snap_hobjects=$4
+ local head_hobjects=$5
+ local backup_dir=$6
+ pool_id=$(($pool_id))
+
+ local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT
+ local LOCK=$backup_dir/pool_$pool_id/$image_name/@LOCK
+ #lock
+ if [ -e $LOCK ];then
+ echo "$func: $LOCK is locked by other process"
+ exit
+ else
+ touch $LOCK
+ fi
+ collect_image_snap_objects $pool_id $image_name $snap_id $snap_hobjects $head_hobjects $backup_dir
+ #unlock
+ rm -f $LOCK
+}
+
+# after all snap objects and the necessary head objects are copied,
+# pick the appropriate head objects and snap objects and write them into the image
+# in order to roll the image back to a snapshot
+#
+# init: the image is created by copy_image_nosnap_single_thread first
+#
+# all output include 3 parts:
+# <image> <head objects> <snap objects>
+#
+# head objects1 --- snap1 objects
+# head objects2 --- snap2 objects
+# image head objects3 --- snap3 objects
+# ......
+# head objectsN --- snapN objects
+#
+# how to rollback:
+# first roll back to head, then write <snapX objects>
+# head = <image> + <head objects>
+# snap1 = <image> + <head objects> + <snap1 objects>
+# snap2 = <image> + <head objects> + <snap2 objects>
+# snap3 = <image> + <head objects> + <snap3 objects>
+# ......
+# snapN = <image> + <head objects> + <snapN objects>
+#
+# improved rollback:
+# head objects and snapX objects intersect whenever snapX objects are not empty,
+# so the intersection must be deduplicated.
+# deduplication steps:
+# - compute the difference set (head objects - snapX objects)
+# - write the difference-set objects to the image
+# - write the snapX objects to the image
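+# a minimal sketch of the difference-set step used below (illustrative, hypothetical
+# object names): with head objects {000,001,002} and snap objects {001}, merging one
+# copy of the head list with two copies of the snap list and keeping unique lines
+# leaves {000,002}:
+#   sort -m <(printf '000\n001\n002\n') <(printf '001\n') <(printf '001\n') | uniq -u
+# those objects are written first, then the snap objects overwrite their offsets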
+function rollback_image_snap()
+{
+ local func="rollback_image_snap"
+
+ echo "$func ..."
+
+ trap 'echo $func failed; exit;' INT HUP
+ if [ $# -lt 6 ];then
+ echo "$func: parameters <pool_id> <image_name> <snap_id> <snap_object_dir> <backup_dir> <image_unit>"
+ exit
+ fi
+ local pool_id=$1
+ local image_name=$2
+ local snap_id=$3
+ local snap_object_dir=$4
+ local backup_dir=$5
+ local image_unit=$6
+
+ local need_diff_set=0
+
+ local image_path=$backup_dir/pool_$pool_id/$image_name/$image_name
+ local head_object_dir=$backup_dir/pool_$pool_id/$image_name/@head
+ local CURRENT=$backup_dir/pool_$pool_id/$image_name/@CURRENT
+ local LOCK=$backup_dir/pool_$pool_id/$image_name/@LOCK
+ if [ -e $LOCK ];then
+ echo "$func: $LOCK is locked by other process"
+ exit
+ else
+ touch $LOCK
+ fi
+ if [ $snap_id -ne -2 ];then
+ echo $snap_id > $CURRENT
+ else
+ echo "head" > $CURRENT
+ fi
+
+ if [ ! -e $snap_object_dir ];then
+ rm -f $LOCK
+ return 0
+ fi
+
+ if [ "$snap_object_dir"x != "$head_object_dir"x ];then
+ echo "$func: need to compute diff_set of head"
+ need_diff_set=1
+ else
+ echo "$func: NO diff_set"
+ need_diff_set=0
+ fi
+
+ local entry_count=0
+ local start=
+ local end=
+ local offset=
+ local icount=0
+ local istart=
+ local iend=
+ local percent=
+
+ local snap_objects=
+ local head_objects=
+ local diff_set=
+
+ snap_objects=(`ls $snap_object_dir`)
+
+ # if need to compute difference set of head_objects and snap_objects
+ if [ $need_diff_set -ne 0 ];then
+ head_objects=(`ls $head_object_dir`)
+
+ #get the difference set: ( head_objects - snap_objects )
+ diff_set=(`
+ sort -m <(echo ${head_objects[@]}|xargs -n 1 echo) <(echo ${snap_objects[@]}|xargs -n 1 echo) \
+ <(echo ${snap_objects[@]}|xargs -n 1 echo) |uniq -u`)
+
+ # copy diff_set of head object to image
+ pushd $head_object_dir >/dev/null
+
+ echo "$func: copy diff_set head objects ..."
+ entry_count=${#diff_set[@]}
+ start=${diff_set[0]}
+ end=
+ if [ $entry_count -gt 0 ];then
+ end=${diff_set[$(($entry_count - 1))]}
+ fi
+ offset=
+ icount=0
+ istart=
+ iend=
+ percent=
+
+ echo -e "object_count\t$entry_count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ for object in ${diff_set[@]}
+ do
+ icount=$(($icount+1))
+ if [ $icount = 1 ];then
+ istart=$object
+ fi
+ iend=$object
+
+ local offset=$((16#$object))
+ dd if=$object of=$image_path bs=$image_unit seek=$offset conv=notrunc 2>/dev/null
+
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # restore saved cursor position
+ fi
+ done
+ if [ $entry_count -gt 0 ];then
+ echo
+ fi
+ popd >/dev/null
+
+ if [ $snap_id -ne -2 ];then
+ echo -e "$image_name already rollback diff_set: (head - snap)"
+ fi
+ fi
+
+ # copy snap object to image
+ pushd $snap_object_dir >/dev/null
+
+ if [ $need_diff_set -ne 0 ];then
+ echo "$func: copy snap objects ..."
+ else
+ echo "$func: copy head objects ..."
+ fi
+ entry_count=${#snap_objects[@]}
+ start=${snap_objects[0]}
+ end=
+ if [ $entry_count -gt 0 ];then
+ end=${snap_objects[$(($entry_count - 1))]}
+ fi
+ offset=
+ icount=0
+ istart=
+ iend=
+ percent=
+
+ echo -e "object_count\t$entry_count"
+ echo -e "range\t\t[$start ~ $end] count:$entry_count"
+
+ for object in ${snap_objects[@]}
+ do
+ icount=$(($icount+1))
+ if [ $icount = 1 ];then
+ istart=$object
+ fi
+ iend=$object
+
+ local offset=$((16#$object))
+ dd if=$object of=$image_path bs=$image_unit seek=$offset conv=notrunc 2>/dev/null
+
+ percent=`echo "scale=3; 100*$icount/$entry_count"|bc`
+ tput sc #record current cursor
+ echo -n -e "complete\t[$istart ~ $iend] $icount/$entry_count ==> "$percent"%"
+ if [ $icount != $entry_count ];then
+ tput rc # restore saved cursor position
+ fi
+ done
+ if [ $entry_count -gt 0 ];then
+ echo
+ fi
+ popd >/dev/null
+
+ rm -f $LOCK
+ if [ $snap_id -ne -2 ];then
+ echo "$image_name rollback to snapid: $snap_id"
+ else
+ echo "$image_name rollback to head"
+ fi
+}
+
+function recover_image()
+{
+ local func="recover_image"
+ echo "$func ..."
+
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <pool_id> <image_name> <snap_name> [<backup_dir>]"
+ exit
+ fi
+
+ local pool_id=$1
+ local img_name=$2
+ local snap_name=$3
+ local backup_dir=$4
+ pool_id=$(($pool_id))
+ if [ "$snap_name"x = "@"x ];then
+ snap_name=
+ fi
+ if [ "$backup_dir"x = ""x ];then
+ backup_dir=$default_backup_dir
+ fi
+
+ #recover image with nosnap
+ if [ "$snap_name"x = ""x ];then
+ discover_image_nosnap $pool_id $img_name #input image_name
+ local image_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in
+ copy_image_nosnap_single_thread $pool_id $image_hobjects $backup_dir
+
+ #recover image with snap
+ else
+
+ # check if recovered head already
+ local img_hobjects_path=$images/pool_$pool_id/$img_name/$img_name
+ local img_file_path=$backup_dir/pool_$pool_id/$img_name/$img_name
+ if [ ! -e $img_hobjects_path ] || [ ! -e $img_file_path ];then
+ echo "$func: $img_name@$snap_name : can not rollback to snapshot, please recover image head first"
+ exit
+ fi
+
+ # rollback to head
+ if [ "$snap_name"x = "@@"x ];then
+ local head_dir=$backup_dir/pool_$pool_id/$img_name/@head
+ if [ -e $head_dir ];then
+ local unit=`pushd $head_dir >/dev/null; ls|head -n 1|xargs -n 1 stat|awk '/Size:/{print $2}'`
+ # rollback to head
+ rollback_image_snap $pool_id $img_name -2 $head_dir $backup_dir $unit
+ echo "$img_name head : $img_file_path"
+ else
+ echo "$func: no need to rollback to head"
+ fi
+ return 0
+ fi
+
+ # rollback to snap
+ discover_image_snap $pool_id $img_name $snap_name # get image meta & get snapid object
+ local snap_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in@$db_snap_id
+ local head_hobjects=$images/pool_$pool_id/$image_name_in/$image_name_in@$db_snap_id@head
+ local snap_object_dir=$backup_dir/pool_$pool_id/$image_name_in/@$db_snap_id
+ local image_path=$backup_dir/pool_$pool_id/$image_name_in/$image_name_in
+ local image_unit=$((1<<$db_order))
+ copy_image_snap_single_thread $pool_id $image_name_in $db_snap_id $snap_hobjects $head_hobjects $backup_dir
+ rollback_image_snap $pool_id $image_name_in $db_snap_id $snap_object_dir $backup_dir $image_unit
+ echo "$image_name_in@$snap_name : $image_path"
+ fi
+}
diff --git a/src/tools/rbd_recover_tool/epoch_h b/src/tools/rbd_recover_tool/epoch_h
new file mode 100644
index 00000000..e268eafa
--- /dev/null
+++ b/src/tools/rbd_recover_tool/epoch_h
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+# file: epoch_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+. $my_dir/common_h
+
+#pgid_list=$single_node/$cluster-$id/pgid_list
+function get_pgid_list()
+{
+ find $osd_data/current/ -type d -name "*_head"|\
+ sed -n 's/\(.*\)\/current\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head/\2 \1/p'|\
+ sort -t ' ' -k 1.1,1h -k 2.1,2 > $pgid_list;
+}
+
+function get_pgid()
+{
+ hobject_path=$1
+ echo $hobject_path| sed -n 's/\(.*\)\/\([0-9a-fA-F]\+\.[0-9a-fA-F]\+\)_head\(.*\)/\2/p'
+}
+
+infos_seq=
+function get_infos_seq()
+{
+ local func="get_infos_seq"
+
+ local keyword=":infos."
+ local infos_key=`get_map_header_key $keyword`
+
+ if [ "$infos_key"x = ""x ];then
+ echo "$func: keyword not input or infos_key not exists"
+ exit
+ fi
+ local prefix=`get_map_header_prefix`
+ local key=$infos_key
+
+ infos_seq=`get_header_seq $prefix $key`
+ if [ "$infos_seq"x = ""x ];then
+ echo "$func: infos_seq not exists"
+ exit
+ fi
+}
+
+pg_epoch=
+function get_pg_epoch()
+{
+ local func="get_pg_epoch"
+ if [ "$1"x = ""x ];then
+ echo "$func: no pgid input"
+ exit
+ fi
+
+ get_pg_epoch_firefly "$1"
+ if [ "$pg_epoch"x != ""x ]; then
+ # echo "Epoch for $1: $pg_epoch (firefly)"
+ return
+ fi
+
+ get_pg_epoch_hammer "$1"
+ if [ "$pg_epoch"x != ""x ]; then
+ # echo "Epoch for $1: $pg_epoch (hammer)"
+ return
+ fi
+
+ echo "$func: Couldn't find epoch for $1"
+ exit
+}
+
+function get_pg_epoch_firefly()
+{
+ local func="get_pg_epoch_firefly"
+ if [ "$1"x = ""x ];then
+ echo "$func: no pgid input"
+ exit
+ fi
+ local pgid=$1
+ local key=$pgid"_epoch"
+
+ #get_infos_seq;
+ # infos_seq default to 1
+ infos_seq=1
+ local infos_seq=`printf "%016d" $infos_seq`
+ local prefix="_USER_"$infos_seq"_USER_"
+
+ pg_epoch=`get_header_kv $prefix $key int`
+}
+
+function get_pg_epoch_hammer()
+{
+ local func="get_pg_epoch_hammer"
+ if [ "$1"x = ""x ];then
+ echo "$func: no pgid input"
+ exit
+ fi
+ local pgid="$1"
+ local hkey_prefix="$(get_map_header_prefix)"
+ local hkey="$(printf '...head.%x.%08X' "$(echo "$pgid"|cut -d'.' -f1)" "$((0x$(echo "$pgid"|cut -d'.' -f2)))")"
+
+ local infos_seq="$(get_header_seq "$hkey_prefix" "$hkey")"
+ local infos_seq=`printf "%016d" $infos_seq`
+ local prefix="_USER_"$infos_seq"_USER_"
+ local key="_epoch"
+
+ pg_epoch=`get_header_kv $prefix $key int`
+}
diff --git a/src/tools/rbd_recover_tool/metadata_h b/src/tools/rbd_recover_tool/metadata_h
new file mode 100644
index 00000000..4aa491b5
--- /dev/null
+++ b/src/tools/rbd_recover_tool/metadata_h
@@ -0,0 +1,368 @@
+#!/usr/bin/env bash
+# file: metadata_h
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+. $my_dir/common_h
+. $my_dir/epoch_h
+
+# put origin name in $image_name_in: for output
+# put convert "_" name in $image_name: for grep image hobjects from database
+image_name_in=
+image_name=
+function input_image()
+{
+ local func="input_image"
+ if [ "$1"x = ""x ];then
+ echo "$func: no image name input"
+ exit
+ fi
+
+ image_name_in=$1
+ # "_" -> "\u"
+ image_name=`convert_underline $image_name_in`
+}
+
+#======================================== distinguish v1 or v2 ===================================
+#image_list_v1=$single_node/$cluster-$id/image_list_v1
+#image_list_v2=$single_node/$cluster-$id/image_list_v2
+function get_image_list()
+{
+ find $osd_data/current/ -type f|grep ".rbd__" >$image_list_v1
+ find $osd_data/current/ -type f|grep "rbd\\\\uid." >$image_list_v2
+}
+
+function get_image_format_by_hobject()
+{
+ local func="get_image_format"
+ if [ "$1"x = ""x ];then
+ exit
+ fi
+ local res1=`cat $image_list_v1|grep $1`
+ if [ "$res1"x != ""x ];then
+ echo 1
+ exit
+ fi
+
+ local res2=`cat $image_list_v2|grep $1`
+ if [ "$res2"x = ""x ];then
+ echo 2
+ exit
+ fi
+}
+
+#======================================== image format v1 ========================================
+# <image_name>.rbd include 3 parts:
+# header + snap_count*snapshot + snap_count*snap_name
+#
+# struct rbd_obj_header_ondisk {
+# 40 char text[40];
+# 24 char block_name[RBD_MAX_BLOCK_NAME_SIZE];
+# 4 char signature[4];
+# 8 char version[8];
+# struct {
+# 1 __u8 order;
+# 1 __u8 crypt_type;
+# 1 __u8 comp_type;
+# 1 __u8 unused;
+# } __attribute__((packed)) options;
+# 8 __le64 image_size;//hexdump -C s=80 n=8
+# 8 __le64 snap_seq; //hexdump -C s=88 n=8
+# 4 __le32 snap_count;//hexdump -C s=96 n=4
+# 4 __le32 reserved;
+# 8 __le64 snap_names_len;//hexdump -C s=104 n=8
+# struct rbd_obj_snap_ondisk snaps[0];
+# } __attribute__((packed));
+#
+# sizeof(rbd_obj_header_ondisk): 112
+#
+# struct rbd_obj_snap_ondisk {
+# 8 __le64 id; //hexdump -C s=112+i*16 n=8 , i=[0, snap_count)
+# 8 __le64 image_size;//hexdump -C s=112+i*16+8 n=8, i=[0, snap_count)
+# } __attribute__((packed));
+# sizeof(rbd_obj_snap_ondisk): 16
+#
+# get snap_names from <image_name>.rbd
+# hexdump -e '10/1 "%_c"' -s $((112 + $snap_count*16)) -n $snap_names_len <image_name>.rbd
+# then split snap_names into array
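+#
+# e.g. (illustrative, hypothetical file name): for a format-1 header object foo.rbd,
+# the fixed-offset fields can be peeked at directly:
+#   hexdump -e '10/1 "%c"' -s 40 -n 24 foo.rbd   # block_name
+#   hexdump -C -s 80 -n 8 foo.rbd                # image_size (little endian)
+#   hexdump -C -s 96 -n 4 foo.rbd                # snap_count (little endian)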
+
+function get_image_metadata_v1()
+{
+ local func="get_image_metadata_v1"
+ if [ "$1"x = ""x ];then
+ echo "$func: no image head object input"
+ exit
+ fi
+ local snap_name=
+ if [ "$2"x != ""x ];then
+ snap_name=$2
+ fi
+
+ if [ ! -e $1 ];then
+ echo "$func: $1 not exists"
+ exit
+ fi
+ local hobject_path=$1
+ d_hobject_path=`dump_backslash $1`
+ local image_format=`get_image_format_by_hobject $d_hobject_path`
+ if [ $image_format != 1 ];then
+ echo "$func: image_format must be 1"
+ exit
+ fi
+
+ if [ ! -e $hobject_path ];then
+ echo "$func: $hobject_path not exists"
+ exit
+ fi
+
+ # decode rbd_obj_header_ondisk of <image_name>.rbd
+ local block_name=`hexdump -e '10/1 "%c"' -s 40 -n 24 $hobject_path`
+ local order=`hexdump -e '10/4 "%u"' -s 76 -n 1 $hobject_path`
+ local image_size=`hexdump -C -s 80 -n 8 $hobject_path|head -n 1|awk '{for (i=9; i>1; i--) {printf $i}}'`
+ image_size=$((16#$image_size))
+ local snap_seq=`hexdump -C -s 88 -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'`
+ local snap_count=`hexdump -C -s 96 -n 4 $hobject_path|head -n 1|
+ awk '{num=""; for(i=5; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'`
+ local snap_names_len=`hexdump -C -s 104 -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){ num=num""$i;} print strtonum("0x"num);}'`
+
+ echo -e "block_name:\t$block_name"
+ echo -e "order:\t\t$order"
+ echo -e "image_size:\t$image_size"
+ echo -e "snap_seq:\t$snap_seq"
+
+ # decode N rbd_obj_snap_ondisk of <image_name>.rbd
+ declare -a snap_ids
+ declare -a snap_names
+ declare -a snap_image_sizes
+ local size_header=112 #sizeof(rbd_obj_header_ondisk)
+ local size_snap=16 #sizeof(rbd_obj_snap_ondisk)
+ local offset=0
+ local id_off=0
+ local size_off=0
+ for ((i=0; i<$snap_count; i++))
+ do
+ offset=$(($size_header + $i * $size_snap))
+ id_off=$offset
+ size_off=$(($offset + 8))
+ snap_ids[$i]=`hexdump -C -s $id_off -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){num=num""$i;} print strtonum("0x"num);}'`
+ snap_image_sizes[$i]=`hexdump -C -s $size_off -n 8 $hobject_path|head -n 1|
+ awk '{num=""; for(i=9; i>1; i--){num=num""$i;} print strtonum("0x"num);}'`
+ done
+ offset=$(($size_header + $snap_count * $size_snap))
+ snap_names=(`hexdump -e '10/1 "%_c"' -s $offset -n $snap_names_len $hobject_path|
+ awk -F "\\\\\\\\\\\\\\\\0" '{for(i=1; i<=NF; i++) {print $i" "} }'`);
+
+ echo -e "\t\tID\tNAME\t\tSIZE"
+ for ((i=0; i<$snap_count; i++))
+ do
+ if [ "$snap_name"x = ""x ];then
+ echo -n -e "snapshot:\t"
+ echo -e "${snap_ids[$i]}\t${snap_names[$i]}\t\t${snap_image_sizes[$i]}"
+ continue
+ fi
+ if [ "$snap_name"x = "${snap_names[$i]}"x ];then
+ echo -n -e "snapshot:\t"
+ echo -e "${snap_ids[$i]}\t${snap_names[$i]}\t\t${snap_image_sizes[$i]}"
+ return
+ fi
+ done
+}
+
+#======================================== end image format v1 ========================================
+
+#======================================== image format v2 ========================================
+
+# map_header, header_seq, header, key/value
+# eg.
+# map_header _HOBJTOSEQ_:rbd%uheader%e139a6b8b4567...head.2.68E826B6
+# meta_header_seq 17426
+# header: _USER_0000000000017426_USER_:object_prefix
+# _USER_0000000000017426_USER_:order
+# _USER_0000000000017426_USER_:size
+# _USER_0000000000017426_USER_:snap_seq
+# key/value ceph-kvstore-tool /storepath get _USER_0000000000017426_USER_ (object_prefix|order|size|snap_seq)
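+#
+# a minimal sketch of how the helpers below chain together (illustrative, the image
+# id is the hypothetical one from the example above):
+#   get_map_header 139a6b8b4567                              # -> map_header_prefix/map_header_key
+#   get_meta_header_seq $map_header_prefix $map_header_key   # -> meta_header_seq
+#   get_image_metadata_v2 $meta_header_seq                   # prints object_prefix/order/size/snap_seq
+# which is what osd_job's do_image_metadata_v2 runs on each osd node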
+
+# decode image id from image_id_hobject
+function get_image_id()
+{
+ local func="get_image_id"
+ if [ "$1"x = ""x ];then
+ exit;
+ fi
+ local image_id_hobject=$1 #from admin node's database
+
+ if [ ! -e $image_id_hobject ];then
+ #echo "$func: $image_id_hobject not exists"
+ exit;
+ fi
+
+ # get len of string
+ local n=`hexdump -e '10/4 "%u"' -s 0 -n 4 $image_id_hobject`
+ # get string
+ hexdump -e '10/1 "%c"' -s 4 -n $n $image_id_hobject
+}
+
+#find image_id omap entry in omaplist
+map_header_prefix=
+map_header_key=
+function get_map_header()
+{
+ local func="get_map_header"
+ local image_id=$1
+ if [ "$image_id"x = ""x ];then
+ echo "$func: no image_id input"
+ exit;
+ fi
+ map_header_prefix=`get_map_header_prefix`
+ local keyword="header%e"$image_id
+ map_header_key=`get_map_header_key $keyword`
+ if [ "$map_header_key"x = ""x ];then
+ echo "$func: map_header_key is NULL(not in omaplist)"
+ exit
+ fi
+}
+
+#get meta header seq from map_header
+meta_header_seq=
+function get_meta_header_seq()
+{
+ local func="get_meta_header_seq"
+ if [ "$1"x == ""x ];then
+ echo "$func: no prefix input"
+ exit;
+ elif [ "$2"x == ""x ];then
+ echo "$func: no key input"
+ exit;
+ fi
+ local prefix=$1;
+ local key=$2;
+ meta_header_seq=`get_header_seq $prefix $key`
+}
+
+# get image metadata : object_prefix, order, image_size, snap_seq
+object_prefix=
+order=
+image_size=
+snap_seq=
+function get_image_metadata_v2()
+{
+ local func="get_image_metadata_v2"
+ if [ "$1"x = ""x ];then
+ echo "$func: no meta_header_seq input"
+ exit;
+ fi
+ local meta_header_seq=`printf "%016d" $1`
+ #echo "$func: meta_header_seq = "$meta_header_seq
+ local ghobject_key="_USER_"$meta_header_seq"_USER_"
+ local prefix=$ghobject_key
+
+ object_prefix=`get_header_kv $prefix object_prefix string`
+ #object_prefix="rbd_data.$image_id"
+ order=`get_header_kv $prefix order int`
+ image_size=`get_header_kv $prefix size int`
+ snap_seq=`get_header_kv $prefix snap_seq int`
+
+ echo -e "object_prefix:\t$object_prefix"
+ echo -e "order:\t\t$order"
+ echo -e "image_size:\t$image_size"
+ echo -e "snap_seq:\t$snap_seq"
+
+ # list snapshot
+ list_snaps_v2 $1 $2
+}
+
+# struct cls_rbd_snap {
+# snapid_t id;
+# string name;
+# uint64_t image_size;
+# uint64_t features;
+# uint8_t protection_status;
+# cls_rbd_parent parent;
+# }
+# decode cls_rbd_snap
+# 1 u8 struct_v
+# 1 u8 struct_compat
+# 4 u32 struct_len
+# 8 u64 snapid_t id //s=6 n=8
+# 4 u32 len of name //s=14 n=4
+# len char name //s=18 n=len
+# 8 u64 image_size
+# 8 u64 features
+# ......
+#
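+# e.g. (illustrative, hypothetical bytes): if bytes 6..13 of a snapshot_* value are
+#   1d 00 00 00 00 00 00 00
+# the little-endian snapid is rebuilt by reversing the bytes and prefixing 16#:
+#   echo $((16#000000000000001d))   # -> 29
+# list_snaps_v2 below applies the same trick to the name length and image_size fields
+#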
+function list_snaps_v2()
+{
+ local func="list_snaps_v2"
+ if [ "$1"x = ""x ];then
+ exit
+ fi
+ local sname=
+ if [ $# -eq 2 ];then
+ sname=$2
+ fi
+ local meta_header_seq=`printf "%016d" $1`
+ local prefix="_USER_"$meta_header_seq"_USER_"
+ local keys=(`awk -F ":" '/snapshot_/ && $1 == "'"$prefix"'" {if ($2 == "") exit; split($2, arr, "_");
+ print arr[2];}' $omap_list|sort -r`)
+ echo -e "\t\tID\tNAME\t\tSIZE"
+ for key in ${keys[@]}
+ do
+ key="snapshot_$key"
+ local arr=(`ceph-kvstore-tool $omap_path get $prefix $key|awk -F ":" '{print $2}'`);
+ # get snap_name
+ tmp=
+ for ((i=17; i>13; i--))
+ do
+ tmp="$tmp${arr[$i]}"
+ done
+ local len=$((16#$tmp))
+ local snap_name=
+ for ((i=18; i<$((18+$len)); i++))
+ do
+ # convert hex byte to char
+ local char=`echo -e "\x${arr[$i]}"`
+ snap_name="$snap_name$char"
+ done
+ # get snap_id (little endian)
+ local tmp=
+ for ((i=13; i>5; i--))
+ do
+ tmp="$tmp${arr[$i]}"
+ done
+ local snap_id=$((16#$tmp))
+ # get image_size of current snap (little endian)
+ tmp=
+ for ((i=$((25+$len)); i>$((17+$len)); i--))
+ do
+ tmp="$tmp${arr[$i]}"
+ done
+ local image_size=$((16#$tmp))
+ if [ "$sname"x = ""x ];then
+ echo -e "snapshot:\t$snap_id\t$snap_name\t\t$image_size"
+ continue
+ fi
+ if [ "$sname"x = "$snap_name"x ];then
+ echo -e "snapshot:\t$snap_id\t$snap_name\t\t$image_size"
+ return
+ fi
+ done
+}
+
+#======================================== end image format v2 ========================================
diff --git a/src/tools/rbd_recover_tool/osd_job b/src/tools/rbd_recover_tool/osd_job
new file mode 100755
index 00000000..b4b80be8
--- /dev/null
+++ b/src/tools/rbd_recover_tool/osd_job
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+# file: osd_job
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+
+function check_ceph_osd()
+{
+ local func="check_ceph_osd"
+ local host=`hostname`
+ # abort if a ceph-osd process (other than flush-journal) is still running
+ if [ "`ps aux|grep ceph-osd|grep -v flush-journal|grep -v grep`"x != ""x ];then
+ echo "[$host]: $func: ceph-osd is running..., stop it"
+ exit
+ fi
+}
+
+function cat_pg_epoch()
+{
+ local func="cat_pg_epoch"
+ init_env_osd $1
+ if [ -e $node_pg_epoch ];then
+ cat $node_pg_epoch
+ fi
+}
+
+function cat_image_v1()
+{
+ local func="cat_image_v1"
+ init_env_osd $1
+ if [ -e $image_v1 ];then
+ cat $image_v1
+ fi
+}
+
+function cat_image_v2()
+{
+ local func="cat_image_v2"
+ init_env_osd $1
+ if [ -e $image_v2 ];then
+ cat $image_v2
+ fi
+}
+
+function flush_osd_journal()
+{
+ local func="flush_osd_journal"
+ init_env_osd $1
+ local osd_data_path=$osd_data
+ local osd_journal_path=$osd_data/journal
+ local whoami_path=$osd_data/whoami
+ local host=`hostname`
+ if [ ! -e $whoami_path ];then
+ echo "[$host]: $func: $whoami_path not exists"
+ exit
+ fi
+ local whoami=`cat $whoami_path`
+ echo "[$host]: $func ..."
+ ceph-osd -i $whoami --osd-data $osd_data_path --osd-journal $osd_journal_path --flush-journal >/dev/null
+ if [ $? -ne 0 ];then
+ echo "[$host]: $func: flush osd journal failed"
+ exit
+ fi
+}
+
+function do_omap_list()
+{
+ local func="do_omap_list"
+ init_env_osd $1
+ local host=`hostname`
+ echo "[$host]: $func ..."
+ get_omap_list
+}
+
+# get all pgs epoch
+function do_pg_epoch()
+{
+ local func="do_pg_epoch"
+ init_env_osd $1
+ local node=`hostname`
+ get_pgid_list
+ >$node_pg_epoch
+ local pgid=
+ local data_path=
+ local host=`hostname`
+ echo "[$host]: $func ..."
+ while read line
+ do
+ {
+ pgid=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ get_pg_epoch $pgid
+ echo -e "$node $pgid $pg_epoch $data_path" >>$node_pg_epoch
+ }
+ done < $pgid_list
+}
+
+# get a list of images on this osd node; the pg epoch may not be the latest, the admin node will sort that out
+function do_image_list()
+{
+ local func="do_image_list"
+ init_env_osd $1
+ get_image_list
+ local node=`hostname`
+ >$image_v1
+ >$image_v2
+ local host=`hostname`
+ echo "[$host]: $func ..."
+ for line in `cat $image_list_v1`
+ do
+ pgid=`get_pgid $line`
+ get_pg_epoch $pgid
+ echo "$node $line $pg_epoch" >> $image_v1
+ done
+ for line in `cat $image_list_v2`
+ do
+ pgid=`get_pgid $line`
+ get_pg_epoch $pgid
+ echo "$node $line $pg_epoch" >> $image_v2
+ done
+}
+
+function do_image_id()
+{
+ local func="do_image_id"
+ init_env_osd $1
+ get_image_id $2
+}
+
+function do_image_metadata_v1()
+{
+ local func="do_image_metadata_v1"
+ init_env_osd $1
+ local image_header_hobject=$2
+ local snap_name=$3
+ get_image_metadata_v1 $image_header_hobject $snap_name
+}
+
+function do_image_metadata_v2()
+{
+ local func="do_image_metadata_v2"
+ init_env_osd $1
+ local image_id=$2
+ local image_header_hobject=$3
+ local snap_name=$4
+ get_map_header $image_id
+ get_meta_header_seq $map_header_prefix $map_header_key
+ get_image_metadata_v2 $meta_header_seq $snap_name
+}
+
+check_ceph_osd
+$*
diff --git a/src/tools/rbd_recover_tool/rbd-recover-tool b/src/tools/rbd_recover_tool/rbd-recover-tool
new file mode 100755
index 00000000..b7a25865
--- /dev/null
+++ b/src/tools/rbd_recover_tool/rbd-recover-tool
@@ -0,0 +1,327 @@
+#!/usr/bin/env bash
+# file: rbd-recover-tool
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+# rbd-recover-tool is an offline recovery tool for rbd images in replicated pools,
+# used when the ceph cluster is stopped.
+# it is a simple disaster recovery policy, intended only for urgent conditions
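+#
+# a typical session (illustrative, hypothetical pool id/image/snapshot/path):
+#   rbd-recover-tool database                     # gather infos from all osd nodes
+#   rbd-recover-tool list                         # list <pool_id>/<image_name>
+#   rbd-recover-tool lookup 2/foo@snap1           # show image/snapshot metadata
+#   rbd-recover-tool recover 2/foo /backup        # recover the image head first
+#   rbd-recover-tool recover 2/foo@snap1 /backup  # then roll back to a snapshot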
+
+my_dir=$(dirname "$0")
+
+. $my_dir/common_h
+. $my_dir/metadata_h
+. $my_dir/epoch_h
+. $my_dir/database_h
+
+#scp files from the admin node to the osd nodes
+file1=common_h
+file2=metadata_h
+file3=epoch_h
+file4=osd_job
+
+#------------ admin node's action -------------
+
+function scp_file()
+{
+ local func="scp_file"
+ file=$1
+ if [ "$1"x = ""x ];then
+ echo "$func: not file input"
+ exit
+ fi
+ for host in `cat $osd_host`
+ do
+ {
+ echo "$func: $host"
+ scp $ssh_option $file $host:$job_path 1>/dev/null
+ } &
+ done
+}
+
+function scp_files()
+{
+ local func="scp_files"
+ for host in `cat $osd_host`
+ do
+ {
+ echo "$func: $host"
+ scp $ssh_option $file1 $host:$job_path
+ scp $ssh_option $file2 $host:$job_path
+ scp $ssh_option $file3 $host:$job_path
+ scp $ssh_option $file4 $host:$job_path
+ } &
+ done
+ wait
+ echo "$func: finish"
+}
+
+function scatter_node_jobs()
+{
+ local func="scatter_node_jobs"
+ local host=
+ local data_path=
+ echo "$func: flush osd journal & generate infos: omap, pg, image metadata ..."
+
+ trap 'echo $func failed; exit' INT HUP
+ while read line
+ do
+ {
+ host=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ check_osd_process $host
+
+ cmd="mkdir -p $job_path"
+ ssh $ssh_option $host $cmd
+ scp $ssh_option $file1 $host:$job_path >/dev/null
+ scp $ssh_option $file2 $host:$job_path >/dev/null
+ scp $ssh_option $file3 $host:$job_path >/dev/null
+ scp $ssh_option $file4 $host:$job_path >/dev/null
+
+ cmd="bash $job_path/osd_job flush_osd_journal $data_path;"
+ cmd="$cmd $job_path/osd_job do_omap_list $data_path;"
+ cmd="$cmd bash $job_path/osd_job do_pg_epoch $data_path;"
+ cmd="$cmd bash $job_path/osd_job do_image_list $data_path;"
+
+ ssh $ssh_option $host $cmd </dev/null
+ } &
+ done < $osd_host_path
+ wait
+ echo "$func: finish"
+}
+
+function gather_node_infos()
+{
+ local func="gather_node_infos"
+ echo "$func ..."
+ >$pg_coll
+ >$image_coll_v1
+ >$image_coll_v2
+ trap 'echo $func failed; exit' INT HUP
+ while read line
+ do
+ {
+ host=`echo $line|awk '{print $1}'`
+ data_path=`echo $line|awk '{print $2}'`
+ echo "$func: $host"
+ check_osd_process $host
+
+ #pg epoch
+ cmd1="bash $job_path/osd_job cat_pg_epoch $data_path"
+ ssh $ssh_option $host $cmd1 >> $pg_coll
+ #image v1
+ cmd2="bash $job_path/osd_job cat_image_v1 $data_path"
+ ssh $ssh_option $host $cmd2 >> $image_coll_v1
+ #image v2
+ cmd3="bash $job_path/osd_job cat_image_v2 $data_path"
+ ssh $ssh_option $host $cmd3 >> $image_coll_v2
+ } &
+ done < $osd_host_path
+ wait
+ echo "$func: finish"
+}
+
+function scatter_gather()
+{
+ local func="scatter_gather"
+ if [ ! -s $osd_host ];then
+ echo "$func: no osd_host input"
+ exit
+ fi
+ if [ ! -s $mon_host ];then
+ echo "$func: no mon_host input"
+ exit
+ fi
+ scatter_node_jobs
+ gather_node_infos
+}
+
+
+#------------- operations --------------
+
+function database()
+{
+ scatter_gather
+ gen_database
+}
+
+function list()
+{
+ list_images
+}
+
+function lookup()
+{
+ lookup_image $1 $2 $3
+}
+
+function recover()
+{
+ recover_image $1 $2 $3 $4
+}
+
+#------------- helper -------------
+
+function usage()
+{
+ local cmd_name="rbd-recover-tool"
+ echo
+ echo "$cmd_name is used to recover rbd image of replicated pool,
+ when all ceph services are stopped"
+ echo "Usage:"
+ echo "$cmd_name database
+ gather pg info, object info, image metadata,
+ and epoch info from all osd nodes,
+ this will consume a long time, just be patient,
+ especially when scaling up to 1000+ osds"
+ echo "$cmd_name list
+ list all rbd images of all replicated pools,
+ before to lookup & recover"
+ echo "$cmd_name lookup <pool_id>/<image_name>[@[<snap_name>]]
+ show image metadata: image format, rbd id, size, order, snapseq
+ In addition, for image with snapshots,
+ this will list all snapshot information"
+ echo "$cmd_name recover <pool_id>/<image_name>[@[<snap_name>]] [</path/to/store/image>]
+ all snapshots share one image head, to economize disk space
+ so there is only one snapshot at any time,
+ image is saved at </path/to/store/image>/pool_<pool_id>/image_name/image_name
+ cat <path/to/store/image>/pool_<pool_id>/image_name/@CURRENT,
+ will show snapid
+ recover to raw image/nosnap/head: <image_name>
+ rollback to image head: <image_name>@
+ rollback to image snap: <image_name>@<snap_name>
+ recover steps:
+ 1. recover image nosnap (only one time)
+ 2. rollback to image snap"
+}
+}
+
+function get_path()
+{
+ local func="get_path"
+ if [ $# -lt 1 ];then
+ return
+ fi
+ if [[ $1 =~ // ]];then
+ return # "/path//to" is invalid
+ fi
+ local parent=`dirname $1`
+ local name=`basename $1`
+ if [ "$parent"x = "/"x ];then
+ echo "$parent$name"
+ else
+ echo -n "$parent/$name"
+ fi
+}
+
+function admin_cmd()
+{
+ local func="admin_cmd"
+ if [ $# -lt 1 ];then
+ usage
+ exit
+ fi
+ if [ "$1"x = "-h"x ] || [ "$1"x = "--help"x ];then
+ usage
+ exit
+ fi
+
+ if [ "$1"x = "database"x ];then
+ if [ $# -gt 1 ];then
+ usage
+ exit
+ fi
+ # remove osd_host to refresh osd_host and osd_host_mapping
+ rm -f $osd_host
+ init_env_admin
+ database
+ elif [ "$1"x = "list"x ];then
+ if [ $# -gt 1 ];then
+ usage
+ exit
+ fi
+ init_env_admin
+ list
+ elif [ "$1"x = "lookup"x ];then
+ if [ $# -gt 2 ];then
+ usage
+ exit
+ fi
+ local pool_id=-1
+ local image_name=
+ local snap_name=
+ if [[ $2 =~ ^([^@/]+)/([^@/]+)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ elif [[ $2 =~ ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ snap_name="${BASH_REMATCH[3]}"
+ else
+ echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]"
+ exit
+ fi
+ init_env_admin
+ lookup $pool_id $image_name $snap_name
+ elif [ "$1"x = "recover"x ];then
+ if [ $# -lt 2 ] || [ $# -gt 3 ];then
+ usage
+ exit
+ fi
+ local pool_id=-1
+ local image_name=
+ local snap_name=@
+ local image_dir=
+ if [[ $2 =~ ^([^@/]+)/([^@/]+)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ elif [[ $2 =~ ^([^@/]+)/([^@/]+)@([^@/]*)$ ]];then
+ pool_id="${BASH_REMATCH[1]}"
+ image_name="${BASH_REMATCH[2]}"
+ snap_name="${BASH_REMATCH[3]}"
+ if [ "$snap_name"x = ""x ];then
+ snap_name=@@
+ fi
+ else
+ echo "format: $2 is invalid, use <pool_id>/<image_name>[@[<snap_name>]]"
+ exit
+ fi
+ if [ $# = 3 ];then
+ image_dir=`get_path $3`
+ if [ "image_dir"x = ""x ];then
+ echo "$3 invalid"
+ exit
+ fi
+ fi
+ init_env_admin
+ recover $pool_id $image_name $snap_name $image_dir
+ elif [ "$1"x = "scp_files"x ];then
+ if [ $# -gt 1 ];then
+ exit
+ fi
+ admin_parse_osd
+ scp_files
+ elif [ "$1"x = "scp_file"x ];then
+ if [ $# -gt 2 ];then
+ exit
+ fi
+ admin_parse_osd
+ scp_file $2
+ else
+ echo "$func: $1: command not found"
+ fi
+}
+
+admin_cmd $*
diff --git a/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh b/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh
new file mode 100755
index 00000000..876b47b9
--- /dev/null
+++ b/src/tools/rbd_recover_tool/test_rbd_recover_tool.sh
@@ -0,0 +1,542 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2015 Ubuntu Kylin
+#
+# Author: Min Chen <minchen@ubuntukylin.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+# unit test case for rbd-recover-tool
+
+#prepare:
+# - write config files: config/osd_host, config/mon_host, config/storage_path, and config/mds_host if an mds exists
+#step 1. rbd export all images as you need
+#step 2. stop all ceph services
+#step 3. use rbd-recover-tool to recover all images
+#step 4. compare the md5sum of each recovered image with that of the exported image with the same name
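+#
+# e.g. (illustrative, hypothetical hostnames): each config file is a plain list with
+# one entry per line, something like
+#   config/osd_host:  ceph1
+#                     ceph2
+#   config/mon_host:  ceph1
+#   config/mds_host:  ceph1   (the file must exist; it may be empty if there is no mds)
+# config/storage_path points at a directory with enough free space for the recovered images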
+
+ssh_opt="-o ConnectTimeout=1"
+my_dir=$(dirname "$0")
+tool_dir=$my_dir
+
+#storage_path=$my_dir/config/storage_path
+mon_host=$my_dir/config/mon_host
+osd_host=$my_dir/config/osd_host
+mds_host=$my_dir/config/mds_host
+
+test_dir= # `cat $storage_path`
+export_dir= #$test_dir/export
+recover_dir= #$test_dir/recover
+image_names= #$test_dir/image_names
+online_images= #$test_dir/online_images, all images on ceph rbd pool
+gen_db= #$test_dir/gen_db, label database if exist
+pool=rbd
+pool_id=2
+
+function get_pool_id()
+{
+ local func="get_pool_id"
+ local pool_id_file=/tmp/pool_id_file.$$$$
+ ceph osd pool stats $pool|head -n 1|awk '{print $4}' >$pool_id_file
+ if [ $? -ne 0 ];then
+ echo "$func: get pool id failed: pool = $pool"
+ rm -f $pool_id_file
+ exit
+ fi
+ pool_id=`cat $pool_id_file`
+ echo "$func: pool_id = $pool_id"
+ rm -f $pool_id_file
+}
+
+function init()
+{
+ local func="init"
+ if [ $# -eq 0 ];then
+ echo "$func: must input <path> to storage images, enough disk space is good"
+ exit
+ fi
+ if [ ! -s $osd_host ];then
+ echo "$func: config/osd_host not exists or empty"
+ exit
+ fi
+ if [ ! -s $mon_host ];then
+ echo "$func: config/mon_host not exists or empty"
+ exit
+ fi
+ if [ ! -e $mds_host ];then
+ echo "$func: config/mds_host not exists"
+ exit
+ fi
+ test_dir=$1
+ export_dir=$test_dir/export
+ recover_dir=$test_dir/recover
+ image_names=$test_dir/image_names
+ online_images=$test_dir/online_images
+ gen_db=$test_dir/gen_db
+
+ trap 'echo "ceph cluster is stopped ..."; exit;' INT
+ ceph -s >/dev/null
+ get_pool_id
+
+ mkdir -p $test_dir
+ mkdir -p $export_dir
+ mkdir -p $recover_dir
+ rm -rf $export_dir/*
+ rm -rf $recover_dir/*
+}
+
+function do_gen_database()
+{
+ local func="do_gen_database"
+ if [ -s $gen_db ] && [ `cat $gen_db` = 1 ];then
+ echo "$func: database already existed"
+ exit
+ fi
+ bash $tool_dir/rbd-recover-tool database
+ echo 1 >$gen_db
+}
+
+#check if all ceph processes are stopped
+function check_ceph_service()
+{
+ local func="check_ceph_service"
+ local res=`cat $osd_host $mon_host $mds_host|sort -u|tr -d [:blank:]|xargs -n 1 -I @ ssh $ssh_opt @ "ps aux|grep -E \"(ceph-osd|ceph-mon|ceph-mds)\"|grep -v grep"`
+ if [ "$res"x != ""x ];then
+ echo "$func: NOT all ceph services are stopped"
+ return 1
+ exit
+ fi
+ echo "$func: all ceph services are stopped"
+ return 0
+}
+
+function stop_ceph()
+{
+ local func="stop_ceph"
+ #cat osd_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-osd"
+ while read osd
+ do
+ {
+ osd=`echo $osd|tr -d [:blank:]`
+ if [ "$osd"x = ""x ];then
+ continue
+ fi
+ #ssh $ssh_opt $osd "killall ceph-osd ceph-mon ceph-mds" </dev/null
+ ssh $ssh_opt $osd "killall ceph-osd" </dev/null
+ } &
+ done < $osd_host
+ wait
+ echo "waiting kill all osd ..."
+ sleep 1
+ #cat $mon_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mon ceph-osd ceph-mds"
+ cat $mon_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mon"
+ #cat $mds_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mds ceph-mon ceph-osd"
+ cat $mds_host|xargs -n 1 -I @ ssh $ssh_opt @ "killall ceph-mds"
+}
+
+function create_image()
+{
+ local func="create_image"
+ if [ ${#} -lt 3 ];then
+ echo "create_image: parameters: <image_name> <size> <image_format>"
+ exit
+ fi
+ local image_name=$1
+ local size=$2
+ local image_format=$3
+ if [ $image_format -lt 1 ] || [ $image_format -gt 2 ];then
+ echo "$func: image_format must be 1 or 2"
+ exit
+ fi
+ local res=`rbd list|grep -E "^$1$"`
+ echo "$func $image_name ..."
+ if [ "$res"x = ""x ];then
+ rbd -p $pool create $image_name --size $size --image_format $image_format
+ else
+ if [ $image_format -eq 2 ];then
+ rbd snap ls $image_name|tail -n +2|awk '{print $2}'|xargs -n 1 -I % rbd snap unprotect $image_name@%
+ fi
+ rbd snap purge $image_name
+ #rbd rm $image_name
+ rbd -p $pool resize --allow-shrink --size $size $image_name
+ fi
+}
+
+function export_image()
+{
+ local func="export_image"
+
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <image_name> <image_format> [<image_size>]"
+ exit
+ fi
+
+ local image_name=$1
+ local format=$(($2))
+ local size=$(($3)) #MB
+
+ if [ $format -ne 1 ] && [ $format -ne 2 ];then
+ echo "$func: image format must be 1 or 2"
+ exit
+ fi
+
+ if [ $size -eq 0 ];then
+ size=24 #MB
+ echo "$func: size = $size"
+ fi
+ local mnt=/rbdfuse
+
+ mount |grep "rbd-fuse on /rbdfuse" &>/dev/null
+ if [ $? -ne 0 ];then
+ rbd-fuse $mnt
+ fi
+
+ create_image $image_name $size $format
+
+ dd conv=notrunc if=/dev/urandom of=$mnt/$image_name bs=4M count=$(($size/4))
+
+ local export_image_dir=$export_dir/pool_$pool_id/$image_name
+ mkdir -p $export_image_dir
+ local export_md5_nosnap=$export_image_dir/@md5_nosnap
+ >$export_md5_nosnap
+
+ local export_image_path=$export_image_dir/$image_name
+ rm -f $export_image_path
+
+ rbd export $pool/$image_name $export_image_path
+ md5sum $export_image_path |awk '{print $1}' >$export_md5_nosnap
+}
+
+function recover_image()
+{
+ local func="recover_snapshots"
+ if [ $# -lt 1 ];then
+ echo "$func: parameters: <image_name>"
+ exit
+ fi
+
+ local image_name=$1
+ #pool_id=29
+
+ local recover_image_dir=$recover_dir/pool_$pool_id/$image_name
+ mkdir -p $recover_image_dir
+ local recover_md5_nosnap=$recover_image_dir/@md5_nosnap
+ >$recover_md5_nosnap
+ local snapshot=
+
+ bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name $recover_dir
+ md5sum $recover_image_dir/$image_name|awk '{print $1}' >$recover_md5_nosnap
+}
+
+function make_snapshot()
+{
+ local func="make_snapshot"
+ if [ $# -lt 5 ];then
+ echo "$func: parameters: <ofile> <seek> <count> <snap> <export_image_dir>"
+ exit
+ fi
+ local ofile=$1
+ local seek=$(($2))
+ local count=$(($3))
+ local snap=$4
+ local export_image_dir=$5
+
+ if [ $seek -lt 0 ];then
+ echo "$func: seek can not be minus"
+ exit
+ fi
+
+ if [ $count -lt 1 ];then
+ echo "$func: count must great than zero"
+ exit
+ fi
+
+ echo "[$snap] $func ..."
+ echo "$1 $2 $3 $4"
+ rbd snap ls $image_name|grep $snap;
+
+ local res=$?
+ if [ $res -eq 0 ];then
+ return $res
+ fi
+
+ dd conv=notrunc if=/dev/urandom of=$ofile bs=1M count=$count seek=$seek 2>/dev/null
+ snapshot=$image_name@$snap
+ rbd snap create $snapshot
+ rm -f $export_image_dir/$snapshot
+ rbd export $pool/$image_name $export_image_dir/$snapshot
+ pushd $export_image_dir >/dev/null
+ md5sum $snapshot >> @md5
+ popd >/dev/null
+}
+
+function recover_snapshots()
+{
+ local func="recover_snapshots"
+ if [ $# -lt 1 ];then
+ echo "$func: parameters: <image_name>"
+ exit
+ fi
+
+ local image_name=$1
+ #pool_id=29
+
+ local recover_image_dir=$recover_dir/pool_$pool_id/$image_name
+ mkdir -p $recover_image_dir
+ local recover_md5=$recover_image_dir/@md5
+ >$recover_md5
+ local snapshot=
+
+
+ # recover head
+ bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name $recover_dir
+
+ # recover snapshots
+ for((i=1; i<10; i++))
+ do
+ snapshot=snap$i
+ bash $tool_dir/rbd-recover-tool recover $pool_id/$image_name@$snapshot $recover_dir
+ pushd $recover_image_dir >/dev/null
+ local chksum=`md5sum $image_name|awk '{print $1}'`
+ echo "$chksum $image_name@$snapshot" >>@md5
+ popd >/dev/null
+ done
+}
+
+function export_snapshots()
+{
+ local func="export_snapshots"
+
+ if [ $# -lt 2 ];then
+ echo "$func: parameters: <image_name> <image_format> [<image_size>]"
+ exit
+ fi
+
+ local image_name=$1
+ local format=$(($2))
+ local size=$(($3)) #MB
+
+ if [ $format -ne 1 ] && [ $format -ne 2 ];then
+ echo "$func: image format must be 1 or 2"
+ exit
+ fi
+
+ if [ $size -eq 0 ];then
+ size=24 #MB
+ echo "$func: size = $size"
+ fi
+ local mnt=/rbdfuse
+
+ mount |grep "rbd-fuse on /rbdfuse" &>/dev/null
+ if [ $? -ne 0 ];then
+ rbd-fuse $mnt
+ fi
+
+ create_image $image_name $size $format
+
+ local export_image_dir=$export_dir/pool_$pool_id/$image_name
+ mkdir -p $export_image_dir
+ local export_md5=$export_image_dir/@md5
+ >$export_md5
+
+ # create 9 snapshots
+ # image = {object0, object1, object2, object3, object4, object5, ...}
+ #
+ # snap1 : init/write all objects
+ # snap2 : write object0
+ # snap3 : write object1
+ # snap4 : write object2
+ # snap5 : write object3
+ # snap6 : write object4
+ # snap7 : write object5
+ # snap8 : write object0
+ # snap9 : write object3
+
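+ # note: seek/count below are in MB (dd bs=1M); assuming the default 4 MB rbd object size, a 1 MB write at offset 0/4/8/... MB lands in object0/1/2/...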
+ make_snapshot $mnt/$image_name 0 $size snap1 $export_image_dir
+ make_snapshot $mnt/$image_name 0 1 snap2 $export_image_dir
+ make_snapshot $mnt/$image_name 4 1 snap3 $export_image_dir
+ make_snapshot $mnt/$image_name 8 1 snap4 $export_image_dir
+ make_snapshot $mnt/$image_name 12 1 snap5 $export_image_dir
+ make_snapshot $mnt/$image_name 16 1 snap6 $export_image_dir
+ make_snapshot $mnt/$image_name 20 1 snap7 $export_image_dir
+ make_snapshot $mnt/$image_name 1 1 snap8 $export_image_dir
+ make_snapshot $mnt/$image_name 13 1 snap9 $export_image_dir
+}
+
+function check_recover_nosnap()
+{
+ local func="check_recover_nosnap"
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <export_md5_file> <recover_md5_file> <image_name>"
+ exit
+ fi
+ local export_md5=$1
+ local recover_md5=$2
+ local image_name=$3
+
+ local ifpassed="FAILED"
+
+ echo "================ < $image_name nosnap > ================"
+
+ local export_md5sum=`cat $export_md5`
+ local recover_md5sum=`cat $recover_md5`
+
+ if [ "$export_md5sum"x != ""x ] && [ "$export_md5sum"x = "$recover_md5sum"x ];then
+ ifpassed="PASSED"
+ fi
+ echo "export: $export_md5sum"
+ echo "recover: $recover_md5sum $ifpassed"
+}
+
+function check_recover_snapshots()
+{
+ local func="check_recover_snapshots"
+ if [ $# -lt 3 ];then
+ echo "$func: parameters: <export_md5_file> <recover_md5_file> <image_name>"
+ exit
+ fi
+ local export_md5=$1
+ local recover_md5=$2
+ local image_name=$3
+
+ local ifpassed="FAILED"
+
+ echo "================ < $image_name snapshots > ================"
+
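+ # read both md5 files line by line (IFS=newline), then split each line into <checksum> <image@snap> fields (IFS=space)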
+ OIFS=$IFS
+ IFS=$'\n'
+ local export_md5s=(`cat $export_md5`)
+ local recover_md5s=(`cat $recover_md5`)
+ for((i=0; i<9; i++))
+ do
+ OOIFS=$IFS
+ IFS=$' '
+ local x=$(($i+1))
+ snapshot=snap$x
+
+ local export_arr=(`echo ${export_md5s[$i]}`)
+ local recover_arr=(`echo ${recover_md5s[$i]}`)
+ ifpassed="FAILED"
+ echo "export: ${export_md5s[$i]}"
+ if [ "${export_arr[0]}"x != ""x ] && [ "${export_arr[0]}"x = "${recover_arr[0]}"x ];then
+ ifpassed="PASSED"
+ fi
+ echo "recover: ${recover_md5s[$i]} $ifpassed"
+ IFS=$OOIFS
+ done
+ IFS=$OIFS
+}
+
+# step 1: export image, snapshot
+function do_export_nosnap()
+{
+ export_image image_v1_nosnap 1
+ export_image image_v2_nosnap 2
+}
+
+function do_export_snap()
+{
+ export_snapshots image_v1_snap 1
+ export_snapshots image_v2_snap 2
+}
+
+# step 2: stop ceph cluster and gen database
+function stop_cluster_gen_database()
+{
+ trap 'echo stop ceph cluster failed; exit;' INT HUP
+ stop_ceph
+ sleep 2
+ check_ceph_service
+ local res=$?
+ while [ $res -ne 0 ]
+ do
+ stop_ceph
+ sleep 2
+ check_ceph_service
+ res=$?
+ done
+
+ echo 0 >$gen_db
+ do_gen_database
+}
+
+# step 3: recover image,snapshot
+function do_recover_nosnap()
+{
+ recover_image image_v1_nosnap
+ recover_image image_v2_nosnap
+}
+
+function do_recover_snap()
+{
+ recover_snapshots image_v1_snap
+ recover_snapshots image_v2_snap
+}
+
+# step 4: check md5sum pair<export_md5sum, recover_md5sum>
+function do_check_recover_nosnap()
+{
+ local image1=image_v1_nosnap
+ local image2=image_v2_nosnap
+
+ local export_md5_1=$export_dir/pool_$pool_id/$image1/@md5_nosnap
+ local export_md5_2=$export_dir/pool_$pool_id/$image2/@md5_nosnap
+ local recover_md5_1=$recover_dir/pool_$pool_id/$image1/@md5_nosnap
+ local recover_md5_2=$recover_dir/pool_$pool_id/$image2/@md5_nosnap
+
+ check_recover_nosnap $export_md5_1 $recover_md5_1 $image1
+ check_recover_nosnap $export_md5_2 $recover_md5_2 $image2
+}
+
+function do_check_recover_snap()
+{
+ local image1=image_v1_snap
+ local image2=image_v2_snap
+
+ local export_md5_1=$export_dir/pool_$pool_id/$image1/@md5
+ local export_md5_2=$export_dir/pool_$pool_id/$image2/@md5
+ local recover_md5_1=$recover_dir/pool_$pool_id/$image1/@md5
+ local recover_md5_2=$recover_dir/pool_$pool_id/$image2/@md5
+
+ check_recover_snapshots $export_md5_1 $recover_md5_1 $image1
+ check_recover_snapshots $export_md5_2 $recover_md5_2 $image2
+}
+
+function test_case_1()
+{
+ do_export_nosnap
+ stop_cluster_gen_database
+ do_recover_nosnap
+ do_check_recover_nosnap
+}
+
+function test_case_2()
+{
+ do_export_snap
+ stop_cluster_gen_database
+ do_recover_snap
+ do_check_recover_snap
+}
+
+function test_case_3()
+{
+ do_export_nosnap
+ do_export_snap
+
+ stop_cluster_gen_database
+
+ do_recover_nosnap
+ do_recover_snap
+
+ do_check_recover_nosnap
+ do_check_recover_snap
+}
+
+
+init $*
+test_case_3
diff --git a/src/tools/rebuild_mondb.cc b/src/tools/rebuild_mondb.cc
new file mode 100644
index 00000000..8e3d5b45
--- /dev/null
+++ b/src/tools/rebuild_mondb.cc
@@ -0,0 +1,351 @@
+#include "auth/cephx/CephxKeyServer.h"
+#include "common/errno.h"
+#include "mon/AuthMonitor.h"
+#include "mon/MonitorDBStore.h"
+#include "os/ObjectStore.h"
+#include "osd/OSD.h"
+
+static int update_auth(const string& keyring_path,
+ const OSDSuperblock& sb,
+ MonitorDBStore& ms);
+static int update_monitor(const OSDSuperblock& sb, MonitorDBStore& ms);
+static int update_osdmap(ObjectStore& fs,
+ OSDSuperblock& sb,
+ MonitorDBStore& ms);
+
+int update_mon_db(ObjectStore& fs, OSDSuperblock& sb,
+ const string& keyring,
+ const string& store_path)
+{
+ MonitorDBStore ms(store_path);
+ int r = ms.create_and_open(cerr);
+ if (r < 0) {
+ cerr << "unable to open mon store: " << store_path << std::endl;
+ return r;
+ }
+ if ((r = update_auth(keyring, sb, ms)) < 0) {
+ goto out;
+ }
+ if ((r = update_osdmap(fs, sb, ms)) < 0) {
+ goto out;
+ }
+ if ((r = update_monitor(sb, ms)) < 0) {
+ goto out;
+ }
+ out:
+ ms.close();
+ return r;
+}
+
+static void add_auth(KeyServerData::Incremental& auth_inc,
+ MonitorDBStore& ms)
+{
+ AuthMonitor::Incremental inc;
+ inc.inc_type = AuthMonitor::AUTH_DATA;
+ encode(auth_inc, inc.auth_data);
+ inc.auth_type = CEPH_AUTH_CEPHX;
+
+ bufferlist bl;
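+ // AuthMonitor's paxos value begins with a one-byte format version, followed by
+ // the encoded incremental(s); this mirrors what AuthMonitor::encode_pending() writes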
+ __u8 v = 1;
+ encode(v, bl);
+ inc.encode(bl, CEPH_FEATURES_ALL);
+
+ const string prefix("auth");
+ auto last_committed = ms.get(prefix, "last_committed") + 1;
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put(prefix, last_committed, bl);
+ t->put(prefix, "last_committed", last_committed);
+ auto first_committed = ms.get(prefix, "first_committed");
+ if (!first_committed) {
+ t->put(prefix, "first_committed", last_committed);
+ }
+ ms.apply_transaction(t);
+}
+
+static int get_auth_inc(const string& keyring_path,
+ const OSDSuperblock& sb,
+ KeyServerData::Incremental* auth_inc)
+{
+ auth_inc->op = KeyServerData::AUTH_INC_ADD;
+
+ // get the name
+ EntityName entity;
+ // assuming the entity name of OSD is "osd.<osd_id>"
+ entity.set(CEPH_ENTITY_TYPE_OSD, std::to_string(sb.whoami));
+ auth_inc->name = entity;
+
+ // read keyring from disk
+ KeyRing keyring;
+ {
+ bufferlist bl;
+ string error;
+ int r = bl.read_file(keyring_path.c_str(), &error);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ cout << "ignoring keyring (" << keyring_path << ")"
+ << ": " << error << std::endl;
+ return 0;
+ } else {
+ cerr << "unable to read keyring (" << keyring_path << ")"
+ << ": " << error << std::endl;
+ return r;
+ }
+ } else if (bl.length() == 0) {
+ cout << "ignoring empty keyring: " << keyring_path << std::endl;
+ return 0;
+ }
+ auto bp = bl.cbegin();
+ try {
+ decode(keyring, bp);
+ } catch (const buffer::error& e) {
+ cerr << "error decoding keyring: " << keyring_path << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ // get the key
+ EntityAuth new_inc;
+ if (!keyring.get_auth(auth_inc->name, new_inc)) {
+ cerr << "key for " << auth_inc->name << " not found in keyring: "
+ << keyring_path << std::endl;
+ return -EINVAL;
+ }
+ auth_inc->auth.key = new_inc.key;
+
+ // get the caps
+ map<string,bufferlist> caps;
+ if (new_inc.caps.empty()) {
+ // fall back to the default caps for an OSD,
+ // i.e. osd 'allow *' mon 'allow rwx',
+ // as suggested by the documentation.
+ encode(string("allow *"), caps["osd"]);
+ encode(string("allow rwx"), caps["mon"]);
+ } else {
+ caps = new_inc.caps;
+ }
+ auth_inc->auth.caps = caps;
+ return 0;
+}
+
+// rebuild
+// - auth/${epoch}
+// - auth/first_committed
+// - auth/last_committed
+static int update_auth(const string& keyring_path,
+ const OSDSuperblock& sb,
+ MonitorDBStore& ms)
+{
+ // stolen from AuthMonitor::prepare_command(), where prefix is "auth add"
+ KeyServerData::Incremental auth_inc;
+ int r;
+ if ((r = get_auth_inc(keyring_path, sb, &auth_inc))) {
+ return r;
+ }
+ add_auth(auth_inc, ms);
+ return 0;
+}
+
+// stolen from Monitor::check_fsid()
+static int check_fsid(const uuid_d& fsid, MonitorDBStore& ms)
+{
+ bufferlist bl;
+ int r = ms.get("monitor", "cluster_uuid", bl);
+ if (r == -ENOENT)
+ return r;
+ string uuid(bl.c_str(), bl.length());
+ auto end = uuid.find_first_of('\n');
+ if (end != uuid.npos) {
+ uuid.resize(end);
+ }
+ uuid_d existing;
+ if (!existing.parse(uuid.c_str())) {
+ cerr << "error: unable to parse uuid" << std::endl;
+ return -EINVAL;
+ }
+ if (fsid != existing) {
+ cerr << "error: cluster_uuid " << existing << " != " << fsid << std::endl;
+ return -EEXIST;
+ }
+ return 0;
+}
+
+// rebuild
+// - monitor/cluster_uuid
+int update_monitor(const OSDSuperblock& sb, MonitorDBStore& ms)
+{
+ switch (check_fsid(sb.cluster_fsid, ms)) {
+ case -ENOENT:
+ break;
+ case -EINVAL:
+ return -EINVAL;
+ case -EEXIST:
+ return -EEXIST;
+ case 0:
+ return 0;
+ default:
+ ceph_abort();
+ }
+ string uuid = stringify(sb.cluster_fsid) + "\n";
+ bufferlist bl;
+ bl.append(uuid);
+ auto t = make_shared<MonitorDBStore::Transaction>();
+ t->put("monitor", "cluster_uuid", bl);
+ ms.apply_transaction(t);
+ return 0;
+}
+
+// rebuild
+// - osdmap/${epoch}
+// - osdmap/full_${epoch}
+// - osdmap/full_latest
+// - osdmap/first_committed
+// - osdmap/last_committed
+int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms)
+{
+ const string prefix("osdmap");
+ const string first_committed_name("first_committed");
+ const string last_committed_name("last_committed");
+ epoch_t first_committed = ms.get(prefix, first_committed_name);
+ epoch_t last_committed = ms.get(prefix, last_committed_name);
+ auto t = make_shared<MonitorDBStore::Transaction>();
+
+ // trim stale maps
+ unsigned ntrimmed = 0;
+ // osdmap epochs start at 1. if first_committed is 0 there is nothing to
+ // trim, and reporting "1 osdmaps trimmed" in the output would be misleading,
+ // so treat that case as an exception.
+ for (auto e = first_committed; first_committed && e < sb.oldest_map; e++) {
+ t->erase(prefix, e);
+ t->erase(prefix, ms.combine_strings("full", e));
+ ntrimmed++;
+ }
+ // make sure we have a non-zero first_committed; OSDMonitor relies on this,
+ // because PaxosService::put_last_committed() sets it to last_committed if it
+ // is zero, which breaks OSDMonitor::update_from_paxos(), where we assume
+ // that latest_full is always greater than last_committed.
+ if (first_committed == 0 && sb.oldest_map < sb.newest_map) {
+ first_committed = 1;
+ } else if (ntrimmed) {
+ first_committed += ntrimmed;
+ }
+ if (first_committed) {
+ t->put(prefix, first_committed_name, first_committed);
+ ms.apply_transaction(t);
+ t = make_shared<MonitorDBStore::Transaction>();
+ }
+
+ unsigned nadded = 0;
+
+ auto ch = fs.open_collection(coll_t::meta());
+ OSDMap osdmap;
+ for (auto e = std::max(last_committed+1, sb.oldest_map);
+ e <= sb.newest_map; e++) {
+ bool have_crc = false;
+ uint32_t crc = -1;
+ uint64_t features = 0;
+ // add inc maps
+ auto add_inc_result = [&] {
+ const auto oid = OSD::get_inc_osdmap_pobject_name(e);
+ bufferlist bl;
+ int nread = fs.read(ch, oid, 0, 0, bl);
+ if (nread <= 0) {
+ cout << "missing " << oid << std::endl;
+ return -ENOENT;
+ }
+ t->put(prefix, e, bl);
+
+ OSDMap::Incremental inc;
+ auto p = bl.cbegin();
+ inc.decode(p);
+ features = inc.encode_features | CEPH_FEATURE_RESERVED;
+ if (osdmap.get_epoch() && e > 1) {
+ if (osdmap.apply_incremental(inc)) {
+ cerr << "bad fsid: "
+ << osdmap.get_fsid() << " != " << inc.fsid << std::endl;
+ return -EINVAL;
+ }
+ have_crc = inc.have_crc;
+ if (inc.have_crc) {
+ crc = inc.full_crc;
+ bufferlist fbl;
+ osdmap.encode(fbl, features);
+ if (osdmap.get_crc() != inc.full_crc) {
+ cerr << "mismatched inc crc: "
+ << osdmap.get_crc() << " != " << inc.full_crc << std::endl;
+ return -EINVAL;
+ }
+ // inc.decode() verifies `inc_crc`, so it's been taken care of.
+ }
+ }
+ return 0;
+ }();
+ switch (add_inc_result) {
+ case -ENOENT:
+ // no worries, we always have full map
+ break;
+ case -EINVAL:
+ return -EINVAL;
+ case 0:
+ break;
+ default:
+ assert(0);
+ }
+ // add full maps
+ {
+ const auto oid = OSD::get_osdmap_pobject_name(e);
+ bufferlist bl;
+ int nread = fs.read(ch, oid, 0, 0, bl);
+ if (nread <= 0) {
+ cerr << "missing " << oid << std::endl;
+ return -EINVAL;
+ }
+ t->put(prefix, ms.combine_strings("full", e), bl);
+
+ auto p = bl.cbegin();
+ osdmap.decode(p);
+ if (osdmap.have_crc()) {
+ if (have_crc && osdmap.get_crc() != crc) {
+ cerr << "mismatched full/inc crc: "
+ << osdmap.get_crc() << " != " << crc << std::endl;
+ return -EINVAL;
+ }
+ uint32_t saved_crc = osdmap.get_crc();
+ bufferlist fbl;
+ osdmap.encode(fbl, features);
+ if (osdmap.get_crc() != saved_crc) {
+ cerr << "mismatched full crc: "
+ << saved_crc << " != " << osdmap.get_crc() << std::endl;
+ return -EINVAL;
+ }
+ }
+ }
+ nadded++;
+
+ // last_committed
+ t->put(prefix, last_committed_name, e);
+ // full last
+ t->put(prefix, ms.combine_strings("full", "latest"), e);
+
+ // this number comes from the default value of osd_target_transaction_size,
+ // so we neither run out of memory nor stuff too many maps into a single
+ // transaction when the OSD is keeping a long series of osdmaps
+ static constexpr unsigned TRANSACTION_SIZE = 30;
+ if (t->size() >= TRANSACTION_SIZE) {
+ ms.apply_transaction(t);
+ t = make_shared<MonitorDBStore::Transaction>();
+ }
+ }
+ if (!t->empty()) {
+ ms.apply_transaction(t);
+ }
+ t.reset();
+
+ string osd_name("osd.");
+ osd_name += std::to_string(sb.whoami);
+ cout << std::left << setw(8)
+ << osd_name << ": "
+ << ntrimmed << " osdmaps trimmed, "
+ << nadded << " osdmaps added." << std::endl;
+ return 0;
+}
+
diff --git a/src/tools/rebuild_mondb.h b/src/tools/rebuild_mondb.h
new file mode 100644
index 00000000..8a2317d8
--- /dev/null
+++ b/src/tools/rebuild_mondb.h
@@ -0,0 +1,9 @@
+#pragma once
+#include <string>
+
+class ObjectStore;
+class OSDSuperblock;
+
+int update_mon_db(ObjectStore& fs, OSDSuperblock& sb,
+ const std::string& keyring_path,
+ const std::string& store_path);
diff --git a/src/tools/rgw/parse-cr-dump.py b/src/tools/rgw/parse-cr-dump.py
new file mode 100755
index 00000000..539929b1
--- /dev/null
+++ b/src/tools/rgw/parse-cr-dump.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python
+from __future__ import print_function
+from collections import Counter
+import argparse
+import json
+import re
+import sys
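+
+# Example usage (hypothetical socket/file paths):
+#   ceph daemon /var/run/ceph/client.rgw.asok cr dump > cr.json
+#   ./parse-cr-dump.py --filename cr.json crs --group status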
+
+def gen_mgrs(args, cr_dump):
+ """ traverse and return one manager at a time """
+ mgrs = cr_dump['coroutine_managers']
+ if args.manager is not None:
+ yield mgrs[args.manager]
+ else:
+ for mgr in mgrs:
+ yield mgr
+
+def gen_stacks(args, cr_dump):
+ """ traverse and return one stack at a time """
+ for mgr in gen_mgrs(args, cr_dump):
+ for ctx in mgr['run_contexts']:
+ for stack in ctx['entries']:
+ yield stack
+
+def gen_ops(args, cr_dump):
+ """ traverse and return one op at a time """
+ for stack in gen_stacks(args, cr_dump):
+ for op in stack['ops']:
+ yield stack, op
+
+def op_status(op):
+ """ return op status or (none) """
+ # "status": {"status": "...", "timestamp": "..."}
+ return op.get('status', {}).get('status', '(none)')
+
+def do_crs(args, cr_dump):
+ """ print a sorted list of coroutines """
+ counter = Counter()
+
+ if args.group == 'status':
+ print('Count:\tStatus:')
+ for _, op in gen_ops(args, cr_dump):
+ if args.filter and not re.search(args.filter, op['type']):
+ continue
+ counter[op_status(op)] += 1
+ else:
+ print('Count:\tCoroutine:')
+ for _, op in gen_ops(args, cr_dump):
+ name = op['type']
+ if args.filter and not re.search(args.filter, name):
+ continue
+ counter[name] += 1
+
+ crs = counter.most_common()
+
+ if args.order == 'asc':
+ crs.reverse()
+ if args.limit:
+ crs = crs[:args.limit]
+
+ for op in crs:
+ print('%d\t%s' % (op[1], op[0]))
+ print('Total:', sum(counter.values()))
+ return 0
+
+def match_ops(name, ops):
+ """ return true if any op matches the given filter """
+ for op in ops:
+ if re.search(name, op):
+ return True
+ return False
+
+def do_stacks(args, cr_dump):
+ """ print a list of coroutine stacks """
+ print('Stack:\t\tCoroutines:')
+ count = 0
+ for stack in gen_stacks(args, cr_dump):
+ stack_id = stack['stack']
+ ops = [op['type'] for op in stack['ops']]
+ if args.filter and not match_ops(args.filter, ops):
+ continue
+ if args.limit and count == args.limit:
+ print('...')
+ break
+ print('%s\t%s' % (stack_id, ', '.join(ops)))
+ count += 1
+ print('Total:', count)
+ return 0
+
+def traverse_spawned_stacks(args, stack, depth, stacks, callback):
+ """ recurse through spawned stacks, passing each op to the callback """
+ for op in stack['ops']:
+ # only filter ops in base stack
+ if depth == 0 and args.filter and not re.search(args.filter, op['type']):
+ continue
+ if not callback(stack, op, depth):
+ return False
+ for spawned in op.get('spawned', []):
+ s = stacks.get(spawned)
+ if not s:
+ continue
+ if not traverse_spawned_stacks(args, s, depth + 1, stacks, callback):
+ return False
+ return True
+
+def do_stack(args, cr_dump):
+ """ inspect a given stack and its descendents """
+ # build a lookup table of stacks by id
+ stacks = {s['stack']: s for s in gen_stacks(args, cr_dump)}
+
+ stack = stacks.get(args.stack)
+ if not stack:
+ print('Stack %s not found' % args.stack, file=sys.stderr)
+ return 1
+
+ do_stack.count = 0 # for use in closure
+ def print_stack_op(stack, op, depth):
+ indent = ' ' * depth * 4
+ if args.limit and do_stack.count == args.limit:
+ print('%s...' % indent)
+ return False # stop traversal
+ do_stack.count += 1
+ print('%s[%s] %s: %s' % (indent, stack['stack'], op['type'], op_status(op)))
+ return True
+
+ traverse_spawned_stacks(args, stack, 0, stacks, print_stack_op)
+ return 0
+
+def do_spawned(args, cr_dump):
+ """ search all ops for the given spawned stack """
+ for stack, op in gen_ops(args, cr_dump):
+ if args.stack in op.get('spawned', []):
+ print('Stack %s spawned by [%s] %s' % (args.stack, stack['stack'], op['type']))
+ return 0
+ print('Stack %s not spawned' % args.stack, file=sys.stderr)
+ return 1
+
+def main():
+ parser = argparse.ArgumentParser(description='Parse and inspect the output of the "cr dump" admin socket command.')
+ parser.add_argument('--filename', type=argparse.FileType(), default=sys.stdin, help='Input filename (defaults to stdin)')
+ parser.add_argument('--filter', type=str, help='Filter by coroutine type (regex syntax is supported)')
+ parser.add_argument('--limit', type=int)
+ parser.add_argument('--manager', type=int, help='Index into coroutine_managers[]')
+
+ subparsers = parser.add_subparsers()
+
+ crs_parser = subparsers.add_parser('crs', help='Produce a sorted list of coroutines')
+ crs_parser.add_argument('--group', type=str, choices=['type', 'status'])
+ crs_parser.add_argument('--order', type=str, choices=['desc', 'asc'])
+ crs_parser.set_defaults(func=do_crs)
+
+ stacks_parser = subparsers.add_parser('stacks', help='Produce a list of coroutine stacks and their ops')
+ stacks_parser.set_defaults(func=do_stacks)
+
+ stack_parser = subparsers.add_parser('stack', help='Inspect a given coroutine stack')
+ stack_parser.add_argument('stack', type=str)
+ stack_parser.set_defaults(func=do_stack)
+
+ spawned_parser = subparsers.add_parser('spawned', help='Find the op that spawned the given stack')
+ spawned_parser.add_argument('stack', type=str)
+ spawned_parser.set_defaults(func=do_spawned)
+
+ args = parser.parse_args()
+ return args.func(args, json.load(args.filename))
+
+if __name__ == "__main__":
+ result = main()
+ sys.exit(result)
diff --git a/src/tools/scratchtool.c b/src/tools/scratchtool.c
new file mode 100644
index 00000000..899447ec
--- /dev/null
+++ b/src/tools/scratchtool.c
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+static int do_rados_setxattr(rados_ioctx_t io_ctx, const char *oid,
+ const char *key, const char *val)
+{
+ int ret = rados_setxattr(io_ctx, oid, key, val, strlen(val) + 1);
+ if (ret < 0) {
+ printf("rados_setxattr failed with error %d\n", ret);
+ return 1;
+ }
+ printf("rados_setxattr %s=%s\n", key, val);
+ return 0;
+}
+
+static int do_rados_getxattr(rados_ioctx_t io_ctx, const char *oid,
+ const char *key, const char *expected)
+{
+ size_t blen = strlen(expected) + 1;
+ char buf[blen];
+ memset(buf, 0, sizeof(buf));
+ int r = rados_getxattr(io_ctx, oid, key, buf, blen);
+ if (r < 0) {
+ printf("rados_getxattr(%s) failed with error %d\n", key, r);
+ return 1;
+ }
+ if (strcmp(buf, expected) != 0) {
+ printf("rados_getxattr(%s) got wrong result! "
+ "expected: '%s'. got '%s'\n", key, expected, buf);
+ return 1;
+ }
+ printf("rados_getxattr %s=%s\n", key, buf);
+ return 0;
+}
+
+static int do_rados_getxattrs(rados_ioctx_t io_ctx, const char *oid,
+ const char **exkeys, const char **exvals)
+{
+ rados_xattrs_iter_t iter;
+ int nval = 0, i, nfound = 0, r = 0, ret = 1;
+
+ for (i = 0; exvals[i]; ++i) {
+ ++nval;
+ }
+ r = rados_getxattrs(io_ctx, oid, &iter);
+ if (r) {
+ printf("rados_getxattrs(%s) failed with error %d\n", oid, r);
+ return 1;
+ }
+ while (1) {
+ size_t len;
+ const char *key, *val;
+ r = rados_getxattrs_next(iter, &key, &val, &len);
+ if (r) {
+ printf("rados_getxattrs(%s): rados_getxattrs_next "
+ "returned error %d\n", oid, r);
+ goto out_err;
+ }
+ if (!key)
+ break;
+ for (i = 0; i < nval; ++i) {
+ if (strcmp(exkeys[i], key))
+ continue;
+ if ((len == strlen(exvals[i]) + 1) && (val != NULL) && (!strcmp(exvals[i], val))) {
+ nfound++;
+ break;
+ }
+ printf("rados_getxattrs(%s): got key %s, but the "
+ "value was %s rather than %s.\n",
+ oid, key, val, exvals[i]);
+ goto out_err;
+ }
+ }
+ if (nfound != nval) {
+ printf("rados_getxattrs(%s): only found %d extended attributes. "
+ "Expected %d\n", oid, nfound, nval);
+ goto out_err;
+ }
+ ret = 0;
+ printf("rados_getxattrs(%s)\n", oid);
+
+out_err:
+ rados_getxattrs_end(iter);
+ return ret;
+}
+
+static int testrados(void)
+{
+ char tmp[32];
+ int i, r;
+ int ret = 1; // 1 indicates failure
+ rados_t cl;
+
+ if (rados_create(&cl, NULL) < 0) {
+ printf("error initializing\n");
+ return 1;
+ }
+
+ if (rados_conf_read_file(cl, NULL)) {
+ printf("error reading configuration file\n");
+ goto out_err;
+ }
+
+ // Try to set a configuration option that doesn't exist.
+ // This should fail.
+ if (!rados_conf_set(cl, "config option that doesn't exist",
+ "some random value")) {
+ printf("error: succeeded in setting nonexistent config option\n");
+ goto out_err;
+ }
+
+ if (rados_conf_get(cl, "log to stderr", tmp, sizeof(tmp))) {
+ printf("error: failed to read log_to_stderr from config\n");
+ goto out_err;
+ }
+
+ // Can we change it?
+ if (rados_conf_set(cl, "log to stderr", "true")) {
+ printf("error: error setting log_to_stderr\n");
+ goto out_err;
+ }
+ if (rados_conf_get(cl, "log to stderr", tmp, sizeof(tmp))) {
+ printf("error: failed to read log_to_stderr from config\n");
+ goto out_err;
+ }
+ if (strcmp(tmp, "true")) {
+ printf("error: new setting for log_to_stderr failed to take effect.\n");
+ goto out_err;
+ }
+
+ if (rados_connect(cl)) {
+ printf("error connecting\n");
+ goto out_err;
+ }
+ if (rados_connect(cl) == 0) {
+ printf("second connect attempt didn't return an error\n");
+ goto out_err;
+ }
+
+ /* create an io_ctx */
+ r = rados_pool_create(cl, "foo");
+ printf("rados_pool_create = %d\n", r);
+
+ rados_ioctx_t io_ctx;
+ r = rados_ioctx_create(cl, "foo", &io_ctx);
+ if (r < 0) {
+ printf("error creating ioctx\n");
+ goto out_err;
+ }
+ printf("rados_ioctx_create = %d, io_ctx = %p\n", r, io_ctx);
+
+ /* list all pools */
+ {
+ int buf_sz = rados_pool_list(cl, NULL, 0);
+ printf("need buffer size of %d\n", buf_sz);
+ char buf[buf_sz];
+ int r = rados_pool_list(cl, buf, buf_sz);
+ if (r != buf_sz) {
+ printf("buffer size mismatch: got %d the first time, but %d "
+ "the second.\n", buf_sz, r);
+ goto out_err_cleanup;
+ }
+ const char *b = buf;
+ printf("begin pools.\n");
+ while (1) {
+ if (b[0] == '\0')
+ break;
+ printf(" pool: '%s'\n", b);
+ b += strlen(b) + 1;
+ };
+ printf("end pools.\n");
+ }
+
+
+ /* stat */
+ struct rados_pool_stat_t st;
+ r = rados_ioctx_pool_stat(io_ctx, &st);
+ printf("rados_ioctx_pool_stat = %d, %lld KB, %lld objects\n", r, (long long)st.num_kb, (long long)st.num_objects);
+
+ /* snapshots */
+ r = rados_ioctx_snap_create(io_ctx, "snap1");
+ printf("rados_ioctx_snap_create snap1 = %d\n", r);
+ rados_snap_t snaps[10];
+ r = rados_ioctx_snap_list(io_ctx, snaps, 10);
+ for (i=0; i<r; i++) {
+ char name[100];
+ rados_ioctx_snap_get_name(io_ctx, snaps[i], name, sizeof(name));
+ printf("rados_ioctx_snap_list got snap %lld %s\n", (long long)snaps[i], name);
+ }
+ rados_snap_t snapid;
+ r = rados_ioctx_snap_lookup(io_ctx, "snap1", &snapid);
+ printf("rados_ioctx_snap_lookup snap1 got %lld, result %d\n", (long long)snapid, r);
+ r = rados_ioctx_snap_remove(io_ctx, "snap1");
+ printf("rados_ioctx_snap_remove snap1 = %d\n", r);
+
+ /* sync io */
+ time_t tm;
+ char buf[128], buf2[128];
+ time(&tm);
+ snprintf(buf, 128, "%s", ctime(&tm));
+ const char *oid = "foo_object";
+ r = rados_write(io_ctx, oid, buf, strlen(buf) + 1, 0);
+ printf("rados_write = %d\n", r);
+ r = rados_read(io_ctx, oid, buf2, sizeof(buf2), 0);
+ printf("rados_read = %d\n", r);
+ if (memcmp(buf, buf2, r))
+ printf("*** content mismatch ***\n");
+
+ /* attrs */
+ if (do_rados_setxattr(io_ctx, oid, "b", "2"))
+ goto out_err_cleanup;
+ if (do_rados_setxattr(io_ctx, oid, "a", "1"))
+ goto out_err_cleanup;
+ if (do_rados_setxattr(io_ctx, oid, "c", "3"))
+ goto out_err_cleanup;
+ if (do_rados_getxattr(io_ctx, oid, "a", "1"))
+ goto out_err_cleanup;
+ if (do_rados_getxattr(io_ctx, oid, "b", "2"))
+ goto out_err_cleanup;
+ if (do_rados_getxattr(io_ctx, oid, "c", "3"))
+ goto out_err_cleanup;
+ const char *exkeys[] = { "a", "b", "c", NULL };
+ const char *exvals[] = { "1", "2", "3", NULL };
+ if (do_rados_getxattrs(io_ctx, oid, exkeys, exvals))
+ goto out_err_cleanup;
+
+ uint64_t size;
+ time_t mtime;
+ r = rados_stat(io_ctx, oid, &size, &mtime);
+ printf("rados_stat size = %lld mtime = %d = %d\n", (long long)size, (int)mtime, r);
+ r = rados_stat(io_ctx, "does_not_exist", NULL, NULL);
+ printf("rados_stat(does_not_exist) = %d\n", r);
+
+ /* exec */
+ rados_exec(io_ctx, oid, "crypto", "md5", buf, strlen(buf) + 1, buf, 128);
+ printf("exec result=%s\n", buf);
+ r = rados_read(io_ctx, oid, buf2, 128, 0);
+ printf("read result=%s\n", buf2);
+ printf("size=%d\n", r);
+
+ /* aio */
+ rados_completion_t a, b;
+ rados_aio_create_completion(0, 0, 0, &a);
+ rados_aio_create_completion(0, 0, 0, &b);
+ rados_aio_write(io_ctx, "a", a, buf, 100, 0);
+ rados_aio_write(io_ctx, "../b/bb_bb_bb\\foo\\bar", b, buf, 100, 0);
+ rados_aio_wait_for_safe(a);
+ printf("a safe\n");
+ rados_aio_wait_for_safe(b);
+ printf("b safe\n");
+ rados_aio_release(a);
+ rados_aio_release(b);
+
+ /* test flush */
+ printf("testing aio flush\n");
+ rados_completion_t c;
+ rados_aio_create_completion(0, 0, 0, &c);
+ rados_aio_write(io_ctx, "c", c, buf, 100, 0);
+ int safe = rados_aio_is_safe(c);
+ printf("a should not yet be safe and ... %s\n", safe ? "is":"is not");
+ assert(!safe);
+ rados_aio_flush(io_ctx);
+ safe = rados_aio_is_safe(c);
+ printf("a should be safe and ... %s\n", safe ? "is":"is not");
+ assert(safe);
+ rados_aio_release(c);
+
+ rados_read(io_ctx, "../b/bb_bb_bb\\foo\\bar", buf2, 128, 0);
+
+ /* list objects */
+ rados_list_ctx_t h;
+ r = rados_nobjects_list_open(io_ctx, &h);
+ printf("rados_nobjects_list_open = %d, h = %p\n", r, h);
+ const char *poolname;
+ while (rados_nobjects_list_next2(h, &poolname, NULL, NULL, NULL, NULL, NULL) == 0)
+ printf("rados_nobjects_list_next2 got object '%s'\n", poolname);
+ rados_nobjects_list_close(h);
+
+ /* stat */
+ r = rados_ioctx_pool_stat(io_ctx, &st);
+ printf("rados_stat_pool = %d, %lld KB, %lld objects\n", r, (long long)st.num_kb, (long long)st.num_objects);
+
+ ret = 0;
+
+out_err_cleanup:
+ /* delete a pool */
+ rados_ioctx_destroy(io_ctx);
+
+ r = rados_pool_delete(cl, "foo");
+ printf("rados_delete_pool = %d\n", r);
+
+out_err:
+ rados_shutdown(cl);
+ return ret;
+}
+
+int main(int argc, const char **argv)
+{
+ return testrados();
+}
diff --git a/src/tools/scratchtoolpp.cc b/src/tools/scratchtoolpp.cc
new file mode 100644
index 00000000..26a35beb
--- /dev/null
+++ b/src/tools/scratchtoolpp.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+
+using namespace librados;
+
+#include <iostream>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+ str[0] = '\0';
+ for (int i = 0; i < len; i++) {
+ sprintf(&str[i*2], "%02x", (int)buf[i]);
+ }
+}
+
+class C_Watch : public WatchCtx {
+public:
+ C_Watch() {}
+ void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) override {
+ cout << "C_Watch::notify() opcode=" << (int)opcode << " ver=" << ver << std::endl;
+ }
+};
+
+void testradospp_milestone(void)
+{
+ int c;
+ cout << "*** press enter to continue ***" << std::endl;
+ while ((c = getchar()) != EOF) {
+ if (c == '\n')
+ break;
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ Rados rados;
+ if (rados.init(NULL) < 0) {
+ cerr << "couldn't initialize rados!" << std::endl;
+ exit(1);
+ }
+
+ if (rados.conf_read_file(NULL)) {
+ cerr << "couldn't read configuration file." << std::endl;
+ exit(1);
+ }
+ rados.conf_parse_argv(argc, argv);
+
+ if (!rados.conf_set("config option that doesn't exist",
+ "some random value")) {
+ printf("error: succeeded in setting nonexistent config option\n");
+ exit(1);
+ }
+ if (rados.conf_set("log to stderr", "true")) {
+ printf("error: error setting log_to_stderr\n");
+ exit(1);
+ }
+ std::string tmp;
+ if (rados.conf_get("log to stderr", tmp)) {
+ printf("error: failed to read log_to_stderr from config\n");
+ exit(1);
+ }
+ if (tmp != "true") {
+ printf("error: new setting for log_to_stderr failed to take effect.\n");
+ exit(1);
+ }
+
+ if (rados.connect()) {
+ printf("error connecting\n");
+ exit(1);
+ }
+
+ cout << "rados_initialize completed" << std::endl;
+ testradospp_milestone();
+
+ time_t tm;
+ bufferlist bl, bl2, blf;
+ char buf[128];
+
+ time(&tm);
+ snprintf(buf, 128, "%s", ctime(&tm));
+ bl.append(buf, strlen(buf));
+ blf.append(buf, 16);
+
+ const char *oid = "bar";
+
+ int r = rados.pool_create("foo");
+ cout << "pool_create result = " << r << std::endl;
+
+ IoCtx io_ctx;
+ r = rados.ioctx_create("foo", io_ctx);
+ cout << "ioctx_create result = " << r << std::endl;
+
+ r = io_ctx.write(oid, bl, bl.length(), 0);
+ uint64_t objver = io_ctx.get_last_version();
+ ceph_assert(objver > 0);
+ cout << "io_ctx.write returned " << r << " last_ver=" << objver << std::endl;
+
+ uint64_t stat_size;
+ time_t stat_mtime;
+ r = io_ctx.stat(oid, &stat_size, &stat_mtime);
+ cout << "io_ctx.stat returned " << r << " size = " << stat_size << " mtime = " << stat_mtime << std::endl;
+
+ r = io_ctx.stat("does_not_exist", NULL, NULL);
+ cout << "io_ctx.stat(does_not_exist) = " << r << std::endl;
+
+ uint64_t handle;
+ C_Watch wc;
+ r = io_ctx.watch(oid, objver, &handle, &wc);
+ cout << "io_ctx.watch returned " << r << std::endl;
+
+ testradospp_milestone();
+ io_ctx.set_notify_timeout(7);
+ bufferlist notify_bl;
+ r = io_ctx.notify(oid, objver, notify_bl);
+ cout << "io_ctx.notify returned " << r << std::endl;
+ testradospp_milestone();
+
+ r = io_ctx.notify(oid, objver, notify_bl);
+ cout << "io_ctx.notify returned " << r << std::endl;
+ testradospp_milestone();
+
+ r = io_ctx.unwatch(oid, handle);
+ cout << "io_ctx.unwatch returned " << r << std::endl;
+ testradospp_milestone();
+
+ r = io_ctx.notify(oid, objver, notify_bl);
+ cout << "io_ctx.notify returned " << r << std::endl;
+ testradospp_milestone();
+ io_ctx.set_assert_version(objver);
+
+ r = io_ctx.write(oid, bl, bl.length() - 1, 0);
+ cout << "io_ctx.write returned " << r << std::endl;
+
+ r = io_ctx.write(oid, bl, bl.length() - 2, 0);
+ cout << "io_ctx.write returned " << r << std::endl;
+ r = io_ctx.write(oid, bl, bl.length() - 3, 0);
+ cout << "rados.write returned " << r << std::endl;
+ r = io_ctx.append(oid, bl, bl.length());
+ cout << "rados.write returned " << r << std::endl;
+ r = io_ctx.write_full(oid, blf);
+ cout << "rados.write_full returned " << r << std::endl;
+ r = io_ctx.read(oid, bl, bl.length(), 0);
+ cout << "rados.read returned " << r << std::endl;
+ r = io_ctx.trunc(oid, 8);
+ cout << "rados.trunc returned " << r << std::endl;
+ r = io_ctx.read(oid, bl, bl.length(), 0);
+ cout << "rados.read returned " << r << std::endl;
+ r = io_ctx.exec(oid, "crypto", "md5", bl, bl2);
+ cout << "exec returned " << r << " buf size=" << bl2.length() << std::endl;
+ const unsigned char *md5 = (const unsigned char *)bl2.c_str();
+ char md5_str[bl2.length()*2 + 1];
+ buf_to_hex(md5, bl2.length(), md5_str);
+ cout << "md5 result=" << md5_str << std::endl;
+
+ // test assert_version
+ r = io_ctx.read(oid, bl, 0, 1);
+ ceph_assert(r >= 0);
+ uint64_t v = io_ctx.get_last_version();
+ cout << oid << " version is " << v << std::endl;
+ ceph_assert(v > 0);
+ io_ctx.set_assert_version(v);
+ r = io_ctx.read(oid, bl, 0, 1);
+ ceph_assert(r >= 0);
+ io_ctx.set_assert_version(v - 1);
+ r = io_ctx.read(oid, bl, 0, 1);
+ ceph_assert(r == -ERANGE);
+ io_ctx.set_assert_version(v + 1);
+ r = io_ctx.read(oid, bl, 0, 1);
+ ceph_assert(r == -EOVERFLOW);
+
+ r = io_ctx.exec(oid, "crypto", "sha1", bl, bl2);
+ cout << "exec returned " << r << std::endl;
+ const unsigned char *sha1 = (const unsigned char *)bl2.c_str();
+ char sha1_str[bl2.length()*2 + 1];
+ buf_to_hex(sha1, bl2.length(), sha1_str);
+ cout << "sha1 result=" << sha1_str << std::endl;
+
+ r = io_ctx.exec(oid, "acl", "set", bl, bl2);
+ cout << "exec (set) returned " << r << std::endl;
+ r = io_ctx.exec(oid, "acl", "get", bl, bl2);
+ cout << "exec (get) returned " << r << std::endl;
+ if (bl2.length() > 0) {
+ cout << "attr=" << bl2.c_str() << std::endl;
+ }
+
+ int size = io_ctx.read(oid, bl2, 128, 0);
+ if (size <= 0) {
+ cout << "failed to read oid " << oid << "." << std::endl;
+ exit(1);
+ }
+ if (size > 4096) {
+ cout << "read too many bytes from oid " << oid << "." << std::endl;
+ exit(1);
+ }
+ char rbuf[size + 1];
+ memcpy(rbuf, bl2.c_str(), size);
+ rbuf[size] = '\0';
+ cout << "read result='" << rbuf << "'" << std::endl;
+ cout << "size=" << size << std::endl;
+
+ const char *oid2 = "jjj10.rbd";
+ r = io_ctx.exec(oid2, "rbd", "snap_list", bl, bl2);
+ cout << "snap_list result=" << r << std::endl;
+ r = io_ctx.exec(oid2, "rbd", "snap_add", bl, bl2);
+ cout << "snap_add result=" << r << std::endl;
+
+ if (r > 0) {
+ char *s = bl2.c_str();
+ for (int i=0; i<r; i++, s += strlen(s) + 1)
+ cout << s << std::endl;
+ }
+
+ cout << "compound operation..." << std::endl;
+ ObjectWriteOperation o;
+ o.write(0, bl);
+ o.setxattr("foo", bl2);
+ r = io_ctx.operate(oid, &o);
+ cout << "operate result=" << r << std::endl;
+
+ cout << "cmpxattr" << std::endl;
+ bufferlist val;
+ val.append("foo");
+ r = io_ctx.setxattr(oid, "foo", val);
+ ceph_assert(r >= 0);
+ {
+ ObjectReadOperation o;
+ o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val);
+ r = io_ctx.operate(oid, &o, &bl2);
+ cout << " got " << r << " wanted >= 0" << std::endl;
+ ceph_assert(r >= 0);
+ }
+ val.append("...");
+ {
+ ObjectReadOperation o;
+ o.cmpxattr("foo", CEPH_OSD_CMPXATTR_OP_EQ, val);
+ r = io_ctx.operate(oid, &o, &bl2);
+ cout << " got " << r << " wanted " << -ECANCELED << " (-ECANCELED)" << std::endl;
+ ceph_assert(r == -ECANCELED);
+ }
+
+ io_ctx.locator_set_key(string());
+
+ cout << "iterating over objects..." << std::endl;
+ int num_objs = 0;
+ for (NObjectIterator iter = io_ctx.nobjects_begin();
+ iter != io_ctx.nobjects_end(); ++iter) {
+ num_objs++;
+ cout << "'" << *iter << "'" << std::endl;
+ }
+ cout << "iterated over " << num_objs << " objects." << std::endl;
+ map<string, bufferlist> attrset;
+ io_ctx.getxattrs(oid, attrset);
+
+ map<string, bufferlist>::iterator it;
+ for (it = attrset.begin(); it != attrset.end(); ++it) {
+ cout << "xattr: " << it->first << std::endl;
+ }
+
+ r = io_ctx.remove(oid);
+ cout << "remove result=" << r << std::endl;
+
+ r = rados.pool_delete("foo");
+ cout << "pool_delete result=" << r << std::endl;
+
+ rados.shutdown();
+
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
diff --git a/src/tools/setup-virtualenv.sh b/src/tools/setup-virtualenv.sh
new file mode 100755
index 00000000..f0fa1e43
--- /dev/null
+++ b/src/tools/setup-virtualenv.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2016 <contact@redhat.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library Public License for more details.
+#
+
+SCRIPTNAME="$(basename $0)"
+if [ `uname` == FreeBSD ]; then
+ GETOPT="/usr/local/bin/getopt"
+else
+ GETOPT=getopt
+fi
+
+function usage {
+ echo
+ echo "$SCRIPTNAME - automate setup of Python virtual environment"
+ echo " (for use in building Ceph)"
+ echo
+ echo "Usage:"
+ echo " $SCRIPTNAME [--python=PYTHON_BINARY] TARGET_DIRECTORY"
+ echo
+ echo " TARGET_DIRECTORY will be created if it doesn't exist,"
+ echo " and completely destroyed and re-created if it does!"
+ echo
+ exit 1
+}
+
+TEMP=$($GETOPT --options "h" --long "help,python:" --name "$SCRIPTNAME" -- "$@")
+test $? != 0 && usage
+eval set -- "$TEMP"
+
+PYTHON_OPTION=""
+while true ; do
+ case "$1" in
+ -h|--help) usage ;; # does not return
+ --python) PYTHON_OPTION="--python=$2" ; shift ; shift ;;
+ --) shift ; break ;;
+ *) echo "Internal error" ; exit 1 ;;
+ esac
+done
+
+DIR=$1
+if [ -z "$DIR" ] ; then
+ echo "$SCRIPTNAME: need a directory path, but none was provided"
+ usage
+fi
+rm -fr $DIR
+mkdir -p $DIR
+virtualenv $PYTHON_OPTION $DIR
+. $DIR/bin/activate
+
+if pip --help | grep -q disable-pip-version-check; then
+ DISABLE_PIP_VERSION_CHECK=--disable-pip-version-check
+else
+ DISABLE_PIP_VERSION_CHECK=
+fi
+
+# older versions of pip will not install wrap_console scripts
+# when using wheel packages
+pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install --upgrade 'pip >= 6.1'
+
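+# re-check after the upgrade: the newly installed pip may have gained support
+# for --disable-pip-version-check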
+if pip --help | grep -q disable-pip-version-check; then
+ DISABLE_PIP_VERSION_CHECK=--disable-pip-version-check
+else
+ DISABLE_PIP_VERSION_CHECK=
+fi
+
+if test -d wheelhouse ; then
+ export NO_INDEX=--no-index
+fi
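+# with a local wheelhouse present, --no-index keeps pip off the network and
+# installs only from the --find-links directory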
+
+pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --find-links=file://$(pwd)/wheelhouse 'tox >=2.9.1'
+if test -f requirements.txt ; then
+ if ! test -f wheelhouse/md5 || ! md5sum -c wheelhouse/md5 > /dev/null; then
+ NO_INDEX=''
+ fi
+ pip $DISABLE_PIP_VERSION_CHECK --log $DIR/log.txt install $NO_INDEX --find-links=file://$(pwd)/wheelhouse -r requirements.txt
+fi