From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/tools/CMakeLists.txt | 153 + src/tools/RadosDump.cc | 169 + src/tools/RadosDump.h | 409 ++ src/tools/ceph-client-debug.cc | 191 + src/tools/ceph-dencoder/CMakeLists.txt | 111 + src/tools/ceph-dencoder/ceph_dencoder.cc | 276 ++ src/tools/ceph-dencoder/ceph_time.h | 68 + src/tools/ceph-dencoder/common_types.cc | 36 + src/tools/ceph-dencoder/common_types.h | 454 ++ src/tools/ceph-dencoder/denc_plugin.h | 78 + src/tools/ceph-dencoder/denc_registry.h | 241 ++ src/tools/ceph-dencoder/mds_types.cc | 36 + src/tools/ceph-dencoder/mds_types.h | 112 + src/tools/ceph-dencoder/osd_types.cc | 39 + src/tools/ceph-dencoder/osd_types.h | 150 + src/tools/ceph-dencoder/rbd_types.cc | 36 + src/tools/ceph-dencoder/rbd_types.h | 52 + src/tools/ceph-dencoder/rgw_types.cc | 36 + src/tools/ceph-dencoder/rgw_types.h | 141 + src/tools/ceph-dencoder/sstring.h | 40 + src/tools/ceph-dencoder/str.h | 38 + src/tools/ceph-diff-sorted.cc | 173 + src/tools/ceph-lazy/bash_completion.d/ceph-lazy | 27 + src/tools/ceph-lazy/ceph-lazy | 709 +++ src/tools/ceph-monstore-update-crush.sh | 174 + src/tools/ceph_authtool.cc | 318 ++ src/tools/ceph_conf.cc | 278 ++ src/tools/ceph_dedup_tool.cc | 1779 ++++++++ src/tools/ceph_kvstore_tool.cc | 363 ++ src/tools/ceph_monstore_tool.cc | 1319 ++++++ src/tools/ceph_objectstore_tool.cc | 4552 ++++++++++++++++++++ src/tools/ceph_objectstore_tool.h | 44 + src/tools/ceph_osdomap_tool.cc | 212 + src/tools/cephfs/CMakeLists.txt | 58 + src/tools/cephfs/DataScan.cc | 2404 +++++++++++ src/tools/cephfs/DataScan.h | 344 ++ src/tools/cephfs/Dumper.cc | 433 ++ src/tools/cephfs/Dumper.h | 45 + src/tools/cephfs/EventOutput.cc | 153 + src/tools/cephfs/EventOutput.h | 42 + src/tools/cephfs/JournalFilter.cc | 316 ++ src/tools/cephfs/JournalFilter.h | 73 + src/tools/cephfs/JournalScanner.cc | 438 ++ src/tools/cephfs/JournalScanner.h | 133 + src/tools/cephfs/JournalTool.cc | 1266 ++++++ src/tools/cephfs/JournalTool.h | 101 + src/tools/cephfs/MDSUtility.cc | 155 + src/tools/cephfs/MDSUtility.h | 60 + src/tools/cephfs/MetaTool.cc | 1000 +++++ src/tools/cephfs/MetaTool.h | 272 ++ src/tools/cephfs/PgFiles.cc | 194 + src/tools/cephfs/PgFiles.h | 51 + src/tools/cephfs/Resetter.cc | 222 + src/tools/cephfs/Resetter.h | 50 + src/tools/cephfs/RoleSelector.cc | 59 + src/tools/cephfs/RoleSelector.h | 36 + src/tools/cephfs/TableTool.cc | 419 ++ src/tools/cephfs/TableTool.h | 40 + src/tools/cephfs/cephfs-data-scan.cc | 46 + src/tools/cephfs/cephfs-journal-tool.cc | 57 + src/tools/cephfs/cephfs-meta-injection.cc | 96 + src/tools/cephfs/cephfs-table-tool.cc | 46 + src/tools/cephfs/first-damage.py | 156 + src/tools/cephfs/shell/CMakeLists.txt | 7 + src/tools/cephfs/shell/cephfs-shell | 1854 ++++++++ src/tools/cephfs/shell/setup.py | 27 + src/tools/cephfs/shell/tox.ini | 7 + src/tools/cephfs/top/CMakeLists.txt | 11 + src/tools/cephfs/top/cephfs-top | 1227 ++++++ src/tools/cephfs/top/setup.py | 25 + src/tools/cephfs/top/tox.ini | 7 + src/tools/cephfs/type_helper.hpp | 28 + src/tools/cephfs_mirror/CMakeLists.txt | 30 + src/tools/cephfs_mirror/ClusterWatcher.cc | 182 + src/tools/cephfs_mirror/ClusterWatcher.h | 77 + src/tools/cephfs_mirror/FSMirror.cc | 444 ++ src/tools/cephfs_mirror/FSMirror.h | 182 + src/tools/cephfs_mirror/InstanceWatcher.cc | 256 ++ src/tools/cephfs_mirror/InstanceWatcher.h | 98 + 
src/tools/cephfs_mirror/Mirror.cc | 589 +++ src/tools/cephfs_mirror/Mirror.h | 153 + src/tools/cephfs_mirror/MirrorWatcher.cc | 151 + src/tools/cephfs_mirror/MirrorWatcher.h | 92 + src/tools/cephfs_mirror/PeerReplayer.cc | 1581 +++++++ src/tools/cephfs_mirror/PeerReplayer.h | 320 ++ src/tools/cephfs_mirror/ServiceDaemon.cc | 225 + src/tools/cephfs_mirror/ServiceDaemon.h | 62 + src/tools/cephfs_mirror/Types.cc | 21 + src/tools/cephfs_mirror/Types.h | 87 + src/tools/cephfs_mirror/Utils.cc | 166 + src/tools/cephfs_mirror/Utils.h | 22 + src/tools/cephfs_mirror/Watcher.cc | 285 ++ src/tools/cephfs_mirror/Watcher.h | 102 + src/tools/cephfs_mirror/aio_utils.h | 53 + src/tools/cephfs_mirror/main.cc | 124 + src/tools/cephfs_mirror/watcher/RewatchRequest.cc | 102 + src/tools/cephfs_mirror/watcher/RewatchRequest.h | 60 + src/tools/contrib/README.rst | 10 + src/tools/contrib/ceph-migrate-bluestore.bash | 370 ++ src/tools/crushdiff | 336 ++ src/tools/crushtool.cc | 1344 ++++++ src/tools/erasure-code/CMakeLists.txt | 5 + src/tools/erasure-code/ceph-erasure-code-tool.cc | 323 ++ src/tools/histogram_dump.py | 174 + src/tools/immutable_object_cache/CMakeLists.txt | 19 + src/tools/immutable_object_cache/CacheClient.cc | 435 ++ src/tools/immutable_object_cache/CacheClient.h | 84 + .../immutable_object_cache/CacheController.cc | 139 + src/tools/immutable_object_cache/CacheController.h | 40 + src/tools/immutable_object_cache/CacheServer.cc | 106 + src/tools/immutable_object_cache/CacheServer.h | 45 + src/tools/immutable_object_cache/CacheSession.cc | 140 + src/tools/immutable_object_cache/CacheSession.h | 56 + .../immutable_object_cache/ObjectCacheStore.cc | 461 ++ .../immutable_object_cache/ObjectCacheStore.h | 85 + src/tools/immutable_object_cache/Policy.h | 34 + src/tools/immutable_object_cache/SimplePolicy.cc | 216 + src/tools/immutable_object_cache/SimplePolicy.h | 68 + src/tools/immutable_object_cache/SocketCommon.h | 31 + src/tools/immutable_object_cache/Types.cc | 184 + src/tools/immutable_object_cache/Types.h | 136 + src/tools/immutable_object_cache/Utils.h | 31 + src/tools/immutable_object_cache/main.cc | 83 + src/tools/kvstore_tool.cc | 380 ++ src/tools/kvstore_tool.h | 81 + src/tools/monmaptool.cc | 489 +++ src/tools/neorados.cc | 389 ++ src/tools/osdmaptool.cc | 949 ++++ src/tools/psim.cc | 119 + src/tools/rados/PoolDump.cc | 173 + src/tools/rados/PoolDump.h | 29 + src/tools/rados/RadosImport.cc | 404 ++ src/tools/rados/RadosImport.h | 45 + src/tools/rados/rados.cc | 4224 ++++++++++++++++++ src/tools/radosacl.cc | 187 + src/tools/rbd/ArgumentTypes.cc | 576 +++ src/tools/rbd/ArgumentTypes.h | 244 ++ src/tools/rbd/CMakeLists.txt | 80 + src/tools/rbd/IndentStream.cc | 59 + src/tools/rbd/IndentStream.h | 60 + src/tools/rbd/MirrorDaemonServiceInfo.cc | 307 ++ src/tools/rbd/MirrorDaemonServiceInfo.h | 78 + src/tools/rbd/OptionPrinter.cc | 161 + src/tools/rbd/OptionPrinter.h | 43 + src/tools/rbd/Schedule.cc | 367 ++ src/tools/rbd/Schedule.h | 67 + src/tools/rbd/Shell.cc | 487 +++ src/tools/rbd/Shell.h | 76 + src/tools/rbd/Utils.cc | 1203 ++++++ src/tools/rbd/Utils.h | 283 ++ src/tools/rbd/action/Bench.cc | 589 +++ src/tools/rbd/action/Children.cc | 167 + src/tools/rbd/action/Clone.cc | 99 + src/tools/rbd/action/Config.cc | 891 ++++ src/tools/rbd/action/Copy.cc | 195 + src/tools/rbd/action/Create.cc | 257 ++ src/tools/rbd/action/Device.cc | 285 ++ src/tools/rbd/action/Diff.cc | 142 + src/tools/rbd/action/DiskUsage.cc | 377 ++ src/tools/rbd/action/Encryption.cc | 117 + src/tools/rbd/action/Export.cc | 653 
+++ src/tools/rbd/action/Feature.cc | 116 + src/tools/rbd/action/Flatten.cc | 92 + src/tools/rbd/action/Ggate.cc | 180 + src/tools/rbd/action/Group.cc | 912 ++++ src/tools/rbd/action/ImageMeta.cc | 345 ++ src/tools/rbd/action/Import.cc | 1036 +++++ src/tools/rbd/action/Info.cc | 471 ++ src/tools/rbd/action/Journal.cc | 1251 ++++++ src/tools/rbd/action/Kernel.cc | 679 +++ src/tools/rbd/action/List.cc | 346 ++ src/tools/rbd/action/Lock.cc | 279 ++ src/tools/rbd/action/MergeDiff.cc | 456 ++ src/tools/rbd/action/Migration.cc | 429 ++ src/tools/rbd/action/MirrorImage.cc | 605 +++ src/tools/rbd/action/MirrorPool.cc | 1772 ++++++++ src/tools/rbd/action/MirrorSnapshotSchedule.cc | 322 ++ src/tools/rbd/action/Namespace.cc | 191 + src/tools/rbd/action/Nbd.cc | 389 ++ src/tools/rbd/action/ObjectMap.cc | 131 + src/tools/rbd/action/Perf.cc | 717 +++ src/tools/rbd/action/PersistentCache.cc | 122 + src/tools/rbd/action/Pool.cc | 162 + src/tools/rbd/action/Remove.cc | 161 + src/tools/rbd/action/Rename.cc | 94 + src/tools/rbd/action/Resize.cc | 123 + src/tools/rbd/action/Snap.cc | 972 +++++ src/tools/rbd/action/Sparsify.cc | 82 + src/tools/rbd/action/Status.cc | 365 ++ src/tools/rbd/action/Trash.cc | 543 +++ src/tools/rbd/action/TrashPurgeSchedule.cc | 355 ++ src/tools/rbd/action/Watch.cc | 149 + src/tools/rbd/action/Wnbd.cc | 172 + src/tools/rbd/rbd.cc | 10 + src/tools/rbd_ggate/CMakeLists.txt | 9 + src/tools/rbd_ggate/Driver.cc | 165 + src/tools/rbd_ggate/Driver.h | 50 + src/tools/rbd_ggate/Request.h | 55 + src/tools/rbd_ggate/Server.cc | 262 ++ src/tools/rbd_ggate/Server.h | 88 + src/tools/rbd_ggate/Watcher.cc | 48 + src/tools/rbd_ggate/Watcher.h | 34 + src/tools/rbd_ggate/debug.cc | 55 + src/tools/rbd_ggate/debug.h | 17 + src/tools/rbd_ggate/ggate_drv.c | 379 ++ src/tools/rbd_ggate/ggate_drv.h | 64 + src/tools/rbd_ggate/main.cc | 516 +++ src/tools/rbd_mirror/BaseRequest.h | 33 + src/tools/rbd_mirror/CMakeLists.txt | 91 + src/tools/rbd_mirror/CancelableRequest.h | 44 + src/tools/rbd_mirror/ClusterWatcher.cc | 252 ++ src/tools/rbd_mirror/ClusterWatcher.h | 73 + src/tools/rbd_mirror/ImageDeleter.cc | 549 +++ src/tools/rbd_mirror/ImageDeleter.h | 189 + src/tools/rbd_mirror/ImageMap.cc | 604 +++ src/tools/rbd_mirror/ImageMap.h | 175 + src/tools/rbd_mirror/ImageReplayer.cc | 1201 ++++++ src/tools/rbd_mirror/ImageReplayer.h | 273 ++ src/tools/rbd_mirror/ImageSync.cc | 469 ++ src/tools/rbd_mirror/ImageSync.h | 151 + src/tools/rbd_mirror/InstanceReplayer.cc | 543 +++ src/tools/rbd_mirror/InstanceReplayer.h | 138 + src/tools/rbd_mirror/InstanceWatcher.cc | 1290 ++++++ src/tools/rbd_mirror/InstanceWatcher.h | 269 ++ src/tools/rbd_mirror/Instances.cc | 356 ++ src/tools/rbd_mirror/Instances.h | 168 + src/tools/rbd_mirror/LeaderWatcher.cc | 1069 +++++ src/tools/rbd_mirror/LeaderWatcher.h | 313 ++ src/tools/rbd_mirror/Mirror.cc | 763 ++++ src/tools/rbd_mirror/Mirror.h | 89 + src/tools/rbd_mirror/MirrorStatusUpdater.cc | 397 ++ src/tools/rbd_mirror/MirrorStatusUpdater.h | 119 + src/tools/rbd_mirror/MirrorStatusWatcher.cc | 74 + src/tools/rbd_mirror/MirrorStatusWatcher.h | 43 + src/tools/rbd_mirror/NamespaceReplayer.cc | 862 ++++ src/tools/rbd_mirror/NamespaceReplayer.h | 308 ++ src/tools/rbd_mirror/PoolMetaCache.cc | 83 + src/tools/rbd_mirror/PoolMetaCache.h | 47 + src/tools/rbd_mirror/PoolReplayer.cc | 1110 +++++ src/tools/rbd_mirror/PoolReplayer.h | 288 ++ src/tools/rbd_mirror/PoolWatcher.cc | 473 ++ src/tools/rbd_mirror/PoolWatcher.h | 161 + src/tools/rbd_mirror/ProgressContext.h | 21 + 
src/tools/rbd_mirror/RemotePoolPoller.cc | 267 ++ src/tools/rbd_mirror/RemotePoolPoller.h | 133 + src/tools/rbd_mirror/ServiceDaemon.cc | 327 ++ src/tools/rbd_mirror/ServiceDaemon.h | 94 + src/tools/rbd_mirror/Threads.cc | 38 + src/tools/rbd_mirror/Threads.h | 45 + src/tools/rbd_mirror/Throttler.cc | 240 ++ src/tools/rbd_mirror/Throttler.h | 74 + src/tools/rbd_mirror/Types.cc | 32 + src/tools/rbd_mirror/Types.h | 171 + .../image_deleter/SnapshotPurgeRequest.cc | 299 ++ .../image_deleter/SnapshotPurgeRequest.h | 105 + .../rbd_mirror/image_deleter/TrashMoveRequest.cc | 419 ++ .../rbd_mirror/image_deleter/TrashMoveRequest.h | 142 + .../rbd_mirror/image_deleter/TrashRemoveRequest.cc | 265 ++ .../rbd_mirror/image_deleter/TrashRemoveRequest.h | 117 + src/tools/rbd_mirror/image_deleter/TrashWatcher.cc | 384 ++ src/tools/rbd_mirror/image_deleter/TrashWatcher.h | 139 + src/tools/rbd_mirror/image_deleter/Types.h | 54 + src/tools/rbd_mirror/image_map/LoadRequest.cc | 174 + src/tools/rbd_mirror/image_map/LoadRequest.h | 77 + src/tools/rbd_mirror/image_map/Policy.cc | 407 ++ src/tools/rbd_mirror/image_map/Policy.h | 122 + src/tools/rbd_mirror/image_map/SimplePolicy.cc | 89 + src/tools/rbd_mirror/image_map/SimplePolicy.h | 39 + src/tools/rbd_mirror/image_map/StateTransition.cc | 94 + src/tools/rbd_mirror/image_map/StateTransition.h | 76 + src/tools/rbd_mirror/image_map/Types.cc | 138 + src/tools/rbd_mirror/image_map/Types.h | 130 + src/tools/rbd_mirror/image_map/UpdateRequest.cc | 100 + src/tools/rbd_mirror/image_map/UpdateRequest.h | 65 + .../rbd_mirror/image_replayer/BootstrapRequest.cc | 485 +++ .../rbd_mirror/image_replayer/BootstrapRequest.h | 181 + .../rbd_mirror/image_replayer/CloseImageRequest.cc | 62 + .../rbd_mirror/image_replayer/CloseImageRequest.h | 56 + .../image_replayer/CreateImageRequest.cc | 451 ++ .../rbd_mirror/image_replayer/CreateImageRequest.h | 144 + .../image_replayer/GetMirrorImageIdRequest.cc | 85 + .../image_replayer/GetMirrorImageIdRequest.h | 75 + .../rbd_mirror/image_replayer/OpenImageRequest.cc | 79 + .../rbd_mirror/image_replayer/OpenImageRequest.h | 71 + .../image_replayer/OpenLocalImageRequest.cc | 292 ++ .../image_replayer/OpenLocalImageRequest.h | 97 + .../image_replayer/PrepareLocalImageRequest.cc | 197 + .../image_replayer/PrepareLocalImageRequest.h | 115 + .../image_replayer/PrepareRemoteImageRequest.cc | 283 ++ .../image_replayer/PrepareRemoteImageRequest.h | 153 + src/tools/rbd_mirror/image_replayer/Replayer.h | 39 + .../rbd_mirror/image_replayer/ReplayerListener.h | 21 + .../rbd_mirror/image_replayer/StateBuilder.cc | 138 + src/tools/rbd_mirror/image_replayer/StateBuilder.h | 114 + .../rbd_mirror/image_replayer/TimeRollingMean.cc | 34 + .../rbd_mirror/image_replayer/TimeRollingMean.h | 40 + src/tools/rbd_mirror/image_replayer/Types.h | 21 + src/tools/rbd_mirror/image_replayer/Utils.cc | 61 + src/tools/rbd_mirror/image_replayer/Utils.h | 29 + .../journal/CreateLocalImageRequest.cc | 162 + .../journal/CreateLocalImageRequest.h | 116 + .../image_replayer/journal/EventPreprocessor.cc | 206 + .../image_replayer/journal/EventPreprocessor.h | 127 + .../image_replayer/journal/PrepareReplayRequest.cc | 316 ++ .../image_replayer/journal/PrepareReplayRequest.h | 115 + .../journal/ReplayStatusFormatter.cc | 284 ++ .../image_replayer/journal/ReplayStatusFormatter.h | 70 + .../rbd_mirror/image_replayer/journal/Replayer.cc | 1317 ++++++ .../rbd_mirror/image_replayer/journal/Replayer.h | 323 ++ .../image_replayer/journal/StateBuilder.cc | 149 + 
.../image_replayer/journal/StateBuilder.h | 94 + .../image_replayer/journal/SyncPointHandler.cc | 109 + .../image_replayer/journal/SyncPointHandler.h | 55 + .../snapshot/ApplyImageStateRequest.cc | 658 +++ .../snapshot/ApplyImageStateRequest.h | 155 + .../snapshot/CreateLocalImageRequest.cc | 204 + .../snapshot/CreateLocalImageRequest.h | 121 + .../snapshot/PrepareReplayRequest.cc | 70 + .../image_replayer/snapshot/PrepareReplayRequest.h | 92 + .../rbd_mirror/image_replayer/snapshot/Replayer.cc | 1633 +++++++ .../rbd_mirror/image_replayer/snapshot/Replayer.h | 349 ++ .../image_replayer/snapshot/StateBuilder.cc | 120 + .../image_replayer/snapshot/StateBuilder.h | 93 + .../rbd_mirror/image_replayer/snapshot/Utils.cc | 65 + .../rbd_mirror/image_replayer/snapshot/Utils.h | 30 + .../image_sync/SyncPointCreateRequest.cc | 172 + .../rbd_mirror/image_sync/SyncPointCreateRequest.h | 93 + .../rbd_mirror/image_sync/SyncPointPruneRequest.cc | 213 + .../rbd_mirror/image_sync/SyncPointPruneRequest.h | 91 + src/tools/rbd_mirror/image_sync/Types.h | 74 + src/tools/rbd_mirror/image_sync/Utils.cc | 24 + src/tools/rbd_mirror/image_sync/Utils.h | 16 + src/tools/rbd_mirror/instance_watcher/Types.cc | 245 ++ src/tools/rbd_mirror/instance_watcher/Types.h | 197 + src/tools/rbd_mirror/instances/Types.h | 28 + src/tools/rbd_mirror/leader_watcher/Types.cc | 161 + src/tools/rbd_mirror/leader_watcher/Types.h | 117 + src/tools/rbd_mirror/main.cc | 124 + .../pool_watcher/RefreshImagesRequest.cc | 89 + .../rbd_mirror/pool_watcher/RefreshImagesRequest.h | 73 + src/tools/rbd_mirror/pool_watcher/Types.h | 27 + src/tools/rbd_mirror/service_daemon/Types.cc | 29 + src/tools/rbd_mirror/service_daemon/Types.h | 33 + src/tools/rbd_nbd/CMakeLists.txt | 4 + src/tools/rbd_nbd/nbd-netlink.h | 100 + src/tools/rbd_nbd/rbd-nbd.cc | 2441 +++++++++++ src/tools/rbd_nbd/rbd-nbd_quiesce | 31 + src/tools/rbd_recover_tool/FAQ | 16 + src/tools/rbd_recover_tool/README | 97 + src/tools/rbd_recover_tool/TODO | 2 + src/tools/rbd_recover_tool/common_h | 412 ++ src/tools/rbd_recover_tool/config/mds_host | 0 src/tools/rbd_recover_tool/config/mon_host | 0 src/tools/rbd_recover_tool/config/osd_host_path | 0 src/tools/rbd_recover_tool/database_h | 1134 +++++ src/tools/rbd_recover_tool/epoch_h | 119 + src/tools/rbd_recover_tool/metadata_h | 368 ++ src/tools/rbd_recover_tool/osd_job | 170 + src/tools/rbd_recover_tool/rbd-recover-tool | 327 ++ .../rbd_recover_tool/test_rbd_recover_tool.sh | 542 +++ src/tools/rbd_wnbd/CMakeLists.txt | 11 + src/tools/rbd_wnbd/rbd_wnbd.cc | 1871 ++++++++ src/tools/rbd_wnbd/rbd_wnbd.h | 193 + src/tools/rbd_wnbd/wnbd_handler.cc | 456 ++ src/tools/rbd_wnbd/wnbd_handler.h | 188 + src/tools/rbd_wnbd/wnbd_wmi.cc | 261 ++ src/tools/rbd_wnbd/wnbd_wmi.h | 109 + src/tools/rebuild_mondb.cc | 353 ++ src/tools/rebuild_mondb.h | 9 + src/tools/rgw/parse-cr-dump.py | 168 + src/tools/scratchtool.c | 319 ++ src/tools/scratchtoolpp.cc | 295 ++ src/tools/setup-virtualenv.sh | 103 + 373 files changed, 106499 insertions(+) create mode 100644 src/tools/CMakeLists.txt create mode 100644 src/tools/RadosDump.cc create mode 100644 src/tools/RadosDump.h create mode 100644 src/tools/ceph-client-debug.cc create mode 100644 src/tools/ceph-dencoder/CMakeLists.txt create mode 100644 src/tools/ceph-dencoder/ceph_dencoder.cc create mode 100644 src/tools/ceph-dencoder/ceph_time.h create mode 100644 src/tools/ceph-dencoder/common_types.cc create mode 100644 src/tools/ceph-dencoder/common_types.h create mode 100644 src/tools/ceph-dencoder/denc_plugin.h create mode 
100644 src/tools/ceph-dencoder/denc_registry.h create mode 100644 src/tools/ceph-dencoder/mds_types.cc create mode 100644 src/tools/ceph-dencoder/mds_types.h create mode 100644 src/tools/ceph-dencoder/osd_types.cc create mode 100644 src/tools/ceph-dencoder/osd_types.h create mode 100644 src/tools/ceph-dencoder/rbd_types.cc create mode 100644 src/tools/ceph-dencoder/rbd_types.h create mode 100644 src/tools/ceph-dencoder/rgw_types.cc create mode 100644 src/tools/ceph-dencoder/rgw_types.h create mode 100644 src/tools/ceph-dencoder/sstring.h create mode 100644 src/tools/ceph-dencoder/str.h create mode 100644 src/tools/ceph-diff-sorted.cc create mode 100644 src/tools/ceph-lazy/bash_completion.d/ceph-lazy create mode 100755 src/tools/ceph-lazy/ceph-lazy create mode 100755 src/tools/ceph-monstore-update-crush.sh create mode 100644 src/tools/ceph_authtool.cc create mode 100644 src/tools/ceph_conf.cc create mode 100644 src/tools/ceph_dedup_tool.cc create mode 100644 src/tools/ceph_kvstore_tool.cc create mode 100644 src/tools/ceph_monstore_tool.cc create mode 100644 src/tools/ceph_objectstore_tool.cc create mode 100644 src/tools/ceph_objectstore_tool.h create mode 100644 src/tools/ceph_osdomap_tool.cc create mode 100644 src/tools/cephfs/CMakeLists.txt create mode 100644 src/tools/cephfs/DataScan.cc create mode 100644 src/tools/cephfs/DataScan.h create mode 100644 src/tools/cephfs/Dumper.cc create mode 100644 src/tools/cephfs/Dumper.h create mode 100644 src/tools/cephfs/EventOutput.cc create mode 100644 src/tools/cephfs/EventOutput.h create mode 100644 src/tools/cephfs/JournalFilter.cc create mode 100644 src/tools/cephfs/JournalFilter.h create mode 100644 src/tools/cephfs/JournalScanner.cc create mode 100644 src/tools/cephfs/JournalScanner.h create mode 100644 src/tools/cephfs/JournalTool.cc create mode 100644 src/tools/cephfs/JournalTool.h create mode 100644 src/tools/cephfs/MDSUtility.cc create mode 100644 src/tools/cephfs/MDSUtility.h create mode 100644 src/tools/cephfs/MetaTool.cc create mode 100644 src/tools/cephfs/MetaTool.h create mode 100644 src/tools/cephfs/PgFiles.cc create mode 100644 src/tools/cephfs/PgFiles.h create mode 100644 src/tools/cephfs/Resetter.cc create mode 100644 src/tools/cephfs/Resetter.h create mode 100644 src/tools/cephfs/RoleSelector.cc create mode 100644 src/tools/cephfs/RoleSelector.h create mode 100644 src/tools/cephfs/TableTool.cc create mode 100644 src/tools/cephfs/TableTool.h create mode 100644 src/tools/cephfs/cephfs-data-scan.cc create mode 100644 src/tools/cephfs/cephfs-journal-tool.cc create mode 100644 src/tools/cephfs/cephfs-meta-injection.cc create mode 100644 src/tools/cephfs/cephfs-table-tool.cc create mode 100644 src/tools/cephfs/first-damage.py create mode 100644 src/tools/cephfs/shell/CMakeLists.txt create mode 100755 src/tools/cephfs/shell/cephfs-shell create mode 100644 src/tools/cephfs/shell/setup.py create mode 100644 src/tools/cephfs/shell/tox.ini create mode 100644 src/tools/cephfs/top/CMakeLists.txt create mode 100755 src/tools/cephfs/top/cephfs-top create mode 100644 src/tools/cephfs/top/setup.py create mode 100644 src/tools/cephfs/top/tox.ini create mode 100644 src/tools/cephfs/type_helper.hpp create mode 100644 src/tools/cephfs_mirror/CMakeLists.txt create mode 100644 src/tools/cephfs_mirror/ClusterWatcher.cc create mode 100644 src/tools/cephfs_mirror/ClusterWatcher.h create mode 100644 src/tools/cephfs_mirror/FSMirror.cc create mode 100644 src/tools/cephfs_mirror/FSMirror.h create mode 100644 src/tools/cephfs_mirror/InstanceWatcher.cc create 
mode 100644 src/tools/cephfs_mirror/InstanceWatcher.h create mode 100644 src/tools/cephfs_mirror/Mirror.cc create mode 100644 src/tools/cephfs_mirror/Mirror.h create mode 100644 src/tools/cephfs_mirror/MirrorWatcher.cc create mode 100644 src/tools/cephfs_mirror/MirrorWatcher.h create mode 100644 src/tools/cephfs_mirror/PeerReplayer.cc create mode 100644 src/tools/cephfs_mirror/PeerReplayer.h create mode 100644 src/tools/cephfs_mirror/ServiceDaemon.cc create mode 100644 src/tools/cephfs_mirror/ServiceDaemon.h create mode 100644 src/tools/cephfs_mirror/Types.cc create mode 100644 src/tools/cephfs_mirror/Types.h create mode 100644 src/tools/cephfs_mirror/Utils.cc create mode 100644 src/tools/cephfs_mirror/Utils.h create mode 100644 src/tools/cephfs_mirror/Watcher.cc create mode 100644 src/tools/cephfs_mirror/Watcher.h create mode 100644 src/tools/cephfs_mirror/aio_utils.h create mode 100644 src/tools/cephfs_mirror/main.cc create mode 100644 src/tools/cephfs_mirror/watcher/RewatchRequest.cc create mode 100644 src/tools/cephfs_mirror/watcher/RewatchRequest.h create mode 100644 src/tools/contrib/README.rst create mode 100755 src/tools/contrib/ceph-migrate-bluestore.bash create mode 100755 src/tools/crushdiff create mode 100644 src/tools/crushtool.cc create mode 100644 src/tools/erasure-code/CMakeLists.txt create mode 100644 src/tools/erasure-code/ceph-erasure-code-tool.cc create mode 100755 src/tools/histogram_dump.py create mode 100644 src/tools/immutable_object_cache/CMakeLists.txt create mode 100644 src/tools/immutable_object_cache/CacheClient.cc create mode 100644 src/tools/immutable_object_cache/CacheClient.h create mode 100644 src/tools/immutable_object_cache/CacheController.cc create mode 100644 src/tools/immutable_object_cache/CacheController.h create mode 100644 src/tools/immutable_object_cache/CacheServer.cc create mode 100644 src/tools/immutable_object_cache/CacheServer.h create mode 100644 src/tools/immutable_object_cache/CacheSession.cc create mode 100644 src/tools/immutable_object_cache/CacheSession.h create mode 100644 src/tools/immutable_object_cache/ObjectCacheStore.cc create mode 100644 src/tools/immutable_object_cache/ObjectCacheStore.h create mode 100644 src/tools/immutable_object_cache/Policy.h create mode 100644 src/tools/immutable_object_cache/SimplePolicy.cc create mode 100644 src/tools/immutable_object_cache/SimplePolicy.h create mode 100644 src/tools/immutable_object_cache/SocketCommon.h create mode 100644 src/tools/immutable_object_cache/Types.cc create mode 100644 src/tools/immutable_object_cache/Types.h create mode 100644 src/tools/immutable_object_cache/Utils.h create mode 100644 src/tools/immutable_object_cache/main.cc create mode 100644 src/tools/kvstore_tool.cc create mode 100644 src/tools/kvstore_tool.h create mode 100644 src/tools/monmaptool.cc create mode 100644 src/tools/neorados.cc create mode 100644 src/tools/osdmaptool.cc create mode 100644 src/tools/psim.cc create mode 100644 src/tools/rados/PoolDump.cc create mode 100644 src/tools/rados/PoolDump.h create mode 100644 src/tools/rados/RadosImport.cc create mode 100644 src/tools/rados/RadosImport.h create mode 100644 src/tools/rados/rados.cc create mode 100644 src/tools/radosacl.cc create mode 100644 src/tools/rbd/ArgumentTypes.cc create mode 100644 src/tools/rbd/ArgumentTypes.h create mode 100644 src/tools/rbd/CMakeLists.txt create mode 100644 src/tools/rbd/IndentStream.cc create mode 100644 src/tools/rbd/IndentStream.h create mode 100644 src/tools/rbd/MirrorDaemonServiceInfo.cc create mode 100644 
src/tools/rbd/MirrorDaemonServiceInfo.h create mode 100644 src/tools/rbd/OptionPrinter.cc create mode 100644 src/tools/rbd/OptionPrinter.h create mode 100644 src/tools/rbd/Schedule.cc create mode 100644 src/tools/rbd/Schedule.h create mode 100644 src/tools/rbd/Shell.cc create mode 100644 src/tools/rbd/Shell.h create mode 100644 src/tools/rbd/Utils.cc create mode 100644 src/tools/rbd/Utils.h create mode 100644 src/tools/rbd/action/Bench.cc create mode 100644 src/tools/rbd/action/Children.cc create mode 100644 src/tools/rbd/action/Clone.cc create mode 100644 src/tools/rbd/action/Config.cc create mode 100644 src/tools/rbd/action/Copy.cc create mode 100644 src/tools/rbd/action/Create.cc create mode 100644 src/tools/rbd/action/Device.cc create mode 100644 src/tools/rbd/action/Diff.cc create mode 100644 src/tools/rbd/action/DiskUsage.cc create mode 100644 src/tools/rbd/action/Encryption.cc create mode 100644 src/tools/rbd/action/Export.cc create mode 100644 src/tools/rbd/action/Feature.cc create mode 100644 src/tools/rbd/action/Flatten.cc create mode 100644 src/tools/rbd/action/Ggate.cc create mode 100644 src/tools/rbd/action/Group.cc create mode 100644 src/tools/rbd/action/ImageMeta.cc create mode 100644 src/tools/rbd/action/Import.cc create mode 100644 src/tools/rbd/action/Info.cc create mode 100644 src/tools/rbd/action/Journal.cc create mode 100644 src/tools/rbd/action/Kernel.cc create mode 100644 src/tools/rbd/action/List.cc create mode 100644 src/tools/rbd/action/Lock.cc create mode 100644 src/tools/rbd/action/MergeDiff.cc create mode 100644 src/tools/rbd/action/Migration.cc create mode 100644 src/tools/rbd/action/MirrorImage.cc create mode 100644 src/tools/rbd/action/MirrorPool.cc create mode 100644 src/tools/rbd/action/MirrorSnapshotSchedule.cc create mode 100644 src/tools/rbd/action/Namespace.cc create mode 100644 src/tools/rbd/action/Nbd.cc create mode 100644 src/tools/rbd/action/ObjectMap.cc create mode 100644 src/tools/rbd/action/Perf.cc create mode 100644 src/tools/rbd/action/PersistentCache.cc create mode 100644 src/tools/rbd/action/Pool.cc create mode 100644 src/tools/rbd/action/Remove.cc create mode 100644 src/tools/rbd/action/Rename.cc create mode 100644 src/tools/rbd/action/Resize.cc create mode 100644 src/tools/rbd/action/Snap.cc create mode 100644 src/tools/rbd/action/Sparsify.cc create mode 100644 src/tools/rbd/action/Status.cc create mode 100644 src/tools/rbd/action/Trash.cc create mode 100644 src/tools/rbd/action/TrashPurgeSchedule.cc create mode 100644 src/tools/rbd/action/Watch.cc create mode 100644 src/tools/rbd/action/Wnbd.cc create mode 100644 src/tools/rbd/rbd.cc create mode 100644 src/tools/rbd_ggate/CMakeLists.txt create mode 100644 src/tools/rbd_ggate/Driver.cc create mode 100644 src/tools/rbd_ggate/Driver.h create mode 100644 src/tools/rbd_ggate/Request.h create mode 100644 src/tools/rbd_ggate/Server.cc create mode 100644 src/tools/rbd_ggate/Server.h create mode 100644 src/tools/rbd_ggate/Watcher.cc create mode 100644 src/tools/rbd_ggate/Watcher.h create mode 100644 src/tools/rbd_ggate/debug.cc create mode 100644 src/tools/rbd_ggate/debug.h create mode 100644 src/tools/rbd_ggate/ggate_drv.c create mode 100644 src/tools/rbd_ggate/ggate_drv.h create mode 100644 src/tools/rbd_ggate/main.cc create mode 100644 src/tools/rbd_mirror/BaseRequest.h create mode 100644 src/tools/rbd_mirror/CMakeLists.txt create mode 100644 src/tools/rbd_mirror/CancelableRequest.h create mode 100644 src/tools/rbd_mirror/ClusterWatcher.cc create mode 100644 
src/tools/rbd_mirror/ClusterWatcher.h create mode 100644 src/tools/rbd_mirror/ImageDeleter.cc create mode 100644 src/tools/rbd_mirror/ImageDeleter.h create mode 100644 src/tools/rbd_mirror/ImageMap.cc create mode 100644 src/tools/rbd_mirror/ImageMap.h create mode 100644 src/tools/rbd_mirror/ImageReplayer.cc create mode 100644 src/tools/rbd_mirror/ImageReplayer.h create mode 100644 src/tools/rbd_mirror/ImageSync.cc create mode 100644 src/tools/rbd_mirror/ImageSync.h create mode 100644 src/tools/rbd_mirror/InstanceReplayer.cc create mode 100644 src/tools/rbd_mirror/InstanceReplayer.h create mode 100644 src/tools/rbd_mirror/InstanceWatcher.cc create mode 100644 src/tools/rbd_mirror/InstanceWatcher.h create mode 100644 src/tools/rbd_mirror/Instances.cc create mode 100644 src/tools/rbd_mirror/Instances.h create mode 100644 src/tools/rbd_mirror/LeaderWatcher.cc create mode 100644 src/tools/rbd_mirror/LeaderWatcher.h create mode 100644 src/tools/rbd_mirror/Mirror.cc create mode 100644 src/tools/rbd_mirror/Mirror.h create mode 100644 src/tools/rbd_mirror/MirrorStatusUpdater.cc create mode 100644 src/tools/rbd_mirror/MirrorStatusUpdater.h create mode 100644 src/tools/rbd_mirror/MirrorStatusWatcher.cc create mode 100644 src/tools/rbd_mirror/MirrorStatusWatcher.h create mode 100644 src/tools/rbd_mirror/NamespaceReplayer.cc create mode 100644 src/tools/rbd_mirror/NamespaceReplayer.h create mode 100644 src/tools/rbd_mirror/PoolMetaCache.cc create mode 100644 src/tools/rbd_mirror/PoolMetaCache.h create mode 100644 src/tools/rbd_mirror/PoolReplayer.cc create mode 100644 src/tools/rbd_mirror/PoolReplayer.h create mode 100644 src/tools/rbd_mirror/PoolWatcher.cc create mode 100644 src/tools/rbd_mirror/PoolWatcher.h create mode 100644 src/tools/rbd_mirror/ProgressContext.h create mode 100644 src/tools/rbd_mirror/RemotePoolPoller.cc create mode 100644 src/tools/rbd_mirror/RemotePoolPoller.h create mode 100644 src/tools/rbd_mirror/ServiceDaemon.cc create mode 100644 src/tools/rbd_mirror/ServiceDaemon.h create mode 100644 src/tools/rbd_mirror/Threads.cc create mode 100644 src/tools/rbd_mirror/Threads.h create mode 100644 src/tools/rbd_mirror/Throttler.cc create mode 100644 src/tools/rbd_mirror/Throttler.h create mode 100644 src/tools/rbd_mirror/Types.cc create mode 100644 src/tools/rbd_mirror/Types.h create mode 100644 src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.cc create mode 100644 src/tools/rbd_mirror/image_deleter/SnapshotPurgeRequest.h create mode 100644 src/tools/rbd_mirror/image_deleter/TrashMoveRequest.cc create mode 100644 src/tools/rbd_mirror/image_deleter/TrashMoveRequest.h create mode 100644 src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.cc create mode 100644 src/tools/rbd_mirror/image_deleter/TrashRemoveRequest.h create mode 100644 src/tools/rbd_mirror/image_deleter/TrashWatcher.cc create mode 100644 src/tools/rbd_mirror/image_deleter/TrashWatcher.h create mode 100644 src/tools/rbd_mirror/image_deleter/Types.h create mode 100644 src/tools/rbd_mirror/image_map/LoadRequest.cc create mode 100644 src/tools/rbd_mirror/image_map/LoadRequest.h create mode 100644 src/tools/rbd_mirror/image_map/Policy.cc create mode 100644 src/tools/rbd_mirror/image_map/Policy.h create mode 100644 src/tools/rbd_mirror/image_map/SimplePolicy.cc create mode 100644 src/tools/rbd_mirror/image_map/SimplePolicy.h create mode 100644 src/tools/rbd_mirror/image_map/StateTransition.cc create mode 100644 src/tools/rbd_mirror/image_map/StateTransition.h create mode 100644 src/tools/rbd_mirror/image_map/Types.cc 
create mode 100644 src/tools/rbd_mirror/image_map/Types.h create mode 100644 src/tools/rbd_mirror/image_map/UpdateRequest.cc create mode 100644 src/tools/rbd_mirror/image_map/UpdateRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/BootstrapRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/CloseImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/CloseImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/CreateImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/CreateImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/GetMirrorImageIdRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/OpenImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/OpenImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/OpenLocalImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/PrepareRemoteImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/Replayer.h create mode 100644 src/tools/rbd_mirror/image_replayer/ReplayerListener.h create mode 100644 src/tools/rbd_mirror/image_replayer/StateBuilder.cc create mode 100644 src/tools/rbd_mirror/image_replayer/StateBuilder.h create mode 100644 src/tools/rbd_mirror/image_replayer/TimeRollingMean.cc create mode 100644 src/tools/rbd_mirror/image_replayer/TimeRollingMean.h create mode 100644 src/tools/rbd_mirror/image_replayer/Types.h create mode 100644 src/tools/rbd_mirror/image_replayer/Utils.cc create mode 100644 src/tools/rbd_mirror/image_replayer/Utils.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/CreateLocalImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/EventPreprocessor.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/PrepareReplayRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/ReplayStatusFormatter.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/Replayer.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/Replayer.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/StateBuilder.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/StateBuilder.h create mode 100644 src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.cc create mode 100644 src/tools/rbd_mirror/image_replayer/journal/SyncPointHandler.h create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/ApplyImageStateRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.cc create mode 100644 
src/tools/rbd_mirror/image_replayer/snapshot/CreateLocalImageRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.cc create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/PrepareReplayRequest.h create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/Replayer.cc create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/Replayer.h create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.cc create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/StateBuilder.h create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/Utils.cc create mode 100644 src/tools/rbd_mirror/image_replayer/snapshot/Utils.h create mode 100644 src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.cc create mode 100644 src/tools/rbd_mirror/image_sync/SyncPointCreateRequest.h create mode 100644 src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.cc create mode 100644 src/tools/rbd_mirror/image_sync/SyncPointPruneRequest.h create mode 100644 src/tools/rbd_mirror/image_sync/Types.h create mode 100644 src/tools/rbd_mirror/image_sync/Utils.cc create mode 100644 src/tools/rbd_mirror/image_sync/Utils.h create mode 100644 src/tools/rbd_mirror/instance_watcher/Types.cc create mode 100644 src/tools/rbd_mirror/instance_watcher/Types.h create mode 100644 src/tools/rbd_mirror/instances/Types.h create mode 100644 src/tools/rbd_mirror/leader_watcher/Types.cc create mode 100644 src/tools/rbd_mirror/leader_watcher/Types.h create mode 100644 src/tools/rbd_mirror/main.cc create mode 100644 src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.cc create mode 100644 src/tools/rbd_mirror/pool_watcher/RefreshImagesRequest.h create mode 100644 src/tools/rbd_mirror/pool_watcher/Types.h create mode 100644 src/tools/rbd_mirror/service_daemon/Types.cc create mode 100644 src/tools/rbd_mirror/service_daemon/Types.h create mode 100644 src/tools/rbd_nbd/CMakeLists.txt create mode 100644 src/tools/rbd_nbd/nbd-netlink.h create mode 100644 src/tools/rbd_nbd/rbd-nbd.cc create mode 100755 src/tools/rbd_nbd/rbd-nbd_quiesce create mode 100644 src/tools/rbd_recover_tool/FAQ create mode 100644 src/tools/rbd_recover_tool/README create mode 100644 src/tools/rbd_recover_tool/TODO create mode 100644 src/tools/rbd_recover_tool/common_h create mode 100644 src/tools/rbd_recover_tool/config/mds_host create mode 100644 src/tools/rbd_recover_tool/config/mon_host create mode 100644 src/tools/rbd_recover_tool/config/osd_host_path create mode 100644 src/tools/rbd_recover_tool/database_h create mode 100644 src/tools/rbd_recover_tool/epoch_h create mode 100644 src/tools/rbd_recover_tool/metadata_h create mode 100755 src/tools/rbd_recover_tool/osd_job create mode 100755 src/tools/rbd_recover_tool/rbd-recover-tool create mode 100755 src/tools/rbd_recover_tool/test_rbd_recover_tool.sh create mode 100644 src/tools/rbd_wnbd/CMakeLists.txt create mode 100644 src/tools/rbd_wnbd/rbd_wnbd.cc create mode 100644 src/tools/rbd_wnbd/rbd_wnbd.h create mode 100644 src/tools/rbd_wnbd/wnbd_handler.cc create mode 100644 src/tools/rbd_wnbd/wnbd_handler.h create mode 100644 src/tools/rbd_wnbd/wnbd_wmi.cc create mode 100644 src/tools/rbd_wnbd/wnbd_wmi.h create mode 100644 src/tools/rebuild_mondb.cc create mode 100644 src/tools/rebuild_mondb.h create mode 100755 src/tools/rgw/parse-cr-dump.py create mode 100644 src/tools/scratchtool.c create mode 100644 src/tools/scratchtoolpp.cc create mode 100755 src/tools/setup-virtualenv.sh (limited to 'src/tools') diff --git a/src/tools/CMakeLists.txt 
b/src/tools/CMakeLists.txt new file mode 100644 index 000000000..aeb9d0248 --- /dev/null +++ b/src/tools/CMakeLists.txt @@ -0,0 +1,153 @@ +set(rados_srcs + rados/rados.cc + RadosDump.cc + rados/RadosImport.cc + rados/PoolDump.cc + ${PROJECT_SOURCE_DIR}/src/common/util.cc + ${PROJECT_SOURCE_DIR}/src/common/obj_bencher.cc + ${PROJECT_SOURCE_DIR}/src/osd/ECUtil.cc) +add_executable(rados ${rados_srcs}) + +target_link_libraries(rados librados global ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) +if(WITH_LIBRADOSSTRIPER) + target_link_libraries(rados radosstriper) +else() + target_link_libraries(rados cls_lock_client) +endif() +install(TARGETS rados DESTINATION bin) + +if(NOT WIN32) + set(neorados_srcs + neorados.cc) + add_executable(neorados ${neorados_srcs}) + target_link_libraries(neorados libneorados spawn fmt::fmt ${CMAKE_DL_LIBS}) + #install(TARGETS neorados DESTINATION bin) +endif() + +if(WITH_TESTS) +add_executable(ceph_scratchtool scratchtool.c) +target_link_libraries(ceph_scratchtool librados global) +install(TARGETS ceph_scratchtool DESTINATION bin) + +add_executable(ceph_scratchtoolpp scratchtoolpp.cc) +target_link_libraries(ceph_scratchtoolpp librados global) +install(TARGETS ceph_scratchtoolpp DESTINATION bin) + +add_executable(ceph_radosacl radosacl.cc) +target_link_libraries(ceph_radosacl librados global) +install(TARGETS ceph_radosacl DESTINATION bin) + +install(PROGRAMS + ceph-monstore-update-crush.sh + DESTINATION ${CMAKE_INSTALL_LIBDIR}/ceph) +endif(WITH_TESTS) + +add_executable(ceph-osdomap-tool ceph_osdomap_tool.cc) +target_link_libraries(ceph-osdomap-tool os global Boost::program_options) +install(TARGETS ceph-osdomap-tool DESTINATION bin) + +add_executable(ceph-monstore-tool + ceph_monstore_tool.cc + ../auth/cephx/CephxKeyServer.cc + ../mgr/mgr_commands.cc) +target_link_libraries(ceph-monstore-tool os global Boost::program_options) +install(TARGETS ceph-monstore-tool DESTINATION bin) + +add_executable(ceph-objectstore-tool + ceph_objectstore_tool.cc + rebuild_mondb.cc + RadosDump.cc) +target_link_libraries(ceph-objectstore-tool osd os global Boost::program_options ${CMAKE_DL_LIBS}) +if(WITH_FUSE) + target_link_libraries(ceph-objectstore-tool FUSE::FUSE) +endif(WITH_FUSE) +install(TARGETS ceph-objectstore-tool DESTINATION bin) + +if(WITH_LIBCEPHFS) +if(WITH_TESTS) + add_executable(ceph-client-debug ceph-client-debug.cc) + target_link_libraries(ceph-client-debug cephfs global client) + install(TARGETS ceph-client-debug DESTINATION bin) +endif(WITH_TESTS) +endif(WITH_LIBCEPHFS) + +add_executable(ceph-kvstore-tool + kvstore_tool.cc + ceph_kvstore_tool.cc) +target_link_libraries(ceph-kvstore-tool os global) +install(TARGETS ceph-kvstore-tool DESTINATION bin) + +set(ceph_conf_srcs ceph_conf.cc) +add_executable(ceph-conf ${ceph_conf_srcs}) +target_link_libraries(ceph-conf global) +install(TARGETS ceph-conf DESTINATION bin) + +set(crushtool_srcs crushtool.cc) +add_executable(crushtool ${crushtool_srcs}) +target_link_libraries(crushtool global) +install(TARGETS crushtool DESTINATION bin) + +set(monmaptool_srcs monmaptool.cc) +add_executable(monmaptool ${monmaptool_srcs}) +target_link_libraries(monmaptool global) +install(TARGETS monmaptool DESTINATION bin) + +set(osdomaptool_srcs osdmaptool.cc) +add_executable(osdmaptool ${osdomaptool_srcs}) +target_link_libraries(osdmaptool global) +install(TARGETS osdmaptool DESTINATION bin) + +install(PROGRAMS crushdiff DESTINATION bin) + +set(ceph-diff-sorted_srcs ceph-diff-sorted.cc) +add_executable(ceph-diff-sorted ${ceph-diff-sorted_srcs}) 
+set_target_properties(ceph-diff-sorted PROPERTIES + SKIP_RPATH TRUE + INSTALL_RPATH "") +install(TARGETS ceph-diff-sorted DESTINATION bin) + +if(WITH_TESTS) +set(ceph_psim_srcs psim.cc) +add_executable(ceph_psim ${ceph_psim_srcs}) +target_link_libraries(ceph_psim global) +install(TARGETS ceph_psim DESTINATION bin) +endif(WITH_TESTS) + +set(ceph_authtool_srcs ceph_authtool.cc) +add_executable(ceph-authtool ${ceph_authtool_srcs}) +target_link_libraries(ceph-authtool global ${EXTRALIBS} ${CRYPTO_LIBS}) +install(TARGETS ceph-authtool DESTINATION bin) + +if(WITH_TESTS) +set(ceph_dedup_tool_srcs ceph_dedup_tool.cc) +add_executable(ceph-dedup-tool ${ceph_dedup_tool_srcs}) +target_link_libraries(ceph-dedup-tool + librados + global + cls_cas_client + cls_cas_internal) +install(TARGETS ceph-dedup-tool DESTINATION bin) +endif(WITH_TESTS) + +if(WITH_CEPHFS) + add_subdirectory(cephfs) + add_subdirectory(cephfs_mirror) +endif(WITH_CEPHFS) + +if(WITH_RBD) + add_subdirectory(rbd) + add_subdirectory(rbd_mirror) + if(LINUX) + add_subdirectory(rbd_nbd) + endif() + if(WIN32) + add_subdirectory(rbd_wnbd) + endif() + if(FREEBSD) + add_subdirectory(rbd_ggate) + endif() +endif(WITH_RBD) + +add_subdirectory(immutable_object_cache) +add_subdirectory(ceph-dencoder) +add_subdirectory(erasure-code) diff --git a/src/tools/RadosDump.cc b/src/tools/RadosDump.cc new file mode 100644 index 000000000..014041bff --- /dev/null +++ b/src/tools/RadosDump.cc @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "RadosDump.h" + +using std::cerr; +using std::cout; + +int RadosDump::read_super() +{ + bufferlist ebl; + auto ebliter = ebl.cbegin(); + ssize_t bytes; + + bytes = ebl.read_fd(file_fd, super_header::FIXED_LENGTH); + if ((size_t)bytes != super_header::FIXED_LENGTH) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + sh.decode(ebliter); + + return 0; +} + + +int RadosDump::get_header(header *h) +{ + assert (h != NULL); + + bufferlist ebl; + auto ebliter = ebl.cbegin(); + ssize_t bytes; + + bytes = ebl.read_fd(file_fd, sh.header_size); + if ((size_t)bytes != sh.header_size) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + h->decode(ebliter); + + return 0; +} + +int RadosDump::get_footer(footer *f) +{ + ceph_assert(f != NULL); + + bufferlist ebl; + auto ebliter = ebl.cbegin(); + ssize_t bytes; + + bytes = ebl.read_fd(file_fd, sh.footer_size); + if ((size_t)bytes != sh.footer_size) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + f->decode(ebliter); + + if (f->magic != endmagic) { + cerr << "Bad footer magic" << std::endl; + return -EFAULT; + } + + return 0; +} + +int RadosDump::read_section(sectiontype_t *type, bufferlist *bl) +{ + header hdr; + ssize_t bytes; + + int ret = get_header(&hdr); + if (ret) + return ret; + + *type = hdr.type; + + bl->clear(); + bytes = bl->read_fd(file_fd, hdr.size); + if (bytes != hdr.size) { + cerr << "Unexpected EOF" << std::endl; + return -EFAULT; + } + + if (hdr.size > 0) { + footer ft; + ret = get_footer(&ft); + if (ret) + return ret; + } + + return 0; +} + + +int RadosDump::skip_object(bufferlist &bl) +{ + bufferlist ebl; + bool done = false; + while(!done) { + sectiontype_t type; + int ret = read_section(&type, &ebl); + if (ret) + return ret; + + if (type >= END_OF_TYPES) { + cout << "Skipping unknown object section type" << std::endl; + continue; + } + switch(type) { + case TYPE_DATA: + case TYPE_ATTRS: + case TYPE_OMAP_HDR: + case TYPE_OMAP: +#ifdef DIAGNOSTIC + cerr << "Skip type " << (int)type << std::endl; +#endif + break; + case TYPE_OBJECT_END: + done = true; + break; + default: + cerr << "Can't skip unknown type: " << type << std::endl; + return -EFAULT; + } + } + return 0; +} + +//Write super_header with its fixed 16 byte length +void RadosDump::write_super() +{ + if (dry_run) { + return; + } + + bufferlist superbl; + super_header sh; + footer ft; + + header hdr(TYPE_NONE, 0); + hdr.encode(superbl); + + sh.magic = super_header::super_magic; + sh.version = super_header::super_ver; + sh.header_size = superbl.length(); + superbl.clear(); + ft.encode(superbl); + sh.footer_size = superbl.length(); + superbl.clear(); + + sh.encode(superbl); + ceph_assert(super_header::FIXED_LENGTH == superbl.length()); + superbl.write_fd(file_fd); +} diff --git a/src/tools/RadosDump.h b/src/tools/RadosDump.h new file mode 100644 index 000000000..dd7951d22 --- /dev/null +++ b/src/tools/RadosDump.h @@ -0,0 +1,409 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#ifndef RADOS_DUMP_H_
+#define RADOS_DUMP_H_
+
+#include <stdint.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+
+#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
+
+typedef uint8_t sectiontype_t;
+typedef uint32_t mymagic_t;
+typedef int64_t mysize_t;
+
+enum {
+  TYPE_NONE = 0,
+  TYPE_PG_BEGIN,
+  TYPE_PG_END,
+  TYPE_OBJECT_BEGIN,
+  TYPE_OBJECT_END,
+  TYPE_DATA,
+  TYPE_ATTRS,
+  TYPE_OMAP_HDR,
+  TYPE_OMAP,
+  TYPE_PG_METADATA,
+  TYPE_POOL_BEGIN,
+  TYPE_POOL_END,
+  END_OF_TYPES,  //Keep at the end
+};
+
+const uint16_t shortmagic = 0xffce;  //goes into stream as "ceff"
+//endmagic goes into stream as "ceff ffec"
+const mymagic_t endmagic = (0xecff << 16) | shortmagic;
+
+//The first FIXED_LENGTH bytes are a fixed
+//portion of the export output. This includes the overall
+//version number, and size of header and footer.
+//THIS STRUCTURE CAN ONLY BE APPENDED TO. If it needs to expand,
+//the version can be bumped and then anything
+//can be added to the export format.
+struct super_header {
+  static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
+  // ver = 1, Initial version
+  // ver = 2, Add OSDSuperblock to pg_begin
+  static const uint32_t super_ver = 2;
+  static const uint32_t FIXED_LENGTH = 16;
+  uint32_t magic;
+  uint32_t version;
+  uint32_t header_size;
+  uint32_t footer_size;
+
+  super_header() : magic(0), version(0), header_size(0), footer_size(0) { }
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(magic, bl);
+    encode(version, bl);
+    encode(header_size, bl);
+    encode(footer_size, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(magic, bl);
+    decode(version, bl);
+    decode(header_size, bl);
+    decode(footer_size, bl);
+  }
+};
+
+struct header {
+  sectiontype_t type;
+  mysize_t size;
+  header(sectiontype_t type, mysize_t size) :
+    type(type), size(size) { }
+  header(): type(0), size(0) { }
+
+  void encode(bufferlist& bl) const {
+    uint32_t debug_type = (type << 24) | (type << 16) | shortmagic;
+    ENCODE_START(1, 1, bl);
+    encode(debug_type, bl);
+    encode(size, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    uint32_t debug_type;
+    DECODE_START(1, bl);
+    decode(debug_type, bl);
+    type = debug_type >> 24;
+    decode(size, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct footer {
+  mymagic_t magic;
+  footer() : magic(endmagic) { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(magic, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(magic, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
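To make the framing concrete before the section structs that follow: an export is the 16-byte super_header, then a run of records, each laid out as header, payload, footer. The sketch below is illustrative only (plain standalone C++, not part of the patch) and shows the magic values and that layout:

// Illustrative framing sketch -- standalone, not part of the patch.
#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t shortmagic = 0xffce;
  // super_header's magic is shortmagic twice; the footer's endmagic hits
  // the wire little-endian as "ceff ffec", matching the comments above.
  const uint32_t super_magic = (uint32_t(shortmagic) << 16) | shortmagic;
  const uint32_t endmagic = (0xecffu << 16) | shortmagic;
  // Stream shape: super_header {magic, version, header_size, footer_size}
  // (16 bytes total), then per record: header{type, size}, `size` payload
  // bytes, footer{endmagic}.
  std::printf("super_magic=0x%08x endmagic=0x%08x\n", super_magic, endmagic);
  return 0;
}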
+struct pg_begin {
+  spg_t pgid;
+  OSDSuperblock superblock;
+
+  pg_begin(spg_t pg, const OSDSuperblock& sb):
+    pgid(pg), superblock(sb) { }
+  pg_begin() { }
+
+  void encode(bufferlist& bl) const {
+    // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+    // shard will be NO_SHARD for a replicated pool. This means
+    // that we allow the decode by struct_v 2.
+    ENCODE_START(3, 2, bl);
+    encode(pgid.pgid, bl);
+    encode(superblock, bl);
+    encode(pgid.shard, bl);
+    ENCODE_FINISH(bl);
+  }
+  // NOTE: New super_ver prevents decode from ver 1
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(pgid.pgid, bl);
+    if (struct_v > 1) {
+      decode(superblock, bl);
+    }
+    if (struct_v > 2) {
+      decode(pgid.shard, bl);
+    } else {
+      pgid.shard = shard_id_t::NO_SHARD;
+    }
+    DECODE_FINISH(bl);
+  }
+};
+
+struct object_begin {
+  ghobject_t hoid;
+
+  // Duplicate what is in the OI_ATTR so we have it at the start
+  // of object processing.
+  object_info_t oi;
+
+  explicit object_begin(const ghobject_t &hoid): hoid(hoid) { }
+  object_begin() { }
+
+  // If superblock doesn't include CEPH_FS_FEATURE_INCOMPAT_SHARDS then
+  // generation will be NO_GEN, shard_id will be NO_SHARD for a replicated
+  // pool. This means we will allow the decode by struct_v 1.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(hoid.hobj, bl);
+    encode(hoid.generation, bl);
+    encode(hoid.shard_id, bl);
+    encode(oi, bl, -1);  /* FIXME: we always encode with full features */
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(hoid.hobj, bl);
+    if (struct_v > 1) {
+      decode(hoid.generation, bl);
+      decode(hoid.shard_id, bl);
+    } else {
+      hoid.generation = ghobject_t::NO_GEN;
+      hoid.shard_id = shard_id_t::NO_SHARD;
+    }
+    if (struct_v > 2) {
+      decode(oi, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+
+struct data_section {
+  uint64_t offset;
+  uint64_t len;
+  bufferlist databl;
+  data_section(uint64_t offset, uint64_t len, bufferlist bl):
+    offset(offset), len(len), databl(bl) { }
+  data_section(): offset(0), len(0) { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(offset, bl);
+    encode(len, bl);
+    encode(databl, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(offset, bl);
+    decode(len, bl);
+    decode(databl, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct attr_section {
+  using data_t = std::map<std::string, bufferlist, std::less<>>;
+  data_t data;
+  explicit attr_section(const data_t &data) : data(data) { }
+  explicit attr_section(std::map<std::string, bufferptr, std::less<>> &data_)
+  {
+    for (auto& [k, v] : data_) {
+      bufferlist bl;
+      bl.push_back(v);
+      data.emplace(k, std::move(bl));
+    }
+  }
+
+  attr_section() { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(data, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(data, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct omap_hdr_section {
+  bufferlist hdr;
+  explicit omap_hdr_section(bufferlist hdr) : hdr(hdr) { }
+  omap_hdr_section() { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(hdr, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(hdr, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct omap_section {
+  std::map<std::string, bufferlist> omap;
+  explicit omap_section(const std::map<std::string, bufferlist> &omap) :
+    omap(omap) { }
+  omap_section() { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(omap, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(omap, bl);
+    DECODE_FINISH(bl);
+  }
+};
+
+struct metadata_section {
+  // struct_ver is the on-disk version of original pg
+  __u8 struct_ver;  // for reference
+  epoch_t map_epoch;
+  pg_info_t info;
+  pg_log_t log;
+  PastIntervals past_intervals;
+  OSDMap osdmap;
+  bufferlist osdmap_bl;  // Used in lieu of encoding osdmap due to crc checking
+  std::map<eversion_t, hobject_t> divergent_priors;
+  pg_missing_t missing;
+
+  metadata_section(
+    __u8 struct_ver,
+    epoch_t map_epoch,
+    const pg_info_t &info,
+    const pg_log_t &log,
+    const PastIntervals &past_intervals,
+    const pg_missing_t &missing)
+    : struct_ver(struct_ver),
+      map_epoch(map_epoch),
+      info(info),
+      log(log),
+      past_intervals(past_intervals),
+      missing(missing) {}
+  metadata_section()
+    : struct_ver(0),
+      map_epoch(0) { }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(6, 6, bl);
+    encode(struct_ver, bl);
+    encode(map_epoch, bl);
+    encode(info, bl);
+    encode(log, bl);
+    encode(past_intervals, bl);
+    // Equivalent to osdmap.encode(bl, features); but
+    // preserving exact layout for CRC checking.
+    bl.append(osdmap_bl);
+    encode(divergent_priors, bl);
+    encode(missing, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(6, bl);
+    decode(struct_ver, bl);
+    decode(map_epoch, bl);
+    decode(info, bl);
+    decode(log, bl);
+    if (struct_v >= 6) {
+      decode(past_intervals, bl);
+    } else if (struct_v > 1) {
+      std::cout << "NOTICE: Older export with classic past_intervals" << std::endl;
+    } else {
+      std::cout << "NOTICE: Older export without past_intervals" << std::endl;
+    }
+    if (struct_v > 2) {
+      osdmap.decode(bl);
+    } else {
+      std::cout << "WARNING: Older export without OSDMap information" << std::endl;
+    }
+    if (struct_v > 3) {
+      decode(divergent_priors, bl);
+    }
+    if (struct_v > 4) {
+      decode(missing, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+
+/**
+ * Superclass for classes that will need to handle a serialized RADOS
+ * dump. Requires that the serialized dump be opened with a known FD.
+ */
+class RadosDump
+{
+  protected:
+    int file_fd;
+    super_header sh;
+    bool dry_run;
+
+  public:
+    RadosDump(int file_fd_, bool dry_run_)
+      : file_fd(file_fd_), dry_run(dry_run_)
+    {}
+
+    int read_super();
+    int get_header(header *h);
+    int get_footer(footer *f);
+    int read_section(sectiontype_t *type, bufferlist *bl);
+    int skip_object(bufferlist &bl);
+    void write_super();
+
+    // Define this in .h because it's templated
+    template <typename T>
+    int write_section(sectiontype_t type, const T& obj, int fd) {
+      if (dry_run)
+        return 0;
+      bufferlist blhdr, bl, blftr;
+      obj.encode(bl);
+      header hdr(type, bl.length());
+      hdr.encode(blhdr);
+      footer ft;
+      ft.encode(blftr);
+
+      int ret = blhdr.write_fd(fd);
+      if (ret) return ret;
+      ret = bl.write_fd(fd);
+      if (ret) return ret;
+      ret = blftr.write_fd(fd);
+      return ret;
+    }
+
+    int write_simple(sectiontype_t type, int fd)
+    {
+      if (dry_run)
+        return 0;
+      bufferlist hbl;
+
+      header hdr(type, 0);
+      hdr.encode(hbl);
+      return hbl.write_fd(fd);
+    }
+};
+
+#endif
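The class above gives importers a small pull API over that framing. A minimal consumer sketch (assumed usage inside the Ceph tree; list_sections() is a hypothetical helper, not part of the patch):

// Assumed-usage sketch: walk the section records of an open export fd.
// list_sections() is hypothetical; error handling trimmed.
int list_sections(int fd)
{
  RadosDump dump(fd, /*dry_run=*/true);
  int r = dump.read_super();            // decode the fixed 16-byte prologue
  if (r)
    return r;
  for (;;) {
    sectiontype_t type;
    bufferlist bl;
    r = dump.read_section(&type, &bl);  // one header|payload|footer record
    if (r)
      return r;
    if (type == TYPE_PG_END)            // a PG's record stream ends here
      break;
  }
  return 0;
}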
+
+
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "client/Inode.h"
+#include "client/Dentry.h"
+#include "client/Dir.h"
+#include "include/cephfs/libcephfs.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_client
+
+using namespace std;
+
+void usage()
+{
+  std::cout << "Usage: ceph-client-debug [options] <inode number>" << std::endl;
+  generic_client_usage();
+}
+
+
+/**
+ * Given an inode, look up the path from the Client cache: assumes
+ * client cache is fully populated.
+ */
+void traverse_dentries(Inode *ino, std::vector<Dentry*> &parts)
+{
+  if (ino->dentries.empty()) {
+    return;
+  }
+
+  Dentry* dn = *(ino->dentries.begin());
+  parts.push_back(dn);
+  traverse_dentries(dn->dir->parent_inode, parts);
+}
+
+
+/**
+ * Given an inode, send lookup requests to the MDS for
+ * all its ancestors, such that the full trace will be
+ * populated in client cache.
+ */
+int lookup_trace(ceph_mount_info *client, inodeno_t const ino)
+{
+  Inode *inode;
+  int r = ceph_ll_lookup_inode(client, ino, &inode);
+  if (r != 0) {
+    return r;
+  } else {
+    if (!inode->dentries.empty()) {
+      Dentry *dn = *(inode->dentries.begin());
+      ceph_assert(dn->dir);
+      ceph_assert(dn->dir->parent_inode);
+      r = lookup_trace(client, dn->dir->parent_inode->ino);
+      if (r) {
+        return r;
+      }
+    } else {
+      // We reached the root of the tree
+      ceph_assert(inode->ino == CEPH_INO_ROOT);
+    }
+  }
+
+  return r;
+}
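// [editor's example] traverse_dentries() collects dentries leaf-first, which
// is why main() below iterates the vector in reverse when printing. A
// hypothetical helper making that ordering explicit (assumes Dentry exposes
// a `name` member, as used by dump() below; not part of the original file):
static std::string join_path(const std::vector<Dentry*> &parts)
{
  std::string path;
  // parts.front() is the leaf dentry, so walk backwards to build a
  // root-to-leaf path.
  for (auto p = parts.rbegin(); p != parts.rend(); ++p) {
    path += "/";
    path += (*p)->name;
  }
  return path.empty() ? "/" : path;
}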
+
+
+int main(int argc, const char **argv)
+{
+  // Argument handling
+  auto args = argv_to_vec(argc, argv);
+  if (args.empty()) {
+    cerr << argv[0] << ": -h or --help for usage" << std::endl;
+    exit(1);
+  }
+  if (ceph_argparse_need_usage(args)) {
+    usage();
+    exit(0);
+  }
+
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS|
+                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+  common_init_finish(g_ceph_context);
+
+  // Expect exactly one positional argument (inode number)
+  if (args.size() != 1) {
+    cerr << "missing positional argument (inode number)" << std::endl;
+    exit(1);
+  }
+  char const *inode_str = args[0];
+  inodeno_t inode = strtoll(inode_str, NULL, 0);
+  if (inode <= 0) {
+    derr << "Invalid inode: " << inode_str << dendl;
+    return -1;
+  }
+
+  // Initialize filesystem client
+  struct ceph_mount_info *client;
+  int r = ceph_create_with_context(&client, g_ceph_context);
+  if (r) {
+    derr << "Error initializing libcephfs: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = ceph_mount(client, "/");
+  if (r) {
+    derr << "Error mounting: " << cpp_strerror(r) << dendl;
+    ceph_shutdown(client);
+    return r;
+  }
+
+
+  // Populate client cache with inode of interest & ancestors
+  r = lookup_trace(client, inode);
+  if (r) {
+    derr << "Error looking up inode " << std::hex << inode << std::dec <<
+      ": " << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  // Retrieve inode of interest
+  struct vinodeno_t vinode;
+  vinode.ino = inode;
+  vinode.snapid = CEPH_NOSNAP;
+  Inode *ino = ceph_ll_get_inode(client, vinode);
+
+  // Retrieve dentry trace
+  std::vector<Dentry*> path;
+  traverse_dentries(ino, path);
+
+  // Print inode and path as a JSON object
+  JSONFormatter jf(true);
+  jf.open_object_section("client_debug");
+  {
+    jf.open_object_section("inode");
+    {
+      ino->dump(&jf);
+    }
+    jf.close_section(); // inode
+    jf.open_array_section("path");
+    {
+      for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
+        jf.open_object_section("dentry");
+        {
+          (*p)->dump(&jf);
+        }
+        jf.close_section(); // dentry
+      }
+    }
+    jf.close_section(); // path
+  }
+  jf.close_section(); // client_debug
+  jf.flush(std::cout);
+  std::cout << std::endl;
+
+  // Release Inode references
+  ceph_ll_forget(client, ino, 1);
+  for (std::vector<Dentry*>::reverse_iterator p = path.rbegin(); p != path.rend(); ++p) {
+    ceph_ll_forget(client, (*p)->inode.get(), 1);
+  }
+  ino = NULL;
+  path.clear();
+
+  // Shut down
+  r = ceph_unmount(client);
+  if (r) {
+    derr << "Error unmounting: " << cpp_strerror(r) << dendl;
+  }
+  ceph_shutdown(client);
+
+  return r;
+}
diff --git a/src/tools/ceph-dencoder/CMakeLists.txt b/src/tools/ceph-dencoder/CMakeLists.txt
new file mode 100644
index 000000000..5cb56e136
--- /dev/null
+++ b/src/tools/ceph-dencoder/CMakeLists.txt
@@ -0,0 +1,111 @@
+## dencoder
+set_source_files_properties(
+  ceph_dencoder.cc
+  APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h)
+
+if(HAS_VTA)
+  set_source_files_properties(ceph_dencoder.cc
+    PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+
+set(dencoder_srcs
+  ceph_dencoder.cc
+  ../../include/uuid.cc
+  ../../include/utime.cc
+  $<TARGET_OBJECTS:common_texttable_obj>)
+add_executable(ceph-dencoder ${dencoder_srcs})
+set_target_properties(ceph-dencoder PROPERTIES
+  JOB_POOL_COMPILE heavy_compile_job_pool
+  JOB_POOL_LINK heavy_link_job_pool)
+
+set(denc_plugin_dir ${CEPH_INSTALL_FULL_PKGLIBDIR}/denc)
+add_custom_target(ceph-dencoder-modules)
+
+function(add_denc_mod name)
+  add_library(${name} SHARED
+    ${ARGN})
+  set_target_properties(${name} PROPERTIES
+    PREFIX ""
+    OUTPUT_NAME ${name}
+    CXX_VISIBILITY_PRESET hidden
+    VISIBILITY_INLINES_HIDDEN ON)
+  install(
+    TARGETS ${name}
+    DESTINATION ${denc_plugin_dir})
+  add_dependencies(ceph-dencoder-modules
+    ${name})
+endfunction()
+
+add_denc_mod(denc-mod-common
+  common_types.cc)
+target_link_libraries(denc-mod-common
+  journal
+  cls_cas_internal
+  cls_lock_client
+  cls_refcount_client
+  cls_timeindex_client)
+add_denc_mod(denc-mod-osd
+  osd_types.cc)
+target_link_libraries(denc-mod-osd
+  os
+  osd
+  mon
+  erasure_code
+  global)
+
+if(WITH_RADOSGW)
+  add_denc_mod(denc-mod-rgw
+    rgw_types.cc
+    ${CMAKE_SOURCE_DIR}/src/rgw/rgw_dencoder.cc)
+  target_include_directories(denc-mod-rgw
+    SYSTEM PRIVATE "${CMAKE_SOURCE_DIR}/src/rgw")
+  target_link_libraries(denc-mod-rgw
+    rgw_a
+    cls_rgw_client
+    cls_journal_client)
+  if(WITH_RADOSGW_AMQP_ENDPOINT)
+    target_link_libraries(denc-mod-rgw
+      rabbitmq ssl)
+  endif()
+  if(WITH_RADOSGW_KAFKA_ENDPOINT)
+    target_link_libraries(denc-mod-rgw
+      rdkafka)
+  endif()
+endif()
+
+if(WITH_RBD)
+  add_denc_mod(denc-mod-rbd
+    rbd_types.cc)
+  target_link_libraries(denc-mod-rbd
+    cls_rbd_client
+    rbd_mirror_types
+    rbd_types
+    rbd_replay_types)
+  if(WITH_KRBD)
+    target_link_libraries(denc-mod-rbd
+      krbd)
+  endif()
+endif()
+
+if(WITH_CEPHFS)
+  add_denc_mod(denc-mod-cephfs
+    mds_types.cc)
+  target_link_libraries(denc-mod-cephfs
+    mds)
+endif()
+
+target_compile_definitions(ceph-dencoder PRIVATE
+  "CEPH_DENC_MOD_DIR=\"${denc_plugin_dir}\"")
+
+target_link_libraries(ceph-dencoder
+  StdFilesystem::filesystem
+  global
+  ${DENCODER_EXTRALIBS}
+  cls_log_client
+  cls_version_client
+  cls_user_client
+  cls_cas_client
+  ${EXTRALIBS}
+  ${CMAKE_DL_LIBS}
+  ${ALLOC_LIBS})
+install(TARGETS ceph-dencoder DESTINATION bin)
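// [editor's example] ceph_dencoder.cc below drives encode/decode generically
// through the Dencoder registry. For intuition, this is a hand-written
// sketch of the round trip for one concrete type (illustrative only, not
// part of the patch; utime_t has the standard encode/decode free functions):
//
//   #include "include/utime.h"
//   #include "include/encoding.h"
//
//   bool roundtrip_is_stable(const utime_t& in)
//   {
//     bufferlist a, b;
//     encode(in, a);          // "encode": serialize the in-memory object
//     utime_t out;
//     auto p = a.cbegin();
//     decode(out, p);         // "decode": rebuild it from the byte stream
//     encode(out, b);         // re-encode the rebuilt object
//     // A deterministic type reproduces the identical byte stream, which
//     // is what the is_deterministic command below checks for.
//     return a.contents_equal(b);
//   }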
diff --git a/src/tools/ceph-dencoder/ceph_dencoder.cc b/src/tools/ceph-dencoder/ceph_dencoder.cc
new file mode 100644
index 000000000..a278e0862
--- /dev/null
+++ b/src/tools/ceph-dencoder/ceph_dencoder.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include
+
+#include
+#include
+
+#include "ceph_ver.h"
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "denc_plugin.h"
+#include "denc_registry.h"
+
+#define MB(m) ((m) * 1024 * 1024)
+
+namespace fs = std::filesystem;
+
+using namespace std;
+
+void usage(ostream &out)
+{
+  out << "usage: ceph-dencoder [commands ...]" << std::endl;
+  out << "\n";
+  out << "  version                print version string (to stdout)\n";
+  out << "\n";
+  out << "  import <encfile>       read encoded data from encfile\n";
+  out << "  export <outfile>       write encoded data to outfile\n";
+  out << "\n";
+  out << "  set_features <features>  set feature bits used for encoding\n";
+  out << "  get_features           print feature bits (int) to stdout\n";
+  out << "\n";
+  out << "  list_types             list supported types\n";
+  out << "  type <classname>       select in-memory type\n";
+  out << "  skip <num>             skip <num> leading bytes before decoding\n";
+  out << "  decode                 decode into in-memory object\n";
+  out << "  encode                 encode in-memory object\n";
+  out << "  dump_json              dump in-memory object as json (to stdout)\n";
+  out << "  hexdump                print encoded data in hex\n";
+  out << "  get_struct_v           print version of the encoded object\n";
+  out << "  get_struct_compat     print the oldest version of decoder that can decode the encoded object\n";
+  out << "\n";
+  out << "  copy                   copy object (via operator=)\n";
+  out << "  copy_ctor              copy object (via copy ctor)\n";
+  out << "\n";
+  out << "  count_tests            print number of generated test objects (to stdout)\n";
+  out << "  select_test <n>        select generated test object as in-memory object\n";
+  out << "  is_deterministic       exit w/ success if type encodes deterministically\n";
+}
+
+vector<DencoderPlugin> load_plugins()
+{
+  fs::path mod_dir{CEPH_DENC_MOD_DIR};
+  if (auto ceph_lib = getenv("CEPH_LIB"); ceph_lib) {
+    mod_dir = ceph_lib;
+  } else if (fs::is_regular_file("CMakeCache.txt")) {
+    mod_dir = std::filesystem::canonical("lib");
+  }
+  if (!fs::is_directory(mod_dir)) {
+    std::cerr << "unable to load dencoders from "
+              << std::quoted(mod_dir.native()) << ". "
+              << "it is not a directory."
<< std::endl; + return {}; + } + vector dencoder_plugins; + for (auto& entry : fs::directory_iterator(mod_dir)) { + static const string_view DENC_MOD_PREFIX = "denc-mod-"; + if (entry.path().stem().string().compare(0, DENC_MOD_PREFIX.size(), + DENC_MOD_PREFIX) != 0) { + continue; + } + DencoderPlugin plugin(entry); + if (!plugin.good()) { + continue; + } + dencoder_plugins.push_back(std::move(plugin)); + } + return dencoder_plugins; +} + +int main(int argc, const char **argv) +{ + vector plugins = load_plugins(); + DencoderRegistry registry; + for (auto& plugin : plugins) { + for (auto& [name, denc] : plugin.register_dencoders()) { + registry.register_dencoder(name, denc); + } + } + + auto args = argv_to_vec(argc, argv); + env_to_vec(args); + + Dencoder *den = NULL; + uint64_t features = CEPH_FEATURES_SUPPORTED_DEFAULT; + bufferlist encbl; + uint64_t skip = 0; + + if (args.empty()) { + cerr << "-h for help" << std::endl; + return 1; + } + for (std::vector::iterator i = args.begin(); i != args.end(); ++i) { + string err; + + auto& dencoders = registry.get(); + if (*i == string("help") || *i == string("-h") || *i == string("--help")) { + usage(cout); + return 0; + } else if (*i == string("version")) { + cout << CEPH_GIT_NICE_VER << std::endl; + } else if (*i == string("list_types")) { + for (auto& dencoder : dencoders) + cout << dencoder.first << std::endl; + return 0; + } else if (*i == string("type")) { + ++i; + if (i == args.end()) { + cerr << "expecting type" << std::endl; + return 1; + } + string cname = *i; + if (!dencoders.count(cname)) { + cerr << "class '" << cname << "' unknown" << std::endl; + return 1; + } + den = dencoders[cname]; + den->generate(); + } else if (*i == string("skip")) { + ++i; + if (i == args.end()) { + cerr << "expecting byte count" << std::endl; + return 1; + } + skip = atoi(*i); + } else if (*i == string("get_features")) { + cout << CEPH_FEATURES_SUPPORTED_DEFAULT << std::endl; + return 0; + } else if (*i == string("set_features")) { + ++i; + if (i == args.end()) { + cerr << "expecting features" << std::endl; + return 1; + } + features = atoll(*i); + } else if (*i == string("encode")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + den->encode(encbl, features | CEPH_FEATURE_RESERVED); // hack for OSDMap + } else if (*i == string("decode")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + err = den->decode(encbl, skip); + } else if (*i == string("copy_ctor")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + den->copy_ctor(); + } else if (*i == string("copy")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + den->copy(); + } else if (*i == string("dump_json")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + JSONFormatter jf(true); + jf.open_object_section("object"); + den->dump(&jf); + jf.close_section(); + jf.flush(cout); + cout << std::endl; + + } else if (*i == string("hexdump")) { + encbl.hexdump(cout); + } else if (*i == string("get_struct_v")) { + std::cout << den->get_struct_v(encbl, 0) << std::endl; + } else if (*i == string("get_struct_compat")) { + std::cout << den->get_struct_v(encbl, sizeof(uint8_t)) << std::endl; + } else if (*i == string("import")) { + ++i; + if (i == args.end()) { + cerr << "expecting filename" << std::endl; + return 1; + } + int r; + if (*i == string("-")) { + *i = "stdin"; + // Read up to 1mb if 
stdin specified + r = encbl.read_fd(STDIN_FILENO, MB(1)); + } else { + r = encbl.read_file(*i, &err); + } + if (r < 0) { + cerr << "error reading " << *i << ": " << err << std::endl; + return 1; + } + + } else if (*i == string("export")) { + ++i; + if (i == args.end()) { + cerr << "expecting filename" << std::endl; + return 1; + } + int fd = ::open(*i, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644); + if (fd < 0) { + cerr << "error opening " << *i << " for write: " << cpp_strerror(errno) << std::endl; + return 1; + } + int r = encbl.write_fd(fd); + if (r < 0) { + cerr << "error writing " << *i << ": " << cpp_strerror(errno) << std::endl; + return 1; + } + ::close(fd); + + } else if (*i == string("count_tests")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + cout << den->num_generated() << std::endl; + } else if (*i == string("select_test")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + ++i; + if (i == args.end()) { + cerr << "expecting instance number" << std::endl; + return 1; + } + int n = atoi(*i); + err = den->select_generated(n); + } else if (*i == string("is_deterministic")) { + if (!den) { + cerr << "must first select type with 'type '" << std::endl; + return 1; + } + if (den->is_deterministic()) + return 0; + else + return 1; + } else { + cerr << "unknown option '" << *i << "'" << std::endl; + return 1; + } + if (err.length()) { + cerr << "error: " << err << std::endl; + return 1; + } + } + return 0; +} diff --git a/src/tools/ceph-dencoder/ceph_time.h b/src/tools/ceph-dencoder/ceph_time.h new file mode 100644 index 000000000..c27cb5746 --- /dev/null +++ b/src/tools/ceph-dencoder/ceph_time.h @@ -0,0 +1,68 @@ +#ifndef TEST_CEPH_TIME_H +#define TEST_CEPH_TIME_H + +#include + +#include "include/encoding.h" +#include "common/ceph_time.h" +#include "common/Formatter.h" + +// wrapper for ceph::real_time that implements the dencoder interface +template +class time_point_wrapper { + using time_point = typename Clock::time_point; + time_point t; + public: + time_point_wrapper() = default; + explicit time_point_wrapper(const time_point& t) : t(t) {} + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(t, bl); + } + void decode(bufferlist::const_iterator &p) { + using ceph::decode; + decode(t, p); + } + void dump(Formatter* f) { + auto epoch_time = Clock::to_time_t(t); + f->dump_string("time", std::ctime(&epoch_time)); + } + static void generate_test_instances(std::list& ls) { + constexpr time_t t{455500800}; // Ghostbusters release date + ls.push_back(new time_point_wrapper(Clock::from_time_t(t))); + } +}; + +using real_time_wrapper = time_point_wrapper; +WRITE_CLASS_ENCODER(real_time_wrapper) + +using coarse_real_time_wrapper = time_point_wrapper; +WRITE_CLASS_ENCODER(coarse_real_time_wrapper) + +// wrapper for ceph::timespan that implements the dencoder interface +class timespan_wrapper { + ceph::timespan d; + public: + timespan_wrapper() = default; + explicit timespan_wrapper(const ceph::timespan& d) : d(d) {} + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(d, bl); + } + void decode(bufferlist::const_iterator &p) { + using ceph::decode; + decode(d, p); + } + void dump(Formatter* f) { + f->dump_int("timespan", d.count()); + } + static void generate_test_instances(std::list& ls) { + constexpr std::chrono::seconds d{7377}; // marathon world record (2:02:57) + ls.push_back(new timespan_wrapper(d)); + } +}; +WRITE_CLASS_ENCODER(timespan_wrapper) + +#endif diff 
--git a/src/tools/ceph-dencoder/common_types.cc b/src/tools/ceph-dencoder/common_types.cc new file mode 100644 index 000000000..fa763c3bb --- /dev/null +++ b/src/tools/ceph-dencoder/common_types.cc @@ -0,0 +1,36 @@ +#include "acconfig.h" +#include +using namespace std; +#include "include/ceph_features.h" + +#define TYPE(t) +#define TYPE_STRAYDATA(t) +#define TYPE_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL(t) +#define TYPE_FEATUREFUL_STRAYDATA(t) +#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL_NOCOPY(t) +#define TYPE_NOCOPY(t) +#define MESSAGE(t) +#include "common_types.h" +#undef TYPE +#undef TYPE_STRAYDATA +#undef TYPE_NONDETERMINISTIC +#undef TYPE_NOCOPY +#undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA +#undef TYPE_FEATUREFUL_NONDETERMINISTIC +#undef TYPE_FEATUREFUL_NOCOPY +#undef MESSAGE + +#include "denc_plugin.h" + +DENC_API void register_dencoders(DencoderPlugin* plugin) +{ +#include "common_types.h" +} + +DENC_API void unregister_dencoders(DencoderPlugin* plugin) +{ + plugin->unregister_dencoders(); +} diff --git a/src/tools/ceph-dencoder/common_types.h b/src/tools/ceph-dencoder/common_types.h new file mode 100644 index 000000000..3180e3476 --- /dev/null +++ b/src/tools/ceph-dencoder/common_types.h @@ -0,0 +1,454 @@ +#include "ceph_time.h" +TYPE(real_time_wrapper) +TYPE(coarse_real_time_wrapper) +TYPE(timespan_wrapper) + +#include "include/utime.h" +TYPE(utime_t) + +#include "include/uuid.h" +TYPE(uuid_d) + +#include "sstring.h" +TYPE(sstring_wrapper) + +#include "str.h" +TYPE(string_wrapper) + +#include "include/CompatSet.h" +TYPE(CompatSet) + +#include "include/filepath.h" +TYPE(filepath) + +#include "include/fs_types.h" +TYPE_FEATUREFUL(file_layout_t) + +#include "include/util.h" +TYPE(ceph_data_stats) + +#include "common/bit_vector.hpp" +TYPE(BitVector<2>) + +#include "common/bloom_filter.hpp" +TYPE(bloom_filter) +TYPE(compressible_bloom_filter) + +#include "common/DecayCounter.h" +TYPE(DecayCounter) + +#include "common/histogram.h" +TYPE(pow2_hist_t) + +#include "common/hobject.h" +TYPE(hobject_t) +TYPE(ghobject_t) + +#include "common/LogEntry.h" +TYPE_FEATUREFUL(LogEntry) +TYPE_FEATUREFUL(LogSummary) + +#include "common/SloppyCRCMap.h" +TYPE(SloppyCRCMap) + +#include "common/snap_types.h" +TYPE(SnapContext) +TYPE(SnapRealmInfo) + +#include "msg/msg_types.h" +TYPE(entity_name_t) +TYPE_FEATUREFUL(entity_addr_t) +TYPE_FEATUREFUL(entity_addrvec_t) +TYPE_FEATUREFUL(entity_inst_t) + +#include "crush/CrushWrapper.h" +TYPE_FEATUREFUL_NOCOPY(CrushWrapper) + +#include "cls/cas/cls_cas_ops.h" +TYPE(cls_cas_chunk_create_or_get_ref_op) +TYPE(cls_cas_chunk_get_ref_op) +TYPE(cls_cas_chunk_put_ref_op) + +#include "cls/cas/cls_cas_internal.h" +TYPE(chunk_refs_t) + +#include "cls/lock/cls_lock_types.h" +TYPE(rados::cls::lock::locker_id_t) +TYPE_FEATUREFUL(rados::cls::lock::locker_info_t) +TYPE_FEATUREFUL(rados::cls::lock::lock_info_t) + +#include "cls/lock/cls_lock_ops.h" +TYPE(cls_lock_lock_op) +TYPE(cls_lock_unlock_op) +TYPE(cls_lock_break_op) +TYPE(cls_lock_get_info_op) +TYPE_FEATUREFUL(cls_lock_get_info_reply) +TYPE(cls_lock_list_locks_reply) +TYPE(cls_lock_assert_op) +TYPE(cls_lock_set_cookie_op) + +#include "cls/refcount/cls_refcount_ops.h" +TYPE(cls_refcount_get_op) +TYPE(cls_refcount_put_op) +TYPE(cls_refcount_set_op) +TYPE(cls_refcount_read_op) +TYPE(cls_refcount_read_ret) +TYPE(obj_refcount) + +#include "cls/timeindex/cls_timeindex_types.h" +TYPE(cls_timeindex_entry) + +#include "journal/Entry.h" +TYPE(journal::Entry) + +// --- messages --- +#include 
"messages/MAuth.h" +MESSAGE(MAuth) + +#include "messages/MAuthReply.h" +MESSAGE(MAuthReply) + +#include "messages/MCacheExpire.h" +MESSAGE(MCacheExpire) + +#include "messages/MClientCapRelease.h" +MESSAGE(MClientCapRelease) + +#include "messages/MClientCaps.h" +MESSAGE(MClientCaps) + +#include "messages/MClientLease.h" +MESSAGE(MClientLease) + +#include "messages/MClientReconnect.h" +MESSAGE(MClientReconnect) + +#include "messages/MClientReply.h" +MESSAGE(MClientReply) + +#include "messages/MClientRequest.h" +MESSAGE(MClientRequest) + +#include "messages/MClientRequestForward.h" +MESSAGE(MClientRequestForward) + +#include "messages/MClientQuota.h" +MESSAGE(MClientQuota) + +#include "messages/MClientSession.h" +MESSAGE(MClientSession) + +#include "messages/MClientSnap.h" +MESSAGE(MClientSnap) + +#include "messages/MCommand.h" +MESSAGE(MCommand) + +#include "messages/MCommandReply.h" +MESSAGE(MCommandReply) + +#include "messages/MConfig.h" +MESSAGE(MConfig) + +#include "messages/MDentryLink.h" +MESSAGE(MDentryLink) + +#include "messages/MDentryUnlink.h" +MESSAGE(MDentryUnlink) + +#include "messages/MDirUpdate.h" +MESSAGE(MDirUpdate) + +#include "messages/MDiscover.h" +MESSAGE(MDiscover) + +#include "messages/MDiscoverReply.h" +MESSAGE(MDiscoverReply) + +#include "messages/MExportCaps.h" +MESSAGE(MExportCaps) + +#include "messages/MExportCapsAck.h" +MESSAGE(MExportCapsAck) + +#include "messages/MExportDir.h" +MESSAGE(MExportDir) + +#include "messages/MExportDirAck.h" +MESSAGE(MExportDirAck) + +#include "messages/MExportDirCancel.h" +MESSAGE(MExportDirCancel) + +#include "messages/MExportDirDiscover.h" +MESSAGE(MExportDirDiscover) + +#include "messages/MExportDirDiscoverAck.h" +MESSAGE(MExportDirDiscoverAck) + +#include "messages/MExportDirFinish.h" +MESSAGE(MExportDirFinish) + +#include "messages/MExportDirNotify.h" +MESSAGE(MExportDirNotify) + +#include "messages/MExportDirNotifyAck.h" +MESSAGE(MExportDirNotifyAck) + +#include "messages/MExportDirPrep.h" +MESSAGE(MExportDirPrep) + +#include "messages/MExportDirPrepAck.h" +MESSAGE(MExportDirPrepAck) + +#include "messages/MForward.h" +MESSAGE(MForward) + +#include "messages/MFSMap.h" +MESSAGE(MFSMap) + +#include "messages/MFSMapUser.h" +MESSAGE(MFSMapUser) + +#include "messages/MGatherCaps.h" +MESSAGE(MGatherCaps) + +#include "messages/MGenericMessage.h" +MESSAGE(MGenericMessage) + +#include "messages/MGetConfig.h" +MESSAGE(MGetConfig) + +#include "messages/MGetPoolStats.h" +MESSAGE(MGetPoolStats) + +#include "messages/MGetPoolStatsReply.h" +MESSAGE(MGetPoolStatsReply) + +#include "messages/MHeartbeat.h" +MESSAGE(MHeartbeat) + +#include "messages/MInodeFileCaps.h" +MESSAGE(MInodeFileCaps) + +#include "messages/MLock.h" +MESSAGE(MLock) + +#include "messages/MLog.h" +MESSAGE(MLog) + +#include "messages/MLogAck.h" +MESSAGE(MLogAck) + +#include "messages/MMDSOpenIno.h" +MESSAGE(MMDSOpenIno) + +#include "messages/MMDSOpenInoReply.h" +MESSAGE(MMDSOpenInoReply) + +#include "messages/MMDSBeacon.h" +MESSAGE(MMDSBeacon) + +#include "messages/MMDSCacheRejoin.h" +MESSAGE(MMDSCacheRejoin) + +#include "messages/MMDSFindIno.h" +MESSAGE(MMDSFindIno) + +#include "messages/MMDSFindInoReply.h" +MESSAGE(MMDSFindInoReply) + +#include "messages/MMDSFragmentNotify.h" +MESSAGE(MMDSFragmentNotify) + +#include "messages/MMDSLoadTargets.h" +MESSAGE(MMDSLoadTargets) + +#include "messages/MMDSMap.h" +MESSAGE(MMDSMap) + +#include "messages/MMgrReport.h" +MESSAGE(MMgrReport) + +#include "messages/MMDSResolve.h" +MESSAGE(MMDSResolve) + +#include "messages/MMDSResolveAck.h" 
+MESSAGE(MMDSResolveAck) + +#include "messages/MMDSPeerRequest.h" +MESSAGE(MMDSPeerRequest) + +#include "messages/MMDSSnapUpdate.h" +MESSAGE(MMDSSnapUpdate) + +#include "messages/MMDSTableRequest.h" +MESSAGE(MMDSTableRequest) + +#include "messages/MMgrClose.h" +MESSAGE(MMgrClose) + +#include "messages/MMgrConfigure.h" +MESSAGE(MMgrConfigure) + +#include "messages/MMgrDigest.h" +MESSAGE(MMgrDigest) + +#include "messages/MMgrMap.h" +MESSAGE(MMgrMap) + +#include "messages/MMgrOpen.h" +MESSAGE(MMgrOpen) + +#include "messages/MMonCommand.h" +MESSAGE(MMonCommand) + +#include "messages/MMonCommandAck.h" +MESSAGE(MMonCommandAck) + +#include "messages/MMonElection.h" +MESSAGE(MMonElection) + +#include "messages/MMonGetMap.h" +MESSAGE(MMonGetMap) + +#include "messages/MMonGetVersion.h" +MESSAGE(MMonGetVersion) + +#include "messages/MMonGetVersionReply.h" +MESSAGE(MMonGetVersionReply) + +#include "messages/MMonGlobalID.h" +MESSAGE(MMonGlobalID) + +#include "messages/MMonJoin.h" +MESSAGE(MMonJoin) + +#include "messages/MMonMap.h" +MESSAGE(MMonMap) + +#include "messages/MMonPaxos.h" +MESSAGE(MMonPaxos) + +#include "messages/MMonProbe.h" +MESSAGE(MMonProbe) + +#include "messages/MMonScrub.h" +MESSAGE(MMonScrub) + +#include "messages/MMonSync.h" +MESSAGE(MMonSync) + +#include "messages/MMonSubscribe.h" +MESSAGE(MMonSubscribe) + +#include "messages/MMonSubscribeAck.h" +MESSAGE(MMonSubscribeAck) + +#include "messages/MOSDAlive.h" +MESSAGE(MOSDAlive) + +#include "messages/MOSDBoot.h" +MESSAGE(MOSDBoot) + +#include "messages/MOSDFailure.h" +MESSAGE(MOSDFailure) + +#include "messages/MOSDMap.h" +MESSAGE(MOSDMap) + +#include "messages/MOSDOp.h" +MESSAGE(MOSDOp) + +#include "messages/MOSDOpReply.h" +MESSAGE(MOSDOpReply) + +#include "messages/MOSDPGBackfill.h" +MESSAGE(MOSDPGBackfill) + +#include "messages/MOSDPGCreate2.h" +MESSAGE(MOSDPGCreate2) + +#include "messages/MOSDPGInfo.h" +MESSAGE(MOSDPGInfo) + +#include "messages/MOSDPGLog.h" +MESSAGE(MOSDPGLog) + +#include "messages/MOSDPGNotify.h" +MESSAGE(MOSDPGNotify) + +#include "messages/MOSDPGQuery.h" +MESSAGE(MOSDPGQuery) + +#include "messages/MOSDPGRemove.h" +MESSAGE(MOSDPGRemove) + +#include "messages/MOSDPGRecoveryDelete.h" +MESSAGE(MOSDPGRecoveryDelete) + +#include "messages/MOSDPGRecoveryDeleteReply.h" +MESSAGE(MOSDPGRecoveryDeleteReply) + +#include "messages/MOSDPGScan.h" +MESSAGE(MOSDPGScan) + +#include "messages/MOSDPGTemp.h" +MESSAGE(MOSDPGTemp) + +#include "messages/MOSDPGTrim.h" +MESSAGE(MOSDPGTrim) + +#include "messages/MOSDPing.h" +MESSAGE(MOSDPing) + +#include "messages/MOSDRepScrub.h" +MESSAGE(MOSDRepScrub) + +#include "messages/MOSDScrub2.h" +MESSAGE(MOSDScrub2) + +#include "messages/MOSDForceRecovery.h" +MESSAGE(MOSDForceRecovery) + +#include "messages/MPGStats.h" +MESSAGE(MPGStats) + +#include "messages/MPGStatsAck.h" +MESSAGE(MPGStatsAck) + +#include "messages/MPing.h" +MESSAGE(MPing) + +#include "messages/MPoolOp.h" +MESSAGE(MPoolOp) + +#include "messages/MPoolOpReply.h" +MESSAGE(MPoolOpReply) + +#include "messages/MRemoveSnaps.h" +MESSAGE(MRemoveSnaps) + +#include "messages/MRoute.h" +MESSAGE(MRoute) + +#include "messages/MServiceMap.h" +MESSAGE(MServiceMap) + +#include "messages/MStatfs.h" +MESSAGE(MStatfs) + +#include "messages/MStatfsReply.h" +MESSAGE(MStatfsReply) + +#include "messages/MTimeCheck.h" +MESSAGE(MTimeCheck) + +#include "messages/MTimeCheck2.h" +MESSAGE(MTimeCheck2) + +#include "messages/MWatchNotify.h" +MESSAGE(MWatchNotify) + +#include "messages/MMgrUpdate.h" +MESSAGE(MMgrUpdate) diff --git 
a/src/tools/ceph-dencoder/denc_plugin.h b/src/tools/ceph-dencoder/denc_plugin.h
new file mode 100644
index 000000000..a203551ea
--- /dev/null
+++ b/src/tools/ceph-dencoder/denc_plugin.h
@@ -0,0 +1,78 @@
+#include <dlfcn.h>
+#include <filesystem>
+#include <vector>
+
+#include "denc_registry.h"
+
+namespace fs = std::filesystem;
+
+class DencoderPlugin {
+  using dencoders_t = std::vector<std::pair<std::string, Dencoder*>>;
+public:
+  DencoderPlugin(const fs::path& path) {
+    mod = dlopen(path.c_str(), RTLD_NOW);
+    if (mod == nullptr) {
+      std::cerr << "failed to dlopen(" << path << "): " << dlerror() << std::endl;
+    }
+  }
+  DencoderPlugin(DencoderPlugin&& other)
+    : mod{other.mod},
+      dencoders{std::move(other.dencoders)}
+  {
+    other.mod = nullptr;
+    other.dencoders.clear();
+  }
+  ~DencoderPlugin() {
+#if !defined(__FreeBSD__)
+    if (mod) {
+      dlclose(mod);
+    }
+#endif
+  }
+  const dencoders_t& register_dencoders() {
+    static constexpr std::string_view REGISTER_DENCODERS_FUNCTION = "register_dencoders\0";
+
+    assert(mod);
+    using register_dencoders_t = void (*)(DencoderPlugin*);
+    const auto do_register =
+      reinterpret_cast<register_dencoders_t>(dlsym(mod, REGISTER_DENCODERS_FUNCTION.data()));
+    if (do_register == nullptr) {
+      std::cerr << "failed to dlsym(" << REGISTER_DENCODERS_FUNCTION << "): "
+                << dlerror() << std::endl;
+      return dencoders;
+    }
+    do_register(this);
+    return dencoders;
+  }
+
+  bool good() const {
+    return mod != nullptr;
+  }
+
+  void unregister_dencoders() {
+    while (!dencoders.empty()) {
+      delete dencoders.back().second;
+      dencoders.pop_back();
+    }
+  }
+  template <typename DencoderT, typename...Args>
+  void emplace(const char* name, Args&&...args) {
+    dencoders.emplace_back(name, new DencoderT(std::forward<Args>(args)...));
+  }
+
+private:
+  void *mod = nullptr;
+  dencoders_t dencoders;
+};
+
+#define TYPE(t) plugin->emplace<DencoderImplNoFeature<t>>(#t, false, false);
+#define TYPE_STRAYDATA(t) plugin->emplace<DencoderImplNoFeature<t>>(#t, true, false);
+#define TYPE_NONDETERMINISTIC(t) plugin->emplace<DencoderImplNoFeature<t>>(#t, false, true);
+#define TYPE_FEATUREFUL(t) plugin->emplace<DencoderImplFeatureful<t>>(#t, false, false);
+#define TYPE_FEATUREFUL_STRAYDATA(t) plugin->emplace<DencoderImplFeatureful<t>>(#t, true, false);
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) plugin->emplace<DencoderImplFeatureful<t>>(#t, false, true);
+#define TYPE_FEATUREFUL_NOCOPY(t) plugin->emplace<DencoderImplFeaturefulNoCopy<t>>(#t, false, false);
+#define TYPE_NOCOPY(t) plugin->emplace<DencoderImplNoFeatureNoCopy<t>>(#t, false, false);
+#define MESSAGE(t) plugin->emplace<MessageDencoderImpl<t>>(#t);
+
+#define DENC_API extern "C" [[gnu::visibility("default")]]
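// [editor's example] Concretely, a line such as TYPE_FEATUREFUL(MonMap) in
// one of the *_types.h lists expands, inside register_dencoders() where
// `plugin` is the parameter, to:
//
//   plugin->emplace<DencoderImplFeatureful<MonMap>>("MonMap", false, false);
//
// i.e. each macro line registers one heap-allocated Dencoder wrapper under
// the stringified type name; the two booleans are stray_okay and
// nondeterministic, as consumed by DencoderBase below.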
diff --git a/src/tools/ceph-dencoder/denc_registry.h b/src/tools/ceph-dencoder/denc_registry.h
new file mode 100644
index 000000000..aad52cbf7
--- /dev/null
+++ b/src/tools/ceph-dencoder/denc_registry.h
@@ -0,0 +1,241 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "include/buffer_fwd.h"
+#include "msg/Message.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+struct Dencoder {
+  virtual ~Dencoder() {}
+  virtual std::string decode(bufferlist bl, uint64_t seek) = 0;
+  virtual void encode(bufferlist& out, uint64_t features) = 0;
+  virtual void dump(ceph::Formatter *f) = 0;
+  virtual void copy() {
+    std::cerr << "copy operator= not supported" << std::endl;
+  }
+  virtual void copy_ctor() {
+    std::cerr << "copy ctor not supported" << std::endl;
+  }
+  virtual void generate() = 0;
+  virtual int num_generated() = 0;
+  virtual std::string select_generated(unsigned n) = 0;
+  virtual bool is_deterministic() = 0;
+  unsigned get_struct_v(bufferlist bl, uint64_t seek) const {
+    auto p = bl.cbegin(seek);
+    uint8_t struct_v = 0;
+    ceph::decode(struct_v, p);
+    return struct_v;
+  }
+  //virtual void print(ostream& out) = 0;
+};
+
+template<typename T>
+class DencoderBase : public Dencoder {
+protected:
+  T* m_object;
+  std::list<T*> m_list;
+  bool stray_okay;
+  bool nondeterministic;
+
+public:
+  DencoderBase(bool stray_okay, bool nondeterministic)
+    : m_object(new T),
+      stray_okay(stray_okay),
+      nondeterministic(nondeterministic) {}
+  ~DencoderBase() override {
+    delete m_object;
+  }
+
+  std::string decode(bufferlist bl, uint64_t seek) override {
+    auto p = bl.cbegin();
+    p.seek(seek);
+    try {
+      using ceph::decode;
+      decode(*m_object, p);
+    }
+    catch (buffer::error& e) {
+      return e.what();
+    }
+    if (!stray_okay && !p.end()) {
+      std::ostringstream ss;
+      ss << "stray data at end of buffer, offset " << p.get_off();
+      return ss.str();
+    }
+    return {};
+  }
+
+  void encode(bufferlist& out, uint64_t features) override = 0;
+
+  void dump(ceph::Formatter *f) override {
+    m_object->dump(f);
+  }
+  void generate() override {
+    T::generate_test_instances(m_list);
+  }
+  int num_generated() override {
+    return m_list.size();
+  }
+  std::string select_generated(unsigned i) override {
+    // allow 0- or 1-based (by wrapping)
+    if (i == 0)
+      i = m_list.size();
+    if ((i == 0) || (i > m_list.size()))
+      return "invalid id for generated object";
+    m_object = *(std::next(m_list.begin(), i-1));
+    return {};
+  }
+
+  bool is_deterministic() override {
+    return !nondeterministic;
+  }
+};
+
+template<typename T>
+class DencoderImplNoFeatureNoCopy : public DencoderBase<T> {
+public:
+  DencoderImplNoFeatureNoCopy(bool stray_ok, bool nondeterministic)
+    : DencoderBase<T>(stray_ok, nondeterministic) {}
+  void encode(bufferlist& out, uint64_t features) override {
+    out.clear();
+    using ceph::encode;
+    encode(*this->m_object, out);
+  }
+};
+
+template<typename T>
+class DencoderImplNoFeature : public DencoderImplNoFeatureNoCopy<T> {
+public:
+  DencoderImplNoFeature(bool stray_ok, bool nondeterministic)
+    : DencoderImplNoFeatureNoCopy<T>(stray_ok, nondeterministic) {}
+  void copy() override {
+    T *n = new T;
+    *n = *this->m_object;
+    delete this->m_object;
+    this->m_object = n;
+  }
+  void copy_ctor() override {
+    T *n = new T(*this->m_object);
+    delete this->m_object;
+    this->m_object = n;
+  }
+};
+
+template<typename T>
+class DencoderImplFeaturefulNoCopy : public DencoderBase<T> {
+public:
+  DencoderImplFeaturefulNoCopy(bool stray_ok, bool nondeterministic)
+    : DencoderBase<T>(stray_ok, nondeterministic) {}
+  void encode(bufferlist& out, uint64_t features) override {
+    out.clear();
+    using ceph::encode;
+    encode(*(this->m_object), out, features);
+  }
+};
+
+template<typename T>
+class DencoderImplFeatureful : public DencoderImplFeaturefulNoCopy<T> {
+public:
+  DencoderImplFeatureful(bool stray_ok, bool nondeterministic)
+    : DencoderImplFeaturefulNoCopy<T>(stray_ok, nondeterministic) {}
+  void copy() override {
+    T *n = new T;
+    *n = *this->m_object;
+    delete this->m_object;
+    this->m_object = n;
+  }
+  void copy_ctor() override {
+    T *n = new T(*this->m_object);
+    delete this->m_object;
+    this->m_object = n;
+  }
+};
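// [editor's example] A minimal sketch of the contract DencoderBase expects
// from a wrapped type (toy_t is hypothetical, written in the style of the
// str.h/sstring.h wrappers elsewhere in this patch): an encode/decode pair,
// dump(), and generate_test_instances().
struct toy_t {
  uint32_t a = 0;
  toy_t() = default;
  explicit toy_t(uint32_t a_) : a(a_) {}
  void encode(bufferlist& bl) const {
    using ceph::encode;
    encode(a, bl);               // serialize the single member
  }
  void decode(bufferlist::const_iterator& p) {
    using ceph::decode;
    decode(a, p);                // must consume exactly what encode() wrote
  }
  void dump(ceph::Formatter* f) {
    f->dump_unsigned("a", a);
  }
  static void generate_test_instances(std::list<toy_t*>& ls) {
    ls.push_back(new toy_t);     // default instance
    ls.push_back(new toy_t(42)); // non-trivial instance
  }
};
// A plugin would then register it from register_dencoders() as:
//   plugin->emplace<DencoderImplNoFeature<toy_t>>("toy_t", false, false);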
+
+template<typename T>
+class MessageDencoderImpl : public Dencoder {
+  ref_t<T> m_object;
+  std::list<ref_t<T>> m_list;
+
+public:
+  MessageDencoderImpl() : m_object{make_message<T>()} {}
+  ~MessageDencoderImpl() override {}
+
+  std::string decode(bufferlist bl, uint64_t seek) override {
+    auto p = bl.cbegin();
+    p.seek(seek);
+    try {
+      ref_t<Message> n(decode_message(g_ceph_context, 0, p), false);
+      if (!n)
+        throw std::runtime_error("failed to decode");
+      if (n->get_type() != m_object->get_type()) {
+        std::stringstream ss;
+        ss << "decoded type " << n->get_type() << " instead of expected " << m_object->get_type();
+        throw std::runtime_error(ss.str());
+      }
+      m_object = ref_cast<T>(n);
+    }
+    catch (buffer::error& e) {
+      return e.what();
+    }
+    if (!p.end()) {
+      std::ostringstream ss;
+      ss << "stray data at end of buffer, offset " << p.get_off();
+      return ss.str();
+    }
+    return {};
+  }
+
+  void encode(bufferlist& out, uint64_t features) override {
+    out.clear();
+    encode_message(m_object.get(), features, out);
+  }
+
+  void dump(ceph::Formatter *f) override {
+    m_object->dump(f);
+  }
+  void generate() override {
+    //T::generate_test_instances(m_list);
+  }
+  int num_generated() override {
+    return m_list.size();
+  }
+  std::string select_generated(unsigned i) override {
+    // allow 0- or 1-based (by wrapping)
+    if (i == 0)
+      i = m_list.size();
+    if ((i == 0) || (i > m_list.size()))
+      return "invalid id for generated object";
+    m_object = *(std::next(m_list.begin(), i-1));
+    return {};
+  }
+  bool is_deterministic() override {
+    return true;
+  }
+
+  //void print(ostream& out) {
+  //  out << m_object << std::endl;
+  //}
+};
+
+class DencoderRegistry
+{
+  using dencoders_t = std::map<std::string_view, Dencoder*>;
+
+public:
+  dencoders_t& get() {
+    return dencoders;
+  }
+  void register_dencoder(std::string_view name, Dencoder* denc) {
+    dencoders.emplace(name, denc);
+  }
+private:
+  dencoders_t dencoders;
+};
diff --git a/src/tools/ceph-dencoder/mds_types.cc b/src/tools/ceph-dencoder/mds_types.cc
new file mode 100644
index 000000000..94280477a
--- /dev/null
+++ b/src/tools/ceph-dencoder/mds_types.cc
@@ -0,0 +1,36 @@
+#include "acconfig.h"
+#include
+using namespace std;
+#include "include/ceph_features.h"
+
+#define TYPE(t)
+#define TYPE_STRAYDATA(t)
+#define TYPE_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL(t)
+#define TYPE_FEATUREFUL_STRAYDATA(t)
+#define TYPE_FEATUREFUL_NONDETERMINISTIC(t)
+#define TYPE_FEATUREFUL_NOCOPY(t)
+#define TYPE_NOCOPY(t)
+#define MESSAGE(t)
+#include "mds_types.h"
+#undef TYPE
+#undef TYPE_STRAYDATA
+#undef TYPE_NONDETERMINISTIC
+#undef TYPE_NOCOPY
+#undef TYPE_FEATUREFUL
+#undef TYPE_FEATUREFUL_STRAYDATA
+#undef TYPE_FEATUREFUL_NONDETERMINISTIC
+#undef TYPE_FEATUREFUL_NOCOPY
+#undef MESSAGE
+
+#include "denc_plugin.h"
+
+DENC_API void register_dencoders(DencoderPlugin* plugin)
+{
+#include "mds_types.h"
+}
+
+DENC_API void unregister_dencoders(DencoderPlugin* plugin)
+{
+  plugin->unregister_dencoders();
+}
diff --git a/src/tools/ceph-dencoder/mds_types.h b/src/tools/ceph-dencoder/mds_types.h
new file mode 100644
index 000000000..91ba86be0
--- /dev/null
+++ b/src/tools/ceph-dencoder/mds_types.h
@@ -0,0 +1,112 @@
+#ifdef WITH_CEPHFS
+#include "mds/JournalPointer.h"
+TYPE(JournalPointer)
+
+#include "osdc/Journaler.h"
+TYPE(Journaler::Header)
+
+#include "mds/snap.h"
+TYPE(SnapInfo)
+TYPE(snaplink_t)
+TYPE(sr_t)
+
+#include "mds/mdstypes.h"
+#include "include/cephfs/types.h"
+TYPE(frag_info_t)
+TYPE(nest_info_t)
+TYPE(quota_info_t)
+TYPE(client_writeable_range_t)
+TYPE_FEATUREFUL(inode_t)
+TYPE_FEATUREFUL(old_inode_t)
+TYPE(fnode_t)
+TYPE(old_rstat_t)
+TYPE_FEATUREFUL(session_info_t)
+TYPE(string_snap_t)
+TYPE(MDSCacheObjectInfo)
+TYPE(mds_table_pending_t)
+TYPE(cap_reconnect_t)
+TYPE(inode_load_vec_t)
+TYPE(dirfrag_load_vec_t)
+TYPE(mds_load_t)
+TYPE(MDSCacheObjectInfo)
+TYPE(inode_backtrace_t)
+TYPE(inode_backpointer_t)
+
+#include "mds/CInode.h"
+TYPE_FEATUREFUL(InodeStore)
+TYPE_FEATUREFUL(InodeStoreBare)
+
+#include "mds/MDSMap.h"
+TYPE_FEATUREFUL(MDSMap)
+TYPE_FEATUREFUL(MDSMap::mds_info_t)
+
+#include "mds/FSMap.h"
+//TYPE_FEATUREFUL(Filesystem)
+TYPE_FEATUREFUL(FSMap)
+
+#include "mds/Capability.h"
+TYPE_NOCOPY(Capability) + +#include "mds/inode_backtrace.h" +TYPE(inode_backpointer_t) +TYPE(inode_backtrace_t) + +#include "mds/InoTable.h" +TYPE(InoTable) + +#include "mds/SnapServer.h" +TYPE_STRAYDATA(SnapServer) + +#include "mds/events/ECommitted.h" +TYPE_FEATUREFUL_NOCOPY(ECommitted) + +#include "mds/events/EExport.h" +TYPE_FEATUREFUL_NOCOPY(EExport) + +#include "mds/events/EFragment.h" +TYPE_FEATUREFUL_NOCOPY(EFragment) + +#include "mds/events/EImportFinish.h" +TYPE_FEATUREFUL_NOCOPY(EImportFinish) + +#include "mds/events/EImportStart.h" +TYPE_FEATUREFUL_NOCOPY(EImportStart) + +#include "mds/events/EMetaBlob.h" +TYPE_FEATUREFUL_NOCOPY(EMetaBlob::fullbit) +TYPE(EMetaBlob::remotebit) +TYPE(EMetaBlob::nullbit) +TYPE_FEATUREFUL_NOCOPY(EMetaBlob::dirlump) +TYPE_FEATUREFUL_NOCOPY(EMetaBlob) + +#include "mds/events/EOpen.h" +TYPE_FEATUREFUL_NOCOPY(EOpen) + +#include "mds/events/EResetJournal.h" +TYPE_FEATUREFUL_NOCOPY(EResetJournal) + +#include "mds/events/ESession.h" +TYPE_FEATUREFUL_NOCOPY(ESession) + +#include "mds/events/ESessions.h" +TYPE_FEATUREFUL_NOCOPY(ESessions) + +#include "mds/events/EPeerUpdate.h" +TYPE(link_rollback) +TYPE(rmdir_rollback) +TYPE(rename_rollback::drec) +TYPE(rename_rollback) +TYPE_FEATUREFUL_NOCOPY(EPeerUpdate) + +#include "mds/events/ESubtreeMap.h" +TYPE_FEATUREFUL_NOCOPY(ESubtreeMap) + +#include "mds/events/ETableClient.h" +TYPE_FEATUREFUL_NOCOPY(ETableClient) + +#include "mds/events/ETableServer.h" +TYPE_FEATUREFUL_NOCOPY(ETableServer) + +#include "mds/events/EUpdate.h" +TYPE_FEATUREFUL_NOCOPY(EUpdate) +#endif // WITH_CEPHFS diff --git a/src/tools/ceph-dencoder/osd_types.cc b/src/tools/ceph-dencoder/osd_types.cc new file mode 100644 index 000000000..13a90685b --- /dev/null +++ b/src/tools/ceph-dencoder/osd_types.cc @@ -0,0 +1,39 @@ +#include "acconfig.h" +#include +using namespace std; +#include "include/ceph_features.h" + +#define TYPE(t) +#define TYPE_STRAYDATA(t) +#define TYPE_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL(t) +#define TYPE_FEATUREFUL_STRAYDATA(t) +#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL_NOCOPY(t) +#define TYPE_NOCOPY(t) +#define MESSAGE(t) +#include "osd_types.h" +#undef TYPE +#undef TYPE_STRAYDATA +#undef TYPE_NONDETERMINISTIC +#undef TYPE_NOCOPY +#undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA +#undef TYPE_FEATUREFUL_NONDETERMINISTIC +#undef TYPE_FEATUREFUL_NOCOPY +#undef MESSAGE + +#include "denc_plugin.h" + +// cannot initialize dencoders when initializing static variables, as some of +// the types are allocated using mempool, and the mempools are initialized as +// static variables. 
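// [editor's note] Because of the initialization-order constraint described
// above, registration is deferred until the plugin is dlopen()ed: main()
// calls load_plugins(), which constructs a DencoderPlugin (dlopen), then
// invokes its register_dencoders() export (dlsym + call), and only then are
// entries added to the DencoderRegistry. Roughly (simplified sketch):
//
//   DencoderPlugin plugin("denc-mod-osd.so");               // dlopen();
//   for (auto& [name, denc] : plugin.register_dencoders())  // mempools are
//     registry.register_dencoder(name, denc);               // live by now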
+DENC_API void register_dencoders(DencoderPlugin* plugin) +{ +#include "osd_types.h" +} + +DENC_API void unregister_dencoders(DencoderPlugin* plugin) +{ + plugin->unregister_dencoders(); +} diff --git a/src/tools/ceph-dencoder/osd_types.h b/src/tools/ceph-dencoder/osd_types.h new file mode 100644 index 000000000..0bc2fc244 --- /dev/null +++ b/src/tools/ceph-dencoder/osd_types.h @@ -0,0 +1,150 @@ +#include "osd/OSDMap.h" +TYPE(osd_info_t) +TYPE_FEATUREFUL(osd_xinfo_t) +TYPE_FEATUREFUL_NOCOPY(OSDMap) +TYPE_FEATUREFUL_STRAYDATA(OSDMap::Incremental) + +#include "osd/osd_types.h" +TYPE(osd_reqid_t) +TYPE(object_locator_t) +TYPE(request_redirect_t) +TYPE(pg_t) +TYPE(coll_t) +TYPE_FEATUREFUL(objectstore_perf_stat_t) +TYPE_FEATUREFUL(osd_stat_t) +TYPE(OSDSuperblock) +TYPE_FEATUREFUL(pool_snap_info_t) +TYPE_FEATUREFUL(pg_pool_t) +TYPE(object_stat_sum_t) +TYPE(object_stat_collection_t) +TYPE(pg_stat_t) +TYPE_FEATUREFUL(pool_stat_t) +TYPE(pg_hit_set_info_t) +TYPE(pg_hit_set_history_t) +TYPE(pg_history_t) +TYPE(pg_info_t) +TYPE(PastIntervals) +TYPE_FEATUREFUL(pg_query_t) +TYPE(ObjectModDesc) +TYPE(pg_log_entry_t) +TYPE(pg_log_dup_t) +TYPE(pg_log_t) +TYPE_FEATUREFUL(pg_missing_item) +TYPE_FEATUREFUL(pg_missing_t) +TYPE(pg_nls_response_t) +TYPE(pg_ls_response_t) +TYPE(object_copy_cursor_t) +TYPE_FEATUREFUL(object_copy_data_t) +TYPE(pg_create_t) +TYPE(OSDSuperblock) +TYPE(SnapSet) +TYPE_FEATUREFUL(watch_info_t) +TYPE_FEATUREFUL(watch_item_t) +TYPE(object_manifest_t) +TYPE_FEATUREFUL(object_info_t) +TYPE(SnapSet) +TYPE_FEATUREFUL(ObjectRecoveryInfo) +TYPE(ObjectRecoveryProgress) +TYPE(PushReplyOp) +TYPE_FEATUREFUL(PullOp) +TYPE_FEATUREFUL(PushOp) +TYPE(ScrubMap::object) +TYPE(ScrubMap) +TYPE_FEATUREFUL(obj_list_watch_response_t) +TYPE(clone_info) +TYPE(obj_list_snap_response_t) +TYPE(pool_pg_num_history_t) + +#include "osd/ECUtil.h" +// TYPE(stripe_info_t) non-standard encoding/decoding functions +TYPE(ECUtil::HashInfo) + +#include "osd/ECMsgTypes.h" +TYPE_NOCOPY(ECSubWrite) +TYPE(ECSubWriteReply) +TYPE_FEATUREFUL(ECSubRead) +TYPE(ECSubReadReply) + +#include "osd/HitSet.h" +TYPE_NONDETERMINISTIC(ExplicitHashHitSet) +TYPE_NONDETERMINISTIC(ExplicitObjectHitSet) +TYPE(BloomHitSet) +TYPE_NONDETERMINISTIC(HitSet) // because some subclasses are +TYPE(HitSet::Params) + +#include "os/ObjectStore.h" +TYPE(ObjectStore::Transaction) + +#include "os/SequencerPosition.h" +TYPE(SequencerPosition) + +#ifdef WITH_BLUESTORE +#include "os/bluestore/bluestore_types.h" +TYPE(bluestore_bdev_label_t) +TYPE(bluestore_cnode_t) +TYPE(bluestore_compression_header_t) +TYPE(bluestore_extent_ref_map_t) +TYPE(bluestore_pextent_t) +TYPE(bluestore_blob_use_tracker_t) +// TODO: bluestore_blob_t repurposes the "feature" param of encode() for its +// struct_v. at a higher level, BlueStore::ExtentMap encodes the extends using +// a different interface than the normal ones. see +// BlueStore::ExtentMap::encode_some(). maybe we can test it using another +// approach. 
+// TYPE_FEATUREFUL(bluestore_blob_t) +// TYPE(bluestore_shared_blob_t) there is no encode here +TYPE(bluestore_onode_t) +TYPE(bluestore_deferred_op_t) +TYPE(bluestore_deferred_transaction_t) +// TYPE(bluestore_compression_header_t) there is no encode here + +#include "os/bluestore/bluefs_types.h" +TYPE(bluefs_extent_t) +TYPE(bluefs_fnode_t) +TYPE(bluefs_super_t) +TYPE(bluefs_transaction_t) +#endif + +#include "mon/AuthMonitor.h" +TYPE_FEATUREFUL(AuthMonitor::Incremental) + +#include "mon/PGMap.h" +TYPE_FEATUREFUL_NONDETERMINISTIC(PGMapDigest) +TYPE_FEATUREFUL_NONDETERMINISTIC(PGMap) + +#include "mon/MonitorDBStore.h" +TYPE(MonitorDBStore::Transaction) +TYPE(MonitorDBStore::Op) + +#include "mon/MonMap.h" +TYPE_FEATUREFUL(MonMap) + +#include "mon/MonCap.h" +TYPE(MonCap) + +#include "mon/MgrMap.h" +TYPE_FEATUREFUL(MgrMap) + +#include "mon/mon_types.h" +TYPE(LevelDBStoreStats) +TYPE(ScrubResult) + +#include "mon/CreatingPGs.h" +TYPE_FEATUREFUL(creating_pgs_t) + +#include "mgr/ServiceMap.h" +TYPE_FEATUREFUL(ServiceMap) +TYPE_FEATUREFUL(ServiceMap::Service) +TYPE_FEATUREFUL(ServiceMap::Daemon) + +#include "mon/ConnectionTracker.h" +TYPE(ConnectionReport); +TYPE(ConnectionTracker); + +#include "os/DBObjectMap.h" +TYPE(DBObjectMap::_Header) +TYPE(DBObjectMap::State) + +#include "os/kstore/kstore_types.h" +TYPE(kstore_cnode_t) +TYPE(kstore_onode_t) diff --git a/src/tools/ceph-dencoder/rbd_types.cc b/src/tools/ceph-dencoder/rbd_types.cc new file mode 100644 index 000000000..e04efc30d --- /dev/null +++ b/src/tools/ceph-dencoder/rbd_types.cc @@ -0,0 +1,36 @@ +#include "acconfig.h" +#include +using namespace std; +#include "include/ceph_features.h" + +#define TYPE(t) +#define TYPE_STRAYDATA(t) +#define TYPE_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL(t) +#define TYPE_FEATUREFUL_STRAYDATA(t) +#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL_NOCOPY(t) +#define TYPE_NOCOPY(t) +#define MESSAGE(t) +#include "rbd_types.h" +#undef TYPE +#undef TYPE_STRAYDATA +#undef TYPE_NONDETERMINISTIC +#undef TYPE_NOCOPY +#undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA +#undef TYPE_FEATUREFUL_NONDETERMINISTIC +#undef TYPE_FEATUREFUL_NOCOPY +#undef MESSAGE + +#include "denc_plugin.h" + +DENC_API void register_dencoders(DencoderPlugin* plugin) +{ +#include "rbd_types.h" +} + +DENC_API void unregister_dencoders(DencoderPlugin* plugin) +{ + plugin->unregister_dencoders(); +} diff --git a/src/tools/ceph-dencoder/rbd_types.h b/src/tools/ceph-dencoder/rbd_types.h new file mode 100644 index 000000000..0f004e54a --- /dev/null +++ b/src/tools/ceph-dencoder/rbd_types.h @@ -0,0 +1,52 @@ +#ifdef WITH_RBD +#include "librbd/journal/Types.h" +TYPE(librbd::journal::EventEntry) +TYPE(librbd::journal::ClientData) +TYPE(librbd::journal::TagData) +#include "librbd/mirroring_watcher/Types.h" +TYPE(librbd::mirroring_watcher::NotifyMessage) +#include "librbd/trash_watcher/Types.h" +TYPE(librbd::trash_watcher::NotifyMessage) +#include "librbd/WatchNotifyTypes.h" +TYPE_NOCOPY(librbd::watch_notify::NotifyMessage) +TYPE(librbd::watch_notify::ResponseMessage) + +#include "rbd_replay/ActionTypes.h" +TYPE(rbd_replay::action::Dependency) +TYPE(rbd_replay::action::ActionEntry) + +#include "tools/rbd_mirror/image_map/Types.h" +TYPE(rbd::mirror::image_map::PolicyData) +#endif + +#if defined(WITH_RBD) && defined(WITH_RBD_SSD_CACHE) +#include "librbd/cache/pwl/Types.h" +#include "librbd/cache/pwl/ssd/Types.h" +TYPE(librbd::cache::pwl::WriteLogCacheEntry) +TYPE(librbd::cache::pwl::WriteLogPoolRoot) 
+TYPE(librbd::cache::pwl::ssd::SuperBlock) +#endif + +#ifdef WITH_RBD +#include "cls/rbd/cls_rbd.h" +TYPE_FEATUREFUL(cls_rbd_parent) +TYPE_FEATUREFUL(cls_rbd_snap) + +#include "cls/rbd/cls_rbd_types.h" +TYPE(cls::rbd::ParentImageSpec) +TYPE(cls::rbd::ChildImageSpec) +TYPE(cls::rbd::MigrationSpec) +TYPE(cls::rbd::MirrorPeer) +TYPE(cls::rbd::MirrorImage) +TYPE(cls::rbd::MirrorImageMap) +TYPE(cls::rbd::MirrorImageStatus) +TYPE(cls::rbd::MirrorImageSiteStatus) +TYPE_FEATUREFUL(cls::rbd::MirrorImageSiteStatusOnDisk) +TYPE(cls::rbd::GroupImageSpec) +TYPE(cls::rbd::GroupImageStatus) +TYPE(cls::rbd::GroupSnapshot) +TYPE(cls::rbd::GroupSpec) +TYPE(cls::rbd::ImageSnapshotSpec) +TYPE(cls::rbd::SnapshotInfo) +TYPE(cls::rbd::SnapshotNamespace) +#endif diff --git a/src/tools/ceph-dencoder/rgw_types.cc b/src/tools/ceph-dencoder/rgw_types.cc new file mode 100644 index 000000000..79688b534 --- /dev/null +++ b/src/tools/ceph-dencoder/rgw_types.cc @@ -0,0 +1,36 @@ +#include "acconfig.h" +#include +using namespace std; +#include "include/ceph_features.h" + +#define TYPE(t) +#define TYPE_STRAYDATA(t) +#define TYPE_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL(t) +#define TYPE_FEATUREFUL_STRAYDATA(t) +#define TYPE_FEATUREFUL_NONDETERMINISTIC(t) +#define TYPE_FEATUREFUL_NOCOPY(t) +#define TYPE_NOCOPY(t) +#define MESSAGE(t) +#include "rgw_types.h" +#undef TYPE +#undef TYPE_STRAYDATA +#undef TYPE_NONDETERMINISTIC +#undef TYPE_NOCOPY +#undef TYPE_FEATUREFUL +#undef TYPE_FEATUREFUL_STRAYDATA +#undef TYPE_FEATUREFUL_NONDETERMINISTIC +#undef TYPE_FEATUREFUL_NOCOPY +#undef MESSAGE + +#include "denc_plugin.h" + +DENC_API void register_dencoders(DencoderPlugin* plugin) +{ +#include "rgw_types.h" +} + +DENC_API void unregister_dencoders(DencoderPlugin* plugin) +{ + plugin->unregister_dencoders(); +} diff --git a/src/tools/ceph-dencoder/rgw_types.h b/src/tools/ceph-dencoder/rgw_types.h new file mode 100644 index 000000000..dd5c3a8cb --- /dev/null +++ b/src/tools/ceph-dencoder/rgw_types.h @@ -0,0 +1,141 @@ +#ifdef WITH_RADOSGW + +#include "rgw_rados.h" +TYPE(RGWOLHInfo) +TYPE(RGWObjManifestPart) +TYPE(RGWObjManifest) +TYPE(objexp_hint_entry) + +#include "rgw_zone.h" +TYPE(RGWZoneParams) +TYPE(RGWZone) +TYPE(RGWZoneGroup) +TYPE(RGWRealm) +TYPE(RGWPeriod) +TYPE(RGWPeriodLatestEpochInfo) + +#include "rgw_acl.h" +TYPE(ACLPermission) +TYPE(ACLGranteeType) +TYPE(ACLGrant) +TYPE(RGWAccessControlList) +TYPE(ACLOwner) +TYPE(RGWAccessControlPolicy) + +#include "rgw_cache.h" +TYPE(ObjectMetaInfo) +TYPE(ObjectCacheInfo) +TYPE(RGWCacheNotifyInfo) + +#include "rgw_lc.h" +TYPE(RGWLifecycleConfiguration) + +#include "cls/rgw/cls_rgw_types.h" +TYPE(rgw_bucket_pending_info) +TYPE(rgw_bucket_dir_entry_meta) +TYPE(rgw_bucket_entry_ver) +TYPE(rgw_bucket_dir_entry) +TYPE(rgw_bucket_category_stats) +TYPE(rgw_bucket_dir_header) +TYPE(rgw_bucket_dir) +TYPE(rgw_bucket_entry_ver) +TYPE(cls_rgw_obj_key) +TYPE(rgw_bucket_olh_log_entry) +TYPE(rgw_usage_log_entry) +TYPE(rgw_cls_bi_entry) +TYPE(rgw_bucket_olh_entry) +TYPE(rgw_usage_data) +TYPE(rgw_usage_log_info) +TYPE(rgw_user_bucket) +TYPE(cls_rgw_lc_entry) + +#include "cls/rgw/cls_rgw_ops.h" +TYPE(cls_rgw_lc_get_entry_ret) +TYPE(rgw_cls_obj_prepare_op) +TYPE(rgw_cls_obj_complete_op) +TYPE(rgw_cls_list_op) +TYPE(rgw_cls_list_ret) +TYPE(cls_rgw_gc_defer_entry_op) +TYPE(cls_rgw_gc_list_op) +TYPE(cls_rgw_gc_list_ret) +TYPE(cls_rgw_gc_obj_info) +TYPE(cls_rgw_gc_remove_op) +TYPE(cls_rgw_gc_set_entry_op) +TYPE(cls_rgw_obj) +TYPE(cls_rgw_obj_chain) +TYPE(rgw_cls_tag_timeout_op) +TYPE(cls_rgw_bi_log_list_op) 
+TYPE(cls_rgw_bi_log_trim_op) +TYPE(cls_rgw_bi_log_list_ret) +TYPE(rgw_cls_link_olh_op) +TYPE(rgw_cls_unlink_instance_op) +TYPE(rgw_cls_read_olh_log_op) +TYPE(rgw_cls_read_olh_log_ret) +TYPE(rgw_cls_trim_olh_log_op) +TYPE(rgw_cls_bucket_clear_olh_op) +TYPE(rgw_cls_check_index_ret) +TYPE(cls_rgw_reshard_add_op) +TYPE(cls_rgw_reshard_list_op) +TYPE(cls_rgw_reshard_list_ret) +TYPE(cls_rgw_reshard_get_op) +TYPE(cls_rgw_reshard_get_ret) +TYPE(cls_rgw_reshard_remove_op) +TYPE(cls_rgw_set_bucket_resharding_op) +TYPE(cls_rgw_clear_bucket_resharding_op) +TYPE(cls_rgw_lc_obj_head) + +#include "cls/rgw/cls_rgw_client.h" +TYPE(rgw_bi_log_entry) +TYPE(cls_rgw_reshard_entry) +TYPE(cls_rgw_bucket_instance_entry) + +#include "cls/user/cls_user_types.h" +TYPE(cls_user_bucket) +TYPE(cls_user_bucket_entry) +TYPE(cls_user_stats) +TYPE(cls_user_header) + +#include "cls/user/cls_user_ops.h" +TYPE(cls_user_set_buckets_op) +TYPE(cls_user_remove_bucket_op) +TYPE(cls_user_list_buckets_op) +TYPE(cls_user_list_buckets_ret) +TYPE(cls_user_get_header_op) +TYPE(cls_user_get_header_ret) +TYPE(cls_user_complete_stats_sync_op) + +#include "cls/journal/cls_journal_types.h" +TYPE(cls::journal::ObjectPosition) +TYPE(cls::journal::ObjectSetPosition) +TYPE(cls::journal::Client) +TYPE(cls::journal::Tag) + +#include "rgw_common.h" +TYPE(RGWAccessKey) +TYPE(RGWSubUser) +TYPE(RGWUserInfo) +TYPE(rgw_bucket) +TYPE(RGWBucketInfo) +TYPE(RGWBucketEnt) +TYPE(rgw_obj) + +#include "rgw_log.h" +TYPE(rgw_log_entry) + +#include "rgw_meta_sync_status.h" +TYPE(rgw_meta_sync_info) +TYPE(rgw_meta_sync_marker) +TYPE(rgw_meta_sync_status) + +#include "rgw_multi.h" +TYPE(RGWUploadPartInfo) + +#include "rgw_data_sync.h" +TYPE(rgw_data_sync_info) +TYPE(rgw_data_sync_marker) +TYPE(rgw_data_sync_status) + +#include "rgw_bucket_encryption.h" +TYPE(RGWBucketEncryptionConfig) + +#endif diff --git a/src/tools/ceph-dencoder/sstring.h b/src/tools/ceph-dencoder/sstring.h new file mode 100644 index 000000000..c2493c10e --- /dev/null +++ b/src/tools/ceph-dencoder/sstring.h @@ -0,0 +1,40 @@ +#ifndef TEST_SSTRING_H +#define TEST_SSTRING_H + +#include "common/sstring.hh" + +// wrapper for sstring that implements the dencoder interface +class sstring_wrapper { + using sstring16 = basic_sstring; + sstring16 s1; + using sstring24 = basic_sstring; + sstring24 s2; + public: + sstring_wrapper() = default; + sstring_wrapper(sstring16&& s1, sstring24&& s2) + : s1(std::move(s1)), s2(std::move(s2)) + {} + + DENC(sstring_wrapper, w, p) { + DENC_START(1, 1, p); + denc(w.s1, p); + denc(w.s2, p); + DENC_FINISH(p); + } + void dump(Formatter* f) { + f->dump_string("s1", s1.c_str()); + f->dump_string("s2", reinterpret_cast(s2.c_str())); + } + static void generate_test_instances(std::list& ls) { + ls.push_back(new sstring_wrapper()); + // initialize sstrings that fit in internal storage + constexpr auto cstr6 = "abcdef"; + ls.push_back(new sstring_wrapper(sstring16{cstr6}, sstring24{cstr6})); + // initialize sstrings that overflow into external storage + constexpr auto cstr26 = "abcdefghijklmnopqrstuvwxyz"; + ls.push_back(new sstring_wrapper(sstring16{cstr26}, sstring24{cstr26})); + } +}; +WRITE_CLASS_DENC(sstring_wrapper) + +#endif diff --git a/src/tools/ceph-dencoder/str.h b/src/tools/ceph-dencoder/str.h new file mode 100644 index 000000000..7ff1d0794 --- /dev/null +++ b/src/tools/ceph-dencoder/str.h @@ -0,0 +1,38 @@ +#ifndef TEST_STRING_H +#define TEST_STRING_H + +#include "common/Formatter.h" + +// wrapper for std::string that implements the dencoder interface +class 
string_wrapper {
+  std::string s;
+ public:
+  string_wrapper() = default;
+  string_wrapper(string s1)
+    : s(s1)
+  {}
+
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(s, bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator &bl) {
+    using ceph::decode;
+    decode(s, bl);
+  }
+
+  void dump(Formatter* f) {
+    f->dump_string("s", s);
+  }
+
+  static void generate_test_instances(std::list<string_wrapper*>& ls) {
+    ls.push_back(new string_wrapper());
+    // initialize strings that fit in internal storage
+    std::string s1 = "abcdef";
+    ls.push_back(new string_wrapper(s1));
+  }
+};
+WRITE_CLASS_ENCODER(string_wrapper)
+
+#endif
diff --git a/src/tools/ceph-diff-sorted.cc b/src/tools/ceph-diff-sorted.cc
new file mode 100644
index 000000000..f8e4c28e6
--- /dev/null
+++ b/src/tools/ceph-diff-sorted.cc
@@ -0,0 +1,173 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * diffsorted -- a utility to compute a line-by-line diff on two
+ * sorted input files
+ *
+ * Copyright © 2019 Red Hat
+ *
+ * Author: J. Eric Ivancich
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.
+ */
+
+
+/*
+ * SUMMARY
+ *
+ * The `ceph-diff-sorted` utility does a line-by-line diff on two sorted
+ * text files, indicating lines that are in one file but not the other
+ * using diff-style notation (although line numbers are not indicated).
+ *
+ * USAGE
+ *
+ *     ceph-diff-sorted file1.txt file2.txt
+ *
+ * NOTES
+ *
+ * Each file should have its lines in sorted order and should have no
+ * empty lines.
+ *
+ * A potential input file can be sorted using the `sort` utility provided
+ * that LANG=C is set, to ensure byte-lexical order. For example:
+ *
+ *     LANG=C sort unsorted.txt >sorted.txt
+ *
+ * or:
+ *
+ *     export LANG=C
+ *     sort unsorted.txt >sorted.txt
+ *
+ * EXIT STATUS
+ *
+ *   0 : files same
+ *   1 : files different
+ *   2 : usage problem (e.g., wrong number of command-line arguments)
+ *   3 : problem opening input file
+ *   4 : bad file content (e.g., unsorted order or empty lines)
+ */
+
+
+#include <iostream>
+#include <fstream>
+
+
+struct FileOfLines {
+  const char* filename;
+  std::ifstream input;
+  std::string this_line, prev_line;
+  bool next_eof;
+  bool is_eof;
+
+  FileOfLines(const char* _filename) :
+    filename(_filename),
+    input(filename),
+    next_eof(false),
+    is_eof(false)
+  { }
+
+  void dump(const std::string& prefix) {
+    do {
+      std::cout << prefix << this_line << std::endl;
+      advance();
+    } while (!eof());
+  }
+
+  bool eof() const {
+    return is_eof;
+  }
+
+  bool good() const {
+    return input.good();
+  }
+
+  void advance() {
+    if (next_eof) {
+      is_eof = true;
+      return;
+    }
+
+    prev_line = this_line;
+    std::getline(input, this_line);
+    if (this_line.empty()) {
+      if (!input.eof()) {
+        std::cerr << "Error: " << filename << " has an empty line." <<
+          std::endl;
+        exit(4);
+      }
+      is_eof = true;
+      return;
+    } else if (input.eof()) {
+      next_eof = true;
+    }
+
+    if (this_line < prev_line) {
+      std::cerr << "Error: " << filename << " is not in sorted order; \"" <<
+        this_line << "\" follows \"" << prev_line << "\"."
<< std::endl; + exit(4); + } + } + + const std::string line() const { + return this_line; + } +}; + +int main(int argc, const char* argv[]) { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " <file1> <file2>" << std::endl; + exit(2); + } + + FileOfLines input1(argv[1]); + if (!input1.good()) { + std::cerr << "Error opening " << argv[1] << + "." << std::endl; + exit(3); + } + + FileOfLines input2(argv[2]); + if (!input2.good()) { + std::cerr << "Error opening " << argv[2] << + "." << std::endl; + exit(3); + } + + bool files_same = true; + + input1.advance(); + input2.advance(); + + while (!input1.eof() && !input2.eof()) { + if (input1.line() == input2.line()) { + input1.advance(); + input2.advance(); + } else if (input1.line() < input2.line()) { + files_same = false; + std::cout << "< " << input1.line() << std::endl; + input1.advance(); + } else { + files_same = false; + std::cout << "> " << input2.line() << std::endl; + input2.advance(); + } + } + + if (!input1.eof()) { + files_same = false; + input1.dump("< "); + } else if (!input2.eof()) { + files_same = false; + input2.dump("> "); + } + + if (files_same) { + exit(0); + } else { + exit(1); + } +} diff --git a/src/tools/ceph-lazy/bash_completion.d/ceph-lazy b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy new file mode 100644 index 000000000..4429def42 --- /dev/null +++ b/src/tools/ceph-lazy/bash_completion.d/ceph-lazy @@ -0,0 +1,27 @@ +_ceph-lazy() +{ + local cur prev all_opts commands + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + + commands="host-get-osd host-get-nodes host-osd-usage host-all-usage pg-get-host pg-most-write pg-less-write pg-most-write-kb pg-less-write-kb pg-most-read pg-less-read pg-most-read-kb pg-less-read-kb pg-empty rbd-prefix rbd-count rbd-host rbd-osd rbd-size rbd-all-size osd-most-used osd-less-used osd-get-ppg osd-get-pg object-get-host" + + all_opts="$commands -d -h" + + + +# If first option is -d keep completing without -d & -h + if [[ ${prev} == "-d" && ${#COMP_WORDS[@]} -eq 3 ]] ; then + COMPREPLY=( $(compgen -W "${commands}" -- ${cur}) ) + return 0 +# Do completion for first args + elif [[ ${#COMP_WORDS[@]} -eq 2 ]]; then + COMPREPLY=( $(compgen -W "${all_opts}" -- ${cur}) ) + return 0 +# Else do nothing + else + return 0 + fi +} +complete -F _ceph-lazy ceph-lazy diff --git a/src/tools/ceph-lazy/ceph-lazy b/src/tools/ceph-lazy/ceph-lazy new file mode 100755 index 000000000..39a331921 --- /dev/null +++ b/src/tools/ceph-lazy/ceph-lazy @@ -0,0 +1,709 @@ +#!/usr/bin/env bash +# +# ceph-lazy : Be efficient, be lazy ! +# +# Author: Gregory Charot +# +# This is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# + +# Specify extra options for ceph, like the username/keyring/etc. Can also be done with the CEPH_ARGS global variable +#CEPH_OPT="-n client.username" +VERSION="1.1.2" + +# +# Print info message to stderr +# + +function echoinfo() { + printf "INFO: %s\n" "$*" >&2; +} + + +# +# Print error message to stderr +# + +function echoerr() { + printf "ERROR: %s\n" "$*" >&2; +} + + +function help() { + >&2 echo "Usage : ceph-lazy [-d | -h] [command] [parameters] + +Ceph complex querying tool - Version $VERSION + +OPTIONS +======== + -d Activate debug mode + -h Print help + +COMMANDS +========= + + Host + ----- + host-get-osd hostname List all OSD IDs attached to a particular node.
+ host-get-nodes List all storage nodes. + host-osd-usage hostname Show total OSD space usage of a particular node (-d for details). + host-all-usage Show total OSD space usage of each node (-d for details). + + Placement groups + ----------------- + pg-get-host pgid Find PG storage hosts (first is primary) + pg-most-write Find most written PG (nb operations) + pg-less-write Find least written PG (nb operations) + pg-most-write-kb Find most written PG (data written) + pg-less-write-kb Find least written PG (data written) + pg-most-read Find most read PG (nb operations) + pg-less-read Find least read PG (nb operations) + pg-most-read-kb Find most read PG (data read) + pg-less-read-kb Find least read PG (data read) + pg-empty Find empty PGs (no stored object) + + RBD + ---- + rbd-prefix pool_name image_name Return RBD image prefix + rbd-count pool_name image_name Count number of objects in an RBD image + rbd-host pool_name image_name Find RBD primary storage hosts + rbd-osd pool_name image_name Find RBD primary OSDs + rbd-size pool_name image_name Print RBD image real size + rbd-all-size pool_name Print all RBD image sizes (largest first) + + OSD + ---- + osd-most-used Show the most used OSD (capacity) + osd-less-used Show the least used OSD (capacity) + osd-get-ppg osd_id Show all primary PGs hosted on an OSD + osd-get-pg osd_id Show all PGs hosted on an OSD + + Objects + -------- + object-get-host pool_name object_id Find object storage hosts (first is primary) + " + +} + +# +# Check dependencies +# +function check_requirements() +{ + + # List of command dependencies + local bin_dep="ceph rados rbd osdmaptool jq" + + for cmd in $bin_dep; do + [ $DEBUG -eq 1 ] && echoinfo "Checking for $cmd..." + $cmd --version >/dev/null 2>&1 || { echoerr "$cmd cannot be found... Aborting."; return 1; } + done + + CEPH="ceph $CEPH_OPT" + + [ $DEBUG -eq 1 ] && echoinfo "Checking Ceph connectivity & basic permissions..." + + if !
$CEPH -s &> /dev/null; then + echoerr "Cannot connect to cluster, please check your username & permissions" + echoerr "Command $CEPH -s failed" + return 1 + fi + + JQ="jq -M --raw-output" +} + +# +# Print the host that hosts a specific PG +# +function find_host_from_pg() { + + if [ $# -eq 1 ]; then + local PGID=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "PG $PGID has been found at (first is primary):" + + for osd in $($CEPH pg $PGID query | $JQ -cr .up[]); do + echo -n "OSD:osd.$osd | Host:" + $CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host + done +} + + +# +# Print the host that hosts a specific object +# +function find_host_from_object() { + + if [ $# -eq 2 ]; then + local pool=$1 + local objid=$2 + else + echoerr "This command requires two arguments" + help + exit 1 + fi + + local pgid=$($CEPH osd map $pool $objid --format json 2> /dev/null | $JQ -cr .pgid) + + [ $DEBUG -eq 1 ] && echoinfo $objid found in PG $pgid + + while read host; do + echo "PG:$pgid | $host" + done < <(find_host_from_pg $pgid) +} + + +# +# Print all primary pgs hosted by an OSD +# +function find_prim_pg_from_osd() { + + if [ $# -eq 1 ]; then + local posd=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Looking for primary PGs belonging to OSD $posd" + $CEPH pg dump pgs --format json 2>/dev/null | $JQ --argjson posd $posd '.[] | select(.acting_primary==$posd).pgid' +} + + +# +# Print all pgs (primary & secondary) hosted by an OSD +# +function find_all_pg_from_osd() { + + if [ $# -eq 1 ]; then + local osd=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Looking for all PGs mapped to OSD $osd" + $CEPH pg dump pgs --format json 2> /dev/null | $JQ -M --argjson osd $osd '.[] | select(.up[]==$osd).pgid' +} + + +# +# Check if a given image exists +# +function check_rbd_exists(){ + + pool=$1 + rbd=$2 + + if ! rbd info -p $pool $rbd &> /dev/null; then + echoerr "Unable to find image $pool/$rbd" + exit 1 + fi +} + + +# +# Return RBD prefix from image name +# +function get_rbd_prefix() { + + if [ $# -eq 2 ]; then + local pool=$1 + local rbd=$2 + else + echoerr "This command requires two arguments" + help + exit 1 + fi + + check_rbd_exists $pool $rbd + + local prefix=$(rbd --image $rbd -p $pool info --format json 2> /dev/null | jq --raw-output .block_name_prefix) + if [ -z $prefix ]; then + echoerr "Unable to find RBD Prefix for image $pool/$rbd" + exit 1 + else + echo $prefix + fi + +} + + +# +# Count number of objects in an RBD image +# +function count_rbd_object() { + + if [ $# -eq 2 ]; then + local pool=$1 + local rbd=$2 + else + echoerr "This command requires two arguments" + help + exit 1 + fi + + check_rbd_exists $pool $rbd + + local rbd_prefix=$(get_rbd_prefix $pool $rbd) + + [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now counting objects..."
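+# Note: every data object backing an image is named after the image's +# block_name_prefix (typically something like rbd_data.<image_id>), so +# filtering a full pool listing by that prefix counts only the objects that +# have actually been written, e.g. (hypothetical pool and prefix): +# rados -p rbd ls | grep rbd_data.10226b8b4567 | wc -l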
+ + local nb_obj=$(rados -p $pool ls | grep $rbd_prefix | wc -l) + + [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has $nb_obj objects" + echo $nb_obj +} + + +# +# Find primary storage host for a given RBD image +# +function find_prim_host_from_rbd() { + + if [ $# -eq 2 ]; then + local pool=$1 + local rbd=$2 + else + echoerr "This command requires two arguments" + help + exit 1 + fi + + check_rbd_exists $pool $rbd + + local osd="null" + local osdmap_t=$(mktemp) + local osdtree_t=$(mktemp) + # Get RBD image prefix + local rbd_prefix=$(get_rbd_prefix $pool $rbd) +# Exit if we received an empty prefix + [ -z $rbd_prefix ] && exit 1 + +# Get pool ID from pool name + local pool_id=$(ceph osd lspools -f json | $JQ -M --arg pool $pool '.[]|select(.poolname==$pool).poolnum') + + [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary host..." + + [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t" + if ! $CEPH osd getmap > $osdmap_t 2> /dev/null; then + echoerr "Failed to retrieve OSD map" + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD tree to $osdtree_t" + + if ! $CEPH osd tree --format json > $osdtree_t; then + echoerr "Failed to retrieve OSD tree" + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Looking for hosts..." + +# For each object in the RBD image + for obj in $(rados -p $pool ls | grep $rbd_prefix); + do +# Map object to OSD. osdmaptool does not support json output so using dirty sed. + osd=$(osdmaptool --test-map-object $obj --pool $pool_id $osdmap_t 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool) +# Map osd to host + $JQ --argjson osd $osd '.nodes[] | select(.type=="host") | select(.children[] == $osd).name' $osdtree_t + done | sort -u + +# Clean up temp files + rm -f $osdtree_t $osdmap_t +} + + +# +# Find primary OSDs for a given RBD image +# +function find_prim_osd_from_rbd() { + + if [ $# -eq 2 ]; then + local pool=$1 + local rbd=$2 + else + echoerr "This command requires two arguments" + help + exit 1 + fi + + check_rbd_exists $pool $rbd + + local osd="null" + local osdmap_t=$(mktemp) + local osdtree_t=$(mktemp) + # Get RBD image prefix + local rbd_prefix=$(get_rbd_prefix $pool $rbd) + +# Exit if we received an empty prefix + [ -z $rbd_prefix ] && exit 1 + + [ $DEBUG -eq 1 ] && echoinfo "RBD image $pool/$rbd has prefix $rbd_prefix; now finding primary OSDs..." + + [ $DEBUG -eq 1 ] && echoinfo "Dumping OSD map to $osdmap_t" + if ! $CEPH osd getmap > $osdmap_t; then + echoerr "Failed to retrieve OSD map" + exit 1 + fi + +# For each object in the RBD image + for obj in $(rados -p $pool ls | grep $rbd_prefix); + do +# Map object to OSD. osdmaptool does not support json output so using dirty sed. + osd=$(osdmaptool --test-map-object $obj $osdmap_t 2>/dev/null | sed -r 's/.*\[([[:digit:]]+),.*/\1/' | grep -v osdmaptool) + echo "osd.${osd}" + done | sort -u + +# Clean up temp files + rm -f $osdmap_t +} + + +# +# Print RBD image real size - Source http://ceph.com/planet/real-size-of-a-ceph-rbd-image/ +# + +function print_rbd_real_size { + + if [ $# -eq 2 ]; then + local pool=$1 + local rbd=$2 + else + echoerr "This command requires two arguments" + help + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Checking if RBD image exists..."
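+# Note: 'rbd diff' prints one "offset length type" row per allocated extent, +# so summing the second column (as done below) yields the space the image +# actually consumes rather than its provisioned size, e.g. (hypothetical +# pool/image pair): +# rbd diff rbd/myimage | awk '{ sum += $2 } END { print sum/1024/1024 " MB" }'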
+ + check_rbd_exists $pool $rbd + + rbd diff $pool/$rbd | awk '{ SUM += $2 } END { print SUM/1024/1024 " MB" }' + +} + + +# +# Print all RBD image real sizes - Top first +# + +function list_all_rbd_real_size { + + if [ $# -eq 1 ]; then + local pool=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Looking for RBD images in pool $pool" + + while read rbd; do + [ $DEBUG -eq 1 ] && echoinfo "Inspecting image $rbd" + rbd diff $pool/$rbd | awk -v rbd="$rbd" '{ SUM += $2 } END { print SUM/1024/1024 " MB - " rbd }' + done < <(rbd -p $pool ls) | sort -rV +} + + +# +# Print OSDs belonging to a particular storage host +# + +function list_osd_from_host() { + + if [ $# -eq 1 ]; then + local host=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + $CEPH osd tree --format json-pretty 2> /dev/null | $JQ --arg host $host '.nodes[] | select(.type=="host") | select(.name == $host).children[]' | sort -V + +} + + +# +# List all OSD nodes +# + +function list_all_nodes() { + + + $CEPH osd tree --format json | $JQ -M --raw-output '.nodes[] | select(.type=="host") | .name' | sort -V + +} + + +# +# Print Total OSD usage of a particular storage host +# + +function show_host_osd_usage() { + + if [ $# -eq 1 ]; then + local host=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + local pgmap_t=$(mktemp) + + local osd_used_kb=0 + local total_used_kb=0 + + local total_available_kb=0 + local osd_available_kb=0 + + local total_size_kb=0 + local osd_size_kb=0 + local nb_osd=0 + + [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..." + if ! $CEPH pg dump osds --format json 2>/dev/null > $pgmap_t; then + echoerr "Failed to retrieve PG map" + exit 1 + fi + + [ $DEBUG -eq 1 ] && echoinfo "Looking for all OSDs on host $host..." + + for osd in $(list_osd_from_host $host); do + + osd_used_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_used' $pgmap_t) + osd_available_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb_avail' $pgmap_t) + osd_size_kb=$($JQ --argjson osd $osd '.[] | select(.osd == $osd).kb' $pgmap_t) + + [ $DEBUG -eq 1 ] && echoinfo "OSD:$osd | Size:$(echo "scale=1;$osd_size_kb/1024/1024" | bc -l)GB | Used:$(echo "scale=1;$osd_used_kb /1024/1024" | bc -l)GB | Available:$(echo "scale=1;$osd_available_kb/1024/1024" | bc -l)GB" + + let "total_used_kb=total_used_kb+osd_used_kb" + let "total_available_kb=total_available_kb+osd_available_kb" + let "total_size_kb=total_size_kb+osd_size_kb" + let "nb_osd++" + + done + + echo "Host:$host | OSDs:$nb_osd | Total_Size:$(echo "scale=1;$total_size_kb/1024/1024" | bc -l)GB | Total_Used:$(echo "scale=1;$total_used_kb /1024/1024" | bc -l)GB | Total_Available:$(echo "scale=1;$total_available_kb/1024/1024" | bc -l)GB" + + rm -f $pgmap_t +} + + +# +# Print Total OSD usage of all nodes +# + +function list_all_nodes_osd_usage() { + + + for host in $(list_all_nodes); do + + [ $DEBUG -eq 1 ] && echoinfo "Looking at node $host..." 
+ + show_host_osd_usage $host + done + +} + + +# +# Find most used (space) OSD +# + +function find_most_used_osd() { + + local osd=$($CEPH pg dump osds --format json 2> /dev/null| $JQ 'max_by(.kb_used) | .osd') + local host=$($CEPH osd find $osd 2> /dev/null | $JQ .crush_location.host) + + echo "OSD:osd.${osd} | host:$host" +} + + +# +# Find less used (space) OSD +# + +function find_less_used_osd() { + + local osd=$($CEPH pg dump osds --format json 2> /dev/null| $JQ 'min_by(.kb_used) | .osd') + local host=$($CEPH osd find $osd 2> /dev/null | $JQ .crush_location.host) + + echo "OSD:osd.${osd} | host:$host" +} + + +# +# Query PG stats +# + +function pg_stat_query() { + + if [ $# -eq 1 ]; then + local query_type=$1 + else + echoerr "This command requires one argument" + help + exit 1 + fi + + local pgmap_t=$(mktemp) + + [ $DEBUG -eq 1 ] && echoinfo "Dumping PG map..." + if ! $CEPH pg dump pgs --format json 2>/dev/null > $pgmap_t; then + echoerr "Failed to retrieve PG map" + exit 1 + fi + + local pgid=$($JQ --arg query_type $query_type "$query_type" $pgmap_t) + [ $DEBUG -eq 1 ] && echoinfo "Found PGID $pgid" + + local osd=$($JQ --arg pgid $pgid '.[] | select(.pgid == $pgid).acting_primary' $pgmap_t) + [ $DEBUG -eq 1 ] && echoinfo "Found OSD $osd" + + local host=$($CEPH osd find $osd --format json 2> /dev/null | $JQ .crush_location.host) + [ $DEBUG -eq 1 ] && echoinfo "Found host $host" + + echo "PG:$pgid | OSD:osd.$osd | Host:$host" + + rm -f $pgmap_t +} + + +# +# Find empty pgs (no object stored) +# + +function find_empty_pg() { + + $CEPH pg dump pgs --format json 2>/dev/null | $JQ '.[] | select(.stat_sum.num_objects == 0).pgid' + +} + + +# +# MAIN +# + + +# Print help if no argument is given +if [ $# -eq 0 ]; then + help + exit 1 +fi + +# Activate debug mode if -d is specified as first parameter +if [ "$1" = "-d" ]; then + echoinfo "Debug mode activated" + DEBUG=1 + shift +else + DEBUG=0 +fi + + +# Check if all requirements are met +check_requirements || exit 1 + + +# Call proper function +case $1 in + "-h") + help + exit 0 + ;; + "host-get-osd") + list_osd_from_host $2 + ;; + "host-get-nodes") + list_all_nodes + ;; + "host-osd-usage") + show_host_osd_usage $2 + ;; + "host-all-usage") + list_all_nodes_osd_usage + ;; + "pg-get-host") + find_host_from_pg $2 + ;; + "pg-most-write") + pg_stat_query "max_by(.stat_sum.num_write).pgid" + ;; + "pg-less-write") + pg_stat_query "min_by(.stat_sum.num_write).pgid" + ;; + "pg-most-write-kb") + pg_stat_query "max_by(.stat_sum.num_write_kb).pgid" + ;; + "pg-less-write-kb") + pg_stat_query "min_by(.stat_sum.num_write_kb).pgid" + ;; + "pg-most-read") + pg_stat_query "max_by(.stat_sum.num_read).pgid" + ;; + "pg-less-read") + pg_stat_query "min_by(.stat_sum.num_read).pgid" + ;; + "pg-most-read-kb") + pg_stat_query "max_by(.stat_sum.num_read_kb).pgid" + ;; + "pg-less-read-kb") + pg_stat_query "min_by(.stat_sum.num_read_kb).pgid" + ;; + "rbd-prefix") + get_rbd_prefix $2 $3 + ;; + "rbd-count") + count_rbd_object $2 $3 + ;; + "rbd-host") + find_prim_host_from_rbd $2 $3 + ;; + "rbd-osd") + find_prim_osd_from_rbd $2 $3 + ;; + "rbd-size") + print_rbd_real_size $2 $3 + ;; + "rbd-all-size") + list_all_rbd_real_size $2 + ;; + "osd-most-used") + find_most_used_osd + ;; + "osd-less-used") + find_less_used_osd + ;; + "osd-get-ppg") + find_prim_pg_from_osd $2 + ;; + "osd-get-pg") + find_all_pg_from_osd $2 + ;; + "pg-empty") + find_empty_pg + ;; + "object-get-host") + find_host_from_object $2 $3 + ;; + *) + echoerr "Unknown command : $1" + help + exit 1 + ;; +esac + diff 
--git a/src/tools/ceph-monstore-update-crush.sh b/src/tools/ceph-monstore-update-crush.sh new file mode 100755 index 000000000..5adfacdc2 --- /dev/null +++ b/src/tools/ceph-monstore-update-crush.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2015 Red Hat +# +# Author: Kefu Chai +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +verbose= + +test -d ../src && export PATH=$PATH:. + +if ! which jq ; then + echo "Missing jq binary!" + exit 1 +fi + +if [ `uname` = FreeBSD ]; then + GETOPT=/usr/local/bin/getopt +else + GETOPT=getopt +fi + +function osdmap_get() { + local store_path=$1 + local query=$2 + local epoch=${3:+-v $3} + local osdmap=`mktemp` + + $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \ + $epoch -o $osdmap > /dev/null || return + + echo $($CEPH_BIN/osdmaptool --dump json $osdmap 2> /dev/null | \ + jq "$query") + + rm -f $osdmap +} + +function test_crush() { + local store_path=$1 + local epoch=$2 + local max_osd=$3 + local crush=$4 + local osdmap=`mktemp` + + $CEPH_BIN/ceph-monstore-tool $store_path get osdmap -- \ + -v $epoch -o $osdmap > /dev/null + $CEPH_BIN/osdmaptool --export-crush $crush $osdmap &> /dev/null + + if $CEPH_BIN/crushtool --test --check $max_osd -i $crush > /dev/null; then + good=true + else + good=false + fi + rm -f $osdmap + $good || return 1 +} + +function die() { + local retval=$? + echo "$@" >&2 + exit $retval +} + +function usage() { + [ $# -gt 0 ] && echo -e "\n$@" + cat <<EOF +usage: $0 [options] <mon-store> + +Search backward for the latest known-good epoch in monstore. Rewrite the osdmap +epochs after it with the crush map in the found epoch if asked to do so. By +default, print out the crush map in the good epoch. + + [-h|--help] display this message + [--out <file>] write the found crush map to given file (default: stdout) + [--rewrite] rewrite the monitor storage with the found crush map + [--verbose] be more chatty +EOF + [ $# -gt 0 ] && exit 1 + exit 0 +} + +function main() { + local temp + temp=$($GETOPT -o h --long verbose,help,mon-store:,out:,rewrite -n $0 -- "$@") || return 1 + + eval set -- "$temp" + local rewrite + while [ "$1" != "--" ]; do + case "$1" in + --verbose) + verbose=true + # set -xe + # PS4='${FUNCNAME[0]}: $LINENO: ' + shift;; + -h|--help) + usage + return 0;; + --out) + output=$2 + shift 2;; + --osdmap-epoch) + osdmap_epoch=$2 + shift 2;; + --rewrite) + rewrite=true + shift;; + *) + usage "unexpected argument $1" + shift;; + esac + done + shift + + local store_path="$1" + test $store_path || usage "I need the path to mon-store."
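+ # Example invocations (paths are hypothetical; the mon whose store is being + # read must be stopped first, since the store is opened directly): + # ceph-monstore-update-crush.sh --out /tmp/crush.txt /var/lib/ceph/mon/ceph-a/store.db + # ceph-monstore-update-crush.sh --rewrite /var/lib/ceph/mon/ceph-a/store.db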
+ + # try accessing the store; if it fails, likely means a mon is running + local last_osdmap_epoch + local max_osd + last_osdmap_epoch=$(osdmap_get $store_path ".epoch") || \ + die "error accessing mon store at $store_path" + # get the max_osd # in last osdmap epoch, crushtool will use it to check + # the crush maps in previous osdmaps + max_osd=$(osdmap_get $store_path ".max_osd" $last_osdmap_epoch) + + local good_crush + local good_epoch + test $verbose && echo "the latest osdmap epoch is $last_osdmap_epoch" + for epoch in `seq $last_osdmap_epoch -1 1`; do + local crush_path=`mktemp` + test $verbose && echo "checking crush map #$epoch" + if test_crush $store_path $epoch $max_osd $crush_path; then + test $verbose && echo "crush map version #$epoch works with osdmap epoch #$osdmap_epoch" + good_epoch=$epoch + good_crush=$crush_path + break + fi + rm -f $crush_path + done + + if test $good_epoch; then + echo "good crush map found at epoch $epoch/$last_osdmap_epoch" + else + echo "Unable to find a crush map for osdmap version #$osdmap_epoch." 2>&1 + return 1 + fi + + if test $good_epoch -eq $last_osdmap_epoch; then + echo "and mon store has no faulty crush maps." + elif test $output; then + $CEPH_BIN/crushtool --decompile $good_crush --outfn $output + elif test $rewrite; then + $CEPH_BIN/ceph-monstore-tool $store_path rewrite-crush -- \ + --crush $good_crush \ + --good-epoch $good_epoch + else + echo + $CEPH_BIN/crushtool --decompile $good_crush + fi + rm -f $good_crush +} + +main "$@" diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc new file mode 100644 index 000000000..bf0bd5523 --- /dev/null +++ b/src/tools/ceph_authtool.cc @@ -0,0 +1,318 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2009 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/ConfUtils.h" +#include "common/ceph_argparse.h" +#include "common/config_proxy.h" +#include "global/global_context.h" +#include "global/global_init.h" + +#include "auth/Crypto.h" +#include "auth/Auth.h" +#include "auth/KeyRing.h" + +using std::map; +using std::string; +using std::vector; +using std::cerr; +using std::cout; + +void usage() +{ + cout << "usage: ceph-authtool keyringfile [OPTIONS]...\n" + << "where the options are:\n" + << " -l, --list will list all keys and capabilities present in\n" + << " the keyring\n" + << " -p, --print-key will print an encoded key for the specified\n" + << " entityname. This is suitable for the\n" + << " 'mount -o secret=..' 
argument\n" + << " -C, --create-keyring will create a new keyring, overwriting any\n" + << " existing keyringfile\n" + << " -g, --gen-key will generate a new secret key for the\n" + << " specified entityname\n" + << " --gen-print-key will generate a new secret key without set it\n" + << " to the keyringfile, prints the secret to stdout\n" + << " --import-keyring FILE will import the content of a given keyring\n" + << " into the keyringfile\n" + << " -n NAME, --name NAME specify entityname to operate on\n" + << " -a BASE64, --add-key BASE64 will add an encoded key to the keyring\n" + << " --cap SUBSYSTEM CAPABILITY will set the capability for given subsystem\n" + << " --caps CAPSFILE will set all of capabilities associated with a\n" + << " given key, for all subsystems\n" + << " --mode MODE will set the desired file mode to the keyring\n" + << " e.g: '0644', defaults to '0600'" + << std::endl; + exit(1); +} + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + std::string add_key; + std::string caps_fn; + std::string import_keyring; + map caps; + std::string fn; + + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + + bool gen_key = false; + bool gen_print_key = false; + bool list = false; + bool print_key = false; + bool create_keyring = false; + int mode = 0600; // keyring file mode + std::vector::iterator i; + + /* Handle options unique to ceph-authtool + * -n NAME, --name NAME is handled by global_init + * */ + for (i = args.begin(); i != args.end(); ) { + std::string val; + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_flag(args, i, "-g", "--gen-key", (char*)NULL)) { + gen_key = true; + } else if (ceph_argparse_flag(args, i, "--gen-print-key", (char*)NULL)) { + gen_print_key = true; + } else if (ceph_argparse_witharg(args, i, &val, "-a", "--add-key", (char*)NULL)) { + if (val.empty()) { + cerr << "Option --add-key requires an argument" << std::endl; + exit(1); + } + add_key = val; + } else if (ceph_argparse_flag(args, i, "-l", "--list", (char*)NULL)) { + list = true; + } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) { + caps_fn = val; + } else if (ceph_argparse_witharg(args, i, &val, "--cap", (char*)NULL)) { + std::string my_key = val; + if (i == args.end()) { + cerr << "must give two arguments to --cap: key and val." 
<< std::endl; + exit(1); + } + std::string my_val = *i; + ++i; + encode(my_val, caps[my_key]); + } else if (ceph_argparse_flag(args, i, "-p", "--print-key", (char*)NULL)) { + print_key = true; + } else if (ceph_argparse_flag(args, i, "-C", "--create-keyring", (char*)NULL)) { + create_keyring = true; + } else if (ceph_argparse_witharg(args, i, &val, "--import-keyring", (char*)NULL)) { + import_keyring = val; + } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) { + std::string err; + mode = strict_strtoll(val.c_str(), 8, &err); + if (!err.empty()) { + cerr << "Option --mode requires an argument" << std::endl; + exit(1); + } + } else if (fn.empty()) { + fn = *i++; + } else { + cerr << argv[0] << ": unexpected '" << *i << "'" << std::endl; + usage(); + } + } + + if (fn.empty() && !gen_print_key) { + cerr << argv[0] << ": must specify filename" << std::endl; + usage(); + } + if (!(gen_key || + gen_print_key || + !add_key.empty() || + list || + !caps_fn.empty() || + !caps.empty() || + print_key || + create_keyring || + !import_keyring.empty())) { + cerr << "no command specified" << std::endl; + usage(); + } + if (gen_key && (!add_key.empty())) { + cerr << "can't both gen-key and add-key" << std::endl; + usage(); + } + + common_init_finish(g_ceph_context); + EntityName ename(g_conf()->name); + + // Enforce the use of gen-key or add-key when creating to avoid ending up + // with an "empty" key (key = AAAAAAAAAAAAAAAA) + if (create_keyring && !gen_key && add_key.empty() && !caps.empty()) { + cerr << "must specify either gen-key or add-key when creating" << std::endl; + usage(); + } + + if (gen_print_key) { + CryptoKey key; + key.create(g_ceph_context, CEPH_CRYPTO_AES); + cout << key << std::endl; + return 0; + } + + // keyring -------- + bool modified = false; + bool added_entity = false; + KeyRing keyring; + + bufferlist bl; + int r = 0; + if (create_keyring) { + cout << "creating " << fn << std::endl; + modified = true; + } else { + std::string err; + r = bl.read_file(fn.c_str(), &err); + if (r >= 0) { + try { + auto iter = bl.cbegin(); + decode(keyring, iter); + } catch (const buffer::error &err) { + cerr << "error reading file " << fn << std::endl; + exit(1); + } + } else { + cerr << "can't open " << fn << ": " << err << std::endl; + exit(1); + } + } + + // Validate that "name" actually has an existing key in this keyring if we + // have not given gen-key or add-key options + if (!gen_key && add_key.empty() && !caps.empty()) { + CryptoKey key; + if (!keyring.get_secret(ename, key)) { + cerr << "can't find existing key for " << ename + << " and neither gen-key nor add-key specified" << std::endl; + exit(1); + } + } + + // write commands + if (!import_keyring.empty()) { + KeyRing other; + bufferlist obl; + std::string err; + int r = obl.read_file(import_keyring.c_str(), &err); + if (r >= 0) { + try { + auto iter = obl.cbegin(); + decode(other, iter); + } catch (const buffer::error &err) { + cerr << "error reading file " << import_keyring << std::endl; + exit(1); + } + + cout << "importing contents of " << import_keyring << " into " << fn << std::endl; + //other.print(cout); + keyring.import(g_ceph_context, other); + modified = true; + } else { + cerr << "can't open " << import_keyring << ": " << err << std::endl; + exit(1); + } + } + if (gen_key) { + EntityAuth eauth; + eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES); + keyring.add(ename, eauth); + modified = true; + } + if (!add_key.empty()) { + EntityAuth eauth; + try { + eauth.key.decode_base64(add_key); + } catch (const 
buffer::error &err) { + cerr << "can't decode key '" << add_key << "'" << std::endl; + exit(1); + } + keyring.add(ename, eauth); + modified = true; + cout << "added entity " << ename << " " << eauth << std::endl; + added_entity = true; + } + if (!caps_fn.empty()) { + ConfFile cf; + if (cf.parse_file(caps_fn, &cerr) != 0) { + cerr << "could not parse caps file " << caps_fn << std::endl; + exit(1); + } + map caps; + const char *key_names[] = { "mon", "osd", "mds", "mgr", NULL }; + for (int i=0; key_names[i]; i++) { + std::string val; + if (cf.read("global", key_names[i], val) == 0) { + bufferlist bl; + encode(val, bl); + string s(key_names[i]); + caps[s] = bl; + } + } + keyring.set_caps(ename, caps); + modified = true; + } + if (!caps.empty()) { + keyring.set_caps(ename, caps); + modified = true; + } + if (added_entity && caps.size() > 0) { + cout << "added " << caps.size() << " caps to entity " << ename << std::endl; + } + + // read commands + if (list) { + try { + keyring.print(cout); + } catch (ceph::buffer::end_of_buffer &eob) { + cout << "Exception (end_of_buffer) in print(), exit." << std::endl; + exit(1); + } + } + if (print_key) { + CryptoKey key; + if (keyring.get_secret(ename, key)) { + cout << key << std::endl; + } else { + cerr << "entity " << ename << " not found" << std::endl; + exit(1); + } + } + + // write result? + if (modified) { + bufferlist bl; + keyring.encode_plaintext(bl); + r = bl.write_file(fn.c_str(), mode); + if (r < 0) { + cerr << "could not write " << fn << std::endl; + exit(1); + } + //cout << "wrote " << bl.length() << " bytes to " << fn << std::endl; + } + return 0; +} diff --git a/src/tools/ceph_conf.cc b/src/tools/ceph_conf.cc new file mode 100644 index 000000000..1d1fc1f77 --- /dev/null +++ b/src/tools/ceph_conf.cc @@ -0,0 +1,278 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Dreamhost + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include + +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "mon/AuthMonitor.h" +#include "common/Formatter.h" + +using std::deque; +using std::string; +using std::unique_ptr; +using std::cerr; +using std::cout; +using std::vector; + +static void usage(std::ostream& out) +{ + // TODO: add generic_usage once cerr/derr issues are resolved + out << R"(Ceph configuration query tool + +USAGE +ceph-conf + +ACTIONS + -L|--list-all-sections List all sections + -l|--list-sections List sections with the given prefix + --filter-key Filter section list to only include sections + with given key defined. + --filter-key-value = Filter section list to only include sections + with given key/value pair. + --lookup Print a configuration setting to stdout. + Returns 0 (success) if the configuration setting is + found; 1 otherwise. + -r|--resolve-search search for the first file that exists and + can be opened in the resulted comma + delimited search list. + -D|--dump-all dump all variables. + --show-config-value Print the corresponding ceph.conf value + that matches the specified key. Also searches + global defaults. + +FLAGS + --name name Set type.id + [-s
] Add to list of sections to search + [--format plain|json|json-pretty] + dump variables in plain text, json or pretty + json + [--pid ] Override the $pid when expanding options + +If there is no action given, the action will default to --lookup. + +EXAMPLES +$ ceph-conf --name mon.0 -c /etc/ceph/ceph.conf 'mon addr' +Find out what the value of 'mon addr' is for monitor 0. + +$ ceph-conf -l mon +List sections beginning with 'mon'. + +RETURN CODE +Return code will be 0 on success; error code otherwise. +)"; +} + +static int list_sections(const std::string &prefix, + const std::list& filter_key, + const std::map& filter_key_value) +{ + std::vector sections; + int ret = g_conf().get_all_sections(sections); + if (ret) + return 2; + for (std::vector::const_iterator p = sections.begin(); + p != sections.end(); ++p) { + if (strncmp(prefix.c_str(), p->c_str(), prefix.size())) + continue; + + std::vector sec; + sec.push_back(*p); + + int r = 0; + for (std::list::const_iterator q = filter_key.begin(); q != filter_key.end(); ++q) { + string v; + r = g_conf().get_val_from_conf_file(sec, q->c_str(), v, false); + if (r < 0) + break; + } + if (r < 0) + continue; + + for (std::map::const_iterator q = filter_key_value.begin(); + q != filter_key_value.end(); + ++q) { + string v; + r = g_conf().get_val_from_conf_file(sec, q->first.c_str(), v, false); + if (r < 0 || v != q->second) { + r = -1; + break; + } + } + if (r < 0) + continue; + + cout << *p << std::endl; + } + return 0; +} + +static int lookup(const std::deque §ions, + const std::string &key, bool resolve_search) +{ + std::vector my_sections{sections.begin(), sections.end()}; + for (auto& section : g_conf().get_my_sections()) { + my_sections.push_back(section); + } + std::string val; + int ret = g_conf().get_val_from_conf_file(my_sections, key.c_str(), val, true); + if (ret == -ENOENT) + return 1; + else if (ret == 0) { + if (resolve_search) { + string result; + ret = ceph_resolve_file_search(val, result); + if (!ret) + puts(result.c_str()); + } + else { + puts(val.c_str()); + } + return 0; + } + else { + cerr << "error looking up '" << key << "': error " << ret << std::endl; + return 2; + } +} + +static int dump_all(const string& format) +{ + if (format == "" || format == "plain") { + g_conf().show_config(std::cout); + return 0; + } else { + unique_ptr f(Formatter::create(format)); + if (f) { + f->open_object_section("ceph-conf"); + g_conf().show_config(f.get()); + f->close_section(); + f->flush(std::cout); + return 0; + } + cerr << "format '" << format << "' not recognized." 
<< std::endl; + usage(cerr); + return 1; + } +} + +static void maybe_override_pid(vector& args) +{ + for (auto i = args.begin(); i != args.end(); ++i) { + string val; + if (ceph_argparse_witharg(args, i, &val, "--pid", (char*)NULL)) { + setenv("PID", val.c_str(), 1); + break; + } + } +} + +int main(int argc, const char **argv) +{ + deque sections; + bool resolve_search = false; + std::string action; + std::string lookup_key; + std::string section_list_prefix; + std::list filter_key; + std::map filter_key_value; + std::string dump_format; + + auto args = argv_to_vec(argc, argv); + + auto orig_args = args; + auto cct = [&args] { + // override the PID before options are expanded + maybe_override_pid(args); + std::map defaults = {{"log_to_file", "false"}}; + return global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_NO_DAEMON_ACTIONS | + CINIT_FLAG_NO_MON_CONFIG); + }(); + + // do not common_init_finish(); do not start threads; do not do any of thing + // wonky things the daemon whose conf we are examining would do (like initialize + // the admin socket). + //common_init_finish(g_ceph_context); + + std::string val; + for (std::vector::iterator i = args.begin(); i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, "-s", "--section", (char*)NULL)) { + sections.push_back(val); + } else if (ceph_argparse_flag(args, i, "-r", "--resolve_search", (char*)NULL)) { + resolve_search = true; + } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + action = "help"; + } else if (ceph_argparse_witharg(args, i, &val, "--lookup", (char*)NULL)) { + action = "lookup"; + lookup_key = val; + } else if (ceph_argparse_flag(args, i, "-L", "--list_all_sections", (char*)NULL)) { + action = "list-sections"; + section_list_prefix = ""; + } else if (ceph_argparse_witharg(args, i, &val, "-l", "--list_sections", (char*)NULL)) { + action = "list-sections"; + section_list_prefix = val; + } else if (ceph_argparse_witharg(args, i, &val, "--filter_key", (char*)NULL)) { + filter_key.push_back(val); + } else if (ceph_argparse_witharg(args, i, &val, "--filter_key_value", (char*)NULL)) { + size_t pos = val.find_first_of('='); + if (pos == string::npos) { + cerr << "expecting argument like 'key=value' for --filter-key-value (not '" << val << "')" << std::endl; + usage(cerr); + return EXIT_FAILURE; + } + string key(val, 0, pos); + string value(val, pos+1); + filter_key_value[key] = value; + } else if (ceph_argparse_flag(args, i, "-D", "--dump_all", (char*)NULL)) { + action = "dumpall"; + } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) { + dump_format = val; + } else { + if (((action == "lookup") || (action == "")) && (lookup_key.empty())) { + action = "lookup"; + lookup_key = *i++; + } else { + cerr << "unable to parse option: '" << *i << "'" << std::endl; + cerr << "args:"; + for (auto arg : orig_args) { + cerr << " " << std::quoted(arg); + } + cerr << std::endl; + usage(cerr); + return EXIT_FAILURE; + } + } + } + + cct->_log->flush(); + if (action == "help") { + usage(cout); + return EXIT_SUCCESS; + } else if (action == "list-sections") { + return list_sections(section_list_prefix, filter_key, filter_key_value); + } else if (action == "lookup") { + return lookup(sections, lookup_key, resolve_search); + } else if (action == "dumpall") { + return dump_all(dump_format); + } else { + cerr << "You must give an action, such as --lookup or --list-all-sections." 
<< std::endl; + cerr << "Pass --help for more help." << std::endl; + return EXIT_FAILURE; + } +} diff --git a/src/tools/ceph_dedup_tool.cc b/src/tools/ceph_dedup_tool.cc new file mode 100644 index 000000000..f3c942a97 --- /dev/null +++ b/src/tools/ceph_dedup_tool.cc @@ -0,0 +1,1779 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Author: Myoungwon Oh + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "include/types.h" + +#include "include/rados/buffer.h" +#include "include/rados/librados.hpp" +#include "include/rados/rados_types.hpp" + +#include "acconfig.h" + +#include "common/Cond.h" +#include "common/Formatter.h" +#include "common/ceph_argparse.h" +#include "common/ceph_crypto.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/obj_bencher.h" +#include "global/global_init.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tools/RadosDump.h" +#include "cls/cas/cls_cas_client.h" +#include "cls/cas/cls_cas_internal.h" +#include "include/stringify.h" +#include "global/signal_handler.h" +#include "common/CDC.h" +#include "common/Preforker.h" + +#include +#include + +using namespace std; +namespace po = boost::program_options; + +struct EstimateResult { + std::unique_ptr cdc; + + uint64_t chunk_size; + + ceph::mutex lock = ceph::make_mutex("EstimateResult::lock"); + + // < key, > + map< string, pair > chunk_statistics; + uint64_t total_bytes = 0; + std::atomic total_objects = {0}; + + EstimateResult(std::string alg, int chunk_size) + : cdc(CDC::create(alg, chunk_size)), + chunk_size(1ull << chunk_size) {} + + void add_chunk(bufferlist& chunk, const std::string& fp_algo) { + string fp; + if (fp_algo == "sha1") { + sha1_digest_t sha1_val = crypto::digest(chunk); + fp = sha1_val.to_str(); + } else if (fp_algo == "sha256") { + sha256_digest_t sha256_val = crypto::digest(chunk); + fp = sha256_val.to_str(); + } else if (fp_algo == "sha512") { + sha512_digest_t sha512_val = crypto::digest(chunk); + fp = sha512_val.to_str(); + } else { + ceph_assert(0 == "no support fingerperint algorithm"); + } + + std::lock_guard l(lock); + auto p = chunk_statistics.find(fp); + if (p != chunk_statistics.end()) { + p->second.first++; + if (p->second.second != chunk.length()) { + cerr << "warning: hash collision on " << fp + << ": was " << p->second.second + << " now " << chunk.length() << std::endl; + } + } else { + chunk_statistics[fp] = make_pair(1, chunk.length()); + } + total_bytes += chunk.length(); + } + + void dump(Formatter *f) const { + f->dump_unsigned("target_chunk_size", chunk_size); + + uint64_t dedup_bytes = 0; + uint64_t dedup_objects = chunk_statistics.size(); + for (auto& j : chunk_statistics) { + dedup_bytes += j.second.second; + } + //f->dump_unsigned("dedup_bytes", dedup_bytes); + //f->dump_unsigned("original_bytes", total_bytes); + f->dump_float("dedup_bytes_ratio", + (double)dedup_bytes / (double)total_bytes); + f->dump_float("dedup_objects_ratio", + (double)dedup_objects / (double)total_objects); + + uint64_t avg = total_bytes / dedup_objects; + uint64_t sqsum = 0; + for (auto& j : chunk_statistics) { + sqsum += (avg - j.second.second) * (avg - j.second.second); + } + 
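+ // sqsum now holds the sum of squared deviations of each unique chunk's + // length from the mean, so the next line is the population standard + // deviation of the chunk sizes (integer math, so only approximate).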
uint64_t stddev = sqrt(sqsum / dedup_objects); + f->dump_unsigned("chunk_size_average", avg); + f->dump_unsigned("chunk_size_stddev", stddev); + } +}; + +map dedup_estimates; // chunk size -> result + +using namespace librados; +unsigned default_op_size = 1 << 26; +unsigned default_max_thread = 2; +int32_t default_report_period = 10; +ceph::mutex glock = ceph::make_mutex("glock"); + +po::options_description make_usage() { + po::options_description desc("Usage"); + desc.add_options() + ("help,h", ": produce help message") + ("op estimate --pool --chunk-size --chunk-algorithm --fingerprint-algorithm ", + ": estimate how many chunks are redundant") + ("op chunk-scrub --chunk-pool ", + ": perform chunk scrub") + ("op chunk-get-ref --chunk-pool --object --target-ref --target-ref-pool-id ", + ": get chunk object's reference") + ("op chunk-put-ref --chunk-pool --object --target-ref --target-ref-pool-id ", + ": put chunk object's reference") + ("op chunk-repair --chunk-pool --object --target-ref --target-ref-pool-id ", + ": fix mismatched references") + ("op dump-chunk-refs --chunk-pool --object ", + ": dump chunk object's references") + ("op chunk-dedup --pool --object --chunk-pool --fingerprint-algorithm --source-off --source-length ", + ": perform a chunk dedup---deduplicate only a chunk, which is a part of object.") + ("op object-dedup --pool --object --chunk-pool --fingerprint-algorithm --dedup-cdc-chunk-size [--snap]", + ": perform a object dedup---deduplicate the entire object, not a chunk. Related snapshots are also deduplicated if --snap is given") + ("op sample-dedup --pool --chunk-pool --chunk-algorithm --fingerprint-algorithm --daemon --loop", + ": perform a sample dedup---make crawling threads which crawl objects in base pool and deduplicate them based on their deduplication efficiency") + ; + po::options_description op_desc("Opational arguments"); + op_desc.add_options() + ("op", po::value(), ": estimate|chunk-scrub|chunk-get-ref|chunk-put-ref|chunk-repair|dump-chunk-refs|chunk-dedup|object-dedup") + ("target-ref", po::value(), ": set target object") + ("target-ref-pool-id", po::value(), ": set target pool id") + ("object", po::value(), ": set object name") + ("chunk-size", po::value(), ": chunk size (byte)") + ("chunk-algorithm", po::value(), ": , set chunk-algorithm") + ("fingerprint-algorithm", po::value(), ": , set fingerprint-algorithm") + ("chunk-pool", po::value(), ": set chunk pool name") + ("max-thread", po::value(), ": set max thread") + ("report-period", po::value(), ": set report-period") + ("max-seconds", po::value(), ": set max runtime") + ("max-read-size", po::value(), ": set max read size") + ("pool", po::value(), ": set pool name") + ("min-chunk-size", po::value(), ": min chunk size (byte)") + ("max-chunk-size", po::value(), ": max chunk size (byte)") + ("source-off", po::value(), ": set source offset") + ("source-length", po::value(), ": set source length") + ("dedup-cdc-chunk-size", po::value(), ": set dedup chunk size for cdc") + ("snap", ": deduplciate snapshotted object") + ("debug", ": enable debug") + ("pgid", ": set pgid") + ("chunk-dedup-threshold", po::value(), ": set the threshold for chunk dedup (number of duplication) ") + ("sampling-ratio", po::value(), ": set the sampling ratio (percentile)") + ("daemon", ": execute sample dedup in daemon mode") + ("loop", ": execute sample dedup in a loop until terminated. 
Sleeps 'wakeup-period' seconds between iterations") + ("wakeup-period", po::value(), ": set the wakeup period of crawler thread (sec)") + ; + desc.add(op_desc); + return desc; +} + +template +static int rados_sistrtoll(I &i, T *val) { + std::string err; + *val = strict_iecstrtoll(i->second, &err); + if (err != "") { + cerr << "Invalid value for " << i->first << ": " << err << std::endl; + return -EINVAL; + } else { + return 0; + } +} + +class EstimateDedupRatio; +class ChunkScrub; +class CrawlerThread : public Thread +{ + IoCtx io_ctx; + int n; + int m; + ObjectCursor begin; + ObjectCursor end; + ceph::mutex m_lock = ceph::make_mutex("CrawlerThread::Locker"); + ceph::condition_variable m_cond; + int32_t report_period; + bool m_stop = false; + uint64_t total_bytes = 0; + uint64_t total_objects = 0; + uint64_t examined_objects = 0; + uint64_t examined_bytes = 0; + uint64_t max_read_size = 0; + bool debug = false; +#define COND_WAIT_INTERVAL 10 + +public: + CrawlerThread(IoCtx& io_ctx, int n, int m, + ObjectCursor begin, ObjectCursor end, int32_t report_period, + uint64_t num_objects, uint64_t max_read_size = default_op_size): + io_ctx(io_ctx), n(n), m(m), begin(begin), end(end), + report_period(report_period), total_objects(num_objects), max_read_size(max_read_size) + {} + + void signal(int signum) { + std::lock_guard l{m_lock}; + m_stop = true; + m_cond.notify_all(); + } + virtual void print_status(Formatter *f, ostream &out) {} + uint64_t get_examined_objects() { return examined_objects; } + uint64_t get_examined_bytes() { return examined_bytes; } + uint64_t get_total_bytes() { return total_bytes; } + uint64_t get_total_objects() { return total_objects; } + void set_debug(const bool debug_) { debug = debug_; } + friend class EstimateDedupRatio; + friend class ChunkScrub; +}; + +class EstimateDedupRatio : public CrawlerThread +{ + string chunk_algo; + string fp_algo; + uint64_t chunk_size; + uint64_t max_seconds; + +public: + EstimateDedupRatio( + IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end, + string chunk_algo, string fp_algo, uint64_t chunk_size, int32_t report_period, + uint64_t num_objects, uint64_t max_read_size, + uint64_t max_seconds): + CrawlerThread(io_ctx, n, m, begin, end, report_period, num_objects, + max_read_size), + chunk_algo(chunk_algo), + fp_algo(fp_algo), + chunk_size(chunk_size), + max_seconds(max_seconds) { + } + + void* entry() { + estimate_dedup_ratio(); + return NULL; + } + void estimate_dedup_ratio(); +}; + +class ChunkScrub: public CrawlerThread +{ + IoCtx chunk_io_ctx; + int damaged_objects = 0; + +public: + ChunkScrub(IoCtx& io_ctx, int n, int m, ObjectCursor begin, ObjectCursor end, + IoCtx& chunk_io_ctx, int32_t report_period, uint64_t num_objects): + CrawlerThread(io_ctx, n, m, begin, end, report_period, num_objects), chunk_io_ctx(chunk_io_ctx) + { } + void* entry() { + chunk_scrub_common(); + return NULL; + } + void chunk_scrub_common(); + int get_damaged_objects() { return damaged_objects; } + void print_status(Formatter *f, ostream &out); +}; + +vector> estimate_threads; + +static void print_dedup_estimate(std::ostream& out, std::string chunk_algo) +{ + /* + uint64_t total_bytes = 0; + uint64_t total_objects = 0; + */ + uint64_t examined_objects = 0; + uint64_t examined_bytes = 0; + + for (auto &et : estimate_threads) { + examined_objects += et->get_examined_objects(); + examined_bytes += et->get_examined_bytes(); + } + + auto f = Formatter::create("json-pretty"); + f->open_object_section("results"); + f->dump_string("chunk_algo", 
chunk_algo); + f->open_array_section("chunk_sizes"); + for (auto& i : dedup_estimates) { + f->dump_object("chunker", i.second); + } + f->close_section(); + + f->open_object_section("summary"); + f->dump_unsigned("examined_objects", examined_objects); + f->dump_unsigned("examined_bytes", examined_bytes); + /* + f->dump_unsigned("total_objects", total_objects); + f->dump_unsigned("total_bytes", total_bytes); + f->dump_float("examined_ratio", (float)examined_bytes / (float)total_bytes); + */ + f->close_section(); + f->close_section(); + f->flush(out); +} + +static void handle_signal(int signum) +{ + std::lock_guard l{glock}; + for (auto &p : estimate_threads) { + p->signal(signum); + } +} + +void EstimateDedupRatio::estimate_dedup_ratio() +{ + ObjectCursor shard_start; + ObjectCursor shard_end; + + io_ctx.object_list_slice( + begin, + end, + n, + m, + &shard_start, + &shard_end); + + utime_t start = ceph_clock_now(); + utime_t end; + if (max_seconds) { + end = start; + end += max_seconds; + } + + utime_t next_report; + if (report_period) { + next_report = start; + next_report += report_period; + } + + ObjectCursor c(shard_start); + while (c < shard_end) + { + std::vector result; + int r = io_ctx.object_list(c, shard_end, 12, {}, &result, &c); + if (r < 0 ){ + cerr << "error object_list : " << cpp_strerror(r) << std::endl; + return; + } + + unsigned op_size = max_read_size; + + for (const auto & i : result) { + const auto &oid = i.oid; + + utime_t now = ceph_clock_now(); + if (max_seconds && now > end) { + m_stop = true; + } + if (m_stop) { + return; + } + + if (n == 0 && // first thread only + next_report != utime_t() && now > next_report) { + cerr << (int)(now - start) << "s : read " + << dedup_estimates.begin()->second.total_bytes << " bytes so far..." 
+ << std::endl; + print_dedup_estimate(cerr, chunk_algo); + next_report = now; + next_report += report_period; + } + + // read entire object + bufferlist bl; + uint64_t offset = 0; + while (true) { + bufferlist t; + int ret = io_ctx.read(oid, t, op_size, offset); + if (ret <= 0) { + break; + } + offset += ret; + bl.claim_append(t); + } + examined_objects++; + examined_bytes += bl.length(); + + // do the chunking + for (auto& i : dedup_estimates) { + vector> chunks; + i.second.cdc->calc_chunks(bl, &chunks); + for (auto& p : chunks) { + bufferlist chunk; + chunk.substr_of(bl, p.first, p.second); + i.second.add_chunk(chunk, fp_algo); + if (debug) { + cout << " " << oid << " " << p.first << "~" << p.second << std::endl; + } + } + ++i.second.total_objects; + } + } + } +} + +void ChunkScrub::chunk_scrub_common() +{ + ObjectCursor shard_start; + ObjectCursor shard_end; + int ret; + Rados rados; + + ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + return; + } + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + return; + } + + chunk_io_ctx.object_list_slice( + begin, + end, + n, + m, + &shard_start, + &shard_end); + + ObjectCursor c(shard_start); + while(c < shard_end) + { + std::vector result; + int r = chunk_io_ctx.object_list(c, shard_end, 12, {}, &result, &c); + if (r < 0 ){ + cerr << "error object_list : " << cpp_strerror(r) << std::endl; + return; + } + + for (const auto & i : result) { + std::unique_lock l{m_lock}; + if (m_stop) { + Formatter *formatter = Formatter::create("json-pretty"); + print_status(formatter, cout); + delete formatter; + return; + } + auto oid = i.oid; + cout << oid << std::endl; + chunk_refs_t refs; + { + bufferlist t; + ret = chunk_io_ctx.getxattr(oid, CHUNK_REFCOUNT_ATTR, t); + if (ret < 0) { + continue; + } + auto p = t.cbegin(); + decode(refs, p); + } + + examined_objects++; + if (refs.get_type() != chunk_refs_t::TYPE_BY_OBJECT) { + // we can't do anything here + continue; + } + + // check all objects + chunk_refs_by_object_t *byo = + static_cast(refs.r.get()); + set real_refs; + + uint64_t pool_missing = 0; + uint64_t object_missing = 0; + uint64_t does_not_ref = 0; + for (auto& pp : byo->by_object) { + IoCtx target_io_ctx; + ret = rados.ioctx_create2(pp.pool, target_io_ctx); + if (ret < 0) { + cerr << oid << " ref " << pp + << ": referencing pool does not exist" << std::endl; + ++pool_missing; + continue; + } + + ret = cls_cas_references_chunk(target_io_ctx, pp.oid.name, oid); + if (ret == -ENOENT) { + cerr << oid << " ref " << pp + << ": referencing object missing" << std::endl; + ++object_missing; + } else if (ret == -ENOLINK) { + cerr << oid << " ref " << pp + << ": referencing object does not reference chunk" + << std::endl; + ++does_not_ref; + } + } + if (pool_missing || object_missing || does_not_ref) { + ++damaged_objects; + } + } + } + cout << "--done--" << std::endl; +} + +using AioCompRef = unique_ptr; + +class SampleDedupWorkerThread : public Thread +{ +public: + struct chunk_t { + string oid = ""; + size_t start = 0; + size_t size = 0; + string fingerprint = ""; + bufferlist data; + }; + + class FpStore { + public: + using dup_count_t = ssize_t; + + bool find(string& fp) { + std::shared_lock lock(fingerprint_lock); + auto found_item = fp_map.find(fp); + return found_item != fp_map.end(); + } + + // return true if the chunk is duplicate + bool add(chunk_t& chunk) { + std::unique_lock 
lock(fingerprint_lock); + auto found_iter = fp_map.find(chunk.fingerprint); + ssize_t cur_reference = 1; + if (found_iter == fp_map.end()) { + fp_map.insert({chunk.fingerprint, 1}); + } else { + cur_reference = ++found_iter->second; + } + return cur_reference >= dedup_threshold && dedup_threshold != -1; + } + + void init(size_t dedup_threshold_) { + std::unique_lock lock(fingerprint_lock); + fp_map.clear(); + dedup_threshold = dedup_threshold_; + } + FpStore(size_t chunk_threshold) : dedup_threshold(chunk_threshold) { } + + private: + ssize_t dedup_threshold = -1; + std::unordered_map fp_map; + std::shared_mutex fingerprint_lock; + }; + + struct SampleDedupGlobal { + FpStore fp_store; + const double sampling_ratio = -1; + SampleDedupGlobal( + int chunk_threshold, + int sampling_ratio) : + fp_store(chunk_threshold), + sampling_ratio(static_cast(sampling_ratio) / 100) { } + }; + + SampleDedupWorkerThread( + IoCtx &io_ctx, + IoCtx &chunk_io_ctx, + ObjectCursor begin, + ObjectCursor end, + size_t chunk_size, + std::string &fp_algo, + std::string &chunk_algo, + SampleDedupGlobal &sample_dedup_global) : + io_ctx(io_ctx), + chunk_io_ctx(chunk_io_ctx), + chunk_size(chunk_size), + fp_type(pg_pool_t::get_fingerprint_from_str(fp_algo)), + chunk_algo(chunk_algo), + sample_dedup_global(sample_dedup_global), + begin(begin), + end(end) { } + + ~SampleDedupWorkerThread() { }; + +protected: + void* entry() override { + crawl(); + return nullptr; + } + +private: + void crawl(); + std::tuple, ObjectCursor> get_objects( + ObjectCursor current, + ObjectCursor end, + size_t max_object_count); + std::vector sample_object(size_t count); + void try_dedup_and_accumulate_result(ObjectItem &object); + bool ok_to_dedup_all(); + int do_chunk_dedup(chunk_t &chunk); + bufferlist read_object(ObjectItem &object); + std::vector>> do_cdc( + ObjectItem &object, + bufferlist &data); + std::string generate_fingerprint(bufferlist chunk_data); + AioCompRef do_async_evict(string oid); + + IoCtx io_ctx; + IoCtx chunk_io_ctx; + size_t total_duplicated_size = 0; + size_t total_object_size = 0; + + std::set oid_for_evict; + const size_t chunk_size = 0; + pg_pool_t::fingerprint_t fp_type = pg_pool_t::TYPE_FINGERPRINT_NONE; + std::string chunk_algo; + SampleDedupGlobal &sample_dedup_global; + ObjectCursor begin; + ObjectCursor end; +}; + +void SampleDedupWorkerThread::crawl() +{ + cout << "new iteration" << std::endl; + + ObjectCursor current_object = begin; + while (current_object < end) { + std::vector objects; + // Get the list of object IDs to deduplicate + std::tie(objects, current_object) = get_objects(current_object, end, 100); + + // Pick few objects to be processed. Sampling ratio decides how many + // objects to pick. Lower sampling ratio makes crawler have lower crawling + // overhead but find less duplication. 
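+ // For example, with --sampling-ratio 50 the ratio below is 0.5, so + // sample_object() shuffles the indexes of the batch of up to 100 objects + // listed above and keeps roughly half of them for dedup processing.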
+ auto sampled_indexes = sample_object(objects.size()); + for (size_t index : sampled_indexes) { + ObjectItem target = objects[index]; + try_dedup_and_accumulate_result(target); + } + } + + vector evict_completions(oid_for_evict.size()); + int i = 0; + for (auto &oid : oid_for_evict) { + evict_completions[i] = do_async_evict(oid); + i++; + } + for (auto &completion : evict_completions) { + completion->wait_for_complete(); + } + cout << "done iteration" << std::endl; +} + +AioCompRef SampleDedupWorkerThread::do_async_evict(string oid) +{ + Rados rados; + ObjectReadOperation op_tier; + AioCompRef completion(rados.aio_create_completion()); + op_tier.tier_evict(); + io_ctx.aio_operate( + oid, + completion.get(), + &op_tier, + NULL); + return completion; +} + +std::tuple, ObjectCursor> SampleDedupWorkerThread::get_objects( + ObjectCursor current, ObjectCursor end, size_t max_object_count) +{ + std::vector objects; + ObjectCursor next; + int ret = io_ctx.object_list( + current, + end, + max_object_count, + {}, + &objects, + &next); + if (ret < 0 ) { + cerr << "error object_list" << std::endl; + objects.clear(); + } + + return std::make_tuple(objects, next); +} + +std::vector SampleDedupWorkerThread::sample_object(size_t count) +{ + std::vector indexes(count); + for (size_t i = 0 ; i < count ; i++) { + indexes[i] = i; + } + default_random_engine generator; + shuffle(indexes.begin(), indexes.end(), generator); + size_t sampling_count = static_cast(count) * + sample_dedup_global.sampling_ratio; + indexes.resize(sampling_count); + + return indexes; +} + +void SampleDedupWorkerThread::try_dedup_and_accumulate_result(ObjectItem &object) +{ + bufferlist data = read_object(object); + if (data.length() == 0) { + cerr << __func__ << " skip object " << object.oid + << " read returned size 0" << std::endl; + return; + } + auto chunks = do_cdc(object, data); + size_t chunk_total_amount = 0; + + // First, check total size of created chunks + for (auto &chunk : chunks) { + auto &chunk_data = std::get<0>(chunk); + chunk_total_amount += chunk_data.length(); + } + if (chunk_total_amount != data.length()) { + cerr << __func__ << " sum of chunked length(" << chunk_total_amount + << ") is different from object data length(" << data.length() << ")" + << std::endl; + return; + } + + size_t duplicated_size = 0; + list redundant_chunks; + for (auto &chunk : chunks) { + auto &chunk_data = std::get<0>(chunk); + std::string fingerprint = generate_fingerprint(chunk_data); + std::pair chunk_boundary = std::get<1>(chunk); + chunk_t chunk_info = { + .oid = object.oid, + .start = chunk_boundary.first, + .size = chunk_boundary.second, + .fingerprint = fingerprint, + .data = chunk_data + }; + + if (sample_dedup_global.fp_store.find(fingerprint)) { + duplicated_size += chunk_data.length(); + } + if (sample_dedup_global.fp_store.add(chunk_info)) { + redundant_chunks.push_back(chunk_info); + } + } + + size_t object_size = data.length(); + + // perform chunk-dedup + for (auto &p : redundant_chunks) { + do_chunk_dedup(p); + } + total_duplicated_size += duplicated_size; + total_object_size += object_size; +} + +bufferlist SampleDedupWorkerThread::read_object(ObjectItem &object) +{ + bufferlist whole_data; + size_t offset = 0; + int ret = -1; + while (ret != 0) { + bufferlist partial_data; + ret = io_ctx.read(object.oid, partial_data, default_op_size, offset); + if (ret < 0) { + cerr << "read object error " << object.oid << " offset " << offset + << " size " << default_op_size << " error(" << cpp_strerror(ret) + << std::endl; + 
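+      // io_ctx.read() returns the number of bytes read; the enclosing loop
+      // keeps advancing 'offset' until a read returns 0 (end of object).
+      // A minimal sketch of the same full-object read pattern:
+      //
+      //   bufferlist whole, part;
+      //   uint64_t off = 0;
+      //   int r;
+      //   while ((r = ctx.read(oid, part, op_size, off)) > 0) {
+      //     off += r;
+      //     whole.claim_append(part);  // moves buffers, leaves 'part' empty
+      //   }
+      //   // r < 0 means a read failed part-way; the caller then gets an
+      //   // empty bufferlist instead (see below).
+      //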
+      bufferlist empty_buf;
+      return empty_buf;
+    }
+    offset += ret;
+    whole_data.claim_append(partial_data);
+  }
+  return whole_data;
+}
+
+std::vector<std::tuple<bufferlist, std::pair<uint64_t, uint64_t>>> SampleDedupWorkerThread::do_cdc(
+  ObjectItem &object,
+  bufferlist &data)
+{
+  std::vector<std::tuple<bufferlist, std::pair<uint64_t, uint64_t>>> ret;
+
+  unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size) - 1);
+  vector<pair<uint64_t, uint64_t>> chunks;
+  cdc->calc_chunks(data, &chunks);
+  for (auto &p : chunks) {
+    bufferlist chunk;
+    chunk.substr_of(data, p.first, p.second);
+    ret.push_back(make_tuple(chunk, p));
+  }
+
+  return ret;
+}
+
+std::string SampleDedupWorkerThread::generate_fingerprint(bufferlist chunk_data)
+{
+  string ret;
+
+  switch (fp_type) {
+    case pg_pool_t::TYPE_FINGERPRINT_SHA1:
+      ret = crypto::digest<crypto::SHA1>(chunk_data).to_str();
+      break;
+
+    case pg_pool_t::TYPE_FINGERPRINT_SHA256:
+      ret = crypto::digest<crypto::SHA256>(chunk_data).to_str();
+      break;
+
+    case pg_pool_t::TYPE_FINGERPRINT_SHA512:
+      ret = crypto::digest<crypto::SHA512>(chunk_data).to_str();
+      break;
+
+    default:
+      ceph_assert(0 == "Invalid fp type");
+      break;
+  }
+  return ret;
+}
+
+int SampleDedupWorkerThread::do_chunk_dedup(chunk_t &chunk)
+{
+  uint64_t size;
+  time_t mtime;
+
+  // write the chunk into the chunk pool (keyed by its fingerprint) only if
+  // it is not already there
+  int ret = chunk_io_ctx.stat(chunk.fingerprint, &size, &mtime);
+
+  if (ret == -ENOENT) {
+    bufferlist bl;
+    bl.append(chunk.data);
+    ObjectWriteOperation wop;
+    wop.write_full(bl);
+    chunk_io_ctx.operate(chunk.fingerprint, &wop);
+  } else {
+    ceph_assert(ret == 0);
+  }
+
+  // remap the source range onto the chunk object and take a reference
+  ObjectReadOperation op;
+  op.set_chunk(
+    chunk.start,
+    chunk.size,
+    chunk_io_ctx,
+    chunk.fingerprint,
+    0,
+    CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+  ret = io_ctx.operate(chunk.oid, &op, nullptr);
+  oid_for_evict.insert(chunk.oid);
+  return ret;
+}
+
+void ChunkScrub::print_status(Formatter *f, ostream &out)
+{
+  if (f) {
+    f->open_array_section("chunk_scrub");
+    f->dump_string("PID", stringify(get_pid()));
+    f->open_object_section("Status");
+    f->dump_string("Total object", stringify(total_objects));
+    f->dump_string("Examined objects", stringify(examined_objects));
+    f->dump_string("damaged objects", stringify(damaged_objects));
+    f->close_section();
+    f->flush(out);
+    cout << std::endl;
+  }
+}
+
+string get_opts_pool_name(const po::variables_map &opts) {
+  if (opts.count("pool")) {
+    return opts["pool"].as<string>();
+  }
+  cerr << "must specify pool name" << std::endl;
+  exit(1);
+}
+
+string get_opts_chunk_algo(const po::variables_map &opts) {
+  if (opts.count("chunk-algorithm")) {
+    string chunk_algo = opts["chunk-algorithm"].as<string>();
+    if (!CDC::create(chunk_algo, 12)) {
+      cerr << "unrecognized chunk-algorithm " << chunk_algo << std::endl;
+      exit(1);
+    }
+    return chunk_algo;
+  }
+  cerr << "must specify chunk-algorithm" << std::endl;
+  exit(1);
+}
+
+string get_opts_fp_algo(const po::variables_map &opts) {
+  if (opts.count("fingerprint-algorithm")) {
+    string fp_algo = opts["fingerprint-algorithm"].as<string>();
+    if (fp_algo != "sha1"
+        && fp_algo != "sha256" && fp_algo != "sha512") {
+      cerr << "unrecognized fingerprint-algorithm " << fp_algo << std::endl;
+      exit(1);
+    }
+    return fp_algo;
+  }
+  cout << "SHA1 is set as fingerprint algorithm by default" << std::endl;
+  return string("sha1");
+}
+
+string get_opts_op_name(const po::variables_map &opts) {
+  if (opts.count("op")) {
+    return opts["op"].as<string>();
+  } else {
+    cerr << "must specify op" << std::endl;
+    exit(1);
+  }
+}
+
+string get_opts_chunk_pool(const po::variables_map &opts) {
+  if (opts.count("chunk-pool")) {
+    return opts["chunk-pool"].as<string>();
+  } else {
+    cerr << "must specify --chunk-pool" << std::endl;
+    exit(1);
+  }
+}
+
+string get_opts_object_name(const po::variables_map &opts) {
+  if (opts.count("object")) {
+    return opts["object"].as<string>();
+  } else {
+    cerr << "must specify object" << std::endl;
+    exit(1);
+  }
+}
+
+int get_opts_max_thread(const po::variables_map &opts) {
+  if (opts.count("max-thread")) {
+    return opts["max-thread"].as<int>();
+  } else {
+    cout << "2 is set as the number of threads by default" << std::endl;
+    return 2;
+  }
+}
+
+int get_opts_report_period(const po::variables_map &opts) {
+  if (opts.count("report-period")) {
+    return opts["report-period"].as<int>();
+  } else {
+    cout << "10 seconds is set as report period by default" << std::endl;
+    return 10;
+  }
+}
+
+int estimate_dedup_ratio(const po::variables_map &opts)
+{
+  Rados rados;
+  IoCtx io_ctx;
+  std::string chunk_algo = "fastcdc";
+  string fp_algo = "sha1";
+  string pool_name;
+  uint64_t chunk_size = 8192;
+  uint64_t min_chunk_size = 8192;
+  uint64_t max_chunk_size = 4*1024*1024;
+  unsigned max_thread = default_max_thread;
+  uint32_t report_period = default_report_period;
+  uint64_t max_read_size = default_op_size;
+  uint64_t max_seconds = 0;
+  int ret;
+  std::map<std::string, std::string>::const_iterator i;  // unused
+  bool debug = false;
+  ObjectCursor begin;
+  ObjectCursor end;
+  librados::pool_stat_t s;
+  list<string> pool_names;
+  map<string, librados::pool_stat_t> stats;
+
+  pool_name = get_opts_pool_name(opts);
+  if (opts.count("chunk-algorithm")) {
+    chunk_algo = opts["chunk-algorithm"].as<string>();
+    if (!CDC::create(chunk_algo, 12)) {
+      cerr << "unrecognized chunk-algorithm " << chunk_algo << std::endl;
+      exit(1);
+    }
+  } else {
+    cerr << "must specify chunk-algorithm" << std::endl;
+    exit(1);
+  }
+  fp_algo = get_opts_fp_algo(opts);
+  if (opts.count("chunk-size")) {
+    chunk_size = opts["chunk-size"].as<int>();
+  } else {
+    cout << "8192 is set as chunk size by default" << std::endl;
+  }
+  // min/max bound the power-of-two sweep of chunkers set up below when no
+  // fixed --chunk-size is in effect
+  if (opts.count("min-chunk-size")) {
+    min_chunk_size = opts["min-chunk-size"].as<int>();
+  } else {
+    cout << "8192 is set as min chunk size by default" << std::endl;
+  }
+  if (opts.count("max-chunk-size")) {
+    max_chunk_size = opts["max-chunk-size"].as<int>();
+  } else {
+    cout << "4MB is set as max chunk size by default" << std::endl;
+  }
+  max_thread = get_opts_max_thread(opts);
+  report_period = get_opts_report_period(opts);
+  if (opts.count("max-seconds")) {
+    max_seconds = opts["max-seconds"].as<int>();
+  } else {
+    cout << "max seconds is not set" << std::endl;
+  }
+  if (opts.count("max-read-size")) {
+    max_read_size = opts["max-read-size"].as<int>();
+  } else {
+    cout << default_op_size << " is set as max-read-size by default" << std::endl;
+  }
+  if (opts.count("debug")) {
+    debug = true;
+  }
+  boost::optional<pg_t> pgid(opts.count("pgid"), pg_t());
+
+  ret = rados.init_with_context(g_ceph_context);
+  if (ret < 0) {
+    cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl;
+    goto out;
+  }
+  ret = rados.connect();
+  if (ret) {
+    cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl;
+    ret = -1;
+    goto out;
+  }
+  if (pool_name.empty()) {
+    cerr << "--create-pool requested but pool_name was not specified!"
<< std::endl; + exit(1); + } + ret = rados.ioctx_create(pool_name.c_str(), io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + + // set up chunkers + if (chunk_size) { + dedup_estimates.emplace(std::piecewise_construct, + std::forward_as_tuple(chunk_size), + std::forward_as_tuple(chunk_algo, cbits(chunk_size)-1)); + } else { + for (size_t cs = min_chunk_size; cs <= max_chunk_size; cs *= 2) { + dedup_estimates.emplace(std::piecewise_construct, + std::forward_as_tuple(cs), + std::forward_as_tuple(chunk_algo, cbits(cs)-1)); + } + } + + glock.lock(); + begin = io_ctx.object_list_begin(); + end = io_ctx.object_list_end(); + pool_names.push_back(pool_name); + ret = rados.get_pool_stats(pool_names, stats); + if (ret < 0) { + cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl; + glock.unlock(); + return ret; + } + if (stats.find(pool_name) == stats.end()) { + cerr << "stats can not find pool name: " << pool_name << std::endl; + glock.unlock(); + return ret; + } + s = stats[pool_name]; + + for (unsigned i = 0; i < max_thread; i++) { + std::unique_ptr ptr ( + new EstimateDedupRatio(io_ctx, i, max_thread, begin, end, + chunk_algo, fp_algo, chunk_size, + report_period, s.num_objects, max_read_size, + max_seconds)); + ptr->create("estimate_thread"); + ptr->set_debug(debug); + estimate_threads.push_back(move(ptr)); + } + glock.unlock(); + + for (auto &p : estimate_threads) { + p->join(); + } + + print_dedup_estimate(cout, chunk_algo); + + out: + return (ret < 0) ? 1 : 0; +} + +static void print_chunk_scrub() +{ + uint64_t total_objects = 0; + uint64_t examined_objects = 0; + int damaged_objects = 0; + + for (auto &et : estimate_threads) { + if (!total_objects) { + total_objects = et->get_total_objects(); + } + examined_objects += et->get_examined_objects(); + ChunkScrub *ptr = static_cast(et.get()); + damaged_objects += ptr->get_damaged_objects(); + } + + cout << " Total object : " << total_objects << std::endl; + cout << " Examined object : " << examined_objects << std::endl; + cout << " Damaged object : " << damaged_objects << std::endl; +} + +int chunk_scrub_common(const po::variables_map &opts) +{ + Rados rados; + IoCtx io_ctx, chunk_io_ctx; + std::string object_name, target_object_name; + string chunk_pool_name, op_name; + int ret; + unsigned max_thread = default_max_thread; + std::map::const_iterator i; + uint32_t report_period = default_report_period; + ObjectCursor begin; + ObjectCursor end; + librados::pool_stat_t s; + list pool_names; + map stats; + + op_name = get_opts_op_name(opts); + chunk_pool_name = get_opts_chunk_pool(opts); + boost::optional pgid(opts.count("pgid"), pg_t()); + + ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + goto out; + } + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + ret = -1; + goto out; + } + ret = rados.ioctx_create(chunk_pool_name.c_str(), chunk_io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << chunk_pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + + if (op_name == "chunk-get-ref" || + op_name == "chunk-put-ref" || + op_name == "chunk-repair") { + string target_object_name; + uint64_t pool_id; + object_name = get_opts_object_name(opts); + if (opts.count("target-ref")) { + target_object_name = opts["target-ref"].as(); + } else { + cerr << "must specify target ref" 
<< std::endl; + exit(1); + } + if (opts.count("target-ref-pool-id")) { + pool_id = opts["target-ref-pool-id"].as(); + } else { + cerr << "must specify target-ref-pool-id" << std::endl; + exit(1); + } + + uint32_t hash; + ret = chunk_io_ctx.get_object_hash_position2(object_name, &hash); + if (ret < 0) { + return ret; + } + hobject_t oid(sobject_t(target_object_name, CEPH_NOSNAP), "", hash, pool_id, ""); + + auto run_op = [] (ObjectWriteOperation& op, hobject_t& oid, + string& object_name, IoCtx& chunk_io_ctx) -> int { + int ret = chunk_io_ctx.operate(object_name, &op); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + } + return ret; + }; + + ObjectWriteOperation op; + if (op_name == "chunk-get-ref") { + cls_cas_chunk_get_ref(op, oid); + ret = run_op(op, oid, object_name, chunk_io_ctx); + } else if (op_name == "chunk-put-ref") { + cls_cas_chunk_put_ref(op, oid); + ret = run_op(op, oid, object_name, chunk_io_ctx); + } else if (op_name == "chunk-repair") { + ret = rados.ioctx_create2(pool_id, io_ctx); + if (ret < 0) { + cerr << oid << " ref " << pool_id + << ": referencing pool does not exist" << std::endl; + return ret; + } + int chunk_ref = -1, base_ref = -1; + // read object on chunk pool to know how many reference the object has + bufferlist t; + ret = chunk_io_ctx.getxattr(object_name, CHUNK_REFCOUNT_ATTR, t); + if (ret < 0) { + return ret; + } + chunk_refs_t refs; + auto p = t.cbegin(); + decode(refs, p); + if (refs.get_type() != chunk_refs_t::TYPE_BY_OBJECT) { + cerr << " does not supported chunk type " << std::endl; + return -1; + } + chunk_ref = + static_cast(refs.r.get())->by_object.count(oid); + if (chunk_ref < 0) { + cerr << object_name << " has no reference of " << target_object_name + << std::endl; + return chunk_ref; + } + cout << object_name << " has " << chunk_ref << " references for " + << target_object_name << std::endl; + + // read object on base pool to know the number of chunk object's references + base_ref = cls_cas_references_chunk(io_ctx, target_object_name, object_name); + if (base_ref < 0) { + if (base_ref == -ENOENT || base_ref == -ENOLINK) { + base_ref = 0; + } else { + return base_ref; + } + } + cout << target_object_name << " has " << base_ref << " references for " + << object_name << std::endl; + if (chunk_ref != base_ref) { + if (base_ref > chunk_ref) { + cerr << "error : " << target_object_name << "'s ref. < " << object_name + << "' ref. 
" << std::endl; + return -EINVAL; + } + cout << " fix dangling reference from " << chunk_ref << " to " << base_ref + << std::endl; + while (base_ref != chunk_ref) { + ObjectWriteOperation op; + cls_cas_chunk_put_ref(op, oid); + chunk_ref--; + ret = run_op(op, oid, object_name, chunk_io_ctx); + if (ret < 0) { + return ret; + } + } + } + } + return ret; + + } else if (op_name == "dump-chunk-refs") { + object_name = get_opts_object_name(opts); + bufferlist t; + ret = chunk_io_ctx.getxattr(object_name, CHUNK_REFCOUNT_ATTR, t); + if (ret < 0) { + return ret; + } + chunk_refs_t refs; + auto p = t.cbegin(); + decode(refs, p); + auto f = Formatter::create("json-pretty"); + f->dump_object("refs", refs); + f->flush(cout); + return 0; + } + + max_thread = get_opts_max_thread(opts); + report_period = get_opts_report_period(opts); + glock.lock(); + begin = chunk_io_ctx.object_list_begin(); + end = chunk_io_ctx.object_list_end(); + pool_names.push_back(chunk_pool_name); + ret = rados.get_pool_stats(pool_names, stats); + if (ret < 0) { + cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl; + glock.unlock(); + return ret; + } + if (stats.find(chunk_pool_name) == stats.end()) { + cerr << "stats can not find pool name: " << chunk_pool_name << std::endl; + glock.unlock(); + return ret; + } + s = stats[chunk_pool_name]; + + for (unsigned i = 0; i < max_thread; i++) { + std::unique_ptr ptr ( + new ChunkScrub(io_ctx, i, max_thread, begin, end, chunk_io_ctx, + report_period, s.num_objects)); + ptr->create("estimate_thread"); + estimate_threads.push_back(move(ptr)); + } + glock.unlock(); + + for (auto &p : estimate_threads) { + cout << "join " << std::endl; + p->join(); + cout << "joined " << std::endl; + } + + print_chunk_scrub(); + +out: + return (ret < 0) ? 
1 : 0; +} + +string make_pool_str(string pool, string var, string val) +{ + return string("{\"prefix\": \"osd pool set\",\"pool\":\"") + pool + + string("\",\"var\": \"") + var + string("\",\"val\": \"") + + val + string("\"}"); +} + +string make_pool_str(string pool, string var, int val) +{ + return make_pool_str(pool, var, stringify(val)); +} + +int make_dedup_object(const po::variables_map &opts) +{ + Rados rados; + IoCtx io_ctx, chunk_io_ctx; + std::string object_name, chunk_pool_name, op_name, pool_name, fp_algo; + int ret; + std::map::const_iterator i; + + op_name = get_opts_op_name(opts); + pool_name = get_opts_pool_name(opts); + object_name = get_opts_object_name(opts); + chunk_pool_name = get_opts_chunk_pool(opts); + boost::optional pgid(opts.count("pgid"), pg_t()); + + ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + goto out; + } + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + ret = -1; + goto out; + } + ret = rados.ioctx_create(pool_name.c_str(), io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << chunk_pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = rados.ioctx_create(chunk_pool_name.c_str(), chunk_io_ctx); + if (ret < 0) { + cerr << "error opening pool " + << chunk_pool_name << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + fp_algo = get_opts_fp_algo(opts); + + if (op_name == "chunk-dedup") { + uint64_t offset, length; + string chunk_object; + if (opts.count("source-off")) { + offset = opts["source-off"].as(); + } else { + cerr << "must specify --source-off" << std::endl; + exit(1); + } + if (opts.count("source-length")) { + length = opts["source-length"].as(); + } else { + cerr << "must specify --source-length" << std::endl; + exit(1); + } + // 1. make a copy from manifest object to chunk object + bufferlist bl; + ret = io_ctx.read(object_name, bl, length, offset); + if (ret < 0) { + cerr << " reading object in base pool fails : " << cpp_strerror(ret) << std::endl; + goto out; + } + chunk_object = [&fp_algo, &bl]() -> string { + if (fp_algo == "sha1") { + return ceph::crypto::digest(bl).to_str(); + } else if (fp_algo == "sha256") { + return ceph::crypto::digest(bl).to_str(); + } else if (fp_algo == "sha512") { + return ceph::crypto::digest(bl).to_str(); + } else { + assert(0 == "unrecognized fingerprint type"); + return {}; + } + }(); + ret = chunk_io_ctx.write(chunk_object, bl, length, offset); + if (ret < 0) { + cerr << " writing object in chunk pool fails : " << cpp_strerror(ret) << std::endl; + goto out; + } + // 2. 
call set_chunk + ObjectReadOperation op; + op.set_chunk(offset, length, chunk_io_ctx, chunk_object, 0, + CEPH_OSD_OP_FLAG_WITH_REFERENCE); + ret = io_ctx.operate(object_name, &op, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + goto out; + } + } else if (op_name == "object-dedup") { + unsigned chunk_size = 0; + bool snap = false; + if (opts.count("dedup-cdc-chunk-size")) { + chunk_size = opts["dedup-cdc-chunk-size"].as(); + } else { + cerr << "must specify --dedup-cdc-chunk-size" << std::endl; + exit(1); + } + if (opts.count("snap")) { + snap = true; + } + + bufferlist inbl; + ret = rados.mon_command( + make_pool_str(pool_name, "fingerprint_algorithm", fp_algo), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rados.mon_command( + make_pool_str(pool_name, "dedup_tier", chunk_pool_name), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rados.mon_command( + make_pool_str(pool_name, "dedup_chunk_algorithm", "fastcdc"), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rados.mon_command( + make_pool_str(pool_name, "dedup_cdc_chunk_size", chunk_size), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + + auto create_new_deduped_object = + [&io_ctx](string object_name) -> int { + + // tier-flush to perform deduplication + ObjectReadOperation flush_op; + flush_op.tier_flush(); + int ret = io_ctx.operate(object_name, &flush_op, NULL); + if (ret < 0) { + cerr << " tier_flush fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + // tier-evict + ObjectReadOperation evict_op; + evict_op.tier_evict(); + ret = io_ctx.operate(object_name, &evict_op, NULL); + if (ret < 0) { + cerr << " tier_evict fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + return ret; + }; + + if (snap) { + io_ctx.snap_set_read(librados::SNAP_DIR); + snap_set_t snap_set; + int snap_ret; + ObjectReadOperation op; + op.list_snaps(&snap_set, &snap_ret); + io_ctx.operate(object_name, &op, NULL); + + for (vector::const_iterator r = snap_set.clones.begin(); + r != snap_set.clones.end(); + ++r) { + io_ctx.snap_set_read(r->cloneid); + ret = create_new_deduped_object(object_name); + if (ret < 0) { + goto out; + } + } + } else { + ret = create_new_deduped_object(object_name); + } + } + +out: + return (ret < 0) ? 
1 : 0; +} + +int make_crawling_daemon(const po::variables_map &opts) +{ + string base_pool_name = get_opts_pool_name(opts); + string chunk_pool_name = get_opts_chunk_pool(opts); + unsigned max_thread = get_opts_max_thread(opts); + + bool loop = false; + if (opts.count("loop")) { + loop = true; + } + + int sampling_ratio = -1; + if (opts.count("sampling-ratio")) { + sampling_ratio = opts["sampling-ratio"].as(); + } + size_t chunk_size = 8192; + if (opts.count("chunk-size")) { + chunk_size = opts["chunk-size"].as(); + } else { + cout << "8192 is set as chunk size by default" << std::endl; + } + + uint32_t chunk_dedup_threshold = -1; + if (opts.count("chunk-dedup-threshold")) { + chunk_dedup_threshold = opts["chunk-dedup-threshold"].as(); + } + + std::string chunk_algo = get_opts_chunk_algo(opts); + + Rados rados; + int ret = rados.init_with_context(g_ceph_context); + if (ret < 0) { + cerr << "couldn't initialize rados: " << cpp_strerror(ret) << std::endl; + return -EINVAL; + } + ret = rados.connect(); + if (ret) { + cerr << "couldn't connect to cluster: " << cpp_strerror(ret) << std::endl; + return -EINVAL; + } + int wakeup_period = 100; + if (opts.count("wakeup-period")) { + wakeup_period = opts["wakeup-period"].as(); + } else { + cout << "100 second is set as wakeup period by default" << std::endl; + } + + std::string fp_algo = get_opts_fp_algo(opts); + + list pool_names; + IoCtx io_ctx, chunk_io_ctx; + pool_names.push_back(base_pool_name); + ret = rados.ioctx_create(base_pool_name.c_str(), io_ctx); + if (ret < 0) { + cerr << "error opening base pool " + << base_pool_name << ": " + << cpp_strerror(ret) << std::endl; + return -EINVAL; + } + + ret = rados.ioctx_create(chunk_pool_name.c_str(), chunk_io_ctx); + if (ret < 0) { + cerr << "error opening chunk pool " + << chunk_pool_name << ": " + << cpp_strerror(ret) << std::endl; + return -EINVAL; + } + bufferlist inbl; + ret = rados.mon_command( + make_pool_str(base_pool_name, "fingerprint_algorithm", fp_algo), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rados.mon_command( + make_pool_str(base_pool_name, "dedup_chunk_algorithm", "fastcdc"), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rados.mon_command( + make_pool_str(base_pool_name, "dedup_cdc_chunk_size", chunk_size), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + ret = rados.mon_command( + make_pool_str(base_pool_name, "dedup_tier", chunk_pool_name), + inbl, NULL, NULL); + if (ret < 0) { + cerr << " operate fail : " << cpp_strerror(ret) << std::endl; + return ret; + } + + cout << "SampleRatio : " << sampling_ratio << std::endl + << "Chunk Dedup Threshold : " << chunk_dedup_threshold << std::endl + << "Chunk Size : " << chunk_size << std::endl + << std::endl; + + while (true) { + lock_guard lock(glock); + ObjectCursor begin = io_ctx.object_list_begin(); + ObjectCursor end = io_ctx.object_list_end(); + map stats; + ret = rados.get_pool_stats(pool_names, stats); + if (ret < 0) { + cerr << "error fetching pool stats: " << cpp_strerror(ret) << std::endl; + return -EINVAL; + } + if (stats.find(base_pool_name) == stats.end()) { + cerr << "stats can not find pool name: " << base_pool_name << std::endl; + return -EINVAL; + } + + SampleDedupWorkerThread::SampleDedupGlobal sample_dedup_global( + chunk_dedup_threshold, sampling_ratio); + + std::list threads; + for 
(unsigned i = 0; i < max_thread; i++) { + cout << " add thread.. " << std::endl; + ObjectCursor shard_start; + ObjectCursor shard_end; + io_ctx.object_list_slice( + begin, + end, + i, + max_thread, + &shard_start, + &shard_end); + + threads.emplace_back( + io_ctx, + chunk_io_ctx, + shard_start, + shard_end, + chunk_size, + fp_algo, + chunk_algo, + sample_dedup_global); + threads.back().create("sample_dedup"); + } + + for (auto &p : threads) { + p.join(); + } + if (loop) { + sleep(wakeup_period); + } else { + break; + } + } + + return 0; +} + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + + po::variables_map opts; + po::positional_options_description p; + p.add("command", 1); + po::options_description desc = make_usage(); + try { + po::parsed_options parsed = + po::command_line_parser(argc, argv).options(desc).positional(p).allow_unregistered().run(); + po::store(parsed, opts); + po::notify(opts); + } catch(po::error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + if (opts.count("help") || opts.count("h")) { + cout<< desc << std::endl; + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_DAEMON, + CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS); + + Preforker forker; + if (global_init_prefork(g_ceph_context) >= 0) { + std::string err; + int r = forker.prefork(err); + if (r < 0) { + cerr << err << std::endl; + return r; + } + if (forker.is_parent()) { + g_ceph_context->_log->start(); + if (forker.parent_wait(err) != 0) { + return -ENXIO; + } + return 0; + } + global_init_postfork_start(g_ceph_context); + } + common_init_finish(g_ceph_context); + if (opts.count("daemon")) { + global_init_postfork_finish(g_ceph_context); + forker.daemonize(); + } + init_async_signal_handler(); + register_async_signal_handler_oneshot(SIGINT, handle_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_signal); + + string op_name = get_opts_op_name(opts); + int ret = 0; + if (op_name == "estimate") { + ret = estimate_dedup_ratio(opts); + } else if (op_name == "chunk-scrub" || + op_name == "chunk-get-ref" || + op_name == "chunk-put-ref" || + op_name == "chunk-repair" || + op_name == "dump-chunk-refs") { + ret = chunk_scrub_common(opts); + } else if (op_name == "chunk-dedup" || + op_name == "object-dedup") { + /* + * chunk-dedup: + * using a chunk generated by given source, + * create a new object in the chunk pool or increase the reference + * if the object exists + * + * object-dedup: + * perform deduplication on the entire object, not a chunk. + * + */ + ret = make_dedup_object(opts); + } else if (op_name == "sample-dedup") { + ret = make_crawling_daemon(opts); + } else { + cerr << "unrecognized op " << op_name << std::endl; + exit(1); + } + + unregister_async_signal_handler(SIGINT, handle_signal); + unregister_async_signal_handler(SIGTERM, handle_signal); + shutdown_async_signal_handler(); + + return forker.signal_exit(ret); +} diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc new file mode 100644 index 000000000..61fe00cc9 --- /dev/null +++ b/src/tools/ceph_kvstore_tool.cc @@ -0,0 +1,363 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. 
+* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. +*/ +#include +#include +#include +#include + +#include "common/ceph_argparse.h" +#include "common/config.h" +#include "common/errno.h" +#include "common/strtol.h" +#include "common/url_escape.h" + +#include "global/global_context.h" +#include "global/global_init.h" + +#include "kvstore_tool.h" + +using namespace std; + +void usage(const char *pname) +{ + std::cout << "Usage: " << pname << " command [args...]\n" + << "\n" + << "Commands:\n" + << " list [prefix]\n" + << " list-crc [prefix]\n" + << " dump [prefix]\n" + << " exists [key]\n" + << " get [out ]\n" + << " crc \n" + << " get-size [ ]\n" + << " set [ver |in ]\n" + << " rm \n" + << " rm-prefix \n" + << " store-copy [num-keys-per-tx] [leveldb|rocksdb|...] \n" + << " store-crc \n" + << " compact\n" + << " compact-prefix \n" + << " compact-range \n" + << " destructive-repair (use only as last resort! may corrupt healthy data)\n" + << " stats\n" + << " histogram [prefix]\n" + << std::endl; +} + +int main(int argc, const char *argv[]) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + usage(argv[0]); + exit(0); + } + + map defaults = { + { "debug_rocksdb", "2" } + }; + + auto cct = global_init( + &defaults, args, + CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + ceph_assert((int)args.size() < argc); + for(size_t i=0; i 4) + prefix = url_unescape(argv[4]); + + bool do_crc = (cmd == "list-crc"); + st.list(prefix, do_crc, false); + + } else if (cmd == "dump") { + string prefix; + if (argc > 4) + prefix = url_unescape(argv[4]); + st.list(prefix, false, true); + + } else if (cmd == "exists") { + string key; + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + if (argc > 5) + key = url_unescape(argv[5]); + + bool ret = st.exists(prefix, key); + std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") " + << (ret ? "exists" : "does not exist") + << std::endl; + return (ret ? 
0 : 1); + + } else if (cmd == "get") { + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ")"; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << std::endl; + + if (argc >= 7) { + string subcmd(argv[6]); + if (subcmd != "out") { + std::cerr << "unrecognized subcmd '" << subcmd << "'" + << std::endl; + return 1; + } + if (argc < 8) { + std::cerr << "output path not specified" << std::endl; + return 1; + } + string out(argv[7]); + + if (out.empty()) { + std::cerr << "unspecified out file" << std::endl; + return 1; + } + + int err = bl.write_file(argv[7], 0644); + if (err < 0) { + std::cerr << "error writing value to '" << out << "': " + << cpp_strerror(err) << std::endl; + return 1; + } + } else { + ostringstream os; + bl.hexdump(os); + std::cout << os.str() << std::endl; + } + + } else if (cmd == "crc") { + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << url_escape(prefix) << ", " << url_escape(key) << ") "; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << " crc " << bl.crc32c(0) << std::endl; + + } else if (cmd == "get-size") { + std::cout << "estimated store size: " << st.get_size() << std::endl; + + if (argc < 5) + return 0; + + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + if (!exists) { + std::cerr << "(" << url_escape(prefix) << "," << url_escape(key) + << ") does not exist" << std::endl; + return 1; + } + std::cout << "(" << url_escape(prefix) << "," << url_escape(key) + << ") size " << byte_u_t(bl.length()) << std::endl; + + } else if (cmd == "set") { + if (argc < 8) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + string subcmd(argv[6]); + + bufferlist val; + string errstr; + if (subcmd == "ver") { + version_t v = (version_t) strict_strtoll(argv[7], 10, &errstr); + if (!errstr.empty()) { + std::cerr << "error reading version: " << errstr << std::endl; + return 1; + } + encode(v, val); + } else if (subcmd == "in") { + int ret = val.read_file(argv[7], &errstr); + if (ret < 0 || !errstr.empty()) { + std::cerr << "error reading file: " << errstr << std::endl; + return 1; + } + } else { + std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl; + usage(argv[0]); + return 1; + } + + bool ret = st.set(prefix, key, val); + if (!ret) { + std::cerr << "error setting (" + << url_escape(prefix) << "," << url_escape(key) << ")" << std::endl; + return 1; + } + } else if (cmd == "rm") { + if (argc < 6) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string key(url_unescape(argv[5])); + + bool ret = st.rm(prefix, key); + if (!ret) { + std::cerr << "error removing (" + << url_escape(prefix) << "," << url_escape(key) << ")" + << std::endl; + return 1; + } + } else if (cmd == "rm-prefix") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + + bool ret = st.rm_prefix(prefix); + if (!ret) { + std::cerr << "error removing 
prefix (" + << url_escape(prefix) << ")" + << std::endl; + return 1; + } + } else if (cmd == "store-copy") { + int num_keys_per_tx = 128; // magic number that just feels right. + if (argc < 5) { + usage(argv[0]); + return 1; + } else if (argc > 5) { + string err; + num_keys_per_tx = strict_strtol(argv[5], 10, &err); + if (!err.empty()) { + std::cerr << "invalid num_keys_per_tx: " << err << std::endl; + return 1; + } + } + string other_store_type = argv[1]; + if (argc > 6) { + other_store_type = argv[6]; + } + + int ret = st.copy_store_to(argv[1], argv[4], num_keys_per_tx, other_store_type); + if (ret < 0) { + std::cerr << "error copying store to path '" << argv[4] + << "': " << cpp_strerror(ret) << std::endl; + return 1; + } + + } else if (cmd == "store-crc") { + if (argc < 4) { + usage(argv[0]); + return 1; + } + std::ofstream fs(argv[4]); + uint32_t crc = st.traverse(string(), true, false, &fs); + std::cout << "store at '" << argv[4] << "' crc " << crc << std::endl; + + } else if (cmd == "compact") { + st.compact(); + } else if (cmd == "compact-prefix") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + st.compact_prefix(prefix); + } else if (cmd == "compact-range") { + if (argc < 7) { + usage(argv[0]); + return 1; + } + string prefix(url_unescape(argv[4])); + string start(url_unescape(argv[5])); + string end(url_unescape(argv[6])); + st.compact_range(prefix, start, end); + } else if (cmd == "stats") { + st.print_stats(); + } else if (cmd == "histogram") { + string prefix; + if (argc > 4) + prefix = url_unescape(argv[4]); + st.build_size_histogram(prefix); + } else { + std::cerr << "Unrecognized command: " << cmd << std::endl; + return 1; + } + + return 0; +} diff --git a/src/tools/ceph_monstore_tool.cc b/src/tools/ceph_monstore_tool.cc new file mode 100644 index 000000000..9da7f5f5c --- /dev/null +++ b/src/tools/ceph_monstore_tool.cc @@ -0,0 +1,1319 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. 
+*/ +#include +#include +#include + +#include +#include + +#include "common/Formatter.h" +#include "common/errno.h" + +#include "auth/KeyRing.h" +#include "auth/cephx/CephxKeyServer.h" +#include "global/global_init.h" +#include "include/scope_guard.h" +#include "include/stringify.h" +#include "mgr/mgr_commands.h" +#include "mon/AuthMonitor.h" +#include "mon/MonitorDBStore.h" +#include "mon/Paxos.h" +#include "mon/MonMap.h" +#include "mds/FSMap.h" +#include "mon/MgrMap.h" +#include "osd/OSDMap.h" +#include "crush/CrushCompiler.h" +#include "mon/CreatingPGs.h" + +namespace po = boost::program_options; + +using namespace std; + +class TraceIter { + int fd; + unsigned idx; + MonitorDBStore::TransactionRef t; +public: + explicit TraceIter(string fname) : fd(-1), idx(-1) { + fd = ::open(fname.c_str(), O_RDONLY|O_BINARY); + t.reset(new MonitorDBStore::Transaction); + } + bool valid() { + return fd != -1; + } + MonitorDBStore::TransactionRef cur() { + ceph_assert(valid()); + return t; + } + unsigned num() { return idx; } + void next() { + ++idx; + bufferlist bl; + int r = bl.read_fd(fd, 6); + if (r < 0) { + std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd" + << std::endl; + ::close(fd); + fd = -1; + return; + } else if ((unsigned)r < 6) { + std::cerr << "short read" << std::endl; + ::close(fd); + fd = -1; + return; + } + auto bliter = bl.cbegin(); + uint8_t ver, ver2; + decode(ver, bliter); + decode(ver2, bliter); + uint32_t len; + decode(len, bliter); + r = bl.read_fd(fd, len); + if (r < 0) { + std::cerr << "Got error: " << cpp_strerror(r) << " on read_fd" + << std::endl; + ::close(fd); + fd = -1; + return; + } else if ((unsigned)r < len) { + std::cerr << "short read" << std::endl; + ::close(fd); + fd = -1; + return; + } + bliter = bl.cbegin(); + t.reset(new MonitorDBStore::Transaction); + t->decode(bliter); + } + void init() { + next(); + } + ~TraceIter() { + if (fd != -1) { + ::close(fd); + fd = -1; + } + } +}; + + +int parse_cmd_args( + po::options_description *desc, /// < visible options description + po::options_description *hidden_desc, /// < hidden options description + po::positional_options_description *positional, /// < positional args + vector &cmd_args, /// < arguments to be parsed + po::variables_map *vm /// > post-parsing variable map + ) +{ + // desc_all will aggregate all visible and hidden options for parsing. + // + // From boost's program_options point of view, there is absolutely no + // distinction between 'desc' and 'hidden_desc'. This is a distinction + // that is only useful to us: 'desc' is whatever we are willing to show + // on 'usage()', whereas 'hidden_desc' refers to parameters we wish to + // take advantage of but do not wish to show on 'usage()'. + // + // For example, consider that program_options matches positional arguments + // (specified via 'positional') against the paramenters defined on a + // given 'po::options_description' class. This is performed below, + // supplying both the description and the positional arguments to the + // parser. However, we do not want the parameters that are mapped to + // positional arguments to be shown on usage, as that makes for ugly and + // confusing usage messages. Therefore we dissociate the options' + // description that is to be used as an aid to the user from those options + // that are nothing but useful for internal purposes (i.e., mapping options + // to positional arguments). We still need to aggregate them before parsing + // and that's what 'desc_all' is all about. 
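+  // A minimal sketch of the same visible/hidden split (hypothetical option
+  // names, boost::program_options only):
+  //
+  //   po::options_description visible("Options"), hidden;
+  //   visible.add_options()("verbose,v", "verbose output");
+  //   hidden.add_options()("target", po::value<std::string>(), "");
+  //   po::positional_options_description pos;
+  //   pos.add("target", 1);            // first bare argument -> --target
+  //   po::options_description all;
+  //   all.add(visible).add(hidden);    // parse against everything...
+  //   std::cout << visible;            // ...but only show 'visible' in usage
+  //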
+ // + + ceph_assert(desc != NULL); + + po::options_description desc_all; + desc_all.add(*desc); + if (hidden_desc != NULL) + desc_all.add(*hidden_desc); + + try { + po::command_line_parser parser = po::command_line_parser(cmd_args). + options(desc_all); + + if (positional) { + parser = parser.positional(*positional); + } + + po::parsed_options parsed = parser.run(); + po::store(parsed, *vm); + po::notify(*vm); + } catch (po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + return -EINVAL; + } + return 0; +} + + +/** + * usage: ceph-monstore-tool [options] + * + * commands: + * + * store-copy < --out arg > + * dump-keys + * compact + * getmonmap < --out arg [ --version arg ] > + * getosdmap < --out arg [ --version arg ] > + * dump-paxos <--dump-start VER> <--dump-end VER> + * dump-trace < --trace-file arg > + * replay-trace + * random-gen + * rewrite-crush + * + * wanted syntax: + * + * ceph-monstore-tool PATH CMD [options] + * + * ceph-monstore-tool PATH store-copy + * ceph-monstore-tool PATH dump-keys + * ceph-monstore-tool PATH compact + * ceph-monstore-tool PATH get monmap [VER] + * ceph-monstore-tool PATH get osdmap [VER] + * ceph-monstore-tool PATH dump-paxos STARTVER ENDVER + * + * + */ +void usage(const char *n, po::options_description &d) +{ + std::cerr << + "usage: " << n << " [args|options]\n" + << "\n" + << "Commands:\n" + << " store-copy PATH copies store to PATH\n" + << " compact compacts the store\n" + << " get monmap [-- options] get monmap (version VER if specified)\n" + << " (default: last committed)\n" + << " get osdmap [-- options] get osdmap (version VER if specified)\n" + << " (default: last committed)\n" + << " get mdsmap [-- options] get mdsmap (version VER if specified)\n" + << " (default: last committed)\n" + << " get mgr [-- options] get mgr map (version VER if specified)\n" + << " (default: last committed)\n" + << " get crushmap [-- options] get crushmap (version VER if specified)\n" + << " (default: last committed)\n" + << " show-versions [-- options] show the first&last committed version of map\n" + << " (show-versions -- --help for more info)\n" + << " dump-keys dumps store keys to FILE\n" + << " (default: stdout)\n" + << " dump-paxos [-- options] dump paxos transactions\n" + << " (dump-paxos -- --help for more info)\n" + << " dump-trace FILE [-- options] dump contents of trace file FILE\n" + << " (dump-trace -- --help for more info)\n" + << " replay-trace FILE [-- options] replay trace from FILE\n" + << " (replay-trace -- --help for more info)\n" + << " random-gen [-- options] add randomly generated ops to the store\n" + << " (random-gen -- --help for more info)\n" + << " rewrite-crush [-- options] add a rewrite commit to the store\n" + << " (rewrite-crush -- --help for more info)\n" + << " rebuild rebuild store\n" + << " (rebuild -- --help for more info)\n" + << std::endl; + std::cerr << d << std::endl; + std::cerr + << "\nPlease Note:\n" + << "* Ceph-specific options should be in the format --option-name=VAL\n" + << " (specifically, do not forget the '='!!)\n" + << "* Command-specific options need to be passed after a '--'\n" + << " e.g., 'get monmap -- --version 10 --out /tmp/foo'" + << std::endl; +} + +int update_osdmap(MonitorDBStore& store, version_t ver, bool copy, + std::shared_ptr crush, + MonitorDBStore::Transaction* t) { + const string prefix("osdmap"); + + // full + bufferlist bl; + int r = 0; + r = store.get(prefix, store.combine_strings("full", ver), bl); + if (r) { + std::cerr << "Error getting full map: " << cpp_strerror(r) << 
std::endl; + return r; + } + OSDMap osdmap; + osdmap.decode(bl); + osdmap.crush = crush; + if (copy) { + osdmap.inc_epoch(); + } + bl.clear(); + // be consistent with OSDMonitor::update_from_paxos() + osdmap.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + t->put(prefix, store.combine_strings("full", osdmap.get_epoch()), bl); + + // incremental + OSDMap::Incremental inc; + if (copy) { + inc.epoch = osdmap.get_epoch(); + inc.fsid = osdmap.get_fsid(); + } else { + bl.clear(); + r = store.get(prefix, ver, bl); + if (r) { + std::cerr << "Error getting inc map: " << cpp_strerror(r) << std::endl; + return r; + } + OSDMap::Incremental inc(bl); + if (inc.crush.length()) { + inc.crush.clear(); + crush->encode(inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + } + if (inc.fullmap.length()) { + OSDMap fullmap; + fullmap.decode(inc.fullmap); + fullmap.crush = crush; + inc.fullmap.clear(); + fullmap.encode(inc.fullmap); + } + } + ceph_assert(osdmap.have_crc()); + inc.full_crc = osdmap.get_crc(); + bl.clear(); + // be consistent with OSDMonitor::update_from_paxos() + inc.encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + t->put(prefix, inc.epoch, bl); + return 0; +} + +int rewrite_transaction(MonitorDBStore& store, int version, + const string& crush_file, + MonitorDBStore::Transaction* t) { + const string prefix("osdmap"); + + // calc the known-good epoch + version_t last_committed = store.get(prefix, "last_committed"); + version_t good_version = 0; + if (version <= 0) { + if (last_committed >= (unsigned)-version) { + good_version = last_committed + version; + } else { + std::cerr << "osdmap-version is less than: -" << last_committed << std::endl; + return EINVAL; + } + } else { + good_version = version; + } + if (good_version >= last_committed) { + std::cout << "good epoch is greater or equal to the last committed one: " + << good_version << " >= " << last_committed << std::endl; + return 0; + } + + // load/extract the crush map + int r = 0; + std::shared_ptr crush(new CrushWrapper); + if (crush_file.empty()) { + bufferlist bl; + r = store.get(prefix, store.combine_strings("full", good_version), bl); + if (r) { + std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; + return r; + } + OSDMap osdmap; + osdmap.decode(bl); + crush = osdmap.crush; + } else { + string err; + bufferlist bl; + r = bl.read_file(crush_file.c_str(), &err); + if (r) { + std::cerr << err << ": " << cpp_strerror(r) << std::endl; + return r; + } + auto p = bl.cbegin(); + crush->decode(p); + } + + // prepare a transaction to rewrite the epochs + // (good_version, last_committed] + // with the good crush map. + // XXX: may need to break this into several paxos versions? + ceph_assert(good_version < last_committed); + for (version_t v = good_version + 1; v <= last_committed; v++) { + cout << "rewriting epoch #" << v << "/" << last_committed << std::endl; + r = update_osdmap(store, v, false, crush, t); + if (r) + return r; + } + + // add a new osdmap epoch to store, so monitors will update their current osdmap + // in addition to the ones stored in epochs. + // + // This is needed due to the way the monitor updates from paxos and the + // facilities we are leveraging to push this update to the rest of the + // quorum. + // + // In a nutshell, we are generating a good version of the osdmap, with a + // proper crush, and building a transaction that will replace the bad + // osdmaps with good osdmaps. 
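+  // (Concretely, the pending value is a MonitorDBStore transaction encoded
+  // into a bufferlist and stored under the "paxos" prefix; rewrite_crush()
+  // below does exactly this:
+  //
+  //   bufferlist bl;
+  //   rewrite_txn.encode(bl);
+  //   t->put("paxos", pending_v, bl);            // pending_v = last_committed+1
+  //   t->put("paxos", "pending_v", pending_v);
+  //   t->put("paxos", "pending_pn", pending_pn); // pn above any accepted pn
+  // )
+  //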
But this transaction needs to be applied on + // all nodes, so that the monitors will have good osdmaps to share with + // clients. We thus leverage Paxos, specifically the recovery mechanism, by + // creating a pending value that will be committed once the monitors form an + // initial quorum after being brought back to life. + // + // However, the way the monitor works has the paxos services, including the + // OSDMonitor, updating their state from disk *prior* to the recovery phase + // begins (so they have an up to date state in memory). This means the + // OSDMonitor will see the old, broken map, before the new paxos version is + // applied to disk, and the old version is cached. Even though we have the + // good map now, and we share the good map with clients, we will still be + // working on the old broken map. Instead of mucking around the monitor to + // make this work, we instead opt for adding the same osdmap but with a + // newer version, so that the OSDMonitor picks up on it when it updates from + // paxos after the proposal has been committed. This is not elegant, but + // avoids further unpleasantness that would arise from kludging around the + // current behavior. Also, has the added benefit of making sure the clients + // get an updated version of the map (because last_committed+1 > + // last_committed) :) + // + cout << "adding a new epoch #" << last_committed+1 << std::endl; + r = update_osdmap(store, last_committed++, true, crush, t); + if (r) + return r; + t->put(prefix, store.combine_strings("full", "latest"), last_committed); + t->put(prefix, "last_committed", last_committed); + return 0; +} + +/** + * create a new paxos version which carries a proposal to rewrite all epochs + * of incremental and full map of "osdmap" after a faulty crush map is injected. + * so the leader will trigger a recovery and propagate this fix to its peons, + * after the proposal is accepted, and the transaction in it is applied. all + * monitors will rewrite the bad crush map with the good one, and have a new + * osdmap epoch with the good crush map in it. + */ +int rewrite_crush(const char* progname, + vector& subcmds, + MonitorDBStore& store) { + po::options_description op_desc("Allowed 'rewrite-crush' options"); + int version = -1; + string crush_file; + op_desc.add_options() + ("help,h", "produce this help message") + ("crush", po::value(&crush_file), + ("path to the crush map file " + "(default: will instead extract it from the known-good osdmap)")) + ("good-epoch", po::value(&version), + "known-good epoch of osdmap, if a negative number '-N' is given, the " + "$last_committed-N is used instead (default: -1). 
" + "Please note, -1 is not necessarily a good epoch, because there are " + "good chance that we have more epochs slipped into the monstore after " + "the one where the crushmap is firstly injected.") + ; + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, NULL, NULL, subcmds, &op_vm); + if (r) { + return -r; + } + if (op_vm.count("help")) { + usage(progname, op_desc); + return 0; + } + + MonitorDBStore::Transaction rewrite_txn; + r = rewrite_transaction(store, version, crush_file, &rewrite_txn); + if (r) { + return r; + } + + // store the transaction into store as a proposal + const string prefix("paxos"); + version_t pending_v = store.get(prefix, "last_committed") + 1; + auto t(std::make_shared()); + bufferlist bl; + rewrite_txn.encode(bl); + cout << "adding pending commit " << pending_v + << " " << bl.length() << " bytes" << std::endl; + t->put(prefix, pending_v, bl); + t->put(prefix, "pending_v", pending_v); + // a large enough yet unique proposal number will probably do the trick + version_t pending_pn = (store.get(prefix, "accepted_pn") / 100 + 4) * 100 + 1; + t->put(prefix, "pending_pn", pending_pn); + store.apply_transaction(t); + return 0; +} + +static int update_auth(MonitorDBStore& st, const string& keyring_path) +{ + // import all keyrings stored in the keyring file + KeyRing keyring; + int r = keyring.load(g_ceph_context, keyring_path); + if (r < 0) { + cerr << "unable to load admin keyring: " << keyring_path << std::endl; + return r; + } + + bufferlist bl; + __u8 v = 1; + encode(v, bl); + + for (const auto& k : keyring.get_keys()) { + KeyServerData::Incremental auth_inc; + auth_inc.name = k.first; + auth_inc.auth = k.second; + if (auth_inc.auth.caps.empty()) { + cerr << "no caps granted to: " << auth_inc.name << std::endl; + return -EINVAL; + } + map caps; + std::transform(begin(auth_inc.auth.caps), end(auth_inc.auth.caps), + inserter(caps, end(caps)), + [](auto& cap) { + string c; + auto p = cap.second.cbegin(); + decode(c, p); + return make_pair(cap.first, c); + }); + cout << "adding auth for '" + << auth_inc.name << "': " << auth_inc.auth + << " with caps(" << caps << ")" << std::endl; + auth_inc.op = KeyServerData::AUTH_INC_ADD; + + AuthMonitor::Incremental inc; + inc.inc_type = AuthMonitor::AUTH_DATA; + encode(auth_inc, inc.auth_data); + inc.auth_type = CEPH_AUTH_CEPHX; + inc.encode(bl, CEPH_FEATURES_ALL); + } + + // prime rotating secrets + { + KeyServer ks(g_ceph_context, nullptr); + KeyServerData::Incremental auth_inc; + auth_inc.op = KeyServerData::AUTH_INC_SET_ROTATING; + bool r = ks.prepare_rotating_update(auth_inc.rotating_bl); + ceph_assert(r); + AuthMonitor::Incremental inc; + inc.inc_type = AuthMonitor::AUTH_DATA; + encode(auth_inc, inc.auth_data); + inc.auth_type = CEPH_AUTH_CEPHX; + inc.encode(bl, CEPH_FEATURES_ALL); + } + + const string prefix("auth"); + auto last_committed = st.get(prefix, "last_committed") + 1; + auto t = make_shared(); + t->put(prefix, last_committed, bl); + t->put(prefix, "last_committed", last_committed); + auto first_committed = st.get(prefix, "first_committed"); + if (!first_committed) { + t->put(prefix, "first_committed", last_committed); + } + st.apply_transaction(t); + return 0; +} + +static int update_mkfs(MonitorDBStore& st, + const string& monmap_path, + const vector& mon_ids) +{ + MonMap monmap; + if (!monmap_path.empty()) { + cout << __func__ << " pulling initial monmap from " << monmap_path << std::endl; + bufferlist bl; + string err; + int r = bl.read_file(monmap_path.c_str(), &err); + if (r < 0) { + cerr << 
"failed to read monmap from " << monmap_path << ": " + << cpp_strerror(r) << std::endl; + return r; + } + monmap.decode(bl); + } else { + cout << __func__ << " generating seed initial monmap" << std::endl; + int r = monmap.build_initial(g_ceph_context, true, cerr); + if (r) { + cerr << "no initial monitors" << std::endl; + return -EINVAL; + } + vector new_names; + if (!mon_ids.empty()) { + if (mon_ids.size() != monmap.size()) { + cerr << "Please pass the same number of to name the hosts " + << "listed in 'mon_host'. " + << mon_ids.size() << " mon-id(s) specified, " + << "while you have " << monmap.size() << " mon hosts." << std::endl; + return -EINVAL; + } + new_names = mon_ids; + } else { + for (unsigned rank = 0; rank < monmap.size(); rank++) { + string new_name{"a"}; + new_name[0] += rank; + new_names.push_back(std::move(new_name)); + } + } + for (unsigned rank = 0; rank < monmap.size(); rank++) { + auto name = monmap.get_name(rank); + if (name.compare(0, 7, "noname-") == 0) { + monmap.rename(name, new_names[rank]); + } + } + } + monmap.print(cout); + bufferlist bl; + monmap.encode(bl, CEPH_FEATURES_ALL); + monmap.set_epoch(0); + auto t = make_shared(); + t->put("mkfs", "monmap", bl); + st.apply_transaction(t); + return 0; +} + +static int update_monitor(MonitorDBStore& st) +{ + const string prefix("monitor"); + // a stripped-down Monitor::mkfs() + bufferlist bl; + bl.append(CEPH_MON_ONDISK_MAGIC "\n"); + auto t = make_shared(); + t->put(prefix, "magic", bl); + st.apply_transaction(t); + return 0; +} + +// rebuild +// - creating_pgs +static int update_creating_pgs(MonitorDBStore& st) +{ + bufferlist bl; + auto last_osdmap_epoch = st.get("osdmap", "last_committed"); + int r = st.get("osdmap", st.combine_strings("full", last_osdmap_epoch), bl); + if (r < 0) { + cerr << "unable to load osdmap e" << last_osdmap_epoch << std::endl; + return r; + } + + OSDMap osdmap; + osdmap.decode(bl); + creating_pgs_t creating; + for (auto& i : osdmap.get_pools()) { + creating.created_pools.insert(i.first); + } + creating.last_scan_epoch = last_osdmap_epoch; + + bufferlist newbl; + encode(creating, newbl, CEPH_FEATURES_ALL); + + auto t = make_shared(); + t->put("osd_pg_creating", "creating", newbl); + st.apply_transaction(t); + return 0; +} + +// rebuild +// - mgr +// - mgr_command_desc +static int update_mgrmap(MonitorDBStore& st) +{ + auto t = make_shared(); + + { + MgrMap map; + // mgr expects epoch > 1 + map.epoch++; + auto initial_modules = + get_str_vec(g_ceph_context->_conf.get_val("mgr_initial_modules")); + copy(begin(initial_modules), + end(initial_modules), + inserter(map.modules, end(map.modules))); + bufferlist bl; + map.encode(bl, CEPH_FEATURES_ALL); + t->put("mgr", map.epoch, bl); + t->put("mgr", "last_committed", map.epoch); + } + { + auto mgr_command_descs = mgr_commands; + for (auto& c : mgr_command_descs) { + c.set_flag(MonCommand::FLAG_MGR); + } + bufferlist bl; + encode(mgr_command_descs, bl); + t->put("mgr_command_descs", "", bl); + } + return st.apply_transaction(t); +} + +static int update_paxos(MonitorDBStore& st) +{ + const string prefix("paxos"); + // a large enough version greater than the maximum possible `last_committed` + // that could be replied by the peons when the leader is collecting paxos + // transactions during recovery + constexpr version_t first_committed = 0x42; + constexpr version_t last_committed = first_committed; + for (version_t v = first_committed; v < last_committed + 1; v++) { + auto t = make_shared(); + if (v == first_committed) { + t->put(prefix, 
"first_committed", v); + } + bufferlist proposal; + MonitorDBStore::Transaction empty_txn; + empty_txn.encode(proposal); + t->put(prefix, v, proposal); + t->put(prefix, "last_committed", v); + st.apply_transaction(t); + } + // build a pending paxos proposal from all non-permanent k/v pairs. once the + // proposal is committed, it will gets applied. on the sync provider side, it + // will be a no-op, but on its peers, the paxos commit will help to build up + // the necessary epochs. + bufferlist pending_proposal; + { + MonitorDBStore::Transaction t; + vector prefixes = {"auth", "osdmap", + "mgr", "mgr_command_desc"}; + for (const auto& prefix : prefixes) { + for (auto i = st.get_iterator(prefix); i->valid(); i->next()) { + auto key = i->raw_key(); + auto val = i->value(); + t.put(key.first, key.second, val); + } + } + t.encode(pending_proposal); + } + auto pending_v = last_committed + 1; + auto t = make_shared(); + t->put(prefix, pending_v, pending_proposal); + t->put(prefix, "pending_v", pending_v); + t->put(prefix, "pending_pn", 400); + st.apply_transaction(t); + return 0; +} + +int rebuild_monstore(const char* progname, + vector& subcmds, + MonitorDBStore& st) +{ + po::options_description op_desc("Allowed 'rebuild' options"); + string keyring_path; + string monmap_path; + vector mon_ids; + op_desc.add_options() + ("keyring", po::value(&keyring_path), + "path to the client.admin key") + ("monmap", po::value(&monmap_path), + "path to the initial monmap") + ("mon-ids", po::value>(&mon_ids)->multitoken(), + "mon ids, use 'a', 'b', ... if not specified"); + po::positional_options_description pos_desc; + pos_desc.add("mon-ids", -1); + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, nullptr, &pos_desc, subcmds, &op_vm); + if (r) { + return -r; + } + if (op_vm.count("help")) { + usage(progname, op_desc); + return 0; + } + if (!keyring_path.empty()) + update_auth(st, keyring_path); + if ((r = update_creating_pgs(st))) { + return r; + } + if ((r = update_mgrmap(st))) { + return r; + } + if ((r = update_paxos(st))) { + return r; + } + if ((r = update_mkfs(st, monmap_path, mon_ids))) { + return r; + } + if ((r = update_monitor(st))) { + return r; + } + return 0; +} + +int main(int argc, char **argv) { + int err = 0; + po::options_description desc("Allowed options"); + string store_path, cmd; + vector subcmds; + desc.add_options() + ("help,h", "produce help message") + ; + + /* Dear Future Developer: + * + * for further improvement, should you need to pass specific options to + * a command (e.g., get osdmap VER --hex), you can expand the current + * format by creating additional 'po::option_description' and passing + * 'subcmds' to 'po::command_line_parser', much like what is currently + * done by default. However, beware: in order to differentiate a + * command-specific option from the generic/global options, you will need + * to pass '--' in the command line (so that the first parser, the one + * below, assumes it has reached the end of all options); e.g., + * 'get osdmap VER -- --hex'. Not pretty; far from intuitive; it was as + * far as I got with this library. Improvements on this format will be + * left as an excercise for the reader. 
+   * -Joao
+   */
+  po::options_description positional_desc("Positional argument options");
+  positional_desc.add_options()
+    ("store-path", po::value<string>(&store_path),
+     "path to monitor's store")
+    ("command", po::value<string>(&cmd),
+     "Command")
+    ("subcmd", po::value<vector<string>>(&subcmds),
+     "Command arguments/Sub-Commands")
+    ;
+  po::positional_options_description positional;
+  positional.add("store-path", 1);
+  positional.add("command", 1);
+  positional.add("subcmd", -1);
+
+  po::options_description all_desc("All options");
+  all_desc.add(desc).add(positional_desc);
+
+  vector<string> ceph_option_strings;
+  po::variables_map vm;
+  try {
+    po::parsed_options parsed =
+      po::command_line_parser(argc, argv).
+        options(all_desc).
+        positional(positional).
+        allow_unregistered().run();
+
+    po::store(
+      parsed,
+      vm);
+    po::notify(vm);
+
+    // Specifying po::include_positional would cause our positional arguments
+    // to be collected (thus being part of ceph_option_strings and eventually
+    // passed on to global_init() below).
+    // Instead we specify po::exclude_positional, which has the upside of
+    // completely avoiding this, but the downside of having to specify ceph
+    // options as --VAR=VAL (note the '='); otherwise we will capture the
+    // positional 'VAL' as belonging to us, never being collected.
+    ceph_option_strings = po::collect_unrecognized(parsed.options,
+                                                   po::exclude_positional);
+
+  } catch(po::error &e) {
+    std::cerr << "error: " << e.what() << std::endl;
+    return 1;
+  }
+
+  // parse command structure before calling global_init() and friends.
+
+  if (vm.empty() || vm.count("help") ||
+      store_path.empty() || cmd.empty() ||
+      *cmd.begin() == '-') {
+    usage(argv[0], desc);
+    return 1;
+  }
+
+  vector<const char *> ceph_options;
+  ceph_options.reserve(ceph_option_strings.size());
+  for (vector<string>::iterator i = ceph_option_strings.begin();
+       i != ceph_option_strings.end();
+       ++i) {
+    ceph_options.push_back(i->c_str());
+  }
+
+  auto cct = global_init(
+    NULL, ceph_options, CEPH_ENTITY_TYPE_MON,
+    CODE_ENVIRONMENT_UTILITY,
+    CINIT_FLAG_NO_MON_CONFIG);
+  common_init_finish(g_ceph_context);
+  cct->_conf.apply_changes(nullptr);
+
+  // this is where we'll write *whatever*, on a per-command basis.
+  // not all commands require some place to write their things.
+  MonitorDBStore st(store_path);
+  if (store_path.size()) {
+    stringstream ss;
+    int r = st.open(ss);
+    if (r < 0) {
+      std::cerr << ss.str() << std::endl;
+      return EINVAL;
+    }
+  }
+
+  auto close_store = make_scope_guard([&] {
+    st.close();
+  });
+
+  if (cmd == "dump-keys") {
+    KeyValueDB::WholeSpaceIterator iter = st.get_iterator();
+    while (iter->valid()) {
+      pair<string, string> key(iter->raw_key());
+      cout << key.first << " / " << key.second << std::endl;
+      iter->next();
+    }
+  } else if (cmd == "compact") {
+    st.compact();
+  } else if (cmd == "get") {
+    unsigned v = 0;
+    string outpath;
+    string map_type;
+    // visible options for this command
+    po::options_description op_desc("Allowed 'get' options");
+    op_desc.add_options()
+      ("help,h", "produce this help message")
+      ("out,o", po::value<string>(&outpath),
+       "output file (default: stdout)")
+      ("version,v", po::value<unsigned>(&v),
+       "map version to obtain")
+      ("readable,r", "print the map information in human readable format")
+      ;
+    // this is going to be a positional argument; we don't want to show
+    // it as an option during --help, but we do want to have it captured
+    // when parsing.
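+    // A typical end-to-end invocation of this command (the store path here
+    // is illustrative):
+    //   ceph-monstore-tool /var/lib/ceph/mon/ceph-a get monmap -- --out /tmp/monmap --readable
+    // With no --version given, the last committed monmap is fetched;
+    // --readable decodes it before writing, and --out redirects the output
+    // from stdout to a file.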
+ po::options_description hidden_op_desc("Hidden 'get' options"); + hidden_op_desc.add_options() + ("map-type", po::value(&map_type), + "map-type") + ; + po::positional_options_description op_positional; + op_positional.add("map-type", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional, + subcmds, &op_vm); + if (r < 0) { + return -r; + } + + if (op_vm.count("help") || map_type.empty()) { + usage(argv[0], op_desc); + return 0; + } + + if (v == 0) { + if (map_type == "crushmap") { + v = st.get("osdmap", "last_committed"); + } else { + v = st.get(map_type, "last_committed"); + } + } + + int fd = STDOUT_FILENO; + if (!outpath.empty()){ + fd = ::open(outpath.c_str(), O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0666); + if (fd < 0) { + std::cerr << "error opening output file: " + << cpp_strerror(errno) << std::endl; + return EINVAL; + } + } + + auto close_fd = make_scope_guard([&] { + ::close(fd); + if (r < 0 && fd != STDOUT_FILENO) { + ::remove(outpath.c_str()); + } + }); + + bufferlist bl; + r = 0; + if (map_type == "osdmap") { + r = st.get(map_type, st.combine_strings("full", v), bl); + } else if (map_type == "crushmap") { + bufferlist tmp; + r = st.get("osdmap", st.combine_strings("full", v), tmp); + if (r >= 0) { + OSDMap osdmap; + osdmap.decode(tmp); + osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } + } else { + r = st.get(map_type, v, bl); + } + if (r < 0) { + std::cerr << "Error getting map: " << cpp_strerror(r) << std::endl; + return EINVAL; + } + + if (op_vm.count("readable")) { + stringstream ss; + bufferlist out; + try { + if (map_type == "monmap") { + MonMap monmap; + monmap.decode(bl); + monmap.print(ss); + } else if (map_type == "osdmap") { + OSDMap osdmap; + osdmap.decode(bl); + osdmap.print(cct.get(), ss); + } else if (map_type == "mdsmap") { + FSMap fs_map; + fs_map.decode(bl); + fs_map.print(ss); + } else if (map_type == "mgr") { + MgrMap mgr_map; + auto p = bl.cbegin(); + mgr_map.decode(p); + JSONFormatter f; + f.dump_object("mgrmap", mgr_map); + f.flush(ss); + } else if (map_type == "crushmap") { + CrushWrapper cw; + auto it = bl.cbegin(); + cw.decode(it); + CrushCompiler cc(cw, std::cerr, 0); + cc.decompile(ss); + } else { + std::cerr << "This type of readable map does not exist: " << map_type + << std::endl << "You can only specify[osdmap|monmap|mdsmap" + "|crushmap|mgr]" << std::endl; + } + } catch (const buffer::error &err) { + std::cerr << "Could not decode for human readable output (you may still" + " use non-readable mode). Detail: " << err.what() << std::endl; + } + + out.append(ss); + out.write_fd(fd); + } else { + bl.write_fd(fd); + } + + if (!outpath.empty()) { + std::cout << "wrote " << map_type + << " version " << v << " to " << outpath + << std::endl; + } + } else if (cmd == "show-versions") { + string map_type; //map type:osdmap,monmap... 
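+    // show-versions reports the committed version range the store currently
+    // holds for a given map type, e.g. (store path illustrative):
+    //   ceph-monstore-tool /var/lib/ceph/mon/ceph-a show-versions osdmap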
+ // visible options for this command + po::options_description op_desc("Allowed 'show-versions' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("map-type", po::value(&map_type), "map_type"); + + po::positional_options_description op_positional; + op_positional.add("map-type", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, NULL, &op_positional, + subcmds, &op_vm); + if (r < 0) { + return -r; + } + + if (op_vm.count("help") || map_type.empty()) { + usage(argv[0], op_desc); + return 0; + } + + unsigned int v_first = 0; + unsigned int v_last = 0; + v_first = st.get(map_type, "first_committed"); + v_last = st.get(map_type, "last_committed"); + + std::cout << "first committed:\t" << v_first << "\n" + << "last committed:\t" << v_last << std::endl; + } else if (cmd == "dump-paxos") { + unsigned dstart = 0; + unsigned dstop = ~0; + po::options_description op_desc("Allowed 'dump-paxos' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("start,s", po::value(&dstart), + "starting version (default: 0)") + ("end,e", po::value(&dstop), + "finish version (default: ~0)") + ; + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, NULL, NULL, + subcmds, &op_vm); + if (r < 0) { + return -r; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + return 0; + } + + if (dstart > dstop) { + std::cerr << "error: 'start' version (value: " << dstart << ") " + << " is greater than 'end' version (value: " << dstop << ")" + << std::endl; + return EINVAL; + } + + version_t v = dstart; + for (; v <= dstop; ++v) { + bufferlist bl; + st.get("paxos", v, bl); + if (bl.length() == 0) + break; + cout << "\n--- " << v << " ---" << std::endl; + auto tx(std::make_shared()); + Paxos::decode_append_transaction(tx, bl); + JSONFormatter f(true); + tx->dump(&f); + f.flush(cout); + } + + std::cout << "dumped " << v << " paxos versions" << std::endl; + + } else if (cmd == "dump-trace") { + unsigned dstart = 0; + unsigned dstop = ~0; + string outpath; + + // visible options for this command + po::options_description op_desc("Allowed 'dump-trace' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("start,s", po::value(&dstart), + "starting version (default: 0)") + ("end,e", po::value(&dstop), + "finish version (default: ~0)") + ; + // this is going to be a positional argument; we don't want to show + // it as an option during --help, but we do want to have it captured + // when parsing. 
+ po::options_description hidden_op_desc("Hidden 'dump-trace' options"); + hidden_op_desc.add_options() + ("out,o", po::value(&outpath), + "file to write the dump to") + ; + po::positional_options_description op_positional; + op_positional.add("out", 1); + + po::variables_map op_vm; + int r = parse_cmd_args(&op_desc, &hidden_op_desc, &op_positional, + subcmds, &op_vm); + if (r < 0) { + return -r; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + return 0; + } + + if (outpath.empty()) { + usage(argv[0], op_desc); + return EINVAL; + } + + if (dstart > dstop) { + std::cerr << "error: 'start' version (value: " << dstart << ") " + << " is greater than 'stop' version (value: " << dstop << ")" + << std::endl; + return EINVAL; + } + + TraceIter iter(outpath.c_str()); + iter.init(); + while (true) { + if (!iter.valid()) + break; + if (iter.num() >= dstop) { + break; + } + if (iter.num() >= dstart) { + JSONFormatter f(true); + iter.cur()->dump(&f, false); + f.flush(std::cout); + std::cout << std::endl; + } + iter.next(); + } + std::cerr << "Read up to transaction " << iter.num() << std::endl; + } else if (cmd == "replay-trace") { + string inpath; + unsigned num_replays = 1; + // visible options for this command + po::options_description op_desc("Allowed 'replay-trace' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("num-replays,n", po::value(&num_replays), + "finish version (default: 1)") + ; + // this is going to be a positional argument; we don't want to show + // it as an option during --help, but we do want to have it captured + // when parsing. + po::options_description hidden_op_desc("Hidden 'replay-trace' options"); + hidden_op_desc.add_options() + ("in,i", po::value(&inpath), + "file to write the dump to") + ; + po::positional_options_description op_positional; + op_positional.add("in", 1); + + // op_desc_all will aggregate all visible and hidden options for parsing. + // when we call 'usage()' we just pass 'op_desc', as that's the description + // holding the visible options. + po::options_description op_desc_all; + op_desc_all.add(op_desc).add(hidden_op_desc); + + po::variables_map op_vm; + try { + po::parsed_options op_parsed = po::command_line_parser(subcmds). 
+ options(op_desc_all).positional(op_positional).run(); + po::store(op_parsed, op_vm); + po::notify(op_vm); + } catch (po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + return EINVAL; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + return 0; + } + + if (inpath.empty()) { + usage(argv[0], op_desc); + return EINVAL; + } + + unsigned num = 0; + for (unsigned i = 0; i < num_replays; ++i) { + TraceIter iter(inpath.c_str()); + iter.init(); + while (true) { + if (!iter.valid()) + break; + std::cerr << "Replaying trans num " << num << std::endl; + st.apply_transaction(iter.cur()); + iter.next(); + ++num; + } + std::cerr << "Read up to transaction " << iter.num() << std::endl; + } + } else if (cmd == "random-gen") { + unsigned tsize = 200; + unsigned tvalsize = 1024; + unsigned ntrans = 100; + po::options_description op_desc("Allowed 'random-gen' options"); + op_desc.add_options() + ("help,h", "produce this help message") + ("num-keys,k", po::value(&tsize), + "keys to write in each transaction (default: 200)") + ("size,s", po::value(&tvalsize), + "size (in bytes) of the value to write in each key (default: 1024)") + ("ntrans,n", po::value(&ntrans), + "number of transactions to run (default: 100)") + ; + + po::variables_map op_vm; + try { + po::parsed_options op_parsed = po::command_line_parser(subcmds). + options(op_desc).run(); + po::store(op_parsed, op_vm); + po::notify(op_vm); + } catch (po::error &e) { + std::cerr << "error: " << e.what() << std::endl; + return EINVAL; + } + + if (op_vm.count("help")) { + usage(argv[0], op_desc); + return 0; + } + + unsigned num = 0; + for (unsigned i = 0; i < ntrans; ++i) { + std::cerr << "Applying trans " << i << std::endl; + auto t(std::make_shared()); + string prefix; + prefix.push_back((i%26)+'a'); + for (unsigned j = 0; j < tsize; ++j) { + stringstream os; + os << num; + bufferlist bl; + for (unsigned k = 0; k < tvalsize; ++k) bl.append(rand()); + t->put(prefix, os.str(), bl); + ++num; + } + t->compact_prefix(prefix); + st.apply_transaction(t); + } + } else if (cmd == "store-copy") { + if (subcmds.size() < 1 || subcmds[0].empty()) { + usage(argv[0], desc); + return EINVAL; + } + + string out_path = subcmds[0]; + + MonitorDBStore out_store(out_path); + { + stringstream ss; + int r = out_store.create_and_open(ss); + if (r < 0) { + std::cerr << ss.str() << std::endl; + return err; + } + } + + + KeyValueDB::WholeSpaceIterator it = st.get_iterator(); + uint64_t total_keys = 0; + uint64_t total_size = 0; + uint64_t total_tx = 0; + + do { + uint64_t num_keys = 0; + + auto tx(std::make_shared()); + + while (it->valid() && num_keys < 128) { + pair k = it->raw_key(); + bufferlist v = it->value(); + tx->put(k.first, k.second, v); + + num_keys ++; + total_tx ++; + total_size += v.length(); + + it->next(); + } + + total_keys += num_keys; + + if (!tx->empty()) + out_store.apply_transaction(tx); + + std::cout << "copied " << total_keys << " keys so far (" + << stringify(byte_u_t(total_size)) << ")" << std::endl; + + } while (it->valid()); + out_store.close(); + std::cout << "summary: copied " << total_keys << " keys, using " + << total_tx << " transactions, totalling " + << stringify(byte_u_t(total_size)) << std::endl; + std::cout << "from '" << store_path << "' to '" << out_path << "'" + << std::endl; + } else if (cmd == "rewrite-crush") { + err = rewrite_crush(argv[0], subcmds, st); + } else if (cmd == "rebuild") { + err = rebuild_monstore(argv[0], subcmds, st); + } else { + std::cerr << "Unrecognized command: " << cmd << std::endl; 
+ usage(argv[0], desc); + return err; + } +} diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc new file mode 100644 index 000000000..01c8722b4 --- /dev/null +++ b/src/tools/ceph_objectstore_tool.cc @@ -0,0 +1,4552 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/Formatter.h" +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include "common/url_escape.h" + +#include "global/global_init.h" + +#include "os/ObjectStore.h" +#ifdef HAVE_LIBFUSE +#include "os/FuseStore.h" +#endif + +#include "osd/PGLog.h" +#include "osd/OSD.h" +#include "osd/PG.h" +#include "osd/ECUtil.h" + +#include "json_spirit/json_spirit_value.h" +#include "json_spirit/json_spirit_reader.h" + +#include "rebuild_mondb.h" +#include "ceph_objectstore_tool.h" +#include "include/compat.h" +#include "include/util.h" + +using namespace std; +namespace po = boost::program_options; + +#ifdef INTERNAL_TEST +CompatSet get_test_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); +#ifdef INTERNAL_TEST2 + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); +#endif + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} +#endif + +const ssize_t max_read = 1024 * 1024; +const int fd_none = INT_MIN; +bool outistty; +bool dry_run; + +struct action_on_object_t { + virtual ~action_on_object_t() {} + virtual void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) = 0; +}; + +int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug) +{ + auto ch = store->open_collection(coll); + + unsigned LIST_AT_A_TIME = 100; + ghobject_t next; + while (!next.is_max()) { + vector list; + int r = store->collection_list(ch, + next, + ghobject_t::get_max(), + LIST_AT_A_TIME, + &list, + &next); + if (r < 0) { + cerr << "Error listing collection: " << coll << ", " + << cpp_strerror(r) << std::endl; + return r; + } + for (vector::iterator obj = list.begin(); + obj != list.end(); + ++obj) { + object_info_t oi; + if (coll != coll_t::meta()) { + bufferlist attr; + r = store->getattr(ch, *obj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", " + << cpp_strerror(r) << 
std::endl; + } else { + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error decoding attr on : " << make_pair(coll, *obj) << ", " + << cpp_strerror(r) << std::endl; + } + } + } + action.call(store, coll, *obj, oi); + } + } + return 0; +} + +int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_object_t &action, bool debug) +{ + spg_t pgid; + // Scan collections in case this is an ec pool but no shard specified + unsigned scanned = 0; + int r = 0; + vector colls_to_check; + vector candidates; + + r = store->list_collections(candidates); + if (r < 0) { + cerr << "Error listing collections: " << cpp_strerror(r) << std::endl; + return r; + } + pgid.parse(pgidstr.c_str()); + for (vector::iterator i = candidates.begin(); + i != candidates.end(); + ++i) { + spg_t cand_pgid; + if (i->is_meta() && pgidstr == "meta") { + colls_to_check.push_back(*i); + continue; + } + if (!i->is_pg(&cand_pgid)) + continue; + + // If an exact match or treat no shard as any shard + if (cand_pgid == pgid || + (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) { + colls_to_check.push_back(*i); + } + } + + if (debug) + cerr << colls_to_check.size() << " pgs to scan" << std::endl; + for (vector::iterator i = colls_to_check.begin(); + i != colls_to_check.end(); + ++i, ++scanned) { + if (debug) + cerr << "Scanning " << *i << ", " << scanned << "/" + << colls_to_check.size() << " completed" << std::endl; + r = _action_on_all_objects_in_pg(store, *i, action, debug); + if (r < 0) + break; + } + return r; +} + +int action_on_all_objects_in_exact_pg(ObjectStore *store, coll_t coll, action_on_object_t &action, bool debug) +{ + int r = _action_on_all_objects_in_pg(store, coll, action, debug); + return r; +} + +int _action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug) +{ + unsigned scanned = 0; + int r = 0; + vector colls_to_check; + vector candidates; + r = store->list_collections(candidates); + if (r < 0) { + cerr << "Error listing collections: " << cpp_strerror(r) << std::endl; + return r; + } + for (vector::iterator i = candidates.begin(); + i != candidates.end(); + ++i) { + if (i->is_pg()) { + colls_to_check.push_back(*i); + } + } + + if (debug) + cerr << colls_to_check.size() << " pgs to scan" << std::endl; + for (vector::iterator i = colls_to_check.begin(); + i != colls_to_check.end(); + ++i, ++scanned) { + if (debug) + cerr << "Scanning " << *i << ", " << scanned << "/" + << colls_to_check.size() << " completed" << std::endl; + r = _action_on_all_objects_in_pg(store, *i, action, debug); + if (r < 0) + return r; + } + return 0; +} + +int action_on_all_objects(ObjectStore *store, action_on_object_t &action, bool debug) +{ + int r = _action_on_all_objects(store, action, debug); + return r; +} + +struct pgid_object_list { + list > _objects; + + void insert(coll_t coll, ghobject_t &ghobj) { + _objects.push_back(make_pair(coll, ghobj)); + } + + void dump(Formatter *f, bool human_readable) const { + if (!human_readable) + f->open_array_section("pgid_objects"); + for (list >::const_iterator i = _objects.begin(); + i != _objects.end(); + ++i) { + f->open_array_section("pgid_object"); + spg_t pgid; + bool is_pg = i->first.is_pg(&pgid); + if (is_pg) + f->dump_string("pgid", stringify(pgid)); + if (!is_pg || !human_readable) + f->dump_string("coll", i->first.to_str()); + f->open_object_section("ghobject"); + i->second.dump(f); + f->close_section(); + f->close_section(); + if (human_readable) { + f->flush(cout); + cout << std::endl; + 
} + } + if (!human_readable) { + f->close_section(); + f->flush(cout); + cout << std::endl; + } + } +}; + +struct lookup_ghobject : public action_on_object_t { + pgid_object_list _objects; + const string _name; + const boost::optional _namespace; + bool _need_snapset; + + lookup_ghobject(const string& name, const boost::optional& nspace, bool need_snapset = false) : _name(name), + _namespace(nspace), _need_snapset(need_snapset) { } + + void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) override { + if (_need_snapset && !ghobj.hobj.has_snapset()) + return; + if ((_name.length() == 0 || ghobj.hobj.oid.name == _name) && + (!_namespace || ghobj.hobj.nspace == _namespace)) + _objects.insert(coll, ghobj); + return; + } + + int size() const { + return _objects._objects.size(); + } + + pair pop() { + pair front = _objects._objects.front(); + _objects._objects.pop_front(); + return front; + } + + void dump(Formatter *f, bool human_readable) const { + _objects.dump(f, human_readable); + } +}; + +struct lookup_slow_ghobject : public action_on_object_t { + list > _objects; + const string _name; + double threshold; + + coll_t last_coll; + + lookup_slow_ghobject(const string& name, double _threshold) : + _name(name), threshold(_threshold) { } + + void call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) override { + ObjectMap::ObjectMapIterator iter; + auto start1 = mono_clock::now(); + ceph::signedspan first_seek_time = start1 - start1; + ceph::signedspan last_seek_time = first_seek_time; + ceph::signedspan total_time = first_seek_time; + { + auto ch = store->open_collection(coll); + iter = store->get_omap_iterator(ch, ghobj); + if (!iter) { + cerr << "omap_get_iterator: " << cpp_strerror(ENOENT) + << " obj:" << ghobj + << std::endl; + return; + } + auto start = mono_clock::now(); + iter->seek_to_first(); + first_seek_time = mono_clock::now() - start; + + while(iter->valid()) { + start = mono_clock::now(); + iter->next(); + last_seek_time = mono_clock::now() - start; + } + } + + if (coll != last_coll) { + cerr << ">>> inspecting coll" << coll << std::endl; + last_coll = coll; + } + + total_time = mono_clock::now() - start1; + if ( total_time >= make_timespan(threshold)) { + _objects.emplace_back(coll, ghobj, + first_seek_time, last_seek_time, total_time, + url_escape(iter->tail_key())); + cerr << ">>>>> found obj " << ghobj + << " first_seek_time " + << std::chrono::duration_cast(first_seek_time).count() + << " last_seek_time " + << std::chrono::duration_cast(last_seek_time).count() + << " total_time " + << std::chrono::duration_cast(total_time).count() + << " tail key: " << url_escape(iter->tail_key()) + << std::endl; + } + return; + } + + int size() const { + return _objects.size(); + } + + void dump(Formatter *f, bool human_readable) const { + if (!human_readable) + f->open_array_section("objects"); + for (auto i = _objects.begin(); + i != _objects.end(); + ++i) { + f->open_array_section("object"); + coll_t coll; + ghobject_t ghobj; + ceph::signedspan first_seek_time; + ceph::signedspan last_seek_time; + ceph::signedspan total_time; + string tail_key; + std::tie(coll, ghobj, first_seek_time, last_seek_time, total_time, tail_key) = *i; + + spg_t pgid; + bool is_pg = coll.is_pg(&pgid); + if (is_pg) + f->dump_string("pgid", stringify(pgid)); + if (!is_pg || !human_readable) + f->dump_string("coll", coll.to_str()); + f->dump_object("ghobject", ghobj); + f->open_object_section("times"); + f->dump_int("first_seek_time", + 
std::chrono::duration_cast(first_seek_time).count()); + f->dump_int("last_seek_time", + std::chrono::duration_cast + (last_seek_time).count()); + f->dump_int("total_time", + std::chrono::duration_cast(total_time).count()); + f->dump_string("tail_key", tail_key); + f->close_section(); + + f->close_section(); + if (human_readable) { + f->flush(cout); + cout << std::endl; + } + } + if (!human_readable) { + f->close_section(); + f->flush(cout); + cout << std::endl; + } + } +}; + +int file_fd = fd_none; +bool debug; +bool force = false; +bool no_superblock = false; + +super_header sh; + +static int get_fd_data(int fd, bufferlist &bl) +{ + uint64_t total = 0; + do { + ssize_t bytes = bl.read_fd(fd, max_read); + if (bytes < 0) { + cerr << "read_fd error " << cpp_strerror(bytes) << std::endl; + return bytes; + } + + if (bytes == 0) + break; + + total += bytes; + } while(true); + + ceph_assert(bl.length() == total); + return 0; +} + +int get_log(CephContext *cct, ObjectStore *fs, __u8 struct_ver, + spg_t pgid, const pg_info_t &info, + PGLog::IndexedLog &log, pg_missing_t &missing) +{ + try { + auto ch = fs->open_collection(coll_t(pgid)); + if (!ch) { + return -ENOENT; + } + ostringstream oss; + ceph_assert(struct_ver > 0); + PGLog::read_log_and_missing( + cct, fs, ch, + pgid.make_pgmeta_oid(), + info, log, missing, + oss, + g_ceph_context->_conf->osd_ignore_stale_divergent_priors); + if (debug && oss.str().size()) + cerr << oss.str() << std::endl; + } + catch (const buffer::error &e) { + cerr << "read_log_and_missing threw exception error " << e.what() << std::endl; + return -EFAULT; + } + return 0; +} + +void dump_log(Formatter *formatter, ostream &out, pg_log_t &log, + pg_missing_t &missing) +{ + formatter->open_object_section("op_log"); + formatter->open_object_section("pg_log_t"); + log.dump(formatter); + formatter->close_section(); + formatter->flush(out); + formatter->open_object_section("pg_missing_t"); + missing.dump(formatter); + formatter->close_section(); + formatter->close_section(); + formatter->flush(out); +} + +//Based on part of OSD::load_pgs() +int finish_remove_pgs(ObjectStore *store) +{ + vector ls; + int r = store->list_collections(ls); + if (r < 0) { + cerr << "finish_remove_pgs: failed to list pgs: " << cpp_strerror(r) + << std::endl; + return r; + } + + for (vector::iterator it = ls.begin(); + it != ls.end(); + ++it) { + spg_t pgid; + + if (it->is_temp(&pgid) || + (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) { + cout << "finish_remove_pgs " << *it << " removing " << pgid << std::endl; + OSD::recursive_remove_collection(g_ceph_context, store, pgid, *it); + continue; + } + + //cout << "finish_remove_pgs ignoring unrecognized " << *it << std::endl; + } + return 0; +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t) +{ + pg_info_t info(pgid); + coll_t coll(pgid); + ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid()); + + epoch_t map_epoch = 0; + int r = PG::peek_map_epoch(fs, pgid, &map_epoch); + if (r < 0) + cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl; + PastIntervals past_intervals; + __u8 struct_v; + r = PG::read_info(fs, pgid, coll, info, past_intervals, struct_v); + if (r < 0) { + cerr << __func__ << " error on read_info " << cpp_strerror(r) << std::endl; + return r; + } + ceph_assert(struct_v >= 8); + // new omap key + cout << "setting '_remove' omap key" << 
std::endl; + map values; + encode((char)1, values["_remove"]); + t->omap_setkeys(coll, pgmeta_oid, values); + return 0; +} + +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +template +void wait_until_done(ObjectStore::Transaction* txn, Func&& func) +{ + bool finished = false; + std::condition_variable cond; + std::mutex m; + txn->register_on_complete(make_lambda_context([&](int) { + std::unique_lock lock{m}; + finished = true; + cond.notify_one(); + })); + std::move(func)(); + std::unique_lock lock{m}; + cond.wait(lock, [&] {return finished;}); +} + +int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid) +{ + if (!dry_run) + finish_remove_pgs(store); + if (!store->collection_exists(coll_t(r_pgid))) + return -ENOENT; + + cout << " marking collection for removal" << std::endl; + if (dry_run) + return 0; + ObjectStore::Transaction rmt; + int r = mark_pg_for_removal(store, r_pgid, &rmt); + if (r < 0) { + return r; + } + ObjectStore::CollectionHandle ch = store->open_collection(coll_t(r_pgid)); + store->queue_transaction(ch, std::move(rmt)); + finish_remove_pgs(store); + return r; +} + +int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, + PastIntervals &past_intervals) +{ + //Empty for this + coll_t coll(info.pgid); + ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid()); + map km; + string key_to_remove; + pg_info_t last_written_info; + int ret = prepare_info_keymap( + g_ceph_context, + &km, &key_to_remove, + epoch, + info, + last_written_info, + past_intervals, + true, true, false); + if (ret) cerr << "Failed to write info" << std::endl; + t.omap_setkeys(coll, pgmeta_oid, km); + if (!key_to_remove.empty()) { + t.omap_rmkey(coll, pgmeta_oid, key_to_remove); + } + return ret; +} + +typedef map divergent_priors_t; + +int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info, + pg_log_t &log, PastIntervals &past_intervals, + divergent_priors_t &divergent, + pg_missing_t &missing) +{ + cout << __func__ << " epoch " << epoch << " info " << info << std::endl; + int ret = write_info(t, epoch, info, past_intervals); + if (ret) + return ret; + + coll_t coll(info.pgid); + map km; + const bool require_rollback = !info.pgid.is_no_shard(); + if (!divergent.empty()) { + ceph_assert(missing.get_items().empty()); + PGLog::write_log_and_missing_wo_missing( + t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, + require_rollback); + } else { + pg_missing_tracker_t tmissing(missing); + bool rebuilt_missing_set_with_deletes = missing.may_include_deletes; + PGLog::write_log_and_missing( + t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, + require_rollback, + &rebuilt_missing_set_with_deletes); + } + t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km); + return 0; +} + +int do_trim_pg_log(ObjectStore *store, const coll_t &coll, + pg_info_t &info, const spg_t &pgid, + epoch_t map_epoch, + PastIntervals &past_intervals) +{ + ghobject_t oid = pgid.make_pgmeta_oid(); + struct stat st; + auto ch = store->open_collection(coll); + int r = store->stat(ch, oid, &st); + ceph_assert(r == 0); + ceph_assert(st.st_size == 0); + + cerr << "Log bounds are: " << "(" << info.log_tail << "," + << info.last_update << "]" << std::endl; + + uint64_t max_entries = g_ceph_context->_conf->osd_max_pg_log_entries; + if (info.last_update.version - info.log_tail.version <= max_entries) { + cerr << "Log not larger than osd_max_pg_log_entries " << max_entries << std::endl; + return 0; + } + + ceph_assert(info.last_update.version > max_entries); + version_t 
trim_to = info.last_update.version - max_entries; + size_t trim_at_once = g_ceph_context->_conf->osd_pg_log_trim_max; + eversion_t new_tail; + bool done = false; + + while (!done) { + // gather keys so we can delete them in a batch without + // affecting the iterator + set keys_to_trim; + { + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid); + if (!p) + break; + for (p->seek_to_first(); p->valid(); p->next()) { + if (p->key()[0] == '_') + continue; + if (p->key() == "can_rollback_to") + continue; + if (p->key() == "divergent_priors") + continue; + if (p->key() == "rollback_info_trimmed_to") + continue; + if (p->key() == "may_include_deletes_in_missing") + continue; + if (p->key().substr(0, 7) == string("missing")) + continue; + if (p->key().substr(0, 4) == string("dup_")) + continue; + + bufferlist bl = p->value(); + auto bp = bl.cbegin(); + pg_log_entry_t e; + try { + e.decode_with_checksum(bp); + } catch (const buffer::error &e) { + cerr << "Error reading pg log entry: " << e.what() << std::endl; + } + if (debug) { + cerr << "read entry " << e << std::endl; + } + if (e.version.version > trim_to) { + done = true; + break; + } + keys_to_trim.insert(p->key()); + new_tail = e.version; + if (keys_to_trim.size() >= trim_at_once) + break; + } + + if (!p->valid()) + done = true; + } // deconstruct ObjectMapIterator + + // delete the keys + if (!dry_run && !keys_to_trim.empty()) { + cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl; + ObjectStore::Transaction t; + t.omap_rmkeys(coll, oid, keys_to_trim); + store->queue_transaction(ch, std::move(t)); + ch->flush(); + } + } + + // update pg info with new tail + if (!dry_run && new_tail != eversion_t()) { + info.log_tail = new_tail; + ObjectStore::Transaction t; + int ret = write_info(t, map_epoch, info, past_intervals); + if (ret) + return ret; + store->queue_transaction(ch, std::move(t)); + ch->flush(); + } + + // compact the db since we just removed a bunch of data + cerr << "Finished trimming, now compacting..." << std::endl; + if (!dry_run) + store->compact(); + return 0; +} + +int do_trim_pg_log_dups(ObjectStore *store, const coll_t &coll, + pg_info_t &info, const spg_t &pgid, + epoch_t map_epoch, + PastIntervals &past_intervals) +{ + ghobject_t oid = pgid.make_pgmeta_oid(); + struct stat st; + auto ch = store->open_collection(coll); + int r = store->stat(ch, oid, &st); + ceph_assert(r == 0); + ceph_assert(st.st_size == 0); + + const size_t max_dup_entries = g_ceph_context->_conf->osd_pg_log_dups_tracked; + ceph_assert(max_dup_entries > 0); + const size_t max_chunk_size = g_ceph_context->_conf->osd_pg_log_trim_max; + ceph_assert(max_chunk_size > 0); + + cout << "max_dup_entries=" << max_dup_entries + << " max_chunk_size=" << max_chunk_size << std::endl; + if (dry_run) { + cout << "Dry run enabled, so when many chunks are needed," + << " the trimming will never stop!" 
<< std::endl; + } + + set keys_to_keep; + size_t num_removed = 0; + do { + set keys_to_trim; + { + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid); + if (!p) + break; + for (p->seek_to_first(); p->valid(); p->next()) { + if (p->key()[0] == '_') + continue; + if (p->key() == "can_rollback_to") + continue; + if (p->key() == "divergent_priors") + continue; + if (p->key() == "rollback_info_trimmed_to") + continue; + if (p->key() == "may_include_deletes_in_missing") + continue; + if (p->key().substr(0, 7) == string("missing")) + continue; + if (p->key().substr(0, 4) != string("dup_")) + continue; + keys_to_keep.insert(p->key()); + if (keys_to_keep.size() > max_dup_entries) { + auto oldest_to_keep = keys_to_keep.begin(); + keys_to_trim.emplace(*oldest_to_keep); + keys_to_keep.erase(oldest_to_keep); + } + if (keys_to_trim.size() >= max_chunk_size) { + break; + } + } + } // deconstruct ObjectMapIterator + // delete the keys + num_removed = keys_to_trim.size(); + if (!dry_run && !keys_to_trim.empty()) { + cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl; + ObjectStore::Transaction t; + t.omap_rmkeys(coll, oid, keys_to_trim); + store->queue_transaction(ch, std::move(t)); + ch->flush(); + } + } while (num_removed == max_chunk_size); + + // compact the db since we just removed a bunch of data + cerr << "Finished trimming, now compacting..." << std::endl; + if (!dry_run) + store->compact(); + return 0; +} + +const int OMAP_BATCH_SIZE = 25; +void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map &oset) +{ + oset.clear(); + for (int count = OMAP_BATCH_SIZE; count && iter->valid(); --count, iter->next()) { + oset.insert(pair(iter->key(), iter->value())); + } +} + +int ObjectStoreTool::export_file(ObjectStore *store, coll_t cid, ghobject_t &obj) +{ + struct stat st; + mysize_t total; + footer ft; + + auto ch = store->open_collection(cid); + int ret = store->stat(ch, obj, &st); + if (ret < 0) + return ret; + + cerr << "Read " << obj << std::endl; + + total = st.st_size; + if (debug) + cerr << "size=" << total << std::endl; + + object_begin objb(obj); + + { + bufferptr bp; + bufferlist bl; + ret = store->getattr(ch, obj, OI_ATTR, bp); + if (ret < 0) { + cerr << "getattr failure object_info " << ret << std::endl; + return ret; + } + bl.push_back(bp); + decode(objb.oi, bl); + if (debug) + cerr << "object_info: " << objb.oi << std::endl; + } + + // NOTE: we include whiteouts, lost, etc. 
+ + ret = write_section(TYPE_OBJECT_BEGIN, objb, file_fd); + if (ret < 0) + return ret; + + uint64_t offset = 0; + bufferlist rawdatabl; + while(total > 0) { + rawdatabl.clear(); + mysize_t len = max_read; + if (len > total) + len = total; + + ret = store->read(ch, obj, offset, len, rawdatabl); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + + data_section dblock(offset, len, rawdatabl); + if (debug) + cerr << "data section offset=" << offset << " len=" << len << std::endl; + + total -= ret; + offset += ret; + + ret = write_section(TYPE_DATA, dblock, file_fd); + if (ret) return ret; + } + + //Handle attrs for this object + map> aset; + ret = store->getattrs(ch, obj, aset); + if (ret) return ret; + attr_section as(aset); + ret = write_section(TYPE_ATTRS, as, file_fd); + if (ret) + return ret; + + if (debug) { + cerr << "attrs size " << aset.size() << std::endl; + } + + //Handle omap information + bufferlist hdrbuf; + ret = store->omap_get_header(ch, obj, &hdrbuf, true); + if (ret < 0) { + cerr << "omap_get_header: " << cpp_strerror(ret) << std::endl; + return ret; + } + + omap_hdr_section ohs(hdrbuf); + ret = write_section(TYPE_OMAP_HDR, ohs, file_fd); + if (ret) + return ret; + + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, obj); + if (!iter) { + ret = -ENOENT; + cerr << "omap_get_iterator: " << cpp_strerror(ret) << std::endl; + return ret; + } + iter->seek_to_first(); + int mapcount = 0; + map out; + while(iter->valid()) { + get_omap_batch(iter, out); + + if (out.empty()) break; + + mapcount += out.size(); + omap_section oms(out); + ret = write_section(TYPE_OMAP, oms, file_fd); + if (ret) + return ret; + } + if (debug) + cerr << "omap map size " << mapcount << std::endl; + + ret = write_simple(TYPE_OBJECT_END, file_fd); + if (ret) + return ret; + + return 0; +} + +int ObjectStoreTool::export_files(ObjectStore *store, coll_t coll) +{ + ghobject_t next; + auto ch = store->open_collection(coll); + while (!next.is_max()) { + vector objects; + int r = store->collection_list(ch, next, ghobject_t::get_max(), 300, + &objects, &next); + if (r < 0) + return r; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + ceph_assert(!i->hobj.is_meta()); + if (i->is_pgmeta() || i->hobj.is_temp() || !i->is_no_gen()) { + continue; + } + r = export_file(store, coll, *i); + if (r < 0) + return r; + } + } + return 0; +} + +int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { + OSDMap::Incremental inc; + auto it = bl.cbegin(); + inc.decode(it); + if (e == 0) { + e = inc.epoch; + } else if (e != inc.epoch) { + cerr << "incremental.epoch mismatch: " + << inc.epoch << " != " << e << std::endl; + if (force) { + cerr << "But will continue anyway." << std::endl; + } else { + return -EINVAL; + } + } + auto ch = store->open_collection(coll_t::meta()); + const ghobject_t inc_oid = OSD::get_inc_osdmap_pobject_name(e); + if (!store->exists(ch, inc_oid)) { + cerr << "inc-osdmap (" << inc_oid << ") does not exist." << std::endl; + if (!force) { + return -ENOENT; + } + cout << "Creating a new epoch." 
<< std::endl; + } + if (dry_run) + return 0; + ObjectStore::Transaction t; + t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl); + t.truncate(coll_t::meta(), inc_oid, bl.length()); + store->queue_transaction(ch, std::move(t)); + return 0; +} + +int get_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl) +{ + auto ch = store->open_collection(coll_t::meta()); + if (store->read(ch, + OSD::get_inc_osdmap_pobject_name(e), + 0, 0, bl) < 0) { + return -ENOENT; + } + return 0; +} + +int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force) { + OSDMap osdmap; + osdmap.decode(bl); + if (e == 0) { + e = osdmap.get_epoch(); + } else if (e != osdmap.get_epoch()) { + cerr << "osdmap.epoch mismatch: " + << e << " != " << osdmap.get_epoch() << std::endl; + if (force) { + cerr << "But will continue anyway." << std::endl; + } else { + return -EINVAL; + } + } + auto ch = store->open_collection(coll_t::meta()); + const ghobject_t full_oid = OSD::get_osdmap_pobject_name(e); + if (!store->exists(ch, full_oid)) { + cerr << "osdmap (" << full_oid << ") does not exist." << std::endl; + if (!force) { + return -ENOENT; + } + cout << "Creating a new epoch." << std::endl; + } + if (dry_run) + return 0; + ObjectStore::Transaction t; + t.write(coll_t::meta(), full_oid, 0, bl.length(), bl); + t.truncate(coll_t::meta(), full_oid, bl.length()); + store->queue_transaction(ch, std::move(t)); + return 0; +} + +int get_osdmap(ObjectStore *store, epoch_t e, OSDMap &osdmap, bufferlist& bl) +{ + ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta()); + bool found = store->read( + ch, OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0; + if (!found) { + cerr << "Can't find OSDMap for pg epoch " << e << std::endl; + return -ENOENT; + } + osdmap.decode(bl); + if (debug) + cerr << osdmap << std::endl; + return 0; +} + +int get_pg_num_history(ObjectStore *store, pool_pg_num_history_t *h) +{ + ObjectStore::CollectionHandle ch = store->open_collection(coll_t::meta()); + bufferlist bl; + auto pghist = OSD::make_pg_num_history_oid(); + int r = store->read(ch, pghist, 0, 0, bl, 0); + if (r >= 0 && bl.length() > 0) { + auto p = bl.cbegin(); + decode(*h, p); + } + cout << __func__ << " pg_num_history " << *h << std::endl; + return 0; +} + +int add_osdmap(ObjectStore *store, metadata_section &ms) +{ + return get_osdmap(store, ms.map_epoch, ms.osdmap, ms.osdmap_bl); +} + +int ObjectStoreTool::do_export( + CephContext *cct, ObjectStore *fs, coll_t coll, spg_t pgid, + pg_info_t &info, epoch_t map_epoch, __u8 struct_ver, + const OSDSuperblock& superblock, + PastIntervals &past_intervals) +{ + PGLog::IndexedLog log; + pg_missing_t missing; + + cerr << "Exporting " << pgid << " info " << info << std::endl; + + int ret = get_log(cct, fs, struct_ver, pgid, info, log, missing); + if (ret > 0) + return ret; + + if (debug) { + Formatter *formatter = Formatter::create("json-pretty"); + ceph_assert(formatter); + dump_log(formatter, cerr, log, missing); + delete formatter; + } + write_super(); + + pg_begin pgb(pgid, superblock); + // Special case: If replicated pg don't require the importing OSD to have shard feature + if (pgid.is_no_shard()) { + pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + } + ret = write_section(TYPE_PG_BEGIN, pgb, file_fd); + if (ret) + return ret; + + // The metadata_section is now before files, so import can detect + // errors and abort without wasting time. 
+ metadata_section ms( + struct_ver, + map_epoch, + info, + log, + past_intervals, + missing); + ret = add_osdmap(fs, ms); + if (ret) + return ret; + ret = write_section(TYPE_PG_METADATA, ms, file_fd); + if (ret) + return ret; + + ret = export_files(fs, coll); + if (ret) { + cerr << "export_files error " << ret << std::endl; + return ret; + } + + ret = write_simple(TYPE_PG_END, file_fd); + if (ret) + return ret; + + return 0; +} + +int dump_data(Formatter *formatter, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + data_section ds; + ds.decode(ebliter); + + formatter->open_object_section("data_block"); + formatter->dump_unsigned("offset", ds.offset); + formatter->dump_unsigned("len", ds.len); + // XXX: Add option to dump data like od -cx ? + formatter->close_section(); + formatter->flush(cout); + return 0; +} + +int get_data(ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + data_section ds; + ds.decode(ebliter); + + if (debug) + cerr << "\tdata: offset " << ds.offset << " len " << ds.len << std::endl; + t->write(coll, hoid, ds.offset, ds.len, ds.databl); + return 0; +} + +int dump_attrs( + Formatter *formatter, ghobject_t hoid, + bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + attr_section as; + as.decode(ebliter); + + // This could have been handled in the caller if we didn't need to + // support exports that didn't include object_info_t in object_begin. + if (hoid.generation == ghobject_t::NO_GEN && + hoid.hobj.is_head()) { + map::iterator mi = as.data.find(SS_ATTR); + if (mi != as.data.end()) { + SnapSet snapset; + auto p = mi->second.cbegin(); + snapset.decode(p); + formatter->open_object_section("snapset"); + snapset.dump(formatter); + formatter->close_section(); + } else { + formatter->open_object_section("snapset"); + formatter->dump_string("error", "missing SS_ATTR"); + formatter->close_section(); + } + } + + formatter->open_object_section("attrs"); + formatter->open_array_section("user"); + for (auto kv : as.data) { + // Skip system attributes + if (('_' != kv.first.at(0)) || kv.first.size() == 1) + continue; + formatter->open_object_section("user_attr"); + formatter->dump_string("name", kv.first.substr(1)); + bool b64; + formatter->dump_string("value", cleanbin(kv.second, b64)); + formatter->dump_bool("Base64", b64); + formatter->close_section(); + } + formatter->close_section(); + formatter->open_array_section("system"); + for (auto kv : as.data) { + // Skip user attributes + if (('_' == kv.first.at(0)) && kv.first.size() != 1) + continue; + formatter->open_object_section("sys_attr"); + formatter->dump_string("name", kv.first); + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + + return 0; +} + +int get_attrs( + ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl, + OSDriver &driver, SnapMapper &snap_mapper) +{ + auto ebliter = bl.cbegin(); + attr_section as; + as.decode(ebliter); + + auto ch = store->open_collection(coll); + if (debug) + cerr << "\tattrs: len " << as.data.size() << std::endl; + t->setattrs(coll, hoid, as.data); + + // This could have been handled in the caller if we didn't need to + // support exports that didn't include object_info_t in object_begin. 
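+  // Only head objects (generation NO_GEN, head snap id) carry SS_ATTR; the
+  // SnapSet decoded from it below records which snaps each surviving clone
+  // maps to, so the SnapMapper can be primed as objects are imported.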
+ if (hoid.generation == ghobject_t::NO_GEN && + hoid.hobj.is_head()) { + map::iterator mi = as.data.find(SS_ATTR); + if (mi != as.data.end()) { + SnapSet snapset; + auto p = mi->second.cbegin(); + snapset.decode(p); + cout << "snapset " << snapset << std::endl; + for (auto& p : snapset.clone_snaps) { + ghobject_t clone = hoid; + clone.hobj.snap = p.first; + set snaps(p.second.begin(), p.second.end()); + if (!store->exists(ch, clone)) { + // no clone, skip. this is probably a cache pool. this works + // because we use a separate transaction per object and clones + // come before head in the archive. + if (debug) + cerr << "\tskipping missing " << clone << " (snaps " + << snaps << ")" << std::endl; + continue; + } + if (debug) + cerr << "\tsetting " << clone.hobj << " snaps " << snaps + << std::endl; + OSDriver::OSTransaction _t(driver.get_transaction(t)); + ceph_assert(!snaps.empty()); + snap_mapper.add_oid(clone.hobj, snaps, &_t); + } + } else { + cerr << "missing SS_ATTR on " << hoid << std::endl; + } + } + return 0; +} + +int dump_omap_hdr(Formatter *formatter, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_hdr_section oh; + oh.decode(ebliter); + + formatter->open_object_section("omap_header"); + formatter->dump_string("value", string(oh.hdr.c_str(), oh.hdr.length())); + formatter->close_section(); + formatter->flush(cout); + return 0; +} + +int get_omap_hdr(ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_hdr_section oh; + oh.decode(ebliter); + + if (debug) + cerr << "\tomap header: " << string(oh.hdr.c_str(), oh.hdr.length()) + << std::endl; + t->omap_setheader(coll, hoid, oh.hdr); + return 0; +} + +int dump_omap(Formatter *formatter, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_section os; + os.decode(ebliter); + + formatter->open_object_section("omaps"); + formatter->dump_unsigned("count", os.omap.size()); + formatter->open_array_section("data"); + for (auto o : os.omap) { + formatter->open_object_section("omap"); + formatter->dump_string("name", o.first); + bool b64; + formatter->dump_string("value", cleanbin(o.second, b64)); + formatter->dump_bool("Base64", b64); + formatter->close_section(); + } + formatter->close_section(); + formatter->close_section(); + formatter->flush(cout); + return 0; +} + +int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid, + ObjectStore::Transaction *t, bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + omap_section os; + os.decode(ebliter); + + if (debug) + cerr << "\tomap: size " << os.omap.size() << std::endl; + t->omap_setkeys(coll, hoid, os.omap); + return 0; +} + +int ObjectStoreTool::dump_object(Formatter *formatter, + bufferlist &bl) +{ + auto ebliter = bl.cbegin(); + object_begin ob; + ob.decode(ebliter); + + if (ob.hoid.hobj.is_temp()) { + cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl; + return -EFAULT; + } + + formatter->open_object_section("object"); + formatter->open_object_section("oid"); + ob.hoid.dump(formatter); + formatter->close_section(); + formatter->open_object_section("object_info"); + ob.oi.dump(formatter); + formatter->close_section(); + + bufferlist ebl; + bool done = false; + while(!done) { + sectiontype_t type; + int ret = read_section(&type, &ebl); + if (ret) + return ret; + + //cout << "\tdo_object: Section type " << hex << type << dec << std::endl; + //cout << "\t\tsection size " << ebl.length() << std::endl; + if (type >= END_OF_TYPES) { + cout << "Skipping unknown 
object section type" << std::endl; + continue; + } + switch(type) { + case TYPE_DATA: + if (dry_run) break; + ret = dump_data(formatter, ebl); + if (ret) return ret; + break; + case TYPE_ATTRS: + if (dry_run) break; + ret = dump_attrs(formatter, ob.hoid, ebl); + if (ret) return ret; + break; + case TYPE_OMAP_HDR: + if (dry_run) break; + ret = dump_omap_hdr(formatter, ebl); + if (ret) return ret; + break; + case TYPE_OMAP: + if (dry_run) break; + ret = dump_omap(formatter, ebl); + if (ret) return ret; + break; + case TYPE_OBJECT_END: + done = true; + break; + default: + cerr << "Unknown section type " << type << std::endl; + return -EFAULT; + } + } + formatter->close_section(); + return 0; +} + +int ObjectStoreTool::get_object(ObjectStore *store, + OSDriver& driver, + SnapMapper& mapper, + coll_t coll, + bufferlist &bl, OSDMap &origmap, + bool *skipped_objects) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + auto ebliter = bl.cbegin(); + object_begin ob; + ob.decode(ebliter); + + if (ob.hoid.hobj.is_temp()) { + cerr << "ERROR: Export contains temporary object '" << ob.hoid << "'" << std::endl; + return -EFAULT; + } + ceph_assert(g_ceph_context); + + auto ch = store->open_collection(coll); + if (ob.hoid.hobj.nspace != g_ceph_context->_conf->osd_hit_set_namespace) { + object_t oid = ob.hoid.hobj.oid; + object_locator_t loc(ob.hoid.hobj); + pg_t raw_pgid = origmap.object_locator_to_pg(oid, loc); + pg_t pgid = origmap.raw_pg_to_pg(raw_pgid); + + spg_t coll_pgid; + if (coll.is_pg(&coll_pgid) == false) { + cerr << "INTERNAL ERROR: Bad collection during import" << std::endl; + return -EFAULT; + } + if (coll_pgid.shard != ob.hoid.shard_id) { + cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard + << " but object shard is " << ob.hoid.shard_id << std::endl; + return -EFAULT; + } + + if (coll_pgid.pgid != pgid) { + cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl; + *skipped_objects = true; + skip_object(bl); + return 0; + } + } + + if (!dry_run) + t->touch(coll, ob.hoid); + + cout << "Write " << ob.hoid << std::endl; + + bufferlist ebl; + bool done = false; + while(!done) { + sectiontype_t type; + int ret = read_section(&type, &ebl); + if (ret) + return ret; + + //cout << "\tdo_object: Section type " << hex << type << dec << std::endl; + //cout << "\t\tsection size " << ebl.length() << std::endl; + if (type >= END_OF_TYPES) { + cout << "Skipping unknown object section type" << std::endl; + continue; + } + switch(type) { + case TYPE_DATA: + if (dry_run) break; + ret = get_data(store, coll, ob.hoid, t, ebl); + if (ret) return ret; + break; + case TYPE_ATTRS: + if (dry_run) break; + ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper); + if (ret) return ret; + break; + case TYPE_OMAP_HDR: + if (dry_run) break; + ret = get_omap_hdr(store, coll, ob.hoid, t, ebl); + if (ret) return ret; + break; + case TYPE_OMAP: + if (dry_run) break; + ret = get_omap(store, coll, ob.hoid, t, ebl); + if (ret) return ret; + break; + case TYPE_OBJECT_END: + done = true; + break; + default: + cerr << "Unknown section type " << type << std::endl; + return -EFAULT; + } + } + if (!dry_run) { + wait_until_done(t, [&] { + store->queue_transaction(ch, std::move(*t)); + ch->flush(); + }); + } + return 0; +} + +int dump_pg_metadata(Formatter *formatter, bufferlist &bl, metadata_section &ms) +{ + auto ebliter = bl.cbegin(); + ms.decode(ebliter); + + formatter->open_object_section("metadata_section"); + + formatter->dump_unsigned("pg_disk_version", 
(int)ms.struct_ver); + formatter->dump_unsigned("map_epoch", ms.map_epoch); + + formatter->open_object_section("OSDMap"); + ms.osdmap.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + formatter->open_object_section("info"); + ms.info.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + + formatter->open_object_section("log"); + ms.log.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + + formatter->open_object_section("pg_missing_t"); + ms.missing.dump(formatter); + formatter->close_section(); + + // XXX: ms.past_intervals? + + formatter->close_section(); + formatter->flush(cout); + + if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) { + cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl; + return -EFAULT; + } + + return 0; +} + +int get_pg_metadata(ObjectStore *store, bufferlist &bl, metadata_section &ms, + const OSDSuperblock& sb, spg_t pgid) +{ + auto ebliter = bl.cbegin(); + ms.decode(ebliter); + spg_t old_pgid = ms.info.pgid; + ms.info.pgid = pgid; + + if (debug) { + cout << "export pgid " << old_pgid << std::endl; + cout << "struct_v " << (int)ms.struct_ver << std::endl; + cout << "map epoch " << ms.map_epoch << std::endl; + +#ifdef DIAGNOSTIC + Formatter *formatter = new JSONFormatter(true); + formatter->open_object_section("stuff"); + + formatter->open_object_section("importing OSDMap"); + ms.osdmap.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + cout << "osd current epoch " << sb.current_epoch << std::endl; + + formatter->open_object_section("info"); + ms.info.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + formatter->open_object_section("log"); + ms.log.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; +#endif + } + + if (ms.osdmap.get_epoch() != 0 && ms.map_epoch != ms.osdmap.get_epoch()) { + cerr << "FATAL: Invalid OSDMap epoch in export data" << std::endl; + return -EFAULT; + } + + if (ms.map_epoch > sb.current_epoch) { + cerr << "ERROR: Export PG's map_epoch " << ms.map_epoch << " > OSD's epoch " << sb.current_epoch << std::endl; + cerr << "The OSD you are using is older than the exported PG" << std::endl; + cerr << "Either use another OSD or join selected OSD to cluster to update it first" << std::endl; + return -EINVAL; + } + + // Old exports didn't include OSDMap + if (ms.osdmap.get_epoch() == 0) { + cerr << "WARNING: No OSDMap in old export, this is an ancient export." + " Not supported." << std::endl; + return -EINVAL; + } + + if (ms.osdmap.get_epoch() < sb.oldest_map) { + cerr << "PG export's map " << ms.osdmap.get_epoch() + << " is older than OSD's oldest_map " << sb.oldest_map << std::endl; + if (!force) { + cerr << " pass --force to proceed anyway (with incomplete PastIntervals)" + << std::endl; + return -EINVAL; + } + } + if (debug) { + cerr << "Import pgid " << ms.info.pgid << std::endl; + cerr << "Previous past_intervals " << ms.past_intervals << std::endl; + cerr << "history.same_interval_since " + << ms.info.history.same_interval_since << std::endl; + } + + return 0; +} + +// out: pg_log_t that only has entries that apply to import_pgid using curmap +// reject: Entries rejected from "in" are in the reject.log. Other fields not set. 
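+// e.g. when importing pg 1.7: a prior whose object maps (via curmap) to pg 1.7 goes to "out", one that maps to, say, pg 1.3 goes to "reject"; entries in the hit-set namespace bypass the mapping check and are always kept, while temporary objects are always rejected. (Illustrative pg ids.)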
+void filter_divergent_priors(spg_t import_pgid, const OSDMap &curmap, + const string &hit_set_namespace, const divergent_priors_t &in, + divergent_priors_t &out, divergent_priors_t &reject) +{ + out.clear(); + reject.clear(); + + for (divergent_priors_t::const_iterator i = in.begin(); + i != in.end(); ++i) { + + // Reject divergent priors for temporary objects + if (i->second.is_temp()) { + reject.insert(*i); + continue; + } + + if (i->second.nspace != hit_set_namespace) { + object_t oid = i->second.oid; + object_locator_t loc(i->second); + pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc); + pg_t pgid = curmap.raw_pg_to_pg(raw_pgid); + + if (import_pgid.pgid == pgid) { + out.insert(*i); + } else { + reject.insert(*i); + } + } else { + out.insert(*i); + } + } +} + +int ObjectStoreTool::dump_export(Formatter *formatter) +{ + bufferlist ebl; + pg_info_t info; + PGLog::IndexedLog log; + //bool skipped_objects = false; + + int ret = read_super(); + if (ret) + return ret; + + if (sh.magic != super_header::super_magic) { + cerr << "Invalid magic number" << std::endl; + return -EFAULT; + } + + if (sh.version > super_header::super_ver) { + cerr << "Can't handle export format version=" << sh.version << std::endl; + return -EINVAL; + } + + formatter->open_object_section("Export"); + + //First section must be TYPE_PG_BEGIN + sectiontype_t type; + ret = read_section(&type, &ebl); + if (ret) + return ret; + if (type == TYPE_POOL_BEGIN) { + cerr << "Dump of pool exports not supported" << std::endl; + return -EINVAL; + } else if (type != TYPE_PG_BEGIN) { + cerr << "Invalid first section type " << std::to_string(type) << std::endl; + return -EFAULT; + } + + auto ebliter = ebl.cbegin(); + pg_begin pgb; + pgb.decode(ebliter); + spg_t pgid = pgb.pgid; + + formatter->dump_string("pgid", stringify(pgid)); + formatter->dump_string("cluster_fsid", stringify(pgb.superblock.cluster_fsid)); + formatter->dump_string("features", stringify(pgb.superblock.compat_features)); + + bool done = false; + bool found_metadata = false; + metadata_section ms; + bool objects_started = false; + while(!done) { + ret = read_section(&type, &ebl); + if (ret) + return ret; + + if (debug) { + cerr << "dump_export: Section type " << std::to_string(type) << std::endl; + } + if (type >= END_OF_TYPES) { + cerr << "Skipping unknown section type" << std::endl; + continue; + } + switch(type) { + case TYPE_OBJECT_BEGIN: + if (!objects_started) { + formatter->open_array_section("objects"); + objects_started = true; + } + ret = dump_object(formatter, ebl); + if (ret) return ret; + break; + case TYPE_PG_METADATA: + if (objects_started) + cerr << "WARNING: metadata_section out of order" << std::endl; + ret = dump_pg_metadata(formatter, ebl, ms); + if (ret) return ret; + found_metadata = true; + break; + case TYPE_PG_END: + if (objects_started) { + formatter->close_section(); + } + done = true; + break; + default: + cerr << "Unknown section type " << std::to_string(type) << std::endl; + return -EFAULT; + } + } + + if (!found_metadata) { + cerr << "Missing metadata section" << std::endl; + return -EFAULT; + } + + formatter->close_section(); + formatter->flush(cout); + + return 0; +} + +int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb, + bool force, std::string pgidstr) +{ + bufferlist ebl; + pg_info_t info; + PGLog::IndexedLog log; + bool skipped_objects = false; + + if (!dry_run) + finish_remove_pgs(store); + + int ret = read_super(); + if (ret) + return ret; + + if (sh.magic != super_header::super_magic) { + cerr << 
"Invalid magic number" << std::endl; + return -EFAULT; + } + + if (sh.version > super_header::super_ver) { + cerr << "Can't handle export format version=" << sh.version << std::endl; + return -EINVAL; + } + + //First section must be TYPE_PG_BEGIN + sectiontype_t type; + ret = read_section(&type, &ebl); + if (ret) + return ret; + if (type == TYPE_POOL_BEGIN) { + cerr << "Pool exports cannot be imported into a PG" << std::endl; + return -EINVAL; + } else if (type != TYPE_PG_BEGIN) { + cerr << "Invalid first section type " << std::to_string(type) << std::endl; + return -EFAULT; + } + + auto ebliter = ebl.cbegin(); + pg_begin pgb; + pgb.decode(ebliter); + spg_t pgid = pgb.pgid; + + if (pgidstr.length()) { + spg_t user_pgid; + + bool ok = user_pgid.parse(pgidstr.c_str()); + // This succeeded in main() already + ceph_assert(ok); + if (pgid != user_pgid) { + cerr << "specified pgid " << user_pgid + << " does not match actual pgid " << pgid << std::endl; + return -EINVAL; + } + } + + if (!pgb.superblock.cluster_fsid.is_zero() + && pgb.superblock.cluster_fsid != sb.cluster_fsid) { + cerr << "Export came from different cluster with fsid " + << pgb.superblock.cluster_fsid << std::endl; + if (force) { + cerr << "Ignoring this problem due to --force" << std::endl; + } else { + return -EINVAL; + } + } + + if (debug) { + cerr << "Exported features: " << pgb.superblock.compat_features << std::endl; + } + + // Special case: Old export has SHARDS incompat feature on replicated pg, removqqe it + if (pgid.is_no_shard()) + pgb.superblock.compat_features.incompat.remove(CEPH_OSD_FEATURE_INCOMPAT_SHARDS); + + if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) { + CompatSet unsupported = sb.compat_features.unsupported(pgb.superblock.compat_features); + + cerr << "Export has incompatible features set " << unsupported << std::endl; + + // Let them import if they specify the --force option + if (!force) + return 11; // Positive return means exit status + } + + // we need the latest OSDMap to check for collisions + OSDMap curmap; + bufferlist bl; + ret = get_osdmap(store, sb.current_epoch, curmap, bl); + if (ret) { + cerr << "Can't find latest local OSDMap " << sb.current_epoch << std::endl; + return ret; + } + if (!curmap.have_pg_pool(pgid.pgid.m_pool)) { + cerr << "Pool " << pgid.pgid.m_pool << " no longer exists" << std::endl; + // Special exit code for this error, used by test code + return 10; // Positive return means exit status + } + + pool_pg_num_history_t pg_num_history; + get_pg_num_history(store, &pg_num_history); + + ghobject_t pgmeta_oid = pgid.make_pgmeta_oid(); + + // Check for PG already present. 
+ coll_t coll(pgid); + if (store->collection_exists(coll)) { + cerr << "pgid " << pgid << " already exists" << std::endl; + return -EEXIST; + } + + ObjectStore::CollectionHandle ch; + + OSDriver driver( + store, + coll_t(), + OSD::make_snapmapper_oid()); + SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pgid.shard); + + cout << "Importing pgid " << pgid; + cout << std::endl; + + bool done = false; + bool found_metadata = false; + metadata_section ms; + while(!done) { + ret = read_section(&type, &ebl); + if (ret) + return ret; + + if (debug) { + cout << __func__ << ": Section type " << std::to_string(type) << std::endl; + } + if (type >= END_OF_TYPES) { + cout << "Skipping unknown section type" << std::endl; + continue; + } + switch(type) { + case TYPE_OBJECT_BEGIN: + ceph_assert(found_metadata); + ret = get_object(store, driver, mapper, coll, ebl, ms.osdmap, + &skipped_objects); + if (ret) return ret; + break; + case TYPE_PG_METADATA: + ret = get_pg_metadata(store, ebl, ms, sb, pgid); + if (ret) return ret; + found_metadata = true; + + if (pgid != ms.info.pgid) { + cerr << "specified pgid " << pgid << " does not match import file pgid " + << ms.info.pgid << std::endl; + return -EINVAL; + } + + // make sure there are no conflicting splits or merges + if (ms.osdmap.have_pg_pool(pgid.pgid.pool())) { + auto p = pg_num_history.pg_nums.find(pgid.pgid.m_pool); + if (p != pg_num_history.pg_nums.end() && + !p->second.empty()) { + unsigned start_pg_num = ms.osdmap.get_pg_num(pgid.pgid.pool()); + unsigned pg_num = start_pg_num; + for (auto q = p->second.lower_bound(ms.map_epoch); + q != p->second.end(); + ++q) { + unsigned new_pg_num = q->second; + cout << "pool " << pgid.pgid.pool() << " pg_num " << pg_num + << " -> " << new_pg_num << std::endl; + + // check for merge target + spg_t target; + if (pgid.is_merge_source(pg_num, new_pg_num, &target)) { + // FIXME: this check assumes the OSD's PG is at the OSD's + // map epoch; it could be, say, at *our* epoch, pre-merge.
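+ // In practice that means the collision check below may be evaluated at the wrong epoch, so a PG that merged (or split) between the export's epoch and the OSD's epoch could be missed or flagged spuriously.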
+ coll_t coll(target); + if (store->collection_exists(coll)) { + cerr << "pgid " << pgid << " merges to target " << target + << " which already exists" << std::endl; + return 12; + } + } + + // check for split children + set<spg_t> children; + if (pgid.is_split(start_pg_num, new_pg_num, &children)) { + cerr << " children are " << children << std::endl; + for (auto child : children) { + coll_t coll(child); + if (store->collection_exists(coll)) { + cerr << "pgid " << pgid << " splits to " << children + << " and " << child << " exists" << std::endl; + return 12; + } + } + } + pg_num = new_pg_num; + } + } + } else { + cout << "pool " << pgid.pgid.pool() << " doesn't exist, not checking" + << " for splits or mergers" << std::endl; + } + + if (!dry_run) { + ObjectStore::Transaction t; + ch = store->create_new_collection(coll); + create_pg_collection( + t, pgid, + pgid.get_split_bits(ms.osdmap.get_pg_pool(pgid.pool())->get_pg_num())); + init_pg_ondisk(t, pgid, NULL); + + // mark this coll for removal until we're done + map<string, bufferlist> values; + encode((char)1, values["_remove"]); + t.omap_setkeys(coll, pgid.make_pgmeta_oid(), values); + + store->queue_transaction(ch, std::move(t)); + } + + break; + case TYPE_PG_END: + ceph_assert(found_metadata); + done = true; + break; + default: + cerr << "Unknown section type " << std::to_string(type) << std::endl; + return -EFAULT; + } + } + + if (!found_metadata) { + cerr << "Missing metadata section" << std::endl; + return -EFAULT; + } + + ObjectStore::Transaction t; + if (!dry_run) { + pg_log_t newlog, reject; + pg_log_t::filter_log(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace, + ms.log, newlog, reject); + if (debug) { + for (list<pg_log_entry_t>::iterator i = newlog.log.begin(); + i != newlog.log.end(); ++i) + cerr << "Keeping log entry " << *i << std::endl; + for (list<pg_log_entry_t>::iterator i = reject.log.begin(); + i != reject.log.end(); ++i) + cerr << "Skipping log entry " << *i << std::endl; + } + + divergent_priors_t newdp, rejectdp; + filter_divergent_priors(pgid, ms.osdmap, g_ceph_context->_conf->osd_hit_set_namespace, + ms.divergent_priors, newdp, rejectdp); + ms.divergent_priors = newdp; + if (debug) { + for (divergent_priors_t::iterator i = newdp.begin(); + i != newdp.end(); ++i) + cerr << "Keeping divergent_prior " << *i << std::endl; + for (divergent_priors_t::iterator i = rejectdp.begin(); + i != rejectdp.end(); ++i) + cerr << "Skipping divergent_prior " << *i << std::endl; + } + + ms.missing.filter_objects([&](const hobject_t &obj) { + if (obj.nspace == g_ceph_context->_conf->osd_hit_set_namespace) + return false; + ceph_assert(!obj.is_temp()); + object_t oid = obj.oid; + object_locator_t loc(obj); + pg_t raw_pgid = ms.osdmap.object_locator_to_pg(oid, loc); + pg_t _pgid = ms.osdmap.raw_pg_to_pg(raw_pgid); + + return pgid.pgid != _pgid; + }); + + + if (debug) { + pg_missing_t missing; + Formatter *formatter = Formatter::create("json-pretty"); + dump_log(formatter, cerr, newlog, ms.missing); + delete formatter; + } + + // Just like a split, invalidate stats since the object count is changed + if (skipped_objects) + ms.info.stats.stats_invalid = true; + + ret = write_pg( + t, + ms.map_epoch, + ms.info, + newlog, + ms.past_intervals, + ms.divergent_priors, + ms.missing); + if (ret) return ret; + } + + // done, clear removal flag + if (debug) + cerr << "done, clearing removal flag" << std::endl; + + if (!dry_run) { + t.omap_rmkey(coll, pgid.make_pgmeta_oid(), "_remove"); + wait_until_done(&t, [&] { + store->queue_transaction(ch, std::move(t)); + // make sure we flush
onreadable items before mapper/driver are destroyed. + ch->flush(); + }); + } + return 0; +} + +int do_list(ObjectStore *store, string pgidstr, string object, boost::optional<std::string> nspace, + Formatter *formatter, bool debug, bool human_readable, bool head) +{ + int r; + lookup_ghobject lookup(object, nspace, head); + if (pgidstr.length() > 0) { + r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug); + } else { + r = action_on_all_objects(store, lookup, debug); + } + if (r) + return r; + lookup.dump(formatter, human_readable); + formatter->flush(cout); + return 0; +} + +int do_list_slow(ObjectStore *store, string pgidstr, string object, + double threshold, Formatter *formatter, bool debug, bool human_readable) +{ + int r; + lookup_slow_ghobject lookup(object, threshold); + if (pgidstr.length() > 0) { + r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug); + } else { + r = action_on_all_objects(store, lookup, debug); + } + if (r) + return r; + lookup.dump(formatter, human_readable); + formatter->flush(cout); + return 0; +} + +int do_meta(ObjectStore *store, string object, Formatter *formatter, bool debug, bool human_readable) +{ + int r; + boost::optional<std::string> nspace; // Not specified + lookup_ghobject lookup(object, nspace); + r = action_on_all_objects_in_exact_pg(store, coll_t::meta(), lookup, debug); + if (r) + return r; + lookup.dump(formatter, human_readable); + formatter->flush(cout); + return 0; +} + +enum rmtype { + BOTH, + SNAPMAP, + NOSNAPMAP +}; + +int remove_object(coll_t coll, ghobject_t &ghobj, + SnapMapper &mapper, + MapCacher::Transaction<std::string, ceph::buffer::list> *_t, + ObjectStore::Transaction *t, + enum rmtype type) +{ + if (type == BOTH || type == SNAPMAP) { + int r = mapper.remove_oid(ghobj.hobj, _t); + if (r < 0 && r != -ENOENT) { + cerr << "remove_oid returned " << cpp_strerror(r) << std::endl; + return r; + } + } + + if (type == BOTH || type == NOSNAPMAP) { + t->remove(coll, ghobj); + } + return 0; +} + +int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent); + +int do_remove_object(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, bool all, bool force, enum rmtype type) +{ + auto ch = store->open_collection(coll); + spg_t pg; + coll.is_pg_prefix(&pg); + OSDriver driver( + store, + coll_t(), + OSD::make_snapmapper_oid()); + SnapMapper mapper(g_ceph_context, &driver, 0, 0, 0, pg.shard); + struct stat st; + + int r = store->stat(ch, ghobj, &st); + if (r < 0) { + cerr << "remove: " << cpp_strerror(r) << std::endl; + return r; + } + + SnapSet ss; + if (ghobj.hobj.has_snapset()) { + r = get_snapset(store, coll, ghobj, ss, false); + if (r < 0) { + cerr << "Can't get snapset error " << cpp_strerror(r) << std::endl; + // If --force and bad snapset let them remove the head + if (!(force && !all)) + return r; + } +// cout << "snapset " << ss << std::endl; + if (!ss.clone_snaps.empty() && !all) { + if (force) { + cout << "WARNING: only removing " + << (ghobj.hobj.is_head() ?
"head" : "snapdir") + << " with clones present" << std::endl; + ss.clone_snaps.clear(); + } else { + cerr << "Clones are present, use removeall to delete everything" + << std::endl; + return -EINVAL; + } + } + } + + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(driver.get_transaction(&t)); + + ghobject_t snapobj = ghobj; + for (auto& p : ss.clone_snaps) { + snapobj.hobj.snap = p.first; + cout << "remove clone " << snapobj << std::endl; + if (!dry_run) { + r = remove_object(coll, snapobj, mapper, &_t, &t, type); + if (r < 0) + return r; + } + } + + cout << "remove " << ghobj << std::endl; + + if (!dry_run) { + r = remove_object(coll, ghobj, mapper, &_t, &t, type); + if (r < 0) + return r; + } + + if (!dry_run) { + wait_until_done(&t, [&] { + store->queue_transaction(ch, std::move(t)); + ch->flush(); + }); + } + return 0; +} + +int do_list_attrs(ObjectStore *store, coll_t coll, ghobject_t &ghobj) +{ + auto ch = store->open_collection(coll); + map> aset; + int r = store->getattrs(ch, ghobj, aset); + if (r < 0) { + cerr << "getattrs: " << cpp_strerror(r) << std::endl; + return r; + } + + for (map::iterator i = aset.begin();i != aset.end(); ++i) { + string key(i->first); + if (outistty) + key = cleanbin(key); + cout << key << std::endl; + } + return 0; +} + +int do_list_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj) +{ + auto ch = store->open_collection(coll); + ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, ghobj); + if (!iter) { + cerr << "omap_get_iterator: " << cpp_strerror(ENOENT) << std::endl; + return -ENOENT; + } + iter->seek_to_first(); + map oset; + while(iter->valid()) { + get_omap_batch(iter, oset); + + for (map::iterator i = oset.begin();i != oset.end(); ++i) { + string key(i->first); + if (outistty) + key = cleanbin(key); + cout << key << std::endl; + } + } + return 0; +} + +int do_get_bytes(ObjectStore *store, coll_t coll, ghobject_t &ghobj, int fd) +{ + auto ch = store->open_collection(coll); + struct stat st; + mysize_t total; + + int ret = store->stat(ch, ghobj, &st); + if (ret < 0) { + cerr << "get-bytes: " << cpp_strerror(ret) << std::endl; + return ret; + } + + total = st.st_size; + if (debug) + cerr << "size=" << total << std::endl; + + uint64_t offset = 0; + bufferlist rawdatabl; + while(total > 0) { + rawdatabl.clear(); + mysize_t len = max_read; + if (len > total) + len = total; + + ret = store->read(ch, ghobj, offset, len, rawdatabl); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + + if (debug) + cerr << "data section offset=" << offset << " len=" << len << std::endl; + + total -= ret; + offset += ret; + + ret = write(fd, rawdatabl.c_str(), ret); + if (ret == -1) { + perror("write"); + return -errno; + } + } + + return 0; +} + +int do_set_bytes(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (debug) + cerr << "Write " << ghobj << std::endl; + + if (!dry_run) { + t->touch(coll, ghobj); + t->truncate(coll, ghobj, 0); + } + + uint64_t offset = 0; + bufferlist rawdatabl; + do { + rawdatabl.clear(); + ssize_t bytes = rawdatabl.read_fd(fd, max_read); + if (bytes < 0) { + cerr << "read_fd error " << cpp_strerror(bytes) << std::endl; + return bytes; + } + + if (bytes == 0) + break; + + if (debug) + cerr << "\tdata: offset " << offset << " bytes " << bytes << std::endl; + if (!dry_run) + t->write(coll, ghobj, offset, bytes, rawdatabl); + + offset += bytes; + // XXX: Should we queue_transaction() every once in a while for very 
large files + } while(true); + + auto ch = store->open_collection(coll); + if (!dry_run) + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_get_attr(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) +{ + auto ch = store->open_collection(coll); + bufferptr bp; + + int r = store->getattr(ch, ghobj, key.c_str(), bp); + if (r < 0) { + cerr << "getattr: " << cpp_strerror(r) << std::endl; + return r; + } + + string value(bp.c_str(), bp.length()); + if (outistty) { + value = cleanbin(value); + value.push_back('\n'); + } + cout << value; + + return 0; +} + +int do_set_attr(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + bufferlist bl; + + if (debug) + cerr << "Setattr " << ghobj << std::endl; + + int ret = get_fd_data(fd, bl); + if (ret < 0) + return ret; + + if (dry_run) + return 0; + + t->touch(coll, ghobj); + + t->setattr(coll, ghobj, key, bl); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_rm_attr(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (debug) + cerr << "Rmattr " << ghobj << std::endl; + + if (dry_run) + return 0; + + t->rmattr(coll, ghobj, key); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_get_omap(ObjectStore *store, coll_t coll, ghobject_t &ghobj, string key) +{ + auto ch = store->open_collection(coll); + set<string> keys; + map<string, bufferlist> out; + + keys.insert(key); + + int r = store->omap_get_values(ch, ghobj, keys, &out); + if (r < 0) { + cerr << "omap_get_values: " << cpp_strerror(r) << std::endl; + return r; + } + + if (out.empty()) { + cerr << "Key not found" << std::endl; + return -ENOENT; + } + + ceph_assert(out.size() == 1); + + bufferlist bl = out.begin()->second; + string value(bl.c_str(), bl.length()); + if (outistty) { + value = cleanbin(value); + value.push_back('\n'); + } + cout << value; + + return 0; +} + +int do_set_omap(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + map<string, bufferlist> attrset; + bufferlist valbl; + + if (debug) + cerr << "Set_omap " << ghobj << std::endl; + + int ret = get_fd_data(fd, valbl); + if (ret < 0) + return ret; + + attrset.insert(pair<string, bufferlist>(key, valbl)); + + if (dry_run) + return 0; + + t->touch(coll, ghobj); + + t->omap_setkeys(coll, ghobj, attrset); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_rm_omap(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, string key) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (debug) + cerr << "Rm_omap " << ghobj << std::endl; + + if (dry_run) + return 0; + + t->omap_rmkey(coll, ghobj, key); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +int do_get_omaphdr(ObjectStore *store, coll_t coll, ghobject_t &ghobj) +{ + auto ch = store->open_collection(coll); + bufferlist hdrbl; + + int r = store->omap_get_header(ch, ghobj, &hdrbl, true); + if (r < 0) { + cerr << "omap_get_header: " << cpp_strerror(r) << std::endl; + return r; + } + + string header(hdrbl.c_str(), hdrbl.length()); + if (outistty) { + header = cleanbin(header); + header.push_back('\n'); + } + cout << header; + + return 0; +} + +int
do_set_omaphdr(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, int fd) +{ + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + bufferlist hdrbl; + + if (debug) + cerr << "Omap_setheader " << ghobj << std::endl; + + int ret = get_fd_data(fd, hdrbl); + if (ret) + return ret; + + if (dry_run) + return 0; + + t->touch(coll, ghobj); + + t->omap_setheader(coll, ghobj, hdrbl); + + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(*t)); + return 0; +} + +struct do_fix_lost : public action_on_object_t { + void call(ObjectStore *store, coll_t coll, + ghobject_t &ghobj, object_info_t &oi) override { + if (oi.is_lost()) { + cout << coll << "/" << ghobj << " is lost"; + if (!dry_run) + cout << ", fixing"; + cout << std::endl; + if (dry_run) + return; + oi.clear_flag(object_info_t::FLAG_LOST); + bufferlist bl; + encode(oi, bl, -1); /* fixme: using full features */ + ObjectStore::Transaction t; + t.setattr(coll, ghobj, OI_ATTR, bl); + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(t)); + } + return; + } +}; + +int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false) +{ + auto ch = store->open_collection(coll); + bufferlist attr; + int r = store->getattr(ch, ghobj, SS_ATTR, attr); + if (r < 0) { + if (!silent) + cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + auto bp = attr.cbegin(); + try { + decode(ss, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + return 0; +} + +int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter) +{ + auto ch = store->open_collection(coll); + int r = 0; + formatter->open_object_section("obj"); + formatter->open_object_section("id"); + ghobj.dump(formatter); + formatter->close_section(); + + bufferlist attr; + int gr = store->getattr(ch, ghobj, OI_ATTR, attr); + if (gr < 0) { + r = gr; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } else { + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + formatter->open_object_section("info"); + oi.dump(formatter); + formatter->close_section(); + } catch (...) { + r = -EINVAL; + cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + } + struct stat st; + int sr = store->stat(ch, ghobj, &st, true); + if (sr < 0) { + r = sr; + cerr << "Error stat on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } else { + formatter->open_object_section("stat"); + formatter->dump_int("size", st.st_size); + formatter->dump_int("blksize", st.st_blksize); + formatter->dump_int("blocks", st.st_blocks); + formatter->dump_int("nlink", st.st_nlink); + formatter->close_section(); + } + + if (ghobj.hobj.has_snapset()) { + SnapSet ss; + int snr = get_snapset(store, coll, ghobj, ss); + if (snr < 0) { + r = snr; + } else { + formatter->open_object_section("SnapSet"); + ss.dump(formatter); + formatter->close_section(); + } + } + bufferlist hattr; + gr = store->getattr(ch, ghobj, ECUtil::get_hinfo_key(), hattr); + if (gr == 0) { + ECUtil::HashInfo hinfo; + auto hp = hattr.cbegin(); + try { + decode(hinfo, hp); + formatter->open_object_section("hinfo"); + hinfo.dump(formatter); + formatter->close_section(); + } catch (...) 
{ + r = -EINVAL; + cerr << "Error decoding hinfo on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + } + gr = store->dump_onode(ch, ghobj, "onode", formatter); + + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + return r; +} + +int corrupt_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter) +{ + auto ch = store->open_collection(coll); + bufferlist attr; + int r = store->getattr(ch, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (!dry_run) { + attr.clear(); + oi.alloc_hint_flags += 0xff; + ObjectStore::Transaction t; + encode(oi, attr, -1); /* fixme: using full features */ + t.setattr(coll, ghobj, OI_ATTR, attr); + auto ch = store->open_collection(coll); + r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int set_size( + ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter, + bool corrupt) +{ + auto ch = store->open_collection(coll); + if (ghobj.hobj.is_snapdir()) { + cerr << "Can't set the size of a snapdir" << std::endl; + return -EINVAL; + } + bufferlist attr; + int r = store->getattr(ch, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + struct stat st; + r = store->stat(ch, ghobj, &st, true); + if (r < 0) { + cerr << "Error stat on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + ghobject_t head(ghobj); + SnapSet ss; + bool found_head = true; + map<snapid_t, uint64_t>::iterator csi; + bool is_snap = ghobj.hobj.is_snap(); + if (is_snap) { + head.hobj = head.hobj.get_head(); + r = get_snapset(store, coll, head, ss, true); + if (r < 0 && r != -ENOENT) { + // Requested get_snapset() silent, so if not -ENOENT show error + cerr << "Error getting snapset on : " << make_pair(coll, head) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (r == -ENOENT) { + head.hobj = head.hobj.get_snapdir(); + r = get_snapset(store, coll, head, ss); + if (r < 0) + return r; + found_head = false; + } else { + found_head = true; + } + csi = ss.clone_size.find(ghobj.hobj.snap); + if (csi == ss.clone_size.end()) { + cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl; + return -EINVAL; + } + } + if ((uint64_t)st.st_size == setsize && oi.size == setsize + && (!is_snap || csi->second == setsize)) { + cout << "Size of object is already " << setsize << std::endl; + return 0; + } + cout << "Setting size to " << setsize << ", stat size " << st.st_size + << ", obj info size " << oi.size; + if (is_snap) { + cout << ", " << (found_head ?
"head" : "snapdir") + << " clone_size " << csi->second; + csi->second = setsize; + } + cout << std::endl; + if (!dry_run) { + attr.clear(); + oi.size = setsize; + ObjectStore::Transaction t; + // Only modify object info if we want to corrupt it + if (!corrupt && (uint64_t)st.st_size != setsize) { + t.truncate(coll, ghobj, setsize); + // Changing objectstore size will invalidate data_digest, so clear it. + oi.clear_data_digest(); + } + encode(oi, attr, -1); /* fixme: using full features */ + t.setattr(coll, ghobj, OI_ATTR, attr); + if (is_snap) { + bufferlist snapattr; + snapattr.clear(); + encode(ss, snapattr); + t.setattr(coll, head, SS_ATTR, snapattr); + } + auto ch = store->open_collection(coll); + r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int clear_data_digest(ObjectStore *store, coll_t coll, ghobject_t &ghobj) { + auto ch = store->open_collection(coll); + bufferlist attr; + int r = store->getattr(ch, ghobj, OI_ATTR, attr); + if (r < 0) { + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + object_info_t oi; + auto bp = attr.cbegin(); + try { + decode(oi, bp); + } catch (...) { + r = -EINVAL; + cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + if (!dry_run) { + attr.clear(); + oi.clear_data_digest(); + encode(oi, attr, -1); /* fixme: using full features */ + ObjectStore::Transaction t; + t.setattr(coll, ghobj, OI_ATTR, attr); + auto ch = store->open_collection(coll); + r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, + string arg) +{ + SnapSet ss; + int ret = get_snapset(store, coll, ghobj, ss); + if (ret < 0) + return ret; + + // Use "corrupt" to clear entire SnapSet + // Use "seq" to just corrupt SnapSet.seq + if (arg == "corrupt" || arg == "seq") + ss.seq = 0; + // Use "snaps" to just clear SnapSet.clone_snaps + if (arg == "corrupt" || arg == "snaps") + ss.clone_snaps.clear(); + // By default just clear clone, clone_overlap and clone_size + if (arg == "corrupt") + arg = ""; + if (arg == "" || arg == "clones") + ss.clones.clear(); + if (arg == "" || arg == "clone_overlap") + ss.clone_overlap.clear(); + if (arg == "" || arg == "clone_size") + ss.clone_size.clear(); + // Break all clone sizes by adding 1 + if (arg == "size") { + for (map::iterator i = ss.clone_size.begin(); + i != ss.clone_size.end(); ++i) + ++(i->second); + } + + if (!dry_run) { + bufferlist bl; + encode(ss, bl); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, SS_ATTR, bl); + auto ch = store->open_collection(coll); + int r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + } + return 0; +} + +vector::iterator find(vector &v, snapid_t clid) +{ + return std::find(v.begin(), v.end(), clid); +} + +map >::iterator +find(map > &m, snapid_t clid) +{ + return m.find(clid); +} + +map::iterator find(map &m, + snapid_t clid) +{ + return m.find(clid); +} + +template +int remove_from(T &mv, string name, snapid_t cloneid, bool force) +{ + typename T::iterator 
i = find(mv, cloneid); + if (i != mv.end()) { + mv.erase(i); + } else { + cerr << "Clone " << cloneid << " doesn't exist in " << name; + if (force) { + cerr << " (ignored)" << std::endl; + return 0; + } + cerr << std::endl; + return -EINVAL; + } + return 0; +} + +int remove_clone( + ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force) +{ + // XXX: Don't allow this if in a cache tier or former cache tier + // bool allow_incomplete_clones() const { + // return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES); + + SnapSet snapset; + int ret = get_snapset(store, coll, ghobj, snapset); + if (ret < 0) + return ret; + + // Derived from trim_object() + // ...from snapset + vector<snapid_t>::iterator p; + for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p) + if (*p == cloneid) + break; + if (p == snapset.clones.end()) { + cerr << "Clone " << cloneid << " not present"; + return -ENOENT; + } + if (p != snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + vector<snapid_t>::iterator n = p - 1; + hobject_t prev_coid = ghobj.hobj; + prev_coid.snap = *n; + //bool adjust_prev_bytes = is_present_clone(prev_coid); + + //if (adjust_prev_bytes) + // ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n); + + snapset.clone_overlap[*n].intersection_of( + snapset.clone_overlap[*p]); + + //if (adjust_prev_bytes) + // ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n); + } + + ret = remove_from(snapset.clones, "clones", cloneid, force); + if (ret) return ret; + ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force); + if (ret) return ret; + ret = remove_from(snapset.clone_size, "clone_size", cloneid, force); + if (ret) return ret; + + if (dry_run) + return 0; + + bufferlist bl; + encode(snapset, bl); + ObjectStore::Transaction t; + t.setattr(coll, ghobj, SS_ATTR, bl); + auto ch = store->open_collection(coll); + int r = store->queue_transaction(ch, std::move(t)); + if (r < 0) { + cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + return r; + } + cout << "Removal of clone " << cloneid << " complete" << std::endl; + cout << "Use pg repair after OSD restarted to correct stat information" << std::endl; + return 0; +} + +int dup(string srcpath, ObjectStore *src, string dstpath, ObjectStore *dst) +{ + cout << "dup from " << src->get_type() << ": " << srcpath << "\n" + << " to " << dst->get_type() << ": " << dstpath + << std::endl; + int num, i; + vector<coll_t> collections; + int r; + + r = src->mount(); + if (r < 0) { + cerr << "failed to mount src: " << cpp_strerror(r) << std::endl; + return r; + } + r = dst->mount(); + if (r < 0) { + cerr << "failed to mount dst: " << cpp_strerror(r) << std::endl; + goto out_src; + } + + if (src->get_fsid() != dst->get_fsid()) { + cerr << "src fsid " << src->get_fsid() << " != dest " << dst->get_fsid() + << std::endl; + goto out; + } + cout << "fsid " << src->get_fsid() << std::endl; + + // make sure dst is empty + r = dst->list_collections(collections); + if (r < 0) { + cerr << "error listing collections on dst: " << cpp_strerror(r) << std::endl; + goto out; + } + if (!collections.empty()) { + cerr << "destination store is not empty" << std::endl; + goto out; + } + + r = src->list_collections(collections); + if (r < 0) { + cerr << "error listing collections on src: " << cpp_strerror(r) << std::endl; + goto out; + } + + num = collections.size(); + cout << num << " collections" << std::endl; + i = 1; + for (auto cid : collections) { +
cout << i++ << "/" << num << " " << cid << std::endl; + auto ch = src->open_collection(cid); + auto dch = dst->create_new_collection(cid); + { + ObjectStore::Transaction t; + int bits = src->collection_bits(ch); + if (bits < 0) { + if (src->get_type() == "filestore" && cid.is_meta()) { + bits = 0; + } else { + cerr << "cannot get bit count for collection " << cid << ": " + << cpp_strerror(bits) << std::endl; + goto out; + } + } + t.create_collection(cid, bits); + dst->queue_transaction(dch, std::move(t)); + } + + ghobject_t pos; + uint64_t n = 0; + uint64_t bytes = 0, keys = 0; + while (true) { + vector<ghobject_t> ls; + r = src->collection_list(ch, pos, ghobject_t::get_max(), 1000, &ls, &pos); + if (r < 0) { + cerr << "collection_list on " << cid << " from " << pos << " got: " + << cpp_strerror(r) << std::endl; + goto out; + } + if (ls.empty()) { + break; + } + + for (auto& oid : ls) { + //cout << " " << cid << " " << oid << std::endl; + if (n % 100 == 0) { + cout << " " << std::setw(16) << n << " objects, " + << std::setw(16) << bytes << " bytes, " + << std::setw(16) << keys << " keys" + << std::setw(1) << "\r" << std::flush; + } + n++; + + ObjectStore::Transaction t; + t.touch(cid, oid); + + map<string, bufferptr, less<>> attrs; + src->getattrs(ch, oid, attrs); + if (!attrs.empty()) { + t.setattrs(cid, oid, attrs); + } + + bufferlist bl; + src->read(ch, oid, 0, 0, bl); + if (bl.length()) { + t.write(cid, oid, 0, bl.length(), bl); + bytes += bl.length(); + } + + bufferlist header; + map<string, bufferlist> omap; + src->omap_get(ch, oid, &header, &omap); + if (header.length()) { + t.omap_setheader(cid, oid, header); + ++keys; + } + if (!omap.empty()) { + keys += omap.size(); + t.omap_setkeys(cid, oid, omap); + } + + dst->queue_transaction(dch, std::move(t)); + } + } + cout << " " << std::setw(16) << n << " objects, " + << std::setw(16) << bytes << " bytes, " + << std::setw(16) << keys << " keys" + << std::setw(1) << std::endl; + } + + // keyring + cout << "keyring" << std::endl; + { + bufferlist bl; + string s = srcpath + "/keyring"; + string err; + r = bl.read_file(s.c_str(), &err); + if (r < 0) { + cerr << "failed to copy " << s << ": " << err << std::endl; + } else { + string d = dstpath + "/keyring"; + bl.write_file(d.c_str(), 0600); + } + } + + // osd metadata + cout << "duping osd metadata" << std::endl; + { + for (auto k : {"magic", "whoami", "ceph_fsid", "fsid"}) { + string val; + src->read_meta(k, &val); + dst->write_meta(k, val); + } + } + + dst->write_meta("ready", "ready"); + + cout << "done."
<< std::endl; + r = 0; + out: + dst->umount(); + out_src: + src->umount(); + return r; +} + + +const int ceph_entity_name_type(const string name) +{ + if (name == "mds") return CEPH_ENTITY_TYPE_MDS; + if (name == "osd") return CEPH_ENTITY_TYPE_OSD; + if (name == "mon") return CEPH_ENTITY_TYPE_MON; + if (name == "client") return CEPH_ENTITY_TYPE_CLIENT; + if (name == "mgr") return CEPH_ENTITY_TYPE_MGR; + if (name == "auth") return CEPH_ENTITY_TYPE_AUTH; + return -1; +} + +eversion_t get_eversion_from_str(const string& s) { + eversion_t e; + vector<string> result; + boost::split(result, s, boost::is_any_of("'")); + if (result.size() != 2) { + cerr << "eversion_t: invalid format: '" << s << "'" << std::endl; + return e; + } + e.epoch = atoi(result[0].c_str()); + e.version = atoi(result[1].c_str()); + return e; +} + +osd_reqid_t get_reqid_from_str(const string& s) { + osd_reqid_t reqid; + + vector<string> result; + boost::split(result, s, boost::is_any_of(".:")); + if (result.size() != 4) { + cerr << "reqid: invalid format " << s << std::endl; + return osd_reqid_t(); + } + reqid.name._type = ceph_entity_name_type(result[0]); + reqid.name._num = atoi(result[1].c_str()); + + reqid.inc = atoi(result[2].c_str()); + reqid.tid = atoi(result[3].c_str()); + return reqid; +} + +void do_dups_inject_transction(ObjectStore *store, spg_t r_pgid, map<string, bufferlist> *new_dups) +{ + ObjectStore::Transaction t; + coll_t coll(r_pgid); + cerr << "injecting dups into pgid:" << r_pgid << " num of dups:" << new_dups->size() << std::endl; + t.omap_setkeys(coll, r_pgid.make_pgmeta_oid(), (*new_dups)); + auto ch = store->open_collection(coll); + store->queue_transaction(ch, std::move(t)); + new_dups->clear(); +} + +int do_dups_inject_object(ObjectStore *store, spg_t r_pgid, json_spirit::mObject &in_json_obj, + map<string, bufferlist> *new_dups, bool debug) { + std::map<std::string, json_spirit::mValue>::const_iterator it = in_json_obj.find("generate"); + int32_t generate = 0; + if (it != in_json_obj.end()) { + generate = atoi(it->second.get_str().c_str()); + } + + it = in_json_obj.find("reqid"); + if (it == in_json_obj.end()) { + return 1; + } + osd_reqid_t reqid(get_reqid_from_str(it->second.get_str())); + it = in_json_obj.find("version"); + if (it == in_json_obj.end()) { + return 1; + } + eversion_t version(get_eversion_from_str(it->second.get_str())); + it = in_json_obj.find("user_version"); + if (it == in_json_obj.end()) { + return 1; + } + version_t user_version = atoi(it->second.get_str().c_str()); + it = in_json_obj.find("return_code"); + if (it == in_json_obj.end()) { + return 1; + } + int32_t return_code = atoi(it->second.get_str().c_str()); + if (generate) { + for(auto i = 0; i < generate; ++i) { + version.version++; + if (debug) { + cout << "generate dups reqid " << reqid << " v=" << version << std::endl; + } + pg_log_dup_t tmp(version, user_version, reqid, return_code); + bufferlist bl; + encode(tmp, bl); + (*new_dups)[tmp.get_key_name()] = std::move(bl); + if ( new_dups->size() > 50000 ) { + do_dups_inject_transction(store, r_pgid, new_dups); + cout << "inject of " << i << " dups into pgid:" << r_pgid << " done..."
<< std::endl; + } + } + return 0; + } else { + pg_log_dup_t tmp(version, user_version, reqid, return_code); + if (debug) { + cout << "adding dup: " << tmp << "into key:" << tmp.get_key_name() << std::endl; + } + bufferlist bl; + encode(tmp, bl); + (*new_dups)[tmp.get_key_name()] = std::move(bl); + } + return 0; +} + +void do_dups_inject_from_json(ObjectStore *store, spg_t r_pgid, json_spirit::mValue &inJson, bool debug) +{ + map<string, bufferlist> new_dups; + const vector<json_spirit::mValue>& o = inJson.get_array(); + for (const auto& obj : o) { + if (obj.type() == json_spirit::obj_type) { + json_spirit::mObject Mobj = obj.get_obj(); + do_dups_inject_object(store, r_pgid, Mobj, &new_dups, debug); + } else { + throw std::runtime_error("JSON array/object not allowed type:" + std::to_string(obj.type())); + return; + } + } + if (new_dups.size() > 0) { + do_dups_inject_transction(store, r_pgid, &new_dups); + } + + + return ; +} + +void usage(po::options_description &desc) +{ + cerr << std::endl; + cerr << desc << std::endl; + cerr << std::endl; + cerr << "Positional syntax:" << std::endl; + cerr << std::endl; + cerr << "ceph-objectstore-tool ... <object> (get|set)-bytes [file]" << std::endl; + cerr << "ceph-objectstore-tool ... <object> set-(attr|omap) <key> [file]" << std::endl; + cerr << "ceph-objectstore-tool ... <object> (get|rm)-(attr|omap) <key>" << std::endl; + cerr << "ceph-objectstore-tool ... <object> get-omaphdr" << std::endl; + cerr << "ceph-objectstore-tool ... <object> set-omaphdr [file]" << std::endl; + cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl; + cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl; + cerr << "ceph-objectstore-tool ... <object> remove|removeall" << std::endl; + cerr << "ceph-objectstore-tool ... <object> dump" << std::endl; + cerr << "ceph-objectstore-tool ... <object> set-size" << std::endl; + cerr << "ceph-objectstore-tool ... <object> clear-data-digest" << std::endl; + cerr << "ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>" << std::endl; + cerr << std::endl; + cerr << "<object> can be a JSON object description as displayed" << std::endl; + cerr << "by --op list." << std::endl; + cerr << "<object> can be an object name which will be looked up in all" << std::endl; + cerr << "the OSD's PGs." << std::endl; + cerr << "<object> can be the empty string ('') which with a provided pgid " << std::endl; + cerr << "specifies the pgmeta object" << std::endl; + cerr << std::endl; + cerr << "The optional [file] argument will read stdin or write stdout" << std::endl; + cerr << "if not specified or if '-' specified."
<< std::endl; +} + +bool ends_with(const string& check, const string& ending) +{ + return check.size() >= ending.size() && check.rfind(ending) == (check.size() - ending.size()); +} + +int main(int argc, char **argv) +{ + string dpath, jpath, pgidstr, op, file, mountpoint, mon_store_path, object; + string target_data_path, fsid; + string objcmd, arg1, arg2, type, format, argnspace, pool, rmtypestr; + boost::optional<std::string> nspace; + spg_t pgid; + unsigned epoch = 0; + unsigned slow_threshold = 16; + ghobject_t ghobj; + bool human_readable; + Formatter *formatter; + bool head, tty; + + po::options_description desc("Allowed options"); + desc.add_options() + ("help", "produce help message") + ("type", po::value<string>(&type), + "Arg is one of [bluestore (default), memstore]") + ("data-path", po::value<string>(&dpath), + "path to object store, mandatory") + ("journal-path", po::value<string>(&jpath), + "path to journal, use if tool can't find it") + ("pgid", po::value<string>(&pgidstr), + "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, trim-pg-log-dups") + ("pool", po::value<string>(&pool), + "Pool name") + ("op", po::value<string>(&op), + "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, list-slow-omap, fix-lost, list-pgs, dump-super, meta-list, " + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, update-mon-db, dump-export, trim-pg-log, trim-pg-log-dups, statfs]") + ("epoch", po::value<unsigned>(&epoch), + "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified") + ("file", po::value<string>(&file), + "path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap") + ("mon-store-path", po::value<string>(&mon_store_path), + "path of monstore to update-mon-db") + ("fsid", po::value<string>(&fsid), + "fsid for new store created by mkfs") + ("target-data-path", po::value<string>(&target_data_path), + "path of target object store (for --op dup)") + ("mountpoint", po::value<string>(&mountpoint), + "fuse mountpoint") + ("format", po::value<string>(&format)->default_value("json-pretty"), + "Output format which may be json, json-pretty, xml, xml-pretty") + ("debug", "Enable diagnostic output to stderr") + ("no-mon-config", "Do not contact mons for config") + ("no-superblock", "Do not read superblock") + ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE") + ("skip-journal-replay", "Disable journal replay") + ("skip-mount-omap", "Disable mounting of omap") + ("head", "Find head/snapdir when searching for objects by name") + ("dry-run", "Don't modify the objectstore") + ("tty", "Treat stdout as a tty (no binary data)") + ("namespace", po::value<string>(&argnspace), "Specify namespace when searching for objects") + ("rmtype", po::value<string>(&rmtypestr), "Specify corrupting object removal 'snapmap' or 'nosnapmap' - TESTING USE ONLY") + ("slow-omap-threshold", po::value<unsigned>(&slow_threshold), + "Threshold (in seconds) to consider omap listing slow (for op=list-slow-omap)") + ; + + po::options_description positional("Positional options"); + positional.add_options() + ("object", po::value<string>(&object), "'' for pgmeta_oid, object name or ghobject in json") + ("objcmd", po::value<string>(&objcmd), "command [(get|set)-bytes, (get|set|rm)-(attr|omap), (get|set)-omaphdr, list-attrs, list-omap, remove]") + ("arg1", po::value<string>(&arg1), "arg1 based on cmd") + ("arg2", po::value<string>(&arg2), "arg2 based on cmd") + ; + + po::options_description all; + all.add(desc).add(positional);
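+ // Note: "all" is used only for parsing; usage() prints just "desc", so the positional arguments are documented by hand in usage() above. A typical invocation looks like (illustrative paths): ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 --pgid 0.7 --op info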
+ + po::positional_options_description pd; + pd.add("object", 1).add("objcmd", 1).add("arg1", 1).add("arg2", 1); + + vector<string> ceph_option_strings; + + po::variables_map vm; + try { + po::parsed_options parsed = + po::command_line_parser(argc, argv).options(all).allow_unregistered().positional(pd).run(); + po::store( parsed, vm); + po::notify(vm); + ceph_option_strings = po::collect_unrecognized(parsed.options, + po::include_positional); + } catch(po::error &e) { + std::cerr << e.what() << std::endl; + return 1; + } + + if (vm.count("help")) { + usage(desc); + return 1; + } + + // Compatibility with previous option name + if (op == "dump-import") + op = "dump-export"; + + debug = (vm.count("debug") > 0); + + force = (vm.count("force") > 0); + + no_superblock = (vm.count("no-superblock") > 0); + + if (vm.count("namespace")) + nspace = argnspace; + + dry_run = (vm.count("dry-run") > 0); + tty = (vm.count("tty") > 0); + + osflagbits_t flags = 0; + if (dry_run || vm.count("skip-journal-replay")) + flags |= SKIP_JOURNAL_REPLAY; + if (vm.count("skip-mount-omap")) + flags |= SKIP_MOUNT_OMAP; + if (op == "update-mon-db") + flags |= SKIP_JOURNAL_REPLAY; + + head = (vm.count("head") > 0); + + // infer osd id so we can authenticate + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/whoami", dpath.c_str()); + int fd = ::open(fn, O_RDONLY); + if (fd >= 0) { + bufferlist bl; + bl.read_fd(fd, 64); + string s(bl.c_str(), bl.length()); + int whoami = atoi(s.c_str()); + vector<string> tmp; + // identify ourselves as this osd so we can auth and fetch our configs + tmp.push_back("-n"); + tmp.push_back(string("osd.") + stringify(whoami)); + // populate osd_data so that the default keyring location works + tmp.push_back("--osd-data"); + tmp.push_back(dpath); + tmp.insert(tmp.end(), ceph_option_strings.begin(), + ceph_option_strings.end()); + tmp.swap(ceph_option_strings); + } + + vector<const char *> ceph_options; + ceph_options.reserve(ceph_options.size() + ceph_option_strings.size()); + for (vector<string>::iterator i = ceph_option_strings.begin(); + i != ceph_option_strings.end(); + ++i) { + ceph_options.push_back(i->c_str()); + } + + snprintf(fn, sizeof(fn), "%s/type", dpath.c_str()); + fd = ::open(fn, O_RDONLY); + if (fd >= 0) { + bufferlist bl; + bl.read_fd(fd, 64); + if (bl.length()) { + string dp_type = string(bl.c_str(), bl.length() - 1); // drop \n + if (vm.count("type") && dp_type != "" && type != dp_type) + cerr << "WARNING: Ignoring type \"" << type << "\" - found data-path type \"" + << dp_type << "\"" << std::endl; + type = dp_type; + //cout << "object store type is " << type << std::endl; + } + ::close(fd); + } + if (!vm.count("type") && type == "") { + type = "bluestore"; + } + if (!vm.count("data-path") && + op != "dump-export") { + cerr << "Must provide --data-path" << std::endl; + usage(desc); + return 1; + } + if (!vm.count("op") && !vm.count("object")) { + cerr << "Must provide --op or object command..."
<< std::endl; + usage(desc); + return 1; + } + if (op == "fuse" && mountpoint.length() == 0) { + cerr << "Missing fuse mountpoint" << std::endl; + usage(desc); + return 1; + } + outistty = isatty(STDOUT_FILENO) || tty; + + file_fd = fd_none; + if ((op == "export" || op == "export-remove" || op == "get-osdmap" || op == "get-inc-osdmap") && !dry_run) { + if (!vm.count("file") || file == "-") { + if (outistty) { + cerr << "stdout is a tty and no --file filename specified" << std::endl; + return 1; + } + file_fd = STDOUT_FILENO; + } else { + file_fd = open(file.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666); + } + } else if (op == "import" || op == "dump-export" || op == "set-osdmap" || op == "set-inc-osdmap" || op == "pg-log-inject-dups") { + if (!vm.count("file") || file == "-") { + if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no --file filename specified" << std::endl; + return 1; + } + file_fd = STDIN_FILENO; + } else { + file_fd = open(file.c_str(), O_RDONLY); + } + } + + ObjectStoreTool tool = ObjectStoreTool(file_fd, dry_run); + + if (vm.count("file") && file_fd == fd_none && !dry_run) { + cerr << "--file option only applies to import, dump-export, export, export-remove, " + << "get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap" << std::endl; + return 1; + } + + if (file_fd != fd_none && file_fd < 0) { + string err = string("file: ") + file; + perror(err.c_str()); + return 1; + } + int init_flags = 0; + if (vm.count("no-mon-config") > 0) { + init_flags |= CINIT_FLAG_NO_MON_CONFIG; + } + + auto cct = global_init( + NULL, ceph_options, + CEPH_ENTITY_TYPE_OSD, + CODE_ENVIRONMENT_UTILITY_NODOUT, + init_flags); + common_init_finish(g_ceph_context); + if (debug) { + g_conf().set_val_or_die("log_to_stderr", "true"); + g_conf().set_val_or_die("err_to_stderr", "true"); + } + g_conf().apply_changes(nullptr); + + // Special list handling. Treating pretty_format as human readable, + // with one object per line and not an enclosing array. 
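+ // e.g. the default --format json-pretty with --op list becomes format "json" with human_readable set, so each object is printed as one compact JSON line instead of a pretty-printed enclosing array.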
+ human_readable = ends_with(format, "-pretty"); + if ((op == "list" || op == "meta-list") && human_readable) { + // Remove -pretty from end of format which we know is there + format = format.substr(0, format.size() - strlen("-pretty")); + } + + formatter = Formatter::create(format); + if (formatter == NULL) { + cerr << "unrecognized format: " << format << std::endl; + return 1; + } + + if (op == "dump-export") { + int ret = tool.dump_export(formatter); + if (ret < 0) { + cerr << "dump-export: " + << cpp_strerror(ret) << std::endl; + return 1; + } + return 0; + } + + //Verify that data-path really exists + struct stat st; + if (::stat(dpath.c_str(), &st) == -1) { + string err = string("data-path: ") + dpath; + perror(err.c_str()); + return 1; + } + + if (pgidstr.length() && pgidstr != "meta" && !pgid.parse(pgidstr.c_str())) { + cerr << "Invalid pgid '" << pgidstr << "' specified" << std::endl; + return 1; + } + + std::unique_ptr<ObjectStore> fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags); + if (!fs) { + cerr << "Unable to create store of type " << type << std::endl; + return 1; + } + + if (op == "fsck" || op == "fsck-deep") { + int r = fs->fsck(op == "fsck-deep"); + if (r < 0) { + cerr << "fsck failed: " << cpp_strerror(r) << std::endl; + return 1; + } + if (r > 0) { + cerr << "fsck status: " << r << " remaining error(s) and warning(s)" << std::endl; + return 1; + } + cout << "fsck success" << std::endl; + return 0; + } + if (op == "repair" || op == "repair-deep") { + int r = fs->repair(op == "repair-deep"); + if (r < 0) { + cerr << "repair failed: " << cpp_strerror(r) << std::endl; + return 1; + } + if (r > 0) { + cerr << "repair status: " << r << " remaining error(s) and warning(s)" << std::endl; + return 1; + } + cout << "repair success" << std::endl; + return 0; + } + if (op == "mkfs") { + if (fsid.length()) { + uuid_d f; + bool r = f.parse(fsid.c_str()); + if (!r) { + cerr << "failed to parse uuid '" << fsid << "'" << std::endl; + return 1; + } + fs->set_fsid(f); + } + int r = fs->mkfs(); + if (r < 0) { + cerr << "mkfs failed: " << cpp_strerror(r) << std::endl; + return 1; + } + return 0; + } + if (op == "dup") { + string target_type; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/type", target_data_path.c_str()); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + cerr << "Unable to open " << target_data_path << "/type" << std::endl; + exit(1); + } + bufferlist bl; + bl.read_fd(fd, 64); + if (bl.length()) { + target_type = string(bl.c_str(), bl.length() - 1); // drop \n + } + ::close(fd); + unique_ptr<ObjectStore> targetfs = ObjectStore::create( + g_ceph_context, target_type, + target_data_path, "", 0); + if (!targetfs) { + cerr << "Unable to open store of type " << target_type << std::endl; + return 1; + } + int r = dup(dpath, fs.get(), target_data_path, targetfs.get()); + if (r < 0) { + cerr << "dup failed: " << cpp_strerror(r) << std::endl; + return 1; + } + return 0; + } + + int ret = fs->mount(); + if (ret < 0) { + if (ret == -EBUSY) { + cerr << "OSD has the store locked" << std::endl; + } else { + cerr << "Mount failed with '" << cpp_strerror(ret) << "'" << std::endl; + } + return 1; + } + + if (op == "fuse") { +#ifdef HAVE_LIBFUSE + FuseStore fuse(fs.get(), mountpoint); + cout << "mounting fuse at " << mountpoint << " ..."
<< std::endl; + int r = fuse.main(); + fs->umount(); + if (r < 0) { + cerr << "failed to mount fuse: " << cpp_strerror(r) << std::endl; + return 1; + } +#else + cerr << "fuse support not enabled" << std::endl; +#endif + return 0; + } + + vector ls; + vector::iterator it; + CompatSet supported; + +#ifdef INTERNAL_TEST + supported = get_test_compat_set(); +#else + supported = OSD::get_osd_compat_set(); +#endif + + bufferlist bl; + auto ch = fs->open_collection(coll_t::meta()); + std::unique_ptr superblock; + if (!no_superblock) { + superblock.reset(new OSDSuperblock); + bufferlist::const_iterator p; + ret = fs->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl); + if (ret < 0) { + cerr << "Failure to read OSD superblock: " << cpp_strerror(ret) << std::endl; + goto out; + } + + p = bl.cbegin(); + decode(*superblock, p); + + if (debug) { + cerr << "Cluster fsid=" << superblock->cluster_fsid << std::endl; + } + + if (debug) { + cerr << "Supported features: " << supported << std::endl; + cerr << "On-disk features: " << superblock->compat_features << std::endl; + } + if (supported.compare(superblock->compat_features) == -1) { + CompatSet unsupported = supported.unsupported(superblock->compat_features); + cerr << "On-disk OSD incompatible features set " + << unsupported << std::endl; + ret = -EINVAL; + goto out; + } + } + + if (op != "list" && vm.count("object")) { + // Special case: Create pgmeta_oid if empty string specified + // This can't conflict with any actual object names. + if (object == "") { + ghobj = pgid.make_pgmeta_oid(); + } else { + json_spirit::Value v; + try { + if (!json_spirit::read(object, v) || + (v.type() != json_spirit::array_type && v.type() != json_spirit::obj_type)) { + // Special: Need head/snapdir so set even if user didn't specify + if (vm.count("objcmd") && (objcmd == "remove-clone-metadata")) + head = true; + lookup_ghobject lookup(object, nspace, head); + if (pgidstr == "meta") + ret = action_on_all_objects_in_exact_pg(fs.get(), coll_t::meta(), lookup, debug); + else if (pgidstr.length()) + ret = action_on_all_objects_in_exact_pg(fs.get(), coll_t(pgid), lookup, debug); + else + ret = action_on_all_objects(fs.get(), lookup, debug); + if (ret) { + throw std::runtime_error("Internal error"); + } else { + if (lookup.size() != 1) { + stringstream ss; + if (lookup.size() == 0) + ss << "No object id '" << object << "' found or invalid JSON specified"; + else + ss << "Found " << lookup.size() << " objects with id '" << object + << "', please use a JSON spec from --op list instead"; + throw std::runtime_error(ss.str()); + } + pair found = lookup.pop(); + pgidstr = found.first.to_str(); + pgid.parse(pgidstr.c_str()); + ghobj = found.second; + } + } else { + stringstream ss; + if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) { + ss << "Without --pgid the object '" << object + << "' must be a JSON array"; + throw std::runtime_error(ss.str()); + } + if (v.type() == json_spirit::array_type) { + json_spirit::Array array = v.get_array(); + if (array.size() != 2) { + ss << "Object '" << object + << "' must be a JSON array with 2 elements"; + throw std::runtime_error(ss.str()); + } + vector::iterator i = array.begin(); + ceph_assert(i != array.end()); + if (i->type() != json_spirit::str_type) { + ss << "Object '" << object + << "' must be a JSON array with the first element a string"; + throw std::runtime_error(ss.str()); + } + string object_pgidstr = i->get_str(); + if (object_pgidstr != "meta") { + spg_t object_pgid; + object_pgid.parse(object_pgidstr.c_str()); + if 
(pgidstr.length() > 0) { + if (object_pgid != pgid) { + ss << "object '" << object + << "' has a pgid different from the --pgid=" + << pgidstr << " option"; + throw std::runtime_error(ss.str()); + } + } else { + pgidstr = object_pgidstr; + pgid = object_pgid; + } + } else { + pgidstr = object_pgidstr; + } + ++i; + v = *i; + } + try { + ghobj.decode(v); + } catch (std::runtime_error& e) { + ss << "Decode object JSON error: " << e.what(); + throw std::runtime_error(ss.str()); + } + if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) { + cerr << "Object pool and pgid pool don't match" << std::endl; + ret = 1; + goto out; + } + if (pgidstr != "meta") { + auto ch = fs->open_collection(coll_t(pgid)); + if (!ghobj.match(fs->collection_bits(ch), pgid.ps())) { + stringstream ss; + ss << "object " << ghobj << " not contained by pg " << pgid; + throw std::runtime_error(ss.str()); + } + } + } + } catch (std::runtime_error& e) { + cerr << e.what() << std::endl; + ret = 1; + goto out; + } + } + } + + // The ops which require --pgid option are checked here and + // mentioned in the usage for --pgid. + if ((op == "info" || op == "log" || op == "remove" || op == "export" + || op == "export-remove" || op == "mark-complete" + || op == "reset-last-complete" + || op == "trim-pg-log" + || op == "pg-log-inject-dups") && + pgidstr.length() == 0) { + cerr << "Must provide pgid" << std::endl; + usage(desc); + ret = 1; + goto out; + } + + if (op == "import") { + ceph_assert(superblock != nullptr); + try { + ret = tool.do_import(fs.get(), *superblock, force, pgidstr); + } + catch (const buffer::error &e) { + cerr << "do_import threw exception error " << e.what() << std::endl; + ret = -EFAULT; + } + if (ret == -EFAULT) { + cerr << "Corrupt input for import" << std::endl; + } + if (ret == 0) + cout << "Import successful" << std::endl; + goto out; + } else if (op == "dump-journal-mount") { + // Undocumented feature to dump journal with mounted fs + // This doesn't support the format option, but it uses the + // ObjectStore::dump_journal() and mounts to get replay to run. + ret = fs->dump_journal(cout); + if (ret) { + if (ret == -EOPNOTSUPP) { + cerr << "Object store type \"" << type << "\" doesn't support journal dump" << std::endl; + } else { + cerr << "Journal dump failed with error " << cpp_strerror(ret) << std::endl; + } + } + goto out; + } else if (op == "get-osdmap") { + bufferlist bl; + OSDMap osdmap; + if (epoch == 0) { + ceph_assert(superblock != nullptr); + epoch = superblock->current_epoch; + } + ret = get_osdmap(fs.get(), epoch, osdmap, bl); + if (ret) { + cerr << "Failed to get osdmap#" << epoch << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = bl.write_fd(file_fd); + if (ret) { + cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl; + } else { + cout << "osdmap#" << epoch << " exported." 
<< std::endl; + } + goto out; + } else if (op == "set-osdmap") { + bufferlist bl; + ret = get_fd_data(file_fd, bl); + if (ret < 0) { + cerr << "Failed to read osdmap " << cpp_strerror(ret) << std::endl; + } else { + ret = set_osdmap(fs.get(), epoch, bl, force); + } + goto out; + } else if (op == "get-inc-osdmap") { + bufferlist bl; + if (epoch == 0) { + ceph_assert(superblock != nullptr); + epoch = superblock->current_epoch; + } + ret = get_inc_osdmap(fs.get(), epoch, bl); + if (ret < 0) { + cerr << "Failed to get incremental osdmap# " << epoch << ": " + << cpp_strerror(ret) << std::endl; + goto out; + } + ret = bl.write_fd(file_fd); + if (ret) { + cerr << "Failed to write to " << file << ": " << cpp_strerror(ret) << std::endl; + } else { + cout << "inc-osdmap#" << epoch << " exported." << std::endl; + } + goto out; + } else if (op == "set-inc-osdmap") { + bufferlist bl; + ret = get_fd_data(file_fd, bl); + if (ret < 0) { + cerr << "Failed to read incremental osdmap " << cpp_strerror(ret) << std::endl; + goto out; + } else { + ret = set_inc_osdmap(fs.get(), epoch, bl, force); + } + goto out; + } else if (op == "update-mon-db") { + if (!vm.count("mon-store-path")) { + cerr << "Please specify the path to monitor db to update" << std::endl; + ret = -EINVAL; + } else { + ceph_assert(superblock != nullptr); + ret = update_mon_db(*fs, *superblock, dpath + "/keyring", mon_store_path); + } + goto out; + } + + if (op == "remove") { + if (!force && !dry_run) { + cerr << "Please use export-remove or you must use --force option" << std::endl; + ret = -EINVAL; + goto out; + } + ret = initiate_new_remove_pg(fs.get(), pgid); + if (ret < 0) { + cerr << "PG '" << pgid << "' not found" << std::endl; + goto out; + } + cout << "Remove successful" << std::endl; + goto out; + } + + if (op == "fix-lost") { + boost::scoped_ptr action; + action.reset(new do_fix_lost()); + if (pgidstr.length()) + ret = action_on_all_objects_in_exact_pg(fs.get(), coll_t(pgid), *action, debug); + else + ret = action_on_all_objects(fs.get(), *action, debug); + goto out; + } + + if (op == "list") { + ret = do_list(fs.get(), pgidstr, object, nspace, formatter, debug, + human_readable, head); + if (ret < 0) { + cerr << "do_list failed: " << cpp_strerror(ret) << std::endl; + } + goto out; + } + if (op == "list-slow-omap") { + ret = do_list_slow(fs.get(), pgidstr, object, slow_threshold, formatter, debug, + human_readable); + if (ret < 0) { + cerr << "do_list failed: " << cpp_strerror(ret) << std::endl; + } + goto out; + } + + if (op == "dump-super") { + ceph_assert(superblock != nullptr); + formatter->open_object_section("superblock"); + superblock->dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + goto out; + } + + if (op == "statfs") { + store_statfs_t statsbuf; + ret = fs->statfs(&statsbuf); + if (ret < 0) { + cerr << "error from statfs: " << cpp_strerror(ret) << std::endl; + goto out; + } + formatter->open_object_section("statfs"); + statsbuf.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + goto out; + } + + if (op == "meta-list") { + ret = do_meta(fs.get(), object, formatter, debug, human_readable); + if (ret < 0) { + cerr << "do_meta failed: " << cpp_strerror(ret) << std::endl; + } + goto out; + } + + ret = fs->list_collections(ls); + if (ret < 0) { + cerr << "failed to list pgs: " << cpp_strerror(ret) << std::endl; + goto out; + } + + if (debug && op == "list-pgs") + cout << "Performing list-pgs operation" << std::endl; + + // Find pg + for 
(it = ls.begin(); it != ls.end(); ++it) { + spg_t tmppgid; + + if (pgidstr == "meta") { + if (it->to_str() == "meta") + break; + else + continue; + } + + if (!it->is_pg(&tmppgid)) { + continue; + } + + if (it->is_temp(&tmppgid)) { + continue; + } + + if (op != "list-pgs" && tmppgid != pgid) { + continue; + } + + if (op != "list-pgs") { + //Found! + break; + } + + cout << tmppgid << std::endl; + } + + if (op == "list-pgs") { + ret = 0; + goto out; + } + + // If not an object command nor any of the ops handled below, then output this usage + // before complaining about a bad pgid + if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log" && op != "trim-pg-log-dups" && op != "pg-log-inject-dups") { + cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-super, meta-list, " + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, trim-pg-log-dups statfs)" + << std::endl; + usage(desc); + ret = 1; + goto out; + } + epoch_t map_epoch; +// The following code for export, info, log require omap or !skip-mount-omap + if (it != ls.end()) { + + coll_t coll = *it; + + if (vm.count("objcmd")) { + ret = 0; + if (objcmd == "remove" || objcmd == "removeall") { + bool all = (objcmd == "removeall"); + enum rmtype type = BOTH; + if (rmtypestr == "nosnapmap") + type = NOSNAPMAP; + else if (rmtypestr == "snapmap") + type = SNAPMAP; + ret = do_remove_object(fs.get(), coll, ghobj, all, force, type); + goto out; + } else if (objcmd == "list-attrs") { + ret = do_list_attrs(fs.get(), coll, ghobj); + goto out; + } else if (objcmd == "list-omap") { + ret = do_list_omap(fs.get(), coll, ghobj); + goto out; + } else if (objcmd == "get-bytes" || objcmd == "set-bytes") { + if (objcmd == "get-bytes") { + int fd; + if (vm.count("arg1") == 0 || arg1 == "-") { + fd = STDOUT_FILENO; + } else { + fd = open(arg1.c_str(), O_WRONLY|O_TRUNC|O_CREAT|O_EXCL|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_get_bytes(fs.get(), coll, ghobj, fd); + if (fd != STDOUT_FILENO) + close(fd); + } else { + int fd; + if (vm.count("arg1") == 0 || arg1 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. + if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_bytes(fs.get(), coll, ghobj, fd); + if (fd != STDIN_FILENO) + close(fd); + } + goto out; + } else if (objcmd == "get-attr") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_get_attr(fs.get(), coll, ghobj, arg1); + goto out; + } else if (objcmd == "set-attr") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + } + + int fd; + if (vm.count("arg2") == 0 || arg2 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. 
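+          // (Usage sketch, with hypothetical paths: feed the value from a
+          //  regular file, e.g. `... '<object>' set-attr <key> /tmp/value.bin`,
+          //  or from a pipe, e.g. `cat value.bin | ... '<object>' set-attr <key> -`.
+          //  read_fd() cannot treat a tty's ^D as end-of-input, hence the
+          //  isatty() rejection below.)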
+ if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_attr(fs.get(), coll, ghobj, arg1, fd); + if (fd != STDIN_FILENO) + close(fd); + goto out; + } else if (objcmd == "rm-attr") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_rm_attr(fs.get(), coll, ghobj, arg1); + goto out; + } else if (objcmd == "get-omap") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_get_omap(fs.get(), coll, ghobj, arg1); + goto out; + } else if (objcmd == "set-omap") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + int fd; + if (vm.count("arg2") == 0 || arg2 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. + if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg2.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg2 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_omap(fs.get(), coll, ghobj, arg1, fd); + if (fd != STDIN_FILENO) + close(fd); + goto out; + } else if (objcmd == "rm-omap") { + if (vm.count("arg1") == 0) { + usage(desc); + ret = 1; + goto out; + } + ret = do_rm_omap(fs.get(), coll, ghobj, arg1); + goto out; + } else if (objcmd == "get-omaphdr") { + if (vm.count("arg1")) { + usage(desc); + ret = 1; + goto out; + } + ret = do_get_omaphdr(fs.get(), coll, ghobj); + goto out; + } else if (objcmd == "set-omaphdr") { + // Extra arg + if (vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + int fd; + if (vm.count("arg1") == 0 || arg1 == "-") { + // Since read_fd() doesn't handle ^D from a tty stdin, don't allow it. 
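+        // (Background: the omap header is a single opaque blob stored
+        //  alongside an object's omap key/value pairs; set-omaphdr replaces
+        //  it wholesale with the bytes read from arg1 or stdin.)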
+ if (isatty(STDIN_FILENO)) { + cerr << "stdin is a tty and no file specified" << std::endl; + ret = 1; + goto out; + } + fd = STDIN_FILENO; + } else { + fd = open(arg1.c_str(), O_RDONLY|O_LARGEFILE, 0666); + if (fd == -1) { + cerr << "open " << arg1 << " " << cpp_strerror(errno) << std::endl; + ret = 1; + goto out; + } + } + ret = do_set_omaphdr(fs.get(), coll, ghobj, fd); + if (fd != STDIN_FILENO) + close(fd); + goto out; + } else if (objcmd == "dump") { + // There should not be any other arguments + if (vm.count("arg1") || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + ret = print_obj_info(fs.get(), coll, ghobj, formatter); + goto out; + } else if (objcmd == "corrupt-info") { // Undocumented testing feature + // There should not be any other arguments + if (vm.count("arg1") || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + ret = corrupt_info(fs.get(), coll, ghobj, formatter); + goto out; + } else if (objcmd == "set-size" || objcmd == "corrupt-size") { + // Undocumented testing feature + bool corrupt = (objcmd == "corrupt-size"); + // Extra arg + if (vm.count("arg1") == 0 || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) { + cerr << "Invalid size '" << arg1 << "' specified" << std::endl; + ret = 1; + goto out; + } + uint64_t size = atoll(arg1.c_str()); + ret = set_size(fs.get(), coll, ghobj, size, formatter, corrupt); + goto out; + } else if (objcmd == "clear-data-digest") { + ret = clear_data_digest(fs.get(), coll, ghobj); + goto out; + } else if (objcmd == "clear-snapset") { + // UNDOCUMENTED: For testing zap SnapSet + // IGNORE extra args since not in usage anyway + if (!ghobj.hobj.has_snapset()) { + cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl; + ret = 1; + goto out; + } + ret = clear_snapset(fs.get(), coll, ghobj, arg1); + goto out; + } else if (objcmd == "remove-clone-metadata") { + // Extra arg + if (vm.count("arg1") == 0 || vm.count("arg2")) { + usage(desc); + ret = 1; + goto out; + } + if (!ghobj.hobj.has_snapset()) { + cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl; + ret = 1; + goto out; + } + if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) { + cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl; + ret = 1; + goto out; + } + snapid_t cloneid = atoi(arg1.c_str()); + ret = remove_clone(fs.get(), coll, ghobj, cloneid, force); + goto out; + } + cerr << "Unknown object command '" << objcmd << "'" << std::endl; + usage(desc); + ret = 1; + goto out; + } + + map_epoch = 0; + ret = PG::peek_map_epoch(fs.get(), pgid, &map_epoch); + if (ret < 0) + cerr << "peek_map_epoch reports error" << std::endl; + if (debug) + cerr << "map_epoch " << map_epoch << std::endl; + + pg_info_t info(pgid); + PastIntervals past_intervals; + __u8 struct_ver; + ret = PG::read_info(fs.get(), pgid, coll, info, past_intervals, struct_ver); + if (ret < 0) { + cerr << "read_info error " << cpp_strerror(ret) << std::endl; + goto out; + } + if (struct_ver < PG::get_compat_struct_v()) { + cerr << "PG is too old to upgrade, use older Ceph version" << std::endl; + ret = -EFAULT; + goto out; + } + if (debug) + cerr << "struct_v " << (int)struct_ver << std::endl; + + if (op == "export" || op == "export-remove") { + ceph_assert(superblock != nullptr); + ret = tool.do_export(cct.get(), fs.get(), coll, pgid, info, map_epoch, struct_ver, *superblock, past_intervals); + if (ret == 0) { + cerr << "Export successful" << std::endl; + if (op == 
"export-remove") { + ret = initiate_new_remove_pg(fs.get(), pgid); + // Export succeeded, so pgid is there + ceph_assert(ret == 0); + cerr << "Remove successful" << std::endl; + } + } + } else if (op == "info") { + formatter->open_object_section("info"); + info.dump(formatter); + formatter->close_section(); + formatter->flush(cout); + cout << std::endl; + } else if (op == "log") { + PGLog::IndexedLog log; + pg_missing_t missing; + ret = get_log(cct.get(), fs.get(), struct_ver, pgid, info, log, missing); + if (ret < 0) + goto out; + + dump_log(formatter, cout, log, missing); + } else if (op == "mark-complete") { + ObjectStore::Transaction tran; + ObjectStore::Transaction *t = &tran; + + if (struct_ver < PG::get_compat_struct_v()) { + cerr << "Can't mark-complete, version mismatch " << (int)struct_ver + << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)" + << std::endl; + ret = 1; + goto out; + } + + cout << "Marking complete " << std::endl; + + ceph_assert(superblock != nullptr); + info.last_update = eversion_t(superblock->current_epoch, info.last_update.version + 1); + info.last_backfill = hobject_t::get_max(); + info.last_epoch_started = superblock->current_epoch; + info.history.last_epoch_started = superblock->current_epoch; + info.history.last_epoch_clean = superblock->current_epoch; + past_intervals.clear(); + + if (!dry_run) { + ret = write_info(*t, map_epoch, info, past_intervals); + if (ret != 0) + goto out; + auto ch = fs->open_collection(coll_t(pgid)); + fs->queue_transaction(ch, std::move(*t)); + } + cout << "Marking complete succeeded" << std::endl; + } else if (op == "trim-pg-log") { + ret = do_trim_pg_log(fs.get(), coll, info, pgid, + map_epoch, past_intervals); + if (ret < 0) { + cerr << "Error trimming pg log: " << cpp_strerror(ret) << std::endl; + goto out; + } + cout << "Finished trimming pg log" << std::endl; + goto out; + } else if (op == "trim-pg-log-dups") { + ret = do_trim_pg_log_dups(fs.get(), coll, info, pgid, + map_epoch, past_intervals); + if (ret < 0) { + cerr << "Error trimming pg log dups: " << cpp_strerror(ret) << std::endl; + goto out; + } + cout << "Finished trimming pg log dups" << std::endl; + goto out; + } else if (op == "reset-last-complete") { + if (!force) { + std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost " + << "certain to lead to permanent data loss unless you know exactly " + << "what you are doing. Pass --force to proceed anyway." 
+              << std::endl;
+      ret = -EINVAL;
+      goto out;
+    }
+    ObjectStore::Transaction tran;
+    ObjectStore::Transaction *t = &tran;
+
+    if (struct_ver < PG::get_compat_struct_v()) {
+      cerr << "Can't reset-last-complete, version mismatch " << (int)struct_ver
+           << " (pg) < compat " << (int)PG::get_compat_struct_v() << " (tool)"
+           << std::endl;
+      ret = 1;
+      goto out;
+    }
+
+    cout << "Resetting last_complete " << std::endl;
+
+    info.last_complete = info.last_update;
+
+    if (!dry_run) {
+      ret = write_info(*t, map_epoch, info, past_intervals);
+      if (ret != 0)
+        goto out;
+      fs->queue_transaction(ch, std::move(*t));
+    }
+    cout << "Resetting last_complete succeeded" << std::endl;
+
+  } else if (op == "pg-log-inject-dups") {
+    if (!vm.count("file") || file == "-") {
+      cerr << "Must provide file containing JSON dups entries" << std::endl;
+      ret = 1;
+      goto out;
+    }
+    if (debug)
+      cerr << "opening file " << file << std::endl;
+
+    ifstream json_file_stream(file, std::ifstream::in);
+    if (!json_file_stream.is_open()) {
+      cerr << "unable to open file " << file << std::endl;
+      ret = -1;
+      goto out;
+    }
+    json_spirit::mValue result;
+    try {
+      if (!json_spirit::read(json_file_stream, result))
+        throw std::runtime_error("unparseable JSON " + file);
+      if (result.type() != json_spirit::array_type) {
+        cerr << "result is not an array_type - type=" << result.type() << std::endl;
+        throw std::runtime_error("not JSON array_type " + file);
+      }
+      do_dups_inject_from_json(fs.get(), pgid, result, debug);
+    } catch (const std::runtime_error &e) {
+      cerr << e.what() << std::endl;
+      return -1;
+    }
+  } else {
+    ceph_assert(!"Should have already checked for valid --op");
+  }
+  } else {
+    cerr << "PG '" << pgid << "' not found" << std::endl;
+    ret = -ENOENT;
+  }
+
+out:
+  if (debug) {
+    ostringstream ostr;
+    Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty");
+    cct->get_perfcounters_collection()->dump_formatted(f, false, false);
+    ostr << "ceph-objectstore-tool ";
+    f->flush(ostr);
+    delete f;
+    cout << ostr.str() << std::endl;
+  }
+
+  int r = fs->umount();
+  if (r < 0) {
+    cerr << "umount failed: " << cpp_strerror(r) << std::endl;
+    // If no previous error, then use umount() error
+    if (ret == 0)
+      ret = r;
+  }
+
+  if (dry_run) {
+    // Export output can go to stdout, so put this message on stderr
+    if (op == "export")
+      cerr << "dry-run: Nothing changed" << std::endl;
+    else
+      cout << "dry-run: Nothing changed" << std::endl;
+  }
+
+  if (ret < 0)
+    ret = 1;
+  return ret;
+}
diff --git a/src/tools/ceph_objectstore_tool.h b/src/tools/ceph_objectstore_tool.h
new file mode 100644
index 000000000..82aa83e5d
--- /dev/null
+++ b/src/tools/ceph_objectstore_tool.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECTSTORE_TOOL_H_
+#define CEPH_OBJECTSTORE_TOOL_H_
+
+#include "RadosDump.h"
+
+class ObjectStoreTool : public RadosDump
+{
+  public:
+    ObjectStoreTool(int file_fd, bool dry_run)
+      : RadosDump(file_fd, dry_run)
+      {}
+
+    int dump_export(Formatter *formatter);
+    int do_import(ObjectStore *store, OSDSuperblock& sb, bool force,
+                  std::string pgidstr);
+    int do_export(CephContext *cct, ObjectStore *fs, coll_t coll, spg_t pgid,
+                  pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
+                  const OSDSuperblock& superblock,
+                  PastIntervals &past_intervals);
+    int dump_object(Formatter *formatter,
+                    bufferlist &bl);
+    int get_object(
+      ObjectStore *store, OSDriver& driver, SnapMapper& mapper, coll_t coll,
+      bufferlist &bl, OSDMap &curmap, bool *skipped_objects);
+    int export_file(
+      ObjectStore *store, coll_t cid, ghobject_t &obj);
+    int export_files(ObjectStore *store, coll_t coll);
+};
+
+#endif // CEPH_OBJECTSTORE_TOOL_H_
diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc
new file mode 100644
index 000000000..4f49daeec
--- /dev/null
+++ b/src/tools/ceph_osdomap_tool.cc
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <stdlib.h>
+#include <string>
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "os/DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+int main(int argc, char **argv) {
+  po::options_description desc("Allowed options");
+  string store_path, cmd, oid, backend;
+  bool debug = false;
+  desc.add_options()
+    ("help", "produce help message")
+    ("omap-path", po::value<string>(&store_path),
+     "path to omap directory, mandatory (current/omap usually)")
+    ("paranoid", "use paranoid checking")
+    ("debug", "Additional debug output from DBObjectMap")
+    ("oid", po::value<string>(&oid), "Restrict to this object id when dumping objects")
+    ("command", po::value<string>(&cmd),
+     "command arg is one of [dump-raw-keys, dump-raw-key-vals, dump-objects, dump-objects-with-keys, check, dump-headers, repair, compact], mandatory")
+    ("backend", po::value<string>(&backend),
+     "DB backend (default rocksdb)")
+    ;
+  po::positional_options_description p;
+  p.add("command", 1);
+
+  vector<string> ceph_option_strings;
+  po::variables_map vm;
+  try {
+    po::parsed_options parsed =
+      po::command_line_parser(argc, argv).options(desc).positional(p).allow_unregistered().run();
+    po::store(
+      parsed,
+      vm);
+    po::notify(vm);
+
+    ceph_option_strings = po::collect_unrecognized(parsed.options,
+                                                   po::include_positional);
+  } catch(po::error &e) {
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+
+  vector<const char *> ceph_options;
+  ceph_options.reserve(ceph_option_strings.size());
+  for (vector<string>::iterator i = ceph_option_strings.begin();
+       i != ceph_option_strings.end();
+       ++i) {
+    ceph_options.push_back(i->c_str());
+  }
+
+  if (vm.count("debug")) debug = true;
+
+  if (vm.count("help")) {
+    std::cerr << desc << std::endl;
+    return 1;
+  }
+
+  auto cct = global_init(
+    NULL, ceph_options, CEPH_ENTITY_TYPE_OSD,
+    CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+  common_init_finish(g_ceph_context);
+  cct->_conf.apply_changes(nullptr);
+  if (debug) {
+
g_conf().set_val_or_die("log_to_stderr", "true"); + g_conf().set_val_or_die("err_to_stderr", "true"); + } + g_conf().apply_changes(nullptr); + + if (vm.count("omap-path") == 0) { + std::cerr << "Required argument --omap-path" << std::endl; + return 1; + } + + if (vm.count("command") == 0) { + std::cerr << "Required argument --command" << std::endl; + return 1; + } + + if (vm.count("backend") == 0) { + backend = "rocksdb"; + } + + KeyValueDB* store(KeyValueDB::create(g_ceph_context, backend, store_path)); + if (store == NULL) { + std::cerr << "Invalid backend '" << backend << "' specified" << std::endl; + return 1; + } + /*if (vm.count("paranoid")) { + std::cerr << "Enabling paranoid checks" << std::endl; + store->options.paranoid_checks = true; + }*/ + DBObjectMap omap(cct.get(), store); + stringstream out; + int r = store->open(out); + if (r < 0) { + std::cerr << "Store open got: " << cpp_strerror(r) << std::endl; + std::cerr << "Output: " << out.str() << std::endl; + return r; + } + // We don't call omap.init() here because it will repair + // the DBObjectMap which we might want to examine for diagnostic + // reasons. Instead use --command repair. + + omap.get_state(); + std::cout << "Version: " << (int)omap.state.v << std::endl; + std::cout << "Seq: " << omap.state.seq << std::endl; + std::cout << "legacy: " << (omap.state.legacy ? "true" : "false") << std::endl; + + if (cmd == "dump-raw-keys") { + KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator(); + for (i->seek_to_first(); i->valid(); i->next()) { + std::cout << i->raw_key() << std::endl; + } + return 0; + } else if (cmd == "dump-raw-key-vals") { + KeyValueDB::WholeSpaceIterator i = store->get_wholespace_iterator(); + for (i->seek_to_first(); i->valid(); i->next()) { + std::cout << i->raw_key() << std::endl; + i->value().hexdump(std::cout); + } + return 0; + } else if (cmd == "dump-objects") { + vector objects; + r = omap.list_objects(&objects); + if (r < 0) { + std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl; + return r; + } + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (vm.count("oid") != 0 && i->hobj.oid.name != oid) + continue; + std::cout << *i << std::endl; + } + return 0; + } else if (cmd == "dump-objects-with-keys") { + vector objects; + r = omap.list_objects(&objects); + if (r < 0) { + std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl; + return r; + } + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if (vm.count("oid") != 0 && i->hobj.oid.name != oid) + continue; + std::cout << "Object: " << *i << std::endl; + ObjectMap::ObjectMapIterator j = omap.get_iterator(ghobject_t(i->hobj)); + for (j->seek_to_first(); j->valid(); j->next()) { + std::cout << j->key() << std::endl; + j->value().hexdump(std::cout); + } + } + return 0; + } else if (cmd == "check" || cmd == "repair") { + ostringstream ss; + bool repair = (cmd == "repair"); + r = omap.check(ss, repair, true); + if (r) { + std::cerr << ss.str() << std::endl; + if (r > 0) { + std::cerr << "check got " << r << " error(s)" << std::endl; + return 1; + } + } + std::cout << (repair ? 
"repair" : "check") << " succeeded" << std::endl; + return 0; + } else if (cmd == "dump-headers") { + vector headers; + r = omap.list_object_headers(&headers); + if (r < 0) { + std::cerr << "list_object_headers got: " << cpp_strerror(r) << std::endl; + return 1; + } + for (auto i : headers) + std::cout << i << std::endl; + return 0; + } else if (cmd == "resetv2") { + omap.state.v = 2; + omap.state.legacy = false; + omap.set_state(); + } else if (cmd == "compact") { + omap.compact(); + return 0; + } else { + std::cerr << "Did not recognize command " << cmd << std::endl; + return 1; + } +} diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt new file mode 100644 index 000000000..5d40f8ffb --- /dev/null +++ b/src/tools/cephfs/CMakeLists.txt @@ -0,0 +1,58 @@ +set(cephfs_journal_tool_srcs + cephfs-journal-tool.cc + JournalTool.cc + JournalFilter.cc + JournalScanner.cc + EventOutput.cc + Dumper.cc + Resetter.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs}) +target_link_libraries(cephfs-journal-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs-meta-injection_srcs + cephfs-meta-injection.cc + MetaTool.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-meta-injection ${cephfs-meta-injection_srcs}) +target_link_libraries(cephfs-meta-injection librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs_table_tool_srcs + cephfs-table-tool.cc + TableTool.cc + RoleSelector.cc + MDSUtility.cc) +add_executable(cephfs-table-tool ${cephfs_table_tool_srcs}) +target_link_libraries(cephfs-table-tool librados mds osdc global + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +set(cephfs_data_scan_srcs + cephfs-data-scan.cc + DataScan.cc + RoleSelector.cc + PgFiles.cc + MDSUtility.cc) +add_executable(cephfs-data-scan ${cephfs_data_scan_srcs}) +target_link_libraries(cephfs-data-scan librados cephfs mds osdc global + cls_cephfs_client + ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS}) + +install(TARGETS + cephfs-journal-tool + cephfs-table-tool + cephfs-data-scan + DESTINATION bin) + +option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF) +if(WITH_CEPHFS_SHELL) + add_subdirectory(shell) +endif() + +option(WITH_CEPHFS_TOP "install cephfs-top utility" ON) +if(WITH_CEPHFS_TOP) + add_subdirectory(top) +endif() diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc new file mode 100644 index 000000000..0ba56c515 --- /dev/null +++ b/src/tools/cephfs/DataScan.cc @@ -0,0 +1,2404 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "common/errno.h" +#include "common/ceph_argparse.h" +#include +#include "include/util.h" +#include "include/ceph_fs.h" + +#include "mds/CDentry.h" +#include "mds/CInode.h" +#include "mds/CDentry.h" +#include "mds/InoTable.h" +#include "mds/SnapServer.h" +#include "cls/cephfs/cls_cephfs_client.h" + +#include "PgFiles.h" +#include "DataScan.h" +#include "include/compat.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "datascan." 
<< __func__ << ": "
+
+using namespace std;
+
+void DataScan::usage()
+{
+  std::cout << "Usage: \n"
+            << "  cephfs-data-scan init [--force-init]\n"
+            << "  cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] [<data pool> [<extra data pool> ...]]\n"
+            << "  cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] [<data pool>]\n"
+            << "  cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
+            << "  cephfs-data-scan scan_links\n"
+            << "\n"
+            << "    --force-corrupt: overwrite apparently corrupt structures\n"
+            << "    --force-init: write root inodes even if they exist\n"
+            << "    --force-pool: use data pool even if it is not in FSMap\n"
+            << "    --worker_m: Maximum number of workers\n"
+            << "    --worker_n: Worker number, range 0-(worker_m-1)\n"
+            << "\n"
+            << "  cephfs-data-scan scan_frags [--force-corrupt]\n"
+            << "  cephfs-data-scan cleanup [<data pool>]\n"
+            << std::endl;
+
+  generic_client_usage();
+}
+
+bool DataScan::parse_kwarg(
+  const std::vector<const char*> &args,
+  std::vector<const char *>::const_iterator &i,
+  int *r)
+{
+  if (i + 1 == args.end()) {
+    return false;
+  }
+
+  const std::string arg(*i);
+  const std::string val(*(i + 1));
+
+  if (arg == std::string("--output-dir")) {
+    if (driver != NULL) {
+      derr << "Unexpected --output-dir: output already selected!" << dendl;
+      *r = -EINVAL;
+      return false;
+    }
+    dout(4) << "Using local file output to '" << val << "'" << dendl;
+    driver = new LocalFileDriver(val, data_io);
+    return true;
+  } else if (arg == std::string("--worker_n")) {
+    std::string err;
+    n = strict_strtoll(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      std::cerr << "Invalid worker number '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    return true;
+  } else if (arg == std::string("--worker_m")) {
+    std::string err;
+    m = strict_strtoll(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      std::cerr << "Invalid worker count '" << val << "'" << std::endl;
+      *r = -EINVAL;
+      return false;
+    }
+    return true;
+  } else if (arg == std::string("--filter-tag")) {
+    filter_tag = val;
+    dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
+    return true;
+  } else if (arg == std::string("--filesystem")) {
+    std::shared_ptr<Filesystem> fs;
+    *r = fsmap->parse_filesystem(val, &fs);
+    if (*r != 0) {
+      std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
+      return false;
+    }
+    fscid = fs->fscid;
+    return true;
+  } else if (arg == std::string("--alternate-pool")) {
+    metadata_pool_name = val;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool DataScan::parse_arg(
+  const std::vector<const char*> &args,
+  std::vector<const char *>::const_iterator &i)
+{
+  const std::string arg(*i);
+  if (arg == "--force-pool") {
+    force_pool = true;
+    return true;
+  } else if (arg == "--force-corrupt") {
+    force_corrupt = true;
+    return true;
+  } else if (arg == "--force-init") {
+    force_init = true;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+int DataScan::main(const std::vector<const char*> &args)
+{
+  // Parse args
+  // ==========
+  if (args.size() < 1) {
+    cerr << "missing positional argument" << std::endl;
+    return -EINVAL;
+  }
+
+  // Common RADOS init: open metadata pool
+  // =====================================
+  librados::Rados rados;
+  int r = rados.init_with_context(g_ceph_context);
+  if (r < 0) {
+    derr << "RADOS unavailable" << dendl;
+    return r;
+  }
+
+  std::string const &command = args[0];
+  std::string data_pool_name;
+  std::set<std::string> extra_data_pool_names;
+
+  std::string pg_files_path;
+  std::set<pg_t> pg_files_pgs;
+
+  // Consume any known --key val or --flag arguments
+  for (std::vector<const char *>::const_iterator i = args.begin() + 1;
+       i != args.end(); ++i) {
+
if (parse_kwarg(args, i, &r)) { + // Skip the kwarg value field + ++i; + continue; + } else if (r) { + return r; + } + + if (parse_arg(args, i)) { + continue; + } + + // Trailing positional arguments + if (command == "scan_extents") { + if (data_pool_name.empty()) { + data_pool_name = *i; + } else if (*i != data_pool_name) { + extra_data_pool_names.insert(*i); + } + continue; + } + + // Trailing positional argument + if (i + 1 == args.end() && + (command == "scan_inodes" + || command == "cleanup")) { + data_pool_name = *i; + continue; + } + + if (command == "pg_files") { + if (i == args.begin() + 1) { + pg_files_path = *i; + continue; + } else { + pg_t pg; + bool parsed = pg.parse(*i); + if (!parsed) { + std::cerr << "Invalid PG '" << *i << "'" << std::endl; + return -EINVAL; + } else { + pg_files_pgs.insert(pg); + continue; + } + } + + } + + // Fall through: unhandled + std::cerr << "Unknown argument '" << *i << "'" << std::endl; + return -EINVAL; + } + + // If caller didn't specify a namespace, try to pick + // one if only one exists + if (fscid == FS_CLUSTER_ID_NONE) { + if (fsmap->filesystem_count() == 1) { + fscid = fsmap->get_filesystem()->fscid; + } else { + std::cerr << "Specify a filesystem with --filesystem" << std::endl; + return -EINVAL; + } + } + auto fs = fsmap->get_filesystem(fscid); + ceph_assert(fs != nullptr); + + // Default to output to metadata pool + if (driver == NULL) { + driver = new MetadataDriver(); + driver->set_force_corrupt(force_corrupt); + driver->set_force_init(force_init); + dout(4) << "Using metadata pool output" << dendl; + } + + dout(4) << "connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + std::cerr << "couldn't connect to cluster: " << cpp_strerror(r) + << std::endl; + return r; + } + + r = driver->init(rados, metadata_pool_name, fsmap, fscid); + if (r < 0) { + return r; + } + + if (command == "pg_files") { + auto pge = PgFiles(objecter, pg_files_pgs); + pge.init(); + return pge.scan_path(pg_files_path); + } + + bool autodetect_data_pools = false; + + // Initialize data_io for those commands that need it + if (command == "scan_inodes" || + command == "scan_extents" || + command == "cleanup") { + data_pool_id = fs->mds_map.get_first_data_pool(); + + std::string pool_name; + r = rados.pool_reverse_lookup(data_pool_id, &pool_name); + if (r < 0) { + std::cerr << "Failed to resolve data pool: " << cpp_strerror(r) + << std::endl; + return r; + } + + if (data_pool_name.empty()) { + autodetect_data_pools = true; + data_pool_name = pool_name; + } else if (data_pool_name != pool_name) { + std::cerr << "Warning: pool '" << data_pool_name << "' is not the " + "main CephFS data pool!" << std::endl; + if (!force_pool) { + std::cerr << "Use --force-pool to continue" << std::endl; + return -EINVAL; + } + + data_pool_id = rados.pool_lookup(data_pool_name.c_str()); + if (data_pool_id < 0) { + std::cerr << "Data pool '" << data_pool_name << "' not found!" 
+ << std::endl; + return -ENOENT; + } + } + + dout(4) << "data pool '" << data_pool_name << "' has ID " << data_pool_id + << dendl; + + dout(4) << "opening data pool '" << data_pool_name << "'" << dendl; + r = rados.ioctx_create(data_pool_name.c_str(), data_io); + if (r != 0) { + return r; + } + } + + // Initialize extra data_ios for those commands that need it + if (command == "scan_extents") { + if (autodetect_data_pools) { + ceph_assert(extra_data_pool_names.empty()); + + for (auto &pool_id : fs->mds_map.get_data_pools()) { + if (pool_id == data_pool_id) { + continue; + } + + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + std::cerr << "Failed to resolve data pool: " << cpp_strerror(r) + << std::endl; + return r; + } + extra_data_pool_names.insert(pool_name); + } + } + + for (auto &data_pool_name: extra_data_pool_names) { + int64_t pool_id = rados.pool_lookup(data_pool_name.c_str()); + if (data_pool_id < 0) { + std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl; + return -ENOENT; + } else { + dout(4) << "data pool '" << data_pool_name << "' has ID " << pool_id + << dendl; + } + + if (!fs->mds_map.is_data_pool(pool_id)) { + std::cerr << "Warning: pool '" << data_pool_name << "' is not a " + "CephFS data pool!" << std::endl; + if (!force_pool) { + std::cerr << "Use --force-pool to continue" << std::endl; + return -EINVAL; + } + } + + dout(4) << "opening data pool '" << data_pool_name << "'" << dendl; + extra_data_ios.push_back({}); + r = rados.ioctx_create(data_pool_name.c_str(), extra_data_ios.back()); + if (r != 0) { + return r; + } + } + } + + // Initialize metadata_io from MDSMap for scan_frags + if (command == "scan_frags" || command == "scan_links") { + const auto fs = fsmap->get_filesystem(fscid); + if (fs == nullptr) { + std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl; + return -ENOENT; + } + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + std::cerr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" << std::endl; + return r; + } + + r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); + if (r != 0) { + return r; + } + + data_pools = fs->mds_map.get_data_pools(); + } + + // Finally, dispatch command + if (command == "scan_inodes") { + return scan_inodes(); + } else if (command == "scan_extents") { + return scan_extents(); + } else if (command == "scan_frags") { + return scan_frags(); + } else if (command == "scan_links") { + return scan_links(); + } else if (command == "cleanup") { + return cleanup(); + } else if (command == "init") { + return driver->init_roots(fs->mds_map.get_first_data_pool()); + } else { + std::cerr << "Unknown command '" << command << "'" << std::endl; + return -EINVAL; + } +} + +int MetadataDriver::inject_unlinked_inode( + inodeno_t inono, int mode, int64_t data_pool_id) +{ + const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode"); + + // Skip if exists + bool already_exists = false; + int r = root_exists(inono, &already_exists); + if (r) { + return r; + } + if (already_exists && !force_init) { + std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already" + " exists, skipping create. Use --force-init to overwrite" + " the existing object." 
<< std::endl; + return 0; + } + + // Compose + InodeStore inode_data; + auto inode = inode_data.get_inode(); + inode->ino = inono; + inode->version = 1; + inode->xattr_version = 1; + inode->mode = 0500 | mode; + // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty + // (we won't actually give the *correct* dirstat here though) + inode->dirstat.nfiles = 1; + + inode->ctime = inode->mtime = ceph_clock_now(); + inode->nlink = 1; + inode->truncate_size = -1ull; + inode->truncate_seq = 1; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; + + // Force layout to default: should we let users override this so that + // they don't have to mount the filesystem to correct it? + inode->layout = file_layout_t::get_default(); + inode->layout.pool_id = data_pool_id; + inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + // Assume that we will get our stats wrong, and that we may + // be ignoring dirfrags that exist + inode_data.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE); + + if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) { + sr_t srnode; + srnode.seq = 1; + encode(srnode, inode_data.snap_blob); + } + + // Serialize + bufferlist inode_bl; + encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl); + inode_data.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + // Write + r = metadata_io.write_full(oid.name, inode_bl); + if (r != 0) { + derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl; + return r; + } + + return r; +} + +int MetadataDriver::root_exists(inodeno_t ino, bool *result) +{ + object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); + uint64_t size; + time_t mtime; + int r = metadata_io.stat(oid.name, &size, &mtime); + if (r == -ENOENT) { + *result = false; + return 0; + } else if (r < 0) { + return r; + } + + *result = true; + return 0; +} + +int MetadataDriver::init_roots(int64_t data_pool_id) +{ + int r = 0; + r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id); + if (r != 0) { + return r; + } + r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id); + if (r != 0) { + return r; + } + bool created = false; + r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created); + if (r != 0) { + return r; + } + + return 0; +} + +int MetadataDriver::check_roots(bool *result) +{ + int r; + r = root_exists(CEPH_INO_ROOT, result); + if (r != 0) { + return r; + } + if (!*result) { + return 0; + } + + r = root_exists(MDS_INO_MDSDIR(0), result); + if (r != 0) { + return r; + } + if (!*result) { + return 0; + } + + return 0; +} + +/** + * Stages: + * + * SERIAL init + * 0. Create root inodes if don't exist + * PARALLEL scan_extents + * 1. Size and mtime recovery: scan ALL objects, and update 0th + * objects with max size and max mtime seen. + * PARALLEL scan_inodes + * 2. Inode recovery: scan ONLY 0th objects, and inject metadata + * into dirfrag OMAPs, creating blank dirfrags as needed. No stats + * or rstats at this stage. Inodes without backtraces go into + * lost+found + * TODO: SERIAL "recover stats" + * 3. Dirfrag statistics: depth first traverse into metadata tree, + * rebuilding dir sizes. + * TODO PARALLEL "clean up" + * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged + * anything onto them) and remove any of the xattrs that we + * used for accumulating. 
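+ *
+ * (Sketch of how the parallel stages are typically driven; worker counts
+ * and the pool name are illustrative. With worker_m = 4, run on each of
+ * four nodes:
+ *
+ *   cephfs-data-scan scan_extents --worker_n <0..3> --worker_m 4 <data pool>
+ *   cephfs-data-scan scan_inodes --worker_n <0..3> --worker_m 4 <data pool>
+ *
+ * Each worker handles the slice of the object namespace selected via
+ * object_list_slice(n, m) in forall_objects() below.)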
+ */ + + +int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id) +{ + if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) { + return -EINVAL; + } + + std::string err; + std::string inode_str = oid.substr(0, oid.find(".")); + *inode_no = strict_strtoll(inode_str.c_str(), 16, &err); + if (!err.empty()) { + return -EINVAL; + } + + std::string pos_string = oid.substr(oid.find(".") + 1); + *obj_id = strict_strtoll(pos_string.c_str(), 16, &err); + if (!err.empty()) { + return -EINVAL; + } + + return 0; +} + + +int DataScan::scan_extents() +{ + std::vector data_ios; + data_ios.push_back(&data_io); + for (auto &extra_data_io : extra_data_ios) { + data_ios.push_back(&extra_data_io); + } + + for (auto ioctx : data_ios) { + int r = forall_objects(*ioctx, false, [this, ioctx]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + // Read size + uint64_t size; + time_t mtime; + int r = ioctx->stat(oid, &size, &mtime); + dout(10) << "handling object " << obj_name_ino + << "." << obj_name_offset << dendl; + if (r != 0) { + dout(4) << "Cannot stat '" << oid << "': skipping" << dendl; + return r; + } + int64_t obj_pool_id = data_io.get_id() != ioctx->get_id() ? + ioctx->get_id() : -1; + + // I need to keep track of + // * The highest object ID seen + // * The size of the highest object ID seen + // * The largest object seen + // * The pool of the objects seen (if it is not the main data pool) + // + // Given those things, I can later infer the object chunking + // size, the offset of the last object (chunk size * highest ID seen), + // the actual size (offset of last object + size of highest ID seen), + // and the layout pool id. + // + // This logic doesn't take account of striping. + r = ClsCephFSClient::accumulate_inode_metadata( + data_io, + obj_name_ino, + obj_name_offset, + size, + obj_pool_id, + mtime); + if (r < 0) { + derr << "Failed to accumulate metadata data from '" + << oid << "': " << cpp_strerror(r) << dendl; + return r; + } + + return r; + }); + if (r < 0) { + return r; + } + } + + return 0; +} + +int DataScan::probe_filter(librados::IoCtx &ioctx) +{ + bufferlist filter_bl; + ClsCephFSClient::build_tag_filter("test", &filter_bl); + librados::ObjectCursor range_i; + librados::ObjectCursor range_end; + + std::vector tmp_result; + librados::ObjectCursor tmp_next; + int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(), + 1, filter_bl, &tmp_result, &tmp_next); + + return r >= 0; +} + +int DataScan::forall_objects( + librados::IoCtx &ioctx, + bool untagged_only, + std::function handler + ) +{ + librados::ObjectCursor range_i; + librados::ObjectCursor range_end; + ioctx.object_list_slice( + ioctx.object_list_begin(), + ioctx.object_list_end(), + n, + m, + &range_i, + &range_end); + + + bufferlist filter_bl; + + bool legacy_filtering = false; + if (untagged_only) { + // probe to deal with older OSDs that don't support + // the cephfs pgls filtering mode + legacy_filtering = !probe_filter(ioctx); + if (!legacy_filtering) { + ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl); + } + } + + int r = 0; + while(range_i < range_end) { + std::vector result; + int r = ioctx.object_list(range_i, range_end, 1, + filter_bl, &result, &range_i); + if (r < 0) { + derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl; + return r; + } + + for (const auto &i : result) { + const std::string &oid = i.oid; + uint64_t obj_name_ino = 0; + uint64_t obj_name_offset = 0; + r = parse_oid(oid, 
&obj_name_ino, &obj_name_offset); + if (r != 0) { + dout(4) << "Bad object name '" << oid << "', skipping" << dendl; + continue; + } + + if (untagged_only && legacy_filtering) { + dout(20) << "Applying filter to " << oid << dendl; + + // We are only interested in 0th objects during this phase: we touched + // the other objects during scan_extents + if (obj_name_offset != 0) { + dout(20) << "Non-zeroth object" << dendl; + continue; + } + + bufferlist scrub_tag_bl; + int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl); + if (r >= 0) { + std::string read_tag; + auto q = scrub_tag_bl.cbegin(); + try { + decode(read_tag, q); + if (read_tag == filter_tag) { + dout(20) << "skipping " << oid << " because it has the filter_tag" + << dendl; + continue; + } + } catch (const buffer::error &err) { + } + dout(20) << "read non-matching tag '" << read_tag << "'" << dendl; + } else { + dout(20) << "no tag read (" << r << ")" << dendl; + } + + } else if (untagged_only) { + ceph_assert(obj_name_offset == 0); + dout(20) << "OSD matched oid " << oid << dendl; + } + + int this_oid_r = handler(oid, obj_name_ino, obj_name_offset); + if (r == 0 && this_oid_r < 0) { + r = this_oid_r; + } + } + } + + return r; +} + +int DataScan::scan_inodes() +{ + bool roots_present; + int r = driver->check_roots(&roots_present); + if (r != 0) { + derr << "Unexpected error checking roots: '" + << cpp_strerror(r) << "'" << dendl; + return r; + } + + if (!roots_present) { + std::cerr << "Some or all system inodes are absent. Run 'init' from " + "one node before running 'scan_inodes'" << std::endl; + return -EIO; + } + + return forall_objects(data_io, true, [this]( + std::string const &oid, + uint64_t obj_name_ino, + uint64_t obj_name_offset) -> int + { + int r = 0; + + dout(10) << "handling object " + << std::hex << obj_name_ino << "." << obj_name_offset << std::dec + << dendl; + + AccumulateResult accum_res; + inode_backtrace_t backtrace; + file_layout_t loaded_layout = file_layout_t::get_default(); + std::string symlink; + r = ClsCephFSClient::fetch_inode_accumulate_result( + data_io, oid, &backtrace, &loaded_layout, &symlink, &accum_res); + + if (r == -EINVAL) { + dout(4) << "Accumulated metadata missing from '" + << oid << ", did you run scan_extents?" << dendl; + return r; + } else if (r < 0) { + dout(4) << "Unexpected error loading accumulated metadata from '" + << oid << "': " << cpp_strerror(r) << dendl; + // FIXME: this creates situation where if a client has a corrupt + // backtrace/layout, we will fail to inject it. We should (optionally) + // proceed if the backtrace/layout is corrupt but we have valid + // accumulated metadata. 
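+    // (The accumulated metadata fetched here is the max-size/max-mtime
+    //  summary that scan_extents stored on the 0th object via
+    //  ClsCephFSClient::accumulate_inode_metadata, hence the
+    //  "did you run scan_extents?" hint on -EINVAL above.)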
+ return r; + } + + const time_t file_mtime = accum_res.max_mtime; + uint64_t file_size = 0; + bool have_backtrace = !(backtrace.ancestors.empty()); + + // This is the layout we will use for injection, populated either + // from loaded_layout or from best guesses + file_layout_t guessed_layout; + if (accum_res.obj_pool_id == -1) { + guessed_layout.pool_id = data_pool_id; + } else { + guessed_layout.pool_id = accum_res.obj_pool_id; + + librados::IoCtx ioctx; + r = librados::Rados(data_io).ioctx_create2(guessed_layout.pool_id, ioctx); + if (r != 0) { + derr << "Unexpected error opening file data pool id=" + << guessed_layout.pool_id << ": " << cpp_strerror(r) << dendl; + return r; + } + + bufferlist bl; + int r = ioctx.getxattr(oid, "layout", bl); + if (r < 0) { + if (r != -ENODATA) { + derr << "Unexpected error reading layout for " << oid << ": " + << cpp_strerror(r) << dendl; + return r; + } + } else { + try { + auto q = bl.cbegin(); + decode(loaded_layout, q); + } catch (ceph::buffer::error &e) { + derr << "Unexpected error decoding layout for " << oid << dendl; + return -EINVAL; + } + } + } + + // Calculate file_size, guess the layout + if (accum_res.ceiling_obj_index > 0) { + uint32_t chunk_size = file_layout_t::get_default().object_size; + // When there are multiple objects, the largest object probably + // indicates the chunk size. But not necessarily, because files + // can be sparse. Only make this assumption if size seen + // is a power of two, as chunk sizes typically are. + if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) { + chunk_size = accum_res.max_obj_size; + } + + if (loaded_layout.pool_id == -1) { + // If no stashed layout was found, guess it + guessed_layout.object_size = chunk_size; + guessed_layout.stripe_unit = chunk_size; + guessed_layout.stripe_count = 1; + } else if (!loaded_layout.is_valid() || + loaded_layout.object_size < accum_res.max_obj_size) { + // If the max size seen exceeds what the stashed layout claims, then + // disbelieve it. Guess instead. Same for invalid layouts on disk. + dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino + << std::dec << ", ignoring in favour of best guess" << dendl; + guessed_layout.object_size = chunk_size; + guessed_layout.stripe_unit = chunk_size; + guessed_layout.stripe_count = 1; + } else { + // We have a stashed layout that we can't disprove, so apply it + guessed_layout = loaded_layout; + dout(20) << "loaded layout from xattr:" + << " pi: " << guessed_layout.pool_id + << " os: " << guessed_layout.object_size + << " sc: " << guessed_layout.stripe_count + << " su: " << guessed_layout.stripe_unit + << dendl; + // User might have transplanted files from a pool with a different + // ID, so if the pool from loaded_layout is not found in the list of + // the data pools, we'll force the injected layout to point to the + // pool we read from. + if (!fsmap->get_filesystem(fscid)->mds_map.is_data_pool( + guessed_layout.pool_id)) { + dout(20) << "overwriting layout pool_id " << data_pool_id << dendl; + guessed_layout.pool_id = data_pool_id; + } + } + + if (guessed_layout.stripe_count == 1) { + // Unstriped file: simple chunking + file_size = guessed_layout.object_size * accum_res.ceiling_obj_index + + accum_res.ceiling_obj_size; + } else { + // Striped file: need to examine the last stripe_count objects + // in the file to determine the size. 
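+      // (Worked example with illustrative numbers: stripe_unit = 1 MiB,
+      //  stripe_count = 2, highest object index i = 4 holding osize = 1.5 MiB.
+      //  The last byte sits in stripe row (osize-1)/stripe_unit = 1, each row
+      //  spans stripe_unit*stripe_count = 2 MiB of file, and the column offset
+      //  is (i % stripe_count)*stripe_unit = 0, so the file must be at least
+      //  2 MiB + 0.5 MiB = 2.5 MiB long -- the per-object upper_size computed
+      //  below.)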
+ + librados::IoCtx ioctx; + if (guessed_layout.pool_id == data_io.get_id()) { + ioctx.dup(data_io); + } else { + r = librados::Rados(data_io).ioctx_create2(guessed_layout.pool_id, + ioctx); + if (r != 0) { + derr << "Unexpected error opening file data pool id=" + << guessed_layout.pool_id << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + // How many complete (i.e. not last stripe) objects? + uint64_t complete_objs = 0; + if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) { + complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count; + } else { + complete_objs = 0; + } + + // How many potentially-short objects (i.e. last stripe set) objects? + uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs; + + dout(10) << "calculating striped size from complete objs: " + << complete_objs << ", partial objs: " << partial_objs + << dendl; + + // Maximum amount of data that may be in the incomplete objects + uint64_t incomplete_size = 0; + + // For each short object, calculate the max file size within it + // and accumulate the maximum + for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) { + char buf[60]; + snprintf(buf, sizeof(buf), "%llx.%08llx", + (long long unsigned)obj_name_ino, (long long unsigned)i); + + uint64_t osize(0); + time_t omtime(0); + r = ioctx.stat(std::string(buf), &osize, &omtime); + if (r == 0) { + if (osize > 0) { + // Upper bound within this object + uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit + * (guessed_layout.stripe_unit * guessed_layout.stripe_count) + + (i % guessed_layout.stripe_count) + * guessed_layout.stripe_unit + (osize - 1) + % guessed_layout.stripe_unit + 1; + incomplete_size = std::max(incomplete_size, upper_size); + } + } else if (r == -ENOENT) { + // Absent object, treat as size 0 and ignore. + } else { + // Unexpected error, carry r to outer scope for handling. + break; + } + } + if (r != 0 && r != -ENOENT) { + derr << "Unexpected error checking size of ino 0x" << std::hex + << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl; + return r; + } + file_size = complete_objs * guessed_layout.object_size + + incomplete_size; + } + } else { + file_size = accum_res.ceiling_obj_size; + if (loaded_layout.pool_id < 0 + || loaded_layout.object_size < accum_res.max_obj_size) { + // No layout loaded, or inconsistent layout, use default + guessed_layout = file_layout_t::get_default(); + guessed_layout.pool_id = accum_res.obj_pool_id != -1 ? 
+ accum_res.obj_pool_id : data_pool_id;
+ } else {
+ guessed_layout = loaded_layout;
+ }
+ }
+
+ // Sanity checking backtrace ino against object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ InodeStore dentry;
+ build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry, symlink);
+
+ // Inject inode into the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
+
+int DataScan::cleanup()
+{
+ // We are only interested in the zeroth objects
+ return forall_objects(data_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+ r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
+ if (r < 0) {
+ dout(4) << "Error deleting accumulated metadata from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ });
+}
+
+bool DataScan::valid_ino(inodeno_t ino) const
+{
+ return (ino >= inodeno_t((1ull << 40)))
+ || (MDS_INO_IS_STRAY(ino))
+ || (MDS_INO_IS_MDSDIR(ino))
+ || ino == CEPH_INO_ROOT
+ || ino == CEPH_INO_CEPH
+ || ino == CEPH_INO_LOST_AND_FOUND;
+}
+
+int DataScan::scan_links()
+{
+ MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
+ if (!metadata_driver) {
+ derr << "Unexpected --output-dir option for scan_links" << dendl;
+ return -EINVAL;
+ }
+
+ interval_set<uint64_t> used_inos;
+ map<inodeno_t, int> remote_links;
+ map<snapid_t, SnapInfo> snaps;
+ snapid_t last_snap = 1;
+ snapid_t snaprealm_v2_since = 2;
+
+ struct link_info_t {
+ inodeno_t dirino;
+ frag_t frag;
+ string name;
+ version_t version;
+ int nlink;
+ bool is_dir;
+ map<snapid_t, SnapInfo> snaps;
+ link_info_t() : version(0), nlink(0), is_dir(false) {}
+ link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) :
+ dirino(di), frag(df), name(n),
+ version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {}
+ dirfrag_t dirfrag() const {
+ return dirfrag_t(dirino, frag);
+ }
+ };
+ map<inodeno_t, list<link_info_t>> dup_primaries;
+ map<inodeno_t, link_info_t>
bad_nlink_inos; + map injected_inos; + + map > to_remove; + + enum { + SCAN_INOS = 1, + CHECK_LINK, + }; + + for (int step = SCAN_INOS; step <= CHECK_LINK; step++) { + const librados::NObjectIterator it_end = metadata_io.nobjects_end(); + for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) { + const std::string oid = it->get_oid(); + + dout(10) << "step " << step << ": handling object " << oid << dendl; + + uint64_t dir_ino = 0; + uint64_t frag_id = 0; + int r = parse_oid(oid, &dir_ino, &frag_id); + if (r == -EINVAL) { + dout(10) << "Not a dirfrag: '" << oid << "'" << dendl; + continue; + } else { + // parse_oid can only do 0 or -EINVAL + ceph_assert(r == 0); + } + + if (!valid_ino(dir_ino)) { + dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl; + continue; + } + + std::map items; + r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items); + if (r < 0) { + derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl; + return r; + } + + for (auto& p : items) { + auto q = p.second.cbegin(); + string dname; + snapid_t last; + dentry_key_t::decode_helper(p.first, dname, last); + + if (last != CEPH_NOSNAP) { + if (last > last_snap) + last_snap = last; + continue; + } + + try { + snapid_t dnfirst; + decode(dnfirst, q); + if (dnfirst == CEPH_NOSNAP) { + dout(20) << "injected ino detected" << dendl; + } else if (dnfirst <= CEPH_MAXSNAP) { + if (dnfirst - 1 > last_snap) + last_snap = dnfirst - 1; + } + char dentry_type; + decode(dentry_type, q); + mempool::mds_co::string alternate_name; + if (dentry_type == 'I' || dentry_type == 'i') { + InodeStore inode; + if (dentry_type == 'i') { + DECODE_START(2, q); + if (struct_v >= 2) + decode(alternate_name, q); + inode.decode(q); + DECODE_FINISH(q); + } else { + inode.decode_bare(q); + } + + inodeno_t ino = inode.inode->ino; + + if (step == SCAN_INOS) { + if (used_inos.contains(ino, 1)) { + dup_primaries.emplace(std::piecewise_construct, + std::forward_as_tuple(ino), + std::forward_as_tuple()); + } else { + used_inos.insert(ino); + } + } else if (step == CHECK_LINK) { + sr_t srnode; + if (inode.snap_blob.length()) { + auto p = inode.snap_blob.cbegin(); + decode(srnode, p); + for (auto it = srnode.snaps.begin(); + it != srnode.snaps.end(); ) { + if (it->second.ino != ino || + it->second.snapid != it->first) { + srnode.snaps.erase(it++); + } else { + ++it; + } + } + if (!srnode.past_parents.empty()) { + snapid_t last = srnode.past_parents.rbegin()->first; + if (last + 1 > snaprealm_v2_since) + snaprealm_v2_since = last + 1; + } + } + if (inode.old_inodes && !inode.old_inodes->empty()) { + auto _last_snap = inode.old_inodes->rbegin()->first; + if (_last_snap > last_snap) + last_snap = _last_snap; + } + auto q = dup_primaries.find(ino); + if (q != dup_primaries.end()) { + q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode)); + q->second.back().snaps.swap(srnode.snaps); + } else { + int nlink = 0; + auto r = remote_links.find(ino); + if (r != remote_links.end()) + nlink = r->second; + if (!MDS_INO_IS_STRAY(dir_ino)) + nlink++; + if (inode.inode->nlink != nlink) { + derr << "Bad nlink on " << ino << " expected " << nlink + << " has " << inode.inode->nlink << dendl; + bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode); + bad_nlink_inos[ino].nlink = nlink; + } + snaps.insert(make_move_iterator(begin(srnode.snaps)), + make_move_iterator(end(srnode.snaps))); + } + if (dnfirst == CEPH_NOSNAP) { + injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode); + 
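// Note: dnfirst == CEPH_NOSNAP is the sentinel left by inject_linkage()
+ // (its default dnfirst argument) on dentries created by an earlier
+ // recovery pass; they are remembered here so the fix-up loop below can
+ // give them a real 'first' snapid (last_snap + 1).
+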
dout(20) << "adding " << ino << " for future processing to fix dnfirst" << dendl; + } + } + } else if (dentry_type == 'L' || dentry_type == 'l') { + inodeno_t ino; + unsigned char d_type; + CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q); + + if (step == SCAN_INOS) { + remote_links[ino]++; + } else if (step == CHECK_LINK) { + if (!used_inos.contains(ino, 1)) { + derr << "Bad remote link dentry 0x" << std::hex << dir_ino + << std::dec << "/" << dname + << ", ino " << ino << " not found" << dendl; + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + to_remove[dirfrag_t(dir_ino, frag_id)].insert(key); + } + } + } else { + derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + } catch (const buffer::error &err) { + derr << "Error decoding dentry 0x" << std::hex << dir_ino + << std::dec << "/" << dname << dendl; + return -EINVAL; + } + } + } + } + + map max_ino_map; + { + auto prev_max_ino = (uint64_t)1 << 40; + for (auto p = used_inos.begin(); p != used_inos.end(); ++p) { + auto cur_max = p.get_start() + p.get_len() - 1; + if (cur_max < prev_max_ino) + continue; // system inodes + + if ((prev_max_ino >> 40) != (cur_max >> 40)) { + unsigned rank = (prev_max_ino >> 40) - 1; + max_ino_map[rank] = prev_max_ino; + } else if ((p.get_start() >> 40) != (cur_max >> 40)) { + unsigned rank = (p.get_start() >> 40) - 1; + max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1; + } + prev_max_ino = cur_max; + } + unsigned rank = (prev_max_ino >> 40) - 1; + max_ino_map[rank] = prev_max_ino; + } + + used_inos.clear(); + + dout(10) << "processing " << dup_primaries.size() << " dup_primaries, " + << remote_links.size() << " remote_links" << dendl; + + for (auto& p : dup_primaries) { + + dout(10) << "handling dup " << p.first << dendl; + + link_info_t newest; + for (auto& q : p.second) { + if (q.version > newest.version) { + newest = q; + } else if (q.version == newest.version && + !MDS_INO_IS_STRAY(q.dirino) && + MDS_INO_IS_STRAY(newest.dirino)) { + newest = q; + } + } + + for (auto& q : p.second) { + // in the middle of dir fragmentation? 
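+ // (A dentry may legitimately show up in more than one dirfrag object
+ // while a split or merge is in flight; a duplicate with the same parent
+ // ino and name is the same logical dentry, so we only merge its snaps
+ // instead of scheduling a removal.)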
+ if (newest.dirino == q.dirino && newest.name == q.name) { + snaps.insert(make_move_iterator(begin(q.snaps)), + make_move_iterator(end(q.snaps))); + continue; + } + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str()); + dn_key.encode(key); + to_remove[q.dirfrag()].insert(key); + derr << "Remove duplicated ino 0x" << p.first << " from " + << q.dirfrag() << "/" << q.name << dendl; + } + + int nlink = 0; + auto q = remote_links.find(p.first); + if (q != remote_links.end()) + nlink = q->second; + if (!MDS_INO_IS_STRAY(newest.dirino)) + nlink++; + + if (nlink != newest.nlink) { + derr << "Bad nlink on " << p.first << " expected " << nlink + << " has " << newest.nlink << dendl; + bad_nlink_inos[p.first] = newest; + bad_nlink_inos[p.first].nlink = nlink; + } + } + dup_primaries.clear(); + remote_links.clear(); + + { + objecter->with_osdmap([&](const OSDMap& o) { + for (auto p : data_pools) { + const pg_pool_t *pi = o.get_pg_pool(p); + if (!pi) + continue; + if (pi->snap_seq > last_snap) + last_snap = pi->snap_seq; + } + }); + + if (!snaps.empty()) { + if (snaps.rbegin()->first > last_snap) + last_snap = snaps.rbegin()->first; + } + } + + dout(10) << "removing dup dentries from " << to_remove.size() << " objects" + << dendl; + + for (auto& p : to_remove) { + object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, ""); + + dout(10) << "removing dup dentries from " << p.first << dendl; + + int r = metadata_io.omap_rm_keys(frag_oid.name, p.second); + if (r != 0) { + derr << "Error removing duplicated dentries from " << p.first << dendl; + return r; + } + } + to_remove.clear(); + + dout(10) << "processing " << bad_nlink_inos.size() << " bad_nlink_inos" + << dendl; + + for (auto &p : bad_nlink_inos) { + dout(10) << "handling bad_nlink_ino " << p.first << dendl; + + InodeStore inode; + snapid_t first; + int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first); + if (r < 0) { + derr << "Unexpected error reading dentry " + << p.second.dirfrag() << "/" << p.second.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (inode.inode->ino != p.first || inode.inode->version != p.second.version) + continue; + + inode.get_inode()->nlink = p.second.nlink; + r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first); + if (r < 0) + return r; + } + + dout(10) << "processing " << injected_inos.size() << " injected_inos" + << dendl; + + for (auto &p : injected_inos) { + dout(10) << "handling injected_ino " << p.first << dendl; + + InodeStore inode; + snapid_t first; + dout(20) << " fixing linkage (dnfirst) of " << p.second.dirino << ":" << p.second.name << dendl; + int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first); + if (r < 0) { + derr << "Unexpected error reading dentry " + << p.second.dirfrag() << "/" << p.second.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (first != CEPH_NOSNAP) { + dout(20) << " ????" 
<< dendl;
+ continue;
+ }
+
+ first = last_snap + 1;
+ dout(20) << " first is now " << first << dendl;
+ r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+ if (r < 0)
+ return r;
+ }
+
+ dout(10) << "updating inotable" << dendl;
+
+ for (auto& p : max_ino_map) {
+ InoTable inotable(nullptr);
+ inotable.set_rank(p.first);
+ bool dirty = false;
+ int r = metadata_driver->load_table(&inotable);
+ if (r < 0) {
+ inotable.reset_state();
+ dirty = true;
+ }
+ if (inotable.force_consume_to(p.second))
+ dirty = true;
+ if (dirty) {
+ r = metadata_driver->save_table(&inotable);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ dout(10) << "updating snaptable" << dendl;
+
+ {
+ SnapServer snaptable;
+ snaptable.set_rank(0);
+ bool dirty = false;
+ int r = metadata_driver->load_table(&snaptable);
+ if (r < 0) {
+ snaptable.reset_state();
+ dirty = true;
+ }
+ if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
+ dirty = true;
+ if (dirty) {
+ r = metadata_driver->save_table(&snaptable);
+ if (r < 0)
+ return r;
+ }
+ }
+ return 0;
+}
+
+int DataScan::scan_frags()
+{
+ bool roots_present;
+ int r = driver->check_roots(&roots_present);
+ if (r != 0) {
+ derr << "Unexpected error checking roots: '"
+ << cpp_strerror(r) << "'" << dendl;
+ return r;
+ }
+
+ if (!roots_present) {
+ std::cerr << "Some or all system inodes are absent. Run 'init' from "
+ "one node before running 'scan_frags'" << std::endl;
+ return -EIO;
+ }
+
+ return forall_objects(metadata_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+ r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+ if (r != 0) {
+ dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+ return r;
+ }
+
+ if (obj_name_ino < (1ULL << 40)) {
+ // FIXME: we're skipping stray dirs here: if they're
+ // orphaned then we should be resetting them some other
+ // way
+ dout(10) << "Skipping system ino " << obj_name_ino << dendl;
+ return 0;
+ }
+
+ AccumulateResult accum_res;
+ inode_backtrace_t backtrace;
+
+ // Default to inherit layout (i.e.
no explicit layout on dir) which is
+ // expressed as a zeroed layout struct (see inode_t::has_layout)
+ file_layout_t loaded_layout;
+
+ int parent_r = 0;
+ bufferlist parent_bl;
+ int layout_r = 0;
+ bufferlist layout_bl;
+ bufferlist op_bl;
+
+ librados::ObjectReadOperation op;
+ op.getxattr("parent", &parent_bl, &parent_r);
+ op.getxattr("layout", &layout_bl, &layout_r);
+ r = metadata_io.operate(oid, &op, &op_bl);
+ if (r != 0 && r != -ENODATA) {
+ derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
+ return r;
+ }
+
+ if (parent_r != -ENODATA) {
+ try {
+ auto q = parent_bl.cbegin();
+ backtrace.decode(q);
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl;
+ if (!force_corrupt) {
+ return -EINVAL;
+ } else {
+ // Treat backtrace as absent: we'll inject into lost+found
+ backtrace = inode_backtrace_t();
+ }
+ }
+ }
+
+ if (layout_r != -ENODATA) {
+ try {
+ auto q = layout_bl.cbegin();
+ decode(loaded_layout, q);
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl;
+ if (!force_corrupt) {
+ return -EINVAL;
+ }
+ }
+ }
+
+ bool have_backtrace = !(backtrace.ancestors.empty());
+
+ // Sanity checking backtrace ino against object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ uint64_t fnode_version = 0;
+ fnode_t fnode;
+ r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
+ if (r == -EINVAL) {
+ derr << "Corrupt fnode on " << oid << dendl;
+ if (force_corrupt) {
+ fnode.fragstat.mtime = 0;
+ fnode.fragstat.nfiles = 1;
+ fnode.fragstat.nsubdirs = 0;
+ fnode.accounted_fragstat = fnode.fragstat;
+ } else {
+ return r;
+ }
+ }
+
+ InodeStore dentry;
+ build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
+ loaded_layout, &dentry);
+
+ // Inject inode into the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
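+
+// Editor's note (illustrative, not upstream text): taken together, the
+// phases above are normally driven in this order, with 'init' run once
+// before any parallel workers (the n/m fields of DataScan) are started;
+// exact CLI arguments may vary by release:
+//
+// cephfs-data-scan init
+// cephfs-data-scan scan_extents <data pool>
+// cephfs-data-scan scan_inodes <data pool>
+// cephfs-data-scan scan_links
+// cephfs-data-scan cleanup <data pool>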
+
+int MetadataTool::read_fnode(
+ inodeno_t ino, frag_t frag, fnode_t *fnode,
+ uint64_t *last_version)
+{
+ ceph_assert(fnode != NULL);
+
+ object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
+ bufferlist fnode_bl;
+ int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
+ *last_version = metadata_io.get_last_version();
+ if (r < 0) {
+ return r;
+ }
+
+ auto old_fnode_iter = fnode_bl.cbegin();
+ try {
+ (*fnode).decode(old_fnode_iter);
+ } catch (const buffer::error &err) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
+ const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
+{
+ ceph_assert(inode != NULL);
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+
+ std::set<std::string> keys;
+ keys.insert(key);
+ std::map<std::string, bufferlist> vals;
+ object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
+ int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
+ dout(20) << "oid=" << frag_oid.name
+ << " dname=" << dname
+ << " frag=" << frag
+ << ", r=" << r << dendl;
+ if (r < 0) {
+ return r;
+ }
+
+ if (vals.find(key) == vals.end()) {
+ dout(20) << key << " not found in result" << dendl;
+ return -ENOENT;
+ }
+
+ try {
+ auto q = vals[key].cbegin();
+ snapid_t first;
+ decode(first, q);
+ char dentry_type;
+ decode(dentry_type, q);
+ if (dentry_type == 'I' || dentry_type == 'i') {
+ if (dentry_type == 'i') {
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode->decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode->decode_bare(q);
+ }
+ } else {
+ dout(20) << "dentry type '" << dentry_type << "': cannot "
+ "read an inode out of that" << dendl;
+ return -EINVAL;
+ }
+ if (dnfirst)
+ *dnfirst = first;
+ } catch (const buffer::error &err) {
+ dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::load_table(MDSTable *table)
+{
+ object_t table_oid = table->get_object_name();
+
+ bufferlist table_bl;
+ int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
+ if (r < 0) {
+ derr << "unable to read mds table '" << table_oid.name << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ try {
+ version_t table_ver;
+ auto p = table_bl.cbegin();
+ decode(table_ver, p);
+ table->decode_state(p);
+ table->force_replay_version(table_ver);
+ } catch (const buffer::error &err) {
+ derr << "unable to decode mds table '" << table_oid.name << "': "
+ << err.what() << dendl;
+ return -EIO;
+ }
+ return 0;
+}
+
+int MetadataDriver::save_table(MDSTable *table)
+{
+ object_t table_oid = table->get_object_name();
+
+ bufferlist table_bl;
+ encode(table->get_version(), table_bl);
+ table->encode_state(table_bl);
+ int r = metadata_io.write_full(table_oid.name, table_bl);
+ if (r != 0) {
+ derr << "error updating mds table " << table_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int MetadataDriver::inject_lost_and_found(
+ inodeno_t ino, const InodeStore &dentry)
+{
+ // Create lost+found if it doesn't exist
+ bool created = false;
+ int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
+ if (r < 0) {
+ return r;
+ }
+ InodeStore lf_ino;
+ r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+
+ // To have a directory not specify a layout, give it
zeros (see + // inode_t::has_layout) + file_layout_t inherit_layout; + + // Construct LF inode + frag_info_t fragstat; + fragstat.nfiles = 1, + build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino); + + // Inject link to LF inode in the root dir + r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino); + if (r < 0) { + return r; + } + } else { + if (!(lf_ino.inode->mode & S_IFDIR)) { + derr << "lost+found exists but is not a directory!" << dendl; + // In this case we error out, and the user should do something about + // this problem. + return -EINVAL; + } + } + + r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created); + if (r < 0) { + return r; + } + + const std::string dname = lost_found_dname(ino); + + // Write dentry into lost+found dirfrag + return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry); +} + + +int MetadataDriver::get_frag_of( + inodeno_t dirino, + const std::string &target_dname, + frag_t *result_ft) +{ + object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), ""); + + dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl; + + // Find and load fragtree if existing dirfrag + // ========================================== + bool have_backtrace = false; + bufferlist parent_bl; + int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl); + if (r == -ENODATA) { + dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl; + } else if (r < 0) { + dout(4) << "Unexpected error on '" << root_frag_oid << "': " + << cpp_strerror(r) << dendl; + return r; + } + + // Deserialize backtrace + inode_backtrace_t backtrace; + if (parent_bl.length()) { + try { + auto q = parent_bl.cbegin(); + backtrace.decode(q); + have_backtrace = true; + } catch (buffer::error &e) { + dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " + << e.what() << dendl; + } + } + + if (!(have_backtrace && backtrace.ancestors.size())) { + // Can't work out fragtree without a backtrace + dout(4) << "No backtrace on '" << root_frag_oid + << "': cannot determine fragtree" << dendl; + return -ENOENT; + } + + // The parentage of dirino + const inode_backpointer_t &bp = *(backtrace.ancestors.begin()); + + // The inode of dirino's parent + const inodeno_t parent_ino = bp.dirino; + + // The dname of dirino in its parent. + const std::string &parent_dname = bp.dname; + + dout(20) << "got backtrace parent " << parent_ino << "/" + << parent_dname << dendl; + + // The primary dentry for dirino + InodeStore existing_dentry; + + // See if we can find ourselves in dirfrag zero of the parent: this + // is a fast path that avoids needing to go further up the tree + // if the parent isn't fragmented (worst case we would have to + // go all the way to the root) + r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry); + if (r >= 0) { + // Great, fast path: return the fragtree from here + if (existing_dentry.inode->ino != dirino) { + dout(4) << "Unexpected inode in dentry! 
0x" << std::hex + << existing_dentry.inode->ino + << " vs expected 0x" << dirino << std::dec << dendl; + return -ENOENT; + } + dout(20) << "fast path, fragtree is " + << existing_dentry.dirfragtree << dendl; + *result_ft = existing_dentry.pick_dirfrag(target_dname); + dout(20) << "frag is " << *result_ft << dendl; + return 0; + } else if (r != -ENOENT) { + // Dentry not present in 0th frag, must read parent's fragtree + frag_t parent_frag; + r = get_frag_of(parent_ino, parent_dname, &parent_frag); + if (r == 0) { + // We have the parent fragtree, so try again to load our dentry + r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry); + if (r >= 0) { + // Got it! + *result_ft = existing_dentry.pick_dirfrag(target_dname); + dout(20) << "resolved via parent, frag is " << *result_ft << dendl; + return 0; + } else { + if (r == -EINVAL || r == -ENOENT) { + return -ENOENT; // dentry missing or corrupt, so frag is missing + } else { + return r; + } + } + } else { + // Couldn't resolve parent fragtree, so can't find ours. + return r; + } + } else if (r == -EINVAL) { + // Unreadable dentry, can't know the fragtree. + return -ENOENT; + } else { + // Unexpected error, raise it + return r; + } +} + + +int MetadataDriver::inject_with_backtrace( + const inode_backtrace_t &backtrace, const InodeStore &dentry) + +{ + + // On dirfrags + // =========== + // In order to insert something into a directory, we first (ideally) + // need to know the fragtree for the directory. Sometimes we can't + // get that, in which case we just go ahead and insert it into + // fragment zero for a good chance of that being the right thing + // anyway (most moderate-sized dirs aren't fragmented!) + + // On ancestry + // =========== + // My immediate ancestry should be correct, so if we can find that + // directory's dirfrag then go inject it there. This works well + // in the case that this inode's dentry was somehow lost and we + // are recreating it, because the rest of the hierarchy + // will probably still exist. + // + // It's more of a "better than nothing" approach when rebuilding + // a whole tree, as backtraces will in general not be up to date + // beyond the first parent, if anything in the trace was ever + // moved after the file was created. + + // On inode numbers + // ================ + // The backtrace tells us inodes for each of the parents. If we are + // creating those parent dirfrags, then there is a risk that somehow + // the inode indicated here was also used for data (not a dirfrag) at + // some stage. That would be a zany situation, and we don't check + // for it here, because to do so would require extra IOs for everything + // we inject, and anyway wouldn't guarantee that the inode number + // wasn't in use in some dentry elsewhere in the metadata tree that + // just happened not to have any data objects. + + // On multiple workers touching the same traces + // ============================================ + // When creating linkage for a directory, *only* create it if we are + // also creating the object. That way, we might not manage to get the + // *right* linkage for a directory, but at least we won't multiply link + // it. We assume that if a root dirfrag exists for a directory, then + // it is linked somewhere (i.e. that the metadata pool is not already + // inconsistent). + // + // Making sure *that* is true is someone else's job! Probably someone + // who is not going to run in parallel, so that they can self-consistently + // look at versions and move things around as they go. 
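+ // Illustrative walk-through (hypothetical path a/b/c): we first link 'c'
+ // into b's dirfrag, creating that dirfrag only if it is missing; only when
+ // we had to create it do we continue upwards and link 'b' into 'a', and so
+ // on, stopping as soon as we reach a dirfrag that already existed.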
+ // Note this isn't 100% safe: if we die immediately after creating dirfrag + // object, next run will fail to create linkage for the dirfrag object + // and leave it orphaned. + + inodeno_t ino = backtrace.ino; + dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl; + for (std::vector::const_iterator i = backtrace.ancestors.begin(); + i != backtrace.ancestors.end(); ++i) { + const inode_backpointer_t &backptr = *i; + dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec + << "/" << backptr.dname << dendl; + + // Examine root dirfrag for parent + const inodeno_t parent_ino = backptr.dirino; + const std::string dname = backptr.dname; + + frag_t fragment; + int r = get_frag_of(parent_ino, dname, &fragment); + if (r == -ENOENT) { + // Don't know fragment, fall back to assuming root + dout(20) << "don't know fragment for 0x" << std::hex << + parent_ino << std::dec << "/" << dname << ", will insert to root" + << dendl; + } + + // Find or create dirfrag + // ====================== + bool created_dirfrag; + r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag); + if (r < 0) { + return r; + } + + // Check if dentry already exists + // ============================== + InodeStore existing_dentry; + r = read_dentry(parent_ino, fragment, dname, &existing_dentry); + bool write_dentry = false; + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + // Missing or corrupt dentry + write_dentry = true; + } else if (r < 0) { + derr << "Unexpected error reading dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << ": " << cpp_strerror(r) << dendl; + break; + } else { + // Dentry already present, does it link to me? + if (existing_dentry.inode->ino == ino) { + dout(20) << "Dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << " already exists and points to me" << dendl; + } else { + derr << "Dentry 0x" << std::hex + << parent_ino << std::dec << "/" + << dname << " already exists but points to 0x" + << std::hex << existing_dentry.inode->ino << std::dec << dendl; + // Fall back to lost+found! + return inject_lost_and_found(backtrace.ino, dentry); + } + } + + // Inject linkage + // ============== + + if (write_dentry) { + if (i == backtrace.ancestors.begin()) { + // This is the linkage for the file of interest + dout(10) << "Linking inode 0x" << std::hex << ino + << " at 0x" << parent_ino << "/" << dname << std::dec + << " with size=" << dentry.inode->size << " bytes" << dendl; + + /* NOTE: dnfirst fixed in scan_links */ + r = inject_linkage(parent_ino, dname, fragment, dentry); + } else { + // This is the linkage for an ancestor directory + dout(10) << "Linking ancestor directory of inode 0x" << std::hex << ino + << " at 0x" << std::hex << parent_ino + << ":" << dname << dendl; + + InodeStore ancestor_dentry; + auto inode = ancestor_dentry.get_inode(); + inode->mode = 0755 | S_IFDIR; + + // Set nfiles to something non-zero, to fool any other code + // that tries to ignore 'empty' directories. This won't be + // accurate, but it should avoid functional issues. 
+ + inode->dirstat.nfiles = 1; + inode->dir_layout.dl_dir_hash = + g_conf()->mds_default_dir_hash; + + inode->nlink = 1; + inode->ino = ino; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; + inode->version = 1; + inode->backtrace_version = 1; + /* NOTE: dnfirst fixed in scan_links */ + r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry); + } + + if (r < 0) { + return r; + } + } + + if (!created_dirfrag) { + // If the parent dirfrag already existed, then stop traversing the + // backtrace: assume that the other ancestors already exist too. This + // is an assumption rather than a truth, but it's a convenient way + // to avoid the risk of creating multiply-linked directories while + // injecting data. If there are in fact missing ancestors, this + // should be fixed up using a separate tool scanning the metadata + // pool. + break; + } else { + // Proceed up the backtrace, creating parents + ino = parent_ino; + } + } + + return 0; +} + +int MetadataDriver::find_or_create_dirfrag( + inodeno_t ino, + frag_t fragment, + bool *created) +{ + ceph_assert(created != NULL); + + fnode_t existing_fnode; + *created = false; + + uint64_t read_version = 0; + int r = read_fnode(ino, fragment, &existing_fnode, &read_version); + dout(10) << "read_version = " << read_version << dendl; + + if (r == -ENOENT || r == -EINVAL) { + if (r == -EINVAL && !force_corrupt) { + return r; + } + + // Missing or corrupt fnode, create afresh + bufferlist fnode_bl; + fnode_t blank_fnode; + blank_fnode.version = 1; + // mark it as non-empty + blank_fnode.fragstat.nfiles = 1; + blank_fnode.accounted_fragstat = blank_fnode.fragstat; + blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS); + blank_fnode.encode(fnode_bl); + + + librados::ObjectWriteOperation op; + + if (read_version) { + ceph_assert(r == -EINVAL); + // Case A: We must assert that the version isn't changed since we saw the object + // was unreadable, to avoid the possibility of two data-scan processes + // both creating the frag. + op.assert_version(read_version); + } else { + ceph_assert(r == -ENOENT); + // Case B: The object didn't exist in read_fnode, so while creating it we must + // use an exclusive create to correctly populate *creating with + // whether we created it ourselves or someone beat us to it. 
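+ // (In librados terms: in case A a changed object version makes
+ // assert_version fail the op with -EOVERFLOW, and in case B an exclusive
+ // create fails with -EEXIST when another worker won the race; both
+ // outcomes are treated as benign below.)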
+ op.create(true); + } + + object_t frag_oid = InodeStore::get_object_name(ino, fragment, ""); + op.omap_set_header(fnode_bl); + r = metadata_io.operate(frag_oid.name, &op); + if (r == -EOVERFLOW || r == -EEXIST) { + // Someone else wrote it (see case A above) + dout(10) << "Dirfrag creation race: 0x" << std::hex + << ino << " " << fragment << std::dec << dendl; + *created = false; + return 0; + } else if (r < 0) { + // We were unable to create or write it, error out + derr << "Failed to create dirfrag 0x" << std::hex + << ino << std::dec << ": " << cpp_strerror(r) << dendl; + return r; + } else { + // Success: the dirfrag object now exists with a value header + dout(10) << "Created dirfrag: 0x" << std::hex + << ino << std::dec << dendl; + *created = true; + } + } else if (r < 0) { + derr << "Unexpected error reading dirfrag 0x" << std::hex + << ino << std::dec << " : " << cpp_strerror(r) << dendl; + return r; + } else { + dout(20) << "Dirfrag already exists: 0x" << std::hex + << ino << " " << fragment << std::dec << dendl; + } + + return 0; +} + +int MetadataDriver::inject_linkage( + inodeno_t dir_ino, const std::string &dname, + const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst) +{ + object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, ""); + + std::string key; + dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str()); + dn_key.encode(key); + + bufferlist dentry_bl; + encode(dnfirst, dentry_bl); + encode('I', dentry_bl); + inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + // Write out + std::map vals; + vals[key] = dentry_bl; + int r = metadata_io.omap_set(frag_oid.name, vals); + if (r != 0) { + derr << "Error writing dentry 0x" << std::hex + << dir_ino << std::dec << "/" + << dname << ": " << cpp_strerror(r) << dendl; + return r; + } else { + dout(20) << "Injected dentry 0x" << std::hex + << dir_ino << "/" << dname << " pointing to 0x" + << inode.inode->ino << std::dec << dendl; + return 0; + } +} + + +int MetadataDriver::init( + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) +{ + if (metadata_pool_name.empty()) { + auto fs = fsmap->get_filesystem(fscid); + ceph_assert(fs != nullptr); + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + derr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" 
<< dendl; + return r; + } + dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl; + } else { + dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl; + } + return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); +} + +int LocalFileDriver::init( + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) +{ + return 0; +} + +int LocalFileDriver::inject_data( + const std::string &file_path, + uint64_t size, + uint32_t chunk_size, + inodeno_t ino) +{ + // Scrape the file contents out of the data pool and into the + // local filesystem + std::fstream f; + f.open(file_path.c_str(), std::fstream::out | std::fstream::binary); + + for (uint64_t offset = 0; offset < size; offset += chunk_size) { + bufferlist bl; + + char buf[32]; + snprintf(buf, sizeof(buf), + "%llx.%08llx", + (unsigned long long)ino, + (unsigned long long)(offset / chunk_size)); + std::string oid(buf); + + int r = data_io.read(oid, bl, chunk_size, 0); + + if (r <= 0 && r != -ENOENT) { + derr << "error reading data object '" << oid << "': " + << cpp_strerror(r) << dendl; + f.close(); + return r; + } else if (r >=0) { + + f.seekp(offset); + bl.write_stream(f); + } + } + f.close(); + + return 0; +} + + +int LocalFileDriver::inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) +{ + std::string path_builder = path; + + // Iterate through backtrace creating directory parents + std::vector::const_reverse_iterator i; + for (i = bt.ancestors.rbegin(); + i != bt.ancestors.rend(); ++i) { + + const inode_backpointer_t &backptr = *i; + path_builder += "/"; + path_builder += backptr.dname; + + // Last entry is the filename itself + bool is_file = (i + 1 == bt.ancestors.rend()); + if (is_file) { + // FIXME: inject_data won't cope with interesting (i.e. striped) + // layouts (need a librados-compatible Filer to read these) + inject_data(path_builder, dentry.inode->size, + dentry.inode->layout.object_size, bt.ino); + } else { + int r = mkdir(path_builder.c_str(), 0755); + if (r != 0 && r != -EPERM) { + derr << "error creating directory: '" << path_builder << "': " + << cpp_strerror(r) << dendl; + return r; + } + } + } + + return 0; +} + +int LocalFileDriver::inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) +{ + std::string lf_path = path + "/lost+found"; + int r = mkdir(lf_path.c_str(), 0755); + if (r != 0 && r != -EPERM) { + derr << "error creating directory: '" << lf_path << "': " + << cpp_strerror(r) << dendl; + return r; + } + + std::string file_path = lf_path + "/" + lost_found_dname(ino); + return inject_data(file_path, dentry.inode->size, + dentry.inode->layout.object_size, ino); +} + +int LocalFileDriver::init_roots(int64_t data_pool_id) +{ + // Ensure that the path exists and is a directory + bool exists; + int r = check_roots(&exists); + if (r != 0) { + return r; + } + + if (exists) { + return 0; + } else { + return ::mkdir(path.c_str(), 0755); + } +} + +int LocalFileDriver::check_roots(bool *result) +{ + // Check if the path exists and is a directory + DIR *d = ::opendir(path.c_str()); + if (d == NULL) { + *result = false; + } else { + int r = closedir(d); + if (r != 0) { + // Weird, but maybe possible with e.g. stale FD on NFS mount? 
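+ // (Treating a failed closedir as "path absent" is the conservative
+ // choice: the caller, init_roots(), will then simply try to mkdir it.)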
+ *result = false; + } else { + *result = true; + } + } + + return 0; +} + +void MetadataTool::build_file_dentry( + inodeno_t ino, uint64_t file_size, time_t file_mtime, + const file_layout_t &layout, InodeStore *out, std::string symlink) +{ + ceph_assert(out != NULL); + + auto inode = out->get_inode(); + if(!symlink.empty()) { + inode->mode = 0777 | S_IFLNK; + out->symlink = symlink; + } + else { + inode->mode = 0500 | S_IFREG; + } + + inode->size = file_size; + inode->max_size_ever = file_size; + inode->mtime.tv.tv_sec = file_mtime; + inode->atime.tv.tv_sec = file_mtime; + inode->ctime.tv.tv_sec = file_mtime; + + inode->layout = layout; + + inode->truncate_seq = 1; + inode->truncate_size = -1ull; + + inode->inline_data.version = CEPH_INLINE_NONE; + + inode->nlink = 1; + inode->ino = ino; + inode->version = 1; + inode->backtrace_version = 1; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; +} + +void MetadataTool::build_dir_dentry( + inodeno_t ino, const frag_info_t &fragstat, + const file_layout_t &layout, InodeStore *out) +{ + ceph_assert(out != NULL); + + auto inode = out->get_inode(); + inode->mode = 0755 | S_IFDIR; + inode->dirstat = fragstat; + inode->mtime.tv.tv_sec = fragstat.mtime; + inode->atime.tv.tv_sec = fragstat.mtime; + inode->ctime.tv.tv_sec = fragstat.mtime; + + inode->layout = layout; + inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash; + + inode->truncate_seq = 1; + inode->truncate_size = -1ull; + + inode->inline_data.version = CEPH_INLINE_NONE; + + inode->nlink = 1; + inode->ino = ino; + inode->version = 1; + inode->backtrace_version = 1; + inode->uid = g_conf()->mds_root_ino_uid; + inode->gid = g_conf()->mds_root_ino_gid; +} + diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h new file mode 100644 index 000000000..4b8f34bf6 --- /dev/null +++ b/src/tools/cephfs/DataScan.h @@ -0,0 +1,344 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "MDSUtility.h" +#include "include/rados/librados.hpp" + +class InodeStore; +class MDSTable; + +class RecoveryDriver { + protected: + // If true, overwrite structures that generate decoding errors. + bool force_corrupt; + + // If true, overwrite root objects during init_roots even if they + // exist + bool force_init; + + public: + virtual int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) = 0; + + void set_force_corrupt(const bool val) + { + force_corrupt = val; + } + + void set_force_init(const bool val) + { + force_init = val; + } + + + /** + * Inject an inode + dentry parents into the metadata pool, + * based on a backtrace recovered from the data pool + */ + virtual int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) = 0; + + /** + * Inject an inode + dentry into the lost+found directory, + * when all we know about a file is its inode. + */ + virtual int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) = 0; + + /** + * Create any missing roots (i.e. 
mydir, strays, root inode) + */ + virtual int init_roots( + int64_t data_pool_id) = 0; + + /** + * Pre-injection check that all the roots are present in + * the metadata pool. Used to avoid parallel workers interfering + * with one another, by cueing the user to go run 'init' on a + * single node before running a parallel scan. + * + * @param result: set to true if roots are present, else set to false + * @returns 0 on no unexpected errors, else error code. Missing objects + * are not considered an unexpected error: check *result for + * this case. + */ + virtual int check_roots(bool *result) = 0; + + /** + * Helper to compose dnames for links to lost+found + * inodes. + */ + std::string lost_found_dname(inodeno_t ino) + { + char s[20]; + snprintf(s, sizeof(s), "%llx", (unsigned long long)ino); + return std::string(s); + } + + RecoveryDriver() + : force_corrupt(false), + force_init(false) + {} + + virtual ~RecoveryDriver() {} +}; + +class LocalFileDriver : public RecoveryDriver +{ + protected: + const std::string path; + librados::IoCtx &data_io; + + int inject_data( + const std::string &file_path, + uint64_t size, + uint32_t chunk_size, + inodeno_t ino); + public: + + LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_) + : RecoveryDriver(), path(path_), data_io(data_io_) + {} + + // Implement RecoveryDriver interface + int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) override; + + int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) override; + + int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) override; + + int init_roots(int64_t data_pool_id) override; + + int check_roots(bool *result) override; +}; + +/** + * A class that knows how to work with objects in a CephFS + * metadata pool. + */ +class MetadataTool +{ + protected: + + librados::IoCtx metadata_io; + + /** + * Construct a synthetic InodeStore for a normal file + */ + void build_file_dentry( + inodeno_t ino, uint64_t file_size, time_t file_mtime, + const file_layout_t &layout, + InodeStore *out, + std::string symlink); + + /** + * Construct a synthetic InodeStore for a directory + */ + void build_dir_dentry( + inodeno_t ino, + const frag_info_t &fragstat, + const file_layout_t &layout, + InodeStore *out); + + /** + * Try and read an fnode from a dirfrag + */ + int read_fnode(inodeno_t ino, frag_t frag, + fnode_t *fnode, uint64_t *read_version); + + /** + * Try and read a dentry from a dirfrag + */ + int read_dentry(inodeno_t parent_ino, frag_t frag, + const std::string &dname, InodeStore *inode, snapid_t *dnfirst=nullptr); +}; + +/** + * A class that knows how to manipulate CephFS metadata pools + */ +class MetadataDriver : public RecoveryDriver, public MetadataTool +{ + protected: + /** + * Create a .inode object, i.e. root or mydir + */ + int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id); + + /** + * Check for existence of .inode objects, before + * trying to go ahead and inject metadata. + */ + int root_exists(inodeno_t ino, bool *result); + int find_or_create_dirfrag( + inodeno_t ino, + frag_t fragment, + bool *created); + + + /** + * Work out which fragment of a directory should contain a named + * dentry, recursing up the trace as necessary to retrieve + * fragtrees. 
+ */ + int get_frag_of( + inodeno_t dirino, + const std::string &dname, + frag_t *result_ft); + + public: + + // Implement RecoveryDriver interface + int init( + librados::Rados &rados, + std::string &metadata_pool_name, + const FSMap *fsmap, + fs_cluster_id_t fscid) override; + + int inject_linkage( + inodeno_t dir_ino, const std::string &dname, + const frag_t fragment, const InodeStore &inode, snapid_t dnfirst=CEPH_NOSNAP); + + int inject_with_backtrace( + const inode_backtrace_t &bt, + const InodeStore &dentry) override; + + int inject_lost_and_found( + inodeno_t ino, + const InodeStore &dentry) override; + + int init_roots(int64_t data_pool_id) override; + + int check_roots(bool *result) override; + + int load_table(MDSTable *table); + int save_table(MDSTable *table); +}; + +class DataScan : public MDSUtility, public MetadataTool +{ + protected: + RecoveryDriver *driver; + fs_cluster_id_t fscid; + + std::string metadata_pool_name; + std::vector data_pools; + + // IoCtx for data pool (where we scrape file backtraces from) + librados::IoCtx data_io; + // Remember the data pool ID for use in layouts + int64_t data_pool_id; + // IoCtxs for extra data pools + std::vector extra_data_ios; + + uint32_t n; + uint32_t m; + + /** + * Scan data pool for backtraces, and inject inodes to metadata pool + */ + int scan_inodes(); + + /** + * Scan data pool for file sizes and mtimes + */ + int scan_extents(); + + /** + * Scan metadata pool for 0th dirfrags to link orphaned + * directory inodes. + */ + int scan_frags(); + + /** + * Cleanup xattrs from data pool + */ + int cleanup(); + + /** + * Check if an inode number is in the permitted ranges + */ + bool valid_ino(inodeno_t ino) const; + + + int scan_links(); + + // Accept pools which are not in the FSMap + bool force_pool; + // Respond to decode errors by overwriting + bool force_corrupt; + // Overwrite root objects even if they exist + bool force_init; + // Only scan inodes without this scrub tag + std::string filter_tag; + + /** + * @param r set to error on valid key with invalid value + * @return true if argument consumed, else false + */ + bool parse_kwarg( + const std::vector &args, + std::vector::const_iterator &i, + int *r); + + /** + * @return true if argument consumed, else false + */ + bool parse_arg( + const std::vector &arg, + std::vector::const_iterator &i); + + int probe_filter(librados::IoCtx &ioctx); + + /** + * Apply a function to all objects in an ioctx's pool, optionally + * restricted to only those objects with a 00000000 offset and + * no tag matching DataScan::scrub_tag. + */ + int forall_objects( + librados::IoCtx &ioctx, + bool untagged_only, + std::function handler); + + public: + static void usage(); + int main(const std::vector &args); + + DataScan() + : driver(NULL), fscid(FS_CLUSTER_ID_NONE), + data_pool_id(-1), n(0), m(1), + force_pool(false), force_corrupt(false), + force_init(false) + { + } + + ~DataScan() override + { + delete driver; + } +}; + diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc new file mode 100644 index 000000000..68a190182 --- /dev/null +++ b/src/tools/cephfs/Dumper.cc @@ -0,0 +1,433 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#ifndef _BACKWARD_BACKWARD_WARNING_H +#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* +#endif + +#include "include/compat.h" +#include "include/fs_types.h" +#include "common/entity_name.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/JournalPointer.h" +#include "osdc/Journaler.h" +#include "mon/MonClient.h" + +#include "Dumper.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +#define HEADER_LEN 4096 + +using namespace std; + +int Dumper::init(mds_role_t role_, const std::string &type) +{ + role = role_; + + int r = MDSUtility::init(); + if (r < 0) { + return r; + } + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + if (type == "mdlog") { + JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); + int jp_load_result = jp.load(objecter); + if (jp_load_result != 0) { + std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl; + return jp_load_result; + } else { + ino = jp.front; + } + } else if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + role.rank; + } else { + ceph_abort(); // should not get here + } + return 0; +} + + +int Dumper::recover_journal(Journaler *journaler) +{ + C_SaferCond cond; + lock.lock(); + journaler->recover(&cond); + lock.unlock(); + const int r = cond.wait(); + + if (r < 0) { // Error + derr << "error on recovery: " << cpp_strerror(r) << dendl; + return r; + } else { + dout(10) << "completed journal recovery" << dendl; + return 0; + } +} + + +int Dumper::dump(const char *dump_file) +{ + int r = 0; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, + &finisher); + r = recover_journal(&journaler); + if (r) { + return r; + } + uint64_t start = journaler.get_read_pos(); + uint64_t end = journaler.get_write_pos(); + uint64_t len = end-start; + + Filer filer(objecter, &finisher); + + cout << "journal is " << start << "~" << len << std::endl; + + int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644); + if (fd >= 0) { + // include an informative header + uuid_d fsid = monc->get_fsid(); + char fsid_str[40]; + fsid.print(fsid_str); + char buf[HEADER_LEN]; + memset(buf, 0, sizeof(buf)); + snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\ + length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\ + trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\ + object_size %lu (0x%lx)\n fsid %s\n%c", + role.rank, + (unsigned long long)start, (unsigned long long)start, + (unsigned long long)len, (unsigned long long)len, + (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos, + (unsigned long long)journaler.last_committed.stream_format, + (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos, + (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit, + (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count, + (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size, + fsid_str, + 4); + r = safe_write(fd, buf, sizeof(buf)); + if 
(r) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl; + ::close(fd); + return r; + } + + // write the data + off64_t seeked = ::lseek64(fd, start, SEEK_SET); + if (seeked == (off64_t)-1) { + r = errno; + derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl; + ::close(fd); + return r; + } + + + // Read and write 32MB chunks. Slower than it could be because we're not + // streaming, but that's okay because this is just a debug/disaster tool. + const uint32_t chunk_size = 32 * 1024 * 1024; + + for (uint64_t pos = start; pos < start + len; pos += chunk_size) { + bufferlist bl; + dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl; + + const uint32_t read_size = std::min(chunk_size, end - pos); + + C_SaferCond cond; + lock.lock(); + filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP, + pos, read_size, &bl, 0, &cond); + lock.unlock(); + r = cond.wait(); + if (r < 0) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") reading " + "journal at offset 0x" << std::hex << pos << std::dec << dendl; + ::close(fd); + return r; + } + dout(10) << "Got 0x" << std::hex << bl.length() << std::dec + << " bytes" << dendl; + + r = bl.write_fd(fd); + if (r) { + derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl; + ::close(fd); + return r; + } + } + + r = ::close(fd); + if (r) { + r = errno; + derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl; + return r; + } + + cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n" + << "NOTE: this is a _sparse_ file; you can\n" + << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n" + << " to efficiently compress it while preserving sparseness." 
<< std::endl; + return 0; + } else { + int err = errno; + derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl; + return err; + } +} + +int Dumper::undump(const char *dump_file, bool force) +{ + cout << "undump " << dump_file << std::endl; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + int r = 0; + // try to get layout info from the cluster + Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, + &finisher); + int recovered = recover_journal(&journaler); + if (recovered != 0) { + derr << "recover_journal failed, trying to get header from dump file" << dendl; + } + + int fd = ::open(dump_file, O_RDONLY|O_BINARY); + if (fd < 0) { + r = errno; + derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Ceph mds0 journal dump + // start offset 232401996 (0xdda2c4c) + // length 1097504 (0x10bf20) + + char buf[HEADER_LEN]; + r = safe_read(fd, buf, sizeof(buf)); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return r; + } + + long long unsigned start, len, write_pos, format, trimmed_pos; + long unsigned stripe_unit, stripe_count, object_size; + sscanf(strstr(buf, "start offset"), "start offset %llu", &start); + sscanf(strstr(buf, "length"), "length %llu", &len); + sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos); + sscanf(strstr(buf, "format"), "format %llu", &format); + + if (!force) { + // need to check if the fsid matches the online cluster fsid + if (strstr(buf, "fsid")) { + uuid_d fsid; + char fsid_str[40]; + sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str); + r = fsid.parse(fsid_str); + if (!r) { + derr << "Invalid fsid" << dendl; + ::close(fd); + return -EINVAL; + } + + if (fsid != monc->get_fsid()) { + derr << "Imported journal fsid does not match online cluster fsid" << dendl; + derr << "Use --force to skip fsid check" << dendl; + ::close(fd); + return -EINVAL; + } + } else { + derr << "Invalid header, no fsid embedded" << dendl; + ::close(fd); + return -EINVAL; + } + } + + if (recovered == 0) { + stripe_unit = journaler.last_committed.layout.stripe_unit; + stripe_count = journaler.last_committed.layout.stripe_count; + object_size = journaler.last_committed.layout.object_size; + } else { + // try to get the layout from the dump file header; if that fails, fall back to the default layout + if (strstr(buf, "stripe_unit")) { + sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit); + } else { + stripe_unit = file_layout_t::get_default().stripe_unit; + } + if (strstr(buf, "stripe_count")) { + sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count); + } else { + stripe_count = file_layout_t::get_default().stripe_count; + } + if (strstr(buf, "object_size")) { + sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size); + } else { + object_size = file_layout_t::get_default().object_size; + } + } + + if (strstr(buf, "trimmed_pos")) { + sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos); + } else { + // Old format dump, any untrimmed objects before expire_pos will + // be discarded as trash.
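+ // Illustrative arithmetic (not a value read from any cluster): with the default 4 MiB object_size (0x400000) and the example header above (start offset 0xdda2c4c), the rounding below yields 0xdda2c4c - (0xdda2c4c % 0x400000) = 0xdc00000, the containing object boundary.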
+ trimmed_pos = start - (start % object_size); + } + + if (trimmed_pos > start) { + derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos + << " > expire 0x" << start << std::dec << dendl; + ::close(fd); + return -EINVAL; + } + + if (start > write_pos) { + derr << std::hex << "Invalid header (expire 0x" << start + << " > write 0x" << write_pos << std::dec << dendl; + ::close(fd); + return -EINVAL; + } + + cout << "start " << start << + " len " << len << + " write_pos " << write_pos << + " format " << format << + " trimmed_pos " << trimmed_pos << + " stripe_unit " << stripe_unit << + " stripe_count " << stripe_count << + " object_size " << object_size << std::endl; + + Journaler::Header h; + h.trimmed_pos = trimmed_pos; + h.expire_pos = start; + h.write_pos = write_pos; + h.stream_format = format; + h.magic = CEPH_FS_ONDISK_MAGIC; + + h.layout.stripe_unit = stripe_unit; + h.layout.stripe_count = stripe_count; + h.layout.object_size = object_size; + h.layout.pool_id = fs->mds_map.get_metadata_pool(); + + bufferlist hbl; + encode(h, hbl); + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(fs->mds_map.get_metadata_pool()); + SnapContext snapc; + + cout << "writing header " << oid << std::endl; + C_SaferCond header_cond; + lock.lock(); + objecter->write_full(oid, oloc, snapc, hbl, + ceph::real_clock::now(), 0, + &header_cond); + lock.unlock(); + + r = header_cond.wait(); + if (r != 0) { + derr << "Failed to write header: " << cpp_strerror(r) << dendl; + ::close(fd); + return r; + } + + Filer filer(objecter, &finisher); + + /* Erase any objects at the end of the region to which we shall write + * the new log data. This is to avoid leaving trailing junk after + * the newly written data. Any junk more than one object ahead + * will be taken care of during normal operation by Journaler's + * prezeroing behaviour */ + { + uint32_t const object_size = h.layout.object_size; + ceph_assert(object_size > 0); + uint64_t last_obj = h.write_pos / object_size; + uint64_t purge_count = 2; + /* When the length is zero, the last_obj should be zeroed + * from the offset determined by the new write_pos instead of being purged. + */ + if (!len) { + purge_count = 1; + ++last_obj; + } + C_SaferCond purge_cond; + cout << "Purging " << purge_count << " objects from " << last_obj << std::endl; + lock.lock(); + filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count, + ceph::real_clock::now(), 0, &purge_cond); + lock.unlock(); + purge_cond.wait(); + } + /* When the length is zero, zero the last object + * from the offset determined by the new write_pos. + */ + if (!len) { + uint64_t offset_in_obj = h.write_pos % h.layout.object_size; + uint64_t len = h.layout.object_size - offset_in_obj; + C_SaferCond zero_cond; + cout << "Zeroing " << len << " bytes in the last object." 
<< std::endl; + + lock.lock(); + filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond); + lock.unlock(); + zero_cond.wait(); + } + + // Stream from `fd` to `filer` + uint64_t pos = start; + uint64_t left = len; + while (left > 0) { + // Read + bufferlist j; + lseek64(fd, pos, SEEK_SET); + uint64_t l = std::min<uint64_t>(left, 1024*1024); + j.read_fd(fd, l); + + // Write + cout << " writing " << pos << "~" << l << std::endl; + C_SaferCond write_cond; + lock.lock(); + filer.write(ino, &h.layout, snapc, pos, l, j, + ceph::real_clock::now(), 0, &write_cond); + lock.unlock(); + + r = write_cond.wait(); + if (r != 0) { + derr << "Failed to write journal data: " << cpp_strerror(r) << dendl; + ::close(fd); + return r; + } + + // Advance + pos += l; + left -= l; + } + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + cout << "done." << std::endl; + return 0; +} + diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h new file mode 100644 index 000000000..758f3cdea --- /dev/null +++ b/src/tools/cephfs/Dumper.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef JOURNAL_DUMPER_H_ +#define JOURNAL_DUMPER_H_ + + +#include "MDSUtility.h" + +class Journaler; + +/** + * This class lets you dump out an MDS journal for troubleshooting. + * + * It was built to work with cmds, so some of the design choices are arbitrary. + * To use, create a Dumper, call init(), and then call dump() with the name + * of the file to dump to. + */ + +class Dumper : public MDSUtility { +private: + mds_role_t role; + inodeno_t ino; + +public: + Dumper() : ino(-1) + {} + + int init(mds_role_t role_, const std::string &type); + int recover_journal(Journaler *journaler); + int dump(const char *dumpfile); + int undump(const char *dumpfile, bool force); +}; + +#endif /* JOURNAL_DUMPER_H_ */ diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc new file mode 100644 index 000000000..8cb235a82 --- /dev/null +++ b/src/tools/cephfs/EventOutput.cc @@ -0,0 +1,153 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying.
+ */ + + +#include <iostream> +#include <fstream> + +#include "common/errno.h" +#include "mds/mdstypes.h" +#include "mds/events/EUpdate.h" +#include "mds/LogEvent.h" +#include "JournalScanner.h" + +#include "EventOutput.h" + + +int EventOutput::binary() const +{ + // Binary output, files + int r = ::mkdir(path.c_str(), 0755); + if (r != 0) { + r = -errno; + if (r != -EEXIST) { + std::cerr << "Error creating output directory: " << cpp_strerror(r) << std::endl; + return r; + } + } + + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + bufferlist bin; + std::stringstream filename; + if (auto& le = i->second.log_event; le) { + le->encode(bin, CEPH_FEATURES_SUPPORTED_DEFAULT); + filename << "0x" << std::hex << i->first << std::dec << "_" << le->get_type_str() << ".bin"; + } else if (auto& pi = i->second.pi; pi) { + pi->encode(bin); + filename << "0x" << std::hex << i->first << std::dec << "_" << pi->get_type_str() << ".bin"; + } + + std::string const file_path = path + std::string("/") + filename.str(); + std::ofstream bin_file(file_path.c_str(), std::ofstream::out | std::ofstream::binary); + bin.write_stream(bin_file); + bin_file.close(); + if (bin_file.fail()) { + return -EIO; + } + } + std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl; + + return 0; +} + +int EventOutput::json() const +{ + JSONFormatter jf(true); + std::ofstream out_file(path.c_str(), std::ofstream::out); + jf.open_array_section("journal"); + { + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + if (auto& le = i->second.log_event; le) { + jf.open_object_section("log_event"); + le->dump(&jf); + jf.close_section(); // log_event + } else if (auto& pi = i->second.pi; pi) { + jf.open_object_section("purge_action"); + pi->dump(&jf); + jf.close_section(); + } + } + } + jf.close_section(); // journal + jf.flush(out_file); + out_file.close(); + + if (out_file.fail()) { + return -EIO; + } else { + std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl; + return 0; + } +} + +void EventOutput::list() const +{ + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + if (auto& le = i->second.log_event; le) { + std::vector<std::string> ev_paths; + EMetaBlob const *emb = le->get_metablob(); + if (emb) { + emb->get_paths(ev_paths); + } + + std::string detail; + if (le->get_type() == EVENT_UPDATE) { + auto& eu = reinterpret_cast<EUpdate&>(*le); + detail = eu.type; + } + + std::cout << le->get_stamp() << " 0x" + << std::hex << i->first << std::dec << " " + << le->get_type_str() << ": " + << " (" << detail << ")" << std::endl; + for (std::vector<std::string>::iterator i = ev_paths.begin(); i != ev_paths.end(); ++i) { + std::cout << " " << *i << std::endl; + } + } else if (auto& pi = i->second.pi; pi) { + std::cout << pi->stamp << " 0x" + << std::hex << i->first << std::dec << " " + << pi->get_type_str() << std::endl; + } + } +} + +void EventOutput::summary() const +{ + std::map<std::string, int> type_count; + for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) { + std::string type; + if (auto& le = i->second.log_event; le) + type = le->get_type_str(); + else if (auto& pi = i->second.pi; pi) + type = pi->get_type_str(); + if (type_count.count(type) == 0) { + type_count[type] = 0; + } + type_count[type] += 1; + } + + std::cout << "Events by type:" << std::endl; + for (std::map<std::string, int>::iterator i = type_count.begin(); i != type_count.end(); ++i) { + std::cout << " " <<
i->first << ": " << i->second << std::endl; + } + + std::cout << "Errors: " << scan.errors.size() << std::endl; + if (!scan.errors.empty()) { + for (JournalScanner::ErrorMap::const_iterator i = scan.errors.begin(); + i != scan.errors.end(); ++i) { + std::cout << " 0x" << std::hex << i->first << std::dec + << ": " << i->second.r << " " + << i->second.description << std::endl; + } + } +} diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h new file mode 100644 index 000000000..65d968409 --- /dev/null +++ b/src/tools/cephfs/EventOutput.h @@ -0,0 +1,42 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#ifndef EVENT_OUTPUT_H +#define EVENT_OUTPUT_H + +#include <string> + +class JournalScanner; + +/** + * Different output formats for the results of a journal scan + */ +class EventOutput +{ + private: + JournalScanner const &scan; + std::string const path; + + public: + EventOutput(JournalScanner const &scan_, std::string const &path_) + : scan(scan_), path(path_) {} + + void summary() const; + void list() const; + int json() const; + int binary() const; +}; + +#endif // EVENT_OUTPUT_H + diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc new file mode 100644 index 000000000..3a5e781a2 --- /dev/null +++ b/src/tools/cephfs/JournalFilter.cc @@ -0,0 +1,316 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include "JournalFilter.h" + +#include "common/ceph_argparse.h" + +#include "mds/events/ESession.h" +#include "mds/events/EUpdate.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +using namespace std; + +const string JournalFilter::range_separator(".."); + +bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const +{ + /* Filtering by journal offset range */ + if (pos < range_start || pos >= range_end) { + return false; + } + + if (purge_action != PurgeItem::NONE) { + if (pi.action != purge_action) + return false; + } + + if (inode) { + if (inode != pi.ino) + return false; + } + return true; +} + +/* + * Return whether a LogEvent is to be included or excluded. + * + * The filter parameters are applied on an AND basis: if any + * condition is not met, the event is excluded. Try to do + * the fastest checks first.
+ */ +bool JournalFilter::apply(uint64_t pos, LogEvent &le) const +{ + /* Filtering by journal offset range */ + if (pos < range_start || pos >= range_end) { + return false; + } + + /* Filtering by event type */ + if (event_type != 0) { + if (le.get_type() != event_type) { + return false; + } + } + + /* Filtering by client */ + if (client_name.num()) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + if (metablob->get_client_name() != client_name) { + return false; + } + } else if (le.get_type() == EVENT_SESSION) { + ESession *es = reinterpret_cast<ESession*>(&le); + if (es->get_client_inst().name != client_name) { + return false; + } + } else { + return false; + } + } + + /* Filtering by inode */ + if (inode) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::set<inodeno_t> inodes; + metablob->get_inodes(inodes); + bool match_any = false; + for (std::set<inodeno_t>::iterator i = inodes.begin(); i != inodes.end(); ++i) { + if (*i == inode) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } else { + return false; + } + } + + /* Filtering by frag and dentry */ + if (!frag_dentry.empty() || frag.ino) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::map<dirfrag_t, std::set<std::string> > dentries; + metablob->get_dentries(dentries); + + if (frag.ino) { + bool match_any = false; + for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin(); + i != dentries.end(); ++i) { + if (i->first == frag) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } + + if (!frag_dentry.empty()) { + bool match_any = false; + for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin(); + i != dentries.end() && !match_any; ++i) { + std::set<std::string> const &names = i->second; + for (std::set<std::string>::iterator j = names.begin(); + j != names.end() && !match_any; ++j) { + if (*j == frag_dentry) { + match_any = true; + } + } + } + if (!match_any) { + return false; + } + } + + } else { + return false; + } + } + + /* Filtering by file path */ + if (!path_expr.empty()) { + EMetaBlob const *metablob = le.get_metablob(); + if (metablob) { + std::vector<std::string> paths; + metablob->get_paths(paths); + bool match_any = false; + for (std::vector<std::string>::iterator p = paths.begin(); p != paths.end(); ++p) { + if ((*p).find(path_expr) != std::string::npos) { + match_any = true; + break; + } + } + if (!match_any) { + return false; + } + } else { + return false; + } + } + + return true; +} + + +int JournalFilter::parse_args( + std::vector<const char*> &argv, + std::vector<const char*>::iterator &arg) +{ + while(arg != argv.end()) { + std::string arg_str; + if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) { + size_t sep_loc = arg_str.find(JournalFilter::range_separator); + if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) { + derr << "Invalid range '" << arg_str << "'" << dendl; + return -EINVAL; + } + + // We have a lower bound + if (sep_loc > 0) { + std::string range_start_str = arg_str.substr(0, sep_loc); + std::string parse_err; + range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl; + return -EINVAL; + } + } + + if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) { + std::string range_end_str = arg_str.substr(sep_loc + range_separator.size()); + std::string parse_err; + range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid upper bound '" << range_end_str << "': " << parse_err <<
dendl; + return -EINVAL; + } + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl; + return -EINVAL; + } + dout(4) << "Filtering by path '" << arg_str << "'" << dendl; + path_expr = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) { + dout(4) << "Filtering by inode '" << arg_str << "'" << dendl; + std::string parse_err; + inode = strict_strtoll(arg_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl; + return -EINVAL; + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) { + try { + if (!type.compare("mdlog")) { + event_type = LogEvent::str_to_type(arg_str); + } else if (!type.compare("purge_queue")) { + purge_action = PurgeItem::str_to_type(arg_str); + } + } catch (const std::out_of_range&) { + derr << "Invalid event type '" << arg_str << "'" << dendl; + return -EINVAL; + } + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." << dendl; + return -EINVAL; + } + std::string const frag_sep = "."; + size_t sep_loc = arg_str.find(frag_sep); + std::string inode_str; + std::string frag_str; + if (sep_loc != std::string::npos) { + inode_str = arg_str.substr(0, sep_loc); + frag_str = arg_str.substr(sep_loc + 1); + } else { + inode_str = arg_str; + frag_str = "0"; + } + + std::string parse_err; + inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl; + return -EINVAL; + } + + uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl; + return -EINVAL; + } + + frag = dirfrag_t(frag_ino, frag_t(frag_enc)); + dout(4) << "dirfrag filter: '" << frag << "'" << dendl; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl; + return -EINVAL; + } + frag_dentry = arg_str; + dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) { + if (!type.compare("purge_queue")) { + derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl; + return -EINVAL; + } + + std::string parse_err; + int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid client number " << arg_str << dendl; + return -EINVAL; + } + client_name = entity_name_t::CLIENT(client_num); + } else { + // We're done with args the filter understands + break; + } + } + + return 0; +} + +/** + * If the only filter parameters given are a range, return true + * and set start & end. Else return false. + * + * Use this to discover if the user has requested a contiguous range + * rather than any per-event filtering.
+ */ +bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const +{ + if (!path_expr.empty() + || inode != 0 + || event_type != 0 + || frag.ino != 0 + || client_name.num() != 0 + || (range_start == 0 && range_end == (uint64_t)(-1))) { + return false; + } else { + start = range_start; + end = range_end; + return true; + } +} diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h new file mode 100644 index 000000000..f7a2db614 --- /dev/null +++ b/src/tools/cephfs/JournalFilter.h @@ -0,0 +1,73 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#ifndef JOURNAL_FILTER_H +#define JOURNAL_FILTER_H + +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/PurgeQueue.h" + +/** + * A set of conditions for narrowing down a search through the journal + */ +class JournalFilter +{ + private: + + /* Filtering by journal offset range */ + uint64_t range_start; + uint64_t range_end; + static const std::string range_separator; + + /* Filtering by file (sub) path */ + std::string path_expr; + + /* Filtering by inode */ + inodeno_t inode; + + /* Filtering by type */ + LogEvent::EventType event_type; + + std::string type; + + /* Filtering by PurgeItem::Action */ + PurgeItem::Action purge_action; + + /* Filtering by dirfrag */ + dirfrag_t frag; + std::string frag_dentry; //< optional, filter dentry name within fragment + + /* Filtering by metablob client name */ + entity_name_t client_name; + + public: + JournalFilter(std::string t) : + range_start(0), + range_end(-1), + inode(0), + event_type(0), + type(t), + purge_action(PurgeItem::NONE) {} + + bool get_range(uint64_t &start, uint64_t &end) const; + bool apply(uint64_t pos, LogEvent &le) const; + bool apply(uint64_t pos, PurgeItem &pi) const; + int parse_args( + std::vector<const char*> &argv, + std::vector<const char*>::iterator &arg); +}; + +#endif // JOURNAL_FILTER_H + diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc new file mode 100644 index 000000000..e72542fd4 --- /dev/null +++ b/src/tools/cephfs/JournalScanner.cc @@ -0,0 +1,438 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + + +#include "include/rados/librados.hpp" +#include "mds/JournalPointer.h" + +#include "mds/events/ESubtreeMap.h" +#include "mds/PurgeQueue.h" + +#include "JournalScanner.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +/** + * Read journal header, followed by sequential scan through journal space. + * + * Return 0 on success, else error code. Note that success has the special meaning + * that we were able to apply our checks, it does *not* mean that the journal is + * healthy.
+ */ +int JournalScanner::scan(bool const full) +{ + int r = 0; + + r = set_journal_ino(); + if (r < 0) { + return r; + } + + if (!is_mdlog || pointer_present) { + r = scan_header(); + if (r < 0) { + return r; + } + } + + if (full && header_present) { + r = scan_events(); + if (r < 0) { + return r; + } + } + + return 0; +} + + +int JournalScanner::set_journal_ino() +{ + int r = 0; + if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + rank; + } + else if (type == "mdlog") { + r = scan_pointer(); + is_mdlog = true; + } + else { + ceph_abort(); // should not get here + } + return r; +} + +int JournalScanner::scan_pointer() +{ + // Issue read + std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0); + bufferlist pointer_bl; + int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0); + if (r == -ENOENT) { + // 'Successfully' discovered the pointer is missing. + derr << "Pointer " << pointer_oid << " is absent" << dendl; + return 0; + } else if (r < 0) { + // An error prevented us from interrogating the pointer + derr << "Pointer " << pointer_oid << " is unreadable" << dendl; + return r; + } else { + dout(4) << "Pointer " << pointer_oid << " is readable" << dendl; + pointer_present = true; + + JournalPointer jp; + try { + auto q = pointer_bl.cbegin(); + jp.decode(q); + } catch(buffer::error &e) { + derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl; + return 0; + } + + pointer_valid = true; + ino = jp.front; + return 0; + } +} + + +int JournalScanner::scan_header() +{ + int r; + + bufferlist header_bl; + std::string header_name = obj_name(0); + dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl; + r = io.read(header_name, header_bl, INT_MAX, 0); + if (r < 0) { + derr << "Header " << header_name << " is unreadable" << dendl; + return 0; // "Successfully" found an error + } else { + header_present = true; + } + + auto header_bl_i = header_bl.cbegin(); + header = new Journaler::Header(); + try + { + header->decode(header_bl_i); + } + catch (buffer::error &e) + { + derr << "Header is corrupt (" << e.what() << ")" << dendl; + delete header; + header = NULL; + return 0; // "Successfully" found an error + } + + if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) { + derr << "Header is corrupt (bad magic)" << dendl; + return 0; // "Successfully" found an error + } + if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) { + derr << "Header is invalid (inconsistent offsets)" << dendl; + return 0; // "Successfully" found an error + } + header_valid = true; + + return 0; +} + + +int JournalScanner::scan_events() +{ + uint64_t object_size = g_conf()->mds_log_segment_size; + if (object_size == 0) { + // Default layout object size + object_size = file_layout_t::get_default().object_size; + } + + uint64_t read_offset = header->expire_pos; + dout(10) << std::hex << "Header 0x" + << header->trimmed_pos << " 0x" + << header->expire_pos << " 0x" + << header->write_pos << std::dec << dendl; + dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl; + + // TODO also check for extraneous objects before the trimmed pos or after the write pos, + // which would indicate a bogus header.
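+ // Worked example with illustrative numbers: if object_size = 0x400000 and expire_pos = 0x500000, the loop below starts at obj_offset = 1 with offset_in_obj = 0x100000, i.e. reading resumes exactly at read_offset.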
+ + bufferlist read_buf; + bool gap = false; + uint64_t gap_start = -1; + for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) { + uint64_t offset_in_obj = 0; + if (obj_offset * object_size < header->expire_pos) { + // Skip up to expire_pos from start of the object + // (happens for the first object we read) + offset_in_obj = header->expire_pos - obj_offset * object_size; + } + + // Read this journal segment + bufferlist this_object; + std::string const oid = obj_name(obj_offset); + int r = io.read(oid, this_object, INT_MAX, offset_in_obj); + + // Handle absent journal segments + if (r < 0) { + if (obj_offset > (header->write_pos / object_size)) { + dout(4) << "Reached end of journal objects" << dendl; + break; + } else { + derr << "Missing object " << oid << dendl; + } + + objects_missing.push_back(obj_offset); + if (!gap) { + gap_start = read_offset; + gap = true; + } + if (read_buf.length() > 0) { + read_offset += read_buf.length(); + read_buf.clear(); + } + read_offset += object_size - offset_in_obj; + continue; + } else { + dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec + << " bytes from " << oid << " gap=" << gap << dendl; + objects_valid.push_back(oid); + this_object.begin().copy(this_object.length(), read_buf); + } + + if (gap) { + // No valid data at the current read offset, scan forward until we find something valid looking + // or have to drop out to load another object. + dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset + << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl; + + do { + auto p = read_buf.cbegin(); + uint64_t candidate_sentinel; + decode(candidate_sentinel, p); + + dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl; + + if (candidate_sentinel == JournalStream::sentinel) { + dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl; + ranges_invalid.push_back(Range(gap_start, read_offset)); + gap = false; + break; + } else { + // No sentinel, discard this byte + read_buf.splice(0, 1); + read_offset += 1; + } + } while (read_buf.length() >= sizeof(JournalStream::sentinel)); + dout(4) << "read_buf size is " << read_buf.length() << dendl; + } + { + dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl; + while(true) { + // TODO: detect and handle legacy format journals: can do many things + // on them but on read errors have to give up instead of searching + // for sentinels. + JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT); + bool readable = false; + try { + uint64_t need; + readable = journal_stream.readable(read_buf, &need); + } catch (buffer::error &e) { + readable = false; + dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + read_buf.splice(0, 1); + read_offset += 1; + break; + } + + if (!readable) { + // Out of data, continue to read next object + break; + } + + bufferlist le_bl; //< Serialized LogEvent blob + dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl; + // This cannot fail to decode because we pre-checked that a serialized entry + // blob would be readable. 
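+ // Framing sketch (see JournalStream in osdc/Journaler.h for the authoritative layout): a resilient-format entry is roughly [sentinel][length][payload][start offset]; start_ptr below receives that trailing offset, which is cross-checked against read_offset.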
+ uint64_t start_ptr = 0; + uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr); + dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl; + if (start_ptr != read_offset) { + derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x" + << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + // FIXME: given that entry was invalid, should we be skipping over it? + // maybe push bytes back onto start of read_buf and just advance one byte + // to start scanning instead. e.g. if a bogus size value is found it can + // cause us to consume and thus skip a bunch of following valid events. + read_offset += consumed; + break; + } + bool valid_entry = true; + if (is_mdlog) { + auto le = LogEvent::decode_event(le_bl.cbegin()); + + if (le) { + dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl; + + if (le->get_type() == EVENT_SUBTREEMAP + || le->get_type() == EVENT_SUBTREEMAP_TEST) { + auto&& sle = dynamic_cast<ESubtreeMap&>(*le); + if (sle.expire_pos > read_offset) { + errors.insert(std::make_pair( + read_offset, EventError( + -ERANGE, + "ESubtreeMap has expire_pos ahead of its own position"))); + } + } + + if (filter.apply(read_offset, *le)) { + events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed)); + } + } else { + valid_entry = false; + } + } else if (type == "purge_queue") { + auto pi = std::make_unique<PurgeItem>(); + try { + auto q = le_bl.cbegin(); + pi->decode(q); + if (filter.apply(read_offset, *pi)) { + events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed)); + } + } catch (const buffer::error &err) { + valid_entry = false; + } + } else { + ceph_abort(); // should not get here + } + if (!valid_entry) { + dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl; + gap = true; + gap_start = read_offset; + read_offset += consumed; + break; + } else { + events_valid.push_back(read_offset); + read_offset += consumed; + } + } + } + } + + if (gap) { + // Ended on a gap, assume it ran to end + ranges_invalid.push_back(Range(gap_start, -1)); + } + + dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl; + dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl; + dout(4) << "Found " << events_valid.size() << " valid events" << dendl; + dout(4) << "Selected " << events.size() << " events for processing" << dendl; + + return 0; +} + + +JournalScanner::~JournalScanner() +{ + if (header) { + delete header; + header = NULL; + } + dout(4) << events.size() << " events" << dendl; + events.clear(); +} + + +/** + * Whether the journal data looks valid and replayable + */ +bool JournalScanner::is_healthy() const +{ + return ((!is_mdlog || (pointer_present && pointer_valid)) + && header_present && header_valid + && ranges_invalid.empty() + && objects_missing.empty()); +} + + +/** + * Whether the journal data can be read from RADOS + */ +bool JournalScanner::is_readable() const +{ + return (header_present && header_valid && objects_missing.empty()); +} + + +/** + * Calculate the object name for a given offset + */ +std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx", + (unsigned long long)(ino), + (unsigned long long)offset); + return std::string(name); +} + + +std::string JournalScanner::obj_name(uint64_t offset) const +{ + return obj_name(ino, offset); +} + + +/* + * Write a human
readable summary of the journal health + */ +void JournalScanner::report(std::ostream &out) const +{ + out << "Overall journal integrity: " << (is_healthy() ? "OK" : "DAMAGED") << std::endl; + + if (is_mdlog) { + if (!pointer_present) { + out << "Pointer not found" << std::endl; + } else if (!pointer_valid) { + out << "Pointer could not be decoded" << std::endl; + } + } + if (!header_present) { + out << "Header not found" << std::endl; + } else if (!header_valid) { + out << "Header could not be decoded" << std::endl; + } + + if (objects_missing.size()) { + out << "Objects missing:" << std::endl; + for (std::vector<uint64_t>::const_iterator om = objects_missing.begin(); + om != objects_missing.end(); ++om) { + out << " 0x" << std::hex << *om << std::dec << std::endl; + } + } + + if (ranges_invalid.size()) { + out << "Corrupt regions:" << std::endl; + for (std::vector<Range>::const_iterator r = ranges_invalid.begin(); + r != ranges_invalid.end(); ++r) { + out << " 0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl; + } + } +} + diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h new file mode 100644 index 000000000..9197b5596 --- /dev/null +++ b/src/tools/cephfs/JournalScanner.h @@ -0,0 +1,133 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying. + */ + +#ifndef JOURNAL_SCANNER_H +#define JOURNAL_SCANNER_H + +#include "include/rados/librados_fwd.hpp" + +// For Journaler::Header, can't forward-declare nested classes +#include <osdc/Journaler.h> + +#include "JournalFilter.h" + +/** + * A simple sequential reader for metadata journals. Unlike + * the MDS Journaler class, this is written to detect, record, + * and read past corruptions and missing objects. It is also + * less efficient but more plainly written.
+ */ +class JournalScanner +{ + private: + librados::IoCtx &io; + + // Input constraints + const int rank; + std::string type; + JournalFilter const filter; + + void gap_advance(); + + public: + JournalScanner( + librados::IoCtx &io_, + int rank_, + const std::string &type_, + JournalFilter const &filter_) : + io(io_), + rank(rank_), + type(type_), + filter(filter_), + is_mdlog(false), + pointer_present(false), + pointer_valid(false), + header_present(false), + header_valid(false), + header(NULL) {}; + + JournalScanner( + librados::IoCtx &io_, + int rank_, + const std::string &type_) : + io(io_), + rank(rank_), + type(type_), + filter(type_), + is_mdlog(false), + pointer_present(false), + pointer_valid(false), + header_present(false), + header_valid(false), + header(NULL) {}; + + ~JournalScanner(); + + int set_journal_ino(); + int scan(bool const full=true); + int scan_pointer(); + int scan_header(); + int scan_events(); + void report(std::ostream &out) const; + + std::string obj_name(uint64_t offset) const; + std::string obj_name(inodeno_t ino, uint64_t offset) const; + + // The results of the scan + inodeno_t ino; // Corresponds to journal ino according to their type + struct EventRecord { + EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {} + EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {} + std::unique_ptr<LogEvent> log_event; + std::unique_ptr<PurgeItem> pi; + uint32_t raw_size = 0; //< Size from start offset including all encoding overhead + }; + + class EventError { + public: + int r; + std::string description; + EventError(int r_, const std::string &desc_) + : r(r_), description(desc_) {} + }; + + typedef std::map<uint64_t, EventRecord> EventMap; + typedef std::map<uint64_t, EventError> ErrorMap; + typedef std::pair<uint64_t, uint64_t> Range; + bool is_mdlog; + bool pointer_present; //mdlog specific + bool pointer_valid; //mdlog specific + bool header_present; + bool header_valid; + Journaler::Header *header; + + bool is_healthy() const; + bool is_readable() const; + std::vector<std::string> objects_valid; + std::vector<uint64_t> objects_missing; + std::vector<Range> ranges_invalid; + std::vector<uint64_t> events_valid; + EventMap events; + + // For events present in ::events (i.e. scanned successfully), + // any subsequent errors handling them (e.g. replaying) + ErrorMap errors; + + + private: + // Forbid copy construction because I have ptr members + JournalScanner(const JournalScanner &rhs); +}; + +#endif // JOURNAL_SCANNER_H + diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc new file mode 100644 index 000000000..6bca9bb08 --- /dev/null +++ b/src/tools/cephfs/JournalTool.cc @@ -0,0 +1,1266 @@ +// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * ceph - scalable distributed file system + * + * copyright (c) 2014 john spray + * + * this is free software; you can redistribute it and/or + * modify it under the terms of the gnu lesser general public + * license version 2.1, as published by the free software + * foundation. see file copying.
+ */ + + +#include <sstream> + +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/InoTable.h" + +#include "mds/events/ENoOp.h" +#include "mds/events/EUpdate.h" + +#include "JournalScanner.h" +#include "EventOutput.h" +#include "Dumper.h" +#include "Resetter.h" + +#include "JournalTool.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + +using namespace std; + +void JournalTool::usage() +{ + std::cout << "Usage: \n" + << " cephfs-journal-tool [options] journal <command>\n" + << " <command>:\n" + << " inspect\n" + << " import <path> [--force]\n" + << " export <path>\n" + << " reset [--force]\n" + << " cephfs-journal-tool [options] header <get|set> <field> <value>\n" + << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n" + << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" + << " <selector>:\n" + << " --range=<start>..<end>\n" + << " --path=<substring>\n" + << " --inode=<integer>\n" + << " --type=<event type>\n" + << " --frag=<ino>.<frag> [--dname=<dentry name>]\n" + << " --client=<session id integer>\n" + << " <effect>: [get|recover_dentries|splice]\n" + << " <output>: [summary|list|binary|json] [--path <path>]\n" + << "\n" + << "General options:\n" + << " --rank=filesystem:{mds-rank|all} journal rank or \"all\" ranks (mandatory)\n" + << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n" + << " this journal is used to queue for purge operation,\n" + << " default is mdlog, and only mdlog supports event mode)\n" + << "\n" + << "Special options\n" + << " --alternate-pool <name> Alternative metadata pool to target\n" + << " when using recover_dentries.\n"; + + generic_client_usage(); +} + + +/** + * Handle arguments and hand off to journal/header/event mode + */ +int JournalTool::main(std::vector<const char*> &argv) +{ + int r; + + dout(10) << "JournalTool::main " << dendl; + // Common arg parsing + // ================== + if (argv.empty()) { + cerr << "missing positional argument" << std::endl; + return -EINVAL; + } + + std::vector<const char*>::iterator arg = argv.begin(); + + std::string rank_str; + if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) { + derr << "missing mandatory \"--rank\" argument" << dendl; + return -EINVAL; + } + + if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) { + // Default is mdlog + type = "mdlog"; + } + + r = validate_type(type); + if (r != 0) { + derr << "journal type is not correct." << dendl; + return r; + } + + r = role_selector.parse(*fsmap, rank_str, false); + if (r != 0) { + derr << "Couldn't determine MDS rank." << dendl; + return r; + } + + std::string mode; + if (arg == argv.end()) { + derr << "Missing mode [journal|header|event]" << dendl; + return -EINVAL; + } + mode = std::string(*arg); + arg = argv.erase(arg); + + // RADOS init + // ========== + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; + return r; + } + + dout(4) << "JournalTool: connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + ceph_assert(fs != nullptr); + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + dout(4) << "JournalTool: resolving pool " << pool_id << dendl; + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + derr << "Pool " << pool_id << " named in MDS map not found in RADOS!"
<< dendl; + return r; + } + + dout(4) << "JournalTool: creating IoCtx.." << dendl; + r = rados.ioctx_create(pool_name.c_str(), input); + ceph_assert(r == 0); + output.dup(input); + + // Execution + // ========= + // journal and header are general journal modes + // event mode is specific to mdlog + auto roles = role_selector.get_roles(); + if (roles.size() > 1) { + const std::string &command = argv[0]; + bool allowed = can_execute_for_all_ranks(mode, command); + if (!allowed) { + derr << "operation not allowed for all ranks" << dendl; + return -EINVAL; + } + + all_ranks = true; + } + for (auto role : roles) { + rank = role.rank; + std::vector<const char*> rank_argv(argv); + dout(4) << "Executing for rank " << rank << dendl; + if (mode == std::string("journal")) { + r = main_journal(rank_argv); + } else if (mode == std::string("header")) { + r = main_header(rank_argv); + } else if (mode == std::string("event")) { + r = main_event(rank_argv); + } else { + cerr << "Bad command '" << mode << "'" << std::endl; + return -EINVAL; + } + + if (r != 0) { + return r; + } + } + + return r; +} + +int JournalTool::validate_type(const std::string &type) +{ + if (type == "mdlog" || type == "purge_queue") { + return 0; + } + return -1; +} + +std::string JournalTool::gen_dump_file_path(const std::string &prefix) { + if (!all_ranks) { + return prefix; + } + + return prefix + "." + std::to_string(rank); +} + +bool JournalTool::can_execute_for_all_ranks(const std::string &mode, + const std::string &command) { + if (mode == "journal" && command == "import") { + return false; + } + + return true; +} + +/** + * Handle arguments for 'journal' mode + * + * This is for operations that act on the journal as a whole. + */ +int JournalTool::main_journal(std::vector<const char*> &argv) +{ + if (argv.empty()) { + derr << "Missing journal command, please see help" << dendl; + return -EINVAL; + } + + std::string command = argv[0]; + if (command == "inspect") { + return journal_inspect(); + } else if (command == "export" || command == "import") { + bool force = false; + if (argv.size() >= 2) { + std::string const path = argv[1]; + if (argv.size() == 3) { + if (std::string(argv[2]) == "--force") { + force = true; + } else { + std::cerr << "Unknown argument " << argv[2] << std::endl; + return -EINVAL; + } + } + return journal_export(path, command == "import", force); + } else { + derr << "Missing path" << dendl; + return -EINVAL; + } + } else if (command == "reset") { + bool force = false; + if (argv.size() == 2) { + if (std::string(argv[1]) == "--force") { + force = true; + } else { + std::cerr << "Unknown argument " << argv[1] << std::endl; + return -EINVAL; + } + } else if (argv.size() > 2) { + std::cerr << "Too many arguments!" << std::endl; + return -EINVAL; + } + return journal_reset(force); + } else { + derr << "Bad journal command '" << command << "'" << dendl; + return -EINVAL; + } +} + + +/** + * Parse arguments and execute for 'header' mode + * + * This is for operations that act on the header only. + */ +int JournalTool::main_header(std::vector<const char*> &argv) +{ + JournalFilter filter(type); + JournalScanner js(input, rank, type, filter); + int r = js.scan(false); + if (r < 0) { + std::cerr << "Unable to scan journal" << std::endl; + return r; + } + + if (!js.header_present) { + std::cerr << "Header object not found!" << std::endl; + return -ENOENT; + } else if (!js.header_valid && js.header == NULL) { + // Can't do a read or a single-field write without a copy of the original + derr << "Header could not be read!"
<< dendl; + return -ENOENT; + } else { + ceph_assert(js.header != NULL); + } + + if (argv.empty()) { + derr << "Missing header command, must be [get|set]" << dendl; + return -EINVAL; + } + std::vector<const char*>::iterator arg = argv.begin(); + std::string const command = *arg; + arg = argv.erase(arg); + + if (command == std::string("get")) { + // Write JSON journal dump to stdout + JSONFormatter jf(true); + js.header->dump(&jf); + jf.flush(std::cout); + std::cout << std::endl; + } else if (command == std::string("set")) { + // Need two more args + if (argv.size() != 2) { + derr << "'set' requires two arguments <field> <value>" << dendl; + return -EINVAL; + } + + std::string const field_name = *arg; + arg = argv.erase(arg); + + std::string const value_str = *arg; + arg = argv.erase(arg); + ceph_assert(argv.empty()); + + std::string parse_err; + uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); + if (!parse_err.empty()) { + derr << "Invalid value '" << value_str << "': " << parse_err << dendl; + return -EINVAL; + } + + uint64_t *field = NULL; + if (field_name == "trimmed_pos") { + field = &(js.header->trimmed_pos); + } else if (field_name == "expire_pos") { + field = &(js.header->expire_pos); + } else if (field_name == "write_pos") { + field = &(js.header->write_pos); + } else if (field_name == "pool_id") { + field = (uint64_t*)(&(js.header->layout.pool_id)); + } else { + derr << "Invalid field '" << field_name << "'" << dendl; + return -EINVAL; + } + + std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; + *field = new_val; + + dout(4) << "Writing object..." << dendl; + bufferlist header_bl; + encode(*(js.header), header_bl); + output.write_full(js.obj_name(0), header_bl); + dout(4) << "Write complete." << dendl; + std::cout << "Successfully updated header." << std::endl; + } else { + derr << "Bad header command '" << command << "'" << dendl; + return -EINVAL; + } + + return 0; +} + + +/** + * Parse arguments and execute for 'event' mode + * + * This is for operations that act on LogEvents within the log + */ +int JournalTool::main_event(std::vector<const char*> &argv) +{ + int r; + + if (argv.empty()) { + derr << "Missing event command, please see help" << dendl; + return -EINVAL; + } + + std::vector<const char*>::iterator arg = argv.begin(); + bool dry_run = false; + + std::string command = *(arg++); + if (command != "get" && command != "splice" && command != "recover_dentries") { + derr << "Unknown argument '" << command << "'" << dendl; + return -EINVAL; + } + + if (command == "recover_dentries") { + if (type != "mdlog") { + derr << "journaler for " << type << " can't do \"recover_dentries\"."
<< dendl; + return -EINVAL; + } else { + if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { + dry_run = true; + } + } + } + + if (arg == argv.end()) { + derr << "Incomplete command line" << dendl; + return -EINVAL; + } + + // Parse filter options + // ==================== + JournalFilter filter(type); + r = filter.parse_args(argv, arg); + if (r) { + return r; + } + + // Parse output options + // ==================== + if (arg == argv.end()) { + cerr << "Missing output command" << std::endl; + return -EINVAL; + } + std::string output_style = *(arg++); + if (output_style != "binary" && output_style != "json" && + output_style != "summary" && output_style != "list") { + cerr << "Unknown argument: '" << output_style << "'" << std::endl; + return -EINVAL; + } + + std::string output_path = "dump"; + while(arg != argv.end()) { + std::string arg_str; + if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { + output_path = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", + nullptr)) { + dout(1) << "Using alternate pool " << arg_str << dendl; + int r = rados.ioctx_create(arg_str.c_str(), output); + ceph_assert(r == 0); + other_pool = true; + } else { + cerr << "Unknown argument: '" << *arg << "'" << std::endl; + return -EINVAL; + } + } + + const std::string dump_path = gen_dump_file_path(output_path); + + // Execute command + // =============== + JournalScanner js(input, rank, type, filter); + if (command == "get") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + } else if (command == "recover_dentries") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + + /** + * Iterate over log entries, attempting to scavenge from each one + */ + std::set<inodeno_t> consumed_inos; + for (JournalScanner::EventMap::iterator i = js.events.begin(); + i != js.events.end(); ++i) { + auto& le = i->second.log_event; + EMetaBlob const *mb = le->get_metablob(); + if (mb) { + int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); + if (scav_r) { + dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec + << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; + if (r == 0) { + r = scav_r; + } + // Our goal is to read all we can, so don't stop on errors, but + // do record them for possible later output + js.errors.insert(std::make_pair(i->first, + JournalScanner::EventError(scav_r, cpp_strerror(scav_r)))); + } + } + } + + /** + * Update InoTable to reflect any inode numbers consumed during scavenge + */ + dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; + if (consumed_inos.size() && !dry_run) { + int consume_r = consume_inos(consumed_inos); + if (consume_r) { + dout(1) << "Error updating InoTable for " << consumed_inos.size() + << " consumed inos: " << cpp_strerror(consume_r) << dendl; + if (r == 0) { + r = consume_r; + } + } + } + + // Remove consumed dentries from lost+found.
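+ // The keys built below follow the dentry key form "<hex ino>_head", e.g. a consumed ino 0x10000000000 becomes "10000000000_head", so omap_rm_keys() can drop all scavenged entries from the lost+found dirfrag in one call.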
+ if (other_pool && !dry_run) { + std::set<std::string> found; + + for (auto i : consumed_inos) { + char s[20]; + + snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); + dout(20) << "removing " << s << dendl; + found.insert(std::string(s)); + } + + object_t frag_oid; + frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, + frag_t(), ""); + output.omap_rm_keys(frag_oid.name, found); + } + } else if (command == "splice") { + r = js.scan(); + if (r) { + derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; + return r; + } + + uint64_t start, end; + if (filter.get_range(start, end)) { + // Special case for range filter: erase a numeric range in the log + uint64_t range = end - start; + int r = erase_region(js, start, range); + if (r) { + derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec + << ": " << cpp_strerror(r) << dendl; + return r; + } + } else { + // General case: erase a collection of individual entries in the log + for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { + dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; + + int r = erase_region(js, i->first, i->second.raw_size); + if (r) { + derr << "Failed to erase event 0x" << std::hex << i->first << std::dec + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + + } else { + cerr << "Unknown argument '" << command << "'" << std::endl; + return -EINVAL; + } + + // Generate output + // =============== + EventOutput output(js, dump_path); + int output_result = 0; + if (output_style == "binary") { + output_result = output.binary(); + } else if (output_style == "json") { + output_result = output.json(); + } else if (output_style == "summary") { + output.summary(); + } else if (output_style == "list") { + output.list(); + } else { + std::cerr << "Bad output command '" << output_style << "'" << std::endl; + return -EINVAL; + } + + if (output_result != 0) { + std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl; + } + + return output_result; +} + +/** + * Provide the user with information about the condition of the journal, + * especially indicating what range of log events is available and where + * any gaps or corruptions in the journal are. + */ +int JournalTool::journal_inspect() +{ + int r; + + JournalFilter filter(type); + JournalScanner js(input, rank, type, filter); + r = js.scan(); + if (r) { + std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; + return r; + } + + js.report(std::cout); + + return 0; +} + + +/** + * Attempt to export a binary dump of the journal. + * + * This is allowed to fail if the header is malformed or there are + * objects inaccessible, in which case the user would have to fall + * back to manually listing RADOS objects and extracting them, which + * they can do with the ``rados`` CLI.
+ */ +int JournalTool::journal_export(std::string const &path, bool import, bool force) +{ + int r = 0; + JournalScanner js(input, rank, type); + + if (!import) { + /* + * If doing an export, first check that the header is valid and + * no objects are missing before trying to dump + */ + r = js.scan(); + if (r < 0) { + derr << "Unable to scan journal, assuming badly damaged" << dendl; + return r; + } + if (!js.is_readable()) { + derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; + return -EIO; + } + } + + /* + * Assuming we can cleanly read the journal data, dump it out to a file + */ + { + Dumper dumper; + r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type); + if (r < 0) { + derr << "dumper::init failed: " << cpp_strerror(r) << dendl; + return r; + } + if (import) { + r = dumper.undump(path.c_str(), force); + } else { + const std::string ex_path = gen_dump_file_path(path); + r = dumper.dump(ex_path.c_str()); + } + } + + return r; +} + + +/** + * Truncate journal and insert EResetJournal + */ +int JournalTool::journal_reset(bool hard) +{ + int r = 0; + Resetter resetter; + r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard); + if (r < 0) { + derr << "resetter::init failed: " << cpp_strerror(r) << dendl; + return r; + } + + if (hard) { + r = resetter.reset_hard(); + } else { + r = resetter.reset(); + } + + return r; +} + + +/** + * Selective offline replay which only reads out dentries and writes + * them to the backing store iff their version is > what is currently + * in the backing store. + * + * In order to write dentries to the backing store, we may create the + * required enclosing dirfrag objects. + * + * Test this by running scavenge on an unflushed journal, then nuking + * it offline, then starting an MDS and seeing that the dentries are + * visible. + * + * @param metablob an EMetaBlob retrieved from the journal + * @param dry_run if true, do no writes to RADOS + * @param consumed_inos output, populated with any inos inserted + * @returns 0 on success, else negative error code + */ +int JournalTool::recover_dentries( + EMetaBlob const &metablob, + bool const dry_run, + std::set<inodeno_t> *consumed_inos) +{ + ceph_assert(consumed_inos != NULL); + + int r = 0; + + // Replay fullbits (dentry+inode) + for (const auto& frag : metablob.lump_order) { + EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; + lump._decode_bits(); + object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); + + dout(4) << "inspecting lump " << frag_oid.name << dendl; + + + // We will record old fnode version for use in hard link handling + // If we don't read an old fnode, take version as zero and write in + // all hardlinks we find.
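+ // i.e. if no old fnode can be decoded, old_fnode_version stays 0, so the "old version < journal version" comparisons below always prefer the journal's copy.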
+ version_t old_fnode_version = 0; + + // Update fnode in omap header of dirfrag object + bool write_fnode = false; + bufferlist old_fnode_bl; + r = input.omap_get_header(frag_oid.name, &old_fnode_bl); + if (r == -ENOENT) { + // Creating dirfrag from scratch + dout(4) << "failed to read OMAP header from directory fragment " + << frag_oid.name << " " << cpp_strerror(r) << dendl; + write_fnode = true; + // Note: creating the dirfrag *without* a backtrace, relying on + // MDS to regenerate backtraces on read or in FSCK + } else if (r == 0) { + // Conditionally update existing omap header + fnode_t old_fnode; + auto old_fnode_iter = old_fnode_bl.cbegin(); + try { + old_fnode.decode(old_fnode_iter); + dout(4) << "frag " << frag_oid.name << " fnode old v" << + old_fnode.version << " vs new v" << lump.fnode->version << dendl; + old_fnode_version = old_fnode.version; + write_fnode = old_fnode_version < lump.fnode->version; + } catch (const buffer::error &err) { + dout(1) << "frag " << frag_oid.name + << " is corrupt, overwriting" << dendl; + write_fnode = true; + } + } else { + // Unexpected error + dout(4) << "failed to read OMAP header from directory fragment " + << frag_oid.name << " " << cpp_strerror(r) << dendl; + return r; + } + + if ((other_pool || write_fnode) && !dry_run) { + dout(4) << "writing fnode to omap header" << dendl; + bufferlist fnode_bl; + lump.fnode->encode(fnode_bl); + if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { + r = output.omap_set_header(frag_oid.name, fnode_bl); + } + if (r != 0) { + derr << "Failed to write fnode for frag object " + << frag_oid.name << dendl; + return r; + } + } + + std::set read_keys; + + // Compose list of potentially-existing dentries we would like to fetch + for (const auto& fb : lump.get_dfull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + for(const auto& rb : lump.get_dremote()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + for (const auto& nb : lump.get_dnull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); + dn_key.encode(key); + read_keys.insert(key); + } + + // Perform bulk read of existing dentries + std::map read_vals; + r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + if (r == -ENOENT && other_pool) { + r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + } + if (r != 0) { + derr << "unexpected error reading fragment object " + << frag_oid.name << ": " << cpp_strerror(r) << dendl; + return r; + } + + // Compose list of dentries we will write back + std::map write_vals; + for (const auto& fb : lump.get_dfull()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn + << dendl; + bool write_dentry = false; + if (read_vals.find(key) == read_vals.end()) { + dout(4) << "dentry did not already exist, will create" << dendl; + write_dentry = true; + } else { + dout(4) << "dentry " << key << " existed already" << dendl; + dout(4) << "dentry exists, checking versions..." 
<< dendl; + bufferlist &old_dentry = read_vals[key]; + // Decode dentry+inode + auto q = old_dentry.cbegin(); + + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + if (dentry_type == 'L' || dentry_type == 'l') { + // leave write_dentry false, we have no version to + // compare with in a hardlink, so it's not safe to + // squash over it with what's in this fullbit + dout(10) << "Existing remote inode in slot to be (maybe) written " + << "by a full inode from the journal dn '" << fb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode->version; + } else if (dentry_type == 'I' || dentry_type == 'i') { + // Read out inode version to compare with backing store + InodeStore inode; + if (dentry_type == 'i') { + mempool::mds_co::string alternate_name; + + DECODE_START(2, q); + if (struct_v >= 2) + decode(alternate_name, q); + inode.decode(q); + DECODE_FINISH(q); + } else { + inode.decode_bare(q); + } + dout(4) << "decoded embedded inode version " + << inode.inode->version << " vs fullbit version " + << fb.inode->version << dendl; + if (inode.inode->version < fb.inode->version) { + write_dentry = true; + } + } else { + dout(4) << "corrupt dentry in backing store, overwriting from " + "journal" << dendl; + write_dentry = true; + } + } + + if ((other_pool || write_dentry) && !dry_run) { + dout(4) << "writing I dentry " << key << " into frag " + << frag_oid.name << dendl; + + // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) + bufferlist dentry_bl; + encode(fb.dnfirst, dentry_bl); + encode('I', dentry_bl); + encode_fullbit_as_inode(fb, true, &dentry_bl); + + // Record for writing to RADOS + write_vals[key] = dentry_bl; + consumed_inos->insert(fb.inode->ino); + } + } + + for(const auto& rb : lump.get_dremote()) { + // Get a key like "foobar_head" + std::string key; + dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn + << dendl; + bool write_dentry = false; + if (read_vals.find(key) == read_vals.end()) { + dout(4) << "dentry did not already exist, will create" << dendl; + write_dentry = true; + } else { + dout(4) << "dentry " << key << " existed already" << dendl; + dout(4) << "dentry exists, checking versions..." 
<< dendl; + bufferlist &old_dentry = read_vals[key]; + // Decode dentry+inode + auto q = old_dentry.cbegin(); + + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + if (dentry_type == 'L' || dentry_type == 'l') { + dout(10) << "Existing hardlink inode in slot to be (maybe) written " + << "by a remote inode from the journal dn '" << rb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode->version; + } else if (dentry_type == 'I' || dentry_type == 'i') { + dout(10) << "Existing full inode in slot to be (maybe) written " + << "by a remote inode from the journal dn '" << rb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + write_dentry = old_fnode_version < lump.fnode->version; + } else { + dout(4) << "corrupt dentry in backing store, overwriting from " + "journal" << dendl; + write_dentry = true; + } + } + + if ((other_pool || write_dentry) && !dry_run) { + dout(4) << "writing L dentry " << key << " into frag " + << frag_oid.name << dendl; + + // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) + bufferlist dentry_bl; + encode(rb.dnfirst, dentry_bl); + encode('L', dentry_bl); + encode(rb.ino, dentry_bl); + encode(rb.d_type, dentry_bl); + + // Record for writing to RADOS + write_vals[key] = dentry_bl; + consumed_inos->insert(rb.ino); + } + } + + std::set null_vals; + for (const auto& nb : lump.get_dnull()) { + std::string key; + dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); + dn_key.encode(key); + + dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn + << dendl; + + auto it = read_vals.find(key); + if (it != read_vals.end()) { + dout(4) << "dentry exists, will remove" << dendl; + + auto q = it->second.cbegin(); + snapid_t dnfirst; + decode(dnfirst, q); + char dentry_type; + decode(dentry_type, q); + + bool remove_dentry = false; + if (dentry_type == 'L' || dentry_type == 'l') { + dout(10) << "Existing hardlink inode in slot to be (maybe) removed " + << "by null journal dn '" << nb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + remove_dentry = old_fnode_version < lump.fnode->version; + } else if (dentry_type == 'I' || dentry_type == 'i') { + dout(10) << "Existing full inode in slot to be (maybe) removed " + << "by null journal dn '" << nb.dn.c_str() + << "' with lump fnode version " << lump.fnode->version + << "vs existing fnode version " << old_fnode_version << dendl; + remove_dentry = old_fnode_version < lump.fnode->version; + } else { + dout(4) << "corrupt dentry in backing store, will remove" << dendl; + remove_dentry = true; + } + + if (remove_dentry) + null_vals.insert(key); + } + } + + // Write back any new/changed dentries + if (!write_vals.empty()) { + r = output.omap_set(frag_oid.name, write_vals); + if (r != 0) { + derr << "error writing dentries to " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + + // remove any null dentries + if (!null_vals.empty()) { + r = output.omap_rm_keys(frag_oid.name, null_vals); + if (r != 0) { + derr << "error removing dentries from " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + /* Now that we've looked at the dirlumps, we finally pay attention to + * the roots (i.e. inodes without ancestry). 
This is necessary in order + * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally + * important because clients use them to infer completeness + * of directories + */ + for (const auto& fb : metablob.roots) { + inodeno_t ino = fb.inode->ino; + dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; + + object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); + dout(4) << "object id " << root_oid.name << dendl; + + bool write_root_ino = false; + bufferlist old_root_ino_bl; + r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); + if (r == -ENOENT) { + dout(4) << "root does not exist, will create" << dendl; + write_root_ino = true; + } else if (r >= 0) { + r = 0; + InodeStore old_inode; + dout(4) << "root exists, will modify (" << old_root_ino_bl.length() + << ")" << dendl; + auto inode_bl_iter = old_root_ino_bl.cbegin(); + std::string magic; + decode(magic, inode_bl_iter); + if (magic == CEPH_FS_ONDISK_MAGIC) { + dout(4) << "magic ok" << dendl; + old_inode.decode(inode_bl_iter); + + if (old_inode.inode->version < fb.inode->version) { + write_root_ino = true; + } + } else { + dout(4) << "magic bad: '" << magic << "'" << dendl; + write_root_ino = true; + } + } else { + derr << "error reading root inode object " << root_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + if (write_root_ino && !dry_run) { + dout(4) << "writing root ino " << root_oid.name + << " version " << fb.inode->version << dendl; + + // Compose: root ino format is magic,InodeStore(bare=false) + bufferlist new_root_ino_bl; + encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); + encode_fullbit_as_inode(fb, false, &new_root_ino_bl); + + // Write to RADOS + r = output.write_full(root_oid.name, new_root_ino_bl); + if (r != 0) { + derr << "error writing inode object " << root_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } + } + } + + return r; +} + + +/** + * Erase a region of the log by overwriting it with ENoOp + * + */ +int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length) +{ + // To erase this region, we use our preamble, the encoding overhead + // of an ENoOp, and our trailing start ptr. Calculate how much padding + // is needed inside the ENoOp to make up the difference. 
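+  // (Editor's note) a worked example of the padding calculation below,
+  // assuming an empty ENoOp encodes to 14 bytes (illustrative only):
+  //
+  //   region length                 = 1024
+  //   empty event encoding          =   14
+  //   preamble fields               =    8 + 4  (sizeof(uint64_t) + sizeof(uint32_t))
+  //   trailing start ptr            =    8      (sizeof(uint64_t))
+  //   ----------------------------------------
+  //   padding = 1024 - 14 - 4 - 8 - 8 = 990
+  //
+  // so ENoOp(990) re-encodes to exactly 1024 bytes.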
+ bufferlist tmp; + if (type == "mdlog") { + ENoOp enoop(0); + enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else if (type == "purge_queue") { + PurgeItem pi; + pi.encode(tmp); + } + + dout(4) << "erase_region " << pos << " len=" << length << dendl; + + // FIXME: get the preamble/postamble length via JournalStream + int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); + dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; + + if (padding < 0) { + derr << "Erase region " << length << " too short" << dendl; + return -EINVAL; + } + + bufferlist entry; + if (type == "mdlog") { + // Serialize an ENoOp with the correct amount of padding + ENoOp enoop(padding); + enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else if (type == "purge_queue") { + PurgeItem pi; + pi.pad_size = padding; + pi.encode(entry); + } + JournalStream stream(JOURNAL_FORMAT_RESILIENT); + // Serialize region of log stream + bufferlist log_data; + stream.write(entry, &log_data, pos); + + dout(4) << "erase_region data length " << log_data.length() << dendl; + ceph_assert(log_data.length() == length); + + // Write log stream region to RADOS + // FIXME: get object size somewhere common to scan_events + uint32_t object_size = g_conf()->mds_log_segment_size; + if (object_size == 0) { + // Default layout object size + object_size = file_layout_t::get_default().object_size; + } + + uint64_t write_offset = pos; + uint64_t obj_offset = (pos / object_size); + int r = 0; + while(log_data.length()) { + std::string const oid = js.obj_name(obj_offset); + uint32_t offset_in_obj = write_offset % object_size; + uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); + + r = output.write(oid, log_data, write_len, offset_in_obj); + if (r < 0) { + return r; + } else { + dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; + r = 0; + } + + log_data.splice(0, write_len); + write_offset += write_len; + obj_offset++; + } + + return r; +} + +/** + * Given an EMetaBlob::fullbit containing an inode, write out + * the encoded inode in the format used by InodeStore (i.e. the + * backing store format) + * + * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use + * on an offline InodeStore instance. It's way simpler, because we are just + * uncritically hauling the data between structs. + * + * @param fb a fullbit extracted from a journal entry + * @param bare if true, leave out [EN|DE]CODE_START decoration + * @param out_bl output, write serialized inode to this bufferlist + */ +void JournalTool::encode_fullbit_as_inode( + const EMetaBlob::fullbit &fb, + const bool bare, + bufferlist *out_bl) +{ + ceph_assert(out_bl != NULL); + + // Compose InodeStore + InodeStore new_inode; + new_inode.inode = fb.inode; + new_inode.xattrs = fb.xattrs; + new_inode.dirfragtree = fb.dirfragtree; + new_inode.snap_blob = fb.snapbl; + new_inode.symlink = fb.symlink; + new_inode.old_inodes = fb.old_inodes; + + // Serialize InodeStore + if (bare) { + new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } else { + new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + } +} + +/** + * Given a list of inode numbers known to be in use by + * inodes in the backing store, ensure that none of these + * numbers are listed as free in the InoTables in the + * backing store. 
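+ *
+ * (Editor's note) Conceptually, the InoTable tracks free inos as an
+ * interval set, so consuming ino 0x155 turns a free interval such as
+ * [0x100, 0x200] into [0x100, 0x154] + [0x156, 0x200]. The numbers are
+ * illustrative; see InoTable::force_consume() for the real logic.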
+ * + * Used after injecting inodes into the backing store, to + * ensure that the same inode numbers are not subsequently + * used for new files during ordinary operation. + * + * @param inos list of inode numbers to be removed from + * free lists in InoTables + * @returns 0 on success, else negative error code + */ +int JournalTool::consume_inos(const std::set &inos) +{ + int r = 0; + + // InoTable is a per-MDS structure, so iterate over assigned ranks + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + std::set in_ranks; + fs->mds_map.get_mds_set(in_ranks); + + for (std::set::iterator rank_i = in_ranks.begin(); + rank_i != in_ranks.end(); ++rank_i) + { + // Compose object name + std::ostringstream oss; + oss << "mds" << *rank_i << "_inotable"; + object_t inotable_oid = object_t(oss.str()); + + // Read object + bufferlist inotable_bl; + int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); + if (read_r < 0) { + // Things are really bad if we can't read inotable. Beyond our powers. + derr << "unable to read inotable '" << inotable_oid.name << "': " + << cpp_strerror(read_r) << dendl; + r = r ? r : read_r; + continue; + } + + // Deserialize InoTable + version_t inotable_ver; + auto q = inotable_bl.cbegin(); + decode(inotable_ver, q); + InoTable ino_table(NULL); + ino_table.decode(q); + + // Update InoTable in memory + bool inotable_modified = false; + for (std::set::iterator i = inos.begin(); + i != inos.end(); ++i) + { + const inodeno_t ino = *i; + if (ino_table.force_consume(ino)) { + dout(4) << "Used ino 0x" << std::hex << ino << std::dec + << " requires inotable update" << dendl; + inotable_modified = true; + } + } + + // Serialize and write InoTable + if (inotable_modified) { + inotable_ver += 1; + dout(4) << "writing modified inotable version " << inotable_ver << dendl; + bufferlist inotable_new_bl; + encode(inotable_ver, inotable_new_bl); + ino_table.encode_state(inotable_new_bl); + int write_r = output.write_full(inotable_oid.name, inotable_new_bl); + if (write_r != 0) { + derr << "error writing modified inotable " << inotable_oid.name + << ": " << cpp_strerror(write_r) << dendl; + r = r ? r : read_r; + continue; + } + } + } + + return r; +} + diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h new file mode 100644 index 000000000..8d610a866 --- /dev/null +++ b/src/tools/cephfs/JournalTool.h @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include "MDSUtility.h" +#include "RoleSelector.h" +#include + +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/events/EMetaBlob.h" + +#include "include/rados/librados.hpp" + +#include "JournalFilter.h" + +class JournalScanner; + + +/** + * Command line tool for investigating and repairing filesystems + * with damaged metadata logs + */ +class JournalTool : public MDSUtility +{ + private: + MDSRoleSelector role_selector; + // Bit hacky, use this `rank` member to control behaviour of the + // various main_ functions. 
+ mds_rank_t rank; + // when set, generate per rank dump file path + bool all_ranks = false; + + std::string type; + + // Entry points + int main_journal(std::vector &argv); + int main_header(std::vector &argv); + int main_event(std::vector &argv); + + // Shared functionality + int recover_journal(); + + // Journal operations + int journal_inspect(); + int journal_export(std::string const &path, bool import, bool force); + int journal_reset(bool hard); + + // Header operations + int header_set(); + + // I/O handles + librados::Rados rados; + librados::IoCtx input; + librados::IoCtx output; + + bool other_pool; + + // Metadata backing store manipulation + int read_lost_found(std::set &lost); + int recover_dentries( + EMetaBlob const &metablob, + bool const dry_run, + std::set *consumed_inos); + + // Splicing + int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length); + + // Backing store helpers + void encode_fullbit_as_inode( + const EMetaBlob::fullbit &fb, + const bool bare, + bufferlist *out_bl); + int consume_inos(const std::set &inos); + + //validate type + int validate_type(const std::string &type); + + // generate output file path for dump/export + std::string gen_dump_file_path(const std::string &prefix); + + // check if an operation (mode, command) is safe to be + // executed on all ranks. + bool can_execute_for_all_ranks(const std::string &mode, + const std::string &command); + public: + static void usage(); + JournalTool() : + rank(0), other_pool(false) {} + int main(std::vector &argv); +}; + diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc new file mode 100644 index 000000000..54386d219 --- /dev/null +++ b/src/tools/cephfs/MDSUtility.cc @@ -0,0 +1,155 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#include "MDSUtility.h" +#include "mon/MonClient.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + + +MDSUtility::MDSUtility() : + Dispatcher(g_ceph_context), + objecter(NULL), + finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"), + waiting_for_mds_map(NULL), + inited(false) +{ + monc = new MonClient(g_ceph_context, poolctx); + messenger = Messenger::create_client_messenger(g_ceph_context, "mds"); + fsmap = new FSMap(); + objecter = new Objecter(g_ceph_context, messenger, monc, poolctx); +} + + +MDSUtility::~MDSUtility() +{ + if (inited) { + shutdown(); + } + delete objecter; + delete monc; + delete messenger; + delete fsmap; + ceph_assert(waiting_for_mds_map == NULL); +} + + +int MDSUtility::init() +{ + // Initialize Messenger + poolctx.start(1); + messenger->start(); + + objecter->set_client_incarnation(0); + objecter->init(); + + // Connect dispatchers before starting objecter + messenger->add_dispatcher_tail(objecter); + messenger->add_dispatcher_tail(this); + + // Initialize MonClient + if (monc->build_initial_monmap() < 0) { + objecter->shutdown(); + messenger->shutdown(); + messenger->wait(); + return -1; + } + + monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS); + monc->set_messenger(messenger); + monc->init(); + int r = monc->authenticate(); + if (r < 0) { + derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl; + monc->shutdown(); + objecter->shutdown(); + messenger->shutdown(); + messenger->wait(); + return r; + } + + client_t whoami = monc->get_global_id(); + messenger->set_myname(entity_name_t::CLIENT(whoami.v)); + + // Start Objecter and wait for OSD map + objecter->start(); + objecter->wait_for_osd_map(); + + // Prepare to receive MDS map and request it + ceph::mutex init_lock = ceph::make_mutex("MDSUtility:init"); + ceph::condition_variable cond; + bool done = false; + ceph_assert(!fsmap->get_epoch()); + lock.lock(); + waiting_for_mds_map = new C_SafeCond(init_lock, cond, &done, NULL); + lock.unlock(); + monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME); + monc->renew_subs(); + + // Wait for MDS map + dout(4) << "waiting for MDS map..." 
<< dendl; + { + std::unique_lock locker{init_lock}; + cond.wait(locker, [&done] { return done; }); + } + dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl; + + finisher.start(); + + inited = true; + return 0; +} + + +void MDSUtility::shutdown() +{ + finisher.stop(); + + lock.lock(); + objecter->shutdown(); + lock.unlock(); + monc->shutdown(); + messenger->shutdown(); + messenger->wait(); + poolctx.finish(); +} + + +bool MDSUtility::ms_dispatch(Message *m) +{ + std::lock_guard locker{lock}; + switch (m->get_type()) { + case CEPH_MSG_FS_MAP: + handle_fs_map((MFSMap*)m); + break; + case CEPH_MSG_OSD_MAP: + break; + default: + return false; + } + m->put(); + return true; +} + + +void MDSUtility::handle_fs_map(MFSMap* m) +{ + *fsmap = m->get_fsmap(); + if (waiting_for_mds_map) { + waiting_for_mds_map->complete(0); + waiting_for_mds_map = NULL; + } +} + + diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h new file mode 100644 index 000000000..09f1918ba --- /dev/null +++ b/src/tools/cephfs/MDSUtility.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MDS_UTILITY_H_ +#define MDS_UTILITY_H_ + +#include "osdc/Objecter.h" +#include "mds/FSMap.h" +#include "messages/MFSMap.h" +#include "msg/Dispatcher.h" +#include "msg/Messenger.h" +#include "auth/Auth.h" +#include "common/async/context_pool.h" +#include "common/Finisher.h" +#include "common/Timer.h" + +/// MDS Utility +/** + * This class is the parent for MDS utilities, i.e. classes that + * need access the objects belonging to the MDS without actually + * acting as an MDS daemon themselves. 
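+ *
+ * (Editor's note) The expected lifecycle for a subclass looks roughly
+ * like this ("MyTool" is hypothetical):
+ *
+ *   MyTool tool;             // derives from MDSUtility
+ *   int r = tool.init();     // connect to mons, wait for the FSMap
+ *   if (r == 0) {
+ *     // ... use the objecter / fsmap / monc members ...
+ *   }
+ *   // the destructor calls shutdown() if init() succeeded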
+ */ +class MDSUtility : public Dispatcher { +protected: + Objecter *objecter; + FSMap *fsmap; + Messenger *messenger; + MonClient *monc; + + ceph::mutex lock = ceph::make_mutex("MDSUtility::lock"); + Finisher finisher; + ceph::async::io_context_pool poolctx; + + Context *waiting_for_mds_map; + + bool inited; +public: + MDSUtility(); + ~MDSUtility() override; + + void handle_fs_map(MFSMap* m); + bool ms_dispatch(Message *m) override; + bool ms_handle_reset(Connection *con) override { return false; } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { return false; } + int init(); + void shutdown(); +}; + +#endif /* MDS_UTILITY_H_ */ diff --git a/src/tools/cephfs/MetaTool.cc b/src/tools/cephfs/MetaTool.cc new file mode 100644 index 000000000..baa0d498a --- /dev/null +++ b/src/tools/cephfs/MetaTool.cc @@ -0,0 +1,1000 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include +#include +#include +#include + +#include "include/types.h" +#include "common/Formatter.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/mdstypes.h" +#include "mds/LogEvent.h" +#include "mds/InoTable.h" +#include "mds/CDentry.h" + +#include "mds/events/ENoOp.h" +#include "mds/events/EUpdate.h" + +#include "mds/JournalPointer.h" +// #include "JournalScanner.h" +// #include "EventOutput.h" +// #include "Dumper.h" +// #include "Resetter.h" + +// #include "JournalTool.h" +#include "MetaTool.h" +#include "type_helper.hpp" +#include "include/object.h" + +WRITE_RAW_ENCODER(char) +WRITE_RAW_ENCODER(unsigned char) + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + +using namespace std; + +void MetaTool::meta_op::release() +{ + for (const auto& i : inodes) { + delete i.second; + } + + while (!sub_ops.empty()) { + delete sub_ops.top(); + sub_ops.pop(); + } +} + +void MetaTool::inode_meta_t::decode_json(JSONObj *obj) +{ + unsigned long long tmp; + JSONDecoder::decode_json("snapid_t", tmp, obj, true); + _f.val = tmp; + JSONDecoder::decode_json("itype", tmp, obj, true); + _t = tmp; + if (NULL == _i) + _i = new InodeStore; + JSONDecoder::decode_json("store", *_i, obj, true); +} + +void MetaTool::usage() +{ + generic_client_usage(); +} + +int MetaTool::main(string& mode, + string& rank_str, + string& minfo, + string&ino, + string& out, + string& in, + bool confirm + ) +{ + int r = 0; + + std::string manual_meta_pool; + std::string manual_data_pool; + std::string manual_rank_num; + bool manual_mode = false; + if (minfo != "") { + vector v; + string_split(minfo, v); + manual_meta_pool = v.size() >= 1 ? v[0] : ""; + manual_data_pool = v.size() >= 2 ? v[1] : ""; + manual_rank_num = v.size() >= 3 ? v[2] : ""; + std::cout << "("<< minfo<< ")=>" + << " mpool: " << manual_meta_pool + << " dpool: " << manual_data_pool + << " rank: " << manual_rank_num + << std::endl; + if (!manual_meta_pool.empty() && !manual_data_pool.empty() && !manual_rank_num.empty()) { + std::cout << "you specify rank: " << manual_rank_num + << " mpool: " << manual_meta_pool + << " dpool: " << manual_data_pool + << "\nstart manual mode!!"<< std::endl; + manual_mode = true; + } + } + + // RADOS init + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + cerr << "RADOS unavailable" << std::endl; + return r; + } + + if (_debug) + cout << "MetaTool: connecting to RADOS..." 
<< std::endl; + r = rados.connect(); + if (r < 0) { + cerr << "couldn't connect to cluster: " << cpp_strerror(r) << std::endl; + return r; + } + + if (!manual_mode) { + r = role_selector.parse(*fsmap, rank_str); + if (r != 0) { + cerr << "Couldn't determine MDS rank." << std::endl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + assert(fs != nullptr); + + // prepare io for meta pool + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + features = fs->mds_map.get_up_features(); + if (features == 0) + features = CEPH_FEATURES_SUPPORTED_DEFAULT; + else if (features != CEPH_FEATURES_SUPPORTED_DEFAULT) { + cout << "I think we need to check the feature! : " << features << std::endl; + return -1; + } + + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl; + return r; + } + + if (_debug) + cout << "MetaTool: creating IoCtx.." << std::endl; + r = rados.ioctx_create(pool_name.c_str(), io_meta); + assert(r == 0); + output.dup(io_meta); + + // prepare io for data pool + for (const auto p : fs->mds_map.get_data_pools()) { + r = rados.pool_reverse_lookup(p, &pool_name); + if (r < 0) { + cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl; + return r; + } + librados::IoCtx* io_data = new librados::IoCtx; + r = rados.ioctx_create(pool_name.c_str(), *io_data); + assert(r == 0); + io_data_v.push_back(io_data); + } + + for (auto role : role_selector.get_roles()) { + rank = role.rank; + + r = process(mode, ino, out, in, confirm); + cout << "executing for rank " << rank << " op[" <(manual_rank_num); + r = process(mode, ino, out, in, confirm); + cout << "op[" << mode << "] ret : " << r << std::endl; + } + return r; +} + +int MetaTool::process(string& mode, string& ino, string out, string in, bool confirm) +{ + if (mode == "showm") { + return show_meta_info(ino, out); + } else if (mode == "showfn") { + return show_fnode(ino, out); + } else if (mode == "listc") { + return list_meta_info(ino, out); + } else if (mode == "amend") { + return amend_meta_info(ino, in, confirm); + } else if (mode == "amendfn") { + return amend_fnode(in, confirm); + } else { + cerr << "bad command '" << mode << "'" << std::endl; + return -EINVAL; + } +} +int MetaTool::show_fnode(string& ino, string& out) +{ + if (ino != "0") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, out); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_SHOW_FN; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} +int MetaTool::amend_fnode(string& in, bool confirm) +{ + meta_op op(_debug, "", in, confirm); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_AMEND_FN; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = 0; + op.push_op(nsop); + return op_process(op); +} +int MetaTool::amend_meta_info(string& ino, string& in, bool confirm) +{ + if (ino != "0" && in != "") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, "", in, confirm); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_AMEND; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? 
: ino = " << ino << std::endl; + } + return 0; +} +int MetaTool::list_meta_info(string& ino, string& out) +{ + if (ino != "0") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, out); + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} +int MetaTool::show_meta_info(string& ino, string& out) +{ + if (ino != "0") { + inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0); + meta_op op(_debug, out); + + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_SHOW; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->ino = i_ino; + op.push_op(nsop); + return op_process(op); + } else { + cerr << "parameter error? : ino = " << ino << std::endl; + } + return 0; +} + +int MetaTool::op_process(meta_op& op) +{ + int r = 0; + while (!op.no_sops()) { + if (_debug) + std::cout << "process : " << op.top_op()->detail() << std::endl; + switch(op.top_op()->sub_op_t) { + case meta_op::OP_LIST: + r = list_meta(op); + break; + case meta_op::OP_LTRACE: + r = file_meta(op); + break; + case meta_op::OP_SHOW: + r = show_meta(op); + break; + case meta_op::OP_AMEND: + r = amend_meta(op); + break; + case meta_op::OP_SHOW_FN: + r = show_fn(op); + break; + case meta_op::OP_AMEND_FN: + r = amend_fn(op); + break; + default: + cerr << "unknow op" << std::endl; + } + if (r == 0) + op.pop_op(); + else if (r < 0) + op.clear_sops(); + } + op.release(); + return r; +} + +int MetaTool::amend_meta(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + auto item = op.inodes.find(sop->ino); + auto item_k = op.okeys.find(sop->ino); + if (item != op.inodes.end() && item_k != op.okeys.end()) { + if (_amend_meta(item_k->second, *(item->second), op.infile(), op) < 0) + return -1; + } else { + if (op.inodes.empty()) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = 0; + nsop->ino_c = sop->ino; + op.push_op(nsop); + return 1; + } else { + return -1; + } + } + return 0; +} + +void MetaTool::inode_meta_t::encode(::ceph::bufferlist& bl, uint64_t features) +{ + ::encode(_f, bl); + ::encode(_t, bl); + _i->encode_bare(bl, features); +} +int MetaTool::_amend_meta(string& k, inode_meta_t& inode_meta, const string& fn, meta_op& op) +{ + JSONParser parser; + if (!parser.parse(fn.c_str())) { + cout << "Error parsing create user response" << std::endl; + return -1; + } + + try { + inode_meta.decode_json(&parser); + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -1; + } + + if (!op.confirm_chg() || op.is_debug()) { + cout << "you will amend info of inode ==>: " << std::endl; + _show_meta(inode_meta, ""); + } + + if (!op.confirm_chg()) { + cout << "warning: this operation is irreversibl!!!\n" + << " You must confirm that all logs of mds have been flushed!!!\n" + << " if you want amend it, please add --yes-i-really-really-mean-it!!!" 
+ << std::endl; + return -1; + } + + bufferlist bl; + inode_meta.encode(bl, features); + map to_set; + to_set[k].swap(bl); + inode_backpointer_t bp; + if (!op.top_op()->get_ancestor(bp)) + return -1; + frag_t frag; + auto item = op.inodes.find(bp.dirino); + if (item != op.inodes.end()) { + frag = item->second->get_meta()->pick_dirfrag(bp.dname); + } + string oid = obj_name(bp.dirino, frag); + int ret = io_meta.omap_set(oid, to_set); + to_set.clear(); + return ret; +} +int MetaTool::show_fn(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + auto item = op.inodes.find(sop->ino); + if (item != op.inodes.end()) { + if (_show_fn(*(item->second), op.outfile()) < 0) + return -1; + } else { + if (op.inodes.empty()) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = 0; + nsop->ino_c = sop->ino; + op.push_op(nsop); + return 1; + } else + return -1; + } + return 0; +} +int MetaTool::_show_fn(inode_meta_t& inode_meta, const string& fn) +{ + std::list frags; + inode_meta.get_meta()->dirfragtree.get_leaves(frags); + std::stringstream ds; + std::string format = "json"; + std::string oids; + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->open_object_section("fnodes"); + for (const auto &frag : frags) { + bufferlist hbl; + string oid = obj_name(inode_meta.get_meta()->inode->ino, frag); + int ret = io_meta.omap_get_header(oid, &hbl); + if (ret < 0) { + std::cerr << __func__ << " : can't find oid("<< oid << ")" << std::endl; + return -1; + } + { + fnode_t got_fnode; + try { + auto p = hbl.cbegin(); + ::decode(got_fnode, p); + } catch (const buffer::error &err) { + cerr << "corrupt fnode header in " << oid + << ": " << err.what() << std::endl; + return -1; + } + if (!oids.empty()) + oids += ","; + oids += oid; + f->open_object_section(oid.c_str()); + got_fnode.dump(f); + f->close_section(); + } + } + f->dump_string("oids", oids.c_str()); + f->close_section(); + f->flush(ds); + if (fn != "") { + ofstream o; + o.open(fn); + if (o) { + o << ds.str(); + o.close(); + } else { + cout << "out to file (" << fn << ") failed" << std::endl; + cout << ds.str() << std::endl; + } + } else + std::cout << ds.str() << std::endl; + return 0; +} +int MetaTool::amend_fn(meta_op &op) +{ + if (_amend_fn(op.infile(), op.confirm_chg()) < 0) + return -1; + return 0; +} +int MetaTool::_amend_fn(const string& fn, bool confirm) +{ + JSONParser parser; + if (!parser.parse(fn.c_str())) { + cout << "Error parsing create user response : " << fn << std::endl; + return -1; + } + if (!confirm) { + cout << "warning: this operation is irreversibl!!!\n" + << " You must confirm that all logs of mds have been flushed!!!\n" + << " if you want amend it, please add --yes-i-really-really-mean-it!!!" + << std::endl; + return -1; + } + try { + string tmp; + JSONDecoder::decode_json("oids", tmp, &parser, true); + string::size_type pos1, pos2; + vector v; + string c = ","; + pos2 = tmp.find(c); + pos1 = 0; + while (string::npos != pos2) { + v.push_back(tmp.substr(pos1, pos2-pos1)); + pos1 = pos2 + c.size(); + pos2 = tmp.find(c, pos1); + } + if (pos1 != tmp.length()) + v.push_back(tmp.substr(pos1)); + int ret = 0; + for (auto i : v) { + cout << "amend frag : " << i << "..." 
<< std::endl; + fnode_t fnode; + JSONDecoder::decode_json(i.c_str(), fnode, &parser, true); + bufferlist bl; + fnode.encode(bl); + ret = io_meta.omap_set_header(i, bl); + if (ret < 0) + return ret; + } + } catch (JSONDecoder::err& e) { + cout << "failed to decode JSON input: " << e.what() << std::endl; + return -1; + } + return 0; +} +int MetaTool::show_meta(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + auto item = op.inodes.find(sop->ino); + if (item != op.inodes.end()) { + if (_show_meta(*(item->second), op.outfile()) < 0) + return -1; + } else { + if (op.inodes.empty()) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = 0; + nsop->ino_c = sop->ino; + op.push_op(nsop); + return 1; + } else { + return -1; + } + } + return 0; +} +int MetaTool::_show_meta(inode_meta_t& inode_meta, const string& fn) +{ + std::stringstream ds; + std::string format = "json"; + InodeStore& inode_data = *inode_meta.get_meta(); + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->open_object_section("meta"); + f->dump_unsigned("snapid_t", inode_meta.get_snapid()); + f->dump_unsigned("itype", inode_meta.get_type()); + f->open_object_section("store"); + inode_data.dump(f); + try { + if (inode_data.snap_blob.length()) { + sr_t srnode; + auto p = inode_data.snap_blob.cbegin(); + decode(srnode, p); + f->open_object_section("snap_blob"); + srnode.dump(f); + f->close_section(); + } + } catch (const buffer::error &err) { + cerr << "corrupt decode in snap_blob" + << ": " << err.what() << std::endl; + return -1; + } + + f->close_section(); + f->close_section(); + f->flush(ds); + + if (fn != "") { + ofstream o; + o.open(fn); + if (o) { + o << ds.str(); + o.close(); + } else { + cout << "out to file (" << fn << ") failed" << std::endl; + cout << ds.str() << std::endl; + } + + } else + std::cout << ds.str() << std::endl; + return 0; +} +int MetaTool::list_meta(meta_op &op) +{ + meta_op::sub_op* sop = op.top_op(); + + bool list_all = false; + string oid; + inodeno_t ino = sop->ino_c; + frag_t frag = sop->frag; + + if (sop->ino_c == 0) { + list_all = true; + oid = obj_name(sop->ino, frag); + } else { + if (_debug) + std::cout << __func__ << " : " << sop->trace_level << " " << op.ancestors.size() << std::endl; + inode_backpointer_t bp; + if (sop->get_c_ancestor(bp)) { + auto item = op.inodes.find(bp.dirino); + if (item != op.inodes.end()) { + frag = item->second->get_meta()->pick_dirfrag(bp.dname); + } + oid = obj_name(bp.dirino, frag); + } else { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino = sop->ino_c; + nsop->sub_op_t = meta_op::OP_LTRACE; + nsop->sub_ino_t = meta_op::INO_DIR; + op.push_op(nsop); + return 1; + } + } + if (_debug) + std::cout << __func__ << " : " << string(list_all?"listall ":"info ") << oid << " "<< ino << std::endl; + bufferlist hbl; + int ret = io_meta.omap_get_header(oid, &hbl); + if (ret < 0) { + std::cerr << __func__ << " : can't find it, maybe it (ino:"<< sop->ino<< ")isn't a normal dir!" 
<< std::endl; + return -1; + } + + if (hbl.length() == 0) { // obj has splite + if (list_all) { + if (frag == frag_t()) { + auto item = op.inodes.find(sop->ino); + if (item != op.inodes.end()) { + inodeno_t tmp = sop->ino; + op.pop_op(); + std::list frags; + item->second->get_meta()->dirfragtree.get_leaves(frags); + for (const auto &frag : frags) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino = tmp; + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->frag = frag; + op.push_op(nsop); + } + } else { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino_c = sop->ino; + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + op.push_op(nsop); + } + return 1; + } else { + cerr << __func__ << " missing some data (" << oid << ")???" << std::endl; + return -1; + } + } else { + if (frag == frag_t()) { + inode_backpointer_t bp; + if (sop->get_c_ancestor(bp)) { + meta_op::sub_op* nsop = new meta_op::sub_op(&op); + nsop->ino_c = bp.dirino; + nsop->sub_op_t = meta_op::OP_LIST; + nsop->sub_ino_t = meta_op::INO_DIR; + nsop->trace_level = sop->trace_level + 1; + op.push_op(nsop); + return 1; + } else { + cerr << __func__ << "can't find obj(" << oid << ") ,miss ancestors or miss some objs??? " << std::endl; + return -1; + } + } else { + cerr << __func__ << "missing some objs(" << oid << ")??? " << std::endl; + return -1; + } + } + } + + fnode_t got_fnode; + try { + auto p = hbl.cbegin(); + ::decode(got_fnode, p); + } catch (const buffer::error &err) { + cerr << "corrupt fnode header in " << oid + << ": " << err.what() << std::endl; + return -1; + } + + if (_debug) { + std::string format = "json"; + Formatter* f = Formatter::create(format); + f->enable_line_break(); + f->dump_string("type", "--fnode--"); + f->open_object_section("fnode"); + got_fnode.dump(f); + f->close_section(); + f->flush(std::cout); + std::cout << std::endl; + } + + // print children + std::map out_vals; + int max_vals = 5; + io_meta.omap_get_vals(oid, "", max_vals, &out_vals); + + bool force_dirty = false; + const set *snaps = NULL; + unsigned pos = out_vals.size() - 1; + std::string last_dname; + for (map::iterator p = out_vals.begin(); + p != out_vals.end(); + ++p, --pos) { + string dname; + snapid_t last; + dentry_key_t::decode_helper(p->first, dname, last); + if (_debug) + last_dname = dname; + try { + if (!list_all) { + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty, ino, &op) == 1) { + return 0; + } + } else { + cout << "dname : " << dname << " " << last << std::endl; + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty) == 1) + return 0; + } + } catch (const buffer::error &err) { + derr << "Corrupt dentry '" << dname << "' : " + << err.what() << "(" << "" << ")" << dendl; + return -1; + } + } + while (out_vals.size() == (size_t)max_vals) { + out_vals.clear(); + io_meta.omap_get_vals(oid, last_dname, max_vals, &out_vals); + pos = out_vals.size() - 1; + for (map::iterator p = (++out_vals.begin()); + p != out_vals.end(); + ++p, --pos) { + string dname; + snapid_t last; + dentry_key_t::decode_helper(p->first, dname, last); + last_dname = dname; + try { + if (!list_all) { + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty, ino, &op) == 1) { + return 0; + } + } else { + cout << "dname : " << dname << " " << last << std::endl; + if (show_child(p->first, dname, last, p->second, pos, snaps, + &force_dirty) == 1) + return 0; + } + } catch (const buffer::error &err) { + derr << 
"Corrupt dentry '" << dname << "' : " + << err.what() << "(" << "" << ")" << dendl; + return -1; + } + } + } + + if (!list_all) { + cerr << __func__ << "miss obj(ino:" << ino << ")??? " << std::endl; + return -1; + } + return 0; +} + +int MetaTool::file_meta(meta_op &op) +{ + int r = 0; + if (op.top_op()->sub_ino_t == meta_op::INO_DIR) { + r = _file_meta(op, io_meta); + } else if (op.top_op()->sub_ino_t == meta_op::INO_F) { + for (auto i = io_data_v.begin(); i != io_data_v.end(); ++i) + if ((r = _file_meta(op, **i)) == 1) + break; + } + if (r == 1) { + inode_backpointer_t bp; + if (op.top_op()->get_ancestor(bp)) { + return 0; + } else { + std::cerr << "no trace for obj (ino:" << op.top_op()->ino <<")??" << std::endl; + return -1; + } + } else if (op.top_op()->sub_ino_t == meta_op::INO_DIR) { + std::cerr << "\tmaybe it's a file(ino:" << op.top_op()->ino << ")" << std::endl; + op.top_op()->sub_ino_t = meta_op::INO_F; + return 1; + } + + std::cerr << "can't get (ino:" << op.top_op()->ino <<")trace??" << std::endl; + return -1; +} + +int MetaTool::_file_meta(meta_op &op, librados::IoCtx& io) +{ + inodeno_t ino = op.top_op()->ino; + std::string oid = obj_name(ino); + bufferlist pointer_bl; + std::map attrset; + int r = 0; + bool have_data = false; + r = io.getxattrs (oid.c_str(), attrset); + if (0 == r) { + std::stringstream ds; + std::string format = "json"; + Formatter* f = Formatter::create(format); + auto item = attrset.find("parent"); + if (item != attrset.end()) { + inode_backtrace_t i_bt; + try { + bufferlist::const_iterator q = item->second.cbegin(); + i_bt.decode(q); + f->open_array_section("info"); + have_data = true; + if (i_bt.ancestors.size() > 0) + op.ancestors[ino] = i_bt.ancestors[0]; + f->dump_string("type", "--i_bt--"); + f->open_object_section("parent"); + i_bt.dump(f); + f->close_section(); + } catch (buffer::error &e) { + cerr << "failed to decode parent of " << oid << std::endl; + return -1; + } + } else { + cerr << oid << " in " << io.get_pool_name() << " , but no parent" << std::endl; + return -1; + } + + item = attrset.find("layout"); + if (item != attrset.end()) { + file_layout_t layout; + try { + auto q = item->second.cbegin(); + layout.decode(q); + f->dump_string("type", "--layout--"); + f->open_object_section("layout"); + layout.dump(f); + f->close_section(); + + } catch (buffer::error &e) { + cerr << "failed to decode layout of " << oid << std::endl; + return -1; + } + } else { + cerr << oid << " in " << io.get_pool_name() << " , but no layout" << std::endl; + } + if (have_data) { + f->close_section(); + f->flush(ds); + if (_debug) + cout << ino << " : "<< ds.str() << std::endl; + return 1; + } + } + return 0; +} +std::string MetaTool::obj_name(inodeno_t ino, uint64_t offset, const char *suffix) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)offset, suffix ? suffix : ""); + return std::string(name); +} +std::string MetaTool::obj_name(inodeno_t ino, frag_t fg, const char *suffix) const +{ + char name[60]; + snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)fg, suffix ? suffix : ""); + return std::string(name); +} + +std::string MetaTool::obj_name(const char* ino, uint64_t offset, const char *suffix) const +{ + char name[60]; + snprintf(name, sizeof(name), "%s.%08llx%s", ino, (long long unsigned)offset, suffix ? 
suffix : "");
+  std::string out = name;
+  transform(out.begin(), out.end(), out.begin(), ::tolower);
+  return out;
+}
+
+int MetaTool::show_child(std::string_view key,
+                         std::string_view dname,
+                         const snapid_t last,
+                         bufferlist &bl,
+                         const int pos,
+                         const std::set<snapid_t> *snaps,
+                         bool *force_dirty,
+                         inodeno_t sp_ino,
+                         meta_op* op)
+{
+  bufferlist::const_iterator q = bl.cbegin();
+
+  snapid_t first;
+  ::decode(first, q);
+
+  // marker
+  char type;
+  ::decode(type, q);
+
+  if (_debug)
+    std::cout << pos << " type '" << type << "' dname '" << dname
+              << "' [" << first << "," << last << "]"
+              << std::endl;
+  // bool stale = false;
+  if (snaps && last != CEPH_NOSNAP) {
+    derr << "!!!! error !!!!" << dendl;
+    return -1;
+  }
+
+  // CDentry *dn = NULL;
+  // look for existing dentry for _last_ snap, can't process snap of obj
+  //if *(stale)
+  //  dn = lookup_exact_snap(dname, last);
+  //else
+  //  dn = lookup(dname, last);
+  if (type == 'L' || type == 'l') {
+    // hard link
+    inodeno_t ino;
+    unsigned char d_type;
+    mempool::mds_co::string alternate_name;
+
+    CDentry::decode_remote(type, ino, d_type, alternate_name, q);
+
+    if (sp_ino > 0) {
+      if (sp_ino == ino) {
+        std::cout << "found hard link : " << ino << "," << d_type << std::endl;
+        return 1;
+      }
+    }
+
+    std::cout << "hard link : " << ino << "," << d_type << std::endl;
+  } else if (type == 'I' || type == 'i') {
+    // inode
+    // load inode data before looking up or constructing CInode
+    InodeStore& inode_data = *(new InodeStore);
+    if (type == 'i') {
+      mempool::mds_co::string alternate_name;
+
+      DECODE_START(2, q);
+      if (struct_v >= 2)
+        decode(alternate_name, q);
+      inode_data.decode(q);
+      DECODE_FINISH(q);
+    } else {
+      inode_data.decode_bare(q);
+    }
+
+    std::stringstream ds;
+    std::string format = "json";
+    Formatter* f = Formatter::create(format);
+    f->enable_line_break();
+    f->open_object_section("meta");
+    f->dump_unsigned("snapid_t", first);
+    f->dump_unsigned("itype", type);
+    f->open_object_section("store");
+    inode_data.dump(f);
+    try {
+      if (inode_data.snap_blob.length()) {
+        sr_t srnode;
+        auto p = inode_data.snap_blob.cbegin();
+        srnode.decode(p);
+        f->open_object_section("snap_blob");
+        srnode.dump(f);
+        f->close_section();
+      }
+    } catch (const buffer::error &err) {
+      cerr << "corrupt decode in snap_blob"
+           << ": " << err.what() << std::endl;
+    }
+    f->close_section();
+    f->close_section();
+    f->flush(ds);
+
+    if (sp_ino > 0 && op != NULL && sp_ino == inode_data.inode->ino) {
+      inode_meta_t* tmp = new inode_meta_t(first, type, &inode_data);
+      op->inodes[inode_data.inode->ino] = tmp;
+      op->okeys[inode_data.inode->ino] = key.data();
+      return 1;
+    } else {
+      delete &inode_data;
+    }
+
+    if (sp_ino == 0) {
+      cout << ds.str() << std::endl;
+    }
+  } else {
+    std::cerr << __func__ << " unknown type : " << dname << "," << type << std::endl;
+  }
+  return 0;
+}
diff --git a/src/tools/cephfs/MetaTool.h b/src/tools/cephfs/MetaTool.h
new file mode 100644
index 000000000..d36f7bba2
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef METATOOL_H__
+#define METATOOL_H__
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+#include <stack>
+using std::stack;
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_json.h"
+
+using ::ceph::bufferlist;
+class MetaTool : public MDSUtility
+{
+public:
+  class inode_meta_t {
+  public:
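+    // (Editor's note, not upstream code) an inode_meta_t bundles one decoded
+    // omap dentry value: _f is the snapid, _t the type byte ('I' = embedded
+    // inode, 'L' = hard link), _i the decoded InodeStore -- e.g. show_child()
+    // builds new inode_meta_t(first, type, &inode_data) for the target ino.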
+    inode_meta_t(snapid_t f = CEPH_NOSNAP, char t = char(255), InodeStore* i = NULL):
+      _f(f),_t(t),_i(i) {
+    };
+    snapid_t get_snapid() const {
+      return _f;
+    }
+    InodeStore* get_meta() const {
+      if (_t == 'I')
+        return _i;
+      else
+        return NULL;
+    }
+    int get_type() const {
+      return _t;
+    }
+    void decode_json(JSONObj *obj);
+    void encode(::ceph::bufferlist& bl, uint64_t features);
+  private:
+    snapid_t _f;
+    char _t;
+    InodeStore* _i;
+  };
+private:
+  class meta_op {
+  public:
+    meta_op(bool debug = false, std::string out = "", std::string in = "", bool confirm = false):
+      _debug(debug),
+      _out(out),
+      _in(in),
+      _confirm(confirm)
+    {}
+    void release();
+    typedef enum {
+      OP_LIST = 0,
+      OP_LTRACE,
+      OP_SHOW,
+      OP_AMEND,
+      OP_SHOW_FN,
+      OP_AMEND_FN,
+      OP_NO
+    } op_type;
+
+    typedef enum {
+      INO_DIR = 0,
+      INO_F
+    } ino_type;
+
+    static std::string op_type_name(op_type& t) {
+      std::string name;
+      switch (t) {
+      case OP_LIST:
+        name = "list dir";
+        break;
+      case OP_LTRACE:
+        name = "load trace";
+        break;
+      case OP_SHOW:
+        name = "show info";
+        break;
+      case OP_AMEND:
+        name = "amend info";
+        break;
+      case OP_SHOW_FN:
+        name = "show fnode";
+        break;
+      case OP_AMEND_FN:
+        name = "amend fnode";
+        break;
+      case OP_NO:
+        name = "noop";
+        break;
+      default:
+        name = "unknown op type";
+      }
+      return name;
+    }
+    static std::string ino_type_name(ino_type& t) {
+      std::string name;
+      switch (t) {
+      case INO_DIR:
+        name = "dir";
+        break;
+      case INO_F:
+        name = "file";
+        break;
+      default:
+        name = "unknown file type";
+      }
+      return name;
+    }
+    class sub_op {
+    public:
+      sub_op(meta_op* mop):
+        trace_level(0),
+        _proc(false),
+        _mop(mop)
+      {}
+      void print() {
+        std::cout << detail() << std::endl;
+      }
+      std::string detail() {
+        std::stringstream ds;
+        ds << " [sub_op]" << op_type_name(sub_op_t) << "|"
+           << ino_type_name(sub_ino_t) << "|"
+           << ino << "|"
+           << frag << "|"
+           << ino_c << "|"
+           << trace_level << "|"
+           << name;
+        return ds.str();
+      }
+      bool get_c_ancestor(inode_backpointer_t& bp) {
+        if (!_mop || !ino_c)
+          return false;
+        auto item = _mop->ancestors.find(ino_c);
+        if (item != _mop->ancestors.end()) {
+          bp = item->second;
+          return true;
+        } else
+          return false;
+      }
+      bool get_ancestor(inode_backpointer_t& bp) {
+        if (!_mop || !ino)
+          return false;
+        auto item = _mop->ancestors.find(ino);
+        if (item != _mop->ancestors.end()) {
+          bp = item->second;
+          return true;
+        } else
+          return false;
+      }
+      op_type sub_op_t;
+      ino_type sub_ino_t;
+      inodeno_t ino;
+      frag_t frag;
+      inodeno_t ino_c;
+      unsigned trace_level;
+      std::string name;
+      bool _proc;
+      meta_op* _mop;
+    };
+
+    std::map<inodeno_t, inode_backpointer_t> ancestors;
+    std::map<inodeno_t, inode_meta_t*> inodes;
+    std::map<inodeno_t, std::string> okeys;
+
+    void clear_sops() {
+      while(!no_sops())
+        pop_op();
+    }
+    bool no_sops() {
+      return sub_ops.empty();
+    }
+    void push_op(sub_op* sop) {
+      if (_debug)
+        std::cout << "<<====" << sop->detail() << std::endl;
+      sub_ops.push(sop);
+    }
+    sub_op* top_op() {
+      return sub_ops.top();
+    }
+    void pop_op() {
+      sub_op* sop = sub_ops.top();
+      if (_debug)
+        std::cout << "====>>" << sop->detail() << std::endl;
+      delete sop;
+      sub_ops.pop();
+    }
+    std::string outfile() {
+      return _out;
+    }
+    std::string infile() {
+      return _in;
+    }
+    bool is_debug() {
+      return _debug;
+    }
+    bool confirm_chg() {
+      return _confirm;
+    }
+  private:
+    stack<sub_op*> sub_ops;
+    bool _debug;
+    std::string _out;
+    std::string _in;
+    bool _confirm;
+  };
+  MDSRoleSelector role_selector;
+  mds_rank_t rank;
+
+  // I/O handles
+  librados::Rados rados;
+  librados::IoCtx io_meta;
+  std::vector<librados::IoCtx*> io_data_v;
+  librados::IoCtx output;
+  bool _debug;
+ uint64_t features; + + std::string obj_name(inodeno_t ino, frag_t fg = frag_t(), const char *suffix = NULL) const; + std::string obj_name(inodeno_t ino, uint64_t offset, const char *suffix = NULL) const; + std::string obj_name(const char* ino, uint64_t offset, const char *suffix = NULL) const; + + // 0 : continue to find + // 1 : stop to find it + int show_child(std::string_view key, + std::string_view dname, + const snapid_t last, + bufferlist &bl, + const int pos, + const std::set *snaps, + bool *force_dirty, + inodeno_t sp_ino = 0, + meta_op* op = NULL + ); + + int process(std::string& mode, std::string& ino, std::string out, std::string in, bool confirm); + int show_meta_info(std::string& ino, std::string& out); + int list_meta_info(std::string& ino, std::string& out); + int amend_meta_info(std::string& ino, std::string& in, bool confirm); + int show_fnode(std::string& ino, std::string& out); + int amend_fnode(std::string& in, bool confirm); + int op_process(meta_op &op); + int list_meta(meta_op &op); + int file_meta(meta_op &op); + int show_meta(meta_op &op); + int amend_meta(meta_op &op); + int show_fn(meta_op &op); + int amend_fn(meta_op &op); + public: + int _file_meta(meta_op &op, librados::IoCtx& io); + int _show_meta(inode_meta_t& i, const std::string& fn); + int _amend_meta(std::string &k, inode_meta_t& i, const std::string& fn, meta_op& op); + int _show_fn(inode_meta_t& i, const std::string& fn); + int _amend_fn(const std::string& fn, bool confirm); + void usage(); + MetaTool(bool debug=false): + _debug(debug) {} + ~MetaTool() {} + + int main(std::string& mode, + std::string& rank_str, + std::string& minfo, + std::string&ino, + std::string& out, + std::string& in, + bool confirm = false + ); +}; +#endif // METATOOL_H__ diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc new file mode 100644 index 000000000..2abca7223 --- /dev/null +++ b/src/tools/cephfs/PgFiles.cc @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "osdc/Striper.h" + +#include "PgFiles.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << "pgeffects." << __func__ << ": " + +int PgFiles::init() +{ + int r = ceph_create_with_context(&cmount, g_ceph_context); + if (r != 0) { + return r; + } + + return ceph_init(cmount); +} + +PgFiles::PgFiles(Objecter *o, const std::set &pgs_) + : objecter(o), pgs(pgs_) +{ + for (const auto &i : pgs) { + pools.insert(i.m_pool); + } +} + +PgFiles::~PgFiles() +{ + ceph_release(cmount); +} + +void PgFiles::hit_dir(std::string const &path) +{ + dout(10) << "entering " << path << dendl; + + ceph_dir_result *dr = nullptr; + int r = ceph_opendir(cmount, path.c_str(), &dr); + if (r != 0) { + derr << "Failed to open path: " << cpp_strerror(r) << dendl; + return; + } + + struct dirent de; + while((r = ceph_readdir_r(cmount, dr, &de)) != 0) { + if (r < 0) { + derr << "Error reading path " << path << ": " << cpp_strerror(r) + << dendl; + ceph_closedir(cmount, dr); // best effort, ignore r + return; + } + + if (std::string(de.d_name) == "." 
|| std::string(de.d_name) == "..") { + continue; + } + + struct ceph_statx stx; + std::string de_path = (path + std::string("/") + de.d_name); + r = ceph_statx(cmount, de_path.c_str(), &stx, + CEPH_STATX_INO|CEPH_STATX_SIZE, 0); + if (r != 0) { + derr << "Failed to stat path " << de_path << ": " + << cpp_strerror(r) << dendl; + // Don't hold up the whole process for one bad inode + continue; + } + + if (S_ISREG(stx.stx_mode)) { + hit_file(de_path, stx); + } else if (S_ISDIR(stx.stx_mode)) { + hit_dir(de_path); + } else { + dout(20) << "Skipping non reg/dir file: " << de_path << dendl; + } + } + + r = ceph_closedir(cmount, dr); + if (r != 0) { + derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl; + return; + } +} + +void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx) +{ + ceph_assert(S_ISREG(stx.stx_mode)); + + dout(20) << "Hitting file '" << path << "'" << dendl; + + int l_stripe_unit = 0; + int l_stripe_count = 0; + int l_object_size = 0; + int l_pool_id = 0; + int r = ceph_get_path_layout(cmount, path.c_str(), &l_stripe_unit, + &l_stripe_count, &l_object_size, + &l_pool_id); + if (r != 0) { + derr << "Error reading layout on " << path << ": " << cpp_strerror(r) + << dendl; + return; + } + + struct file_layout_t layout; + layout.stripe_unit = l_stripe_unit; + layout.stripe_count = l_stripe_count; + layout.object_size = l_object_size; + layout.pool_id = l_pool_id; + + // Avoid calculating PG if the layout targeted a completely different pool + if (pools.count(layout.pool_id) == 0) { + dout(20) << "Fast check missed: pool " << layout.pool_id << " not in " + "target set" << dendl; + return; + } + + auto num_objects = Striper::get_num_objects(layout, stx.stx_size); + + for (uint64_t i = 0; i < num_objects; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)stx.stx_ino, + (long long unsigned int)i); + dout(20) << " object " << std::string(buf) << dendl; + + pg_t target; + object_t oid; + object_locator_t loc; + loc.pool = layout.pool_id; + loc.key = std::string(buf); + + unsigned pg_num_mask = 0; + unsigned pg_num = 0; + + int r = 0; + objecter->with_osdmap([&r, oid, loc, &target, &pg_num_mask, &pg_num] + (const OSDMap &osd_map) { + r = osd_map.object_locator_to_pg(oid, loc, target); + if (r == 0) { + auto pool = osd_map.get_pg_pool(loc.pool); + pg_num_mask = pool->get_pg_num_mask(); + pg_num = pool->get_pg_num(); + } + }); + if (r != 0) { + // Can happen if layout pointed to pool not in osdmap, for example + continue; + } + + target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask); + + dout(20) << " target " << target << dendl; + + if (pgs.count(target)) { + std::cout << path << std::endl; + return; + } + } + +} + +int PgFiles::scan_path(std::string const &path) +{ + int r = ceph_mount(cmount, "/"); + if (r != 0) { + derr << "Failed to mount: " << cpp_strerror(r) << dendl; + return r; + } + + hit_dir(path); + + r = ceph_unmount(cmount); + if (r != 0) { + derr << "Failed to unmount: " << cpp_strerror(r) << dendl; + return r; + } + + return r; +} + diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h new file mode 100644 index 000000000..1ba4b3d28 --- /dev/null +++ b/src/tools/cephfs/PgFiles.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of 
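[editor's note] PgFiles::hit_file() above derives each stripe object's RADOS name with snprintf("%llx.%08llx", ino, stripe_index) and then runs the usual placement arithmetic (object_locator_to_pg(), then ceph_stable_mod() over pg_num/pg_num_mask). The naming half is pure formatting and can be sanity-checked in isolation; object_name below is a hypothetical helper, not part of the patch:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Name of stripe object `index` of inode `ino`, using the same
    // "%llx.%08llx" format string as PgFiles::hit_file() above.
    static std::string object_name(uint64_t ino, uint64_t index) {
      char buf[32];
      snprintf(buf, sizeof(buf), "%llx.%08llx",
               (unsigned long long)ino, (unsigned long long)index);
      return buf;
    }

    int main() {
      assert(object_name(0x10000000000ULL, 2) == "10000000000.00000002");
    }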
the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef PG_EFFECTS_H_ +#define PG_EFFECTS_H_ + +#include "include/cephfs/libcephfs.h" +#include "osd/osd_types.h" +#include +#include "osdc/Objecter.h" + +/** + * This utility scans the files (via an online MDS) and works out + * which ones rely on named PGs. For use when someone has + * some bad/damaged PGs and wants to see which files might be + * affected. + */ +class PgFiles +{ +private: + Objecter *objecter; + struct ceph_mount_info *cmount = nullptr; + + std::set pgs; + std::set pools; + + void hit_file(std::string const &path, const struct ceph_statx &stx); + void hit_dir(std::string const &path); + + +public: + PgFiles(Objecter *o, const std::set &pgs_); + ~PgFiles(); + + int init(); + int scan_path(std::string const &path); +}; + +#endif + diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc new file mode 100644 index 000000000..7c0aa30ab --- /dev/null +++ b/src/tools/cephfs/Resetter.cc @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include +#include "common/errno.h" +#include "osdc/Journaler.h" +#include "mds/JournalPointer.h" + +#include "mds/mdstypes.h" +#include "mds/MDCache.h" +#include "mon/MonClient.h" +#include "mds/events/EResetJournal.h" + +#include "Resetter.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds + +using namespace std; + +int Resetter::init(mds_role_t role_, const std::string &type, bool hard) +{ + role = role_; + int r = MDSUtility::init(); + if (r < 0) { + return r; + } + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(nullptr != fs); + + is_mdlog = false; + if (type == "mdlog") { + JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); + int rt = 0; + if (hard) { + jp.front = role.rank + MDS_INO_LOG_OFFSET; + jp.back = 0; + rt = jp.save(objecter); + if (rt != 0) { + derr << "Error writing journal pointer: " << cpp_strerror(rt) << dendl; + return rt; + } + ino = jp.front; // only need to reset ino for mdlog + } else { + rt = jp.load(objecter); + if (rt != 0) { + std::cerr << "Error loading journal: " << cpp_strerror(rt) << + ", pass --force to forcibly reset this journal" << std::endl; + return rt; + } else { + ino = jp.front; + } + } + is_mdlog = true; + } else if (type == "purge_queue") { + ino = MDS_INO_PURGE_QUEUE + role.rank; + } else { + ceph_abort(); // should not get here + } + return 0; +} + +int Resetter::reset() +{ + ceph::mutex mylock = ceph::make_mutex("Resetter::reset::lock"); + ceph::condition_variable cond; + bool done; + int r; + + auto fs = fsmap->get_filesystem(role.fscid); + ceph_assert(fs != nullptr); + + Journaler journaler("resetter", ino, + fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, + objecter, 0, 0, &finisher); + { + std::lock_guard locker{lock}; + journaler.recover(new C_SafeCond(mylock, cond, &done, &r)); + } + { + std::unique_lock locker{mylock}; + cond.wait(locker, [&done] { return done; }); + } + if (r != 0) { + if (r == -ENOENT) { + cerr << "journal does not exist on-disk. Did you set a bad rank?" 
+ << std::endl; + std::cerr << "Error loading journal: " << cpp_strerror(r) << + ", pass --force to forcibly reset this journal" << std::endl; + return r; + } else { + cerr << "got error " << r << "from Journaler, failing" << std::endl; + return r; + } + } + + lock.lock(); + uint64_t old_start = journaler.get_read_pos(); + uint64_t old_end = journaler.get_write_pos(); + uint64_t old_len = old_end - old_start; + cout << "old journal was " << old_start << "~" << old_len << std::endl; + + uint64_t new_start = round_up_to(old_end+1, journaler.get_layout_period()); + cout << "new journal start will be " << new_start + << " (" << (new_start - old_end) << " bytes past old end)" << std::endl; + + journaler.set_read_pos(new_start); + journaler.set_write_pos(new_start); + journaler.set_expire_pos(new_start); + journaler.set_trimmed_pos(new_start); + journaler.set_writeable(); + + cout << "writing journal head" << std::endl; + journaler.write_head(new C_SafeCond(mylock, cond, &done, &r)); + lock.unlock(); + { + std::unique_lock locker{mylock}; + cond.wait(locker, [&done] { return done; }); + } + std::lock_guard l{lock}; + if (r != 0) { + return r; + } + + if (is_mdlog) { + r = _write_reset_event(&journaler); // reset envent is specific for mdlog journal + if (r != 0) { + return r; + } + } + cout << "done" << std::endl; + + return 0; +} + +int Resetter::reset_hard() +{ + auto fs = fsmap->get_filesystem(role.fscid); + + Journaler journaler("resetter", ino, + fs->mds_map.get_metadata_pool(), + CEPH_FS_ONDISK_MAGIC, + objecter, 0, 0, &finisher); + journaler.set_writeable(); + + file_layout_t default_log_layout = MDCache::gen_default_log_layout( + fsmap->get_filesystem(role.fscid)->mds_map); + journaler.create(&default_log_layout, g_conf()->mds_journal_format); + + C_SaferCond cond; + { + std::lock_guard l{lock}; + journaler.write_head(&cond); + } + + int r = cond.wait(); + if (r != 0) { + derr << "Error writing journal header: " << cpp_strerror(r) << dendl; + return r; + } + + if (is_mdlog) // reset event is specific for mdlog journal + { + std::lock_guard l{lock}; + r = _write_reset_event(&journaler); + if (r != 0) { + derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (is_mdlog) { + dout(4) << "Successfully wrote new journal pointer and header for rank " + << role << dendl; + } else { + dout(4) << "Successfully wrote header for rank " << role << dendl; + } + return 0; +} + +int Resetter::_write_reset_event(Journaler *journaler) +{ + ceph_assert(journaler != NULL); + + auto le = std::make_unique(); + + bufferlist bl; + le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + cout << "writing EResetJournal entry" << std::endl; + journaler->append_entry(bl); + + int ret; + { + C_SaferCond cond; + journaler->flush(&cond); + ret = cond.wait(); + if (ret < 0) + return ret; + } + { + // wait until all journal prezero ops are done + C_SaferCond cond; + journaler->wait_for_prezero(&cond); + cond.wait(); + } + + return ret; +} + diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h new file mode 100644 index 000000000..6998e4598 --- /dev/null +++ b/src/tools/cephfs/Resetter.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2010 Greg Farnum + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the 
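[editor's note] Resetter::reset() above never rewrites the old journal extent: it computes the new start by rounding old_end + 1 up to the next layout-period boundary, then points the read, write, expire, and trimmed positions at it. A self-contained check of that arithmetic; round_up_to is restated locally with its usual meaning, and the positions are made up:

    #include <cassert>
    #include <cstdint>

    // Round v up to the next multiple of align (align > 0), as used for
    // the new journal start above.
    static uint64_t round_up_to(uint64_t v, uint64_t align) {
      return (v + align - 1) / align * align;
    }

    int main() {
      const uint64_t old_end = 0x500123;   // hypothetical old write position
      const uint64_t period  = 0x400000;   // journaler layout period
      const uint64_t new_start = round_up_to(old_end + 1, period);
      assert(new_start == 0x800000);       // next period boundary
      assert(new_start > old_end);         // old extent is skipped, not reused
    }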
Free Software + * Foundation. See file COPYING. + */ + +#ifndef JOURNAL_RESETTER_H_ +#define JOURNAL_RESETTER_H_ + + +#include "MDSUtility.h" + +class Journaler; + +/** + * This class lets you reset an mds journal for troubleshooting or whatever. + * + * To use, create a Resetter, call init(), and then call reset() with the name + * of the file to dump to. + */ +class Resetter : public MDSUtility { +private: + mds_role_t role; + inodeno_t ino; + bool is_mdlog; + +protected: + int _write_reset_event(Journaler *journaler); + +public: + Resetter() {} + ~Resetter() {} + + int init(mds_role_t role_, const std::string &type, bool hard); + /** + * For use when no journal header/pointer was present: write one + * out from scratch. + */ + int reset_hard(); + int reset(); +}; + +#endif /* JOURNAL_RESETTER_H_ */ diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc new file mode 100644 index 000000000..e2d53b86e --- /dev/null +++ b/src/tools/cephfs/RoleSelector.cc @@ -0,0 +1,59 @@ + +#include "RoleSelector.h" + +int MDSRoleSelector::parse_rank( + const FSMap &fsmap, + std::string const &str) +{ + if (str == "all" || str == "*") { + std::set in; + const MDSMap &mds_map = fsmap.get_filesystem(fscid)->mds_map; + mds_map.get_mds_set(in); + + for (auto rank : in) { + roles.push_back(mds_role_t(fscid, rank)); + } + + return 0; + } else { + std::string rank_err; + mds_rank_t rank = strict_strtol(str.c_str(), 10, &rank_err); + if (!rank_err.empty()) { + return -EINVAL; + } + if (fsmap.get_filesystem(fscid)->mds_map.is_dne(rank)) { + return -ENOENT; + } + roles.push_back(mds_role_t(fscid, rank)); + return 0; + } +} + +int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str, + bool allow_unqualified_rank) +{ + auto colon_pos = str.find(":"); + if (colon_pos == std::string::npos) { + // An unqualified rank. Only valid if there is only one + // namespace. + if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) { + fscid = fsmap.get_filesystem()->fscid; + return parse_rank(fsmap, str); + } else { + return -EINVAL; + } + } else if (colon_pos == 0 || colon_pos == str.size() - 1) { + return -EINVAL; + } else { + const std::string ns_str = str.substr(0, colon_pos); + const std::string rank_str = str.substr(colon_pos + 1); + std::shared_ptr fs_ptr; + int r = fsmap.parse_filesystem(ns_str, &fs_ptr); + if (r != 0) { + return r; + } + fscid = fs_ptr->fscid; + return parse_rank(fsmap, rank_str); + } +} + diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h new file mode 100644 index 000000000..9090b7200 --- /dev/null +++ b/src/tools/cephfs/RoleSelector.h @@ -0,0 +1,36 @@ + +#ifndef ROLE_SELECTOR_H_ +#define ROLE_SELECTOR_H_ + +#include +#include +#include "mds/mdstypes.h" +#include "mds/FSMap.h" + +/** + * When you want to let the user act on a single rank in a namespace, + * or all of them. 
+ */ +class MDSRoleSelector +{ + public: + const std::vector &get_roles() const {return roles;} + int parse(const FSMap &fsmap, std::string const &str, + bool allow_unqualified_rank=true); + MDSRoleSelector() + : fscid(FS_CLUSTER_ID_NONE) + {} + fs_cluster_id_t get_ns() const + { + return fscid; + } + protected: + int parse_rank( + const FSMap &fsmap, + std::string const &str); + std::vector roles; + fs_cluster_id_t fscid; +}; + +#endif // ROLE_SELECTOR_H_ + diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc new file mode 100644 index 000000000..dcd35a624 --- /dev/null +++ b/src/tools/cephfs/TableTool.cc @@ -0,0 +1,419 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "common/ceph_argparse.h" +#include "common/errno.h" + +#include "mds/SessionMap.h" +#include "mds/InoTable.h" +#include "mds/SnapServer.h" + +#include "TableTool.h" + + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix *_dout << __func__ << ": " + +using namespace std; + +void TableTool::usage() +{ + std::cout << "Usage: \n" + << " cephfs-table-tool " + << " cephfs-table-tool " + << std::endl; + + generic_client_usage(); +} + + +/** + * For a function that takes an MDS role as an argument and + * returns an error code, execute it on the roles specified + * by `role_selector`. + */ +int TableTool::apply_role_fn(std::function fptr, Formatter *f) +{ + ceph_assert(f != NULL); + + int r = 0; + + f->open_object_section("ranks"); + + for (auto role : role_selector.get_roles()) { + std::ostringstream rank_str; + rank_str << role.rank; + f->open_object_section(rank_str.str().c_str()); + + f->open_object_section("data"); + int rank_r = fptr(role, f); + f->close_section(); + r = r ? r : rank_r; + + f->dump_int("result", rank_r); + f->close_section(); + + + } + + f->close_section(); + + return r; +} + + +/** + * This class wraps an MDS table class (SessionMap, SnapServer, InoTable) + * with offline load/store code such that we can do offline dumps and resets + * on those tables. + */ +template +class TableHandler +{ +protected: + // The RADOS object ID for the table + std::string object_name; + + // The role in question (may be NONE) + mds_role_t role; + + // Whether this is an MDSTable subclass (i.e. 
has leading version field to decode) + bool mds_table; + +public: + TableHandler(mds_role_t r, std::string const &name, bool mds_table_) + : role(r), mds_table(mds_table_) + { + // Compose object name of the table we will dump + std::ostringstream oss; + oss << "mds"; + if (!role.is_none()) { + oss << role.rank; + } + oss << "_" << name; + object_name = oss.str(); + } + + int load_and_dump(librados::IoCtx *io, Formatter *f) + { + ceph_assert(io != NULL); + ceph_assert(f != NULL); + + // Attempt read + bufferlist table_bl; + int read_r = io->read(object_name, table_bl, 0, 0); + if (read_r >= 0) { + auto q = table_bl.cbegin(); + try { + if (mds_table) { + version_t version; + decode(version, q); + f->dump_int("version", version); + } + A table_inst; + table_inst.set_rank(role.rank); + table_inst.decode(q); + table_inst.dump(f); + + return 0; + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + } else { + derr << "error reading table object " << object_name + << ": " << cpp_strerror(read_r) << dendl; + return read_r; + } + } + + int reset(librados::IoCtx *io) + { + A table_inst; + // Compose new (blank) table + table_inst.set_rank(role.rank); + table_inst.reset_state(); + // Write the table out + return write(table_inst, io); + } + +protected: + + int write(const A &table_inst, librados::IoCtx *io) + { + bufferlist new_bl; + if (mds_table) { + version_t version = 1; + encode(version, new_bl); + } + table_inst.encode_state(new_bl); + + // Write out new table + int r = io->write_full(object_name, new_bl); + if (r != 0) { + derr << "error writing table object " << object_name + << ": " << cpp_strerror(r) << dendl; + return r; + } + + return r; + } +}; + +template +class TableHandlerOmap +{ +private: + // The RADOS object ID for the table + std::string object_name; + + // The role (rank may be NONE) + mds_role_t role; + + // Whether this is an MDSTable subclass (i.e. 
has leading version field to decode) + bool mds_table; + +public: + TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_) + : role(r), mds_table(mds_table_) + { + // Compose object name of the table we will dump + std::ostringstream oss; + oss << "mds"; + if (!role.is_none()) { + oss << role.rank; + } + oss << "_" << name; + object_name = oss.str(); + } + + int load_and_dump(librados::IoCtx *io, Formatter *f) + { + ceph_assert(io != NULL); + ceph_assert(f != NULL); + + // Read in the header + bufferlist header_bl; + int r = io->omap_get_header(object_name, &header_bl); + if (r != 0) { + derr << "error reading header on '" << object_name << "': " + << cpp_strerror(r) << dendl; + return r; + } + + // Decode the header + A table_inst; + table_inst.set_rank(role.rank); + try { + table_inst.decode_header(header_bl); + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + + // Read and decode OMAP values in chunks + std::string last_key = ""; + while(true) { + std::map values; + int r = io->omap_get_vals(object_name, last_key, + g_conf()->mds_sessionmap_keys_per_op, &values); + + if (r != 0) { + derr << "error reading values: " << cpp_strerror(r) << dendl; + return r; + } + + if (values.empty()) { + break; + } + + try { + table_inst.decode_values(values); + } catch (buffer::error &e) { + derr << "table " << object_name << " is corrupt" << dendl; + return -EIO; + } + last_key = values.rbegin()->first; + } + + table_inst.dump(f); + + return 0; + } + + int reset(librados::IoCtx *io) + { + A table_inst; + table_inst.set_rank(role.rank); + table_inst.reset_state(); + bufferlist header_bl; + table_inst.encode_header(&header_bl); + + // Compose a transaction to clear and write header + librados::ObjectWriteOperation op; + op.omap_clear(); + op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK); + op.omap_set_header(header_bl); + + return io->operate(object_name, &op); + } +}; + +class InoTableHandler : public TableHandler +{ + public: + explicit InoTableHandler(mds_role_t r) + : TableHandler(r, "inotable", true) + {} + + int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f) + { + InoTable inst; + inst.set_rank(role.rank); + inst.reset_state(); + + int r = 0; + if (inst.force_consume_to(max)) { + r = write(inst, io); + } + + f->dump_int("version", inst.get_version()); + inst.dump(f); + + return r; + } +}; + + +int TableTool::main(std::vector &argv) +{ + int r; + + dout(10) << __func__ << dendl; + + // RADOS init + // ========== + r = rados.init_with_context(g_ceph_context); + if (r < 0) { + derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; + return r; + } + + dout(4) << "connecting to RADOS..." << dendl; + r = rados.connect(); + if (r < 0) { + derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; + return r; + } + + // Require at least 3 args [args...] 
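[editor's note] TableHandlerOmap::load_and_dump() above pages through the object's OMAP in chunks of mds_sessionmap_keys_per_op, restarting each read after the last key returned and stopping once a read comes back empty. The cursor loop in reduced form, using the same librados call; read_all_omap and the 1024 chunk size are illustrative only:

    #include <map>
    #include <string>
    #include "include/rados/librados.hpp"

    // Walk every OMAP key of `oid`, 1024 entries at a time, with the same
    // resume-after-last-key cursor that load_and_dump() uses above.
    static int read_all_omap(librados::IoCtx& io, const std::string& oid) {
      std::string last_key;                    // "" starts before the first key
      while (true) {
        std::map<std::string, librados::bufferlist> values;
        int r = io.omap_get_vals(oid, last_key, 1024, &values);
        if (r != 0)
          return r;                            // read error
        if (values.empty())
          return 0;                            // no keys left: done
        // ... decode this chunk into the table ...
        last_key = values.rbegin()->first;     // continue after the largest key
      }
    }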
+ if (argv.size() < 3) { + cerr << "missing required 3 arguments" << std::endl; + return -EINVAL; + } + + const std::string role_str = std::string(argv[0]); + const std::string mode = std::string(argv[1]); + const std::string table = std::string(argv[2]); + + r = role_selector.parse(*fsmap, role_str); + if (r < 0) { + derr << "Bad rank selection: " << role_str << "'" << dendl; + return r; + } + + auto fs = fsmap->get_filesystem(role_selector.get_ns()); + ceph_assert(fs != nullptr); + int64_t const pool_id = fs->mds_map.get_metadata_pool(); + dout(4) << "resolving pool " << pool_id << dendl; + std::string pool_name; + r = rados.pool_reverse_lookup(pool_id, &pool_name); + if (r < 0) { + derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!" + << dendl; + return r; + } + + dout(4) << "creating IoCtx.." << dendl; + r = rados.ioctx_create(pool_name.c_str(), io); + if (r != 0) { + return r; + } + + JSONFormatter jf(true); + if (mode == "reset") { + const std::string table = std::string(argv[2]); + if (table == "session") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandlerOmap(rank, "sessionmap", false).reset(&io); + }, &jf); + } else if (table == "inode") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandler(rank, "inotable", true).reset(&io); + }, &jf); + } else if (table == "snap") { + r = TableHandler(mds_role_t(), "snaptable", true).reset(&io); + jf.open_object_section("reset_snap_status"); + jf.dump_int("result", r); + jf.close_section(); + } else { + cerr << "Invalid table '" << table << "'" << std::endl; + return -EINVAL; + } + } else if (mode == "show") { + const std::string table = std::string(argv[2]); + if (table == "session") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandlerOmap(rank, "sessionmap", false).load_and_dump(&io, f); + }, &jf); + } else if (table == "inode") { + r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int { + return TableHandler(rank, "inotable", true).load_and_dump(&io, f);; + }, &jf); + } else if (table == "snap") { + jf.open_object_section("show_snap_table"); + { + r = TableHandler( + mds_role_t(), "snaptable", true).load_and_dump(&io, &jf); + jf.dump_int("result", r); + } + jf.close_section(); + } else { + cerr << "Invalid table '" << table << "'" << std::endl; + return -EINVAL; + } + } else if (mode == "take_inos") { + const std::string ino_str = std::string(argv[2]); + std::string ino_err; + inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err); + if (!ino_err.empty()) { + derr << "Bad ino '" << ino_str << "'" << dendl; + return -EINVAL; + } + r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int { + return InoTableHandler(rank).take_inos(&io, ino, f); + }, &jf); + } else { + cerr << "Invalid mode '" << mode << "'" << std::endl; + return -EINVAL; + } + + // Subcommand should have written to formatter, flush it + jf.flush(std::cout); + std::cout << std::endl; + return r; +} + diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h new file mode 100644 index 000000000..bf9b95c12 --- /dev/null +++ b/src/tools/cephfs/TableTool.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as 
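[editor's note] Each per-rank subcommand above funnels through apply_role_fn(), which runs a callback for every selected role, dumps each rank's result in its own formatter section, and keeps the first non-zero return as the overall status (r = r ? r : rank_r). That aggregation shape reduced to plain types; for_each_rank is a hypothetical name:

    #include <functional>
    #include <iostream>
    #include <vector>

    // Run fn for every rank; report all results, return the first failure.
    static int for_each_rank(const std::vector<int>& ranks,
                             const std::function<int(int)>& fn) {
      int r = 0;
      for (int rank : ranks) {
        int rank_r = fn(rank);
        std::cout << "rank " << rank << ": result " << rank_r << '\n';
        r = r ? r : rank_r;   // first non-zero wins, later ranks still run
      }
      return r;
    }

    int main() {
      int rc = for_each_rank({0, 1, 2}, [](int rank) { return rank == 1 ? -5 : 0; });
      std::cout << "overall " << rc << '\n';   // -5: rank 1's failure is kept
    }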
published by the Free Software + * Foundation. See file COPYING. + */ + + +#include "MDSUtility.h" +#include "RoleSelector.h" + +#include "include/rados/librados.hpp" + +/** + * Command line tool for debugging the backing store of + * MDSTable instances. + */ +class TableTool : public MDSUtility +{ + private: + MDSRoleSelector role_selector; + + // I/O handles + librados::Rados rados; + librados::IoCtx io; + + int apply_role_fn(std::function fptr, Formatter *f); + + public: + static void usage(); + int main(std::vector &argv); + +}; + diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc new file mode 100644 index 000000000..3e1b75cb6 --- /dev/null +++ b/src/tools/cephfs/cephfs-data-scan.cc @@ -0,0 +1,46 @@ + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "DataScan.h" + +using namespace std; + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + DataScan::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + DataScan data_scan; + + // Connect to mon cluster, download MDS map etc + int rc = data_scan.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = data_scan.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + + return rc; +} + diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc new file mode 100644 index 000000000..f95e7e265 --- /dev/null +++ b/src/tools/cephfs/cephfs-journal-tool.cc @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 John Spray + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "JournalTool.h" + + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + std::cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + JournalTool::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + JournalTool jt; + + // Connect to mon cluster, download MDS map etc + int rc = jt.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = jt.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + return rc; +} + diff --git a/src/tools/cephfs/cephfs-meta-injection.cc b/src/tools/cephfs/cephfs-meta-injection.cc new file mode 100644 index 000000000..48a913469 --- /dev/null +++ b/src/tools/cephfs/cephfs-meta-injection.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "MetaTool.h" +#include +#include +#include + +#include +namespace po = boost::program_options; +using std::string; +using namespace std; +static string version = "cephfs-meta-injection v1.1"; + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + env_to_vec(args); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + string rank_str, minfo, ino, out,in; + po::options_description general("general options"); + general.add_options() + ("help,h", "produce help message") + ("debug", "show debug info") + ("rank,r", po::value(&rank_str), "the rank of cephfs, default(0) (e.g. -r cephfs_a:0)") + ("minfo", po::value(&minfo), "specify metapool, datapools and rank (e.g. cephfs_metadata_a:cephfs_data_a:0)") + ("ino,i", po::value(&ino), "specify inode. e.g. 1099511627776 or 0x10000000000, you can find it with cmd, 'ls -i'") + ("out,o", po::value(&out), "output file") + ("in", po::value(&in), "input file") + ("yes-i-really-really-mean-it", "need by amend info") + ; + + string mode; + po::options_description modeoptions("mode options"); + modeoptions.add_options() + ("mode", po::value(&mode), + "\tlistc : list all obj of dir\n" \ + "\tshowm : show the info of ino\n" \ + "\tshowfn : show the fnode of dir\n" \ + "\tamend : amend part of the meta data\n" \ + "\tamendfn : amend fnode from file\n" + ); + + po::positional_options_description p; + p.add("mode", 1); + + po::options_description all("all options"); + all.add(modeoptions).add(general); + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).options(all).positional(p).allow_unregistered().run(), vm); + } catch(exception &e) { + cerr << "error : " << e.what() << std::endl; + return -1; + } catch(...) 
{ + cout << "param error" << std::endl; + return 0; + } + + boost::program_options::notify(vm); + if (vm.count("help")) { + std::cout << version << std::endl; + std::cout << "usage : \n" + << " cephfs-meta-injection -r -i " + << std::endl; + std::cout << "example : \n" + << " amend info of inode(1099531628828)\n" + << " cephfs-meta-injection showm -r cephfs_a:0 -i 1099531628828 -o out\n" + << " alter file\n" + << " cephfs-meta-injection amend -r cephfs_a:0 -i 1099531628828 --in out --yes-i-really-mean-it" + << std::endl; + std::cout << all << std::endl; + return 0; + } + + MetaTool mt(vm.count("debug")); + int rc = mt.init(); + if (rc != 0) { + std::cerr << "error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + rc = mt.main(mode, rank_str, minfo, ino, out, in, vm.count("yes-i-really-really-mean-it")); + if (rc != 0) { + std::cerr << "error (" << cpp_strerror(rc) << ")" << std::endl; + return -1; + } + return rc; +} diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc new file mode 100644 index 000000000..4b57080d6 --- /dev/null +++ b/src/tools/cephfs/cephfs-table-tool.cc @@ -0,0 +1,46 @@ + +#include "include/types.h" +#include "common/config.h" +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include "global/global_init.h" + +#include "TableTool.h" + +using namespace std; + +int main(int argc, const char **argv) +{ + auto args = argv_to_vec(argc, argv); + if (args.empty()) { + cerr << argv[0] << ": -h or --help for usage" << std::endl; + exit(1); + } + if (ceph_argparse_need_usage(args)) { + TableTool::usage(); + exit(0); + } + + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + TableTool tt; + + // Connect to mon cluster, download MDS map etc + int rc = tt.init(); + if (rc != 0) { + std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl; + return rc; + } + + // Finally, execute the user's commands + rc = tt.main(args); + if (rc != 0) { + std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl; + } + + return rc; +} + + diff --git a/src/tools/cephfs/first-damage.py b/src/tools/cephfs/first-damage.py new file mode 100644 index 000000000..0479dc8cb --- /dev/null +++ b/src/tools/cephfs/first-damage.py @@ -0,0 +1,156 @@ +# Ceph - scalable distributed file system +# +# Copyright (C) 2022 Red Hat, Inc. +# +# This is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1, as published by the Free Software +# Foundation. See file COPYING. + +# Suggested recovery sequence (for single MDS cluster): +# +# 1) Unmount all clients. +# +# 2) Flush the journal (if possible): +# +# ceph tell mds.:0 flush journal +# +# 3) Fail the file system: +# +# ceph fs fail +# +# 4a) Recover dentries from the journal. This will be a no-op if the MDS flushed the journal successfully: +# +# cephfs-journal-tool --rank=:0 event recover_dentries summary +# +# 4b) If all good so far, reset the journal: +# +# cephfs-journal-tool --rank=:0 journal reset +# +# 5) Run this tool to see list of damaged dentries: +# +# python3 first-damage.py --memo run.1 +# +# 6) Optionally, remove them: +# +# python3 first-damage.py --memo run.2 --remove +# +# Note: use --memo to specify a different file to save objects that have +# already been traversed, for independent runs. +# +# This has the effect of removing that dentry from the snapshot or HEAD +# (current hierarchy). 
Note: the inode's linkage will be lost. The inode may +# be recoverable in lost+found during a future data scan recovery. + +import argparse +import logging +import os +import rados +import re +import sys +import struct + +log = logging.getLogger("first-damage-traverse") + +MEMO = None +REMOVE = False +POOL = None +NEXT_SNAP = None +CONF = os.environ.get('CEPH_CONF') +REPAIR_NOSNAP = None + +CEPH_NOSNAP = 0xfffffffe # int32 -2 + +DIR_PATTERN = re.compile(r'[0-9a-fA-F]{8,}\.[0-9a-fA-F]+') + +CACHE = set() + +def traverse(MEMO, ioctx): + for o in ioctx.list_objects(): + if not DIR_PATTERN.fullmatch(o.key): + log.debug("skipping %s", o.key) + continue + elif o.key in CACHE: + log.debug("skipping previously examined object %s", o.key) + continue + log.info("examining: %s", o.key) + + with rados.ReadOpCtx() as rctx: + nkey = None + while True: + it = ioctx.get_omap_vals(rctx, nkey, None, 100, omap_key_type=bytes)[0] + ioctx.operate_read_op(rctx, o.key) + nkey = None + for (dnk, val) in it: + log.debug(f'\t{dnk}: val size {len(val)}') + (first,) = struct.unpack(' NEXT_SNAP: + log.warning(f"found {o.key}:{dnk} first (0x{first:x}) > NEXT_SNAP (0x{NEXT_SNAP:x})") + if REPAIR_NOSNAP and dnk.endswith(b"_head") and first == CEPH_NOSNAP: + log.warning(f"repairing first==CEPH_NOSNAP damage, setting to NEXT_SNAP (0x{NEXT_SNAP:x})") + first = NEXT_SNAP + nval = bytearray(val) + struct.pack_into("= LooseVersion("1.0.1"): + from cmd2.exceptions import Cmd2ArgparseError +else: + # HACK: so that we don't have check for version everywhere + # Cmd2ArgparseError is used. + class Cmd2ArgparseError: + pass + +if sys.version_info.major < 3: + raise RuntimeError("cephfs-shell is only compatible with python3") + +try: + from cmd2 import with_argparser +except ImportError: + def with_argparser(argparser): + import functools + + def argparser_decorator(func): + @functools.wraps(func) + def wrapper(thiz, cmdline): + if isinstance(cmdline, list): + arglist = cmdline + else: + # do not split if it's already a list + arglist = shlex.split(cmdline, posix=False) + # in case user quotes the command args + arglist = [arg.strip('\'""') for arg in arglist] + try: + args = argparser.parse_args(arglist) + except SystemExit: + shell.exit_code = 1 + # argparse exits at seeing bad arguments + return + else: + return func(thiz, args) + argparser.prog = func.__name__[3:] + if argparser.description is None and func.__doc__: + argparser.description = func.__doc__ + + return wrapper + + return argparser_decorator + + +cephfs = None # holds CephFS Python bindings +shell = None # holds instance of class CephFSShell +exit_codes = {'Misc': 1, + 'KeyboardInterrupt': 2, + errno.EPERM: 3, + errno.EACCES: 4, + errno.ENOENT: 5, + errno.EIO: 6, + errno.ENOSPC: 7, + errno.EEXIST: 8, + errno.ENODATA: 9, + errno.EINVAL: 10, + errno.EOPNOTSUPP: 11, + errno.ERANGE: 12, + errno.EWOULDBLOCK: 13, + errno.ENOTEMPTY: 14, + errno.ENOTDIR: 15, + errno.EDQUOT: 16, + errno.EPIPE: 17, + errno.ESHUTDOWN: 18, + errno.ECONNABORTED: 19, + errno.ECONNREFUSED: 20, + errno.ECONNRESET: 21, + errno.EINTR: 22, + errno.EISDIR: 23} + + +######################################################################### +# +# Following are methods are generically useful through class CephFSShell +# +####################################################################### + + +def poutput(s, end='\n'): + shell.poutput(s, end=end) + + +def perror(msg, **kwargs): + shell.perror(msg, **kwargs) + + +def set_exit_code_msg(errcode='Misc', msg=''): + """ + Set exit code and print error message 
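[editor's note] In traverse() above the struct format strings were eaten by markup stripping; a dentry value starts with its 'first' snapid, and the surrounding unpack/pack_into calls together with the CEPH_NOSNAP = 0xfffffffe definition imply a little-endian unsigned 32-bit layout. Assuming that layout, the repair round trip looks like this in isolation (values are made up):

    import struct

    CEPH_NOSNAP = 0xfffffffe   # int32 -2, as defined above
    NEXT_SNAP = 0x10           # hypothetical: one past the last known snapid

    val = struct.pack('<I', CEPH_NOSNAP) + b'...rest of dentry...'  # fake omap value
    (first,) = struct.unpack('<I', val[:4])      # leading little-endian u32 snapid
    if first == CEPH_NOSNAP:
        nval = bytearray(val)
        struct.pack_into('<I', nval, 0, NEXT_SNAP)   # overwrite 'first' in place
        assert struct.unpack('<I', nval[:4]) == (NEXT_SNAP,)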
+ """ + if isinstance(msg, libcephfs.Error): + shell.exit_code = exit_codes[msg.get_error_code()] + else: + shell.exit_code = exit_codes[errcode] + if msg: + perror(msg) + + +def mode_notation(mode): + """ + """ + permission_bits = {'0': '---', + '1': '--x', + '2': '-w-', + '3': '-wx', + '4': 'r--', + '5': 'r-x', + '6': 'rw-', + '7': 'rwx'} + mode = str(oct(mode)) + notation = '-' + if mode[2] == '4': + notation = 'd' + elif mode[2:4] == '12': + notation = 'l' + for i in mode[-3:]: + notation += permission_bits[i] + return notation + + +def get_chunks(file_size): + chunk_start = 0 + chunk_size = 0x20000 # 131072 bytes, default max ssl buffer size + while chunk_start + chunk_size < file_size: + yield chunk_start, chunk_size + chunk_start += chunk_size + final_chunk_size = file_size - chunk_start + yield chunk_start, final_chunk_size + + +def to_bytes(param): + # don't convert as follows as it can lead unusable results like converting + # [1, 2, 3, 4] to '[1, 2, 3, 4]' - + # str(param).encode('utf-8') + if isinstance(param, bytes): + return param + elif isinstance(param, str): + return bytes(param, encoding='utf-8') + elif isinstance(param, list): + return [i.encode('utf-8') if isinstance(i, str) else to_bytes(i) for + i in param] + elif isinstance(param, int) or isinstance(param, float): + return str(param).encode('utf-8') + elif param is None: + return None + + +def ls(path, opts=''): + # opts tries to be like /bin/ls opts + almost_all = 'A' in opts + try: + with cephfs.opendir(path) as d: + while True: + dent = cephfs.readdir(d) + if dent is None: + return + elif almost_all and dent.d_name in (b'.', b'..'): + continue + yield dent + except libcephfs.ObjectNotFound as e: + set_exit_code_msg(msg=e) + + +def glob(path, pattern): + paths = [] + parent_dir = os.path.dirname(path) + if parent_dir == b'': + parent_dir = b'/' + if path == b'/' or is_dir_exists(os.path.basename(path), parent_dir): + for i in ls(path, opts='A'): + if fnmatch.fnmatch(i.d_name, pattern): + paths.append(os.path.join(path, i.d_name)) + return paths + + +def locate_file(name, case_sensitive=True): + dir_list = sorted(set(dirwalk(cephfs.getcwd()))) + if not case_sensitive: + return [dname for dname in dir_list if name.lower() in dname.lower()] + else: + return [dname for dname in dir_list if name in dname] + + +def get_all_possible_paths(pattern): + complete_pattern = pattern[:] + paths = [] + is_rel_path = not os.path.isabs(pattern) + if is_rel_path: + dir_ = cephfs.getcwd() + else: + dir_ = b'/' + pattern = pattern[1:] + patterns = pattern.split(b'/') + paths.extend(glob(dir_, patterns[0])) + patterns.pop(0) + for pattern in patterns: + for path in paths: + paths.extend(glob(path, pattern)) + if is_rel_path: + complete_pattern = os.path.join(cephfs.getcwd(), complete_pattern) + return [path for path in paths if fnmatch.fnmatch(path, complete_pattern)] + + +suffixes = ['B', 'K', 'M', 'G', 'T', 'P'] + + +def humansize(nbytes): + i = 0 + while nbytes >= 1024 and i < len(suffixes) - 1: + nbytes /= 1024. 
+ i += 1 + nbytes = math.ceil(nbytes) + f = ('%d' % nbytes).rstrip('.') + return '%s%s' % (f, suffixes[i]) + + +def style_listing(path, is_dir, is_symlink, ls_long=False): + if not (is_dir or is_symlink): + return path + pretty = colorama.Style.BRIGHT + if is_symlink: + pretty += colorama.Fore.CYAN + path + if ls_long: + # Add target path + pretty += ' -> ' + cephfs.readlink(path, size=255).decode('utf-8') + elif is_dir: + pretty += colorama.Fore.BLUE + path + '/' + pretty += colorama.Style.RESET_ALL + return pretty + + +def print_long(path, is_dir, is_symlink, human_readable): + info = cephfs.stat(path, follow_symlink=(not is_symlink)) + pretty = style_listing(os.path.basename(path.decode('utf-8')), is_dir, is_symlink, True) + if human_readable: + sizefmt = '\t {:10s}'.format(humansize(info.st_size)) + else: + sizefmt = '{:12d}'.format(info.st_size) + poutput(f'{mode_notation(info.st_mode)} {sizefmt} {info.st_uid} {info.st_gid} {info.st_mtime}' + f' {pretty}') + + +def word_len(word): + """ + Returns the word length, minus any color codes. + """ + if word[0] == '\x1b': + return len(word) - 9 + return len(word) + + +def is_dir_exists(path, dir_=b''): + path_to_stat = os.path.join(dir_, path) + try: + return ((cephfs.stat(path_to_stat).st_mode & 0o0040000) != 0) + except libcephfs.Error: + return False + + +def is_file_exists(path, dir_=b''): + try: + # if its not a directory, then its a file + return ((cephfs.stat(os.path.join(dir_, path)).st_mode & 0o0040000) == 0) + except libcephfs.Error: + return False + + +def print_list(words, termwidth=79): + if not words: + return + words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in words] + width = max([word_len(word) for word in words]) + 2 + nwords = len(words) + ncols = max(1, (termwidth + 1) // (width + 1)) + nrows = (nwords + ncols - 1) // ncols + for row in range(nrows): + for i in range(row, nwords, nrows): + word = words[i] + print_width = width + if word[0] == '\x1b': + print_width = print_width + 10 + + poutput('%-*s' % (print_width, words[i]), + end='\n' if i + nrows >= nwords else '') + + +def copy_from_local(local_path, remote_path): + stdin = -1 + file_ = None + fd = None + convert_to_bytes = False + if local_path == b'-': + file_ = sys.stdin + convert_to_bytes = True + else: + try: + file_ = open(local_path, 'rb') + except PermissionError as e: + set_exit_code_msg(e.errno, 'error: no permission to read local file {}'.format( + local_path.decode('utf-8'))) + return + stdin = 1 + try: + fd = cephfs.open(remote_path, 'w', 0o666) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + return + progress = 0 + while True: + data = file_.read(65536) + if not data or len(data) == 0: + break + if convert_to_bytes: + data = to_bytes(data) + wrote = cephfs.write(fd, data, progress) + if wrote < 0: + break + progress += wrote + cephfs.close(fd) + if stdin > 0: + file_.close() + poutput('') + + +def copy_to_local(remote_path, local_path): + fd = None + if local_path != b'-': + local_dir = os.path.dirname(local_path) + dir_list = remote_path.rsplit(b'/', 1) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + if len(dir_list) > 2 and dir_list[1] == b'': + return + fd = open(local_path, 'wb+') + file_ = cephfs.open(remote_path, 'r') + file_size = cephfs.stat(remote_path).st_size + if file_size <= 0: + return + progress = 0 + for chunk_start, chunk_size in get_chunks(file_size): + file_chunk = cephfs.read(file_, chunk_start, chunk_size) + progress += len(file_chunk) + if fd: + fd.write(file_chunk) + else: + 
poutput(file_chunk.decode('utf-8')) + cephfs.close(file_) + if fd: + fd.close() + + +def dirwalk(path): + """ + walk a directory tree, using a generator + """ + path = os.path.normpath(path) + for item in ls(path, opts='A'): + fullpath = os.path.join(path, item.d_name) + src_path = fullpath.rsplit(b'/', 1)[0] + + yield os.path.normpath(fullpath) + if is_dir_exists(item.d_name, src_path): + for x in dirwalk(fullpath): + yield x + + +################################################################## +# +# Following methods are implementation for CephFS Shell commands +# +################################################################# + +class CephFSShell(Cmd): + + def __init__(self): + super().__init__() + self.working_dir = cephfs.getcwd().decode('utf-8') + self.set_prompt() + self.interactive = False + self.umask = '2' + + def default(self, line): + self.exit_code = 127 + perror('Unrecognized command') + + def set_prompt(self): + self.prompt = ('\033[01;33mCephFS:~' + colorama.Fore.LIGHTCYAN_EX + + self.working_dir + colorama.Style.RESET_ALL + + '\033[01;33m>>>\033[00m ') + + def create_argparser(self, command): + try: + argparse_args = getattr(self, 'argparse_' + command) + except AttributeError: + set_exit_code_msg() + return None + doc_lines = getattr( + self, 'do_' + command).__doc__.expandtabs().splitlines() + if '' in doc_lines: + blank_idx = doc_lines.index('') + usage = doc_lines[:blank_idx] + description = doc_lines[blank_idx + 1:] + else: + usage = doc_lines + description = [] + parser = argparse.ArgumentParser( + prog=command, + usage='\n'.join(usage), + description='\n'.join(description), + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + for args, kwargs in argparse_args: + parser.add_argument(*args, **kwargs) + return parser + + def complete_filenames(self, text, line, begidx, endidx): + if not text: + completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir()) + for x in ls(b".", opts='A')] + else: + if text.count('/') > 0: + completions = [text.rsplit('/', 1)[0] + '/' + + x.d_name.decode('utf-8') + '/' + * int(x.is_dir()) for x in ls('/' + + text.rsplit('/', 1)[0], opts='A') + if x.d_name.decode('utf-8').startswith( + text.rsplit('/', 1)[1])] + else: + completions = [x.d_name.decode('utf-8') + '/' + * int(x.is_dir()) for x in ls(b".", opts='A') + if x.d_name.decode('utf-8').startswith(text)] + if len(completions) == 1 and completions[0][-1] == '/': + dir_, file_ = completions[0].rsplit('/', 1) + completions.extend([dir_ + '/' + x.d_name.decode('utf-8') + + '/' * int(x.is_dir()) for x in + ls('/' + dir_, opts='A') + if x.d_name.decode('utf-8').startswith(file_)]) + return self.delimiter_complete(text, line, begidx, endidx, completions, '/') + return completions + + def onecmd(self, line, **kwargs): + """ + Global error catcher + """ + try: + res = Cmd.onecmd(self, line, **kwargs) + if self.interactive: + self.set_prompt() + return res + except ConnectionError as e: + set_exit_code_msg(e.errno, f'***\n{e}') + except KeyboardInterrupt: + set_exit_code_msg('KeyboardInterrupt', 'Command aborted') + except (libcephfs.Error, Exception) as e: + if shell.debug: + traceback.print_exc(file=sys.stdout) + if isinstance(e, Cmd2ArgparseError): + # NOTE: In case of Cmd2ArgparseError the error message is + # already printed beforehand (plus Cmd2ArgparseError + # instances have empty error message), so let's just set the + # exit code. 
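[editor's note] copy_to_local() above streams file data through get_chunks(), which tiles [0, file_size) into 0x20000-byte (offset, length) pairs plus one short tail, so no whole-file buffer is needed. Its arithmetic, restated with a worked example (the 300 KiB size is made up):

    def get_chunks(file_size):
        # (offset, length) pairs covering [0, file_size), as defined above
        chunk_start = 0
        chunk_size = 0x20000            # 131072-byte slices
        while chunk_start + chunk_size < file_size:
            yield chunk_start, chunk_size
            chunk_start += chunk_size
        yield chunk_start, file_size - chunk_start   # final, possibly short, tail

    # 300 KiB -> two full slices and a 44 KiB tail
    assert list(get_chunks(307200)) == [(0, 131072), (131072, 131072), (262144, 45056)]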
+ set_exit_code_msg(msg=None) + else: + set_exit_code_msg(msg=f'{type(e).__name__}: {e}') + # In cmd2 versions < 1.1.0 we'll get SystemExit(2) instead of + # Cmd2ArgparseError + except SystemExit: + raise + + class path_to_bytes(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + values = to_bytes(values) + setattr(namespace, self.dest, values) + + # TODO: move the necessary contents from here to `class path_to_bytes`. + class get_list_of_bytes_path(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + values = to_bytes(values) + + if values == b'.': + values = cephfs.getcwd() + else: + for i in values: + if i == b'.': + values[values.index(i)] = cephfs.getcwd() + + setattr(namespace, self.dest, values) + + def complete_mkdir(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + class ModeAction(argparse.Action): + def __init__(self, option_strings, dest, nargs=None, **kwargs): + if nargs is not None and nargs != '?': + raise ValueError("more than one modes not allowed") + super().__init__(option_strings, dest, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + o_mode = 0 + res = None + try: + o_mode = int(values, base=8) + except ValueError: + res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', values) + if res is None: + parser.error(f"invalid mode: {values}\n" + "mode must be a numeric octal literal\n" + "or ((u?g?o?)|(a?))(=)(r?w?x?)") + else: + # we are supporting only assignment of mode and not + or - + # as is generally available with the chmod command + # eg. + # >>> res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=') + # >>> res.groups() + # ('go', 'go', None, '=', '') + val = res.groups() + + if val[3] != '=': + parser.error("need assignment operator between user " + "and mode specifiers") + if val[4] == '': + parser.error(f"invalid mode: {values}\n" + "mode must be combination of: r | w | x") + users = '' + if val[2] is None: + users = val[1] + else: + users = val[2] + + t_mode = 0 + if users == 'a': + users = 'ugo' + + if 'r' in val[4]: + t_mode |= 4 + if 'w' in val[4]: + t_mode |= 2 + if 'x' in val[4]: + t_mode |= 1 + + if 'u' in users: + o_mode |= (t_mode << 6) + if 'g' in users: + o_mode |= (t_mode << 3) + if 'o' in users: + o_mode |= t_mode + + if o_mode < 0: + parser.error(f"invalid mode: {values}\n" + "mode cannot be negative") + if o_mode > 0o7777: + parser.error(f"invalid mode: {values}\n" + "mode cannot be greater than octal 07777") + + setattr(namespace, self.dest, str(oct(o_mode))) + + mkdir_parser = argparse.ArgumentParser( + description='Create the directory(ies), if they do not already exist.') + mkdir_parser.add_argument('dirs', type=str, + action=path_to_bytes, + metavar='DIR_NAME', + help='Name of new_directory.', + nargs='+') + mkdir_parser.add_argument('-m', '--mode', type=str, + action=ModeAction, + help='Sets the access mode for the new directory.') + mkdir_parser.add_argument('-p', '--parent', action='store_true', + help='Create parent directories as necessary. ' + 'When this option is specified, no error is' + 'reported if a directory already exists.') + + @with_argparser(mkdir_parser) + def do_mkdir(self, args): + """ + Create directory. 
+ """ + for path in args.dirs: + if args.mode: + permission = int(args.mode, 8) + else: + permission = 0o777 + if args.parent: + cephfs.mkdirs(path, permission) + else: + try: + cephfs.mkdir(path, permission) + except libcephfs.Error as e: + set_exit_code_msg(e) + + def complete_put(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + put_parser = argparse.ArgumentParser( + description='Copy a file/directory to Ceph File System from Local File System.') + put_parser.add_argument('local_path', type=str, action=path_to_bytes, + help='Path of the file in the local system') + put_parser.add_argument('remote_path', type=str, action=path_to_bytes, + help='Path of the file in the remote system') + put_parser.add_argument('-f', '--force', action='store_true', + help='Overwrites the destination if it already exists.') + + @with_argparser(put_parser) + def do_put(self, args): + """ + Copy a local file/directory to CephFS. + """ + if args.local_path != b'-' and not os.path.isfile(args.local_path) \ + and not os.path.isdir(args.local_path): + set_exit_code_msg(errno.ENOENT, + msg=f"error: " + f"{args.local_path.decode('utf-8')}: " + f"No such file or directory") + return + + if (is_file_exists(args.remote_path) or is_dir_exists( + args.remote_path)) and not args.force: + set_exit_code_msg(msg=f"error: file/directory " + f"{args.remote_path.decode('utf-8')} " + f"exists, use --force to overwrite") + return + + root_src_dir = args.local_path + root_dst_dir = args.remote_path + if args.local_path == b'.' or args.local_path == b'./': + root_src_dir = os.getcwdb() + elif len(args.local_path.rsplit(b'/', 1)) < 2: + root_src_dir = os.path.join(os.getcwdb(), args.local_path) + else: + p = args.local_path.split(b'/') + if p[0] == b'.': + root_src_dir = os.getcwdb() + p.pop(0) + while len(p) > 0: + root_src_dir += b'/' + p.pop(0) + + if root_dst_dir == b'.': + if args.local_path != b'-': + root_dst_dir = root_src_dir.rsplit(b'/', 1)[1] + if root_dst_dir == b'': + root_dst_dir = root_src_dir.rsplit(b'/', 1)[0] + a = root_dst_dir.rsplit(b'/', 1) + if len(a) > 1: + root_dst_dir = a[1] + else: + root_dst_dir = a[0] + else: + set_exit_code_msg(errno.EINVAL, 'error: no filename specified ' + 'for destination') + return + + if root_dst_dir[-1] != b'/': + root_dst_dir += b'/' + + if args.local_path == b'-' or os.path.isfile(root_src_dir): + if args.local_path == b'-': + root_src_dir = b'-' + copy_from_local(root_src_dir, root_dst_dir) + else: + for src_dir, dirs, files in os.walk(root_src_dir): + if isinstance(src_dir, str): + src_dir = to_bytes(src_dir) + dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1) + dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd() + + dst_dir) + if args.force and dst_dir != b'/' and not is_dir_exists( + dst_dir[:-1]) and not locate_file(dst_dir): + try: + cephfs.mkdirs(dst_dir, 0o777) + except libcephfs.Error: + pass + if (not args.force) and dst_dir != b'/' and not is_dir_exists( + dst_dir) and not os.path.isfile(root_src_dir): + try: + cephfs.mkdirs(dst_dir, 0o777) + except libcephfs.Error: + # TODO: perhaps, set retval to 1? + pass + + for dir_ in dirs: + dir_name = os.path.join(dst_dir, dir_) + if not is_dir_exists(dir_name): + try: + cephfs.mkdirs(dir_name, 0o777) + except libcephfs.Error: + # TODO: perhaps, set retval to 1? 
+ pass + + for file_ in files: + src_file = os.path.join(src_dir, file_) + dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_) + if (not args.force) and is_file_exists(dst_file): + return + copy_from_local(src_file, os.path.join(cephfs.getcwd(), + dst_file)) + + def complete_get(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + get_parser = argparse.ArgumentParser( + description='Copy a file from Ceph File System to Local Directory.') + get_parser.add_argument('remote_path', type=str, action=path_to_bytes, + help='Path of the file in the remote system') + get_parser.add_argument('local_path', type=str, action=path_to_bytes, + help='Path of the file in the local system') + get_parser.add_argument('-f', '--force', action='store_true', + help='Overwrites the destination if it already exists.') + + @with_argparser(get_parser) + def do_get(self, args): + """ + Copy a file/directory from CephFS to given path. + """ + if not is_file_exists(args.remote_path) and not \ + is_dir_exists(args.remote_path): + set_exit_code_msg(errno.ENOENT, "error: no file/directory" + " found at specified remote " + "path") + return + if (os.path.isfile(args.local_path) or os.path.isdir( + args.local_path)) and not args.force: + set_exit_code_msg(msg=f"error: file/directory " + f"{args.local_path.decode('utf-8')}" + f" already exists, use --force to " + f"overwrite") + return + root_src_dir = args.remote_path + root_dst_dir = args.local_path + fname = root_src_dir.rsplit(b'/', 1) + if args.local_path == b'.': + root_dst_dir = os.getcwdb() + if args.remote_path == b'.': + root_src_dir = cephfs.getcwd() + if args.local_path == b'-': + if args.remote_path == b'.' or args.remote_path == b'./': + set_exit_code_msg(errno.EINVAL, 'error: no remote file name specified') + return + copy_to_local(root_src_dir, b'-') + elif is_file_exists(args.remote_path): + copy_to_local(root_src_dir, root_dst_dir) + elif b'/' in root_src_dir and is_file_exists(fname[1], fname[0]): + copy_to_local(root_src_dir, root_dst_dir) + else: + files = list(reversed(sorted(dirwalk(root_src_dir)))) + for file_ in files: + dst_dirpath, dst_file = file_.rsplit(b'/', 1) + if dst_dirpath in files: + files.remove(dst_dirpath) + dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file) + dst_path = os.path.normpath(dst_path) + if is_dir_exists(file_): + try: + os.makedirs(dst_path) + except OSError: + pass + else: + copy_to_local(file_, dst_path) + + return 0 + + def complete_ln(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + ln_parser = argparse.ArgumentParser( + description='Add a hard link to an existing file or create a symbolic ' + 'link to an existing file or directory.') + ln_parser.add_argument('target', type=str, action=path_to_bytes, + help='File/Directory of which link is ' + 'to be created') + ln_parser.add_argument('link_name', type=str, action=path_to_bytes, + help='Link to target with the name link_name', + nargs='?') + ln_parser.add_argument('-s', '--symbolic', action='store_true', + help='Create symbolic link') + ln_parser.add_argument('-v', '--verbose', action='store_true', + help='Print name of each linked file') + ln_parser.add_argument('-f', '--force', action='store_true', + help='Force create link/symbolic link') + + @with_argparser(ln_parser) + def do_ln(self, args): + if not is_file_exists(args.target) \ + and not is_dir_exists(args.target): + set_exit_code_msg(errno.ENOENT, + msg=f"ln: failed to access " + f"'{args.target.decode('utf-8')}" + f"': No such file or directory") + return + + is_a_dir = False + if is_dir_exists(args.target): + is_a_dir = True + + target_last_char_slash = False + if args.target.decode('utf-8')[len(args.target) - 1] == '/': + target_last_char_slash = True + + link_name = '' + + if args.link_name is None: + if target_last_char_slash is True: + if is_dir_exists(args.target): + pass + else: + set_exit_code_msg(errno.ENOTDIR, + f"ln: failed to access " + f"'{args.target.decode('utf-8')}': " + f"Not a directory") + return + link_name = os.path.join(cephfs.getcwd(), + os.path.basename( + os.path.normpath(args.target))) + if (is_file_exists(link_name) or is_dir_exists( + link_name)) and not args.force: + set_exit_code_msg(errno.ENOENT, + msg=f"ln: failed to create link " + f"{link_name.decode('utf-8')}: " + f"exists") + return + else: + if is_dir_exists(args.link_name): + dest = args.link_name.decode('utf-8').rstrip('/') + dest_first_half = dest.encode('utf-8') + b'/' + if is_file_exists(args.target): + if target_last_char_slash is True: + set_exit_code_msg(errno.ENOTDIR, + "ln: failed to access " + f"'{args.target.decode('utf-8')}': " + "Not a directory") + return + dest_file = os.path.basename(os.path.normpath(args.target)) + link_name = dest_first_half + dest_file + + elif is_dir_exists(args.target): + dest_dir = os.path.basename(os.path.normpath(args.target)) + link_name = dest_first_half + dest_dir + + else: + # if the destination is not a file or a dir then: + # accept it as file so the end part of path cannot have + # a `/` succeeding it. + test_path = args.link_name.decode('utf-8') + if test_path[len(test_path) - 1] == '/': + set_exit_code_msg(errno.ENOENT, f"'{test_path}': " + f"No such file or " + f"directory") + return + else: + link_name = test_path.encode('utf-8') + + if args.force: + try: + cephfs.lstat(os.path.join(b'', link_name)) + if not is_a_dir or (is_a_dir and args.symbolic): + cephfs.unlink(link_name) + except libcephfs.ObjectNotFound: + pass + + try: + if args.symbolic: + cephfs.symlink(args.target, link_name) + else: + if is_a_dir: + set_exit_code_msg(errno.EPERM, + f"ln: {args.target.decode('utf-8')}: " + "hard link not allowed for directory") + return + cephfs.link(args.target, link_name) + except libcephfs.Error as e: + set_exit_code_msg(msg=str(e)) + return + + if args.verbose: + poutput(f"{link_name.decode('utf-8')} ->" + f" {args.target.decode('utf-8')}") + + def complete_ls(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + ls_parser = argparse.ArgumentParser( + description='Copy a file from Ceph File System from Local Directory.') + ls_parser.add_argument('-l', '--long', action='store_true', + help='Detailed list of items in the directory.') + ls_parser.add_argument('-r', '--reverse', action='store_true', + help='Reverse order of listing items in the directory.') + ls_parser.add_argument('-H', action='store_true', help='Human Readable') + ls_parser.add_argument('-a', '--all', action='store_true', + help='Do not Ignore entries starting with .') + ls_parser.add_argument('-S', action='store_true', help='Sort by file_size') + ls_parser.add_argument('paths', help='Name of Directories', + action=path_to_bytes, nargs='*', default=['.']) + + @with_argparser(ls_parser) + def do_ls(self, args): + """ + List all the files and directories in the current working directory + """ + paths = args.paths + for path in paths: + values = [] + items = [] + try: + if path.count(b'*') > 0: + all_items = get_all_possible_paths(path) + if len(all_items) == 0: + continue + path = all_items[0].rsplit(b'/', 1)[0] + if path == b'': + path = b'/' + dirs = [] + for i in all_items: + for item in ls(path): + d_name = item.d_name + if os.path.basename(i) == d_name: + if item.is_dir(): + dirs.append(os.path.join(path, d_name)) + else: + items.append(item) + if dirs: + paths.extend(dirs) + else: + poutput(path.decode('utf-8'), end=':\n') + items = sorted(items, key=lambda item: item.d_name) + else: + if path != b'' and path != cephfs.getcwd() and len(paths) > 1: + poutput(path.decode('utf-8'), end=':\n') + items = sorted(ls(path), key=lambda item: item.d_name) + if not args.all: + items = [i for i in items if not i.d_name.startswith(b'.')] + if args.S: + items = sorted(items, key=lambda item: cephfs.stat( + path + b'/' + item.d_name, follow_symlink=( + not item.is_symbol_file())).st_size) + if args.reverse: + items = reversed(items) + for item in items: + filepath = item.d_name + is_dir = item.is_dir() + is_sym_lnk = item.is_symbol_file() + try: + if args.long and args.H: + print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir, + is_sym_lnk, True) + elif args.long: + print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir, + is_sym_lnk, False) + elif is_sym_lnk or is_dir: + values.append(style_listing(filepath.decode('utf-8'), is_dir, + is_sym_lnk)) + else: + values.append(filepath) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + if not args.long: + print_list(values, shutil.get_terminal_size().columns) + if path != paths[-1]: + poutput('') + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + def complete_rmdir(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + rmdir_parser = argparse.ArgumentParser( + description='Remove the directory(ies), if they are empty.') + rmdir_parser.add_argument('paths', help='Directory Path(s)', nargs='+', + action=path_to_bytes) + rmdir_parser.add_argument('-p', '--parent', action='store_true', + help="remove directory and its ancestors; " + "e.g., 'rmdir -p a/b/c' is similar to " + "'rmdir a/b/c a/b a'") + + @with_argparser(rmdir_parser) + def do_rmdir(self, args): + self.do_rmdir_helper(args) + + def do_rmdir_helper(self, args): + """ + Remove a specific Directory + """ + is_pattern = False + paths = args.paths + for path in paths: + if path.count(b'*') > 0: + is_pattern = True + all_items = get_all_possible_paths(path) + if len(all_items) > 0: + path = all_items[0].rsplit(b'/', 1)[0] + if path == b'': + path = b'/' + dirs = [] + for i in all_items: + for item in ls(path): + d_name = item.d_name + if os.path.basename(i) == d_name: + if item.is_dir(): + dirs.append(os.path.join(path, d_name)) + paths.extend(dirs) + continue + else: + is_pattern = False + + if args.parent: + path = os.path.join(cephfs.getcwd(), path.rsplit(b'/')[0]) + files = list(sorted(set(dirwalk(path)), reverse=True)) + if not files: + path = b'.' + for filepath in files: + try: + cephfs.rmdir(os.path.normpath(filepath)) + except libcephfs.Error as e: + perror(e) + path = b'.' + break + else: + path = os.path.normpath(os.path.join(cephfs.getcwd(), path)) + if not is_pattern and path != os.path.normpath(b''): + try: + cephfs.rmdir(path) + except libcephfs.Error as e: + if e.get_error_code() == 2: + set_exit_code_msg(e.get_error_code(), + "rmdir: failed to remove " + f"{path.decode('utf-8')}: " + "No such file or directory") + elif e.get_error_code() == 20: + set_exit_code_msg(e.get_error_code(), + "rmdir: failed to remove " + f"{path.decode('utf-8')}: " + "Not a directory") + elif e.get_error_code() == 39: + set_exit_code_msg(e.get_error_code(), + "rmdir: failed to remove " + f"{path.decode('utf-8')}: " + "Directory not empty") + else: + set_exit_code_msg(msg=e) + + def complete_rm(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + rm_parser = argparse.ArgumentParser(description='Remove File.') + rm_parser.add_argument('paths', help='File Path.', nargs='+', + action=path_to_bytes) + + @with_argparser(rm_parser) + def do_rm(self, args): + """ + Remove a specific file + """ + file_paths = args.paths + for path in file_paths: + if path.count(b'*') > 0: + file_paths.extend([i for i in get_all_possible_paths( + path) if is_file_exists(i)]) + else: + try: + cephfs.unlink(path) + except libcephfs.Error as e: + # NOTE: perhaps we need a better msg here + if e.get_error_code() == 2: + set_exit_code_msg(e.get_error_code(), + "rm: failed to remove " + f"{path.decode('utf-8')}: " + "No such file or directory") + elif e.get_error_code() == 21: + set_exit_code_msg(e.get_error_code(), + "rm: failed to remove " + f"{path.decode('utf-8')}: " + "Is a directory") + else: + set_exit_code_msg(msg=e) + + def complete_mv(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + return self.complete_filenames(text, line, begidx, endidx) + + mv_parser = argparse.ArgumentParser(description='Move File.') + mv_parser.add_argument('src_path', type=str, action=path_to_bytes, + help='Source File Path.') + mv_parser.add_argument('dest_path', type=str, action=path_to_bytes, + help='Destination File Path.') + + @with_argparser(mv_parser) + def do_mv(self, args): + """ + Rename a file or Move a file from source path to the destination + """ + cephfs.rename(args.src_path, args.dest_path) + + def complete_cd(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + cd_parser = argparse.ArgumentParser(description='Change working directory') + cd_parser.add_argument('path', type=str, help='Name of the directory.', + action=path_to_bytes, nargs='?', default='/') + + @with_argparser(cd_parser) + def do_cd(self, args): + """ + Change working directory + """ + cephfs.chdir(args.path) + self.working_dir = cephfs.getcwd().decode('utf-8') + self.set_prompt() + + def do_cwd(self, arglist): + """ + Get current working directory. + """ + poutput(cephfs.getcwd().decode('utf-8')) + + def complete_chmod(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + chmod_parser = argparse.ArgumentParser(description='Change permission of a file/directory.') + chmod_parser.add_argument('mode', type=str, action=ModeAction, help='Mode') + chmod_parser.add_argument('paths', type=str, action=path_to_bytes, + help='Path of the file/directory', nargs='+') + + @with_argparser(chmod_parser) + def do_chmod(self, args): + """ + Change permission of a file/directory + """ + for path in args.paths: + mode = int(args.mode, base=8) + try: + cephfs.chmod(path, mode) + except libcephfs.Error as e: + set_exit_code_msg(msg=e) + + def complete_cat(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + cat_parser = argparse.ArgumentParser(description='') + cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes, + nargs='+') + + @with_argparser(cat_parser) + def do_cat(self, args): + """ + Print contents of a file + """ + for path in args.paths: + if is_file_exists(path): + copy_to_local(path, b'-') + else: + set_exit_code_msg(errno.ENOENT, '{}: no such file'.format( + path.decode('utf-8'))) + + umask_parser = argparse.ArgumentParser(description='Set umask value.') + umask_parser.add_argument('mode', help='Mode', type=str, action=ModeAction, + nargs='?', default='') + + @with_argparser(umask_parser) + def do_umask(self, args): + """ + Set Umask value. + """ + if args.mode == '': + poutput(self.umask.zfill(4)) + else: + mode = int(args.mode, 8) + self.umask = str(oct(cephfs.umask(mode))[2:]) + + def complete_write(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + write_parser = argparse.ArgumentParser(description='Writes data into a file') + write_parser.add_argument('path', type=str, action=path_to_bytes, + help='Name of File') + + @with_argparser(write_parser) + def do_write(self, args): + """ + Write data into a file. + """ + + copy_from_local(b'-', args.path) + + def complete_lcd(self, text, line, begidx, endidx): + """ + auto complete of file name. 
+ """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + lcd_parser = argparse.ArgumentParser(description='') + lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path') + + @with_argparser(lcd_parser) + def do_lcd(self, args): + """ + Moves into the given local directory + """ + try: + os.chdir(os.path.expanduser(args.path)) + except OSError as e: + set_exit_code_msg(e.errno, "Cannot change to " + f"{e.filename.decode('utf-8')}: {e.strerror}") + + def complete_lls(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + index_dict = {1: self.path_complete} + return self.index_based_complete(text, line, begidx, endidx, index_dict) + + lls_parser = argparse.ArgumentParser( + description='List files in local system.') + lls_parser.add_argument('paths', help='Paths', action=path_to_bytes, + nargs='*') + + @with_argparser(lls_parser) + def do_lls(self, args): + """ + Lists all files and folders in the current local directory + """ + if not args.paths: + print_list(os.listdir(os.getcwdb())) + else: + for path in args.paths: + try: + items = os.listdir(path) + poutput("{}:".format(path.decode('utf-8'))) + print_list(items) + except OSError as e: + set_exit_code_msg(e.errno, f"{e.filename.decode('utf-8')}: " + f"{e.strerror}") + # Arguments to the with_argpaser decorator function are sticky. + # The items in args.path do not get overwritten in subsequent calls. + # The arguments remain in args.paths after the function exits and we + # need to clean it up to ensure the next call works as expected. + args.paths.clear() + + def do_lpwd(self, arglist): + """ + Prints the absolute path of the current local directory + """ + poutput(os.getcwd()) + + def complete_df(self, text, line, begidx, endidx): + """ + auto complete of file name. + """ + return self.complete_filenames(text, line, begidx, endidx) + + df_parser = argparse.ArgumentParser(description='Show information about\ + the amount of available disk space') + df_parser.add_argument('file', help='Name of the file', nargs='*', + default=['.'], action=path_to_bytes) + + @with_argparser(df_parser) + def do_df(self, arglist): + """ + Display the amount of available disk space for file systems + """ + header = True # Set to true for printing header only once + if b'.' == arglist.file[0]: + arglist.file = ls(b'.') + + for file in arglist.file: + if isinstance(file, libcephfs.DirEntry): + file = file.d_name + if file == b'.' 
+ if file == b'.' or file == b'..':
+ continue
+ try:
+ statfs = cephfs.statfs(file)
+ stat = cephfs.stat(file)
+ block_size = (statfs['f_blocks'] * statfs['f_bsize']) // 1024
+ available = block_size - stat.st_size
+ use = 0
+
+ if block_size > 0:
+ use = (stat.st_size * 100) // block_size
+
+ if header:
+ header = False
+ poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format(
+ "1K-blocks", "Used", "Available", "Use%",
+ "Stored on"))
+
+ poutput('{:d}\t{:18d}\t{:8d}\t{:10s} {}'.format(block_size,
+ stat.st_size, available, str(int(use)) + '%',
+ file.decode('utf-8')))
+ except libcephfs.OSError as e:
+ set_exit_code_msg(e.get_error_code(), "could not statfs {}: {}".format(
+ file.decode('utf-8'), e.strerror))
+
+ locate_parser = argparse.ArgumentParser(
+ description='Find file within file system')
+ locate_parser.add_argument('name', help='name', type=str,
+ action=path_to_bytes)
+ locate_parser.add_argument('-c', '--count', action='store_true',
+ help='Count of items located.')
+ locate_parser.add_argument(
+ '-i', '--ignorecase', action='store_true', help='Ignore case')
+
+ @with_argparser(locate_parser)
+ def do_locate(self, args):
+ """
+ Find a file within the File System
+ """
+ if args.name.count(b'*') == 1:
+ if args.name.startswith(b'*'):
+ args.name += b'/'
+ elif args.name.endswith(b'*'):
+ args.name = b'/' + args.name
+ args.name = args.name.replace(b'*', b'')
+ if args.ignorecase:
+ locations = locate_file(args.name, False)
+ else:
+ locations = locate_file(args.name)
+ if args.count:
+ poutput(len(locations))
+ else:
+ poutput((b'\n'.join(locations)).decode('utf-8'))
+
+ def complete_du(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ du_parser = argparse.ArgumentParser(
+ description='Disk Usage of a Directory')
+ du_parser.add_argument('paths', type=str, action=get_list_of_bytes_path,
+ help='Name of the directory.', nargs='*',
+ default=[b'.'])
+ du_parser.add_argument('-r', action='store_true',
+ help='Recursive Disk usage of all directories.')
+
+ @with_argparser(du_parser)
+ def do_du(self, args):
+ """
+ Print disk usage of the given path(s).
+ """
+ def print_disk_usage(files):
+ if isinstance(files, bytes):
+ files = (files, )
+
+ for f in files:
+ try:
+ st = cephfs.lstat(f)
+
+ if stat.S_ISDIR(st.st_mode):
+ dusage = int(cephfs.getxattr(f,
+ 'ceph.dir.rbytes').decode('utf-8'))
+ else:
+ dusage = st.st_size
+
+ # print path in local context
+ f = os.path.normpath(f)
+ if f[0] == ord('/'):
+ f = b'.' + f
+ poutput('{:10s} {}'.format(humansize(dusage),
+ f.decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+ continue
+
+ for path in args.paths:
+ if args.r:
+ print_disk_usage(sorted(set(dirwalk(path)).union({path})))
+ else:
+ print_disk_usage(path)
+
+ quota_parser = argparse.ArgumentParser(
+ description='Quota management for a Directory')
+ quota_parser.add_argument('op', choices=['get', 'set'],
+ help='Quota operation type.')
+ quota_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the directory.')
+ quota_parser.add_argument('--max_bytes', type=int, default=-1, nargs='?',
+ help='Max cumulative size of the data under '
+ 'this directory.')
+ quota_parser.add_argument('--max_files', type=int, default=-1, nargs='?',
+ help='Total number of files under this '
+ 'directory tree.')
+
+ @with_argparser(quota_parser)
+ def do_quota(self, args):
+ """
+ Quota management.
+ """ + if not is_dir_exists(args.path): + set_exit_code_msg(errno.ENOENT, 'error: no such directory {}'.format( + args.path.decode('utf-8'))) + return + + if args.op == 'set': + if (args.max_bytes == -1) and (args.max_files == -1): + set_exit_code_msg(errno.EINVAL, 'please specify either ' + '--max_bytes or --max_files or both') + return + + if args.max_bytes >= 0: + max_bytes = to_bytes(str(args.max_bytes)) + try: + cephfs.setxattr(args.path, 'ceph.quota.max_bytes', + max_bytes, os.XATTR_CREATE) + poutput('max_bytes set to %d' % args.max_bytes) + except libcephfs.Error as e: + cephfs.setxattr(args.path, 'ceph.quota.max_bytes', + max_bytes, os.XATTR_REPLACE) + set_exit_code_msg(e.get_error_code(), 'max_bytes reset to ' + f'{args.max_bytes}') + + if args.max_files >= 0: + max_files = to_bytes(str(args.max_files)) + try: + cephfs.setxattr(args.path, 'ceph.quota.max_files', + max_files, os.XATTR_CREATE) + poutput('max_files set to %d' % args.max_files) + except libcephfs.Error as e: + cephfs.setxattr(args.path, 'ceph.quota.max_files', + max_files, os.XATTR_REPLACE) + set_exit_code_msg(e.get_error_code(), 'max_files reset to ' + f'{args.max_files}') + elif args.op == 'get': + max_bytes = '0' + max_files = '0' + try: + max_bytes = cephfs.getxattr(args.path, 'ceph.quota.max_bytes') + poutput('max_bytes: {}'.format(max_bytes.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), 'max_bytes is not set') + + try: + max_files = cephfs.getxattr(args.path, 'ceph.quota.max_files') + poutput('max_files: {}'.format(max_files.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), 'max_files is not set') + + snap_parser = argparse.ArgumentParser(description='Snapshot Management') + snap_parser.add_argument('op', type=str, + help='Snapshot operation: create or delete') + snap_parser.add_argument('name', type=str, action=path_to_bytes, + help='Name of snapshot') + snap_parser.add_argument('dir', type=str, action=path_to_bytes, + help='Directory for which snapshot ' + 'needs to be created or deleted') + + @with_argparser(snap_parser) + def do_snap(self, args): + """ + Snapshot management for the volume + """ + # setting self.colors to None turns off colorizing and + # perror emits plain text + self.colors = None + + snapdir = '.snap' + conf_snapdir = cephfs.conf_get('client_snapdir') + if conf_snapdir is not None: + snapdir = conf_snapdir + snapdir = to_bytes(snapdir) + if args.op == 'create': + try: + if is_dir_exists(args.dir): + cephfs.mkdir(os.path.join(args.dir, snapdir, args.name), 0o755) + else: + set_exit_code_msg(errno.ENOENT, "'{}': no such directory".format( + args.dir.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), + "snapshot '{}' already exists".format( + args.name.decode('utf-8'))) + elif args.op == 'delete': + snap_dir = os.path.join(args.dir, snapdir, args.name) + try: + if is_dir_exists(snap_dir): + newargs = argparse.Namespace(paths=[snap_dir], parent=False) + self.do_rmdir_helper(newargs) + else: + set_exit_code_msg(errno.ENOENT, "'{}': no such snapshot".format( + args.name.decode('utf-8'))) + except libcephfs.Error as e: + set_exit_code_msg(e.get_error_code(), "error while deleting " + "'{}'".format(snap_dir.decode('utf-8'))) + else: + set_exit_code_msg(errno.EINVAL, "snapshot can only be created or " + "deleted; check - help snap") + + def do_help(self, line): + """ + Get details about a command. 
+ Usage: help <command> - for a specific command
+ help all - for all the commands
+ """
+ if line == 'all':
+ for k in dir(self):
+ if k.startswith('do_'):
+ poutput('-' * 80)
+ super().do_help(k[3:])
+ return
+ parser = self.create_argparser(line)
+ if parser:
+ parser.print_help()
+ else:
+ super().do_help(line)
+
+ def complete_stat(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ stat_parser = argparse.ArgumentParser(
+ description='Display file or file system status')
+ stat_parser.add_argument('paths', type=str, help='file paths',
+ action=path_to_bytes, nargs='+')
+
+ @with_argparser(stat_parser)
+ def do_stat(self, args):
+ """
+ Display file or file system status
+ """
+ for path in args.paths:
+ try:
+ stat = cephfs.stat(path)
+ atime = stat.st_atime.isoformat(' ')
+ mtime = stat.st_mtime.isoformat(' ')
+ ctime = stat.st_ctime.isoformat(' ')
+
+ poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n"
+ "Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: "
+ "{:o}/{}\tUid: {:d}\tGid: {:d}\nAccess: {}\nModify: "
+ "{}\nChange: {}".format(path.decode('utf-8'),
+ stat.st_size, stat.st_blocks,
+ stat.st_blksize, stat.st_dev,
+ stat.st_ino, stat.st_nlink,
+ stat.st_mode,
+ mode_notation(stat.st_mode),
+ stat.st_uid, stat.st_gid, atime,
+ mtime, ctime))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ setxattr_parser = argparse.ArgumentParser(
+ description='Set extended attribute for a file')
+ setxattr_parser.add_argument('path', type=str, action=path_to_bytes, help='Name of the file')
+ setxattr_parser.add_argument('name', type=str, help='Extended attribute name')
+ setxattr_parser.add_argument('value', type=str, help='Extended attribute value')
+
+ @with_argparser(setxattr_parser)
+ def do_setxattr(self, args):
+ """
+ Set extended attribute for a file
+ """
+ val_bytes = to_bytes(args.value)
+ name_bytes = to_bytes(args.name)
+ try:
+ cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_CREATE)
+ poutput('{} is successfully set to {}'.format(args.name, args.value))
+ except libcephfs.ObjectExists:
+ cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_REPLACE)
+ poutput('{} is successfully reset to {}'.format(args.name, args.value))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ getxattr_parser = argparse.ArgumentParser(
+ description='Get extended attribute set for a file')
+ getxattr_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the file')
+ getxattr_parser.add_argument('name', type=str, help='Extended attribute name')
+
+ @with_argparser(getxattr_parser)
+ def do_getxattr(self, args):
+ """
+ Get extended attribute for a file
+ """
+ try:
+ poutput('{}'.format(cephfs.getxattr(args.path,
+ to_bytes(args.name)).decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ listxattr_parser = argparse.ArgumentParser(
+ description='List extended attributes set for a file')
+ listxattr_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the file')
+
+ @with_argparser(listxattr_parser)
+ def do_listxattr(self, args):
+ """
+ List extended attributes for a file
+ """
+ try:
+ size, xattr_list = cephfs.listxattr(args.path)
+ if size > 0:
+ poutput('{}'.format(xattr_list.replace(b'\x00', b' ').decode('utf-8')))
+ else:
+ poutput('No extended attribute is set')
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+
+#######################################################
+#
+# Following are methods that get cephfs-shell started.
+#
+#######################################################
+
+def setup_cephfs(args):
+ """
+ Mounting a cephfs
+ """
+ global cephfs
+ try:
+ cephfs = libcephfs.LibCephFS(conffile='')
+ cephfs.mount(filesystem_name=args.fs)
+ except libcephfs.ObjectNotFound as e:
+ print('couldn\'t find the ceph configuration file')
+ sys.exit(e.get_error_code())
+ except libcephfs.Error as e:
+ print(e)
+ sys.exit(e.get_error_code())
+
+
+def str_to_bool(val):
+ """
+ Return corresponding bool values for strings like 'true' or 'false'.
+ """
+ if not isinstance(val, str):
+ return val
+
+ val = val.replace('\n', '')
+ if val.lower() in ['true', 'yes']:
+ return True
+ elif val.lower() in ['false', 'no']:
+ return False
+ else:
+ return val
+
+
+def read_shell_conf(shell, shell_conf_file):
+ import configparser
+
+ sec = 'cephfs-shell'
+ opts = []
+ if LooseVersion(cmd2_version) >= LooseVersion("0.10.0"):
+ for attr in shell.settables.keys():
+ opts.append(attr)
+ else:
+ if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"):
+ # hardcoding options for 0.7.9 because -
+ # 1. we use cmd2 v0.7.9 with teuthology and
+ # 2. there's no way to distinguish between a shell setting and shell
+ # object attribute until v0.10.0
+ opts = ['abbrev', 'autorun_on_edit', 'colors',
+ 'continuation_prompt', 'debug', 'echo', 'editor',
+ 'feedback_to_output', 'locals_in_py', 'prompt', 'quiet',
+ 'timing']
+ elif LooseVersion(cmd2_version) >= LooseVersion("0.9.23"):
+ opts.append('allow_style')
+ # no equivalent option was defined by cmd2.
+ else:
+ pass
+
+ # default and only section in our conf file.
+ cp = configparser.ConfigParser(default_section=sec, strict=False)
+ cp.read(shell_conf_file)
+ for opt in opts:
+ if cp.has_option(sec, opt):
+ setattr(shell, opt, str_to_bool(cp.get(sec, opt)))
+
+
+def get_shell_conffile_path(arg_conf=''):
+ conf_filename = 'cephfs-shell.conf'
+ env_var = 'CEPHFS_SHELL_CONF'
+
+ arg_conf = '' if not arg_conf else arg_conf
+ home_dir_conf = os.path.expanduser('~/.' + conf_filename)
+ env_conf = os.environ[env_var] if env_var in os.environ else ''
+
+ # here's the priority by which conf gets read.
+ for path in (arg_conf, env_conf, home_dir_conf):
+ if os.path.isfile(path):
+ return path
+ else:
+ return ''
+
+
+def manage_args():
+ main_parser = argparse.ArgumentParser(description='Interactive shell for Ceph file system')
+ main_parser.add_argument('-b', '--batch', action='store',
+ help='Path to CephFS shell script/batch file '
+ 'containing CephFS shell commands',
+ type=str)
+ main_parser.add_argument('-c', '--config', action='store',
+ help='Path to Ceph configuration file.',
+ type=str)
+ main_parser.add_argument('-f', '--fs', action='store',
+ help='Name of filesystem to mount.',
+ type=str)
+ main_parser.add_argument('-t', '--test', action='store',
+ help='Test against transcript(s) in FILE',
+ nargs='+')
+ main_parser.add_argument('commands', nargs='*', help='Comma delimited '
+ 'commands. The shell executes the given command '
+ 'and quits immediately with the return value of '
+ 'command. In case no commands are provided, the '
+ 'shell is launched.', default=[])
+
+ args = main_parser.parse_args()
+ args.exe_and_quit = False # Execute and quit, don't launch the shell.
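+ # Illustrative usage (a sketch, not parser output): since 'commands' is
+ # comma-delimited, an invocation such as
+ # cephfs-shell 'ls /, df'
+ # runs both commands in order and exits with the status of the last one
+ # (see execute_cmds_and_quit() below).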
+ + if args.batch: + if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"): + args.commands = ['load ' + args.batch, ',quit'] + else: + args.commands = ['run_script ' + args.batch, ',quit'] + if args.test: + args.commands.extend(['-t,'] + [arg + ',' for arg in args.test]) + if not args.batch and len(args.commands) > 0: + args.exe_and_quit = True + + manage_sys_argv(args) + + return args + + +def manage_sys_argv(args): + exe = sys.argv[0] + sys.argv.clear() + sys.argv.append(exe) + sys.argv.extend([i.strip() for i in ' '.join(args.commands).split(',')]) + + setup_cephfs(args) + + +def execute_cmd_args(args): + """ + Launch a shell session if no arguments were passed, else just execute + the given argument as a shell command and exit the shell session + immediately at (last) command's termination with the (last) command's + return value. + """ + if not args.exe_and_quit: + return shell.cmdloop() + return execute_cmds_and_quit(args) + + +def execute_cmds_and_quit(args): + """ + Multiple commands might be passed separated by commas, feed onecmd() + one command at a time. + """ + # do_* methods triggered by cephfs-shell commands return None when they + # complete running successfully. Until 0.9.6, shell.onecmd() returned this + # value to indicate whether the execution of the commands should stop, but + # since 0.9.7 it returns the return value of do_* methods only if it's + # not None. When it is None it returns False instead of None. + if LooseVersion(cmd2_version) <= LooseVersion("0.9.6"): + stop_exec_val = None + else: + stop_exec_val = False + + args_to_onecmd = '' + if len(args.commands) <= 1: + args.commands = args.commands[0].split(' ') + for cmdarg in args.commands: + if ',' in cmdarg: + args_to_onecmd += ' ' + cmdarg[0:-1] + onecmd_retval = shell.onecmd(args_to_onecmd) + # if the current command failed, let's abort the execution of + # series of commands passed. + if onecmd_retval is not stop_exec_val: + return onecmd_retval + if shell.exit_code != 0: + return shell.exit_code + + args_to_onecmd = '' + continue + + args_to_onecmd += ' ' + cmdarg + return shell.onecmd(args_to_onecmd) + + +if __name__ == '__main__': + args = manage_args() + + shell = CephFSShell() + # TODO: perhaps, we should add an option to pass ceph.conf? + read_shell_conf(shell, get_shell_conffile_path(args.config)) + # XXX: setting shell.exit_code to zero so that in case there are no errors + # and exceptions, it is not set by any method or function of cephfs-shell + # and return values from shell.cmdloop() or shell.onecmd() is not an + # integer, we can treat it as the return value of cephfs-shell. 
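+ # An illustrative trace of that contract: a command that fails with
+ # ENOENT calls set_exit_code_msg(errno.ENOENT, ...), leaving
+ # shell.exit_code == 2, and the sys.exit() below hands that 2 back to
+ # the invoking shell when the command itself returns None/0.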
+ shell.exit_code = 0 + + retval = execute_cmd_args(args) + sys.exit(retval if retval else shell.exit_code) diff --git a/src/tools/cephfs/shell/setup.py b/src/tools/cephfs/shell/setup.py new file mode 100644 index 000000000..8cf7f28f7 --- /dev/null +++ b/src/tools/cephfs/shell/setup.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +from setuptools import setup + +__version__ = '0.0.1' + +setup( + name='cephfs-shell', + version=__version__, + description='Interactive shell for Ceph file system', + keywords='cephfs, shell', + scripts=['cephfs-shell'], + install_requires=[ + 'cephfs', + 'cmd2', + 'colorama', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python :: 3' + ], + license='LGPLv2+', +) diff --git a/src/tools/cephfs/shell/tox.ini b/src/tools/cephfs/shell/tox.ini new file mode 100644 index 000000000..c1cbff051 --- /dev/null +++ b/src/tools/cephfs/shell/tox.ini @@ -0,0 +1,7 @@ +[tox] +envlist = py3 +skipsdist = true + +[testenv:py3] +deps = flake8 +commands = flake8 --ignore=W503 --max-line-length=100 cephfs-shell diff --git a/src/tools/cephfs/top/CMakeLists.txt b/src/tools/cephfs/top/CMakeLists.txt new file mode 100644 index 000000000..8f9df0187 --- /dev/null +++ b/src/tools/cephfs/top/CMakeLists.txt @@ -0,0 +1,11 @@ +include(Distutils) +distutils_install_module(cephfs-top) + +if(WITH_TESTS) + include(AddCephTest) + add_tox_test(cephfs-top) +endif() + +set(MINIMUM_COMPATIBLE_VERSION 3.6.0) +find_package(Python3 ${MINIMUM_COMPATIBLE_VERSION} REQUIRED + COMPONENTS Interpreter) diff --git a/src/tools/cephfs/top/cephfs-top b/src/tools/cephfs/top/cephfs-top new file mode 100755 index 000000000..b39e815fa --- /dev/null +++ b/src/tools/cephfs/top/cephfs-top @@ -0,0 +1,1227 @@ +#!/usr/bin/python3 + +import argparse +import sys +import curses +import errno +import json +import signal +import time +import math +import threading + +from collections import OrderedDict +from datetime import datetime +from enum import Enum, unique +from curses import ascii + +import rados + + +class FSTopException(Exception): + def __init__(self, msg=''): + self.error_msg = msg + + def get_error_msg(self): + return self.error_msg + + +@unique +class MetricType(Enum): + METRIC_TYPE_NONE = 0 + METRIC_TYPE_PERCENTAGE = 1 + METRIC_TYPE_LATENCY = 2 + METRIC_TYPE_SIZE = 3 + METRIC_TYPE_STDEV = 4 + + +FS_TOP_PROG_STR = 'cephfs-top' +FS_TOP_ALL_FS_APP = 'ALL_FS_APP' +FS_TOP_FS_SELECTED_APP = 'SELECTED_FS_APP' + +# version match b/w fstop and stats emitted by mgr/stats +FS_TOP_SUPPORTED_VER = 2 + +ITEMS_PAD_LEN = 3 +ITEMS_PAD = " " * ITEMS_PAD_LEN +DEFAULT_REFRESH_INTERVAL = 1 + +# metadata provided by mgr/stats +FS_TOP_MAIN_WINDOW_COL_CLIENT_ID = "client_id" +FS_TOP_MAIN_WINDOW_COL_MNT_ROOT = "mount_root" +FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR = "mount_point@host/addr" + +MAIN_WINDOW_TOP_LINE_ITEMS_START = [ITEMS_PAD, + FS_TOP_MAIN_WINDOW_COL_CLIENT_ID, + FS_TOP_MAIN_WINDOW_COL_MNT_ROOT] +MAIN_WINDOW_TOP_LINE_ITEMS_END = [FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR] + +MAIN_WINDOW_TOP_LINE_METRICS_LEGACY = ["READ_LATENCY", + "WRITE_LATENCY", + "METADATA_LATENCY" + ] + +# adjust this map according to stats version and maintain order +# as emitted by mgr/stast +MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([ + ("CAP_HIT", MetricType.METRIC_TYPE_PERCENTAGE), + ("READ_LATENCY", 
MetricType.METRIC_TYPE_LATENCY), + ("WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("DENTRY_LEASE", MetricType.METRIC_TYPE_PERCENTAGE), + ("OPENED_FILES", MetricType.METRIC_TYPE_NONE), + ("PINNED_ICAPS", MetricType.METRIC_TYPE_NONE), + ("OPENED_INODES", MetricType.METRIC_TYPE_NONE), + ("READ_IO_SIZES", MetricType.METRIC_TYPE_SIZE), + ("WRITE_IO_SIZES", MetricType.METRIC_TYPE_SIZE), + ("AVG_READ_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("STDEV_READ_LATENCY", MetricType.METRIC_TYPE_STDEV), + ("AVG_WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("STDEV_WRITE_LATENCY", MetricType.METRIC_TYPE_STDEV), + ("AVG_METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY), + ("STDEV_METADATA_LATENCY", MetricType.METRIC_TYPE_STDEV), +]) +MGR_STATS_COUNTERS = list(MAIN_WINDOW_TOP_LINE_METRICS.keys()) + +FS_TOP_VERSION_HEADER_FMT = '{prog_name} - {now}' +FS_TOP_CLIENT_HEADER_FMT = 'Total Client(s): {num_clients} - '\ + '{num_mounts} FUSE, {num_kclients} kclient, {num_libs} libcephfs' +FS_TOP_NAME_TOPL_FMT = 'Filesystem: {fs_name} - {client_count} client(s)' + +CLIENT_METADATA_KEY = "client_metadata" +CLIENT_METADATA_MOUNT_POINT_KEY = "mount_point" +CLIENT_METADATA_MOUNT_ROOT_KEY = "root" +CLIENT_METADATA_IP_KEY = "IP" +CLIENT_METADATA_HOSTNAME_KEY = "hostname" +CLIENT_METADATA_VALID_METRICS_KEY = "valid_metrics" + +GLOBAL_METRICS_KEY = "global_metrics" +GLOBAL_COUNTERS_KEY = "global_counters" + +fs_list = [] +# store the current states of cephfs-top +# last_fs : last filesystem visited +# last_field : last field selected for sorting +# limit : last limit value +current_states = {"last_fs": "", "last_field": 'chit', "limit": None} +metrics_dict = {} + + +def calc_perc(c): + if c[0] == 0 and c[1] == 0: + return 0.0 + return round((c[0] / (c[0] + c[1])) * 100, 2) + + +def calc_lat(c): + return round(c[0] * 1000 + c[1] / 1000000, 2) + + +def calc_stdev(c): + stdev = 0.0 + if c[1] > 1: + stdev = math.sqrt(c[0] / (c[1] - 1)) / 1000000 + return round(stdev, 2) + + +# in MB +def calc_size(c): + return round(c[1] / (1024 * 1024), 2) + + +# in MB +def calc_avg_size(c): + if c[0] == 0: + return 0.0 + return round(c[1] / (c[0] * 1024 * 1024), 2) + + +# in MB/s +def calc_speed(size, duration): + if duration == 0: + return 0.0 + return round(size / (duration * 1024 * 1024), 2) + + +def wrap(s, sl): + """return a '+' suffixed wrapped string""" + if len(s) < sl: + return s + return f'{s[0:sl-1]}+' + + +class FSTopBase(object): + def __init__(self): + self.last_time = time.time() + self.last_read_size = {} + self.last_write_size = {} + self.dump_json = {} + + @staticmethod + def has_metric(metadata, metrics_key): + return metrics_key in metadata + + @staticmethod + def has_metrics(metadata, metrics_keys): + for key in metrics_keys: + if not FSTopBase.has_metric(metadata, key): + return False + return True + + def __build_clients(self, fs): + fs_meta = self.dump_json.setdefault(fs, {}) + fs_key = self.stats_json[GLOBAL_METRICS_KEY].get(fs, {}) + clients = fs_key.keys() + for client_id in clients: + cur_time = time.time() + duration = cur_time - self.last_time + self.last_time = cur_time + client_meta = self.stats_json[CLIENT_METADATA_KEY].get(fs, {}).get(client_id, {}) + for item in MAIN_WINDOW_TOP_LINE_ITEMS_START[1:]: + if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID: + client_id_meta = fs_meta.setdefault(client_id.split('.')[1], {}) + elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT: + client_id_meta.update({item: + client_meta[CLIENT_METADATA_MOUNT_ROOT_KEY]}) + 
counters = [m.upper() for m in self.stats_json[GLOBAL_COUNTERS_KEY]] + metrics = fs_key.get(client_id, {}) + cidx = 0 + for item in counters: + if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY: + cidx += 1 + continue + m = metrics[cidx] + key = MGR_STATS_COUNTERS[cidx] + typ = MAIN_WINDOW_TOP_LINE_METRICS[key] + if item.lower() in client_meta.get( + CLIENT_METADATA_VALID_METRICS_KEY, []): + key_name = self.items(item) + if typ == MetricType.METRIC_TYPE_PERCENTAGE: + client_id_meta.update({f'{key_name}': calc_perc(m)}) + elif typ == MetricType.METRIC_TYPE_LATENCY: + client_id_meta.update({f'{key_name}': calc_lat(m)}) + elif typ == MetricType.METRIC_TYPE_STDEV: + client_id_meta.update({f'{key_name}': calc_stdev(m)}) + elif typ == MetricType.METRIC_TYPE_SIZE: + client_id_meta.update({f'{key_name}': calc_size(m)}) + # average io sizes + client_id_meta.update({f'{self.avg_items(item)}': + calc_avg_size(m)}) + # io speeds + size = 0 + if key == "READ_IO_SIZES": + if m[1] > 0: + last_size = self.last_read_size.get(client_id, 0) + size = m[1] - last_size + self.last_read_size[client_id] = m[1] + if key == "WRITE_IO_SIZES": + if m[1] > 0: + last_size = self.last_write_size.get(client_id, 0) + size = m[1] - last_size + self.last_write_size[client_id] = m[1] + client_id_meta.update({f'{self.speed_items(item)}': + calc_speed(abs(size), duration)}) + else: + # display 0th element from metric tuple + client_id_meta.update({f'{key_name}': f'{m[0]}'}) + else: + client_id_meta.update({f'{self.items(item)}': "N/A"}) + cidx += 1 + + for item in MAIN_WINDOW_TOP_LINE_ITEMS_END: + if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR: + if FSTopBase.has_metrics(client_meta, + [CLIENT_METADATA_MOUNT_POINT_KEY, + CLIENT_METADATA_HOSTNAME_KEY, + CLIENT_METADATA_IP_KEY]): + mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}'\ + f'@{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\ + f'{client_meta[CLIENT_METADATA_IP_KEY]}' + client_id_meta.update({item: mount_point}) + else: + client_id_meta.update({item: "N/A"}) + + def dump_metrics_to_stdout(self, fs_name=None): + fs_list = self.get_fs_names() + if not fs_list: + sys.stdout.write("No filesystem available\n") + else: + self.stats_json = self.perf_stats_query() + if fs_name: # --dumpfs + if fs_name in fs_list: + self.__build_clients(fs_name) + else: + sys.stdout.write(f"Filesystem {fs_name} not available\n") + return + else: # --dump + for fs in fs_list: + self.__build_clients(fs) + sys.stdout.write(json.dumps(self.dump_json)) + sys.stdout.write("\n") + + +class FSTop(FSTopBase): + def __init__(self, args): + super(FSTop, self).__init__() + self.rados = None + self.stdscr = None # curses instance + self.active_screen = "" + self.client_name = args.id + self.cluster_name = args.cluster + self.conffile = args.conffile + self.refresh_interval_secs = args.delay + self.PAD_HEIGHT = 10000 # height of the fstop_pad + self.PAD_WIDTH = 300 # width of the fstop_pad + self.exit_ev = threading.Event() + + def handle_signal(self, signum, _): + self.exit_ev.set() + + def init(self): + try: + if self.conffile: + r_rados = rados.Rados(rados_id=self.client_name, + clustername=self.cluster_name, + conffile=self.conffile) + else: + r_rados = rados.Rados(rados_id=self.client_name, + clustername=self.cluster_name) + r_rados.conf_read_file() + r_rados.connect() + self.rados = r_rados + except rados.Error as e: + if e.errno == errno.ENOENT: + raise FSTopException(f'cluster {self.cluster_name}' + ' does not exist') + else: + raise FSTopException(f'error connecting to cluster: {e}') + 
self.verify_perf_stats_support() + signal.signal(signal.SIGTERM, self.handle_signal) + signal.signal(signal.SIGINT, self.handle_signal) + + def fini(self): + if self.rados: + self.rados.shutdown() + self.rados = None + + def selftest(self): + stats_json = self.perf_stats_query() + if not stats_json['version'] == FS_TOP_SUPPORTED_VER: + raise FSTopException('perf stats version mismatch!') + missing = [m for m in stats_json["global_counters"] + if m.upper() not in MGR_STATS_COUNTERS] + if missing: + raise FSTopException('Cannot handle unknown metrics from' + f'\'ceph fs perf stats\': {missing}') + + def get_fs_names(self): + mon_cmd = {'prefix': 'fs ls', 'format': 'json'} + try: + ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'') + except Exception as e: + raise FSTopException(f'Error in fs ls: {e}') + fs_map = json.loads(buf.decode('utf-8')) + global fs_list + fs_list.clear() + for filesystem in fs_map: + fs = filesystem['name'] + fs_list.append(fs) + return fs_list + + def setup_curses(self, win): + self.stdscr = win + self.stdscr.keypad(True) + curses.use_default_colors() + curses.start_color() + try: + curses.curs_set(0) + except curses.error: + # If the terminal do not support the visibility + # requested it will raise an exception + pass + self.fstop_pad = curses.newpad(self.PAD_HEIGHT, self.PAD_WIDTH) + self.run_all_display() + + def display_fs_menu(self, stdscr, selected_row_idx): + stdscr.clear() + h, w = stdscr.getmaxyx() + title = ['Filesystems', 'Press "q" to go back to the previous screen'] + pos_x1 = w // 2 - len(title[0]) // 2 + pos_x2 = w // 2 - len(title[1]) // 2 + stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD) + stdscr.addstr(3, pos_x2, title[1], curses.A_DIM) + for index, name in enumerate(fs_list): + x = w // 2 - len(name) // 2 + y = h // 2 - len(fs_list) // 2 + index + if index == selected_row_idx: + stdscr.attron(curses.color_pair(1)) + stdscr.addstr(y, x, name) + stdscr.attroff(curses.color_pair(1)) + else: + stdscr.addstr(y, x, name) + stdscr.refresh() + + def display_sort_menu(self, stdscr, selected_row_idx, field_menu): + stdscr.clear() + title = ['Fields', 'Press "q" to go back to the previous screen'] + pos_x1 = 0 + pos_x2 = 0 + stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD) + stdscr.addstr(3, pos_x2, title[1], curses.A_DIM) + for index, name in enumerate(field_menu): + x = 0 + y = 5 + index + if index == selected_row_idx: + stdscr.attron(curses.color_pair(1)) + stdscr.addstr(y, x, name) + stdscr.attroff(curses.color_pair(1)) + else: + stdscr.addstr(y, x, name) + stdscr.refresh() + + def display_menu(self, stdscr): + stdscr.clear() + h, w = stdscr.getmaxyx() + title = ['No filesystem available', + 'Press "q" to go back to home (All Filesystem Info) screen'] + pos_x1 = w // 2 - len(title[0]) // 2 + pos_x2 = w // 2 - len(title[1]) // 2 + stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD) + stdscr.addstr(3, pos_x2, title[1], curses.A_DIM) + stdscr.refresh() + + def set_key(self, stdscr): + curses.curs_set(0) + curses.init_pair(1, curses.COLOR_MAGENTA, curses.COLOR_WHITE) + curr_row = 0 + key = 0 + endmenu = False + while not endmenu: + global fs_list + fs_list = self.get_fs_names() + + if key == curses.KEY_UP and curr_row > 0: + curr_row -= 1 + elif key == curses.KEY_DOWN and curr_row < len(fs_list) - 1: + curr_row += 1 + elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list: + self.stdscr.erase() + current_states['last_fs'] = fs_list[curr_row] + self.run_display() + endmenu = True + elif 
key == ord('q'): + self.stdscr.erase() + if fs_list and self.active_screen == FS_TOP_FS_SELECTED_APP: + self.run_display() + else: + self.run_all_display() + endmenu = True + + try: + if not fs_list: + self.display_menu(stdscr) + else: + self.display_fs_menu(stdscr, curr_row) + except curses.error: + pass + curses.halfdelay(self.refresh_interval_secs) + key = stdscr.getch() + + def choose_field(self, stdscr): + curses.curs_set(0) + curses.init_pair(1, curses.COLOR_BLACK, curses.COLOR_WHITE) + field_menu = ["chit= CAP_HIT", "dlease= DENTRY_LEASE", "ofiles= OPENED_FILES", + "oicaps= PINNED_ICAPS", "oinodes= OPENED_INODES", + "rtio= READ_IO_SIZES", "raio= READ_AVG_IO_SIZES", + "rsp= READ_IO_SPEED", "wtio= WRITE_IO_SIZES", + "waio= WRITE_AVG_IO_SIZES", "wsp= WRITE_IO_SPEED", + "rlatavg= AVG_READ_LATENCY", "rlatsd= STDEV_READ_LATENCY", + "wlatavg= AVG_WRITE_LATENCY", "wlatsd= STDEV_WRITE_LATENCY", + "mlatavg= AVG_METADATA_LATENCY", "mlatsd= STDEV_METADATA_LATENCY", + "Default"] + curr_row1 = 0 + key = 0 + endwhile = False + while not endwhile: + global current_states, fs_list + fs_list = self.get_fs_names() + + if key == curses.KEY_UP and curr_row1 > 0: + curr_row1 -= 1 + elif key == curses.KEY_DOWN and curr_row1 < len(field_menu) - 1: + curr_row1 += 1 + elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list: + self.stdscr.erase() + if curr_row1 != len(field_menu) - 1: + current_states["last_field"] = (field_menu[curr_row1].split('='))[0] + else: + current_states["last_field"] = 'chit' + self.header.erase() # erase the previous text + if self.active_screen == FS_TOP_ALL_FS_APP: + self.run_all_display() + else: + self.run_display() + endwhile = True + elif key == ord('q'): + self.stdscr.erase() + if fs_list and self.active_screen == FS_TOP_FS_SELECTED_APP: + self.run_display() + else: + self.run_all_display() + endwhile = True + + try: + if not fs_list: + self.display_menu(stdscr) + else: + self.display_sort_menu(stdscr, curr_row1, field_menu) + except curses.error: + pass + curses.halfdelay(self.refresh_interval_secs) + key = stdscr.getch() + + def set_limit(self, stdscr): + key = '' + endwhile = False + while not endwhile: + stdscr.clear() + h, w = stdscr.getmaxyx() + title = 'Enter the limit you want to set (number) and press ENTER,'\ + ' press "d" for default, "q" to go back to previous screen ' + pos_x1 = w // 2 - len(title) // 2 + try: + stdscr.addstr(1, pos_x1, title, curses.A_STANDOUT | curses.A_BOLD) + except curses.error: + pass + curses.halfdelay(self.refresh_interval_secs) + inp = stdscr.getch() + if inp in [ord('d'), ord('q')] or ascii.isdigit(inp): + key = key + chr(inp) + if key == 'd': + current_states["limit"] = None + elif key == 'q': + endwhile = True + elif (key).isnumeric(): + i = 1 + length = 4 + while i <= length: + pos = w // 2 - len(key) // 2 + try: + stdscr.move(3, 0) + stdscr.clrtoeol() + stdscr.addstr(3, pos, key, curses.A_BOLD) + except curses.error: + pass + if key[i - 1] == '\n': + break + inp = stdscr.getch() + if inp == ord('q'): + if current_states['limit'] is None: + key = current_states["limit"] + else: + key = current_states['limit'] + " " + break + if inp == curses.KEY_RESIZE: + stdscr.clear() + windowsize = stdscr.getmaxyx() + wd = windowsize[1] - 1 + pos_x1 = wd // 2 - len(title) // 2 + try: + stdscr.addstr(1, pos_x1, title, curses.A_STANDOUT | curses.A_BOLD) + except curses.error: + pass + if inp == curses.KEY_BACKSPACE or inp == curses.KEY_DC or inp == 127: + if i > 1: + key = key[:-1] + i = i - 1 + stdscr.move(4, 0) + stdscr.clrtoeol() + elif i == 1: + 
curses.wrapper(self.set_limit) + elif i == length: + if inp == ord('\n'): + key = key + chr(inp) + i = i + 1 + else: + info = "Max length is reached, press Backspace" \ + " to edit or Enter to set the limit!" + pos = w // 2 - len(info) // 2 + try: + stdscr.addstr(4, pos, info, curses.A_BOLD) + except curses.error: + pass + elif ascii.isdigit(inp) or inp == ord('\n'): + key = key + chr(inp) + i = i + 1 + if key is None: + current_states["limit"] = key + elif int(key) != 0: + current_states["limit"] = key[:-1] + self.stdscr.erase() + self.header.erase() # erase the previous text + if self.active_screen == FS_TOP_ALL_FS_APP: + self.run_all_display() + else: + self.run_display() + + def set_option_all_fs(self, opt): + # sets the options for 'All Filesystem Info' screen + if opt == ord('m'): + if fs_list: + curses.wrapper(self.set_key) + else: + return False + elif opt == ord('s'): + if fs_list: + curses.wrapper(self.choose_field) + else: + return False + elif opt == ord('l'): + if fs_list: + curses.wrapper(self.set_limit) + else: + return False + elif opt == ord('r'): + if fs_list: + current_states['last_field'] = 'chit' + current_states["limit"] = None + return False # We are already in run_all_display() + elif opt == ord('q'): + quit() + return True + + def set_option_sel_fs(self, opt, selected_fs): + # sets the options for 'Selected Filesystem Info' screen + if opt == ord('m'): + if selected_fs in fs_list: + curses.wrapper(self.set_key) + else: + return False + elif opt == ord('s'): + if selected_fs in fs_list: + curses.wrapper(self.choose_field) + else: + return False + elif opt == ord('l'): + if selected_fs in fs_list: + curses.wrapper(self.set_limit) + else: + return False + elif opt == ord('r'): + if selected_fs in fs_list: + current_states['last_field'] = 'chit' + current_states["limit"] = None + return False # we are already in run_display() + elif opt == ord('q'): + self.run_all_display() + return True + + def verify_perf_stats_support(self): + mon_cmd = {'prefix': 'mgr module ls', 'format': 'json'} + try: + ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'') + except Exception as e: + raise FSTopException(f'error checking \'stats\' module: {e}') + if ret != 0: + raise FSTopException(f'error checking \'stats\' module: {out}') + if 'stats' not in json.loads(buf.decode('utf-8'))['enabled_modules']: + raise FSTopException('\'stats\' module not enabled. 
Use' + '\'ceph mgr module enable stats\' to enable') + + def perf_stats_query(self): + mgr_cmd = {'prefix': 'fs perf stats', 'format': 'json'} + try: + ret, buf, out = self.rados.mgr_command(json.dumps(mgr_cmd), b'') + except Exception as e: + raise FSTopException(f'error in \'perf stats\' query: {e}') + if ret != 0: + raise FSTopException(f'error in \'perf stats\' query: {out}') + return json.loads(buf.decode('utf-8')) + + def items(self, item): + if item == "CAP_HIT": + return "chit" + if item == "READ_LATENCY": + return "rlat" + if item == "WRITE_LATENCY": + return "wlat" + if item == "METADATA_LATENCY": + return "mlat" + if item == "DENTRY_LEASE": + return "dlease" + if item == "OPENED_FILES": + return "ofiles" + if item == "PINNED_ICAPS": + return "oicaps" + if item == "OPENED_INODES": + return "oinodes" + if item == "READ_IO_SIZES": + return "rtio" + if item == "WRITE_IO_SIZES": + return "wtio" + if item == 'AVG_READ_LATENCY': + return 'rlatavg' + if item == 'STDEV_READ_LATENCY': + return 'rlatsd' + if item == 'AVG_WRITE_LATENCY': + return 'wlatavg' + if item == 'STDEV_WRITE_LATENCY': + return 'wlatsd' + if item == 'AVG_METADATA_LATENCY': + return 'mlatavg' + if item == 'STDEV_METADATA_LATENCY': + return 'mlatsd' + else: + # return empty string for none type + return '' + + def mtype(self, typ): + if typ == MetricType.METRIC_TYPE_PERCENTAGE: + return "(%)" + elif typ == MetricType.METRIC_TYPE_LATENCY: + return "(ms)" + elif typ == MetricType.METRIC_TYPE_SIZE: + return "(MB)" + elif typ == MetricType.METRIC_TYPE_STDEV: + return "(ms)" + else: + # return empty string for none type + return '' + + def avg_items(self, item): + if item == "READ_IO_SIZES": + return "raio" + if item == "WRITE_IO_SIZES": + return "waio" + else: + # return empty string for none type + return '' + + def speed_items(self, item): + if item == "READ_IO_SIZES": + return "rsp" + if item == "WRITE_IO_SIZES": + return "wsp" + else: + # return empty string for none type + return '' + + def speed_mtype(self, typ): + if typ == MetricType.METRIC_TYPE_SIZE: + return "(MB/s)" + else: + # return empty string for none type + return '' + + def create_table_header(self): # formerly named as top_line + heading = [] + for item in MAIN_WINDOW_TOP_LINE_ITEMS_START: + heading.append(item) + + for item, typ in MAIN_WINDOW_TOP_LINE_METRICS.items(): + if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY: + continue + it = f'{self.items(item)}{self.mtype(typ)}' + heading.append(it) + + if item == "READ_IO_SIZES" or item == "WRITE_IO_SIZES": + # average io sizes + it = f'{self.avg_items(item)}{self.mtype(typ)}' + heading.append(it) + + # io speeds + it = f'{self.speed_items(item)}{self.speed_mtype(typ)}' + heading.append(it) + + for item in MAIN_WINDOW_TOP_LINE_ITEMS_END: + heading.append(item) + title = ITEMS_PAD.join(heading) + self.fsstats.addstr(self.tablehead_y, 0, title, curses.A_STANDOUT | curses.A_BOLD) + + def create_client(self, fs_name, client_id, metrics, counters, + client_meta, y_coord): + metrics_dict.setdefault(fs_name, {}) + metrics_dict[fs_name].setdefault(client_id, {}) + cur_time = time.time() + duration = cur_time - self.last_time + self.last_time = cur_time + xp = 0 # xp is incremented after each addstr to position the next incoming metrics. 
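+ # Unit cheat-sheet for the calc_* helpers used below (worked examples;
+ # latency counters are assumed to arrive as (seconds, nanoseconds)
+ # pairs and percentage counters as (hit, miss) pairs):
+ # calc_lat((0, 2500000)) -> 2.5 # ms
+ # calc_perc((9, 1)) -> 90.0 # %
+ # calc_size((0, 5242880)) -> 5.0 # MB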
+ for item in MAIN_WINDOW_TOP_LINE_ITEMS_START: # note: the first item is ITEMS_PAD + hlen = len(item) + ITEMS_PAD_LEN + if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID: + self.fsstats.addstr(y_coord, xp, + wrap(client_id.split('.')[1], hlen), curses.A_DIM) + elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT: + if FSTop.has_metric(client_meta, + CLIENT_METADATA_MOUNT_ROOT_KEY): + hlen = len(item) + ITEMS_PAD_LEN + self.fsstats.addstr( + y_coord, xp, + wrap(client_meta[CLIENT_METADATA_MOUNT_ROOT_KEY], hlen), curses.A_DIM) + else: + self.fsstats.addstr(y_coord, xp, "N/A", curses.A_DIM) + xp += hlen + + cidx = 0 + for item in counters: + if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY: + cidx += 1 + continue + m = metrics[cidx] + key = MGR_STATS_COUNTERS[cidx] + typ = MAIN_WINDOW_TOP_LINE_METRICS[key] + if item.lower() in client_meta.get( + CLIENT_METADATA_VALID_METRICS_KEY, []): + if typ == MetricType.METRIC_TYPE_PERCENTAGE: + perc = calc_perc(m) + metrics_dict[fs_name][client_id][self.items(item)] = perc + self.fsstats.addstr(y_coord, xp, + f'{perc}', curses.A_DIM) + xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN + elif typ == MetricType.METRIC_TYPE_LATENCY: + lat = calc_lat(m) + metrics_dict[fs_name][client_id][self.items(item)] = lat + self.fsstats.addstr(y_coord, xp, + f'{lat}', curses.A_DIM) + xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN + elif typ == MetricType.METRIC_TYPE_STDEV: + stdev = calc_stdev(m) + metrics_dict[fs_name][client_id][self.items(item)] = stdev + self.fsstats.addstr(y_coord, xp, + f'{stdev}', curses.A_DIM) + xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN + elif typ == MetricType.METRIC_TYPE_SIZE: + size = calc_size(m) + metrics_dict[fs_name][client_id][self.items(item)] = size + self.fsstats.addstr(y_coord, xp, + f'{size}', curses.A_DIM) + xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN + + # average io sizes + avg_size = calc_avg_size(m) + metrics_dict[fs_name][client_id][self.avg_items(key)] = avg_size + self.fsstats.addstr(y_coord, xp, + f'{avg_size}', curses.A_DIM) + xp += len(f'{self.avg_items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN + + # io speeds + size = 0 + if key == "READ_IO_SIZES": + if m[1] > 0: + last_size = self.last_read_size.get(client_id, 0) + size = m[1] - last_size + self.last_read_size[client_id] = m[1] + if key == "WRITE_IO_SIZES": + if m[1] > 0: + last_size = self.last_write_size.get(client_id, 0) + size = m[1] - last_size + self.last_write_size[client_id] = m[1] + speed = calc_speed(abs(size), duration) + metrics_dict[fs_name][client_id][self.speed_items(key)] = speed + self.fsstats.addstr(y_coord, xp, + f'{speed}', curses.A_DIM) + xp += len(f'{self.speed_items(item)}{self.speed_mtype(typ)}') + ITEMS_PAD_LEN + else: + # display 0th element from metric tuple + metrics_dict[fs_name][client_id][self.items(item)] = m[0] + self.fsstats.addstr(y_coord, xp, f'{m[0]}', curses.A_DIM) + xp += len(f'{self.items(item)}{self.mtype(typ)}') + ITEMS_PAD_LEN + else: + self.fsstats.addstr(y_coord, xp, "N/A", curses.A_DIM) + xp += len(self.items(item)) + ITEMS_PAD_LEN + cidx += 1 + + for item in MAIN_WINDOW_TOP_LINE_ITEMS_END: + wrapLen = self.PAD_WIDTH - xp + if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR: + if FSTop.has_metrics(client_meta, + [CLIENT_METADATA_MOUNT_POINT_KEY, + CLIENT_METADATA_HOSTNAME_KEY, + CLIENT_METADATA_IP_KEY]): + mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}@'\ + f'{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\ + 
f'{client_meta[CLIENT_METADATA_IP_KEY]}' + self.fsstats.addstr( + y_coord, xp, + wrap(mount_point, wrapLen), curses.A_DIM) + else: + self.fsstats.addstr(y_coord, xp, "N/A", curses.A_DIM) + xp += len(self.items(item)) + ITEMS_PAD_LEN + + def create_clients(self, stats_json, fs_name): + global metrics_dict, current_states + counters = [m.upper() for m in stats_json[GLOBAL_COUNTERS_KEY]] + self.tablehead_y += 2 + res = stats_json[GLOBAL_METRICS_KEY].get(fs_name, {}) + client_cnt = len(res) + self.fsstats.addstr(self.tablehead_y, 0, FS_TOP_NAME_TOPL_FMT.format( + fs_name=fs_name, client_count=client_cnt), curses.A_BOLD) + self.tablehead_y += 2 + metrics_dict_client = metrics_dict.get(fs_name, {}) + if len(metrics_dict) > len(fs_list): + stale_fs = set(metrics_dict) - set(fs_list) + for key in stale_fs: + del metrics_dict[key] + if len(metrics_dict_client) > client_cnt: + stale_clients = set(metrics_dict_client) - set(res) + for key in stale_clients: + del metrics_dict_client[key] + if client_cnt: + if len(metrics_dict_client) != client_cnt: + sort_list = sorted(list(res.keys())) + else: + sort_arg = current_states['last_field'] + sort_list = sorted(list(res.keys()), + key=lambda x: metrics_dict[fs_name].get(x, {}).get(sort_arg, 0), + reverse=True) + if current_states['limit'] is not None and int(current_states['limit']) < client_cnt: + sort_list = sort_list[0:int(current_states['limit'])] + for client_id in sort_list: + self.create_client( + fs_name, client_id, res.get(client_id, {}), counters, + stats_json[CLIENT_METADATA_KEY].get(fs_name, {}).get(client_id, {}), + self.tablehead_y) + self.tablehead_y += 1 + + def create_header(self, stats_json, help, screen_title="", color_id=0): + num_clients, num_mounts, num_kclients, num_libs = 0, 0, 0, 0 + if not stats_json['version'] == FS_TOP_SUPPORTED_VER: + self.header.addstr(0, 0, 'perf stats version mismatch!', curses.A_BOLD) + return False + global fs_list + for fs_name in fs_list: + client_metadata = stats_json[CLIENT_METADATA_KEY].get(fs_name, {}) + client_cnt = len(client_metadata) + if client_cnt: + num_clients = num_clients + client_cnt + num_mounts = num_mounts + len( + [client for client, metadata in client_metadata.items() if + CLIENT_METADATA_MOUNT_POINT_KEY in metadata + and metadata[CLIENT_METADATA_MOUNT_POINT_KEY] != 'N/A']) + num_kclients = num_kclients + len( + [client for client, metadata in client_metadata.items() if + "kernel_version" in metadata]) + num_libs = num_clients - (num_mounts + num_kclients) + now = datetime.now().ctime() + self.header.addstr(0, 0, FS_TOP_VERSION_HEADER_FMT.format(prog_name=FS_TOP_PROG_STR, + now=now), curses.A_BOLD) + self.header.addstr(2, 0, screen_title, curses.color_pair(color_id) | curses.A_BOLD) + self.header.addstr(3, 0, FS_TOP_CLIENT_HEADER_FMT.format(num_clients=num_clients, + num_mounts=num_mounts, + num_kclients=num_kclients, + num_libs=num_libs), curses.A_DIM) + self.header.addstr(4, 0, f"Filters: Sort - {current_states['last_field']}, " + f"Limit - {current_states['limit']}", curses.A_DIM) + self.header.addstr(5, 0, help, curses.A_DIM) + return True + + def run_display(self): + # clear the pads to have a smooth refresh + self.header.erase() + self.fsstats.erase() + + self.active_screen = FS_TOP_FS_SELECTED_APP + screen_title = "Selected Filesystem Info" + help_commands = "m - select a filesystem | s - sort menu | l - limit number of clients"\ + " | r - reset to default | q - home (All Filesystem Info) screen" + curses.init_pair(3, curses.COLOR_MAGENTA, -1) + + top, left = 0, 0 # where to 
place pad + vscrollOffset, hscrollOffset = 0, 0 # scroll offsets + + # calculate the initial viewport height and width + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + + # create header subpad + self.header_height = 7 + self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0) + + # create fsstats subpad + fsstats_begin_y = self.header_height + fsstats_height = self.PAD_HEIGHT - self.header_height + self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0) + + curses.halfdelay(1) + cmd = self.stdscr.getch() + global fs_list, current_states + while not self.exit_ev.is_set(): + fs_list = self.get_fs_names() + fs = current_states["last_fs"] + if cmd in [ord('m'), ord('s'), ord('l'), ord('r'), ord('q')]: + if self.set_option_sel_fs(cmd, fs): + self.exit_ev.set() + + stats_json = self.perf_stats_query() + vscrollEnd = 0 + if fs not in fs_list: + help = f"Error: The selected filesystem '{fs}' is not available now. " \ + "[Press 'q' to go back to home (All Filesystem Info) screen]" + # reset the sort/limit settings if fs_list is empty, otherwise continue the + # settings for the other filesystems. + if not fs_list: + current_states["last_field"] = 'chit' + current_states["limit"] = None + self.header.erase() # erase previous text + self.fsstats.erase() + self.create_header(stats_json, help, screen_title, 3) + else: + self.tablehead_y = 0 + help = "COMMANDS: " + help_commands + self.fsstats.erase() # erase previous text + + client_metadata = stats_json[GLOBAL_METRICS_KEY].get(fs, {}) + if current_states['limit'] is not None and \ + int(current_states['limit']) < len(client_metadata): + num_client = int(current_states['limit']) + else: + num_client = len(client_metadata) + vscrollEnd += num_client + if self.create_header(stats_json, help, screen_title, 3): + self.create_table_header() + self.create_clients(stats_json, fs) + + # scroll and refresh + if cmd == curses.KEY_DOWN: + if (vscrollEnd - vscrollOffset) > 1: + vscrollOffset += 1 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_UP: + if vscrollOffset > 0: + vscrollOffset -= 1 + elif cmd == curses.KEY_NPAGE: + if (vscrollEnd - vscrollOffset) / 20 > 1: + vscrollOffset += 20 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_PPAGE: + if vscrollOffset / 20 >= 1: + vscrollOffset -= 20 + else: + vscrollOffset = 0 + elif cmd == curses.KEY_RIGHT: + if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1: + hscrollOffset += 1 + elif cmd == curses.KEY_LEFT: + if hscrollOffset > 0: + hscrollOffset -= 1 + elif cmd == curses.KEY_HOME: + hscrollOffset = 0 + elif cmd == curses.KEY_END: + hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1 + elif cmd == curses.KEY_RESIZE: + # terminal resize event. 
Update the viewport dimensions + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + + if cmd: + try: + # refresh the viewport for the header portion + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE, + curses.KEY_RIGHT, + curses.KEY_LEFT]: + self.fstop_pad.refresh(0, 0, + top, left, + top + self.header_height, left + self.viewportWidth) + # refresh the viewport for the current table header portion in the fsstats pad + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE]: + self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset, + top + fsstats_begin_y, left, + 7, left + self.viewportWidth) + # refresh the viewport for the current client records portion in the fsstats pad + self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset, + top + fsstats_begin_y + 2, left, + top + self.viewportHeight, left + self.viewportWidth) + except curses.error: + # This happens when the user switches to a terminal of different zoom size. + # just retry it. + pass + # End scroll and refresh + + curses.halfdelay(self.refresh_interval_secs * 10) + cmd = self.stdscr.getch() + + def run_all_display(self): + # clear text from the previous screen + if self.active_screen == FS_TOP_FS_SELECTED_APP: + self.header.erase() + + self.active_screen = FS_TOP_ALL_FS_APP + screen_title = "All Filesystem Info" + curses.init_pair(2, curses.COLOR_CYAN, -1) + + top, left = 0, 0 # where to place pad + vscrollOffset, hscrollOffset = 0, 0 # scroll offsets + + # calculate the initial viewport height and width + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + + # create header subpad + self.header_height = 7 + self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0) + + # create fsstats subpad + fsstats_begin_y = self.header_height + fsstats_height = self.PAD_HEIGHT - self.header_height + self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0) + + curses.halfdelay(1) + cmd = self.stdscr.getch() + while not self.exit_ev.is_set(): + if cmd in [ord('m'), ord('s'), ord('l'), ord('r'), ord('q')]: + if self.set_option_all_fs(cmd): + self.exit_ev.set() + + # header display + global fs_list, current_states + fs_list = self.get_fs_names() + current_states["last_fs"] = fs_list + stats_json = self.perf_stats_query() + vscrollEnd = 0 + if not fs_list: + help = "INFO: No filesystem is available [Press 'q' to quit]" + # reset the sort/limit settings + current_states["last_field"] = 'chit' + current_states["limit"] = None + self.header.erase() # erase previous text + self.fsstats.erase() + self.create_header(stats_json, help, screen_title, 2) + else: + self.tablehead_y = 0 + num_client = 0 + help = "COMMANDS: m - select a filesystem | s - sort menu |"\ + " l - limit number of clients | r - reset to default | q - quit" + self.fsstats.erase() # erase previous text + for index, fs in enumerate(fs_list): + # Get the vscrollEnd in advance + client_metadata = stats_json[GLOBAL_METRICS_KEY].get(fs, {}) + if current_states['limit'] is not None and \ + int(current_states['limit']) < len(client_metadata): + num_client = int(current_states['limit']) + else: + num_client = len(client_metadata) + vscrollEnd += num_client + if self.create_header(stats_json, help, screen_title, 2): + if not index: # do it only for the first fs + self.create_table_header() + 
self.create_clients(stats_json, fs) + + # scroll and refresh + if cmd == curses.KEY_DOWN: + if (vscrollEnd - vscrollOffset) > 1: + vscrollOffset += 1 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_UP: + if vscrollOffset > 0: + vscrollOffset -= 1 + elif cmd == curses.KEY_NPAGE: + if (vscrollEnd - vscrollOffset) / 20 > 1: + vscrollOffset += 20 + else: + vscrollOffset = vscrollEnd + elif cmd == curses.KEY_PPAGE: + if vscrollOffset / 20 >= 1: + vscrollOffset -= 20 + else: + vscrollOffset = 0 + elif cmd == curses.KEY_RIGHT: + if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1: + hscrollOffset += 1 + elif cmd == curses.KEY_LEFT: + if hscrollOffset > 0: + hscrollOffset -= 1 + elif cmd == curses.KEY_HOME: + hscrollOffset = 0 + elif cmd == curses.KEY_END: + hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1 + elif cmd == curses.KEY_RESIZE: + # terminal resize event. Update the viewport dimensions + windowsize = self.stdscr.getmaxyx() + self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1 + if cmd: + try: + # refresh the viewport for the header portion + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE, + curses.KEY_RIGHT, + curses.KEY_LEFT]: + self.fstop_pad.refresh(0, 0, + top, left, + top + self.header_height, left + self.viewportWidth) + # refresh the viewport for the current table header portion in the fsstats pad + if cmd not in [curses.KEY_DOWN, + curses.KEY_UP, + curses.KEY_NPAGE, + curses.KEY_PPAGE]: + self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset, + top + fsstats_begin_y, left, + 7, left + self.viewportWidth) + # refresh the viewport for the current client records portion in the fsstats pad + self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset, + top + fsstats_begin_y + 2, left, + top + self.viewportHeight, left + self.viewportWidth) + except curses.error: + # This happens when the user switches to a terminal of different zoom size. + # just retry it. 
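The throughput columns rendered by create_client() above are not values reported by the MDS; they are derived client-side by caching each client's cumulative READ_IO_SIZES/WRITE_IO_SIZES byte counter from the previous refresh (last_read_size/last_write_size), differencing it against the current sample, and passing the delta plus the refresh duration to calc_speed(). A minimal standalone sketch of that delta-over-interval pattern; the RateTracker name is illustrative and not part of cephfs-top:

# Sketch: derive a per-client IO rate from a cumulative byte counter,
# mirroring the last_read_size/last_write_size bookkeeping in
# create_client(). RateTracker is a hypothetical helper name.
class RateTracker:
    def __init__(self):
        self.last = {}  # client_id -> cumulative bytes at last refresh

    def speed(self, client_id, cumulative_bytes, duration_secs):
        """Return bytes/sec transferred since the previous sample."""
        prev = self.last.get(client_id, 0)
        self.last[client_id] = cumulative_bytes
        if duration_secs <= 0:
            return 0.0
        # abs() tolerates counter resets (e.g. a client remount), like
        # the abs(size) passed to calc_speed() above.
        return abs(cumulative_bytes - prev) / duration_secs

tracker = RateTracker()
print(tracker.speed('client.4305', 10240, 1))  # first sample: 10240.0
print(tracker.speed('client.4305', 30720, 2))  # 20480 bytes over 2s: 10240.0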
+ pass + # End scroll and refresh + + curses.halfdelay(self.refresh_interval_secs * 10) + cmd = self.stdscr.getch() +# End class FSTop + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Ceph Filesystem top utility') + parser.add_argument('--cluster', nargs='?', const='ceph', default='ceph', + help='Ceph cluster to connect (default: ceph)') + parser.add_argument('--id', nargs='?', const='fstop', default='fstop', + help='Ceph user to use for connection (default: fstop)') + parser.add_argument('--conffile', nargs='?', default=None, + help='Path to cluster configuration file') + parser.add_argument('--selftest', dest='selftest', action='store_true', + help='Run in selftest mode') + parser.add_argument('-d', '--delay', metavar='DELAY', dest='delay', choices=range(1, 26), + default=DEFAULT_REFRESH_INTERVAL, + type=int, + help='Refresh interval in seconds ' + f'(default: {DEFAULT_REFRESH_INTERVAL}, range: 1 - 25)') + parser.add_argument('--dump', dest='dump', action='store_true', + help='Dump the metrics to stdout') + parser.add_argument('--dumpfs', action='append', + help='Dump the metrics of the given fs to stdout') + + args = parser.parse_args() + err = False + ft = FSTop(args) + try: + ft.init() + if args.selftest: + ft.selftest() + sys.stdout.write("selftest ok\n") + elif args.dump: + ft.dump_metrics_to_stdout() + elif args.dumpfs: + ft.dump_metrics_to_stdout(args.dumpfs[0]) + else: + curses.wrapper(ft.setup_curses) + except FSTopException as fst: + err = True + sys.stderr.write(f'{fst.get_error_msg()}\n') + except Exception as e: + err = True + sys.stderr.write(f'exception: {e}\n') + finally: + ft.fini() + sys.exit(0 if not err else -1) diff --git a/src/tools/cephfs/top/setup.py b/src/tools/cephfs/top/setup.py new file mode 100644 index 000000000..92fbd964c --- /dev/null +++ b/src/tools/cephfs/top/setup.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +from setuptools import setup + +__version__ = '0.0.1' + +setup( + name='cephfs-top', + version=__version__, + description='top(1) like utility for Ceph Filesystem', + keywords='cephfs, top', + scripts=['cephfs-top'], + install_requires=[ + 'rados', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python :: 3' + ], + license='LGPLv2+', +) diff --git a/src/tools/cephfs/top/tox.ini b/src/tools/cephfs/top/tox.ini new file mode 100644 index 000000000..b125c0bc8 --- /dev/null +++ b/src/tools/cephfs/top/tox.ini @@ -0,0 +1,7 @@ +[tox] +envlist = py3 +skipsdist = true + +[testenv:py3] +deps = flake8 +commands = flake8 --ignore=W503 --max-line-length=100 cephfs-top diff --git a/src/tools/cephfs/type_helper.hpp b/src/tools/cephfs/type_helper.hpp new file mode 100644 index 000000000..2ec77c25c --- /dev/null +++ b/src/tools/cephfs/type_helper.hpp @@ -0,0 +1,28 @@ +#ifndef TYPE_HELPER_HPP__ +#define TYPE_HELPER_HPP__ + +template<typename T1, typename T2> +T1 conv_t(T2 s){ + T1 target; + std::stringstream conv; + conv << s; + conv >> target; + return target; +} + +void string_split(std::string str, std::vector<std::string>& out, std::string split = ":") { + std::cout << str << std::endl; + auto pos = str.find(split); + while(pos != std::string::npos){ + std::cout << str.substr(0, pos) << std::endl; + out.push_back(str.substr(0, pos)); + if (str.size() > pos + split.size()){ + str = str.substr(pos + split.size()); + pos =
str.find(split); + }else + return; + } + out.push_back(str.substr()); + return; +} +#endif // TYPE_HELPER_HPP__ diff --git a/src/tools/cephfs_mirror/CMakeLists.txt b/src/tools/cephfs_mirror/CMakeLists.txt new file mode 100644 index 000000000..4b6dea7a1 --- /dev/null +++ b/src/tools/cephfs_mirror/CMakeLists.txt @@ -0,0 +1,30 @@ +set(cephfs_mirror_internal + ClusterWatcher.cc + Mirror.cc + FSMirror.cc + InstanceWatcher.cc + MirrorWatcher.cc + PeerReplayer.cc + ServiceDaemon.cc + Types.cc + Utils.cc + Watcher.cc + watcher/RewatchRequest.cc) + +add_executable(cephfs-mirror + main.cc) + +add_library(cephfs_mirror_internal STATIC + ${cephfs_mirror_internal}) + +target_link_libraries(cephfs-mirror + cephfs_mirror_internal + global + ceph-common + cls_cephfs_client + librados + mds + cephfs + ${ALLOC_LIBS}) + +install(TARGETS cephfs-mirror DESTINATION bin) diff --git a/src/tools/cephfs_mirror/ClusterWatcher.cc b/src/tools/cephfs_mirror/ClusterWatcher.cc new file mode 100644 index 000000000..b5f6f81d7 --- /dev/null +++ b/src/tools/cephfs_mirror/ClusterWatcher.cc @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "common/ceph_context.h" +#include "common/debug.h" +#include "common/errno.h" +#include "mon/MonClient.h" + +#include "ClusterWatcher.h" +#include "ServiceDaemon.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_cephfs_mirror +#undef dout_prefix +#define dout_prefix *_dout << "cephfs::mirror::ClusterWatcher " << __func__ + +namespace cephfs { +namespace mirror { + +ClusterWatcher::ClusterWatcher(CephContext *cct, MonClient *monc, ServiceDaemon *service_daemon, + Listener &listener) + : Dispatcher(cct), + m_monc(monc), + m_service_daemon(service_daemon), + m_listener(listener) { +} + +ClusterWatcher::~ClusterWatcher() { +} + +bool ClusterWatcher::ms_can_fast_dispatch2(const cref_t &m) const { + return m->get_type() == CEPH_MSG_FS_MAP; +} + +void ClusterWatcher::ms_fast_dispatch2(const ref_t &m) { + bool handled = ms_dispatch2(m); + ceph_assert(handled); +} + +bool ClusterWatcher::ms_dispatch2(const ref_t &m) { + if (m->get_type() == CEPH_MSG_FS_MAP) { + if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + handle_fsmap(ref_cast(m)); + } + return true; + } + + return false; +} + +int ClusterWatcher::init() { + dout(20) << dendl; + + bool sub = m_monc->sub_want("fsmap", 0, 0); + if (!sub) { + derr << ": failed subscribing to FSMap" << dendl; + return -1; + } + + m_monc->renew_subs(); + dout(10) << ": subscribed to FSMap" << dendl; + return 0; +} + +void ClusterWatcher::shutdown() { + dout(20) << dendl; + std::scoped_lock locker(m_lock); + m_stopping = true; + m_monc->sub_unwant("fsmap"); +} + +void ClusterWatcher::handle_fsmap(const cref_t &m) { + dout(20) << dendl; + + auto fsmap = m->get_fsmap(); + auto filesystems = fsmap.get_filesystems(); + + std::vector mirroring_enabled; + std::vector mirroring_disabled; + std::map peers_added; + std::map peers_removed; + std::map fs_metadata_pools; + { + std::scoped_lock locker(m_lock); + if (m_stopping) { + return; + } + + // deleted filesystems are considered mirroring disabled + for (auto it = m_filesystem_peers.begin(); it != m_filesystem_peers.end();) { + if (!fsmap.filesystem_exists(it->first.fscid)) { + mirroring_disabled.emplace_back(it->first); + it = m_filesystem_peers.erase(it); + continue; + } + ++it; + } + + for (auto &filesystem : filesystems) { + auto fs = Filesystem{filesystem->fscid, + 
std::string(filesystem->mds_map.get_fs_name())}; + auto pool_id = filesystem->mds_map.get_metadata_pool(); + auto &mirror_info = filesystem->mirror_info; + + if (!mirror_info.is_mirrored()) { + auto it = m_filesystem_peers.find(fs); + if (it != m_filesystem_peers.end()) { + mirroring_disabled.emplace_back(fs); + m_filesystem_peers.erase(it); + } + } else { + auto [fspeersit, enabled] = m_filesystem_peers.emplace(fs, Peers{}); + auto &peers = fspeersit->second; + + if (enabled) { + mirroring_enabled.emplace_back(fs); + fs_metadata_pools.emplace(fs, pool_id); + } + + // peers added + Peers added; + std::set_difference(mirror_info.peers.begin(), mirror_info.peers.end(), + peers.begin(), peers.end(), std::inserter(added, added.end())); + + // peers removed + Peers removed; + std::set_difference(peers.begin(), peers.end(), + mirror_info.peers.begin(), mirror_info.peers.end(), + std::inserter(removed, removed.end())); + + // update set + if (!added.empty()) { + peers_added.emplace(fs, added); + peers.insert(added.begin(), added.end()); + } + if (!removed.empty()) { + peers_removed.emplace(fs, removed); + for (auto &p : removed) { + peers.erase(p); + } + } + } + } + } + + dout(5) << ": mirroring enabled=" << mirroring_enabled << ", mirroring_disabled=" + << mirroring_disabled << dendl; + for (auto &fs : mirroring_enabled) { + m_service_daemon->add_filesystem(fs.fscid, fs.fs_name); + m_listener.handle_mirroring_enabled(FilesystemSpec(fs, fs_metadata_pools.at(fs))); + } + for (auto &fs : mirroring_disabled) { + m_service_daemon->remove_filesystem(fs.fscid); + m_listener.handle_mirroring_disabled(fs); + } + + dout(5) << ": peers added=" << peers_added << ", peers removed=" << peers_removed << dendl; + + for (auto &[fs, peers] : peers_added) { + for (auto &peer : peers) { + m_service_daemon->add_peer(fs.fscid, peer); + m_listener.handle_peers_added(fs, peer); + } + } + for (auto &[fs, peers] : peers_removed) { + for (auto &peer : peers) { + m_service_daemon->remove_peer(fs.fscid, peer); + m_listener.handle_peers_removed(fs, peer); + } + } + + std::scoped_lock locker(m_lock); + if (!m_stopping) { + m_monc->sub_got("fsmap", fsmap.get_epoch()); + } // else we have already done a sub_unwant() +} + +} // namespace mirror +} // namespace cephfs diff --git a/src/tools/cephfs_mirror/ClusterWatcher.h b/src/tools/cephfs_mirror/ClusterWatcher.h new file mode 100644 index 000000000..a418898f5 --- /dev/null +++ b/src/tools/cephfs_mirror/ClusterWatcher.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPHFS_MIRROR_CLUSTER_WATCHER_H +#define CEPHFS_MIRROR_CLUSTER_WATCHER_H + +#include + +#include "common/ceph_mutex.h" +#include "common/async/context_pool.h" +#include "messages/MFSMap.h" +#include "msg/Dispatcher.h" +#include "Types.h" + +class MonClient; + +namespace cephfs { +namespace mirror { + +class ServiceDaemon; + +// watch peer changes for filesystems via FSMap updates + +class ClusterWatcher : public Dispatcher { +public: + struct Listener { + virtual ~Listener() { + } + + virtual void handle_mirroring_enabled(const FilesystemSpec &spec) = 0; + virtual void handle_mirroring_disabled(const Filesystem &filesystem) = 0; + + virtual void handle_peers_added(const Filesystem &filesystem, const Peer &peer) = 0; + virtual void handle_peers_removed(const Filesystem &filesystem, const Peer &peer) = 0; + }; + + ClusterWatcher(CephContext *cct, MonClient *monc, ServiceDaemon *service_daemon, + Listener &listener); + ~ClusterWatcher(); + + 
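handle_fsmap() above reduces each FSMap update to two deltas per filesystem: peers present in the map but not in the cached Peers set were added, and peers present only in the cache were removed; the cached set is then patched to match. The same reconciliation stated with Python sets (a sketch; the peer names are made up):

# Sketch: the pair of std::set_difference() calls in handle_fsmap(),
# reduced to Python set operations.
def reconcile(cached, from_fsmap):
    added = from_fsmap - cached
    removed = cached - from_fsmap
    cached |= added      # update set: insert new peers
    cached -= removed    # update set: drop vanished peers
    return added, removed

peers = {'site-a', 'site-b'}
added, removed = reconcile(peers, {'site-b', 'site-c'})
print(sorted(added), sorted(removed))  # ['site-c'] ['site-a']
print(sorted(peers))                   # ['site-b', 'site-c']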
bool ms_can_fast_dispatch_any() const override { + return true; + } + bool ms_can_fast_dispatch2(const cref_t &m) const override; + void ms_fast_dispatch2(const ref_t &m) override; + bool ms_dispatch2(const ref_t &m) override; + + void ms_handle_connect(Connection *c) override { + } + bool ms_handle_reset(Connection *c) override { + return false; + } + void ms_handle_remote_reset(Connection *c) override { + } + bool ms_handle_refused(Connection *c) override { + return false; + } + + int init(); + void shutdown(); + +private: + ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::cluster_watcher"); + MonClient *m_monc; + ServiceDaemon *m_service_daemon; + Listener &m_listener; + + bool m_stopping = false; + std::map m_filesystem_peers; + + void handle_fsmap(const cref_t &m); +}; + +} // namespace mirror +} // namespace cephfs + +#endif // CEPHFS_MIRROR_CLUSTER_WATCHER_H diff --git a/src/tools/cephfs_mirror/FSMirror.cc b/src/tools/cephfs_mirror/FSMirror.cc new file mode 100644 index 000000000..7ea798e6b --- /dev/null +++ b/src/tools/cephfs_mirror/FSMirror.cc @@ -0,0 +1,444 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/admin_socket.h" +#include "common/ceph_argparse.h" +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "include/stringify.h" +#include "msg/Messenger.h" +#include "FSMirror.h" +#include "PeerReplayer.h" +#include "aio_utils.h" +#include "ServiceDaemon.h" +#include "Utils.h" + +#include "common/Cond.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_cephfs_mirror +#undef dout_prefix +#define dout_prefix *_dout << "cephfs::mirror::FSMirror " << __func__ + +using namespace std; + +namespace cephfs { +namespace mirror { + +namespace { + +const std::string SERVICE_DAEMON_DIR_COUNT_KEY("directory_count"); +const std::string SERVICE_DAEMON_PEER_INIT_FAILED_KEY("peer_init_failed"); + +class MirrorAdminSocketCommand { +public: + virtual ~MirrorAdminSocketCommand() { + } + virtual int call(Formatter *f) = 0; +}; + +class StatusCommand : public MirrorAdminSocketCommand { +public: + explicit StatusCommand(FSMirror *fs_mirror) + : fs_mirror(fs_mirror) { + } + + int call(Formatter *f) override { + fs_mirror->mirror_status(f); + return 0; + } + +private: + FSMirror *fs_mirror; +}; + +} // anonymous namespace + +class MirrorAdminSocketHook : public AdminSocketHook { +public: + MirrorAdminSocketHook(CephContext *cct, const Filesystem &filesystem, FSMirror *fs_mirror) + : admin_socket(cct->get_admin_socket()) { + int r; + std::string cmd; + + // mirror status format is name@fscid + cmd = "fs mirror status " + stringify(filesystem.fs_name) + "@" + stringify(filesystem.fscid); + r = admin_socket->register_command( + cmd, this, "get filesystem mirror status"); + if (r == 0) { + commands[cmd] = new StatusCommand(fs_mirror); + } + } + + ~MirrorAdminSocketHook() override { + admin_socket->unregister_commands(this); + for (auto &[command, cmdptr] : commands) { + delete cmdptr; + } + } + + int call(std::string_view command, const cmdmap_t& cmdmap, + const bufferlist&, + Formatter *f, std::ostream &errss, bufferlist &out) override { + auto p = commands.at(std::string(command)); + return p->call(f); + } + +private: + typedef std::map> Commands; + + AdminSocket *admin_socket; + Commands commands; +}; + +FSMirror::FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool_id, + 
ServiceDaemon *service_daemon, std::vector args, + ContextWQ *work_queue) + : m_cct(cct), + m_filesystem(filesystem), + m_pool_id(pool_id), + m_service_daemon(service_daemon), + m_args(args), + m_work_queue(work_queue), + m_snap_listener(this), + m_asok_hook(new MirrorAdminSocketHook(cct, filesystem, this)) { + m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY, + (uint64_t)0); +} + +FSMirror::~FSMirror() { + dout(20) << dendl; + + { + std::scoped_lock locker(m_lock); + delete m_instance_watcher; + delete m_mirror_watcher; + } + // outside the lock so that in-progress commands can acquire + // lock and finish executing. + delete m_asok_hook; +} + +int FSMirror::init_replayer(PeerReplayer *peer_replayer) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + return peer_replayer->init(); +} + +void FSMirror::shutdown_replayer(PeerReplayer *peer_replayer) { + peer_replayer->shutdown(); +} + +void FSMirror::cleanup() { + dout(20) << dendl; + ceph_unmount(m_mount); + ceph_release(m_mount); + m_ioctx.close(); + m_cluster.reset(); +} + +void FSMirror::reopen_logs() { + std::scoped_lock locker(m_lock); + + if (m_cluster) { + reinterpret_cast(m_cluster->cct())->reopen_logs(); + } + for (auto &[peer, replayer] : m_peer_replayers) { + replayer->reopen_logs(); + } +} + +void FSMirror::init(Context *on_finish) { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + int r = connect(g_ceph_context->_conf->name.to_str(), + g_ceph_context->_conf->cluster, &m_cluster, "", "", m_args); + if (r < 0) { + m_init_failed = true; + on_finish->complete(r); + return; + } + + r = m_cluster->ioctx_create2(m_pool_id, m_ioctx); + if (r < 0) { + m_init_failed = true; + m_cluster.reset(); + derr << ": error accessing local pool (id=" << m_pool_id << "): " + << cpp_strerror(r) << dendl; + on_finish->complete(r); + return; + } + + r = mount(m_cluster, m_filesystem, true, &m_mount); + if (r < 0) { + m_init_failed = true; + m_ioctx.close(); + m_cluster.reset(); + on_finish->complete(r); + return; + } + + m_addrs = m_cluster->get_addrs(); + dout(10) << ": rados addrs=" << m_addrs << dendl; + + init_instance_watcher(on_finish); +} + +void FSMirror::shutdown(Context *on_finish) { + dout(20) << dendl; + + { + std::scoped_lock locker(m_lock); + m_stopping = true; + if (m_on_init_finish != nullptr) { + dout(10) << ": delaying shutdown -- init in progress" << dendl; + m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) { + if (r < 0) { + on_finish->complete(0); + return; + } + m_on_shutdown_finish = on_finish; + shutdown_peer_replayers(); + }); + return; + } + + m_on_shutdown_finish = on_finish; + } + + shutdown_peer_replayers(); +} + +void FSMirror::shutdown_peer_replayers() { + dout(20) << dendl; + + for (auto &[peer, peer_replayer] : m_peer_replayers) { + dout(5) << ": shutting down replayer for peer=" << peer << dendl; + shutdown_replayer(peer_replayer.get()); + } + m_peer_replayers.clear(); + + shutdown_mirror_watcher(); +} + +void FSMirror::init_instance_watcher(Context *on_finish) { + dout(20) << dendl; + + m_on_init_finish = new LambdaContext([this, on_finish](int r) { + { + std::scoped_lock locker(m_lock); + if (r < 0) { + m_init_failed = true; + } + } + on_finish->complete(r); + if (m_on_shutdown_finish != nullptr) { + m_on_shutdown_finish->complete(r); + } + }); + + Context *ctx = new C_CallbackAdapter< + FSMirror, &FSMirror::handle_init_instance_watcher>(this); + m_instance_watcher = InstanceWatcher::create(m_ioctx, m_snap_listener, m_work_queue); + 
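FSMirror::shutdown() above, like the watcher classes later in this patch, must cope with a shutdown request racing an unfinished init(): rather than tearing down immediately, it parks the shutdown continuation and lets the init completion path fire it. The control flow, heavily simplified into plain Python callables (no Ceph Context machinery; all names here are illustrative):

# Sketch: park a shutdown that arrives while init is still in flight,
# and re-drive it from the init completion path.
class Component:
    def __init__(self):
        self._on_init_finish = None
        self._on_shutdown_finish = None

    def init(self, on_finish):
        self._on_init_finish = on_finish
        # ... kick off async initialization; handle_init_done() runs later

    def handle_init_done(self, r):
        cb, self._on_init_finish = self._on_init_finish, None
        cb(r)
        if self._on_shutdown_finish is not None:
            cb, self._on_shutdown_finish = self._on_shutdown_finish, None
            self.shutdown(cb)  # now safe to run the parked shutdown

    def shutdown(self, on_finish):
        if self._on_init_finish is not None:
            self._on_shutdown_finish = on_finish  # init in flight: defer
            return
        on_finish(0)  # ... tear down, then complete

c = Component()
c.init(lambda r: print('init done:', r))
c.shutdown(lambda r: print('shutdown done:', r))  # parked until init ends
c.handle_init_done(0)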
m_instance_watcher->init(ctx); +} + +void FSMirror::handle_init_instance_watcher(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_init_finish = nullptr; + { + std::scoped_lock locker(m_lock); + if (r < 0) { + std::swap(on_init_finish, m_on_init_finish); + } + } + + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + return; + } + + init_mirror_watcher(); +} + +void FSMirror::init_mirror_watcher() { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + Context *ctx = new C_CallbackAdapter< + FSMirror, &FSMirror::handle_init_mirror_watcher>(this); + m_mirror_watcher = MirrorWatcher::create(m_ioctx, this, m_work_queue); + m_mirror_watcher->init(ctx); +} + +void FSMirror::handle_init_mirror_watcher(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_init_finish = nullptr; + { + std::scoped_lock locker(m_lock); + if (r == 0) { + std::swap(on_init_finish, m_on_init_finish); + } + } + + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + return; + } + + m_retval = r; // save errcode for init context callback + shutdown_instance_watcher(); +} + +void FSMirror::shutdown_mirror_watcher() { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + Context *ctx = new C_CallbackAdapter< + FSMirror, &FSMirror::handle_shutdown_mirror_watcher>(this); + m_mirror_watcher->shutdown(ctx); +} + +void FSMirror::handle_shutdown_mirror_watcher(int r) { + dout(20) << ": r=" << r << dendl; + + shutdown_instance_watcher(); +} + +void FSMirror::shutdown_instance_watcher() { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + Context *ctx = new C_CallbackAdapter< + FSMirror, &FSMirror::handle_shutdown_instance_watcher>(this); + m_instance_watcher->shutdown(new C_AsyncCallback(m_work_queue, ctx)); +} + +void FSMirror::handle_shutdown_instance_watcher(int r) { + dout(20) << ": r=" << r << dendl; + + cleanup(); + + Context *on_init_finish = nullptr; + Context *on_shutdown_finish = nullptr; + + { + std::scoped_lock locker(m_lock); + std::swap(on_init_finish, m_on_init_finish); + std::swap(on_shutdown_finish, m_on_shutdown_finish); + } + + if (on_init_finish != nullptr) { + on_init_finish->complete(m_retval); + } + if (on_shutdown_finish != nullptr) { + on_shutdown_finish->complete(r); + } +} + +void FSMirror::handle_acquire_directory(string_view dir_path) { + dout(5) << ": dir_path=" << dir_path << dendl; + + { + std::scoped_lock locker(m_lock); + m_directories.emplace(dir_path); + m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY, + m_directories.size()); + + for (auto &[peer, peer_replayer] : m_peer_replayers) { + dout(10) << ": peer=" << peer << dendl; + peer_replayer->add_directory(dir_path); + } + } +} + +void FSMirror::handle_release_directory(string_view dir_path) { + dout(5) << ": dir_path=" << dir_path << dendl; + + { + std::scoped_lock locker(m_lock); + auto it = m_directories.find(dir_path); + if (it != m_directories.end()) { + m_directories.erase(it); + m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY, + m_directories.size()); + for (auto &[peer, peer_replayer] : m_peer_replayers) { + dout(10) << ": peer=" << peer << dendl; + peer_replayer->remove_directory(dir_path); + } + } + } +} + +void FSMirror::add_peer(const Peer &peer) { + dout(10) << ": peer=" << peer << dendl; + + std::scoped_lock locker(m_lock); + m_all_peers.emplace(peer); + if (m_peer_replayers.find(peer) != m_peer_replayers.end()) { + return; + } + + auto replayer = std::make_unique( + m_cct, 
this, m_cluster, m_filesystem, peer, m_directories, m_mount, m_service_daemon); + int r = init_replayer(replayer.get()); + if (r < 0) { + m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, peer, + SERVICE_DAEMON_PEER_INIT_FAILED_KEY, + true); + return; + } + m_peer_replayers.emplace(peer, std::move(replayer)); + ceph_assert(m_peer_replayers.size() == 1); // support only a single peer +} + +void FSMirror::remove_peer(const Peer &peer) { + dout(10) << ": peer=" << peer << dendl; + + std::unique_ptr replayer; + { + std::scoped_lock locker(m_lock); + m_all_peers.erase(peer); + auto it = m_peer_replayers.find(peer); + if (it != m_peer_replayers.end()) { + replayer = std::move(it->second); + m_peer_replayers.erase(it); + } + } + + if (replayer) { + dout(5) << ": shutting down replayers for peer=" << peer << dendl; + shutdown_replayer(replayer.get()); + } +} + +void FSMirror::mirror_status(Formatter *f) { + std::scoped_lock locker(m_lock); + f->open_object_section("status"); + if (m_init_failed) { + f->dump_string("state", "failed"); + } else if (is_blocklisted(locker)) { + f->dump_string("state", "blocklisted"); + } else { + // dump rados addr for blocklist test + f->dump_string("rados_inst", m_addrs); + f->open_object_section("peers"); + for ([[maybe_unused]] auto &[peer, peer_replayer] : m_peer_replayers) { + peer.dump(f); + } + f->close_section(); // peers + f->open_object_section("snap_dirs"); + f->dump_int("dir_count", m_directories.size()); + f->close_section(); // snap_dirs + } + f->close_section(); // status +} + + +} // namespace mirror +} // namespace cephfs diff --git a/src/tools/cephfs_mirror/FSMirror.h b/src/tools/cephfs_mirror/FSMirror.h new file mode 100644 index 000000000..a9c1fab10 --- /dev/null +++ b/src/tools/cephfs_mirror/FSMirror.h @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPHFS_MIRROR_FS_MIRROR_H +#define CEPHFS_MIRROR_FS_MIRROR_H + +#include "common/Formatter.h" +#include "common/Thread.h" +#include "mds/FSMap.h" +#include "Types.h" +#include "InstanceWatcher.h" +#include "MirrorWatcher.h" + +class ContextWQ; + +namespace cephfs { +namespace mirror { + +class MirrorAdminSocketHook; +class PeerReplayer; +class ServiceDaemon; + +// handle mirroring for a filesystem to a set of peers + +class FSMirror { +public: + FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool_id, + ServiceDaemon *service_daemon, std::vector args, + ContextWQ *work_queue); + ~FSMirror(); + + void init(Context *on_finish); + void shutdown(Context *on_finish); + + void add_peer(const Peer &peer); + void remove_peer(const Peer &peer); + + bool is_stopping() { + std::scoped_lock locker(m_lock); + return m_stopping; + } + + bool is_init_failed() { + std::scoped_lock locker(m_lock); + return m_init_failed; + } + + bool is_failed() { + std::scoped_lock locker(m_lock); + return m_init_failed || + m_instance_watcher->is_failed() || + m_mirror_watcher->is_failed(); + } + + utime_t get_failed_ts() { + std::scoped_lock locker(m_lock); + if (m_instance_watcher) { + return m_instance_watcher->get_failed_ts(); + } + if (m_mirror_watcher) { + return m_mirror_watcher->get_failed_ts(); + } + + return utime_t(); + } + + bool is_blocklisted() { + std::scoped_lock locker(m_lock); + return is_blocklisted(locker); + } + + utime_t get_blocklisted_ts() { + std::scoped_lock locker(m_lock); + if (m_instance_watcher) { + return m_instance_watcher->get_blocklisted_ts(); + } + if (m_mirror_watcher) { + return 
m_mirror_watcher->get_blocklisted_ts(); + } + + return utime_t(); + } + + Peers get_peers() { + std::scoped_lock locker(m_lock); + return m_all_peers; + } + + std::string get_instance_addr() { + std::scoped_lock locker(m_lock); + return m_addrs; + } + + // admin socket helpers + void mirror_status(Formatter *f); + + void reopen_logs(); + +private: + bool is_blocklisted(const std::scoped_lock &locker) const { + bool blocklisted = false; + if (m_instance_watcher) { + blocklisted = m_instance_watcher->is_blocklisted(); + } + if (m_mirror_watcher) { + blocklisted |= m_mirror_watcher->is_blocklisted(); + } + + return blocklisted; + } + + struct SnapListener : public InstanceWatcher::Listener { + FSMirror *fs_mirror; + + SnapListener(FSMirror *fs_mirror) + : fs_mirror(fs_mirror) { + } + + void acquire_directory(std::string_view dir_path) override { + fs_mirror->handle_acquire_directory(dir_path); + } + + void release_directory(std::string_view dir_path) override { + fs_mirror->handle_release_directory(dir_path); + } + }; + + CephContext *m_cct; + Filesystem m_filesystem; + uint64_t m_pool_id; + ServiceDaemon *m_service_daemon; + std::vector m_args; + ContextWQ *m_work_queue; + + ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::fs_mirror"); + SnapListener m_snap_listener; + std::set> m_directories; + Peers m_all_peers; + std::map> m_peer_replayers; + + RadosRef m_cluster; + std::string m_addrs; + librados::IoCtx m_ioctx; + InstanceWatcher *m_instance_watcher = nullptr; + MirrorWatcher *m_mirror_watcher = nullptr; + + int m_retval = 0; + bool m_stopping = false; + bool m_init_failed = false; + Context *m_on_init_finish = nullptr; + Context *m_on_shutdown_finish = nullptr; + + MirrorAdminSocketHook *m_asok_hook = nullptr; + + MountRef m_mount; + + int init_replayer(PeerReplayer *peer_replayer); + void shutdown_replayer(PeerReplayer *peer_replayer); + void cleanup(); + + void init_instance_watcher(Context *on_finish); + void handle_init_instance_watcher(int r); + + void init_mirror_watcher(); + void handle_init_mirror_watcher(int r); + + void shutdown_peer_replayers(); + + void shutdown_mirror_watcher(); + void handle_shutdown_mirror_watcher(int r); + + void shutdown_instance_watcher(); + void handle_shutdown_instance_watcher(int r); + + void handle_acquire_directory(std::string_view dir_path); + void handle_release_directory(std::string_view dir_path); +}; + +} // namespace mirror +} // namespace cephfs + +#endif // CEPHFS_MIRROR_FS_MIRROR_H diff --git a/src/tools/cephfs_mirror/InstanceWatcher.cc b/src/tools/cephfs_mirror/InstanceWatcher.cc new file mode 100644 index 000000000..b6a51a141 --- /dev/null +++ b/src/tools/cephfs_mirror/InstanceWatcher.cc @@ -0,0 +1,256 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_context.h" +#include "common/ceph_json.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/WorkQueue.h" +#include "cls/cephfs/cls_cephfs_client.h" +#include "include/stringify.h" +#include "aio_utils.h" +#include "InstanceWatcher.h" +#include "Types.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_cephfs_mirror +#undef dout_prefix +#define dout_prefix *_dout << "cephfs::mirror::InstanceWatcher " << __func__ + +using namespace std; + +namespace cephfs { +namespace mirror { + +namespace { + +std::string instance_oid(const std::string &instance_id) { + return CEPHFS_MIRROR_OBJECT + "." 
+ instance_id; +} + +} // anonymous namespace + +InstanceWatcher::InstanceWatcher(librados::IoCtx &ioctx, + Listener &listener, ContextWQ *work_queue) + : Watcher(ioctx, instance_oid(stringify(ioctx.get_instance_id())), work_queue), + m_ioctx(ioctx), + m_listener(listener), + m_work_queue(work_queue), + m_lock(ceph::make_mutex("cephfs::mirror::instance_watcher")) { +} + +InstanceWatcher::~InstanceWatcher() { +} + +void InstanceWatcher::init(Context *on_finish) { + dout(20) << dendl; + + { + std::scoped_lock locker(m_lock); + ceph_assert(m_on_init_finish == nullptr); + m_on_init_finish = new LambdaContext([this, on_finish](int r) { + on_finish->complete(r); + if (m_on_shutdown_finish != nullptr) { + m_on_shutdown_finish->complete(0); + } + }); + } + + create_instance(); +} + +void InstanceWatcher::shutdown(Context *on_finish) { + dout(20) << dendl; + + { + std::scoped_lock locker(m_lock); + ceph_assert(m_on_shutdown_finish == nullptr); + if (m_on_init_finish != nullptr) { + dout(10) << ": delaying shutdown -- init in progress" << dendl; + m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) { + m_on_shutdown_finish = nullptr; + shutdown(on_finish); + }); + return; + } + + m_on_shutdown_finish = on_finish; + } + + unregister_watcher(); +} + +void InstanceWatcher::handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist& bl) { + dout(20) << dendl; + + std::string dir_path; + std::string mode; + try { + JSONDecoder jd(bl); + JSONDecoder::decode_json("dir_path", dir_path, &jd.parser, true); + JSONDecoder::decode_json("mode", mode, &jd.parser, true); + } catch (const JSONDecoder::err &e) { + derr << ": failed to decode notify json: " << e.what() << dendl; + } + + dout(20) << ": notifier_id=" << notifier_id << ", dir_path=" << dir_path + << ", mode=" << mode << dendl; + + if (mode == "acquire") { + m_listener.acquire_directory(dir_path); + } else if (mode == "release") { + m_listener.release_directory(dir_path); + } else { + derr << ": unknown mode" << dendl; + } + + bufferlist outbl; + acknowledge_notify(notify_id, handle, outbl); +} + +void InstanceWatcher::handle_rewatch_complete(int r) { + dout(5) << ": r=" << r << dendl; + + if (r == -EBLOCKLISTED) { + dout(0) << ": client blocklisted" <); + int r = m_ioctx.aio_operate(m_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +void InstanceWatcher::handle_create_instance(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_init_finish = nullptr; + { + std::scoped_lock locker(m_lock); + if (r < 0) { + std::swap(on_init_finish, m_on_init_finish); + } + } + + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + return; + } + + register_watcher(); +} + +void InstanceWatcher::register_watcher() { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + Context *on_finish = new C_CallbackAdapter< + InstanceWatcher, &InstanceWatcher::handle_register_watcher>(this); + register_watch(on_finish); +} + +void InstanceWatcher::handle_register_watcher(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_init_finish = nullptr; + { + std::scoped_lock locker(m_lock); + if (r == 0) { + std::swap(on_init_finish, m_on_init_finish); + } + } + + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + return; + } + + remove_instance(); +} + +void InstanceWatcher::unregister_watcher() { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + Context *on_finish = new C_CallbackAdapter< + InstanceWatcher, &InstanceWatcher::handle_unregister_watcher>(this); + 
unregister_watch(new C_AsyncCallback(m_work_queue, on_finish)); +} + +void InstanceWatcher::handle_unregister_watcher(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_shutdown_finish = nullptr; + { + std::scoped_lock locker(m_lock); + if (r < 0) { + std::swap(on_shutdown_finish, m_on_shutdown_finish); + } + } + + if (on_shutdown_finish != nullptr) { + on_shutdown_finish->complete(r); + return; + } + + remove_instance(); +} + +void InstanceWatcher::remove_instance() { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + librados::ObjectWriteOperation op; + op.remove(); + + librados::AioCompletion *aio_comp = + librados::Rados::aio_create_completion( + this, &rados_callback); + int r = m_ioctx.aio_operate(m_oid, aio_comp, &op); + ceph_assert(r == 0); + aio_comp->release(); +} + +void InstanceWatcher::handle_remove_instance(int r) { + dout(20) << ": r=" << r << dendl; + + Context *on_init_finish = nullptr; + Context *on_shutdown_finish = nullptr; + { + std::scoped_lock locker(m_lock); + std::swap(on_init_finish, m_on_init_finish); + std::swap(on_shutdown_finish, m_on_shutdown_finish); + } + + if (on_init_finish != nullptr) { + on_init_finish->complete(r); + } + if (on_shutdown_finish != nullptr) { + on_shutdown_finish->complete(r); + } +} + +} // namespace mirror +} // namespace cephfs diff --git a/src/tools/cephfs_mirror/InstanceWatcher.h b/src/tools/cephfs_mirror/InstanceWatcher.h new file mode 100644 index 000000000..a07400096 --- /dev/null +++ b/src/tools/cephfs_mirror/InstanceWatcher.h @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPHFS_MIRROR_INSTANCE_WATCHER_H +#define CEPHFS_MIRROR_INSTANCE_WATCHER_H + +#include + +#include "common/ceph_mutex.h" +#include "include/Context.h" +#include "include/rados/librados.hpp" +#include "Watcher.h" + +class ContextWQ; + +namespace cephfs { +namespace mirror { + +// watch directory update notifications via per daemon rados +// object and invoke listener callback. 
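Concretely, the payload InstanceWatcher decodes in handle_notify() is a two-field JSON object, dir_path plus a mode of "acquire" or "release", which it maps onto the Listener callbacks declared below. A sketch of that decode-and-dispatch step with Python's json module (the payload shape comes from handle_notify() above; PrintListener is a stand-in):

# Sketch: decode an InstanceWatcher-style notification and dispatch it
# to acquire/release handlers, as handle_notify() does with JSONDecoder.
import json

def handle_notify(payload: bytes, listener):
    msg = json.loads(payload)
    dir_path, mode = msg['dir_path'], msg['mode']
    if mode == 'acquire':
        listener.acquire_directory(dir_path)
    elif mode == 'release':
        listener.release_directory(dir_path)
    else:
        raise ValueError(f'unknown mode: {mode}')

class PrintListener:
    def acquire_directory(self, p): print(f'acquire {p}')
    def release_directory(self, p): print(f'release {p}')

handle_notify(b'{"dir_path": "/volumes/grp/vol", "mode": "acquire"}',
              PrintListener())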
+ +class InstanceWatcher : public Watcher { +public: + struct Listener { + virtual ~Listener() { + } + + virtual void acquire_directory(std::string_view dir_path) = 0; + virtual void release_directory(std::string_view dir_path) = 0; + }; + + static InstanceWatcher *create(librados::IoCtx &ioctx, + Listener &listener, ContextWQ *work_queue) { + return new InstanceWatcher(ioctx, listener, work_queue); + } + + InstanceWatcher(librados::IoCtx &ioctx, Listener &listener, ContextWQ *work_queue); + ~InstanceWatcher(); + + void init(Context *on_finish); + void shutdown(Context *on_finish); + + void handle_notify(uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, bufferlist& bl) override; + void handle_rewatch_complete(int r) override; + + bool is_blocklisted() { + std::scoped_lock locker(m_lock); + return m_blocklisted; + } + + utime_t get_blocklisted_ts() { + std::scoped_lock locker(m_lock); + return m_blocklisted_ts; + } + + bool is_failed() { + std::scoped_lock locker(m_lock); + return m_failed; + } + + utime_t get_failed_ts() { + std::scoped_lock locker(m_lock); + return m_failed_ts; + } + +private: + librados::IoCtx &m_ioctx; + Listener &m_listener; + ContextWQ *m_work_queue; + + ceph::mutex m_lock; + Context *m_on_init_finish = nullptr; + Context *m_on_shutdown_finish = nullptr; + + bool m_blocklisted = false; + bool m_failed = false; + + utime_t m_blocklisted_ts; + utime_t m_failed_ts; + + void create_instance(); + void handle_create_instance(int r); + + void register_watcher(); + void handle_register_watcher(int r); + + void remove_instance(); + void handle_remove_instance(int r); + + void unregister_watcher(); + void handle_unregister_watcher(int r); +}; + +} // namespace mirror +} // namespace cephfs + +#endif // CEPHFS_MIRROR_INSTANCE_WATCHER_H diff --git a/src/tools/cephfs_mirror/Mirror.cc b/src/tools/cephfs_mirror/Mirror.cc new file mode 100644 index 000000000..edf903b92 --- /dev/null +++ b/src/tools/cephfs_mirror/Mirror.cc @@ -0,0 +1,589 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_argparse.h" +#include "common/ceph_context.h" +#include "common/common_init.h" +#include "common/Cond.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Timer.h" +#include "common/WorkQueue.h" +#include "include/types.h" +#include "mon/MonClient.h" +#include "msg/Messenger.h" +#include "aio_utils.h" +#include "Mirror.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_cephfs_mirror +#undef dout_prefix +#define dout_prefix *_dout << "cephfs::mirror::Mirror " << __func__ + +namespace cephfs { +namespace mirror { + +namespace { + +const std::string SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY("mirroring_failed"); + +class SafeTimerSingleton : public CommonSafeTimer { +public: + ceph::mutex timer_lock = ceph::make_mutex("cephfs::mirror::timer_lock"); + + explicit SafeTimerSingleton(CephContext *cct) + : SafeTimer(cct, timer_lock, true) { + init(); + } +}; + +class ThreadPoolSingleton : public ThreadPool { +public: + ContextWQ *work_queue = nullptr; + + explicit ThreadPoolSingleton(CephContext *cct) + : ThreadPool(cct, "Mirror::thread_pool", "tp_mirror", 1) { + work_queue = new ContextWQ("Mirror::work_queue", ceph::make_timespan(60), this); + + start(); + } +}; + +} // anonymous namespace + +struct Mirror::C_EnableMirroring : Context { + Mirror *mirror; + Filesystem filesystem; + uint64_t pool_id; + + C_EnableMirroring(Mirror *mirror, const Filesystem &filesystem, uint64_t 
pool_id) + : mirror(mirror), + filesystem(filesystem), + pool_id(pool_id) { + } + + void finish(int r) override { + enable_mirroring(); + } + + void enable_mirroring() { + Context *ctx = new C_CallbackAdapter(this); + mirror->enable_mirroring(filesystem, pool_id, ctx); + } + + void handle_enable_mirroring(int r) { + mirror->handle_enable_mirroring(filesystem, r); + delete this; + } + + // context needs to live post completion + void complete(int r) override { + finish(r); + } +}; + +struct Mirror::C_DisableMirroring : Context { + Mirror *mirror; + Filesystem filesystem; + + C_DisableMirroring(Mirror *mirror, const Filesystem &filesystem) + : mirror(mirror), + filesystem(filesystem) { + } + + void finish(int r) override { + disable_mirroring(); + } + + void disable_mirroring() { + Context *ctx = new C_CallbackAdapter(this); + mirror->disable_mirroring(filesystem, ctx); + } + + void handle_disable_mirroring(int r) { + mirror->handle_disable_mirroring(filesystem, r); + delete this; + } + + // context needs to live post completion + void complete(int r) override { + finish(r); + } +}; + +struct Mirror::C_PeerUpdate : Context { + Mirror *mirror; + Filesystem filesystem; + Peer peer; + bool remove = false; + + C_PeerUpdate(Mirror *mirror, const Filesystem &filesystem, + const Peer &peer) + : mirror(mirror), + filesystem(filesystem), + peer(peer) { + } + C_PeerUpdate(Mirror *mirror, const Filesystem &filesystem, + const Peer &peer, bool remove) + : mirror(mirror), + filesystem(filesystem), + peer(peer), + remove(remove) { + } + + void finish(int r) override { + if (remove) { + mirror->remove_peer(filesystem, peer); + } else { + mirror->add_peer(filesystem, peer); + } + } +}; + +struct Mirror::C_RestartMirroring : Context { + Mirror *mirror; + Filesystem filesystem; + uint64_t pool_id; + Peers peers; + + C_RestartMirroring(Mirror *mirror, const Filesystem &filesystem, + uint64_t pool_id, const Peers &peers) + : mirror(mirror), + filesystem(filesystem), + pool_id(pool_id), + peers(peers) { + } + + void finish(int r) override { + disable_mirroring(); + } + + void disable_mirroring() { + Context *ctx = new C_CallbackAdapter(this); + mirror->disable_mirroring(filesystem, ctx); + } + + void handle_disable_mirroring(int r) { + enable_mirroring(); + } + + void enable_mirroring() { + std::scoped_lock locker(mirror->m_lock); + Context *ctx = new C_CallbackAdapter(this); + mirror->enable_mirroring(filesystem, pool_id, ctx, true); + } + + void handle_enable_mirroring(int r) { + mirror->handle_enable_mirroring(filesystem, peers, r); + mirror->_unset_restarting(filesystem); + delete this; + } + + // context needs to live post completion + void complete(int r) override { + finish(r); + } +}; + +Mirror::Mirror(CephContext *cct, const std::vector &args, + MonClient *monc, Messenger *msgr) + : m_cct(cct), + m_args(args), + m_monc(monc), + m_msgr(msgr), + m_listener(this), + m_local(new librados::Rados()) { + auto thread_pool = &(cct->lookup_or_create_singleton_object( + "cephfs::mirror::thread_pool", false, cct)); + auto safe_timer = &(cct->lookup_or_create_singleton_object( + "cephfs::mirror::safe_timer", false, cct)); + m_thread_pool = thread_pool; + m_work_queue = thread_pool->work_queue; + m_timer = safe_timer; + m_timer_lock = &safe_timer->timer_lock; + std::scoped_lock timer_lock(*m_timer_lock); + schedule_mirror_update_task(); +} + +Mirror::~Mirror() { + dout(10) << dendl; + { + std::scoped_lock timer_lock(*m_timer_lock); + m_timer->shutdown(); + } + + m_work_queue->drain(); + delete m_work_queue; + { + 
std::scoped_lock locker(m_lock); + m_thread_pool->stop(); + } +} + +int Mirror::init_mon_client() { + dout(20) << dendl; + + m_monc->set_messenger(m_msgr); + m_msgr->add_dispatcher_head(m_monc); + m_monc->set_want_keys(CEPH_ENTITY_TYPE_MON); + + int r = m_monc->init(); + if (r < 0) { + derr << ": failed to init mon client: " << cpp_strerror(r) << dendl; + return r; + } + + r = m_monc->authenticate(std::chrono::duration(m_cct->_conf.get_val("client_mount_timeout")).count()); + if (r < 0) { + derr << ": failed to authenticate to monitor: " << cpp_strerror(r) << dendl; + return r; + } + + client_t me = m_monc->get_global_id(); + m_msgr->set_myname(entity_name_t::CLIENT(me.v)); + return 0; +} + +int Mirror::init(std::string &reason) { + dout(20) << dendl; + + std::scoped_lock locker(m_lock); + + int r = m_local->init_with_context(m_cct); + if (r < 0) { + derr << ": could not initialize rados handler" << dendl; + return r; + } + + r = m_local->connect(); + if (r < 0) { + derr << ": error connecting to local cluster" << dendl; + return r; + } + + m_service_daemon = std::make_unique(m_cct, m_local); + r = m_service_daemon->init(); + if (r < 0) { + derr << ": error registering service daemon: " << cpp_strerror(r) << dendl; + return r; + } + + r = init_mon_client(); + if (r < 0) { + return r; + } + + return 0; +} + +void Mirror::shutdown() { + dout(20) << dendl; + m_stopping = true; + m_cluster_watcher->shutdown(); + m_cond.notify_all(); +} + +void Mirror::reopen_logs() { + for (auto &[filesystem, mirror_action] : m_mirror_actions) { + mirror_action.fs_mirror->reopen_logs(); + } + g_ceph_context->reopen_logs(); +} + +void Mirror::handle_signal(int signum) { + dout(10) << ": signal=" << signum << dendl; + + std::scoped_lock locker(m_lock); + switch (signum) { + case SIGHUP: + reopen_logs(); + break; + case SIGINT: + case SIGTERM: + shutdown(); + break; + default: + ceph_abort_msgf("unexpected signal %d", signum); + } +} + +void Mirror::handle_enable_mirroring(const Filesystem &filesystem, + const Peers &peers, int r) { + dout(20) << ": filesystem=" << filesystem << ", peers=" << peers + << ", r=" << r << dendl; + + std::scoped_lock locker(m_lock); + auto &mirror_action = m_mirror_actions.at(filesystem); + ceph_assert(mirror_action.action_in_progress); + + mirror_action.action_in_progress = false; + m_cond.notify_all(); + if (r < 0) { + derr << ": failed to initialize FSMirror for filesystem=" << filesystem + << ": " << cpp_strerror(r) << dendl; + m_service_daemon->add_or_update_fs_attribute(filesystem.fscid, + SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY, + true); + return; + } + + for (auto &peer : peers) { + mirror_action.fs_mirror->add_peer(peer); + } + + dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl; +} + +void Mirror::handle_enable_mirroring(const Filesystem &filesystem, int r) { + dout(20) << ": filesystem=" << filesystem << ", r=" << r << dendl; + + std::scoped_lock locker(m_lock); + auto &mirror_action = m_mirror_actions.at(filesystem); + ceph_assert(mirror_action.action_in_progress); + + mirror_action.action_in_progress = false; + m_cond.notify_all(); + if (r < 0) { + derr << ": failed to initialize FSMirror for filesystem=" << filesystem + << ": " << cpp_strerror(r) << dendl; + m_service_daemon->add_or_update_fs_attribute(filesystem.fscid, + SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY, + true); + return; + } + + dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl; +} + +void Mirror::enable_mirroring(const Filesystem &filesystem, uint64_t 
local_pool_id, + Context *on_finish, bool is_restart) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + auto &mirror_action = m_mirror_actions.at(filesystem); + if (is_restart) { + mirror_action.fs_mirror.reset(); + } else { + ceph_assert(!mirror_action.action_in_progress); + } + + ceph_assert(!mirror_action.fs_mirror); + + dout(10) << ": starting FSMirror: filesystem=" << filesystem << dendl; + + mirror_action.action_in_progress = true; + mirror_action.fs_mirror = std::make_unique(m_cct, filesystem, local_pool_id, + m_service_daemon.get(), m_args, m_work_queue); + mirror_action.fs_mirror->init(new C_AsyncCallback(m_work_queue, on_finish)); +} + +void Mirror::mirroring_enabled(const Filesystem &filesystem, uint64_t local_pool_id) { + dout(10) << ": filesystem=" << filesystem << ", pool_id=" << local_pool_id << dendl; + + std::scoped_lock locker(m_lock); + if (m_stopping) { + return; + } + + auto p = m_mirror_actions.emplace(filesystem, MirrorAction(local_pool_id)); + auto &mirror_action = p.first->second; + mirror_action.action_ctxs.push_back(new C_EnableMirroring(this, filesystem, local_pool_id)); +} + +void Mirror::handle_disable_mirroring(const Filesystem &filesystem, int r) { + dout(10) << ": filesystem=" << filesystem << ", r=" << r << dendl; + + std::scoped_lock locker(m_lock); + auto &mirror_action = m_mirror_actions.at(filesystem); + + if (!mirror_action.fs_mirror->is_init_failed()) { + ceph_assert(mirror_action.action_in_progress); + mirror_action.action_in_progress = false; + m_cond.notify_all(); + } + + if (!m_stopping) { + mirror_action.fs_mirror.reset(); + if (mirror_action.action_ctxs.empty()) { + dout(10) << ": no pending actions for filesystem=" << filesystem << dendl; + m_mirror_actions.erase(filesystem); + } + } +} + +void Mirror::disable_mirroring(const Filesystem &filesystem, Context *on_finish) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + auto &mirror_action = m_mirror_actions.at(filesystem); + ceph_assert(mirror_action.fs_mirror); + ceph_assert(!mirror_action.action_in_progress); + + if (mirror_action.fs_mirror->is_init_failed()) { + dout(10) << ": init failed for filesystem=" << filesystem << dendl; + m_work_queue->queue(on_finish, -EINVAL); + return; + } + + mirror_action.action_in_progress = true; + mirror_action.fs_mirror->shutdown(new C_AsyncCallback(m_work_queue, on_finish)); +} + +void Mirror::mirroring_disabled(const Filesystem &filesystem) { + dout(10) << ": filesystem=" << filesystem << dendl; + + std::scoped_lock locker(m_lock); + if (m_stopping) { + dout(5) << "shutting down" << dendl; + return; + } + + auto &mirror_action = m_mirror_actions.at(filesystem); + mirror_action.action_ctxs.push_back(new C_DisableMirroring(this, filesystem)); +} + +void Mirror::add_peer(const Filesystem &filesystem, const Peer &peer) { + ceph_assert(ceph_mutex_is_locked(m_lock)); + + auto &mirror_action = m_mirror_actions.at(filesystem); + ceph_assert(mirror_action.fs_mirror); + ceph_assert(!mirror_action.action_in_progress); + + mirror_action.fs_mirror->add_peer(peer); +} + +void Mirror::peer_added(const Filesystem &filesystem, const Peer &peer) { + dout(20) << ": filesystem=" << filesystem << ", peer=" << peer << dendl; + + std::scoped_lock locker(m_lock); + if (m_stopping) { + dout(5) << "shutting down" << dendl; + return; + } + + auto &mirror_action = m_mirror_actions.at(filesystem); + mirror_action.action_ctxs.push_back(new C_PeerUpdate(this, filesystem, peer)); +} + +void Mirror::remove_peer(const Filesystem &filesystem, const Peer &peer) { + 
ceph_assert(ceph_mutex_is_locked(m_lock)); + + auto &mirror_action = m_mirror_actions.at(filesystem); + ceph_assert(mirror_action.fs_mirror); + ceph_assert(!mirror_action.action_in_progress); + + mirror_action.fs_mirror->remove_peer(peer); +} + +void Mirror::peer_removed(const Filesystem &filesystem, const Peer &peer) { + dout(20) << ": filesystem=" << filesystem << ", peer=" << peer << dendl; + + std::scoped_lock locker(m_lock); + if (m_stopping) { + dout(5) << "shutting down" << dendl; + return; + } + + auto &mirror_action = m_mirror_actions.at(filesystem); + mirror_action.action_ctxs.push_back(new C_PeerUpdate(this, filesystem, peer, true)); +} + +void Mirror::update_fs_mirrors() { + dout(20) << dendl; + + auto now = ceph_clock_now(); + double blocklist_interval = g_ceph_context->_conf.get_val + ("cephfs_mirror_restart_mirror_on_blocklist_interval").count(); + double failed_interval = g_ceph_context->_conf.get_val + ("cephfs_mirror_restart_mirror_on_failure_interval").count(); + + { + std::scoped_lock locker(m_lock); + for (auto &[filesystem, mirror_action] : m_mirror_actions) { + auto failed_restart = mirror_action.fs_mirror && mirror_action.fs_mirror->is_failed() && + (failed_interval > 0 && (mirror_action.fs_mirror->get_failed_ts() - now) > failed_interval); + auto blocklisted_restart = mirror_action.fs_mirror && mirror_action.fs_mirror->is_blocklisted() && + (blocklist_interval > 0 && (mirror_action.fs_mirror->get_blocklisted_ts() - now) > blocklist_interval); + + if (!mirror_action.action_in_progress && !_is_restarting(filesystem)) { + if (failed_restart || blocklisted_restart) { + dout(5) << ": filesystem=" << filesystem << " failed mirroring (failed: " + << failed_restart << ", blocklisted: " << blocklisted_restart << dendl; + _set_restarting(filesystem); + auto peers = mirror_action.fs_mirror->get_peers(); + auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers); + ctx->complete(0); + } + } + + if (!failed_restart && !blocklisted_restart && !mirror_action.action_ctxs.empty() + && !mirror_action.action_in_progress) { + auto ctx = std::move(mirror_action.action_ctxs.front()); + mirror_action.action_ctxs.pop_front(); + ctx->complete(0); + } + } + } + + schedule_mirror_update_task(); +} + +void Mirror::schedule_mirror_update_task() { + ceph_assert(m_timer_task == nullptr); + ceph_assert(ceph_mutex_is_locked(*m_timer_lock)); + + m_timer_task = new LambdaContext([this](int _) { + m_timer_task = nullptr; + update_fs_mirrors(); + }); + double after = g_ceph_context->_conf.get_val + ("cephfs_mirror_action_update_interval").count(); + dout(20) << ": scheduling fs mirror update (" << m_timer_task << ") after " + << after << " seconds" << dendl; + m_timer->add_event_after(after, m_timer_task); +} + +void Mirror::run() { + dout(20) << dendl; + + std::unique_lock locker(m_lock); + m_cluster_watcher.reset(new ClusterWatcher(m_cct, m_monc, m_service_daemon.get(), m_listener)); + m_msgr->add_dispatcher_tail(m_cluster_watcher.get()); + + m_cluster_watcher->init(); + m_cond.wait(locker, [this]{return m_stopping;}); + + locker.unlock(); + { + std::scoped_lock timer_lock(*m_timer_lock); + if (m_timer_task != nullptr) { + dout(10) << ": canceling timer task=" << m_timer_task << dendl; + m_timer->cancel_event(m_timer_task); + m_timer_task = nullptr; + } + } + locker.lock(); + + for (auto &[filesystem, mirror_action] : m_mirror_actions) { + dout(10) << ": trying to shutdown filesystem=" << filesystem << dendl; + // wait for in-progress action and shutdown + 
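update_fs_mirrors() above is driven by a timer task that always re-arms itself: schedule_mirror_update_task() queues the next run cephfs_mirror_action_update_interval seconds out, and each pass checks whether a failed or blocklisted FSMirror has crossed its restart interval. A toy version of the re-arming pattern using threading.Timer in place of Ceph's SafeTimer (the interval constant and state layout are illustrative only):

# Sketch: a self-re-arming periodic task standing in for
# schedule_mirror_update_task()/update_fs_mirrors().
import threading

UPDATE_INTERVAL = 2.0  # stand-in for cephfs_mirror_action_update_interval

def update_fs_mirrors(state):
    try:
        for fs, mirror in state['mirrors'].items():
            if mirror.get('failed') or mirror.get('blocklisted'):
                print(f'restarting mirror for {fs}')
    finally:
        schedule_update(state)  # always re-arm, even if a pass throws

def schedule_update(state):
    if state['stopping']:
        return
    t = threading.Timer(UPDATE_INTERVAL, update_fs_mirrors, args=(state,))
    t.daemon = True
    state['timer'] = t  # keep a handle so shutdown can cancel it
    t.start()

state = {'mirrors': {'a': {'failed': False}}, 'stopping': False}
schedule_update(state)
# shutdown path: stop re-arming and cancel the pending task, as
# Mirror::run() does with m_timer->cancel_event(m_timer_task)
state['stopping'] = True
state['timer'].cancel()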
+void Mirror::run() {
+  dout(20) << dendl;
+
+  std::unique_lock locker(m_lock);
+  m_cluster_watcher.reset(new ClusterWatcher(m_cct, m_monc, m_service_daemon.get(), m_listener));
+  m_msgr->add_dispatcher_tail(m_cluster_watcher.get());
+
+  m_cluster_watcher->init();
+  m_cond.wait(locker, [this]{return m_stopping;});
+
+  locker.unlock();
+  {
+    std::scoped_lock timer_lock(*m_timer_lock);
+    if (m_timer_task != nullptr) {
+      dout(10) << ": canceling timer task=" << m_timer_task << dendl;
+      m_timer->cancel_event(m_timer_task);
+      m_timer_task = nullptr;
+    }
+  }
+  locker.lock();
+
+  for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+    dout(10) << ": trying to shutdown filesystem=" << filesystem << dendl;
+    // wait for in-progress action and shutdown
+    m_cond.wait(locker, [&mirror_action=mirror_action]
+                {return !mirror_action.action_in_progress;});
+    if (mirror_action.fs_mirror &&
+        !mirror_action.fs_mirror->is_stopping() &&
+        !mirror_action.fs_mirror->is_init_failed()) {
+      C_SaferCond cond;
+      mirror_action.fs_mirror->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, &cond));
+      int r = cond.wait();
+      dout(10) << ": shutdown filesystem=" << filesystem << ", r=" << r << dendl;
+    }
+
+    mirror_action.fs_mirror.reset();
+  }
+}
+
+} // namespace mirror
+} // namespace cephfs
+
diff --git a/src/tools/cephfs_mirror/Mirror.h b/src/tools/cephfs_mirror/Mirror.h
new file mode 100644
index 000000000..2081b5b53
--- /dev/null
+++ b/src/tools/cephfs_mirror/Mirror.h
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_H
+#define CEPHFS_MIRROR_H
+
+#include <map>
+#include <memory>
+#include <vector>
+
+#include "common/ceph_mutex.h"
+#include "common/WorkQueue.h"
+#include "mds/FSMap.h"
+#include "ClusterWatcher.h"
+#include "FSMirror.h"
+#include "ServiceDaemon.h"
+#include "Types.h"
+
+class Messenger;
+class MonClient;
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// this wraps up ClusterWatcher and FSMirrors to implement mirroring
+// for ceph filesystems.
+
+class Mirror {
+public:
+  Mirror(CephContext *cct, const std::vector<const char*> &args,
+         MonClient *monc, Messenger *msgr);
+  ~Mirror();
+
+  int init(std::string &reason);
+  void shutdown();
+  void run();
+
+  void handle_signal(int signum);
+
+private:
+  static constexpr std::string_view MIRRORING_MODULE = "mirroring";
+
+  struct C_EnableMirroring;
+  struct C_DisableMirroring;
+  struct C_PeerUpdate;
+  struct C_RestartMirroring;
+
+  struct ClusterListener : ClusterWatcher::Listener {
+    Mirror *mirror;
+
+    ClusterListener(Mirror *mirror)
+      : mirror(mirror) {
+    }
+
+    void handle_mirroring_enabled(const FilesystemSpec &spec) override {
+      mirror->mirroring_enabled(spec.filesystem, spec.pool_id);
+    }
+
+    void handle_mirroring_disabled(const Filesystem &filesystem) override {
+      mirror->mirroring_disabled(filesystem);
+    }
+
+    void handle_peers_added(const Filesystem &filesystem, const Peer &peer) override {
+      mirror->peer_added(filesystem, peer);
+    }
+
+    void handle_peers_removed(const Filesystem &filesystem, const Peer &peer) override {
+      mirror->peer_removed(filesystem, peer);
+    }
+  };
+
+  struct MirrorAction {
+    MirrorAction(uint64_t pool_id) :
+      pool_id(pool_id) {
+    }
+
+    uint64_t pool_id; // for restarting blocklisted mirror instance
+    bool action_in_progress = false;
+    bool restarting = false;
+    std::list<Context *> action_ctxs;
+    std::unique_ptr<FSMirror> fs_mirror;
+  };
+  ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::Mirror");
+  ceph::condition_variable m_cond;
+
+  CephContext *m_cct;
+  std::vector<const char*> m_args;
+  MonClient *m_monc;
+  Messenger *m_msgr;
+  ClusterListener m_listener;
+
+  ThreadPool *m_thread_pool = nullptr;
+  ContextWQ *m_work_queue = nullptr;
+  SafeTimer *m_timer = nullptr;
+  ceph::mutex *m_timer_lock = nullptr;
+  Context *m_timer_task = nullptr;
+
+  bool m_stopping = false;
+  std::unique_ptr<ClusterWatcher> m_cluster_watcher;
+  std::map<Filesystem, MirrorAction> m_mirror_actions;
+
+  RadosRef m_local;
+  std::unique_ptr<ServiceDaemon> m_service_daemon;
+
+  int init_mon_client();
+
+  // called via listener
+  void mirroring_enabled(const Filesystem &filesystem, uint64_t local_pool_id);
+  void mirroring_disabled(const Filesystem &filesystem);
+  void peer_added(const Filesystem &filesystem, const Peer &peer);
+  void peer_removed(const Filesystem &filesystem, const Peer &peer);
+
+  // mirror enable callback
+  void enable_mirroring(const Filesystem &filesystem, uint64_t local_pool_id,
+                        Context *on_finish, bool is_restart=false);
+  void handle_enable_mirroring(const Filesystem &filesystem, int r);
+  void handle_enable_mirroring(const Filesystem &filesystem, const Peers &peers, int r);
+
+  // mirror disable callback
+  void disable_mirroring(const Filesystem &filesystem, Context *on_finish);
+  void handle_disable_mirroring(const Filesystem &filesystem, int r);
+
+  // peer update callback
+  void add_peer(const Filesystem &filesystem, const Peer &peer);
+  void remove_peer(const Filesystem &filesystem, const Peer &peer);
+
+  void schedule_mirror_update_task();
+  void update_fs_mirrors();
+
+  void reopen_logs();
+
+  void _set_restarting(const Filesystem &filesystem) {
+    auto &mirror_action = m_mirror_actions.at(filesystem);
+    mirror_action.restarting = true;
+  }
+
+  void _unset_restarting(const Filesystem &filesystem) {
+    auto &mirror_action = m_mirror_actions.at(filesystem);
+    mirror_action.restarting = false;
+  }
+
+  bool _is_restarting(const Filesystem &filesystem) {
+    auto &mirror_action = m_mirror_actions.at(filesystem);
+    return mirror_action.restarting;
+  }
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_H
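+// Illustrative sketch (not part of the change): a daemon embedding this
+// class would drive it roughly as below -- everything other than
+// Mirror's own methods is hypothetical.
+//
+//   std::string reason;
+//   cephfs::mirror::Mirror mirror(cct, args, monc, msgr);
+//   if (mirror.init(reason) == 0) {
+//     mirror.run();       // blocks until handle_signal() flags a stop
+//     mirror.shutdown();
+//   }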
diff --git a/src/tools/cephfs_mirror/MirrorWatcher.cc b/src/tools/cephfs_mirror/MirrorWatcher.cc
new file mode 100644
index 000000000..b3770d103
--- /dev/null
+++ b/src/tools/cephfs_mirror/MirrorWatcher.cc
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "msg/Messenger.h"
+#include "aio_utils.h"
+#include "MirrorWatcher.h"
+#include "FSMirror.h"
+#include "Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::MirrorWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+MirrorWatcher::MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+                             ContextWQ *work_queue)
+  : Watcher(ioctx, CEPHFS_MIRROR_OBJECT, work_queue),
+    m_ioctx(ioctx),
+    m_fs_mirror(fs_mirror),
+    m_work_queue(work_queue),
+    m_lock(ceph::make_mutex("cephfs::mirror::mirror_watcher")),
+    m_instance_id(stringify(m_ioctx.get_instance_id())) {
+}
+
+MirrorWatcher::~MirrorWatcher() {
+}
+
+void MirrorWatcher::init(Context *on_finish) {
+  dout(20) << dendl;
+
+  {
+    std::scoped_lock locker(m_lock);
+    ceph_assert(m_on_init_finish == nullptr);
+    m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+      on_finish->complete(r);
+      if (m_on_shutdown_finish != nullptr) {
+        m_on_shutdown_finish->complete(0);
+      }
+    });
+  }
+
+  register_watcher();
+}
+
+void MirrorWatcher::shutdown(Context *on_finish) {
+  dout(20) << dendl;
+
+  {
+    std::scoped_lock locker(m_lock);
+    ceph_assert(m_on_shutdown_finish == nullptr);
+    if (m_on_init_finish != nullptr) {
+      dout(10) << ": delaying shutdown -- init in progress" << dendl;
+      m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+        m_on_shutdown_finish = nullptr;
+        shutdown(on_finish);
+      });
+      return;
+    }
+
+    m_on_shutdown_finish = on_finish;
+  }
+
+  unregister_watcher();
+}
+
+void MirrorWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
+                                  uint64_t notifier_id, bufferlist& bl) {
+  dout(20) << dendl;
+
+  JSONFormatter f;
+  f.open_object_section("info");
+  encode_json("addr", m_fs_mirror->get_instance_addr(), &f);
+  f.close_section();
+
+  bufferlist outbl;
+  f.flush(outbl);
+  acknowledge_notify(notify_id, handle, outbl);
+}
+
+void MirrorWatcher::handle_rewatch_complete(int r) {
+  dout(5) << ": r=" << r << dendl;
+
+  if (r == -EBLOCKLISTED) {
+    dout(0) << ": client blocklisted" << dendl;
+    std::scoped_lock locker(m_lock);
+    m_blocklisted = true;
+    m_blocklisted_ts = ceph_clock_now();
+  } else if (r < 0) {
+    derr << ": rewatch error: " << cpp_strerror(r) << dendl;
+    std::scoped_lock locker(m_lock);
+    m_failed = true;
+    m_failed_ts = ceph_clock_now();
+  }
+}
+
+void MirrorWatcher::register_watcher() {
+  dout(20) << dendl;
+
+  std::scoped_lock locker(m_lock);
+  Context *on_finish = new C_CallbackAdapter<
+    MirrorWatcher, &MirrorWatcher::handle_register_watcher>(this);
+  register_watch(on_finish);
+}
+
+void MirrorWatcher::handle_register_watcher(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  Context *on_init_finish = nullptr;
+  {
+    std::scoped_lock locker(m_lock);
+    std::swap(on_init_finish, m_on_init_finish);
+  }
+
+  on_init_finish->complete(r);
+}
+
+void MirrorWatcher::unregister_watcher() {
+  dout(20) << dendl;
+
+  std::scoped_lock locker(m_lock);
+  Context *on_finish = new C_CallbackAdapter<
+    MirrorWatcher, &MirrorWatcher::handle_unregister_watcher>(this);
+  unregister_watch(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void MirrorWatcher::handle_unregister_watcher(int r) {
+  dout(20) << ": r=" << r << dendl;
+
+  Context *on_shutdown_finish = nullptr;
+  {
+    std::scoped_lock locker(m_lock);
+    std::swap(on_shutdown_finish, m_on_shutdown_finish);
+  }
+
+  on_shutdown_finish->complete(r);
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/MirrorWatcher.h b/src/tools/cephfs_mirror/MirrorWatcher.h
new file mode 100644
index 000000000..54e185b95
--- /dev/null
+++ b/src/tools/cephfs_mirror/MirrorWatcher.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_MIRROR_WATCHER_H
+#define CEPHFS_MIRROR_MIRROR_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "Watcher.h"
+
+class ContextWQ;
+class Messenger;
+
+namespace cephfs {
+namespace mirror {
+
+class FSMirror;
+
+// watch for notifications via the cephfs_mirror object (in the
+// metadata pool). this is used to send keepalives, with the keepalive
+// payload being the rados instance address (used by the manager
+// module to blocklist when needed).
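+// Illustrative note (not part of the change): a notifier (e.g. the
+// manager module) sends a notify on the mirror object and each daemon
+// acknowledges with its instance address, so the notifier learns which
+// rados clients it could blocklist.  Hedged sketch of the notify side
+// in plain librados (object name taken from CEPHFS_MIRROR_OBJECT above;
+// error handling elided):
+//
+//   librados::IoCtx ioctx;          // opened on the metadata pool
+//   bufferlist bl, reply;
+//   ioctx.notify2(CEPHFS_MIRROR_OBJECT, bl, 30000 /* ms */, &reply);
+//   // reply aggregates each watcher's JSON: {"addr": "<instance addr>"}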
+
+class MirrorWatcher : public Watcher {
+public:
+  static MirrorWatcher *create(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+                               ContextWQ *work_queue) {
+    return new MirrorWatcher(ioctx, fs_mirror, work_queue);
+  }
+
+  MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+                ContextWQ *work_queue);
+  ~MirrorWatcher();
+
+  void init(Context *on_finish);
+  void shutdown(Context *on_finish);
+
+  void handle_notify(uint64_t notify_id, uint64_t handle,
+                     uint64_t notifier_id, bufferlist& bl) override;
+  void handle_rewatch_complete(int r) override;
+
+  bool is_blocklisted() {
+    std::scoped_lock locker(m_lock);
+    return m_blocklisted;
+  }
+
+  utime_t get_blocklisted_ts() {
+    std::scoped_lock locker(m_lock);
+    return m_blocklisted_ts;
+  }
+
+  bool is_failed() {
+    std::scoped_lock locker(m_lock);
+    return m_failed;
+  }
+
+  utime_t get_failed_ts() {
+    std::scoped_lock locker(m_lock);
+    return m_failed_ts;
+  }
+
+private:
+  librados::IoCtx &m_ioctx;
+  FSMirror *m_fs_mirror;
+  ContextWQ *m_work_queue;
+
+  ceph::mutex m_lock;
+  std::string m_instance_id;
+
+  Context *m_on_init_finish = nullptr;
+  Context *m_on_shutdown_finish = nullptr;
+
+  bool m_blocklisted = false;
+  bool m_failed = false;
+
+  utime_t m_blocklisted_ts;
+  utime_t m_failed_ts;
+
+  void register_watcher();
+  void handle_register_watcher(int r);
+
+  void unregister_watcher();
+  void handle_unregister_watcher(int r);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_MIRROR_WATCHER_H
diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc
new file mode 100644
index 000000000..bd47046bb
--- /dev/null
+++ b/src/tools/cephfs_mirror/PeerReplayer.cc
@@ -0,0 +1,1581 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stack>
+#include <fcntl.h>
+#include <algorithm>
+#include <sys/time.h>
+#include <sys/file.h>
+#include <boost/scope_exit.hpp>
+
+#include "common/admin_socket.h"
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "FSMirror.h"
+#include "PeerReplayer.h"
+#include "Utils.h"
+
+#include "json_spirit/json_spirit.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::PeerReplayer(" \
+                           << m_peer.uuid << ") " << __func__
+
+using namespace std;
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string PEER_CONFIG_KEY_PREFIX = "cephfs/mirror/peer";
+
+std::string snapshot_dir_path(CephContext *cct, const std::string &path) {
+  return path + "/" + cct->_conf->client_snapdir;
+}
+
+std::string snapshot_path(const std::string &snap_dir, const std::string &snap_name) {
+  return snap_dir + "/" + snap_name;
+}
+
+std::string snapshot_path(CephContext *cct, const std::string &path, const std::string &snap_name) {
+  return path + "/" + cct->_conf->client_snapdir + "/" + snap_name;
+}
+
+std::string entry_path(const std::string &dir, const std::string &name) {
+  return dir + "/" + name;
+}
+
+std::map<std::string, std::string> decode_snap_metadata(snap_metadata *snap_metadata,
+                                                        size_t nr_snap_metadata) {
+  std::map<std::string, std::string> metadata;
+  for (size_t i = 0; i < nr_snap_metadata; ++i) {
+    metadata.emplace(snap_metadata[i].key, snap_metadata[i].value);
+  }
+
+  return metadata;
+}
+
+std::string peer_config_key(const std::string &fs_name, const std::string &uuid) {
+  return PEER_CONFIG_KEY_PREFIX + "/" + fs_name + "/" + uuid;
+}
+
+class PeerAdminSocketCommand {
+public:
+  virtual ~PeerAdminSocketCommand() {
+  }
+  virtual int call(Formatter *f) = 0;
+};
+
+class StatusCommand : public PeerAdminSocketCommand {
+public:
+  explicit StatusCommand(PeerReplayer *peer_replayer)
+    : peer_replayer(peer_replayer) {
+  }
+
+  int call(Formatter *f) override {
+    peer_replayer->peer_status(f);
+    return 0;
+  }
+
+private:
+  PeerReplayer *peer_replayer;
+};
+
+// helper to open a directory relative to a file descriptor
+int opendirat(MountRef mnt, int dirfd, const std::string &relpath, int flags,
+              ceph_dir_result **dirp) {
+  int r = ceph_openat(mnt, dirfd, relpath.c_str(), flags, 0);
+  if (r < 0) {
+    return r;
+  }
+
+  int fd = r;
+  r = ceph_fdopendir(mnt, fd, dirp);
+  ceph_close(mnt, fd);
+  return r;
+}
+
+} // anonymous namespace
+
+class PeerReplayerAdminSocketHook : public AdminSocketHook {
+public:
+  PeerReplayerAdminSocketHook(CephContext *cct, const Filesystem &filesystem,
+                              const Peer &peer, PeerReplayer *peer_replayer)
+    : admin_socket(cct->get_admin_socket()) {
+    int r;
+    std::string cmd;
+
+    // mirror peer status format is name@id uuid
+    cmd = "fs mirror peer status "
+          + stringify(filesystem.fs_name) + "@" + stringify(filesystem.fscid)
+          + " "
+          + stringify(peer.uuid);
+    r = admin_socket->register_command(
+      cmd, this, "get peer mirror status");
+    if (r == 0) {
+      commands[cmd] = new StatusCommand(peer_replayer);
+    }
+  }
+
+  ~PeerReplayerAdminSocketHook() override {
+    admin_socket->unregister_commands(this);
+    for (auto &[command, cmdptr] : commands) {
+      delete cmdptr;
+    }
+  }
+
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+           const bufferlist&,
+           Formatter *f, std::ostream &errss, bufferlist &out) override {
+    auto p = commands.at(std::string(command));
+    return p->call(f);
+  }
+
+private:
+  typedef std::map<std::string, PeerAdminSocketCommand*, std::less<>> Commands;
+
+  AdminSocket *admin_socket;
+  Commands commands;
+};
+
+PeerReplayer::PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
+                           RadosRef local_cluster, const Filesystem &filesystem,
+                           const Peer &peer, const std::set<std::string, std::less<>> &directories,
+                           MountRef mount, ServiceDaemon *service_daemon)
+  : m_cct(cct),
+    m_fs_mirror(fs_mirror),
+    m_local_cluster(local_cluster),
+    m_filesystem(filesystem),
+    m_peer(peer),
+    m_directories(directories.begin(), directories.end()),
+    m_local_mount(mount),
+    m_service_daemon(service_daemon),
+    m_asok_hook(new PeerReplayerAdminSocketHook(cct, filesystem, peer, this)),
+    m_lock(ceph::make_mutex("cephfs::mirror::PeerReplayer::" + stringify(peer.uuid))) {
+  // reset sync stats sent via service daemon
+  m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+                                                 SERVICE_DAEMON_FAILED_DIR_COUNT_KEY, (uint64_t)0);
+  m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+                                                 SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY, (uint64_t)0);
+}
+
+PeerReplayer::~PeerReplayer() {
+  delete m_asok_hook;
+}
+
+int PeerReplayer::init() {
+  dout(20) << ": initial dir list=[" << m_directories << "]" << dendl;
+  for (auto &dir_root : m_directories) {
+    m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
+  }
+
+  auto &remote_client = m_peer.remote.client_name;
+  auto &remote_cluster = m_peer.remote.cluster_name;
+  auto remote_filesystem = Filesystem{0, m_peer.remote.fs_name};
+
+  std::string key = peer_config_key(m_filesystem.fs_name, m_peer.uuid);
+  std::string cmd =
+    "{"
+      "\"prefix\": \"config-key get\", "
+      "\"key\": \"" + key + "\""
+    "}";
+
+  bufferlist in_bl;
+  bufferlist out_bl;
+
+  int r = m_local_cluster->mon_command(cmd, in_bl, &out_bl, nullptr);
+  dout(5) << ": mon command r=" << r << dendl;
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+
+  std::string mon_host;
+  std::string cephx_key;
+  if (!r) {
+    json_spirit::mValue root;
+    if (!json_spirit::read(out_bl.to_str(), root)) {
+      derr << ": invalid config-key JSON" << dendl;
+      return -EBADMSG;
+    }
+    try {
+      auto &root_obj = root.get_obj();
+      mon_host = root_obj.at("mon_host").get_str();
+      cephx_key = root_obj.at("key").get_str();
+      dout(0) << ": remote monitor host=" << mon_host << dendl;
+    } catch (std::runtime_error&) {
+      derr << ": unexpected JSON received" << dendl;
+      return -EBADMSG;
+    }
+  }
+
+  r = connect(remote_client, remote_cluster, &m_remote_cluster, mon_host, cephx_key);
+  if (r < 0) {
+    derr << ": error connecting to remote cluster: " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  r = mount(m_remote_cluster, remote_filesystem, false, &m_remote_mount);
+  if (r < 0) {
+    m_remote_cluster.reset();
+    derr << ": error mounting remote filesystem=" << remote_filesystem << dendl;
+    return r;
+  }
+
+  std::scoped_lock locker(m_lock);
+  auto nr_replayers = g_ceph_context->_conf.get_val<uint64_t>(
+      "cephfs_mirror_max_concurrent_directory_syncs");
+  dout(20) << ": spawning " << nr_replayers << " snapshot replayer(s)" << dendl;
+
+  while (nr_replayers-- > 0) {
+    std::unique_ptr<SnapshotReplayerThread> replayer(
+        new SnapshotReplayerThread(this));
+    std::string name("replayer-" + stringify(nr_replayers));
+    replayer->create(name.c_str());
+    m_replayers.push_back(std::move(replayer));
+  }
+
+  return 0;
+}
+
+void PeerReplayer::shutdown() {
+  dout(20) << dendl;
+
+  {
+    std::scoped_lock locker(m_lock);
+    ceph_assert(!m_stopping);
+    m_stopping = true;
+    m_cond.notify_all();
+  }
+
+  for (auto &replayer : m_replayers) {
+    replayer->join();
+  }
+  m_replayers.clear();
+  ceph_unmount(m_remote_mount);
+  ceph_release(m_remote_mount);
+  m_remote_mount = nullptr;
+  m_remote_cluster.reset();
+}
+
+void PeerReplayer::add_directory(string_view dir_root) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+
+  std::scoped_lock locker(m_lock);
+  m_directories.emplace_back(dir_root);
+  m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
+  m_cond.notify_all();
+}
+
+void PeerReplayer::remove_directory(string_view dir_root) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+  auto _dir_root = std::string(dir_root);
+
+  std::scoped_lock locker(m_lock);
+  auto it = std::find(m_directories.begin(), m_directories.end(), _dir_root);
+  if (it != m_directories.end()) {
+    m_directories.erase(it);
+  }
+
+  auto it1 = m_registered.find(_dir_root);
+  if (it1 == m_registered.end()) {
+    m_snap_sync_stats.erase(_dir_root);
+  } else {
+    it1->second.canceled = true;
+  }
+  m_cond.notify_all();
+}
+
+boost::optional<std::string> PeerReplayer::pick_directory() {
+  dout(20) << dendl;
+
+  auto now = clock::now();
+  auto retry_timo = g_ceph_context->_conf.get_val<uint64_t>(
+      "cephfs_mirror_retry_failed_directories_interval");
+
+  boost::optional<std::string> candidate;
+  for (auto &dir_root : m_directories) {
+    auto &sync_stat = m_snap_sync_stats.at(dir_root);
+    if (sync_stat.failed) {
+      std::chrono::duration<double> d = now - *sync_stat.last_failed;
+      if (d.count() < retry_timo) {
+        continue;
+      }
+    }
+    if (!m_registered.count(dir_root)) {
+      candidate = dir_root;
+      break;
+    }
+  }
+
+  std::rotate(m_directories.begin(), m_directories.begin() + 1, m_directories.end());
+  return candidate;
+}
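+// Illustrative note (not part of the change): pick_directory() walks the
+// directory list, skips entries that failed within the retry interval or
+// are already registered to a replayer thread, and then rotates the list
+// by one so the scan starts elsewhere next time.  The rotation in
+// isolation:
+//
+//   std::vector<std::string> dirs = {"/a", "/b", "/c"};
+//   std::rotate(dirs.begin(), dirs.begin() + 1, dirs.end());
+//   // dirs is now {"/b", "/c", "/a"} -- a simple round-robin cursor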
dout(5) << ": dir_root=" << dir_root << " registered with replayer=" + << replayer << dendl; + m_registered.emplace(dir_root, std::move(registry)); + return 0; +} + +void PeerReplayer::unregister_directory(const std::string &dir_root) { + dout(20) << ": dir_root=" << dir_root << dendl; + + auto it = m_registered.find(dir_root); + ceph_assert(it != m_registered.end()); + + unlock_directory(it->first, it->second); + m_registered.erase(it); + if (std::find(m_directories.begin(), m_directories.end(), dir_root) == m_directories.end()) { + m_snap_sync_stats.erase(dir_root); + } +} + +int PeerReplayer::try_lock_directory(const std::string &dir_root, + SnapshotReplayerThread *replayer, DirRegistry *registry) { + dout(20) << ": dir_root=" << dir_root << dendl; + + int r = ceph_open(m_remote_mount, dir_root.c_str(), O_RDONLY | O_DIRECTORY, 0); + if (r < 0 && r != -ENOENT) { + derr << ": failed to open remote dir_root=" << dir_root << ": " << cpp_strerror(r) + << dendl; + return r; + } + + if (r == -ENOENT) { + // we snap under dir_root, so mode does not matter much + r = ceph_mkdirs(m_remote_mount, dir_root.c_str(), 0755); + if (r < 0) { + derr << ": failed to create remote directory=" << dir_root << ": " << cpp_strerror(r) + << dendl; + return r; + } + + r = ceph_open(m_remote_mount, dir_root.c_str(), O_RDONLY | O_DIRECTORY, 0); + if (r < 0) { + derr << ": failed to open remote dir_root=" << dir_root << ": " << cpp_strerror(r) + << dendl; + return r; + } + } + + int fd = r; + r = ceph_flock(m_remote_mount, fd, LOCK_EX | LOCK_NB, (uint64_t)replayer->get_thread_id()); + if (r != 0) { + if (r == -EWOULDBLOCK) { + dout(5) << ": dir_root=" << dir_root << " is locked by cephfs-mirror, " + << "will retry again" << dendl; + } else { + derr << ": failed to lock dir_root=" << dir_root << ": " << cpp_strerror(r) + << dendl; + } + + if (ceph_close(m_remote_mount, fd) < 0) { + derr << ": failed to close (cleanup) remote dir_root=" << dir_root << ": " + << cpp_strerror(r) << dendl; + } + return r; + } + + dout(10) << ": dir_root=" << dir_root << " locked" << dendl; + + registry->fd = fd; + registry->replayer = replayer; + return 0; +} + +void PeerReplayer::unlock_directory(const std::string &dir_root, const DirRegistry ®istry) { + dout(20) << ": dir_root=" << dir_root << dendl; + + int r = ceph_flock(m_remote_mount, registry.fd, LOCK_UN, + (uint64_t)registry.replayer->get_thread_id()); + if (r < 0) { + derr << ": failed to unlock remote dir_root=" << dir_root << ": " << cpp_strerror(r) + << dendl; + return; + } + + r = ceph_close(m_remote_mount, registry.fd); + if (r < 0) { + derr << ": failed to close remote dir_root=" << dir_root << ": " << cpp_strerror(r) + << dendl; + } + + dout(10) << ": dir_root=" << dir_root << " unlocked" << dendl; +} + +int PeerReplayer::build_snap_map(const std::string &dir_root, + std::map *snap_map, bool is_remote) { + auto snap_dir = snapshot_dir_path(m_cct, dir_root); + dout(20) << ": dir_root=" << dir_root << ", snap_dir=" << snap_dir + << ", is_remote=" << is_remote << dendl; + + auto lr_str = is_remote ? "remote" : "local"; + auto mnt = is_remote ? 
+
+  ceph_dir_result *dirp = nullptr;
+  int r = ceph_opendir(mnt, snap_dir.c_str(), &dirp);
+  if (r < 0) {
+    if (is_remote && r == -ENOENT) {
+      return 0;
+    }
+    derr << ": failed to open " << lr_str << " snap directory=" << snap_dir
+         << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  std::set<std::string> snaps;
+  auto entry = ceph_readdir(mnt, dirp);
+  while (entry != NULL) {
+    auto d_name = std::string(entry->d_name);
+    dout(20) << ": entry=" << d_name << dendl;
+    if (d_name != "." && d_name != ".." && d_name.rfind("_", 0) != 0) {
+      snaps.emplace(d_name);
+    }
+
+    entry = ceph_readdir(mnt, dirp);
+  }
+
+  int rv = 0;
+  for (auto &snap : snaps) {
+    snap_info info;
+    auto snap_path = snapshot_path(snap_dir, snap);
+    r = ceph_get_snap_info(mnt, snap_path.c_str(), &info);
+    if (r < 0) {
+      derr << ": failed to fetch " << lr_str << " snap info for snap_path=" << snap_path
+           << ": " << cpp_strerror(r) << dendl;
+      rv = r;
+      break;
+    }
+
+    uint64_t snap_id;
+    if (is_remote) {
+      if (!info.nr_snap_metadata) {
+        derr << ": snap_path=" << snap_path << " has invalid metadata in remote snapshot"
+             << dendl;
+        rv = -EINVAL;
+      } else {
+        auto metadata = decode_snap_metadata(info.snap_metadata, info.nr_snap_metadata);
+        dout(20) << ": snap_path=" << snap_path << ", metadata=" << metadata << dendl;
+        auto it = metadata.find(PRIMARY_SNAP_ID_KEY);
+        if (it == metadata.end()) {
+          derr << ": snap_path=" << snap_path << " has missing \"" << PRIMARY_SNAP_ID_KEY
+               << "\" in metadata" << dendl;
+          rv = -EINVAL;
+        } else {
+          snap_id = std::stoull(it->second);
+        }
+        ceph_free_snap_info_buffer(&info);
+      }
+    } else {
+      snap_id = info.id;
+    }
+
+    if (rv != 0) {
+      break;
+    }
+    snap_map->emplace(snap_id, snap);
+  }
+
+  r = ceph_closedir(mnt, dirp);
+  if (r < 0) {
+    derr << ": failed to close " << lr_str << " snap directory=" << snap_dir
+         << ": " << cpp_strerror(r) << dendl;
+  }
+
+  dout(10) << ": " << lr_str << " snap_map=" << *snap_map << dendl;
+  return rv;
+}
+
+int PeerReplayer::propagate_snap_deletes(const std::string &dir_root,
+                                         const std::set<std::string> &snaps) {
+  dout(5) << ": dir_root=" << dir_root << ", deleted snapshots=" << snaps << dendl;
+
+  for (auto &snap : snaps) {
+    dout(20) << ": deleting dir_root=" << dir_root << ", snapshot=" << snap
+             << dendl;
+    int r = ceph_rmsnap(m_remote_mount, dir_root.c_str(), snap.c_str());
+    if (r < 0) {
+      derr << ": failed to delete remote snap dir_root=" << dir_root
+           << ", snapshot=" << snap << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+    inc_deleted_snap(dir_root);
+  }
+
+  return 0;
+}
+
+int PeerReplayer::propagate_snap_renames(
+    const std::string &dir_root,
+    const std::set<std::pair<std::string,std::string>> &snaps) {
+  dout(10) << ": dir_root=" << dir_root << ", renamed snapshots=" << snaps << dendl;
+
+  for (auto &snapp : snaps) {
+    auto from = snapshot_path(m_cct, dir_root, snapp.first);
+    auto to = snapshot_path(m_cct, dir_root, snapp.second);
+    dout(20) << ": renaming dir_root=" << dir_root << ", snapshot from="
+             << from << ", to=" << to << dendl;
+    int r = ceph_rename(m_remote_mount, from.c_str(), to.c_str());
+    if (r < 0) {
+      derr << ": failed to rename remote snap dir_root=" << dir_root
+           << ", snapshot from=" << from << ", to=" << to << ": "
+           << cpp_strerror(r) << dendl;
+      return r;
+    }
+    inc_renamed_snap(dir_root);
+  }
+
+  return 0;
+}
+int PeerReplayer::remote_mkdir(const std::string &epath, const struct ceph_statx &stx,
+                               const FHandles &fh) {
+  dout(10) << ": remote epath=" << epath << dendl;
+
+  int r = ceph_mkdirat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFDIR);
+  if (r < 0 && r != -EEXIST) {
+    derr << ": failed to create remote directory=" << epath << ": " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  r = ceph_chownat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_uid, stx.stx_gid,
+                   AT_SYMLINK_NOFOLLOW);
+  if (r < 0) {
+    derr << ": failed to chown remote directory=" << epath << ": " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  r = ceph_chmodat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFMT,
+                   AT_SYMLINK_NOFOLLOW);
+  if (r < 0) {
+    derr << ": failed to chmod remote directory=" << epath << ": " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  struct timespec times[] = {{stx.stx_atime.tv_sec, stx.stx_atime.tv_nsec},
+                             {stx.stx_mtime.tv_sec, stx.stx_mtime.tv_nsec}};
+  r = ceph_utimensat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), times, AT_SYMLINK_NOFOLLOW);
+  if (r < 0) {
+    derr << ": failed to change [am]time on remote directory=" << epath << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+#define NR_IOVECS 8 // # iovecs
+#define IOVEC_SIZE (8 * 1024 * 1024) // buffer size for each iovec
+int PeerReplayer::copy_to_remote(const std::string &dir_root, const std::string &epath,
+                                 const struct ceph_statx &stx, const FHandles &fh) {
+  dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << dendl;
+  int l_fd;
+  int r_fd;
+  void *ptr;
+  struct iovec iov[NR_IOVECS];
+
+  int r = ceph_openat(m_local_mount, fh.c_fd, epath.c_str(), O_RDONLY | O_NOFOLLOW, 0);
+  if (r < 0) {
+    derr << ": failed to open local file path=" << epath << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  l_fd = r;
+  r = ceph_openat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(),
+                  O_CREAT | O_TRUNC | O_WRONLY | O_NOFOLLOW, stx.stx_mode);
+  if (r < 0) {
+    derr << ": failed to create remote file path=" << epath << ": "
+         << cpp_strerror(r) << dendl;
+    goto close_local_fd;
+  }
+
+  r_fd = r;
+  ptr = malloc(NR_IOVECS * IOVEC_SIZE);
+  if (!ptr) {
+    r = -ENOMEM;
+    derr << ": failed to allocate memory" << dendl;
+    goto close_remote_fd;
+  }
+
+  while (true) {
+    if (should_backoff(dir_root, &r)) {
+      dout(0) << ": backing off r=" << r << dendl;
+      break;
+    }
+
+    for (int i = 0; i < NR_IOVECS; ++i) {
+      iov[i].iov_base = (char*)ptr + IOVEC_SIZE*i;
+      iov[i].iov_len = IOVEC_SIZE;
+    }
+
+    r = ceph_preadv(m_local_mount, l_fd, iov, NR_IOVECS, -1);
+    if (r < 0) {
+      derr << ": failed to read local file path=" << epath << ": "
+           << cpp_strerror(r) << dendl;
+      break;
+    }
+    if (r == 0) {
+      break;
+    }
+
+    int iovs = (int)(r / IOVEC_SIZE);
+    int t = r % IOVEC_SIZE;
+    if (t) {
+      iov[iovs].iov_len = t;
+      ++iovs;
+    }
+
+    r = ceph_pwritev(m_remote_mount, r_fd, iov, iovs, -1);
+    if (r < 0) {
+      derr << ": failed to write remote file path=" << epath << ": "
+           << cpp_strerror(r) << dendl;
+      break;
+    }
+  }
+
+  if (r == 0) {
+    r = ceph_fsync(m_remote_mount, r_fd, 0);
+    if (r < 0) {
+      derr << ": failed to sync data for file path=" << epath << ": "
+           << cpp_strerror(r) << dendl;
+    }
+  }
+
+  free(ptr);
+
+close_remote_fd:
+  if (ceph_close(m_remote_mount, r_fd) < 0) {
+    derr << ": failed to close remote fd path=" << epath << ": " << cpp_strerror(r)
+         << dendl;
+    return -EINVAL;
+  }
+
+close_local_fd:
+  if (ceph_close(m_local_mount, l_fd) < 0) {
+    derr << ": failed to close local fd path=" << epath << ": " << cpp_strerror(r)
+         << dendl;
+    return -EINVAL;
+  }
+
+  return r == 0 ? 0 : r;
+}
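+// Illustrative note (not part of the change): copy_to_remote() streams a
+// file in NR_IOVECS x IOVEC_SIZE chunks -- read a batch of iovecs from
+// the local snapshot, trim the last iovec to the short-read length, and
+// write the batch to the remote file.  The length arithmetic on its own:
+//
+//   // r = bytes returned by ceph_preadv()
+//   int full = r / IOVEC_SIZE;   // iovecs completely filled
+//   int tail = r % IOVEC_SIZE;   // bytes in the final, partial iovec
+//   if (tail) {
+//     iov[full].iov_len = tail;  // shrink the last iovec
+//     ++full;
+//   }
+//   // pass 'full' as the iovec count to ceph_pwritev()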
+int PeerReplayer::remote_file_op(const std::string &dir_root, const std::string &epath,
+                                 const struct ceph_statx &stx, const FHandles &fh,
+                                 bool need_data_sync, bool need_attr_sync) {
+  dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << ", need_data_sync=" << need_data_sync
+           << ", need_attr_sync=" << need_attr_sync << dendl;
+
+  int r;
+  if (need_data_sync) {
+    if (S_ISREG(stx.stx_mode)) {
+      r = copy_to_remote(dir_root, epath, stx, fh);
+      if (r < 0) {
+        derr << ": failed to copy path=" << epath << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else if (S_ISLNK(stx.stx_mode)) {
+      // free the remote link before relinking
+      r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), 0);
+      if (r < 0 && r != -ENOENT) {
+        derr << ": failed to remove remote symlink=" << epath << dendl;
+        return r;
+      }
+      char *target = (char *)alloca(stx.stx_size+1);
+      r = ceph_readlinkat(m_local_mount, fh.c_fd, epath.c_str(), target, stx.stx_size);
+      if (r < 0) {
+        derr << ": failed to readlink local path=" << epath << ": " << cpp_strerror(r)
+             << dendl;
+        return r;
+      }
+
+      target[stx.stx_size] = '\0';
+      r = ceph_symlinkat(m_remote_mount, target, fh.r_fd_dir_root, epath.c_str());
+      if (r < 0 && r != -EEXIST) {
+        derr << ": failed to symlink remote path=" << epath << " to target=" << target
+             << ": " << cpp_strerror(r) << dendl;
+        return r;
+      }
+    } else {
+      dout(5) << ": skipping entry=" << epath << ": unsupported mode=" << stx.stx_mode
+              << dendl;
+      return 0;
+    }
+  }
+
+  if (need_attr_sync) {
+    r = ceph_chownat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_uid, stx.stx_gid,
+                     AT_SYMLINK_NOFOLLOW);
+    if (r < 0) {
+      derr << ": failed to chown remote directory=" << epath << ": " << cpp_strerror(r)
+           << dendl;
+      return r;
+    }
+
+    r = ceph_chmodat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFMT,
+                     AT_SYMLINK_NOFOLLOW);
+    if (r < 0) {
+      derr << ": failed to chmod remote directory=" << epath << ": " << cpp_strerror(r)
+           << dendl;
+      return r;
+    }
+
+    struct timespec times[] = {{stx.stx_atime.tv_sec, stx.stx_atime.tv_nsec},
+                               {stx.stx_mtime.tv_sec, stx.stx_mtime.tv_nsec}};
+    r = ceph_utimensat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), times, AT_SYMLINK_NOFOLLOW);
+    if (r < 0) {
+      derr << ": failed to change [am]time on remote directory=" << epath << ": "
+           << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
+
+int PeerReplayer::cleanup_remote_dir(const std::string &dir_root,
+                                     const std::string &epath, const FHandles &fh) {
+  dout(20) << ": dir_root=" << dir_root << ", epath=" << epath
+           << dendl;
+
+  struct ceph_statx tstx;
+  int r = ceph_statxat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), &tstx,
+                       CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+                       CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+                       AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+  if (r < 0) {
+    derr << ": failed to stat remote directory=" << epath << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  ceph_dir_result *tdirp;
+  r = opendirat(m_remote_mount, fh.r_fd_dir_root, epath, AT_SYMLINK_NOFOLLOW,
+                &tdirp);
+  if (r < 0) {
+    derr << ": failed to open remote directory=" << epath << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  std::stack<SyncEntry> rm_stack;
+  rm_stack.emplace(SyncEntry(epath, tdirp, tstx));
+  while (!rm_stack.empty()) {
+    if (should_backoff(dir_root, &r)) {
+      dout(0) << ": backing off r=" << r << dendl;
+      break;
+    }
+
+    dout(20) << ": " << rm_stack.size() << " entries in stack" << dendl;
+    std::string e_name;
+    auto &entry = rm_stack.top();
+    dout(20) << ": top of stack path=" << entry.epath << dendl;
+    if (entry.is_directory()) {
+      struct ceph_statx stx;
+      struct dirent de;
+      while (true) {
+        r = ceph_readdirplus_r(m_remote_mount, entry.dirp, &de, &stx,
+                               CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW, NULL);
+        if (r < 0) {
+          derr << ": failed to read remote directory=" << entry.epath << dendl;
+          break;
+        }
+        if (r == 0) {
+          break;
+        }
+
+        auto d_name = std::string(de.d_name);
+        if (d_name != "." && d_name != "..") {
+          e_name = d_name;
+          break;
+        }
+      }
+
+      if (r == 0) {
+        r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, entry.epath.c_str(), AT_REMOVEDIR);
+        if (r < 0) {
+          derr << ": failed to remove remote directory=" << entry.epath << ": "
+               << cpp_strerror(r) << dendl;
+          break;
+        }
+
+        dout(10) << ": done for remote directory=" << entry.epath << dendl;
+        if (ceph_closedir(m_remote_mount, entry.dirp) < 0) {
+          derr << ": failed to close remote directory=" << entry.epath << dendl;
+        }
+        rm_stack.pop();
+        continue;
+      }
+      if (r < 0) {
+        break;
+      }
+
+      auto epath = entry_path(entry.epath, e_name);
+      if (S_ISDIR(stx.stx_mode)) {
+        ceph_dir_result *dirp;
+        r = opendirat(m_remote_mount, fh.r_fd_dir_root, epath, AT_SYMLINK_NOFOLLOW,
+                      &dirp);
+        if (r < 0) {
+          derr << ": failed to open remote directory=" << epath << ": "
+               << cpp_strerror(r) << dendl;
+          break;
+        }
+        rm_stack.emplace(SyncEntry(epath, dirp, stx));
+      } else {
+        rm_stack.emplace(SyncEntry(epath, stx));
+      }
+    } else {
+      r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, entry.epath.c_str(), 0);
+      if (r < 0) {
+        derr << ": failed to remove remote directory=" << entry.epath << ": "
+             << cpp_strerror(r) << dendl;
+        break;
+      }
+      dout(10) << ": done for remote file=" << entry.epath << dendl;
+      rm_stack.pop();
+    }
+  }
+
+  while (!rm_stack.empty()) {
+    auto &entry = rm_stack.top();
+    if (entry.is_directory()) {
+      dout(20) << ": closing remote directory=" << entry.epath << dendl;
+      if (ceph_closedir(m_remote_mount, entry.dirp) < 0) {
+        derr << ": failed to close remote directory=" << entry.epath << dendl;
+      }
+    }
+
+    rm_stack.pop();
+  }
+
+  return r;
+}
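+// Illustrative note (not part of the change): cleanup_remote_dir()
+// deletes a tree without recursion -- directories stay on an explicit
+// stack until a readdir pass finds no children, at which point they are
+// unlinked with AT_REMOVEDIR and popped.  The shape of the traversal,
+// with hypothetical Entry/next_child helpers:
+//
+//   std::stack<Entry> st;                // Entry = path (+ dirp if dir)
+//   st.push(root);
+//   while (!st.empty()) {
+//     auto &e = st.top();
+//     if (!e.is_dir)          { unlink(e); st.pop(); }
+//     else if (next_child(e)) { st.push(child); }     // descend first
+//     else                    { rmdir(e);  st.pop(); }
+//   }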
+ dout(5) << ": entry=" << epath << ", r=" << r << dendl; + *need_data_sync = true; + *need_attr_sync = true; + return 0; + } + + dout(10) << ": local cur statx: mode=" << cstx.stx_mode << ", uid=" << cstx.stx_uid + << ", gid=" << cstx.stx_gid << ", size=" << cstx.stx_size << ", ctime=" + << cstx.stx_ctime << ", mtime=" << cstx.stx_mtime << dendl; + dout(10) << ": local prev statx: mode=" << pstx.stx_mode << ", uid=" << pstx.stx_uid + << ", gid=" << pstx.stx_gid << ", size=" << pstx.stx_size << ", ctime=" + << pstx.stx_ctime << ", mtime=" << pstx.stx_mtime << dendl; + if ((cstx.stx_mode & S_IFMT) != (pstx.stx_mode & S_IFMT)) { + dout(5) << ": entry=" << epath << " has mode mismatch" << dendl; + *need_data_sync = true; + *need_attr_sync = true; + } else { + *need_data_sync = (cstx.stx_size != pstx.stx_size) || (cstx.stx_mtime != pstx.stx_mtime); + *need_attr_sync = (cstx.stx_ctime != pstx.stx_ctime); + } + + return 0; +} + +int PeerReplayer::propagate_deleted_entries(const std::string &dir_root, + const std::string &epath, const FHandles &fh) { + dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << dendl; + + ceph_dir_result *dirp; + int r = opendirat(fh.p_mnt, fh.p_fd, epath, AT_SYMLINK_NOFOLLOW, &dirp); + if (r < 0) { + if (r == -ELOOP) { + dout(5) << ": epath=" << epath << " is a symbolic link -- mode sync" + << " done when traversing parent" << dendl; + return 0; + } + if (r == -ENOTDIR) { + dout(5) << ": epath=" << epath << " is not a directory -- mode sync" + << " done when traversing parent" << dendl; + return 0; + } + if (r == -ENOENT) { + dout(5) << ": epath=" << epath << " missing in previous-snap/remote dir-root" + << dendl; + } + return r; + } + + struct dirent *dire = (struct dirent *)alloca(512 * sizeof(struct dirent)); + while (true) { + if (should_backoff(dir_root, &r)) { + dout(0) << ": backing off r=" << r << dendl; + break; + } + + int len = ceph_getdents(fh.p_mnt, dirp, (char *)dire, 512); + if (len < 0) { + derr << ": failed to read directory entries: " << cpp_strerror(len) << dendl; + r = len; + // flip errno to signal that we got an err (possible the + // snapshot getting deleted in midst). + if (r == -ENOENT) { + r = -EINVAL; + } + break; + } + if (len == 0) { + dout(10) << ": reached EOD" << dendl; + break; + } + int nr = len / sizeof(struct dirent); + for (int i = 0; i < nr; ++i) { + if (should_backoff(dir_root, &r)) { + dout(0) << ": backing off r=" << r << dendl; + break; + } + std::string d_name = std::string(dire[i].d_name); + if (d_name == "." || d_name == "..") { + continue; + } + + struct ceph_statx pstx; + auto dpath = entry_path(epath, d_name); + r = ceph_statxat(fh.p_mnt, fh.p_fd, dpath.c_str(), &pstx, + CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW); + if (r < 0) { + derr << ": failed to stat (prev) directory=" << dpath << ": " + << cpp_strerror(r) << dendl; + // flip errno to signal that we got an err (possible the + // snapshot getting deleted in midst). 
+        if (r == -ENOENT) {
+          r = -EINVAL;
+        }
+        return r;
+      }
+
+      struct ceph_statx cstx;
+      r = ceph_statxat(m_local_mount, fh.c_fd, dpath.c_str(), &cstx,
+                       CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+      if (r < 0 && r != -ENOENT) {
+        derr << ": failed to stat local (cur) directory=" << dpath << ": "
+             << cpp_strerror(r) << dendl;
+        return r;
+      }
+
+      bool purge_remote = true;
+      if (r == 0) {
+        // directory entry present in both snapshots -- check inode
+        // type
+        if ((pstx.stx_mode & S_IFMT) == (cstx.stx_mode & S_IFMT)) {
+          dout(5) << ": mode matches for entry=" << d_name << dendl;
+          purge_remote = false;
+        } else {
+          dout(5) << ": mode mismatch for entry=" << d_name << dendl;
+        }
+      } else {
+        dout(5) << ": entry=" << d_name << " missing in current snapshot" << dendl;
+      }
+
+      if (purge_remote) {
+        dout(5) << ": purging remote entry=" << dpath << dendl;
+        if (S_ISDIR(pstx.stx_mode)) {
+          r = cleanup_remote_dir(dir_root, dpath, fh);
+        } else {
+          r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, dpath.c_str(), 0);
+        }
+
+        if (r < 0 && r != -ENOENT) {
+          derr << ": failed to cleanup remote entry=" << d_name << ": "
+               << cpp_strerror(r) << dendl;
+          return r;
+        }
+      }
+    }
+  }
+
+  ceph_closedir(fh.p_mnt, dirp);
+  return r;
+}
+
+int PeerReplayer::open_dir(MountRef mnt, const std::string &dir_path,
+                           boost::optional<uint64_t> snap_id) {
+  dout(20) << ": dir_path=" << dir_path << dendl;
+  if (snap_id) {
+    dout(20) << ": expected snapshot id=" << *snap_id << dendl;
+  }
+
+  int fd = ceph_open(mnt, dir_path.c_str(), O_DIRECTORY | O_RDONLY, 0);
+  if (fd < 0) {
+    derr << ": cannot open dir_path=" << dir_path << ": " << cpp_strerror(fd)
+         << dendl;
+    return fd;
+  }
+
+  if (!snap_id) {
+    return fd;
+  }
+
+  snap_info info;
+  int r = ceph_get_snap_info(mnt, dir_path.c_str(), &info);
+  if (r < 0) {
+    derr << ": failed to fetch snap_info for path=" << dir_path
+         << ": " << cpp_strerror(r) << dendl;
+    ceph_close(mnt, fd);
+    return r;
+  }
+
+  if (info.id != *snap_id) {
+    dout(5) << ": got mismatching snapshot id for path=" << dir_path << " (" << info.id
+            << " vs " << *snap_id << ") -- possible recreate" << dendl;
+    ceph_close(mnt, fd);
+    return -EINVAL;
+  }
+
+  return fd;
+}
+
+int PeerReplayer::pre_sync_check_and_open_handles(
+    const std::string &dir_root,
+    const Snapshot &current, boost::optional<Snapshot> prev,
+    FHandles *fh) {
+  dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+  if (prev) {
+    dout(20) << ": prev=" << prev << dendl;
+  }
+
+  auto cur_snap_path = snapshot_path(m_cct, dir_root, current.first);
+  auto fd = open_dir(m_local_mount, cur_snap_path, current.second);
+  if (fd < 0) {
+    return fd;
+  }
+
+  // current snapshot file descriptor
+  fh->c_fd = fd;
+
+  MountRef mnt;
+  if (prev) {
+    mnt = m_local_mount;
+    auto prev_snap_path = snapshot_path(m_cct, dir_root, (*prev).first);
+    fd = open_dir(mnt, prev_snap_path, (*prev).second);
+  } else {
+    mnt = m_remote_mount;
+    fd = open_dir(mnt, dir_root, boost::none);
+  }
+
+  if (fd < 0) {
+    if (!prev || fd != -ENOENT) {
+      ceph_close(m_local_mount, fh->c_fd);
+      return fd;
+    }
+
+    // ENOENT of previous snap
+    dout(5) << ": previous snapshot=" << *prev << " missing" << dendl;
+    mnt = m_remote_mount;
+    fd = open_dir(mnt, dir_root, boost::none);
+    if (fd < 0) {
+      ceph_close(m_local_mount, fh->c_fd);
+      return fd;
+    }
+  }
+
+  // "previous" snapshot or dir_root file descriptor
+  fh->p_fd = fd;
+  fh->p_mnt = mnt;
+
+  {
+    std::scoped_lock locker(m_lock);
+    auto it = m_registered.find(dir_root);
+    ceph_assert(it != m_registered.end());
+    fh->r_fd_dir_root = it->second.fd;
+  }
+
+  dout(5) << ": using " << ((fh->p_mnt == m_local_mount) ? "local (previous) snapshot" : "remote dir_root")
+          << " for incremental transfer" << dendl;
+  return 0;
+}
+// sync the mode of the remote dir_root with that of the local dir_root
+int PeerReplayer::sync_perms(const std::string& path) {
+  int r = 0;
+  struct ceph_statx tstx;
+
+  r = ceph_statx(m_local_mount, path.c_str(), &tstx, CEPH_STATX_MODE,
+                 AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+  if (r < 0) {
+    derr << ": failed to fetch stat for local path: "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = ceph_chmod(m_remote_mount, path.c_str(), tstx.stx_mode);
+  if (r < 0) {
+    derr << ": failed to set mode for remote path: "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+void PeerReplayer::post_sync_close_handles(const FHandles &fh) {
+  dout(20) << dendl;
+
+  // @FHandles.r_fd_dir_root is closed in @unregister_directory since
+  // it's used to acquire an exclusive lock on remote dir_root.
+  ceph_close(m_local_mount, fh.c_fd);
+  ceph_close(fh.p_mnt, fh.p_fd);
+}
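+// Illustrative note (not part of the change): an FHandles triple drives
+// one snapshot sync -- c_fd is the current local snapshot, p_fd/p_mnt is
+// either the previous local snapshot (incremental against a local scan)
+// or the remote dir_root (incremental against a remote scan), and
+// r_fd_dir_root is the locked remote dir_root.  The selection boils
+// down to:
+//
+//   // prev snapshot available and opened? -> local snap vs local snap
+//   // otherwise                           -> local snap vs remote tree
+//   fh.p_mnt = prev_opened ? m_local_mount : m_remote_mount;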
+int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &current,
+                                 boost::optional<Snapshot> prev) {
+  dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+  if (prev) {
+    dout(20) << ": incremental sync check from prev=" << prev << dendl;
+  }
+
+  FHandles fh;
+  int r = pre_sync_check_and_open_handles(dir_root, current, prev, &fh);
+  if (r < 0) {
+    dout(5) << ": cannot proceed with sync: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  BOOST_SCOPE_EXIT_ALL( (this)(&fh) ) {
+    post_sync_close_handles(fh);
+  };
+
+  // record that we are going to "dirty" the data under this
+  // directory root
+  auto snap_id_str{stringify(current.second)};
+  r = ceph_fsetxattr(m_remote_mount, fh.r_fd_dir_root, "ceph.mirror.dirty_snap_id",
+                     snap_id_str.c_str(), snap_id_str.size(), 0);
+  if (r < 0) {
+    derr << ": error setting \"ceph.mirror.dirty_snap_id\" on dir_root=" << dir_root
+         << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  struct ceph_statx tstx;
+  r = ceph_fstatx(m_local_mount, fh.c_fd, &tstx,
+                  CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+                  CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+                  AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+  if (r < 0) {
+    derr << ": failed to stat snap=" << current.first << ": " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  ceph_dir_result *tdirp;
+  r = ceph_fdopendir(m_local_mount, fh.c_fd, &tdirp);
+  if (r < 0) {
+    derr << ": failed to open local snap=" << current.first << ": " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  std::stack<SyncEntry> sync_stack;
+  sync_stack.emplace(SyncEntry(".", tdirp, tstx));
+  while (!sync_stack.empty()) {
+    if (should_backoff(dir_root, &r)) {
+      dout(0) << ": backing off r=" << r << dendl;
+      break;
+    }
+
+    dout(20) << ": " << sync_stack.size() << " entries in stack" << dendl;
+    std::string e_name;
+    auto &entry = sync_stack.top();
+    dout(20) << ": top of stack path=" << entry.epath << dendl;
+    if (entry.is_directory()) {
+      // entry is a directory -- propagate deletes for missing entries
+      // (and changed inode types) to the remote filesystem.
+      if (!entry.needs_remote_sync()) {
+        r = propagate_deleted_entries(dir_root, entry.epath, fh);
+        if (r < 0 && r != -ENOENT) {
+          derr << ": failed to propagate missing dirs: " << cpp_strerror(r) << dendl;
+          break;
+        }
+        entry.set_remote_synced();
+      }
+
+      struct ceph_statx stx;
+      struct dirent de;
+      while (true) {
+        r = ceph_readdirplus_r(m_local_mount, entry.dirp, &de, &stx,
+                               CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+                               CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+                               AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW, NULL);
+        if (r < 0) {
+          derr << ": failed to read local directory=" << entry.epath << dendl;
+          break;
+        }
+        if (r == 0) {
+          break;
+        }
+
+        auto d_name = std::string(de.d_name);
+        if (d_name != "." && d_name != "..") {
+          e_name = d_name;
+          break;
+        }
+      }
+
+      if (r == 0) {
+        dout(10) << ": done for directory=" << entry.epath << dendl;
+        if (ceph_closedir(m_local_mount, entry.dirp) < 0) {
+          derr << ": failed to close local directory=" << entry.epath << dendl;
+        }
+        sync_stack.pop();
+        continue;
+      }
+      if (r < 0) {
+        break;
+      }
+
+      auto epath = entry_path(entry.epath, e_name);
+      if (S_ISDIR(stx.stx_mode)) {
+        r = remote_mkdir(epath, stx, fh);
+        if (r < 0) {
+          break;
+        }
+        ceph_dir_result *dirp;
+        r = opendirat(m_local_mount, fh.c_fd, epath, AT_SYMLINK_NOFOLLOW, &dirp);
+        if (r < 0) {
+          derr << ": failed to open local directory=" << epath << ": "
+               << cpp_strerror(r) << dendl;
+          break;
+        }
+        sync_stack.emplace(SyncEntry(epath, dirp, stx));
+      } else {
+        sync_stack.emplace(SyncEntry(epath, stx));
+      }
+    } else {
+      bool need_data_sync = true;
+      bool need_attr_sync = true;
+      r = should_sync_entry(entry.epath, entry.stx, fh,
+                            &need_data_sync, &need_attr_sync);
+      if (r < 0) {
+        break;
+      }
+
+      dout(5) << ": entry=" << entry.epath << ", data_sync=" << need_data_sync
+              << ", attr_sync=" << need_attr_sync << dendl;
+      if (need_data_sync || need_attr_sync) {
+        r = remote_file_op(dir_root, entry.epath, entry.stx, fh, need_data_sync,
+                           need_attr_sync);
+        if (r < 0) {
+          break;
+        }
+      }
+      dout(10) << ": done for epath=" << entry.epath << dendl;
+      sync_stack.pop();
+    }
+  }
+
+  while (!sync_stack.empty()) {
+    auto &entry = sync_stack.top();
+    if (entry.is_directory()) {
+      dout(20) << ": closing local directory=" << entry.epath << dendl;
+      if (ceph_closedir(m_local_mount, entry.dirp) < 0) {
+        derr << ": failed to close local directory=" << entry.epath << dendl;
+      }
+    }
+
+    sync_stack.pop();
+  }
+
+  return r;
+}
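+// Illustrative note (not part of the change): "ceph.mirror.dirty_snap_id"
+// records which local snap-id last dirtied the remote tree.  On the next
+// sync it decides the comparison base:
+//
+//   // dirty_snap_id == prev or current -> remote matches a local
+//   //                                     snapshot; diff two local
+//   //                                     snaps (cheap local scan)
+//   // anything else (or xattr missing) -> remote state unknown; diff
+//   //                                     the local snap against the
+//   //                                     remote tree (remote scan)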
+int PeerReplayer::synchronize(const std::string &dir_root, const Snapshot &current,
+                              boost::optional<Snapshot> prev) {
+  dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+  if (prev) {
+    dout(20) << ": prev=" << prev << dendl;
+  }
+
+  int r = ceph_getxattr(m_remote_mount, dir_root.c_str(), "ceph.mirror.dirty_snap_id", nullptr, 0);
+  if (r < 0 && r != -ENODATA) {
+    derr << ": failed to fetch primary_snap_id length from dir_root=" << dir_root
+         << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // no xattr, can't determine which snap the data belongs to!
+  if (r < 0) {
+    dout(5) << ": missing \"ceph.mirror.dirty_snap_id\" xattr on remote -- using"
+            << " incremental sync with remote scan" << dendl;
+    r = do_synchronize(dir_root, current, boost::none);
+  } else {
+    size_t xlen = r;
+    char *val = (char *)alloca(xlen+1);
+    r = ceph_getxattr(m_remote_mount, dir_root.c_str(), "ceph.mirror.dirty_snap_id", (void*)val, xlen);
+    if (r < 0) {
+      derr << ": failed to fetch \"dirty_snap_id\" for dir_root: " << dir_root
+           << ": " << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    val[xlen] = '\0';
+    uint64_t dirty_snap_id = atoll(val);
+
+    dout(20) << ": dirty_snap_id: " << dirty_snap_id << " vs (" << current.second
+             << "," << (prev ? stringify((*prev).second) : "~") << ")" << dendl;
+    if (prev && (dirty_snap_id == (*prev).second || dirty_snap_id == current.second)) {
+      dout(5) << ": match -- using incremental sync with local scan" << dendl;
+      r = do_synchronize(dir_root, current, prev);
+    } else {
+      dout(5) << ": mismatch -- using incremental sync with remote scan" << dendl;
+      r = do_synchronize(dir_root, current, boost::none);
+    }
+  }
+
+  // snap sync failed -- bail out!
+  if (r < 0) {
+    return r;
+  }
+
+  auto cur_snap_id_str{stringify(current.second)};
+  snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}};
+  r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755,
+                  snap_meta, sizeof(snap_meta)/sizeof(snap_metadata));
+  if (r < 0) {
+    derr << ": failed to snap remote directory dir_root=" << dir_root
+         << ": " << cpp_strerror(r) << dendl;
+  }
+
+  return r;
+}
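+// Illustrative note (not part of the change): do_sync_snaps() below keys
+// both snap maps by the primary (local) snap-id, so the set differences
+// fall out of a single walk over the remote map:
+//
+//   // id in remote but not in local        -> snapshot was deleted
+//   // id in both but names differ          -> snapshot was renamed
+//   // local ids above the last remote id   -> snapshots left to sync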
+int PeerReplayer::do_sync_snaps(const std::string &dir_root) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+
+  std::map<uint64_t, std::string> local_snap_map;
+  std::map<uint64_t, std::string> remote_snap_map;
+
+  int r = build_snap_map(dir_root, &local_snap_map);
+  if (r < 0) {
+    derr << ": failed to build local snap map" << dendl;
+    return r;
+  }
+
+  r = build_snap_map(dir_root, &remote_snap_map, true);
+  if (r < 0) {
+    derr << ": failed to build remote snap map" << dendl;
+    return r;
+  }
+
+  // infer deleted and renamed snapshots from local and remote
+  // snap maps
+  std::set<std::string> snaps_deleted;
+  std::set<std::pair<std::string,std::string>> snaps_renamed;
+  for (auto &[primary_snap_id, snap_name] : remote_snap_map) {
+    auto it = local_snap_map.find(primary_snap_id);
+    if (it == local_snap_map.end()) {
+      snaps_deleted.emplace(snap_name);
+    } else if (it->second != snap_name) {
+      snaps_renamed.emplace(std::make_pair(snap_name, it->second));
+    }
+  }
+
+  r = propagate_snap_deletes(dir_root, snaps_deleted);
+  if (r < 0) {
+    derr << ": failed to propagate deleted snapshots" << dendl;
+    return r;
+  }
+
+  r = propagate_snap_renames(dir_root, snaps_renamed);
+  if (r < 0) {
+    derr << ": failed to propagate renamed snapshots" << dendl;
+    return r;
+  }
+
+  // start mirroring snapshots from the last snap-id synchronized
+  uint64_t last_snap_id = 0;
+  std::string last_snap_name;
+  if (!remote_snap_map.empty()) {
+    auto last = remote_snap_map.rbegin();
+    last_snap_id = last->first;
+    last_snap_name = last->second;
+    set_last_synced_snap(dir_root, last_snap_id, last_snap_name);
+  }
+
+  dout(5) << ": last snap-id transferred=" << last_snap_id << dendl;
+  auto it = local_snap_map.upper_bound(last_snap_id);
+  if (it == local_snap_map.end()) {
+    dout(20) << ": nothing to synchronize" << dendl;
+    return 0;
+  }
+
+  auto snaps_per_cycle = g_ceph_context->_conf.get_val<uint64_t>(
+      "cephfs_mirror_max_snapshot_sync_per_cycle");
+
+  dout(10) << ": synchronizing from snap-id=" << it->first << dendl;
+  for (; it != local_snap_map.end(); ++it) {
+    set_current_syncing_snap(dir_root, it->first, it->second);
+    auto start = clock::now();
+    boost::optional<Snapshot> prev = boost::none;
+    if (last_snap_id != 0) {
+      prev = std::make_pair(last_snap_name, last_snap_id);
+    }
+    r = synchronize(dir_root, std::make_pair(it->second, it->first), prev);
+    if (r < 0) {
+      derr << ": failed to synchronize dir_root=" << dir_root
+           << ", snapshot=" << it->second << dendl;
+      clear_current_syncing_snap(dir_root);
+      return r;
+    }
+    std::chrono::duration<double> duration = clock::now() - start;
+    set_last_synced_stat(dir_root, it->first, it->second, duration.count());
+    if (--snaps_per_cycle == 0) {
+      break;
+    }
+
+    last_snap_name = it->second;
+    last_snap_id = it->first;
+  }
+
+  return 0;
+}
+
+void PeerReplayer::sync_snaps(const std::string &dir_root,
+                              std::unique_lock<ceph::mutex> &locker) {
+  dout(20) << ": dir_root=" << dir_root << dendl;
+  locker.unlock();
+  int r = do_sync_snaps(dir_root);
+  if (r < 0) {
+    derr << ": failed to sync snapshots for dir_root=" << dir_root << dendl;
+  }
+  locker.lock();
+  if (r < 0) {
+    _inc_failed_count(dir_root);
+  } else {
+    _reset_failed_count(dir_root);
+  }
+}
+
+void PeerReplayer::run(SnapshotReplayerThread *replayer) {
+  dout(10) << ": snapshot replayer=" << replayer << dendl;
+
+  time last_directory_scan = clock::zero();
+  auto scan_interval = g_ceph_context->_conf.get_val<uint64_t>(
+      "cephfs_mirror_directory_scan_interval");
+
+  std::unique_lock locker(m_lock);
+  while (true) {
+    // do not check if client is blocklisted under lock
+    m_cond.wait_for(locker, 1s, [this]{return is_stopping();});
+    if (is_stopping()) {
+      dout(5) << ": exiting" << dendl;
+      break;
+    }
+
+    locker.unlock();
+
+    if (m_fs_mirror->is_blocklisted()) {
+      dout(5) << ": exiting as client is blocklisted" << dendl;
+      break;
+    }
+
+    locker.lock();
+
+    auto now = clock::now();
+    std::chrono::duration<double> timo = now - last_directory_scan;
+    if (timo.count() >= scan_interval && m_directories.size()) {
+      dout(20) << ": trying to pick from " << m_directories.size() << " directories" << dendl;
+      auto dir_root = pick_directory();
+      if (dir_root) {
+        dout(5) << ": picked dir_root=" << *dir_root << dendl;
+        int r = register_directory(*dir_root, replayer);
+        if (r == 0) {
+          r = sync_perms(*dir_root);
+          if (r < 0) {
+            _inc_failed_count(*dir_root);
+          } else {
+            sync_snaps(*dir_root, locker);
+          }
+          unregister_directory(*dir_root);
+        }
+      }
+
+      last_directory_scan = now;
+    }
+  }
+}
+void PeerReplayer::peer_status(Formatter *f) {
+  std::scoped_lock locker(m_lock);
+  f->open_object_section("stats");
+  for (auto &[dir_root, sync_stat] : m_snap_sync_stats) {
+    f->open_object_section(dir_root);
+    if (sync_stat.failed) {
+      f->dump_string("state", "failed");
+    } else if (!sync_stat.current_syncing_snap) {
+      f->dump_string("state", "idle");
+    } else {
+      f->dump_string("state", "syncing");
+      f->open_object_section("current_syncing_snap");
+      f->dump_unsigned("id", (*sync_stat.current_syncing_snap).first);
+      f->dump_string("name", (*sync_stat.current_syncing_snap).second);
+      f->close_section();
+    }
+    if (sync_stat.last_synced_snap) {
+      f->open_object_section("last_synced_snap");
+      f->dump_unsigned("id", (*sync_stat.last_synced_snap).first);
+      f->dump_string("name", (*sync_stat.last_synced_snap).second);
+      if (sync_stat.last_sync_duration) {
+        f->dump_float("sync_duration", *sync_stat.last_sync_duration);
+        f->dump_stream("sync_time_stamp") << sync_stat.last_synced;
+      }
+      f->close_section();
+    }
+    f->dump_unsigned("snaps_synced", sync_stat.synced_snap_count);
+    f->dump_unsigned("snaps_deleted", sync_stat.deleted_snap_count);
+    f->dump_unsigned("snaps_renamed", sync_stat.renamed_snap_count);
+    f->close_section(); // dir_root
+  }
+  f->close_section(); // stats
+}
+
+void PeerReplayer::reopen_logs() {
+  std::scoped_lock locker(m_lock);
+
+  if (m_remote_cluster) {
+    reinterpret_cast<CephContext *>(m_remote_cluster->cct())->reopen_logs();
+  }
+}
+
+} // namespace mirror
+} // namespace cephfs
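+// Illustrative note (not part of the change): peer_status() emits one
+// object per directory root; a hypothetical two-directory status could
+// look like:
+//
+//   {"stats": {
+//     "/vol/a": {"state": "idle",
+//                "last_synced_snap": {"id": 12, "name": "s1",
+//                                     "sync_duration": 3.2,
+//                                     "sync_time_stamp": "..."},
+//                "snaps_synced": 4, "snaps_deleted": 0, "snaps_renamed": 1},
+//     "/vol/b": {"state": "syncing",
+//                "current_syncing_snap": {"id": 27, "name": "s9"},
+//                "snaps_synced": 2, "snaps_deleted": 1, "snaps_renamed": 0}
+//   }}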
diff --git a/src/tools/cephfs_mirror/PeerReplayer.h b/src/tools/cephfs_mirror/PeerReplayer.h
new file mode 100644
index 000000000..0511d154a
--- /dev/null
+++ b/src/tools/cephfs_mirror/PeerReplayer.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_PEER_REPLAYER_H
+#define CEPHFS_MIRROR_PEER_REPLAYER_H
+
+#include "common/Formatter.h"
+#include "common/Thread.h"
+#include "mds/FSMap.h"
+#include "ServiceDaemon.h"
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+class FSMirror;
+class PeerReplayerAdminSocketHook;
+
+class PeerReplayer {
+public:
+  PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
+               RadosRef local_cluster, const Filesystem &filesystem,
+               const Peer &peer, const std::set<std::string, std::less<>> &directories,
+               MountRef mount, ServiceDaemon *service_daemon);
+  ~PeerReplayer();
+
+  // initialize replayer for a peer
+  int init();
+
+  // shutdown replayer for a peer
+  void shutdown();
+
+  // add a directory to mirror queue
+  void add_directory(std::string_view dir_root);
+
+  // remove a directory from queue
+  void remove_directory(std::string_view dir_root);
+
+  // admin socket helpers
+  void peer_status(Formatter *f);
+
+  // reopen logs
+  void reopen_logs();
+
+private:
+  inline static const std::string PRIMARY_SNAP_ID_KEY = "primary_snap_id";
+
+  inline static const std::string SERVICE_DAEMON_FAILED_DIR_COUNT_KEY = "failure_count";
+  inline static const std::string SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY = "recovery_count";
+
+  using Snapshot = std::pair<std::string, uint64_t>;
+
+  // file descriptor "triplet" for synchronizing a snapshot
+  // w/ an added MountRef for accessing "previous" snapshot.
+  struct FHandles {
+    // open file descriptor on the snap directory for snapshot
+    // currently being synchronized. Always use this fd with
+    // @m_local_mount.
+    int c_fd;
+
+    // open file descriptor on the "previous" snapshot or on
+    // dir_root on remote filesystem (based on if the snapshot
+    // can be used for incremental transfer). Always use this
+    // fd with p_mnt which either points to @m_local_mount
+    // (for local incremental comparison) or @m_remote_mount
+    // (for remote incremental comparison).
+    int p_fd;
+    MountRef p_mnt;
+
+    // open file descriptor on dir_root on remote filesystem.
+    // Always use this fd with @m_remote_mount.
+    int r_fd_dir_root;
+  };
+
+  bool is_stopping() {
+    return m_stopping;
+  }
+
+  struct Replayer;
+  class SnapshotReplayerThread : public Thread {
+  public:
+    SnapshotReplayerThread(PeerReplayer *peer_replayer)
+      : m_peer_replayer(peer_replayer) {
+    }
+
+    void *entry() override {
+      m_peer_replayer->run(this);
+      return 0;
+    }
+
+  private:
+    PeerReplayer *m_peer_replayer;
+  };
+
+  struct DirRegistry {
+    int fd;
+    bool canceled = false;
+    SnapshotReplayerThread *replayer;
+  };
+
+  struct SyncEntry {
+    std::string epath;
+    ceph_dir_result *dirp; // valid for directories
+    struct ceph_statx stx;
+    // set by incremental sync _after_ ensuring missing entries
+    // in the currently synced snapshot have been propagated to
+    // the remote filesystem.
+    bool remote_synced = false;
+
+    SyncEntry(std::string_view path,
+              const struct ceph_statx &stx)
+      : epath(path),
+        stx(stx) {
+    }
+    SyncEntry(std::string_view path,
+              ceph_dir_result *dirp,
+              const struct ceph_statx &stx)
+      : epath(path),
+        dirp(dirp),
+        stx(stx) {
+    }
+
+    bool is_directory() const {
+      return S_ISDIR(stx.stx_mode);
+    }
+
+    bool needs_remote_sync() const {
+      return remote_synced;
+    }
+    void set_remote_synced() {
+      remote_synced = true;
+    }
+  };
+
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+
+  // stats sent to service daemon
+  struct ServiceDaemonStats {
+    uint64_t failed_dir_count = 0;
+    uint64_t recovered_dir_count = 0;
+  };
+
+  struct SnapSyncStat {
+    uint64_t nr_failures = 0; // number of consecutive failures
+    boost::optional