summaryrefslogtreecommitdiffstats
path: root/src/tools/cephfs
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/tools/cephfs/CMakeLists.txt58
-rw-r--r--src/tools/cephfs/DataScan.cc2239
-rw-r--r--src/tools/cephfs/DataScan.h341
-rw-r--r--src/tools/cephfs/Dumper.cc431
-rw-r--r--src/tools/cephfs/Dumper.h45
-rw-r--r--src/tools/cephfs/EventOutput.cc153
-rw-r--r--src/tools/cephfs/EventOutput.h42
-rw-r--r--src/tools/cephfs/JournalFilter.cc315
-rw-r--r--src/tools/cephfs/JournalFilter.h73
-rw-r--r--src/tools/cephfs/JournalScanner.cc438
-rw-r--r--src/tools/cephfs/JournalScanner.h133
-rw-r--r--src/tools/cephfs/JournalTool.cc1266
-rw-r--r--src/tools/cephfs/JournalTool.h101
-rw-r--r--src/tools/cephfs/MDSUtility.cc155
-rw-r--r--src/tools/cephfs/MDSUtility.h60
-rw-r--r--src/tools/cephfs/MetaTool.cc999
-rw-r--r--src/tools/cephfs/MetaTool.h272
-rw-r--r--src/tools/cephfs/PgFiles.cc194
-rw-r--r--src/tools/cephfs/PgFiles.h51
-rw-r--r--src/tools/cephfs/Resetter.cc220
-rw-r--r--src/tools/cephfs/Resetter.h50
-rw-r--r--src/tools/cephfs/RoleSelector.cc59
-rw-r--r--src/tools/cephfs/RoleSelector.h36
-rw-r--r--src/tools/cephfs/TableTool.cc417
-rw-r--r--src/tools/cephfs/TableTool.h40
-rw-r--r--src/tools/cephfs/cephfs-data-scan.cc47
-rw-r--r--src/tools/cephfs/cephfs-journal-tool.cc58
-rw-r--r--src/tools/cephfs/cephfs-meta-injection.cc97
-rw-r--r--src/tools/cephfs/cephfs-table-tool.cc47
-rw-r--r--src/tools/cephfs/shell/CMakeLists.txt7
-rwxr-xr-xsrc/tools/cephfs/shell/cephfs-shell1684
-rw-r--r--src/tools/cephfs/shell/setup.py27
-rw-r--r--src/tools/cephfs/shell/tox.ini7
-rw-r--r--src/tools/cephfs/top/CMakeLists.txt7
-rwxr-xr-xsrc/tools/cephfs/top/cephfs-top888
-rw-r--r--src/tools/cephfs/top/setup.py25
-rw-r--r--src/tools/cephfs/top/tox.ini7
-rw-r--r--src/tools/cephfs/type_helper.hpp28
-rw-r--r--src/tools/cephfs_mirror/CMakeLists.txt30
-rw-r--r--src/tools/cephfs_mirror/ClusterWatcher.cc182
-rw-r--r--src/tools/cephfs_mirror/ClusterWatcher.h77
-rw-r--r--src/tools/cephfs_mirror/FSMirror.cc441
-rw-r--r--src/tools/cephfs_mirror/FSMirror.h158
-rw-r--r--src/tools/cephfs_mirror/InstanceWatcher.cc251
-rw-r--r--src/tools/cephfs_mirror/InstanceWatcher.h85
-rw-r--r--src/tools/cephfs_mirror/Mirror.cc602
-rw-r--r--src/tools/cephfs_mirror/Mirror.h140
-rw-r--r--src/tools/cephfs_mirror/MirrorWatcher.cc148
-rw-r--r--src/tools/cephfs_mirror/MirrorWatcher.h79
-rw-r--r--src/tools/cephfs_mirror/PeerReplayer.cc1552
-rw-r--r--src/tools/cephfs_mirror/PeerReplayer.h319
-rw-r--r--src/tools/cephfs_mirror/ServiceDaemon.cc225
-rw-r--r--src/tools/cephfs_mirror/ServiceDaemon.h62
-rw-r--r--src/tools/cephfs_mirror/Types.cc21
-rw-r--r--src/tools/cephfs_mirror/Types.h87
-rw-r--r--src/tools/cephfs_mirror/Utils.cc166
-rw-r--r--src/tools/cephfs_mirror/Utils.h22
-rw-r--r--src/tools/cephfs_mirror/Watcher.cc285
-rw-r--r--src/tools/cephfs_mirror/Watcher.h102
-rw-r--r--src/tools/cephfs_mirror/aio_utils.h53
-rw-r--r--src/tools/cephfs_mirror/main.cc124
-rw-r--r--src/tools/cephfs_mirror/watcher/RewatchRequest.cc102
-rw-r--r--src/tools/cephfs_mirror/watcher/RewatchRequest.h60
63 files changed, 16490 insertions, 0 deletions
diff --git a/src/tools/cephfs/CMakeLists.txt b/src/tools/cephfs/CMakeLists.txt
new file mode 100644
index 000000000..5d40f8ffb
--- /dev/null
+++ b/src/tools/cephfs/CMakeLists.txt
@@ -0,0 +1,58 @@
+set(cephfs_journal_tool_srcs
+ cephfs-journal-tool.cc
+ JournalTool.cc
+ JournalFilter.cc
+ JournalScanner.cc
+ EventOutput.cc
+ Dumper.cc
+ Resetter.cc
+ RoleSelector.cc
+ MDSUtility.cc)
+add_executable(cephfs-journal-tool ${cephfs_journal_tool_srcs})
+target_link_libraries(cephfs-journal-tool librados mds osdc global
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+set(cephfs-meta-injection_srcs
+ cephfs-meta-injection.cc
+ MetaTool.cc
+ RoleSelector.cc
+ MDSUtility.cc)
+add_executable(cephfs-meta-injection ${cephfs-meta-injection_srcs})
+target_link_libraries(cephfs-meta-injection librados mds osdc global
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+set(cephfs_table_tool_srcs
+ cephfs-table-tool.cc
+ TableTool.cc
+ RoleSelector.cc
+ MDSUtility.cc)
+add_executable(cephfs-table-tool ${cephfs_table_tool_srcs})
+target_link_libraries(cephfs-table-tool librados mds osdc global
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+set(cephfs_data_scan_srcs
+ cephfs-data-scan.cc
+ DataScan.cc
+ RoleSelector.cc
+ PgFiles.cc
+ MDSUtility.cc)
+add_executable(cephfs-data-scan ${cephfs_data_scan_srcs})
+target_link_libraries(cephfs-data-scan librados cephfs mds osdc global
+ cls_cephfs_client
+ ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS})
+
+install(TARGETS
+ cephfs-journal-tool
+ cephfs-table-tool
+ cephfs-data-scan
+ DESTINATION bin)
+
+option(WITH_CEPHFS_SHELL "install cephfs-shell" OFF)
+if(WITH_CEPHFS_SHELL)
+ add_subdirectory(shell)
+endif()
+
+option(WITH_CEPHFS_TOP "install cephfs-top utility" ON)
+if(WITH_CEPHFS_TOP)
+ add_subdirectory(top)
+endif()
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
new file mode 100644
index 000000000..9f942964d
--- /dev/null
+++ b/src/tools/cephfs/DataScan.cc
@@ -0,0 +1,2239 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include <fstream>
+#include "include/util.h"
+#include "include/ceph_fs.h"
+
+#include "mds/CDentry.h"
+#include "mds/CInode.h"
+#include "mds/CDentry.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+
+#include "PgFiles.h"
+#include "DataScan.h"
+#include "include/compat.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "datascan." << __func__ << ": "
+
+void DataScan::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-data-scan init [--force-init]\n"
+ << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
+ << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
+ << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
+ << " cephfs-data-scan scan_links\n"
+ << "\n"
+ << " --force-corrupt: overrite apparently corrupt structures\n"
+ << " --force-init: write root inodes even if they exist\n"
+ << " --force-pool: use data pool even if it is not in FSMap\n"
+ << " --worker_m: Maximum number of workers\n"
+ << " --worker_n: Worker number, range 0-(worker_m-1)\n"
+ << "\n"
+ << " cephfs-data-scan scan_frags [--force-corrupt]\n"
+ << " cephfs-data-scan cleanup <data pool name>\n"
+ << std::endl;
+
+ generic_client_usage();
+}
+
+bool DataScan::parse_kwarg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i,
+ int *r)
+{
+ if (i + 1 == args.end()) {
+ return false;
+ }
+
+ const std::string arg(*i);
+ const std::string val(*(i + 1));
+
+ if (arg == std::string("--output-dir")) {
+ if (driver != NULL) {
+ derr << "Unexpected --output-dir: output already selected!" << dendl;
+ *r = -EINVAL;
+ return false;
+ }
+ dout(4) << "Using local file output to '" << val << "'" << dendl;
+ driver = new LocalFileDriver(val, data_io);
+ return true;
+ } else if (arg == std::string("--worker_n")) {
+ std::string err;
+ n = strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ std::cerr << "Invalid worker number '" << val << "'" << std::endl;
+ *r = -EINVAL;
+ return false;
+ }
+ return true;
+ } else if (arg == std::string("--worker_m")) {
+ std::string err;
+ m = strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ std::cerr << "Invalid worker count '" << val << "'" << std::endl;
+ *r = -EINVAL;
+ return false;
+ }
+ return true;
+ } else if (arg == std::string("--filter-tag")) {
+ filter_tag = val;
+ dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
+ return true;
+ } else if (arg == std::string("--filesystem")) {
+ std::shared_ptr<const Filesystem> fs;
+ *r = fsmap->parse_filesystem(val, &fs);
+ if (*r != 0) {
+ std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
+ return false;
+ }
+ fscid = fs->fscid;
+ return true;
+ } else if (arg == std::string("--alternate-pool")) {
+ metadata_pool_name = val;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool DataScan::parse_arg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i)
+{
+ const std::string arg(*i);
+ if (arg == "--force-pool") {
+ force_pool = true;
+ return true;
+ } else if (arg == "--force-corrupt") {
+ force_corrupt = true;
+ return true;
+ } else if (arg == "--force-init") {
+ force_init = true;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+int DataScan::main(const std::vector<const char*> &args)
+{
+ // Parse args
+ // ==========
+ if (args.size() < 1) {
+ cerr << "missing position argument" << std::endl;
+ return -EINVAL;
+ }
+
+ // Common RADOS init: open metadata pool
+ // =====================================
+ librados::Rados rados;
+ int r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable" << dendl;
+ return r;
+ }
+
+ std::string const &command = args[0];
+ std::string data_pool_name;
+
+ std::string pg_files_path;
+ std::set<pg_t> pg_files_pgs;
+
+ // Consume any known --key val or --flag arguments
+ for (std::vector<const char *>::const_iterator i = args.begin() + 1;
+ i != args.end(); ++i) {
+ if (parse_kwarg(args, i, &r)) {
+ // Skip the kwarg value field
+ ++i;
+ continue;
+ } else if (r) {
+ return r;
+ }
+
+ if (parse_arg(args, i)) {
+ continue;
+ }
+
+ // Trailing positional argument
+ if (i + 1 == args.end() &&
+ (command == "scan_inodes"
+ || command == "scan_extents"
+ || command == "cleanup")) {
+ data_pool_name = *i;
+ continue;
+ }
+
+ if (command == "pg_files") {
+ if (i == args.begin() + 1) {
+ pg_files_path = *i;
+ continue;
+ } else {
+ pg_t pg;
+ bool parsed = pg.parse(*i);
+ if (!parsed) {
+ std::cerr << "Invalid PG '" << *i << "'" << std::endl;
+ return -EINVAL;
+ } else {
+ pg_files_pgs.insert(pg);
+ continue;
+ }
+ }
+
+ }
+
+ // Fall through: unhandled
+ std::cerr << "Unknown argument '" << *i << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // If caller didn't specify a namespace, try to pick
+ // one if only one exists
+ if (fscid == FS_CLUSTER_ID_NONE) {
+ if (fsmap->filesystem_count() == 1) {
+ fscid = fsmap->get_filesystem()->fscid;
+ } else {
+ std::cerr << "Specify a filesystem with --filesystem" << std::endl;
+ return -EINVAL;
+ }
+ }
+ auto fs = fsmap->get_filesystem(fscid);
+ ceph_assert(fs != nullptr);
+
+ // Default to output to metadata pool
+ if (driver == NULL) {
+ driver = new MetadataDriver();
+ driver->set_force_corrupt(force_corrupt);
+ driver->set_force_init(force_init);
+ dout(4) << "Using metadata pool output" << dendl;
+ }
+
+ dout(4) << "connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
+ << std::endl;
+ return r;
+ }
+
+ r = driver->init(rados, metadata_pool_name, fsmap, fscid);
+ if (r < 0) {
+ return r;
+ }
+
+ if (command == "pg_files") {
+ auto pge = PgFiles(objecter, pg_files_pgs);
+ pge.init();
+ return pge.scan_path(pg_files_path);
+ }
+
+ // Initialize data_io for those commands that need it
+ if (command == "scan_inodes" ||
+ command == "scan_extents" ||
+ command == "cleanup") {
+ if (data_pool_name.empty()) {
+ std::cerr << "Data pool not specified" << std::endl;
+ return -EINVAL;
+ }
+
+ data_pool_id = rados.pool_lookup(data_pool_name.c_str());
+ if (data_pool_id < 0) {
+ std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
+ return -ENOENT;
+ } else {
+ dout(4) << "data pool '" << data_pool_name
+ << "' has ID " << data_pool_id << dendl;
+ }
+
+ if (!fs->mds_map.is_data_pool(data_pool_id)) {
+ std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
+ "CephFS data pool!" << std::endl;
+ if (!force_pool) {
+ std::cerr << "Use --force-pool to continue" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
+ r = rados.ioctx_create(data_pool_name.c_str(), data_io);
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ // Initialize metadata_io from MDSMap for scan_frags
+ if (command == "scan_frags" || command == "scan_links") {
+ const auto fs = fsmap->get_filesystem(fscid);
+ if (fs == nullptr) {
+ std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
+ return -ENOENT;
+ }
+ int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+ dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+ int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+ if (r < 0) {
+ std::cerr << "Pool " << metadata_pool_id
+ << " identified in MDS map not found in RADOS!" << std::endl;
+ return r;
+ }
+
+ r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+ if (r != 0) {
+ return r;
+ }
+
+ data_pools = fs->mds_map.get_data_pools();
+ }
+
+ // Finally, dispatch command
+ if (command == "scan_inodes") {
+ return scan_inodes();
+ } else if (command == "scan_extents") {
+ return scan_extents();
+ } else if (command == "scan_frags") {
+ return scan_frags();
+ } else if (command == "scan_links") {
+ return scan_links();
+ } else if (command == "cleanup") {
+ return cleanup();
+ } else if (command == "init") {
+ return driver->init_roots(fs->mds_map.get_first_data_pool());
+ } else {
+ std::cerr << "Unknown command '" << command << "'" << std::endl;
+ return -EINVAL;
+ }
+}
+
+int MetadataDriver::inject_unlinked_inode(
+ inodeno_t inono, int mode, int64_t data_pool_id)
+{
+ const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
+
+ // Skip if exists
+ bool already_exists = false;
+ int r = root_exists(inono, &already_exists);
+ if (r) {
+ return r;
+ }
+ if (already_exists && !force_init) {
+ std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
+ " exists, skipping create. Use --force-init to overwrite"
+ " the existing object." << std::endl;
+ return 0;
+ }
+
+ // Compose
+ InodeStore inode_data;
+ auto inode = inode_data.get_inode();
+ inode->ino = inono;
+ inode->version = 1;
+ inode->xattr_version = 1;
+ inode->mode = 0500 | mode;
+ // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
+ // (we won't actually give the *correct* dirstat here though)
+ inode->dirstat.nfiles = 1;
+
+ inode->ctime = inode->mtime = ceph_clock_now();
+ inode->nlink = 1;
+ inode->truncate_size = -1ull;
+ inode->truncate_seq = 1;
+ inode->uid = g_conf()->mds_root_ino_uid;
+ inode->gid = g_conf()->mds_root_ino_gid;
+
+ // Force layout to default: should we let users override this so that
+ // they don't have to mount the filesystem to correct it?
+ inode->layout = file_layout_t::get_default();
+ inode->layout.pool_id = data_pool_id;
+ inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+ // Assume that we will get our stats wrong, and that we may
+ // be ignoring dirfrags that exist
+ inode_data.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
+
+ if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
+ sr_t srnode;
+ srnode.seq = 1;
+ encode(srnode, inode_data.snap_blob);
+ }
+
+ // Serialize
+ bufferlist inode_bl;
+ encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
+ inode_data.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ // Write
+ r = metadata_io.write_full(oid.name, inode_bl);
+ if (r != 0) {
+ derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+}
+
+int MetadataDriver::root_exists(inodeno_t ino, bool *result)
+{
+ object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+ uint64_t size;
+ time_t mtime;
+ int r = metadata_io.stat(oid.name, &size, &mtime);
+ if (r == -ENOENT) {
+ *result = false;
+ return 0;
+ } else if (r < 0) {
+ return r;
+ }
+
+ *result = true;
+ return 0;
+}
+
+int MetadataDriver::init_roots(int64_t data_pool_id)
+{
+ int r = 0;
+ r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id);
+ if (r != 0) {
+ return r;
+ }
+ r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
+ if (r != 0) {
+ return r;
+ }
+ bool created = false;
+ r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
+ if (r != 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::check_roots(bool *result)
+{
+ int r;
+ r = root_exists(CEPH_INO_ROOT, result);
+ if (r != 0) {
+ return r;
+ }
+ if (!*result) {
+ return 0;
+ }
+
+ r = root_exists(MDS_INO_MDSDIR(0), result);
+ if (r != 0) {
+ return r;
+ }
+ if (!*result) {
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * Stages:
+ *
+ * SERIAL init
+ * 0. Create root inodes if don't exist
+ * PARALLEL scan_extents
+ * 1. Size and mtime recovery: scan ALL objects, and update 0th
+ * objects with max size and max mtime seen.
+ * PARALLEL scan_inodes
+ * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
+ * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
+ * or rstats at this stage. Inodes without backtraces go into
+ * lost+found
+ * TODO: SERIAL "recover stats"
+ * 3. Dirfrag statistics: depth first traverse into metadata tree,
+ * rebuilding dir sizes.
+ * TODO PARALLEL "clean up"
+ * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
+ * anything onto them) and remove any of the xattrs that we
+ * used for accumulating.
+ */
+
+
+int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
+{
+ if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
+ return -EINVAL;
+ }
+
+ std::string err;
+ std::string inode_str = oid.substr(0, oid.find("."));
+ *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+
+ std::string pos_string = oid.substr(oid.find(".") + 1);
+ *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+int DataScan::scan_extents()
+{
+ return forall_objects(data_io, false, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ // Read size
+ uint64_t size;
+ time_t mtime;
+ int r = data_io.stat(oid, &size, &mtime);
+ dout(10) << "handling object " << obj_name_ino
+ << "." << obj_name_offset << dendl;
+ if (r != 0) {
+ dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
+ return r;
+ }
+
+ // I need to keep track of
+ // * The highest object ID seen
+ // * The size of the highest object ID seen
+ // * The largest object seen
+ //
+ // Given those things, I can later infer the object chunking
+ // size, the offset of the last object (chunk size * highest ID seen)
+ // and the actual size (offset of last object + size of highest ID seen)
+ //
+ // This logic doesn't take account of striping.
+ r = ClsCephFSClient::accumulate_inode_metadata(
+ data_io,
+ obj_name_ino,
+ obj_name_offset,
+ size,
+ mtime);
+ if (r < 0) {
+ derr << "Failed to accumulate metadata data from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+ });
+}
+
+int DataScan::probe_filter(librados::IoCtx &ioctx)
+{
+ bufferlist filter_bl;
+ ClsCephFSClient::build_tag_filter("test", &filter_bl);
+ librados::ObjectCursor range_i;
+ librados::ObjectCursor range_end;
+
+ std::vector<librados::ObjectItem> tmp_result;
+ librados::ObjectCursor tmp_next;
+ int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
+ 1, filter_bl, &tmp_result, &tmp_next);
+
+ return r >= 0;
+}
+
+int DataScan::forall_objects(
+ librados::IoCtx &ioctx,
+ bool untagged_only,
+ std::function<int(std::string, uint64_t, uint64_t)> handler
+ )
+{
+ librados::ObjectCursor range_i;
+ librados::ObjectCursor range_end;
+ ioctx.object_list_slice(
+ ioctx.object_list_begin(),
+ ioctx.object_list_end(),
+ n,
+ m,
+ &range_i,
+ &range_end);
+
+
+ bufferlist filter_bl;
+
+ bool legacy_filtering = false;
+ if (untagged_only) {
+ // probe to deal with older OSDs that don't support
+ // the cephfs pgls filtering mode
+ legacy_filtering = !probe_filter(ioctx);
+ if (!legacy_filtering) {
+ ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
+ }
+ }
+
+ int r = 0;
+ while(range_i < range_end) {
+ std::vector<librados::ObjectItem> result;
+ int r = ioctx.object_list(range_i, range_end, 1,
+ filter_bl, &result, &range_i);
+ if (r < 0) {
+ derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto &i : result) {
+ const std::string &oid = i.oid;
+ uint64_t obj_name_ino = 0;
+ uint64_t obj_name_offset = 0;
+ r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+ if (r != 0) {
+ dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+ continue;
+ }
+
+ if (untagged_only && legacy_filtering) {
+ dout(20) << "Applying filter to " << oid << dendl;
+
+ // We are only interested in 0th objects during this phase: we touched
+ // the other objects during scan_extents
+ if (obj_name_offset != 0) {
+ dout(20) << "Non-zeroth object" << dendl;
+ continue;
+ }
+
+ bufferlist scrub_tag_bl;
+ int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
+ if (r >= 0) {
+ std::string read_tag;
+ auto q = scrub_tag_bl.cbegin();
+ try {
+ decode(read_tag, q);
+ if (read_tag == filter_tag) {
+ dout(20) << "skipping " << oid << " because it has the filter_tag"
+ << dendl;
+ continue;
+ }
+ } catch (const buffer::error &err) {
+ }
+ dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
+ } else {
+ dout(20) << "no tag read (" << r << ")" << dendl;
+ }
+
+ } else if (untagged_only) {
+ ceph_assert(obj_name_offset == 0);
+ dout(20) << "OSD matched oid " << oid << dendl;
+ }
+
+ int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
+ if (r == 0 && this_oid_r < 0) {
+ r = this_oid_r;
+ }
+ }
+ }
+
+ return r;
+}
+
+int DataScan::scan_inodes()
+{
+ bool roots_present;
+ int r = driver->check_roots(&roots_present);
+ if (r != 0) {
+ derr << "Unexpected error checking roots: '"
+ << cpp_strerror(r) << "'" << dendl;
+ return r;
+ }
+
+ if (!roots_present) {
+ std::cerr << "Some or all system inodes are absent. Run 'init' from "
+ "one node before running 'scan_inodes'" << std::endl;
+ return -EIO;
+ }
+
+ return forall_objects(data_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+
+ dout(10) << "handling object "
+ << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
+ << dendl;
+
+ AccumulateResult accum_res;
+ inode_backtrace_t backtrace;
+ file_layout_t loaded_layout = file_layout_t::get_default();
+ r = ClsCephFSClient::fetch_inode_accumulate_result(
+ data_io, oid, &backtrace, &loaded_layout, &accum_res);
+
+ if (r == -EINVAL) {
+ dout(4) << "Accumulated metadata missing from '"
+ << oid << ", did you run scan_extents?" << dendl;
+ return r;
+ } else if (r < 0) {
+ dout(4) << "Unexpected error loading accumulated metadata from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ // FIXME: this creates situation where if a client has a corrupt
+ // backtrace/layout, we will fail to inject it. We should (optionally)
+ // proceed if the backtrace/layout is corrupt but we have valid
+ // accumulated metadata.
+ return r;
+ }
+
+ const time_t file_mtime = accum_res.max_mtime;
+ uint64_t file_size = 0;
+ bool have_backtrace = !(backtrace.ancestors.empty());
+
+ // This is the layout we will use for injection, populated either
+ // from loaded_layout or from best guesses
+ file_layout_t guessed_layout;
+ guessed_layout.pool_id = data_pool_id;
+
+ // Calculate file_size, guess the layout
+ if (accum_res.ceiling_obj_index > 0) {
+ uint32_t chunk_size = file_layout_t::get_default().object_size;
+ // When there are multiple objects, the largest object probably
+ // indicates the chunk size. But not necessarily, because files
+ // can be sparse. Only make this assumption if size seen
+ // is a power of two, as chunk sizes typically are.
+ if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
+ chunk_size = accum_res.max_obj_size;
+ }
+
+ if (loaded_layout.pool_id == -1) {
+ // If no stashed layout was found, guess it
+ guessed_layout.object_size = chunk_size;
+ guessed_layout.stripe_unit = chunk_size;
+ guessed_layout.stripe_count = 1;
+ } else if (!loaded_layout.is_valid() ||
+ loaded_layout.object_size < accum_res.max_obj_size) {
+ // If the max size seen exceeds what the stashed layout claims, then
+ // disbelieve it. Guess instead. Same for invalid layouts on disk.
+ dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
+ << std::dec << ", ignoring in favour of best guess" << dendl;
+ guessed_layout.object_size = chunk_size;
+ guessed_layout.stripe_unit = chunk_size;
+ guessed_layout.stripe_count = 1;
+ } else {
+ // We have a stashed layout that we can't disprove, so apply it
+ guessed_layout = loaded_layout;
+ dout(20) << "loaded layout from xattr:"
+ << " os: " << guessed_layout.object_size
+ << " sc: " << guessed_layout.stripe_count
+ << " su: " << guessed_layout.stripe_unit
+ << dendl;
+ // User might have transplanted files from a pool with a different
+ // ID, so whatever the loaded_layout says, we'll force the injected
+ // layout to point to the pool we really read from
+ guessed_layout.pool_id = data_pool_id;
+ }
+
+ if (guessed_layout.stripe_count == 1) {
+ // Unstriped file: simple chunking
+ file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
+ + accum_res.ceiling_obj_size;
+ } else {
+ // Striped file: need to examine the last stripe_count objects
+ // in the file to determine the size.
+
+ // How many complete (i.e. not last stripe) objects?
+ uint64_t complete_objs = 0;
+ if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
+ complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
+ } else {
+ complete_objs = 0;
+ }
+
+ // How many potentially-short objects (i.e. last stripe set) objects?
+ uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
+
+ dout(10) << "calculating striped size from complete objs: "
+ << complete_objs << ", partial objs: " << partial_objs
+ << dendl;
+
+ // Maximum amount of data that may be in the incomplete objects
+ uint64_t incomplete_size = 0;
+
+ // For each short object, calculate the max file size within it
+ // and accumulate the maximum
+ for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
+ char buf[60];
+ snprintf(buf, sizeof(buf), "%llx.%08llx",
+ (long long unsigned)obj_name_ino, (long long unsigned)i);
+
+ uint64_t osize(0);
+ time_t omtime(0);
+ r = data_io.stat(std::string(buf), &osize, &omtime);
+ if (r == 0) {
+ if (osize > 0) {
+ // Upper bound within this object
+ uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
+ * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
+ + (i % guessed_layout.stripe_count)
+ * guessed_layout.stripe_unit + (osize - 1)
+ % guessed_layout.stripe_unit + 1;
+ incomplete_size = std::max(incomplete_size, upper_size);
+ }
+ } else if (r == -ENOENT) {
+ // Absent object, treat as size 0 and ignore.
+ } else {
+ // Unexpected error, carry r to outer scope for handling.
+ break;
+ }
+ }
+ if (r != 0 && r != -ENOENT) {
+ derr << "Unexpected error checking size of ino 0x" << std::hex
+ << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ file_size = complete_objs * guessed_layout.object_size
+ + incomplete_size;
+ }
+ } else {
+ file_size = accum_res.ceiling_obj_size;
+ if (loaded_layout.pool_id < 0
+ || loaded_layout.object_size < accum_res.max_obj_size) {
+ // No layout loaded, or inconsistent layout, use default
+ guessed_layout = file_layout_t::get_default();
+ guessed_layout.pool_id = data_pool_id;
+ } else {
+ guessed_layout = loaded_layout;
+ }
+ }
+
+ // Santity checking backtrace ino against object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ InodeStore dentry;
+ build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
+
+ // Inject inode to the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
+
+int DataScan::cleanup()
+{
+ // We are looking for only zeroth object
+ //
+ return forall_objects(data_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+ r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
+ if (r < 0) {
+ dout(4) << "Error deleting accumulated metadata from '"
+ << oid << "': " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ });
+}
+
+bool DataScan::valid_ino(inodeno_t ino) const
+{
+ return (ino >= inodeno_t((1ull << 40)))
+ || (MDS_INO_IS_STRAY(ino))
+ || (MDS_INO_IS_MDSDIR(ino))
+ || ino == CEPH_INO_ROOT
+ || ino == CEPH_INO_CEPH;
+}
+
+int DataScan::scan_links()
+{
+ MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
+ if (!metadata_driver) {
+ derr << "Unexpected --output-dir option for scan_links" << dendl;
+ return -EINVAL;
+ }
+
+ interval_set<uint64_t> used_inos;
+ map<inodeno_t, int> remote_links;
+ map<snapid_t, SnapInfo> snaps;
+ snapid_t last_snap = 1;
+ snapid_t snaprealm_v2_since = 2;
+
+ struct link_info_t {
+ inodeno_t dirino;
+ frag_t frag;
+ string name;
+ version_t version;
+ int nlink;
+ bool is_dir;
+ map<snapid_t, SnapInfo> snaps;
+ link_info_t() : version(0), nlink(0), is_dir(false) {}
+ link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) :
+ dirino(di), frag(df), name(n),
+ version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {}
+ dirfrag_t dirfrag() const {
+ return dirfrag_t(dirino, frag);
+ }
+ };
+ map<inodeno_t, list<link_info_t> > dup_primaries;
+ map<inodeno_t, link_info_t> bad_nlink_inos;
+ map<inodeno_t, link_info_t> injected_inos;
+
+ map<dirfrag_t, set<string> > to_remove;
+
+ enum {
+ SCAN_INOS = 1,
+ CHECK_LINK,
+ };
+
+ for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
+ const librados::NObjectIterator it_end = metadata_io.nobjects_end();
+ for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
+ const std::string oid = it->get_oid();
+
+ dout(10) << "step " << step << ": handling object " << oid << dendl;
+
+ uint64_t dir_ino = 0;
+ uint64_t frag_id = 0;
+ int r = parse_oid(oid, &dir_ino, &frag_id);
+ if (r == -EINVAL) {
+ dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
+ continue;
+ } else {
+ // parse_oid can only do 0 or -EINVAL
+ ceph_assert(r == 0);
+ }
+
+ if (!valid_ino(dir_ino)) {
+ dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
+ continue;
+ }
+
+ std::map<std::string, bufferlist> items;
+ r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
+ if (r < 0) {
+ derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (auto& p : items) {
+ auto q = p.second.cbegin();
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p.first, dname, last);
+
+ if (last != CEPH_NOSNAP) {
+ if (last > last_snap)
+ last_snap = last;
+ continue;
+ }
+
+ try {
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ if (dnfirst <= CEPH_MAXSNAP) {
+ if (dnfirst - 1 > last_snap)
+ last_snap = dnfirst - 1;
+ }
+ char dentry_type;
+ decode(dentry_type, q);
+ mempool::mds_co::string alternate_name;
+ if (dentry_type == 'I' || dentry_type == 'i') {
+ InodeStore inode;
+ if (dentry_type == 'i') {
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode.decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode.decode_bare(q);
+ }
+
+ inodeno_t ino = inode.inode->ino;
+
+ if (step == SCAN_INOS) {
+ if (used_inos.contains(ino, 1)) {
+ dup_primaries[ino].size();
+ } else {
+ used_inos.insert(ino);
+ }
+ } else if (step == CHECK_LINK) {
+ sr_t srnode;
+ if (inode.snap_blob.length()) {
+ auto p = inode.snap_blob.cbegin();
+ decode(srnode, p);
+ for (auto it = srnode.snaps.begin();
+ it != srnode.snaps.end(); ) {
+ if (it->second.ino != ino ||
+ it->second.snapid != it->first) {
+ srnode.snaps.erase(it++);
+ } else {
+ ++it;
+ }
+ }
+ if (!srnode.past_parents.empty()) {
+ snapid_t last = srnode.past_parents.rbegin()->first;
+ if (last + 1 > snaprealm_v2_since)
+ snaprealm_v2_since = last + 1;
+ }
+ }
+ if (inode.old_inodes && !inode.old_inodes->empty()) {
+ auto _last_snap = inode.old_inodes->rbegin()->first;
+ if (_last_snap > last_snap)
+ last_snap = _last_snap;
+ }
+ auto q = dup_primaries.find(ino);
+ if (q != dup_primaries.end()) {
+ q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
+ q->second.back().snaps.swap(srnode.snaps);
+ } else {
+ int nlink = 0;
+ auto r = remote_links.find(ino);
+ if (r != remote_links.end())
+ nlink = r->second;
+ if (!MDS_INO_IS_STRAY(dir_ino))
+ nlink++;
+ if (inode.inode->nlink != nlink) {
+ derr << "Bad nlink on " << ino << " expected " << nlink
+ << " has " << inode.inode->nlink << dendl;
+ bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+ bad_nlink_inos[ino].nlink = nlink;
+ }
+ snaps.insert(make_move_iterator(begin(srnode.snaps)),
+ make_move_iterator(end(srnode.snaps)));
+ }
+ if (dnfirst == CEPH_NOSNAP)
+ injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
+ }
+ } else if (dentry_type == 'L' || dentry_type == 'l') {
+ inodeno_t ino;
+ unsigned char d_type;
+ CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q);
+
+ if (step == SCAN_INOS) {
+ remote_links[ino]++;
+ } else if (step == CHECK_LINK) {
+ if (!used_inos.contains(ino, 1)) {
+ derr << "Bad remote link dentry 0x" << std::hex << dir_ino
+ << std::dec << "/" << dname
+ << ", ino " << ino << " not found" << dendl;
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+ to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
+ }
+ }
+ } else {
+ derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+ } catch (const buffer::error &err) {
+ derr << "Error decoding dentry 0x" << std::hex << dir_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+ }
+ }
+ }
+
+ map<unsigned, uint64_t> max_ino_map;
+ {
+ auto prev_max_ino = (uint64_t)1 << 40;
+ for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
+ auto cur_max = p.get_start() + p.get_len() - 1;
+ if (cur_max < prev_max_ino)
+ continue; // system inodes
+
+ if ((prev_max_ino >> 40) != (cur_max >> 40)) {
+ unsigned rank = (prev_max_ino >> 40) - 1;
+ max_ino_map[rank] = prev_max_ino;
+ } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
+ unsigned rank = (p.get_start() >> 40) - 1;
+ max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
+ }
+ prev_max_ino = cur_max;
+ }
+ unsigned rank = (prev_max_ino >> 40) - 1;
+ max_ino_map[rank] = prev_max_ino;
+ }
+
+ used_inos.clear();
+
+ dout(10) << "processing " << dup_primaries.size() << " dup_primaries, "
+ << remote_links.size() << " remote_links" << dendl;
+
+ for (auto& p : dup_primaries) {
+
+ dout(10) << "handling dup " << p.first << dendl;
+
+ link_info_t newest;
+ for (auto& q : p.second) {
+ if (q.version > newest.version) {
+ newest = q;
+ } else if (q.version == newest.version &&
+ !MDS_INO_IS_STRAY(q.dirino) &&
+ MDS_INO_IS_STRAY(newest.dirino)) {
+ newest = q;
+ }
+ }
+
+ for (auto& q : p.second) {
+ // in the middle of dir fragmentation?
+ if (newest.dirino == q.dirino && newest.name == q.name) {
+ snaps.insert(make_move_iterator(begin(q.snaps)),
+ make_move_iterator(end(q.snaps)));
+ continue;
+ }
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
+ dn_key.encode(key);
+ to_remove[q.dirfrag()].insert(key);
+ derr << "Remove duplicated ino 0x" << p.first << " from "
+ << q.dirfrag() << "/" << q.name << dendl;
+ }
+
+ int nlink = 0;
+ auto q = remote_links.find(p.first);
+ if (q != remote_links.end())
+ nlink = q->second;
+ if (!MDS_INO_IS_STRAY(newest.dirino))
+ nlink++;
+
+ if (nlink != newest.nlink) {
+ derr << "Bad nlink on " << p.first << " expected " << nlink
+ << " has " << newest.nlink << dendl;
+ bad_nlink_inos[p.first] = newest;
+ bad_nlink_inos[p.first].nlink = nlink;
+ }
+ }
+ dup_primaries.clear();
+ remote_links.clear();
+
+ {
+ objecter->with_osdmap([&](const OSDMap& o) {
+ for (auto p : data_pools) {
+ const pg_pool_t *pi = o.get_pg_pool(p);
+ if (!pi)
+ continue;
+ if (pi->snap_seq > last_snap)
+ last_snap = pi->snap_seq;
+ }
+ });
+
+ if (!snaps.empty()) {
+ if (snaps.rbegin()->first > last_snap)
+ last_snap = snaps.rbegin()->first;
+ }
+ }
+
+ dout(10) << "removing dup dentries from " << to_remove.size() << " objects"
+ << dendl;
+
+ for (auto& p : to_remove) {
+ object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
+
+ dout(10) << "removing dup dentries from " << p.first << dendl;
+
+ int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
+ if (r != 0) {
+ derr << "Error removing duplicated dentries from " << p.first << dendl;
+ return r;
+ }
+ }
+ to_remove.clear();
+
+ dout(10) << "processing " << bad_nlink_inos.size() << " bad_nlink_inos"
+ << dendl;
+
+ for (auto &p : bad_nlink_inos) {
+ dout(10) << "handling bad_nlink_ino " << p.first << dendl;
+
+ InodeStore inode;
+ snapid_t first;
+ int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+ if (r < 0) {
+ derr << "Unexpected error reading dentry "
+ << p.second.dirfrag() << "/" << p.second.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (inode.inode->ino != p.first || inode.inode->version != p.second.version)
+ continue;
+
+ inode.get_inode()->nlink = p.second.nlink;
+ r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+ if (r < 0)
+ return r;
+ }
+
+ dout(10) << "processing " << injected_inos.size() << " injected_inos"
+ << dendl;
+
+ for (auto &p : injected_inos) {
+ dout(10) << "handling injected_ino " << p.first << dendl;
+
+ InodeStore inode;
+ snapid_t first;
+ int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
+ if (r < 0) {
+ derr << "Unexpected error reading dentry "
+ << p.second.dirfrag() << "/" << p.second.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (first != CEPH_NOSNAP)
+ continue;
+
+ first = last_snap + 1;
+ r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
+ if (r < 0)
+ return r;
+ }
+
+ dout(10) << "updating inotable" << dendl;
+
+ for (auto& p : max_ino_map) {
+ InoTable inotable(nullptr);
+ inotable.set_rank(p.first);
+ bool dirty = false;
+ int r = metadata_driver->load_table(&inotable);
+ if (r < 0) {
+ inotable.reset_state();
+ dirty = true;
+ }
+ if (inotable.force_consume_to(p.second))
+ dirty = true;
+ if (dirty) {
+ r = metadata_driver->save_table(&inotable);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ dout(10) << "updating snaptable" << dendl;
+
+ {
+ SnapServer snaptable;
+ snaptable.set_rank(0);
+ bool dirty = false;
+ int r = metadata_driver->load_table(&snaptable);
+ if (r < 0) {
+ snaptable.reset_state();
+ dirty = true;
+ }
+ if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
+ dirty = true;
+ if (dirty) {
+ r = metadata_driver->save_table(&snaptable);
+ if (r < 0)
+ return r;
+ }
+ }
+ return 0;
+}
+
+int DataScan::scan_frags()
+{
+ bool roots_present;
+ int r = driver->check_roots(&roots_present);
+ if (r != 0) {
+ derr << "Unexpected error checking roots: '"
+ << cpp_strerror(r) << "'" << dendl;
+ return r;
+ }
+
+ if (!roots_present) {
+ std::cerr << "Some or all system inodes are absent. Run 'init' from "
+ "one node before running 'scan_inodes'" << std::endl;
+ return -EIO;
+ }
+
+ return forall_objects(metadata_io, true, [this](
+ std::string const &oid,
+ uint64_t obj_name_ino,
+ uint64_t obj_name_offset) -> int
+ {
+ int r = 0;
+ r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
+ if (r != 0) {
+ dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
+ return r;
+ }
+
+ if (obj_name_ino < (1ULL << 40)) {
+ // FIXME: we're skipping stray dirs here: if they're
+ // orphaned then we should be resetting them some other
+ // way
+ dout(10) << "Skipping system ino " << obj_name_ino << dendl;
+ return 0;
+ }
+
+ AccumulateResult accum_res;
+ inode_backtrace_t backtrace;
+
+ // Default to inherit layout (i.e. no explicit layout on dir) which is
+ // expressed as a zeroed layout struct (see inode_t::has_layout)
+ file_layout_t loaded_layout;
+
+ int parent_r = 0;
+ bufferlist parent_bl;
+ int layout_r = 0;
+ bufferlist layout_bl;
+ bufferlist op_bl;
+
+ librados::ObjectReadOperation op;
+ op.getxattr("parent", &parent_bl, &parent_r);
+ op.getxattr("layout", &layout_bl, &layout_r);
+ r = metadata_io.operate(oid, &op, &op_bl);
+ if (r != 0 && r != -ENODATA) {
+ derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
+ return r;
+ }
+
+ if (parent_r != -ENODATA) {
+ try {
+ auto q = parent_bl.cbegin();
+ backtrace.decode(q);
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl;
+ if (!force_corrupt) {
+ return -EINVAL;
+ } else {
+ // Treat backtrace as absent: we'll inject into lost+found
+ backtrace = inode_backtrace_t();
+ }
+ }
+ }
+
+ if (layout_r != -ENODATA) {
+ try {
+ auto q = layout_bl.cbegin();
+ decode(loaded_layout, q);
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl;
+ if (!force_corrupt) {
+ return -EINVAL;
+ }
+ }
+ }
+
+ bool have_backtrace = !(backtrace.ancestors.empty());
+
+ // Santity checking backtrace ino against object name
+ if (have_backtrace && backtrace.ino != obj_name_ino) {
+ dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
+ << " doesn't match object name ino 0x" << obj_name_ino
+ << std::dec << dendl;
+ have_backtrace = false;
+ }
+
+ uint64_t fnode_version = 0;
+ fnode_t fnode;
+ r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
+ if (r == -EINVAL) {
+ derr << "Corrupt fnode on " << oid << dendl;
+ if (force_corrupt) {
+ fnode.fragstat.mtime = 0;
+ fnode.fragstat.nfiles = 1;
+ fnode.fragstat.nsubdirs = 0;
+ fnode.accounted_fragstat = fnode.fragstat;
+ } else {
+ return r;
+ }
+ }
+
+ InodeStore dentry;
+ build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
+ loaded_layout, &dentry);
+
+ // Inject inode to the metadata pool
+ if (have_backtrace) {
+ inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
+ if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
+ /* Special case for strays: even if we have a good backtrace,
+ * don't put it in the stray dir, because while that would technically
+ * give it linkage it would still be invisible to the user */
+ r = driver->inject_lost_and_found(obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ } else {
+ /* Happy case: we will inject a named dentry for this inode */
+ r = driver->inject_with_backtrace(backtrace, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
+ << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+ } else {
+ /* Backtrace-less case: we will inject a lost+found dentry */
+ r = driver->inject_lost_and_found(
+ obj_name_ino, dentry);
+ if (r < 0) {
+ dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
+ << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
+ if (r == -EINVAL) {
+ dout(4) << "Use --force-corrupt to overwrite structures that "
+ "appear to be corrupt" << dendl;
+ }
+ }
+ }
+
+ return r;
+ });
+}
+
+int MetadataTool::read_fnode(
+ inodeno_t ino, frag_t frag, fnode_t *fnode,
+ uint64_t *last_version)
+{
+ ceph_assert(fnode != NULL);
+
+ object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
+ bufferlist fnode_bl;
+ int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
+ *last_version = metadata_io.get_last_version();
+ if (r < 0) {
+ return r;
+ }
+
+ auto old_fnode_iter = fnode_bl.cbegin();
+ try {
+ (*fnode).decode(old_fnode_iter);
+ } catch (const buffer::error &err) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
+ const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
+{
+ ceph_assert(inode != NULL);
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+
+ std::set<std::string> keys;
+ keys.insert(key);
+ std::map<std::string, bufferlist> vals;
+ object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
+ int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
+ dout(20) << "oid=" << frag_oid.name
+ << " dname=" << dname
+ << " frag=" << frag
+ << ", r=" << r << dendl;
+ if (r < 0) {
+ return r;
+ }
+
+ if (vals.find(key) == vals.end()) {
+ dout(20) << key << " not found in result" << dendl;
+ return -ENOENT;
+ }
+
+ try {
+ auto q = vals[key].cbegin();
+ snapid_t first;
+ decode(first, q);
+ char dentry_type;
+ decode(dentry_type, q);
+ if (dentry_type == 'I' || dentry_type == 'i') {
+ if (dentry_type == 'i') {
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode->decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode->decode_bare(q);
+ }
+ } else {
+ dout(20) << "dentry type '" << dentry_type << "': cannot"
+ "read an inode out of that" << dendl;
+ return -EINVAL;
+ }
+ if (dnfirst)
+ *dnfirst = first;
+ } catch (const buffer::error &err) {
+ dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
+ << std::dec << "/" << dname << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::load_table(MDSTable *table)
+{
+ object_t table_oid = table->get_object_name();
+
+ bufferlist table_bl;
+ int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
+ if (r < 0) {
+ derr << "unable to read mds table '" << table_oid.name << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ try {
+ version_t table_ver;
+ auto p = table_bl.cbegin();
+ decode(table_ver, p);
+ table->decode_state(p);
+ table->force_replay_version(table_ver);
+ } catch (const buffer::error &err) {
+ derr << "unable to decode mds table '" << table_oid.name << "': "
+ << err.what() << dendl;
+ return -EIO;
+ }
+ return 0;
+}
+
+int MetadataDriver::save_table(MDSTable *table)
+{
+ object_t table_oid = table->get_object_name();
+
+ bufferlist table_bl;
+ encode(table->get_version(), table_bl);
+ table->encode_state(table_bl);
+ int r = metadata_io.write_full(table_oid.name, table_bl);
+ if (r != 0) {
+ derr << "error updating mds table " << table_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int MetadataDriver::inject_lost_and_found(
+ inodeno_t ino, const InodeStore &dentry)
+{
+ // Create lost+found if doesn't exist
+ bool created = false;
+ int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
+ if (r < 0) {
+ return r;
+ }
+ InodeStore lf_ino;
+ r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+
+ // To have a directory not specify a layout, give it zeros (see
+ // inode_t::has_layout)
+ file_layout_t inherit_layout;
+
+ // Construct LF inode
+ frag_info_t fragstat;
+ fragstat.nfiles = 1,
+ build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
+
+ // Inject link to LF inode in the root dir
+ r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
+ if (r < 0) {
+ return r;
+ }
+ } else {
+ if (!(lf_ino.inode->mode & S_IFDIR)) {
+ derr << "lost+found exists but is not a directory!" << dendl;
+ // In this case we error out, and the user should do something about
+ // this problem.
+ return -EINVAL;
+ }
+ }
+
+ r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
+ if (r < 0) {
+ return r;
+ }
+
+ const std::string dname = lost_found_dname(ino);
+
+ // Write dentry into lost+found dirfrag
+ return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry);
+}
+
+
+int MetadataDriver::get_frag_of(
+ inodeno_t dirino,
+ const std::string &target_dname,
+ frag_t *result_ft)
+{
+ object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
+
+ dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
+
+ // Find and load fragtree if existing dirfrag
+ // ==========================================
+ bool have_backtrace = false;
+ bufferlist parent_bl;
+ int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
+ if (r == -ENODATA) {
+ dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
+ } else if (r < 0) {
+ dout(4) << "Unexpected error on '" << root_frag_oid << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Deserialize backtrace
+ inode_backtrace_t backtrace;
+ if (parent_bl.length()) {
+ try {
+ auto q = parent_bl.cbegin();
+ backtrace.decode(q);
+ have_backtrace = true;
+ } catch (buffer::error &e) {
+ dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': "
+ << e.what() << dendl;
+ }
+ }
+
+ if (!(have_backtrace && backtrace.ancestors.size())) {
+ // Can't work out fragtree without a backtrace
+ dout(4) << "No backtrace on '" << root_frag_oid
+ << "': cannot determine fragtree" << dendl;
+ return -ENOENT;
+ }
+
+ // The parentage of dirino
+ const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
+
+ // The inode of dirino's parent
+ const inodeno_t parent_ino = bp.dirino;
+
+ // The dname of dirino in its parent.
+ const std::string &parent_dname = bp.dname;
+
+ dout(20) << "got backtrace parent " << parent_ino << "/"
+ << parent_dname << dendl;
+
+ // The primary dentry for dirino
+ InodeStore existing_dentry;
+
+ // See if we can find ourselves in dirfrag zero of the parent: this
+ // is a fast path that avoids needing to go further up the tree
+ // if the parent isn't fragmented (worst case we would have to
+ // go all the way to the root)
+ r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
+ if (r >= 0) {
+ // Great, fast path: return the fragtree from here
+ if (existing_dentry.inode->ino != dirino) {
+ dout(4) << "Unexpected inode in dentry! 0x" << std::hex
+ << existing_dentry.inode->ino
+ << " vs expected 0x" << dirino << std::dec << dendl;
+ return -ENOENT;
+ }
+ dout(20) << "fast path, fragtree is "
+ << existing_dentry.dirfragtree << dendl;
+ *result_ft = existing_dentry.pick_dirfrag(target_dname);
+ dout(20) << "frag is " << *result_ft << dendl;
+ return 0;
+ } else if (r != -ENOENT) {
+ // Dentry not present in 0th frag, must read parent's fragtree
+ frag_t parent_frag;
+ r = get_frag_of(parent_ino, parent_dname, &parent_frag);
+ if (r == 0) {
+ // We have the parent fragtree, so try again to load our dentry
+ r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
+ if (r >= 0) {
+ // Got it!
+ *result_ft = existing_dentry.pick_dirfrag(target_dname);
+ dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
+ return 0;
+ } else {
+ if (r == -EINVAL || r == -ENOENT) {
+ return -ENOENT; // dentry missing or corrupt, so frag is missing
+ } else {
+ return r;
+ }
+ }
+ } else {
+ // Couldn't resolve parent fragtree, so can't find ours.
+ return r;
+ }
+ } else if (r == -EINVAL) {
+ // Unreadable dentry, can't know the fragtree.
+ return -ENOENT;
+ } else {
+ // Unexpected error, raise it
+ return r;
+ }
+}
+
+
+int MetadataDriver::inject_with_backtrace(
+ const inode_backtrace_t &backtrace, const InodeStore &dentry)
+
+{
+
+ // On dirfrags
+ // ===========
+ // In order to insert something into a directory, we first (ideally)
+ // need to know the fragtree for the directory. Sometimes we can't
+ // get that, in which case we just go ahead and insert it into
+ // fragment zero for a good chance of that being the right thing
+ // anyway (most moderate-sized dirs aren't fragmented!)
+
+ // On ancestry
+ // ===========
+ // My immediate ancestry should be correct, so if we can find that
+ // directory's dirfrag then go inject it there. This works well
+ // in the case that this inode's dentry was somehow lost and we
+ // are recreating it, because the rest of the hierarchy
+ // will probably still exist.
+ //
+ // It's more of a "better than nothing" approach when rebuilding
+ // a whole tree, as backtraces will in general not be up to date
+ // beyond the first parent, if anything in the trace was ever
+ // moved after the file was created.
+
+ // On inode numbers
+ // ================
+ // The backtrace tells us inodes for each of the parents. If we are
+ // creating those parent dirfrags, then there is a risk that somehow
+ // the inode indicated here was also used for data (not a dirfrag) at
+ // some stage. That would be a zany situation, and we don't check
+ // for it here, because to do so would require extra IOs for everything
+ // we inject, and anyway wouldn't guarantee that the inode number
+ // wasn't in use in some dentry elsewhere in the metadata tree that
+ // just happened not to have any data objects.
+
+ // On multiple workers touching the same traces
+ // ============================================
+ // When creating linkage for a directory, *only* create it if we are
+ // also creating the object. That way, we might not manage to get the
+ // *right* linkage for a directory, but at least we won't multiply link
+ // it. We assume that if a root dirfrag exists for a directory, then
+ // it is linked somewhere (i.e. that the metadata pool is not already
+ // inconsistent).
+ //
+ // Making sure *that* is true is someone else's job! Probably someone
+ // who is not going to run in parallel, so that they can self-consistently
+ // look at versions and move things around as they go.
+ // Note this isn't 100% safe: if we die immediately after creating dirfrag
+ // object, next run will fail to create linkage for the dirfrag object
+ // and leave it orphaned.
+
+ inodeno_t ino = backtrace.ino;
+ dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
+ for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
+ i != backtrace.ancestors.end(); ++i) {
+ const inode_backpointer_t &backptr = *i;
+ dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
+ << "/" << backptr.dname << dendl;
+
+ // Examine root dirfrag for parent
+ const inodeno_t parent_ino = backptr.dirino;
+ const std::string dname = backptr.dname;
+
+ frag_t fragment;
+ int r = get_frag_of(parent_ino, dname, &fragment);
+ if (r == -ENOENT) {
+ // Don't know fragment, fall back to assuming root
+ dout(20) << "don't know fragment for 0x" << std::hex <<
+ parent_ino << std::dec << "/" << dname << ", will insert to root"
+ << dendl;
+ }
+
+ // Find or create dirfrag
+ // ======================
+ bool created_dirfrag;
+ r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
+ if (r < 0) {
+ return r;
+ }
+
+ // Check if dentry already exists
+ // ==============================
+ InodeStore existing_dentry;
+ r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
+ bool write_dentry = false;
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+ // Missing or corrupt dentry
+ write_dentry = true;
+ } else if (r < 0) {
+ derr << "Unexpected error reading dentry 0x" << std::hex
+ << parent_ino << std::dec << "/"
+ << dname << ": " << cpp_strerror(r) << dendl;
+ break;
+ } else {
+ // Dentry already present, does it link to me?
+ if (existing_dentry.inode->ino == ino) {
+ dout(20) << "Dentry 0x" << std::hex
+ << parent_ino << std::dec << "/"
+ << dname << " already exists and points to me" << dendl;
+ } else {
+ derr << "Dentry 0x" << std::hex
+ << parent_ino << std::dec << "/"
+ << dname << " already exists but points to 0x"
+ << std::hex << existing_dentry.inode->ino << std::dec << dendl;
+ // Fall back to lost+found!
+ return inject_lost_and_found(backtrace.ino, dentry);
+ }
+ }
+
+ // Inject linkage
+ // ==============
+
+ if (write_dentry) {
+ if (i == backtrace.ancestors.begin()) {
+ // This is the linkage for the file of interest
+ dout(10) << "Linking inode 0x" << std::hex << ino
+ << " at 0x" << parent_ino << "/" << dname << std::dec
+ << " with size=" << dentry.inode->size << " bytes" << dendl;
+
+ r = inject_linkage(parent_ino, dname, fragment, dentry);
+ } else {
+ // This is the linkage for an ancestor directory
+ InodeStore ancestor_dentry;
+ auto inode = ancestor_dentry.get_inode();
+ inode->mode = 0755 | S_IFDIR;
+
+ // Set nfiles to something non-zero, to fool any other code
+ // that tries to ignore 'empty' directories. This won't be
+ // accurate, but it should avoid functional issues.
+
+ inode->dirstat.nfiles = 1;
+ inode->dir_layout.dl_dir_hash =
+ g_conf()->mds_default_dir_hash;
+
+ inode->nlink = 1;
+ inode->ino = ino;
+ inode->uid = g_conf()->mds_root_ino_uid;
+ inode->gid = g_conf()->mds_root_ino_gid;
+ inode->version = 1;
+ inode->backtrace_version = 1;
+ r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
+ }
+
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (!created_dirfrag) {
+ // If the parent dirfrag already existed, then stop traversing the
+ // backtrace: assume that the other ancestors already exist too. This
+ // is an assumption rather than a truth, but it's a convenient way
+ // to avoid the risk of creating multiply-linked directories while
+ // injecting data. If there are in fact missing ancestors, this
+ // should be fixed up using a separate tool scanning the metadata
+ // pool.
+ break;
+ } else {
+ // Proceed up the backtrace, creating parents
+ ino = parent_ino;
+ }
+ }
+
+ return 0;
+}
+
+int MetadataDriver::find_or_create_dirfrag(
+ inodeno_t ino,
+ frag_t fragment,
+ bool *created)
+{
+ ceph_assert(created != NULL);
+
+ fnode_t existing_fnode;
+ *created = false;
+
+ uint64_t read_version = 0;
+ int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
+ dout(10) << "read_version = " << read_version << dendl;
+
+ if (r == -ENOENT || r == -EINVAL) {
+ if (r == -EINVAL && !force_corrupt) {
+ return r;
+ }
+
+ // Missing or corrupt fnode, create afresh
+ bufferlist fnode_bl;
+ fnode_t blank_fnode;
+ blank_fnode.version = 1;
+ // mark it as non-empty
+ blank_fnode.fragstat.nfiles = 1;
+ blank_fnode.accounted_fragstat = blank_fnode.fragstat;
+ blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
+ blank_fnode.encode(fnode_bl);
+
+
+ librados::ObjectWriteOperation op;
+
+ if (read_version) {
+ ceph_assert(r == -EINVAL);
+ // Case A: We must assert that the version isn't changed since we saw the object
+ // was unreadable, to avoid the possibility of two data-scan processes
+ // both creating the frag.
+ op.assert_version(read_version);
+ } else {
+ ceph_assert(r == -ENOENT);
+ // Case B: The object didn't exist in read_fnode, so while creating it we must
+ // use an exclusive create to correctly populate *creating with
+ // whether we created it ourselves or someone beat us to it.
+ op.create(true);
+ }
+
+ object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
+ op.omap_set_header(fnode_bl);
+ r = metadata_io.operate(frag_oid.name, &op);
+ if (r == -EOVERFLOW || r == -EEXIST) {
+ // Someone else wrote it (see case A above)
+ dout(10) << "Dirfrag creation race: 0x" << std::hex
+ << ino << " " << fragment << std::dec << dendl;
+ *created = false;
+ return 0;
+ } else if (r < 0) {
+ // We were unable to create or write it, error out
+ derr << "Failed to create dirfrag 0x" << std::hex
+ << ino << std::dec << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ // Success: the dirfrag object now exists with a value header
+ dout(10) << "Created dirfrag: 0x" << std::hex
+ << ino << std::dec << dendl;
+ *created = true;
+ }
+ } else if (r < 0) {
+ derr << "Unexpected error reading dirfrag 0x" << std::hex
+ << ino << std::dec << " : " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(20) << "Dirfrag already exists: 0x" << std::hex
+ << ino << " " << fragment << std::dec << dendl;
+ }
+
+ return 0;
+}
+
+int MetadataDriver::inject_linkage(
+ inodeno_t dir_ino, const std::string &dname,
+ const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
+{
+ object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
+
+ std::string key;
+ dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
+ dn_key.encode(key);
+
+ bufferlist dentry_bl;
+ encode(dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ // Write out
+ std::map<std::string, bufferlist> vals;
+ vals[key] = dentry_bl;
+ int r = metadata_io.omap_set(frag_oid.name, vals);
+ if (r != 0) {
+ derr << "Error writing dentry 0x" << std::hex
+ << dir_ino << std::dec << "/"
+ << dname << ": " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(20) << "Injected dentry 0x" << std::hex
+ << dir_ino << "/" << dname << " pointing to 0x"
+ << inode.inode->ino << std::dec << dendl;
+ return 0;
+ }
+}
+
+
+int MetadataDriver::init(
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
+{
+ if (metadata_pool_name.empty()) {
+ auto fs = fsmap->get_filesystem(fscid);
+ ceph_assert(fs != nullptr);
+ int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+
+ dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+ int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+ if (r < 0) {
+ derr << "Pool " << metadata_pool_id
+ << " identified in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+ dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
+ } else {
+ dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
+ }
+ return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
+}
+
+int LocalFileDriver::init(
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
+{
+ return 0;
+}
+
+int LocalFileDriver::inject_data(
+ const std::string &file_path,
+ uint64_t size,
+ uint32_t chunk_size,
+ inodeno_t ino)
+{
+ // Scrape the file contents out of the data pool and into the
+ // local filesystem
+ std::fstream f;
+ f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
+
+ for (uint64_t offset = 0; offset < size; offset += chunk_size) {
+ bufferlist bl;
+
+ char buf[32];
+ snprintf(buf, sizeof(buf),
+ "%llx.%08llx",
+ (unsigned long long)ino,
+ (unsigned long long)(offset / chunk_size));
+ std::string oid(buf);
+
+ int r = data_io.read(oid, bl, chunk_size, 0);
+
+ if (r <= 0 && r != -ENOENT) {
+ derr << "error reading data object '" << oid << "': "
+ << cpp_strerror(r) << dendl;
+ f.close();
+ return r;
+ } else if (r >=0) {
+
+ f.seekp(offset);
+ bl.write_stream(f);
+ }
+ }
+ f.close();
+
+ return 0;
+}
+
+
+int LocalFileDriver::inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry)
+{
+ std::string path_builder = path;
+
+ // Iterate through backtrace creating directory parents
+ std::vector<inode_backpointer_t>::const_reverse_iterator i;
+ for (i = bt.ancestors.rbegin();
+ i != bt.ancestors.rend(); ++i) {
+
+ const inode_backpointer_t &backptr = *i;
+ path_builder += "/";
+ path_builder += backptr.dname;
+
+ // Last entry is the filename itself
+ bool is_file = (i + 1 == bt.ancestors.rend());
+ if (is_file) {
+ // FIXME: inject_data won't cope with interesting (i.e. striped)
+ // layouts (need a librados-compatible Filer to read these)
+ inject_data(path_builder, dentry.inode->size,
+ dentry.inode->layout.object_size, bt.ino);
+ } else {
+ int r = mkdir(path_builder.c_str(), 0755);
+ if (r != 0 && r != -EPERM) {
+ derr << "error creating directory: '" << path_builder << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int LocalFileDriver::inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry)
+{
+ std::string lf_path = path + "/lost+found";
+ int r = mkdir(lf_path.c_str(), 0755);
+ if (r != 0 && r != -EPERM) {
+ derr << "error creating directory: '" << lf_path << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::string file_path = lf_path + "/" + lost_found_dname(ino);
+ return inject_data(file_path, dentry.inode->size,
+ dentry.inode->layout.object_size, ino);
+}
+
+int LocalFileDriver::init_roots(int64_t data_pool_id)
+{
+ // Ensure that the path exists and is a directory
+ bool exists;
+ int r = check_roots(&exists);
+ if (r != 0) {
+ return r;
+ }
+
+ if (exists) {
+ return 0;
+ } else {
+ return ::mkdir(path.c_str(), 0755);
+ }
+}
+
+int LocalFileDriver::check_roots(bool *result)
+{
+ // Check if the path exists and is a directory
+ DIR *d = ::opendir(path.c_str());
+ if (d == NULL) {
+ *result = false;
+ } else {
+ int r = closedir(d);
+ if (r != 0) {
+ // Weird, but maybe possible with e.g. stale FD on NFS mount?
+ *result = false;
+ } else {
+ *result = true;
+ }
+ }
+
+ return 0;
+}
+
+void MetadataTool::build_file_dentry(
+ inodeno_t ino, uint64_t file_size, time_t file_mtime,
+ const file_layout_t &layout, InodeStore *out)
+{
+ ceph_assert(out != NULL);
+
+ auto inode = out->get_inode();
+ inode->mode = 0500 | S_IFREG;
+ inode->size = file_size;
+ inode->max_size_ever = file_size;
+ inode->mtime.tv.tv_sec = file_mtime;
+ inode->atime.tv.tv_sec = file_mtime;
+ inode->ctime.tv.tv_sec = file_mtime;
+
+ inode->layout = layout;
+
+ inode->truncate_seq = 1;
+ inode->truncate_size = -1ull;
+
+ inode->inline_data.version = CEPH_INLINE_NONE;
+
+ inode->nlink = 1;
+ inode->ino = ino;
+ inode->version = 1;
+ inode->backtrace_version = 1;
+ inode->uid = g_conf()->mds_root_ino_uid;
+ inode->gid = g_conf()->mds_root_ino_gid;
+}
+
+void MetadataTool::build_dir_dentry(
+ inodeno_t ino, const frag_info_t &fragstat,
+ const file_layout_t &layout, InodeStore *out)
+{
+ ceph_assert(out != NULL);
+
+ auto inode = out->get_inode();
+ inode->mode = 0755 | S_IFDIR;
+ inode->dirstat = fragstat;
+ inode->mtime.tv.tv_sec = fragstat.mtime;
+ inode->atime.tv.tv_sec = fragstat.mtime;
+ inode->ctime.tv.tv_sec = fragstat.mtime;
+
+ inode->layout = layout;
+ inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
+
+ inode->truncate_seq = 1;
+ inode->truncate_size = -1ull;
+
+ inode->inline_data.version = CEPH_INLINE_NONE;
+
+ inode->nlink = 1;
+ inode->ino = ino;
+ inode->version = 1;
+ inode->backtrace_version = 1;
+ inode->uid = g_conf()->mds_root_ino_uid;
+ inode->gid = g_conf()->mds_root_ino_gid;
+}
+
diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h
new file mode 100644
index 000000000..5c87fe2bd
--- /dev/null
+++ b/src/tools/cephfs/DataScan.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDSUtility.h"
+#include "include/rados/librados.hpp"
+
+class InodeStore;
+class MDSTable;
+
+class RecoveryDriver {
+ protected:
+ // If true, overwrite structures that generate decoding errors.
+ bool force_corrupt;
+
+ // If true, overwrite root objects during init_roots even if they
+ // exist
+ bool force_init;
+
+ public:
+ virtual int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) = 0;
+
+ void set_force_corrupt(const bool val)
+ {
+ force_corrupt = val;
+ }
+
+ void set_force_init(const bool val)
+ {
+ force_init = val;
+ }
+
+
+ /**
+ * Inject an inode + dentry parents into the metadata pool,
+ * based on a backtrace recovered from the data pool
+ */
+ virtual int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) = 0;
+
+ /**
+ * Inject an inode + dentry into the lost+found directory,
+ * when all we know about a file is its inode.
+ */
+ virtual int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) = 0;
+
+ /**
+ * Create any missing roots (i.e. mydir, strays, root inode)
+ */
+ virtual int init_roots(
+ int64_t data_pool_id) = 0;
+
+ /**
+ * Pre-injection check that all the roots are present in
+ * the metadata pool. Used to avoid parallel workers interfering
+ * with one another, by cueing the user to go run 'init' on a
+ * single node before running a parallel scan.
+ *
+ * @param result: set to true if roots are present, else set to false
+ * @returns 0 on no unexpected errors, else error code. Missing objects
+ * are not considered an unexpected error: check *result for
+ * this case.
+ */
+ virtual int check_roots(bool *result) = 0;
+
+ /**
+ * Helper to compose dnames for links to lost+found
+ * inodes.
+ */
+ std::string lost_found_dname(inodeno_t ino)
+ {
+ char s[20];
+ snprintf(s, sizeof(s), "%llx", (unsigned long long)ino);
+ return std::string(s);
+ }
+
+ RecoveryDriver()
+ : force_corrupt(false),
+ force_init(false)
+ {}
+
+ virtual ~RecoveryDriver() {}
+};
+
+class LocalFileDriver : public RecoveryDriver
+{
+ protected:
+ const std::string path;
+ librados::IoCtx &data_io;
+
+ int inject_data(
+ const std::string &file_path,
+ uint64_t size,
+ uint32_t chunk_size,
+ inodeno_t ino);
+ public:
+
+ LocalFileDriver(const std::string &path_, librados::IoCtx &data_io_)
+ : RecoveryDriver(), path(path_), data_io(data_io_)
+ {}
+
+ // Implement RecoveryDriver interface
+ int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) override;
+
+ int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) override;
+
+ int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) override;
+
+ int init_roots(int64_t data_pool_id) override;
+
+ int check_roots(bool *result) override;
+};
+
+/**
+ * A class that knows how to work with objects in a CephFS
+ * metadata pool.
+ */
+class MetadataTool
+{
+ protected:
+
+ librados::IoCtx metadata_io;
+
+ /**
+ * Construct a synthetic InodeStore for a normal file
+ */
+ void build_file_dentry(
+ inodeno_t ino, uint64_t file_size, time_t file_mtime,
+ const file_layout_t &layout,
+ InodeStore *out);
+
+ /**
+ * Construct a synthetic InodeStore for a directory
+ */
+ void build_dir_dentry(
+ inodeno_t ino,
+ const frag_info_t &fragstat,
+ const file_layout_t &layout,
+ InodeStore *out);
+
+ /**
+ * Try and read an fnode from a dirfrag
+ */
+ int read_fnode(inodeno_t ino, frag_t frag,
+ fnode_t *fnode, uint64_t *read_version);
+
+ /**
+ * Try and read a dentry from a dirfrag
+ */
+ int read_dentry(inodeno_t parent_ino, frag_t frag,
+ const std::string &dname, InodeStore *inode, snapid_t *dnfirst=nullptr);
+};
+
+/**
+ * A class that knows how to manipulate CephFS metadata pools
+ */
+class MetadataDriver : public RecoveryDriver, public MetadataTool
+{
+ protected:
+ /**
+ * Create a .inode object, i.e. root or mydir
+ */
+ int inject_unlinked_inode(inodeno_t inono, int mode, int64_t data_pool_id);
+
+ /**
+ * Check for existence of .inode objects, before
+ * trying to go ahead and inject metadata.
+ */
+ int root_exists(inodeno_t ino, bool *result);
+ int find_or_create_dirfrag(
+ inodeno_t ino,
+ frag_t fragment,
+ bool *created);
+
+
+ /**
+ * Work out which fragment of a directory should contain a named
+ * dentry, recursing up the trace as necessary to retrieve
+ * fragtrees.
+ */
+ int get_frag_of(
+ inodeno_t dirino,
+ const std::string &dname,
+ frag_t *result_ft);
+
+ public:
+
+ // Implement RecoveryDriver interface
+ int init(
+ librados::Rados &rados,
+ std::string &metadata_pool_name,
+ const FSMap *fsmap,
+ fs_cluster_id_t fscid) override;
+
+ int inject_linkage(
+ inodeno_t dir_ino, const std::string &dname,
+ const frag_t fragment, const InodeStore &inode, snapid_t dnfirst=CEPH_NOSNAP);
+
+ int inject_with_backtrace(
+ const inode_backtrace_t &bt,
+ const InodeStore &dentry) override;
+
+ int inject_lost_and_found(
+ inodeno_t ino,
+ const InodeStore &dentry) override;
+
+ int init_roots(int64_t data_pool_id) override;
+
+ int check_roots(bool *result) override;
+
+ int load_table(MDSTable *table);
+ int save_table(MDSTable *table);
+};
+
+class DataScan : public MDSUtility, public MetadataTool
+{
+ protected:
+ RecoveryDriver *driver;
+ fs_cluster_id_t fscid;
+
+ string metadata_pool_name;
+ std::vector<int64_t> data_pools;
+
+ // IoCtx for data pool (where we scrape file backtraces from)
+ librados::IoCtx data_io;
+ // Remember the data pool ID for use in layouts
+ int64_t data_pool_id;
+
+ uint32_t n;
+ uint32_t m;
+
+ /**
+ * Scan data pool for backtraces, and inject inodes to metadata pool
+ */
+ int scan_inodes();
+
+ /**
+ * Scan data pool for file sizes and mtimes
+ */
+ int scan_extents();
+
+ /**
+ * Scan metadata pool for 0th dirfrags to link orphaned
+ * directory inodes.
+ */
+ int scan_frags();
+
+ /**
+ * Cleanup xattrs from data pool
+ */
+ int cleanup();
+
+ /**
+ * Check if an inode number is in the permitted ranges
+ */
+ bool valid_ino(inodeno_t ino) const;
+
+
+ int scan_links();
+
+ // Accept pools which are not in the FSMap
+ bool force_pool;
+ // Respond to decode errors by overwriting
+ bool force_corrupt;
+ // Overwrite root objects even if they exist
+ bool force_init;
+ // Only scan inodes without this scrub tag
+ string filter_tag;
+
+ /**
+ * @param r set to error on valid key with invalid value
+ * @return true if argument consumed, else false
+ */
+ bool parse_kwarg(
+ const std::vector<const char*> &args,
+ std::vector<const char *>::const_iterator &i,
+ int *r);
+
+ /**
+ * @return true if argument consumed, else false
+ */
+ bool parse_arg(
+ const std::vector<const char*> &arg,
+ std::vector<const char *>::const_iterator &i);
+
+ int probe_filter(librados::IoCtx &ioctx);
+
+ /**
+ * Apply a function to all objects in an ioctx's pool, optionally
+ * restricted to only those objects with a 00000000 offset and
+ * no tag matching DataScan::scrub_tag.
+ */
+ int forall_objects(
+ librados::IoCtx &ioctx,
+ bool untagged_only,
+ std::function<int(std::string, uint64_t, uint64_t)> handler);
+
+ public:
+ static void usage();
+ int main(const std::vector<const char *> &args);
+
+ DataScan()
+ : driver(NULL), fscid(FS_CLUSTER_ID_NONE),
+ data_pool_id(-1), n(0), m(1),
+ force_pool(false), force_corrupt(false),
+ force_init(false)
+ {
+ }
+
+ ~DataScan() override
+ {
+ delete driver;
+ }
+};
+
diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc
new file mode 100644
index 000000000..f3b07c551
--- /dev/null
+++ b/src/tools/cephfs/Dumper.cc
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef _BACKWARD_BACKWARD_WARNING_H
+#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
+#endif
+
+#include "include/compat.h"
+#include "include/fs_types.h"
+#include "common/entity_name.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/JournalPointer.h"
+#include "osdc/Journaler.h"
+#include "mon/MonClient.h"
+
+#include "Dumper.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+#define HEADER_LEN 4096
+
+int Dumper::init(mds_role_t role_, const std::string &type)
+{
+ role = role_;
+
+ int r = MDSUtility::init();
+ if (r < 0) {
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ if (type == "mdlog") {
+ JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+ int jp_load_result = jp.load(objecter);
+ if (jp_load_result != 0) {
+ std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl;
+ return jp_load_result;
+ } else {
+ ino = jp.front;
+ }
+ } else if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + role.rank;
+ } else {
+ ceph_abort(); // should not get here
+ }
+ return 0;
+}
+
+
+int Dumper::recover_journal(Journaler *journaler)
+{
+ C_SaferCond cond;
+ lock.lock();
+ journaler->recover(&cond);
+ lock.unlock();
+ const int r = cond.wait();
+
+ if (r < 0) { // Error
+ derr << "error on recovery: " << cpp_strerror(r) << dendl;
+ return r;
+ } else {
+ dout(10) << "completed journal recovery" << dendl;
+ return 0;
+ }
+}
+
+
+int Dumper::dump(const char *dump_file)
+{
+ int r = 0;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+ &finisher);
+ r = recover_journal(&journaler);
+ if (r) {
+ return r;
+ }
+ uint64_t start = journaler.get_read_pos();
+ uint64_t end = journaler.get_write_pos();
+ uint64_t len = end-start;
+
+ Filer filer(objecter, &finisher);
+
+ cout << "journal is " << start << "~" << len << std::endl;
+
+ int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, 0644);
+ if (fd >= 0) {
+ // include an informative header
+ uuid_d fsid = monc->get_fsid();
+ char fsid_str[40];
+ fsid.print(fsid_str);
+ char buf[HEADER_LEN];
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\
+ length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\
+ trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\
+ object_size %lu (0x%lx)\n fsid %s\n%c",
+ role.rank,
+ (unsigned long long)start, (unsigned long long)start,
+ (unsigned long long)len, (unsigned long long)len,
+ (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos,
+ (unsigned long long)journaler.last_committed.stream_format,
+ (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos,
+ (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit,
+ (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count,
+ (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size,
+ fsid_str,
+ 4);
+ r = safe_write(fd, buf, sizeof(buf));
+ if (r) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ // write the data
+ off64_t seeked = ::lseek64(fd, start, SEEK_SET);
+ if (seeked == (off64_t)-1) {
+ r = errno;
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl;
+ ::close(fd);
+ return r;
+ }
+
+
+ // Read and write 32MB chunks. Slower than it could be because we're not
+ // streaming, but that's okay because this is just a debug/disaster tool.
+ const uint32_t chunk_size = 32 * 1024 * 1024;
+
+ for (uint64_t pos = start; pos < start + len; pos += chunk_size) {
+ bufferlist bl;
+ dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl;
+
+ const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos);
+
+ C_SaferCond cond;
+ lock.lock();
+ filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP,
+ pos, read_size, &bl, 0, &cond);
+ lock.unlock();
+ r = cond.wait();
+ if (r < 0) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") reading "
+ "journal at offset 0x" << std::hex << pos << std::dec << dendl;
+ ::close(fd);
+ return r;
+ }
+ dout(10) << "Got 0x" << std::hex << bl.length() << std::dec
+ << " bytes" << dendl;
+
+ r = bl.write_fd(fd);
+ if (r) {
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl;
+ ::close(fd);
+ return r;
+ }
+ }
+
+ r = ::close(fd);
+ if (r) {
+ r = errno;
+ derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl;
+ return r;
+ }
+
+ cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n"
+ << "NOTE: this is a _sparse_ file; you can\n"
+ << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n"
+ << " to efficiently compress it while preserving sparseness." << std::endl;
+ return 0;
+ } else {
+ int err = errno;
+ derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl;
+ return err;
+ }
+}
+
+int Dumper::undump(const char *dump_file, bool force)
+{
+ cout << "undump " << dump_file << std::endl;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ int r = 0;
+ // try get layout info from cluster
+ Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
+ &finisher);
+ int recovered = recover_journal(&journaler);
+ if (recovered != 0) {
+ derr << "recover_journal failed, try to get header from dump file " << dendl;
+ }
+
+ int fd = ::open(dump_file, O_RDONLY|O_BINARY);
+ if (fd < 0) {
+ r = errno;
+ derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Ceph mds0 journal dump
+ // start offset 232401996 (0xdda2c4c)
+ // length 1097504 (0x10bf20)
+
+ char buf[HEADER_LEN];
+ r = safe_read(fd, buf, sizeof(buf));
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+
+ long long unsigned start, len, write_pos, format, trimmed_pos;
+ long unsigned stripe_unit, stripe_count, object_size;
+ sscanf(strstr(buf, "start offset"), "start offset %llu", &start);
+ sscanf(strstr(buf, "length"), "length %llu", &len);
+ sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos);
+ sscanf(strstr(buf, "format"), "format %llu", &format);
+
+ if (!force) {
+ // need to check if fsid match onlien cluster fsid
+ if (strstr(buf, "fsid")) {
+ uuid_d fsid;
+ char fsid_str[40];
+ sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str);
+ r = fsid.parse(fsid_str);
+ if (!r) {
+ derr << "Invalid fsid" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ if (fsid != monc->get_fsid()) {
+ derr << "Imported journal fsid does not match online cluster fsid" << dendl;
+ derr << "Use --force to skip fsid check" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+ } else {
+ derr << "Invalid header, no fsid embeded" << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+ }
+
+ if (recovered == 0) {
+ stripe_unit = journaler.last_committed.layout.stripe_unit;
+ stripe_count = journaler.last_committed.layout.stripe_count;
+ object_size = journaler.last_committed.layout.object_size;
+ } else {
+ // try to get layout from dump file header, if failed set layout to default
+ if (strstr(buf, "stripe_unit")) {
+ sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit);
+ } else {
+ stripe_unit = file_layout_t::get_default().stripe_unit;
+ }
+ if (strstr(buf, "stripe_count")) {
+ sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count);
+ } else {
+ stripe_count = file_layout_t::get_default().stripe_count;
+ }
+ if (strstr(buf, "object_size")) {
+ sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size);
+ } else {
+ object_size = file_layout_t::get_default().object_size;
+ }
+ }
+
+ if (strstr(buf, "trimmed_pos")) {
+ sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos);
+ } else {
+ // Old format dump, any untrimmed objects before expire_pos will
+ // be discarded as trash.
+ trimmed_pos = start - (start % object_size);
+ }
+
+ if (trimmed_pos > start) {
+ derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos
+ << " > expire 0x" << start << std::dec << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ if (start > write_pos) {
+ derr << std::hex << "Invalid header (expire 0x" << start
+ << " > write 0x" << write_pos << std::dec << dendl;
+ ::close(fd);
+ return -EINVAL;
+ }
+
+ cout << "start " << start <<
+ " len " << len <<
+ " write_pos " << write_pos <<
+ " format " << format <<
+ " trimmed_pos " << trimmed_pos <<
+ " stripe_unit " << stripe_unit <<
+ " stripe_count " << stripe_count <<
+ " object_size " << object_size << std::endl;
+
+ Journaler::Header h;
+ h.trimmed_pos = trimmed_pos;
+ h.expire_pos = start;
+ h.write_pos = write_pos;
+ h.stream_format = format;
+ h.magic = CEPH_FS_ONDISK_MAGIC;
+
+ h.layout.stripe_unit = stripe_unit;
+ h.layout.stripe_count = stripe_count;
+ h.layout.object_size = object_size;
+ h.layout.pool_id = fs->mds_map.get_metadata_pool();
+
+ bufferlist hbl;
+ encode(h, hbl);
+
+ object_t oid = file_object_t(ino, 0);
+ object_locator_t oloc(fs->mds_map.get_metadata_pool());
+ SnapContext snapc;
+
+ cout << "writing header " << oid << std::endl;
+ C_SaferCond header_cond;
+ lock.lock();
+ objecter->write_full(oid, oloc, snapc, hbl,
+ ceph::real_clock::now(), 0,
+ &header_cond);
+ lock.unlock();
+
+ r = header_cond.wait();
+ if (r != 0) {
+ derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ Filer filer(objecter, &finisher);
+
+ /* Erase any objects at the end of the region to which we shall write
+ * the new log data. This is to avoid leaving trailing junk after
+ * the newly written data. Any junk more than one object ahead
+ * will be taken care of during normal operation by Journaler's
+ * prezeroing behaviour */
+ {
+ uint32_t const object_size = h.layout.object_size;
+ ceph_assert(object_size > 0);
+ uint64_t last_obj = h.write_pos / object_size;
+ uint64_t purge_count = 2;
+ /* When the length is zero, the last_obj should be zeroed
+ * from the offset determined by the new write_pos instead of being purged.
+ */
+ if (!len) {
+ purge_count = 1;
+ ++last_obj;
+ }
+ C_SaferCond purge_cond;
+ cout << "Purging " << purge_count << " objects from " << last_obj << std::endl;
+ lock.lock();
+ filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count,
+ ceph::real_clock::now(), 0, &purge_cond);
+ lock.unlock();
+ purge_cond.wait();
+ }
+ /* When the length is zero, zero the last object
+ * from the offset determined by the new write_pos.
+ */
+ if (!len) {
+ uint64_t offset_in_obj = h.write_pos % h.layout.object_size;
+ uint64_t len = h.layout.object_size - offset_in_obj;
+ C_SaferCond zero_cond;
+ cout << "Zeroing " << len << " bytes in the last object." << std::endl;
+
+ lock.lock();
+ filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond);
+ lock.unlock();
+ zero_cond.wait();
+ }
+
+ // Stream from `fd` to `filer`
+ uint64_t pos = start;
+ uint64_t left = len;
+ while (left > 0) {
+ // Read
+ bufferlist j;
+ lseek64(fd, pos, SEEK_SET);
+ uint64_t l = std::min<uint64_t>(left, 1024*1024);
+ j.read_fd(fd, l);
+
+ // Write
+ cout << " writing " << pos << "~" << l << std::endl;
+ C_SaferCond write_cond;
+ lock.lock();
+ filer.write(ino, &h.layout, snapc, pos, l, j,
+ ceph::real_clock::now(), 0, &write_cond);
+ lock.unlock();
+
+ r = write_cond.wait();
+ if (r != 0) {
+ derr << "Failed to write header: " << cpp_strerror(r) << dendl;
+ ::close(fd);
+ return r;
+ }
+
+ // Advance
+ pos += l;
+ left -= l;
+ }
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ cout << "done." << std::endl;
+ return 0;
+}
+
diff --git a/src/tools/cephfs/Dumper.h b/src/tools/cephfs/Dumper.h
new file mode 100644
index 000000000..758f3cdea
--- /dev/null
+++ b/src/tools/cephfs/Dumper.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef JOURNAL_DUMPER_H_
+#define JOURNAL_DUMPER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you dump out an mds journal for troubleshooting or whatever.
+ *
+ * It was built to work with cmds so some of the design choices are random.
+ * To use, create a Dumper, call init(), and then call dump() with the name
+ * of the file to dump to.
+ */
+
+class Dumper : public MDSUtility {
+private:
+ mds_role_t role;
+ inodeno_t ino;
+
+public:
+ Dumper() : ino(-1)
+ {}
+
+ int init(mds_role_t role_, const std::string &type);
+ int recover_journal(Journaler *journaler);
+ int dump(const char *dumpfile);
+ int undump(const char *dumpfile, bool force);
+};
+
+#endif /* JOURNAL_DUMPER_H_ */
diff --git a/src/tools/cephfs/EventOutput.cc b/src/tools/cephfs/EventOutput.cc
new file mode 100644
index 000000000..8cb235a82
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.cc
@@ -0,0 +1,153 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include <iostream>
+#include <fstream>
+
+#include "common/errno.h"
+#include "mds/mdstypes.h"
+#include "mds/events/EUpdate.h"
+#include "mds/LogEvent.h"
+#include "JournalScanner.h"
+
+#include "EventOutput.h"
+
+
+int EventOutput::binary() const
+{
+ // Binary output, files
+ int r = ::mkdir(path.c_str(), 0755);
+ if (r != 0) {
+ r = -errno;
+ if (r != -EEXIST) {
+ std::cerr << "Error creating output directory: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+ }
+
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ bufferlist bin;
+ std::stringstream filename;
+ if (auto& le = i->second.log_event; le) {
+ le->encode(bin, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ filename << "0x" << std::hex << i->first << std::dec << "_" << le->get_type_str() << ".bin";
+ } else if (auto& pi = i->second.pi; pi) {
+ pi->encode(bin);
+ filename << "0x" << std::hex << i->first << std::dec << "_" << pi->get_type_str() << ".bin";
+ }
+
+ std::string const file_path = path + std::string("/") + filename.str();
+ std::ofstream bin_file(file_path.c_str(), std::ofstream::out | std::ofstream::binary);
+ bin.write_stream(bin_file);
+ bin_file.close();
+ if (bin_file.fail()) {
+ return -EIO;
+ }
+ }
+ std::cerr << "Wrote output to binary files in directory '" << path << "'" << std::endl;
+
+ return 0;
+}
+
+int EventOutput::json() const
+{
+ JSONFormatter jf(true);
+ std::ofstream out_file(path.c_str(), std::ofstream::out);
+ jf.open_array_section("journal");
+ {
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ if (auto& le = i->second.log_event; le) {
+ jf.open_object_section("log_event");
+ le->dump(&jf);
+ jf.close_section(); // log_event
+ } else if (auto& pi = i->second.pi; pi) {
+ jf.open_object_section("purge_action");
+ pi->dump(&jf);
+ jf.close_section();
+ }
+ }
+ }
+ jf.close_section(); // journal
+ jf.flush(out_file);
+ out_file.close();
+
+ if (out_file.fail()) {
+ return -EIO;
+ } else {
+ std::cerr << "Wrote output to JSON file '" << path << "'" << std::endl;
+ return 0;
+ }
+}
+
+void EventOutput::list() const
+{
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ if (auto& le = i->second.log_event; le) {
+ std::vector<std::string> ev_paths;
+ EMetaBlob const *emb = le->get_metablob();
+ if (emb) {
+ emb->get_paths(ev_paths);
+ }
+
+ std::string detail;
+ if (le->get_type() == EVENT_UPDATE) {
+ auto& eu = reinterpret_cast<EUpdate&>(*le);
+ detail = eu.type;
+ }
+
+ std::cout << le->get_stamp() << " 0x"
+ << std::hex << i->first << std::dec << " "
+ << le->get_type_str() << ": "
+ << " (" << detail << ")" << std::endl;
+ for (std::vector<std::string>::iterator i = ev_paths.begin(); i != ev_paths.end(); ++i) {
+ std::cout << " " << *i << std::endl;
+ }
+ } else if (auto& pi = i->second.pi; pi) {
+ std::cout << pi->stamp << " 0x"
+ << std::hex << i->first << std::dec << " "
+ << pi->get_type_str() << std::endl;
+ }
+ }
+}
+
+void EventOutput::summary() const
+{
+ std::map<std::string, int> type_count;
+ for (JournalScanner::EventMap::const_iterator i = scan.events.begin(); i != scan.events.end(); ++i) {
+ std::string type;
+ if (auto& le = i->second.log_event; le)
+ type = le->get_type_str();
+ else if (auto& pi = i->second.pi; pi)
+ type = pi->get_type_str();
+ if (type_count.count(type) == 0) {
+ type_count[type] = 0;
+ }
+ type_count[type] += 1;
+ }
+
+ std::cout << "Events by type:" << std::endl;
+ for (std::map<std::string, int>::iterator i = type_count.begin(); i != type_count.end(); ++i) {
+ std::cout << " " << i->first << ": " << i->second << std::endl;
+ }
+
+ std::cout << "Errors: " << scan.errors.size() << std::endl;
+ if (!scan.errors.empty()) {
+ for (JournalScanner::ErrorMap::const_iterator i = scan.errors.begin();
+ i != scan.errors.end(); ++i) {
+ std::cout << " 0x" << std::hex << i->first << std::dec
+ << ": " << i->second.r << " "
+ << i->second.description << std::endl;
+ }
+ }
+}
diff --git a/src/tools/cephfs/EventOutput.h b/src/tools/cephfs/EventOutput.h
new file mode 100644
index 000000000..65d968409
--- /dev/null
+++ b/src/tools/cephfs/EventOutput.h
@@ -0,0 +1,42 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#ifndef EVENT_OUTPUT_H
+#define EVENT_OUTPUT_H
+
+#include <string>
+
+class JournalScanner;
+
+/**
+ * Different output formats for the results of a journal scan
+ */
+class EventOutput
+{
+ private:
+ JournalScanner const &scan;
+ std::string const path;
+
+ public:
+ EventOutput(JournalScanner const &scan_, std::string const &path_)
+ : scan(scan_), path(path_) {}
+
+ void summary() const;
+ void list() const;
+ int json() const;
+ int binary() const;
+};
+
+#endif // EVENT_OUTPUT_H
+
diff --git a/src/tools/cephfs/JournalFilter.cc b/src/tools/cephfs/JournalFilter.cc
new file mode 100644
index 000000000..266d7fccb
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.cc
@@ -0,0 +1,315 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include "JournalFilter.h"
+
+#include "common/ceph_argparse.h"
+
+#include "mds/events/ESession.h"
+#include "mds/events/EUpdate.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+const string JournalFilter::range_separator("..");
+
+bool JournalFilter::apply(uint64_t pos, PurgeItem &pi) const
+{
+ /* Filtering by journal offset range */
+ if (pos < range_start || pos >= range_end) {
+ return false;
+ }
+
+ if (purge_action != PurgeItem::NONE) {
+ if (pi.action != purge_action)
+ return false;
+ }
+
+ if (inode) {
+ if (inode != pi.ino)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Return whether a LogEvent is to be included or excluded.
+ *
+ * The filter parameters are applied on an AND basis: if any
+ * condition is not met, the event is excluded. Try to do
+ * the fastest checks first.
+ */
+bool JournalFilter::apply(uint64_t pos, LogEvent &le) const
+{
+ /* Filtering by journal offset range */
+ if (pos < range_start || pos >= range_end) {
+ return false;
+ }
+
+ /* Filtering by event type */
+ if (event_type != 0) {
+ if (le.get_type() != event_type) {
+ return false;
+ }
+ }
+
+ /* Filtering by client */
+ if (client_name.num()) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ if (metablob->get_client_name() != client_name) {
+ return false;
+ }
+ } else if (le.get_type() == EVENT_SESSION) {
+ ESession *es = reinterpret_cast<ESession*>(&le);
+ if (es->get_client_inst().name != client_name) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ /* Filtering by inode */
+ if (inode) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ std::set<inodeno_t> inodes;
+ metablob->get_inodes(inodes);
+ bool match_any = false;
+ for (std::set<inodeno_t>::iterator i = inodes.begin(); i != inodes.end(); ++i) {
+ if (*i == inode) {
+ match_any = true;
+ break;
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ /* Filtering by frag and dentry */
+ if (!frag_dentry.empty() || frag.ino) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ std::map<dirfrag_t, std::set<std::string> > dentries;
+ metablob->get_dentries(dentries);
+
+ if (frag.ino) {
+ bool match_any = false;
+ for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin();
+ i != dentries.end(); ++i) {
+ if (i->first == frag) {
+ match_any = true;
+ break;
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ }
+
+ if (!frag_dentry.empty()) {
+ bool match_any = false;
+ for (std::map<dirfrag_t, std::set<std::string> >::iterator i = dentries.begin();
+ i != dentries.end() && !match_any; ++i) {
+ std::set<std::string> const &names = i->second;
+ for (std::set<std::string>::iterator j = names.begin();
+ j != names.end() && !match_any; ++j) {
+ if (*j == frag_dentry) {
+ match_any = true;
+ }
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ }
+
+ } else {
+ return false;
+ }
+ }
+
+ /* Filtering by file path */
+ if (!path_expr.empty()) {
+ EMetaBlob const *metablob = le.get_metablob();
+ if (metablob) {
+ std::vector<std::string> paths;
+ metablob->get_paths(paths);
+ bool match_any = false;
+ for (std::vector<std::string>::iterator p = paths.begin(); p != paths.end(); ++p) {
+ if ((*p).find(path_expr) != std::string::npos) {
+ match_any = true;
+ break;
+ }
+ }
+ if (!match_any) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+int JournalFilter::parse_args(
+ std::vector<const char*> &argv,
+ std::vector<const char*>::iterator &arg)
+{
+ while(arg != argv.end()) {
+ std::string arg_str;
+ if (ceph_argparse_witharg(argv, arg, &arg_str, "--range", (char*)NULL)) {
+ size_t sep_loc = arg_str.find(JournalFilter::range_separator);
+ if (sep_loc == std::string::npos || arg_str.size() <= JournalFilter::range_separator.size()) {
+ derr << "Invalid range '" << arg_str << "'" << dendl;
+ return -EINVAL;
+ }
+
+ // We have a lower bound
+ if (sep_loc > 0) {
+ std::string range_start_str = arg_str.substr(0, sep_loc);
+ std::string parse_err;
+ range_start = strict_strtoll(range_start_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid lower bound '" << range_start_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+ }
+
+ if (sep_loc < arg_str.size() - JournalFilter::range_separator.size()) {
+ std::string range_end_str = arg_str.substr(sep_loc + range_separator.size());
+ std::string parse_err;
+ range_end = strict_strtoll(range_end_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid upper bound '" << range_end_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+ }
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--path\"." << dendl;
+ return -EINVAL;
+ }
+ dout(4) << "Filtering by path '" << arg_str << "'" << dendl;
+ path_expr = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--inode", (char*)NULL)) {
+ dout(4) << "Filtering by inode '" << arg_str << "'" << dendl;
+ std::string parse_err;
+ inode = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid inode '" << arg_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--type", (char*)NULL)) {
+ try {
+ if (!type.compare("mdlog")) {
+ event_type = LogEvent::str_to_type(arg_str);
+ } else if (!type.compare("purge_queue")) {
+ purge_action = PurgeItem::str_to_type(arg_str);
+ }
+ } catch (const std::out_of_range&) {
+ derr << "Invalid event type '" << arg_str << "'" << dendl;
+ return -EINVAL;
+ }
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--frag", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--frag\"." << dendl;
+ return -EINVAL;
+ }
+ std::string const frag_sep = ".";
+ size_t sep_loc = arg_str.find(frag_sep);
+ std::string inode_str;
+ std::string frag_str;
+ if (sep_loc != std::string::npos) {
+ inode_str = arg_str.substr(0, sep_loc);
+ frag_str = arg_str.substr(sep_loc + 1);
+ } else {
+ inode_str = arg_str;
+ frag_str = "0";
+ }
+
+ std::string parse_err;
+ inodeno_t frag_ino = strict_strtoll(inode_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid inode '" << inode_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ uint32_t frag_enc = strict_strtoll(frag_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid frag '" << frag_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ frag = dirfrag_t(frag_ino, frag_t(frag_enc));
+ dout(4) << "dirfrag filter: '" << frag << "'" << dendl;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--dname", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--dname\"." << dendl;
+ return -EINVAL;
+ }
+ frag_dentry = arg_str;
+ dout(4) << "dentry filter: '" << frag_dentry << "'" << dendl;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--client", (char*)NULL)) {
+ if (!type.compare("purge_queue")) {
+ derr << "Invalid filter arguments: purge_queue doesn't take \"--client\"." << dendl;
+ return -EINVAL;
+ }
+
+ std::string parse_err;
+ int64_t client_num = strict_strtoll(arg_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid client number " << arg_str << dendl;
+ return -EINVAL;
+ }
+ client_name = entity_name_t::CLIENT(client_num);
+ } else {
+ // We're done with args the filter understands
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * If the filter params are only range, then return
+ * true and set start & end. Else return false.
+ *
+ * Use this to discover if the user has requested a contiguous range
+ * rather than any per-event filtering.
+ */
+bool JournalFilter::get_range(uint64_t &start, uint64_t &end) const
+{
+ if (!path_expr.empty()
+ || inode != 0
+ || event_type != 0
+ || frag.ino != 0
+ || client_name.num() != 0
+ || (range_start == 0 && range_end == (uint64_t)(-1))) {
+ return false;
+ } else {
+ start = range_start;
+ end = range_end;
+ return true;
+ }
+}
diff --git a/src/tools/cephfs/JournalFilter.h b/src/tools/cephfs/JournalFilter.h
new file mode 100644
index 000000000..f7a2db614
--- /dev/null
+++ b/src/tools/cephfs/JournalFilter.h
@@ -0,0 +1,73 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#ifndef JOURNAL_FILTER_H
+#define JOURNAL_FILTER_H
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/PurgeQueue.h"
+
+/**
+ * A set of conditions for narrowing down a search through the journal
+ */
+class JournalFilter
+{
+ private:
+
+ /* Filtering by journal offset range */
+ uint64_t range_start;
+ uint64_t range_end;
+ static const std::string range_separator;
+
+ /* Filtering by file (sub) path */
+ std::string path_expr;
+
+ /* Filtering by inode */
+ inodeno_t inode;
+
+ /* Filtering by type */
+ LogEvent::EventType event_type;
+
+ std::string type;
+
+ /* Filtering by PurgeItem::Action */
+ PurgeItem::Action purge_action;
+
+ /* Filtering by dirfrag */
+ dirfrag_t frag;
+ std::string frag_dentry; //< optional, filter dentry name within fragment
+
+ /* Filtering by metablob client name */
+ entity_name_t client_name;
+
+ public:
+ JournalFilter(std::string t) :
+ range_start(0),
+ range_end(-1),
+ inode(0),
+ event_type(0),
+ type(t),
+ purge_action(PurgeItem::NONE) {}
+
+ bool get_range(uint64_t &start, uint64_t &end) const;
+ bool apply(uint64_t pos, LogEvent &le) const;
+ bool apply(uint64_t pos, PurgeItem &pi) const;
+ int parse_args(
+ std::vector<const char*> &argv,
+ std::vector<const char*>::iterator &arg);
+};
+
+#endif // JOURNAL_FILTER_H
+
diff --git a/src/tools/cephfs/JournalScanner.cc b/src/tools/cephfs/JournalScanner.cc
new file mode 100644
index 000000000..e72542fd4
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.cc
@@ -0,0 +1,438 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include "include/rados/librados.hpp"
+#include "mds/JournalPointer.h"
+
+#include "mds/events/ESubtreeMap.h"
+#include "mds/PurgeQueue.h"
+
+#include "JournalScanner.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+/**
+ * Read journal header, followed by sequential scan through journal space.
+ *
+ * Return 0 on success, else error code. Note that success has the special meaning
+ * that we were able to apply our checks, it does *not* mean that the journal is
+ * healthy.
+ */
+int JournalScanner::scan(bool const full)
+{
+ int r = 0;
+
+ r = set_journal_ino();
+ if (r < 0) {
+ return r;
+ }
+
+ if (!is_mdlog || pointer_present) {
+ r = scan_header();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (full && header_present) {
+ r = scan_events();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
+int JournalScanner::set_journal_ino()
+{
+ int r = 0;
+ if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + rank;
+ }
+ else if (type == "mdlog"){
+ r = scan_pointer();
+ is_mdlog = true;
+ }
+ else {
+ ceph_abort(); // should not get here
+ }
+ return r;
+}
+
+int JournalScanner::scan_pointer()
+{
+ // Issue read
+ std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0);
+ bufferlist pointer_bl;
+ int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0);
+ if (r == -ENOENT) {
+ // 'Successfully' discovered the pointer is missing.
+ derr << "Pointer " << pointer_oid << " is absent" << dendl;
+ return 0;
+ } else if (r < 0) {
+ // Error preventing us interrogating pointer
+ derr << "Pointer " << pointer_oid << " is unreadable" << dendl;
+ return r;
+ } else {
+ dout(4) << "Pointer " << pointer_oid << " is readable" << dendl;
+ pointer_present = true;
+
+ JournalPointer jp;
+ try {
+ auto q = pointer_bl.cbegin();
+ jp.decode(q);
+ } catch(buffer::error &e) {
+ derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl;
+ return 0;
+ }
+
+ pointer_valid = true;
+ ino = jp.front;
+ return 0;
+ }
+}
+
+
+int JournalScanner::scan_header()
+{
+ int r;
+
+ bufferlist header_bl;
+ std::string header_name = obj_name(0);
+ dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl;
+ r = io.read(header_name, header_bl, INT_MAX, 0);
+ if (r < 0) {
+ derr << "Header " << header_name << " is unreadable" << dendl;
+ return 0; // "Successfully" found an error
+ } else {
+ header_present = true;
+ }
+
+ auto header_bl_i = header_bl.cbegin();
+ header = new Journaler::Header();
+ try
+ {
+ header->decode(header_bl_i);
+ }
+ catch (buffer::error &e)
+ {
+ derr << "Header is corrupt (" << e.what() << ")" << dendl;
+ delete header;
+ header = NULL;
+ return 0; // "Successfully" found an error
+ }
+
+ if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) {
+ derr << "Header is corrupt (bad magic)" << dendl;
+ return 0; // "Successfully" found an error
+ }
+ if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) {
+ derr << "Header is invalid (inconsistent offsets)" << dendl;
+ return 0; // "Successfully" found an error
+ }
+ header_valid = true;
+
+ return 0;
+}
+
+
+int JournalScanner::scan_events()
+{
+ uint64_t object_size = g_conf()->mds_log_segment_size;
+ if (object_size == 0) {
+ // Default layout object size
+ object_size = file_layout_t::get_default().object_size;
+ }
+
+ uint64_t read_offset = header->expire_pos;
+ dout(10) << std::hex << "Header 0x"
+ << header->trimmed_pos << " 0x"
+ << header->expire_pos << " 0x"
+ << header->write_pos << std::dec << dendl;
+ dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl;
+
+ // TODO also check for extraneous objects before the trimmed pos or after the write pos,
+ // which would indicate a bogus header.
+
+ bufferlist read_buf;
+ bool gap = false;
+ uint64_t gap_start = -1;
+ for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) {
+ uint64_t offset_in_obj = 0;
+ if (obj_offset * object_size < header->expire_pos) {
+ // Skip up to expire_pos from start of the object
+ // (happens for the first object we read)
+ offset_in_obj = header->expire_pos - obj_offset * object_size;
+ }
+
+ // Read this journal segment
+ bufferlist this_object;
+ std::string const oid = obj_name(obj_offset);
+ int r = io.read(oid, this_object, INT_MAX, offset_in_obj);
+
+ // Handle absent journal segments
+ if (r < 0) {
+ if (obj_offset > (header->write_pos / object_size)) {
+ dout(4) << "Reached end of journal objects" << dendl;
+ break;
+ } else {
+ derr << "Missing object " << oid << dendl;
+ }
+
+ objects_missing.push_back(obj_offset);
+ if (!gap) {
+ gap_start = read_offset;
+ gap = true;
+ }
+ if (read_buf.length() > 0) {
+ read_offset += read_buf.length();
+ read_buf.clear();
+ }
+ read_offset += object_size - offset_in_obj;
+ continue;
+ } else {
+ dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec
+ << " bytes from " << oid << " gap=" << gap << dendl;
+ objects_valid.push_back(oid);
+ this_object.begin().copy(this_object.length(), read_buf);
+ }
+
+ if (gap) {
+ // No valid data at the current read offset, scan forward until we find something valid looking
+ // or have to drop out to load another object.
+ dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset
+ << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl;
+
+ do {
+ auto p = read_buf.cbegin();
+ uint64_t candidate_sentinel;
+ decode(candidate_sentinel, p);
+
+ dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl;
+
+ if (candidate_sentinel == JournalStream::sentinel) {
+ dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl;
+ ranges_invalid.push_back(Range(gap_start, read_offset));
+ gap = false;
+ break;
+ } else {
+ // No sentinel, discard this byte
+ read_buf.splice(0, 1);
+ read_offset += 1;
+ }
+ } while (read_buf.length() >= sizeof(JournalStream::sentinel));
+ dout(4) << "read_buf size is " << read_buf.length() << dendl;
+ }
+ {
+ dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl;
+ while(true) {
+ // TODO: detect and handle legacy format journals: can do many things
+ // on them but on read errors have to give up instead of searching
+ // for sentinels.
+ JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT);
+ bool readable = false;
+ try {
+ uint64_t need;
+ readable = journal_stream.readable(read_buf, &need);
+ } catch (buffer::error &e) {
+ readable = false;
+ dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl;
+ gap = true;
+ gap_start = read_offset;
+ read_buf.splice(0, 1);
+ read_offset += 1;
+ break;
+ }
+
+ if (!readable) {
+ // Out of data, continue to read next object
+ break;
+ }
+
+ bufferlist le_bl; //< Serialized LogEvent blob
+ dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl;
+ // This cannot fail to decode because we pre-checked that a serialized entry
+ // blob would be readable.
+ uint64_t start_ptr = 0;
+ uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr);
+ dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl;
+ if (start_ptr != read_offset) {
+ derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x"
+ << read_offset << std::dec << dendl;
+ gap = true;
+ gap_start = read_offset;
+ // FIXME: given that entry was invalid, should we be skipping over it?
+ // maybe push bytes back onto start of read_buf and just advance one byte
+ // to start scanning instead. e.g. if a bogus size value is found it can
+ // cause us to consume and thus skip a bunch of following valid events.
+ read_offset += consumed;
+ break;
+ }
+ bool valid_entry = true;
+ if (is_mdlog) {
+ auto le = LogEvent::decode_event(le_bl.cbegin());
+
+ if (le) {
+ dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+
+ if (le->get_type() == EVENT_SUBTREEMAP
+ || le->get_type() == EVENT_SUBTREEMAP_TEST) {
+ auto&& sle = dynamic_cast<ESubtreeMap&>(*le);
+ if (sle.expire_pos > read_offset) {
+ errors.insert(std::make_pair(
+ read_offset, EventError(
+ -ERANGE,
+ "ESubtreeMap has expire_pos ahead of its own position")));
+ }
+ }
+
+ if (filter.apply(read_offset, *le)) {
+ events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed));
+ }
+ } else {
+ valid_entry = false;
+ }
+ } else if (type == "purge_queue"){
+ auto pi = std::make_unique<PurgeItem>();
+ try {
+ auto q = le_bl.cbegin();
+ pi->decode(q);
+ if (filter.apply(read_offset, *pi)) {
+ events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed));
+ }
+ } catch (const buffer::error &err) {
+ valid_entry = false;
+ }
+ } else {
+ ceph_abort(); // should not get here
+ }
+ if (!valid_entry) {
+ dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl;
+ gap = true;
+ gap_start = read_offset;
+ read_offset += consumed;
+ break;
+ } else {
+ events_valid.push_back(read_offset);
+ read_offset += consumed;
+ }
+ }
+ }
+ }
+
+ if (gap) {
+ // Ended on a gap, assume it ran to end
+ ranges_invalid.push_back(Range(gap_start, -1));
+ }
+
+ dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl;
+ dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl;
+ dout(4) << "Found " << events_valid.size() << " valid events" << dendl;
+ dout(4) << "Selected " << events.size() << " events events for processing" << dendl;
+
+ return 0;
+}
+
+
+JournalScanner::~JournalScanner()
+{
+ if (header) {
+ delete header;
+ header = NULL;
+ }
+ dout(4) << events.size() << " events" << dendl;
+ events.clear();
+}
+
+
+/**
+ * Whether the journal data looks valid and replayable
+ */
+bool JournalScanner::is_healthy() const
+{
+ return ((!is_mdlog || (pointer_present && pointer_valid))
+ && header_present && header_valid
+ && ranges_invalid.empty()
+ && objects_missing.empty());
+}
+
+
+/**
+ * Whether the journal data can be read from RADOS
+ */
+bool JournalScanner::is_readable() const
+{
+ return (header_present && header_valid && objects_missing.empty());
+}
+
+
+/**
+ * Calculate the object name for a given offset
+ */
+std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%llx.%08llx",
+ (unsigned long long)(ino),
+ (unsigned long long)offset);
+ return std::string(name);
+}
+
+
+std::string JournalScanner::obj_name(uint64_t offset) const
+{
+ return obj_name(ino, offset);
+}
+
+
+/*
+ * Write a human readable summary of the journal health
+ */
+void JournalScanner::report(std::ostream &out) const
+{
+ out << "Overall journal integrity: " << (is_healthy() ? "OK" : "DAMAGED") << std::endl;
+
+ if (is_mdlog) {
+ if (!pointer_present) {
+ out << "Pointer not found" << std::endl;
+ } else if (!pointer_valid) {
+ out << "Pointer could not be decoded" << std::endl;
+ }
+ }
+ if (!header_present) {
+ out << "Header not found" << std::endl;
+ } else if (!header_valid) {
+ out << "Header could not be decoded" << std::endl;
+ }
+
+ if (objects_missing.size()) {
+ out << "Objects missing:" << std::endl;
+ for (std::vector<uint64_t>::const_iterator om = objects_missing.begin();
+ om != objects_missing.end(); ++om) {
+ out << " 0x" << std::hex << *om << std::dec << std::endl;
+ }
+ }
+
+ if (ranges_invalid.size()) {
+ out << "Corrupt regions:" << std::endl;
+ for (std::vector<Range>::const_iterator r = ranges_invalid.begin();
+ r != ranges_invalid.end(); ++r) {
+ out << " 0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl;
+ }
+ }
+}
+
diff --git a/src/tools/cephfs/JournalScanner.h b/src/tools/cephfs/JournalScanner.h
new file mode 100644
index 000000000..9197b5596
--- /dev/null
+++ b/src/tools/cephfs/JournalScanner.h
@@ -0,0 +1,133 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+#ifndef JOURNAL_SCANNER_H
+#define JOURNAL_SCANNER_H
+
+#include "include/rados/librados_fwd.hpp"
+
+// For Journaler::Header, can't forward-declare nested classes
+#include <osdc/Journaler.h>
+
+#include "JournalFilter.h"
+
+/**
+ * A simple sequential reader for metadata journals. Unlike
+ * the MDS Journaler class, this is written to detect, record,
+ * and read past corruptions and missing objects. It is also
+ * less efficient but more plainly written.
+ */
+class JournalScanner
+{
+ private:
+ librados::IoCtx &io;
+
+ // Input constraints
+ const int rank;
+ std::string type;
+ JournalFilter const filter;
+
+ void gap_advance();
+
+ public:
+ JournalScanner(
+ librados::IoCtx &io_,
+ int rank_,
+ const std::string &type_,
+ JournalFilter const &filter_) :
+ io(io_),
+ rank(rank_),
+ type(type_),
+ filter(filter_),
+ is_mdlog(false),
+ pointer_present(false),
+ pointer_valid(false),
+ header_present(false),
+ header_valid(false),
+ header(NULL) {};
+
+ JournalScanner(
+ librados::IoCtx &io_,
+ int rank_,
+ const std::string &type_) :
+ io(io_),
+ rank(rank_),
+ type(type_),
+ filter(type_),
+ is_mdlog(false),
+ pointer_present(false),
+ pointer_valid(false),
+ header_present(false),
+ header_valid(false),
+ header(NULL) {};
+
+ ~JournalScanner();
+
+ int set_journal_ino();
+ int scan(bool const full=true);
+ int scan_pointer();
+ int scan_header();
+ int scan_events();
+ void report(std::ostream &out) const;
+
+ std::string obj_name(uint64_t offset) const;
+ std::string obj_name(inodeno_t ino, uint64_t offset) const;
+
+ // The results of the scan
+ inodeno_t ino; // Corresponds to journal ino according their type
+ struct EventRecord {
+ EventRecord(std::unique_ptr<LogEvent> le, uint32_t rs) : log_event(std::move(le)), raw_size(rs) {}
+ EventRecord(std::unique_ptr<PurgeItem> p, uint32_t rs) : pi(std::move(p)), raw_size(rs) {}
+ std::unique_ptr<LogEvent> log_event;
+ std::unique_ptr<PurgeItem> pi;
+ uint32_t raw_size = 0; //< Size from start offset including all encoding overhead
+ };
+
+ class EventError {
+ public:
+ int r;
+ std::string description;
+ EventError(int r_, const std::string &desc_)
+ : r(r_), description(desc_) {}
+ };
+
+ typedef std::map<uint64_t, EventRecord> EventMap;
+ typedef std::map<uint64_t, EventError> ErrorMap;
+ typedef std::pair<uint64_t, uint64_t> Range;
+ bool is_mdlog;
+ bool pointer_present; //mdlog specific
+ bool pointer_valid; //mdlog specific
+ bool header_present;
+ bool header_valid;
+ Journaler::Header *header;
+
+ bool is_healthy() const;
+ bool is_readable() const;
+ std::vector<std::string> objects_valid;
+ std::vector<uint64_t> objects_missing;
+ std::vector<Range> ranges_invalid;
+ std::vector<uint64_t> events_valid;
+ EventMap events;
+
+ // For events present in ::events (i.e. scanned successfully),
+ // any subsequent errors handling them (e.g. replaying)
+ ErrorMap errors;
+
+
+ private:
+ // Forbid copy construction because I have ptr members
+ JournalScanner(const JournalScanner &rhs);
+};
+
+#endif // JOURNAL_SCANNER_H
+
diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc
new file mode 100644
index 000000000..ec9860980
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.cc
@@ -0,0 +1,1266 @@
+// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * ceph - scalable distributed file system
+ *
+ * copyright (c) 2014 john spray <john.spray@inktank.com>
+ *
+ * this is free software; you can redistribute it and/or
+ * modify it under the terms of the gnu lesser general public
+ * license version 2.1, as published by the free software
+ * foundation. see file copying.
+ */
+
+
+#include <sstream>
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "JournalScanner.h"
+#include "EventOutput.h"
+#include "Dumper.h"
+#include "Resetter.h"
+
+#include "JournalTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+
+
+void JournalTool::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-journal-tool [options] journal <command>\n"
+ << " <command>:\n"
+ << " inspect\n"
+ << " import <path> [--force]\n"
+ << " export <path>\n"
+ << " reset [--force]\n"
+ << " cephfs-journal-tool [options] header <get|set> <field> <value>\n"
+ << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
+ << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
+ << " <selector>:\n"
+ << " --range=<start>..<end>\n"
+ << " --path=<substring>\n"
+ << " --inode=<integer>\n"
+ << " --type=<UPDATE|OPEN|SESSION...><\n"
+ << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
+ << " --client=<session id integer>\n"
+ << " <effect>: [get|recover_dentries|splice]\n"
+ << " <output>: [summary|list|binary|json] [--path <path>]\n"
+ << "\n"
+ << "General options:\n"
+ << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n"
+ << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n"
+ << " this journal is used to queue for purge operation,\n"
+ << " default is mdlog, and only mdlog support event mode)\n"
+ << "\n"
+ << "Special options\n"
+ << " --alternate-pool <name> Alternative metadata pool to target\n"
+ << " when using recover_dentries.\n";
+
+ generic_client_usage();
+}
+
+
+/**
+ * Handle arguments and hand off to journal/header/event mode
+ */
+int JournalTool::main(std::vector<const char*> &argv)
+{
+ int r;
+
+ dout(10) << "JournalTool::main " << dendl;
+ // Common arg parsing
+ // ==================
+ if (argv.empty()) {
+ cerr << "missing positional argument" << std::endl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+
+ std::string rank_str;
+ if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+ derr << "missing mandatory \"--rank\" argument" << dendl;
+ return -EINVAL;
+ }
+
+ if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) {
+ // Default is mdlog
+ type = "mdlog";
+ }
+
+ r = validate_type(type);
+ if (r != 0) {
+ derr << "journal type is not correct." << dendl;
+ return r;
+ }
+
+ r = role_selector.parse(*fsmap, rank_str, false);
+ if (r != 0) {
+ derr << "Couldn't determine MDS rank." << dendl;
+ return r;
+ }
+
+ std::string mode;
+ if (arg == argv.end()) {
+ derr << "Missing mode [journal|header|event]" << dendl;
+ return -EINVAL;
+ }
+ mode = std::string(*arg);
+ arg = argv.erase(arg);
+
+ // RADOS init
+ // ==========
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ ceph_assert(fs != nullptr);
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+
+ dout(4) << "JournalTool: creating IoCtx.." << dendl;
+ r = rados.ioctx_create(pool_name.c_str(), input);
+ ceph_assert(r == 0);
+ output.dup(input);
+
+ // Execution
+ // =========
+ // journal and header are general journal mode
+ // event mode is only specific for mdlog
+ auto roles = role_selector.get_roles();
+ if (roles.size() > 1) {
+ const std::string &command = argv[0];
+ bool allowed = can_execute_for_all_ranks(mode, command);
+ if (!allowed) {
+ derr << "operation not allowed for all ranks" << dendl;
+ return -EINVAL;
+ }
+
+ all_ranks = true;
+ }
+ for (auto role : roles) {
+ rank = role.rank;
+ std::vector<const char *> rank_argv(argv);
+ dout(4) << "Executing for rank " << rank << dendl;
+ if (mode == std::string("journal")) {
+ r = main_journal(rank_argv);
+ } else if (mode == std::string("header")) {
+ r = main_header(rank_argv);
+ } else if (mode == std::string("event")) {
+ r = main_event(rank_argv);
+ } else {
+ cerr << "Bad command '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+int JournalTool::validate_type(const std::string &type)
+{
+ if (type == "mdlog" || type == "purge_queue") {
+ return 0;
+ }
+ return -1;
+}
+
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+ if (!all_ranks) {
+ return prefix;
+ }
+
+ return prefix + "." + std::to_string(rank);
+}
+
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command) {
+ if (mode == "journal" && command == "import") {
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * Handle arguments for 'journal' mode
+ *
+ * This is for operations that act on the journal as a whole.
+ */
+int JournalTool::main_journal(std::vector<const char*> &argv)
+{
+ if (argv.empty()) {
+ derr << "Missing journal command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::string command = argv[0];
+ if (command == "inspect") {
+ return journal_inspect();
+ } else if (command == "export" || command == "import") {
+ bool force = false;
+ if (argv.size() >= 2) {
+ std::string const path = argv[1];
+ if (argv.size() == 3) {
+ if (std::string(argv[2]) == "--force") {
+ force = true;
+ } else {
+ std::cerr << "Unknown argument " << argv[1] << std::endl;
+ return -EINVAL;
+ }
+ }
+ return journal_export(path, command == "import", force);
+ } else {
+ derr << "Missing path" << dendl;
+ return -EINVAL;
+ }
+ } else if (command == "reset") {
+ bool force = false;
+ if (argv.size() == 2) {
+ if (std::string(argv[1]) == "--force") {
+ force = true;
+ } else {
+ std::cerr << "Unknown argument " << argv[1] << std::endl;
+ return -EINVAL;
+ }
+ } else if (argv.size() > 2) {
+ std::cerr << "Too many arguments!" << std::endl;
+ return -EINVAL;
+ }
+ return journal_reset(force);
+ } else {
+ derr << "Bad journal command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+}
+
+
+/**
+ * Parse arguments and execute for 'header' mode
+ *
+ * This is for operations that act on the header only.
+ */
+int JournalTool::main_header(std::vector<const char*> &argv)
+{
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ int r = js.scan(false);
+ if (r < 0) {
+ std::cerr << "Unable to scan journal" << std::endl;
+ return r;
+ }
+
+ if (!js.header_present) {
+ std::cerr << "Header object not found!" << std::endl;
+ return -ENOENT;
+ } else if (!js.header_valid && js.header == NULL) {
+ // Can't do a read or a single-field write without a copy of the original
+ derr << "Header could not be read!" << dendl;
+ return -ENOENT;
+ } else {
+ ceph_assert(js.header != NULL);
+ }
+
+ if (argv.empty()) {
+ derr << "Missing header command, must be [get|set]" << dendl;
+ return -EINVAL;
+ }
+ std::vector<const char *>::iterator arg = argv.begin();
+ std::string const command = *arg;
+ arg = argv.erase(arg);
+
+ if (command == std::string("get")) {
+ // Write JSON journal dump to stdout
+ JSONFormatter jf(true);
+ js.header->dump(&jf);
+ jf.flush(std::cout);
+ std::cout << std::endl;
+ } else if (command == std::string("set")) {
+ // Need two more args <key> <val>
+ if (argv.size() != 2) {
+ derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
+ return -EINVAL;
+ }
+
+ std::string const field_name = *arg;
+ arg = argv.erase(arg);
+
+ std::string const value_str = *arg;
+ arg = argv.erase(arg);
+ ceph_assert(argv.empty());
+
+ std::string parse_err;
+ uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
+ if (!parse_err.empty()) {
+ derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
+ return -EINVAL;
+ }
+
+ uint64_t *field = NULL;
+ if (field_name == "trimmed_pos") {
+ field = &(js.header->trimmed_pos);
+ } else if (field_name == "expire_pos") {
+ field = &(js.header->expire_pos);
+ } else if (field_name == "write_pos") {
+ field = &(js.header->write_pos);
+ } else if (field_name == "pool_id") {
+ field = (uint64_t*)(&(js.header->layout.pool_id));
+ } else {
+ derr << "Invalid field '" << field_name << "'" << dendl;
+ return -EINVAL;
+ }
+
+ std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
+ *field = new_val;
+
+ dout(4) << "Writing object..." << dendl;
+ bufferlist header_bl;
+ encode(*(js.header), header_bl);
+ output.write_full(js.obj_name(0), header_bl);
+ dout(4) << "Write complete." << dendl;
+ std::cout << "Successfully updated header." << std::endl;
+ } else {
+ derr << "Bad header command '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Parse arguments and execute for 'event' mode
+ *
+ * This is for operations that act on LogEvents within the log
+ */
+int JournalTool::main_event(std::vector<const char*> &argv)
+{
+ int r;
+
+ if (argv.empty()) {
+ derr << "Missing event command, please see help" << dendl;
+ return -EINVAL;
+ }
+
+ std::vector<const char*>::iterator arg = argv.begin();
+ bool dry_run = false;
+
+ std::string command = *(arg++);
+ if (command != "get" && command != "splice" && command != "recover_dentries") {
+ derr << "Unknown argument '" << command << "'" << dendl;
+ return -EINVAL;
+ }
+
+ if (command == "recover_dentries") {
+ if (type != "mdlog") {
+ derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl;
+ return -EINVAL;
+ } else {
+ if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
+ dry_run = true;
+ }
+ }
+ }
+
+ if (arg == argv.end()) {
+ derr << "Incomplete command line" << dendl;
+ return -EINVAL;
+ }
+
+ // Parse filter options
+ // ====================
+ JournalFilter filter(type);
+ r = filter.parse_args(argv, arg);
+ if (r) {
+ return r;
+ }
+
+ // Parse output options
+ // ====================
+ if (arg == argv.end()) {
+ cerr << "Missing output command" << std::endl;
+ return -EINVAL;
+ }
+ std::string output_style = *(arg++);
+ if (output_style != "binary" && output_style != "json" &&
+ output_style != "summary" && output_style != "list") {
+ cerr << "Unknown argument: '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ std::string output_path = "dump";
+ while(arg != argv.end()) {
+ std::string arg_str;
+ if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
+ output_path = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
+ nullptr)) {
+ dout(1) << "Using alternate pool " << arg_str << dendl;
+ int r = rados.ioctx_create(arg_str.c_str(), output);
+ ceph_assert(r == 0);
+ other_pool = true;
+ } else {
+ cerr << "Unknown argument: '" << *arg << "'" << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ const std::string dump_path = gen_dump_file_path(output_path);
+
+ // Execute command
+ // ===============
+ JournalScanner js(input, rank, type, filter);
+ if (command == "get") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+ } else if (command == "recover_dentries") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ /**
+ * Iterate over log entries, attempting to scavenge from each one
+ */
+ std::set<inodeno_t> consumed_inos;
+ for (JournalScanner::EventMap::iterator i = js.events.begin();
+ i != js.events.end(); ++i) {
+ auto& le = i->second.log_event;
+ EMetaBlob const *mb = le->get_metablob();
+ if (mb) {
+ int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
+ if (scav_r) {
+ dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
+ if (r == 0) {
+ r = scav_r;
+ }
+ // Our goal is to read all we can, so don't stop on errors, but
+ // do record them for possible later output
+ js.errors.insert(std::make_pair(i->first,
+ JournalScanner::EventError(scav_r, cpp_strerror(r))));
+ }
+ }
+ }
+
+ /**
+ * Update InoTable to reflect any inode numbers consumed during scavenge
+ */
+ dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
+ if (consumed_inos.size() && !dry_run) {
+ int consume_r = consume_inos(consumed_inos);
+ if (consume_r) {
+ dout(1) << "Error updating InoTable for " << consumed_inos.size()
+ << " consume inos: " << cpp_strerror(consume_r) << dendl;
+ if (r == 0) {
+ r = consume_r;
+ }
+ }
+ }
+
+ // Remove consumed dentries from lost+found.
+ if (other_pool && !dry_run) {
+ std::set<std::string> found;
+
+ for (auto i : consumed_inos) {
+ char s[20];
+
+ snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
+ dout(20) << "removing " << s << dendl;
+ found.insert(std::string(s));
+ }
+
+ object_t frag_oid;
+ frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
+ frag_t(), "");
+ output.omap_rm_keys(frag_oid.name, found);
+ }
+ } else if (command == "splice") {
+ r = js.scan();
+ if (r) {
+ derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
+ return r;
+ }
+
+ uint64_t start, end;
+ if (filter.get_range(start, end)) {
+ // Special case for range filter: erase a numeric range in the log
+ uint64_t range = end - start;
+ int r = erase_region(js, start, range);
+ if (r) {
+ derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else {
+ // General case: erase a collection of individual entries in the log
+ for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
+ dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
+
+ int r = erase_region(js, i->first, i->second.raw_size);
+ if (r) {
+ derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+
+ } else {
+ cerr << "Unknown argument '" << command << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // Generate output
+ // ===============
+ EventOutput output(js, dump_path);
+ int output_result = 0;
+ if (output_style == "binary") {
+ output_result = output.binary();
+ } else if (output_style == "json") {
+ output_result = output.json();
+ } else if (output_style == "summary") {
+ output.summary();
+ } else if (output_style == "list") {
+ output.list();
+ } else {
+ std::cerr << "Bad output command '" << output_style << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ if (output_result != 0) {
+ std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
+ }
+
+ return output_result;
+}
+
+/**
+ * Provide the user with information about the condition of the journal,
+ * especially indicating what range of log events is available and where
+ * any gaps or corruptions in the journal are.
+ */
+int JournalTool::journal_inspect()
+{
+ int r;
+
+ JournalFilter filter(type);
+ JournalScanner js(input, rank, type, filter);
+ r = js.scan();
+ if (r) {
+ std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
+ return r;
+ }
+
+ js.report(std::cout);
+
+ return 0;
+}
+
+
+/**
+ * Attempt to export a binary dump of the journal.
+ *
+ * This is allowed to fail if the header is malformed or there are
+ * objects inaccessible, in which case the user would have to fall
+ * back to manually listing RADOS objects and extracting them, which
+ * they can do with the ``rados`` CLI.
+ */
+int JournalTool::journal_export(std::string const &path, bool import, bool force)
+{
+ int r = 0;
+ JournalScanner js(input, rank, type);
+
+ if (!import) {
+ /*
+ * If doing an export, first check that the header is valid and
+ * no objects are missing before trying to dump
+ */
+ r = js.scan();
+ if (r < 0) {
+ derr << "Unable to scan journal, assuming badly damaged" << dendl;
+ return r;
+ }
+ if (!js.is_readable()) {
+ derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
+ return -EIO;
+ }
+ }
+
+ /*
+ * Assuming we can cleanly read the journal data, dump it out to a file
+ */
+ {
+ Dumper dumper;
+ r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type);
+ if (r < 0) {
+ derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (import) {
+ r = dumper.undump(path.c_str(), force);
+ } else {
+ const std::string ex_path = gen_dump_file_path(path);
+ r = dumper.dump(ex_path.c_str());
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Truncate journal and insert EResetJournal
+ */
+int JournalTool::journal_reset(bool hard)
+{
+ int r = 0;
+ Resetter resetter;
+ r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard);
+ if (r < 0) {
+ derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (hard) {
+ r = resetter.reset_hard();
+ } else {
+ r = resetter.reset();
+ }
+
+ return r;
+}
+
+
+/**
+ * Selective offline replay which only reads out dentries and writes
+ * them to the backing store iff their version is > what is currently
+ * in the backing store.
+ *
+ * In order to write dentries to the backing store, we may create the
+ * required enclosing dirfrag objects.
+ *
+ * Test this by running scavenge on an unflushed journal, then nuking
+ * it offline, then starting an MDS and seeing that the dentries are
+ * visible.
+ *
+ * @param metablob an EMetaBlob retrieved from the journal
+ * @param dry_run if true, do no writes to RADOS
+ * @param consumed_inos output, populated with any inos inserted
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos)
+{
+ ceph_assert(consumed_inos != NULL);
+
+ int r = 0;
+
+ // Replay fullbits (dentry+inode)
+ for (const auto& frag : metablob.lump_order) {
+ EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
+ lump._decode_bits();
+ object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
+
+ dout(4) << "inspecting lump " << frag_oid.name << dendl;
+
+
+ // We will record old fnode version for use in hard link handling
+ // If we don't read an old fnode, take version as zero and write in
+ // all hardlinks we find.
+ version_t old_fnode_version = 0;
+
+ // Update fnode in omap header of dirfrag object
+ bool write_fnode = false;
+ bufferlist old_fnode_bl;
+ r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
+ if (r == -ENOENT) {
+ // Creating dirfrag from scratch
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ write_fnode = true;
+ // Note: creating the dirfrag *without* a backtrace, relying on
+ // MDS to regenerate backtraces on read or in FSCK
+ } else if (r == 0) {
+ // Conditionally update existing omap header
+ fnode_t old_fnode;
+ auto old_fnode_iter = old_fnode_bl.cbegin();
+ try {
+ old_fnode.decode(old_fnode_iter);
+ dout(4) << "frag " << frag_oid.name << " fnode old v" <<
+ old_fnode.version << " vs new v" << lump.fnode->version << dendl;
+ old_fnode_version = old_fnode.version;
+ write_fnode = old_fnode_version < lump.fnode->version;
+ } catch (const buffer::error &err) {
+ dout(1) << "frag " << frag_oid.name
+ << " is corrupt, overwriting" << dendl;
+ write_fnode = true;
+ }
+ } else {
+ // Unexpected error
+ dout(4) << "failed to read OMAP header from directory fragment "
+ << frag_oid.name << " " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if ((other_pool || write_fnode) && !dry_run) {
+ dout(4) << "writing fnode to omap header" << dendl;
+ bufferlist fnode_bl;
+ lump.fnode->encode(fnode_bl);
+ if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
+ r = output.omap_set_header(frag_oid.name, fnode_bl);
+ }
+ if (r != 0) {
+ derr << "Failed to write fnode for frag object "
+ << frag_oid.name << dendl;
+ return r;
+ }
+ }
+
+ std::set<std::string> read_keys;
+
+ // Compose list of potentially-existing dentries we would like to fetch
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ for (const auto& nb : lump.get_dnull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+ read_keys.insert(key);
+ }
+
+ // Perform bulk read of existing dentries
+ std::map<std::string, bufferlist> read_vals;
+ r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ if (r == -ENOENT && other_pool) {
+ r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ }
+ if (r != 0) {
+ derr << "unexpected error reading fragment object "
+ << frag_oid.name << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Compose list of dentries we will write back
+ std::map<std::string, bufferlist> write_vals;
+ for (const auto& fb : lump.get_dfull()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ // leave write_dentry false, we have no version to
+ // compare with in a hardlink, so it's not safe to
+ // squash over it with what's in this fullbit
+ dout(10) << "Existing remote inode in slot to be (maybe) written "
+ << "by a full inode from the journal dn '" << fb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ // Read out inode version to compare with backing store
+ InodeStore inode;
+ if (dentry_type == 'i') {
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode.decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode.decode_bare(q);
+ }
+ dout(4) << "decoded embedded inode version "
+ << inode.inode->version << " vs fullbit version "
+ << fb.inode->version << dendl;
+ if (inode.inode->version < fb.inode->version) {
+ write_dentry = true;
+ }
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing I dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(fb.dnfirst, dentry_bl);
+ encode('I', dentry_bl);
+ encode_fullbit_as_inode(fb, true, &dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(fb.inode->ino);
+ }
+ }
+
+ for(const auto& rb : lump.get_dremote()) {
+ // Get a key like "foobar_head"
+ std::string key;
+ dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
+ << dendl;
+ bool write_dentry = false;
+ if (read_vals.find(key) == read_vals.end()) {
+ dout(4) << "dentry did not already exist, will create" << dendl;
+ write_dentry = true;
+ } else {
+ dout(4) << "dentry " << key << " existed already" << dendl;
+ dout(4) << "dentry exists, checking versions..." << dendl;
+ bufferlist &old_dentry = read_vals[key];
+ // Decode dentry+inode
+ auto q = old_dentry.cbegin();
+
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ dout(10) << "Existing full inode in slot to be (maybe) written "
+ << "by a remote inode from the journal dn '" << rb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ write_dentry = old_fnode_version < lump.fnode->version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, overwriting from "
+ "journal" << dendl;
+ write_dentry = true;
+ }
+ }
+
+ if ((other_pool || write_dentry) && !dry_run) {
+ dout(4) << "writing L dentry " << key << " into frag "
+ << frag_oid.name << dendl;
+
+ // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
+ bufferlist dentry_bl;
+ encode(rb.dnfirst, dentry_bl);
+ encode('L', dentry_bl);
+ encode(rb.ino, dentry_bl);
+ encode(rb.d_type, dentry_bl);
+
+ // Record for writing to RADOS
+ write_vals[key] = dentry_bl;
+ consumed_inos->insert(rb.ino);
+ }
+ }
+
+ std::set<std::string> null_vals;
+ for (const auto& nb : lump.get_dnull()) {
+ std::string key;
+ dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
+ dn_key.encode(key);
+
+ dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
+ << dendl;
+
+ auto it = read_vals.find(key);
+ if (it != read_vals.end()) {
+ dout(4) << "dentry exists, will remove" << dendl;
+
+ auto q = it->second.cbegin();
+ snapid_t dnfirst;
+ decode(dnfirst, q);
+ char dentry_type;
+ decode(dentry_type, q);
+
+ bool remove_dentry = false;
+ if (dentry_type == 'L' || dentry_type == 'l') {
+ dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode->version;
+ } else if (dentry_type == 'I' || dentry_type == 'i') {
+ dout(10) << "Existing full inode in slot to be (maybe) removed "
+ << "by null journal dn '" << nb.dn.c_str()
+ << "' with lump fnode version " << lump.fnode->version
+ << "vs existing fnode version " << old_fnode_version << dendl;
+ remove_dentry = old_fnode_version < lump.fnode->version;
+ } else {
+ dout(4) << "corrupt dentry in backing store, will remove" << dendl;
+ remove_dentry = true;
+ }
+
+ if (remove_dentry)
+ null_vals.insert(key);
+ }
+ }
+
+ // Write back any new/changed dentries
+ if (!write_vals.empty()) {
+ r = output.omap_set(frag_oid.name, write_vals);
+ if (r != 0) {
+ derr << "error writing dentries to " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ // remove any null dentries
+ if (!null_vals.empty()) {
+ r = output.omap_rm_keys(frag_oid.name, null_vals);
+ if (r != 0) {
+ derr << "error removing dentries from " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ /* Now that we've looked at the dirlumps, we finally pay attention to
+ * the roots (i.e. inodes without ancestry). This is necessary in order
+ * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
+ * important because clients use them to infer completeness
+ * of directories
+ */
+ for (const auto& fb : metablob.roots) {
+ inodeno_t ino = fb.inode->ino;
+ dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
+
+ object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
+ dout(4) << "object id " << root_oid.name << dendl;
+
+ bool write_root_ino = false;
+ bufferlist old_root_ino_bl;
+ r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
+ if (r == -ENOENT) {
+ dout(4) << "root does not exist, will create" << dendl;
+ write_root_ino = true;
+ } else if (r >= 0) {
+ r = 0;
+ InodeStore old_inode;
+ dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
+ << ")" << dendl;
+ auto inode_bl_iter = old_root_ino_bl.cbegin();
+ std::string magic;
+ decode(magic, inode_bl_iter);
+ if (magic == CEPH_FS_ONDISK_MAGIC) {
+ dout(4) << "magic ok" << dendl;
+ old_inode.decode(inode_bl_iter);
+
+ if (old_inode.inode->version < fb.inode->version) {
+ write_root_ino = true;
+ }
+ } else {
+ dout(4) << "magic bad: '" << magic << "'" << dendl;
+ write_root_ino = true;
+ }
+ } else {
+ derr << "error reading root inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (write_root_ino && !dry_run) {
+ dout(4) << "writing root ino " << root_oid.name
+ << " version " << fb.inode->version << dendl;
+
+ // Compose: root ino format is magic,InodeStore(bare=false)
+ bufferlist new_root_ino_bl;
+ encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
+ encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
+
+ // Write to RADOS
+ r = output.write_full(root_oid.name, new_root_ino_bl);
+ if (r != 0) {
+ derr << "error writing inode object " << root_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+
+ return r;
+}
+
+
+/**
+ * Erase a region of the log by overwriting it with ENoOp
+ *
+ */
+int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
+{
+ // To erase this region, we use our preamble, the encoding overhead
+ // of an ENoOp, and our trailing start ptr. Calculate how much padding
+ // is needed inside the ENoOp to make up the difference.
+ bufferlist tmp;
+ if (type == "mdlog") {
+ ENoOp enoop(0);
+ enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.encode(tmp);
+ }
+
+ dout(4) << "erase_region " << pos << " len=" << length << dendl;
+
+ // FIXME: get the preamble/postamble length via JournalStream
+ int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
+ dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
+
+ if (padding < 0) {
+ derr << "Erase region " << length << " too short" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist entry;
+ if (type == "mdlog") {
+ // Serialize an ENoOp with the correct amount of padding
+ ENoOp enoop(padding);
+ enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else if (type == "purge_queue") {
+ PurgeItem pi;
+ pi.pad_size = padding;
+ pi.encode(entry);
+ }
+ JournalStream stream(JOURNAL_FORMAT_RESILIENT);
+ // Serialize region of log stream
+ bufferlist log_data;
+ stream.write(entry, &log_data, pos);
+
+ dout(4) << "erase_region data length " << log_data.length() << dendl;
+ ceph_assert(log_data.length() == length);
+
+ // Write log stream region to RADOS
+ // FIXME: get object size somewhere common to scan_events
+ uint32_t object_size = g_conf()->mds_log_segment_size;
+ if (object_size == 0) {
+ // Default layout object size
+ object_size = file_layout_t::get_default().object_size;
+ }
+
+ uint64_t write_offset = pos;
+ uint64_t obj_offset = (pos / object_size);
+ int r = 0;
+ while(log_data.length()) {
+ std::string const oid = js.obj_name(obj_offset);
+ uint32_t offset_in_obj = write_offset % object_size;
+ uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
+
+ r = output.write(oid, log_data, write_len, offset_in_obj);
+ if (r < 0) {
+ return r;
+ } else {
+ dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
+ r = 0;
+ }
+
+ log_data.splice(0, write_len);
+ write_offset += write_len;
+ obj_offset++;
+ }
+
+ return r;
+}
+
+/**
+ * Given an EMetaBlob::fullbit containing an inode, write out
+ * the encoded inode in the format used by InodeStore (i.e. the
+ * backing store format)
+ *
+ * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
+ * on an offline InodeStore instance. It's way simpler, because we are just
+ * uncritically hauling the data between structs.
+ *
+ * @param fb a fullbit extracted from a journal entry
+ * @param bare if true, leave out [EN|DE]CODE_START decoration
+ * @param out_bl output, write serialized inode to this bufferlist
+ */
+void JournalTool::encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl)
+{
+ ceph_assert(out_bl != NULL);
+
+ // Compose InodeStore
+ InodeStore new_inode;
+ new_inode.inode = fb.inode;
+ new_inode.xattrs = fb.xattrs;
+ new_inode.dirfragtree = fb.dirfragtree;
+ new_inode.snap_blob = fb.snapbl;
+ new_inode.symlink = fb.symlink;
+ new_inode.old_inodes = fb.old_inodes;
+
+ // Serialize InodeStore
+ if (bare) {
+ new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ } else {
+ new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ }
+}
+
+/**
+ * Given a list of inode numbers known to be in use by
+ * inodes in the backing store, ensure that none of these
+ * numbers are listed as free in the InoTables in the
+ * backing store.
+ *
+ * Used after injecting inodes into the backing store, to
+ * ensure that the same inode numbers are not subsequently
+ * used for new files during ordinary operation.
+ *
+ * @param inos list of inode numbers to be removed from
+ * free lists in InoTables
+ * @returns 0 on success, else negative error code
+ */
+int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
+{
+ int r = 0;
+
+ // InoTable is a per-MDS structure, so iterate over assigned ranks
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ std::set<mds_rank_t> in_ranks;
+ fs->mds_map.get_mds_set(in_ranks);
+
+ for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
+ rank_i != in_ranks.end(); ++rank_i)
+ {
+ // Compose object name
+ std::ostringstream oss;
+ oss << "mds" << *rank_i << "_inotable";
+ object_t inotable_oid = object_t(oss.str());
+
+ // Read object
+ bufferlist inotable_bl;
+ int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
+ if (read_r < 0) {
+ // Things are really bad if we can't read inotable. Beyond our powers.
+ derr << "unable to read inotable '" << inotable_oid.name << "': "
+ << cpp_strerror(read_r) << dendl;
+ r = r ? r : read_r;
+ continue;
+ }
+
+ // Deserialize InoTable
+ version_t inotable_ver;
+ auto q = inotable_bl.cbegin();
+ decode(inotable_ver, q);
+ InoTable ino_table(NULL);
+ ino_table.decode(q);
+
+ // Update InoTable in memory
+ bool inotable_modified = false;
+ for (std::set<inodeno_t>::iterator i = inos.begin();
+ i != inos.end(); ++i)
+ {
+ const inodeno_t ino = *i;
+ if (ino_table.force_consume(ino)) {
+ dout(4) << "Used ino 0x" << std::hex << ino << std::dec
+ << " requires inotable update" << dendl;
+ inotable_modified = true;
+ }
+ }
+
+ // Serialize and write InoTable
+ if (inotable_modified) {
+ inotable_ver += 1;
+ dout(4) << "writing modified inotable version " << inotable_ver << dendl;
+ bufferlist inotable_new_bl;
+ encode(inotable_ver, inotable_new_bl);
+ ino_table.encode_state(inotable_new_bl);
+ int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
+ if (write_r != 0) {
+ derr << "error writing modified inotable " << inotable_oid.name
+ << ": " << cpp_strerror(write_r) << dendl;
+ r = r ? r : read_r;
+ continue;
+ }
+ }
+ }
+
+ return r;
+}
+
diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h
new file mode 100644
index 000000000..8d610a866
--- /dev/null
+++ b/src/tools/cephfs/JournalTool.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+
+#include "JournalFilter.h"
+
+class JournalScanner;
+
+
+/**
+ * Command line tool for investigating and repairing filesystems
+ * with damaged metadata logs
+ */
+class JournalTool : public MDSUtility
+{
+ private:
+ MDSRoleSelector role_selector;
+ // Bit hacky, use this `rank` member to control behaviour of the
+ // various main_ functions.
+ mds_rank_t rank;
+ // when set, generate per rank dump file path
+ bool all_ranks = false;
+
+ std::string type;
+
+ // Entry points
+ int main_journal(std::vector<const char*> &argv);
+ int main_header(std::vector<const char*> &argv);
+ int main_event(std::vector<const char*> &argv);
+
+ // Shared functionality
+ int recover_journal();
+
+ // Journal operations
+ int journal_inspect();
+ int journal_export(std::string const &path, bool import, bool force);
+ int journal_reset(bool hard);
+
+ // Header operations
+ int header_set();
+
+ // I/O handles
+ librados::Rados rados;
+ librados::IoCtx input;
+ librados::IoCtx output;
+
+ bool other_pool;
+
+ // Metadata backing store manipulation
+ int read_lost_found(std::set<std::string> &lost);
+ int recover_dentries(
+ EMetaBlob const &metablob,
+ bool const dry_run,
+ std::set<inodeno_t> *consumed_inos);
+
+ // Splicing
+ int erase_region(JournalScanner const &jp, uint64_t const pos, uint64_t const length);
+
+ // Backing store helpers
+ void encode_fullbit_as_inode(
+ const EMetaBlob::fullbit &fb,
+ const bool bare,
+ bufferlist *out_bl);
+ int consume_inos(const std::set<inodeno_t> &inos);
+
+ //validate type
+ int validate_type(const std::string &type);
+
+ // generate output file path for dump/export
+ std::string gen_dump_file_path(const std::string &prefix);
+
+ // check if an operation (mode, command) is safe to be
+ // executed on all ranks.
+ bool can_execute_for_all_ranks(const std::string &mode,
+ const std::string &command);
+ public:
+ static void usage();
+ JournalTool() :
+ rank(0), other_pool(false) {}
+ int main(std::vector<const char*> &argv);
+};
+
diff --git a/src/tools/cephfs/MDSUtility.cc b/src/tools/cephfs/MDSUtility.cc
new file mode 100644
index 000000000..54386d219
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.cc
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "MDSUtility.h"
+#include "mon/MonClient.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+
+MDSUtility::MDSUtility() :
+ Dispatcher(g_ceph_context),
+ objecter(NULL),
+ finisher(g_ceph_context, "MDSUtility", "fn_mds_utility"),
+ waiting_for_mds_map(NULL),
+ inited(false)
+{
+ monc = new MonClient(g_ceph_context, poolctx);
+ messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
+ fsmap = new FSMap();
+ objecter = new Objecter(g_ceph_context, messenger, monc, poolctx);
+}
+
+
+MDSUtility::~MDSUtility()
+{
+ if (inited) {
+ shutdown();
+ }
+ delete objecter;
+ delete monc;
+ delete messenger;
+ delete fsmap;
+ ceph_assert(waiting_for_mds_map == NULL);
+}
+
+
+int MDSUtility::init()
+{
+ // Initialize Messenger
+ poolctx.start(1);
+ messenger->start();
+
+ objecter->set_client_incarnation(0);
+ objecter->init();
+
+ // Connect dispatchers before starting objecter
+ messenger->add_dispatcher_tail(objecter);
+ messenger->add_dispatcher_tail(this);
+
+ // Initialize MonClient
+ if (monc->build_initial_monmap() < 0) {
+ objecter->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ return -1;
+ }
+
+ monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD|CEPH_ENTITY_TYPE_MDS);
+ monc->set_messenger(messenger);
+ monc->init();
+ int r = monc->authenticate();
+ if (r < 0) {
+ derr << "Authentication failed, did you specify an MDS ID with a valid keyring?" << dendl;
+ monc->shutdown();
+ objecter->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ return r;
+ }
+
+ client_t whoami = monc->get_global_id();
+ messenger->set_myname(entity_name_t::CLIENT(whoami.v));
+
+ // Start Objecter and wait for OSD map
+ objecter->start();
+ objecter->wait_for_osd_map();
+
+ // Prepare to receive MDS map and request it
+ ceph::mutex init_lock = ceph::make_mutex("MDSUtility:init");
+ ceph::condition_variable cond;
+ bool done = false;
+ ceph_assert(!fsmap->get_epoch());
+ lock.lock();
+ waiting_for_mds_map = new C_SafeCond(init_lock, cond, &done, NULL);
+ lock.unlock();
+ monc->sub_want("fsmap", 0, CEPH_SUBSCRIBE_ONETIME);
+ monc->renew_subs();
+
+ // Wait for MDS map
+ dout(4) << "waiting for MDS map..." << dendl;
+ {
+ std::unique_lock locker{init_lock};
+ cond.wait(locker, [&done] { return done; });
+ }
+ dout(4) << "Got MDS map " << fsmap->get_epoch() << dendl;
+
+ finisher.start();
+
+ inited = true;
+ return 0;
+}
+
+
+void MDSUtility::shutdown()
+{
+ finisher.stop();
+
+ lock.lock();
+ objecter->shutdown();
+ lock.unlock();
+ monc->shutdown();
+ messenger->shutdown();
+ messenger->wait();
+ poolctx.finish();
+}
+
+
+bool MDSUtility::ms_dispatch(Message *m)
+{
+ std::lock_guard locker{lock};
+ switch (m->get_type()) {
+ case CEPH_MSG_FS_MAP:
+ handle_fs_map((MFSMap*)m);
+ break;
+ case CEPH_MSG_OSD_MAP:
+ break;
+ default:
+ return false;
+ }
+ m->put();
+ return true;
+}
+
+
+void MDSUtility::handle_fs_map(MFSMap* m)
+{
+ *fsmap = m->get_fsmap();
+ if (waiting_for_mds_map) {
+ waiting_for_mds_map->complete(0);
+ waiting_for_mds_map = NULL;
+ }
+}
+
+
diff --git a/src/tools/cephfs/MDSUtility.h b/src/tools/cephfs/MDSUtility.h
new file mode 100644
index 000000000..09f1918ba
--- /dev/null
+++ b/src/tools/cephfs/MDSUtility.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef MDS_UTILITY_H_
+#define MDS_UTILITY_H_
+
+#include "osdc/Objecter.h"
+#include "mds/FSMap.h"
+#include "messages/MFSMap.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "auth/Auth.h"
+#include "common/async/context_pool.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+
+/// MDS Utility
+/**
+ * This class is the parent for MDS utilities, i.e. classes that
+ * need access the objects belonging to the MDS without actually
+ * acting as an MDS daemon themselves.
+ */
+class MDSUtility : public Dispatcher {
+protected:
+ Objecter *objecter;
+ FSMap *fsmap;
+ Messenger *messenger;
+ MonClient *monc;
+
+ ceph::mutex lock = ceph::make_mutex("MDSUtility::lock");
+ Finisher finisher;
+ ceph::async::io_context_pool poolctx;
+
+ Context *waiting_for_mds_map;
+
+ bool inited;
+public:
+ MDSUtility();
+ ~MDSUtility() override;
+
+ void handle_fs_map(MFSMap* m);
+ bool ms_dispatch(Message *m) override;
+ bool ms_handle_reset(Connection *con) override { return false; }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override { return false; }
+ int init();
+ void shutdown();
+};
+
+#endif /* MDS_UTILITY_H_ */
diff --git a/src/tools/cephfs/MetaTool.cc b/src/tools/cephfs/MetaTool.cc
new file mode 100644
index 000000000..527ae636f
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.cc
@@ -0,0 +1,999 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <string.h>
+#include <map>
+#include <sstream>
+#include <fstream>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/InoTable.h"
+#include "mds/CDentry.h"
+
+#include "mds/events/ENoOp.h"
+#include "mds/events/EUpdate.h"
+
+#include "mds/JournalPointer.h"
+// #include "JournalScanner.h"
+// #include "EventOutput.h"
+// #include "Dumper.h"
+// #include "Resetter.h"
+
+// #include "JournalTool.h"
+#include "MetaTool.h"
+#include "type_helper.hpp"
+#include "include/object.h"
+
+WRITE_RAW_ENCODER(char)
+WRITE_RAW_ENCODER(unsigned char)
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+
+void MetaTool::meta_op::release()
+{
+ for (const auto& i : inodes) {
+ delete i.second;
+ }
+
+ while (!sub_ops.empty()) {
+ delete sub_ops.top();
+ sub_ops.pop();
+ }
+}
+
+void MetaTool::inode_meta_t::decode_json(JSONObj *obj)
+{
+ unsigned long long tmp;
+ JSONDecoder::decode_json("snapid_t", tmp, obj, true);
+ _f.val = tmp;
+ JSONDecoder::decode_json("itype", tmp, obj, true);
+ _t = tmp;
+ if (NULL == _i)
+ _i = new InodeStore;
+ JSONDecoder::decode_json("store", *_i, obj, true);
+}
+
+void MetaTool::usage()
+{
+ generic_client_usage();
+}
+
+int MetaTool::main(string& mode,
+ string& rank_str,
+ string& minfo,
+ string&ino,
+ string& out,
+ string& in,
+ bool confirm
+ )
+{
+ int r = 0;
+
+ std::string manual_meta_pool;
+ std::string manual_data_pool;
+ std::string manual_rank_num;
+ bool manual_mode = false;
+ if (minfo != "") {
+ vector<string> v;
+ string_split(minfo, v);
+ manual_meta_pool = v.size() >= 1 ? v[0] : "";
+ manual_data_pool = v.size() >= 2 ? v[1] : "";
+ manual_rank_num = v.size() >= 3 ? v[2] : "";
+ std::cout << "("<< minfo<< ")=>"
+ << " mpool: " << manual_meta_pool
+ << " dpool: " << manual_data_pool
+ << " rank: " << manual_rank_num
+ << std::endl;
+ if (!manual_meta_pool.empty() && !manual_data_pool.empty() && !manual_rank_num.empty()) {
+ std::cout << "you specify rank: " << manual_rank_num
+ << " mpool: " << manual_meta_pool
+ << " dpool: " << manual_data_pool
+ << "\nstart manual mode!!"<< std::endl;
+ manual_mode = true;
+ }
+ }
+
+ // RADOS init
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ cerr << "RADOS unavailable" << std::endl;
+ return r;
+ }
+
+ if (_debug)
+ cout << "MetaTool: connecting to RADOS..." << std::endl;
+ r = rados.connect();
+ if (r < 0) {
+ cerr << "couldn't connect to cluster: " << cpp_strerror(r) << std::endl;
+ return r;
+ }
+
+ if (!manual_mode) {
+ r = role_selector.parse(*fsmap, rank_str);
+ if (r != 0) {
+ cerr << "Couldn't determine MDS rank." << std::endl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ assert(fs != nullptr);
+
+ // prepare io for meta pool
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ features = fs->mds_map.get_up_features();
+ if (features == 0)
+ features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ else if (features != CEPH_FEATURES_SUPPORTED_DEFAULT) {
+ cout << "I think we need to check the feature! : " << features << std::endl;
+ return -1;
+ }
+
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl;
+ return r;
+ }
+
+ if (_debug)
+ cout << "MetaTool: creating IoCtx.." << std::endl;
+ r = rados.ioctx_create(pool_name.c_str(), io_meta);
+ assert(r == 0);
+ output.dup(io_meta);
+
+ // prepare io for data pool
+ for (const auto p : fs->mds_map.get_data_pools()) {
+ r = rados.pool_reverse_lookup(p, &pool_name);
+ if (r < 0) {
+ cerr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << std::endl;
+ return r;
+ }
+ librados::IoCtx* io_data = new librados::IoCtx;
+ r = rados.ioctx_create(pool_name.c_str(), *io_data);
+ assert(r == 0);
+ io_data_v.push_back(io_data);
+ }
+
+ for (auto role : role_selector.get_roles()) {
+ rank = role.rank;
+
+ r = process(mode, ino, out, in, confirm);
+ cout << "executing for rank " << rank << " op[" <<mode<< "] ret : " << r << std::endl;
+ }
+
+ } else {
+ features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ r = rados.ioctx_create(manual_meta_pool.c_str(), io_meta);
+ assert(r == 0);
+
+ librados::IoCtx* io_data = new librados::IoCtx;
+ r = rados.ioctx_create(manual_data_pool.c_str(), *io_data);
+ assert(r == 0);
+ io_data_v.push_back(io_data);
+
+
+ rank = conv_t<int>(manual_rank_num);
+ r = process(mode, ino, out, in, confirm);
+ cout << "op[" << mode << "] ret : " << r << std::endl;
+ }
+ return r;
+}
+
+int MetaTool::process(string& mode, string& ino, string out, string in, bool confirm)
+{
+ if (mode == "showm") {
+ return show_meta_info(ino, out);
+ } else if (mode == "showfn") {
+ return show_fnode(ino, out);
+ } else if (mode == "listc") {
+ return list_meta_info(ino, out);
+ } else if (mode == "amend") {
+ return amend_meta_info(ino, in, confirm);
+ } else if (mode == "amendfn") {
+ return amend_fnode(in, confirm);
+ } else {
+ cerr << "bad command '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+}
+int MetaTool::show_fnode(string& ino, string& out)
+{
+ if (ino != "0") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, out);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_SHOW_FN;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+int MetaTool::amend_fnode(string& in, bool confirm)
+{
+ meta_op op(_debug, "", in, confirm);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_AMEND_FN;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = 0;
+ op.push_op(nsop);
+ return op_process(op);
+}
+int MetaTool::amend_meta_info(string& ino, string& in, bool confirm)
+{
+ if (ino != "0" && in != "") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, "", in, confirm);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_AMEND;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+int MetaTool::list_meta_info(string& ino, string& out)
+{
+ if (ino != "0") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, out);
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+int MetaTool::show_meta_info(string& ino, string& out)
+{
+ if (ino != "0") {
+ inodeno_t i_ino = std::stoull(ino.c_str(), nullptr, 0);
+ meta_op op(_debug, out);
+
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_SHOW;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->ino = i_ino;
+ op.push_op(nsop);
+ return op_process(op);
+ } else {
+ cerr << "parameter error? : ino = " << ino << std::endl;
+ }
+ return 0;
+}
+
+int MetaTool::op_process(meta_op& op)
+{
+ int r = 0;
+ while (!op.no_sops()) {
+ if (_debug)
+ std::cout << "process : " << op.top_op()->detail() << std::endl;
+ switch(op.top_op()->sub_op_t) {
+ case meta_op::OP_LIST:
+ r = list_meta(op);
+ break;
+ case meta_op::OP_LTRACE:
+ r = file_meta(op);
+ break;
+ case meta_op::OP_SHOW:
+ r = show_meta(op);
+ break;
+ case meta_op::OP_AMEND:
+ r = amend_meta(op);
+ break;
+ case meta_op::OP_SHOW_FN:
+ r = show_fn(op);
+ break;
+ case meta_op::OP_AMEND_FN:
+ r = amend_fn(op);
+ break;
+ default:
+ cerr << "unknow op" << std::endl;
+ }
+ if (r == 0)
+ op.pop_op();
+ else if (r < 0)
+ op.clear_sops();
+ }
+ op.release();
+ return r;
+}
+
+int MetaTool::amend_meta(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+ auto item = op.inodes.find(sop->ino);
+ auto item_k = op.okeys.find(sop->ino);
+ if (item != op.inodes.end() && item_k != op.okeys.end()) {
+ if (_amend_meta(item_k->second, *(item->second), op.infile(), op) < 0)
+ return -1;
+ } else {
+ if (op.inodes.empty()) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = 0;
+ nsop->ino_c = sop->ino;
+ op.push_op(nsop);
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+void MetaTool::inode_meta_t::encode(::ceph::bufferlist& bl, uint64_t features)
+{
+ ::encode(_f, bl);
+ ::encode(_t, bl);
+ _i->encode_bare(bl, features);
+}
+int MetaTool::_amend_meta(string& k, inode_meta_t& inode_meta, const string& fn, meta_op& op)
+{
+ JSONParser parser;
+ if (!parser.parse(fn.c_str())) {
+ cout << "Error parsing create user response" << std::endl;
+ return -1;
+ }
+
+ try {
+ inode_meta.decode_json(&parser);
+ } catch (JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ return -1;
+ }
+
+ if (!op.confirm_chg() || op.is_debug()) {
+ cout << "you will amend info of inode ==>: " << std::endl;
+ _show_meta(inode_meta, "");
+ }
+
+ if (!op.confirm_chg()) {
+ cout << "warning: this operation is irreversibl!!!\n"
+ << " You must confirm that all logs of mds have been flushed!!!\n"
+ << " if you want amend it, please add --yes-i-really-really-mean-it!!!"
+ << std::endl;
+ return -1;
+ }
+
+ bufferlist bl;
+ inode_meta.encode(bl, features);
+ map<string, bufferlist> to_set;
+ to_set[k].swap(bl);
+ inode_backpointer_t bp;
+ if (!op.top_op()->get_ancestor(bp))
+ return -1;
+ frag_t frag;
+ auto item = op.inodes.find(bp.dirino);
+ if (item != op.inodes.end()) {
+ frag = item->second->get_meta()->pick_dirfrag(bp.dname);
+ }
+ string oid = obj_name(bp.dirino, frag);
+ int ret = io_meta.omap_set(oid, to_set);
+ to_set.clear();
+ return ret;
+}
+int MetaTool::show_fn(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+ auto item = op.inodes.find(sop->ino);
+ if (item != op.inodes.end()) {
+ if (_show_fn(*(item->second), op.outfile()) < 0)
+ return -1;
+ } else {
+ if (op.inodes.empty()) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = 0;
+ nsop->ino_c = sop->ino;
+ op.push_op(nsop);
+ return 1;
+ } else
+ return -1;
+ }
+ return 0;
+}
+int MetaTool::_show_fn(inode_meta_t& inode_meta, const string& fn)
+{
+ std::list<frag_t> frags;
+ inode_meta.get_meta()->dirfragtree.get_leaves(frags);
+ std::stringstream ds;
+ std::string format = "json";
+ std::string oids;
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->open_object_section("fnodes");
+ for (const auto &frag : frags) {
+ bufferlist hbl;
+ string oid = obj_name(inode_meta.get_meta()->inode->ino, frag);
+ int ret = io_meta.omap_get_header(oid, &hbl);
+ if (ret < 0) {
+ std::cerr << __func__ << " : can't find oid("<< oid << ")" << std::endl;
+ return -1;
+ }
+ {
+ fnode_t got_fnode;
+ try {
+ auto p = hbl.cbegin();
+ ::decode(got_fnode, p);
+ } catch (const buffer::error &err) {
+ cerr << "corrupt fnode header in " << oid
+ << ": " << err.what() << std::endl;
+ return -1;
+ }
+ if (!oids.empty())
+ oids += ",";
+ oids += oid;
+ f->open_object_section(oid.c_str());
+ got_fnode.dump(f);
+ f->close_section();
+ }
+ }
+ f->dump_string("oids", oids.c_str());
+ f->close_section();
+ f->flush(ds);
+ if (fn != "") {
+ ofstream o;
+ o.open(fn);
+ if (o) {
+ o << ds.str();
+ o.close();
+ } else {
+ cout << "out to file (" << fn << ") failed" << std::endl;
+ cout << ds.str() << std::endl;
+ }
+ } else
+ std::cout << ds.str() << std::endl;
+ return 0;
+}
+int MetaTool::amend_fn(meta_op &op)
+{
+ if (_amend_fn(op.infile(), op.confirm_chg()) < 0)
+ return -1;
+ return 0;
+}
+int MetaTool::_amend_fn(const string& fn, bool confirm)
+{
+ JSONParser parser;
+ if (!parser.parse(fn.c_str())) {
+ cout << "Error parsing create user response : " << fn << std::endl;
+ return -1;
+ }
+ if (!confirm) {
+ cout << "warning: this operation is irreversibl!!!\n"
+ << " You must confirm that all logs of mds have been flushed!!!\n"
+ << " if you want amend it, please add --yes-i-really-really-mean-it!!!"
+ << std::endl;
+ return -1;
+ }
+ try {
+ string tmp;
+ JSONDecoder::decode_json("oids", tmp, &parser, true);
+ string::size_type pos1, pos2;
+ vector<string> v;
+ string c = ",";
+ pos2 = tmp.find(c);
+ pos1 = 0;
+ while (string::npos != pos2) {
+ v.push_back(tmp.substr(pos1, pos2-pos1));
+ pos1 = pos2 + c.size();
+ pos2 = tmp.find(c, pos1);
+ }
+ if (pos1 != tmp.length())
+ v.push_back(tmp.substr(pos1));
+ int ret = 0;
+ for (auto i : v) {
+ cout << "amend frag : " << i << "..." << std::endl;
+ fnode_t fnode;
+ JSONDecoder::decode_json(i.c_str(), fnode, &parser, true);
+ bufferlist bl;
+ fnode.encode(bl);
+ ret = io_meta.omap_set_header(i, bl);
+ if (ret < 0)
+ return ret;
+ }
+ } catch (JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ return -1;
+ }
+ return 0;
+}
+int MetaTool::show_meta(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+ auto item = op.inodes.find(sop->ino);
+ if (item != op.inodes.end()) {
+ if (_show_meta(*(item->second), op.outfile()) < 0)
+ return -1;
+ } else {
+ if (op.inodes.empty()) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = 0;
+ nsop->ino_c = sop->ino;
+ op.push_op(nsop);
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ return 0;
+}
+int MetaTool::_show_meta(inode_meta_t& inode_meta, const string& fn)
+{
+ std::stringstream ds;
+ std::string format = "json";
+ InodeStore& inode_data = *inode_meta.get_meta();
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->open_object_section("meta");
+ f->dump_unsigned("snapid_t", inode_meta.get_snapid());
+ f->dump_unsigned("itype", inode_meta.get_type());
+ f->open_object_section("store");
+ inode_data.dump(f);
+ try {
+ if (inode_data.snap_blob.length()) {
+ sr_t srnode;
+ auto p = inode_data.snap_blob.cbegin();
+ decode(srnode, p);
+ f->open_object_section("snap_blob");
+ srnode.dump(f);
+ f->close_section();
+ }
+ } catch (const buffer::error &err) {
+ cerr << "corrupt decode in snap_blob"
+ << ": " << err.what() << std::endl;
+ return -1;
+ }
+
+ f->close_section();
+ f->close_section();
+ f->flush(ds);
+
+ if (fn != "") {
+ ofstream o;
+ o.open(fn);
+ if (o) {
+ o << ds.str();
+ o.close();
+ } else {
+ cout << "out to file (" << fn << ") failed" << std::endl;
+ cout << ds.str() << std::endl;
+ }
+
+ } else
+ std::cout << ds.str() << std::endl;
+ return 0;
+}
+int MetaTool::list_meta(meta_op &op)
+{
+ meta_op::sub_op* sop = op.top_op();
+
+ bool list_all = false;
+ string oid;
+ inodeno_t ino = sop->ino_c;
+ frag_t frag = sop->frag;
+
+ if (sop->ino_c == 0) {
+ list_all = true;
+ oid = obj_name(sop->ino, frag);
+ } else {
+ if (_debug)
+ std::cout << __func__ << " : " << sop->trace_level << " " << op.ancestors.size() << std::endl;
+ inode_backpointer_t bp;
+ if (sop->get_c_ancestor(bp)) {
+ auto item = op.inodes.find(bp.dirino);
+ if (item != op.inodes.end()) {
+ frag = item->second->get_meta()->pick_dirfrag(bp.dname);
+ }
+ oid = obj_name(bp.dirino, frag);
+ } else {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino = sop->ino_c;
+ nsop->sub_op_t = meta_op::OP_LTRACE;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ op.push_op(nsop);
+ return 1;
+ }
+ }
+ if (_debug)
+ std::cout << __func__ << " : " << string(list_all?"listall ":"info ") << oid << " "<< ino << std::endl;
+ bufferlist hbl;
+ int ret = io_meta.omap_get_header(oid, &hbl);
+ if (ret < 0) {
+ std::cerr << __func__ << " : can't find it, maybe it (ino:"<< sop->ino<< ")isn't a normal dir!" << std::endl;
+ return -1;
+ }
+
+ if (hbl.length() == 0) { // obj has splite
+ if (list_all) {
+ if (frag == frag_t()) {
+ auto item = op.inodes.find(sop->ino);
+ if (item != op.inodes.end()) {
+ inodeno_t tmp = sop->ino;
+ op.pop_op();
+ std::list<frag_t> frags;
+ item->second->get_meta()->dirfragtree.get_leaves(frags);
+ for (const auto &frag : frags) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino = tmp;
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->frag = frag;
+ op.push_op(nsop);
+ }
+ } else {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino_c = sop->ino;
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ op.push_op(nsop);
+ }
+ return 1;
+ } else {
+ cerr << __func__ << " missing some data (" << oid << ")???" << std::endl;
+ return -1;
+ }
+ } else {
+ if (frag == frag_t()) {
+ inode_backpointer_t bp;
+ if (sop->get_c_ancestor(bp)) {
+ meta_op::sub_op* nsop = new meta_op::sub_op(&op);
+ nsop->ino_c = bp.dirino;
+ nsop->sub_op_t = meta_op::OP_LIST;
+ nsop->sub_ino_t = meta_op::INO_DIR;
+ nsop->trace_level = sop->trace_level + 1;
+ op.push_op(nsop);
+ return 1;
+ } else {
+ cerr << __func__ << "can't find obj(" << oid << ") ,miss ancestors or miss some objs??? " << std::endl;
+ return -1;
+ }
+ } else {
+ cerr << __func__ << "missing some objs(" << oid << ")??? " << std::endl;
+ return -1;
+ }
+ }
+ }
+
+ fnode_t got_fnode;
+ try {
+ auto p = hbl.cbegin();
+ ::decode(got_fnode, p);
+ } catch (const buffer::error &err) {
+ cerr << "corrupt fnode header in " << oid
+ << ": " << err.what() << std::endl;
+ return -1;
+ }
+
+ if (_debug) {
+ std::string format = "json";
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->dump_string("type", "--fnode--");
+ f->open_object_section("fnode");
+ got_fnode.dump(f);
+ f->close_section();
+ f->flush(std::cout);
+ std::cout << std::endl;
+ }
+
+ // print children
+ std::map<string, bufferlist> out_vals;
+ int max_vals = 5;
+ io_meta.omap_get_vals(oid, "", max_vals, &out_vals);
+
+ bool force_dirty = false;
+ const set<snapid_t> *snaps = NULL;
+ unsigned pos = out_vals.size() - 1;
+ std::string last_dname;
+ for (map<string, bufferlist>::iterator p = out_vals.begin();
+ p != out_vals.end();
+ ++p, --pos) {
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p->first, dname, last);
+ if (_debug)
+ last_dname = dname;
+ try {
+ if (!list_all) {
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty, ino, &op) == 1) {
+ return 0;
+ }
+ } else {
+ cout << "dname : " << dname << " " << last << std::endl;
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty) == 1)
+ return 0;
+ }
+ } catch (const buffer::error &err) {
+ derr << "Corrupt dentry '" << dname << "' : "
+ << err.what() << "(" << "" << ")" << dendl;
+ return -1;
+ }
+ }
+ while (out_vals.size() == (size_t)max_vals) {
+ out_vals.clear();
+ io_meta.omap_get_vals(oid, last_dname, max_vals, &out_vals);
+ pos = out_vals.size() - 1;
+ for (map<string, bufferlist>::iterator p = (++out_vals.begin());
+ p != out_vals.end();
+ ++p, --pos) {
+ string dname;
+ snapid_t last;
+ dentry_key_t::decode_helper(p->first, dname, last);
+ last_dname = dname;
+ try {
+ if (!list_all) {
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty, ino, &op) == 1) {
+ return 0;
+ }
+ } else {
+ cout << "dname : " << dname << " " << last << std::endl;
+ if (show_child(p->first, dname, last, p->second, pos, snaps,
+ &force_dirty) == 1)
+ return 0;
+ }
+ } catch (const buffer::error &err) {
+ derr << "Corrupt dentry '" << dname << "' : "
+ << err.what() << "(" << "" << ")" << dendl;
+ return -1;
+ }
+ }
+ }
+
+ if (!list_all) {
+ cerr << __func__ << "miss obj(ino:" << ino << ")??? " << std::endl;
+ return -1;
+ }
+ return 0;
+}
+
+int MetaTool::file_meta(meta_op &op)
+{
+ int r = 0;
+ if (op.top_op()->sub_ino_t == meta_op::INO_DIR) {
+ r = _file_meta(op, io_meta);
+ } else if (op.top_op()->sub_ino_t == meta_op::INO_F) {
+ for (auto i = io_data_v.begin(); i != io_data_v.end(); ++i)
+ if ((r = _file_meta(op, **i)) == 1)
+ break;
+ }
+ if (r == 1) {
+ inode_backpointer_t bp;
+ if (op.top_op()->get_ancestor(bp)) {
+ return 0;
+ } else {
+ std::cerr << "no trace for obj (ino:" << op.top_op()->ino <<")??" << std::endl;
+ return -1;
+ }
+ } else if (op.top_op()->sub_ino_t == meta_op::INO_DIR) {
+ std::cerr << "\tmaybe it's a file(ino:" << op.top_op()->ino << ")" << std::endl;
+ op.top_op()->sub_ino_t = meta_op::INO_F;
+ return 1;
+ }
+
+ std::cerr << "can't get (ino:" << op.top_op()->ino <<")trace??" << std::endl;
+ return -1;
+}
+
+int MetaTool::_file_meta(meta_op &op, librados::IoCtx& io)
+{
+ inodeno_t ino = op.top_op()->ino;
+ std::string oid = obj_name(ino);
+ bufferlist pointer_bl;
+ std::map<std::string, bufferlist> attrset;
+ int r = 0;
+ bool have_data = false;
+ r = io.getxattrs (oid.c_str(), attrset);
+ if (0 == r) {
+ std::stringstream ds;
+ std::string format = "json";
+ Formatter* f = Formatter::create(format);
+ auto item = attrset.find("parent");
+ if (item != attrset.end()) {
+ inode_backtrace_t i_bt;
+ try {
+ bufferlist::const_iterator q = item->second.cbegin();
+ i_bt.decode(q);
+ f->open_array_section("info");
+ have_data = true;
+ if (i_bt.ancestors.size() > 0)
+ op.ancestors[ino] = i_bt.ancestors[0];
+ f->dump_string("type", "--i_bt--");
+ f->open_object_section("parent");
+ i_bt.dump(f);
+ f->close_section();
+ } catch (buffer::error &e) {
+ cerr << "failed to decode parent of " << oid << std::endl;
+ return -1;
+ }
+ } else {
+ cerr << oid << " in " << io.get_pool_name() << " , but no parent" << std::endl;
+ return -1;
+ }
+
+ item = attrset.find("layout");
+ if (item != attrset.end()) {
+ file_layout_t layout;
+ try {
+ auto q = item->second.cbegin();
+ layout.decode(q);
+ f->dump_string("type", "--layout--");
+ f->open_object_section("layout");
+ layout.dump(f);
+ f->close_section();
+
+ } catch (buffer::error &e) {
+ cerr << "failed to decode layout of " << oid << std::endl;
+ return -1;
+ }
+ } else {
+ cerr << oid << " in " << io.get_pool_name() << " , but no layout" << std::endl;
+ }
+ if (have_data) {
+ f->close_section();
+ f->flush(ds);
+ if (_debug)
+ cout << ino << " : "<< ds.str() << std::endl;
+ return 1;
+ }
+ }
+ return 0;
+}
+std::string MetaTool::obj_name(inodeno_t ino, uint64_t offset, const char *suffix) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)offset, suffix ? suffix : "");
+ return std::string(name);
+}
+std::string MetaTool::obj_name(inodeno_t ino, frag_t fg, const char *suffix) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%llx.%08llx%s", (long long unsigned)ino, (long long unsigned)fg, suffix ? suffix : "");
+ return std::string(name);
+}
+
+std::string MetaTool::obj_name(const char* ino, uint64_t offset, const char *suffix) const
+{
+ char name[60];
+ snprintf(name, sizeof(name), "%s.%08llx%s", ino, (long long unsigned)offset, suffix ? suffix : "");
+ std::string out = name;
+ transform(out.begin(), out.end(), out.begin(),::tolower);
+ return out;
+}
+
+int MetaTool::show_child(std::string_view key,
+ std::string_view dname,
+ const snapid_t last,
+ bufferlist &bl,
+ const int pos,
+ const std::set<snapid_t> *snaps,
+ bool *force_dirty,
+ inodeno_t sp_ino,
+ meta_op* op)
+{
+ bufferlist::const_iterator q = bl.cbegin();
+
+ snapid_t first;
+ ::decode(first, q);
+
+ // marker
+ char type;
+ ::decode(type, q);
+
+ if (_debug)
+ std::cout << pos << " type '" << type << "' dname '" << dname
+ << " [" << first << "," << last << "]"
+ << std::endl;
+ // bool stale = false;
+ if (snaps && last != CEPH_NOSNAP) {
+ derr << "!!!! erro !!!!" << dendl;
+ return -1;
+ }
+
+ // CDentry *dn = NULL;
+ // look for existing dentry for _last_ snap, can't process snap of obj
+ //if *(stale)
+ // dn = lookup_exact_snap(dname, last);
+ //else
+ // dn = lookup(dname, last);
+ if (type == 'L' || type == 'l') {
+ // hard link
+ inodeno_t ino;
+ unsigned char d_type;
+ mempool::mds_co::string alternate_name;
+
+ CDentry::decode_remote(type, ino, d_type, alternate_name, q);
+
+ if (sp_ino > 0) {
+ if (sp_ino == ino) {
+ std::cout << "find hard link : " << ino << "," << d_type << std::endl;
+ return 1;
+ }
+ }
+
+ std::cout << "hard link : " << ino << "," << d_type << std::endl;
+ } else if (type == 'I' || type == 'i') {
+ // inode
+ // load inode data before lookuping up or constructing CInode
+ InodeStore& inode_data = *(new InodeStore);
+ if (type == 'i') {
+ mempool::mds_co::string alternate_name;
+
+ DECODE_START(2, q);
+ if (struct_v >= 2)
+ decode(alternate_name, q);
+ inode_data.decode(q);
+ DECODE_FINISH(q);
+ } else {
+ inode_data.decode_bare(q);
+ }
+
+ std::stringstream ds;
+ std::string format = "json";
+ Formatter* f = Formatter::create(format);
+ f->enable_line_break();
+ f->open_object_section("meta");
+ f->dump_unsigned("snapid_t", first);
+ f->dump_unsigned("itype", type);
+ f->open_object_section("store");
+ inode_data.dump(f);
+ try {
+ if (inode_data.snap_blob.length()) {
+ sr_t srnode;
+ auto p = inode_data.snap_blob.cbegin();
+ srnode.decode(p);
+ f->open_object_section("snap_blob");
+ srnode.dump(f);
+ f->close_section();
+ }
+ } catch (const buffer::error &err) {
+ cerr << "corrupt decode in snap_blob"
+ << ": " << err.what() << std::endl;
+ }
+ f->close_section();
+ f->close_section();
+ f->flush(ds);
+
+ if (sp_ino > 0 && op != NULL && sp_ino == inode_data.inode->ino) {
+ inode_meta_t* tmp = new inode_meta_t(first, type, &inode_data);
+ op->inodes[inode_data.inode->ino] = tmp;
+ op->okeys[inode_data.inode->ino] = key.data();
+ return 1;
+ } else {
+ delete &inode_data;
+ }
+
+ if (sp_ino == 0) {
+ cout << ds.str() << std::endl;
+ }
+ } else {
+ std::cerr << __func__ << "unknow type : " << dname << "," << type << std::endl;
+ }
+ return 0;
+}
diff --git a/src/tools/cephfs/MetaTool.h b/src/tools/cephfs/MetaTool.h
new file mode 100644
index 000000000..510be6552
--- /dev/null
+++ b/src/tools/cephfs/MetaTool.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef METATOOL_H__
+#define METATOOL_H__
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+#include <vector>
+#include <stack>
+using std::stack;
+#include "mds/mdstypes.h"
+#include "mds/LogEvent.h"
+#include "mds/events/EMetaBlob.h"
+
+#include "include/rados/librados.hpp"
+#include "common/ceph_json.h"
+
+using ::ceph::bufferlist;
+class MetaTool : public MDSUtility
+{
+public:
+ class inode_meta_t {
+ public:
+ inode_meta_t(snapid_t f = CEPH_NOSNAP, char t = char(255), InodeStore* i = NULL):
+ _f(f),_t(t),_i(i) {
+ };
+ snapid_t get_snapid() const {
+ return _f;
+ }
+ InodeStore* get_meta() const {
+ if (_t == 'I')
+ return _i;
+ else
+ return NULL;
+ }
+ int get_type() const {
+ return _t;
+ }
+ void decode_json(JSONObj *obj);
+ void encode(::ceph::bufferlist& bl, uint64_t features);
+ private:
+ snapid_t _f;
+ char _t;
+ InodeStore* _i;
+ };
+private:
+ class meta_op {
+ public:
+ meta_op(bool debug = false, string out = "", string in = "", bool confirm = false):
+ _debug(debug),
+ _out(out),
+ _in(in),
+ _confirm(confirm)
+ {}
+ void release();
+ typedef enum {
+ OP_LIST = 0,
+ OP_LTRACE,
+ OP_SHOW,
+ OP_AMEND,
+ OP_SHOW_FN,
+ OP_AMEND_FN,
+ OP_NO
+ } op_type;
+
+ typedef enum {
+ INO_DIR = 0,
+ INO_F
+ } ino_type;
+
+ static string op_type_name(op_type& t) {
+ string name;
+ switch (t) {
+ case OP_LIST:
+ name = "list dir";
+ break;
+ case OP_LTRACE:
+ name = "load trace";
+ break;
+ case OP_SHOW:
+ name = "show info";
+ break;
+ case OP_AMEND:
+ name = "amend info";
+ break;
+ case OP_SHOW_FN:
+ name = "show fnode";
+ break;
+ case OP_AMEND_FN:
+ name = "amend fnode";
+ break;
+ case OP_NO:
+ name = "noop";
+ break;
+ default:
+ name = "unknow op type";
+ }
+ return name;
+ }
+ static string ino_type_name(ino_type& t) {
+ string name;
+ switch (t) {
+ case INO_DIR:
+ name = "dir";
+ break;
+ case INO_F:
+ name = "file";
+ break;
+ default:
+ name = "unknow file type";
+ }
+ return name;
+ }
+ class sub_op {
+ public:
+ sub_op(meta_op* mop):
+ trace_level(0),
+ _proc(false),
+ _mop(mop)
+ {}
+ void print() {
+ std::cout << detail() << std::endl;
+ }
+ string detail() {
+ std::stringstream ds;
+ ds << " [sub_op]" << op_type_name(sub_op_t) << "|"
+ << ino_type_name(sub_ino_t) << "|"
+ << ino << "|"
+ << frag << "|"
+ << ino_c << "|"
+ << trace_level << "|"
+ << name;
+ return ds.str();
+ }
+ bool get_c_ancestor(inode_backpointer_t& bp) {
+ if (!_mop || !ino_c)
+ return false;
+ auto item = _mop->ancestors.find(ino_c);
+ if (item != _mop->ancestors.end()) {
+ bp = item->second;
+ return true;
+ } else
+ return false;
+ }
+ bool get_ancestor(inode_backpointer_t& bp) {
+ if (!_mop || !ino)
+ return false;
+ auto item = _mop->ancestors.find(ino);
+ if (item != _mop->ancestors.end()) {
+ bp = item->second;
+ return true;
+ } else
+ return false;
+ }
+ op_type sub_op_t;
+ ino_type sub_ino_t;
+ inodeno_t ino;
+ frag_t frag;
+ inodeno_t ino_c;
+ unsigned trace_level;
+ std::string name;
+ bool _proc;
+ meta_op* _mop;
+ };
+
+ std::map<inodeno_t, inode_backpointer_t > ancestors;
+ std::map<inodeno_t, inode_meta_t* > inodes;
+ std::map<inodeno_t, string > okeys;
+
+ void clear_sops() {
+ while(!no_sops())
+ pop_op();
+ }
+ bool no_sops() {
+ return sub_ops.empty();
+ }
+ void push_op(sub_op* sop) {
+ if (_debug)
+ std::cout << "<<====" << sop->detail() << std::endl;
+ sub_ops.push(sop);
+ }
+ sub_op* top_op() {
+ return sub_ops.top();
+ }
+ void pop_op() {
+ sub_op* sop = sub_ops.top();
+ if (_debug)
+ std::cout << "====>>" << sop->detail() << std::endl;;
+ delete sop;
+ sub_ops.pop();
+ }
+ string outfile() {
+ return _out;
+ }
+ string infile() {
+ return _in;
+ }
+ bool is_debug() {
+ return _debug;
+ }
+ bool confirm_chg() {
+ return _confirm;
+ }
+ private:
+ stack<sub_op*> sub_ops;
+ bool _debug;
+ string _out;
+ string _in;
+ bool _confirm;
+ };
+ MDSRoleSelector role_selector;
+ mds_rank_t rank;
+
+ // I/O handles
+ librados::Rados rados;
+ librados::IoCtx io_meta;
+ std::vector<librados::IoCtx*> io_data_v;
+ librados::IoCtx output;
+ bool _debug;
+ uint64_t features;
+
+ std::string obj_name(inodeno_t ino, frag_t fg = frag_t(), const char *suffix = NULL) const;
+ std::string obj_name(inodeno_t ino, uint64_t offset, const char *suffix = NULL) const;
+ std::string obj_name(const char* ino, uint64_t offset, const char *suffix = NULL) const;
+
+ // 0 : continue to find
+ // 1 : stop to find it
+ int show_child(std::string_view key,
+ std::string_view dname,
+ const snapid_t last,
+ bufferlist &bl,
+ const int pos,
+ const std::set<snapid_t> *snaps,
+ bool *force_dirty,
+ inodeno_t sp_ino = 0,
+ meta_op* op = NULL
+ );
+
+ int process(string& mode, string& ino, string out, string in, bool confirm);
+ int show_meta_info(string& ino, string& out);
+ int list_meta_info(string& ino, string& out);
+ int amend_meta_info(string& ino, string& in, bool confirm);
+ int show_fnode(string& ino, string& out);
+ int amend_fnode(string& in, bool confirm);
+ int op_process(meta_op &op);
+ int list_meta(meta_op &op);
+ int file_meta(meta_op &op);
+ int show_meta(meta_op &op);
+ int amend_meta(meta_op &op);
+ int show_fn(meta_op &op);
+ int amend_fn(meta_op &op);
+ public:
+ int _file_meta(meta_op &op, librados::IoCtx& io);
+ int _show_meta(inode_meta_t& i, const string& fn);
+ int _amend_meta(string &k, inode_meta_t& i, const string& fn, meta_op& op);
+ int _show_fn(inode_meta_t& i, const string& fn);
+ int _amend_fn(const string& fn, bool confirm);
+ void usage();
+ MetaTool(bool debug=false):
+ _debug(debug) {}
+ ~MetaTool() {}
+
+ int main(string& mode,
+ string& rank_str,
+ string& minfo,
+ string&ino,
+ string& out,
+ string& in,
+ bool confirm = false
+ );
+};
+#endif // METATOOL_H__
diff --git a/src/tools/cephfs/PgFiles.cc b/src/tools/cephfs/PgFiles.cc
new file mode 100644
index 000000000..2abca7223
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.cc
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "osdc/Striper.h"
+
+#include "PgFiles.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << "pgeffects." << __func__ << ": "
+
+int PgFiles::init()
+{
+ int r = ceph_create_with_context(&cmount, g_ceph_context);
+ if (r != 0) {
+ return r;
+ }
+
+ return ceph_init(cmount);
+}
+
+PgFiles::PgFiles(Objecter *o, const std::set<pg_t> &pgs_)
+ : objecter(o), pgs(pgs_)
+{
+ for (const auto &i : pgs) {
+ pools.insert(i.m_pool);
+ }
+}
+
+PgFiles::~PgFiles()
+{
+ ceph_release(cmount);
+}
+
+void PgFiles::hit_dir(std::string const &path)
+{
+ dout(10) << "entering " << path << dendl;
+
+ ceph_dir_result *dr = nullptr;
+ int r = ceph_opendir(cmount, path.c_str(), &dr);
+ if (r != 0) {
+ derr << "Failed to open path: " << cpp_strerror(r) << dendl;
+ return;
+ }
+
+ struct dirent de;
+ while((r = ceph_readdir_r(cmount, dr, &de)) != 0) {
+ if (r < 0) {
+ derr << "Error reading path " << path << ": " << cpp_strerror(r)
+ << dendl;
+ ceph_closedir(cmount, dr); // best effort, ignore r
+ return;
+ }
+
+ if (std::string(de.d_name) == "." || std::string(de.d_name) == "..") {
+ continue;
+ }
+
+ struct ceph_statx stx;
+ std::string de_path = (path + std::string("/") + de.d_name);
+ r = ceph_statx(cmount, de_path.c_str(), &stx,
+ CEPH_STATX_INO|CEPH_STATX_SIZE, 0);
+ if (r != 0) {
+ derr << "Failed to stat path " << de_path << ": "
+ << cpp_strerror(r) << dendl;
+ // Don't hold up the whole process for one bad inode
+ continue;
+ }
+
+ if (S_ISREG(stx.stx_mode)) {
+ hit_file(de_path, stx);
+ } else if (S_ISDIR(stx.stx_mode)) {
+ hit_dir(de_path);
+ } else {
+ dout(20) << "Skipping non reg/dir file: " << de_path << dendl;
+ }
+ }
+
+ r = ceph_closedir(cmount, dr);
+ if (r != 0) {
+ derr << "Error closing path " << path << ": " << cpp_strerror(r) << dendl;
+ return;
+ }
+}
+
+void PgFiles::hit_file(std::string const &path, const struct ceph_statx &stx)
+{
+ ceph_assert(S_ISREG(stx.stx_mode));
+
+ dout(20) << "Hitting file '" << path << "'" << dendl;
+
+ int l_stripe_unit = 0;
+ int l_stripe_count = 0;
+ int l_object_size = 0;
+ int l_pool_id = 0;
+ int r = ceph_get_path_layout(cmount, path.c_str(), &l_stripe_unit,
+ &l_stripe_count, &l_object_size,
+ &l_pool_id);
+ if (r != 0) {
+ derr << "Error reading layout on " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+
+ struct file_layout_t layout;
+ layout.stripe_unit = l_stripe_unit;
+ layout.stripe_count = l_stripe_count;
+ layout.object_size = l_object_size;
+ layout.pool_id = l_pool_id;
+
+ // Avoid calculating PG if the layout targeted a completely different pool
+ if (pools.count(layout.pool_id) == 0) {
+ dout(20) << "Fast check missed: pool " << layout.pool_id << " not in "
+ "target set" << dendl;
+ return;
+ }
+
+ auto num_objects = Striper::get_num_objects(layout, stx.stx_size);
+
+ for (uint64_t i = 0; i < num_objects; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)stx.stx_ino,
+ (long long unsigned int)i);
+ dout(20) << " object " << std::string(buf) << dendl;
+
+ pg_t target;
+ object_t oid;
+ object_locator_t loc;
+ loc.pool = layout.pool_id;
+ loc.key = std::string(buf);
+
+ unsigned pg_num_mask = 0;
+ unsigned pg_num = 0;
+
+ int r = 0;
+ objecter->with_osdmap([&r, oid, loc, &target, &pg_num_mask, &pg_num]
+ (const OSDMap &osd_map) {
+ r = osd_map.object_locator_to_pg(oid, loc, target);
+ if (r == 0) {
+ auto pool = osd_map.get_pg_pool(loc.pool);
+ pg_num_mask = pool->get_pg_num_mask();
+ pg_num = pool->get_pg_num();
+ }
+ });
+ if (r != 0) {
+ // Can happen if layout pointed to pool not in osdmap, for example
+ continue;
+ }
+
+ target.m_seed = ceph_stable_mod(target.ps(), pg_num, pg_num_mask);
+
+ dout(20) << " target " << target << dendl;
+
+ if (pgs.count(target)) {
+ std::cout << path << std::endl;
+ return;
+ }
+ }
+
+}
+
+int PgFiles::scan_path(std::string const &path)
+{
+ int r = ceph_mount(cmount, "/");
+ if (r != 0) {
+ derr << "Failed to mount: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ hit_dir(path);
+
+ r = ceph_unmount(cmount);
+ if (r != 0) {
+ derr << "Failed to unmount: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+}
+
diff --git a/src/tools/cephfs/PgFiles.h b/src/tools/cephfs/PgFiles.h
new file mode 100644
index 000000000..1ba4b3d28
--- /dev/null
+++ b/src/tools/cephfs/PgFiles.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PG_EFFECTS_H_
+#define PG_EFFECTS_H_
+
+#include "include/cephfs/libcephfs.h"
+#include "osd/osd_types.h"
+#include <set>
+#include "osdc/Objecter.h"
+
+/**
+ * This utility scans the files (via an online MDS) and works out
+ * which ones rely on named PGs. For use when someone has
+ * some bad/damaged PGs and wants to see which files might be
+ * affected.
+ */
+class PgFiles
+{
+private:
+ Objecter *objecter;
+ struct ceph_mount_info *cmount = nullptr;
+
+ std::set<pg_t> pgs;
+ std::set<uint64_t> pools;
+
+ void hit_file(std::string const &path, const struct ceph_statx &stx);
+ void hit_dir(std::string const &path);
+
+
+public:
+ PgFiles(Objecter *o, const std::set<pg_t> &pgs_);
+ ~PgFiles();
+
+ int init();
+ int scan_path(std::string const &path);
+};
+
+#endif
+
diff --git a/src/tools/cephfs/Resetter.cc b/src/tools/cephfs/Resetter.cc
new file mode 100644
index 000000000..278a48767
--- /dev/null
+++ b/src/tools/cephfs/Resetter.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <memory>
+#include "common/errno.h"
+#include "osdc/Journaler.h"
+#include "mds/JournalPointer.h"
+
+#include "mds/mdstypes.h"
+#include "mds/MDCache.h"
+#include "mon/MonClient.h"
+#include "mds/events/EResetJournal.h"
+
+#include "Resetter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+
+int Resetter::init(mds_role_t role_, const std::string &type, bool hard)
+{
+ role = role_;
+ int r = MDSUtility::init();
+ if (r < 0) {
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(nullptr != fs);
+
+ is_mdlog = false;
+ if (type == "mdlog") {
+ JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
+ int rt = 0;
+ if (hard) {
+ jp.front = role.rank + MDS_INO_LOG_OFFSET;
+ jp.back = 0;
+ rt = jp.save(objecter);
+ if (rt != 0) {
+ derr << "Error writing journal pointer: " << cpp_strerror(rt) << dendl;
+ return rt;
+ }
+ ino = jp.front; // only need to reset ino for mdlog
+ } else {
+ rt = jp.load(objecter);
+ if (rt != 0) {
+ std::cerr << "Error loading journal: " << cpp_strerror(rt) <<
+ ", pass --force to forcibly reset this journal" << std::endl;
+ return rt;
+ } else {
+ ino = jp.front;
+ }
+ }
+ is_mdlog = true;
+ } else if (type == "purge_queue") {
+ ino = MDS_INO_PURGE_QUEUE + role.rank;
+ } else {
+ ceph_abort(); // should not get here
+ }
+ return 0;
+}
+
+int Resetter::reset()
+{
+ ceph::mutex mylock = ceph::make_mutex("Resetter::reset::lock");
+ ceph::condition_variable cond;
+ bool done;
+ int r;
+
+ auto fs = fsmap->get_filesystem(role.fscid);
+ ceph_assert(fs != nullptr);
+
+ Journaler journaler("resetter", ino,
+ fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC,
+ objecter, 0, 0, &finisher);
+ {
+ std::lock_guard locker{lock};
+ journaler.recover(new C_SafeCond(mylock, cond, &done, &r));
+ }
+ {
+ std::unique_lock locker{mylock};
+ cond.wait(locker, [&done] { return done; });
+ }
+ if (r != 0) {
+ if (r == -ENOENT) {
+ cerr << "journal does not exist on-disk. Did you set a bad rank?"
+ << std::endl;
+ std::cerr << "Error loading journal: " << cpp_strerror(r) <<
+ ", pass --force to forcibly reset this journal" << std::endl;
+ return r;
+ } else {
+ cerr << "got error " << r << "from Journaler, failing" << std::endl;
+ return r;
+ }
+ }
+
+ lock.lock();
+ uint64_t old_start = journaler.get_read_pos();
+ uint64_t old_end = journaler.get_write_pos();
+ uint64_t old_len = old_end - old_start;
+ cout << "old journal was " << old_start << "~" << old_len << std::endl;
+
+ uint64_t new_start = round_up_to(old_end+1, journaler.get_layout_period());
+ cout << "new journal start will be " << new_start
+ << " (" << (new_start - old_end) << " bytes past old end)" << std::endl;
+
+ journaler.set_read_pos(new_start);
+ journaler.set_write_pos(new_start);
+ journaler.set_expire_pos(new_start);
+ journaler.set_trimmed_pos(new_start);
+ journaler.set_writeable();
+
+ cout << "writing journal head" << std::endl;
+ journaler.write_head(new C_SafeCond(mylock, cond, &done, &r));
+ lock.unlock();
+ {
+ std::unique_lock locker{mylock};
+ cond.wait(locker, [&done] { return done; });
+ }
+ std::lock_guard l{lock};
+ if (r != 0) {
+ return r;
+ }
+
+ if (is_mdlog) {
+ r = _write_reset_event(&journaler); // reset envent is specific for mdlog journal
+ if (r != 0) {
+ return r;
+ }
+ }
+ cout << "done" << std::endl;
+
+ return 0;
+}
+
+int Resetter::reset_hard()
+{
+ auto fs = fsmap->get_filesystem(role.fscid);
+
+ Journaler journaler("resetter", ino,
+ fs->mds_map.get_metadata_pool(),
+ CEPH_FS_ONDISK_MAGIC,
+ objecter, 0, 0, &finisher);
+ journaler.set_writeable();
+
+ file_layout_t default_log_layout = MDCache::gen_default_log_layout(
+ fsmap->get_filesystem(role.fscid)->mds_map);
+ journaler.create(&default_log_layout, g_conf()->mds_journal_format);
+
+ C_SaferCond cond;
+ {
+ std::lock_guard l{lock};
+ journaler.write_head(&cond);
+ }
+
+ int r = cond.wait();
+ if (r != 0) {
+ derr << "Error writing journal header: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (is_mdlog) // reset event is specific for mdlog journal
+ {
+ std::lock_guard l{lock};
+ r = _write_reset_event(&journaler);
+ if (r != 0) {
+ derr << "Error writing EResetJournal: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (is_mdlog) {
+ dout(4) << "Successfully wrote new journal pointer and header for rank "
+ << role << dendl;
+ } else {
+ dout(4) << "Successfully wrote header for rank " << role << dendl;
+ }
+ return 0;
+}
+
+int Resetter::_write_reset_event(Journaler *journaler)
+{
+ ceph_assert(journaler != NULL);
+
+ auto le = std::make_unique<EResetJournal>();
+
+ bufferlist bl;
+ le->encode_with_header(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+
+ cout << "writing EResetJournal entry" << std::endl;
+ journaler->append_entry(bl);
+
+ int ret;
+ {
+ C_SaferCond cond;
+ journaler->flush(&cond);
+ ret = cond.wait();
+ if (ret < 0)
+ return ret;
+ }
+ {
+ // wait until all journal prezero ops are done
+ C_SaferCond cond;
+ journaler->wait_for_prezero(&cond);
+ cond.wait();
+ }
+
+ return ret;
+}
+
diff --git a/src/tools/cephfs/Resetter.h b/src/tools/cephfs/Resetter.h
new file mode 100644
index 000000000..6998e4598
--- /dev/null
+++ b/src/tools/cephfs/Resetter.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef JOURNAL_RESETTER_H_
+#define JOURNAL_RESETTER_H_
+
+
+#include "MDSUtility.h"
+
+class Journaler;
+
+/**
+ * This class lets you reset an mds journal for troubleshooting or whatever.
+ *
+ * To use, create a Resetter, call init(), and then call reset() with the name
+ * of the file to dump to.
+ */
+class Resetter : public MDSUtility {
+private:
+ mds_role_t role;
+ inodeno_t ino;
+ bool is_mdlog;
+
+protected:
+ int _write_reset_event(Journaler *journaler);
+
+public:
+ Resetter() {}
+ ~Resetter() {}
+
+ int init(mds_role_t role_, const std::string &type, bool hard);
+ /**
+ * For use when no journal header/pointer was present: write one
+ * out from scratch.
+ */
+ int reset_hard();
+ int reset();
+};
+
+#endif /* JOURNAL_RESETTER_H_ */
diff --git a/src/tools/cephfs/RoleSelector.cc b/src/tools/cephfs/RoleSelector.cc
new file mode 100644
index 000000000..e2d53b86e
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.cc
@@ -0,0 +1,59 @@
+
+#include "RoleSelector.h"
+
+int MDSRoleSelector::parse_rank(
+ const FSMap &fsmap,
+ std::string const &str)
+{
+ if (str == "all" || str == "*") {
+ std::set<mds_rank_t> in;
+ const MDSMap &mds_map = fsmap.get_filesystem(fscid)->mds_map;
+ mds_map.get_mds_set(in);
+
+ for (auto rank : in) {
+ roles.push_back(mds_role_t(fscid, rank));
+ }
+
+ return 0;
+ } else {
+ std::string rank_err;
+ mds_rank_t rank = strict_strtol(str.c_str(), 10, &rank_err);
+ if (!rank_err.empty()) {
+ return -EINVAL;
+ }
+ if (fsmap.get_filesystem(fscid)->mds_map.is_dne(rank)) {
+ return -ENOENT;
+ }
+ roles.push_back(mds_role_t(fscid, rank));
+ return 0;
+ }
+}
+
+int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank)
+{
+ auto colon_pos = str.find(":");
+ if (colon_pos == std::string::npos) {
+ // An unqualified rank. Only valid if there is only one
+ // namespace.
+ if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) {
+ fscid = fsmap.get_filesystem()->fscid;
+ return parse_rank(fsmap, str);
+ } else {
+ return -EINVAL;
+ }
+ } else if (colon_pos == 0 || colon_pos == str.size() - 1) {
+ return -EINVAL;
+ } else {
+ const std::string ns_str = str.substr(0, colon_pos);
+ const std::string rank_str = str.substr(colon_pos + 1);
+ std::shared_ptr<const Filesystem> fs_ptr;
+ int r = fsmap.parse_filesystem(ns_str, &fs_ptr);
+ if (r != 0) {
+ return r;
+ }
+ fscid = fs_ptr->fscid;
+ return parse_rank(fsmap, rank_str);
+ }
+}
+
diff --git a/src/tools/cephfs/RoleSelector.h b/src/tools/cephfs/RoleSelector.h
new file mode 100644
index 000000000..9090b7200
--- /dev/null
+++ b/src/tools/cephfs/RoleSelector.h
@@ -0,0 +1,36 @@
+
+#ifndef ROLE_SELECTOR_H_
+#define ROLE_SELECTOR_H_
+
+#include <string>
+#include <vector>
+#include "mds/mdstypes.h"
+#include "mds/FSMap.h"
+
+/**
+ * When you want to let the user act on a single rank in a namespace,
+ * or all of them.
+ */
+class MDSRoleSelector
+{
+ public:
+ const std::vector<mds_role_t> &get_roles() const {return roles;}
+ int parse(const FSMap &fsmap, std::string const &str,
+ bool allow_unqualified_rank=true);
+ MDSRoleSelector()
+ : fscid(FS_CLUSTER_ID_NONE)
+ {}
+ fs_cluster_id_t get_ns() const
+ {
+ return fscid;
+ }
+ protected:
+ int parse_rank(
+ const FSMap &fsmap,
+ std::string const &str);
+ std::vector<mds_role_t> roles;
+ fs_cluster_id_t fscid;
+};
+
+#endif // ROLE_SELECTOR_H_
+
diff --git a/src/tools/cephfs/TableTool.cc b/src/tools/cephfs/TableTool.cc
new file mode 100644
index 000000000..e779b4b66
--- /dev/null
+++ b/src/tools/cephfs/TableTool.cc
@@ -0,0 +1,417 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+
+#include "mds/SessionMap.h"
+#include "mds/InoTable.h"
+#include "mds/SnapServer.h"
+
+#include "TableTool.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << ": "
+
+void TableTool::usage()
+{
+ std::cout << "Usage: \n"
+ << " cephfs-table-tool <all|[mds rank]> <reset|show> <session|snap|inode>"
+ << " cephfs-table-tool <all|[mds rank]> <take_inos> <max_ino>"
+ << std::endl;
+
+ generic_client_usage();
+}
+
+
+/**
+ * For a function that takes an MDS role as an argument and
+ * returns an error code, execute it on the roles specified
+ * by `role_selector`.
+ */
+int TableTool::apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f)
+{
+ ceph_assert(f != NULL);
+
+ int r = 0;
+
+ f->open_object_section("ranks");
+
+ for (auto role : role_selector.get_roles()) {
+ std::ostringstream rank_str;
+ rank_str << role.rank;
+ f->open_object_section(rank_str.str().c_str());
+
+ f->open_object_section("data");
+ int rank_r = fptr(role, f);
+ f->close_section();
+ r = r ? r : rank_r;
+
+ f->dump_int("result", rank_r);
+ f->close_section();
+
+
+ }
+
+ f->close_section();
+
+ return r;
+}
+
+
+/**
+ * This class wraps an MDS table class (SessionMap, SnapServer, InoTable)
+ * with offline load/store code such that we can do offline dumps and resets
+ * on those tables.
+ */
+template <typename A>
+class TableHandler
+{
+protected:
+ // The RADOS object ID for the table
+ std::string object_name;
+
+ // The role in question (may be NONE)
+ mds_role_t role;
+
+ // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+ bool mds_table;
+
+public:
+ TableHandler(mds_role_t r, std::string const &name, bool mds_table_)
+ : role(r), mds_table(mds_table_)
+ {
+ // Compose object name of the table we will dump
+ std::ostringstream oss;
+ oss << "mds";
+ if (!role.is_none()) {
+ oss << role.rank;
+ }
+ oss << "_" << name;
+ object_name = oss.str();
+ }
+
+ int load_and_dump(librados::IoCtx *io, Formatter *f)
+ {
+ ceph_assert(io != NULL);
+ ceph_assert(f != NULL);
+
+ // Attempt read
+ bufferlist table_bl;
+ int read_r = io->read(object_name, table_bl, 0, 0);
+ if (read_r >= 0) {
+ auto q = table_bl.cbegin();
+ try {
+ if (mds_table) {
+ version_t version;
+ decode(version, q);
+ f->dump_int("version", version);
+ }
+ A table_inst;
+ table_inst.set_rank(role.rank);
+ table_inst.decode(q);
+ table_inst.dump(f);
+
+ return 0;
+ } catch (buffer::error &e) {
+ derr << "table " << object_name << " is corrupt" << dendl;
+ return -EIO;
+ }
+ } else {
+ derr << "error reading table object " << object_name
+ << ": " << cpp_strerror(read_r) << dendl;
+ return read_r;
+ }
+ }
+
+ int reset(librados::IoCtx *io)
+ {
+ A table_inst;
+ // Compose new (blank) table
+ table_inst.set_rank(role.rank);
+ table_inst.reset_state();
+ // Write the table out
+ return write(table_inst, io);
+ }
+
+protected:
+
+ int write(const A &table_inst, librados::IoCtx *io)
+ {
+ bufferlist new_bl;
+ if (mds_table) {
+ version_t version = 1;
+ encode(version, new_bl);
+ }
+ table_inst.encode_state(new_bl);
+
+ // Write out new table
+ int r = io->write_full(object_name, new_bl);
+ if (r != 0) {
+ derr << "error writing table object " << object_name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return r;
+ }
+};
+
+template <typename A>
+class TableHandlerOmap
+{
+private:
+ // The RADOS object ID for the table
+ std::string object_name;
+
+ // The role (rank may be NONE)
+ mds_role_t role;
+
+ // Whether this is an MDSTable subclass (i.e. has leading version field to decode)
+ bool mds_table;
+
+public:
+ TableHandlerOmap(mds_role_t r, std::string const &name, bool mds_table_)
+ : role(r), mds_table(mds_table_)
+ {
+ // Compose object name of the table we will dump
+ std::ostringstream oss;
+ oss << "mds";
+ if (!role.is_none()) {
+ oss << role.rank;
+ }
+ oss << "_" << name;
+ object_name = oss.str();
+ }
+
+ int load_and_dump(librados::IoCtx *io, Formatter *f)
+ {
+ ceph_assert(io != NULL);
+ ceph_assert(f != NULL);
+
+ // Read in the header
+ bufferlist header_bl;
+ int r = io->omap_get_header(object_name, &header_bl);
+ if (r != 0) {
+ derr << "error reading header on '" << object_name << "': "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Decode the header
+ A table_inst;
+ table_inst.set_rank(role.rank);
+ try {
+ table_inst.decode_header(header_bl);
+ } catch (buffer::error &e) {
+ derr << "table " << object_name << " is corrupt" << dendl;
+ return -EIO;
+ }
+
+ // Read and decode OMAP values in chunks
+ std::string last_key = "";
+ while(true) {
+ std::map<std::string, bufferlist> values;
+ int r = io->omap_get_vals(object_name, last_key,
+ g_conf()->mds_sessionmap_keys_per_op, &values);
+
+ if (r != 0) {
+ derr << "error reading values: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (values.empty()) {
+ break;
+ }
+
+ try {
+ table_inst.decode_values(values);
+ } catch (buffer::error &e) {
+ derr << "table " << object_name << " is corrupt" << dendl;
+ return -EIO;
+ }
+ last_key = values.rbegin()->first;
+ }
+
+ table_inst.dump(f);
+
+ return 0;
+ }
+
+ int reset(librados::IoCtx *io)
+ {
+ A table_inst;
+ table_inst.set_rank(role.rank);
+ table_inst.reset_state();
+ bufferlist header_bl;
+ table_inst.encode_header(&header_bl);
+
+ // Compose a transaction to clear and write header
+ librados::ObjectWriteOperation op;
+ op.omap_clear();
+ op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
+ op.omap_set_header(header_bl);
+
+ return io->operate(object_name, &op);
+ }
+};
+
+class InoTableHandler : public TableHandler<InoTable>
+{
+ public:
+ explicit InoTableHandler(mds_role_t r)
+ : TableHandler(r, "inotable", true)
+ {}
+
+ int take_inos(librados::IoCtx *io, inodeno_t max, Formatter *f)
+ {
+ InoTable inst;
+ inst.set_rank(role.rank);
+ inst.reset_state();
+
+ int r = 0;
+ if (inst.force_consume_to(max)) {
+ r = write(inst, io);
+ }
+
+ f->dump_int("version", inst.get_version());
+ inst.dump(f);
+
+ return r;
+ }
+};
+
+
+int TableTool::main(std::vector<const char*> &argv)
+{
+ int r;
+
+ dout(10) << __func__ << dendl;
+
+ // RADOS init
+ // ==========
+ r = rados.init_with_context(g_ceph_context);
+ if (r < 0) {
+ derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
+ return r;
+ }
+
+ dout(4) << "connecting to RADOS..." << dendl;
+ r = rados.connect();
+ if (r < 0) {
+ derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // Require at least 3 args <rank> <mode> <arg> [args...]
+ if (argv.size() < 3) {
+ cerr << "missing required 3 arguments" << std::endl;
+ return -EINVAL;
+ }
+
+ const std::string role_str = std::string(argv[0]);
+ const std::string mode = std::string(argv[1]);
+ const std::string table = std::string(argv[2]);
+
+ r = role_selector.parse(*fsmap, role_str);
+ if (r < 0) {
+ derr << "Bad rank selection: " << role_str << "'" << dendl;
+ return r;
+ }
+
+ auto fs = fsmap->get_filesystem(role_selector.get_ns());
+ ceph_assert(fs != nullptr);
+ int64_t const pool_id = fs->mds_map.get_metadata_pool();
+ dout(4) << "resolving pool " << pool_id << dendl;
+ std::string pool_name;
+ r = rados.pool_reverse_lookup(pool_id, &pool_name);
+ if (r < 0) {
+ derr << "Pool " << pool_id << " identified in MDS map not found in RADOS!"
+ << dendl;
+ return r;
+ }
+
+ dout(4) << "creating IoCtx.." << dendl;
+ r = rados.ioctx_create(pool_name.c_str(), io);
+ if (r != 0) {
+ return r;
+ }
+
+ JSONFormatter jf(true);
+ if (mode == "reset") {
+ const std::string table = std::string(argv[2]);
+ if (table == "session") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).reset(&io);
+ }, &jf);
+ } else if (table == "inode") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandler<InoTable>(rank, "inotable", true).reset(&io);
+ }, &jf);
+ } else if (table == "snap") {
+ r = TableHandler<SnapServer>(mds_role_t(), "snaptable", true).reset(&io);
+ jf.open_object_section("reset_snap_status");
+ jf.dump_int("result", r);
+ jf.close_section();
+ } else {
+ cerr << "Invalid table '" << table << "'" << std::endl;
+ return -EINVAL;
+ }
+ } else if (mode == "show") {
+ const std::string table = std::string(argv[2]);
+ if (table == "session") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandlerOmap<SessionMapStore>(rank, "sessionmap", false).load_and_dump(&io, f);
+ }, &jf);
+ } else if (table == "inode") {
+ r = apply_role_fn([this](mds_role_t rank, Formatter *f) -> int {
+ return TableHandler<InoTable>(rank, "inotable", true).load_and_dump(&io, f);;
+ }, &jf);
+ } else if (table == "snap") {
+ jf.open_object_section("show_snap_table");
+ {
+ r = TableHandler<SnapServer>(
+ mds_role_t(), "snaptable", true).load_and_dump(&io, &jf);
+ jf.dump_int("result", r);
+ }
+ jf.close_section();
+ } else {
+ cerr << "Invalid table '" << table << "'" << std::endl;
+ return -EINVAL;
+ }
+ } else if (mode == "take_inos") {
+ const std::string ino_str = std::string(argv[2]);
+ std::string ino_err;
+ inodeno_t ino = strict_strtoll(ino_str.c_str(), 10, &ino_err);
+ if (!ino_err.empty()) {
+ derr << "Bad ino '" << ino_str << "'" << dendl;
+ return -EINVAL;
+ }
+ r = apply_role_fn([this, ino](mds_role_t rank, Formatter *f) -> int {
+ return InoTableHandler(rank).take_inos(&io, ino, f);
+ }, &jf);
+ } else {
+ cerr << "Invalid mode '" << mode << "'" << std::endl;
+ return -EINVAL;
+ }
+
+ // Subcommand should have written to formatter, flush it
+ jf.flush(std::cout);
+ std::cout << std::endl;
+ return r;
+}
+
diff --git a/src/tools/cephfs/TableTool.h b/src/tools/cephfs/TableTool.h
new file mode 100644
index 000000000..bf9b95c12
--- /dev/null
+++ b/src/tools/cephfs/TableTool.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 John Spray <john.spray@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "MDSUtility.h"
+#include "RoleSelector.h"
+
+#include "include/rados/librados.hpp"
+
+/**
+ * Command line tool for debugging the backing store of
+ * MDSTable instances.
+ */
+class TableTool : public MDSUtility
+{
+ private:
+ MDSRoleSelector role_selector;
+
+ // I/O handles
+ librados::Rados rados;
+ librados::IoCtx io;
+
+ int apply_role_fn(std::function<int(mds_role_t, Formatter *)> fptr, Formatter *f);
+
+ public:
+ static void usage();
+ int main(std::vector<const char*> &argv);
+
+};
+
diff --git a/src/tools/cephfs/cephfs-data-scan.cc b/src/tools/cephfs/cephfs-data-scan.cc
new file mode 100644
index 000000000..e6efff66c
--- /dev/null
+++ b/src/tools/cephfs/cephfs-data-scan.cc
@@ -0,0 +1,47 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "DataScan.h"
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ DataScan::usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ DataScan data_scan;
+
+ // Connect to mon cluster, download MDS map etc
+ int rc = data_scan.init();
+ if (rc != 0) {
+ std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+
+ // Finally, execute the user's commands
+ rc = data_scan.main(args);
+ if (rc != 0) {
+ std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+ }
+
+
+ return rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-journal-tool.cc b/src/tools/cephfs/cephfs-journal-tool.cc
new file mode 100644
index 000000000..290cb305b
--- /dev/null
+++ b/src/tools/cephfs/cephfs-journal-tool.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 John Spray <john.spray@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "JournalTool.h"
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ JournalTool::usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ JournalTool jt;
+
+ // Connect to mon cluster, download MDS map etc
+ int rc = jt.init();
+ if (rc != 0) {
+ std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+
+ // Finally, execute the user's commands
+ rc = jt.main(args);
+ if (rc != 0) {
+ std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+ }
+
+ return rc;
+}
+
diff --git a/src/tools/cephfs/cephfs-meta-injection.cc b/src/tools/cephfs/cephfs-meta-injection.cc
new file mode 100644
index 000000000..5768d3869
--- /dev/null
+++ b/src/tools/cephfs/cephfs-meta-injection.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <include/types.h>
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "MetaTool.h"
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <boost/program_options.hpp>
+namespace po = boost::program_options;
+using std::string;
+using namespace std;
+static string version = "cephfs-meta-injection v1.1";
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ env_to_vec(args);
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ string rank_str, minfo, ino, out,in;
+ po::options_description general("general options");
+ general.add_options()
+ ("help,h", "produce help message")
+ ("debug", "show debug info")
+ ("rank,r", po::value<string>(&rank_str), "the rank of cephfs, default(0) (e.g. -r cephfs_a:0)")
+ ("minfo", po::value<string>(&minfo), "specify metapool, datapools and rank (e.g. cephfs_metadata_a:cephfs_data_a:0)")
+ ("ino,i", po::value<string>(&ino), "specify inode. e.g. 1099511627776 or 0x10000000000, you can find it with cmd, 'ls -i'")
+ ("out,o", po::value<string>(&out), "output file")
+ ("in", po::value<string>(&in), "input file")
+ ("yes-i-really-really-mean-it", "need by amend info")
+ ;
+
+ string mode;
+ po::options_description modeoptions("mode options");
+ modeoptions.add_options()
+ ("mode", po::value<string>(&mode),
+ "\tlistc : list all obj of dir\n" \
+ "\tshowm : show the info of ino\n" \
+ "\tshowfn : show the fnode of dir\n" \
+ "\tamend : amend part of the meta data\n" \
+ "\tamendfn : amend fnode from file\n"
+ );
+
+ po::positional_options_description p;
+ p.add("mode", 1);
+
+ po::options_description all("all options");
+ all.add(modeoptions).add(general);
+ po::variables_map vm;
+ try {
+ po::store(po::command_line_parser(argc, argv).options(all).positional(p).allow_unregistered().run(), vm);
+ } catch(exception &e) {
+ cerr << "error : " << e.what() << std::endl;
+ return -1;
+ } catch(...) {
+ cout << "param error" << std::endl;
+ return 0;
+ }
+
+ boost::program_options::notify(vm);
+ if (vm.count("help")) {
+ std::cout << version << std::endl;
+ std::cout << "usage : \n"
+ << " cephfs-meta-injection <listc|showm|showfn|amend|amendfn> -r <fsname:rank> -i <ino>"
+ << std::endl;
+ std::cout << "example : \n"
+ << " amend info of inode(1099531628828)\n"
+ << " cephfs-meta-injection showm -r cephfs_a:0 -i 1099531628828 -o out\n"
+ << " alter file\n"
+ << " cephfs-meta-injection amend -r cephfs_a:0 -i 1099531628828 --in out --yes-i-really-mean-it"
+ << std::endl;
+ std::cout << all << std::endl;
+ return 0;
+ }
+
+ MetaTool mt(vm.count("debug"));
+ int rc = mt.init();
+ if (rc != 0) {
+ std::cerr << "error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+ rc = mt.main(mode, rank_str, minfo, ino, out, in, vm.count("yes-i-really-really-mean-it"));
+ if (rc != 0) {
+ std::cerr << "error (" << cpp_strerror(rc) << ")" << std::endl;
+ return -1;
+ }
+ return rc;
+}
diff --git a/src/tools/cephfs/cephfs-table-tool.cc b/src/tools/cephfs/cephfs-table-tool.cc
new file mode 100644
index 000000000..47b475dd0
--- /dev/null
+++ b/src/tools/cephfs/cephfs-table-tool.cc
@@ -0,0 +1,47 @@
+
+#include "include/types.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/errno.h"
+#include "global/global_init.h"
+
+#include "TableTool.h"
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ TableTool::usage();
+ exit(0);
+ }
+
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ TableTool tt;
+
+ // Connect to mon cluster, download MDS map etc
+ int rc = tt.init();
+ if (rc != 0) {
+ std::cerr << "Error in initialization: " << cpp_strerror(rc) << std::endl;
+ return rc;
+ }
+
+ // Finally, execute the user's commands
+ rc = tt.main(args);
+ if (rc != 0) {
+ std::cerr << "Error (" << cpp_strerror(rc) << ")" << std::endl;
+ }
+
+ return rc;
+}
+
+
diff --git a/src/tools/cephfs/shell/CMakeLists.txt b/src/tools/cephfs/shell/CMakeLists.txt
new file mode 100644
index 000000000..5a1f6ad80
--- /dev/null
+++ b/src/tools/cephfs/shell/CMakeLists.txt
@@ -0,0 +1,7 @@
+include(Distutils)
+distutils_install_module(cephfs-shell)
+
+if(WITH_TESTS)
+ include(AddCephTest)
+ add_tox_test(cephfs-shell)
+endif()
diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell
new file mode 100755
index 000000000..51bd569e0
--- /dev/null
+++ b/src/tools/cephfs/shell/cephfs-shell
@@ -0,0 +1,1684 @@
+#!/usr/bin/python3
+# coding = utf-8
+
+import argparse
+import os
+import os.path
+import sys
+import cephfs as libcephfs
+import shutil
+import traceback
+import colorama
+import fnmatch
+import math
+import re
+import shlex
+import stat
+import errno
+
+from cmd2 import Cmd
+from cmd2 import __version__ as cmd2_version
+from distutils.version import LooseVersion
+
+if sys.version_info.major < 3:
+ raise RuntimeError("cephfs-shell is only compatible with python3")
+
+try:
+ from cmd2 import with_argparser
+except ImportError:
+ def with_argparser(argparser):
+ import functools
+
+ def argparser_decorator(func):
+ @functools.wraps(func)
+ def wrapper(thiz, cmdline):
+ if isinstance(cmdline, list):
+ arglist = cmdline
+ else:
+ # do not split if it's already a list
+ arglist = shlex.split(cmdline, posix=False)
+ # in case user quotes the command args
+ arglist = [arg.strip('\'""') for arg in arglist]
+ try:
+ args = argparser.parse_args(arglist)
+ except SystemExit:
+ shell.exit_code = 1
+ # argparse exits at seeing bad arguments
+ return
+ else:
+ return func(thiz, args)
+ argparser.prog = func.__name__[3:]
+ if argparser.description is None and func.__doc__:
+ argparser.description = func.__doc__
+
+ return wrapper
+
+ return argparser_decorator
+
+
+cephfs = None # holds CephFS Python bindings
+shell = None # holds instance of class CephFSShell
+exit_codes = {'Misc': 1,
+ 'KeyboardInterrupt': 2,
+ errno.EPERM: 3,
+ errno.EACCES: 4,
+ errno.ENOENT: 5,
+ errno.EIO: 6,
+ errno.ENOSPC: 7,
+ errno.EEXIST: 8,
+ errno.ENODATA: 9,
+ errno.EINVAL: 10,
+ errno.EOPNOTSUPP: 11,
+ errno.ERANGE: 12,
+ errno.EWOULDBLOCK: 13,
+ errno.ENOTEMPTY: 14,
+ errno.ENOTDIR: 15,
+ errno.EDQUOT: 16,
+ errno.EPIPE: 17,
+ errno.ESHUTDOWN: 18,
+ errno.ECONNABORTED: 19,
+ errno.ECONNREFUSED: 20,
+ errno.ECONNRESET: 21,
+ errno.EINTR: 22}
+
+
+#########################################################################
+#
+# Following are methods are generically useful through class CephFSShell
+#
+#######################################################################
+
+
+def poutput(s, end='\n'):
+ shell.poutput(s, end=end)
+
+
+def perror(msg, **kwargs):
+ shell.perror(msg, **kwargs)
+
+
+def set_exit_code_msg(errcode='Misc', msg=''):
+ """
+ Set exit code and print error message
+ """
+ if isinstance(msg, libcephfs.Error):
+ shell.exit_code = exit_codes[msg.get_error_code()]
+ else:
+ shell.exit_code = exit_codes[errcode]
+ if msg:
+ perror(msg)
+
+
+def mode_notation(mode):
+ """
+ """
+ permission_bits = {'0': '---',
+ '1': '--x',
+ '2': '-w-',
+ '3': '-wx',
+ '4': 'r--',
+ '5': 'r-x',
+ '6': 'rw-',
+ '7': 'rwx'}
+ mode = str(oct(mode))
+ notation = '-'
+ if mode[2] == '4':
+ notation = 'd'
+ elif mode[2:4] == '12':
+ notation = 'l'
+ for i in mode[-3:]:
+ notation += permission_bits[i]
+ return notation
+
+
+def get_chunks(file_size):
+ chunk_start = 0
+ chunk_size = 0x20000 # 131072 bytes, default max ssl buffer size
+ while chunk_start + chunk_size < file_size:
+ yield chunk_start, chunk_size
+ chunk_start += chunk_size
+ final_chunk_size = file_size - chunk_start
+ yield chunk_start, final_chunk_size
+
+
+def to_bytes(param):
+ # don't convert as follows as it can lead unusable results like coverting
+ # [1, 2, 3, 4] to '[1, 2, 3, 4]' -
+ # str(param).encode('utf-8')
+ if isinstance(param, bytes):
+ return param
+ elif isinstance(param, str):
+ return bytes(param, encoding='utf-8')
+ elif isinstance(param, list):
+ return [i.encode('utf-8') if isinstance(i, str) else to_bytes(i) for
+ i in param]
+ elif isinstance(param, int) or isinstance(param, float):
+ return str(param).encode('utf-8')
+ elif param is None:
+ return None
+
+
+def ls(path, opts=''):
+ # opts tries to be like /bin/ls opts
+ almost_all = 'A' in opts
+ try:
+ with cephfs.opendir(path) as d:
+ while True:
+ dent = cephfs.readdir(d)
+ if dent is None:
+ return
+ elif almost_all and dent.d_name in (b'.', b'..'):
+ continue
+ yield dent
+ except libcephfs.ObjectNotFound as e:
+ set_exit_code_msg(msg=e)
+
+
+def glob(path, pattern):
+ paths = []
+ parent_dir = os.path.dirname(path)
+ if parent_dir == b'':
+ parent_dir = b'/'
+ if path == b'/' or is_dir_exists(os.path.basename(path), parent_dir):
+ for i in ls(path, opts='A'):
+ if fnmatch.fnmatch(i.d_name, pattern):
+ paths.append(os.path.join(path, i.d_name))
+ return paths
+
+
+def locate_file(name, case_sensitive=True):
+ dir_list = sorted(set(dirwalk(cephfs.getcwd())))
+ if not case_sensitive:
+ return [dname for dname in dir_list if name.lower() in dname.lower()]
+ else:
+ return [dname for dname in dir_list if name in dname]
+
+
+def get_all_possible_paths(pattern):
+ complete_pattern = pattern[:]
+ paths = []
+ is_rel_path = not os.path.isabs(pattern)
+ if is_rel_path:
+ dir_ = cephfs.getcwd()
+ else:
+ dir_ = b'/'
+ pattern = pattern[1:]
+ patterns = pattern.split(b'/')
+ paths.extend(glob(dir_, patterns[0]))
+ patterns.pop(0)
+ for pattern in patterns:
+ for path in paths:
+ paths.extend(glob(path, pattern))
+ if is_rel_path:
+ complete_pattern = os.path.join(cephfs.getcwd(), complete_pattern)
+ return [path for path in paths if fnmatch.fnmatch(path, complete_pattern)]
+
+
+suffixes = ['B', 'K', 'M', 'G', 'T', 'P']
+
+
+def humansize(nbytes):
+ i = 0
+ while nbytes >= 1024 and i < len(suffixes) - 1:
+ nbytes /= 1024.
+ i += 1
+ nbytes = math.ceil(nbytes)
+ f = ('%d' % nbytes).rstrip('.')
+ return '%s%s' % (f, suffixes[i])
+
+
+def style_listing(path, is_dir, is_symlink, ls_long=False):
+ if not (is_dir or is_symlink):
+ return path
+ pretty = colorama.Style.BRIGHT
+ if is_symlink:
+ pretty += colorama.Fore.CYAN + path
+ if ls_long:
+ # Add target path
+ pretty += ' -> ' + cephfs.readlink(path, size=255).decode('utf-8')
+ elif is_dir:
+ pretty += colorama.Fore.BLUE + path + '/'
+ pretty += colorama.Style.RESET_ALL
+ return pretty
+
+
+def print_long(path, is_dir, is_symlink, human_readable):
+ info = cephfs.stat(path, follow_symlink=(not is_symlink))
+ pretty = style_listing(os.path.basename(path.decode('utf-8')), is_dir, is_symlink, True)
+ if human_readable:
+ sizefmt = '\t {:10s}'.format(humansize(info.st_size))
+ else:
+ sizefmt = '{:12d}'.format(info.st_size)
+ poutput(f'{mode_notation(info.st_mode)} {sizefmt} {info.st_uid} {info.st_gid} {info.st_mtime}'
+ f' {pretty}')
+
+
+def word_len(word):
+ """
+ Returns the word length, minus any color codes.
+ """
+ if word[0] == '\x1b':
+ return len(word) - 9
+ return len(word)
+
+
+def is_dir_exists(path, dir_=b''):
+ path_to_stat = os.path.join(dir_, path)
+ try:
+ return ((cephfs.stat(path_to_stat).st_mode & 0o0040000) != 0)
+ except libcephfs.Error:
+ return False
+
+
+def is_file_exists(path, dir_=b''):
+ try:
+ # if its not a directory, then its a file
+ return ((cephfs.stat(os.path.join(dir_, path)).st_mode & 0o0040000) == 0)
+ except libcephfs.Error:
+ return False
+
+
+def print_list(words, termwidth=79):
+ if not words:
+ return
+ words = [word.decode('utf-8') if isinstance(word, bytes) else word for word in words]
+ width = max([word_len(word) for word in words]) + 2
+ nwords = len(words)
+ ncols = max(1, (termwidth + 1) // (width + 1))
+ nrows = (nwords + ncols - 1) // ncols
+ for row in range(nrows):
+ for i in range(row, nwords, nrows):
+ word = words[i]
+ print_width = width
+ if word[0] == '\x1b':
+ print_width = print_width + 10
+
+ poutput('%-*s' % (print_width, words[i]),
+ end='\n' if i + nrows >= nwords else '')
+
+
+def copy_from_local(local_path, remote_path):
+ stdin = -1
+ file_ = None
+ fd = None
+ convert_to_bytes = False
+ if local_path == b'-':
+ file_ = sys.stdin
+ convert_to_bytes = True
+ else:
+ try:
+ file_ = open(local_path, 'rb')
+ except PermissionError as e:
+ set_exit_code_msg(e.errno, 'error: no permission to read local file {}'.format(
+ local_path.decode('utf-8')))
+ return
+ stdin = 1
+ try:
+ fd = cephfs.open(remote_path, 'w', 0o666)
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+ return
+ progress = 0
+ while True:
+ data = file_.read(65536)
+ if not data or len(data) == 0:
+ break
+ if convert_to_bytes:
+ data = to_bytes(data)
+ wrote = cephfs.write(fd, data, progress)
+ if wrote < 0:
+ break
+ progress += wrote
+ cephfs.close(fd)
+ if stdin > 0:
+ file_.close()
+ poutput('')
+
+
+def copy_to_local(remote_path, local_path):
+ fd = None
+ if local_path != b'-':
+ local_dir = os.path.dirname(local_path)
+ dir_list = remote_path.rsplit(b'/', 1)
+ if not os.path.exists(local_dir):
+ os.makedirs(local_dir)
+ if len(dir_list) > 2 and dir_list[1] == b'':
+ return
+ fd = open(local_path, 'wb+')
+ file_ = cephfs.open(remote_path, 'r')
+ file_size = cephfs.stat(remote_path).st_size
+ if file_size <= 0:
+ return
+ progress = 0
+ for chunk_start, chunk_size in get_chunks(file_size):
+ file_chunk = cephfs.read(file_, chunk_start, chunk_size)
+ progress += len(file_chunk)
+ if fd:
+ fd.write(file_chunk)
+ else:
+ poutput(file_chunk.decode('utf-8'))
+ cephfs.close(file_)
+ if fd:
+ fd.close()
+
+
+def dirwalk(path):
+ """
+ walk a directory tree, using a generator
+ """
+ path = os.path.normpath(path)
+ for item in ls(path, opts='A'):
+ fullpath = os.path.join(path, item.d_name)
+ src_path = fullpath.rsplit(b'/', 1)[0]
+
+ yield os.path.normpath(fullpath)
+ if is_dir_exists(item.d_name, src_path):
+ for x in dirwalk(fullpath):
+ yield x
+
+
+##################################################################
+#
+# Following methods are implementation for CephFS Shell commands
+#
+#################################################################
+
+class CephFSShell(Cmd):
+
+ def __init__(self):
+ super().__init__(use_ipython=False)
+ self.working_dir = cephfs.getcwd().decode('utf-8')
+ self.set_prompt()
+ self.interactive = False
+ self.umask = '2'
+
+ def default(self, line):
+ perror('Unrecognized command')
+
+ def set_prompt(self):
+ self.prompt = ('\033[01;33mCephFS:~' + colorama.Fore.LIGHTCYAN_EX
+ + self.working_dir + colorama.Style.RESET_ALL
+ + '\033[01;33m>>>\033[00m ')
+
+ def create_argparser(self, command):
+ try:
+ argparse_args = getattr(self, 'argparse_' + command)
+ except AttributeError:
+ set_exit_code_msg()
+ return None
+ doc_lines = getattr(
+ self, 'do_' + command).__doc__.expandtabs().splitlines()
+ if '' in doc_lines:
+ blank_idx = doc_lines.index('')
+ usage = doc_lines[:blank_idx]
+ description = doc_lines[blank_idx + 1:]
+ else:
+ usage = doc_lines
+ description = []
+ parser = argparse.ArgumentParser(
+ prog=command,
+ usage='\n'.join(usage),
+ description='\n'.join(description),
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ for args, kwargs in argparse_args:
+ parser.add_argument(*args, **kwargs)
+ return parser
+
+ def complete_filenames(self, text, line, begidx, endidx):
+ if not text:
+ completions = [x.d_name.decode('utf-8') + '/' * int(x.is_dir())
+ for x in ls(b".", opts='A')]
+ else:
+ if text.count('/') > 0:
+ completions = [text.rsplit('/', 1)[0] + '/'
+ + x.d_name.decode('utf-8') + '/'
+ * int(x.is_dir()) for x in ls('/'
+ + text.rsplit('/', 1)[0], opts='A')
+ if x.d_name.decode('utf-8').startswith(
+ text.rsplit('/', 1)[1])]
+ else:
+ completions = [x.d_name.decode('utf-8') + '/'
+ * int(x.is_dir()) for x in ls(b".", opts='A')
+ if x.d_name.decode('utf-8').startswith(text)]
+ if len(completions) == 1 and completions[0][-1] == '/':
+ dir_, file_ = completions[0].rsplit('/', 1)
+ completions.extend([dir_ + '/' + x.d_name.decode('utf-8')
+ + '/' * int(x.is_dir()) for x in
+ ls('/' + dir_, opts='A')
+ if x.d_name.decode('utf-8').startswith(file_)])
+ return self.delimiter_complete(text, line, begidx, endidx, completions, '/')
+ return completions
+
+ def onecmd(self, line, **kwargs):
+ """
+ Global error catcher
+ """
+ try:
+ res = Cmd.onecmd(self, line, **kwargs)
+ if self.interactive:
+ self.set_prompt()
+ return res
+ except ConnectionError as e:
+ set_exit_code_msg(e.errno, f'***\n{e}')
+ except KeyboardInterrupt:
+ set_exit_code_msg('KeyboardInterrupt', 'Command aborted')
+ except (libcephfs.Error, Exception) as e:
+ if shell.debug:
+ traceback.print_exc(file=sys.stdout)
+ set_exit_code_msg(msg=e)
+
+ class path_to_bytes(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ values = to_bytes(values)
+ setattr(namespace, self.dest, values)
+
+ # TODO: move the necessary contents from here to `class path_to_bytes`.
+ class get_list_of_bytes_path(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ values = to_bytes(values)
+
+ if values == b'.':
+ values = cephfs.getcwd()
+ else:
+ for i in values:
+ if i == b'.':
+ values[values.index(i)] = cephfs.getcwd()
+
+ setattr(namespace, self.dest, values)
+
+ def complete_mkdir(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ class ModeAction(argparse.Action):
+ def __init__(self, option_strings, dest, nargs=None, **kwargs):
+ if nargs is not None and nargs != '?':
+ raise ValueError("more than one modes not allowed")
+ super().__init__(option_strings, dest, **kwargs)
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ o_mode = 0
+ res = None
+ try:
+ o_mode = int(values, base=8)
+ except ValueError:
+ res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', values)
+ if res is None:
+ parser.error("invalid mode: %s\n"
+ "mode must be a numeric octal literal\n"
+ "or ((u?g?o?)|(a?))(=)(r?w?x?)" %
+ values)
+ else:
+ # we are supporting only assignment of mode and not + or -
+ # as is generally available with the chmod command
+ # eg.
+ # >>> res = re.match('((u?g?o?)|(a?))(=)(r?w?x?)', 'go=')
+ # >>> res.groups()
+ # ('go', 'go', None, '=', '')
+ val = res.groups()
+
+ if val[3] != '=':
+ parser.error("need assignment operator between user "
+ "and mode specifiers")
+ if val[4] == '':
+ parser.error("invalid mode: %s\n"
+ "mode must be combination of: r | w | x" %
+ values)
+ users = ''
+ if val[2] is None:
+ users = val[1]
+ else:
+ users = val[2]
+
+ t_mode = 0
+ if users == 'a':
+ users = 'ugo'
+
+ if 'r' in val[4]:
+ t_mode |= 4
+ if 'w' in val[4]:
+ t_mode |= 2
+ if 'x' in val[4]:
+ t_mode |= 1
+
+ if 'u' in users:
+ o_mode |= (t_mode << 6)
+ if 'g' in users:
+ o_mode |= (t_mode << 3)
+ if 'o' in users:
+ o_mode |= t_mode
+
+ if o_mode < 0:
+ parser.error("invalid mode: %s\n"
+ "mode cannot be negative" % values)
+ if o_mode > 0o777:
+ parser.error("invalid mode: %s\n"
+ "mode cannot be greater than octal 0777" % values)
+
+ setattr(namespace, self.dest, str(oct(o_mode)))
+
+ mkdir_parser = argparse.ArgumentParser(
+ description='Create the directory(ies), if they do not already exist.')
+ mkdir_parser.add_argument('dirs', type=str,
+ action=path_to_bytes,
+ metavar='DIR_NAME',
+ help='Name of new_directory.',
+ nargs='+')
+ mkdir_parser.add_argument('-m', '--mode', type=str,
+ action=ModeAction,
+ help='Sets the access mode for the new directory.')
+ mkdir_parser.add_argument('-p', '--parent', action='store_true',
+ help='Create parent directories as necessary. '
+ 'When this option is specified, no error is'
+ 'reported if a directory already exists.')
+
+ @with_argparser(mkdir_parser)
+ def do_mkdir(self, args):
+ """
+ Create directory.
+ """
+ for path in args.dirs:
+ if args.mode:
+ permission = int(args.mode, 8)
+ else:
+ permission = 0o777
+ if args.parent:
+ cephfs.mkdirs(path, permission)
+ else:
+ try:
+ cephfs.mkdir(path, permission)
+ except libcephfs.Error as e:
+ set_exit_code_msg(e)
+
+ def complete_put(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ index_dict = {1: self.path_complete}
+ return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+ put_parser = argparse.ArgumentParser(
+ description='Copy a file/directory to Ceph File System from Local File System.')
+ put_parser.add_argument('local_path', type=str, action=path_to_bytes,
+ help='Path of the file in the local system')
+ put_parser.add_argument('remote_path', type=str, action=path_to_bytes,
+ help='Path of the file in the remote system')
+ put_parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrites the destination if it already exists.')
+
+ @with_argparser(put_parser)
+ def do_put(self, args):
+ """
+ Copy a local file/directory to CephFS.
+ """
+ if args.local_path != b'-' and not os.path.isfile(args.local_path) \
+ and not os.path.isdir(args.local_path):
+ set_exit_code_msg(errno.ENOENT,
+ msg=f"error: "
+ f"{args.local_path.decode('utf-8')}: "
+ f"No such file or directory")
+ return
+
+ if (is_file_exists(args.remote_path) or is_dir_exists(
+ args.remote_path)) and not args.force:
+ set_exit_code_msg(msg=f"error: file/directory "
+ f"{args.remote_path.decode('utf-8')} "
+ f"exists, use --force to overwrite")
+ return
+
+ root_src_dir = args.local_path
+ root_dst_dir = args.remote_path
+ if args.local_path == b'.' or args.local_path == b'./':
+ root_src_dir = os.getcwdb()
+ elif len(args.local_path.rsplit(b'/', 1)) < 2:
+ root_src_dir = os.path.join(os.getcwdb(), args.local_path)
+ else:
+ p = args.local_path.split(b'/')
+ if p[0] == b'.':
+ root_src_dir = os.getcwdb()
+ p.pop(0)
+ while len(p) > 0:
+ root_src_dir += b'/' + p.pop(0)
+
+ if root_dst_dir == b'.':
+ if args.local_path != b'-':
+ root_dst_dir = root_src_dir.rsplit(b'/', 1)[1]
+ if root_dst_dir == b'':
+ root_dst_dir = root_src_dir.rsplit(b'/', 1)[0]
+ a = root_dst_dir.rsplit(b'/', 1)
+ if len(a) > 1:
+ root_dst_dir = a[1]
+ else:
+ root_dst_dir = a[0]
+ else:
+ set_exit_code_msg(errno.EINVAL, 'error: no filename specified '
+ 'for destination')
+ return
+
+ if root_dst_dir[-1] != b'/':
+ root_dst_dir += b'/'
+
+ if args.local_path == b'-' or os.path.isfile(root_src_dir):
+ if args.local_path == b'-':
+ root_src_dir = b'-'
+ copy_from_local(root_src_dir, root_dst_dir)
+ else:
+ for src_dir, dirs, files in os.walk(root_src_dir):
+ if isinstance(src_dir, str):
+ src_dir = to_bytes(src_dir)
+ dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1)
+ dst_dir = re.sub(rb'\/+', b'/', cephfs.getcwd()
+ + dst_dir)
+ if args.force and dst_dir != b'/' and not is_dir_exists(
+ dst_dir[:-1]) and not locate_file(dst_dir):
+ try:
+ cephfs.mkdirs(dst_dir, 0o777)
+ except libcephfs.Error:
+ pass
+ if (not args.force) and dst_dir != b'/' and not is_dir_exists(
+ dst_dir) and not os.path.isfile(root_src_dir):
+ try:
+ cephfs.mkdirs(dst_dir, 0o777)
+ except libcephfs.Error:
+ # TODO: perhaps, set retval to 1?
+ pass
+
+ for dir_ in dirs:
+ dir_name = os.path.join(dst_dir, dir_)
+ if not is_dir_exists(dir_name):
+ try:
+ cephfs.mkdirs(dir_name, 0o777)
+ except libcephfs.Error:
+ # TODO: perhaps, set retval to 1?
+ pass
+
+ for file_ in files:
+ src_file = os.path.join(src_dir, file_)
+ dst_file = re.sub(rb'\/+', b'/', b'/' + dst_dir + b'/' + file_)
+ if (not args.force) and is_file_exists(dst_file):
+ return
+ copy_from_local(src_file, os.path.join(cephfs.getcwd(),
+ dst_file))
+
+ def complete_get(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ get_parser = argparse.ArgumentParser(
+ description='Copy a file from Ceph File System to Local Directory.')
+ get_parser.add_argument('remote_path', type=str, action=path_to_bytes,
+ help='Path of the file in the remote system')
+ get_parser.add_argument('local_path', type=str, action=path_to_bytes,
+ help='Path of the file in the local system')
+ get_parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrites the destination if it already exists.')
+
+ @with_argparser(get_parser)
+ def do_get(self, args):
+ """
+ Copy a file/directory from CephFS to given path.
+ """
+ if not is_file_exists(args.remote_path) and not \
+ is_dir_exists(args.remote_path):
+ set_exit_code_msg(errno.ENOENT, "error: no file/directory"
+ " found at specified remote "
+ "path")
+ return
+ if (os.path.isfile(args.local_path) or os.path.isdir(
+ args.local_path)) and not args.force:
+ set_exit_code_msg(msg=f"error: file/directory "
+ f"{args.local_path.decode('utf-8')}"
+ f" already exists, use --force to "
+ f"overwrite")
+ return
+ root_src_dir = args.remote_path
+ root_dst_dir = args.local_path
+ fname = root_src_dir.rsplit(b'/', 1)
+ if args.local_path == b'.':
+ root_dst_dir = os.getcwdb()
+ if args.remote_path == b'.':
+ root_src_dir = cephfs.getcwd()
+ if args.local_path == b'-':
+ if args.remote_path == b'.' or args.remote_path == b'./':
+ set_exit_code_msg(errno.EINVAL, 'error: no remote file name specified')
+ return
+ copy_to_local(root_src_dir, b'-')
+ elif is_file_exists(args.remote_path):
+ copy_to_local(root_src_dir, root_dst_dir)
+ elif b'/' in root_src_dir and is_file_exists(fname[1], fname[0]):
+ copy_to_local(root_src_dir, root_dst_dir)
+ else:
+ files = list(reversed(sorted(dirwalk(root_src_dir))))
+ for file_ in files:
+ dst_dirpath, dst_file = file_.rsplit(b'/', 1)
+ if dst_dirpath in files:
+ files.remove(dst_dirpath)
+ dst_path = os.path.join(root_dst_dir, dst_dirpath, dst_file)
+ dst_path = os.path.normpath(dst_path)
+ if is_dir_exists(file_):
+ try:
+ os.makedirs(dst_path)
+ except OSError:
+ pass
+ else:
+ copy_to_local(file_, dst_path)
+
+ return 0
+
+ def complete_ls(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ ls_parser = argparse.ArgumentParser(
+ description='Copy a file from Ceph File System from Local Directory.')
+ ls_parser.add_argument('-l', '--long', action='store_true',
+ help='Detailed list of items in the directory.')
+ ls_parser.add_argument('-r', '--reverse', action='store_true',
+ help='Reverse order of listing items in the directory.')
+ ls_parser.add_argument('-H', action='store_true', help='Human Readable')
+ ls_parser.add_argument('-a', '--all', action='store_true',
+ help='Do not Ignore entries starting with .')
+ ls_parser.add_argument('-S', action='store_true', help='Sort by file_size')
+ ls_parser.add_argument('paths', help='Name of Directories',
+ action=path_to_bytes, nargs='*', default=['.'])
+
+ @with_argparser(ls_parser)
+ def do_ls(self, args):
+ """
+ List all the files and directories in the current working directory
+ """
+ paths = args.paths
+ for path in paths:
+ values = []
+ items = []
+ try:
+ if path.count(b'*') > 0:
+ all_items = get_all_possible_paths(path)
+ if len(all_items) == 0:
+ continue
+ path = all_items[0].rsplit(b'/', 1)[0]
+ if path == b'':
+ path = b'/'
+ dirs = []
+ for i in all_items:
+ for item in ls(path):
+ d_name = item.d_name
+ if os.path.basename(i) == d_name:
+ if item.is_dir():
+ dirs.append(os.path.join(path, d_name))
+ else:
+ items.append(item)
+ if dirs:
+ paths.extend(dirs)
+ else:
+ poutput(path.decode('utf-8'), end=':\n')
+ items = sorted(items, key=lambda item: item.d_name)
+ else:
+ if path != b'' and path != cephfs.getcwd() and len(paths) > 1:
+ poutput(path.decode('utf-8'), end=':\n')
+ items = sorted(ls(path), key=lambda item: item.d_name)
+ if not args.all:
+ items = [i for i in items if not i.d_name.startswith(b'.')]
+ if args.S:
+ items = sorted(items, key=lambda item: cephfs.stat(
+ path + b'/' + item.d_name, follow_symlink=(
+ not item.is_symbol_file())).st_size)
+ if args.reverse:
+ items = reversed(items)
+ for item in items:
+ filepath = item.d_name
+ is_dir = item.is_dir()
+ is_sym_lnk = item.is_symbol_file()
+ try:
+ if args.long and args.H:
+ print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir,
+ is_sym_lnk, True)
+ elif args.long:
+ print_long(os.path.join(cephfs.getcwd(), path, filepath), is_dir,
+ is_sym_lnk, False)
+ elif is_sym_lnk or is_dir:
+ values.append(style_listing(filepath.decode('utf-8'), is_dir,
+ is_sym_lnk))
+ else:
+ values.append(filepath)
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+ if not args.long:
+ print_list(values, shutil.get_terminal_size().columns)
+ if path != paths[-1]:
+ poutput('')
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ def complete_rmdir(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ rmdir_parser = argparse.ArgumentParser(description='Remove Directory.')
+ rmdir_parser.add_argument('paths', help='Directory Path.', nargs='+',
+ action=path_to_bytes)
+ rmdir_parser.add_argument('-p', '--parent', action='store_true',
+ help='Remove parent directories as necessary. '
+ 'When this option is specified, no error '
+ 'is reported if a directory has any '
+ 'sub-directories, files')
+
+ @with_argparser(rmdir_parser)
+ def do_rmdir(self, args):
+ self.do_rmdir_helper(args)
+
+ def do_rmdir_helper(self, args):
+ """
+ Remove a specific Directory
+ """
+ is_pattern = False
+ paths = args.paths
+ for path in paths:
+ if path.count(b'*') > 0:
+ is_pattern = True
+ all_items = get_all_possible_paths(path)
+ if len(all_items) > 0:
+ path = all_items[0].rsplit(b'/', 1)[0]
+ if path == b'':
+ path = b'/'
+ dirs = []
+ for i in all_items:
+ for item in ls(path):
+ d_name = item.d_name
+ if os.path.basename(i) == d_name:
+ if item.is_dir():
+ dirs.append(os.path.join(path, d_name))
+ paths.extend(dirs)
+ continue
+ else:
+ is_pattern = False
+
+ if args.parent:
+ path = os.path.join(cephfs.getcwd(), path.rsplit(b'/')[0])
+ files = list(sorted(set(dirwalk(path)), reverse=True))
+ if not files:
+ path = b'.'
+ for filepath in files:
+ try:
+ cephfs.rmdir(os.path.normpath(filepath))
+ except libcephfs.Error as e:
+ perror(e)
+ path = b'.'
+ break
+ else:
+ path = os.path.normpath(os.path.join(cephfs.getcwd(), path))
+ if not is_pattern and path != os.path.normpath(b''):
+ try:
+ cephfs.rmdir(path)
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ def complete_rm(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ rm_parser = argparse.ArgumentParser(description='Remove File.')
+ rm_parser.add_argument('paths', help='File Path.', nargs='+',
+ action=path_to_bytes)
+
+ @with_argparser(rm_parser)
+ def do_rm(self, args):
+ """
+ Remove a specific file
+ """
+ file_paths = args.paths
+ for path in file_paths:
+ if path.count(b'*') > 0:
+ file_paths.extend([i for i in get_all_possible_paths(
+ path) if is_file_exists(i)])
+ else:
+ try:
+ cephfs.unlink(path)
+ except libcephfs.Error as e:
+ # NOTE: perhaps we need a better msg here
+ set_exit_code_msg(msg=e)
+
+ def complete_mv(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ mv_parser = argparse.ArgumentParser(description='Move File.')
+ mv_parser.add_argument('src_path', type=str, action=path_to_bytes,
+ help='Source File Path.')
+ mv_parser.add_argument('dest_path', type=str, action=path_to_bytes,
+ help='Destination File Path.')
+
+ @with_argparser(mv_parser)
+ def do_mv(self, args):
+ """
+ Rename a file or Move a file from source path to the destination
+ """
+ cephfs.rename(args.src_path, args.dest_path)
+
+ def complete_cd(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ cd_parser = argparse.ArgumentParser(description='Change working directory')
+ cd_parser.add_argument('path', type=str, help='Name of the directory.',
+ action=path_to_bytes, nargs='?', default='/')
+
+ @with_argparser(cd_parser)
+ def do_cd(self, args):
+ """
+ Change working directory
+ """
+ cephfs.chdir(args.path)
+ self.working_dir = cephfs.getcwd().decode('utf-8')
+ self.set_prompt()
+
+ def do_cwd(self, arglist):
+ """
+ Get current working directory.
+ """
+ poutput(cephfs.getcwd().decode('utf-8'))
+
+ def complete_chmod(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ chmod_parser = argparse.ArgumentParser(description='Create Directory.')
+ chmod_parser.add_argument('mode', type=str, action=ModeAction, help='Mode')
+ chmod_parser.add_argument('paths', type=str, action=path_to_bytes,
+ help='Name of the file', nargs='+')
+
+ @with_argparser(chmod_parser)
+ def do_chmod(self, args):
+ """
+ Change permission of a file
+ """
+ for path in args.paths:
+ mode = int(args.mode, base=8)
+ try:
+ cephfs.chmod(path, mode)
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ def complete_cat(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ cat_parser = argparse.ArgumentParser(description='')
+ cat_parser.add_argument('paths', help='Name of Files', action=path_to_bytes,
+ nargs='+')
+
+ @with_argparser(cat_parser)
+ def do_cat(self, args):
+ """
+ Print contents of a file
+ """
+ for path in args.paths:
+ if is_file_exists(path):
+ copy_to_local(path, b'-')
+ else:
+ set_exit_code_msg(errno.ENOENT, '{}: no such file'.format(
+ path.decode('utf-8')))
+
+ umask_parser = argparse.ArgumentParser(description='Set umask value.')
+ umask_parser.add_argument('mode', help='Mode', type=str, action=ModeAction,
+ nargs='?', default='')
+
+ @with_argparser(umask_parser)
+ def do_umask(self, args):
+ """
+ Set Umask value.
+ """
+ if args.mode == '':
+ poutput(self.umask.zfill(4))
+ else:
+ mode = int(args.mode, 8)
+ self.umask = str(oct(cephfs.umask(mode))[2:])
+
+ def complete_write(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ write_parser = argparse.ArgumentParser(description='Writes data into a file')
+ write_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of File')
+
+ @with_argparser(write_parser)
+ def do_write(self, args):
+ """
+ Write data into a file.
+ """
+
+ copy_from_local(b'-', args.path)
+
+ def complete_lcd(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ index_dict = {1: self.path_complete}
+ return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+ lcd_parser = argparse.ArgumentParser(description='')
+ lcd_parser.add_argument('path', type=str, action=path_to_bytes, help='Path')
+
+ @with_argparser(lcd_parser)
+ def do_lcd(self, args):
+ """
+ Moves into the given local directory
+ """
+ try:
+ os.chdir(os.path.expanduser(args.path))
+ except OSError as e:
+ set_exit_code_msg(e.errno, "Cannot change to "
+ f"{e.filename.decode('utf-8')}: {e.strerror}")
+
+ def complete_lls(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ index_dict = {1: self.path_complete}
+ return self.index_based_complete(text, line, begidx, endidx, index_dict)
+
+ lls_parser = argparse.ArgumentParser(
+ description='List files in local system.')
+ lls_parser.add_argument('paths', help='Paths', action=path_to_bytes,
+ nargs='*')
+
+ @with_argparser(lls_parser)
+ def do_lls(self, args):
+ """
+ Lists all files and folders in the current local directory
+ """
+ if not args.paths:
+ print_list(os.listdir(os.getcwdb()))
+ else:
+ for path in args.paths:
+ try:
+ items = os.listdir(path)
+ poutput("{}:".format(path.decode('utf-8')))
+ print_list(items)
+ except OSError as e:
+ set_exit_code_msg(e.errno, f"{e.filename.decode('utf-8')}: "
+ f"{e.strerror}")
+ # Arguments to the with_argpaser decorator function are sticky.
+ # The items in args.path do not get overwritten in subsequent calls.
+ # The arguments remain in args.paths after the function exits and we
+ # neeed to clean it up to ensure the next call works as expected.
+ args.paths.clear()
+
+ def do_lpwd(self, arglist):
+ """
+ Prints the absolute path of the current local directory
+ """
+ poutput(os.getcwd())
+
+ def complete_df(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ df_parser = argparse.ArgumentParser(description='Show information about\
+ the amount of available disk space')
+ df_parser.add_argument('file', help='Name of the file', nargs='*',
+ default=['.'], action=path_to_bytes)
+
+ @with_argparser(df_parser)
+ def do_df(self, arglist):
+ """
+ Display the amount of available disk space for file systems
+ """
+ header = True # Set to true for printing header only once
+ if b'.' == arglist.file[0]:
+ arglist.file = ls(b'.')
+
+ for file in arglist.file:
+ if isinstance(file, libcephfs.DirEntry):
+ file = file.d_name
+ if file == b'.' or file == b'..':
+ continue
+ try:
+ statfs = cephfs.statfs(file)
+ stat = cephfs.stat(file)
+ block_size = (statfs['f_blocks'] * statfs['f_bsize']) // 1024
+ available = block_size - stat.st_size
+ use = 0
+
+ if block_size > 0:
+ use = (stat.st_size * 100) // block_size
+
+ if header:
+ header = False
+ poutput('{:25s}\t{:5s}\t{:15s}{:10s}{}'.format(
+ "1K-blocks", "Used", "Available", "Use%",
+ "Stored on"))
+
+ poutput('{:d}\t{:18d}\t{:8d}\t{:10s} {}'.format(block_size,
+ stat.st_size, available, str(int(use)) + '%',
+ file.decode('utf-8')))
+ except libcephfs.OSError as e:
+ set_exit_code_msg(e.get_error_code(), "could not statfs {}: {}".format(
+ file.decode('utf-8'), e.strerror))
+
+ locate_parser = argparse.ArgumentParser(
+ description='Find file within file system')
+ locate_parser.add_argument('name', help='name', type=str,
+ action=path_to_bytes)
+ locate_parser.add_argument('-c', '--count', action='store_true',
+ help='Count list of items located.')
+ locate_parser.add_argument(
+ '-i', '--ignorecase', action='store_true', help='Ignore case')
+
+ @with_argparser(locate_parser)
+ def do_locate(self, args):
+ """
+ Find a file within the File System
+ """
+ if args.name.count(b'*') == 1:
+ if args.name[0] == b'*':
+ args.name += b'/'
+ elif args.name[-1] == '*':
+ args.name = b'/' + args.name
+ args.name = args.name.replace(b'*', b'')
+ if args.ignorecase:
+ locations = locate_file(args.name, False)
+ else:
+ locations = locate_file(args.name)
+ if args.count:
+ poutput(len(locations))
+ else:
+ poutput((b'\n'.join(locations)).decode('utf-8'))
+
+ def complete_du(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ du_parser = argparse.ArgumentParser(
+ description='Disk Usage of a Directory')
+ du_parser.add_argument('paths', type=str, action=get_list_of_bytes_path,
+ help='Name of the directory.', nargs='*',
+ default=[b'.'])
+ du_parser.add_argument('-r', action='store_true',
+ help='Recursive Disk usage of all directories.')
+
+ @with_argparser(du_parser)
+ def do_du(self, args):
+ """
+ Print disk usage of a given path(s).
+ """
+ def print_disk_usage(files):
+ if isinstance(files, bytes):
+ files = (files, )
+
+ for f in files:
+ try:
+ st = cephfs.lstat(f)
+
+ if stat.S_ISDIR(st.st_mode):
+ dusage = int(cephfs.getxattr(f,
+ 'ceph.dir.rbytes').decode('utf-8'))
+ else:
+ dusage = st.st_size
+
+ # print path in local context
+ f = os.path.normpath(f)
+ if f[0] is ord('/'):
+ f = b'.' + f
+ poutput('{:10s} {}'.format(humansize(dusage),
+ f.decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+ continue
+
+ for path in args.paths:
+ if args.r:
+ print_disk_usage(sorted(set(dirwalk(path)).union({path})))
+ else:
+ print_disk_usage(path)
+
+ quota_parser = argparse.ArgumentParser(
+ description='Quota management for a Directory')
+ quota_parser.add_argument('op', choices=['get', 'set'],
+ help='Quota operation type.')
+ quota_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the directory.')
+ quota_parser.add_argument('--max_bytes', type=int, default=-1, nargs='?',
+ help='Max cumulative size of the data under '
+ 'this directory.')
+ quota_parser.add_argument('--max_files', type=int, default=-1, nargs='?',
+ help='Total number of files under this '
+ 'directory tree.')
+
+ @with_argparser(quota_parser)
+ def do_quota(self, args):
+ """
+ Quota management.
+ """
+ if not is_dir_exists(args.path):
+ set_exit_code_msg(errno.ENOENT, 'error: no such directory {}'.format(
+ args.path.decode('utf-8')))
+ return
+
+ if args.op == 'set':
+ if (args.max_bytes == -1) and (args.max_files == -1):
+ set_exit_code_msg(errno.EINVAL, 'please specify either '
+ '--max_bytes or --max_files or both')
+ return
+
+ if args.max_bytes >= 0:
+ max_bytes = to_bytes(str(args.max_bytes))
+ try:
+ cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
+ max_bytes, os.XATTR_CREATE)
+ poutput('max_bytes set to %d' % args.max_bytes)
+ except libcephfs.Error as e:
+ cephfs.setxattr(args.path, 'ceph.quota.max_bytes',
+ max_bytes, os.XATTR_REPLACE)
+ set_exit_code_msg(e.get_error_code(), 'max_bytes reset to '
+ f'{args.max_bytes}')
+
+ if args.max_files >= 0:
+ max_files = to_bytes(str(args.max_files))
+ try:
+ cephfs.setxattr(args.path, 'ceph.quota.max_files',
+ max_files, os.XATTR_CREATE)
+ poutput('max_files set to %d' % args.max_files)
+ except libcephfs.Error as e:
+ cephfs.setxattr(args.path, 'ceph.quota.max_files',
+ max_files, os.XATTR_REPLACE)
+ set_exit_code_msg(e.get_error_code(), 'max_files reset to '
+ f'{args.max_files}')
+ elif args.op == 'get':
+ max_bytes = '0'
+ max_files = '0'
+ try:
+ max_bytes = cephfs.getxattr(args.path, 'ceph.quota.max_bytes')
+ poutput('max_bytes: {}'.format(max_bytes.decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(e.get_error_code(), 'max_bytes is not set')
+
+ try:
+ max_files = cephfs.getxattr(args.path, 'ceph.quota.max_files')
+ poutput('max_files: {}'.format(max_files.decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(e.get_error_code(), 'max_files is not set')
+
+ snap_parser = argparse.ArgumentParser(description='Snapshot Management')
+ snap_parser.add_argument('op', type=str,
+ help='Snapshot operation: create or delete')
+ snap_parser.add_argument('name', type=str, action=path_to_bytes,
+ help='Name of snapshot')
+ snap_parser.add_argument('dir', type=str, action=path_to_bytes,
+ help='Directory for which snapshot '
+ 'needs to be created or deleted')
+
+ @with_argparser(snap_parser)
+ def do_snap(self, args):
+ """
+ Snapshot management for the volume
+ """
+ # setting self.colors to None turns off colorizing and
+ # perror emits plain text
+ self.colors = None
+
+ snapdir = '.snap'
+ conf_snapdir = cephfs.conf_get('client_snapdir')
+ if conf_snapdir is not None:
+ snapdir = conf_snapdir
+ snapdir = to_bytes(snapdir)
+ if args.op == 'create':
+ try:
+ if is_dir_exists(args.dir):
+ cephfs.mkdir(os.path.join(args.dir, snapdir, args.name), 0o755)
+ else:
+ set_exit_code_msg(errno.ENOENT, "'{}': no such directory".format(
+ args.dir.decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(e.get_error_code(),
+ "snapshot '{}' already exists".format(
+ args.name.decode('utf-8')))
+ elif args.op == 'delete':
+ snap_dir = os.path.join(args.dir, snapdir, args.name)
+ try:
+ if is_dir_exists(snap_dir):
+ newargs = argparse.Namespace(paths=[snap_dir], parent=False)
+ self.do_rmdir_helper(newargs)
+ else:
+ set_exit_code_msg(errno.ENOENT, "'{}': no such snapshot".format(
+ args.name.decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(e.get_error_code(), "error while deleting "
+ "'{}'".format(snap_dir.decode('utf-8')))
+ else:
+ set_exit_code_msg(errno.EINVAL, "snapshot can only be created or "
+ "deleted; check - help snap")
+
+ def do_help(self, line):
+ """
+ Get details about a command.
+ Usage: help <cmd> - for a specific command
+ help all - for all the commands
+ """
+ if line == 'all':
+ for k in dir(self):
+ if k.startswith('do_'):
+ poutput('-' * 80)
+ super().do_help(k[3:])
+ return
+ parser = self.create_argparser(line)
+ if parser:
+ parser.print_help()
+ else:
+ super().do_help(line)
+
+ def complete_stat(self, text, line, begidx, endidx):
+ """
+ auto complete of file name.
+ """
+ return self.complete_filenames(text, line, begidx, endidx)
+
+ stat_parser = argparse.ArgumentParser(
+ description='Display file or file system status')
+ stat_parser.add_argument('paths', type=str, help='file paths',
+ action=path_to_bytes, nargs='+')
+
+ @with_argparser(stat_parser)
+ def do_stat(self, args):
+ """
+ Display file or file system status
+ """
+ for path in args.paths:
+ try:
+ stat = cephfs.stat(path)
+ atime = stat.st_atime.isoformat(' ')
+ mtime = stat.st_mtime.isoformat(' ')
+ ctime = stat.st_mtime.isoformat(' ')
+
+ poutput("File: {}\nSize: {:d}\nBlocks: {:d}\nIO Block: {:d}\n"
+ "Device: {:d}\tInode: {:d}\tLinks: {:d}\nPermission: "
+ "{:o}/{}\tUid: {:d}\tGid: {:d}\nAccess: {}\nModify: "
+ "{}\nChange: {}".format(path.decode('utf-8'),
+ stat.st_size, stat.st_blocks,
+ stat.st_blksize, stat.st_dev,
+ stat.st_ino, stat.st_nlink,
+ stat.st_mode,
+ mode_notation(stat.st_mode),
+ stat.st_uid, stat.st_gid, atime,
+ mtime, ctime))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ setxattr_parser = argparse.ArgumentParser(
+ description='Set extended attribute for a file')
+ setxattr_parser.add_argument('path', type=str, action=path_to_bytes, help='Name of the file')
+ setxattr_parser.add_argument('name', type=str, help='Extended attribute name')
+ setxattr_parser.add_argument('value', type=str, help='Extended attribute value')
+
+ @with_argparser(setxattr_parser)
+ def do_setxattr(self, args):
+ """
+ Set extended attribute for a file
+ """
+ val_bytes = to_bytes(args.value)
+ name_bytes = to_bytes(args.name)
+ try:
+ cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_CREATE)
+ poutput('{} is successfully set to {}'.format(args.name, args.value))
+ except libcephfs.ObjectExists:
+ cephfs.setxattr(args.path, name_bytes, val_bytes, os.XATTR_REPLACE)
+ poutput('{} is successfully reset to {}'.format(args.name, args.value))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ getxattr_parser = argparse.ArgumentParser(
+ description='Get extended attribute set for a file')
+ getxattr_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the file')
+ getxattr_parser.add_argument('name', type=str, help='Extended attribute name')
+
+ @with_argparser(getxattr_parser)
+ def do_getxattr(self, args):
+ """
+ Get extended attribute for a file
+ """
+ try:
+ poutput('{}'.format(cephfs.getxattr(args.path,
+ to_bytes(args.name)).decode('utf-8')))
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+ listxattr_parser = argparse.ArgumentParser(
+ description='List extended attributes set for a file')
+ listxattr_parser.add_argument('path', type=str, action=path_to_bytes,
+ help='Name of the file')
+
+ @with_argparser(listxattr_parser)
+ def do_listxattr(self, args):
+ """
+ List extended attributes for a file
+ """
+ try:
+ size, xattr_list = cephfs.listxattr(args.path)
+ if size > 0:
+ poutput('{}'.format(xattr_list.replace(b'\x00', b' ').decode('utf-8')))
+ else:
+ poutput('No extended attribute is set')
+ except libcephfs.Error as e:
+ set_exit_code_msg(msg=e)
+
+
+#######################################################
+#
+# Following are methods that get cephfs-shell started.
+#
+#####################################################
+
+def setup_cephfs():
+ """
+ Mounting a cephfs
+ """
+ global cephfs
+ try:
+ cephfs = libcephfs.LibCephFS(conffile='')
+ cephfs.mount()
+ except libcephfs.ObjectNotFound as e:
+ print('couldn\'t find ceph configuration not found')
+ sys.exit(e.get_error_code())
+ except libcephfs.Error as e:
+ print(e)
+ sys.exit(e.get_error_code())
+
+
+def str_to_bool(val):
+ """
+ Return corresponding bool values for strings like 'true' or 'false'.
+ """
+ if not isinstance(val, str):
+ return val
+
+ val = val.replace('\n', '')
+ if val.lower() in ['true', 'yes']:
+ return True
+ elif val.lower() in ['false', 'no']:
+ return False
+ else:
+ return val
+
+
+def read_shell_conf(shell, shell_conf_file):
+ import configparser
+
+ sec = 'cephfs-shell'
+ opts = []
+ if LooseVersion(cmd2_version) >= LooseVersion("0.10.0"):
+ for attr in shell.settables.keys():
+ opts.append(attr)
+ else:
+ if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"):
+ # hardcoding options for 0.7.9 because -
+ # 1. we use cmd2 v0.7.9 with teuthology and
+ # 2. there's no way distinguish between a shell setting and shell
+ # object attribute until v0.10.0
+ opts = ['abbrev', 'autorun_on_edit', 'colors',
+ 'continuation_prompt', 'debug', 'echo', 'editor',
+ 'feedback_to_output', 'locals_in_py', 'prompt', 'quiet',
+ 'timing']
+ elif LooseVersion(cmd2_version) >= LooseVersion("0.9.23"):
+ opts.append('allow_style')
+ # no equivalent option was defined by cmd2.
+ else:
+ pass
+
+ # default and only section in our conf file.
+ cp = configparser.ConfigParser(default_section=sec, strict=False)
+ cp.read(shell_conf_file)
+ for opt in opts:
+ if cp.has_option(sec, opt):
+ setattr(shell, opt, str_to_bool(cp.get(sec, opt)))
+
+
+def get_shell_conffile_path(arg_conf=''):
+ conf_filename = 'cephfs-shell.conf'
+ env_var = 'CEPHFS_SHELL_CONF'
+
+ arg_conf = '' if not arg_conf else arg_conf
+ home_dir_conf = os.path.expanduser('~/.' + conf_filename)
+ env_conf = os.environ[env_var] if env_var in os.environ else ''
+
+ # here's the priority by which conf gets read.
+ for path in (arg_conf, env_conf, home_dir_conf):
+ if os.path.isfile(path):
+ return path
+ else:
+ return ''
+
+
+def manage_args():
+ main_parser = argparse.ArgumentParser(description='')
+ main_parser.add_argument('-c', '--config', action='store',
+ help='Path to Ceph configuration file.',
+ type=str)
+ main_parser.add_argument('-b', '--batch', action='store',
+ help='Path to CephFS shell script/batch file'
+ 'containing CephFS shell commands',
+ type=str)
+ main_parser.add_argument('-t', '--test', action='store',
+ help='Test against transcript(s) in FILE',
+ nargs='+')
+ main_parser.add_argument('commands', nargs='*', help='Comma delimited '
+ 'commands. The shell executes the given command '
+ 'and quits immediately with the return value of '
+ 'command. In case no commands are provided, the '
+ 'shell is launched.', default=[])
+
+ args = main_parser.parse_args()
+ args.exe_and_quit = False # Execute and quit, don't launch the shell.
+
+ if args.batch:
+ if LooseVersion(cmd2_version) <= LooseVersion("0.9.13"):
+ args.commands = ['load ' + args.batch, ',quit']
+ else:
+ args.commands = ['run_script ' + args.batch, ',quit']
+ if args.test:
+ args.commands.extend(['-t,'] + [arg + ',' for arg in args.test])
+ if not args.batch and len(args.commands) > 0:
+ args.exe_and_quit = True
+
+ manage_sys_argv(args)
+
+ return args
+
+
+def manage_sys_argv(args):
+ exe = sys.argv[0]
+ sys.argv.clear()
+ sys.argv.append(exe)
+ sys.argv.extend([i.strip() for i in ' '.join(args.commands).split(',')])
+
+ setup_cephfs()
+
+
+def execute_cmd_args(args):
+ """
+ Launch a shell session if no arguments were passed, else just execute
+ the given argument as a shell command and exit the shell session
+ immediately at (last) command's termination with the (last) command's
+ return value.
+ """
+ if not args.exe_and_quit:
+ return shell.cmdloop()
+ return execute_cmds_and_quit(args)
+
+
+def execute_cmds_and_quit(args):
+ """
+ Multiple commands might be passed separated by commas, feed onecmd()
+ one command at a time.
+ """
+ # do_* methods triggered by cephfs-shell commands return None when they
+ # complete running successfully. Until 0.9.6, shell.onecmd() returned this
+ # value to indicate whether the execution of the commands should stop, but
+ # since 0.9.7 it returns the return value of do_* methods only if it's
+ # not None. When it is None it returns False instead of None.
+ if LooseVersion(cmd2_version) <= LooseVersion("0.9.6"):
+ stop_exec_val = None
+ else:
+ stop_exec_val = False
+
+ args_to_onecmd = ''
+ if len(args.commands) <= 1:
+ args.commands = args.commands[0].split(' ')
+ for cmdarg in args.commands:
+ if ',' in cmdarg:
+ args_to_onecmd += ' ' + cmdarg[0:-1]
+ onecmd_retval = shell.onecmd(args_to_onecmd)
+ # if the curent command failed, let's abort the execution of
+ # series of commands passed.
+ if onecmd_retval is not stop_exec_val:
+ return onecmd_retval
+ if shell.exit_code != 0:
+ return shell.exit_code
+
+ args_to_onecmd = ''
+ continue
+
+ args_to_onecmd += ' ' + cmdarg
+ return shell.onecmd(args_to_onecmd)
+
+
+if __name__ == '__main__':
+ args = manage_args()
+
+ shell = CephFSShell()
+ # TODO: perhaps, we should add an option to pass ceph.conf?
+ read_shell_conf(shell, get_shell_conffile_path(args.config))
+ # XXX: setting shell.exit_code to zero so that in case there are no errors
+ # and exceptions, it is not set by any method or function of cephfs-shell
+ # and return values from shell.cmdloop() or shell.onecmd() is not an
+ # integer, we can treat it as the return value of cephfs-shell.
+ shell.exit_code = 0
+
+ retval = execute_cmd_args(args)
+ sys.exit(retval if retval else shell.exit_code)
diff --git a/src/tools/cephfs/shell/setup.py b/src/tools/cephfs/shell/setup.py
new file mode 100644
index 000000000..8cf7f28f7
--- /dev/null
+++ b/src/tools/cephfs/shell/setup.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+__version__ = '0.0.1'
+
+setup(
+ name='cephfs-shell',
+ version=__version__,
+ description='Interactive shell for Ceph file system',
+ keywords='cephfs, shell',
+ scripts=['cephfs-shell'],
+ install_requires=[
+ 'cephfs',
+ 'cmd2',
+ 'colorama',
+ ],
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Environment :: Console',
+ 'Intended Audience :: System Administrators',
+ 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+ 'Operating System :: POSIX :: Linux',
+ 'Programming Language :: Python :: 3'
+ ],
+ license='LGPLv2+',
+)
diff --git a/src/tools/cephfs/shell/tox.ini b/src/tools/cephfs/shell/tox.ini
new file mode 100644
index 000000000..c1cbff051
--- /dev/null
+++ b/src/tools/cephfs/shell/tox.ini
@@ -0,0 +1,7 @@
+[tox]
+envlist = py3
+skipsdist = true
+
+[testenv:py3]
+deps = flake8
+commands = flake8 --ignore=W503 --max-line-length=100 cephfs-shell
diff --git a/src/tools/cephfs/top/CMakeLists.txt b/src/tools/cephfs/top/CMakeLists.txt
new file mode 100644
index 000000000..49750c850
--- /dev/null
+++ b/src/tools/cephfs/top/CMakeLists.txt
@@ -0,0 +1,7 @@
+include(Distutils)
+distutils_install_module(cephfs-top)
+
+if(WITH_TESTS)
+ include(AddCephTest)
+ add_tox_test(cephfs-top)
+endif()
diff --git a/src/tools/cephfs/top/cephfs-top b/src/tools/cephfs/top/cephfs-top
new file mode 100755
index 000000000..d57c3ab83
--- /dev/null
+++ b/src/tools/cephfs/top/cephfs-top
@@ -0,0 +1,888 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+import curses
+import errno
+import json
+import signal
+import time
+import math
+import threading
+
+from collections import OrderedDict
+from datetime import datetime
+from enum import Enum, unique
+
+import rados
+
+
+class FSTopException(Exception):
+ def __init__(self, msg=''):
+ self.error_msg = msg
+
+ def get_error_msg(self):
+ return self.error_msg
+
+
+@unique
+class MetricType(Enum):
+ METRIC_TYPE_NONE = 0
+ METRIC_TYPE_PERCENTAGE = 1
+ METRIC_TYPE_LATENCY = 2
+ METRIC_TYPE_SIZE = 3
+ METRIC_TYPE_STDEV = 4
+
+
+FS_TOP_PROG_STR = 'cephfs-top'
+FS_TOP_ALL_FS_APP = 'ALL_FS_APP'
+FS_TOP_FS_SELECTED_APP = 'SELECTED_FS_APP'
+
+# version match b/w fstop and stats emitted by mgr/stats
+FS_TOP_SUPPORTED_VER = 2
+
+ITEMS_PAD_LEN = 3
+ITEMS_PAD = " " * ITEMS_PAD_LEN
+DEFAULT_REFRESH_INTERVAL = 1
+# min refresh interval allowed
+MIN_REFRESH_INTERVAL = 0.5
+
+# metadata provided by mgr/stats
+FS_TOP_MAIN_WINDOW_COL_CLIENT_ID = "client_id"
+FS_TOP_MAIN_WINDOW_COL_MNT_ROOT = "mount_root"
+FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR = "mount_point@host/addr"
+
+MAIN_WINDOW_TOP_LINE_ITEMS_START = [ITEMS_PAD,
+ FS_TOP_MAIN_WINDOW_COL_CLIENT_ID,
+ FS_TOP_MAIN_WINDOW_COL_MNT_ROOT]
+MAIN_WINDOW_TOP_LINE_ITEMS_END = [FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR]
+
+MAIN_WINDOW_TOP_LINE_METRICS_LEGACY = ["READ_LATENCY",
+ "WRITE_LATENCY",
+ "METADATA_LATENCY"
+ ]
+
+# adjust this map according to stats version and maintain order
+# as emitted by mgr/stast
+MAIN_WINDOW_TOP_LINE_METRICS = OrderedDict([
+ ("CAP_HIT", MetricType.METRIC_TYPE_PERCENTAGE),
+ ("READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+ ("WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+ ("METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+ ("DENTRY_LEASE", MetricType.METRIC_TYPE_PERCENTAGE),
+ ("OPENED_FILES", MetricType.METRIC_TYPE_NONE),
+ ("PINNED_ICAPS", MetricType.METRIC_TYPE_NONE),
+ ("OPENED_INODES", MetricType.METRIC_TYPE_NONE),
+ ("READ_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
+ ("WRITE_IO_SIZES", MetricType.METRIC_TYPE_SIZE),
+ ("AVG_READ_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+ ("STDEV_READ_LATENCY", MetricType.METRIC_TYPE_STDEV),
+ ("AVG_WRITE_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+ ("STDEV_WRITE_LATENCY", MetricType.METRIC_TYPE_STDEV),
+ ("AVG_METADATA_LATENCY", MetricType.METRIC_TYPE_LATENCY),
+ ("STDEV_METADATA_LATENCY", MetricType.METRIC_TYPE_STDEV),
+])
+MGR_STATS_COUNTERS = list(MAIN_WINDOW_TOP_LINE_METRICS.keys())
+
+FS_TOP_VERSION_HEADER_FMT = '{prog_name} - {now}'
+FS_TOP_CLIENT_HEADER_FMT = 'Total Client(s): {num_clients} - '\
+ '{num_mounts} FUSE, {num_kclients} kclient, {num_libs} libcephfs'
+FS_TOP_NAME_TOPL_FMT = 'Filesystem: {fs_name} - {client_count} client(s)'
+
+CLIENT_METADATA_KEY = "client_metadata"
+CLIENT_METADATA_MOUNT_POINT_KEY = "mount_point"
+CLIENT_METADATA_MOUNT_ROOT_KEY = "root"
+CLIENT_METADATA_IP_KEY = "IP"
+CLIENT_METADATA_HOSTNAME_KEY = "hostname"
+CLIENT_METADATA_VALID_METRICS_KEY = "valid_metrics"
+
+GLOBAL_METRICS_KEY = "global_metrics"
+GLOBAL_COUNTERS_KEY = "global_counters"
+
+last_time = time.time()
+last_read_size = {}
+last_write_size = {}
+
+fs_list = []
+
+
+def calc_perc(c):
+ if c[0] == 0 and c[1] == 0:
+ return 0.0
+ return round((c[0] / (c[0] + c[1])) * 100, 2)
+
+
+def calc_lat(c):
+ return round(c[0] * 1000 + c[1] / 1000000, 2)
+
+
+def calc_stdev(c):
+ stdev = 0.0
+ if c[1] > 1:
+ stdev = math.sqrt(c[0] / (c[1] - 1)) / 1000000
+ return round(stdev, 2)
+
+
+# in MB
+def calc_size(c):
+ return round(c[1] / (1024 * 1024), 2)
+
+
+# in MB
+def calc_avg_size(c):
+ if c[0] == 0:
+ return 0.0
+ return round(c[1] / (c[0] * 1024 * 1024), 2)
+
+
+# in MB/s
+def calc_speed(size, duration):
+ if duration == 0:
+ return 0.0
+ return round(size / (duration * 1024 * 1024), 2)
+
+
+def wrap(s, sl):
+ """return a '+' suffixed wrapped string"""
+ if len(s) < sl:
+ return s
+ return f'{s[0:sl-1]}+'
+
+
+class FSTop(object):
+ def __init__(self, args):
+ self.rados = None
+ self.stdscr = None # curses instance
+ self.current_screen = ""
+ self.client_name = args.id
+ self.cluster_name = args.cluster
+ self.conffile = args.conffile
+ self.refresh_interval_secs = args.delay
+ self.PAD_HEIGHT = 10000 # height of the fstop_pad
+ self.PAD_WIDTH = 300 # width of the fstop_pad
+ self.exit_ev = threading.Event()
+
+ def handle_signal(self, signum, _):
+ self.exit_ev.set()
+
+ def init(self):
+ try:
+ if self.conffile:
+ r_rados = rados.Rados(rados_id=self.client_name,
+ clustername=self.cluster_name,
+ conffile=self.conffile)
+ else:
+ r_rados = rados.Rados(rados_id=self.client_name,
+ clustername=self.cluster_name)
+ r_rados.conf_read_file()
+ r_rados.connect()
+ self.rados = r_rados
+ except rados.Error as e:
+ if e.errno == errno.ENOENT:
+ raise FSTopException(f'cluster {self.cluster_name}'
+ ' does not exist')
+ else:
+ raise FSTopException(f'error connecting to cluster: {e}')
+ self.verify_perf_stats_support()
+ signal.signal(signal.SIGTERM, self.handle_signal)
+ signal.signal(signal.SIGINT, self.handle_signal)
+
+ def fini(self):
+ if self.rados:
+ self.rados.shutdown()
+ self.rados = None
+
+ def selftest(self):
+ stats_json = self.perf_stats_query()
+ if not stats_json['version'] == FS_TOP_SUPPORTED_VER:
+ raise FSTopException('perf stats version mismatch!')
+ missing = [m for m in stats_json["global_counters"]
+ if m.upper() not in MGR_STATS_COUNTERS]
+ if missing:
+ raise FSTopException('Cannot handle unknown metrics from'
+ f'\'ceph fs perf stats\': {missing}')
+
+ def get_fs_names(self):
+ mon_cmd = {'prefix': 'fs ls', 'format': 'json'}
+ try:
+ ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'')
+ except Exception as e:
+ raise FSTopException(f'Error in fs ls: {e}')
+ fs_map = json.loads(buf.decode('utf-8'))
+ global fs_list
+ fs_list.clear()
+ for filesystem in fs_map:
+ fs = filesystem['name']
+ fs_list.append(fs)
+ return fs_list
+
+ def setup_curses(self, win):
+ self.stdscr = win
+ self.stdscr.keypad(True)
+ curses.use_default_colors()
+ curses.start_color()
+ try:
+ curses.curs_set(0)
+ except curses.error:
+ # If the terminal do not support the visibility
+ # requested it will raise an exception
+ pass
+ self.fstop_pad = curses.newpad(self.PAD_HEIGHT, self.PAD_WIDTH)
+ self.run_all_display()
+
+ def display_fs_menu(self, stdscr, selected_row_idx):
+ stdscr.clear()
+ h, w = stdscr.getmaxyx()
+ global fs_list
+ if not fs_list:
+ title = ['No filesystem available',
+ 'Press "q" to go back to home (All Filesystem Info) screen']
+ pos_x1 = w // 2 - len(title[0]) // 2
+ pos_x2 = w // 2 - len(title[1]) // 2
+ stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD)
+ stdscr.addstr(3, pos_x2, title[1])
+ else:
+ title = ['Filesystems', 'Press "q" to go back to home (All Filesystem Info) screen']
+ pos_x1 = w // 2 - len(title[0]) // 2
+ pos_x2 = w // 2 - len(title[1]) // 2
+ stdscr.addstr(1, pos_x1, title[0], curses.A_STANDOUT | curses.A_BOLD)
+ stdscr.addstr(3, pos_x2, title[1])
+ for index, name in enumerate(fs_list):
+ x = w // 2 - len(name) // 2
+ y = h // 2 - len(fs_list) // 2 + index
+ if index == selected_row_idx:
+ stdscr.attron(curses.color_pair(1))
+ stdscr.addstr(y, x, name)
+ stdscr.attroff(curses.color_pair(1))
+ else:
+ stdscr.addstr(y, x, name)
+ stdscr.refresh()
+
+ def set_key(self, stdscr):
+ curses.curs_set(0)
+ curses.init_pair(1, curses.COLOR_BLACK, curses.COLOR_WHITE)
+ curr_row = 0
+ key = 0
+ endmenu = False
+ while not endmenu:
+ global fs_list
+ fs_list = self.get_fs_names()
+
+ if key == curses.KEY_UP and curr_row > 0:
+ curr_row -= 1
+ elif key == curses.KEY_DOWN and curr_row < len(fs_list) - 1:
+ curr_row += 1
+ elif (key in [curses.KEY_ENTER, 10, 13]) and fs_list:
+ self.stdscr.erase()
+ self.run_display(fs_list[curr_row])
+ endmenu = True
+ elif key == ord('q'):
+ self.stdscr.erase()
+ self.run_all_display()
+ endmenu = True
+
+ try:
+ self.display_fs_menu(stdscr, curr_row)
+ except curses.error:
+ pass
+ curses.halfdelay(self.refresh_interval_secs)
+ key = stdscr.getch()
+
+ def set_option(self, opt):
+ if opt == ord('m'):
+ curses.wrapper(self.set_key)
+ elif opt == ord('q'):
+ if self.current_screen == FS_TOP_ALL_FS_APP:
+ quit()
+ else:
+ self.run_all_display()
+
+ def verify_perf_stats_support(self):
+ mon_cmd = {'prefix': 'mgr module ls', 'format': 'json'}
+ try:
+ ret, buf, out = self.rados.mon_command(json.dumps(mon_cmd), b'')
+ except Exception as e:
+ raise FSTopException(f'error checking \'stats\' module: {e}')
+ if ret != 0:
+ raise FSTopException(f'error checking \'stats\' module: {out}')
+ if 'stats' not in json.loads(buf.decode('utf-8'))['enabled_modules']:
+ raise FSTopException('\'stats\' module not enabled. Use'
+ '\'ceph mgr module enable stats\' to enable')
+
+ def perf_stats_query(self):
+ mgr_cmd = {'prefix': 'fs perf stats', 'format': 'json'}
+ try:
+ ret, buf, out = self.rados.mgr_command(json.dumps(mgr_cmd), b'')
+ except Exception as e:
+ raise FSTopException(f'error in \'perf stats\' query: {e}')
+ if ret != 0:
+ raise FSTopException(f'error in \'perf stats\' query: {out}')
+ return json.loads(buf.decode('utf-8'))
+
+ def items(self, item):
+ if item == "CAP_HIT":
+ return "chit"
+ if item == "READ_LATENCY":
+ return "rlat"
+ if item == "WRITE_LATENCY":
+ return "wlat"
+ if item == "METADATA_LATENCY":
+ return "mlat"
+ if item == "DENTRY_LEASE":
+ return "dlease"
+ if item == "OPENED_FILES":
+ return "ofiles"
+ if item == "PINNED_ICAPS":
+ return "oicaps"
+ if item == "OPENED_INODES":
+ return "oinodes"
+ if item == "READ_IO_SIZES":
+ return "rtio"
+ if item == "WRITE_IO_SIZES":
+ return "wtio"
+ if item == 'AVG_READ_LATENCY':
+ return 'rlatavg'
+ if item == 'STDEV_READ_LATENCY':
+ return 'rlatsd'
+ if item == 'AVG_WRITE_LATENCY':
+ return 'wlatavg'
+ if item == 'STDEV_WRITE_LATENCY':
+ return 'wlatsd'
+ if item == 'AVG_METADATA_LATENCY':
+ return 'mlatavg'
+ if item == 'STDEV_METADATA_LATENCY':
+ return 'mlatsd'
+ else:
+ # return empty string for none type
+ return ''
+
+ def mtype(self, typ):
+ if typ == MetricType.METRIC_TYPE_PERCENTAGE:
+ return "(%)"
+ elif typ == MetricType.METRIC_TYPE_LATENCY:
+ return "(ms)"
+ elif typ == MetricType.METRIC_TYPE_SIZE:
+ return "(MB)"
+ elif typ == MetricType.METRIC_TYPE_STDEV:
+ return "(ms)"
+ else:
+ # return empty string for none type
+ return ''
+
+ def avg_items(self, item):
+ if item == "READ_IO_SIZES":
+ return "raio"
+ if item == "WRITE_IO_SIZES":
+ return "waio"
+ else:
+ # return empty string for none type
+ return ''
+
+ def speed_items(self, item):
+ if item == "READ_IO_SIZES":
+ return "rsp"
+ if item == "WRITE_IO_SIZES":
+ return "wsp"
+ else:
+ # return empty string for none type
+ return ''
+
+ def speed_mtype(self, typ):
+ if typ == MetricType.METRIC_TYPE_SIZE:
+ return "(MB/s)"
+ else:
+ # return empty string for none type
+ return ''
+
+ @staticmethod
+ def has_metric(metadata, metrics_key):
+ return metrics_key in metadata
+
+ @staticmethod
+ def has_metrics(metadata, metrics_keys):
+ for key in metrics_keys:
+ if not FSTop.has_metric(metadata, key):
+ return False
+ return True
+
+ def create_top_line_and_build_coord(self):
+ xp = 0
+ x_coord_map = {}
+
+ heading = []
+ for item in MAIN_WINDOW_TOP_LINE_ITEMS_START:
+ heading.append(item)
+ nlen = len(item) + len(ITEMS_PAD)
+ x_coord_map[item] = (xp, nlen)
+ xp += nlen
+
+ for item, typ in MAIN_WINDOW_TOP_LINE_METRICS.items():
+ if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
+ continue
+ it = f'{self.items(item)}{self.mtype(typ)}'
+ heading.append(it)
+ nlen = len(it) + len(ITEMS_PAD)
+ x_coord_map[item] = (xp, nlen)
+ xp += nlen
+
+ if item == "READ_IO_SIZES" or item == "WRITE_IO_SIZES":
+ # average io sizes
+ it = f'{self.avg_items(item)}{self.mtype(typ)}'
+ heading.append(it)
+ nlen = len(it) + len(ITEMS_PAD)
+ if item == "READ_IO_SIZES":
+ x_coord_map["READ_IO_AVG"] = (xp, nlen)
+ if item == "WRITE_IO_SIZES":
+ x_coord_map["WRITE_IO_AVG"] = (xp, nlen)
+ xp += nlen
+
+ # io speeds
+ it = f'{self.speed_items(item)}{self.speed_mtype(typ)}'
+ heading.append(it)
+ nlen = len(it) + len(ITEMS_PAD)
+ if item == "READ_IO_SIZES":
+ x_coord_map["READ_IO_SPEED"] = (xp, nlen)
+ if item == "WRITE_IO_SIZES":
+ x_coord_map["WRITE_IO_SPEED"] = (xp, nlen)
+ xp += nlen
+
+ for item in MAIN_WINDOW_TOP_LINE_ITEMS_END:
+ heading.append(item)
+ nlen = len(item) + len(ITEMS_PAD)
+ x_coord_map[item] = (xp, nlen)
+ xp += nlen
+ title = ITEMS_PAD.join(heading)
+ self.fsstats.addstr(self.tablehead_y, 0, title, curses.A_STANDOUT | curses.A_BOLD)
+ return x_coord_map
+
+ def create_client(self, client_id, metrics, counters,
+ client_meta, x_coord_map, y_coord):
+ global last_time
+ size = 0
+ cur_time = time.time()
+ duration = cur_time - last_time
+ last_time = cur_time
+ for item in MAIN_WINDOW_TOP_LINE_ITEMS_START:
+ coord = x_coord_map[item]
+ hlen = coord[1] - 1
+ if item == FS_TOP_MAIN_WINDOW_COL_CLIENT_ID:
+ self.fsstats.addstr(y_coord, coord[0],
+ wrap(client_id.split('.')[1], hlen), curses.A_DIM)
+ elif item == FS_TOP_MAIN_WINDOW_COL_MNT_ROOT:
+ if FSTop.has_metric(client_meta,
+ CLIENT_METADATA_MOUNT_ROOT_KEY):
+ self.fsstats.addstr(
+ y_coord, coord[0],
+ wrap(client_meta[CLIENT_METADATA_MOUNT_ROOT_KEY], hlen), curses.A_DIM)
+ else:
+ self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM)
+
+ cidx = 0
+ for item in counters:
+ if item in MAIN_WINDOW_TOP_LINE_METRICS_LEGACY:
+ cidx += 1
+ continue
+ coord = x_coord_map[item]
+ m = metrics[cidx]
+ key = MGR_STATS_COUNTERS[cidx]
+ typ = MAIN_WINDOW_TOP_LINE_METRICS[key]
+ if item.lower() in client_meta.get(
+ CLIENT_METADATA_VALID_METRICS_KEY, []):
+ if typ == MetricType.METRIC_TYPE_PERCENTAGE:
+ self.fsstats.addstr(y_coord, coord[0],
+ f'{calc_perc(m)}', curses.A_DIM)
+ elif typ == MetricType.METRIC_TYPE_LATENCY:
+ self.fsstats.addstr(y_coord, coord[0],
+ f'{calc_lat(m)}', curses.A_DIM)
+ elif typ == MetricType.METRIC_TYPE_STDEV:
+ self.fsstats.addstr(y_coord, coord[0],
+ f'{calc_stdev(m)}', curses.A_DIM)
+ elif typ == MetricType.METRIC_TYPE_SIZE:
+ self.fsstats.addstr(y_coord, coord[0],
+ f'{calc_size(m)}', curses.A_DIM)
+
+ # average io sizes
+ if key == "READ_IO_SIZES":
+ coord = x_coord_map["READ_IO_AVG"]
+ elif key == "WRITE_IO_SIZES":
+ coord = x_coord_map["WRITE_IO_AVG"]
+ self.fsstats.addstr(y_coord, coord[0],
+ f'{calc_avg_size(m)}', curses.A_DIM)
+
+ # io speeds
+ if key == "READ_IO_SIZES":
+ coord = x_coord_map["READ_IO_SPEED"]
+ elif key == "WRITE_IO_SIZES":
+ coord = x_coord_map["WRITE_IO_SPEED"]
+ size = 0
+ if key == "READ_IO_SIZES":
+ if m[1] > 0:
+ global last_read_size
+ last_size = last_read_size.get(client_id, 0)
+ size = m[1] - last_size
+ last_read_size[client_id] = m[1]
+ if key == "WRITE_IO_SIZES":
+ if m[1] > 0:
+ global last_write_size
+ last_size = last_write_size.get(client_id, 0)
+ size = m[1] - last_size
+ last_write_size[client_id] = m[1]
+ self.fsstats.addstr(y_coord, coord[0],
+ f'{calc_speed(abs(size), duration)}', curses.A_DIM)
+ else:
+ # display 0th element from metric tuple
+ self.fsstats.addstr(y_coord, coord[0], f'{m[0]}', curses.A_DIM)
+ else:
+ self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM)
+ cidx += 1
+
+ for item in MAIN_WINDOW_TOP_LINE_ITEMS_END:
+ coord = x_coord_map[item]
+ wrapLen = self.PAD_WIDTH - coord[0]
+ # always place the FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR in the
+ # last, it will be a very long string to display
+ if item == FS_TOP_MAIN_WINDOW_COL_MNTPT_HOST_ADDR:
+ if FSTop.has_metrics(client_meta,
+ [CLIENT_METADATA_MOUNT_POINT_KEY,
+ CLIENT_METADATA_HOSTNAME_KEY,
+ CLIENT_METADATA_IP_KEY]):
+ mount_point = f'{client_meta[CLIENT_METADATA_MOUNT_POINT_KEY]}@'\
+ f'{client_meta[CLIENT_METADATA_HOSTNAME_KEY]}/'\
+ f'{client_meta[CLIENT_METADATA_IP_KEY]}'
+ self.fsstats.addstr(
+ y_coord, coord[0],
+ wrap(mount_point, wrapLen), curses.A_DIM)
+ else:
+ self.fsstats.addstr(y_coord, coord[0], "N/A", curses.A_DIM)
+
+ def create_clients(self, x_coord_map, stats_json, fs_name):
+ counters = [m.upper() for m in stats_json[GLOBAL_COUNTERS_KEY]]
+ self.tablehead_y += 2
+ res = stats_json[CLIENT_METADATA_KEY].get(fs_name, {})
+ client_cnt = len(res)
+ self.fsstats.addstr(self.tablehead_y, 0, FS_TOP_NAME_TOPL_FMT.format(
+ fs_name=fs_name, client_count=client_cnt), curses.A_BOLD | curses.A_ITALIC)
+ self.tablehead_y += 2
+ if client_cnt:
+ for client_id, metrics in \
+ stats_json[GLOBAL_METRICS_KEY][fs_name].items():
+ self.create_client(
+ client_id, metrics, counters, res[client_id],
+ x_coord_map, self.tablehead_y)
+ self.tablehead_y += 1
+
+ def create_header(self, stats_json, help, screen_title="", color_id=0):
+ num_clients, num_mounts, num_kclients, num_libs = 0, 0, 0, 0
+ if not stats_json['version'] == FS_TOP_SUPPORTED_VER:
+ self.header.addstr(0, 0, 'perf stats version mismatch!', curses.A_BOLD)
+ return False
+ global fs_list
+ for fs_name in fs_list:
+ client_metadata = stats_json[CLIENT_METADATA_KEY].get(fs_name, {})
+ client_cnt = len(client_metadata)
+ if client_cnt:
+ num_clients = num_clients + client_cnt
+ num_mounts = num_mounts + len(
+ [client for client, metadata in client_metadata.items() if
+ CLIENT_METADATA_MOUNT_POINT_KEY in metadata
+ and metadata[CLIENT_METADATA_MOUNT_POINT_KEY] != 'N/A'])
+ num_kclients = num_kclients + len(
+ [client for client, metadata in client_metadata.items() if
+ "kernel_version" in metadata])
+ num_libs = num_clients - (num_mounts + num_kclients)
+ now = datetime.now().ctime()
+ self.header.addstr(0, 0, FS_TOP_VERSION_HEADER_FMT.format(prog_name=FS_TOP_PROG_STR,
+ now=now), curses.A_BOLD)
+ self.header.addstr(2, 0, screen_title, curses.color_pair(color_id) | curses.A_BOLD)
+ self.header.addstr(3, 0, FS_TOP_CLIENT_HEADER_FMT.format(num_clients=num_clients,
+ num_mounts=num_mounts,
+ num_kclients=num_kclients,
+ num_libs=num_libs), curses.A_DIM)
+ self.header.addstr(4, 0, help, curses.A_DIM)
+ return True
+
+ def run_display(self, fs):
+ # clear the pads to have a smooth refresh
+ self.header.erase()
+ self.fsstats.erase()
+
+ self.current_screen = FS_TOP_FS_SELECTED_APP
+ screen_title = "Selected Filesystem Info"
+ help_commands = "Press 'q' to go back to home (All Filesystem Info) screen"\
+ " | Press 'm' to select another filesystem"
+ curses.init_pair(3, curses.COLOR_MAGENTA, -1)
+
+ top, left = 0, 0 # where to place pad
+ vscrollOffset, hscrollOffset = 0, 0 # scroll offsets
+
+ # calculate the initial viewport height and width
+ windowsize = self.stdscr.getmaxyx()
+ self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+
+ # create header subpad
+ self.header_height = 7
+ self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0)
+
+ # create fsstats subpad
+ fsstats_begin_y = self.header_height
+ fsstats_height = self.PAD_HEIGHT - self.header_height
+ self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0)
+
+ curses.halfdelay(1)
+ cmd = self.stdscr.getch()
+ while not self.exit_ev.is_set():
+ if cmd in [ord('m'), ord('q')]:
+ self.set_option(cmd)
+ self.exit_ev.set()
+
+ # header display
+ global fs_list
+ fs_list = self.get_fs_names()
+ stats_json = self.perf_stats_query()
+ vscrollEnd = 0
+ if fs not in fs_list:
+ help = "Error: The selected filesystem is not available now. " + help_commands
+ self.header.erase() # erase previous text
+ self.create_header(stats_json, help, screen_title, 3)
+ else:
+ self.tablehead_y = 0
+ help = "HELP: " + help_commands
+ self.fsstats.erase() # erase previous text
+
+ vscrollEnd = len(stats_json[CLIENT_METADATA_KEY].get(fs, {}))
+ if self.create_header(stats_json, help, screen_title, 3):
+ x_coord_map = self.create_top_line_and_build_coord()
+ self.create_clients(x_coord_map, stats_json, fs)
+
+ # scroll and refresh
+ if cmd == curses.KEY_DOWN:
+ if (vscrollEnd - vscrollOffset) > 1:
+ vscrollOffset += 1
+ else:
+ vscrollOffset = vscrollEnd
+ elif cmd == curses.KEY_UP:
+ if vscrollOffset > 0:
+ vscrollOffset -= 1
+ elif cmd == curses.KEY_NPAGE:
+ if (vscrollEnd - vscrollOffset) / 20 > 1:
+ vscrollOffset += 20
+ else:
+ vscrollOffset = vscrollEnd
+ elif cmd == curses.KEY_PPAGE:
+ if vscrollOffset / 20 >= 1:
+ vscrollOffset -= 20
+ else:
+ vscrollOffset = 0
+ elif cmd == curses.KEY_RIGHT:
+ if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1:
+ hscrollOffset += 1
+ elif cmd == curses.KEY_LEFT:
+ if hscrollOffset > 0:
+ hscrollOffset -= 1
+ elif cmd == curses.KEY_HOME:
+ hscrollOffset = 0
+ elif cmd == curses.KEY_END:
+ hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1
+ elif cmd == curses.KEY_RESIZE:
+ # terminal resize event. Update the viewport dimensions
+ windowsize = self.stdscr.getmaxyx()
+ self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+
+ if cmd:
+ try:
+ # refresh the viewport for the header portion
+ if cmd not in [curses.KEY_DOWN,
+ curses.KEY_UP,
+ curses.KEY_NPAGE,
+ curses.KEY_PPAGE,
+ curses.KEY_RIGHT,
+ curses.KEY_LEFT]:
+ self.fstop_pad.refresh(0, 0,
+ top, left,
+ top + self.header_height, left + self.viewportWidth)
+ # refresh the viewport for the current table header portion in the fsstats pad
+ if cmd not in [curses.KEY_DOWN,
+ curses.KEY_UP,
+ curses.KEY_NPAGE,
+ curses.KEY_PPAGE]:
+ self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset,
+ top + fsstats_begin_y, left,
+ 7, left + self.viewportWidth)
+ # refresh the viewport for the current client records portion in the fsstats pad
+ self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset,
+ top + fsstats_begin_y + 2, left,
+ top + self.viewportHeight, left + self.viewportWidth)
+ except curses.error:
+ # This happens when the user switches to a terminal of different zoom size.
+ # just retry it.
+ pass
+ # End scroll and refresh
+
+ curses.halfdelay(self.refresh_interval_secs * 10)
+ cmd = self.stdscr.getch()
+
+ def run_all_display(self):
+ # clear text from the previous screen
+ if self.current_screen == FS_TOP_FS_SELECTED_APP:
+ self.header.erase()
+
+ self.current_screen = FS_TOP_ALL_FS_APP
+ screen_title = "All Filesystem Info"
+ curses.init_pair(2, curses.COLOR_CYAN, -1)
+
+ top, left = 0, 0 # where to place pad
+ vscrollOffset, hscrollOffset = 0, 0 # scroll offsets
+
+ # calculate the initial viewport height and width
+ windowsize = self.stdscr.getmaxyx()
+ self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+
+ # create header subpad
+ self.header_height = 7
+ self.header = self.fstop_pad.subwin(self.header_height, self.viewportWidth, 0, 0)
+
+ # create fsstats subpad
+ fsstats_begin_y = self.header_height
+ fsstats_height = self.PAD_HEIGHT - self.header_height
+ self.fsstats = self.fstop_pad.subwin(fsstats_height, self.PAD_WIDTH, fsstats_begin_y, 0)
+
+ curses.halfdelay(1)
+ cmd = self.stdscr.getch()
+ while not self.exit_ev.is_set():
+ if cmd in [ord('m'), ord('q')]:
+ self.set_option(cmd)
+ self.exit_ev.set()
+
+ # header display
+ global fs_list
+ fs_list = self.get_fs_names()
+ stats_json = self.perf_stats_query()
+ vscrollEnd = 0
+ if not fs_list:
+ help = "INFO: No filesystem is available [Press 'q' to quit]"
+ self.header.erase() # erase previous text
+ self.fsstats.erase()
+ self.create_header(stats_json, help, screen_title, 2)
+ else:
+ self.tablehead_y = 0
+ help = "HELP: Press 'm' to select a filesystem | Press 'q' to quit"
+ self.fsstats.erase() # erase previous text
+ for index, fs in enumerate(fs_list):
+ # Get the vscrollEnd in advance
+ vscrollEnd += len(stats_json[CLIENT_METADATA_KEY].get(fs, {}))
+ if self.create_header(stats_json, help, screen_title, 2):
+ if not index: # do it only for the first fs
+ x_coord_map = self.create_top_line_and_build_coord()
+ self.create_clients(x_coord_map, stats_json, fs)
+
+ # scroll and refresh
+ if cmd == curses.KEY_DOWN:
+ if (vscrollEnd - vscrollOffset) > 1:
+ vscrollOffset += 1
+ else:
+ vscrollOffset = vscrollEnd
+ elif cmd == curses.KEY_UP:
+ if vscrollOffset > 0:
+ vscrollOffset -= 1
+ elif cmd == curses.KEY_NPAGE:
+ if (vscrollEnd - vscrollOffset) / 20 > 1:
+ vscrollOffset += 20
+ else:
+ vscrollOffset = vscrollEnd
+ elif cmd == curses.KEY_PPAGE:
+ if vscrollOffset / 20 >= 1:
+ vscrollOffset -= 20
+ else:
+ vscrollOffset = 0
+ elif cmd == curses.KEY_RIGHT:
+ if hscrollOffset < self.PAD_WIDTH - self.viewportWidth - 1:
+ hscrollOffset += 1
+ elif cmd == curses.KEY_LEFT:
+ if hscrollOffset > 0:
+ hscrollOffset -= 1
+ elif cmd == curses.KEY_HOME:
+ hscrollOffset = 0
+ elif cmd == curses.KEY_END:
+ hscrollOffset = self.PAD_WIDTH - self.viewportWidth - 1
+ elif cmd == curses.KEY_RESIZE:
+ # terminal resize event. Update the viewport dimensions
+ windowsize = self.stdscr.getmaxyx()
+ self.viewportHeight, self.viewportWidth = windowsize[0] - 1, windowsize[1] - 1
+ if cmd:
+ try:
+ # refresh the viewport for the header portion
+ if cmd not in [curses.KEY_DOWN,
+ curses.KEY_UP,
+ curses.KEY_NPAGE,
+ curses.KEY_PPAGE,
+ curses.KEY_RIGHT,
+ curses.KEY_LEFT]:
+ self.fstop_pad.refresh(0, 0,
+ top, left,
+ top + self.header_height, left + self.viewportWidth)
+ # refresh the viewport for the current table header portion in the fsstats pad
+ if cmd not in [curses.KEY_DOWN,
+ curses.KEY_UP,
+ curses.KEY_NPAGE,
+ curses.KEY_PPAGE]:
+ self.fstop_pad.refresh(fsstats_begin_y, hscrollOffset,
+ top + fsstats_begin_y, left,
+ 7, left + self.viewportWidth)
+ # refresh the viewport for the current client records portion in the fsstats pad
+ self.fstop_pad.refresh(fsstats_begin_y + 1 + vscrollOffset, hscrollOffset,
+ top + fsstats_begin_y + 2, left,
+ top + self.viewportHeight, left + self.viewportWidth)
+ except curses.error:
+ # This happens when the user switches to a terminal of different zoom size.
+ # just retry it.
+ pass
+ # End scroll and refresh
+
+ curses.halfdelay(self.refresh_interval_secs * 10)
+ cmd = self.stdscr.getch()
+# End class FSTop
+
+
+if __name__ == '__main__':
+ def float_greater_than(x):
+ value = float(x)
+ if value < MIN_REFRESH_INTERVAL:
+ raise argparse.ArgumentTypeError(
+ 'Refresh interval should be greater than or equal to'
+ f' {MIN_REFRESH_INTERVAL}')
+ return value
+
+ parser = argparse.ArgumentParser(description='Ceph Filesystem top utility')
+ parser.add_argument('--cluster', nargs='?', const='ceph', default='ceph',
+ help='Ceph cluster to connect (defualt: ceph)')
+ parser.add_argument('--id', nargs='?', const='fstop', default='fstop',
+ help='Ceph user to use to connection (default: fstop)')
+ parser.add_argument('--conffile', nargs='?', default=None,
+ help='Path to cluster configuration file')
+ parser.add_argument('--selftest', dest='selftest', action='store_true',
+ help='Run in selftest mode')
+ parser.add_argument('-d', '--delay', nargs='?',
+ default=DEFAULT_REFRESH_INTERVAL,
+ type=float_greater_than,
+ help='Refresh interval in seconds '
+ f'(default: {DEFAULT_REFRESH_INTERVAL})')
+
+ args = parser.parse_args()
+ err = False
+ ft = FSTop(args)
+ try:
+ ft.init()
+ if args.selftest:
+ ft.selftest()
+ sys.stdout.write("selftest ok\n")
+ else:
+ curses.wrapper(ft.setup_curses)
+ except FSTopException as fst:
+ err = True
+ sys.stderr.write(f'{fst.get_error_msg()}\n')
+ except Exception as e:
+ err = True
+ sys.stderr.write(f'exception: {e}\n')
+ finally:
+ ft.fini()
+ sys.exit(0 if not err else -1)
diff --git a/src/tools/cephfs/top/setup.py b/src/tools/cephfs/top/setup.py
new file mode 100644
index 000000000..92fbd964c
--- /dev/null
+++ b/src/tools/cephfs/top/setup.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+from setuptools import setup
+
+__version__ = '0.0.1'
+
+setup(
+ name='cephfs-top',
+ version=__version__,
+ description='top(1) like utility for Ceph Filesystem',
+ keywords='cephfs, top',
+ scripts=['cephfs-top'],
+ install_requires=[
+ 'rados',
+ ],
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Environment :: Console',
+ 'Intended Audience :: System Administrators',
+ 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)',
+ 'Operating System :: POSIX :: Linux',
+ 'Programming Language :: Python :: 3'
+ ],
+ license='LGPLv2+',
+)
diff --git a/src/tools/cephfs/top/tox.ini b/src/tools/cephfs/top/tox.ini
new file mode 100644
index 000000000..b125c0bc8
--- /dev/null
+++ b/src/tools/cephfs/top/tox.ini
@@ -0,0 +1,7 @@
+[tox]
+envlist = py3
+skipsdist = true
+
+[testenv:py3]
+deps = flake8
+commands = flake8 --ignore=W503 --max-line-length=100 cephfs-top
diff --git a/src/tools/cephfs/type_helper.hpp b/src/tools/cephfs/type_helper.hpp
new file mode 100644
index 000000000..06dfa0afe
--- /dev/null
+++ b/src/tools/cephfs/type_helper.hpp
@@ -0,0 +1,28 @@
+#ifndef TYPE_HELPER_HPP__
+#define TYPE_HELPER_HPP__
+
+template<typename T1, typename T2>
+T1 conv_t(T2 s){
+ T1 target;
+ std::stringstream conv;
+ conv << s;
+ conv >> target;
+ return target;
+}
+
+void string_split(std::string str, vector<string>& out, string split = ":"){
+ std::cout << str << std::endl;
+ auto pos = str.find(split);
+ while(pos != std::string::npos){
+ std::cout << str.substr(0, pos) << std::endl;
+ out.push_back(str.substr(0, pos));
+ if (str.size() > pos + split.size()){
+ str = str.substr(pos + split.size());
+ pos = str.find(split);
+ }else
+ return;
+ }
+ out.push_back(str.substr());
+ return;
+}
+#endif // TYPE_HELPER_HPP__
diff --git a/src/tools/cephfs_mirror/CMakeLists.txt b/src/tools/cephfs_mirror/CMakeLists.txt
new file mode 100644
index 000000000..4b6dea7a1
--- /dev/null
+++ b/src/tools/cephfs_mirror/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(cephfs_mirror_internal
+ ClusterWatcher.cc
+ Mirror.cc
+ FSMirror.cc
+ InstanceWatcher.cc
+ MirrorWatcher.cc
+ PeerReplayer.cc
+ ServiceDaemon.cc
+ Types.cc
+ Utils.cc
+ Watcher.cc
+ watcher/RewatchRequest.cc)
+
+add_executable(cephfs-mirror
+ main.cc)
+
+add_library(cephfs_mirror_internal STATIC
+ ${cephfs_mirror_internal})
+
+target_link_libraries(cephfs-mirror
+ cephfs_mirror_internal
+ global
+ ceph-common
+ cls_cephfs_client
+ librados
+ mds
+ cephfs
+ ${ALLOC_LIBS})
+
+install(TARGETS cephfs-mirror DESTINATION bin)
diff --git a/src/tools/cephfs_mirror/ClusterWatcher.cc b/src/tools/cephfs_mirror/ClusterWatcher.cc
new file mode 100644
index 000000000..b5f6f81d7
--- /dev/null
+++ b/src/tools/cephfs_mirror/ClusterWatcher.cc
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <mutex>
+#include <vector>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "mon/MonClient.h"
+
+#include "ClusterWatcher.h"
+#include "ServiceDaemon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::ClusterWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+ClusterWatcher::ClusterWatcher(CephContext *cct, MonClient *monc, ServiceDaemon *service_daemon,
+ Listener &listener)
+ : Dispatcher(cct),
+ m_monc(monc),
+ m_service_daemon(service_daemon),
+ m_listener(listener) {
+}
+
+ClusterWatcher::~ClusterWatcher() {
+}
+
+bool ClusterWatcher::ms_can_fast_dispatch2(const cref_t<Message> &m) const {
+ return m->get_type() == CEPH_MSG_FS_MAP;
+}
+
+void ClusterWatcher::ms_fast_dispatch2(const ref_t<Message> &m) {
+ bool handled = ms_dispatch2(m);
+ ceph_assert(handled);
+}
+
+bool ClusterWatcher::ms_dispatch2(const ref_t<Message> &m) {
+ if (m->get_type() == CEPH_MSG_FS_MAP) {
+ if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ handle_fsmap(ref_cast<MFSMap>(m));
+ }
+ return true;
+ }
+
+ return false;
+}
+
+int ClusterWatcher::init() {
+ dout(20) << dendl;
+
+ bool sub = m_monc->sub_want("fsmap", 0, 0);
+ if (!sub) {
+ derr << ": failed subscribing to FSMap" << dendl;
+ return -1;
+ }
+
+ m_monc->renew_subs();
+ dout(10) << ": subscribed to FSMap" << dendl;
+ return 0;
+}
+
+void ClusterWatcher::shutdown() {
+ dout(20) << dendl;
+ std::scoped_lock locker(m_lock);
+ m_stopping = true;
+ m_monc->sub_unwant("fsmap");
+}
+
+void ClusterWatcher::handle_fsmap(const cref_t<MFSMap> &m) {
+ dout(20) << dendl;
+
+ auto fsmap = m->get_fsmap();
+ auto filesystems = fsmap.get_filesystems();
+
+ std::vector<Filesystem> mirroring_enabled;
+ std::vector<Filesystem> mirroring_disabled;
+ std::map<Filesystem, Peers> peers_added;
+ std::map<Filesystem, Peers> peers_removed;
+ std::map<Filesystem, uint64_t> fs_metadata_pools;
+ {
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ return;
+ }
+
+ // deleted filesystems are considered mirroring disabled
+ for (auto it = m_filesystem_peers.begin(); it != m_filesystem_peers.end();) {
+ if (!fsmap.filesystem_exists(it->first.fscid)) {
+ mirroring_disabled.emplace_back(it->first);
+ it = m_filesystem_peers.erase(it);
+ continue;
+ }
+ ++it;
+ }
+
+ for (auto &filesystem : filesystems) {
+ auto fs = Filesystem{filesystem->fscid,
+ std::string(filesystem->mds_map.get_fs_name())};
+ auto pool_id = filesystem->mds_map.get_metadata_pool();
+ auto &mirror_info = filesystem->mirror_info;
+
+ if (!mirror_info.is_mirrored()) {
+ auto it = m_filesystem_peers.find(fs);
+ if (it != m_filesystem_peers.end()) {
+ mirroring_disabled.emplace_back(fs);
+ m_filesystem_peers.erase(it);
+ }
+ } else {
+ auto [fspeersit, enabled] = m_filesystem_peers.emplace(fs, Peers{});
+ auto &peers = fspeersit->second;
+
+ if (enabled) {
+ mirroring_enabled.emplace_back(fs);
+ fs_metadata_pools.emplace(fs, pool_id);
+ }
+
+ // peers added
+ Peers added;
+ std::set_difference(mirror_info.peers.begin(), mirror_info.peers.end(),
+ peers.begin(), peers.end(), std::inserter(added, added.end()));
+
+ // peers removed
+ Peers removed;
+ std::set_difference(peers.begin(), peers.end(),
+ mirror_info.peers.begin(), mirror_info.peers.end(),
+ std::inserter(removed, removed.end()));
+
+ // update set
+ if (!added.empty()) {
+ peers_added.emplace(fs, added);
+ peers.insert(added.begin(), added.end());
+ }
+ if (!removed.empty()) {
+ peers_removed.emplace(fs, removed);
+ for (auto &p : removed) {
+ peers.erase(p);
+ }
+ }
+ }
+ }
+ }
+
+ dout(5) << ": mirroring enabled=" << mirroring_enabled << ", mirroring_disabled="
+ << mirroring_disabled << dendl;
+ for (auto &fs : mirroring_enabled) {
+ m_service_daemon->add_filesystem(fs.fscid, fs.fs_name);
+ m_listener.handle_mirroring_enabled(FilesystemSpec(fs, fs_metadata_pools.at(fs)));
+ }
+ for (auto &fs : mirroring_disabled) {
+ m_service_daemon->remove_filesystem(fs.fscid);
+ m_listener.handle_mirroring_disabled(fs);
+ }
+
+ dout(5) << ": peers added=" << peers_added << ", peers removed=" << peers_removed << dendl;
+
+ for (auto &[fs, peers] : peers_added) {
+ for (auto &peer : peers) {
+ m_service_daemon->add_peer(fs.fscid, peer);
+ m_listener.handle_peers_added(fs, peer);
+ }
+ }
+ for (auto &[fs, peers] : peers_removed) {
+ for (auto &peer : peers) {
+ m_service_daemon->remove_peer(fs.fscid, peer);
+ m_listener.handle_peers_removed(fs, peer);
+ }
+ }
+
+ std::scoped_lock locker(m_lock);
+ if (!m_stopping) {
+ m_monc->sub_got("fsmap", fsmap.get_epoch());
+ } // else we have already done a sub_unwant()
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/ClusterWatcher.h b/src/tools/cephfs_mirror/ClusterWatcher.h
new file mode 100644
index 000000000..a418898f5
--- /dev/null
+++ b/src/tools/cephfs_mirror/ClusterWatcher.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_CLUSTER_WATCHER_H
+#define CEPHFS_MIRROR_CLUSTER_WATCHER_H
+
+#include <map>
+
+#include "common/ceph_mutex.h"
+#include "common/async/context_pool.h"
+#include "messages/MFSMap.h"
+#include "msg/Dispatcher.h"
+#include "Types.h"
+
+class MonClient;
+
+namespace cephfs {
+namespace mirror {
+
+class ServiceDaemon;
+
+// watch peer changes for filesystems via FSMap updates
+
+class ClusterWatcher : public Dispatcher {
+public:
+ struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void handle_mirroring_enabled(const FilesystemSpec &spec) = 0;
+ virtual void handle_mirroring_disabled(const Filesystem &filesystem) = 0;
+
+ virtual void handle_peers_added(const Filesystem &filesystem, const Peer &peer) = 0;
+ virtual void handle_peers_removed(const Filesystem &filesystem, const Peer &peer) = 0;
+ };
+
+ ClusterWatcher(CephContext *cct, MonClient *monc, ServiceDaemon *service_daemon,
+ Listener &listener);
+ ~ClusterWatcher();
+
+ bool ms_can_fast_dispatch_any() const override {
+ return true;
+ }
+ bool ms_can_fast_dispatch2(const cref_t<Message> &m) const override;
+ void ms_fast_dispatch2(const ref_t<Message> &m) override;
+ bool ms_dispatch2(const ref_t<Message> &m) override;
+
+ void ms_handle_connect(Connection *c) override {
+ }
+ bool ms_handle_reset(Connection *c) override {
+ return false;
+ }
+ void ms_handle_remote_reset(Connection *c) override {
+ }
+ bool ms_handle_refused(Connection *c) override {
+ return false;
+ }
+
+ int init();
+ void shutdown();
+
+private:
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::cluster_watcher");
+ MonClient *m_monc;
+ ServiceDaemon *m_service_daemon;
+ Listener &m_listener;
+
+ bool m_stopping = false;
+ std::map<Filesystem, Peers> m_filesystem_peers;
+
+ void handle_fsmap(const cref_t<MFSMap> &m);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_CLUSTER_WATCHER_H
diff --git a/src/tools/cephfs_mirror/FSMirror.cc b/src/tools/cephfs_mirror/FSMirror.cc
new file mode 100644
index 000000000..76dcc11f6
--- /dev/null
+++ b/src/tools/cephfs_mirror/FSMirror.cc
@@ -0,0 +1,441 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "msg/Messenger.h"
+#include "FSMirror.h"
+#include "PeerReplayer.h"
+#include "aio_utils.h"
+#include "ServiceDaemon.h"
+#include "Utils.h"
+
+#include "common/Cond.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::FSMirror " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string SERVICE_DAEMON_DIR_COUNT_KEY("directory_count");
+const std::string SERVICE_DAEMON_PEER_INIT_FAILED_KEY("peer_init_failed");
+
+class MirrorAdminSocketCommand {
+public:
+ virtual ~MirrorAdminSocketCommand() {
+ }
+ virtual int call(Formatter *f) = 0;
+};
+
+class StatusCommand : public MirrorAdminSocketCommand {
+public:
+ explicit StatusCommand(FSMirror *fs_mirror)
+ : fs_mirror(fs_mirror) {
+ }
+
+ int call(Formatter *f) override {
+ fs_mirror->mirror_status(f);
+ return 0;
+ }
+
+private:
+ FSMirror *fs_mirror;
+};
+
+} // anonymous namespace
+
+class MirrorAdminSocketHook : public AdminSocketHook {
+public:
+ MirrorAdminSocketHook(CephContext *cct, const Filesystem &filesystem, FSMirror *fs_mirror)
+ : admin_socket(cct->get_admin_socket()) {
+ int r;
+ std::string cmd;
+
+ // mirror status format is name@fscid
+ cmd = "fs mirror status " + stringify(filesystem.fs_name) + "@" + stringify(filesystem.fscid);
+ r = admin_socket->register_command(
+ cmd, this, "get filesystem mirror status");
+ if (r == 0) {
+ commands[cmd] = new StatusCommand(fs_mirror);
+ }
+ }
+
+ ~MirrorAdminSocketHook() override {
+ admin_socket->unregister_commands(this);
+ for (auto &[command, cmdptr] : commands) {
+ delete cmdptr;
+ }
+ }
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f, std::ostream &errss, bufferlist &out) override {
+ auto p = commands.at(std::string(command));
+ return p->call(f);
+ }
+
+private:
+ typedef std::map<std::string, MirrorAdminSocketCommand*, std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+FSMirror::FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool_id,
+ ServiceDaemon *service_daemon, std::vector<const char*> args,
+ ContextWQ *work_queue)
+ : m_cct(cct),
+ m_filesystem(filesystem),
+ m_pool_id(pool_id),
+ m_service_daemon(service_daemon),
+ m_args(args),
+ m_work_queue(work_queue),
+ m_snap_listener(this),
+ m_asok_hook(new MirrorAdminSocketHook(cct, filesystem, this)) {
+ m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
+ (uint64_t)0);
+}
+
+FSMirror::~FSMirror() {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ delete m_instance_watcher;
+ delete m_mirror_watcher;
+ }
+ // outside the lock so that in-progress commands can acquire
+ // lock and finish executing.
+ delete m_asok_hook;
+}
+
+int FSMirror::init_replayer(PeerReplayer *peer_replayer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+ return peer_replayer->init();
+}
+
+void FSMirror::shutdown_replayer(PeerReplayer *peer_replayer) {
+ peer_replayer->shutdown();
+}
+
+void FSMirror::cleanup() {
+ dout(20) << dendl;
+ ceph_unmount(m_mount);
+ ceph_release(m_mount);
+ m_ioctx.close();
+ m_cluster.reset();
+}
+
+void FSMirror::reopen_logs() {
+ std::scoped_lock locker(m_lock);
+
+ if (m_cluster) {
+ reinterpret_cast<CephContext *>(m_cluster->cct())->reopen_logs();
+ }
+ for (auto &[peer, replayer] : m_peer_replayers) {
+ replayer->reopen_logs();
+ }
+}
+
+void FSMirror::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ int r = connect(g_ceph_context->_conf->name.to_str(),
+ g_ceph_context->_conf->cluster, &m_cluster, "", "", m_args);
+ if (r < 0) {
+ m_init_failed = true;
+ on_finish->complete(r);
+ return;
+ }
+
+ r = m_cluster->ioctx_create2(m_pool_id, m_ioctx);
+ if (r < 0) {
+ m_init_failed = true;
+ m_cluster.reset();
+ derr << ": error accessing local pool (id=" << m_pool_id << "): "
+ << cpp_strerror(r) << dendl;
+ on_finish->complete(r);
+ return;
+ }
+
+ r = mount(m_cluster, m_filesystem, true, &m_mount);
+ if (r < 0) {
+ m_init_failed = true;
+ m_ioctx.close();
+ m_cluster.reset();
+ on_finish->complete(r);
+ return;
+ }
+
+ m_addrs = m_cluster->get_addrs();
+ dout(10) << ": rados addrs=" << m_addrs << dendl;
+
+ init_instance_watcher(on_finish);
+}
+
+void FSMirror::shutdown(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_stopping = true;
+ if (m_on_init_finish != nullptr) {
+ dout(10) << ": delaying shutdown -- init in progress" << dendl;
+ m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+ if (r < 0) {
+ on_finish->complete(0);
+ return;
+ }
+ m_on_shutdown_finish = on_finish;
+ shutdown_peer_replayers();
+ });
+ return;
+ }
+
+ m_on_shutdown_finish = on_finish;
+ }
+
+ shutdown_peer_replayers();
+}
+
+void FSMirror::shutdown_peer_replayers() {
+ dout(20) << dendl;
+
+ for (auto &[peer, peer_replayer] : m_peer_replayers) {
+ dout(5) << ": shutting down replayer for peer=" << peer << dendl;
+ shutdown_replayer(peer_replayer.get());
+ }
+ m_peer_replayers.clear();
+
+ shutdown_mirror_watcher();
+}
+
+void FSMirror::init_instance_watcher(Context *on_finish) {
+ dout(20) << dendl;
+
+ m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ m_init_failed = true;
+ }
+ }
+ on_finish->complete(r);
+ if (m_on_shutdown_finish != nullptr) {
+ m_on_shutdown_finish->complete(r);
+ }
+ });
+
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_init_instance_watcher>(this);
+ m_instance_watcher = InstanceWatcher::create(m_ioctx, m_snap_listener, m_work_queue);
+ m_instance_watcher->init(ctx);
+}
+
+void FSMirror::handle_init_instance_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ init_mirror_watcher();
+}
+
+void FSMirror::init_mirror_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_init_mirror_watcher>(this);
+ m_mirror_watcher = MirrorWatcher::create(m_ioctx, this, m_work_queue);
+ m_mirror_watcher->init(ctx);
+}
+
+void FSMirror::handle_init_mirror_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r == 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ m_retval = r; // save errcode for init context callback
+ shutdown_instance_watcher();
+}
+
+void FSMirror::shutdown_mirror_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_shutdown_mirror_watcher>(this);
+ m_mirror_watcher->shutdown(ctx);
+}
+
+void FSMirror::handle_shutdown_mirror_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ shutdown_instance_watcher();
+}
+
+void FSMirror::shutdown_instance_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *ctx = new C_CallbackAdapter<
+ FSMirror, &FSMirror::handle_shutdown_instance_watcher>(this);
+ m_instance_watcher->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, ctx));
+}
+
+void FSMirror::handle_shutdown_instance_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ cleanup();
+
+ Context *on_init_finish = nullptr;
+ Context *on_shutdown_finish = nullptr;
+
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(m_retval);
+ }
+ if (on_shutdown_finish != nullptr) {
+ on_shutdown_finish->complete(r);
+ }
+}
+
+void FSMirror::handle_acquire_directory(string_view dir_path) {
+ dout(5) << ": dir_path=" << dir_path << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_directories.emplace(dir_path);
+ m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
+ m_directories.size());
+
+ for (auto &[peer, peer_replayer] : m_peer_replayers) {
+ dout(10) << ": peer=" << peer << dendl;
+ peer_replayer->add_directory(dir_path);
+ }
+ }
+}
+
+void FSMirror::handle_release_directory(string_view dir_path) {
+ dout(5) << ": dir_path=" << dir_path << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto it = m_directories.find(dir_path);
+ if (it != m_directories.end()) {
+ m_directories.erase(it);
+ m_service_daemon->add_or_update_fs_attribute(m_filesystem.fscid, SERVICE_DAEMON_DIR_COUNT_KEY,
+ m_directories.size());
+ for (auto &[peer, peer_replayer] : m_peer_replayers) {
+ dout(10) << ": peer=" << peer << dendl;
+ peer_replayer->remove_directory(dir_path);
+ }
+ }
+ }
+}
+
+void FSMirror::add_peer(const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_all_peers.emplace(peer);
+ if (m_peer_replayers.find(peer) != m_peer_replayers.end()) {
+ return;
+ }
+
+ auto replayer = std::make_unique<PeerReplayer>(
+ m_cct, this, m_cluster, m_filesystem, peer, m_directories, m_mount, m_service_daemon);
+ int r = init_replayer(replayer.get());
+ if (r < 0) {
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, peer,
+ SERVICE_DAEMON_PEER_INIT_FAILED_KEY,
+ true);
+ return;
+ }
+ m_peer_replayers.emplace(peer, std::move(replayer));
+ ceph_assert(m_peer_replayers.size() == 1); // support only a single peer
+}
+
+void FSMirror::remove_peer(const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ std::unique_ptr<PeerReplayer> replayer;
+ {
+ std::scoped_lock locker(m_lock);
+ m_all_peers.erase(peer);
+ auto it = m_peer_replayers.find(peer);
+ if (it != m_peer_replayers.end()) {
+ replayer = std::move(it->second);
+ m_peer_replayers.erase(it);
+ }
+ }
+
+ if (replayer) {
+ dout(5) << ": shutting down replayers for peer=" << peer << dendl;
+ shutdown_replayer(replayer.get());
+ }
+}
+
+void FSMirror::mirror_status(Formatter *f) {
+ std::scoped_lock locker(m_lock);
+ f->open_object_section("status");
+ if (m_init_failed) {
+ f->dump_string("state", "failed");
+ } else if (is_blocklisted(locker)) {
+ f->dump_string("state", "blocklisted");
+ } else {
+ // dump rados addr for blocklist test
+ f->dump_string("rados_inst", m_addrs);
+ f->open_object_section("peers");
+ for ([[maybe_unused]] auto &[peer, peer_replayer] : m_peer_replayers) {
+ peer.dump(f);
+ }
+ f->close_section(); // peers
+ f->open_object_section("snap_dirs");
+ f->dump_int("dir_count", m_directories.size());
+ f->close_section(); // snap_dirs
+ }
+ f->close_section(); // status
+}
+
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/FSMirror.h b/src/tools/cephfs_mirror/FSMirror.h
new file mode 100644
index 000000000..bae5a38e1
--- /dev/null
+++ b/src/tools/cephfs_mirror/FSMirror.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_FS_MIRROR_H
+#define CEPHFS_MIRROR_FS_MIRROR_H
+
+#include "common/Formatter.h"
+#include "common/Thread.h"
+#include "mds/FSMap.h"
+#include "Types.h"
+#include "InstanceWatcher.h"
+#include "MirrorWatcher.h"
+
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+class MirrorAdminSocketHook;
+class PeerReplayer;
+class ServiceDaemon;
+
+// handle mirroring for a filesystem to a set of peers
+
+class FSMirror {
+public:
+ FSMirror(CephContext *cct, const Filesystem &filesystem, uint64_t pool_id,
+ ServiceDaemon *service_daemon, std::vector<const char*> args,
+ ContextWQ *work_queue);
+ ~FSMirror();
+
+ void init(Context *on_finish);
+ void shutdown(Context *on_finish);
+
+ void add_peer(const Peer &peer);
+ void remove_peer(const Peer &peer);
+
+ bool is_stopping() {
+ std::scoped_lock locker(m_lock);
+ return m_stopping;
+ }
+
+ bool is_init_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_init_failed;
+ }
+
+ bool is_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_init_failed ||
+ m_instance_watcher->is_failed() ||
+ m_mirror_watcher->is_failed();
+ }
+
+ bool is_blocklisted() {
+ std::scoped_lock locker(m_lock);
+ return is_blocklisted(locker);
+ }
+
+ Peers get_peers() {
+ std::scoped_lock locker(m_lock);
+ return m_all_peers;
+ }
+
+ std::string get_instance_addr() {
+ std::scoped_lock locker(m_lock);
+ return m_addrs;
+ }
+
+ // admin socket helpers
+ void mirror_status(Formatter *f);
+
+ void reopen_logs();
+
+private:
+ bool is_blocklisted(const std::scoped_lock<ceph::mutex> &locker) const {
+ bool blocklisted = false;
+ if (m_instance_watcher) {
+ blocklisted = m_instance_watcher->is_blocklisted();
+ }
+ if (m_mirror_watcher) {
+ blocklisted |= m_mirror_watcher->is_blocklisted();
+ }
+
+ return blocklisted;
+ }
+
+ struct SnapListener : public InstanceWatcher::Listener {
+ FSMirror *fs_mirror;
+
+ SnapListener(FSMirror *fs_mirror)
+ : fs_mirror(fs_mirror) {
+ }
+
+ void acquire_directory(string_view dir_path) override {
+ fs_mirror->handle_acquire_directory(dir_path);
+ }
+
+ void release_directory(string_view dir_path) override {
+ fs_mirror->handle_release_directory(dir_path);
+ }
+ };
+
+ CephContext *m_cct;
+ Filesystem m_filesystem;
+ uint64_t m_pool_id;
+ ServiceDaemon *m_service_daemon;
+ std::vector<const char *> m_args;
+ ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::fs_mirror");
+ SnapListener m_snap_listener;
+ std::set<std::string, std::less<>> m_directories;
+ Peers m_all_peers;
+ std::map<Peer, std::unique_ptr<PeerReplayer>> m_peer_replayers;
+
+ RadosRef m_cluster;
+ std::string m_addrs;
+ librados::IoCtx m_ioctx;
+ InstanceWatcher *m_instance_watcher = nullptr;
+ MirrorWatcher *m_mirror_watcher = nullptr;
+
+ int m_retval = 0;
+ bool m_stopping = false;
+ bool m_init_failed = false;
+ Context *m_on_init_finish = nullptr;
+ Context *m_on_shutdown_finish = nullptr;
+
+ MirrorAdminSocketHook *m_asok_hook = nullptr;
+
+ MountRef m_mount;
+
+ int init_replayer(PeerReplayer *peer_replayer);
+ void shutdown_replayer(PeerReplayer *peer_replayer);
+ void cleanup();
+
+ void init_instance_watcher(Context *on_finish);
+ void handle_init_instance_watcher(int r);
+
+ void init_mirror_watcher();
+ void handle_init_mirror_watcher(int r);
+
+ void shutdown_peer_replayers();
+
+ void shutdown_mirror_watcher();
+ void handle_shutdown_mirror_watcher(int r);
+
+ void shutdown_instance_watcher();
+ void handle_shutdown_instance_watcher(int r);
+
+ void handle_acquire_directory(string_view dir_path);
+ void handle_release_directory(string_view dir_path);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_FS_MIRROR_H
diff --git a/src/tools/cephfs_mirror/InstanceWatcher.cc b/src/tools/cephfs_mirror/InstanceWatcher.cc
new file mode 100644
index 000000000..9c357da31
--- /dev/null
+++ b/src/tools/cephfs_mirror/InstanceWatcher.cc
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "cls/cephfs/cls_cephfs_client.h"
+#include "include/stringify.h"
+#include "aio_utils.h"
+#include "InstanceWatcher.h"
+#include "Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::InstanceWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+std::string instance_oid(const std::string &instance_id) {
+ return CEPHFS_MIRROR_OBJECT + "." + instance_id;
+}
+
+} // anonymous namespace
+
+InstanceWatcher::InstanceWatcher(librados::IoCtx &ioctx,
+ Listener &listener, ContextWQ *work_queue)
+ : Watcher(ioctx, instance_oid(stringify(ioctx.get_instance_id())), work_queue),
+ m_ioctx(ioctx),
+ m_listener(listener),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_mutex("cephfs::mirror::instance_watcher")) {
+}
+
+InstanceWatcher::~InstanceWatcher() {
+}
+
+void InstanceWatcher::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_init_finish == nullptr);
+ m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+ on_finish->complete(r);
+ if (m_on_shutdown_finish != nullptr) {
+ m_on_shutdown_finish->complete(0);
+ }
+ });
+ }
+
+ create_instance();
+}
+
+void InstanceWatcher::shutdown(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_shutdown_finish == nullptr);
+ if (m_on_init_finish != nullptr) {
+ dout(10) << ": delaying shutdown -- init in progress" << dendl;
+ m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+ m_on_shutdown_finish = nullptr;
+ shutdown(on_finish);
+ });
+ return;
+ }
+
+ m_on_shutdown_finish = on_finish;
+ }
+
+ unregister_watcher();
+}
+
+void InstanceWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ dout(20) << dendl;
+
+ std::string dir_path;
+ std::string mode;
+ try {
+ JSONDecoder jd(bl);
+ JSONDecoder::decode_json("dir_path", dir_path, &jd.parser, true);
+ JSONDecoder::decode_json("mode", mode, &jd.parser, true);
+ } catch (const JSONDecoder::err &e) {
+ derr << ": failed to decode notify json: " << e.what() << dendl;
+ }
+
+ dout(20) << ": notifier_id=" << notifier_id << ", dir_path=" << dir_path
+ << ", mode=" << mode << dendl;
+
+ if (mode == "acquire") {
+ m_listener.acquire_directory(dir_path);
+ } else if (mode == "release") {
+ m_listener.release_directory(dir_path);
+ } else {
+ derr << ": unknown mode" << dendl;
+ }
+
+ bufferlist outbl;
+ acknowledge_notify(notify_id, handle, outbl);
+}
+
+void InstanceWatcher::handle_rewatch_complete(int r) {
+ dout(5) << ": r=" << r << dendl;
+
+ if (r == -EBLOCKLISTED) {
+ dout(0) << ": client blocklisted" <<dendl;
+ std::scoped_lock locker(m_lock);
+ m_blocklisted = true;
+ } else if (r == -ENOENT) {
+ derr << ": mirroring object deleted" << dendl;
+ m_failed = true;
+ } else if (r < 0) {
+ derr << ": rewatch error: " << cpp_strerror(r) << dendl;
+ m_failed = true;
+ }
+}
+
+void InstanceWatcher::create_instance() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ librados::ObjectWriteOperation op;
+ op.create(false);
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<InstanceWatcher, &InstanceWatcher::handle_create_instance>);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void InstanceWatcher::handle_create_instance(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ register_watcher();
+}
+
+void InstanceWatcher::register_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ InstanceWatcher, &InstanceWatcher::handle_register_watcher>(this);
+ register_watch(on_finish);
+}
+
+void InstanceWatcher::handle_register_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r == 0) {
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ return;
+ }
+
+ remove_instance();
+}
+
+void InstanceWatcher::unregister_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ InstanceWatcher, &InstanceWatcher::handle_unregister_watcher>(this);
+ unregister_watch(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void InstanceWatcher::handle_unregister_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_shutdown_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ if (r < 0) {
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+ }
+
+ if (on_shutdown_finish != nullptr) {
+ on_shutdown_finish->complete(r);
+ return;
+ }
+
+ remove_instance();
+}
+
+void InstanceWatcher::remove_instance() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<InstanceWatcher, &InstanceWatcher::handle_remove_instance>);
+ int r = m_ioctx.aio_operate(m_oid, aio_comp, &op);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void InstanceWatcher::handle_remove_instance(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ Context *on_shutdown_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+
+ if (on_init_finish != nullptr) {
+ on_init_finish->complete(r);
+ }
+ if (on_shutdown_finish != nullptr) {
+ on_shutdown_finish->complete(r);
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/InstanceWatcher.h b/src/tools/cephfs_mirror/InstanceWatcher.h
new file mode 100644
index 000000000..06edf5da9
--- /dev/null
+++ b/src/tools/cephfs_mirror/InstanceWatcher.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_INSTANCE_WATCHER_H
+#define CEPHFS_MIRROR_INSTANCE_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "Watcher.h"
+
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// watch directory update notifications via per daemon rados
+// object and invoke listener callback.
+
+class InstanceWatcher : public Watcher {
+public:
+ struct Listener {
+ virtual ~Listener() {
+ }
+
+ virtual void acquire_directory(string_view dir_path) = 0;
+ virtual void release_directory(string_view dir_path) = 0;
+ };
+
+ static InstanceWatcher *create(librados::IoCtx &ioctx,
+ Listener &listener, ContextWQ *work_queue) {
+ return new InstanceWatcher(ioctx, listener, work_queue);
+ }
+
+ InstanceWatcher(librados::IoCtx &ioctx, Listener &listener, ContextWQ *work_queue);
+ ~InstanceWatcher();
+
+ void init(Context *on_finish);
+ void shutdown(Context *on_finish);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) override;
+ void handle_rewatch_complete(int r) override;
+
+ bool is_blocklisted() {
+ std::scoped_lock locker(m_lock);
+ return m_blocklisted;
+ }
+
+ bool is_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_failed;
+ }
+
+private:
+ librados::IoCtx &m_ioctx;
+ Listener &m_listener;
+ ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock;
+ Context *m_on_init_finish = nullptr;
+ Context *m_on_shutdown_finish = nullptr;
+
+ bool m_blocklisted = false;
+ bool m_failed = false;
+
+ void create_instance();
+ void handle_create_instance(int r);
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+
+ void remove_instance();
+ void handle_remove_instance(int r);
+
+ void unregister_watcher();
+ void handle_unregister_watcher(int r);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_INSTANCE_WATCHER_H
diff --git a/src/tools/cephfs_mirror/Mirror.cc b/src/tools/cephfs_mirror/Mirror.cc
new file mode 100644
index 000000000..890805764
--- /dev/null
+++ b/src/tools/cephfs_mirror/Mirror.cc
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/Cond.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "include/types.h"
+#include "mon/MonClient.h"
+#include "msg/Messenger.h"
+#include "aio_utils.h"
+#include "Mirror.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::Mirror " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY("mirroring_failed");
+
+class SafeTimerSingleton : public CommonSafeTimer<ceph::mutex> {
+public:
+ ceph::mutex timer_lock = ceph::make_mutex("cephfs::mirror::timer_lock");
+
+ explicit SafeTimerSingleton(CephContext *cct)
+ : SafeTimer(cct, timer_lock, true) {
+ init();
+ }
+};
+
+class ThreadPoolSingleton : public ThreadPool {
+public:
+ ContextWQ *work_queue = nullptr;
+
+ explicit ThreadPoolSingleton(CephContext *cct)
+ : ThreadPool(cct, "Mirror::thread_pool", "tp_mirror", 1) {
+ work_queue = new ContextWQ("Mirror::work_queue", ceph::make_timespan(60), this);
+
+ start();
+ }
+};
+
+} // anonymous namespace
+
+struct Mirror::C_EnableMirroring : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+ uint64_t pool_id;
+
+ C_EnableMirroring(Mirror *mirror, const Filesystem &filesystem, uint64_t pool_id)
+ : mirror(mirror),
+ filesystem(filesystem),
+ pool_id(pool_id) {
+ }
+
+ void finish(int r) override {
+ enable_mirroring();
+ }
+
+ void enable_mirroring() {
+ Context *ctx = new C_CallbackAdapter<C_EnableMirroring,
+ &C_EnableMirroring::handle_enable_mirroring>(this);
+ mirror->enable_mirroring(filesystem, pool_id, ctx);
+ }
+
+ void handle_enable_mirroring(int r) {
+ mirror->handle_enable_mirroring(filesystem, r);
+ delete this;
+ }
+
+ // context needs to live post completion
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+struct Mirror::C_DisableMirroring : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+
+ C_DisableMirroring(Mirror *mirror, const Filesystem &filesystem)
+ : mirror(mirror),
+ filesystem(filesystem) {
+ }
+
+ void finish(int r) override {
+ disable_mirroring();
+ }
+
+ void disable_mirroring() {
+ Context *ctx = new C_CallbackAdapter<C_DisableMirroring,
+ &C_DisableMirroring::handle_disable_mirroring>(this);
+ mirror->disable_mirroring(filesystem, ctx);
+ }
+
+ void handle_disable_mirroring(int r) {
+ mirror->handle_disable_mirroring(filesystem, r);
+ delete this;
+ }
+
+ // context needs to live post completion
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+struct Mirror::C_PeerUpdate : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+ Peer peer;
+ bool remove = false;
+
+ C_PeerUpdate(Mirror *mirror, const Filesystem &filesystem,
+ const Peer &peer)
+ : mirror(mirror),
+ filesystem(filesystem),
+ peer(peer) {
+ }
+ C_PeerUpdate(Mirror *mirror, const Filesystem &filesystem,
+ const Peer &peer, bool remove)
+ : mirror(mirror),
+ filesystem(filesystem),
+ peer(peer),
+ remove(remove) {
+ }
+
+ void finish(int r) override {
+ if (remove) {
+ mirror->remove_peer(filesystem, peer);
+ } else {
+ mirror->add_peer(filesystem, peer);
+ }
+ }
+};
+
+struct Mirror::C_RestartMirroring : Context {
+ Mirror *mirror;
+ Filesystem filesystem;
+ uint64_t pool_id;
+ Peers peers;
+
+ C_RestartMirroring(Mirror *mirror, const Filesystem &filesystem,
+ uint64_t pool_id, const Peers &peers)
+ : mirror(mirror),
+ filesystem(filesystem),
+ pool_id(pool_id),
+ peers(peers) {
+ }
+
+ void finish(int r) override {
+ disable_mirroring();
+ }
+
+ void disable_mirroring() {
+ Context *ctx = new C_CallbackAdapter<C_RestartMirroring,
+ &C_RestartMirroring::handle_disable_mirroring>(this);
+ mirror->disable_mirroring(filesystem, ctx);
+ }
+
+ void handle_disable_mirroring(int r) {
+ enable_mirroring();
+ }
+
+ void enable_mirroring() {
+ std::scoped_lock locker(mirror->m_lock);
+ Context *ctx = new C_CallbackAdapter<C_RestartMirroring,
+ &C_RestartMirroring::handle_enable_mirroring>(this);
+ mirror->enable_mirroring(filesystem, pool_id, ctx, true);
+ }
+
+ void handle_enable_mirroring(int r) {
+ mirror->handle_enable_mirroring(filesystem, peers, r);
+ delete this;
+ }
+
+ // context needs to live post completion
+ void complete(int r) override {
+ finish(r);
+ }
+};
+
+Mirror::Mirror(CephContext *cct, const std::vector<const char*> &args,
+ MonClient *monc, Messenger *msgr)
+ : m_cct(cct),
+ m_args(args),
+ m_monc(monc),
+ m_msgr(msgr),
+ m_listener(this),
+ m_last_blocklist_check(ceph_clock_now()),
+ m_last_failure_check(ceph_clock_now()),
+ m_local(new librados::Rados()) {
+ auto thread_pool = &(cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+ "cephfs::mirror::thread_pool", false, cct));
+ auto safe_timer = &(cct->lookup_or_create_singleton_object<SafeTimerSingleton>(
+ "cephfs::mirror::safe_timer", false, cct));
+ m_thread_pool = thread_pool;
+ m_work_queue = thread_pool->work_queue;
+ m_timer = safe_timer;
+ m_timer_lock = &safe_timer->timer_lock;
+ std::scoped_lock timer_lock(*m_timer_lock);
+ schedule_mirror_update_task();
+}
+
+Mirror::~Mirror() {
+ dout(10) << dendl;
+ {
+ std::scoped_lock timer_lock(*m_timer_lock);
+ m_timer->shutdown();
+ }
+
+ m_work_queue->drain();
+ delete m_work_queue;
+ {
+ std::scoped_lock locker(m_lock);
+ m_thread_pool->stop();
+ }
+}
+
+int Mirror::init_mon_client() {
+ dout(20) << dendl;
+
+ m_monc->set_messenger(m_msgr);
+ m_msgr->add_dispatcher_head(m_monc);
+ m_monc->set_want_keys(CEPH_ENTITY_TYPE_MON);
+
+ int r = m_monc->init();
+ if (r < 0) {
+ derr << ": failed to init mon client: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = m_monc->authenticate(std::chrono::duration<double>(m_cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count());
+ if (r < 0) {
+ derr << ": failed to authenticate to monitor: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ client_t me = m_monc->get_global_id();
+ m_msgr->set_myname(entity_name_t::CLIENT(me.v));
+ return 0;
+}
+
+int Mirror::init(std::string &reason) {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+
+ int r = m_local->init_with_context(m_cct);
+ if (r < 0) {
+ derr << ": could not initialize rados handler" << dendl;
+ return r;
+ }
+
+ r = m_local->connect();
+ if (r < 0) {
+ derr << ": error connecting to local cluster" << dendl;
+ return r;
+ }
+
+ m_service_daemon = std::make_unique<ServiceDaemon>(m_cct, m_local);
+ r = m_service_daemon->init();
+ if (r < 0) {
+ derr << ": error registering service daemon: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = init_mon_client();
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+void Mirror::shutdown() {
+ dout(20) << dendl;
+ m_stopping = true;
+ m_cluster_watcher->shutdown();
+ m_cond.notify_all();
+}
+
+void Mirror::reopen_logs() {
+ for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+ mirror_action.fs_mirror->reopen_logs();
+ }
+ g_ceph_context->reopen_logs();
+}
+
+void Mirror::handle_signal(int signum) {
+ dout(10) << ": signal=" << signum << dendl;
+
+ std::scoped_lock locker(m_lock);
+ switch (signum) {
+ case SIGHUP:
+ reopen_logs();
+ break;
+ case SIGINT:
+ case SIGTERM:
+ shutdown();
+ break;
+ default:
+ ceph_abort_msgf("unexpected signal %d", signum);
+ }
+}
+
+void Mirror::handle_enable_mirroring(const Filesystem &filesystem,
+ const Peers &peers, int r) {
+ dout(20) << ": filesystem=" << filesystem << ", peers=" << peers
+ << ", r=" << r << dendl;
+
+ std::scoped_lock locker(m_lock);
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.action_in_progress);
+
+ mirror_action.action_in_progress = false;
+ m_cond.notify_all();
+ if (r < 0) {
+ derr << ": failed to initialize FSMirror for filesystem=" << filesystem
+ << ": " << cpp_strerror(r) << dendl;
+ m_service_daemon->add_or_update_fs_attribute(filesystem.fscid,
+ SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY,
+ true);
+ return;
+ }
+
+ for (auto &peer : peers) {
+ mirror_action.fs_mirror->add_peer(peer);
+ }
+
+ dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl;
+}
+
+void Mirror::handle_enable_mirroring(const Filesystem &filesystem, int r) {
+ dout(20) << ": filesystem=" << filesystem << ", r=" << r << dendl;
+
+ std::scoped_lock locker(m_lock);
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.action_in_progress);
+
+ mirror_action.action_in_progress = false;
+ m_cond.notify_all();
+ if (r < 0) {
+ derr << ": failed to initialize FSMirror for filesystem=" << filesystem
+ << ": " << cpp_strerror(r) << dendl;
+ m_service_daemon->add_or_update_fs_attribute(filesystem.fscid,
+ SERVICE_DAEMON_MIRROR_ENABLE_FAILED_KEY,
+ true);
+ return;
+ }
+
+ dout(10) << ": Initialized FSMirror for filesystem=" << filesystem << dendl;
+}
+
+void Mirror::enable_mirroring(const Filesystem &filesystem, uint64_t local_pool_id,
+ Context *on_finish, bool is_restart) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ if (is_restart) {
+ mirror_action.fs_mirror.reset();
+ } else {
+ ceph_assert(!mirror_action.action_in_progress);
+ }
+
+ ceph_assert(!mirror_action.fs_mirror);
+
+ dout(10) << ": starting FSMirror: filesystem=" << filesystem << dendl;
+
+ mirror_action.action_in_progress = true;
+ mirror_action.fs_mirror = std::make_unique<FSMirror>(m_cct, filesystem, local_pool_id,
+ m_service_daemon.get(), m_args, m_work_queue);
+ mirror_action.fs_mirror->init(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void Mirror::mirroring_enabled(const Filesystem &filesystem, uint64_t local_pool_id) {
+ dout(10) << ": filesystem=" << filesystem << ", pool_id=" << local_pool_id << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ return;
+ }
+
+ auto p = m_mirror_actions.emplace(filesystem, MirrorAction(local_pool_id));
+ auto &mirror_action = p.first->second;
+ mirror_action.action_ctxs.push_back(new C_EnableMirroring(this, filesystem, local_pool_id));
+}
+
+void Mirror::handle_disable_mirroring(const Filesystem &filesystem, int r) {
+ dout(10) << ": filesystem=" << filesystem << ", r=" << r << dendl;
+
+ std::scoped_lock locker(m_lock);
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+
+ if (!mirror_action.fs_mirror->is_init_failed()) {
+ ceph_assert(mirror_action.action_in_progress);
+ mirror_action.action_in_progress = false;
+ m_cond.notify_all();
+ }
+
+ if (!m_stopping) {
+ mirror_action.fs_mirror.reset();
+ if (mirror_action.action_ctxs.empty()) {
+ dout(10) << ": no pending actions for filesystem=" << filesystem << dendl;
+ m_mirror_actions.erase(filesystem);
+ }
+ }
+}
+
+void Mirror::disable_mirroring(const Filesystem &filesystem, Context *on_finish) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.fs_mirror);
+ ceph_assert(!mirror_action.action_in_progress);
+
+ if (mirror_action.fs_mirror->is_init_failed()) {
+ dout(10) << ": init failed for filesystem=" << filesystem << dendl;
+ m_work_queue->queue(on_finish, -EINVAL);
+ return;
+ }
+
+ mirror_action.action_in_progress = true;
+ mirror_action.fs_mirror->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void Mirror::mirroring_disabled(const Filesystem &filesystem) {
+ dout(10) << ": filesystem=" << filesystem << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ dout(5) << "shutting down" << dendl;
+ return;
+ }
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ mirror_action.action_ctxs.push_back(new C_DisableMirroring(this, filesystem));
+}
+
+void Mirror::add_peer(const Filesystem &filesystem, const Peer &peer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.fs_mirror);
+ ceph_assert(!mirror_action.action_in_progress);
+
+ mirror_action.fs_mirror->add_peer(peer);
+}
+
+void Mirror::peer_added(const Filesystem &filesystem, const Peer &peer) {
+ dout(20) << ": filesystem=" << filesystem << ", peer=" << peer << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ dout(5) << "shutting down" << dendl;
+ return;
+ }
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ mirror_action.action_ctxs.push_back(new C_PeerUpdate(this, filesystem, peer));
+}
+
+void Mirror::remove_peer(const Filesystem &filesystem, const Peer &peer) {
+ ceph_assert(ceph_mutex_is_locked(m_lock));
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ ceph_assert(mirror_action.fs_mirror);
+ ceph_assert(!mirror_action.action_in_progress);
+
+ mirror_action.fs_mirror->remove_peer(peer);
+}
+
+void Mirror::peer_removed(const Filesystem &filesystem, const Peer &peer) {
+ dout(20) << ": filesystem=" << filesystem << ", peer=" << peer << dendl;
+
+ std::scoped_lock locker(m_lock);
+ if (m_stopping) {
+ dout(5) << "shutting down" << dendl;
+ return;
+ }
+
+ auto &mirror_action = m_mirror_actions.at(filesystem);
+ mirror_action.action_ctxs.push_back(new C_PeerUpdate(this, filesystem, peer, true));
+}
+
+void Mirror::update_fs_mirrors() {
+ dout(20) << dendl;
+
+ auto now = ceph_clock_now();
+ double blocklist_interval = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_restart_mirror_on_blocklist_interval").count();
+ bool check_blocklist = blocklist_interval > 0 && ((now - m_last_blocklist_check) >= blocklist_interval);
+
+ double failed_interval = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_restart_mirror_on_failure_interval").count();
+ bool check_failure = failed_interval > 0 && ((now - m_last_failure_check) >= failed_interval);
+
+ {
+ std::scoped_lock locker(m_lock);
+ for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+ auto failed = mirror_action.fs_mirror && mirror_action.fs_mirror->is_failed();
+ auto blocklisted = mirror_action.fs_mirror && mirror_action.fs_mirror->is_blocklisted();
+
+ if (check_failure && !mirror_action.action_in_progress && failed) {
+ // about to restart failed mirror instance -- nothing
+ // should interfere
+ dout(5) << ": filesystem=" << filesystem << " failed mirroring -- restarting" << dendl;
+ auto peers = mirror_action.fs_mirror->get_peers();
+ auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers);
+ ctx->complete(0);
+ } else if (check_blocklist && !mirror_action.action_in_progress && blocklisted) {
+ // about to restart blocklisted mirror instance -- nothing
+ // should interfere
+ dout(5) << ": filesystem=" << filesystem << " is blocklisted -- restarting" << dendl;
+ auto peers = mirror_action.fs_mirror->get_peers();
+ auto ctx = new C_RestartMirroring(this, filesystem, mirror_action.pool_id, peers);
+ ctx->complete(0);
+ }
+ if (!failed && !blocklisted && !mirror_action.action_ctxs.empty()
+ && !mirror_action.action_in_progress) {
+ auto ctx = std::move(mirror_action.action_ctxs.front());
+ mirror_action.action_ctxs.pop_front();
+ ctx->complete(0);
+ }
+ }
+
+ if (check_blocklist) {
+ m_last_blocklist_check = now;
+ }
+ if (check_failure) {
+ m_last_failure_check = now;
+ }
+ }
+
+ schedule_mirror_update_task();
+}
+
+void Mirror::schedule_mirror_update_task() {
+ ceph_assert(m_timer_task == nullptr);
+ ceph_assert(ceph_mutex_is_locked(*m_timer_lock));
+
+ m_timer_task = new LambdaContext([this](int _) {
+ m_timer_task = nullptr;
+ update_fs_mirrors();
+ });
+ double after = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_action_update_interval").count();
+ dout(20) << ": scheduling fs mirror update (" << m_timer_task << ") after "
+ << after << " seconds" << dendl;
+ m_timer->add_event_after(after, m_timer_task);
+}
+
+void Mirror::run() {
+ dout(20) << dendl;
+
+ std::unique_lock locker(m_lock);
+ m_cluster_watcher.reset(new ClusterWatcher(m_cct, m_monc, m_service_daemon.get(), m_listener));
+ m_msgr->add_dispatcher_tail(m_cluster_watcher.get());
+
+ m_cluster_watcher->init();
+ m_cond.wait(locker, [this]{return m_stopping;});
+
+ locker.unlock();
+ {
+ std::scoped_lock timer_lock(*m_timer_lock);
+ if (m_timer_task != nullptr) {
+ dout(10) << ": canceling timer task=" << m_timer_task << dendl;
+ m_timer->cancel_event(m_timer_task);
+ m_timer_task = nullptr;
+ }
+ }
+ locker.lock();
+
+ for (auto &[filesystem, mirror_action] : m_mirror_actions) {
+ dout(10) << ": trying to shutdown filesystem=" << filesystem << dendl;
+ // wait for in-progress action and shutdown
+ m_cond.wait(locker, [&mirror_action=mirror_action]
+ {return !mirror_action.action_in_progress;});
+ if (mirror_action.fs_mirror &&
+ !mirror_action.fs_mirror->is_stopping() &&
+ !mirror_action.fs_mirror->is_init_failed()) {
+ C_SaferCond cond;
+ mirror_action.fs_mirror->shutdown(new C_AsyncCallback<ContextWQ>(m_work_queue, &cond));
+ int r = cond.wait();
+ dout(10) << ": shutdown filesystem=" << filesystem << ", r=" << r << dendl;
+ }
+
+ mirror_action.fs_mirror.reset();
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
+
diff --git a/src/tools/cephfs_mirror/Mirror.h b/src/tools/cephfs_mirror/Mirror.h
new file mode 100644
index 000000000..f0ffdd516
--- /dev/null
+++ b/src/tools/cephfs_mirror/Mirror.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_H
+#define CEPHFS_MIRROR_H
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "common/ceph_mutex.h"
+#include "common/WorkQueue.h"
+#include "mds/FSMap.h"
+#include "ClusterWatcher.h"
+#include "FSMirror.h"
+#include "ServiceDaemon.h"
+#include "Types.h"
+
+class Messenger;
+class MonClient;
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// this wraps up ClusterWatcher and FSMirrors to implement mirroring
+// for ceph filesystems.
+
+class Mirror {
+public:
+ Mirror(CephContext *cct, const std::vector<const char*> &args,
+ MonClient *monc, Messenger *msgr);
+ ~Mirror();
+
+ int init(std::string &reason);
+ void shutdown();
+ void run();
+
+ void handle_signal(int signum);
+
+private:
+ static constexpr std::string_view MIRRORING_MODULE = "mirroring";
+
+ struct C_EnableMirroring;
+ struct C_DisableMirroring;
+ struct C_PeerUpdate;
+ struct C_RestartMirroring;
+
+ struct ClusterListener : ClusterWatcher::Listener {
+ Mirror *mirror;
+
+ ClusterListener(Mirror *mirror)
+ : mirror(mirror) {
+ }
+
+ void handle_mirroring_enabled(const FilesystemSpec &spec) override {
+ mirror->mirroring_enabled(spec.filesystem, spec.pool_id);
+ }
+
+ void handle_mirroring_disabled(const Filesystem &filesystem) override {
+ mirror->mirroring_disabled(filesystem);
+ }
+
+ void handle_peers_added(const Filesystem &filesystem, const Peer &peer) override {
+ mirror->peer_added(filesystem, peer);
+ }
+
+ void handle_peers_removed(const Filesystem &filesystem, const Peer &peer) override {
+ mirror->peer_removed(filesystem, peer);
+ }
+ };
+
+ struct MirrorAction {
+ MirrorAction(uint64_t pool_id) :
+ pool_id(pool_id) {
+ }
+
+ uint64_t pool_id; // for restarting blocklisted mirror instance
+ bool action_in_progress = false;
+ std::list<Context *> action_ctxs;
+ std::unique_ptr<FSMirror> fs_mirror;
+ };
+
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::Mirror");
+ ceph::condition_variable m_cond;
+
+ CephContext *m_cct;
+ std::vector<const char *> m_args;
+ MonClient *m_monc;
+ Messenger *m_msgr;
+ ClusterListener m_listener;
+
+ ThreadPool *m_thread_pool = nullptr;
+ ContextWQ *m_work_queue = nullptr;
+ SafeTimer *m_timer = nullptr;
+ ceph::mutex *m_timer_lock = nullptr;
+ Context *m_timer_task = nullptr;
+
+ bool m_stopping = false;
+ std::unique_ptr<ClusterWatcher> m_cluster_watcher;
+ std::map<Filesystem, MirrorAction> m_mirror_actions;
+
+ utime_t m_last_blocklist_check;
+ utime_t m_last_failure_check;
+
+ RadosRef m_local;
+ std::unique_ptr<ServiceDaemon> m_service_daemon;
+
+ int init_mon_client();
+
+ // called via listener
+ void mirroring_enabled(const Filesystem &filesystem, uint64_t local_pool_id);
+ void mirroring_disabled(const Filesystem &filesystem);
+ void peer_added(const Filesystem &filesystem, const Peer &peer);
+ void peer_removed(const Filesystem &filesystem, const Peer &peer);
+
+ // mirror enable callback
+ void enable_mirroring(const Filesystem &filesystem, uint64_t local_pool_id,
+ Context *on_finish, bool is_restart=false);
+ void handle_enable_mirroring(const Filesystem &filesystem, int r);
+ void handle_enable_mirroring(const Filesystem &filesystem, const Peers &peers, int r);
+
+ // mirror disable callback
+ void disable_mirroring(const Filesystem &filesystem, Context *on_finish);
+ void handle_disable_mirroring(const Filesystem &filesystem, int r);
+
+ // peer update callback
+ void add_peer(const Filesystem &filesystem, const Peer &peer);
+ void remove_peer(const Filesystem &filesystem, const Peer &peer);
+
+ void schedule_mirror_update_task();
+ void update_fs_mirrors();
+
+ void reopen_logs();
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_H
diff --git a/src/tools/cephfs_mirror/MirrorWatcher.cc b/src/tools/cephfs_mirror/MirrorWatcher.cc
new file mode 100644
index 000000000..26b88d077
--- /dev/null
+++ b/src/tools/cephfs_mirror/MirrorWatcher.cc
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "msg/Messenger.h"
+#include "aio_utils.h"
+#include "MirrorWatcher.h"
+#include "FSMirror.h"
+#include "Types.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::MirrorWatcher " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+MirrorWatcher::MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+ ContextWQ *work_queue)
+ : Watcher(ioctx, CEPHFS_MIRROR_OBJECT, work_queue),
+ m_ioctx(ioctx),
+ m_fs_mirror(fs_mirror),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_mutex("cephfs::mirror::mirror_watcher")),
+ m_instance_id(stringify(m_ioctx.get_instance_id())) {
+}
+
+MirrorWatcher::~MirrorWatcher() {
+}
+
+void MirrorWatcher::init(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_init_finish == nullptr);
+ m_on_init_finish = new LambdaContext([this, on_finish](int r) {
+ on_finish->complete(r);
+ if (m_on_shutdown_finish != nullptr) {
+ m_on_shutdown_finish->complete(0);
+ }
+ });
+ }
+
+ register_watcher();
+}
+
+void MirrorWatcher::shutdown(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_on_shutdown_finish == nullptr);
+ if (m_on_init_finish != nullptr) {
+ dout(10) << ": delaying shutdown -- init in progress" << dendl;
+ m_on_shutdown_finish = new LambdaContext([this, on_finish](int r) {
+ m_on_shutdown_finish = nullptr;
+ shutdown(on_finish);
+ });
+ return;
+ }
+
+ m_on_shutdown_finish = on_finish;
+ }
+
+ unregister_watcher();
+}
+
+void MirrorWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ dout(20) << dendl;
+
+ JSONFormatter f;
+ f.open_object_section("info");
+ encode_json("addr", m_fs_mirror->get_instance_addr(), &f);
+ f.close_section();
+
+ bufferlist outbl;
+ f.flush(outbl);
+ acknowledge_notify(notify_id, handle, outbl);
+}
+
+void MirrorWatcher::handle_rewatch_complete(int r) {
+ dout(5) << ": r=" << r << dendl;
+
+ if (r == -EBLOCKLISTED) {
+ dout(0) << ": client blocklisted" <<dendl;
+ std::scoped_lock locker(m_lock);
+ m_blocklisted = true;
+ } else if (r == -ENOENT) {
+ derr << ": mirroring object deleted" << dendl;
+ m_failed = true;
+ } else if (r < 0) {
+ derr << ": rewatch error: " << cpp_strerror(r) << dendl;
+ m_failed = true;
+ }
+}
+
+void MirrorWatcher::register_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ MirrorWatcher, &MirrorWatcher::handle_register_watcher>(this);
+ register_watch(on_finish);
+}
+
+void MirrorWatcher::handle_register_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_init_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_init_finish, m_on_init_finish);
+ }
+
+ on_init_finish->complete(r);
+}
+
+void MirrorWatcher::unregister_watcher() {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ Context *on_finish = new C_CallbackAdapter<
+ MirrorWatcher, &MirrorWatcher::handle_unregister_watcher>(this);
+ unregister_watch(new C_AsyncCallback<ContextWQ>(m_work_queue, on_finish));
+}
+
+void MirrorWatcher::handle_unregister_watcher(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ Context *on_shutdown_finish = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ std::swap(on_shutdown_finish, m_on_shutdown_finish);
+ }
+
+ on_shutdown_finish->complete(r);
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/MirrorWatcher.h b/src/tools/cephfs_mirror/MirrorWatcher.h
new file mode 100644
index 000000000..c4d4f4522
--- /dev/null
+++ b/src/tools/cephfs_mirror/MirrorWatcher.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_MIRROR_WATCHER_H
+#define CEPHFS_MIRROR_MIRROR_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "Watcher.h"
+
+class ContextWQ;
+class Messenger;
+
+namespace cephfs {
+namespace mirror {
+
+class FSMirror;
+
+// watch for notifications via cephfs_mirror object (in metadata
+// pool). this is used sending keepalived with keepalive payload
+// being the rados instance address (used by the manager module
+// to blocklist when needed).
+
+class MirrorWatcher : public Watcher {
+public:
+ static MirrorWatcher *create(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+ ContextWQ *work_queue) {
+ return new MirrorWatcher(ioctx, fs_mirror, work_queue);
+ }
+
+ MirrorWatcher(librados::IoCtx &ioctx, FSMirror *fs_mirror,
+ ContextWQ *work_queue);
+ ~MirrorWatcher();
+
+ void init(Context *on_finish);
+ void shutdown(Context *on_finish);
+
+ void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) override;
+ void handle_rewatch_complete(int r) override;
+
+ bool is_blocklisted() {
+ std::scoped_lock locker(m_lock);
+ return m_blocklisted;
+ }
+
+ bool is_failed() {
+ std::scoped_lock locker(m_lock);
+ return m_failed;
+ }
+
+private:
+ librados::IoCtx &m_ioctx;
+ FSMirror *m_fs_mirror;
+ ContextWQ *m_work_queue;
+
+ ceph::mutex m_lock;
+ std::string m_instance_id;
+
+ Context *m_on_init_finish = nullptr;
+ Context *m_on_shutdown_finish = nullptr;
+
+ bool m_blocklisted = false;
+ bool m_failed = false;
+
+ void register_watcher();
+ void handle_register_watcher(int r);
+
+ void unregister_watcher();
+ void handle_unregister_watcher(int r);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_MIRROR_WATCHER_H
diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc
new file mode 100644
index 000000000..aaf97b868
--- /dev/null
+++ b/src/tools/cephfs_mirror/PeerReplayer.cc
@@ -0,0 +1,1552 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <stack>
+#include <fcntl.h>
+#include <algorithm>
+#include <sys/time.h>
+#include <sys/file.h>
+#include <boost/scope_exit.hpp>
+
+#include "common/admin_socket.h"
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "FSMirror.h"
+#include "PeerReplayer.h"
+#include "Utils.h"
+
+#include "json_spirit/json_spirit.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::PeerReplayer(" \
+ << m_peer.uuid << ") " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+const std::string PEER_CONFIG_KEY_PREFIX = "cephfs/mirror/peer";
+
+std::string snapshot_dir_path(CephContext *cct, const std::string &path) {
+ return path + "/" + cct->_conf->client_snapdir;
+}
+
+std::string snapshot_path(const std::string &snap_dir, const std::string &snap_name) {
+ return snap_dir + "/" + snap_name;
+}
+
+std::string snapshot_path(CephContext *cct, const std::string &path, const std::string &snap_name) {
+ return path + "/" + cct->_conf->client_snapdir + "/" + snap_name;
+}
+
+std::string entry_path(const std::string &dir, const std::string &name) {
+ return dir + "/" + name;
+}
+
+std::map<std::string, std::string> decode_snap_metadata(snap_metadata *snap_metadata,
+ size_t nr_snap_metadata) {
+ std::map<std::string, std::string> metadata;
+ for (size_t i = 0; i < nr_snap_metadata; ++i) {
+ metadata.emplace(snap_metadata[i].key, snap_metadata[i].value);
+ }
+
+ return metadata;
+}
+
+std::string peer_config_key(const std::string &fs_name, const std::string &uuid) {
+ return PEER_CONFIG_KEY_PREFIX + "/" + fs_name + "/" + uuid;
+}
+
+class PeerAdminSocketCommand {
+public:
+ virtual ~PeerAdminSocketCommand() {
+ }
+ virtual int call(Formatter *f) = 0;
+};
+
+class StatusCommand : public PeerAdminSocketCommand {
+public:
+ explicit StatusCommand(PeerReplayer *peer_replayer)
+ : peer_replayer(peer_replayer) {
+ }
+
+ int call(Formatter *f) override {
+ peer_replayer->peer_status(f);
+ return 0;
+ }
+
+private:
+ PeerReplayer *peer_replayer;
+};
+
+// helper to open a directory relative to a file descriptor
+int opendirat(MountRef mnt, int dirfd, const std::string &relpath, int flags,
+ ceph_dir_result **dirp) {
+ int r = ceph_openat(mnt, dirfd, relpath.c_str(), flags, 0);
+ if (r < 0) {
+ return r;
+ }
+
+ int fd = r;
+ r = ceph_fdopendir(mnt, fd, dirp);
+ ceph_close(mnt, fd);
+ return r;
+}
+
+} // anonymous namespace
+
+class PeerReplayerAdminSocketHook : public AdminSocketHook {
+public:
+ PeerReplayerAdminSocketHook(CephContext *cct, const Filesystem &filesystem,
+ const Peer &peer, PeerReplayer *peer_replayer)
+ : admin_socket(cct->get_admin_socket()) {
+ int r;
+ std::string cmd;
+
+ // mirror peer status format is name@id uuid
+ cmd = "fs mirror peer status "
+ + stringify(filesystem.fs_name) + "@" + stringify(filesystem.fscid)
+ + " "
+ + stringify(peer.uuid);
+ r = admin_socket->register_command(
+ cmd, this, "get peer mirror status");
+ if (r == 0) {
+ commands[cmd] = new StatusCommand(peer_replayer);
+ }
+ }
+
+ ~PeerReplayerAdminSocketHook() override {
+ admin_socket->unregister_commands(this);
+ for (auto &[command, cmdptr] : commands) {
+ delete cmdptr;
+ }
+ }
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f, std::ostream &errss, bufferlist &out) override {
+ auto p = commands.at(std::string(command));
+ return p->call(f);
+ }
+
+private:
+ typedef std::map<std::string, PeerAdminSocketCommand*, std::less<>> Commands;
+
+ AdminSocket *admin_socket;
+ Commands commands;
+};
+
+PeerReplayer::PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
+ RadosRef local_cluster, const Filesystem &filesystem,
+ const Peer &peer, const std::set<std::string, std::less<>> &directories,
+ MountRef mount, ServiceDaemon *service_daemon)
+ : m_cct(cct),
+ m_fs_mirror(fs_mirror),
+ m_local_cluster(local_cluster),
+ m_filesystem(filesystem),
+ m_peer(peer),
+ m_directories(directories.begin(), directories.end()),
+ m_local_mount(mount),
+ m_service_daemon(service_daemon),
+ m_asok_hook(new PeerReplayerAdminSocketHook(cct, filesystem, peer, this)),
+ m_lock(ceph::make_mutex("cephfs::mirror::PeerReplayer::" + stringify(peer.uuid))) {
+ // reset sync stats sent via service daemon
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_FAILED_DIR_COUNT_KEY, (uint64_t)0);
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY, (uint64_t)0);
+}
+
+PeerReplayer::~PeerReplayer() {
+ delete m_asok_hook;
+}
+
+int PeerReplayer::init() {
+ dout(20) << ": initial dir list=[" << m_directories << "]" << dendl;
+ for (auto &dir_root : m_directories) {
+ m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
+ }
+
+ auto &remote_client = m_peer.remote.client_name;
+ auto &remote_cluster = m_peer.remote.cluster_name;
+ auto remote_filesystem = Filesystem{0, m_peer.remote.fs_name};
+
+ std::string key = peer_config_key(m_filesystem.fs_name, m_peer.uuid);
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config-key get\", "
+ "\"key\": \"" + key + "\""
+ "}";
+
+ bufferlist in_bl;
+ bufferlist out_bl;
+
+ int r = m_local_cluster->mon_command(cmd, in_bl, &out_bl, nullptr);
+ dout(5) << ": mon command r=" << r << dendl;
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ std::string mon_host;
+ std::string cephx_key;
+ if (!r) {
+ json_spirit::mValue root;
+ if (!json_spirit::read(out_bl.to_str(), root)) {
+ derr << ": invalid config-key JSON" << dendl;
+ return -EBADMSG;
+ }
+ try {
+ auto &root_obj = root.get_obj();
+ mon_host = root_obj.at("mon_host").get_str();
+ cephx_key = root_obj.at("key").get_str();
+ dout(0) << ": remote monitor host=" << mon_host << dendl;
+ } catch (std::runtime_error&) {
+ derr << ": unexpected JSON received" << dendl;
+ return -EBADMSG;
+ }
+ }
+
+ r = connect(remote_client, remote_cluster, &m_remote_cluster, mon_host, cephx_key);
+ if (r < 0) {
+ derr << ": error connecting to remote cluster: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = mount(m_remote_cluster, remote_filesystem, false, &m_remote_mount);
+ if (r < 0) {
+ m_remote_cluster.reset();
+ derr << ": error mounting remote filesystem=" << remote_filesystem << dendl;
+ return r;
+ }
+
+ std::scoped_lock locker(m_lock);
+ auto nr_replayers = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_max_concurrent_directory_syncs");
+ dout(20) << ": spawning " << nr_replayers << " snapshot replayer(s)" << dendl;
+
+ while (nr_replayers-- > 0) {
+ std::unique_ptr<SnapshotReplayerThread> replayer(
+ new SnapshotReplayerThread(this));
+ std::string name("replayer-" + stringify(nr_replayers));
+ replayer->create(name.c_str());
+ m_replayers.push_back(std::move(replayer));
+ }
+
+ return 0;
+}
+
+void PeerReplayer::shutdown() {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(!m_stopping);
+ m_stopping = true;
+ m_cond.notify_all();
+ }
+
+ for (auto &replayer : m_replayers) {
+ replayer->join();
+ }
+ m_replayers.clear();
+ ceph_unmount(m_remote_mount);
+ ceph_release(m_remote_mount);
+ m_remote_mount = nullptr;
+ m_remote_cluster.reset();
+}
+
+void PeerReplayer::add_directory(string_view dir_root) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_directories.emplace_back(dir_root);
+ m_snap_sync_stats.emplace(dir_root, SnapSyncStat());
+ m_cond.notify_all();
+}
+
+void PeerReplayer::remove_directory(string_view dir_root) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+ auto _dir_root = std::string(dir_root);
+
+ std::scoped_lock locker(m_lock);
+ auto it = std::find(m_directories.begin(), m_directories.end(), _dir_root);
+ if (it != m_directories.end()) {
+ m_directories.erase(it);
+ }
+
+ auto it1 = m_registered.find(_dir_root);
+ if (it1 == m_registered.end()) {
+ m_snap_sync_stats.erase(_dir_root);
+ } else {
+ it1->second.canceled = true;
+ }
+ m_cond.notify_all();
+}
+
+boost::optional<std::string> PeerReplayer::pick_directory() {
+ dout(20) << dendl;
+
+ auto now = clock::now();
+ auto retry_timo = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_retry_failed_directories_interval");
+
+ boost::optional<std::string> candidate;
+ for (auto &dir_root : m_directories) {
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ if (sync_stat.failed) {
+ std::chrono::duration<double> d = now - *sync_stat.last_failed;
+ if (d.count() < retry_timo) {
+ continue;
+ }
+ }
+ if (!m_registered.count(dir_root)) {
+ candidate = dir_root;
+ break;
+ }
+ }
+
+ std::rotate(m_directories.begin(), m_directories.begin() + 1, m_directories.end());
+ return candidate;
+}
+
+int PeerReplayer::register_directory(const std::string &dir_root,
+ SnapshotReplayerThread *replayer) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+ ceph_assert(m_registered.find(dir_root) == m_registered.end());
+
+ DirRegistry registry;
+ int r = try_lock_directory(dir_root, replayer, &registry);
+ if (r < 0) {
+ return r;
+ }
+
+ dout(5) << ": dir_root=" << dir_root << " registered with replayer="
+ << replayer << dendl;
+ m_registered.emplace(dir_root, std::move(registry));
+ return 0;
+}
+
+void PeerReplayer::unregister_directory(const std::string &dir_root) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ auto it = m_registered.find(dir_root);
+ ceph_assert(it != m_registered.end());
+
+ unlock_directory(it->first, it->second);
+ m_registered.erase(it);
+ if (std::find(m_directories.begin(), m_directories.end(), dir_root) == m_directories.end()) {
+ m_snap_sync_stats.erase(dir_root);
+ }
+}
+
+int PeerReplayer::try_lock_directory(const std::string &dir_root,
+ SnapshotReplayerThread *replayer, DirRegistry *registry) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ int r = ceph_open(m_remote_mount, dir_root.c_str(), O_RDONLY | O_DIRECTORY, 0);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to open remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (r == -ENOENT) {
+ // we snap under dir_root, so mode does not matter much
+ r = ceph_mkdirs(m_remote_mount, dir_root.c_str(), 0755);
+ if (r < 0) {
+ derr << ": failed to create remote directory=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = ceph_open(m_remote_mount, dir_root.c_str(), O_RDONLY | O_DIRECTORY, 0);
+ if (r < 0) {
+ derr << ": failed to open remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ int fd = r;
+ r = ceph_flock(m_remote_mount, fd, LOCK_EX | LOCK_NB, (uint64_t)replayer->get_thread_id());
+ if (r != 0) {
+ if (r == -EWOULDBLOCK) {
+ dout(5) << ": dir_root=" << dir_root << " is locked by cephfs-mirror, "
+ << "will retry again" << dendl;
+ } else {
+ derr << ": failed to lock dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ }
+
+ if (ceph_close(m_remote_mount, fd) < 0) {
+ derr << ": failed to close (cleanup) remote dir_root=" << dir_root << ": "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+
+ dout(10) << ": dir_root=" << dir_root << " locked" << dendl;
+
+ registry->fd = fd;
+ registry->replayer = replayer;
+ return 0;
+}
+
+void PeerReplayer::unlock_directory(const std::string &dir_root, const DirRegistry &registry) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ int r = ceph_flock(m_remote_mount, registry.fd, LOCK_UN,
+ (uint64_t)registry.replayer->get_thread_id());
+ if (r < 0) {
+ derr << ": failed to unlock remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ return;
+ }
+
+ r = ceph_close(m_remote_mount, registry.fd);
+ if (r < 0) {
+ derr << ": failed to close remote dir_root=" << dir_root << ": " << cpp_strerror(r)
+ << dendl;
+ }
+
+ dout(10) << ": dir_root=" << dir_root << " unlocked" << dendl;
+}
+
+int PeerReplayer::build_snap_map(const std::string &dir_root,
+ std::map<uint64_t, std::string> *snap_map, bool is_remote) {
+ auto snap_dir = snapshot_dir_path(m_cct, dir_root);
+ dout(20) << ": dir_root=" << dir_root << ", snap_dir=" << snap_dir
+ << ", is_remote=" << is_remote << dendl;
+
+ auto lr_str = is_remote ? "remote" : "local";
+ auto mnt = is_remote ? m_remote_mount : m_local_mount;
+
+ ceph_dir_result *dirp = nullptr;
+ int r = ceph_opendir(mnt, snap_dir.c_str(), &dirp);
+ if (r < 0) {
+ if (is_remote && r == -ENOENT) {
+ return 0;
+ }
+ derr << ": failed to open " << lr_str << " snap directory=" << snap_dir
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::set<std::string> snaps;
+ auto entry = ceph_readdir(mnt, dirp);
+ while (entry != NULL) {
+ auto d_name = std::string(entry->d_name);
+ dout(20) << ": entry=" << d_name << dendl;
+ if (d_name != "." && d_name != ".." && d_name.rfind("_", 0) != 0) {
+ snaps.emplace(d_name);
+ }
+
+ entry = ceph_readdir(mnt, dirp);
+ }
+
+ int rv = 0;
+ for (auto &snap : snaps) {
+ snap_info info;
+ auto snap_path = snapshot_path(snap_dir, snap);
+ r = ceph_get_snap_info(mnt, snap_path.c_str(), &info);
+ if (r < 0) {
+ derr << ": failed to fetch " << lr_str << " snap info for snap_path=" << snap_path
+ << ": " << cpp_strerror(r) << dendl;
+ rv = r;
+ break;
+ }
+
+ uint64_t snap_id;
+ if (is_remote) {
+ if (!info.nr_snap_metadata) {
+ derr << ": snap_path=" << snap_path << " has invalid metadata in remote snapshot"
+ << dendl;
+ rv = -EINVAL;
+ } else {
+ auto metadata = decode_snap_metadata(info.snap_metadata, info.nr_snap_metadata);
+ dout(20) << ": snap_path=" << snap_path << ", metadata=" << metadata << dendl;
+ auto it = metadata.find(PRIMARY_SNAP_ID_KEY);
+ if (it == metadata.end()) {
+ derr << ": snap_path=" << snap_path << " has missing \"" << PRIMARY_SNAP_ID_KEY
+ << "\" in metadata" << dendl;
+ rv = -EINVAL;
+ } else {
+ snap_id = std::stoull(it->second);
+ }
+ ceph_free_snap_info_buffer(&info);
+ }
+ } else {
+ snap_id = info.id;
+ }
+
+ if (rv != 0) {
+ break;
+ }
+ snap_map->emplace(snap_id, snap);
+ }
+
+ r = ceph_closedir(mnt, dirp);
+ if (r < 0) {
+ derr << ": failed to close " << lr_str << " snap directory=" << snap_dir
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+ dout(10) << ": " << lr_str << " snap_map=" << *snap_map << dendl;
+ return rv;
+}
+
+int PeerReplayer::propagate_snap_deletes(const std::string &dir_root,
+ const std::set<std::string> &snaps) {
+ dout(5) << ": dir_root=" << dir_root << ", deleted snapshots=" << snaps << dendl;
+
+ for (auto &snap : snaps) {
+ dout(20) << ": deleting dir_root=" << dir_root << ", snapshot=" << snap
+ << dendl;
+ int r = ceph_rmsnap(m_remote_mount, dir_root.c_str(), snap.c_str());
+ if (r < 0) {
+ derr << ": failed to delete remote snap dir_root=" << dir_root
+ << ", snapshot=" << snaps << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ inc_deleted_snap(dir_root);
+ }
+
+ return 0;
+}
+
+int PeerReplayer::propagate_snap_renames(
+ const std::string &dir_root,
+ const std::set<std::pair<std::string,std::string>> &snaps) {
+ dout(10) << ": dir_root=" << dir_root << ", renamed snapshots=" << snaps << dendl;
+
+ for (auto &snapp : snaps) {
+ auto from = snapshot_path(m_cct, dir_root, snapp.first);
+ auto to = snapshot_path(m_cct, dir_root, snapp.second);
+ dout(20) << ": renaming dir_root=" << dir_root << ", snapshot from="
+ << from << ", to=" << to << dendl;
+ int r = ceph_rename(m_remote_mount, from.c_str(), to.c_str());
+ if (r < 0) {
+ derr << ": failed to rename remote snap dir_root=" << dir_root
+ << ", snapshot from =" << from << ", to=" << to << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ inc_renamed_snap(dir_root);
+ }
+
+ return 0;
+}
+
+int PeerReplayer::remote_mkdir(const std::string &epath, const struct ceph_statx &stx,
+ const FHandles &fh) {
+ dout(10) << ": remote epath=" << epath << dendl;
+
+ int r = ceph_mkdirat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFDIR);
+ if (r < 0 && r != -EEXIST) {
+ derr << ": failed to create remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = ceph_chownat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_uid, stx.stx_gid,
+ AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to chown remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = ceph_chmodat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFMT,
+ AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to chmod remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ struct timespec times[] = {{stx.stx_atime.tv_sec, stx.stx_atime.tv_nsec},
+ {stx.stx_mtime.tv_sec, stx.stx_mtime.tv_nsec}};
+ r = ceph_utimensat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), times, AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to change [am]time on remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+#define NR_IOVECS 8 // # iovecs
+#define IOVEC_SIZE (8 * 1024 * 1024) // buffer size for each iovec
+int PeerReplayer::copy_to_remote(const std::string &dir_root, const std::string &epath,
+ const struct ceph_statx &stx, const FHandles &fh) {
+ dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << dendl;
+ int l_fd;
+ int r_fd;
+ void *ptr;
+ struct iovec iov[NR_IOVECS];
+
+ int r = ceph_openat(m_local_mount, fh.c_fd, epath.c_str(), O_RDONLY | O_NOFOLLOW, 0);
+ if (r < 0) {
+ derr << ": failed to open local file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ l_fd = r;
+ r = ceph_openat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(),
+ O_CREAT | O_TRUNC | O_WRONLY | O_NOFOLLOW, stx.stx_mode);
+ if (r < 0) {
+ derr << ": failed to create remote file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ goto close_local_fd;
+ }
+
+ r_fd = r;
+ ptr = malloc(NR_IOVECS * IOVEC_SIZE);
+ if (!ptr) {
+ r = -ENOMEM;
+ derr << ": failed to allocate memory" << dendl;
+ goto close_remote_fd;
+ }
+
+ while (true) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ for (int i = 0; i < NR_IOVECS; ++i) {
+ iov[i].iov_base = (char*)ptr + IOVEC_SIZE*i;
+ iov[i].iov_len = IOVEC_SIZE;
+ }
+
+ r = ceph_preadv(m_local_mount, l_fd, iov, NR_IOVECS, -1);
+ if (r < 0) {
+ derr << ": failed to read local file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ if (r == 0) {
+ break;
+ }
+
+ int iovs = (int)(r / IOVEC_SIZE);
+ int t = r % IOVEC_SIZE;
+ if (t) {
+ iov[iovs].iov_len = t;
+ ++iovs;
+ }
+
+ r = ceph_pwritev(m_remote_mount, r_fd, iov, iovs, -1);
+ if (r < 0) {
+ derr << ": failed to write remote file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+
+ if (r == 0) {
+ r = ceph_fsync(m_remote_mount, r_fd, 0);
+ if (r < 0) {
+ derr << ": failed to sync data for file path=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+
+ free(ptr);
+
+close_remote_fd:
+ if (ceph_close(m_remote_mount, r_fd) < 0) {
+ derr << ": failed to close remote fd path=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return -EINVAL;
+ }
+
+close_local_fd:
+ if (ceph_close(m_local_mount, l_fd) < 0) {
+ derr << ": failed to close local fd path=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return -EINVAL;
+ }
+
+ return r == 0 ? 0 : r;
+}
+
+int PeerReplayer::remote_file_op(const std::string &dir_root, const std::string &epath,
+ const struct ceph_statx &stx, const FHandles &fh,
+ bool need_data_sync, bool need_attr_sync) {
+ dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << ", need_data_sync=" << need_data_sync
+ << ", need_attr_sync=" << need_attr_sync << dendl;
+
+ int r;
+ if (need_data_sync) {
+ if (S_ISREG(stx.stx_mode)) {
+ r = copy_to_remote(dir_root, epath, stx, fh);
+ if (r < 0) {
+ derr << ": failed to copy path=" << epath << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else if (S_ISLNK(stx.stx_mode)) {
+ // free the remote link before relinking
+ r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), 0);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to remove remote symlink=" << epath << dendl;
+ return r;
+ }
+ char *target = (char *)alloca(stx.stx_size+1);
+ r = ceph_readlinkat(m_local_mount, fh.c_fd, epath.c_str(), target, stx.stx_size);
+ if (r < 0) {
+ derr << ": failed to readlink local path=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ target[stx.stx_size] = '\0';
+ r = ceph_symlinkat(m_remote_mount, target, fh.r_fd_dir_root, epath.c_str());
+ if (r < 0 && r != EEXIST) {
+ derr << ": failed to symlink remote path=" << epath << " to target=" << target
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else {
+ dout(5) << ": skipping entry=" << epath << ": unsupported mode=" << stx.stx_mode
+ << dendl;
+ return 0;
+ }
+ }
+
+ if (need_attr_sync) {
+ r = ceph_chownat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_uid, stx.stx_gid,
+ AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to chown remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ r = ceph_chmodat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), stx.stx_mode & ~S_IFMT,
+ AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to chmod remote directory=" << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ struct timespec times[] = {{stx.stx_atime.tv_sec, stx.stx_atime.tv_nsec},
+ {stx.stx_mtime.tv_sec, stx.stx_mtime.tv_nsec}};
+ r = ceph_utimensat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), times, AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to change [am]time on remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int PeerReplayer::cleanup_remote_dir(const std::string &dir_root,
+ const std::string &epath, const FHandles &fh) {
+ dout(20) << ": dir_root=" << dir_root << ", epath=" << epath
+ << dendl;
+
+ struct ceph_statx tstx;
+ int r = ceph_statxat(m_remote_mount, fh.r_fd_dir_root, epath.c_str(), &tstx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to stat remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ceph_dir_result *tdirp;
+ r = opendirat(m_remote_mount, fh.r_fd_dir_root, epath, AT_SYMLINK_NOFOLLOW,
+ &tdirp);
+ if (r < 0) {
+ derr << ": failed to open remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ std::stack<SyncEntry> rm_stack;
+ rm_stack.emplace(SyncEntry(epath, tdirp, tstx));
+ while (!rm_stack.empty()) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ dout(20) << ": " << rm_stack.size() << " entries in stack" << dendl;
+ std::string e_name;
+ auto &entry = rm_stack.top();
+ dout(20) << ": top of stack path=" << entry.epath << dendl;
+ if (entry.is_directory()) {
+ struct ceph_statx stx;
+ struct dirent de;
+ while (true) {
+ r = ceph_readdirplus_r(m_remote_mount, entry.dirp, &de, &stx,
+ CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW, NULL);
+ if (r < 0) {
+ derr << ": failed to read remote directory=" << entry.epath << dendl;
+ break;
+ }
+ if (r == 0) {
+ break;
+ }
+
+ auto d_name = std::string(de.d_name);
+ if (d_name != "." && d_name != "..") {
+ e_name = d_name;
+ break;
+ }
+ }
+
+ if (r == 0) {
+ r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, entry.epath.c_str(), AT_REMOVEDIR);
+ if (r < 0) {
+ derr << ": failed to remove remote directory=" << entry.epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+
+ dout(10) << ": done for remote directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_remote_mount, entry.dirp) < 0) {
+ derr << ": failed to close remote directory=" << entry.epath << dendl;
+ }
+ rm_stack.pop();
+ continue;
+ }
+ if (r < 0) {
+ break;
+ }
+
+ auto epath = entry_path(entry.epath, e_name);
+ if (S_ISDIR(stx.stx_mode)) {
+ ceph_dir_result *dirp;
+ r = opendirat(m_remote_mount, fh.r_fd_dir_root, epath, AT_SYMLINK_NOFOLLOW,
+ &dirp);
+ if (r < 0) {
+ derr << ": failed to open remote directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ rm_stack.emplace(SyncEntry(epath, dirp, stx));
+ } else {
+ rm_stack.emplace(SyncEntry(epath, stx));
+ }
+ } else {
+ r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, entry.epath.c_str(), 0);
+ if (r < 0) {
+ derr << ": failed to remove remote directory=" << entry.epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ dout(10) << ": done for remote file=" << entry.epath << dendl;
+ rm_stack.pop();
+ }
+ }
+
+ while (!rm_stack.empty()) {
+ auto &entry = rm_stack.top();
+ if (entry.is_directory()) {
+ dout(20) << ": closing remote directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_remote_mount, entry.dirp) < 0) {
+ derr << ": failed to close remote directory=" << entry.epath << dendl;
+ }
+ }
+
+ rm_stack.pop();
+ }
+
+ return r;
+}
+
+int PeerReplayer::should_sync_entry(const std::string &epath, const struct ceph_statx &cstx,
+ const FHandles &fh, bool *need_data_sync, bool *need_attr_sync) {
+ dout(10) << ": epath=" << epath << dendl;
+
+ *need_data_sync = false;
+ *need_attr_sync = false;
+ struct ceph_statx pstx;
+ int r = ceph_statxat(fh.p_mnt, fh.p_fd, epath.c_str(), &pstx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_CTIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0 && r != -ENOENT && r != -ENOTDIR) {
+ derr << ": failed to stat prev entry= " << epath << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (r < 0) {
+ // inode does not exist in prev snapshot or file type has changed
+ // (file was S_IFREG earlier, S_IFDIR now).
+ dout(5) << ": entry=" << epath << ", r=" << r << dendl;
+ *need_data_sync = true;
+ *need_attr_sync = true;
+ return 0;
+ }
+
+ dout(10) << ": local cur statx: mode=" << cstx.stx_mode << ", uid=" << cstx.stx_uid
+ << ", gid=" << cstx.stx_gid << ", size=" << cstx.stx_size << ", ctime="
+ << cstx.stx_ctime << ", mtime=" << cstx.stx_mtime << dendl;
+ dout(10) << ": local prev statx: mode=" << pstx.stx_mode << ", uid=" << pstx.stx_uid
+ << ", gid=" << pstx.stx_gid << ", size=" << pstx.stx_size << ", ctime="
+ << pstx.stx_ctime << ", mtime=" << pstx.stx_mtime << dendl;
+ if ((cstx.stx_mode & S_IFMT) != (pstx.stx_mode & S_IFMT)) {
+ dout(5) << ": entry=" << epath << " has mode mismatch" << dendl;
+ *need_data_sync = true;
+ *need_attr_sync = true;
+ } else {
+ *need_data_sync = (cstx.stx_size != pstx.stx_size) || (cstx.stx_mtime != pstx.stx_mtime);
+ *need_attr_sync = (cstx.stx_ctime != pstx.stx_ctime);
+ }
+
+ return 0;
+}
+
+int PeerReplayer::propagate_deleted_entries(const std::string &dir_root,
+ const std::string &epath, const FHandles &fh) {
+ dout(10) << ": dir_root=" << dir_root << ", epath=" << epath << dendl;
+
+ ceph_dir_result *dirp;
+ int r = opendirat(fh.p_mnt, fh.p_fd, epath, AT_SYMLINK_NOFOLLOW, &dirp);
+ if (r < 0) {
+ if (r == -ELOOP) {
+ dout(5) << ": epath=" << epath << " is a symbolic link -- mode sync"
+ << " done when traversing parent" << dendl;
+ return 0;
+ }
+ if (r == -ENOTDIR) {
+ dout(5) << ": epath=" << epath << " is not a directory -- mode sync"
+ << " done when traversing parent" << dendl;
+ return 0;
+ }
+ if (r == -ENOENT) {
+ dout(5) << ": epath=" << epath << " missing in previous-snap/remote dir-root"
+ << dendl;
+ }
+ return r;
+ }
+
+ struct dirent *dire = (struct dirent *)alloca(512 * sizeof(struct dirent));
+ while (true) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ int len = ceph_getdents(fh.p_mnt, dirp, (char *)dire, 512);
+ if (len < 0) {
+ derr << ": failed to read directory entries: " << cpp_strerror(len) << dendl;
+ r = len;
+ // flip errno to signal that we got an err (possible the
+ // snapshot getting deleted in midst).
+ if (r == -ENOENT) {
+ r = -EINVAL;
+ }
+ break;
+ }
+ if (len == 0) {
+ dout(10) << ": reached EOD" << dendl;
+ break;
+ }
+ int nr = len / sizeof(struct dirent);
+ for (int i = 0; i < nr; ++i) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+ std::string d_name = std::string(dire[i].d_name);
+ if (d_name == "." || d_name == "..") {
+ continue;
+ }
+
+ struct ceph_statx pstx;
+ auto dpath = entry_path(epath, d_name);
+ r = ceph_statxat(fh.p_mnt, fh.p_fd, dpath.c_str(), &pstx,
+ CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to stat (prev) directory=" << dpath << ": "
+ << cpp_strerror(r) << dendl;
+ // flip errno to signal that we got an err (possible the
+ // snapshot getting deleted in midst).
+ if (r == -ENOENT) {
+ r = -EINVAL;
+ }
+ return r;
+ }
+
+ struct ceph_statx cstx;
+ r = ceph_statxat(m_local_mount, fh.c_fd, dpath.c_str(), &cstx,
+ CEPH_STATX_MODE, AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to stat local (cur) directory=" << dpath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ bool purge_remote = true;
+ if (r == 0) {
+ // directory entry present in both snapshots -- check inode
+ // type
+ if ((pstx.stx_mode & S_IFMT) == (cstx.stx_mode & S_IFMT)) {
+ dout(5) << ": mode matches for entry=" << d_name << dendl;
+ purge_remote = false;
+ } else {
+ dout(5) << ": mode mismatch for entry=" << d_name << dendl;
+ }
+ } else {
+ dout(5) << ": entry=" << d_name << " missing in current snapshot" << dendl;
+ }
+
+ if (purge_remote) {
+ dout(5) << ": purging remote entry=" << dpath << dendl;
+ if (S_ISDIR(pstx.stx_mode)) {
+ r = cleanup_remote_dir(dir_root, dpath, fh);
+ } else {
+ r = ceph_unlinkat(m_remote_mount, fh.r_fd_dir_root, dpath.c_str(), 0);
+ }
+
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to cleanup remote entry=" << d_name << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+ }
+
+ ceph_closedir(fh.p_mnt, dirp);
+ return r;
+}
+
+int PeerReplayer::open_dir(MountRef mnt, const std::string &dir_path,
+ boost::optional<uint64_t> snap_id) {
+ dout(20) << ": dir_path=" << dir_path << dendl;
+ if (snap_id) {
+ dout(20) << ": expected snapshot id=" << *snap_id << dendl;
+ }
+
+ int fd = ceph_open(mnt, dir_path.c_str(), O_DIRECTORY | O_RDONLY, 0);
+ if (fd < 0) {
+ derr << ": cannot open dir_path=" << dir_path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+
+ if (!snap_id) {
+ return fd;
+ }
+
+ snap_info info;
+ int r = ceph_get_snap_info(mnt, dir_path.c_str(), &info);
+ if (r < 0) {
+ derr << ": failed to fetch snap_info for path=" << dir_path
+ << ": " << cpp_strerror(r) << dendl;
+ ceph_close(mnt, fd);
+ return r;
+ }
+
+ if (info.id != *snap_id) {
+ dout(5) << ": got mismatching snapshot id for path=" << dir_path << " (" << info.id
+ << " vs " << *snap_id << ") -- possible recreate" << dendl;
+ ceph_close(mnt, fd);
+ return -EINVAL;
+ }
+
+ return fd;
+}
+
+int PeerReplayer::pre_sync_check_and_open_handles(
+ const std::string &dir_root,
+ const Snapshot &current, boost::optional<Snapshot> prev,
+ FHandles *fh) {
+ dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+ if (prev) {
+ dout(20) << ": prev=" << prev << dendl;
+ }
+
+ auto cur_snap_path = snapshot_path(m_cct, dir_root, current.first);
+ auto fd = open_dir(m_local_mount, cur_snap_path, current.second);
+ if (fd < 0) {
+ return fd;
+ }
+
+ // current snapshot file descriptor
+ fh->c_fd = fd;
+
+ MountRef mnt;
+ if (prev) {
+ mnt = m_local_mount;
+ auto prev_snap_path = snapshot_path(m_cct, dir_root, (*prev).first);
+ fd = open_dir(mnt, prev_snap_path, (*prev).second);
+ } else {
+ mnt = m_remote_mount;
+ fd = open_dir(mnt, dir_root, boost::none);
+ }
+
+ if (fd < 0) {
+ if (!prev || fd != -ENOENT) {
+ ceph_close(m_local_mount, fh->c_fd);
+ return fd;
+ }
+
+ // ENOENT of previous snap
+ dout(5) << ": previous snapshot=" << *prev << " missing" << dendl;
+ mnt = m_remote_mount;
+ fd = open_dir(mnt, dir_root, boost::none);
+ if (fd < 0) {
+ ceph_close(m_local_mount, fh->c_fd);
+ return fd;
+ }
+ }
+
+ // "previous" snapshot or dir_root file descriptor
+ fh->p_fd = fd;
+ fh->p_mnt = mnt;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto it = m_registered.find(dir_root);
+ ceph_assert(it != m_registered.end());
+ fh->r_fd_dir_root = it->second.fd;
+ }
+
+ dout(5) << ": using " << ((fh->p_mnt == m_local_mount) ? "local (previous) snapshot" : "remote dir_root")
+ << " for incremental transfer" << dendl;
+ return 0;
+}
+
+void PeerReplayer::post_sync_close_handles(const FHandles &fh) {
+ dout(20) << dendl;
+
+ // @FHandles.r_fd_dir_root is closed in @unregister_directory since
+ // its used to acquire an exclusive lock on remote dir_root.
+ ceph_close(m_local_mount, fh.c_fd);
+ ceph_close(fh.p_mnt, fh.p_fd);
+}
+
+int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev) {
+ dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+ if (prev) {
+ dout(20) << ": incremental sync check from prev=" << prev << dendl;
+ }
+
+ FHandles fh;
+ int r = pre_sync_check_and_open_handles(dir_root, current, prev, &fh);
+ if (r < 0) {
+ dout(5) << ": cannot proceeed with sync: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ BOOST_SCOPE_EXIT_ALL( (this)(&fh) ) {
+ post_sync_close_handles(fh);
+ };
+
+ // record that we are going to "dirty" the data under this
+ // directory root
+ auto snap_id_str{stringify(current.second)};
+ r = ceph_fsetxattr(m_remote_mount, fh.r_fd_dir_root, "ceph.mirror.dirty_snap_id",
+ snap_id_str.c_str(), snap_id_str.size(), 0);
+ if (r < 0) {
+ derr << ": error setting \"ceph.mirror.dirty_snap_id\" on dir_root=" << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ struct ceph_statx tstx;
+ r = ceph_fstatx(m_local_mount, fh.c_fd, &tstx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW);
+ if (r < 0) {
+ derr << ": failed to stat snap=" << current.first << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ceph_dir_result *tdirp;
+ r = ceph_fdopendir(m_local_mount, fh.c_fd, &tdirp);
+ if (r < 0) {
+ derr << ": failed to open local snap=" << current.first << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ std::stack<SyncEntry> sync_stack;
+ sync_stack.emplace(SyncEntry(".", tdirp, tstx));
+ while (!sync_stack.empty()) {
+ if (should_backoff(dir_root, &r)) {
+ dout(0) << ": backing off r=" << r << dendl;
+ break;
+ }
+
+ dout(20) << ": " << sync_stack.size() << " entries in stack" << dendl;
+ std::string e_name;
+ auto &entry = sync_stack.top();
+ dout(20) << ": top of stack path=" << entry.epath << dendl;
+ if (entry.is_directory()) {
+ // entry is a directory -- propagate deletes for missing entries
+ // (and changed inode types) to the remote filesystem.
+ if (!entry.needs_remote_sync()) {
+ r = propagate_deleted_entries(dir_root, entry.epath, fh);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": failed to propagate missing dirs: " << cpp_strerror(r) << dendl;
+ break;
+ }
+ entry.set_remote_synced();
+ }
+
+ struct ceph_statx stx;
+ struct dirent de;
+ while (true) {
+ r = ceph_readdirplus_r(m_local_mount, entry.dirp, &de, &stx,
+ CEPH_STATX_MODE | CEPH_STATX_UID | CEPH_STATX_GID |
+ CEPH_STATX_SIZE | CEPH_STATX_ATIME | CEPH_STATX_MTIME,
+ AT_STATX_DONT_SYNC | AT_SYMLINK_NOFOLLOW, NULL);
+ if (r < 0) {
+ derr << ": failed to local read directory=" << entry.epath << dendl;
+ break;
+ }
+ if (r == 0) {
+ break;
+ }
+
+ auto d_name = std::string(de.d_name);
+ if (d_name != "." && d_name != "..") {
+ e_name = d_name;
+ break;
+ }
+ }
+
+ if (r == 0) {
+ dout(10) << ": done for directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_local_mount, entry.dirp) < 0) {
+ derr << ": failed to close local directory=" << entry.epath << dendl;
+ }
+ sync_stack.pop();
+ continue;
+ }
+ if (r < 0) {
+ break;
+ }
+
+ auto epath = entry_path(entry.epath, e_name);
+ if (S_ISDIR(stx.stx_mode)) {
+ r = remote_mkdir(epath, stx, fh);
+ if (r < 0) {
+ break;
+ }
+ ceph_dir_result *dirp;
+ r = opendirat(m_local_mount, fh.c_fd, epath, AT_SYMLINK_NOFOLLOW, &dirp);
+ if (r < 0) {
+ derr << ": failed to open local directory=" << epath << ": "
+ << cpp_strerror(r) << dendl;
+ break;
+ }
+ sync_stack.emplace(SyncEntry(epath, dirp, stx));
+ } else {
+ sync_stack.emplace(SyncEntry(epath, stx));
+ }
+ } else {
+ bool need_data_sync = true;
+ bool need_attr_sync = true;
+ r = should_sync_entry(entry.epath, entry.stx, fh,
+ &need_data_sync, &need_attr_sync);
+ if (r < 0) {
+ break;
+ }
+
+ dout(5) << ": entry=" << entry.epath << ", data_sync=" << need_data_sync
+ << ", attr_sync=" << need_attr_sync << dendl;
+ if (need_data_sync || need_attr_sync) {
+ r = remote_file_op(dir_root, entry.epath, entry.stx, fh, need_data_sync,
+ need_attr_sync);
+ if (r < 0) {
+ break;
+ }
+ }
+ dout(10) << ": done for epath=" << entry.epath << dendl;
+ sync_stack.pop();
+ }
+ }
+
+ while (!sync_stack.empty()) {
+ auto &entry = sync_stack.top();
+ if (entry.is_directory()) {
+ dout(20) << ": closing local directory=" << entry.epath << dendl;
+ if (ceph_closedir(m_local_mount, entry.dirp) < 0) {
+ derr << ": failed to close local directory=" << entry.epath << dendl;
+ }
+ }
+
+ sync_stack.pop();
+ }
+
+ return r;
+}
+
+int PeerReplayer::synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev) {
+ dout(20) << ": dir_root=" << dir_root << ", current=" << current << dendl;
+ if (prev) {
+ dout(20) << ": prev=" << prev << dendl;
+ }
+
+ int r = ceph_getxattr(m_remote_mount, dir_root.c_str(), "ceph.mirror.dirty_snap_id", nullptr, 0);
+ if (r < 0 && r != -ENODATA) {
+ derr << ": failed to fetch primary_snap_id length from dir_root=" << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // no xattr, can't determine which snap the data belongs to!
+ if (r < 0) {
+ dout(5) << ": missing \"ceph.mirror.dirty_snap_id\" xattr on remote -- using"
+ << " incremental sync with remote scan" << dendl;
+ r = do_synchronize(dir_root, current, boost::none);
+ } else {
+ size_t xlen = r;
+ char *val = (char *)alloca(xlen+1);
+ r = ceph_getxattr(m_remote_mount, dir_root.c_str(), "ceph.mirror.dirty_snap_id", (void*)val, xlen);
+ if (r < 0) {
+ derr << ": failed to fetch \"dirty_snap_id\" for dir_root: " << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ val[xlen] = '\0';
+ uint64_t dirty_snap_id = atoll(val);
+
+ dout(20) << ": dirty_snap_id: " << dirty_snap_id << " vs (" << current.second
+ << "," << (prev ? stringify((*prev).second) : "~") << ")" << dendl;
+ if (prev && (dirty_snap_id == (*prev).second || dirty_snap_id == current.second)) {
+ dout(5) << ": match -- using incremental sync with local scan" << dendl;
+ r = do_synchronize(dir_root, current, prev);
+ } else {
+ dout(5) << ": mismatch -- using incremental sync with remote scan" << dendl;
+ r = do_synchronize(dir_root, current, boost::none);
+ }
+ }
+
+ // snap sync failed -- bail out!
+ if (r < 0) {
+ return r;
+ }
+
+ auto cur_snap_id_str{stringify(current.second)};
+ snap_metadata snap_meta[] = {{PRIMARY_SNAP_ID_KEY.c_str(), cur_snap_id_str.c_str()}};
+ r = ceph_mksnap(m_remote_mount, dir_root.c_str(), current.first.c_str(), 0755,
+ snap_meta, sizeof(snap_meta)/sizeof(snap_metadata));
+ if (r < 0) {
+ derr << ": failed to snap remote directory dir_root=" << dir_root
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+ return r;
+}
+
+int PeerReplayer::do_sync_snaps(const std::string &dir_root) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+
+ std::map<uint64_t, std::string> local_snap_map;
+ std::map<uint64_t, std::string> remote_snap_map;
+
+ int r = build_snap_map(dir_root, &local_snap_map);
+ if (r < 0) {
+ derr << ": failed to build local snap map" << dendl;
+ return r;
+ }
+
+ r = build_snap_map(dir_root, &remote_snap_map, true);
+ if (r < 0) {
+ derr << ": failed to build remote snap map" << dendl;
+ return r;
+ }
+
+ // infer deleted and renamed snapshots from local and remote
+ // snap maps
+ std::set<std::string> snaps_deleted;
+ std::set<std::pair<std::string,std::string>> snaps_renamed;
+ for (auto &[primary_snap_id, snap_name] : remote_snap_map) {
+ auto it = local_snap_map.find(primary_snap_id);
+ if (it == local_snap_map.end()) {
+ snaps_deleted.emplace(snap_name);
+ } else if (it->second != snap_name) {
+ snaps_renamed.emplace(std::make_pair(snap_name, it->second));
+ }
+ }
+
+ r = propagate_snap_deletes(dir_root, snaps_deleted);
+ if (r < 0) {
+ derr << ": failed to propgate deleted snapshots" << dendl;
+ return r;
+ }
+
+ r = propagate_snap_renames(dir_root, snaps_renamed);
+ if (r < 0) {
+ derr << ": failed to propgate renamed snapshots" << dendl;
+ return r;
+ }
+
+ // start mirroring snapshots from the last snap-id synchronized
+ uint64_t last_snap_id = 0;
+ std::string last_snap_name;
+ if (!remote_snap_map.empty()) {
+ auto last = remote_snap_map.rbegin();
+ last_snap_id = last->first;
+ last_snap_name = last->second;
+ set_last_synced_snap(dir_root, last_snap_id, last_snap_name);
+ }
+
+ dout(5) << ": last snap-id transferred=" << last_snap_id << dendl;
+ auto it = local_snap_map.upper_bound(last_snap_id);
+ if (it == local_snap_map.end()) {
+ dout(20) << ": nothing to synchronize" << dendl;
+ return 0;
+ }
+
+ auto snaps_per_cycle = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_max_snapshot_sync_per_cycle");
+
+ dout(10) << ": synchronizing from snap-id=" << it->first << dendl;
+ for (; it != local_snap_map.end(); ++it) {
+ set_current_syncing_snap(dir_root, it->first, it->second);
+ auto start = clock::now();
+ boost::optional<Snapshot> prev = boost::none;
+ if (last_snap_id != 0) {
+ prev = std::make_pair(last_snap_name, last_snap_id);
+ }
+ r = synchronize(dir_root, std::make_pair(it->second, it->first), prev);
+ if (r < 0) {
+ derr << ": failed to synchronize dir_root=" << dir_root
+ << ", snapshot=" << it->second << dendl;
+ clear_current_syncing_snap(dir_root);
+ return r;
+ }
+ std::chrono::duration<double> duration = clock::now() - start;
+ set_last_synced_stat(dir_root, it->first, it->second, duration.count());
+ if (--snaps_per_cycle == 0) {
+ break;
+ }
+
+ last_snap_name = it->second;
+ last_snap_id = it->first;
+ }
+
+ return 0;
+}
+
+void PeerReplayer::sync_snaps(const std::string &dir_root,
+ std::unique_lock<ceph::mutex> &locker) {
+ dout(20) << ": dir_root=" << dir_root << dendl;
+ locker.unlock();
+ int r = do_sync_snaps(dir_root);
+ if (r < 0) {
+ derr << ": failed to sync snapshots for dir_root=" << dir_root << dendl;
+ }
+ locker.lock();
+ if (r < 0) {
+ _inc_failed_count(dir_root);
+ } else {
+ _reset_failed_count(dir_root);
+ }
+}
+
+void PeerReplayer::run(SnapshotReplayerThread *replayer) {
+ dout(10) << ": snapshot replayer=" << replayer << dendl;
+
+ time last_directory_scan = clock::zero();
+ auto scan_interval = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_directory_scan_interval");
+
+ std::unique_lock locker(m_lock);
+ while (true) {
+ // do not check if client is blocklisted under lock
+ m_cond.wait_for(locker, 1s, [this]{return is_stopping();});
+ if (is_stopping()) {
+ dout(5) << ": exiting" << dendl;
+ break;
+ }
+
+ locker.unlock();
+
+ if (m_fs_mirror->is_blocklisted()) {
+ dout(5) << ": exiting as client is blocklisted" << dendl;
+ break;
+ }
+
+ locker.lock();
+
+ auto now = clock::now();
+ std::chrono::duration<double> timo = now - last_directory_scan;
+ if (timo.count() >= scan_interval && m_directories.size()) {
+ dout(20) << ": trying to pick from " << m_directories.size() << " directories" << dendl;
+ auto dir_root = pick_directory();
+ if (dir_root) {
+ dout(5) << ": picked dir_root=" << *dir_root << dendl;
+ int r = register_directory(*dir_root, replayer);
+ if (r == 0) {
+ sync_snaps(*dir_root, locker);
+ unregister_directory(*dir_root);
+ }
+ }
+
+ last_directory_scan = now;
+ }
+ }
+}
+
+void PeerReplayer::peer_status(Formatter *f) {
+ std::scoped_lock locker(m_lock);
+ f->open_object_section("stats");
+ for (auto &[dir_root, sync_stat] : m_snap_sync_stats) {
+ f->open_object_section(dir_root);
+ if (sync_stat.failed) {
+ f->dump_string("state", "failed");
+ } else if (!sync_stat.current_syncing_snap) {
+ f->dump_string("state", "idle");
+ } else {
+ f->dump_string("state", "syncing");
+ f->open_object_section("current_sycning_snap");
+ f->dump_unsigned("id", (*sync_stat.current_syncing_snap).first);
+ f->dump_string("name", (*sync_stat.current_syncing_snap).second);
+ f->close_section();
+ }
+ if (sync_stat.last_synced_snap) {
+ f->open_object_section("last_synced_snap");
+ f->dump_unsigned("id", (*sync_stat.last_synced_snap).first);
+ f->dump_string("name", (*sync_stat.last_synced_snap).second);
+ if (sync_stat.last_sync_duration) {
+ f->dump_float("sync_duration", *sync_stat.last_sync_duration);
+ f->dump_stream("sync_time_stamp") << sync_stat.last_synced;
+ }
+ f->close_section();
+ }
+ f->dump_unsigned("snaps_synced", sync_stat.synced_snap_count);
+ f->dump_unsigned("snaps_deleted", sync_stat.deleted_snap_count);
+ f->dump_unsigned("snaps_renamed", sync_stat.renamed_snap_count);
+ f->close_section(); // dir_root
+ }
+ f->close_section(); // stats
+}
+
+void PeerReplayer::reopen_logs() {
+ std::scoped_lock locker(m_lock);
+
+ if (m_remote_cluster) {
+ reinterpret_cast<CephContext *>(m_remote_cluster->cct())->reopen_logs();
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/PeerReplayer.h b/src/tools/cephfs_mirror/PeerReplayer.h
new file mode 100644
index 000000000..886c95329
--- /dev/null
+++ b/src/tools/cephfs_mirror/PeerReplayer.h
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_PEER_REPLAYER_H
+#define CEPHFS_MIRROR_PEER_REPLAYER_H
+
+#include "common/Formatter.h"
+#include "common/Thread.h"
+#include "mds/FSMap.h"
+#include "ServiceDaemon.h"
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+class FSMirror;
+class PeerReplayerAdminSocketHook;
+
+class PeerReplayer {
+public:
+ PeerReplayer(CephContext *cct, FSMirror *fs_mirror,
+ RadosRef local_cluster, const Filesystem &filesystem,
+ const Peer &peer, const std::set<std::string, std::less<>> &directories,
+ MountRef mount, ServiceDaemon *service_daemon);
+ ~PeerReplayer();
+
+ // initialize replayer for a peer
+ int init();
+
+ // shutdown replayer for a peer
+ void shutdown();
+
+ // add a directory to mirror queue
+ void add_directory(string_view dir_root);
+
+ // remove a directory from queue
+ void remove_directory(string_view dir_root);
+
+ // admin socket helpers
+ void peer_status(Formatter *f);
+
+ // reopen logs
+ void reopen_logs();
+
+private:
+ inline static const std::string PRIMARY_SNAP_ID_KEY = "primary_snap_id";
+
+ inline static const std::string SERVICE_DAEMON_FAILED_DIR_COUNT_KEY = "failure_count";
+ inline static const std::string SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY = "recovery_count";
+
+ using Snapshot = std::pair<std::string, uint64_t>;
+
+ // file descriptor "triplet" for synchronizing a snapshot
+ // w/ an added MountRef for accessing "previous" snapshot.
+ struct FHandles {
+ // open file descriptor on the snap directory for snapshot
+ // currently being synchronized. Always use this fd with
+ // @m_local_mount.
+ int c_fd;
+
+ // open file descriptor on the "previous" snapshot or on
+ // dir_root on remote filesystem (based on if the snapshot
+ // can be used for incremental transfer). Always use this
+ // fd with p_mnt which either points to @m_local_mount (
+ // for local incremental comparison) or @m_remote_mount (
+ // for remote incremental comparison).
+ int p_fd;
+ MountRef p_mnt;
+
+ // open file descriptor on dir_root on remote filesystem.
+ // Always use this fd with @m_remote_mount.
+ int r_fd_dir_root;
+ };
+
+ bool is_stopping() {
+ return m_stopping;
+ }
+
+ struct Replayer;
+ class SnapshotReplayerThread : public Thread {
+ public:
+ SnapshotReplayerThread(PeerReplayer *peer_replayer)
+ : m_peer_replayer(peer_replayer) {
+ }
+
+ void *entry() override {
+ m_peer_replayer->run(this);
+ return 0;
+ }
+
+ private:
+ PeerReplayer *m_peer_replayer;
+ };
+
+ struct DirRegistry {
+ int fd;
+ bool canceled = false;
+ SnapshotReplayerThread *replayer;
+ };
+
+ struct SyncEntry {
+ std::string epath;
+ ceph_dir_result *dirp; // valid for directories
+ struct ceph_statx stx;
+ // set by incremental sync _after_ ensuring missing entries
+ // in the currently synced snapshot have been propagated to
+ // the remote filesystem.
+ bool remote_synced = false;
+
+ SyncEntry(std::string_view path,
+ const struct ceph_statx &stx)
+ : epath(path),
+ stx(stx) {
+ }
+ SyncEntry(std::string_view path,
+ ceph_dir_result *dirp,
+ const struct ceph_statx &stx)
+ : epath(path),
+ dirp(dirp),
+ stx(stx) {
+ }
+
+ bool is_directory() const {
+ return S_ISDIR(stx.stx_mode);
+ }
+
+ bool needs_remote_sync() const {
+ return remote_synced;
+ }
+ void set_remote_synced() {
+ remote_synced = true;
+ }
+ };
+
+ using clock = ceph::coarse_mono_clock;
+ using time = ceph::coarse_mono_time;
+
+ // stats sent to service daemon
+ struct ServiceDaemonStats {
+ uint64_t failed_dir_count = 0;
+ uint64_t recovered_dir_count = 0;
+ };
+
+ struct SnapSyncStat {
+ uint64_t nr_failures = 0; // number of consecutive failures
+ boost::optional<time> last_failed; // lat failed timestamp
+ bool failed = false; // hit upper cap for consecutive failures
+ boost::optional<std::pair<uint64_t, std::string>> last_synced_snap;
+ boost::optional<std::pair<uint64_t, std::string>> current_syncing_snap;
+ uint64_t synced_snap_count = 0;
+ uint64_t deleted_snap_count = 0;
+ uint64_t renamed_snap_count = 0;
+ time last_synced = clock::zero();
+ boost::optional<double> last_sync_duration;
+ };
+
+ void _inc_failed_count(const std::string &dir_root) {
+ auto max_failures = g_ceph_context->_conf.get_val<uint64_t>(
+ "cephfs_mirror_max_consecutive_failures_per_directory");
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.last_failed = clock::now();
+ if (++sync_stat.nr_failures >= max_failures && !sync_stat.failed) {
+ sync_stat.failed = true;
+ ++m_service_daemon_stats.failed_dir_count;
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_FAILED_DIR_COUNT_KEY,
+ m_service_daemon_stats.failed_dir_count);
+ }
+ }
+ void _reset_failed_count(const std::string &dir_root) {
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ if (sync_stat.failed) {
+ ++m_service_daemon_stats.recovered_dir_count;
+ m_service_daemon->add_or_update_peer_attribute(m_filesystem.fscid, m_peer,
+ SERVICE_DAEMON_RECOVERED_DIR_COUNT_KEY,
+ m_service_daemon_stats.recovered_dir_count);
+ }
+ sync_stat.nr_failures = 0;
+ sync_stat.failed = false;
+ sync_stat.last_failed = boost::none;
+ }
+
+ void _set_last_synced_snap(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name) {
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.last_synced_snap = std::make_pair(snap_id, snap_name);
+ sync_stat.current_syncing_snap = boost::none;
+ }
+ void set_last_synced_snap(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name) {
+ std::scoped_lock locker(m_lock);
+ _set_last_synced_snap(dir_root, snap_id, snap_name);
+ }
+ void set_current_syncing_snap(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.current_syncing_snap = std::make_pair(snap_id, snap_name);
+ }
+ void clear_current_syncing_snap(const std::string &dir_root) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.current_syncing_snap = boost::none;
+ }
+ void inc_deleted_snap(const std::string &dir_root) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ ++sync_stat.deleted_snap_count;
+ }
+ void inc_renamed_snap(const std::string &dir_root) {
+ std::scoped_lock locker(m_lock);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ ++sync_stat.renamed_snap_count;
+ }
+ void set_last_synced_stat(const std::string &dir_root, uint64_t snap_id,
+ const std::string &snap_name, double duration) {
+ std::scoped_lock locker(m_lock);
+ _set_last_synced_snap(dir_root, snap_id, snap_name);
+ auto &sync_stat = m_snap_sync_stats.at(dir_root);
+ sync_stat.last_synced = clock::now();
+ sync_stat.last_sync_duration = duration;
+ ++sync_stat.synced_snap_count;
+ }
+
+ bool should_backoff(const std::string &dir_root, int *retval) {
+ if (m_fs_mirror->is_blocklisted()) {
+ *retval = -EBLOCKLISTED;
+ return true;
+ }
+
+ std::scoped_lock locker(m_lock);
+ if (is_stopping()) {
+ // ceph defines EBLOCKLISTED to ESHUTDOWN (108). so use
+ // EINPROGRESS to identify shutdown.
+ *retval = -EINPROGRESS;
+ return true;
+ }
+ auto &dr = m_registered.at(dir_root);
+ if (dr.canceled) {
+ *retval = -ECANCELED;
+ return true;
+ }
+
+ *retval = 0;
+ return false;
+ }
+
+ typedef std::vector<std::unique_ptr<SnapshotReplayerThread>> SnapshotReplayers;
+
+ CephContext *m_cct;
+ FSMirror *m_fs_mirror;
+ RadosRef m_local_cluster;
+ Filesystem m_filesystem;
+ Peer m_peer;
+ // probably need to be encapsulated when supporting cancelations
+ std::map<std::string, DirRegistry> m_registered;
+ std::vector<std::string> m_directories;
+ std::map<std::string, SnapSyncStat> m_snap_sync_stats;
+ MountRef m_local_mount;
+ ServiceDaemon *m_service_daemon;
+ PeerReplayerAdminSocketHook *m_asok_hook = nullptr;
+
+ ceph::mutex m_lock;
+ ceph::condition_variable m_cond;
+ RadosRef m_remote_cluster;
+ MountRef m_remote_mount;
+ bool m_stopping = false;
+ SnapshotReplayers m_replayers;
+
+ ServiceDaemonStats m_service_daemon_stats;
+
+ void run(SnapshotReplayerThread *replayer);
+
+ boost::optional<std::string> pick_directory();
+ int register_directory(const std::string &dir_root, SnapshotReplayerThread *replayer);
+ void unregister_directory(const std::string &dir_root);
+ int try_lock_directory(const std::string &dir_root, SnapshotReplayerThread *replayer,
+ DirRegistry *registry);
+ void unlock_directory(const std::string &dir_root, const DirRegistry &registry);
+ void sync_snaps(const std::string &dir_root, std::unique_lock<ceph::mutex> &locker);
+
+
+ int build_snap_map(const std::string &dir_root, std::map<uint64_t, std::string> *snap_map,
+ bool is_remote=false);
+
+ int propagate_snap_deletes(const std::string &dir_root, const std::set<std::string> &snaps);
+ int propagate_snap_renames(const std::string &dir_root,
+ const std::set<std::pair<std::string,std::string>> &snaps);
+ int propagate_deleted_entries(const std::string &dir_root, const std::string &epath,
+ const FHandles &fh);
+ int cleanup_remote_dir(const std::string &dir_root, const std::string &epath,
+ const FHandles &fh);
+
+ int should_sync_entry(const std::string &epath, const struct ceph_statx &cstx,
+ const FHandles &fh, bool *need_data_sync, bool *need_attr_sync);
+
+ int open_dir(MountRef mnt, const std::string &dir_path, boost::optional<uint64_t> snap_id);
+ int pre_sync_check_and_open_handles(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev, FHandles *fh);
+ void post_sync_close_handles(const FHandles &fh);
+
+ int do_synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev);
+
+ int synchronize(const std::string &dir_root, const Snapshot &current,
+ boost::optional<Snapshot> prev);
+ int do_sync_snaps(const std::string &dir_root);
+
+ int remote_mkdir(const std::string &epath, const struct ceph_statx &stx, const FHandles &fh);
+ int remote_file_op(const std::string &dir_root, const std::string &epath, const struct ceph_statx &stx,
+ const FHandles &fh, bool need_data_sync, bool need_attr_sync);
+ int copy_to_remote(const std::string &dir_root, const std::string &epath, const struct ceph_statx &stx,
+ const FHandles &fh);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_PEER_REPLAYER_H
diff --git a/src/tools/cephfs_mirror/ServiceDaemon.cc b/src/tools/cephfs_mirror/ServiceDaemon.cc
new file mode 100644
index 000000000..f66dd46bf
--- /dev/null
+++ b/src/tools/cephfs_mirror/ServiceDaemon.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "include/stringify.h"
+#include "ServiceDaemon.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::ServiceDaemon: " << this << " " \
+ << __func__
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+struct AttributeDumpVisitor : public boost::static_visitor<void> {
+ ceph::Formatter *f;
+ std::string name;
+
+ AttributeDumpVisitor(ceph::Formatter *f, std::string_view name)
+ : f(f), name(name) {
+ }
+
+ void operator()(bool val) const {
+ f->dump_bool(name.c_str(), val);
+ }
+ void operator()(uint64_t val) const {
+ f->dump_unsigned(name.c_str(), val);
+ }
+ void operator()(const std::string &val) const {
+ f->dump_string(name.c_str(), val);
+ }
+};
+
+} // anonymous namespace
+
+ServiceDaemon::ServiceDaemon(CephContext *cct, RadosRef rados)
+ : m_cct(cct),
+ m_rados(rados),
+ m_timer(new SafeTimer(cct, m_timer_lock, true)) {
+ m_timer->init();
+}
+
+ServiceDaemon::~ServiceDaemon() {
+ dout(10) << dendl;
+ {
+ std::scoped_lock timer_lock(m_timer_lock);
+ if (m_timer_ctx != nullptr) {
+ dout(5) << ": canceling timer task=" << m_timer_ctx << dendl;
+ m_timer->cancel_event(m_timer_ctx);
+ }
+ m_timer->shutdown();
+ }
+
+ delete m_timer;
+}
+
+int ServiceDaemon::init() {
+ dout(20) << dendl;
+
+ std::string id = m_cct->_conf->name.get_id();
+ if (id.find(CEPHFS_MIRROR_AUTH_ID_PREFIX) == 0) {
+ id = id.substr(CEPHFS_MIRROR_AUTH_ID_PREFIX.size());
+ }
+ std::string instance_id = stringify(m_rados->get_instance_id());
+
+ std::map<std::string, std::string> service_metadata = {{"id", id},
+ {"instance_id", instance_id}};
+ int r = m_rados->service_daemon_register("cephfs-mirror", instance_id,
+ service_metadata);
+ if (r < 0) {
+ return r;
+ }
+ return 0;
+}
+
+void ServiceDaemon::add_filesystem(fs_cluster_id_t fscid, std::string_view fs_name) {
+ dout(10) << ": fscid=" << fscid << ", fs_name=" << fs_name << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_filesystems.emplace(fscid, Filesystem(fs_name));
+ }
+ schedule_update_status();
+}
+
+void ServiceDaemon::remove_filesystem(fs_cluster_id_t fscid) {
+ dout(10) << ": fscid=" << fscid << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ m_filesystems.erase(fscid);
+ }
+ schedule_update_status();
+}
+
+void ServiceDaemon::add_peer(fs_cluster_id_t fscid, const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+ fs_it->second.peer_attributes.emplace(peer, Attributes{});
+ }
+ schedule_update_status();
+}
+
+void ServiceDaemon::remove_peer(fs_cluster_id_t fscid, const Peer &peer) {
+ dout(10) << ": peer=" << peer << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+ fs_it->second.peer_attributes.erase(peer);
+ }
+ schedule_update_status();
+}
+
+void ServiceDaemon::add_or_update_fs_attribute(fs_cluster_id_t fscid, std::string_view key,
+ AttributeValue value) {
+ dout(10) << ": fscid=" << fscid << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+
+ fs_it->second.fs_attributes[std::string(key)] = value;
+ }
+ schedule_update_status();
+}
+
+void ServiceDaemon::add_or_update_peer_attribute(fs_cluster_id_t fscid, const Peer &peer,
+ std::string_view key, AttributeValue value) {
+ dout(10) << ": fscid=" << fscid << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ auto fs_it = m_filesystems.find(fscid);
+ if (fs_it == m_filesystems.end()) {
+ return;
+ }
+
+ auto peer_it = fs_it->second.peer_attributes.find(peer);
+ if (peer_it == fs_it->second.peer_attributes.end()) {
+ return;
+ }
+
+ peer_it->second[std::string(key)] = value;
+ }
+ schedule_update_status();
+}
+
+void ServiceDaemon::schedule_update_status() {
+ dout(10) << dendl;
+
+ std::scoped_lock timer_lock(m_timer_lock);
+ if (m_timer_ctx != nullptr) {
+ return;
+ }
+
+ m_timer_ctx = new LambdaContext([this] {
+ m_timer_ctx = nullptr;
+ update_status();
+ });
+ m_timer->add_event_after(1, m_timer_ctx);
+}
+
+void ServiceDaemon::update_status() {
+ dout(20) << ": " << m_filesystems.size() << " filesystem(s)" << dendl;
+
+ ceph::JSONFormatter f;
+ {
+ std::scoped_lock locker(m_lock);
+ f.open_object_section("filesystems");
+ for (auto &[fscid, filesystem] : m_filesystems) {
+ f.open_object_section(stringify(fscid).c_str());
+ f.dump_string("name", filesystem.fs_name);
+ for (auto &[attr_name, attr_value] : filesystem.fs_attributes) {
+ AttributeDumpVisitor visitor(&f, attr_name);
+ boost::apply_visitor(visitor, attr_value);
+ }
+ f.open_object_section("peers");
+ for (auto &[peer, attributes] : filesystem.peer_attributes) {
+ f.open_object_section(peer.uuid);
+ f.dump_object("remote", peer.remote);
+ f.open_object_section("stats");
+ for (auto &[attr_name, attr_value] : attributes) {
+ AttributeDumpVisitor visitor(&f, attr_name);
+ boost::apply_visitor(visitor, attr_value);
+ }
+ f.close_section(); // stats
+ f.close_section(); // peer.uuid
+ }
+ f.close_section(); // peers
+ f.close_section(); // fscid
+ }
+ f.close_section(); // filesystems
+ }
+
+ std::stringstream ss;
+ f.flush(ss);
+
+ int r = m_rados->service_daemon_update_status({{"status_json", ss.str()}});
+ if (r < 0) {
+ derr << ": failed to update service daemon status: " << cpp_strerror(r)
+ << dendl;
+ }
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/ServiceDaemon.h b/src/tools/cephfs_mirror/ServiceDaemon.h
new file mode 100644
index 000000000..83eee286d
--- /dev/null
+++ b/src/tools/cephfs_mirror/ServiceDaemon.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_SERVICE_DAEMON_H
+#define CEPHFS_MIRROR_SERVICE_DAEMON_H
+
+#include "common/ceph_mutex.h"
+#include "common/Timer.h"
+#include "mds/FSMap.h"
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+class ServiceDaemon {
+public:
+ ServiceDaemon(CephContext *cct, RadosRef rados);
+ ~ServiceDaemon();
+
+ int init();
+
+ void add_filesystem(fs_cluster_id_t fscid, std::string_view fs_name);
+ void remove_filesystem(fs_cluster_id_t fscid);
+
+ void add_peer(fs_cluster_id_t fscid, const Peer &peer);
+ void remove_peer(fs_cluster_id_t fscid, const Peer &peer);
+
+ void add_or_update_fs_attribute(fs_cluster_id_t fscid, std::string_view key,
+ AttributeValue value);
+ void add_or_update_peer_attribute(fs_cluster_id_t fscid, const Peer &peer,
+ std::string_view key, AttributeValue value);
+
+private:
+ struct Filesystem {
+ std::string fs_name;
+ Attributes fs_attributes;
+ std::map<Peer, Attributes> peer_attributes;
+
+ Filesystem(std::string_view fs_name)
+ : fs_name(fs_name) {
+ }
+ };
+
+ const std::string CEPHFS_MIRROR_AUTH_ID_PREFIX = "cephfs-mirror.";
+
+ CephContext *m_cct;
+ RadosRef m_rados;
+ SafeTimer *m_timer;
+ ceph::mutex m_timer_lock = ceph::make_mutex("cephfs::mirror::ServiceDaemon");
+
+ ceph::mutex m_lock = ceph::make_mutex("cephfs::mirror::service_daemon");
+ Context *m_timer_ctx = nullptr;
+ std::map<fs_cluster_id_t, Filesystem> m_filesystems;
+
+ void schedule_update_status();
+ void update_status();
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_SERVICE_DAEMON_H
diff --git a/src/tools/cephfs_mirror/Types.cc b/src/tools/cephfs_mirror/Types.cc
new file mode 100644
index 000000000..0049f9d79
--- /dev/null
+++ b/src/tools/cephfs_mirror/Types.cc
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+std::ostream& operator<<(std::ostream& out, const Filesystem &filesystem) {
+ out << "{fscid=" << filesystem.fscid << ", fs_name=" << filesystem.fs_name << "}";
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const FilesystemSpec &spec) {
+ out << "{filesystem=" << spec.filesystem << ", pool_id=" << spec.pool_id << "}";
+ return out;
+}
+
+} // namespace mirror
+} // namespace cephfs
+
diff --git a/src/tools/cephfs_mirror/Types.h b/src/tools/cephfs_mirror/Types.h
new file mode 100644
index 000000000..016a8dc86
--- /dev/null
+++ b/src/tools/cephfs_mirror/Types.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_TYPES_H
+#define CEPHFS_MIRROR_TYPES_H
+
+#include <set>
+#include <iostream>
+#include <string_view>
+
+#include "include/rados/librados.hpp"
+#include "include/cephfs/libcephfs.h"
+#include "mds/mdstypes.h"
+
+namespace cephfs {
+namespace mirror {
+
+static const std::string CEPHFS_MIRROR_OBJECT("cephfs_mirror");
+
+typedef boost::variant<bool, uint64_t, std::string> AttributeValue;
+typedef std::map<std::string, AttributeValue> Attributes;
+
+// distinct filesystem identifier
+struct Filesystem {
+ fs_cluster_id_t fscid;
+ std::string fs_name;
+
+ bool operator==(const Filesystem &rhs) const {
+ return (fscid == rhs.fscid &&
+ fs_name == rhs.fs_name);
+ }
+
+ bool operator!=(const Filesystem &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator<(const Filesystem &rhs) const {
+ if (fscid != rhs.fscid) {
+ return fscid < rhs.fscid;
+ }
+
+ return fs_name < rhs.fs_name;
+ }
+};
+
+// specification of a filesystem -- pool id the metadata pool id.
+struct FilesystemSpec {
+ FilesystemSpec() = default;
+ FilesystemSpec(const Filesystem &filesystem, uint64_t pool_id)
+ : filesystem(filesystem),
+ pool_id(pool_id) {
+ }
+ FilesystemSpec(fs_cluster_id_t fscid, std::string_view fs_name, uint64_t pool_id)
+ : filesystem(Filesystem{fscid, std::string(fs_name)}),
+ pool_id(pool_id) {
+ }
+
+ Filesystem filesystem;
+ uint64_t pool_id;
+
+ bool operator==(const FilesystemSpec &rhs) const {
+ return (filesystem == rhs.filesystem &&
+ pool_id == rhs.pool_id);
+ }
+
+ bool operator<(const FilesystemSpec &rhs) const {
+ if (filesystem != rhs.filesystem) {
+ return filesystem < rhs.filesystem;
+ }
+
+ return pool_id < rhs.pool_id;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const Filesystem &filesystem);
+std::ostream& operator<<(std::ostream& out, const FilesystemSpec &spec);
+
+typedef std::shared_ptr<librados::Rados> RadosRef;
+typedef std::shared_ptr<librados::IoCtx> IoCtxRef;
+
+// not a shared_ptr since the type is incomplete
+typedef ceph_mount_info *MountRef;
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_TYPES_H
diff --git a/src/tools/cephfs_mirror/Utils.cc b/src/tools/cephfs_mirror/Utils.cc
new file mode 100644
index 000000000..1a8b8e0ac
--- /dev/null
+++ b/src/tools/cephfs_mirror/Utils.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/debug.h"
+#include "common/errno.h"
+
+#include "Utils.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::Utils " << __func__
+
+namespace cephfs {
+namespace mirror {
+
+int connect(std::string_view client_name, std::string_view cluster_name,
+ RadosRef *cluster, std::string_view mon_host, std::string_view cephx_key,
+ std::vector<const char *> args) {
+ dout(20) << ": connecting to cluster=" << cluster_name << ", client=" << client_name
+ << ", mon_host=" << mon_host << dendl;
+
+ CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
+ if (client_name.empty() || !iparams.name.from_str(client_name)) {
+ derr << ": error initializing cluster handle for " << cluster_name << dendl;
+ return -EINVAL;
+ }
+
+ CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+ if (mon_host.empty()) {
+ cct->_conf->cluster = cluster_name;
+ }
+
+ int r = cct->_conf.parse_config_files(nullptr, nullptr, 0);
+ if (r < 0 && r != -ENOENT) {
+ derr << ": could not read ceph conf: " << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ cct->_conf.parse_env(cct->get_module_type());
+
+ if (!args.empty()) {
+ r = cct->_conf.parse_argv(args);
+ if (r < 0) {
+ derr << ": could not parse command line args: " << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+ cct->_conf.parse_env(cct->get_module_type());
+
+ if (!mon_host.empty()) {
+ r = cct->_conf.set_val("mon_host", std::string(mon_host));
+ if (r < 0) {
+ derr << "failed to set mon_host config: " << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+ if (!cephx_key.empty()) {
+ r = cct->_conf.set_val("key", std::string(cephx_key));
+ if (r < 0) {
+ derr << "failed to set key config: " << cpp_strerror(r) << dendl;
+ cct->put();
+ return r;
+ }
+ }
+
+ dout(10) << ": using mon addr=" << cct->_conf.get_val<std::string>("mon_host") << dendl;
+
+ cluster->reset(new librados::Rados());
+
+ r = (*cluster)->init_with_context(cct);
+ ceph_assert(r == 0);
+ cct->put();
+
+ r = (*cluster)->connect();
+ if (r < 0) {
+ derr << ": error connecting to " << cluster_name << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ dout(10) << ": connected to cluster=" << cluster_name << " using client="
+ << client_name << dendl;
+
+ return 0;
+}
+
+int mount(RadosRef cluster, const Filesystem &filesystem, bool cross_check_fscid,
+ MountRef *mount) {
+ dout(20) << ": filesystem=" << filesystem << dendl;
+
+ ceph_mount_info *cmi;
+ int r = ceph_create_with_context(&cmi, reinterpret_cast<CephContext*>(cluster->cct()));
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_conf_set(cmi, "client_mount_uid", "0");
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_conf_set(cmi, "client_mount_gid", "0");
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // mount timeout applies for local and remote mounts.
+ auto mount_timeout = g_ceph_context->_conf.get_val<std::chrono::seconds>
+ ("cephfs_mirror_mount_timeout").count();
+ r = ceph_set_mount_timeout(cmi, mount_timeout);
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_init(cmi);
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_select_filesystem(cmi, filesystem.fs_name.c_str());
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = ceph_mount(cmi, NULL);
+ if (r < 0) {
+ derr << ": mount error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ auto fs_id = ceph_get_fs_cid(cmi);
+ if (cross_check_fscid && fs_id != filesystem.fscid) {
+ // this can happen in the most remotest possibility when a
+ // filesystem is deleted and recreated with the same name.
+ // since all this is driven asynchronously, we were able to
+ // mount the recreated filesystem. so bubble up the error.
+ // cleanup will eventually happen since a mirror disable event
+ // would have been queued.
+ derr << ": filesystem-id mismatch " << fs_id << " vs " << filesystem.fscid
+ << dendl;
+ // ignore errors, we are shutting down anyway.
+ ceph_unmount(cmi);
+ return -EINVAL;
+ }
+
+ dout(10) << ": mounted filesystem=" << filesystem << dendl;
+
+ *mount = cmi;
+ return 0;
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/Utils.h b/src/tools/cephfs_mirror/Utils.h
new file mode 100644
index 000000000..76b0c0726
--- /dev/null
+++ b/src/tools/cephfs_mirror/Utils.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_UTILS_H
+#define CEPHFS_MIRROR_UTILS_H
+
+#include "Types.h"
+
+namespace cephfs {
+namespace mirror {
+
+int connect(std::string_view client_name, std::string_view cluster_name,
+ RadosRef *cluster, std::string_view mon_host={}, std::string_view cephx_key={},
+ std::vector<const char *> args={});
+
+int mount(RadosRef cluster, const Filesystem &filesystem, bool cross_check_fscid,
+ MountRef *mount);
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_UTILS_H
diff --git a/src/tools/cephfs_mirror/Watcher.cc b/src/tools/cephfs_mirror/Watcher.cc
new file mode 100644
index 000000000..1445fce5f
--- /dev/null
+++ b/src/tools/cephfs_mirror/Watcher.cc
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/WorkQueue.h"
+#include "include/stringify.h"
+#include "aio_utils.h"
+#include "watcher/RewatchRequest.h"
+#include "Watcher.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::Watcher " << __func__
+
+using cephfs::mirror::watcher::RewatchRequest;
+
+namespace cephfs {
+namespace mirror {
+
+namespace {
+
+struct C_UnwatchAndFlush : public Context {
+ librados::Rados rados;
+ Context *on_finish;
+ bool flushing = false;
+ int ret_val = 0;
+
+ C_UnwatchAndFlush(librados::IoCtx &ioctx, Context *on_finish)
+ : rados(ioctx), on_finish(on_finish) {
+ }
+
+ void complete(int r) override {
+ if (ret_val == 0 && r < 0) {
+ ret_val = r;
+ }
+
+ if (!flushing) {
+ flushing = true;
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<Context, &Context::complete>);
+ r = rados.aio_watch_flush(aio_comp);
+
+ ceph_assert(r == 0);
+ aio_comp->release();
+ return;
+ }
+
+ // ensure our reference to the RadosClient is released prior
+ // to completing the callback to avoid racing an explicit
+ // librados shutdown
+ Context *ctx = on_finish;
+ r = ret_val;
+ delete this;
+
+ ctx->complete(r);
+ }
+
+ void finish(int r) override {
+ }
+};
+
+} // anonymous namespace
+
+Watcher::Watcher(librados::IoCtx &ioctx, std::string_view oid, ContextWQ *work_queue)
+ : m_oid(oid),
+ m_ioctx(ioctx),
+ m_work_queue(work_queue),
+ m_lock(ceph::make_shared_mutex("cephfs::mirror::snap_watcher")),
+ m_state(STATE_IDLE),
+ m_watch_ctx(*this) {
+}
+
+Watcher::~Watcher() {
+}
+
+void Watcher::register_watch(Context *on_finish) {
+ dout(20) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_state = STATE_REGISTERING;
+
+ on_finish = new C_RegisterWatch(this, on_finish);
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(on_finish, &rados_callback<Context, &Context::complete>);
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_watch_handle, &m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void Watcher::handle_register_watch(int r, Context *on_finish) {
+ dout(20) << ": r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REGISTERING);
+
+ m_state = STATE_IDLE;
+ if (r < 0) {
+ derr << ": failed to register watch: " << cpp_strerror(r) << dendl;
+ m_watch_handle = 0;
+ }
+
+ if (m_unregister_watch_ctx != nullptr) {
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == 0 && m_watch_error) {
+ derr << ": re-registering after watch error" << dendl;
+ m_state = STATE_REGISTERING;
+ watch_error = true;
+ } else {
+ m_watch_blocklisted = (r == -EBLOCKLISTED);
+ }
+ }
+
+ on_finish->complete(r);
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::unregister_watch(Context *on_finish) {
+ dout(20) << dendl;
+
+ {
+ std::scoped_lock locker(m_lock);
+ if (m_state != STATE_IDLE) {
+ dout(10) << ": delaying unregister -- watch register in progress" << dendl;
+ ceph_assert(m_unregister_watch_ctx == nullptr);
+ m_unregister_watch_ctx = new LambdaContext([this, on_finish](int r) {
+ unregister_watch(on_finish);
+ });
+ return;
+ } else if (is_registered()) {
+ // watch is registered -- unwatch
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(new C_UnwatchAndFlush(m_ioctx, on_finish),
+ &rados_callback<Context, &Context::complete>);
+ int r = m_ioctx.aio_unwatch(m_watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+ m_watch_handle = 0;
+ m_watch_blocklisted = false;
+ return;
+ }
+ }
+
+ on_finish->complete(0);
+}
+
+void Watcher::handle_error(uint64_t handle, int err) {
+ derr << ": handle=" << handle << ": " << cpp_strerror(err) << dendl;
+
+ std::scoped_lock locker(m_lock);
+ m_watch_error = true;
+
+ if (is_registered()) {
+ m_state = STATE_REWATCHING;
+ if (err == -EBLOCKLISTED) {
+ m_watch_blocklisted = true;
+ }
+ m_work_queue->queue(new LambdaContext([this] {
+ rewatch();
+ }), 0);
+ }
+}
+
+void Watcher::rewatch() {
+ dout(20) << dendl;
+
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::unique_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_state = STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else {
+ m_watch_error = false;
+ Context *ctx = new C_CallbackAdapter<Watcher, &Watcher::handle_rewatch>(this);
+ auto req = RewatchRequest::create(m_ioctx, m_oid, m_lock,
+ &m_watch_ctx, &m_watch_handle, ctx);
+ req->send();
+ return;
+ }
+ }
+
+ unregister_watch_ctx->complete(0);
+}
+
+void Watcher::handle_rewatch(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REWATCHING);
+
+ m_watch_blocklisted = false;
+ if (m_unregister_watch_ctx != nullptr) {
+ dout(10) << ": skipping rewatch -- unregistering" << dendl;
+ m_state = STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLOCKLISTED) {
+ m_watch_blocklisted = true;
+ derr << ": client blocklisted" << dendl;
+ } else if (r == -ENOENT) {
+ dout(5) << ": object " << m_oid << " does not exist" << dendl;
+ } else if (r < 0) {
+ derr << ": failed to rewatch: " << cpp_strerror(r) << dendl;
+ watch_error = true;
+ } else if (m_watch_error) {
+ derr << ": re-registering watch after error" << dendl;
+ watch_error = true;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ return;
+ } else if (watch_error) {
+ rewatch();
+ return;
+ }
+
+ Context *ctx = new C_CallbackAdapter<Watcher, &Watcher::handle_rewatch_callback>(this);
+ m_work_queue->queue(ctx, r);
+}
+
+void Watcher::handle_rewatch_callback(int r) {
+ dout(10) << ": r=" << r << dendl;
+ handle_rewatch_complete(r);
+
+ bool watch_error = false;
+ Context *unregister_watch_ctx = nullptr;
+ {
+ std::scoped_lock locker(m_lock);
+ ceph_assert(m_state == STATE_REWATCHING);
+
+ if (m_unregister_watch_ctx != nullptr) {
+ m_state = STATE_IDLE;
+ std::swap(unregister_watch_ctx, m_unregister_watch_ctx);
+ } else if (r == -EBLOCKLISTED || r == -ENOENT) {
+ m_state = STATE_IDLE;
+ } else if (r < 0 || m_watch_error) {
+ watch_error = true;
+ } else {
+ m_state = STATE_IDLE;
+ }
+ }
+
+ if (unregister_watch_ctx != nullptr) {
+ unregister_watch_ctx->complete(0);
+ } else if (watch_error) {
+ rewatch();
+ }
+}
+
+void Watcher::acknowledge_notify(uint64_t notify_id, uint64_t handle, bufferlist &bl) {
+ m_ioctx.notify_ack(m_oid, notify_id, handle, bl);
+}
+
+void Watcher::WatchCtx::handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) {
+ dout(20) << ": notify_id=" << notify_id << ", handle=" << handle
+ << ", notifier_id=" << notifier_id << dendl;
+ watcher.handle_notify(notify_id, handle, notifier_id, bl);
+}
+
+void Watcher::WatchCtx::handle_error(uint64_t handle, int err) {
+ dout(20) << dendl;
+ watcher.handle_error(handle, err);
+}
+
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/Watcher.h b/src/tools/cephfs_mirror/Watcher.h
new file mode 100644
index 000000000..9e7c54eeb
--- /dev/null
+++ b/src/tools/cephfs_mirror/Watcher.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_WATCHER_H
+#define CEPHFS_MIRROR_WATCHER_H
+
+#include <string_view>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+
+class ContextWQ;
+
+namespace cephfs {
+namespace mirror {
+
+// generic watcher class -- establish watch on a given rados object
+// and invoke handle_notify() when notified. On notify error, try
+// to re-establish the watch. Errors during rewatch are notified via
+// handle_rewatch_complete().
+
+class Watcher {
+public:
+ Watcher(librados::IoCtx &ioctx, std::string_view oid, ContextWQ *work_queue);
+ virtual ~Watcher();
+
+ void register_watch(Context *on_finish);
+ void unregister_watch(Context *on_finish);
+
+protected:
+ std::string m_oid;
+
+ void acknowledge_notify(uint64_t notify_if, uint64_t handle, bufferlist &bl);
+
+ bool is_registered() const {
+ return m_state == STATE_IDLE && m_watch_handle != 0;
+ }
+ bool is_unregistered() const {
+ return m_state == STATE_IDLE && m_watch_handle == 0;
+ }
+
+ virtual void handle_rewatch_complete(int r) { }
+
+private:
+ enum State {
+ STATE_IDLE,
+ STATE_REGISTERING,
+ STATE_REWATCHING
+ };
+
+ struct WatchCtx : public librados::WatchCtx2 {
+ Watcher &watcher;
+
+ WatchCtx(Watcher &parent) : watcher(parent) {}
+
+ void handle_notify(uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ bufferlist& bl) override;
+ void handle_error(uint64_t handle, int err) override;
+ };
+
+ struct C_RegisterWatch : public Context {
+ Watcher *watcher;
+ Context *on_finish;
+
+ C_RegisterWatch(Watcher *watcher, Context *on_finish)
+ : watcher(watcher),
+ on_finish(on_finish) {
+ }
+
+ void finish(int r) override {
+ watcher->handle_register_watch(r, on_finish);
+ }
+ };
+
+ librados::IoCtx &m_ioctx;
+ ContextWQ *m_work_queue;
+
+ mutable ceph::shared_mutex m_lock;
+ State m_state;
+ bool m_watch_error = false;
+ bool m_watch_blocklisted = false;
+ uint64_t m_watch_handle;
+ WatchCtx m_watch_ctx;
+ Context *m_unregister_watch_ctx = nullptr;
+
+ virtual void handle_notify(uint64_t notify_id, uint64_t handle,
+ uint64_t notifier_id, bufferlist& bl) = 0;
+ void handle_error(uint64_t handle, int err);
+
+ void rewatch();
+ void handle_rewatch(int r);
+ void handle_rewatch_callback(int r);
+ void handle_register_watch(int r, Context *on_finish);
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_WATCHER_H
diff --git a/src/tools/cephfs_mirror/aio_utils.h b/src/tools/cephfs_mirror/aio_utils.h
new file mode 100644
index 000000000..43f356381
--- /dev/null
+++ b/src/tools/cephfs_mirror/aio_utils.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_AIO_UTILS_H
+#define CEPHFS_MIRROR_AIO_UTILS_H
+
+#include "include/rados/librados.hpp"
+
+namespace cephfs {
+namespace mirror {
+
+template <typename T, void(T::*MF)(int)>
+void rados_callback(rados_completion_t c, void *arg) {
+ T *obj = reinterpret_cast<T*>(arg);
+ int r = rados_aio_get_return_value(c);
+ (obj->*MF)(r);
+}
+
+template <typename T, void (T::*MF)(int)>
+class C_CallbackAdapter : public Context {
+ T *obj;
+public:
+ C_CallbackAdapter(T *obj)
+ : obj(obj) {
+ }
+
+protected:
+ void finish(int r) override {
+ (obj->*MF)(r);
+ }
+};
+
+template <typename WQ>
+struct C_AsyncCallback : public Context {
+ WQ *op_work_queue;
+ Context *on_finish;
+
+ C_AsyncCallback(WQ *op_work_queue, Context *on_finish)
+ : op_work_queue(op_work_queue), on_finish(on_finish) {
+ }
+ ~C_AsyncCallback() override {
+ delete on_finish;
+ }
+ void finish(int r) override {
+ op_work_queue->queue(on_finish, r);
+ on_finish = nullptr;
+ }
+};
+
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_AIO_UTILS_H
diff --git a/src/tools/cephfs_mirror/main.cc b/src/tools/cephfs_mirror/main.cc
new file mode 100644
index 000000000..efaa89c35
--- /dev/null
+++ b/src/tools/cephfs_mirror/main.cc
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/async/context_pool.h"
+#include "common/Preforker.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "mon/MonClient.h"
+#include "msg/Messenger.h"
+#include "Mirror.h"
+
+#include <vector>
+
+void usage() {
+ std::cout << "usage: cephfs-mirror [options...]" << std::endl;
+ std::cout << "options:\n";
+ std::cout << " --mon-host monaddress[:port] connect to specified monitor\n";
+ std::cout << " --keyring=<path> path to keyring for local cluster\n";
+ std::cout << " --log-file=<logfile> file to log debug output\n";
+ std::cout << " --debug-cephfs-mirror=<log-level>/<memory-level> set cephfs-mirror debug level\n";
+ generic_server_usage();
+}
+
+cephfs::mirror::Mirror *mirror = nullptr;
+
+static void handle_signal(int signum) {
+ if (mirror) {
+ mirror->handle_signal(signum);
+ }
+}
+
+int main(int argc, const char **argv) {
+ std::vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ ::exit(1);
+ }
+
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ ::exit(0);
+ }
+
+ auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+ Preforker forker;
+ if (global_init_prefork(g_ceph_context) >= 0) {
+ std::string err;
+ int r = forker.prefork(err);
+ if (r < 0) {
+ cerr << err << std::endl;
+ return r;
+ }
+ if (forker.is_parent()) {
+ g_ceph_context->_log->start();
+ if (forker.parent_wait(err) != 0) {
+ return -ENXIO;
+ }
+ return 0;
+ }
+ global_init_postfork_start(g_ceph_context);
+ }
+
+ common_init_finish(g_ceph_context);
+
+ bool daemonize = g_conf().get_val<bool>("daemonize");
+ if (daemonize) {
+ global_init_postfork_finish(g_ceph_context);
+ forker.daemonize();
+ }
+
+ init_async_signal_handler();
+ register_async_signal_handler(SIGHUP, handle_signal);
+ register_async_signal_handler_oneshot(SIGINT, handle_signal);
+ register_async_signal_handler_oneshot(SIGTERM, handle_signal);
+
+ std::vector<const char*> cmd_args;
+ argv_to_vec(argc, argv, cmd_args);
+
+ Messenger *msgr = Messenger::create_client_messenger(g_ceph_context, "client");
+ msgr->set_default_policy(Messenger::Policy::lossy_client(0));
+
+ std::string reason;
+ ceph::async::io_context_pool ctxpool(1);
+ MonClient monc(MonClient(g_ceph_context, ctxpool));
+ int r = monc.build_initial_monmap();
+ if (r < 0) {
+ cerr << "failed to generate initial monmap" << std::endl;
+ goto cleanup_messenger;
+ }
+
+ msgr->start();
+
+ mirror = new cephfs::mirror::Mirror(g_ceph_context, cmd_args, &monc, msgr);
+ r = mirror->init(reason);
+ if (r < 0) {
+ std::cerr << "failed to initialize cephfs-mirror: " << reason << std::endl;
+ goto cleanup;
+ }
+
+ mirror->run();
+ delete mirror;
+
+cleanup:
+ monc.shutdown();
+cleanup_messenger:
+ msgr->shutdown();
+ msgr->wait();
+ delete msgr;
+
+ unregister_async_signal_handler(SIGHUP, handle_signal);
+ unregister_async_signal_handler(SIGINT, handle_signal);
+ unregister_async_signal_handler(SIGTERM, handle_signal);
+ shutdown_async_signal_handler();
+
+ return forker.signal_exit(r);
+}
diff --git a/src/tools/cephfs_mirror/watcher/RewatchRequest.cc b/src/tools/cephfs_mirror/watcher/RewatchRequest.cc
new file mode 100644
index 000000000..3070e6f8b
--- /dev/null
+++ b/src/tools/cephfs_mirror/watcher/RewatchRequest.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_mutex.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/Context.h"
+#include "tools/cephfs_mirror/aio_utils.h"
+#include "RewatchRequest.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_cephfs_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "cephfs::mirror::watcher:RewatchRequest " << __func__
+
+namespace cephfs {
+namespace mirror {
+namespace watcher {
+
+RewatchRequest::RewatchRequest(librados::IoCtx &ioctx, const std::string &oid,
+ ceph::shared_mutex &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish)
+ : m_ioctx(ioctx), m_oid(oid), m_lock(watch_lock),
+ m_watch_ctx(watch_ctx), m_watch_handle(watch_handle),
+ m_on_finish(on_finish) {
+}
+
+void RewatchRequest::send() {
+ unwatch();
+}
+
+void RewatchRequest::unwatch() {
+ ceph_assert(ceph_mutex_is_wlocked(m_lock));
+ if (*m_watch_handle == 0) {
+ rewatch();
+ return;
+ }
+
+ dout(10) << dendl;
+
+ uint64_t watch_handle = 0;
+ std::swap(*m_watch_handle, watch_handle);
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<RewatchRequest, &RewatchRequest::handle_unwatch>);
+ int r = m_ioctx.aio_unwatch(watch_handle, aio_comp);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void RewatchRequest::handle_unwatch(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r == -EBLOCKLISTED) {
+ derr << ": client blocklisted" << dendl;
+ finish(r);
+ return;
+ } else if (r < 0) {
+ derr << ": failed to unwatch: " << cpp_strerror(r) << dendl;
+ }
+
+ rewatch();
+}
+
+void RewatchRequest::rewatch() {
+ dout(20) << dendl;
+
+ librados::AioCompletion *aio_comp =
+ librados::Rados::aio_create_completion(
+ this, &rados_callback<RewatchRequest, &RewatchRequest::handle_rewatch>);
+ int r = m_ioctx.aio_watch(m_oid, aio_comp, &m_rewatch_handle, m_watch_ctx);
+ ceph_assert(r == 0);
+ aio_comp->release();
+}
+
+void RewatchRequest::handle_rewatch(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ if (r < 0) {
+ derr << ": failed to watch object: " << cpp_strerror(r) << dendl;
+ m_rewatch_handle = 0;
+ }
+
+ {
+ std::unique_lock locker(m_lock);
+ *m_watch_handle = m_rewatch_handle;
+ }
+
+ finish(r);
+}
+
+void RewatchRequest::finish(int r) {
+ dout(20) << ": r=" << r << dendl;
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace watcher
+} // namespace mirror
+} // namespace cephfs
diff --git a/src/tools/cephfs_mirror/watcher/RewatchRequest.h b/src/tools/cephfs_mirror/watcher/RewatchRequest.h
new file mode 100644
index 000000000..453fcb219
--- /dev/null
+++ b/src/tools/cephfs_mirror/watcher/RewatchRequest.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPHFS_MIRROR_WATCHER_REWATCH_REQUEST_H
+#define CEPHFS_MIRROR_WATCHER_REWATCH_REQUEST_H
+
+#include "common/ceph_mutex.h"
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+
+struct Context;
+
+namespace cephfs {
+namespace mirror {
+namespace watcher {
+
+// Rewatch an existing watch -- the watch can be in an operatioal
+// or error state.
+
+class RewatchRequest {
+public:
+
+ static RewatchRequest *create(librados::IoCtx &ioctx, const std::string &oid,
+ ceph::shared_mutex &watch_lock,
+ librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish) {
+ return new RewatchRequest(ioctx, oid, watch_lock, watch_ctx, watch_handle,
+ on_finish);
+ }
+
+ RewatchRequest(librados::IoCtx &ioctx, const std::string &oid,
+ ceph::shared_mutex &watch_lock, librados::WatchCtx2 *watch_ctx,
+ uint64_t *watch_handle, Context *on_finish);
+
+ void send();
+
+private:
+ librados::IoCtx& m_ioctx;
+ std::string m_oid;
+ ceph::shared_mutex &m_lock;
+ librados::WatchCtx2 *m_watch_ctx;
+ uint64_t *m_watch_handle;
+ Context *m_on_finish;
+
+ uint64_t m_rewatch_handle = 0;
+
+ void unwatch();
+ void handle_unwatch(int r);
+
+ void rewatch();
+ void handle_rewatch(int r);
+
+ void finish(int r);
+};
+
+} // namespace watcher
+} // namespace mirror
+} // namespace cephfs
+
+#endif // CEPHFS_MIRROR_WATCHER_REWATCH_REQUEST_H